Commit d79c07a4 authored by Ingo Molnar's avatar Ingo Molnar

[PATCH] O(1) sys_exit(), threading, scalable-exit-2.5.31-B4

the attached patch updates a number of items:

 - applies cleanups suggested by Christoph Hellwig: adds the needed
   unlikely() annotations, removes a superfluous #define and fixes
   line length problems.

 - splits up the global ptrace list into per-task ptrace lists. This was
   pretty straightforward, and this makes the worst-case exit() latency
   O(nr_children).

The per-task ptrace lists unearthed a bug that the previous code did not
take care of: tasks on the ptrace list have to be correctly reparented as
well. This patch also passed my stress tests.
parent 5d6df147
#include <linux/config.h> #include <linux/config.h>
#include <linux/ptrace.h>
#include <linux/errno.h> #include <linux/errno.h>
#include <linux/signal.h> #include <linux/signal.h>
#include <linux/sched.h> #include <linux/sched.h>
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
*/ */
#include <linux/config.h> #include <linux/config.h>
#include <linux/ptrace.h>
#include <linux/errno.h> #include <linux/errno.h>
#include <linux/signal.h> #include <linux/signal.h>
#include <linux/sched.h> #include <linux/sched.h>
......
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#include <linux/smp_lock.h> #include <linux/smp_lock.h>
#include <linux/stddef.h> #include <linux/stddef.h>
#include <linux/unistd.h> #include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/user.h> #include <linux/user.h>
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include <linux/signal.h> #include <linux/signal.h>
#include <linux/errno.h> #include <linux/errno.h>
#include <linux/wait.h> #include <linux/wait.h>
#include <linux/ptrace.h>
#include <linux/unistd.h> #include <linux/unistd.h>
#include <linux/stddef.h> #include <linux/stddef.h>
#include <linux/personality.h> #include <linux/personality.h>
......
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/errno.h> #include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/timer.h> #include <linux/timer.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/init.h> #include <linux/init.h>
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/signal.h> #include <linux/signal.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/ptrace.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/smp.h> #include <linux/smp.h>
#include <linux/smp_lock.h> #include <linux/smp_lock.h>
......
...@@ -7,7 +7,6 @@ ...@@ -7,7 +7,6 @@
#ifndef __ASSEMBLY__ #ifndef __ASSEMBLY__
#include <linux/config.h> #include <linux/config.h>
#include <linux/threads.h> #include <linux/threads.h>
#include <linux/ptrace.h>
#endif #endif
#ifdef CONFIG_X86_LOCAL_APIC #ifdef CONFIG_X86_LOCAL_APIC
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
#define _I386_USER_H #define _I386_USER_H
#include <asm/page.h> #include <asm/page.h>
#include <linux/ptrace.h>
/* Core file format: The core file is written in such a way that gdb /* Core file format: The core file is written in such a way that gdb
can understand it and provide useful information to the user (under can understand it and provide useful information to the user (under
linux we use the 'trad-core' bfd). There are quite a number of linux we use the 'trad-core' bfd). There are quite a number of
......
#ifndef _LINUX_BINFMTS_H #ifndef _LINUX_BINFMTS_H
#define _LINUX_BINFMTS_H #define _LINUX_BINFMTS_H
#include <linux/ptrace.h>
#include <linux/capability.h> #include <linux/capability.h>
/* /*
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/signal.h> #include <linux/signal.h>
#include <linux/time.h> #include <linux/time.h>
#include <linux/ptrace.h>
#include <linux/user.h> #include <linux/user.h>
struct elf_siginfo struct elf_siginfo
......
...@@ -54,6 +54,8 @@ ...@@ -54,6 +54,8 @@
.run_list = LIST_HEAD_INIT(tsk.run_list), \ .run_list = LIST_HEAD_INIT(tsk.run_list), \
.time_slice = HZ, \ .time_slice = HZ, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \ .tasks = LIST_HEAD_INIT(tsk.tasks), \
.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
.ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \
.real_parent = &tsk, \ .real_parent = &tsk, \
.parent = &tsk, \ .parent = &tsk, \
.children = LIST_HEAD_INIT(tsk.children), \ .children = LIST_HEAD_INIT(tsk.children), \
......
...@@ -354,12 +354,6 @@ extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned ...@@ -354,12 +354,6 @@ extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned
extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
extern int make_pages_present(unsigned long addr, unsigned long end); extern int make_pages_present(unsigned long addr, unsigned long end);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len);
extern int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len);
extern int ptrace_attach(struct task_struct *tsk);
extern int ptrace_detach(struct task_struct *, unsigned int);
extern void ptrace_disable(struct task_struct *);
extern int ptrace_check_attach(struct task_struct *task, int kill);
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
/* ptrace.h */ /* ptrace.h */
/* structs and defines to help the user use the ptrace system call. */ /* structs and defines to help the user use the ptrace system call. */
#include <linux/compiler.h>
/* has the defines to get at the registers. */ /* has the defines to get at the registers. */
#define PTRACE_TRACEME 0 #define PTRACE_TRACEME 0
...@@ -23,4 +25,26 @@ ...@@ -23,4 +25,26 @@
#include <asm/ptrace.h> #include <asm/ptrace.h>
extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len);
extern int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len);
extern int ptrace_attach(struct task_struct *tsk);
extern int ptrace_detach(struct task_struct *, unsigned int);
extern void ptrace_disable(struct task_struct *);
extern int ptrace_check_attach(struct task_struct *task, int kill);
extern void __ptrace_link(struct task_struct *child,
struct task_struct *new_parent);
extern void __ptrace_unlink(struct task_struct *child);
/*
 * Conditional wrapper around __ptrace_link(): only a task that is
 * actually being traced (child->ptrace non-zero) gets linked under
 * new_parent.  Untraced tasks are left untouched.
 */
static inline void ptrace_link(struct task_struct *child,
			       struct task_struct *new_parent)
{
	/* Expected case: the task is not traced, so there is nothing to do. */
	if (!unlikely(child->ptrace))
		return;

	__ptrace_link(child, new_parent);
}
/*
 * Conditional wrapper around __ptrace_unlink(): undo the ptrace
 * linkage of a task, but only if the task is actually being traced
 * (child->ptrace non-zero).
 */
static inline void ptrace_unlink(struct task_struct *child)
{
	/* Expected case: the task is not traced, so there is nothing to undo. */
	if (!unlikely(child->ptrace))
		return;

	__ptrace_unlink(child);
}
#endif #endif
...@@ -270,6 +270,8 @@ struct task_struct { ...@@ -270,6 +270,8 @@ struct task_struct {
unsigned int time_slice, first_time_slice; unsigned int time_slice, first_time_slice;
struct list_head tasks; struct list_head tasks;
struct list_head ptrace_children;
struct list_head ptrace_list;
struct mm_struct *mm, *active_mm; struct mm_struct *mm, *active_mm;
struct list_head local_pages; struct list_head local_pages;
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <linux/acct.h> #include <linux/acct.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/binfmts.h> #include <linux/binfmts.h>
#include <linux/ptrace.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/pgtable.h> #include <asm/pgtable.h>
...@@ -65,6 +66,8 @@ static void release_task(struct task_struct * p) ...@@ -65,6 +66,8 @@ static void release_task(struct task_struct * p)
atomic_dec(&p->user->processes); atomic_dec(&p->user->processes);
security_ops->task_free_security(p); security_ops->task_free_security(p);
free_uid(p->user); free_uid(p->user);
BUG_ON(p->ptrace || !list_empty(&p->ptrace_list) ||
!list_empty(&p->ptrace_children));
unhash_process(p); unhash_process(p);
release_thread(p); release_thread(p);
...@@ -177,6 +180,7 @@ void reparent_to_init(void) ...@@ -177,6 +180,7 @@ void reparent_to_init(void)
{ {
write_lock_irq(&tasklist_lock); write_lock_irq(&tasklist_lock);
ptrace_unlink(current);
/* Reparent to init */ /* Reparent to init */
REMOVE_LINKS(current); REMOVE_LINKS(current);
current->parent = child_reaper; current->parent = child_reaper;
...@@ -231,31 +235,8 @@ void daemonize(void) ...@@ -231,31 +235,8 @@ void daemonize(void)
atomic_inc(&current->files->count); atomic_inc(&current->files->count);
} }
/* static void reparent_thread(task_t *p, task_t *reaper, task_t *child_reaper)
* When we die, we re-parent all our children.
* Try to give them to another thread in our thread
* group, and if no such member exists, give it to
* the global child reaper process (ie "init")
*/
static inline void forget_original_parent(struct task_struct * father)
{ {
struct task_struct * p, *reaper;
read_lock(&tasklist_lock);
/* Next in our thread group, if they're not already exiting */
reaper = father;
do {
reaper = next_thread(reaper);
if (!(reaper->flags & PF_EXITING))
break;
} while (reaper != father);
if (reaper == father)
reaper = child_reaper;
for_each_task(p) {
if (p->real_parent == father) {
/* We dont want people slaying init */ /* We dont want people slaying init */
p->exit_signal = SIGCHLD; p->exit_signal = SIGCHLD;
p->self_exec_id++; p->self_exec_id++;
...@@ -266,10 +247,8 @@ static inline void forget_original_parent(struct task_struct * father) ...@@ -266,10 +247,8 @@ static inline void forget_original_parent(struct task_struct * father)
else else
p->real_parent = reaper; p->real_parent = reaper;
if (p->pdeath_signal) send_sig(p->pdeath_signal, p, 0); if (p->pdeath_signal)
} send_sig(p->pdeath_signal, p, 0);
}
read_unlock(&tasklist_lock);
} }
static inline void close_files(struct files_struct * files) static inline void close_files(struct files_struct * files)
...@@ -419,13 +398,86 @@ void exit_mm(struct task_struct *tsk) ...@@ -419,13 +398,86 @@ void exit_mm(struct task_struct *tsk)
__exit_mm(tsk); __exit_mm(tsk);
} }
/*
 * When we die, we re-parent all our children.
 * Try to give them to another thread in our thread
 * group, and if no such member exists, give it to
 * the global child reaper process (ie "init")
 */
static inline void forget_original_parent(struct task_struct * father)
{
	struct task_struct *reaper, *child;
	list_t *pos;

	read_lock(&tasklist_lock);

	/*
	 * Walk the thread group, starting with the thread after us, and
	 * stop at the first member that is not already exiting.
	 */
	reaper = next_thread(father);
	while (reaper != father && (reaper->flags & PF_EXITING))
		reaper = next_thread(reaper);

	/* Came full circle: no group member can adopt, use the child reaper. */
	if (reaper == father)
		reaper = child_reaper;

	/*
	 * Our children can only be in two places:
	 *
	 *  - on our own child list (father->children)
	 *  - on our per-task ptrace list (father->ptrace_children)
	 *
	 * Visit both lists and reparent every task found there.
	 */
	list_for_each(pos, &father->children) {
		child = list_entry(pos, struct task_struct, sibling);
		reparent_thread(child, reaper, child_reaper);
	}
	list_for_each(pos, &father->ptrace_children) {
		child = list_entry(pos, struct task_struct, ptrace_list);
		reparent_thread(child, reaper, child_reaper);
	}

	read_unlock(&tasklist_lock);
}
/*
 * Hand one task (p) back to its real parent while the current task exits.
 *
 * Steps, in order:
 *  - drop any ptrace linkage of p,
 *  - take p off whatever sibling list it is on and queue it at the tail
 *    of p->real_parent's children list,
 *  - if p is already a zombie with an exit signal, notify its new parent,
 *  - run the orphaned process group check for p's pgrp.
 *
 * The body releases and re-acquires tasklist_lock (write, irq) around the
 * process group check, so it presumably must be entered with that lock
 * write-held, and list state may have changed by the time it returns.
 *
 * NOTE(review): 'father' is not referenced in the body; the pgrp/session
 * comparison is made against 'current'.
 */
static inline void zap_thread(task_t *p, task_t *father)
{
/* Clears p->ptrace and moves p back under its real parent if it was traced. */
ptrace_unlink(p);
list_del_init(&p->sibling);
/* ptrace_unlink() only acts when p->ptrace was set; make it zero regardless. */
p->ptrace = 0;
p->parent = p->real_parent;
list_add_tail(&p->sibling, &p->parent->children);
/* An already-dead child with an exit signal must be reported to its new parent. */
if (p->state == TASK_ZOMBIE && p->exit_signal != -1)
do_notify_parent(p, p->exit_signal);
/*
* process group orphan check
* Case ii: Our child is in a different pgrp
* than we are, and it was the only connection
* outside, so the child pgrp is now orphaned.
*/
if ((p->pgrp != current->pgrp) &&
(p->session == current->session)) {
/* Remember the pgrp by value: p is not touched while the lock is dropped. */
int pgrp = p->pgrp;
write_unlock_irq(&tasklist_lock);
if (is_orphaned_pgrp(pgrp) && has_stopped_jobs(pgrp)) {
kill_pg(pgrp,SIGHUP,1);
kill_pg(pgrp,SIGCONT,1);
}
write_lock_irq(&tasklist_lock);
}
}
/* /*
* Send signals to all our closest relatives so that they know * Send signals to all our closest relatives so that they know
* to properly mourn us.. * to properly mourn us..
*/ */
static void exit_notify(void) static void exit_notify(void)
{ {
struct task_struct * p, *t; struct task_struct *t;
list_t *_p, *_n;
forget_original_parent(current); forget_original_parent(current);
/* /*
...@@ -484,33 +536,20 @@ static void exit_notify(void) ...@@ -484,33 +536,20 @@ static void exit_notify(void)
current->state = TASK_ZOMBIE; current->state = TASK_ZOMBIE;
if (current->exit_signal != -1) if (current->exit_signal != -1)
do_notify_parent(current, current->exit_signal); do_notify_parent(current, current->exit_signal);
while ((p = eldest_child(current))) {
list_del_init(&p->sibling);
p->ptrace = 0;
p->parent = p->real_parent; zap_again:
list_add_tail(&p->sibling,&p->parent->children); list_for_each_safe(_p, _n, &current->children)
if (p->state == TASK_ZOMBIE && p->exit_signal != -1) zap_thread(list_entry(_p,struct task_struct,sibling), current);
do_notify_parent(p, p->exit_signal); list_for_each_safe(_p, _n, &current->ptrace_children)
zap_thread(list_entry(_p,struct task_struct,ptrace_list), current);
/* /*
* process group orphan check * reparent_thread might drop the tasklist lock, thus we could
* Case ii: Our child is in a different pgrp * have new children queued back from the ptrace list into the
* than we are, and it was the only connection * child list:
* outside, so the child pgrp is now orphaned.
*/ */
if ((p->pgrp != current->pgrp) && if (unlikely(!list_empty(&current->children) ||
(p->session == current->session)) { !list_empty(&current->ptrace_children)))
int pgrp = p->pgrp; goto zap_again;
write_unlock_irq(&tasklist_lock);
if (is_orphaned_pgrp(pgrp) && has_stopped_jobs(pgrp)) {
kill_pg(pgrp,SIGHUP,1);
kill_pg(pgrp,SIGCONT,1);
}
write_lock_irq(&tasklist_lock);
}
}
/* /*
* No need to unlock IRQs, we'll schedule() immediately * No need to unlock IRQs, we'll schedule() immediately
* anyway. In the preemption case this also makes it * anyway. In the preemption case this also makes it
...@@ -623,6 +662,12 @@ asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struc ...@@ -623,6 +662,12 @@ asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struc
if (p->pgrp != -pid) if (p->pgrp != -pid)
continue; continue;
} }
/*
* Do not consider detached threads that are
* not ptraced:
*/
if (p->exit_signal == -1 && !p->ptrace)
continue;
/* Wait for all children (clone and not) if __WALL is set; /* Wait for all children (clone and not) if __WALL is set;
* otherwise, wait for clone children *only* if __WCLONE is * otherwise, wait for clone children *only* if __WCLONE is
* set; otherwise, wait for non-clone children *only*. (Note: * set; otherwise, wait for non-clone children *only*. (Note:
...@@ -667,7 +712,7 @@ asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struc ...@@ -667,7 +712,7 @@ asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struc
if (retval) if (retval)
goto end_wait4; goto end_wait4;
retval = p->pid; retval = p->pid;
if (p->real_parent != p->parent) { if (p->real_parent != p->parent || p->ptrace) {
write_lock_irq(&tasklist_lock); write_lock_irq(&tasklist_lock);
remove_parent(p); remove_parent(p);
p->parent = p->real_parent; p->parent = p->real_parent;
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/futex.h> #include <linux/futex.h>
#include <linux/ptrace.h>
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <asm/pgalloc.h> #include <asm/pgalloc.h>
...@@ -808,6 +809,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, ...@@ -808,6 +809,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
*/ */
p->tgid = p->pid; p->tgid = p->pid;
INIT_LIST_HEAD(&p->thread_group); INIT_LIST_HEAD(&p->thread_group);
INIT_LIST_HEAD(&p->ptrace_children);
INIT_LIST_HEAD(&p->ptrace_list);
/* Need tasklist lock for parent etc handling! */ /* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock); write_lock_irq(&tasklist_lock);
...@@ -827,6 +830,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, ...@@ -827,6 +830,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
} }
SET_LINKS(p); SET_LINKS(p);
ptrace_link(p, p->parent);
hash_pid(p); hash_pid(p);
nr_threads++; nr_threads++;
write_unlock_irq(&tasklist_lock); write_unlock_irq(&tasklist_lock);
......
...@@ -13,10 +13,48 @@ ...@@ -13,10 +13,48 @@
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/smp_lock.h> #include <linux/smp_lock.h>
#include <linux/ptrace.h>
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
/*
 * ptrace a task: make the debugger its new parent and
 * move it to the ptrace list.
 *
 * If new_parent already is the child's parent (for example a process
 * tracing its own child), no relinking is needed: the child stays on
 * its parent's ordinary children list and is not put on any ptrace
 * list.  __ptrace_unlink() copes with an empty ptrace_list, so the two
 * operations remain symmetric.
 *
 * Must be called with the tasklist lock write-held.
 */
void __ptrace_link(task_t *child, task_t *new_parent)
{
	/* A task may sit on at most one ptrace list at a time. */
	if (!list_empty(&child->ptrace_list))
		BUG();

	/*
	 * Tracing by the current parent is a legitimate request, not a
	 * kernel bug: crashing here (with the tasklist lock held) would
	 * let any process take the machine down by attaching to its own
	 * child.  There is simply nothing to move.
	 */
	if (child->parent == new_parent)
		return;

	/* Remember the child on its current parent's ptrace list ... */
	list_add(&child->ptrace_list, &child->parent->ptrace_children);

	/* ... and re-thread it into the task tree under the debugger. */
	REMOVE_LINKS(child);
	child->parent = new_parent;
	SET_LINKS(child);
}
/*
 * unptrace a task: move it back to its original parent and
 * remove it from the ptrace list.
 *
 * Calling this on a task that is not traced is a bug.  A traced task
 * that is not on any ptrace list only has its ptrace state cleared.
 *
 * Must be called with the tasklist lock write-held.
 */
void __ptrace_unlink(task_t *child)
{
	if (!child->ptrace)
		BUG();

	child->ptrace = 0;

	/* Only a task that was moved to a ptrace list needs moving back. */
	if (!list_empty(&child->ptrace_list)) {
		list_del_init(&child->ptrace_list);
		REMOVE_LINKS(child);
		child->parent = child->real_parent;
		SET_LINKS(child);
	}
}
/* /*
* Check that we have indeed attached to the thing.. * Check that we have indeed attached to the thing..
*/ */
...@@ -75,11 +113,7 @@ int ptrace_attach(struct task_struct *task) ...@@ -75,11 +113,7 @@ int ptrace_attach(struct task_struct *task)
task_unlock(task); task_unlock(task);
write_lock_irq(&tasklist_lock); write_lock_irq(&tasklist_lock);
if (task->parent != current) { __ptrace_link(task, current);
REMOVE_LINKS(task);
task->parent = current;
SET_LINKS(task);
}
write_unlock_irq(&tasklist_lock); write_unlock_irq(&tasklist_lock);
send_sig(SIGSTOP, task, 1); send_sig(SIGSTOP, task, 1);
...@@ -99,16 +133,15 @@ int ptrace_detach(struct task_struct *child, unsigned int data) ...@@ -99,16 +133,15 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
ptrace_disable(child); ptrace_disable(child);
/* .. re-parent .. */ /* .. re-parent .. */
child->ptrace = 0;
child->exit_code = data; child->exit_code = data;
write_lock_irq(&tasklist_lock);
REMOVE_LINKS(child);
child->parent = child->real_parent;
SET_LINKS(child);
write_unlock_irq(&tasklist_lock);
write_lock_irq(&tasklist_lock);
__ptrace_unlink(child);
/* .. and wake it up. */ /* .. and wake it up. */
if (child->state != TASK_ZOMBIE)
wake_up_process(child); wake_up_process(child);
write_unlock_irq(&tasklist_lock);
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment