Commit 64cf8edb authored by Ingo Molnar

[PATCH] generic-pidhash-2.5.36-J2, BK-curr

This is the latest version of the generic pidhash patch.  The biggest
change is the removal of separately allocated pid structures: they are
now part of the task structure, and the first task that uses a PID
provides the pid structure.  Task refcounting ensures that the task
structure is not freed before every member of its process group or
session has exited.

Beyond the performance gains, this approach has a number of advantages.
It simplifies the hashing code significantly, and attach_pid() is now
fundamentally atomic and can be called during create_process() without
worrying about task-list side-effects.  It no longer has to re-search
the pidhash to detect a raced PID addition, and attach_pid() cannot
fail due to OOM.  detach_pid() can do a simple put_task_struct()
instead of a kmem_cache_free().

The only minor downside is that task structures can linger after their
session or process-group leader has exited - but the number of orphan
sessions and process groups is usually very low, and even when it is
higher, this can be regarded as a slow execution of the final
deallocation of the session leader rather than an additional burden.
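
[Editorial illustration, not part of the original changelog.]  With the
interfaces this patch introduces, an open-coded for_each_process() scan
over a process group collapses into a walk of the per-PID task list.
A minimal sketch, assuming the for_each_task_pid() macro, struct pid
and PIDTYPE_PGID from the new include/linux/pid.h, and a hypothetical
per-task action; as the header notes, the walk must run under the
tasklist_lock:

        struct task_struct *p;
        struct list_head *l;
        struct pid *pid;

        read_lock(&tasklist_lock);
        /* visit every task attached to process group 'pgrp' */
        for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid)
                handle_task(p);         /* hypothetical per-task action */
        read_unlock(&tasklist_lock);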
parent a3a132ea
@@ -609,8 +609,6 @@ static inline int de_thread(struct signal_struct *oldsig)
         ptrace_unlink(leader);
         ptrace_unlink(current);
-        unhash_pid(current);
-        unhash_pid(leader);
         remove_parent(current);
         remove_parent(leader);
         /*
@@ -631,8 +629,6 @@ static inline int de_thread(struct signal_struct *oldsig)
                 current->ptrace = ptrace;
                 __ptrace_link(current, parent);
         }
-        hash_pid(current);
-        hash_pid(leader);
         list_add_tail(&current->tasks, &init_task.tasks);
         state = leader->state;
...
@@ -480,7 +480,9 @@ static void send_sigio_to_task(struct task_struct *p,
 void send_sigio(struct fown_struct *fown, int fd, int band)
 {
-        struct task_struct * p;
+        struct task_struct *p;
+        struct list_head *l;
+        struct pid *pidptr;
         int pid;
 
         read_lock(&fown->lock);
@@ -493,14 +495,8 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
                 send_sigio_to_task(p, fown, fd, band);
                 goto out_unlock_task;
         }
-        for_each_process(p) {
-                int match = p->pid;
-                if (pid < 0)
-                        match = -p->pgrp;
-                if (pid != match)
-                        continue;
-                send_sigio_to_task(p, fown, fd, band);
-        }
+        for_each_task_pid(-pid, PIDTYPE_PGID, p, l, pidptr)
+                send_sigio_to_task(p, fown, fd, band);
 out_unlock_task:
         read_unlock(&tasklist_lock);
 out_unlock_fown:
...
@@ -195,6 +195,10 @@ static inline void list_splice_init(struct list_head *list,
 #define list_for_each(pos, head) \
         for (pos = (head)->next, prefetch(pos->next); pos != (head); \
                 pos = pos->next, prefetch(pos->next))
+
+#define list_for_each_noprefetch(pos, head) \
+        for (pos = (head)->next; pos != (head); pos = pos->next)
+
 /**
  * list_for_each_prev - iterate over a list backwards
  * @pos:        the &struct list_head to use as a loop counter.
...
#ifndef _LINUX_PID_H
#define _LINUX_PID_H
enum pid_type
{
PIDTYPE_PID,
PIDTYPE_PGID,
PIDTYPE_SID,
PIDTYPE_MAX
};
struct pid
{
int nr;
atomic_t count;
struct task_struct *task;
struct list_head task_list;
struct list_head hash_chain;
};
struct pid_link
{
struct list_head pid_chain;
struct pid *pidptr;
struct pid pid;
};
#define pid_task(elem, type) \
list_entry(elem, struct task_struct, pids[type].pid_chain)
/*
* attach_pid() must be called with the tasklist_lock write-held.
*
* It might unlock the tasklist_lock for allocation, so this
* function must be called after installing all other links of
* a new task.
*/
extern int FASTCALL(attach_pid(struct task_struct *, enum pid_type, int));
/*
* detach_pid() must be called with the tasklist_lock write-held.
*/
extern void FASTCALL(detach_pid(struct task_struct *task, enum pid_type));
/*
* look up a PID in the hash table. Must be called with the tasklist_lock
* held.
*/
extern struct pid *FASTCALL(find_pid(enum pid_type, int));
extern int alloc_pidmap(void);
extern void FASTCALL(free_pidmap(int));
#define for_each_task_pid(who, type, task, elem, pid)           \
        if ((pid = find_pid(type, who)))                        \
                for (elem = pid->task_list.next,                \
                        prefetch(elem->next),                   \
                        task = pid_task(elem, type);            \
                        elem != &pid->task_list;                \
                        elem = elem->next, prefetch(elem->next), \
                        task = pid_task(elem, type))
#endif /* _LINUX_PID_H */
...
@@ -28,6 +28,7 @@ extern unsigned long event;
 #include <linux/fs_struct.h>
 #include <linux/compiler.h>
 #include <linux/completion.h>
+#include <linux/pid.h>
 
 struct exec_domain;
@@ -266,6 +267,8 @@ struct user_struct {
          atomic_inc(&__user->__count); \
          __user; })
 
+extern struct user_struct *find_user(uid_t);
+
 extern struct user_struct root_user;
 #define INIT_USER (&root_user)
@@ -326,9 +329,8 @@ struct task_struct {
         struct task_struct *group_leader;
         struct list_head thread_group;
 
-        /* PID hash table linkage. */
-        struct task_struct *pidhash_next;
-        struct task_struct **pidhash_pprev;
+        /* PID/PID hash table linkage. */
+        struct pid_link pids[PIDTYPE_MAX];
 
         wait_queue_head_t wait_chldexit;        /* for wait4() */
         struct completion *vfork_done;          /* for vfork() */
@@ -474,38 +476,7 @@ extern struct task_struct init_task;
 extern struct mm_struct init_mm;
 
-/* PID hashing. (shouldnt this be dynamic?) */
-#define PIDHASH_SZ 8192
-extern struct task_struct *pidhash[PIDHASH_SZ];
-
-#define pid_hashfn(x)   ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
-
-static inline void hash_pid(struct task_struct *p)
-{
-        struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
-
-        if((p->pidhash_next = *htable) != NULL)
-                (*htable)->pidhash_pprev = &p->pidhash_next;
-        *htable = p;
-        p->pidhash_pprev = htable;
-}
-
-static inline void unhash_pid(struct task_struct *p)
-{
-        if(p->pidhash_next)
-                p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
-        *p->pidhash_pprev = p->pidhash_next;
-}
-
-static inline struct task_struct *find_task_by_pid(int pid)
-{
-        struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
-
-        for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
-                ;
-
-        return p;
-}
+extern struct task_struct *find_task_by_pid(int pid);
 
 /* per-UID process charging. */
 extern struct user_struct * alloc_uid(uid_t);
...
@@ -17,8 +17,13 @@
 #define MIN_THREADS_LEFT_FOR_ROOT 4
 
 /*
- * This controls the maximum pid allocated to a process
+ * This controls the default maximum pid allocated to a process
  */
-#define DEFAULT_PID_MAX 0x8000
+#define PID_MAX_DEFAULT 0x8000
+
+/*
+ * A maximum of 4 million PIDs should be enough for a while:
+ */
+#define PID_MAX_LIMIT (4*1024*1024)
 
 #endif
...
@@ -66,6 +66,7 @@ extern void sbus_init(void);
 extern void sysctl_init(void);
 extern void signals_init(void);
 extern void buffer_init(void);
+extern void pidhash_init(void);
 extern void pte_chain_init(void);
 extern void radix_tree_init(void);
 extern void free_initmem(void);
@@ -432,6 +433,7 @@ asmlinkage void __init start_kernel(void)
 #endif
         mem_init();
         kmem_cache_sizes_init();
+        pidhash_init();
         pgtable_cache_init();
         pte_chain_init();
         fork_init(num_physpages);
...
@@ -8,7 +8,7 @@ export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o \
 obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
             module.o exit.o itimer.o time.o softirq.o resource.o \
             sysctl.o capability.o ptrace.o timer.o user.o \
-            signal.o sys.o kmod.o context.o futex.o platform.o
+            signal.o sys.o kmod.o context.o futex.o platform.o pid.o
 
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o
...
@@ -33,7 +33,12 @@ static struct dentry * __unhash_process(struct task_struct *p)
 {
         struct dentry *proc_dentry;
 
         nr_threads--;
-        unhash_pid(p);
+        detach_pid(p, PIDTYPE_PID);
+        if (thread_group_leader(p)) {
+                detach_pid(p, PIDTYPE_PGID);
+                detach_pid(p, PIDTYPE_SID);
+        }
+
         REMOVE_LINKS(p);
         p->pid = 0;
         proc_dentry = p->proc_dentry;
@@ -109,22 +114,18 @@ void unhash_process(struct task_struct *p)
 int session_of_pgrp(int pgrp)
 {
         struct task_struct *p;
-        int fallback;
+        struct list_head *l;
+        struct pid *pid;
+        int sid = -1;
 
-        fallback = -1;
         read_lock(&tasklist_lock);
-        for_each_process(p) {
-                if (p->session <= 0)
-                        continue;
-                if (p->pgrp == pgrp) {
-                        fallback = p->session;
+        for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid)
+                if (p->session > 0) {
+                        sid = p->session;
                         break;
                 }
-                if (p->pid == pgrp)
-                        fallback = p->session;
-        }
         read_unlock(&tasklist_lock);
-        return fallback;
+        return sid;
 }
 
 /*
@@ -135,21 +136,25 @@ int session_of_pgrp(int pgrp)
  *
  * "I ask you, have you ever known what it is to be an orphan?"
  */
-static int __will_become_orphaned_pgrp(int pgrp, struct task_struct * ignored_task)
+static int __will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
 {
         struct task_struct *p;
+        struct list_head *l;
+        struct pid *pid;
+        int ret = 1;
 
-        for_each_process(p) {
-                if ((p == ignored_task) || (p->pgrp != pgrp) ||
-                    (p->state == TASK_ZOMBIE) ||
-                    (p->real_parent->pid == 1))
+        for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) {
+                if (p == ignored_task
+                                || p->state == TASK_ZOMBIE
+                                || p->real_parent->pid == 1)
                         continue;
-                if ((p->real_parent->pgrp != pgrp) &&
-                    (p->real_parent->session == p->session)) {
-                        return 0;
+                if (p->real_parent->pgrp != pgrp
+                            && p->real_parent->session == p->session) {
+                        ret = 0;
+                        break;
                 }
         }
-        return 1;       /* (sighing) "Often!" */
+        return ret;     /* (sighing) "Often!" */
 }
 
 static int will_become_orphaned_pgrp(int pgrp, struct task_struct * ignored_task)
@@ -171,11 +176,11 @@ int is_orphaned_pgrp(int pgrp)
 static inline int __has_stopped_jobs(int pgrp)
 {
         int retval = 0;
-        struct task_struct * p;
+        struct task_struct *p;
+        struct list_head *l;
+        struct pid *pid;
 
-        for_each_process(p) {
-                if (p->pgrp != pgrp)
-                        continue;
+        for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) {
                 if (p->state != TASK_STOPPED)
                         continue;
                 retval = 1;
@@ -605,7 +610,8 @@ NORET_TYPE void do_exit(long code)
         if (tsk->pid == 1)
                 panic("Attempted to kill init!");
         tsk->flags |= PF_EXITING;
-        del_timer_sync(&tsk->real_timer);
+        if (timer_pending(&tsk->real_timer))
+                del_timer_sync(&tsk->real_timer);
 
         if (unlikely(preempt_count()))
                 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
...
@@ -47,17 +47,6 @@ int nr_threads;
 int max_threads;
 unsigned long total_forks;      /* Handle normal Linux uptimes. */
 
-/*
- * Protects next_safe, last_pid and pid_max:
- */
-spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
-
-static int next_safe = DEFAULT_PID_MAX;
-int pid_max = DEFAULT_PID_MAX;
-int last_pid;
-
-struct task_struct *pidhash[PIDHASH_SZ];
-
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
 
 /*
@@ -75,16 +64,14 @@ void __put_task_struct(struct task_struct *tsk)
         } else {
                 int cpu = smp_processor_id();
 
-                tsk = task_cache[cpu];
+                tsk = xchg(task_cache + cpu, tsk);
                 if (tsk) {
                         free_thread_info(tsk->thread_info);
                         kmem_cache_free(task_struct_cachep,tsk);
                 }
-                task_cache[cpu] = current;
         }
 }
 
-/* Protects next_safe and last_pid. */
 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
 {
         unsigned long flags;
@@ -140,73 +127,28 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
         struct task_struct *tsk;
         struct thread_info *ti;
 
-        ti = alloc_thread_info();
-        if (!ti)
-                return NULL;
-
-        tsk = kmem_cache_alloc(task_struct_cachep, GFP_KERNEL);
+        tsk = xchg(task_cache + smp_processor_id(), NULL);
         if (!tsk) {
-                free_thread_info(ti);
-                return NULL;
-        }
+                ti = alloc_thread_info();
+                if (!ti)
+                        return NULL;
+
+                tsk = kmem_cache_alloc(task_struct_cachep, GFP_KERNEL);
+                if (!tsk) {
+                        free_thread_info(ti);
+                        return NULL;
+                }
+        } else
+                ti = tsk->thread_info;
 
         *ti = *orig->thread_info;
         *tsk = *orig;
         tsk->thread_info = ti;
         ti->task = tsk;
         atomic_set(&tsk->usage,1);
         return tsk;
 }
 
-static int get_pid(unsigned long flags)
-{
-        struct task_struct *g, *p;
-        int pid;
-
-        if (flags & CLONE_IDLETASK)
-                return 0;
-
-        spin_lock(&lastpid_lock);
-        if (++last_pid > pid_max) {
-                last_pid = 300;         /* Skip daemons etc. */
-                goto inside;
-        }
-
-        if (last_pid >= next_safe) {
-inside:
-                if (nr_threads > pid_max >> 4)
-                        pid_max <<= 1;
-                next_safe = pid_max;
-                read_lock(&tasklist_lock);
-        repeat:
-                do_each_thread(g, p) {
-                        if (p->pid == last_pid ||
-                            p->pgrp == last_pid ||
-                            p->session == last_pid) {
-                                if (++last_pid >= next_safe) {
-                                        if (last_pid >= pid_max)
-                                                last_pid = 300;
-                                        next_safe = pid_max;
-                                }
-                                goto repeat;
-                        }
-                        if (p->pid > last_pid && next_safe > p->pid)
-                                next_safe = p->pid;
-                        if (p->pgrp > last_pid && next_safe > p->pgrp)
-                                next_safe = p->pgrp;
-                        if (p->session > last_pid && next_safe > p->session)
-                                next_safe = p->session;
-                } while_each_thread(g, p);
-
-                read_unlock(&tasklist_lock);
-        }
-
-        pid = last_pid;
-        spin_unlock(&lastpid_lock);
-
-        return pid;
-}
-
 static inline int dup_mmap(struct mm_struct * mm)
 {
         struct vm_area_struct * mpnt, *tmp, **pprev;
@@ -726,7 +668,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         p->state = TASK_UNINTERRUPTIBLE;
 
         copy_flags(clone_flags, p);
-        p->pid = get_pid(clone_flags);
+        if (clone_flags & CLONE_IDLETASK)
+                p->pid = 0;
+        else {
+                p->pid = alloc_pidmap();
+                if (p->pid == -1)
+                        goto bad_fork_cleanup;
+        }
         p->proc_dentry = NULL;
 
         INIT_LIST_HEAD(&p->run_list);
@@ -889,7 +837,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         SET_LINKS(p);
         if (p->ptrace & PT_PTRACED)
                 __ptrace_link(p, current->parent);
-        hash_pid(p);
+
+        attach_pid(p, PIDTYPE_PID, p->pid);
+        if (thread_group_leader(p)) {
+                attach_pid(p, PIDTYPE_PGID, p->pgrp);
+                attach_pid(p, PIDTYPE_SID, p->session);
+        }
+
         nr_threads++;
         write_unlock_irq(&tasklist_lock);
         retval = 0;
@@ -914,6 +868,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 bad_fork_cleanup_security:
         security_ops->task_free_security(p);
 bad_fork_cleanup:
+        if (p->pid > 0)
+                free_pidmap(p->pid);
         put_exec_domain(p->thread_info->exec_domain);
         if (p->binfmt && p->binfmt->module)
                 __MOD_DEC_USE_COUNT(p->binfmt->module);
...
@@ -602,7 +602,6 @@ EXPORT_SYMBOL(init_task);
 EXPORT_SYMBOL(init_thread_union);
 
 EXPORT_SYMBOL(tasklist_lock);
-EXPORT_SYMBOL(pidhash);
 #if defined(CONFIG_SMP) && defined(__GENERIC_PER_CPU)
 EXPORT_SYMBOL(__per_cpu_offset);
 #endif
...
/*
* Generic pidhash and scalable, time-bounded PID allocator
*
* (C) 2002 William Irwin, IBM
* (C) 2002 Ingo Molnar, Red Hat
*
* pid-structures are backing objects for tasks sharing a given ID to chain
* against. There is very little to them aside from hashing them and
* parking tasks using given ID's on a list.
*
* The hash is always changed with the tasklist_lock write-acquired,
* and the hash is only accessed with the tasklist_lock at least
* read-acquired, so there's no additional SMP locking needed here.
*
* We have a list of bitmap pages, which bitmaps represent the PID space.
* Allocating and freeing PIDs is completely lockless. The worst-case
* allocation scenario when all but one out of 1 million PIDs possible are
* allocated already: the scanning of 32 list entries and at most PAGE_SIZE
* bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#define PIDHASH_SIZE 4096
#define pid_hashfn(nr) ((nr >> 8) ^ nr) & (PIDHASH_SIZE - 1)
static struct list_head pid_hash[PIDTYPE_MAX][PIDHASH_SIZE];
int pid_max = PID_MAX_DEFAULT;
int last_pid;
#define RESERVED_PIDS 300
#define PIDMAP_ENTRIES (PID_MAX_LIMIT/PAGE_SIZE/8)
#define BITS_PER_PAGE (PAGE_SIZE*8)
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
/*
* PID-map pages start out as NULL, they get allocated upon
* first use and are never deallocated. This way a low pid_max
* value does not cause lots of bitmaps to be allocated, but
* the scheme scales to up to 4 million PIDs, runtime.
*/
typedef struct pidmap {
atomic_t nr_free;
void *page;
} pidmap_t;
static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
{ [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
static pidmap_t *map_limit = pidmap_array + PIDMAP_ENTRIES;
inline void free_pidmap(int pid)
{
pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
int offset = pid & BITS_PER_PAGE_MASK;
clear_bit(offset, map->page);
atomic_inc(&map->nr_free);
}
/*
* Here we search for the next map that has free bits left.
* Normally the next map has free PIDs.
*/
static inline pidmap_t *next_free_map(pidmap_t *map, int *max_steps)
{
while (--*max_steps) {
if (++map == map_limit)
map = pidmap_array;
if (unlikely(!map->page)) {
unsigned long page = get_zeroed_page(GFP_KERNEL);
/*
* Free the page if someone raced with us
* installing it:
*/
if (cmpxchg(&map->page, NULL, page))
free_page(page);
if (!map->page)
break;
}
if (atomic_read(&map->nr_free))
return map;
}
return NULL;
}
int alloc_pidmap(void)
{
int pid, offset, max_steps = PIDMAP_ENTRIES + 1;
pidmap_t *map;
pid = last_pid + 1;
if (pid >= pid_max)
pid = RESERVED_PIDS;
offset = pid & BITS_PER_PAGE_MASK;
map = pidmap_array + pid / BITS_PER_PAGE;
if (likely(map->page && !test_and_set_bit(offset, map->page))) {
/*
* There is a small window for last_pid updates to race,
* but in that case the next allocation will go into the
* slowpath and that fixes things up.
*/
return_pid:
atomic_dec(&map->nr_free);
last_pid = pid;
return pid;
}
if (!offset || !atomic_read(&map->nr_free)) {
next_map:
map = next_free_map(map, &max_steps);
if (!map)
goto failure;
offset = 0;
}
/*
* Find the next zero bit:
*/
scan_more:
offset = find_next_zero_bit(map->page, BITS_PER_PAGE, offset);
if (offset == BITS_PER_PAGE)
goto next_map;
if (test_and_set_bit(offset, map->page))
goto scan_more;
/* we got the PID: */
pid = (map - pidmap_array) * BITS_PER_PAGE + offset;
goto return_pid;
failure:
return -1;
}
inline struct pid *find_pid(enum pid_type type, int nr)
{
struct list_head *elem, *bucket = &pid_hash[type][pid_hashfn(nr)];
struct pid *pid;
list_for_each_noprefetch(elem, bucket) {
pid = list_entry(elem, struct pid, hash_chain);
if (pid->nr == nr)
return pid;
}
return NULL;
}
int attach_pid(task_t *task, enum pid_type type, int nr)
{
struct pid *pid = find_pid(type, nr);
if (pid)
atomic_inc(&pid->count);
else {
pid = &task->pids[type].pid;
pid->nr = nr;
atomic_set(&pid->count, 1);
INIT_LIST_HEAD(&pid->task_list);
pid->task = current;
get_task_struct(current);
list_add(&pid->hash_chain, &pid_hash[type][pid_hashfn(nr)]);
}
list_add(&task->pids[type].pid_chain, &pid->task_list);
task->pids[type].pidptr = pid;
return 0;
}
void detach_pid(task_t *task, enum pid_type type)
{
struct pid_link *link = task->pids + type;
struct pid *pid = link->pidptr;
int nr;
list_del(&link->pid_chain);
if (!atomic_dec_and_test(&pid->count))
return;
nr = pid->nr;
list_del(&pid->hash_chain);
put_task_struct(pid->task);
for (type = 0; type < PIDTYPE_MAX; ++type)
if (find_pid(type, nr))
return;
free_pidmap(nr);
}
extern task_t *find_task_by_pid(int nr)
{
struct pid *pid = find_pid(PIDTYPE_PID, nr);
if (!pid)
return NULL;
return pid_task(pid->task_list.next, PIDTYPE_PID);
}
void __init pidhash_init(void)
{
int i, j;
/*
* Allocate PID 0, and hash it via all PID types:
*/
pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
set_bit(0, pidmap_array->page);
atomic_dec(&pidmap_array->nr_free);
for (i = 0; i < PIDTYPE_MAX; i++) {
for (j = 0; j < PIDHASH_SIZE; j++)
INIT_LIST_HEAD(&pid_hash[i][j]);
attach_pid(current, i, 0);
}
}
...
@@ -943,18 +943,18 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
 {
-        int retval = -EINVAL;
-        if (pgrp > 0) {
-                struct task_struct *p;
+        struct task_struct *p;
+        struct list_head *l;
+        struct pid *pid;
+        int err, retval = -ESRCH;
 
-                retval = -ESRCH;
-                for_each_process(p) {
-                        if (p->pgrp == pgrp) {
-                                int err = send_sig_info(sig, info, p);
-                                if (retval)
-                                        retval = err;
-                        }
-                }
+        if (pgrp <= 0)
+                return -EINVAL;
+
+        for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) {
+                err = send_sig_info(sig, info, p);
+                if (retval)
+                        retval = err;
         }
         return retval;
 }
@@ -977,28 +977,33 @@ kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
  * the connection is lost.
  */
 int
-kill_sl_info(int sig, struct siginfo *info, pid_t sess)
+kill_sl_info(int sig, struct siginfo *info, pid_t sid)
 {
-        int retval = -EINVAL;
-        if (sess > 0) {
-                struct task_struct *p;
+        int err, retval = -EINVAL;
+        struct pid *pid;
+        struct list_head *l;
+        struct task_struct *p;
 
-                retval = -ESRCH;
-                read_lock(&tasklist_lock);
-                for_each_process(p) {
-                        if (p->leader && p->session == sess) {
-                                int err = send_sig_info(sig, info, p);
-                                if (retval)
-                                        retval = err;
-                        }
-                }
-                read_unlock(&tasklist_lock);
+        if (sid <= 0)
+                goto out;
+
+        retval = -ESRCH;
+        read_lock(&tasklist_lock);
+        for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) {
+                if (!p->leader)
+                        continue;
+                err = send_sig_info(sig, info, p);
+                if (retval)
+                        retval = err;
         }
+        read_unlock(&tasklist_lock);
+out:
         return retval;
 }
 
-inline int
+int
 kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 {
         int error;
...
@@ -203,35 +203,34 @@ cond_syscall(sys_nfsservctl)
 cond_syscall(sys_quotactl)
 cond_syscall(sys_acct)
 
-static int proc_sel(struct task_struct *p, int which, int who)
+static int set_one_prio(struct task_struct *p, int niceval, int error)
 {
-        if(p->pid)
-        {
-                switch (which) {
-                        case PRIO_PROCESS:
-                                if (!who && p == current)
-                                        return 1;
-                                return(p->pid == who);
-                        case PRIO_PGRP:
-                                if (!who)
-                                        who = current->pgrp;
-                                return(p->pgrp == who);
-                        case PRIO_USER:
-                                if (!who)
-                                        who = current->uid;
-                                return(p->uid == who);
-                }
+        if (p->uid != current->euid &&
+            p->uid != current->uid && !capable(CAP_SYS_NICE)) {
+                error = -EPERM;
+                goto out;
         }
-        return 0;
+        if (error == -ESRCH)
+                error = 0;
+        if (niceval < task_nice(p) && !capable(CAP_SYS_NICE))
+                error = -EACCES;
+        else
+                set_user_nice(p, niceval);
+out:
+        return error;
 }
 
 asmlinkage long sys_setpriority(int which, int who, int niceval)
 {
         struct task_struct *g, *p;
-        int error;
+        struct user_struct *user;
+        struct pid *pid;
+        struct list_head *l;
+        int error = -EINVAL;
 
         if (which > 2 || which < 0)
-                return -EINVAL;
+                goto out;
 
         /* normalize: avoid signed division (rounding problems) */
         error = -ESRCH;
@@ -241,31 +240,38 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
                 niceval = 19;
 
         read_lock(&tasklist_lock);
-        do_each_thread(g, p) {
-                int no_nice;
-                if (!proc_sel(p, which, who))
-                        continue;
-                if (p->uid != current->euid &&
-                    p->uid != current->uid && !capable(CAP_SYS_NICE)) {
-                        error = -EPERM;
-                        continue;
-                }
-                if (error == -ESRCH)
-                        error = 0;
-                if (niceval < task_nice(p) && !capable(CAP_SYS_NICE)) {
-                        error = -EACCES;
-                        continue;
-                }
-                no_nice = security_ops->task_setnice(p, niceval);
-                if (no_nice) {
-                        error = no_nice;
-                        continue;
-                }
-                set_user_nice(p, niceval);
-        } while_each_thread(g, p);
+        switch (which) {
+                case PRIO_PROCESS:
+                        if (!who)
+                                who = current->pid;
+                        p = find_task_by_pid(who);
+                        if (p)
+                                error = set_one_prio(p, niceval, error);
+                        break;
+                case PRIO_PGRP:
+                        if (!who)
+                                who = current->pgrp;
+                        for_each_task_pid(who, PIDTYPE_PGID, p, l, pid)
+                                error = set_one_prio(p, niceval, error);
+                        break;
+                case PRIO_USER:
+                        if (!who)
+                                user = current->user;
+                        else
+                                user = find_user(who);
+
+                        if (!user)
+                                goto out_unlock;
+
+                        do_each_thread(g, p)
+                                if (p->uid == who)
+                                        error = set_one_prio(p, niceval, error);
+                        while_each_thread(g, p);
+                        break;
+        }
+out_unlock:
         read_unlock(&tasklist_lock);
-
+out:
         return error;
 }
 
@@ -278,20 +284,54 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 asmlinkage long sys_getpriority(int which, int who)
 {
         struct task_struct *g, *p;
-        long retval = -ESRCH;
+        struct list_head *l;
+        struct pid *pid;
+        struct user_struct *user;
+        long niceval, retval = -ESRCH;
 
         if (which > 2 || which < 0)
                 return -EINVAL;
 
         read_lock(&tasklist_lock);
-        do_each_thread(g, p) {
-                long niceval;
-                if (!proc_sel(p, which, who))
-                        continue;
-                niceval = 20 - task_nice(p);
-                if (niceval > retval)
-                        retval = niceval;
-        } while_each_thread(g, p);
+        switch (which) {
+                case PRIO_PROCESS:
+                        if (!who)
+                                who = current->pid;
+                        p = find_task_by_pid(who);
+                        if (p) {
+                                niceval = 20 - task_nice(p);
+                                if (niceval > retval)
+                                        retval = niceval;
+                        }
+                        break;
+                case PRIO_PGRP:
+                        if (!who)
+                                who = current->pgrp;
+                        for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) {
+                                niceval = 20 - task_nice(p);
+                                if (niceval > retval)
+                                        retval = niceval;
+                        }
+                        break;
+                case PRIO_USER:
+                        if (!who)
+                                user = current->user;
+                        else
+                                user = find_user(who);
+
+                        if (!user)
+                                goto out_unlock;
+
+                        do_each_thread(g, p)
+                                if (p->uid == who) {
+                                        niceval = 20 - task_nice(p);
+                                        if (niceval > retval)
+                                                retval = niceval;
+                                }
+                        while_each_thread(g, p);
+                        break;
+        }
+out_unlock:
         read_unlock(&tasklist_lock);
 
         return retval;
@@ -849,7 +889,7 @@ asmlinkage long sys_times(struct tms * tbuf)
 asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 {
-        struct task_struct * p;
+        struct task_struct *p;
         int err = -EINVAL;
 
         if (!pid)
@@ -862,12 +902,15 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
         /* From this point forward we keep holding onto the tasklist lock
          * so that our parent does not change from under us. -DaveM
          */
-        read_lock(&tasklist_lock);
+        write_lock_irq(&tasklist_lock);
 
         err = -ESRCH;
         p = find_task_by_pid(pid);
         if (!p)
                 goto out;
+        err = -EINVAL;
+        if (!thread_group_leader(p))
+                goto out;
 
         if (p->parent == current || p->real_parent == current) {
                 err = -EPERM;
@@ -882,25 +925,26 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
         if (p->leader)
                 goto out;
         if (pgid != pid) {
-                struct task_struct *g, *tmp;
-                do_each_thread(g, tmp) {
-                        if (tmp->pgrp == pgid &&
-                            tmp->session == current->session)
+                struct task_struct *p;
+                struct pid *pid;
+                struct list_head *l;
+
+                for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid)
+                        if (p->session == current->session)
                                 goto ok_pgid;
-                } while_each_thread(g, tmp);
                 goto out;
         }
 
 ok_pgid:
-        err = security_ops->task_setpgid(p, pgid);
-        if (err)
-                goto out;
-
-        p->pgrp = pgid;
+        if (p->pgrp != pgid) {
+                detach_pid(p, PIDTYPE_PGID);
+                p->pgrp = pgid;
+                attach_pid(p, PIDTYPE_PGID, pgid);
+        }
         err = 0;
 out:
         /* All paths lead to here, thus we are safe. -DaveM */
-        read_unlock(&tasklist_lock);
+        write_unlock_irq(&tasklist_lock);
         return err;
 }
@@ -956,22 +1000,34 @@ asmlinkage long sys_getsid(pid_t pid)
 asmlinkage long sys_setsid(void)
 {
-        struct task_struct *g, *p;
+        struct pid *pid;
         int err = -EPERM;
 
-        read_lock(&tasklist_lock);
-        do_each_thread(g, p)
-                if (p->pgrp == current->pid)
-                        goto out;
-        while_each_thread(g, p);
+        if (!thread_group_leader(current))
+                return -EINVAL;
+
+        write_lock_irq(&tasklist_lock);
+
+        pid = find_pid(PIDTYPE_PGID, current->pid);
+        if (pid)
+                goto out;
 
         current->leader = 1;
-        current->session = current->pgrp = current->pid;
+        if (current->session != current->pid) {
+                detach_pid(current, PIDTYPE_SID);
+                current->session = current->pid;
+                attach_pid(current, PIDTYPE_SID, current->pid);
+        }
+        if (current->pgrp != current->pid) {
+                detach_pid(current, PIDTYPE_PGID);
+                current->pgrp = current->pid;
+                attach_pid(current, PIDTYPE_PGID, current->pid);
+        }
         current->tty = NULL;
         current->tty_old_pgrp = 0;
         err = current->pgrp;
 out:
-        read_unlock(&tasklist_lock);
+        write_unlock_irq(&tasklist_lock);
         return err;
 }
...
@@ -64,6 +64,11 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *has
         return NULL;
 }
 
+struct user_struct *find_user(uid_t uid)
+{
+        return uid_hash_find(uid, uidhashentry(uid));
+}
+
 void free_uid(struct user_struct *up)
 {
         if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
...