Commit cd4699c5 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'prlimit-tasklist_lock-for-v5.18' of...

Merge tag 'prlimit-tasklist_lock-for-v5.18' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull tasklist_lock optimizations from Eric Biederman:
 "prlimit and getpriority tasklist_lock optimizations

  The tasklist_lock popped up as a scalability bottleneck on some
  testing workloads. The readlocks in do_prlimit and set/getpriority are
  not necessary in all cases.

  Based on a cycles profile, it looked like ~87% of the time was spent
  in the kernel, ~42% of which was just trying to get *some* spinlock
  (queued_spin_lock_slowpath, not necessarily the tasklist_lock).

  The big offenders (with rough percentages in cycles of the overall
  trace):
   - do_wait 11%
   - setpriority 8% (done previously in commit 7f8ca0ed)
   - kill 8%
   - do_exit 5%
   - clone 3%
   - prlimit64 2%   (this patchset)
   - getrlimit 1%   (this patchset)

  I can't easily test this patchset on the original workload for various
  reasons. Instead, I used the microbenchmark below to at least verify
  there was some improvement. This patchset had a 28% speedup (12% from
  baseline to set/getprio, then another 14% for prlimit).

  This series used to do the setpriority case, but an almost identical
  change was merged as commit 7f8ca0ed ("kernel/sys.c: only take
  tasklist_lock for get/setpriority(PRIO_PGRP)") so that has been
  dropped from here.

  One interesting thing is that my libc's getrlimit() was calling
  prlimit64, so hoisting the read_lock(tasklist_lock) into sys_prlimit64
  had no effect - it essentially optimized the older syscalls only. I
  didn't do that in this patchset, but figured I'd mention it since it
  was an option from the previous patch's discussion"

micobenchmark.c:
---------------
	int main(int argc, char **argv)
	{
		pid_t child;
		struct rlimit rlim[1];

		fork(); fork(); fork(); fork(); fork(); fork();

		for (int i = 0; i < 5000; i++) {
			child = fork();
			if (child < 0)
				exit(1);
			if (child > 0) {
				usleep(1000);
				kill(child, SIGTERM);
				waitpid(child, NULL, 0);
			} else {
				for (;;) {
					setpriority(PRIO_PROCESS, 0,
						    getpriority(PRIO_PROCESS, 0));
					getrlimit(RLIMIT_CPU, rlim);
				}
			}
		}

		return 0;
	}

Link: https://lore.kernel.org/lkml/20211213220401.1039578-1-brho@google.com/ [v1]
Link: https://lore.kernel.org/lkml/20220105212828.197013-1-brho@google.com/ [v2]
Link: https://lore.kernel.org/lkml/20220106172041.522167-1-brho@google.com/ [v3]

* tag 'prlimit-tasklist_lock-for-v5.18' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
  prlimit: do not grab the tasklist_lock
  prlimit: make do_prlimit() static
parents 2e2d4650 18c91bb2
...@@ -253,7 +253,7 @@ void posix_cpu_timers_exit_group(struct task_struct *task); ...@@ -253,7 +253,7 @@ void posix_cpu_timers_exit_group(struct task_struct *task);
void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
u64 *newval, u64 *oldval); u64 *newval, u64 *oldval);
void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
void posixtimer_rearm(struct kernel_siginfo *info); void posixtimer_rearm(struct kernel_siginfo *info);
#endif #endif
...@@ -8,7 +8,5 @@ ...@@ -8,7 +8,5 @@
struct task_struct; struct task_struct;
void getrusage(struct task_struct *p, int who, struct rusage *ru); void getrusage(struct task_struct *p, int who, struct rusage *ru);
int do_prlimit(struct task_struct *tsk, unsigned int resource,
struct rlimit *new_rlim, struct rlimit *old_rlim);
#endif #endif
...@@ -1424,6 +1424,68 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) ...@@ -1424,6 +1424,68 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
return errno; return errno;
} }
/* make sure you are allowed to change @tsk limits before calling this */
static int do_prlimit(struct task_struct *tsk, unsigned int resource,
struct rlimit *new_rlim, struct rlimit *old_rlim)
{
struct rlimit *rlim;
int retval = 0;
if (resource >= RLIM_NLIMITS)
return -EINVAL;
if (new_rlim) {
if (new_rlim->rlim_cur > new_rlim->rlim_max)
return -EINVAL;
if (resource == RLIMIT_NOFILE &&
new_rlim->rlim_max > sysctl_nr_open)
return -EPERM;
}
/* Holding a refcount on tsk protects tsk->signal from disappearing. */
rlim = tsk->signal->rlim + resource;
task_lock(tsk->group_leader);
if (new_rlim) {
/*
* Keep the capable check against init_user_ns until cgroups can
* contain all limits.
*/
if (new_rlim->rlim_max > rlim->rlim_max &&
!capable(CAP_SYS_RESOURCE))
retval = -EPERM;
if (!retval)
retval = security_task_setrlimit(tsk, resource, new_rlim);
}
if (!retval) {
if (old_rlim)
*old_rlim = *rlim;
if (new_rlim)
*rlim = *new_rlim;
}
task_unlock(tsk->group_leader);
/*
* RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
* infinite. In case of RLIM_INFINITY the posix CPU timer code
* ignores the rlimit.
*/
if (!retval && new_rlim && resource == RLIMIT_CPU &&
new_rlim->rlim_cur != RLIM_INFINITY &&
IS_ENABLED(CONFIG_POSIX_TIMERS)) {
/*
* update_rlimit_cpu can fail if the task is exiting, but there
* may be other tasks in the thread group that are not exiting,
* and they need their cpu timers adjusted.
*
* The group_leader is the last task to be released, so if we
* cannot update_rlimit_cpu on it, then the entire process is
* exiting and we do not need to update at all.
*/
update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur);
}
return retval;
}
SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
{ {
struct rlimit value; struct rlimit value;
...@@ -1567,63 +1629,6 @@ static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) ...@@ -1567,63 +1629,6 @@ static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
rlim->rlim_max = (unsigned long)rlim64->rlim_max; rlim->rlim_max = (unsigned long)rlim64->rlim_max;
} }
/* make sure you are allowed to change @tsk limits before calling this */
int do_prlimit(struct task_struct *tsk, unsigned int resource,
struct rlimit *new_rlim, struct rlimit *old_rlim)
{
struct rlimit *rlim;
int retval = 0;
if (resource >= RLIM_NLIMITS)
return -EINVAL;
if (new_rlim) {
if (new_rlim->rlim_cur > new_rlim->rlim_max)
return -EINVAL;
if (resource == RLIMIT_NOFILE &&
new_rlim->rlim_max > sysctl_nr_open)
return -EPERM;
}
/* protect tsk->signal and tsk->sighand from disappearing */
read_lock(&tasklist_lock);
if (!tsk->sighand) {
retval = -ESRCH;
goto out;
}
rlim = tsk->signal->rlim + resource;
task_lock(tsk->group_leader);
if (new_rlim) {
/* Keep the capable check against init_user_ns until
cgroups can contain all limits */
if (new_rlim->rlim_max > rlim->rlim_max &&
!capable(CAP_SYS_RESOURCE))
retval = -EPERM;
if (!retval)
retval = security_task_setrlimit(tsk, resource, new_rlim);
}
if (!retval) {
if (old_rlim)
*old_rlim = *rlim;
if (new_rlim)
*rlim = *new_rlim;
}
task_unlock(tsk->group_leader);
/*
* RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
* infinite. In case of RLIM_INFINITY the posix CPU timer code
* ignores the rlimit.
*/
if (!retval && new_rlim && resource == RLIMIT_CPU &&
new_rlim->rlim_cur != RLIM_INFINITY &&
IS_ENABLED(CONFIG_POSIX_TIMERS))
update_rlimit_cpu(tsk, new_rlim->rlim_cur);
out:
read_unlock(&tasklist_lock);
return retval;
}
/* rcu lock must be held */ /* rcu lock must be held */
static int check_prlimit_permission(struct task_struct *task, static int check_prlimit_permission(struct task_struct *task,
unsigned int flags) unsigned int flags)
......
...@@ -34,14 +34,20 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) ...@@ -34,14 +34,20 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
* tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
* necessary. Needs siglock protection since other code may update the * necessary. Needs siglock protection since other code may update the
* expiration cache as well. * expiration cache as well.
*
* Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and
* we cannot lock_task_sighand. Cannot fail if task is current.
*/ */
void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{ {
u64 nsecs = rlim_new * NSEC_PER_SEC; u64 nsecs = rlim_new * NSEC_PER_SEC;
unsigned long irq_fl;
spin_lock_irq(&task->sighand->siglock); if (!lock_task_sighand(task, &irq_fl))
return -ESRCH;
set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL); set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
spin_unlock_irq(&task->sighand->siglock); unlock_task_sighand(task, &irq_fl);
return 0;
} }
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment