Commit afb2bc14 authored by Tejun Heo's avatar Tejun Heo

cgroup: don't guarantee cgroup.procs is sorted if sane_behavior

For some reason, tasks and cgroup.procs guarantee that the result is
sorted.  This is the only reason this whole pidlist logic is necessary
instead of just iterating through sorted member tasks.  We can't do
anything about the existing interface but at least ensure that such
expectation doesn't exist for the new interface so that pidlist logic
may be removed in the distant future.

This patch scrambles the sort order if sane_behavior so that the
output is usually not sorted in the new interface.
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Acked-by: default avatarLi Zefan <lizefan@huawei.com>
parent 04502365
......@@ -275,6 +275,9 @@ enum {
* - "tasks" is removed. Everything should be at process
* granularity. Use "cgroup.procs" instead.
*
* - "cgroup.procs" is not sorted. pids will be unique unless they
* got recycled inbetween reads.
*
* - "release_agent" and "notify_on_release" are removed.
* Replacement notification mechanism will be implemented.
*
......
......@@ -3564,11 +3564,49 @@ static int pidlist_uniq(pid_t *list, int length)
return dest;
}
/*
* The two pid files - task and cgroup.procs - guaranteed that the result
* is sorted, which forced this whole pidlist fiasco. As pid order is
* different per namespace, each namespace needs differently sorted list,
* making it impossible to use, for example, single rbtree of member tasks
* sorted by task pointer. As pidlists can be fairly large, allocating one
* per open file is dangerous, so cgroup had to implement shared pool of
* pidlists keyed by cgroup and namespace.
*
* All this extra complexity was caused by the original implementation
* committing to an entirely unnecessary property. In the long term, we
* want to do away with it. Explicitly scramble sort order if
* sane_behavior so that no such expectation exists in the new interface.
*
* Scrambling is done by swapping every two consecutive bits, which is
* non-identity one-to-one mapping which disturbs sort order sufficiently.
*/
static pid_t pid_fry(pid_t pid)
{
unsigned a = pid & 0x55555555;
unsigned b = pid & 0xAAAAAAAA;
return (a << 1) | (b >> 1);
}
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
if (cgroup_sane_behavior(cgrp))
return pid_fry(pid);
else
return pid;
}
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
static int fried_cmppid(const void *a, const void *b)
{
return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
}
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
......@@ -3656,7 +3694,10 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
sort(array, length, sizeof(pid_t), cmppid, NULL);
if (cgroup_sane_behavior(cgrp))
sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
else
sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
......@@ -3777,10 +3818,10 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
while (index < end) {
int mid = (index + end) / 2;
if (l->list[mid] == pid) {
if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
} else if (l->list[mid] <= pid)
} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
......@@ -3791,7 +3832,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
return NULL;
/* Update the abstract position to be the actual pid that we found */
iter = l->list + index;
*pos = *iter;
*pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
......@@ -3820,7 +3861,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
if (p >= end) {
return NULL;
} else {
*pos = *p;
*pos = cgroup_pid_fry(of->cgrp, *p);
return p;
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment