Commit ec64f515 authored by KAMEZAWA Hiroyuki's avatar KAMEZAWA Hiroyuki Committed by Linus Torvalds

cgroup: fix frequent -EBUSY at rmdir

In following situation, with memory subsystem,

	/groupA use_hierarchy==1
		/01 some tasks
		/02 some tasks
		/03 some tasks
		/04 empty

When tasks under 01/02/03 hit limit on /groupA, hierarchical reclaim
is triggered and the kernel walks tree under groupA. In this case,
rmdir /groupA/04 fails with -EBUSY frequently because of temporal
refcnt from the kernel.

In general. cgroup can be rmdir'd if there are no children groups and
no tasks. Frequent fails of rmdir() is not useful to users.
(And the reason for -EBUSY is unknown to users.....in most cases)

This patch tries to modify above behavior, by
	- retries if css_refcnt is got by someone.
	- add "return value" to pre_destroy() and allows subsystem to
	  say "we're really busy!"
Signed-off-by: default avatarKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 38460b48
...@@ -476,11 +476,13 @@ cgroup->parent is still valid. (Note - can also be called for a ...@@ -476,11 +476,13 @@ cgroup->parent is still valid. (Note - can also be called for a
newly-created cgroup if an error occurs after this subsystem's newly-created cgroup if an error occurs after this subsystem's
create() method has been called for the new cgroup). create() method has been called for the new cgroup).
void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
Called before checking the reference count on each subsystem. This may Called before checking the reference count on each subsystem. This may
be useful for subsystems which have some extra references even if be useful for subsystems which have some extra references even if
there are not tasks in the cgroup. there are not tasks in the cgroup. If pre_destroy() returns error code,
rmdir() will fail with it. From this behavior, pre_destroy() can be
called multiple times against a cgroup.
int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *task) struct task_struct *task)
......
...@@ -135,6 +135,10 @@ enum { ...@@ -135,6 +135,10 @@ enum {
CGRP_RELEASABLE, CGRP_RELEASABLE,
/* Control Group requires release notifications to userspace */ /* Control Group requires release notifications to userspace */
CGRP_NOTIFY_ON_RELEASE, CGRP_NOTIFY_ON_RELEASE,
/*
* A thread in rmdir() is wating for this cgroup.
*/
CGRP_WAIT_ON_RMDIR,
}; };
struct cgroup { struct cgroup {
...@@ -360,7 +364,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); ...@@ -360,7 +364,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
struct cgroup_subsys { struct cgroup_subsys {
struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
struct cgroup *cgrp); struct cgroup *cgrp);
void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
int (*can_attach)(struct cgroup_subsys *ss, int (*can_attach)(struct cgroup_subsys *ss,
struct cgroup *cgrp, struct task_struct *tsk); struct cgroup *cgrp, struct task_struct *tsk);
......
...@@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) ...@@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
* Call subsys's pre_destroy handler. * Call subsys's pre_destroy handler.
* This is called before css refcnt check. * This is called before css refcnt check.
*/ */
static void cgroup_call_pre_destroy(struct cgroup *cgrp) static int cgroup_call_pre_destroy(struct cgroup *cgrp)
{ {
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
int ret = 0;
for_each_subsys(cgrp->root, ss) for_each_subsys(cgrp->root, ss)
if (ss->pre_destroy) if (ss->pre_destroy) {
ss->pre_destroy(ss, cgrp); ret = ss->pre_destroy(ss, cgrp);
return; if (ret)
break;
}
return ret;
} }
static void free_cgroup_rcu(struct rcu_head *obj) static void free_cgroup_rcu(struct rcu_head *obj)
...@@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry) ...@@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
remove_dir(dentry); remove_dir(dentry);
} }
/*
* A queue for waiters to do rmdir() cgroup. A tasks will sleep when
* cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
* reference to css->refcnt. In general, this refcnt is expected to goes down
* to zero, soon.
*
* CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
*/
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
{
if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
wake_up_all(&cgroup_rmdir_waitq);
}
static int rebind_subsystems(struct cgroupfs_root *root, static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits) unsigned long final_bits)
{ {
...@@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) ...@@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
set_bit(CGRP_RELEASABLE, &oldcgrp->flags); set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu(); synchronize_rcu();
put_css_set(cg); put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
* is no longer empty.
*/
cgroup_wakeup_rmdir_waiters(cgrp);
return 0; return 0;
} }
...@@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) ...@@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
struct cgroup *cgrp = dentry->d_fsdata; struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d; struct dentry *d;
struct cgroup *parent; struct cgroup *parent;
DEFINE_WAIT(wait);
int ret;
/* the vfs holds both inode->i_mutex already */ /* the vfs holds both inode->i_mutex already */
again:
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
if (atomic_read(&cgrp->count) != 0) { if (atomic_read(&cgrp->count) != 0) {
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
...@@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) ...@@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
* Call pre_destroy handlers of subsys. Notify subsystems * Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes. * that rmdir() request comes.
*/ */
cgroup_call_pre_destroy(cgrp); ret = cgroup_call_pre_destroy(cgrp);
if (ret)
return ret;
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
parent = cgrp->parent; parent = cgrp->parent;
if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
if (atomic_read(&cgrp->count)
|| !list_empty(&cgrp->children)
|| !cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
return -EBUSY; return -EBUSY;
} }
/*
* css_put/get is provided for subsys to grab refcnt to css. In typical
* case, subsystem has no reference after pre_destroy(). But, under
* hierarchy management, some *temporal* refcnt can be hold.
* To avoid returning -EBUSY to a user, waitqueue is used. If subsys
* is really busy, it should return -EBUSY at pre_destroy(). wake_up
* is called when css_put() is called and refcnt goes down to 0.
*/
set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
if (!cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
schedule();
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
if (signal_pending(current))
return -EINTR;
goto again;
}
/* NO css_tryget() can success after here. */
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
spin_lock(&release_list_lock); spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags); set_bit(CGRP_REMOVED, &cgrp->flags);
...@@ -3194,11 +3245,13 @@ void __css_put(struct cgroup_subsys_state *css) ...@@ -3194,11 +3245,13 @@ void __css_put(struct cgroup_subsys_state *css)
{ {
struct cgroup *cgrp = css->cgroup; struct cgroup *cgrp = css->cgroup;
rcu_read_lock(); rcu_read_lock();
if ((atomic_dec_return(&css->refcnt) == 1) && if (atomic_dec_return(&css->refcnt) == 1) {
notify_on_release(cgrp)) { if (notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags); set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp); check_for_release(cgrp);
} }
cgroup_wakeup_rmdir_waiters(cgrp);
}
rcu_read_unlock(); rcu_read_unlock();
} }
......
...@@ -2272,11 +2272,12 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) ...@@ -2272,11 +2272,12 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
struct cgroup *cont) struct cgroup *cont)
{ {
struct mem_cgroup *mem = mem_cgroup_from_cont(cont); struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
mem_cgroup_force_empty(mem, false);
return mem_cgroup_force_empty(mem, false);
} }
static void mem_cgroup_destroy(struct cgroup_subsys *ss, static void mem_cgroup_destroy(struct cgroup_subsys *ss,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment