Commit f075e0f6 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-3.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:
 "The bulk of changes are cleanups and preparations for the upcoming
  kernfs conversion.

   - cgroup_event mechanism which is and will be used only by memcg is
     moved to memcg.

   - pidlist handling is updated so that it can be served by seq_file.

     Also, the list is not sorted if sane_behavior.  cgroup
     documentation explicitly states that the file is not sorted but it
     has been for quite some time.

   - All cgroup file handling now happens on top of seq_file.  This is
     to prepare for kernfs conversion.  In addition, all operations are
     restructured so that they map 1-1 to kernfs operations.

   - Other cleanups and low-pri fixes"

* 'for-3.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (40 commits)
  cgroup: trivial style updates
  cgroup: remove stray references to css_id
  doc: cgroups: Fix typo in doc/cgroups
  cgroup: fix fail path in cgroup_load_subsys()
  cgroup: fix missing unlock on error in cgroup_load_subsys()
  cgroup: remove for_each_root_subsys()
  cgroup: implement for_each_css()
  cgroup: factor out cgroup_subsys_state creation into create_css()
  cgroup: combine css handling loops in cgroup_create()
  cgroup: reorder operations in cgroup_create()
  cgroup: make for_each_subsys() useable under cgroup_root_mutex
  cgroup: css iterations and css_from_dir() are safe under cgroup_mutex
  cgroup: unify pidlist and other file handling
  cgroup: replace cftype->read_seq_string() with cftype->seq_show()
  cgroup: attach cgroup_open_file to all cgroup files
  cgroup: generalize cgroup_pidlist_open_file
  cgroup: unify read path so that seq_file is always used
  cgroup: unify cgroup_write_X64() and cgroup_write_string()
  cgroup: remove cftype->read(), ->read_map() and ->write()
  hugetlb_cgroup: convert away from cftype->read()
  ...
parents 5cb7398c dd4b0a46
......@@ -24,7 +24,6 @@ CONTENTS:
2.1 Basic Usage
2.2 Attaching processes
2.3 Mounting hierarchies by name
2.4 Notification API
3. Kernel API
3.1 Overview
3.2 Synchronization
......@@ -472,25 +471,6 @@ you give a subsystem a name.
The name of the subsystem appears as part of the hierarchy description
in /proc/mounts and /proc/<pid>/cgroups.
2.4 Notification API
--------------------
There is mechanism which allows to get notifications about changing
status of a cgroup.
To register a new notification handler you need to:
- create a file descriptor for event notification using eventfd(2);
- open a control file to be monitored (e.g. memory.usage_in_bytes);
- write "<event_fd> <control_fd> <args>" to cgroup.event_control.
Interpretation of args is defined by control file implementation;
eventfd will be woken up by control file implementation or when the
cgroup is removed.
To unregister a notification handler just close eventfd.
NOTE: Support of notifications should be implemented for the control
file. See documentation for the subsystem.
3. Kernel API
=============
......
......@@ -577,7 +577,7 @@ Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
per-node page counts including "hierarchical_<counter>" which sums up all
hierarchical children's values in addition to the memcg's own value.
The ouput format of memory.numa_stat is:
The output format of memory.numa_stat is:
total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
......@@ -670,7 +670,7 @@ page tables.
8.1 Interface
This feature is disabled by default. It can be enabledi (and disabled again) by
This feature is disabled by default. It can be enabled (and disabled again) by
writing to memory.move_charge_at_immigrate of the destination cgroup.
If you want to enable it:
......
......@@ -97,8 +97,8 @@ to work with it.
(struct res_counter *rc, struct res_counter *top,
unsinged long val)
Almost same as res_cunter_uncharge() but propagation of uncharge
stops when rc == top. This is useful when kill a res_coutner in
Almost same as res_counter_uncharge() but propagation of uncharge
stops when rc == top. This is useful when kill a res_counter in
child cgroup.
2.1 Other accounting routines
......
......@@ -1303,13 +1303,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
cft->private, true);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
&blkcg_policy_throtl, seq_cft(sf)->private, true);
return 0;
}
......@@ -1335,19 +1332,17 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
return __blkg_prfill_u64(sf, pd, v);
}
static int tg_print_conf_u64(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int tg_print_conf_u64(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64,
&blkcg_policy_throtl, cft->private, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
&blkcg_policy_throtl, seq_cft(sf)->private, false);
return 0;
}
static int tg_print_conf_uint(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int tg_print_conf_uint(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint,
&blkcg_policy_throtl, cft->private, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
&blkcg_policy_throtl, seq_cft(sf)->private, false);
return 0;
}
......@@ -1428,40 +1423,40 @@ static struct cftype throtl_files[] = {
{
.name = "throttle.read_bps_device",
.private = offsetof(struct throtl_grp, bps[READ]),
.read_seq_string = tg_print_conf_u64,
.seq_show = tg_print_conf_u64,
.write_string = tg_set_conf_u64,
.max_write_len = 256,
},
{
.name = "throttle.write_bps_device",
.private = offsetof(struct throtl_grp, bps[WRITE]),
.read_seq_string = tg_print_conf_u64,
.seq_show = tg_print_conf_u64,
.write_string = tg_set_conf_u64,
.max_write_len = 256,
},
{
.name = "throttle.read_iops_device",
.private = offsetof(struct throtl_grp, iops[READ]),
.read_seq_string = tg_print_conf_uint,
.seq_show = tg_print_conf_uint,
.write_string = tg_set_conf_uint,
.max_write_len = 256,
},
{
.name = "throttle.write_iops_device",
.private = offsetof(struct throtl_grp, iops[WRITE]),
.read_seq_string = tg_print_conf_uint,
.seq_show = tg_print_conf_uint,
.write_string = tg_set_conf_uint,
.max_write_len = 256,
},
{
.name = "throttle.io_service_bytes",
.private = offsetof(struct tg_stats_cpu, service_bytes),
.read_seq_string = tg_print_cpu_rwstat,
.seq_show = tg_print_cpu_rwstat,
},
{
.name = "throttle.io_serviced",
.private = offsetof(struct tg_stats_cpu, serviced),
.read_seq_string = tg_print_cpu_rwstat,
.seq_show = tg_print_cpu_rwstat,
},
{ } /* terminate */
};
......
......@@ -1632,11 +1632,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf,
return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
}
static int cfqg_print_weight_device(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfqg_print_weight_device(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device,
&blkcg_policy_cfq, 0, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
cfqg_prfill_weight_device, &blkcg_policy_cfq,
0, false);
return 0;
}
......@@ -1650,26 +1650,23 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
}
static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css,
struct cftype *cft,
struct seq_file *sf)
static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device,
&blkcg_policy_cfq, 0, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
0, false);
return 0;
}
static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft,
struct seq_file *sf)
static int cfq_print_weight(struct seq_file *sf, void *v)
{
seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight);
seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
return 0;
}
static int cfq_print_leaf_weight(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
{
seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight);
seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
return 0;
}
......@@ -1762,23 +1759,17 @@ static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
return __cfq_set_weight(css, cft, val, true);
}
static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft,
struct seq_file *sf)
static int cfqg_print_stat(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
cft->private, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
&blkcg_policy_cfq, seq_cft(sf)->private, false);
return 0;
}
static int cfqg_print_rwstat(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfqg_print_rwstat(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
cft->private, true);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
&blkcg_policy_cfq, seq_cft(sf)->private, true);
return 0;
}
......@@ -1798,23 +1789,19 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
return __blkg_prfill_rwstat(sf, pd, &sum);
}
static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
&blkcg_policy_cfq, cft->private, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
seq_cft(sf)->private, false);
return 0;
}
static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
&blkcg_policy_cfq, cft->private, true);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
seq_cft(sf)->private, true);
return 0;
}
......@@ -1835,13 +1822,11 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
}
/* print avg_queue_size */
static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *sf)
static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(css);
blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
&blkcg_policy_cfq, 0, false);
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
0, false);
return 0;
}
#endif /* CONFIG_DEBUG_BLK_CGROUP */
......@@ -1851,14 +1836,14 @@ static struct cftype cfq_blkcg_files[] = {
{
.name = "weight_device",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cfqg_print_leaf_weight_device,
.seq_show = cfqg_print_leaf_weight_device,
.write_string = cfqg_set_leaf_weight_device,
.max_write_len = 256,
},
{
.name = "weight",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cfq_print_leaf_weight,
.seq_show = cfq_print_leaf_weight,
.write_u64 = cfq_set_leaf_weight,
},
......@@ -1866,26 +1851,26 @@ static struct cftype cfq_blkcg_files[] = {
{
.name = "weight_device",
.flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = cfqg_print_weight_device,
.seq_show = cfqg_print_weight_device,
.write_string = cfqg_set_weight_device,
.max_write_len = 256,
},
{
.name = "weight",
.flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = cfq_print_weight,
.seq_show = cfq_print_weight,
.write_u64 = cfq_set_weight,
},
{
.name = "leaf_weight_device",
.read_seq_string = cfqg_print_leaf_weight_device,
.seq_show = cfqg_print_leaf_weight_device,
.write_string = cfqg_set_leaf_weight_device,
.max_write_len = 256,
},
{
.name = "leaf_weight",
.read_seq_string = cfq_print_leaf_weight,
.seq_show = cfq_print_leaf_weight,
.write_u64 = cfq_set_leaf_weight,
},
......@@ -1893,114 +1878,114 @@ static struct cftype cfq_blkcg_files[] = {
{
.name = "time",
.private = offsetof(struct cfq_group, stats.time),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "sectors",
.private = offsetof(struct cfq_group, stats.sectors),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "io_service_bytes",
.private = offsetof(struct cfq_group, stats.service_bytes),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
{
.name = "io_serviced",
.private = offsetof(struct cfq_group, stats.serviced),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
{
.name = "io_service_time",
.private = offsetof(struct cfq_group, stats.service_time),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
{
.name = "io_wait_time",
.private = offsetof(struct cfq_group, stats.wait_time),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
{
.name = "io_merged",
.private = offsetof(struct cfq_group, stats.merged),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
{
.name = "io_queued",
.private = offsetof(struct cfq_group, stats.queued),
.read_seq_string = cfqg_print_rwstat,
.seq_show = cfqg_print_rwstat,
},
/* the same statictics which cover the cfqg and its descendants */
{
.name = "time_recursive",
.private = offsetof(struct cfq_group, stats.time),
.read_seq_string = cfqg_print_stat_recursive,
.seq_show = cfqg_print_stat_recursive,
},
{
.name = "sectors_recursive",
.private = offsetof(struct cfq_group, stats.sectors),
.read_seq_string = cfqg_print_stat_recursive,
.seq_show = cfqg_print_stat_recursive,
},
{
.name = "io_service_bytes_recursive",
.private = offsetof(struct cfq_group, stats.service_bytes),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_serviced_recursive",
.private = offsetof(struct cfq_group, stats.serviced),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_service_time_recursive",
.private = offsetof(struct cfq_group, stats.service_time),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_wait_time_recursive",
.private = offsetof(struct cfq_group, stats.wait_time),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_merged_recursive",
.private = offsetof(struct cfq_group, stats.merged),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_queued_recursive",
.private = offsetof(struct cfq_group, stats.queued),
.read_seq_string = cfqg_print_rwstat_recursive,
.seq_show = cfqg_print_rwstat_recursive,
},
#ifdef CONFIG_DEBUG_BLK_CGROUP
{
.name = "avg_queue_size",
.read_seq_string = cfqg_print_avg_queue_size,
.seq_show = cfqg_print_avg_queue_size,
},
{
.name = "group_wait_time",
.private = offsetof(struct cfq_group, stats.group_wait_time),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "idle_time",
.private = offsetof(struct cfq_group, stats.idle_time),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "empty_time",
.private = offsetof(struct cfq_group, stats.empty_time),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "dequeue",
.private = offsetof(struct cfq_group, stats.dequeue),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
{
.name = "unaccounted_time",
.private = offsetof(struct cfq_group, stats.unaccounted_time),
.read_seq_string = cfqg_print_stat,
.seq_show = cfqg_print_stat,
},
#endif /* CONFIG_DEBUG_BLK_CGROUP */
{ } /* terminate */
......
......@@ -163,7 +163,6 @@ static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
static void bcachecg_destroy(struct cgroup *cgroup)
{
struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
free_css_id(&bcache_subsys, &cg->css);
kfree(cg);
}
......
......@@ -21,6 +21,7 @@
#include <linux/xattr.h>
#include <linux/fs.h>
#include <linux/percpu-refcount.h>
#include <linux/seq_file.h>
#ifdef CONFIG_CGROUPS
......@@ -28,8 +29,6 @@ struct cgroupfs_root;
struct cgroup_subsys;
struct inode;
struct cgroup;
struct css_id;
struct eventfd_ctx;
extern int cgroup_init_early(void);
extern int cgroup_init(void);
......@@ -79,8 +78,6 @@ struct cgroup_subsys_state {
struct cgroup_subsys_state *parent;
unsigned long flags;
/* ID for this css, if possible */
struct css_id __rcu *id;
/* percpu_ref killing and RCU release */
struct rcu_head rcu_head;
......@@ -239,10 +236,6 @@ struct cgroup {
struct rcu_head rcu_head;
struct work_struct destroy_work;
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
/* directory xattrs */
struct simple_xattrs xattrs;
};
......@@ -280,6 +273,9 @@ enum {
* - "tasks" is removed. Everything should be at process
* granularity. Use "cgroup.procs" instead.
*
* - "cgroup.procs" is not sorted. pids will be unique unless they
* got recycled inbetween reads.
*
* - "release_agent" and "notify_on_release" are removed.
* Replacement notification mechanism will be implemented.
*
......@@ -320,9 +316,6 @@ struct cgroupfs_root {
/* Unique id for this hierarchy. */
int hierarchy_id;
/* A list running through the attached subsystems */
struct list_head subsys_list;
/* The root cgroup for this hierarchy */
struct cgroup top_cgroup;
......@@ -388,16 +381,6 @@ struct css_set {
struct rcu_head rcu_head;
};
/*
* cgroup_map_cb is an abstract callback API for reporting map-valued
* control files
*/
struct cgroup_map_cb {
int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
void *state;
};
/*
* struct cftype: handler definitions for cgroup control files
*
......@@ -445,10 +428,6 @@ struct cftype {
*/
struct cgroup_subsys *ss;
int (*open)(struct inode *inode, struct file *file);
ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft,
struct file *file,
char __user *buf, size_t nbytes, loff_t *ppos);
/*
* read_u64() is a shortcut for the common case of returning a
* single integer. Use it in place of read()
......@@ -458,24 +437,14 @@ struct cftype {
* read_s64() is a signed version of read_u64()
*/
s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
/*
* read_map() is used for defining a map of key/value
* pairs. It should call cb->fill(cb, key, value) for each
* entry. The key/value pairs (and their ordering) should not
* change between reboots.
*/
int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft,
struct cgroup_map_cb *cb);
/*
* read_seq_string() is used for outputting a simple sequence
* using seqfile.
*/
int (*read_seq_string)(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m);
ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft,
struct file *file,
const char __user *buf, size_t nbytes, loff_t *ppos);
/* generic seq_file read interface */
int (*seq_show)(struct seq_file *sf, void *v);
/* optional ops, implement all or none */
void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
void (*seq_stop)(struct seq_file *sf, void *v);
/*
* write_u64() is a shortcut for the common case of accepting
......@@ -504,27 +473,6 @@ struct cftype {
* kick type for multiplexing.
*/
int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
int (*release)(struct inode *inode, struct file *file);
/*
* register_event() callback will be used to add new userspace
* waiter for changes related to the cftype. Implement it if
* you want to provide this functionality. Use eventfd_signal()
* on eventfd to send notification to userspace.
*/
int (*register_event)(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd,
const char *args);
/*
* unregister_event() callback will be called when userspace
* closes the eventfd or on cgroup removing.
* This callback must be implemented, if you want provide
* notification functionality.
*/
void (*unregister_event)(struct cgroup_subsys_state *css,
struct cftype *cft,
struct eventfd_ctx *eventfd);
};
/*
......@@ -537,6 +485,26 @@ struct cftype_set {
struct cftype *cfts;
};
/*
* cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't
* access directly.
*/
struct cfent {
struct list_head node;
struct dentry *dentry;
struct cftype *type;
struct cgroup_subsys_state *css;
/* file xattrs */
struct simple_xattrs xattrs;
};
/* seq_file->private points to the following, only ->priv is public */
struct cgroup_open_file {
struct cfent *cfe;
void *priv;
};
/*
* See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
* function can be called as long as @cgrp is accessible.
......@@ -552,6 +520,18 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
return rcu_dereference(cgrp->name)->name;
}
static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
{
struct cgroup_open_file *of = seq->private;
return of->cfe->css;
}
static inline struct cftype *seq_cft(struct seq_file *seq)
{
struct cgroup_open_file *of = seq->private;
return of->cfe->type;
}
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
......@@ -631,12 +611,8 @@ struct cgroup_subsys {
#define MAX_CGROUP_TYPE_NAMELEN 32
const char *name;
/*
* Link to parent, and list entry in parent's children.
* Protected by cgroup_lock()
*/
/* link to parent, protected by cgroup_lock() */
struct cgroupfs_root *root;
struct list_head sibling;
/* list of cftype_sets */
struct list_head cftsets;
......
......@@ -7,6 +7,7 @@
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/cgroup.h>
#include <linux/eventfd.h>
struct vmpressure {
unsigned long scanned;
......@@ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr);
extern void vmpressure_cleanup(struct vmpressure *vmpr);
extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
extern int vmpressure_register_event(struct cgroup_subsys_state *css,
struct cftype *cft,
extern int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd,
const char *args);
extern void vmpressure_unregister_event(struct cgroup_subsys_state *css,
struct cftype *cft,
extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
#else
static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
......
......@@ -854,7 +854,6 @@ config NUMA_BALANCING
menuconfig CGROUPS
boolean "Control Group support"
depends on EVENTFD
help
This option adds support for grouping sets of processes together, for
use with process control subsystems such as Cpusets, CFS, memory
......@@ -921,6 +920,7 @@ config MEMCG
bool "Memory Resource Controller for Control Groups"
depends on RESOURCE_COUNTERS
select MM_OWNER
select EVENTFD
help
Provides a memory resource controller that manages both anonymous
memory and page cache. (See Documentation/cgroups/memory.txt)
......@@ -1160,7 +1160,6 @@ config UIDGID_STRICT_TYPE_CHECKS
config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
select EVENTFD
select CGROUPS
select CGROUP_SCHED
select FAIR_GROUP_SCHED
......
......@@ -41,7 +41,6 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/spinlock.h>
......@@ -56,14 +55,19 @@
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
#include <linux/file.h>
#include <linux/atomic.h>
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
* Expiring in the middle is a performance problem not a correctness one.
* 1 sec should be enough.
*/
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
......@@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex);
static DEFINE_MUTEX(cgroup_root_mutex);
#define cgroup_assert_mutex_or_rcu_locked() \
rcu_lockdep_assert(rcu_read_lock_held() || \
lockdep_is_held(&cgroup_mutex), \
"cgroup_mutex or RCU read lock required");
#ifdef CONFIG_LOCKDEP
#define cgroup_assert_mutex_or_root_locked() \
WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
!lockdep_is_held(&cgroup_root_mutex)))
#else
#define cgroup_assert_mutex_or_root_locked() do { } while (0)
#endif
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
......@@ -97,6 +114,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
*/
static struct workqueue_struct *cgroup_destroy_wq;
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
*/
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
* populated with the built in subsystems, and modular subsystems are
......@@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root;
/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
/*
* cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
*/
struct cfent {
struct list_head node;
struct dentry *dentry;
struct cftype *type;
struct cgroup_subsys_state *css;
/* file xattrs */
struct simple_xattrs xattrs;
};
/*
* cgroup_event represents events which userspace want to receive.
*/
struct cgroup_event {
/*
* css which the event belongs to.
*/
struct cgroup_subsys_state *css;
/*
* Control file which the event associated.
*/
struct cftype *cft;
/*
* eventfd to signal userspace about the event.
*/
struct eventfd_ctx *eventfd;
/*
* Each of these stored in a list by the cgroup.
*/
struct list_head list;
/*
* All fields below needed to unregister event when
* userspace closes eventfd.
*/
poll_table pt;
wait_queue_head_t *wqh;
wait_queue_t wait;
struct work_struct remove;
};
/* The list of hierarchy roots */
static LIST_HEAD(cgroup_roots);
......@@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
static int cgroup_file_release(struct inode *inode, struct file *file);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
......@@ -261,17 +242,33 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
/**
* for_each_css - iterate all css's of a cgroup
* @css: the iteration cursor
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
* Should be called under cgroup_mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = rcu_dereference_check( \
(cgrp)->subsys[(ssid)], \
lockdep_is_held(&cgroup_mutex)))) { } \
else
/**
* for_each_subsys - iterate all loaded cgroup subsystems
* @ss: the iteration cursor
* @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
*
* Should be called under cgroup_mutex.
* Iterates through all loaded subsystems. Should be called under
* cgroup_mutex or cgroup_root_mutex.
*/
#define for_each_subsys(ss, i) \
for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \
if (({ lockdep_assert_held(&cgroup_mutex); \
!((ss) = cgroup_subsys[i]); })) { } \
#define for_each_subsys(ss, ssid) \
for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
(ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((ss) = cgroup_subsys[(ssid)])) { } \
else
/**
......@@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp)
for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
(((ss) = cgroup_subsys[i]) || true); (i)++)
/* iterate each subsystem attached to a hierarchy */
#define for_each_root_subsys(root, ss) \
list_for_each_entry((ss), &(root)->subsys_list, sibling)
/* iterate across the active hierarchies */
#define for_each_active_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
......@@ -863,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work)
*/
deactivate_super(cgrp->root->sb);
/*
* if we're getting rid of the cgroup, refcount should ensure
* that there are no pidlists left.
*/
BUG_ON(!list_empty(&cgrp->pidlists));
cgroup_pidlist_destroy_all(cgrp);
simple_xattrs_free(&cgrp->xattrs);
......@@ -1050,7 +1039,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
cgroup_css(cgroup_dummy_top, ss));
cgroup_css(cgrp, ss)->cgroup = cgrp;
list_move(&ss->sibling, &root->subsys_list);
ss->root = root;
if (ss->bind)
ss->bind(cgroup_css(cgrp, ss));
......@@ -1069,7 +1057,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
RCU_INIT_POINTER(cgrp->subsys[i], NULL);
cgroup_subsys[i]->root = &cgroup_dummy_root;
list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
/* subsystem is now free - drop reference on module */
module_put(ss->module);
......@@ -1096,9 +1083,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
struct cgroup_subsys *ss;
int ssid;
mutex_lock(&cgroup_root_mutex);
for_each_root_subsys(root, ss)
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_printf(seq, ",%s", ss->name);
if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
seq_puts(seq, ",sane_behavior");
......@@ -1362,8 +1351,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->dummy_css.cgroup = cgrp;
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
simple_xattrs_init(&cgrp->xattrs);
}
......@@ -1371,7 +1358,6 @@ static void init_cgroup_root(struct cgroupfs_root *root)
{
struct cgroup *cgrp = &root->top_cgroup;
INIT_LIST_HEAD(&root->subsys_list);
INIT_LIST_HEAD(&root->root_list);
root->number_of_cgroups = 1;
cgrp->root = root;
......@@ -1693,7 +1679,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
return ERR_PTR(ret);
}
static void cgroup_kill_sb(struct super_block *sb) {
static void cgroup_kill_sb(struct super_block *sb)
{
struct cgroupfs_root *root = sb->s_fs_info;
struct cgroup *cgrp = &root->top_cgroup;
struct cgrp_cset_link *link, *tmp_link;
......@@ -1976,8 +1963,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
bool threadgroup)
{
int retval, i, group_size;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroupfs_root *root = cgrp->root;
struct cgroup_subsys_state *css, *failed_css = NULL;
/* threadgroup list cursor and array */
struct task_struct *leader = tsk;
struct task_and_cgroup *tc;
......@@ -2050,13 +2037,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
/*
* step 1: check that we can legitimately attach to the cgroup.
*/
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
if (ss->can_attach) {
retval = ss->can_attach(css, &tset);
for_each_css(css, i, cgrp) {
if (css->ss->can_attach) {
retval = css->ss->can_attach(css, &tset);
if (retval) {
failed_ss = ss;
failed_css = css;
goto out_cancel_attach;
}
}
......@@ -2092,12 +2077,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
/*
* step 4: do subsystem attach callbacks.
*/
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
if (ss->attach)
ss->attach(css, &tset);
}
for_each_css(css, i, cgrp)
if (css->ss->attach)
css->ss->attach(css, &tset);
/*
* step 5: success! and cleanup
......@@ -2114,13 +2096,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
}
out_cancel_attach:
if (retval) {
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
if (ss == failed_ss)
for_each_css(css, i, cgrp) {
if (css == failed_css)
break;
if (ss->cancel_attach)
ss->cancel_attach(css, &tset);
if (css->ss->cancel_attach)
css->ss->cancel_attach(css, &tset);
}
}
out_free_group_list:
......@@ -2148,7 +2128,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
tsk = find_task_by_vpid(pid);
if (!tsk) {
rcu_read_unlock();
ret= -ESRCH;
ret = -ESRCH;
goto out_unlock_cgroup;
}
/*
......@@ -2260,10 +2240,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
return 0;
}
static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *seq)
static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = css->cgroup;
struct cgroup *cgrp = seq_css(seq)->cgroup;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
......@@ -2273,174 +2252,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
return 0;
}
static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *seq)
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
struct cgroup *cgrp = seq_css(seq)->cgroup;
seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
return 0;
}
/* A buffer size big enough for numbers or short strings */
#define CGROUP_LOCAL_BUFFER_SIZE 64
static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
struct cftype *cft, struct file *file,
const char __user *userbuf, size_t nbytes,
loff_t *unused_ppos)
static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
size_t nbytes, loff_t *ppos)
{
char buffer[CGROUP_LOCAL_BUFFER_SIZE];
int retval = 0;
char *end;
struct cfent *cfe = __d_cfe(file->f_dentry);
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup_subsys_state *css = cfe->css;
size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
char *buf;
int ret;
if (!nbytes)
return -EINVAL;
if (nbytes >= sizeof(buffer))
if (nbytes >= max_bytes)
return -E2BIG;
if (copy_from_user(buffer, userbuf, nbytes))
return -EFAULT;
buffer[nbytes] = 0; /* nul-terminate */
if (cft->write_u64) {
u64 val = simple_strtoull(strstrip(buffer), &end, 0);
if (*end)
return -EINVAL;
retval = cft->write_u64(css, cft, val);
buf = kmalloc(nbytes + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
if (copy_from_user(buf, userbuf, nbytes)) {
ret = -EFAULT;
goto out_free;
}
buf[nbytes] = '\0';
if (cft->write_string) {
ret = cft->write_string(css, cft, strstrip(buf));
} else if (cft->write_u64) {
unsigned long long v;
ret = kstrtoull(buf, 0, &v);
if (!ret)
ret = cft->write_u64(css, cft, v);
} else if (cft->write_s64) {
long long v;
ret = kstrtoll(buf, 0, &v);
if (!ret)
ret = cft->write_s64(css, cft, v);
} else if (cft->trigger) {
ret = cft->trigger(css, (unsigned int)cft->private);
} else {
s64 val = simple_strtoll(strstrip(buffer), &end, 0);
if (*end)
return -EINVAL;
retval = cft->write_s64(css, cft, val);
ret = -EINVAL;
}
if (!retval)
retval = nbytes;
return retval;
out_free:
kfree(buf);
return ret ?: nbytes;
}
static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
struct cftype *cft, struct file *file,
const char __user *userbuf, size_t nbytes,
loff_t *unused_ppos)
/*
* seqfile ops/methods for returning structured data. Currently just
* supports string->u64 maps, but can be extended in future.
*/
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
int retval = 0;
size_t max_bytes = cft->max_write_len;
char *buffer = local_buffer;
struct cftype *cft = seq_cft(seq);
if (!max_bytes)
max_bytes = sizeof(local_buffer) - 1;
if (nbytes >= max_bytes)
return -E2BIG;
/* Allocate a dynamic buffer if we need one */
if (nbytes >= sizeof(local_buffer)) {
buffer = kmalloc(nbytes + 1, GFP_KERNEL);
if (buffer == NULL)
return -ENOMEM;
}
if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
retval = -EFAULT;
goto out;
if (cft->seq_start) {
return cft->seq_start(seq, ppos);
} else {
/*
* The same behavior and code as single_open(). Returns
* !NULL if pos is at the beginning; otherwise, NULL.
*/
return NULL + !*ppos;
}
buffer[nbytes] = 0; /* nul-terminate */
retval = cft->write_string(css, cft, strstrip(buffer));
if (!retval)
retval = nbytes;
out:
if (buffer != local_buffer)
kfree(buffer);
return retval;
}
static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
size_t nbytes, loff_t *ppos)
static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
struct cfent *cfe = __d_cfe(file->f_dentry);
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup_subsys_state *css = cfe->css;
struct cftype *cft = seq_cft(seq);
if (cft->write)
return cft->write(css, cft, file, buf, nbytes, ppos);
if (cft->write_u64 || cft->write_s64)
return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
if (cft->write_string)
return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
if (cft->trigger) {
int ret = cft->trigger(css, (unsigned int)cft->private);
return ret ? ret : nbytes;
if (cft->seq_next) {
return cft->seq_next(seq, v, ppos);
} else {
/*
* The same behavior and code as single_open(), always
* terminate after the initial read.
*/
++*ppos;
return NULL;
}
return -EINVAL;
}
static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft, struct file *file,
char __user *buf, size_t nbytes, loff_t *ppos)
static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
char tmp[CGROUP_LOCAL_BUFFER_SIZE];
u64 val = cft->read_u64(css, cft);
int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
struct cftype *cft = seq_cft(seq);
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
if (cft->seq_stop)
cft->seq_stop(seq, v);
}
static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft, struct file *file,
char __user *buf, size_t nbytes, loff_t *ppos)
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
char tmp[CGROUP_LOCAL_BUFFER_SIZE];
s64 val = cft->read_s64(css, cft);
int len = sprintf(tmp, "%lld\n", (long long) val);
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
struct cftype *cft = seq_cft(m);
struct cgroup_subsys_state *css = seq_css(m);
static ssize_t cgroup_file_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos)
{
struct cfent *cfe = __d_cfe(file->f_dentry);
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup_subsys_state *css = cfe->css;
if (cft->seq_show)
return cft->seq_show(m, arg);
if (cft->read)
return cft->read(css, cft, file, buf, nbytes, ppos);
if (cft->read_u64)
return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
if (cft->read_s64)
return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
seq_printf(m, "%llu\n", cft->read_u64(css, cft));
else if (cft->read_s64)
seq_printf(m, "%lld\n", cft->read_s64(css, cft));
else
return -EINVAL;
return 0;
}
/*
* seqfile ops/methods for returning structured data. Currently just
* supports string->u64 maps, but can be extended in future.
*/
static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
{
struct seq_file *sf = cb->state;
return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
}
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
struct cfent *cfe = m->private;
struct cftype *cft = cfe->type;
struct cgroup_subsys_state *css = cfe->css;
if (cft->read_map) {
struct cgroup_map_cb cb = {
.fill = cgroup_map_add,
.state = m,
};
return cft->read_map(css, cft, &cb);
}
return cft->read_seq_string(css, cft, m);
}
static const struct file_operations cgroup_seqfile_operations = {
.read = seq_read,
.write = cgroup_file_write,
.llseek = seq_lseek,
.release = cgroup_file_release,
static struct seq_operations cgroup_seq_operations = {
.start = cgroup_seqfile_start,
.next = cgroup_seqfile_next,
.stop = cgroup_seqfile_stop,
.show = cgroup_seqfile_show,
};
static int cgroup_file_open(struct inode *inode, struct file *file)
......@@ -2449,6 +2383,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
struct cgroup_subsys_state *css;
struct cgroup_open_file *of;
int err;
err = generic_file_open(inode, file);
......@@ -2478,32 +2413,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
WARN_ON_ONCE(cfe->css && cfe->css != css);
cfe->css = css;
if (cft->read_map || cft->read_seq_string) {
file->f_op = &cgroup_seqfile_operations;
err = single_open(file, cgroup_seqfile_show, cfe);
} else if (cft->open) {
err = cft->open(inode, file);
of = __seq_open_private(file, &cgroup_seq_operations,
sizeof(struct cgroup_open_file));
if (of) {
of->cfe = cfe;
return 0;
}
if (css->ss && err)
if (css->ss)
css_put(css);
return err;
return -ENOMEM;
}
static int cgroup_file_release(struct inode *inode, struct file *file)
{
struct cfent *cfe = __d_cfe(file->f_dentry);
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup_subsys_state *css = cfe->css;
int ret = 0;
if (cft->release)
ret = cft->release(inode, file);
if (css->ss)
css_put(css);
if (file->f_op == &cgroup_seqfile_operations)
single_release(inode, file);
return ret;
return seq_release_private(inode, file);
}
/*
......@@ -2614,7 +2543,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
}
static const struct file_operations cgroup_file_operations = {
.read = cgroup_file_read,
.read = seq_read,
.write = cgroup_file_write,
.llseek = generic_file_llseek,
.open = cgroup_file_open,
......@@ -2639,16 +2568,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
.removexattr = cgroup_removexattr,
};
/*
* Check if a file is a control file
*/
static inline struct cftype *__file_cft(struct file *file)
{
if (file_inode(file)->i_fop != &cgroup_file_operations)
return ERR_PTR(-EINVAL);
return __d_cft(file->f_dentry);
}
static int cgroup_create_file(struct dentry *dentry, umode_t mode,
struct super_block *sb)
{
......@@ -2706,12 +2625,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
if (cft->mode)
return cft->mode;
if (cft->read || cft->read_u64 || cft->read_s64 ||
cft->read_map || cft->read_seq_string)
if (cft->read_u64 || cft->read_s64 || cft->seq_show)
mode |= S_IRUGO;
if (cft->write || cft->write_u64 || cft->write_s64 ||
cft->write_string || cft->trigger)
if (cft->write_u64 || cft->write_s64 || cft->write_string ||
cft->trigger)
mode |= S_IWUSR;
return mode;
......@@ -3007,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void)
* @parent_css: css whose children to walk
*
* This function returns the next child of @parent_css and should be called
* under RCU read lock. The only requirement is that @parent_css and
* @pos_css are accessible. The next sibling is guaranteed to be returned
* regardless of their states.
* under either cgroup_mutex or RCU read lock. The only requirement is
* that @parent_css and @pos_css are accessible. The next sibling is
* guaranteed to be returned regardless of their states.
*/
struct cgroup_subsys_state *
css_next_child(struct cgroup_subsys_state *pos_css,
......@@ -3019,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
struct cgroup *cgrp = parent_css->cgroup;
struct cgroup *next;
WARN_ON_ONCE(!rcu_read_lock_held());
cgroup_assert_mutex_or_rcu_locked();
/*
* @pos could already have been removed. Once a cgroup is removed,
......@@ -3066,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child);
* to visit for pre-order traversal of @root's descendants. @root is
* included in the iteration and the first node to be visited.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct next descendant as long as both @pos
* and @root are accessible and @pos is a descendant of @root.
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct next descendant as long
* as both @pos and @root are accessible and @pos is a descendant of @root.
*/
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
......@@ -3077,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
{
struct cgroup_subsys_state *next;
WARN_ON_ONCE(!rcu_read_lock_held());
cgroup_assert_mutex_or_rcu_locked();
/* if first iteration, visit @root */
if (!pos)
......@@ -3108,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
* is returned. This can be used during pre-order traversal to skip
* subtree of @pos.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct rightmost descendant as long as @pos is
* accessible.
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct rightmost descendant as
* long as @pos is accessible.
*/
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
struct cgroup_subsys_state *last, *tmp;
WARN_ON_ONCE(!rcu_read_lock_held());
cgroup_assert_mutex_or_rcu_locked();
do {
last = pos;
......@@ -3154,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
* to visit for post-order traversal of @root's descendants. @root is
* included in the iteration and the last node to be visited.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct next descendant as long as both @pos
* and @cgroup are accessible and @pos is a descendant of @cgroup.
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct next descendant as long
* as both @pos and @cgroup are accessible and @pos is a descendant of
* @cgroup.
*/
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
......@@ -3165,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
{
struct cgroup_subsys_state *next;
WARN_ON_ONCE(!rcu_read_lock_held());
cgroup_assert_mutex_or_rcu_locked();
/* if first iteration, visit leftmost descendant which may be @root */
if (!pos)
......@@ -3504,14 +3423,12 @@ struct cgroup_pidlist {
pid_t *list;
/* how many elements the above list has */
int length;
/* how many files are using the current array */
int use_count;
/* each of these stored in a list by its cgroup */
struct list_head links;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup *owner;
/* protects the other fields */
struct rw_semaphore rwsem;
/* for delayed destruction */
struct delayed_work destroy_dwork;
};
/*
......@@ -3527,6 +3444,7 @@ static void *pidlist_allocate(int count)
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
......@@ -3535,6 +3453,47 @@ static void pidlist_free(void *p)
kfree(p);
}
/*
* Used to destroy all pidlists lingering waiting for destroy timer. None
* should be left afterwards.
*/
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
struct cgroup_pidlist *l, *tmp_l;
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
mutex_unlock(&cgrp->pidlist_mutex);
flush_workqueue(cgroup_pidlist_destroy_wq);
BUG_ON(!list_empty(&cgrp->pidlists));
}
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
destroy_dwork);
struct cgroup_pidlist *tofree = NULL;
mutex_lock(&l->owner->pidlist_mutex);
/*
* Destroy iff we didn't get queued again. The state won't change
* as destroy_dwork can only be queued while locked.
*/
if (!delayed_work_pending(dwork)) {
list_del(&l->links);
pidlist_free(l->list);
put_pid_ns(l->key.ns);
tofree = l;
}
mutex_unlock(&l->owner->pidlist_mutex);
kfree(tofree);
}
/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* Returns the number of unique elements.
......@@ -3565,52 +3524,92 @@ static int pidlist_uniq(pid_t *list, int length)
return dest;
}
/*
* The two pid files - task and cgroup.procs - guaranteed that the result
* is sorted, which forced this whole pidlist fiasco. As pid order is
* different per namespace, each namespace needs differently sorted list,
* making it impossible to use, for example, single rbtree of member tasks
* sorted by task pointer. As pidlists can be fairly large, allocating one
* per open file is dangerous, so cgroup had to implement shared pool of
* pidlists keyed by cgroup and namespace.
*
* All this extra complexity was caused by the original implementation
* committing to an entirely unnecessary property. In the long term, we
* want to do away with it. Explicitly scramble sort order if
* sane_behavior so that no such expectation exists in the new interface.
*
* Scrambling is done by swapping every two consecutive bits, which is
* non-identity one-to-one mapping which disturbs sort order sufficiently.
*/
static pid_t pid_fry(pid_t pid)
{
unsigned a = pid & 0x55555555;
unsigned b = pid & 0xAAAAAAAA;
return (a << 1) | (b >> 1);
}
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
if (cgroup_sane_behavior(cgrp))
return pid_fry(pid);
else
return pid;
}
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
static int fried_cmppid(const void *a, const void *b)
{
return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
}
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
struct pid_namespace *ns = task_active_pid_ns(current);
lockdep_assert_held(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links)
if (l->key.type == type && l->key.ns == ns)
return l;
return NULL;
}
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
struct pid_namespace *ns = task_active_pid_ns(current);
/*
* We can't drop the pidlist_mutex before taking the l->rwsem in case
* the last ref-holder is trying to remove l from the list at the same
* time. Holding the pidlist_mutex precludes somebody taking whichever
* list we find out from under us - compare release_pid_array().
*/
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links) {
if (l->key.type == type && l->key.ns == ns) {
/* make sure l doesn't vanish out from under us */
down_write(&l->rwsem);
mutex_unlock(&cgrp->pidlist_mutex);
lockdep_assert_held(&cgrp->pidlist_mutex);
l = cgroup_pidlist_find(cgrp, type);
if (l)
return l;
}
}
/* entry not found; create a new one */
l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
if (!l) {
mutex_unlock(&cgrp->pidlist_mutex);
if (!l)
return l;
}
init_rwsem(&l->rwsem);
down_write(&l->rwsem);
INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
l->key.type = type;
l->key.ns = get_pid_ns(ns);
/* don't need task_nsproxy() if we're looking at ourself */
l->key.ns = get_pid_ns(task_active_pid_ns(current));
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
......@@ -3627,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
struct task_struct *tsk;
struct cgroup_pidlist *l;
lockdep_assert_held(&cgrp->pidlist_mutex);
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
......@@ -3653,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
if (cgroup_sane_behavior(cgrp))
sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
else
sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
l = cgroup_pidlist_find(cgrp, type);
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
mutex_unlock(&cgrp->pidlist_mutex);
pidlist_free(array);
return -ENOMEM;
}
/* store array, freeing old if necessary - lock already held */
/* store array, freeing old if necessary */
pidlist_free(l->list);
l->list = array;
l->length = length;
l->use_count++;
up_write(&l->rwsem);
*lp = l;
return 0;
}
......@@ -3740,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
struct cgroup_pidlist *l = s->private;
struct cgroup_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
int index = 0, pid = *pos;
int *iter;
int *iter, ret;
mutex_lock(&cgrp->pidlist_mutex);
/*
* !NULL @of->priv indicates that this isn't the first start()
* after open. If the matching pidlist is around, we can use that.
* Look for it. Note that @of->priv can't be used directly. It
* could already have been destroyed.
*/
if (of->priv)
of->priv = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
if (!of->priv) {
ret = pidlist_array_load(cgrp, type,
(struct cgroup_pidlist **)&of->priv);
if (ret)
return ERR_PTR(ret);
}
l = of->priv;
down_read(&l->rwsem);
if (pid) {
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
if (l->list[mid] == pid) {
if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
} else if (l->list[mid] <= pid)
} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
......@@ -3764,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
return NULL;
/* Update the abstract position to be the actual pid that we found */
iter = l->list + index;
*pos = *iter;
*pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
struct cgroup_pidlist *l = s->private;
up_read(&l->rwsem);
struct cgroup_open_file *of = s->private;
struct cgroup_pidlist *l = of->priv;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
CGROUP_PIDLIST_DESTROY_DELAY);
mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
struct cgroup_pidlist *l = s->private;
struct cgroup_open_file *of = s->private;
struct cgroup_pidlist *l = of->priv;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
......@@ -3787,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
if (p >= end) {
return NULL;
} else {
*pos = *p;
*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
return p;
}
}
......@@ -3808,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
.show = cgroup_pidlist_show,
};
static void cgroup_release_pid_array(struct cgroup_pidlist *l)
{
/*
* the case where we're the last user of this particular pidlist will
* have us remove it from the cgroup's list, which entails taking the
* mutex. since in pidlist_find the pidlist->lock depends on cgroup->
* pidlist_mutex, we have to take pidlist_mutex first.
*/
mutex_lock(&l->owner->pidlist_mutex);
down_write(&l->rwsem);
BUG_ON(!l->use_count);
if (!--l->use_count) {
/* we're the last user if refcount is 0; remove and free */
list_del(&l->links);
mutex_unlock(&l->owner->pidlist_mutex);
pidlist_free(l->list);
put_pid_ns(l->key.ns);
up_write(&l->rwsem);
kfree(l);
return;
}
mutex_unlock(&l->owner->pidlist_mutex);
up_write(&l->rwsem);
}
static int cgroup_pidlist_release(struct inode *inode, struct file *file)
{
struct cgroup_pidlist *l;
if (!(file->f_mode & FMODE_READ))
return 0;
/*
* the seq_file will only be initialized if the file was opened for
* reading; hence we check if it's not null only in that case.
*/
l = ((struct seq_file *)file->private_data)->private;
cgroup_release_pid_array(l);
return seq_release(inode, file);
}
static const struct file_operations cgroup_pidlist_operations = {
.read = seq_read,
.llseek = seq_lseek,
.write = cgroup_file_write,
.release = cgroup_pidlist_release,
};
/*
* The following functions handle opens on a file that displays a pidlist
* (tasks or procs). Prepare an array of the process/thread IDs of whoever's
* in the cgroup.
*/
/* helper function for the two below it */
static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
struct cgroup_pidlist *l;
int retval;
/* Nothing to do for write-only files */
if (!(file->f_mode & FMODE_READ))
return 0;
/* have the array populated */
retval = pidlist_array_load(cgrp, type, &l);
if (retval)
return retval;
/* configure file information */
file->f_op = &cgroup_pidlist_operations;
retval = seq_open(file, &cgroup_pidlist_seq_operations);
if (retval) {
cgroup_release_pid_array(l);
return retval;
}
((struct seq_file *)file->private_data)->private = l;
return 0;
}
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
}
static int cgroup_procs_open(struct inode *unused, struct file *file)
{
return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
}
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft)
{
......@@ -3928,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp)
deactivate_super(sb);
}
/*
* Unregister event and free resources.
*
* Gets called from workqueue.
*/
static void cgroup_event_remove(struct work_struct *work)
{
struct cgroup_event *event = container_of(work, struct cgroup_event,
remove);
struct cgroup_subsys_state *css = event->css;
remove_wait_queue(event->wqh, &event->wait);
event->cft->unregister_event(css, event->cft, event->eventfd);
/* Notify userspace the event is going away. */
eventfd_signal(event->eventfd, 1);
eventfd_ctx_put(event->eventfd);
kfree(event);
css_put(css);
}
/*
* Gets called on POLLHUP on eventfd when user closes it.
*
* Called with wqh->lock held and interrupts disabled.
*/
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
int sync, void *key)
{
struct cgroup_event *event = container_of(wait,
struct cgroup_event, wait);
struct cgroup *cgrp = event->css->cgroup;
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
/*
* If the event has been detached at cgroup removal, we
* can simply return knowing the other side will cleanup
* for us.
*
* We can't race against event freeing since the other
* side will require wqh->lock via remove_wait_queue(),
* which we hold.
*/
spin_lock(&cgrp->event_list_lock);
if (!list_empty(&event->list)) {
list_del_init(&event->list);
/*
* We are in atomic context, but cgroup_event_remove()
* may sleep, so we have to call it in workqueue.
*/
schedule_work(&event->remove);
}
spin_unlock(&cgrp->event_list_lock);
}
return 0;
}
static void cgroup_event_ptable_queue_proc(struct file *file,
wait_queue_head_t *wqh, poll_table *pt)
{
struct cgroup_event *event = container_of(pt,
struct cgroup_event, pt);
event->wqh = wqh;
add_wait_queue(wqh, &event->wait);
}
/*
* Parse input and register new cgroup event handler.
*
* Input must be in format '<event_fd> <control_fd> <args>'.
* Interpretation of args is defined by control file implementation.
*/
static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
struct cftype *cft, const char *buffer)
{
struct cgroup *cgrp = dummy_css->cgroup;
struct cgroup_event *event;
struct cgroup_subsys_state *cfile_css;
unsigned int efd, cfd;
struct fd efile;
struct fd cfile;
char *endp;
int ret;
efd = simple_strtoul(buffer, &endp, 10);
if (*endp != ' ')
return -EINVAL;
buffer = endp + 1;
cfd = simple_strtoul(buffer, &endp, 10);
if ((*endp != ' ') && (*endp != '\0'))
return -EINVAL;
buffer = endp + 1;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
INIT_LIST_HEAD(&event->list);
init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
INIT_WORK(&event->remove, cgroup_event_remove);
efile = fdget(efd);
if (!efile.file) {
ret = -EBADF;
goto out_kfree;
}
event->eventfd = eventfd_ctx_fileget(efile.file);
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto out_put_efile;
}
cfile = fdget(cfd);
if (!cfile.file) {
ret = -EBADF;
goto out_put_eventfd;
}
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
ret = inode_permission(file_inode(cfile.file), MAY_READ);
if (ret < 0)
goto out_put_cfile;
event->cft = __file_cft(cfile.file);
if (IS_ERR(event->cft)) {
ret = PTR_ERR(event->cft);
goto out_put_cfile;
}
if (!event->cft->ss) {
ret = -EBADF;
goto out_put_cfile;
}
/*
* Determine the css of @cfile, verify it belongs to the same
* cgroup as cgroup.event_control, and associate @event with it.
* Remaining events are automatically removed on cgroup destruction
* but the removal is asynchronous, so take an extra ref.
*/
rcu_read_lock();
ret = -EINVAL;
event->css = cgroup_css(cgrp, event->cft->ss);
cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
if (event->css && event->css == cfile_css && css_tryget(event->css))
ret = 0;
rcu_read_unlock();
if (ret)
goto out_put_cfile;
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto out_put_css;
}
ret = event->cft->register_event(event->css, event->cft,
event->eventfd, buffer);
if (ret)
goto out_put_css;
efile.file->f_op->poll(efile.file, &event->pt);
spin_lock(&cgrp->event_list_lock);
list_add(&event->list, &cgrp->event_list);
spin_unlock(&cgrp->event_list_lock);
fdput(cfile);
fdput(efile);
return 0;
out_put_css:
css_put(event->css);
out_put_cfile:
fdput(cfile);
out_put_eventfd:
eventfd_ctx_put(event->eventfd);
out_put_efile:
fdput(efile);
out_kfree:
kfree(event);
return ret;
}
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
......@@ -4143,16 +3897,14 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.procs",
.open = cgroup_procs_open,
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
.write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
{
.name = "cgroup.event_control",
.write_string = cgroup_write_event_control,
.mode = S_IWUGO,
},
{
.name = "cgroup.clone_children",
.flags = CFTYPE_INSANE,
......@@ -4162,7 +3914,7 @@ static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.sane_behavior",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_sane_behavior_show,
.seq_show = cgroup_sane_behavior_show,
},
/*
......@@ -4173,9 +3925,12 @@ static struct cftype cgroup_base_files[] = {
{
.name = "tasks",
.flags = CFTYPE_INSANE, /* use "procs" instead */
.open = cgroup_tasks_open,
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_TASKS,
.write_u64 = cgroup_tasks_write,
.release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
{
......@@ -4187,7 +3942,7 @@ static struct cftype cgroup_base_files[] = {
{
.name = "release_agent",
.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_release_agent_show,
.seq_show = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
},
......@@ -4333,6 +4088,62 @@ static void offline_css(struct cgroup_subsys_state *css)
RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
}
/**
* create_css - create a cgroup_subsys_state
* @cgrp: the cgroup new css will be associated with
* @ss: the subsys of new css
*
* Create a new css associated with @cgrp - @ss pair. On success, the new
* css is online and installed in @cgrp with all interface files created.
* Returns 0 on success, -errno on failure.
*/
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
struct cgroup *parent = cgrp->parent;
struct cgroup_subsys_state *css;
int err;
lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
css = ss->css_alloc(cgroup_css(parent, ss));
if (IS_ERR(css))
return PTR_ERR(css);
err = percpu_ref_init(&css->refcnt, css_release);
if (err)
goto err_free;
init_css(css, ss, cgrp);
err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
if (err)
goto err_free;
err = online_css(css);
if (err)
goto err_free;
dget(cgrp->dentry);
css_get(css->parent);
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
current->comm, current->pid, ss->name);
if (!strcmp(ss->name, "memory"))
pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
ss->warned_broken_hierarchy = true;
}
return 0;
err_free:
percpu_ref_cancel_init(&css->refcnt);
ss->css_free(css);
return err;
}
/*
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
......@@ -4344,11 +4155,10 @@ static void offline_css(struct cgroup_subsys_state *css)
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
umode_t mode)
{
struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
struct cgroup *cgrp;
struct cgroup_name *name;
struct cgroupfs_root *root = parent->root;
int err = 0;
int ssid, err = 0;
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
......@@ -4404,23 +4214,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css;
css = ss->css_alloc(cgroup_css(parent, ss));
if (IS_ERR(css)) {
err = PTR_ERR(css);
goto err_free_all;
}
css_ar[ss->subsys_id] = css;
err = percpu_ref_init(&css->refcnt, css_release);
if (err)
goto err_free_all;
init_css(css, ss, cgrp);
}
/*
* Create directory. cgroup_create_file() returns with the new
* directory locked on success so that it can be populated without
......@@ -4428,7 +4221,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
*/
err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
if (err < 0)
goto err_free_all;
goto err_unlock;
lockdep_assert_held(&dentry->d_inode->i_mutex);
cgrp->serial_nr = cgroup_serial_nr_next++;
......@@ -4440,55 +4233,31 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
/* hold a ref to the parent's dentry */
dget(parent->dentry);
/* creation succeeded, notify subsystems */
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
err = online_css(css);
if (err)
goto err_destroy;
/* each css holds a ref to the cgroup's dentry and parent css */
dget(dentry);
css_get(css->parent);
/* mark it consumed for error path */
css_ar[ss->subsys_id] = NULL;
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
current->comm, current->pid, ss->name);
if (!strcmp(ss->name, "memory"))
pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
ss->warned_broken_hierarchy = true;
}
}
/*
* @cgrp is now fully operational. If something fails after this
* point, it'll be released via the normal destruction path.
*/
idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
if (err)
goto err_destroy;
err = cgroup_populate_dir(cgrp, root->subsys_mask);
/* let's create and online css's */
for_each_subsys(ss, ssid) {
if (root->subsys_mask & (1 << ssid)) {
err = create_css(cgrp, ss);
if (err)
goto err_destroy;
}
}
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return 0;
err_free_all:
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
if (css) {
percpu_ref_cancel_init(&css->refcnt);
ss->css_free(css);
}
}
err_unlock:
mutex_unlock(&cgroup_mutex);
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
......@@ -4501,14 +4270,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
return err;
err_destroy:
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
if (css) {
percpu_ref_cancel_init(&css->refcnt);
ss->css_free(css);
}
}
cgroup_destroy_locked(cgrp);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&dentry->d_inode->i_mutex);
......@@ -4631,10 +4392,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct dentry *d = cgrp->dentry;
struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
struct cgroup_subsys_state *css;
struct cgroup *child;
bool empty;
int ssid;
lockdep_assert_held(&d->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
......@@ -4670,12 +4431,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
* will be invoked to perform the rest of destruction once the
* percpu refs of all css's are confirmed to be killed.
*/
for_each_root_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
if (css)
for_each_css(css, ssid, cgrp)
kill_css(css);
}
/*
* Mark @cgrp dead. This prevents further task migration and child
......@@ -4710,18 +4467,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
dget(d);
cgroup_d_remove_dir(d);
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace.
*/
spin_lock(&cgrp->event_list_lock);
list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
list_del_init(&event->list);
schedule_work(&event->remove);
}
spin_unlock(&cgrp->event_list_lock);
return 0;
};
......@@ -4792,7 +4537,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
cgroup_init_cftsets(ss);
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
ss->root = &cgroup_dummy_root;
css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
/* We don't handle early failures gracefully */
......@@ -4866,6 +4610,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
cgroup_init_cftsets(ss);
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
cgroup_subsys[ss->subsys_id] = ss;
/*
......@@ -4877,11 +4622,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
if (IS_ERR(css)) {
/* failure case - need to deassign the cgroup_subsys[] slot. */
cgroup_subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
return PTR_ERR(css);
}
list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
ss->root = &cgroup_dummy_root;
/* our new subsystem will be attached to the dummy hierarchy. */
......@@ -4911,14 +4656,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
write_unlock(&css_set_lock);
ret = online_css(css);
if (ret)
if (ret) {
ss->css_free(css);
goto err_unload;
}
/* success! */
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
return 0;
err_unload:
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
/* @ss can't be mounted here as try_module_get() would fail */
cgroup_unload_subsys(ss);
......@@ -4937,6 +4686,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cgrp_cset_link *link;
struct cgroup_subsys_state *css;
BUG_ON(ss->module == NULL);
......@@ -4948,15 +4698,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
BUG_ON(ss->root != &cgroup_dummy_root);
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
offline_css(cgroup_css(cgroup_dummy_top, ss));
css = cgroup_css(cgroup_dummy_top, ss);
if (css)
offline_css(css);
/* deassign the subsys_id */
cgroup_subsys[ss->subsys_id] = NULL;
/* remove subsystem from the dummy root's list of subsystems */
list_del_init(&ss->sibling);
/*
* disentangle the css from all css_sets attached to the dummy
* top. as in loading, we need to pay our respects to the hashtable
......@@ -4979,9 +4729,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
* need to free before marking as null because ss->css_free needs
* the cgrp->subsys pointer to find their state.
*/
ss->css_free(cgroup_css(cgroup_dummy_top, ss));
if (css)
ss->css_free(css);
RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
......@@ -5100,6 +4852,15 @@ static int __init cgroup_wq_init(void)
*/
cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
BUG_ON(!cgroup_destroy_wq);
/*
* Used to destroy pidlists and separate to serve as flush domain.
* Cap @max_active to 1 too.
*/
cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
0, 1);
BUG_ON(!cgroup_pidlist_destroy_wq);
return 0;
}
core_initcall(cgroup_wq_init);
......@@ -5143,10 +4904,11 @@ int proc_cgroup_show(struct seq_file *m, void *v)
for_each_active_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
int count = 0;
int ssid, count = 0;
seq_printf(m, "%d:", root->hierarchy_id);
for_each_root_subsys(root, ss)
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
......@@ -5488,16 +5250,16 @@ __setup("cgroup_disable=", cgroup_disable);
* @dentry: directory dentry of interest
* @ss: subsystem of interest
*
* Must be called under RCU read lock. The caller is responsible for
* pinning the returned css if it needs to be accessed outside the RCU
* critical section.
* Must be called under cgroup_mutex or RCU read lock. The caller is
* responsible for pinning the returned css if it needs to be accessed
* outside the critical section.
*/
struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
struct cgroup_subsys *ss)
{
struct cgroup *cgrp;
WARN_ON_ONCE(!rcu_read_lock_held());
cgroup_assert_mutex_or_rcu_locked();
/* is @dentry a cgroup dir? */
if (!dentry->d_inode ||
......@@ -5520,9 +5282,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
struct cgroup *cgrp;
rcu_lockdep_assert(rcu_read_lock_held() ||
lockdep_is_held(&cgroup_mutex),
"css_from_id() needs proper protection");
cgroup_assert_mutex_or_rcu_locked();
cgrp = idr_find(&ss->root->cgroup_idr, id);
if (cgrp)
......@@ -5570,9 +5330,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
return count;
}
static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
struct cftype *cft,
struct seq_file *seq)
static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
struct cgrp_cset_link *link;
struct css_set *cset;
......@@ -5597,9 +5355,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
}
#define MAX_TASKS_SHOWN_PER_CSS 25
static int cgroup_css_links_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *seq)
static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
read_lock(&css_set_lock);
......@@ -5645,12 +5403,12 @@ static struct cftype debug_files[] = {
{
.name = "current_css_set_cg_links",
.read_seq_string = current_css_set_cg_links_read,
.seq_show = current_css_set_cg_links_read,
},
{
.name = "cgroup_css_links",
.read_seq_string = cgroup_css_links_read,
.seq_show = cgroup_css_links_read,
},
{
......
......@@ -301,10 +301,9 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
spin_unlock_irq(&freezer->lock);
}
static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft,
struct seq_file *m)
static int freezer_read(struct seq_file *m, void *v)
{
struct cgroup_subsys_state *pos;
struct cgroup_subsys_state *css = seq_css(m), *pos;
rcu_read_lock();
......@@ -458,7 +457,7 @@ static struct cftype files[] = {
{
.name = "state",
.flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = freezer_read,
.seq_show = freezer_read,
.write_string = freezer_write,
},
{
......
......@@ -1731,66 +1731,41 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
* used, list of ranges of sequential numbers, is variable length,
* and since these maps can change value dynamically, one could read
* gibberish by doing partial reads while a list was changing.
* A single large read to a buffer that crosses a page boundary is
* ok, because the result being copied to user land is not recomputed
* across a page fault.
*/
static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
size_t count;
mutex_lock(&callback_mutex);
count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
mutex_unlock(&callback_mutex);
struct cpuset *cs = css_cs(seq_css(sf));
cpuset_filetype_t type = seq_cft(sf)->private;
ssize_t count;
char *buf, *s;
int ret = 0;
return count;
}
static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
size_t count;
count = seq_get_buf(sf, &buf);
s = buf;
mutex_lock(&callback_mutex);
count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
mutex_unlock(&callback_mutex);
return count;
}
static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct file *file,
char __user *buf, size_t nbytes,
loff_t *ppos)
{
struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
char *page;
ssize_t retval = 0;
char *s;
if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
return -ENOMEM;
s = page;
switch (type) {
case FILE_CPULIST:
s += cpuset_sprintf_cpulist(s, cs);
s += cpulist_scnprintf(s, count, cs->cpus_allowed);
break;
case FILE_MEMLIST:
s += cpuset_sprintf_memlist(s, cs);
s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
default:
retval = -EINVAL;
goto out;
ret = -EINVAL;
goto out_unlock;
}
*s++ = '\n';
retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
out:
free_page((unsigned long)page);
return retval;
if (s < buf + count - 1) {
*s++ = '\n';
seq_commit(sf, s - buf);
} else {
seq_commit(sf, -1);
}
out_unlock:
mutex_unlock(&callback_mutex);
return ret;
}
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
......@@ -1847,7 +1822,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
static struct cftype files[] = {
{
.name = "cpus",
.read = cpuset_common_file_read,
.seq_show = cpuset_common_seq_show,
.write_string = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
......@@ -1855,7 +1830,7 @@ static struct cftype files[] = {
{
.name = "mems",
.read = cpuset_common_file_read,
.seq_show = cpuset_common_seq_show,
.write_string = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
......
......@@ -7852,15 +7852,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
struct cgroup_map_cb *cb)
static int cpu_stats_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(css);
struct task_group *tg = css_tg(seq_css(sf));
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
cb->fill(cb, "nr_periods", cfs_b->nr_periods);
cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
cb->fill(cb, "throttled_time", cfs_b->throttled_time);
seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
return 0;
}
......@@ -7914,7 +7913,7 @@ static struct cftype cpu_files[] = {
},
{
.name = "stat",
.read_map = cpu_stats_show,
.seq_show = cpu_stats_show,
},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
......
......@@ -163,10 +163,9 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
return err;
}
static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
{
struct cpuacct *ca = css_ca(css);
struct cpuacct *ca = css_ca(seq_css(m));
u64 percpu;
int i;
......@@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_SYSTEM] = "system",
};
static int cpuacct_stats_show(struct cgroup_subsys_state *css,
struct cftype *cft, struct cgroup_map_cb *cb)
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(css);
struct cpuacct *ca = css_ca(seq_css(sf));
int cpu;
s64 val = 0;
......@@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
val += kcpustat->cpustat[CPUTIME_NICE];
}
val = cputime64_to_clock_t(val);
cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
val = 0;
for_each_online_cpu(cpu) {
......@@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
}
val = cputime64_to_clock_t(val);
cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
return 0;
}
......@@ -220,11 +218,11 @@ static struct cftype files[] = {
},
{
.name = "usage_percpu",
.read_seq_string = cpuacct_percpu_seq_read,
.seq_show = cpuacct_percpu_seq_show,
},
{
.name = "stat",
.read_map = cpuacct_stats_show,
.seq_show = cpuacct_stats_show,
},
{ } /* terminate */
};
......
......@@ -242,22 +242,16 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
return;
}
static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct file *file,
char __user *buf, size_t nbytes,
loff_t *ppos)
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
u64 val;
char str[64];
int idx, name, len;
int idx, name;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
idx = MEMFILE_IDX(cft->private);
name = MEMFILE_ATTR(cft->private);
val = res_counter_read_u64(&h_cg->hugepage[idx], name);
len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
return res_counter_read_u64(&h_cg->hugepage[idx], name);
}
static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
......@@ -337,28 +331,28 @@ static void __init __hugetlb_cgroup_file_init(int idx)
cft = &h->cgroup_files[0];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
cft->read = hugetlb_cgroup_read;
cft->read_u64 = hugetlb_cgroup_read_u64;
cft->write_string = hugetlb_cgroup_write;
/* Add the usage file */
cft = &h->cgroup_files[1];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
cft->read = hugetlb_cgroup_read;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* Add the MAX usage file */
cft = &h->cgroup_files[2];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
cft->trigger = hugetlb_cgroup_reset;
cft->read = hugetlb_cgroup_read;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* Add the failcntfile */
cft = &h->cgroup_files[3];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
cft->trigger = hugetlb_cgroup_reset;
cft->read = hugetlb_cgroup_read;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* NULL terminate the last cft */
cft = &h->cgroup_files[4];
......
......@@ -45,6 +45,7 @@
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
......@@ -55,6 +56,7 @@
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
......@@ -227,6 +229,46 @@ struct mem_cgroup_eventfd_list {
struct eventfd_ctx *eventfd;
};
/*
* cgroup_event represents events which userspace want to receive.
*/
struct mem_cgroup_event {
/*
* memcg which the event belongs to.
*/
struct mem_cgroup *memcg;
/*
* eventfd to signal userspace about the event.
*/
struct eventfd_ctx *eventfd;
/*
* Each of these stored in a list by the cgroup.
*/
struct list_head list;
/*
* register_event() callback will be used to add new userspace
* waiter for changes related to this event. Use eventfd_signal()
* on eventfd to send notification to userspace.
*/
int (*register_event)(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args);
/*
* unregister_event() callback will be called when userspace closes
* the eventfd or on cgroup removing. This callback must be set,
* if you want provide notification functionality.
*/
void (*unregister_event)(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
/*
* All fields below needed to unregister event when
* userspace closes eventfd.
*/
poll_table pt;
wait_queue_head_t *wqh;
wait_queue_t wait;
struct work_struct remove;
};
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
......@@ -331,6 +373,10 @@ struct mem_cgroup {
atomic_t numainfo_updating;
#endif
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
......@@ -490,11 +536,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
{
return &mem_cgroup_from_css(css)->vmpressure;
}
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
......@@ -2976,10 +3017,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
}
#ifdef CONFIG_SLABINFO
static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct memcg_cache_params *params;
if (!memcg_can_account_kmem(memcg))
......@@ -5112,14 +5152,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
return val << PAGE_SHIFT;
}
static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct file *file,
char __user *buf, size_t nbytes, loff_t *ppos)
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
char str[64];
u64 val;
int name, len;
int name;
enum res_type type;
type = MEMFILE_TYPE(cft->private);
......@@ -5145,8 +5183,7 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
BUG();
}
len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
return val;
}
static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
......@@ -5383,8 +5420,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#endif
#ifdef CONFIG_NUMA
static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
const char *name;
......@@ -5400,7 +5436,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
const struct numa_stat *stat;
int nid;
unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
......@@ -5439,10 +5475,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
}
static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
struct seq_file *m)
static int memcg_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct mem_cgroup *mi;
unsigned int i;
......@@ -5651,13 +5686,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
mem_cgroup_oom_notify_cb(iter);
}
static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
enum res_type type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
int i, size, ret;
......@@ -5734,13 +5767,23 @@ static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
return ret;
}
static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd)
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
}
static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
}
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, enum res_type type)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
enum res_type type = MEMFILE_TYPE(cft->private);
u64 usage;
int i, j, size;
......@@ -5813,14 +5856,23 @@ static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
mutex_unlock(&memcg->thresholds_lock);
}
static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd)
{
return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
}
static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd)
{
return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
}
static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *event;
enum res_type type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
......@@ -5838,14 +5890,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
return 0;
}
static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd)
static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *ev, *tmp;
enum res_type type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
spin_lock(&memcg_oom_lock);
......@@ -5859,17 +5907,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
spin_unlock(&memcg_oom_lock);
}
static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct cgroup_map_cb *cb)
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
if (atomic_read(&memcg->under_oom))
cb->fill(cb, "under_oom", 1);
else
cb->fill(cb, "under_oom", 0);
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
return 0;
}
......@@ -5962,41 +6005,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
}
#endif
/*
* DO NOT USE IN NEW FILES.
*
* "cgroup.event_control" implementation.
*
* This is way over-engineered. It tries to support fully configurable
* events for each user. Such level of flexibility is completely
* unnecessary especially in the light of the planned unified hierarchy.
*
* Please deprecate this and replace with something simpler if at all
* possible.
*/
/*
* Unregister event and free resources.
*
* Gets called from workqueue.
*/
static void memcg_event_remove(struct work_struct *work)
{
struct mem_cgroup_event *event =
container_of(work, struct mem_cgroup_event, remove);
struct mem_cgroup *memcg = event->memcg;
remove_wait_queue(event->wqh, &event->wait);
event->unregister_event(memcg, event->eventfd);
/* Notify userspace the event is going away. */
eventfd_signal(event->eventfd, 1);
eventfd_ctx_put(event->eventfd);
kfree(event);
css_put(&memcg->css);
}
/*
* Gets called on POLLHUP on eventfd when user closes it.
*
* Called with wqh->lock held and interrupts disabled.
*/
static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
int sync, void *key)
{
struct mem_cgroup_event *event =
container_of(wait, struct mem_cgroup_event, wait);
struct mem_cgroup *memcg = event->memcg;
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
/*
* If the event has been detached at cgroup removal, we
* can simply return knowing the other side will cleanup
* for us.
*
* We can't race against event freeing since the other
* side will require wqh->lock via remove_wait_queue(),
* which we hold.
*/
spin_lock(&memcg->event_list_lock);
if (!list_empty(&event->list)) {
list_del_init(&event->list);
/*
* We are in atomic context, but cgroup_event_remove()
* may sleep, so we have to call it in workqueue.
*/
schedule_work(&event->remove);
}
spin_unlock(&memcg->event_list_lock);
}
return 0;
}
static void memcg_event_ptable_queue_proc(struct file *file,
wait_queue_head_t *wqh, poll_table *pt)
{
struct mem_cgroup_event *event =
container_of(pt, struct mem_cgroup_event, pt);
event->wqh = wqh;
add_wait_queue(wqh, &event->wait);
}
/*
* DO NOT USE IN NEW FILES.
*
* Parse input and register new cgroup event handler.
*
* Input must be in format '<event_fd> <control_fd> <args>'.
* Interpretation of args is defined by control file implementation.
*/
static int memcg_write_event_control(struct cgroup_subsys_state *css,
struct cftype *cft, const char *buffer)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_event *event;
struct cgroup_subsys_state *cfile_css;
unsigned int efd, cfd;
struct fd efile;
struct fd cfile;
const char *name;
char *endp;
int ret;
efd = simple_strtoul(buffer, &endp, 10);
if (*endp != ' ')
return -EINVAL;
buffer = endp + 1;
cfd = simple_strtoul(buffer, &endp, 10);
if ((*endp != ' ') && (*endp != '\0'))
return -EINVAL;
buffer = endp + 1;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
event->memcg = memcg;
INIT_LIST_HEAD(&event->list);
init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
init_waitqueue_func_entry(&event->wait, memcg_event_wake);
INIT_WORK(&event->remove, memcg_event_remove);
efile = fdget(efd);
if (!efile.file) {
ret = -EBADF;
goto out_kfree;
}
event->eventfd = eventfd_ctx_fileget(efile.file);
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto out_put_efile;
}
cfile = fdget(cfd);
if (!cfile.file) {
ret = -EBADF;
goto out_put_eventfd;
}
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
ret = inode_permission(file_inode(cfile.file), MAY_READ);
if (ret < 0)
goto out_put_cfile;
/*
* Determine the event callbacks and set them in @event. This used
* to be done via struct cftype but cgroup core no longer knows
* about these events. The following is crude but the whole thing
* is for compatibility anyway.
*
* DO NOT ADD NEW FILES.
*/
name = cfile.file->f_dentry->d_name.name;
if (!strcmp(name, "memory.usage_in_bytes")) {
event->register_event = mem_cgroup_usage_register_event;
event->unregister_event = mem_cgroup_usage_unregister_event;
} else if (!strcmp(name, "memory.oom_control")) {
event->register_event = mem_cgroup_oom_register_event;
event->unregister_event = mem_cgroup_oom_unregister_event;
} else if (!strcmp(name, "memory.pressure_level")) {
event->register_event = vmpressure_register_event;
event->unregister_event = vmpressure_unregister_event;
} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
event->register_event = memsw_cgroup_usage_register_event;
event->unregister_event = memsw_cgroup_usage_unregister_event;
} else {
ret = -EINVAL;
goto out_put_cfile;
}
/*
* Verify @cfile should belong to @css. Also, remaining events are
* automatically removed on cgroup destruction but the removal is
* asynchronous, so take an extra ref on @css.
*/
rcu_read_lock();
ret = -EINVAL;
cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
&mem_cgroup_subsys);
if (cfile_css == css && css_tryget(css))
ret = 0;
rcu_read_unlock();
if (ret)
goto out_put_cfile;
ret = event->register_event(memcg, event->eventfd, buffer);
if (ret)
goto out_put_css;
efile.file->f_op->poll(efile.file, &event->pt);
spin_lock(&memcg->event_list_lock);
list_add(&event->list, &memcg->event_list);
spin_unlock(&memcg->event_list_lock);
fdput(cfile);
fdput(efile);
return 0;
out_put_css:
css_put(css);
out_put_cfile:
fdput(cfile);
out_put_eventfd:
eventfd_ctx_put(event->eventfd);
out_put_efile:
fdput(efile);
out_kfree:
kfree(event);
return ret;
}
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read = mem_cgroup_read,
.register_event = mem_cgroup_usage_register_event,
.unregister_event = mem_cgroup_usage_unregister_event,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
.write_string = mem_cgroup_write,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
.write_string = mem_cgroup_write,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "stat",
.read_seq_string = memcg_stat_show,
.seq_show = memcg_stat_show,
},
{
.name = "force_empty",
......@@ -6008,6 +6271,12 @@ static struct cftype mem_cgroup_files[] = {
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
{
.name = "cgroup.event_control", /* XXX: for compat */
.write_string = memcg_write_event_control,
.flags = CFTYPE_NO_PREFIX,
.mode = S_IWUGO,
},
{
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
......@@ -6020,21 +6289,17 @@ static struct cftype mem_cgroup_files[] = {
},
{
.name = "oom_control",
.read_map = mem_cgroup_oom_control_read,
.seq_show = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
.register_event = mem_cgroup_oom_register_event,
.unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
.register_event = vmpressure_register_event,
.unregister_event = vmpressure_unregister_event,
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
.read_seq_string = memcg_numa_stat_show,
.seq_show = memcg_numa_stat_show,
},
#endif
#ifdef CONFIG_MEMCG_KMEM
......@@ -6042,29 +6307,29 @@ static struct cftype mem_cgroup_files[] = {
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
.write_string = mem_cgroup_write,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.failcnt",
.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
#ifdef CONFIG_SLABINFO
{
.name = "kmem.slabinfo",
.read_seq_string = mem_cgroup_slabinfo_read,
.seq_show = mem_cgroup_slabinfo_read,
},
#endif
#endif
......@@ -6076,27 +6341,25 @@ static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
.read = mem_cgroup_read,
.register_event = mem_cgroup_usage_register_event,
.unregister_event = mem_cgroup_usage_unregister_event,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
.write_string = mem_cgroup_write,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.failcnt",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
.trigger = mem_cgroup_reset,
.read = mem_cgroup_read,
.read_u64 = mem_cgroup_read_u64,
},
{ }, /* terminate */
};
......@@ -6268,6 +6531,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
return &memcg->css;
......@@ -6343,6 +6608,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_event *event, *tmp;
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace.
*/
spin_lock(&memcg->event_list_lock);
list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
list_del_init(&event->list);
schedule_work(&event->remove);
}
spin_unlock(&memcg->event_list_lock);
kmem_cgroup_css_offline(memcg);
......
......@@ -451,7 +451,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
* lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
* @ent: swap entry to be looked up.
*
* Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
* Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
*/
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
......
......@@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
/**
* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
* @css: css that is interested in vmpressure notifications
* @cft: cgroup control files handle
* @memcg: memcg that is interested in vmpressure notifications
* @eventfd: eventfd context to link notifications with
* @args: event arguments (used to set up a pressure level threshold)
*
......@@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
* "critical").
*
* This function should not be used directly, just pass it to (struct
* cftype).register_event, and then cgroup core will handle everything by
* itself.
* To be used as memcg event method.
*/
int vmpressure_register_event(struct cgroup_subsys_state *css,
struct cftype *cft, struct eventfd_ctx *eventfd,
const char *args)
int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
struct vmpressure *vmpr = css_to_vmpressure(css);
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
int level;
......@@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css,
/**
* vmpressure_unregister_event() - Unbind eventfd from vmpressure
* @css: css handle
* @cft: cgroup control files handle
* @memcg: memcg handle
* @eventfd: eventfd context that was used to link vmpressure with the @cg
*
* This function does internal manipulations to detach the @eventfd from
* the vmpressure notifications, and then frees internal resources
* associated with the @eventfd (but the @eventfd itself is not freed).
*
* This function should not be used directly, just pass it to (struct
* cftype).unregister_event, and then cgroup core will handle everything
* by itself.
* To be used as memcg event method.
*/
void vmpressure_unregister_event(struct cgroup_subsys_state *css,
struct cftype *cft,
void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd)
{
struct vmpressure *vmpr = css_to_vmpressure(css);
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
mutex_lock(&vmpr->events_lock);
......
......@@ -173,14 +173,14 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
return css->cgroup->id;
}
static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
struct cgroup_map_cb *cb)
static int read_priomap(struct seq_file *sf, void *v)
{
struct net_device *dev;
rcu_read_lock();
for_each_netdev_rcu(&init_net, dev)
cb->fill(cb, dev->name, netprio_prio(css, dev));
seq_printf(sf, "%s %u\n", dev->name,
netprio_prio(seq_css(sf), dev));
rcu_read_unlock();
return 0;
}
......@@ -238,7 +238,7 @@ static struct cftype ss_files[] = {
},
{
.name = "ifpriomap",
.read_map = read_priomap,
.seq_show = read_priomap,
.write_string = write_priomap,
},
{ } /* terminate */
......
......@@ -274,10 +274,9 @@ static void set_majmin(char *str, unsigned m)
sprintf(str, "%u", m);
}
static int devcgroup_seq_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
static int devcgroup_seq_show(struct seq_file *m, void *v)
{
struct dev_cgroup *devcgroup = css_to_devcgroup(css);
struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m));
struct dev_exception_item *ex;
char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
......@@ -679,7 +678,7 @@ static struct cftype dev_cgroup_files[] = {
},
{
.name = "list",
.read_seq_string = devcgroup_seq_read,
.seq_show = devcgroup_seq_show,
.private = DEVCG_LIST,
},
{ } /* terminate */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment