Commit 324bda9e authored by Alexei Starovoitov, committed by David S. Miller

bpf: multi program support for cgroup+bpf

Introduce the BPF_F_ALLOW_MULTI flag, which can be used to attach multiple
bpf programs to a cgroup.

The three possible flags for the BPF_PROG_ATTACH command differ as follows:
- NONE (default): no further bpf programs are allowed in the subtree.
- BPF_F_ALLOW_OVERRIDE: if a sub-cgroup installs some bpf program,
  the program in this cgroup yields to the sub-cgroup's program.
- BPF_F_ALLOW_MULTI: if a sub-cgroup installs some bpf program,
  that program runs in addition to the program in this cgroup.

NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't
change their behavior; it only clarifies their semantics in relation
to the new flag.

Only one program may be attached to a cgroup with the NONE or
BPF_F_ALLOW_OVERRIDE flag.
Multiple programs may be attached to a cgroup with the BPF_F_ALLOW_MULTI
flag. They are executed in FIFO order (those attached first run first).
The programs of the sub-cgroup are executed first, then the programs of
this cgroup, and then the programs of the parent cgroup.
All eligible programs are executed regardless of the return codes of
earlier programs.
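
As an illustration of the multi-attach semantics above, here is a minimal
user-space sketch of attaching two programs to one cgroup through the
bpf(2) syscall. The helper name and the fds (cgroup_fd, prog_fd1, prog_fd2,
assumed to come from open(2) on a cgroup directory and BPF_PROG_LOAD) are
hypothetical:

    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int prog_attach(int cgroup_fd, int prog_fd,
                           enum bpf_attach_type type, __u32 flags)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.target_fd = cgroup_fd;
            attr.attach_bpf_fd = prog_fd;
            attr.attach_type = type;
            attr.attach_flags = flags;

            return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
    }

    /* Usage (hypothetical fds):
     *   prog_attach(cgroup_fd, prog_fd1, BPF_CGROUP_INET_EGRESS,
     *               BPF_F_ALLOW_MULTI);
     *   prog_attach(cgroup_fd, prog_fd2, BPF_CGROUP_INET_EGRESS,
     *               BPF_F_ALLOW_MULTI);
     * both calls succeed, and on egress prog_fd1 runs before prog_fd2 (FIFO).
     */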

To allow efficient execution of multiple programs attached to a cgroup,
and to avoid penalizing cgroups without any programs attached, introduce
'struct bpf_prog_array', an RCU-protected array of pointers to bpf
programs.
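
The RCU protection implies a simple update discipline, sketched below under
the assumption that writers are serialized externally (the patch uses
cgroup_mutex for this); the function name is illustrative, not one this
patch adds:

    static void replace_effective(struct bpf_prog_array __rcu **slot,
                                  struct bpf_prog_array __rcu *new_array)
    {
            struct bpf_prog_array __rcu *old_array;

            /* readers under rcu_read_lock() see either the old or the
             * new array in full, never a mix
             */
            old_array = xchg(slot, new_array);

            /* freed after a grace period (kfree_rcu), since other cpus
             * may still be walking old_array
             */
            bpf_prog_array_free(old_array);
    }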
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
for cgroup bits
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent c818fa9e
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -14,27 +14,42 @@ struct bpf_sock_ops_kern;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
+struct bpf_prog_list {
+	struct list_head node;
+	struct bpf_prog *prog;
+};
+
+struct bpf_prog_array;
+
 struct cgroup_bpf {
-	/*
-	 * Store two sets of bpf_prog pointers, one for programs that are
-	 * pinned directly to this cgroup, and one for those that are effective
-	 * when this cgroup is accessed.
+	/* array of effective progs in this cgroup */
+	struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE];
+
+	/* attached progs to this cgroup and attach flags
+	 * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will
+	 * have either zero or one element
+	 * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
 	 */
-	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
-	struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE];
-	bool disallow_override[MAX_BPF_ATTACH_TYPE];
+	struct list_head progs[MAX_BPF_ATTACH_TYPE];
+	u32 flags[MAX_BPF_ATTACH_TYPE];
+
+	/* temp storage for effective prog array used by prog_attach/detach */
+	struct bpf_prog_array __rcu *inactive;
 };
 
 void cgroup_bpf_put(struct cgroup *cgrp);
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+int cgroup_bpf_inherit(struct cgroup *cgrp);
 
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
-			struct bpf_prog *prog, enum bpf_attach_type type,
-			bool overridable);
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
 
-/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable);
+/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,

@@ -96,8 +111,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 struct cgroup_bpf {};
 
 static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
-static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
-				      struct cgroup *parent) {}
+static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
......
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -241,6 +241,38 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 			  union bpf_attr __user *uattr);
 
+/* an array of programs to be executed under rcu_lock.
+ *
+ * Typical usage:
+ * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN);
+ *
+ * the structure returned by bpf_prog_array_alloc() should be populated
+ * with program pointers and the last pointer must be NULL.
+ * The user has to keep refcnt on the program and make sure the program
+ * is removed from the array before bpf_prog_put().
+ * The 'struct bpf_prog_array *' should only be replaced with xchg()
+ * since other cpus are walking the array of pointers in parallel.
+ */
+struct bpf_prog_array {
+	struct rcu_head rcu;
+	struct bpf_prog *progs[0];
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
+
+#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+	({						\
+		struct bpf_prog **_prog;		\
+		u32 _ret = 1;				\
+		rcu_read_lock();			\
+		_prog = rcu_dereference(array)->progs;	\
+		for (; *_prog; _prog++)			\
+			_ret &= func(*_prog, ctx);	\
+		rcu_read_unlock();			\
+		_ret;					\
+	 })
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
......
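Note that the `_ret &= func(...)` accumulation above means every program in
the array runs, and the combined result is 0 as soon as any program returned
0. A hedged kernel-side sketch of a caller (the function name is
illustrative; the real call sites live in the collapsed kernel/bpf/cgroup.c
diff below):

    static int run_egress_filters(struct bpf_prog_array __rcu *effective,
                                  struct sk_buff *skb)
    {
            u32 ret;

            /* executes all attached programs under rcu_read_lock();
             * ret == 1 only if every program returned 1 (pass)
             */
            ret = BPF_PROG_RUN_ARRAY(effective, skb, BPF_PROG_RUN);
            return ret == 1 ? 0 : -EPERM;
    }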
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -481,7 +481,7 @@ struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
-#define BPF_PROG_RUN(filter, ctx)  (*filter->bpf_func)(ctx, filter->insnsi)
+#define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
......
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -143,11 +143,47 @@ enum bpf_attach_type {
 
 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
 
-/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
- * to the given target_fd cgroup the descendent cgroup will be able to
- * override effective bpf program that was inherited from this cgroup
+/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
+ *
+ * NONE(default): No further bpf programs allowed in the subtree.
+ *
+ * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
+ * the program in this cgroup yields to sub-cgroup program.
+ *
+ * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
+ * that cgroup program gets run in addition to the program in this cgroup.
+ *
+ * Only one program is allowed to be attached to a cgroup with
+ * NONE or BPF_F_ALLOW_OVERRIDE flag.
+ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will
+ * release old program and attach the new one. Attach flags has to match.
+ *
+ * Multiple programs are allowed to be attached to a cgroup with
+ * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
+ * (those that were attached first, run first)
+ * The programs of sub-cgroup are executed first, then programs of
+ * this cgroup and then programs of parent cgroup.
+ * When children program makes decision (like picking TCP CA or sock bind)
+ * parent program has a chance to override it.
+ *
+ * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
+ * A cgroup with NONE doesn't allow any programs in sub-cgroups.
+ * Ex1:
+ * cgrp1 (MULTI progs A, B) ->
+ *    cgrp2 (OVERRIDE prog C) ->
+ *      cgrp3 (MULTI prog D) ->
+ *        cgrp4 (OVERRIDE prog E) ->
+ *          cgrp5 (NONE prog F)
+ * the event in cgrp5 triggers execution of F,D,A,B in that order.
+ * if prog F is detached, the execution is E,D,A,B
+ * if prog F and D are detached, the execution is E,A,B
+ * if prog F, E and D are detached, the execution is C,A,B
+ *
+ * All eligible programs are executed regardless of return code from
+ * earlier programs.
  */
 #define BPF_F_ALLOW_OVERRIDE	(1U << 0)
+#define BPF_F_ALLOW_MULTI	(1U << 1)
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will perform strict alignment checking as if the kernel
......
This diff is collapsed (kernel/bpf/cgroup.c).
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1381,6 +1381,37 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+	struct bpf_prog_array hdr;
+	struct bpf_prog *null_prog;
+} empty_prog_array = {
+	.null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+	if (prog_cnt)
+		return kzalloc(sizeof(struct bpf_prog_array) +
+			       sizeof(struct bpf_prog *) * (prog_cnt + 1),
+			       flags);
+
+	return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+	if (!progs ||
+	    progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+		return;
+	kfree_rcu(progs, rcu);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
......
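A sketch of populating an array per the rules in the bpf.h comment above,
as a code fragment rather than a complete function; progA/progB are
hypothetical programs the caller already holds references on, and the
sparse __rcu annotation is cast away for brevity:

    struct bpf_prog_array *array;

    /* room for two programs plus the NULL terminator; kzalloc()
     * leaves progs[2] zeroed, which terminates BPF_PROG_RUN_ARRAY's loop
     */
    array = (struct bpf_prog_array *)bpf_prog_array_alloc(2, GFP_KERNEL);
    if (!array)
            return -ENOMEM;
    array->progs[0] = progA;	/* caller keeps a refcnt on each prog */
    array->progs[1] = progB;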
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1168,6 +1168,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
 	return 0;
 }
 
+#define BPF_F_ATTACH_MASK \
+	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
 static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;

@@ -1181,7 +1184,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_ATTACH))
 		return -EINVAL;
 
-	if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+	if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
 		return -EINVAL;
 
 	switch (attr->attach_type) {

@@ -1212,8 +1215,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		return PTR_ERR(cgrp);
 	}
 
-	ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
-				attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+				attr->attach_flags);
 	if (ret)
 		bpf_prog_put(prog);
 	cgroup_put(cgrp);

@@ -1225,6 +1228,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 static int bpf_prog_detach(const union bpf_attr *attr)
 {
+	enum bpf_prog_type ptype;
+	struct bpf_prog *prog;
 	struct cgroup *cgrp;
 	int ret;

@@ -1237,23 +1242,33 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+		ptype = BPF_PROG_TYPE_CGROUP_SKB;
+		break;
 	case BPF_CGROUP_INET_SOCK_CREATE:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+		break;
 	case BPF_CGROUP_SOCK_OPS:
-		cgrp = cgroup_get_from_fd(attr->target_fd);
-		if (IS_ERR(cgrp))
-			return PTR_ERR(cgrp);
-
-		ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
-		cgroup_put(cgrp);
+		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		ret = sockmap_get_from_fd(attr, false);
-		break;
+		return sockmap_get_from_fd(attr, false);
 	default:
 		return -EINVAL;
 	}
 
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		prog = NULL;
+
+	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+	if (prog)
+		bpf_prog_put(prog);
+	cgroup_put(cgrp);
 	return ret;
 }
......
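The user-space counterpart of the reworked detach path, as a hypothetical
sketch mirroring prog_attach() shown earlier (same includes assumed):
detach now passes attach_bpf_fd so the kernel can remove that specific
program from a BPF_F_ALLOW_MULTI list.

    static int prog_detach(int cgroup_fd, int prog_fd,
                           enum bpf_attach_type type)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.target_fd = cgroup_fd;
            attr.attach_bpf_fd = prog_fd;	/* identifies the program to drop */
            attr.attach_type = type;

            return syscall(__NR_bpf, BPF_PROG_DETACH, &attr, sizeof(attr));
    }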
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
 	if (ret)
 		goto destroy_root;
 
+	ret = cgroup_bpf_inherit(root_cgrp);
+	WARN_ON_ONCE(ret);
+
 	trace_cgroup_setup_root(root);
 
 	/*

@@ -4713,6 +4716,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	cgrp->self.parent = &parent->self;
 	cgrp->root = root;
 	cgrp->level = level;
+	ret = cgroup_bpf_inherit(cgrp);
+	if (ret)
+		goto out_idr_free;
 
 	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
 		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;

@@ -4747,13 +4753,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (!cgroup_on_dfl(cgrp))
 		cgrp->subtree_control = cgroup_control(cgrp);
 
-	if (parent)
-		cgroup_bpf_inherit(cgrp, parent);
-
 	cgroup_propagate_control(cgrp);
 
 	return cgrp;
 
+out_idr_free:
+	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:

@@ -5736,14 +5741,23 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 #endif /* CONFIG_SOCK_CGROUP_DATA */
 
 #ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
 {
-	struct cgroup *parent = cgroup_parent(cgrp);
 	int ret;
 
 	mutex_lock(&cgroup_mutex);
-	ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+	ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
......