bpf: Add bpf_link_new_file that doesn't install FD

Add bpf_link_new_file() API for cases when we need to ensure anon_inode is successfully created before we proceed with expensive BPF program attachment procedure, which will require equally (if not more so) expensive and potentially failing compensation detachment procedure just because anon_inode creation failed. This API allows to simplify code by ensuring first that anon_inode is created and after BPF program is attached proceed with fd_install() that can't fail. After anon_inode file is created, link can't be just kfree()'d anymore, because its destruction will be performed by deferred file_operations->release call. For this, bpf_link API required specifying two separate operations: release() and dealloc(), former performing detachment only, while the latter frees memory used by bpf_link itself. dealloc() needs to be specified, because struct bpf_link is frequently embedded into link type-specific container struct (e.g., struct bpf_raw_tp_link), so bpf_link itself doesn't know how to properly free the memory. In case when anon_inode file was successfully created, but subsequent BPF attachment failed, bpf_link needs to be marked as "defunct", so that file's release() callback will perform only memory deallocation, but no detachment. Convert raw tracepoint and tracing attachment to new API and eliminate detachment from error handling path. Signed-off-by: Andrii Nakryiko <andriin@fb.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: John Fastabend <john.fastabend@gmail.com> Link: https://lore.kernel.org/bpf/20200309231051.1270337-1-andriin@fb.com

bpf: Add bpf_link_new_file that doesn't install FD
Add bpf_link_new_file() API for cases when we need to ensure anon_inode is successfully created before we proceed with expensive BPF program attachment procedure, which will require equally (if not more so) expensive and potentially failing compensation detachment procedure just because anon_inode creation failed. This API allows to simplify code by ensuring first that anon_inode is created and after BPF program is attached proceed with fd_install() that can't fail. After anon_inode file is created, link can't be just kfree()'d anymore, because its destruction will be performed by deferred file_operations->release call. For this, bpf_link API required specifying two separate operations: release() and dealloc(), former performing detachment only, while the latter frees memory used by bpf_link itself. dealloc() needs to be specified, because struct bpf_link is frequently embedded into link type-specific container struct (e.g., struct bpf_raw_tp_link), so bpf_link itself doesn't know how to properly free the memory. In case when anon_inode file was successfully created, but subsequent BPF attachment failed, bpf_link needs to be marked as "defunct", so that file's release() callback will perform only memory deallocation, but no detachment. Convert raw tracepoint and tracing attachment to new API and eliminate detachment from error handling path. Signed-off-by: Andrii Nakryiko <andriin@fb.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: John Fastabend <john.fastabend@gmail.com> Link: https://lore.kernel.org/bpf/20200309231051.1270337-1-andriin@fb.com
babf3164 · Andrii Nakryiko · Daniel Borkmann · aad32f4c · babf3164 · babf3164
Commit babf3164 authored Mar 09, 2020 by Andrii Nakryiko Committed by Daniel Borkmann Mar 11, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 91 additions and 34 deletions

include/linux/bpf.h include/linux/bpf.h +3 -0

kernel/bpf/syscall.c kernel/bpf/syscall.c +88 -34

No files found.
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1070,13 +1070,16 @@ struct bpf_link;

 struct bpf_link_ops {
 	void (*release)(struct bpf_link *link);
+	void (*dealloc)(struct bpf_link *link);
 };

 void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
 		   struct bpf_prog *prog);
+void bpf_link_defunct(struct bpf_link *link);
 void bpf_link_inc(struct bpf_link *link);
 void bpf_link_put(struct bpf_link *link);
 int bpf_link_new_fd(struct bpf_link *link);
+struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
 struct bpf_link *bpf_link_get_from_fd(u32 ufd);

 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);

--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2188,6 +2188,11 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
 	link->prog = prog;
 }

+void bpf_link_defunct(struct bpf_link *link)
+{
+	link->prog = NULL;
+}
+
 void bpf_link_inc(struct bpf_link *link)
 {
 	atomic64_inc(&link->refcnt);
@@ -2196,14 +2201,13 @@ void bpf_link_inc(struct bpf_link *link)
 /* bpf_link_free is guaranteed to be called from process context */
 static void bpf_link_free(struct bpf_link *link)
 {
-	struct bpf_prog *prog;
-
-	/* remember prog locally, because release below will free link memory */
-	prog = link->prog;
-	/* extra clean up and kfree of container link struct */
-	link->ops->release(link);
-	/* no more accesing of link members after this point */
-	bpf_prog_put(prog);
+	if (link->prog) {
+		/* detach BPF program, clean up used resources */
+		link->ops->release(link);
+		bpf_prog_put(link->prog);
+	}
+	/* free bpf_link and its containing memory */
+	link->ops->dealloc(link);
 }

 static void bpf_link_put_deferred(struct work_struct *work)
@@ -2281,6 +2285,33 @@ int bpf_link_new_fd(struct bpf_link *link)
 	return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
 }

+/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but
+ * instead of immediately installing fd in fdtable, just reserve it and
+ * return. Caller then need to either install it with fd_install(fd, file) or
+ * release with put_unused_fd(fd).
+ * This is useful for cases when bpf_link attachment/detachment are
+ * complicated and expensive operations and should be delayed until all the fd
+ * reservation and anon_inode creation succeeds.
+ */
+struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd)
+{
+	struct file *file;
+	int fd;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return ERR_PTR(fd);
+
+	file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return file;
+	}
+
+	*reserved_fd = fd;
+	return file;
+}
+
 struct bpf_link *bpf_link_get_from_fd(u32 ufd)
 {
 	struct fd f = fdget(ufd);
@@ -2305,21 +2336,27 @@ struct bpf_tracing_link {
 };

 static void bpf_tracing_link_release(struct bpf_link *link)
+{
+	WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog));
+}
+
+static void bpf_tracing_link_dealloc(struct bpf_link *link)
 {
 	struct bpf_tracing_link *tr_link =
 		container_of(link, struct bpf_tracing_link, link);

-	WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog));
 	kfree(tr_link);
 }

 static const struct bpf_link_ops bpf_tracing_link_lops = {
 	.release = bpf_tracing_link_release,
+	.dealloc = bpf_tracing_link_dealloc,
 };

 static int bpf_tracing_prog_attach(struct bpf_prog *prog)
 {
 	struct bpf_tracing_link *link;
+	struct file *link_file;
 	int link_fd, err;

 	if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
@@ -2337,20 +2374,24 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
 	}
 	bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);

-	err = bpf_trampoline_link_prog(prog);
-	if (err)
-		goto out_free_link;
+	link_file = bpf_link_new_file(&link->link, &link_fd);
+	if (IS_ERR(link_file)) {
+		kfree(link);
+		err = PTR_ERR(link_file);
+		goto out_put_prog;
+	}

-	link_fd = bpf_link_new_fd(&link->link);
-	if (link_fd < 0) {
-		WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
-		err = link_fd;
-		goto out_free_link;
+	err = bpf_trampoline_link_prog(prog);
+	if (err) {
+		bpf_link_defunct(&link->link);
+		fput(link_file);
+		put_unused_fd(link_fd);
+		goto out_put_prog;
 	}
+
+	fd_install(link_fd, link_file);
 	return link_fd;

-out_free_link:
-	kfree(link);
 out_put_prog:
 	bpf_prog_put(prog);
 	return err;
@@ -2368,19 +2409,28 @@ static void bpf_raw_tp_link_release(struct bpf_link *link)

 	bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
 	bpf_put_raw_tracepoint(raw_tp->btp);
+}
+
+static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_raw_tp_link *raw_tp =
+		container_of(link, struct bpf_raw_tp_link, link);
+
 	kfree(raw_tp);
 }

 static const struct bpf_link_ops bpf_raw_tp_lops = {
 	.release = bpf_raw_tp_link_release,
+	.dealloc = bpf_raw_tp_link_dealloc,
 };

 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd

 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 {
-	struct bpf_raw_tp_link *raw_tp;
+	struct bpf_raw_tp_link *link;
 	struct bpf_raw_event_map *btp;
+	struct file *link_file;
 	struct bpf_prog *prog;
 	const char *tp_name;
 	char buf[128];
@@ -2431,28 +2481,32 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 		goto out_put_prog;
 	}

-	raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
-	if (!raw_tp) {
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link) {
 		err = -ENOMEM;
 		goto out_put_btp;
 	}
-	bpf_link_init(&raw_tp->link, &bpf_raw_tp_lops, prog);
-	raw_tp->btp = btp;
+	bpf_link_init(&link->link, &bpf_raw_tp_lops, prog);
+	link->btp = btp;

-	err = bpf_probe_register(raw_tp->btp, prog);
-	if (err)
-		goto out_free_tp;
+	link_file = bpf_link_new_file(&link->link, &link_fd);
+	if (IS_ERR(link_file)) {
+		kfree(link);
+		err = PTR_ERR(link_file);
+		goto out_put_btp;
+	}

-	link_fd = bpf_link_new_fd(&raw_tp->link);
-	if (link_fd < 0) {
-		bpf_probe_unregister(raw_tp->btp, prog);
-		err = link_fd;
-		goto out_free_tp;
+	err = bpf_probe_register(link->btp, prog);
+	if (err) {
+		bpf_link_defunct(&link->link);
+		fput(link_file);
+		put_unused_fd(link_fd);
+		goto out_put_btp;
 	}
+
+	fd_install(link_fd, link_file);
 	return link_fd;

-out_free_tp:
-	kfree(raw_tp);
 out_put_btp:
 	bpf_put_raw_tracepoint(btp);
 out_put_prog: