Commit 368da430 authored by Linus Torvalds

Merge tag 'for-5.19/io_uring-socket-2022-05-22' of git://git.kernel.dk/linux-block

Pull io_uring socket() support from Jens Axboe:
 "This adds support for socket(2) for io_uring. This is handy when using
  direct / registered file descriptors with io_uring.

  Outside of those two patches, a small series from Dylan on top that
  improves the tracing by providing a text representation of the opcode
  rather than needing to decode this by reading the header file every
  time.

  That sits in this branch as it was the last opcode added (until it
  wasn't...)"

* tag 'for-5.19/io_uring-socket-2022-05-22' of git://git.kernel.dk/linux-block:
  io_uring: use the text representation of ops in trace
  io_uring: rename op -> opcode
  io_uring: add io_uring_get_opcode
  io_uring: add type to op enum
  io_uring: add socket(2) support
  net: add __sys_socket_file()
parents 09beaff7 033b87d2
......@@ -600,6 +600,16 @@ struct io_accept {
unsigned long nofile;
};
/*
 * Per-request state for IORING_OP_SOCKET. The fields below (other than
 * 'file') are filled in by io_socket_prep() and consumed by io_socket().
 */
struct io_socket {
struct file *file;
/* socket() arguments, read from sqe->fd, sqe->off and sqe->len respectively */
int domain;
int type;
int protocol;
/* the bits of 'type' that lie outside SOCK_TYPE_MASK */
int flags;
/* read from sqe->file_index; non-zero selects fixed-file slot (file_slot - 1) */
u32 file_slot;
/* RLIMIT_NOFILE as sampled at prep time; passed to __get_unused_fd_flags() */
unsigned long nofile;
};
struct io_sync {
struct file *file;
loff_t len;
......@@ -981,6 +991,7 @@ struct io_kiocb {
struct io_hardlink hardlink;
struct io_msg msg;
struct io_xattr xattr;
struct io_socket sock;
};
u8 opcode;
......@@ -1297,6 +1308,9 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1
},
[IORING_OP_GETXATTR] = {},
[IORING_OP_SOCKET] = {
.audit_skip = 1,
},
};
/* requests with any of those set should undergo io_disarm_next() */
......@@ -1341,6 +1355,107 @@ static struct kmem_cache *req_cachep;
static const struct file_operations io_uring_fops;
/*
 * io_uring_get_opcode - map an io_uring opcode to a printable name
 * @opcode: raw opcode value (an enum io_uring_op constant)
 *
 * Returns a constant string holding the opcode name without its
 * "IORING_OP_" prefix. IORING_OP_LAST, and any value that matches no case
 * label, yields "INVALID". The io_uring trace events call this from
 * TP_printk() to print the opcode as text.
 */
const char *io_uring_get_opcode(u8 opcode)
{
/* the cast to the enum type lets the switch be written against the enumerators */
switch ((enum io_uring_op)opcode) {
case IORING_OP_NOP:
return "NOP";
case IORING_OP_READV:
return "READV";
case IORING_OP_WRITEV:
return "WRITEV";
case IORING_OP_FSYNC:
return "FSYNC";
case IORING_OP_READ_FIXED:
return "READ_FIXED";
case IORING_OP_WRITE_FIXED:
return "WRITE_FIXED";
case IORING_OP_POLL_ADD:
return "POLL_ADD";
case IORING_OP_POLL_REMOVE:
return "POLL_REMOVE";
case IORING_OP_SYNC_FILE_RANGE:
return "SYNC_FILE_RANGE";
case IORING_OP_SENDMSG:
return "SENDMSG";
case IORING_OP_RECVMSG:
return "RECVMSG";
case IORING_OP_TIMEOUT:
return "TIMEOUT";
case IORING_OP_TIMEOUT_REMOVE:
return "TIMEOUT_REMOVE";
case IORING_OP_ACCEPT:
return "ACCEPT";
case IORING_OP_ASYNC_CANCEL:
return "ASYNC_CANCEL";
case IORING_OP_LINK_TIMEOUT:
return "LINK_TIMEOUT";
case IORING_OP_CONNECT:
return "CONNECT";
case IORING_OP_FALLOCATE:
return "FALLOCATE";
case IORING_OP_OPENAT:
return "OPENAT";
case IORING_OP_CLOSE:
return "CLOSE";
case IORING_OP_FILES_UPDATE:
return "FILES_UPDATE";
case IORING_OP_STATX:
return "STATX";
case IORING_OP_READ:
return "READ";
case IORING_OP_WRITE:
return "WRITE";
case IORING_OP_FADVISE:
return "FADVISE";
case IORING_OP_MADVISE:
return "MADVISE";
case IORING_OP_SEND:
return "SEND";
case IORING_OP_RECV:
return "RECV";
case IORING_OP_OPENAT2:
return "OPENAT2";
case IORING_OP_EPOLL_CTL:
return "EPOLL_CTL";
case IORING_OP_SPLICE:
return "SPLICE";
case IORING_OP_PROVIDE_BUFFERS:
return "PROVIDE_BUFFERS";
case IORING_OP_REMOVE_BUFFERS:
return "REMOVE_BUFFERS";
case IORING_OP_TEE:
return "TEE";
case IORING_OP_SHUTDOWN:
return "SHUTDOWN";
case IORING_OP_RENAMEAT:
return "RENAMEAT";
case IORING_OP_UNLINKAT:
return "UNLINKAT";
case IORING_OP_MKDIRAT:
return "MKDIRAT";
case IORING_OP_SYMLINKAT:
return "SYMLINKAT";
case IORING_OP_LINKAT:
return "LINKAT";
case IORING_OP_MSG_RING:
return "MSG_RING";
case IORING_OP_FSETXATTR:
return "FSETXATTR";
case IORING_OP_SETXATTR:
return "SETXATTR";
case IORING_OP_FGETXATTR:
return "FGETXATTR";
case IORING_OP_GETXATTR:
return "GETXATTR";
case IORING_OP_SOCKET:
return "SOCKET";
/* the end-of-enum marker is not a real operation */
case IORING_OP_LAST:
return "INVALID";
}
/* reached for values that match no case label above */
return "INVALID";
}
struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
......@@ -6237,6 +6352,62 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
/*
 * Prepare an IORING_OP_SOCKET request from its submission queue entry.
 *
 * Copies the socket() arguments out of the SQE (domain from ->fd, type from
 * ->off, protocol from ->len, fixed-file slot from ->file_index), samples
 * RLIMIT_NOFILE, and derives the flag bits from the type.
 *
 * Returns 0 on success, or -EINVAL when the ring uses IOPOLL, when any of
 * ->ioprio, ->addr, ->rw_flags or ->buf_index is non-zero, when a flag other
 * than SOCK_CLOEXEC/SOCK_NONBLOCK is set, or when SOCK_CLOEXEC is combined
 * with a file slot.
 */
static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_socket *sk = &req->sock;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	/* SQE fields this opcode does not consume must be left zero */
	if (sqe->ioprio || sqe->addr || sqe->rw_flags || sqe->buf_index)
		return -EINVAL;

	sk->domain = READ_ONCE(sqe->fd);
	sk->type = READ_ONCE(sqe->off);
	sk->protocol = READ_ONCE(sqe->len);
	sk->file_slot = READ_ONCE(sqe->file_index);
	sk->nofile = rlimit(RLIMIT_NOFILE);

	/* everything in the type word outside the type mask is a flag */
	sk->flags = sk->type & ~SOCK_TYPE_MASK;

	/* only these two flags may accompany the socket type */
	if (sk->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;

	/* SOCK_CLOEXEC is rejected when a file slot was requested */
	if ((sk->flags & SOCK_CLOEXEC) && sk->file_slot)
		return -EINVAL;

	return 0;
}
/*
 * Issue handler for IORING_OP_SOCKET: creates a socket file and either
 * installs it as a normal file descriptor or places it in a fixed-file slot.
 *
 * The request is completed with the new fd (non-fixed case), with the result
 * of io_install_fixed_file() (fixed case), or with a negative error.
 *
 * Returns 0 once the request has been completed. A negative value is returned
 * without completing the request when no fd could be reserved, or when socket
 * creation reported -EAGAIN during a nonblocking issue.
 */
static int io_socket(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_socket *sock = &req->sock;
/* a non-zero file_slot means the caller asked for a fixed-file slot */
bool fixed = !!sock->file_slot;
struct file *file;
int ret, fd;
/* 'fd' is only assigned and only read on the !fixed paths */
if (!fixed) {
/* reserve the descriptor first, using the limit sampled at prep time */
fd = __get_unused_fd_flags(sock->flags, sock->nofile);
if (unlikely(fd < 0))
return fd;
}
file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
if (IS_ERR(file)) {
/* give back the descriptor reserved above */
if (!fixed)
put_unused_fd(fd);
ret = PTR_ERR(file);
/* NOTE(review): presumably lets the caller retry from a blocking context - confirm */
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
return -EAGAIN;
/* -ERESTARTSYS is reported to the submitter as -EINTR */
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
} else if (!fixed) {
fd_install(fd, file);
ret = fd;
} else {
/* file_slot is 1-based here; the install helper is given slot - 1 */
ret = io_install_fixed_file(req, file, issue_flags,
sock->file_slot - 1);
}
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
static int io_connect_prep_async(struct io_kiocb *req)
{
struct io_async_connect *io = req->async_data;
......@@ -6322,6 +6493,7 @@ IO_NETOP_PREP_ASYNC(sendmsg);
IO_NETOP_PREP_ASYNC(recvmsg);
IO_NETOP_PREP_ASYNC(connect);
IO_NETOP_PREP(accept);
IO_NETOP_PREP(socket);
IO_NETOP_FN(send);
IO_NETOP_FN(recv);
#endif /* CONFIG_NET */
......@@ -7651,6 +7823,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_fgetxattr_prep(req, sqe);
case IORING_OP_GETXATTR:
return io_getxattr_prep(req, sqe);
case IORING_OP_SOCKET:
return io_socket_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
......@@ -7974,6 +8148,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
case IORING_OP_GETXATTR:
ret = io_getxattr(req, issue_flags);
break;
case IORING_OP_SOCKET:
ret = io_socket(req, issue_flags);
break;
default:
ret = -EINVAL;
break;
......
......@@ -10,6 +10,7 @@ struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void);
const char *io_uring_get_opcode(u8 opcode);
static inline void io_uring_files_cancel(void)
{
......@@ -42,6 +43,10 @@ static inline void io_uring_files_cancel(void)
/*
 * Stub variant of io_uring_free(): does nothing.
 * NOTE(review): presumably the branch used when io_uring is not built in;
 * the enclosing #if is outside this view - confirm.
 */
static inline void io_uring_free(struct task_struct *tsk)
{
}
/*
 * Stub variant of io_uring_get_opcode(): ignores @opcode and always returns
 * an empty string.
 * NOTE(review): presumably the branch used when io_uring is not built in;
 * the enclosing #if is outside this view - confirm.
 */
static inline const char *io_uring_get_opcode(u8 opcode)
{
return "";
}
#endif
#endif
......@@ -434,6 +434,7 @@ extern struct file *do_accept(struct file *file, unsigned file_flags,
extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_socket(int family, int type, int protocol);
extern struct file *__sys_socket_file(int family, int type, int protocol);
extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen);
extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr,
int addrlen, int file_flags);
......
......@@ -7,6 +7,7 @@
#include <linux/tracepoint.h>
#include <uapi/linux/io_uring.h>
#include <linux/io_uring.h>
struct io_wq_work;
......@@ -169,8 +170,9 @@ TRACE_EVENT(io_uring_queue_async_work,
__entry->rw = rw;
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, flags 0x%x, %s queue, work %p",
__entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
__entry->ctx, __entry->req, __entry->user_data,
io_uring_get_opcode(__entry->opcode),
__entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
);
......@@ -205,8 +207,9 @@ TRACE_EVENT(io_uring_defer,
__entry->opcode = opcode;
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d",
__entry->ctx, __entry->req, __entry->data, __entry->opcode)
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
__entry->ctx, __entry->req, __entry->data,
io_uring_get_opcode(__entry->opcode))
);
/**
......@@ -305,9 +308,9 @@ TRACE_EVENT(io_uring_fail_link,
__entry->link = link;
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, link %p",
__entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
__entry->link)
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
__entry->ctx, __entry->req, __entry->user_data,
io_uring_get_opcode(__entry->opcode), __entry->link)
);
/**
......@@ -389,9 +392,9 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->sq_thread = sq_thread;
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, flags 0x%x, "
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
"non block %d, sq_thread %d", __entry->ctx, __entry->req,
__entry->user_data, __entry->opcode,
__entry->user_data, io_uring_get_opcode(__entry->opcode),
__entry->flags, __entry->force_nonblock, __entry->sq_thread)
);
......@@ -433,8 +436,9 @@ TRACE_EVENT(io_uring_poll_arm,
__entry->events = events;
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask 0x%x, events 0x%x",
__entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
__entry->ctx, __entry->req, __entry->user_data,
io_uring_get_opcode(__entry->opcode),
__entry->mask, __entry->events)
);
......@@ -470,8 +474,9 @@ TRACE_EVENT(io_uring_task_add,
__entry->mask = mask;
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask %x",
__entry->ctx, __entry->req, __entry->user_data, __entry->opcode,
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
__entry->ctx, __entry->req, __entry->user_data,
io_uring_get_opcode(__entry->opcode),
__entry->mask)
);
......@@ -530,12 +535,13 @@ TRACE_EVENT(io_uring_req_failed,
),
TP_printk("ring %p, req %p, user_data 0x%llx, "
"op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, "
"opcode %s, flags 0x%x, prio=%d, off=%llu, addr=%llu, "
"len=%u, rw_flags=0x%x, buf_index=%d, "
"personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, "
"error=%d",
__entry->ctx, __entry->req, __entry->user_data,
__entry->opcode, __entry->flags, __entry->ioprio,
io_uring_get_opcode(__entry->opcode),
__entry->flags, __entry->ioprio,
(unsigned long long)__entry->off,
(unsigned long long) __entry->addr, __entry->len,
__entry->op_flags,
......
......@@ -128,7 +128,7 @@ enum {
*/
#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
enum {
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
IORING_OP_WRITEV,
......@@ -174,6 +174,7 @@ enum {
IORING_OP_SETXATTR,
IORING_OP_FGETXATTR,
IORING_OP_GETXATTR,
IORING_OP_SOCKET,
/* this goes last, obviously */
IORING_OP_LAST,
......
......@@ -504,7 +504,7 @@ static int sock_map_fd(struct socket *sock, int flags)
struct socket *sock_from_file(struct file *file)
{
if (file->f_op == &socket_file_ops)
return file->private_data; /* set in sock_map_fd */
return file->private_data; /* set in sock_alloc_file */
return NULL;
}
......@@ -1538,11 +1538,10 @@ int sock_create_kern(struct net *net, int family, int type, int protocol, struct
}
EXPORT_SYMBOL(sock_create_kern);
int __sys_socket(int family, int type, int protocol)
static struct socket *__sys_socket_create(int family, int type, int protocol)
{
int retval;
struct socket *sock;
int flags;
int retval;
/* Check the SOCK_* constants for consistency. */
BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
......@@ -1550,17 +1549,50 @@ int __sys_socket(int family, int type, int protocol)
BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
flags = type & ~SOCK_TYPE_MASK;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
if ((type & ~SOCK_TYPE_MASK) & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return ERR_PTR(-EINVAL);
type &= SOCK_TYPE_MASK;
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
return ERR_PTR(retval);
return sock;
}
struct file *__sys_socket_file(int family, int type, int protocol)
{
struct socket *sock;
struct file *file;
int flags;
sock = __sys_socket_create(family, type, protocol);
if (IS_ERR(sock))
return ERR_CAST(sock);
flags = type & ~SOCK_TYPE_MASK;
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
return retval;
file = sock_alloc_file(sock, flags, NULL);
if (IS_ERR(file))
sock_release(sock);
return file;
}
/*
 * Create a socket and map it to a new file descriptor.
 *
 * @family, @type and @protocol are handed to __sys_socket_create(); @type may
 * carry SOCK_CLOEXEC / SOCK_NONBLOCK in the bits outside SOCK_TYPE_MASK.
 *
 * Returns the value of sock_map_fd() on success, or the negative error
 * encoded in the pointer returned by __sys_socket_create().
 */
int __sys_socket(int family, int type, int protocol)
{
	struct socket *new_sock = __sys_socket_create(family, type, protocol);
	int fd_flags;

	if (IS_ERR(new_sock))
		return PTR_ERR(new_sock);

	fd_flags = type & ~SOCK_TYPE_MASK;

	/* translate SOCK_NONBLOCK to O_NONBLOCK where the two values differ */
	if (SOCK_NONBLOCK != O_NONBLOCK && (fd_flags & SOCK_NONBLOCK)) {
		fd_flags &= ~SOCK_NONBLOCK;
		fd_flags |= O_NONBLOCK;
	}

	/* only the close-on-exec and nonblocking bits reach sock_map_fd() */
	fd_flags &= O_CLOEXEC | O_NONBLOCK;

	return sock_map_fd(new_sock, fd_flags);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment