Commit 918fb1aa authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'stop-corrupting-socket-s-task_frag'

Benjamin Coddington says:

====================
Stop corrupting socket's task_frag

The networking code uses flags in sk_allocation to determine if it can use
current->task_frag, however in-kernel users of sockets may stop setting
sk_allocation when they convert to the preferred memalloc_nofs_save/restore,
as SUNRPC has done in commit a1231fda ("SUNRPC: Set memalloc_nofs_save()
on all rpciod/xprtiod jobs").

This will cause corruption in current->task_frag when recursing into the
network layer for those subsystems during page fault or reclaim.  The
corruption is difficult to diagnose because stack traces may not contain the
offending subsystem at all.  The corruption is unlikely to show up in
testing because it requires memory pressure, and so subsystems that
convert to memalloc_nofs_save/restore are likely to continue to run into
this issue.

Previous reports and proposed fixes:
https://lore.kernel.org/netdev/96a18bd00cbc6cb554603cc0d6ef1c551965b078.1663762494.git.gnault@redhat.com/
https://lore.kernel.org/netdev/b4d8cb09c913d3e34f853736f3f5628abfd7f4b6.1656699567.git.gnault@redhat.com/
https://lore.kernel.org/linux-nfs/de6d99321d1dcaa2ad456b92b3680aa77c07a747.1665401788.git.gnault@redhat.com/

Guillaume Nault has done all of the hard work tracking this problem down and
finding the best fix for this issue.  I'm just taking a turn posting another
fix.
====================

Link: https://lore.kernel.org/r/cover.1671194454.git.bcodding@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents b389a902 08f65892
...@@ -1030,6 +1030,9 @@ static int conn_connect(struct drbd_connection *connection) ...@@ -1030,6 +1030,9 @@ static int conn_connect(struct drbd_connection *connection)
sock.socket->sk->sk_allocation = GFP_NOIO; sock.socket->sk->sk_allocation = GFP_NOIO;
msock.socket->sk->sk_allocation = GFP_NOIO; msock.socket->sk->sk_allocation = GFP_NOIO;
sock.socket->sk->sk_use_task_frag = false;
msock.socket->sk->sk_use_task_frag = false;
sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
......
...@@ -512,6 +512,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, ...@@ -512,6 +512,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
noreclaim_flag = memalloc_noreclaim_save(); noreclaim_flag = memalloc_noreclaim_save();
do { do {
sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
sock->sk->sk_use_task_frag = false;
msg.msg_name = NULL; msg.msg_name = NULL;
msg.msg_namelen = 0; msg.msg_namelen = 0;
msg.msg_control = NULL; msg.msg_control = NULL;
......
...@@ -1537,6 +1537,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) ...@@ -1537,6 +1537,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
queue->sock->sk->sk_rcvtimeo = 10 * HZ; queue->sock->sk->sk_rcvtimeo = 10 * HZ;
queue->sock->sk->sk_allocation = GFP_ATOMIC; queue->sock->sk->sk_allocation = GFP_ATOMIC;
queue->sock->sk->sk_use_task_frag = false;
nvme_tcp_set_queue_io_cpu(queue); nvme_tcp_set_queue_io_cpu(queue);
queue->request = NULL; queue->request = NULL;
queue->data_remaining = 0; queue->data_remaining = 0;
......
...@@ -738,6 +738,7 @@ iscsi_sw_tcp_conn_bind(struct iscsi_cls_session *cls_session, ...@@ -738,6 +738,7 @@ iscsi_sw_tcp_conn_bind(struct iscsi_cls_session *cls_session,
sk->sk_reuse = SK_CAN_REUSE; sk->sk_reuse = SK_CAN_REUSE;
sk->sk_sndtimeo = 15 * HZ; /* FIXME: make it configurable */ sk->sk_sndtimeo = 15 * HZ; /* FIXME: make it configurable */
sk->sk_allocation = GFP_ATOMIC; sk->sk_allocation = GFP_ATOMIC;
sk->sk_use_task_frag = false;
sk_set_memalloc(sk); sk_set_memalloc(sk);
sock_no_linger(sk); sock_no_linger(sk);
......
...@@ -315,6 +315,7 @@ int usbip_recv(struct socket *sock, void *buf, int size) ...@@ -315,6 +315,7 @@ int usbip_recv(struct socket *sock, void *buf, int size)
do { do {
sock->sk->sk_allocation = GFP_NOIO; sock->sk->sk_allocation = GFP_NOIO;
sock->sk->sk_use_task_frag = false;
result = sock_recvmsg(sock, &msg, MSG_WAITALL); result = sock_recvmsg(sock, &msg, MSG_WAITALL);
if (result <= 0) if (result <= 0)
......
...@@ -2944,6 +2944,7 @@ generic_ip_connect(struct TCP_Server_Info *server) ...@@ -2944,6 +2944,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
cifs_dbg(FYI, "Socket created\n"); cifs_dbg(FYI, "Socket created\n");
server->ssocket = socket; server->ssocket = socket;
socket->sk->sk_allocation = GFP_NOFS; socket->sk->sk_allocation = GFP_NOFS;
socket->sk->sk_use_task_frag = false;
if (sfamily == AF_INET6) if (sfamily == AF_INET6)
cifs_reclassify_socket6(socket); cifs_reclassify_socket6(socket);
else else
......
...@@ -645,6 +645,7 @@ static void add_sock(struct socket *sock, struct connection *con) ...@@ -645,6 +645,7 @@ static void add_sock(struct socket *sock, struct connection *con)
if (dlm_config.ci_protocol == DLM_PROTO_SCTP) if (dlm_config.ci_protocol == DLM_PROTO_SCTP)
sk->sk_state_change = lowcomms_state_change; sk->sk_state_change = lowcomms_state_change;
sk->sk_allocation = GFP_NOFS; sk->sk_allocation = GFP_NOFS;
sk->sk_use_task_frag = false;
sk->sk_error_report = lowcomms_error_report; sk->sk_error_report = lowcomms_error_report;
release_sock(sk); release_sock(sk);
} }
...@@ -1769,6 +1770,7 @@ static int dlm_listen_for_all(void) ...@@ -1769,6 +1770,7 @@ static int dlm_listen_for_all(void)
listen_con.sock = sock; listen_con.sock = sock;
sock->sk->sk_allocation = GFP_NOFS; sock->sk->sk_allocation = GFP_NOFS;
sock->sk->sk_use_task_frag = false;
sock->sk->sk_data_ready = lowcomms_listen_data_ready; sock->sk->sk_data_ready = lowcomms_listen_data_ready;
release_sock(sock->sk); release_sock(sock->sk);
......
...@@ -1602,6 +1602,7 @@ static void o2net_start_connect(struct work_struct *work) ...@@ -1602,6 +1602,7 @@ static void o2net_start_connect(struct work_struct *work)
sc->sc_sock = sock; /* freed by sc_kref_release */ sc->sc_sock = sock; /* freed by sc_kref_release */
sock->sk->sk_allocation = GFP_ATOMIC; sock->sk->sk_allocation = GFP_ATOMIC;
sock->sk->sk_use_task_frag = false;
myaddr.sin_family = AF_INET; myaddr.sin_family = AF_INET;
myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
......
...@@ -318,6 +318,9 @@ struct sk_filter; ...@@ -318,6 +318,9 @@ struct sk_filter;
* @sk_stamp: time stamp of last packet received * @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags * @sk_tsflags: SO_TIMESTAMPING flags
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
* Sockets that can be used under memory reclaim should
* set this to false.
* @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
* for timestamping * for timestamping
* @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_tskey: counter to disambiguate concurrent tstamp requests
...@@ -512,6 +515,7 @@ struct sock { ...@@ -512,6 +515,7 @@ struct sock {
u8 sk_txtime_deadline_mode : 1, u8 sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1, sk_txtime_report_errors : 1,
sk_txtime_unused : 6; sk_txtime_unused : 6;
bool sk_use_task_frag;
struct socket *sk_socket; struct socket *sk_socket;
void *sk_user_data; void *sk_user_data;
...@@ -2560,16 +2564,14 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) ...@@ -2560,16 +2564,14 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
* Both direct reclaim and page faults can nest inside other * Both direct reclaim and page faults can nest inside other
* socket operations and end up recursing into sk_page_frag() * socket operations and end up recursing into sk_page_frag()
* while it's already in use: explicitly avoid task page_frag * while it's already in use: explicitly avoid task page_frag
* usage if the caller is potentially doing any of them. * when users disable sk_use_task_frag.
* This assumes that page fault handlers use the GFP_NOFS flags.
* *
* Return: a per task page_frag if context allows that, * Return: a per task page_frag if context allows that,
* otherwise a per socket one. * otherwise a per socket one.
*/ */
static inline struct page_frag *sk_page_frag(struct sock *sk) static inline struct page_frag *sk_page_frag(struct sock *sk)
{ {
if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) == if (sk->sk_use_task_frag)
(__GFP_DIRECT_RECLAIM | __GFP_FS))
return &current->task_frag; return &current->task_frag;
return &sk->sk_frag; return &sk->sk_frag;
......
...@@ -868,6 +868,7 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) ...@@ -868,6 +868,7 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket)
} }
csocket->sk->sk_allocation = GFP_NOIO; csocket->sk->sk_allocation = GFP_NOIO;
csocket->sk->sk_use_task_frag = false;
file = sock_alloc_file(csocket, 0, NULL); file = sock_alloc_file(csocket, 0, NULL);
if (IS_ERR(file)) { if (IS_ERR(file)) {
pr_err("%s (%d): failed to map fd\n", pr_err("%s (%d): failed to map fd\n",
......
...@@ -446,6 +446,7 @@ int ceph_tcp_connect(struct ceph_connection *con) ...@@ -446,6 +446,7 @@ int ceph_tcp_connect(struct ceph_connection *con)
if (ret) if (ret)
return ret; return ret;
sock->sk->sk_allocation = GFP_NOFS; sock->sk->sk_allocation = GFP_NOFS;
sock->sk->sk_use_task_frag = false;
#ifdef CONFIG_LOCKDEP #ifdef CONFIG_LOCKDEP
lockdep_set_class(&sock->sk->sk_lock, &socket_class); lockdep_set_class(&sock->sk->sk_lock, &socket_class);
......
...@@ -3390,6 +3390,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) ...@@ -3390,6 +3390,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
sk->sk_state = TCP_CLOSE; sk->sk_state = TCP_CLOSE;
sk->sk_use_task_frag = true;
sk_set_socket(sk, sock); sk_set_socket(sk, sock);
sock_set_flag(sk, SOCK_ZAPPED); sock_set_flag(sk, SOCK_ZAPPED);
......
...@@ -1882,6 +1882,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, ...@@ -1882,6 +1882,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
sk->sk_write_space = xs_udp_write_space; sk->sk_write_space = xs_udp_write_space;
sk->sk_state_change = xs_local_state_change; sk->sk_state_change = xs_local_state_change;
sk->sk_error_report = xs_error_report; sk->sk_error_report = xs_error_report;
sk->sk_use_task_frag = false;
xprt_clear_connected(xprt); xprt_clear_connected(xprt);
...@@ -2082,6 +2083,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) ...@@ -2082,6 +2083,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_user_data = xprt; sk->sk_user_data = xprt;
sk->sk_data_ready = xs_data_ready; sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space; sk->sk_write_space = xs_udp_write_space;
sk->sk_use_task_frag = false;
xprt_set_connected(xprt); xprt_set_connected(xprt);
...@@ -2249,6 +2251,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) ...@@ -2249,6 +2251,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_state_change = xs_tcp_state_change; sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space; sk->sk_write_space = xs_tcp_write_space;
sk->sk_error_report = xs_error_report; sk->sk_error_report = xs_error_report;
sk->sk_use_task_frag = false;
/* socket options */ /* socket options */
sock_reset_flag(sk, SOCK_LINGER); sock_reset_flag(sk, SOCK_LINGER);
......
...@@ -489,6 +489,7 @@ static int espintcp_init_sk(struct sock *sk) ...@@ -489,6 +489,7 @@ static int espintcp_init_sk(struct sock *sk)
/* avoid using task_frag */ /* avoid using task_frag */
sk->sk_allocation = GFP_ATOMIC; sk->sk_allocation = GFP_ATOMIC;
sk->sk_use_task_frag = false;
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment