Commit 918fb1aa authored by Jakub Kicinski's avatar Jakub Kicinski

Merge branch 'stop-corrupting-socket-s-task_frag'

Benjamin Coddington says:

====================
Stop corrupting socket's task_frag

The networking code uses flags in sk_allocation to determine if it can use
current->task_frag, however in-kernel users of sockets may stop setting
sk_allocation when they convert to the preferred memalloc_nofs_save/restore,
as SUNRPC has done in commit a1231fda ("SUNRPC: Set memalloc_nofs_save()
on all rpciod/xprtiod jobs").

This will cause corruption in current->task_frag when recursing into the
network layer for those subsystems during page fault or reclaim.  The
corruption is difficult to diagnose because stack traces may not contain the
offending subsystem at all.  The corruption is unlikely to show up in
testing because it requires memory pressure, and so subsystems that
convert to memalloc_nofs_save/restore are likely to continue to run into
this issue.

Previous reports and proposed fixes:
https://lore.kernel.org/netdev/96a18bd00cbc6cb554603cc0d6ef1c551965b078.1663762494.git.gnault@redhat.com/
https://lore.kernel.org/netdev/b4d8cb09c913d3e34f853736f3f5628abfd7f4b6.1656699567.git.gnault@redhat.com/
https://lore.kernel.org/linux-nfs/de6d99321d1dcaa2ad456b92b3680aa77c07a747.1665401788.git.gnault@redhat.com/

Guilluame Nault has done all of the hard work tracking this problem down and
finding the best fix for this issue.  I'm just taking a turn posting another
fix.
====================

Link: https://lore.kernel.org/r/cover.1671194454.git.bcodding@redhat.comSigned-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents b389a902 08f65892
......@@ -1030,6 +1030,9 @@ static int conn_connect(struct drbd_connection *connection)
sock.socket->sk->sk_allocation = GFP_NOIO;
msock.socket->sk->sk_allocation = GFP_NOIO;
sock.socket->sk->sk_use_task_frag = false;
msock.socket->sk->sk_use_task_frag = false;
sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
......
......@@ -512,6 +512,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
noreclaim_flag = memalloc_noreclaim_save();
do {
sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
sock->sk->sk_use_task_frag = false;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_control = NULL;
......
......@@ -1537,6 +1537,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
queue->sock->sk->sk_rcvtimeo = 10 * HZ;
queue->sock->sk->sk_allocation = GFP_ATOMIC;
queue->sock->sk->sk_use_task_frag = false;
nvme_tcp_set_queue_io_cpu(queue);
queue->request = NULL;
queue->data_remaining = 0;
......
......@@ -738,6 +738,7 @@ iscsi_sw_tcp_conn_bind(struct iscsi_cls_session *cls_session,
sk->sk_reuse = SK_CAN_REUSE;
sk->sk_sndtimeo = 15 * HZ; /* FIXME: make it configurable */
sk->sk_allocation = GFP_ATOMIC;
sk->sk_use_task_frag = false;
sk_set_memalloc(sk);
sock_no_linger(sk);
......
......@@ -315,6 +315,7 @@ int usbip_recv(struct socket *sock, void *buf, int size)
do {
sock->sk->sk_allocation = GFP_NOIO;
sock->sk->sk_use_task_frag = false;
result = sock_recvmsg(sock, &msg, MSG_WAITALL);
if (result <= 0)
......
......@@ -2944,6 +2944,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
cifs_dbg(FYI, "Socket created\n");
server->ssocket = socket;
socket->sk->sk_allocation = GFP_NOFS;
socket->sk->sk_use_task_frag = false;
if (sfamily == AF_INET6)
cifs_reclassify_socket6(socket);
else
......
......@@ -645,6 +645,7 @@ static void add_sock(struct socket *sock, struct connection *con)
if (dlm_config.ci_protocol == DLM_PROTO_SCTP)
sk->sk_state_change = lowcomms_state_change;
sk->sk_allocation = GFP_NOFS;
sk->sk_use_task_frag = false;
sk->sk_error_report = lowcomms_error_report;
release_sock(sk);
}
......@@ -1769,6 +1770,7 @@ static int dlm_listen_for_all(void)
listen_con.sock = sock;
sock->sk->sk_allocation = GFP_NOFS;
sock->sk->sk_use_task_frag = false;
sock->sk->sk_data_ready = lowcomms_listen_data_ready;
release_sock(sock->sk);
......
......@@ -1602,6 +1602,7 @@ static void o2net_start_connect(struct work_struct *work)
sc->sc_sock = sock; /* freed by sc_kref_release */
sock->sk->sk_allocation = GFP_ATOMIC;
sock->sk->sk_use_task_frag = false;
myaddr.sin_family = AF_INET;
myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
......
......@@ -318,6 +318,9 @@ struct sk_filter;
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
* Sockets that can be used under memory reclaim should
* set this to false.
* @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
* for timestamping
* @sk_tskey: counter to disambiguate concurrent tstamp requests
......@@ -512,6 +515,7 @@ struct sock {
u8 sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1,
sk_txtime_unused : 6;
bool sk_use_task_frag;
struct socket *sk_socket;
void *sk_user_data;
......@@ -2560,16 +2564,14 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
* Both direct reclaim and page faults can nest inside other
* socket operations and end up recursing into sk_page_frag()
* while it's already in use: explicitly avoid task page_frag
* usage if the caller is potentially doing any of them.
* This assumes that page fault handlers use the GFP_NOFS flags.
* when users disable sk_use_task_frag.
*
* Return: a per task page_frag if context allows that,
* otherwise a per socket one.
*/
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) ==
(__GFP_DIRECT_RECLAIM | __GFP_FS))
if (sk->sk_use_task_frag)
return &current->task_frag;
return &sk->sk_frag;
......
......@@ -868,6 +868,7 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket)
}
csocket->sk->sk_allocation = GFP_NOIO;
csocket->sk->sk_use_task_frag = false;
file = sock_alloc_file(csocket, 0, NULL);
if (IS_ERR(file)) {
pr_err("%s (%d): failed to map fd\n",
......
......@@ -446,6 +446,7 @@ int ceph_tcp_connect(struct ceph_connection *con)
if (ret)
return ret;
sock->sk->sk_allocation = GFP_NOFS;
sock->sk->sk_use_task_frag = false;
#ifdef CONFIG_LOCKDEP
lockdep_set_class(&sock->sk->sk_lock, &socket_class);
......
......@@ -3390,6 +3390,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
sk->sk_state = TCP_CLOSE;
sk->sk_use_task_frag = true;
sk_set_socket(sk, sock);
sock_set_flag(sk, SOCK_ZAPPED);
......
......@@ -1882,6 +1882,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
sk->sk_write_space = xs_udp_write_space;
sk->sk_state_change = xs_local_state_change;
sk->sk_error_report = xs_error_report;
sk->sk_use_task_frag = false;
xprt_clear_connected(xprt);
......@@ -2082,6 +2083,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
sk->sk_use_task_frag = false;
xprt_set_connected(xprt);
......@@ -2249,6 +2251,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
sk->sk_error_report = xs_error_report;
sk->sk_use_task_frag = false;
/* socket options */
sock_reset_flag(sk, SOCK_LINGER);
......
......@@ -489,6 +489,7 @@ static int espintcp_init_sk(struct sock *sk)
/* avoid using task_frag */
sk->sk_allocation = GFP_ATOMIC;
sk->sk_use_task_frag = false;
return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment