Commit 681648e6 authored by Sowmini Varadhan, committed by David S. Miller

rds: tcp: correctly sequence cleanup on netns deletion.

Commit 8edc3aff ("rds: tcp: Take explicit refcounts on struct net")
introduces a regression in rds-tcp netns cleanup. cleanup_net() (and
thus the rds_tcp_dev_event notification) is only called from put_net()
when all netns refcounts go to 0, but this cannot happen if the
rds_connection itself is holding a c_net ref that it expects to
release in rds_tcp_kill_sock.

Instead, the rds_tcp_kill_sock callback should make sure to
tear down state carefully, ensuring that the socket teardown
is only done after all data-structures and workqs that depend
on it are quiesced.

The original motivation for commit 8edc3aff ("rds: tcp: Take explicit
refcounts on struct net") was to resolve a race condition reported by
syzkaller where workqs for tx/rx/connect were triggered after the
namespace was deleted. Those worker threads should have been
cancelled/flushed before socket tear-down and indeed,
rds_conn_path_destroy() does try to sequence this by doing
     /* cancel cp_send_w */
     /* cancel cp_recv_w */
     /* flush cp_down_w */
     /* free data structures */
Here the "flush cp_down_w" will trigger rds_conn_shutdown and thus
invoke rds_tcp_conn_path_shutdown() to close the tcp socket, so we
ought to have satisfied the requirement that "socket-close is
done after all other dependent state is quiesced". However,
rds_conn_shutdown has a bug in that it *always* triggers the reconnect
workq (and if the connection is successful, we always restart the tx/rx
workqs, so with the right timing we risk the race conditions reported
by syzkaller).

Netns deletion is like module teardown: there is no need to restart a
reconnect in this case. We can use the c_destroy_in_prog bit
to avoid restarting the reconnect.

Fixes: 8edc3aff ("rds: tcp: Take explicit refcounts on struct net")
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 2d746c93
...@@ -366,6 +366,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp) ...@@ -366,6 +366,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp)
* to the conn hash, so we never trigger a reconnect on this * to the conn hash, so we never trigger a reconnect on this
* conn - the reconnect is always triggered by the active peer. */ * conn - the reconnect is always triggered by the active peer. */
cancel_delayed_work_sync(&cp->cp_conn_w); cancel_delayed_work_sync(&cp->cp_conn_w);
if (conn->c_destroy_in_prog)
return;
rcu_read_lock(); rcu_read_lock();
if (!hlist_unhashed(&conn->c_hash_node)) { if (!hlist_unhashed(&conn->c_hash_node)) {
rcu_read_unlock(); rcu_read_unlock();
...@@ -445,7 +447,6 @@ void rds_conn_destroy(struct rds_connection *conn) ...@@ -445,7 +447,6 @@ void rds_conn_destroy(struct rds_connection *conn)
*/ */
rds_cong_remove_conn(conn); rds_cong_remove_conn(conn);
put_net(conn->c_net);
kfree(conn->c_path); kfree(conn->c_path);
kmem_cache_free(rds_conn_slab, conn); kmem_cache_free(rds_conn_slab, conn);
......
...@@ -150,7 +150,7 @@ struct rds_connection { ...@@ -150,7 +150,7 @@ struct rds_connection {
/* Protocol version */ /* Protocol version */
unsigned int c_version; unsigned int c_version;
struct net *c_net; possible_net_t c_net;
struct list_head c_map_item; struct list_head c_map_item;
unsigned long c_map_queued; unsigned long c_map_queued;
...@@ -165,13 +165,13 @@ struct rds_connection { ...@@ -165,13 +165,13 @@ struct rds_connection {
static inline static inline
struct net *rds_conn_net(struct rds_connection *conn) struct net *rds_conn_net(struct rds_connection *conn)
{ {
return conn->c_net; return read_pnet(&conn->c_net);
} }
static inline static inline
void rds_conn_net_set(struct rds_connection *conn, struct net *net) void rds_conn_net_set(struct rds_connection *conn, struct net *net)
{ {
conn->c_net = get_net(net); write_pnet(&conn->c_net, net);
} }
#define RDS_FLAG_CONG_BITMAP 0x01 #define RDS_FLAG_CONG_BITMAP 0x01
......
...@@ -506,7 +506,7 @@ static void rds_tcp_kill_sock(struct net *net) ...@@ -506,7 +506,7 @@ static void rds_tcp_kill_sock(struct net *net)
rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
spin_lock_irq(&rds_tcp_conn_lock); spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
struct net *c_net = tc->t_cpath->cp_conn->c_net; struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
if (net != c_net || !tc->t_sock) if (net != c_net || !tc->t_sock)
continue; continue;
...@@ -563,7 +563,7 @@ static void rds_tcp_sysctl_reset(struct net *net) ...@@ -563,7 +563,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
spin_lock_irq(&rds_tcp_conn_lock); spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
struct net *c_net = tc->t_cpath->cp_conn->c_net; struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
if (net != c_net || !tc->t_sock) if (net != c_net || !tc->t_sock)
continue; continue;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment