Commit c68d7f1b authored by David S. Miller

Merge branch 'tcp_get_info-locking'

Eric Dumazet says:

====================
tcp: tcp_get_info() locking changes

This short series prepares tcp_get_info() for reporting more detailed information.

In order not to slow down the fast path, our goal is to use the normal
socket spinlock instead of custom synchronization.

All we need to ensure is that tcp_get_info() is not called with the
ehash lock held, which might deadlock, since packet processing would
acquire the spinlocks in the reverse order.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 721ad321 67db3e4b
...@@ -176,8 +176,6 @@ struct tcp_sock { ...@@ -176,8 +176,6 @@ struct tcp_sock {
* sum(delta(snd_una)), or how many bytes * sum(delta(snd_una)), or how many bytes
* were acked. * were acked.
*/ */
struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */
u32 snd_una; /* First byte we want an ack for */ u32 snd_una; /* First byte we want an ack for */
u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */
u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */
......
...@@ -861,10 +861,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, ...@@ -861,10 +861,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc) const struct inet_diag_req_v2 *r, struct nlattr *bc)
{ {
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);
int i, num, s_i, s_num;
u32 idiag_states = r->idiag_states; u32 idiag_states = r->idiag_states;
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); int i, num, s_i, s_num;
struct sock *sk;
if (idiag_states & TCPF_SYN_RECV) if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV; idiag_states |= TCPF_NEW_SYN_RECV;
...@@ -877,7 +878,6 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, ...@@ -877,7 +878,6 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
for (i = s_i; i < INET_LHTABLE_SIZE; i++) { for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct inet_listen_hashbucket *ilb; struct inet_listen_hashbucket *ilb;
struct sock *sk;
num = 0; num = 0;
ilb = &hashinfo->listening_hash[i]; ilb = &hashinfo->listening_hash[i];
...@@ -922,13 +922,14 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, ...@@ -922,13 +922,14 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
if (!(idiag_states & ~TCPF_LISTEN)) if (!(idiag_states & ~TCPF_LISTEN))
goto out; goto out;
#define SKARR_SZ 16
for (i = s_i; i <= hashinfo->ehash_mask; i++) { for (i = s_i; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i]; struct inet_ehash_bucket *head = &hashinfo->ehash[i];
spinlock_t *lock = inet_ehash_lockp(hashinfo, i); spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
struct hlist_nulls_node *node; struct hlist_nulls_node *node;
struct sock *sk; struct sock *sk_arr[SKARR_SZ];
int num_arr[SKARR_SZ];
num = 0; int idx, accum, res;
if (hlist_nulls_empty(&head->chain)) if (hlist_nulls_empty(&head->chain))
continue; continue;
...@@ -936,9 +937,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, ...@@ -936,9 +937,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
if (i > s_i) if (i > s_i)
s_num = 0; s_num = 0;
next_chunk:
num = 0;
accum = 0;
spin_lock_bh(lock); spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) { sk_nulls_for_each(sk, node, &head->chain) {
int state, res; int state;
if (!net_eq(sock_net(sk), net)) if (!net_eq(sock_net(sk), net))
continue; continue;
...@@ -962,21 +966,35 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, ...@@ -962,21 +966,35 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
if (!inet_diag_bc_sk(bc, sk)) if (!inet_diag_bc_sk(bc, sk))
goto next_normal; goto next_normal;
res = sk_diag_fill(sk, skb, r, sock_hold(sk);
num_arr[accum] = num;
sk_arr[accum] = sk;
if (++accum == SKARR_SZ)
break;
next_normal:
++num;
}
spin_unlock_bh(lock);
res = 0;
for (idx = 0; idx < accum; idx++) {
if (res >= 0) {
res = sk_diag_fill(sk_arr[idx], skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk), sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh->nlmsg_seq, NLM_F_MULTI,
cb->nlh, net_admin); cb->nlh, net_admin);
if (res < 0) { if (res < 0)
spin_unlock_bh(lock); num = num_arr[idx];
goto done;
} }
next_normal: sock_gen_put(sk_arr[idx]);
++num;
} }
if (res < 0)
spin_unlock_bh(lock); break;
cond_resched(); cond_resched();
if (accum == SKARR_SZ) {
s_num = num + 1;
goto next_chunk;
}
} }
done: done:
......
...@@ -405,7 +405,6 @@ void tcp_init_sock(struct sock *sk) ...@@ -405,7 +405,6 @@ void tcp_init_sock(struct sock *sk)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_clamp = ~0; tp->snd_cwnd_clamp = ~0;
tp->mss_cache = TCP_MSS_DEFAULT; tp->mss_cache = TCP_MSS_DEFAULT;
u64_stats_init(&tp->syncp);
tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
tcp_enable_early_retrans(tp); tcp_enable_early_retrans(tp);
...@@ -2710,9 +2709,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) ...@@ -2710,9 +2709,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp, intv; u32 now = tcp_time_stamp, intv;
unsigned int start;
int notsent_bytes;
u64 rate64; u64 rate64;
bool slow;
u32 rate; u32 rate;
memset(info, 0, sizeof(*info)); memset(info, 0, sizeof(*info));
...@@ -2721,6 +2719,27 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) ...@@ -2721,6 +2719,27 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_state = sk_state_load(sk); info->tcpi_state = sk_state_load(sk);
/* Report meaningful fields for all TCP states, including listeners */
rate = READ_ONCE(sk->sk_pacing_rate);
rate64 = rate != ~0U ? rate : ~0ULL;
put_unaligned(rate64, &info->tcpi_pacing_rate);
rate = READ_ONCE(sk->sk_max_pacing_rate);
rate64 = rate != ~0U ? rate : ~0ULL;
put_unaligned(rate64, &info->tcpi_max_pacing_rate);
info->tcpi_reordering = tp->reordering;
info->tcpi_snd_cwnd = tp->snd_cwnd;
if (info->tcpi_state == TCP_LISTEN) {
/* listeners aliased fields :
* tcpi_unacked -> Number of children ready for accept()
* tcpi_sacked -> max backlog
*/
info->tcpi_unacked = sk->sk_ack_backlog;
info->tcpi_sacked = sk->sk_max_ack_backlog;
return;
}
info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_ca_state = icsk->icsk_ca_state;
info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_probes = icsk->icsk_probes_out;
...@@ -2748,13 +2767,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) ...@@ -2748,13 +2767,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_snd_mss = tp->mss_cache; info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
if (info->tcpi_state == TCP_LISTEN) { info->tcpi_unacked = tp->packets_out;
info->tcpi_unacked = sk->sk_ack_backlog; info->tcpi_sacked = tp->sacked_out;
info->tcpi_sacked = sk->sk_max_ack_backlog;
} else {
info->tcpi_unacked = tp->packets_out;
info->tcpi_sacked = tp->sacked_out;
}
info->tcpi_lost = tp->lost_out; info->tcpi_lost = tp->lost_out;
info->tcpi_retrans = tp->retrans_out; info->tcpi_retrans = tp->retrans_out;
info->tcpi_fackets = tp->fackets_out; info->tcpi_fackets = tp->fackets_out;
...@@ -2768,34 +2783,24 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) ...@@ -2768,34 +2783,24 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rtt = tp->srtt_us >> 3; info->tcpi_rtt = tp->srtt_us >> 3;
info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_rttvar = tp->mdev_us >> 2;
info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_snd_ssthresh = tp->snd_ssthresh;
info->tcpi_snd_cwnd = tp->snd_cwnd;
info->tcpi_advmss = tp->advmss; info->tcpi_advmss = tp->advmss;
info->tcpi_reordering = tp->reordering;
info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_rcv_space = tp->rcvq_space.space;
info->tcpi_total_retrans = tp->total_retrans; info->tcpi_total_retrans = tp->total_retrans;
rate = READ_ONCE(sk->sk_pacing_rate); slow = lock_sock_fast(sk);
rate64 = rate != ~0U ? rate : ~0ULL;
put_unaligned(rate64, &info->tcpi_pacing_rate);
rate = READ_ONCE(sk->sk_max_pacing_rate); put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
rate64 = rate != ~0U ? rate : ~0ULL; put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
put_unaligned(rate64, &info->tcpi_max_pacing_rate); info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
unlock_sock_fast(sk, slow);
do {
start = u64_stats_fetch_begin_irq(&tp->syncp);
put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_out = tp->segs_out;
info->tcpi_segs_in = tp->segs_in; info->tcpi_segs_in = tp->segs_in;
notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
info->tcpi_notsent_bytes = max(0, notsent_bytes);
info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_min_rtt = tcp_min_rtt(tp);
info->tcpi_data_segs_in = tp->data_segs_in; info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_data_segs_out = tp->data_segs_out;
......
...@@ -3351,9 +3351,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) ...@@ -3351,9 +3351,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
u32 delta = ack - tp->snd_una; u32 delta = ack - tp->snd_una;
sock_owned_by_me((struct sock *)tp); sock_owned_by_me((struct sock *)tp);
u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_acked += delta; tp->bytes_acked += delta;
u64_stats_update_end_raw(&tp->syncp);
tp->snd_una = ack; tp->snd_una = ack;
} }
...@@ -3363,9 +3361,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) ...@@ -3363,9 +3361,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
u32 delta = seq - tp->rcv_nxt; u32 delta = seq - tp->rcv_nxt;
sock_owned_by_me((struct sock *)tp); sock_owned_by_me((struct sock *)tp);
u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_received += delta; tp->bytes_received += delta;
u64_stats_update_end_raw(&tp->syncp);
tp->rcv_nxt = seq; tp->rcv_nxt = seq;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment