Commit b6c6712a authored by Eric Dumazet, committed by David S. Miller

net: sk_dst_cache RCUification

With the latest CONFIG_PROVE_RCU work in place, I felt more comfortable
making this change.

sk->sk_dst_cache is currently protected by a rwlock (sk_dst_lock).

This rwlock is read-locked for a very short time, and dst entries are
already freed after an RCU grace period. This calls for RCU again :)

This patch converts sk_dst_lock to a spinlock, and uses RCU for readers.

__sk_dst_get() is supposed to be called either under rcu_read_lock() or
with the socket locked by the user, so it uses the appropriate
rcu_dereference_check() condition
(rcu_read_lock_held() || sock_owned_by_user(sk)).

This patch avoids two atomic ops per tx packet on connected UDP sockets,
for example, and lets sk_dst_lock be dirtied much less often.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 7a161ea9
...@@ -225,21 +225,6 @@ static inline void dst_confirm(struct dst_entry *dst) ...@@ -225,21 +225,6 @@ static inline void dst_confirm(struct dst_entry *dst)
neigh_confirm(dst->neighbour); neigh_confirm(dst->neighbour);
} }
static inline void dst_negative_advice(struct dst_entry **dst_p,
struct sock *sk)
{
struct dst_entry * dst = *dst_p;
if (dst && dst->ops->negative_advice) {
*dst_p = dst->ops->negative_advice(dst);
if (dst != *dst_p) {
extern void sk_reset_txq(struct sock *sk);
sk_reset_txq(sk);
}
}
}
static inline void dst_link_failure(struct sk_buff *skb) static inline void dst_link_failure(struct sk_buff *skb)
{ {
struct dst_entry *dst = skb_dst(skb); struct dst_entry *dst = skb_dst(skb);
......
...@@ -152,9 +152,9 @@ static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst, ...@@ -152,9 +152,9 @@ static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst,
static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
struct in6_addr *daddr, struct in6_addr *saddr) struct in6_addr *daddr, struct in6_addr *saddr)
{ {
write_lock(&sk->sk_dst_lock); spin_lock(&sk->sk_dst_lock);
__ip6_dst_store(sk, dst, daddr, saddr); __ip6_dst_store(sk, dst, daddr, saddr);
write_unlock(&sk->sk_dst_lock); spin_unlock(&sk->sk_dst_lock);
} }
static inline int ipv6_unicast_destination(struct sk_buff *skb) static inline int ipv6_unicast_destination(struct sk_buff *skb)
......
...@@ -262,7 +262,7 @@ struct sock { ...@@ -262,7 +262,7 @@ struct sock {
#ifdef CONFIG_XFRM #ifdef CONFIG_XFRM
struct xfrm_policy *sk_policy[2]; struct xfrm_policy *sk_policy[2];
#endif #endif
rwlock_t sk_dst_lock; spinlock_t sk_dst_lock;
atomic_t sk_rmem_alloc; atomic_t sk_rmem_alloc;
atomic_t sk_wmem_alloc; atomic_t sk_wmem_alloc;
atomic_t sk_omem_alloc; atomic_t sk_omem_alloc;
...@@ -1192,7 +1192,8 @@ extern unsigned long sock_i_ino(struct sock *sk); ...@@ -1192,7 +1192,8 @@ extern unsigned long sock_i_ino(struct sock *sk);
static inline struct dst_entry * static inline struct dst_entry *
__sk_dst_get(struct sock *sk) __sk_dst_get(struct sock *sk)
{ {
return sk->sk_dst_cache; return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() ||
sock_owned_by_user(sk));
} }
static inline struct dst_entry * static inline struct dst_entry *
...@@ -1200,50 +1201,62 @@ sk_dst_get(struct sock *sk) ...@@ -1200,50 +1201,62 @@ sk_dst_get(struct sock *sk)
{ {
struct dst_entry *dst; struct dst_entry *dst;
read_lock(&sk->sk_dst_lock); rcu_read_lock();
dst = sk->sk_dst_cache; dst = rcu_dereference(sk->sk_dst_cache);
if (dst) if (dst)
dst_hold(dst); dst_hold(dst);
read_unlock(&sk->sk_dst_lock); rcu_read_unlock();
return dst; return dst;
} }
extern void sk_reset_txq(struct sock *sk);
static inline void dst_negative_advice(struct sock *sk)
{
struct dst_entry *ndst, *dst = __sk_dst_get(sk);
if (dst && dst->ops->negative_advice) {
ndst = dst->ops->negative_advice(dst);
if (ndst != dst) {
rcu_assign_pointer(sk->sk_dst_cache, ndst);
sk_reset_txq(sk);
}
}
}
static inline void static inline void
__sk_dst_set(struct sock *sk, struct dst_entry *dst) __sk_dst_set(struct sock *sk, struct dst_entry *dst)
{ {
struct dst_entry *old_dst; struct dst_entry *old_dst;
sk_tx_queue_clear(sk); sk_tx_queue_clear(sk);
old_dst = sk->sk_dst_cache; old_dst = rcu_dereference_check(sk->sk_dst_cache,
sk->sk_dst_cache = dst; lockdep_is_held(&sk->sk_dst_lock));
rcu_assign_pointer(sk->sk_dst_cache, dst);
dst_release(old_dst); dst_release(old_dst);
} }
static inline void static inline void
sk_dst_set(struct sock *sk, struct dst_entry *dst) sk_dst_set(struct sock *sk, struct dst_entry *dst)
{ {
write_lock(&sk->sk_dst_lock); spin_lock(&sk->sk_dst_lock);
__sk_dst_set(sk, dst); __sk_dst_set(sk, dst);
write_unlock(&sk->sk_dst_lock); spin_unlock(&sk->sk_dst_lock);
} }
static inline void static inline void
__sk_dst_reset(struct sock *sk) __sk_dst_reset(struct sock *sk)
{ {
struct dst_entry *old_dst; __sk_dst_set(sk, NULL);
sk_tx_queue_clear(sk);
old_dst = sk->sk_dst_cache;
sk->sk_dst_cache = NULL;
dst_release(old_dst);
} }
static inline void static inline void
sk_dst_reset(struct sock *sk) sk_dst_reset(struct sock *sk)
{ {
write_lock(&sk->sk_dst_lock); spin_lock(&sk->sk_dst_lock);
__sk_dst_reset(sk); __sk_dst_reset(sk);
write_unlock(&sk->sk_dst_lock); spin_unlock(&sk->sk_dst_lock);
} }
extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
......
...@@ -2015,7 +2015,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, ...@@ -2015,7 +2015,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
if (dev->real_num_tx_queues > 1) if (dev->real_num_tx_queues > 1)
queue_index = skb_tx_hash(dev, skb); queue_index = skb_tx_hash(dev, skb);
if (sk && sk->sk_dst_cache) if (sk && rcu_dereference_check(sk->sk_dst_cache, 1))
sk_tx_queue_set(sk, queue_index); sk_tx_queue_set(sk, queue_index);
} }
} }
......
...@@ -364,11 +364,11 @@ EXPORT_SYMBOL(sk_reset_txq); ...@@ -364,11 +364,11 @@ EXPORT_SYMBOL(sk_reset_txq);
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{ {
struct dst_entry *dst = sk->sk_dst_cache; struct dst_entry *dst = __sk_dst_get(sk);
if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
sk_tx_queue_clear(sk); sk_tx_queue_clear(sk);
sk->sk_dst_cache = NULL; rcu_assign_pointer(sk->sk_dst_cache, NULL);
dst_release(dst); dst_release(dst);
return NULL; return NULL;
} }
...@@ -1157,7 +1157,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) ...@@ -1157,7 +1157,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
skb_queue_head_init(&newsk->sk_async_wait_queue); skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif #endif
rwlock_init(&newsk->sk_dst_lock); spin_lock_init(&newsk->sk_dst_lock);
rwlock_init(&newsk->sk_callback_lock); rwlock_init(&newsk->sk_callback_lock);
lockdep_set_class_and_name(&newsk->sk_callback_lock, lockdep_set_class_and_name(&newsk->sk_callback_lock,
af_callback_keys + newsk->sk_family, af_callback_keys + newsk->sk_family,
...@@ -1898,7 +1898,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) ...@@ -1898,7 +1898,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
} else } else
sk->sk_sleep = NULL; sk->sk_sleep = NULL;
rwlock_init(&sk->sk_dst_lock); spin_lock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock); rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock, lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family, af_callback_keys + sk->sk_family,
......
...@@ -38,7 +38,7 @@ static int dccp_write_timeout(struct sock *sk) ...@@ -38,7 +38,7 @@ static int dccp_write_timeout(struct sock *sk)
if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
if (icsk->icsk_retransmits != 0) if (icsk->icsk_retransmits != 0)
dst_negative_advice(&sk->sk_dst_cache, sk); dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ? retry_until = icsk->icsk_syn_retries ?
: sysctl_dccp_request_retries; : sysctl_dccp_request_retries;
} else { } else {
...@@ -63,7 +63,7 @@ static int dccp_write_timeout(struct sock *sk) ...@@ -63,7 +63,7 @@ static int dccp_write_timeout(struct sock *sk)
Golden words :-). Golden words :-).
*/ */
dst_negative_advice(&sk->sk_dst_cache, sk); dst_negative_advice(sk);
} }
retry_until = sysctl_dccp_retries2; retry_until = sysctl_dccp_retries2;
......
...@@ -446,7 +446,7 @@ static void dn_destruct(struct sock *sk) ...@@ -446,7 +446,7 @@ static void dn_destruct(struct sock *sk)
skb_queue_purge(&scp->other_xmit_queue); skb_queue_purge(&scp->other_xmit_queue);
skb_queue_purge(&scp->other_receive_queue); skb_queue_purge(&scp->other_receive_queue);
dst_release(xchg(&sk->sk_dst_cache, NULL)); dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
} }
static int dn_memory_pressure; static int dn_memory_pressure;
...@@ -1105,7 +1105,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags) ...@@ -1105,7 +1105,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
release_sock(sk); release_sock(sk);
dst = skb_dst(skb); dst = skb_dst(skb);
dst_release(xchg(&newsk->sk_dst_cache, dst)); sk_dst_set(newsk, dst);
skb_dst_set(skb, NULL); skb_dst_set(skb, NULL);
DN_SK(newsk)->state = DN_CR; DN_SK(newsk)->state = DN_CR;
...@@ -1956,7 +1956,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, ...@@ -1956,7 +1956,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
} }
if ((flags & MSG_TRYHARD) && sk->sk_dst_cache) if ((flags & MSG_TRYHARD) && sk->sk_dst_cache)
dst_negative_advice(&sk->sk_dst_cache, sk); dst_negative_advice(sk);
mss = scp->segsize_rem; mss = scp->segsize_rem;
fctype = scp->services_rem & NSP_FC_MASK; fctype = scp->services_rem & NSP_FC_MASK;
......
...@@ -154,7 +154,7 @@ void inet_sock_destruct(struct sock *sk) ...@@ -154,7 +154,7 @@ void inet_sock_destruct(struct sock *sk)
WARN_ON(sk->sk_forward_alloc); WARN_ON(sk->sk_forward_alloc);
kfree(inet->opt); kfree(inet->opt);
dst_release(sk->sk_dst_cache); dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
sk_refcnt_debug_dec(sk); sk_refcnt_debug_dec(sk);
} }
EXPORT_SYMBOL(inet_sock_destruct); EXPORT_SYMBOL(inet_sock_destruct);
......
...@@ -3710,7 +3710,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) ...@@ -3710,7 +3710,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
} }
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
dst_confirm(sk->sk_dst_cache); dst_confirm(__sk_dst_get(sk));
return 1; return 1;
...@@ -5833,7 +5833,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, ...@@ -5833,7 +5833,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->snd_una == tp->write_seq) { if (tp->snd_una == tp->write_seq) {
tcp_set_state(sk, TCP_FIN_WAIT2); tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_shutdown |= SEND_SHUTDOWN;
dst_confirm(sk->sk_dst_cache); dst_confirm(__sk_dst_get(sk));
if (!sock_flag(sk, SOCK_DEAD)) if (!sock_flag(sk, SOCK_DEAD))
/* Wake up lingering close() */ /* Wake up lingering close() */
......
...@@ -172,14 +172,14 @@ static int tcp_write_timeout(struct sock *sk) ...@@ -172,14 +172,14 @@ static int tcp_write_timeout(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits) if (icsk->icsk_retransmits)
dst_negative_advice(&sk->sk_dst_cache, sk); dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
} else { } else {
if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
/* Black hole detection */ /* Black hole detection */
tcp_mtu_probing(icsk, sk); tcp_mtu_probing(icsk, sk);
dst_negative_advice(&sk->sk_dst_cache, sk); dst_negative_advice(sk);
} }
retry_until = sysctl_tcp_retries2; retry_until = sysctl_tcp_retries2;
......
...@@ -114,9 +114,9 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, ...@@ -114,9 +114,9 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
} }
opt = xchg(&inet6_sk(sk)->opt, opt); opt = xchg(&inet6_sk(sk)->opt, opt);
} else { } else {
write_lock(&sk->sk_dst_lock); spin_lock(&sk->sk_dst_lock);
opt = xchg(&inet6_sk(sk)->opt, opt); opt = xchg(&inet6_sk(sk)->opt, opt);
write_unlock(&sk->sk_dst_lock); spin_unlock(&sk->sk_dst_lock);
} }
sk_dst_reset(sk); sk_dst_reset(sk);
...@@ -971,14 +971,13 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, ...@@ -971,14 +971,13 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
case IPV6_MTU: case IPV6_MTU:
{ {
struct dst_entry *dst; struct dst_entry *dst;
val = 0; val = 0;
lock_sock(sk); rcu_read_lock();
dst = sk_dst_get(sk); dst = __sk_dst_get(sk);
if (dst) { if (dst)
val = dst_mtu(dst); val = dst_mtu(dst);
dst_release(dst); rcu_read_unlock();
}
release_sock(sk);
if (!val) if (!val)
return -ENOTCONN; return -ENOTCONN;
break; break;
...@@ -1066,12 +1065,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, ...@@ -1066,12 +1065,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
else else
val = np->mcast_hops; val = np->mcast_hops;
dst = sk_dst_get(sk); if (val < 0) {
if (dst) { rcu_read_lock();
if (val < 0) dst = __sk_dst_get(sk);
if (dst)
val = ip6_dst_hoplimit(dst); val = ip6_dst_hoplimit(dst);
dst_release(dst); rcu_read_unlock();
} }
if (val < 0) if (val < 0)
val = sock_net(sk)->ipv6.devconf_all->hop_limit; val = sock_net(sk)->ipv6.devconf_all->hop_limit;
break; break;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment