Commit 2d912da0 authored by Jakub Kicinski

Merge branch 'net-use-indirect_call-in-some-dst_ops'

Brian Vazquez says:

====================
net: use INDIRECT_CALL in some dst_ops

This patch series uses the INDIRECT_CALL wrappers in some dst_ops
functions to mitigate retpoline costs. Benefits depend on the
platform as described below.

Background: The kernel rewrites the retpoline code at
__x86_indirect_thunk_r11 depending on the CPU's requirements.
The INDIRECT_CALL wrappers provide hints on possible targets and
save the retpoline overhead using a direct call in case the
target matches one of the hints.

The retpoline overhead has been measured by Luigi Rizzo in
microbenchmarks, using CPU performance counters, for the following
three cases, which cover reasonably well the range of possible retpoline
overheads compared to a plain indirect call (in equal conditions,
specifically with predicted branch, hot cache):

- just "jmp *(%r11)" on modern platforms like Intel Cascadelake.
  In this case the overhead is just 2 clock cycles.

- "lfence; jmp *(%r11)" on e.g. some recent AMD CPUs.
  In this case the lfence is blocked until pending reads complete,
  so the actual overhead depends on previous instructions.
  In the best case we have measured 15 clock cycles of overhead.

- worst case, e.g. skylake, the full retpoline is used

    __x86_indirect_thunk_r11:     call set_up_target
    capture_speculation:          pause
                                  lfence
                                  jmp capture_speculation
    .align 16
    set_up_target:                mov %r11, (%rsp)
                                  ret

   In this case the overhead has been measured at 35-40 clock cycles.

The actual time saved hence depends on the platform and current
clock speed (which varies heavily, especially when C-states are active).
Also note that actual benefit might be lower than expected if the
longer retpoline overlaps with some pending memory read.

MEASUREMENTS:
The INDIRECT_CALL wrappers in this patchset involve the processing
of incoming SYN and generation of syncookies. Hence, the test has been
run by configuring a receiving host with a single NIC rx queue, disabling
RPS and RFS so that all processing occurs on the same core.
An external source generates SYN fast enough to saturate the receiving CPU.
We ran two sets of experiments, with and without the dst_output patch,
comparing the number of syncookies generated over a 20s period
in multiple runs.

Assuming the CPU is saturated, the time per packet is
   t = total_time/number_of_packets
and if the two datasets have a statistically meaningful difference,
the difference in times between the two cases gives an estimate
of the benefits from one INDIRECT_CALL.

Here are the experimental results:

Skylake     Syncookies over 20s (5 tests)
---------------------------------------------------
indirect    9166325 9182023 9170093 9134014 9171082
retpoline   9099308 9126350 9154841 9056377 9122376

Computing the stats on the time per packet (20s/total_packets, in seconds) gives the following:

$ ministat -c 95 -w 70 /tmp/sk-indirect /tmp/sk-retp
x /tmp/sk-indirect
+ /tmp/sk-retp
+----------------------------------------------------------------------+
|x     xx x     +          x    + +           +                       +|
||______M__A_______|_|____________M_____A___________________|          |
+----------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x   5   2.17817e-06   2.18962e-06     2.181e-06  2.182292e-06 4.3252133e-09
+   5   2.18464e-06   2.20839e-06   2.19241e-06  2.194974e-06 8.8695958e-09
Difference at 95.0% confidence
        1.2682e-08 +/- 1.01766e-08
        0.581132% +/- 0.466326%
        (Student's t, pooled s = 6.97772e-09)

This suggests a difference of 13ns +/- 10ns
Our expectation from microbenchmarks was 35-40 cycles per call,
but part of the gains may be eaten by stalls from pending memory reads.

For Cascadelake:
Cascadelake     Syncookies over 20s (5 tests)
---------------------------------------------------------
indirect     10339797 10297547 10366826 10378891 10384854
retpoline    10332674 10366805 10320374 10334272 10374087

Computing the stats on the time per packet (20s/total_packets, in seconds)
gives no meaningful difference even at just 80% confidence (this was expected):

$ ministat -c 80 -w 70 /tmp/cl-indirect /tmp/cl-retp
x /tmp/cl-indirect
+ /tmp/cl-retp
+----------------------------------------------------------------------+
|   x    x  +     *                   x   + +        +                x|
||______________|_M_________A_____A_______M________|___|               |
+----------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x   5   1.92588e-06   1.94221e-06   1.92923e-06  1.931716e-06 6.6936746e-09
+   5   1.92788e-06   1.93791e-06   1.93531e-06  1.933188e-06 4.3734106e-09
No difference proven at 80.0% confidence
====================

Link: https://lore.kernel.org/r/20210201174132.3534118-1-brianvv@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 4f4e5436 bbd807df
......@@ -18,6 +18,7 @@
#include <linux/refcount.h>
#include <net/neighbour.h>
#include <asm/processor.h>
#include <linux/indirect_call_wrapper.h>
struct sk_buff;
......@@ -193,9 +194,11 @@ dst_feature(const struct dst_entry *dst, u32 feature)
return dst_metric(dst, RTAX_FEATURES) & feature;
}
INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *));
INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *));
static inline u32 dst_mtu(const struct dst_entry *dst)
{
return dst->ops->mtu(dst);
return INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst);
}
/* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */
......@@ -435,22 +438,36 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout)
dst->expires = expires;
}
INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *,
struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *,
struct sk_buff *));
/* Output packet to network from transport. */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return skb_dst(skb)->output(net, sk, skb);
return INDIRECT_CALL_INET(skb_dst(skb)->output,
ip6_output, ip_output,
net, sk, skb);
}
INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *));
/* Input packet from network to transport. */
static inline int dst_input(struct sk_buff *skb)
{
return skb_dst(skb)->input(skb);
return INDIRECT_CALL_INET(skb_dst(skb)->input,
ip6_input, ip_local_deliver, skb);
}
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
u32));
static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
{
if (dst->obsolete)
dst = dst->ops->check(dst, cookie);
dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check,
ipv4_dst_check, dst, cookie);
return dst;
}
......
......@@ -526,11 +526,17 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
}
EXPORT_SYMBOL(__sk_receive_skb);
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = __sk_dst_get(sk);
if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
if (dst && dst->obsolete &&
INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
......@@ -546,7 +552,9 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = sk_dst_get(sk);
if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
if (dst && dst->obsolete &&
INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
dst, cookie) == NULL) {
sk_dst_reset(sk);
dst_release(dst);
return NULL;
......
......@@ -253,6 +253,7 @@ int ip_local_deliver(struct sk_buff *skb)
net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
EXPORT_SYMBOL(ip_local_deliver);
static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
{
......
......@@ -434,6 +434,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
EXPORT_SYMBOL(ip_output);
/*
* copy saddr and daddr, possibly using 64bit load/stores
......
......@@ -133,9 +133,11 @@ static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
* Interface to generic destination cache.
*/
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
INDIRECT_CALLABLE_SCOPE
struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int ipv4_mtu(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
......@@ -1187,7 +1189,8 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
u32 cookie)
{
struct rtable *rt = (struct rtable *) dst;
......@@ -1203,6 +1206,7 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
return NULL;
return dst;
}
EXPORT_SYMBOL(ipv4_dst_check);
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
......@@ -1311,7 +1315,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
return min(advmss, IPV4_MAX_PMTU - header_size);
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
const struct rtable *rt = (const struct rtable *)dst;
unsigned int mtu = rt->rt_pmtu;
......@@ -1333,6 +1337,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
EXPORT_SYMBOL(ipv4_mtu);
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
......
......@@ -1649,6 +1649,8 @@ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
return mss;
}
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
u32));
/* The socket must have its spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
......@@ -1668,7 +1670,8 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
!dst->ops->check(dst, 0)) {
!INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
dst, 0)) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
......
......@@ -217,6 +217,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
ip6_finish_output,
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
......
......@@ -81,9 +81,11 @@ enum rt6_nud_state {
RT6_NUD_SUCCEED = 1
};
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
INDIRECT_CALLABLE_SCOPE
struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
......@@ -2611,7 +2613,8 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
return NULL;
}
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
u32 cookie)
{
struct dst_entry *dst_ret;
struct fib6_info *from;
......@@ -2641,6 +2644,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
return dst_ret;
}
EXPORT_SYMBOL(ip6_dst_check);
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
......@@ -3089,7 +3093,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
return mtu;
}
static unsigned int ip6_mtu(const struct dst_entry *dst)
INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst)
{
struct inet6_dev *idev;
unsigned int mtu;
......@@ -3111,6 +3115,7 @@ static unsigned int ip6_mtu(const struct dst_entry *dst)
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
EXPORT_SYMBOL(ip6_mtu);
/* MTU selection:
* 1. mtu on route is locked - use it
......
......@@ -1420,6 +1420,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
return NULL;
}
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
u32));
/* The socket must have its spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
......@@ -1473,7 +1475,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
dst->ops->check(dst, np->rx_dst_cookie) == NULL) {
INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
dst, np->rx_dst_cookie) == NULL) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment