Commit a88e24f2 authored by David S. Miller

Merge branch 'tcp-switch-to-Early-Departure-Time-model'

Eric Dumazet says:

====================
tcp: switch to Early Departure Time model

In the early days, pacing was implemented in sch_fq (FQ)
in a generic way:

- SO_MAX_PACING_RATE could be used by any socket.

- TCP would vary effective pacing rate based on CWND*MSS/SRTT

- FQ would enforce delays between packets based on the current
  sk->sk_pacing_rate, but with some quantum-based artifacts
  (inflating RPC tail latencies).

- BBR then tweaked the pacing rate in its various phases
  (PROBE, DRAIN, ...)

This worked reasonably well, but had the side effect that TCP RTT
samples would be inflated by the sojourn time of the packets in FQ.

Also note that when FQ is not used and TCP wants pacing, the
internal pacing fallback has very different behavior, since TCP
emits packets at the time they should be sent (with unreasonable
assumptions about scheduling costs)

Van Jacobson gave a talk at Netdev 0x12 in Montreal about letting
TCP (or applications, for UDP messages) decide on the Earliest
Departure Time, instead of letting packet schedulers derive it
from the pacing rate.

https://www.netdevconf.org/0x12/session.html?evolving-from-afap-teaching-nics-about-time
https://www.files.netdevconf.org/d/46def75c2ef345809bbe/files/?p=/Evolving%20from%20AFAP%20%E2%80%93%20Teaching%20NICs%20about%20time.pdf
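
To make the model concrete, here is a minimal user-level sketch of the
split the talk advocates (not kernel code, and all names below are
hypothetical): the sender owns the pacing decision and stamps each packet
with its earliest departure time, while the scheduler only has to hold a
packet until the clock reaches that stamp.

  #include <stdint.h>

  struct pkt {
          uint64_t tstamp_ns;     /* earliest departure time, like skb->tstamp */
          uint32_t len;           /* packet length in bytes */
  };

  /* Sender side: advance the flow's departure clock by len/rate and stamp.
   * Assumes a non-zero pacing rate in bytes per second.
   */
  static void edt_stamp(struct pkt *p, uint64_t *next_departure_ns,
                        uint64_t now_ns, uint64_t pacing_rate_bps)
  {
          if (*next_departure_ns < now_ns)        /* never schedule in the past */
                  *next_departure_ns = now_ns;
          p->tstamp_ns = *next_departure_ns;
          *next_departure_ns += (uint64_t)p->len * 1000000000ULL / pacing_rate_bps;
  }

  /* Scheduler side: a packet may leave once the clock reaches its stamp. */
  static int edt_ready(const struct pkt *p, uint64_t now_ns)
  {
          return now_ns >= p->tstamp_ns;
  }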

Recent additions in Linux provided SO_TXTIME and a new ETF qdisc
supporting the new skb->tstamp role.
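
For reference, the SO_TXTIME / SCM_TXTIME user API looks roughly like the
sketch below for a UDP sender (error handling omitted; depending on libc
headers, SO_TXTIME/SCM_TXTIME may need to be defined by hand, and the clock
chosen here, CLOCK_TAI, must match the one the qdisc uses):

  #include <linux/net_tstamp.h>   /* struct sock_txtime */
  #include <sys/socket.h>
  #include <string.h>
  #include <stdint.h>
  #include <time.h>

  /* One-time setup: departure times on this socket are CLOCK_TAI nanoseconds. */
  static int enable_txtime(int fd)
  {
          struct sock_txtime cfg = { .clockid = CLOCK_TAI, .flags = 0 };

          return setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
  }

  /* Queue one datagram with an explicit earliest departure time. */
  static ssize_t send_at(int fd, const void *buf, size_t len,
                         const struct sockaddr *dst, socklen_t dstlen,
                         uint64_t txtime_ns)
  {
          char control[CMSG_SPACE(sizeof(txtime_ns))] = {0};
          struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
          struct msghdr msg = {
                  .msg_name = (void *)dst, .msg_namelen = dstlen,
                  .msg_iov = &iov, .msg_iovlen = 1,
                  .msg_control = control, .msg_controllen = sizeof(control),
          };
          struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

          cm->cmsg_level = SOL_SOCKET;
          cm->cmsg_type = SCM_TXTIME;
          cm->cmsg_len = CMSG_LEN(sizeof(txtime_ns));
          memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));

          return sendmsg(fd, &msg, 0);
  }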

This patch series converts TCP and FQ to the same model.

This might in the future allow us to relax tight TSQ limits
(if FQ is present in the output path), and thus lower the
number of callbacks to tcp_write_xmit(), thanks to batching.

This will be followed by an FQ change allowing SO_TXTIME support,
so that QUIC servers can let the pacing be done in FQ (or
offloaded if the network device permits).

For example, a TCP flow rated at 24Mbps now shows a more meaningful RTT

Before:

ESTAB  0  211408 10.246.7.151:41558   10.246.7.152:33723
	 cubic wscale:8,8 rto:203 rtt:2.195/0.084 mss:1448 rcvmss:536
  advmss:1448 cwnd:20 ssthresh:20 bytes_acked:36897937
  segs_out:25488 segs_in:12454 data_segs_out:25486
  send 105.5Mbps lastsnd:1 lastrcv:12851 lastack:1
  pacing_rate 24.0Mbps/24.0Mbps delivery_rate 22.9Mbps
  busy:12851ms unacked:4 rcv_space:29200 notsent:205616 minrtt:0.026

After:

ESTAB  0  192584 10.246.7.151:61612   10.246.7.152:34375
	 cubic wscale:8,8 rto:201 rtt:0.165/0.129 mss:1448 rcvmss:536
  advmss:1448 cwnd:20 ssthresh:20 bytes_acked:170755401
  segs_out:117931 segs_in:57651 data_segs_out:117929
  send 1404.1Mbps lastsnd:1 lastrcv:56915 lastack:1
  pacing_rate 24.0Mbps/24.0Mbps delivery_rate 24.2Mbps
  busy:56915ms unacked:4 rcv_space:29200 notsent:186792 minrtt:0.054

A nice side effect of this patch series is a reduction of max/p99
latencies of RPC workloads, since the FQ quantum no longer adds
artifacts.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 4f4b93a8 90caf67b
@@ -689,7 +689,7 @@ struct sk_buff {
 	union {
 		ktime_t		tstamp;
-		u64		skb_mstamp;
+		u64		skb_mstamp_ns; /* earliest departure time */
 	};
 	/*
 	 * This is the control buffer. It is free to use for every
......
@@ -248,6 +248,8 @@ struct tcp_sock {
 		syn_smc:1;	/* SYN includes SMC */
 	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */
 
+	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
+
 	/* RTT measurement */
 	u64	tcp_mstamp;	/* most recent packet received/sent */
 	u32	srtt_us;	/* smoothed round trip time << 3 in usecs */
......
@@ -732,7 +732,7 @@ void tcp_send_window_probe(struct sock *sk);
 static inline u64 tcp_clock_ns(void)
 {
-	return local_clock();
+	return ktime_get_tai_ns();
 }
 
 static inline u64 tcp_clock_us(void)
@@ -752,17 +752,7 @@ static inline u32 tcp_time_stamp_raw(void)
 	return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ);
 }
 
-/* Refresh 1us clock of a TCP socket,
- * ensuring monotically increasing values.
- */
-static inline void tcp_mstamp_refresh(struct tcp_sock *tp)
-{
-	u64 val = tcp_clock_us();
-
-	if (val > tp->tcp_mstamp)
-		tp->tcp_mstamp = val;
-}
+void tcp_mstamp_refresh(struct tcp_sock *tp);
 
 static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 {
@@ -771,7 +761,13 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
 {
-	return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ);
+	return div_u64(skb->skb_mstamp_ns, NSEC_PER_SEC / TCP_TS_HZ);
+}
+
+/* provide the departure time in us unit */
+static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
+{
+	return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
 }
@@ -817,7 +813,7 @@ struct tcp_skb_cb {
 #define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/
 #define TCPCB_LOST		0x04	/* SKB is lost			*/
 #define TCPCB_TAGBITS		0x07	/* All tag bits			*/
-#define TCPCB_REPAIRED		0x10	/* SKB repaired (no skb_mstamp)	*/
+#define TCPCB_REPAIRED		0x10	/* SKB repaired (no skb_mstamp_ns)	*/
 #define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame	*/
 #define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
 				TCPCB_REPAIRED)
@@ -1940,7 +1936,7 @@ static inline s64 tcp_rto_delta_us(const struct sock *sk)
 {
 	const struct sk_buff *skb = tcp_rtx_queue_head(sk);
 	u32 rto = inet_csk(sk)->icsk_rto;
-	u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
+	u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto);
 
 	return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
 }
......
@@ -88,7 +88,7 @@ u64 cookie_init_timestamp(struct request_sock *req)
 		ts <<= TSBITS;
 		ts |= options;
 	}
-	return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ);
+	return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
 }
......
@@ -1295,7 +1295,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			copy = size_goal;
 
 			/* All packets are restored as if they have
-			 * already been sent. skb_mstamp isn't set to
+			 * already been sent. skb_mstamp_ns isn't set to
 			 * avoid wrong rtt estimation.
 			 */
 			if (tp->repair)
......
@@ -128,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
 /* Skip TSO below the following bandwidth (bits/sec): */
 static const int bbr_min_tso_rate = 1200000;
 
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
+static const int bbr_pacing_marging_percent = 1;
+
 /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
  * that will allow a smoothly increasing pacing rate that will double each RTT
  * and send the same number of packets per RTT that an un-paced, slow-starting
@@ -208,12 +211,10 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
 {
 	unsigned int mss = tcp_sk(sk)->mss_cache;
 
-	if (!tcp_needs_internal_pacing(sk))
-		mss = tcp_mss_to_mtu(sk, mss);
-
 	rate *= mss;
 	rate *= gain;
 	rate >>= BBR_SCALE;
-	rate *= USEC_PER_SEC;
+	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_marging_percent);
 	return rate >> BW_SCALE;
 }
......
@@ -1305,7 +1305,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 	 */
 	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
 			start_seq, end_seq, dup_sack, pcount,
-			skb->skb_mstamp);
+			tcp_skb_timestamp_us(skb));
 	tcp_rate_skb_delivered(sk, skb, state->rate);
 
 	if (skb == tp->lost_skb_hint)
@@ -1580,7 +1580,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						TCP_SKB_CB(skb)->end_seq,
 						dup_sack,
 						tcp_skb_pcount(skb),
-						skb->skb_mstamp);
+						tcp_skb_timestamp_us(skb));
 			tcp_rate_skb_delivered(sk, skb, state->rate);
 			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
 				list_del_init(&skb->tcp_tsorted_anchor);
@@ -3103,7 +3103,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 				tp->retrans_out -= acked_pcount;
 			flag |= FLAG_RETRANS_DATA_ACKED;
 		} else if (!(sacked & TCPCB_SACKED_ACKED)) {
-			last_ackt = skb->skb_mstamp;
+			last_ackt = tcp_skb_timestamp_us(skb);
 			WARN_ON_ONCE(last_ackt == 0);
 			if (!first_ackt)
 				first_ackt = last_ackt;
@@ -3121,7 +3121,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 			tp->delivered += acked_pcount;
 			if (!tcp_skb_spurious_retrans(tp, skb))
 				tcp_rack_advance(tp, sacked, scb->end_seq,
-						 skb->skb_mstamp);
+						 tcp_skb_timestamp_us(skb));
 		}
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
@@ -3215,7 +3215,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 				tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
 			}
 		}
 	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
-		   sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
+		   sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
+						    tcp_skb_timestamp_us(skb))) {
 		/* Do not re-arm RTO if the sack RTT is measured from data sent
 		 * after when the head was last (re)transmitted. Otherwise the
 		 * timeout may continue to extend in loss recovery.
......
@@ -544,7 +544,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 			BUG_ON(!skb);
 
 			tcp_mstamp_refresh(tp);
-			delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
+			delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 			remaining = icsk->icsk_rto -
 				    usecs_to_jiffies(delta_us);
......
@@ -45,6 +45,22 @@
 #include <trace/events/tcp.h>
 
+/* Refresh clocks of a TCP socket,
+ * ensuring monotically increasing values.
+ */
+void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+	u64 val = tcp_clock_ns();
+
+	/* departure time for next data packet */
+	if (val > tp->tcp_wstamp_ns)
+		tp->tcp_wstamp_ns = val;
+
+	val = div_u64(val, NSEC_PER_USEC);
+	if (val > tp->tcp_mstamp)
+		tp->tcp_mstamp = val;
+}
+
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
@@ -977,28 +993,34 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+static void tcp_internal_pacing(struct sock *sk)
 {
-	u64 len_ns;
-	u32 rate;
-
 	if (!tcp_needs_internal_pacing(sk))
 		return;
-	rate = sk->sk_pacing_rate;
-	if (!rate || rate == ~0U)
-		return;
-	len_ns = (u64)skb->len * NSEC_PER_SEC;
-	do_div(len_ns, rate);
 	hrtimer_start(&tcp_sk(sk)->pacing_timer,
-		      ktime_add_ns(ktime_get(), len_ns),
+		      ns_to_ktime(tcp_sk(sk)->tcp_wstamp_ns),
		      HRTIMER_MODE_ABS_PINNED_SOFT);
 	sock_hold(sk);
 }
 
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb)
 {
-	skb->skb_mstamp = tp->tcp_mstamp;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
+	if (sk->sk_pacing_status != SK_PACING_NONE) {
+		u32 rate = sk->sk_pacing_rate;
+
+		/* Original sch_fq does not pace first 10 MSS
+		 * Note that tp->data_segs_out overflows after 2^32 packets,
+		 * this is a minor annoyance.
+		 */
+		if (rate != ~0U && rate && tp->data_segs_out >= 10) {
+			tp->tcp_wstamp_ns += div_u64((u64)skb->len * NSEC_PER_SEC, rate);
+
+			tcp_internal_pacing(sk);
+		}
+	}
 	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
 }
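
As a rough worked example (numbers not taken from the patches themselves):
with sk_pacing_rate = 3,000,000 bytes/sec (the 24Mbps flow from the cover
letter) and a roughly 1500 byte skb, tcp_wstamp_ns advances by about
1500 * NSEC_PER_SEC / 3,000,000 = 500,000 ns per packet, so consecutive skbs
carry departure times about 0.5 ms apart and FQ (or the internal pacing
timer) only has to hold each packet until its stamp is reached.
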
@@ -1045,7 +1067,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		if (unlikely(!skb))
 			return -ENOBUFS;
 	}
-	skb->skb_mstamp = tp->tcp_mstamp;
+	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 
 	inet = inet_sk(sk);
 	tcb = TCP_SKB_CB(skb);
@@ -1137,7 +1159,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		tcp_event_data_sent(tp, sk);
 		tp->data_segs_out += tcp_skb_pcount(skb);
 		tp->bytes_sent += skb->len - tcp_header_size;
-		tcp_internal_pacing(sk, skb);
 	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -1149,8 +1170,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
 	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
 
-	/* Our usage of tstamp should remain private */
-	skb->tstamp = 0;
+	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
 
 	/* Cleanup our debris for IP stacks */
 	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1163,7 +1183,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		err = net_xmit_eval(err);
 	}
 	if (!err && oskb) {
-		tcp_update_skb_after_send(tp, oskb);
+		tcp_update_skb_after_send(sk, oskb);
 		tcp_rate_skb_sent(sk, oskb);
 	}
 	return err;
@@ -1966,7 +1986,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	head = tcp_rtx_queue_head(sk);
 	if (!head)
 		goto send_now;
-	age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
+	age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
 	/* If next ACK is likely to come too late (half srtt), do not defer */
 	if (age < (tp->srtt_us >> 4))
 		goto send_now;
@@ -2312,7 +2332,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
 			/* "skb_mstamp" is used as a start point for the retransmit timer */
-			tcp_update_skb_after_send(tp, skb);
+			tcp_update_skb_after_send(sk, skb);
 			goto repair; /* Skip network transmission */
 		}
@@ -2887,7 +2907,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		     } tcp_skb_tsorted_restore(skb);
 
 		if (!err) {
-			tcp_update_skb_after_send(tp, skb);
+			tcp_update_skb_after_send(sk, skb);
 			tcp_rate_skb_sent(sk, skb);
 		}
 	} else {
@@ -3205,10 +3225,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
-		skb->skb_mstamp = cookie_init_timestamp(req);
+		skb->skb_mstamp_ns = cookie_init_timestamp(req);
 	else
 #endif
-		skb->skb_mstamp = tcp_clock_us();
+		skb->skb_mstamp_ns = tcp_clock_ns();
 
 #ifdef CONFIG_TCP_MD5SIG
 	rcu_read_lock();
@@ -3424,7 +3444,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
 
-	syn->skb_mstamp = syn_data->skb_mstamp;
+	syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
 
 	/* Now full SYN+DATA was cloned and sent (or not),
 	 * remove the SYN from the original skb (syn_data)
......
@@ -55,8 +55,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
 	 * bandwidth estimate.
 	 */
 	if (!tp->packets_out) {
-		tp->first_tx_mstamp  = skb->skb_mstamp;
-		tp->delivered_mstamp = skb->skb_mstamp;
+		u64 tstamp_us = tcp_skb_timestamp_us(skb);
+
+		tp->first_tx_mstamp  = tstamp_us;
+		tp->delivered_mstamp = tstamp_us;
 	}
 
 	TCP_SKB_CB(skb)->tx.first_tx_mstamp	= tp->first_tx_mstamp;
@@ -88,13 +90,12 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 		rs->is_app_limited   = scb->tx.is_app_limited;
 		rs->is_retrans	     = scb->sacked & TCPCB_RETRANS;
 
+		/* Record send time of most recently ACKed packet: */
+		tp->first_tx_mstamp  = tcp_skb_timestamp_us(skb);
 		/* Find the duration of the "send phase" of this window: */
-		rs->interval_us      = tcp_stamp_us_delta(
-						skb->skb_mstamp,
-						scb->tx.first_tx_mstamp);
-
-		/* Record send time of most recently ACKed packet: */
-		tp->first_tx_mstamp  = skb->skb_mstamp;
+		rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
+						     scb->tx.first_tx_mstamp);
 	}
 
 	/* Mark off the skb delivered once it's sacked to avoid being
 	 * used again when it's cumulatively acked. For acked packets
......
@@ -50,7 +50,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
 s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
 {
 	return tp->rack.rtt_us + reo_wnd -
-	       tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+	       tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb));
 }
 
 /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
@@ -91,7 +91,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 		    !(scb->sacked & TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
+		if (!tcp_rack_sent_after(tp->rack.mstamp,
+					 tcp_skb_timestamp_us(skb),
 					 tp->rack.end_seq, scb->end_seq))
 			break;
......
@@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk)
 	 */
 	start_ts = tcp_skb_timestamp(skb);
 	if (!start_ts)
-		skb->skb_mstamp = tp->tcp_mstamp;
+		skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 	else if (icsk->icsk_user_timeout &&
 		 (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
 		goto abort;
@@ -758,7 +758,7 @@ void tcp_init_xmit_timers(struct sock *sk)
 {
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
 				  &tcp_keepalive_timer);
-	hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
+	hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_TAI,
 		     HRTIMER_MODE_ABS_PINNED_SOFT);
 	tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
......
@@ -106,7 +106,6 @@ struct fq_sched_data {
 	u64		stat_gc_flows;
 	u64		stat_internal_packets;
-	u64		stat_tcp_retrans;
 	u64		stat_throttled;
 	u64		stat_flows_plimit;
 	u64		stat_pkts_too_long;
@@ -327,62 +326,17 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 	return skb;
 }
 
-/* We might add in the future detection of retransmits
- * For the time being, just return false
- */
-static bool skb_is_retransmit(struct sk_buff *skb)
-{
-	return false;
-}
-
-/* add skb to flow queue
- * flow queue is a linked list, kind of FIFO, except for TCP retransmits
- * We special case tcp retransmits to be transmitted before other packets.
- * We rely on fact that TCP retransmits are unlikely, so we do not waste
- * a separate queue or a pointer.
- * head->  [retrans pkt 1]
- *         [retrans pkt 2]
- *         [ normal pkt 1]
- *         [ normal pkt 2]
- *         [ normal pkt 3]
- * tail->  [ normal pkt 4]
- */
 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
 {
-	struct sk_buff *prev, *head = flow->head;
+	struct sk_buff *head = flow->head;
 
 	skb->next = NULL;
-	if (!head) {
+	if (!head)
 		flow->head = skb;
-		flow->tail = skb;
-		return;
-	}
-
-	if (likely(!skb_is_retransmit(skb))) {
+	else
 		flow->tail->next = skb;
-		flow->tail = skb;
-		return;
-	}
 
-	/* This skb is a tcp retransmit,
-	 * find the last retrans packet in the queue
-	 */
-	prev = NULL;
-	while (skb_is_retransmit(head)) {
-		prev = head;
-		head = head->next;
-		if (!head)
-			break;
-	}
-	if (!prev) { /* no rtx packet in queue, become the new head */
-		skb->next = flow->head;
-		flow->head = skb;
-	} else {
-		if (prev == flow->tail)
-			flow->tail = skb;
-		else
-			skb->next = prev->next;
-		prev->next = skb;
-	}
+	flow->tail = skb;
 }
 
 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -401,8 +355,6 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	}
 
 	f->qlen++;
-	if (skb_is_retransmit(skb))
-		q->stat_tcp_retrans++;
 	qdisc_qstats_backlog_inc(sch, skb);
 	if (fq_flow_is_detached(f)) {
 		struct sock *sk = skb->sk;
@@ -460,7 +412,7 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
 static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
-	u64 now = ktime_get_ns();
+	u64 now = ktime_get_tai_ns();
 	struct fq_flow_head *head;
 	struct sk_buff *skb;
 	struct fq_flow *f;
@@ -491,11 +443,16 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	}
 
 	skb = f->head;
-	if (unlikely(skb && now < f->time_next_packet &&
-		     !skb_is_tcp_pure_ack(skb))) {
-		head->first = f->next;
-		fq_flow_set_throttled(q, f);
-		goto begin;
+	if (skb && !skb_is_tcp_pure_ack(skb)) {
+		u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp),
+					     f->time_next_packet);
+
+		if (now < time_next_packet) {
+			head->first = f->next;
+			f->time_next_packet = time_next_packet;
+			fq_flow_set_throttled(q, f);
+			goto begin;
+		}
 	}
 
 	skb = fq_dequeue_head(sch, f);
@@ -513,11 +470,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	prefetch(&skb->end);
 	f->credit -= qdisc_pkt_len(skb);
 
-	if (!q->rate_enable)
-		goto out;
-
-	/* Do not pace locally generated ack packets */
-	if (skb_is_tcp_pure_ack(skb))
+	if (ktime_to_ns(skb->tstamp) || !q->rate_enable)
 		goto out;
 
 	rate = q->flow_max_rate;
@@ -823,7 +776,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 	q->fq_trees_log		= ilog2(1024);
 	q->orphan_mask		= 1024 - 1;
 	q->low_rate_threshold	= 550000 / 8;
-	qdisc_watchdog_init(&q->watchdog, sch);
+	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_TAI);
 
 	if (opt)
 		err = fq_change(sch, opt, extack);
@@ -873,12 +826,12 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.gc_flows		  = q->stat_gc_flows;
 	st.highprio_packets	  = q->stat_internal_packets;
-	st.tcp_retrans		  = q->stat_tcp_retrans;
+	st.tcp_retrans		  = 0;
 	st.throttled		  = q->stat_throttled;
 	st.flows_plimit		  = q->stat_flows_plimit;
 	st.pkts_too_long	  = q->stat_pkts_too_long;
 	st.allocation_errors	  = q->stat_allocation_errors;
-	st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
+	st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_tai_ns();
 	st.flows		  = q->flows;
 	st.inactive_flows	  = q->inactive_flows;
 	st.throttled_flows	  = q->throttled_flows;
......