Commit b1394967 authored by David S. Miller's avatar David S. Miller

Merge branch 'tcp-second-round-for-EDT-conversion'

Eric Dumazet says:

====================
tcp: second round for EDT conversion

First round of EDT patches left TCP stack in a non optimal state.

- High speed flows suffered from loss of performance, addressed
  by the first patch of this series.

- Second patch brings pacing to the current state of networking,
  since we now reach ~100 Gbit on a single TCP flow.

- Third patch implements a mitigation for scheduling delays,
  like the one we did in sch_fq in the past.

- Fourth patch removes one special case in sch_fq for ACK packets.

- Fifth patch removes a serious perfomance cost for TCP internal
  pacing. We should setup the high resolution timer only if
  really needed.

- Sixth patch fixes a typo in BBR.

- Last patch is one minor change in cdg congestion control.

Neal Cardwell also has a patch series fixing BBR after
EDT adoption.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 1a3aea25 825e1c52
......@@ -249,6 +249,7 @@ struct tcp_sock {
u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */
u64 tcp_wstamp_ns; /* departure time for next sent data packet */
u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
/* RTT measurement */
u64 tcp_mstamp; /* most recent packet received/sent */
......
......@@ -422,8 +422,8 @@ struct sock {
struct timer_list sk_timer;
__u32 sk_priority;
__u32 sk_mark;
u32 sk_pacing_rate; /* bytes per second */
u32 sk_max_pacing_rate;
unsigned long sk_pacing_rate; /* bytes per second */
unsigned long sk_max_pacing_rate;
struct page_frag sk_frag;
netdev_features_t sk_route_caps;
netdev_features_t sk_route_nocaps;
......
......@@ -3927,8 +3927,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
break;
case SO_MAX_PACING_RATE:
sk->sk_max_pacing_rate = val;
case SO_MAX_PACING_RATE: /* 32bit version */
sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
sk->sk_pacing_rate = min(sk->sk_pacing_rate,
sk->sk_max_pacing_rate);
break;
......
......@@ -998,7 +998,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
cmpxchg(&sk->sk_pacing_status,
SK_PACING_NONE,
SK_PACING_NEEDED);
sk->sk_max_pacing_rate = val;
sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
sk->sk_pacing_rate = min(sk->sk_pacing_rate,
sk->sk_max_pacing_rate);
break;
......@@ -1336,7 +1336,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
#endif
case SO_MAX_PACING_RATE:
v.val = sk->sk_max_pacing_rate;
/* 32bit version */
v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
break;
case SO_INCOMING_CPU:
......@@ -2810,8 +2811,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_ll_usec = sysctl_net_busy_read;
#endif
sk->sk_max_pacing_rate = ~0U;
sk->sk_pacing_rate = ~0U;
sk->sk_max_pacing_rate = ~0UL;
sk->sk_pacing_rate = ~0UL;
sk->sk_pacing_shift = 10;
sk->sk_incoming_cpu = -1;
......
......@@ -3111,10 +3111,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
unsigned long rate;
u32 now;
u64 rate64;
bool slow;
u32 rate;
memset(info, 0, sizeof(*info));
if (sk->sk_type != SOCK_STREAM)
......@@ -3124,11 +3124,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
/* Report meaningful fields for all TCP states, including listeners */
rate = READ_ONCE(sk->sk_pacing_rate);
rate64 = rate != ~0U ? rate : ~0ULL;
rate64 = (rate != ~0UL) ? rate : ~0ULL;
info->tcpi_pacing_rate = rate64;
rate = READ_ONCE(sk->sk_max_pacing_rate);
rate64 = rate != ~0U ? rate : ~0ULL;
rate64 = (rate != ~0UL) ? rate : ~0ULL;
info->tcpi_max_pacing_rate = rate64;
info->tcpi_reordering = tp->reordering;
......@@ -3254,8 +3254,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *stats;
struct tcp_info info;
unsigned long rate;
u64 rate64;
u32 rate;
stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
if (!stats)
......@@ -3274,7 +3274,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
tp->total_retrans, TCP_NLA_PAD);
rate = READ_ONCE(sk->sk_pacing_rate);
rate64 = rate != ~0U ? rate : ~0ULL;
rate64 = (rate != ~0UL) ? rate : ~0ULL;
nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
rate64 = tcp_compute_delivery_rate(tp);
......
......@@ -129,7 +129,7 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
static const int bbr_min_tso_rate = 1200000;
/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
static const int bbr_pacing_marging_percent = 1;
static const int bbr_pacing_margin_percent = 1;
/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
* that will allow a smoothly increasing pacing rate that will double each RTT
......@@ -214,12 +214,12 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
rate *= mss;
rate *= gain;
rate >>= BBR_SCALE;
rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_marging_percent);
rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
return rate >> BW_SCALE;
}
/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
{
u64 rate = bw;
......@@ -258,7 +258,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
bbr_init_pacing_rate_from_rtt(sk);
......@@ -280,7 +280,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
/* Sort of tcp_tso_autosize() but ignoring
* driver provided sk_gso_max_size.
*/
bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift,
bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift,
GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
......
......@@ -146,7 +146,7 @@ static void tcp_cdg_hystart_update(struct sock *sk)
return;
if (hystart_detect & HYSTART_ACK_TRAIN) {
u32 now_us = div_u64(local_clock(), NSEC_PER_USEC);
u32 now_us = tp->tcp_mstamp;
if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
ca->last_ack = now_us;
......
......@@ -52,9 +52,8 @@ void tcp_mstamp_refresh(struct tcp_sock *tp)
{
u64 val = tcp_clock_ns();
/* departure time for next data packet */
if (val > tp->tcp_wstamp_ns)
tp->tcp_wstamp_ns = val;
if (val > tp->tcp_clock_cache)
tp->tcp_clock_cache = val;
val = div_u64(val, NSEC_PER_USEC);
if (val > tp->tcp_mstamp)
......@@ -976,32 +975,26 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
static void tcp_internal_pacing(struct sock *sk)
{
if (!tcp_needs_internal_pacing(sk))
return;
hrtimer_start(&tcp_sk(sk)->pacing_timer,
ns_to_ktime(tcp_sk(sk)->tcp_wstamp_ns),
HRTIMER_MODE_ABS_PINNED_SOFT);
sock_hold(sk);
}
static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb)
static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
u64 prior_wstamp)
{
struct tcp_sock *tp = tcp_sk(sk);
skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
if (sk->sk_pacing_status != SK_PACING_NONE) {
u32 rate = sk->sk_pacing_rate;
unsigned long rate = sk->sk_pacing_rate;
/* Original sch_fq does not pace first 10 MSS
* Note that tp->data_segs_out overflows after 2^32 packets,
* this is a minor annoyance.
*/
if (rate != ~0U && rate && tp->data_segs_out >= 10) {
tp->tcp_wstamp_ns += div_u64((u64)skb->len * NSEC_PER_SEC, rate);
if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
tcp_internal_pacing(sk);
/* take into account OS jitter */
len_ns -= min_t(u64, len_ns / 2, credit);
tp->tcp_wstamp_ns += len_ns;
}
}
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
......@@ -1030,6 +1023,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
struct sk_buff *oskb = NULL;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
u64 prior_wstamp;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
......@@ -1050,6 +1044,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
if (unlikely(!skb))
return -ENOBUFS;
}
prior_wstamp = tp->tcp_wstamp_ns;
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
inet = inet_sk(sk);
......@@ -1166,7 +1164,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
err = net_xmit_eval(err);
}
if (!err && oskb) {
tcp_update_skb_after_send(sk, oskb);
tcp_update_skb_after_send(sk, oskb, prior_wstamp);
tcp_rate_skb_sent(sk, oskb);
}
return err;
......@@ -1701,7 +1699,8 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
{
u32 bytes, segs;
bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
bytes = min_t(unsigned long,
sk->sk_pacing_rate >> sk->sk_pacing_shift,
sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
/* Goal is to send at least one packet per ms,
......@@ -2175,10 +2174,23 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
static bool tcp_pacing_check(const struct sock *sk)
static bool tcp_pacing_check(struct sock *sk)
{
return tcp_needs_internal_pacing(sk) &&
hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
struct tcp_sock *tp = tcp_sk(sk);
if (!tcp_needs_internal_pacing(sk))
return false;
if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
return false;
if (!hrtimer_is_queued(&tp->pacing_timer)) {
hrtimer_start(&tp->pacing_timer,
ns_to_ktime(tp->tcp_wstamp_ns),
HRTIMER_MODE_ABS_PINNED_SOFT);
sock_hold(sk);
}
return true;
}
/* TCP Small Queues :
......@@ -2195,10 +2207,12 @@ static bool tcp_pacing_check(const struct sock *sk)
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
unsigned int factor)
{
unsigned int limit;
unsigned long limit;
limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
limit = min_t(u32, limit,
limit = max_t(unsigned long,
2 * skb->truesize,
sk->sk_pacing_rate >> sk->sk_pacing_shift);
limit = min_t(unsigned long, limit,
sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
......@@ -2315,7 +2329,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp" is used as a start point for the retransmit timer */
tcp_update_skb_after_send(sk, skb);
tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
goto repair; /* Skip network transmission */
}
......@@ -2890,7 +2904,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
} tcp_skb_tsorted_restore(skb);
if (!err) {
tcp_update_skb_after_send(sk, skb);
tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
tcp_rate_skb_sent(sk, skb);
}
} else {
......
......@@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk)
*/
start_ts = tcp_skb_timestamp(skb);
if (!start_ts)
skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
skb->skb_mstamp_ns = tp->tcp_clock_cache;
else if (icsk->icsk_user_timeout &&
(s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
goto abort;
......
......@@ -92,8 +92,8 @@ struct fq_sched_data {
u32 quantum;
u32 initial_quantum;
u32 flow_refill_delay;
u32 flow_max_rate; /* optional max rate per flow */
u32 flow_plimit; /* max packets per flow */
unsigned long flow_max_rate; /* optional max rate per flow */
u32 orphan_mask; /* mask for orphaned skb */
u32 low_rate_threshold;
struct rb_root *fq_root;
......@@ -416,7 +416,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
u32 rate, plen;
unsigned long rate;
u32 plen;
skb = fq_dequeue_head(sch, &q->internal);
if (skb)
......@@ -443,7 +444,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
}
skb = f->head;
if (skb && !skb_is_tcp_pure_ack(skb)) {
if (skb) {
u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp),
f->time_next_packet);
......@@ -485,11 +486,11 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
if (f->credit > 0)
goto out;
}
if (rate != ~0U) {
if (rate != ~0UL) {
u64 len = (u64)plen * NSEC_PER_SEC;
if (likely(rate))
do_div(len, rate);
len = div64_ul(len, rate);
/* Since socket rate can change later,
* clamp the delay to 1 second.
* Really, providers of too big packets should be fixed !
......@@ -701,9 +702,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
if (tb[TCA_FQ_FLOW_MAX_RATE])
q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
if (tb[TCA_FQ_FLOW_MAX_RATE]) {
u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
}
if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
q->low_rate_threshold =
nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
......@@ -766,7 +769,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->quantum = 2 * psched_mtu(qdisc_dev(sch));
q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
q->flow_refill_delay = msecs_to_jiffies(40);
q->flow_max_rate = ~0U;
q->flow_max_rate = ~0UL;
q->time_next_delayed_flow = ~0ULL;
q->rate_enable = 1;
q->new_flows.first = NULL;
......@@ -802,7 +805,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
min_t(unsigned long, q->flow_max_rate, ~0U)) ||
nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
jiffies_to_usecs(q->flow_refill_delay)) ||
nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment