Commit 73a6bab5, authored by Eric Dumazet, committed by David S. Miller

tcp: switch pacing timer to softirq based hrtimer

linux-4.16 got support for softirq based hrtimers.
TCP can switch its pacing hrtimer to this variant, since this
avoids going through a tasklet and some atomic operations.

pacing timer logic looks like other (jiffies based) tcp timers.

v2: use hrtimer_try_to_cancel() in tcp_clear_xmit_timers()
    to correctly release reference on socket if needed.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 4cbd7a7d
...@@ -557,7 +557,9 @@ void tcp_fin(struct sock *sk); ...@@ -557,7 +557,9 @@ void tcp_fin(struct sock *sk);
void tcp_init_xmit_timers(struct sock *); void tcp_init_xmit_timers(struct sock *);
static inline void tcp_clear_xmit_timers(struct sock *sk) static inline void tcp_clear_xmit_timers(struct sock *sk)
{ {
hrtimer_cancel(&tcp_sk(sk)->pacing_timer); if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
sock_put(sk);
inet_csk_clear_xmit_timers(sk); inet_csk_clear_xmit_timers(sk);
} }
......
...@@ -772,7 +772,7 @@ struct tsq_tasklet { ...@@ -772,7 +772,7 @@ struct tsq_tasklet {
}; };
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
static void tcp_tsq_handler(struct sock *sk) static void tcp_tsq_write(struct sock *sk)
{ {
if ((1 << sk->sk_state) & if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
...@@ -789,6 +789,16 @@ static void tcp_tsq_handler(struct sock *sk) ...@@ -789,6 +789,16 @@ static void tcp_tsq_handler(struct sock *sk)
0, GFP_ATOMIC); 0, GFP_ATOMIC);
} }
} }
/* Try to resume transmission for @sk from BH context (the per-cpu TSQ
 * tasklet or the pacing hrtimer callback).
 *
 * Under the BH socket lock: if no user context owns the socket, call
 * tcp_tsq_write() right away. Otherwise set TCP_TSQ_DEFERRED so that
 * tcp_release_cb() performs the write when the owner releases the socket.
 */
static void tcp_tsq_handler(struct sock *sk)
{
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
tcp_tsq_write(sk);
else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
/* First to set the flag: take the reference that tcp_release_cb()
 * drops with __sock_put() after handling TCPF_TSQ_DEFERRED.
 */
sock_hold(sk);
bh_unlock_sock(sk);
}
/* /*
* One tasklet per cpu tries to send more skbs. * One tasklet per cpu tries to send more skbs.
* We run in tasklet context but need to disable irqs when * We run in tasklet context but need to disable irqs when
...@@ -816,16 +826,7 @@ static void tcp_tasklet_func(unsigned long data) ...@@ -816,16 +826,7 @@ static void tcp_tasklet_func(unsigned long data)
smp_mb__before_atomic(); smp_mb__before_atomic();
clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
if (!sk->sk_lock.owned &&
test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
tcp_tsq_handler(sk); tcp_tsq_handler(sk);
}
bh_unlock_sock(sk);
}
sk_free(sk); sk_free(sk);
} }
} }
...@@ -853,9 +854,10 @@ void tcp_release_cb(struct sock *sk) ...@@ -853,9 +854,10 @@ void tcp_release_cb(struct sock *sk)
nflags = flags & ~TCP_DEFERRED_ALL; nflags = flags & ~TCP_DEFERRED_ALL;
} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
if (flags & TCPF_TSQ_DEFERRED) if (flags & TCPF_TSQ_DEFERRED) {
tcp_tsq_handler(sk); tcp_tsq_write(sk);
__sock_put(sk);
}
/* Here begins the tricky part : /* Here begins the tricky part :
* We are called from release_sock() with : * We are called from release_sock() with :
* 1) BH disabled * 1) BH disabled
...@@ -929,7 +931,7 @@ void tcp_wfree(struct sk_buff *skb) ...@@ -929,7 +931,7 @@ void tcp_wfree(struct sk_buff *skb)
if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
goto out; goto out;
nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
nval = cmpxchg(&sk->sk_tsq_flags, oval, nval); nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
if (nval != oval) if (nval != oval)
continue; continue;
...@@ -948,37 +950,17 @@ void tcp_wfree(struct sk_buff *skb) ...@@ -948,37 +950,17 @@ void tcp_wfree(struct sk_buff *skb)
sk_free(sk); sk_free(sk);
} }
/* Note: Called under hard irq. /* Note: Called under soft irq.
* We can not call TCP stack right away. * We can call TCP stack right away, unless socket is owned by user.
*/ */
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
{ {
struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer); struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
struct sock *sk = (struct sock *)tp; struct sock *sk = (struct sock *)tp;
unsigned long nval, oval;
for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) { tcp_tsq_handler(sk);
struct tsq_tasklet *tsq; sock_put(sk);
bool empty;
if (oval & TSQF_QUEUED)
break;
nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
if (nval != oval)
continue;
if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
break;
/* queue this socket to tasklet queue */
tsq = this_cpu_ptr(&tsq_tasklet);
empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
if (empty)
tasklet_schedule(&tsq->tasklet);
break;
}
return HRTIMER_NORESTART; return HRTIMER_NORESTART;
} }
...@@ -1011,7 +993,8 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) ...@@ -1011,7 +993,8 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
do_div(len_ns, rate); do_div(len_ns, rate);
hrtimer_start(&tcp_sk(sk)->pacing_timer, hrtimer_start(&tcp_sk(sk)->pacing_timer,
ktime_add_ns(ktime_get(), len_ns), ktime_add_ns(ktime_get(), len_ns),
HRTIMER_MODE_ABS_PINNED); HRTIMER_MODE_ABS_PINNED_SOFT);
sock_hold(sk);
} }
static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
...@@ -1078,7 +1061,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, ...@@ -1078,7 +1061,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
/* if no packet is in qdisc/device queue, then allow XPS to select /* if no packet is in qdisc/device queue, then allow XPS to select
* another queue. We can be called from tcp_tsq_handler() * another queue. We can be called from tcp_tsq_handler()
* which holds one reference to sk_wmem_alloc. * which holds one reference to sk.
* *
* TODO: Ideally, in-flight pure ACK packets should not matter here. * TODO: Ideally, in-flight pure ACK packets should not matter here.
* One way to get this would be to set skb->truesize = 2 on them. * One way to get this would be to set skb->truesize = 2 on them.
...@@ -2185,7 +2168,7 @@ static int tcp_mtu_probe(struct sock *sk) ...@@ -2185,7 +2168,7 @@ static int tcp_mtu_probe(struct sock *sk)
static bool tcp_pacing_check(const struct sock *sk) static bool tcp_pacing_check(const struct sock *sk)
{ {
return tcp_needs_internal_pacing(sk) && return tcp_needs_internal_pacing(sk) &&
hrtimer_active(&tcp_sk(sk)->pacing_timer); hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
} }
/* TCP Small Queues : /* TCP Small Queues :
...@@ -2365,8 +2348,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ...@@ -2365,8 +2348,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
skb, limit, mss_now, gfp))) skb, limit, mss_now, gfp)))
break; break;
if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0)) if (tcp_small_queue_check(sk, skb, 0))
break; break;
......
...@@ -713,6 +713,6 @@ void tcp_init_xmit_timers(struct sock *sk) ...@@ -713,6 +713,6 @@ void tcp_init_xmit_timers(struct sock *sk)
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer); &tcp_keepalive_timer);
hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_PINNED); HRTIMER_MODE_ABS_PINNED_SOFT);
tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment