Commit 4a8e320c, authored by Eric Dumazet, committed by David S. Miller

net: sched: use pinned timers

While using an MQ + NETEM setup, I got confirmation that the default
timer migration (/proc/sys/kernel/timer_migration) is killing us.

Installing this on the receiver side of a TCP_STREAM test (NIC has 8 TX
queues):

EST="est 1sec 4sec"
for ETH in eth1
do
 tc qd del dev $ETH root 2>/dev/null
 tc qd add dev $ETH root handle 1: mq
 tc qd add dev $ETH parent 1:1 $EST netem limit 70000 delay 6ms
 tc qd add dev $ETH parent 1:2 $EST netem limit 70000 delay 8ms
 tc qd add dev $ETH parent 1:3 $EST netem limit 70000 delay 10ms
 tc qd add dev $ETH parent 1:4 $EST netem limit 70000 delay 12ms
 tc qd add dev $ETH parent 1:5 $EST netem limit 70000 delay 14ms
 tc qd add dev $ETH parent 1:6 $EST netem limit 70000 delay 16ms
 tc qd add dev $ETH parent 1:7 $EST netem limit 80000 delay 18ms
 tc qd add dev $ETH parent 1:8 $EST netem limit 90000 delay 20ms
done

We can see that timers get migrated into a single cpu, presumably idle
at the time timers are set up.
Then all qdisc dequeues run from this cpu and huge lock contention
happens. This single cpu is stuck in softirq mode and cannot dequeue
fast enough.

    39.24%  [kernel]          [k] _raw_spin_lock
     2.65%  [kernel]          [k] netem_enqueue
     1.80%  [kernel]          [k] netem_dequeue
     1.63%  [kernel]          [k] copy_user_enhanced_fast_string
     1.45%  [kernel]          [k] _raw_spin_lock_bh

By pinning qdisc timers on the cpu running the qdisc, we respect proper
XPS setting and remove this lock contention.

     5.84%  [kernel]          [k] netem_enqueue
     4.83%  [kernel]          [k] _raw_spin_lock
     2.92%  [kernel]          [k] copy_user_enhanced_fast_string

Current Qdiscs that benefit from this change are :

	netem, cbq, fq, hfsc, tbf, htb.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 9fb426a6
...@@ -586,7 +586,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) ...@@ -586,7 +586,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{ {
hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
wd->timer.function = qdisc_watchdog; wd->timer.function = qdisc_watchdog;
wd->qdisc = qdisc; wd->qdisc = qdisc;
} }
...@@ -602,7 +602,7 @@ void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) ...@@ -602,7 +602,7 @@ void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
hrtimer_start(&wd->timer, hrtimer_start(&wd->timer,
ns_to_ktime(expires), ns_to_ktime(expires),
HRTIMER_MODE_ABS); HRTIMER_MODE_ABS_PINNED);
} }
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
......
...@@ -617,7 +617,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) ...@@ -617,7 +617,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
time = ktime_set(0, 0); time = ktime_set(0, 0);
time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay)); time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
} }
qdisc_unthrottled(sch); qdisc_unthrottled(sch);
...@@ -1386,7 +1386,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) ...@@ -1386,7 +1386,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
q->link.minidle = -0x7FFFFFFF; q->link.minidle = -0x7FFFFFFF;
qdisc_watchdog_init(&q->watchdog, sch); qdisc_watchdog_init(&q->watchdog, sch);
hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
q->delay_timer.function = cbq_undelay; q->delay_timer.function = cbq_undelay;
q->toplevel = TC_CBQ_MAXLEVEL; q->toplevel = TC_CBQ_MAXLEVEL;
q->now = psched_get_time(); q->now = psched_get_time();
......
...@@ -932,7 +932,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) ...@@ -932,7 +932,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
ktime_t time = ns_to_ktime(next_event); ktime_t time = ns_to_ktime(next_event);
qdisc_throttled(q->watchdog.qdisc); qdisc_throttled(q->watchdog.qdisc);
hrtimer_start(&q->watchdog.timer, time, hrtimer_start(&q->watchdog.timer, time,
HRTIMER_MODE_ABS); HRTIMER_MODE_ABS_PINNED);
} }
} else { } else {
schedule_work(&q->work); schedule_work(&q->work);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment