Commit 583396f4 authored by Eric Dumazet's avatar Eric Dumazet Committed by David S. Miller

net_sched: sch_fq: enable use of hrtimer slack

Add a new attribute to control the fq qdisc hrtimer slack.

Default is set to 10 usec.

When/if packets are throttled, fq set up an hrtimer that can
lead to one interrupt per packet in the throttled queue.

By using a timer slack, we allow better use of timer interrupts,
by giving them a chance to call multiple timer callbacks
at each hardware interrupt.

Also, giving a slack allows FQ to dequeue batches of packets
instead of a single one, thus increasing xmit_more efficiency.

This has no negative effect on the rate a TCP flow can sustain,
since each TCP flow maintains its own precise vtime (tp->tcp_wstamp_ns)

v2: added strict netlink checking (as feedback from Jakub Kicinski)

Tested:
 1000 concurrent flows all using paced packets.
 1,000,000 packets sent per second.

Before the patch :

$ vmstat 2 10
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  0      0 60726784  23628 3485992    0    0   138     1  977  535  0 12 87  0  0
 0  0      0 60714700  23628 3485628    0    0     0     0 1568827 26462  0 22 78  0  0
 1  0      0 60716012  23628 3485656    0    0     0     0 1570034 26216  0 22 78  0  0
 0  0      0 60722420  23628 3485492    0    0     0     0 1567230 26424  0 22 78  0  0
 0  0      0 60727484  23628 3485556    0    0     0     0 1568220 26200  0 22 78  0  0
 2  0      0 60718900  23628 3485380    0    0     0    40 1564721 26630  0 22 78  0  0
 2  0      0 60718096  23628 3485332    0    0     0     0 1562593 26432  0 22 78  0  0
 0  0      0 60719608  23628 3485064    0    0     0     0 1563806 26238  0 22 78  0  0
 1  0      0 60722876  23628 3485236    0    0     0   130 1565874 26566  0 22 78  0  0
 1  0      0 60722752  23628 3484908    0    0     0     0 1567646 26247  0 22 78  0  0

After the patch, slack of 10 usec, we can see a reduction of interrupts
per second, and a small decrease of reported cpu usage.

$ vmstat 2 10
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 1  0      0 60722564  23628 3484728    0    0   133     1  696  545  0 13 87  0  0
 1  0      0 60722568  23628 3484824    0    0     0     0 977278 25469  0 20 80  0  0
 0  0      0 60716396  23628 3484764    0    0     0     0 979997 25326  0 20 80  0  0
 0  0      0 60713844  23628 3484960    0    0     0     0 981394 25249  0 20 80  0  0
 2  0      0 60720468  23628 3484916    0    0     0     0 982860 25062  0 20 80  0  0
 1  0      0 60721236  23628 3484856    0    0     0     0 982867 25100  0 20 80  0  0
 1  0      0 60722400  23628 3484456    0    0     0     8 982698 25303  0 20 80  0  0
 0  0      0 60715396  23628 3484428    0    0     0     0 981777 25176  0 20 80  0  0
 0  0      0 60716520  23628 3486544    0    0     0    36 978965 27857  0 21 79  0  0
 0  0      0 60719592  23628 3486516    0    0     0    22 977318 25106  0 20 80  0  0
Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent b88948fb
...@@ -911,6 +911,8 @@ enum { ...@@ -911,6 +911,8 @@ enum {
TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */ TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */
TCA_FQ_TIMER_SLACK, /* timer slack */
__TCA_FQ_MAX __TCA_FQ_MAX
}; };
......
...@@ -121,6 +121,8 @@ struct fq_sched_data { ...@@ -121,6 +121,8 @@ struct fq_sched_data {
u64 stat_flows_plimit; u64 stat_flows_plimit;
u64 stat_pkts_too_long; u64 stat_pkts_too_long;
u64 stat_allocation_errors; u64 stat_allocation_errors;
u32 timer_slack; /* hrtimer slack in ns */
struct qdisc_watchdog watchdog; struct qdisc_watchdog watchdog;
}; };
...@@ -504,8 +506,9 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) ...@@ -504,8 +506,9 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
head = &q->old_flows; head = &q->old_flows;
if (!head->first) { if (!head->first) {
if (q->time_next_delayed_flow != ~0ULL) if (q->time_next_delayed_flow != ~0ULL)
qdisc_watchdog_schedule_ns(&q->watchdog, qdisc_watchdog_schedule_range_ns(&q->watchdog,
q->time_next_delayed_flow); q->time_next_delayed_flow,
q->timer_slack);
return NULL; return NULL;
} }
} }
...@@ -735,6 +738,8 @@ static int fq_resize(struct Qdisc *sch, u32 log) ...@@ -735,6 +738,8 @@ static int fq_resize(struct Qdisc *sch, u32 log)
} }
static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK },
[TCA_FQ_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_QUANTUM] = { .type = NLA_U32 }, [TCA_FQ_QUANTUM] = { .type = NLA_U32 },
...@@ -747,6 +752,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { ...@@ -747,6 +752,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 },
[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
[TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
[TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 },
}; };
static int fq_change(struct Qdisc *sch, struct nlattr *opt, static int fq_change(struct Qdisc *sch, struct nlattr *opt,
...@@ -833,6 +839,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, ...@@ -833,6 +839,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
q->ce_threshold = (u64)NSEC_PER_USEC * q->ce_threshold = (u64)NSEC_PER_USEC *
nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
if (tb[TCA_FQ_TIMER_SLACK])
q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
if (!err) { if (!err) {
sch_tree_unlock(sch); sch_tree_unlock(sch);
err = fq_resize(sch, fq_log); err = fq_resize(sch, fq_log);
...@@ -884,6 +893,8 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, ...@@ -884,6 +893,8 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->orphan_mask = 1024 - 1; q->orphan_mask = 1024 - 1;
q->low_rate_threshold = 550000 / 8; q->low_rate_threshold = 550000 / 8;
q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
/* Default ce_threshold of 4294 seconds */ /* Default ce_threshold of 4294 seconds */
q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
...@@ -924,7 +935,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) ...@@ -924,7 +935,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
q->low_rate_threshold) || q->low_rate_threshold) ||
nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack))
goto nla_put_failure; goto nla_put_failure;
return nla_nest_end(skb, opts); return nla_nest_end(skb, opts);
...@@ -947,7 +959,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) ...@@ -947,7 +959,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.flows_plimit = q->stat_flows_plimit; st.flows_plimit = q->stat_flows_plimit;
st.pkts_too_long = q->stat_pkts_too_long; st.pkts_too_long = q->stat_pkts_too_long;
st.allocation_errors = q->stat_allocation_errors; st.allocation_errors = q->stat_allocation_errors;
st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns(); st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
ktime_get_ns();
st.flows = q->flows; st.flows = q->flows;
st.inactive_flows = q->inactive_flows; st.inactive_flows = q->inactive_flows;
st.throttled_flows = q->throttled_flows; st.throttled_flows = q->throttled_flows;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment