Commit c9d0c6eb authored by David S. Miller

Merge branch 'pie-next'

Leslie Monis says:

====================
net: sched: pie: align PIE implementation with RFC 8033

The current implementation of the PIE queuing discipline is according to the
IETF draft [http://tools.ietf.org/html/draft-pan-aqm-pie-00] and the paper
[PIE: A Lightweight Control Scheme to Address the Bufferbloat Problem].
However, a lot of necessary modifications and enhancements have been proposed
in RFC 8033, which have not yet been incorporated in the source code of Linux.
This patch series helps in achieving the same.

Performance tests were carried out using Flent [https://flent.org/].

Changes from v2 to v3:
  - Used div_u64() instead of direct division after explicit type casting as
    recommended by David

Changes from v1 to v2:
  - Excluded the patch setting PIE dynamically active/inactive as the test
    results were unsatisfactory
  - Fixed a scaling issue when adding more auto-tuning cases which caused
    local variables to underflow
  - Changed the long if/else chain to a loop as suggested by Stephen
  - Changed the position of the accu_prob variable in the pie_vars
    structure as recommended by Stephen
====================
Acked-by: Dave Taht <dave.taht@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 78844068 c9d2ac5e
...@@ -954,7 +954,7 @@ enum { ...@@ -954,7 +954,7 @@ enum {
#define TCA_PIE_MAX (__TCA_PIE_MAX - 1) #define TCA_PIE_MAX (__TCA_PIE_MAX - 1)
struct tc_pie_xstats { struct tc_pie_xstats {
__u32 prob; /* current probability */ __u64 prob; /* current probability */
__u32 delay; /* current delay in ms */ __u32 delay; /* current delay in ms */
__u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */
__u32 packets_in; /* total number of packets enqueued */ __u32 packets_in; /* total number of packets enqueued */
......
...@@ -17,9 +17,7 @@ ...@@ -17,9 +17,7 @@
* University of Oslo, Norway. * University of Oslo, Norway.
* *
* References: * References:
* IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 * RFC 8033: https://tools.ietf.org/html/rfc8033
* IEEE Conference on High Performance Switching and Routing 2013 :
* "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem"
*/ */
#include <linux/module.h> #include <linux/module.h>
...@@ -31,9 +29,9 @@ ...@@ -31,9 +29,9 @@
#include <net/pkt_sched.h> #include <net/pkt_sched.h>
#include <net/inet_ecn.h> #include <net/inet_ecn.h>
#define QUEUE_THRESHOLD 10000 #define QUEUE_THRESHOLD 16384
#define DQCOUNT_INVALID -1 #define DQCOUNT_INVALID -1
#define MAX_PROB 0xffffffff #define MAX_PROB 0xffffffffffffffff
#define PIE_SCALE 8 #define PIE_SCALE 8
/* parameters used */ /* parameters used */
...@@ -49,14 +47,16 @@ struct pie_params { ...@@ -49,14 +47,16 @@ struct pie_params {
/* variables used */ /* variables used */
struct pie_vars { struct pie_vars {
u32 prob; /* probability but scaled by u32 limit. */ u64 prob; /* probability but scaled by u64 limit. */
psched_time_t burst_time; psched_time_t burst_time;
psched_time_t qdelay; psched_time_t qdelay;
psched_time_t qdelay_old; psched_time_t qdelay_old;
u64 dq_count; /* measured in bytes */ u64 dq_count; /* measured in bytes */
psched_time_t dq_tstamp; /* drain rate */ psched_time_t dq_tstamp; /* drain rate */
u64 accu_prob; /* accumulated drop probability */
u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
u32 qlen_old; /* in bytes */ u32 qlen_old; /* in bytes */
u8 accu_prob_overflows; /* overflows of accu_prob */
}; };
/* statistics gathering */ /* statistics gathering */
...@@ -81,9 +81,9 @@ static void pie_params_init(struct pie_params *params) ...@@ -81,9 +81,9 @@ static void pie_params_init(struct pie_params *params)
{ {
params->alpha = 2; params->alpha = 2;
params->beta = 20; params->beta = 20;
params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */ params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */
params->limit = 1000; /* default of 1000 packets */ params->limit = 1000; /* default of 1000 packets */
params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */ params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */
params->ecn = false; params->ecn = false;
params->bytemode = false; params->bytemode = false;
} }
...@@ -91,16 +91,18 @@ static void pie_params_init(struct pie_params *params) ...@@ -91,16 +91,18 @@ static void pie_params_init(struct pie_params *params)
static void pie_vars_init(struct pie_vars *vars) static void pie_vars_init(struct pie_vars *vars)
{ {
vars->dq_count = DQCOUNT_INVALID; vars->dq_count = DQCOUNT_INVALID;
vars->accu_prob = 0;
vars->avg_dq_rate = 0; vars->avg_dq_rate = 0;
/* default of 100 ms in pschedtime */ /* default of 150 ms in pschedtime */
vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC);
vars->accu_prob_overflows = 0;
} }
static bool drop_early(struct Qdisc *sch, u32 packet_size) static bool drop_early(struct Qdisc *sch, u32 packet_size)
{ {
struct pie_sched_data *q = qdisc_priv(sch); struct pie_sched_data *q = qdisc_priv(sch);
u32 rnd; u64 rnd;
u32 local_prob = q->vars.prob; u64 local_prob = q->vars.prob;
u32 mtu = psched_mtu(qdisc_dev(sch)); u32 mtu = psched_mtu(qdisc_dev(sch));
/* If there is still burst allowance left skip random early drop */ /* If there is still burst allowance left skip random early drop */
...@@ -124,14 +126,34 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size) ...@@ -124,14 +126,34 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size)
* probability. Smaller packets will have lower drop prob in this case * probability. Smaller packets will have lower drop prob in this case
*/ */
if (q->params.bytemode && packet_size <= mtu) if (q->params.bytemode && packet_size <= mtu)
local_prob = (local_prob / mtu) * packet_size; local_prob = (u64)packet_size * div_u64(local_prob, mtu);
else else
local_prob = q->vars.prob; local_prob = q->vars.prob;
rnd = prandom_u32(); if (local_prob == 0) {
if (rnd < local_prob) q->vars.accu_prob = 0;
q->vars.accu_prob_overflows = 0;
}
if (local_prob > MAX_PROB - q->vars.accu_prob)
q->vars.accu_prob_overflows++;
q->vars.accu_prob += local_prob;
if (q->vars.accu_prob_overflows == 0 &&
q->vars.accu_prob < (MAX_PROB / 100) * 85)
return false;
if (q->vars.accu_prob_overflows == 8 &&
q->vars.accu_prob >= MAX_PROB / 2)
return true; return true;
prandom_bytes(&rnd, 8);
if (rnd < local_prob) {
q->vars.accu_prob = 0;
q->vars.accu_prob_overflows = 0;
return true;
}
return false; return false;
} }
...@@ -168,6 +190,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, ...@@ -168,6 +190,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
out: out:
q->stats.dropped++; q->stats.dropped++;
q->vars.accu_prob = 0;
q->vars.accu_prob_overflows = 0;
return qdisc_drop(skb, sch, to_free); return qdisc_drop(skb, sch, to_free);
} }
...@@ -317,9 +341,10 @@ static void calculate_probability(struct Qdisc *sch) ...@@ -317,9 +341,10 @@ static void calculate_probability(struct Qdisc *sch)
u32 qlen = sch->qstats.backlog; /* queue size in bytes */ u32 qlen = sch->qstats.backlog; /* queue size in bytes */
psched_time_t qdelay = 0; /* in pschedtime */ psched_time_t qdelay = 0; /* in pschedtime */
psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
s32 delta = 0; /* determines the change in probability */ s64 delta = 0; /* determines the change in probability */
u32 oldprob; u64 oldprob;
u32 alpha, beta; u64 alpha, beta;
u32 power;
bool update_prob = true; bool update_prob = true;
q->vars.qdelay_old = q->vars.qdelay; q->vars.qdelay_old = q->vars.qdelay;
...@@ -339,38 +364,36 @@ static void calculate_probability(struct Qdisc *sch) ...@@ -339,38 +364,36 @@ static void calculate_probability(struct Qdisc *sch)
* value for alpha as 0.125. In this implementation, we use values 0-32 * value for alpha as 0.125. In this implementation, we use values 0-32
* passed from user space to represent this. Also, alpha and beta have * passed from user space to represent this. Also, alpha and beta have
* unit of HZ and need to be scaled before they can used to update * unit of HZ and need to be scaled before they can used to update
* probability. alpha/beta are updated locally below by 1) scaling them * probability. alpha/beta are updated locally below by scaling down
* appropriately 2) scaling down by 16 to come to 0-2 range. * by 16 to come to 0-2 range.
* Please see paper for details.
*
* We scale alpha and beta differently depending on whether we are in
* light, medium or high dropping mode.
*/ */
if (q->vars.prob < MAX_PROB / 100) { alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
alpha = beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
beta = /* We scale alpha and beta differently depending on how heavy the
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; * congestion is. Please see RFC 8033 for details.
} else if (q->vars.prob < MAX_PROB / 10) { */
alpha = if (q->vars.prob < MAX_PROB / 10) {
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; alpha >>= 1;
beta = beta >>= 1;
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
} else { power = 100;
alpha = while (q->vars.prob < div_u64(MAX_PROB, power) &&
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; power <= 1000000) {
beta = alpha >>= 2;
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; beta >>= 2;
power *= 10;
}
} }
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */ /* alpha and beta should be between 0 and 32, in multiples of 1/16 */
delta += alpha * ((qdelay - q->params.target)); delta += alpha * (u64)(qdelay - q->params.target);
delta += beta * ((qdelay - qdelay_old)); delta += beta * (u64)(qdelay - qdelay_old);
oldprob = q->vars.prob; oldprob = q->vars.prob;
/* to ensure we increase probability in steps of no more than 2% */ /* to ensure we increase probability in steps of no more than 2% */
if (delta > (s32)(MAX_PROB / (100 / 2)) && if (delta > (s64)(MAX_PROB / (100 / 2)) &&
q->vars.prob >= MAX_PROB / 10) q->vars.prob >= MAX_PROB / 10)
delta = (MAX_PROB / 100) * 2; delta = (MAX_PROB / 100) * 2;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment