Commit 12472b41 authored by Linus Torvalds

Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6

* master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6:
  [SUNHME]: Fix for sunhme failures on x86
  [XFRM] xfrm_user: Better validation of user templates.
  [DCCP] tfrc: Binary search for reverse TFRC lookup
  [DCCP] ccid3: Deprecate TFRC_SMALLEST_P
  [DCCP] tfrc: Identify TFRC table limits and simplify code
  [DCCP] tfrc: Add protection against invalid parameters to TFRC routines
  [DCCP] tfrc: Fix small error in reverse lookup of p for given f(p)
  [DCCP] tfrc: Document boundaries and limits of the TFRC lookup table
  [DCCP] ccid3: Fix warning message about illegal ACK
  [DCCP] ccid3: Fix bug in calculation of send rate
  [DCCP]: Fix BUG in retransmission delay calculation
  [DCCP]: Use higher RTO default for CCID3
parents a79f43a5 ef9467f8
...@@ -3012,6 +3012,11 @@ static int __devinit happy_meal_pci_probe(struct pci_dev *pdev, ...@@ -3012,6 +3012,11 @@ static int __devinit happy_meal_pci_probe(struct pci_dev *pdev,
#endif #endif
err = -ENODEV; err = -ENODEV;
if (pci_enable_device(pdev))
goto err_out;
pci_set_master(pdev);
if (!strcmp(prom_name, "SUNW,qfe") || !strcmp(prom_name, "qfe")) { if (!strcmp(prom_name, "SUNW,qfe") || !strcmp(prom_name, "qfe")) {
qp = quattro_pci_find(pdev); qp = quattro_pci_find(pdev);
if (qp == NULL) if (qp == NULL)
......
...@@ -89,4 +89,37 @@ config IP_DCCP_CCID3_DEBUG ...@@ -89,4 +89,37 @@ config IP_DCCP_CCID3_DEBUG
parameter to 0 or 1. parameter to 0 or 1.
If in doubt, say N. If in doubt, say N.
config IP_DCCP_CCID3_RTO
int "Use higher bound for nofeedback timer"
default 100
depends on IP_DCCP_CCID3 && EXPERIMENTAL
---help---
Use a higher lower bound for the nofeedback timer expiration.
The TFRC nofeedback timer normally expires after the maximum of 4
RTTs and twice the current send interval (RFC 3448, 4.3). On LANs
with a small RTT this can mean a high processing load and reduced
performance, since then the nofeedback timer is triggered very
frequently.
This option allows a higher lower bound to be set for the nofeedback
timer value. Values are specified in units of milliseconds.
A value of 0 disables this feature by enforcing the value specified
in RFC 3448. The following values have been suggested as bounds for
experimental use:
* 16-20ms to match the typical multimedia inter-frame interval
* 100ms as a reasonable compromise [default]
* 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4)
The default of 100ms is a compromise between a large value for
efficient DCCP implementations, and a small value to avoid disrupting
the network in times of congestion.
The purpose of the nofeedback timer is to slow DCCP down when there
is serious network congestion: experimenting with larger values should
therefore not be performed on WANs.
endmenu endmenu
...@@ -121,12 +121,15 @@ static inline void ccid3_update_send_time(struct ccid3_hc_tx_sock *hctx) ...@@ -121,12 +121,15 @@ static inline void ccid3_update_send_time(struct ccid3_hc_tx_sock *hctx)
/* /*
* Update X by * Update X by
* If (p > 0) * If (p > 0)
* x_calc = calcX(s, R, p); * X_calc = calcX(s, R, p);
* X = max(min(X_calc, 2 * X_recv), s / t_mbi); * X = max(min(X_calc, 2 * X_recv), s / t_mbi);
* Else * Else
* If (now - tld >= R) * If (now - tld >= R)
* X = max(min(2 * X, 2 * X_recv), s / R); * X = max(min(2 * X, 2 * X_recv), s / R);
* tld = now; * tld = now;
*
* If X has changed, we also update the scheduled send time t_now,
* the inter-packet interval t_ipi, and the delta value.
*/ */
static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now)
...@@ -134,8 +137,7 @@ static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) ...@@ -134,8 +137,7 @@ static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now)
struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
const __u32 old_x = hctx->ccid3hctx_x; const __u32 old_x = hctx->ccid3hctx_x;
/* To avoid large error in calcX */ if (hctx->ccid3hctx_p > 0) {
if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s, hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s,
hctx->ccid3hctx_rtt, hctx->ccid3hctx_rtt,
hctx->ccid3hctx_p); hctx->ccid3hctx_p);
...@@ -223,16 +225,14 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) ...@@ -223,16 +225,14 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
ccid3_tx_state_name(hctx->ccid3hctx_state)); ccid3_tx_state_name(hctx->ccid3hctx_state));
/* Halve sending rate */ /* Halve sending rate */
/* If (X_calc > 2 * X_recv) /* If (p == 0 || X_calc > 2 * X_recv)
* X_recv = max(X_recv / 2, s / (2 * t_mbi)); * X_recv = max(X_recv / 2, s / (2 * t_mbi));
* Else * Else
* X_recv = X_calc / 4; * X_recv = X_calc / 4;
*/ */
BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P && BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
hctx->ccid3hctx_x_calc == 0);
/* check also if p is zero -> x_calc is infinity? */ if (hctx->ccid3hctx_p == 0 ||
if (hctx->ccid3hctx_p < TFRC_SMALLEST_P ||
hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv) hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv)
hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2, hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2,
hctx->ccid3hctx_s / (2 * TFRC_T_MBI)); hctx->ccid3hctx_s / (2 * TFRC_T_MBI));
...@@ -245,9 +245,10 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) ...@@ -245,9 +245,10 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
} }
/* /*
* Schedule no feedback timer to expire in * Schedule no feedback timer to expire in
* max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi) * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
* See comments in packet_recv() regarding the value of t_RTO.
*/ */
t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi); t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
break; break;
case TFRC_SSTATE_NO_SENT: case TFRC_SSTATE_NO_SENT:
DCCP_BUG("Illegal %s state NO_SENT, sk=%p", dccp_role(sk), sk); DCCP_BUG("Illegal %s state NO_SENT, sk=%p", dccp_role(sk), sk);
...@@ -338,7 +339,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) ...@@ -338,7 +339,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
* else * else
* // send the packet in (t_nom - t_now) milliseconds. * // send the packet in (t_nom - t_now) milliseconds.
*/ */
if (delay >= hctx->ccid3hctx_delta) if (delay - (long)hctx->ccid3hctx_delta >= 0)
return delay / 1000L; return delay / 1000L;
break; break;
case TFRC_SSTATE_TERM: case TFRC_SSTATE_TERM:
...@@ -412,10 +413,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) ...@@ -412,10 +413,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
struct dccp_tx_hist_entry *packet; struct dccp_tx_hist_entry *packet;
struct timeval now; struct timeval now;
unsigned long t_nfb; unsigned long t_nfb;
u32 t_elapsed;
u32 pinv; u32 pinv;
u32 x_recv; long r_sample, t_elapsed;
u32 r_sample;
BUG_ON(hctx == NULL); BUG_ON(hctx == NULL);
...@@ -426,31 +425,44 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) ...@@ -426,31 +425,44 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
opt_recv = &hctx->ccid3hctx_options_received; opt_recv = &hctx->ccid3hctx_options_received;
t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
x_recv = opt_recv->ccid3or_receive_rate;
pinv = opt_recv->ccid3or_loss_event_rate;
switch (hctx->ccid3hctx_state) { switch (hctx->ccid3hctx_state) {
case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_NO_FBACK:
case TFRC_SSTATE_FBACK: case TFRC_SSTATE_FBACK:
/* Calculate new round trip sample by /* get packet from history to look up t_recvdata */
* R_sample = (now - t_recvdata) - t_delay */
/* get t_recvdata from history */
packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist,
DCCP_SKB_CB(skb)->dccpd_ack_seq); DCCP_SKB_CB(skb)->dccpd_ack_seq);
if (unlikely(packet == NULL)) { if (unlikely(packet == NULL)) {
DCCP_WARN("%s, sk=%p, seqno %llu(%s) does't exist " DCCP_WARN("%s(%p), seqno %llu(%s) doesn't exist "
"in history!\n", dccp_role(sk), sk, "in history!\n", dccp_role(sk), sk,
(unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
return; return;
} }
/* Update RTT */ /* Update receive rate */
hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
/* Update loss event rate */
pinv = opt_recv->ccid3or_loss_event_rate;
if (pinv == ~0U || pinv == 0)
hctx->ccid3hctx_p = 0;
else
hctx->ccid3hctx_p = 1000000 / pinv;
dccp_timestamp(sk, &now); dccp_timestamp(sk, &now);
r_sample = timeval_delta(&now, &packet->dccphtx_tstamp);
if (unlikely(r_sample <= t_elapsed)) /*
DCCP_WARN("r_sample=%uus,t_elapsed=%uus\n", * Calculate new round trip sample as per [RFC 3448, 4.3] by
* R_sample = (now - t_recvdata) - t_elapsed
*/
r_sample = timeval_delta(&now, &packet->dccphtx_tstamp);
t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
if (unlikely(r_sample <= 0)) {
DCCP_WARN("WARNING: R_sample (%ld) <= 0!\n", r_sample);
r_sample = 0;
} else if (unlikely(r_sample <= t_elapsed))
DCCP_WARN("WARNING: r_sample=%ldus <= t_elapsed=%ldus\n",
r_sample, t_elapsed); r_sample, t_elapsed);
else else
r_sample -= t_elapsed; r_sample -= t_elapsed;
...@@ -473,31 +485,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) ...@@ -473,31 +485,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
hctx->ccid3hctx_t_ld = now; hctx->ccid3hctx_t_ld = now;
ccid3_update_send_time(hctx); ccid3_update_send_time(hctx);
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
} else {
hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 +
r_sample / 10;
ccid3_hc_tx_update_x(sk, &now);
}
ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, " ccid3_pr_debug("%s(%p), s=%u, w_init=%u, "
"r_sample=%us\n", dccp_role(sk), sk, "R_sample=%ldus, X=%u\n", dccp_role(sk),
hctx->ccid3hctx_rtt, r_sample); sk, hctx->ccid3hctx_s, w_init, r_sample,
hctx->ccid3hctx_x);
/* Update receive rate */ ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */ } else {
hctx->ccid3hctx_rtt = (9 * hctx->ccid3hctx_rtt +
(u32)r_sample ) / 10;
/* Update loss event rate */ ccid3_hc_tx_update_x(sk, &now);
if (pinv == ~0 || pinv == 0)
hctx->ccid3hctx_p = 0;
else {
hctx->ccid3hctx_p = 1000000 / pinv;
if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { ccid3_pr_debug("%s(%p), RTT=%uus (sample=%ldus), s=%u, "
hctx->ccid3hctx_p = TFRC_SMALLEST_P; "p=%u, X_calc=%u, X=%u\n", dccp_role(sk),
ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", sk, hctx->ccid3hctx_rtt, r_sample,
dccp_role(sk), sk); hctx->ccid3hctx_s, hctx->ccid3hctx_p,
} hctx->ccid3hctx_x_calc,
hctx->ccid3hctx_x);
} }
/* unschedule no feedback timer */ /* unschedule no feedback timer */
...@@ -512,16 +518,20 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) ...@@ -512,16 +518,20 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
*/ */
sk->sk_write_space(sk); sk->sk_write_space(sk);
/* Update timeout interval. We use the alternative variant of /*
* [RFC 3448, 3.1] which sets the upper bound of t_rto to one * Update timeout interval for the nofeedback timer.
* second, as it is suggested for TCP (see RFC 2988, 2.4). */ * We use a configuration option to increase the lower bound.
* This can help avoid triggering the nofeedback timer too often
* ('spinning') on LANs with small RTTs.
*/
hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
USEC_PER_SEC ); CONFIG_IP_DCCP_CCID3_RTO *
(USEC_PER_SEC/1000) );
/* /*
* Schedule no feedback timer to expire in * Schedule no feedback timer to expire in
* max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi) * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
*/ */
t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi); t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to " ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to "
"expire in %lu jiffies (%luus)\n", "expire in %lu jiffies (%luus)\n",
...@@ -535,7 +545,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) ...@@ -535,7 +545,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
hctx->ccid3hctx_idle = 1; hctx->ccid3hctx_idle = 1;
break; break;
case TFRC_SSTATE_NO_SENT: case TFRC_SSTATE_NO_SENT:
DCCP_WARN("Illegal ACK received - no packet has been sent\n"); if (dccp_sk(sk)->dccps_role == DCCP_ROLE_CLIENT)
DCCP_WARN("Illegal ACK received - no packet sent\n");
/* fall through */ /* fall through */
case TFRC_SSTATE_TERM: /* ignore feedback when closing */ case TFRC_SSTATE_TERM: /* ignore feedback when closing */
break; break;
......
...@@ -51,8 +51,6 @@ ...@@ -51,8 +51,6 @@
/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
#define TFRC_T_MBI 64 #define TFRC_T_MBI 64
#define TFRC_SMALLEST_P 40
enum ccid3_options { enum ccid3_options {
TFRC_OPT_LOSS_EVENT_RATE = 192, TFRC_OPT_LOSS_EVENT_RATE = 192,
TFRC_OPT_LOSS_INTERVALS = 193, TFRC_OPT_LOSS_INTERVALS = 193,
......
...@@ -18,10 +18,79 @@ ...@@ -18,10 +18,79 @@
#include "tfrc.h" #include "tfrc.h"
#define TFRC_CALC_X_ARRSIZE 500 #define TFRC_CALC_X_ARRSIZE 500
#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */
#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE)
#define TFRC_CALC_X_SPLIT 50000 /*
/* equivalent to 0.05 */ TFRC TCP Reno Throughput Equation Lookup Table for f(p)
The following two-column lookup table implements a part of the TCP throughput
equation from [RFC 3448, sec. 3.1]:
s
X_calc = --------------------------------------------------------------
R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3))
Where:
X is the transmit rate in bytes/second
s is the packet size in bytes
R is the round trip time in seconds
p is the loss event rate, between 0 and 1.0, of the number of loss
events as a fraction of the number of packets transmitted
t_RTO is the TCP retransmission timeout value in seconds
b is the number of packets acknowledged by a single TCP ACK
We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes:
s
X_calc = -------------------------------------------------------
R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3))
which we can break down into:
s
X_calc = ---------
R * f(p)
where f(p) is given for 0 < p <= 1 by:
f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3)
Since this is kernel code, floating-point arithmetic is avoided in favour of
integer arithmetic. This means that nearly all fractional parameters are
scaled by 1000000:
* the parameters p and R
* the return result f(p)
The lookup table therefore actually tabulates the following function g(q):
g(q) = 1000000 * f(q/1000000)
Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer
granularity for the practically more relevant case of small values of p (up to
5%), the second column is used; the first one ranges up to 100%. This split
corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also
determines the smallest resolution possible with this lookup table:
TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE
The entire table is generated by:
for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE);
lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
}
With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%)
lookup[M][0] = g(1000000) = 1000000 * f(100%)
lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%)
lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%)
In summary, the two columns represent f(p) for the following ranges:
* The first column is for 0.002 <= p <= 1.0
* The second column is for 0.0001 <= p <= 0.05
Where the columns overlap, the second (finer-grained) is given preference,
i.e. the first column is used only for p >= 0.05.
*/
static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
{ 37172, 8172 }, { 37172, 8172 },
{ 53499, 11567 }, { 53499, 11567 },
...@@ -525,85 +594,69 @@ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { ...@@ -525,85 +594,69 @@ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
{ 243315981, 271305 } { 243315981, 271305 }
}; };
/* Calculate the send rate as per section 3.1 of RFC3448 /* return smallest index i such that fval <= lookup[i][small] */
static inline u32 tfrc_binsearch(u32 fval, u8 small)
Returns send rate in bytes per second {
u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1;
Integer maths and lookups are used as not allowed floating point in kernel
while (low < high) {
The function for Xcalc as per section 3.1 of RFC3448 is: try = (low + high) / 2;
if (fval <= tfrc_calc_x_lookup[try][small])
X = s high = try;
------------------------------------------------------------- else
R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2))) low = try + 1;
}
where return high;
X is the trasmit rate in bytes/second }
s is the packet size in bytes
R is the round trip time in seconds
p is the loss event rate, between 0 and 1.0, of the number of loss events
as a fraction of the number of packets transmitted
t_RTO is the TCP retransmission timeout value in seconds
b is the number of packets acknowledged by a single TCP acknowledgement
we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes:
X = s
-----------------------------------------------------------------------
R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2)))
which we can break down into:
X = s
--------
R * f(p)
where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p))
Function parameters:
s - bytes
R - RTT in usecs
p - loss rate (decimal fraction multiplied by 1,000,000)
Returns Xcalc in bytes per second
DON'T alter this code unless you run test cases against it as the code
has been manipulated to stop underflow/overlow.
*/ /**
* tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
*
* @s: packet size in bytes
* @R: RTT scaled by 1000000 (i.e., microseconds)
* @p: loss ratio estimate scaled by 1000000
* Returns X_calc in bytes per second (not scaled).
*
* Note: DO NOT alter this code unless you run test cases against it,
* as the code has been optimized to stop underflow/overflow.
*/
u32 tfrc_calc_x(u16 s, u32 R, u32 p) u32 tfrc_calc_x(u16 s, u32 R, u32 p)
{ {
int index; int index;
u32 f; u32 f;
u64 tmp1, tmp2; u64 tmp1, tmp2;
if (p < TFRC_CALC_X_SPLIT) /* check against invalid parameters and divide-by-zero */
index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1; BUG_ON(p > 1000000); /* p must not exceed 100% */
else BUG_ON(p == 0); /* f(0) = 0, divide by zero */
index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1; if (R == 0) { /* possible divide by zero */
DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc.");
return ~0U;
}
if (index < 0) if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
/* p should be 0 unless there is a bug in my code */ if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
index = 0; DCCP_WARN("Value of p (%d) below resolution. "
"Substituting %d\n", p, TFRC_SMALLEST_P);
index = 0;
} else /* 0.0001 <= p <= 0.05 */
index = p/TFRC_SMALLEST_P - 1;
if (R == 0) { f = tfrc_calc_x_lookup[index][1];
DCCP_WARN("RTT==0, setting to 1\n");
R = 1; /* RTT can't be zero or else divide by zero */
}
BUG_ON(index >= TFRC_CALC_X_ARRSIZE); } else { /* 0.05 < p <= 1.00 */
index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1;
if (p >= TFRC_CALC_X_SPLIT)
f = tfrc_calc_x_lookup[index][0]; f = tfrc_calc_x_lookup[index][0];
else }
f = tfrc_calc_x_lookup[index][1];
/* The following computes X = s/(R*f(p)) in bytes per second. Since f(p)
* and R are both scaled by 1000000, we need to multiply by 1000000^2.
* ==> DO NOT alter this unless you test against overflow on 32 bit */
tmp1 = ((u64)s * 100000000); tmp1 = ((u64)s * 100000000);
tmp2 = ((u64)R * (u64)f); tmp2 = ((u64)R * (u64)f);
do_div(tmp2, 10000); do_div(tmp2, 10000);
do_div(tmp1, tmp2); do_div(tmp1, tmp2);
/* Don't alter above math unless you test due to overflow on 32 bit */
return (u32)tmp1; return (u32)tmp1;
} }
...@@ -611,33 +664,36 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) ...@@ -611,33 +664,36 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
EXPORT_SYMBOL_GPL(tfrc_calc_x); EXPORT_SYMBOL_GPL(tfrc_calc_x);
/* /*
* args: fvalue - function value to match * tfrc_calc_x_reverse_lookup - try to find p given f(p)
* returns: p closest to that value
* *
* both fvalue and p are multiplied by 1,000,000 to use ints * @fvalue: function value to match, scaled by 1000000
* Returns closest match for p, also scaled by 1000000
*/ */
u32 tfrc_calc_x_reverse_lookup(u32 fvalue) u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
{ {
int ctr = 0; int index;
int small;
if (fvalue < tfrc_calc_x_lookup[0][1]) if (fvalue == 0) /* f(p) = 0 whenever p = 0 */
return 0; return 0;
if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) /* Error cases. */
small = 1; if (fvalue < tfrc_calc_x_lookup[0][1]) {
else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) DCCP_WARN("fvalue %d smaller than resolution\n", fvalue);
return tfrc_calc_x_lookup[0][1];
}
if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
DCCP_WARN("fvalue %d exceeds bounds!\n", fvalue);
return 1000000; return 1000000;
else }
small = 0;
while (fvalue > tfrc_calc_x_lookup[ctr][small])
ctr++;
if (small) if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) {
return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE; index = tfrc_binsearch(fvalue, 1);
else return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE;
return 1000000 * ctr / TFRC_CALC_X_ARRSIZE; }
/* else ... it must be in the coarse-grained column */
index = tfrc_binsearch(fvalue, 0);
return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
} }
EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
...@@ -858,7 +858,6 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, ...@@ -858,7 +858,6 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
int i; int i;
xp->xfrm_nr = nr; xp->xfrm_nr = nr;
xp->family = ut->family;
for (i = 0; i < nr; i++, ut++) { for (i = 0; i < nr; i++, ut++) {
struct xfrm_tmpl *t = &xp->xfrm_vec[i]; struct xfrm_tmpl *t = &xp->xfrm_vec[i];
...@@ -876,19 +875,53 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, ...@@ -876,19 +875,53 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
} }
} }
/*
 * validate_tmpl - sanity-check an array of user-supplied templates
 *
 * @nr:     number of entries in @ut
 * @ut:     template array; entries whose family is zero are updated in place
 * @family: family to substitute for templates that left theirs at zero
 *
 * Returns 0 when every template is acceptable.  Returns -EINVAL when @nr
 * exceeds XFRM_MAX_DEPTH or when a template names a family other than
 * AF_INET (or AF_INET6, when IPv6 support is configured).
 */
static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
{
	int idx;

	if (nr > XFRM_MAX_DEPTH)
		return -EINVAL;

	for (idx = 0; idx < nr; idx++) {
		struct xfrm_user_tmpl *tmpl = &ut[idx];

		/*
		 * Historically the template family was never checked and
		 * was ignored, since every template could be assumed to
		 * share the family of the policy itself; many applications
		 * therefore leave it at zero.  With ipv4-in-ipv6 and
		 * ipv6-in-ipv4 tunnels that assumption no longer holds, so
		 * an unset family inherits the caller-supplied one and the
		 * result is then checked.
		 */
		if (tmpl->family == 0)
			tmpl->family = family;

		if (tmpl->family == AF_INET)
			continue;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		if (tmpl->family == AF_INET6)
			continue;
#endif
		/* Any other address family is rejected. */
		return -EINVAL;
	}

	return 0;
}
static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma) static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma)
{ {
struct rtattr *rt = xfrma[XFRMA_TMPL-1]; struct rtattr *rt = xfrma[XFRMA_TMPL-1];
struct xfrm_user_tmpl *utmpl;
int nr;
if (!rt) { if (!rt) {
pol->xfrm_nr = 0; pol->xfrm_nr = 0;
} else { } else {
nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl); struct xfrm_user_tmpl *utmpl = RTA_DATA(rt);
int nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl);
int err;
if (nr > XFRM_MAX_DEPTH) err = validate_tmpl(nr, utmpl, pol->family);
return -EINVAL; if (err)
return err;
copy_templates(pol, RTA_DATA(rt), nr); copy_templates(pol, RTA_DATA(rt), nr);
} }
...@@ -1530,7 +1563,8 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, void **xf ...@@ -1530,7 +1563,8 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, void **xf
} }
/* build an XP */ /* build an XP */
xp = xfrm_policy_construct(&ua->policy, (struct rtattr **) xfrma, &err); if (!xp) { xp = xfrm_policy_construct(&ua->policy, (struct rtattr **) xfrma, &err);
if (!xp) {
kfree(x); kfree(x);
return err; return err;
} }
...@@ -1979,7 +2013,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt, ...@@ -1979,7 +2013,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
return NULL; return NULL;
nr = ((len - sizeof(*p)) / sizeof(*ut)); nr = ((len - sizeof(*p)) / sizeof(*ut));
if (nr > XFRM_MAX_DEPTH) if (validate_tmpl(nr, ut, p->sel.family))
return NULL; return NULL;
if (p->dir > XFRM_POLICY_OUT) if (p->dir > XFRM_POLICY_OUT)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment