Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
linux
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
linux
Commits
79ffeeb9
Commit
79ffeeb9
authored
Nov 10, 2005
by
Linus Torvalds
Browse files
Options
Browse Files
Download
Plain Diff
Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
parents
a5aac37f
6a438bbe
Changes
19
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
413 additions
and
199 deletions
+413
-199
Documentation/networking/ip-sysctl.txt
Documentation/networking/ip-sysctl.txt
+5
-0
include/linux/sysctl.h
include/linux/sysctl.h
+1
-0
include/linux/tcp.h
include/linux/tcp.h
+16
-0
include/net/sock.h
include/net/sock.h
+6
-0
include/net/tcp.h
include/net/tcp.h
+65
-6
net/ipv4/sysctl_net_ipv4.c
net/ipv4/sysctl_net_ipv4.c
+8
-0
net/ipv4/tcp.c
net/ipv4/tcp.c
+2
-1
net/ipv4/tcp_bic.c
net/ipv4/tcp_bic.c
+5
-7
net/ipv4/tcp_cong.c
net/ipv4/tcp_cong.c
+24
-16
net/ipv4/tcp_highspeed.c
net/ipv4/tcp_highspeed.c
+5
-6
net/ipv4/tcp_htcp.c
net/ipv4/tcp_htcp.c
+6
-7
net/ipv4/tcp_hybla.c
net/ipv4/tcp_hybla.c
+3
-3
net/ipv4/tcp_input.c
net/ipv4/tcp_input.c
+194
-94
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_ipv4.c
+2
-2
net/ipv4/tcp_minisocks.c
net/ipv4/tcp_minisocks.c
+4
-3
net/ipv4/tcp_output.c
net/ipv4/tcp_output.c
+47
-14
net/ipv4/tcp_scalable.c
net/ipv4/tcp_scalable.c
+7
-7
net/ipv4/tcp_timer.c
net/ipv4/tcp_timer.c
+2
-2
net/ipv4/tcp_vegas.c
net/ipv4/tcp_vegas.c
+11
-31
No files found.
Documentation/networking/ip-sysctl.txt
View file @
79ffeeb9
...
...
@@ -78,6 +78,11 @@ inet_peer_gc_maxtime - INTEGER
TCP variables:
tcp_abc - INTEGER
Controls Appropriate Byte Counting (ABC) as defined in RFC 3465. If set
to 0, ABC is disabled and congestion avoidance is done once per ACK. 1 is
the conservative value, and 2 is more aggressive.
tcp_syn_retries - INTEGER
Number of times initial SYNs for an active TCP connection attempt
will be retransmitted. Should not be higher than 255. Default value
...
...
include/linux/sysctl.h
View file @
79ffeeb9
...
...
@@ -390,6 +390,7 @@ enum
NET_TCP_BIC_BETA
=
108
,
NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR
=
109
,
NET_TCP_CONG_CONTROL
=
110
,
NET_TCP_ABC
=
111
,
};
enum
{
...
...
include/linux/tcp.h
View file @
79ffeeb9
...
...
@@ -307,6 +307,21 @@ struct tcp_sock {
struct
tcp_sack_block
duplicate_sack
[
1
];
/* D-SACK block */
struct
tcp_sack_block
selective_acks
[
4
];
/* The SACKS themselves*/
struct
tcp_sack_block
recv_sack_cache
[
4
];
/* from STCP, retrans queue hinting */
struct
sk_buff
*
lost_skb_hint
;
struct
sk_buff
*
scoreboard_skb_hint
;
struct
sk_buff
*
retransmit_skb_hint
;
struct
sk_buff
*
forward_skb_hint
;
struct
sk_buff
*
fastpath_skb_hint
;
int
fastpath_cnt_hint
;
int
lost_cnt_hint
;
int
retransmit_cnt_hint
;
int
forward_cnt_hint
;
__u16
advmss
;
/* Advertised MSS */
__u16
prior_ssthresh
;
/* ssthresh saved at recovery start */
__u32
lost_out
;
/* Lost packets */
...
...
@@ -326,6 +341,7 @@ struct tcp_sock {
__u32
snd_up
;
/* Urgent pointer */
__u32
total_retrans
;
/* Total retransmits for entire connection */
__u32
bytes_acked
;
/* Appropriate Byte Counting - RFC3465 */
unsigned
int
keepalive_time
;
/* time before keep alive takes place */
unsigned
int
keepalive_intvl
;
/* time interval between keep alive probes */
...
...
include/net/sock.h
View file @
79ffeeb9
...
...
@@ -1247,6 +1247,12 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk)
(skb != (struct sk_buff *)&(sk)->sk_write_queue); \
skb = skb->next)
/* From STCP, for fast SACK processing: continue walking the retransmit
 * queue starting at whatever @skb already points to.  There is no init
 * clause, so the caller must set @skb before using this macro.  The walk
 * stops when @skb reaches (sk)->sk_send_head or the address of
 * (sk)->sk_write_queue itself (cast to an skb pointer).
 */
#define sk_stream_for_retrans_queue_from(skb, sk) \
for (; (skb != (sk)->sk_send_head) && \
(skb != (struct sk_buff *)&(sk)->sk_write_queue); \
skb = skb->next)
/*
* Default write policy as shown to user space via poll/select/SIGIO
*/
...
...
include/net/tcp.h
View file @
79ffeeb9
...
...
@@ -89,10 +89,10 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
*/
#define TCP_SYN_RETRIES 5
/* number of times to retry active opening a
* connection: ~180sec is RFC min
u
mum */
* connection: ~180sec is RFC min
i
mum */
#define TCP_SYNACK_RETRIES 5
/* number of times to retry passive opening a
* connection: ~180sec is RFC min
u
mum */
* connection: ~180sec is RFC min
i
mum */
#define TCP_ORPHAN_RETRIES 7
/* number of times to retry on an orphaned
...
...
@@ -180,7 +180,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF 1
/* Nagle's algo is disabled */
#define TCP_NAGLE_CORK 2
/* Socket is corked */
#define TCP_NAGLE_PUSH 4
/* Cork is overriden for already queued data */
#define TCP_NAGLE_PUSH 4
/* Cork is overrid
d
en for already queued data */
extern
struct
inet_timewait_death_row
tcp_death_row
;
...
...
@@ -218,6 +218,7 @@ extern int sysctl_tcp_low_latency;
extern
int
sysctl_tcp_nometrics_save
;
extern
int
sysctl_tcp_moderate_rcvbuf
;
extern
int
sysctl_tcp_tso_win_divisor
;
extern
int
sysctl_tcp_abc
;
extern
atomic_t
tcp_memory_allocated
;
extern
atomic_t
tcp_sockets_allocated
;
...
...
@@ -551,13 +552,13 @@ extern u32 __tcp_select_window(struct sock *sk);
/* TCP timestamps are only 32-bits, this causes a slight
* complication on 64-bit systems since we store a snapshot
* of jiffies in the buffer control blocks below. We decidely
* of jiffies in the buffer control blocks below. We decide
d
ly
* only use of the low 32-bits of jiffies and hide the ugly
* casts with the following macro.
*/
#define tcp_time_stamp ((__u32)(jiffies))
/* This is what the send packet queu
e
ing engine uses to pass
/* This is what the send packet queuing engine uses to pass
* TCP per-packet control information to the transmission
* code. We also store the host-order sequence numbers in
* here too. This is 36 bytes on 32-bit architectures,
...
...
@@ -597,7 +598,7 @@ struct tcp_skb_cb {
#define TCPCB_EVER_RETRANS 0x80
/* Ever retransmitted frame */
#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
#define TCPCB_URG 0x20
/* Urgent pointer adv
e
nced here */
#define TCPCB_URG 0x20
/* Urgent pointer adv
a
nced here */
#define TCPCB_AT_TAIL (TCPCB_URG)
...
...
@@ -765,6 +766,33 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
(
tp
->
snd_cwnd
>>
2
)));
}
/*
 * Linear increase during slow start.
 *
 * Grows tp->snd_cwnd by one segment, never past tp->snd_cwnd_clamp.
 *
 * When Appropriate Byte Counting is enabled (sysctl_tcp_abc != 0, RFC 3465)
 * nothing happens until tp->bytes_acked has reached one full MSS, and with
 * sysctl_tcp_abc > 1 the window grows by two segments when at least two
 * full segments have been acknowledged.  tp->bytes_acked is reset to zero
 * whenever the window is considered for growth.
 */
static inline void tcp_slow_start(struct tcp_sock *tp)
{
	if (sysctl_tcp_abc) {
		/* RFC3465: Slow Start
		 * TCP sender SHOULD increase cwnd by the number of
		 * previously unacknowledged bytes ACKed by each incoming
		 * acknowledgment, provided the increase is not more than
		 * L (2 * SMSS) bytes.
		 */
		if (tp->bytes_acked < tp->mss_cache)
			return;

		/* We MAY increase by 2 if discovered delayed ack.  An ACK
		 * covering exactly two full segments must qualify, hence
		 * ">=" rather than ">".
		 */
		if (sysctl_tcp_abc > 1 &&
		    tp->bytes_acked >= 2 * tp->mss_cache) {
			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
				tp->snd_cwnd++;
		}
	}

	/* Start counting acknowledged bytes afresh for the next increase. */
	tp->bytes_acked = 0;

	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
		tp->snd_cwnd++;
}
static
inline
void
tcp_sync_left_out
(
struct
tcp_sock
*
tp
)
{
if
(
tp
->
rx_opt
.
sack_ok
&&
...
...
@@ -794,6 +822,7 @@ static inline void tcp_enter_cwr(struct sock *sk)
struct
tcp_sock
*
tp
=
tcp_sk
(
sk
);
tp
->
prior_ssthresh
=
0
;
tp
->
bytes_acked
=
0
;
if
(
inet_csk
(
sk
)
->
icsk_ca_state
<
TCP_CA_CWR
)
{
__tcp_enter_cwr
(
sk
);
tcp_set_ca_state
(
sk
,
TCP_CA_CWR
);
...
...
@@ -810,6 +839,27 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp)
return
3
;
}
/* RFC2861: tell whether the sender is limited by the congestion window
 * (returns non-zero) or by the application (returns 0).
 * This is the inverse of the cwnd check in tcp_tso_should_defer.
 */
static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 room;

	/* The whole window is in use. */
	if (in_flight >= tp->snd_cwnd)
		return 1;

	/* Window not full and the route has no TSO capability. */
	if ((sk->sk_route_caps & NETIF_F_TSO) == 0)
		return 0;

	/* Unused part of the congestion window. */
	room = tp->snd_cwnd - in_flight;

	if (!sysctl_tcp_tso_win_divisor)
		return room <= tcp_max_burst(tp);

	return room * sysctl_tcp_tso_win_divisor < tp->snd_cwnd;
}
static
__inline__
void
tcp_minshall_update
(
struct
tcp_sock
*
tp
,
int
mss
,
const
struct
sk_buff
*
skb
)
{
...
...
@@ -1157,6 +1207,15 @@ static inline void tcp_mib_init(void)
TCP_ADD_STATS_USER
(
TCP_MIB_MAXCONN
,
-
1
);
}
/* From STCP: drop every cached skb pointer used as a retransmit-queue
 * hint.  Only the pointer hints are reset; the *_cnt_hint counters are
 * left as they are.
 */
static inline void clear_all_retrans_hints(struct tcp_sock *tp)
{
	tp->lost_skb_hint =
	tp->scoreboard_skb_hint =
	tp->retransmit_skb_hint =
	tp->forward_skb_hint =
	tp->fastpath_skb_hint = NULL;
}
/* /proc */
enum
tcp_seq_states
{
TCP_SEQ_STATE_LISTENING
,
...
...
net/ipv4/sysctl_net_ipv4.c
View file @
79ffeeb9
...
...
@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = {
.
proc_handler
=
&
proc_tcp_congestion_control
,
.
strategy
=
&
sysctl_tcp_congestion_control
,
},
{
.
ctl_name
=
NET_TCP_ABC
,
.
procname
=
"tcp_abc"
,
.
data
=
&
sysctl_tcp_abc
,
.
maxlen
=
sizeof
(
int
),
.
mode
=
0644
,
.
proc_handler
=
&
proc_dointvec
,
},
{
.
ctl_name
=
0
}
};
...
...
net/ipv4/tcp.c
View file @
79ffeeb9
...
...
@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags)
}
else
if
(
tcp_need_reset
(
old_state
)
||
(
tp
->
snd_nxt
!=
tp
->
write_seq
&&
(
1
<<
old_state
)
&
(
TCPF_CLOSING
|
TCPF_LAST_ACK
)))
{
/* The last check adjusts for discrepanc
e
of Linux wrt. RFC
/* The last check adjusts for discrepanc
y
of Linux wrt. RFC
* states
*/
tcp_send_active_reset
(
sk
,
gfp_any
());
...
...
@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp
->
packets_out
=
0
;
tp
->
snd_ssthresh
=
0x7fffffff
;
tp
->
snd_cwnd_cnt
=
0
;
tp
->
bytes_acked
=
0
;
tcp_set_ca_state
(
sk
,
TCP_CA_Open
);
tcp_clear_retrans
(
tp
);
inet_csk_delack_init
(
sk
);
...
...
net/ipv4/tcp_bic.c
View file @
79ffeeb9
...
...
@@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack,
bictcp_low_utilization
(
sk
,
data_acked
);
if
(
in_flight
<
tp
->
snd_cwnd
)
if
(
!
tcp_is_cwnd_limited
(
sk
,
in_flight
)
)
return
;
if
(
tp
->
snd_cwnd
<=
tp
->
snd_ssthresh
)
{
/* In "safe" area, increase. */
if
(
tp
->
snd_cwnd
<
tp
->
snd_cwnd_clamp
)
tp
->
snd_cwnd
++
;
}
else
{
if
(
tp
->
snd_cwnd
<=
tp
->
snd_ssthresh
)
tcp_slow_start
(
tp
);
else
{
bictcp_update
(
ca
,
tp
->
snd_cwnd
);
/* In dangerous area, increase slowly.
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
*/
if
(
tp
->
snd_cwnd_cnt
>=
ca
->
cnt
)
{
...
...
net/ipv4/tcp_cong.c
View file @
79ffeeb9
...
...
@@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
{
struct
tcp_sock
*
tp
=
tcp_sk
(
sk
);
if
(
in_flight
<
tp
->
snd_cwnd
)
if
(
!
tcp_is_cwnd_limited
(
sk
,
in_flight
)
)
return
;
if
(
tp
->
snd_cwnd
<=
tp
->
snd_ssthresh
)
{
/* In "safe" area, increase. */
if
(
tp
->
snd_cwnd
<
tp
->
snd_cwnd_clamp
)
tp
->
snd_cwnd
++
;
}
else
{
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
*/
if
(
tp
->
snd_cwnd_cnt
>=
tp
->
snd_cwnd
)
{
if
(
tp
->
snd_cwnd
<
tp
->
snd_cwnd_clamp
)
tp
->
snd_cwnd
++
;
tp
->
snd_cwnd_cnt
=
0
;
}
else
tp
->
snd_cwnd_cnt
++
;
}
/* In "safe" area, increase. */
if
(
tp
->
snd_cwnd
<=
tp
->
snd_ssthresh
)
tcp_slow_start
(
tp
);
/* In dangerous area, increase slowly. */
else
if
(
sysctl_tcp_abc
)
{
/* RFC3465: Apppriate Byte Count
* increase once for each full cwnd acked
*/
if
(
tp
->
bytes_acked
>=
tp
->
snd_cwnd
*
tp
->
mss_cache
)
{
tp
->
bytes_acked
-=
tp
->
snd_cwnd
*
tp
->
mss_cache
;
if
(
tp
->
snd_cwnd
<
tp
->
snd_cwnd_clamp
)
tp
->
snd_cwnd
++
;
}
}
else
{
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
if
(
tp
->
snd_cwnd_cnt
>=
tp
->
snd_cwnd
)
{
if
(
tp
->
snd_cwnd
<
tp
->
snd_cwnd_clamp
)
tp
->
snd_cwnd
++
;
tp
->
snd_cwnd_cnt
=
0
;
}
else
tp
->
snd_cwnd_cnt
++
;
}
}
EXPORT_SYMBOL_GPL
(
tcp_reno_cong_avoid
);
...
...
net/ipv4/tcp_highspeed.c
View file @
79ffeeb9
...
...
@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk)
}
static
void
hstcp_cong_avoid
(
struct
sock
*
sk
,
u32
adk
,
u32
rtt
,
u32
in_flight
,
int
goo
d
)
u32
in_flight
,
u32
pkts_acke
d
)
{
struct
tcp_sock
*
tp
=
tcp_sk
(
sk
);
struct
hstcp
*
ca
=
inet_csk_ca
(
sk
);
if
(
in_flight
<
tp
->
snd_cwnd
)
if
(
!
tcp_is_cwnd_limited
(
sk
,
in_flight
)
)
return
;
if
(
tp
->
snd_cwnd
<=
tp
->
snd_ssthresh
)
{
if
(
tp
->
snd_cwnd
<
tp
->
snd_cwnd_clamp
)
tp
->
snd_cwnd
++
;
}
else
{
if
(
tp
->
snd_cwnd
<=
tp
->
snd_ssthresh
)
tcp_slow_start
(
tp
);
else
{
/* Update AIMD parameters */
if
(
tp
->
snd_cwnd
>
hstcp_aimd_vals
[
ca
->
ai
].
cwnd
)
{
while
(
tp
->
snd_cwnd
>
hstcp_aimd_vals
[
ca
->
ai
].
cwnd
&&
...
...
net/ipv4/tcp_htcp.c
View file @
79ffeeb9
...
...
@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
struct
tcp_sock
*
tp
=
tcp_sk
(
sk
);
struct
htcp
*
ca
=
inet_csk_ca
(
sk
);
if
(
in_flight
<
tp
->
snd_cwnd
)
if
(
!
tcp_is_cwnd_limited
(
sk
,
in_flight
)
)
return
;
if
(
tp
->
snd_cwnd
<=
tp
->
snd_ssthresh
)
{
/* In "safe" area, increase. */
if
(
tp
->
snd_cwnd
<
tp
->
snd_cwnd_clamp
)
tp
->
snd_cwnd
++
;
}
else
{
if
(
tp
->
snd_cwnd
<=
tp
->
snd_ssthresh
)
tcp_slow_start
(
tp
);
else
{
measure_rtt
(
sk
);
/* keep track of number of round-trip times since last backoff event */
...
...
@@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
htcp_alpha_update
(
ca
);
}
/* In dangerous area, increase slowly.
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
*/
if
((
tp
->
snd_cwnd_cnt
++
*
ca
->
alpha
)
>>
7
>=
tp
->
snd_cwnd
)
{
...
...
net/ipv4/tcp_hybla.c
View file @
79ffeeb9
...
...
@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
ca
->
minrtt
=
tp
->
srtt
;
}
if
(
!
tcp_is_cwnd_limited
(
sk
,
in_flight
))
return
;
if
(
!
ca
->
hybla_en
)
return
tcp_reno_cong_avoid
(
sk
,
ack
,
rtt
,
in_flight
,
flag
);
if
(
in_flight
<
tp
->
snd_cwnd
)
return
;
if
(
ca
->
rho
==
0
)
hybla_recalc_param
(
sk
);
...
...
net/ipv4/tcp_input.c
View file @
79ffeeb9
...
...
@@ -42,7 +42,7 @@
* Andi Kleen : Moved open_request checking here
* and process RSTs for open_requests.
* Andi Kleen : Better prune_queue, and other fixes.
* Andrey Savochkin: Fix RTT measurements in the presnce of
* Andrey Savochkin: Fix RTT measurements in the pres
e
nce of
* timestamps.
* Andrey Savochkin: Check sequence numbers correctly when
* removing SACKs due to in sequence incoming
...
...
@@ -89,6 +89,7 @@ int sysctl_tcp_frto;
int
sysctl_tcp_nometrics_save
;
int
sysctl_tcp_moderate_rcvbuf
=
1
;
int
sysctl_tcp_abc
=
1
;
#define FLAG_DATA 0x01
/* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02
/* Incoming ACK was a window update. */
...
...
@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk)
* of receiver window. Check #2.
*
* The scheme does not work when sender sends good segments opening
* window and then starts to feed us spagetti. But it should work
* window and then starts to feed us spag
h
etti. But it should work
* in common situations. Otherwise, we have to rely on queue collapsing.
*/
...
...
@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
{
/* Optimize this! */
int
truesize
=
tcp_win_from_space
(
skb
->
truesize
)
/
2
;
int
window
=
tcp_
full_space
(
sk
)
/
2
;
int
window
=
tcp_
win_from_space
(
sysctl_tcp_rmem
[
2
]
)
/
2
;
while
(
tp
->
rcv_ssthresh
<=
window
)
{
if
(
truesize
<=
skb
->
len
)
...
...
@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
int
rcvmem
=
tp
->
advmss
+
MAX_TCP_HEADER
+
16
+
sizeof
(
struct
sk_buff
);
/* Try to select rcvbuf so that 4 mss-sized segments
* will fit to window and correspoding skbs will fit to our rcvbuf.
* will fit to window and correspo
n
ding skbs will fit to our rcvbuf.
* (was 3; 4 is minimum to allow fast retransmit to work.)
*/
while
(
tcp_win_from_space
(
rcvmem
)
<
tp
->
advmss
)
...
...
@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
sk
->
sk_rcvbuf
=
min
(
4
*
rcvmem
,
sysctl_tcp_rmem
[
2
]);
}
/* 4. Try to fixup all. It is made i
i
mediately after connection enters
/* 4. Try to fixup all. It is made i
m
mediately after connection enters
* established state.
*/
static
void
tcp_init_buffer_space
(
struct
sock
*
sk
)
...
...
@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk)
static
void
tcp_clamp_window
(
struct
sock
*
sk
,
struct
tcp_sock
*
tp
)
{
struct
inet_connection_sock
*
icsk
=
inet_csk
(
sk
);
struct
sk_buff
*
skb
;
unsigned
int
app_win
=
tp
->
rcv_nxt
-
tp
->
copied_seq
;
int
ofo_win
=
0
;
icsk
->
icsk_ack
.
quick
=
0
;
skb_queue_walk
(
&
tp
->
out_of_order_queue
,
skb
)
{
ofo_win
+=
skb
->
len
;
}
/* If overcommit is due to out of order segments,
* do not clamp window. Try to expand rcvbuf instead.
*/
if
(
ofo_win
)
{
if
(
sk
->
sk_rcvbuf
<
sysctl_tcp_rmem
[
2
]
&&
!
(
sk
->
sk_userlocks
&
SOCK_RCVBUF_LOCK
)
&&
!
tcp_memory_pressure
&&
atomic_read
(
&
tcp_memory_allocated
)
<
sysctl_tcp_mem
[
0
])
sk
->
sk_rcvbuf
=
min
(
atomic_read
(
&
sk
->
sk_rmem_alloc
),
sysctl_tcp_rmem
[
2
]);
if
(
sk
->
sk_rcvbuf
<
sysctl_tcp_rmem
[
2
]
&&
!
(
sk
->
sk_userlocks
&
SOCK_RCVBUF_LOCK
)
&&
!
tcp_memory_pressure
&&
atomic_read
(
&
tcp_memory_allocated
)
<
sysctl_tcp_mem
[
0
])
{
sk
->
sk_rcvbuf
=
min
(
atomic_read
(
&
sk
->
sk_rmem_alloc
),
sysctl_tcp_rmem
[
2
]);
}
if
(
atomic_read
(
&
sk
->
sk_rmem_alloc
)
>
sk
->
sk_rcvbuf
)
{
app_win
+=
ofo_win
;
if
(
atomic_read
(
&
sk
->
sk_rmem_alloc
)
>=
2
*
sk
->
sk_rcvbuf
)
app_win
>>=
1
;
if
(
app_win
>
icsk
->
icsk_ack
.
rcv_mss
)
app_win
-=
icsk
->
icsk_ack
.
rcv_mss
;
app_win
=
max
(
app_win
,
2U
*
tp
->
advmss
);
if
(
atomic_read
(
&
sk
->
sk_rmem_alloc
)
>
sk
->
sk_rcvbuf
)
tp
->
rcv_ssthresh
=
min
(
tp
->
window_clamp
,
2U
*
tp
->
advmss
);
}
}
/* Receiver "autotuning" code.
...
...
@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
* are stalled on filesystem I/O.
*
* Also, since we are only going for a minimum in the
* non-timestamp case, we do not smoothe things out
* else with timestamps disabled converg
a
nce takes too
* non-timestamp case, we do not smoothe
r
things out
* else with timestamps disabled converg
e
nce takes too
* long.
*/
if
(
!
win_dep
)
{
...
...
@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
}
else
if
(
m
<
new_sample
)
new_sample
=
m
<<
3
;
}
else
{
/* No previous me
sa
ure. */
/* No previous me
as
ure. */
new_sample
=
m
<<
3
;
}
...
...
@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
if
(
icsk
->
icsk_ack
.
ato
>
icsk
->
icsk_rto
)
icsk
->
icsk_ack
.
ato
=
icsk
->
icsk_rto
;
}
else
if
(
m
>
icsk
->
icsk_rto
)
{
/* Too long gap. Apparently sender fa
l
led to
/* Too long gap. Apparently sender fa
i
led to
* restart window, so that we send ACKs quickly.
*/
tcp_incr_quickack
(
sk
);
...
...
@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
static
void
tcp_rtt_estimator
(
struct
sock
*
sk
,
const
__u32
mrtt
,
u32
*
usrtt
)
static
void
tcp_rtt_estimator
(
struct
sock
*
sk
,
const
__u32
mrtt
)
{
struct
tcp_sock
*
tp
=
tcp_sk
(
sk
);
const
struct
inet_connection_sock
*
icsk
=
inet_csk
(
sk
);
long
m
=
mrtt
;
/* RTT */
/* The following amusing code comes from Jacobson's
...
...
@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
*
* Funny. This algorithm seems to be very broken.
* These formulae increase RTO, when it should be decreased, increase
* too slowly, when it should be incresed fastly, decrease too fastly
* too slowly, when it should be incre
a
sed fastly, decrease too fastly
* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
* does not matter how to _calculate_ it. Seems, it was trap
* that VJ failed to avoid. 8)
...
...
@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
tp
->
mdev_max
=
tp
->
rttvar
=
max
(
tp
->
mdev
,
TCP_RTO_MIN
);
tp
->
rtt_seq
=
tp
->
snd_nxt
;
}
if
(
icsk
->
icsk_ca_ops
->
rtt_sample
)
icsk
->
icsk_ca_ops
->
rtt_sample
(
sk
,
*
usrtt
);
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
...
...
@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk)
* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
* to do with delayed acks, because at cwnd>2 true delack timeout
* is invisible. Actually, Linux-2.4 also generates erratic
* ACKs in some c
u
rcumstances.
* ACKs in some c
i
rcumstances.
*/
inet_csk
(
sk
)
->
icsk_rto
=
(
tp
->
srtt
>>
3
)
+
tp
->
rttvar
;
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
* all the algo is pure shit and should be replaced
* with correct one. It is exac
lt
y, which we pretend to do.
* with correct one. It is exac
tl
y, which we pretend to do.
*/
}
...
...
@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk)
* to make it more realistic.
*
* A bit of theory. RTT is time passed after "normal" sized packet
* is sent until it is ACKed. In normal c
u
rcumstances sending small
* is sent until it is ACKed. In normal c
i
rcumstances sending small
* packets force peer to delay ACKs and calculation is correct too.
* The algorithm is adaptive and, provided we follow specs, it
* NEVER underestimate RTT. BUT! If peer tries to make some clever
...
...
@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
int
prior_fackets
;
u32
lost_retrans
=
0
;
int
flag
=
0
;
int
dup_sack
=
0
;
int
i
;
if
(
!
tp
->
sacked_out
)
tp
->
fackets_out
=
0
;
prior_fackets
=
tp
->
fackets_out
;
for
(
i
=
0
;
i
<
num_sacks
;
i
++
,
sp
++
)
{
struct
sk_buff
*
skb
;
__u32
start_seq
=
ntohl
(
sp
->
start_seq
);
__u32
end_seq
=
ntohl
(
sp
->
end_seq
);
int
fack_count
=
0
;
int
dup_sack
=
0
;
/* SACK fastpath:
* if the only SACK change is the increase of the end_seq of
* the first block then only apply that SACK block
* and use retrans queue hinting otherwise slowpath */
flag
=
1
;
for
(
i
=
0
;
i
<
num_sacks
;
i
++
)
{
__u32
start_seq
=
ntohl
(
sp
[
i
].
start_seq
);
__u32
end_seq
=
ntohl
(
sp
[
i
].
end_seq
);
if
(
i
==
0
){
if
(
tp
->
recv_sack_cache
[
i
].
start_seq
!=
start_seq
)
flag
=
0
;
}
else
{
if
((
tp
->
recv_sack_cache
[
i
].
start_seq
!=
start_seq
)
||
(
tp
->
recv_sack_cache
[
i
].
end_seq
!=
end_seq
))
flag
=
0
;
}
tp
->
recv_sack_cache
[
i
].
start_seq
=
start_seq
;
tp
->
recv_sack_cache
[
i
].
end_seq
=
end_seq
;
/* Check for D-SACK. */
if
(
i
==
0
)
{
...
...
@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if
(
before
(
ack
,
prior_snd_una
-
tp
->
max_window
))
return
0
;
}
}
if
(
flag
)
num_sacks
=
1
;
else
{
int
j
;
tp
->
fastpath_skb_hint
=
NULL
;
/* order SACK blocks to allow in order walk of the retrans queue */
for
(
i
=
num_sacks
-
1
;
i
>
0
;
i
--
)
{
for
(
j
=
0
;
j
<
i
;
j
++
){
if
(
after
(
ntohl
(
sp
[
j
].
start_seq
),
ntohl
(
sp
[
j
+
1
].
start_seq
))){
sp
[
j
].
start_seq
=
htonl
(
tp
->
recv_sack_cache
[
j
+
1
].
start_seq
);
sp
[
j
].
end_seq
=
htonl
(
tp
->
recv_sack_cache
[
j
+
1
].
end_seq
);
sp
[
j
+
1
].
start_seq
=
htonl
(
tp
->
recv_sack_cache
[
j
].
start_seq
);
sp
[
j
+
1
].
end_seq
=
htonl
(
tp
->
recv_sack_cache
[
j
].
end_seq
);
}
}
}
}
/* clear flag as used for different purpose in following code */
flag
=
0
;
for
(
i
=
0
;
i
<
num_sacks
;
i
++
,
sp
++
)
{
struct
sk_buff
*
skb
;
__u32
start_seq
=
ntohl
(
sp
->
start_seq
);
__u32
end_seq
=
ntohl
(
sp
->
end_seq
);
int
fack_count
;
/* Use SACK fastpath hint if valid */
if
(
tp
->
fastpath_skb_hint
)
{
skb
=
tp
->
fastpath_skb_hint
;
fack_count
=
tp
->
fastpath_cnt_hint
;
}
else
{
skb
=
sk
->
sk_write_queue
.
next
;
fack_count
=
0
;
}
/* Event "B" in the comment above. */
if
(
after
(
end_seq
,
tp
->
high_seq
))
flag
|=
FLAG_DATA_LOST
;
sk_stream_for_retrans_queue
(
skb
,
sk
)
{
sk_stream_for_retrans_queue
_from
(
skb
,
sk
)
{
int
in_sack
,
pcount
;
u8
sacked
;
tp
->
fastpath_skb_hint
=
skb
;
tp
->
fastpath_cnt_hint
=
fack_count
;
/* The retransmission queue is always in order, so
* we can short-circuit the walk early.
*/
...
...
@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
TCP_SKB_CB
(
skb
)
->
sacked
&=
~
(
TCPCB_LOST
|
TCPCB_SACKED_RETRANS
);
tp
->
lost_out
-=
tcp_skb_pcount
(
skb
);
tp
->
retrans_out
-=
tcp_skb_pcount
(
skb
);
/* clear lost hint */
tp
->
retransmit_skb_hint
=
NULL
;
}
}
else
{
/* New sack for not retransmitted frame,
...
...
@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if
(
sacked
&
TCPCB_LOST
)
{
TCP_SKB_CB
(
skb
)
->
sacked
&=
~
TCPCB_LOST
;
tp
->
lost_out
-=
tcp_skb_pcount
(
skb
);
/* clear lost hint */
tp
->
retransmit_skb_hint
=
NULL
;
}
}
...
...
@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
(
TCP_SKB_CB
(
skb
)
->
sacked
&
TCPCB_SACKED_RETRANS
))
{
TCP_SKB_CB
(
skb
)
->
sacked
&=
~
TCPCB_SACKED_RETRANS
;
tp
->
retrans_out
-=
tcp_skb_pcount
(
skb
);
tp
->
retransmit_skb_hint
=
NULL
;
}
}
}
...
...
@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
TCP_SKB_CB
(
skb
)
->
sacked
&=
~
TCPCB_SACKED_RETRANS
;
tp
->
retrans_out
-=
tcp_skb_pcount
(
skb
);
/* clear lost hint */
tp
->
retransmit_skb_hint
=
NULL
;
if
(
!
(
TCP_SKB_CB
(
skb
)
->
sacked
&
(
TCPCB_LOST
|
TCPCB_SACKED_ACKED
)))
{
tp
->
lost_out
+=
tcp_skb_pcount
(
skb
);
TCP_SKB_CB
(
skb
)
->
sacked
|=
TCPCB_LOST
;
...
...
@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk)
tcp_set_ca_state
(
sk
,
TCP_CA_Loss
);
tp
->
high_seq
=
tp
->
frto_highmark
;
TCP_ECN_queue_cwr
(
tp
);
clear_all_retrans_hints
(
tp
);
}
void
tcp_clear_retrans
(
struct
tcp_sock
*
tp
)
...
...
@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how)
tp
->
snd_cwnd_cnt
=
0
;
tp
->
snd_cwnd_stamp
=
tcp_time_stamp
;
tp
->
bytes_acked
=
0
;
tcp_clear_retrans
(
tp
);
/* Push undo marker, if it was plain RTO and nothing
...
...
@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how)
tcp_set_ca_state
(
sk
,
TCP_CA_Loss
);
tp
->
high_seq
=
tp
->
snd_nxt
;
TCP_ECN_queue_cwr
(
tp
);
clear_all_retrans_hints
(
tp
);
}
static
int
tcp_check_sack_reneging
(
struct
sock
*
sk
)
...
...
@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
int
packets
,
u32
high_seq
)
{
struct
sk_buff
*
skb
;
int
cnt
=
packets
;
int
cnt
;
BUG_TRAP
(
cnt
<=
tp
->
packets_out
);
BUG_TRAP
(
packets
<=
tp
->
packets_out
);
if
(
tp
->
lost_skb_hint
)
{
skb
=
tp
->
lost_skb_hint
;
cnt
=
tp
->
lost_cnt_hint
;
}
else
{
skb
=
sk
->
sk_write_queue
.
next
;
cnt
=
0
;
}
sk_stream_for_retrans_queue
(
skb
,
sk
)
{
cnt
-=
tcp_skb_pcount
(
skb
);
if
(
cnt
<
0
||
after
(
TCP_SKB_CB
(
skb
)
->
end_seq
,
high_seq
))
sk_stream_for_retrans_queue_from
(
skb
,
sk
)
{
/* TODO: do this better */
/* this is not the most efficient way to do this... */
tp
->
lost_skb_hint
=
skb
;
tp
->
lost_cnt_hint
=
cnt
;
cnt
+=
tcp_skb_pcount
(
skb
);
if
(
cnt
>
packets
||
after
(
TCP_SKB_CB
(
skb
)
->
end_seq
,
high_seq
))
break
;
if
(
!
(
TCP_SKB_CB
(
skb
)
->
sacked
&
TCPCB_TAGBITS
))
{
TCP_SKB_CB
(
skb
)
->
sacked
|=
TCPCB_LOST
;
tp
->
lost_out
+=
tcp_skb_pcount
(
skb
);
/* clear xmit_retransmit_queue hints
* if this is beyond hint */
if
(
tp
->
retransmit_skb_hint
!=
NULL
&&
before
(
TCP_SKB_CB
(
skb
)
->
seq
,
TCP_SKB_CB
(
tp
->
retransmit_skb_hint
)
->
seq
))
{
tp
->
retransmit_skb_hint
=
NULL
;
}
}
}
tcp_sync_left_out
(
tp
);
...
...
@@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
if
(
tcp_head_timedout
(
sk
,
tp
))
{
struct
sk_buff
*
skb
;
sk_stream_for_retrans_queue
(
skb
,
sk
)
{
if
(
tcp_skb_timedout
(
sk
,
skb
)
&&
!
(
TCP_SKB_CB
(
skb
)
->
sacked
&
TCPCB_TAGBITS
))
{
skb
=
tp
->
scoreboard_skb_hint
?
tp
->
scoreboard_skb_hint
:
sk
->
sk_write_queue
.
next
;
sk_stream_for_retrans_queue_from
(
skb
,
sk
)
{
if
(
!
tcp_skb_timedout
(
sk
,
skb
))
break
;
if
(
!
(
TCP_SKB_CB
(
skb
)
->
sacked
&
TCPCB_TAGBITS
))
{
TCP_SKB_CB
(
skb
)
->
sacked
|=
TCPCB_LOST
;
tp
->
lost_out
+=
tcp_skb_pcount
(
skb
);
/* clear xmit_retrans hint */
if
(
tp
->
retransmit_skb_hint
&&
before
(
TCP_SKB_CB
(
skb
)
->
seq
,
TCP_SKB_CB
(
tp
->
retransmit_skb_hint
)
->
seq
))
tp
->
retransmit_skb_hint
=
NULL
;
}
}
tp
->
scoreboard_skb_hint
=
skb
;
tcp_sync_left_out
(
tp
);
}
}
...
...
@@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
}
tcp_moderate_cwnd
(
tp
);
tp
->
snd_cwnd_stamp
=
tcp_time_stamp
;
/* There is something screwy going on with the retrans hints after
an undo */
clear_all_retrans_hints
(
tp
);
}
static
inline
int
tcp_may_undo
(
struct
tcp_sock
*
tp
)
...
...
@@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
sk_stream_for_retrans_queue
(
skb
,
sk
)
{
TCP_SKB_CB
(
skb
)
->
sacked
&=
~
TCPCB_LOST
;
}
clear_all_retrans_hints
(
tp
);
DBGUNDO
(
sk
,
tp
,
"partial loss"
);
tp
->
lost_out
=
0
;
tp
->
left_out
=
tp
->
sacked_out
;
...
...
@@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
TCP_ECN_queue_cwr
(
tp
);
}
tp
->
bytes_acked
=
0
;
tp
->
snd_cwnd_cnt
=
0
;
tcp_set_ca_state
(
sk
,
TCP_CA_Recovery
);
}
...
...
@@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
}
/* Read draft-ietf-tcplw-high-performance before mucking
* with this code. (Super
ceed
s RFC1323)
* with this code. (Super
sede
s RFC1323)
*/
static
void
tcp_ack_saw_tstamp
(
struct
sock
*
sk
,
u32
*
usrtt
,
int
flag
)
static
void
tcp_ack_saw_tstamp
(
struct
sock
*
sk
,
int
flag
)
{
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
...
...
@@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
*
* Changed: reset backoff as soon as we see the first valid sample.
* If we do not, we get strongly overstimated rto. With timestamps
* If we do not, we get strongly over
e
stimated rto. With timestamps
* samples are accepted even from very old segments: f.e., when rtt=1
* increases to 8, we retransmit 5 times and after 8 seconds delayed
* answer arrives rto becomes 120 seconds! If at least one of segments
...
...
@@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
*/
struct
tcp_sock
*
tp
=
tcp_sk
(
sk
);
const
__u32
seq_rtt
=
tcp_time_stamp
-
tp
->
rx_opt
.
rcv_tsecr
;
tcp_rtt_estimator
(
sk
,
seq_rtt
,
usrtt
);
tcp_rtt_estimator
(
sk
,
seq_rtt
);
tcp_set_rto
(
sk
);
inet_csk
(
sk
)
->
icsk_backoff
=
0
;
tcp_bound_rto
(
sk
);
}
static
void
tcp_ack_no_tstamp
(
struct
sock
*
sk
,
u32
seq_rtt
,
u32
*
usrtt
,
int
flag
)
static
void
tcp_ack_no_tstamp
(
struct
sock
*
sk
,
u32
seq_rtt
,
int
flag
)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
...
...
@@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag
if
(
flag
&
FLAG_RETRANS_DATA_ACKED
)
return
;
tcp_rtt_estimator
(
sk
,
seq_rtt
,
usrtt
);
tcp_rtt_estimator
(
sk
,
seq_rtt
);
tcp_set_rto
(
sk
);
inet_csk
(
sk
)
->
icsk_backoff
=
0
;
tcp_bound_rto
(
sk
);
}
static
inline
void
tcp_ack_update_rtt
(
struct
sock
*
sk
,
const
int
flag
,
const
s32
seq_rtt
,
u32
*
usrtt
)
const
s32
seq_rtt
)
{
const
struct
tcp_sock
*
tp
=
tcp_sk
(
sk
);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if
(
tp
->
rx_opt
.
saw_tstamp
&&
tp
->
rx_opt
.
rcv_tsecr
)
tcp_ack_saw_tstamp
(
sk
,
usrtt
,
flag
);
tcp_ack_saw_tstamp
(
sk
,
flag
);
else
if
(
seq_rtt
>=
0
)
tcp_ack_no_tstamp
(
sk
,
seq_rtt
,
usrtt
,
flag
);
tcp_ack_no_tstamp
(
sk
,
seq_rtt
,
flag
);
}
static
inline
void
tcp_cong_avoid
(
struct
sock
*
sk
,
u32
ack
,
u32
rtt
,
...
...
@@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
return
acked
;
}
static
inline
u32
tcp_usrtt
(
const
struct
sk_buff
*
skb
)
{
struct
timeval
tv
,
now
;
do_gettimeofday
(
&
now
);
skb_get_timestamp
(
skb
,
&
tv
);
return
(
now
.
tv_sec
-
tv
.
tv_sec
)
*
1000000
+
(
now
.
tv_usec
-
tv
.
tv_usec
);
}
/* Remove acknowledged frames from the retransmission queue. */
static
int
tcp_clean_rtx_queue
(
struct
sock
*
sk
,
__s32
*
seq_rtt_p
,
s32
*
seq_usrtt
)
static
int
tcp_clean_rtx_queue
(
struct
sock
*
sk
,
__s32
*
seq_rtt_p
)
{
struct
tcp_sock
*
tp
=
tcp_sk
(
sk
);
const
struct
inet_connection_sock
*
icsk
=
inet_csk
(
sk
);
struct
sk_buff
*
skb
;
__u32
now
=
tcp_time_stamp
;
int
acked
=
0
;
__s32
seq_rtt
=
-
1
;
struct
timeval
usnow
;
u32
pkts_acked
=
0
;
if
(
seq_usrtt
)
do_gettimeofday
(
&
usnow
);
void
(
*
rtt_sample
)(
struct
sock
*
sk
,
u32
usrtt
)
=
icsk
->
icsk_ca_ops
->
rtt_sample
;
while
((
skb
=
skb_peek
(
&
sk
->
sk_write_queue
))
&&
skb
!=
sk
->
sk_send_head
)
{
...
...
@@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
tp
->
retrans_out
-=
tcp_skb_pcount
(
skb
);
acked
|=
FLAG_RETRANS_DATA_ACKED
;
seq_rtt
=
-
1
;
}
else
if
(
seq_rtt
<
0
)
}
else
if
(
seq_rtt
<
0
)
{
seq_rtt
=
now
-
scb
->
when
;
if
(
seq_usrtt
)
{
struct
timeval
tv
;
skb_get_timestamp
(
skb
,
&
tv
);
*
seq_usrtt
=
(
usnow
.
tv_sec
-
tv
.
tv_sec
)
*
1000000
+
(
usnow
.
tv_usec
-
tv
.
tv_usec
);
if
(
rtt_sample
)
(
*
rtt_sample
)(
sk
,
tcp_usrtt
(
skb
));
}
if
(
sacked
&
TCPCB_SACKED_ACKED
)
tp
->
sacked_out
-=
tcp_skb_pcount
(
skb
);
if
(
sacked
&
TCPCB_LOST
)
...
...
@@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
!
before
(
scb
->
end_seq
,
tp
->
snd_up
))
tp
->
urg_mode
=
0
;
}
}
else
if
(
seq_rtt
<
0
)
}
else
if
(
seq_rtt
<
0
)
{
seq_rtt
=
now
-
scb
->
when
;
if
(
rtt_sample
)
(
*
rtt_sample
)(
sk
,
tcp_usrtt
(
skb
));
}
tcp_dec_pcount_approx
(
&
tp
->
fackets_out
,
skb
);
tcp_packets_out_dec
(
tp
,
skb
);
__skb_unlink
(
skb
,
&
sk
->
sk_write_queue
);
sk_stream_free_skb
(
sk
,
skb
);
clear_all_retrans_hints
(
tp
);
}
if
(
acked
&
FLAG_ACKED
)
{
const
struct
inet_connection_sock
*
icsk
=
inet_csk
(
sk
);
tcp_ack_update_rtt
(
sk
,
acked
,
seq_rtt
,
seq_usrtt
);
tcp_ack_update_rtt
(
sk
,
acked
,
seq_rtt
);
tcp_ack_packets_out
(
sk
,
tp
);
if
(
icsk
->
icsk_ca_ops
->
pkts_acked
)
...
...
@@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
}
/* F-RTO affects on two new ACKs following RTO.
* At latest on third ACK the TCP behavor is back to normal.
* At latest on third ACK the TCP behav
i
or is back to normal.
*/
tp
->
frto_counter
=
(
tp
->
frto_counter
+
1
)
%
3
;
}
...
...
@@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
u32
ack
=
TCP_SKB_CB
(
skb
)
->
ack_seq
;
u32
prior_in_flight
;
s32
seq_rtt
;
s32
seq_usrtt
=
0
;
int
prior_packets
;
/* If the ack is newer than sent or older than previous acks
...
...
@@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
if
(
before
(
ack
,
prior_snd_una
))
goto
old_ack
;
if
(
sysctl_tcp_abc
&&
icsk
->
icsk_ca_state
<
TCP_CA_CWR
)
tp
->
bytes_acked
+=
ack
-
prior_snd_una
;
if
(
!
(
flag
&
FLAG_SLOWPATH
)
&&
after
(
ack
,
prior_snd_una
))
{
/* Window is constant, pure forward advance.
* No more checks are required.
...
...
@@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
prior_in_flight
=
tcp_packets_in_flight
(
tp
);
/* See if we can take anything off of the retransmit queue. */
flag
|=
tcp_clean_rtx_queue
(
sk
,
&
seq_rtt
,
icsk
->
icsk_ca_ops
->
rtt_sample
?
&
seq_usrtt
:
NULL
);
flag
|=
tcp_clean_rtx_queue
(
sk
,
&
seq_rtt
);
if
(
tp
->
frto_counter
)
tcp_process_frto
(
sk
,
prior_snd_una
);
if
(
tcp_ack_is_dubious
(
sk
,
flag
))
{
/* Advan
v
e CWND, if state allows this. */
/* Advan
c
e CWND, if state allows this. */
if
((
flag
&
FLAG_DATA_ACKED
)
&&
tcp_may_raise_cwnd
(
sk
,
flag
))
tcp_cong_avoid
(
sk
,
ack
,
seq_rtt
,
prior_in_flight
,
0
);
tcp_fastretrans_alert
(
sk
,
prior_snd_una
,
prior_packets
,
flag
);
...
...
@@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
{
struct
sk_buff
*
skb
;
/* First, check that queue is collaps
a
ble and find
/* First, check that queue is collaps
i
ble and find
* the point where collapsing can be useful. */
for
(
skb
=
head
;
skb
!=
tail
;
)
{
/* No new bits? It is possible on ofo queue. */
...
...
@@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk)
/*
* This routine is only called when we have urgent data
* signal
l
ed. Its the 'slow' part of tcp_urg. It could be
* signaled. Its the 'slow' part of tcp_urg. It could be
* moved inline now as tcp_urg is only called from one
* place. We handle URGent data wrong. We have to - as
* BSD still doesn't use the correction from RFC961.
...
...
@@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
* urgent. To do this requires some care. We cannot just ignore
* tp->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives
* or we break the sematics of SIOCATMARK (and thus sockatmark())
* or we break the sema
n
tics of SIOCATMARK (and thus sockatmark())
*
* NOTE. Double Dutch. Rendering to plain English: author of comment
* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
...
...
@@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp
->
rx_opt
.
saw_tstamp
=
0
;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_predition is to be made
* if header_predi
c
tion is to be made
* 'S' will always be tp->tcp_header_len >> 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
* turn it off (when there are holes in the receive
...
...
@@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
if
(
tp
->
rx_opt
.
saw_tstamp
&&
tp
->
rx_opt
.
rcv_tsecr
&&
!
tp
->
srtt
)
tcp_ack_saw_tstamp
(
sk
,
NULL
,
0
);
tcp_ack_saw_tstamp
(
sk
,
0
);
if
(
tp
->
rx_opt
.
tstamp_ok
)
tp
->
advmss
-=
TCPOLEN_TSTAMP_ALIGNED
;
...
...
@@ -4372,6 +4471,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
EXPORT_SYMBOL
(
sysctl_tcp_ecn
);
EXPORT_SYMBOL
(
sysctl_tcp_reordering
);
EXPORT_SYMBOL
(
sysctl_tcp_abc
);
EXPORT_SYMBOL
(
tcp_parse_options
);
EXPORT_SYMBOL
(
tcp_rcv_established
);
EXPORT_SYMBOL
(
tcp_rcv_state_process
);
net/ipv4/tcp_ipv4.c
View file @
79ffeeb9
...
...
@@ -39,7 +39,7 @@
* request_sock handling and moved
* most of it into the af independent code.
* Added tail drop and some other bugfixes.
* Added new listen sematics.
* Added new listen sema
n
tics.
* Mike McLagan : Routing by source
* Juan Jose Ciarlante: ip_dynaddr bits
* Andi Kleen: various fixes.
...
...
@@ -1210,7 +1210,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
/* An explanation is required here, I think.
* Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is elimin
e
ted.
* provided case of th->doff==0 is elimin
a
ted.
* So, we defer the checks. */
if
((
skb
->
ip_summed
!=
CHECKSUM_UNNECESSARY
&&
tcp_v4_checksum_init
(
skb
)))
...
...
net/ipv4/tcp_minisocks.c
View file @
79ffeeb9
...
...
@@ -158,7 +158,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
/* I am shamed, but failed to make it more elegant.
* Yes, it is direct reference to IP, which is impossible
* to generalize to IPv6. Taking into account that IPv6
* do not under
tsna
d recycling in any case, it not
* do not under
stan
d recycling in any case, it not
* a big problem in practice. --ANK */
if
(
tw
->
tw_family
==
AF_INET
&&
tcp_death_row
.
sysctl_tw_recycle
&&
tcptw
->
tw_ts_recent_stamp
&&
...
...
@@ -194,7 +194,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
/* In window segment, it may be only reset or bare ack. */
if
(
th
->
rst
)
{
/* This is TIME_WAIT assasination, in two flavors.
/* This is TIME_WAIT assas
s
ination, in two flavors.
* Oh well... nobody has a sufficient solution to this
* protocol bug yet.
*/
...
...
@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
*/
newtp
->
snd_cwnd
=
2
;
newtp
->
snd_cwnd_cnt
=
0
;
newtp
->
bytes_acked
=
0
;
newtp
->
frto_counter
=
0
;
newtp
->
frto_highmark
=
0
;
...
...
@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
/* RFC793 page 36: "If the connection is in any non-synchronized state ...
* and the incoming segment acknowledges something not yet
* sent (the segment carries an unacc
a
ptable ACK) ...
* sent (the segment carries an unacc
e
ptable ACK) ...
* a reset is sent."
*
* Invalid ACK: reset will be sent by listening socket
...
...
net/ipv4/tcp_output.c
View file @
79ffeeb9
...
...
@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
u16
flags
;
BUG_ON
(
len
>
skb
->
len
);
clear_all_retrans_hints
(
tp
);
nsize
=
skb_headlen
(
skb
)
-
len
;
if
(
nsize
<
0
)
nsize
=
0
;
...
...
@@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
for TCP options, but includes only bare TCP header.
tp->rx_opt.mss_clamp is mss negotiated at connection setup.
It is min
u
mum of user_mss and mss received with SYN.
It is min
i
mum of user_mss and mss received with SYN.
It also does not include TCP options.
tp->pmtu_cookie is last pmtu, seen by this function.
...
...
@@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk)
{
struct
inet_connection_sock
*
icsk
=
inet_csk
(
sk
);
struct
tcp_sock
*
tp
=
tcp_sk
(
sk
);
/* MSS for the peer's data. Previous verions used mss_clamp
/* MSS for the peer's data. Previous ver
s
ions used mss_clamp
* here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct
* but may be worse for the performance because of rcv_mss
...
...
@@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
BUG_ON
(
tcp_skb_pcount
(
skb
)
!=
1
||
tcp_skb_pcount
(
next_skb
)
!=
1
);
/* Ok. We will be able to collapse the packet. */
/* changing transmit queue under us so clear hints */
clear_all_retrans_hints
(
tp
);
/* Ok. We will be able to collapse the packet. */
__skb_unlink
(
next_skb
,
&
sk
->
sk_write_queue
);
memcpy
(
skb_put
(
skb
,
next_skb_size
),
next_skb
->
data
,
next_skb_size
);
...
...
@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk)
}
}
clear_all_retrans_hints
(
tp
);
if
(
!
lost
)
return
;
...
...
@@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
int
err
;
/* Do not sent more than we queued. 1/4 is reserved for possible
* copying overhead: fr
g
agmentation, tunneling, mangling etc.
* copying overhead: fragmentation, tunneling, mangling etc.
*/
if
(
atomic_read
(
&
sk
->
sk_wmem_alloc
)
>
min
(
sk
->
sk_wmem_queued
+
(
sk
->
sk_wmem_queued
>>
2
),
sk
->
sk_sndbuf
))
...
...
@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
const
struct
inet_connection_sock
*
icsk
=
inet_csk
(
sk
);
struct
tcp_sock
*
tp
=
tcp_sk
(
sk
);
struct
sk_buff
*
skb
;
int
packet_cnt
=
tp
->
lost_out
;
int
packet_cnt
;
if
(
tp
->
retransmit_skb_hint
)
{
skb
=
tp
->
retransmit_skb_hint
;
packet_cnt
=
tp
->
retransmit_cnt_hint
;
}
else
{
skb
=
sk
->
sk_write_queue
.
next
;
packet_cnt
=
0
;
}
/* First pass: retransmit lost packets. */
if
(
packet_cn
t
)
{
sk_stream_for_retrans_queue
(
skb
,
sk
)
{
if
(
tp
->
lost_ou
t
)
{
sk_stream_for_retrans_queue
_from
(
skb
,
sk
)
{
__u8
sacked
=
TCP_SKB_CB
(
skb
)
->
sacked
;
/* we could do better than to assign each time */
tp
->
retransmit_skb_hint
=
skb
;
tp
->
retransmit_cnt_hint
=
packet_cnt
;
/* Assume this retransmit will generate
* only one packet for congestion window
* calculation purposes. This works because
...
...
@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if
(
tcp_packets_in_flight
(
tp
)
>=
tp
->
snd_cwnd
)
return
;
if
(
sacked
&
TCPCB_LOST
)
{
if
(
sacked
&
TCPCB_LOST
)
{
if
(
!
(
sacked
&
(
TCPCB_SACKED_ACKED
|
TCPCB_SACKED_RETRANS
)))
{
if
(
tcp_retransmit_skb
(
sk
,
skb
))
if
(
tcp_retransmit_skb
(
sk
,
skb
))
{
tp
->
retransmit_skb_hint
=
NULL
;
return
;
}
if
(
icsk
->
icsk_ca_state
!=
TCP_CA_Loss
)
NET_INC_STATS_BH
(
LINUX_MIB_TCPFASTRETRANS
);
else
...
...
@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
TCP_RTO_MAX
);
}
packet_cnt
-
=
tcp_skb_pcount
(
skb
);
if
(
packet_cnt
<=
0
)
packet_cnt
+
=
tcp_skb_pcount
(
skb
);
if
(
packet_cnt
>=
tp
->
lost_out
)
break
;
}
}
...
...
@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if
(
tcp_may_send_now
(
sk
,
tp
))
return
;
packet_cnt
=
0
;
if
(
tp
->
forward_skb_hint
)
{
skb
=
tp
->
forward_skb_hint
;
packet_cnt
=
tp
->
forward_cnt_hint
;
}
else
{
skb
=
sk
->
sk_write_queue
.
next
;
packet_cnt
=
0
;
}
sk_stream_for_retrans_queue_from
(
skb
,
sk
)
{
tp
->
forward_cnt_hint
=
packet_cnt
;
tp
->
forward_skb_hint
=
skb
;
sk_stream_for_retrans_queue
(
skb
,
sk
)
{
/* Similar to the retransmit loop above we
* can pretend that the retransmitted SKB
* we send out here will be composed of one
...
...
@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
continue
;
/* Ok, retransmit it. */
if
(
tcp_retransmit_skb
(
sk
,
skb
))
if
(
tcp_retransmit_skb
(
sk
,
skb
))
{
tp
->
forward_skb_hint
=
NULL
;
break
;
}
if
(
skb
==
skb_peek
(
&
sk
->
sk_write_queue
))
inet_csk_reset_xmit_timer
(
sk
,
ICSK_TIME_RETRANS
,
...
...
@@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect);
EXPORT_SYMBOL
(
tcp_make_synack
);
EXPORT_SYMBOL
(
tcp_simple_retransmit
);
EXPORT_SYMBOL
(
tcp_sync_mss
);
EXPORT_SYMBOL
(
sysctl_tcp_tso_win_divisor
);
net/ipv4/tcp_scalable.c
View file @
79ffeeb9
...
...
@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32
in_flight
,
int
flag
)
{
struct
tcp_sock
*
tp
=
tcp_sk
(
sk
);
if
(
in_flight
<
tp
->
snd_cwnd
)
if
(
!
tcp_is_cwnd_limited
(
sk
,
in_flight
))
return
;
if
(
tp
->
snd_cwnd
<=
tp
->
snd_ssthresh
)
{
t
p
->
snd_cwnd
++
;
}
else
{
if
(
tp
->
snd_cwnd
<=
tp
->
snd_ssthresh
)
t
cp_slow_start
(
tp
)
;
else
{
tp
->
snd_cwnd_cnt
++
;
if
(
tp
->
snd_cwnd_cnt
>
min
(
tp
->
snd_cwnd
,
TCP_SCALABLE_AI_CNT
)){
tp
->
snd_cwnd
++
;
if
(
tp
->
snd_cwnd
<
tp
->
snd_cwnd_clamp
)
tp
->
snd_cwnd
++
;
tp
->
snd_cwnd_cnt
=
0
;
}
}
tp
->
snd_cwnd
=
min_t
(
u32
,
tp
->
snd_cwnd
,
tp
->
snd_cwnd_clamp
);
tp
->
snd_cwnd_stamp
=
tcp_time_stamp
;
}
static
u32
tcp_scalable_ssthresh
(
struct
sock
*
sk
)
...
...
net/ipv4/tcp_timer.c
View file @
79ffeeb9
...
...
@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk)
* to prevent DoS attacks. It is called when a retransmission timeout
* or zero probe timeout occurs on orphaned socket.
*
* Criteri
um
is still not confirmed experimentally and may change.
* Criteri
a
is still not confirmed experimentally and may change.
* We kill the socket, if:
* 1. If number of orphaned sockets exceeds an administratively configured
* limit.
...
...
@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk)
hole detection. :-(
It is place to make it. It is not made. I do not want
to make it. It is disgu
i
sting. It does not work in any
to make it. It is disgusting. It does not work in any
case. Let me to cite the same draft, which requires for
us to implement this:
...
...
net/ipv4/tcp_vegas.c
View file @
79ffeeb9
...
...
@@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
if
(
tp
->
snd_cwnd
>
tp
->
snd_ssthresh
)
tp
->
snd_cwnd
++
;
tcp_reno_cong_avoid
(
sk
,
ack
,
seq_rtt
,
in_flight
,
cnt
);
}
else
{
u32
rtt
,
target_cwnd
,
diff
;
...
...
@@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
*/
diff
=
(
old_wnd
<<
V_PARAM_SHIFT
)
-
target_cwnd
;
if
(
tp
->
snd_cwnd
<
tp
->
snd_ssthresh
)
{
if
(
tp
->
snd_cwnd
<
=
tp
->
snd_ssthresh
)
{
/* Slow start. */
if
(
diff
>
gamma
)
{
/* Going too fast. Time to slow down
...
...
@@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
V_PARAM_SHIFT
)
+
1
);
}
tcp_slow_start
(
tp
);
}
else
{
/* Congestion avoidance. */
u32
next_snd_cwnd
;
...
...
@@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
else
if
(
next_snd_cwnd
<
tp
->
snd_cwnd
)
tp
->
snd_cwnd
--
;
}
}
/* Wipe the slate clean for the next RTT. */
vegas
->
cntRTT
=
0
;
vegas
->
minRTT
=
0x7fffffff
;
if
(
tp
->
snd_cwnd
<
2
)
tp
->
snd_cwnd
=
2
;
else
if
(
tp
->
snd_cwnd
>
tp
->
snd_cwnd_clamp
)
tp
->
snd_cwnd
=
tp
->
snd_cwnd_clamp
;
}
}
/* The following code is executed for every ack we receive,
* except for conditions checked in should_advance_cwnd()
* before the call to tcp_cong_avoid(). Mainly this means that
* we only execute this code if the ack actually acked some
* data.
*/
/* If we are in slow start, increase our cwnd in response to this ACK.
* (If we are not in slow start then we are in congestion avoidance,
* and adjust our congestion window only once per RTT. See the code
* above.)
*/
if
(
tp
->
snd_cwnd
<=
tp
->
snd_ssthresh
)
tp
->
snd_cwnd
++
;
/* to keep cwnd from growing without bound */
tp
->
snd_cwnd
=
min_t
(
u32
,
tp
->
snd_cwnd
,
tp
->
snd_cwnd_clamp
);
/* Make sure that we are never so timid as to reduce our cwnd below
* 2 MSS.
*
* Going below 2 MSS would risk huge delayed ACKs from our receiver.
*/
tp
->
snd_cwnd
=
max
(
tp
->
snd_cwnd
,
2U
);
/* Wipe the slate clean for the next RTT. */
vegas
->
cntRTT
=
0
;
vegas
->
minRTT
=
0x7fffffff
;
}
/* Extract info for Tcp socket info provided via netlink. */
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment