Commit 79ffeeb9 authored by Linus Torvalds's avatar Linus Torvalds

Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6

parents a5aac37f 6a438bbe
...@@ -78,6 +78,11 @@ inet_peer_gc_maxtime - INTEGER ...@@ -78,6 +78,11 @@ inet_peer_gc_maxtime - INTEGER
TCP variables: TCP variables:
tcp_abc - INTEGER
Controls Appropriate Byte Count defined in RFC3465. If set to
0 then does congestion avoid once per ack. 1 is conservative
value, and 2 is more agressive.
tcp_syn_retries - INTEGER tcp_syn_retries - INTEGER
Number of times initial SYNs for an active TCP connection attempt Number of times initial SYNs for an active TCP connection attempt
will be retransmitted. Should not be higher than 255. Default value will be retransmitted. Should not be higher than 255. Default value
......
...@@ -390,6 +390,7 @@ enum ...@@ -390,6 +390,7 @@ enum
NET_TCP_BIC_BETA=108, NET_TCP_BIC_BETA=108,
NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
NET_TCP_CONG_CONTROL=110, NET_TCP_CONG_CONTROL=110,
NET_TCP_ABC=111,
}; };
enum { enum {
......
...@@ -307,6 +307,21 @@ struct tcp_sock { ...@@ -307,6 +307,21 @@ struct tcp_sock {
struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
struct tcp_sack_block recv_sack_cache[4];
/* from STCP, retrans queue hinting */
struct sk_buff* lost_skb_hint;
struct sk_buff *scoreboard_skb_hint;
struct sk_buff *retransmit_skb_hint;
struct sk_buff *forward_skb_hint;
struct sk_buff *fastpath_skb_hint;
int fastpath_cnt_hint;
int lost_cnt_hint;
int retransmit_cnt_hint;
int forward_cnt_hint;
__u16 advmss; /* Advertised MSS */ __u16 advmss; /* Advertised MSS */
__u16 prior_ssthresh; /* ssthresh saved at recovery start */ __u16 prior_ssthresh; /* ssthresh saved at recovery start */
__u32 lost_out; /* Lost packets */ __u32 lost_out; /* Lost packets */
...@@ -326,6 +341,7 @@ struct tcp_sock { ...@@ -326,6 +341,7 @@ struct tcp_sock {
__u32 snd_up; /* Urgent pointer */ __u32 snd_up; /* Urgent pointer */
__u32 total_retrans; /* Total retransmits for entire connection */ __u32 total_retrans; /* Total retransmits for entire connection */
__u32 bytes_acked; /* Appropriate Byte Counting - RFC3465 */
unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_time; /* time before keep alive takes place */
unsigned int keepalive_intvl; /* time interval between keep alive probes */ unsigned int keepalive_intvl; /* time interval between keep alive probes */
......
...@@ -1247,6 +1247,12 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk) ...@@ -1247,6 +1247,12 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk)
(skb != (struct sk_buff *)&(sk)->sk_write_queue); \ (skb != (struct sk_buff *)&(sk)->sk_write_queue); \
skb = skb->next) skb = skb->next)
/*from STCP for fast SACK Process*/
#define sk_stream_for_retrans_queue_from(skb, sk) \
for (; (skb != (sk)->sk_send_head) && \
(skb != (struct sk_buff *)&(sk)->sk_write_queue); \
skb = skb->next)
/* /*
* Default write policy as shown to user space via poll/select/SIGIO * Default write policy as shown to user space via poll/select/SIGIO
*/ */
......
...@@ -89,10 +89,10 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); ...@@ -89,10 +89,10 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
*/ */
#define TCP_SYN_RETRIES 5 /* number of times to retry active opening a #define TCP_SYN_RETRIES 5 /* number of times to retry active opening a
* connection: ~180sec is RFC minumum */ * connection: ~180sec is RFC minimum */
#define TCP_SYNACK_RETRIES 5 /* number of times to retry passive opening a #define TCP_SYNACK_RETRIES 5 /* number of times to retry passive opening a
* connection: ~180sec is RFC minumum */ * connection: ~180sec is RFC minimum */
#define TCP_ORPHAN_RETRIES 7 /* number of times to retry on an orphaned #define TCP_ORPHAN_RETRIES 7 /* number of times to retry on an orphaned
...@@ -180,7 +180,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); ...@@ -180,7 +180,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Flags in tp->nonagle */ /* Flags in tp->nonagle */
#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
#define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_CORK 2 /* Socket is corked */
#define TCP_NAGLE_PUSH 4 /* Cork is overriden for already queued data */ #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */
extern struct inet_timewait_death_row tcp_death_row; extern struct inet_timewait_death_row tcp_death_row;
...@@ -218,6 +218,7 @@ extern int sysctl_tcp_low_latency; ...@@ -218,6 +218,7 @@ extern int sysctl_tcp_low_latency;
extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_nometrics_save;
extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_tso_win_divisor;
extern int sysctl_tcp_abc;
extern atomic_t tcp_memory_allocated; extern atomic_t tcp_memory_allocated;
extern atomic_t tcp_sockets_allocated; extern atomic_t tcp_sockets_allocated;
...@@ -551,13 +552,13 @@ extern u32 __tcp_select_window(struct sock *sk); ...@@ -551,13 +552,13 @@ extern u32 __tcp_select_window(struct sock *sk);
/* TCP timestamps are only 32-bits, this causes a slight /* TCP timestamps are only 32-bits, this causes a slight
* complication on 64-bit systems since we store a snapshot * complication on 64-bit systems since we store a snapshot
* of jiffies in the buffer control blocks below. We decidely * of jiffies in the buffer control blocks below. We decidedly
* only use of the low 32-bits of jiffies and hide the ugly * only use of the low 32-bits of jiffies and hide the ugly
* casts with the following macro. * casts with the following macro.
*/ */
#define tcp_time_stamp ((__u32)(jiffies)) #define tcp_time_stamp ((__u32)(jiffies))
/* This is what the send packet queueing engine uses to pass /* This is what the send packet queuing engine uses to pass
* TCP per-packet control information to the transmission * TCP per-packet control information to the transmission
* code. We also store the host-order sequence numbers in * code. We also store the host-order sequence numbers in
* here too. This is 36 bytes on 32-bit architectures, * here too. This is 36 bytes on 32-bit architectures,
...@@ -597,7 +598,7 @@ struct tcp_skb_cb { ...@@ -597,7 +598,7 @@ struct tcp_skb_cb {
#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS) #define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
#define TCPCB_URG 0x20 /* Urgent pointer advenced here */ #define TCPCB_URG 0x20 /* Urgent pointer advanced here */
#define TCPCB_AT_TAIL (TCPCB_URG) #define TCPCB_AT_TAIL (TCPCB_URG)
...@@ -765,6 +766,33 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) ...@@ -765,6 +766,33 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
(tp->snd_cwnd >> 2))); (tp->snd_cwnd >> 2)));
} }
/*
* Linear increase during slow start
*/
static inline void tcp_slow_start(struct tcp_sock *tp)
{
if (sysctl_tcp_abc) {
/* RFC3465: Slow Start
* TCP sender SHOULD increase cwnd by the number of
* previously unacknowledged bytes ACKed by each incoming
* acknowledgment, provided the increase is not more than L
*/
if (tp->bytes_acked < tp->mss_cache)
return;
/* We MAY increase by 2 if discovered delayed ack */
if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++;
}
}
tp->bytes_acked = 0;
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++;
}
static inline void tcp_sync_left_out(struct tcp_sock *tp) static inline void tcp_sync_left_out(struct tcp_sock *tp)
{ {
if (tp->rx_opt.sack_ok && if (tp->rx_opt.sack_ok &&
...@@ -794,6 +822,7 @@ static inline void tcp_enter_cwr(struct sock *sk) ...@@ -794,6 +822,7 @@ static inline void tcp_enter_cwr(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
tp->prior_ssthresh = 0; tp->prior_ssthresh = 0;
tp->bytes_acked = 0;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
__tcp_enter_cwr(sk); __tcp_enter_cwr(sk);
tcp_set_ca_state(sk, TCP_CA_CWR); tcp_set_ca_state(sk, TCP_CA_CWR);
...@@ -810,6 +839,27 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp) ...@@ -810,6 +839,27 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp)
return 3; return 3;
} }
/* RFC2861 Check whether we are limited by application or congestion window
* This is the inverse of cwnd check in tcp_tso_should_defer
*/
static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
{
const struct tcp_sock *tp = tcp_sk(sk);
u32 left;
if (in_flight >= tp->snd_cwnd)
return 1;
if (!(sk->sk_route_caps & NETIF_F_TSO))
return 0;
left = tp->snd_cwnd - in_flight;
if (sysctl_tcp_tso_win_divisor)
return left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd;
else
return left <= tcp_max_burst(tp);
}
static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
const struct sk_buff *skb) const struct sk_buff *skb)
{ {
...@@ -1157,6 +1207,15 @@ static inline void tcp_mib_init(void) ...@@ -1157,6 +1207,15 @@ static inline void tcp_mib_init(void)
TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1); TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1);
} }
/*from STCP */
static inline void clear_all_retrans_hints(struct tcp_sock *tp){
tp->lost_skb_hint = NULL;
tp->scoreboard_skb_hint = NULL;
tp->retransmit_skb_hint = NULL;
tp->forward_skb_hint = NULL;
tp->fastpath_skb_hint = NULL;
}
/* /proc */ /* /proc */
enum tcp_seq_states { enum tcp_seq_states {
TCP_SEQ_STATE_LISTENING, TCP_SEQ_STATE_LISTENING,
......
...@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = { ...@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_tcp_congestion_control, .proc_handler = &proc_tcp_congestion_control,
.strategy = &sysctl_tcp_congestion_control, .strategy = &sysctl_tcp_congestion_control,
}, },
{
.ctl_name = NET_TCP_ABC,
.procname = "tcp_abc",
.data = &sysctl_tcp_abc,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{ .ctl_name = 0 } { .ctl_name = 0 }
}; };
......
...@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags) ...@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags)
} else if (tcp_need_reset(old_state) || } else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq && (tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
/* The last check adjusts for discrepance of Linux wrt. RFC /* The last check adjusts for discrepancy of Linux wrt. RFC
* states * states
*/ */
tcp_send_active_reset(sk, gfp_any()); tcp_send_active_reset(sk, gfp_any());
...@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags) ...@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->packets_out = 0; tp->packets_out = 0;
tp->snd_ssthresh = 0x7fffffff; tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_cnt = 0; tp->snd_cwnd_cnt = 0;
tp->bytes_acked = 0;
tcp_set_ca_state(sk, TCP_CA_Open); tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp); tcp_clear_retrans(tp);
inet_csk_delack_init(sk); inet_csk_delack_init(sk);
......
...@@ -217,14 +217,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, ...@@ -217,14 +217,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack,
bictcp_low_utilization(sk, data_acked); bictcp_low_utilization(sk, data_acked);
if (in_flight < tp->snd_cwnd) if (!tcp_is_cwnd_limited(sk, in_flight))
return; return;
if (tp->snd_cwnd <= tp->snd_ssthresh) { if (tp->snd_cwnd <= tp->snd_ssthresh)
/* In "safe" area, increase. */ tcp_slow_start(tp);
if (tp->snd_cwnd < tp->snd_cwnd_clamp) else {
tp->snd_cwnd++;
} else {
bictcp_update(ca, tp->snd_cwnd); bictcp_update(ca, tp->snd_cwnd);
/* In dangerous area, increase slowly. /* In dangerous area, increase slowly.
......
...@@ -186,17 +186,25 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, ...@@ -186,17 +186,25 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
if (in_flight < tp->snd_cwnd) if (!tcp_is_cwnd_limited(sk, in_flight))
return; return;
if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* In "safe" area, increase. */ /* In "safe" area, increase. */
if (tp->snd_cwnd <= tp->snd_ssthresh)
tcp_slow_start(tp);
/* In dangerous area, increase slowly. */
else if (sysctl_tcp_abc) {
/* RFC3465: Apppriate Byte Count
* increase once for each full cwnd acked
*/
if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
if (tp->snd_cwnd < tp->snd_cwnd_clamp) if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++; tp->snd_cwnd++;
}
} else { } else {
/* In dangerous area, increase slowly. /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
*/
if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
if (tp->snd_cwnd < tp->snd_cwnd_clamp) if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++; tp->snd_cwnd++;
......
...@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk) ...@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk)
} }
static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
u32 in_flight, int good) u32 in_flight, u32 pkts_acked)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk); struct hstcp *ca = inet_csk_ca(sk);
if (in_flight < tp->snd_cwnd) if (!tcp_is_cwnd_limited(sk, in_flight))
return; return;
if (tp->snd_cwnd <= tp->snd_ssthresh) { if (tp->snd_cwnd <= tp->snd_ssthresh)
if (tp->snd_cwnd < tp->snd_cwnd_clamp) tcp_slow_start(tp);
tp->snd_cwnd++; else {
} else {
/* Update AIMD parameters */ /* Update AIMD parameters */
if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
......
...@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ...@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk); struct htcp *ca = inet_csk_ca(sk);
if (in_flight < tp->snd_cwnd) if (!tcp_is_cwnd_limited(sk, in_flight))
return; return;
if (tp->snd_cwnd <= tp->snd_ssthresh) { if (tp->snd_cwnd <= tp->snd_ssthresh)
/* In "safe" area, increase. */ tcp_slow_start(tp);
if (tp->snd_cwnd < tp->snd_cwnd_clamp) else {
tp->snd_cwnd++;
} else {
measure_rtt(sk); measure_rtt(sk);
/* keep track of number of round-trip times since last backoff event */ /* keep track of number of round-trip times since last backoff event */
......
...@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ...@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
ca->minrtt = tp->srtt; ca->minrtt = tp->srtt;
} }
if (!tcp_is_cwnd_limited(sk, in_flight))
return;
if (!ca->hybla_en) if (!ca->hybla_en)
return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
if (in_flight < tp->snd_cwnd)
return;
if (ca->rho == 0) if (ca->rho == 0)
hybla_recalc_param(sk); hybla_recalc_param(sk);
......
This diff is collapsed.
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
* request_sock handling and moved * request_sock handling and moved
* most of it into the af independent code. * most of it into the af independent code.
* Added tail drop and some other bugfixes. * Added tail drop and some other bugfixes.
* Added new listen sematics. * Added new listen semantics.
* Mike McLagan : Routing by source * Mike McLagan : Routing by source
* Juan Jose Ciarlante: ip_dynaddr bits * Juan Jose Ciarlante: ip_dynaddr bits
* Andi Kleen: various fixes. * Andi Kleen: various fixes.
...@@ -1210,7 +1210,7 @@ int tcp_v4_rcv(struct sk_buff *skb) ...@@ -1210,7 +1210,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
/* An explanation is required here, I think. /* An explanation is required here, I think.
* Packet length and doff are validated by header prediction, * Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is elimineted. * provided case of th->doff==0 is eliminated.
* So, we defer the checks. */ * So, we defer the checks. */
if ((skb->ip_summed != CHECKSUM_UNNECESSARY && if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
tcp_v4_checksum_init(skb))) tcp_v4_checksum_init(skb)))
......
...@@ -158,7 +158,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -158,7 +158,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
/* I am shamed, but failed to make it more elegant. /* I am shamed, but failed to make it more elegant.
* Yes, it is direct reference to IP, which is impossible * Yes, it is direct reference to IP, which is impossible
* to generalize to IPv6. Taking into account that IPv6 * to generalize to IPv6. Taking into account that IPv6
* do not undertsnad recycling in any case, it not * do not understand recycling in any case, it not
* a big problem in practice. --ANK */ * a big problem in practice. --ANK */
if (tw->tw_family == AF_INET && if (tw->tw_family == AF_INET &&
tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
...@@ -194,7 +194,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -194,7 +194,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
/* In window segment, it may be only reset or bare ack. */ /* In window segment, it may be only reset or bare ack. */
if (th->rst) { if (th->rst) {
/* This is TIME_WAIT assasination, in two flavors. /* This is TIME_WAIT assassination, in two flavors.
* Oh well... nobody has a sufficient solution to this * Oh well... nobody has a sufficient solution to this
* protocol bug yet. * protocol bug yet.
*/ */
...@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, ...@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
*/ */
newtp->snd_cwnd = 2; newtp->snd_cwnd = 2;
newtp->snd_cwnd_cnt = 0; newtp->snd_cwnd_cnt = 0;
newtp->bytes_acked = 0;
newtp->frto_counter = 0; newtp->frto_counter = 0;
newtp->frto_highmark = 0; newtp->frto_highmark = 0;
...@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, ...@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
/* RFC793 page 36: "If the connection is in any non-synchronized state ... /* RFC793 page 36: "If the connection is in any non-synchronized state ...
* and the incoming segment acknowledges something not yet * and the incoming segment acknowledges something not yet
* sent (the segment carries an unaccaptable ACK) ... * sent (the segment carries an unacceptable ACK) ...
* a reset is sent." * a reset is sent."
* *
* Invalid ACK: reset will be sent by listening socket * Invalid ACK: reset will be sent by listening socket
......
...@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss ...@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
u16 flags; u16 flags;
BUG_ON(len > skb->len); BUG_ON(len > skb->len);
clear_all_retrans_hints(tp);
nsize = skb_headlen(skb) - len; nsize = skb_headlen(skb) - len;
if (nsize < 0) if (nsize < 0)
nsize = 0; nsize = 0;
...@@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) ...@@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
for TCP options, but includes only bare TCP header. for TCP options, but includes only bare TCP header.
tp->rx_opt.mss_clamp is mss negotiated at connection setup. tp->rx_opt.mss_clamp is mss negotiated at connection setup.
It is minumum of user_mss and mss received with SYN. It is minimum of user_mss and mss received with SYN.
It also does not include TCP options. It also does not include TCP options.
tp->pmtu_cookie is last pmtu, seen by this function. tp->pmtu_cookie is last pmtu, seen by this function.
...@@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk) ...@@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk)
{ {
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
/* MSS for the peer's data. Previous verions used mss_clamp /* MSS for the peer's data. Previous versions used mss_clamp
* here. I don't know if the value based on our guesses * here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct * of peer's MSS is better for the performance. It's more correct
* but may be worse for the performance because of rcv_mss * but may be worse for the performance because of rcv_mss
...@@ -1260,6 +1262,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m ...@@ -1260,6 +1262,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
BUG_ON(tcp_skb_pcount(skb) != 1 || BUG_ON(tcp_skb_pcount(skb) != 1 ||
tcp_skb_pcount(next_skb) != 1); tcp_skb_pcount(next_skb) != 1);
/* changing transmit queue under us so clear hints */
clear_all_retrans_hints(tp);
/* Ok. We will be able to collapse the packet. */ /* Ok. We will be able to collapse the packet. */
__skb_unlink(next_skb, &sk->sk_write_queue); __skb_unlink(next_skb, &sk->sk_write_queue);
...@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk) ...@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk)
} }
} }
clear_all_retrans_hints(tp);
if (!lost) if (!lost)
return; return;
...@@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) ...@@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
int err; int err;
/* Do not sent more than we queued. 1/4 is reserved for possible /* Do not sent more than we queued. 1/4 is reserved for possible
* copying overhead: frgagmentation, tunneling, mangling etc. * copying overhead: fragmentation, tunneling, mangling etc.
*/ */
if (atomic_read(&sk->sk_wmem_alloc) > if (atomic_read(&sk->sk_wmem_alloc) >
min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
...@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb; struct sk_buff *skb;
int packet_cnt = tp->lost_out; int packet_cnt;
if (tp->retransmit_skb_hint) {
skb = tp->retransmit_skb_hint;
packet_cnt = tp->retransmit_cnt_hint;
}else{
skb = sk->sk_write_queue.next;
packet_cnt = 0;
}
/* First pass: retransmit lost packets. */ /* First pass: retransmit lost packets. */
if (packet_cnt) { if (tp->lost_out) {
sk_stream_for_retrans_queue(skb, sk) { sk_stream_for_retrans_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked; __u8 sacked = TCP_SKB_CB(skb)->sacked;
/* we could do better than to assign each time */
tp->retransmit_skb_hint = skb;
tp->retransmit_cnt_hint = packet_cnt;
/* Assume this retransmit will generate /* Assume this retransmit will generate
* only one packet for congestion window * only one packet for congestion window
* calculation purposes. This works because * calculation purposes. This works because
...@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
return; return;
if (sacked&TCPCB_LOST) { if (sacked & TCPCB_LOST) {
if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
if (tcp_retransmit_skb(sk, skb)) if (tcp_retransmit_skb(sk, skb)) {
tp->retransmit_skb_hint = NULL;
return; return;
}
if (icsk->icsk_ca_state != TCP_CA_Loss) if (icsk->icsk_ca_state != TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
else else
...@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
TCP_RTO_MAX); TCP_RTO_MAX);
} }
packet_cnt -= tcp_skb_pcount(skb); packet_cnt += tcp_skb_pcount(skb);
if (packet_cnt <= 0) if (packet_cnt >= tp->lost_out)
break; break;
} }
} }
...@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_may_send_now(sk, tp)) if (tcp_may_send_now(sk, tp))
return; return;
if (tp->forward_skb_hint) {
skb = tp->forward_skb_hint;
packet_cnt = tp->forward_cnt_hint;
} else{
skb = sk->sk_write_queue.next;
packet_cnt = 0; packet_cnt = 0;
}
sk_stream_for_retrans_queue_from(skb, sk) {
tp->forward_cnt_hint = packet_cnt;
tp->forward_skb_hint = skb;
sk_stream_for_retrans_queue(skb, sk) {
/* Similar to the retransmit loop above we /* Similar to the retransmit loop above we
* can pretend that the retransmitted SKB * can pretend that the retransmitted SKB
* we send out here will be composed of one * we send out here will be composed of one
...@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
continue; continue;
/* Ok, retransmit it. */ /* Ok, retransmit it. */
if (tcp_retransmit_skb(sk, skb)) if (tcp_retransmit_skb(sk, skb)) {
tp->forward_skb_hint = NULL;
break; break;
}
if (skb == skb_peek(&sk->sk_write_queue)) if (skb == skb_peek(&sk->sk_write_queue))
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
...@@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect); ...@@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect);
EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_make_synack);
EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(tcp_sync_mss); EXPORT_SYMBOL(tcp_sync_mss);
EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
...@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ...@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int flag) u32 in_flight, int flag)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
if (in_flight < tp->snd_cwnd)
if (!tcp_is_cwnd_limited(sk, in_flight))
return; return;
if (tp->snd_cwnd <= tp->snd_ssthresh) { if (tp->snd_cwnd <= tp->snd_ssthresh)
tp->snd_cwnd++; tcp_slow_start(tp);
} else { else {
tp->snd_cwnd_cnt++; tp->snd_cwnd_cnt++;
if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++; tp->snd_cwnd++;
tp->snd_cwnd_cnt = 0; tp->snd_cwnd_cnt = 0;
} }
} }
tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
tp->snd_cwnd_stamp = tcp_time_stamp;
} }
static u32 tcp_scalable_ssthresh(struct sock *sk) static u32 tcp_scalable_ssthresh(struct sock *sk)
......
...@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk) ...@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk)
* to prevent DoS attacks. It is called when a retransmission timeout * to prevent DoS attacks. It is called when a retransmission timeout
* or zero probe timeout occurs on orphaned socket. * or zero probe timeout occurs on orphaned socket.
* *
* Criterium is still not confirmed experimentally and may change. * Criteria is still not confirmed experimentally and may change.
* We kill the socket, if: * We kill the socket, if:
* 1. If number of orphaned sockets exceeds an administratively configured * 1. If number of orphaned sockets exceeds an administratively configured
* limit. * limit.
...@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk) ...@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk)
hole detection. :-( hole detection. :-(
It is place to make it. It is not made. I do not want It is place to make it. It is not made. I do not want
to make it. It is disguisting. It does not work in any to make it. It is disgusting. It does not work in any
case. Let me to cite the same draft, which requires for case. Let me to cite the same draft, which requires for
us to implement this: us to implement this:
......
...@@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, ...@@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
/* We don't have enough RTT samples to do the Vegas /* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno. * calculation, so we'll behave like Reno.
*/ */
if (tp->snd_cwnd > tp->snd_ssthresh) tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, cnt);
tp->snd_cwnd++;
} else { } else {
u32 rtt, target_cwnd, diff; u32 rtt, target_cwnd, diff;
...@@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, ...@@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
*/ */
diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
if (tp->snd_cwnd < tp->snd_ssthresh) { if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */ /* Slow start. */
if (diff > gamma) { if (diff > gamma) {
/* Going too fast. Time to slow down /* Going too fast. Time to slow down
...@@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, ...@@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
V_PARAM_SHIFT)+1); V_PARAM_SHIFT)+1);
} }
tcp_slow_start(tp);
} else { } else {
/* Congestion avoidance. */ /* Congestion avoidance. */
u32 next_snd_cwnd; u32 next_snd_cwnd;
...@@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, ...@@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
else if (next_snd_cwnd < tp->snd_cwnd) else if (next_snd_cwnd < tp->snd_cwnd)
tp->snd_cwnd--; tp->snd_cwnd--;
} }
if (tp->snd_cwnd < 2)
tp->snd_cwnd = 2;
else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
tp->snd_cwnd = tp->snd_cwnd_clamp;
}
} }
/* Wipe the slate clean for the next RTT. */ /* Wipe the slate clean for the next RTT. */
vegas->cntRTT = 0; vegas->cntRTT = 0;
vegas->minRTT = 0x7fffffff; vegas->minRTT = 0x7fffffff;
}
/* The following code is executed for every ack we receive,
* except for conditions checked in should_advance_cwnd()
* before the call to tcp_cong_avoid(). Mainly this means that
* we only execute this code if the ack actually acked some
* data.
*/
/* If we are in slow start, increase our cwnd in response to this ACK.
* (If we are not in slow start then we are in congestion avoidance,
* and adjust our congestion window only once per RTT. See the code
* above.)
*/
if (tp->snd_cwnd <= tp->snd_ssthresh)
tp->snd_cwnd++;
/* to keep cwnd from growing without bound */
tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
/* Make sure that we are never so timid as to reduce our cwnd below
* 2 MSS.
*
* Going below 2 MSS would risk huge delayed ACKs from our receiver.
*/
tp->snd_cwnd = max(tp->snd_cwnd, 2U);
} }
/* Extract info for Tcp socket info provided via netlink. */ /* Extract info for Tcp socket info provided via netlink. */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment