Commit 834e0ecf authored by David S. Miller's avatar David S. Miller

Merge branch 'tcp-remove-prequeue-and-header-prediction'

Florian Westphal says:

====================
tcp: remove prequeue and header prediction

During a hallway discussion with Eric Dumazet at Netdev 1.2 in
Tokyo some maybe-not-so-useful-anymore TCP stack features came up,
among these header prediction and prequeueing.

In brief, TCP prequeue assumes a single-process-blocking-read design,
which is not that common anymore. The most frequently used high-performance
networking program that is an excellent fit for these features is netperf.

The idea behind prequeueing is to move part of tcp processing, including
retransmit queue cleaning, to process context.

With (e)poll designs, prequeue is always skipped, so for such programs
this is dead-code removal.

Header prediction is also less useful nowadays.
For packet trains, GRO will do packet aggregation so we do not get the
per-packet benefit that this had before GRO anymore.

Because of SACK, header prediction also will be ineffective once
a connection suffers even light packet losses.

code removal aside, after this change processing always occurs in BH
context, this allows to experiment e.g. with doing bulk freeing of
skb heads when incoming ACKs clean packets from the retransmit queue.

There are no changes since the RFC, except in last patch (i missed
another no-longer-used mib counter). I also edited a few commit messages.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 764646b0 3282e655
......@@ -353,12 +353,7 @@ tcp_l3mdev_accept - BOOLEAN
compiled with CONFIG_NET_L3_MASTER_DEV.
tcp_low_latency - BOOLEAN
If set, the TCP stack makes decisions that prefer lower
latency as opposed to higher throughput. By default, this
option is not set meaning that higher throughput is preferred.
An example of an application where this default should be
changed would be a Beowulf compute cluster.
Default: 0
This is a legacy option, it has no effect anymore.
tcp_max_orphans - INTEGER
Maximal number of TCP sockets not attached to any user file handle,
......
......@@ -147,12 +147,6 @@ struct tcp_sock {
u16 tcp_header_len; /* Bytes of tcp header to send */
u16 gso_segs; /* Max number of segs per GSO packet */
/*
* Header prediction flags
* 0x5?10 << 16 + snd_wnd in net byte order
*/
__be32 pred_flags;
/*
* RFC793 variables by their proper names. This means you can
* read the code and the spec side by side (and laugh ...)
......@@ -192,15 +186,6 @@ struct tcp_sock {
struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
/* Data for direct copy to user */
struct {
struct sk_buff_head prequeue;
struct task_struct *task;
struct msghdr *msg;
int memory;
int len;
} ucopy;
u32 snd_wl1; /* Sequence for window update */
u32 snd_wnd; /* The window we expect to receive */
u32 max_window; /* Maximal window ever seen from peer */
......
......@@ -256,7 +256,6 @@ extern int sysctl_tcp_rmem[3];
extern int sysctl_tcp_app_win;
extern int sysctl_tcp_adv_win_scale;
extern int sysctl_tcp_frto;
extern int sysctl_tcp_low_latency;
extern int sysctl_tcp_nometrics_save;
extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor;
......@@ -632,29 +631,6 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
}
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
ntohl(TCP_FLAG_ACK) |
snd_wnd);
}
static inline void tcp_fast_path_on(struct tcp_sock *tp)
{
__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}
static inline void tcp_fast_path_check(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
tp->rcv_wnd &&
atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
!tp->urg_data)
tcp_fast_path_on(tp);
}
/* Compute the actual rto_min value */
static inline u32 tcp_rto_min(struct sock *sk)
{
......@@ -904,9 +880,8 @@ enum tcp_ca_event {
/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
enum tcp_ca_ack_event_flags {
CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */
CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */
CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */
CA_ACK_WIN_UPDATE = (1 << 0), /* ACK updated window */
CA_ACK_ECE = (1 << 1), /* ECE bit is set on ack */
};
/*
......@@ -1244,17 +1219,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
__tcp_checksum_complete(skb);
}
/* Prequeue for VJ style copy to user, combined with checksumming. */
static inline void tcp_prequeue_init(struct tcp_sock *tp)
{
tp->ucopy.task = NULL;
tp->ucopy.len = 0;
tp->ucopy.memory = 0;
skb_queue_head_init(&tp->ucopy.prequeue);
}
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb);
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
int tcp_filter(struct sock *sk, struct sk_buff *skb);
......
......@@ -184,14 +184,7 @@ enum
LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */
LINUX_MIB_LISTENOVERFLOWS, /* ListenOverflows */
LINUX_MIB_LISTENDROPS, /* ListenDrops */
LINUX_MIB_TCPPREQUEUED, /* TCPPrequeued */
LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, /* TCPDirectCopyFromBacklog */
LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, /* TCPDirectCopyFromPrequeue */
LINUX_MIB_TCPPREQUEUEDROPPED, /* TCPPrequeueDropped */
LINUX_MIB_TCPHPHITS, /* TCPHPHits */
LINUX_MIB_TCPHPHITSTOUSER, /* TCPHPHitsToUser */
LINUX_MIB_TCPPUREACKS, /* TCPPureAcks */
LINUX_MIB_TCPHPACKS, /* TCPHPAcks */
LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */
LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */
LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */
......@@ -208,14 +201,12 @@ enum
LINUX_MIB_TCPSACKFAILURES, /* TCPSackFailures */
LINUX_MIB_TCPLOSSFAILURES, /* TCPLossFailures */
LINUX_MIB_TCPFASTRETRANS, /* TCPFastRetrans */
LINUX_MIB_TCPFORWARDRETRANS, /* TCPForwardRetrans */
LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */
LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */
LINUX_MIB_TCPLOSSPROBES, /* TCPLossProbes */
LINUX_MIB_TCPLOSSPROBERECOVERY, /* TCPLossProbeRecovery */
LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */
LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */
LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */
LINUX_MIB_TCPRCVCOLLAPSED, /* TCPRcvCollapsed */
LINUX_MIB_TCPDSACKOLDSENT, /* TCPDSACKOldSent */
LINUX_MIB_TCPDSACKOFOSENT, /* TCPDSACKOfoSent */
......
......@@ -206,14 +206,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
......@@ -230,14 +223,12 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
......
......@@ -45,6 +45,9 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
/* Update system visible IP port range */
static void set_local_port_range(struct net *net, int range[2])
{
......
......@@ -400,7 +400,6 @@ void tcp_init_sock(struct sock *sk)
tp->out_of_order_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
INIT_LIST_HEAD(&tp->tsq_node);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
......@@ -1525,20 +1524,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
tcp_send_ack(sk);
}
static void tcp_prequeue_process(struct sock *sk)
{
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb);
/* Clear memory counter. */
tp->ucopy.memory = 0;
}
static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
......@@ -1671,7 +1656,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int err;
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
struct sk_buff *skb, *last;
u32 urg_hole = 0;
......@@ -1806,51 +1790,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
tcp_cleanup_rbuf(sk, copied);
if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
/* Install new reader */
if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
user_recv = current;
tp->ucopy.task = user_recv;
tp->ucopy.msg = msg;
}
tp->ucopy.len = len;
WARN_ON(tp->copied_seq != tp->rcv_nxt &&
!(flags & (MSG_PEEK | MSG_TRUNC)));
/* Ugly... If prequeue is not empty, we have to
* process it before releasing socket, otherwise
* order will be broken at second iteration.
* More elegant solution is required!!!
*
* Look: we have the following (pseudo)queues:
*
* 1. packets in flight
* 2. backlog
* 3. prequeue
* 4. receive_queue
*
* Each queue can be processed only if the next ones
* are empty. At this point we have empty receive_queue.
* But prequeue _can_ be not empty after 2nd iteration,
* when we jumped to start of loop because backlog
* processing added something to receive_queue.
* We cannot release_sock(), because backlog contains
* packets arrived _after_ prequeued ones.
*
* Shortly, algorithm is clear --- to process all
* the queues in order. We could make it more directly,
* requeueing packets from backlog to prequeue, if
* is not empty. It is more elegant, but eats cycles,
* unfortunately.
*/
if (!skb_queue_empty(&tp->ucopy.prequeue))
goto do_prequeue;
/* __ Set realtime policy in scheduler __ */
}
if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
......@@ -1859,31 +1798,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
sk_wait_data(sk, &timeo, last);
}
if (user_recv) {
int chunk;
/* __ Restore normal policy in scheduler __ */
chunk = len - tp->ucopy.len;
if (chunk != 0) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
len -= chunk;
copied += chunk;
}
if (tp->rcv_nxt == tp->copied_seq &&
!skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
tcp_prequeue_process(sk);
chunk = len - tp->ucopy.len;
if (chunk != 0) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
}
}
}
if ((flags & MSG_PEEK) &&
(peek_seq - copied - urg_hole != tp->copied_seq)) {
net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
......@@ -1934,10 +1848,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
tcp_rcv_space_adjust(sk);
skip_copy:
if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
tp->urg_data = 0;
tcp_fast_path_check(sk);
}
if (used + offset < skb->len)
continue;
......@@ -1955,25 +1867,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
break;
} while (len > 0);
if (user_recv) {
if (!skb_queue_empty(&tp->ucopy.prequeue)) {
int chunk;
tp->ucopy.len = copied > 0 ? len : 0;
tcp_prequeue_process(sk);
if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
}
}
tp->ucopy.task = NULL;
tp->ucopy.len = 0;
}
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
......
......@@ -103,7 +103,6 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
......@@ -3367,12 +3366,6 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
/* Note, it is the only place, where
* fast path is recovered for sending TCP.
*/
tp->pred_flags = 0;
tcp_fast_path_check(sk);
if (tcp_send_head(sk))
tcp_slow_start_after_idle_check(sk);
......@@ -3554,6 +3547,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
u32 ack_ev_flags = 0;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
......@@ -3597,21 +3591,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (flag & FLAG_UPDATE_TS_RECENT)
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
tcp_update_wl(tp, ack_seq);
tcp_snd_una_update(tp, ack);
flag |= FLAG_WIN_UPDATE;
tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
u32 ack_ev_flags = CA_ACK_SLOWPATH;
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
......@@ -3625,14 +3604,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
flag |= FLAG_ECE;
ack_ev_flags |= CA_ACK_ECE;
ack_ev_flags = CA_ACK_ECE;
}
if (flag & FLAG_WIN_UPDATE)
ack_ev_flags |= CA_ACK_WIN_UPDATE;
tcp_in_ack_event(sk, ack_ev_flags);
}
/* We passed data and got it acked, remove any soft error
* log. Something worked...
......@@ -4398,8 +4376,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
return;
}
/* Disable header prediction. */
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
......@@ -4611,23 +4587,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
goto out_of_window;
/* Ok. In sequence. In window. */
if (tp->ucopy.task == current &&
tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
sock_owned_by_user(sk) && !tp->urg_data) {
int chunk = min_t(unsigned int, skb->len,
tp->ucopy.len);
__set_current_state(TASK_RUNNING);
if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
eaten = (chunk == skb->len);
tcp_rcv_space_adjust(sk);
}
}
if (eaten <= 0) {
queue_and_out:
if (eaten < 0) {
if (skb_queue_len(&sk->sk_receive_queue) == 0)
......@@ -4636,7 +4595,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
goto drop;
}
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (skb->len)
tcp_event_data_recv(sk, skb);
......@@ -4656,8 +4614,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
tcp_fast_path_check(sk);
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
......@@ -4983,7 +4939,6 @@ static int tcp_prune_queue(struct sock *sk)
NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */
tp->pred_flags = 0;
return -1;
}
......@@ -5155,9 +5110,6 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = ptr;
/* Disable header prediction. */
tp->pred_flags = 0;
}
/* This is the 'fast' part of urgent handling. */
......@@ -5186,26 +5138,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
}
}
static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
{
struct tcp_sock *tp = tcp_sk(sk);
int chunk = skb->len - hlen;
int err;
if (skb_csum_unnecessary(skb))
err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
else
err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
if (!err) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
tcp_rcv_space_adjust(sk);
}
return err;
}
/* Accept RST for rcv_nxt - 1 after a FIN.
* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
* FIN is sent followed by a RST packet. The RST is sent with the same
......@@ -5336,26 +5268,6 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
/*
* TCP receive function for the ESTABLISHED state.
*
* It is split into a fast path and a slow path. The fast path is
* disabled when:
* - A zero window was announced from us - zero window probing
* is only handled properly in the slow path.
* - Out of order segments arrived.
* - Urgent data is expected.
* - There is no buffer space left
* - Unexpected TCP flags/window values/header lengths are received
* (detected by checking the TCP header against pred_flags)
* - Data is sent in both directions. Fast path only supports pure senders
* or pure receivers (this means either the sequence number or the ack
* value must stay constant)
* - Unexpected TCP option.
*
* When these conditions are not satisfied it drops into a standard
* receive procedure patterned after RFC793 to handle all cases.
* The first three cases are guaranteed by proper pred_flags setting,
* the rest is checked inline. Fast processing is turned on in
* tcp_data_queue when everything is OK.
*/
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th)
......@@ -5366,172 +5278,19 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_mstamp_refresh(tp);
if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
* Header prediction.
* The code loosely follows the one in the famous
* "30 instruction TCP receive" Van Jacobson mail.
*
* Van's trick is to deposit buffers into socket queue
* on a device interrupt, to call tcp_recv function
* on the receive process context and checksum and copy
* the buffer to user space. smart...
*
* Our current scheme is not silly either but we take the
* extra cost of the net_bh soft interrupt processing...
* We do checksum and copy also but from device to kernel.
*/
tp->rx_opt.saw_tstamp = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
* 'S' will always be tp->tcp_header_len >> 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
* turn it off (when there are holes in the receive
* space for instance)
* PSH flag is ignored.
*/
if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;
/* Timestamp header prediction: tcp_header_len
* is automatically equal to th->doff*4 due to pred_flags
* match.
*/
/* Check timestamp */
if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
/* No? Slow path! */
if (!tcp_parse_aligned_timestamp(tp, th))
goto slow_path;
/* If PAWS failed, check it more carefully in slow path */
if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
goto slow_path;
/* DO NOT update ts_recent here, if checksum fails
* and timestamp was corrupted part, it will result
* in a hung connection since we will drop all
* future packets due to the PAWS test.
*/
}
if (len <= tcp_header_len) {
/* Bulk data transfer: sender */
if (len == tcp_header_len) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
/* We know that such packets are checksummed
* on entry.
*/
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return;
} else { /* Header too small */
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
int eaten = 0;
bool fragstolen = false;
if (tp->ucopy.task == current &&
tp->copied_seq == tp->rcv_nxt &&
len - tcp_header_len <= tp->ucopy.len &&
sock_owned_by_user(sk)) {
__set_current_state(TASK_RUNNING);
if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) +
TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
tcp_rcv_rtt_measure_ts(sk, skb);
__skb_pull(skb, tcp_header_len);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHPHITSTOUSER);
eaten = 1;
}
}
if (!eaten) {
if (tcp_checksum_complete(skb))
goto csum_error;
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
tcp_rcv_rtt_measure_ts(sk, skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
&fragstolen);
}
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
/* Well, only one small jumplet in fast path... */
tcp_ack(sk, skb, FLAG_DATA);
tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
}
__tcp_ack_snd_check(sk, 0);
no_ack:
if (eaten)
kfree_skb_partial(skb, fragstolen);
sk->sk_data_ready(sk);
return;
}
}
slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn)
goto discard;
/*
* Standard slow path.
*/
if (!tcp_validate_incoming(sk, skb, th, 1))
return;
step5:
if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
......@@ -5585,11 +5344,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
if (sock_flag(sk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
if (!tp->rx_opt.snd_wscale)
__tcp_fast_path_on(tp, tp->snd_wnd);
else
tp->pred_flags = 0;
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
}
}
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
......@@ -5718,7 +5476,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_ecn_rcv_synack(tp, th);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_ack(sk, skb, FLAG_SLOWPATH);
tcp_ack(sk, skb, 0);
/* Ok.. it's good. Set up sequence numbers and
* move to established.
......@@ -5954,8 +5712,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
return 0;
/* step 5: check the ACK field */
acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
FLAG_UPDATE_TS_RECENT |
acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT |
FLAG_NO_CHALLENGE_ACK) > 0;
if (!acceptable) {
......@@ -6023,7 +5781,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk);
tcp_fast_path_on(tp);
break;
case TCP_FIN_WAIT1: {
......
......@@ -85,8 +85,6 @@
#include <crypto/hash.h>
#include <linux/scatterlist.h>
int sysctl_tcp_low_latency __read_mostly;
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
__be32 daddr, __be32 saddr, const struct tcphdr *th);
......@@ -1541,61 +1539,6 @@ void tcp_v4_early_demux(struct sk_buff *skb)
}
}
/* Packet is added to VJ-style prequeue for processing in process
* context, if a reader task is waiting. Apparently, this exciting
* idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
* failed somewhere. Latency? Burstiness? Well, at least now we will
* see, why it failed. 8)8) --ANK
*
*/
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
if (sysctl_tcp_low_latency || !tp->ucopy.task)
return false;
if (skb->len <= tcp_hdrlen(skb) &&
skb_queue_len(&tp->ucopy.prequeue) == 0)
return false;
/* Before escaping RCU protected region, we need to take care of skb
* dst. Prequeue is only enabled for established sockets.
* For such sockets, we might need the skb dst only to set sk->sk_rx_dst
* Instead of doing full sk_rx_dst validity here, let's perform
* an optimistic check.
*/
if (likely(sk->sk_rx_dst))
skb_dst_drop(skb);
else
skb_dst_force_safe(skb);
__skb_queue_tail(&tp->ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
struct sk_buff *skb1;
BUG_ON(sock_owned_by_user(sk));
__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
skb_queue_len(&tp->ucopy.prequeue));
while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb1);
tp->ucopy.memory = 0;
} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
wake_up_interruptible_sync_poll(sk_sleep(sk),
POLLIN | POLLRDNORM | POLLRDBAND);
if (!inet_csk_ack_scheduled(sk))
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
(3 * tcp_rto_min(sk)) / 4,
TCP_RTO_MAX);
}
return true;
}
EXPORT_SYMBOL(tcp_prequeue);
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
......@@ -1770,7 +1713,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);
} else if (tcp_add_backlog(sk, skb)) {
goto discard_and_relse;
......@@ -1936,9 +1878,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
}
#endif
/* Clean prequeue, it must be empty really */
__skb_queue_purge(&tp->ucopy.prequeue);
/* Clean up a referenced TCP bind bucket. */
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
......
......@@ -436,8 +436,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
struct tcp_sock *newtp = tcp_sk(newsk);
/* Now setup tcp_sock */
newtp->pred_flags = 0;
newtp->rcv_wup = newtp->copied_seq =
newtp->rcv_nxt = treq->rcv_isn + 1;
newtp->segs_in = 1;
......@@ -445,7 +443,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_sml = newtp->snd_una =
newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
tcp_prequeue_init(newtp);
INIT_LIST_HEAD(&newtp->tsq_node);
tcp_init_wl(newtp, treq->rcv_isn);
......
......@@ -295,9 +295,7 @@ static u16 tcp_select_window(struct sock *sk)
/* RFC1323 scaling applied */
new_win >>= tp->rx_opt.rcv_wscale;
/* If we advertise zero window, disable fast path. */
if (new_win == 0) {
tp->pred_flags = 0;
if (old_win)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPTOZEROWINDOWADV);
......
......@@ -239,7 +239,6 @@ static int tcp_write_timeout(struct sock *sk)
/* Called with BH disabled */
void tcp_delack_timer_handler(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
sk_mem_reclaim_partial(sk);
......@@ -254,17 +253,6 @@ void tcp_delack_timer_handler(struct sock *sk)
}
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
if (!skb_queue_empty(&tp->ucopy.prequeue)) {
struct sk_buff *skb;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb);
tp->ucopy.memory = 0;
}
if (inet_csk_ack_scheduled(sk)) {
if (!icsk->icsk_ack.pingpong) {
/* Delayed ACK missed: inflate ATO. */
......
......@@ -153,24 +153,6 @@ static inline void update_rtt_min(struct westwood *w)
w->rtt_min = min(w->rtt, w->rtt_min);
}
/*
* @westwood_fast_bw
* It is called when we are in fast path. In particular it is called when
* header prediction is successful. In such case in fact update is
* straight forward and doesn't need any particular care.
*/
static inline void westwood_fast_bw(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct westwood *w = inet_csk_ca(sk);
westwood_update_window(sk);
w->bk += tp->snd_una - w->snd_una;
w->snd_una = tp->snd_una;
update_rtt_min(w);
}
/*
* @westwood_acked_count
* This function evaluates cumul_ack for evaluating bk in case of
......@@ -223,17 +205,12 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
{
if (ack_flags & CA_ACK_SLOWPATH) {
struct westwood *w = inet_csk_ca(sk);
westwood_update_window(sk);
w->bk += westwood_acked_count(sk);
update_rtt_min(w);
return;
}
westwood_fast_bw(sk);
}
static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
......
......@@ -1505,7 +1505,6 @@ static int tcp_v6_rcv(struct sk_buff *skb)
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
ret = tcp_v6_do_rcv(sk, skb);
} else if (tcp_add_backlog(sk, skb)) {
goto discard_and_relse;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment