Merge

d5ae22d9 · David S. Miller · 5e051f29 · fe4f70cf · d5ae22d9 · d5ae22d9
Commit d5ae22d9 authored Aug 28, 2002 by David S. Miller
21 changed files
--- a/drivers/net/e1000/e1000.h
+++ b/drivers/net/e1000/e1000.h
@@ -63,6 +63,7 @@
 #include <net/pkt_sched.h>
 #include <linux/list.h>
 #include <linux/reboot.h>
+#include <net/checksum.h>
 #include <linux/tqueue.h>
 #include <linux/ethtool.h>
 #include <linux/if_vlan.h>

--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -427,6 +427,11 @@ e1000_probe(struct pci_dev *pdev,
 		netdev->features = NETIF_F_SG;
 	}

+#ifdef NETIF_F_TSO
+	if(adapter->hw.mac_type >= e1000_82544)
+		netdev->features |= NETIF_F_TSO;
+#endif
+ 
 	if(pci_using_dac)
 		netdev->features |= NETIF_F_HIGHDMA;

@@ -1284,9 +1289,62 @@ e1000_watchdog(unsigned long data)

 #define E1000_TX_FLAGS_CSUM		0x00000001
 #define E1000_TX_FLAGS_VLAN		0x00000002
+#define E1000_TX_FLAGS_TSO		0x00000004
 #define E1000_TX_FLAGS_VLAN_MASK	0xffff0000
 #define E1000_TX_FLAGS_VLAN_SHIFT	16

+static inline boolean_t
+e1000_tso(struct e1000_adapter *adapter, struct sk_buff *skb, int tx_flags)
+{
+#ifdef NETIF_F_TSO
+	/* Fill the context descriptor at tx_ring.next_to_use for a large-send
+	 * skb and advance the ring; TRUE iff one was written. tx_flags unused. */
+	struct e1000_context_desc *ctx;
+	struct iphdr *iph = skb->nh.iph;
+	struct tcphdr *tcph = skb->h.th;
+	unsigned char *frame = skb->data;
+	uint8_t ipcss, ipcso, tucss, tucso, hdr_len;
+	uint16_t ipcse, mss;
+	int i;
+
+	if(!skb_shinfo(skb)->tso_size)
+		return FALSE;
+
+	mss = skb_shinfo(skb)->tso_size;
+	hdr_len = (skb->h.raw - frame) + (tcph->doff << 2);
+	iph->tot_len = 0;	/* length and checksums are cleared/seeded here */
+	iph->check = 0;
+	tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+	                                 0, IPPROTO_TCP, 0);
+
+	/* All offsets below are measured from skb->data. */
+	ipcss = skb->nh.raw - frame;
+	ipcso = (unsigned char *)&iph->check - frame;
+	ipcse = skb->h.raw - frame - 1;
+	tucss = skb->h.raw - frame;
+	tucso = (unsigned char *)&tcph->check - frame;
+
+	i = adapter->tx_ring.next_to_use;
+	ctx = E1000_CONTEXT_DESC(adapter->tx_ring, i);
+	ctx->lower_setup.ip_fields.ipcss  = ipcss;
+	ctx->lower_setup.ip_fields.ipcso  = ipcso;
+	ctx->lower_setup.ip_fields.ipcse  = cpu_to_le16(ipcse);
+	ctx->upper_setup.tcp_fields.tucss = tucss;
+	ctx->upper_setup.tcp_fields.tucso = tucso;
+	ctx->upper_setup.tcp_fields.tucse = cpu_to_le16(0);
+	ctx->tcp_seg_setup.fields.mss     = cpu_to_le16(mss);
+	ctx->tcp_seg_setup.fields.hdr_len = hdr_len;
+	ctx->cmd_and_length = cpu_to_le32(adapter->txd_cmd |
+		E1000_TXD_CMD_DEXT | E1000_TXD_CMD_TSE | E1000_TXD_CMD_IP |
+		E1000_TXD_CMD_TCP | (skb->len - hdr_len));
+
+	adapter->tx_ring.next_to_use = (i + 1) % adapter->tx_ring.count;
+	return TRUE;
+#else
+	return FALSE;
+#endif
+}
+
 static inline boolean_t
 e1000_tx_csum(struct e1000_adapter *adapter, struct sk_buff *skb)
 {
@@ -1386,6 +1444,12 @@ e1000_tx_queue(struct e1000_adapter *adapter, int count, int tx_flags)
 	txd_upper = 0;
 	txd_lower = adapter->txd_cmd;

+	if(tx_flags & E1000_TX_FLAGS_TSO) {
+		txd_lower |= E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
+		             E1000_TXD_CMD_TSE;
+		txd_upper |= (E1000_TXD_POPTS_IXSM | E1000_TXD_POPTS_TXSM) << 8;
+	}
+
 	if(tx_flags & E1000_TX_FLAGS_CSUM) {
 		txd_lower |= E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
 		txd_upper |= E1000_TXD_POPTS_TXSM << 8;
@@ -1435,22 +1499,29 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 	for(f = 0; f < skb_shinfo(skb)->nr_frags; f++)
 		count += TXD_USE_COUNT(skb_shinfo(skb)->frags[f].size,
 		                       adapter->max_data_per_txd);
+#ifdef NETIF_F_TSO
+	if((skb_shinfo(skb)->tso_size) || (skb->ip_summed == CHECKSUM_HW))
+		count++;
+#else
 	if(skb->ip_summed == CHECKSUM_HW)
 		count++;
+#endif

 	if(E1000_DESC_UNUSED(&adapter->tx_ring) < count) {
 		netif_stop_queue(netdev);
 		return 1;
 	}

-	if(e1000_tx_csum(adapter, skb))
-		tx_flags |= E1000_TX_FLAGS_CSUM;
-
 	if(adapter->vlgrp && vlan_tx_tag_present(skb)) {
 		tx_flags |= E1000_TX_FLAGS_VLAN;
 		tx_flags |= (vlan_tx_tag_get(skb) << E1000_TX_FLAGS_VLAN_SHIFT);
 	}

+	if(e1000_tso(adapter, skb, tx_flags))
+		tx_flags |= E1000_TX_FLAGS_TSO;
+	else if(e1000_tx_csum(adapter, skb))
+		tx_flags |= E1000_TX_FLAGS_CSUM;
+
 	count = e1000_tx_map(adapter, skb);

 	e1000_tx_queue(adapter, count, tx_flags);

--- a/drivers/net/e1000/e1000_proc.c
+++ b/drivers/net/e1000/e1000_proc.c
@@ -622,9 +622,12 @@ e1000_proc_list_setup(struct e1000_adapter *adapter)
 	LIST_ADD_U("Rx_Long_Length_Errors", &adapter->stats.roc);
 	LIST_ADD_U("Rx_Short_Length_Errors", &adapter->stats.ruc);
 	
-	/* The 82542 does not have an alignment error count register */
-	if(adapter->hw.mac_type >= e1000_82543)
+	/* The 82542 does not have some of these stats */
+	if(adapter->hw.mac_type >= e1000_82543) {
 		LIST_ADD_U("Rx_Align_Errors", &adapter->stats.algnerrc);
+		LIST_ADD_U("Tx_TCP_Seg_Good", &adapter->stats.tsctc);
+		LIST_ADD_U("Tx_TCP_Seg_Failed", &adapter->stats.tsctfc);
+	}
 	
 	LIST_ADD_U("Rx_Flow_Control_XON", &adapter->stats.xonrxc);
 	LIST_ADD_U("Rx_Flow_Control_XOFF", &adapter->stats.xoffrxc);

--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -49,11 +49,72 @@
 #include <linux/etherdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
+#include <net/checksum.h>
 #include <linux/if_ether.h>	/* For the statistics structure. */
 #include <linux/if_arp.h>	/* For ARPHRD_ETHER */
+#include <linux/ip.h>
+#include <linux/tcp.h>

 #define LOOPBACK_OVERHEAD (128 + MAX_HEADER + 16 + 16)

+/* KISS: just allocate small chunks and copy bits.
+ *
+ * In effect this documents what we expect a large-send (TSO) device to
+ * do, except for the TCP checksum, which loopback ignores.
+ */
+
+static void emulate_large_send_offload(struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct tcphdr *th = (struct tcphdr*)(skb->nh.raw + (iph->ihl * 4));
+	unsigned int hdrs = (iph->ihl + th->doff) * 4;	/* IP + TCP header bytes */
+	unsigned int seg_max = skb_shinfo(skb)->tso_size + hdrs;
+	unsigned int sent = 0;	/* payload bytes already passed to netif_rx() */
+	u32 seq = ntohl(th->seq);
+	u16 id  = ntohs(iph->id);
+
+	while (sent + hdrs < skb->len) {
+		unsigned int chunk = min(seg_max, skb->len - sent) - hdrs;
+		struct sk_buff *nskb = alloc_skb(seg_max + 32, GFP_ATOMIC);
+		unsigned char *pkt;
+
+		if (!nskb)
+			break;	/* allocation failed: remaining payload is not delivered */
+		skb_reserve(nskb, 32);
+		nskb->mac.raw = nskb->data - 14;
+		nskb->nh.raw = nskb->data;
+		pkt = skb_put(nskb, hdrs + chunk);	/* headers, then this slice of payload */
+		memcpy(pkt, skb->nh.raw, hdrs);
+		if (skb_copy_bits(skb, hdrs + sent, pkt + hdrs, chunk))
+			BUG();
+
+		nskb->dev = skb->dev;
+		nskb->dst = dst_clone(skb->dst);
+		nskb->protocol = skb->protocol;
+		nskb->priority = skb->priority;
+		nskb->pkt_type = skb->pkt_type;
+		nskb->ip_summed = CHECKSUM_UNNECESSARY;
+		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+
+		/* Patch the per-segment header fields and redo the IP checksum. */
+		iph = nskb->nh.iph;
+		th = (struct tcphdr*)(nskb->nh.raw + iph->ihl*4);
+		iph->tot_len = htons(hdrs + chunk);
+		iph->id = htons(id);
+		iph->check = 0;
+		iph->check = ip_fast_csum((unsigned char *) iph, iph->ihl);
+		th->seq = htonl(seq);
+		if (sent + hdrs + chunk < skb->len)
+			th->fin = th->psh = 0;	/* more segments follow this one */
+		netif_rx(nskb);
+		sent += chunk;
+		seq += chunk;
+		id++;
+	}
+
+	dev_kfree_skb(skb);
+}
+
 /*
 * The higher levels take care of making this non-reentrant (it's
 * called with bh's disabled).
@@ -86,6 +147,18 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 #endif

+	if (skb_shinfo(skb)->tso_size) {
+		struct iphdr *iph = skb->nh.iph;
+
+		if (skb->protocol != htons(ETH_P_IP))
+			BUG();
+		if (iph->protocol != IPPROTO_TCP)
+			BUG();
+
+		emulate_large_send_offload(skb);
+		return 0;
+	}
+
 	dev->last_rx = jiffies;
 	stats->rx_bytes+=skb->len;
 	stats->tx_bytes+=skb->len;
@@ -117,6 +190,12 @@ int __init loopback_init(struct net_device *dev)
 	dev->rebuild_header	= eth_rebuild_header;
 	dev->flags		= IFF_LOOPBACK;
 	dev->features		= NETIF_F_SG|NETIF_F_FRAGLIST|NETIF_F_NO_CSUM|NETIF_F_HIGHDMA;
+
+	/* Current netfilter will die with oom linearizing large skbs,
+	 * however this will be cured before 2.5.x is done.
+	 */
+	dev->features	       |= NETIF_F_TSO;
+
 	dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL);
 	if (dev->priv == NULL)
 			return -ENOMEM;

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -365,6 +365,7 @@ struct net_device
 #define NETIF_F_HW_VLAN_RX	256	/* Receive VLAN hw acceleration */
 #define NETIF_F_HW_VLAN_FILTER	512	/* Receive filtering on VLAN */
 #define NETIF_F_VLAN_CHALLENGED	1024	/* Device cannot handle VLAN packets */
+#define NETIF_F_TSO		2048	/* Can offload TCP/IP segmentation */

 	/* Called after device is detached from network. */
 	void			(*uninit)(struct net_device *dev);

--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -109,7 +109,8 @@ struct sk_buff_head {

 struct sk_buff;

-#define MAX_SKB_FRAGS 6
+/* To allow 64K frame to be packed as single skb without frag_list */
+#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2)

 typedef struct skb_frag_struct skb_frag_t;

@@ -125,6 +126,8 @@ struct skb_frag_struct {
 struct skb_shared_info {
 	atomic_t	dataref;
 	unsigned int	nr_frags;
+	unsigned short	tso_size;
+	unsigned short	tso_segs;
 	struct sk_buff	*frag_list;
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 };

--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -241,7 +241,8 @@ struct tcp_opt {
 	__u32	snd_wnd;	/* The window we expect to receive	*/
 	__u32	max_window;	/* Maximal window ever seen from peer	*/
 	__u32	pmtu_cookie;	/* Last pmtu seen by socket		*/
-	__u16	mss_cache;	/* Cached effective mss, not including SACKS */
+	__u32	mss_cache;	/* Cached effective mss, not including SACKS */
+	__u16	mss_cache_std;	/* Like mss_cache, but without TSO */
 	__u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
 	__u16	ext_header_len;	/* Network protocol overhead (IP/IPv6 options) */
 	__u8	ca_state;	/* State of fast-retransmit machine 	*/

--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -53,12 +53,13 @@ static inline void	inet_putpeer(struct inet_peer *p)

 extern spinlock_t inet_peer_idlock;
 /* can be called with or without local BH being disabled */
-static inline __u16	inet_getid(struct inet_peer *p)
+static inline __u16	inet_getid(struct inet_peer *p, int more)
 {
 	__u16 id;

 	spin_lock_bh(&inet_peer_idlock);
-	id = p->ip_id_count++;
+	id = p->ip_id_count;
+	p->ip_id_count += 1 + more;
 	spin_unlock_bh(&inet_peer_idlock);
 	return id;
 }

--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -187,7 +187,7 @@ int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
 		 !(dst->mxlock&(1<<RTAX_MTU))));
 }

-extern void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst);
+extern void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more);

 static inline void ip_select_ident(struct iphdr *iph, struct dst_entry *dst, struct sock *sk)
 {
@@ -200,7 +200,19 @@ static inline void ip_select_ident(struct iphdr *iph, struct dst_entry *dst, str
 		iph->id = (sk && inet_sk(sk)->daddr) ?
 					htons(inet_sk(sk)->id++) : 0;
 	} else
-		__ip_select_ident(iph, dst);
+		__ip_select_ident(iph, dst, 0);
+}
+
+static inline void ip_select_ident_more(struct iphdr *iph, struct dst_entry *dst, struct sock *sk, int more)
+{
+	if (!(iph->frag_off & __constant_htons(IP_DF))) {
+		__ip_select_ident(iph, dst, more);	/* DF clear: defer to the helper */
+	} else if (!sk || !inet_sk(sk)->daddr) {
+		iph->id = 0;				/* DF set, no socket or no daddr */
+	} else {
+		iph->id = htons(inet_sk(sk)->id);	/* DF set: per-socket counter */
+		inet_sk(sk)->id += 1 + more;		/* consume this ID plus `more` others */
+	}
 }

 /*

--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -130,7 +130,7 @@ struct sock {
 				bsdism;
 	unsigned char		debug;
 	unsigned char		rcvtstamp;
-	/* Hole of 1 byte. Try to pack. */
+	unsigned char		no_largesend;
 	int			route_caps;
 	int			proc;
 	unsigned long	        lingertime;

--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -905,13 +905,18 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long

 /* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
+ *
+ * LARGESEND note: testing !urg_mode is overkill, since only frames up to
+ * snd_up cannot be large. However, given how rarely URG is used, this
+ * is not a big flaw.
 */

-static __inline__ unsigned int tcp_current_mss(struct sock *sk)
+static __inline__ unsigned int tcp_current_mss(struct sock *sk, int large)
 {
 	struct tcp_opt *tp = tcp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
-	int mss_now = tp->mss_cache; 
+	int mss_now = large && (sk->route_caps&NETIF_F_TSO) && !tp->urg_mode ?
+		tp->mss_cache : tp->mss_cache_std;

 	if (dst && dst->pmtu != tp->pmtu_cookie)
 		mss_now = tcp_sync_mss(sk, dst->pmtu);
@@ -933,7 +938,7 @@ static __inline__ unsigned int tcp_current_mss(struct sock *sk)
 static inline void tcp_initialize_rcv_mss(struct sock *sk)
 {
 	struct tcp_opt *tp = tcp_sk(sk);
-	unsigned int hint = min(tp->advmss, tp->mss_cache);
+	unsigned int hint = min(tp->advmss, tp->mss_cache_std);

 	hint = min(hint, tp->rcv_wnd/2);
 	hint = min(hint, TCP_MIN_RCVMSS);
@@ -1269,7 +1274,7 @@ static __inline__ void __tcp_push_pending_frames(struct sock *sk,
 static __inline__ void tcp_push_pending_frames(struct sock *sk,
 					       struct tcp_opt *tp)
 {
-	__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk), tp->nonagle);
+	__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle);
 }

 static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp)
@@ -1277,7 +1282,7 @@ static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp)
 	struct sk_buff *skb = tp->send_head;

 	return (skb &&
-		tcp_snd_test(tp, skb, tcp_current_mss(sk),
+		tcp_snd_test(tp, skb, tcp_current_mss(sk, 1),
 			     tcp_skb_is_last(sk, skb) ? 1 : tp->nonagle));
 }

@@ -1839,6 +1844,15 @@ static inline int tcp_paws_check(struct tcp_opt *tp, int rst)
 	return 1;
 }

+/* Copy the route device's feature bits into sk->route_caps, dropping
+ * NETIF_F_TSO whenever sk->no_largesend is set. */
+static inline void tcp_v4_setup_caps(struct sock *sk, struct dst_entry *dst)
+{
+	sk->route_caps = dst->dev->features;
+	if (sk->no_largesend)
+		sk->route_caps &= ~NETIF_F_TSO;
+}
+
 #define TCP_CHECK_TIMER(sk) do { } while (0)

 #endif	/* _TCP_H */
--- a/include/net/tcp_ecn.h
+++ b/include/net/tcp_ecn.h
@@ -28,12 +28,13 @@ TCP_ECN_send_synack(struct tcp_opt *tp, struct sk_buff *skb)
 }

 static __inline__ void
-TCP_ECN_send_syn(struct tcp_opt *tp, struct sk_buff *skb)
+TCP_ECN_send_syn(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
 {
 	tp->ecn_flags = 0;
-	if (sysctl_tcp_ecn) {
+	if (sysctl_tcp_ecn && !(sk->route_caps&NETIF_F_TSO)) {
 		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
+		sk->no_largesend = 1;
 	}
 }


--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -209,6 +209,8 @@ struct sk_buff *alloc_skb(unsigned int size, int gfp_mask)
 	atomic_set(&skb->users, 1);
 	atomic_set(&(skb_shinfo(skb)->dataref), 1);
 	skb_shinfo(skb)->nr_frags  = 0;
+	skb_shinfo(skb)->tso_size = 0;
+	skb_shinfo(skb)->tso_segs = 0;
 	skb_shinfo(skb)->frag_list = NULL;
 out:
 	return skb;
@@ -490,6 +492,7 @@ int skb_linearize(struct sk_buff *skb, int gfp_mask)
 	unsigned int size;
 	u8 *data;
 	long offset;
+	struct skb_shared_info *ninfo;
 	int headerlen = skb->data - skb->head;
 	int expand = (skb->tail + skb->data_len) - skb->end;

@@ -509,6 +512,14 @@ int skb_linearize(struct sk_buff *skb, int gfp_mask)
 	if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
 		BUG();

+	/* Set up shinfo */
+	ninfo = (struct skb_shared_info*)(data + size);
+	atomic_set(&ninfo->dataref, 1);
+	ninfo->tso_size = skb_shinfo(skb)->tso_size;
+	ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
+	ninfo->nr_frags = 0;
+	ninfo->frag_list = NULL;
+
 	/* Offset between the two in bytes */
 	offset = data - skb->head;

@@ -525,11 +536,6 @@ int skb_linearize(struct sk_buff *skb, int gfp_mask)
 	skb->tail    += offset;
 	skb->data    += offset;

-	/* Set up shinfo */
-	atomic_set(&(skb_shinfo(skb)->dataref), 1);
-	skb_shinfo(skb)->nr_frags  = 0;
-	skb_shinfo(skb)->frag_list = NULL;
-
 	/* We are no longer a clone, even if we were. */
 	skb->cloned    = 0;

@@ -583,6 +589,8 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask)
 		}
 		skb_shinfo(n)->nr_frags = i;
 	}
+	skb_shinfo(n)->tso_size = skb_shinfo(skb)->tso_size;
+	skb_shinfo(n)->tso_segs = skb_shinfo(skb)->tso_segs;

 	if (skb_shinfo(skb)->frag_list) {
 		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
@@ -694,6 +702,9 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
 *
 *	You must pass %GFP_ATOMIC as the allocation priority if this function
 *	is called from an interrupt.
+ *
+ *	BUG ALERT: ip_summed is not copied. Why does this work? Is it used
+ *	only by netfilter in the cases when checksum is recalculated? --ANK
 */
 struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 				int newheadroom, int newtailroom, int gfp_mask)
@@ -716,6 +727,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 		BUG();

 	copy_skb_header(n, skb);
+	skb_shinfo(n)->tso_size = skb_shinfo(skb)->tso_size;
+	skb_shinfo(n)->tso_segs = skb_shinfo(skb)->tso_segs;

 	return n;
 }

--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -306,10 +306,20 @@ static inline int ip_queue_xmit2(struct sk_buff *skb)
 		iph = skb->nh.iph;
 	}

-	if (skb->len > rt->u.dst.pmtu)
+	if (skb->len > rt->u.dst.pmtu) {
+		unsigned int hlen;
+		if (!(sk->route_caps&NETIF_F_TSO))
 			goto fragment;

-	ip_select_ident(iph, &rt->u.dst, sk);
+		/* Hack zone: all this must be done by TCP. */
+		hlen = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
+		skb_shinfo(skb)->tso_size = rt->u.dst.pmtu - hlen;
+		skb_shinfo(skb)->tso_segs =
+			(skb->len - hlen + skb_shinfo(skb)->tso_size - 1)/
+				skb_shinfo(skb)->tso_size - 1;
+	}
+
+	ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);

 	/* Add an IP checksum. */
 	ip_send_check(iph);
@@ -371,7 +381,7 @@ int ip_queue_xmit(struct sk_buff *skb)
 				    sk->bound_dev_if))
 			goto no_route;
 		__sk_dst_set(sk, &rt->u.dst);
-		sk->route_caps = rt->u.dst.dev->features;
+		tcp_v4_setup_caps(sk, &rt->u.dst);
 	}
 	skb->dst = dst_clone(&rt->u.dst);

@@ -577,7 +587,7 @@ static int ip_build_xmit_slow(struct sock *sk,
 					 * for packets without DF or having
 					 * been fragmented.
 					 */
-					__ip_select_ident(iph, &rt->u.dst);
+					__ip_select_ident(iph, &rt->u.dst, 0);
 					id = iph->id;
 				}


--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -729,7 +729,7 @@ static void ip_select_fb_ident(struct iphdr *iph)
 	spin_unlock_bh(&ip_fb_id_lock);
 }

-void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
+void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 {
 	struct rtable *rt = (struct rtable *) dst;

@@ -741,7 +741,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
 		   so that we need not to grab a lock to dereference it.
 		 */
 		if (rt->peer) {
-			iph->id = htons(inet_getid(rt->peer));
+			iph->id = htons(inet_getid(rt->peer, more));
 			return;
 		}
 	} else

--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -846,7 +846,7 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,

 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);

-	mss_now = tcp_current_mss(sk);
+	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 	copied = 0;

 	err = -EPIPE;
@@ -921,7 +921,7 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 		if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
 			goto do_error;

-		mss_now = tcp_current_mss(sk);
+		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 	}

 out:
@@ -1001,7 +1001,7 @@ static inline int skb_add_data(struct sk_buff *skb, char *from, int copy)

 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
 {
-	int tmp = tp->mss_cache;
+	int tmp = tp->mss_cache_std;

 	if (sk->route_caps & NETIF_F_SG) {
 		int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
@@ -1037,7 +1037,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
 	/* This should be in poll */
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);

-	mss_now = tcp_current_mss(sk);
+	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));

 	/* Ok commence sending. */
 	iovlen = msg->msg_iovlen;
@@ -1192,7 +1192,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
 			if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
 				goto do_error;

-			mss_now = tcp_current_mss(sk);
+			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 		}
 	}

@@ -2444,7 +2444,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,

 	switch (optname) {
 	case TCP_MAXSEG:
-		val = tp->mss_cache;
+		val = tp->mss_cache_std;
 		if (!val && ((1 << sk->state) & (TCPF_CLOSE | TCPF_LISTEN)))
 			val = tp->user_mss;
 		break;
@@ -2507,7 +2507,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,

 		info.tcpi_rto = (1000000 * tp->rto) / HZ;
 		info.tcpi_ato = (1000000 * tp->ack.ato) / HZ;
-		info.tcpi_snd_mss = tp->mss_cache;
+		info.tcpi_snd_mss = tp->mss_cache_std;
 		info.tcpi_rcv_mss = tp->ack.rcv_mss;

 		info.tcpi_unacked = tp->packets_out;

--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -772,6 +772,14 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	int flag = 0;
 	int i;

+	/* So, SACKs for already sent large segments will be lost.
+	 * Not good, but alternative is to resegment the queue. */
+	if (sk->route_caps&NETIF_F_TSO) {
+		sk->route_caps &= ~NETIF_F_TSO;
+		sk->no_largesend = 1;
+		tp->mss_cache = tp->mss_cache_std;
+	}
+
 	if (!tp->sacked_out)
 		tp->fackets_out = 0;
 	prior_fackets = tp->fackets_out;
@@ -2963,6 +2971,8 @@ void tcp_cwnd_application_limited(struct sock *sk)
 /* When incoming ACK allowed to free some skb from write_queue,
 * we remember this event in flag tp->queue_shrunk and wake up socket
 * on the exit from tcp input handler.
+ *
+ * PROBLEM: sndbuf expansion does not work well with largesend.
 */
 static void tcp_new_space(struct sock *sk)
 {
@@ -2972,8 +2982,8 @@ static void tcp_new_space(struct sock *sk)
 	    !(sk->userlocks&SOCK_SNDBUF_LOCK) &&
 	    !tcp_memory_pressure &&
 	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
-		int sndmem = tp->mss_clamp + MAX_TCP_HEADER + 16 +
-			     sizeof(struct sk_buff),
+ 		int sndmem = max_t(u32, tp->mss_clamp, tp->mss_cache) +
+			MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
 		    demanded = max_t(unsigned int, tp->snd_cwnd,
 						   tp->reordering + 1);
 		sndmem *= 2*demanded;
@@ -3502,6 +3512,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 */

 		TCP_ECN_rcv_synack(tp, th);
+		if (tp->ecn_flags&TCP_ECN_OK)
+			sk->no_largesend = 1;

 		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
 		tcp_ack(sk, skb, FLAG_SLOWPATH);
@@ -3627,10 +3639,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
 		tp->max_window = tp->snd_wnd;

+		TCP_ECN_rcv_syn(tp, th);
+		if (tp->ecn_flags&TCP_ECN_OK)
+			sk->no_largesend = 1;
+
 		tcp_sync_mss(sk, tp->pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);

-		TCP_ECN_rcv_syn(tp, th);

 		tcp_send_synack(sk);
 #if 0

--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -780,7 +780,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	}

 	__sk_dst_set(sk, &rt->u.dst);
-	sk->route_caps = rt->u.dst.dev->features;
+	tcp_v4_setup_caps(sk, &rt->u.dst);

 	if (!inet->opt || !inet->opt->srr)
 		daddr = rt->rt_dst;
@@ -1559,7 +1559,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		goto exit;

 	newsk->dst_cache = dst;
-	newsk->route_caps = dst->dev->features;
+	tcp_v4_setup_caps(newsk, dst);

 	newtp		      = tcp_sk(newsk);
 	newinet		      = inet_sk(newsk);
@@ -1865,7 +1865,7 @@ static int tcp_v4_reselect_saddr(struct sock *sk)
 		return err;

 	__sk_dst_set(sk, &rt->u.dst);
-	sk->route_caps = rt->u.dst.dev->features;
+	tcp_v4_setup_caps(sk, &rt->u.dst);

 	new_saddr = rt->rt_src;

@@ -1913,7 +1913,7 @@ int tcp_v4_rebuild_header(struct sock *sk)
 			      RT_CONN_FLAGS(sk), sk->bound_dev_if);
 	if (!err) {
 		__sk_dst_set(sk, &rt->u.dst);
-		sk->route_caps = rt->u.dst.dev->features;
+		tcp_v4_setup_caps(sk, &rt->u.dst);
 		return 0;
 	}


--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -786,6 +786,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
 			newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
 		newtp->mss_clamp = req->mss;
 		TCP_ECN_openreq_child(newtp, req);
+		if (newtp->ecn_flags&TCP_ECN_OK)
+			newsk->no_largesend = 1;

 		TCP_INC_STATS_BH(TcpPassiveOpens);
 	}

--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -531,7 +531,21 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu)

 	/* And store cached results */
 	tp->pmtu_cookie = pmtu;
-	tp->mss_cache = mss_now;
+	tp->mss_cache = tp->mss_cache_std = mss_now;
+
+	if (sk->route_caps&NETIF_F_TSO) {
+		int large_mss;
+
+		large_mss = 65535 - tp->af_specific->net_header_len -
+			tp->ext_header_len - tp->tcp_header_len;
+
+		if (tp->max_window && large_mss > (tp->max_window>>1))
+			large_mss = max((tp->max_window>>1), 68U - tp->tcp_header_len);
+
+		/* Always keep large mss multiple of real mss. */
+		tp->mss_cache = mss_now*(large_mss/mss_now);
+	}
+
 	return mss_now;
 }

@@ -561,7 +575,7 @@ int tcp_write_xmit(struct sock *sk, int nonagle)
 		 * We also handle things correctly when the user adds some
 		 * IP options mid-stream.  Silly to do, but cover it.
 		 */
-		mss_now = tcp_current_mss(sk); 
+		mss_now = tcp_current_mss(sk, 1);

 		while((skb = tp->send_head) &&
 		      tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : 1)) {
@@ -767,7 +781,7 @@ void tcp_simple_retransmit(struct sock *sk)
 {
 	struct tcp_opt *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	unsigned int mss = tcp_current_mss(sk);
+	unsigned int mss = tcp_current_mss(sk, 0);
 	int lost = 0;

 	for_retrans_queue(skb, sk, tp) {
@@ -812,7 +826,7 @@ void tcp_simple_retransmit(struct sock *sk)
 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_opt *tp = tcp_sk(sk);
-	unsigned int cur_mss = tcp_current_mss(sk);
+ 	unsigned int cur_mss = tcp_current_mss(sk, 0);
 	int err;

 	/* Do not sent more than we queued. 1/4 is reserved for possible
@@ -821,6 +835,27 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf))
 		return -EAGAIN;

+	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
+		struct sk_buff *skb2;
+
+		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+			BUG();
+
+		if (sk->route_caps&NETIF_F_TSO) {
+			sk->route_caps &= ~NETIF_F_TSO;
+			sk->no_largesend = 1;
+			tp->mss_cache = tp->mss_cache_std;
+		}
+
+		if(tcp_fragment(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+			return -ENOMEM;
+
+		skb2 = skb->next;
+		__skb_unlink(skb, skb->list);
+		tcp_free_skb(sk, skb);
+		skb = skb2;
+	}
+
 	/* If receiver has shrunk his window, and skb is out of
 	 * new window, do not retransmit it. The exception is the
 	 * case, when window is shrunk to zero. In this case
@@ -998,7 +1033,7 @@ void tcp_send_fin(struct sock *sk)
 	 * unsent frames.  But be careful about outgoing SACKS
 	 * and IP options.
 	 */
-	mss_now = tcp_current_mss(sk); 
+	mss_now = tcp_current_mss(sk, 1); 

 	if(tp->send_head != NULL) {
 		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
@@ -1121,6 +1156,8 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	memset(th, 0, sizeof(struct tcphdr));
 	th->syn = 1;
 	th->ack = 1;
+	if (dst->dev->features&NETIF_F_TSO)
+		req->ecn_ok = 0;
 	TCP_ECN_make_synack(req, th);
 	th->source = inet_sk(sk)->sport;
 	th->dest = req->rmt_port;
@@ -1224,7 +1261,7 @@ int tcp_connect(struct sock *sk)
 	skb_reserve(buff, MAX_TCP_HEADER);

 	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
-	TCP_ECN_send_syn(tp, buff);
+	TCP_ECN_send_syn(sk, tp, buff);
 	TCP_SKB_CB(buff)->sacked = 0;
 	buff->csum = 0;
 	TCP_SKB_CB(buff)->seq = tp->write_seq++;
@@ -1379,7 +1416,7 @@ int tcp_write_wakeup(struct sock *sk)
 		if ((skb = tp->send_head) != NULL &&
 		    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
 			int err;
-			int mss = tcp_current_mss(sk);
+			int mss = tcp_current_mss(sk, 0);
 			int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;

 			if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
@@ -1395,6 +1432,13 @@ int tcp_write_wakeup(struct sock *sk)
 				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 				if (tcp_fragment(sk, skb, seg_size))
 					return -1;
+				/* SWS override triggered forced fragmentation.
+				 * Disable TSO, the connection is too sick. */
+				if (sk->route_caps&NETIF_F_TSO) {
+					sk->no_largesend = 1;
+					sk->route_caps &= ~NETIF_F_TSO;
+					tp->mss_cache = tp->mss_cache_std;
+				}
 			}
 			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 			TCP_SKB_CB(skb)->when = tcp_time_stamp;

--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -659,7 +659,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	}

 	ip6_dst_store(sk, dst, NULL);
-	sk->route_caps = dst->dev->features&~NETIF_F_IP_CSUM;
+	sk->route_caps = dst->dev->features&~(NETIF_F_IP_CSUM|NETIF_F_TSO);

 	if (saddr == NULL) {
 		err = ipv6_get_saddr(dst, &np->daddr, &saddr_buf);
@@ -1333,7 +1333,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	MOD_INC_USE_COUNT;

 	ip6_dst_store(newsk, dst, NULL);
-	sk->route_caps = dst->dev->features&~NETIF_F_IP_CSUM;
+	sk->route_caps = dst->dev->features&~(NETIF_F_IP_CSUM|NETIF_F_TSO);

 	newtcp6sk = (struct tcp6_sock *)newsk;
 	newtcp6sk->pinet6 = &newtcp6sk->inet6;
@@ -1721,7 +1721,7 @@ static int tcp_v6_rebuild_header(struct sock *sk)
 		}

 		ip6_dst_store(sk, dst, NULL);
-		sk->route_caps = dst->dev->features&~NETIF_F_IP_CSUM;
+		sk->route_caps = dst->dev->features&~(NETIF_F_IP_CSUM|NETIF_F_TSO);
 	}

 	return 0;