Commit b8439924 authored by Alexey Kuznetsov, committed by David S. Miller

Allow binding to an already-in-use local port during connect

The bind is permitted when the new connection will still have a
unique identity (a distinct address/port 4-tuple).  This fixes
local port space exhaustion, especially in web caches.

Initial work done by Andi Kleen.
parent 9a218f37
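For context, the failure mode this commit addresses is easy to reproduce from userspace. A minimal sketch follows (the TEST-NET address and loop count are illustrative, not from the commit): a cache that opens and closes upstream connections in a tight loop leaves each local port in TIME-WAIT, and connect() eventually fails with EADDRNOTAVAIL. After this change the kernel may hand out a port that is already in use, as long as the resulting 4-tuple stays unique.

```c
/* Hypothetical reproducer for ephemeral port exhaustion. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

static int fetch_once(const char *ip, unsigned short port)
{
        struct sockaddr_in dst;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_port = htons(port);
        inet_pton(AF_INET, ip, &dst.sin_addr);
        /* No bind(): the local port is chosen inside connect(). */
        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
                if (errno == EADDRNOTAVAIL)
                        fprintf(stderr, "local port space exhausted\n");
                close(fd);
                return -1;
        }
        close(fd);      /* active close: local port enters TIME-WAIT */
        return 0;
}

int main(void)
{
        int i;

        for (i = 0; i < 100000; i++)
                if (fetch_once("192.0.2.1", 80) < 0)
                        return 1;
        return 0;
}
```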
......@@ -126,7 +126,13 @@ tcp_max_tw_buckets - INTEGER
if network conditions require more than default value.
tcp_tw_recycle - BOOLEAN
Enable fast recycling TIME-WAIT sockets. Default value is 1.
Enable fast recycling TIME-WAIT sockets. Default value is 0.
It should not be changed without advice/request of technical
experts.
tcp_tw_reuse - BOOLEAN
Allow reusing TIME-WAIT sockets for new connections when it is
safe from the protocol viewpoint. Default value is 0.
It should not be changed without advice/request of technical
experts.
......
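The new knob lands in procfs alongside the existing TCP sysctls. A minimal sketch of flipping it from C, equivalent to `sysctl -w net.ipv4.tcp_tw_reuse=1` (and, per the documentation above, not something to change without expert advice):

```c
/* Minimal sketch: enable tcp_tw_reuse by writing to procfs. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

        if (!f) {
                perror("tcp_tw_reuse");  /* needs root and this patch */
                return 1;
        }
        fputs("1\n", f);
        return fclose(f) ? 1 : 0;
}
```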
......@@ -288,7 +288,8 @@ enum
NET_TCP_ADV_WIN_SCALE=87,
NET_IPV4_NONLOCAL_BIND=88,
NET_IPV4_ICMP_RATELIMIT=89,
NET_IPV4_ICMP_RATEMASK=90
NET_IPV4_ICMP_RATEMASK=90,
NET_TCP_TW_REUSE=91
};
enum {
......
......@@ -78,7 +78,7 @@ struct tcp_ehash_bucket {
*/
struct tcp_bind_bucket {
unsigned short port;
unsigned short fastreuse;
signed short fastreuse;
struct tcp_bind_bucket *next;
struct sock *owners;
struct tcp_bind_bucket **pprev;
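Making fastreuse signed is what turns it into a tri-state. A sketch of the semantics with illustrative names (the kernel stores plain -1/0/1 in tb->fastreuse; these identifiers are not in the source):

```c
/* Hypothetical names for the three tb->fastreuse states. */
enum fastreuse_state {
        FASTREUSE_CONNECT = -1, /* bucket created by connect()'s port search */
        FASTREUSE_NO      =  0, /* at least one owner forbids reuse */
        FASTREUSE_YES     =  1, /* all owners set SO_REUSEADDR, none listening */
};

/* The ephemeral search below (tcp_v4_hash_connect) only shares buckets
 * that bind() has never touched, i.e. tb->fastreuse < 0. */
static int shareable_by_connect(enum fastreuse_state s)
{
        return s == FASTREUSE_CONNECT;
}
```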
......@@ -469,6 +469,7 @@ extern int sysctl_tcp_wmem[3];
extern int sysctl_tcp_rmem[3];
extern int sysctl_tcp_app_win;
extern int sysctl_tcp_adv_win_scale;
extern int sysctl_tcp_tw_reuse;
extern atomic_t tcp_memory_allocated;
extern atomic_t tcp_sockets_allocated;
......@@ -577,9 +578,7 @@ struct tcp_func {
struct sk_buff *skb,
struct open_request *req,
struct dst_entry *dst);
int (*hash_connecting) (struct sock *sk);
int (*remember_stamp) (struct sock *sk);
__u16 net_header_len;
......@@ -781,8 +780,7 @@ extern int tcp_v4_connect(struct sock *sk,
struct sockaddr *uaddr,
int addr_len);
extern int tcp_connect(struct sock *sk,
struct sk_buff *skb);
extern int tcp_connect(struct sock *sk);
extern struct sk_buff * tcp_make_synack(struct sock *sk,
struct dst_entry *dst,
......
......@@ -655,13 +655,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
if (sk->state != TCP_CLOSE)
goto out;
err = -EAGAIN;
if (!inet->num) {
if (sk->prot->get_port(sk, 0) != 0)
goto out;
inet->sport = htons(inet->num);
}
err = sk->prot->connect(sk, uaddr, addr_len);
if (err < 0)
goto out;
......
......@@ -219,6 +219,8 @@ ctl_table ipv4_table[] = {
&sysctl_icmp_ratelimit, sizeof(int), 0644, NULL, &proc_dointvec},
{NET_IPV4_ICMP_RATEMASK, "icmp_ratemask",
&sysctl_icmp_ratemask, sizeof(int), 0644, NULL, &proc_dointvec},
{NET_TCP_TW_REUSE, "tcp_tw_reuse",
&sysctl_tcp_tw_reuse, sizeof(int), 0644, NULL, &proc_dointvec},
{0}
};
......
......@@ -64,6 +64,7 @@
#include <linux/ipsec.h>
extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse = 0;
/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8
......@@ -163,18 +164,18 @@ __inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
local_bh_enable();
}
static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
{
inet_sk(sk)->num = snum;
static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
{
inet_sk(sk)->num = snum;
if ((sk->bind_next = tb->owners) != NULL)
tb->owners->bind_pprev = &sk->bind_next;
tb->owners = sk;
sk->bind_pprev = &tb->owners;
sk->prev = (struct sock *) tb;
}
}
static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
{
{
struct inet_opt *inet = inet_sk(sk);
struct sock *sk2 = tb->owners;
int sk_reuse = sk->reuse;
......@@ -193,8 +194,8 @@ static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
}
}
}
return sk2 != NULL;
}
return sk2 != NULL;
}
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
......@@ -247,12 +248,14 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
break;
}
if (tb != NULL && tb->owners != NULL) {
if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
if (sk->reuse > 1)
goto success;
if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
goto success;
} else {
ret = 1;
ret = 1;
if (tcp_bind_conflict(sk, tb))
goto fail_unlock;
goto fail_unlock;
}
}
ret = 1;
......@@ -269,7 +272,7 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
tb->fastreuse = 0;
success:
if (sk->prev == NULL)
tcp_bind_hash(sk, tb, snum);
tcp_bind_hash(sk, tb, snum);
BUG_TRAP(sk->prev == (struct sock *) tb);
ret = 0;
......@@ -341,13 +344,13 @@ void tcp_listen_wlock(void)
}
}
static __inline__ void __tcp_v4_hash(struct sock *sk)
static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
struct sock **skp;
rwlock_t *lock;
BUG_TRAP(sk->pprev==NULL);
if(sk->state == TCP_LISTEN) {
if(listen_possible && sk->state == TCP_LISTEN) {
skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
lock = &tcp_lhash_lock;
tcp_listen_wlock();
......@@ -362,7 +365,7 @@ static __inline__ void __tcp_v4_hash(struct sock *sk)
sk->pprev = skp;
sock_prot_inc_use(sk->prot);
write_unlock(lock);
if (sk->state == TCP_LISTEN)
if (listen_possible && sk->state == TCP_LISTEN)
wake_up(&tcp_lhash_wait);
}
......@@ -370,7 +373,7 @@ static void tcp_v4_hash(struct sock *sk)
{
if (sk->state != TCP_CLOSE) {
local_bh_disable();
__tcp_v4_hash(sk);
__tcp_v4_hash(sk, 1);
local_bh_enable();
}
}
......@@ -379,6 +382,9 @@ void tcp_unhash(struct sock *sk)
{
rwlock_t *lock;
if (!sk->pprev)
goto ende;
if (sk->state == TCP_LISTEN) {
local_bh_disable();
tcp_listen_wlock();
......@@ -397,6 +403,8 @@ void tcp_unhash(struct sock *sk)
sock_prot_dec_use(sk->prot);
}
write_unlock_bh(lock);
ende:
if (sk->state == TCP_LISTEN)
wake_up(&tcp_lhash_wait);
}
......@@ -538,20 +546,22 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
skb->h.th->source);
}
static int tcp_v4_check_established(struct sock *sk)
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
struct tcp_tw_bucket **twp)
{
struct inet_opt *inet = inet_sk(sk);
u32 daddr = inet->rcv_saddr;
u32 saddr = inet->daddr;
int dif = sk->bound_dev_if;
TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
__u32 ports = TCP_COMBINED_PORTS(inet->dport, inet->num);
int hash = tcp_hashfn(daddr, inet->num, saddr, inet->dport);
__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
struct tcp_ehash_bucket *head = &tcp_ehash[hash];
struct sock *sk2, **skp;
struct tcp_tw_bucket *tw;
write_lock_bh(&head->lock);
write_lock(&head->lock);
/* Check TIME-WAIT sockets first. */
for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
......@@ -575,7 +585,9 @@ static int tcp_v4_check_established(struct sock *sk)
fall back to VJ's scheme and use initial
timestamp retrieved from peer table.
*/
if (tw->ts_recent_stamp) {
if (tw->ts_recent_stamp &&
(!twp || (sysctl_tcp_tw_reuse &&
xtime.tv_sec - tw->ts_recent_stamp > 1))) {
if ((tp->write_seq =
tw->snd_nxt + 65535 + 2) == 0)
tp->write_seq = 1;
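The new condition folds two callers into one test: with twp == NULL (the explicit bind()+connect() path, i.e. the old behaviour) a timestamped TIME-WAIT socket may always be taken over, while the automatic port search additionally requires the sysctl and a timestamp more than one second old. Restated as a standalone predicate (parameter names are illustrative, not kernel identifiers):

```c
/* Hedged restatement of the TIME-WAIT takeover test above. */
static int may_take_over_timewait(int have_timestamps, int autobind,
                                  int tw_reuse_enabled, long age_sec)
{
        if (!have_timestamps)
                return 0;                       /* no PAWS protection */
        if (!autobind)
                return 1;                       /* twp == NULL case */
        return tw_reuse_enabled && age_sec > 1; /* tcp_tw_reuse path */
}
```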
......@@ -597,6 +609,10 @@ static int tcp_v4_check_established(struct sock *sk)
}
unique:
/* Must record num and sport now. Otherwise we would see
* in the hash table a socket with a funny identity. */
sk->num = lport;
sk->sport = htons(lport);
BUG_TRAP(sk->pprev==NULL);
if ((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
......@@ -605,15 +621,16 @@ static int tcp_v4_check_established(struct sock *sk)
sk->pprev = skp;
sk->hashent = hash;
sock_prot_inc_use(sk->prot);
write_unlock_bh(&head->lock);
write_unlock(&head->lock);
if (tw) {
if (twp) {
*twp = tw;
NET_INC_STATS_BH(TimeWaitRecycled);
} else if (tw) {
/* Silly. Should hash-dance instead... */
local_bh_disable();
tcp_tw_deschedule(tw);
tcp_timewait_kill(tw);
NET_INC_STATS_BH(TimeWaitRecycled);
local_bh_enable();
tcp_tw_put(tw);
}
......@@ -621,34 +638,120 @@ static int tcp_v4_check_established(struct sock *sk)
return 0;
not_unique:
write_unlock_bh(&head->lock);
write_unlock(&head->lock);
return -EADDRNOTAVAIL;
}
/* Hash SYN-SENT socket to established hash table after
* checking that it is unique. Note, that without kernel lock
* we MUST make these two operations atomically.
*
* Optimization: if it is bound and tcp_bind_bucket has the only
* owner (us), we need not to scan established bucket.
/*
* Bind a port for a connect operation and hash it.
*/
int tcp_v4_hash_connecting(struct sock *sk)
static int tcp_v4_hash_connect(struct sock *sk)
{
unsigned short snum = inet_sk(sk)->num;
struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
struct tcp_bind_hashbucket *head;
struct tcp_bind_bucket *tb;
if (snum == 0) {
int rover;
int low = sysctl_local_port_range[0];
int high = sysctl_local_port_range[1];
int remaining = (high - low) + 1;
struct tcp_tw_bucket *tw = NULL;
local_bh_disable();
/* TODO. Actually it is not such a bad idea to remove
* tcp_portalloc_lock before the next submission to Linus.
* As soon as we touch this place at all, it is time to think.
*
* Right now it protects a single _advisory_ variable, tcp_port_rover,
* hence it is mostly useless.
* The code will work nicely if we just delete it, but I am afraid
* that in the contended case it will work no better, or even worse:
* another cpu will just hit the same bucket and spin there.
* So some per-cpu salt could remove both the contention and the
* memory pingpong. Any ideas how to do this in a nice way?
*/
spin_lock(&tcp_portalloc_lock);
rover = tcp_port_rover;
do {
rover++;
if ((rover < low) || (rover > high))
rover = low;
head = &tcp_bhash[tcp_bhashfn(rover)];
spin_lock(&head->lock);
/* Does not bother with rcv_saddr checks,
* because the established check is already
* unique enough.
*/
for (tb = head->chain; tb; tb = tb->next) {
if (tb->port == rover) {
BUG_TRAP(tb->owners != NULL);
if (tb->fastreuse >= 0)
goto next_port;
if (!__tcp_v4_check_established(sk, rover, &tw))
goto ok;
goto next_port;
}
}
tb = tcp_bucket_create(head, rover);
if (!tb) {
spin_unlock(&head->lock);
break;
}
tb->fastreuse = -1;
goto ok;
next_port:
spin_unlock(&head->lock);
} while (--remaining > 0);
tcp_port_rover = rover;
spin_unlock(&tcp_portalloc_lock);
local_bh_enable();
return -EADDRNOTAVAIL;
ok:
/* All locks still held and bhs disabled */
tcp_port_rover = rover;
spin_unlock(&tcp_portalloc_lock);
tcp_bind_hash(sk, tb, rover);
if (!sk->pprev) {
sk->sport = htons(rover);
__tcp_v4_hash(sk, 0);
}
spin_unlock(&head->lock);
if (tw) {
tcp_tw_deschedule(tw);
tcp_timewait_kill(tw);
tcp_tw_put(tw);
}
local_bh_enable();
return 0;
}
head = &tcp_bhash[tcp_bhashfn(snum)];
tb = (struct tcp_bind_bucket *)sk->prev;
spin_lock_bh(&head->lock);
if (tb->owners == sk && sk->bind_next == NULL) {
__tcp_v4_hash(sk);
__tcp_v4_hash(sk, 0);
spin_unlock_bh(&head->lock);
return 0;
} else {
spin_unlock_bh(&head->lock);
int ret;
spin_unlock(&head->lock);
/* No definite answer... Walk to established hash table */
return tcp_v4_check_established(sk);
ret = __tcp_v4_check_established(sk, snum, NULL);
local_bh_enable();
return ret;
}
}
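The search itself is the classic rover pattern: a shared, purely advisory hint rotates through the local port range so consecutive connects spread across buckets. A generic sketch of the pattern (helper names are made up), with the locking and hash-table specifics of the code above stripped away:

```c
/* Illustrative rover loop: give each port in [low, high] one chance,
 * starting just past where the previous search left off. */
static int pick_local_port(int *rover, int low, int high,
                           int (*usable)(int port, void *arg), void *arg)
{
        int remaining = high - low + 1;
        int port = *rover;

        do {
                if (++port < low || port > high)
                        port = low;             /* wrap around */
                if (usable(port, arg)) {
                        *rover = port;          /* advisory: racy updates are fine */
                        return port;
                }
        } while (--remaining > 0);
        return -1;                              /* -EADDRNOTAVAIL in the kernel */
}
```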
......@@ -658,7 +761,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct inet_opt *inet = inet_sk(sk);
struct tcp_opt *tp = tcp_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
struct sk_buff *buff;
struct rtable *rt;
u32 daddr, nexthop;
int tmp;
......@@ -693,12 +795,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (!inet->opt || !inet->opt->srr)
daddr = rt->rt_dst;
err = -ENOBUFS;
buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);
if (buff == NULL)
goto failure;
if (!inet->saddr)
inet->saddr = rt->rt_src;
inet->rcv_saddr = inet->saddr;
......@@ -729,24 +825,38 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->dport = usin->sin_port;
inet->daddr = daddr;
tp->ext_header_len = 0;
if (inet->opt)
tp->ext_header_len = inet->opt->optlen;
tp->mss_clamp = 536;
/* Socket identity is still unknown (sport may be zero).
* However we set state to SYN-SENT and, without releasing the socket
* lock, select a source port, enter ourselves into the hash tables and
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
err = tcp_v4_hash_connect(sk);
if (err)
goto failure;
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->saddr,
inet->daddr,
inet->sport,
usin->sin_port);
tp->ext_header_len = 0;
if (inet->opt)
tp->ext_header_len = inet->opt->optlen;
inet->id = tp->write_seq ^ jiffies;
tp->mss_clamp = 536;
err = tcp_connect(sk);
if (err)
goto failure;
err = tcp_connect(sk, buff);
if (err == 0)
return 0;
return 0;
failure:
tcp_set_state(sk, TCP_CLOSE);
__sk_dst_reset(sk);
sk->route_caps = 0;
inet->dport = 0;
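Note the reordering in tcp_v4_connect(): the socket enters SYN-SENT and acquires its source port via tcp_v4_hash_connect() before write_seq is chosen, because the secure initial sequence number is derived from the full 4-tuple and sport is not known any earlier. A toy stand-in (not the kernel's secure_tcp_sequence_number()) to show the dependency:

```c
#include <stdint.h>

/* Toy ISN mixer: every field of the 4-tuple, including the freshly
 * selected source port, feeds the result.  Illustration only. */
static uint32_t toy_isn(uint32_t saddr, uint32_t daddr,
                        uint16_t sport, uint16_t dport, uint32_t secret)
{
        uint32_t h = secret ^ saddr;

        h = h * 2654435761u + daddr;
        h ^= ((uint32_t)sport << 16) | dport;
        return h * 2654435761u; /* the real one also mixes in a clock */
}
```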
......@@ -1477,7 +1587,7 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->advmss = dst->advmss;
tcp_initialize_rcv_mss(newsk);
__tcp_v4_hash(newsk);
__tcp_v4_hash(newsk, 0);
__tcp_inherit_port(sk, newsk);
return newsk;
......@@ -1901,7 +2011,6 @@ struct tcp_func ipv4_specific = {
tcp_v4_rebuild_header,
tcp_v4_conn_request,
tcp_v4_syn_recv_sock,
tcp_v4_hash_connecting,
tcp_v4_remember_stamp,
sizeof(struct iphdr),
......
......@@ -38,6 +38,7 @@
#include <net/tcp.h>
#include <linux/compiler.h>
#include <linux/smp_lock.h>
/* People can turn this off for buggy TCP's found in printers etc. */
......@@ -1156,14 +1157,14 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
return skb;
}
int tcp_connect(struct sock *sk, struct sk_buff *buff)
/*
* Do all connect socket setups that can be done AF independent.
*/
static inline void tcp_connect_init(struct sock *sk)
{
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_opt *tp = tcp_sk(sk);
/* Reserve space for headers. */
skb_reserve(buff, MAX_TCP_HEADER);
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
*/
......@@ -1190,14 +1191,6 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
tp->rcv_ssthresh = tp->rcv_wnd;
/* Socket identity change complete, no longer
* in TCP_CLOSE, so enter ourselves into the
* hash tables.
*/
tcp_set_state(sk,TCP_SYN_SENT);
if (tp->af_specific->hash_connecting(sk))
goto err_out;
sk->err = 0;
sk->done = 0;
tp->snd_wnd = 0;
......@@ -1211,6 +1204,24 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
tp->rto = TCP_TIMEOUT_INIT;
tp->retransmits = 0;
tcp_clear_retrans(tp);
}
/*
* Build a SYN and send it off.
*/
int tcp_connect(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *buff;
tcp_connect_init(sk);
buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);
if (unlikely(buff == NULL))
return -ENOBUFS;
/* Reserve space for headers. */
skb_reserve(buff, MAX_TCP_HEADER);
TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
TCP_ECN_send_syn(tp, buff);
......@@ -1233,11 +1244,6 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff)
/* Timer for repeating the SYN until an answer. */
tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
return 0;
err_out:
tcp_set_state(sk,TCP_CLOSE);
kfree_skb(buff);
return -EADDRNOTAVAIL;
}
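With the split, tcp_connect() owns the SYN skb from allocation to transmit. The allocate-then-reserve idiom it uses is worth calling out; a kernel-context sketch of the idiom as a hypothetical helper (compiles only against kernel headers of this era):

```c
#include <linux/skbuff.h>
#include <net/tcp.h>

/* Allocate with slack, then reserve MAX_TCP_HEADER so the TCP, IP and
 * link headers can be pushed in front of the payload without a
 * reallocation. */
static struct sk_buff *syn_skb_alloc(struct sock *sk)
{
        struct sk_buff *skb = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);

        if (skb == NULL)
                return NULL;    /* caller maps this to -ENOBUFS */
        skb_reserve(skb, MAX_TCP_HEADER);
        return skb;
}
```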
/* Send out a delayed ack, the caller does the policy checking
......
......@@ -136,7 +136,7 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
break;
}
if (tb != NULL && tb->owners != NULL) {
if (tb->fastreuse != 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
goto success;
} else {
struct ipv6_pinfo *np = inet6_sk(sk);
......@@ -499,11 +499,21 @@ static int tcp_v6_check_established(struct sock *sk)
return -EADDRNOTAVAIL;
}
static int tcp_v6_hash_connecting(struct sock *sk)
static int tcp_v6_hash_connect(struct sock *sk)
{
unsigned short snum = inet_sk(sk)->num;
struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
struct tcp_bind_bucket *tb = head->chain;
struct tcp_bind_hashbucket *head;
struct tcp_bind_bucket *tb;
/* XXX */
if (inet_sk(sk)->num == 0) {
int err = tcp_v6_get_port(sk, inet_sk(sk)->num);
if (err)
return err;
inet_sk(sk)->sport = htons(inet_sk(sk)->num);
}
head = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
tb = head->chain;
spin_lock_bh(&head->lock);
......@@ -534,7 +544,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct in6_addr saddr_buf;
struct flowi fl;
struct dst_entry *dst;
struct sk_buff *buff;
int addr_type;
int err;
......@@ -675,17 +684,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen;
tp->mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
err = -ENOBUFS;
buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation);
if (buff == NULL)
goto failure;
inet->dport = usin->sin6_port;
/*
* Init variables
*/
tcp_set_state(sk, TCP_SYN_SENT);
err = tcp_v6_hash_connect(sk);
if (err)
goto late_failure;
if (!tp->write_seq)
tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
......@@ -693,10 +697,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
inet->sport,
inet->dport);
err = tcp_connect(sk, buff);
if (err == 0)
return 0;
err = tcp_connect(sk);
if (err)
goto late_failure;
return 0;
late_failure:
tcp_set_state(sk, TCP_CLOSE);
failure:
__sk_dst_reset(sk);
inet->dport = 0;
......@@ -1785,7 +1793,6 @@ static struct tcp_func ipv6_specific = {
tcp_v6_rebuild_header,
tcp_v6_conn_request,
tcp_v6_syn_recv_sock,
tcp_v6_hash_connecting,
tcp_v6_remember_stamp,
sizeof(struct ipv6hdr),
......@@ -1805,7 +1812,6 @@ static struct tcp_func ipv6_mapped = {
tcp_v4_rebuild_header,
tcp_v6_conn_request,
tcp_v6_syn_recv_sock,
tcp_v4_hash_connecting,
tcp_v4_remember_stamp,
sizeof(struct iphdr),
......
......@@ -364,7 +364,6 @@ EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_hash_connecting);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(udp_prot);
EXPORT_SYMBOL(tcp_prot);
......