Commit 789f558c authored by Eric Dumazet, committed by David S. Miller

tcp/dccp: get rid of central timewait timer

Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.

This does not scale: the code is ugly and a source of huge latencies
(typically 30 ms have been seen, with CPUs spinning on the death_lock spinlock).

We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.

Tested:

In the following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24).

Before patch :

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171

While test is running, we can observe 25 or even 33 ms latencies.

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2

After patch :

About 90% increase of throughput :

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992

And latencies are kept to minimal values during this load, even
if network utilization is 90% higher :

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 20a1d165
...@@ -31,67 +31,14 @@ ...@@ -31,67 +31,14 @@
struct inet_hashinfo; struct inet_hashinfo;
#define INET_TWDR_RECYCLE_SLOTS_LOG 5
#define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG)
/*
* If time > 4sec, it is "slow" path, no recycling is required,
* so that we select tick to get range about 4 seconds.
*/
#if HZ <= 16 || HZ > 4096
# error Unsupported: HZ <= 16 or HZ > 4096
#elif HZ <= 32
# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 64
# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 128
# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 256
# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 512
# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 1024
# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 2048
# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#else
# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#endif
static inline u32 inet_tw_time_stamp(void)
{
return jiffies;
}
/* TIME_WAIT reaping mechanism. */
#define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
#define INET_TWDR_TWKILL_QUOTA 100
struct inet_timewait_death_row { struct inet_timewait_death_row {
/* Short-time timewait calendar */ atomic_t tw_count;
int twcal_hand;
unsigned long twcal_jiffie; struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp;
struct timer_list twcal_timer;
struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS];
spinlock_t death_lock;
int tw_count;
int period;
u32 thread_slots;
struct work_struct twkill_work;
struct timer_list tw_timer;
int slot;
struct hlist_head cells[INET_TWDR_TWKILL_SLOTS];
struct inet_hashinfo *hashinfo;
int sysctl_tw_recycle; int sysctl_tw_recycle;
int sysctl_max_tw_buckets; int sysctl_max_tw_buckets;
}; };
void inet_twdr_hangman(unsigned long data);
void inet_twdr_twkill_work(struct work_struct *work);
void inet_twdr_twcal_tick(unsigned long data);
struct inet_bind_bucket; struct inet_bind_bucket;
/* /*
...@@ -133,52 +80,18 @@ struct inet_timewait_sock { ...@@ -133,52 +80,18 @@ struct inet_timewait_sock {
__be16 tw_sport; __be16 tw_sport;
kmemcheck_bitfield_begin(flags); kmemcheck_bitfield_begin(flags);
/* And these are ours. */ /* And these are ours. */
unsigned int tw_pad0 : 1, /* 1 bit hole */ unsigned int tw_kill : 1,
tw_transparent : 1, tw_transparent : 1,
tw_flowlabel : 20, tw_flowlabel : 20,
tw_pad : 2, /* 2 bits hole */ tw_pad : 2, /* 2 bits hole */
tw_tos : 8; tw_tos : 8;
kmemcheck_bitfield_end(flags); kmemcheck_bitfield_end(flags);
u32 tw_ttd; struct timer_list tw_timer;
struct inet_bind_bucket *tw_tb; struct inet_bind_bucket *tw_tb;
struct hlist_node tw_death_node; struct inet_timewait_death_row *tw_dr;
}; };
#define tw_tclass tw_tos #define tw_tclass tw_tos
static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
{
return !hlist_unhashed(&tw->tw_death_node);
}
static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw)
{
tw->tw_death_node.pprev = NULL;
}
static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
{
__hlist_del(&tw->tw_death_node);
inet_twsk_dead_node_init(tw);
}
static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
{
if (inet_twsk_dead_hashed(tw)) {
__inet_twsk_del_dead_node(tw);
return 1;
}
return 0;
}
#define inet_twsk_for_each(tw, node, head) \
hlist_nulls_for_each_entry(tw, node, head, tw_node)
#define inet_twsk_for_each_inmate(tw, jail) \
hlist_for_each_entry(tw, jail, tw_death_node)
#define inet_twsk_for_each_inmate_safe(tw, safe, jail) \
hlist_for_each_entry_safe(tw, safe, jail, tw_death_node)
static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
{ {
return (struct inet_timewait_sock *)sk; return (struct inet_timewait_sock *)sk;
...@@ -193,16 +106,14 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, ...@@ -193,16 +106,14 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo); struct inet_hashinfo *hashinfo);
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
struct inet_timewait_death_row *dr,
const int state); const int state);
void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
struct inet_hashinfo *hashinfo); struct inet_hashinfo *hashinfo);
void inet_twsk_schedule(struct inet_timewait_sock *tw, void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo);
struct inet_timewait_death_row *twdr, void inet_twsk_deschedule(struct inet_timewait_sock *tw);
const int timeo, const int timewait_len);
void inet_twsk_deschedule(struct inet_timewait_sock *tw,
struct inet_timewait_death_row *twdr);
void inet_twsk_purge(struct inet_hashinfo *hashinfo, void inet_twsk_purge(struct inet_hashinfo *hashinfo,
struct inet_timewait_death_row *twdr, int family); struct inet_timewait_death_row *twdr, int family);
......
...@@ -27,28 +27,16 @@ ...@@ -27,28 +27,16 @@
struct inet_timewait_death_row dccp_death_row = { struct inet_timewait_death_row dccp_death_row = {
.sysctl_max_tw_buckets = NR_FILE * 2, .sysctl_max_tw_buckets = NR_FILE * 2,
.period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
.death_lock = __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock),
.hashinfo = &dccp_hashinfo, .hashinfo = &dccp_hashinfo,
.tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
(unsigned long)&dccp_death_row),
.twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work,
inet_twdr_twkill_work),
/* Short-time timewait calendar */
.twcal_hand = -1,
.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
(unsigned long)&dccp_death_row),
}; };
EXPORT_SYMBOL_GPL(dccp_death_row); EXPORT_SYMBOL_GPL(dccp_death_row);
void dccp_time_wait(struct sock *sk, int state, int timeo) void dccp_time_wait(struct sock *sk, int state, int timeo)
{ {
struct inet_timewait_sock *tw = NULL; struct inet_timewait_sock *tw;
if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets) tw = inet_twsk_alloc(sk, &dccp_death_row, state);
tw = inet_twsk_alloc(sk, state);
if (tw != NULL) { if (tw != NULL) {
const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock *icsk = inet_csk(sk);
...@@ -71,8 +59,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) ...@@ -71,8 +59,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
if (state == DCCP_TIME_WAIT) if (state == DCCP_TIME_WAIT)
timeo = DCCP_TIMEWAIT_LEN; timeo = DCCP_TIMEWAIT_LEN;
inet_twsk_schedule(tw, &dccp_death_row, timeo, inet_twsk_schedule(tw, timeo);
DCCP_TIMEWAIT_LEN);
inet_twsk_put(tw); inet_twsk_put(tw);
} else { } else {
/* Sorry, if we're out of memory, just CLOSE this /* Sorry, if we're out of memory, just CLOSE this
......
...@@ -248,7 +248,7 @@ static int inet_twsk_diag_fill(struct sock *sk, ...@@ -248,7 +248,7 @@ static int inet_twsk_diag_fill(struct sock *sk,
struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_timewait_sock *tw = inet_twsk(sk);
struct inet_diag_msg *r; struct inet_diag_msg *r;
struct nlmsghdr *nlh; struct nlmsghdr *nlh;
s32 tmo; long tmo;
nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
nlmsg_flags); nlmsg_flags);
...@@ -258,7 +258,7 @@ static int inet_twsk_diag_fill(struct sock *sk, ...@@ -258,7 +258,7 @@ static int inet_twsk_diag_fill(struct sock *sk,
r = nlmsg_data(nlh); r = nlmsg_data(nlh);
BUG_ON(tw->tw_state != TCP_TIME_WAIT); BUG_ON(tw->tw_state != TCP_TIME_WAIT);
tmo = tw->tw_ttd - inet_tw_time_stamp(); tmo = tw->tw_timer.expires - jiffies;
if (tmo < 0) if (tmo < 0)
tmo = 0; tmo = 0;
......
...@@ -388,7 +388,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, ...@@ -388,7 +388,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
*twp = tw; *twp = tw;
} else if (tw) { } else if (tw) {
/* Silly. Should hash-dance instead... */ /* Silly. Should hash-dance instead... */
inet_twsk_deschedule(tw, death_row); inet_twsk_deschedule(tw);
inet_twsk_put(tw); inet_twsk_put(tw);
} }
...@@ -565,7 +565,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, ...@@ -565,7 +565,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
spin_unlock(&head->lock); spin_unlock(&head->lock);
if (tw) { if (tw) {
inet_twsk_deschedule(tw, death_row); inet_twsk_deschedule(tw);
while (twrefcnt) { while (twrefcnt) {
twrefcnt--; twrefcnt--;
inet_twsk_put(tw); inet_twsk_put(tw);
......
...@@ -67,9 +67,9 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, ...@@ -67,9 +67,9 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
} }
/* Must be called with locally disabled BHs. */ /* Must be called with locally disabled BHs. */
static void __inet_twsk_kill(struct inet_timewait_sock *tw, static void inet_twsk_kill(struct inet_timewait_sock *tw)
struct inet_hashinfo *hashinfo)
{ {
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
struct inet_bind_hashbucket *bhead; struct inet_bind_hashbucket *bhead;
int refcnt; int refcnt;
/* Unlink from established hashes. */ /* Unlink from established hashes. */
...@@ -89,6 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, ...@@ -89,6 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
atomic_sub(refcnt, &tw->tw_refcnt); atomic_sub(refcnt, &tw->tw_refcnt);
atomic_dec(&tw->tw_dr->tw_count);
inet_twsk_put(tw);
} }
void inet_twsk_free(struct inet_timewait_sock *tw) void inet_twsk_free(struct inet_timewait_sock *tw)
...@@ -168,16 +170,34 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, ...@@ -168,16 +170,34 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
} }
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) void tw_timer_handler(unsigned long data)
{ {
struct inet_timewait_sock *tw = struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
GFP_ATOMIC); if (tw->tw_kill)
NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
else
NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
inet_twsk_kill(tw);
}
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
struct inet_timewait_death_row *dr,
const int state)
{
struct inet_timewait_sock *tw;
if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
return NULL;
tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
GFP_ATOMIC);
if (tw) { if (tw) {
const struct inet_sock *inet = inet_sk(sk); const struct inet_sock *inet = inet_sk(sk);
kmemcheck_annotate_bitfield(tw, flags); kmemcheck_annotate_bitfield(tw, flags);
tw->tw_dr = dr;
/* Give us an identity. */ /* Give us an identity. */
tw->tw_daddr = inet->inet_daddr; tw->tw_daddr = inet->inet_daddr;
tw->tw_rcv_saddr = inet->inet_rcv_saddr; tw->tw_rcv_saddr = inet->inet_rcv_saddr;
...@@ -196,13 +216,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat ...@@ -196,13 +216,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
tw->tw_prot = sk->sk_prot_creator; tw->tw_prot = sk->sk_prot_creator;
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
twsk_net_set(tw, sock_net(sk)); twsk_net_set(tw, sock_net(sk));
setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw);
/* /*
* Because we use RCU lookups, we should not set tw_refcnt * Because we use RCU lookups, we should not set tw_refcnt
* to a non null value before everything is setup for this * to a non null value before everything is setup for this
* timewait socket. * timewait socket.
*/ */
atomic_set(&tw->tw_refcnt, 0); atomic_set(&tw->tw_refcnt, 0);
inet_twsk_dead_node_init(tw);
__module_get(tw->tw_prot->owner); __module_get(tw->tw_prot->owner);
} }
...@@ -210,139 +231,20 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat ...@@ -210,139 +231,20 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
} }
EXPORT_SYMBOL_GPL(inet_twsk_alloc); EXPORT_SYMBOL_GPL(inet_twsk_alloc);
/* Returns non-zero if quota exceeded. */
static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
const int slot)
{
struct inet_timewait_sock *tw;
unsigned int killed;
int ret;
/* NOTE: compare this to previous version where lock
* was released after detaching chain. It was racy,
* because tw buckets are scheduled in not serialized context
* in 2.3 (with netfilter), and with softnet it is common, because
* soft irqs are not sequenced.
*/
killed = 0;
ret = 0;
rescan:
inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) {
__inet_twsk_del_dead_node(tw);
spin_unlock(&twdr->death_lock);
__inet_twsk_kill(tw, twdr->hashinfo);
#ifdef CONFIG_NET_NS
NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
#endif
inet_twsk_put(tw);
killed++;
spin_lock(&twdr->death_lock);
if (killed > INET_TWDR_TWKILL_QUOTA) {
ret = 1;
break;
}
/* While we dropped twdr->death_lock, another cpu may have
* killed off the next TW bucket in the list, therefore
* do a fresh re-read of the hlist head node with the
* lock reacquired. We still use the hlist traversal
* macro in order to get the prefetches.
*/
goto rescan;
}
twdr->tw_count -= killed;
#ifndef CONFIG_NET_NS
NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
#endif
return ret;
}
void inet_twdr_hangman(unsigned long data)
{
struct inet_timewait_death_row *twdr;
unsigned int need_timer;
twdr = (struct inet_timewait_death_row *)data;
spin_lock(&twdr->death_lock);
if (twdr->tw_count == 0)
goto out;
need_timer = 0;
if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
twdr->thread_slots |= (1 << twdr->slot);
schedule_work(&twdr->twkill_work);
need_timer = 1;
} else {
/* We purged the entire slot, anything left? */
if (twdr->tw_count)
need_timer = 1;
twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
}
if (need_timer)
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
out:
spin_unlock(&twdr->death_lock);
}
EXPORT_SYMBOL_GPL(inet_twdr_hangman);
void inet_twdr_twkill_work(struct work_struct *work)
{
struct inet_timewait_death_row *twdr =
container_of(work, struct inet_timewait_death_row, twkill_work);
int i;
BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
(sizeof(twdr->thread_slots) * 8));
while (twdr->thread_slots) {
spin_lock_bh(&twdr->death_lock);
for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
if (!(twdr->thread_slots & (1 << i)))
continue;
while (inet_twdr_do_twkill_work(twdr, i) != 0) {
if (need_resched()) {
spin_unlock_bh(&twdr->death_lock);
schedule();
spin_lock_bh(&twdr->death_lock);
}
}
twdr->thread_slots &= ~(1 << i);
}
spin_unlock_bh(&twdr->death_lock);
}
}
EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
/* These are always called from BH context. See callers in /* These are always called from BH context. See callers in
* tcp_input.c to verify this. * tcp_input.c to verify this.
*/ */
/* This is for handling early-kills of TIME_WAIT sockets. */ /* This is for handling early-kills of TIME_WAIT sockets. */
void inet_twsk_deschedule(struct inet_timewait_sock *tw, void inet_twsk_deschedule(struct inet_timewait_sock *tw)
struct inet_timewait_death_row *twdr)
{ {
spin_lock(&twdr->death_lock); if (del_timer_sync(&tw->tw_timer))
if (inet_twsk_del_dead_node(tw)) { inet_twsk_kill(tw);
inet_twsk_put(tw);
if (--twdr->tw_count == 0)
del_timer(&twdr->tw_timer);
}
spin_unlock(&twdr->death_lock);
__inet_twsk_kill(tw, twdr->hashinfo);
} }
EXPORT_SYMBOL(inet_twsk_deschedule); EXPORT_SYMBOL(inet_twsk_deschedule);
void inet_twsk_schedule(struct inet_timewait_sock *tw, void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
struct inet_timewait_death_row *twdr,
const int timeo, const int timewait_len)
{ {
struct hlist_head *list;
int slot;
/* timeout := RTO * 3.5 /* timeout := RTO * 3.5
* *
* 3.5 = 1+2+0.5 to wait for two retransmits. * 3.5 = 1+2+0.5 to wait for two retransmits.
...@@ -367,115 +269,15 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, ...@@ -367,115 +269,15 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
* is greater than TS tick!) and detect old duplicates with help * is greater than TS tick!) and detect old duplicates with help
* of PAWS. * of PAWS.
*/ */
slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
spin_lock(&twdr->death_lock); tw->tw_kill = timeo <= 4*HZ;
if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) {
/* Unlink it, if it was scheduled */
if (inet_twsk_del_dead_node(tw))
twdr->tw_count--;
else
atomic_inc(&tw->tw_refcnt); atomic_inc(&tw->tw_refcnt);
atomic_inc(&tw->tw_dr->tw_count);
if (slot >= INET_TWDR_RECYCLE_SLOTS) {
/* Schedule to slow timer */
if (timeo >= timewait_len) {
slot = INET_TWDR_TWKILL_SLOTS - 1;
} else {
slot = DIV_ROUND_UP(timeo, twdr->period);
if (slot >= INET_TWDR_TWKILL_SLOTS)
slot = INET_TWDR_TWKILL_SLOTS - 1;
}
tw->tw_ttd = inet_tw_time_stamp() + timeo;
slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
list = &twdr->cells[slot];
} else {
tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);
if (twdr->twcal_hand < 0) {
twdr->twcal_hand = 0;
twdr->twcal_jiffie = jiffies;
twdr->twcal_timer.expires = twdr->twcal_jiffie +
(slot << INET_TWDR_RECYCLE_TICK);
add_timer(&twdr->twcal_timer);
} else {
if (time_after(twdr->twcal_timer.expires,
jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
mod_timer(&twdr->twcal_timer,
jiffies + (slot << INET_TWDR_RECYCLE_TICK));
slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
}
list = &twdr->twcal_row[slot];
} }
hlist_add_head(&tw->tw_death_node, list);
if (twdr->tw_count++ == 0)
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
spin_unlock(&twdr->death_lock);
} }
EXPORT_SYMBOL_GPL(inet_twsk_schedule); EXPORT_SYMBOL_GPL(inet_twsk_schedule);
void inet_twdr_twcal_tick(unsigned long data)
{
struct inet_timewait_death_row *twdr;
int n, slot;
unsigned long j;
unsigned long now = jiffies;
int killed = 0;
int adv = 0;
twdr = (struct inet_timewait_death_row *)data;
spin_lock(&twdr->death_lock);
if (twdr->twcal_hand < 0)
goto out;
slot = twdr->twcal_hand;
j = twdr->twcal_jiffie;
for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
if (time_before_eq(j, now)) {
struct hlist_node *safe;
struct inet_timewait_sock *tw;
inet_twsk_for_each_inmate_safe(tw, safe,
&twdr->twcal_row[slot]) {
__inet_twsk_del_dead_node(tw);
__inet_twsk_kill(tw, twdr->hashinfo);
#ifdef CONFIG_NET_NS
NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
#endif
inet_twsk_put(tw);
killed++;
}
} else {
if (!adv) {
adv = 1;
twdr->twcal_jiffie = j;
twdr->twcal_hand = slot;
}
if (!hlist_empty(&twdr->twcal_row[slot])) {
mod_timer(&twdr->twcal_timer, j);
goto out;
}
}
j += 1 << INET_TWDR_RECYCLE_TICK;
slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
}
twdr->twcal_hand = -1;
out:
if ((twdr->tw_count -= killed) == 0)
del_timer(&twdr->tw_timer);
#ifndef CONFIG_NET_NS
NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
#endif
spin_unlock(&twdr->death_lock);
}
EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
void inet_twsk_purge(struct inet_hashinfo *hashinfo, void inet_twsk_purge(struct inet_hashinfo *hashinfo,
struct inet_timewait_death_row *twdr, int family) struct inet_timewait_death_row *twdr, int family)
{ {
...@@ -509,7 +311,7 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, ...@@ -509,7 +311,7 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo,
rcu_read_unlock(); rcu_read_unlock();
local_bh_disable(); local_bh_disable();
inet_twsk_deschedule(tw, twdr); inet_twsk_deschedule(tw);
local_bh_enable(); local_bh_enable();
inet_twsk_put(tw); inet_twsk_put(tw);
goto restart_rcu; goto restart_rcu;
......
...@@ -63,7 +63,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) ...@@ -63,7 +63,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
socket_seq_show(seq); socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans, sock_prot_inuse_get(net, &tcp_prot), orphans,
tcp_death_row.tw_count, sockets, atomic_read(&tcp_death_row.tw_count), sockets,
proto_memory_allocated(&tcp_prot)); proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n", seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot), sock_prot_inuse_get(net, &udp_prot),
......
...@@ -1685,7 +1685,7 @@ int tcp_v4_rcv(struct sk_buff *skb) ...@@ -1685,7 +1685,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
iph->daddr, th->dest, iph->daddr, th->dest,
inet_iif(skb)); inet_iif(skb));
if (sk2) { if (sk2) {
inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); inet_twsk_deschedule(inet_twsk(sk));
inet_twsk_put(inet_twsk(sk)); inet_twsk_put(inet_twsk(sk));
sk = sk2; sk = sk2;
goto process; goto process;
...@@ -2242,9 +2242,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) ...@@ -2242,9 +2242,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
static void get_timewait4_sock(const struct inet_timewait_sock *tw, static void get_timewait4_sock(const struct inet_timewait_sock *tw,
struct seq_file *f, int i) struct seq_file *f, int i)
{ {
long delta = tw->tw_timer.expires - jiffies;
__be32 dest, src; __be32 dest, src;
__u16 destp, srcp; __u16 destp, srcp;
s32 delta = tw->tw_ttd - inet_tw_time_stamp();
dest = tw->tw_daddr; dest = tw->tw_daddr;
src = tw->tw_rcv_saddr; src = tw->tw_rcv_saddr;
......
...@@ -34,18 +34,7 @@ int sysctl_tcp_abort_on_overflow __read_mostly; ...@@ -34,18 +34,7 @@ int sysctl_tcp_abort_on_overflow __read_mostly;
struct inet_timewait_death_row tcp_death_row = { struct inet_timewait_death_row tcp_death_row = {
.sysctl_max_tw_buckets = NR_FILE * 2, .sysctl_max_tw_buckets = NR_FILE * 2,
.period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
.death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
.hashinfo = &tcp_hashinfo, .hashinfo = &tcp_hashinfo,
.tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
(unsigned long)&tcp_death_row),
.twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
inet_twdr_twkill_work),
/* Short-time timewait calendar */
.twcal_hand = -1,
.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
(unsigned long)&tcp_death_row),
}; };
EXPORT_SYMBOL_GPL(tcp_death_row); EXPORT_SYMBOL_GPL(tcp_death_row);
...@@ -158,7 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -158,7 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (!th->fin || if (!th->fin ||
TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst: kill_with_rst:
inet_twsk_deschedule(tw, &tcp_death_row); inet_twsk_deschedule(tw);
inet_twsk_put(tw); inet_twsk_put(tw);
return TCP_TW_RST; return TCP_TW_RST;
} }
...@@ -174,11 +163,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -174,11 +163,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (tcp_death_row.sysctl_tw_recycle && if (tcp_death_row.sysctl_tw_recycle &&
tcptw->tw_ts_recent_stamp && tcptw->tw_ts_recent_stamp &&
tcp_tw_remember_stamp(tw)) tcp_tw_remember_stamp(tw))
inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, inet_twsk_schedule(tw, tw->tw_timeout);
TCP_TIMEWAIT_LEN);
else else
inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
TCP_TIMEWAIT_LEN);
return TCP_TW_ACK; return TCP_TW_ACK;
} }
...@@ -211,13 +198,12 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -211,13 +198,12 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
*/ */
if (sysctl_tcp_rfc1337 == 0) { if (sysctl_tcp_rfc1337 == 0) {
kill: kill:
inet_twsk_deschedule(tw, &tcp_death_row); inet_twsk_deschedule(tw);
inet_twsk_put(tw); inet_twsk_put(tw);
return TCP_TW_SUCCESS; return TCP_TW_SUCCESS;
} }
} }
inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
TCP_TIMEWAIT_LEN);
if (tmp_opt.saw_tstamp) { if (tmp_opt.saw_tstamp) {
tcptw->tw_ts_recent = tmp_opt.rcv_tsval; tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
...@@ -267,8 +253,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, ...@@ -267,8 +253,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
* Do not reschedule in the last case. * Do not reschedule in the last case.
*/ */
if (paws_reject || th->ack) if (paws_reject || th->ack)
inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
TCP_TIMEWAIT_LEN);
return tcp_timewait_check_oow_rate_limit( return tcp_timewait_check_oow_rate_limit(
tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
...@@ -283,16 +268,15 @@ EXPORT_SYMBOL(tcp_timewait_state_process); ...@@ -283,16 +268,15 @@ EXPORT_SYMBOL(tcp_timewait_state_process);
*/ */
void tcp_time_wait(struct sock *sk, int state, int timeo) void tcp_time_wait(struct sock *sk, int state, int timeo)
{ {
struct inet_timewait_sock *tw = NULL;
const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk); const struct tcp_sock *tp = tcp_sk(sk);
struct inet_timewait_sock *tw;
bool recycle_ok = false; bool recycle_ok = false;
if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tcp_remember_stamp(sk); recycle_ok = tcp_remember_stamp(sk);
if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) tw = inet_twsk_alloc(sk, &tcp_death_row, state);
tw = inet_twsk_alloc(sk, state);
if (tw) { if (tw) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
...@@ -355,8 +339,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) ...@@ -355,8 +339,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
timeo = TCP_TIMEWAIT_LEN; timeo = TCP_TIMEWAIT_LEN;
} }
inet_twsk_schedule(tw, &tcp_death_row, timeo, inet_twsk_schedule(tw, timeo);
TCP_TIMEWAIT_LEN);
inet_twsk_put(tw); inet_twsk_put(tw);
} else { } else {
/* Sorry, if we're out of memory, just CLOSE this /* Sorry, if we're out of memory, just CLOSE this
......
...@@ -246,7 +246,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, ...@@ -246,7 +246,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
*twp = tw; *twp = tw;
} else if (tw) { } else if (tw) {
/* Silly. Should hash-dance instead... */ /* Silly. Should hash-dance instead... */
inet_twsk_deschedule(tw, death_row); inet_twsk_deschedule(tw);
inet_twsk_put(tw); inet_twsk_put(tw);
} }
......
...@@ -1486,7 +1486,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) ...@@ -1486,7 +1486,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
ntohs(th->dest), tcp_v6_iif(skb)); ntohs(th->dest), tcp_v6_iif(skb));
if (sk2) { if (sk2) {
struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_timewait_sock *tw = inet_twsk(sk);
inet_twsk_deschedule(tw, &tcp_death_row); inet_twsk_deschedule(tw);
inet_twsk_put(tw); inet_twsk_put(tw);
sk = sk2; sk = sk2;
tcp_v6_restore_cb(skb); tcp_v6_restore_cb(skb);
...@@ -1728,9 +1728,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) ...@@ -1728,9 +1728,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
static void get_timewait6_sock(struct seq_file *seq, static void get_timewait6_sock(struct seq_file *seq,
struct inet_timewait_sock *tw, int i) struct inet_timewait_sock *tw, int i)
{ {
long delta = tw->tw_timer.expires - jiffies;
const struct in6_addr *dest, *src; const struct in6_addr *dest, *src;
__u16 destp, srcp; __u16 destp, srcp;
s32 delta = tw->tw_ttd - inet_tw_time_stamp();
dest = &tw->tw_v6_daddr; dest = &tw->tw_v6_daddr;
src = &tw->tw_v6_rcv_saddr; src = &tw->tw_v6_rcv_saddr;
......
...@@ -272,7 +272,7 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, ...@@ -272,7 +272,7 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
hp->source, lport ? lport : hp->dest, hp->source, lport ? lport : hp->dest,
skb->dev, NFT_LOOKUP_LISTENER); skb->dev, NFT_LOOKUP_LISTENER);
if (sk2) { if (sk2) {
inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); inet_twsk_deschedule(inet_twsk(sk));
inet_twsk_put(inet_twsk(sk)); inet_twsk_put(inet_twsk(sk));
sk = sk2; sk = sk2;
} }
...@@ -437,7 +437,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, ...@@ -437,7 +437,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
tgi->lport ? tgi->lport : hp->dest, tgi->lport ? tgi->lport : hp->dest,
skb->dev, NFT_LOOKUP_LISTENER); skb->dev, NFT_LOOKUP_LISTENER);
if (sk2) { if (sk2) {
inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); inet_twsk_deschedule(inet_twsk(sk));
inet_twsk_put(inet_twsk(sk)); inet_twsk_put(inet_twsk(sk));
sk = sk2; sk = sk2;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment