Commit 7f5c6d4f authored by Eric Dumazet's avatar Eric Dumazet Committed by Patrick McHardy

netfilter: get rid of atomic ops in fast path

We currently use a percpu spinlock to 'protect' rule bytes/packets
counters, after various attempts to use RCU instead.

Lately we added a seqlock so that get_counters() can run without
blocking BH or 'writers'. But we really only need the seqcount in it.

Spinlock itself is only locked by the current/owner cpu, so we can
remove it completely.

This cleanups api, using correct 'writer' vs 'reader' semantic.

At replace time, the get_counters() call makes sure all cpus are done
using the old table.
Signed-off-by: default avatarEric Dumazet <eric.dumazet@gmail.com>
Cc: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: default avatarPatrick McHardy <kaber@trash.net>
parent 8f7b01a1
...@@ -456,72 +456,60 @@ extern void xt_proto_fini(struct net *net, u_int8_t af); ...@@ -456,72 +456,60 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
extern struct xt_table_info *xt_alloc_table_info(unsigned int size); extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
extern void xt_free_table_info(struct xt_table_info *info); extern void xt_free_table_info(struct xt_table_info *info);
/* /**
* Per-CPU spinlock associated with per-cpu table entries, and * xt_recseq - recursive seqcount for netfilter use
* with a counter for the "reading" side that allows a recursive *
* reader to avoid taking the lock and deadlocking. * Packet processing changes the seqcount only if no recursion happened
* * get_counters() can use read_seqcount_begin()/read_seqcount_retry(),
* "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu. * because we use the normal seqcount convention :
* It needs to ensure that the rules are not being changed while the packet * Low order bit set to 1 if a writer is active.
* is being processed. In some cases, the read lock will be acquired
* twice on the same CPU; this is okay because of the count.
*
* "writing" is used when reading counters.
* During replace any readers that are using the old tables have to complete
* before freeing the old table. This is handled by the write locking
* necessary for reading the counters.
*/ */
struct xt_info_lock { DECLARE_PER_CPU(seqcount_t, xt_recseq);
seqlock_t lock;
unsigned char readers;
};
DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
/* /**
* Note: we need to ensure that preemption is disabled before acquiring * xt_write_recseq_begin - start of a write section
* the per-cpu-variable, so we do it as a two step process rather than
* using "spin_lock_bh()".
*
* We _also_ need to disable bottom half processing before updating our
* nesting count, to make sure that the only kind of re-entrancy is this
* code being called by itself: since the count+lock is not an atomic
* operation, we can allow no races.
* *
* _Only_ that special combination of being per-cpu and never getting * Begin packet processing : all readers must wait the end
* re-entered asynchronously means that the count is safe. * 1) Must be called with preemption disabled
* 2) softirqs must be disabled too (or we should use irqsafe_cpu_add())
* Returns :
* 1 if no recursion on this cpu
* 0 if recursion detected
*/ */
static inline void xt_info_rdlock_bh(void) static inline unsigned int xt_write_recseq_begin(void)
{ {
struct xt_info_lock *lock; unsigned int addend;
local_bh_disable(); /*
lock = &__get_cpu_var(xt_info_locks); * Low order bit of sequence is set if we already
if (likely(!lock->readers++)) * called xt_write_recseq_begin().
write_seqlock(&lock->lock); */
} addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1;
static inline void xt_info_rdunlock_bh(void) /*
{ * This is kind of a write_seqcount_begin(), but addend is 0 or 1
struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks); * We dont check addend value to avoid a test and conditional jump,
* since addend is most likely 1
*/
__this_cpu_add(xt_recseq.sequence, addend);
smp_wmb();
if (likely(!--lock->readers)) return addend;
write_sequnlock(&lock->lock);
local_bh_enable();
} }
/* /**
* The "writer" side needs to get exclusive access to the lock, * xt_write_recseq_end - end of a write section
* regardless of readers. This must be called with bottom half * @addend: return value from previous xt_write_recseq_begin()
* processing (and thus also preemption) disabled. *
* End packet processing : all readers can proceed
* 1) Must be called with preemption disabled
* 2) softirqs must be disabled too (or we should use irqsafe_cpu_add())
*/ */
static inline void xt_info_wrlock(unsigned int cpu) static inline void xt_write_recseq_end(unsigned int addend)
{
write_seqlock(&per_cpu(xt_info_locks, cpu).lock);
}
static inline void xt_info_wrunlock(unsigned int cpu)
{ {
write_sequnlock(&per_cpu(xt_info_locks, cpu).lock); /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
smp_wmb();
__this_cpu_add(xt_recseq.sequence, addend);
} }
/* /*
......
...@@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, ...@@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
void *table_base; void *table_base;
const struct xt_table_info *private; const struct xt_table_info *private;
struct xt_action_param acpar; struct xt_action_param acpar;
unsigned int addend;
if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
return NF_DROP; return NF_DROP;
...@@ -267,7 +268,8 @@ unsigned int arpt_do_table(struct sk_buff *skb, ...@@ -267,7 +268,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
indev = in ? in->name : nulldevname; indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname; outdev = out ? out->name : nulldevname;
xt_info_rdlock_bh(); local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private; private = table->private;
table_base = private->entries[smp_processor_id()]; table_base = private->entries[smp_processor_id()];
...@@ -338,7 +340,8 @@ unsigned int arpt_do_table(struct sk_buff *skb, ...@@ -338,7 +340,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
/* Verdict */ /* Verdict */
break; break;
} while (!acpar.hotdrop); } while (!acpar.hotdrop);
xt_info_rdunlock_bh(); xt_write_recseq_end(addend);
local_bh_enable();
if (acpar.hotdrop) if (acpar.hotdrop)
return NF_DROP; return NF_DROP;
...@@ -712,7 +715,7 @@ static void get_counters(const struct xt_table_info *t, ...@@ -712,7 +715,7 @@ static void get_counters(const struct xt_table_info *t,
unsigned int i; unsigned int i;
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; seqcount_t *s = &per_cpu(xt_recseq, cpu);
i = 0; i = 0;
xt_entry_foreach(iter, t->entries[cpu], t->size) { xt_entry_foreach(iter, t->entries[cpu], t->size) {
...@@ -720,10 +723,10 @@ static void get_counters(const struct xt_table_info *t, ...@@ -720,10 +723,10 @@ static void get_counters(const struct xt_table_info *t,
unsigned int start; unsigned int start;
do { do {
start = read_seqbegin(lock); start = read_seqcount_begin(s);
bcnt = iter->counters.bcnt; bcnt = iter->counters.bcnt;
pcnt = iter->counters.pcnt; pcnt = iter->counters.pcnt;
} while (read_seqretry(lock, start)); } while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt); ADD_COUNTER(counters[i], bcnt, pcnt);
++i; ++i;
...@@ -1115,6 +1118,7 @@ static int do_add_counters(struct net *net, const void __user *user, ...@@ -1115,6 +1118,7 @@ static int do_add_counters(struct net *net, const void __user *user,
int ret = 0; int ret = 0;
void *loc_cpu_entry; void *loc_cpu_entry;
struct arpt_entry *iter; struct arpt_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
struct compat_xt_counters_info compat_tmp; struct compat_xt_counters_info compat_tmp;
...@@ -1171,12 +1175,12 @@ static int do_add_counters(struct net *net, const void __user *user, ...@@ -1171,12 +1175,12 @@ static int do_add_counters(struct net *net, const void __user *user,
/* Choose the copy that is on our node */ /* Choose the copy that is on our node */
curcpu = smp_processor_id(); curcpu = smp_processor_id();
loc_cpu_entry = private->entries[curcpu]; loc_cpu_entry = private->entries[curcpu];
xt_info_wrlock(curcpu); addend = xt_write_recseq_begin();
xt_entry_foreach(iter, loc_cpu_entry, private->size) { xt_entry_foreach(iter, loc_cpu_entry, private->size) {
ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
++i; ++i;
} }
xt_info_wrunlock(curcpu); xt_write_recseq_end(addend);
unlock_up_free: unlock_up_free:
local_bh_enable(); local_bh_enable();
xt_table_unlock(t); xt_table_unlock(t);
......
...@@ -68,15 +68,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info) ...@@ -68,15 +68,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info)
} }
EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
/*
We keep a set of rules for each CPU, so we can avoid write-locking
them in the softirq when updating the counters and therefore
only need to read-lock in the softirq; doing a write_lock_bh() in user
context stops packets coming through and allows user context to read
the counters or update the rules.
Hence the start of any table is given by get_table() below. */
/* Returns whether matches rule or not. */ /* Returns whether matches rule or not. */
/* Performance critical - called for every packet */ /* Performance critical - called for every packet */
static inline bool static inline bool
...@@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb, ...@@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb,
unsigned int *stackptr, origptr, cpu; unsigned int *stackptr, origptr, cpu;
const struct xt_table_info *private; const struct xt_table_info *private;
struct xt_action_param acpar; struct xt_action_param acpar;
unsigned int addend;
/* Initialization */ /* Initialization */
ip = ip_hdr(skb); ip = ip_hdr(skb);
...@@ -331,7 +323,8 @@ ipt_do_table(struct sk_buff *skb, ...@@ -331,7 +323,8 @@ ipt_do_table(struct sk_buff *skb,
acpar.hooknum = hook; acpar.hooknum = hook;
IP_NF_ASSERT(table->valid_hooks & (1 << hook)); IP_NF_ASSERT(table->valid_hooks & (1 << hook));
xt_info_rdlock_bh(); local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private; private = table->private;
cpu = smp_processor_id(); cpu = smp_processor_id();
table_base = private->entries[cpu]; table_base = private->entries[cpu];
...@@ -430,7 +423,9 @@ ipt_do_table(struct sk_buff *skb, ...@@ -430,7 +423,9 @@ ipt_do_table(struct sk_buff *skb,
pr_debug("Exiting %s; resetting sp from %u to %u\n", pr_debug("Exiting %s; resetting sp from %u to %u\n",
__func__, *stackptr, origptr); __func__, *stackptr, origptr);
*stackptr = origptr; *stackptr = origptr;
xt_info_rdunlock_bh(); xt_write_recseq_end(addend);
local_bh_enable();
#ifdef DEBUG_ALLOW_ALL #ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT; return NF_ACCEPT;
#else #else
...@@ -886,7 +881,7 @@ get_counters(const struct xt_table_info *t, ...@@ -886,7 +881,7 @@ get_counters(const struct xt_table_info *t,
unsigned int i; unsigned int i;
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; seqcount_t *s = &per_cpu(xt_recseq, cpu);
i = 0; i = 0;
xt_entry_foreach(iter, t->entries[cpu], t->size) { xt_entry_foreach(iter, t->entries[cpu], t->size) {
...@@ -894,10 +889,10 @@ get_counters(const struct xt_table_info *t, ...@@ -894,10 +889,10 @@ get_counters(const struct xt_table_info *t,
unsigned int start; unsigned int start;
do { do {
start = read_seqbegin(lock); start = read_seqcount_begin(s);
bcnt = iter->counters.bcnt; bcnt = iter->counters.bcnt;
pcnt = iter->counters.pcnt; pcnt = iter->counters.pcnt;
} while (read_seqretry(lock, start)); } while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt); ADD_COUNTER(counters[i], bcnt, pcnt);
++i; /* macro does multi eval of i */ ++i; /* macro does multi eval of i */
...@@ -1312,6 +1307,7 @@ do_add_counters(struct net *net, const void __user *user, ...@@ -1312,6 +1307,7 @@ do_add_counters(struct net *net, const void __user *user,
int ret = 0; int ret = 0;
void *loc_cpu_entry; void *loc_cpu_entry;
struct ipt_entry *iter; struct ipt_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
struct compat_xt_counters_info compat_tmp; struct compat_xt_counters_info compat_tmp;
...@@ -1368,12 +1364,12 @@ do_add_counters(struct net *net, const void __user *user, ...@@ -1368,12 +1364,12 @@ do_add_counters(struct net *net, const void __user *user,
/* Choose the copy that is on our node */ /* Choose the copy that is on our node */
curcpu = smp_processor_id(); curcpu = smp_processor_id();
loc_cpu_entry = private->entries[curcpu]; loc_cpu_entry = private->entries[curcpu];
xt_info_wrlock(curcpu); addend = xt_write_recseq_begin();
xt_entry_foreach(iter, loc_cpu_entry, private->size) { xt_entry_foreach(iter, loc_cpu_entry, private->size) {
ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
++i; ++i;
} }
xt_info_wrunlock(curcpu); xt_write_recseq_end(addend);
unlock_up_free: unlock_up_free:
local_bh_enable(); local_bh_enable();
xt_table_unlock(t); xt_table_unlock(t);
......
...@@ -340,6 +340,7 @@ ip6t_do_table(struct sk_buff *skb, ...@@ -340,6 +340,7 @@ ip6t_do_table(struct sk_buff *skb,
unsigned int *stackptr, origptr, cpu; unsigned int *stackptr, origptr, cpu;
const struct xt_table_info *private; const struct xt_table_info *private;
struct xt_action_param acpar; struct xt_action_param acpar;
unsigned int addend;
/* Initialization */ /* Initialization */
indev = in ? in->name : nulldevname; indev = in ? in->name : nulldevname;
...@@ -358,7 +359,8 @@ ip6t_do_table(struct sk_buff *skb, ...@@ -358,7 +359,8 @@ ip6t_do_table(struct sk_buff *skb,
IP_NF_ASSERT(table->valid_hooks & (1 << hook)); IP_NF_ASSERT(table->valid_hooks & (1 << hook));
xt_info_rdlock_bh(); local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private; private = table->private;
cpu = smp_processor_id(); cpu = smp_processor_id();
table_base = private->entries[cpu]; table_base = private->entries[cpu];
...@@ -442,7 +444,9 @@ ip6t_do_table(struct sk_buff *skb, ...@@ -442,7 +444,9 @@ ip6t_do_table(struct sk_buff *skb,
} while (!acpar.hotdrop); } while (!acpar.hotdrop);
*stackptr = origptr; *stackptr = origptr;
xt_info_rdunlock_bh();
xt_write_recseq_end(addend);
local_bh_enable();
#ifdef DEBUG_ALLOW_ALL #ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT; return NF_ACCEPT;
...@@ -899,7 +903,7 @@ get_counters(const struct xt_table_info *t, ...@@ -899,7 +903,7 @@ get_counters(const struct xt_table_info *t,
unsigned int i; unsigned int i;
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; seqcount_t *s = &per_cpu(xt_recseq, cpu);
i = 0; i = 0;
xt_entry_foreach(iter, t->entries[cpu], t->size) { xt_entry_foreach(iter, t->entries[cpu], t->size) {
...@@ -907,10 +911,10 @@ get_counters(const struct xt_table_info *t, ...@@ -907,10 +911,10 @@ get_counters(const struct xt_table_info *t,
unsigned int start; unsigned int start;
do { do {
start = read_seqbegin(lock); start = read_seqcount_begin(s);
bcnt = iter->counters.bcnt; bcnt = iter->counters.bcnt;
pcnt = iter->counters.pcnt; pcnt = iter->counters.pcnt;
} while (read_seqretry(lock, start)); } while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt); ADD_COUNTER(counters[i], bcnt, pcnt);
++i; ++i;
...@@ -1325,6 +1329,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, ...@@ -1325,6 +1329,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
int ret = 0; int ret = 0;
const void *loc_cpu_entry; const void *loc_cpu_entry;
struct ip6t_entry *iter; struct ip6t_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
struct compat_xt_counters_info compat_tmp; struct compat_xt_counters_info compat_tmp;
...@@ -1381,13 +1386,13 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, ...@@ -1381,13 +1386,13 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
i = 0; i = 0;
/* Choose the copy that is on our node */ /* Choose the copy that is on our node */
curcpu = smp_processor_id(); curcpu = smp_processor_id();
xt_info_wrlock(curcpu); addend = xt_write_recseq_begin();
loc_cpu_entry = private->entries[curcpu]; loc_cpu_entry = private->entries[curcpu];
xt_entry_foreach(iter, loc_cpu_entry, private->size) { xt_entry_foreach(iter, loc_cpu_entry, private->size) {
ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
++i; ++i;
} }
xt_info_wrunlock(curcpu); xt_write_recseq_end(addend);
unlock_up_free: unlock_up_free:
local_bh_enable(); local_bh_enable();
......
...@@ -762,8 +762,8 @@ void xt_compat_unlock(u_int8_t af) ...@@ -762,8 +762,8 @@ void xt_compat_unlock(u_int8_t af)
EXPORT_SYMBOL_GPL(xt_compat_unlock); EXPORT_SYMBOL_GPL(xt_compat_unlock);
#endif #endif
DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks); DEFINE_PER_CPU(seqcount_t, xt_recseq);
EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
static int xt_jumpstack_alloc(struct xt_table_info *i) static int xt_jumpstack_alloc(struct xt_table_info *i)
{ {
...@@ -1362,10 +1362,7 @@ static int __init xt_init(void) ...@@ -1362,10 +1362,7 @@ static int __init xt_init(void)
int rv; int rv;
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
struct xt_info_lock *lock = &per_cpu(xt_info_locks, i); seqcount_init(&per_cpu(xt_recseq, i));
seqlock_init(&lock->lock);
lock->readers = 0;
} }
xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment