Commit 596cf3fe authored by Vishwanath Pai's avatar Vishwanath Pai Committed by Pablo Neira Ayuso

netfilter: ipset: fix race condition in ipset save, swap and delete

This fix adds a new reference counter (ref_netlink) for the struct ip_set.
The other reference counter (ref) can be swapped out by ip_set_swap and we
need a separate counter to keep track of references for netlink events
like dump. Using the same ref counter for dump causes a race condition
which can be demonstrated by the following script:

ipset create hash_ip1 hash:ip family inet hashsize 1024 maxelem 500000 \
counters
ipset create hash_ip2 hash:ip family inet hashsize 300000 maxelem 500000 \
counters
ipset create hash_ip3 hash:ip family inet hashsize 1024 maxelem 500000 \
counters

ipset save &

ipset swap hash_ip3 hash_ip2
ipset destroy hash_ip3 /* will crash the machine */

Swap will exchange the values of ref so destroy will see ref = 0 instead of
ref = 1. With this fix in place swap will not succeed because ipset save
still has ref_netlink on the set (ip_set_swap doesn't swap ref_netlink).

Both delete and swap will error out if ref_netlink != 0 on the set.

Note: The changes to *_head functions is because previously we would
increment ref whenever we called these functions, we don't do that
anymore.
Reviewed-by: default avatarJoshua Hunt <johunt@akamai.com>
Signed-off-by: default avatarVishwanath Pai <vpai@akamai.com>
Signed-off-by: default avatarJozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: default avatarPablo Neira Ayuso <pablo@netfilter.org>
parent d7be81a5
...@@ -234,6 +234,10 @@ struct ip_set { ...@@ -234,6 +234,10 @@ struct ip_set {
spinlock_t lock; spinlock_t lock;
/* References to the set */ /* References to the set */
u32 ref; u32 ref;
/* References to the set for netlink events like dump,
* ref can be swapped out by ip_set_swap
*/
u32 ref_netlink;
/* The core set type */ /* The core set type */
struct ip_set_type *type; struct ip_set_type *type;
/* The type variant doing the real job */ /* The type variant doing the real job */
......
...@@ -95,7 +95,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) ...@@ -95,7 +95,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
if (!nested) if (!nested)
goto nla_put_failure; goto nla_put_failure;
if (mtype_do_head(skb, map) || if (mtype_do_head(skb, map) ||
nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
goto nla_put_failure; goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set))) if (unlikely(ip_set_put_flags(skb, set)))
......
...@@ -497,6 +497,26 @@ __ip_set_put(struct ip_set *set) ...@@ -497,6 +497,26 @@ __ip_set_put(struct ip_set *set)
write_unlock_bh(&ip_set_ref_lock); write_unlock_bh(&ip_set_ref_lock);
} }
/* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need
* a separate reference counter
*/
static inline void
__ip_set_get_netlink(struct ip_set *set)
{
write_lock_bh(&ip_set_ref_lock);
set->ref_netlink++;
write_unlock_bh(&ip_set_ref_lock);
}
static inline void
__ip_set_put_netlink(struct ip_set *set)
{
write_lock_bh(&ip_set_ref_lock);
BUG_ON(set->ref_netlink == 0);
set->ref_netlink--;
write_unlock_bh(&ip_set_ref_lock);
}
/* Add, del and test set entries from kernel. /* Add, del and test set entries from kernel.
* *
* The set behind the index must exist and must be referenced * The set behind the index must exist and must be referenced
...@@ -1002,7 +1022,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, ...@@ -1002,7 +1022,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
if (!attr[IPSET_ATTR_SETNAME]) { if (!attr[IPSET_ATTR_SETNAME]) {
for (i = 0; i < inst->ip_set_max; i++) { for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i); s = ip_set(inst, i);
if (s && s->ref) { if (s && (s->ref || s->ref_netlink)) {
ret = -IPSET_ERR_BUSY; ret = -IPSET_ERR_BUSY;
goto out; goto out;
} }
...@@ -1024,7 +1044,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, ...@@ -1024,7 +1044,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
if (!s) { if (!s) {
ret = -ENOENT; ret = -ENOENT;
goto out; goto out;
} else if (s->ref) { } else if (s->ref || s->ref_netlink) {
ret = -IPSET_ERR_BUSY; ret = -IPSET_ERR_BUSY;
goto out; goto out;
} }
...@@ -1171,6 +1191,9 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, ...@@ -1171,6 +1191,9 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
from->family == to->family)) from->family == to->family))
return -IPSET_ERR_TYPE_MISMATCH; return -IPSET_ERR_TYPE_MISMATCH;
if (from->ref_netlink || to->ref_netlink)
return -EBUSY;
strncpy(from_name, from->name, IPSET_MAXNAMELEN); strncpy(from_name, from->name, IPSET_MAXNAMELEN);
strncpy(from->name, to->name, IPSET_MAXNAMELEN); strncpy(from->name, to->name, IPSET_MAXNAMELEN);
strncpy(to->name, from_name, IPSET_MAXNAMELEN); strncpy(to->name, from_name, IPSET_MAXNAMELEN);
...@@ -1206,7 +1229,7 @@ ip_set_dump_done(struct netlink_callback *cb) ...@@ -1206,7 +1229,7 @@ ip_set_dump_done(struct netlink_callback *cb)
if (set->variant->uref) if (set->variant->uref)
set->variant->uref(set, cb, false); set->variant->uref(set, cb, false);
pr_debug("release set %s\n", set->name); pr_debug("release set %s\n", set->name);
__ip_set_put_byindex(inst, index); __ip_set_put_netlink(set);
} }
return 0; return 0;
} }
...@@ -1328,7 +1351,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -1328,7 +1351,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
if (!cb->args[IPSET_CB_ARG0]) { if (!cb->args[IPSET_CB_ARG0]) {
/* Start listing: make sure set won't be destroyed */ /* Start listing: make sure set won't be destroyed */
pr_debug("reference set\n"); pr_debug("reference set\n");
set->ref++; set->ref_netlink++;
} }
write_unlock_bh(&ip_set_ref_lock); write_unlock_bh(&ip_set_ref_lock);
nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, nlh = start_msg(skb, NETLINK_CB(cb->skb).portid,
...@@ -1396,7 +1419,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -1396,7 +1419,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
if (set->variant->uref) if (set->variant->uref)
set->variant->uref(set, cb, false); set->variant->uref(set, cb, false);
pr_debug("release set %s\n", set->name); pr_debug("release set %s\n", set->name);
__ip_set_put_byindex(inst, index); __ip_set_put_netlink(set);
cb->args[IPSET_CB_ARG0] = 0; cb->args[IPSET_CB_ARG0] = 0;
} }
out: out:
......
...@@ -1082,7 +1082,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) ...@@ -1082,7 +1082,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
goto nla_put_failure; goto nla_put_failure;
#endif #endif
if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
goto nla_put_failure; goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set))) if (unlikely(ip_set_put_flags(skb, set)))
......
...@@ -458,7 +458,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) ...@@ -458,7 +458,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
if (!nested) if (!nested)
goto nla_put_failure; goto nla_put_failure;
if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
htonl(sizeof(*map) + n * set->dsize))) htonl(sizeof(*map) + n * set->dsize)))
goto nla_put_failure; goto nla_put_failure;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment