Commit 1cdba550 authored by David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next

Pablo Neira Ayuso says:

====================
Netfilter/IPVS/OVS updates for net-next

The following patchset contains Netfilter/IPVS fixes and OVS NAT
support, more specifically this batch is composed of:

1) Fix a crash in ipset when performing a parallel flush/dump with
   set:list type, from Jozsef Kadlecsik.

2) Make sure NFACCT_FILTER_* netlink attributes are in place before
   accessing them, from Phil Turnbull.

3) Check return error code from ip_vs_fill_iph_skb_off() in IPVS SIP
   helper, from Arnd Bergmann.

4) Add workaround to IPVS to reschedule existing connections to a new
   destination server by dropping the packet and waiting for the
   retransmitted TCP SYN packet, from Julian Anastasov.

5) Allow connection rescheduling in IPVS when in CLOSE state, also
   from Julian.

6) Fix wrong offset of SIP Call-ID in IPVS helper, from Marco Angaroni.

7) Validate IPSET_ATTR_ETHER netlink attribute length, from Jozsef.

8) Check match/targetinfo netlink attribute size in nft_compat,
   patch from Florian Westphal.

9) Check for integer overflow on 32-bit systems in x_tables, from
   Florian Westphal.

Several patches from Jarno Rajahalme to prepare the introduction of
NAT support to OVS based on the Netfilter infrastructure:

10) Schedule IP_CT_NEW_REPLY definition for removal in
    nf_conntrack_common.h.

11) Simplify checksumming recalculation in nf_nat.

12) Add comments to the openvswitch conntrack code, from Jarno.

13) Update the CT state key only after successful nf_conntrack_in()
    invocation.

14) Find existing conntrack entry after upcall.

15) Handle NF_REPEAT case due to templates in nf_conntrack_in().

16) Call the conntrack helper functions once the conntrack has been
    confirmed.

17) And finally, add the NAT interface to OVS.

The batch closes with:

18) Cleanup to use spin_unlock_wait() instead of
    spin_lock()/spin_unlock(), from Nicholas Mc Guire.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents acffb584 e39365be
@@ -1588,6 +1588,23 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
 }
 #endif /* CONFIG_IP_VS_NFCT */
 
+/* Really using conntrack? */
+static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp,
+					     struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_VS_NFCT
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		return false;
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct && !nf_ct_is_untracked(ct))
+		return true;
+#endif
+	return false;
+}
+
 static inline int
 ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
 {
@@ -20,9 +20,15 @@ enum ip_conntrack_info {
 	IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
 	IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
-	IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,
-	/* Number of distinct IP_CT types (no NEW in reply dirn). */
-	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
+	/* No NEW in reply direction. */
+
+	/* Number of distinct IP_CT types. */
+	IP_CT_NUMBER,
+
+	/* only for userspace compatibility */
+#ifndef __KERNEL__
+	IP_CT_NEW_REPLY = IP_CT_NUMBER,
+#endif
 };
 
 #define NF_CT_STATE_INVALID_BIT		(1 << 0)
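A note on the hunk above: the #ifndef __KERNEL__ arm removes IP_CT_NEW_REPLY from kernel code while keeping the symbol visible to userspace builds, so existing userspace sources continue to compile. A minimal, hypothetical userspace consumer for illustration:

#include <linux/netfilter/nf_conntrack_common.h>
#include <stdio.h>

int main(void)
{
	/* After this change, IP_CT_NEW_REPLY is merely an alias of
	 * IP_CT_NUMBER that exists only for userspace compatibility.
	 */
	printf("IP_CT_NEW_REPLY=%d IP_CT_NUMBER=%d\n",
	       IP_CT_NEW_REPLY, IP_CT_NUMBER);
	return 0;
}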
@@ -454,6 +454,14 @@ struct ovs_key_ct_labels {
 #define OVS_CS_F_REPLY_DIR         0x08 /* Flow is in the reply direction. */
 #define OVS_CS_F_INVALID           0x10 /* Could not track connection. */
 #define OVS_CS_F_TRACKED           0x20 /* Conntrack has occurred. */
+#define OVS_CS_F_SRC_NAT           0x40 /* Packet's source address/port was
+					 * mangled by NAT.
+					 */
+#define OVS_CS_F_DST_NAT           0x80 /* Packet's destination address/port
+					 * was mangled by NAT.
+					 */
+
+#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.

@@ -632,6 +640,8 @@ struct ovs_action_hash {
 * mask. For each bit set in the mask, the corresponding bit in the value is
 * copied to the connection tracking label field in the connection.
 * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
+ * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address
+ * translation (NAT) on the packet.
 */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,

@@ -641,11 +651,50 @@ enum ovs_ct_attr {
 	OVS_CT_ATTR_LABELS,	/* labels to associate with this connection. */
 	OVS_CT_ATTR_HELPER,	/* netlink helper to assist detection of
 				   related connections. */
+	OVS_CT_ATTR_NAT,	/* Nested OVS_NAT_ATTR_* */
 	__OVS_CT_ATTR_MAX
 };
 
 #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
 
+/**
+ * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT.
+ *
+ * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port).
+ * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination
+ * address/port).  Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be
+ * specified.  Effective only for packets for ct_state NEW connections.
+ * Packets of committed connections are mangled by the NAT action according to
+ * the committed NAT type regardless of the flags specified.  As a corollary,
+ * a NAT action without a NAT type flag will only mangle packets of committed
+ * connections.  The following NAT attributes only apply for NEW
+ * (non-committed) connections, and they may be included only when the CT
+ * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or
+ * @OVS_NAT_ATTR_DST is also included.
+ * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port)
+ * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port)
+ * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots
+ * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5)
+ * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping
+ */
+enum ovs_nat_attr {
+	OVS_NAT_ATTR_UNSPEC,
+	OVS_NAT_ATTR_SRC,
+	OVS_NAT_ATTR_DST,
+	OVS_NAT_ATTR_IP_MIN,
+	OVS_NAT_ATTR_IP_MAX,
+	OVS_NAT_ATTR_PROTO_MIN,
+	OVS_NAT_ATTR_PROTO_MAX,
+	OVS_NAT_ATTR_PERSISTENT,
+	OVS_NAT_ATTR_PROTO_HASH,
+	OVS_NAT_ATTR_PROTO_RANDOM,
+	__OVS_NAT_ATTR_MAX,
+};
+
+#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1)
+
 /**
  * enum ovs_action_attr - Action types.
 *
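To make the new UAPI above concrete, here is a hedged userspace sketch (not part of this patchset) of how the nested OVS_CT_ATTR_NAT attributes could be assembled with libmnl. The helper name put_snat_range is hypothetical, and the genetlink/OVS action framing around the nest is omitted:

#include <libmnl/libmnl.h>
#include <linux/openvswitch.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Build an SNAT nest mapping sources into lo..hi with a port range. */
static void put_snat_range(struct nlmsghdr *nlh,
			   const char *lo, const char *hi)
{
	struct nlattr *nat = mnl_attr_nest_start(nlh, OVS_CT_ATTR_NAT);
	struct in_addr min, max;

	inet_pton(AF_INET, lo, &min);
	inet_pton(AF_INET, hi, &max);
	mnl_attr_put(nlh, OVS_NAT_ATTR_SRC, 0, NULL);	/* flag: SNAT */
	mnl_attr_put(nlh, OVS_NAT_ATTR_IP_MIN, sizeof(min), &min);
	mnl_attr_put(nlh, OVS_NAT_ATTR_IP_MAX, sizeof(max), &max);
	mnl_attr_put_u16(nlh, OVS_NAT_ATTR_PROTO_MIN, 1024);	/* port range */
	mnl_attr_put_u16(nlh, OVS_NAT_ATTR_PROTO_MAX, 65535);
	mnl_attr_nest_end(nlh, nat);
}

Per the kernel-doc above, such a range nest is only meaningful together with OVS_CT_ATTR_COMMIT on the enclosing CT action.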
@@ -127,29 +127,15 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
 				    u8 proto, void *data, __sum16 *check,
 				    int datalen, int oldlen)
 {
-	const struct iphdr *iph = ip_hdr(skb);
-	struct rtable *rt = skb_rtable(skb);
-
 	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		if (!(rt->rt_flags & RTCF_LOCAL) &&
-		    (!skb->dev || skb->dev->features &
-		     (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_headroom(skb) +
-					  skb_network_offset(skb) +
-					  ip_hdrlen(skb);
-			skb->csum_offset = (void *)check - data;
-			*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-						    datalen, proto, 0);
-		} else {
-			*check = 0;
-			*check = csum_tcpudp_magic(iph->saddr, iph->daddr,
-						   datalen, proto,
-						   csum_partial(data, datalen,
-								0));
-			if (proto == IPPROTO_UDP && !*check)
-				*check = CSUM_MANGLED_0;
-		}
+		const struct iphdr *iph = ip_hdr(skb);
+
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+			ip_hdrlen(skb);
+		skb->csum_offset = (void *)check - data;
+		*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
+					    proto, 0);
 	} else
 		inet_proto_csum_replace2(check, skb,
 					 htons(oldlen), htons(datalen), true);
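The simplification above (and its IPv6 twin below) drops the route and device-feature checks: the helper now always marks the skb CHECKSUM_PARTIAL and seeds *check with the pseudo-header sum, deferring the payload part to the device or to the stack's software fallback. As a rough illustration of what that seed is, here is a simplified userspace sketch of the folded one's-complement pseudo-header sum that csum_tcpudp_magic() computes (byte-order handling elided; addresses taken as host-order words):

#include <stdint.h>

static uint16_t pseudo_hdr_seed(uint32_t saddr, uint32_t daddr,
				uint8_t proto, uint16_t datalen)
{
	uint64_t sum = 0;

	sum += saddr >> 16;		/* source address, 16-bit halves */
	sum += saddr & 0xffff;
	sum += daddr >> 16;		/* destination address */
	sum += daddr & 0xffff;
	sum += proto;			/* zero padding byte + protocol */
	sum += datalen;			/* L4 length */

	while (sum >> 16)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);

	/* The device adds the payload sum and inverts the final result. */
	return (uint16_t)sum;
}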
@@ -131,29 +131,15 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
 				    u8 proto, void *data, __sum16 *check,
 				    int datalen, int oldlen)
 {
-	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
-	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
-
 	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		if (!(rt->rt6i_flags & RTF_LOCAL) &&
-		    (!skb->dev || skb->dev->features &
-		     (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_headroom(skb) +
-					  skb_network_offset(skb) +
-					  (data - (void *)skb->data);
-			skb->csum_offset = (void *)check - data;
-			*check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
-						  datalen, proto, 0);
-		} else {
-			*check = 0;
-			*check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
-						 datalen, proto,
-						 csum_partial(data, datalen,
-							      0));
-			if (proto == IPPROTO_UDP && !*check)
-				*check = CSUM_MANGLED_0;
-		}
+		const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+			(data - (void *)skb->data);
+		skb->csum_offset = (void *)check - data;
+		*check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+					  datalen, proto, 0);
 	} else
 		inet_proto_csum_replace2(check, skb,
 					 htons(oldlen), htons(datalen), true);
@@ -267,6 +267,8 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
 	e.id = ip_to_id(map, ip);
 
 	if (tb[IPSET_ATTR_ETHER]) {
+		if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)
+			return -IPSET_ERR_PROTOCOL;
 		memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
 		e.add_mac = 1;
 	}
@@ -985,6 +985,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
 	if (unlikely(protocol_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
+	/* Must wait for flush to be really finished in list:set */
+	rcu_barrier();
+
 	/* Commands are serialized and references are
 	 * protected by the ip_set_ref_lock.
 	 * External systems (i.e. xt_set) must call
@@ -110,7 +110,8 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (tb[IPSET_ATTR_LINENO])
 		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
-	if (unlikely(!tb[IPSET_ATTR_ETHER]))
+	if (unlikely(!tb[IPSET_ATTR_ETHER] ||
+		     nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN))
 		return -IPSET_ERR_PROTOCOL;
 
 	ret = ip_set_get_extensions(set, tb, &ext);
@@ -30,6 +30,7 @@ MODULE_ALIAS("ip_set_list:set");
 struct set_elem {
 	struct rcu_head rcu;
 	struct list_head list;
+	struct ip_set *set;	/* Sigh, in order to cleanup reference */
 	ip_set_id_t id;
 } __aligned(__alignof__(u64));

@@ -151,30 +152,29 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
 /* Userspace interfaces: we are protected by the nfnl mutex */
 
 static void
-__list_set_del(struct ip_set *set, struct set_elem *e)
+__list_set_del_rcu(struct rcu_head * rcu)
 {
+	struct set_elem *e = container_of(rcu, struct set_elem, rcu);
+	struct ip_set *set = e->set;
 	struct list_set *map = set->data;
 
 	ip_set_put_byindex(map->net, e->id);
-	/* We may call it, because we don't have a to be destroyed
-	 * extension which is used by the kernel.
-	 */
 	ip_set_ext_destroy(set, e);
-	kfree_rcu(e, rcu);
+	kfree(e);
 }
 
 static inline void
 list_set_del(struct ip_set *set, struct set_elem *e)
 {
 	list_del_rcu(&e->list);
-	__list_set_del(set, e);
+	call_rcu(&e->rcu, __list_set_del_rcu);
 }
 
 static inline void
-list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
+list_set_replace(struct set_elem *e, struct set_elem *old)
 {
 	list_replace_rcu(&old->list, &e->list);
-	__list_set_del(set, old);
+	call_rcu(&old->rcu, __list_set_del_rcu);
 }
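The back-pointer and the switch from kfree_rcu() to call_rcu() above exist because kfree_rcu() can only free the object, while this teardown must also drop the set reference and destroy extensions after the grace period. The general kernel pattern, as a sketch (struct owner and put_owner() are placeholders, not part of this patch):

struct elem {
	struct rcu_head rcu;
	struct list_head list;
	struct owner *owner;	/* back-pointer the callback will need */
};

static void elem_free_rcu(struct rcu_head *rcu)
{
	struct elem *e = container_of(rcu, struct elem, rcu);

	put_owner(e->owner);	/* extra cleanup kfree_rcu() cannot do */
	kfree(e);
}

static void elem_del(struct elem *e)
{
	list_del_rcu(&e->list);	/* unlink; readers may still see it */
	call_rcu(&e->rcu, elem_free_rcu);	/* free after a grace period */
}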
@@ -244,9 +244,6 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	struct set_elem *e, *n, *prev, *next;
 	bool flag_exist = flags & IPSET_FLAG_EXIST;
 
-	if (SET_WITH_TIMEOUT(set))
-		set_cleanup_entries(set);
-
 	/* Find where to add the new entry */
 	n = prev = next = NULL;
 	list_for_each_entry(e, &map->members, list) {

@@ -301,10 +298,11 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	if (!e)
 		return -ENOMEM;
 	e->id = d->id;
+	e->set = set;
 	INIT_LIST_HEAD(&e->list);
 	list_set_init_extensions(set, ext, e);
 	if (n)
-		list_set_replace(set, e, n);
+		list_set_replace(e, n);
 	else if (next)
 		list_add_tail_rcu(&e->list, &next->list);
 	else if (prev)

@@ -431,6 +429,7 @@ list_set_destroy(struct ip_set *set)
 	if (SET_WITH_TIMEOUT(set))
 		del_timer_sync(&map->gc);
+
 	list_for_each_entry_safe(e, n, &map->members, list) {
 		list_del(&e->list);
 		ip_set_put_byindex(map->net, e->id);

@@ -450,8 +449,10 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
 	struct set_elem *e;
 	u32 n = 0;
 
-	list_for_each_entry(e, &map->members, list)
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &map->members, list)
 		n++;
+	rcu_read_unlock();
 
 	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
 	if (!nested)

@@ -483,33 +484,25 @@ list_set_list(const struct ip_set *set,
 	atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
 	if (!atd)
 		return -EMSGSIZE;
-	list_for_each_entry(e, &map->members, list) {
-		if (i == first)
-			break;
-		i++;
-	}
 
 	rcu_read_lock();
-	list_for_each_entry_from(e, &map->members, list) {
-		i++;
-		if (SET_WITH_TIMEOUT(set) &&
-		    ip_set_timeout_expired(ext_timeout(e, set)))
+	list_for_each_entry_rcu(e, &map->members, list) {
+		if (i < first ||
+		    (SET_WITH_TIMEOUT(set) &&
+		     ip_set_timeout_expired(ext_timeout(e, set)))) {
+			i++;
 			continue;
+		}
 		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
-		if (!nested) {
-			if (i == first) {
-				nla_nest_cancel(skb, atd);
-				ret = -EMSGSIZE;
-				goto out;
-			}
+		if (!nested)
 			goto nla_put_failure;
-		}
 		if (nla_put_string(skb, IPSET_ATTR_NAME,
 				   ip_set_name_byindex(map->net, e->id)))
 			goto nla_put_failure;
 		if (ip_set_put_extensions(skb, set, e, true))
 			goto nla_put_failure;
 		ipset_nest_end(skb, nested);
+		i++;
 	}
 
 	ipset_nest_end(skb, atd);

@@ -520,10 +513,12 @@ list_set_list(const struct ip_set *set,
 nla_put_failure:
 	nla_nest_cancel(skb, nested);
 	if (unlikely(i == first)) {
+		nla_nest_cancel(skb, atd);
 		cb->args[IPSET_CB_ARG0] = 0;
 		ret = -EMSGSIZE;
+	} else {
+		cb->args[IPSET_CB_ARG0] = i;
 	}
-	cb->args[IPSET_CB_ARG0] = i - 1;
 	ipset_nest_end(skb, atd);
 out:
 	rcu_read_unlock();
@@ -1089,6 +1089,7 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
 	switch (cp->protocol) {
 	case IPPROTO_TCP:
 		return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
+		       (cp->state == IP_VS_TCP_S_CLOSE) ||
 		       ((conn_reuse_mode & 2) &&
 			(cp->state == IP_VS_TCP_S_FIN_WAIT) &&
 			(cp->flags & IP_VS_CONN_F_NOOUTPUT));

@@ -1757,16 +1758,35 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
 	cp = pp->conn_in_get(ipvs, af, skb, &iph);
 
 	conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
-	if (conn_reuse_mode && !iph.fragoffs &&
-	    is_new_conn(skb, &iph) && cp &&
-	    ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
-	      unlikely(!atomic_read(&cp->dest->weight))) ||
-	     unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
-		if (!atomic_read(&cp->n_control))
-			ip_vs_conn_expire_now(cp);
-		__ip_vs_conn_put(cp);
-		cp = NULL;
+	if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
+		bool uses_ct = false, resched = false;
+
+		if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
+		    unlikely(!atomic_read(&cp->dest->weight))) {
+			resched = true;
+			uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+		} else if (is_new_conn_expected(cp, conn_reuse_mode)) {
+			uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+			if (!atomic_read(&cp->n_control)) {
+				resched = true;
+			} else {
+				/* Do not reschedule controlling connection
+				 * that uses conntrack while it is still
+				 * referenced by controlled connection(s).
+				 */
+				resched = !uses_ct;
+			}
+		}
+
+		if (resched) {
+			if (!atomic_read(&cp->n_control))
+				ip_vs_conn_expire_now(cp);
+			__ip_vs_conn_put(cp);
+			if (uses_ct)
+				return NF_DROP;
+			cp = NULL;
+		}
 	}
 
 	if (unlikely(!cp)) {
 		int v;
@@ -70,10 +70,10 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
 	const char *dptr;
 	int retc;
 
-	ip_vs_fill_iph_skb(p->af, skb, false, &iph);
+	retc = ip_vs_fill_iph_skb(p->af, skb, false, &iph);
 
 	/* Only useful with UDP */
-	if (iph.protocol != IPPROTO_UDP)
+	if (!retc || iph.protocol != IPPROTO_UDP)
 		return -EINVAL;
 	/* todo: IPv6 fragments:
 	 *	 I think this only should be done for the first fragment. /HS

@@ -88,7 +88,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
 	dptr = skb->data + dataoff;
 	datalen = skb->len - dataoff;
 
-	if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
+	if (get_callid(dptr, 0, datalen, &matchoff, &matchlen))
 		return -EINVAL;
 
 	/* N.B: pe_data is only set on success,
@@ -74,8 +74,7 @@ void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
 	spin_lock(lock);
 	while (unlikely(nf_conntrack_locks_all)) {
 		spin_unlock(lock);
-		spin_lock(&nf_conntrack_locks_all_lock);
-		spin_unlock(&nf_conntrack_locks_all_lock);
+		spin_unlock_wait(&nf_conntrack_locks_all_lock);
 		spin_lock(lock);
 	}
 }

@@ -121,8 +120,7 @@ static void nf_conntrack_all_lock(void)
 	nf_conntrack_locks_all = true;
 
 	for (i = 0; i < CONNTRACK_LOCKS; i++) {
-		spin_lock(&nf_conntrack_locks[i]);
-		spin_unlock(&nf_conntrack_locks[i]);
+		spin_unlock_wait(&nf_conntrack_locks[i]);
 	}
 }
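The cleanup above works because spin_unlock_wait() is semantically the lock()/unlock() pair it replaces, minus ever owning the lock: it spins until the lock is observed released, avoiding the write-side cache-line bouncing of a full acquire/release. A userspace analogue with C11 atomics, for illustration only:

#include <stdatomic.h>
#include <stdbool.h>

struct my_spinlock {
	atomic_bool locked;	/* true while held */
};

/* Same effect as lock(l); unlock(l); without ever taking the lock. */
static void my_spin_unlock_wait(struct my_spinlock *l)
{
	while (atomic_load_explicit(&l->locked, memory_order_acquire))
		;	/* spin until the current holder releases */
}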
@@ -242,6 +242,9 @@ nfacct_filter_alloc(const struct nlattr * const attr)
 	if (err < 0)
 		return ERR_PTR(err);
 
+	if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE])
+		return ERR_PTR(-EINVAL);
+
 	filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL);
 	if (!filter)
 		return ERR_PTR(-ENOMEM);
@@ -660,6 +660,9 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 	if (IS_ERR(match))
 		return ERR_PTR(-ENOENT);
 
+	if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO]))
+		return ERR_PTR(-EINVAL);
+
 	/* This is the first time we use this match, allocate operations */
 	nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
 	if (nft_match == NULL)

@@ -740,6 +743,9 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	if (IS_ERR(target))
 		return ERR_PTR(-ENOENT);
 
+	if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO]))
+		return ERR_PTR(-EINVAL);
+
 	/* This is the first time we use this target, allocate operations */
 	nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
 	if (nft_target == NULL)
@@ -659,6 +659,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
 	struct xt_table_info *info = NULL;
 	size_t sz = sizeof(*info) + size;
 
+	if (sz < sizeof(*info))
+		return NULL;
+
 	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
 	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
 		return NULL;
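The new check above works because sz = sizeof(*info) + size is computed in size_t: if a user-controlled size near SIZE_MAX wraps the sum on a 32-bit system, the result is necessarily smaller than the sizeof(*info) addend, so a later allocation would be undersized. A standalone demonstration in plain C (assuming the wrap actually occurs, e.g. 32-bit size_t or a sufficiently large value):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	size_t hdr = 64;		/* stands in for sizeof(*info) */
	size_t size = SIZE_MAX - 10;	/* attacker-supplied length */
	size_t sz = hdr + size;		/* wraps around: sz == 53 */

	if (sz < hdr)
		puts("overflow detected, allocation refused");
	return 0;
}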
@@ -6,7 +6,8 @@ config OPENVSWITCH
 	tristate "Open vSwitch"
 	depends on INET
 	depends on !NF_CONNTRACK || \
-		   (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6))
+		   (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
+				     (!NF_NAT || NF_NAT)))
 	select LIBCRC32C
 	select MPLS
 	select NET_MPLS_GSO
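For readers puzzled by the Kconfig expression above: "!FOO || FOO" is the usual tristate dependency trick. It evaluates to y when FOO is disabled or built-in, but only to m when FOO=m, so the added "(!NF_NAT || NF_NAT)" term prevents OPENVSWITCH=y when NF_NAT=m, exactly as the existing NF_DEFRAG_IPV6 term does.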
[One additional large diff in this merge is collapsed by the web viewer and not shown here.]
@@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a);
 
 #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
 			   OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
-			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED)
+			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \
+			   OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 #else
 #include <linux/errno.h>