Commit 4e144d3a authored by David S. Miller

Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next

Pablo Neira Ayuso says:

====================
The following batch contains Netfilter/IPVS updates for net-next,
they are:

* Enforce netlink attribute policy in several nfnetlink subsystems,
  from Daniel Borkmann.

* Use xt_socket to match from the third packet onwards (to perform
  simplistic socket-based stateful filtering), from Eric Dumazet.

* Avoid large timeouts for TCP flows that are picked up mid-stream,
  from Florian Westphal.

* Exclude IPVS from struct net when IPVS is disabled, and remove an
  unnecessarily included header file, from JunweiZhang.

* Release SCTP connections immediately under load, to mimic current
  TCP behaviour, from Julian Anastasov.

* Replace and enhance SCTP state machine, from Julian Anastasov.

* Add tweak to reduce sync traffic in the presence of persistence,
  also from Julian Anastasov.

* Add a tweak so that the IPVS SH scheduler does not reject connections
  directed to an unavailable server but chooses a new one instead, from
  Alexander Frolkin.

* Add support for sloppy TCP and SCTP modes, which create state
  information on any packet, not only on initial handshake packets,
  from Alexander Frolkin.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 008aebde 496e4ae7
......@@ -181,6 +181,19 @@ snat_reroute - BOOLEAN
always be the same as the original route so it is an optimisation
to disable snat_reroute and avoid the recalculation.
sync_persist_mode - INTEGER
default 0
Controls the synchronisation of connections when using persistence
0: All types of connections are synchronised
1: Attempt to reduce the synchronisation traffic depending on
the connection type. For persistent services, synchronisation is
avoided for normal connections and performed only for persistence
templates. In that case, TCP and SCTP may need the sloppy_tcp and
sloppy_sctp flags enabled on the backup servers. For non-persistent
services such optimisation is not applied, and mode 0 is assumed.
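A minimal userspace sketch of the setup this entry suggests, assuming
the standard /proc/sys/net/ipv4/vs/ sysctl paths (mode 1 belongs on the
sync master, the sloppy flags on the backup servers):

#include <stdio.h>

/* tiny helper for this sketch only */
static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* on the sync master: sync persistence templates only */
	write_sysctl("/proc/sys/net/ipv4/vs/sync_persist_mode", "1");
	/* on each backup: create TCP/SCTP state from any packet */
	write_sysctl("/proc/sys/net/ipv4/vs/sloppy_tcp", "1");
	write_sysctl("/proc/sys/net/ipv4/vs/sloppy_sctp", "1");
	return 0;
}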
sync_version - INTEGER
default 1
......
......@@ -197,31 +197,6 @@ ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, struct ip_vs_iphdr *iphdr)
}
}
/* This function is a faster version of ip_vs_fill_iph_skb().
* Where we only populate {s,d}addr (and avoid calling ipv6_find_hdr()).
* This is used by the some of the ip_vs_*_schedule() functions.
* (Mostly done to avoid ABI breakage of external schedulers)
*/
static inline void
ip_vs_fill_iph_addr_only(int af, const struct sk_buff *skb,
struct ip_vs_iphdr *iphdr)
{
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
const struct ipv6hdr *iph =
(struct ipv6hdr *)skb_network_header(skb);
iphdr->saddr.in6 = iph->saddr;
iphdr->daddr.in6 = iph->daddr;
} else
#endif
{
const struct iphdr *iph =
(struct iphdr *)skb_network_header(skb);
iphdr->saddr.ip = iph->saddr;
iphdr->daddr.ip = iph->daddr;
}
}
static inline void ip_vs_addr_copy(int af, union nf_inet_addr *dst,
const union nf_inet_addr *src)
{
......@@ -405,17 +380,18 @@ enum {
*/
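/* The renamed states below largely track the standard SCTP association
 * state machine (RFC 4960, section 4) - COOKIE-WAIT, COOKIE-ECHOED,
 * ESTABLISHED, SHUTDOWN-SENT, SHUTDOWN-RECEIVED, SHUTDOWN-ACK-SENT,
 * CLOSED - replacing the old client/server-suffixed names.
 */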
enum ip_vs_sctp_states {
IP_VS_SCTP_S_NONE,
IP_VS_SCTP_S_INIT_CLI,
IP_VS_SCTP_S_INIT_SER,
IP_VS_SCTP_S_INIT_ACK_CLI,
IP_VS_SCTP_S_INIT_ACK_SER,
IP_VS_SCTP_S_ECHO_CLI,
IP_VS_SCTP_S_ECHO_SER,
IP_VS_SCTP_S_INIT1,
IP_VS_SCTP_S_INIT,
IP_VS_SCTP_S_COOKIE_SENT,
IP_VS_SCTP_S_COOKIE_REPLIED,
IP_VS_SCTP_S_COOKIE_WAIT,
IP_VS_SCTP_S_COOKIE,
IP_VS_SCTP_S_COOKIE_ECHOED,
IP_VS_SCTP_S_ESTABLISHED,
IP_VS_SCTP_S_SHUT_CLI,
IP_VS_SCTP_S_SHUT_SER,
IP_VS_SCTP_S_SHUT_ACK_CLI,
IP_VS_SCTP_S_SHUT_ACK_SER,
IP_VS_SCTP_S_SHUTDOWN_SENT,
IP_VS_SCTP_S_SHUTDOWN_RECEIVED,
IP_VS_SCTP_S_SHUTDOWN_ACK_SENT,
IP_VS_SCTP_S_REJECTED,
IP_VS_SCTP_S_CLOSED,
IP_VS_SCTP_S_LAST
};
......@@ -814,7 +790,8 @@ struct ip_vs_scheduler {
/* selecting a server from the given service */
struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc,
const struct sk_buff *skb);
const struct sk_buff *skb,
struct ip_vs_iphdr *iph);
};
/* The persistence engine object */
......@@ -998,10 +975,13 @@ struct netns_ipvs {
int sysctl_snat_reroute;
int sysctl_sync_ver;
int sysctl_sync_ports;
int sysctl_sync_persist_mode;
unsigned long sysctl_sync_qlen_max;
int sysctl_sync_sock_size;
int sysctl_cache_bypass;
int sysctl_expire_nodest_conn;
int sysctl_sloppy_tcp;
int sysctl_sloppy_sctp;
int sysctl_expire_quiescent_template;
int sysctl_sync_threshold[2];
unsigned int sysctl_sync_refresh_period;
......@@ -1044,6 +1024,8 @@ struct netns_ipvs {
#define DEFAULT_SYNC_THRESHOLD 3
#define DEFAULT_SYNC_PERIOD 50
#define DEFAULT_SYNC_VER 1
#define DEFAULT_SLOPPY_TCP 0
#define DEFAULT_SLOPPY_SCTP 0
#define DEFAULT_SYNC_REFRESH_PERIOD (0U * HZ)
#define DEFAULT_SYNC_RETRIES 0
#define IPVS_SYNC_WAKEUP_RATE 8
......@@ -1080,11 +1062,26 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
return ipvs->sysctl_sync_ver;
}
static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs)
{
return ipvs->sysctl_sloppy_tcp;
}
static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs)
{
return ipvs->sysctl_sloppy_sctp;
}
static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
{
return ACCESS_ONCE(ipvs->sysctl_sync_ports);
}
static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs)
{
return ipvs->sysctl_sync_persist_mode;
}
static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs)
{
return ipvs->sysctl_sync_qlen_max;
......@@ -1133,11 +1130,26 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
return DEFAULT_SYNC_VER;
}
static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs)
{
return DEFAULT_SLOPPY_TCP;
}
static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs)
{
return DEFAULT_SLOPPY_SCTP;
}
static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
{
return 1;
}
static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs)
{
return 0;
}
static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs)
{
return IPVS_SYNC_QLEN_MAX;
......
......@@ -115,7 +115,9 @@ struct net {
#ifdef CONFIG_XFRM
struct netns_xfrm xfrm;
#endif
#if IS_ENABLED(CONFIG_IP_VS)
struct netns_ipvs *ipvs;
#endif
struct sock *diag_nlsk;
atomic_t rt_genid;
atomic_t fnhe_genid;
......
......@@ -20,6 +20,12 @@
#define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */
#define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */
#define IP_VS_SVC_F_ONEPACKET 0x0004 /* one-packet scheduling */
#define IP_VS_SVC_F_SCHED1 0x0008 /* scheduler flag 1 */
#define IP_VS_SVC_F_SCHED2 0x0010 /* scheduler flag 2 */
#define IP_VS_SVC_F_SCHED3 0x0020 /* scheduler flag 3 */
#define IP_VS_SVC_F_SCHED_SH_FALLBACK IP_VS_SVC_F_SCHED1 /* SH fallback */
#define IP_VS_SVC_F_SCHED_SH_PORT IP_VS_SVC_F_SCHED2 /* SH use port */
/*
* Destination Server Flags
......
......@@ -105,5 +105,7 @@ enum nfqnl_attr_config {
#define NFQA_SKB_CSUMNOTREADY (1 << 0)
/* packet is GSO (i.e., exceeds device mtu) */
#define NFQA_SKB_GSO (1 << 1)
/* csum not validated (incoming device doesn't support hw checksum, etc.) */
#define NFQA_SKB_CSUM_NOTVERIFIED (1 << 2)
#endif /* _NFNETLINK_QUEUE_H */
......@@ -5,10 +5,17 @@
enum {
XT_SOCKET_TRANSPARENT = 1 << 0,
XT_SOCKET_NOWILDCARD = 1 << 1,
};
struct xt_socket_mtinfo1 {
__u8 flags;
};
#define XT_SOCKET_FLAGS_V1 XT_SOCKET_TRANSPARENT
struct xt_socket_mtinfo2 {
__u8 flags;
};
#define XT_SOCKET_FLAGS_V2 (XT_SOCKET_TRANSPARENT | XT_SOCKET_NOWILDCARD)
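/* Revision 2 adds XT_SOCKET_NOWILDCARD: when set, sockets bound to
 * INADDR_ANY are matched as well instead of being skipped (see the
 * wildcard handling in xt_socket.c).
 */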
#endif /* _XT_SOCKET_H */
......@@ -3,7 +3,6 @@
#include "../fs/xfs/xfs_sysctl.h"
#include <linux/sunrpc/debug.h>
#include <linux/string.h>
#include <net/ip_vs.h>
#include <linux/syscalls.h>
#include <linux/namei.h>
#include <linux/mount.h>
......
......@@ -1231,6 +1231,18 @@ void ip_vs_random_dropentry(struct net *net)
default:
continue;
}
} else if (cp->protocol == IPPROTO_SCTP) {
switch (cp->state) {
case IP_VS_SCTP_S_INIT1:
case IP_VS_SCTP_S_INIT:
break;
case IP_VS_SCTP_S_ESTABLISHED:
if (todrop_entry(cp))
break;
continue;
default:
continue;
}
} else {
if (!todrop_entry(cp))
continue;
......
......@@ -305,7 +305,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* return *ignored=0 i.e. ICMP and NF_DROP
*/
sched = rcu_dereference(svc->scheduler);
dest = sched->schedule(svc, skb);
dest = sched->schedule(svc, skb, iph);
if (!dest) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
kfree(param.pe_data);
......@@ -452,7 +452,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
}
sched = rcu_dereference(svc->scheduler);
dest = sched->schedule(svc, skb);
dest = sched->schedule(svc, skb, iph);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
......
......@@ -1714,6 +1714,12 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = &proc_do_sync_ports,
},
{
.procname = "sync_persist_mode",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sync_qlen_max",
.maxlen = sizeof(unsigned long),
......@@ -1738,6 +1744,18 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sloppy_tcp",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sloppy_sctp",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "expire_quiescent_template",
.maxlen = sizeof(int),
......@@ -3717,12 +3735,15 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
tbl[idx++].data = &ipvs->sysctl_sync_ver;
ipvs->sysctl_sync_ports = 1;
tbl[idx++].data = &ipvs->sysctl_sync_ports;
tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
ipvs->sysctl_sync_sock_size = 0;
tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
tbl[idx++].data = &ipvs->sysctl_cache_bypass;
tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
......
......@@ -214,18 +214,16 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
* Destination hashing scheduling
*/
static struct ip_vs_dest *
ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest;
struct ip_vs_dh_state *s;
struct ip_vs_iphdr iph;
ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
s = (struct ip_vs_dh_state *) svc->sched_data;
dest = ip_vs_dh_get(svc->af, s, &iph.daddr);
dest = ip_vs_dh_get(svc->af, s, &iph->daddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
......@@ -235,7 +233,7 @@ ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n",
IP_VS_DBG_ADDR(svc->af, &iph.daddr),
IP_VS_DBG_ADDR(svc->af, &iph->daddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port));
......
......@@ -487,19 +487,17 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
* Locality-Based (weighted) Least-Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
struct ip_vs_iphdr iph;
struct ip_vs_dest *dest = NULL;
struct ip_vs_lblc_entry *en;
ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
/* First look in our cache */
en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);
if (en) {
/* We only hold a read lock, but this is atomic */
en->lastuse = jiffies;
......@@ -529,12 +527,12 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* If we fail to create a cache entry, we'll just use the valid dest */
spin_lock_bh(&svc->sched_lock);
if (!tbl->dead)
ip_vs_lblc_new(tbl, &iph.daddr, dest);
ip_vs_lblc_new(tbl, &iph->daddr, dest);
spin_unlock_bh(&svc->sched_lock);
out:
IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
IP_VS_DBG_ADDR(svc->af, &iph.daddr),
IP_VS_DBG_ADDR(svc->af, &iph->daddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
return dest;
......
......@@ -655,19 +655,17 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
* Locality-Based (weighted) Least-Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
struct ip_vs_iphdr iph;
struct ip_vs_dest *dest;
struct ip_vs_lblcr_entry *en;
ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
/* First look in our cache */
en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr);
if (en) {
en->lastuse = jiffies;
......@@ -718,12 +716,12 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* If we fail to create a cache entry, we'll just use the valid dest */
spin_lock_bh(&svc->sched_lock);
if (!tbl->dead)
ip_vs_lblcr_new(tbl, &iph.daddr, dest);
ip_vs_lblcr_new(tbl, &iph->daddr, dest);
spin_unlock_bh(&svc->sched_lock);
out:
IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
IP_VS_DBG_ADDR(svc->af, &iph.daddr),
IP_VS_DBG_ADDR(svc->af, &iph->daddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
return dest;
......
......@@ -26,7 +26,8 @@
* Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least = NULL;
unsigned int loh = 0, doh;
......
......@@ -55,7 +55,8 @@ ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least = NULL;
unsigned int loh = 0, doh;
......
......@@ -39,6 +39,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
struct net *net;
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
struct netns_ipvs *ipvs;
th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
if (th == NULL) {
......@@ -46,14 +47,15 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
return 0;
}
net = skb_net(skb);
ipvs = net_ipvs(net);
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
rcu_read_lock();
if (th->syn &&
if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
(svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
&iph->daddr, th->dest))) {
int ignored;
if (ip_vs_todrop(net_ipvs(net))) {
if (ip_vs_todrop(ipvs)) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
......@@ -401,7 +403,7 @@ static struct tcp_states_t tcp_states [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
/* OUTPUT */
......@@ -415,7 +417,7 @@ static struct tcp_states_t tcp_states [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
......@@ -424,7 +426,7 @@ static struct tcp_states_t tcp_states_dos [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
/* OUTPUT */
......@@ -438,7 +440,7 @@ static struct tcp_states_t tcp_states_dos [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
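/* In all four tables, an ACK arriving in sNO now yields sES instead of
 * sCL, so connections picked up mid-stream with sloppy_tcp enter the
 * ESTABLISHED state from their first observed ACK.
 */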
......
......@@ -55,7 +55,8 @@ static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
* Round-Robin Scheduling
*/
static struct ip_vs_dest *
ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct list_head *p;
struct ip_vs_dest *dest, *last;
......
......@@ -59,7 +59,8 @@ ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least;
unsigned int loh, doh;
......
......@@ -48,6 +48,10 @@
#include <net/ip_vs.h>
#include <net/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
/*
* IPVS SH bucket
......@@ -71,10 +75,19 @@ struct ip_vs_sh_state {
struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE];
};
/* Helper function to determine if server is unavailable */
static inline bool is_unavailable(struct ip_vs_dest *dest)
{
return atomic_read(&dest->weight) <= 0 ||
dest->flags & IP_VS_DEST_F_OVERLOAD;
}
/*
* Returns hash value for IPVS SH entry
*/
static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
static inline unsigned int
ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
__be16 port, unsigned int offset)
{
__be32 addr_fold = addr->ip;
......@@ -83,7 +96,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
IP_VS_SH_TAB_MASK;
}
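For reference, a standalone sketch of the bucket computation above; the
8-bit table size is an assumption matching the scheduler's default, and
2654435761 is the usual 32-bit golden-ratio multiplicative-hash constant:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define IP_VS_SH_TAB_BITS 8	/* assumed default */
#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)

/* same formula as ip_vs_sh_hashkey() above, IPv4 case only */
static unsigned int sh_hashkey(uint32_t addr_fold, uint16_t port,
			       unsigned int offset)
{
	return (offset + (ntohs(port) + ntohl(addr_fold)) * 2654435761UL) &
	       IP_VS_SH_TAB_MASK;
}

int main(void)
{
	uint32_t saddr = htonl(0xc0a80001);	/* 192.168.0.1 */
	uint16_t sport = htons(5060);
	unsigned int offset;

	/* ip_vs_sh_get_fallback() probes successive offsets this way */
	for (offset = 0; offset < 4; offset++)
		printf("offset %u -> bucket %u\n", offset,
		       sh_hashkey(saddr, sport, offset));
	return 0;
}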
......@@ -91,12 +105,42 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
const union nf_inet_addr *addr, __be16 port)
{
return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
return (!dest || is_unavailable(dest)) ? NULL : dest;
}
/* As ip_vs_sh_get, but with fallback if selected server is unavailable */
static inline struct ip_vs_dest *
ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
const union nf_inet_addr *addr, __be16 port)
{
unsigned int offset;
unsigned int hash;
struct ip_vs_dest *dest;
for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
hash = ip_vs_sh_hashkey(svc->af, addr, port, offset);
dest = rcu_dereference(s->buckets[hash].dest);
if (!dest)
break;
if (is_unavailable(dest))
IP_VS_DBG_BUF(6, "SH: selected unavailable server "
"%s:%d (offset %d)",
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port), offset);
else
return dest;
}
return NULL;
}
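/* The probe walks successive buckets, skipping unavailable servers,
 * and gives up either after IP_VS_SH_TAB_SIZE tries or at the first
 * empty bucket, so the fallback walk stays bounded.
 */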
/*
* Assign all the hash buckets of the specified table with the service.
*/
......@@ -213,13 +257,33 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
}
/*
* If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
* consider that the server is overloaded here.
*/
static inline int is_overloaded(struct ip_vs_dest *dest)
/* Helper function to get port number */
static inline __be16
ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
{
return dest->flags & IP_VS_DEST_F_OVERLOAD;
__be16 port;
struct tcphdr _tcph, *th;
struct udphdr _udph, *uh;
sctp_sctphdr_t _sctph, *sh;
switch (iph->protocol) {
case IPPROTO_TCP:
th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
port = th->source;
break;
case IPPROTO_UDP:
uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
port = uh->source;
break;
case IPPROTO_SCTP:
sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
port = sh->source;
break;
default:
port = 0;
}
return port;
}
......@@ -227,28 +291,32 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
* Source Hashing scheduling
*/
static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest;
struct ip_vs_sh_state *s;
struct ip_vs_iphdr iph;
ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
__be16 port = 0;
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
port = ip_vs_sh_get_port(skb, iph);
s = (struct ip_vs_sh_state *) svc->sched_data;
dest = ip_vs_sh_get(svc->af, s, &iph.saddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
else
dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
if (!dest) {
ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
IP_VS_DBG_ADDR(svc->af, &iph.saddr),
IP_VS_DBG_ADDR(svc->af, &iph->saddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port));
......
......@@ -425,6 +425,16 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
return sb;
}
/* Check if connection is controlled by persistence */
static inline bool in_persistence(struct ip_vs_conn *cp)
{
for (cp = cp->control; cp; cp = cp->control) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
return true;
}
return false;
}
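/* Following the cp->control chain reaches the persistence template for
 * controlled connections; the loop also covers longer chains, e.g. an
 * FTP data connection controlled by its command connection.
 */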
/* Check if conn should be synced.
* pkts: conn packets, use sysctl_sync_threshold to avoid packet check
* - (1) sync_refresh_period: reduce sync rate. Additionally, retry
......@@ -447,6 +457,8 @@ static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
/* Check if we sync in current state */
if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
force = 0;
else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
return 0;
else if (likely(cp->protocol == IPPROTO_TCP)) {
if (!((1 << cp->state) &
((1 << IP_VS_TCP_S_ESTABLISHED) |
......@@ -461,9 +473,10 @@ static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
} else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
if (!((1 << cp->state) &
((1 << IP_VS_SCTP_S_ESTABLISHED) |
(1 << IP_VS_SCTP_S_CLOSED) |
(1 << IP_VS_SCTP_S_SHUT_ACK_CLI) |
(1 << IP_VS_SCTP_S_SHUT_ACK_SER))))
(1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
(1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
(1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
(1 << IP_VS_SCTP_S_CLOSED))))
return 0;
force = cp->state != cp->old_state;
if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
......
......@@ -31,7 +31,8 @@
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least;
unsigned int loh, doh;
......
......@@ -162,7 +162,8 @@ static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
* Weighted Round-Robin Scheduling
*/
static struct ip_vs_dest *
ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *last, *stop = NULL;
struct ip_vs_wrr_mark *mark = svc->sched_data;
......
......@@ -828,7 +828,9 @@ ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple)
struct nf_conntrack_l3proto *l3proto;
int ret = 0;
nla_parse_nested(tb, CTA_IP_MAX, attr, NULL);
ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL);
if (ret < 0)
return ret;
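/* Checking the nla_parse_nested() return value here and in the parsers
 * below is the "enforce policy" part of this batch: malformed nested
 * attributes now abort the request instead of being silently accepted.
 */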
rcu_read_lock();
l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
......@@ -895,7 +897,9 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[],
memset(tuple, 0, sizeof(*tuple));
nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy);
err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy);
if (err < 0)
return err;
if (!tb[CTA_TUPLE_IP])
return -EINVAL;
......@@ -946,9 +950,12 @@ static inline int
ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
struct nlattr **helpinfo)
{
int err;
struct nlattr *tb[CTA_HELP_MAX+1];
nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy);
err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy);
if (err < 0)
return err;
if (!tb[CTA_HELP_NAME])
return -EINVAL;
......@@ -1431,7 +1438,9 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]
struct nf_conntrack_l4proto *l4proto;
int err = 0;
nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy);
err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy);
if (err < 0)
return err;
rcu_read_lock();
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
......@@ -1452,9 +1461,12 @@ static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = {
static inline int
change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr)
{
int err;
struct nlattr *cda[CTA_NAT_SEQ_MAX+1];
nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy);
err = nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy);
if (err < 0)
return err;
if (!cda[CTA_NAT_SEQ_CORRECTION_POS])
return -EINVAL;
......@@ -2115,7 +2127,9 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
struct nlattr *cda[CTA_MAX+1];
int ret;
nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy);
ret = nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy);
if (ret < 0)
return ret;
spin_lock_bh(&nf_conntrack_lock);
ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct);
......@@ -2710,7 +2724,9 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr,
struct nf_conntrack_tuple nat_tuple = {};
int err;
nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy);
err = nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy);
if (err < 0)
return err;
if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE])
return -EINVAL;
......
......@@ -1043,6 +1043,12 @@ static int tcp_packet(struct nf_conn *ct,
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_ACCEPT;
}
/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
* pickup with loose=1. Avoid large ESTABLISHED timeout.
*/
if (new_state == TCP_CONNTRACK_ESTABLISHED &&
timeout > timeouts[TCP_CONNTRACK_UNACK])
timeout = timeouts[TCP_CONNTRACK_UNACK];
} else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
&& (old_state == TCP_CONNTRACK_SYN_RECV
|| old_state == TCP_CONNTRACK_ESTABLISHED)
......
......@@ -67,9 +67,12 @@ static int
nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple,
const struct nlattr *attr)
{
int err;
struct nlattr *tb[NFCTH_TUPLE_MAX+1];
nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol);
err = nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol);
if (err < 0)
return err;
if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM])
return -EINVAL;
......@@ -121,9 +124,12 @@ static int
nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
const struct nlattr *attr)
{
int err;
struct nlattr *tb[NFCTH_POLICY_MAX+1];
nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol);
err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol);
if (err < 0)
return err;
if (!tb[NFCTH_POLICY_NAME] ||
!tb[NFCTH_POLICY_EXPECT_MAX] ||
......@@ -153,8 +159,10 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
struct nf_conntrack_expect_policy *expect_policy;
struct nlattr *tb[NFCTH_POLICY_SET_MAX+1];
nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
nfnl_cthelper_expect_policy_set);
ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
nfnl_cthelper_expect_policy_set);
if (ret < 0)
return ret;
if (!tb[NFCTH_POLICY_SET_NUM])
return -EINVAL;
......
......@@ -59,8 +59,10 @@ ctnl_timeout_parse_policy(struct ctnl_timeout *timeout,
if (likely(l4proto->ctnl_timeout.nlattr_to_obj)) {
struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1];
nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
attr, l4proto->ctnl_timeout.nla_policy);
ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
attr, l4proto->ctnl_timeout.nla_policy);
if (ret < 0)
return ret;
ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net,
&timeout->data);
......
......@@ -280,12 +280,17 @@ nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
skb_shinfo(to)->nr_frags = j;
}
static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet)
static int
nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
bool csum_verify)
{
__u32 flags = 0;
if (packet->ip_summed == CHECKSUM_PARTIAL)
flags = NFQA_SKB_CSUMNOTREADY;
else if (csum_verify)
flags = NFQA_SKB_CSUM_NOTVERIFIED;
if (skb_is_gso(packet))
flags |= NFQA_SKB_GSO;
......@@ -310,6 +315,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
struct net_device *outdev;
struct nf_conn *ct = NULL;
enum ip_conntrack_info uninitialized_var(ctinfo);
bool csum_verify;
size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
......@@ -327,6 +333,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
if (entskb->tstamp.tv64)
size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
if (entry->hook <= NF_INET_FORWARD ||
(entry->hook == NF_INET_POST_ROUTING && entskb->sk == NULL))
csum_verify = !skb_csum_unnecessary(entskb);
else
csum_verify = false;
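/* Only packets from the input/forward paths, or packets reaching
 * POST_ROUTING without a local socket, can carry a checksum the stack
 * has not verified; locally generated skbs are trusted.
 */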
outdev = entry->outdev;
switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) {
......@@ -476,7 +488,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
goto nla_put_failure;
if (nfqnl_put_packet_info(skb, entskb))
if (nfqnl_put_packet_info(skb, entskb, csum_verify))
goto nla_put_failure;
if (data_len) {
......
......@@ -163,8 +163,11 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
bool wildcard;
bool transparent = true;
/* Ignore sockets listening on INADDR_ANY */
wildcard = (sk->sk_state != TCP_TIME_WAIT &&
/* Ignore sockets listening on INADDR_ANY,
* unless XT_SOCKET_NOWILDCARD is set
*/
wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
sk->sk_state != TCP_TIME_WAIT &&
inet_sk(sk)->inet_rcv_saddr == 0);
/* Ignore non-transparent sockets,
......@@ -197,7 +200,7 @@ socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
}
static bool
socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
{
return socket_match(skb, par, par->matchinfo);
}
......@@ -259,7 +262,7 @@ extract_icmp6_fields(const struct sk_buff *skb,
}
static bool
socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
{
struct ipv6hdr *iph = ipv6_hdr(skb);
struct udphdr _hdr, *hp = NULL;
......@@ -302,8 +305,11 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
bool wildcard;
bool transparent = true;
/* Ignore sockets listening on INADDR_ANY */
wildcard = (sk->sk_state != TCP_TIME_WAIT &&
/* Ignore sockets listening on INADDR_ANY
* unless XT_SOCKET_NOWILDCARD is set
*/
wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
sk->sk_state != TCP_TIME_WAIT &&
ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
/* Ignore non-transparent sockets,
......@@ -331,6 +337,28 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
}
#endif
static int socket_mt_v1_check(const struct xt_mtchk_param *par)
{
const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
if (info->flags & ~XT_SOCKET_FLAGS_V1) {
pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1);
return -EINVAL;
}
return 0;
}
static int socket_mt_v2_check(const struct xt_mtchk_param *par)
{
const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo;
if (info->flags & ~XT_SOCKET_FLAGS_V2) {
pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2);
return -EINVAL;
}
return 0;
}
static struct xt_match socket_mt_reg[] __read_mostly = {
{
.name = "socket",
......@@ -345,7 +373,8 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 1,
.family = NFPROTO_IPV4,
.match = socket_mt4_v1,
.match = socket_mt4_v1_v2,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
......@@ -356,7 +385,32 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 1,
.family = NFPROTO_IPV6,
.match = socket_mt6_v1,
.match = socket_mt6_v1_v2,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
#endif
{
.name = "socket",
.revision = 2,
.family = NFPROTO_IPV4,
.match = socket_mt4_v1_v2,
.checkentry = socket_mt_v2_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
#ifdef XT_SOCKET_HAVE_IPV6
{
.name = "socket",
.revision = 2,
.family = NFPROTO_IPV6,
.match = socket_mt6_v1_v2,
.checkentry = socket_mt_v2_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
......