Commit e06e7c61 authored by David S. Miller's avatar David S. Miller

[IPV4]: The scheduled removal of multipath cached routing support.

With help from Chris Wedgwood.
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 4eb6bf6b
......@@ -262,25 +262,6 @@ Who: Richard Purdie <rpurdie@rpsys.net>
---------------------------
What: Multipath cached routing support in ipv4
When: in 2.6.23
Why: Code was merged, then submitter immediately disappeared leaving
us with no maintainer and lots of bugs. The code should not have
	been merged in the first place, and many aspects of its
implementation are blocking more critical core networking
development. It's marked EXPERIMENTAL and no distribution
	enables it because it causes obscure crashes due to unfixable bugs
(interfaces don't return errors so memory allocation can't be
handled, calling contexts of these interfaces make handling
errors impossible too because they get called after we've
	totally committed to creating a route object, for example).
This problem has existed for years and no forward progress
has ever been made, and nobody steps up to try and salvage
this code, so we're going to finally just get rid of it.
Who: David S. Miller <davem@davemloft.net>
---------------------------
What: read_dev_chars(), read_conf_data{,_lpm}() (s390 common I/O layer)
When: December 2007
Why: These functions are a leftover from 2.4 times. They have several
......
......@@ -91,7 +91,6 @@ header-y += in6.h
header-y += in_route.h
header-y += ioctl.h
header-y += ipmi_msgdefs.h
header-y += ip_mp_alg.h
header-y += ipsec.h
header-y += ipx.h
header-y += irda.h
......
/* ip_mp_alg.h: IPV4 multipath algorithm support, user-visible values.
 *
 * Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com>
 * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
 */
#ifndef _LINUX_IP_MP_ALG_H
#define _LINUX_IP_MP_ALG_H

/* Identifiers of the cached-multipath selection algorithms.  These
 * values are visible to userspace (configured via the RTA_MP_ALGO
 * netlink attribute), so existing entries must never be renumbered.
 */
enum ip_mp_alg {
	IP_MP_ALG_NONE,		/* no cached-multipath algorithm selected */
	IP_MP_ALG_RR,		/* round robin */
	IP_MP_ALG_DRR,		/* device (interface) round robin */
	IP_MP_ALG_RANDOM,	/* unweighted random */
	IP_MP_ALG_WRANDOM,	/* weighted random */
	__IP_MP_ALG_MAX
};

/* Highest valid algorithm id. */
#define IP_MP_ALG_MAX (__IP_MP_ALG_MAX - 1)

#endif /* _LINUX_IP_MP_ALG_H */
......@@ -261,7 +261,7 @@ enum rtattr_type_t
RTA_FLOW,
RTA_CACHEINFO,
RTA_SESSION,
RTA_MP_ALGO,
RTA_MP_ALGO, /* no longer used */
RTA_TABLE,
__RTA_MAX
};
......
......@@ -47,7 +47,6 @@ struct dst_entry
#define DST_NOXFRM 2
#define DST_NOPOLICY 4
#define DST_NOHASH 8
#define DST_BALANCED 0x10
unsigned long expires;
unsigned short header_len; /* more space at head required */
......
......@@ -39,7 +39,6 @@ struct fib_config {
int fc_mx_len;
int fc_mp_len;
u32 fc_flow;
u32 fc_mp_alg;
u32 fc_nlflags;
struct nl_info fc_nlinfo;
};
......@@ -85,9 +84,6 @@ struct fib_info {
int fib_nhs;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int fib_power;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
u32 fib_mp_alg;
#endif
struct fib_nh fib_nh[0];
#define fib_dev fib_nh[0].nh_dev
......@@ -103,10 +99,6 @@ struct fib_result {
unsigned char nh_sel;
unsigned char type;
unsigned char scope;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
__be32 network;
__be32 netmask;
#endif
struct fib_info *fi;
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rule *r;
......@@ -145,14 +137,6 @@ struct fib_result_nl {
#define FIB_RES_DEV(res) (FIB_RES_NH(res).nh_dev)
#define FIB_RES_OIF(res) (FIB_RES_NH(res).nh_oif)
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
#define FIB_RES_NETWORK(res) ((res).network)
#define FIB_RES_NETMASK(res) ((res).netmask)
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
#define FIB_RES_NETWORK(res) (0)
#define FIB_RES_NETMASK(res) (0)
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
struct fib_table {
struct hlist_node tb_hlist;
u32 tb_id;
......
/* ip_mp_alg.h: IPV4 multipath algorithm support.
*
* Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com>
* Copyright (C) 2005 David S. Miller <davem@davemloft.net>
*/
#ifndef _NET_IP_MP_ALG_H
#define _NET_IP_MP_ALG_H
#include <linux/ip_mp_alg.h>
#include <net/flow.h>
#include <net/route.h>
struct fib_nh;
/* Operations a cached-multipath algorithm module provides.
 * mp_alg_select_route is mandatory (multipath_alg_register() rejects a
 * NULL hook, and callers invoke it without a per-hook NULL check); the
 * remaining hooks are optional and NULL-checked at every call site.
 */
struct ip_mp_alg_ops {
	/* Choose among the cached alternatives for flow @flp, starting
	 * from @rth; the selected route is returned through @rp. */
	void	(*mp_alg_select_route)(const struct flowi *flp,
				       struct rtable *rth, struct rtable **rp);
	/* Drop all per-algorithm cached state (route cache flush). */
	void	(*mp_alg_flush)(void);
	/* Record nexthop weight/prefix information for a destination. */
	void	(*mp_alg_set_nhinfo)(__be32 network, __be32 netmask,
				     unsigned char prefixlen,
				     const struct fib_nh *nh);
	/* Notification that cache entry @rth is being removed. */
	void	(*mp_alg_remove)(struct rtable *rth);
};
extern int multipath_alg_register(struct ip_mp_alg_ops *, enum ip_mp_alg);
extern void multipath_alg_unregister(struct ip_mp_alg_ops *, enum ip_mp_alg);
extern struct ip_mp_alg_ops *ip_mp_alg_table[];
/* Let the configured multipath algorithm pick among the cached
 * alternatives for @flp, starting at @rth.
 *
 * Returns 1 when an algorithm made the selection (result stored in
 * *rp); returns 0 — leaving *rp untouched — when no algorithm is
 * registered for @rth, the entry is not DST_BALANCED, or the kernel
 * was built without CONFIG_IP_ROUTE_MULTIPATH_CACHED.
 */
static inline int multipath_select_route(const struct flowi *flp,
					 struct rtable *rth,
					 struct rtable **rp)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct ip_mp_alg_ops *ops = ip_mp_alg_table[rth->rt_multipath_alg];

	/* mp_alg_select_route _MUST_ be implemented */
	if (ops && (rth->u.dst.flags & DST_BALANCED)) {
		ops->mp_alg_select_route(flp, rth, rp);
		return 1;
	}
#endif
	return 0;
}
/* Invoke the flush hook of every registered multipath algorithm so
 * each can drop its cached state.  No-op without
 * CONFIG_IP_ROUTE_MULTIPATH_CACHED.
 */
static inline void multipath_flush(void)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	int alg;

	for (alg = IP_MP_ALG_NONE; alg <= IP_MP_ALG_MAX; alg++) {
		struct ip_mp_alg_ops *ops = ip_mp_alg_table[alg];

		/* flush hook is optional */
		if (ops == NULL || ops->mp_alg_flush == NULL)
			continue;
		ops->mp_alg_flush();
	}
#endif
}
/* Forward nexthop weight/prefix information to the algorithm bound to
 * @rth, if that algorithm implements the optional hook.
 */
static inline void multipath_set_nhinfo(struct rtable *rth,
					__be32 network, __be32 netmask,
					unsigned char prefixlen,
					const struct fib_nh *nh)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct ip_mp_alg_ops *ops;

	ops = ip_mp_alg_table[rth->rt_multipath_alg];
	if (ops == NULL || ops->mp_alg_set_nhinfo == NULL)
		return;
	ops->mp_alg_set_nhinfo(network, netmask, prefixlen, nh);
#endif
}
/* Tell @rth's algorithm that the entry is leaving the routing cache.
 * Only DST_BALANCED entries participate in cached multipath, and the
 * remove hook is optional.
 */
static inline void multipath_remove(struct rtable *rth)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct ip_mp_alg_ops *ops;

	if (!(rth->u.dst.flags & DST_BALANCED))
		return;

	ops = ip_mp_alg_table[rth->rt_multipath_alg];
	if (ops && ops->mp_alg_remove)
		ops->mp_alg_remove(rth);
#endif
}
/* Two flow keys select the same multipath bundle iff destination,
 * source, output interface, mark, and the routing-relevant TOS bits
 * (IPTOS_RT_MASK | RTO_ONLINK) all agree.  Returns 1 on match, else 0.
 */
static inline int multipath_comparekeys(const struct flowi *flp1,
					const struct flowi *flp2)
{
	if (flp1->fl4_dst != flp2->fl4_dst)
		return 0;
	if (flp1->fl4_src != flp2->fl4_src)
		return 0;
	if (flp1->oif != flp2->oif || flp1->mark != flp2->mark)
		return 0;
	return ((flp1->fl4_tos ^ flp2->fl4_tos) &
		(IPTOS_RT_MASK | RTO_ONLINK)) == 0;
}
#endif /* _NET_IP_MP_ALG_H */
......@@ -62,7 +62,6 @@ struct rtable
unsigned rt_flags;
__u16 rt_type;
__u16 rt_multipath_alg;
__be32 rt_dst; /* Path destination */
__be32 rt_src; /* Path source */
......
......@@ -116,48 +116,6 @@ config IP_ROUTE_MULTIPATH
equal "cost" and chooses one of them in a non-deterministic fashion
if a matching packet arrives.
config IP_ROUTE_MULTIPATH_CACHED
bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
depends on IP_ROUTE_MULTIPATH
help
Normally, equal cost multipath routing is not supported by the
routing cache. If you say Y here, alternative routes are cached
and on cache lookup a route is chosen in a configurable fashion.
If unsure, say N.
config IP_ROUTE_MULTIPATH_RR
tristate "MULTIPATH: round robin algorithm"
depends on IP_ROUTE_MULTIPATH_CACHED
help
Multipath routes are chosen according to Round Robin
config IP_ROUTE_MULTIPATH_RANDOM
tristate "MULTIPATH: random algorithm"
depends on IP_ROUTE_MULTIPATH_CACHED
help
Multipath routes are chosen in a random fashion. Actually,
there is no weight for a route. The advantage of this policy
is that it is implemented stateless and therefore introduces only
a very small delay.
config IP_ROUTE_MULTIPATH_WRANDOM
tristate "MULTIPATH: weighted random algorithm"
depends on IP_ROUTE_MULTIPATH_CACHED
help
Multipath routes are chosen in a weighted random fashion.
The per route weights are the weights visible via ip route 2. As the
corresponding state management introduces some overhead routing delay
is increased.
config IP_ROUTE_MULTIPATH_DRR
tristate "MULTIPATH: interface round robin algorithm"
depends on IP_ROUTE_MULTIPATH_CACHED
help
Connections are distributed in a round robin fashion over the
available interfaces. This policy makes sense if the connections
should be primarily distributed on interfaces and not on routes.
config IP_ROUTE_VERBOSE
bool "IP: verbose route monitoring"
depends on IP_ADVANCED_ROUTER
......
......@@ -29,14 +29,9 @@ obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
obj-$(CONFIG_IP_VS) += ipvs/
obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
......
......@@ -453,7 +453,6 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
[RTA_PROTOINFO] = { .type = NLA_U32 },
[RTA_FLOW] = { .type = NLA_U32 },
[RTA_MP_ALGO] = { .type = NLA_U32 },
};
static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
......@@ -515,9 +514,6 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
case RTA_FLOW:
cfg->fc_flow = nla_get_u32(attr);
break;
case RTA_MP_ALGO:
cfg->fc_mp_alg = nla_get_u32(attr);
break;
case RTA_TABLE:
cfg->fc_table = nla_get_u32(attr);
break;
......
......@@ -42,7 +42,6 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/ip_mp_alg.h>
#include <net/netlink.h>
#include <net/nexthop.h>
......@@ -697,13 +696,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto err_inval;
}
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (cfg->fc_mp_alg) {
if (cfg->fc_mp_alg < IP_MP_ALG_NONE ||
cfg->fc_mp_alg > IP_MP_ALG_MAX)
goto err_inval;
}
#endif
err = -ENOBUFS;
if (fib_info_cnt >= fib_hash_size) {
......@@ -791,10 +783,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
#endif
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
fi->fib_mp_alg = cfg->fc_mp_alg;
#endif
if (fib_props[cfg->fc_type].error) {
if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
goto err_inval;
......@@ -940,10 +928,6 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
res->type = fa->fa_type;
res->scope = fa->fa_scope;
res->fi = fa->fa_info;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
res->netmask = mask;
res->network = zone & inet_make_mask(prefixlen);
#endif
atomic_inc(&res->fi->fib_clntref);
return 0;
}
......
/* multipath.c: IPV4 multipath algorithm support.
*
* Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com>
* Copyright (C) 2005 David S. Miller <davem@davemloft.net>
*/
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/spinlock.h>
#include <net/ip_mp_alg.h>
static DEFINE_SPINLOCK(alg_table_lock);
struct ip_mp_alg_ops *ip_mp_alg_table[IP_MP_ALG_MAX + 1];
/* Register @ops as the implementation of algorithm id @n.
 *
 * Returns 0 on success, -EINVAL for an out-of-range id or a missing
 * mandatory mp_alg_select_route hook, -EBUSY if the slot is taken.
 */
int multipath_alg_register(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
{
	struct ip_mp_alg_ops **slot;
	int err = 0;

	if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX ||
	    !ops->mp_alg_select_route)
		return -EINVAL;

	spin_lock(&alg_table_lock);
	slot = &ip_mp_alg_table[n];
	if (*slot)
		err = -EBUSY;
	else
		*slot = ops;
	spin_unlock(&alg_table_lock);

	return err;
}
EXPORT_SYMBOL(multipath_alg_register);
/* Remove @ops from slot @n if it is the registered owner, then wait
 * for in-flight lockless readers via synchronize_net() before
 * returning, so the caller may free @ops afterwards.
 */
void multipath_alg_unregister(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
{
	if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX)
		return;

	spin_lock(&alg_table_lock);
	if (ip_mp_alg_table[n] == ops)
		ip_mp_alg_table[n] = NULL;
	spin_unlock(&alg_table_lock);

	synchronize_net();
}
EXPORT_SYMBOL(multipath_alg_unregister);
/*
* Device round robin policy for multipath.
*
*
* Version: $Id: multipath_drr.c,v 1.1.2.1 2004/09/16 07:42:34 elueck Exp $
*
* Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/ip_mp_alg.h>
/* Per-output-device bookkeeping for device round robin. */
struct multipath_device {
	int		ifi;		/* interface index of device */
	atomic_t	usecount;	/* selections routed via this device */
	int		allocated;	/* non-zero when this slot is in use */
};

/* Fixed-size device table and its guard lock.  The selection fast
 * path reads it unlocked (see the "not SMP safe" note in
 * drr_select_route()); slot add/remove always takes state_lock.
 */
#define MULTIPATH_MAX_DEVICECANDIDATES 10

static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES];
static DEFINE_SPINLOCK(state_lock);
/* Return the index of a free slot in state[], or -1 when all
 * MULTIPATH_MAX_DEVICECANDIDATES entries are allocated.
 * Caller must hold state_lock if the result is used to claim the slot.
 *
 * Cleanup: use the conventional "static inline int" specifier order
 * instead of "static int inline" (same meaning, standard style).
 */
static inline int __multipath_findslot(void)
{
	int i;

	for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
		if (state[i].allocated == 0)
			return i;
	}
	return -1;
}
/* Return the state[] index of the allocated slot for interface
 * @ifindex, or -1 when the device is not in the table.
 *
 * Cleanup: use the conventional "static inline int" specifier order
 * instead of "static int inline" (same meaning, standard style).
 */
static inline int __multipath_finddev(int ifindex)
{
	int i;

	for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
		if (state[i].allocated != 0 &&
		    state[i].ifi == ifindex)
			return i;
	}
	return -1;
}
/* Netdevice notifier: when an interface goes down or unregisters,
 * release its slot in state[] so the index can be reused and stale
 * use counts do not skew future selections.
 */
static int drr_dev_event(struct notifier_block *this,
			 unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	int devidx;

	switch (event) {
	case NETDEV_UNREGISTER:
	case NETDEV_DOWN:
		/* find the device's slot and reset it under the lock */
		spin_lock_bh(&state_lock);

		devidx = __multipath_finddev(dev->ifindex);
		if (devidx != -1) {
			state[devidx].allocated = 0;
			state[devidx].ifi = 0;
			atomic_set(&state[devidx].usecount, 0);
		}

		spin_unlock_bh(&state_lock);
		break;
	}

	return NOTIFY_DONE;
}

static struct notifier_block drr_dev_notifier = {
	.notifier_call	= drr_dev_event,
};
/* Increment a per-device use counter.  If the counter reads <= 0
 * right after the increment (i.e. it wrapped past INT_MAX, or raced
 * with a concurrent reset), zero ALL counters under state_lock so the
 * relative balance between devices starts over from a clean slate.
 */
static void drr_safe_inc(atomic_t *usecount)
{
	int n;

	atomic_inc(usecount);

	n = atomic_read(usecount);
	if (n <= 0) {
		int i;

		/* overflow: restart the balancing from zero */
		spin_lock_bh(&state_lock);
		for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++)
			atomic_set(&state[i].usecount, 0);
		spin_unlock_bh(&state_lock);
	}
}
/* Device round robin selection: among the cached DST_BALANCED
 * alternatives matching @flp, prefer the route whose output device
 * currently has the lowest use count; fall back to @first when no
 * alternative matches.  Also refreshes GC data (lastuse/__use) on
 * every matching alternative so they stay in the cache.
 *
 * Fix: when the device table was full (__multipath_findslot()
 * returned -1) the old code executed "continue" while still holding
 * state_lock, so the next lock acquisition — on this CPU or any
 * other — would deadlock.  The lock is now released before
 * continuing.
 */
static void drr_select_route(const struct flowi *flp,
	struct rtable *first, struct rtable **rp)
{
	struct rtable *nh, *result, *cur_min;
	int min_usecount = -1;
	int devidx = -1;
	int cur_min_devidx = -1;

	/* 1. make sure all alt. nexthops have the same GC related data */
	/* 2. determine the new candidate to be returned */
	result = NULL;
	cur_min = NULL;
	for (nh = rcu_dereference(first); nh;
	     nh = rcu_dereference(nh->u.dst.rt_next)) {
		if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
		    multipath_comparekeys(&nh->fl, flp)) {
			int nh_ifidx = nh->u.dst.dev->ifindex;

			nh->u.dst.lastuse = jiffies;
			nh->u.dst.__use++;
			if (result != NULL)
				continue;

			/* search for the output interface */

			/* this is not SMP safe, only add/remove are
			 * SMP safe as wrong usecount updates have no big
			 * impact
			 */
			devidx = __multipath_finddev(nh_ifidx);
			if (devidx == -1) {
				/* add the interface to the array
				 * SMP safe
				 */
				spin_lock_bh(&state_lock);

				/* due to SMP: search again */
				devidx = __multipath_finddev(nh_ifidx);
				if (devidx == -1) {
					/* add entry for device */
					devidx = __multipath_findslot();
					if (devidx == -1) {
						/* unlikely but possible:
						 * table full — drop the
						 * lock before continuing
						 */
						spin_unlock_bh(&state_lock);
						continue;
					}
					state[devidx].allocated = 1;
					state[devidx].ifi = nh_ifidx;
					atomic_set(&state[devidx].usecount, 0);
					min_usecount = 0;
				}
				spin_unlock_bh(&state_lock);
			}

			if (min_usecount == 0) {
				/* if the device has not been used it is
				 * the primary target
				 */
				drr_safe_inc(&state[devidx].usecount);
				result = nh;
			} else {
				int count =
					atomic_read(&state[devidx].usecount);

				if (min_usecount == -1 ||
				    count < min_usecount) {
					cur_min = nh;
					cur_min_devidx = devidx;
					min_usecount = count;
				}
			}
		}
	}

	if (!result) {
		if (cur_min) {
			drr_safe_inc(&state[cur_min_devidx].usecount);
			result = cur_min;
		} else {
			result = first;
		}
	}

	*rp = result;
}
/* DRR implements only the mandatory selection hook. */
static struct ip_mp_alg_ops drr_ops = {
	.mp_alg_select_route	=	drr_select_route,
};

/* Module init: install the netdevice notifier first, then register
 * the algorithm; unwind the notifier if registration fails.
 */
static int __init drr_init(void)
{
	int err = register_netdevice_notifier(&drr_dev_notifier);

	if (err)
		return err;

	err = multipath_alg_register(&drr_ops, IP_MP_ALG_DRR);
	if (err)
		goto fail;

	return 0;

fail:
	unregister_netdevice_notifier(&drr_dev_notifier);
	return err;
}

/* Module exit: drop the notifier and unregister the algorithm
 * (multipath_alg_unregister() waits for readers via synchronize_net()).
 */
static void __exit drr_exit(void)
{
	unregister_netdevice_notifier(&drr_dev_notifier);
	multipath_alg_unregister(&drr_ops, IP_MP_ALG_DRR);
}

module_init(drr_init);
module_exit(drr_exit);
MODULE_LICENSE("GPL");
/*
* Random policy for multipath.
*
*
* Version: $Id: multipath_random.c,v 1.1.2.3 2004/09/21 08:42:11 elueck Exp $
*
* Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/random.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/ip_mp_alg.h>
#define MULTIPATH_MAX_CANDIDATES 40
/* Pick one of the cached DST_BALANCED alternatives matching @flp
 * uniformly at random; falls back to @first when at most one
 * alternative matches.  Refreshes GC data (lastuse) on every match so
 * the alternatives stay in the cache.
 */
static void random_select_route(const struct flowi *flp,
				struct rtable *first,
				struct rtable **rp)
{
	struct rtable *rt;		/* walks the hash chain */
	struct rtable *decision;
	unsigned char candidate_count = 0;

	/* count all candidate */
	for (rt = rcu_dereference(first); rt;
	     rt = rcu_dereference(rt->u.dst.rt_next)) {
		if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
		    multipath_comparekeys(&rt->fl, flp))
			++candidate_count;
	}

	/* choose a random candidate */
	decision = first;
	if (candidate_count > 1) {
		unsigned char i = 0;
		unsigned char candidate_no = (unsigned char)
			(random32() % candidate_count);

		/* find chosen candidate and adjust GC data for all candidates
		 * to ensure they stay in cache
		 */
		/* NOTE(review): unlike the counting pass above, this walk
		 * does not rcu_dereference() rt_next — confirm the chain
		 * cannot change between the two passes.
		 */
		for (rt = first; rt; rt = rt->u.dst.rt_next) {
			if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
			    multipath_comparekeys(&rt->fl, flp)) {
				rt->u.dst.lastuse = jiffies;

				if (i == candidate_no)
					decision = rt;

				if (i >= candidate_count)
					break;

				i++;
			}
		}
	}

	decision->u.dst.__use++;
	*rp = decision;
}
/* Random implements only the mandatory selection hook. */
static struct ip_mp_alg_ops random_ops = {
	.mp_alg_select_route	=	random_select_route,
};

/* Module init: register as the IP_MP_ALG_RANDOM implementation. */
static int __init random_init(void)
{
	return multipath_alg_register(&random_ops, IP_MP_ALG_RANDOM);
}

/* Module exit: unregister (waits for readers via synchronize_net()). */
static void __exit random_exit(void)
{
	multipath_alg_unregister(&random_ops, IP_MP_ALG_RANDOM);
}

module_init(random_init);
module_exit(random_exit);
MODULE_LICENSE("GPL");
/*
* Round robin policy for multipath.
*
*
* Version: $Id: multipath_rr.c,v 1.1.2.2 2004/09/16 07:42:34 elueck Exp $
*
* Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/ip_mp_alg.h>
/* Round robin selection: among the cached DST_BALANCED alternatives
 * matching @flp, return the one with the lowest use count; fall back
 * to @first when none matches.  Refreshes GC data (lastuse) on every
 * match so the alternatives stay in the cache.
 *
 * Cleanup: dropped the dead store "result = NULL" — the old code
 * initialized result before the loop and unconditionally overwrote it
 * right after.
 */
static void rr_select_route(const struct flowi *flp,
			    struct rtable *first, struct rtable **rp)
{
	struct rtable *nh, *result, *min_use_cand = NULL;
	int min_use = -1;

	/* 1. make sure all alt. nexthops have the same GC related data
	 * 2. determine the new candidate to be returned
	 */
	for (nh = rcu_dereference(first); nh;
	     nh = rcu_dereference(nh->u.dst.rt_next)) {
		if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
		    multipath_comparekeys(&nh->fl, flp)) {
			nh->u.dst.lastuse = jiffies;

			if (min_use == -1 || nh->u.dst.__use < min_use) {
				min_use = nh->u.dst.__use;
				min_use_cand = nh;
			}
		}
	}

	result = min_use_cand;
	if (!result)
		result = first;

	result->u.dst.__use++;
	*rp = result;
}
/* RR implements only the mandatory selection hook. */
static struct ip_mp_alg_ops rr_ops = {
	.mp_alg_select_route	=	rr_select_route,
};

/* Module init: register as the IP_MP_ALG_RR implementation. */
static int __init rr_init(void)
{
	return multipath_alg_register(&rr_ops, IP_MP_ALG_RR);
}

/* Module exit: unregister (waits for readers via synchronize_net()). */
static void __exit rr_exit(void)
{
	multipath_alg_unregister(&rr_ops, IP_MP_ALG_RR);
}

module_init(rr_init);
module_exit(rr_exit);
MODULE_LICENSE("GPL");
/*
* Weighted random policy for multipath.
*
*
* Version: $Id: multipath_wrandom.c,v 1.1.2.3 2004/09/22 07:51:40 elueck Exp $
*
* Authors: Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/random.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/ip_fib.h>
#include <net/ip_mp_alg.h>
/* Number of hash buckets for the per-route weight state. */
#define MULTIPATH_STATE_SIZE 15

/* One node of the temporary candidate list built per selection;
 * "power" holds the cumulative scaled weight up to and including this
 * route, so a random value maps to exactly one candidate.
 */
struct multipath_candidate {
	struct multipath_candidate	*next;	/* singly linked list */
	int				power;	/* cumulative scaled weight */
	struct rtable			*rt;	/* the candidate route */
};

/* Weight information for one destination reachable via a route. */
struct multipath_dest {
	struct list_head	list;

	const struct fib_nh	*nh_info;	/* nexthop carrying nh_weight */
	__be32			netmask;
	__be32			network;
	unsigned char		prefixlen;

	struct rcu_head		rcu;		/* for deferred freeing */
};

/* Hash bucket: RCU-protected route list plus its writer lock. */
struct multipath_bucket {
	struct list_head	head;
	spinlock_t		lock;
};

/* Per (gateway, output interface) state with its destination list. */
struct multipath_route {
	struct list_head	list;

	int			oif;
	__be32			gw;
	struct list_head	dests;

	struct rcu_head		rcu;		/* for deferred freeing */
};
/* state: primarily weight per route information */
static struct multipath_bucket state[MULTIPATH_STATE_SIZE];
/* Look up the configured weight for route @rt under flow @fl.
 * Returns the nexthop weight of the matching destination, or 1 when
 * no state is found so selection degrades gracefully to unweighted.
 * Lockless reader: walks the state lists under rcu_read_lock().
 */
static unsigned char __multipath_lookup_weight(const struct flowi *fl,
					       const struct rtable *rt)
{
	const int state_idx = rt->idev->dev->ifindex % MULTIPATH_STATE_SIZE;
	struct multipath_route *r;
	struct multipath_route *target_route = NULL;
	struct multipath_dest *d;
	int weight = 1;

	/* lookup the weight information for a certain route */
	rcu_read_lock();

	/* find state entry for gateway */
	list_for_each_entry_rcu(r, &state[state_idx].head, list) {
		if (r->gw == rt->rt_gateway &&
		    r->oif == rt->idev->dev->ifindex) {
			target_route = r;
			break;
		}
	}

	if (!target_route) {
		/* this should not happen... but we are prepared */
		/* NOTE(review): rt_gateway is __be32 printed with %u, so
		 * the logged value is byte-order dependent — confirm.
		 */
		printk( KERN_CRIT"%s: missing state for gateway: %u and " \
			"device %d\n", __FUNCTION__, rt->rt_gateway,
			rt->idev->dev->ifindex);
		goto out;
	}

	/* find state entry for destination */
	list_for_each_entry_rcu(d, &target_route->dests, list) {
		__be32 targetnetwork = fl->fl4_dst &
			inet_make_mask(d->prefixlen);

		if ((targetnetwork & d->netmask) == d->network) {
			weight = d->nh_info->nh_weight;
			goto out;
		}
	}

out:
	rcu_read_unlock();
	return weight;
}
/* Initialize every bucket of the weight-state hash table: an empty
 * RCU list plus its writer spinlock.
 */
static void wrandom_init_state(void)
{
	int idx = 0;

	while (idx < MULTIPATH_STATE_SIZE) {
		spin_lock_init(&state[idx].lock);
		INIT_LIST_HEAD(&state[idx].head);
		idx++;
	}
}
/* Weighted-random selection among the cached DST_BALANCED
 * alternatives for @flp; each candidate's nexthop weight (scaled by
 * 10000) defines its share of the random range.  Falls back to @first
 * when no alternative matches or memory is tight.
 *
 * Fixes over the previous version:
 *  - on kmalloc() failure the partially built candidate list was
 *    leaked and the function returned with *rp untouched, although
 *    multipath_select_route() reports the selection as made; we now
 *    free the list and fall back to @first.
 *  - with zero matching candidates, "random32() % power" divided by
 *    zero; selection is now skipped when power == 0.
 */
static void wrandom_select_route(const struct flowi *flp,
				 struct rtable *first,
				 struct rtable **rp)
{
	struct rtable *rt;
	struct rtable *decision = first;	/* fallback result */
	struct multipath_candidate *first_mpc = NULL;
	struct multipath_candidate *mpc, *last_mpc = NULL;
	int power = 0;
	int last_power;
	int selector;
	const size_t size_mpc = sizeof(struct multipath_candidate);

	/* collect all candidates and identify their weights */
	for (rt = rcu_dereference(first); rt;
	     rt = rcu_dereference(rt->u.dst.rt_next)) {
		if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
		    multipath_comparekeys(&rt->fl, flp)) {
			struct multipath_candidate *new_mpc =
				kmalloc(size_mpc, GFP_ATOMIC);

			if (!new_mpc)
				goto cleanup;	/* OOM: use the fallback */

			power += __multipath_lookup_weight(flp, rt) * 10000;

			new_mpc->power = power;
			new_mpc->rt = rt;
			new_mpc->next = NULL;

			if (!first_mpc)
				first_mpc = new_mpc;
			else
				last_mpc->next = new_mpc;

			last_mpc = new_mpc;
		}
	}

	if (power > 0) {
		/* choose a weighted random candidate and adjust GC data
		 * for all candidates so they stay in the cache
		 */
		selector = random32() % power;
		last_power = 0;

		for (mpc = first_mpc; mpc; mpc = mpc->next) {
			mpc->rt->u.dst.lastuse = jiffies;
			if (last_power <= selector && selector < mpc->power)
				decision = mpc->rt;
			last_power = mpc->power;
		}
	}

cleanup:
	/* release the temporary candidate list */
	for (mpc = first_mpc; mpc; ) {
		struct multipath_candidate *next = mpc->next;

		kfree(mpc);
		mpc = next;
	}

	decision->u.dst.__use++;
	*rp = decision;
}
/* Record weight information (nexthop @nh for @network/@netmask with
 * @prefixlen) so __multipath_lookup_weight() can find it later.
 * Creates the per-(gateway, oif) route entry and the per-destination
 * entry on demand, under the bucket's writer lock.
 *
 * Fix: both GFP_ATOMIC allocations were used without a NULL check, so
 * allocation failure dereferenced a NULL pointer.  We now simply bail
 * out — the lookup side already defaults the weight to 1 when state
 * is missing.  Also dropped the unnecessary casts of kmalloc().
 */
static void wrandom_set_nhinfo(__be32 network,
			       __be32 netmask,
			       unsigned char prefixlen,
			       const struct fib_nh *nh)
{
	const int state_idx = nh->nh_oif % MULTIPATH_STATE_SIZE;
	struct multipath_route *r, *target_route = NULL;
	struct multipath_dest *d, *target_dest = NULL;

	/* store the weight information for a certain route */
	spin_lock_bh(&state[state_idx].lock);

	/* find state entry for gateway or add one if necessary */
	list_for_each_entry_rcu(r, &state[state_idx].head, list) {
		if (r->gw == nh->nh_gw && r->oif == nh->nh_oif) {
			target_route = r;
			break;
		}
	}

	if (!target_route) {
		target_route = kmalloc(sizeof(*target_route), GFP_ATOMIC);
		if (!target_route)
			goto out_unlock;

		target_route->gw = nh->nh_gw;
		target_route->oif = nh->nh_oif;
		memset(&target_route->rcu, 0, sizeof(struct rcu_head));
		INIT_LIST_HEAD(&target_route->dests);

		list_add_rcu(&target_route->list, &state[state_idx].head);
	}

	/* find state entry for destination or add one if necessary */
	list_for_each_entry_rcu(d, &target_route->dests, list) {
		if (d->nh_info == nh) {
			target_dest = d;
			break;
		}
	}

	if (!target_dest) {
		target_dest = kmalloc(sizeof(*target_dest), GFP_ATOMIC);
		if (!target_dest)
			goto out_unlock;

		target_dest->nh_info = nh;
		target_dest->network = network;
		target_dest->netmask = netmask;
		target_dest->prefixlen = prefixlen;
		memset(&target_dest->rcu, 0, sizeof(struct rcu_head));

		list_add_rcu(&target_dest->list, &target_route->dests);
	}
	/* else: we already stored this info for another destination =>
	 * we are finished
	 */

out_unlock:
	spin_unlock_bh(&state[state_idx].lock);
}
/* RCU callback: release a multipath_route after the grace period. */
static void __multipath_free(struct rcu_head *head)
{
	kfree(container_of(head, struct multipath_route, rcu));
}
/* RCU callback: release a multipath_dest after the grace period. */
static void __multipath_free_dst(struct rcu_head *head)
{
	kfree(container_of(head, struct multipath_dest, rcu));
}
/* Throw away all cached weight state.  Entries are unlinked under the
 * bucket lock and their memory handed to call_rcu() for deferred
 * freeing, so concurrent lockless readers
 * (__multipath_lookup_weight()) remain safe.
 *
 * NOTE(review): both walks use list_for_each_entry_rcu() while
 * deleting the entry being visited; list_del_rcu() preserves ->next so
 * the traversal survives, but list_for_each_entry_safe() would state
 * the intent more clearly — confirm before changing.
 */
static void wrandom_flush(void)
{
	int i;

	/* deferred delete for all entries */
	for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
		struct multipath_route *r;

		spin_lock_bh(&state[i].lock);
		list_for_each_entry_rcu(r, &state[i].head, list) {
			struct multipath_dest *d;

			list_for_each_entry_rcu(d, &r->dests, list) {
				list_del_rcu(&d->list);
				call_rcu(&d->rcu,
					 __multipath_free_dst);
			}
			list_del_rcu(&r->list);
			call_rcu(&r->rcu,
				 __multipath_free);
		}
		spin_unlock_bh(&state[i].lock);
	}
}
/* wrandom implements every hook except the optional mp_alg_remove. */
static struct ip_mp_alg_ops wrandom_ops = {
	.mp_alg_select_route	=	wrandom_select_route,
	.mp_alg_flush		=	wrandom_flush,
	.mp_alg_set_nhinfo	=	wrandom_set_nhinfo,
};

/* Module init: prepare the weight-state table, then register as the
 * IP_MP_ALG_WRANDOM implementation.
 */
static int __init wrandom_init(void)
{
	wrandom_init_state();

	return multipath_alg_register(&wrandom_ops, IP_MP_ALG_WRANDOM);
}

/* Module exit: unregister (waits for readers via synchronize_net()). */
static void __exit wrandom_exit(void)
{
	multipath_alg_unregister(&wrandom_ops, IP_MP_ALG_WRANDOM);
}

module_init(wrandom_init);
module_exit(wrandom_exit);
MODULE_LICENSE("GPL");
......@@ -101,7 +101,6 @@
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
......@@ -495,13 +494,11 @@ static const struct file_operations rt_cpu_seq_fops = {
static __inline__ void rt_free(struct rtable *rt)
{
multipath_remove(rt);
call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
static __inline__ void rt_drop(struct rtable *rt)
{
multipath_remove(rt);
ip_rt_put(rt);
call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
......@@ -574,52 +571,6 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
(fl1->iif ^ fl2->iif)) == 0;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
struct rtable *expentry,
int *removed_count)
{
int passedexpired = 0;
struct rtable **nextstep = NULL;
struct rtable **rthp = chain_head;
struct rtable *rth;
if (removed_count)
*removed_count = 0;
while ((rth = *rthp) != NULL) {
if (rth == expentry)
passedexpired = 1;
if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
compare_keys(&(*rthp)->fl, &expentry->fl)) {
if (*rthp == expentry) {
*rthp = rth->u.dst.rt_next;
continue;
} else {
*rthp = rth->u.dst.rt_next;
rt_free(rth);
if (removed_count)
++(*removed_count);
}
} else {
if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
passedexpired && !nextstep)
nextstep = &rth->u.dst.rt_next;
rthp = &rth->u.dst.rt_next;
}
}
rt_free(expentry);
if (removed_count)
++(*removed_count);
return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
......@@ -658,22 +609,8 @@ static void rt_check_expire(unsigned long dummy)
}
/* Cleanup aged off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
/* remove all related balanced entries if necessary */
if (rth->u.dst.flags & DST_BALANCED) {
rthp = rt_remove_balanced_route(
&rt_hash_table[i].chain,
rth, NULL);
if (!rthp)
break;
} else {
*rthp = rth->u.dst.rt_next;
rt_free(rth);
}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
*rthp = rth->u.dst.rt_next;
rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
spin_unlock(rt_hash_lock_addr(i));
......@@ -721,9 +658,6 @@ void rt_cache_flush(int delay)
if (delay < 0)
delay = ip_rt_min_delay;
/* flush existing multipath state*/
multipath_flush();
spin_lock_bh(&rt_flush_lock);
if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
......@@ -842,30 +776,9 @@ static int rt_garbage_collect(void)
rthp = &rth->u.dst.rt_next;
continue;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
/* remove all related balanced entries
* if necessary
*/
if (rth->u.dst.flags & DST_BALANCED) {
int r;
rthp = rt_remove_balanced_route(
&rt_hash_table[k].chain,
rth,
&r);
goal -= r;
if (!rthp)
break;
} else {
*rthp = rth->u.dst.rt_next;
rt_free(rth);
goal--;
}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
*rthp = rth->u.dst.rt_next;
rt_free(rth);
goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
spin_unlock_bh(rt_hash_lock_addr(k));
if (goal <= 0)
......@@ -939,12 +852,7 @@ static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
spin_lock_bh(rt_hash_lock_addr(hash));
while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (!(rth->u.dst.flags & DST_BALANCED) &&
compare_keys(&rth->fl, &rt->fl)) {
#else
if (compare_keys(&rth->fl, &rt->fl)) {
#endif
/* Put it first */
*rthp = rth->u.dst.rt_next;
/*
......@@ -1774,10 +1682,6 @@ static inline int __mkroute_input(struct sk_buff *skb,
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (res->fi->fib_nhs > 1)
rth->u.dst.flags |= DST_BALANCED;
#endif
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
rth->u.dst.flags |= DST_NOPOLICY;
if (IN_DEV_CONF_GET(out_dev, NOXFRM))
......@@ -1812,11 +1716,11 @@ static inline int __mkroute_input(struct sk_buff *skb,
return err;
}
static inline int ip_mkroute_input_def(struct sk_buff *skb,
struct fib_result* res,
const struct flowi *fl,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
static inline int ip_mkroute_input(struct sk_buff *skb,
struct fib_result* res,
const struct flowi *fl,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
struct rtable* rth = NULL;
int err;
......@@ -1837,63 +1741,6 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
static inline int ip_mkroute_input(struct sk_buff *skb,
struct fib_result* res,
const struct flowi *fl,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
struct rtable* rth = NULL, *rtres;
unsigned char hop, hopcount;
int err = -EINVAL;
unsigned int hash;
if (res->fi)
hopcount = res->fi->fib_nhs;
else
hopcount = 1;
/* distinguish between multipath and singlepath */
if (hopcount < 2)
return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
saddr, tos);
/* add all alternatives to the routing cache */
for (hop = 0; hop < hopcount; hop++) {
res->nh_sel = hop;
/* put reference to previous result */
if (hop)
ip_rt_put(rtres);
/* create a routing cache entry */
err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
&rth);
if (err)
return err;
/* put it into the cache */
hash = rt_hash(daddr, saddr, fl->iif);
err = rt_intern_hash(hash, rth, &rtres);
if (err)
return err;
/* forward hop information to multipath impl. */
multipath_set_nhinfo(rth,
FIB_RES_NETWORK(*res),
FIB_RES_NETMASK(*res),
res->prefixlen,
&FIB_RES_NH(*res));
}
skb->dst = &rtres->u.dst;
return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
/*
* NOTE. We drop all the packets that has local source
* addresses, because every properly looped back packet
......@@ -2211,13 +2058,6 @@ static inline int __mkroute_output(struct rtable **result,
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (res->fi) {
rth->rt_multipath_alg = res->fi->fib_mp_alg;
if (res->fi->fib_nhs > 1)
rth->u.dst.flags |= DST_BALANCED;
}
#endif
if (IN_DEV_CONF_GET(in_dev, NOXFRM))
rth->u.dst.flags |= DST_NOXFRM;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
......@@ -2277,12 +2117,12 @@ static inline int __mkroute_output(struct rtable **result,
return err;
}
static inline int ip_mkroute_output_def(struct rtable **rp,
struct fib_result* res,
const struct flowi *fl,
const struct flowi *oldflp,
struct net_device *dev_out,
unsigned flags)
static inline int ip_mkroute_output(struct rtable **rp,
struct fib_result* res,
const struct flowi *fl,
const struct flowi *oldflp,
struct net_device *dev_out,
unsigned flags)
{
struct rtable *rth = NULL;
int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
......@@ -2295,68 +2135,6 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
return err;
}
static inline int ip_mkroute_output(struct rtable** rp,
struct fib_result* res,
const struct flowi *fl,
const struct flowi *oldflp,
struct net_device *dev_out,
unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
unsigned char hop;
unsigned hash;
int err = -EINVAL;
struct rtable *rth = NULL;
if (res->fi && res->fi->fib_nhs > 1) {
unsigned char hopcount = res->fi->fib_nhs;
for (hop = 0; hop < hopcount; hop++) {
struct net_device *dev2nexthop;
res->nh_sel = hop;
/* hold a work reference to the output device */
dev2nexthop = FIB_RES_DEV(*res);
dev_hold(dev2nexthop);
/* put reference to previous result */
if (hop)
ip_rt_put(*rp);
err = __mkroute_output(&rth, res, fl, oldflp,
dev2nexthop, flags);
if (err != 0)
goto cleanup;
hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
oldflp->oif);
err = rt_intern_hash(hash, rth, rp);
/* forward hop information to multipath impl. */
multipath_set_nhinfo(rth,
FIB_RES_NETWORK(*res),
FIB_RES_NETMASK(*res),
res->prefixlen,
&FIB_RES_NH(*res));
cleanup:
/* release work reference to output device */
dev_put(dev2nexthop);
if (err != 0)
return err;
}
return err;
} else {
return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
flags);
}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
/*
* Major route resolver routine.
*/
......@@ -2570,17 +2348,6 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
rth->fl.mark == flp->mark &&
!((rth->fl.fl4_tos ^ flp->fl4_tos) &
(IPTOS_RT_MASK | RTO_ONLINK))) {
/* check for multipath routes and choose one if
* necessary
*/
if (multipath_select_route(flp, rth, rp)) {
dst_hold(&(*rp)->u.dst);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
return 0;
}
rth->u.dst.lastuse = jiffies;
dst_hold(&rth->u.dst);
rth->u.dst.__use++;
......@@ -2728,10 +2495,6 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
#ifdef CONFIG_NET_CLS_ROUTE
if (rt->u.dst.tclassid)
NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
#endif
if (rt->fl.iif)
NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment