Commit 2894d353 authored by Jakub Kicinski

Merge branch 'net-introduce-rps_default_mask'

Paolo Abeni says:

====================
net: introduce rps_default_mask

Real-time setups try hard to ensure proper isolation between time
critical applications and e.g. network processing performed by the
network stack in softirq and RPS is used to move the softirq
activity away from the isolated core.

If the network configuration is dynamic, with netns and devices
routinely created at run-time, enforcing the correct RPS setting
on each newly created device, without allowing any transient bad
configuration, becomes complex.

Additionally, when multi-queue devices are involved, configuring rps
in user-space on each queue easily becomes very expensive, e.g.
some setups use veths with per cpu queues.

This series tries to address the above, introducing a new
sysctl knob: rps_default_mask. The new sysctl entry allows
configuring a netns-wide RPS mask, to be enforced from receive
queue creation time without any further per-device configuration
required.

Additionally, a simple self-test is introduced to check the
rps_default_mask behavior.
====================

Link: https://lore.kernel.org/r/cover.1675789134.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 8697a258 c12e0d5f
...@@ -215,6 +215,12 @@ rmem_max ...@@ -215,6 +215,12 @@ rmem_max
The maximum receive socket buffer size in bytes. The maximum receive socket buffer size in bytes.
rps_default_mask
----------------
The default RPS CPU mask used on newly created network devices. An empty
mask means RPS disabled by default.
tstamp_allow_data tstamp_allow_data
----------------- -----------------
Allow processes to receive tx timestamps looped together with the original Allow processes to receive tx timestamps looped together with the original
......
...@@ -223,6 +223,7 @@ struct net_device_core_stats { ...@@ -223,6 +223,7 @@ struct net_device_core_stats {
#include <linux/static_key.h> #include <linux/static_key.h>
extern struct static_key_false rps_needed; extern struct static_key_false rps_needed;
extern struct static_key_false rfs_needed; extern struct static_key_false rfs_needed;
extern struct cpumask rps_default_mask;
#endif #endif
struct neighbour; struct neighbour;
......
...@@ -9,6 +9,7 @@ struct net_device; ...@@ -9,6 +9,7 @@ struct net_device;
struct netdev_bpf; struct netdev_bpf;
struct netdev_phys_item_id; struct netdev_phys_item_id;
struct netlink_ext_ack; struct netlink_ext_ack;
struct cpumask;
/* Random bits of netdevice that don't need to be exposed */ /* Random bits of netdevice that don't need to be exposed */
#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */
...@@ -134,4 +135,5 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, ...@@ -134,4 +135,5 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
WRITE_ONCE(dev->gro_ipv4_max_size, size); WRITE_ONCE(dev->gro_ipv4_max_size, size);
} }
int rps_cpumask_housekeeping(struct cpumask *mask);
#endif #endif
...@@ -831,42 +831,18 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) ...@@ -831,42 +831,18 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
return len < PAGE_SIZE ? len : -EINVAL; return len < PAGE_SIZE ? len : -EINVAL;
} }
static ssize_t store_rps_map(struct netdev_rx_queue *queue, static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue,
const char *buf, size_t len) cpumask_var_t mask)
{ {
struct rps_map *old_map, *map;
cpumask_var_t mask;
int err, cpu, i;
static DEFINE_MUTEX(rps_map_mutex); static DEFINE_MUTEX(rps_map_mutex);
struct rps_map *old_map, *map;
if (!capable(CAP_NET_ADMIN)) int cpu, i;
return -EPERM;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
if (err) {
free_cpumask_var(mask);
return err;
}
if (!cpumask_empty(mask)) {
cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
if (cpumask_empty(mask)) {
free_cpumask_var(mask);
return -EINVAL;
}
}
map = kzalloc(max_t(unsigned int, map = kzalloc(max_t(unsigned int,
RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
GFP_KERNEL); GFP_KERNEL);
if (!map) { if (!map)
free_cpumask_var(mask);
return -ENOMEM; return -ENOMEM;
}
i = 0; i = 0;
for_each_cpu_and(cpu, mask, cpu_online_mask) for_each_cpu_and(cpu, mask, cpu_online_mask)
...@@ -893,9 +869,45 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, ...@@ -893,9 +869,45 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
if (old_map) if (old_map)
kfree_rcu(old_map, rcu); kfree_rcu(old_map, rcu);
return 0;
}
/*
 * Restrict an RPS CPU mask, in place, to the CPUs present in both the
 * HK_TYPE_DOMAIN and HK_TYPE_WQ housekeeping masks.
 *
 * @mask: mask to filter; modified in place.
 *
 * An empty input mask is accepted untouched. Returns 0 on success, or
 * -EINVAL when a non-empty input has no CPU left after filtering (in which
 * case @mask is left empty).
 */
int rps_cpumask_housekeeping(struct cpumask *mask)
{
	/* Nothing to filter: an empty mask is always valid. */
	if (cpumask_empty(mask))
		return 0;

	cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
	cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));

	/* Every requested CPU was filtered out: reject the request. */
	return cpumask_empty(mask) ? -EINVAL : 0;
}
/*
 * Parse a CPU bitmap from @buf, filter it through
 * rps_cpumask_housekeeping() and hand it to
 * netdev_rx_queue_set_rps_mask() for @queue.
 *
 * @queue: RX queue whose RPS mask is being updated.
 * @buf:   user-supplied text holding the bitmap.
 * @len:   number of bytes in @buf.
 *
 * Requires CAP_NET_ADMIN. Returns @len on success, -EPERM without the
 * capability, -ENOMEM if the temporary mask cannot be allocated, or the
 * error reported by the parsing, filtering or installation step.
 */
static ssize_t store_rps_map(struct netdev_rx_queue *queue,
			     const char *buf, size_t len)
{
	cpumask_var_t mask;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
	if (err)
		goto out;

	err = rps_cpumask_housekeeping(mask);
	if (err)
		goto out;

	err = netdev_rx_queue_set_rps_mask(queue, mask);

out:
	/* Single release point: the temporary mask is freed exactly once. */
	free_cpumask_var(mask);

	/* GNU "?:" - propagate a non-zero error, otherwise report @len consumed. */
	return err ? : len;
}
static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
...@@ -1071,6 +1083,13 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) ...@@ -1071,6 +1083,13 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
goto err; goto err;
} }
#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL)
if (!cpumask_empty(&rps_default_mask)) {
error = netdev_rx_queue_set_rps_mask(queue, &rps_default_mask);
if (error)
goto err;
}
#endif
kobject_uevent(kobj, KOBJ_ADD); kobject_uevent(kobj, KOBJ_ADD);
return error; return error;
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/sched/isolation.h>
#include <net/ip.h> #include <net/ip.h>
#include <net/sock.h> #include <net/sock.h>
...@@ -45,7 +46,59 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); ...@@ -45,7 +46,59 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
int sysctl_devconf_inherit_init_net __read_mostly; int sysctl_devconf_inherit_init_net __read_mostly;
EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
/*
 * Format @mask with "%*pb" into a sysctl read buffer.
 *
 * @buffer: destination for the formatted text.
 * @lenp:   in: room available in @buffer; out: number of bytes produced.
 * @ppos:   file position; advanced by the number of bytes produced.
 * @mask:   CPU mask to print.
 *
 * Data is produced only for a read at offset 0 with a non-zero length;
 * otherwise *@lenp is set to 0. The text is built in a 128-byte scratch
 * buffer, so at most sizeof(kbuf) - 1 bytes are ever formatted.
 * NOTE(review): a mask whose text does not fit in the scratch buffer is
 * truncated by scnprintf() - confirm this is acceptable for large CPU counts.
 */
static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
			 struct cpumask *mask)
{
	char kbuf[128];
	int written;

	if (*ppos || !*lenp)
		goto empty;

	/* Never format more than the scratch buffer or the caller can take. */
	written = min(sizeof(kbuf) - 1, *lenp);
	written = scnprintf(kbuf, written, "%*pb", cpumask_pr_args(mask));
	if (!written)
		goto empty;

	/* Terminate the line when the caller still has room for it. */
	if (written < *lenp)
		kbuf[written++] = '\n';

	memcpy(buffer, kbuf, written);
	*lenp = written;
	*ppos += written;
	return;

empty:
	*lenp = 0;
}
#endif
#ifdef CONFIG_RPS #ifdef CONFIG_RPS
struct cpumask rps_default_mask;
/*
 * proc handler for the "rps_default_mask" sysctl entry.
 *
 * On write, parses @buffer into the global rps_default_mask and then filters
 * it through rps_cpumask_housekeeping(). On read, prints the current mask
 * via dump_cpumask(). Both directions run with the RTNL lock held.
 *
 * Returns 0 on success or the error from parsing / housekeeping filtering.
 *
 * NOTE(review): the new value is parsed directly into rps_default_mask, so a
 * write rejected by rps_cpumask_housekeeping() leaves the global mask empty
 * instead of preserving the previous value; what a failed cpumask_parse()
 * leaves behind is not visible here. Consider parsing into a temporary mask.
 */
static int rps_default_mask_sysctl(struct ctl_table *table, int write,
				   void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = 0;

	rtnl_lock();

	if (!write) {
		dump_cpumask(buffer, lenp, ppos, &rps_default_mask);
	} else {
		ret = cpumask_parse(buffer, &rps_default_mask);
		/* Only filter a mask that parsed cleanly. */
		if (!ret)
			ret = rps_cpumask_housekeeping(&rps_default_mask);
	}

	rtnl_unlock();

	return ret;
}
static int rps_sock_flow_sysctl(struct ctl_table *table, int write, static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos) void *buffer, size_t *lenp, loff_t *ppos)
{ {
...@@ -155,13 +208,6 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, ...@@ -155,13 +208,6 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
write_unlock: write_unlock:
mutex_unlock(&flow_limit_update_mutex); mutex_unlock(&flow_limit_update_mutex);
} else { } else {
char kbuf[128];
if (*ppos || !*lenp) {
*lenp = 0;
goto done;
}
cpumask_clear(mask); cpumask_clear(mask);
rcu_read_lock(); rcu_read_lock();
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
...@@ -171,17 +217,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, ...@@ -171,17 +217,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
} }
rcu_read_unlock(); rcu_read_unlock();
len = min(sizeof(kbuf) - 1, *lenp); dump_cpumask(buffer, lenp, ppos, mask);
len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
if (!len) {
*lenp = 0;
goto done;
}
if (len < *lenp)
kbuf[len++] = '\n';
memcpy(buffer, kbuf, len);
*lenp = len;
*ppos += len;
} }
done: done:
...@@ -472,6 +508,11 @@ static struct ctl_table net_core_table[] = { ...@@ -472,6 +508,11 @@ static struct ctl_table net_core_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = rps_sock_flow_sysctl .proc_handler = rps_sock_flow_sysctl
}, },
{
.procname = "rps_default_mask",
.mode = 0644,
.proc_handler = rps_default_mask_sysctl
},
#endif #endif
#ifdef CONFIG_NET_FLOW_LIMIT #ifdef CONFIG_NET_FLOW_LIMIT
{ {
...@@ -675,6 +716,10 @@ static __net_initdata struct pernet_operations sysctl_core_ops = { ...@@ -675,6 +716,10 @@ static __net_initdata struct pernet_operations sysctl_core_ops = {
static __init int sysctl_core_init(void) static __init int sysctl_core_init(void)
{ {
#if IS_ENABLED(CONFIG_RPS)
cpumask_copy(&rps_default_mask, cpu_none_mask);
#endif
register_net_sysctl(&init_net, "net/core", net_core_table); register_net_sysctl(&init_net, "net/core", net_core_table);
return register_pernet_subsys(&sysctl_core_ops); return register_pernet_subsys(&sysctl_core_ops);
} }
......
...@@ -46,6 +46,7 @@ TEST_PROGS += stress_reuseport_listen.sh ...@@ -46,6 +46,7 @@ TEST_PROGS += stress_reuseport_listen.sh
TEST_PROGS += l2_tos_ttl_inherit.sh TEST_PROGS += l2_tos_ttl_inherit.sh
TEST_PROGS += bind_bhash.sh TEST_PROGS += bind_bhash.sh
TEST_PROGS += ip_local_port_range.sh TEST_PROGS += ip_local_port_range.sh
TEST_PROGS += rps_default_mask.sh
TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh
TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh
TEST_GEN_FILES = socket nettest TEST_GEN_FILES = socket nettest
......
...@@ -3,6 +3,9 @@ CONFIG_NET_NS=y ...@@ -3,6 +3,9 @@ CONFIG_NET_NS=y
CONFIG_BPF_SYSCALL=y CONFIG_BPF_SYSCALL=y
CONFIG_TEST_BPF=m CONFIG_TEST_BPF=m
CONFIG_NUMA=y CONFIG_NUMA=y
CONFIG_RPS=y
CONFIG_SYSFS=y
CONFIG_PROC_SYSCTL=y
CONFIG_NET_VRF=y CONFIG_NET_VRF=y
CONFIG_NET_L3_MASTER_DEV=y CONFIG_NET_L3_MASTER_DEV=y
CONFIG_IPV6=y CONFIG_IPV6=y
......
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
#
# Check that /proc/sys/net/core/rps_default_mask is picked up by the RX
# queues of devices created after the sysctl is written.

readonly ksft_skip=4
readonly cpus=$(nproc)
ret=0

# The test is skipped (exit status $ksft_skip) unless more than two CPUs
# are reported by nproc.
if ! [ $cpus -gt 2 ]; then
	exit $ksft_skip
fi

# Saved so cleanup() can put the original sysctl value back.
readonly INITIAL_RPS_DEFAULT_MASK=$(cat /proc/sys/net/core/rps_default_mask)
# Randomised namespace name for this run.
readonly NETNS="ns-$(mktemp -u XXXXXX)"
# Create the test network namespace and bring its loopback device up.
setup()
{
	ip netns add "$NETNS"
	ip -netns "$NETNS" link set lo up
}
# Write the saved rps_default_mask value back and delete the test namespace.
cleanup()
{
	echo "$INITIAL_RPS_DEFAULT_MASK" > /proc/sys/net/core/rps_default_mask
	ip netns del "$NETNS"
}
# Print a hex CPU mask in canonical form: ',' word separators removed and
# leading zeros stripped, with an all-zero mask reduced to a single "0".
rps_mask_canon() {
	printf '%s\n' "$1" | sed -e 's/,//g' -e 's/^0*//' -e 's/^$/0/'
}

# Compare the rps_cpus value of a device's first RX queue with an expectation.
#
# $1: message describing the check (printed left-aligned in 60 columns)
# $2: device name, looked up inside $NETNS
# $3: expected mask, in hex
#
# Prints "[ ok ]" or "[fail] ..." and sets ret=1 on mismatch or when the
# attribute could not be read.
chk_rps() {
	local rps_mask expected_rps_mask=$3
	local dev_name=$2
	local msg=$1

	rps_mask=$(ip netns exec "$NETNS" cat "/sys/class/net/$dev_name/queues/rx-0/rps_cpus")
	printf "%-60s" "$msg"

	# The kernel prints the mask as hex text, split into comma-separated
	# words once there are more than 32 CPUs, so an integer "-eq" test is
	# not usable. Compare canonicalised strings instead, and treat an
	# empty read as a failure rather than as mask 0.
	if [ -n "$rps_mask" ] &&
	   [ "$(rps_mask_canon "$rps_mask")" = "$(rps_mask_canon "$expected_rps_mask")" ]; then
		echo "[ ok ]"
	else
		echo "[fail] expected $expected_rps_mask found $rps_mask"
		ret=1
	fi
}
# Run cleanup on every exit path from here on.
trap cleanup EXIT
# Case 1: with an empty default mask, lo in a freshly created namespace
# must report an empty rps_cpus.
echo 0 > /proc/sys/net/core/rps_default_mask
setup
chk_rps "empty rps_default_mask" lo 0
# Drop the namespace so the next case starts from a newly created one.
cleanup
# Case 2: a non-empty default mask must show up on lo of a new namespace.
echo 1 > /proc/sys/net/core/rps_default_mask
setup
chk_rps "non zero rps_default_mask" lo 1
# Case 3: changing the sysctl afterwards must leave the already existing
# lo device at its previous mask.
echo 3 > /proc/sys/net/core/rps_default_mask
chk_rps "changing rps_default_mask dont affect existing netns" lo 1
# Case 4: a veth pair created after the change must carry the new mask on
# both ends.
ip -n $NETNS link add type veth
ip -n $NETNS link set dev veth0 up
ip -n $NETNS link set dev veth1 up
chk_rps "changing rps_default_mask affect newly created devices" veth0 3
chk_rps "changing rps_default_mask affect newly created devices[II]" veth1 3
# ret is 0 unless one of the chk_rps calls above failed.
exit $ret
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment