Commit cc74ca30 authored by Jakub Kicinski

Merge branch 'sched-cpumask-improve-on-cpumask_local_spread-locality'

Yury Norov says:

====================
sched: cpumask: improve on cpumask_local_spread() locality

cpumask_local_spread() currently checks the local node for the presence of
the i'th CPU and, if it finds nothing, falls back to a flat search among
all non-local CPUs. We can do better by walking CPUs hop by NUMA hop.
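
As a minimal sketch of the pattern (illustrative only: `node`, `pick[]` and
`nr_wanted` are made-up names, and the mlx5 hunk below does essentially the
same thing), CPUs can be walked hop by hop with the for_each_numa_hop_mask()
iterator introduced in this series, visiting each CPU once in order of
increasing NUMA distance:

  /* Illustrative sketch: pick nr_wanted CPUs, nearest NUMA hops first. */
  const struct cpumask *prev = cpu_none_mask;
  const struct cpumask *mask;
  int cpu, i = 0;

  rcu_read_lock();			/* hop masks are RCU-protected */
  for_each_numa_hop_mask(mask, node) {	/* cumulative masks, closest hop first */
  	/* only CPUs added by this hop, i.e. not already seen at the previous one */
  	for_each_cpu_andnot(cpu, mask, prev) {
  		pick[i] = cpu;
  		if (++i == nr_wanted)
  			goto done;
  	}
  	prev = mask;
  }
  done:
  rcu_read_unlock();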

This has significant performance implications on NUMA machines, for example
when using NUMA-aware allocated memory together with NUMA-aware IRQ
affinity hints.

Performance tests from patch 8 of this series for mellanox network
driver show:

  TCP multi-stream, using 16 iperf3 instances pinned to 16 cores (with aRFS on).
  Active cores: 64,65,72,73,80,81,88,89,96,97,104,105,112,113,120,121

  +-------------------------+-----------+------------------+------------------+
  |                         | BW (Gbps) | TX side CPU util | RX side CPU util |
  +-------------------------+-----------+------------------+------------------+
  | Baseline                | 52.3      | 6.4 %            | 17.9 %           |
  +-------------------------+-----------+------------------+------------------+
  | Applied on TX side only | 52.6      | 5.2 %            | 18.5 %           |
  +-------------------------+-----------+------------------+------------------+
  | Applied on RX side only | 94.9      | 11.9 %           | 27.2 %           |
  +-------------------------+-----------+------------------+------------------+
  | Applied on both sides   | 95.1      | 8.4 %            | 27.3 %           |
  +-------------------------+-----------+------------------+------------------+

  The RX-side bottleneck is relieved and the test reaches line rate
  (~1.8x speedup), with ~30% less CPU utilization on the TX side.
====================

Link: https://lore.kernel.org/r/20230121042436.2661843-1-yury.norov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 383d9f87 2ac4980c
@@ -817,9 +817,12 @@ static void comp_irqs_release(struct mlx5_core_dev *dev)
static int comp_irqs_request(struct mlx5_core_dev *dev)
{
struct mlx5_eq_table *table = dev->priv.eq_table;
const struct cpumask *prev = cpu_none_mask;
const struct cpumask *mask;
int ncomp_eqs = table->num_comp_eqs;
u16 *cpus;
int ret;
int cpu;
int i;
ncomp_eqs = table->num_comp_eqs;
@@ -838,8 +841,19 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
ret = -ENOMEM;
goto free_irqs;
}
for (i = 0; i < ncomp_eqs; i++)
cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
i = 0;
rcu_read_lock();
for_each_numa_hop_mask(mask, dev->priv.numa_node) {
for_each_cpu_andnot(cpu, mask, prev) {
cpus[i] = cpu;
if (++i == ncomp_eqs)
goto spread_done;
}
prev = mask;
}
spread_done:
rcu_read_unlock();
ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
kfree(cpus);
if (ret < 0)
......
@@ -391,6 +391,26 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
nr_cpumask_bits, cpumask_check(cpu));
}
/**
* cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd.
* @srcp1: the cpumask pointer
* @srcp2: the cpumask pointer
* @srcp3: the cpumask pointer
* @cpu: the N'th cpu to find, starting from 0
*
* Returns >= nr_cpu_ids if no such cpu exists.
*/
static __always_inline
unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
const struct cpumask *srcp2,
const struct cpumask *srcp3)
{
return find_nth_and_andnot_bit(cpumask_bits(srcp1),
cpumask_bits(srcp2),
cpumask_bits(srcp3),
nr_cpumask_bits, cpumask_check(cpu));
}
#define CPU_BITS_NONE \
{ \
[0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \
......
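
For illustration only (hypothetical masks, not part of the patch):
cpumask_nth_and_andnot() returns the N'th CPU that is set in the first two
masks and clear in the third, which is the shape of the per-hop lookup
sched_numa_find_nth_cpu() performs further down, where it needs the N'th
online CPU added by the current hop but not present at the previous one:

	/*
	 * Hypothetical masks:
	 *   online = {0,1,2,3}   e.g. cpu_online_mask
	 *   hop1   = {0,1,2,3}   CPUs within one hop of the node
	 *   hop0   = {0,1}       CPUs of the node itself
	 */
	cpumask_nth_and_andnot(0, online, hop1, hop0);	/* -> 2: 0th cpu in online & hop1, not in hop0 */
	cpumask_nth_and_andnot(1, online, hop1, hop0);	/* -> 3 */
	cpumask_nth_and_andnot(2, online, hop1, hop0);	/* -> >= nr_cpu_ids: no such cpu */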
@@ -22,6 +22,9 @@ unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long
unsigned long size, unsigned long n);
unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
unsigned long size, unsigned long n);
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
const unsigned long *addr3, unsigned long size,
unsigned long n);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size);
extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
@@ -255,6 +258,36 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon
return __find_nth_andnot_bit(addr1, addr2, size, n);
}
/**
* find_nth_and_andnot_bit - find N'th set bit in 2 memory regions,
* excluding those set in 3rd region
* @addr1: The 1st address to start the search at
* @addr2: The 2nd address to start the search at
* @addr3: The 3rd address to start the search at
* @size: The maximum number of bits to search
* @n: The index of the set bit whose position is wanted, counting from 0
*
* Returns the bit number of the N'th set bit.
* If no such bit exists, returns @size.
*/
static __always_inline
unsigned long find_nth_and_andnot_bit(const unsigned long *addr1,
const unsigned long *addr2,
const unsigned long *addr3,
unsigned long size, unsigned long n)
{
if (n >= size)
return size;
if (small_const_nbits(size)) {
unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0);
return val ? fns(val, n) : size;
}
return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n);
}
#ifndef find_first_and_bit
/**
* find_first_and_bit - find the first set bit in both memory regions
......
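
A quick illustration of the new bitmap primitive (hypothetical single-word
values, not from the patch): it returns the N'th bit that is set in both of
the first two operands and clear in the third; with a compile-time-constant
size like this it takes the small_const_nbits() fast path shown above:

	unsigned long a = 0xf;	/* bits 0-3 set */
	unsigned long b = 0xe;	/* bits 1-3 set */
	unsigned long c = 0x2;	/* bit 1 set; a & b & ~c has bits 2 and 3 set */

	find_nth_and_andnot_bit(&a, &b, &c, 4, 0);	/* -> 2 */
	find_nth_and_andnot_bit(&a, &b, &c, 4, 1);	/* -> 3 */
	find_nth_and_andnot_bit(&a, &b, &c, 4, 2);	/* -> >= 4: no such bit */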
@@ -245,5 +245,38 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
return cpumask_of_node(cpu_to_node(cpu));
}
#ifdef CONFIG_NUMA
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
#else
static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
return cpumask_nth(cpu, cpus);
}
static inline const struct cpumask *
sched_numa_hop_mask(unsigned int node, unsigned int hops)
{
return ERR_PTR(-EOPNOTSUPP);
}
#endif /* CONFIG_NUMA */
/**
* for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
* from a given node.
* @mask: the iteration variable.
* @node: the NUMA node to start the search from.
*
* Requires rcu_lock to be held.
*
* Yields cpu_online_mask for @node == NUMA_NO_NODE.
*/
#define for_each_numa_hop_mask(mask, node) \
for (unsigned int __hops = 0; \
mask = (node != NUMA_NO_NODE || __hops) ? \
sched_numa_hop_mask(node, __hops) : \
cpu_online_mask, \
!IS_ERR_OR_NULL(mask); \
__hops++)
#endif /* _LINUX_TOPOLOGY_H */
@@ -3,6 +3,8 @@
* Scheduler topology setup/handling methods
*/
#include <linux/bsearch.h>
DEFINE_MUTEX(sched_domains_mutex);
/* Protected by sched_domains_mutex: */
@@ -2067,6 +2069,94 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
return found;
}
struct __cmp_key {
const struct cpumask *cpus;
struct cpumask ***masks;
int node;
int cpu;
int w;
};
/*
* bsearch() callback: find the first hop whose cumulative mask contains
* more than @cpu CPUs from @cpus; k->w caches the previous hop's count.
*/
static int hop_cmp(const void *a, const void *b)
{
struct cpumask **prev_hop = *((struct cpumask ***)b - 1);
struct cpumask **cur_hop = *(struct cpumask ***)b;
struct __cmp_key *k = (struct __cmp_key *)a;
/* not enough matching CPUs up to this hop: search further away */
if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
return 1;
/* the first hop has no predecessor, so its prefix count is 0 */
k->w = (b == k->masks) ? 0 : cpumask_weight_and(k->cpus, prev_hop[k->node]);
if (k->w <= k->cpu)
return 0;
/* the previous hop already had enough CPUs: search closer */
return -1;
}
/*
* sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest
* cpu to @node from @cpus.
* @cpus: cpumask to find a cpu from
* @cpu: Nth cpu to find
* @node: the node to count hops from
*
* Returns: cpu, or nr_cpu_ids when nothing is found.
*/
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
struct cpumask ***hop_masks;
int hop, ret = nr_cpu_ids;
rcu_read_lock();
k.masks = rcu_dereference(sched_domains_numa_masks);
if (!k.masks)
goto unlock;
hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
hop = hop_masks - k.masks;
ret = hop ?
cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
cpumask_nth_and(cpu, cpus, k.masks[0][node]);
unlock:
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
/**
* sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
* @node
* @node: The node to count hops from.
* @hops: Include CPUs up to that many hops away. 0 means local node.
*
* Return: On success, a pointer to a cpumask of CPUs at most @hops away from
* @node, an error value otherwise.
*
* Requires rcu_lock to be held. Returned cpumask is only valid within that
* read-side section, copy it if required beyond that.
*
* Note that not all hops are equal in distance; see sched_init_numa() for how
* distances and masks are handled.
* Also note that this is a reflection of sched_domains_numa_masks, which may change
* during the lifetime of the system (offline nodes are taken out of the masks).
*/
const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
{
struct cpumask ***masks;
if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
return ERR_PTR(-EINVAL);
masks = rcu_dereference(sched_domains_numa_masks);
if (!masks)
return ERR_PTR(-EBUSY);
return masks[hops][node];
}
EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
......
@@ -110,15 +110,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
#endif
/**
* cpumask_local_spread - select the i'th cpu with local numa cpu's first
* cpumask_local_spread - select the i'th cpu based on NUMA distances
* @i: index number
* @node: local numa_node
*
* This function selects an online CPU according to a numa aware policy;
* local cpus are returned first, followed by non-local ones, then it
* wraps around.
* Returns online CPU according to a numa aware policy; local cpus are returned
* first, followed by non-local ones, then it wraps around.
*
* It's not very efficient, but useful for setup.
* For those who want to enumerate all CPUs based on their NUMA distances,
* i.e. call this function in a loop, like:
*
* for (i = 0; i < num_online_cpus(); i++) {
* cpu = cpumask_local_spread(i, node);
* do_something(cpu);
* }
*
* There's a better alternative based on for_each()-like iterators:
*
* for_each_numa_hop_mask(mask, node) {
* for_each_cpu_andnot(cpu, mask, prev)
* do_something(cpu);
* prev = mask;
* }
*
* It's slightly more verbose than the above, but more efficient: complexity
* of the iterator-based enumeration is O(sched_domains_numa_levels * nr_cpu_ids),
* while calling cpumask_local_spread() for each cpu is
* O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)) in total.
*/
unsigned int cpumask_local_spread(unsigned int i, int node)
{
@@ -127,24 +145,12 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
/* Wrap: we always want a cpu. */
i %= num_online_cpus();
if (node == NUMA_NO_NODE) {
cpu = cpumask_nth(i, cpu_online_mask);
if (cpu < nr_cpu_ids)
return cpu;
} else {
/* NUMA first. */
cpu = cpumask_nth_and(i, cpu_online_mask, cpumask_of_node(node));
if (cpu < nr_cpu_ids)
return cpu;
i -= cpumask_weight_and(cpu_online_mask, cpumask_of_node(node));
/* Skip NUMA nodes, done above. */
cpu = cpumask_nth_andnot(i, cpu_online_mask, cpumask_of_node(node));
if (cpu < nr_cpu_ids)
return cpu;
}
BUG();
cpu = (node == NUMA_NO_NODE) ?
cpumask_nth(i, cpu_online_mask) :
sched_numa_find_nth_cpu(cpu_online_mask, i, node);
WARN_ON(cpu >= nr_cpu_ids);
return cpu;
}
EXPORT_SYMBOL(cpumask_local_spread);
......
@@ -155,6 +155,15 @@ unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned l
}
EXPORT_SYMBOL(__find_nth_andnot_bit);
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1,
const unsigned long *addr2,
const unsigned long *addr3,
unsigned long size, unsigned long n)
{
return FIND_NTH_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], size, n);
}
EXPORT_SYMBOL(__find_nth_and_andnot_bit);
#ifndef find_next_and_bit
unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
unsigned long nbits, unsigned long start)
......