Commit 046f4c69 authored by Jianfeng Wang's avatar Jianfeng Wang Committed by Vlastimil Babka

slub: introduce count_partial_free_approx()

When reading "/proc/slabinfo", the kernel needs to report the number
of free objects for each kmem_cache. The current implementation uses
count_partial() to get it by scanning each kmem_cache_node's partial
slab list and summing free objects from every partial slab. This
process must hold per-kmem_cache_node spinlock and disable IRQ, and
may take a long time. Consequently, it can block slab allocations on
other CPUs and cause timeouts for network devices, when the partial
list is long. In production, even NMI watchdog can be triggered due
to this matter: e.g., for "buffer_head", the number of partial slabs
was observed to be ~1M in one kmem_cache_node. This problem was also
confirmed by others [1-3].

Iterating a partial list to get the exact count of objects can cause
soft lockups for a long list with or without the lock (e.g., if
preemption is disabled), and may not be very useful: the object count
can change after the lock is released. The approach of maintaining
free-object counters requires atomic operations on the fast path [3].

So, the fix is to introduce count_partial_free_approx(). This function
can be used for getting the free object count in a kmem_cache_node's
partial list. It limits the number of slabs to scan and avoids scanning
the whole list by giving an approximation for a long list. Suppose the
limit is N. If the list's length is not greater than N, output the exact
count by traversing the list; if its length is greater than N, output an
approximated count by traversing a subset of the list. The proposed
method is to scan N/2 slabs from the list's head and N/2 slabs from
the tail. For a partial list with ~280K slabs, benchmarks show that
it performs better than just counting from the list's head, after slabs
get sorted by kmem_cache_shrink(). Default the limit to 10000, as it
produces an approximation within 1% of the exact count for both
scenarios. Then, use count_partial_free_approx() in get_slabinfo().

Benchmarks: Diff = (exact - approximated) / exact
* Normal case (w/o kmem_cache_shrink()):
| MAX_TO_SCAN | Diff (count from head)| Diff (count head+tail)|
| 1000        |  0.43  %              |  1.09  %              |
| 5000        |  0.06  %              |  0.37  %              |
| 10000       |  0.02  %              |  0.16  %              |
| 20000       |  0.009 %              | -0.003 %              |

* Skewed case (w/ kmem_cache_shrink()):
| MAX_TO_SCAN | Diff (count from head)| Diff (count head+tail)|
| 1000        |  12.46 %              |  6.75  %              |
| 5000        |  5.38  %              |  1.27  %              |
| 10000       |  4.99  %              |  0.22  %              |
| 20000       |  4.86  %              | -0.06  %              |

[1] https://lore.kernel.org/linux-mm/alpine.DEB.2.21.2003031602460.1537@www.lameter.com/T/
[2] https://lore.kernel.org/lkml/alpine.DEB.2.22.394.2008071258020.55871@www.lameter.com/T/
[3] https://lore.kernel.org/lkml/1e01092b-140d-2bab-aeba-321a74a194ee@linux.com/T/Signed-off-by: default avatarJianfeng Wang <jianfeng.w.wang@oracle.com>
Acked-by: default avatarDavid Rientjes <rientjes@google.com>
Signed-off-by: default avatarVlastimil Babka <vbabka@suse.cz>
parent 5b15f3fb
...@@ -3238,6 +3238,43 @@ static unsigned long count_partial(struct kmem_cache_node *n, ...@@ -3238,6 +3238,43 @@ static unsigned long count_partial(struct kmem_cache_node *n,
#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
#ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG
#define MAX_PARTIAL_TO_SCAN 10000
static unsigned long count_partial_free_approx(struct kmem_cache_node *n)
{
unsigned long flags;
unsigned long x = 0;
struct slab *slab;
spin_lock_irqsave(&n->list_lock, flags);
if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) {
list_for_each_entry(slab, &n->partial, slab_list)
x += slab->objects - slab->inuse;
} else {
/*
* For a long list, approximate the total count of objects in
* it to meet the limit on the number of slabs to scan.
* Scan from both the list's head and tail for better accuracy.
*/
unsigned long scanned = 0;
list_for_each_entry(slab, &n->partial, slab_list) {
x += slab->objects - slab->inuse;
if (++scanned == MAX_PARTIAL_TO_SCAN / 2)
break;
}
list_for_each_entry_reverse(slab, &n->partial, slab_list) {
x += slab->objects - slab->inuse;
if (++scanned == MAX_PARTIAL_TO_SCAN)
break;
}
x = mult_frac(x, n->nr_partial, scanned);
x = min(x, node_nr_objs(n));
}
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
static noinline void static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{ {
...@@ -7116,7 +7153,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) ...@@ -7116,7 +7153,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
for_each_kmem_cache_node(s, node, n) { for_each_kmem_cache_node(s, node, n) {
nr_slabs += node_nr_slabs(n); nr_slabs += node_nr_slabs(n);
nr_objs += node_nr_objs(n); nr_objs += node_nr_objs(n);
nr_free += count_partial(n, count_free); nr_free += count_partial_free_approx(n);
} }
sinfo->active_objs = nr_objs - nr_free; sinfo->active_objs = nr_objs - nr_free;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment