Commit f4776199, authored by Mina Almasry, committed by Linus Torvalds

hugetlb: add hugetlb.*.numa_stat file

For hugetlb backed jobs/VMs it's critical to understand the numa
information for the memory backing these jobs to deliver optimal
performance.

Currently this technically can be queried from /proc/self/numa_maps, but
there are significant issues with that.  Namely:

1. Memory can be mapped or unmapped.

2. numa_maps are per process and need to be aggregated across all
   processes in the cgroup.  For shared memory this is more involved as
   the userspace needs to make sure it doesn't double count shared
   mappings.

3. I believe querying numa_maps needs to hold the mmap_lock which adds
   to the contention on this lock.

For these reasons I propose simply adding a hugetlb.*.numa_stat file,
which shows the numa information of the cgroup similarly to
memory.numa_stat.

On cgroup-v2:
   cat /sys/fs/cgroup/unified/test/hugetlb.2MB.numa_stat
   total=2097152 N0=2097152 N1=0

On cgroup-v1:
   cat /sys/fs/cgroup/hugetlb/test/hugetlb.2MB.numa_stat
   total=2097152 N0=2097152 N1=0
   hierarchical_total=2097152 N0=2097152 N1=0

This patch was tested manually by allocating hugetlb memory and querying
the hugetlb.*.numa_stat file of the cgroup and its parents.

[colin.i.king@googlemail.com: fix spelling mistake "hierarichal" -> "hierarchical"]
  Link: https://lkml.kernel.org/r/20211125090635.23508-1-colin.i.king@gmail.com
[keescook@chromium.org: fix copy/paste array assignment]
  Link: https://lkml.kernel.org/r/20211203065647.2819707-1-keescook@chromium.org

Link: https://lkml.kernel.org/r/20211123001020.4083653-1-almasrymina@google.com
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Jue Wang <juew@google.com>
Cc: Yang Yao <ygyao@google.com>
Cc: Joanna Li <joannali@google.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent c4dc63f0
...@@ -29,12 +29,14 @@ Brief summary of control files:: ...@@ -29,12 +29,14 @@ Brief summary of control files::
hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB usage limit hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB usage limit
hugetlb.<hugepagesize>.numa_stat # show the numa information of the hugetlb memory charged to this cgroup
For a system supporting three hugepage sizes (64k, 32M and 1G), the control For a system supporting three hugepage sizes (64k, 32M and 1G), the control
files include:: files include::
hugetlb.1GB.limit_in_bytes hugetlb.1GB.limit_in_bytes
hugetlb.1GB.max_usage_in_bytes hugetlb.1GB.max_usage_in_bytes
hugetlb.1GB.numa_stat
hugetlb.1GB.usage_in_bytes hugetlb.1GB.usage_in_bytes
hugetlb.1GB.failcnt hugetlb.1GB.failcnt
hugetlb.1GB.rsvd.limit_in_bytes hugetlb.1GB.rsvd.limit_in_bytes
...@@ -43,6 +45,7 @@ files include:: ...@@ -43,6 +45,7 @@ files include::
hugetlb.1GB.rsvd.failcnt hugetlb.1GB.rsvd.failcnt
hugetlb.64KB.limit_in_bytes hugetlb.64KB.limit_in_bytes
hugetlb.64KB.max_usage_in_bytes hugetlb.64KB.max_usage_in_bytes
hugetlb.64KB.numa_stat
hugetlb.64KB.usage_in_bytes hugetlb.64KB.usage_in_bytes
hugetlb.64KB.failcnt hugetlb.64KB.failcnt
hugetlb.64KB.rsvd.limit_in_bytes hugetlb.64KB.rsvd.limit_in_bytes
...@@ -51,6 +54,7 @@ files include:: ...@@ -51,6 +54,7 @@ files include::
hugetlb.64KB.rsvd.failcnt hugetlb.64KB.rsvd.failcnt
hugetlb.32MB.limit_in_bytes hugetlb.32MB.limit_in_bytes
hugetlb.32MB.max_usage_in_bytes hugetlb.32MB.max_usage_in_bytes
hugetlb.32MB.numa_stat
hugetlb.32MB.usage_in_bytes hugetlb.32MB.usage_in_bytes
hugetlb.32MB.failcnt hugetlb.32MB.failcnt
hugetlb.32MB.rsvd.limit_in_bytes hugetlb.32MB.rsvd.limit_in_bytes
......
...@@ -2266,6 +2266,11 @@ HugeTLB Interface Files ...@@ -2266,6 +2266,11 @@ HugeTLB Interface Files
are local to the cgroup i.e. not hierarchical. The file modified event are local to the cgroup i.e. not hierarchical. The file modified event
generated on this file reflects only the local events. generated on this file reflects only the local events.
hugetlb.<hugepagesize>.numa_stat
Similar to memory.numa_stat, it shows the numa information of the
hugetlb pages of <hugepagesize> in this cgroup. Only hugetlb pages that
are actively in use are included. The per-node values are in bytes.
Misc Misc
---- ----
......
...@@ -622,8 +622,8 @@ struct hstate { ...@@ -622,8 +622,8 @@ struct hstate {
#endif #endif
#ifdef CONFIG_CGROUP_HUGETLB #ifdef CONFIG_CGROUP_HUGETLB
/* cgroup control files */ /* cgroup control files */
struct cftype cgroup_files_dfl[7]; struct cftype cgroup_files_dfl[8];
struct cftype cgroup_files_legacy[9]; struct cftype cgroup_files_legacy[10];
#endif #endif
char name[HSTATE_NAME_LEN]; char name[HSTATE_NAME_LEN];
}; };
......
...@@ -36,6 +36,11 @@ enum hugetlb_memory_event { ...@@ -36,6 +36,11 @@ enum hugetlb_memory_event {
HUGETLB_NR_MEMORY_EVENTS, HUGETLB_NR_MEMORY_EVENTS,
}; };
/*
 * Per-NUMA-node accounting record for a hugetlb cgroup. Each cgroup holds
 * one pointer to such a record per node in hugetlb_cgroup::nodeinfo[].
 */
struct hugetlb_cgroup_per_node {
/*
 * hugetlb usage in pages, one counter per hstate (indexed by the hstate
 * index). Adjusted when a non-reserved charge is committed or uncharged,
 * and read with READ_ONCE() when the numa_stat file is shown.
 */
unsigned long usage[HUGE_MAX_HSTATE];
};
struct hugetlb_cgroup { struct hugetlb_cgroup {
struct cgroup_subsys_state css; struct cgroup_subsys_state css;
...@@ -57,6 +62,8 @@ struct hugetlb_cgroup { ...@@ -57,6 +62,8 @@ struct hugetlb_cgroup {
/* Handle for "hugetlb.events.local" */ /* Handle for "hugetlb.events.local" */
struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
struct hugetlb_cgroup_per_node *nodeinfo[];
}; };
static inline struct hugetlb_cgroup * static inline struct hugetlb_cgroup *
......
...@@ -123,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, ...@@ -123,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
} }
} }
static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
{
int node;
for_each_node(node)
kfree(h_cgroup->nodeinfo[node]);
kfree(h_cgroup);
}
static struct cgroup_subsys_state * static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{ {
struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
struct hugetlb_cgroup *h_cgroup; struct hugetlb_cgroup *h_cgroup;
int node;
h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
GFP_KERNEL);
h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
if (!h_cgroup) if (!h_cgroup)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
if (!parent_h_cgroup) if (!parent_h_cgroup)
root_h_cgroup = h_cgroup; root_h_cgroup = h_cgroup;
/*
* TODO: this routine can waste much memory for nodes which will
* never be onlined. It's better to use memory hotplug callback
* function.
*/
for_each_node(node) {
/* Set node_to_alloc to -1 for offline nodes. */
int node_to_alloc =
node_state(node, N_NORMAL_MEMORY) ? node : -1;
h_cgroup->nodeinfo[node] =
kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
GFP_KERNEL, node_to_alloc);
if (!h_cgroup->nodeinfo[node])
goto fail_alloc_nodeinfo;
}
hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
return &h_cgroup->css; return &h_cgroup->css;
fail_alloc_nodeinfo:
hugetlb_cgroup_free(h_cgroup);
return ERR_PTR(-ENOMEM);
} }
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{ {
struct hugetlb_cgroup *h_cgroup; hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
h_cgroup = hugetlb_cgroup_from_css(css);
kfree(h_cgroup);
} }
/* /*
...@@ -289,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, ...@@ -289,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
return; return;
__set_hugetlb_cgroup(page, h_cg, rsvd); __set_hugetlb_cgroup(page, h_cg, rsvd);
return; if (!rsvd) {
unsigned long usage =
h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
/*
* This write is not atomic due to fetching usage and writing
* to it, but that's fine because we call this with
* hugetlb_lock held anyway.
*/
WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
usage + nr_pages);
}
} }
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
...@@ -328,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, ...@@ -328,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
if (rsvd) if (rsvd)
css_put(&h_cg->css); css_put(&h_cg->css);
else {
return; unsigned long usage =
h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
/*
* This write is not atomic due to fetching usage and writing
* to it, but that's fine because we call this with
* hugetlb_lock held anyway.
*/
WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
usage - nr_pages);
}
} }
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
...@@ -418,6 +466,59 @@ enum { ...@@ -418,6 +466,59 @@ enum {
RES_RSVD_FAILCNT, RES_RSVD_FAILCNT,
}; };
static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
{
int nid;
struct cftype *cft = seq_cft(seq);
int idx = MEMFILE_IDX(cft->private);
bool legacy = MEMFILE_ATTR(cft->private);
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
struct cgroup_subsys_state *css;
unsigned long usage;
if (legacy) {
/* Add up usage across all nodes for the non-hierarchical total. */
usage = 0;
for_each_node_state(nid, N_MEMORY)
usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
seq_printf(seq, "total=%lu", usage * PAGE_SIZE);
/* Simply print the per-node usage for the non-hierarchical total. */
for_each_node_state(nid, N_MEMORY)
seq_printf(seq, " N%d=%lu", nid,
READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
PAGE_SIZE);
seq_putc(seq, '\n');
}
/*
* The hierarchical total is pretty much the value recorded by the
* counter, so use that.
*/
seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
/*
* For each node, transverse the css tree to obtain the hierarchical
* node usage.
*/
for_each_node_state(nid, N_MEMORY) {
usage = 0;
rcu_read_lock();
css_for_each_descendant_pre(css, &h_cg->css) {
usage += READ_ONCE(hugetlb_cgroup_from_css(css)
->nodeinfo[nid]
->usage[idx]);
}
rcu_read_unlock();
seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
}
seq_putc(seq, '\n');
return 0;
}
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft) struct cftype *cft)
{ {
...@@ -668,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) ...@@ -668,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
events_local_file[idx]); events_local_file[idx]);
cft->flags = CFTYPE_NOT_ON_ROOT; cft->flags = CFTYPE_NOT_ON_ROOT;
/* NULL terminate the last cft */ /* Add the numa stat file */
cft = &h->cgroup_files_dfl[6]; cft = &h->cgroup_files_dfl[6];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
cft->seq_show = hugetlb_cgroup_read_numa_stat;
cft->flags = CFTYPE_NOT_ON_ROOT;
/* NULL terminate the last cft */
cft = &h->cgroup_files_dfl[7];
memset(cft, 0, sizeof(*cft)); memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
...@@ -739,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx) ...@@ -739,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx)
cft->write = hugetlb_cgroup_reset; cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64; cft->read_u64 = hugetlb_cgroup_read_u64;
/* NULL terminate the last cft */ /* Add the numa stat file */
cft = &h->cgroup_files_legacy[8]; cft = &h->cgroup_files_legacy[8];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
cft->private = MEMFILE_PRIVATE(idx, 1);
cft->seq_show = hugetlb_cgroup_read_numa_stat;
/* NULL terminate the last cft */
cft = &h->cgroup_files_legacy[9];
memset(cft, 0, sizeof(*cft)); memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment