Commit 2edfd104 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'x86_cache_for_v6.9_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull resource control updates from Borislav Petkov:

 - Rework different aspects of the resctrl code like adding
   arch-specific accessors and splitting the locking, in order to
   accommodate ARM's MPAM implementation of hw resource control and be
   able to use the same filesystem control interface as on x86. Work
   by James Morse

 - Improve the memory bandwidth throttling heuristic to handle workloads
   with not too regular load levels which end up penalized unnecessarily

 - Use CPUID to detect the memory bandwidth enforcement limit on AMD

 - The usual set of fixes

* tag 'x86_cache_for_v6.9_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (30 commits)
  x86/resctrl: Remove lockdep annotation that triggers false positive
  x86/resctrl: Separate arch and fs resctrl locks
  x86/resctrl: Move domain helper migration into resctrl_offline_cpu()
  x86/resctrl: Add CPU offline callback for resctrl work
  x86/resctrl: Allow overflow/limbo handlers to be scheduled on any-but CPU
  x86/resctrl: Add CPU online callback for resctrl work
  x86/resctrl: Add helpers for system wide mon/alloc capable
  x86/resctrl: Make rdt_enable_key the arch's decision to switch
  x86/resctrl: Move alloc/mon static keys into helpers
  x86/resctrl: Make resctrl_mounted checks explicit
  x86/resctrl: Allow arch to allocate memory needed in resctrl_arch_rmid_read()
  x86/resctrl: Allow resctrl_arch_rmid_read() to sleep
  x86/resctrl: Queue mon_event_read() instead of sending an IPI
  x86/resctrl: Add cpumask_any_housekeeping() for limbo/overflow
  x86/resctrl: Move CLOSID/RMID matching and setting to use helpers
  x86/resctrl: Allocate the cleanest CLOSID by searching closid_num_dirty_rmid
  x86/resctrl: Use __set_bit()/__clear_bit() instead of open coding
  x86/resctrl: Track the number of dirty RMID a CLOSID has
  x86/resctrl: Allow RMID allocation to be scoped by CLOSID
  x86/resctrl: Access per-rmid structures by index
  ...
parents bfdb395a c0d848fc
...@@ -7,6 +7,13 @@ ...@@ -7,6 +7,13 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/jump_label.h> #include <linux/jump_label.h>
/*
* This value can never be a valid CLOSID, and is used when mapping a
* (closid, rmid) pair to an index and back. On x86 only the RMID is
* needed. The index is a software defined value.
*/
#define X86_RESCTRL_EMPTY_CLOSID ((u32)~0)
/** /**
* struct resctrl_pqr_state - State cache for the PQR MSR * struct resctrl_pqr_state - State cache for the PQR MSR
* @cur_rmid: The cached Resource Monitoring ID * @cur_rmid: The cached Resource Monitoring ID
...@@ -31,10 +38,47 @@ struct resctrl_pqr_state { ...@@ -31,10 +38,47 @@ struct resctrl_pqr_state {
DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state);
extern bool rdt_alloc_capable;
extern bool rdt_mon_capable;
DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
/* Accessor returning the current value of the rdt_alloc_capable flag. */
static inline bool resctrl_arch_alloc_capable(void)
{
return rdt_alloc_capable;
}
/*
 * Enable rdt_alloc_enable_key and increment the shared rdt_enable_key.
 * Both calls are the _cpuslocked variants, so the caller is presumably
 * expected to already hold the CPU hotplug lock - TODO confirm with callers.
 */
static inline void resctrl_arch_enable_alloc(void)
{
static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
static_branch_inc_cpuslocked(&rdt_enable_key);
}
/*
 * Disable rdt_alloc_enable_key and decrement the shared rdt_enable_key,
 * mirroring resctrl_arch_enable_alloc(). Uses the _cpuslocked variants, so
 * the caller is presumably expected to hold the CPU hotplug lock - TODO
 * confirm with callers.
 */
static inline void resctrl_arch_disable_alloc(void)
{
static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
static_branch_dec_cpuslocked(&rdt_enable_key);
}
/* Accessor returning the current value of the rdt_mon_capable flag. */
static inline bool resctrl_arch_mon_capable(void)
{
return rdt_mon_capable;
}
/*
 * Enable rdt_mon_enable_key and increment the shared rdt_enable_key.
 * Both calls are the _cpuslocked variants, so the caller is presumably
 * expected to already hold the CPU hotplug lock - TODO confirm with callers.
 */
static inline void resctrl_arch_enable_mon(void)
{
static_branch_enable_cpuslocked(&rdt_mon_enable_key);
static_branch_inc_cpuslocked(&rdt_enable_key);
}
/*
 * Disable rdt_mon_enable_key and decrement the shared rdt_enable_key,
 * mirroring resctrl_arch_enable_mon(). Uses the _cpuslocked variants, so
 * the caller is presumably expected to hold the CPU hotplug lock - TODO
 * confirm with callers.
 */
static inline void resctrl_arch_disable_mon(void)
{
static_branch_disable_cpuslocked(&rdt_mon_enable_key);
static_branch_dec_cpuslocked(&rdt_enable_key);
}
/* /*
* __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
* *
...@@ -88,12 +132,58 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) ...@@ -88,12 +132,58 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
return val * scale; return val * scale;
} }
/*
 * Store @closid and @rmid into @tsk's closid and rmid fields.
 * Each store goes through WRITE_ONCE(); the closid is written first.
 */
static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk,
u32 closid, u32 rmid)
{
WRITE_ONCE(tsk->closid, closid);
WRITE_ONCE(tsk->rmid, rmid);
}
/* Report whether the closid currently stored in @tsk equals @closid. */
static inline bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
{
	u32 task_closid = READ_ONCE(tsk->closid);

	return task_closid == closid;
}
/*
 * Report whether the rmid currently stored in @tsk equals @rmid.
 * The second argument (@ignored) takes no part in the comparison.
 */
static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored,
					   u32 rmid)
{
	u32 task_rmid = READ_ONCE(tsk->rmid);

	return task_rmid == rmid;
}
static inline void resctrl_sched_in(struct task_struct *tsk) static inline void resctrl_sched_in(struct task_struct *tsk)
{ {
if (static_branch_likely(&rdt_enable_key)) if (static_branch_likely(&rdt_enable_key))
__resctrl_sched_in(tsk); __resctrl_sched_in(tsk);
} }
/*
 * Number of RMID indexes in the system: one more than the boot CPU's
 * x86_cache_max_rmid.
 */
static inline u32 resctrl_arch_system_num_rmid_idx(void)
{
/* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
return boot_cpu_data.x86_cache_max_rmid + 1;
}
/*
 * Split an index back into its (closid, rmid) pair. Here the index is the
 * rmid itself, and *closid is set to X86_RESCTRL_EMPTY_CLOSID.
 */
static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
{
*rmid = idx;
*closid = X86_RESCTRL_EMPTY_CLOSID;
}
/*
 * Map a (closid, rmid) pair to an index. The first argument is not used:
 * the returned index is @rmid unchanged.
 */
static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid)
{
return rmid;
}
/* x86 can always read an rmid, nothing needs allocating */
struct rdt_resource;
/*
 * Nothing is allocated here: the function only calls might_sleep() and then
 * returns NULL. @r and @evtid are unused.
 */
static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid)
{
	might_sleep();
	return NULL;
}
/*
 * Counterpart of resctrl_arch_mon_ctx_alloc(), which returns NULL and
 * allocates nothing, so there is nothing to release. All arguments are
 * unused.
 */
static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid,
					     void *ctx)
{
}
void resctrl_cpu_detect(struct cpuinfo_x86 *c); void resctrl_cpu_detect(struct cpuinfo_x86 *c);
#else #else
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#define pr_fmt(fmt) "resctrl: " fmt #define pr_fmt(fmt) "resctrl: " fmt
#include <linux/cpu.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/err.h> #include <linux/err.h>
#include <linux/cacheinfo.h> #include <linux/cacheinfo.h>
...@@ -25,8 +26,15 @@ ...@@ -25,8 +26,15 @@
#include <asm/resctrl.h> #include <asm/resctrl.h>
#include "internal.h" #include "internal.h"
/* Mutex to protect rdtgroup access. */ /*
DEFINE_MUTEX(rdtgroup_mutex); * rdt_domain structures are kfree()d when their last CPU goes offline,
* and allocated when the first CPU in a new domain comes online.
* The rdt_resource's domain list is updated when this happens. Readers of
* the domain list must either take cpus_read_lock(), or rely on an RCU
* read-side critical section, to avoid observing concurrent modification.
* All writers take this mutex:
*/
static DEFINE_MUTEX(domain_list_lock);
/* /*
* The cached resctrl_pqr_state is strictly per CPU and can never be * The cached resctrl_pqr_state is strictly per CPU and can never be
...@@ -136,15 +144,15 @@ static inline void cache_alloc_hsw_probe(void) ...@@ -136,15 +144,15 @@ static inline void cache_alloc_hsw_probe(void)
{ {
struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
struct rdt_resource *r = &hw_res->r_resctrl; struct rdt_resource *r = &hw_res->r_resctrl;
u32 l, h, max_cbm = BIT_MASK(20) - 1; u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;
if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
return; return;
rdmsr(MSR_IA32_L3_CBM_BASE, l, h); rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
/* If all the bits were set in MSR, return success */ /* If all the bits were set in MSR, return success */
if (l != max_cbm) if (l3_cbm_0 != max_cbm)
return; return;
hw_res->num_closid = 4; hw_res->num_closid = 4;
...@@ -231,9 +239,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r) ...@@ -231,9 +239,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
static bool __rdt_get_mem_config_amd(struct rdt_resource *r) static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
{ {
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_3_eax eax; u32 eax, ebx, ecx, edx, subleaf;
union cpuid_0x10_x_edx edx;
u32 ebx, ecx, subleaf;
/* /*
* Query CPUID_Fn80000020_EDX_x01 for MBA and * Query CPUID_Fn80000020_EDX_x01 for MBA and
...@@ -241,9 +247,9 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) ...@@ -241,9 +247,9 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
*/ */
subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1; subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full); cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
hw_res->num_closid = edx.split.cos_max + 1; hw_res->num_closid = edx + 1;
r->default_ctrl = MAX_MBA_BW_AMD; r->default_ctrl = 1 << eax;
/* AMD does not use delay */ /* AMD does not use delay */
r->membw.delay_linear = false; r->membw.delay_linear = false;
...@@ -512,6 +518,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) ...@@ -512,6 +518,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
struct rdt_domain *d; struct rdt_domain *d;
int err; int err;
lockdep_assert_held(&domain_list_lock);
d = rdt_find_domain(r, id, &add_pos); d = rdt_find_domain(r, id, &add_pos);
if (IS_ERR(d)) { if (IS_ERR(d)) {
pr_warn("Couldn't find cache id for CPU %d\n", cpu); pr_warn("Couldn't find cache id for CPU %d\n", cpu);
...@@ -545,11 +553,12 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) ...@@ -545,11 +553,12 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return; return;
} }
list_add_tail(&d->list, add_pos); list_add_tail_rcu(&d->list, add_pos);
err = resctrl_online_domain(r, d); err = resctrl_online_domain(r, d);
if (err) { if (err) {
list_del(&d->list); list_del_rcu(&d->list);
synchronize_rcu();
domain_free(hw_dom); domain_free(hw_dom);
} }
} }
...@@ -560,6 +569,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) ...@@ -560,6 +569,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
struct rdt_hw_domain *hw_dom; struct rdt_hw_domain *hw_dom;
struct rdt_domain *d; struct rdt_domain *d;
lockdep_assert_held(&domain_list_lock);
d = rdt_find_domain(r, id, NULL); d = rdt_find_domain(r, id, NULL);
if (IS_ERR_OR_NULL(d)) { if (IS_ERR_OR_NULL(d)) {
pr_warn("Couldn't find cache id for CPU %d\n", cpu); pr_warn("Couldn't find cache id for CPU %d\n", cpu);
...@@ -570,7 +581,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) ...@@ -570,7 +581,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
cpumask_clear_cpu(cpu, &d->cpu_mask); cpumask_clear_cpu(cpu, &d->cpu_mask);
if (cpumask_empty(&d->cpu_mask)) { if (cpumask_empty(&d->cpu_mask)) {
resctrl_offline_domain(r, d); resctrl_offline_domain(r, d);
list_del(&d->list); list_del_rcu(&d->list);
synchronize_rcu();
/* /*
* rdt_domain "d" is going to be freed below, so clear * rdt_domain "d" is going to be freed below, so clear
...@@ -582,73 +594,47 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) ...@@ -582,73 +594,47 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
return; return;
} }
if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) {
if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
cancel_delayed_work(&d->mbm_over);
mbm_setup_overflow_handler(d, 0);
}
if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
has_busy_rmid(r, d)) {
cancel_delayed_work(&d->cqm_limbo);
cqm_setup_limbo_handler(d, 0);
}
}
} }
static void clear_closid_rmid(int cpu) static void clear_closid_rmid(int cpu)
{ {
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
state->default_closid = 0; state->default_closid = RESCTRL_RESERVED_CLOSID;
state->default_rmid = 0; state->default_rmid = RESCTRL_RESERVED_RMID;
state->cur_closid = 0; state->cur_closid = RESCTRL_RESERVED_CLOSID;
state->cur_rmid = 0; state->cur_rmid = RESCTRL_RESERVED_RMID;
wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
RESCTRL_RESERVED_CLOSID);
} }
static int resctrl_online_cpu(unsigned int cpu) static int resctrl_arch_online_cpu(unsigned int cpu)
{ {
struct rdt_resource *r; struct rdt_resource *r;
mutex_lock(&rdtgroup_mutex); mutex_lock(&domain_list_lock);
for_each_capable_rdt_resource(r) for_each_capable_rdt_resource(r)
domain_add_cpu(cpu, r); domain_add_cpu(cpu, r);
/* The cpu is set in default rdtgroup after online. */ mutex_unlock(&domain_list_lock);
cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
clear_closid_rmid(cpu); clear_closid_rmid(cpu);
mutex_unlock(&rdtgroup_mutex); resctrl_online_cpu(cpu);
return 0; return 0;
} }
static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) static int resctrl_arch_offline_cpu(unsigned int cpu)
{
struct rdtgroup *cr;
list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) {
break;
}
}
}
static int resctrl_offline_cpu(unsigned int cpu)
{ {
struct rdtgroup *rdtgrp;
struct rdt_resource *r; struct rdt_resource *r;
mutex_lock(&rdtgroup_mutex); resctrl_offline_cpu(cpu);
mutex_lock(&domain_list_lock);
for_each_capable_rdt_resource(r) for_each_capable_rdt_resource(r)
domain_remove_cpu(cpu, r); domain_remove_cpu(cpu, r);
list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { mutex_unlock(&domain_list_lock);
if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
clear_childcpus(rdtgrp, cpu);
break;
}
}
clear_closid_rmid(cpu); clear_closid_rmid(cpu);
mutex_unlock(&rdtgroup_mutex);
return 0; return 0;
} }
...@@ -968,7 +954,8 @@ static int __init resctrl_late_init(void) ...@@ -968,7 +954,8 @@ static int __init resctrl_late_init(void)
state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"x86/resctrl/cat:online:", "x86/resctrl/cat:online:",
resctrl_online_cpu, resctrl_offline_cpu); resctrl_arch_online_cpu,
resctrl_arch_offline_cpu);
if (state < 0) if (state < 0)
return state; return state;
...@@ -992,8 +979,14 @@ late_initcall(resctrl_late_init); ...@@ -992,8 +979,14 @@ late_initcall(resctrl_late_init);
static void __exit resctrl_exit(void) static void __exit resctrl_exit(void)
{ {
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
cpuhp_remove_state(rdt_online); cpuhp_remove_state(rdt_online);
rdtgroup_exit(); rdtgroup_exit();
if (r->mon_capable)
rdt_put_mon_l3_config();
} }
__exitcall(resctrl_exit); __exitcall(resctrl_exit);
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
#include <linux/kernfs.h> #include <linux/kernfs.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/tick.h>
#include "internal.h" #include "internal.h"
/* /*
...@@ -210,6 +212,9 @@ static int parse_line(char *line, struct resctrl_schema *s, ...@@ -210,6 +212,9 @@ static int parse_line(char *line, struct resctrl_schema *s,
struct rdt_domain *d; struct rdt_domain *d;
unsigned long dom_id; unsigned long dom_id;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
(r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
...@@ -314,6 +319,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) ...@@ -314,6 +319,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
struct rdt_domain *d; struct rdt_domain *d;
u32 idx; u32 idx;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM; return -ENOMEM;
...@@ -379,11 +387,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, ...@@ -379,11 +387,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
return -EINVAL; return -EINVAL;
buf[nbytes - 1] = '\0'; buf[nbytes - 1] = '\0';
cpus_read_lock();
rdtgrp = rdtgroup_kn_lock_live(of->kn); rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) { if (!rdtgrp) {
rdtgroup_kn_unlock(of->kn); rdtgroup_kn_unlock(of->kn);
cpus_read_unlock();
return -ENOENT; return -ENOENT;
} }
rdt_last_cmd_clear(); rdt_last_cmd_clear();
...@@ -445,7 +451,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, ...@@ -445,7 +451,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
out: out:
rdt_staged_configs_clear(); rdt_staged_configs_clear();
rdtgroup_kn_unlock(of->kn); rdtgroup_kn_unlock(of->kn);
cpus_read_unlock();
return ret ?: nbytes; return ret ?: nbytes;
} }
...@@ -465,6 +470,9 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo ...@@ -465,6 +470,9 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo
bool sep = false; bool sep = false;
u32 ctrl_val; u32 ctrl_val;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
seq_printf(s, "%*s:", max_name_width, schema->name); seq_printf(s, "%*s:", max_name_width, schema->name);
list_for_each_entry(dom, &r->domains, list) { list_for_each_entry(dom, &r->domains, list) {
if (sep) if (sep)
...@@ -522,12 +530,24 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, ...@@ -522,12 +530,24 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
return ret; return ret;
} }
/*
 * Adapter that gives mon_event_count() an int-returning signature so it can
 * be handed to smp_call_on_cpu(). Forwards @arg unchanged and always
 * returns 0.
 */
static int smp_mon_event_count(void *arg)
{
mon_event_count(arg);
return 0;
}
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_domain *d, struct rdtgroup *rdtgrp, struct rdt_domain *d, struct rdtgroup *rdtgrp,
int evtid, int first) int evtid, int first)
{ {
int cpu;
/* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
/* /*
* setup the parameters to send to the IPI to read the data. * Setup the parameters to pass to mon_event_count() to read the data.
*/ */
rr->rgrp = rdtgrp; rr->rgrp = rdtgrp;
rr->evtid = evtid; rr->evtid = evtid;
...@@ -535,8 +555,26 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, ...@@ -535,8 +555,26 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
rr->d = d; rr->d = d;
rr->val = 0; rr->val = 0;
rr->first = first; rr->first = first;
rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
if (IS_ERR(rr->arch_mon_ctx)) {
rr->err = -EINVAL;
return;
}
cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU);
/*
* cpumask_any_housekeeping() prefers housekeeping CPUs, but
* are all the CPUs nohz_full? If yes, pick a CPU to IPI.
* MPAM's resctrl_arch_rmid_read() is unable to read the
* counters on some platforms if it's called in IRQ context.
*/
if (tick_nohz_full_cpu(cpu))
smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
else
smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
} }
int rdtgroup_mondata_show(struct seq_file *m, void *arg) int rdtgroup_mondata_show(struct seq_file *m, void *arg)
......
...@@ -7,6 +7,9 @@ ...@@ -7,6 +7,9 @@
#include <linux/kernfs.h> #include <linux/kernfs.h>
#include <linux/fs_context.h> #include <linux/fs_context.h>
#include <linux/jump_label.h> #include <linux/jump_label.h>
#include <linux/tick.h>
#include <asm/resctrl.h>
#define L3_QOS_CDP_ENABLE 0x01ULL #define L3_QOS_CDP_ENABLE 0x01ULL
...@@ -18,7 +21,6 @@ ...@@ -18,7 +21,6 @@
#define MBM_OVERFLOW_INTERVAL 1000 #define MBM_OVERFLOW_INTERVAL 1000
#define MAX_MBA_BW 100u #define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4 #define MBA_IS_LINEAR 0x4
#define MAX_MBA_BW_AMD 0x800
#define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define MBM_CNTR_WIDTH_OFFSET_AMD 20
#define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_ERROR BIT_ULL(63)
...@@ -54,6 +56,46 @@ ...@@ -54,6 +56,46 @@
/* Max event bits supported */ /* Max event bits supported */
#define MAX_EVT_CONFIG_BITS GENMASK(6, 0) #define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
/**
 * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
 *			        aren't marked nohz_full
 * @mask:	The mask to pick a CPU from.
 * @exclude_cpu: The CPU to avoid picking.
 *
 * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping
 * CPUs that don't use nohz_full, these are preferred. Pass
 * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs.
 *
 * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available.
 */
static inline unsigned int
cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu)
{
	unsigned int cpu, hk_cpu;

	if (exclude_cpu == RESCTRL_PICK_ANY_CPU)
		cpu = cpumask_any(mask);
	else
		cpu = cpumask_any_but(mask, exclude_cpu);

	/*
	 * Only consult tick_nohz_full_mask when nohz_full is actually in use
	 * at runtime. Checking that the feature is merely compiled in is not
	 * enough: the mask may never have been set up if nohz_full was not
	 * requested at boot.
	 */
	if (!tick_nohz_full_enabled())
		return cpu;

	/* If the CPU picked isn't marked nohz_full nothing more needs doing. */
	if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu))
		return cpu;

	/* Try to find a CPU that isn't nohz_full to use in preference */
	hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask);
	if (hk_cpu == exclude_cpu)
		hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask);

	/* Keep the original pick when no housekeeping CPU was found. */
	if (hk_cpu < nr_cpu_ids)
		cpu = hk_cpu;

	return cpu;
}
struct rdt_fs_context { struct rdt_fs_context {
struct kernfs_fs_context kfc; struct kernfs_fs_context kfc;
bool enable_cdpl2; bool enable_cdpl2;
...@@ -69,9 +111,6 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) ...@@ -69,9 +111,6 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
return container_of(kfc, struct rdt_fs_context, kfc); return container_of(kfc, struct rdt_fs_context, kfc);
} }
DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
/** /**
* struct mon_evt - Entry in the event list of a resource * struct mon_evt - Entry in the event list of a resource
* @evtid: event id * @evtid: event id
...@@ -112,12 +151,12 @@ struct rmid_read { ...@@ -112,12 +151,12 @@ struct rmid_read {
bool first; bool first;
int err; int err;
u64 val; u64 val;
void *arch_mon_ctx;
}; };
extern bool rdt_alloc_capable;
extern bool rdt_mon_capable;
extern unsigned int rdt_mon_features; extern unsigned int rdt_mon_features;
extern struct list_head resctrl_schema_all; extern struct list_head resctrl_schema_all;
extern bool resctrl_mounted;
enum rdt_group_type { enum rdt_group_type {
RDTCTRL_GROUP = 0, RDTCTRL_GROUP = 0,
...@@ -296,14 +335,10 @@ struct rftype { ...@@ -296,14 +335,10 @@ struct rftype {
* struct mbm_state - status for each MBM counter in each domain * struct mbm_state - status for each MBM counter in each domain
* @prev_bw_bytes: Previous bytes value read for bandwidth calculation * @prev_bw_bytes: Previous bytes value read for bandwidth calculation
* @prev_bw: The most recent bandwidth in MBps * @prev_bw: The most recent bandwidth in MBps
* @delta_bw: Difference between the current and previous bandwidth
* @delta_comp: Indicates whether to compute the delta_bw
*/ */
struct mbm_state { struct mbm_state {
u64 prev_bw_bytes; u64 prev_bw_bytes;
u32 prev_bw; u32 prev_bw;
u32 delta_bw;
bool delta_comp;
}; };
/** /**
...@@ -395,6 +430,8 @@ struct rdt_parse_data { ...@@ -395,6 +430,8 @@ struct rdt_parse_data {
* @msr_update: Function pointer to update QOS MSRs * @msr_update: Function pointer to update QOS MSRs
* @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mon_scale: cqm counter * mon_scale = occupancy in bytes
* @mbm_width: Monitor width, to detect and correct for overflow. * @mbm_width: Monitor width, to detect and correct for overflow.
* @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth
* Monitoring Event Configuration (BMEC) is supported.
* @cdp_enabled: CDP state of this resource * @cdp_enabled: CDP state of this resource
* *
* Members of this structure are either private to the architecture * Members of this structure are either private to the architecture
...@@ -409,6 +446,7 @@ struct rdt_hw_resource { ...@@ -409,6 +446,7 @@ struct rdt_hw_resource {
struct rdt_resource *r); struct rdt_resource *r);
unsigned int mon_scale; unsigned int mon_scale;
unsigned int mbm_width; unsigned int mbm_width;
unsigned int mbm_cfg_mask;
bool cdp_enabled; bool cdp_enabled;
}; };
...@@ -426,8 +464,6 @@ extern struct mutex rdtgroup_mutex; ...@@ -426,8 +464,6 @@ extern struct mutex rdtgroup_mutex;
extern struct rdt_hw_resource rdt_resources_all[]; extern struct rdt_hw_resource rdt_resources_all[];
extern struct rdtgroup rdtgroup_default; extern struct rdtgroup rdtgroup_default;
DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
extern struct dentry *debugfs_resctrl; extern struct dentry *debugfs_resctrl;
enum resctrl_res_level { enum resctrl_res_level {
...@@ -543,9 +579,10 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); ...@@ -543,9 +579,10 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
int closids_supported(void); int closids_supported(void);
void closid_free(int closid); void closid_free(int closid);
int alloc_rmid(void); int alloc_rmid(u32 closid);
void free_rmid(u32 rmid); void free_rmid(u32 closid, u32 rmid);
int rdt_get_mon_l3_config(struct rdt_resource *r); int rdt_get_mon_l3_config(struct rdt_resource *r);
void __exit rdt_put_mon_l3_config(void);
bool __init rdt_cpu_has(int flag); bool __init rdt_cpu_has(int flag);
void mon_event_count(void *info); void mon_event_count(void *info);
int rdtgroup_mondata_show(struct seq_file *m, void *arg); int rdtgroup_mondata_show(struct seq_file *m, void *arg);
...@@ -553,17 +590,21 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, ...@@ -553,17 +590,21 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_domain *d, struct rdtgroup *rdtgrp, struct rdt_domain *d, struct rdtgroup *rdtgrp,
int evtid, int first); int evtid, int first);
void mbm_setup_overflow_handler(struct rdt_domain *dom, void mbm_setup_overflow_handler(struct rdt_domain *dom,
unsigned long delay_ms); unsigned long delay_ms,
int exclude_cpu);
void mbm_handle_overflow(struct work_struct *work); void mbm_handle_overflow(struct work_struct *work);
void __init intel_rdt_mbm_apply_quirk(void); void __init intel_rdt_mbm_apply_quirk(void);
bool is_mba_sc(struct rdt_resource *r); bool is_mba_sc(struct rdt_resource *r);
void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
int exclude_cpu);
void cqm_handle_limbo(struct work_struct *work); void cqm_handle_limbo(struct work_struct *work);
bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); bool has_busy_rmid(struct rdt_domain *d);
void __check_limbo(struct rdt_domain *d, bool force_free); void __check_limbo(struct rdt_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
void __init thread_throttle_mode_init(void); void __init thread_throttle_mode_init(void);
void __init mbm_config_rftype_init(const char *config); void __init mbm_config_rftype_init(const char *config);
void rdt_staged_configs_clear(void); void rdt_staged_configs_clear(void);
bool closid_allocated(unsigned int closid);
int resctrl_find_cleanest_closid(void);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */ #endif /* _ASM_X86_RESCTRL_INTERNAL_H */
This diff is collapsed.
...@@ -581,7 +581,7 @@ static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) ...@@ -581,7 +581,7 @@ static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
if (ret) if (ret)
goto err_cpus; goto err_cpus;
if (rdt_mon_capable) { if (resctrl_arch_mon_capable()) {
ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
if (ret) if (ret)
goto err_cpus_list; goto err_cpus_list;
...@@ -628,7 +628,7 @@ static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) ...@@ -628,7 +628,7 @@ static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
if (ret) if (ret)
goto err_cpus; goto err_cpus;
if (rdt_mon_capable) { if (resctrl_arch_mon_capable()) {
ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
if (ret) if (ret)
goto err_cpus_list; goto err_cpus_list;
...@@ -752,7 +752,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) ...@@ -752,7 +752,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
* anymore when this group would be used for pseudo-locking. This * anymore when this group would be used for pseudo-locking. This
* is safe to call on platforms not capable of monitoring. * is safe to call on platforms not capable of monitoring.
*/ */
free_rmid(rdtgrp->mon.rmid); free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
ret = 0; ret = 0;
goto out; goto out;
...@@ -776,8 +776,8 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) ...@@ -776,8 +776,8 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
{ {
int ret; int ret;
if (rdt_mon_capable) { if (resctrl_arch_mon_capable()) {
ret = alloc_rmid(); ret = alloc_rmid(rdtgrp->closid);
if (ret < 0) { if (ret < 0) {
rdt_last_cmd_puts("Out of RMIDs\n"); rdt_last_cmd_puts("Out of RMIDs\n");
return ret; return ret;
...@@ -787,7 +787,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) ...@@ -787,7 +787,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
ret = rdtgroup_locksetup_user_restore(rdtgrp); ret = rdtgroup_locksetup_user_restore(rdtgrp);
if (ret) { if (ret) {
free_rmid(rdtgrp->mon.rmid); free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
return ret; return ret;
} }
...@@ -844,6 +844,9 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) ...@@ -844,6 +844,9 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
struct rdt_domain *d_i; struct rdt_domain *d_i;
bool ret = false; bool ret = false;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
return true; return true;
......
This diff is collapsed.
...@@ -6,6 +6,12 @@ ...@@ -6,6 +6,12 @@
#include <linux/list.h> #include <linux/list.h>
#include <linux/pid.h> #include <linux/pid.h>
/* CLOSID, RMID value used by the default control group */
#define RESCTRL_RESERVED_CLOSID 0
#define RESCTRL_RESERVED_RMID 0
#define RESCTRL_PICK_ANY_CPU -1
#ifdef CONFIG_PROC_CPU_RESCTRL #ifdef CONFIG_PROC_CPU_RESCTRL
int proc_resctrl_show(struct seq_file *m, int proc_resctrl_show(struct seq_file *m,
...@@ -153,7 +159,7 @@ struct resctrl_schema; ...@@ -153,7 +159,7 @@ struct resctrl_schema;
* @cache_level: Which cache level defines scope of this resource * @cache_level: Which cache level defines scope of this resource
* @cache: Cache allocation related data * @cache: Cache allocation related data
* @membw: If the component has bandwidth controls, their properties. * @membw: If the component has bandwidth controls, their properties.
* @domains: All domains for this resource * @domains: RCU list of all domains for this resource
* @name: Name to use in "schemata" file. * @name: Name to use in "schemata" file.
* @data_width: Character width of data when displaying * @data_width: Character width of data when displaying
* @default_ctrl: Specifies default cache cbm or memory B/W percent. * @default_ctrl: Specifies default cache cbm or memory B/W percent.
...@@ -219,36 +225,70 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, ...@@ -219,36 +225,70 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
u32 closid, enum resctrl_conf_type type); u32 closid, enum resctrl_conf_type type);
int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d);
void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d);
void resctrl_online_cpu(unsigned int cpu);
void resctrl_offline_cpu(unsigned int cpu);
/** /**
* resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid
* for this resource and domain. * for this resource and domain.
* @r: resource that the counter should be read from. * @r: resource that the counter should be read from.
* @d: domain that the counter should be read from. * @d: domain that the counter should be read from.
* @closid: closid that matches the rmid. Depending on the architecture, the
* counter may match traffic of both @closid and @rmid, or @rmid
* only.
* @rmid: rmid of the counter to read. * @rmid: rmid of the counter to read.
* @eventid: eventid to read, e.g. L3 occupancy. * @eventid: eventid to read, e.g. L3 occupancy.
* @val: result of the counter read in bytes. * @val: result of the counter read in bytes.
* @arch_mon_ctx: An architecture specific value from
* resctrl_arch_mon_ctx_alloc(), for MPAM this identifies
* the hardware monitor allocated for this read request.
* *
* Call from process context on a CPU that belongs to domain @d. * Some architectures need to sleep when first programming some of the counters.
* (specifically: arm64's MPAM cache occupancy counters can return 'not ready'
* for a short period of time). Call from a non-migratable process context on
* a CPU that belongs to domain @d. e.g. use smp_call_on_cpu() or
* schedule_work_on(). This function can be called with interrupts masked,
* e.g. using smp_call_function_any(), but may consistently return an error.
* *
* Return: * Return:
* 0 on success, or -EIO, -EINVAL etc on error. * 0 on success, or -EIO, -EINVAL etc on error.
*/ */
int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
u32 rmid, enum resctrl_event_id eventid, u64 *val); u32 closid, u32 rmid, enum resctrl_event_id eventid,
u64 *val, void *arch_mon_ctx);
/**
 * resctrl_arch_rmid_read_context_check() - flag callers in an invalid context
 *
 * With CONFIG_DEBUG_ATOMIC_SLEEP enabled, this produces a warning if
 * resctrl_arch_rmid_read() is reached with preemption disabled.
 *
 * resctrl_arch_rmid_read() is allowed to sleep whenever interrupts are
 * unmasked. That lets NOHZ_FULL systems fall back to an IPI (which fails
 * if the read needed to sleep), while in the common case the work is
 * scheduled and the read is free to sleep.
 */
static inline void resctrl_arch_rmid_read_context_check(void)
{
	/* Interrupts masked: the sleep check does not apply, so skip it. */
	if (irqs_disabled())
		return;

	might_sleep();
}
/** /**
* resctrl_arch_reset_rmid() - Reset any private state associated with rmid * resctrl_arch_reset_rmid() - Reset any private state associated with rmid
* and eventid. * and eventid.
* @r: The domain's resource. * @r: The domain's resource.
* @d: The rmid's domain. * @d: The rmid's domain.
* @closid: closid that matches the rmid. Depending on the architecture, the
* counter may match traffic of both @closid and @rmid, or @rmid only.
* @rmid: The rmid whose counter values should be reset. * @rmid: The rmid whose counter values should be reset.
* @eventid: The eventid whose counter values should be reset. * @eventid: The eventid whose counter values should be reset.
* *
* This can be called from any CPU. * This can be called from any CPU.
*/ */
void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
u32 rmid, enum resctrl_event_id eventid); u32 closid, u32 rmid,
enum resctrl_event_id eventid);
/** /**
* resctrl_arch_reset_rmid_all() - Reset all private state associated with * resctrl_arch_reset_rmid_all() - Reset all private state associated with
......
...@@ -164,9 +164,16 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } ...@@ -164,9 +164,16 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
#endif /* !CONFIG_NO_HZ_COMMON */ #endif /* !CONFIG_NO_HZ_COMMON */
/*
 * Mask of CPUs that are nohz_full.
 *
 * Users should be guarded by CONFIG_NO_HZ_FULL or a tick_nohz_full_cpu()
 * check.
 *
 * NOTE(review): this is only a declaration; presumably the mask is defined
 * only when CONFIG_NO_HZ_FULL is enabled, hence the guard requirement above -
 * confirm against the defining file.
 */
extern cpumask_var_t tick_nohz_full_mask;
#ifdef CONFIG_NO_HZ_FULL #ifdef CONFIG_NO_HZ_FULL
extern bool tick_nohz_full_running; extern bool tick_nohz_full_running;
extern cpumask_var_t tick_nohz_full_mask;
static inline bool tick_nohz_full_enabled(void) static inline bool tick_nohz_full_enabled(void)
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment