Commit 1156b441 authored by Davidlohr Bueso, committed by Dan Williams

memregion: Add cpu_cache_invalidate_memregion() interface

With CXL security features, and CXL dynamic provisioning, the global CPU
cache flushing requirements of nvdimm are no longer specific to that
subsystem, even beyond the scope of security_ops. CXL will need such
semantics for features not necessarily limited to persistent memory.

The functionality this is enabling is to be able to instantaneously
secure erase potentially terabytes of memory at once and the kernel
needs to be sure that none of the data from before the erase is still
present in the cache. It is also used when unlocking a memory device
where speculative reads and firmware accesses could have cached poison
from before the device was unlocked. Lastly this facility is used when
mapping new devices, or new capacity into an established physical
address range. I.e. when the driver switches DeviceA mapping AddressX to
DeviceB mapping AddressX then any cached data from DeviceA:AddressX
needs to be invalidated.

This capability is typically only used once per-boot (for unlock), or
once per bare metal provisioning event (secure erase), like when handing
off the system to another tenant or decommissioning a device. It may
also be used for dynamic CXL region provisioning.

Users must first call cpu_cache_has_invalidate_memregion() to know
whether this functionality is available on the architecture. On x86 this
respects the constraints of when wbinvd() is tolerable. It is already
the case that wbinvd() is problematic to allow in VMs due to its global
performance impact and KVM, for example, has been known to just trap and
ignore the call. With confidential computing, guest execution of wbinvd()
may even trigger an exception. Given guests should not be messing with
the bare metal address map via CXL configuration changes,
cpu_cache_has_invalidate_memregion() returns false in VMs.

While this global cache invalidation facility is exported to modules,
since NVDIMM and CXL support can be built as a module, it is not for
general use. The intent is that this facility is not available outside
of specific "device-memory" use cases. To make that expectation as clear
as possible the API is scoped to a new "DEVMEM" module namespace that
only the NVDIMM and CXL subsystems are expected to import.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Tested-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Co-developed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
parent 487d828d
...@@ -69,6 +69,7 @@ config X86 ...@@ -69,6 +69,7 @@ config X86
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/cc_platform.h> #include <linux/cc_platform.h>
#include <linux/set_memory.h> #include <linux/set_memory.h>
#include <linux/memregion.h>
#include <asm/e820/api.h> #include <asm/e820/api.h>
#include <asm/processor.h> #include <asm/processor.h>
...@@ -330,6 +331,23 @@ void arch_invalidate_pmem(void *addr, size_t size) ...@@ -330,6 +331,23 @@ void arch_invalidate_pmem(void *addr, size_t size)
EXPORT_SYMBOL_GPL(arch_invalidate_pmem); EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
#endif #endif
#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
/*
 * Tell callers whether cpu_cache_invalidate_memregion() may be used.
 *
 * Returns false when the X86_FEATURE_HYPERVISOR feature is enabled and
 * true otherwise.
 */
bool cpu_cache_has_invalidate_memregion(void)
{
	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
		return false;

	return true;
}
EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM);
/*
 * Invalidate CPU caches by calling wbinvd_on_all_cpus().
 *
 * @res_desc: accepted to satisfy the interface; this implementation does
 *            not consult it.
 *
 * Returns 0 after the flush was issued, or -ENXIO (after a one-time
 * warning) when cpu_cache_has_invalidate_memregion() reports false.
 */
int cpu_cache_invalidate_memregion(int res_desc)
{
	if (!WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) {
		wbinvd_on_all_cpus();
		return 0;
	}

	/* Facility unavailable: nothing was flushed. */
	return -ENXIO;
}
EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM);
#endif
static void __cpa_flush_all(void *arg) static void __cpa_flush_all(void *arg)
{ {
unsigned long cache = (unsigned long)arg; unsigned long cache = (unsigned long)arg;
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include <linux/libnvdimm.h> #include <linux/libnvdimm.h>
#include <linux/ndctl.h> #include <linux/ndctl.h>
#include <linux/acpi.h> #include <linux/acpi.h>
#include <linux/memregion.h>
#include <asm/smp.h> #include <asm/smp.h>
#include "intel.h" #include "intel.h"
#include "nfit.h" #include "nfit.h"
...@@ -190,8 +191,6 @@ static int intel_security_change_key(struct nvdimm *nvdimm, ...@@ -190,8 +191,6 @@ static int intel_security_change_key(struct nvdimm *nvdimm,
} }
} }
static void nvdimm_invalidate_cache(void);
static int __maybe_unused intel_security_unlock(struct nvdimm *nvdimm, static int __maybe_unused intel_security_unlock(struct nvdimm *nvdimm,
const struct nvdimm_key_data *key_data) const struct nvdimm_key_data *key_data)
{ {
...@@ -213,6 +212,9 @@ static int __maybe_unused intel_security_unlock(struct nvdimm *nvdimm, ...@@ -213,6 +212,9 @@ static int __maybe_unused intel_security_unlock(struct nvdimm *nvdimm,
if (!test_bit(NVDIMM_INTEL_UNLOCK_UNIT, &nfit_mem->dsm_mask)) if (!test_bit(NVDIMM_INTEL_UNLOCK_UNIT, &nfit_mem->dsm_mask))
return -ENOTTY; return -ENOTTY;
if (!cpu_cache_has_invalidate_memregion())
return -EINVAL;
memcpy(nd_cmd.cmd.passphrase, key_data->data, memcpy(nd_cmd.cmd.passphrase, key_data->data,
sizeof(nd_cmd.cmd.passphrase)); sizeof(nd_cmd.cmd.passphrase));
rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL); rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL);
...@@ -228,7 +230,7 @@ static int __maybe_unused intel_security_unlock(struct nvdimm *nvdimm, ...@@ -228,7 +230,7 @@ static int __maybe_unused intel_security_unlock(struct nvdimm *nvdimm,
} }
/* DIMM unlocked, invalidate all CPU caches before we read it */ /* DIMM unlocked, invalidate all CPU caches before we read it */
nvdimm_invalidate_cache(); cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY);
return 0; return 0;
} }
...@@ -297,8 +299,11 @@ static int __maybe_unused intel_security_erase(struct nvdimm *nvdimm, ...@@ -297,8 +299,11 @@ static int __maybe_unused intel_security_erase(struct nvdimm *nvdimm,
if (!test_bit(cmd, &nfit_mem->dsm_mask)) if (!test_bit(cmd, &nfit_mem->dsm_mask))
return -ENOTTY; return -ENOTTY;
if (!cpu_cache_has_invalidate_memregion())
return -EINVAL;
/* flush all cache before we erase DIMM */ /* flush all cache before we erase DIMM */
nvdimm_invalidate_cache(); cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY);
memcpy(nd_cmd.cmd.passphrase, key->data, memcpy(nd_cmd.cmd.passphrase, key->data,
sizeof(nd_cmd.cmd.passphrase)); sizeof(nd_cmd.cmd.passphrase));
rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL); rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL);
...@@ -318,7 +323,7 @@ static int __maybe_unused intel_security_erase(struct nvdimm *nvdimm, ...@@ -318,7 +323,7 @@ static int __maybe_unused intel_security_erase(struct nvdimm *nvdimm,
} }
/* DIMM erased, invalidate all CPU caches before we read it */ /* DIMM erased, invalidate all CPU caches before we read it */
nvdimm_invalidate_cache(); cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY);
return 0; return 0;
} }
...@@ -341,6 +346,9 @@ static int __maybe_unused intel_security_query_overwrite(struct nvdimm *nvdimm) ...@@ -341,6 +346,9 @@ static int __maybe_unused intel_security_query_overwrite(struct nvdimm *nvdimm)
if (!test_bit(NVDIMM_INTEL_QUERY_OVERWRITE, &nfit_mem->dsm_mask)) if (!test_bit(NVDIMM_INTEL_QUERY_OVERWRITE, &nfit_mem->dsm_mask))
return -ENOTTY; return -ENOTTY;
if (!cpu_cache_has_invalidate_memregion())
return -EINVAL;
rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL); rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL);
if (rc < 0) if (rc < 0)
return rc; return rc;
...@@ -355,7 +363,7 @@ static int __maybe_unused intel_security_query_overwrite(struct nvdimm *nvdimm) ...@@ -355,7 +363,7 @@ static int __maybe_unused intel_security_query_overwrite(struct nvdimm *nvdimm)
} }
/* flush all cache before we make the nvdimms available */ /* flush all cache before we make the nvdimms available */
nvdimm_invalidate_cache(); cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY);
return 0; return 0;
} }
...@@ -380,8 +388,11 @@ static int __maybe_unused intel_security_overwrite(struct nvdimm *nvdimm, ...@@ -380,8 +388,11 @@ static int __maybe_unused intel_security_overwrite(struct nvdimm *nvdimm,
if (!test_bit(NVDIMM_INTEL_OVERWRITE, &nfit_mem->dsm_mask)) if (!test_bit(NVDIMM_INTEL_OVERWRITE, &nfit_mem->dsm_mask))
return -ENOTTY; return -ENOTTY;
if (!cpu_cache_has_invalidate_memregion())
return -EINVAL;
/* flush all cache before we erase DIMM */ /* flush all cache before we erase DIMM */
nvdimm_invalidate_cache(); cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY);
memcpy(nd_cmd.cmd.passphrase, nkey->data, memcpy(nd_cmd.cmd.passphrase, nkey->data,
sizeof(nd_cmd.cmd.passphrase)); sizeof(nd_cmd.cmd.passphrase));
rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL); rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL);
...@@ -401,22 +412,6 @@ static int __maybe_unused intel_security_overwrite(struct nvdimm *nvdimm, ...@@ -401,22 +412,6 @@ static int __maybe_unused intel_security_overwrite(struct nvdimm *nvdimm,
} }
} }
/*
* TODO: define a cross arch wbinvd equivalent when/if
* NVDIMM_FAMILY_INTEL command support arrives on another arch.
*/
#ifdef CONFIG_X86
/* x86 variant: invalidate caches by calling wbinvd_on_all_cpus(). */
static void nvdimm_invalidate_cache(void)
{
wbinvd_on_all_cpus();
}
#else
/*
 * Non-x86 variant: performs no cache maintenance and only raises a
 * one-time warning.
 */
static void nvdimm_invalidate_cache(void)
{
/* The string literal is a non-NULL pointer, so the condition is always true. */
WARN_ON_ONCE("cache invalidation required after unlock\n");
}
#endif
static const struct nvdimm_security_ops __intel_security_ops = { static const struct nvdimm_security_ops __intel_security_ops = {
.get_flags = intel_security_flags, .get_flags = intel_security_flags,
.freeze = intel_security_freeze, .freeze = intel_security_freeze,
...@@ -775,3 +770,5 @@ static const struct nvdimm_fw_ops __intel_fw_ops = { ...@@ -775,3 +770,5 @@ static const struct nvdimm_fw_ops __intel_fw_ops = {
}; };
const struct nvdimm_fw_ops *intel_fw_ops = &__intel_fw_ops; const struct nvdimm_fw_ops *intel_fw_ops = &__intel_fw_ops;
MODULE_IMPORT_NS(DEVMEM);
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#define _MEMREGION_H_ #define _MEMREGION_H_
#include <linux/types.h> #include <linux/types.h>
#include <linux/errno.h> #include <linux/errno.h>
#include <linux/bug.h>
struct memregion_info { struct memregion_info {
int target_node; int target_node;
...@@ -20,4 +21,41 @@ static inline void memregion_free(int id) ...@@ -20,4 +21,41 @@ static inline void memregion_free(int id)
{ {
} }
#endif #endif
/**
* cpu_cache_invalidate_memregion - drop any CPU cached data for
* memregions described by @res_desc
* @res_desc: one of the IORES_DESC_* types
*
* Perform cache maintenance after a memory event / operation that
* changes the contents of physical memory in a cache-incoherent manner.
* For example, device memory technologies like NVDIMM and CXL have
* device secure erase, and dynamic region provision that can replace
* the memory mapped to a given physical address.
*
* Limit the functionality to architectures that have an efficient way
* to writeback and invalidate potentially terabytes of address space at
* once. Note that this routine may or may not write back any dirty
* contents while performing the invalidation. It is only exported for
* the explicit usage of the NVDIMM and CXL modules in the 'DEVMEM'
* symbol namespace on bare metal platforms.
*
* Returns 0 on success or negative error code on a failure to perform
* the cache maintenance.
*/
#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
int cpu_cache_invalidate_memregion(int res_desc);
bool cpu_cache_has_invalidate_memregion(void);
#else
/*
 * Fallback used when CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION is not
 * defined: always reports that the invalidation facility is unavailable.
 */
static inline bool cpu_cache_has_invalidate_memregion(void)
{
return false;
}
/*
 * Fallback used when CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION is not
 * defined: raises a one-time warning and fails with -ENXIO without
 * performing any cache maintenance. @res_desc is ignored.
 */
static inline int cpu_cache_invalidate_memregion(int res_desc)
{
/* The string literal is a non-NULL pointer, so the condition is always true. */
WARN_ON_ONCE("CPU cache invalidation required");
return -ENXIO;
}
#endif
#endif /* _MEMREGION_H_ */ #endif /* _MEMREGION_H_ */
...@@ -672,6 +672,9 @@ config ARCH_HAS_PMEM_API ...@@ -672,6 +672,9 @@ config ARCH_HAS_PMEM_API
config MEMREGION config MEMREGION
bool bool
config ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
bool
config ARCH_HAS_MEMREMAP_COMPAT_ALIGN config ARCH_HAS_MEMREMAP_COMPAT_ALIGN
bool bool
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment