Commit 13c76ad8 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Enable full ASLR randomization for 32-bit programs (Hector
     Marco-Gisbert)

   - Add initial minimal INVPCI support, to flush global mappings (Andy
     Lutomirski)

   - Add KASAN enhancements (Andrey Ryabinin)

   - Fix mmiotrace for huge pages (Karol Herbst)

   - ... misc cleanups and small enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm/32: Enable full randomization on i386 and X86_32
  x86/mm/kmmio: Fix mmiotrace for hugepages
  x86/mm: Avoid premature success when changing page attributes
  x86/mm/ptdump: Remove paravirt_enabled()
  x86/mm: Fix INVPCID asm constraint
  x86/dmi: Switch dmi_remap() from ioremap() [uncached] to ioremap_cache()
  x86/mm: If INVPCID is available, use it to flush global mappings
  x86/mm: Add a 'noinvpcid' boot option to turn off INVPCID
  x86/mm: Add INVPCID helpers
  x86/kasan: Write protect kasan zero shadow
  x86/kasan: Clear kasan_zero_page after TLB flush
  x86/mm/numa: Check for failures in numa_clear_kernel_node_hotplug()
  x86/mm/numa: Clean up numa_clear_kernel_node_hotplug()
  x86/mm: Make kmap_prot into a #define
  x86/mm/32: Set NX in __supported_pte_mask before enabling paging
  x86/mm: Streamline and restore probe_memory_block_size()
parents 9cf8d636 8b8addf8
......@@ -2566,6 +2566,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nointroute [IA-64]
noinvpcid [X86] Disable the INVPCID cpu feature.
nojitter [IA-64] Disables jitter checking for ITC timers.
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
......
......@@ -15,7 +15,7 @@ static __always_inline __init void *dmi_alloc(unsigned len)
/* Use early IO mappings for DMI because it's initialized early */
#define dmi_early_remap early_ioremap
#define dmi_early_unmap early_iounmap
#define dmi_remap ioremap
#define dmi_remap ioremap_cache
#define dmi_unmap iounmap
#endif /* _ASM_X86_DMI_H */
......@@ -138,7 +138,7 @@ extern void reserve_top_address(unsigned long reserve);
extern int fixmaps_set;
extern pte_t *kmap_pte;
extern pgprot_t kmap_prot;
#define kmap_prot PAGE_KERNEL
extern pte_t *pkmap_page_table;
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
......
......@@ -8,6 +8,54 @@
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
static inline void __invpcid(unsigned long pcid, unsigned long addr,
unsigned long type)
{
struct { u64 d[2]; } desc = { { pcid, addr } };
/*
* The memory clobber is because the whole point is to invalidate
* stale TLB entries and, especially if we're flushing global
* mappings, we don't want the compiler to reorder any subsequent
* memory accesses before the TLB flush.
*
* The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
* invpcid (%rcx), %rax in long mode.
*/
asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
: : "m" (desc), "a" (type), "c" (&desc) : "memory");
}
#define INVPCID_TYPE_INDIV_ADDR 0
#define INVPCID_TYPE_SINGLE_CTXT 1
#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
#define INVPCID_TYPE_ALL_NON_GLOBAL 3
/* Flush all mappings for a given pcid and addr, not including globals. */
static inline void invpcid_flush_one(unsigned long pcid,
unsigned long addr)
{
__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
}
/* Flush all mappings for a given PCID, not including globals. */
static inline void invpcid_flush_single_context(unsigned long pcid)
{
__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
}
/* Flush all mappings, including globals, for all PCIDs. */
static inline void invpcid_flush_all(void)
{
__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
}
/* Flush all mappings for all PCIDs except globals. */
static inline void invpcid_flush_all_nonglobals(void)
{
__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
}
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
......@@ -105,6 +153,15 @@ static inline void __native_flush_tlb_global(void)
{
unsigned long flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
*/
invpcid_flush_all();
return;
}
/*
* Read-modify-write to CR4 - protect it from preemption and
* from interrupts. (Use the raw variant because this code can
......
......@@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s)
}
__setup("nompx", x86_mpx_setup);
static int __init x86_noinvpcid_setup(char *s)
{
/* noinvpcid doesn't accept parameters */
if (s)
return -EINVAL;
/* do not emit a message if the feature is not present */
if (!boot_cpu_has(X86_FEATURE_INVPCID))
return 0;
setup_clear_cpu_cap(X86_FEATURE_INVPCID);
pr_info("noinvpcid: INVPCID feature disabled\n");
return 0;
}
early_param("noinvpcid", x86_noinvpcid_setup);
#ifdef CONFIG_X86_32
static int cachesize_override = -1;
static int disable_x86_serial_nr = 1;
......
......@@ -389,6 +389,12 @@ default_entry:
/* Make changes effective */
wrmsr
/*
* And make sure that all the mappings we set up have NX set from
* the beginning.
*/
orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4)
enable_paging:
/*
......
......@@ -358,20 +358,19 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#endif
#ifdef CONFIG_X86_64
static inline bool is_hypervisor_range(int idx)
{
#ifdef CONFIG_X86_64
/*
* ffff800000000000 - ffff87ffffffffff is reserved for
* the hypervisor.
*/
return paravirt_enabled() &&
(idx >= pgd_index(__PAGE_OFFSET) - 16) &&
return (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
(idx < pgd_index(__PAGE_OFFSET));
}
#else
static inline bool is_hypervisor_range(int idx) { return false; }
return false;
#endif
}
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx)
......
......@@ -388,7 +388,6 @@ kernel_physical_mapping_init(unsigned long start,
}
pte_t *kmap_pte;
pgprot_t kmap_prot;
static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
{
......@@ -405,8 +404,6 @@ static void __init kmap_init(void)
*/
kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
kmap_prot = PAGE_KERNEL;
}
#ifdef CONFIG_HIGHMEM
......
......@@ -53,6 +53,7 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>
#include "mm_internal.h"
......@@ -1203,26 +1204,13 @@ int kern_addr_valid(unsigned long addr)
static unsigned long probe_memory_block_size(void)
{
/* start from 2g */
unsigned long bz = 1UL<<31;
unsigned long bz = MIN_MEMORY_BLOCK_SIZE;
if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
pr_info("Using 2GB memory block size for large-memory system\n");
return 2UL * 1024 * 1024 * 1024;
}
/* less than 64g installed */
if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
return MIN_MEMORY_BLOCK_SIZE;
/* get the tail size */
while (bz > MIN_MEMORY_BLOCK_SIZE) {
if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
break;
bz >>= 1;
}
/* if system is UV or has 64GB of RAM or more, use large blocks */
if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30)))
bz = 2UL << 30; /* 2GB */
printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
return bz;
}
......
......@@ -120,11 +120,22 @@ void __init kasan_init(void)
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
memset(kasan_zero_page, 0, PAGE_SIZE);
load_cr3(init_level4_pgt);
__flush_tlb_all();
init_task.kasan_depth = 0;
/*
* kasan_zero_page has been used as early shadow memory, thus it may
* contain some garbage. Now we can clear and write protect it, since
* after the TLB flush no one should write to it.
*/
memset(kasan_zero_page, 0, PAGE_SIZE);
for (i = 0; i < PTRS_PER_PTE; i++) {
pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
set_pte(&kasan_zero_pte[i], pte);
}
/* Flush TLBs again to be sure that write protection applied. */
__flush_tlb_all();
init_task.kasan_depth = 0;
pr_info("KernelAddressSanitizer initialized\n");
}
......@@ -33,7 +33,7 @@
struct kmmio_fault_page {
struct list_head list;
struct kmmio_fault_page *release_next;
unsigned long page; /* location of the fault page */
unsigned long addr; /* the requested address */
pteval_t old_presence; /* page presence prior to arming */
bool armed;
......@@ -70,9 +70,16 @@ unsigned int kmmio_count;
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
static LIST_HEAD(kmmio_probes);
static struct list_head *kmmio_page_list(unsigned long page)
static struct list_head *kmmio_page_list(unsigned long addr)
{
return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
unsigned int l;
pte_t *pte = lookup_address(addr, &l);
if (!pte)
return NULL;
addr &= page_level_mask(l);
return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
}
/* Accessed per-cpu */
......@@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
}
/* You must be holding RCU read lock. */
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
{
struct list_head *head;
struct kmmio_fault_page *f;
unsigned int l;
pte_t *pte = lookup_address(addr, &l);
page &= PAGE_MASK;
head = kmmio_page_list(page);
if (!pte)
return NULL;
addr &= page_level_mask(l);
head = kmmio_page_list(addr);
list_for_each_entry_rcu(f, head, list) {
if (f->page == page)
if (f->addr == addr)
return f;
}
return NULL;
......@@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
unsigned int level;
pte_t *pte = lookup_address(f->page, &level);
pte_t *pte = lookup_address(f->addr, &level);
if (!pte) {
pr_err("no pte for page 0x%08lx\n", f->page);
pr_err("no pte for addr 0x%08lx\n", f->addr);
return -1;
}
......@@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
return -1;
}
__flush_tlb_one(f->page);
__flush_tlb_one(f->addr);
return 0;
}
......@@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
int ret;
WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
if (f->armed) {
pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
f->page, f->count, !!f->old_presence);
pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n",
f->addr, f->count, !!f->old_presence);
}
ret = clear_page_presence(f, true);
WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
f->page);
WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
f->addr);
f->armed = true;
return ret;
}
......@@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
int ret = clear_page_presence(f, false);
WARN_ONCE(ret < 0,
KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
f->armed = false;
}
......@@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
struct kmmio_context *ctx;
struct kmmio_fault_page *faultpage;
int ret = 0; /* default to fault not handled */
unsigned long page_base = addr;
unsigned int l;
pte_t *pte = lookup_address(addr, &l);
if (!pte)
return -EINVAL;
page_base &= page_level_mask(l);
/*
* Preemption is now disabled to prevent process switch during
......@@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
preempt_disable();
rcu_read_lock();
faultpage = get_kmmio_fault_page(addr);
faultpage = get_kmmio_fault_page(page_base);
if (!faultpage) {
/*
* Either this page fault is not caused by kmmio, or
......@@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
ctx = &get_cpu_var(kmmio_ctx);
if (ctx->active) {
if (addr == ctx->addr) {
if (page_base == ctx->addr) {
/*
* A second fault on the same page means some other
* condition needs handling by do_page_fault(), the
......@@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
ctx->active++;
ctx->fpage = faultpage;
ctx->probe = get_kmmio_probe(addr);
ctx->probe = get_kmmio_probe(page_base);
ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
ctx->addr = addr;
ctx->addr = page_base;
if (ctx->probe && ctx->probe->pre_handler)
ctx->probe->pre_handler(ctx->probe, regs, addr);
......@@ -354,12 +371,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
}
/* You must be holding kmmio_lock. */
static int add_kmmio_fault_page(unsigned long page)
static int add_kmmio_fault_page(unsigned long addr)
{
struct kmmio_fault_page *f;
page &= PAGE_MASK;
f = get_kmmio_fault_page(page);
f = get_kmmio_fault_page(addr);
if (f) {
if (!f->count)
arm_kmmio_fault_page(f);
......@@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page)
return -1;
f->count = 1;
f->page = page;
f->addr = addr;
if (arm_kmmio_fault_page(f)) {
kfree(f);
return -1;
}
list_add_rcu(&f->list, kmmio_page_list(f->page));
list_add_rcu(&f->list, kmmio_page_list(f->addr));
return 0;
}
/* You must be holding kmmio_lock. */
static void release_kmmio_fault_page(unsigned long page,
static void release_kmmio_fault_page(unsigned long addr,
struct kmmio_fault_page **release_list)
{
struct kmmio_fault_page *f;
page &= PAGE_MASK;
f = get_kmmio_fault_page(page);
f = get_kmmio_fault_page(addr);
if (!f)
return;
......@@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p)
int ret = 0;
unsigned long size = 0;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
unsigned int l;
pte_t *pte;
spin_lock_irqsave(&kmmio_lock, flags);
if (get_kmmio_probe(p->addr)) {
ret = -EEXIST;
goto out;
}
pte = lookup_address(p->addr, &l);
if (!pte) {
ret = -EINVAL;
goto out;
}
kmmio_count++;
list_add_rcu(&p->list, &kmmio_probes);
while (size < size_lim) {
if (add_kmmio_fault_page(p->addr + size))
pr_err("Unable to set page fault.\n");
size += PAGE_SIZE;
size += page_level_size(l);
}
out:
spin_unlock_irqrestore(&kmmio_lock, flags);
......@@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
struct kmmio_fault_page *release_list = NULL;
struct kmmio_delayed_release *drelease;
unsigned int l;
pte_t *pte;
pte = lookup_address(p->addr, &l);
if (!pte)
return;
spin_lock_irqsave(&kmmio_lock, flags);
while (size < size_lim) {
release_kmmio_fault_page(p->addr + size, &release_list);
size += PAGE_SIZE;
size += page_level_size(l);
}
list_del_rcu(&p->list);
kmmio_count--;
......
......@@ -93,18 +93,6 @@ static unsigned long mmap_base(unsigned long rnd)
return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}
/*
* Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
* does, but not when emulating X86_32
*/
static unsigned long mmap_legacy_base(unsigned long rnd)
{
if (mmap_is_ia32())
return TASK_UNMAPPED_BASE;
else
return TASK_UNMAPPED_BASE + rnd;
}
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
......@@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (current->flags & PF_RANDOMIZE)
random_factor = arch_mmap_rnd();
mm->mmap_legacy_base = mmap_legacy_base(random_factor);
mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor;
if (mmap_is_legacy()) {
mm->mmap_base = mm->mmap_legacy_base;
......
......@@ -465,46 +465,67 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
return true;
}
/*
* Mark all currently memblock-reserved physical memory (which covers the
* kernel's own memory ranges) as hot-unswappable.
*/
static void __init numa_clear_kernel_node_hotplug(void)
{
int i, nid;
nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
phys_addr_t start, end;
struct memblock_region *r;
nodemask_t reserved_nodemask = NODE_MASK_NONE;
struct memblock_region *mb_region;
int i;
/*
* We have to do some preprocessing of memblock regions, to
* make them suitable for reservation.
*
* At this time, all memory regions reserved by memblock are
* used by the kernel. Set the nid in memblock.reserved will
* mark out all the nodes the kernel resides in.
* used by the kernel, but those regions are not split up
* along node boundaries yet, and don't necessarily have their
* node ID set yet either.
*
* So iterate over all memory known to the x86 architecture,
* and use those ranges to set the nid in memblock.reserved.
* This will split up the memblock regions along node
* boundaries and will set the node IDs as well.
*/
for (i = 0; i < numa_meminfo.nr_blks; i++) {
struct numa_memblk *mb = &numa_meminfo.blk[i];
struct numa_memblk *mb = numa_meminfo.blk + i;
int ret;
memblock_set_node(mb->start, mb->end - mb->start,
&memblock.reserved, mb->nid);
ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
WARN_ON_ONCE(ret);
}
/*
* Mark all kernel nodes.
* Now go over all reserved memblock regions, to construct a
* node mask of all kernel reserved memory areas.
*
* When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo
* may not include all the memblock.reserved memory ranges because
* trim_snb_memory() reserves specific pages for Sandy Bridge graphics.
* [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
* numa_meminfo might not include all memblock.reserved
* memory ranges, because quirks such as trim_snb_memory()
* reserve specific pages for Sandy Bridge graphics. ]
*/
for_each_memblock(reserved, r)
if (r->nid != MAX_NUMNODES)
node_set(r->nid, numa_kernel_nodes);
for_each_memblock(reserved, mb_region) {
if (mb_region->nid != MAX_NUMNODES)
node_set(mb_region->nid, reserved_nodemask);
}
/* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
/*
* Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
* belonging to the reserved node mask.
*
* Note that this will include memory regions that reside
* on nodes that contain kernel memory - entire nodes
* become hot-unpluggable:
*/
for (i = 0; i < numa_meminfo.nr_blks; i++) {
nid = numa_meminfo.blk[i].nid;
if (!node_isset(nid, numa_kernel_nodes))
continue;
struct numa_memblk *mb = numa_meminfo.blk + i;
start = numa_meminfo.blk[i].start;
end = numa_meminfo.blk[i].end;
if (!node_isset(mb->nid, reserved_nodemask))
continue;
memblock_clear_hotplug(start, end - start);
memblock_clear_hotplug(mb->start, mb->end - mb->start);
}
}
......
......@@ -1128,8 +1128,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
/*
* Ignore all non primary paths.
*/
if (!primary)
if (!primary) {
cpa->numpages = 1;
return 0;
}
/*
* Ignore the NULL PTE for kernel identity mapping, as it is expected
......
......@@ -32,9 +32,8 @@ early_param("noexec", noexec_setup);
void x86_configure_nx(void)
{
if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx)
__supported_pte_mask |= _PAGE_NX;
else
/* If disable_nx is set, clear NX on all new mappings going forward. */
if (disable_nx)
__supported_pte_mask &= ~_PAGE_NX;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment