Commit 99792e0c authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "Lots of changes in this cycle:

   - Lots of CPA (change page attribute) optimizations and related
     cleanups (Thomas Gleixner, Peter Zijstra)

   - Make lazy TLB mode even lazier (Rik van Riel)

   - Fault handler cleanups and improvements (Dave Hansen)

   - kdump, vmcore: Enable kdumping encrypted memory with AMD SME
     enabled (Lianbo Jiang)

   - Clean up VM layout documentation (Baoquan He, Ingo Molnar)

   - ... plus misc other fixes and enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
  x86/mm: Kill stray kernel fault handling comment
  x86/mm: Do not warn about PCI BIOS W+X mappings
  resource: Clean it up a bit
  resource: Fix find_next_iomem_res() iteration issue
  resource: Include resource end in walk_*() interfaces
  x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
  x86/mm: Remove spurious fault pkey check
  x86/mm/vsyscall: Consider vsyscall page part of user address space
  x86/mm: Add vsyscall address helper
  x86/mm: Fix exception table comments
  x86/mm: Add clarifying comments for user addr space
  x86/mm: Break out user address space handling
  x86/mm: Break out kernel address space handling
  x86/mm: Clarify hardware vs. software "error_code"
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Add freed_tables element to flush_tlb_info
  x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
  smp,cpumask: introduce on_each_cpu_cond_mask
  smp: use __cpumask_set_cpu in on_each_cpu_cond
  ...
parents 382d72a9 977e4be5
====================================================
Complete virtual memory map with 4-level page tables
====================================================
Virtual memory map with 4 level page tables:
0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
hole caused by [47:63] sign extension
ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
vaddr_end for KASLR
fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Virtual memory map with 5 level page tables:
0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
hole caused by [56:63] sign extension
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
... unused hole ...
vaddr_end for KASLR
fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
... unused hole ...
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Notes:
- Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down
from the top of the 64-bit address space. It's easier to understand the layout
when seen both in absolute addresses and in distance-from-top notation.
For example 0xffffe90000000000 == -23 TB, it's 23 TB lower than the top of the
64-bit address space (ffffffffffffffff).
Note that as we get closer to the top of the address space, the notation changes
from TB to GB and then MB/KB.
- "16M TB" might look weird at first sight, but it's an easier to visualize size
notation than "16 EB", which few will recognize at first sight as 16 exabytes.
It also shows it nicely how incredibly large 64-bit address space is.
========================================================================================================================
Start addr | Offset | End addr | Size | VM area description
========================================================================================================================
| | | |
0000000000000000 | 0 | 00007fffffffffff | 128 TB | user-space virtual memory, different per mm
__________________|____________|__________________|_________|___________________________________________________________
| | | |
0000800000000000 | +128 TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
| | | | virtual memory addresses up to the -128 TB
| | | | starting offset of kernel mappings.
__________________|____________|__________________|_________|___________________________________________________________
|
| Kernel-space virtual memory, shared between all processes:
____________________________________________________________|___________________________________________________________
| | | |
ffff800000000000 | -128 TB | ffff87ffffffffff | 8 TB | ... guard hole, also reserved for hypervisor
ffff880000000000 | -120 TB | ffffc7ffffffffff | 64 TB | direct mapping of all physical memory (page_offset_base)
ffffc80000000000 | -56 TB | ffffc8ffffffffff | 1 TB | ... unused hole
ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap space (vmalloc_base)
ffffe90000000000 | -23 TB | ffffe9ffffffffff | 1 TB | ... unused hole
ffffea0000000000 | -22 TB | ffffeaffffffffff | 1 TB | virtual memory map (vmemmap_base)
ffffeb0000000000 | -21 TB | ffffebffffffffff | 1 TB | ... unused hole
ffffec0000000000 | -20 TB | fffffbffffffffff | 16 TB | KASAN shadow memory
fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
| | | | vaddr_end for KASLR
fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | LDT remap for PTI
ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
__________________|____________|__________________|_________|____________________________________________________________
|
| Identical layout to the 47-bit one from here on:
____________________________________________________________|____________________________________________________________
| | | |
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
ffffffff80000000 |-2048 MB | | |
ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
ffffffffff000000 | -16 MB | | |
FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI
ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole
__________________|____________|__________________|_________|___________________________________________________________
====================================================
Complete virtual memory map with 5-level page tables
====================================================
Notes:
- With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting
offset and many of the regions expand to support the much larger physical
memory supported.
========================================================================================================================
Start addr | Offset | End addr | Size | VM area description
========================================================================================================================
| | | |
0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm
__________________|____________|__________________|_________|___________________________________________________________
| | | |
0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
| | | | virtual memory addresses up to the -128 TB
| | | | starting offset of kernel mappings.
__________________|____________|__________________|_________|___________________________________________________________
|
| Kernel-space virtual memory, shared between all processes:
____________________________________________________________|___________________________________________________________
| | | |
ff00000000000000 | -64 PB | ff0fffffffffffff | 4 PB | ... guard hole, also reserved for hypervisor
ff10000000000000 | -60 PB | ff8fffffffffffff | 32 PB | direct mapping of all physical memory (page_offset_base)
ff90000000000000 | -28 PB | ff9fffffffffffff | 4 PB | LDT remap for PTI
ffa0000000000000 | -24 PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base)
ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole
ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base)
ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole
ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory
fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
| | | | vaddr_end for KASLR
fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole
ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
__________________|____________|__________________|_________|____________________________________________________________
|
| Identical layout to the 47-bit one from here on:
____________________________________________________________|____________________________________________________________
| | | |
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
ffffffff80000000 |-2048 MB | | |
ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
ffffffffff000000 | -16 MB | | |
FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI
ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole
__________________|____________|__________________|_________|___________________________________________________________
Architecture defines a 64-bit virtual address. Implementations can support
less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
......
......@@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES
supports them), so don't confuse the user by printing
that we have them enabled.
config X86_CPA_STATISTICS
bool "Enable statistic for Change Page Attribute"
depends on DEBUG_FS
---help---
Expose statistics about the Change Page Attribute mechanims, which
helps to determine the effectivness of preserving large and huge
page mappings when mapping protections are changed.
config ARCH_HAS_MEM_ENCRYPT
def_bool y
......
......@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size)
#define ioremap_nocache ioremap_nocache
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
#define ioremap_uc ioremap_uc
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
#define ioremap_cache ioremap_cache
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
#define ioremap_prot ioremap_prot
extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
#define ioremap_encrypted ioremap_encrypted
/**
* ioremap - map bus memory into CPU space
......
......@@ -67,7 +67,7 @@ struct kimage;
/* Memory to backup during crash kdump */
#define KEXEC_BACKUP_SRC_START (0UL)
#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */
#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */
/*
* CPU does not save ss and sp on stack if execution is already
......
......@@ -59,13 +59,16 @@
#endif
/*
* Kernel image size is limited to 1GiB due to the fixmap living in the
* next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
* 512MiB by default, leaving 1.5GiB for modules once the page tables
* are fully set up. If kernel ASLR is configured, it can extend the
* kernel page table mapping, reducing the size of the modules area.
* Maximum kernel image size is limited to 1 GiB, due to the fixmap living
* in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
*
* On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
* page tables are fully set up.
*
* If KASLR is disabled we can shrink it to 0.5 GiB and increase the size
* of the modules area to 1.5 GiB.
*/
#if defined(CONFIG_RANDOMIZE_BASE)
#ifdef CONFIG_RANDOMIZE_BASE
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
#else
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
......
......@@ -6,16 +6,23 @@
#define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#define tlb_flush(tlb) \
{ \
if (!tlb->fullmm && !tlb->need_flush_all) \
flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
else \
flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
}
static inline void tlb_flush(struct mmu_gather *tlb);
#include <asm-generic/tlb.h>
static inline void tlb_flush(struct mmu_gather *tlb)
{
unsigned long start = 0UL, end = TLB_FLUSH_ALL;
unsigned int stride_shift = tlb_get_unmap_shift(tlb);
if (!tlb->fullmm && !tlb->need_flush_all) {
start = tlb->start;
end = tlb->end;
}
flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
}
/*
* While x86 architecture in general requires an IPI to perform TLB
* shootdown, enablement code for several hypervisors overrides
......
......@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif
static inline bool tlb_defer_switch_to_init_mm(void)
{
/*
* If we have PCID, then switching to init_mm is reasonably
* fast. If we don't have PCID, then switching to init_mm is
* quite slow, so we try to defer it in the hopes that we can
* avoid it entirely. The latter approach runs the risk of
* receiving otherwise unnecessary IPIs.
*
* This choice is just a heuristic. The tlb code can handle this
* function returning true or false regardless of whether we have
* PCID.
*/
return !static_cpu_has(X86_FEATURE_PCID);
}
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
......@@ -547,23 +531,30 @@ struct flush_tlb_info {
unsigned long start;
unsigned long end;
u64 new_tlb_gen;
unsigned int stride_shift;
bool freed_tables;
};
#define local_flush_tlb() __flush_tlb()
#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
#define flush_tlb_mm(mm) \
flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
#define flush_tlb_range(vma, start, end) \
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
#define flush_tlb_range(vma, start, end) \
flush_tlb_mm_range((vma)->vm_mm, start, end, \
((vma)->vm_flags & VM_HUGETLB) \
? huge_page_shift(hstate_vma(vma)) \
: PAGE_SHIFT, false)
extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
unsigned long end, unsigned int stride_shift,
bool freed_tables);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
......
......@@ -11,40 +11,62 @@
#include <linux/uaccess.h>
#include <linux/io.h>
/**
* copy_oldmem_page - copy one page from "oldmem"
* @pfn: page frame number to be copied
* @buf: target memory address for the copy; this can be in kernel address
* space or user address space (see @userbuf)
* @csize: number of bytes to copy
* @offset: offset in bytes into the page (based on pfn) to begin the copy
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
* Copy a page from "oldmem". For this page, there is no pte mapped
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
size_t csize, unsigned long offset, int userbuf)
static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf,
bool encrypted)
{
void *vaddr;
if (!csize)
return 0;
vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (encrypted)
vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
else
vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (!vaddr)
return -ENOMEM;
if (userbuf) {
if (copy_to_user(buf, vaddr + offset, csize)) {
iounmap(vaddr);
if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
iounmap((void __iomem *)vaddr);
return -EFAULT;
}
} else
memcpy(buf, vaddr + offset, csize);
set_iounmap_nonlazy();
iounmap(vaddr);
iounmap((void __iomem *)vaddr);
return csize;
}
/**
* copy_oldmem_page - copy one page of memory
* @pfn: page frame number to be copied
* @buf: target memory address for the copy; this can be in kernel address
* space or user address space (see @userbuf)
* @csize: number of bytes to copy
* @offset: offset in bytes into the page (based on pfn) to begin the copy
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
* Copy a page from the old kernel's memory. For this page, there is no pte
* mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
}
/**
* copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
* memory with the encryption mask set to accomodate kdump on SME-enabled
* machines.
*/
ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
}
......@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
map_ldt_struct_to_user(mm);
va = (unsigned long)ldt_slot_va(slot);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
ldt->slot = slot;
return 0;
......
......@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pte_unmap_unlock(pte, ptl);
out:
up_write(&mm->mmap_sem);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
}
......
......@@ -19,7 +19,9 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/pci.h>
#include <asm/e820/types.h>
#include <asm/pgtable.h>
/*
......@@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u)
return (signed long)(u << shift) >> shift;
}
static void note_wx(struct pg_state *st)
{
unsigned long npages;
npages = (st->current_address - st->start_address) / PAGE_SIZE;
#ifdef CONFIG_PCI_BIOS
/*
* If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
* Inform about it, but avoid the warning.
*/
if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
st->current_address <= PAGE_OFFSET + BIOS_END) {
pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
return;
}
#endif
/* Account the WX pages */
st->wx_pages += npages;
WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
(void *)st->start_address);
}
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
......@@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
unsigned long delta;
int width = sizeof(unsigned long) * 2;
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
WARN_ONCE(1,
"x86/mm: Found insecure W+X mapping at address %p/%pS\n",
(void *)st->start_address,
(void *)st->start_address);
st->wx_pages += (st->current_address -
st->start_address) / PAGE_SIZE;
}
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
note_wx(st);
/*
* Now print the actual finished series
......
......@@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
show_opcodes(regs, loglvl);
}
/*
* The (legacy) vsyscall page is the long page in the kernel portion
* of the address space that has user-accessible permissions.
*/
static bool is_vsyscall_vaddr(unsigned long vaddr)
{
return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
}
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 *pkey, int si_code)
......@@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (is_errata100(regs, address))
return;
#ifdef CONFIG_X86_64
/*
* Instruction fetch faults in the vsyscall page might need
* emulation.
*/
if (unlikely((error_code & X86_PF_INSTR) &&
((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
}
#endif
/*
* To avoid leaking information about the kernel page table
* layout, pretend that user-mode accesses to kernel addresses
......@@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
}
}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
/*
* Note: We do not do lazy flushing on protection key
* changes, so no spurious fault will ever set X86_PF_PK.
*/
if ((error_code & X86_PF_PK))
return 1;
return 1;
}
......@@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* (Optional Invalidation).
*/
static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
......@@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
return 0;
if (p4d_large(*p4d))
return spurious_fault_check(error_code, (pte_t *) p4d);
return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
if (pud_large(*pud))
return spurious_fault_check(error_code, (pte_t *) pud);
return spurious_kernel_fault_check(error_code, (pte_t *) pud);
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
if (pmd_large(*pmd))
return spurious_fault_check(error_code, (pte_t *) pmd);
return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
pte = pte_offset_kernel(pmd, address);
if (!pte_present(*pte))
return 0;
ret = spurious_fault_check(error_code, pte);
ret = spurious_kernel_fault_check(error_code, pte);
if (!ret)
return 0;
......@@ -1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
* Make sure we have permissions in PMD.
* If not, then there's a bug in the page tables:
*/
ret = spurious_fault_check(error_code, (pte_t *) pmd);
ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
return ret;
}
NOKPROBE_SYMBOL(spurious_fault);
NOKPROBE_SYMBOL(spurious_kernel_fault);
int show_unhandled_signals = 1;
......@@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
static int fault_in_kernel_space(unsigned long address)
{
/*
* On 64-bit systems, the vsyscall page is at an address above
* TASK_SIZE_MAX, but is not considered part of the kernel
* address space.
*/
if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
return false;
return address >= TASK_SIZE_MAX;
}
......@@ -1214,31 +1213,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
}
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
* Called for all faults where 'address' is part of the kernel address
* space. Might get called for faults that originate from *code* that
* ran in userspace or the kernel.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
vm_fault_t fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
u32 pkey;
tsk = current;
mm = tsk->mm;
prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
return;
/*
* Protection keys exceptions only happen on user pages. We
* have no user pages in the kernel portion of the address
* space, so do not expect them here.
*/
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
/*
* We fault-in kernel-space virtual memory on-demand. The
* We can fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
......@@ -1246,41 +1237,74 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* only copy the information from the master page table,
* nothing more.
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
* Before doing this on-demand faulting, ensure that the
* fault is not any of the following:
* 1. A fault on a PTE with a reserved bit set.
* 2. A fault caused by a user-mode access. (Do not demand-
* fault kernel memory due to user-mode accesses).
* 3. A fault caused by a page-level protection violation.
* (A demand fault would be on a non-present page which
* would have X86_PF_PROT==0).
*/
if (unlikely(fault_in_kernel_space(address))) {
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}
/* Can handle a stale RO->RW TLB: */
if (spurious_fault(error_code, address))
if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}
/* kprobes don't want to hook the spurious faults: */
if (kprobes_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, error_code, address, NULL);
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
if (kprobes_fault(regs))
return;
}
/*
* Note, despite being a "bad area", there are quite a few
* acceptable reasons to get here, such as erratum fixups
* and handling kernel code that can fault, like get_user().
*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);
/* Handle faults in the user portion of the address space */
static inline
void do_user_addr_fault(struct pt_regs *regs,
unsigned long hw_error_code,
unsigned long address)
{
unsigned long sw_error_code;
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
vm_fault_t fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
u32 pkey;
tsk = current;
mm = tsk->mm;
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobes_fault(regs)))
return;
if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, error_code, address);
/*
* Reserved bits are never expected to be set on
* entries in the user portion of the page tables.
*/
if (unlikely(hw_error_code & X86_PF_RSVD))
pgtable_bad(regs, hw_error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
bad_area_nosemaphore(regs, error_code, address, NULL);
/*
* Check for invalid kernel (supervisor) access to user
* pages in the user address space.
*/
if (unlikely(smap_violation(hw_error_code, regs))) {
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return;
}
......@@ -1289,10 +1313,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return;
}
/*
* hw_error_code is literally the "page fault error code" passed to
* the kernel directly from the hardware. But, we will shortly be
* modifying it in software, so give it a new name.
*/
sw_error_code = hw_error_code;
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
......@@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
if (user_mode(regs)) {
local_irq_enable();
error_code |= X86_PF_USER;
/*
* Up to this point, X86_PF_USER set in hw_error_code
* indicated a user-mode access. But, after this,
* X86_PF_USER in sw_error_code will indicate either
* that, *or* an implicit kernel(supervisor)-mode access
* which originated from user mode.
*/
if (!(hw_error_code & X86_PF_USER)) {
/*
* The CPU was in user mode, but the CPU says
* the fault was not a user-mode access.
* Must be an implicit kernel-mode access,
* which we do not expect to happen in the
* user address space.
*/
pr_warn_once("kernel-mode error from user-mode: %lx\n",
hw_error_code);
sw_error_code |= X86_PF_USER;
}
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
......@@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (error_code & X86_PF_WRITE)
if (sw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
if (sw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
* the kernel and should generate an OOPS. Unfortunately, in the
* case of an erroneous fault occurring in a code path which already
* holds mmap_sem we will deadlock attempting to validate the fault
* against the address space. Luckily the kernel only validly
* references user space from well defined areas of code, which are
* listed in the exceptions table.
* Instruction fetch faults in the vsyscall page might need
* emulation. The vsyscall page is at a high address
* (>PAGE_OFFSET), but is considered to be part of the user
* address space.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibility of a
* deadlock. Attempt to lock the address space, if we cannot we then
* validate the source. If this is invalid we can skip the address
* space check, thus avoiding the deadlock:
* The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs.
*/
if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(regs, address))
return;
}
#endif
/*
* Kernel-mode access to the user address space should only occur
* on well-defined single instructions listed in the exception
* tables. But, an erroneous kernel fault occurring outside one of
* those areas which also holds mmap_sem might deadlock attempting
* to validate the fault against the address space.
*
* Only do the expensive exception table search when we might be at
* risk of a deadlock. This happens if we
* 1. Failed to acquire mmap_sem, and
* 2. The access did not originate in userspace. Note: either the
* hardware or earlier page fault code may set X86_PF_USER
* in sw_error_code.
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if (!(error_code & X86_PF_USER) &&
if (!(sw_error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
/*
* Fault from code in kernel from
* which we do not expect faults.
*/
bad_area_nosemaphore(regs, sw_error_code, address, NULL);
return;
}
retry:
......@@ -1351,16 +1419,16 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
vma = find_vma(mm, address);
if (unlikely(!vma)) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
if (error_code & X86_PF_USER) {
if (sw_error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
......@@ -1368,12 +1436,12 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* 32 pointers and then decrements %sp by 65535.)
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
}
if (unlikely(expand_stack(vma, address))) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
......@@ -1382,8 +1450,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* we can handle it..
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address, vma);
if (unlikely(access_error(sw_error_code, vma))) {
bad_area_access_error(regs, sw_error_code, address, vma);
return;
}
......@@ -1425,13 +1493,13 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
return;
/* Not returning to user mode? Handle exceptions or die: */
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
return;
}
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, &pkey, fault);
mm_fault_error(regs, sw_error_code, address, &pkey, fault);
return;
}
......@@ -1449,6 +1517,28 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(do_user_addr_fault);
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
prefetchw(&current->mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
return;
/* Was the fault on kernel-controlled part of the address space? */
if (unlikely(fault_in_kernel_space(address)))
do_kern_addr_fault(regs, hw_error_code, address);
else
do_user_addr_fault(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(__do_page_fault);
static nokprobe_inline void
......
......@@ -923,34 +923,19 @@ static void mark_nxdata_nx(void)
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
unsigned long size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel text: %luk\n",
pr_info("Write protecting kernel text and read-only data: %luk\n",
size >> 10);
kernel_set_to_readonly = 1;
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
start, start+size);
set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
start += size;
size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
size >> 10);
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size);
set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n");
pr_info("Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
mark_nxdata_nx();
......
......@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
* caller shouldn't need to know that small detail.
*/
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
unsigned long size, enum page_cache_mode pcm, void *caller)
unsigned long size, enum page_cache_mode pcm,
void *caller, bool encrypted)
{
unsigned long offset, vaddr;
resource_size_t last_addr;
......@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* resulting mapping.
*/
prot = PAGE_KERNEL_IO;
if (sev_active() && mem_flags.desc_other)
if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
switch (pcm) {
......@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_nocache);
......@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL_GPL(ioremap_uc);
......@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wc);
......@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wt);
void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
__builtin_return_address(0), true);
}
EXPORT_SYMBOL(ioremap_encrypted);
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_cache);
......@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
{
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_prot);
......
......@@ -37,11 +37,20 @@ struct cpa_data {
unsigned long numpages;
int flags;
unsigned long pfn;
unsigned force_split : 1;
unsigned force_split : 1,
force_static_prot : 1;
int curpage;
struct page **pages;
};
enum cpa_warn {
CPA_CONFLICT,
CPA_PROTECT,
CPA_DETECT,
};
static const int cpa_warn_level = CPA_PROTECT;
/*
* Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
* using cpa_lock. So that we don't allow any other cpu, with stale large tlb
......@@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m)
static inline void split_page_count(int level) { }
#endif
#ifdef CONFIG_X86_CPA_STATISTICS
static unsigned long cpa_1g_checked;
static unsigned long cpa_1g_sameprot;
static unsigned long cpa_1g_preserved;
static unsigned long cpa_2m_checked;
static unsigned long cpa_2m_sameprot;
static unsigned long cpa_2m_preserved;
static unsigned long cpa_4k_install;
static inline void cpa_inc_1g_checked(void)
{
cpa_1g_checked++;
}
static inline void cpa_inc_2m_checked(void)
{
cpa_2m_checked++;
}
static inline void cpa_inc_4k_install(void)
{
cpa_4k_install++;
}
static inline void cpa_inc_lp_sameprot(int level)
{
if (level == PG_LEVEL_1G)
cpa_1g_sameprot++;
else
cpa_2m_sameprot++;
}
static inline void cpa_inc_lp_preserved(int level)
{
if (level == PG_LEVEL_1G)
cpa_1g_preserved++;
else
cpa_2m_preserved++;
}
static int cpastats_show(struct seq_file *m, void *p)
{
seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked);
seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot);
seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved);
seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked);
seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot);
seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved);
seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
return 0;
}
static int cpastats_open(struct inode *inode, struct file *file)
{
return single_open(file, cpastats_show, NULL);
}
static const struct file_operations cpastats_fops = {
.open = cpastats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int __init cpa_stats_init(void)
{
debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
&cpastats_fops);
return 0;
}
late_initcall(cpa_stats_init);
#else
static inline void cpa_inc_1g_checked(void) { }
static inline void cpa_inc_2m_checked(void) { }
static inline void cpa_inc_4k_install(void) { }
static inline void cpa_inc_lp_sameprot(int level) { }
static inline void cpa_inc_lp_preserved(int level) { }
#endif
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
......@@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
static void __cpa_flush_range(void *arg)
static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
{
/*
* We could optimize that further and do individual per page
* tlb invalidates for a low number of pages. Caveat: we must
* flush the high aliases on 64bit as well.
*/
__flush_tlb_all();
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
WARN_ON(PAGE_ALIGN(start) != start);
if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
cpa_flush_all(cache);
return true;
}
flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
return !cache;
}
static void cpa_flush_range(unsigned long start, int numpages, int cache)
......@@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
unsigned int i, level;
unsigned long addr;
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
WARN_ON(PAGE_ALIGN(start) != start);
on_each_cpu(__cpa_flush_range, NULL, 1);
if (!cache)
if (__cpa_flush_range(start, numpages, cache))
return;
/*
......@@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
}
}
static void cpa_flush_array(unsigned long *start, int numpages, int cache,
static void cpa_flush_array(unsigned long baddr, unsigned long *start,
int numpages, int cache,
int in_flags, struct page **pages)
{
unsigned int i, level;
#ifdef CONFIG_PREEMPT
/*
* Avoid wbinvd() because it causes latencies on all CPUs,
* regardless of any CPU isolation that may be in effect.
*
* This should be extended for CAT enabled systems independent of
* PREEMPT because wbinvd() does not respect the CAT partitions and
* this is exposed to unpriviledged users through the graphics
* subsystem.
*/
unsigned long do_wbinvd = 0;
#else
unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
#endif
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
if (!cache || do_wbinvd)
if (__cpa_flush_range(baddr, numpages, cache))
return;
/*
......@@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
}
}
/*
* Certain areas of memory on x86 require very specific protection flags,
* for example the BIOS area or kernel text. Callers don't always get this
* right (again, ioremap() on BIOS memory is not uncommon) so this function
* checks and fixes these known static required protection bits.
*/
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
unsigned long pfn)
static bool overlaps(unsigned long r1_start, unsigned long r1_end,
unsigned long r2_start, unsigned long r2_end)
{
pgprot_t forbidden = __pgprot(0);
return (r1_start <= r2_end && r1_end >= r2_start) ||
(r2_start <= r1_end && r2_end >= r1_start);
}
/*
* The BIOS area between 640k and 1Mb needs to be executable for
* PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
*/
#ifdef CONFIG_PCI_BIOS
if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_NX;
/*
* The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
* based config access (CONFIG_PCI_GOBIOS) support.
*/
#define BIOS_PFN PFN_DOWN(BIOS_BEGIN)
#define BIOS_PFN_END PFN_DOWN(BIOS_END - 1)
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
return _PAGE_NX;
return 0;
}
#else
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
return 0;
}
#endif
/*
* The kernel text needs to be executable for obvious reasons
* Does not cover __inittext since that is gone later on. On
* 64bit we do not enforce !NX on the low mapping
*/
if (within(address, (unsigned long)_text, (unsigned long)_etext))
pgprot_val(forbidden) |= _PAGE_NX;
/*
* The .rodata section needs to be read-only. Using the pfn catches all
* aliases. This also includes __ro_after_init, so do not enforce until
* kernel_set_to_readonly is true.
*/
static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
{
unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
/*
* The .rodata section needs to be read-only. Using the pfn
* catches all aliases. This also includes __ro_after_init,
* so do not enforce until kernel_set_to_readonly is true.
* Note: __end_rodata is at page aligned and not inclusive, so
* subtract 1 to get the last enforced PFN in the rodata area.
*/
if (kernel_set_to_readonly &&
within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
__pa_symbol(__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
return _PAGE_RW;
return 0;
}
/*
* Protect kernel text against becoming non executable by forbidding
* _PAGE_NX. This protects only the high kernel mapping (_text -> _etext)
* out of which the kernel actually executes. Do not protect the low
* mapping.
*
* This does not cover __inittext since that is gone after boot.
*/
static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
{
unsigned long t_end = (unsigned long)_etext - 1;
unsigned long t_start = (unsigned long)_text;
if (overlaps(start, end, t_start, t_end))
return _PAGE_NX;
return 0;
}
#if defined(CONFIG_X86_64)
/*
* Once the kernel maps the text as RO (kernel_set_to_readonly is set),
* kernel text mappings for the large page aligned text, rodata sections
* will be always read-only. For the kernel identity mappings covering the
* holes caused by this alignment can be anything that user asks.
*
* This will preserve the large page mappings for kernel text/data at no
* extra cost.
*/
static pgprotval_t protect_kernel_text_ro(unsigned long start,
unsigned long end)
{
unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
unsigned long t_start = (unsigned long)_text;
unsigned int level;
if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
return 0;
/*
* Once the kernel maps the text as RO (kernel_set_to_readonly is set),
* kernel text mappings for the large page aligned text, rodata sections
* will be always read-only. For the kernel identity mappings covering
* the holes caused by this alignment can be anything that user asks.
* Don't enforce the !RW mapping for the kernel text mapping, if
* the current mapping is already using small page mapping. No
* need to work hard to preserve large page mappings in this case.
*
* This will preserve the large page mappings for kernel text/data
* at no extra cost.
* This also fixes the Linux Xen paravirt guest boot failure caused
* by unexpected read-only mappings for kernel identity
* mappings. In this paravirt guest case, the kernel text mapping
* and the kernel identity mapping share the same page-table pages,
* so the protections for kernel text and identity mappings have to
* be the same.
*/
if (kernel_set_to_readonly &&
within(address, (unsigned long)_text,
(unsigned long)__end_rodata_hpage_align)) {
unsigned int level;
/*
* Don't enforce the !RW mapping for the kernel text mapping,
* if the current mapping is already using small page mapping.
* No need to work hard to preserve large page mappings in this
* case.
*
* This also fixes the Linux Xen paravirt guest boot failure
* (because of unexpected read-only mappings for kernel identity
* mappings). In this paravirt guest case, the kernel text
* mapping and the kernel identity mapping share the same
* page-table pages. Thus we can't really use different
* protections for the kernel text and identity mappings. Also,
* these shared mappings are made of small page mappings.
* Thus this don't enforce !RW mapping for small page kernel
* text mapping logic will help Linux Xen parvirt guest boot
* as well.
*/
if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
pgprot_val(forbidden) |= _PAGE_RW;
}
if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
return _PAGE_RW;
return 0;
}
#else
static pgprotval_t protect_kernel_text_ro(unsigned long start,
unsigned long end)
{
return 0;
}
#endif
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
static inline bool conflicts(pgprot_t prot, pgprotval_t val)
{
return (pgprot_val(prot) & ~val) != pgprot_val(prot);
}
return prot;
static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
unsigned long start, unsigned long end,
unsigned long pfn, const char *txt)
{
static const char *lvltxt[] = {
[CPA_CONFLICT] = "conflict",
[CPA_PROTECT] = "protect",
[CPA_DETECT] = "detect",
};
if (warnlvl > cpa_warn_level || !conflicts(prot, val))
return;
pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
(unsigned long long)val);
}
/*
* Certain areas of memory on x86 require very specific protection flags,
* for example the BIOS area or kernel text. Callers don't always get this
* right (again, ioremap() on BIOS memory is not uncommon) so this function
* checks and fixes these known static required protection bits.
*/
static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
unsigned long pfn, unsigned long npg,
int warnlvl)
{
pgprotval_t forbidden, res;
unsigned long end;
/*
* There is no point in checking RW/NX conflicts when the requested
* mapping is setting the page !PRESENT.
*/
if (!(pgprot_val(prot) & _PAGE_PRESENT))
return prot;
/* Operate on the virtual address */
end = start + npg * PAGE_SIZE - 1;
res = protect_kernel_text(start, end);
check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
forbidden = res;
res = protect_kernel_text_ro(start, end);
check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
forbidden |= res;
/* Check the PFN directly */
res = protect_pci_bios(pfn, pfn + npg - 1);
check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
forbidden |= res;
res = protect_rodata(pfn, pfn + npg - 1);
check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
forbidden |= res;
return __pgprot(pgprot_val(prot) & ~forbidden);
}
/*
......@@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
return lookup_address_in_pgd(pgd_offset_k(address), address, level);
return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
if (cpa->pgd)
if (cpa->pgd)
return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
address, level);
return lookup_address(address, level);
return lookup_address(address, level);
}
/*
......@@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
return prot;
}
static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
struct cpa_data *cpa)
static int __should_split_large_page(pte_t *kpte, unsigned long address,
struct cpa_data *cpa)
{
unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, old_pte, *tmp;
pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
enum pg_level level;
if (cpa->force_split)
return 1;
spin_lock(&pgd_lock);
/*
* Check for races, another CPU might have split this page
* up already:
*/
tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte)
goto out_unlock;
return 1;
switch (level) {
case PG_LEVEL_2M:
old_prot = pmd_pgprot(*(pmd_t *)kpte);
old_pfn = pmd_pfn(*(pmd_t *)kpte);
cpa_inc_2m_checked();
break;
case PG_LEVEL_1G:
old_prot = pud_pgprot(*(pud_t *)kpte);
old_pfn = pud_pfn(*(pud_t *)kpte);
cpa_inc_1g_checked();
break;
default:
do_split = -EINVAL;
goto out_unlock;
return -EINVAL;
}
psize = page_level_size(level);
......@@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* Calculate the number of pages, which fit into this large
* page starting at address:
*/
nextpage_addr = (address + psize) & pmask;
numpages = (nextpage_addr - address) >> PAGE_SHIFT;
lpaddr = (address + psize) & pmask;
numpages = (lpaddr - address) >> PAGE_SHIFT;
if (numpages < cpa->numpages)
cpa->numpages = numpages;
......@@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pgprot_val(req_prot) |= _PAGE_PSE;
/*
* old_pfn points to the large page base pfn. So we need
* to add the offset of the virtual address:
* old_pfn points to the large page base pfn. So we need to add the
* offset of the virtual address:
*/
pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
cpa->pfn = pfn;
new_prot = static_protections(req_prot, address, pfn);
/*
* Calculate the large page base address and the number of 4K pages
* in the large page
*/
lpaddr = address & pmask;
numpages = psize >> PAGE_SHIFT;
/*
* We need to check the full range, whether
* static_protection() requires a different pgprot for one of
* the pages in the range we try to preserve:
* Sanity check that the existing mapping is correct versus the static
* protections. static_protections() guards against !PRESENT, so no
* extra conditional required here.
*/
addr = address & pmask;
pfn = old_pfn;
for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
CPA_CONFLICT);
if (pgprot_val(chk_prot) != pgprot_val(new_prot))
goto out_unlock;
if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
/*
* Split the large page and tell the split code to
* enforce static protections.
*/
cpa->force_static_prot = 1;
return 1;
}
/*
* If there are no changes, return. maxpages has been updated
* above:
* Optimization: If the requested pgprot is the same as the current
* pgprot, then the large page can be preserved and no updates are
* required independent of alignment and length of the requested
* range. The above already established that the current pgprot is
* correct, which in consequence makes the requested pgprot correct
* as well if it is the same. The static protection scan below will
* not come to a different conclusion.
*/
if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
do_split = 0;
goto out_unlock;
if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
cpa_inc_lp_sameprot(level);
return 0;
}
/*
* We need to change the attributes. Check, whether we can
* change the large page in one go. We request a split, when
* the address is not aligned and the number of pages is
* smaller than the number of pages in the large page. Note
* that we limited the number of possible pages already to
* the number of pages in the large page.
* If the requested range does not cover the full page, split it up
*/
if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
/*
* The address is aligned and the number of pages
* covers the full page.
*/
new_pte = pfn_pte(old_pfn, new_prot);
__set_pmd_pte(kpte, address, new_pte);
cpa->flags |= CPA_FLUSHTLB;
do_split = 0;
}
if (address != lpaddr || cpa->numpages != numpages)
return 1;
out_unlock:
/*
* Check whether the requested pgprot is conflicting with a static
* protection requirement in the large page.
*/
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
CPA_DETECT);
/*
* If there is a conflict, split the large page.
*
* There used to be a 4k wise evaluation trying really hard to
* preserve the large pages, but experimentation has shown, that this
* does not help at all. There might be corner cases which would
* preserve one large page occasionally, but it's really not worth the
* extra code and cycles for the common case.
*/
if (pgprot_val(req_prot) != pgprot_val(new_prot))
return 1;
/* All checks passed. Update the large page mapping. */
new_pte = pfn_pte(old_pfn, new_prot);
__set_pmd_pte(kpte, address, new_pte);
cpa->flags |= CPA_FLUSHTLB;
cpa_inc_lp_preserved(level);
return 0;
}
static int should_split_large_page(pte_t *kpte, unsigned long address,
struct cpa_data *cpa)
{
int do_split;
if (cpa->force_split)
return 1;
spin_lock(&pgd_lock);
do_split = __should_split_large_page(kpte, address, cpa);
spin_unlock(&pgd_lock);
return do_split;
}
static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
pgprot_t ref_prot, unsigned long address,
unsigned long size)
{
unsigned int npg = PFN_DOWN(size);
pgprot_t prot;
/*
* If should_split_large_page() discovered an inconsistent mapping,
* remove the invalid protection in the split mapping.
*/
if (!cpa->force_static_prot)
goto set;
prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT);
if (pgprot_val(prot) == pgprot_val(ref_prot))
goto set;
/*
* If this is splitting a PMD, fix it up. PUD splits cannot be
* fixed trivially as that would require to rescan the newly
* installed PMD mappings after returning from split_large_page()
* so an eventual further split can allocate the necessary PTE
* pages. Warn for now and revisit it in case this actually
* happens.
*/
if (size == PAGE_SIZE)
ref_prot = prot;
else
pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
set:
set_pte(pte, pfn_pte(pfn, ref_prot));
}
static int
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
struct page *base)
{
unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
pte_t *pbase = (pte_t *)page_address(base);
unsigned long ref_pfn, pfn, pfninc = 1;
unsigned int i, level;
pte_t *tmp;
pgprot_t ref_prot;
pte_t *tmp;
spin_lock(&pgd_lock);
/*
......@@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* PAT bit to correct position.
*/
ref_prot = pgprot_large_2_4k(ref_prot);
ref_pfn = pmd_pfn(*(pmd_t *)kpte);
lpaddr = address & PMD_MASK;
lpinc = PAGE_SIZE;
break;
case PG_LEVEL_1G:
ref_prot = pud_pgprot(*(pud_t *)kpte);
ref_pfn = pud_pfn(*(pud_t *)kpte);
pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
lpaddr = address & PUD_MASK;
lpinc = PMD_SIZE;
/*
* Clear the PSE flags if the PRESENT flag is not set
* otherwise pmd_present/pmd_huge will return true
......@@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* Get the target pfn from the original entry:
*/
pfn = ref_pfn;
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
if (virt_addr_valid(address)) {
unsigned long pfn = PFN_DOWN(__pa(address));
......@@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
/*
* Intel Atom errata AAH41 workaround.
* Do a global flush tlb after splitting the large page
* and before we do the actual change page attribute in the PTE.
*
* Without this, we violate the TLB application note, that says:
* "The TLBs may contain both ordinary and large-page
* translations for a 4-KByte range of linear addresses. This
* may occur if software modifies the paging structures so that
* the page size used for the address range changes. If the two
* translations differ with respect to page frame or attributes
* (e.g., permissions), processor behavior is undefined and may
* be implementation-specific."
*
* The real fix should be in hw or in a microcode update, but
* we also probabilistically try to reduce the window of having
* a large TLB mixed with 4K TLBs while instruction fetches are
* going on.
* We do this global tlb flush inside the cpa_lock, so that we
* don't allow any other cpu, with stale tlb entries change the
* page attribute in parallel, that also falls into the
* just split large page entry.
*/
__flush_tlb_all();
flush_tlb_all();
spin_unlock(&pgd_lock);
return 0;
......@@ -1247,7 +1494,9 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
new_prot = static_protections(new_prot, address, pfn);
cpa_inc_4k_install();
new_prot = static_protections(new_prot, address, pfn, 1,
CPA_PROTECT);
new_prot = pgprot_clear_protnone_bits(new_prot);
......@@ -1273,7 +1522,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
* Check, whether we can keep the large page intact
* and just change the pte:
*/
do_split = try_preserve_large_page(kpte, address, cpa);
do_split = should_split_large_page(kpte, address, cpa);
/*
* When the range fits into the existing large page,
* return. cp->numpages and cpa->tlbflush have been updated in
......@@ -1286,28 +1535,8 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
* We have to split the large page:
*/
err = split_large_page(cpa, kpte, address);
if (!err) {
/*
* Do a global flush tlb after splitting the large page
* and before we do the actual change page attribute in the PTE.
*
* With out this, we violate the TLB application note, that says
* "The TLBs may contain both ordinary and large-page
* translations for a 4-KByte range of linear addresses. This
* may occur if software modifies the paging structures so that
* the page size used for the address range changes. If the two
* translations differ with respect to page frame or attributes
* (e.g., permissions), processor behavior is undefined and may
* be implementation-specific."
*
* We do this global tlb flush inside the cpa_lock, so that we
* don't allow any other cpu, with stale tlb entries change the
* page attribute in parallel, that also falls into the
* just split large page entry.
*/
flush_tlb_all();
if (!err)
goto repeat;
}
return err;
}
......@@ -1529,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cache = !!pgprot2cachemode(mask_set);
/*
* On success we use CLFLUSH, when the CPU supports it to
* avoid the WBINVD. If the CPU does not support it and in the
* error case we fall back to cpa_flush_all (which uses
* WBINVD):
* On error; flush everything to be sure.
*/
if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
cpa_flush_array(addr, numpages, cache,
cpa.flags, pages);
} else
cpa_flush_range(baddr, numpages, cache);
} else
if (ret) {
cpa_flush_all(cache);
goto out;
}
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
cpa_flush_array(baddr, addr, numpages, cache,
cpa.flags, pages);
} else {
cpa_flush_range(baddr, numpages, cache);
}
out:
return ret;
......@@ -1856,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
/*
* Before changing the encryption attribute, we need to flush caches.
*/
if (static_cpu_has(X86_FEATURE_CLFLUSH))
cpa_flush_range(start, numpages, 1);
else
cpa_flush_all(1);
cpa_flush_range(start, numpages, 1);
ret = __change_page_attr_set_clr(&cpa, 1);
......@@ -1870,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
* in case TLB flushing gets optimized in the cpa_flush_range()
* path use the same logic as above.
*/
if (static_cpu_has(X86_FEATURE_CLFLUSH))
cpa_flush_range(start, numpages, 0);
else
cpa_flush_all(0);
cpa_flush_range(start, numpages, 0);
return ret;
}
......
......@@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
/*
* NB: The scheduler will call us with prev == next when switching
......@@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id);
/*
* We don't currently support having a real mm loaded without
* our cpu set in mm_cpumask(). We have all the bookkeeping
* in place to figure out whether we would need to flush
* if our cpu were cleared in mm_cpumask(), but we don't
* currently use it.
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
* from cpu_tlbstate.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
return;
/*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
* process. No TLB flush required.
*/
if (!was_lazy)
return;
/*
* Read the tlb_gen to check whether a flush is needed.
* If the TLB is up to date, just use it.
* The barrier synchronizes with the tlb_gen increment in
* the TLB shootdown code.
*/
smp_mb();
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
next_tlb_gen)
return;
/*
* TLB contents went out of date while we were in lazy
* mode. Fall through to the TLB switching code below.
*/
new_asid = prev_asid;
need_flush = true;
} else {
u16 new_asid;
bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
......@@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/* Let nmi_uaccess_okay() know that we're changing CR3. */
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
}
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
/*
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
/* Make sure we write CR3 before loaded_mm. */
barrier();
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
load_mm_cr4(next);
switch_ldt(real_prev, next);
/*
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
/* Make sure we write CR3 before loaded_mm. */
barrier();
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
if (next != real_prev) {
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
}
/*
......@@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
if (tlb_defer_switch_to_init_mm()) {
/*
* There's a significant optimization that may be possible
* here. We have accurate enough TLB flush tracking that we
* don't need to maintain coherence of TLB per se when we're
* lazy. We do, however, need to maintain coherence of
* paging-structure caches. We could, in principle, leave our
* old mm loaded and only switch to init_mm when
* tlb_remove_page() happens.
*/
this_cpu_write(cpu_tlbstate.is_lazy, true);
} else {
switch_mm(NULL, &init_mm, NULL);
}
this_cpu_write(cpu_tlbstate.is_lazy, true);
}
/*
......@@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
* This should be rare, with native_flush_tlb_others skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
......@@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
unsigned long addr;
unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long addr = f->start;
addr = f->start;
while (addr < f->end) {
__flush_tlb_one_user(addr);
addr += PAGE_SIZE;
addr += 1UL << f->stride_shift;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
trace_tlb_flush(reason, nr_pages);
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
local_flush_tlb();
......@@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info)
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
static bool tlb_is_not_lazy(int cpu, void *data)
{
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
......@@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1);
return;
}
smp_call_function_many(cpumask, flush_tlb_func_remote,
/*
* If no page tables were freed, we can skip sending IPIs to
* CPUs in lazy TLB mode. They will flush the CPU themselves
* at the next context switch.
*
* However, if page tables are getting freed, we need to send the
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
if (info->freed_tables)
smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
(void *)info, 1, GFP_ATOMIC, cpumask);
}
/*
......@@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
unsigned long end, unsigned int stride_shift,
bool freed_tables)
{
int cpu;
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,
};
cpu = get_cpu();
......@@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
!(vmflag & VM_HUGETLB) &&
((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
info.start = start;
info.end = end;
} else {
......
......@@ -22,6 +22,7 @@
#include <linux/tick.h>
#include <linux/nmi.h>
#include <linux/cpuhotplug.h>
#include <linux/stackprotector.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
......@@ -88,6 +89,7 @@ static void cpu_bringup(void)
asmlinkage __visible void cpu_bringup_and_idle(void)
{
cpu_bringup();
boot_init_stack_canary();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
......
......@@ -902,12 +902,22 @@ static bool copy_device_table(void)
}
}
old_devtb_phys = entry & PAGE_MASK;
/*
* When SME is enabled in the first kernel, the entry includes the
* memory encryption mask(sme_me_mask), we must remove the memory
* encryption mask to obtain the true physical address in kdump kernel.
*/
old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
if (old_devtb_phys >= 0x100000000ULL) {
pr_err("The address of old device table is above 4G, not trustworthy!\n");
return false;
}
old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
old_devtb = (sme_active() && is_kdump_kernel())
? (__force void *)ioremap_encrypted(old_devtb_phys,
dev_table_size)
: memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
if (!old_devtb)
return false;
......
......@@ -24,6 +24,8 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
#include <linux/mem_encrypt.h>
#include <asm/pgtable.h>
#include <asm/io.h>
#include "internal.h"
......@@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn)
/* Reads a page from the oldmem device from given offset. */
static ssize_t read_from_oldmem(char *buf, size_t count,
u64 *ppos, int userbuf)
u64 *ppos, int userbuf,
bool encrypted)
{
unsigned long pfn, offset;
size_t nr_bytes;
......@@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
if (pfn_is_ram(pfn) == 0)
memset(buf, 0, nr_bytes);
else {
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
if (encrypted)
tmp = copy_oldmem_page_encrypted(pfn, buf,
nr_bytes,
offset,
userbuf);
else
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
if (tmp < 0)
return tmp;
}
......@@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
*/
ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
return read_from_oldmem(buf, count, ppos, 0);
return read_from_oldmem(buf, count, ppos, 0, false);
}
/*
......@@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
return read_from_oldmem(buf, count, ppos, 0);
return read_from_oldmem(buf, count, ppos, 0, sme_active());
}
/*
......@@ -173,9 +183,20 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
prot = pgprot_encrypted(prot);
return remap_pfn_range(vma, from, pfn, size, prot);
}
/*
* Architectures which support memory encryption override this.
*/
ssize_t __weak
copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
}
/*
* Copy to either kernel or user space
*/
......@@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
m->offset + m->size - *fpos,
buflen);
start = m->paddr + *fpos - m->offset;
tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
tmp = read_from_oldmem(buffer, tsz, &start,
userbuf, sme_active());
if (tmp < 0)
return tmp;
buflen -= tsz;
......
......@@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma,
extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
unsigned long, int);
extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
size_t csize, unsigned long offset,
int userbuf);
void vmcore_cleanup(void);
/* Architecture code defines this if there are other possible ELF
......
......@@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags);
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags, const struct cpumask *mask);
int smp_call_function_single_async(int cpu, call_single_data_t *csd);
#ifdef CONFIG_SMP
......
......@@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
}
}
/* Ensure that these pages are decrypted if SME is enabled. */
if (pages)
arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
return pages;
}
......@@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = -ENOMEM;
goto out;
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
......@@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
......
......@@ -318,33 +318,34 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
/*
* Finds the lowest iomem resource existing within [res->start.res->end).
* The caller must specify res->start, res->end, res->flags, and optionally
* desc. If found, returns 0, res is overwritten, if not found, returns -1.
* This function walks the whole tree and not just first level children until
* and unless first_level_children_only is true.
/**
* Finds the lowest iomem resource that covers part of [start..end]. The
* caller must specify start, end, flags, and desc (which may be
* IORES_DESC_NONE).
*
* If a resource is found, returns 0 and *res is overwritten with the part
* of the resource that's within [start..end]; if none is found, returns
* -1.
*
* This function walks the whole tree and not just first level children
* unless @first_lvl is true.
*/
static int find_next_iomem_res(struct resource *res, unsigned long desc,
bool first_level_children_only)
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
bool first_lvl, struct resource *res)
{
resource_size_t start, end;
struct resource *p;
bool sibling_only = false;
BUG_ON(!res);
start = res->start;
end = res->end;
BUG_ON(start >= end);
if (!res)
return -EINVAL;
if (first_level_children_only)
sibling_only = true;
if (start >= end)
return -EINVAL;
read_lock(&resource_lock);
for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
if ((p->flags & res->flags) != res->flags)
for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
if ((p->flags & flags) != flags)
continue;
if ((desc != IORES_DESC_NONE) && (desc != p->desc))
continue;
......@@ -352,45 +353,43 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
p = NULL;
break;
}
if ((p->end >= start) && (p->start < end))
if ((p->end >= start) && (p->start <= end))
break;
}
read_unlock(&resource_lock);
if (!p)
return -1;
/* copy data */
if (res->start < p->start)
res->start = p->start;
if (res->end > p->end)
res->end = p->end;
res->start = max(start, p->start);
res->end = min(end, p->end);
res->flags = p->flags;
res->desc = p->desc;
return 0;
}
static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
bool first_level_children_only,
void *arg,
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
bool first_lvl, void *arg,
int (*func)(struct resource *, void *))
{
u64 orig_end = res->end;
struct resource res;
int ret = -1;
while ((res->start < res->end) &&
!find_next_iomem_res(res, desc, first_level_children_only)) {
ret = (*func)(res, arg);
while (start < end &&
!find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
ret = (*func)(&res, arg);
if (ret)
break;
res->start = res->end + 1;
res->end = orig_end;
start = res.end + 1;
}
return ret;
}
/*
/**
* Walks through iomem resources and calls func() with matching resource
* ranges. This walks through whole tree and not just first level children.
* All the memory ranges which overlap start,end and also match flags and
......@@ -407,13 +406,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
u64 end, void *arg, int (*func)(struct resource *, void *))
{
struct resource res;
res.start = start;
res.end = end;
res.flags = flags;
return __walk_iomem_res_desc(&res, desc, false, arg, func);
return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
}
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
......@@ -425,15 +418,11 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
* ranges.
*/
int walk_system_ram_res(u64 start, u64 end, void *arg,
int (*func)(struct resource *, void *))
int (*func)(struct resource *, void *))
{
struct resource res;
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
res.start = start;
res.end = end;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
arg, func);
}
......@@ -444,13 +433,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
int walk_mem_res(u64 start, u64 end, void *arg,
int (*func)(struct resource *, void *))
{
struct resource res;
res.start = start;
res.end = end;
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
arg, func);
}
......@@ -462,27 +447,27 @@ int walk_mem_res(u64 start, u64 end, void *arg,
* It is to be used only for System RAM.
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
void *arg, int (*func)(unsigned long, unsigned long, void *))
{
resource_size_t start, end;
unsigned long flags;
struct resource res;
unsigned long pfn, end_pfn;
u64 orig_end;
int ret = -1;
res.start = (u64) start_pfn << PAGE_SHIFT;
res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) &&
(find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
start = (u64) start_pfn << PAGE_SHIFT;
end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
while (start < end &&
!find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
true, &res)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn)
ret = (*func)(pfn, end_pfn - pfn, arg);
if (ret)
break;
res.start = res.end + 1;
res.end = orig_end;
start = res.end + 1;
}
return ret;
}
......@@ -658,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new,
* @constraint: the size and alignment constraints to be met.
*/
static int reallocate_resource(struct resource *root, struct resource *old,
resource_size_t newsize,
struct resource_constraint *constraint)
resource_size_t newsize,
struct resource_constraint *constraint)
{
int err=0;
struct resource new = *old;
......@@ -972,7 +957,7 @@ static int __adjust_resource(struct resource *res, resource_size_t start,
* Existing children of the resource are assumed to be immutable.
*/
int adjust_resource(struct resource *res, resource_size_t start,
resource_size_t size)
resource_size_t size)
{
int result;
......@@ -983,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start,
}
EXPORT_SYMBOL(adjust_resource);
static void __init __reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name)
static void __init
__reserve_region_with_split(struct resource *root, resource_size_t start,
resource_size_t end, const char *name)
{
struct resource *parent = root;
struct resource *conflict;
......@@ -1044,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root,
}
void __init reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name)
void __init
reserve_region_with_split(struct resource *root, resource_size_t start,
resource_size_t end, const char *name)
{
int abort = 0;
......@@ -1172,7 +1157,7 @@ EXPORT_SYMBOL(__request_region);
* The described resource region must match a currently busy region.
*/
void __release_region(struct resource *parent, resource_size_t start,
resource_size_t n)
resource_size_t n)
{
struct resource **p;
resource_size_t end;
......@@ -1234,7 +1219,7 @@ EXPORT_SYMBOL(__release_region);
* simplicity. Enhance this logic when necessary.
*/
int release_mem_region_adjustable(struct resource *parent,
resource_size_t start, resource_size_t size)
resource_size_t start, resource_size_t size)
{
struct resource **p;
struct resource *res;
......@@ -1410,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data)
this->start == match->start && this->n == match->n;
}
struct resource * __devm_request_region(struct device *dev,
struct resource *parent, resource_size_t start,
resource_size_t n, const char *name)
struct resource *
__devm_request_region(struct device *dev, struct resource *parent,
resource_size_t start, resource_size_t n, const char *name)
{
struct region_devres *dr = NULL;
struct resource *res;
......
......@@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle);
void cpu_startup_entry(enum cpuhp_state state)
{
/*
* This #ifdef needs to die, but it's too late in the cycle to
* make this generic (ARM and SH have never invoked the canary
* init for the non boot CPUs!). Will be fixed in 3.11
*/
#ifdef CONFIG_X86
/*
* If we're the non-boot CPU, nothing set the stack canary up
* for us. The boot CPU already has it initialized but no harm
* in doing it again. This is a good place for updating it, as
* we wont ever return from this function (so the invalid
* canaries already on the stack wont ever trigger).
*/
boot_init_stack_canary();
#endif
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)
......
......@@ -56,7 +56,6 @@
#include <linux/profile.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stackprotector.h>
#include <linux/stop_machine.h>
#include <linux/suspend.h>
#include <linux/swait.h>
......
......@@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* You must not call this function with disabled interrupts or
* from a hardware interrupt handler or from a bottom half handler.
*/
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
gfp_t gfp_flags, const struct cpumask *mask)
{
cpumask_var_t cpus;
int cpu, ret;
......@@ -680,9 +680,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
preempt_disable();
for_each_online_cpu(cpu)
for_each_cpu(cpu, mask)
if (cond_func(cpu, info))
cpumask_set_cpu(cpu, cpus);
__cpumask_set_cpu(cpu, cpus);
on_each_cpu_mask(cpus, func, info, wait);
preempt_enable();
free_cpumask_var(cpus);
......@@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
* just have to IPI them one by one.
*/
preempt_disable();
for_each_online_cpu(cpu)
for_each_cpu(cpu, mask)
if (cond_func(cpu, info)) {
ret = smp_call_function_single(cpu, func,
info, wait);
......@@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
preempt_enable();
}
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
{
on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
cpu_online_mask);
}
EXPORT_SYMBOL(on_each_cpu_cond);
static void do_nothing(void *unused)
......
......@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* Preemption is disabled here to make sure the cond_func is called under the
* same condtions in UP and SMP.
*/
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags, const struct cpumask *mask)
{
unsigned long flags;
......@@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
}
preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
{
on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
}
EXPORT_SYMBOL(on_each_cpu_cond);
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
......
......@@ -8,6 +8,7 @@
*/
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <asm/tlb.h>
#include <asm-generic/pgtable.h>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment