Commit 99792e0c authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "Lots of changes in this cycle:

   - Lots of CPA (change page attribute) optimizations and related
     cleanups (Thomas Gleixner, Peter Zijlstra)

   - Make lazy TLB mode even lazier (Rik van Riel)

   - Fault handler cleanups and improvements (Dave Hansen)

   - kdump, vmcore: Enable kdumping encrypted memory with AMD SME
     enabled (Lianbo Jiang)

   - Clean up VM layout documentation (Baoquan He, Ingo Molnar)

   - ... plus misc other fixes and enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
  x86/mm: Kill stray kernel fault handling comment
  x86/mm: Do not warn about PCI BIOS W+X mappings
  resource: Clean it up a bit
  resource: Fix find_next_iomem_res() iteration issue
  resource: Include resource end in walk_*() interfaces
  x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
  x86/mm: Remove spurious fault pkey check
  x86/mm/vsyscall: Consider vsyscall page part of user address space
  x86/mm: Add vsyscall address helper
  x86/mm: Fix exception table comments
  x86/mm: Add clarifying comments for user addr space
  x86/mm: Break out user address space handling
  x86/mm: Break out kernel address space handling
  x86/mm: Clarify hardware vs. software "error_code"
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Add freed_tables element to flush_tlb_info
  x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
  smp,cpumask: introduce on_each_cpu_cond_mask
  smp: use __cpumask_set_cpu in on_each_cpu_cond
  ...
parents 382d72a9 977e4be5
====================================================
Complete virtual memory map with 4-level page tables
====================================================
Virtual memory map with 4 level page tables: Notes:
0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm - Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down
hole caused by [47:63] sign extension from the top of the 64-bit address space. It's easier to understand the layout
ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor when seen both in absolute addresses and in distance-from-top notation.
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole For example 0xffffe90000000000 == -23 TB, it's 23 TB lower than the top of the
ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space 64-bit address space (ffffffffffffffff).
ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) Note that as we get closer to the top of the address space, the notation changes
... unused hole ... from TB to GB and then MB/KB.
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ... - "16M TB" might look weird at first sight, but it's an easier to visualize size
vaddr_end for KASLR notation than "16 EB", which few will recognize at first sight as 16 exabytes.
fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping It also shows it nicely how incredibly large 64-bit address space is.
fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ========================================================================================================================
... unused hole ... Start addr | Offset | End addr | Size | VM area description
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space ========================================================================================================================
... unused hole ... | | | |
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 0000000000000000 | 0 | 00007fffffffffff | 128 TB | user-space virtual memory, different per mm
ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space __________________|____________|__________________|_________|___________________________________________________________
[fixmap start] - ffffffffff5fffff kernel-internal fixmap range | | | |
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI 0000800000000000 | +128 TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole | | | | virtual memory addresses up to the -128 TB
| | | | starting offset of kernel mappings.
Virtual memory map with 5 level page tables: __________________|____________|__________________|_________|___________________________________________________________
|
0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm | Kernel-space virtual memory, shared between all processes:
hole caused by [56:63] sign extension ____________________________________________________________|___________________________________________________________
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor | | | |
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory ffff800000000000 | -128 TB | ffff87ffffffffff | 8 TB | ... guard hole, also reserved for hypervisor
ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI ffff880000000000 | -120 TB | ffffc7ffffffffff | 64 TB | direct mapping of all physical memory (page_offset_base)
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) ffffc80000000000 | -56 TB | ffffc8ffffffffff | 1 TB | ... unused hole
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap space (vmalloc_base)
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) ffffe90000000000 | -23 TB | ffffe9ffffffffff | 1 TB | ... unused hole
... unused hole ... ffffea0000000000 | -22 TB | ffffeaffffffffff | 1 TB | virtual memory map (vmemmap_base)
ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) ffffeb0000000000 | -21 TB | ffffebffffffffff | 1 TB | ... unused hole
... unused hole ... ffffec0000000000 | -20 TB | fffffbffffffffff | 16 TB | KASAN shadow memory
vaddr_end for KASLR fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping | | | | vaddr_end for KASLR
... unused hole ... fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | LDT remap for PTI
... unused hole ... ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space __________________|____________|__________________|_________|____________________________________________________________
... unused hole ... |
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 | Identical layout to the 47-bit one from here on:
ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space ____________________________________________________________|____________________________________________________________
[fixmap start] - ffffffffff5fffff kernel-internal fixmap range | | | |
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
ffffffff80000000 |-2048 MB | | |
ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
ffffffffff000000 | -16 MB | | |
FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI
ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole
__________________|____________|__________________|_________|___________________________________________________________
====================================================
Complete virtual memory map with 5-level page tables
====================================================
Notes:
- With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting
offset and many of the regions expand to support the much larger physical
memory supported.
========================================================================================================================
Start addr | Offset | End addr | Size | VM area description
========================================================================================================================
| | | |
0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm
__________________|____________|__________________|_________|___________________________________________________________
| | | |
0100000000000000 | +64 PB | feffffffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
| | | | virtual memory addresses up to the -64 PB
| | | | starting offset of kernel mappings.
__________________|____________|__________________|_________|___________________________________________________________
|
| Kernel-space virtual memory, shared between all processes:
____________________________________________________________|___________________________________________________________
| | | |
ff00000000000000 | -64 PB | ff0fffffffffffff | 4 PB | ... guard hole, also reserved for hypervisor
ff10000000000000 | -60 PB | ff8fffffffffffff | 32 PB | direct mapping of all physical memory (page_offset_base)
ff90000000000000 | -28 PB | ff9fffffffffffff | 4 PB | LDT remap for PTI
ffa0000000000000 | -24 PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base)
ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole
ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base)
ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole
ffdf000000000000 | -8.25 PB | fffffbffffffffff | ~8 PB | KASAN shadow memory
fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
| | | | vaddr_end for KASLR
fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole
ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
__________________|____________|__________________|_________|____________________________________________________________
|
| Identical layout to the 47-bit one from here on:
____________________________________________________________|____________________________________________________________
| | | |
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
ffffffff80000000 |-2048 MB | | |
ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
ffffffffff000000 | -16 MB | | |
FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI
ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole
__________________|____________|__________________|_________|___________________________________________________________
Architecture defines a 64-bit virtual address. Implementations can support Architecture defines a 64-bit virtual address. Implementations can support
less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
......
...@@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES ...@@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES
supports them), so don't confuse the user by printing supports them), so don't confuse the user by printing
that we have them enabled. that we have them enabled.
config X86_CPA_STATISTICS
bool "Enable statistic for Change Page Attribute"
depends on DEBUG_FS
---help---
Expose statistics about the Change Page Attribute mechanism, which
helps to determine the effectiveness of preserving large and huge
page mappings when mapping protections are changed.
config ARCH_HAS_MEM_ENCRYPT config ARCH_HAS_MEM_ENCRYPT
def_bool y def_bool y
......
...@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size) ...@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size)
#define ioremap_nocache ioremap_nocache #define ioremap_nocache ioremap_nocache
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
#define ioremap_uc ioremap_uc #define ioremap_uc ioremap_uc
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
#define ioremap_cache ioremap_cache #define ioremap_cache ioremap_cache
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
#define ioremap_prot ioremap_prot #define ioremap_prot ioremap_prot
extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
#define ioremap_encrypted ioremap_encrypted
/** /**
* ioremap - map bus memory into CPU space * ioremap - map bus memory into CPU space
......
...@@ -67,7 +67,7 @@ struct kimage; ...@@ -67,7 +67,7 @@ struct kimage;
/* Memory to backup during crash kdump */ /* Memory to backup during crash kdump */
#define KEXEC_BACKUP_SRC_START (0UL) #define KEXEC_BACKUP_SRC_START (0UL)
#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ #define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */
/* /*
* CPU does not save ss and sp on stack if execution is already * CPU does not save ss and sp on stack if execution is already
......
...@@ -59,13 +59,16 @@ ...@@ -59,13 +59,16 @@
#endif #endif
/* /*
* Kernel image size is limited to 1GiB due to the fixmap living in the * Maximum kernel image size is limited to 1 GiB, due to the fixmap living
* next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
* 512MiB by default, leaving 1.5GiB for modules once the page tables *
* are fully set up. If kernel ASLR is configured, it can extend the * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
* kernel page table mapping, reducing the size of the modules area. * page tables are fully set up.
*
* If KASLR is disabled we can shrink it to 0.5 GiB and increase the size
* of the modules area to 1.5 GiB.
*/ */
#if defined(CONFIG_RANDOMIZE_BASE) #ifdef CONFIG_RANDOMIZE_BASE
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) #define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
#else #else
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
......
...@@ -6,16 +6,23 @@ ...@@ -6,16 +6,23 @@
#define tlb_end_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#define tlb_flush(tlb) \ static inline void tlb_flush(struct mmu_gather *tlb);
{ \
if (!tlb->fullmm && !tlb->need_flush_all) \
flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
else \
flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
}
#include <asm-generic/tlb.h> #include <asm-generic/tlb.h>
static inline void tlb_flush(struct mmu_gather *tlb)
{
unsigned long start = 0UL, end = TLB_FLUSH_ALL;
unsigned int stride_shift = tlb_get_unmap_shift(tlb);
if (!tlb->fullmm && !tlb->need_flush_all) {
start = tlb->start;
end = tlb->end;
}
flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
}
/* /*
* While x86 architecture in general requires an IPI to perform TLB * While x86 architecture in general requires an IPI to perform TLB
* shootdown, enablement code for several hypervisors overrides * shootdown, enablement code for several hypervisors overrides
......
...@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) ...@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif #endif
static inline bool tlb_defer_switch_to_init_mm(void)
{
/*
* If we have PCID, then switching to init_mm is reasonably
* fast. If we don't have PCID, then switching to init_mm is
* quite slow, so we try to defer it in the hopes that we can
* avoid it entirely. The latter approach runs the risk of
* receiving otherwise unnecessary IPIs.
*
* This choice is just a heuristic. The tlb code can handle this
* function returning true or false regardless of whether we have
* PCID.
*/
return !static_cpu_has(X86_FEATURE_PCID);
}
struct tlb_context { struct tlb_context {
u64 ctx_id; u64 ctx_id;
u64 tlb_gen; u64 tlb_gen;
...@@ -547,23 +531,30 @@ struct flush_tlb_info { ...@@ -547,23 +531,30 @@ struct flush_tlb_info {
unsigned long start; unsigned long start;
unsigned long end; unsigned long end;
u64 new_tlb_gen; u64 new_tlb_gen;
unsigned int stride_shift;
bool freed_tables;
}; };
#define local_flush_tlb() __flush_tlb() #define local_flush_tlb() __flush_tlb()
#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) #define flush_tlb_mm(mm) \
flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
#define flush_tlb_range(vma, start, end) \ #define flush_tlb_range(vma, start, end) \
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) flush_tlb_mm_range((vma)->vm_mm, start, end, \
((vma)->vm_flags & VM_HUGETLB) \
? huge_page_shift(hstate_vma(vma)) \
: PAGE_SHIFT, false)
extern void flush_tlb_all(void); extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag); unsigned long end, unsigned int stride_shift,
bool freed_tables);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{ {
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
} }
void native_flush_tlb_others(const struct cpumask *cpumask, void native_flush_tlb_others(const struct cpumask *cpumask,
......
...@@ -11,40 +11,62 @@ ...@@ -11,40 +11,62 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/io.h> #include <linux/io.h>
/** static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
* copy_oldmem_page - copy one page from "oldmem" unsigned long offset, int userbuf,
* @pfn: page frame number to be copied bool encrypted)
* @buf: target memory address for the copy; this can be in kernel address
* space or user address space (see @userbuf)
* @csize: number of bytes to copy
* @offset: offset in bytes into the page (based on pfn) to begin the copy
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
* Copy a page from "oldmem". For this page, there is no pte mapped
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
size_t csize, unsigned long offset, int userbuf)
{ {
void *vaddr; void *vaddr;
if (!csize) if (!csize)
return 0; return 0;
vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); if (encrypted)
vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
else
vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (!vaddr) if (!vaddr)
return -ENOMEM; return -ENOMEM;
if (userbuf) { if (userbuf) {
if (copy_to_user(buf, vaddr + offset, csize)) { if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
iounmap(vaddr); iounmap((void __iomem *)vaddr);
return -EFAULT; return -EFAULT;
} }
} else } else
memcpy(buf, vaddr + offset, csize); memcpy(buf, vaddr + offset, csize);
set_iounmap_nonlazy(); set_iounmap_nonlazy();
iounmap(vaddr); iounmap((void __iomem *)vaddr);
return csize; return csize;
} }
/**
* copy_oldmem_page - copy one page of memory
* @pfn: page frame number to be copied
* @buf: target memory address for the copy; this can be in kernel address
* space or user address space (see @userbuf)
* @csize: number of bytes to copy
* @offset: offset in bytes into the page (based on pfn) to begin the copy
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
* Copy a page from the old kernel's memory. For this page, there is no pte
* mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
}
/**
* copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
* memory with the encryption mask set to accommodate kdump on SME-enabled
* machines.
*/
ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
}
...@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) ...@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
map_ldt_struct_to_user(mm); map_ldt_struct_to_user(mm);
va = (unsigned long)ldt_slot_va(slot); va = (unsigned long)ldt_slot_va(slot);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
ldt->slot = slot; ldt->slot = slot;
return 0; return 0;
......
...@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) ...@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pte_unmap_unlock(pte, ptl); pte_unmap_unlock(pte, ptl);
out: out:
up_write(&mm->mmap_sem); up_write(&mm->mmap_sem);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
} }
......
...@@ -19,7 +19,9 @@ ...@@ -19,7 +19,9 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/pci.h>
#include <asm/e820/types.h>
#include <asm/pgtable.h> #include <asm/pgtable.h>
/* /*
...@@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u) ...@@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u)
return (signed long)(u << shift) >> shift; return (signed long)(u << shift) >> shift;
} }
static void note_wx(struct pg_state *st)
{
unsigned long npages;
npages = (st->current_address - st->start_address) / PAGE_SIZE;
#ifdef CONFIG_PCI_BIOS
/*
* If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
* Inform about it, but avoid the warning.
*/
if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
st->current_address <= PAGE_OFFSET + BIOS_END) {
pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
return;
}
#endif
/* Account the WX pages */
st->wx_pages += npages;
WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
(void *)st->start_address);
}
/* /*
* This function gets called on a break in a continuous series * This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to * of PTE entries; the next one is different so we need to
...@@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, ...@@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
unsigned long delta; unsigned long delta;
int width = sizeof(unsigned long) * 2; int width = sizeof(unsigned long) * 2;
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) { if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
WARN_ONCE(1, note_wx(st);
"x86/mm: Found insecure W+X mapping at address %p/%pS\n",
(void *)st->start_address,
(void *)st->start_address);
st->wx_pages += (st->current_address -
st->start_address) / PAGE_SIZE;
}
/* /*
* Now print the actual finished series * Now print the actual finished series
......
...@@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, ...@@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
show_opcodes(regs, loglvl); show_opcodes(regs, loglvl);
} }
/*
* The (legacy) vsyscall page is the lone page in the kernel portion
* of the address space that has user-accessible permissions.
*/
static bool is_vsyscall_vaddr(unsigned long vaddr)
{
return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
}
static void static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 *pkey, int si_code) unsigned long address, u32 *pkey, int si_code)
...@@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, ...@@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (is_errata100(regs, address)) if (is_errata100(regs, address))
return; return;
#ifdef CONFIG_X86_64
/*
* Instruction fetch faults in the vsyscall page might need
* emulation.
*/
if (unlikely((error_code & X86_PF_INSTR) &&
((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
}
#endif
/* /*
* To avoid leaking information about the kernel page table * To avoid leaking information about the kernel page table
* layout, pretend that user-mode accesses to kernel addresses * layout, pretend that user-mode accesses to kernel addresses
...@@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, ...@@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
} }
} }
static int spurious_fault_check(unsigned long error_code, pte_t *pte) static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{ {
if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0; return 0;
if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0; return 0;
/*
* Note: We do not do lazy flushing on protection key
* changes, so no spurious fault will ever set X86_PF_PK.
*/
if ((error_code & X86_PF_PK))
return 1;
return 1; return 1;
} }
...@@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) ...@@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* (Optional Invalidation). * (Optional Invalidation).
*/ */
static noinline int static noinline int
spurious_fault(unsigned long error_code, unsigned long address) spurious_kernel_fault(unsigned long error_code, unsigned long address)
{ {
pgd_t *pgd; pgd_t *pgd;
p4d_t *p4d; p4d_t *p4d;
...@@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address) ...@@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
return 0; return 0;
if (p4d_large(*p4d)) if (p4d_large(*p4d))
return spurious_fault_check(error_code, (pte_t *) p4d); return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
pud = pud_offset(p4d, address); pud = pud_offset(p4d, address);
if (!pud_present(*pud)) if (!pud_present(*pud))
return 0; return 0;
if (pud_large(*pud)) if (pud_large(*pud))
return spurious_fault_check(error_code, (pte_t *) pud); return spurious_kernel_fault_check(error_code, (pte_t *) pud);
pmd = pmd_offset(pud, address); pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd)) if (!pmd_present(*pmd))
return 0; return 0;
if (pmd_large(*pmd)) if (pmd_large(*pmd))
return spurious_fault_check(error_code, (pte_t *) pmd); return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
pte = pte_offset_kernel(pmd, address); pte = pte_offset_kernel(pmd, address);
if (!pte_present(*pte)) if (!pte_present(*pte))
return 0; return 0;
ret = spurious_fault_check(error_code, pte); ret = spurious_kernel_fault_check(error_code, pte);
if (!ret) if (!ret)
return 0; return 0;
...@@ -1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address) ...@@ -1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
* Make sure we have permissions in PMD. * Make sure we have permissions in PMD.
* If not, then there's a bug in the page tables: * If not, then there's a bug in the page tables:
*/ */
ret = spurious_fault_check(error_code, (pte_t *) pmd); ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
return ret; return ret;
} }
NOKPROBE_SYMBOL(spurious_fault); NOKPROBE_SYMBOL(spurious_kernel_fault);
int show_unhandled_signals = 1; int show_unhandled_signals = 1;
...@@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) ...@@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
static int fault_in_kernel_space(unsigned long address) static int fault_in_kernel_space(unsigned long address)
{ {
/*
* On 64-bit systems, the vsyscall page is at an address above
* TASK_SIZE_MAX, but is not considered part of the kernel
* address space.
*/
if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
return false;
return address >= TASK_SIZE_MAX; return address >= TASK_SIZE_MAX;
} }
...@@ -1214,31 +1213,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) ...@@ -1214,31 +1213,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
} }
/* /*
* This routine handles page faults. It determines the address, * Called for all faults where 'address' is part of the kernel address
* and the problem, and then passes it off to one of the appropriate * space. Might get called for faults that originate from *code* that
* routines. * ran in userspace or the kernel.
*/ */
static noinline void static void
__do_page_fault(struct pt_regs *regs, unsigned long error_code, do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address) unsigned long address)
{ {
struct vm_area_struct *vma; /*
struct task_struct *tsk; * Protection keys exceptions only happen on user pages. We
struct mm_struct *mm; * have no user pages in the kernel portion of the address
vm_fault_t fault, major = 0; * space, so do not expect them here.
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; */
u32 pkey; WARN_ON_ONCE(hw_error_code & X86_PF_PK);
tsk = current;
mm = tsk->mm;
prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
return;
/* /*
* We fault-in kernel-space virtual memory on-demand. The * We can fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd. * 'reference' page table is init_mm.pgd.
* *
* NOTE! We MUST NOT take any locks for this case. We may * NOTE! We MUST NOT take any locks for this case. We may
...@@ -1246,41 +1237,74 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, ...@@ -1246,41 +1237,74 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* only copy the information from the master page table, * only copy the information from the master page table,
* nothing more. * nothing more.
* *
* This verifies that the fault happens in kernel space * Before doing this on-demand faulting, ensure that the
* (error_code & 4) == 0, and that the fault was not a * fault is not any of the following:
* protection error (error_code & 9) == 0. * 1. A fault on a PTE with a reserved bit set.
* 2. A fault caused by a user-mode access. (Do not demand-
* fault kernel memory due to user-mode accesses).
* 3. A fault caused by a page-level protection violation.
* (A demand fault would be on a non-present page which
* would have X86_PF_PROT==0).
*/ */
if (unlikely(fault_in_kernel_space(address))) { if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0)
if (vmalloc_fault(address) >= 0)
return;
}
/* Can handle a stale RO->RW TLB: */
if (spurious_fault(error_code, address))
return; return;
}
/* kprobes don't want to hook the spurious faults: */ /* Was the fault spurious, caused by lazy TLB invalidation? */
if (kprobes_fault(regs)) if (spurious_kernel_fault(hw_error_code, address))
return; return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, error_code, address, NULL);
/* kprobes don't want to hook the spurious faults: */
if (kprobes_fault(regs))
return; return;
}
/*
* Note, despite being a "bad area", there are quite a few
* acceptable reasons to get here, such as erratum fixups
* and handling kernel code that can fault, like get_user().
*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);
/* Handle faults in the user portion of the address space */
static inline
void do_user_addr_fault(struct pt_regs *regs,
unsigned long hw_error_code,
unsigned long address)
{
unsigned long sw_error_code;
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
vm_fault_t fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
u32 pkey;
tsk = current;
mm = tsk->mm;
/* kprobes don't want to hook the spurious faults: */ /* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobes_fault(regs))) if (unlikely(kprobes_fault(regs)))
return; return;
if (unlikely(error_code & X86_PF_RSVD)) /*
pgtable_bad(regs, error_code, address); * Reserved bits are never expected to be set on
* entries in the user portion of the page tables.
*/
if (unlikely(hw_error_code & X86_PF_RSVD))
pgtable_bad(regs, hw_error_code, address);
if (unlikely(smap_violation(error_code, regs))) { /*
bad_area_nosemaphore(regs, error_code, address, NULL); * Check for invalid kernel (supervisor) access to user
* pages in the user address space.
*/
if (unlikely(smap_violation(hw_error_code, regs))) {
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return; return;
} }
...@@ -1289,10 +1313,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, ...@@ -1289,10 +1313,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault * in a region with pagefaults disabled then we must not take the fault
*/ */
if (unlikely(faulthandler_disabled() || !mm)) { if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, error_code, address, NULL); bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return; return;
} }
/*
* hw_error_code is literally the "page fault error code" passed to
* the kernel directly from the hardware. But, we will shortly be
* modifying it in software, so give it a new name.
*/
sw_error_code = hw_error_code;
/* /*
* It's safe to allow irq's after cr2 has been saved and the * It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled. * vmalloc fault has been handled.
...@@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, ...@@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/ */
if (user_mode(regs)) { if (user_mode(regs)) {
local_irq_enable(); local_irq_enable();
error_code |= X86_PF_USER; /*
* Up to this point, X86_PF_USER set in hw_error_code
* indicated a user-mode access. But, after this,
* X86_PF_USER in sw_error_code will indicate either
* that, *or* an implicit kernel(supervisor)-mode access
* which originated from user mode.
*/
if (!(hw_error_code & X86_PF_USER)) {
/*
* The CPU was in user mode, but the CPU says
* the fault was not a user-mode access.
* Must be an implicit kernel-mode access,
* which we do not expect to happen in the
* user address space.
*/
pr_warn_once("kernel-mode error from user-mode: %lx\n",
hw_error_code);
sw_error_code |= X86_PF_USER;
}
flags |= FAULT_FLAG_USER; flags |= FAULT_FLAG_USER;
} else { } else {
if (regs->flags & X86_EFLAGS_IF) if (regs->flags & X86_EFLAGS_IF)
...@@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, ...@@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (error_code & X86_PF_WRITE) if (sw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE; flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR) if (sw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION; flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
/* /*
* When running in the kernel we expect faults to occur only to * Instruction fetch faults in the vsyscall page might need
* addresses in user space. All other faults represent errors in * emulation. The vsyscall page is at a high address
* the kernel and should generate an OOPS. Unfortunately, in the * (>PAGE_OFFSET), but is considered to be part of the user
* case of an erroneous fault occurring in a code path which already * address space.
* holds mmap_sem we will deadlock attempting to validate the fault
* against the address space. Luckily the kernel only validly
* references user space from well defined areas of code, which are
* listed in the exceptions table.
* *
* As the vast majority of faults will be valid we will only perform * The vsyscall page does not have a "real" VMA, so do this
* the source reference check when there is a possibility of a * emulation before we go searching for VMAs.
* deadlock. Attempt to lock the address space, if we cannot we then */
* validate the source. If this is invalid we can skip the address if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
* space check, thus avoiding the deadlock: if (emulate_vsyscall(regs, address))
return;
}
#endif
/*
* Kernel-mode access to the user address space should only occur
* on well-defined single instructions listed in the exception
* tables. But, an erroneous kernel fault occurring outside one of
* those areas which also holds mmap_sem might deadlock attempting
* to validate the fault against the address space.
*
* Only do the expensive exception table search when we might be at
* risk of a deadlock. This happens if we
* 1. Failed to acquire mmap_sem, and
* 2. The access did not originate in userspace. Note: either the
* hardware or earlier page fault code may set X86_PF_USER
* in sw_error_code.
*/ */
if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if (!(error_code & X86_PF_USER) && if (!(sw_error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) { !search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL); /*
* Fault from code in kernel from
* which we do not expect faults.
*/
bad_area_nosemaphore(regs, sw_error_code, address, NULL);
return; return;
} }
retry: retry:
...@@ -1351,16 +1419,16 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, ...@@ -1351,16 +1419,16 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
vma = find_vma(mm, address); vma = find_vma(mm, address);
if (unlikely(!vma)) { if (unlikely(!vma)) {
bad_area(regs, error_code, address); bad_area(regs, sw_error_code, address);
return; return;
} }
if (likely(vma->vm_start <= address)) if (likely(vma->vm_start <= address))
goto good_area; goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
bad_area(regs, error_code, address); bad_area(regs, sw_error_code, address);
return; return;
} }
if (error_code & X86_PF_USER) { if (sw_error_code & X86_PF_USER) {
/* /*
* Accessing the stack below %sp is always a bug. * Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter * The large cushion allows instructions like enter
...@@ -1368,12 +1436,12 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, ...@@ -1368,12 +1436,12 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* 32 pointers and then decrements %sp by 65535.) * 32 pointers and then decrements %sp by 65535.)
*/ */
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
bad_area(regs, error_code, address); bad_area(regs, sw_error_code, address);
return; return;
} }
} }
if (unlikely(expand_stack(vma, address))) { if (unlikely(expand_stack(vma, address))) {
bad_area(regs, error_code, address); bad_area(regs, sw_error_code, address);
return; return;
} }
...@@ -1382,8 +1450,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, ...@@ -1382,8 +1450,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* we can handle it.. * we can handle it..
*/ */
good_area: good_area:
if (unlikely(access_error(error_code, vma))) { if (unlikely(access_error(sw_error_code, vma))) {
bad_area_access_error(regs, error_code, address, vma); bad_area_access_error(regs, sw_error_code, address, vma);
return; return;
} }
...@@ -1425,13 +1493,13 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, ...@@ -1425,13 +1493,13 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
return; return;
/* Not returning to user mode? Handle exceptions or die: */ /* Not returning to user mode? Handle exceptions or die: */
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
return; return;
} }
up_read(&mm->mmap_sem); up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) { if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, &pkey, fault); mm_fault_error(regs, sw_error_code, address, &pkey, fault);
return; return;
} }
...@@ -1449,6 +1517,28 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, ...@@ -1449,6 +1517,28 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
check_v8086_mode(regs, address, tsk); check_v8086_mode(regs, address, tsk);
} }
NOKPROBE_SYMBOL(do_user_addr_fault);
/*
 * Page fault dispatcher. Given the faulting address and the hardware
 * error code, it lets kmmio look at the fault first and then hands it to
 * either the kernel-address or the user-address handler.
 */
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
		unsigned long address)
{
	prefetchw(&current->mm->mmap_sem);

	/* kmmio_fault() returned non-zero: nothing more to do here. */
	if (unlikely(kmmio_fault(regs, address)))
		return;

	/* Fault on the kernel-controlled part of the address space? */
	if (unlikely(fault_in_kernel_space(address))) {
		do_kern_addr_fault(regs, hw_error_code, address);
		return;
	}

	/* Everything else is treated as a user address space fault. */
	do_user_addr_fault(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(__do_page_fault); NOKPROBE_SYMBOL(__do_page_fault);
static nokprobe_inline void static nokprobe_inline void
......
...@@ -923,34 +923,19 @@ static void mark_nxdata_nx(void) ...@@ -923,34 +923,19 @@ static void mark_nxdata_nx(void)
void mark_rodata_ro(void) void mark_rodata_ro(void)
{ {
unsigned long start = PFN_ALIGN(_text); unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start; unsigned long size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel text: %luk\n", pr_info("Write protecting kernel text and read-only data: %luk\n",
size >> 10); size >> 10);
kernel_set_to_readonly = 1; kernel_set_to_readonly = 1;
#ifdef CONFIG_CPA_DEBUG #ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size);
start, start+size);
set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
start += size;
size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
size >> 10);
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n"); pr_info("Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif #endif
mark_nxdata_nx(); mark_nxdata_nx();
......
...@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, ...@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
* caller shouldn't need to know that small detail. * caller shouldn't need to know that small detail.
*/ */
static void __iomem *__ioremap_caller(resource_size_t phys_addr, static void __iomem *__ioremap_caller(resource_size_t phys_addr,
unsigned long size, enum page_cache_mode pcm, void *caller) unsigned long size, enum page_cache_mode pcm,
void *caller, bool encrypted)
{ {
unsigned long offset, vaddr; unsigned long offset, vaddr;
resource_size_t last_addr; resource_size_t last_addr;
...@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, ...@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* resulting mapping. * resulting mapping.
*/ */
prot = PAGE_KERNEL_IO; prot = PAGE_KERNEL_IO;
if (sev_active() && mem_flags.desc_other) if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot); prot = pgprot_encrypted(prot);
switch (pcm) { switch (pcm) {
...@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) ...@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
return __ioremap_caller(phys_addr, size, pcm, return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0)); __builtin_return_address(0), false);
} }
EXPORT_SYMBOL(ioremap_nocache); EXPORT_SYMBOL(ioremap_nocache);
...@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) ...@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
return __ioremap_caller(phys_addr, size, pcm, return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0)); __builtin_return_address(0), false);
} }
EXPORT_SYMBOL_GPL(ioremap_uc); EXPORT_SYMBOL_GPL(ioremap_uc);
...@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc); ...@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{ {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
__builtin_return_address(0)); __builtin_return_address(0), false);
} }
EXPORT_SYMBOL(ioremap_wc); EXPORT_SYMBOL(ioremap_wc);
...@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc); ...@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
{ {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
__builtin_return_address(0)); __builtin_return_address(0), false);
} }
EXPORT_SYMBOL(ioremap_wt); EXPORT_SYMBOL(ioremap_wt);
/*
 * ioremap_encrypted - map a physical range with the encrypted flag set
 * @phys_addr: start of the physical range, passed through unchanged
 * @size: size of the range, passed through unchanged
 *
 * Forwards to __ioremap_caller() with _PAGE_CACHE_MODE_WB as the cache
 * mode and 'encrypted' set to true, which makes __ioremap_caller() apply
 * pgprot_encrypted() to the mapping protection. The value of
 * __builtin_return_address(0) is handed over as the 'caller' argument.
 *
 * Returns whatever __ioremap_caller() returns.
 */
void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
{
	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
				__builtin_return_address(0), true);
}
EXPORT_SYMBOL(ioremap_encrypted);
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{ {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
__builtin_return_address(0)); __builtin_return_address(0), false);
} }
EXPORT_SYMBOL(ioremap_cache); EXPORT_SYMBOL(ioremap_cache);
...@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, ...@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
{ {
return __ioremap_caller(phys_addr, size, return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)), pgprot2cachemode(__pgprot(prot_val)),
__builtin_return_address(0)); __builtin_return_address(0), false);
} }
EXPORT_SYMBOL(ioremap_prot); EXPORT_SYMBOL(ioremap_prot);
......
...@@ -37,11 +37,20 @@ struct cpa_data { ...@@ -37,11 +37,20 @@ struct cpa_data {
unsigned long numpages; unsigned long numpages;
int flags; int flags;
unsigned long pfn; unsigned long pfn;
unsigned force_split : 1; unsigned force_split : 1,
force_static_prot : 1;
int curpage; int curpage;
struct page **pages; struct page **pages;
}; };
/*
 * Classes of CPA conflict reports. The numeric order is significant:
 * check_conflict() stays silent for any class that compares greater
 * than cpa_warn_level, so the values are spelled out explicitly.
 */
enum cpa_warn {
	CPA_CONFLICT	= 0,
	CPA_PROTECT	= 1,
	CPA_DETECT	= 2,
};

/* Highest class that is still reported; CPA_DETECT is above it. */
static const int cpa_warn_level = CPA_PROTECT;
/* /*
* Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
* using cpa_lock. So that we don't allow any other cpu, with stale large tlb * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
...@@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m) ...@@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m)
static inline void split_page_count(int level) { } static inline void split_page_count(int level) { }
#endif #endif
#ifdef CONFIG_X86_CPA_STATISTICS
/*
 * Event counters printed by cpastats_show(). Each one is incremented by
 * the matching cpa_inc_*() helper; the "sameprot" and "preserved"
 * counters are selected by page level in cpa_inc_lp_sameprot() and
 * cpa_inc_lp_preserved().
 */
static unsigned long cpa_1g_checked;
static unsigned long cpa_1g_sameprot;
static unsigned long cpa_1g_preserved;
static unsigned long cpa_2m_checked;
static unsigned long cpa_2m_sameprot;
static unsigned long cpa_2m_preserved;
static unsigned long cpa_4k_install;
/* Bump cpa_1g_checked (printed as "1G pages checked" by cpastats_show()). */
static inline void cpa_inc_1g_checked(void)
{
	cpa_1g_checked++;
}
/* Bump cpa_2m_checked (printed as "2M pages checked" by cpastats_show()). */
static inline void cpa_inc_2m_checked(void)
{
	cpa_2m_checked++;
}
/*
 * Bump cpa_4k_install (printed as "4K pages set-checked" by
 * cpastats_show()).
 */
static inline void cpa_inc_4k_install(void)
{
	cpa_4k_install++;
}
/*
 * Bump the "sameprot" counter for a large page.
 * @level: page level; PG_LEVEL_1G selects the 1G counter, any other
 *         value is accounted to the 2M counter.
 */
static inline void cpa_inc_lp_sameprot(int level)
{
	unsigned long *counter;

	counter = (level == PG_LEVEL_1G) ? &cpa_1g_sameprot : &cpa_2m_sameprot;
	(*counter)++;
}
/*
 * Bump the "preserved" counter for a large page.
 * @level: page level; PG_LEVEL_1G selects the 1G counter, any other
 *         value is accounted to the 2M counter.
 */
static inline void cpa_inc_lp_preserved(int level)
{
	unsigned long *counter;

	counter = (level == PG_LEVEL_1G) ? &cpa_1g_preserved : &cpa_2m_preserved;
	(*counter)++;
}
/*
 * seq_file show callback installed by cpastats_open(): prints every CPA
 * counter on its own line, each value formatted with %16lu.
 *
 * @m: seq_file to print into
 * @p: not used by this function
 *
 * Always returns 0.
 */
static int cpastats_show(struct seq_file *m, void *p)
{
	seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked);
	seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot);
	seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved);
	seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked);
	seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot);
	seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved);
	seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
	return 0;
}
/*
 * open handler of cpastats_fops: hooks cpastats_show() up to the file via
 * single_open(), passing NULL as its last argument. The inode is not
 * used. Returns the result of single_open().
 */
static int cpastats_open(struct inode *inode, struct file *file)
{
	return single_open(file, cpastats_show, NULL);
}
/*
 * File operations registered for the "cpa_stats" file by
 * cpa_stats_init(): open goes through cpastats_open(); read, llseek and
 * release are wired to seq_read(), seq_lseek() and single_release().
 */
static const struct file_operations cpastats_fops = {
	.open = cpastats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
/*
 * Register the "cpa_stats" file below arch_debugfs_dir, readable by its
 * owner only (mode 0400) and served by cpastats_fops. The result of
 * debugfs_create_file() is not checked; this initcall always returns 0.
 */
static int __init cpa_stats_init(void)
{
	debugfs_create_file("cpa_stats", 0400, arch_debugfs_dir, NULL,
			    &cpastats_fops);

	return 0;
}
late_initcall(cpa_stats_init);
#else
/*
 * No-op stand-ins for the statistics helpers, compiled when
 * CONFIG_X86_CPA_STATISTICS is not set so that callers need no #ifdef.
 */
static inline void cpa_inc_1g_checked(void) { }
static inline void cpa_inc_2m_checked(void) { }
static inline void cpa_inc_4k_install(void) { }
static inline void cpa_inc_lp_sameprot(int level) { }
static inline void cpa_inc_lp_preserved(int level) { }
#endif
static inline int static inline int
within(unsigned long addr, unsigned long start, unsigned long end) within(unsigned long addr, unsigned long start, unsigned long end)
{ {
...@@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache) ...@@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1); on_each_cpu(__cpa_flush_all, (void *) cache, 1);
} }
static void __cpa_flush_range(void *arg) static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
{ {
/* BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
* We could optimize that further and do individual per page
* tlb invalidates for a low number of pages. Caveat: we must WARN_ON(PAGE_ALIGN(start) != start);
* flush the high aliases on 64bit as well.
*/ if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
__flush_tlb_all(); cpa_flush_all(cache);
return true;
}
flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
return !cache;
} }
static void cpa_flush_range(unsigned long start, int numpages, int cache) static void cpa_flush_range(unsigned long start, int numpages, int cache)
...@@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) ...@@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
unsigned int i, level; unsigned int i, level;
unsigned long addr; unsigned long addr;
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); if (__cpa_flush_range(start, numpages, cache))
WARN_ON(PAGE_ALIGN(start) != start);
on_each_cpu(__cpa_flush_range, NULL, 1);
if (!cache)
return; return;
/* /*
...@@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) ...@@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
} }
} }
static void cpa_flush_array(unsigned long *start, int numpages, int cache, static void cpa_flush_array(unsigned long baddr, unsigned long *start,
int numpages, int cache,
int in_flags, struct page **pages) int in_flags, struct page **pages)
{ {
unsigned int i, level; unsigned int i, level;
#ifdef CONFIG_PREEMPT
/*
* Avoid wbinvd() because it causes latencies on all CPUs,
* regardless of any CPU isolation that may be in effect.
*
* This should be extended for CAT enabled systems independent of
* PREEMPT because wbinvd() does not respect the CAT partitions and
* this is exposed to unpriviledged users through the graphics
* subsystem.
*/
unsigned long do_wbinvd = 0;
#else
unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
#endif
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); if (__cpa_flush_range(baddr, numpages, cache))
if (!cache || do_wbinvd)
return; return;
/* /*
...@@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, ...@@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
} }
} }
/* static bool overlaps(unsigned long r1_start, unsigned long r1_end,
* Certain areas of memory on x86 require very specific protection flags, unsigned long r2_start, unsigned long r2_end)
* for example the BIOS area or kernel text. Callers don't always get this
* right (again, ioremap() on BIOS memory is not uncommon) so this function
* checks and fixes these known static required protection bits.
*/
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
unsigned long pfn)
{ {
pgprot_t forbidden = __pgprot(0); return (r1_start <= r2_end && r1_end >= r2_start) ||
(r2_start <= r1_end && r2_end >= r1_start);
}
/*
* The BIOS area between 640k and 1Mb needs to be executable for
* PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
*/
#ifdef CONFIG_PCI_BIOS #ifdef CONFIG_PCI_BIOS
if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) /*
pgprot_val(forbidden) |= _PAGE_NX; * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
* based config access (CONFIG_PCI_GOBIOS) support.
*/
/*
 * PFN bounds of the BIOS area. BIOS_PFN_END is computed from
 * BIOS_END - 1 so that it can serve as an inclusive upper bound for
 * overlaps().
 */
#define BIOS_PFN	PFN_DOWN(BIOS_BEGIN)
#define BIOS_PFN_END	PFN_DOWN(BIOS_END - 1)

/*
 * Return the protection bits that must not be set for the PFN range
 * [spfn, epfn] because of the PCI BIOS: _PAGE_NX when pcibios_enabled is
 * set and the range overlaps [BIOS_PFN, BIOS_PFN_END], otherwise 0.
 */
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
	if (!pcibios_enabled)
		return 0;

	return overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END) ? _PAGE_NX : 0;
}
#else
/*
 * Variant used when CONFIG_PCI_BIOS is not set: no protection bits are
 * ever forbidden, so the result is always 0 and both arguments are
 * ignored.
 */
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
	return 0;
}
#endif #endif
/* /*
* The kernel text needs to be executable for obvious reasons * The .rodata section needs to be read-only. Using the pfn catches all
* Does not cover __inittext since that is gone later on. On * aliases. This also includes __ro_after_init, so do not enforce until
* 64bit we do not enforce !NX on the low mapping * kernel_set_to_readonly is true.
*/ */
if (within(address, (unsigned long)_text, (unsigned long)_etext)) static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
pgprot_val(forbidden) |= _PAGE_NX; {
unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
/* /*
* The .rodata section needs to be read-only. Using the pfn * Note: __end_rodata is at page aligned and not inclusive, so
* catches all aliases. This also includes __ro_after_init, * subtract 1 to get the last enforced PFN in the rodata area.
* so do not enforce until kernel_set_to_readonly is true.
*/ */
if (kernel_set_to_readonly && epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
__pa_symbol(__end_rodata) >> PAGE_SHIFT)) if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
pgprot_val(forbidden) |= _PAGE_RW; return _PAGE_RW;
return 0;
}
/*
 * Keep kernel text executable.
 *
 * If the virtual range [start, end] overlaps the high kernel mapping
 * (_text up to the byte before _etext), the kernel executes from there,
 * so _PAGE_NX is returned as a forbidden bit; otherwise 0 is returned.
 * The low mapping is deliberately not protected, and __inittext is not
 * covered since it is gone after boot.
 */
static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
{
	const unsigned long text_first = (unsigned long)_text;
	const unsigned long text_last = (unsigned long)_etext - 1;

	return overlaps(start, end, text_first, text_last) ? _PAGE_NX : 0;
}
#if defined(CONFIG_X86_64) #if defined(CONFIG_X86_64)
/*
* Once the kernel maps the text as RO (kernel_set_to_readonly is set),
* kernel text mappings for the large page aligned text, rodata sections
* will always be read-only. The kernel identity mappings covering the
* holes caused by this alignment can be anything that the user asks for.
*
* This will preserve the large page mappings for kernel text/data at no
* extra cost.
*/
static pgprotval_t protect_kernel_text_ro(unsigned long start,
unsigned long end)
{
unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
unsigned long t_start = (unsigned long)_text;
unsigned int level;
if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
return 0;
/* /*
* Once the kernel maps the text as RO (kernel_set_to_readonly is set), * Don't enforce the !RW mapping for the kernel text mapping, if
* kernel text mappings for the large page aligned text, rodata sections * the current mapping is already using small page mapping. No
* will be always read-only. For the kernel identity mappings covering * need to work hard to preserve large page mappings in this case.
* the holes caused by this alignment can be anything that user asks.
* *
* This will preserve the large page mappings for kernel text/data * This also fixes the Linux Xen paravirt guest boot failure caused
* at no extra cost. * by unexpected read-only mappings for kernel identity
* mappings. In this paravirt guest case, the kernel text mapping
* and the kernel identity mapping share the same page-table pages,
* so the protections for kernel text and identity mappings have to
* be the same.
*/ */
if (kernel_set_to_readonly && if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
within(address, (unsigned long)_text, return _PAGE_RW;
(unsigned long)__end_rodata_hpage_align)) { return 0;
unsigned int level; }
#else
/* static pgprotval_t protect_kernel_text_ro(unsigned long start,
* Don't enforce the !RW mapping for the kernel text mapping, unsigned long end)
* if the current mapping is already using small page mapping. {
* No need to work hard to preserve large page mappings in this return 0;
* case. }
*
* This also fixes the Linux Xen paravirt guest boot failure
* (because of unexpected read-only mappings for kernel identity
* mappings). In this paravirt guest case, the kernel text
* mapping and the kernel identity mapping share the same
* page-table pages. Thus we can't really use different
* protections for the kernel text and identity mappings. Also,
* these shared mappings are made of small page mappings.
* Thus this don't enforce !RW mapping for small page kernel
* text mapping logic will help Linux Xen parvirt guest boot
* as well.
*/
if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
pgprot_val(forbidden) |= _PAGE_RW;
}
#endif #endif
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); static inline bool conflicts(pgprot_t prot, pgprotval_t val)
{
return (pgprot_val(prot) & ~val) != pgprot_val(prot);
}
return prot; static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
unsigned long start, unsigned long end,
unsigned long pfn, const char *txt)
{
static const char *lvltxt[] = {
[CPA_CONFLICT] = "conflict",
[CPA_PROTECT] = "protect",
[CPA_DETECT] = "detect",
};
if (warnlvl > cpa_warn_level || !conflicts(prot, val))
return;
pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
(unsigned long long)val);
}
/*
 * Enforce the statically required protection bits on a requested change.
 *
 * Some areas of memory on x86 need very specific protection flags (the
 * BIOS area, kernel text, ...) and callers do not always get them right,
 * e.g. ioremap() on BIOS memory. Each protect_*() helper reports the bits
 * that must not be set for the range; every result is passed to
 * check_conflict() for reporting and then accumulated.
 *
 * @prot:    requested protections
 * @start:   virtual start address of the range
 * @pfn:     first PFN of the range
 * @npg:     number of pages in the range
 * @warnlvl: warning class handed to check_conflict()
 *
 * Returns @prot with all forbidden bits cleared, or @prot unchanged when
 * _PAGE_PRESENT is not set in it.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
					  unsigned long pfn, unsigned long npg,
					  int warnlvl)
{
	pgprotval_t forbidden = 0, bits;
	unsigned long end, last_pfn;

	/* RW/NX conflicts are pointless to check for a !PRESENT mapping. */
	if (!(pgprot_val(prot) & _PAGE_PRESENT))
		return prot;

	/* Inclusive last byte and last PFN of the range. */
	end = start + npg * PAGE_SIZE - 1;
	last_pfn = pfn + npg - 1;

	/* Checks that operate on the virtual address range: */
	bits = protect_kernel_text(start, end);
	check_conflict(warnlvl, prot, bits, start, end, pfn, "Text NX");
	forbidden |= bits;

	bits = protect_kernel_text_ro(start, end);
	check_conflict(warnlvl, prot, bits, start, end, pfn, "Text RO");
	forbidden |= bits;

	/* Checks that operate on the PFN range directly: */
	bits = protect_pci_bios(pfn, last_pfn);
	check_conflict(warnlvl, prot, bits, start, end, pfn, "PCIBIOS NX");
	forbidden |= bits;

	bits = protect_rodata(pfn, last_pfn);
	check_conflict(warnlvl, prot, bits, start, end, pfn, "Rodata RO");
	forbidden |= bits;

	return __pgprot(pgprot_val(prot) & ~forbidden);
}
/* /*
...@@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, ...@@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
*/ */
pte_t *lookup_address(unsigned long address, unsigned int *level) pte_t *lookup_address(unsigned long address, unsigned int *level)
{ {
return lookup_address_in_pgd(pgd_offset_k(address), address, level); return lookup_address_in_pgd(pgd_offset_k(address), address, level);
} }
EXPORT_SYMBOL_GPL(lookup_address); EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level) unsigned int *level)
{ {
if (cpa->pgd) if (cpa->pgd)
return lookup_address_in_pgd(cpa->pgd + pgd_index(address), return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
address, level); address, level);
return lookup_address(address, level); return lookup_address(address, level);
} }
/* /*
...@@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) ...@@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
return prot; return prot;
} }
static int static int __should_split_large_page(pte_t *kpte, unsigned long address,
try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa)
struct cpa_data *cpa)
{ {
unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, old_pte, *tmp; pte_t new_pte, old_pte, *tmp;
pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
enum pg_level level; enum pg_level level;
if (cpa->force_split)
return 1;
spin_lock(&pgd_lock);
/* /*
* Check for races, another CPU might have split this page * Check for races, another CPU might have split this page
* up already: * up already:
*/ */
tmp = _lookup_address_cpa(cpa, address, &level); tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte) if (tmp != kpte)
goto out_unlock; return 1;
switch (level) { switch (level) {
case PG_LEVEL_2M: case PG_LEVEL_2M:
old_prot = pmd_pgprot(*(pmd_t *)kpte); old_prot = pmd_pgprot(*(pmd_t *)kpte);
old_pfn = pmd_pfn(*(pmd_t *)kpte); old_pfn = pmd_pfn(*(pmd_t *)kpte);
cpa_inc_2m_checked();
break; break;
case PG_LEVEL_1G: case PG_LEVEL_1G:
old_prot = pud_pgprot(*(pud_t *)kpte); old_prot = pud_pgprot(*(pud_t *)kpte);
old_pfn = pud_pfn(*(pud_t *)kpte); old_pfn = pud_pfn(*(pud_t *)kpte);
cpa_inc_1g_checked();
break; break;
default: default:
do_split = -EINVAL; return -EINVAL;
goto out_unlock;
} }
psize = page_level_size(level); psize = page_level_size(level);
...@@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, ...@@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* Calculate the number of pages, which fit into this large * Calculate the number of pages, which fit into this large
* page starting at address: * page starting at address:
*/ */
nextpage_addr = (address + psize) & pmask; lpaddr = (address + psize) & pmask;
numpages = (nextpage_addr - address) >> PAGE_SHIFT; numpages = (lpaddr - address) >> PAGE_SHIFT;
if (numpages < cpa->numpages) if (numpages < cpa->numpages)
cpa->numpages = numpages; cpa->numpages = numpages;
...@@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, ...@@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pgprot_val(req_prot) |= _PAGE_PSE; pgprot_val(req_prot) |= _PAGE_PSE;
/* /*
* old_pfn points to the large page base pfn. So we need * old_pfn points to the large page base pfn. So we need to add the
* to add the offset of the virtual address: * offset of the virtual address:
*/ */
pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
cpa->pfn = pfn; cpa->pfn = pfn;
new_prot = static_protections(req_prot, address, pfn); /*
* Calculate the large page base address and the number of 4K pages
* in the large page
*/
lpaddr = address & pmask;
numpages = psize >> PAGE_SHIFT;
/* /*
* We need to check the full range, whether * Sanity check that the existing mapping is correct versus the static
* static_protection() requires a different pgprot for one of * protections. static_protections() guards against !PRESENT, so no
* the pages in the range we try to preserve: * extra conditional required here.
*/ */
addr = address & pmask; chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
pfn = old_pfn; CPA_CONFLICT);
for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
if (pgprot_val(chk_prot) != pgprot_val(new_prot)) if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
goto out_unlock; /*
* Split the large page and tell the split code to
* enforce static protections.
*/
cpa->force_static_prot = 1;
return 1;
} }
/* /*
* If there are no changes, return. maxpages has been updated * Optimization: If the requested pgprot is the same as the current
* above: * pgprot, then the large page can be preserved and no updates are
* required independent of alignment and length of the requested
* range. The above already established that the current pgprot is
* correct, which in consequence makes the requested pgprot correct
* as well if it is the same. The static protection scan below will
* not come to a different conclusion.
*/ */
if (pgprot_val(new_prot) == pgprot_val(old_prot)) { if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
do_split = 0; cpa_inc_lp_sameprot(level);
goto out_unlock; return 0;
} }
/* /*
* We need to change the attributes. Check, whether we can * If the requested range does not cover the full page, split it up
* change the large page in one go. We request a split, when
* the address is not aligned and the number of pages is
* smaller than the number of pages in the large page. Note
* that we limited the number of possible pages already to
* the number of pages in the large page.
*/ */
if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { if (address != lpaddr || cpa->numpages != numpages)
/* return 1;
* The address is aligned and the number of pages
* covers the full page.
*/
new_pte = pfn_pte(old_pfn, new_prot);
__set_pmd_pte(kpte, address, new_pte);
cpa->flags |= CPA_FLUSHTLB;
do_split = 0;
}
out_unlock: /*
* Check whether the requested pgprot is conflicting with a static
* protection requirement in the large page.
*/
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
CPA_DETECT);
/*
* If there is a conflict, split the large page.
*
* There used to be a 4k wise evaluation trying really hard to
* preserve the large pages, but experimentation has shown, that this
* does not help at all. There might be corner cases which would
* preserve one large page occasionally, but it's really not worth the
* extra code and cycles for the common case.
*/
if (pgprot_val(req_prot) != pgprot_val(new_prot))
return 1;
/* All checks passed. Update the large page mapping. */
new_pte = pfn_pte(old_pfn, new_prot);
__set_pmd_pte(kpte, address, new_pte);
cpa->flags |= CPA_FLUSHTLB;
cpa_inc_lp_preserved(level);
return 0;
}
static int should_split_large_page(pte_t *kpte, unsigned long address,
struct cpa_data *cpa)
{
int do_split;
if (cpa->force_split)
return 1;
spin_lock(&pgd_lock);
do_split = __should_split_large_page(kpte, address, cpa);
spin_unlock(&pgd_lock); spin_unlock(&pgd_lock);
return do_split; return do_split;
} }
/*
 * Install one entry of the page table which replaces a large page.
 *
 * @cpa:      CPA request state; only cpa->force_static_prot is read here
 * @pte:      slot in the new page table to populate
 * @pfn:      page frame number the new entry maps
 * @ref_prot: protections inherited from the large page being split
 * @address:  virtual address covered by the new entry
 * @size:     number of bytes covered by the new entry
 *
 * Unless cpa->force_static_prot is set, the entry simply inherits
 * @ref_prot. Otherwise @ref_prot is re-evaluated against
 * static_protections() for the range covered by the new entry.
 */
static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
			  pgprot_t ref_prot, unsigned long address,
			  unsigned long size)
{
	unsigned int npg = PFN_DOWN(size);

	/*
	 * If should_split_large_page() discovered an inconsistent mapping,
	 * remove the invalid protection in the split mapping.
	 */
	if (cpa->force_static_prot) {
		pgprot_t fixed = static_protections(ref_prot, address, pfn, npg,
						    CPA_PROTECT);

		if (pgprot_val(fixed) != pgprot_val(ref_prot)) {
			/*
			 * Only a PMD split (entries of PAGE_SIZE) is fixed
			 * up. PUD splits cannot be fixed trivially as that
			 * would require to rescan the newly installed PMD
			 * mappings after returning from split_large_page()
			 * so an eventual further split can allocate the
			 * necessary PTE pages. Warn for now and revisit it
			 * in case this actually happens.
			 */
			if (size != PAGE_SIZE)
				pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
			else
				ref_prot = fixed;
		}
	}

	set_pte(pte, pfn_pte(pfn, ref_prot));
}
static int static int
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
struct page *base) struct page *base)
{ {
unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
pte_t *pbase = (pte_t *)page_address(base); pte_t *pbase = (pte_t *)page_address(base);
unsigned long ref_pfn, pfn, pfninc = 1;
unsigned int i, level; unsigned int i, level;
pte_t *tmp;
pgprot_t ref_prot; pgprot_t ref_prot;
pte_t *tmp;
spin_lock(&pgd_lock); spin_lock(&pgd_lock);
/* /*
...@@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, ...@@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* PAT bit to correct position. * PAT bit to correct position.
*/ */
ref_prot = pgprot_large_2_4k(ref_prot); ref_prot = pgprot_large_2_4k(ref_prot);
ref_pfn = pmd_pfn(*(pmd_t *)kpte); ref_pfn = pmd_pfn(*(pmd_t *)kpte);
lpaddr = address & PMD_MASK;
lpinc = PAGE_SIZE;
break; break;
case PG_LEVEL_1G: case PG_LEVEL_1G:
ref_prot = pud_pgprot(*(pud_t *)kpte); ref_prot = pud_pgprot(*(pud_t *)kpte);
ref_pfn = pud_pfn(*(pud_t *)kpte); ref_pfn = pud_pfn(*(pud_t *)kpte);
pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
lpaddr = address & PUD_MASK;
lpinc = PMD_SIZE;
/* /*
* Clear the PSE flags if the PRESENT flag is not set * Clear the PSE flags if the PRESENT flag is not set
* otherwise pmd_present/pmd_huge will return true * otherwise pmd_present/pmd_huge will return true
...@@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, ...@@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* Get the target pfn from the original entry: * Get the target pfn from the original entry:
*/ */
pfn = ref_pfn; pfn = ref_pfn;
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
if (virt_addr_valid(address)) { if (virt_addr_valid(address)) {
unsigned long pfn = PFN_DOWN(__pa(address)); unsigned long pfn = PFN_DOWN(__pa(address));
...@@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, ...@@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
/* /*
* Intel Atom errata AAH41 workaround. * Do a global flush tlb after splitting the large page
* and before we do the actual change page attribute in the PTE.
*
* Without this, we violate the TLB application note, that says:
* "The TLBs may contain both ordinary and large-page
* translations for a 4-KByte range of linear addresses. This
* may occur if software modifies the paging structures so that
* the page size used for the address range changes. If the two
* translations differ with respect to page frame or attributes
* (e.g., permissions), processor behavior is undefined and may
* be implementation-specific."
* *
* The real fix should be in hw or in a microcode update, but * We do this global tlb flush inside the cpa_lock, so that we
* we also probabilistically try to reduce the window of having * don't allow any other cpu, with stale tlb entries change the
* a large TLB mixed with 4K TLBs while instruction fetches are * page attribute in parallel, that also falls into the
* going on. * just split large page entry.
*/ */
__flush_tlb_all(); flush_tlb_all();
spin_unlock(&pgd_lock); spin_unlock(&pgd_lock);
return 0; return 0;
...@@ -1247,7 +1494,9 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) ...@@ -1247,7 +1494,9 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
new_prot = static_protections(new_prot, address, pfn); cpa_inc_4k_install();
new_prot = static_protections(new_prot, address, pfn, 1,
CPA_PROTECT);
new_prot = pgprot_clear_protnone_bits(new_prot); new_prot = pgprot_clear_protnone_bits(new_prot);
...@@ -1273,7 +1522,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) ...@@ -1273,7 +1522,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
* Check, whether we can keep the large page intact * Check, whether we can keep the large page intact
* and just change the pte: * and just change the pte:
*/ */
do_split = try_preserve_large_page(kpte, address, cpa); do_split = should_split_large_page(kpte, address, cpa);
/* /*
* When the range fits into the existing large page, * When the range fits into the existing large page,
* return. cp->numpages and cpa->tlbflush have been updated in * return. cp->numpages and cpa->tlbflush have been updated in
...@@ -1286,28 +1535,8 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) ...@@ -1286,28 +1535,8 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
* We have to split the large page: * We have to split the large page:
*/ */
err = split_large_page(cpa, kpte, address); err = split_large_page(cpa, kpte, address);
if (!err) { if (!err)
/*
* Do a global flush tlb after splitting the large page
* and before we do the actual change page attribute in the PTE.
*
* With out this, we violate the TLB application note, that says
* "The TLBs may contain both ordinary and large-page
* translations for a 4-KByte range of linear addresses. This
* may occur if software modifies the paging structures so that
* the page size used for the address range changes. If the two
* translations differ with respect to page frame or attributes
* (e.g., permissions), processor behavior is undefined and may
* be implementation-specific."
*
* We do this global tlb flush inside the cpa_lock, so that we
* don't allow any other cpu, with stale tlb entries change the
* page attribute in parallel, that also falls into the
* just split large page entry.
*/
flush_tlb_all();
goto repeat; goto repeat;
}
return err; return err;
} }
...@@ -1529,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, ...@@ -1529,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cache = !!pgprot2cachemode(mask_set); cache = !!pgprot2cachemode(mask_set);
/* /*
* On success we use CLFLUSH, when the CPU supports it to * On error; flush everything to be sure.
* avoid the WBINVD. If the CPU does not support it and in the
* error case we fall back to cpa_flush_all (which uses
* WBINVD):
*/ */
if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { if (ret) {
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
cpa_flush_array(addr, numpages, cache,
cpa.flags, pages);
} else
cpa_flush_range(baddr, numpages, cache);
} else
cpa_flush_all(cache); cpa_flush_all(cache);
goto out;
}
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
cpa_flush_array(baddr, addr, numpages, cache,
cpa.flags, pages);
} else {
cpa_flush_range(baddr, numpages, cache);
}
out: out:
return ret; return ret;
...@@ -1856,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) ...@@ -1856,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
/* /*
* Before changing the encryption attribute, we need to flush caches. * Before changing the encryption attribute, we need to flush caches.
*/ */
if (static_cpu_has(X86_FEATURE_CLFLUSH)) cpa_flush_range(start, numpages, 1);
cpa_flush_range(start, numpages, 1);
else
cpa_flush_all(1);
ret = __change_page_attr_set_clr(&cpa, 1); ret = __change_page_attr_set_clr(&cpa, 1);
...@@ -1870,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) ...@@ -1870,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
* in case TLB flushing gets optimized in the cpa_flush_range() * in case TLB flushing gets optimized in the cpa_flush_range()
* path use the same logic as above. * path use the same logic as above.
*/ */
if (static_cpu_has(X86_FEATURE_CLFLUSH)) cpa_flush_range(start, numpages, 0);
cpa_flush_range(start, numpages, 0);
else
cpa_flush_all(0);
return ret; return ret;
} }
......
...@@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, ...@@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{ {
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id(); unsigned cpu = smp_processor_id();
u64 next_tlb_gen; u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
/* /*
* NB: The scheduler will call us with prev == next when switching * NB: The scheduler will call us with prev == next when switching
...@@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, ...@@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id); next->context.ctx_id);
/* /*
* We don't currently support having a real mm loaded without * Even in lazy TLB mode, the CPU should stay set in the
* our cpu set in mm_cpumask(). We have all the bookkeeping * mm_cpumask. The TLB shootdown code can figure out from
* in place to figure out whether we would need to flush * from cpu_tlbstate.is_lazy whether or not to send an IPI.
* if our cpu were cleared in mm_cpumask(), but we don't
* currently use it.
*/ */
if (WARN_ON_ONCE(real_prev != &init_mm && if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next)))) !cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next)); cpumask_set_cpu(cpu, mm_cpumask(next));
return; /*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
* process. No TLB flush required.
*/
if (!was_lazy)
return;
/*
* Read the tlb_gen to check whether a flush is needed.
* If the TLB is up to date, just use it.
* The barrier synchronizes with the tlb_gen increment in
* the TLB shootdown code.
*/
smp_mb();
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
next_tlb_gen)
return;
/*
* TLB contents went out of date while we were in lazy
* mode. Fall through to the TLB switching code below.
*/
new_asid = prev_asid;
need_flush = true;
} else { } else {
u16 new_asid;
bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/* /*
...@@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, ...@@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/* Let nmi_uaccess_okay() know that we're changing CR3. */ /* Let nmi_uaccess_okay() know that we're changing CR3. */
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier(); barrier();
}
if (need_flush) { if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true); load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
/* /*
* Record last user mm's context id, so we can avoid * NB: This gets called via leave_mm() in the idle path
* flushing branch buffer with IBPB if we switch back * where RCU functions differently. Tracing normally
* to the same user. * uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/ */
if (next != &init_mm) trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); } else {
/* The new ASID is already up to date. */
/* Make sure we write CR3 before loaded_mm. */ load_new_mm_cr3(next->pgd, new_asid, false);
barrier();
this_cpu_write(cpu_tlbstate.loaded_mm, next); /* See above wrt _rcuidle. */
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
} }
load_mm_cr4(next); /*
switch_ldt(real_prev, next); * Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
/* Make sure we write CR3 before loaded_mm. */
barrier();
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
if (next != real_prev) {
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
} }
/* /*
...@@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) ...@@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return; return;
if (tlb_defer_switch_to_init_mm()) { this_cpu_write(cpu_tlbstate.is_lazy, true);
/*
* There's a significant optimization that may be possible
* here. We have accurate enough TLB flush tracking that we
* don't need to maintain coherence of TLB per se when we're
* lazy. We do, however, need to maintain coherence of
* paging-structure caches. We could, in principle, leave our
* old mm loaded and only switch to init_mm when
* tlb_remove_page() happens.
*/
this_cpu_write(cpu_tlbstate.is_lazy, true);
} else {
switch_mm(NULL, &init_mm, NULL);
}
} }
/* /*
...@@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, ...@@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading * paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely * garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm. * slower than a minimal flush, just switch to init_mm.
*
* This should be rare, with native_flush_tlb_others skipping
* IPIs to lazy TLB mode CPUs.
*/ */
switch_mm_irqs_off(NULL, &init_mm, NULL); switch_mm_irqs_off(NULL, &init_mm, NULL);
return; return;
...@@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, ...@@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
f->new_tlb_gen == local_tlb_gen + 1 && f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) { f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */ /* Partial flush */
unsigned long addr; unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; unsigned long addr = f->start;
addr = f->start;
while (addr < f->end) { while (addr < f->end) {
__flush_tlb_one_user(addr); __flush_tlb_one_user(addr);
addr += PAGE_SIZE; addr += 1UL << f->stride_shift;
} }
if (local) if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
trace_tlb_flush(reason, nr_pages); trace_tlb_flush(reason, nr_invalidate);
} else { } else {
/* Full flush. */ /* Full flush. */
local_flush_tlb(); local_flush_tlb();
...@@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info) ...@@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info)
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
} }
/*
 * Condition callback handed to on_each_cpu_cond_mask(): returns true
 * when @cpu does not have cpu_tlbstate.is_lazy set. @data is unused.
 */
static bool tlb_is_not_lazy(int cpu, void *data)
{
	bool lazy = per_cpu(cpu_tlbstate.is_lazy, cpu);

	return !lazy;
}
void native_flush_tlb_others(const struct cpumask *cpumask, void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info) const struct flush_tlb_info *info)
{ {
...@@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask, ...@@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1); (void *)info, 1);
return; return;
} }
smp_call_function_many(cpumask, flush_tlb_func_remote,
/*
* If no page tables were freed, we can skip sending IPIs to
* CPUs in lazy TLB mode. They will flush the CPU themselves
* at the next context switch.
*
* However, if page tables are getting freed, we need to send the
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
if (info->freed_tables)
smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1); (void *)info, 1);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
(void *)info, 1, GFP_ATOMIC, cpumask);
} }
/* /*
...@@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask, ...@@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag) unsigned long end, unsigned int stride_shift,
bool freed_tables)
{ {
int cpu; int cpu;
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm, .mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,
}; };
cpu = get_cpu(); cpu = get_cpu();
...@@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, ...@@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
/* Should we flush just the requested range? */ /* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) && if ((end != TLB_FLUSH_ALL) &&
!(vmflag & VM_HUGETLB) && ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
info.start = start; info.start = start;
info.end = end; info.end = end;
} else { } else {
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <linux/tick.h> #include <linux/tick.h>
#include <linux/nmi.h> #include <linux/nmi.h>
#include <linux/cpuhotplug.h> #include <linux/cpuhotplug.h>
#include <linux/stackprotector.h>
#include <asm/paravirt.h> #include <asm/paravirt.h>
#include <asm/desc.h> #include <asm/desc.h>
...@@ -88,6 +89,7 @@ static void cpu_bringup(void) ...@@ -88,6 +89,7 @@ static void cpu_bringup(void)
asmlinkage __visible void cpu_bringup_and_idle(void) asmlinkage __visible void cpu_bringup_and_idle(void)
{ {
cpu_bringup(); cpu_bringup();
boot_init_stack_canary();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
} }
......
...@@ -902,12 +902,22 @@ static bool copy_device_table(void) ...@@ -902,12 +902,22 @@ static bool copy_device_table(void)
} }
} }
old_devtb_phys = entry & PAGE_MASK; /*
* When SME is enabled in the first kernel, the entry includes the
* memory encryption mask (sme_me_mask). We must remove the memory
* encryption mask to obtain the true physical address in the kdump kernel.
*/
old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
if (old_devtb_phys >= 0x100000000ULL) { if (old_devtb_phys >= 0x100000000ULL) {
pr_err("The address of old device table is above 4G, not trustworthy!\n"); pr_err("The address of old device table is above 4G, not trustworthy!\n");
return false; return false;
} }
old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); old_devtb = (sme_active() && is_kdump_kernel())
? (__force void *)ioremap_encrypted(old_devtb_phys,
dev_table_size)
: memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
if (!old_devtb) if (!old_devtb)
return false; return false;
......
...@@ -24,6 +24,8 @@ ...@@ -24,6 +24,8 @@
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/mem_encrypt.h>
#include <asm/pgtable.h>
#include <asm/io.h> #include <asm/io.h>
#include "internal.h" #include "internal.h"
...@@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn) ...@@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn)
/* Reads a page from the oldmem device from given offset. */ /* Reads a page from the oldmem device from given offset. */
static ssize_t read_from_oldmem(char *buf, size_t count, static ssize_t read_from_oldmem(char *buf, size_t count,
u64 *ppos, int userbuf) u64 *ppos, int userbuf,
bool encrypted)
{ {
unsigned long pfn, offset; unsigned long pfn, offset;
size_t nr_bytes; size_t nr_bytes;
...@@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count, ...@@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
if (pfn_is_ram(pfn) == 0) if (pfn_is_ram(pfn) == 0)
memset(buf, 0, nr_bytes); memset(buf, 0, nr_bytes);
else { else {
tmp = copy_oldmem_page(pfn, buf, nr_bytes, if (encrypted)
offset, userbuf); tmp = copy_oldmem_page_encrypted(pfn, buf,
nr_bytes,
offset,
userbuf);
else
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
if (tmp < 0) if (tmp < 0)
return tmp; return tmp;
} }
...@@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr) ...@@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
*/ */
ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{ {
return read_from_oldmem(buf, count, ppos, 0); return read_from_oldmem(buf, count, ppos, 0, false);
} }
/* /*
...@@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) ...@@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/ */
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{ {
return read_from_oldmem(buf, count, ppos, 0); return read_from_oldmem(buf, count, ppos, 0, sme_active());
} }
/* /*
...@@ -173,9 +183,20 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, ...@@ -173,9 +183,20 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn, unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot) unsigned long size, pgprot_t prot)
{ {
prot = pgprot_encrypted(prot);
return remap_pfn_range(vma, from, pfn, size, prot); return remap_pfn_range(vma, from, pfn, size, prot);
} }
/*
 * copy_oldmem_page_encrypted - default handler for the "encrypted" oldmem
 * read path (the one read_from_oldmem() takes when its encrypted flag is set).
 *
 * Architectures which support memory encryption override this.
 *
 * This weak default forwards pfn, buf, csize, offset and userbuf unchanged
 * to copy_oldmem_page() and returns whatever that call returns, so without
 * an override the encrypted and regular paths behave the same.
 */
ssize_t __weak
copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
}
/* /*
* Copy to either kernel or user space * Copy to either kernel or user space
*/ */
...@@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, ...@@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
m->offset + m->size - *fpos, m->offset + m->size - *fpos,
buflen); buflen);
start = m->paddr + *fpos - m->offset; start = m->paddr + *fpos - m->offset;
tmp = read_from_oldmem(buffer, tsz, &start, userbuf); tmp = read_from_oldmem(buffer, tsz, &start,
userbuf, sme_active());
if (tmp < 0) if (tmp < 0)
return tmp; return tmp;
buflen -= tsz; buflen -= tsz;
......
...@@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, ...@@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma,
extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
unsigned long, int); unsigned long, int);
extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
size_t csize, unsigned long offset,
int userbuf);
void vmcore_cleanup(void); void vmcore_cleanup(void);
/* Architecture code defines this if there are other possible ELF /* Architecture code defines this if there are other possible ELF
......
...@@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), ...@@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait, smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags); gfp_t gfp_flags);
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags, const struct cpumask *mask);
int smp_call_function_single_async(int cpu, call_single_data_t *csd); int smp_call_function_single_async(int cpu, call_single_data_t *csd);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
......
...@@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, ...@@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
} }
} }
/* Ensure that these pages are decrypted if SME is enabled. */
if (pages)
arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
return pages; return pages;
} }
...@@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image, ...@@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = -ENOMEM; result = -ENOMEM;
goto out; goto out;
} }
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page); ptr = kmap(page);
ptr += maddr & ~PAGE_MASK; ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes, mchunk = min_t(size_t, mbytes,
...@@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image, ...@@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk); result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page); kexec_flush_icache_page(page);
kunmap(page); kunmap(page);
arch_kexec_pre_free_pages(page_address(page), 1);
if (result) { if (result) {
result = -EFAULT; result = -EFAULT;
goto out; goto out;
......
...@@ -318,33 +318,34 @@ int release_resource(struct resource *old) ...@@ -318,33 +318,34 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource); EXPORT_SYMBOL(release_resource);
/* /**
* Finds the lowest iomem resource existing within [res->start.res->end). * Finds the lowest iomem resource that covers part of [start..end]. The
* The caller must specify res->start, res->end, res->flags, and optionally * caller must specify start, end, flags, and desc (which may be
* desc. If found, returns 0, res is overwritten, if not found, returns -1. * IORES_DESC_NONE).
* This function walks the whole tree and not just first level children until *
* and unless first_level_children_only is true. * If a resource is found, returns 0 and *res is overwritten with the part
* of the resource that's within [start..end]; if none is found, returns
* -1.
*
* This function walks the whole tree and not just first level children
* unless @first_lvl is true.
*/ */
static int find_next_iomem_res(struct resource *res, unsigned long desc, static int find_next_iomem_res(resource_size_t start, resource_size_t end,
bool first_level_children_only) unsigned long flags, unsigned long desc,
bool first_lvl, struct resource *res)
{ {
resource_size_t start, end;
struct resource *p; struct resource *p;
bool sibling_only = false;
BUG_ON(!res); if (!res)
return -EINVAL;
start = res->start;
end = res->end;
BUG_ON(start >= end);
if (first_level_children_only) if (start >= end)
sibling_only = true; return -EINVAL;
read_lock(&resource_lock); read_lock(&resource_lock);
for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
if ((p->flags & res->flags) != res->flags) if ((p->flags & flags) != flags)
continue; continue;
if ((desc != IORES_DESC_NONE) && (desc != p->desc)) if ((desc != IORES_DESC_NONE) && (desc != p->desc))
continue; continue;
...@@ -352,45 +353,43 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, ...@@ -352,45 +353,43 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
p = NULL; p = NULL;
break; break;
} }
if ((p->end >= start) && (p->start < end)) if ((p->end >= start) && (p->start <= end))
break; break;
} }
read_unlock(&resource_lock); read_unlock(&resource_lock);
if (!p) if (!p)
return -1; return -1;
/* copy data */ /* copy data */
if (res->start < p->start) res->start = max(start, p->start);
res->start = p->start; res->end = min(end, p->end);
if (res->end > p->end)
res->end = p->end;
res->flags = p->flags; res->flags = p->flags;
res->desc = p->desc; res->desc = p->desc;
return 0; return 0;
} }
static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
bool first_level_children_only, unsigned long flags, unsigned long desc,
void *arg, bool first_lvl, void *arg,
int (*func)(struct resource *, void *)) int (*func)(struct resource *, void *))
{ {
u64 orig_end = res->end; struct resource res;
int ret = -1; int ret = -1;
while ((res->start < res->end) && while (start < end &&
!find_next_iomem_res(res, desc, first_level_children_only)) { !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
ret = (*func)(res, arg); ret = (*func)(&res, arg);
if (ret) if (ret)
break; break;
res->start = res->end + 1; start = res.end + 1;
res->end = orig_end;
} }
return ret; return ret;
} }
/* /**
* Walks through iomem resources and calls func() with matching resource * Walks through iomem resources and calls func() with matching resource
* ranges. This walks through whole tree and not just first level children. * ranges. This walks through whole tree and not just first level children.
* All the memory ranges which overlap start,end and also match flags and * All the memory ranges which overlap start,end and also match flags and
...@@ -407,13 +406,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, ...@@ -407,13 +406,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
u64 end, void *arg, int (*func)(struct resource *, void *)) u64 end, void *arg, int (*func)(struct resource *, void *))
{ {
struct resource res; return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
res.start = start;
res.end = end;
res.flags = flags;
return __walk_iomem_res_desc(&res, desc, false, arg, func);
} }
EXPORT_SYMBOL_GPL(walk_iomem_res_desc); EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
...@@ -425,15 +418,11 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc); ...@@ -425,15 +418,11 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
* ranges. * ranges.
*/ */
int walk_system_ram_res(u64 start, u64 end, void *arg, int walk_system_ram_res(u64 start, u64 end, void *arg,
int (*func)(struct resource *, void *)) int (*func)(struct resource *, void *))
{ {
struct resource res; unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
res.start = start; return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
res.end = end;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
arg, func); arg, func);
} }
...@@ -444,13 +433,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, ...@@ -444,13 +433,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
int walk_mem_res(u64 start, u64 end, void *arg, int walk_mem_res(u64 start, u64 end, void *arg,
int (*func)(struct resource *, void *)) int (*func)(struct resource *, void *))
{ {
struct resource res; unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
res.start = start;
res.end = end;
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
arg, func); arg, func);
} }
...@@ -462,27 +447,27 @@ int walk_mem_res(u64 start, u64 end, void *arg, ...@@ -462,27 +447,27 @@ int walk_mem_res(u64 start, u64 end, void *arg,
* It is to be used only for System RAM. * It is to be used only for System RAM.
*/ */
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *)) void *arg, int (*func)(unsigned long, unsigned long, void *))
{ {
resource_size_t start, end;
unsigned long flags;
struct resource res; struct resource res;
unsigned long pfn, end_pfn; unsigned long pfn, end_pfn;
u64 orig_end;
int ret = -1; int ret = -1;
res.start = (u64) start_pfn << PAGE_SHIFT; start = (u64) start_pfn << PAGE_SHIFT;
res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
orig_end = res.end; while (start < end &&
while ((res.start < res.end) && !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
(find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) { true, &res)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn) if (end_pfn > pfn)
ret = (*func)(pfn, end_pfn - pfn, arg); ret = (*func)(pfn, end_pfn - pfn, arg);
if (ret) if (ret)
break; break;
res.start = res.end + 1; start = res.end + 1;
res.end = orig_end;
} }
return ret; return ret;
} }
...@@ -658,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new, ...@@ -658,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new,
* @constraint: the size and alignment constraints to be met. * @constraint: the size and alignment constraints to be met.
*/ */
static int reallocate_resource(struct resource *root, struct resource *old, static int reallocate_resource(struct resource *root, struct resource *old,
resource_size_t newsize, resource_size_t newsize,
struct resource_constraint *constraint) struct resource_constraint *constraint)
{ {
int err=0; int err=0;
struct resource new = *old; struct resource new = *old;
...@@ -972,7 +957,7 @@ static int __adjust_resource(struct resource *res, resource_size_t start, ...@@ -972,7 +957,7 @@ static int __adjust_resource(struct resource *res, resource_size_t start,
* Existing children of the resource are assumed to be immutable. * Existing children of the resource are assumed to be immutable.
*/ */
int adjust_resource(struct resource *res, resource_size_t start, int adjust_resource(struct resource *res, resource_size_t start,
resource_size_t size) resource_size_t size)
{ {
int result; int result;
...@@ -983,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start, ...@@ -983,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start,
} }
EXPORT_SYMBOL(adjust_resource); EXPORT_SYMBOL(adjust_resource);
static void __init __reserve_region_with_split(struct resource *root, static void __init
resource_size_t start, resource_size_t end, __reserve_region_with_split(struct resource *root, resource_size_t start,
const char *name) resource_size_t end, const char *name)
{ {
struct resource *parent = root; struct resource *parent = root;
struct resource *conflict; struct resource *conflict;
...@@ -1044,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root, ...@@ -1044,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root,
} }
void __init reserve_region_with_split(struct resource *root, void __init
resource_size_t start, resource_size_t end, reserve_region_with_split(struct resource *root, resource_size_t start,
const char *name) resource_size_t end, const char *name)
{ {
int abort = 0; int abort = 0;
...@@ -1172,7 +1157,7 @@ EXPORT_SYMBOL(__request_region); ...@@ -1172,7 +1157,7 @@ EXPORT_SYMBOL(__request_region);
* The described resource region must match a currently busy region. * The described resource region must match a currently busy region.
*/ */
void __release_region(struct resource *parent, resource_size_t start, void __release_region(struct resource *parent, resource_size_t start,
resource_size_t n) resource_size_t n)
{ {
struct resource **p; struct resource **p;
resource_size_t end; resource_size_t end;
...@@ -1234,7 +1219,7 @@ EXPORT_SYMBOL(__release_region); ...@@ -1234,7 +1219,7 @@ EXPORT_SYMBOL(__release_region);
* simplicity. Enhance this logic when necessary. * simplicity. Enhance this logic when necessary.
*/ */
int release_mem_region_adjustable(struct resource *parent, int release_mem_region_adjustable(struct resource *parent,
resource_size_t start, resource_size_t size) resource_size_t start, resource_size_t size)
{ {
struct resource **p; struct resource **p;
struct resource *res; struct resource *res;
...@@ -1410,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data) ...@@ -1410,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data)
this->start == match->start && this->n == match->n; this->start == match->start && this->n == match->n;
} }
struct resource * __devm_request_region(struct device *dev, struct resource *
struct resource *parent, resource_size_t start, __devm_request_region(struct device *dev, struct resource *parent,
resource_size_t n, const char *name) resource_size_t start, resource_size_t n, const char *name)
{ {
struct region_devres *dr = NULL; struct region_devres *dr = NULL;
struct resource *res; struct resource *res;
......
...@@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle); ...@@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle);
void cpu_startup_entry(enum cpuhp_state state) void cpu_startup_entry(enum cpuhp_state state)
{ {
/*
* This #ifdef needs to die, but it's too late in the cycle to
* make this generic (ARM and SH have never invoked the canary
* init for the non boot CPUs!). Will be fixed in 3.11
*/
#ifdef CONFIG_X86
/*
* If we're the non-boot CPU, nothing set the stack canary up
* for us. The boot CPU already has it initialized but no harm
* in doing it again. This is a good place for updating it, as
* we wont ever return from this function (so the invalid
* canaries already on the stack wont ever trigger).
*/
boot_init_stack_canary();
#endif
arch_cpu_idle_prepare(); arch_cpu_idle_prepare();
cpuhp_online_idle(state); cpuhp_online_idle(state);
while (1) while (1)
......
...@@ -56,7 +56,6 @@ ...@@ -56,7 +56,6 @@
#include <linux/profile.h> #include <linux/profile.h>
#include <linux/rcupdate_wait.h> #include <linux/rcupdate_wait.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/stackprotector.h>
#include <linux/stop_machine.h> #include <linux/stop_machine.h>
#include <linux/suspend.h> #include <linux/suspend.h>
#include <linux/swait.h> #include <linux/swait.h>
......
...@@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); ...@@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* You must not call this function with disabled interrupts or * You must not call this function with disabled interrupts or
* from a hardware interrupt handler or from a bottom half handler. * from a hardware interrupt handler or from a bottom half handler.
*/ */
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait, smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags) gfp_t gfp_flags, const struct cpumask *mask)
{ {
cpumask_var_t cpus; cpumask_var_t cpus;
int cpu, ret; int cpu, ret;
...@@ -680,9 +680,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), ...@@ -680,9 +680,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
preempt_disable(); preempt_disable();
for_each_online_cpu(cpu) for_each_cpu(cpu, mask)
if (cond_func(cpu, info)) if (cond_func(cpu, info))
cpumask_set_cpu(cpu, cpus); __cpumask_set_cpu(cpu, cpus);
on_each_cpu_mask(cpus, func, info, wait); on_each_cpu_mask(cpus, func, info, wait);
preempt_enable(); preempt_enable();
free_cpumask_var(cpus); free_cpumask_var(cpus);
...@@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), ...@@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
* just have to IPI them one by one. * just have to IPI them one by one.
*/ */
preempt_disable(); preempt_disable();
for_each_online_cpu(cpu) for_each_cpu(cpu, mask)
if (cond_func(cpu, info)) { if (cond_func(cpu, info)) {
ret = smp_call_function_single(cpu, func, ret = smp_call_function_single(cpu, func,
info, wait); info, wait);
...@@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), ...@@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
preempt_enable(); preempt_enable();
} }
} }
EXPORT_SYMBOL(on_each_cpu_cond_mask);
/*
 * on_each_cpu_cond - on_each_cpu_cond_mask() applied to the online CPUs.
 *
 * Forwards cond_func, func, info, wait and gfp_flags unchanged to
 * on_each_cpu_cond_mask() and supplies cpu_online_mask as the set of CPUs
 * to consider.
 */
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
{
on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
cpu_online_mask);
}
EXPORT_SYMBOL(on_each_cpu_cond); EXPORT_SYMBOL(on_each_cpu_cond);
static void do_nothing(void *unused) static void do_nothing(void *unused)
......
...@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); ...@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* Preemption is disabled here to make sure the cond_func is called under the * Preemption is disabled here to make sure the cond_func is called under the
* same conditions in UP and SMP. * same conditions in UP and SMP.
*/ */
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait, smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags) gfp_t gfp_flags, const struct cpumask *mask)
{ {
unsigned long flags; unsigned long flags;
...@@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), ...@@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
} }
preempt_enable(); preempt_enable();
} }
EXPORT_SYMBOL(on_each_cpu_cond_mask);
/*
 * on_each_cpu_cond - on_each_cpu_cond_mask() called without a CPU mask.
 *
 * Forwards cond_func, func, info, wait and gfp_flags unchanged to
 * on_each_cpu_cond_mask() and passes NULL for the mask argument.
 *
 * NOTE(review): presumably this (uniprocessor) on_each_cpu_cond_mask()
 * never dereferences the mask; its full body is not visible here - confirm.
 */
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
{
on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
}
EXPORT_SYMBOL(on_each_cpu_cond); EXPORT_SYMBOL(on_each_cpu_cond);
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
*/ */
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <asm/tlb.h> #include <asm/tlb.h>
#include <asm-generic/pgtable.h> #include <asm-generic/pgtable.h>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment