Commit ae821d21 authored by Linus Torvalds

Merge tag 'x86_mm_for_v5.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm cleanups from Borislav Petkov:

 - PTRACE_GETREGS/PTRACE_PUTREGS regset selection cleanup

 - Another initial cleanup - more to follow - to the fault handling
   code.

 - Other minor cleanups and corrections.

* tag 'x86_mm_for_v5.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits)
  x86/{fault,efi}: Fix and rename efi_recover_from_page_fault()
  x86/fault: Don't run fixups for SMAP violations
  x86/fault: Don't look for extable entries for SMEP violations
  x86/fault: Rename no_context() to kernelmode_fixup_or_oops()
  x86/fault: Bypass no_context() for implicit kernel faults from usermode
  x86/fault: Split the OOPS code out from no_context()
  x86/fault: Improve kernel-executing-user-memory handling
  x86/fault: Correct a few user vs kernel checks wrt WRUSS
  x86/fault: Document the locking in the fault_signal_pending() path
  x86/fault/32: Move is_f00f_bug() to do_kern_addr_fault()
  x86/fault: Fold mm_fault_error() into do_user_addr_fault()
  x86/fault: Skip the AMD erratum #91 workaround on unaffected CPUs
  x86/fault: Fix AMD erratum #91 errata fixup for user code
  x86/Kconfig: Remove HPET_EMULATE_RTC depends on RTC
  x86/asm: Fixup TASK_SIZE_MAX comment
  x86/ptrace: Clean up PTRACE_GETREGS/PTRACE_PUTREGS regset selection
  x86/vm86/32: Remove VM86_SCREEN_BITMAP support
  x86: Remove definition of DEBUG
  x86/entry: Remove now unused do_IRQ() declaration
  x86/mm: Remove duplicate definition of _PAGE_PAT_LARGE
  ...
parents 1255f440 40c1fa52
...@@ -890,7 +890,7 @@ config HPET_TIMER ...@@ -890,7 +890,7 @@ config HPET_TIMER
config HPET_EMULATE_RTC config HPET_EMULATE_RTC
def_bool y def_bool y
depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
config APB_TIMER config APB_TIMER
def_bool y if X86_INTEL_MID def_bool y if X86_INTEL_MID
......
...@@ -139,7 +139,7 @@ extern void __init efi_dump_pagetable(void); ...@@ -139,7 +139,7 @@ extern void __init efi_dump_pagetable(void);
extern void __init efi_apply_memmap_quirks(void); extern void __init efi_apply_memmap_quirks(void);
extern int __init efi_reuse_config(u64 tables, int nr_tables); extern int __init efi_reuse_config(u64 tables, int nr_tables);
extern void efi_delete_dummy_variable(void); extern void efi_delete_dummy_variable(void);
extern void efi_recover_from_page_fault(unsigned long phys_addr); extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr);
extern void efi_free_boot_services(void); extern void efi_free_boot_services(void);
void efi_enter_mm(void); void efi_enter_mm(void);
......
...@@ -40,8 +40,6 @@ extern void native_init_IRQ(void); ...@@ -40,8 +40,6 @@ extern void native_init_IRQ(void);
extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs);
extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector);
extern void init_ISA_irqs(void); extern void init_ISA_irqs(void);
extern void __init init_IRQ(void); extern void __init init_IRQ(void);
......
...@@ -66,7 +66,7 @@ ...@@ -66,7 +66,7 @@
* On Intel CPUs, if a SYSCALL instruction is at the highest canonical * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
* address, then that syscall will enter the kernel with a * address, then that syscall will enter the kernel with a
* non-canonical return address, and SYSRET will explode dangerously. * non-canonical return address, and SYSRET will explode dangerously.
* We avoid this particular problem by preventing anything executable * We avoid this particular problem by preventing anything
* from being mapped at the maximum canonical address. * from being mapped at the maximum canonical address.
* *
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
......
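
The comment fix above is tied to how TASK_SIZE_MAX is derived. A minimal userspace sketch of that arithmetic, assuming the usual 4-level-paging constants (the macro names and values below are illustrative, not taken from this diff):

#include <stdio.h>

/* Assumed constants for 4-level paging; illustrative only. */
#define PAGE_SIZE_ASSUMED     4096UL
#define VIRTUAL_MASK_SHIFT    47

/*
 * Mirror of the TASK_SIZE_MAX idea: back off one guard page from the
 * top of the user canonical range, so nothing at all - executable or
 * not - can be mapped at the maximum canonical address.
 */
#define TASK_SIZE_MAX_SKETCH ((1UL << VIRTUAL_MASK_SHIFT) - PAGE_SIZE_ASSUMED)

int main(void)
{
        printf("highest mappable user address: %#lx\n", TASK_SIZE_MAX_SKETCH - 1);
        printf("guard page, never mapped:      %#lx\n", TASK_SIZE_MAX_SKETCH);
        return 0;
}

A SYSCALL issued from the last mappable byte therefore still returns to a canonical address, so the SYSRET problem described in the comment cannot trigger.
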
...@@ -177,8 +177,6 @@ enum page_cache_mode { ...@@ -177,8 +177,6 @@ enum page_cache_mode {
#define __pgprot(x) ((pgprot_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } )
#define __pg(x) __pgprot(x) #define __pg(x) __pgprot(x)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G) #define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G)
#define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0) #define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0)
#define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0) #define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0)
......
...@@ -36,7 +36,6 @@ struct vm86 { ...@@ -36,7 +36,6 @@ struct vm86 {
unsigned long saved_sp0; unsigned long saved_sp0;
unsigned long flags; unsigned long flags;
unsigned long screen_bitmap;
unsigned long cpu_type; unsigned long cpu_type;
struct revectored_struct int_revectored; struct revectored_struct int_revectored;
struct revectored_struct int21_revectored; struct revectored_struct int21_revectored;
......
...@@ -97,7 +97,7 @@ struct revectored_struct { ...@@ -97,7 +97,7 @@ struct revectored_struct {
struct vm86_struct { struct vm86_struct {
struct vm86_regs regs; struct vm86_regs regs;
unsigned long flags; unsigned long flags;
unsigned long screen_bitmap; unsigned long screen_bitmap; /* unused, preserved by vm86() */
unsigned long cpu_type; unsigned long cpu_type;
struct revectored_struct int_revectored; struct revectored_struct int_revectored;
struct revectored_struct int21_revectored; struct revectored_struct int21_revectored;
...@@ -106,7 +106,7 @@ struct vm86_struct { ...@@ -106,7 +106,7 @@ struct vm86_struct {
/* /*
* flags masks * flags masks
*/ */
#define VM86_SCREEN_BITMAP 0x0001 #define VM86_SCREEN_BITMAP 0x0001 /* no longer supported */
struct vm86plus_info_struct { struct vm86plus_info_struct {
unsigned long force_return_for_pic:1; unsigned long force_return_for_pic:1;
......
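
For context on the flag now documented as unsupported: after this series, passing VM86_SCREEN_BITMAP to the vm86() syscall is rejected with -EINVAL. A hedged, i386-only sketch of what a caller would now observe (it assumes <asm/vm86.h> provides VM86_ENTER and struct vm86plus_struct, as on mainline i386):

#include <asm/vm86.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

int main(void)
{
        struct vm86plus_struct v86;

        memset(&v86, 0, sizeof(v86));
        v86.flags = VM86_SCREEN_BITMAP;         /* the flag retired above */

        /* VM86_ENTER would normally switch the task into vm86 mode. */
        if (syscall(SYS_vm86, VM86_ENTER, &v86) == -1 && errno == EINVAL)
                printf("VM86_SCREEN_BITMAP is no longer supported\n");

        return 0;
}
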
...@@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void) ...@@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void)
if (!size_base) if (!size_base)
continue; continue;
size_base = to_size_factor(size_base, &size_factor), size_base = to_size_factor(size_base, &size_factor);
start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
start_base = to_size_factor(start_base, &start_factor), start_base = to_size_factor(start_base, &start_factor);
type = range_state[i].type; type = range_state[i].type;
pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
......
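
The cleanup.c hunk above is a pure typo fix: both calls ended in ',' instead of ';'. The code compiled and behaved the same because the trailing comma operator simply glued the next assignment into the same expression statement. A small standalone illustration (not kernel code):

#include <stdio.h>

int main(void)
{
        int a, b;

        a = 1,          /* comma operator: evaluate left, then right */
        b = 2;          /* both assignments form one statement */

        printf("a=%d b=%d\n", a, b);
        return 0;
}

Behaviour is identical either way, which is why the MTRR typo went unnoticed; it was only misleading to read.
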
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
* because MTRRs can span up to 40 bits (36bits on most modern x86) * because MTRRs can span up to 40 bits (36bits on most modern x86)
*/ */
#define DEBUG
#include <linux/export.h> #include <linux/export.h>
#include <linux/init.h> #include <linux/init.h>
......
...@@ -31,8 +31,6 @@ ...@@ -31,8 +31,6 @@
System Programming Guide; Section 9.11. (1997 edition - PPro). System Programming Guide; Section 9.11. (1997 edition - PPro).
*/ */
#define DEBUG
#include <linux/types.h> /* FIXME: kvm_para.h needs this */ #include <linux/types.h> /* FIXME: kvm_para.h needs this */
#include <linux/stop_machine.h> #include <linux/stop_machine.h>
......
...@@ -4,9 +4,6 @@ ...@@ -4,9 +4,6 @@
#include <linux/string.h> #include <linux/string.h>
#include <linux/kallsyms.h> #include <linux/kallsyms.h>
#define DEBUG 1
static struct iommu_table_entry * __init static struct iommu_table_entry * __init
find_dependents_of(struct iommu_table_entry *start, find_dependents_of(struct iommu_table_entry *start,
struct iommu_table_entry *finish, struct iommu_table_entry *finish,
......
...@@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child) ...@@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child)
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static const struct user_regset_view user_x86_32_view; /* Initialized below. */ static const struct user_regset_view user_x86_32_view; /* Initialized below. */
#endif #endif
#ifdef CONFIG_X86_64
static const struct user_regset_view user_x86_64_view; /* Initialized below. */
#endif
long arch_ptrace(struct task_struct *child, long request, long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data) unsigned long addr, unsigned long data)
...@@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request, ...@@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request,
int ret; int ret;
unsigned long __user *datap = (unsigned long __user *)data; unsigned long __user *datap = (unsigned long __user *)data;
#ifdef CONFIG_X86_64
/* This is native 64-bit ptrace() */
const struct user_regset_view *regset_view = &user_x86_64_view;
#else
/* This is native 32-bit ptrace() */
const struct user_regset_view *regset_view = &user_x86_32_view;
#endif
switch (request) { switch (request) {
/* read the word at location addr in the USER area. */ /* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: { case PTRACE_PEEKUSR: {
...@@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request, ...@@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request,
case PTRACE_GETREGS: /* Get all gp regs from the child. */ case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child, return copy_regset_to_user(child,
task_user_regset_view(current), regset_view,
REGSET_GENERAL, REGSET_GENERAL,
0, sizeof(struct user_regs_struct), 0, sizeof(struct user_regs_struct),
datap); datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */ case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child, return copy_regset_from_user(child,
task_user_regset_view(current), regset_view,
REGSET_GENERAL, REGSET_GENERAL,
0, sizeof(struct user_regs_struct), 0, sizeof(struct user_regs_struct),
datap); datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */ case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child, return copy_regset_to_user(child,
task_user_regset_view(current), regset_view,
REGSET_FP, REGSET_FP,
0, sizeof(struct user_i387_struct), 0, sizeof(struct user_i387_struct),
datap); datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */ case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child, return copy_regset_from_user(child,
task_user_regset_view(current), regset_view,
REGSET_FP, REGSET_FP,
0, sizeof(struct user_i387_struct), 0, sizeof(struct user_i387_struct),
datap); datap);
...@@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child, ...@@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child,
case PTRACE_GETREGS: /* Get all gp regs from the child. */ case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child, return copy_regset_to_user(child,
task_user_regset_view(current), &user_x86_64_view,
REGSET_GENERAL, REGSET_GENERAL,
0, sizeof(struct user_regs_struct), 0, sizeof(struct user_regs_struct),
datap); datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */ case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child, return copy_regset_from_user(child,
task_user_regset_view(current), &user_x86_64_view,
REGSET_GENERAL, REGSET_GENERAL,
0, sizeof(struct user_regs_struct), 0, sizeof(struct user_regs_struct),
datap); datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */ case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child, return copy_regset_to_user(child,
task_user_regset_view(current), &user_x86_64_view,
REGSET_FP, REGSET_FP,
0, sizeof(struct user_i387_struct), 0, sizeof(struct user_i387_struct),
datap); datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */ case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child, return copy_regset_from_user(child,
task_user_regset_view(current), &user_x86_64_view,
REGSET_FP, REGSET_FP,
0, sizeof(struct user_i387_struct), 0, sizeof(struct user_i387_struct),
datap); datap);
...@@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) ...@@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
} }
/*
* This is used by the core dump code to decide which regset to dump. The
* core dump code writes out the resulting .e_machine and the corresponding
* regsets. This is suboptimal if the task is messing around with its CS.L
* field, but at worst the core dump will end up missing some information.
*
* Unfortunately, it is also used by the broken PTRACE_GETREGSET and
* PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have
* no way to make sure that the e_machine they use matches the caller's
* expectations. The result is that the data format returned by
* PTRACE_GETREGSET depends on the returned CS field (and even the offset
* of the returned CS field depends on its value!) and the data format
* accepted by PTRACE_SETREGSET is determined by the old CS value. The
* upshot is that it is basically impossible to use these APIs correctly.
*
* The best way to fix it in the long run would probably be to add new
* improved ptrace() APIs to read and write registers reliably, possibly by
* allowing userspace to select the ELF e_machine variant that they expect.
*/
const struct user_regset_view *task_user_regset_view(struct task_struct *task) const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{ {
#ifdef CONFIG_IA32_EMULATION #ifdef CONFIG_IA32_EMULATION
......
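
As a concrete illustration of the ambiguity the new comment describes, here is a hedged sketch of a tracer reading the general-purpose regset with PTRACE_GETREGSET. The byte layout and length it gets back depend on which regset view the kernel chose for the tracee; 'pid' is assumed to be ptrace-stopped already:

#include <stdio.h>
#include <elf.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/user.h>

/* NT_PRSTATUS selects REGSET_GENERAL in the view picked for the tracee. */
static int dump_gp_regs(pid_t pid)
{
        struct user_regs_struct regs;
        struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

        if (ptrace(PTRACE_GETREGSET, pid, (void *)NT_PRSTATUS, &iov) == -1)
                return -1;

        /* iov.iov_len now says how many bytes were actually filled in;
         * a 32-bit tracee yields a shorter, differently laid out blob. */
        printf("got %zu bytes of registers\n", iov.iov_len);
        return 0;
}
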
...@@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, ...@@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags, unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off) unsigned long, fd, unsigned long, off)
{ {
long error;
error = -EINVAL;
if (off & ~PAGE_MASK) if (off & ~PAGE_MASK)
goto out; return -EINVAL;
error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
out:
return error;
} }
static void find_start_end(unsigned long addr, unsigned long flags, static void find_start_end(unsigned long addr, unsigned long flags,
......
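
The sys_x86_64.c hunk only simplifies control flow; the user-visible contract is unchanged. A quick sketch of that contract on x86-64, namely that a non-page-aligned file offset is refused with EINVAL before ksys_mmap_pgoff() is ever reached (the file path is just an arbitrary readable file):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/etc/hostname", O_RDONLY);
        void *p;

        if (fd < 0)
                return 1;

        p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 1 /* unaligned */);
        if (p == MAP_FAILED && errno == EINVAL)
                printf("unaligned offset rejected: %s\n", strerror(errno));

        close(fd);
        return 0;
}
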
...@@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) ...@@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); unsafe_put_user(regs->ds, &user->regs.ds, Efault_end);
unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); unsafe_put_user(regs->fs, &user->regs.fs, Efault_end);
unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); unsafe_put_user(regs->gs, &user->regs.gs, Efault_end);
unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end);
/*
* Don't write screen_bitmap in case some user had a value there
* and expected it to remain unchanged.
*/
user_access_end(); user_access_end();
...@@ -160,49 +164,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) ...@@ -160,49 +164,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
do_exit(SIGSEGV); do_exit(SIGSEGV);
} }
static void mark_screen_rdonly(struct mm_struct *mm)
{
struct vm_area_struct *vma;
spinlock_t *ptl;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
int i;
mmap_write_lock(mm);
pgd = pgd_offset(mm, 0xA0000);
if (pgd_none_or_clear_bad(pgd))
goto out;
p4d = p4d_offset(pgd, 0xA0000);
if (p4d_none_or_clear_bad(p4d))
goto out;
pud = pud_offset(p4d, 0xA0000);
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
if (pmd_trans_huge(*pmd)) {
vma = find_vma(mm, 0xA0000);
split_huge_pmd(vma, pmd, 0xA0000);
}
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
for (i = 0; i < 32; i++) {
if (pte_present(*pte))
set_pte(pte, pte_wrprotect(*pte));
pte++;
}
pte_unmap_unlock(pte, ptl);
out:
mmap_write_unlock(mm);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
}
static int do_vm86_irq_handling(int subfunction, int irqnumber); static int do_vm86_irq_handling(int subfunction, int irqnumber);
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
...@@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) ...@@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
offsetof(struct vm86_struct, int_revectored))) offsetof(struct vm86_struct, int_revectored)))
return -EFAULT; return -EFAULT;
/* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
if (v.flags & VM86_SCREEN_BITMAP) {
char comm[TASK_COMM_LEN];
pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current));
return -EINVAL;
}
memset(&vm86regs, 0, sizeof(vm86regs)); memset(&vm86regs, 0, sizeof(vm86regs));
vm86regs.pt.bx = v.regs.ebx; vm86regs.pt.bx = v.regs.ebx;
...@@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) ...@@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
vm86regs.gs = v.regs.gs; vm86regs.gs = v.regs.gs;
vm86->flags = v.flags; vm86->flags = v.flags;
vm86->screen_bitmap = v.screen_bitmap;
vm86->cpu_type = v.cpu_type; vm86->cpu_type = v.cpu_type;
if (copy_from_user(&vm86->int_revectored, if (copy_from_user(&vm86->int_revectored,
...@@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) ...@@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
update_task_stack(tsk); update_task_stack(tsk);
preempt_enable(); preempt_enable();
if (vm86->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);
memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
return regs->ax; return regs->ax;
} }
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include <linux/prefetch.h> /* prefetchw */ #include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */ #include <linux/uaccess.h> /* faulthandler_disabled() */
#include <linux/efi.h> /* efi_recover_from_page_fault()*/ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h> #include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/cpufeature.h> /* boot_cpu_has, ... */
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
#include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */ #include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/mmu_context.h> /* vma_pkey() */
#include <asm/efi.h> /* efi_recover_from_page_fault()*/ #include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h> /* store_idt(), ... */ #include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/cpu_entry_area.h> /* exception stack */
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
...@@ -54,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr) ...@@ -54,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
* 32-bit mode: * 32-bit mode:
* *
* Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
* Check that here and ignore it. * Check that here and ignore it. This is AMD erratum #91.
* *
* 64-bit mode: * 64-bit mode:
* *
...@@ -83,11 +83,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, ...@@ -83,11 +83,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
case 0x40: case 0x40:
/* /*
* In AMD64 long mode 0x40..0x4F are valid REX prefixes * In 64-bit mode 0x40..0x4F are valid REX prefixes
* Need to figure out under what instruction mode the
* instruction was issued. Could check the LDT for lm,
* but for now it's good enough to assume that long
* mode only uses well known segments or kernel.
*/ */
return (!user_mode(regs) || user_64bit_mode(regs)); return (!user_mode(regs) || user_64bit_mode(regs));
#endif #endif
...@@ -110,6 +106,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, ...@@ -110,6 +106,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
} }
} }
static bool is_amd_k8_pre_npt(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
c->x86_vendor == X86_VENDOR_AMD &&
c->x86 == 0xf && c->x86_model < 0x40);
}
static int static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{ {
...@@ -117,6 +122,10 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) ...@@ -117,6 +122,10 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
unsigned char *instr; unsigned char *instr;
int prefetch = 0; int prefetch = 0;
/* Erratum #91 affects AMD K8, pre-NPT CPUs */
if (!is_amd_k8_pre_npt())
return 0;
/* /*
* If it was a exec (instruction fetch) fault on NX page, then * If it was a exec (instruction fetch) fault on NX page, then
* do not ignore the fault: * do not ignore the fault:
...@@ -127,20 +136,31 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) ...@@ -127,20 +136,31 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
instr = (void *)convert_ip_to_linear(current, regs); instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15; max_instr = instr + 15;
if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) /*
return 0; * This code has historically always bailed out if IP points to a
* not-present page (e.g. due to a race). No one has ever
* complained about this.
*/
pagefault_disable();
while (instr < max_instr) { while (instr < max_instr) {
unsigned char opcode; unsigned char opcode;
if (get_kernel_nofault(opcode, instr)) if (user_mode(regs)) {
break; if (get_user(opcode, instr))
break;
} else {
if (get_kernel_nofault(opcode, instr))
break;
}
instr++; instr++;
if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break; break;
} }
pagefault_enable();
return prefetch; return prefetch;
} }
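
The loop above walks at most 15 opcode bytes at the faulting IP, skipping prefixes, to decide whether the "fault" is really erratum #91 misfiring on a PREFETCH instruction. A simplified userspace model of that scan, assuming 64-bit code and using hand-assembled byte sequences for the two prefetch encodings (0F 18 and 0F 0D):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool looks_like_prefetch(const unsigned char *ip, size_t len)
{
        size_t i = 0;

        /* Skip segment overrides, operand/address size, lock/rep and REX. */
        while (i < len && i < 15) {
                unsigned char b = ip[i];

                if (b == 0x26 || b == 0x2E || b == 0x36 || b == 0x3E ||
                    b == 0x64 || b == 0x65 || b == 0x66 || b == 0x67 ||
                    b == 0xF0 || b == 0xF2 || b == 0xF3 ||
                    (b >= 0x40 && b <= 0x4F)) {     /* REX, valid in 64-bit mode */
                        i++;
                        continue;
                }
                break;
        }

        return i + 1 < len && ip[i] == 0x0F &&
               (ip[i + 1] == 0x0D || ip[i + 1] == 0x18);
}

int main(void)
{
        unsigned char prefetchnta[]   = { 0x0F, 0x18, 0x06 };       /* prefetchnta (rsi) */
        unsigned char rex_prefetchw[] = { 0x41, 0x0F, 0x0D, 0x08 }; /* prefetchw (r8) */

        printf("%d %d\n",
               looks_like_prefetch(prefetchnta, sizeof(prefetchnta)),
               looks_like_prefetch(rex_prefetchw, sizeof(rex_prefetchw)));
        return 0;
}
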
...@@ -262,25 +282,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) ...@@ -262,25 +282,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
} }
} }
/*
* Did it hit the DOS screen memory VA from vm86 mode?
*/
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
struct task_struct *tsk)
{
#ifdef CONFIG_VM86
unsigned long bit;
if (!v8086_mode(regs) || !tsk->thread.vm86)
return;
bit = (address - 0xA0000) >> PAGE_SHIFT;
if (bit < 32)
tsk->thread.vm86->screen_bitmap |= 1 << bit;
#endif
}
static bool low_pfn(unsigned long pfn) static bool low_pfn(unsigned long pfn)
{ {
return pfn < max_low_pfn; return pfn < max_low_pfn;
...@@ -335,15 +336,6 @@ KERN_ERR ...@@ -335,15 +336,6 @@ KERN_ERR
"******* Disabling USB legacy in the BIOS may also help.\n"; "******* Disabling USB legacy in the BIOS may also help.\n";
#endif #endif
/*
* No vm86 mode in 64-bit mode:
*/
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
struct task_struct *tsk)
{
}
static int bad_address(void *p) static int bad_address(void *p)
{ {
unsigned long dummy; unsigned long dummy;
...@@ -427,6 +419,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) ...@@ -427,6 +419,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
|| boot_cpu_data.x86 != 0xf) || boot_cpu_data.x86 != 0xf)
return 0; return 0;
if (user_mode(regs))
return 0;
if (address != regs->ip) if (address != regs->ip)
return 0; return 0;
...@@ -462,10 +457,12 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) ...@@ -462,10 +457,12 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
} }
/* Pentium F0 0F C7 C8 bug workaround: */ /* Pentium F0 0F C7 C8 bug workaround: */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address) static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{ {
#ifdef CONFIG_X86_F00F_BUG #ifdef CONFIG_X86_F00F_BUG
if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) { if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
idt_is_f00f_address(address)) {
handle_invalid_op(regs); handle_invalid_op(regs);
return 1; return 1;
} }
...@@ -630,53 +627,20 @@ static void set_signal_archinfo(unsigned long address, ...@@ -630,53 +627,20 @@ static void set_signal_archinfo(unsigned long address,
} }
static noinline void static noinline void
no_context(struct pt_regs *regs, unsigned long error_code, page_fault_oops(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int signal, int si_code) unsigned long address)
{ {
struct task_struct *tsk = current;
unsigned long flags; unsigned long flags;
int sig; int sig;
if (user_mode(regs)) { if (user_mode(regs)) {
/* /*
* This is an implicit supervisor-mode access from user * Implicit kernel access from user mode? Skip the stack
* mode. Bypass all the kernel-mode recovery code and just * overflow and EFI special cases.
* OOPS.
*/ */
goto oops; goto oops;
} }
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
/*
* Any interrupt that takes a fault gets the fixup. This makes
* the below recursive fault logic only apply to a faults from
* task context.
*/
if (in_interrupt())
return;
/*
* Per the above we're !in_interrupt(), aka. task context.
*
* In this case we need to make sure we're not recursively
* faulting through the emulate_vsyscall() logic.
*/
if (current->thread.sig_on_uaccess_err && signal) {
sanitize_error_code(address, &error_code);
set_signal_archinfo(address, error_code);
/* XXX: hwpoison faults will set the wrong code. */
force_sig_fault(signal, si_code, (void __user *)address);
}
/*
* Barring that, we can do the fixup and be happy.
*/
return;
}
#ifdef CONFIG_VMAP_STACK #ifdef CONFIG_VMAP_STACK
/* /*
* Stack overflow? During boot, we can fault near the initial * Stack overflow? During boot, we can fault near the initial
...@@ -684,8 +648,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, ...@@ -684,8 +648,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
* that we're in vmalloc space to avoid this. * that we're in vmalloc space to avoid this.
*/ */
if (is_vmalloc_addr((void *)address) && if (is_vmalloc_addr((void *)address) &&
(((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
/* /*
* We're likely to be running with very little stack space * We're likely to be running with very little stack space
...@@ -709,28 +673,12 @@ no_context(struct pt_regs *regs, unsigned long error_code, ...@@ -709,28 +673,12 @@ no_context(struct pt_regs *regs, unsigned long error_code,
#endif #endif
/* /*
* 32-bit: * Buggy firmware could access regions which might page fault. If
* * this happens, EFI has a special OOPS path that will try to
* Valid to do another page fault here, because if this fault * avoid hanging the system.
* had been triggered by is_prefetch fixup_exception would have
* handled it.
*
* 64-bit:
*
* Hall of shame of CPU/BIOS bugs.
*/
if (is_prefetch(regs, error_code, address))
return;
if (is_errata93(regs, address))
return;
/*
* Buggy firmware could access regions which might page fault, try to
* recover from such faults.
*/ */
if (IS_ENABLED(CONFIG_EFI)) if (IS_ENABLED(CONFIG_EFI))
efi_recover_from_page_fault(address); efi_crash_gracefully_on_page_fault(address);
oops: oops:
/* /*
...@@ -741,7 +689,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, ...@@ -741,7 +689,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
show_fault_oops(regs, error_code, address); show_fault_oops(regs, error_code, address);
if (task_stack_end_corrupted(tsk)) if (task_stack_end_corrupted(current))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
sig = SIGKILL; sig = SIGKILL;
...@@ -754,6 +702,53 @@ no_context(struct pt_regs *regs, unsigned long error_code, ...@@ -754,6 +702,53 @@ no_context(struct pt_regs *regs, unsigned long error_code,
oops_end(flags, regs, sig); oops_end(flags, regs, sig);
} }
static noinline void
kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int signal, int si_code)
{
WARN_ON_ONCE(user_mode(regs));
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
/*
* Any interrupt that takes a fault gets the fixup. This makes
* the below recursive fault logic only apply to a faults from
* task context.
*/
if (in_interrupt())
return;
/*
* Per the above we're !in_interrupt(), aka. task context.
*
* In this case we need to make sure we're not recursively
* faulting through the emulate_vsyscall() logic.
*/
if (current->thread.sig_on_uaccess_err && signal) {
sanitize_error_code(address, &error_code);
set_signal_archinfo(address, error_code);
/* XXX: hwpoison faults will set the wrong code. */
force_sig_fault(signal, si_code, (void __user *)address);
}
/*
* Barring that, we can do the fixup and be happy.
*/
return;
}
/*
* AMD erratum #91 manifests as a spurious page fault on a PREFETCH
* instruction.
*/
if (is_prefetch(regs, error_code, address))
return;
page_fault_oops(regs, error_code, address);
}
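
kernelmode_fixup_or_oops() above leans on the exception-table machinery: fixup_exception() looks the faulting IP up in a table of (faulting instruction, recovery address) pairs emitted next to every whitelisted access. A toy model of that lookup, with made-up addresses rather than the kernel's real sorted table:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct extable_entry {
        unsigned long insn;     /* address that may fault */
        unsigned long fixup;    /* where to resume on a fault */
};

static const struct extable_entry extable[] = {
        { 0x1000, 0x2000 },
        { 0x1040, 0x2040 },
};

static bool fixup_fault(unsigned long *ip)
{
        for (size_t i = 0; i < sizeof(extable) / sizeof(extable[0]); i++) {
                if (extable[i].insn == *ip) {
                        *ip = extable[i].fixup; /* "return" into the handler */
                        return true;
                }
        }
        return false;           /* no fixup: the caller must oops */
}

int main(void)
{
        unsigned long ip = 0x1040;

        if (fixup_fault(&ip))
                printf("fault fixed up, resuming at %#lx\n", ip);
        return 0;
}
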
/* /*
* Print out info about fatal segfaults, if the show_unhandled_signals * Print out info about fatal segfaults, if the show_unhandled_signals
* sysctl is set: * sysctl is set:
...@@ -796,47 +791,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, ...@@ -796,47 +791,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
{ {
struct task_struct *tsk = current; struct task_struct *tsk = current;
/* User mode accesses just cause a SIGSEGV */ if (!user_mode(regs)) {
if (user_mode(regs) && (error_code & X86_PF_USER)) { kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code);
/* return;
* It's possible to have interrupts off here: }
*/
local_irq_enable();
/*
* Valid to do another page fault here because this one came
* from user space:
*/
if (is_prefetch(regs, error_code, address))
return;
if (is_errata100(regs, address)) if (!(error_code & X86_PF_USER)) {
return; /* Implicit user access to kernel memory -- just oops */
page_fault_oops(regs, error_code, address);
return;
}
sanitize_error_code(address, &error_code); /*
* User mode accesses just cause a SIGSEGV.
* It's possible to have interrupts off here:
*/
local_irq_enable();
if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) /*
return; * Valid to do another page fault here because this one came
* from user space:
*/
if (is_prefetch(regs, error_code, address))
return;
if (likely(show_unhandled_signals)) if (is_errata100(regs, address))
show_signal_msg(regs, error_code, address, tsk); return;
set_signal_archinfo(address, error_code); sanitize_error_code(address, &error_code);
if (si_code == SEGV_PKUERR) if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
force_sig_pkuerr((void __user *)address, pkey); return;
force_sig_fault(SIGSEGV, si_code, (void __user *)address); if (likely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
local_irq_disable(); set_signal_archinfo(address, error_code);
return; if (si_code == SEGV_PKUERR)
} force_sig_pkuerr((void __user *)address, pkey);
if (is_f00f_bug(regs, address)) force_sig_fault(SIGSEGV, si_code, (void __user *)address);
return;
no_context(regs, error_code, address, SIGSEGV, si_code); local_irq_disable();
} }
static noinline void static noinline void
...@@ -926,8 +923,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, ...@@ -926,8 +923,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
vm_fault_t fault) vm_fault_t fault)
{ {
/* Kernel mode? Handle exceptions or die: */ /* Kernel mode? Handle exceptions or die: */
if (!(error_code & X86_PF_USER)) { if (!user_mode(regs)) {
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR);
return; return;
} }
...@@ -961,40 +958,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, ...@@ -961,40 +958,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
} }
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, vm_fault_t fault)
{
if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
return;
}
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & X86_PF_USER)) {
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
return;
}
/*
* We ran out of memory, call the OOM killer, and return the
* userspace (which will retry the fault, or kill us if we got
* oom-killed):
*/
pagefault_out_of_memory();
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
do_sigbus(regs, error_code, address, fault);
else if (fault & VM_FAULT_SIGSEGV)
bad_area_nosemaphore(regs, error_code, address);
else
BUG();
}
}
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{ {
if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
...@@ -1209,6 +1172,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, ...@@ -1209,6 +1172,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
} }
#endif #endif
if (is_f00f_bug(regs, hw_error_code, address))
return;
/* Was the fault spurious, caused by lazy TLB invalidation? */ /* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address)) if (spurious_kernel_fault(hw_error_code, address))
return; return;
...@@ -1229,10 +1195,17 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, ...@@ -1229,10 +1195,17 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
} }
NOKPROBE_SYMBOL(do_kern_addr_fault); NOKPROBE_SYMBOL(do_kern_addr_fault);
/* Handle faults in the user portion of the address space */ /*
* Handle faults in the user portion of the address space. Nothing in here
* should check X86_PF_USER without a specific justification: for almost
* all purposes, we should treat a normal kernel access to user memory
* (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
* The one exception is AC flag handling, which is, per the x86
* architecture, special for WRUSS.
*/
static inline static inline
void do_user_addr_fault(struct pt_regs *regs, void do_user_addr_fault(struct pt_regs *regs,
unsigned long hw_error_code, unsigned long error_code,
unsigned long address) unsigned long address)
{ {
struct vm_area_struct *vma; struct vm_area_struct *vma;
...@@ -1244,6 +1217,21 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1244,6 +1217,21 @@ void do_user_addr_fault(struct pt_regs *regs,
tsk = current; tsk = current;
mm = tsk->mm; mm = tsk->mm;
if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
/*
* Whoops, this is kernel mode code trying to execute from
* user memory. Unless this is AMD erratum #93, which
* corrupts RIP such that it looks like a user address,
* this is unrecoverable. Don't even try to look up the
* VMA or look for extable entries.
*/
if (is_errata93(regs, address))
return;
page_fault_oops(regs, error_code, address);
return;
}
/* kprobes don't want to hook the spurious faults: */ /* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF))) if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
return; return;
...@@ -1252,8 +1240,8 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1252,8 +1240,8 @@ void do_user_addr_fault(struct pt_regs *regs,
* Reserved bits are never expected to be set on * Reserved bits are never expected to be set on
* entries in the user portion of the page tables. * entries in the user portion of the page tables.
*/ */
if (unlikely(hw_error_code & X86_PF_RSVD)) if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, hw_error_code, address); pgtable_bad(regs, error_code, address);
/* /*
* If SMAP is on, check for invalid kernel (supervisor) access to user * If SMAP is on, check for invalid kernel (supervisor) access to user
...@@ -1263,10 +1251,13 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1263,10 +1251,13 @@ void do_user_addr_fault(struct pt_regs *regs,
* enforcement appears to be consistent with the USER bit. * enforcement appears to be consistent with the USER bit.
*/ */
if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
!(hw_error_code & X86_PF_USER) && !(error_code & X86_PF_USER) &&
!(regs->flags & X86_EFLAGS_AC))) !(regs->flags & X86_EFLAGS_AC))) {
{ /*
bad_area_nosemaphore(regs, hw_error_code, address); * No extable entry here. This was a kernel access to an
* invalid pointer. get_kernel_nofault() will not get here.
*/
page_fault_oops(regs, error_code, address);
return; return;
} }
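
The reason a SMAP violation can oops directly is that every legitimate kernel access to user memory runs with EFLAGS.AC set, so it never reaches this check and still gets its extable fixup. A hedged kernel-style sketch of such a path using the standard uaccess helpers (this function is illustrative and not part of the diff):

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>

static int read_user_byte(const u8 __user *uptr, u8 *val)
{
        if (!user_access_begin(uptr, sizeof(*uptr)))    /* stac() + range check */
                return -EFAULT;

        unsafe_get_user(*val, uptr, efault);            /* may fault; covered by a fixup */
        user_access_end();                              /* clac() */
        return 0;

efault:
        user_access_end();
        return -EFAULT;
}
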
...@@ -1275,7 +1266,7 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1275,7 +1266,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* in a region with pagefaults disabled then we must not take the fault * in a region with pagefaults disabled then we must not take the fault
*/ */
if (unlikely(faulthandler_disabled() || !mm)) { if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, hw_error_code, address); bad_area_nosemaphore(regs, error_code, address);
return; return;
} }
...@@ -1296,9 +1287,9 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1296,9 +1287,9 @@ void do_user_addr_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (hw_error_code & X86_PF_WRITE) if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE; flags |= FAULT_FLAG_WRITE;
if (hw_error_code & X86_PF_INSTR) if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION; flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
...@@ -1314,7 +1305,7 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1314,7 +1305,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* to consider the PF_PK bit. * to consider the PF_PK bit.
*/ */
if (is_vsyscall_vaddr(address)) { if (is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(hw_error_code, regs, address)) if (emulate_vsyscall(error_code, regs, address))
return; return;
} }
#endif #endif
...@@ -1337,7 +1328,7 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1337,7 +1328,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* Fault from code in kernel from * Fault from code in kernel from
* which we do not expect faults. * which we do not expect faults.
*/ */
bad_area_nosemaphore(regs, hw_error_code, address); bad_area_nosemaphore(regs, error_code, address);
return; return;
} }
retry: retry:
...@@ -1353,17 +1344,17 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1353,17 +1344,17 @@ void do_user_addr_fault(struct pt_regs *regs,
vma = find_vma(mm, address); vma = find_vma(mm, address);
if (unlikely(!vma)) { if (unlikely(!vma)) {
bad_area(regs, hw_error_code, address); bad_area(regs, error_code, address);
return; return;
} }
if (likely(vma->vm_start <= address)) if (likely(vma->vm_start <= address))
goto good_area; goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
bad_area(regs, hw_error_code, address); bad_area(regs, error_code, address);
return; return;
} }
if (unlikely(expand_stack(vma, address))) { if (unlikely(expand_stack(vma, address))) {
bad_area(regs, hw_error_code, address); bad_area(regs, error_code, address);
return; return;
} }
...@@ -1372,8 +1363,8 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1372,8 +1363,8 @@ void do_user_addr_fault(struct pt_regs *regs,
* we can handle it.. * we can handle it..
*/ */
good_area: good_area:
if (unlikely(access_error(hw_error_code, vma))) { if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, hw_error_code, address, vma); bad_area_access_error(regs, error_code, address, vma);
return; return;
} }
...@@ -1392,11 +1383,14 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1392,11 +1383,14 @@ void do_user_addr_fault(struct pt_regs *regs,
*/ */
fault = handle_mm_fault(vma, address, flags, regs); fault = handle_mm_fault(vma, address, flags, regs);
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) { if (fault_signal_pending(fault, regs)) {
/*
* Quick path to respond to signals. The core mm code
* has unlocked the mm for us if we get here.
*/
if (!user_mode(regs)) if (!user_mode(regs))
no_context(regs, hw_error_code, address, SIGBUS, kernelmode_fixup_or_oops(regs, error_code, address,
BUS_ADRERR); SIGBUS, BUS_ADRERR);
return; return;
} }
...@@ -1412,12 +1406,37 @@ void do_user_addr_fault(struct pt_regs *regs, ...@@ -1412,12 +1406,37 @@ void do_user_addr_fault(struct pt_regs *regs,
} }
mmap_read_unlock(mm); mmap_read_unlock(mm);
if (unlikely(fault & VM_FAULT_ERROR)) { if (likely(!(fault & VM_FAULT_ERROR)))
mm_fault_error(regs, hw_error_code, address, fault); return;
if (fatal_signal_pending(current) && !user_mode(regs)) {
kernelmode_fixup_or_oops(regs, error_code, address, 0, 0);
return; return;
} }
check_v8086_mode(regs, address, tsk); if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if (!user_mode(regs)) {
kernelmode_fixup_or_oops(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
return;
}
/*
* We ran out of memory, call the OOM killer, and return the
* userspace (which will retry the fault, or kill us if we got
* oom-killed):
*/
pagefault_out_of_memory();
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
do_sigbus(regs, error_code, address, fault);
else if (fault & VM_FAULT_SIGSEGV)
bad_area_nosemaphore(regs, error_code, address);
else
BUG();
}
} }
NOKPROBE_SYMBOL(do_user_addr_fault); NOKPROBE_SYMBOL(do_user_addr_fault);
......
...@@ -157,16 +157,25 @@ __ref void *alloc_low_pages(unsigned int num) ...@@ -157,16 +157,25 @@ __ref void *alloc_low_pages(unsigned int num)
} }
/* /*
* By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. * By default need to be able to allocate page tables below PGD firstly for
* With KASLR memory randomization, depending on the machine e820 memory * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping.
* and the PUD alignment. We may need twice more pages when KASLR memory * With KASLR memory randomization, depending on the machine e820 memory and the
* PUD alignment, twice that many pages may be needed when KASLR memory
* randomization is enabled. * randomization is enabled.
*/ */
#ifndef CONFIG_X86_5LEVEL
#define INIT_PGD_PAGE_TABLES 3
#else
#define INIT_PGD_PAGE_TABLES 4
#endif
#ifndef CONFIG_RANDOMIZE_MEMORY #ifndef CONFIG_RANDOMIZE_MEMORY
#define INIT_PGD_PAGE_COUNT 6 #define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES)
#else #else
#define INIT_PGD_PAGE_COUNT 12 #define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES)
#endif #endif
#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) #define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
void __init early_alloc_pgt_buf(void) void __init early_alloc_pgt_buf(void)
......
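
A worked example of the new sizing, under the assumptions the comment spells out: three sub-PGD levels (PUD, PMD, PTE) with 4-level paging and four with 5-level, two ranges mapped early, and KASLR doubling the requirement again.

#include <stdio.h>

#define PAGE_SIZE_ASSUMED 4096

int main(void)
{
        int tables_4level = 3;  /* PUD + PMD + PTE below the PGD */
        int tables_5level = 4;  /* P4D + PUD + PMD + PTE */

        printf("no KASLR, 4-level paging: %d pages = %d bytes of brk\n",
               2 * tables_4level, 2 * tables_4level * PAGE_SIZE_ASSUMED);
        printf("KASLR,    5-level paging: %d pages = %d bytes of brk\n",
               4 * tables_5level, 4 * tables_5level * PAGE_SIZE_ASSUMED);
        return 0;
}

This reproduces the old hard-coded 6 and 12 pages for the 4-level cases and extends naturally to 8 and 16 pages with 5-level paging.
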
...@@ -10,8 +10,6 @@ ...@@ -10,8 +10,6 @@
#define pr_fmt(fmt) "mmiotrace: " fmt #define pr_fmt(fmt) "mmiotrace: " fmt
#define DEBUG 1
#include <linux/moduleparam.h> #include <linux/moduleparam.h>
#include <linux/debugfs.h> #include <linux/debugfs.h>
#include <linux/slab.h> #include <linux/slab.h>
......
...@@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, ...@@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
* @return: Returns, if the page fault is not handled. This function * @return: Returns, if the page fault is not handled. This function
* will never return if the page fault is handled successfully. * will never return if the page fault is handled successfully.
*/ */
void efi_recover_from_page_fault(unsigned long phys_addr) void efi_crash_gracefully_on_page_fault(unsigned long phys_addr)
{ {
if (!IS_ENABLED(CONFIG_X86_64)) if (!IS_ENABLED(CONFIG_X86_64))
return; return;
/*
* If we get an interrupt/NMI while processing an EFI runtime service
* then this is a regular OOPS, not an EFI failure.
*/
if (in_interrupt())
return;
/* /*
* Make sure that an efi runtime service caused the page fault. * Make sure that an efi runtime service caused the page fault.
* READ_ONCE() because we might be OOPSing in a different thread,
* and we don't want to trip KTSAN while trying to OOPS.
*/ */
if (efi_rts_work.efi_rts_id == EFI_NONE) if (READ_ONCE(efi_rts_work.efi_rts_id) == EFI_NONE ||
current_work() != &efi_rts_work.work)
return; return;
/* /*
...@@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr) ...@@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr)
set_current_state(TASK_IDLE); set_current_state(TASK_IDLE);
schedule(); schedule();
} }
return;
} }
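
The extra current_work() test above matters because EFI runtime services are funnelled through a dedicated work item (efi_rts_work.work); a fault taken anywhere else must not be blamed on firmware. A generic, hedged sketch of the same pattern outside EFI, with invented names for illustration:

#include <linux/types.h>
#include <linux/workqueue.h>

static struct work_struct demo_work;    /* hypothetical work item */

/* True only while demo_work's handler is running on this thread. */
static bool fault_happened_in_demo_work(void)
{
        return current_work() == &demo_work;
}
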