Commit 10a0c0f0 authored by Linus Torvalds

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
 "Misc changes:
   - fix lguest bug
   - fix /proc/meminfo output on certain configs
   - fix pvclock bug
   - fix reboot on certain iMacs by adding new reboot quirk
   - fix bootup crash
   - fix FPU boot line option parsing
   - add more x86 self-tests
   - small cleanups, documentation improvements, etc"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/cpu/amd: Remove an unneeded condition in srat_detect_node()
  x86/vdso/pvclock: Protect STABLE check with the seqcount
  x86/mm: Improve switch_mm() barrier comments
  selftests/x86: Test __kernel_sigreturn and __kernel_rt_sigreturn
  x86/reboot/quirks: Add iMac10,1 to pci_reboot_dmi_table[]
  lguest: Map switcher text R/O
  x86/boot: Hide local labels in verify_cpu()
  x86/fpu: Disable AVX when eagerfpu is off
  x86/fpu: Disable MPX when eagerfpu is off
  x86/fpu: Disable XGETBV1 when no XSAVE
  x86/fpu: Fix early FPU command-line parsing
  x86/mm: Use PAGE_ALIGNED instead of IS_ALIGNED
  selftests/x86: Disable the ldt_gdt_64 test for now
  x86/mm/pat: Make split_page_count() check for empty levels to fix /proc/meminfo output
  x86/boot: Double BOOT_HEAP_SIZE to 64KB
  x86/mm: Add barriers and document switch_mm()-vs-flush synchronization
parents dcd1bfd5 7030a7e9
@@ -126,23 +126,23 @@ static notrace cycle_t vread_pvclock(int *mode)
 	 *
 	 * On Xen, we don't appear to have that guarantee, but Xen still
 	 * supplies a valid seqlock using the version field.
 	 *
 	 * We only do pvclock vdso timing at all if
 	 * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
 	 * mean that all vCPUs have matching pvti and that the TSC is
	 * synced, so we can just look at vCPU 0's pvti.
	 */
-	if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
-		*mode = VCLOCK_NONE;
-		return 0;
-	}
-
 	do {
 		version = pvti->version;
 
 		smp_rmb();
 
+		if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+			*mode = VCLOCK_NONE;
+			return 0;
+		}
+
 		tsc = rdtsc_ordered();
 		pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
 		pvti_tsc_shift = pvti->tsc_shift;
...
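The change above moves the PVCLOCK_TSC_STABLE_BIT test inside the version-retry loop, so the flag is sampled from the same consistent snapshot as the other pvti fields rather than once before the loop. A minimal user-space sketch of that version-retry read pattern (illustrative only; the struct, field names and C11 atomics below are stand-ins, not the kernel's pvclock types):

#include <stdatomic.h>
#include <stdint.h>

struct sample {
	atomic_uint version;	/* odd while an update is in progress */
	uint64_t tsc_mul;
	uint32_t flags;		/* e.g. a "stable" bit */
};

/*
 * Reader: everything consulted -- including the flags word -- is re-read
 * until the version is even and unchanged, so a concurrent update can
 * never leave us acting on a half-updated snapshot.
 */
static void read_sample(struct sample *s, uint64_t *mul, uint32_t *flags)
{
	unsigned int v0, v1;

	do {
		v0 = atomic_load_explicit(&s->version, memory_order_acquire);
		*mul = s->tsc_mul;
		*flags = s->flags;
		atomic_thread_fence(memory_order_acquire);
		v1 = atomic_load_explicit(&s->version, memory_order_relaxed);
	} while ((v0 & 1) || v0 != v1);
}

/* Writer: bump version to odd, update the fields, bump back to even. */
static void write_sample(struct sample *s, uint64_t mul, uint32_t flags)
{
	atomic_fetch_add_explicit(&s->version, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);
	s->tsc_mul = mul;
	s->flags = flags;
	atomic_thread_fence(memory_order_release);
	atomic_fetch_add_explicit(&s->version, 1, memory_order_relaxed);
}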
@@ -27,7 +27,7 @@
 #define BOOT_HEAP_SIZE		0x400000
 #else /* !CONFIG_KERNEL_BZIP2 */
-#define BOOT_HEAP_SIZE		0x8000
+#define BOOT_HEAP_SIZE		0x10000
 #endif /* !CONFIG_KERNEL_BZIP2 */
...
@@ -42,6 +42,7 @@ extern void fpu__init_cpu_xstate(void);
 extern void fpu__init_system(struct cpuinfo_x86 *c);
 extern void fpu__init_check_bugs(void);
 extern void fpu__resume_cpu(void);
+extern u64 fpu__get_supported_xfeatures_mask(void);
 
 /*
  * Debugging facility:
...
@@ -20,15 +20,16 @@
 /* Supported features which support lazy state saving */
 #define XFEATURE_MASK_LAZY	(XFEATURE_MASK_FP | \
-				 XFEATURE_MASK_SSE | \
+				 XFEATURE_MASK_SSE)
+
+/* Supported features which require eager state saving */
+#define XFEATURE_MASK_EAGER	(XFEATURE_MASK_BNDREGS | \
+				 XFEATURE_MASK_BNDCSR | \
 				 XFEATURE_MASK_YMM | \
 				 XFEATURE_MASK_OPMASK | \
 				 XFEATURE_MASK_ZMM_Hi256 | \
 				 XFEATURE_MASK_Hi16_ZMM)
 
-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER	(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
-
 /* All currently supported features */
 #define XCNTXT_MASK	(XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
...
@@ -12,7 +12,9 @@
 #define GUEST_PL 1
 
 /* Page for Switcher text itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES	(1 + 2 * nr_cpu_ids)
+#define SWITCHER_TEXT_PAGES	(1)
+#define SWITCHER_STACK_PAGES	(2 * nr_cpu_ids)
+#define TOTAL_SWITCHER_PAGES	(SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
 
 /* Where we map the Switcher, in both Host and Guest. */
 extern unsigned long switcher_addr;
...
@@ -116,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #endif
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		/* Re-load page tables */
+		/*
+		 * Re-load page tables.
+		 *
+		 * This logic has an ordering constraint:
+		 *
+		 *  CPU 0: Write to a PTE for 'next'
+		 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
+		 *  CPU 1: set bit 1 in next's mm_cpumask
+		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
+		 *
+		 * We need to prevent an outcome in which CPU 1 observes
+		 * the new PTE value and CPU 0 observes bit 1 clear in
+		 * mm_cpumask.  (If that occurs, then the IPI will never
+		 * be sent, and CPU 0's TLB will contain a stale entry.)
+		 *
+		 * The bad outcome can occur if either CPU's load is
+		 * reordered before that CPU's store, so both CPUs must
+		 * execute full barriers to prevent this from happening.
+		 *
+		 * Thus, switch_mm needs a full barrier between the
+		 * store to mm_cpumask and any operation that could load
+		 * from next->pgd.  TLB fills are special and can happen
+		 * due to instruction fetches or for no reason at all,
+		 * and neither LOCK nor MFENCE orders them.
+		 * Fortunately, load_cr3() is serializing and gives the
+		 * ordering guarantee we need.
+		 *
+		 */
 		load_cr3(next->pgd);
 
 		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 
 		/* Stop flush ipis for the previous mm */
@@ -156,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		 * schedule, protecting us from simultaneous changes.
 		 */
 		cpumask_set_cpu(cpu, mm_cpumask(next));
+
 		/*
 		 * We were in lazy tlb mode and leave_mm disabled
 		 * tlb flush IPI delivery. We must reload CR3
 		 * to make sure to use no freed page tables.
+		 *
+		 * As above, load_cr3() is serializing and orders TLB
+		 * fills with respect to the mm_cpumask write.
 		 */
 		load_cr3(next->pgd);
 		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
...
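The comment introduced above describes the classic store-buffering hazard: each CPU performs a store followed by a load, and if either load is reordered before its own store, the IPI is never sent and a stale TLB entry survives. The user-space litmus sketch below shows the same pattern with C11 atomics (illustrative only; in switch_mm() the "stores" are the PTE write and the mm_cpumask bit, and the full barrier on the kernel side is supplied by load_cr3()):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

/*
 * Store-buffering litmus test: "CPU 0" writes a fake PTE then checks the
 * fake mm_cpumask bit; "CPU 1" sets the bit then reads the PTE.  Without
 * the two full fences, both loads may observe 0, which is exactly the
 * lost-IPI / stale-TLB outcome the comment above rules out.
 */
static atomic_int pte, mask_bit;
static int r0, r1;

static void *cpu0(void *arg)
{
	atomic_store_explicit(&pte, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the barrier switch_mm needs */
	r0 = atomic_load_explicit(&mask_bit, memory_order_relaxed);
	return NULL;
}

static void *cpu1(void *arg)
{
	atomic_store_explicit(&mask_bit, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* load_cr3() plays this role */
	r1 = atomic_load_explicit(&pte, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, cpu0, NULL);
	pthread_create(&b, NULL, cpu1, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With both fences in place this line can never print. */
	if (r0 == 0 && r1 == 0)
		puts("both sides missed the other's store");
	return 0;
}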
@@ -434,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 		 */
 		int ht_nodeid = c->initial_apicid;
 
-		if (ht_nodeid >= 0 &&
-		    __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+		if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
 			node = __apicid_to_node[ht_nodeid];
 
 		/* Pick a nearby node */
 		if (!node_online(node))
...
@@ -3,8 +3,11 @@
  */
 #include <asm/fpu/internal.h>
 #include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/cmdline.h>
 
 #include <linux/sched.h>
+#include <linux/init.h>
 
 /*
  * Initialize the TS bit in CR0 according to the style of context-switches
@@ -270,20 +273,52 @@ static void __init fpu__init_system_xstate_size_legacy(void)
  */
 static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
 
-static int __init eager_fpu_setup(char *s)
-{
-	if (!strcmp(s, "on"))
-		eagerfpu = ENABLE;
-	else if (!strcmp(s, "off"))
-		eagerfpu = DISABLE;
-	else if (!strcmp(s, "auto"))
-		eagerfpu = AUTO;
-	return 1;
-}
-__setup("eagerfpu=", eager_fpu_setup);
+/*
+ * Find supported xfeatures based on cpu features and command-line input.
+ * This must be called after fpu__init_parse_early_param() is called and
+ * xfeatures_mask is enumerated.
+ */
+u64 __init fpu__get_supported_xfeatures_mask(void)
+{
+	/* Support all xfeatures known to us */
+	if (eagerfpu != DISABLE)
+		return XCNTXT_MASK;
+
+	/* Warning of xfeatures being disabled for no eagerfpu mode */
+	if (xfeatures_mask & XFEATURE_MASK_EAGER) {
+		pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
+			xfeatures_mask & XFEATURE_MASK_EAGER);
+	}
+
+	/* Return a mask that masks out all features requiring eagerfpu mode */
+	return ~XFEATURE_MASK_EAGER;
+}
+
+/*
+ * Disable features dependent on eagerfpu.
+ */
+static void __init fpu__clear_eager_fpu_features(void)
+{
+	setup_clear_cpu_cap(X86_FEATURE_MPX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX2);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
+}
 
 /*
  * Pick the FPU context switching strategy:
+ *
+ * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
+ * the following is true:
+ *
+ * (1) the cpu has xsaveopt, as it has the optimization and doing eager
+ *     FPU switching has a relatively low cost compared to a plain xsave;
+ * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
+ *     switching. Should the kernel boot with noxsaveopt, we support MPX
+ *     with eager FPU switching at a higher cost.
  */
 static void __init fpu__init_system_ctx_switch(void)
 {
@@ -295,19 +330,11 @@ static void __init fpu__init_system_ctx_switch(void)
 	WARN_ON_FPU(current->thread.fpu.fpstate_active);
 	current_thread_info()->status = 0;
 
-	/* Auto enable eagerfpu for xsaveopt */
 	if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
 		eagerfpu = ENABLE;
 
-	if (xfeatures_mask & XFEATURE_MASK_EAGER) {
-		if (eagerfpu == DISABLE) {
-			pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
-			       xfeatures_mask & XFEATURE_MASK_EAGER);
-			xfeatures_mask &= ~XFEATURE_MASK_EAGER;
-		} else {
-			eagerfpu = ENABLE;
-		}
-	}
+	if (xfeatures_mask & XFEATURE_MASK_EAGER)
+		eagerfpu = ENABLE;
 
 	if (eagerfpu == ENABLE)
 		setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
@@ -315,12 +342,49 @@ static void __init fpu__init_system_ctx_switch(void)
 	printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy");
 }
 
+/*
+ * We parse fpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init fpu__init_parse_early_param(void)
+{
+	/*
+	 * No need to check "eagerfpu=auto" again, since it is the
+	 * initial default.
+	 */
+	if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
+		eagerfpu = DISABLE;
+		fpu__clear_eager_fpu_features();
+	} else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
+		eagerfpu = ENABLE;
+	}
+
+	if (cmdline_find_option_bool(boot_command_line, "no387"))
+		setup_clear_cpu_cap(X86_FEATURE_FPU);
+
+	if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+		setup_clear_cpu_cap(X86_FEATURE_FXSR);
+		setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+		setup_clear_cpu_cap(X86_FEATURE_XMM);
+	}
+
+	if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+		fpu__xstate_clear_all_cpu_caps();
+
+	if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+		setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+	if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+		setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+}
+
 /*
  * Called on the boot CPU once per system bootup, to set up the initial
  * FPU state that is later cloned into all processes:
  */
 void __init fpu__init_system(struct cpuinfo_x86 *c)
 {
+	fpu__init_parse_early_param();
 	fpu__init_system_early_generic(c);
 
 	/*
@@ -344,62 +408,3 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
 
 	fpu__init_system_ctx_switch();
 }
-
-/*
- * Boot parameter to turn off FPU support and fall back to math-emu:
- */
-static int __init no_387(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FPU);
-	return 1;
-}
-__setup("no387", no_387);
-
-/*
- * Disable all xstate CPU features:
- */
-static int __init x86_noxsave_setup(char *s)
-{
-	if (strlen(s))
-		return 0;
-
-	fpu__xstate_clear_all_cpu_caps();
-
-	return 1;
-}
-__setup("noxsave", x86_noxsave_setup);
-
-/*
- * Disable the XSAVEOPT instruction specifically:
- */
-static int __init x86_noxsaveopt_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	return 1;
-}
-__setup("noxsaveopt", x86_noxsaveopt_setup);
-
-/*
- * Disable the XSAVES instruction:
- */
-static int __init x86_noxsaves_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-	return 1;
-}
-__setup("noxsaves", x86_noxsaves_setup);
-
-/*
- * Disable FX save/restore and SSE support:
- */
-static int __init x86_nofxsr_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FXSR);
-	setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
-	setup_clear_cpu_cap(X86_FEATURE_XMM);
-	return 1;
-}
-__setup("nofxsr", x86_nofxsr_setup);
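The new fpu__init_parse_early_param() above scans boot_command_line by hand (via cmdline_find_option_bool()) because it runs before the __setup() handlers it replaces would ever have been invoked. A simplified user-space sketch of that kind of whitespace-delimited, whole-word option scan (illustrative only, not the kernel's cmdline_find_option_bool() implementation):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Return true if 'opt' appears as a whole, space-delimited word in 'cmdline'. */
static bool find_option_bool(const char *cmdline, const char *opt)
{
	size_t len = strlen(opt);
	const char *p = cmdline;

	while ((p = strstr(p, opt)) != NULL) {
		bool starts = (p == cmdline) || (p[-1] == ' ');
		bool ends = (p[len] == '\0') || (p[len] == ' ');

		if (starts && ends)
			return true;
		p += len;
	}
	return false;
}

int main(void)
{
	const char *cmdline = "ro quiet eagerfpu=off noxsaveopt";

	/* Matches the exact word only: "noxsave" must not match "noxsaveopt". */
	printf("eagerfpu=off: %d\n", find_option_bool(cmdline, "eagerfpu=off"));
	printf("noxsave:      %d\n", find_option_bool(cmdline, "noxsave"));
	return 0;
}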
@@ -52,6 +52,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
 	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
 	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
 	setup_clear_cpu_cap(X86_FEATURE_MPX);
+	setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
 }
 
 /*
@@ -632,8 +633,7 @@ void __init fpu__init_system_xstate(void)
 		BUG();
 	}
 
-	/* Support only the state known to the OS: */
-	xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
+	xfeatures_mask &= fpu__get_supported_xfeatures_mask();
 
 	/* Enable xstate instructions to be able to continue with initialization: */
 	fpu__init_cpu_xstate();
...
@@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
 		},
 	},
+	{	/* Handle problems with rebooting on the iMac10,1. */
+		.callback = set_pci_reboot,
+		.ident = "Apple iMac10,1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+		},
+	},
 
 	/* ASRock */
 	{	/* Handle problems with rebooting on ASRock Q1900DC-ITX */
...
@@ -48,31 +48,31 @@ verify_cpu:
 	pushfl
 	popl	%eax
 	cmpl	%eax,%ebx
-	jz	verify_cpu_no_longmode	# cpu has no cpuid
+	jz	.Lverify_cpu_no_longmode	# cpu has no cpuid
 #endif
 
 	movl	$0x0,%eax		# See if cpuid 1 is implemented
 	cpuid
 	cmpl	$0x1,%eax
-	jb	verify_cpu_no_longmode	# no cpuid 1
+	jb	.Lverify_cpu_no_longmode	# no cpuid 1
 
 	xor	%di,%di
 	cmpl	$0x68747541,%ebx	# AuthenticAMD
-	jnz	verify_cpu_noamd
+	jnz	.Lverify_cpu_noamd
 	cmpl	$0x69746e65,%edx
-	jnz	verify_cpu_noamd
+	jnz	.Lverify_cpu_noamd
 	cmpl	$0x444d4163,%ecx
-	jnz	verify_cpu_noamd
+	jnz	.Lverify_cpu_noamd
 	mov	$1,%di			# cpu is from AMD
-	jmp	verify_cpu_check
+	jmp	.Lverify_cpu_check
 
-verify_cpu_noamd:
+.Lverify_cpu_noamd:
 	cmpl	$0x756e6547,%ebx	# GenuineIntel?
-	jnz	verify_cpu_check
+	jnz	.Lverify_cpu_check
 	cmpl	$0x49656e69,%edx
-	jnz	verify_cpu_check
+	jnz	.Lverify_cpu_check
 	cmpl	$0x6c65746e,%ecx
-	jnz	verify_cpu_check
+	jnz	.Lverify_cpu_check
 
 	# only call IA32_MISC_ENABLE when:
 	# family > 6 || (family == 6 && model >= 0xd)
@@ -83,59 +83,59 @@ verify_cpu_noamd:
 	andl	$0x0ff00f00, %eax	# mask family and extended family
 	shrl	$8, %eax
 	cmpl	$6, %eax
-	ja	verify_cpu_clear_xd	# family > 6, ok
-	jb	verify_cpu_check	# family < 6, skip
+	ja	.Lverify_cpu_clear_xd	# family > 6, ok
+	jb	.Lverify_cpu_check	# family < 6, skip
 
 	andl	$0x000f00f0, %ecx	# mask model and extended model
 	shrl	$4, %ecx
 	cmpl	$0xd, %ecx
-	jb	verify_cpu_check	# family == 6, model < 0xd, skip
+	jb	.Lverify_cpu_check	# family == 6, model < 0xd, skip
 
-verify_cpu_clear_xd:
+.Lverify_cpu_clear_xd:
 	movl	$MSR_IA32_MISC_ENABLE, %ecx
 	rdmsr
 	btrl	$2, %edx		# clear MSR_IA32_MISC_ENABLE_XD_DISABLE
-	jnc	verify_cpu_check	# only write MSR if bit was changed
+	jnc	.Lverify_cpu_check	# only write MSR if bit was changed
 	wrmsr
 
-verify_cpu_check:
+.Lverify_cpu_check:
 	movl	$0x1,%eax		# Does the cpu have what it takes
 	cpuid
 	andl	$REQUIRED_MASK0,%edx
 	xorl	$REQUIRED_MASK0,%edx
-	jnz	verify_cpu_no_longmode
+	jnz	.Lverify_cpu_no_longmode
 
 	movl	$0x80000000,%eax	# See if extended cpuid is implemented
 	cpuid
 	cmpl	$0x80000001,%eax
-	jb	verify_cpu_no_longmode	# no extended cpuid
+	jb	.Lverify_cpu_no_longmode	# no extended cpuid
 
 	movl	$0x80000001,%eax	# Does the cpu have what it takes
 	cpuid
 	andl	$REQUIRED_MASK1,%edx
 	xorl	$REQUIRED_MASK1,%edx
-	jnz	verify_cpu_no_longmode
+	jnz	.Lverify_cpu_no_longmode
 
-verify_cpu_sse_test:
+.Lverify_cpu_sse_test:
 	movl	$1,%eax
 	cpuid
 	andl	$SSE_MASK,%edx
 	cmpl	$SSE_MASK,%edx
-	je	verify_cpu_sse_ok
+	je	.Lverify_cpu_sse_ok
 	test	%di,%di
-	jz	verify_cpu_no_longmode	# only try to force SSE on AMD
+	jz	.Lverify_cpu_no_longmode	# only try to force SSE on AMD
 	movl	$MSR_K7_HWCR,%ecx
 	rdmsr
 	btr	$15,%eax		# enable SSE
 	wrmsr
 	xor	%di,%di			# don't loop
-	jmp	verify_cpu_sse_test	# try again
+	jmp	.Lverify_cpu_sse_test	# try again
 
-verify_cpu_no_longmode:
+.Lverify_cpu_no_longmode:
 	popf				# Restore caller passed flags
 	movl $1,%eax
 	ret
-verify_cpu_sse_ok:
+.Lverify_cpu_sse_ok:
 	popf				# Restore caller passed flags
 	xorl %eax, %eax
 	ret
@@ -814,8 +814,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
 		if (phys_addr < (phys_addr_t)0x40000000)
 			return;
 
-		if (IS_ALIGNED(addr, PAGE_SIZE) &&
-		    IS_ALIGNED(next, PAGE_SIZE)) {
+		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
 			/*
 			 * Do not free direct mapping pages since they were
 			 * freed when offlining, or simplely not in use.
...
@@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages)
 
 static void split_page_count(int level)
 {
+	if (direct_pages_count[level] == 0)
+		return;
+
 	direct_pages_count[level]--;
 	direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
...
@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
 	preempt_disable();
 
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+	/* This is an implicit full barrier that synchronizes with switch_mm. */
 	local_flush_tlb();
+
 	trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
 		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
 
 	preempt_disable();
-	if (current->active_mm != mm)
+	if (current->active_mm != mm) {
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
+	}
 
 	if (!current->mm) {
 		leave_mm(smp_processor_id());
+
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
 	}
 
 	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
 		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
 
+	/*
+	 * Both branches below are implicit full barriers (MOV to CR or
+	 * INVLPG) that synchronize with switch_mm.
+	 */
 	if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
 		base_pages_to_flush = TLB_FLUSH_ALL;
 		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 	preempt_disable();
 
 	if (current->active_mm == mm) {
-		if (current->mm)
+		if (current->mm) {
+			/*
+			 * Implicit full barrier (INVLPG) that synchronizes
+			 * with switch_mm.
+			 */
 			__flush_tlb_one(start);
-		else
+		} else {
 			leave_mm(smp_processor_id());
+
+			/* Synchronize with switch_mm. */
+			smp_mb();
+		}
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
...
@@ -22,7 +22,8 @@
 
 unsigned long switcher_addr;
 struct page **lg_switcher_pages;
-static struct vm_struct *switcher_vma;
+static struct vm_struct *switcher_text_vma;
+static struct vm_struct *switcher_stacks_vma;
 
 /* This One Big lock protects all inter-guest data structures. */
 DEFINE_MUTEX(lguest_lock);
@@ -82,55 +83,81 @@ static __init int map_switcher(void)
 		}
 	}
 
+	/*
+	 * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
+	 * It goes in the first page, which we map in momentarily.
+	 */
+	memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
+	       end_switcher_text - start_switcher_text);
+	kunmap(lg_switcher_pages[0]);
+
 	/*
 	 * We place the Switcher underneath the fixmap area, which is the
 	 * highest virtual address we can get.  This is important, since we
 	 * tell the Guest it can't access this memory, so we want its ceiling
 	 * as high as possible.
 	 */
-	switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
+	switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
 
 	/*
-	 * Now we reserve the "virtual memory area" we want.  We might
-	 * not get it in theory, but in practice it's worked so far.
-	 * The end address needs +1 because __get_vm_area allocates an
-	 * extra guard page, so we need space for that.
+	 * Now we reserve the "virtual memory area"s we want.  We might
+	 * not get them in theory, but in practice it's worked so far.
+	 *
+	 * We want the switcher text to be read-only and executable, and
+	 * the stacks to be read-write and non-executable.
 	 */
-	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
-				     VM_ALLOC, switcher_addr, switcher_addr
-				     + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
-	if (!switcher_vma) {
+	switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
+					  switcher_addr,
+					  switcher_addr + PAGE_SIZE);
+
+	if (!switcher_text_vma) {
 		err = -ENOMEM;
 		printk("lguest: could not map switcher pages high\n");
 		goto free_pages;
 	}
 
+	switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
+					    VM_ALLOC|VM_NO_GUARD,
+					    switcher_addr + PAGE_SIZE,
+					    switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
+	if (!switcher_stacks_vma) {
+		err = -ENOMEM;
+		printk("lguest: could not map switcher pages high\n");
+		goto free_text_vma;
+	}
+
 	/*
 	 * This code actually sets up the pages we've allocated to appear at
 	 * switcher_addr.  map_vm_area() takes the vma we allocated above, the
-	 * kind of pages we're mapping (kernel pages), and a pointer to our
-	 * array of struct pages.
+	 * kind of pages we're mapping (kernel text pages and kernel writable
+	 * pages respectively), and a pointer to our array of struct pages.
 	 */
-	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, lg_switcher_pages);
+	err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
+	if (err) {
+		printk("lguest: text map_vm_area failed: %i\n", err);
+		goto free_vmas;
+	}
+
+	err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
+			  lg_switcher_pages + SWITCHER_TEXT_PAGES);
 	if (err) {
-		printk("lguest: map_vm_area failed: %i\n", err);
-		goto free_vma;
+		printk("lguest: stacks map_vm_area failed: %i\n", err);
+		goto free_vmas;
 	}
 
 	/*
 	 * Now the Switcher is mapped at the right address, we can't fail!
-	 * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
 	 */
-	memcpy(switcher_vma->addr, start_switcher_text,
-	       end_switcher_text - start_switcher_text);
-
 	printk(KERN_INFO "lguest: mapped switcher at %p\n",
-	       switcher_vma->addr);
+	       switcher_text_vma->addr);
 	/* And we succeeded... */
 	return 0;
 
-free_vma:
-	vunmap(switcher_vma->addr);
+free_vmas:
+	/* Undoes map_vm_area and __get_vm_area */
+	vunmap(switcher_stacks_vma->addr);
+free_text_vma:
+	vunmap(switcher_text_vma->addr);
 free_pages:
 	i = TOTAL_SWITCHER_PAGES;
 free_some_pages:
@@ -148,7 +175,8 @@ static void unmap_switcher(void)
 	unsigned int i;
 
 	/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
-	vunmap(switcher_vma->addr);
+	vunmap(switcher_text_vma->addr);
+	vunmap(switcher_stacks_vma->addr);
 	/* Now we just need to free the pages we copied the switcher into */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
 		__free_pages(lg_switcher_pages[i], 0);
...
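The rework above splits the Switcher into a read-only, executable text page and writable, non-executable stack pages instead of one PAGE_KERNEL_EXEC region. A rough user-space analogue of that split using mmap()/mprotect() (illustrative only; the kernel code uses __get_vm_area()/map_vm_area() with PAGE_KERNEL_RX and PAGE_KERNEL):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t text_pages = 1, stack_pages = 4;
	size_t total = (text_pages + stack_pages) * page;

	/* Reserve one contiguous region, like the area below FIXADDR_START. */
	unsigned char *base = mmap(NULL, total, PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (base == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* "Copy in the switcher text" while the first page is still writable... */
	memset(base, 0xc3, 16);		/* 0xc3 = x86 'ret', stands in for real code */

	/*
	 * ...then drop write permission: the text page becomes read-only and
	 * executable, the remaining pages stay read-write and non-executable.
	 */
	if (mprotect(base, text_pages * page, PROT_READ | PROT_EXEC) != 0) {
		perror("mprotect");
		return 1;
	}

	printf("text at %p (r-x), stacks at %p (rw-)\n",
	       base, base + text_pages * page);
	return 0;
}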
@@ -4,9 +4,11 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \
-			test_FCMOV test_FCOMI test_FISTTP
+			test_FCMOV test_FCOMI test_FISTTP \
+			ldt_gdt \
+			vdso_restorer
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
...
/*
* vdso_restorer.c - tests vDSO-based signal restore
* Copyright (c) 2015 Andrew Lutomirski
*
* This program is free software; you can redistribute it and/or modify
* it under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* This makes sure that sa_restorer == NULL keeps working on 32-bit
* configurations. Modern glibc doesn't use it under any circumstances,
* so it's easy to overlook breakage.
*
* 64-bit userspace has never supported sa_restorer == NULL, so this is
* 32-bit only.
*/
#define _GNU_SOURCE
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <syscall.h>
#include <sys/syscall.h>
/* Open-code this -- the headers are too messy to easily use them. */
struct real_sigaction {
void *handler;
unsigned long flags;
void *restorer;
unsigned int mask[2];
};
static volatile sig_atomic_t handler_called;
static void handler_with_siginfo(int sig, siginfo_t *info, void *ctx_void)
{
handler_called = 1;
}
static void handler_without_siginfo(int sig)
{
handler_called = 1;
}
int main()
{
int nerrs = 0;
struct real_sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.handler = handler_with_siginfo;
sa.flags = SA_SIGINFO;
sa.restorer = NULL; /* request kernel-provided restorer */
if (syscall(SYS_rt_sigaction, SIGUSR1, &sa, NULL, 8) != 0)
err(1, "raw rt_sigaction syscall");
raise(SIGUSR1);
if (handler_called) {
printf("[OK]\tSA_SIGINFO handler returned successfully\n");
} else {
printf("[FAIL]\tSA_SIGINFO handler was not called\n");
nerrs++;
}
sa.flags = 0;
sa.handler = handler_without_siginfo;
if (syscall(SYS_sigaction, SIGUSR1, &sa, 0) != 0)
err(1, "raw sigaction syscall");
handler_called = 0;
raise(SIGUSR1);
if (handler_called) {
printf("[OK]\t!SA_SIGINFO handler returned successfully\n");
} else {
printf("[FAIL]\t!SA_SIGINFO handler was not called\n");
nerrs++;
}
}