Commit 5cc97bf2 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'xen-upstream' of ssh://master.kernel.org/pub/scm/linux/kernel/git/jeremy/xen

* 'xen-upstream' of ssh://master.kernel.org/pub/scm/linux/kernel/git/jeremy/xen: (44 commits)
  xen: disable all non-virtual drivers
  xen: use iret directly when possible
  xen: suppress abs symbol warnings for unused reloc pointers
  xen: Attempt to patch inline versions of common operations
  xen: Place vcpu_info structure into per-cpu memory
  xen: handle external requests for shutdown, reboot and sysrq
  xen: machine operations
  xen: add virtual network device driver
  xen: add virtual block device driver.
  xen: add the Xenbus sysfs and virtual device hotplug driver
  xen: Add grant table support
  xen: use the hvc console infrastructure for Xen console
  xen: hack to prevent bad segment register reload
  xen: lazy-mmu operations
  xen: Add support for preemption
  xen: SMP guest support
  xen: Implement sched_clock
  xen: Account for stolen time
  xen: ignore RW mapping of RO pages in pagetable_init
  xen: Complete pagetable pinning
  ...
parents 826ea8f2 dfdcdd42
......@@ -222,6 +222,8 @@ config PARAVIRT
However, when run without a hypervisor the kernel is
theoretically slower. If in doubt, say N.
source "arch/i386/xen/Kconfig"
config VMI
bool "VMI Paravirt-ops support"
depends on PARAVIRT
......
......@@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000
mcore-$(CONFIG_X86_ES7000) := mach-default
core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/
# Xen paravirtualization support
core-$(CONFIG_XEN) += arch/i386/xen/
# default subarch .h files
mflags-y += -Iinclude/asm-i386/mach-default
......
......@@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = {
"__kernel_rt_sigreturn",
"__kernel_sigreturn",
"SYSENTER_RETURN",
"xen_irq_disable_direct_reloc",
"xen_save_fl_direct_reloc",
};
static int is_safe_abs_reloc(const char* sym_name)
......
......@@ -17,6 +17,8 @@
#include <asm/thread_info.h>
#include <asm/elf.h>
#include <xen/interface/xen.h>
#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
......@@ -59,6 +61,7 @@ void foo(void)
OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_restart_block, thread_info, restart_block);
OFFSET(TI_sysenter_return, thread_info, sysenter_return);
OFFSET(TI_cpu, thread_info, cpu);
BLANK();
OFFSET(GDS_size, Xgt_desc_struct, size);
......@@ -115,4 +118,10 @@ void foo(void)
OFFSET(PARAVIRT_iret, paravirt_ops, iret);
OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
#endif
#ifdef CONFIG_XEN
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
#endif
}
......@@ -1023,6 +1023,91 @@ ENTRY(kernel_thread_helper)
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
#ifdef CONFIG_XEN
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
pushl $0
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
TRACE_IRQS_OFF
/* Check to see if we got the event in the critical
region in xen_iret_direct, after we've reenabled
events and checked for pending events. This simulates
iret instruction's behaviour where it delivers a
pending interrupt when enabling interrupts. */
movl PT_EIP(%esp),%eax
cmpl $xen_iret_start_crit,%eax
jb 1f
cmpl $xen_iret_end_crit,%eax
jae 1f
call xen_iret_crit_fixup
1: mov %esp, %eax
call xen_evtchn_do_upcall
jmp ret_from_intr
CFI_ENDPROC
ENDPROC(xen_hypervisor_callback)
# Hypervisor uses this for application faults while it executes.
# We get here for two reasons:
# 1. Fault while reloading DS, ES, FS or GS
# 2. Fault while executing IRET
# Category 1 we fix up by reattempting the load, and zeroing the segment
# register if the load fails.
# Category 2 we fix up by jumping to do_iret_error. We cannot use the
# normal Linux return path in this case because if we use the IRET hypercall
# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
# We distinguish between categories by maintaining a status value in EAX.
ENTRY(xen_failsafe_callback)
CFI_STARTPROC
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
movl $1,%eax
1: mov 4(%esp),%ds
2: mov 8(%esp),%es
3: mov 12(%esp),%fs
4: mov 16(%esp),%gs
testl %eax,%eax
popl %eax
CFI_ADJUST_CFA_OFFSET -4
lea 16(%esp),%esp
CFI_ADJUST_CFA_OFFSET -16
jz 5f
addl $16,%esp
jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
jmp ret_from_exception
CFI_ENDPROC
.section .fixup,"ax"
6: xorl %eax,%eax
movl %eax,4(%esp)
jmp 1b
7: xorl %eax,%eax
movl %eax,8(%esp)
jmp 2b
8: xorl %eax,%eax
movl %eax,12(%esp)
jmp 3b
9: xorl %eax,%eax
movl %eax,16(%esp)
jmp 4b
.previous
.section __ex_table,"a"
.align 4
.long 1b,6b
.long 2b,7b
.long 3b,8b
.long 4b,9b
.previous
ENDPROC(xen_failsafe_callback)
#endif /* CONFIG_XEN */
.section .rodata,"a"
#include "syscall_table.S"
......
......@@ -510,7 +510,8 @@ ENTRY(_stext)
/*
* BSS section
*/
.section ".bss.page_aligned","w"
.section ".bss.page_aligned","wa"
.align PAGE_SIZE_asm
ENTRY(swapper_pg_dir)
.fill 1024,4,0
ENTRY(swapper_pg_pmd)
......@@ -538,6 +539,8 @@ fault_msg:
.ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
.asciz "Stack: %p %p %p %p %p %p %p %p\n"
#include "../xen/xen-head.S"
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
* only used by the lidt and lgdt instructions. They are not
......
......@@ -228,6 +228,41 @@ static int __init print_banner(void)
}
core_initcall(print_banner);
static struct resource reserve_ioports = {
.start = 0,
.end = IO_SPACE_LIMIT,
.name = "paravirt-ioport",
.flags = IORESOURCE_IO | IORESOURCE_BUSY,
};
static struct resource reserve_iomem = {
.start = 0,
.end = -1,
.name = "paravirt-iomem",
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
/*
* Reserve the whole legacy IO space to prevent any legacy drivers
* from wasting time probing for their hardware. This is a fairly
* brute-force approach to disabling all non-virtual drivers.
*
* Note that this must be called very early to have any effect.
*/
int paravirt_disable_iospace(void)
{
int ret;
ret = request_resource(&ioport_resource, &reserve_ioports);
if (ret == 0) {
ret = request_resource(&iomem_resource, &reserve_iomem);
if (ret)
release_resource(&reserve_ioports);
}
return ret;
}
struct paravirt_ops paravirt_ops = {
.name = "bare hardware",
.paravirt_enabled = 0,
......@@ -267,7 +302,7 @@ struct paravirt_ops paravirt_ops = {
.write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
.get_scheduled_cycles = native_read_tsc,
.sched_clock = native_sched_clock,
.get_cpu_khz = native_calculate_cpu_khz,
.load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt,
......
......@@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p)
* NOTE: at this point the bootmem allocator is fully available.
*/
paravirt_post_allocator_init();
dmi_scan_machine();
#ifdef CONFIG_X86_GENERICARCH
......
......@@ -22,6 +22,7 @@
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <mach_apic.h>
/*
......@@ -249,13 +250,13 @@ static unsigned long flush_va;
static DEFINE_SPINLOCK(tlbstate_lock);
/*
* We cannot call mmdrop() because we are in interrupt context,
* We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
*
* We need to reload %cr3 since the page tables may be going
* away from under us..
*/
static inline void leave_mm (unsigned long cpu)
void leave_mm(unsigned long cpu)
{
if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
BUG();
......
......@@ -148,7 +148,7 @@ void __init smp_alloc_memory(void)
* a given CPU
*/
static void __cpuinit smp_store_cpu_info(int id)
void __cpuinit smp_store_cpu_info(int id)
{
struct cpuinfo_x86 *c = cpu_data + id;
......@@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu)
/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;
static inline void
set_cpu_sibling_map(int cpu)
void set_cpu_sibling_map(int cpu)
{
int i;
struct cpuinfo_x86 *c = cpu_data;
......@@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void)
}
#ifdef CONFIG_HOTPLUG_CPU
static void
remove_siblinginfo(int cpu)
void remove_siblinginfo(int cpu)
{
int sibling;
struct cpuinfo_x86 *c = cpu_data;
......
......@@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
static unsigned long cyc2ns_scale __read_mostly;
unsigned long cyc2ns_scale __read_mostly;
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
......@@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz)
cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}
/*
* Scheduler clock - returns current time in nanosec units.
*/
unsigned long long sched_clock(void)
unsigned long long native_sched_clock(void)
{
unsigned long long this_offset;
......@@ -118,12 +113,24 @@ unsigned long long sched_clock(void)
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
/* read the Time Stamp Counter: */
get_scheduled_cycles(this_offset);
rdtscll(this_offset);
/* return the value in ns */
return cycles_2_ns(this_offset);
}
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
unsigned long long sched_clock(void)
{
return paravirt_sched_clock();
}
#else
unsigned long long sched_clock(void)
__attribute__((alias("native_sched_clock")));
#endif
unsigned long native_calculate_cpu_khz(void)
{
unsigned long long start, end;
......
......@@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
static void vmi_allocate_pt(u32 pfn)
static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
......@@ -891,7 +891,7 @@ static inline int __init activate_vmi(void)
paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
#endif
paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
paravirt_ops.sched_clock = vmi_sched_clock;
paravirt_ops.get_cpu_khz = vmi_cpu_khz;
/* We have true wallclock functions; disable CMOS clock sync */
......
......@@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now)
return 0;
}
/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
unsigned long long vmi_get_sched_cycles(void)
/* paravirt_ops.sched_clock = vmi_sched_clock */
unsigned long long vmi_sched_clock(void)
{
return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
}
/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
......
......@@ -88,6 +88,7 @@ SECTIONS
. = ALIGN(4096);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
*(.data.page_aligned)
*(.data.idt)
}
......
......@@ -3,23 +3,40 @@
* Here we can supply some information useful to userland.
*/
#include <linux/uts.h>
#include <linux/version.h>
#include <linux/elfnote.h>
#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \
.section name, flags; \
.balign 4; \
.long 1f - 0f; /* name length */ \
.long 3f - 2f; /* data length */ \
.long type; /* note type */ \
0: .asciz vendor; /* vendor name */ \
1: .balign 4; \
2:
/* Ideally this would use UTS_NAME, but using a quoted string here
doesn't work. Remember to change this when changing the
kernel's name. */
ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
ELFNOTE_END
#define ASM_ELF_NOTE_END \
3: .balign 4; /* pad out section */ \
.previous
#ifdef CONFIG_XEN
ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
.long LINUX_VERSION_CODE
ASM_ELF_NOTE_END
/*
* Add a special note telling glibc's dynamic linker a fake hardware
* flavor that it will use to choose the search path for libraries in the
* same way it uses real hardware capabilities like "mmx".
* We supply "nosegneg" as the fake capability, to indicate that we
* do not like negative offsets in instructions using segment overrides,
* since we implement those inefficiently. This makes it possible to
* install libraries optimized to avoid those access patterns in someplace
* like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
* corresponding to the bits here is needed to make ldconfig work right.
* It should contain:
* hwcap 1 nosegneg
* to match the mapping of bit to name that we give here.
*/
/* Bit used for the pseudo-hwcap for non-negative segments. We use
bit 1 to avoid bugs in some versions of glibc when bit 0 is
used; the choice is otherwise arbitrary. */
#define VDSO_NOTE_NONEGSEG_BIT 1
ELFNOTE_START(GNU, 2, "a")
.long 1, 1<<VDSO_NOTE_NONEGSEG_BIT /* ncaps, mask */
.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
ELFNOTE_END
#endif
......@@ -52,7 +52,7 @@ execute(const char *string)
NULL,
};
if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) {
if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
string, ret);
}
......
......@@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
BUG_ON(page_table != pte_offset_kernel(pmd, 0));
}
......@@ -473,6 +473,7 @@ void zap_low_mappings (void)
static int disable_nx __initdata = 0;
u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
/*
* noexec = on|off
......
......@@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
address = __pa(address);
addr = address & LARGE_PAGE_MASK;
pbase = (pte_t *)page_address(base);
paravirt_alloc_pt(page_to_pfn(base));
paravirt_alloc_pt(&init_mm, page_to_pfn(base));
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
addr == address ? prot : ref_prot));
......
#
# This Kconfig describes xen options
#
config XEN
bool "Enable support for Xen hypervisor"
depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
Xen hypervisor.
obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
events.o time.o manage.o xen-asm.o
obj-$(CONFIG_SMP) += smp.o
This diff is collapsed.
This diff is collapsed.
/******************************************************************************
* features.c
*
* Xen feature flags.
*
* Copyright (c) 2006, Ian Campbell, XenSource Inc.
*/
#include <linux/types.h>
#include <linux/cache.h>
#include <linux/module.h>
#include <asm/xen/hypervisor.h>
#include <xen/features.h>
u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
EXPORT_SYMBOL_GPL(xen_features);
void xen_setup_features(void)
{
struct xen_feature_info fi;
int i, j;
for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
fi.submap_idx = i;
if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
break;
for (j = 0; j < 32; j++)
xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
}
}
/*
* Handle extern requests for shutdown, reboot and sysrq
*/
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/reboot.h>
#include <linux/sysrq.h>
#include <xen/xenbus.h>
#define SHUTDOWN_INVALID -1
#define SHUTDOWN_POWEROFF 0
#define SHUTDOWN_SUSPEND 2
/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
* report a crash, not be instructed to crash!
* HALT is the same as POWEROFF, as far as we're concerned. The tools use
* the distinction when we return the reason code to them.
*/
#define SHUTDOWN_HALT 4
/* Ignore multiple shutdown requests. */
static int shutting_down = SHUTDOWN_INVALID;
static void shutdown_handler(struct xenbus_watch *watch,
const char **vec, unsigned int len)
{
char *str;
struct xenbus_transaction xbt;
int err;
if (shutting_down != SHUTDOWN_INVALID)
return;
again:
err = xenbus_transaction_start(&xbt);
if (err)
return;
str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
/* Ignore read errors and empty reads. */
if (XENBUS_IS_ERR_READ(str)) {
xenbus_transaction_end(xbt, 1);
return;
}
xenbus_write(xbt, "control", "shutdown", "");
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN) {
kfree(str);
goto again;
}
if (strcmp(str, "poweroff") == 0 ||
strcmp(str, "halt") == 0)
orderly_poweroff(false);
else if (strcmp(str, "reboot") == 0)
ctrl_alt_del();
else {
printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
shutting_down = SHUTDOWN_INVALID;
}
kfree(str);
}
static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
unsigned int len)
{
char sysrq_key = '\0';
struct xenbus_transaction xbt;
int err;
again:
err = xenbus_transaction_start(&xbt);
if (err)
return;
if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
printk(KERN_ERR "Unable to read sysrq code in "
"control/sysrq\n");
xenbus_transaction_end(xbt, 1);
return;
}
if (sysrq_key != '\0')
xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN)
goto again;
if (sysrq_key != '\0')
handle_sysrq(sysrq_key, NULL);
}
static struct xenbus_watch shutdown_watch = {
.node = "control/shutdown",
.callback = shutdown_handler
};
static struct xenbus_watch sysrq_watch = {
.node = "control/sysrq",
.callback = sysrq_handler
};
static int setup_shutdown_watcher(void)
{
int err;
err = register_xenbus_watch(&shutdown_watch);
if (err) {
printk(KERN_ERR "Failed to set shutdown watcher\n");
return err;
}
err = register_xenbus_watch(&sysrq_watch);
if (err) {
printk(KERN_ERR "Failed to set sysrq watcher\n");
return err;
}
return 0;
}
static int shutdown_event(struct notifier_block *notifier,
unsigned long event,
void *data)
{
setup_shutdown_watcher();
return NOTIFY_DONE;
}
static int __init setup_shutdown_event(void)
{
static struct notifier_block xenstore_notifier = {
.notifier_call = shutdown_event
};
register_xenstore_notifier(&xenstore_notifier);
return 0;
}
subsys_initcall(setup_shutdown_event);
This diff is collapsed.
#ifndef _XEN_MMU_H
#include <linux/linkage.h>
#include <asm/page.h>
/*
* Page-directory addresses above 4GB do not fit into architectural %cr3.
* When accessing %cr3, or equivalent field in vcpu_guest_context, guests
* must use the following accessor macros to pack/unpack valid MFNs.
*
* Note that Xen is using the fact that the pagetable base is always
* page-aligned, and putting the 12 MSB of the address into the 12 LSB
* of cr3.
*/
#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
void xen_set_pte(pte_t *ptep, pte_t pteval);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
void xen_exit_mmap(struct mm_struct *mm);
void xen_pgd_pin(pgd_t *pgd);
//void xen_pgd_unpin(pgd_t *pgd);
#ifdef CONFIG_X86_PAE
unsigned long long xen_pte_val(pte_t);
unsigned long long xen_pmd_val(pmd_t);
unsigned long long xen_pgd_val(pgd_t);
pte_t xen_make_pte(unsigned long long);
pmd_t xen_make_pmd(unsigned long long);
pgd_t xen_make_pgd(unsigned long long);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
void xen_set_pud(pud_t *ptr, pud_t val);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
#else
unsigned long xen_pte_val(pte_t);
unsigned long xen_pmd_val(pmd_t);
unsigned long xen_pgd_val(pgd_t);
pte_t xen_make_pte(unsigned long);
pmd_t xen_make_pmd(unsigned long);
pgd_t xen_make_pgd(unsigned long);
#endif
#endif /* _XEN_MMU_H */
/*
* Xen hypercall batching.
*
* Xen allows multiple hypercalls to be issued at once, using the
* multicall interface. This allows the cost of trapping into the
* hypervisor to be amortized over several calls.
*
* This file implements a simple interface for multicalls. There's a
* per-cpu buffer of outstanding multicalls. When you want to queue a
* multicall for issuing, you can allocate a multicall slot for the
* call and its arguments, along with storage for space which is
* pointed to by the arguments (for passing pointers to structures,
* etc). When the multicall is actually issued, all the space for the
* commands and allocated memory is freed for reuse.
*
* Multicalls are flushed whenever any of the buffers get full, or
* when explicitly requested. There's no way to get per-multicall
* return results back. It will BUG if any of the multicalls fail.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <asm/xen/hypercall.h>
#include "multicalls.h"
#define MC_BATCH 32
#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
struct mc_buffer {
struct multicall_entry entries[MC_BATCH];
u64 args[MC_ARGS];
unsigned mcidx, argidx;
};
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
void xen_mc_flush(void)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
int ret = 0;
unsigned long flags;
BUG_ON(preemptible());
/* Disable interrupts in case someone comes in and queues
something in the middle */
local_irq_save(flags);
if (b->mcidx) {
int i;
if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
BUG();
for (i = 0; i < b->mcidx; i++)
if (b->entries[i].result < 0)
ret++;
b->mcidx = 0;
b->argidx = 0;
} else
BUG_ON(b->argidx != 0);
local_irq_restore(flags);
BUG_ON(ret);
}
struct multicall_space __xen_mc_entry(size_t args)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct multicall_space ret;
unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
BUG_ON(preemptible());
BUG_ON(argspace > MC_ARGS);
if (b->mcidx == MC_BATCH ||
(b->argidx + argspace) > MC_ARGS)
xen_mc_flush();
ret.mc = &b->entries[b->mcidx];
b->mcidx++;
ret.args = &b->args[b->argidx];
b->argidx += argspace;
return ret;
}
#ifndef _XEN_MULTICALLS_H
#define _XEN_MULTICALLS_H
#include "xen-ops.h"
/* Multicalls */
struct multicall_space
{
struct multicall_entry *mc;
void *args;
};
/* Allocate room for a multicall and its args */
struct multicall_space __xen_mc_entry(size_t args);
DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
/* Call to start a batch of multiple __xen_mc_entry()s. Must be
paired with xen_mc_issue() */
static inline void xen_mc_batch(void)
{
/* need to disable interrupts until this entry is complete */
local_irq_save(__get_cpu_var(xen_mc_irq_flags));
}
static inline struct multicall_space xen_mc_entry(size_t args)
{
xen_mc_batch();
return __xen_mc_entry(args);
}
/* Flush all pending multicalls */
void xen_mc_flush(void);
/* Issue a multicall if we're not in a lazy mode */
static inline void xen_mc_issue(unsigned mode)
{
if ((xen_get_lazy_mode() & mode) == 0)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
}
#endif /* _XEN_MULTICALLS_H */
/*
* Machine specific setup for xen
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <asm/elf.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
unsigned long *phys_to_machine_mapping;
EXPORT_SYMBOL(phys_to_machine_mapping);
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
char * __init xen_memory_setup(void)
{
unsigned long max_pfn = xen_start_info->nr_pages;
e820.nr_map = 0;
add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
return "Xen";
}
static void xen_idle(void)
{
local_irq_disable();
if (need_resched())
local_irq_enable();
else {
current_thread_info()->status &= ~TS_POLLING;
smp_mb__after_clear_bit();
safe_halt();
current_thread_info()->status |= TS_POLLING;
}
}
void __init xen_arch_setup(void)
{
struct physdev_set_iopl set_iopl;
int rc;
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
if (!xen_feature(XENFEAT_auto_translated_physmap))
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
__KERNEL_CS, (unsigned long)xen_failsafe_callback);
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
if (rc != 0)
printk(KERN_INFO "physdev_op failed %d\n", rc);
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
disable_acpi();
}
#endif
memcpy(boot_command_line, xen_start_info->cmd_line,
MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
pm_idle = xen_idle;
#ifdef CONFIG_SMP
/* fill cpus_possible with all available cpus */
xen_fill_possible_map();
#endif
paravirt_disable_iospace();
}
/*
* Xen SMP support
*
* This file implements the Xen versions of smp_ops. SMP under Xen is
* very straightforward. Bringing a CPU up is simply a matter of
* loading its initial context and setting it running.
*
* IPIs are handled through the Xen event mechanism.
*
* Because virtual CPUs can be scheduled onto any real CPU, there's no
* useful topology information for the kernel to make use of. As a
* result, all CPUs are treated as if they're single-core and
* single-threaded.
*
* This does not handle HOTPLUG_CPU yet.
*/
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/smp.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
#include <xen/page.h>
#include <xen/events.h>
#include "xen-ops.h"
#include "mmu.h"
static cpumask_t cpu_initialized_map;
static DEFINE_PER_CPU(int, resched_irq);
static DEFINE_PER_CPU(int, callfunc_irq);
/*
* Structure and data for smp_call_function(). This is designed to minimise
* static memory requirements. It also looks cleaner.
*/
static DEFINE_SPINLOCK(call_lock);
struct call_data_struct {
void (*func) (void *info);
void *info;
atomic_t started;
atomic_t finished;
int wait;
};
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static struct call_data_struct *call_data;
/*
* Reschedule call back. Nothing to do,
* all the work is done automatically when
* we return from the interrupt.
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
return IRQ_HANDLED;
}
static __cpuinit void cpu_bringup_and_idle(void)
{
int cpu = smp_processor_id();
cpu_init();
preempt_disable();
per_cpu(cpu_state, cpu) = CPU_ONLINE;
xen_setup_cpu_clockevents();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
wmb(); /* make sure everything is out */
cpu_idle();
}
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
const char *resched_name, *callfunc_name;
per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
resched_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(resched_irq, cpu) = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(callfunc_irq, cpu) = rc;
return 0;
fail:
if (per_cpu(resched_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
if (per_cpu(callfunc_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
return rc;
}
void __init xen_fill_possible_map(void)
{
int i, rc;
for (i = 0; i < NR_CPUS; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0)
cpu_set(i, cpu_possible_map);
}
}
void __init xen_smp_prepare_boot_cpu(void)
{
int cpu;
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
make_lowmem_page_readwrite(&per_cpu__gdt_page);
for (cpu = 0; cpu < NR_CPUS; cpu++) {
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
}
xen_setup_vcpu_info_placement();
}
void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
for (cpu = 0; cpu < NR_CPUS; cpu++) {
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
}
smp_store_cpu_info(0);
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
BUG();
cpu_initialized_map = cpumask_of_cpu(0);
/* Restrict the possible_map according to max_cpus. */
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
continue;
cpu_clear(cpu, cpu_possible_map);
}
for_each_possible_cpu (cpu) {
struct task_struct *idle;
if (cpu == 0)
continue;
idle = fork_idle(cpu);
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
cpu_set(cpu, cpu_present_map);
}
//init_xenbus_allowed_cpumask();
}
static __cpuinit int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
if (cpu_test_and_set(cpu, cpu_initialized_map))
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (ctxt == NULL)
return -ENOMEM;
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = 0;
ctxt->user_regs.ss = __KERNEL_DS;
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
xen_copy_trap_info(ctxt->trap_ctxt);
ctxt->ldt_ents = 0;
BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
make_lowmem_page_readonly(gdt->gdt);
ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
ctxt->user_regs.cs = __KERNEL_CS;
ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
ctxt->kernel_ss = __KERNEL_DS;
ctxt->kernel_sp = idle->thread.esp0;
ctxt->event_callback_cs = __KERNEL_CS;
ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_cs = __KERNEL_CS;
ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
kfree(ctxt);
return 0;
}
int __cpuinit xen_cpu_up(unsigned int cpu)
{
struct task_struct *idle = idle_task(cpu);
int rc;
#if 0
rc = cpu_up_check(cpu);
if (rc)
return rc;
#endif
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
xen_setup_timer(cpu);
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
rc = cpu_initialize_context(cpu, idle);
if (rc)
return rc;
if (num_online_cpus() == 1)
alternatives_smp_switch(1);
rc = xen_smp_intr_init(cpu);
if (rc)
return rc;
smp_store_cpu_info(cpu);
set_cpu_sibling_map(cpu);
/* This must be done before setting cpu_online_map */
wmb();
cpu_set(cpu, cpu_online_map);
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
return 0;
}
void xen_smp_cpus_done(unsigned int max_cpus)
{
}
static void stop_self(void *v)
{
int cpu = smp_processor_id();
/* make sure we're not pinning something down */
load_cr3(swapper_pg_dir);
/* should set up a minimal gdt */
HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
BUG();
}
void xen_smp_send_stop(void)
{
smp_call_function(stop_self, NULL, 0, 0);
}
void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
{
unsigned cpu;
cpus_and(mask, mask, cpu_online_map);
for_each_cpu_mask(cpu, mask)
xen_send_IPI_one(cpu, vector);
}
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
void (*func) (void *info) = call_data->func;
void *info = call_data->info;
int wait = call_data->wait;
/*
* Notify initiating CPU that I've grabbed the data and am
* about to execute the function
*/
mb();
atomic_inc(&call_data->started);
/*
* At this point the info structure may be out of scope unless wait==1
*/
irq_enter();
(*func)(info);
irq_exit();
if (wait) {
mb(); /* commit everything before setting finished */
atomic_inc(&call_data->finished);
}
return IRQ_HANDLED;
}
int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
void *info, int wait)
{
struct call_data_struct data;
int cpus;
/* Holding any lock stops cpus from going down. */
spin_lock(&call_lock);
cpu_clear(smp_processor_id(), mask);
cpus = cpus_weight(mask);
if (!cpus) {
spin_unlock(&call_lock);
return 0;
}
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
data.func = func;
data.info = info;
atomic_set(&data.started, 0);
data.wait = wait;
if (wait)
atomic_set(&data.finished, 0);
call_data = &data;
mb(); /* write everything before IPI */
/* Send a message to other CPUs and wait for them to respond */
xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
/* Make sure other vcpus get a chance to run.
XXX too severe? Maybe we should check the other CPU's states? */
HYPERVISOR_sched_op(SCHEDOP_yield, 0);
/* Wait for response */
while (atomic_read(&data.started) != cpus ||
(wait && atomic_read(&data.finished) != cpus))
cpu_relax();
spin_unlock(&call_lock);
return 0;
}
This diff is collapsed.
/*
Asm versions of Xen pv-ops, suitable for either direct use or inlining.
The inline versions are the same as the direct-use versions, with the
pre- and post-amble chopped off.
This code is encoded for size rather than absolute efficiency,
with a view to being able to inline as much as possible.
We only bother with direct forms (ie, vcpu in pda) of the operations
here; the indirect forms are better handled in C, since they're
generally too large to inline anyway.
*/
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
#define ENDPATCH(x) .globl x##_end; x##_end=.
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
/*
Enable events. This clears the event mask and tests the pending
event status with one and operation. If there are pending
events, then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
/* Clear mask and test pending */
andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
jz 1f
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
ret
ENDPROC(xen_irq_enable_direct)
RELOC(xen_irq_enable_direct, 2b+1)
/*
Disabling events is simply a matter of making the event mask
non-zero.
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
RELOC(xen_irq_disable_direct, 0)
/*
(xen_)save_fl is used to get the current interrupt enable status.
Callers expect the status to be in X86_EFLAGS_IF, and other bits
may be set in the return value. We take advantage of this by
making sure that X86_EFLAGS_IF has the right value (and other bits
in that byte are 0), but other bits in the return value are
undefined. We need to toggle the state of the bit, because
Xen and x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
setz %ah
addb %ah,%ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
RELOC(xen_save_fl_direct, 0)
/*
In principle the caller should be passing us a value return
from xen_save_fl_direct, but for robustness sake we test only
the X86_EFLAGS_IF flag rather than the whole byte. After
setting the interrupt mask state, it checks for unmasked
pending events and enters the hypervisor to get them delivered
if so.
*/
ENTRY(xen_restore_fl_direct)
testb $X86_EFLAGS_IF>>8, %ah
setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
jz 1f
2: call check_events
1:
ENDPATCH(xen_restore_fl_direct)
ret
ENDPROC(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
/*
This is run where a normal iret would be run, with the same stack setup:
8: eflags
4: cs
esp-> 0: eip
This attempts to make sure that any pending events are dealt
with on return to usermode, but there is a small window in
which an event can happen just before entering usermode. If
the nested interrupt ends up setting one of the TIF_WORK_MASK
pending work flags, they will not be tested again before
returning to usermode. This means that a process can end up
with pending work, which will be unprocessed until the process
enters and leaves the kernel again, which could be an
unbounded amount of time. This means that a pending signal or
reschedule event could be indefinitely delayed.
The fix is to notice a nested interrupt in the critical
window, and if one occurs, then fold the nested interrupt into
the current interrupt stack frame, and re-process it
iteratively rather than recursively. This means that it will
exit via the normal path, and all pending work will be dealt
with appropriately.
Because the nested interrupt handler needs to deal with the
current stack state in whatever form its in, we keep things
simple by only using a single register which is pushed/popped
on the stack.
Non-direct iret could be done in the same way, but it would
require an annoying amount of code duplication. We'll assume
that direct mode will be the common case once the hypervisor
support becomes commonplace.
*/
ENTRY(xen_iret_direct)
/* test eflags for special cases */
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
jnz hyper_iret
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
/* Store vcpu_info pointer for easy access. Do it this
way to avoid having to reload %fs */
#ifdef CONFIG_SMP
GET_THREAD_INFO(%eax)
movl TI_cpu(%eax),%eax
movl __per_cpu_offset(,%eax,4),%eax
lea per_cpu__xen_vcpu_info(%eax),%eax
#else
movl $per_cpu__xen_vcpu_info, %eax
#endif
/* check IF state we're restoring */
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
/* Maybe enable events. Once this happens we could get a
recursive event, so the critical region starts immediately
afterwards. However, if that happens we don't end up
resuming the code, so we don't have to be worried about
being preempted to another CPU. */
setz XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
cmpw $0x0001, XEN_vcpu_info_pending(%eax)
/* If there's something pending, mask events again so we
can jump back into xen_hypervisor_callback */
sete XEN_vcpu_info_mask(%eax)
popl %eax
/* From this point on the registers are restored and the stack
updated, so we don't need to worry about it if we're preempted */
iret_restore_end:
/* Jump to hypervisor_callback after fixing up the stack.
Events are masked, so jumping out of the critical
region is OK. */
je xen_hypervisor_callback
iret
xen_iret_end_crit:
hyper_iret:
/* put this out of line since its very rarely used */
jmp hypercall_page + __HYPERVISOR_iret * 32
.globl xen_iret_start_crit, xen_iret_end_crit
/*
This is called by xen_hypervisor_callback in entry.S when it sees
that the EIP at the time of interrupt was between xen_iret_start_crit
and xen_iret_end_crit. We're passed the EIP in %eax so we can do
a more refined determination of what to do.
The stack format at this point is:
----------------
ss : (ss/esp may be present if we came from usermode)
esp :
eflags } outer exception info
cs }
eip }
---------------- <- edi (copy dest)
eax : outer eax if it hasn't been restored
----------------
eflags } nested exception info
cs } (no ss/esp because we're nested
eip } from the same ring)
orig_eax }<- esi (copy src)
- - - - - - - -
fs }
es }
ds } SAVE_ALL state
eax }
: :
ebx }
----------------
return addr <- esp
----------------
In order to deliver the nested exception properly, we need to shift
everything from the return addr up to the error code so it
sits just under the outer exception info. This means that when we
handle the exception, we do it in the context of the outer exception
rather than starting a new one.
The only caveat is that if the outer eax hasn't been
restored yet (ie, it's still on stack), we need to insert
its value into the SAVE_ALL state before going on, since
it's usermode state which we eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
/* offsets +4 for return address */
/*
Paranoia: Make sure we're really coming from userspace.
One could imagine a case where userspace jumps into the
critical range address, but just before the CPU delivers a GP,
it decides to deliver an interrupt instead. Unlikely?
Definitely. Easy to avoid? Yes. The Intel documents
explicitly say that the reported EIP for a bad jump is the
jump instruction itself, not the destination, but some virtual
environments get this wrong.
*/
movl PT_CS+4(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
cmpl $USER_RPL, %ecx
je 2f
lea PT_ORIG_EAX+4(%esp), %esi
lea PT_EFLAGS+4(%esp), %edi
/* If eip is before iret_restore_end then stack
hasn't been restored yet. */
cmp $iret_restore_end, %eax
jae 1f
movl 0+4(%edi),%eax /* copy EAX */
movl %eax, PT_EAX+4(%esp)
lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
/* set up the copy */
1: std
mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
rep movsl
cld
lea 4(%edi),%esp /* point esp to new frame */
2: ret
/*
Force an event check by making a hypercall,
but preserve regs before making the call.
*/
check_events:
push %eax
push %ecx
push %edx
call force_evtchn_callback
pop %edx
pop %ecx
pop %eax
ret
/* Xen-specific pieces of head.S, intended to be included in the right
place in head.S */
#ifdef CONFIG_XEN
#include <linux/elfnote.h>
#include <asm/boot.h>
#include <xen/interface/elfnote.h>
ENTRY(startup_xen)
movl %esi,xen_start_info
cld
movl $(init_thread_union+THREAD_SIZE),%esp
jmp xen_start_kernel
.pushsection ".bss.page_aligned"
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
.skip 0x1000
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
#ifdef CONFIG_X86_PAE
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
#else
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
#endif
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
#endif /*CONFIG_XEN */
#ifndef XEN_OPS_H
#define XEN_OPS_H
#include <linux/init.h>
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
void xen_copy_trap_info(struct trap_info *traps);
DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
DECLARE_PER_CPU(unsigned long, xen_cr3);
extern struct start_info *xen_start_info;
extern struct shared_info *HYPERVISOR_shared_info;
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);
void xen_setup_timer(int cpu);
void xen_setup_cpu_clockevents(void);
unsigned long xen_cpu_khz(void);
void __init xen_time_init(void);
unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
void xen_mark_init_mm_pinned(void);
DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
static inline unsigned xen_get_lazy_mode(void)
{
return x86_read_percpu(xen_lazy_mode);
}
void __init xen_fill_possible_map(void);
void __init xen_setup_vcpu_info_placement(void);
void xen_smp_prepare_boot_cpu(void);
void xen_smp_prepare_cpus(unsigned int max_cpus);
int xen_cpu_up(unsigned int cpu);
void xen_smp_cpus_done(unsigned int max_cpus);
void xen_smp_send_stop(void);
void xen_smp_send_reschedule(int cpu);
int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
int wait);
int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int nonatomic, int wait);
int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
void *info, int wait);
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \
ret name(__VA_ARGS__); \
extern char name##_end[]; \
extern char name##_reloc[] \
DECL_ASM(void, xen_irq_enable_direct, void);
DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
void xen_iret_direct(void);
#endif /* XEN_OPS_H */
......@@ -6,6 +6,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/fcntl.h>
#include <xen/hvc-console.h>
/* Simple VGA output */
......@@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf)
simnow_init(buf + 6);
early_console = &simnow_console;
keep_early = 1;
#ifdef CONFIG_HVC_XEN
} else if (!strncmp(buf, "xen", 3)) {
early_console = &xenboot_console;
#endif
}
if (keep_early)
......
......@@ -174,7 +174,7 @@ static void do_mce_trigger(void)
if (events != atomic_read(&mce_logged) && trigger[0]) {
/* Small race window, but should be harmless. */
atomic_set(&mce_logged, events);
call_usermodehelper(trigger, trigger_argv, NULL, -1);
call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}
}
......
......@@ -15,6 +15,8 @@ obj-$(CONFIG_ACPI) += acpi/
obj-$(CONFIG_PNP) += pnp/
obj-$(CONFIG_ARM_AMBA) += amba/
obj-$(CONFIG_XEN) += xen/
# char/ comes before serial/ etc so that the VT console is the boot-time
# default.
obj-y += char/
......
......@@ -40,6 +40,7 @@
#include <linux/jiffies.h>
#include <linux/kmod.h>
#include <linux/seq_file.h>
#include <linux/reboot.h>
#include <asm/uaccess.h>
#include <acpi/acpi_bus.h>
......@@ -59,7 +60,6 @@
#define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0
#define ACPI_THERMAL_NOTIFY_HOT 0xF1
#define ACPI_THERMAL_MODE_ACTIVE 0x00
#define ACPI_THERMAL_PATH_POWEROFF "/sbin/poweroff"
#define ACPI_THERMAL_MAX_ACTIVE 10
#define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65
......@@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz)
return 0;
}
static int acpi_thermal_call_usermode(char *path)
{
char *argv[2] = { NULL, NULL };
char *envp[3] = { NULL, NULL, NULL };
if (!path)
return -EINVAL;
argv[0] = path;
/* minimal command environment */
envp[0] = "HOME=/";
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
call_usermodehelper(argv[0], argv, envp, 0);
return 0;
}
static int acpi_thermal_critical(struct acpi_thermal *tz)
{
if (!tz || !tz->trips.critical.flags.valid)
......@@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz)
acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL,
tz->trips.critical.flags.enabled);
acpi_thermal_call_usermode(ACPI_THERMAL_PATH_POWEROFF);
orderly_poweroff(true);
return 0;
}
......
......@@ -427,4 +427,13 @@ config XILINX_SYSACE
help
Include support for the Xilinx SystemACE CompactFlash interface
config XEN_BLKDEV_FRONTEND
tristate "Xen virtual block device support"
depends on XEN
default y
help
This driver implements the front-end of the Xen virtual
block device driver. It communicates with a back-end driver
in another domain which drives the actual block device.
endif # BLK_DEV
......@@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD) += viodasd.o
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
This diff is collapsed.
......@@ -604,6 +604,14 @@ config HVC_BEAT
help
Toshiba's Cell Reference Set Beat Console device driver
config HVC_XEN
bool "Xen Hypervisor Console support"
depends on XEN
select HVC_DRIVER
default y
help
Xen virtual console device driver
config HVCS
tristate "IBM Hypervisor Virtual Console Server support"
depends on PPC_PSERIES
......
......@@ -48,6 +48,7 @@ obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o
obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o
obj-$(CONFIG_HVC_BEAT) += hvc_beat.o
obj-$(CONFIG_HVC_DRIVER) += hvc_console.o
obj-$(CONFIG_HVC_XEN) += hvc_xen.o
obj-$(CONFIG_RAW_DRIVER) += raw.o
obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o
obj-$(CONFIG_MSPEC) += mspec.o
......
/*
* xen console driver interface to hvc_console.c
*
* (c) 2007 Gerd Hoffmann <kraxel@suse.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/console.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/types.h>
#include <asm/xen/hypervisor.h>
#include <xen/page.h>
#include <xen/events.h>
#include <xen/interface/io/console.h>
#include <xen/hvc-console.h>
#include "hvc_console.h"
#define HVC_COOKIE 0x58656e /* "Xen" in hex */
static struct hvc_struct *hvc;
static int xencons_irq;
/* ------------------------------------------------------------------ */
static inline struct xencons_interface *xencons_interface(void)
{
return mfn_to_virt(xen_start_info->console.domU.mfn);
}
static inline void notify_daemon(void)
{
/* Use evtchn: this is called early, before irq is set up. */
notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
}
static int write_console(uint32_t vtermno, const char *data, int len)
{
struct xencons_interface *intf = xencons_interface();
XENCONS_RING_IDX cons, prod;
int sent = 0;
cons = intf->out_cons;
prod = intf->out_prod;
mb(); /* update queue values before going on */
BUG_ON((prod - cons) > sizeof(intf->out));
while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
wmb(); /* write ring before updating pointer */
intf->out_prod = prod;
notify_daemon();
return sent;
}
static int read_console(uint32_t vtermno, char *buf, int len)
{
struct xencons_interface *intf = xencons_interface();
XENCONS_RING_IDX cons, prod;
int recv = 0;
cons = intf->in_cons;
prod = intf->in_prod;
mb(); /* get pointers before reading ring */
BUG_ON((prod - cons) > sizeof(intf->in));
while (cons != prod && recv < len)
buf[recv++] = intf->in[MASK_XENCONS_IDX(cons++, intf->in)];
mb(); /* read ring before consuming */
intf->in_cons = cons;
notify_daemon();
return recv;
}
static struct hv_ops hvc_ops = {
.get_chars = read_console,
.put_chars = write_console,
};
static int __init xen_init(void)
{
struct hvc_struct *hp;
if (!is_running_on_xen())
return 0;
xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
if (xencons_irq < 0)
xencons_irq = 0 /* NO_IRQ */;
hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
if (IS_ERR(hp))
return PTR_ERR(hp);
hvc = hp;
return 0;
}
static void __exit xen_fini(void)
{
if (hvc)
hvc_remove(hvc);
}
static int xen_cons_init(void)
{
if (!is_running_on_xen())
return 0;
hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
return 0;
}
module_init(xen_init);
module_exit(xen_fini);
console_initcall(xen_cons_init);
static void xenboot_write_console(struct console *console, const char *string,
unsigned len)
{
unsigned int linelen, off = 0;
const char *pos;
while (off < len && NULL != (pos = strchr(string+off, '\n'))) {
linelen = pos-string+off;
if (off + linelen > len)
break;
write_console(0, string+off, linelen);
write_console(0, "\r\n", 2);
off += linelen + 1;
}
if (off < len)
write_console(0, string+off, len-off);
}
struct console xenboot_console = {
.name = "xenboot",
.write = xenboot_write_console,
.flags = CON_PRINTBUFFER | CON_BOOT,
};
......@@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void)
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL };
return call_usermodehelper(critical_overtemp_path, argv, envp, 0);
return call_usermodehelper(critical_overtemp_path,
argv, envp, UMH_WAIT_EXEC);
}
......
......@@ -80,7 +80,8 @@ int wf_critical_overtemp(void)
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL };
return call_usermodehelper(critical_overtemp_path, argv, envp, 0);
return call_usermodehelper(critical_overtemp_path,
argv, envp, UMH_WAIT_EXEC);
}
EXPORT_SYMBOL_GPL(wf_critical_overtemp);
......
......@@ -2486,6 +2486,18 @@ source "drivers/atm/Kconfig"
source "drivers/s390/net/Kconfig"
config XEN_NETDEV_FRONTEND
tristate "Xen network device frontend driver"
depends on XEN
default y
help
The network device frontend driver allows the kernel to
access network devices exported exported by a virtual
machine containing a physical network device driver. The
frontend driver is intended for unprivileged guest domains;
if you are compiling a kernel for a Xen guest, you almost
certainly want to enable this.
config ISERIES_VETH
tristate "iSeries Virtual Ethernet driver support"
depends on PPC_ISERIES
......
......@@ -127,6 +127,8 @@ obj-$(CONFIG_PPPOL2TP) += pppox.o pppol2tp.o
obj-$(CONFIG_SLIP) += slip.o
obj-$(CONFIG_SLHC) += slhc.o
obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
obj-$(CONFIG_DUMMY) += dummy.o
obj-$(CONFIG_IFB) += ifb.o
obj-$(CONFIG_MACVLAN) += macvlan.o
......
......@@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc)
sprintf(portarg, "%ld", bc->pdev->port->base);
printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg);
return call_usermodehelper(eppconfig_path, argv, envp, 1);
return call_usermodehelper(eppconfig_path, argv, envp, UMH_WAIT_PROC);
}
/* ---------------------------------------------------------------------- */
......
This diff is collapsed.
......@@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info)
info->location_id, info->serial, info->capabilities);
envp[i] = NULL;
value = call_usermodehelper (argv [0], argv, envp, 0);
value = call_usermodehelper (argv [0], argv, envp, UMH_WAIT_EXEC);
kfree (buf);
kfree (envp);
return 0;
......
......@@ -7,6 +7,7 @@
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/kmod.h>
#include <linux/reboot.h>
#include <asm/oplib.h>
#include <asm/ebus.h>
......@@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp)
static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
{
static int shutting_down = 0;
static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
char *argv[] = { "/sbin/shutdown", "-h", "now", NULL };
char *type = "???";
s8 val = -1;
......@@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n");
shutting_down = 1;
if (call_usermodehelper("/sbin/shutdown", argv, envp, 0) < 0)
if (orderly_poweroff(true) < 0)
printk(KERN_CRIT "envctrl: shutdown execution failed\n");
}
......
......@@ -26,6 +26,7 @@
#include <linux/ioport.h>
#include <linux/miscdevice.h>
#include <linux/kmod.h>
#include <linux/reboot.h>
#include <asm/ebus.h>
#include <asm/uaccess.h>
......@@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type)
static void envctrl_do_shutdown(void)
{
static int inprog = 0;
static char *envp[] = {
"HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
char *argv[] = {
"/sbin/shutdown", "-h", "now", NULL };
int ret;
if (inprog != 0)
......@@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void)
inprog = 1;
printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n");
ret = call_usermodehelper("/sbin/shutdown", argv, envp, 0);
ret = orderly_poweroff(true);
if (ret < 0) {
printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n");
inprog = 0; /* unlikely to succeed, but we could try again */
......
obj-y += grant-table.o
obj-y += xenbus/
This diff is collapsed.
obj-y += xenbus.o
xenbus-objs =
xenbus-objs += xenbus_client.o
xenbus-objs += xenbus_comms.o
xenbus-objs += xenbus_xs.o
xenbus-objs += xenbus_probe.o
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment