Merge tag 'for-linus-4.11-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull xen updates from Juergen Gross: "Xen features and fixes: - a series from Boris Ostrovsky adding support for booting Linux as Xen PVH guest - a series from Juergen Gross streamlining the xenbus driver - a series from Paul Durrant adding support for the new device model hypercall - several small corrections" * tag 'for-linus-4.11-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: xen/privcmd: add IOCTL_PRIVCMD_RESTRICT xen/privcmd: Add IOCTL_PRIVCMD_DM_OP xen/privcmd: return -ENOTTY for unimplemented IOCTLs xen: optimize xenbus driver for multiple concurrent xenstore accesses xen: modify xenstore watch event interface xen: clean up xenbus internal headers xenbus: Neaten xenbus_va_dev_error xen/pvh: Use Xen's emergency_restart op for PVH guests xen/pvh: Enable CPU hotplug xen/pvh: PVH guests always have PV devices xen/pvh: Initialize grant table for PVH guests xen/pvh: Make sure we don't use ACPI_IRQ_MODEL_PIC for SCI xen/pvh: Bootstrap PVH guest xen/pvh: Import PVH-related Xen public interfaces xen/x86: Remove PVH support x86/boot/32: Convert the 32-bit pgtable setup code from assembly to C xen/manage: correct return value check on xenbus_scanf() x86/xen: Fix APIC id mismatch warning on Intel xen/netback: set default upper limit of tx/rx queues to 8 xen/netfront: set default upper limit of tx/rx queues to 8

Merge tag 'for-linus-4.11-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull xen updates from Juergen Gross: "Xen features and fixes: - a series from Boris Ostrovsky adding support for booting Linux as Xen PVH guest - a series from Juergen Gross streamlining the xenbus driver - a series from Paul Durrant adding support for the new device model hypercall - several small corrections" * tag 'for-linus-4.11-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: xen/privcmd: add IOCTL_PRIVCMD_RESTRICT xen/privcmd: Add IOCTL_PRIVCMD_DM_OP xen/privcmd: return -ENOTTY for unimplemented IOCTLs xen: optimize xenbus driver for multiple concurrent xenstore accesses xen: modify xenstore watch event interface xen: clean up xenbus internal headers xenbus: Neaten xenbus_va_dev_error xen/pvh: Use Xen's emergency_restart op for PVH guests xen/pvh: Enable CPU hotplug xen/pvh: PVH guests always have PV devices xen/pvh: Initialize grant table for PVH guests xen/pvh: Make sure we don't use ACPI_IRQ_MODEL_PIC for SCI xen/pvh: Bootstrap PVH guest xen/pvh: Import PVH-related Xen public interfaces xen/x86: Remove PVH support x86/boot/32: Convert the 32-bit pgtable setup code from assembly to C xen/manage: correct return value check on xenbus_scanf() x86/xen: Fix APIC id mismatch warning on Intel xen/netback: set default upper limit of tx/rx queues to 8 xen/netfront: set default upper limit of tx/rx queues to 8
252b95c0 · Linus Torvalds · b8989bcc · 4610d240 · 252b95c0 · 252b95c0
Commit 252b95c0 authored Feb 21, 2017 by Linus Torvalds
47 changed files
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -457,4 +457,5 @@ EXPORT_SYMBOL_GPL(HYPERVISOR_tmem_op);
 EXPORT_SYMBOL_GPL(HYPERVISOR_platform_op);
 EXPORT_SYMBOL_GPL(HYPERVISOR_multicall);
 EXPORT_SYMBOL_GPL(HYPERVISOR_vm_assist);
+EXPORT_SYMBOL_GPL(HYPERVISOR_dm_op);
 EXPORT_SYMBOL_GPL(privcmd_call);
--- a/arch/arm/xen/hypercall.S
+++ b/arch/arm/xen/hypercall.S
@@ -92,6 +92,7 @@ HYPERCALL1(tmem_op);
 HYPERCALL1(platform_op_raw);
 HYPERCALL2(multicall);
 HYPERCALL2(vm_assist);
+HYPERCALL3(dm_op);

 ENTRY(privcmd_call)
 	stmdb sp!, {r4}

--- a/arch/arm64/xen/hypercall.S
+++ b/arch/arm64/xen/hypercall.S
@@ -84,6 +84,7 @@ HYPERCALL1(tmem_op);
 HYPERCALL1(platform_op_raw);
 HYPERCALL2(multicall);
 HYPERCALL2(vm_assist);
+HYPERCALL3(dm_op);

 ENTRY(privcmd_call)
 	mov x16, x0

--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -472,6 +472,13 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
 	return _hypercall2(int, xenpmu_op, op, arg);
 }

+static inline int
+HYPERVISOR_dm_op(
+	domid_t dom, unsigned int nr_bufs, void *bufs)
+{
+	return _hypercall3(int, dm_op, dom, nr_bufs, bufs);
+}
+
 static inline void
 MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {

--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -53,5 +53,5 @@ config XEN_DEBUG_FS

 config XEN_PVH
 	bool "Support for running as a PVH guest"
-	depends on X86_64 && XEN && XEN_PVHVM
+	depends on XEN && XEN_PVHVM && ACPI
 	def_bool n
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
 obj-$(CONFIG_XEN_DOM0)		+= vga.o
 obj-$(CONFIG_SWIOTLB_XEN)	+= pci-swiotlb-xen.o
 obj-$(CONFIG_XEN_EFI)		+= efi.o
+obj-$(CONFIG_XEN_PVH)	 	+= xen-pvh.o
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -145,7 +145,7 @@ static void xen_silent_inquire(int apicid)
 static int xen_cpu_present_to_apicid(int cpu)
 {
 	if (cpu_present(cpu))
-		return xen_get_apic_id(xen_apic_read(APIC_ID));
+		return cpu_data(cpu).apicid;
 	else
 		return BAD_APICID;
 }

--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -45,6 +45,7 @@
 #include <xen/interface/memory.h>
 #include <xen/interface/nmi.h>
 #include <xen/interface/xen-mca.h>
+#include <xen/interface/hvm/start_info.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvm.h>
@@ -176,6 +177,20 @@ struct tls_descs {
 */
 static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);

+#ifdef CONFIG_XEN_PVH
+/*
+ * PVH variables.
+ *
+ * xen_pvh and pvh_bootparams need to live in data segment since they
+ * are used after startup_{32|64}, which clear .bss, are invoked.
+ */
+bool xen_pvh __attribute__((section(".data"))) = 0;
+struct boot_params pvh_bootparams __attribute__((section(".data")));
+
+struct hvm_start_info pvh_start_info;
+unsigned int pvh_start_info_sz = sizeof(pvh_start_info);
+#endif
+
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
@@ -1138,10 +1153,11 @@ void xen_setup_vcpu_info_placement(void)
 		xen_vcpu_setup(cpu);
 	}

-	/* xen_vcpu_setup managed to place the vcpu_info within the
-	 * percpu area for all cpus, so make use of it. Note that for
-	 * PVH we want to use native IRQ mechanism. */
-	if (have_vcpu_info_placement && !xen_pvh_domain()) {
+	/*
+	 * xen_vcpu_setup managed to place the vcpu_info within the
+	 * percpu area for all cpus, so make use of it.
+	 */
+	if (have_vcpu_info_placement) {
 		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
 		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
 		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -1413,49 +1429,9 @@ static void __init xen_boot_params_init_edd(void)
 * Set up the GDT and segment registers for -fstack-protector.  Until
 * we do this, we have to be careful not to call any stack-protected
 * function, which is most of the kernel.
- *
- * Note, that it is __ref because the only caller of this after init
- * is PVH which is not going to use xen_load_gdt_boot or other
- * __init functions.
 */
-static void __ref xen_setup_gdt(int cpu)
+static void xen_setup_gdt(int cpu)
 {
-	if (xen_feature(XENFEAT_auto_translated_physmap)) {
-#ifdef CONFIG_X86_64
-		unsigned long dummy;
-
-		load_percpu_segment(cpu); /* We need to access per-cpu area */
-		switch_to_new_gdt(cpu); /* GDT and GS set */
-
-		/* We are switching of the Xen provided GDT to our HVM mode
-		 * GDT. The new GDT has  __KERNEL_CS with CS.L = 1
-		 * and we are jumping to reload it.
-		 */
-		asm volatile ("pushq %0\n"
-			      "leaq 1f(%%rip),%0\n"
-			      "pushq %0\n"
-			      "lretq\n"
-			      "1:\n"
-			      : "=&r" (dummy) : "0" (__KERNEL_CS));
-
-		/*
-		 * While not needed, we also set the %es, %ds, and %fs
-		 * to zero. We don't care about %ss as it is NULL.
-		 * Strictly speaking this is not needed as Xen zeros those
-		 * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE)
-		 *
-		 * Linux zeros them in cpu_init() and in secondary_startup_64
-		 * (for BSP).
-		 */
-		loadsegment(es, 0);
-		loadsegment(ds, 0);
-		loadsegment(fs, 0);
-#else
-		/* PVH: TODO Implement. */
-		BUG();
-#endif
-		return; /* PVH does not need any PV GDT ops. */
-	}
 	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
 	pv_cpu_ops.load_gdt = xen_load_gdt_boot;

@@ -1466,59 +1442,6 @@ static void __ref xen_setup_gdt(int cpu)
 	pv_cpu_ops.load_gdt = xen_load_gdt;
 }

-#ifdef CONFIG_XEN_PVH
-/*
- * A PV guest starts with default flags that are not set for PVH, set them
- * here asap.
- */
-static void xen_pvh_set_cr_flags(int cpu)
-{
-
-	/* Some of these are setup in 'secondary_startup_64'. The others:
-	 * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests
-	 * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */
-	write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM);
-
-	if (!cpu)
-		return;
-	/*
-	 * For BSP, PSE PGE are set in probe_page_size_mask(), for APs
-	 * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu().
-	*/
-	if (boot_cpu_has(X86_FEATURE_PSE))
-		cr4_set_bits_and_update_boot(X86_CR4_PSE);
-
-	if (boot_cpu_has(X86_FEATURE_PGE))
-		cr4_set_bits_and_update_boot(X86_CR4_PGE);
-}
-
-/*
- * Note, that it is ref - because the only caller of this after init
- * is PVH which is not going to use xen_load_gdt_boot or other
- * __init functions.
- */
-void __ref xen_pvh_secondary_vcpu_init(int cpu)
-{
-	xen_setup_gdt(cpu);
-	xen_pvh_set_cr_flags(cpu);
-}
-
-static void __init xen_pvh_early_guest_init(void)
-{
-	if (!xen_feature(XENFEAT_auto_translated_physmap))
-		return;
-
-	BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector));
-
-	xen_pvh_early_cpu_init(0, false);
-	xen_pvh_set_cr_flags(0);
-
-#ifdef CONFIG_X86_32
-	BUG(); /* PVH: Implement proper support. */
-#endif
-}
-#endif    /* CONFIG_XEN_PVH */
-
 static void __init xen_dom0_set_legacy_features(void)
 {
 	x86_platform.legacy.rtc = 1;
@@ -1555,24 +1478,17 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	xen_domain_type = XEN_PV_DOMAIN;

 	xen_setup_features();
-#ifdef CONFIG_XEN_PVH
-	xen_pvh_early_guest_init();
-#endif
+
 	xen_setup_machphys_mapping();

 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
 	pv_init_ops = xen_init_ops;
-	if (!xen_pvh_domain()) {
-		pv_cpu_ops = xen_cpu_ops;
+	pv_cpu_ops = xen_cpu_ops;

-		x86_platform.get_nmi_reason = xen_get_nmi_reason;
-	}
+	x86_platform.get_nmi_reason = xen_get_nmi_reason;

-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		x86_init.resources.memory_setup = xen_auto_xlated_memory_setup;
-	else
-		x86_init.resources.memory_setup = xen_memory_setup;
+	x86_init.resources.memory_setup = xen_memory_setup;
 	x86_init.oem.arch_setup = xen_arch_setup;
 	x86_init.oem.banner = xen_banner;

@@ -1665,18 +1581,15 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	/* set the limit of our address space */
 	xen_reserve_top();

-	/* PVH: runs at default kernel iopl of 0 */
-	if (!xen_pvh_domain()) {
-		/*
-		 * We used to do this in xen_arch_setup, but that is too late
-		 * on AMD were early_cpu_init (run before ->arch_setup()) calls
-		 * early_amd_init which pokes 0xcf8 port.
-		 */
-		set_iopl.iopl = 1;
-		rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
-		if (rc != 0)
-			xen_raw_printk("physdev_op failed %d\n", rc);
-	}
+	/*
+	 * We used to do this in xen_arch_setup, but that is too late
+	 * on AMD were early_cpu_init (run before ->arch_setup()) calls
+	 * early_amd_init which pokes 0xcf8 port.
+	 */
+	set_iopl.iopl = 1;
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+	if (rc != 0)
+		xen_raw_printk("physdev_op failed %d\n", rc);

 #ifdef CONFIG_X86_32
 	/* set up basic CPUID stuff */
@@ -1758,6 +1671,102 @@ asmlinkage __visible void __init xen_start_kernel(void)
 #endif
 }

+#ifdef CONFIG_XEN_PVH
+
+static void xen_pvh_arch_setup(void)
+{
+#ifdef CONFIG_ACPI
+	/* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */
+	if (nr_ioapics == 0)
+		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
+#endif
+}
+
+static void __init init_pvh_bootparams(void)
+{
+	struct xen_memory_map memmap;
+	unsigned int i;
+	int rc;
+
+	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
+
+	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_map);
+	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_map);
+	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+	if (rc) {
+		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
+		BUG();
+	}
+
+	if (memmap.nr_entries < E820MAX - 1) {
+		pvh_bootparams.e820_map[memmap.nr_entries].addr =
+			ISA_START_ADDRESS;
+		pvh_bootparams.e820_map[memmap.nr_entries].size =
+			ISA_END_ADDRESS - ISA_START_ADDRESS;
+		pvh_bootparams.e820_map[memmap.nr_entries].type =
+			E820_RESERVED;
+		memmap.nr_entries++;
+	} else
+		xen_raw_printk("Warning: Can fit ISA range into e820\n");
+
+	sanitize_e820_map(pvh_bootparams.e820_map,
+			  ARRAY_SIZE(pvh_bootparams.e820_map),
+			  &memmap.nr_entries);
+
+	pvh_bootparams.e820_entries = memmap.nr_entries;
+	for (i = 0; i < pvh_bootparams.e820_entries; i++)
+		e820_add_region(pvh_bootparams.e820_map[i].addr,
+				pvh_bootparams.e820_map[i].size,
+				pvh_bootparams.e820_map[i].type);
+
+	pvh_bootparams.hdr.cmd_line_ptr =
+		pvh_start_info.cmdline_paddr;
+
+	/* The first module is always ramdisk. */
+	if (pvh_start_info.nr_modules) {
+		struct hvm_modlist_entry *modaddr =
+			__va(pvh_start_info.modlist_paddr);
+		pvh_bootparams.hdr.ramdisk_image = modaddr->paddr;
+		pvh_bootparams.hdr.ramdisk_size = modaddr->size;
+	}
+
+	/*
+	 * See Documentation/x86/boot.txt.
+	 *
+	 * Version 2.12 supports Xen entry point but we will use default x86/PC
+	 * environment (i.e. hardware_subarch 0).
+	 */
+	pvh_bootparams.hdr.version = 0x212;
+	pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
+}
+
+/*
+ * This routine (and those that it might call) should not use
+ * anything that lives in .bss since that segment will be cleared later.
+ */
+void __init xen_prepare_pvh(void)
+{
+	u32 msr;
+	u64 pfn;
+
+	if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
+		xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
+				pvh_start_info.magic);
+		BUG();
+	}
+
+	xen_pvh = 1;
+
+	msr = cpuid_ebx(xen_cpuid_base() + 2);
+	pfn = __pa(hypercall_page);
+	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+	init_pvh_bootparams();
+
+	x86_init.oem.arch_setup = xen_pvh_arch_setup;
+}
+#endif
+
 void __ref xen_hvm_init_shared_info(void)
 {
 	int cpu;
@@ -1797,20 +1806,29 @@ void __ref xen_hvm_init_shared_info(void)
 static void __init init_hvm_pv_info(void)
 {
 	int major, minor;
-	uint32_t eax, ebx, ecx, edx, pages, msr, base;
-	u64 pfn;
+	uint32_t eax, ebx, ecx, edx, base;

 	base = xen_cpuid_base();
-	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+	eax = cpuid_eax(base + 1);

 	major = eax >> 16;
 	minor = eax & 0xffff;
 	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);

-	cpuid(base + 2, &pages, &msr, &ecx, &edx);
+	xen_domain_type = XEN_HVM_DOMAIN;

-	pfn = __pa(hypercall_page);
-	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+	/* PVH set up hypercall page in xen_prepare_pvh(). */
+	if (xen_pvh_domain())
+		pv_info.name = "Xen PVH";
+	else {
+		u64 pfn;
+		uint32_t msr;
+
+		pv_info.name = "Xen HVM";
+		msr = cpuid_ebx(base + 2);
+		pfn = __pa(hypercall_page);
+		wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+	}

 	xen_setup_features();

@@ -1819,10 +1837,6 @@ static void __init init_hvm_pv_info(void)
 		this_cpu_write(xen_vcpu_id, ebx);
 	else
 		this_cpu_write(xen_vcpu_id, smp_processor_id());
-
-	pv_info.name = "Xen HVM";
-
-	xen_domain_type = XEN_HVM_DOMAIN;
 }
 #endif

@@ -1910,6 +1924,9 @@ static void __init xen_hvm_guest_init(void)
 	x86_init.irqs.intr_init = xen_init_IRQ;
 	xen_hvm_init_time_ops();
 	xen_hvm_init_mmu_ops();
+
+	if (xen_pvh_domain())
+		machine_ops.emergency_restart = xen_emergency_restart;
 #ifdef CONFIG_KEXEC_CORE
 	machine_ops.shutdown = xen_hvm_shutdown;
 	machine_ops.crash_shutdown = xen_hvm_crash_shutdown;

--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1792,10 +1792,6 @@ static void __init set_page_prot_flags(void *addr, pgprot_t prot,
 	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
 	pte_t pte = pfn_pte(pfn, prot);

-	/* For PVH no need to set R/O or R/W to pin them or unpin them. */
-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		return;
-
 	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
 		BUG();
 }
@@ -1902,8 +1898,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
 * level2_ident_pgt, and level2_kernel_pgt.  This means that only the
 * kernel has a physical mapping to start with - but that's enough to
 * get __va working.  We need to fill in the rest of the physical
- * mapping once some sort of allocator has been set up.  NOTE: for
- * PVH, the page tables are native.
+ * mapping once some sort of allocator has been set up.
 */
 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 {
@@ -2812,16 +2807,6 @@ static int do_remap_gfn(struct vm_area_struct *vma,

 	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));

-	if (xen_feature(XENFEAT_auto_translated_physmap)) {
-#ifdef CONFIG_XEN_PVH
-		/* We need to update the local page tables and the xen HAP */
-		return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
-						 prot, domid, pages);
-#else
-		return -EINVAL;
-#endif
-        }
-
 	rmd.mfn = gfn;
 	rmd.prot = prot;
 	/* We use the err_ptr to indicate if there we are doing a contiguous
@@ -2915,10 +2900,6 @@ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
 	if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
 		return 0;

-#ifdef CONFIG_XEN_PVH
-	return xen_xlate_unmap_gfn_range(vma, numpgs, pages);
-#else
 	return -EINVAL;
-#endif
 }
 EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -73,8 +73,8 @@ bool xen_has_pv_devices(void)
 	if (!xen_domain())
 		return false;

-	/* PV domains always have them. */
-	if (xen_pv_domain())
+	/* PV and PVH domains always have them. */
+	if (xen_pv_domain() || xen_pvh_domain())
 		return true;

 	/* And user has xen_platform_pci=0 set in guest config as

--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -914,39 +914,6 @@ char * __init xen_memory_setup(void)
 	return "Xen";
 }

-/*
- * Machine specific memory setup for auto-translated guests.
- */
-char * __init xen_auto_xlated_memory_setup(void)
-{
-	struct xen_memory_map memmap;
-	int i;
-	int rc;
-
-	memmap.nr_entries = ARRAY_SIZE(xen_e820_map);
-	set_xen_guest_handle(memmap.buffer, xen_e820_map);
-
-	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
-	if (rc < 0)
-		panic("No memory map (%d)\n", rc);
-
-	xen_e820_map_entries = memmap.nr_entries;
-
-	sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
-			  &xen_e820_map_entries);
-
-	for (i = 0; i < xen_e820_map_entries; i++)
-		e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
-				xen_e820_map[i].type);
-
-	/* Remove p2m info, it is not needed. */
-	xen_start_info->mfn_list = 0;
-	xen_start_info->first_p2m_pfn = 0;
-	xen_start_info->nr_p2m_frames = 0;
-
-	return "Xen";
-}
-
 /*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
@@ -1032,8 +999,8 @@ void __init xen_pvmmu_arch_setup(void)
 void __init xen_arch_setup(void)
 {
 	xen_panic_handler_init();
-	if (!xen_feature(XENFEAT_auto_translated_physmap))
-		xen_pvmmu_arch_setup();
+
+	xen_pvmmu_arch_setup();

 #ifdef CONFIG_ACPI
 	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {

--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -99,18 +99,8 @@ static void cpu_bringup(void)
 	local_irq_enable();
 }

-/*
- * Note: cpu parameter is only relevant for PVH. The reason for passing it
- * is we can't do smp_processor_id until the percpu segments are loaded, for
- * which we need the cpu number! So we pass it in rdi as first parameter.
- */
-asmlinkage __visible void cpu_bringup_and_idle(int cpu)
+asmlinkage __visible void cpu_bringup_and_idle(void)
 {
-#ifdef CONFIG_XEN_PVH
-	if (xen_feature(XENFEAT_auto_translated_physmap) &&
-	    xen_feature(XENFEAT_supervisor_mode_kernel))
-		xen_pvh_secondary_vcpu_init(cpu);
-#endif
 	cpu_bringup();
 	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
@@ -404,61 +394,47 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	gdt = get_cpu_gdt_table(cpu);

 #ifdef CONFIG_X86_32
-	/* Note: PVH is not yet supported on x86_32. */
 	ctxt->user_regs.fs = __KERNEL_PERCPU;
 	ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
 #endif
 	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));

-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-		ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
-		ctxt->flags = VGCF_IN_KERNEL;
-		ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
-		ctxt->user_regs.ds = __USER_DS;
-		ctxt->user_regs.es = __USER_DS;
-		ctxt->user_regs.ss = __KERNEL_DS;
+	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+	ctxt->flags = VGCF_IN_KERNEL;
+	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+	ctxt->user_regs.ds = __USER_DS;
+	ctxt->user_regs.es = __USER_DS;
+	ctxt->user_regs.ss = __KERNEL_DS;

-		xen_copy_trap_info(ctxt->trap_ctxt);
+	xen_copy_trap_info(ctxt->trap_ctxt);

-		ctxt->ldt_ents = 0;
+	ctxt->ldt_ents = 0;

-		BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+	BUG_ON((unsigned long)gdt & ~PAGE_MASK);

-		gdt_mfn = arbitrary_virt_to_mfn(gdt);
-		make_lowmem_page_readonly(gdt);
-		make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
+	gdt_mfn = arbitrary_virt_to_mfn(gdt);
+	make_lowmem_page_readonly(gdt);
+	make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));

-		ctxt->gdt_frames[0] = gdt_mfn;
-		ctxt->gdt_ents      = GDT_ENTRIES;
+	ctxt->gdt_frames[0] = gdt_mfn;
+	ctxt->gdt_ents      = GDT_ENTRIES;

-		ctxt->kernel_ss = __KERNEL_DS;
-		ctxt->kernel_sp = idle->thread.sp0;
+	ctxt->kernel_ss = __KERNEL_DS;
+	ctxt->kernel_sp = idle->thread.sp0;

 #ifdef CONFIG_X86_32
-		ctxt->event_callback_cs     = __KERNEL_CS;
-		ctxt->failsafe_callback_cs  = __KERNEL_CS;
+	ctxt->event_callback_cs     = __KERNEL_CS;
+	ctxt->failsafe_callback_cs  = __KERNEL_CS;
 #else
-		ctxt->gs_base_kernel = per_cpu_offset(cpu);
-#endif
-		ctxt->event_callback_eip    =
-					(unsigned long)xen_hypervisor_callback;
-		ctxt->failsafe_callback_eip =
-					(unsigned long)xen_failsafe_callback;
-		ctxt->user_regs.cs = __KERNEL_CS;
-		per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
-	}
-#ifdef CONFIG_XEN_PVH
-	else {
-		/*
-		 * The vcpu comes on kernel page tables which have the NX pte
-		 * bit set. This means before DS/SS is touched, NX in
-		 * EFER must be set. Hence the following assembly glue code.
-		 */
-		ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init;
-		ctxt->user_regs.rdi = cpu;
-		ctxt->user_regs.rsi = true;  /* entry == true */
-	}
+	ctxt->gs_base_kernel = per_cpu_offset(cpu);
 #endif
+	ctxt->event_callback_eip    =
+		(unsigned long)xen_hypervisor_callback;
+	ctxt->failsafe_callback_eip =
+		(unsigned long)xen_failsafe_callback;
+	ctxt->user_regs.cs = __KERNEL_CS;
+	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+
 	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
 	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
 	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))

--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -21,12 +21,4 @@ static inline int xen_smp_intr_init(unsigned int cpu)
 static inline void xen_smp_intr_free(unsigned int cpu) {}
 #endif /* CONFIG_SMP */

-#ifdef CONFIG_XEN_PVH
-extern void xen_pvh_early_cpu_init(int cpu, bool entry);
-#else
-static inline void xen_pvh_early_cpu_init(int cpu, bool entry)
-{
-}
-#endif
-
 #endif
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -16,25 +16,6 @@
 #include <xen/interface/xen-mca.h>
 #include <asm/xen/interface.h>

-#ifdef CONFIG_XEN_PVH
-#define PVH_FEATURES_STR  "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel"
-/* Note the lack of 'hvm_callback_vector'. Older hypervisor will
- * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in
- * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore.
- */
-#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \
-		      (1 << XENFEAT_auto_translated_physmap) | \
-		      (1 << XENFEAT_supervisor_mode_kernel) | \
-		      (1 << XENFEAT_hvm_callback_vector))
-/* The XENFEAT_writable_page_tables is not stricly necessary as we set that
- * up regardless whether this CONFIG option is enabled or not, but it
- * clarifies what the right flags need to be.
- */
-#else
-#define PVH_FEATURES_STR  ""
-#define PVH_FEATURES (0)
-#endif
-
 	__INIT
 ENTRY(startup_xen)
 	cld
@@ -54,41 +35,6 @@ ENTRY(startup_xen)

 	__FINIT

-#ifdef CONFIG_XEN_PVH
-/*
- * xen_pvh_early_cpu_init() - early PVH VCPU initialization
- * @cpu:   this cpu number (%rdi)
- * @entry: true if this is a secondary vcpu coming up on this entry
- *         point, false if this is the boot CPU being initialized for
- *         the first time (%rsi)
- *
- * Note: This is called as a function on the boot CPU, and is the entry point
- *       on the secondary CPU.
- */
-ENTRY(xen_pvh_early_cpu_init)
-	mov     %rsi, %r11
-
-	/* Gather features to see if NX implemented. */
-	mov     $0x80000001, %eax
-	cpuid
-	mov     %edx, %esi
-
-	mov     $MSR_EFER, %ecx
-	rdmsr
-	bts     $_EFER_SCE, %eax
-
-	bt      $20, %esi
-	jnc     1f      	/* No NX, skip setting it */
-	bts     $_EFER_NX, %eax
-1:	wrmsr
-#ifdef CONFIG_SMP
-	cmp     $0, %r11b
-	jne     cpu_bringup_and_idle
-#endif
-	ret
-
-#endif /* CONFIG_XEN_PVH */
-
 .pushsection .text
 	.balign PAGE_SIZE
 ENTRY(hypercall_page)
@@ -114,10 +60,10 @@ ENTRY(hypercall_page)
 #endif
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
-	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR)
-	ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) |
-						(1 << XENFEAT_writable_page_tables) |
-						(1 << XENFEAT_dom0))
+	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,
+		.ascii "!writable_page_tables|pae_pgdir_above_4gb")
+	ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
+		.long (1 << XENFEAT_writable_page_tables) | (1 << XENFEAT_dom0))
 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,

--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -146,5 +146,4 @@ __visible void xen_adjust_exception_frame(void);

 extern int xen_panic_handler_init(void);

-void xen_pvh_secondary_vcpu_init(int cpu);
 #endif /* XEN_OPS_H */
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
+/*
+ * Copyright C 2016, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+	.code32
+	.text
+#define _pa(x)          ((x) - __START_KERNEL_map)
+
+#include <linux/elfnote.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/asm.h>
+#include <asm/boot.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <xen/interface/elfnote.h>
+
+	__HEAD
+
+/*
+ * Entry point for PVH guests.
+ *
+ * Xen ABI specifies the following register state when we come here:
+ *
+ * - `ebx`: contains the physical memory address where the loader has placed
+ *          the boot start info structure.
+ * - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared.
+ * - `cr4`: all bits are cleared.
+ * - `cs `: must be a 32-bit read/execute code segment with a base of ‘0’
+ *          and a limit of ‘0xFFFFFFFF’. The selector value is unspecified.
+ * - `ds`, `es`: must be a 32-bit read/write data segment with a base of
+ *               ‘0’ and a limit of ‘0xFFFFFFFF’. The selector values are all
+ *               unspecified.
+ * - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit
+ *         of '0x67'.
+ * - `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared.
+ *             Bit 8 (TF) must be cleared. Other bits are all unspecified.
+ *
+ * All other processor registers and flag bits are unspecified. The OS is in
+ * charge of setting up it's own stack, GDT and IDT.
+ */
+
+ENTRY(pvh_start_xen)
+	cld
+
+	lgdt (_pa(gdt))
+
+	mov $(__BOOT_DS),%eax
+	mov %eax,%ds
+	mov %eax,%es
+	mov %eax,%ss
+
+	/* Stash hvm_start_info. */
+	mov $_pa(pvh_start_info), %edi
+	mov %ebx, %esi
+	mov _pa(pvh_start_info_sz), %ecx
+	shr $2,%ecx
+	rep
+	movsl
+
+	mov $_pa(early_stack_end), %esp
+
+	/* Enable PAE mode. */
+	mov %cr4, %eax
+	orl $X86_CR4_PAE, %eax
+	mov %eax, %cr4
+
+#ifdef CONFIG_X86_64
+	/* Enable Long mode. */
+	mov $MSR_EFER, %ecx
+	rdmsr
+	btsl $_EFER_LME, %eax
+	wrmsr
+
+	/* Enable pre-constructed page tables. */
+	mov $_pa(init_level4_pgt), %eax
+	mov %eax, %cr3
+	mov $(X86_CR0_PG | X86_CR0_PE), %eax
+	mov %eax, %cr0
+
+	/* Jump to 64-bit mode. */
+	ljmp $__KERNEL_CS, $_pa(1f)
+
+	/* 64-bit entry point. */
+	.code64
+1:
+	call xen_prepare_pvh
+
+	/* startup_64 expects boot_params in %rsi. */
+	mov $_pa(pvh_bootparams), %rsi
+	mov $_pa(startup_64), %rax
+	jmp *%rax
+
+#else /* CONFIG_X86_64 */
+
+	call mk_early_pgtbl_32
+
+	mov $_pa(initial_page_table), %eax
+	mov %eax, %cr3
+
+	mov %cr0, %eax
+	or $(X86_CR0_PG | X86_CR0_PE), %eax
+	mov %eax, %cr0
+
+	ljmp $__BOOT_CS, $1f
+1:
+	call xen_prepare_pvh
+	mov $_pa(pvh_bootparams), %esi
+
+	/* startup_32 doesn't expect paging and PAE to be on. */
+	ljmp $__BOOT_CS, $_pa(2f)
+2:
+	mov %cr0, %eax
+	and $~X86_CR0_PG, %eax
+	mov %eax, %cr0
+	mov %cr4, %eax
+	and $~X86_CR4_PAE, %eax
+	mov %eax, %cr4
+
+	ljmp $__BOOT_CS, $_pa(startup_32)
+#endif
+END(pvh_start_xen)
+
+	.section ".init.data","aw"
+	.balign 8
+gdt:
+	.word gdt_end - gdt_start
+	.long _pa(gdt_start)
+	.word 0
+gdt_start:
+	.quad 0x0000000000000000            /* NULL descriptor */
+	.quad 0x0000000000000000            /* reserved */
+#ifdef CONFIG_X86_64
+	.quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* __KERNEL_CS */
+#else
+	.quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* __KERNEL_CS */
+#endif
+	.quad GDT_ENTRY(0xc092, 0, 0xfffff) /* __KERNEL_DS */
+gdt_end:
+
+	.balign 4
+early_stack:
+	.fill 256, 1, 0
+early_stack_end:
+
+	ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
+	             _ASM_PTR (pvh_start_xen - __START_KERNEL_map))
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -38,8 +38,8 @@ struct backend_info {
 static struct kmem_cache *xen_blkif_cachep;
 static void connect(struct backend_info *);
 static int connect_ring(struct backend_info *);
-static void backend_changed(struct xenbus_watch *, const char **,
-			    unsigned int);
+static void backend_changed(struct xenbus_watch *, const char *,
+			    const char *);
 static void xen_blkif_free(struct xen_blkif *blkif);
 static void xen_vbd_free(struct xen_vbd *vbd);

@@ -661,7 +661,7 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
 * ready, connect.
 */
 static void backend_changed(struct xenbus_watch *watch,
-			    const char **vec, unsigned int len)
+			    const char *path, const char *token)
 {
 	int err;
 	unsigned major;

--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -67,6 +67,7 @@ module_param(rx_drain_timeout_msecs, uint, 0444);
 unsigned int rx_stall_timeout_msecs = 60000;
 module_param(rx_stall_timeout_msecs, uint, 0444);

+#define MAX_QUEUES_DEFAULT 8
 unsigned int xenvif_max_queues;
 module_param_named(max_queues, xenvif_max_queues, uint, 0644);
 MODULE_PARM_DESC(max_queues,
@@ -1622,11 +1623,12 @@ static int __init netback_init(void)
 	if (!xen_domain())
 		return -ENODEV;

-	/* Allow as many queues as there are CPUs if user has not
+	/* Allow as many queues as there are CPUs but max. 8 if user has not
 	 * specified a value.
 	 */
 	if (xenvif_max_queues == 0)
-		xenvif_max_queues = num_online_cpus();
+		xenvif_max_queues = min_t(unsigned int, MAX_QUEUES_DEFAULT,
+					  num_online_cpus());

 	if (fatal_skb_slots < XEN_NETBK_LEGACY_SLOTS_MAX) {
 		pr_info("fatal_skb_slots too small (%d), bump it to XEN_NETBK_LEGACY_SLOTS_MAX (%d)\n",

--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -734,7 +734,7 @@ static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
 }

 static void xen_net_rate_changed(struct xenbus_watch *watch,
-				const char **vec, unsigned int len)
+				 const char *path, const char *token)
 {
 	struct xenvif *vif = container_of(watch, struct xenvif, credit_watch);
 	struct xenbus_device *dev = xenvif_to_xenbus_device(vif);
@@ -791,7 +791,7 @@ static void xen_unregister_credit_watch(struct xenvif *vif)
 }

 static void xen_mcast_ctrl_changed(struct xenbus_watch *watch,
-				   const char **vec, unsigned int len)
+				   const char *path, const char *token)
 {
 	struct xenvif *vif = container_of(watch, struct xenvif,
 					  mcast_ctrl_watch);
@@ -866,8 +866,8 @@ static void unregister_hotplug_status_watch(struct backend_info *be)
 }

 static void hotplug_status_changed(struct xenbus_watch *watch,
-				   const char **vec,
-				   unsigned int vec_size)
+				   const char *path,
+				   const char *token)
 {
 	struct backend_info *be = container_of(watch,
 					       struct backend_info,

--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -57,6 +57,7 @@
 #include <xen/interface/grant_table.h>

 /* Module parameters */
+#define MAX_QUEUES_DEFAULT 8
 static unsigned int xennet_max_queues;
 module_param_named(max_queues, xennet_max_queues, uint, 0644);
 MODULE_PARM_DESC(max_queues,
@@ -2166,11 +2167,12 @@ static int __init netif_init(void)

 	pr_info("Initialising Xen virtual ethernet driver\n");

-	/* Allow as many queues as there are CPUs if user has not
+	/* Allow as many queues as there are CPUs inut max. 8 if user has not
 	 * specified a value.
 	 */
 	if (xennet_max_queues == 0)
-		xennet_max_queues = num_online_cpus();
+		xennet_max_queues = min_t(unsigned int, MAX_QUEUES_DEFAULT,
+					  num_online_cpus());

 	return xenbus_register_frontend(&netfront_driver);
 }

--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -68,13 +68,12 @@ static void vcpu_hotplug(unsigned int cpu)
 }

 static void handle_vcpu_hotplug_event(struct xenbus_watch *watch,
-					const char **vec, unsigned int len)
+				      const char *path, const char *token)
 {
 	unsigned int cpu;
 	char *cpustr;
-	const char *node = vec[XS_WATCH_PATH];

-	cpustr = strstr(node, "cpu/");
+	cpustr = strstr(path, "cpu/");
 	if (cpustr != NULL) {
 		sscanf(cpustr, "cpu/%u", &cpu);
 		vcpu_hotplug(cpu);
@@ -107,7 +106,7 @@ static int __init setup_vcpu_hotplug_event(void)
 		.notifier_call = setup_cpu_watcher };

 #ifdef CONFIG_X86
-	if (!xen_pv_domain())
+	if (!xen_pv_domain() && !xen_pvh_domain())
 #else
 	if (!xen_domain())
 #endif

--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1704,7 +1704,6 @@ void __init xen_init_IRQ(void)
 		pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
 		eoi_gmfn.gmfn = virt_to_gfn(pirq_eoi_map);
 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn);
-		/* TODO: No PVH support for PIRQ EOI */
 		if (rc != 0) {
 			free_page((unsigned long) pirq_eoi_map);
 			pirq_eoi_map = NULL;

--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -1146,13 +1146,13 @@ EXPORT_SYMBOL_GPL(gnttab_init);

 static int __gnttab_init(void)
 {
+	if (!xen_domain())
+		return -ENODEV;
+
 	/* Delay grant-table initialization in the PV on HVM case */
-	if (xen_hvm_domain())
+	if (xen_hvm_domain() && !xen_pvh_domain())
 		return 0;

-	if (!xen_pv_domain())
-		return -ENODEV;
-
 	return gnttab_init();
 }
 /* Starts after core_initcall so that xen_pvh_gnttab_setup can be called

--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -218,7 +218,7 @@ static struct shutdown_handler shutdown_handlers[] = {
 };

 static void shutdown_handler(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len)
+			     const char *path, const char *token)
 {
 	char *str;
 	struct xenbus_transaction xbt;
@@ -266,8 +266,8 @@ static void shutdown_handler(struct xenbus_watch *watch,
 }

 #ifdef CONFIG_MAGIC_SYSRQ
-static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
-			  unsigned int len)
+static void sysrq_handler(struct xenbus_watch *watch, const char *path,
+			  const char *token)
 {
 	char sysrq_key = '\0';
 	struct xenbus_transaction xbt;
@@ -277,7 +277,7 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
 	err = xenbus_transaction_start(&xbt);
 	if (err)
 		return;
-	if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
+	if (xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key) < 0) {
 		pr_err("Unable to read sysrq code in control/sysrq\n");
 		xenbus_transaction_end(xbt, 1);
 		return;

--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -22,6 +22,7 @@
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/miscdevice.h>
+#include <linux/moduleparam.h>

 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
@@ -32,6 +33,7 @@
 #include <xen/xen.h>
 #include <xen/privcmd.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/hvm/dm_op.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/xen-ops.h>
@@ -43,16 +45,36 @@ MODULE_LICENSE("GPL");

 #define PRIV_VMA_LOCKED ((void *)1)

+static unsigned int privcmd_dm_op_max_num = 16;
+module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644);
+MODULE_PARM_DESC(dm_op_max_nr_bufs,
+		 "Maximum number of buffers per dm_op hypercall");
+
+static unsigned int privcmd_dm_op_buf_max_size = 4096;
+module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint,
+		   0644);
+MODULE_PARM_DESC(dm_op_buf_max_size,
+		 "Maximum size of a dm_op hypercall buffer");
+
+struct privcmd_data {
+	domid_t domid;
+};
+
 static int privcmd_vma_range_is_mapped(
               struct vm_area_struct *vma,
               unsigned long addr,
               unsigned long nr_pages);

-static long privcmd_ioctl_hypercall(void __user *udata)
+static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
 {
+	struct privcmd_data *data = file->private_data;
 	struct privcmd_hypercall hypercall;
 	long ret;

+	/* Disallow arbitrary hypercalls if restricted */
+	if (data->domid != DOMID_INVALID)
+		return -EPERM;
+
 	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
 		return -EFAULT;

@@ -229,8 +251,9 @@ static int mmap_gfn_range(void *data, void *state)
 	return 0;
 }

-static long privcmd_ioctl_mmap(void __user *udata)
+static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
 {
+	struct privcmd_data *data = file->private_data;
 	struct privcmd_mmap mmapcmd;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
@@ -245,6 +268,10 @@ static long privcmd_ioctl_mmap(void __user *udata)
 	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
 		return -EFAULT;

+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom)
+		return -EPERM;
+
 	rc = gather_array(&pagelist,
 			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
 			  mmapcmd.entry);
@@ -416,8 +443,10 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)

 static const struct vm_operations_struct privcmd_vm_ops;

-static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
+static long privcmd_ioctl_mmap_batch(
+	struct file *file, void __user *udata, int version)
 {
+	struct privcmd_data *data = file->private_data;
 	int ret;
 	struct privcmd_mmapbatch_v2 m;
 	struct mm_struct *mm = current->mm;
@@ -446,6 +475,10 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 		return -EINVAL;
 	}

+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != m.dom)
+		return -EPERM;
+
 	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
 	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
 		return -EINVAL;
@@ -548,37 +581,210 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 	goto out;
 }

+static int lock_pages(
+	struct privcmd_dm_op_buf kbufs[], unsigned int num,
+	struct page *pages[], unsigned int nr_pages)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		unsigned int requested;
+		int pinned;
+
+		requested = DIV_ROUND_UP(
+			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
+			PAGE_SIZE);
+		if (requested > nr_pages)
+			return -ENOSPC;
+
+		pinned = get_user_pages_fast(
+			(unsigned long) kbufs[i].uptr,
+			requested, FOLL_WRITE, pages);
+		if (pinned < 0)
+			return pinned;
+
+		nr_pages -= pinned;
+		pages += pinned;
+	}
+
+	return 0;
+}
+
+static void unlock_pages(struct page *pages[], unsigned int nr_pages)
+{
+	unsigned int i;
+
+	if (!pages)
+		return;
+
+	for (i = 0; i < nr_pages; i++) {
+		if (pages[i])
+			put_page(pages[i]);
+	}
+}
+
+static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
+{
+	struct privcmd_data *data = file->private_data;
+	struct privcmd_dm_op kdata;
+	struct privcmd_dm_op_buf *kbufs;
+	unsigned int nr_pages = 0;
+	struct page **pages = NULL;
+	struct xen_dm_op_buf *xbufs = NULL;
+	unsigned int i;
+	long rc;
+
+	if (copy_from_user(&kdata, udata, sizeof(kdata)))
+		return -EFAULT;
+
+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
+		return -EPERM;
+
+	if (kdata.num == 0)
+		return 0;
+
+	if (kdata.num > privcmd_dm_op_max_num)
+		return -E2BIG;
+
+	kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL);
+	if (!kbufs)
+		return -ENOMEM;
+
+	if (copy_from_user(kbufs, kdata.ubufs,
+			   sizeof(*kbufs) * kdata.num)) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	for (i = 0; i < kdata.num; i++) {
+		if (kbufs[i].size > privcmd_dm_op_buf_max_size) {
+			rc = -E2BIG;
+			goto out;
+		}
+
+		if (!access_ok(VERIFY_WRITE, kbufs[i].uptr,
+			       kbufs[i].size)) {
+			rc = -EFAULT;
+			goto out;
+		}
+
+		nr_pages += DIV_ROUND_UP(
+			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
+			PAGE_SIZE);
+	}
+
+	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+	if (!pages) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL);
+	if (!xbufs) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	rc = lock_pages(kbufs, kdata.num, pages, nr_pages);
+	if (rc)
+		goto out;
+
+	for (i = 0; i < kdata.num; i++) {
+		set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
+		xbufs[i].size = kbufs[i].size;
+	}
+
+	xen_preemptible_hcall_begin();
+	rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
+	xen_preemptible_hcall_end();
+
+out:
+	unlock_pages(pages, nr_pages);
+	kfree(xbufs);
+	kfree(pages);
+	kfree(kbufs);
+
+	return rc;
+}
+
+static long privcmd_ioctl_restrict(struct file *file, void __user *udata)
+{
+	struct privcmd_data *data = file->private_data;
+	domid_t dom;
+
+	if (copy_from_user(&dom, udata, sizeof(dom)))
+		return -EFAULT;
+
+	/* Set restriction to the specified domain, or check it matches */
+	if (data->domid == DOMID_INVALID)
+		data->domid = dom;
+	else if (data->domid != dom)
+		return -EINVAL;
+
+	return 0;
+}
+
 static long privcmd_ioctl(struct file *file,
 			  unsigned int cmd, unsigned long data)
 {
-	int ret = -ENOSYS;
+	int ret = -ENOTTY;
 	void __user *udata = (void __user *) data;

 	switch (cmd) {
 	case IOCTL_PRIVCMD_HYPERCALL:
-		ret = privcmd_ioctl_hypercall(udata);
+		ret = privcmd_ioctl_hypercall(file, udata);
 		break;

 	case IOCTL_PRIVCMD_MMAP:
-		ret = privcmd_ioctl_mmap(udata);
+		ret = privcmd_ioctl_mmap(file, udata);
 		break;

 	case IOCTL_PRIVCMD_MMAPBATCH:
-		ret = privcmd_ioctl_mmap_batch(udata, 1);
+		ret = privcmd_ioctl_mmap_batch(file, udata, 1);
 		break;

 	case IOCTL_PRIVCMD_MMAPBATCH_V2:
-		ret = privcmd_ioctl_mmap_batch(udata, 2);
+		ret = privcmd_ioctl_mmap_batch(file, udata, 2);
+		break;
+
+	case IOCTL_PRIVCMD_DM_OP:
+		ret = privcmd_ioctl_dm_op(file, udata);
+		break;
+
+	case IOCTL_PRIVCMD_RESTRICT:
+		ret = privcmd_ioctl_restrict(file, udata);
 		break;

 	default:
-		ret = -EINVAL;
 		break;
 	}

 	return ret;
 }

+static int privcmd_open(struct inode *ino, struct file *file)
+{
+	struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL);
+
+	if (!data)
+		return -ENOMEM;
+
+	/* DOMID_INVALID implies no restriction */
+	data->domid = DOMID_INVALID;
+
+	file->private_data = data;
+	return 0;
+}
+
+static int privcmd_release(struct inode *ino, struct file *file)
+{
+	struct privcmd_data *data = file->private_data;
+
+	kfree(data);
+	return 0;
+}
+
 static void privcmd_close(struct vm_area_struct *vma)
 {
 	struct page **pages = vma->vm_private_data;
@@ -647,6 +853,8 @@ static int privcmd_vma_range_is_mapped(
 const struct file_operations xen_privcmd_fops = {
 	.owner = THIS_MODULE,
 	.unlocked_ioctl = privcmd_ioctl,
+	.open = privcmd_open,
+	.release = privcmd_release,
 	.mmap = privcmd_mmap,
 };
 EXPORT_SYMBOL_GPL(xen_privcmd_fops);

--- a/drivers/xen/xen-balloon.c
+++ b/drivers/xen/xen-balloon.c
@@ -55,7 +55,7 @@ static int register_balloon(struct device *dev);

 /* React to a change in the target key */
 static void watch_target(struct xenbus_watch *watch,
-			 const char **vec, unsigned int len)
+			 const char *path, const char *token)
 {
 	unsigned long long new_target;
 	int err;

--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -652,7 +652,7 @@ static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev)
 }

 static void xen_pcibk_be_watch(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len)
+			       const char *path, const char *token)
 {
 	struct xen_pcibk_device *pdev =
 	    container_of(watch, struct xen_pcibk_device, be_watch);

--- a/drivers/xen/xenbus/xenbus_probe.h
+++ b/drivers/xen/xenbus/xenbus_probe.h
-/******************************************************************************
- * xenbus_probe.h
- *
- * Talks to Xen Store to figure out what devices we have.
+/*
+ * Private include for xenbus communications.
 *
 * Copyright (C) 2005 Rusty Russell, IBM Corporation
 * Copyright (C) 2005 XenSource Ltd.
@@ -31,8 +29,12 @@
 * IN THE SOFTWARE.
 */

-#ifndef _XENBUS_PROBE_H
-#define _XENBUS_PROBE_H
+#ifndef _XENBUS_XENBUS_H
+#define _XENBUS_XENBUS_H
+
+#include <linux/mutex.h>
+#include <linux/uio.h>
+#include <xen/xenbus.h>

 #define XEN_BUS_ID_SIZE			20

@@ -42,8 +44,8 @@ struct xen_bus_type {
 	int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
 	int (*probe)(struct xen_bus_type *bus, const char *type,
 		     const char *dir);
-	void (*otherend_changed)(struct xenbus_watch *watch, const char **vec,
-				 unsigned int len);
+	void (*otherend_changed)(struct xenbus_watch *watch, const char *path,
+				 const char *token);
 	struct bus_type bus;
 };

@@ -54,35 +56,80 @@ enum xenstore_init {
 	XS_LOCAL,
 };

+struct xs_watch_event {
+	struct list_head list;
+	unsigned int len;
+	struct xenbus_watch *handle;
+	const char *path;
+	const char *token;
+	char body[];
+};
+
+enum xb_req_state {
+	xb_req_state_queued,
+	xb_req_state_wait_reply,
+	xb_req_state_got_reply,
+	xb_req_state_aborted
+};
+
+struct xb_req_data {
+	struct list_head list;
+	wait_queue_head_t wq;
+	struct xsd_sockmsg msg;
+	enum xsd_sockmsg_type type;
+	char *body;
+	const struct kvec *vec;
+	int num_vecs;
+	int err;
+	enum xb_req_state state;
+	void (*cb)(struct xb_req_data *);
+	void *par;
+};
+
+extern enum xenstore_init xen_store_domain_type;
 extern const struct attribute_group *xenbus_dev_groups[];
+extern struct mutex xs_response_mutex;
+extern struct list_head xs_reply_list;
+extern struct list_head xb_write_list;
+extern wait_queue_head_t xb_waitq;
+extern struct mutex xb_write_mutex;

-extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
-extern int xenbus_dev_probe(struct device *_dev);
-extern int xenbus_dev_remove(struct device *_dev);
-extern int xenbus_register_driver_common(struct xenbus_driver *drv,
-					 struct xen_bus_type *bus,
-					 struct module *owner,
-					 const char *mod_name);
-extern int xenbus_probe_node(struct xen_bus_type *bus,
-			     const char *type,
-			     const char *nodename);
-extern int xenbus_probe_devices(struct xen_bus_type *bus);
+int xs_init(void);
+int xb_init_comms(void);
+void xb_deinit_comms(void);
+int xs_watch_msg(struct xs_watch_event *event);
+void xs_request_exit(struct xb_req_data *req);

-extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
+int xenbus_match(struct device *_dev, struct device_driver *_drv);
+int xenbus_dev_probe(struct device *_dev);
+int xenbus_dev_remove(struct device *_dev);
+int xenbus_register_driver_common(struct xenbus_driver *drv,
+				  struct xen_bus_type *bus,
+				  struct module *owner,
+				  const char *mod_name);
+int xenbus_probe_node(struct xen_bus_type *bus,
+		      const char *type,
+		      const char *nodename);
+int xenbus_probe_devices(struct xen_bus_type *bus);

-extern void xenbus_dev_shutdown(struct device *_dev);
+void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);

-extern int xenbus_dev_suspend(struct device *dev);
-extern int xenbus_dev_resume(struct device *dev);
-extern int xenbus_dev_cancel(struct device *dev);
+void xenbus_dev_shutdown(struct device *_dev);

-extern void xenbus_otherend_changed(struct xenbus_watch *watch,
-				    const char **vec, unsigned int len,
-				    int ignore_on_shutdown);
+int xenbus_dev_suspend(struct device *dev);
+int xenbus_dev_resume(struct device *dev);
+int xenbus_dev_cancel(struct device *dev);

-extern int xenbus_read_otherend_details(struct xenbus_device *xendev,
-					char *id_node, char *path_node);
+void xenbus_otherend_changed(struct xenbus_watch *watch,
+			     const char *path, const char *token,
+			     int ignore_on_shutdown);
+
+int xenbus_read_otherend_details(struct xenbus_device *xendev,
+				 char *id_node, char *path_node);

 void xenbus_ring_ops_init(void);

+int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par);
+void xenbus_dev_queue_reply(struct xb_req_data *req);
+
 #endif
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -47,7 +47,7 @@
 #include <xen/xen.h>
 #include <xen/features.h>

-#include "xenbus_probe.h"
+#include "xenbus.h"

 #define XENBUS_PAGES(_grants)	(DIV_ROUND_UP(_grants, XEN_PFN_PER_PAGE))

@@ -115,7 +115,7 @@ EXPORT_SYMBOL_GPL(xenbus_strstate);
 int xenbus_watch_path(struct xenbus_device *dev, const char *path,
 		      struct xenbus_watch *watch,
 		      void (*callback)(struct xenbus_watch *,
-				       const char **, unsigned int))
+				       const char *, const char *))
 {
 	int err;

@@ -153,7 +153,7 @@ EXPORT_SYMBOL_GPL(xenbus_watch_path);
 int xenbus_watch_pathfmt(struct xenbus_device *dev,
 			 struct xenbus_watch *watch,
 			 void (*callback)(struct xenbus_watch *,
-					const char **, unsigned int),
+					  const char *, const char *),
 			 const char *pathfmt, ...)
 {
 	int err;
@@ -259,53 +259,34 @@ int xenbus_frontend_closed(struct xenbus_device *dev)
 }
 EXPORT_SYMBOL_GPL(xenbus_frontend_closed);

-/**
- * Return the path to the error node for the given device, or NULL on failure.
- * If the value returned is non-NULL, then it is the caller's to kfree.
- */
-static char *error_path(struct xenbus_device *dev)
-{
-	return kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
-}
-
-
 static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
 				const char *fmt, va_list ap)
 {
 	unsigned int len;
-	char *printf_buffer = NULL;
-	char *path_buffer = NULL;
+	char *printf_buffer;
+	char *path_buffer;

 #define PRINTF_BUFFER_SIZE 4096
+
 	printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
-	if (printf_buffer == NULL)
-		goto fail;
+	if (!printf_buffer)
+		return;

 	len = sprintf(printf_buffer, "%i ", -err);
-	vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
+	vsnprintf(printf_buffer + len, PRINTF_BUFFER_SIZE - len, fmt, ap);

 	dev_err(&dev->dev, "%s\n", printf_buffer);

-	path_buffer = error_path(dev);
-
-	if (path_buffer == NULL) {
+	path_buffer = kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
+	if (!path_buffer ||
+	    xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer))
 		dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
-		       dev->nodename, printf_buffer);
-		goto fail;
-	}
+			dev->nodename, printf_buffer);

-	if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
-		dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
-		       dev->nodename, printf_buffer);
-		goto fail;
-	}
-
-fail:
 	kfree(printf_buffer);
 	kfree(path_buffer);
 }

-
 /**
 * xenbus_dev_error
 * @dev: xenbus device

--- a/drivers/xen/xenbus/xenbus_comms.c
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@ -34,19 +34,31 @@

 #include <linux/wait.h>
 #include <linux/interrupt.h>
+#include <linux/kthread.h>
 #include <linux/sched.h>
 #include <linux/err.h>
 #include <xen/xenbus.h>
 #include <asm/xen/hypervisor.h>
 #include <xen/events.h>
 #include <xen/page.h>
-#include "xenbus_comms.h"
+#include "xenbus.h"
+
+/* A list of replies. Currently only one will ever be outstanding. */
+LIST_HEAD(xs_reply_list);
+
+/* A list of write requests. */
+LIST_HEAD(xb_write_list);
+DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
+DEFINE_MUTEX(xb_write_mutex);
+
+/* Protect xenbus reader thread against save/restore. */
+DEFINE_MUTEX(xs_response_mutex);

 static int xenbus_irq;
+static struct task_struct *xenbus_task;

 static DECLARE_WORK(probe_work, xenbus_probe);

-static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);

 static irqreturn_t wake_waiting(int irq, void *unused)
 {
@@ -84,30 +96,31 @@ static const void *get_input_chunk(XENSTORE_RING_IDX cons,
 	return buf + MASK_XENSTORE_IDX(cons);
 }

+static int xb_data_to_write(void)
+{
+	struct xenstore_domain_interface *intf = xen_store_interface;
+
+	return (intf->req_prod - intf->req_cons) != XENSTORE_RING_SIZE &&
+		!list_empty(&xb_write_list);
+}
+
 /**
 * xb_write - low level write
 * @data: buffer to send
 * @len: length of buffer
 *
- * Returns 0 on success, error otherwise.
+ * Returns number of bytes written or -err.
 */
-int xb_write(const void *data, unsigned len)
+static int xb_write(const void *data, unsigned int len)
 {
 	struct xenstore_domain_interface *intf = xen_store_interface;
 	XENSTORE_RING_IDX cons, prod;
-	int rc;
+	unsigned int bytes = 0;

 	while (len != 0) {
 		void *dst;
 		unsigned int avail;

-		rc = wait_event_interruptible(
-			xb_waitq,
-			(intf->req_prod - intf->req_cons) !=
-			XENSTORE_RING_SIZE);
-		if (rc < 0)
-			return rc;
-
 		/* Read indexes, then verify. */
 		cons = intf->req_cons;
 		prod = intf->req_prod;
@@ -115,6 +128,11 @@ int xb_write(const void *data, unsigned len)
 			intf->req_cons = intf->req_prod = 0;
 			return -EIO;
 		}
+		if (!xb_data_to_write())
+			return bytes;
+
+		/* Must write data /after/ reading the consumer index. */
+		virt_mb();

 		dst = get_output_chunk(cons, prod, intf->req, &avail);
 		if (avail == 0)
@@ -122,52 +140,45 @@ int xb_write(const void *data, unsigned len)
 		if (avail > len)
 			avail = len;

-		/* Must write data /after/ reading the consumer index. */
-		virt_mb();
-
 		memcpy(dst, data, avail);
 		data += avail;
 		len -= avail;
+		bytes += avail;

 		/* Other side must not see new producer until data is there. */
 		virt_wmb();
 		intf->req_prod += avail;

 		/* Implies mb(): other side will see the updated producer. */
-		notify_remote_via_evtchn(xen_store_evtchn);
+		if (prod <= intf->req_cons)
+			notify_remote_via_evtchn(xen_store_evtchn);
 	}

-	return 0;
+	return bytes;
 }

-int xb_data_to_read(void)
+static int xb_data_to_read(void)
 {
 	struct xenstore_domain_interface *intf = xen_store_interface;
 	return (intf->rsp_cons != intf->rsp_prod);
 }

-int xb_wait_for_data_to_read(void)
-{
-	return wait_event_interruptible(xb_waitq, xb_data_to_read());
-}
-
-int xb_read(void *data, unsigned len)
+static int xb_read(void *data, unsigned int len)
 {
 	struct xenstore_domain_interface *intf = xen_store_interface;
 	XENSTORE_RING_IDX cons, prod;
-	int rc;
+	unsigned int bytes = 0;

 	while (len != 0) {
 		unsigned int avail;
 		const char *src;

-		rc = xb_wait_for_data_to_read();
-		if (rc < 0)
-			return rc;
-
 		/* Read indexes, then verify. */
 		cons = intf->rsp_cons;
 		prod = intf->rsp_prod;
+		if (cons == prod)
+			return bytes;
+
 		if (!check_indexes(cons, prod)) {
 			intf->rsp_cons = intf->rsp_prod = 0;
 			return -EIO;
@@ -185,17 +196,243 @@ int xb_read(void *data, unsigned len)
 		memcpy(data, src, avail);
 		data += avail;
 		len -= avail;
+		bytes += avail;

 		/* Other side must not see free space until we've copied out */
 		virt_mb();
 		intf->rsp_cons += avail;

-		pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
-
 		/* Implies mb(): other side will see the updated consumer. */
-		notify_remote_via_evtchn(xen_store_evtchn);
+		if (intf->rsp_prod - cons >= XENSTORE_RING_SIZE)
+			notify_remote_via_evtchn(xen_store_evtchn);
+	}
+
+	return bytes;
+}
+
+static int process_msg(void)
+{
+	static struct {
+		struct xsd_sockmsg msg;
+		char *body;
+		union {
+			void *alloc;
+			struct xs_watch_event *watch;
+		};
+		bool in_msg;
+		bool in_hdr;
+		unsigned int read;
+	} state;
+	struct xb_req_data *req;
+	int err;
+	unsigned int len;
+
+	if (!state.in_msg) {
+		state.in_msg = true;
+		state.in_hdr = true;
+		state.read = 0;
+
+		/*
+		 * We must disallow save/restore while reading a message.
+		 * A partial read across s/r leaves us out of sync with
+		 * xenstored.
+		 * xs_response_mutex is locked as long as we are processing one
+		 * message. state.in_msg will be true as long as we are holding
+		 * the lock here.
+		 */
+		mutex_lock(&xs_response_mutex);
+
+		if (!xb_data_to_read()) {
+			/* We raced with save/restore: pending data 'gone'. */
+			mutex_unlock(&xs_response_mutex);
+			state.in_msg = false;
+			return 0;
+		}
+	}
+
+	if (state.in_hdr) {
+		if (state.read != sizeof(state.msg)) {
+			err = xb_read((void *)&state.msg + state.read,
+				      sizeof(state.msg) - state.read);
+			if (err < 0)
+				goto out;
+			state.read += err;
+			if (state.read != sizeof(state.msg))
+				return 0;
+			if (state.msg.len > XENSTORE_PAYLOAD_MAX) {
+				err = -EINVAL;
+				goto out;
+			}
+		}
+
+		len = state.msg.len + 1;
+		if (state.msg.type == XS_WATCH_EVENT)
+			len += sizeof(*state.watch);
+
+		state.alloc = kmalloc(len, GFP_NOIO | __GFP_HIGH);
+		if (!state.alloc)
+			return -ENOMEM;
+
+		if (state.msg.type == XS_WATCH_EVENT)
+			state.body = state.watch->body;
+		else
+			state.body = state.alloc;
+		state.in_hdr = false;
+		state.read = 0;
+	}
+
+	err = xb_read(state.body + state.read, state.msg.len - state.read);
+	if (err < 0)
+		goto out;
+
+	state.read += err;
+	if (state.read != state.msg.len)
+		return 0;
+
+	state.body[state.msg.len] = '\0';
+
+	if (state.msg.type == XS_WATCH_EVENT) {
+		state.watch->len = state.msg.len;
+		err = xs_watch_msg(state.watch);
+	} else {
+		err = -ENOENT;
+		mutex_lock(&xb_write_mutex);
+		list_for_each_entry(req, &xs_reply_list, list) {
+			if (req->msg.req_id == state.msg.req_id) {
+				if (req->state == xb_req_state_wait_reply) {
+					req->msg.type = state.msg.type;
+					req->msg.len = state.msg.len;
+					req->body = state.body;
+					req->state = xb_req_state_got_reply;
+					list_del(&req->list);
+					req->cb(req);
+				} else {
+					list_del(&req->list);
+					kfree(req);
+				}
+				err = 0;
+				break;
+			}
+		}
+		mutex_unlock(&xb_write_mutex);
+		if (err)
+			goto out;
 	}

+	mutex_unlock(&xs_response_mutex);
+
+	state.in_msg = false;
+	state.alloc = NULL;
+	return err;
+
+ out:
+	mutex_unlock(&xs_response_mutex);
+	state.in_msg = false;
+	kfree(state.alloc);
+	state.alloc = NULL;
+	return err;
+}
+
+static int process_writes(void)
+{
+	static struct {
+		struct xb_req_data *req;
+		int idx;
+		unsigned int written;
+	} state;
+	void *base;
+	unsigned int len;
+	int err = 0;
+
+	if (!xb_data_to_write())
+		return 0;
+
+	mutex_lock(&xb_write_mutex);
+
+	if (!state.req) {
+		state.req = list_first_entry(&xb_write_list,
+					     struct xb_req_data, list);
+		state.idx = -1;
+		state.written = 0;
+	}
+
+	if (state.req->state == xb_req_state_aborted)
+		goto out_err;
+
+	while (state.idx < state.req->num_vecs) {
+		if (state.idx < 0) {
+			base = &state.req->msg;
+			len = sizeof(state.req->msg);
+		} else {
+			base = state.req->vec[state.idx].iov_base;
+			len = state.req->vec[state.idx].iov_len;
+		}
+		err = xb_write(base + state.written, len - state.written);
+		if (err < 0)
+			goto out_err;
+		state.written += err;
+		if (state.written != len)
+			goto out;
+
+		state.idx++;
+		state.written = 0;
+	}
+
+	list_del(&state.req->list);
+	state.req->state = xb_req_state_wait_reply;
+	list_add_tail(&state.req->list, &xs_reply_list);
+	state.req = NULL;
+
+ out:
+	mutex_unlock(&xb_write_mutex);
+
+	return 0;
+
+ out_err:
+	state.req->msg.type = XS_ERROR;
+	state.req->err = err;
+	list_del(&state.req->list);
+	if (state.req->state == xb_req_state_aborted)
+		kfree(state.req);
+	else {
+		state.req->state = xb_req_state_got_reply;
+		wake_up(&state.req->wq);
+	}
+
+	mutex_unlock(&xb_write_mutex);
+
+	state.req = NULL;
+
+	return err;
+}
+
+static int xb_thread_work(void)
+{
+	return xb_data_to_read() || xb_data_to_write();
+}
+
+static int xenbus_thread(void *unused)
+{
+	int err;
+
+	while (!kthread_should_stop()) {
+		if (wait_event_interruptible(xb_waitq, xb_thread_work()))
+			continue;
+
+		err = process_msg();
+		if (err == -ENOMEM)
+			schedule();
+		else if (err)
+			pr_warn_ratelimited("error %d while reading message\n",
+					    err);
+
+		err = process_writes();
+		if (err)
+			pr_warn_ratelimited("error %d while writing message\n",
+					    err);
+	}
+
+	xenbus_task = NULL;
 	return 0;
 }

@@ -223,6 +460,7 @@ int xb_init_comms(void)
 		rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
 	} else {
 		int err;
+
 		err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
 						0, "xenbus", &xb_waitq);
 		if (err < 0) {
@@ -231,6 +469,13 @@ int xb_init_comms(void)
 		}

 		xenbus_irq = err;
+
+		if (!xenbus_task) {
+			xenbus_task = kthread_run(xenbus_thread, NULL,
+						  "xenbus");
+			if (IS_ERR(xenbus_task))
+				return PTR_ERR(xenbus_task);
+		}
 	}

 	return 0;

--- a/drivers/xen/xenbus/xenbus_dev_backend.c
+++ b/drivers/xen/xenbus/xenbus_dev_backend.c
@@ -16,7 +16,7 @@
 #include <xen/events.h>
 #include <asm/xen/hypervisor.h>

-#include "xenbus_comms.h"
+#include "xenbus.h"

 static int xenbus_backend_open(struct inode *inode, struct file *filp)
 {

--- a/drivers/xen/xenbus/xenbus_dev_frontend.c
+++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
@@ -57,12 +57,12 @@
 #include <linux/miscdevice.h>
 #include <linux/init.h>

-#include "xenbus_comms.h"
-
 #include <xen/xenbus.h>
 #include <xen/xen.h>
 #include <asm/xen/hypervisor.h>

+#include "xenbus.h"
+
 /*
 * An element of a list of outstanding transactions, for which we're
 * still waiting a reply.
@@ -113,6 +113,7 @@ struct xenbus_file_priv {
 	struct list_head read_buffers;
 	wait_queue_head_t read_waitq;

+	struct kref kref;
 };

 /* Read out any raw xenbus messages queued up. */
@@ -258,26 +259,23 @@ static struct watch_adapter *alloc_watch_adapter(const char *path,
 }

 static void watch_fired(struct xenbus_watch *watch,
-			const char **vec,
-			unsigned int len)
+			const char *path,
+			const char *token)
 {
 	struct watch_adapter *adap;
 	struct xsd_sockmsg hdr;
-	const char *path, *token;
-	int path_len, tok_len, body_len, data_len = 0;
+	const char *token_caller;
+	int path_len, tok_len, body_len;
 	int ret;
 	LIST_HEAD(staging_q);

 	adap = container_of(watch, struct watch_adapter, watch);

-	path = vec[XS_WATCH_PATH];
-	token = adap->token;
+	token_caller = adap->token;

 	path_len = strlen(path) + 1;
-	tok_len = strlen(token) + 1;
-	if (len > 2)
-		data_len = vec[len] - vec[2] + 1;
-	body_len = path_len + tok_len + data_len;
+	tok_len = strlen(token_caller) + 1;
+	body_len = path_len + tok_len;

 	hdr.type = XS_WATCH_EVENT;
 	hdr.len = body_len;
@@ -288,9 +286,7 @@ static void watch_fired(struct xenbus_watch *watch,
 	if (!ret)
 		ret = queue_reply(&staging_q, path, path_len);
 	if (!ret)
-		ret = queue_reply(&staging_q, token, tok_len);
-	if (!ret && len > 2)
-		ret = queue_reply(&staging_q, vec[2], data_len);
+		ret = queue_reply(&staging_q, token_caller, tok_len);

 	if (!ret) {
 		/* success: pass reply list onto watcher */
@@ -302,6 +298,107 @@ static void watch_fired(struct xenbus_watch *watch,
 	mutex_unlock(&adap->dev_data->reply_mutex);
 }

+static void xenbus_file_free(struct kref *kref)
+{
+	struct xenbus_file_priv *u;
+	struct xenbus_transaction_holder *trans, *tmp;
+	struct watch_adapter *watch, *tmp_watch;
+	struct read_buffer *rb, *tmp_rb;
+
+	u = container_of(kref, struct xenbus_file_priv, kref);
+
+	/*
+	 * No need for locking here because there are no other users,
+	 * by definition.
+	 */
+
+	list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
+		xenbus_transaction_end(trans->handle, 1);
+		list_del(&trans->list);
+		kfree(trans);
+	}
+
+	list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
+		unregister_xenbus_watch(&watch->watch);
+		list_del(&watch->list);
+		free_watch_adapter(watch);
+	}
+
+	list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) {
+		list_del(&rb->list);
+		kfree(rb);
+	}
+	kfree(u);
+}
+
+static struct xenbus_transaction_holder *xenbus_get_transaction(
+	struct xenbus_file_priv *u, uint32_t tx_id)
+{
+	struct xenbus_transaction_holder *trans;
+
+	list_for_each_entry(trans, &u->transactions, list)
+		if (trans->handle.id == tx_id)
+			return trans;
+
+	return NULL;
+}
+
+void xenbus_dev_queue_reply(struct xb_req_data *req)
+{
+	struct xenbus_file_priv *u = req->par;
+	struct xenbus_transaction_holder *trans = NULL;
+	int rc;
+	LIST_HEAD(staging_q);
+
+	xs_request_exit(req);
+
+	mutex_lock(&u->msgbuffer_mutex);
+
+	if (req->type == XS_TRANSACTION_START) {
+		trans = xenbus_get_transaction(u, 0);
+		if (WARN_ON(!trans))
+			goto out;
+		if (req->msg.type == XS_ERROR) {
+			list_del(&trans->list);
+			kfree(trans);
+		} else {
+			rc = kstrtou32(req->body, 10, &trans->handle.id);
+			if (WARN_ON(rc))
+				goto out;
+		}
+	} else if (req->msg.type == XS_TRANSACTION_END) {
+		trans = xenbus_get_transaction(u, req->msg.tx_id);
+		if (WARN_ON(!trans))
+			goto out;
+		list_del(&trans->list);
+		kfree(trans);
+	}
+
+	mutex_unlock(&u->msgbuffer_mutex);
+
+	mutex_lock(&u->reply_mutex);
+	rc = queue_reply(&staging_q, &req->msg, sizeof(req->msg));
+	if (!rc)
+		rc = queue_reply(&staging_q, req->body, req->msg.len);
+	if (!rc) {
+		list_splice_tail(&staging_q, &u->read_buffers);
+		wake_up(&u->read_waitq);
+	} else {
+		queue_cleanup(&staging_q);
+	}
+	mutex_unlock(&u->reply_mutex);
+
+	kfree(req->body);
+	kfree(req);
+
+	kref_put(&u->kref, xenbus_file_free);
+
+	return;
+
+ out:
+	mutex_unlock(&u->msgbuffer_mutex);
+}
+
 static int xenbus_command_reply(struct xenbus_file_priv *u,
 				unsigned int msg_type, const char *reply)
 {
@@ -322,6 +419,9 @@ static int xenbus_command_reply(struct xenbus_file_priv *u,
 	wake_up(&u->read_waitq);
 	mutex_unlock(&u->reply_mutex);

+	if (!rc)
+		kref_put(&u->kref, xenbus_file_free);
+
 	return rc;
 }

@@ -329,57 +429,22 @@ static int xenbus_write_transaction(unsigned msg_type,
 				    struct xenbus_file_priv *u)
 {
 	int rc;
-	void *reply;
 	struct xenbus_transaction_holder *trans = NULL;
-	LIST_HEAD(staging_q);

 	if (msg_type == XS_TRANSACTION_START) {
-		trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+		trans = kzalloc(sizeof(*trans), GFP_KERNEL);
 		if (!trans) {
 			rc = -ENOMEM;
 			goto out;
 		}
-	} else if (u->u.msg.tx_id != 0) {
-		list_for_each_entry(trans, &u->transactions, list)
-			if (trans->handle.id == u->u.msg.tx_id)
-				break;
-		if (&trans->list == &u->transactions)
-			return xenbus_command_reply(u, XS_ERROR, "ENOENT");
-	}
-
-	reply = xenbus_dev_request_and_reply(&u->u.msg);
-	if (IS_ERR(reply)) {
-		if (msg_type == XS_TRANSACTION_START)
-			kfree(trans);
-		rc = PTR_ERR(reply);
-		goto out;
-	}
+		list_add(&trans->list, &u->transactions);
+	} else if (u->u.msg.tx_id != 0 &&
+		   !xenbus_get_transaction(u, u->u.msg.tx_id))
+		return xenbus_command_reply(u, XS_ERROR, "ENOENT");

-	if (msg_type == XS_TRANSACTION_START) {
-		if (u->u.msg.type == XS_ERROR)
-			kfree(trans);
-		else {
-			trans->handle.id = simple_strtoul(reply, NULL, 0);
-			list_add(&trans->list, &u->transactions);
-		}
-	} else if (u->u.msg.type == XS_TRANSACTION_END) {
-		list_del(&trans->list);
+	rc = xenbus_dev_request_and_reply(&u->u.msg, u);
+	if (rc)
 		kfree(trans);
-	}
-
-	mutex_lock(&u->reply_mutex);
-	rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg));
-	if (!rc)
-		rc = queue_reply(&staging_q, reply, u->u.msg.len);
-	if (!rc) {
-		list_splice_tail(&staging_q, &u->read_buffers);
-		wake_up(&u->read_waitq);
-	} else {
-		queue_cleanup(&staging_q);
-	}
-	mutex_unlock(&u->reply_mutex);
-
-	kfree(reply);

 out:
 	return rc;
@@ -511,6 +576,8 @@ static ssize_t xenbus_file_write(struct file *filp,
 	 * OK, now we have a complete message.  Do something with it.
 	 */

+	kref_get(&u->kref);
+
 	msg_type = u->u.msg.type;

 	switch (msg_type) {
@@ -525,8 +592,10 @@ static ssize_t xenbus_file_write(struct file *filp,
 		ret = xenbus_write_transaction(msg_type, u);
 		break;
 	}
-	if (ret != 0)
+	if (ret != 0) {
 		rc = ret;
+		kref_put(&u->kref, xenbus_file_free);
+	}

 	/* Buffered message consumed */
 	u->len = 0;
@@ -551,6 +620,8 @@ static int xenbus_file_open(struct inode *inode, struct file *filp)
 	if (u == NULL)
 		return -ENOMEM;

+	kref_init(&u->kref);
+
 	INIT_LIST_HEAD(&u->transactions);
 	INIT_LIST_HEAD(&u->watches);
 	INIT_LIST_HEAD(&u->read_buffers);
@@ -567,32 +638,8 @@ static int xenbus_file_open(struct inode *inode, struct file *filp)
 static int xenbus_file_release(struct inode *inode, struct file *filp)
 {
 	struct xenbus_file_priv *u = filp->private_data;
-	struct xenbus_transaction_holder *trans, *tmp;
-	struct watch_adapter *watch, *tmp_watch;
-	struct read_buffer *rb, *tmp_rb;
-
-	/*
-	 * No need for locking here because there are no other users,
-	 * by definition.
-	 */
-
-	list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
-		xenbus_transaction_end(trans->handle, 1);
-		list_del(&trans->list);
-		kfree(trans);
-	}
-
-	list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
-		unregister_xenbus_watch(&watch->watch);
-		list_del(&watch->list);
-		free_watch_adapter(watch);
-	}

-	list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) {
-		list_del(&rb->list);
-		kfree(rb);
-	}
-	kfree(u);
+	kref_put(&u->kref, xenbus_file_free);

 	return 0;
 }

--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -62,8 +62,7 @@

 #include <xen/hvm.h>

-#include "xenbus_comms.h"
-#include "xenbus_probe.h"
+#include "xenbus.h"


 int xen_store_evtchn;
@@ -170,7 +169,7 @@ int xenbus_read_otherend_details(struct xenbus_device *xendev,
 EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);

 void xenbus_otherend_changed(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len,
+			     const char *path, const char *token,
 			     int ignore_on_shutdown)
 {
 	struct xenbus_device *dev =
@@ -181,18 +180,15 @@ void xenbus_otherend_changed(struct xenbus_watch *watch,
 	/* Protect us against watches firing on old details when the otherend
 	   details change, say immediately after a resume. */
 	if (!dev->otherend ||
-	    strncmp(dev->otherend, vec[XS_WATCH_PATH],
-		    strlen(dev->otherend))) {
-		dev_dbg(&dev->dev, "Ignoring watch at %s\n",
-			vec[XS_WATCH_PATH]);
+	    strncmp(dev->otherend, path, strlen(dev->otherend))) {
+		dev_dbg(&dev->dev, "Ignoring watch at %s\n", path);
 		return;
 	}

 	state = xenbus_read_driver_state(dev->otherend);

 	dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n",
-		state, xenbus_strstate(state), dev->otherend_watch.node,
-		vec[XS_WATCH_PATH]);
+		state, xenbus_strstate(state), dev->otherend_watch.node, path);

 	/*
 	 * Ignore xenbus transitions during shutdown. This prevents us doing

--- a/drivers/xen/xenbus/xenbus_probe_backend.c
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@ -53,8 +53,7 @@
 #include <xen/xenbus.h>
 #include <xen/features.h>

-#include "xenbus_comms.h"
-#include "xenbus_probe.h"
+#include "xenbus.h"

 /* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
 static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
@@ -182,9 +181,9 @@ static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type,
 }

 static void frontend_changed(struct xenbus_watch *watch,
-			    const char **vec, unsigned int len)
+			     const char *path, const char *token)
 {
-	xenbus_otherend_changed(watch, vec, len, 0);
+	xenbus_otherend_changed(watch, path, token, 0);
 }

 static struct xen_bus_type xenbus_backend = {
@@ -205,11 +204,11 @@ static struct xen_bus_type xenbus_backend = {
 };

 static void backend_changed(struct xenbus_watch *watch,
-			    const char **vec, unsigned int len)
+			    const char *path, const char *token)
 {
 	DPRINTK("");

-	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
+	xenbus_dev_changed(path, &xenbus_backend);
 }

 static struct xenbus_watch be_watch = {

--- a/drivers/xen/xenbus/xenbus_probe_frontend.c
+++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
@@ -27,8 +27,7 @@

 #include <xen/platform_pci.h>

-#include "xenbus_comms.h"
-#include "xenbus_probe.h"
+#include "xenbus.h"



@@ -87,9 +86,9 @@ static int xenbus_uevent_frontend(struct device *_dev,


 static void backend_changed(struct xenbus_watch *watch,
-			    const char **vec, unsigned int len)
+			    const char *path, const char *token)
 {
-	xenbus_otherend_changed(watch, vec, len, 1);
+	xenbus_otherend_changed(watch, path, token, 1);
 }

 static void xenbus_frontend_delayed_resume(struct work_struct *w)
@@ -154,11 +153,11 @@ static struct xen_bus_type xenbus_frontend = {
 };

 static void frontend_changed(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len)
+			     const char *path, const char *token)
 {
 	DPRINTK("");

-	xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
+	xenbus_dev_changed(path, &xenbus_frontend);
 }


@@ -333,13 +332,13 @@ static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq);
 static int backend_state;

 static void xenbus_reset_backend_state_changed(struct xenbus_watch *w,
-					const char **v, unsigned int l)
+					const char *path, const char *token)
 {
-	if (xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i",
+	if (xenbus_scanf(XBT_NIL, path, "", "%i",
 			 &backend_state) != 1)
 		backend_state = XenbusStateUnknown;
 	printk(KERN_DEBUG "XENBUS: backend %s %s\n",
-			v[XS_WATCH_PATH], xenbus_strstate(backend_state));
+	       path, xenbus_strstate(backend_state));
 	wake_up(&backend_state_wq);
 }


--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -43,69 +43,36 @@
 #include <linux/slab.h>
 #include <linux/fcntl.h>
 #include <linux/kthread.h>
+#include <linux/reboot.h>
 #include <linux/rwsem.h>
 #include <linux/mutex.h>
 #include <asm/xen/hypervisor.h>
 #include <xen/xenbus.h>
 #include <xen/xen.h>
-#include "xenbus_comms.h"
-#include "xenbus_probe.h"
-
-struct xs_stored_msg {
-	struct list_head list;
-
-	struct xsd_sockmsg hdr;
-
-	union {
-		/* Queued replies. */
-		struct {
-			char *body;
-		} reply;
-
-		/* Queued watch events. */
-		struct {
-			struct xenbus_watch *handle;
-			char **vec;
-			unsigned int vec_size;
-		} watch;
-	} u;
-};
+#include "xenbus.h"

-struct xs_handle {
-	/* A list of replies. Currently only one will ever be outstanding. */
-	struct list_head reply_list;
-	spinlock_t reply_lock;
-	wait_queue_head_t reply_waitq;
-
-	/*
-	 * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex.
-	 * response_mutex is never taken simultaneously with the other three.
-	 *
-	 * transaction_mutex must be held before incrementing
-	 * transaction_count. The mutex is held when a suspend is in
-	 * progress to prevent new transactions starting.
-	 *
-	 * When decrementing transaction_count to zero the wait queue
-	 * should be woken up, the suspend code waits for count to
-	 * reach zero.
-	 */
-
-	/* One request at a time. */
-	struct mutex request_mutex;
-
-	/* Protect xenbus reader thread against save/restore. */
-	struct mutex response_mutex;
-
-	/* Protect transactions against save/restore. */
-	struct mutex transaction_mutex;
-	atomic_t transaction_count;
-	wait_queue_head_t transaction_wq;
-
-	/* Protect watch (de)register against save/restore. */
-	struct rw_semaphore watch_mutex;
-};
+/*
+ * Framework to protect suspend/resume handling against normal Xenstore
+ * message handling:
+ * During suspend/resume there must be no open transaction and no pending
+ * Xenstore request.
+ * New watch events happening in this time can be ignored by firing all watches
+ * after resume.
+ */
+
+/* Lock protecting enter/exit critical region. */
+static DEFINE_SPINLOCK(xs_state_lock);
+/* Number of users in critical region (protected by xs_state_lock). */
+static unsigned int xs_state_users;
+/* Suspend handler waiting or already active (protected by xs_state_lock)? */
+static int xs_suspend_active;
+/* Unique Xenstore request id (protected by xs_state_lock). */
+static uint32_t xs_request_id;

-static struct xs_handle xs_state;
+/* Wait queue for all callers waiting for critical region to become usable. */
+static DECLARE_WAIT_QUEUE_HEAD(xs_state_enter_wq);
+/* Wait queue for suspend handling waiting for critical region being empty. */
+static DECLARE_WAIT_QUEUE_HEAD(xs_state_exit_wq);

 /* List of registered watches, and a lock to protect it. */
 static LIST_HEAD(watches);
@@ -115,6 +82,9 @@ static DEFINE_SPINLOCK(watches_lock);
 static LIST_HEAD(watch_events);
 static DEFINE_SPINLOCK(watch_events_lock);

+/* Protect watch (de)register against save/restore. */
+static DECLARE_RWSEM(xs_watch_rwsem);
+
 /*
 * Details of the xenwatch callback kernel thread. The thread waits on the
 * watch_events_waitq for work to do (queued on watch_events list). When it
@@ -125,6 +95,59 @@ static pid_t xenwatch_pid;
 static DEFINE_MUTEX(xenwatch_mutex);
 static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);

+static void xs_suspend_enter(void)
+{
+	spin_lock(&xs_state_lock);
+	xs_suspend_active++;
+	spin_unlock(&xs_state_lock);
+	wait_event(xs_state_exit_wq, xs_state_users == 0);
+}
+
+static void xs_suspend_exit(void)
+{
+	spin_lock(&xs_state_lock);
+	xs_suspend_active--;
+	spin_unlock(&xs_state_lock);
+	wake_up_all(&xs_state_enter_wq);
+}
+
+static uint32_t xs_request_enter(struct xb_req_data *req)
+{
+	uint32_t rq_id;
+
+	req->type = req->msg.type;
+
+	spin_lock(&xs_state_lock);
+
+	while (!xs_state_users && xs_suspend_active) {
+		spin_unlock(&xs_state_lock);
+		wait_event(xs_state_enter_wq, xs_suspend_active == 0);
+		spin_lock(&xs_state_lock);
+	}
+
+	if (req->type == XS_TRANSACTION_START)
+		xs_state_users++;
+	xs_state_users++;
+	rq_id = xs_request_id++;
+
+	spin_unlock(&xs_state_lock);
+
+	return rq_id;
+}
+
+void xs_request_exit(struct xb_req_data *req)
+{
+	spin_lock(&xs_state_lock);
+	xs_state_users--;
+	if ((req->type == XS_TRANSACTION_START && req->msg.type == XS_ERROR) ||
+	    req->type == XS_TRANSACTION_END)
+		xs_state_users--;
+	spin_unlock(&xs_state_lock);
+
+	if (xs_suspend_active && !xs_state_users)
+		wake_up(&xs_state_exit_wq);
+}
+
 static int get_error(const char *errorstring)
 {
 	unsigned int i;
@@ -162,21 +185,24 @@ static bool xenbus_ok(void)
 	}
 	return false;
 }
-static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
+
+static bool test_reply(struct xb_req_data *req)
 {
-	struct xs_stored_msg *msg;
-	char *body;
+	if (req->state == xb_req_state_got_reply || !xenbus_ok())
+		return true;
+
+	/* Make sure to reread req->state each time. */
+	barrier();

-	spin_lock(&xs_state.reply_lock);
+	return false;
+}
+
+static void *read_reply(struct xb_req_data *req)
+{
+	while (req->state != xb_req_state_got_reply) {
+		wait_event(req->wq, test_reply(req));

-	while (list_empty(&xs_state.reply_list)) {
-		spin_unlock(&xs_state.reply_lock);
-		if (xenbus_ok())
-			/* XXX FIXME: Avoid synchronous wait for response here. */
-			wait_event_timeout(xs_state.reply_waitq,
-					   !list_empty(&xs_state.reply_list),
-					   msecs_to_jiffies(500));
-		else {
+		if (!xenbus_ok())
 			/*
 			 * If we are in the process of being shut-down there is
 			 * no point of trying to contact XenBus - it is either
@@ -184,76 +210,82 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
 			 * has been killed or is unreachable.
 			 */
 			return ERR_PTR(-EIO);
-		}
-		spin_lock(&xs_state.reply_lock);
+		if (req->err)
+			return ERR_PTR(req->err);
+
 	}

-	msg = list_entry(xs_state.reply_list.next,
-			 struct xs_stored_msg, list);
-	list_del(&msg->list);
+	return req->body;
+}

-	spin_unlock(&xs_state.reply_lock);
+static void xs_send(struct xb_req_data *req, struct xsd_sockmsg *msg)
+{
+	bool notify;

-	*type = msg->hdr.type;
-	if (len)
-		*len = msg->hdr.len;
-	body = msg->u.reply.body;
+	req->msg = *msg;
+	req->err = 0;
+	req->state = xb_req_state_queued;
+	init_waitqueue_head(&req->wq);

-	kfree(msg);
+	req->msg.req_id = xs_request_enter(req);

-	return body;
-}
+	mutex_lock(&xb_write_mutex);
+	list_add_tail(&req->list, &xb_write_list);
+	notify = list_is_singular(&xb_write_list);
+	mutex_unlock(&xb_write_mutex);

-static void transaction_start(void)
-{
-	mutex_lock(&xs_state.transaction_mutex);
-	atomic_inc(&xs_state.transaction_count);
-	mutex_unlock(&xs_state.transaction_mutex);
+	if (notify)
+		wake_up(&xb_waitq);
 }

-static void transaction_end(void)
+static void *xs_wait_for_reply(struct xb_req_data *req, struct xsd_sockmsg *msg)
 {
-	if (atomic_dec_and_test(&xs_state.transaction_count))
-		wake_up(&xs_state.transaction_wq);
-}
+	void *ret;

-static void transaction_suspend(void)
-{
-	mutex_lock(&xs_state.transaction_mutex);
-	wait_event(xs_state.transaction_wq,
-		   atomic_read(&xs_state.transaction_count) == 0);
+	ret = read_reply(req);
+
+	xs_request_exit(req);
+
+	msg->type = req->msg.type;
+	msg->len = req->msg.len;
+
+	mutex_lock(&xb_write_mutex);
+	if (req->state == xb_req_state_queued ||
+	    req->state == xb_req_state_wait_reply)
+		req->state = xb_req_state_aborted;
+	else
+		kfree(req);
+	mutex_unlock(&xb_write_mutex);
+
+	return ret;
 }

-static void transaction_resume(void)
+static void xs_wake_up(struct xb_req_data *req)
 {
-	mutex_unlock(&xs_state.transaction_mutex);
+	wake_up(&req->wq);
 }

-void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
+int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par)
 {
-	void *ret;
-	enum xsd_sockmsg_type type = msg->type;
-	int err;
-
-	if (type == XS_TRANSACTION_START)
-		transaction_start();
+	struct xb_req_data *req;
+	struct kvec *vec;

-	mutex_lock(&xs_state.request_mutex);
+	req = kmalloc(sizeof(*req) + sizeof(*vec), GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;

-	err = xb_write(msg, sizeof(*msg) + msg->len);
-	if (err) {
-		msg->type = XS_ERROR;
-		ret = ERR_PTR(err);
-	} else
-		ret = read_reply(&msg->type, &msg->len);
+	vec = (struct kvec *)(req + 1);
+	vec->iov_len = msg->len;
+	vec->iov_base = msg + 1;

-	mutex_unlock(&xs_state.request_mutex);
+	req->vec = vec;
+	req->num_vecs = 1;
+	req->cb = xenbus_dev_queue_reply;
+	req->par = par;

-	if ((msg->type == XS_TRANSACTION_END) ||
-	    ((type == XS_TRANSACTION_START) && (msg->type == XS_ERROR)))
-		transaction_end();
+	xs_send(req, msg);

-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(xenbus_dev_request_and_reply);

@@ -264,37 +296,31 @@ static void *xs_talkv(struct xenbus_transaction t,
 		      unsigned int num_vecs,
 		      unsigned int *len)
 {
+	struct xb_req_data *req;
 	struct xsd_sockmsg msg;
 	void *ret = NULL;
 	unsigned int i;
 	int err;

+	req = kmalloc(sizeof(*req), GFP_NOIO | __GFP_HIGH);
+	if (!req)
+		return ERR_PTR(-ENOMEM);
+
+	req->vec = iovec;
+	req->num_vecs = num_vecs;
+	req->cb = xs_wake_up;
+
 	msg.tx_id = t.id;
-	msg.req_id = 0;
 	msg.type = type;
 	msg.len = 0;
 	for (i = 0; i < num_vecs; i++)
 		msg.len += iovec[i].iov_len;

-	mutex_lock(&xs_state.request_mutex);
-
-	err = xb_write(&msg, sizeof(msg));
-	if (err) {
-		mutex_unlock(&xs_state.request_mutex);
-		return ERR_PTR(err);
-	}
-
-	for (i = 0; i < num_vecs; i++) {
-		err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
-		if (err) {
-			mutex_unlock(&xs_state.request_mutex);
-			return ERR_PTR(err);
-		}
-	}
-
-	ret = read_reply(&msg.type, len);
+	xs_send(req, &msg);

-	mutex_unlock(&xs_state.request_mutex);
+	ret = xs_wait_for_reply(req, &msg);
+	if (len)
+		*len = msg.len;

 	if (IS_ERR(ret))
 		return ret;
@@ -501,13 +527,9 @@ int xenbus_transaction_start(struct xenbus_transaction *t)
 {
 	char *id_str;

-	transaction_start();
-
 	id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
-	if (IS_ERR(id_str)) {
-		transaction_end();
+	if (IS_ERR(id_str))
 		return PTR_ERR(id_str);
-	}

 	t->id = simple_strtoul(id_str, NULL, 0);
 	kfree(id_str);
@@ -521,18 +543,13 @@ EXPORT_SYMBOL_GPL(xenbus_transaction_start);
 int xenbus_transaction_end(struct xenbus_transaction t, int abort)
 {
 	char abortstr[2];
-	int err;

 	if (abort)
 		strcpy(abortstr, "F");
 	else
 		strcpy(abortstr, "T");

-	err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
-
-	transaction_end();
-
-	return err;
+	return xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
 }
 EXPORT_SYMBOL_GPL(xenbus_transaction_end);

@@ -665,6 +682,30 @@ static struct xenbus_watch *find_watch(const char *token)

 	return NULL;
 }
+
+int xs_watch_msg(struct xs_watch_event *event)
+{
+	if (count_strings(event->body, event->len) != 2) {
+		kfree(event);
+		return -EINVAL;
+	}
+	event->path = (const char *)event->body;
+	event->token = (const char *)strchr(event->body, '\0') + 1;
+
+	spin_lock(&watches_lock);
+	event->handle = find_watch(event->token);
+	if (event->handle != NULL) {
+		spin_lock(&watch_events_lock);
+		list_add_tail(&event->list, &watch_events);
+		wake_up(&watch_events_waitq);
+		spin_unlock(&watch_events_lock);
+	} else
+		kfree(event);
+	spin_unlock(&watches_lock);
+
+	return 0;
+}
+
 /*
 * Certain older XenBus toolstack cannot handle reading values that are
 * not populated. Some Xen 3.4 installation are incapable of doing this
@@ -713,7 +754,7 @@ int register_xenbus_watch(struct xenbus_watch *watch)

 	sprintf(token, "%lX", (long)watch);

-	down_read(&xs_state.watch_mutex);
+	down_read(&xs_watch_rwsem);

 	spin_lock(&watches_lock);
 	BUG_ON(find_watch(token));
@@ -728,7 +769,7 @@ int register_xenbus_watch(struct xenbus_watch *watch)
 		spin_unlock(&watches_lock);
 	}

-	up_read(&xs_state.watch_mutex);
+	up_read(&xs_watch_rwsem);

 	return err;
 }
@@ -736,13 +777,13 @@ EXPORT_SYMBOL_GPL(register_xenbus_watch);

 void unregister_xenbus_watch(struct xenbus_watch *watch)
 {
-	struct xs_stored_msg *msg, *tmp;
+	struct xs_watch_event *event, *tmp;
 	char token[sizeof(watch) * 2 + 1];
 	int err;

 	sprintf(token, "%lX", (long)watch);

-	down_read(&xs_state.watch_mutex);
+	down_read(&xs_watch_rwsem);

 	spin_lock(&watches_lock);
 	BUG_ON(!find_watch(token));
@@ -753,7 +794,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch)
 	if (err)
 		pr_warn("Failed to release watch %s: %i\n", watch->node, err);

-	up_read(&xs_state.watch_mutex);
+	up_read(&xs_watch_rwsem);

 	/* Make sure there are no callbacks running currently (unless
 	   its us) */
@@ -762,12 +803,11 @@ void unregister_xenbus_watch(struct xenbus_watch *watch)

 	/* Cancel pending watch events. */
 	spin_lock(&watch_events_lock);
-	list_for_each_entry_safe(msg, tmp, &watch_events, list) {
-		if (msg->u.watch.handle != watch)
+	list_for_each_entry_safe(event, tmp, &watch_events, list) {
+		if (event->handle != watch)
 			continue;
-		list_del(&msg->list);
-		kfree(msg->u.watch.vec);
-		kfree(msg);
+		list_del(&event->list);
+		kfree(event);
 	}
 	spin_unlock(&watch_events_lock);

@@ -778,10 +818,10 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watch);

 void xs_suspend(void)
 {
-	transaction_suspend();
-	down_write(&xs_state.watch_mutex);
-	mutex_lock(&xs_state.request_mutex);
-	mutex_lock(&xs_state.response_mutex);
+	xs_suspend_enter();
+
+	down_write(&xs_watch_rwsem);
+	mutex_lock(&xs_response_mutex);
 }

 void xs_resume(void)
@@ -791,31 +831,31 @@ void xs_resume(void)

 	xb_init_comms();

-	mutex_unlock(&xs_state.response_mutex);
-	mutex_unlock(&xs_state.request_mutex);
-	transaction_resume();
+	mutex_unlock(&xs_response_mutex);
+
+	xs_suspend_exit();

-	/* No need for watches_lock: the watch_mutex is sufficient. */
+	/* No need for watches_lock: the xs_watch_rwsem is sufficient. */
 	list_for_each_entry(watch, &watches, list) {
 		sprintf(token, "%lX", (long)watch);
 		xs_watch(watch->node, token);
 	}

-	up_write(&xs_state.watch_mutex);
+	up_write(&xs_watch_rwsem);
 }

 void xs_suspend_cancel(void)
 {
-	mutex_unlock(&xs_state.response_mutex);
-	mutex_unlock(&xs_state.request_mutex);
-	up_write(&xs_state.watch_mutex);
-	mutex_unlock(&xs_state.transaction_mutex);
+	mutex_unlock(&xs_response_mutex);
+	up_write(&xs_watch_rwsem);
+
+	xs_suspend_exit();
 }

 static int xenwatch_thread(void *unused)
 {
 	struct list_head *ent;
-	struct xs_stored_msg *msg;
+	struct xs_watch_event *event;

 	for (;;) {
 		wait_event_interruptible(watch_events_waitq,
@@ -833,13 +873,10 @@ static int xenwatch_thread(void *unused)
 		spin_unlock(&watch_events_lock);

 		if (ent != &watch_events) {
-			msg = list_entry(ent, struct xs_stored_msg, list);
-			msg->u.watch.handle->callback(
-				msg->u.watch.handle,
-				(const char **)msg->u.watch.vec,
-				msg->u.watch.vec_size);
-			kfree(msg->u.watch.vec);
-			kfree(msg);
+			event = list_entry(ent, struct xs_watch_event, list);
+			event->handle->callback(event->handle, event->path,
+						event->token);
+			kfree(event);
 		}

 		mutex_unlock(&xenwatch_mutex);
@@ -848,126 +885,37 @@ static int xenwatch_thread(void *unused)
 	return 0;
 }

-static int process_msg(void)
+/*
+ * Wake up all threads waiting for a xenstore reply. In case of shutdown all
+ * pending replies will be marked as "aborted" in order to let the waiters
+ * return in spite of xenstore possibly no longer being able to reply. This
+ * will avoid blocking shutdown by a thread waiting for xenstore but being
+ * necessary for shutdown processing to proceed.
+ */
+static int xs_reboot_notify(struct notifier_block *nb,
+			    unsigned long code, void *unused)
 {
-	struct xs_stored_msg *msg;
-	char *body;
-	int err;
-
-	/*
-	 * We must disallow save/restore while reading a xenstore message.
-	 * A partial read across s/r leaves us out of sync with xenstored.
-	 */
-	for (;;) {
-		err = xb_wait_for_data_to_read();
-		if (err)
-			return err;
-		mutex_lock(&xs_state.response_mutex);
-		if (xb_data_to_read())
-			break;
-		/* We raced with save/restore: pending data 'disappeared'. */
-		mutex_unlock(&xs_state.response_mutex);
-	}
-
-
-	msg = kmalloc(sizeof(*msg), GFP_NOIO | __GFP_HIGH);
-	if (msg == NULL) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	err = xb_read(&msg->hdr, sizeof(msg->hdr));
-	if (err) {
-		kfree(msg);
-		goto out;
-	}
-
-	if (msg->hdr.len > XENSTORE_PAYLOAD_MAX) {
-		kfree(msg);
-		err = -EINVAL;
-		goto out;
-	}
-
-	body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH);
-	if (body == NULL) {
-		kfree(msg);
-		err = -ENOMEM;
-		goto out;
-	}
-
-	err = xb_read(body, msg->hdr.len);
-	if (err) {
-		kfree(body);
-		kfree(msg);
-		goto out;
-	}
-	body[msg->hdr.len] = '\0';
-
-	if (msg->hdr.type == XS_WATCH_EVENT) {
-		msg->u.watch.vec = split(body, msg->hdr.len,
-					 &msg->u.watch.vec_size);
-		if (IS_ERR(msg->u.watch.vec)) {
-			err = PTR_ERR(msg->u.watch.vec);
-			kfree(msg);
-			goto out;
-		}
-
-		spin_lock(&watches_lock);
-		msg->u.watch.handle = find_watch(
-			msg->u.watch.vec[XS_WATCH_TOKEN]);
-		if (msg->u.watch.handle != NULL) {
-			spin_lock(&watch_events_lock);
-			list_add_tail(&msg->list, &watch_events);
-			wake_up(&watch_events_waitq);
-			spin_unlock(&watch_events_lock);
-		} else {
-			kfree(msg->u.watch.vec);
-			kfree(msg);
-		}
-		spin_unlock(&watches_lock);
-	} else {
-		msg->u.reply.body = body;
-		spin_lock(&xs_state.reply_lock);
-		list_add_tail(&msg->list, &xs_state.reply_list);
-		spin_unlock(&xs_state.reply_lock);
-		wake_up(&xs_state.reply_waitq);
-	}
+	struct xb_req_data *req;

- out:
-	mutex_unlock(&xs_state.response_mutex);
-	return err;
+	mutex_lock(&xb_write_mutex);
+	list_for_each_entry(req, &xs_reply_list, list)
+		wake_up(&req->wq);
+	list_for_each_entry(req, &xb_write_list, list)
+		wake_up(&req->wq);
+	mutex_unlock(&xb_write_mutex);
+	return NOTIFY_DONE;
 }

-static int xenbus_thread(void *unused)
-{
-	int err;
-
-	for (;;) {
-		err = process_msg();
-		if (err)
-			pr_warn("error %d while reading message\n", err);
-		if (kthread_should_stop())
-			break;
-	}
-
-	return 0;
-}
+static struct notifier_block xs_reboot_nb = {
+	.notifier_call = xs_reboot_notify,
+};

 int xs_init(void)
 {
 	int err;
 	struct task_struct *task;

-	INIT_LIST_HEAD(&xs_state.reply_list);
-	spin_lock_init(&xs_state.reply_lock);
-	init_waitqueue_head(&xs_state.reply_waitq);
-
-	mutex_init(&xs_state.request_mutex);
-	mutex_init(&xs_state.response_mutex);
-	mutex_init(&xs_state.transaction_mutex);
-	init_rwsem(&xs_state.watch_mutex);
-	atomic_set(&xs_state.transaction_count, 0);
-	init_waitqueue_head(&xs_state.transaction_wq);
+	register_reboot_notifier(&xs_reboot_nb);

 	/* Initialize the shared memory rings to talk to xenstored */
 	err = xb_init_comms();
@@ -979,10 +927,6 @@ int xs_init(void)
 		return PTR_ERR(task);
 	xenwatch_pid = task->pid;

-	task = kthread_run(xenbus_thread, NULL, "xenbus");
-	if (IS_ERR(task))
-		return PTR_ERR(task);
-
 	/* shutdown watches for kexec boot */
 	xs_reset_watches();


--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -16,10 +16,10 @@
 #include <linux/magic.h>

 #include <xen/xen.h>
+#include <xen/xenbus.h>

 #include "xenfs.h"
 #include "../privcmd.h"
-#include "../xenbus/xenbus_comms.h"

 #include <asm/xen/hypervisor.h>


--- a/drivers/xen/xenfs/xenstored.c
+++ b/drivers/xen/xenfs/xenstored.c
@@ -4,9 +4,9 @@
 #include <linux/fs.h>

 #include <xen/page.h>
+#include <xen/xenbus.h>

 #include "xenfs.h"
-#include "../xenbus/xenbus_comms.h"

 static ssize_t xsd_read(struct file *file, char __user *buf,
 			    size_t size, loff_t *off)

--- a/include/uapi/xen/privcmd.h
+++ b/include/uapi/xen/privcmd.h
@@ -77,6 +77,17 @@ struct privcmd_mmapbatch_v2 {
 	int __user *err;  /* array of error codes */
 };

+struct privcmd_dm_op_buf {
+	void __user *uptr;
+	size_t size;
+};
+
+struct privcmd_dm_op {
+	domid_t dom;
+	__u16 num;
+	const struct privcmd_dm_op_buf __user *ubufs;
+};
+
 /*
 * @cmd: IOCTL_PRIVCMD_HYPERCALL
 * @arg: &privcmd_hypercall_t
@@ -98,5 +109,9 @@ struct privcmd_mmapbatch_v2 {
 	_IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch))
 #define IOCTL_PRIVCMD_MMAPBATCH_V2				\
 	_IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2))
+#define IOCTL_PRIVCMD_DM_OP					\
+	_IOC(_IOC_NONE, 'P', 5, sizeof(struct privcmd_dm_op))
+#define IOCTL_PRIVCMD_RESTRICT					\
+	_IOC(_IOC_NONE, 'P', 6, sizeof(domid_t))

 #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
--- a/include/xen/arm/hypercall.h
+++ b/include/xen/arm/hypercall.h
@@ -53,6 +53,7 @@ int HYPERVISOR_physdev_op(int cmd, void *arg);
 int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args);
 int HYPERVISOR_tmem_op(void *arg);
 int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type);
+int HYPERVISOR_dm_op(domid_t domid, unsigned int nr_bufs, void *bufs);
 int HYPERVISOR_platform_op_raw(void *arg);
 static inline int HYPERVISOR_platform_op(struct xen_platform_op *op)
 {

--- a/include/xen/interface/elfnote.h
+++ b/include/xen/interface/elfnote.h
@@ -192,10 +192,20 @@
 */
 #define XEN_ELFNOTE_SUPPORTED_FEATURES 17

+/*
+ * Physical entry point into the kernel.
+ *
+ * 32bit entry point into the kernel. When requested to launch the
+ * guest kernel in a HVM container, Xen will use this entry point to
+ * launch the guest in 32bit protected mode with paging disabled.
+ * Ignored otherwise.
+ */
+#define XEN_ELFNOTE_PHYS32_ENTRY 18
+
 /*
 * The number of the highest elfnote defined.
 */
-#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUPPORTED_FEATURES
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_PHYS32_ENTRY

 #endif /* __XEN_PUBLIC_ELFNOTE_H__ */


--- a/drivers/xen/xenbus/xenbus_comms.h
+++ b/drivers/xen/xenbus/xenbus_comms.h
 /*
- * Private include for xenbus communications.
- *
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
+ * Copyright (c) 2016, Citrix Systems Inc
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
@@ -24,28 +16,17 @@
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
 */

-#ifndef _XENBUS_COMMS_H
-#define _XENBUS_COMMS_H
-
-#include <linux/fs.h>
-
-int xs_init(void);
-int xb_init_comms(void);
-void xb_deinit_comms(void);
-
-/* Low level routines. */
-int xb_write(const void *data, unsigned len);
-int xb_read(void *data, unsigned len);
-int xb_data_to_read(void);
-int xb_wait_for_data_to_read(void);
-extern struct xenstore_domain_interface *xen_store_interface;
-extern int xen_store_evtchn;
-extern enum xenstore_init xen_store_domain_type;
+#ifndef __XEN_PUBLIC_HVM_DM_OP_H__
+#define __XEN_PUBLIC_HVM_DM_OP_H__

-extern const struct file_operations xen_xenbus_fops;
+struct xen_dm_op_buf {
+	GUEST_HANDLE(void) h;
+	xen_ulong_t size;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_dm_op_buf);

-#endif /* _XENBUS_COMMS_H */
+#endif /* __XEN_PUBLIC_HVM_DM_OP_H__ */
--- a/include/xen/interface/hvm/hvm_vcpu.h
+++ b/include/xen/interface/hvm/hvm_vcpu.h
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2015, Roger Pau Monne <roger.pau@citrix.com>
+ */
+
+#ifndef __XEN_PUBLIC_HVM_HVM_VCPU_H__
+#define __XEN_PUBLIC_HVM_HVM_VCPU_H__
+
+#include "../xen.h"
+
+struct vcpu_hvm_x86_32 {
+    uint32_t eax;
+    uint32_t ecx;
+    uint32_t edx;
+    uint32_t ebx;
+    uint32_t esp;
+    uint32_t ebp;
+    uint32_t esi;
+    uint32_t edi;
+    uint32_t eip;
+    uint32_t eflags;
+
+    uint32_t cr0;
+    uint32_t cr3;
+    uint32_t cr4;
+
+    uint32_t pad1;
+
+    /*
+     * EFER should only be used to set the NXE bit (if required)
+     * when starting a vCPU in 32bit mode with paging enabled or
+     * to set the LME/LMA bits in order to start the vCPU in
+     * compatibility mode.
+     */
+    uint64_t efer;
+
+    uint32_t cs_base;
+    uint32_t ds_base;
+    uint32_t ss_base;
+    uint32_t es_base;
+    uint32_t tr_base;
+    uint32_t cs_limit;
+    uint32_t ds_limit;
+    uint32_t ss_limit;
+    uint32_t es_limit;
+    uint32_t tr_limit;
+    uint16_t cs_ar;
+    uint16_t ds_ar;
+    uint16_t ss_ar;
+    uint16_t es_ar;
+    uint16_t tr_ar;
+
+    uint16_t pad2[3];
+};
+
+/*
+ * The layout of the _ar fields of the segment registers is the
+ * following:
+ *
+ * Bits   [0,3]: type (bits 40-43).
+ * Bit        4: s    (descriptor type, bit 44).
+ * Bit    [5,6]: dpl  (descriptor privilege level, bits 45-46).
+ * Bit        7: p    (segment-present, bit 47).
+ * Bit        8: avl  (available for system software, bit 52).
+ * Bit        9: l    (64-bit code segment, bit 53).
+ * Bit       10: db   (meaning depends on the segment, bit 54).
+ * Bit       11: g    (granularity, bit 55)
+ * Bits [12,15]: unused, must be blank.
+ *
+ * A more complete description of the meaning of this fields can be
+ * obtained from the Intel SDM, Volume 3, section 3.4.5.
+ */
+
+struct vcpu_hvm_x86_64 {
+    uint64_t rax;
+    uint64_t rcx;
+    uint64_t rdx;
+    uint64_t rbx;
+    uint64_t rsp;
+    uint64_t rbp;
+    uint64_t rsi;
+    uint64_t rdi;
+    uint64_t rip;
+    uint64_t rflags;
+
+    uint64_t cr0;
+    uint64_t cr3;
+    uint64_t cr4;
+    uint64_t efer;
+
+    /*
+     * Using VCPU_HVM_MODE_64B implies that the vCPU is launched
+     * directly in long mode, so the cached parts of the segment
+     * registers get set to match that environment.
+     *
+     * If the user wants to launch the vCPU in compatibility mode
+     * the 32-bit structure should be used instead.
+     */
+};
+
+struct vcpu_hvm_context {
+#define VCPU_HVM_MODE_32B 0  /* 32bit fields of the structure will be used. */
+#define VCPU_HVM_MODE_64B 1  /* 64bit fields of the structure will be used. */
+    uint32_t mode;
+
+    uint32_t pad;
+
+    /* CPU registers. */
+    union {
+        struct vcpu_hvm_x86_32 x86_32;
+        struct vcpu_hvm_x86_64 x86_64;
+    } cpu_regs;
+};
+typedef struct vcpu_hvm_context vcpu_hvm_context_t;
+
+#endif /* __XEN_PUBLIC_HVM_HVM_VCPU_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- a/include/xen/interface/hvm/start_info.h
+++ b/include/xen/interface/hvm/start_info.h
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2016, Citrix Systems, Inc.
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_HVM_START_INFO_H__
+#define __XEN_PUBLIC_ARCH_X86_HVM_START_INFO_H__
+
+/*
+ * Start of day structure passed to PVH guests and to HVM guests in %ebx.
+ *
+ * NOTE: nothing will be loaded at physical address 0, so a 0 value in any
+ * of the address fields should be treated as not present.
+ *
+ *  0 +----------------+
+ *    | magic          | Contains the magic value XEN_HVM_START_MAGIC_VALUE
+ *    |                | ("xEn3" with the 0x80 bit of the "E" set).
+ *  4 +----------------+
+ *    | version        | Version of this structure. Current version is 0. New
+ *    |                | versions are guaranteed to be backwards-compatible.
+ *  8 +----------------+
+ *    | flags          | SIF_xxx flags.
+ * 12 +----------------+
+ *    | nr_modules     | Number of modules passed to the kernel.
+ * 16 +----------------+
+ *    | modlist_paddr  | Physical address of an array of modules
+ *    |                | (layout of the structure below).
+ * 24 +----------------+
+ *    | cmdline_paddr  | Physical address of the command line,
+ *    |                | a zero-terminated ASCII string.
+ * 32 +----------------+
+ *    | rsdp_paddr     | Physical address of the RSDP ACPI data structure.
+ * 40 +----------------+
+ *
+ * The layout of each entry in the module structure is the following:
+ *
+ *  0 +----------------+
+ *    | paddr          | Physical address of the module.
+ *  8 +----------------+
+ *    | size           | Size of the module in bytes.
+ * 16 +----------------+
+ *    | cmdline_paddr  | Physical address of the command line,
+ *    |                | a zero-terminated ASCII string.
+ * 24 +----------------+
+ *    | reserved       |
+ * 32 +----------------+
+ *
+ * The address and sizes are always a 64bit little endian unsigned integer.
+ *
+ * NB: Xen on x86 will always try to place all the data below the 4GiB
+ * boundary.
+ */
+#define XEN_HVM_START_MAGIC_VALUE 0x336ec578
+
+/*
+ * C representation of the x86/HVM start info layout.
+ *
+ * The canonical definition of this layout is above, this is just a way to
+ * represent the layout described there using C types.
+ */
+struct hvm_start_info {
+    uint32_t magic;             /* Contains the magic value 0x336ec578       */
+                                /* ("xEn3" with the 0x80 bit of the "E" set).*/
+    uint32_t version;           /* Version of this structure.                */
+    uint32_t flags;             /* SIF_xxx flags.                            */
+    uint32_t nr_modules;        /* Number of modules passed to the kernel.   */
+    uint64_t modlist_paddr;     /* Physical address of an array of           */
+                                /* hvm_modlist_entry.                        */
+    uint64_t cmdline_paddr;     /* Physical address of the command line.     */
+    uint64_t rsdp_paddr;        /* Physical address of the RSDP ACPI data    */
+                                /* structure.                                */
+};
+
+struct hvm_modlist_entry {
+    uint64_t paddr;             /* Physical address of the module.           */
+    uint64_t size;              /* Size of the module in bytes.              */
+    uint64_t cmdline_paddr;     /* Physical address of the command line.     */
+    uint64_t reserved;
+};
+
+#endif /* __XEN_PUBLIC_ARCH_X86_HVM_START_INFO_H__ */
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -81,6 +81,7 @@
 #define __HYPERVISOR_tmem_op              38
 #define __HYPERVISOR_xc_reserved_op       39 /* reserved for XenClient */
 #define __HYPERVISOR_xenpmu_op            40
+#define __HYPERVISOR_dm_op                41

 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48

--- a/include/xen/xen.h
+++ b/include/xen/xen.h
@@ -30,16 +30,10 @@ extern enum xen_domain_type xen_domain_type;
 #endif	/* CONFIG_XEN_DOM0 */

 #ifdef CONFIG_XEN_PVH
-/* This functionality exists only for x86. The XEN_PVHVM support exists
- * only in x86 world - hence on ARM it will be always disabled.
- * N.B. ARM guests are neither PV nor HVM nor PVHVM.
- * It's a bit like PVH but is different also (it's further towards the H
- * end of the spectrum than even PVH).
- */
-#include <xen/features.h>
-#define xen_pvh_domain() (xen_pv_domain() && \
-			  xen_feature(XENFEAT_auto_translated_physmap))
+extern bool xen_pvh;
+#define xen_pvh_domain()	(xen_hvm_domain() && xen_pvh)
 #else
 #define xen_pvh_domain()	(0)
 #endif
+
 #endif	/* _XEN_XEN_H */
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -38,6 +38,7 @@
 #include <linux/notifier.h>
 #include <linux/mutex.h>
 #include <linux/export.h>
+#include <linux/fs.h>
 #include <linux/completion.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -60,7 +61,7 @@ struct xenbus_watch

 	/* Callback (executed in a process context with no locks held). */
 	void (*callback)(struct xenbus_watch *,
-			 const char **vec, unsigned int len);
+			 const char *path, const char *token);
 };


@@ -175,16 +176,9 @@ void xs_suspend(void);
 void xs_resume(void);
 void xs_suspend_cancel(void);

-/* Used by xenbus_dev to borrow kernel's store connection. */
-void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
-
 struct work_struct;

-/* Prepare for domain suspend: then resume or cancel the suspend. */
-void xenbus_suspend(void);
-void xenbus_resume(void);
 void xenbus_probe(struct work_struct *);
-void xenbus_suspend_cancel(void);

 #define XENBUS_IS_ERR_READ(str) ({			\
 	if (!IS_ERR(str) && strlen(str) == 0) {		\
@@ -199,11 +193,11 @@ void xenbus_suspend_cancel(void);
 int xenbus_watch_path(struct xenbus_device *dev, const char *path,
 		      struct xenbus_watch *watch,
 		      void (*callback)(struct xenbus_watch *,
-				       const char **, unsigned int));
+				       const char *, const char *));
 __printf(4, 5)
 int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
 			 void (*callback)(struct xenbus_watch *,
-					  const char **, unsigned int),
+					  const char *, const char *),
 			 const char *pathfmt, ...);

 int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
@@ -235,4 +229,8 @@ const char *xenbus_strstate(enum xenbus_state state);
 int xenbus_dev_is_online(struct xenbus_device *dev);
 int xenbus_frontend_closed(struct xenbus_device *dev);

+extern const struct file_operations xen_xenbus_fops;
+extern struct xenstore_domain_interface *xen_store_interface;
+extern int xen_store_evtchn;
+
 #endif /* _XEN_XENBUS_H */