Merge branch 'kvm-updates/2.6.39' of git://git.kernel.org/pub/scm/virt/kvm/kvm

* 'kvm-updates/2.6.39' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (55 commits)
  KVM: unbreak userspace that does not sets tss address
  KVM: MMU: cleanup pte write path
  KVM: MMU: introduce a common function to get no-dirty-logged slot
  KVM: fix rcu usage in init_rmode_* functions
  KVM: fix kvmclock regression due to missing clock update
  KVM: emulator: Fix permission checking in io permission bitmap
  KVM: emulator: Fix io permission checking for 64bit guest
  KVM: SVM: Load %gs earlier if CONFIG_X86_32_LAZY_GS=n
  KVM: x86: Remove useless regs_page pointer from kvm_lapic
  KVM: improve comment on rcu use in irqfd_deassign
  KVM: MMU: remove unused macros
  KVM: MMU: cleanup page alloc and free
  KVM: MMU: do not record gfn in kvm_mmu_pte_write
  KVM: MMU: move mmu pages calculated out of mmu lock
  KVM: MMU: set spte accessed bit properly
  KVM: MMU: fix kvm_mmu_slot_remove_write_access dropping intermediate W bits
  KVM: Start lock documentation
  KVM: better readability of efer_reserved_bits
  KVM: Clear async page fault hash after switching to real mode
  KVM: VMX: Initialize vm86 TSS only once.
  ...

Merge branch 'kvm-updates/2.6.39' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.39' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (55 commits)
  KVM: unbreak userspace that does not sets tss address
  KVM: MMU: cleanup pte write path
  KVM: MMU: introduce a common function to get no-dirty-logged slot
  KVM: fix rcu usage in init_rmode_* functions
  KVM: fix kvmclock regression due to missing clock update
  KVM: emulator: Fix permission checking in io permission bitmap
  KVM: emulator: Fix io permission checking for 64bit guest
  KVM: SVM: Load %gs earlier if CONFIG_X86_32_LAZY_GS=n
  KVM: x86: Remove useless regs_page pointer from kvm_lapic
  KVM: improve comment on rcu use in irqfd_deassign
  KVM: MMU: remove unused macros
  KVM: MMU: cleanup page alloc and free
  KVM: MMU: do not record gfn in kvm_mmu_pte_write
  KVM: MMU: move mmu pages calculated out of mmu lock
  KVM: MMU: set spte accessed bit properly
  KVM: MMU: fix kvm_mmu_slot_remove_write_access dropping intermediate W bits
  KVM: Start lock documentation
  KVM: better readability of efer_reserved_bits
  KVM: Clear async page fault hash after switching to real mode
  KVM: VMX: Initialize vm86 TSS only once.
  ...
ec0afc93 · Linus Torvalds · 804f1853 · 776e58ea · ec0afc93 · ec0afc93
Commit ec0afc93 authored Mar 17, 2011 by Linus Torvalds
33 changed files
--- a/Documentation/kvm/locking.txt
+++ b/Documentation/kvm/locking.txt
+KVM Lock Overview
+=================
+
+1. Acquisition Orders
+---------------------
+
+(to be written)
+
+2. Reference
+------------
+
+Name:		kvm_lock
+Type:		raw_spinlock
+Arch:		any
+Protects:	- vm_list
+		- hardware virtualization enable/disable
+Comment:	'raw' because hardware enabling/disabling must be atomic w.r.t.
+		migration.
+
+Name:		kvm_arch::tsc_write_lock
+Type:		raw_spinlock
+Arch:		x86
+Protects:	- kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset}
+		- tsc offset in vmcb
+Comment:	'raw' because updating the tsc offsets must not be preempted.
--- a/arch/alpha/include/asm/errno.h
+++ b/arch/alpha/include/asm/errno.h
@@ -122,4 +122,6 @@

 #define	ERFKILL		138	/* Operation not possible due to RF-kill */

+#define EHWPOISON	139	/* Memory page has hardware error */
+
 #endif
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -662,6 +662,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		goto vcpu_run_fail;

 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	vcpu->mode = IN_GUEST_MODE;
 	kvm_guest_enter();

 	/*
@@ -683,6 +684,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	 */
 	barrier();
 	kvm_guest_exit();
+	vcpu->mode = OUTSIDE_GUEST_MODE;
 	preempt_enable();

 	idx = srcu_read_lock(&vcpu->kvm->srcu);

--- a/arch/mips/include/asm/errno.h
+++ b/arch/mips/include/asm/errno.h
@@ -121,6 +121,8 @@

 #define	ERFKILL		167	/* Operation not possible due to RF-kill */

+#define EHWPOISON	168	/* Memory page has hardware error */
+
 #define EDQUOT		1133	/* Quota exceeded */

 #ifdef __KERNEL__

--- a/arch/parisc/include/asm/errno.h
+++ b/arch/parisc/include/asm/errno.h
@@ -122,4 +122,6 @@

 #define	ERFKILL		256	/* Operation not possible due to RF-kill */

+#define EHWPOISON	257	/* Memory page has hardware error */
+
 #endif
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1141,9 +1141,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->sprg1 = vcpu->arch.shared->sprg1;
 	regs->sprg2 = vcpu->arch.shared->sprg2;
 	regs->sprg3 = vcpu->arch.shared->sprg3;
-	regs->sprg5 = vcpu->arch.sprg4;
-	regs->sprg6 = vcpu->arch.sprg5;
-	regs->sprg7 = vcpu->arch.sprg6;
+	regs->sprg4 = vcpu->arch.sprg4;
+	regs->sprg5 = vcpu->arch.sprg5;
+	regs->sprg6 = vcpu->arch.sprg6;
+	regs->sprg7 = vcpu->arch.sprg7;

 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
@@ -1167,9 +1168,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	vcpu->arch.shared->sprg1 = regs->sprg1;
 	vcpu->arch.shared->sprg2 = regs->sprg2;
 	vcpu->arch.shared->sprg3 = regs->sprg3;
-	vcpu->arch.sprg5 = regs->sprg4;
-	vcpu->arch.sprg6 = regs->sprg5;
-	vcpu->arch.sprg7 = regs->sprg6;
+	vcpu->arch.sprg4 = regs->sprg4;
+	vcpu->arch.sprg5 = regs->sprg5;
+	vcpu->arch.sprg6 = regs->sprg6;
+	vcpu->arch.sprg7 = regs->sprg7;

 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);

--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -546,9 +546,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->sprg1 = vcpu->arch.shared->sprg1;
 	regs->sprg2 = vcpu->arch.shared->sprg2;
 	regs->sprg3 = vcpu->arch.shared->sprg3;
-	regs->sprg5 = vcpu->arch.sprg4;
-	regs->sprg6 = vcpu->arch.sprg5;
-	regs->sprg7 = vcpu->arch.sprg6;
+	regs->sprg4 = vcpu->arch.sprg4;
+	regs->sprg5 = vcpu->arch.sprg5;
+	regs->sprg6 = vcpu->arch.sprg6;
+	regs->sprg7 = vcpu->arch.sprg7;

 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
@@ -572,9 +573,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	vcpu->arch.shared->sprg1 = regs->sprg1;
 	vcpu->arch.shared->sprg2 = regs->sprg2;
 	vcpu->arch.shared->sprg3 = regs->sprg3;
-	vcpu->arch.sprg5 = regs->sprg4;
-	vcpu->arch.sprg6 = regs->sprg5;
-	vcpu->arch.sprg7 = regs->sprg6;
+	vcpu->arch.sprg4 = regs->sprg4;
+	vcpu->arch.sprg5 = regs->sprg5;
+	vcpu->arch.sprg6 = regs->sprg6;
+	vcpu->arch.sprg7 = regs->sprg7;

 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);

--- a/arch/sparc/include/asm/errno.h
+++ b/arch/sparc/include/asm/errno.h
@@ -112,4 +112,6 @@

 #define	ERFKILL		134	/* Operation not possible due to RF-kill */

+#define EHWPOISON	135	/* Memory page has hardware error */
+
 #endif
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -142,9 +142,9 @@ struct x86_emulate_ops {
 	int (*pio_out_emulated)(int size, unsigned short port, const void *val,
 				unsigned int count, struct kvm_vcpu *vcpu);

-	bool (*get_cached_descriptor)(struct desc_struct *desc,
+	bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3,
 				      int seg, struct kvm_vcpu *vcpu);
-	void (*set_cached_descriptor)(struct desc_struct *desc,
+	void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3,
 				      int seg, struct kvm_vcpu *vcpu);
 	u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
 	void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
@@ -239,6 +239,7 @@ struct x86_emulate_ctxt {
 	int interruptibility;

 	bool perm_ok; /* do not check permissions if true */
+	bool only_vendor_specific_insn;

 	bool have_exception;
 	struct x86_exception exception;

--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -85,7 +85,7 @@

 #define ASYNC_PF_PER_VCPU 64

-extern spinlock_t kvm_lock;
+extern raw_spinlock_t kvm_lock;
 extern struct list_head vm_list;

 struct kvm_vcpu;
@@ -255,6 +255,8 @@ struct kvm_mmu {
 	int (*sync_page)(struct kvm_vcpu *vcpu,
 			 struct kvm_mmu_page *sp);
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
+	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			u64 *spte, const void *pte, unsigned long mmu_seq);
 	hpa_t root_hpa;
 	int root_level;
 	int shadow_root_level;
@@ -335,12 +337,6 @@ struct kvm_vcpu_arch {
 	u64  *last_pte_updated;
 	gfn_t last_pte_gfn;

-	struct {
-		gfn_t gfn;	/* presumed gfn during guest pte update */
-		pfn_t pfn;	/* pfn corresponding to that gfn */
-		unsigned long mmu_seq;
-	} update_pte;
-
 	struct fpu guest_fpu;
 	u64 xcr0;

@@ -448,7 +444,7 @@ struct kvm_arch {

 	unsigned long irq_sources_bitmap;
 	s64 kvmclock_offset;
-	spinlock_t tsc_write_lock;
+	raw_spinlock_t tsc_write_lock;
 	u64 last_tsc_nsec;
 	u64 last_tsc_offset;
 	u64 last_tsc_write;

--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -43,6 +43,7 @@

 #define MSR_MTRRcap			0x000000fe
 #define MSR_IA32_BBL_CR_CTL		0x00000119
+#define MSR_IA32_BBL_CR_CTL3		0x0000011e

 #define MSR_IA32_SYSENTER_CS		0x00000174
 #define MSR_IA32_SYSENTER_ESP		0x00000175

--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -493,7 +493,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
 	native_smp_prepare_boot_cpu();
 }

-static void kvm_guest_cpu_online(void *dummy)
+static void __cpuinit kvm_guest_cpu_online(void *dummy)
 {
 	kvm_guest_cpu_init();
 }

--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -76,6 +76,7 @@
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 /* Misc flags */
+#define VendorSpecific (1<<22) /* Vendor specific instruction */
 #define NoAccess    (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
 #define Op3264      (1<<24) /* Operand is 64b in long mode, 32b otherwise */
 #define Undefined   (1<<25) /* No Such Instruction */
@@ -877,7 +878,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 	if (selector & 1 << 2) {
 		struct desc_struct desc;
 		memset (dt, 0, sizeof *dt);
-		if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
+		if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR,
+						ctxt->vcpu))
 			return;

 		dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
@@ -929,6 +931,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	return ret;
 }

+/* Does not support long mode */
 static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				   struct x86_emulate_ops *ops,
 				   u16 selector, int seg)
@@ -1040,7 +1043,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	}
 load:
 	ops->set_segment_selector(selector, seg, ctxt->vcpu);
-	ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
+	ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
 	return X86EMUL_CONTINUE;
 exception:
 	emulate_exception(ctxt, err_vec, err_code, true);
@@ -1560,7 +1563,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 			struct desc_struct *ss)
 {
 	memset(cs, 0, sizeof(struct desc_struct));
-	ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu);
 	memset(ss, 0, sizeof(struct desc_struct));

 	cs->l = 0;		/* will be adjusted later */
@@ -1607,9 +1610,9 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		cs.d = 0;
 		cs.l = 1;
 	}
-	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
 	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
-	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
 	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);

 	c->regs[VCPU_REGS_RCX] = c->eip;
@@ -1679,9 +1682,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		cs.l = 1;
 	}

-	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
 	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
-	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
 	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);

 	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
@@ -1736,9 +1739,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	cs_sel |= SELECTOR_RPL_MASK;
 	ss_sel |= SELECTOR_RPL_MASK;

-	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
 	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
-	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
 	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);

 	c->eip = c->regs[VCPU_REGS_RDX];
@@ -1764,24 +1767,28 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
 					    u16 port, u16 len)
 {
 	struct desc_struct tr_seg;
+	u32 base3;
 	int r;
-	u16 io_bitmap_ptr;
-	u8 perm, bit_idx = port & 0x7;
+	u16 io_bitmap_ptr, perm, bit_idx = port & 0x7;
 	unsigned mask = (1 << len) - 1;
+	unsigned long base;

-	ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
+	ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu);
 	if (!tr_seg.p)
 		return false;
 	if (desc_limit_scaled(&tr_seg) < 103)
 		return false;
-	r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
-			  ctxt->vcpu, NULL);
+	base = get_desc_base(&tr_seg);
+#ifdef CONFIG_X86_64
+	base |= ((u64)base3) << 32;
+#endif
+	r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL);
 	if (r != X86EMUL_CONTINUE)
 		return false;
 	if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
 		return false;
-	r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
-			  &perm, 1, ctxt->vcpu, NULL);
+	r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu,
+			  NULL);
 	if (r != X86EMUL_CONTINUE)
 		return false;
 	if ((perm >> bit_idx) & mask)
@@ -2126,7 +2133,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	}

 	ops->set_cr(0,  ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
-	ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
+	ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu);
 	ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);

 	if (has_error_code) {
@@ -2365,7 +2372,8 @@ static struct group_dual group7 = { {
 	D(SrcMem16 | ModRM | Mov | Priv),
 	D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
 }, {
-	D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
+	D(SrcNone | ModRM | Priv | VendorSpecific), N,
+	N, D(SrcNone | ModRM | Priv | VendorSpecific),
 	D(SrcNone | ModRM | DstMem | Mov), N,
 	D(SrcMem16 | ModRM | Mov | Priv), N,
 } };
@@ -2489,7 +2497,7 @@ static struct opcode opcode_table[256] = {
 static struct opcode twobyte_table[256] = {
 	/* 0x00 - 0x0F */
 	N, GD(0, &group7), N, N,
-	N, D(ImplicitOps), D(ImplicitOps | Priv), N,
+	N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N,
 	D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
 	N, D(ImplicitOps | ModRM), N, N,
 	/* 0x10 - 0x1F */
@@ -2502,7 +2510,8 @@ static struct opcode twobyte_table[256] = {
 	/* 0x30 - 0x3F */
 	D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
 	D(ImplicitOps | Priv), N,
-	D(ImplicitOps), D(ImplicitOps | Priv), N, N,
+	D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
+	N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x40 - 0x4F */
 	X16(D(DstReg | SrcMem | ModRM | Mov)),
@@ -2741,6 +2750,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 	if (c->d == 0 || (c->d & Undefined))
 		return -1;

+	if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
+		return -1;
+
 	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
 		c->op_bytes = 8;


--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -61,9 +61,6 @@ static void pic_unlock(struct kvm_pic *s)
 			}
 		}

-		if (!found)
-			found = s->kvm->bsp_vcpu;
-
 		if (!found)
 			return;

@@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s)
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
 	s->isr &= ~(1 << irq);
-	s->isr_ack |= (1 << irq);
 	if (s != &s->pics_state->pics[0])
 		irq += 8;
 	/*
@@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 	pic_lock(s->pics_state);
 }

-void kvm_pic_clear_isr_ack(struct kvm *kvm)
-{
-	struct kvm_pic *s = pic_irqchip(kvm);
-
-	pic_lock(s);
-	s->pics[0].isr_ack = 0xff;
-	s->pics[1].isr_ack = 0xff;
-	pic_unlock(s);
-}
-
 /*
 * set irq level. If an edge is detected, then the IRR is set to 1
 */
@@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 	s->irr = 0;
 	s->imr = 0;
 	s->isr = 0;
-	s->isr_ack = 0xff;
 	s->priority_add = 0;
 	s->irq_base = 0;
 	s->read_reg_select = 0;
@@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this,
 */
 static void pic_irq_request(struct kvm *kvm, int level)
 {
-	struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
 	struct kvm_pic *s = pic_irqchip(kvm);
-	int irq = pic_get_irq(&s->pics[0]);

-	s->output = level;
-	if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
-		s->pics[0].isr_ack &= ~(1 << irq);
+	if (!s->output)
 		s->wakeup_needed = true;
-	}
+	s->output = level;
 }

 static const struct kvm_io_device_ops picdev_ops = {
@@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	s->pics[1].elcr_mask = 0xde;
 	s->pics[0].pics_state = s;
 	s->pics[1].pics_state = s;
-	s->pics[0].isr_ack = 0xff;
-	s->pics[1].isr_ack = 0xff;

 	/*
 	 * Initialize PIO device

--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_INIT:
 		if (level) {
 			result = 1;
-			if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
-				printk(KERN_DEBUG
-				       "INIT on a runnable vcpu %d\n",
-				       vcpu->vcpu_id);
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
@@ -875,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)

 	hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);

-	if (vcpu->arch.apic->regs_page)
-		__free_page(vcpu->arch.apic->regs_page);
+	if (vcpu->arch.apic->regs)
+		free_page((unsigned long)vcpu->arch.apic->regs);

 	kfree(vcpu->arch.apic);
 }
@@ -1065,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)

 	vcpu->arch.apic = apic;

-	apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
-	if (apic->regs_page == NULL) {
+	apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!apic->regs) {
 		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
 		       vcpu->vcpu_id);
 		goto nomem_free_apic;
 	}
-	apic->regs = page_address(apic->regs_page);
 	apic->vcpu = vcpu;

 	hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,

--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,7 +13,6 @@ struct kvm_lapic {
 	u32 divide_count;
 	struct kvm_vcpu *vcpu;
 	bool irr_pending;
-	struct page *regs_page;
 	void *regs;
 	gpa_t vapic_addr;
 	struct page *vapic_page;

--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -31,7 +31,6 @@
 	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
 	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
-	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
 	#ifdef CONFIG_X86_64
 	#define PT_MAX_FULL_LEVELS 4
@@ -48,7 +47,6 @@
 	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
 	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
 	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
-	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
 	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
 	#define CMPXCHG cmpxchg
@@ -327,7 +325,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 }

 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			      u64 *spte, const void *pte)
+			      u64 *spte, const void *pte, unsigned long mmu_seq)
 {
 	pt_element_t gpte;
 	unsigned pte_access;
@@ -339,14 +337,14 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,

 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
-	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
+	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
+	if (is_error_pfn(pfn)) {
+		kvm_release_pfn_clean(pfn);
 		return;
-	pfn = vcpu->arch.update_pte.pfn;
-	if (is_error_pfn(pfn))
-		return;
-	if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
+	}
+	if (mmu_notifier_retry(vcpu, mmu_seq))
 		return;
-	kvm_get_pfn(pfn);
+
 	/*
 	 * we call mmu_set_spte() with host_writable = true because that
 	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
@@ -829,7 +827,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 #undef FNAME
 #undef PT_BASE_ADDR_MASK
 #undef PT_INDEX
-#undef PT_LEVEL_MASK
 #undef PT_LVL_ADDR_MASK
 #undef PT_LVL_OFFSET_MASK
 #undef PT_LEVEL_BITS

--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -135,6 +135,8 @@ struct vcpu_svm {

 	u32 *msrpm;

+	ulong nmi_iret_rip;
+
 	struct nested_state nested;

 	bool nmi_singlestep;
@@ -1153,7 +1155,9 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
 	load_gs_index(svm->host.gs);
 #else
+#ifdef CONFIG_X86_32_LAZY_GS
 	loadsegment(gs, svm->host.gs);
+#endif
 #endif
 	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
@@ -2653,6 +2657,7 @@ static int iret_interception(struct vcpu_svm *svm)
 	++svm->vcpu.stat.nmi_window_exits;
 	clr_intercept(svm, INTERCEPT_IRET);
 	svm->vcpu.arch.hflags |= HF_IRET_MASK;
+	svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
 	return 1;
 }

@@ -3474,7 +3479,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)

 	svm->int3_injected = 0;

-	if (svm->vcpu.arch.hflags & HF_IRET_MASK) {
+	/*
+	 * If we've made progress since setting HF_IRET_MASK, we've
+	 * executed an IRET and can allow NMI injection.
+	 */
+	if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
+	    && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
 		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
 		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 	}
@@ -3641,19 +3651,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
 #else
 	loadsegment(fs, svm->host.fs);
+#ifndef CONFIG_X86_32_LAZY_GS
+	loadsegment(gs, svm->host.gs);
+#endif
 #endif

 	reload_tss(vcpu);

 	local_irq_disable();

-	stgi();
-
 	vcpu->arch.cr2 = svm->vmcb->save.cr2;
 	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
 	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
 	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;

+	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+		kvm_before_handle_nmi(&svm->vcpu);
+
+	stgi();
+
+	/* Any pending NMI will happen here */
+
+	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+		kvm_after_handle_nmi(&svm->vcpu);
+
 	sync_cr8_to_lapic(vcpu);

 	svm->next_rip = 0;

--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
--- a/drivers/infiniband/hw/ipath/ipath_user_pages.c
+++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c
@@ -53,8 +53,8 @@ static void __ipath_release_user_pages(struct page **p, size_t num_pages,
 }

 /* call with current->mm->mmap_sem held */
-static int __get_user_pages(unsigned long start_page, size_t num_pages,
-			struct page **p, struct vm_area_struct **vma)
+static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+				  struct page **p, struct vm_area_struct **vma)
 {
 	unsigned long lock_limit;
 	size_t got;
@@ -165,7 +165,7 @@ int ipath_get_user_pages(unsigned long start_page, size_t num_pages,

 	down_write(&current->mm->mmap_sem);

-	ret = __get_user_pages(start_page, num_pages, p, NULL);
+	ret = __ipath_get_user_pages(start_page, num_pages, p, NULL);

 	up_write(&current->mm->mmap_sem);


--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -51,8 +51,8 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages,
 /*
 * Call with current->mm->mmap_sem held.
 */
-static int __get_user_pages(unsigned long start_page, size_t num_pages,
-			    struct page **p, struct vm_area_struct **vma)
+static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
+				struct page **p, struct vm_area_struct **vma)
 {
 	unsigned long lock_limit;
 	size_t got;
@@ -136,7 +136,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages,

 	down_write(&current->mm->mmap_sem);

-	ret = __get_user_pages(start_page, num_pages, p, NULL);
+	ret = __qib_get_user_pages(start_page, num_pages, p, NULL);

 	up_write(&current->mm->mmap_sem);


--- a/include/asm-generic/errno.h
+++ b/include/asm-generic/errno.h
@@ -108,4 +108,6 @@

 #define ERFKILL		132	/* Operation not possible due to RF-kill */

+#define EHWPOISON	133	/* Memory page has hardware error */
+
 #endif
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -43,6 +43,7 @@
 #define KVM_REQ_DEACTIVATE_FPU    10
 #define KVM_REQ_EVENT             11
 #define KVM_REQ_APF_HALT          12
+#define KVM_REQ_NMI               13

 #define KVM_USERSPACE_IRQ_SOURCE_ID	0

@@ -98,23 +99,31 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif

+enum {
+	OUTSIDE_GUEST_MODE,
+	IN_GUEST_MODE,
+	EXITING_GUEST_MODE
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	struct preempt_notifier preempt_notifier;
 #endif
+	int cpu;
 	int vcpu_id;
-	struct mutex mutex;
-	int   cpu;
-	atomic_t guest_mode;
-	struct kvm_run *run;
+	int srcu_idx;
+	int mode;
 	unsigned long requests;
 	unsigned long guest_debug;
-	int srcu_idx;
+
+	struct mutex mutex;
+	struct kvm_run *run;

 	int fpu_active;
 	int guest_fpu_loaded, guest_xcr0_loaded;
 	wait_queue_head_t wq;
+	struct pid *pid;
 	int sigset_active;
 	sigset_t sigset;
 	struct kvm_vcpu_stat stat;
@@ -140,6 +149,11 @@ struct kvm_vcpu {
 	struct kvm_vcpu_arch arch;
 };

+static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
+{
+	return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE);
+}
+
 /*
 * Some of the bitops functions do not support too long bitmaps.
 * This number must be determined not to exceed such limits.
@@ -212,7 +226,6 @@ struct kvm_memslots {

 struct kvm {
 	spinlock_t mmu_lock;
-	raw_spinlock_t requests_lock;
 	struct mutex slots_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	struct kvm_memslots *memslots;
@@ -223,6 +236,7 @@ struct kvm {
 #endif
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 	atomic_t online_vcpus;
+	int last_boosted_vcpu;
 	struct list_head vm_list;
 	struct mutex lock;
 	struct kvm_io_bus *buses[KVM_NR_BUSES];
@@ -719,11 +733,6 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
 	set_bit(req, &vcpu->requests);
 }

-static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
-{
-	return test_and_set_bit(req, &vcpu->requests);
-}
-
 static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
 {
 	if (test_bit(req, &vcpu->requests)) {

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -972,6 +972,10 @@ static inline int handle_mm_fault(struct mm_struct *mm,
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);

+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, int len, unsigned int foll_flags,
+		     struct page **pages, struct vm_area_struct **vmas,
+		     int *nonblocking);
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			unsigned long start, int nr_pages, int write, int force,
 			struct page **pages, struct vm_area_struct **vmas);
@@ -1535,6 +1539,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_FORCE	0x10	/* get_user_pages read/write w/o permission */
 #define FOLL_MLOCK	0x40	/* mark page as mlocked */
 #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
+#define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */

 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
@@ -1627,14 +1632,6 @@ extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
 extern atomic_long_t mce_bad_pages;
 extern int soft_offline_page(struct page *page, int flags);
-#ifdef CONFIG_MEMORY_FAILURE
-int is_hwpoison_address(unsigned long addr);
-#else
-static inline int is_hwpoison_address(unsigned long addr)
-{
-	return 0;
-}
-#endif

 extern void dump_page(struct page *page);


--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)
 	if (!profile_handoff_task(tsk))
 		free_task(tsk);
 }
+EXPORT_SYMBOL_GPL(__put_task_struct);

 /*
 * macro override instead of weak attribute alias, to workaround

--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -435,6 +435,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 	rcu_read_unlock();
 	return pid;
 }
+EXPORT_SYMBOL_GPL(get_task_pid);

 struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
 {
@@ -446,6 +447,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
 	rcu_read_unlock();
 	return result;
 }
+EXPORT_SYMBOL_GPL(get_pid_task);

 struct pid *find_get_pid(pid_t nr)
 {

--- a/mm/internal.h
+++ b/mm/internal.h
@@ -245,11 +245,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 }
 #endif /* CONFIG_SPARSEMEM */

-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int len, unsigned int foll_flags,
-		     struct page **pages, struct vm_area_struct **vmas,
-		     int *nonblocking);
-
 #define ZONE_RECLAIM_NOSCAN	-2
 #define ZONE_RECLAIM_FULL	-1
 #define ZONE_RECLAIM_SOME	0

--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1487,35 +1487,3 @@ int soft_offline_page(struct page *page, int flags)
 	/* keep elevated page count for bad page */
 	return ret;
 }
-
-/*
- * The caller must hold current->mm->mmap_sem in read mode.
- */
-int is_hwpoison_address(unsigned long addr)
-{
-	pgd_t *pgdp;
-	pud_t pud, *pudp;
-	pmd_t pmd, *pmdp;
-	pte_t pte, *ptep;
-	swp_entry_t entry;
-
-	pgdp = pgd_offset(current->mm, addr);
-	if (!pgd_present(*pgdp))
-		return 0;
-	pudp = pud_offset(pgdp, addr);
-	pud = *pudp;
-	if (!pud_present(pud) || pud_large(pud))
-		return 0;
-	pmdp = pmd_offset(pudp, addr);
-	pmd = *pmdp;
-	if (!pmd_present(pmd) || pmd_large(pmd))
-		return 0;
-	ptep = pte_offset_map(pmdp, addr);
-	pte = *ptep;
-	pte_unmap(ptep);
-	if (!is_swap_pte(pte))
-		return 0;
-	entry = pte_to_swp_entry(pte);
-	return is_hwpoison_entry(entry);
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_address);
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1410,6 +1410,55 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	return page;
 }

+/**
+ * __get_user_pages() - pin user pages in memory
+ * @tsk:	task_struct of target task
+ * @mm:		mm_struct of target mm
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @gup_flags:	flags modifying pin behaviour
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long. Or NULL, if caller
+ *		only intends to ensure the pages are faulted in.
+ * @vmas:	array of pointers to vmas corresponding to each page.
+ *		Or NULL if the caller does not require them.
+ * @nonblocking: whether waiting for disk IO or mmap_sem contention
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * __get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * __get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re-faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If FOLL_WRITE is not set in @gup_flags, the page must not be written to. If
+ * the page is written to, set_page_dirty (or set_page_dirty_lock, as
+ * appropriate) must be called after the page is finished with, and
+ * before put_page is called.
+ *
+ * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
+ * or mmap_sem contention, and if waiting is needed to pin all pages,
+ * *@nonblocking will be set to 0.
+ *
+ * In most cases, get_user_pages or get_user_pages_fast should be used
+ * instead of __get_user_pages. __get_user_pages should be used only if
+ * you need some special @gup_flags.
+ */
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int nr_pages, unsigned int gup_flags,
 		     struct page **pages, struct vm_area_struct **vmas,
@@ -1527,9 +1576,16 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				if (ret & VM_FAULT_ERROR) {
 					if (ret & VM_FAULT_OOM)
 						return i ? i : -ENOMEM;
-					if (ret &
-					    (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
-					     VM_FAULT_SIGBUS))
+					if (ret & (VM_FAULT_HWPOISON |
+						   VM_FAULT_HWPOISON_LARGE)) {
+						if (i)
+							return i;
+						else if (gup_flags & FOLL_HWPOISON)
+							return -EHWPOISON;
+						else
+							return -EFAULT;
+					}
+					if (ret & VM_FAULT_SIGBUS)
 						return i ? i : -EFAULT;
 					BUG();
 				}
@@ -1578,6 +1634,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	} while (nr_pages);
 	return i;
 }
+EXPORT_SYMBOL(__get_user_pages);

 /**
 * get_user_pages() - pin user pages in memory

--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -313,8 +313,9 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
 		if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
 			/*
 			 * This rcu_assign_pointer is needed for when
-			 * another thread calls kvm_irqfd_update before
-			 * we flush workqueue below.
+			 * another thread calls kvm_irq_routing_update before
+			 * we flush workqueue below (we synchronize with
+			 * kvm_irq_routing_update using irqfds.lock).
 			 * It is paired with synchronize_rcu done by caller
 			 * of that function.
 			 */

--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -69,7 +69,7 @@ MODULE_LICENSE("GPL");
 * 		kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_RAW_SPINLOCK(kvm_lock);
 LIST_HEAD(vm_list);

 static cpumask_var_t cpus_hardware_enabled;
@@ -137,6 +137,14 @@ void vcpu_load(struct kvm_vcpu *vcpu)
 	int cpu;

 	mutex_lock(&vcpu->mutex);
+	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
+		/* The thread running this VCPU changed. */
+		struct pid *oldpid = vcpu->pid;
+		struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
+		rcu_assign_pointer(vcpu->pid, newpid);
+		synchronize_rcu();
+		put_pid(oldpid);
+	}
 	cpu = get_cpu();
 	preempt_notifier_register(&vcpu->preempt_notifier);
 	kvm_arch_vcpu_load(vcpu, cpu);
@@ -165,13 +173,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)

 	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

-	raw_spin_lock(&kvm->requests_lock);
-	me = smp_processor_id();
+	me = get_cpu();
 	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (kvm_make_check_request(req, vcpu))
-			continue;
+		kvm_make_request(req, vcpu);
 		cpu = vcpu->cpu;
-		if (cpus != NULL && cpu != -1 && cpu != me)
+
+		/* Set ->requests bit before we read ->mode */
+		smp_mb();
+
+		if (cpus != NULL && cpu != -1 && cpu != me &&
+		      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
 			cpumask_set_cpu(cpu, cpus);
 	}
 	if (unlikely(cpus == NULL))
@@ -180,7 +191,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 		smp_call_function_many(cpus, ack_flush, NULL, 1);
 	else
 		called = false;
-	raw_spin_unlock(&kvm->requests_lock);
+	put_cpu();
 	free_cpumask_var(cpus);
 	return called;
 }
@@ -209,6 +220,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->cpu = -1;
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
+	vcpu->pid = NULL;
 	init_waitqueue_head(&vcpu->wq);
 	kvm_async_pf_vcpu_init(vcpu);

@@ -233,6 +245,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_init);

 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
+	put_pid(vcpu->pid);
 	kvm_arch_vcpu_uninit(vcpu);
 	free_page((unsigned long)vcpu->run);
 }
@@ -463,15 +476,14 @@ static struct kvm *kvm_create_vm(void)
 	kvm->mm = current->mm;
 	atomic_inc(&kvm->mm->mm_count);
 	spin_lock_init(&kvm->mmu_lock);
-	raw_spin_lock_init(&kvm->requests_lock);
 	kvm_eventfd_init(kvm);
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
 	atomic_set(&kvm->users_count, 1);
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);

 	return kvm;

@@ -544,9 +556,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	struct mm_struct *mm = kvm->mm;

 	kvm_arch_sync_events(kvm);
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 	kvm_free_irq_routing(kvm);
 	for (i = 0; i < KVM_NR_BUSES; i++)
 		kvm_io_bus_destroy(kvm->buses[i]);
@@ -588,6 +600,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 	return 0;
 }

+#ifndef CONFIG_S390
 /*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * This makes it possible to do double buffering: see x86's
@@ -608,6 +621,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 	memslot->dirty_bitmap_head = memslot->dirty_bitmap;
 	return 0;
 }
+#endif /* !CONFIG_S390 */

 /*
 * Allocate some memory and give it an address in the guest physical address
@@ -621,7 +635,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem,
 			    int user_alloc)
 {
-	int r, flush_shadow = 0;
+	int r;
 	gfn_t base_gfn;
 	unsigned long npages;
 	unsigned long i;
@@ -741,8 +755,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		if (kvm_create_dirty_bitmap(&new) < 0)
 			goto out_free;
 		/* destroy any largepage mappings for dirty tracking */
-		if (old.npages)
-			flush_shadow = 1;
 	}
 #else  /* not defined CONFIG_S390 */
 	new.user_alloc = user_alloc;
@@ -813,9 +825,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	kvm_free_physmem_slot(&old, &new);
 	kfree(old_memslots);

-	if (flush_shadow)
-		kvm_arch_flush_shadow(kvm);
-
 	return 0;

 out_free:
@@ -1029,6 +1038,15 @@ static pfn_t get_fault_pfn(void)
 	return fault_pfn;
 }

+/* Nonzero iff a write fault-in of @addr in current->mm gives -EHWPOISON. */
+static inline int check_user_page_hwpoison(unsigned long addr)
+{
+	const unsigned int gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
+
+	return __get_user_pages(current, current->mm, addr, 1, gup_flags,
+				NULL, NULL, NULL) == -EHWPOISON;
+}
+
 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 			bool *async, bool write_fault, bool *writable)
 {
@@ -1076,7 +1094,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 			return get_fault_pfn();

 		down_read(&current->mm->mmap_sem);
-		if (is_hwpoison_address(addr)) {
+		if (check_user_page_hwpoison(addr)) {
 			up_read(&current->mm->mmap_sem);
 			get_page(hwpoison_page);
 			return page_to_pfn(hwpoison_page);
@@ -1466,18 +1484,55 @@ void kvm_resched(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_resched);

-void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
+void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 {
-	ktime_t expires;
-	DEFINE_WAIT(wait);
-
-	prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
-
-	/* Sleep for 100 us, and hope lock-holder got scheduled */
-	expires = ktime_add_ns(ktime_get(), 100000UL);
-	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+	struct kvm *kvm = me->kvm;
+	struct kvm_vcpu *vcpu;
+	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+	int yielded = 0;
+	int pass;
+	int i;

-	finish_wait(&vcpu->wq, &wait);
+	/*
+	 * We boost the priority of a VCPU that is runnable but not
+	 * currently running, because it got preempted by something
+	 * else and called schedule in __vcpu_run.  Hopefully that
+	 * VCPU is holding the lock that we need and will release it.
+	 * We approximate round-robin by starting at the last boosted VCPU.
+	 */
+	for (pass = 0; pass < 2 && !yielded; pass++) {
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			struct task_struct *task = NULL;
+			struct pid *pid;
+			if (!pass && i < last_boosted_vcpu) {
+				i = last_boosted_vcpu;
+				continue;
+			} else if (pass && i > last_boosted_vcpu)
+				break;
+			if (vcpu == me)
+				continue;
+			if (waitqueue_active(&vcpu->wq))
+				continue;
+			rcu_read_lock();
+			pid = rcu_dereference(vcpu->pid);
+			if (pid)
+				task = get_pid_task(pid, PIDTYPE_PID);
+			rcu_read_unlock();
+			if (!task)
+				continue;
+			if (task->flags & PF_VCPU) {
+				put_task_struct(task);
+				continue;
+			}
+			if (yield_to(task, 1)) {
+				put_task_struct(task);
+				kvm->last_boosted_vcpu = i;
+				yielded = 1;
+				break;
+			}
+			put_task_struct(task);
+		}
+	}
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);

@@ -2122,9 +2177,9 @@ static void hardware_enable_nolock(void *junk)

 static void hardware_enable(void *junk)
 {
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	hardware_enable_nolock(junk);
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 }

 static void hardware_disable_nolock(void *junk)
@@ -2139,9 +2194,9 @@ static void hardware_disable_nolock(void *junk)

 static void hardware_disable(void *junk)
 {
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	hardware_disable_nolock(junk);
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 }

 static void hardware_disable_all_nolock(void)
@@ -2155,16 +2210,16 @@ static void hardware_disable_all_nolock(void)

 static void hardware_disable_all(void)
 {
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	hardware_disable_all_nolock();
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 }

 static int hardware_enable_all(void)
 {
 	int r = 0;

-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);

 	kvm_usage_count++;
 	if (kvm_usage_count == 1) {
@@ -2177,7 +2232,7 @@ static int hardware_enable_all(void)
 		}
 	}

-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);

 	return r;
 }
@@ -2339,10 +2394,10 @@ static int vm_stat_get(void *_offset, u64 *val)
 	struct kvm *kvm;

 	*val = 0;
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		*val += *(u32 *)((void *)kvm + offset);
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 	return 0;
 }

@@ -2356,12 +2411,12 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 	int i;

 	*val = 0;
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			*val += *(u32 *)((void *)vcpu + offset);

-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 	return 0;
 }

@@ -2402,7 +2457,7 @@ static int kvm_suspend(struct sys_device *dev, pm_message_t state)
 static int kvm_resume(struct sys_device *dev)
 {
 	if (kvm_usage_count) {
-		WARN_ON(spin_is_locked(&kvm_lock));
+		WARN_ON(raw_spin_is_locked(&kvm_lock));
 		hardware_enable_nolock(NULL);
 	}
 	return 0;