Commit ec30dcf7 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Radim Krčmář:
 "PPC:

   - Close a hole which could possibly lead to the host timebase getting
     out of sync.

   - Three fixes relating to PTEs and TLB entries for radix guests.

   - Fix a bug which could lead to an interrupt never getting delivered
     to the guest, if it is pending for a guest vCPU when the vCPU gets
     offlined.

  s390:

   - Fix false negatives in VSIE validity check (Cc stable)

  x86:

   - Fix time drift of VMX preemption timer when a guest uses LAPIC
     timer in periodic mode (Cc stable)

   - Unconditionally expose CPUID.IA32_ARCH_CAPABILITIES to allow
     migration from hosts that don't need retpoline mitigation (Cc
     stable)

   - Fix guest crashes on reboot by properly coupling CR4.OSXSAVE and
     CPUID.OSXSAVE (Cc stable)

   - Report correct RIP after Hyper-V hypercall #UD (introduced in
     -rc6)"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86: fix #UD address of failed Hyper-V hypercalls
  kvm: x86: IA32_ARCH_CAPABILITIES is always supported
  KVM: x86: Update cpuid properly when CR4.OSXAVE or CR4.PKE is changed
  x86/kvm: fix LAPIC timer drift when guest uses periodic mode
  KVM: s390: vsie: fix < 8k check for the itdba
  KVM: PPC: Book 3S HV: Do ptesync in radix guest exit path
  KVM: PPC: Book3S HV: XIVE: Resend re-routed interrupts on CPU priority change
  KVM: PPC: Book3S HV: Make radix clear pte when unmapping
  KVM: PPC: Book3S HV: Make radix use correct tlbie sequence in kvmppc_radix_tlbie_page
  KVM: PPC: Book3S HV: Snapshot timebase offset on guest entry
parents bc2dbc54 696ca779
......@@ -96,6 +96,7 @@ struct kvmppc_vcore {
struct kvm_vcpu *runner;
struct kvm *kvm;
u64 tb_offset; /* guest timebase - host timebase */
u64 tb_offset_applied; /* timebase offset currently in force */
ulong lpcr;
u32 arch_compat;
ulong pcr;
......
......@@ -562,6 +562,7 @@ int main(void)
OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads);
OFFSET(VCORE_KVM, kvmppc_vcore, kvm);
OFFSET(VCORE_TB_OFFSET, kvmppc_vcore, tb_offset);
OFFSET(VCORE_TB_OFFSET_APPL, kvmppc_vcore, tb_offset_applied);
OFFSET(VCORE_LPCR, kvmppc_vcore, lpcr);
OFFSET(VCORE_PCR, kvmppc_vcore, pcr);
OFFSET(VCORE_DPDES, kvmppc_vcore, dpdes);
......
......@@ -162,7 +162,7 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG))
asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
: : "r" (addr), "r" (kvm->arch.lpid) : "memory");
asm volatile("ptesync": : :"memory");
asm volatile("eieio ; tlbsync ; ptesync": : :"memory");
}
static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr)
......@@ -173,7 +173,7 @@ static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr)
/* RIC=1 PRS=0 R=1 IS=2 */
asm volatile(PPC_TLBIE_5(%0, %1, 1, 0, 1)
: : "r" (rb), "r" (kvm->arch.lpid) : "memory");
asm volatile("ptesync": : :"memory");
asm volatile("eieio ; tlbsync ; ptesync": : :"memory");
}
unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
......@@ -584,7 +584,7 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep)) {
old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0,
gpa, shift);
kvmppc_radix_tlbie_page(kvm, gpa, shift);
if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
......
......@@ -2441,6 +2441,7 @@ static void init_vcore_to_run(struct kvmppc_vcore *vc)
vc->in_guest = 0;
vc->napping_threads = 0;
vc->conferring_threads = 0;
vc->tb_offset_applied = 0;
}
static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
......
......@@ -692,6 +692,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
22: ld r8,VCORE_TB_OFFSET(r5)
cmpdi r8,0
beq 37f
std r8, VCORE_TB_OFFSET_APPL(r5)
mftb r6 /* current host timebase */
add r8,r8,r6
mtspr SPRN_TBU40,r8 /* update upper 40 bits */
......@@ -940,18 +941,6 @@ FTR_SECTION_ELSE
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
8:
/*
* Set the decrementer to the guest decrementer.
*/
ld r8,VCPU_DEC_EXPIRES(r4)
/* r8 is a host timebase value here, convert to guest TB */
ld r5,HSTATE_KVM_VCORE(r13)
ld r6,VCORE_TB_OFFSET(r5)
add r8,r8,r6
mftb r7
subf r3,r7,r8
mtspr SPRN_DEC,r3
ld r5, VCPU_SPRG0(r4)
ld r6, VCPU_SPRG1(r4)
ld r7, VCPU_SPRG2(r4)
......@@ -1005,6 +994,18 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
mtspr SPRN_LPCR,r8
isync
/*
* Set the decrementer to the guest decrementer.
*/
ld r8,VCPU_DEC_EXPIRES(r4)
/* r8 is a host timebase value here, convert to guest TB */
ld r5,HSTATE_KVM_VCORE(r13)
ld r6,VCORE_TB_OFFSET_APPL(r5)
add r8,r8,r6
mftb r7
subf r3,r7,r8
mtspr SPRN_DEC,r3
/* Check if HDEC expires soon */
mfspr r3, SPRN_HDEC
EXTEND_HDEC(r3)
......@@ -1597,8 +1598,27 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
guest_bypass:
stw r12, STACK_SLOT_TRAP(r1)
mr r3, r12
/* Save DEC */
/* Do this before kvmhv_commence_exit so we know TB is guest TB */
ld r3, HSTATE_KVM_VCORE(r13)
mfspr r5,SPRN_DEC
mftb r6
/* On P9, if the guest has large decr enabled, don't sign extend */
BEGIN_FTR_SECTION
ld r4, VCORE_LPCR(r3)
andis. r4, r4, LPCR_LD@h
bne 16f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r5,r5
16: add r5,r5,r6
/* r5 is a guest timebase value here, convert to host TB */
ld r4,VCORE_TB_OFFSET_APPL(r3)
subf r5,r4,r5
std r5,VCPU_DEC_EXPIRES(r9)
/* Increment exit count, poke other threads to exit */
mr r3, r12
bl kvmhv_commence_exit
nop
ld r9, HSTATE_KVM_VCPU(r13)
......@@ -1639,23 +1659,6 @@ guest_bypass:
mtspr SPRN_PURR,r3
mtspr SPRN_SPURR,r4
/* Save DEC */
ld r3, HSTATE_KVM_VCORE(r13)
mfspr r5,SPRN_DEC
mftb r6
/* On P9, if the guest has large decr enabled, don't sign extend */
BEGIN_FTR_SECTION
ld r4, VCORE_LPCR(r3)
andis. r4, r4, LPCR_LD@h
bne 16f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r5,r5
16: add r5,r5,r6
/* r5 is a guest timebase value here, convert to host TB */
ld r4,VCORE_TB_OFFSET(r3)
subf r5,r4,r5
std r5,VCPU_DEC_EXPIRES(r9)
BEGIN_FTR_SECTION
b 8f
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
......@@ -1905,6 +1908,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
cmpwi cr2, r0, 0
beq cr2, 4f
/*
* Radix: do eieio; tlbsync; ptesync sequence in case we
* interrupted the guest between a tlbie and a ptesync.
*/
eieio
tlbsync
ptesync
/* Radix: Handle the case where the guest used an illegal PID */
LOAD_REG_ADDR(r4, mmu_base_pid)
lwz r3, VCPU_GUEST_PID(r9)
......@@ -2017,9 +2028,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
27:
/* Subtract timebase offset from timebase */
ld r8,VCORE_TB_OFFSET(r5)
ld r8, VCORE_TB_OFFSET_APPL(r5)
cmpdi r8,0
beq 17f
li r0, 0
std r0, VCORE_TB_OFFSET_APPL(r5)
mftb r6 /* current guest timebase */
subf r8,r8,r6
mtspr SPRN_TBU40,r8 /* update upper 40 bits */
......@@ -2700,7 +2713,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
add r3, r3, r5
ld r4, HSTATE_KVM_VCPU(r13)
ld r5, HSTATE_KVM_VCORE(r13)
ld r6, VCORE_TB_OFFSET(r5)
ld r6, VCORE_TB_OFFSET_APPL(r5)
subf r3, r6, r3 /* convert to host TB value */
std r3, VCPU_DEC_EXPIRES(r4)
......@@ -2799,7 +2812,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/* Restore guest decrementer */
ld r3, VCPU_DEC_EXPIRES(r4)
ld r5, HSTATE_KVM_VCORE(r13)
ld r6, VCORE_TB_OFFSET(r5)
ld r6, VCORE_TB_OFFSET_APPL(r5)
add r3, r3, r6 /* convert host TB to guest TB value */
mftb r7
subf r3, r7, r3
......@@ -3606,12 +3619,9 @@ kvmppc_fix_pmao:
*/
kvmhv_start_timing:
ld r5, HSTATE_KVM_VCORE(r13)
lbz r6, VCORE_IN_GUEST(r5)
cmpwi r6, 0
beq 5f /* if in guest, need to */
ld r6, VCORE_TB_OFFSET(r5) /* subtract timebase offset */
5: mftb r5
subf r5, r6, r5
ld r6, VCORE_TB_OFFSET_APPL(r5)
mftb r5
subf r5, r6, r5 /* subtract current timebase offset */
std r3, VCPU_CUR_ACTIVITY(r4)
std r5, VCPU_ACTIVITY_START(r4)
blr
......@@ -3622,15 +3632,12 @@ kvmhv_start_timing:
*/
kvmhv_accumulate_time:
ld r5, HSTATE_KVM_VCORE(r13)
lbz r8, VCORE_IN_GUEST(r5)
cmpwi r8, 0
beq 4f /* if in guest, need to */
ld r8, VCORE_TB_OFFSET(r5) /* subtract timebase offset */
4: ld r5, VCPU_CUR_ACTIVITY(r4)
ld r8, VCORE_TB_OFFSET_APPL(r5)
ld r5, VCPU_CUR_ACTIVITY(r4)
ld r6, VCPU_ACTIVITY_START(r4)
std r3, VCPU_CUR_ACTIVITY(r4)
mftb r7
subf r7, r8, r7
subf r7, r8, r7 /* subtract current timebase offset */
std r7, VCPU_ACTIVITY_START(r4)
cmpdi r5, 0
beqlr
......
......@@ -11,6 +11,9 @@
#define XGLUE(a,b) a##b
#define GLUE(a,b) XGLUE(a,b)
/* Dummy interrupt used when taking interrupts out of a queue in H_CPPR */
#define XICS_DUMMY 1
static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
{
u8 cppr;
......@@ -205,6 +208,10 @@ static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc,
goto skip_ipi;
}
/* If it's the dummy interrupt, continue searching */
if (hirq == XICS_DUMMY)
goto skip_ipi;
/* If fetching, update queue pointers */
if (scan_type == scan_fetch) {
q->idx = idx;
......@@ -385,9 +392,76 @@ static void GLUE(X_PFX,push_pending_to_hw)(struct kvmppc_xive_vcpu *xc)
__x_writeb(prio, __x_tima + TM_SPC_SET_OS_PENDING);
}
static void GLUE(X_PFX,scan_for_rerouted_irqs)(struct kvmppc_xive *xive,
struct kvmppc_xive_vcpu *xc)
{
unsigned int prio;
/* For each priority that is now masked */
for (prio = xc->cppr; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
struct xive_q *q = &xc->queues[prio];
struct kvmppc_xive_irq_state *state;
struct kvmppc_xive_src_block *sb;
u32 idx, toggle, entry, irq, hw_num;
struct xive_irq_data *xd;
__be32 *qpage;
u16 src;
idx = q->idx;
toggle = q->toggle;
qpage = READ_ONCE(q->qpage);
if (!qpage)
continue;
/* For each interrupt in the queue */
for (;;) {
entry = be32_to_cpup(qpage + idx);
/* No more ? */
if ((entry >> 31) == toggle)
break;
irq = entry & 0x7fffffff;
/* Skip dummies and IPIs */
if (irq == XICS_DUMMY || irq == XICS_IPI)
goto next;
sb = kvmppc_xive_find_source(xive, irq, &src);
if (!sb)
goto next;
state = &sb->irq_state[src];
/* Has it been rerouted ? */
if (xc->server_num == state->act_server)
goto next;
/*
* Allright, it *has* been re-routed, kill it from
* the queue.
*/
qpage[idx] = cpu_to_be32((entry & 0x80000000) | XICS_DUMMY);
/* Find the HW interrupt */
kvmppc_xive_select_irq(state, &hw_num, &xd);
/* If it's not an LSI, set PQ to 11 the EOI will force a resend */
if (!(xd->flags & XIVE_IRQ_FLAG_LSI))
GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_11);
/* EOI the source */
GLUE(X_PFX,source_eoi)(hw_num, xd);
next:
idx = (idx + 1) & q->msk;
if (idx == 0)
toggle ^= 1;
}
}
}
X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
u8 old_cppr;
pr_devel("H_CPPR(cppr=%ld)\n", cppr);
......@@ -407,14 +481,34 @@ X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr)
*/
smp_mb();
/*
* We are masking less, we need to look for pending things
* to deliver and set VP pending bits accordingly to trigger
* a new interrupt otherwise we might miss MFRR changes for
* which we have optimized out sending an IPI signal.
*/
if (cppr > old_cppr)
if (cppr > old_cppr) {
/*
* We are masking less, we need to look for pending things
* to deliver and set VP pending bits accordingly to trigger
* a new interrupt otherwise we might miss MFRR changes for
* which we have optimized out sending an IPI signal.
*/
GLUE(X_PFX,push_pending_to_hw)(xc);
} else {
/*
* We are masking more, we need to check the queue for any
* interrupt that has been routed to another CPU, take
* it out (replace it with the dummy) and retrigger it.
*
* This is necessary since those interrupts may otherwise
* never be processed, at least not until this CPU restores
* its CPPR.
*
* This is in theory racy vs. HW adding new interrupts to
* the queue. In practice this works because the interesting
* cases are when the guest has done a set_xive() to move the
* interrupt away, which flushes the xive, followed by the
* target CPU doing a H_CPPR. So any new interrupt coming into
* the queue must still be routed to us and isn't a source
* of concern.
*/
GLUE(X_PFX,scan_for_rerouted_irqs)(xive, xc);
}
/* Apply new CPPR */
xc->hw_cppr = cppr;
......
......@@ -578,7 +578,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
gpa = READ_ONCE(scb_o->itdba) & ~0xffUL;
if (gpa && (scb_s->ecb & ECB_TE)) {
if (!(gpa & ~0x1fffU)) {
if (!(gpa & ~0x1fffUL)) {
rc = set_validity_icpt(scb_s, 0x0080U);
goto unpin;
}
......
......@@ -495,6 +495,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= ~F(PKU);
entry->edx &= kvm_cpuid_7_0_edx_x86_features;
cpuid_mask(&entry->edx, CPUID_7_EDX);
/*
* We emulate ARCH_CAPABILITIES in software even
* if the host doesn't support it.
*/
entry->edx |= F(ARCH_CAPABILITIES);
} else {
entry->ebx = 0;
entry->ecx = 0;
......
......@@ -1260,14 +1260,18 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
}
}
static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
{
struct kvm_run *run = vcpu->run;
kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result);
kvm_hv_hypercall_set_result(vcpu, result);
++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
}
static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
{
return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result);
}
static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
{
struct eventfd_ctx *eventfd;
......@@ -1350,7 +1354,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
/* Hypercall continuation is not supported yet */
if (rep_cnt || rep_idx) {
ret = HV_STATUS_INVALID_HYPERCALL_CODE;
goto set_result;
goto out;
}
switch (code) {
......@@ -1381,9 +1385,8 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
break;
}
set_result:
kvm_hv_hypercall_set_result(vcpu, ret);
return 1;
out:
return kvm_hv_hypercall_complete(vcpu, ret);
}
void kvm_hv_init_vm(struct kvm *kvm)
......
......@@ -1522,11 +1522,23 @@ static bool set_target_expiration(struct kvm_lapic *apic)
static void advance_periodic_target_expiration(struct kvm_lapic *apic)
{
apic->lapic_timer.tscdeadline +=
nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
ktime_t now = ktime_get();
u64 tscl = rdtsc();
ktime_t delta;
/*
* Synchronize both deadlines to the same time source or
* differences in the periods (caused by differences in the
* underlying clocks or numerical approximation errors) will
* cause the two to drift apart over time as the errors
* accumulate.
*/
apic->lapic_timer.target_expiration =
ktime_add_ns(apic->lapic_timer.target_expiration,
apic->lapic_timer.period);
delta = ktime_sub(apic->lapic_timer.target_expiration, now);
apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
nsec_to_cycles(apic->vcpu, delta);
}
static void start_sw_period(struct kvm_lapic *apic)
......
......@@ -6671,11 +6671,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
unsigned long nr, a0, a1, a2, a3, ret;
int op_64_bit;
if (kvm_hv_hypercall_enabled(vcpu->kvm)) {
if (!kvm_hv_hypercall(vcpu))
return 0;
goto out;
}
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
......@@ -6696,7 +6693,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
if (kvm_x86_ops->get_cpl(vcpu) != 0) {
ret = -KVM_EPERM;
goto out_error;
goto out;
}
switch (nr) {
......@@ -6716,12 +6713,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
ret = -KVM_ENOSYS;
break;
}
out_error:
out:
if (!op_64_bit)
ret = (u32)ret;
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
out:
++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
}
......@@ -7980,6 +7976,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct msr_data apic_base_msr;
int mmu_reset_needed = 0;
int cpuid_update_needed = 0;
int pending_vec, max_bits, idx;
struct desc_ptr dt;
int ret = -EINVAL;
......@@ -8018,8 +8015,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
vcpu->arch.cr0 = sregs->cr0;
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
(X86_CR4_OSXSAVE | X86_CR4_PKE));
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE))
if (cpuid_update_needed)
kvm_update_cpuid(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment