Commit e9bda6f6 authored by Avi Kivity

Merge branch 'queue' into next

Merge patches queued during the run-up to the merge window.

* queue: (25 commits)
  KVM: Choose better candidate for directed yield
  KVM: Note down when cpu relax intercepted or pause loop exited
  KVM: Add config to support ple or cpu relax optimzation
  KVM: switch to symbolic name for irq_states size
  KVM: x86: Fix typos in pmu.c
  KVM: x86: Fix typos in lapic.c
  KVM: x86: Fix typos in cpuid.c
  KVM: x86: Fix typos in emulate.c
  KVM: x86: Fix typos in x86.c
  KVM: SVM: Fix typos
  KVM: VMX: Fix typos
  KVM: remove the unused parameter of gfn_to_pfn_memslot
  KVM: remove is_error_hpa
  KVM: make bad_pfn static to kvm_main.c
  KVM: using get_fault_pfn to get the fault pfn
  KVM: MMU: track the refcount when unmap the page
  KVM: x86: remove unnecessary mark_page_dirty
  KVM: MMU: Avoid handling same rmap_pde in kvm_handle_hva_range()
  KVM: MMU: Push trace_kvm_age_page() into kvm_age_rmapp()
  KVM: MMU: Add memslot parameter to hva handlers
  ...
Signed-off-by: Avi Kivity <avi@redhat.com>
parents bdc0077a 06e48c51
...@@ -52,6 +52,8 @@ ...@@ -52,6 +52,8 @@
struct kvm; struct kvm;
extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
extern int kvm_unmap_hva_range(struct kvm *kvm,
unsigned long start, unsigned long end);
extern int kvm_age_hva(struct kvm *kvm, unsigned long hva); extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
......
...@@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
goto out_put; goto out_put;
} }
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, static int kvm_handle_hva_range(struct kvm *kvm,
int (*handler)(struct kvm *kvm, unsigned long *rmapp, unsigned long start,
unsigned long gfn)) unsigned long end,
int (*handler)(struct kvm *kvm,
unsigned long *rmapp,
unsigned long gfn))
{ {
int ret; int ret;
int retval = 0; int retval = 0;
...@@ -767,15 +770,25 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, ...@@ -767,15 +770,25 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
slots = kvm_memslots(kvm); slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) { kvm_for_each_memslot(memslot, slots) {
unsigned long start = memslot->userspace_addr; unsigned long hva_start, hva_end;
unsigned long end; gfn_t gfn, gfn_end;
end = start + (memslot->npages << PAGE_SHIFT); hva_start = max(start, memslot->userspace_addr);
if (hva >= start && hva < end) { hva_end = min(end, memslot->userspace_addr +
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; (memslot->npages << PAGE_SHIFT));
if (hva_start >= hva_end)
continue;
/*
* {gfn(page) | page intersects with [hva_start, hva_end)} =
* {gfn, gfn+1, ..., gfn_end-1}.
*/
gfn = hva_to_gfn_memslot(hva_start, memslot);
gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
for (; gfn < gfn_end; ++gfn) {
gfn_t gfn_offset = gfn - memslot->base_gfn;
ret = handler(kvm, &memslot->rmap[gfn_offset], ret = handler(kvm, &memslot->rmap[gfn_offset], gfn);
memslot->base_gfn + gfn_offset);
retval |= ret; retval |= ret;
} }
} }
...@@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, ...@@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
return retval; return retval;
} }
/*
 * Run @handler for the single host virtual address @hva.
 *
 * Implemented as a one-byte range query: the half-open interval
 * [hva, hva + 1) is handed to kvm_handle_hva_range() and its result is
 * returned unchanged.
 */
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 unsigned long gfn))
{
	const unsigned long hva_end = hva + 1;

	return kvm_handle_hva_range(kvm, hva, hva_end, handler);
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn) unsigned long gfn)
{ {
...@@ -850,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) ...@@ -850,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return 0; return 0;
} }
/*
 * Apply kvm_unmap_rmapp to the host virtual address range [start, end).
 *
 * Does nothing unless kvm->arch.using_mmu_notifiers is set. The value
 * produced by kvm_handle_hva_range() is not propagated; this function
 * always returns 0.
 */
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
{
	if (!kvm->arch.using_mmu_notifiers)
		return 0;

	kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
	return 0;
}
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn) unsigned long gfn)
{ {
......
...@@ -520,7 +520,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, ...@@ -520,7 +520,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
if (likely(!pfnmap)) { if (likely(!pfnmap)) {
unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); pfn = gfn_to_pfn_memslot(slot, gfn);
if (is_error_pfn(pfn)) { if (is_error_pfn(pfn)) {
printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
(long)gfn); (long)gfn);
......
...@@ -21,6 +21,7 @@ config KVM ...@@ -21,6 +21,7 @@ config KVM
depends on HAVE_KVM && EXPERIMENTAL depends on HAVE_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS select PREEMPT_NOTIFIERS
select ANON_INODES select ANON_INODES
select HAVE_KVM_CPU_RELAX_INTERCEPT
---help--- ---help---
Support hosting paravirtualized guest machines using the SIE Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work virtualization capability on the mainframe. This should work
......
...@@ -500,11 +500,11 @@ struct kvm_vcpu_arch { ...@@ -500,11 +500,11 @@ struct kvm_vcpu_arch {
}; };
struct kvm_lpage_info { struct kvm_lpage_info {
unsigned long rmap_pde;
int write_count; int write_count;
}; };
struct kvm_arch_memory_slot { struct kvm_arch_memory_slot {
unsigned long *rmap_pde[KVM_NR_PAGE_SIZES - 1];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
}; };
...@@ -957,6 +957,7 @@ extern bool kvm_rebooting; ...@@ -957,6 +957,7 @@ extern bool kvm_rebooting;
#define KVM_ARCH_WANT_MMU_NOTIFIER #define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_age_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
......
...@@ -37,6 +37,7 @@ config KVM ...@@ -37,6 +37,7 @@ config KVM
select TASK_DELAY_ACCT select TASK_DELAY_ACCT
select PERF_EVENTS select PERF_EVENTS
select HAVE_KVM_MSI select HAVE_KVM_MSI
select HAVE_KVM_CPU_RELAX_INTERCEPT
---help--- ---help---
Support hosting fully virtualized guest machines using hardware Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent virtualization extensions. You will need a fairly recent
......
...@@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, ...@@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
} }
case 7: { case 7: {
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* Mask ebx against host capbability word 9 */ /* Mask ebx against host capability word 9 */
if (index == 0) { if (index == 0) {
entry->ebx &= kvm_supported_word9_x86_features; entry->ebx &= kvm_supported_word9_x86_features;
cpuid_mask(&entry->ebx, 9); cpuid_mask(&entry->ebx, 9);
......
...@@ -642,7 +642,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, ...@@ -642,7 +642,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
goto bad; goto bad;
} else { } else {
/* exapand-down segment */ /* expand-down segment */
if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
goto bad; goto bad;
lim = desc.d ? 0xffffffff : 0xffff; lim = desc.d ? 0xffffffff : 0xffff;
...@@ -1383,7 +1383,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, ...@@ -1383,7 +1383,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
err_code = selector & 0xfffc; err_code = selector & 0xfffc;
err_vec = GP_VECTOR; err_vec = GP_VECTOR;
/* can't load system descriptor into segment selecor */ /* can't load system descriptor into segment selector */
if (seg <= VCPU_SREG_GS && !seg_desc.s) if (seg <= VCPU_SREG_GS && !seg_desc.s)
goto exception; goto exception;
...@@ -2398,7 +2398,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, ...@@ -2398,7 +2398,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
/* /*
* Now load segment descriptors. If fault happenes at this stage * Now load segment descriptors. If fault happens at this stage
* it is handled in a context of new task * it is handled in a context of new task
*/ */
ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
...@@ -2640,7 +2640,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ...@@ -2640,7 +2640,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
* *
* 1. jmp/call/int to task gate: Check against DPL of the task gate * 1. jmp/call/int to task gate: Check against DPL of the task gate
* 2. Exception/IRQ/iret: No check is performed * 2. Exception/IRQ/iret: No check is performed
* 3. jmp/call to TSS: Check agains DPL of the TSS * 3. jmp/call to TSS: Check against DPL of the TSS
*/ */
if (reason == TASK_SWITCH_GATE) { if (reason == TASK_SWITCH_GATE) {
if (idt_index != -1) { if (idt_index != -1) {
...@@ -2681,7 +2681,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ...@@ -2681,7 +2681,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
/* set back link to prev task only if NT bit is set in eflags /* set back link to prev task only if NT bit is set in eflags
note that old_tss_sel is not used afetr this point */ note that old_tss_sel is not used after this point */
if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
old_tss_sel = 0xffff; old_tss_sel = 0xffff;
......
...@@ -70,7 +70,7 @@ struct kvm_pic { ...@@ -70,7 +70,7 @@ struct kvm_pic {
struct kvm_io_device dev_slave; struct kvm_io_device dev_slave;
struct kvm_io_device dev_eclr; struct kvm_io_device dev_eclr;
void (*ack_notifier)(void *opaque, int irq); void (*ack_notifier)(void *opaque, int irq);
unsigned long irq_states[16]; unsigned long irq_states[PIC_NUM_PINS];
}; };
struct kvm_pic *kvm_create_pic(struct kvm *kvm); struct kvm_pic *kvm_create_pic(struct kvm *kvm);
......
...@@ -719,7 +719,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, ...@@ -719,7 +719,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
{ {
unsigned char alignment = offset & 0xf; unsigned char alignment = offset & 0xf;
u32 result; u32 result;
/* this bitmask has a bit cleared for each reserver register */ /* this bitmask has a bit cleared for each reserved register */
static const u64 rmask = 0x43ff01ffffffe70cULL; static const u64 rmask = 0x43ff01ffffffe70cULL;
if ((alignment + len) > 4) { if ((alignment + len) > 4) {
...@@ -792,7 +792,7 @@ static void start_apic_timer(struct kvm_lapic *apic) ...@@ -792,7 +792,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
atomic_set(&apic->lapic_timer.pending, 0); atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
/* lapic timer in oneshot or peroidic mode */ /* lapic timer in oneshot or periodic mode */
now = apic->lapic_timer.timer.base->get_time(); now = apic->lapic_timer.timer.base->get_time();
apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
* APIC_BUS_CYCLE_NS * apic->divide_count; * APIC_BUS_CYCLE_NS * apic->divide_count;
......
...@@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep) ...@@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
return 0; return 0;
pfn = spte_to_pfn(old_spte); pfn = spte_to_pfn(old_spte);
/*
* KVM does not hold the refcount of the page used by
* kvm mmu, before reclaiming the page, we should
* unmap it from mmu first.
*/
WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn); kvm_set_pfn_accessed(pfn);
if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
...@@ -960,13 +968,13 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) ...@@ -960,13 +968,13 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
struct kvm_memory_slot *slot) struct kvm_memory_slot *slot)
{ {
struct kvm_lpage_info *linfo; unsigned long idx;
if (likely(level == PT_PAGE_TABLE_LEVEL)) if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn]; return &slot->rmap[gfn - slot->base_gfn];
linfo = lpage_info_slot(gfn, slot, level); idx = gfn_to_index(gfn, slot->base_gfn, level);
return &linfo->rmap_pde; return &slot->arch.rmap_pde[level - PT_DIRECTORY_LEVEL][idx];
} }
/* /*
...@@ -1200,7 +1208,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) ...@@ -1200,7 +1208,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
} }
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data) struct kvm_memory_slot *slot, unsigned long data)
{ {
u64 *sptep; u64 *sptep;
struct rmap_iterator iter; struct rmap_iterator iter;
...@@ -1218,7 +1226,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, ...@@ -1218,7 +1226,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
} }
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data) struct kvm_memory_slot *slot, unsigned long data)
{ {
u64 *sptep; u64 *sptep;
struct rmap_iterator iter; struct rmap_iterator iter;
...@@ -1259,43 +1267,67 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, ...@@ -1259,43 +1267,67 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
return 0; return 0;
} }
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long data, unsigned long start,
int (*handler)(struct kvm *kvm, unsigned long *rmapp, unsigned long end,
unsigned long data)) unsigned long data,
int (*handler)(struct kvm *kvm,
unsigned long *rmapp,
struct kvm_memory_slot *slot,
unsigned long data))
{ {
int j; int j;
int ret; int ret = 0;
int retval = 0;
struct kvm_memslots *slots; struct kvm_memslots *slots;
struct kvm_memory_slot *memslot; struct kvm_memory_slot *memslot;
slots = kvm_memslots(kvm); slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) { kvm_for_each_memslot(memslot, slots) {
unsigned long start = memslot->userspace_addr; unsigned long hva_start, hva_end;
unsigned long end; gfn_t gfn_start, gfn_end;
end = start + (memslot->npages << PAGE_SHIFT); hva_start = max(start, memslot->userspace_addr);
if (hva >= start && hva < end) { hva_end = min(end, memslot->userspace_addr +
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; (memslot->npages << PAGE_SHIFT));
gfn_t gfn = memslot->base_gfn + gfn_offset; if (hva_start >= hva_end)
continue;
/*
* {gfn(page) | page intersects with [hva_start, hva_end)} =
* {gfn_start, gfn_start+1, ..., gfn_end-1}.
*/
gfn_start = hva_to_gfn_memslot(hva_start, memslot);
gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = PT_PAGE_TABLE_LEVEL;
j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
unsigned long idx, idx_end;
unsigned long *rmapp;
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { /*
struct kvm_lpage_info *linfo; * {idx(page_j) | page_j intersects with
* [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
*/
idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
linfo = lpage_info_slot(gfn, memslot, rmapp = __gfn_to_rmap(gfn_start, j, memslot);
PT_DIRECTORY_LEVEL + j);
ret |= handler(kvm, &linfo->rmap_pde, data); for (; idx <= idx_end; ++idx)
} ret |= handler(kvm, rmapp++, memslot, data);
trace_kvm_age_page(hva, memslot, ret);
retval |= ret;
} }
} }
return retval; return ret;
}
/*
 * Run @handler for the single host virtual address @hva, forwarding the
 * opaque @data argument.
 *
 * Implemented as a one-byte range query: the interval [hva, hva + 1) is
 * handed to kvm_handle_hva_range() and its result is returned unchanged.
 */
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  unsigned long data,
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 struct kvm_memory_slot *slot,
					 unsigned long data))
{
	const unsigned long hva_end = hva + 1;

	return kvm_handle_hva_range(kvm, hva, hva_end, data, handler);
}
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
...@@ -1303,13 +1335,18 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) ...@@ -1303,13 +1335,18 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
} }
/*
 * Apply kvm_unmap_rmapp to the host virtual address range [start, end).
 *
 * No extra data is passed to the handler (the data argument is 0); the
 * result of kvm_handle_hva_range() is returned to the caller as-is.
 */
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
{
	int ret;

	ret = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
	return ret;
}
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{ {
kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
} }
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data) struct kvm_memory_slot *slot, unsigned long data)
{ {
u64 *sptep; u64 *sptep;
struct rmap_iterator uninitialized_var(iter); struct rmap_iterator uninitialized_var(iter);
...@@ -1323,8 +1360,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, ...@@ -1323,8 +1360,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
* This has some overhead, but not as much as the cost of swapping * This has some overhead, but not as much as the cost of swapping
* out actively used pages or breaking up actively used hugepages. * out actively used pages or breaking up actively used hugepages.
*/ */
if (!shadow_accessed_mask) if (!shadow_accessed_mask) {
return kvm_unmap_rmapp(kvm, rmapp, data); young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
goto out;
}
for (sptep = rmap_get_first(*rmapp, &iter); sptep; for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) { sptep = rmap_get_next(&iter)) {
...@@ -1336,12 +1375,14 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, ...@@ -1336,12 +1375,14 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
(unsigned long *)sptep); (unsigned long *)sptep);
} }
} }
out:
/* @data has hva passed to kvm_age_hva(). */
trace_kvm_age_page(data, slot, young);
return young; return young;
} }
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data) struct kvm_memory_slot *slot, unsigned long data)
{ {
u64 *sptep; u64 *sptep;
struct rmap_iterator iter; struct rmap_iterator iter;
...@@ -1379,13 +1420,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) ...@@ -1379,13 +1420,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
kvm_flush_remote_tlbs(vcpu->kvm); kvm_flush_remote_tlbs(vcpu->kvm);
} }
int kvm_age_hva(struct kvm *kvm, unsigned long hva) int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{ {
return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
} }
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
...@@ -2472,14 +2513,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, ...@@ -2472,14 +2513,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
unsigned long hva; unsigned long hva;
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
if (!slot) { if (!slot)
get_page(fault_page); return get_fault_pfn();
return page_to_pfn(fault_page);
}
hva = gfn_to_hva_memslot(slot, gfn); hva = gfn_to_hva_memslot(slot, gfn);
return hva_to_pfn_atomic(vcpu->kvm, hva); return hva_to_pfn_atomic(hva);
} }
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
......
/* /*
* Kernel-based Virtual Machine -- Performane Monitoring Unit support * Kernel-based Virtual Machine -- Performance Monitoring Unit support
* *
* Copyright 2011 Red Hat, Inc. and/or its affiliates. * Copyright 2011 Red Hat, Inc. and/or its affiliates.
* *
......
...@@ -2063,7 +2063,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) ...@@ -2063,7 +2063,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
if (svm->nested.intercept & 1ULL) { if (svm->nested.intercept & 1ULL) {
/* /*
* The #vmexit can't be emulated here directly because this * The #vmexit can't be emulated here directly because this
* code path runs with irqs and preemtion disabled. A * code path runs with irqs and preemption disabled. A
* #vmexit emulation might sleep. Only signal request for * #vmexit emulation might sleep. Only signal request for
* the #vmexit here. * the #vmexit here.
*/ */
...@@ -2409,7 +2409,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) ...@@ -2409,7 +2409,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{ {
/* /*
* This function merges the msr permission bitmaps of kvm and the * This function merges the msr permission bitmaps of kvm and the
* nested vmcb. It is omptimized in that it only merges the parts where * nested vmcb. It is optimized in that it only merges the parts where
* the kvm msr permission bitmap may contain zero bits * the kvm msr permission bitmap may contain zero bits
*/ */
int i; int i;
......
...@@ -1343,7 +1343,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) ...@@ -1343,7 +1343,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
guest_efer = vmx->vcpu.arch.efer; guest_efer = vmx->vcpu.arch.efer;
/* /*
* NX is emulated; LMA and LME handled by hardware; SCE meaninless * NX is emulated; LMA and LME handled by hardware; SCE meaningless
* outside long mode * outside long mode
*/ */
ignore_bits = EFER_NX | EFER_SCE; ignore_bits = EFER_NX | EFER_SCE;
...@@ -3261,7 +3261,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, ...@@ -3261,7 +3261,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
* qemu binaries. * qemu binaries.
* IA32 arch specifies that at the time of processor reset the * IA32 arch specifies that at the time of processor reset the
* "Accessed" bit in the AR field of segment registers is 1. And qemu * "Accessed" bit in the AR field of segment registers is 1. And qemu
* is setting it to 0 in the usedland code. This causes invalid guest * is setting it to 0 in the userland code. This causes invalid guest
* state vmexit when "unrestricted guest" mode is turned on. * state vmexit when "unrestricted guest" mode is turned on.
* Fix for this setup issue in cpu_reset is being pushed in the qemu * Fix for this setup issue in cpu_reset is being pushed in the qemu
* tree. Newer qemu binaries with that qemu fix would not need this * tree. Newer qemu binaries with that qemu fix would not need this
...@@ -4446,7 +4446,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) ...@@ -4446,7 +4446,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1; hypercall[2] = 0xc1;
} }
/* called to set cr0 as approriate for a mov-to-cr0 exit. */ /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{ {
if (to_vmx(vcpu)->nested.vmxon && if (to_vmx(vcpu)->nested.vmxon &&
......
...@@ -1093,7 +1093,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) ...@@ -1093,7 +1093,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
* For each generation, we track the original measured * For each generation, we track the original measured
* nanosecond time, offset, and write, so if TSCs are in * nanosecond time, offset, and write, so if TSCs are in
* sync, we can match exact offset, and if not, we can match * sync, we can match exact offset, and if not, we can match
* exact software computaion in compute_guest_tsc() * exact software computation in compute_guest_tsc()
* *
* These values are tracked in kvm->arch.cur_xxx variables. * These values are tracked in kvm->arch.cur_xxx variables.
*/ */
...@@ -1500,7 +1500,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) ...@@ -1500,7 +1500,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{ {
gpa_t gpa = data & ~0x3f; gpa_t gpa = data & ~0x3f;
/* Bits 2:5 are resrved, Should be zero */ /* Bits 2:5 are reserved, Should be zero */
if (data & 0x3c) if (data & 0x3c)
return 1; return 1;
...@@ -1723,7 +1723,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) ...@@ -1723,7 +1723,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
* Ignore all writes to this no longer documented MSR. * Ignore all writes to this no longer documented MSR.
* Writes are only relevant for old K7 processors, * Writes are only relevant for old K7 processors,
* all pre-dating SVM, but a recommended workaround from * all pre-dating SVM, but a recommended workaround from
* AMD for these chips. It is possible to speicify the * AMD for these chips. It is possible to specify the
* affected processor models on the command line, hence * affected processor models on the command line, hence
* the need to ignore the workaround. * the need to ignore the workaround.
*/ */
...@@ -2632,7 +2632,6 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) ...@@ -2632,7 +2632,6 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
if (!vcpu->arch.time_page) if (!vcpu->arch.time_page)
return -EINVAL; return -EINVAL;
src->flags |= PVCLOCK_GUEST_STOPPED; src->flags |= PVCLOCK_GUEST_STOPPED;
mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
return 0; return 0;
} }
...@@ -4492,7 +4491,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) ...@@ -4492,7 +4491,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
/* /*
* if emulation was due to access to shadowed page table * if emulation was due to access to shadowed page table
* and it failed try to unshadow page and re-entetr the * and it failed try to unshadow page and re-enter the
* guest to let CPU execute the instruction. * guest to let CPU execute the instruction.
*/ */
if (kvm_mmu_unprotect_page_virt(vcpu, gva)) if (kvm_mmu_unprotect_page_virt(vcpu, gva))
...@@ -5588,7 +5587,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) ...@@ -5588,7 +5587,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
/* /*
* We are here if userspace calls get_regs() in the middle of * We are here if userspace calls get_regs() in the middle of
* instruction emulation. Registers state needs to be copied * instruction emulation. Registers state needs to be copied
* back from emulation context to vcpu. Usrapace shouldn't do * back from emulation context to vcpu. Userspace shouldn't do
* that usually, but some bad designed PV devices (vmware * that usually, but some bad designed PV devices (vmware
* backdoor interface) need this to work * backdoor interface) need this to work
*/ */
...@@ -6117,7 +6116,7 @@ int kvm_arch_hardware_enable(void *garbage) ...@@ -6117,7 +6116,7 @@ int kvm_arch_hardware_enable(void *garbage)
* as we reset last_host_tsc on all VCPUs to stop this from being * as we reset last_host_tsc on all VCPUs to stop this from being
* called multiple times (one for each physical CPU bringup). * called multiple times (one for each physical CPU bringup).
* *
* Platforms with unnreliable TSCs don't have to deal with this, they * Platforms with unreliable TSCs don't have to deal with this, they
* will be compensated by the logic in vcpu_load, which sets the TSC to * will be compensated by the logic in vcpu_load, which sets the TSC to
* catchup mode. This will catchup all VCPUs to real time, but cannot * catchup mode. This will catchup all VCPUs to real time, but cannot
* guarantee that they stay in perfect synchronization. * guarantee that they stay in perfect synchronization.
...@@ -6314,6 +6313,10 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, ...@@ -6314,6 +6313,10 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
int i; int i;
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
if (!dont || free->arch.rmap_pde[i] != dont->arch.rmap_pde[i]) {
kvm_kvfree(free->arch.rmap_pde[i]);
free->arch.rmap_pde[i] = NULL;
}
if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
kvm_kvfree(free->arch.lpage_info[i]); kvm_kvfree(free->arch.lpage_info[i]);
free->arch.lpage_info[i] = NULL; free->arch.lpage_info[i] = NULL;
...@@ -6333,6 +6336,11 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) ...@@ -6333,6 +6336,11 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
lpages = gfn_to_index(slot->base_gfn + npages - 1, lpages = gfn_to_index(slot->base_gfn + npages - 1,
slot->base_gfn, level) + 1; slot->base_gfn, level) + 1;
slot->arch.rmap_pde[i] =
kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap_pde[i]));
if (!slot->arch.rmap_pde[i])
goto out_free;
slot->arch.lpage_info[i] = slot->arch.lpage_info[i] =
kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
if (!slot->arch.lpage_info[i]) if (!slot->arch.lpage_info[i])
...@@ -6361,7 +6369,9 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) ...@@ -6361,7 +6369,9 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
out_free: out_free:
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
kvm_kvfree(slot->arch.rmap_pde[i]);
kvm_kvfree(slot->arch.lpage_info[i]); kvm_kvfree(slot->arch.lpage_info[i]);
slot->arch.rmap_pde[i] = NULL;
slot->arch.lpage_info[i] = NULL; slot->arch.lpage_info[i] = NULL;
} }
return -ENOMEM; return -ENOMEM;
...@@ -6381,7 +6391,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, ...@@ -6381,7 +6391,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
map_flags = MAP_SHARED | MAP_ANONYMOUS; map_flags = MAP_SHARED | MAP_ANONYMOUS;
/*To keep backward compatibility with older userspace, /*To keep backward compatibility with older userspace,
*x86 needs to hanlde !user_alloc case. *x86 needs to handle !user_alloc case.
*/ */
if (!user_alloc) { if (!user_alloc) {
if (npages && !old.rmap) { if (npages && !old.rmap) {
......
...@@ -183,6 +183,18 @@ struct kvm_vcpu { ...@@ -183,6 +183,18 @@ struct kvm_vcpu {
} async_pf; } async_pf;
#endif #endif
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
* Cpu relax intercept or pause loop exit optimization
* in_spin_loop: set when a vcpu does a pause loop exit
* or cpu relax intercepted.
* dy_eligible: indicates whether vcpu is eligible for directed yield.
*/
struct {
bool in_spin_loop;
bool dy_eligible;
} spin_loop;
#endif
struct kvm_vcpu_arch arch; struct kvm_vcpu_arch arch;
}; };
...@@ -378,20 +390,11 @@ id_to_memslot(struct kvm_memslots *slots, int id) ...@@ -378,20 +390,11 @@ id_to_memslot(struct kvm_memslots *slots, int id)
return slot; return slot;
} }
#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
extern struct page *bad_page; extern struct page *bad_page;
extern struct page *fault_page;
extern pfn_t bad_pfn;
extern pfn_t fault_pfn;
int is_error_page(struct page *page); int is_error_page(struct page *page);
int is_error_pfn(pfn_t pfn); int is_error_pfn(pfn_t pfn);
int is_hwpoison_pfn(pfn_t pfn); int is_hwpoison_pfn(pfn_t pfn);
int is_fault_pfn(pfn_t pfn);
int is_noslot_pfn(pfn_t pfn); int is_noslot_pfn(pfn_t pfn);
int is_invalid_pfn(pfn_t pfn); int is_invalid_pfn(pfn_t pfn);
int kvm_is_error_hva(unsigned long addr); int kvm_is_error_hva(unsigned long addr);
...@@ -427,20 +430,20 @@ void kvm_release_page_dirty(struct page *page); ...@@ -427,20 +430,20 @@ void kvm_release_page_dirty(struct page *page);
void kvm_set_page_dirty(struct page *page); void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page); void kvm_set_page_accessed(struct page *page);
pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); pfn_t hva_to_pfn_atomic(unsigned long addr);
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
bool write_fault, bool *writable); bool write_fault, bool *writable);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable); bool *writable);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm, pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_release_pfn_dirty(pfn_t); void kvm_release_pfn_dirty(pfn_t);
void kvm_release_pfn_clean(pfn_t pfn); void kvm_release_pfn_clean(pfn_t pfn);
void kvm_set_pfn_dirty(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn);
void kvm_set_pfn_accessed(pfn_t pfn); void kvm_set_pfn_accessed(pfn_t pfn);
void kvm_get_pfn(pfn_t pfn); void kvm_get_pfn(pfn_t pfn);
pfn_t get_fault_pfn(void);
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int len); int len);
...@@ -740,6 +743,14 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) ...@@ -740,6 +743,14 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
} }
/*
 * Translate a host virtual address into the guest frame number it backs
 * within @slot: the page offset of @hva from the slot's userspace mapping
 * is added to the slot's first gfn.  No range check is performed here, so
 * the caller is responsible for passing an @hva that belongs to @slot.
 */
static inline gfn_t
hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
{
	return slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT);
}
static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
gfn_t gfn) gfn_t gfn)
{ {
...@@ -899,5 +910,32 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) ...@@ -899,5 +910,32 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
} }
} }
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
 * Setters for the per-vcpu spin_loop state.  The spin_loop member only
 * exists in struct kvm_vcpu when CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT is
 * set, so all accesses go through these helpers.
 */
/* Store @val into @vcpu's spin_loop.in_spin_loop flag. */
static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
{
vcpu->spin_loop.in_spin_loop = val;
}
/* Store @val into @vcpu's spin_loop.dy_eligible flag. */
static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
{
vcpu->spin_loop.dy_eligible = val;
}
#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
/*
 * Without the config option there is no spin_loop state to maintain:
 * the setters are empty stubs so callers need no #ifdef of their own.
 */
static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
{
}
static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
{
}
/* With no spin_loop state to consult, every vcpu is reported as eligible. */
static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
return true;
}
#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
#endif #endif
...@@ -21,3 +21,6 @@ config KVM_ASYNC_PF ...@@ -21,3 +21,6 @@ config KVM_ASYNC_PF
config HAVE_KVM_MSI config HAVE_KVM_MSI
bool bool
config HAVE_KVM_CPU_RELAX_INTERCEPT
bool
...@@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm); ...@@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm, static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages); gfn_t base_gfn, unsigned long npages);
static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
gfn_t gfn, unsigned long size) unsigned long size)
{ {
gfn_t end_gfn; gfn_t end_gfn;
pfn_t pfn; pfn_t pfn;
pfn = gfn_to_pfn_memslot(kvm, slot, gfn); pfn = gfn_to_pfn_memslot(slot, gfn);
end_gfn = gfn + (size >> PAGE_SHIFT); end_gfn = gfn + (size >> PAGE_SHIFT);
gfn += 1; gfn += 1;
...@@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, ...@@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
return pfn; return pfn;
while (gfn < end_gfn) while (gfn < end_gfn)
gfn_to_pfn_memslot(kvm, slot, gfn++); gfn_to_pfn_memslot(slot, gfn++);
return pfn; return pfn;
} }
...@@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) ...@@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
* Pin all pages we are about to map in memory. This is * Pin all pages we are about to map in memory. This is
* important because we unmap and unpin in 4kb steps later. * important because we unmap and unpin in 4kb steps later.
*/ */
pfn = kvm_pin_pages(kvm, slot, gfn, page_size); pfn = kvm_pin_pages(slot, gfn, page_size);
if (is_error_pfn(pfn)) { if (is_error_pfn(pfn)) {
gfn += 1; gfn += 1;
continue; continue;
......
...@@ -321,11 +321,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, ...@@ -321,11 +321,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
switch (ue->u.irqchip.irqchip) { switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_MASTER: case KVM_IRQCHIP_PIC_MASTER:
e->set = kvm_set_pic_irq; e->set = kvm_set_pic_irq;
max_pin = 16; max_pin = PIC_NUM_PINS;
break; break;
case KVM_IRQCHIP_PIC_SLAVE: case KVM_IRQCHIP_PIC_SLAVE:
e->set = kvm_set_pic_irq; e->set = kvm_set_pic_irq;
max_pin = 16; max_pin = PIC_NUM_PINS;
delta = 8; delta = 8;
break; break;
case KVM_IRQCHIP_IOAPIC: case KVM_IRQCHIP_IOAPIC:
......
...@@ -100,11 +100,14 @@ EXPORT_SYMBOL_GPL(kvm_rebooting); ...@@ -100,11 +100,14 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
static bool largepages_enabled = true; static bool largepages_enabled = true;
struct page *bad_page;
static pfn_t bad_pfn;
static struct page *hwpoison_page; static struct page *hwpoison_page;
static pfn_t hwpoison_pfn; static pfn_t hwpoison_pfn;
struct page *fault_page; static struct page *fault_page;
pfn_t fault_pfn; static pfn_t fault_pfn;
inline int kvm_is_mmio_pfn(pfn_t pfn) inline int kvm_is_mmio_pfn(pfn_t pfn)
{ {
...@@ -236,6 +239,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) ...@@ -236,6 +239,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
} }
vcpu->run = page_address(page); vcpu->run = page_address(page);
kvm_vcpu_set_in_spin_loop(vcpu, false);
kvm_vcpu_set_dy_eligible(vcpu, false);
r = kvm_arch_vcpu_init(vcpu); r = kvm_arch_vcpu_init(vcpu);
if (r < 0) if (r < 0)
goto fail_free_run; goto fail_free_run;
...@@ -332,8 +338,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, ...@@ -332,8 +338,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* count is also read inside the mmu_lock critical section. * count is also read inside the mmu_lock critical section.
*/ */
kvm->mmu_notifier_count++; kvm->mmu_notifier_count++;
for (; start < end; start += PAGE_SIZE) need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
need_tlb_flush |= kvm_unmap_hva(kvm, start);
need_tlb_flush |= kvm->tlbs_dirty; need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */ /* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush) if (need_tlb_flush)
...@@ -950,12 +955,6 @@ int is_hwpoison_pfn(pfn_t pfn) ...@@ -950,12 +955,6 @@ int is_hwpoison_pfn(pfn_t pfn)
} }
EXPORT_SYMBOL_GPL(is_hwpoison_pfn); EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
int is_fault_pfn(pfn_t pfn)
{
return pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_fault_pfn);
int is_noslot_pfn(pfn_t pfn) int is_noslot_pfn(pfn_t pfn)
{ {
return pfn == bad_pfn; return pfn == bad_pfn;
...@@ -1039,11 +1038,12 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) ...@@ -1039,11 +1038,12 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
} }
EXPORT_SYMBOL_GPL(gfn_to_hva); EXPORT_SYMBOL_GPL(gfn_to_hva);
static pfn_t get_fault_pfn(void) pfn_t get_fault_pfn(void)
{ {
get_page(fault_page); get_page(fault_page);
return fault_pfn; return fault_pfn;
} }
EXPORT_SYMBOL_GPL(get_fault_pfn);
int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int write, struct page **page) unsigned long start, int write, struct page **page)
...@@ -1065,8 +1065,8 @@ static inline int check_user_page_hwpoison(unsigned long addr) ...@@ -1065,8 +1065,8 @@ static inline int check_user_page_hwpoison(unsigned long addr)
return rc == -EHWPOISON; return rc == -EHWPOISON;
} }
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
bool *async, bool write_fault, bool *writable) bool write_fault, bool *writable)
{ {
struct page *page[1]; struct page *page[1];
int npages = 0; int npages = 0;
...@@ -1146,9 +1146,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, ...@@ -1146,9 +1146,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
return pfn; return pfn;
} }
pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) pfn_t hva_to_pfn_atomic(unsigned long addr)
{ {
return hva_to_pfn(kvm, addr, true, NULL, true, NULL); return hva_to_pfn(addr, true, NULL, true, NULL);
} }
EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
...@@ -1166,7 +1166,7 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, ...@@ -1166,7 +1166,7 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
return page_to_pfn(bad_page); return page_to_pfn(bad_page);
} }
return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); return hva_to_pfn(addr, atomic, async, write_fault, writable);
} }
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
...@@ -1195,11 +1195,10 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, ...@@ -1195,11 +1195,10 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
} }
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm, pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
struct kvm_memory_slot *slot, gfn_t gfn)
{ {
unsigned long addr = gfn_to_hva_memslot(slot, gfn); unsigned long addr = gfn_to_hva_memslot(slot, gfn);
return hva_to_pfn(kvm, addr, false, NULL, true, NULL); return hva_to_pfn(addr, false, NULL, true, NULL);
} }
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
...@@ -1580,6 +1579,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target) ...@@ -1580,6 +1579,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
} }
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
* Helper that checks whether a VCPU is eligible for directed yield.
* Most eligible candidate to yield is decided by following heuristics:
*
* (a) VCPU which has not done pl-exit or cpu relax intercepted recently
* (preempted lock holder), indicated by @in_spin_loop.
* Set at the beginning and cleared at the end of interception/PLE handler.
*
* (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
* chance last time (mostly it has become eligible now since we have probably
* yielded to lockholder in last iteration. This is done by toggling
* @dy_eligible each time a VCPU checked for eligibility.)
*
* Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
* to preempted lock-holder could result in wrong VCPU selection and CPU
* burning. Giving priority for a potential lock-holder increases lock
* progress.
*
* Since algorithm is based on heuristics, accessing another VCPU data without
* locking does not harm. It may result in trying to yield to same VCPU, fail
* and continue with next VCPU and so on.
*/
bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
	bool was_eligible;

	/* A vcpu that is not in a spin loop is always a yield candidate. */
	if (!vcpu->spin_loop.in_spin_loop)
		return true;

	/*
	 * A spinning vcpu is reported with its current dy_eligible value,
	 * and the flag is flipped so the next check gives the opposite
	 * answer.
	 */
	was_eligible = vcpu->spin_loop.dy_eligible;
	kvm_vcpu_set_dy_eligible(vcpu, !was_eligible);

	return was_eligible;
}
#endif
void kvm_vcpu_on_spin(struct kvm_vcpu *me) void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{ {
struct kvm *kvm = me->kvm; struct kvm *kvm = me->kvm;
...@@ -1589,6 +1625,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) ...@@ -1589,6 +1625,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
int pass; int pass;
int i; int i;
kvm_vcpu_set_in_spin_loop(me, true);
/* /*
* We boost the priority of a VCPU that is runnable but not * We boost the priority of a VCPU that is runnable but not
* currently running, because it got preempted by something * currently running, because it got preempted by something
...@@ -1607,6 +1644,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) ...@@ -1607,6 +1644,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue; continue;
if (waitqueue_active(&vcpu->wq)) if (waitqueue_active(&vcpu->wq))
continue; continue;
if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue;
if (kvm_vcpu_yield_to(vcpu)) { if (kvm_vcpu_yield_to(vcpu)) {
kvm->last_boosted_vcpu = i; kvm->last_boosted_vcpu = i;
yielded = 1; yielded = 1;
...@@ -1614,6 +1653,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) ...@@ -1614,6 +1653,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
} }
} }
} }
kvm_vcpu_set_in_spin_loop(me, false);
/* Ensure vcpu is not eligible during next spinloop */
kvm_vcpu_set_dy_eligible(me, false);
} }
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
...@@ -2697,9 +2740,6 @@ static struct syscore_ops kvm_syscore_ops = { ...@@ -2697,9 +2740,6 @@ static struct syscore_ops kvm_syscore_ops = {
.resume = kvm_resume, .resume = kvm_resume,
}; };
struct page *bad_page;
pfn_t bad_pfn;
static inline static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment