Commit af585b92 authored by Gleb Natapov's avatar Gleb Natapov Committed by Avi Kivity

KVM: Halt vcpu if page it tries to access is swapped out

If a guest accesses swapped out memory do not swap it in from vcpu thread
context. Schedule work to do swapping and put vcpu into halted state
instead.

Interrupts will still be delivered to the guest and if interrupt will
cause reschedule guest will continue to run another task.

[avi: remove call to get_user_pages_noio(), nacked by Linus; this
      makes everything synchrnous again]
Acked-by: default avatarRik van Riel <riel@redhat.com>
Signed-off-by: default avatarGleb Natapov <gleb@redhat.com>
Signed-off-by: default avatarMarcelo Tosatti <mtosatti@redhat.com>
parent 010c520e
...@@ -83,11 +83,14 @@ ...@@ -83,11 +83,14 @@
#define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8 #define KVM_NR_VAR_MTRR 8
#define ASYNC_PF_PER_VCPU 64
extern spinlock_t kvm_lock; extern spinlock_t kvm_lock;
extern struct list_head vm_list; extern struct list_head vm_list;
struct kvm_vcpu; struct kvm_vcpu;
struct kvm; struct kvm;
struct kvm_async_pf;
enum kvm_reg { enum kvm_reg {
VCPU_REGS_RAX = 0, VCPU_REGS_RAX = 0,
...@@ -412,6 +415,11 @@ struct kvm_vcpu_arch { ...@@ -412,6 +415,11 @@ struct kvm_vcpu_arch {
u64 hv_vapic; u64 hv_vapic;
cpumask_var_t wbinvd_dirty_mask; cpumask_var_t wbinvd_dirty_mask;
struct {
bool halted;
gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
} apf;
}; };
struct kvm_arch { struct kvm_arch {
...@@ -585,6 +593,10 @@ struct kvm_x86_ops { ...@@ -585,6 +593,10 @@ struct kvm_x86_ops {
const struct trace_print_flags *exit_reasons_str; const struct trace_print_flags *exit_reasons_str;
}; };
struct kvm_arch_async_pf {
gfn_t gfn;
};
extern struct kvm_x86_ops *kvm_x86_ops; extern struct kvm_x86_ops *kvm_x86_ops;
int kvm_mmu_module_init(void); int kvm_mmu_module_init(void);
...@@ -799,4 +811,10 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); ...@@ -799,4 +811,10 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
#endif /* _ASM_X86_KVM_HOST_H */ #endif /* _ASM_X86_KVM_HOST_H */
...@@ -28,6 +28,7 @@ config KVM ...@@ -28,6 +28,7 @@ config KVM
select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQCHIP
select HAVE_KVM_EVENTFD select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE select KVM_APIC_ARCHITECTURE
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER select USER_RETURN_NOTIFIER
select KVM_MMIO select KVM_MMIO
---help--- ---help---
......
...@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ ...@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
coalesced_mmio.o irq_comm.o eventfd.o \ coalesced_mmio.o irq_comm.o eventfd.o \
assigned-dev.o) assigned-dev.o)
kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o timer.o i8254.o timer.o
......
...@@ -18,9 +18,11 @@ ...@@ -18,9 +18,11 @@
* *
*/ */
#include "irq.h"
#include "mmu.h" #include "mmu.h"
#include "x86.h" #include "x86.h"
#include "kvm_cache_regs.h" #include "kvm_cache_regs.h"
#include "x86.h"
#include <linux/kvm_host.h> #include <linux/kvm_host.h>
#include <linux/types.h> #include <linux/types.h>
...@@ -2587,6 +2589,50 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, ...@@ -2587,6 +2589,50 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
error_code & PFERR_WRITE_MASK, gfn); error_code & PFERR_WRITE_MASK, gfn);
} }
int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
{
struct kvm_arch_async_pf arch;
arch.gfn = gfn;
return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
{
if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
kvm_event_needs_reinjection(vcpu)))
return false;
return kvm_x86_ops->interrupt_allowed(vcpu);
}
static bool try_async_pf(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
pfn_t *pfn)
{
bool async;
*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async);
if (!async)
return false; /* *pfn has correct page already */
put_page(pfn_to_page(*pfn));
if (can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(async, *pfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(gva, gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
return true;
} else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
return true;
}
*pfn = gfn_to_pfn(vcpu->kvm, gfn);
return false;
}
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
u32 error_code) u32 error_code)
{ {
...@@ -2609,7 +2655,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, ...@@ -2609,7 +2655,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
mmu_seq = vcpu->kvm->mmu_notifier_seq; mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb(); smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
if (try_async_pf(vcpu, gfn, gpa, &pfn))
return 0;
/* mmio */
if (is_error_pfn(pfn)) if (is_error_pfn(pfn))
return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock); spin_lock(&vcpu->kvm->mmu_lock);
......
...@@ -568,7 +568,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, ...@@ -568,7 +568,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_seq = vcpu->kvm->mmu_notifier_seq; mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb(); smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
if (try_async_pf(vcpu, walker.gfn, addr, &pfn))
return 0;
/* mmio */ /* mmio */
if (is_error_pfn(pfn)) if (is_error_pfn(pfn))
......
...@@ -43,6 +43,7 @@ ...@@ -43,6 +43,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/perf_event.h> #include <linux/perf_event.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/hash.h>
#include <trace/events/kvm.h> #include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
...@@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { ...@@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
u64 __read_mostly host_xcr0; u64 __read_mostly host_xcr0;
static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
int i;
for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
vcpu->arch.apf.gfns[i] = ~0;
}
static void kvm_on_user_return(struct user_return_notifier *urn) static void kvm_on_user_return(struct user_return_notifier *urn)
{ {
unsigned slot; unsigned slot;
...@@ -5115,6 +5123,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ...@@ -5115,6 +5123,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->fpu_active = 0; vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu); kvm_x86_ops->fpu_deactivate(vcpu);
} }
if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
/* Page is swapped out. Do synthetic halt */
vcpu->arch.apf.halted = true;
r = 1;
goto out;
}
} }
r = kvm_mmu_reload(vcpu); r = kvm_mmu_reload(vcpu);
...@@ -5243,7 +5257,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) ...@@ -5243,7 +5257,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
r = 1; r = 1;
while (r > 0) { while (r > 0) {
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
r = vcpu_enter_guest(vcpu); r = vcpu_enter_guest(vcpu);
else { else {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
...@@ -5256,6 +5271,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) ...@@ -5256,6 +5271,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.mp_state = vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE; KVM_MP_STATE_RUNNABLE;
case KVM_MP_STATE_RUNNABLE: case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break; break;
case KVM_MP_STATE_SIPI_RECEIVED: case KVM_MP_STATE_SIPI_RECEIVED:
default: default:
...@@ -5277,6 +5293,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) ...@@ -5277,6 +5293,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_INTR; vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits; ++vcpu->stat.request_irq_exits;
} }
kvm_check_async_pf_completion(vcpu);
if (signal_pending(current)) { if (signal_pending(current)) {
r = -EINTR; r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR; vcpu->run->exit_reason = KVM_EXIT_INTR;
...@@ -5792,6 +5811,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) ...@@ -5792,6 +5811,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
return kvm_x86_ops->vcpu_reset(vcpu); return kvm_x86_ops->vcpu_reset(vcpu);
} }
...@@ -5880,6 +5903,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) ...@@ -5880,6 +5903,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
goto fail_free_mce_banks; goto fail_free_mce_banks;
kvm_async_pf_hash_reset(vcpu);
return 0; return 0;
fail_free_mce_banks: fail_free_mce_banks:
kfree(vcpu->arch.mce_banks); kfree(vcpu->arch.mce_banks);
...@@ -5938,8 +5963,10 @@ static void kvm_free_vcpus(struct kvm *kvm) ...@@ -5938,8 +5963,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
/* /*
* Unpin any mmu pages first. * Unpin any mmu pages first.
*/ */
kvm_for_each_vcpu(i, vcpu, kvm) kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_unload_vcpu_mmu(vcpu); kvm_unload_vcpu_mmu(vcpu);
}
kvm_for_each_vcpu(i, vcpu, kvm) kvm_for_each_vcpu(i, vcpu, kvm)
kvm_arch_vcpu_free(vcpu); kvm_arch_vcpu_free(vcpu);
...@@ -6050,7 +6077,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm) ...@@ -6050,7 +6077,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{ {
return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
|| !list_empty_careful(&vcpu->async_pf.done)
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
|| vcpu->arch.nmi_pending || || vcpu->arch.nmi_pending ||
(kvm_arch_interrupt_allowed(vcpu) && (kvm_arch_interrupt_allowed(vcpu) &&
...@@ -6109,6 +6138,83 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) ...@@ -6109,6 +6138,83 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
} }
EXPORT_SYMBOL_GPL(kvm_set_rflags); EXPORT_SYMBOL_GPL(kvm_set_rflags);
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
{
return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
}
static inline u32 kvm_async_pf_next_probe(u32 key)
{
return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
}
static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
u32 key = kvm_async_pf_hash_fn(gfn);
while (vcpu->arch.apf.gfns[key] != ~0)
key = kvm_async_pf_next_probe(key);
vcpu->arch.apf.gfns[key] = gfn;
}
static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
int i;
u32 key = kvm_async_pf_hash_fn(gfn);
for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
(vcpu->arch.apf.gfns[key] != gfn ||
vcpu->arch.apf.gfns[key] == ~0); i++)
key = kvm_async_pf_next_probe(key);
return key;
}
bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
}
static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
u32 i, j, k;
i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
while (true) {
vcpu->arch.apf.gfns[i] = ~0;
do {
j = kvm_async_pf_next_probe(j);
if (vcpu->arch.apf.gfns[j] == ~0)
return;
k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
/*
* k lies cyclically in ]i,j]
* | i.k.j |
* |....j i.k.| or |.k..j i...|
*/
} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
i = j;
}
}
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
trace_kvm_async_pf_not_present(work->gva);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
}
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
trace_kvm_async_pf_ready(work->gva);
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
}
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
......
...@@ -40,6 +40,7 @@ ...@@ -40,6 +40,7 @@
#define KVM_REQ_KICK 9 #define KVM_REQ_KICK 9
#define KVM_REQ_DEACTIVATE_FPU 10 #define KVM_REQ_DEACTIVATE_FPU 10
#define KVM_REQ_EVENT 11 #define KVM_REQ_EVENT 11
#define KVM_REQ_APF_HALT 12
#define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_USERSPACE_IRQ_SOURCE_ID 0
...@@ -74,6 +75,26 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, ...@@ -74,6 +75,26 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev); struct kvm_io_device *dev);
#ifdef CONFIG_KVM_ASYNC_PF
struct kvm_async_pf {
struct work_struct work;
struct list_head link;
struct list_head queue;
struct kvm_vcpu *vcpu;
struct mm_struct *mm;
gva_t gva;
unsigned long addr;
struct kvm_arch_async_pf arch;
struct page *page;
bool done;
};
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
struct kvm_arch_async_pf *arch);
#endif
struct kvm_vcpu { struct kvm_vcpu {
struct kvm *kvm; struct kvm *kvm;
#ifdef CONFIG_PREEMPT_NOTIFIERS #ifdef CONFIG_PREEMPT_NOTIFIERS
...@@ -104,6 +125,15 @@ struct kvm_vcpu { ...@@ -104,6 +125,15 @@ struct kvm_vcpu {
gpa_t mmio_phys_addr; gpa_t mmio_phys_addr;
#endif #endif
#ifdef CONFIG_KVM_ASYNC_PF
struct {
u32 queued;
struct list_head queue;
struct list_head done;
spinlock_t lock;
} async_pf;
#endif
struct kvm_vcpu_arch arch; struct kvm_vcpu_arch arch;
}; };
...@@ -302,6 +332,7 @@ void kvm_set_page_accessed(struct page *page); ...@@ -302,6 +332,7 @@ void kvm_set_page_accessed(struct page *page);
pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm, pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn); struct kvm_memory_slot *slot, gfn_t gfn);
......
...@@ -185,6 +185,96 @@ TRACE_EVENT(kvm_age_page, ...@@ -185,6 +185,96 @@ TRACE_EVENT(kvm_age_page,
__entry->referenced ? "YOUNG" : "OLD") __entry->referenced ? "YOUNG" : "OLD")
); );
#ifdef CONFIG_KVM_ASYNC_PF
TRACE_EVENT(
kvm_try_async_get_page,
TP_PROTO(bool async, u64 pfn),
TP_ARGS(async, pfn),
TP_STRUCT__entry(
__field(__u64, pfn)
),
TP_fast_assign(
__entry->pfn = (!async) ? pfn : (u64)-1;
),
TP_printk("pfn %#llx", __entry->pfn)
);
TRACE_EVENT(
kvm_async_pf_not_present,
TP_PROTO(u64 gva),
TP_ARGS(gva),
TP_STRUCT__entry(
__field(__u64, gva)
),
TP_fast_assign(
__entry->gva = gva;
),
TP_printk("gva %#llx not present", __entry->gva)
);
TRACE_EVENT(
kvm_async_pf_ready,
TP_PROTO(u64 gva),
TP_ARGS(gva),
TP_STRUCT__entry(
__field(__u64, gva)
),
TP_fast_assign(
__entry->gva = gva;
),
TP_printk("gva %#llx ready", __entry->gva)
);
TRACE_EVENT(
kvm_async_pf_completed,
TP_PROTO(unsigned long address, struct page *page, u64 gva),
TP_ARGS(address, page, gva),
TP_STRUCT__entry(
__field(unsigned long, address)
__field(pfn_t, pfn)
__field(u64, gva)
),
TP_fast_assign(
__entry->address = address;
__entry->pfn = page ? page_to_pfn(page) : 0;
__entry->gva = gva;
),
TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva,
__entry->address, __entry->pfn)
);
TRACE_EVENT(
kvm_async_pf_doublefault,
TP_PROTO(u64 gva, u64 gfn),
TP_ARGS(gva, gfn),
TP_STRUCT__entry(
__field(u64, gva)
__field(u64, gfn)
),
TP_fast_assign(
__entry->gva = gva;
__entry->gfn = gfn;
),
TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
);
#endif
#endif /* _TRACE_KVM_MAIN_H */ #endif /* _TRACE_KVM_MAIN_H */
/* This part must be outside protection */ /* This part must be outside protection */
......
...@@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE ...@@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE
config KVM_MMIO config KVM_MMIO
bool bool
config KVM_ASYNC_PF
bool
/*
* kvm asynchronous fault support
*
* Copyright 2010 Red Hat, Inc.
*
* Author:
* Gleb Natapov <gleb@redhat.com>
*
* This file is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <linux/kvm_host.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mmu_context.h>
#include "async_pf.h"
#include <trace/events/kvm.h>
static struct kmem_cache *async_pf_cache;
int kvm_async_pf_init(void)
{
async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);
if (!async_pf_cache)
return -ENOMEM;
return 0;
}
void kvm_async_pf_deinit(void)
{
if (async_pf_cache)
kmem_cache_destroy(async_pf_cache);
async_pf_cache = NULL;
}
void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
{
INIT_LIST_HEAD(&vcpu->async_pf.done);
INIT_LIST_HEAD(&vcpu->async_pf.queue);
spin_lock_init(&vcpu->async_pf.lock);
}
static void async_pf_execute(struct work_struct *work)
{
struct page *page = NULL;
struct kvm_async_pf *apf =
container_of(work, struct kvm_async_pf, work);
struct mm_struct *mm = apf->mm;
struct kvm_vcpu *vcpu = apf->vcpu;
unsigned long addr = apf->addr;
gva_t gva = apf->gva;
might_sleep();
use_mm(mm);
down_read(&mm->mmap_sem);
get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL);
up_read(&mm->mmap_sem);
unuse_mm(mm);
spin_lock(&vcpu->async_pf.lock);
list_add_tail(&apf->link, &vcpu->async_pf.done);
apf->page = page;
apf->done = true;
spin_unlock(&vcpu->async_pf.lock);
/*
* apf may be freed by kvm_check_async_pf_completion() after
* this point
*/
trace_kvm_async_pf_completed(addr, page, gva);
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
mmdrop(mm);
kvm_put_kvm(vcpu->kvm);
}
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
{
/* cancel outstanding work queue item */
while (!list_empty(&vcpu->async_pf.queue)) {
struct kvm_async_pf *work =
list_entry(vcpu->async_pf.queue.next,
typeof(*work), queue);
cancel_work_sync(&work->work);
list_del(&work->queue);
if (!work->done) /* work was canceled */
kmem_cache_free(async_pf_cache, work);
}
spin_lock(&vcpu->async_pf.lock);
while (!list_empty(&vcpu->async_pf.done)) {
struct kvm_async_pf *work =
list_entry(vcpu->async_pf.done.next,
typeof(*work), link);
list_del(&work->link);
if (work->page)
put_page(work->page);
kmem_cache_free(async_pf_cache, work);
}
spin_unlock(&vcpu->async_pf.lock);
vcpu->async_pf.queued = 0;
}
void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
{
struct kvm_async_pf *work;
if (list_empty_careful(&vcpu->async_pf.done))
return;
spin_lock(&vcpu->async_pf.lock);
work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link);
list_del(&work->link);
spin_unlock(&vcpu->async_pf.lock);
kvm_arch_async_page_present(vcpu, work);
list_del(&work->queue);
vcpu->async_pf.queued--;
if (work->page)
put_page(work->page);
kmem_cache_free(async_pf_cache, work);
}
int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
struct kvm_arch_async_pf *arch)
{
struct kvm_async_pf *work;
if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
return 0;
/* setup delayed work */
/*
* do alloc nowait since if we are going to sleep anyway we
* may as well sleep faulting in page
*/
work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
if (!work)
return 0;
work->page = NULL;
work->done = false;
work->vcpu = vcpu;
work->gva = gva;
work->addr = gfn_to_hva(vcpu->kvm, gfn);
work->arch = *arch;
work->mm = current->mm;
atomic_inc(&work->mm->mm_count);
kvm_get_kvm(work->vcpu->kvm);
/* this can't really happen otherwise gfn_to_pfn_async
would succeed */
if (unlikely(kvm_is_error_hva(work->addr)))
goto retry_sync;
INIT_WORK(&work->work, async_pf_execute);
if (!schedule_work(&work->work))
goto retry_sync;
list_add_tail(&work->queue, &vcpu->async_pf.queue);
vcpu->async_pf.queued++;
kvm_arch_async_page_not_present(vcpu, work);
return 1;
retry_sync:
kvm_put_kvm(work->vcpu->kvm);
mmdrop(work->mm);
kmem_cache_free(async_pf_cache, work);
return 0;
}
/*
* kvm asynchronous fault support
*
* Copyright 2010 Red Hat, Inc.
*
* Author:
* Gleb Natapov <gleb@redhat.com>
*
* This file is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#ifndef __KVM_ASYNC_PF_H__
#define __KVM_ASYNC_PF_H__
#ifdef CONFIG_KVM_ASYNC_PF
int kvm_async_pf_init(void);
void kvm_async_pf_deinit(void);
void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu);
#else
#define kvm_async_pf_init() (0)
#define kvm_async_pf_deinit() do{}while(0)
#define kvm_async_pf_vcpu_init(C) do{}while(0)
#endif
#endif
...@@ -55,6 +55,7 @@ ...@@ -55,6 +55,7 @@
#include <asm-generic/bitops/le.h> #include <asm-generic/bitops/le.h>
#include "coalesced_mmio.h" #include "coalesced_mmio.h"
#include "async_pf.h"
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include <trace/events/kvm.h> #include <trace/events/kvm.h>
...@@ -186,6 +187,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) ...@@ -186,6 +187,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->kvm = kvm; vcpu->kvm = kvm;
vcpu->vcpu_id = id; vcpu->vcpu_id = id;
init_waitqueue_head(&vcpu->wq); init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
page = alloc_page(GFP_KERNEL | __GFP_ZERO); page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) { if (!page) {
...@@ -946,15 +948,20 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) ...@@ -946,15 +948,20 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
} }
EXPORT_SYMBOL_GPL(gfn_to_hva); EXPORT_SYMBOL_GPL(gfn_to_hva);
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
bool *async)
{ {
struct page *page[1]; struct page *page[1];
int npages; int npages = 0;
pfn_t pfn; pfn_t pfn;
if (atomic) /* we can do it either atomically or asynchronously, not both */
BUG_ON(atomic && async);
if (atomic || async)
npages = __get_user_pages_fast(addr, 1, 1, page); npages = __get_user_pages_fast(addr, 1, 1, page);
else {
if (unlikely(npages != 1) && !atomic) {
might_sleep(); might_sleep();
npages = get_user_pages_fast(addr, 1, 1, page); npages = get_user_pages_fast(addr, 1, 1, page);
} }
...@@ -976,6 +983,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) ...@@ -976,6 +983,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
if (vma == NULL || addr < vma->vm_start || if (vma == NULL || addr < vma->vm_start ||
!(vma->vm_flags & VM_PFNMAP)) { !(vma->vm_flags & VM_PFNMAP)) {
if (async && !(vma->vm_flags & VM_PFNMAP) &&
(vma->vm_flags & VM_WRITE))
*async = true;
up_read(&current->mm->mmap_sem); up_read(&current->mm->mmap_sem);
return_fault_page: return_fault_page:
get_page(fault_page); get_page(fault_page);
...@@ -993,32 +1003,41 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic) ...@@ -993,32 +1003,41 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
{ {
return hva_to_pfn(kvm, addr, true); return hva_to_pfn(kvm, addr, true, NULL);
} }
EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic) static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async)
{ {
unsigned long addr; unsigned long addr;
if (async)
*async = false;
addr = gfn_to_hva(kvm, gfn); addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr)) { if (kvm_is_error_hva(addr)) {
get_page(bad_page); get_page(bad_page);
return page_to_pfn(bad_page); return page_to_pfn(bad_page);
} }
return hva_to_pfn(kvm, addr, atomic); return hva_to_pfn(kvm, addr, atomic, async);
} }
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
{ {
return __gfn_to_pfn(kvm, gfn, true); return __gfn_to_pfn(kvm, gfn, true, NULL);
} }
EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async)
{
return __gfn_to_pfn(kvm, gfn, false, async);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{ {
return __gfn_to_pfn(kvm, gfn, false); return __gfn_to_pfn(kvm, gfn, false, NULL);
} }
EXPORT_SYMBOL_GPL(gfn_to_pfn); EXPORT_SYMBOL_GPL(gfn_to_pfn);
...@@ -1026,7 +1045,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm, ...@@ -1026,7 +1045,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn) struct kvm_memory_slot *slot, gfn_t gfn)
{ {
unsigned long addr = gfn_to_hva_memslot(slot, gfn); unsigned long addr = gfn_to_hva_memslot(slot, gfn);
return hva_to_pfn(kvm, addr, false); return hva_to_pfn(kvm, addr, false, NULL);
} }
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
...@@ -2336,6 +2355,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, ...@@ -2336,6 +2355,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
goto out_free_5; goto out_free_5;
} }
r = kvm_async_pf_init();
if (r)
goto out_free;
kvm_chardev_ops.owner = module; kvm_chardev_ops.owner = module;
kvm_vm_fops.owner = module; kvm_vm_fops.owner = module;
kvm_vcpu_fops.owner = module; kvm_vcpu_fops.owner = module;
...@@ -2343,7 +2366,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, ...@@ -2343,7 +2366,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
r = misc_register(&kvm_dev); r = misc_register(&kvm_dev);
if (r) { if (r) {
printk(KERN_ERR "kvm: misc device register failed\n"); printk(KERN_ERR "kvm: misc device register failed\n");
goto out_free; goto out_unreg;
} }
kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_in = kvm_sched_in;
...@@ -2353,6 +2376,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, ...@@ -2353,6 +2376,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
return 0; return 0;
out_unreg:
kvm_async_pf_deinit();
out_free: out_free:
kmem_cache_destroy(kvm_vcpu_cache); kmem_cache_destroy(kvm_vcpu_cache);
out_free_5: out_free_5:
...@@ -2385,6 +2410,7 @@ void kvm_exit(void) ...@@ -2385,6 +2410,7 @@ void kvm_exit(void)
kvm_exit_debug(); kvm_exit_debug();
misc_deregister(&kvm_dev); misc_deregister(&kvm_dev);
kmem_cache_destroy(kvm_vcpu_cache); kmem_cache_destroy(kvm_vcpu_cache);
kvm_async_pf_deinit();
sysdev_unregister(&kvm_sysdev); sysdev_unregister(&kvm_sysdev);
sysdev_class_unregister(&kvm_sysdev_class); sysdev_class_unregister(&kvm_sysdev_class);
unregister_reboot_notifier(&kvm_reboot_notifier); unregister_reboot_notifier(&kvm_reboot_notifier);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment