Commit 1609d760 authored by Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "The main change here is a revert of reverts. We recently simplified
  some code that was thought unnecessary; however, since then KVM has
  grown quite a few cond_resched()s and for that reason the simplified
   code is prone to livelocks---one CPU tries to empty a list of guest
  page tables while the others keep adding to them. This adds back the
  generation-based zapping of guest page tables, which was not
  unnecessary after all.

  On top of this, there is a fix for a kernel memory leak and a couple
  of s390 fixlets as well"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86/mmu: Reintroduce fast invalidate/zap for flushing memslot
  KVM: x86: work around leak of uninitialized stack contents
  KVM: nVMX: handle page fault in vmread
  KVM: s390: Do not leak kernel stack data in the KVM_S390_INTERRUPT ioctl
  KVM: s390: kvm_s390_vm_start_migration: check dirty_bitmap before using it as target for memset()
parents 1f9c632c a9c20bb0
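
For illustration, here is a minimal userspace sketch of the generation-based zapping pattern the message above describes; the names (shadow_page, page_create, zap_all_fast) are invented for this sketch and are not the KVM implementation, which lives in the kvm_zap_obsolete_pages()/kvm_mmu_zap_all_fast() hunks further down. Each page is stamped with the generation it was created under; invalidation bumps the generation, and the zap walk frees only pages whose stamp is stale, so pages added concurrently by other CPUs are never revisited.

/* generation_zap_sketch.c -- illustrative only, not kernel code */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct shadow_page {
        unsigned long gen;        /* generation stamped at creation time */
        struct shadow_page *next; /* newest entries sit at the head (FIFO) */
};

static struct shadow_page *active_pages; /* head = most recently added */
static unsigned long current_gen;

static struct shadow_page *page_create(void)
{
        struct shadow_page *sp = calloc(1, sizeof(*sp));

        if (!sp)
                abort();
        sp->gen = current_gen;   /* stamp with the current generation */
        sp->next = active_pages; /* add at the head, preserving FIFO order */
        active_pages = sp;
        return sp;
}

static bool page_is_obsolete(const struct shadow_page *sp)
{
        return sp->gen != current_gen;
}

/* Bump the generation, then free only pages stamped with an older one. */
static void zap_all_fast(void)
{
        struct shadow_page **pp = &active_pages;

        current_gen++;
        while (*pp) {
                struct shadow_page *sp = *pp;

                if (page_is_obsolete(sp)) {
                        *pp = sp->next;
                        free(sp);
                } else {
                        /* page created after the bump: leave it alone */
                        pp = &sp->next;
                }
        }
}

int main(void)
{
        for (int i = 0; i < 4; i++)
                page_create();   /* four pages stamped with generation 0 */

        zap_all_fast();          /* bumps to generation 1, frees all four */
        page_create();           /* a generation-1 page, untouched by the zap above */

        printf("current generation: %lu\n", current_gen);
        return 0;
}

In the real code below, the walk also drops mmu_lock via cond_resched_lock() and restarts, which is safe precisely because already-invalid and newly-created pages are filtered out by the generation check.
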
@@ -1961,6 +1961,16 @@ int s390int_to_s390irq(struct kvm_s390_interrupt *s390int,
 	case KVM_S390_MCHK:
 		irq->u.mchk.mcic = s390int->parm64;
 		break;
+	case KVM_S390_INT_PFAULT_INIT:
+		irq->u.ext.ext_params = s390int->parm;
+		irq->u.ext.ext_params2 = s390int->parm64;
+		break;
+	case KVM_S390_RESTART:
+	case KVM_S390_INT_CLOCK_COMP:
+	case KVM_S390_INT_CPU_TIMER:
+		break;
+	default:
+		return -EINVAL;
 	}
 	return 0;
 }

@@ -1018,6 +1018,8 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
 	/* mark all the pages in active slots as dirty */
 	for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
 		ms = slots->memslots + slotnr;
+		if (!ms->dirty_bitmap)
+			return -EINVAL;
 		/*
 		 * The second half of the bitmap is only used on x86,
 		 * and would be wasted otherwise, so we put it to good
@@ -4323,7 +4325,7 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp,
 	}
 	case KVM_S390_INTERRUPT: {
 		struct kvm_s390_interrupt s390int;
-		struct kvm_s390_irq s390irq;
+		struct kvm_s390_irq s390irq = {};

 		if (copy_from_user(&s390int, argp, sizeof(s390int)))
 			return -EFAULT;

@@ -335,6 +335,7 @@ struct kvm_mmu_page {
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+	unsigned long mmu_valid_gen;
 	DECLARE_BITMAP(unsync_child_bitmap, 512);

 #ifdef CONFIG_X86_32
@@ -856,6 +857,7 @@ struct kvm_arch {
 	unsigned long n_requested_mmu_pages;
 	unsigned long n_max_mmu_pages;
 	unsigned int indirect_shadow_pages;
+	unsigned long mmu_valid_gen;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.

@@ -2095,6 +2095,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
 	if (!direct)
 		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+	/*
+	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+	 * depends on valid pages being added to the head of the list. See
+	 * comments in kvm_zap_obsolete_pages().
+	 */
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
 	return sp;
@@ -2244,7 +2250,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 #define for_each_valid_sp(_kvm, _sp, _gfn)				\
 	hlist_for_each_entry(_sp,					\
 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-		if ((_sp)->role.invalid) {				\
+		if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
 		} else

 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)		\
@@ -2301,6 +2307,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
 static void mmu_audit_disable(void) { }
 #endif

+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			   struct list_head *invalid_list)
 {
@@ -2525,6 +2536,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
 			flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
 	}
+	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	clear_page(sp->spt);
 	trace_kvm_mmu_get_page(sp, true);

@@ -4233,6 +4245,13 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 			return false;

 		if (cached_root_available(vcpu, new_cr3, new_role)) {
+			/*
+			 * It is possible that the cached previous root page is
+			 * obsolete because of a change in the MMU generation
+			 * number. However, changing the generation number is
+			 * accompanied by KVM_REQ_MMU_RELOAD, which will free
+			 * the root set here and allocate a new one.
+			 */
 			kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
 			if (!skip_tlb_flush) {
 				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -5649,11 +5668,89 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 	return alloc_mmu_pages(vcpu);
 }

+static void kvm_zap_obsolete_pages(struct kvm *kvm)
+{
+	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
+	int ign;
+
+restart:
+	list_for_each_entry_safe_reverse(sp, node,
+	      &kvm->arch.active_mmu_pages, link) {
+		/*
+		 * No obsolete valid page exists before a newly created page
+		 * since active_mmu_pages is a FIFO list.
+		 */
+		if (!is_obsolete_sp(kvm, sp))
+			break;
+
+		/*
+		 * Do not repeatedly zap a root page to avoid unnecessary
+		 * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
+		 * progress:
+		 *    vcpu 0                        vcpu 1
+		 *                         call vcpu_enter_guest():
+		 *                            1): handle KVM_REQ_MMU_RELOAD
+		 *                                and require mmu-lock to
+		 *                                load mmu
+		 * repeat:
+		 *    1): zap root page and
+		 *        send KVM_REQ_MMU_RELOAD
+		 *
+		 *    2): if (cond_resched_lock(mmu-lock))
+		 *
+		 *                            2): hold mmu-lock and load mmu
+		 *
+		 *                            3): see KVM_REQ_MMU_RELOAD bit
+		 *                                on vcpu->requests is set
+		 *                                then return 1 to call
+		 *                                vcpu_enter_guest() again.
+		 *            goto repeat;
+		 *
+		 * Since we are reversely walking the list and the invalid
+		 * list will be moved to the head, skip the invalid page
+		 * can help us to avoid the infinity list walking.
+		 */
+		if (sp->role.invalid)
+			continue;
+
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+			kvm_mmu_commit_zap_page(kvm, &invalid_list);
+			cond_resched_lock(&kvm->mmu_lock);
+			goto restart;
+		}
+
+		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+			goto restart;
+	}
+
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+}
+
+/*
+ * Fast invalidate all shadow pages and use lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when memslot is being deleted or VM is being
+ * destroyed, in these cases, we should ensure that KVM MMU does
+ * not use any resource of the being-deleted slot or all slots
+ * after calling the function.
+ */
+static void kvm_mmu_zap_all_fast(struct kvm *kvm)
+{
+	spin_lock(&kvm->mmu_lock);
+	kvm->arch.mmu_valid_gen++;
+	kvm_zap_obsolete_pages(kvm);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 			struct kvm_memory_slot *slot,
 			struct kvm_page_track_notifier_node *node)
 {
-	kvm_mmu_zap_all(kvm);
+	kvm_mmu_zap_all_fast(kvm);
 }

 void kvm_mmu_init_vm(struct kvm *kvm)

@@ -4540,6 +4540,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 	int len;
 	gva_t gva = 0;
 	struct vmcs12 *vmcs12;
+	struct x86_exception e;
 	short offset;

 	if (!nested_vmx_check_permission(vcpu))

@@ -4588,7 +4589,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 					vmx_instruction_info, true, len, &gva))
 			return 1;
 		/* _system ok, nested_vmx_check_permission has verified cpl=0 */
-		kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL);
+		if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
+			kvm_inject_page_fault(vcpu, &e);
 	}

 	return nested_vmx_succeed(vcpu);

@@ -5312,6 +5312,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 	/* kvm_write_guest_virt_system can pull in tons of pages. */
 	vcpu->arch.l1tf_flush_l1d = true;

+	/*
+	 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+	 * is returned, but our callers are not ready for that and they blindly
+	 * call kvm_inject_page_fault. Ensure that they at least do not leak
+	 * uninitialized kernel stack memory into cr2 and error code.
+	 */
+	memset(exception, 0, sizeof(*exception));
 	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
 					   PFERR_WRITE_MASK, exception);
 }