Commit a54d8066, authored by Maciej S. Szmigiero, committed by Paolo Bonzini

KVM: Keep memslots in tree-based structures instead of array-based ones

The current memslot code uses a (reverse gfn-ordered) memslot array for
keeping track of memslots.

Because the memslot array that is currently in use cannot be modified,
every memslot management operation (create, delete, move, change flags)
has to make a copy of the whole array so it has a scratch copy to work on.

Strictly speaking, however, it is only necessary to make a copy of the
memslot that is being modified; copying all the memslots currently present
is just a limitation of the array-based memslot implementation.

Two memslot sets, however, are still needed so the VM continues to run
on the currently active set while the requested operation is being
performed on the second, currently inactive one.

In order to have two memslot sets, but only one copy of the actual memslots,
it is necessary to split out the memslot data from the memslot sets.

The memslots themselves should also be kept independent of each other
so they can be individually added or deleted.

These two memslot sets should normally point to the same set of
memslots. They can, however, be desynchronized when performing a
memslot management operation by replacing the memslot to be modified
by its copy.  After the operation is complete, both memslot sets once
again point to the same, common set of memslot data.

This commit implements the aforementioned idea.

For tracking of gfns an ordinary rbtree is used since memslots cannot
overlap in the guest address space and so this data structure is
sufficient for ensuring that lookups are done quickly.

The "last used slot" mini-caches (both the per-memslot-set one and the
per-vCPU one), which keep track of the last found-by-gfn memslot, are still
present in the new code.

Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Message-Id: <17c0cf3663b760a0d3753d4ac08c0753e941b811.1638817641.git.maciej.szmigiero@oracle.com>
parent 6a656832
......@@ -210,13 +210,13 @@ static void stage2_flush_vm(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int idx;
int idx, bkt;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_flush_memslot(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
......@@ -595,14 +595,14 @@ void stage2_unmap_vm(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int idx;
int idx, bkt;
idx = srcu_read_lock(&kvm->srcu);
mmap_read_lock(current->mm);
spin_lock(&kvm->mmu_lock);
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_unmap_memslot(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
......
......@@ -734,11 +734,11 @@ void kvmppc_rmap_reset(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int srcu_idx;
int srcu_idx, bkt;
srcu_idx = srcu_read_lock(&kvm->srcu);
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
kvm_for_each_memslot(memslot, bkt, slots) {
/* Mutual exclusion with kvm_unmap_hva_range etc. */
spin_lock(&kvm->mmu_lock);
/*
......
......@@ -5880,11 +5880,12 @@ static int kvmhv_svm_off(struct kvm *kvm)
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
struct kvm_memory_slot *memslot;
struct kvm_memslots *slots = __kvm_memslots(kvm, i);
int bkt;
if (!slots)
continue;
kvm_for_each_memslot(memslot, slots) {
kvm_for_each_memslot(memslot, bkt, slots) {
kvmppc_uvmem_drop_pages(memslot, kvm, true);
uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
}
......
......@@ -749,7 +749,7 @@ void kvmhv_release_all_nested(struct kvm *kvm)
struct kvm_nested_guest *gp;
struct kvm_nested_guest *freelist = NULL;
struct kvm_memory_slot *memslot;
int srcu_idx;
int srcu_idx, bkt;
spin_lock(&kvm->mmu_lock);
for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
......@@ -770,7 +770,7 @@ void kvmhv_release_all_nested(struct kvm *kvm)
}
srcu_idx = srcu_read_lock(&kvm->srcu);
kvm_for_each_memslot(memslot, kvm_memslots(kvm))
kvm_for_each_memslot(memslot, bkt, kvm_memslots(kvm))
kvmhv_free_memslot_nest_rmap(memslot);
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
......
......@@ -459,7 +459,7 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot, *m;
int ret = H_SUCCESS;
int srcu_idx;
int srcu_idx, bkt;
kvm->arch.secure_guest = KVMPPC_SECURE_INIT_START;
......@@ -478,7 +478,7 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
/* register the memslot */
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
kvm_for_each_memslot(memslot, bkt, slots) {
ret = __kvmppc_uvmem_memslot_create(kvm, memslot);
if (ret)
break;
......@@ -486,7 +486,7 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
if (ret) {
slots = kvm_memslots(kvm);
kvm_for_each_memslot(m, slots) {
kvm_for_each_memslot(m, bkt, slots) {
if (m == memslot)
break;
__kvmppc_uvmem_memslot_delete(kvm, memslot);
......@@ -647,7 +647,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm)
{
int srcu_idx;
int srcu_idx, bkt;
struct kvm_memory_slot *memslot;
/*
......@@ -662,7 +662,7 @@ unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm)
srcu_idx = srcu_read_lock(&kvm->srcu);
kvm_for_each_memslot(memslot, kvm_memslots(kvm))
kvm_for_each_memslot(memslot, bkt, kvm_memslots(kvm))
kvmppc_uvmem_drop_pages(memslot, kvm, false);
srcu_read_unlock(&kvm->srcu, srcu_idx);
......@@ -821,7 +821,7 @@ unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int srcu_idx;
int srcu_idx, bkt;
long ret = H_SUCCESS;
if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
......@@ -830,7 +830,7 @@ unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
/* migrate any unmoved normal pfn to device pfns*/
srcu_idx = srcu_read_lock(&kvm->srcu);
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
kvm_for_each_memslot(memslot, bkt, slots) {
ret = kvmppc_uv_migrate_mem_slot(kvm, memslot);
if (ret) {
/*
......
......@@ -1037,13 +1037,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
struct kvm_memory_slot *ms;
struct kvm_memslots *slots;
unsigned long ram_pages = 0;
int slotnr;
int bkt;
/* migration mode already enabled */
if (kvm->arch.migration_mode)
return 0;
slots = kvm_memslots(kvm);
if (!slots || !slots->used_slots)
if (!slots || kvm_memslots_empty(slots))
return -EINVAL;
if (!kvm->arch.use_cmma) {
......@@ -1051,8 +1051,7 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
return 0;
}
/* mark all the pages in active slots as dirty */
for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
ms = slots->memslots + slotnr;
kvm_for_each_memslot(ms, bkt, slots) {
if (!ms->dirty_bitmap)
return -EINVAL;
/*
......@@ -1976,22 +1975,21 @@ static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
unsigned long cur_gfn)
{
struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn);
int slotidx = ms - slots->memslots;
unsigned long ofs = cur_gfn - ms->base_gfn;
struct rb_node *mnode = &ms->gfn_node[slots->node_idx];
if (ms->base_gfn + ms->npages <= cur_gfn) {
slotidx--;
mnode = rb_next(mnode);
/* If we are above the highest slot, wrap around */
if (slotidx < 0)
slotidx = slots->used_slots - 1;
if (!mnode)
mnode = rb_first(&slots->gfn_tree);
ms = slots->memslots + slotidx;
ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
ofs = 0;
}
ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
while ((slotidx > 0) && (ofs >= ms->npages)) {
slotidx--;
ms = slots->memslots + slotidx;
while (ofs >= ms->npages && (mnode = rb_next(mnode))) {
ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0);
}
return ms->base_gfn + ofs;
......@@ -2004,7 +2002,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *ms;
if (unlikely(!slots->used_slots))
if (unlikely(kvm_memslots_empty(slots)))
return 0;
cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn);
......
......@@ -220,12 +220,14 @@ static inline void kvm_s390_set_user_cpu_state_ctrl(struct kvm *kvm)
/* get the end gfn of the last (highest gfn) memslot */
static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots)
{
struct rb_node *node;
struct kvm_memory_slot *ms;
if (WARN_ON(!slots->used_slots))
if (WARN_ON(kvm_memslots_empty(slots)))
return 0;
ms = slots->memslots;
node = rb_last(&slots->gfn_tree);
ms = container_of(node, struct kvm_memory_slot, gfn_node[slots->node_idx]);
return ms->base_gfn + ms->npages;
}
......
......@@ -107,9 +107,10 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
int bkt;
slots = __kvm_memslots(kvm, i);
for (j = 0; j < slots->used_slots; j++) {
slot = &slots->memslots[j];
kvm_for_each_memslot(slot, bkt, slots)
for (k = 0; k < KVM_NR_PAGE_SIZES; k++) {
rmap = slot->arch.rmap[k];
lpage_size = kvm_mmu_slot_lpages(slot, k + 1);
......@@ -121,7 +122,6 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
cur[index]++;
}
}
}
}
write_unlock(&kvm->mmu_lock);
......
......@@ -3409,7 +3409,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
int r = 0, i;
int r = 0, i, bkt;
/*
* Check if this is the first shadow root being allocated before
......@@ -3434,7 +3434,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot(slot, slots) {
kvm_for_each_memslot(slot, bkt, slots) {
/*
* Both of these functions are no-ops if the target is
* already allocated, so unconditionally calling both
......@@ -5730,14 +5730,14 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
struct kvm_memslots *slots;
bool flush = false;
gfn_t start, end;
int i;
int i, bkt;
if (!kvm_memslots_have_rmaps(kvm))
return flush;
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot(memslot, slots) {
kvm_for_each_memslot(memslot, bkt, slots) {
start = max(gfn_start, memslot->base_gfn);
end = min(gfn_end, memslot->base_gfn + memslot->npages);
if (start >= end)
......
......@@ -31,6 +31,7 @@
#include <linux/notifier.h>
#include <linux/hashtable.h>
#include <linux/interval_tree.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <asm/signal.h>
......@@ -358,11 +359,13 @@ struct kvm_vcpu {
struct kvm_dirty_ring dirty_ring;
/*
* The index of the most recently used memslot by this vCPU. It's ok
* if this becomes stale due to memslot changes since we always check
* it is a valid slot.
* The most recently used memslot by this vCPU and the slots generation
* for which it is valid.
* No wraparound protection is needed since generations won't overflow in
* thousands of years, even assuming 1M memslot operations per second.
*/
int last_used_slot;
struct kvm_memory_slot *last_used_slot;
u64 last_used_slot_gen;
};
/* must be called with irqs disabled */
......@@ -427,9 +430,26 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
*/
#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
/*
* Since at idle each memslot belongs to two memslot sets it has to contain
* two embedded nodes for each data structure that it forms a part of.
*
* Two memslot sets (one active and one inactive) are necessary so the VM
* continues to run on one memslot set while the other is being modified.
*
* These two memslot sets normally point to the same set of memslots.
* They can, however, be desynchronized when performing a memslot management
* operation by replacing the memslot to be modified by its copy.
* After the operation is complete, both memslot sets once again point to
* the same, common set of memslot data.
*
* The memslots themselves are independent of each other so they can be
* individually added or deleted.
*/
struct kvm_memory_slot {
struct hlist_node id_node;
struct interval_tree_node hva_node;
struct hlist_node id_node[2];
struct interval_tree_node hva_node[2];
struct rb_node gfn_node[2];
gfn_t base_gfn;
unsigned long npages;
unsigned long *dirty_bitmap;
......@@ -524,16 +544,13 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
}
#endif
/*
* Note:
* memslots are not sorted by id anymore, please use id_to_memslot()
* to get the memslot by its id.
*/
struct kvm_memslots {
u64 generation;
atomic_long_t last_used_slot;
struct rb_root_cached hva_tree;
struct rb_root gfn_tree;
/*
* The mapping table from slot id to the index in memslots[].
* The mapping table from slot id to memslot.
*
* 7-bit bucket count matches the size of the old id to index array for
* 512 slots, while giving good performance with this slot count.
......@@ -541,9 +558,7 @@ struct kvm_memslots {
* always result in higher memory usage (even for lower memslot counts).
*/
DECLARE_HASHTABLE(id_hash, 7);
atomic_t last_used_slot;
int used_slots;
struct kvm_memory_slot memslots[];
int node_idx;
};
struct kvm {
......@@ -565,6 +580,9 @@ struct kvm {
struct mutex slots_arch_lock;
struct mm_struct *mm; /* userspace tied to this vm */
unsigned long nr_memslot_pages;
/* The two memslot sets - active and inactive (per address space) */
struct kvm_memslots __memslots[KVM_ADDRESS_SPACE_NUM][2];
/* The current active memslot set for each address space */
struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
struct xarray vcpu_array;
......@@ -739,11 +757,10 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
return NULL;
}
#define kvm_for_each_memslot(memslot, slots) \
for (memslot = &slots->memslots[0]; \
memslot < slots->memslots + slots->used_slots; memslot++) \
if (WARN_ON_ONCE(!memslot->npages)) { \
} else
/* Returns the index stored in @vcpu's vcpu_idx field. */
static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu)
{
return vcpu->vcpu_idx;
}
void kvm_destroy_vcpus(struct kvm *kvm);
......@@ -805,12 +822,23 @@ static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
return __kvm_memslots(vcpu->kvm, as_id);
}
/*
 * Returns true when the gfn rbtree of @slots has no nodes, i.e. when
 * RB_EMPTY_ROOT() holds for slots->gfn_tree; returns false otherwise.
 */
static inline bool kvm_memslots_empty(struct kvm_memslots *slots)
{
return RB_EMPTY_ROOT(&slots->gfn_tree);
}
/*
 * Iterate over the memslots of @slots by walking its id_hash table via
 * hash_for_each(), using the id_node[] entry selected by slots->node_idx.
 *
 * @memslot: struct kvm_memory_slot pointer used as the loop cursor
 * @bkt:     integer bucket cursor required by hash_for_each()
 * @slots:   the memslot set to walk
 *
 * The statement following the macro becomes the "else" branch of the
 * trailing "if": a memslot with npages == 0 triggers WARN_ON_ONCE() and the
 * loop body is not executed for it.
 *
 * Slots are visited in hash-table order, so callers should not rely on any
 * gfn ordering.
 */
#define kvm_for_each_memslot(memslot, bkt, slots) \
hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \
if (WARN_ON_ONCE(!memslot->npages)) { \
} else
static inline
struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
{
struct kvm_memory_slot *slot;
int idx = slots->node_idx;
hash_for_each_possible(slots->id_hash, slot, id_node, id) {
hash_for_each_possible(slots->id_hash, slot, id_node[idx], id) {
if (slot->id == id)
return slot;
}
......@@ -1214,25 +1242,15 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
/*
* Returns a pointer to the memslot at slot_index if it contains gfn.
* Returns a pointer to the memslot if it contains gfn.
* Otherwise returns NULL.
*/
static inline struct kvm_memory_slot *
try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn)
try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
struct kvm_memory_slot *slot;
if (slot_index < 0 || slot_index >= slots->used_slots)
if (!slot)
return NULL;
/*
* slot_index can come from vcpu->last_used_slot which is not kept
* in sync with userspace-controllable memslot deletion. So use nospec
* to prevent the CPU from speculating past the end of memslots[].
*/
slot_index = array_index_nospec(slot_index, slots->used_slots);
slot = &slots->memslots[slot_index];
if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages)
return slot;
else
......@@ -1240,65 +1258,46 @@ try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn)
}
/*
* Returns a pointer to the memslot that contains gfn and records the index of
* the slot in index. Otherwise returns NULL.
* Returns a pointer to the memslot that contains gfn. Otherwise returns NULL.
*
* With "approx" set returns the memslot also when the address falls
* in a hole. In that case one of the memslots bordering the hole is
* returned.
*
* IMPORTANT: Slots are sorted from highest GFN to lowest GFN!
*/
static inline struct kvm_memory_slot *
search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index, bool approx)
search_memslots(struct kvm_memslots *slots, gfn_t gfn, bool approx)
{
int start = 0, end = slots->used_slots;
struct kvm_memory_slot *memslots = slots->memslots;
struct kvm_memory_slot *slot;
if (unlikely(!slots->used_slots))
return NULL;
while (start < end) {
int slot = start + (end - start) / 2;
if (gfn >= memslots[slot].base_gfn)
end = slot;
else
start = slot + 1;
}
if (approx && start >= slots->used_slots) {
*index = slots->used_slots - 1;
return &memslots[slots->used_slots - 1];
}
slot = try_get_memslot(slots, start, gfn);
if (slot) {
*index = start;
return slot;
}
if (approx) {
*index = start;
return &memslots[start];
struct rb_node *node;
int idx = slots->node_idx;
slot = NULL;
for (node = slots->gfn_tree.rb_node; node; ) {
slot = container_of(node, struct kvm_memory_slot, gfn_node[idx]);
if (gfn >= slot->base_gfn) {
if (gfn < slot->base_gfn + slot->npages)
return slot;
node = node->rb_right;
} else
node = node->rb_left;
}
return NULL;
return approx ? slot : NULL;
}
static inline struct kvm_memory_slot *
____gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn, bool approx)
{
struct kvm_memory_slot *slot;
int slot_index = atomic_read(&slots->last_used_slot);
slot = try_get_memslot(slots, slot_index, gfn);
slot = (struct kvm_memory_slot *)atomic_long_read(&slots->last_used_slot);
slot = try_get_memslot(slot, gfn);
if (slot)
return slot;
slot = search_memslots(slots, gfn, &slot_index, approx);
slot = search_memslots(slots, gfn, approx);
if (slot) {
atomic_set(&slots->last_used_slot, slot_index);
atomic_long_set(&slots->last_used_slot, (unsigned long)slot);
return slot;
}
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment