Commit 8dbc4110 authored by Oliver Upton

Merge branch kvm-arm64/lpi-xarray into kvmarm/next

* kvm-arm64/lpi-xarray:
  : xarray-based representation of vgic LPIs
  :
  : KVM's linked-list of LPI state has proven to be a bottleneck in LPI
  : injection paths, due to lock serialization when acquiring / releasing a
  : reference on an IRQ.
  :
  : Start the tedious process of reworking KVM's LPI injection by replacing
  : the LPI linked-list with an xarray, leveraging this to allow RCU readers
  : to walk it outside of the spinlock.
  KVM: arm64: vgic: Don't acquire the lpi_list_lock in vgic_put_irq()
  KVM: arm64: vgic: Ensure the irq refcount is nonzero when taking a ref
  KVM: arm64: vgic: Rely on RCU protection in vgic_get_lpi()
  KVM: arm64: vgic: Free LPI vgic_irq structs in an RCU-safe manner
  KVM: arm64: vgic: Use atomics to count LPIs
  KVM: arm64: vgic: Get rid of the LPI linked-list
  KVM: arm64: vgic-its: Walk the LPI xarray in vgic_copy_lpi_list()
  KVM: arm64: vgic-v3: Iterate the xarray to find pending LPIs
  KVM: arm64: vgic: Use xarray to find LPI in vgic_get_lpi()
  KVM: arm64: vgic: Store LPIs in an xarray
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
parents 0d874858 e27f2d56
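
The hunks below convert KVM's LPI bookkeeping from a linked list protected by a spinlock to an xarray that RCU readers can walk locklessly. As a reading aid only, here is a minimal, self-contained sketch of the lookup/teardown pattern the series converges on, built from the stock xarray, kref and RCU APIs. The demo_irq, demo_xa, demo_count, demo_get() and demo_put() names are hypothetical stand-ins, not part of the patch: lookups index the xarray under rcu_read_lock() and only take a reference with kref_get_unless_zero(), so a dying entry is never resurrected; the final put erases the entry under the IRQ-safe xa_lock and defers the free with kfree_rcu() so lockless readers stay safe.

#include <linux/atomic.h>
#include <linux/kref.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/xarray.h>

struct demo_irq {			/* hypothetical stand-in for struct vgic_irq */
	struct rcu_head rcu;
	struct kref refcount;
	u32 intid;
};

static DEFINE_XARRAY_FLAGS(demo_xa, XA_FLAGS_LOCK_IRQ);
static atomic_t demo_count = ATOMIC_INIT(0);

/* Freeing is done by the last put via kfree_rcu(), mirroring vgic_irq_release(). */
static void demo_irq_release(struct kref *ref)
{
}

static struct demo_irq *demo_get(u32 intid)
{
	struct demo_irq *irq;

	rcu_read_lock();
	irq = xa_load(&demo_xa, intid);
	/* Only succeed if the refcount hasn't already dropped to zero. */
	if (irq && !kref_get_unless_zero(&irq->refcount))
		irq = NULL;
	rcu_read_unlock();

	return irq;
}

static void demo_put(struct demo_irq *irq)
{
	unsigned long flags;

	if (!kref_put(&irq->refcount, demo_irq_release))
		return;

	/* Last reference: unpublish the entry, then defer the free past RCU readers. */
	xa_lock_irqsave(&demo_xa, flags);
	__xa_erase(&demo_xa, irq->intid);
	xa_unlock_irqrestore(&demo_xa, flags);

	atomic_dec(&demo_count);
	kfree_rcu(irq, rcu);
}
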
@@ -149,7 +149,7 @@ static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
 	seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2");
 	seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
 	if (v3)
-		seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count);
+		seq_printf(s, "nr_lpis:\t%d\n", atomic_read(&dist->lpi_count));
 	seq_printf(s, "enabled:\t%d\n", dist->enabled);
 	seq_printf(s, "\n");
...
@@ -53,9 +53,9 @@ void kvm_vgic_early_init(struct kvm *kvm)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 
-	INIT_LIST_HEAD(&dist->lpi_list_head);
 	INIT_LIST_HEAD(&dist->lpi_translation_cache);
 	raw_spin_lock_init(&dist->lpi_list_lock);
+	xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
 }
 
 /* CREATION */
@@ -366,6 +366,8 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
 
 	if (vgic_supports_direct_msis(kvm))
 		vgic_v4_teardown(kvm);
+
+	xa_destroy(&dist->lpi_xa);
 }
 
 static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
...
@@ -52,7 +52,12 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
 	if (!irq)
 		return ERR_PTR(-ENOMEM);
 
-	INIT_LIST_HEAD(&irq->lpi_list);
+	ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
+	if (ret) {
+		kfree(irq);
+		return ERR_PTR(ret);
+	}
+
 	INIT_LIST_HEAD(&irq->ap_list);
 	raw_spin_lock_init(&irq->irq_lock);
@@ -68,30 +73,30 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
 	 * There could be a race with another vgic_add_lpi(), so we need to
 	 * check that we don't add a second list entry with the same LPI.
 	 */
-	list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
-		if (oldirq->intid != intid)
-			continue;
-
+	oldirq = xa_load(&dist->lpi_xa, intid);
+	if (vgic_try_get_irq_kref(oldirq)) {
 		/* Someone was faster with adding this LPI, lets use that. */
 		kfree(irq);
 		irq = oldirq;
 
-		/*
-		 * This increases the refcount, the caller is expected to
-		 * call vgic_put_irq() on the returned pointer once it's
-		 * finished with the IRQ.
-		 */
-		vgic_get_irq_kref(irq);
-
 		goto out_unlock;
 	}
 
-	list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
-	dist->lpi_list_count++;
+	ret = xa_err(xa_store(&dist->lpi_xa, intid, irq, 0));
+	if (ret) {
+		xa_release(&dist->lpi_xa, intid);
+		kfree(irq);
+		goto out_unlock;
+	}
+
+	atomic_inc(&dist->lpi_count);
 
 out_unlock:
 	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
 
+	if (ret)
+		return ERR_PTR(ret);
+
 	/*
 	 * We "cache" the configuration table entries in our struct vgic_irq's.
 	 * However we only have those structs for mapped IRQs, so we read in
@@ -311,6 +316,8 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
 	return 0;
 }
 
+#define GIC_LPI_MAX_INTID	((1 << INTERRUPT_ID_BITS_ITS) - 1)
+
 /*
  * Create a snapshot of the current LPIs targeting @vcpu, so that we can
  * enumerate those LPIs without holding any lock.
@@ -319,6 +326,7 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
 int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
+	XA_STATE(xas, &dist->lpi_xa, GIC_LPI_OFFSET);
 	struct vgic_irq *irq;
 	unsigned long flags;
 	u32 *intids;
@@ -331,13 +339,15 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
 	 * command). If coming from another path (such as enabling LPIs),
 	 * we must be careful not to overrun the array.
 	 */
-	irq_count = READ_ONCE(dist->lpi_list_count);
+	irq_count = atomic_read(&dist->lpi_count);
 	intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL_ACCOUNT);
 	if (!intids)
 		return -ENOMEM;
 
 	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+	rcu_read_lock();
+
+	xas_for_each(&xas, irq, GIC_LPI_MAX_INTID) {
 		if (i == irq_count)
 			break;
 		/* We don't need to "get" the IRQ, as we hold the list lock. */
@@ -345,6 +355,8 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
 			continue;
 		intids[i++] = irq->intid;
 	}
+
+	rcu_read_unlock();
 	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
 
 	*intid_ptr = intids;
@@ -592,8 +604,8 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
 
 	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
 	irq = __vgic_its_check_cache(dist, db, devid, eventid);
-	if (irq)
-		vgic_get_irq_kref(irq);
+	if (!vgic_try_get_irq_kref(irq))
+		irq = NULL;
 	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
 
@@ -637,8 +649,13 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
 	 * was in the cache, and increment it on the new interrupt.
 	 */
 	if (cte->irq)
-		__vgic_put_lpi_locked(kvm, cte->irq);
+		vgic_put_irq(kvm, cte->irq);
 
+	/*
+	 * The irq refcount is guaranteed to be nonzero while holding the
+	 * its_lock, as the ITE (and the reference it holds) cannot be freed.
+	 */
+	lockdep_assert_held(&its->its_lock);
 	vgic_get_irq_kref(irq);
 
 	cte->db = db;
@@ -669,7 +686,7 @@ void vgic_its_invalidate_cache(struct kvm *kvm)
 		if (!cte->irq)
 			break;
 
-		__vgic_put_lpi_locked(kvm, cte->irq);
+		vgic_put_irq(kvm, cte->irq);
 		cte->irq = NULL;
 	}
...
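
Continuing the illustrative demo sketch from above (same hypothetical demo_xa, demo_irq and demo_count), the snippet below shows the xas_for_each() snapshot walk that vgic_copy_lpi_list() switches to in the hunk above. DEMO_FIRST_INTID, DEMO_MAX_INTID and demo_snapshot_intids() are made-up names; the patch itself uses GIC_LPI_OFFSET and GIC_LPI_MAX_INTID. The xas_retry() check is added here for a pure RCU walk and is not present in the patch, which also holds a spinlock across the loop.

#define DEMO_FIRST_INTID	8192
#define DEMO_MAX_INTID		((1 << 16) - 1)

static int demo_snapshot_intids(u32 **intid_ptr)
{
	XA_STATE(xas, &demo_xa, DEMO_FIRST_INTID);
	struct demo_irq *irq;
	int count, i = 0;
	u32 *intids;

	/* The count is only a sizing hint; the walk re-checks the bound. */
	count = atomic_read(&demo_count);
	intids = kmalloc_array(count, sizeof(*intids), GFP_KERNEL_ACCOUNT);
	if (!intids)
		return -ENOMEM;

	rcu_read_lock();
	xas_for_each(&xas, irq, DEMO_MAX_INTID) {
		if (xas_retry(&xas, irq))
			continue;
		if (i == count)
			break;
		intids[i++] = irq->intid;
	}
	rcu_read_unlock();

	*intid_ptr = intids;
	return i;
}
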
@@ -380,6 +380,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
 	struct vgic_irq *irq;
 	gpa_t last_ptr = ~(gpa_t)0;
 	bool vlpi_avail = false;
+	unsigned long index;
 	int ret = 0;
 	u8 val;
@@ -396,7 +397,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
 		vlpi_avail = true;
 	}
 
-	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+	xa_for_each(&dist->lpi_xa, index, irq) {
 		int byte_offset, bit_nr;
 		struct kvm_vcpu *vcpu;
 		gpa_t pendbase, ptr;
...
@@ -30,6 +30,7 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
  *   its->its_lock (mutex)
  *     vgic_cpu->ap_list_lock		must be taken with IRQs disabled
  *       kvm->lpi_list_lock		must be taken with IRQs disabled
+ *         vgic_dist->lpi_xa.xa_lock	must be taken with IRQs disabled
  *           vgic_irq->irq_lock		must be taken with IRQs disabled
  *
  * As the ap_list_lock might be taken from the timer interrupt handler,
@@ -54,32 +55,22 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
  */
 
 /*
- * Iterate over the VM's list of mapped LPIs to find the one with a
- * matching interrupt ID and return a reference to the IRQ structure.
+ * Index the VM's xarray of mapped LPIs and return a reference to the IRQ
+ * structure. The caller is expected to call vgic_put_irq() later once it's
+ * finished with the IRQ.
  */
 static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct vgic_irq *irq = NULL;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
-		if (irq->intid != intid)
-			continue;
+	rcu_read_lock();
 
-		/*
-		 * This increases the refcount, the caller is expected to
-		 * call vgic_put_irq() later once it's finished with the IRQ.
-		 */
-		vgic_get_irq_kref(irq);
-		goto out_unlock;
-	}
-	irq = NULL;
+	irq = xa_load(&dist->lpi_xa, intid);
+	if (!vgic_try_get_irq_kref(irq))
+		irq = NULL;
 
-out_unlock:
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+	rcu_read_unlock();
 
 	return irq;
 }
@@ -120,22 +111,6 @@ static void vgic_irq_release(struct kref *ref)
 {
 }
 
-/*
- * Drop the refcount on the LPI. Must be called with lpi_list_lock held.
- */
-void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	if (!kref_put(&irq->refcount, vgic_irq_release))
-		return;
-
-	list_del(&irq->lpi_list);
-	dist->lpi_list_count--;
-
-	kfree(irq);
-}
-
 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
@@ -144,9 +119,15 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
 	if (irq->intid < VGIC_MIN_LPI)
 		return;
 
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-	__vgic_put_lpi_locked(kvm, irq);
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+	if (!kref_put(&irq->refcount, vgic_irq_release))
+		return;
+
+	xa_lock_irqsave(&dist->lpi_xa, flags);
+	__xa_erase(&dist->lpi_xa, irq->intid);
+	xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
+	atomic_dec(&dist->lpi_count);
+	kfree_rcu(irq, rcu);
 }
 
 void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
@@ -404,7 +385,8 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
 
 	/*
 	 * Grab a reference to the irq to reflect the fact that it is
-	 * now in the ap_list.
+	 * now in the ap_list. This is safe as the caller must already hold a
+	 * reference on the irq.
 	 */
 	vgic_get_irq_kref(irq);
 	list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
...
@@ -180,7 +180,6 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
 		     gpa_t addr, int len);
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 			      u32 intid);
-void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq);
 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
 bool vgic_get_phys_line_level(struct vgic_irq *irq);
 void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
@@ -220,12 +219,20 @@ void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu);
 void vgic_v2_save_state(struct kvm_vcpu *vcpu);
 void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
 
-static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq)
 {
+	if (!irq)
+		return false;
+
 	if (irq->intid < VGIC_MIN_LPI)
-		return;
+		return true;
 
-	kref_get(&irq->refcount);
+	return kref_get_unless_zero(&irq->refcount);
+}
+
+static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+{
+	WARN_ON_ONCE(!vgic_try_get_irq_kref(irq));
 }
 
 void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
...
@@ -13,6 +13,7 @@
 #include <linux/spinlock.h>
 #include <linux/static_key.h>
 #include <linux/types.h>
+#include <linux/xarray.h>
 #include <kvm/iodev.h>
 #include <linux/list.h>
 #include <linux/jump_label.h>
@@ -116,7 +117,7 @@ struct irq_ops {
 
 struct vgic_irq {
 	raw_spinlock_t irq_lock;		/* Protects the content of the struct */
-	struct list_head lpi_list;		/* Used to link all LPIs together */
+	struct rcu_head rcu;
 	struct list_head ap_list;
 
 	struct kvm_vcpu *vcpu;			/* SGIs and PPIs: The VCPU
@@ -273,10 +274,10 @@ struct vgic_dist {
 	 */
 	u64			propbaser;
 
-	/* Protects the lpi_list and the count value below. */
+	/* Protects the lpi_list. */
 	raw_spinlock_t		lpi_list_lock;
-	struct list_head	lpi_list_head;
-	int			lpi_list_count;
+	struct xarray		lpi_xa;
+	atomic_t		lpi_count;
 
 	/* LPI translation cache */
 	struct list_head	lpi_translation_cache;
...
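
Continuing the same illustrative sketch (hypothetical demo_xa, demo_irq and demo_count from above, plus two extra includes), the function below mirrors roughly the insertion-side shape used by vgic_add_lpi() in the hunks above: the xarray slot is reserved up front so the later store cannot fail on allocation, a raw spinlock serializes concurrent adders (standing in for lpi_list_lock in the patch), and losing the race is handled by taking a reference on the already-published entry. demo_add() and demo_lock are made-up names, not part of the patch.

#include <linux/err.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(demo_lock);	/* serializes adders, like lpi_list_lock */

static struct demo_irq *demo_add(u32 intid)
{
	struct demo_irq *irq, *old;
	unsigned long flags;
	int ret;

	irq = kzalloc(sizeof(*irq), GFP_KERNEL_ACCOUNT);
	if (!irq)
		return ERR_PTR(-ENOMEM);

	/* Preallocate the slot so the store under the lock cannot hit -ENOMEM. */
	ret = xa_reserve_irq(&demo_xa, intid, GFP_KERNEL_ACCOUNT);
	if (ret) {
		kfree(irq);
		return ERR_PTR(ret);
	}

	kref_init(&irq->refcount);
	irq->intid = intid;

	raw_spin_lock_irqsave(&demo_lock, flags);

	old = xa_load(&demo_xa, intid);
	if (old && kref_get_unless_zero(&old->refcount)) {
		/* Someone else won the race; use their entry instead. */
		kfree(irq);
		irq = old;
		goto out_unlock;
	}

	ret = xa_err(xa_store(&demo_xa, intid, irq, 0));
	if (ret) {
		xa_release(&demo_xa, intid);
		kfree(irq);
		goto out_unlock;
	}

	atomic_inc(&demo_count);

out_unlock:
	raw_spin_unlock_irqrestore(&demo_lock, flags);

	return ret ? ERR_PTR(ret) : irq;
}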