scsi: lpfc: Change default IRQ model on AMD architectures

The current driver attempts to allocate an interrupt vector per cpu using the systems managed IRQ allocator (flag PCI_IRQ_AFFINITY). The system IRQ allocator will either provide the per-cpu vector, or return fewer vectors. When fewer vectors, they are evenly spread between the numa nodes on the system. When run on an AMD architecture, if interrupts occur to a cpu that is not in the same numa node as the adapter generating the interrupt, there are extreme costs and overheads in performance. Thus, if 1:1 vector allocation is used, or the "balanced" vectors in the other numa nodes, performance can be hit significantly. A much more performant model is to allocate interrupts only on the cpus that are in the numa node where the adapter resides. I/O completion is still performed by the cpu where the I/O was generated. Unfortunately, there is no flag to request the managed IRQ subsystem allocate vectors only for the CPUs in the numa node as the adapter. On AMD architecture, revert the irq allocation to the normal style (non-managed) and then use irq_set_affinity_hint() to set the cpu affinity and disable user-space rebalancing. Tie the support into CPU offline/online. If the cpu being offlined owns a vector, the vector is re-affinitized to one of the other CPUs on the same numa node. If there are no more CPUs on the numa node, the vector has all affinity removed and lets the system determine where it's serviced. Similarly, when the cpu that owned a vector comes online, the vector is reaffinitized to the cpu. Link: https://lore.kernel.org/r/20191105005708.7399-10-jsmart2021@gmail.comSigned-off-by: Dick Kennedy <dick.kennedy@broadcom.com> Signed-off-by: James Smart <jsmart2021@gmail.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>

scsi: lpfc: Change default IRQ model on AMD architectures
The current driver attempts to allocate an interrupt vector per cpu using the systems managed IRQ allocator (flag PCI_IRQ_AFFINITY). The system IRQ allocator will either provide the per-cpu vector, or return fewer vectors. When fewer vectors, they are evenly spread between the numa nodes on the system. When run on an AMD architecture, if interrupts occur to a cpu that is not in the same numa node as the adapter generating the interrupt, there are extreme costs and overheads in performance. Thus, if 1:1 vector allocation is used, or the "balanced" vectors in the other numa nodes, performance can be hit significantly. A much more performant model is to allocate interrupts only on the cpus that are in the numa node where the adapter resides. I/O completion is still performed by the cpu where the I/O was generated. Unfortunately, there is no flag to request the managed IRQ subsystem allocate vectors only for the CPUs in the numa node as the adapter. On AMD architecture, revert the irq allocation to the normal style (non-managed) and then use irq_set_affinity_hint() to set the cpu affinity and disable user-space rebalancing. Tie the support into CPU offline/online. If the cpu being offlined owns a vector, the vector is re-affinitized to one of the other CPUs on the same numa node. If there are no more CPUs on the numa node, the vector has all affinity removed and lets the system determine where it's serviced. Similarly, when the cpu that owned a vector comes online, the vector is reaffinitized to the cpu. Link: https://lore.kernel.org/r/20191105005708.7399-10-jsmart2021@gmail.comSigned-off-by: Dick Kennedy <dick.kennedy@broadcom.com> Signed-off-by: James Smart <jsmart2021@gmail.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
dcaa2136 · James Smart · Martin K. Petersen · 93a4d6f4 · dcaa2136 · dcaa2136
Commit dcaa2136 authored Nov 04, 2019 by James Smart Committed by Martin K. Petersen Nov 06, 2019
4 changed files
--- a/drivers/scsi/lpfc/lpfc.h
+++ b/drivers/scsi/lpfc/lpfc.h
@@ -837,6 +837,7 @@ struct lpfc_hba {
 	uint32_t cfg_fcp_mq_threshold;
 	uint32_t cfg_hdw_queue;
 	uint32_t cfg_irq_chann;
+	uint32_t cfg_irq_numa;
 	uint32_t cfg_suppress_rsp;
 	uint32_t cfg_nvme_oas;
 	uint32_t cfg_nvme_embed_cmd;
@@ -1311,6 +1312,26 @@ lpfc_phba_elsring(struct lpfc_hba *phba)
 	return &phba->sli.sli3_ring[LPFC_ELS_RING];
 }

+/**
+ * lpfc_next_online_numa_cpu - Finds next online CPU on NUMA node
+ * @numa_mask: Pointer to phba's numa_mask member.
+ * @start: starting cpu index
+ *
+ * Note: If no valid cpu found, then nr_cpu_ids is returned.
+ *
+ **/
+static inline unsigned int
+lpfc_next_online_numa_cpu(const struct cpumask *numa_mask, unsigned int start)
+{
+	unsigned int cpu_it;
+
+	for_each_cpu_wrap(cpu_it, numa_mask, start) {
+		if (cpu_online(cpu_it))
+			break;
+	}
+
+	return cpu_it;
+}
 /**
 * lpfc_sli4_mod_hba_eq_delay - update EQ delay
 * @phba: Pointer to HBA context object.

--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -5331,7 +5331,7 @@ lpfc_fcp_cpu_map_show(struct device *dev, struct device_attribute *attr,
 			len += scnprintf(buf + len, PAGE_SIZE - len,
 					"CPU %02d not present\n",
 					phba->sli4_hba.curr_disp_cpu);
-		else if (cpup->irq == LPFC_VECTOR_MAP_EMPTY) {
+		else if (cpup->eq == LPFC_VECTOR_MAP_EMPTY) {
 			if (cpup->hdwq == LPFC_VECTOR_MAP_EMPTY)
 				len += scnprintf(
 					buf + len, PAGE_SIZE - len,
@@ -5344,10 +5344,10 @@ lpfc_fcp_cpu_map_show(struct device *dev, struct device_attribute *attr,
 			else
 				len += scnprintf(
 					buf + len, PAGE_SIZE - len,
-					"CPU %02d EQ %04d hdwq %04d "
+					"CPU %02d EQ None hdwq %04d "
 					"physid %d coreid %d ht %d ua %d\n",
 					phba->sli4_hba.curr_disp_cpu,
-					cpup->eq, cpup->hdwq, cpup->phys_id,
+					cpup->hdwq, cpup->phys_id,
 					cpup->core_id,
 					(cpup->flag & LPFC_CPU_MAP_HYPER),
 					(cpup->flag & LPFC_CPU_MAP_UNASSIGN));
@@ -5362,7 +5362,7 @@ lpfc_fcp_cpu_map_show(struct device *dev, struct device_attribute *attr,
 					cpup->core_id,
 					(cpup->flag & LPFC_CPU_MAP_HYPER),
 					(cpup->flag & LPFC_CPU_MAP_UNASSIGN),
-					cpup->irq);
+					lpfc_get_irq(cpup->eq));
 			else
 				len += scnprintf(
 					buf + len, PAGE_SIZE - len,
@@ -5373,7 +5373,7 @@ lpfc_fcp_cpu_map_show(struct device *dev, struct device_attribute *attr,
 					cpup->core_id,
 					(cpup->flag & LPFC_CPU_MAP_HYPER),
 					(cpup->flag & LPFC_CPU_MAP_UNASSIGN),
-					cpup->irq);
+					lpfc_get_irq(cpup->eq));
 		}

 		phba->sli4_hba.curr_disp_cpu++;
@@ -5744,7 +5744,7 @@ LPFC_ATTR_RW(nvme_embed_cmd, 1, 0, 2,
 * the driver will advertise it supports to the SCSI layer.
 *
 *      0    = Set nr_hw_queues by the number of CPUs or HW queues.
- *      1,128 = Manually specify the maximum nr_hw_queue value to be set,
+ *      1,256 = Manually specify nr_hw_queue value to be advertised,
 *
 * Value range is [0,256]. Default value is 8.
 */
@@ -5762,30 +5762,130 @@ LPFC_ATTR_R(fcp_mq_threshold, LPFC_FCP_MQ_THRESHOLD_DEF,
 * A hardware IO queue maps (qidx) to a specific driver CQ/WQ.
 *
 *      0    = Configure the number of hdw queues to the number of active CPUs.
- *      1,128 = Manually specify how many hdw queues to use.
+ *      1,256 = Manually specify how many hdw queues to use.
 *
- * Value range is [0,128]. Default value is 0.
+ * Value range is [0,256]. Default value is 0.
 */
 LPFC_ATTR_R(hdw_queue,
 	    LPFC_HBA_HDWQ_DEF,
 	    LPFC_HBA_HDWQ_MIN, LPFC_HBA_HDWQ_MAX,
 	    "Set the number of I/O Hardware Queues");

+static inline void
+lpfc_assign_default_irq_numa(struct lpfc_hba *phba)
+{
+#if IS_ENABLED(CONFIG_X86)
+	/* If AMD architecture, then default is LPFC_IRQ_CHANN_NUMA */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		phba->cfg_irq_numa = 1;
+	else
+		phba->cfg_irq_numa = 0;
+#else
+	phba->cfg_irq_numa = 0;
+#endif
+}
+
 /*
 * lpfc_irq_chann: Set the number of IRQ vectors that are available
 * for Hardware Queues to utilize.  This also will map to the number
 * of EQ / MSI-X vectors the driver will create. This should never be
 * more than the number of Hardware Queues
 *
- *      0     = Configure number of IRQ Channels to the number of active CPUs.
- *      1,128 = Manually specify how many IRQ Channels to use.
+ *	0		= Configure number of IRQ Channels to:
+ *			  if AMD architecture, number of CPUs on HBA's NUMA node
+ *			  otherwise, number of active CPUs.
+ *	[1,256]		= Manually specify how many IRQ Channels to use.
 *
- * Value range is [0,128]. Default value is 0.
+ * Value range is [0,256]. Default value is [0].
 */
-LPFC_ATTR_R(irq_chann,
-	    LPFC_HBA_HDWQ_DEF,
-	    LPFC_HBA_HDWQ_MIN, LPFC_HBA_HDWQ_MAX,
-	    "Set the number of I/O IRQ Channels");
+static uint lpfc_irq_chann = LPFC_IRQ_CHANN_DEF;
+module_param(lpfc_irq_chann, uint, 0444);
+MODULE_PARM_DESC(lpfc_irq_chann, "Set number of interrupt vectors to allocate");
+
+/* lpfc_irq_chann_init - Set the hba irq_chann initial value
+ * @phba: lpfc_hba pointer.
+ * @val: contains the initial value
+ *
+ * Description:
+ * Validates the initial value is within range and assigns it to the
+ * adapter. If not in range, an error message is posted and the
+ * default value is assigned.
+ *
+ * Returns:
+ * zero if value is in range and is set
+ * -EINVAL if value was out of range
+ **/
+static int
+lpfc_irq_chann_init(struct lpfc_hba *phba, uint32_t val)
+{
+	const struct cpumask *numa_mask;
+
+	if (phba->cfg_use_msi != 2) {
+		lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
+				"8532 use_msi = %u ignoring cfg_irq_numa\n",
+				phba->cfg_use_msi);
+		phba->cfg_irq_numa = 0;
+		phba->cfg_irq_chann = LPFC_IRQ_CHANN_MIN;
+		return 0;
+	}
+
+	/* Check if default setting was passed */
+	if (val == LPFC_IRQ_CHANN_DEF)
+		lpfc_assign_default_irq_numa(phba);
+
+	if (phba->cfg_irq_numa) {
+		numa_mask = &phba->sli4_hba.numa_mask;
+
+		if (cpumask_empty(numa_mask)) {
+			lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
+					"8533 Could not identify NUMA node, "
+					"ignoring cfg_irq_numa\n");
+			phba->cfg_irq_numa = 0;
+			phba->cfg_irq_chann = LPFC_IRQ_CHANN_MIN;
+		} else {
+			phba->cfg_irq_chann = cpumask_weight(numa_mask);
+			lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
+					"8543 lpfc_irq_chann set to %u "
+					"(numa)\n", phba->cfg_irq_chann);
+		}
+	} else {
+		if (val > LPFC_IRQ_CHANN_MAX) {
+			lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
+					"8545 lpfc_irq_chann attribute cannot "
+					"be set to %u, allowed range is "
+					"[%u,%u]\n",
+					val,
+					LPFC_IRQ_CHANN_MIN,
+					LPFC_IRQ_CHANN_MAX);
+			phba->cfg_irq_chann = LPFC_IRQ_CHANN_MIN;
+			return -EINVAL;
+		}
+		phba->cfg_irq_chann = val;
+	}
+
+	return 0;
+}
+
+/**
+ * lpfc_irq_chann_show - Display value of irq_chann
+ * @dev: class converted to a Scsi_host structure.
+ * @attr: device attribute, not used.
+ * @buf: on return contains a string with the list sizes
+ *
+ * Returns: size of formatted string.
+ **/
+static ssize_t
+lpfc_irq_chann_show(struct device *dev, struct device_attribute *attr,
+		    char *buf)
+{
+	struct Scsi_Host *shost = class_to_shost(dev);
+	struct lpfc_vport *vport = (struct lpfc_vport *)shost->hostdata;
+	struct lpfc_hba *phba = vport->phba;
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", phba->cfg_irq_chann);
+}
+
+static DEVICE_ATTR_RO(lpfc_irq_chann);

 /*
 # lpfc_enable_hba_reset: Allow or prevent HBA resets to the hardware.
@@ -7190,6 +7290,7 @@ lpfc_get_hba_function_mode(struct lpfc_hba *phba)
 void
 lpfc_get_cfgparam(struct lpfc_hba *phba)
 {
+	lpfc_hba_log_verbose_init(phba, lpfc_log_verbose);
 	lpfc_fcp_io_sched_init(phba, lpfc_fcp_io_sched);
 	lpfc_ns_query_init(phba, lpfc_ns_query);
 	lpfc_fcp2_no_tgt_reset_init(phba, lpfc_fcp2_no_tgt_reset);
@@ -7296,7 +7397,6 @@ lpfc_get_cfgparam(struct lpfc_hba *phba)
 	phba->cfg_soft_wwpn = 0L;
 	lpfc_sg_seg_cnt_init(phba, lpfc_sg_seg_cnt);
 	lpfc_hba_queue_depth_init(phba, lpfc_hba_queue_depth);
-	lpfc_hba_log_verbose_init(phba, lpfc_log_verbose);
 	lpfc_aer_support_init(phba, lpfc_aer_support);
 	lpfc_sriov_nr_virtfn_init(phba, lpfc_sriov_nr_virtfn);
 	lpfc_request_firmware_upgrade_init(phba, lpfc_req_fw_upgrade);

--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
--- a/drivers/scsi/lpfc/lpfc_sli4.h
+++ b/drivers/scsi/lpfc/lpfc_sli4.h
@@ -41,8 +41,13 @@

 /* Multi-queue arrangement for FCP EQ/CQ/WQ tuples */
 #define LPFC_HBA_HDWQ_MIN	0
-#define LPFC_HBA_HDWQ_MAX	128
-#define LPFC_HBA_HDWQ_DEF	0
+#define LPFC_HBA_HDWQ_MAX	256
+#define LPFC_HBA_HDWQ_DEF	LPFC_HBA_HDWQ_MIN
+
+/* irq_chann range, values */
+#define LPFC_IRQ_CHANN_MIN	0
+#define LPFC_IRQ_CHANN_MAX	256
+#define LPFC_IRQ_CHANN_DEF	LPFC_IRQ_CHANN_MIN

 /* FCP MQ queue count limiting */
 #define LPFC_FCP_MQ_THRESHOLD_MIN	0
@@ -467,11 +472,17 @@ struct lpfc_hba;
 #define LPFC_SLI4_HANDLER_NAME_SZ	16
 struct lpfc_hba_eq_hdl {
 	uint32_t idx;
+	uint16_t irq;
 	char handler_name[LPFC_SLI4_HANDLER_NAME_SZ];
 	struct lpfc_hba *phba;
 	struct lpfc_queue *eq;
+	struct cpumask aff_mask;
 };

+#define lpfc_get_eq_hdl(eqidx) (&phba->sli4_hba.hba_eq_hdl[eqidx])
+#define lpfc_get_aff_mask(eqidx) (&phba->sli4_hba.hba_eq_hdl[eqidx].aff_mask)
+#define lpfc_get_irq(eqidx) (phba->sli4_hba.hba_eq_hdl[eqidx].irq)
+
 /*BB Credit recovery value*/
 struct lpfc_bbscn_params {
 	uint32_t word0;
@@ -561,11 +572,10 @@ struct lpfc_sli4_lnk_info {
 #define LPFC_SLI4_HANDLER_CNT		(LPFC_HBA_IO_CHAN_MAX+ \
 					 LPFC_FOF_IO_CHAN_NUM)

-/* Used for IRQ vector to CPU mapping */
+/* Used for tracking CPU mapping attributes */
 struct lpfc_vector_map_info {
 	uint16_t	phys_id;
 	uint16_t	core_id;
-	uint16_t	irq;
 	uint16_t	eq;
 	uint16_t	hdwq;
 	uint16_t	flag;
@@ -908,6 +918,7 @@ struct lpfc_sli4_hba {
 	struct lpfc_vector_map_info *cpu_map;
 	uint16_t num_possible_cpu;
 	uint16_t num_present_cpu;
+	struct cpumask numa_mask;
 	uint16_t curr_disp_cpu;
 	struct lpfc_eq_intr_info __percpu *eq_info;
 	uint32_t conf_trunk;