Commit 78611565 authored by Mark Brown's avatar Mark Brown

spi: add support for pre-cooking messages

Merge series from David Lechner <dlechner@baylibre.com>:

This is a follow-up to [1] where it was suggested to break down the
proposed SPI offload support into smaller series.

This takes on the first suggested task of introducing an API to
"pre-cook" SPI messages. This idea was first discussed extensively in
2013 [2][3] and revisited more briefly in 2022 [4].

The goal here is to be able to improve performance (higher throughput,
and reduced CPU usage) by allowing peripheral drivers that use the
same struct spi_message repeatedly to "pre-cook" the message once to
avoid repeating the same validation, and possibly other operations each
time the message is sent.

This series includes __spi_validate() and the automatic splitting of
xfers in the optimizations. Another frequently suggested optimization
is doing DMA mapping only once. This is not included in this series, but
can be added later (preferably by someone with a real use case for it).

To show how this all works and get some real-world measurements, this
series includes the core changes, optimization of a SPI controller
driver, and optimization of an ADC driver. This test case was only able
to take advantage of the single validation optimization, since it didn't
require splitting transfers. With these changes, CPU usage of the
threaded interrupt handler, which calls spi_sync(), was reduced from
83% to 73% while at the same time the sample rate (frequency of SPI
xfers) was increased from 20kHz to 25kHz.

[1]: https://lore.kernel.org/linux-spi/20240109-axi-spi-engine-series-3-v1-1-e42c6a986580@baylibre.com/T/
[2]: https://lore.kernel.org/linux-spi/E81F4810-48DD-41EE-B110-D0D848B8A510@martin.sperl.org/T/
[3]: https://lore.kernel.org/linux-spi/39DEC004-10A1-47EF-9D77-276188D2580C@martin.sperl.org/T/
[4]: https://lore.kernel.org/linux-spi/20220525163946.48ea40c9@erd992/T/
parents e63aef9c 7dba2adb
...@@ -109,6 +109,7 @@ struct spi_engine { ...@@ -109,6 +109,7 @@ struct spi_engine {
spinlock_t lock; spinlock_t lock;
void __iomem *base; void __iomem *base;
struct spi_engine_message_state msg_state;
struct completion msg_complete; struct completion msg_complete;
unsigned int int_enable; unsigned int int_enable;
}; };
...@@ -499,17 +500,11 @@ static irqreturn_t spi_engine_irq(int irq, void *devid) ...@@ -499,17 +500,11 @@ static irqreturn_t spi_engine_irq(int irq, void *devid)
return IRQ_HANDLED; return IRQ_HANDLED;
} }
static int spi_engine_prepare_message(struct spi_controller *host, static int spi_engine_optimize_message(struct spi_message *msg)
struct spi_message *msg)
{ {
struct spi_engine_program p_dry, *p; struct spi_engine_program p_dry, *p;
struct spi_engine_message_state *st;
size_t size; size_t size;
st = kzalloc(sizeof(*st), GFP_KERNEL);
if (!st)
return -ENOMEM;
spi_engine_precompile_message(msg); spi_engine_precompile_message(msg);
p_dry.length = 0; p_dry.length = 0;
...@@ -517,31 +512,22 @@ static int spi_engine_prepare_message(struct spi_controller *host, ...@@ -517,31 +512,22 @@ static int spi_engine_prepare_message(struct spi_controller *host,
size = sizeof(*p->instructions) * (p_dry.length + 1); size = sizeof(*p->instructions) * (p_dry.length + 1);
p = kzalloc(sizeof(*p) + size, GFP_KERNEL); p = kzalloc(sizeof(*p) + size, GFP_KERNEL);
if (!p) { if (!p)
kfree(st);
return -ENOMEM; return -ENOMEM;
}
spi_engine_compile_message(msg, false, p); spi_engine_compile_message(msg, false, p);
spi_engine_program_add_cmd(p, false, SPI_ENGINE_CMD_SYNC( spi_engine_program_add_cmd(p, false, SPI_ENGINE_CMD_SYNC(
AXI_SPI_ENGINE_CUR_MSG_SYNC_ID)); AXI_SPI_ENGINE_CUR_MSG_SYNC_ID));
st->p = p; msg->opt_state = p;
st->cmd_buf = p->instructions;
st->cmd_length = p->length;
msg->state = st;
return 0; return 0;
} }
static int spi_engine_unprepare_message(struct spi_controller *host, static int spi_engine_unoptimize_message(struct spi_message *msg)
struct spi_message *msg)
{ {
struct spi_engine_message_state *st = msg->state; kfree(msg->opt_state);
kfree(st->p);
kfree(st);
return 0; return 0;
} }
...@@ -550,10 +536,18 @@ static int spi_engine_transfer_one_message(struct spi_controller *host, ...@@ -550,10 +536,18 @@ static int spi_engine_transfer_one_message(struct spi_controller *host,
struct spi_message *msg) struct spi_message *msg)
{ {
struct spi_engine *spi_engine = spi_controller_get_devdata(host); struct spi_engine *spi_engine = spi_controller_get_devdata(host);
struct spi_engine_message_state *st = msg->state; struct spi_engine_message_state *st = &spi_engine->msg_state;
struct spi_engine_program *p = msg->opt_state;
unsigned int int_enable = 0; unsigned int int_enable = 0;
unsigned long flags; unsigned long flags;
/* reinitialize message state for this transfer */
memset(st, 0, sizeof(*st));
st->p = p;
st->cmd_buf = p->instructions;
st->cmd_length = p->length;
msg->state = st;
reinit_completion(&spi_engine->msg_complete); reinit_completion(&spi_engine->msg_complete);
spin_lock_irqsave(&spi_engine->lock, flags); spin_lock_irqsave(&spi_engine->lock, flags);
...@@ -658,8 +652,8 @@ static int spi_engine_probe(struct platform_device *pdev) ...@@ -658,8 +652,8 @@ static int spi_engine_probe(struct platform_device *pdev)
host->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 32); host->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 32);
host->max_speed_hz = clk_get_rate(spi_engine->ref_clk) / 2; host->max_speed_hz = clk_get_rate(spi_engine->ref_clk) / 2;
host->transfer_one_message = spi_engine_transfer_one_message; host->transfer_one_message = spi_engine_transfer_one_message;
host->prepare_message = spi_engine_prepare_message; host->optimize_message = spi_engine_optimize_message;
host->unprepare_message = spi_engine_unprepare_message; host->unoptimize_message = spi_engine_unoptimize_message;
host->num_chipselect = 8; host->num_chipselect = 8;
if (host->max_speed_hz == 0) if (host->max_speed_hz == 0)
......
...@@ -1118,6 +1118,21 @@ static irqreturn_t stm32h7_spi_irq_thread(int irq, void *dev_id) ...@@ -1118,6 +1118,21 @@ static irqreturn_t stm32h7_spi_irq_thread(int irq, void *dev_id)
return IRQ_HANDLED; return IRQ_HANDLED;
} }
/*
 * stm32_spi_optimize_message - one-time (pre-cooking) optimization hook
 * @msg: SPI message to optimize
 *
 * Called once per message via the controller's ->optimize_message callback
 * so the splitting work is not repeated on every transfer of a reused
 * message.
 *
 * Returns 0 on success or a negative errno from
 * spi_split_transfers_maxwords().
 */
static int stm32_spi_optimize_message(struct spi_message *msg)
{
	struct spi_controller *ctrl = msg->spi->controller;
	struct stm32_spi *spi = spi_controller_get_devdata(ctrl);

	/* On STM32H7, messages should not exceed a maximum size set
	 * later via the set_number_of_data function. In order to
	 * ensure that, split large messages into several messages
	 */
	if (spi->cfg->set_number_of_data)
		return spi_split_transfers_maxwords(ctrl, msg, spi->t_size_max);

	return 0;
}
/** /**
* stm32_spi_prepare_msg - set up the controller to transfer a single message * stm32_spi_prepare_msg - set up the controller to transfer a single message
* @ctrl: controller interface * @ctrl: controller interface
...@@ -1163,18 +1178,6 @@ static int stm32_spi_prepare_msg(struct spi_controller *ctrl, ...@@ -1163,18 +1178,6 @@ static int stm32_spi_prepare_msg(struct spi_controller *ctrl,
!!(spi_dev->mode & SPI_LSB_FIRST), !!(spi_dev->mode & SPI_LSB_FIRST),
!!(spi_dev->mode & SPI_CS_HIGH)); !!(spi_dev->mode & SPI_CS_HIGH));
/* On STM32H7, messages should not exceed a maximum size setted
* afterward via the set_number_of_data function. In order to
* ensure that, split large messages into several messages
*/
if (spi->cfg->set_number_of_data) {
int ret;
ret = spi_split_transfers_maxwords(ctrl, msg, spi->t_size_max);
if (ret)
return ret;
}
spin_lock_irqsave(&spi->lock, flags); spin_lock_irqsave(&spi->lock, flags);
/* CPOL, CPHA and LSB FIRST bits have common register */ /* CPOL, CPHA and LSB FIRST bits have common register */
...@@ -2180,6 +2183,7 @@ static int stm32_spi_probe(struct platform_device *pdev) ...@@ -2180,6 +2183,7 @@ static int stm32_spi_probe(struct platform_device *pdev)
ctrl->max_speed_hz = spi->clk_rate / spi->cfg->baud_rate_div_min; ctrl->max_speed_hz = spi->clk_rate / spi->cfg->baud_rate_div_min;
ctrl->min_speed_hz = spi->clk_rate / spi->cfg->baud_rate_div_max; ctrl->min_speed_hz = spi->clk_rate / spi->cfg->baud_rate_div_max;
ctrl->use_gpio_descriptors = true; ctrl->use_gpio_descriptors = true;
ctrl->optimize_message = stm32_spi_optimize_message;
ctrl->prepare_message = stm32_spi_prepare_msg; ctrl->prepare_message = stm32_spi_prepare_msg;
ctrl->transfer_one = stm32_spi_transfer_one; ctrl->transfer_one = stm32_spi_transfer_one;
ctrl->unprepare_message = stm32_spi_unprepare_msg; ctrl->unprepare_message = stm32_spi_unprepare_msg;
......
This diff is collapsed.
...@@ -475,6 +475,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch ...@@ -475,6 +475,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
* *
* @set_cs: set the logic level of the chip select line. May be called * @set_cs: set the logic level of the chip select line. May be called
* from interrupt context. * from interrupt context.
* @optimize_message: optimize the message for reuse
* @unoptimize_message: release resources allocated by optimize_message
* @prepare_message: set up the controller to transfer a single message, * @prepare_message: set up the controller to transfer a single message,
* for example doing DMA mapping. Called from threaded * for example doing DMA mapping. Called from threaded
* context. * context.
...@@ -715,6 +717,8 @@ struct spi_controller { ...@@ -715,6 +717,8 @@ struct spi_controller {
struct completion xfer_completion; struct completion xfer_completion;
size_t max_dma_len; size_t max_dma_len;
int (*optimize_message)(struct spi_message *msg);
int (*unoptimize_message)(struct spi_message *msg);
int (*prepare_transfer_hardware)(struct spi_controller *ctlr); int (*prepare_transfer_hardware)(struct spi_controller *ctlr);
int (*transfer_one_message)(struct spi_controller *ctlr, int (*transfer_one_message)(struct spi_controller *ctlr,
struct spi_message *mesg); struct spi_message *mesg);
...@@ -1111,6 +1115,8 @@ struct spi_transfer { ...@@ -1111,6 +1115,8 @@ struct spi_transfer {
* @spi: SPI device to which the transaction is queued * @spi: SPI device to which the transaction is queued
* @is_dma_mapped: if true, the caller provided both DMA and CPU virtual * @is_dma_mapped: if true, the caller provided both DMA and CPU virtual
* addresses for each transfer buffer * addresses for each transfer buffer
* @pre_optimized: peripheral driver pre-optimized the message
* @optimized: the message is in the optimized state
 * @prepared: spi_prepare_message() was called for this message * @prepared: spi_prepare_message() was called for this message
* @status: zero for success, else negative errno * @status: zero for success, else negative errno
* @complete: called to report transaction completions * @complete: called to report transaction completions
...@@ -1120,6 +1126,7 @@ struct spi_transfer { ...@@ -1120,6 +1126,7 @@ struct spi_transfer {
* successful segments * successful segments
* @queue: for use by whichever driver currently owns the message * @queue: for use by whichever driver currently owns the message
* @state: for use by whichever driver currently owns the message * @state: for use by whichever driver currently owns the message
* @opt_state: for use by whichever driver currently owns the message
* @resources: for resource management when the SPI message is processed * @resources: for resource management when the SPI message is processed
* *
* A @spi_message is used to execute an atomic sequence of data transfers, * A @spi_message is used to execute an atomic sequence of data transfers,
...@@ -1143,6 +1150,11 @@ struct spi_message { ...@@ -1143,6 +1150,11 @@ struct spi_message {
unsigned is_dma_mapped:1; unsigned is_dma_mapped:1;
/* spi_optimize_message() was called for this message */
bool pre_optimized;
/* __spi_optimize_message() was called for this message */
bool optimized;
/* spi_prepare_message() was called for this message */ /* spi_prepare_message() was called for this message */
bool prepared; bool prepared;
...@@ -1172,6 +1184,11 @@ struct spi_message { ...@@ -1172,6 +1184,11 @@ struct spi_message {
*/ */
struct list_head queue; struct list_head queue;
void *state; void *state;
/*
* Optional state for use by controller driver between calls to
* __spi_optimize_message() and __spi_unoptimize_message().
*/
void *opt_state;
/* List of spi_res resources when the SPI message is processed */ /* List of spi_res resources when the SPI message is processed */
struct list_head resources; struct list_head resources;
...@@ -1255,6 +1272,9 @@ static inline void spi_message_free(struct spi_message *m) ...@@ -1255,6 +1272,9 @@ static inline void spi_message_free(struct spi_message *m)
kfree(m); kfree(m);
} }
extern int spi_optimize_message(struct spi_device *spi, struct spi_message *msg);
extern void spi_unoptimize_message(struct spi_message *msg);
extern int spi_setup(struct spi_device *spi); extern int spi_setup(struct spi_device *spi);
extern int spi_async(struct spi_device *spi, struct spi_message *message); extern int spi_async(struct spi_device *spi, struct spi_message *message);
extern int spi_slave_abort(struct spi_device *spi); extern int spi_slave_abort(struct spi_device *spi);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment