Unverified Commit faa8e404 authored by Carlos Song's avatar Carlos Song Committed by Mark Brown
Browse files

spi: imx: support dynamic burst length for ECSPI DMA mode



ECSPI transfers only one word per frame in DMA mode, causing SCLK stalls
between words due to BURST_LENGTH updates, which significantly impacts
performance.

To improve throughput, configure BURST_LENGTH as large as possible (up to
512 bytes per frame) instead of word length. This avoids delays between
words. When transfer length is not 4-byte aligned, use bounce buffers to
align data for DMA. TX uses aligned words for TXFIFO, while RX trims DMA
buffer data after transfer completion.

Introduce a new dma_package structure to store:
  1. BURST_LENGTH values for each DMA request
  2. Variables for DMA submission
  3. DMA transmission length and actual data length

Handle three cases:
  - len <= 512 bytes: one package, BURST_LENGTH = len * 8 - 1
  - len > 512 and aligned: one package, BURST_LENGTH = max (512 bytes)
  - len > 512 and unaligned: two packages, second for tail data

Performance test (spidev_test @10MHz, 4KB):
  Before: tx/rx ~6651.9 kbps
  After:  tx/rx ~9922.2 kbps (~50% improvement)

For compatibility with slow SPI devices, add configurable word delay in
DMA mode. When word delay is set, dynamic burst is disabled and
BURST_LENGTH equals word length.

Signed-off-by: Carlos Song <carlos.song@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://patch.msgid.link/20251203085949.2922166-6-carlos.song@nxp.com


Signed-off-by: Mark Brown <broonie@kernel.org>
parent a450c8b7
Loading
Loading
Loading
Loading
+377 −36
Original line number Diff line number Diff line
@@ -60,6 +60,7 @@ MODULE_PARM_DESC(polling_limit_us,
#define MX51_ECSPI_CTRL_MAX_BURST	512
/* The maximum bytes that IMX53_ECSPI can transfer in target mode.*/
#define MX53_MAX_TRANSFER_BYTES		512
#define BYTES_PER_32BITS_WORD		4

enum spi_imx_devtype {
	IMX1_CSPI,
@@ -95,6 +96,16 @@ struct spi_imx_devtype_data {
	enum spi_imx_devtype devtype;
};

/**
 * struct dma_data_package - one DMA request of an ECSPI transfer
 * @cmd_word:    value written to MX51_ECSPI_CTRL before this package is
 *               transferred; it carries the BURST_LENGTH for the package
 * @dma_rx_buf:  kzalloc'ed bounce buffer that the RX DMA fills
 * @dma_tx_buf:  kzalloc'ed bounce buffer that the TX DMA drains
 * @dma_tx_addr: DMA address returned by dma_map_single() for @dma_tx_buf
 * @dma_rx_addr: DMA address returned by dma_map_single() for @dma_rx_buf
 * @dma_len:     number of bytes handed to the DMA engine; equals @data_len
 *               when word delay is used, otherwise @data_len rounded up to a
 *               multiple of 4 bytes
 * @data_len:    number of payload bytes copied from the caller's tx_buf and
 *               back into the caller's rx_buf
 */
struct dma_data_package {
	u32 cmd_word;
	void *dma_rx_buf;
	void *dma_tx_buf;
	dma_addr_t dma_tx_addr;
	dma_addr_t dma_rx_addr;
	int dma_len;
	int data_len;
};

struct spi_imx_data {
	struct spi_controller *controller;
	struct device *dev;
@@ -130,6 +141,9 @@ struct spi_imx_data {
	u32 wml;
	struct completion dma_rx_completion;
	struct completion dma_tx_completion;
	size_t dma_package_num;
	struct dma_data_package *dma_data;
	int rx_offset;

	const struct spi_imx_devtype_data *devtype_data;
};
@@ -189,6 +203,9 @@ MXC_SPI_BUF_TX(u16)
MXC_SPI_BUF_RX(u32)
MXC_SPI_BUF_TX(u32)

/*
 * Round a DMA mapping length up to a multiple of dma_get_cache_alignment(),
 * i.e. align to the cache line, to avoid swiotlb bouncing.
 */
#define DMA_CACHE_ALIGNED_LEN(x) ALIGN((x), dma_get_cache_alignment())

/* First entry is reserved, second entry is valid only if SDHC_SPIEN is set
 * (which is currently not the case in this driver)
 */
@@ -253,6 +270,14 @@ static bool spi_imx_can_dma(struct spi_controller *controller, struct spi_device
	if (transfer->len < spi_imx->devtype_data->fifo_size)
		return false;

	/* DMA only can transmit data in bytes */
	if (spi_imx->bits_per_word != 8 && spi_imx->bits_per_word != 16 &&
	    spi_imx->bits_per_word != 32)
		return false;

	if (transfer->len >= MAX_SDMA_BD_BYTES)
		return false;

	spi_imx->dynamic_burst = 0;

	return true;
@@ -1398,8 +1423,6 @@ static int spi_imx_sdma_init(struct device *dev, struct spi_imx_data *spi_imx,

	init_completion(&spi_imx->dma_rx_completion);
	init_completion(&spi_imx->dma_tx_completion);
	controller->can_dma = spi_imx_can_dma;
	controller->max_dma_len = MAX_SDMA_BD_BYTES;
	spi_imx->controller->flags = SPI_CONTROLLER_MUST_RX |
					 SPI_CONTROLLER_MUST_TX;

@@ -1437,10 +1460,259 @@ static int spi_imx_calculate_timeout(struct spi_imx_data *spi_imx, int size)
	return secs_to_jiffies(2 * timeout);
}

static void spi_imx_dma_unmap(struct spi_imx_data *spi_imx,
			      struct dma_data_package *dma_data)
{
	struct device *tx_dev = spi_imx->controller->dma_tx->device->dev;
	struct device *rx_dev = spi_imx->controller->dma_rx->device->dev;

	dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
			 DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
			 DMA_TO_DEVICE);
	dma_unmap_single(rx_dev, dma_data->dma_rx_addr,
			 DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
			 DMA_FROM_DEVICE);
}

static void spi_imx_dma_rx_data_handle(struct spi_imx_data *spi_imx,
				       struct dma_data_package *dma_data, void *rx_buf,
				       bool word_delay)
{
	void *copy_ptr;
	int unaligned;

	/*
	 * On little-endian CPUs, adjust byte order:
	 * - Swap bytes when bpw = 8
	 * - Swap half-words when bpw = 16
	 * This ensures correct data ordering for DMA transfers.
	 */
#ifdef __LITTLE_ENDIAN
	if (!word_delay) {
		unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
		u32 *temp = dma_data->dma_rx_buf;

		for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, sizeof(*temp)); i++) {
			if (bytes_per_word == 1)
				swab32s(temp + i);
			else if (bytes_per_word == 2)
				swahw32s(temp + i);
		}
	}
#endif

	/*
	 * When dynamic burst enabled, DMA RX always receives 32-bit words from RXFIFO with
	 * buswidth = 4, but when data_len is not 4-bytes alignment, the RM shows when
	 * burst length = 32*n + m bits, a SPI burst contains the m LSB in first word and all
	 * 32 bits in other n words. So if garbage bytes in the first word, trim first word then
	 * copy the actual data to rx_buf.
	 */
	if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
		unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
		copy_ptr = (u8 *)dma_data->dma_rx_buf + BYTES_PER_32BITS_WORD - unaligned;
	} else {
		copy_ptr = dma_data->dma_rx_buf;
	}

	memcpy(rx_buf, copy_ptr, dma_data->data_len);
}

/*
 * Create the streaming DMA mappings for one package.
 *
 * Maps dma_tx_buf for DMA_TO_DEVICE and dma_rx_buf for DMA_FROM_DEVICE, each
 * over the cache-aligned dma_len, and stores the resulting addresses in
 * @dma_data. If the RX mapping fails the TX mapping is undone again.
 *
 * Returns 0 on success or the negative value reported by
 * dma_mapping_error().
 */
static int spi_imx_dma_map(struct spi_imx_data *spi_imx,
			   struct dma_data_package *dma_data)
{
	struct device *tx_dev = spi_imx->controller->dma_tx->device->dev;
	struct device *rx_dev = spi_imx->controller->dma_rx->device->dev;
	size_t map_len = DMA_CACHE_ALIGNED_LEN(dma_data->dma_len);
	int err;

	dma_data->dma_tx_addr = dma_map_single(tx_dev, dma_data->dma_tx_buf,
					       map_len, DMA_TO_DEVICE);
	err = dma_mapping_error(tx_dev, dma_data->dma_tx_addr);
	if (err < 0) {
		dev_err(spi_imx->dev, "DMA TX map failed %d\n", err);
		return err;
	}

	dma_data->dma_rx_addr = dma_map_single(rx_dev, dma_data->dma_rx_buf,
					       map_len, DMA_FROM_DEVICE);
	err = dma_mapping_error(rx_dev, dma_data->dma_rx_addr);
	if (err >= 0)
		return 0;

	/* RX failed: do not leave the TX buffer mapped behind us. */
	dev_err(spi_imx->dev, "DMA RX map failed %d\n", err);
	dma_unmap_single(tx_dev, dma_data->dma_tx_addr, map_len, DMA_TO_DEVICE);

	return err;
}

static int spi_imx_dma_tx_data_handle(struct spi_imx_data *spi_imx,
				      struct dma_data_package *dma_data,
				      const void *tx_buf,
				      bool word_delay)
{
	void *copy_ptr;
	int unaligned;

	if (word_delay) {
		dma_data->dma_len = dma_data->data_len;
	} else {
		/*
		 * As per the reference manual, when burst length = 32*n + m bits, ECSPI
		 * sends m LSB bits in the first word, followed by n full 32-bit words.
		 * Since actual data may not be 4-byte aligned, allocate DMA TX/RX buffers
		 * to ensure alignment. For TX, DMA pushes 4-byte aligned words to TXFIFO,
		 * while ECSPI uses BURST_LENGTH settings to maintain correct bit count.
		 * For RX, DMA always receives 32-bit words from RXFIFO, when data len is
		 * not 4-byte aligned, trim the first word to drop garbage bytes, then group
		 * all transfer DMA bounse buffer and copy all valid data to rx_buf.
		 */
		dma_data->dma_len = ALIGN(dma_data->data_len, BYTES_PER_32BITS_WORD);
	}

	dma_data->dma_tx_buf = kzalloc(dma_data->dma_len, GFP_KERNEL);
	if (!dma_data->dma_tx_buf)
		return -ENOMEM;

	dma_data->dma_rx_buf = kzalloc(dma_data->dma_len, GFP_KERNEL);
	if (!dma_data->dma_rx_buf) {
		kfree(dma_data->dma_tx_buf);
		return -ENOMEM;
	}

	if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
		unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
		copy_ptr = (u8 *)dma_data->dma_tx_buf + BYTES_PER_32BITS_WORD - unaligned;
	} else {
		copy_ptr = dma_data->dma_tx_buf;
	}

	memcpy(copy_ptr, tx_buf, dma_data->data_len);

	/*
	 * When word_delay is enabled, DMA transfers an entire word in one minor loop.
	 * In this case, no data requires additional handling.
	 */
	if (word_delay)
		return 0;

#ifdef __LITTLE_ENDIAN
	/*
	 * On little-endian CPUs, adjust byte order:
	 * - Swap bytes when bpw = 8
	 * - Swap half-words when bpw = 16
	 * This ensures correct data ordering for DMA transfers.
	 */
	unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
	u32 *temp = dma_data->dma_tx_buf;

	for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, sizeof(*temp)); i++) {
		if (bytes_per_word == 1)
			swab32s(temp + i);
		else if (bytes_per_word == 2)
			swahw32s(temp + i);
	}
#endif

	return 0;
}

/*
 * Split @transfer into one or two DMA packages and prepare their buffers.
 *
 * On success spi_imx->dma_package_num and spi_imx->dma_data describe the
 * packages; each package has its CTRL word (with BURST_LENGTH filled in),
 * its data_len and its allocated bounce buffers, with the TX data already
 * copied in. The caller owns and must free the bounce buffers and the
 * package array.
 *
 * Package layout:
 *   - word delay requested:     one package, BURST_LENGTH = bits_per_word - 1
 *   - len <= 512 bytes:         one package, BURST_LENGTH = len * 8 - 1
 *   - len > 512, 512-aligned:   one package, BURST_LENGTH = 512 * 8 - 1
 *   - len > 512, not aligned:   two packages, the second carries the tail
 *
 * Returns 0 on success or a negative errno. On failure everything allocated
 * here has been freed and spi_imx->dma_data is reset to NULL.
 */
static int spi_imx_dma_data_prepare(struct spi_imx_data *spi_imx,
				    struct spi_transfer *transfer,
				    bool word_delay)
{
	u32 pre_bl, tail_bl = 0;
	u32 ctrl;
	int ret;

	/*
	 * ECSPI supports a maximum burst of 512 bytes. When xfer->len exceeds 512
	 * and is not a multiple of 512, a tail transfer is required. BURST_LENGTH
	 * is used by the SPI HW to maintain the correct bit count, so it has to
	 * follow the data length. Once a DMA request has been submitted the
	 * BURST_LENGTH can no longer be updated; in that case the transfer is
	 * split into two packages and the register is rewritten before the
	 * second DMA transfer is set up.
	 */
	ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL);
	if (word_delay) {
		/*
		 * The Sample Period Control Register (ECSPI_PERIODREG) lets
		 * software insert delays (wait states) between consecutive SPI
		 * transfers. To get a delay between words, ECSPI can therefore
		 * only transfer one word per frame; the delay occurs between
		 * frames.
		 */
		spi_imx->dma_package_num = 1;
		pre_bl = spi_imx->bits_per_word - 1;
	} else if (transfer->len <= MX51_ECSPI_CTRL_MAX_BURST) {
		spi_imx->dma_package_num = 1;
		pre_bl = transfer->len * BITS_PER_BYTE - 1;
	} else if (!(transfer->len % MX51_ECSPI_CTRL_MAX_BURST)) {
		spi_imx->dma_package_num = 1;
		pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
	} else {
		spi_imx->dma_package_num = 2;
		pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
		tail_bl = (transfer->len % MX51_ECSPI_CTRL_MAX_BURST) * BITS_PER_BYTE - 1;
	}

	spi_imx->dma_data = kmalloc_array(spi_imx->dma_package_num,
					  sizeof(struct dma_data_package),
					  GFP_KERNEL | __GFP_ZERO);
	if (!spi_imx->dma_data) {
		dev_err(spi_imx->dev, "Failed to allocate DMA package buffer!\n");
		return -ENOMEM;
	}

	if (spi_imx->dma_package_num == 1) {
		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
		ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
		spi_imx->dma_data[0].cmd_word = ctrl;
		spi_imx->dma_data[0].data_len = transfer->len;
		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
						 word_delay);
		if (ret)
			goto err_free_packages;

		return 0;
	}

	/* First package: all complete 512-byte bursts. */
	ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
	ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
	spi_imx->dma_data[0].cmd_word = ctrl;
	spi_imx->dma_data[0].data_len = round_down(transfer->len,
						   MX51_ECSPI_CTRL_MAX_BURST);
	ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
					 false);
	if (ret)
		goto err_free_packages;

	/* Second package: the remaining tail, with its own BURST_LENGTH. */
	ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
	ctrl |= tail_bl << MX51_ECSPI_CTRL_BL_OFFSET;
	spi_imx->dma_data[1].cmd_word = ctrl;
	spi_imx->dma_data[1].data_len = transfer->len % MX51_ECSPI_CTRL_MAX_BURST;
	ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[1],
					 transfer->tx_buf + spi_imx->dma_data[0].data_len,
					 false);
	if (ret)
		goto err_free_first_bufs;

	return 0;

err_free_first_bufs:
	kfree(spi_imx->dma_data[0].dma_tx_buf);
	kfree(spi_imx->dma_data[0].dma_rx_buf);
err_free_packages:
	kfree(spi_imx->dma_data);
	/*
	 * Report the failure to the caller: returning success here would let it
	 * map, use and free the memory released above.
	 */
	spi_imx->dma_data = NULL;
	return ret;
}

static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
			      struct dma_data_package *dma_data,
			      struct spi_transfer *transfer)
{
	struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg;
	struct spi_controller *controller = spi_imx->controller;
	struct dma_async_tx_descriptor *desc_tx, *desc_rx;
	unsigned long transfer_timeout;
@@ -1451,8 +1723,8 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
	 * The TX DMA setup starts the transfer, so make sure RX is configured
	 * before TX.
	 */
	desc_rx = dmaengine_prep_slave_sg(controller->dma_rx,
					  rx->sgl, rx->nents, DMA_DEV_TO_MEM,
	desc_rx = dmaengine_prep_slave_single(controller->dma_rx, dma_data->dma_rx_addr,
					      dma_data->dma_len, DMA_DEV_TO_MEM,
					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
	if (!desc_rx) {
		transfer->error |= SPI_TRANS_FAIL_NO_START;
@@ -1471,8 +1743,8 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
	reinit_completion(&spi_imx->dma_rx_completion);
	dma_async_issue_pending(controller->dma_rx);

	desc_tx = dmaengine_prep_slave_sg(controller->dma_tx,
					  tx->sgl, tx->nents, DMA_MEM_TO_DEV,
	desc_tx = dmaengine_prep_slave_single(controller->dma_tx, dma_data->dma_tx_addr,
					      dma_data->dma_len, DMA_MEM_TO_DEV,
					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
	if (!desc_tx)
		goto dmaengine_terminate_rx;
@@ -1521,16 +1793,16 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
}

static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
				     struct spi_transfer *transfer)
				     struct dma_data_package *dma_data,
				     bool word_delay)
{
	struct sg_table *rx = &transfer->rx_sg;
	struct scatterlist *last_sg = sg_last(rx->sgl, rx->nents);
	unsigned int bytes_per_word, i;
	unsigned int bytes_per_word = word_delay ?
				      spi_imx_bytes_per_word(spi_imx->bits_per_word) :
				      BYTES_PER_32BITS_WORD;
	unsigned int i;

	/* Get the right burst length from the last sg to ensure no tail data */
	bytes_per_word = spi_imx_bytes_per_word(transfer->bits_per_word);
	for (i = spi_imx->devtype_data->fifo_size / 2; i > 0; i--) {
		if (!(sg_dma_len(last_sg) % (i * bytes_per_word)))
		if (!dma_data->dma_len % (i * bytes_per_word))
			break;
	}
	/* Use 1 as wml in case no available burst length got */
@@ -1540,13 +1812,14 @@ static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
	spi_imx->wml = i;
}

static int spi_imx_dma_configure(struct spi_controller *controller)
static int spi_imx_dma_configure(struct spi_controller *controller, bool word_delay)
{
	int ret;
	enum dma_slave_buswidth buswidth;
	struct dma_slave_config rx = {}, tx = {};
	struct spi_imx_data *spi_imx = spi_controller_get_devdata(controller);

	if (word_delay) {
		switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
		case 4:
			buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
@@ -1560,6 +1833,9 @@ static int spi_imx_dma_configure(struct spi_controller *controller)
		default:
			return -EINVAL;
		}
	} else {
		buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
	}

	tx.direction = DMA_MEM_TO_DEV;
	tx.dst_addr = spi_imx->base_phys + MXC_CSPITXDATA;
@@ -1584,15 +1860,17 @@ static int spi_imx_dma_configure(struct spi_controller *controller)
	return 0;
}

static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
				struct spi_transfer *transfer)
static int spi_imx_dma_package_transfer(struct spi_imx_data *spi_imx,
					struct dma_data_package *dma_data,
					struct spi_transfer *transfer,
					bool word_delay)
{
	struct spi_controller *controller = spi_imx->controller;
	int ret;

	spi_imx_dma_max_wml_find(spi_imx, transfer);
	spi_imx_dma_max_wml_find(spi_imx, dma_data, word_delay);

	ret = spi_imx_dma_configure(controller);
	ret = spi_imx_dma_configure(controller, word_delay);
	if (ret)
		goto dma_failure_no_start;

@@ -1603,10 +1881,17 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
	}
	spi_imx->devtype_data->setup_wml(spi_imx);

	ret = spi_imx_dma_submit(spi_imx, transfer);
	ret = spi_imx_dma_submit(spi_imx, dma_data, transfer);
	if (ret)
		return ret;

	/* Trim the DMA RX buffer and copy the actual data to rx_buf */
	dma_sync_single_for_cpu(controller->dma_rx->device->dev, dma_data->dma_rx_addr,
				dma_data->dma_len, DMA_FROM_DEVICE);
	spi_imx_dma_rx_data_handle(spi_imx, dma_data, transfer->rx_buf + spi_imx->rx_offset,
				   word_delay);
	spi_imx->rx_offset += dma_data->data_len;

	return 0;
/* fallback to pio */
dma_failure_no_start:
@@ -1614,6 +1899,57 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
	return ret;
}

static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
				struct spi_transfer *transfer)
{
	bool word_delay = transfer->word_delay.value != 0;
	int ret;
	int i;

	ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
	if (ret < 0) {
		transfer->error |= SPI_TRANS_FAIL_NO_START;
		dev_err(spi_imx->dev, "DMA data prepare fail\n");
		goto fallback_pio;
	}

	spi_imx->rx_offset = 0;

	/* Each dma_package performs a separate DMA transfer once */
	for (i = 0; i < spi_imx->dma_package_num; i++) {
		ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
		if (ret < 0) {
			if (i == 0)
				transfer->error |= SPI_TRANS_FAIL_NO_START;
			dev_err(spi_imx->dev, "DMA map fail\n");
			break;
		}

		/* Update the CTRL register BL field */
		writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);

		ret = spi_imx_dma_package_transfer(spi_imx, &spi_imx->dma_data[i],
						   transfer, word_delay);

		/* Whether the dma transmission is successful or not, dma unmap is necessary */
		spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);

		if (ret < 0) {
			dev_dbg(spi_imx->dev, "DMA %d transfer not really finish\n", i);
			break;
		}
	}

	for (int j = 0; j < spi_imx->dma_package_num; j++) {
		kfree(spi_imx->dma_data[j].dma_tx_buf);
		kfree(spi_imx->dma_data[j].dma_rx_buf);
	}
	kfree(spi_imx->dma_data);

fallback_pio:
	return ret;
}

static int spi_imx_pio_transfer(struct spi_device *spi,
				struct spi_transfer *transfer)
{
@@ -1780,9 +2116,14 @@ static int spi_imx_transfer_one(struct spi_controller *controller,
	 * transfer, the SPI transfer has already been mapped, so we
	 * have to do the DMA transfer here.
	 */
	if (spi_imx->usedma)
		return spi_imx_dma_transfer(spi_imx, transfer);

	if (spi_imx->usedma) {
		ret = spi_imx_dma_transfer(spi_imx, transfer);
		if (transfer->error & SPI_TRANS_FAIL_NO_START) {
			spi_imx->usedma = false;
			return spi_imx_pio_transfer(spi, transfer);
		}
		return ret;
	}
	/* run in polling mode for short transfers */
	if (transfer->len == 1 || (polling_limit_us &&
				   spi_imx_transfer_estimate_time_us(transfer) < polling_limit_us))