/* libhw/rp2040_hwspi.c - <libhw/generic/spi.h> implementation for the RP2040's ARM Primecell SSP (PL022)
 *
 * Copyright (C) 2024-2025  Luke T. Shumaker <lukeshu@lukeshu.com>
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

#include <alloca.h>
#include <inttypes.h> /* for PRIu{n} */

#include <hardware/clocks.h> /* for clock_get_hz() and clk_peri */
#include <hardware/gpio.h>
#include <hardware/spi.h>

#include <libcr/coroutine.h>
#include <libmisc/assert.h>

#define LOG_NAME RP2040_SPI
#include <libmisc/log.h>

#define IMPLEMENTATION_FOR_LIBHW_RP2040_HWSPI_H YES
#include <libhw/rp2040_hwspi.h>

#include <libhw/generic/alarmclock.h>

#include "rp2040_dma.h"

#include "config.h"

#ifndef CONFIG_RP2040_SPI_DEBUG
	#error config.h must define CONFIG_RP2040_SPI_DEBUG (bool)
#endif

/* NOTE(review): these presumably generate the glue that exposes
 * `struct rp2040_hwspi` (through the static rp2040_hwspi_* functions
 * defined in this file) as an implementation of the
 * `io_duplex_readwriter` and `spi` interfaces; confirm against the
 * definition of LO_IMPLEMENTATION_C.  */
LO_IMPLEMENTATION_C(io_duplex_readwriter, struct rp2040_hwspi, rp2040_hwspi, static)
LO_IMPLEMENTATION_C(spi, struct rp2040_hwspi, rp2040_hwspi, static)

/* DMA interrupt handler for an in-flight SPI transfer.
 *
 * _rp2040_hwspi_init() installs this on DMAIRQ_0 for the dma_rx_data
 * channel, passing the `struct rp2040_hwspi` as `_self`.  It drives
 * the chip-select pin back high and then signals `sema`, which is
 * what rp2040_hwspi_readwritev() is blocked on.  The `irq` and
 * `channel` arguments are not used.  */
static void rp2040_hwspi_intrhandler(void *_self, enum dmairq LM_UNUSED(irq), uint LM_UNUSED(channel)) {
	struct rp2040_hwspi *const spi = (struct rp2040_hwspi *)_self;

	/* Release chip-select first, then wake the waiter.  */
	gpio_put(spi->pin_cs, 1);
	cr_sema_signal_from_intrhandler(&spi->sema);
}

/* Assert that the four values a, b, c, and d are pairwise distinct.
 *
 * The body is wrapped in do { ... } while (0) so that an invocation
 * followed by a semicolon is exactly one statement (and so is safe
 * under an unbraced if/else), and each argument is parenthesized so
 * that an expression argument is compared as a whole.
 *
 * Each argument appears several times in the expansion, so arguments
 * must not have side effects.  */
#define assert_4distinct(a, b, c, d) \
	do { \
		assert((a) != (b)); \
		assert((a) != (c)); \
		assert((a) != (d)); \
		assert((b) != (c)); \
		assert((b) != (d)); \
		assert((c) != (d)); \
	} while (0)

/* Initialize `self` as a driver for one of the RP2040's hardware SPI
 * (PL022) instances, and configure the peripheral, the GPIO pins, and
 * the DMA interrupt handler that it uses.
 *
 *  - self:         the object to initialize; must be non-NULL
 *  - inst_num:     which PL022 instance to use (RP2040_HWSPI_0 or
 *                  RP2040_HWSPI_1)
 *  - mode:         SPI mode; bit 1 selects the clock polarity (CPOL)
 *                  and bit 0 selects the clock phase (CPHA)
 *  - baudrate_hz:  the clock rate; must be non-zero, must be at most
 *                  half of clk_peri, and must be exactly achievable
 *                  by the peripheral
 *  - min_delay_ns: stored for rp2040_hwspi_readwritev(), which waits
 *                  at least this long after one transfer finishes
 *                  before starting the next
 *  - bogus_data:   the byte that is transmitted for iovecs that have
 *                  no data to write
 *  - pin_miso, pin_mosi, pin_clk:
 *                  GPIO numbers; each must be a pin that can be
 *                  routed to the matching signal of the chosen
 *                  instance
 *  - pin_cs:       GPIO number of the chip-select line; this is
 *                  driven by software (idle high), so it is not
 *                  restricted to the pins the PL022 can drive
 *  - dma1..dma4:   DMA channel numbers, used as the TX-control,
 *                  RX-control, TX-data, and RX-data channels
 *                  respectively
 *
 * The four pins must be distinct from each other, as must the four
 * DMA channels.  All of these constraints are checked with assert().
 *
 * Side effects: initializes the SPI peripheral (8-bit frames,
 * MSB-first), switches the MISO/MOSI/CLK pins to the SPI function,
 * makes the CS pin an output driven high, and installs an exclusive
 * DMAIRQ_0 handler for the RX-data channel.  */
void _rp2040_hwspi_init(struct rp2040_hwspi *self,
                        enum rp2040_hwspi_instance inst_num,
                        enum spi_mode mode,
                        uint baudrate_hz,
                        uint64_t min_delay_ns,
                        uint8_t bogus_data,
                        uint pin_miso,
                        uint pin_mosi,
                        uint pin_clk,
                        uint pin_cs,
                        uint dma1,
                        uint dma2,
                        uint dma3,
                        uint dma4)
{
	/* Be not weary: This is but 12 lines of actual code; and many
	 * lines of comments and assert()s.  */
	spi_inst_t *inst;
	uint actual_baudrate_hz;

	assert(self);
	assert(baudrate_hz);
	uint32_t clk_peri_hz = clock_get_hz(clk_peri);
	debugf("clk_peri = %"PRIu32"Hz", clk_peri_hz);
	/* Written as a division (rather than `baudrate_hz*2`) so that
	 * the check cannot wrap around for a very large baudrate_hz.  */
	assert(baudrate_hz <= clk_peri_hz/2);
	assert_4distinct(pin_miso, pin_mosi, pin_clk, pin_cs);
	assert_4distinct(dma1, dma2, dma3, dma4);

	/* Regarding the constraints on pin assignments: see the
	 * RP2040 datasheet, table 2, in §1.4.3 "GPIO Functions".
	 *
	 * The constraints on pin_cs are commented out because CS is
	 * driven by software (see below), not by the PL022.  */
	switch (inst_num) {
	case RP2040_HWSPI_0:
		inst = spi0;
		assert(pin_miso ==  0 || pin_miso ==  4 || pin_miso == 16 || pin_miso == 20);
		/*assert(pin_cs   ==  1 || pin_cs   ==  5 || pin_cs   == 17 || pin_cs   == 21);*/
		assert(pin_clk  ==  2 || pin_clk  ==  6 || pin_clk  == 18 || pin_clk  == 22);
		assert(pin_mosi ==  3 || pin_mosi ==  7 || pin_mosi == 19 || pin_mosi == 23);
		break;
	case RP2040_HWSPI_1:
		inst = spi1;
		assert(pin_miso ==  8 || pin_miso == 12 || pin_miso == 24 || pin_miso == 28);
		/*assert(pin_cs   ==  9 || pin_cs   == 13 || pin_cs   == 25 || pin_cs   == 29);*/
		assert(pin_clk  == 10 || pin_clk  == 14 || pin_clk  == 26);
		assert(pin_mosi == 11 || pin_mosi == 15 || pin_mosi == 27);
		break;
	default:
		assert_notreached("invalid hwspi instance number");
	}

	/* Bring up the peripheral, and insist that it is running at
	 * exactly the requested rate.  */
	actual_baudrate_hz = spi_init(inst, baudrate_hz);
	debugf("baudrate = %uHz", actual_baudrate_hz);
	assert(actual_baudrate_hz == baudrate_hz);
	spi_set_format(inst, 8,
	               (mode & 0b10) ? SPI_CPOL_1 : SPI_CPOL_0,
	               (mode & 0b01) ? SPI_CPHA_1 : SPI_CPHA_0,
	               SPI_MSB_FIRST);

	/* Connect the pins to the PL022; set them each to "function
	 * 1" (again, see the RP2040 datasheet, table 2, in §1.4.3
	 * "GPIO Functions").
	 *
	 * ("GPIO_FUNC_SPI" is how the pico-sdk spells "function 1",
	 * since on the RP2040 all of the "function 1" functions are
	 * some part of SPI.)  */
	gpio_set_function(pin_clk,  GPIO_FUNC_SPI);
	gpio_set_function(pin_mosi, GPIO_FUNC_SPI);
	gpio_set_function(pin_miso, GPIO_FUNC_SPI);

	/* Initialize the CS pin for software control.  */
	gpio_init(pin_cs);
	gpio_set_dir(pin_cs, GPIO_OUT);
	gpio_put(pin_cs, 1);

	/* Initialize self.  */
	self->inst = inst;
	self->min_delay_ns = min_delay_ns;
	self->bogus_data = bogus_data;
	self->pin_cs = pin_cs;
	self->dma_tx_ctrl = dma1;
	self->dma_rx_ctrl = dma2;
	self->dma_tx_data = dma3;
	self->dma_rx_data = dma4;
	self->dead_until_ns = 0;
	self->sema = (cr_sema_t){0};

	/* Initialize the interrupt handler.  This is done last, after
	 * `self` is fully populated, since the handler reads `self`.  */
	dmairq_set_and_enable_exclusive_handler(DMAIRQ_0, self->dma_rx_data, rp2040_hwspi_intrhandler, self);
}

/* Perform one full-duplex, vectored SPI transfer, blocking the
 * calling coroutine until it has completed.
 *
 *  - self:   an initialized rp2040_hwspi; must be non-NULL
 *  - iov:    array of segments to transfer; must be non-NULL.  For
 *            each segment, iov_len bytes are clocked.  The bytes
 *            sent are taken from iov_write_src, or are self->bogus_data
 *            repeated if iov_write_src is NULL.  The bytes received
 *            are stored to iov_read_dst, or are discarded if
 *            iov_read_dst is NULL.  Zero-length segments are skipped.
 *  - iovcnt: number of entries in `iov`; must be > 0
 *
 * If every segment is zero-length, this returns without touching the
 * bus.  Otherwise chip-select is driven low for the whole transfer,
 * and is driven high again by rp2040_hwspi_intrhandler().
 *
 * The DMA control blocks are built on the stack with alloca(), so
 * stack usage grows with the number of non-empty segments.  */
static void rp2040_hwspi_readwritev(struct rp2040_hwspi *self, const struct duplex_iovec *iov, int iovcnt) {
	assert(self);
	assert(self->inst);
	assert(iov);
	assert(iovcnt > 0);

	/* At this time, I have no intention to run SPI faster than
	 * 80MHz (= 80Mb/s = 10MB/s).  If we ran the CPU at just
	 * 100MHz (we'll be running it faster than that, maybe even
	 * 200MHz), that means we'd have 10 clock cycles to send each
	 * byte.
	 *
	 * This affords us substantial simplifications, like being
	 * able to afford 4-cycle changeovers between DMA blocks, and
	 * not having to worry about alignment because we can just use
	 * DMA_SIZE_8.
	 */

	/* Scratch byte that received data is dumped into (without
	 * incrementing the write address) for segments that have no
	 * iov_read_dst.  */
	uint8_t bogus_rx_dst;

	/* Count the non-empty segments; only those get DMA blocks.  */
	int pruned_iovcnt = 0;
	for (int i = 0; i < iovcnt; i++)
		if (iov[i].iov_len)
			pruned_iovcnt++;
	if (!pruned_iovcnt)
		return;

	/* For tx_data_blocks, it doesn't really matter which aliases
	 * we choose:
	 *  - None of our fields can be NULL (so no
	 *    false-termination).
	 *  - Moving const fields first so they don't have to be
	 *    re-programmed each time isn't possible for us; there
	 *    need to be at least 2 const fields, and we only have 1
	 *    (read_addr for rx_data_blocks, and write_addr for
	 *    tx_data_blocks).
	 *
	 * But for rx_data_blocks, we need ctrl to be the trigger
	 * register so that the DMA_CTRL_IRQ_QUIET flag isn't cleared
	 * before we get to the trigger; and while for tx_data_blocks
	 * it doesn't really matter, the inverse would be nice.
	 */
	/* The "+1" is room for the all-zero terminator block.  */
	struct dma_alias1 *tx_data_blocks = alloca(sizeof(struct dma_alias1)*(pruned_iovcnt+1));
	struct dma_alias0 *rx_data_blocks = alloca(sizeof(struct dma_alias0)*(pruned_iovcnt+1));

	/* Build one TX block and one RX block per non-empty segment;
	 * `i` indexes `iov`, `j` indexes the (pruned) block lists.
	 * Each data block chains back to its ctrl channel, which is
	 * what loads the next block.  */
	for (int i = 0, j = 0; i < iovcnt; i++) {
		if (!iov[i].iov_len)
			continue;
		tx_data_blocks[j]    = (typeof(tx_data_blocks[0])){
			.read_addr   = iov[i].iov_write_src ?: &self->bogus_data,
			.write_addr  = &spi_get_hw(self->inst)->dr,
			.trans_count = iov[i].iov_len,
			.ctrl        = (DMA_CTRL_ENABLE
			                | DMA_CTRL_DATA_SIZE(DMA_SIZE_8)
			                /* When sending bogus_data, re-read the same byte each time.  */
			                | (iov[i].iov_write_src ? DMA_CTRL_INCR_READ : 0)
			                | DMA_CTRL_CHAIN_TO(self->dma_tx_ctrl)
			                | DMA_CTRL_TREQ_SEL(SPI_DREQ_NUM(self->inst, true))
			                | DMA_CTRL_IRQ_QUIET),
		};
		rx_data_blocks[j]    = (typeof(rx_data_blocks[0])){
			.read_addr   = &spi_get_hw(self->inst)->dr,
			.write_addr  = iov[i].iov_read_dst ?: &bogus_rx_dst,
			.trans_count = iov[i].iov_len,
			.ctrl        = (DMA_CTRL_ENABLE
			                | DMA_CTRL_DATA_SIZE(DMA_SIZE_8)
			                /* When discarding, overwrite the same scratch byte each time.  */
			                | (iov[i].iov_read_dst ? DMA_CTRL_INCR_WRITE : 0)
			                | DMA_CTRL_CHAIN_TO(self->dma_rx_ctrl)
			                | DMA_CTRL_TREQ_SEL(SPI_DREQ_NUM(self->inst, false))
			                | DMA_CTRL_IRQ_QUIET),
		};
		j++;
	}
	/* Terminate each list with an all-zero block.
	 *
	 * NOTE(review): presumably it is the write of this zero to
	 * the data channel's trigger register that ends the chain and
	 * (because IRQ_QUIET is set on every block) raises the
	 * interrupt that rp2040_hwspi_intrhandler() services; confirm
	 * against the DMA chapter of the RP2040 datasheet.  */
	tx_data_blocks[pruned_iovcnt] = (typeof(tx_data_blocks[0])){0};
	rx_data_blocks[pruned_iovcnt] = (typeof(rx_data_blocks[0])){0};

	/* Set up ctrl.
	 *
	 * Each ctrl channel reads through its block list
	 * (INCR_READ), writes into the registers of the matching data
	 * channel, and then chains to that data channel.
	 *
	 * NOTE(review): DMA_CHAN_WR_TRANS_COUNT() and
	 * DMA_CHAN_WR_CTRL() come from rp2040_dma.h; presumably they
	 * make each ctrl-channel run copy exactly one block; confirm
	 * there.  */
	DMA_NONTRIGGER(self->dma_tx_ctrl, read_addr) = tx_data_blocks;
	DMA_NONTRIGGER(self->dma_tx_ctrl, write_addr) = DMA_CHAN_ADDR(self->dma_tx_data, typeof(tx_data_blocks[0]));
	DMA_NONTRIGGER(self->dma_tx_ctrl, trans_count) = DMA_CHAN_WR_TRANS_COUNT(typeof(tx_data_blocks[0]));
	DMA_NONTRIGGER(self->dma_tx_ctrl, ctrl) = (DMA_CTRL_ENABLE
	                                           | DMA_CHAN_WR_CTRL(typeof(tx_data_blocks[0]))
	                                           | DMA_CTRL_INCR_READ
	                                           | DMA_CTRL_CHAIN_TO(self->dma_tx_data)
	                                           | DMA_CTRL_TREQ_SEL(DREQ_FORCE)
	                                           | DMA_CTRL_IRQ_QUIET);
	DMA_NONTRIGGER(self->dma_rx_ctrl, read_addr) = rx_data_blocks;
	DMA_NONTRIGGER(self->dma_rx_ctrl, write_addr) = DMA_CHAN_ADDR(self->dma_rx_data, typeof(rx_data_blocks[0]));
	DMA_NONTRIGGER(self->dma_rx_ctrl, trans_count) = DMA_CHAN_WR_TRANS_COUNT(typeof(rx_data_blocks[0]));
	DMA_NONTRIGGER(self->dma_rx_ctrl, ctrl) = (DMA_CTRL_ENABLE
	                                           | DMA_CHAN_WR_CTRL(typeof(rx_data_blocks[0]))
	                                           | DMA_CTRL_INCR_READ
	                                           | DMA_CTRL_CHAIN_TO(self->dma_rx_data)
	                                           | DMA_CTRL_TREQ_SEL(DREQ_FORCE)
	                                           | DMA_CTRL_IRQ_QUIET);

	/* Run.  */
	/* Honor min_delay_ns since the end of the previous transfer.  */
	uint64_t now = LO_CALL(bootclock, get_time_ns);
	if (now < self->dead_until_ns)
		sleep_until_ns(self->dead_until_ns);
	/* With interrupts disabled, drive CS low and start both ctrl
	 * channels with a single register write.  */
	bool saved = cr_save_and_disable_interrupts();
	gpio_put(self->pin_cs, 0);
	dma_hw->multi_channel_trigger = (1u<<self->dma_tx_ctrl) | (1u<<self->dma_rx_ctrl);
	cr_restore_interrupts(saved);
	/* Block until rp2040_hwspi_intrhandler() (which also drives CS
	 * back high) signals completion.  The stack-allocated blocks
	 * and bogus_rx_dst must stay alive until then.  */
	cr_sema_wait(&self->sema);
	self->dead_until_ns = LO_CALL(bootclock, get_time_ns) + self->min_delay_ns;
}