/*
 * NVIDIA Tegra SPI controller (T114 and later)
 *
 * Copyright (c) 2010-2013 NVIDIA Corporation
 * Copyright (C) 2013 Google Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <assert.h>
#include <cbfs.h>
#include <cbfs_core.h>
#include <inttypes.h>
#include <spi-generic.h>
#include <spi_flash.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <timer.h>
#include <arch/cache.h>
#include <arch/io.h>
#include <console/console.h>
#include <soc/addressmap.h>
#include <delay.h>

#include "dma.h"
#include "spi.h"

#if defined(CONFIG_DEBUG_SPI) && CONFIG_DEBUG_SPI
# define DEBUG_SPI(x,...)	printk(BIOS_DEBUG, "TEGRA_SPI: " x)
#else
# define DEBUG_SPI(x,...)
#endif

/*
 * 64 packets in FIFO mode, BLOCK_SIZE packets in DMA mode. Packets can vary
 * in size from 4 to 32 bits. To keep things simple we'll use 8-bit packets.
 */
#define SPI_PACKET_SIZE_BYTES		1
#define SPI_MAX_TRANSFER_BYTES_FIFO	(64 * SPI_PACKET_SIZE_BYTES)
#define SPI_MAX_TRANSFER_BYTES_DMA	(65535 * SPI_PACKET_SIZE_BYTES)

/*
 * This is used to workaround an issue seen where it may take some time for
 * packets to show up in the FIFO after they have been received and the
 * BLOCK_COUNT has been incremented.
 */
#define SPI_FIFO_XFER_TIMEOUT_US	1000

/* COMMAND1 */
#define SPI_CMD1_GO			(1 << 31)
#define SPI_CMD1_M_S			(1 << 30)
#define SPI_CMD1_MODE_MASK		0x3
#define SPI_CMD1_MODE_SHIFT		28
#define SPI_CMD1_CS_SEL_MASK		0x3
#define SPI_CMD1_CS_SEL_SHIFT		26
#define SPI_CMD1_CS_POL_INACTIVE3	(1 << 25)
#define SPI_CMD1_CS_POL_INACTIVE2	(1 << 24)
#define SPI_CMD1_CS_POL_INACTIVE1	(1 << 23)
#define SPI_CMD1_CS_POL_INACTIVE0	(1 << 22)
#define SPI_CMD1_CS_SW_HW		(1 << 21)
#define SPI_CMD1_CS_SW_VAL		(1 << 20)
#define SPI_CMD1_IDLE_SDA_MASK		0x3
#define SPI_CMD1_IDLE_SDA_SHIFT		18
#define SPI_CMD1_BIDIR			(1 << 17)
#define SPI_CMD1_LSBI_FE		(1 << 16)
#define SPI_CMD1_LSBY_FE		(1 << 15)
#define SPI_CMD1_BOTH_EN_BIT		(1 << 14)
#define SPI_CMD1_BOTH_EN_BYTE		(1 << 13)
#define SPI_CMD1_RX_EN			(1 << 12)
#define SPI_CMD1_TX_EN			(1 << 11)
#define SPI_CMD1_PACKED			(1 << 5)
#define SPI_CMD1_BIT_LEN_MASK		0x1f
#define SPI_CMD1_BIT_LEN_SHIFT		0

/* COMMAND2 */
#define SPI_CMD2_TX_CLK_TAP_DELAY	(1 << 6)
#define SPI_CMD2_TX_CLK_TAP_DELAY_MASK	(0x3F << 6)
#define SPI_CMD2_RX_CLK_TAP_DELAY	(1 << 0)
#define SPI_CMD2_RX_CLK_TAP_DELAY_MASK	(0x3F << 0)

/* SPI_TRANS_STATUS */
#define SPI_STATUS_RDY			(1 << 30)
#define SPI_STATUS_SLV_IDLE_COUNT_MASK	0xff
#define SPI_STATUS_SLV_IDLE_COUNT_SHIFT	16
#define SPI_STATUS_BLOCK_COUNT		0xffff
#define SPI_STATUS_BLOCK_COUNT_SHIFT	0

/* SPI_FIFO_STATUS */
#define SPI_FIFO_STATUS_CS_INACTIVE			(1 << 31)
#define SPI_FIFO_STATUS_FRAME_END			(1 << 30)
#define SPI_FIFO_STATUS_RX_FIFO_FULL_COUNT_MASK		0x7f
#define SPI_FIFO_STATUS_RX_FIFO_FULL_COUNT_SHIFT	23
#define SPI_FIFO_STATUS_TX_FIFO_EMPTY_COUNT_MASK	0x7f
#define SPI_FIFO_STATUS_TX_FIFO_EMPTY_COUNT_SHIFT	16
#define SPI_FIFO_STATUS_RX_FIFO_FLUSH			(1 << 15)
#define SPI_FIFO_STATUS_TX_FIFO_FLUSH			(1 << 14)
#define SPI_FIFO_STATUS_ERR				(1 << 8)
#define SPI_FIFO_STATUS_TX_FIFO_OVF			(1 << 7)
#define SPI_FIFO_STATUS_TX_FIFO_UNR			(1 << 6)
#define SPI_FIFO_STATUS_RX_FIFO_OVF			(1 << 5)
#define SPI_FIFO_STATUS_RX_FIFO_UNR			(1 << 4)
#define SPI_FIFO_STATUS_TX_FIFO_FULL			(1 << 3)
#define SPI_FIFO_STATUS_TX_FIFO_EMPTY			(1 << 2)
#define SPI_FIFO_STATUS_RX_FIFO_FULL			(1 << 1)
#define SPI_FIFO_STATUS_RX_FIFO_EMPTY			(1 << 0)

/* SPI_DMA_CTL */
#define SPI_DMA_CTL_DMA			(1 << 31)
#define SPI_DMA_CTL_CONT		(1 << 30)
#define SPI_DMA_CTL_IE_RX		(1 << 29)
#define SPI_DMA_CTL_IE_TX		(1 << 28)
#define SPI_DMA_CTL_RX_TRIG_MASK	0x3
#define SPI_DMA_CTL_RX_TRIG_SHIFT	19
#define SPI_DMA_CTL_TX_TRIG_MASK	0x3
#define SPI_DMA_CTL_TX_TRIG_SHIFT	15

/* SPI_DMA_BLK */
#define SPI_DMA_CTL_BLOCK_SIZE_MASK	0xffff
#define SPI_DMA_CTL_BLOCK_SIZE_SHIFT	0

static struct tegra_spi_channel tegra_spi_channels[] = {
	/*
	 * Note: Tegra pinmux must be setup for corresponding SPI channel in
	 * order for its registers to be accessible. If pinmux has not been
	 * set up, access to the channel's registers will simply hang.
	 *
	 * TODO(dhendrix): Clarify or remove this comment (is clock setup
	 * necessary first, or just pinmux, or both?)
	 */
	{
		.slave = { .bus = 1, },
		.regs = (struct tegra_spi_regs *)TEGRA_SPI1_BASE,
		.req_sel = APBDMA_SLAVE_SL2B1,
	},
	{
		.slave = { .bus = 2, },
		.regs = (struct tegra_spi_regs *)TEGRA_SPI2_BASE,
		.req_sel = APBDMA_SLAVE_SL2B2,
	},
	{
		.slave = { .bus = 3, },
		.regs = (struct tegra_spi_regs *)TEGRA_SPI3_BASE,
		.req_sel = APBDMA_SLAVE_SL2B3,
	},
	{
		.slave = { .bus = 4, },
		.regs = (struct tegra_spi_regs *)TEGRA_SPI4_BASE,
		.req_sel = APBDMA_SLAVE_SL2B4,
	},
	{
		.slave = { .bus = 5, },
		.regs = (struct tegra_spi_regs *)TEGRA_SPI5_BASE,
		.req_sel = APBDMA_SLAVE_SL2B5,
	},
	{
		.slave = { .bus = 6, },
		.regs = (struct tegra_spi_regs *)TEGRA_SPI6_BASE,
		.req_sel = APBDMA_SLAVE_SL2B6,
	},
};

enum spi_direction {
	SPI_SEND,
	SPI_RECEIVE,
};

struct tegra_spi_channel *tegra_spi_init(unsigned int bus)
{
	int i;
	struct tegra_spi_channel *spi = NULL;

	for (i = 0; i < ARRAY_SIZE(tegra_spi_channels); i++) {
		if (tegra_spi_channels[i].slave.bus == bus) {
			spi = &tegra_spi_channels[i];
			break;
		}
	}
	if (!spi)
		return NULL;

	/* software drives chip-select, set value to high */
	setbits_le32(&spi->regs->command1,
			SPI_CMD1_CS_SW_HW | SPI_CMD1_CS_SW_VAL);

	/* 8-bit transfers, unpacked mode, most significant bit first */
	clrbits_le32(&spi->regs->command1,
			SPI_CMD1_BIT_LEN_MASK | SPI_CMD1_PACKED);
	setbits_le32(&spi->regs->command1, 7 << SPI_CMD1_BIT_LEN_SHIFT);

	return spi;
}

static struct tegra_spi_channel * const to_tegra_spi(int bus) {
	return &tegra_spi_channels[bus - 1];
}

static unsigned int tegra_spi_speed(unsigned int bus)
{
	/* FIXME: implement this properly, for now use max value (50MHz) */
	return 50000000;
}

int spi_claim_bus(struct spi_slave *slave)
{
	struct tegra_spi_regs *regs = to_tegra_spi(slave->bus)->regs;
	u32 val;

	tegra_spi_init(slave->bus);

	val = read32(&regs->command1);

	/* select appropriate chip-select line */
	val &= ~(SPI_CMD1_CS_SEL_MASK << SPI_CMD1_CS_SEL_SHIFT);
	val |= (slave->cs << SPI_CMD1_CS_SEL_SHIFT);

	/* drive chip-select with the inverse of the "inactive" value */
	if (val & (SPI_CMD1_CS_POL_INACTIVE0 << slave->cs))
		val &= ~SPI_CMD1_CS_SW_VAL;
	else
		val |= SPI_CMD1_CS_SW_VAL;

	write32(val, &regs->command1);
	return 0;
}

void spi_release_bus(struct spi_slave *slave)
{
	struct tegra_spi_regs *regs = to_tegra_spi(slave->bus)->regs;
	u32 val;

	val = read32(&regs->command1);

	if (val & (SPI_CMD1_CS_POL_INACTIVE0 << slave->cs))
		val |= SPI_CMD1_CS_SW_VAL;
	else
		val &= ~SPI_CMD1_CS_SW_VAL;

	write32(val, &regs->command1);
}

static void dump_fifo_status(struct tegra_spi_channel *spi)
{
	u32 status = read32(&spi->regs->fifo_status);

	printk(BIOS_INFO, "Raw FIFO status: 0x%08x\n", status);
	if (status & SPI_FIFO_STATUS_TX_FIFO_OVF)
		printk(BIOS_INFO, "\tTx overflow detected\n");
	if (status & SPI_FIFO_STATUS_TX_FIFO_UNR)
		printk(BIOS_INFO, "\tTx underrun detected\n");
	if (status & SPI_FIFO_STATUS_RX_FIFO_OVF)
		printk(BIOS_INFO, "\tRx overflow detected\n");
	if (status & SPI_FIFO_STATUS_RX_FIFO_UNR)
		printk(BIOS_INFO, "\tRx underrun detected\n");

	printk(BIOS_INFO, "TX_FIFO: 0x%08x, TX_DATA: 0x%08x\n",
		read32(&spi->regs->tx_fifo), read32(&spi->regs->tx_data));
	printk(BIOS_INFO, "RX_FIFO: 0x%08x, RX_DATA: 0x%08x\n",
		read32(&spi->regs->rx_fifo), read32(&spi->regs->rx_data));
}

static void clear_fifo_status(struct tegra_spi_channel *spi)
{
	clrbits_le32(&spi->regs->fifo_status,
				SPI_FIFO_STATUS_ERR |
				SPI_FIFO_STATUS_TX_FIFO_OVF |
				SPI_FIFO_STATUS_TX_FIFO_UNR |
				SPI_FIFO_STATUS_RX_FIFO_OVF |
				SPI_FIFO_STATUS_RX_FIFO_UNR);
}

static void dump_spi_regs(struct tegra_spi_channel *spi)
{
	printk(BIOS_INFO, "SPI regs:\n"
			"\tdma_blk: 0x%08x\n"
			"\tcommand1: 0x%08x\n"
			"\tdma_ctl: 0x%08x\n"
			"\ttrans_status: 0x%08x\n",
			read32(&spi->regs->dma_blk),
			read32(&spi->regs->command1),
			read32(&spi->regs->dma_ctl),
			read32(&spi->regs->trans_status));
}

static void dump_dma_regs(struct apb_dma_channel *dma)
{
	printk(BIOS_INFO, "DMA regs:\n"
			"\tahb_ptr: 0x%08x\n"
			"\tapb_ptr: 0x%08x\n"
			"\tahb_seq: 0x%08x\n"
			"\tapb_seq: 0x%08x\n"
			"\tcsr: 0x%08x\n"
			"\tcsre: 0x%08x\n"
			"\twcount: 0x%08x\n"
			"\tdma_byte_sta: 0x%08x\n"
			"\tword_transfer: 0x%08x\n",
			read32(&dma->regs->ahb_ptr),
			read32(&dma->regs->apb_ptr),
			read32(&dma->regs->ahb_seq),
			read32(&dma->regs->apb_seq),
			read32(&dma->regs->csr),
			read32(&dma->regs->csre),
			read32(&dma->regs->wcount),
			read32(&dma->regs->dma_byte_sta),
			read32(&dma->regs->word_transfer));
}

static inline unsigned int spi_byte_count(struct tegra_spi_channel *spi)
{
	/* FIXME: Make this take total packet size into account */
	return read32(&spi->regs->trans_status) &
		(SPI_STATUS_BLOCK_COUNT << SPI_STATUS_BLOCK_COUNT_SHIFT);
}

/*
 * This calls udelay() with a calculated value based on the SPI speed and
 * number of bytes remaining to be transferred. It assumes that if the
 * calculated delay period is less than MIN_DELAY_US then it is probably
 * not worth the overhead of yielding.
 */
#define MIN_DELAY_US 250
static void spi_delay(struct tegra_spi_channel *spi,
				unsigned int bytes_remaining)
{
	unsigned int ns_per_byte, delay_us;

	ns_per_byte = 1000000000 / (tegra_spi_speed(spi->slave.bus) / 8);
	delay_us = (ns_per_byte * bytes_remaining) / 1000;

	if (delay_us < MIN_DELAY_US)
		return;

	udelay(delay_us);
}

static void tegra_spi_wait(struct tegra_spi_channel *spi)
{
	unsigned int count, dma_blk;

	dma_blk = 1 + (read32(&spi->regs->dma_blk) &
		(SPI_DMA_CTL_BLOCK_SIZE_MASK << SPI_DMA_CTL_BLOCK_SIZE_SHIFT));

	while ((count = spi_byte_count(spi)) != dma_blk)
		spi_delay(spi, dma_blk - count);
}


static int fifo_error(struct tegra_spi_channel *spi)
{
	return read32(&spi->regs->fifo_status) & SPI_FIFO_STATUS_ERR ? 1 : 0;
}

static int tegra_spi_pio_prepare(struct tegra_spi_channel *spi,
			unsigned int bytes, enum spi_direction dir)
{
	u8 *p = spi->out_buf;
	unsigned int todo = MIN(bytes, SPI_MAX_TRANSFER_BYTES_FIFO);
	u32 flush_mask, enable_mask;

	if (dir == SPI_SEND) {
		flush_mask = SPI_FIFO_STATUS_TX_FIFO_FLUSH;
		enable_mask = SPI_CMD1_TX_EN;
	} else {
		flush_mask = SPI_FIFO_STATUS_RX_FIFO_FLUSH;
		enable_mask = SPI_CMD1_RX_EN;
	}

	setbits_le32(&spi->regs->fifo_status, flush_mask);
	while (read32(&spi->regs->fifo_status) & flush_mask)
		;

	setbits_le32(&spi->regs->command1, enable_mask);

	/* BLOCK_SIZE in SPI_DMA_BLK register applies to both DMA and
	 * PIO transfers */
	write32(todo - 1, &spi->regs->dma_blk);

	if (dir == SPI_SEND) {
		unsigned int to_fifo = bytes;
		while (to_fifo) {
			write32(*p, &spi->regs->tx_fifo);
			p++;
			to_fifo--;
		}
	}

	return todo;
}

static void tegra_spi_pio_start(struct tegra_spi_channel *spi)
{
	setbits_le32(&spi->regs->trans_status, SPI_STATUS_RDY);
	setbits_le32(&spi->regs->command1, SPI_CMD1_GO);
	/* Make sure the write to command1 completes. */
	read32(&spi->regs->command1);
}

static inline u32 rx_fifo_count(struct tegra_spi_channel *spi)
{
	return (read32(&spi->regs->fifo_status) >>
		SPI_FIFO_STATUS_RX_FIFO_FULL_COUNT_SHIFT) &
		SPI_FIFO_STATUS_RX_FIFO_FULL_COUNT_MASK;
}

static int tegra_spi_pio_finish(struct tegra_spi_channel *spi)
{
	u8 *p = spi->in_buf;
	struct mono_time start;
	struct rela_time rt;

	clrbits_le32(&spi->regs->command1, SPI_CMD1_RX_EN | SPI_CMD1_TX_EN);

	/*
	 * Allow some time in case the Rx FIFO does not yet have
	 * all packets pushed into it. See chrome-os-partner:24215.
	 */
	timer_monotonic_get(&start);
	do {
		if (rx_fifo_count(spi) == spi_byte_count(spi))
			break;
		rt = current_time_from(&start);
	} while (rela_time_in_microseconds(&rt) < SPI_FIFO_XFER_TIMEOUT_US);

	while (!(read32(&spi->regs->fifo_status) &
				SPI_FIFO_STATUS_RX_FIFO_EMPTY)) {
		*p = read8(&spi->regs->rx_fifo);
		p++;
	}

	if (fifo_error(spi)) {
		printk(BIOS_ERR, "%s: ERROR:\n", __func__);
		dump_spi_regs(spi);
		dump_fifo_status(spi);
		return -1;
	}

	return 0;
}

static void setup_dma_params(struct tegra_spi_channel *spi,
				struct apb_dma_channel *dma)
{
	/* APB bus width = 8-bits, address wrap for each word */
	clrbits_le32(&dma->regs->apb_seq,
			APB_BUS_WIDTH_MASK << APB_BUS_WIDTH_SHIFT);
	/* AHB 1 word burst, bus width = 32 bits (fixed in hardware),
	 * no address wrapping */
	clrsetbits_le32(&dma->regs->ahb_seq,
			(AHB_BURST_MASK << AHB_BURST_SHIFT),
			4 << AHB_BURST_SHIFT);

	/* Set ONCE mode to transfer one "block" at a time (64KB) and enable
	 * flow control. */
	clrbits_le32(&dma->regs->csr,
			APB_CSR_REQ_SEL_MASK << APB_CSR_REQ_SEL_SHIFT);
	setbits_le32(&dma->regs->csr, APB_CSR_ONCE | APB_CSR_FLOW |
			(spi->req_sel << APB_CSR_REQ_SEL_SHIFT));
}

static int tegra_spi_dma_prepare(struct tegra_spi_channel *spi,
		unsigned int bytes, enum spi_direction dir)
{
	unsigned int todo, wcount;

	/*
	 * For DMA we need to think of things in terms of word count.
	 * AHB width is fixed at 32-bits. To avoid overrunning
	 * the in/out buffers we must align down. (Note: lowest 2-bits
	 * in WCOUNT register are ignored, and WCOUNT seems to count
	 * words starting at n-1)
	 *
	 * Example: If "bytes" is 7 and we are transferring 1-byte at a time,
	 * WCOUNT should be 4. The remaining 3 bytes must be transferred
	 * using PIO.
	 */
	todo = MIN(bytes, SPI_MAX_TRANSFER_BYTES_DMA - TEGRA_DMA_ALIGN_BYTES);
	todo = ALIGN_DOWN(todo, TEGRA_DMA_ALIGN_BYTES);
	wcount = ALIGN_DOWN(todo - TEGRA_DMA_ALIGN_BYTES, TEGRA_DMA_ALIGN_BYTES);

	if (dir == SPI_SEND) {
		spi->dma_out = dma_claim();
		if (!spi->dma_out)
			return -1;

		/* ensure bytes to send will be visible to DMA controller */
		dcache_clean_by_mva(spi->out_buf, bytes);

		write32((uintptr_t)&spi->regs->tx_fifo, &spi->dma_out->regs->apb_ptr);
		write32((uintptr_t)spi->out_buf, &spi->dma_out->regs->ahb_ptr);
		setbits_le32(&spi->dma_out->regs->csr, APB_CSR_DIR);
		setup_dma_params(spi, spi->dma_out);
		write32(wcount, &spi->dma_out->regs->wcount);
	} else {
		spi->dma_in = dma_claim();
		if (!spi->dma_in)
			return -1;

		/* avoid data collisions */
		dcache_clean_invalidate_by_mva(spi->in_buf, bytes);

		write32((uintptr_t)&spi->regs->rx_fifo, &spi->dma_in->regs->apb_ptr);
		write32((uintptr_t)spi->in_buf, &spi->dma_in->regs->ahb_ptr);
		clrbits_le32(&spi->dma_in->regs->csr, APB_CSR_DIR);
		setup_dma_params(spi, spi->dma_in);
		write32(wcount, &spi->dma_in->regs->wcount);
	}

	/* BLOCK_SIZE starts at n-1 */
	write32(todo - 1, &spi->regs->dma_blk);
	return todo;
}

static void tegra_spi_dma_start(struct tegra_spi_channel *spi)
{
	/*
	 * The RDY bit in SPI_TRANS_STATUS needs to be cleared manually
	 * (set bit to clear) between each transaction. Otherwise the next
	 * transaction does not start.
	 */
	setbits_le32(&spi->regs->trans_status, SPI_STATUS_RDY);

	if (spi->dma_out)
		setbits_le32(&spi->regs->command1, SPI_CMD1_TX_EN);
	if (spi->dma_in)
		setbits_le32(&spi->regs->command1, SPI_CMD1_RX_EN);

	/*
	 * To avoid underrun conditions, enable APB DMA before SPI DMA for
	 * Tx and enable SPI DMA before APB DMA before Rx.
	 */
	if (spi->dma_out)
		dma_start(spi->dma_out);
	setbits_le32(&spi->regs->dma_ctl, SPI_DMA_CTL_DMA);
	if (spi->dma_in)
		dma_start(spi->dma_in);


}

static int tegra_spi_dma_finish(struct tegra_spi_channel *spi)
{
	int ret;
	unsigned int todo;

	todo = read32(&spi->dma_in->regs->wcount);

	if (spi->dma_in) {
		while ((read32(&spi->dma_in->regs->dma_byte_sta) < todo) ||
				dma_busy(spi->dma_in))
			;	/* this shouldn't take long, no udelay */
		dma_stop(spi->dma_in);
		clrbits_le32(&spi->regs->command1, SPI_CMD1_RX_EN);
		dma_release(spi->dma_in);
	}

	if (spi->dma_out) {
		while ((read32(&spi->dma_out->regs->dma_byte_sta) < todo) ||
				dma_busy(spi->dma_out))
			spi_delay(spi, todo - spi_byte_count(spi));
		clrbits_le32(&spi->regs->command1, SPI_CMD1_TX_EN);
		dma_stop(spi->dma_out);
		dma_release(spi->dma_out);
	}

	if (fifo_error(spi)) {
		printk(BIOS_ERR, "%s: ERROR:\n", __func__);
		dump_dma_regs(spi->dma_out);
		dump_dma_regs(spi->dma_in);
		dump_spi_regs(spi);
		dump_fifo_status(spi);
		ret = -1;
		goto done;
	}

	ret = 0;
done:
	spi->dma_in = NULL;
	spi->dma_out = NULL;
	return ret;
}

/*
 * xfer_setup() prepares a transfer. It does sanity checking, alignment, and
 * sets transfer mode used by this channel (if not set already).
 *
 * A few caveats to watch out for:
 * - The number of bytes which can be transferred may be smaller than the
 *   number of bytes the caller specifies. The number of bytes ready for
 *   a transfer will be returned (unless an error occurs).
 *
 * - Only one mode can be used for both RX and TX. The transfer mode of the
 *   SPI channel (spi->xfer_mode) is checked each time this function is called.
 *   If conflicting modes are detected, spi->xfer_mode will be set to
 *   XFER_MODE_NONE and an error will be returned.
 *
 * Returns bytes ready for transfer if successful, <0 to indicate error.
 */
static int xfer_setup(struct tegra_spi_channel *spi, void *buf,
		unsigned int bytes, enum spi_direction dir)
{
	unsigned int line_size = dcache_line_bytes();
	unsigned int align;
	int ret = -1;

	if (!bytes)
		return 0;

	if (dir == SPI_SEND)
		spi->out_buf = buf;
	else if (dir == SPI_RECEIVE)
		spi->in_buf = buf;

	/*
	 * Alignment consideratons:
	 * When we enable caching we'll need to clean/invalidate portions of
	 * memory. So we need to be careful about memory alignment. Also, DMA
	 * likes to operate on 4-bytes at a time on the AHB side. So for
	 * example, if we only want to receive 1 byte, 4 bytes will be be
	 * written in memory even if those extra 3 bytes are beyond the length
	 * we want.
	 *
	 * For now we'll use PIO to send/receive unaligned bytes. We may
	 * consider setting aside some space for a kind of bounce buffer to
	 * stay in DMA mode once we have a chance to benchmark the two
	 * approaches.
	 */

	if (bytes < line_size) {
		if (spi->xfer_mode == XFER_MODE_DMA) {
			spi->xfer_mode = XFER_MODE_NONE;
			ret = -1;
		} else {
			spi->xfer_mode = XFER_MODE_PIO;
			ret = tegra_spi_pio_prepare(spi, bytes, dir);
		}
		goto done;
	}

	/* transfer bytes before the aligned boundary */
	align = line_size - ((uintptr_t)buf % line_size);
	if ((align != 0) && (align != line_size)) {
		if (spi->xfer_mode == XFER_MODE_DMA) {
			spi->xfer_mode = XFER_MODE_NONE;
			ret = -1;
		} else {
			spi->xfer_mode = XFER_MODE_PIO;
			ret = tegra_spi_pio_prepare(spi, align, dir);
		}
		goto done;
	}

	/* do aligned DMA transfer */
	align = (((uintptr_t)buf + bytes) % line_size);
	if (bytes - align > 0) {
		unsigned int dma_bytes = bytes - align;

		if (spi->xfer_mode == XFER_MODE_PIO) {
			spi->xfer_mode = XFER_MODE_NONE;
			ret = -1;
		} else {
			spi->xfer_mode = XFER_MODE_DMA;
			ret = tegra_spi_dma_prepare(spi, dma_bytes, dir);
		}

		goto done;
	}

	/* transfer any remaining unaligned bytes */
	if (align) {
		if (spi->xfer_mode == XFER_MODE_DMA) {
			spi->xfer_mode = XFER_MODE_NONE;
			ret = -1;
		} else {
			spi->xfer_mode = XFER_MODE_PIO;
			ret = tegra_spi_pio_prepare(spi, align, dir);
		}
		goto done;
	}

done:
	return ret;
}

static void xfer_start(struct tegra_spi_channel *spi)
{
	if (spi->xfer_mode == XFER_MODE_DMA)
		tegra_spi_dma_start(spi);
	else
		tegra_spi_pio_start(spi);
}

static void xfer_wait(struct tegra_spi_channel *spi)
{
	tegra_spi_wait(spi);
}

static int xfer_finish(struct tegra_spi_channel *spi)
{
	int ret;

	if (spi->xfer_mode == XFER_MODE_DMA)
		ret = tegra_spi_dma_finish(spi);
	else
		ret = tegra_spi_pio_finish(spi);

	spi->xfer_mode = XFER_MODE_NONE;
	return ret;
}

int spi_xfer(struct spi_slave *slave, const void *dout,
		unsigned int out_bytes, void *din, unsigned int in_bytes)
{
	struct tegra_spi_channel *spi = to_tegra_spi(slave->bus);
	u8 *out_buf = (u8 *)dout;
	u8 *in_buf = (u8 *)din;
	unsigned int todo;
	int ret = 0;

	/* tegra bus numbers start at 1 */
	ASSERT(slave->bus >= 1 && slave->bus <= ARRAY_SIZE(tegra_spi_channels));

	while (out_bytes || in_bytes) {
		int x = 0;

		if (out_bytes == 0)
			todo = in_bytes;
		else if (in_bytes == 0)
			todo = out_bytes;
		else
			todo = MIN(out_bytes, in_bytes);

		if (out_bytes) {
			x = xfer_setup(spi, out_buf, todo, SPI_SEND);
			if (x < 0) {
				if (spi->xfer_mode == XFER_MODE_NONE) {
					spi->xfer_mode = XFER_MODE_PIO;
					continue;
				} else {
					ret = -1;
					break;
				}
			}
		}
		if (in_bytes) {
			x = xfer_setup(spi, in_buf, todo, SPI_RECEIVE);
			if (x < 0) {
				if (spi->xfer_mode == XFER_MODE_NONE) {
					spi->xfer_mode = XFER_MODE_PIO;
					continue;
				} else {
					ret = -1;
					break;
				}
			}
		}

		/*
		 * Note: Some devices (such as Chrome EC) are sensitive to
		 * delays, so be careful when adding debug prints not to
		 * cause timeouts between transfers.
		 */
		xfer_start(spi);
		xfer_wait(spi);
		if (xfer_finish(spi)) {
			ret = -1;
			break;
		}

		/* Post-processing. */
		if (out_bytes) {
			out_bytes -= x;
			out_buf += x;
		}
		if (in_bytes) {
			in_bytes -= x;
			in_buf += x;
		}
	}

	if (ret < 0) {
		printk(BIOS_ERR, "%s: Error detected\n", __func__);
		printk(BIOS_ERR, "Transaction size: %u, bytes remaining: "
				"%u out / %u in\n", todo, out_bytes, in_bytes);
		clear_fifo_status(spi);
	}
	return ret;
}

/* SPI as CBFS media. */
struct tegra_spi_media {
	struct spi_slave *slave;
	struct cbfs_simple_buffer buffer;
};

static int tegra_spi_cbfs_open(struct cbfs_media *media)
{
	DEBUG_SPI("tegra_spi_cbfs_open\n");
	return 0;
}

static int tegra_spi_cbfs_close(struct cbfs_media *media)
{
	DEBUG_SPI("tegra_spi_cbfs_close\n");
	return 0;
}

#define JEDEC_READ			0x03
#define JEDEC_READ_OUTSIZE		0x04
#define JEDEC_FAST_READ_DUAL		0x3b
#define JEDEC_FAST_READ_DUAL_OUTSIZE	0x05

static size_t tegra_spi_cbfs_read(struct cbfs_media *media, void *dest,
				   size_t offset, size_t count)
{
	struct tegra_spi_media *spi = (struct tegra_spi_media *)media->context;
	u8 spi_read_cmd[JEDEC_FAST_READ_DUAL_OUTSIZE];
	unsigned int read_cmd_bytes;
	int ret = count;
	struct tegra_spi_channel *channel;

	channel = to_tegra_spi(spi->slave->bus);

	if (channel->dual_mode) {
		/*
		 * Command 0x3b will interleave data only, command 0xbb will
		 * interleave the address as well. It's nice to see the address
		 * plainly when debugging, and we're mostly concerned with
		 * large transfers so the optimization of using 0xbb isn't
		 * really worthwhile.
		 */
		spi_read_cmd[0] = JEDEC_FAST_READ_DUAL;
		spi_read_cmd[4] = 0x00;	/* dummy byte */
		read_cmd_bytes = JEDEC_FAST_READ_DUAL_OUTSIZE;
	} else {
		spi_read_cmd[0] = JEDEC_READ;
		read_cmd_bytes = JEDEC_READ_OUTSIZE;
	}
	spi_read_cmd[1] = (offset >> 16) & 0xff;
	spi_read_cmd[2] = (offset >> 8) & 0xff;
	spi_read_cmd[3] = offset & 0xff;

	spi_claim_bus(spi->slave);

	if (spi_xfer(spi->slave, spi_read_cmd,
			read_cmd_bytes, NULL, 0) < 0) {
		ret = -1;
		printk(BIOS_ERR, "%s: Failed to transfer %zu bytes\n",
				__func__, sizeof(spi_read_cmd));
		goto tegra_spi_cbfs_read_exit;
	}

	if (channel->dual_mode) {
		setbits_le32(&channel->regs->command1, SPI_CMD1_BOTH_EN_BIT);
	}
	if (spi_xfer(spi->slave, NULL, 0, dest, count)) {
		ret = -1;
		printk(BIOS_ERR, "%s: Failed to transfer %zu bytes\n",
				__func__, count);
	}
	if (channel->dual_mode)
		clrbits_le32(&channel->regs->command1, SPI_CMD1_BOTH_EN_BIT);

tegra_spi_cbfs_read_exit:
	/* de-assert /CS */
	spi_release_bus(spi->slave);
	return (ret < 0) ? 0 : ret;
}

static void *tegra_spi_cbfs_map(struct cbfs_media *media, size_t offset,
				 size_t count)
{
	struct tegra_spi_media *spi = (struct tegra_spi_media*)media->context;
	void *map;
	DEBUG_SPI("tegra_spi_cbfs_map\n");
	map = cbfs_simple_buffer_map(&spi->buffer, media, offset, count);
	return map;
}

static void *tegra_spi_cbfs_unmap(struct cbfs_media *media,
				   const void *address)
{
	struct tegra_spi_media *spi = (struct tegra_spi_media*)media->context;
	DEBUG_SPI("tegra_spi_cbfs_unmap\n");
	return cbfs_simple_buffer_unmap(&spi->buffer, address);
}

int initialize_tegra_spi_cbfs_media(struct cbfs_media *media,
				     void *buffer_address,
				     size_t buffer_size)
{
	// TODO Replace static variable to support multiple streams.
	static struct tegra_spi_media context;
	static struct tegra_spi_channel *channel;

	channel = &tegra_spi_channels[CONFIG_BOOT_MEDIA_SPI_BUS - 1];
	channel->slave.cs = CONFIG_BOOT_MEDIA_SPI_CHIP_SELECT;

	DEBUG_SPI("Initializing CBFS media on SPI\n");

	context.slave = &channel->slave;
	context.buffer.allocated = context.buffer.last_allocate = 0;
	context.buffer.buffer = buffer_address;
	context.buffer.size = buffer_size;
	media->context = (void*)&context;
	media->open = tegra_spi_cbfs_open;
	media->close = tegra_spi_cbfs_close;
	media->read = tegra_spi_cbfs_read;
	media->map = tegra_spi_cbfs_map;
	media->unmap = tegra_spi_cbfs_unmap;

#if CONFIG_SPI_FLASH_FAST_READ_DUAL_OUTPUT_3B == 1
	channel->dual_mode = 1;
#endif

	return 0;
}

struct spi_slave *spi_setup_slave(unsigned int bus, unsigned int cs)
{
	struct tegra_spi_channel *channel = to_tegra_spi(bus);
	if (!channel)
		return NULL;

	return &channel->slave;
}