Hello,
I have implemented my own SPI Driver. It is working with polling as well as with DMA. I checked the implementation with a SD Card. Both versions are working perfectly fine. I can access the Filesystem without problems and can run my Benchmarks. The polling Version is significantly faster than the DMA version. I had expected that the speeds would be similar, but the polling implementation is up to 50% faster. I can’t explain that. I checked it on K64F and K60FX controllers with different SD-Cards.
Here is some of my code, maybe I have done something wrong with the DMA configuration.
Thanks
// Starts a full-duplex SPI transfer of uiDataSize frames using two linked
// eDMA channels: the RX channel is paced by the SPI RFDF (RX-FIFO-not-empty)
// request and, after each 1-byte minor loop, its channel link triggers the
// TX channel to push the next frame into PUSHR. So every TX push is gated
// by a completed RX pop — one DMA round-trip per byte.
// NOTE(review): the class qualifier (HSPI_Kinetis::) seems to be missing
// from this signature in the paste — the other methods carry it and the
// body uses `this`.
//
// WriteData  - bytes to transmit, or NULL to clock out 0xFF dummy frames
// ReadData   - receive buffer, or NULL to discard incoming bytes
// uiDataSize - number of 8-bit frames to transfer
void HSPI_KinetisWriteReadAsync (const uint8_t *WriteData, uint8_t *ReadData, unsigned int uiDataSize)
{
DMAMUX->CHCFG[this->DMA_RX_Channel] = DMAMUX_CHCFG_ENBL_MASK | this->DMA_Source_RX; // route the SPI RX request onto the RX DMA channel
// Clear the DONE flags of both channels before reprogramming their TCDs
DMA0->CDNE = this->DMA_RX_Channel;
DMA0->CDNE = this->DMA_TX_Channel;
// TFFF/RFDF select DMA requests instead of interrupts (DIRS); only the RX
// request (RFDF_RE) is enabled — TFFF_RE is deliberately absent because TX
// is driven solely by the RX->TX channel link set up below.
this->SPI->RSER = SPI_RSER_TFFF_DIRS_MASK | SPI_RSER_RFDF_DIRS_MASK | SPI_RSER_RFDF_RE_MASK;
// NOTE(review): this disables BOTH FIFOs, so each single frame costs a full
// DMA service round-trip. That is a plausible cause of the DMA path being
// slower than the FIFO-assisted polling path — worth testing with FIFOs on.
this->SPI->MCR |= SPI_MCR_DIS_TXF_MASK | SPI_MCR_DIS_RXF_MASK;
// RX channel: POPR -> ReadData (or a fixed dummy sink when ReadData == NULL)
DMA0->TCD[this->DMA_RX_Channel].SADDR = (uint32_t)&this->SPI->POPR; // Source Address (fixed register)
DMA0->TCD[this->DMA_RX_Channel].SOFF = 0; // Source Address Offset (signed) — register, no increment
if (ReadData == NULL)
{
DMA0->TCD[this->DMA_RX_Channel].DADDR = (uint32_t)&this->DummyData; // discard into scratch member
DMA0->TCD[this->DMA_RX_Channel].DOFF = 0; // no destination increment
}
else
{
DMA0->TCD[this->DMA_RX_Channel].DADDR = (uint32_t)ReadData; // Destination Address
DMA0->TCD[this->DMA_RX_Channel].DOFF = 1; // advance one byte per transfer
}
DMA0->TCD[this->DMA_RX_Channel].ATTR = DMA_ATTR_SSIZE(0) | DMA_ATTR_DSIZE(0); // 8-bit source and destination transfer size
DMA0->TCD[this->DMA_RX_Channel].NBYTES_MLNO = 1; // 1 byte per minor loop (per DMA request)
DMA0->TCD[this->DMA_RX_Channel].SLAST = 0x0; // no source address adjustment at major-loop end
// Minor-loop channel link: after each received byte, trigger the TX channel.
// NOTE(review): with ELINK set the CITER count field is narrowed (9 bits on
// Kinetis eDMA) — uiDataSize > 511 would overflow into LINKCH. TODO confirm
// against the maximum transfer size used (512-byte SD blocks just fit).
DMA0->TCD[this->DMA_RX_Channel].CITER_ELINKYES = DMA_CITER_ELINKYES_ELINK_MASK | DMA_CITER_ELINKYES_LINKCH(this->DMA_TX_Channel) | (uiDataSize);
DMA0->TCD[this->DMA_RX_Channel].DLAST_SGA = 0x0; // no destination adjustment / no scatter-gather
DMA0->TCD[this->DMA_RX_Channel].BITER_ELINKYES = DMA0->TCD[this->DMA_RX_Channel].CITER_ELINKYES; // BITER must mirror CITER at start
DMA0->TCD[this->DMA_RX_Channel].CSR = DMA_CSR_INTMAJOR_MASK | DMA_CSR_DREQ_MASK; // interrupt on completion, auto-disable request
// TX channel: WriteData (or a repeated dummy command word) -> PUSHR
DMA0->TCD[this->DMA_TX_Channel].DADDR = (uint32_t)&this->SPI->PUSHR;
DMA0->TCD[this->DMA_TX_Channel].DOFF = 0;
if (WriteData == NULL)
{
// Full 32-bit PUSHR command word: keep CS asserted (CONT), PCS1, CTAR0, 0xFF data
this->DummyData = SPI_PUSHR_CONT_MASK | SPI_PUSHR_PCS(1) | SPI_PUSHR_CTAS(0) | SPI_PUSHR_TXDATA(0xFF);
DMA0->TCD[this->DMA_TX_Channel].SADDR = (uint32_t)&this->DummyData;
DMA0->TCD[this->DMA_TX_Channel].SOFF = 0;
DMA0->TCD[this->DMA_TX_Channel].ATTR = DMA_ATTR_SSIZE(2) | DMA_ATTR_DSIZE(2); // 32-bit transfers (command + data each push)
DMA0->TCD[this->DMA_TX_Channel].NBYTES_MLNO = 4;
}
else
{
// First frame carries the command bits and WriteData[0]; it is written
// manually below. DMA then streams the remaining bytes with 8-bit writes
// to PUSHR — presumably relying on the data-byte-only PUSHR access so the
// previously latched command bits stay in effect. TODO confirm on DSPI.
this->DummyData = SPI_PUSHR_CONT_MASK | SPI_PUSHR_PCS(1) | SPI_PUSHR_CTAS(0) | SPI_PUSHR_TXDATA(*WriteData);
DMA0->TCD[this->DMA_TX_Channel].SADDR = (uint32_t)WriteData+1; // start at the second byte
DMA0->TCD[this->DMA_TX_Channel].SOFF = 1;
DMA0->TCD[this->DMA_TX_Channel].ATTR = DMA_ATTR_SSIZE(0) | DMA_ATTR_DSIZE(0); // 8-bit transfers
DMA0->TCD[this->DMA_TX_Channel].NBYTES_MLNO = 1;
}
DMA0->TCD[this->DMA_TX_Channel].SLAST = 0x0;
DMA0->TCD[this->DMA_TX_Channel].CITER_ELINKNO = uiDataSize -1; // one fewer than RX: first frame is pushed by the CPU below
DMA0->TCD[this->DMA_TX_Channel].DLAST_SGA = 0x0;
DMA0->TCD[this->DMA_TX_Channel].BITER_ELINKNO = DMA0->TCD[this->DMA_TX_Channel].CITER_ELINKNO;
DMA0->TCD[this->DMA_TX_Channel].CSR = DMA_CSR_INTMAJOR_MASK | DMA_CSR_DREQ_MASK;
this->Start ();
DMA0->SERQ = this->DMA_RX_Channel; // Enable hardware DMA requests for both channels
DMA0->SERQ = this->DMA_TX_Channel;
this->SPI->PUSHR = this->DummyData; // Manual CPU write of the first frame, to get the SPI transfer started
}
// Busy-waits until both linked DMA channels have completed the transfer
// started by WriteReadAsync, validates the result, and tears down the
// DMA/SPI request routing. Returns true on success, false on any error.
// NOTE(review): this is a blocking poll — fine for benchmarking, but in
// production the caller should yield/block on the major-loop interrupt.
bool HSPI_Kinetis::WaitDMAFinished (void)
{
// wait until DMA is done (both channels must report DONE)
while (!((DMA0->TCD[this->DMA_RX_Channel].CSR & DMA_CSR_DONE_MASK) && (DMA0->TCD[this->DMA_TX_Channel].CSR & DMA_CSR_DONE_MASK)))
;
bool bError = false;
if (DMA0->ES != 0)
bError = true; // eDMA error status is latched
// NOTE(review): redundant — the loop above only exits once TX DONE is set,
// so this branch can never fire.
if (!(DMA0->TCD[this->DMA_TX_Channel].CSR & DMA_CSR_DONE_MASK))
bError = true; // Tx didn't finish!
if (this->SPI->SR & SPI_SR_RXCTR_MASK)
bError = true; // Rx FiFo has junk! (entries left that DMA never drained)
// After a completed major loop CITER reloads to BITER; a mismatch means
// the channel stopped partway through.
if (DMA0->TCD[this->DMA_RX_Channel].CITER_ELINKNO != DMA0->TCD[this->DMA_RX_Channel].BITER_ELINKNO ||
DMA0->TCD[this->DMA_TX_Channel].CITER_ELINKNO != DMA0->TCD[this->DMA_TX_Channel].BITER_ELINKNO)
bError = true;
if (DMA0->ERR)
bError = true; // per-channel error flags
this->Stop ();
// Teardown: clear DONE flags, unroute the DMAMUX, restore SPI to
// interrupt/polling mode with FIFOs re-enabled.
DMA0->CDNE = this->DMA_TX_Channel;
DMA0->CDNE = this->DMA_RX_Channel;
DMAMUX->CHCFG[this->DMA_RX_Channel] = 0; // SPI RX DMA request source disable
this->SPI->RSER &= ~(SPI_RSER_TFFF_DIRS_MASK | SPI_RSER_TFFF_RE_MASK | SPI_RSER_RFDF_DIRS_MASK | SPI_RSER_RFDF_RE_MASK);
this->SPI->MCR &= ~(SPI_MCR_DIS_TXF_MASK | SPI_MCR_DIS_RXF_MASK);
return !bError;
}
bool HSPI_Kinetis::Read (unsigned char *pucData, unsigned int uiDataSize)
{
#if ENABLE_DMA
if (uiDataSize > 128)
{
this->WriteReadAsync (NULL, pucData, uiDataSize);
return this->WaitDMAFinished ();
}
else
#endif
{
this->Start ();
unsigned int uiReadSize = uiDataSize;
while (uiDataSize != 0 || uiReadSize != 0)
{
// TX-FIFO is not full and we have data to sent
if ((this->SPI->SR & SPI_SR_TFFF_MASK) && (uiDataSize != 0))
{
// Write some dummy Data with chip select 1
this->SPI->PUSHR = SPI_PUSHR_CONT_MASK | SPI_PUSHR_PCS(1) | SPI_PUSHR_CTAS(0) | SPI_PUSHR_TXDATA(0xFF);
this->SPI->SR = SPI_SR_TFFF_MASK;
uiDataSize--;
}
// if RX-FIFO is not empty read some data
if ((this->SPI->SR & SPI_SR_RFDF_MASK) && uiReadSize != 0)
{
*pucData = this->SPI->POPR;
this->SPI->SR |= SPI_SR_RFDF_MASK;
pucData++;
uiReadSize--;
}
}
if (this->SPI->SR & SPI_SR_RFOF_MASK)
this->SPI->SR |= SPI_SR_RFOF_MASK;
if this->SPI->SR & SPI_SR_TFUF_MASK)
this->SPI->SR |= SPI_SR_TFUF_MASK;
this->Stop ();
}
return true;
}
Hi,
Using DMA transfer advantage is to transfer lots of data without core intervene(reduce core work load).
From the customer's settings, the DMA transfers just 1 byte per request, which does not take advantage of the DMA module's strengths.
I wrote a document about using DMA with SPI to reduce the core workload; please check the link below for the detailed info:
使用DMA降低SPI通信过程中内核负荷 Reduce core work load with DMA Module during SPI communication
So, I would recommend the customer use DMA to transfer large amounts of data.
If only a few bytes are transferred, the customer can simply use interrupts.
Wish it helps.
Have a great day,
Ma Hui
-----------------------------------------------------------------------------------------------------------------------
Note: If this post answers your question, please click the Correct Answer button. Thank you!
-----------------------------------------------------------------------------------------------------------------------
Hello Ma Hui,
Thanks for the reply. Your Thread is very interesting.
I did in my code exactly as you recommended. For the small commands I use polling. And for the actual Data transfer from the SD-Card I use DMA. My threshold value is 128 Bytes, as you can see in my code in the Read-Method. I don’t know if this is the best threshold value, but it separates quite well the Data transfer (at least 512 Bytes) and the commands (16 Bytes for reading the CSD) in SD Card communication.
In your study of workload during SPI communication you used a quite low SPI frequency of 5 MHz. I did my Benchmark again with 5, 10 and 15 MHz, and the transfer rates with polling and DMA were in the same range. Could you redo your tests with a higher SPI-Frequency?
Could there be a bottleneck on the SPI-DMA?
Thanks and Regards.
Markus
Hi Markus,
Could you let me know which flag you are polling with? SPI0_SR [TCF] bit?
For the polling way, the core will stop working until the flag is set. This is not a highly efficient approach.
In fact, the customer could use the interrupt or DMA approach to save the time spent waiting on SPI flags.
That's why using DMA reduces the core workload, so the core can work on other tasks.
During my test, the SPI communication speed is not the key issue. I want to let customer know different DMA transfer way will affect the core reducing percentage.
Selecting the right DMA transfer mode could reduce the core workload dramatically.
Wish it helps.
Have a great day,
Ma Hui
-----------------------------------------------------------------------------------------------------------------------
Note: If this post answers your question, please click the Correct Answer button. Thank you!
-----------------------------------------------------------------------------------------------------------------------
Hello Ma Hui,
I absolutely agree with you. DMA should be used to reduce core working load. In my code above, I actively polled for finishing the DMA transfer which is surely not the way to use DMA. I did this to avoid influences of the Operating system in the Speed comparison between DMA and Polling. In the real code I would switch the Task here.
I use the benchmarks to have an indication if I have done all correctly. Putting heavy load on the system normally shows up the weak points. Being slower in DMA transfer than in Polling is for me a strong indication of having done something wrong. Do you agree?
In the Moment I use 8-Bit transfer in DMA (same as in Polling). I think I will try next 16-Bit transfer in DMA Mode and see if the throughput is increasing.
Thanks and Regards
Markus
PS I’m polling the SPI_SR_TFFF and SPI_SR_RFDF Flags in polling Mode.