STM32 + HAL + FreeRTOS Part V: SPI (with DMA)

The main flow of SPI (or any other communication interface, for that matter) is this: the CPU generates data to send, passes it along to the peripheral (or bit-banging logic, but that's out of scope) and then waits for magic to happen.
There are multiple ways of "magic happening":
  • blocking mode - CPU actively does nothing, but checks the peripheral's status until transmit is done
  • interrupt mode - CPU hands off data and then proceeds with dealing with whatever it has to do. It gets interrupted, once transfer is complete and then can act upon it (store to memory, parse or whatever)
  • direct memory access (DMA) mode - CPU hands off the data and proceeds. Peripheral meanwhile sends and receives data directly from/to the memory region defined and notifies the CPU only when the transfer is done. At that point CPU can access data already in memory, it does not have to be fetched from peripheral anymore
DMA provides quite a few benefits - CPU does not have to worry about arranging transmission or data storage. It can even go to sleep, thus saving power. Or in data-intensive applications it can process the data batch, while another is on its way. So, what we get is extra clock cycles and possible power saving.

ST provides application note AN4031, which describes the DMA functionality. STM32F4 has 2 DMA controllers, each responsible for its own set of peripherals. Each controller has 8 streams, each stream bound to specific peripherals. Tables 1 and 2 show each controller's stream/peripheral mappings. We are interested in SPI5, since that's where the on-board gyro is connected. SPI5 is available on DMA2 Streams 3, 4 (channel 2), 5 and 6 (channel 7). Transmit (TX) and receive (RX) requests are served by separate streams. When two streams have the same software priority, hardware priority follows the inverse of the stream number (stream0 has higher priority than stream2), so care must be taken when distributing streams in an application to avoid race conditions and bad states. But we don't care much about it now, because our application is tiny and we are not using any other streams which could affect our lives.

OK, let's get to coding. First, we need to initialize DMA, by enabling its clock and defining streams. We'll do that in the same SPI5 configuration (spi.c):
#include "spi.h"
#include "gpio.h"

/* HAL handle for the SPI5 peripheral; configured in MX_SPI5_Init(). */
SPI_HandleTypeDef hspi5;
/* DMA stream handle used for SPI5 reception (set up in HAL_SPI_MspInit()). */
DMA_HandleTypeDef hdma_rx;
/* DMA stream handle used for SPI5 transmission (set up in HAL_SPI_MspInit()). */
DMA_HandleTypeDef hdma_tx;

/* SPI5 init function */
void MX_SPI5_Init(void) {

  hspi5.Instance = SPI5;
  hspi5.Init.Mode = SPI_MODE_MASTER;
  hspi5.Init.Direction = SPI_DIRECTION_2LINES;
  hspi5.Init.DataSize = SPI_DATASIZE_8BIT;
  hspi5.Init.CLKPolarity = SPI_POLARITY_HIGH;
  hspi5.Init.CLKPhase = SPI_PHASE_2EDGE;
  hspi5.Init.NSS = SPI_NSS_SOFT;
  hspi5.Init.BaudRatePrescaler = SPI_BAUDRATEPRESCALER_16;
  hspi5.Init.FirstBit = SPI_FIRSTBIT_MSB;
  hspi5.Init.TIMode = SPI_TIMODE_DISABLE;
  hspi5.Init.CRCCalculation = SPI_CRCCALCULATION_DISABLE;
  hspi5.Init.CRCPolynomial = 10;
  if (HAL_SPI_Init(&hspi5) != HAL_OK) {
    _Error_Handler(__FILE__, __LINE__);
  }

}

/**
 * Low-level (MSP) initialisation hook for SPI, called by the HAL while
 * HAL_SPI_Init() is running.
 *
 * For SPI5 this enables the SPI5 and DMA2 clocks, routes PF7/PF8/PF9 to the
 * SPI5 alternate function, configures DMA2 Stream3 (RX) and Stream4 (TX) on
 * channel 2 for byte-wise transfers, links both streams to the SPI handle and
 * enables the DMA and SPI interrupt lines. Other SPI instances are ignored.
 *
 * @param spiHandle  SPI handle being initialised by the HAL.
 */
void HAL_SPI_MspInit(SPI_HandleTypeDef* spiHandle) {

  /* Zero-initialise so no field is ever passed to the HAL uninitialised. */
  GPIO_InitTypeDef GPIO_InitStruct = {0};

  if (spiHandle->Instance == SPI5) {

    /* SPI5 and DMA2 clock enable */
    __HAL_RCC_SPI5_CLK_ENABLE();
    __HAL_RCC_DMA2_CLK_ENABLE();

    /**SPI5 GPIO Configuration
    PF7     ------> SPI5_SCK
    PF8     ------> SPI5_MISO
    PF9     ------> SPI5_MOSI
    */
    GPIO_InitStruct.Pin = SPI5_SCK_Pin | SPI5_MISO_Pin | SPI5_MOSI_Pin;
    GPIO_InitStruct.Mode = GPIO_MODE_AF_PP;
    GPIO_InitStruct.Pull = GPIO_NOPULL;
    GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_LOW;
    GPIO_InitStruct.Alternate = GPIO_AF5_SPI5;
    HAL_GPIO_Init(GPIOF, &GPIO_InitStruct);

    /* RX: peripheral -> memory, one byte at a time, memory pointer advances */
    hdma_rx.Instance = DMA2_Stream3;
    hdma_rx.Init.Channel = DMA_CHANNEL_2;
    hdma_rx.Init.Direction = DMA_PERIPH_TO_MEMORY;
    hdma_rx.Init.PeriphInc = DMA_PINC_DISABLE;
    hdma_rx.Init.MemInc = DMA_MINC_ENABLE;
    hdma_rx.Init.PeriphDataAlignment = DMA_PDATAALIGN_BYTE;
    hdma_rx.Init.MemDataAlignment = DMA_MDATAALIGN_BYTE;
    hdma_rx.Init.Mode = DMA_NORMAL;
    hdma_rx.Init.Priority = DMA_PRIORITY_VERY_HIGH;
    hdma_rx.Init.FIFOMode = DMA_FIFOMODE_DISABLE;
    if (HAL_DMA_Init(&hdma_rx) != HAL_OK) {
        _Error_Handler(__FILE__, __LINE__);
    }

    /* Link through the handle we were given rather than the global, so the
       stream is attached to the SPI handle that is actually being set up. */
    __HAL_LINKDMA(spiHandle, hdmarx, hdma_rx);

    /* TX: memory -> peripheral, same geometry as RX */
    hdma_tx.Instance = DMA2_Stream4;
    hdma_tx.Init.Channel = DMA_CHANNEL_2;
    hdma_tx.Init.Direction = DMA_MEMORY_TO_PERIPH;
    hdma_tx.Init.PeriphInc = DMA_PINC_DISABLE;
    hdma_tx.Init.MemInc = DMA_MINC_ENABLE;
    hdma_tx.Init.PeriphDataAlignment = DMA_PDATAALIGN_BYTE;
    hdma_tx.Init.MemDataAlignment = DMA_MDATAALIGN_BYTE;
    hdma_tx.Init.Mode = DMA_NORMAL;
    hdma_tx.Init.Priority = DMA_PRIORITY_VERY_HIGH;
    hdma_tx.Init.FIFOMode = DMA_FIFOMODE_DISABLE;
    if (HAL_DMA_Init(&hdma_tx) != HAL_OK) {
        _Error_Handler(__FILE__, __LINE__);
    }

    __HAL_LINKDMA(spiHandle, hdmatx, hdma_tx);

    HAL_NVIC_SetPriority(DMA2_Stream3_IRQn, 3, 0);
    HAL_NVIC_EnableIRQ(DMA2_Stream3_IRQn);

    HAL_NVIC_SetPriority(DMA2_Stream4_IRQn, 4, 0);
    HAL_NVIC_EnableIRQ(DMA2_Stream4_IRQn);

    /* In interrupt mode the HAL invokes HAL_SPI_TxRxCpltCallback() from the
       SPI5 ISR, and that callback uses the RTOS queue/pool API. The ISR must
       therefore not be more urgent (numerically lower) than
       configLIBRARY_MAX_SYSCALL_INTERRUPT_PRIORITY; 5 satisfies both the
       value of 5 and the value of 3 discussed in the article. */
    HAL_NVIC_SetPriority(SPI5_IRQn, 5, 0);
    HAL_NVIC_EnableIRQ(SPI5_IRQn);
  }
}

/**
 * Low-level (MSP) de-initialisation hook for SPI; undoes HAL_SPI_MspInit()
 * for SPI5 and ignores other SPI instances.
 *
 * Order matters: interrupt lines are masked first, the DMA streams are reset
 * while DMA2 is still clocked (register writes to an unclocked peripheral
 * have no effect), and the clocks are switched off last.
 *
 * @param spiHandle  SPI handle being de-initialised by the HAL.
 */
void HAL_SPI_MspDeInit(SPI_HandleTypeDef* spiHandle) {

  if(spiHandle->Instance == SPI5) {
    /* Mask the interrupt lines enabled in HAL_SPI_MspInit() */
    HAL_NVIC_DisableIRQ(SPI5_IRQn);
    HAL_NVIC_DisableIRQ(DMA2_Stream3_IRQn);
    HAL_NVIC_DisableIRQ(DMA2_Stream4_IRQn);

    /* SPI5 DMA DeInit - must run before the DMA2 clock is removed */
    HAL_DMA_DeInit(spiHandle->hdmarx);
    HAL_DMA_DeInit(spiHandle->hdmatx);

    /**SPI5 GPIO Configuration
    PF7     ------> SPI5_SCK
    PF8     ------> SPI5_MISO
    PF9     ------> SPI5_MOSI
    */
    HAL_GPIO_DeInit(GPIOF, SPI5_SCK_Pin | SPI5_MISO_Pin | SPI5_MOSI_Pin);

    /* Peripheral clock disable */
    __HAL_RCC_SPI5_CLK_DISABLE();
    /* DMA2 clock disable.
       NOTE(review): this switches DMA2 off for every user; only valid as
       long as no other peripheral relies on DMA2. */
    __HAL_RCC_DMA2_CLK_DISABLE();
  }
}

/**
 * HAL callback for a completed transmit/receive transfer.
 *
 * Takes a slot from spi_pool, stores rxbuf[1] together with the tag
 * "CALLBACK" in it and posts the pointer to xGyroQueue without waiting.
 * If the pool is exhausted the sample is dropped; if the queue rejects the
 * message the slot is returned to the pool so it is not leaked.
 *
 * @param hspi  SPI handle that finished the transfer (not inspected here).
 */
void HAL_SPI_TxRxCpltCallback(SPI_HandleTypeDef *hspi) {
    (void)hspi;

    SPI_QueueItem_t *item = (SPI_QueueItem_t*)osPoolAlloc(spi_pool);
    if (item == NULL) {
        /* Pool exhausted: nothing to write into, drop this notification. */
        return;
    }

    item->value = rxbuf[1];
    item->source = "CALLBACK";

    /* Timeout 0: never block here. On failure nobody else knows about the
       slot, so it has to be released by us. */
    if (osMessagePut(xGyroQueue, (uint32_t) item, 0) != osOK) {
        osPoolFree(spi_pool, item);
    }
}
What is happening here is that we define 2 handles for DMA configuration. We leave the actual SPI peripheral initialization code as is, but in HAL_SPI_MspInit() we enable the clock for the DMA2 controller and then configure the streams themselves. Since DMA on STM32 is quite flexible, you can have it working on transmit only, on receive only, or on both. So we have to set up two DMA streams - one for RX and one for TX. Once a DMA stream is configured, we initialize it and link it to the SPI handle itself - the handle now holds a reference to the DMA instance, which is used internally for transfer management.

Last, but not least, we should enable interrupts, so that we get some feedback on when the transfer has been completed.
In cleanup function we also should stop the DMA2 clock (if it's not used anywhere else) and deinitialize the DMA channels we were using.
HAL also provides callback functions for TX completed, RX completed and TXRX (transceiving) completed, which can be used once interrupt service routine has been completed (clearing flags, etc). Don't worry, it's done by HAL internally.

You might notice some weirdness happening in the callback function. What I have done, is defined a custom structure, which I fill with data in callback functions. This structure is defined in spi.h:
/* SPI5 handle, defined in spi.c. */
extern SPI_HandleTypeDef hspi5;
/* Queue carrying pointers to SPI_QueueItem_t objects; defined in freertos.c. */
extern osMessageQId xGyroQueue;
/* Receive and transmit buffers shared by the SPI transfers; defined in freertos.c. */
extern uint8_t rxbuf[3];
extern uint8_t txbuf[3];
/* Memory pool the queue items are allocated from; defined in freertos.c. */
extern osPoolId spi_pool;

/* Project error hook; call sites pass __FILE__ and __LINE__. */
extern void _Error_Handler(char *, int);

/**
 * One message passed through xGyroQueue (by pointer).
 *
 * The struct tag is left unchanged for compatibility with existing code,
 * although identifiers starting with a double underscore are reserved.
 */
typedef struct __SPI_QueueItem_t {
  uint32_t value;      /* payload, e.g. a byte read from the SPI RX buffer */
  const char *source;  /* producer tag; always points at a string literal, hence const */
} SPI_QueueItem_t;
The SPI_QueueItem_t structure has two members - a uint32 value and a pointer to a character sequence, which I'll use to store a string value with info on the origin of the structure.
Now it's time for a brief theoretical intermission.

Queues

I will use this structure, to pass around received data from interrupt to a processing task using a queue. A queue is just that - a fixed-length list of items. A task can be assigned to monitor a queue and take items from the list. Items are put in the list by some other tasks. Normally, items can be taken only once. Initially this list is empty and monitoring task is in suspended state and gets woken up (transitions into Ready state) once an item is available in queue. As soon as processing task gets some CPU time (there are no tasks of higher priority blocking it), it'll take one item from the queue and process it, thus freeing a slot in the queue. Queues by nature are first-in-first-out (FIFO), so care must be taken to give some time slot for data processing, otherwise the data can get stale.
A queue by default can pass a single uint32 value, which, conveniently enough, is just the right amount of space to hold a pointer to a memory location, which can store whatever - from a series of bits to bitmapped images. Passing a pointer is what we are going to do - we will pass a pointer instead of a value, since I want not just a value, but also the source of the data. For this we need a mechanism for managing memory, and this is where the next topic comes in:

Memory pooling

A memory pool is a bunch of memory that is assigned for storing a number of particular objects. Usually application developer is able to predict types and amount of data and thus the amount of memory required for storing temporary data. As with a queue, it is fixed-size, except one can retrieve any value and in any order, using a reference (pointer). Object is stored in memory until it is specifically freed.

So, back to code. In our spi.h we define that we'll use external queue and a memory pool, as well as TX/RX buffers. In spi.c callback function we request a slot for a new object from our spi_pool pool by calling osPoolAlloc(spi_pool). This function returns a pointer to allocated space. We cast this pointer to our SPI_QueueItem structure pointer and then fill it with values. Once structure is populated, we put pointer to it into the queue.
Let's look at how do we define memory pool and queue for our data in freertos.c:
/* Variables -----------------------------------------------------------------*/
/* Handles of the threads created in MX_FREERTOS_Init(). */
osThreadId defaultTaskHandle, blinkyTaskHandle, gyroTaskHandle, gyroPrinterHandle;
/* Queue used to hand SPI_QueueItem_t pointers to vGyroPrinterTask(). */
osMessageQId xGyroQueue;
/* SPI receive buffer; the third element is implicitly zero-initialised. */
uint8_t rxbuf[3] = {0x00, 0x00};
uint8_t txbuf[3] = {0x0F | 0x80, 0x00}; // 0x0F is WHO_AM_I register, 0x80 read bit, should return 0b11010100 or 0xD4
/* Pool definition with room for 10 SPI_QueueItem_t objects, and its runtime id. */
osPoolDef(spi_pool, 10, SPI_QueueItem_t);
osPoolId spi_pool;

/* Function prototypes -------------------------------------------------------*/
/* Thread entry points and the RTOS object setup routine. */
void StartDefaultTask(void const * argument);
void vBlinkyTask(void const * argument);
void vGyroTesterTask(void const * argument);
void vGyroPrinterTask(void const * argument);
void MX_FREERTOS_Init(void); /* (MISRA C 2004 rule 8.1) */
 
.. 
 
/* Init FreeRTOS */
void MX_FREERTOS_Init(void) {

 spi_pool = osPoolCreate(osPool(spi_pool));
 osThreadDef(defaultTask, StartDefaultTask, osPriorityLow, 0, 1000);
 defaultTaskHandle = osThreadCreate(osThread(defaultTask), NULL);

 osThreadDef(blinkyTask, vBlinkyTask, osPriorityHigh, 4, 1000);
 blinkyTaskHandle = osThreadCreate(osThread(blinkyTask), NULL);

 osThreadDef(gyroPrinterTask, vGyroPrinterTask, osPriorityLow, 1, 1000);
 gyroPrinterHandle = osThreadCreate(osThread(gyroPrinterTask), NULL);

 osMessageQDef(gyroPrinterQueue, 10, SPI_QueueItem_t); // 10 pointers
 xGyroQueue = osMessageCreate(osMessageQ(gyroPrinterQueue), NULL);

 // Put test data into the queue
 SPI_QueueItem_t *item = (SPI_QueueItem_t*)osPoolAlloc(spi_pool);
 item->value = 0x33;
 item->source = "TEST";
 osMessagePut(xGyroQueue, (uint32_t) item, 0);

 osThreadDef(gyroTask, vGyroTesterTask, osPriorityHigh, 0, 1000);
 gyroTaskHandle = osThreadCreate(osThread(gyroTask), NULL);

}

..

/**
 * Thread that drains xGyroQueue and prints every received item.
 *
 * Waits indefinitely for the first message, reports how many messages are
 * pending (the one just taken plus those still queued), then keeps fetching
 * with a 1-tick timeout until the queue is empty. Every item is returned to
 * spi_pool after it has been printed.
 *
 * @param argument  unused thread argument.
 */
void vGyroPrinterTask(void const * argument) {
 (void)argument;
 osEvent event;
 uint8_t count = 0;
 SPI_QueueItem_t *item;
 while(1) {
  event = osMessageGet(xGyroQueue, osWaitForever);
  if (event.status == osEventMessage) {
   /* One message has already been removed by the call above, so add it to
      the number still waiting. Only printed for a real message event. */
   printf("Got %lu messages in queue\r\n", (unsigned long)osMessageWaiting(xGyroQueue) + 1UL);
  }
  while (event.status == osEventMessage) {
   item = (SPI_QueueItem_t *)event.value.p;
   count++;
   if (item != NULL) {
    printf("Message %d: from %s: %lx\r\n", count, item->source, (unsigned long)item->value);
    osPoolFree(spi_pool, item);
   }
   event = osMessageGet(xGyroQueue, 1);
  }
  count = 0;
  osThreadYield();
 }
}
First, we use osPoolDef() macro to define pool, its name, depth and content type. Then we define a global variable for passing it around. In MX_FREERTOS_Init() we create the actual pool before using it. To create an object in the pool, we once again use osPoolAlloc() to get a pointer and use pointer to assign values. Here we create a test value with dummy data just to see, that the pool and queue processing works correctly.

Up to now we can fill the pool, but have no way of retrieving anything from it. For this we'll create a queue to pass around pointers to items in our pool. osMessageQDef() and osMessageCreate() deals with that. We create queue the same size as the pool itself and assign it to a global for use elsewhere.

vGyroPrinterTask() deals with processing the queue and freeing the pool. By default it sits in suspended state (waiting forever) until message comes in. Messages in CMSIS OS are implemented as a subtype of events, so we have to check for event type before we start processing it. Once it's clear, that it actually is a message, we can check for number of messages waiting in the queue and then process them until the queue is empty. Otherwise our thread will process a single message and then wait until the next tick to process next one.
In the processing loop we fish out pointer, and, since we don't have any other message types, we assign them to our SPI_QueueItem type pointer. At which point we get access to the members of the structure and can print them out. Once we are done with the object, we throw it out of the pool by telling memory management to free item at this location via osPoolFree().

Now we would like to populate the queue from DMA interrupts. The callback function defined in spi.c can be used for that. Alternatively, for passing the data from DMA, we can use the ISRs themselves - particularly the interrupt handlers in stm32f4xx_it.c:
/* Objects defined in other translation units and used by the handlers below. */
extern TIM_HandleTypeDef htim6;
extern SPI_HandleTypeDef hspi5;
/* Queue and pool used to publish received data from interrupt context. */
extern osMessageQId xGyroQueue;
extern uint8_t rxbuf[3];
extern uint8_t txbuf[3];
extern osPoolId spi_pool; 
/* NOTE(review): spiDone, written by SPI5_IRQHandler(), is not declared in this
   excerpt; it is presumably declared volatile elsewhere - confirm. */
 
..
 
// SPI5 DMA receive done
void DMA2_Stream3_IRQHandler(void) {
 SPI_QueueItem_t *item = (SPI_QueueItem_t*)osPoolAlloc(spi_pool);
 item->value = rxbuf[1];
 item->source = "DMA IRQ";
 osMessagePut(xGyroQueue, (uint32_t) item, 0);
 HAL_DMA_IRQHandler(hspi5.hdmarx);
}

// SPI5 DMA transmit done
void DMA2_Stream4_IRQHandler(void) {
 // Don't do anything, but still works
 HAL_DMA_IRQHandler(hspi5.hdmatx);
}

void SPI5_IRQHandler(void) {
 HAL_GPIO_WritePin(NCS_MEMS_SPI_GPIO_Port, NCS_MEMS_SPI_Pin, GPIO_PIN_SET);
 spiDone = 1;
 HAL_SPI_IRQHandler(&hspi5);
}
Here we once again create an item in the memory pool and send pointer to the queue. I have also defined a handler for interrupt-based transmit, if anybody wants it, but it is not required for dealing with DMA.


NB! To get access to queues (or any other FreeRTOS API functionality) within interrupts, we have to raise configLIBRARY_MAX_SYSCALL_INTERRUPT_PRIORITY in FreeRTOSConfig.h, i.e. change its value from 5 to 3 (interrupt priorities are inverted: smaller numbers mean higher priority), so that the interrupts which call the API are not above it. I spent a bit of time debugging this, until I found out about this issue. Turns out, STM32CubeMX does not assign these values correctly while generating code.

Now last touch - actually using all this setup in freertos.c:
void vGyroTesterTask(void const * argument) {
 HAL_StatusTypeDef response = HAL_ERROR; // default to error, so we can see, if value actually gets updated by HAL

 /* Transceive data with gyro in blocking mode */
 HAL_GPIO_WritePin(NCS_MEMS_SPI_GPIO_Port, NCS_MEMS_SPI_Pin, GPIO_PIN_RESET);
 response = HAL_SPI_TransmitReceive(&hspi5, txbuf, rxbuf, 2, 1000);
 HAL_GPIO_WritePin(NCS_MEMS_SPI_GPIO_Port, NCS_MEMS_SPI_Pin, GPIO_PIN_SET);
 if (response == HAL_OK) {
  printf("Sent: %02x %02x Got: %02x %02x\r\n", txbuf[0], txbuf[1], rxbuf[0], rxbuf[1]);
 } else {
  printf("Got error response as %d\r\n", response);
 }

 /* Now do the same in DMA mode */
 memset(rxbuf, 0x00, sizeof rxbuf);
 printf("RX buffer reset to %02x %02x\r\n", rxbuf[0], rxbuf[1]);
 HAL_GPIO_WritePin(NCS_MEMS_SPI_GPIO_Port, NCS_MEMS_SPI_Pin, GPIO_PIN_RESET);
 response = HAL_SPI_TransmitReceive_DMA(&hspi5, txbuf, rxbuf, 2);
 if (response != HAL_OK) {
  printf("Got error response as %d\r\n", response);
 }

 /* Print fome stuff, just to keep CPU busy to show that it's actually DMA performing transmit */
 uint8_t state = HAL_SPI_GetState(&hspi5);
 while (state != HAL_SPI_STATE_READY) {
   state = HAL_SPI_GetState(&hspi5);
   printf("State is: %d\r\n", state);
 }
 HAL_GPIO_WritePin(NCS_MEMS_SPI_GPIO_Port, NCS_MEMS_SPI_Pin, GPIO_PIN_SET);
 printf("Sent via DMA: %02x %02x Got: %02x %02x\r\n", txbuf[0], txbuf[1], rxbuf[0], rxbuf[1]);


 /* Again, this time using interrupts */
 memset(rxbuf, 0x00, sizeof rxbuf);
 printf("RX buffer reset to %02x %02x\r\n", rxbuf[0], rxbuf[1]);
 state = HAL_SPI_STATE_RESET;
 spiDone = 0;
 HAL_GPIO_WritePin(NCS_MEMS_SPI_GPIO_Port, NCS_MEMS_SPI_Pin, GPIO_PIN_RESET);
 response = HAL_SPI_TransmitReceive_IT(&hspi5, txbuf, rxbuf, 2);
 if (response != HAL_OK) {
  printf("Got error response as %d\r\n", response);
 }

 while (spiDone != 1) {
  printf("Not done yet!\r\n");
 }
 printf("Sent via IT: %02x %02x Got: %02x %02x\r\n", txbuf[0], txbuf[1], rxbuf[0], rxbuf[1]);


 osThreadTerminate(gyroTaskHandle);
}
Here we still send data in blocking mode. Afterwards we do the same in DMA mode. While DMA is working in the background, we print the peripheral state in a loop without delays (well, actually delay is while UART is sending, since UART right now is working in blocking mode). Anyhow, first printout of the state should be "5" or "busy", once it's done sending, printouts should stop. Memset is used to clear the receive buffer to show, that we are actually receiving the data, not just reusing values already there.
This setup should show callback hierarchy - first DMA IRQ should be printed, and afterwards SPI general callback. And the result is as expected:
What we see in this screenshot is:
  1. Blocking mode still works
  2. DMA is transferring in background, while UART is blocking for printing 
  3. Once UART stops blocking, data is available in the same thread
  4. Sending in non-blocking mode with interrupt gets executed and interrupt gets called (but no data for some reason). Too lazy right now to debug it and I'm not particularly interested in such usecase. Might return to it eventually
  5. Messages are put from both ISR and callback in this order.
  6. Lowest priority tasks (queue processor and blinky) get executed last
  7. FIFO nature of message queue.
I'll end on this for now, post is becoming a bit bloated already.
As usual, sources are available on GitHub

1 comment: