From 6e0b79a33b7e6d5405be6d0ae5b16e5f3fd62fac Mon Sep 17 00:00:00 2001
From: Victor Oliveira <rhapsodyv@gmail.com>
Date: Mon, 29 Mar 2021 23:52:30 -0300
Subject: [PATCH] Fix and add STM32 SDIO DMA (#21476)

---
 .../src/HAL/STM32/Sd2Card_sdio_stm32duino.cpp | 219 ++++++++++--------
 Marlin/src/gcode/gcode_d.cpp                  | 105 +++++++--
 2 files changed, 199 insertions(+), 125 deletions(-)

diff --git a/Marlin/src/HAL/STM32/Sd2Card_sdio_stm32duino.cpp b/Marlin/src/HAL/STM32/Sd2Card_sdio_stm32duino.cpp
index fc9b960c1c..824142b889 100644
--- a/Marlin/src/HAL/STM32/Sd2Card_sdio_stm32duino.cpp
+++ b/Marlin/src/HAL/STM32/Sd2Card_sdio_stm32duino.cpp
@@ -36,9 +36,10 @@
 
   // use USB drivers
 
-  extern "C" { int8_t SD_MSC_Read(uint8_t lun, uint8_t *buf, uint32_t blk_addr, uint16_t blk_len);
-               int8_t SD_MSC_Write(uint8_t lun, uint8_t *buf, uint32_t blk_addr, uint16_t blk_len);
-               extern SD_HandleTypeDef hsd;
+  extern "C" {
+    int8_t SD_MSC_Read(uint8_t lun, uint8_t *buf, uint32_t blk_addr, uint16_t blk_len);
+    int8_t SD_MSC_Write(uint8_t lun, uint8_t *buf, uint32_t blk_addr, uint16_t blk_len);
+    extern SD_HandleTypeDef hsd;
   }
 
   bool SDIO_Init() {
@@ -75,7 +76,18 @@
     #error "ERROR - Only STM32F103xE, STM32F103xG, STM32F4xx or STM32F7xx CPUs supported"
   #endif
 
+  // Fixed SDIO pin assignments
+  #define SDIO_D0_PIN   PC8
+  #define SDIO_D1_PIN   PC9
+  #define SDIO_D2_PIN   PC10
+  #define SDIO_D3_PIN   PC11
+  #define SDIO_CK_PIN   PC12
+  #define SDIO_CMD_PIN  PD2
+
   SD_HandleTypeDef hsd;  // create SDIO structure
+  // F4 supports one DMA for RX and another for TX, but Marlin never reads
+  // and writes at the same time, so a single DMA handle is used for both.
+  DMA_HandleTypeDef hdma_sdio;
 
   /*
     SDIO_INIT_CLK_DIV is 118
@@ -96,12 +108,12 @@
 
   // Target Clock, configurable. Default is 18MHz, from STM32F1
   #ifndef SDIO_CLOCK
-    #define SDIO_CLOCK                         18000000       /* 18 MHz */
+    #define SDIO_CLOCK 18000000 // 18 MHz
   #endif
 
   // SDIO retries, configurable. Default is 3, from STM32F1
   #ifndef SDIO_READ_RETRIES
-    #define SDIO_READ_RETRIES                  3
+    #define SDIO_READ_RETRIES 3
   #endif
 
   // SDIO Max Clock (naming from STM Manual, don't change)
@@ -120,24 +132,21 @@
   }
 
   void go_to_transfer_speed() {
-    SD_InitTypeDef Init;
-
     /* Default SDIO peripheral configuration for SD card initialization */
-    Init.ClockEdge           = hsd.Init.ClockEdge;
-    Init.ClockBypass         = hsd.Init.ClockBypass;
-    Init.ClockPowerSave      = hsd.Init.ClockPowerSave;
-    Init.BusWide             = hsd.Init.BusWide;
-    Init.HardwareFlowControl = hsd.Init.HardwareFlowControl;
-    Init.ClockDiv            = clock_to_divider(SDIO_CLOCK);
+    // Whatever clock edge, bypass, power-save, bus-width and flow-control
+    // values hsd.Init currently holds are passed through unchanged, because
+    // SDIO_Init() below receives hsd.Init itself. Assigning those fields to
+    // themselves would be a no-op, so only the clock divider is updated here
+    // to select the SDIO_CLOCK transfer speed.
+    hsd.Init.ClockDiv = clock_to_divider(SDIO_CLOCK);
 
     /* Initialize SDIO peripheral interface with default configuration */
-    SDIO_Init(hsd.Instance, Init);
+    SDIO_Init(hsd.Instance, hsd.Init);
   }
 
   void SD_LowLevel_Init(void) {
     uint32_t tempreg;
 
-    __HAL_RCC_SDIO_CLK_ENABLE();
     __HAL_RCC_GPIOC_CLK_ENABLE(); //enable GPIO clocks
     __HAL_RCC_GPIOD_CLK_ENABLE(); //enable GPIO clocks
 
@@ -163,11 +172,45 @@
     GPIO_InitStruct.Pin = GPIO_PIN_2;
     HAL_GPIO_Init(GPIOD, &GPIO_InitStruct);
 
-    #if DISABLED(STM32F1xx)
-      // TODO: use __HAL_RCC_SDIO_RELEASE_RESET() and __HAL_RCC_SDIO_CLK_ENABLE();
-      RCC->APB2RSTR &= ~RCC_APB2RSTR_SDIORST_Msk;  // take SDIO out of reset
-      RCC->APB2ENR  |=  RCC_APB2RSTR_SDIORST_Msk;  // enable SDIO clock
-      // Enable the DMA2 Clock
+    // Setup DMA
+    #if defined(STM32F1xx)
+      hdma_sdio.Init.Mode = DMA_NORMAL;
+      hdma_sdio.Instance = DMA2_Channel4;
+      HAL_NVIC_EnableIRQ(DMA2_Channel4_5_IRQn);
+    #elif defined(STM32F4xx)
+      hdma_sdio.Init.Mode = DMA_PFCTRL;
+      hdma_sdio.Instance = DMA2_Stream3;
+      hdma_sdio.Init.Channel = DMA_CHANNEL_4;
+      hdma_sdio.Init.FIFOMode = DMA_FIFOMODE_ENABLE;
+      hdma_sdio.Init.FIFOThreshold = DMA_FIFO_THRESHOLD_FULL;
+      hdma_sdio.Init.MemBurst = DMA_MBURST_INC4;
+      hdma_sdio.Init.PeriphBurst = DMA_PBURST_INC4;
+      HAL_NVIC_EnableIRQ(DMA2_Stream3_IRQn);
+    #endif
+    HAL_NVIC_EnableIRQ(SDIO_IRQn);
+    hdma_sdio.Init.PeriphInc = DMA_PINC_DISABLE;
+    hdma_sdio.Init.MemInc = DMA_MINC_ENABLE;
+    hdma_sdio.Init.PeriphDataAlignment = DMA_PDATAALIGN_WORD;
+    hdma_sdio.Init.MemDataAlignment = DMA_MDATAALIGN_WORD;
+    hdma_sdio.Init.Priority = DMA_PRIORITY_LOW;
+    __HAL_LINKDMA(&hsd, hdmarx, hdma_sdio);
+    __HAL_LINKDMA(&hsd, hdmatx, hdma_sdio);
+
+    #if defined(STM32F1xx)
+      __HAL_RCC_SDIO_CLK_ENABLE();
+      __HAL_RCC_DMA2_CLK_ENABLE();
+    #else
+      __HAL_RCC_SDIO_FORCE_RESET();
+      delay(2);
+      __HAL_RCC_SDIO_RELEASE_RESET();
+      delay(2);
+      __HAL_RCC_SDIO_CLK_ENABLE();
+
+      __HAL_RCC_DMA2_FORCE_RESET();
+      delay(2);
+      __HAL_RCC_DMA2_RELEASE_RESET();
+      delay(2);
+      __HAL_RCC_DMA2_CLK_ENABLE();
     #endif
 
     //Initialize the SDIO (with initial <400Khz Clock)
@@ -179,6 +222,7 @@
 
     // Power up the SDIO
     SDIO_PowerState_ON(SDIO);
+    hsd.Instance = SDIO;
   }
 
   void HAL_SD_MspInit(SD_HandleTypeDef *hsd) { // application specific init
@@ -222,107 +266,82 @@
           if (!status) break;
           if (!--retry_Cnt) return false;   // return failing status if retries are exhausted
         }
+        go_to_transfer_speed();
       }
     #endif
 
     return true;
   }
-  /*
-  void init_SDIO_pins(void) {
-    GPIO_InitTypeDef GPIO_InitStruct = {0};
 
-    // SDIO GPIO Configuration
-    // PC8     ------> SDIO_D0
-    // PC12    ------> SDIO_CK
-    // PD2     ------> SDIO_CMD
+  static bool SDIO_ReadWriteBlock_DMA(uint32_t block, const uint8_t *src, uint8_t *dst) {
+    if(HAL_SD_GetCardState(&hsd) != HAL_SD_CARD_TRANSFER) return false;
 
-    GPIO_InitStruct.Pin = GPIO_PIN_8;
-    GPIO_InitStruct.Mode = GPIO_MODE_AF_PP;
-    GPIO_InitStruct.Pull = GPIO_NOPULL;
-    GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_VERY_HIGH;
-    GPIO_InitStruct.Alternate = GPIO_AF12_SDIO;
-    HAL_GPIO_Init(GPIOC, &GPIO_InitStruct);
+    TERN_(USE_WATCHDOG, HAL_watchdog_refresh());
 
-    GPIO_InitStruct.Pin = GPIO_PIN_12;
-    GPIO_InitStruct.Mode = GPIO_MODE_AF_PP;
-    GPIO_InitStruct.Pull = GPIO_NOPULL;
-    GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_VERY_HIGH;
-    GPIO_InitStruct.Alternate = GPIO_AF12_SDIO;
-    HAL_GPIO_Init(GPIOC, &GPIO_InitStruct);
-
-    GPIO_InitStruct.Pin = GPIO_PIN_2;
-    GPIO_InitStruct.Mode = GPIO_MODE_AF_PP;
-    GPIO_InitStruct.Pull = GPIO_NOPULL;
-    GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_VERY_HIGH;
-    GPIO_InitStruct.Alternate = GPIO_AF12_SDIO;
-    HAL_GPIO_Init(GPIOD, &GPIO_InitStruct);
-  }
-  */
-  //bool SDIO_init() { return (bool) (SD_SDIO_Init() ? 1 : 0);}
-  //bool SDIO_Init_C() { return (bool) (SD_SDIO_Init() ? 1 : 0);}
-
-  bool SDIO_ReadBlock(uint32_t block, uint8_t *dst) {
-    hsd.Instance = SDIO;
-    uint8_t retryCnt = SDIO_READ_RETRIES;
-
-    bool status;
-    for (;;) {
-      TERN_(USE_WATCHDOG, HAL_watchdog_refresh());
-      status = (bool) HAL_SD_ReadBlocks(&hsd, (uint8_t*)dst, block, 1, 1000);  // read one 512 byte block with 500mS timeout
-      status |= (bool) HAL_SD_GetCardState(&hsd);     // make sure all is OK
-      if (!status) break;       // return passing status
-      if (!--retryCnt) break;   // return failing status if retries are exhausted
+    HAL_StatusTypeDef ret;
+    if (src) {
+      hdma_sdio.Init.Direction = DMA_MEMORY_TO_PERIPH;
+      HAL_DMA_Init(&hdma_sdio);
+      ret = HAL_SD_WriteBlocks_DMA(&hsd, (uint8_t *)src, block, 1);
+    }
+    else {
+      hdma_sdio.Init.Direction = DMA_PERIPH_TO_MEMORY;
+      HAL_DMA_Init(&hdma_sdio);
+      ret = HAL_SD_ReadBlocks_DMA(&hsd, (uint8_t *)dst, block, 1);
     }
-    return status;
 
-    /*
-    return (bool) ((status_read | status_card) ? 1 : 0);
-
-    if (SDIO_GetCardState() != SDIO_CARD_TRANSFER) return false;
-    if (blockAddress >= SdCard.LogBlockNbr) return false;
-    if ((0x03 & (uint32_t)data)) return false; // misaligned data
-
-    if (SdCard.CardType != CARD_SDHC_SDXC) { blockAddress *= 512U; }
-
-    if (!SDIO_CmdReadSingleBlock(blockAddress)) {
-      SDIO_CLEAR_FLAG(SDIO_ICR_CMD_FLAGS);
-      dma_disable(SDIO_DMA_DEV, SDIO_DMA_CHANNEL);
+    if (ret != HAL_OK) {
+      HAL_DMA_Abort_IT(&hdma_sdio);
+      HAL_DMA_DeInit(&hdma_sdio);
       return false;
     }
 
-    while (!SDIO_GET_FLAG(SDIO_STA_DATAEND | SDIO_STA_TRX_ERROR_FLAGS)) {}
-
-    dma_disable(SDIO_DMA_DEV, SDIO_DMA_CHANNEL);
-
-    if (SDIO->STA & SDIO_STA_RXDAVL) {
-      while (SDIO->STA & SDIO_STA_RXDAVL) (void)SDIO->FIFO;
-      SDIO_CLEAR_FLAG(SDIO_ICR_CMD_FLAGS | SDIO_ICR_DATA_FLAGS);
-      return false;
+    uint32_t timeout = millis() + 500; // Deadline; may wrap past UINT32_MAX, handled by the check below
+    // Wait for the transfer to finish (hsd.State back to READY) or give up at the deadline
+    while (hsd.State != HAL_SD_STATE_READY) {
+      if ((int32_t)(millis() - timeout) > 0) { // Signed difference stays correct across millis() rollover
+        HAL_DMA_Abort_IT(&hdma_sdio);
+        HAL_DMA_DeInit(&hdma_sdio);
+        return false;
+      }
     }
 
-    if (SDIO_GET_FLAG(SDIO_STA_TRX_ERROR_FLAGS)) {
-      SDIO_CLEAR_FLAG(SDIO_ICR_CMD_FLAGS | SDIO_ICR_DATA_FLAGS);
-      return false;
-    }
-    SDIO_CLEAR_FLAG(SDIO_ICR_CMD_FLAGS | SDIO_ICR_DATA_FLAGS);
-    */
+    while (__HAL_DMA_GET_FLAG(&hdma_sdio, __HAL_DMA_GET_TC_FLAG_INDEX(&hdma_sdio)) != 0
+        || __HAL_DMA_GET_FLAG(&hdma_sdio, __HAL_DMA_GET_TE_FLAG_INDEX(&hdma_sdio)) != 0) { /* nada */ }
+
+    HAL_DMA_Abort_IT(&hdma_sdio);
+    HAL_DMA_DeInit(&hdma_sdio);
+
+    timeout = millis() + 500; // New deadline for the card to return to the TRANSFER state
+    while (HAL_SD_GetCardState(&hsd) != HAL_SD_CARD_TRANSFER)
+      if ((int32_t)(millis() - timeout) > 0) return false; // Signed difference is safe across millis() rollover
 
     return true;
   }
 
-  bool SDIO_WriteBlock(uint32_t block, const uint8_t *src) {
-    hsd.Instance = SDIO;
-    uint8_t retryCnt = SDIO_READ_RETRIES;
-    bool status;
-    for (;;) {
-      status = (bool) HAL_SD_WriteBlocks(&hsd, (uint8_t*)src, block, 1, 500);  // write one 512 byte block with 500mS timeout
-      status |= (bool) HAL_SD_GetCardState(&hsd);     // make sure all is OK
-      if (!status) break;       // return passing status
-      if (!--retryCnt) break;   // return failing status if retries are exhausted
-    }
-    return status;
+  bool SDIO_ReadBlock(uint32_t block, uint8_t *dst) { // Read one block into dst, retrying on failure
+    for (uint8_t attempts_left = SDIO_READ_RETRIES; attempts_left; attempts_left--)
+      if (SDIO_ReadWriteBlock_DMA(block, NULL, dst)) return true;
+    return false; // Every attempt failed
+  }
 
+  bool SDIO_WriteBlock(uint32_t block, const uint8_t *src) { // Write one block from src, retrying on failure
+    for (uint8_t attempts_left = SDIO_READ_RETRIES; attempts_left; attempts_left--)
+      if (SDIO_ReadWriteBlock_DMA(block, src, NULL)) return true;
+    return false; // Every attempt failed
+  }
+
+  #if defined(STM32F1xx)
+    #define DMA_IRQ_HANDLER DMA2_Channel4_5_IRQHandler // Matches the DMA2_Channel4 instance used for hdma_sdio on F1
+  #elif defined(STM32F4xx)
+    #define DMA_IRQ_HANDLER DMA2_Stream3_IRQHandler // Matches the DMA2_Stream3 instance used for hdma_sdio on F4
+  #else
+    #error "Unknown STM32 architecture."
+  #endif
+
+  extern "C" void SDIO_IRQHandler(void) { HAL_SD_IRQHandler(&hsd); } // Forward SDIO interrupts to the HAL SD driver
+  extern "C" void DMA_IRQ_HANDLER(void) { HAL_DMA_IRQHandler(&hdma_sdio); } // Forward DMA interrupts to the HAL DMA driver
+
 #endif // !USBD_USE_CDC_COMPOSITE
 #endif // SDIO_SUPPORT
 #endif // ARDUINO_ARCH_STM32 && !STM32GENERIC
diff --git a/Marlin/src/gcode/gcode_d.cpp b/Marlin/src/gcode/gcode_d.cpp
index 8941523a16..f87cebc886 100644
--- a/Marlin/src/gcode/gcode_d.cpp
+++ b/Marlin/src/gcode/gcode_d.cpp
@@ -29,6 +29,7 @@
   #include "../libs/hex_print.h"
   #include "../HAL/shared/eeprom_if.h"
   #include "../HAL/shared/Delay.h"
+  #include "../sd/cardreader.h"
 
   extern void dump_delay_accuracy_check();
 
@@ -126,19 +127,19 @@
       #endif
 
       case 4: { // D4 Read / Write PIN
-        // const uint8_t pin = parser.byteval('P');
-        // const bool is_out = parser.boolval('F'),
-        //            val = parser.byteval('V', LOW);
+        //const bool is_out = parser.boolval('F');
+        //const uint8_t pin = parser.byteval('P'),
+        //              val = parser.byteval('V', LOW);
         if (parser.seenval('X')) {
           // TODO: Write the hex bytes after the X
           //while (len--) {
           //}
         }
         else {
-          // while (len--) {
-            // TODO: Read bytes from EEPROM
-            // print_hex_byte(eeprom_read_byte(*(adr++));
-          // }
+          //while (len--) {
+          //// TODO: Read bytes from EEPROM
+          //  print_hex_byte(eeprom_read_byte(adr++));
+          //}
           SERIAL_EOL();
         }
       } break;
@@ -155,10 +156,10 @@
           //while (len--) {}
         }
         else {
-          // while (len--) {
-            // TODO: Read bytes from EEPROM
-            // print_hex_byte(eeprom_read_byte(adr++));
-          // }
+          //while (len--) {
+          //// TODO: Read bytes from EEPROM
+          //  print_hex_byte(eeprom_read_byte(adr++));
+          //}
           SERIAL_EOL();
         }
       } break;
@@ -186,22 +187,76 @@
         SERIAL_ECHOLNPGM("FAILURE: Watchdog did not trigger board reset.");
       } break;
 
-      #if ENABLED(POSTMORTEM_DEBUGGING)
-      case 451: { // Trigger all kind of faults to test exception catcher
-        SERIAL_ECHOLNPGM("Disabling heaters");
-        thermalManager.disable_all_heaters();
-        delay(1000); // Allow time to print
-        volatile uint8_t type[5] = { parser.byteval('T', 1) };
+      #if ENABLED(SDSUPPORT)
 
-        // The code below is obviously wrong and it's full of quirks to fool the compiler from optimizing away the code
-        switch (type[0]) {
-          case 1: default: *(int*)0 = 451; break; // Write at bad address
-          case 2: { volatile int a = 0; volatile int b = 452 / a; *(int*)&a = b; } break; // Divide by zero (some CPUs accept this, like ARM)
-          case 3: { *(uint32_t*)&type[1] = 453; volatile int a = *(int*)&type[1]; type[0] = a / 255; } break; // Unaligned access (some CPUs accept this)
-          case 4: { volatile void (*func)() = (volatile void (*)()) 0xE0000000; func(); } break; // Invalid instruction
+        case 101: { // D101 Test SD Write
+          card.openFileWrite("test.gco");
+          if (!card.isFileOpen()) {
+            SERIAL_ECHOLNPAIR("Failed to open test.gco to write.");
+            return;
+          }
+          __attribute__((aligned(sizeof(size_t)))) uint8_t buf[512]; // Buffer aligned to sizeof(size_t)
+
+          uint16_t c;
+          for (c = 0; c < COUNT(buf); c++)
+            buf[c] = 'A' + (c % ('Z' - 'A')); // Repeating 'A'..'Y' pattern: the modulus is 25, so 'Z' never appears
+
+          c = 1024 * 4; // The whole buffer is written 4096 times
+          while (c--) {
+            TERN_(USE_WATCHDOG, watchdog_refresh());
+            card.write(buf, COUNT(buf)); // NOTE(review): the result of write() is not checked
+          }
+          SERIAL_ECHOLNPGM(" done");
+          card.closefile();
+        } break;
+
+        case 102: { // D102 Test SD Read
+          card.openFileRead("test.gco");
+          if (!card.isFileOpen()) {
+            SERIAL_ECHOLNPAIR("Failed to open test.gco to read.");
+            return;
+          }
+          __attribute__((aligned(sizeof(size_t)))) uint8_t buf[512];
+          uint16_t c = 1024 * 4; // Number of buffer-sized reads to verify
+          while (c--) {
+            TERN_(USE_WATCHDOG, watchdog_refresh());
+            // Check the read result before comparing: if a read fails or comes
+            // up short, buf keeps bytes from an earlier pass (or is uninitialized)
+            // and could still match the expected pattern.
+            // NOTE(review): assumes card.read() returns the byte count read - confirm.
+            bool error = card.read(buf, COUNT(buf)) != (int)COUNT(buf);
+            for (uint16_t i = 0; !error && i < COUNT(buf); i++) {
+              if (buf[i] != ('A' + (i % ('Z' - 'A')))) error = true;
+            }
+            if (error) {
+              SERIAL_ECHOLNPGM(" Read error!");
+              break;
+            }
+          }
+          SERIAL_ECHOLNPGM(" done");
+          card.closefile();
+        } break;
+
+      #endif // SDSUPPORT
+
+      #if ENABLED(POSTMORTEM_DEBUGGING)
+
+        case 451: { // Trigger all kind of faults to test exception catcher
+          SERIAL_ECHOLNPGM("Disabling heaters");
+          thermalManager.disable_all_heaters();
+          delay(1000); // Allow time to print
+          volatile uint8_t type[5] = { parser.byteval('T', 1) };
+
+          // The code below is obviously wrong and it's full of quirks to fool the compiler from optimizing away the code
+          switch (type[0]) {
+            case 1: default: *(int*)0 = 451; break; // Write at bad address
+            case 2: { volatile int a = 0; volatile int b = 452 / a; *(int*)&a = b; } break; // Divide by zero (some CPUs accept this, like ARM)
+            case 3: { *(uint32_t*)&type[1] = 453; volatile int a = *(int*)&type[1]; type[0] = a / 255; } break; // Unaligned access (some CPUs accept this)
+            case 4: { volatile void (*func)() = (volatile void (*)()) 0xE0000000; func(); } break; // Invalid instruction
+          }
+          break;
         }
-        break;
-      }
+
       #endif
     }
   }