diff --git a/src/platforms/common/jtagtap.c b/src/platforms/common/jtagtap.c index 16cf5f6edfb..23866cbabb9 100644 --- a/src/platforms/common/jtagtap.c +++ b/src/platforms/common/jtagtap.c @@ -31,11 +31,13 @@ jtag_proc_s jtag_proc; static void jtagtap_reset(void); -static void jtagtap_tms_seq(uint32_t tms_states, size_t ticks); -static void jtagtap_tdi_tdo_seq(uint8_t *data_out, bool final_tms, const uint8_t *data_in, size_t clock_cycles); -static void jtagtap_tdi_seq(bool final_tms, const uint8_t *data_in, size_t clock_cycles); -static bool jtagtap_next(bool tms, bool tdi); -static void jtagtap_cycle(bool tms, bool tdi, size_t clock_cycles); +static inline void platform_delay_busy(const uint32_t loops) __attribute__((always_inline)); +static void jtagtap_tms_seq(uint32_t tms_states, size_t clock_cycles) __attribute__((optimize(3))); +static void jtagtap_tdi_tdo_seq(uint8_t *data_out, bool final_tms, const uint8_t *data_in, size_t clock_cycles) + __attribute__((optimize(3))); +static void jtagtap_tdi_seq(bool final_tms, const uint8_t *data_in, size_t clock_cycles) __attribute__((optimize(3))); +static bool jtagtap_next(bool tms, bool tdi) __attribute__((optimize(3))); +static void jtagtap_cycle(bool tms, bool tdi, size_t clock_cycles) __attribute__((optimize(3))); void jtagtap_init(void) { @@ -51,9 +53,8 @@ void jtagtap_init(void) jtag_proc.tap_idle_cycles = 1; /* Ensure we're in JTAG mode */ - for (size_t i = 0; i <= 50U; ++i) - jtagtap_next(true, false); /* 50 + 1 idle cycles for SWD reset */ - jtagtap_tms_seq(0xe73cU, 16U); /* SWD to JTAG sequence */ + jtagtap_cycle(true, false, 51U); /* 50 + 1 idle cycles for SWD reset */ + jtagtap_tms_seq(0xe73cU, 16U); /* SWD to JTAG sequence */ } static void jtagtap_reset(void) @@ -61,31 +62,42 @@ static void jtagtap_reset(void) #ifdef TRST_PORT if (platform_hwversion() == 0) { gpio_clear(TRST_PORT, TRST_PIN); - for (volatile size_t i = 0; i < 10000U; i++) - continue; + /* Hold low for approximately 1.5 ms */ + platform_delay(1U); /* Requires SysTick interrupt to be unblocked */ gpio_set(TRST_PORT, TRST_PIN); } #endif jtagtap_soft_reset(); } +/* Busy-looping delay snippet for GPIO bitbanging (rely on inlining) */ +static inline void platform_delay_busy(const uint32_t loops) +{ + for (register uint32_t counter = loops; --counter > 0U;) + __asm__(""); +} + static bool jtagtap_next_clk_delay() { gpio_set(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); const uint16_t result = gpio_get(TDO_PORT, TDO_PIN); gpio_clear(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); return result != 0; } +static bool jtagtap_next_no_delay() __attribute__((optimize(3))); + static bool jtagtap_next_no_delay() { gpio_set(TCK_PORT, TCK_PIN); + /* Stretch the clock high time */ + __asm__("nop" ::: "memory"); const uint16_t result = gpio_get(TDO_PORT, TDO_PIN); gpio_clear(TCK_PORT, TCK_PIN); + /* Stretch the clock low time */ + __asm__("nop" ::: "memory"); return result != 0; } @@ -105,15 +117,16 @@ static void jtagtap_tms_seq_clk_delay(uint32_t tms_states, const size_t clock_cy const bool state = tms_states & 1U; gpio_set_val(TMS_PORT, TMS_PIN, state); gpio_set(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); tms_states >>= 1U; gpio_clear(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } } +static void jtagtap_tms_seq_no_delay(uint32_t tms_states, const size_t clock_cycles) + __attribute__((noinline, optimize(3))); + static void jtagtap_tms_seq_no_delay(uint32_t tms_states, const size_t clock_cycles) { bool state = tms_states & 1U; @@ -130,13 +143,13 @@ static void jtagtap_tms_seq_no_delay(uint32_t tms_states, const size_t clock_cyc } } -static void jtagtap_tms_seq(const uint32_t tms_states, const size_t ticks) +static void jtagtap_tms_seq(const uint32_t tms_states, const size_t clock_cycles) { gpio_set(TDI_PORT, TDI_PIN); if (target_clk_divider != UINT32_MAX) - jtagtap_tms_seq_clk_delay(tms_states, ticks); + jtagtap_tms_seq_clk_delay(tms_states, clock_cycles); else - jtagtap_tms_seq_no_delay(tms_states, ticks); + jtagtap_tms_seq_no_delay(tms_states, clock_cycles); } static void jtagtap_tdi_tdo_seq_clk_delay( @@ -153,8 +166,7 @@ static void jtagtap_tdi_tdo_seq_clk_delay( gpio_set_val(TDI_PORT, TDI_PIN, data_in[byte] & (1U << bit)); /* Start the clock cycle */ gpio_set(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); /* If TDO is high, store a 1 in the appropriate position in the value being accumulated */ if (gpio_get(TDO_PORT, TDO_PIN)) value |= 1U << bit; @@ -164,8 +176,7 @@ static void jtagtap_tdi_tdo_seq_clk_delay( } /* Finish the clock cycle */ gpio_clear(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } /* If clock_cycles is not divisible by 8, we have some extra data to write back here. */ if (clock_cycles & 7U) { @@ -174,6 +185,9 @@ static void jtagtap_tdi_tdo_seq_clk_delay( } } +static void jtagtap_tdi_tdo_seq_no_delay(const uint8_t *const data_in, uint8_t *const data_out, const bool final_tms, + const size_t clock_cycles) __attribute__((optimize(3))); + static void jtagtap_tdi_tdo_seq_no_delay( const uint8_t *const data_in, uint8_t *const data_out, const bool final_tms, const size_t clock_cycles) { @@ -223,6 +237,9 @@ static void jtagtap_tdi_tdo_seq_no_delay( static void jtagtap_tdi_tdo_seq( uint8_t *const data_out, const bool final_tms, const uint8_t *const data_in, size_t clock_cycles) { + /* In case the callsite passes NULL for data_out, don't bother sampling TDO */ + if (!data_out) + return jtagtap_tdi_seq(final_tms, data_in, clock_cycles); gpio_clear(TMS_PORT, TMS_PIN); gpio_clear(TDI_PORT, TDI_PIN); if (target_clk_divider != UINT32_MAX) @@ -241,15 +258,16 @@ static void jtagtap_tdi_seq_clk_delay(const uint8_t *const data_in, const bool f /* Set up the TDI pin and start the clock cycle */ gpio_set_val(TDI_PORT, TDI_PIN, data_in[byte] & (1U << bit)); gpio_set(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); /* Finish the clock cycle */ gpio_clear(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } } +static void jtagtap_tdi_seq_no_delay(const uint8_t *const data_in, const bool final_tms, size_t clock_cycles) + __attribute__((optimize(3))); + static void jtagtap_tdi_seq_no_delay(const uint8_t *const data_in, const bool final_tms, size_t clock_cycles) { for (size_t cycle = 0; cycle < clock_cycles;) { @@ -294,20 +312,23 @@ static void jtagtap_cycle_clk_delay(const size_t clock_cycles) { for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { gpio_set(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); gpio_clear(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } } +static void jtagtap_cycle_no_delay(const size_t clock_cycles) __attribute__((optimize(3))); + static void jtagtap_cycle_no_delay(const size_t clock_cycles) { for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { gpio_set(TCK_PORT, TCK_PIN); + /* Stretch the clock high time */ __asm__ volatile("nop" ::: "memory"); gpio_clear(TCK_PORT, TCK_PIN); + /* Stretch the clock low time */ + __asm__ volatile("nop" ::: "memory"); } } diff --git a/src/platforms/common/stm32/gpio.h b/src/platforms/common/stm32/gpio.h index cd0f9707b40..fd078f1ceb6 100644 --- a/src/platforms/common/stm32/gpio.h +++ b/src/platforms/common/stm32/gpio.h @@ -29,11 +29,6 @@ static inline void bmp_gpio_set(const uint32_t gpioport, const uint16_t gpios) { /* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */ GPIO_BSRR(gpioport) = gpios; -#if defined(STM32F4) || defined(STM32F7) - /* FIXME: Check if doubling is still needed */ - /* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */ - GPIO_BSRR(gpioport) = gpios; -#endif } #define gpio_set bmp_gpio_set @@ -44,10 +39,6 @@ static inline void bmp_gpio_clear(const uint32_t gpioport, const uint16_t gpios) /* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */ GPIO_BRR(gpioport) = gpios; #else -#if defined(STM32F4) || defined(STM32F7) - /* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */ - GPIO_BSRR(gpioport) = gpios << 16U; -#endif /* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */ GPIO_BSRR(gpioport) = gpios << 16U; #endif diff --git a/src/platforms/common/stm32/timing_stm32.c b/src/platforms/common/stm32/timing_stm32.c index d9bc5bcca40..69f6eae58cb 100644 --- a/src/platforms/common/stm32/timing_stm32.c +++ b/src/platforms/common/stm32/timing_stm32.c @@ -30,7 +30,7 @@ bool running_status = false; static volatile uint32_t time_ms = 0; -uint32_t target_clk_divider = 0; +uint32_t target_clk_divider = UINT32_MAX; static size_t morse_tick = 0; #if defined(PLATFORM_HAS_POWER_SWITCH) && defined(STM32F1) @@ -140,9 +140,23 @@ uint32_t platform_time_ms(void) * per delay loop count with 2 delay loops per clock */ +#if defined(STM32F4) +/* Values for STM32F411 at 96 MHz */ +#define USED_SWD_CYCLES 12 +#define CYCLES_PER_CNT 4 +#elif defined(STM32F1) /* Values for STM32F103 at 72 MHz */ +#define USED_SWD_CYCLES 18 +#define CYCLES_PER_CNT 4 +#elif defined(STM32F0) +/* Values for STM32F072 at 48 MHz */ +#define USED_SWD_CYCLES 16 +#define CYCLES_PER_CNT 6 +#else +/* Inherit defaults for other platforms (F3, F7) */ #define USED_SWD_CYCLES 22 #define CYCLES_PER_CNT 10 +#endif void platform_max_frequency_set(const uint32_t frequency) { @@ -172,6 +186,11 @@ void platform_max_frequency_set(const uint32_t frequency) target_clk_divider = UINT32_MAX; return; } + /* A zero loops delay will underflow and hang in platform_delay_busy() */ + if (divisor == 0U) { + target_clk_divider = UINT32_MAX; + return; + } divisor /= 2U; target_clk_divider = divisor / (CYCLES_PER_CNT * frequency); if (target_clk_divider * (CYCLES_PER_CNT * frequency) < divisor) @@ -194,8 +213,36 @@ uint32_t platform_max_frequency_get(void) const uint32_t ratio = (target_clk_divider * BITBANG_DIVIDER_FACTOR) + BITBANG_DIVIDER_OFFSET; return rcc_ahb_frequency / ratio; #else + if (target_clk_divider == UINT32_MAX) + return rcc_ahb_frequency / USED_SWD_CYCLES; uint32_t result = rcc_ahb_frequency; - result /= USED_SWD_CYCLES + CYCLES_PER_CNT * target_clk_divider; + result /= USED_SWD_CYCLES + CYCLES_PER_CNT * target_clk_divider * 2U; return result; #endif } + +/* Busy-looping delay for GPIO bitbanging operations. SUBS+BNE.N take 4 cycles. */ +void platform_delay_busy(const uint32_t loops) +{ + /* Avoid using `volatile` variables which incur stack accesses */ +#if 0 + register uint32_t i = loops; + do { + /* + * A "tactical" single NOP takes 0-1 cycles on Cortex-M0/M3/M4/M7 + * and avoids DCE, but consumes 2 bytes of flash, amplified by static/inline. + * A normal `continue` in for/while/do-loops with no side-effects + * makes the whole loop disappear to DCE at higher than -O1. + */ + __asm__("nop"); + } while (--i > 0U); +#else + /* + * Another version which still assembles to SUBS+BNE.N; the NOP can be eliminated + * in favor of an empty inline asm/volatile block if it's enough to suppress DCE. + * Note that the predecrement has to be merged into the condition expression. + */ + for (register uint32_t i = loops; --i > 0U;) + __asm__(""); +#endif +} diff --git a/src/platforms/common/swdptap.c b/src/platforms/common/swdptap.c index 6742ba9fbe8..91fcffa9450 100644 --- a/src/platforms/common/swdptap.c +++ b/src/platforms/common/swdptap.c @@ -40,6 +40,7 @@ typedef enum swdio_status_e { swd_proc_s swd_proc; +static inline void platform_delay_busy(const uint32_t loops) __attribute__((always_inline, optimize(3))); static void swdptap_turnaround(swdio_status_t dir) __attribute__((optimize(3))); static uint32_t swdptap_seq_in(size_t clock_cycles) __attribute__((optimize(3))); static bool swdptap_seq_in_parity(uint32_t *ret, size_t clock_cycles) __attribute__((optimize(3))); @@ -54,6 +55,13 @@ void swdptap_init(void) swd_proc.seq_out_parity = swdptap_seq_out_parity; } +/* Busy-looping delay snippet for GPIO bitbanging (rely on inlining) */ +static inline void platform_delay_busy(const uint32_t loops) +{ + for (register uint32_t counter = loops; --counter > 0U;) + __asm__(""); +} + static void swdptap_turnaround(const swdio_status_t dir) { static swdio_status_t olddir = SWDIO_STATUS_FLOAT; @@ -71,12 +79,12 @@ static void swdptap_turnaround(const swdio_status_t dir) } else gpio_clear(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); gpio_set(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); if (dir == SWDIO_STATUS_DRIVE) { gpio_clear(SWCLK_PORT, SWCLK_PIN); @@ -92,11 +100,9 @@ static uint32_t swdptap_seq_in_clk_delay(const size_t clock_cycles) for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { gpio_clear(SWCLK_PORT, SWCLK_PIN); value |= gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN) ? 1U << cycle : 0U; - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); gpio_set(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } gpio_clear(SWCLK_PORT, SWCLK_PIN); return value; @@ -110,8 +116,10 @@ static uint32_t swdptap_seq_in_no_delay(const size_t clock_cycles) for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { gpio_clear(SWCLK_PORT, SWCLK_PIN); value |= gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN) ? 1U << cycle : 0U; + /* Reordering barrier (in place of a delay) to retain timings */ + __asm__("" ::: "memory"); gpio_set(SWCLK_PORT, SWCLK_PIN); - __asm__("nop"); + __asm__("" ::: "memory"); } gpio_clear(SWCLK_PORT, SWCLK_PIN); return value; @@ -129,15 +137,15 @@ static uint32_t swdptap_seq_in(size_t clock_cycles) static bool swdptap_seq_in_parity(uint32_t *ret, size_t clock_cycles) { const uint32_t result = swdptap_seq_in(clock_cycles); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); const bool parity = calculate_odd_parity(result); const bool bit = gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN); gpio_set(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); *ret = result; /* Terminate the read cycle now */ @@ -152,11 +160,9 @@ static void swdptap_seq_out_clk_delay(const uint32_t tms_states, const size_t cl for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { gpio_clear(SWCLK_PORT, SWCLK_PIN); gpio_set_val(SWDIO_PORT, SWDIO_PIN, tms_states & (1 << cycle)); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); gpio_set(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } gpio_clear(SWCLK_PORT, SWCLK_PIN); } @@ -166,9 +172,14 @@ static void swdptap_seq_out_no_delay(uint32_t tms_states, size_t clock_cycles) _ static void swdptap_seq_out_no_delay(const uint32_t tms_states, const size_t clock_cycles) { for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { + const bool state = tms_states & (1U << cycle); + /* Block the compiler from re-ordering the TMS states calculation to preserve timings */ + __asm__("" ::: "memory"); gpio_clear(SWCLK_PORT, SWCLK_PIN); - gpio_set_val(SWDIO_PORT, SWDIO_PIN, tms_states & (1 << cycle)); + gpio_set_val(SWDIO_PORT, SWDIO_PIN, state); + __asm__("" ::: "memory"); gpio_set(SWCLK_PORT, SWCLK_PIN); + __asm__("" ::: "memory"); } gpio_clear(SWCLK_PORT, SWCLK_PIN); } @@ -187,10 +198,12 @@ static void swdptap_seq_out_parity(const uint32_t tms_states, const size_t clock const bool parity = calculate_odd_parity(tms_states); swdptap_seq_out(tms_states, clock_cycles); gpio_set_val(SWDIO_PORT, SWDIO_PIN, parity); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); + gpio_set(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); + gpio_clear(SWCLK_PORT, SWCLK_PIN); }