From 37254cab324b8be2daa458bd8bb78ce5783e171c Mon Sep 17 00:00:00 2001 From: Taylor Yu Date: Wed, 3 Jan 2024 20:49:53 -0600 Subject: [PATCH 1/4] common/swdptap: make SWD timing more consistent Make SWD timing more consistent: output changes (including floating) take place after the falling edge, and reads take place before the rising edge. This includes moving a gpio_get to occur after the clock-low delay, which would help reduce incorrect reads from lower-bandwidth target connections. Eliminate redundant GPIO operations, typically ones that set the clock low, by making the bit string input/output primitives always end by outputting a falling clock edge. --- src/platforms/common/swdptap.c | 39 +++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/platforms/common/swdptap.c b/src/platforms/common/swdptap.c index 1b113e3e828..aee7e4031a0 100644 --- a/src/platforms/common/swdptap.c +++ b/src/platforms/common/swdptap.c @@ -46,6 +46,18 @@ static bool swdptap_seq_in_parity(uint32_t *ret, size_t clock_cycles) __attribut static void swdptap_seq_out(uint32_t tms_states, size_t clock_cycles) __attribute__((optimize(3))); static void swdptap_seq_out_parity(uint32_t tms_states, size_t clock_cycles) __attribute__((optimize(3))); +/* + * Overall strategy for timing consistency: + * + * - Each primitive ends with a falling clock edge + * - Output is driven after the falling clock edge + * - Input is read immediately before the rising clock edge + * - Each primitive assumes it was immediately preceded by a falling clock edge + * + * This increases the chances of meeting setup and hold times when the target + * connection is lower bandwidth (with adequately slower clocks configured). + */ + void swdptap_init(void) { swd_proc.seq_in = swdptap_seq_in; @@ -68,9 +80,7 @@ static void swdptap_turnaround(const swdio_status_t dir) if (dir == SWDIO_STATUS_FLOAT) { SWDIO_MODE_FLOAT(); - } else - gpio_clear(SWCLK_PORT, SWCLK_PIN); - + } for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) continue; @@ -78,8 +88,8 @@ static void swdptap_turnaround(const swdio_status_t dir) for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) continue; + gpio_clear(SWCLK_PORT, SWCLK_PIN); if (dir == SWDIO_STATUS_DRIVE) { - gpio_clear(SWCLK_PORT, SWCLK_PIN); SWDIO_MODE_DRIVE(); } } @@ -90,15 +100,14 @@ static uint32_t swdptap_seq_in_clk_delay(const size_t clock_cycles) { uint32_t value = 0; for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { - gpio_clear(SWCLK_PORT, SWCLK_PIN); - value |= gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN) ? 1U << cycle : 0U; for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) continue; + value |= gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN) ? 1U << cycle : 0U; gpio_set(SWCLK_PORT, SWCLK_PIN); for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) continue; + gpio_clear(SWCLK_PORT, SWCLK_PIN); } - gpio_clear(SWCLK_PORT, SWCLK_PIN); return value; } @@ -108,12 +117,11 @@ static uint32_t swdptap_seq_in_no_delay(const size_t clock_cycles) { uint32_t value = 0; for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { - gpio_clear(SWCLK_PORT, SWCLK_PIN); value |= gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN) ? 1U << cycle : 0U; gpio_set(SWCLK_PORT, SWCLK_PIN); __asm__("nop"); + gpio_clear(SWCLK_PORT, SWCLK_PIN); } - gpio_clear(SWCLK_PORT, SWCLK_PIN); return value; } @@ -132,16 +140,18 @@ static bool swdptap_seq_in_parity(uint32_t *ret, size_t clock_cycles) for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) continue; - const bool parity = calculate_odd_parity(result); const bool bit = gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN); gpio_set(SWCLK_PORT, SWCLK_PIN); for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) continue; - *ret = result; + gpio_clear(SWCLK_PORT, SWCLK_PIN); /* Terminate the read cycle now */ swdptap_turnaround(SWDIO_STATUS_DRIVE); + + const bool parity = calculate_odd_parity(result); + *ret = result; return parity == bit; } @@ -150,15 +160,14 @@ static void swdptap_seq_out_clk_delay(uint32_t tms_states, size_t clock_cycles) static void swdptap_seq_out_clk_delay(const uint32_t tms_states, const size_t clock_cycles) { for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { - gpio_clear(SWCLK_PORT, SWCLK_PIN); gpio_set_val(SWDIO_PORT, SWDIO_PIN, tms_states & (1U << cycle)); for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) continue; gpio_set(SWCLK_PORT, SWCLK_PIN); for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) continue; + gpio_clear(SWCLK_PORT, SWCLK_PIN); } - gpio_clear(SWCLK_PORT, SWCLK_PIN); } static void swdptap_seq_out_no_delay(uint32_t tms_states, size_t clock_cycles) __attribute__((optimize(3))); @@ -166,11 +175,11 @@ static void swdptap_seq_out_no_delay(uint32_t tms_states, size_t clock_cycles) _ static void swdptap_seq_out_no_delay(const uint32_t tms_states, const size_t clock_cycles) { for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { - gpio_clear(SWCLK_PORT, SWCLK_PIN); gpio_set_val(SWDIO_PORT, SWDIO_PIN, tms_states & (1U << cycle)); gpio_set(SWCLK_PORT, SWCLK_PIN); + __asm__("nop"); + gpio_clear(SWCLK_PORT, SWCLK_PIN); } - gpio_clear(SWCLK_PORT, SWCLK_PIN); } static void swdptap_seq_out(const uint32_t tms_states, const size_t clock_cycles) From caf82d562347642ea49144c43b30e7723696bdc6 Mon Sep 17 00:00:00 2001 From: Taylor Yu Date: Wed, 17 Jan 2024 18:11:52 -0600 Subject: [PATCH 2/4] common/swdptap: adjust timing Make clock cycle more symmetric, especially for `no_delay` variants. `swdptap_seq_out_no_delay` seems to need the clock clear to be moved back to the top of the loop, to ensure minimal delay between the falling edge and the SWDIO change. Also use a different loop counter that compiles to a `SUBS` `BCC`/`BCS` pair, instead of using an explicit `CMP`. --- src/platforms/common/swdptap.c | 64 ++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/src/platforms/common/swdptap.c b/src/platforms/common/swdptap.c index aee7e4031a0..6005fa73641 100644 --- a/src/platforms/common/swdptap.c +++ b/src/platforms/common/swdptap.c @@ -99,15 +99,24 @@ static uint32_t swdptap_seq_in_clk_delay(size_t clock_cycles) __attribute__((opt static uint32_t swdptap_seq_in_clk_delay(const size_t clock_cycles) { uint32_t value = 0; - for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { + if (!clock_cycles) + return 0; + for (size_t cycle = clock_cycles; cycle--;) { for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) continue; - value |= gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN) ? 1U << cycle : 0U; + bool bit = gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN); gpio_set(SWCLK_PORT, SWCLK_PIN); for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) continue; + value >>= 1U; + value |= (uint32_t)bit << 31U; + /* Reordering barrier */ + __asm__("" ::: "memory"); gpio_clear(SWCLK_PORT, SWCLK_PIN); + /* Reordering barrier */ + __asm__("" ::: "memory"); } + value >>= (32U - clock_cycles); return value; } @@ -115,13 +124,24 @@ static uint32_t swdptap_seq_in_no_delay(size_t clock_cycles) __attribute__((opti static uint32_t swdptap_seq_in_no_delay(const size_t clock_cycles) { + if (!clock_cycles) + return 0; uint32_t value = 0; - for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { - value |= gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN) ? 1U << cycle : 0U; + for (size_t cycle = clock_cycles; cycle--;) { + /* Reordering barrier */ + __asm__("" ::: "memory"); + bool bit = gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN); gpio_set(SWCLK_PORT, SWCLK_PIN); - __asm__("nop"); + __asm__("nop" ::: "memory"); + value >>= 1U; + value |= (uint32_t)bit << 31U; + /* Reordering barrier */ + __asm__("" ::: "memory"); gpio_clear(SWCLK_PORT, SWCLK_PIN); + /* Reordering barrier */ + __asm__("" ::: "memory"); } + value >>= (32U - clock_cycles); return value; } @@ -159,13 +179,24 @@ static void swdptap_seq_out_clk_delay(uint32_t tms_states, size_t clock_cycles) static void swdptap_seq_out_clk_delay(const uint32_t tms_states, const size_t clock_cycles) { - for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { - gpio_set_val(SWDIO_PORT, SWDIO_PIN, tms_states & (1U << cycle)); + uint32_t value = tms_states; + bool bit = value & 1; + if (!clock_cycles) + return; + for (size_t cycle = clock_cycles; cycle--;) { + /* Reordering barrier */ + __asm__("" ::: "memory"); + gpio_set_val(SWDIO_PORT, SWDIO_PIN, bit); for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) continue; gpio_set(SWCLK_PORT, SWCLK_PIN); for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) continue; + __asm__("nop" ::: "memory"); + value >>= 1U; + bit = value & 1; + /* Reordering barrier */ + __asm__("" ::: "memory"); gpio_clear(SWCLK_PORT, SWCLK_PIN); } } @@ -174,12 +205,23 @@ static void swdptap_seq_out_no_delay(uint32_t tms_states, size_t clock_cycles) _ static void swdptap_seq_out_no_delay(const uint32_t tms_states, const size_t clock_cycles) { - for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { - gpio_set_val(SWDIO_PORT, SWDIO_PIN, tms_states & (1U << cycle)); - gpio_set(SWCLK_PORT, SWCLK_PIN); - __asm__("nop"); + uint32_t value = tms_states; + bool bit = value & 1; + if (!clock_cycles) + return; + for (size_t cycle = clock_cycles; cycle--;) { + /* Reordering barrier */ + __asm__("" ::: "memory"); gpio_clear(SWCLK_PORT, SWCLK_PIN); + gpio_set_val(SWDIO_PORT, SWDIO_PIN, bit); + gpio_set(SWCLK_PORT, SWCLK_PIN); + __asm__("nop" ::: "memory"); + value >>= 1U; + bit = value & 1; + /* Reordering barrier */ + __asm__("" ::: "memory"); } + gpio_clear(SWCLK_PORT, SWCLK_PIN); } static void swdptap_seq_out(const uint32_t tms_states, const size_t clock_cycles) From 7ea8176a8db5f50ca668669829c84a9e0a0e8570 Mon Sep 17 00:00:00 2001 From: Taylor Yu Date: Fri, 12 Jul 2024 10:55:36 -0500 Subject: [PATCH 3/4] common/swdptap: sign conversion, const fixes --- src/platforms/common/swdptap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/platforms/common/swdptap.c b/src/platforms/common/swdptap.c index 6005fa73641..afbf8d9e2d0 100644 --- a/src/platforms/common/swdptap.c +++ b/src/platforms/common/swdptap.c @@ -104,7 +104,7 @@ static uint32_t swdptap_seq_in_clk_delay(const size_t clock_cycles) for (size_t cycle = clock_cycles; cycle--;) { for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) continue; - bool bit = gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN); + const bool bit = gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN); gpio_set(SWCLK_PORT, SWCLK_PIN); for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) continue; @@ -180,7 +180,7 @@ static void swdptap_seq_out_clk_delay(uint32_t tms_states, size_t clock_cycles) static void swdptap_seq_out_clk_delay(const uint32_t tms_states, const size_t clock_cycles) { uint32_t value = tms_states; - bool bit = value & 1; + bool bit = value & 1U; if (!clock_cycles) return; for (size_t cycle = clock_cycles; cycle--;) { @@ -194,7 +194,7 @@ static void swdptap_seq_out_clk_delay(const uint32_t tms_states, const size_t cl continue; __asm__("nop" ::: "memory"); value >>= 1U; - bit = value & 1; + bit = value & 1U; /* Reordering barrier */ __asm__("" ::: "memory"); gpio_clear(SWCLK_PORT, SWCLK_PIN); @@ -206,7 +206,7 @@ static void swdptap_seq_out_no_delay(uint32_t tms_states, size_t clock_cycles) _ static void swdptap_seq_out_no_delay(const uint32_t tms_states, const size_t clock_cycles) { uint32_t value = tms_states; - bool bit = value & 1; + bool bit = value & 1U; if (!clock_cycles) return; for (size_t cycle = clock_cycles; cycle--;) { @@ -217,7 +217,7 @@ static void swdptap_seq_out_no_delay(const uint32_t tms_states, const size_t clo gpio_set(SWCLK_PORT, SWCLK_PIN); __asm__("nop" ::: "memory"); value >>= 1U; - bit = value & 1; + bit = value & 1U; /* Reordering barrier */ __asm__("" ::: "memory"); } From e74212399a555f80d9cdf6b9fff6454a90f068fa Mon Sep 17 00:00:00 2001 From: Taylor Yu Date: Fri, 12 Jul 2024 11:57:05 -0500 Subject: [PATCH 4/4] common/swdptap: explain down-count --- src/platforms/common/swdptap.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/platforms/common/swdptap.c b/src/platforms/common/swdptap.c index afbf8d9e2d0..1f171f83dd0 100644 --- a/src/platforms/common/swdptap.c +++ b/src/platforms/common/swdptap.c @@ -101,6 +101,11 @@ static uint32_t swdptap_seq_in_clk_delay(const size_t clock_cycles) uint32_t value = 0; if (!clock_cycles) return 0; + /* + * Count down instead of up, because with an up-count, some ARM-GCC + * versions use an explicit CMP, missing the optimization of converting + * to a faster down-count that uses SUBS followed by BCS/BCC. + */ for (size_t cycle = clock_cycles; cycle--;) { for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) continue; @@ -127,6 +132,11 @@ static uint32_t swdptap_seq_in_no_delay(const size_t clock_cycles) if (!clock_cycles) return 0; uint32_t value = 0; + /* + * Count down instead of up, because with an up-count, some ARM-GCC + * versions use an explicit CMP, missing the optimization of converting + * to a faster down-count that uses SUBS followed by BCS/BCC. + */ for (size_t cycle = clock_cycles; cycle--;) { /* Reordering barrier */ __asm__("" ::: "memory"); @@ -183,6 +193,11 @@ static void swdptap_seq_out_clk_delay(const uint32_t tms_states, const size_t cl bool bit = value & 1U; if (!clock_cycles) return; + /* + * Count down instead of up, because with an up-count, some ARM-GCC + * versions use an explicit CMP, missing the optimization of converting + * to a faster down-count that uses SUBS followed by BCS/BCC. + */ for (size_t cycle = clock_cycles; cycle--;) { /* Reordering barrier */ __asm__("" ::: "memory"); @@ -209,6 +224,11 @@ static void swdptap_seq_out_no_delay(const uint32_t tms_states, const size_t clo bool bit = value & 1U; if (!clock_cycles) return; + /* + * Count down instead of up, because with an up-count, some ARM-GCC + * versions use an explicit CMP, missing the optimization of converting + * to a faster down-count that uses SUBS followed by BCS/BCC. + */ for (size_t cycle = clock_cycles; cycle--;) { /* Reordering barrier */ __asm__("" ::: "memory");