diff --git a/src/rp2_common/CMakeLists.txt b/src/rp2_common/CMakeLists.txt index 0d0f9b93d..a3de4f86b 100644 --- a/src/rp2_common/CMakeLists.txt +++ b/src/rp2_common/CMakeLists.txt @@ -49,6 +49,7 @@ if (NOT PICO_BARE_METAL) pico_add_subdirectory(pico_mem_ops) pico_add_subdirectory(pico_malloc) pico_add_subdirectory(pico_printf) + pico_add_subdirectory(pico_tls) pico_add_subdirectory(pico_stdio) pico_add_subdirectory(pico_stdio_semihosting) diff --git a/src/rp2_common/pico_standard_link/memmap_blocked_ram.ld b/src/rp2_common/pico_standard_link/memmap_blocked_ram.ld index 5b0afe65b..e35255195 100644 --- a/src/rp2_common/pico_standard_link/memmap_blocked_ram.ld +++ b/src/rp2_common/pico_standard_link/memmap_blocked_ram.ld @@ -134,8 +134,14 @@ SECTIONS *(.text*) . = ALIGN(4); *(.rodata*) + . = ALIGN(4); + /* emutls objects */ + PROVIDE_HIDDEN (__emutls_array_start = .); + *(.*.__emutls_v.*) + PROVIDE_HIDDEN (__emutls_array_end = .); + . = ALIGN(4); *(.data*) . = ALIGN(4); diff --git a/src/rp2_common/pico_standard_link/memmap_copy_to_ram.ld b/src/rp2_common/pico_standard_link/memmap_copy_to_ram.ld index 90975b593..e565f8199 100644 --- a/src/rp2_common/pico_standard_link/memmap_copy_to_ram.ld +++ b/src/rp2_common/pico_standard_link/memmap_copy_to_ram.ld @@ -133,8 +133,14 @@ SECTIONS . = ALIGN(4); *(.rodata*) + . = ALIGN(4); + /* emutls objects */ + PROVIDE_HIDDEN (__emutls_array_start = .); + *(.*.__emutls_v.*) + PROVIDE_HIDDEN (__emutls_array_end = .); + . = ALIGN(4); *(.data*) . = ALIGN(4); diff --git a/src/rp2_common/pico_standard_link/memmap_default.ld b/src/rp2_common/pico_standard_link/memmap_default.ld index 07d5812db..dc8582770 100644 --- a/src/rp2_common/pico_standard_link/memmap_default.ld +++ b/src/rp2_common/pico_standard_link/memmap_default.ld @@ -134,8 +134,14 @@ SECTIONS *(.text*) . = ALIGN(4); *(.rodata*) + . = ALIGN(4); + /* emutls objects */ + PROVIDE_HIDDEN (__emutls_array_start = .); + *(.*.__emutls_v.*) + PROVIDE_HIDDEN (__emutls_array_end = .); + . 
= ALIGN(4); *(.data*) . = ALIGN(4); diff --git a/src/rp2_common/pico_standard_link/memmap_no_flash.ld b/src/rp2_common/pico_standard_link/memmap_no_flash.ld index 7a5977fa5..6f34c1ac4 100644 --- a/src/rp2_common/pico_standard_link/memmap_no_flash.ld +++ b/src/rp2_common/pico_standard_link/memmap_no_flash.ld @@ -104,6 +104,15 @@ SECTIONS __etext = .; __data_start__ = .; *(vtable) + + + . = ALIGN(4); + /* emutls objects */ + PROVIDE_HIDDEN (__emutls_array_start = .); + *(.*.__emutls_v.*) + PROVIDE_HIDDEN (__emutls_array_end = .); + + . = ALIGN(4); *(.data*) . = ALIGN(4); diff --git a/src/rp2_common/pico_tls/CMakeLists.txt b/src/rp2_common/pico_tls/CMakeLists.txt new file mode 100644 index 000000000..30537466c --- /dev/null +++ b/src/rp2_common/pico_tls/CMakeLists.txt @@ -0,0 +1,38 @@ +if (NOT TARGET pico_tls) + # library to be depended on - we make this depend on particular implementations using per target generator expressions + pico_add_impl_library(pico_tls) + + # no custom implementation; falls thru to compiler + pico_add_impl_library(pico_tls_compiler) + + # add alias "default" which is just core_thread. + add_library(pico_tls_default INTERFACE) + target_link_libraries(pico_tls_default INTERFACE pico_tls_core_thread) + + set(PICO_DEFAULT_TLS_IMPL pico_tls_default) + + target_link_libraries(pico_tls INTERFACE + $<IF:$<BOOL:$<TARGET_PROPERTY:PICO_TARGET_TLS_IMPL>>,$<TARGET_PROPERTY:PICO_TARGET_TLS_IMPL>,${PICO_DEFAULT_TLS_IMPL}>) + + add_library(pico_tls_core_thread_explicit INTERFACE) + target_sources(pico_tls_core_thread_explicit INTERFACE + ${CMAKE_CURRENT_LIST_DIR}/tls.c + ${CMAKE_CURRENT_LIST_DIR}/tls.S + ) + + pico_add_impl_library(pico_tls_core_thread) + + target_link_libraries(pico_tls_core_thread INTERFACE pico_tls_core_thread_explicit) + + pico_wrap_function(pico_tls_core_thread __emutls_get_address) + + # Call this to substitute an alternate implementation, e.g. if using an RTOS. 
+    macro(pico_set_tls_implementation TARGET IMPL)
+        get_target_property(target_type ${TARGET} TYPE)
+        if ("EXECUTABLE" STREQUAL "${target_type}")
+            set_target_properties(${TARGET} PROPERTIES PICO_TARGET_TLS_IMPL "pico_tls_${IMPL}")
+        else()
+            message(FATAL_ERROR "tls implementation must be set on executable not library")
+        endif()
+    endmacro()
+endif()
diff --git a/src/rp2_common/pico_tls/tls.S b/src/rp2_common/pico_tls/tls.S
new file mode 100644
index 000000000..85a689aa0
--- /dev/null
+++ b/src/rp2_common/pico_tls/tls.S
@@ -0,0 +1,10 @@
+/*
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include "pico/asm_helper.S"
+
+// This has to happen after memcpy() and memset() are available.
+__pre_init tls_init, 10000
diff --git a/src/rp2_common/pico_tls/tls.c b/src/rp2_common/pico_tls/tls.c
new file mode 100644
index 000000000..3ac7d9cff
--- /dev/null
+++ b/src/rp2_common/pico_tls/tls.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include "pico.h"
+
+// NOTE(review): the four system header names were stripped when this patch was extracted. The list below is
+// reconstructed from what the code calls (assert, memalign, memcpy/memset) - confirm against the original patch.
+#include <assert.h>
+#include <malloc.h>
+#include <stdlib.h>
+#include <string.h>
+
+// From emutls.c:
+// 'For every TLS variable xyz, there is one __emutls_control variable named __emutls_v.xyz. If xyz has
+// non-zero initial value, __emutls_v.xyz's "value" will point to __emutls_t.xyz, which has the initial value.'
+//
+// The linker script groups all the __emutls_v.xyz variables into a single array and provides symbols
+// __emutls_array_start and __emutls_array_end, which can be used to iterate over the array. This allows
+// the storage for each core's thread local variables to be pre-allocated and pre-initialized, which leaves
+// minimal work for __wrap___emutls_get_address.
+//
+// This array is available to other TLS implementations too, such as a TLS implementation for an RTOS.
+
+// Same layout as libgcc __emutls_object. Unfortunately, __emutls_object doesn't appear in any header files.
+//
+// The union is used in two phases. While tls_init runs, u.s.offset records the variable's offset within each
+// per-core storage block and u.s.template points to its initial value (NULL means zero-filled). The last pass
+// of tls_init then overwrites the union with u.lookup: one pointer per core to that core's copy of the variable.
+typedef struct {
+    uint size;   // size of the variable in bytes
+    uint align;  // alignment of the variable; tls_init asserts it is a non-zero power of two
+    union {
+        struct {
+            uint offset;
+            void *template;
+        } s;
+        void* lookup[NUM_CORES];
+    } u;
+} tls_object;
+
+extern tls_object __emutls_array_start;
+extern tls_object __emutls_array_end;
+
+void tls_init(void);
+void* __wrap___emutls_get_address(tls_object*);
+
+// Lays out, allocates and initializes one block of thread local storage per core, then converts every
+// tls_object in the linker-provided array into a per-core lookup table.
+//
+// Must be called after it is safe to call memcpy & memset. Panics if the storage cannot be allocated.
+void tls_init(void) {
+    // Three passes:
+    // 1) Calculate the offset of each thread local variable and the total storage to be allocated for each thread.
+    uint total_size = 0;
+    uint max_align = 1;
+    for (tls_object* object = &__emutls_array_start; object < &__emutls_array_end; ++object) {
+        // A zero alignment would pass the power-of-two test but collapse every offset to zero below.
+        assert(object->align != 0 && (object->align & (object->align - 1)) == 0);
+
+        if (object->align > max_align) {
+            max_align = object->align;
+        }
+
+        total_size = (total_size + object->align - 1) & ~(object->align - 1);
+        object->u.s.offset = total_size;
+        total_size += object->size;
+    }
+
+    if (total_size == 0) {
+        return;
+    }
+
+    // 2) Allocate storage for each thread and initialize the thread local variables to their initial value.
+    char* stores[NUM_CORES];
+    for (uint i = 0; i < NUM_CORES; ++i) {
+        // TODO: tls_init is invoked before pico_malloc's auto-initialized mutex has been initialized.
+        // However, aligned_alloc and memalign are not wrapped by pico_malloc so don't acquire or release
+        // the mutex so this works, though not for a satisfying reason.
+        //
+        // What I would like to do here, since malloc and friends ought not to be called at this point in
+        // initialization, is decrement the heap limit by the TLS storage size. At time of writing, the
+        // heap limit is &__StackLimit, i.e. static. It could be dynamic though.
+        char* storage = (char*) memalign(max_align, total_size);
+        if (!storage) {
+            // Carrying on would memcpy through NULL and publish bogus pointers in the lookup tables.
+            panic("tls_init: out of memory");
+        }
+        stores[i] = storage;
+
+        for (tls_object* object = &__emutls_array_start; object < &__emutls_array_end; ++object) {
+            if (object->u.s.template) {
+                memcpy(storage + object->u.s.offset, object->u.s.template, object->size);
+            } else {
+                memset(storage + object->u.s.offset, 0, object->size);
+            }
+        }
+    }
+
+    // 3) Repurpose the tls_objects so each contains a lookup table mapping from core index to pointer to
+    // thread local variable.
+    for (tls_object* object = &__emutls_array_start; object < &__emutls_array_end; ++object) {
+        // Read the offset first: writing u.lookup overwrites u.s, which shares the same storage.
+        uint object_offset = object->u.s.offset;
+        for (uint i = 0; i < NUM_CORES; ++i) {
+            object->u.lookup[i] = stores[i] + object_offset;
+        }
+    }
+}
+
+// Wrapped replacement for __emutls_get_address (see pico_wrap_function in this library's CMakeLists.txt).
+// Returns the calling core's copy of the thread local variable described by `object`.
+void* __wrap___emutls_get_address(tls_object* object) {
+    // TLS storage is not allocated for exceptions.
+    assert(!__get_current_exception());
+    return object->u.lookup[get_core_num()];
+}
diff --git a/test/kitchen_sink/CMakeLists.txt b/test/kitchen_sink/CMakeLists.txt
index a2019919c..fb25a264d 100644
--- a/test/kitchen_sink/CMakeLists.txt
+++ b/test/kitchen_sink/CMakeLists.txt
@@ -37,6 +37,7 @@ target_link_libraries(kitchen_sink_libs INTERFACE
     pico_platform
     pico_stdlib
     pico_sync
+    pico_tls
     pico_time
     pico_unique_id
     pico_util
diff --git a/test/kitchen_sink/kitchen_sink.c b/test/kitchen_sink/kitchen_sink.c
index a49654d1b..16fc5c26c 100644
--- a/test/kitchen_sink/kitchen_sink.c
+++ b/test/kitchen_sink/kitchen_sink.c
@@ -116,6 +116,15 @@ __force_inline int something_inlined(int x) {
 auto_init_mutex(mutex);
 auto_init_recursive_mutex(recursive_mutex);
 
+#ifndef __cplusplus
+#define thread_local __thread
+#endif
+
+thread_local int initialized_tls_var = 7;
+thread_local int __attribute__((section("other_section"))) other_section_tls_var = 7;
+thread_local int uninitialized_tls_var;
+thread_local int garbage_collected_tls_var;
+
 int main(void) {
     spiggle();
 
@@ -129,6 +138,13 @@ int main(void) {
     hard_assert(!mutex_try_enter(&mutex, NULL));
hard_assert(recursive_mutex_try_enter(&recursive_mutex, NULL)); hard_assert(recursive_mutex_try_enter(&recursive_mutex, NULL)); + + hard_assert(initialized_tls_var == 7); + hard_assert(other_section_tls_var == 7); + hard_assert(uninitialized_tls_var == 0); + initialized_tls_var = 8; + hard_assert(initialized_tls_var == 8); + // this should compile as we are Cortex M0+ __asm volatile("SVC #3");