diff --git a/CMakeLists.txt b/CMakeLists.txt index 71bd6d73..50c3a494 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -276,10 +276,6 @@ if(BUILD_UNIVERSAL_DDPROF) endif() endif() -if(USE_LOADER) - target_compile_definitions(dd_profiling-embedded PRIVATE "DDPROF_USE_LOADER") -endif() - # Fix for link error in sanitizeddebug build mode with gcc: # ~~~ # /usr/bin/ld: ./libdd_profiling.so: undefined reference to `__dynamic_cast' diff --git a/cmake/dd_profiling.version b/cmake/dd_profiling.version index 988721d2..d7e1730e 100644 --- a/cmake/dd_profiling.version +++ b/cmake/dd_profiling.version @@ -1,4 +1,4 @@ { - global: ddprof_start_profiling; ddprof_stop_profiling; ddprof_lib_state; + global: ddprof_start_profiling; ddprof_stop_profiling; local: *; }; diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp index 208dfdcb..0875c75e 100644 --- a/include/lib/allocation_tracker.hpp +++ b/include/lib/allocation_tracker.hpp @@ -16,6 +16,7 @@ #include #include #include +#include namespace ddprof { @@ -79,7 +80,18 @@ class AllocationTracker { // can return null (does not init) static TrackerThreadLocalState *get_tl_state(); + // Initialize pthread key for TLS (idempotent, thread-safe). + // Public so that tests exercising fork() can call it directly without going + // through allocation_tracking_init() (which requires a ring buffer). + static void ensure_key_initialized(); + private: + static void delete_tl_state(void *tl_state); + + // POSIX does not define an invalid pthread_key_t value, but implementations + // allocate keys starting from 0, so -1 (all bits set) is a safe sentinel. 
+ static constexpr pthread_key_t kInvalidKey = static_cast<pthread_key_t>(-1); + static std::atomic<pthread_key_t> _tl_state_key; static constexpr unsigned k_ratio_max_elt_to_bitset_size = 16; // NOLINTBEGIN(misc-non-private-member-variables-in-classes) diff --git a/include/lib/allocation_tracker_tls.hpp b/include/lib/allocation_tracker_tls.hpp index fb7773bb..25ad78ae 100644 --- a/include/lib/allocation_tracker_tls.hpp +++ b/include/lib/allocation_tracker_tls.hpp @@ -25,10 +25,6 @@ struct TrackerThreadLocalState { // should not allocate because we might already // be inside an allocation) - // Set to true by placement new in init_tl_state(). - // Zero-initialized (false) in a fresh thread's TLS before init. - bool initialized{true}; - // In the choice of random generators, this one is smaller // - smaller than mt19937 (8 vs 5K) std::minstd_rand gen{std::random_device{}()}; diff --git a/include/lib/tls_state_storage.h b/include/lib/tls_state_storage.h deleted file mode 100644 index e658f2e9..00000000 --- a/include/lib/tls_state_storage.h +++ /dev/null @@ -1,19 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. This product includes software -// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present -// Datadog, Inc. - -#pragma once - -// C-compatible constants for the TLS buffer that stores -// TrackerThreadLocalState. Used by loader.c (C) and allocation_tracker.cc -// (C++). Correctness enforced at compile time via static_assert in -// allocation_tracker.cc. 
-#ifdef __cplusplus -enum : unsigned char { -#else -enum { -#endif - DDPROF_TLS_STATE_SIZE = 48, - DDPROF_TLS_STATE_ALIGN = 8, -}; diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index 0b321224..ba5c58bc 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -15,7 +15,6 @@ #include "ringbuffer_utils.hpp" #include "savecontext.hpp" #include "syscalls.hpp" -#include "tls_state_storage.h" #include "tsc_clock.hpp" #include @@ -25,28 +24,17 @@ #include #include #include +#include #include namespace ddprof { AllocationTracker *AllocationTracker::_instance; - -static_assert(sizeof(TrackerThreadLocalState) == DDPROF_TLS_STATE_SIZE, "Update DDPROF_TLS_STATE_SIZE in tls_state_storage.h"); -static_assert(alignof(TrackerThreadLocalState) <= DDPROF_TLS_STATE_ALIGN, "Update DDPROF_TLS_STATE_ALIGN in tls_state_storage.h"); +std::atomic<pthread_key_t> AllocationTracker::_tl_state_key{ + AllocationTracker::kInvalidKey}; namespace { -#ifdef DDPROF_USE_LOADER -extern "C" __attribute((tls_model( "initial-exec"))) __thread char ddprof_lib_state[DDPROF_TLS_STATE_SIZE]; -#else -__attribute((tls_model("initial-exec"))) -__attribute((aligned(DDPROF_TLS_STATE_ALIGN))) __thread char - ddprof_lib_state[sizeof(TrackerThreadLocalState)]; -#endif - DDPROF_NOINLINE auto sleep_and_retry_reserve(MPSCRingBufferWriter &writer, size_t size, bool &timeout) { constexpr std::chrono::nanoseconds k_sleep_duration = @@ -64,19 +52,49 @@ DDPROF_NOINLINE auto sleep_and_retry_reserve(MPSCRingBufferWriter &writer, } } // namespace +void AllocationTracker::delete_tl_state(void *tl_state) { + delete static_cast<TrackerThreadLocalState *>(tl_state); +} + +void AllocationTracker::ensure_key_initialized() { + if (_tl_state_key.load(std::memory_order_acquire) != kInvalidKey) { + return; + } + pthread_key_t new_key; + if (pthread_key_create(&new_key, delete_tl_state) != 0) { + return; + } + pthread_key_t expected = kInvalidKey; + if (!_tl_state_key.compare_exchange_strong(expected, new_key, 
std::memory_order_release)) { + // Another thread beat us, discard our key + pthread_key_delete(new_key); + } +} + TrackerThreadLocalState *AllocationTracker::get_tl_state() { - // ddprof_lib_state is zero-initialized by libc for each new thread. - // After placement new (init_tl_state), initialized is set to true. - auto *state = reinterpret_cast<TrackerThreadLocalState *>(ddprof_lib_state); - return state->initialized ? state : nullptr; + const pthread_key_t key = _tl_state_key.load(std::memory_order_relaxed); + if (key == kInvalidKey) { + return nullptr; + } + return static_cast<TrackerThreadLocalState *>(pthread_getspecific(key)); } TrackerThreadLocalState *AllocationTracker::init_tl_state() { - // Placement new into TLS -- no heap allocation, no cleanup needed on thread - // exit. Safe to call after fork (TLS memory is inherited by child). - auto *state = new (ddprof_lib_state) TrackerThreadLocalState{}; + // acquire pairs with the release in ensure_key_initialized(): guarantees + // that if we see a valid key the pthread key is fully published and we won't + // silently return nullptr and drop the thread's initial allocations. + const pthread_key_t key = _tl_state_key.load(std::memory_order_acquire); + if (key == kInvalidKey) { + return nullptr; + } + auto *state = new (std::nothrow) TrackerThreadLocalState{}; + if (!state) { + return nullptr; + } state->tid = ddprof::gettid(); state->stack_bounds = retrieve_stack_bounds(); + pthread_setspecific(key, state); return state; } @@ -91,6 +109,7 @@ DDRes AllocationTracker::allocation_tracking_init( uint64_t allocation_profiling_rate, uint32_t flags, uint32_t stack_sample_size, const RingBufferInfo &ring_buffer, const IntervalTimerCheck &timer_check) { + ensure_key_initialized(); TrackerThreadLocalState *tl_state = get_tl_state(); if (!tl_state) { // This is the time at which the init_tl_state should not fail @@ -180,6 +199,13 @@ void AllocationTracker::free() { pevent_munmap_event(&_pevent); + // The pthread key (_tl_state_key) is intentionally not deleted here. 
+ // pthread_key_delete would race with concurrent get_tl_state() calls that + // already loaded the key value but haven't called pthread_getspecific yet. + // The cost is one leaked key per dlclose/reload cycle, which is acceptable: + // POSIX guarantees at least PTHREAD_KEYS_MAX (128) keys per process, and + // library reload is not a supported use case. + // Do not destroy the object: // there is an inherent race condition between checking // `_state.track_allocations ` and calling `_instance->track_allocation`. diff --git a/src/lib/loader.c b/src/lib/loader.c index ee47b197..4a6525f4 100644 --- a/src/lib/loader.c +++ b/src/lib/loader.c @@ -6,7 +6,6 @@ #include "constants.hpp" #include "dd_profiling.h" #include "lib_embedded_data.h" -#include "tls_state_storage.h" #include #include @@ -19,11 +18,6 @@ #include #include -__attribute__((__visibility__("default"))) -__attribute__((tls_model("initial-exec"))) -__attribute__((aligned(DDPROF_TLS_STATE_ALIGN))) __thread char - ddprof_lib_state[DDPROF_TLS_STATE_SIZE]; - /* Role of loader is to ensure that all dependencies (libdl/lim/libpthread) of * libdd_profiling-embedded.so are satisfied before dlopen'ing it. * On musl, all libc features are in libc.so and hence are available once libc diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b5c02dee..99826a4c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -534,6 +534,19 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "SanitizedDebug") COMMAND ${CMAKE_SOURCE_DIR}/tools/check_for_unsafe_libc_functions.py $) endif() + + # Test that the loader works when dlopen'd with RTLD_GLOBAL (the pattern used by applications that + # load libdd_profiling.so at runtime). 
+ add_executable(loader_rtld_global_test loader_rtld_global_test.c) + target_link_libraries(loader_rtld_global_test PRIVATE dl) + add_dependencies(loader_rtld_global_test dd_profiling-shared) + add_test( + NAME loader_rtld_global + COMMAND loader_rtld_global_test + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + set_tests_properties( + loader_rtld_global PROPERTIES ENVIRONMENT + "TEST_DD_PROFILING_LIB=$") endif() if(NOT CMAKE_BUILD_TYPE STREQUAL "SanitizedDebug") diff --git a/test/allocation_tracker-bench.cc b/test/allocation_tracker-bench.cc index 1bb79608..6f10017c 100644 --- a/test/allocation_tracker-bench.cc +++ b/test/allocation_tracker-bench.cc @@ -314,6 +314,77 @@ static void BM_LongLived_Tracking(benchmark::State &state) { perform_memory_operations_2(true, state); } +// Microbenchmark: raw TLS access (get_tl_state) on persistent threads. +// This isolates the cost of TLS lookup from allocation tracking overhead. +// Compare initial-exec __thread (main branch) vs pthread_getspecific (this +// branch) by building against each branch and diffing the results. 
+static void BM_GetTlState(benchmark::State &state) { + LogHandle handle; + const size_t buf_size_order = 8; + const uint32_t flags = ddprof::AllocationTracker::kDeterministicSampling; + ddprof::RingBufferHolder ring_buffer{buf_size_order, RingBufferType::kMPSCRingBuffer}; + ddprof::AllocationTracker::allocation_tracking_init( + k_rate, flags, k_default_perf_stack_sample_size, + ring_buffer.get_buffer_info(), {}); + + const int nb_threads = 4; + static constexpr int k_ops_per_iter = 10000; + + // Barrier to synchronize worker threads at the start of each iteration + std::atomic<int> ready_count{0}; + std::atomic<bool> go{false}; + std::atomic<bool> done{false}; + std::atomic<int64_t> total_ops{0}; + + std::vector<std::thread> workers; + workers.reserve(nb_threads); + for (int i = 0; i < nb_threads; ++i) { + workers.emplace_back([&] { + ddprof::AllocationTracker::init_tl_state(); + while (!done.load(std::memory_order_relaxed)) { + // Wait for the benchmark loop to signal go + ready_count.fetch_add(1, std::memory_order_release); + while (!go.load(std::memory_order_acquire) && + !done.load(std::memory_order_relaxed)) {} + if (done.load(std::memory_order_relaxed)) { + break; + } + // Hot loop: pure TLS access + int64_t ops = 0; + for (int j = 0; j < k_ops_per_iter; ++j) { + auto *tl = ddprof::AllocationTracker::get_tl_state(); + benchmark::DoNotOptimize(tl); + ++ops; + } + total_ops.fetch_add(ops, std::memory_order_relaxed); + ready_count.fetch_sub(1, std::memory_order_release); + // wait for go to be lowered before looping back + while (go.load(std::memory_order_acquire) && + !done.load(std::memory_order_relaxed)) {} + } + }); + } + + for (auto _ : state) { + total_ops.store(0, std::memory_order_relaxed); + // Wait for all workers to be ready + while (ready_count.load(std::memory_order_acquire) != nb_threads) {} + go.store(true, std::memory_order_release); + // Wait for all workers to finish the hot loop + while (ready_count.load(std::memory_order_acquire) != 0) {} + go.store(false, 
std::memory_order_release); + state.SetItemsProcessed(total_ops.load(std::memory_order_relaxed)); + } + + done.store(true, std::memory_order_release); + go.store(true, std::memory_order_release); // unblock workers + for (auto &t : workers) { + t.join(); + } + ddprof::AllocationTracker::allocation_tracking_free(); +} + // short lived threads BENCHMARK(BM_ShortLived_NoTracking)->MeasureProcessCPUTime()->UseRealTime(); BENCHMARK(BM_ShortLived_Tracking)->MeasureProcessCPUTime()->UseRealTime(); @@ -322,4 +393,7 @@ BENCHMARK(BM_ShortLived_Tracking)->MeasureProcessCPUTime()->UseRealTime(); BENCHMARK(BM_LongLived_NoTracking)->MeasureProcessCPUTime(); BENCHMARK(BM_LongLived_Tracking)->MeasureProcessCPUTime(); +// raw TLS access: pthread_getspecific vs __thread initial-exec +BENCHMARK(BM_GetTlState)->MeasureProcessCPUTime()->UseRealTime(); + } // namespace ddprof diff --git a/test/allocation_tracker_fork_test.cc b/test/allocation_tracker_fork_test.cc index 2bebdfcd..91d82253 100644 --- a/test/allocation_tracker_fork_test.cc +++ b/test/allocation_tracker_fork_test.cc @@ -45,8 +45,8 @@ void *thread_check_null_tls(void *arg) { } int run_child(void *parent_state) { - // After fork, the __thread buffer is inherited: the child's main thread - // sees the initialized state at the same virtual address. + // After fork, pthread thread-specific data is inherited by the child's main + // thread: get_tl_state() returns the same pointer as the parent had. auto *child_inherited = AllocationTracker::get_tl_state(); CHECK_OR_EXIT(child_inherited == parent_state, "expected inherited TLS %p, got %p", parent_state, @@ -81,14 +81,15 @@ int main() { const LogHandle log_handle(LL_NOTICE); LG_NTC("allocation_tracker_fork_test starting"); - // Before any init, main thread's TLS must be zero-initialized by libc, - // so get_tl_state() should return NULL (initialized == false). + // Before key initialization, get_tl_state() must return NULL. 
auto *pre_init = AllocationTracker::get_tl_state(); - CHECK_OR_RETURN(pre_init == nullptr, - "main thread TLS not zero-initialized before init (got %p)", + CHECK_OR_RETURN(pre_init == nullptr, "expected NULL before key init (got %p)", static_cast(pre_init)); - // Verify the same zero-initialization contract on a new thread + // Initialize the pthread key so TLS operations work. + AllocationTracker::ensure_key_initialized(); + + // After key init but before init_tl_state(), all threads return NULL. { pthread_t thread; int thread_result = -1; @@ -97,10 +98,10 @@ int main() { "pthread_create failed (pre-fork thread)"); pthread_join(thread, nullptr); CHECK_OR_RETURN(thread_result == 0, - "pre-fork thread TLS was not zero-initialized"); + "pre-fork thread TLS was not NULL before init"); } - // Create TLS in parent + // Create TLS state in parent. auto *parent_state = AllocationTracker::init_tl_state(); CHECK_OR_RETURN(parent_state != nullptr, "parent init_tl_state() returned NULL"); diff --git a/test/loader_rtld_global_test.c b/test/loader_rtld_global_test.c new file mode 100644 index 00000000..bd410a48 --- /dev/null +++ b/test/loader_rtld_global_test.c @@ -0,0 +1,74 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. + +// Test that libdd_profiling.so (the loader) works when loaded at runtime +// with RTLD_GLOBAL. +// +// This reproduces the pattern used by applications that dlopen the profiling +// library at startup (e.g., after reading a config flag). +// +// Historical bug: the embedded .so used initial-exec TLS referencing a symbol +// defined in the loader. 
Two failure modes: +// - glibc: RTLD_GLOBAL only takes effect after the outer dlopen returns, so +// the inner dlopen (loader -> embedded .so) saw the loader's symbols as +// local, causing "undefined symbol" failures. +// - musl: rejects initial-exec TLS cross-library relocations for dlopen'd +// libraries entirely (regardless of flags). +// Fix: the embedded .so now uses pthread_key_t for TLS, removing any +// cross-library symbol dependency between the embedded .so and the loader. + +#include <dlfcn.h> +#include <stdio.h> +#include <stdlib.h> + +typedef int (*start_fn_t)(void); +typedef void (*stop_fn_t)(int); + +int main(void) { + const char *lib_path = getenv("TEST_DD_PROFILING_LIB"); + if (!lib_path) { + lib_path = "./libdd_profiling.so"; + } + + fprintf(stderr, "loading %s with RTLD_LAZY | RTLD_GLOBAL...\n", lib_path); + + void *handle = dlopen(lib_path, RTLD_LAZY | RTLD_GLOBAL); + if (!handle) { + fprintf(stderr, "FAIL: dlopen: %s\n", dlerror()); + return 1; + } + + // Call ddprof_start_profiling. If the embedded library failed to load + // (the bug we're testing for), this returns -1. + start_fn_t start = (start_fn_t)dlsym(handle, "ddprof_start_profiling"); + if (!start) { + fprintf(stderr, "FAIL: ddprof_start_profiling not found\n"); + dlclose(handle); + return 1; + } + + int rc = start(); + // The profiler may fail to start for environment reasons (no perf events, + // etc.), but a return of -1 specifically means the embedded library was + // never loaded (the loader's start function returns -1 when its function + // pointer is NULL). + if (rc == -1) { + fprintf(stderr, + "FAIL: ddprof_start_profiling returned -1 " + "(embedded library not loaded)\n"); + dlclose(handle); + return 1; + } + + // Clean up: stop profiling + stop_fn_t stop = (stop_fn_t)dlsym(handle, "ddprof_stop_profiling"); + if (stop) { + stop(1000); + } + + fprintf(stderr, "PASS: loader constructor succeeded with RTLD_GLOBAL\n"); + dlclose(handle); + return 0; +}