Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -276,10 +276,6 @@ if(BUILD_UNIVERSAL_DDPROF)
endif()
endif()

if(USE_LOADER)
target_compile_definitions(dd_profiling-embedded PRIVATE "DDPROF_USE_LOADER")
endif()

# Fix for link error in sanitizeddebug build mode with gcc:
# ~~~
# /usr/bin/ld: ./libdd_profiling.so: undefined reference to `__dynamic_cast'
Expand Down
2 changes: 1 addition & 1 deletion cmake/dd_profiling.version
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
global: ddprof_start_profiling; ddprof_stop_profiling; ddprof_lib_state;
global: ddprof_start_profiling; ddprof_stop_profiling;
local: *;
};
12 changes: 12 additions & 0 deletions include/lib/allocation_tracker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <cstddef>
#include <functional>
#include <mutex>
#include <pthread.h>

namespace ddprof {

Expand Down Expand Up @@ -79,7 +80,18 @@ class AllocationTracker {
// can return null (does not init)
static TrackerThreadLocalState *get_tl_state();

// Initialize pthread key for TLS (idempotent, thread-safe).
// Public so that tests exercising fork() can call it directly without going
// through allocation_tracking_init() (which requires a ring buffer).
static void ensure_key_initialized();

private:
static void delete_tl_state(void *tl_state);

// POSIX does not define an invalid pthread_key_t value, but implementations
// allocate keys starting from 0, so -1 (all bits set) is a safe sentinel.
static constexpr pthread_key_t kInvalidKey = static_cast<pthread_key_t>(-1);
static std::atomic<pthread_key_t> _tl_state_key;
static constexpr unsigned k_ratio_max_elt_to_bitset_size = 16;

// NOLINTBEGIN(misc-non-private-member-variables-in-classes)
Expand Down
4 changes: 0 additions & 4 deletions include/lib/allocation_tracker_tls.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,6 @@ struct TrackerThreadLocalState {
// should not allocate because we might already
// be inside an allocation)

// Set to true by placement new in init_tl_state().
// Zero-initialized (false) in a fresh thread's TLS before init.
bool initialized{true};

// In the choice of random generators, this one is smaller
// - smaller than mt19937 (8 vs 5K)
std::minstd_rand gen{std::random_device{}()};
Expand Down
19 changes: 0 additions & 19 deletions include/lib/tls_state_storage.h

This file was deleted.

70 changes: 48 additions & 22 deletions src/lib/allocation_tracker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#include "ringbuffer_utils.hpp"
#include "savecontext.hpp"
#include "syscalls.hpp"
#include "tls_state_storage.h"
#include "tsc_clock.hpp"

#include <algorithm>
Expand All @@ -25,28 +24,17 @@
#include <cstdint>
#include <cstdlib>
#include <new>
#include <pthread.h>
#include <unistd.h>

namespace ddprof {

AllocationTracker *AllocationTracker::_instance;

static_assert(sizeof(TrackerThreadLocalState) == DDPROF_TLS_STATE_SIZE,
"Update DDPROF_TLS_STATE_SIZE in tls_state_storage.h");
static_assert(alignof(TrackerThreadLocalState) <= DDPROF_TLS_STATE_ALIGN,
"Update DDPROF_TLS_STATE_ALIGN in tls_state_storage.h");
std::atomic<pthread_key_t> AllocationTracker::_tl_state_key{
AllocationTracker::kInvalidKey};

namespace {

#ifdef DDPROF_USE_LOADER
extern "C" __attribute((tls_model(
"initial-exec"))) __thread char ddprof_lib_state[DDPROF_TLS_STATE_SIZE];
#else
__attribute((tls_model("initial-exec")))
__attribute((aligned(DDPROF_TLS_STATE_ALIGN))) __thread char
ddprof_lib_state[sizeof(TrackerThreadLocalState)];
#endif

DDPROF_NOINLINE auto sleep_and_retry_reserve(MPSCRingBufferWriter &writer,
size_t size, bool &timeout) {
constexpr std::chrono::nanoseconds k_sleep_duration =
Expand All @@ -64,19 +52,49 @@ DDPROF_NOINLINE auto sleep_and_retry_reserve(MPSCRingBufferWriter &writer,
}
} // namespace

void AllocationTracker::delete_tl_state(void *tl_state) {
delete static_cast<TrackerThreadLocalState *>(tl_state);
}

// Creates the pthread key used for per-thread tracker state and publishes it
// in _tl_state_key. Does nothing if a key is already published. When several
// threads race here, exactly one key wins the compare-exchange; the losers
// delete the key they created. If pthread_key_create() fails, the key is left
// unset and a later call may retry.
void AllocationTracker::ensure_key_initialized() {
  const bool already_published =
      _tl_state_key.load(std::memory_order_acquire) != kInvalidKey;
  if (already_published) {
    return;
  }

  pthread_key_t candidate;
  const int create_rc = pthread_key_create(&candidate, delete_tl_state);
  if (create_rc != 0) {
    return;
  }

  pthread_key_t sentinel = kInvalidKey;
  const bool published = _tl_state_key.compare_exchange_strong(
      sentinel, candidate, std::memory_order_release);
  if (published) {
    return;
  }
  // Another thread published its key first: ours is redundant, discard it.
  pthread_key_delete(candidate);
}

// Returns the calling thread's tracker state, or nullptr when either the
// pthread key has not been published yet (ensure_key_initialized() not run)
// or this thread has no value stored under the key (init_tl_state() not
// called on it). Never allocates and never initializes state.
TrackerThreadLocalState *AllocationTracker::get_tl_state() {
  const pthread_key_t key = _tl_state_key.load(std::memory_order_relaxed);
  if (key == kInvalidKey) {
    return nullptr;
  }
  return static_cast<TrackerThreadLocalState *>(pthread_getspecific(key));
}

// Allocates the calling thread's tracker state, records its tid and stack
// bounds, and stores it under the pthread key so get_tl_state() can find it.
// Returns nullptr if the key has not been published, if allocation fails, or
// if the state could not be registered with pthread_setspecific().
TrackerThreadLocalState *AllocationTracker::init_tl_state() {
  // acquire pairs with the release in ensure_key_initialized(): guarantees
  // that if we see a valid key the pthread key is fully published and we won't
  // silently return nullptr and drop the thread's initial allocations.
  const pthread_key_t key = _tl_state_key.load(std::memory_order_acquire);
  if (key == kInvalidKey) {
    return nullptr;
  }
  auto *state = new (std::nothrow) TrackerThreadLocalState{};
  if (!state) {
    return nullptr;
  }
  state->tid = ddprof::gettid();
  state->stack_bounds = retrieve_stack_bounds();
  if (pthread_setspecific(key, state) != 0) {
    // Registration failed: get_tl_state() would never return this pointer and
    // the key destructor would never free it, so release it here instead of
    // handing out an untracked state.
    delete state;
    return nullptr;
  }
  return state;
}

Expand All @@ -91,6 +109,7 @@ DDRes AllocationTracker::allocation_tracking_init(
uint64_t allocation_profiling_rate, uint32_t flags,
uint32_t stack_sample_size, const RingBufferInfo &ring_buffer,
const IntervalTimerCheck &timer_check) {
ensure_key_initialized();
TrackerThreadLocalState *tl_state = get_tl_state();
if (!tl_state) {
// This is the time at which the init_tl_state should not fail
Expand Down Expand Up @@ -180,6 +199,13 @@ void AllocationTracker::free() {

pevent_munmap_event(&_pevent);

// The pthread key (_tl_state_key) is intentionally not deleted here.
// pthread_key_delete would race with concurrent get_tl_state() calls that
// already loaded the key value but haven't called pthread_getspecific yet.
// The cost is one leaked key per dlclose/reload cycle, which is acceptable:
// POSIX guarantees at least _POSIX_THREAD_KEYS_MAX (128) keys per process
// (PTHREAD_KEYS_MAX is the implementation's actual limit), and library reload
// is not a supported use case.

// Do not destroy the object:
// there is an inherent race condition between checking
// `_state.track_allocations ` and calling `_instance->track_allocation`.
Expand Down
6 changes: 0 additions & 6 deletions src/lib/loader.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#include "constants.hpp"
#include "dd_profiling.h"
#include "lib_embedded_data.h"
#include "tls_state_storage.h"

#include <dlfcn.h>
#include <fcntl.h>
Expand All @@ -19,11 +18,6 @@
#include <time.h>
#include <unistd.h>

__attribute__((__visibility__("default")))
__attribute__((tls_model("initial-exec")))
__attribute__((aligned(DDPROF_TLS_STATE_ALIGN))) __thread char
ddprof_lib_state[DDPROF_TLS_STATE_SIZE];

/* Role of loader is to ensure that all dependencies (libdl/libm/libpthread) of
* libdd_profiling-embedded.so are satisfied before dlopen'ing it.
* On musl, all libc features are in libc.so and hence are available once libc
Expand Down
13 changes: 13 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,19 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "SanitizedDebug")
COMMAND ${CMAKE_SOURCE_DIR}/tools/check_for_unsafe_libc_functions.py
$<TARGET_FILE:dd_profiling-embedded>)
endif()

# Test that the loader works when dlopen'd with RTLD_GLOBAL (the pattern used by applications that
# load libdd_profiling.so at runtime).
add_executable(loader_rtld_global_test loader_rtld_global_test.c)
target_link_libraries(loader_rtld_global_test PRIVATE dl)
add_dependencies(loader_rtld_global_test dd_profiling-shared)
add_test(
NAME loader_rtld_global
COMMAND loader_rtld_global_test
WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
set_tests_properties(
loader_rtld_global PROPERTIES ENVIRONMENT
"TEST_DD_PROFILING_LIB=$<TARGET_FILE:dd_profiling-shared>")
endif()

if(NOT CMAKE_BUILD_TYPE STREQUAL "SanitizedDebug")
Expand Down
74 changes: 74 additions & 0 deletions test/allocation_tracker-bench.cc
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,77 @@ static void BM_LongLived_Tracking(benchmark::State &state) {
perform_memory_operations_2(true, state);
}

// Microbenchmark: raw TLS access (get_tl_state) on persistent threads.
// This isolates the cost of TLS lookup from allocation tracking overhead.
// Compare initial-exec __thread (main branch) vs pthread_getspecific (this
// branch) by building against each branch and diffing the results.
static void BM_GetTlState(benchmark::State &state) {
  LogHandle handle;
  const size_t buf_size_order = 8;
  const uint32_t flags = ddprof::AllocationTracker::kDeterministicSampling;
  ddprof::RingBufferHolder ring_buffer{buf_size_order,
                                       RingBufferType::kMPSCRingBuffer};
  ddprof::AllocationTracker::allocation_tracking_init(
      k_rate, flags, k_default_perf_stack_sample_size,
      ring_buffer.get_buffer_info(), {});

  const int nb_threads = 4;
  static constexpr int k_ops_per_iter = 10000;

  // Hand-rolled spin barrier: workers raise ready_count when parked, the
  // benchmark thread raises `go` to release them, and workers lower
  // ready_count again once their hot loop is finished.
  std::atomic<int> ready_count{0};
  std::atomic<bool> go{false};
  std::atomic<bool> done{false};
  std::atomic<int64_t> total_ops{0};

  std::vector<std::thread> workers;
  workers.reserve(nb_threads);
  for (int i = 0; i < nb_threads; ++i) {
    workers.emplace_back([&] {
      ddprof::AllocationTracker::init_tl_state();
      while (!done.load(std::memory_order_relaxed)) {
        // Wait for the benchmark loop to signal go
        ready_count.fetch_add(1, std::memory_order_release);
        while (!go.load(std::memory_order_acquire) &&
               !done.load(std::memory_order_relaxed)) {}
        if (done.load(std::memory_order_relaxed)) {
          break;
        }
        // Hot loop: pure TLS access
        int64_t ops = 0;
        for (int j = 0; j < k_ops_per_iter; ++j) {
          auto *tl = ddprof::AllocationTracker::get_tl_state();
          benchmark::DoNotOptimize(tl);
          ++ops;
        }
        total_ops.fetch_add(ops, std::memory_order_relaxed);
        ready_count.fetch_sub(1, std::memory_order_release);
        // wait for go to be lowered before looping back
        while (go.load(std::memory_order_acquire) &&
               !done.load(std::memory_order_relaxed)) {}
      }
    });
  }

  // Running total over ALL iterations. total_ops is reset every iteration, so
  // it must be folded into this accumulator before the next reset.
  int64_t items_processed = 0;
  for (auto _ : state) {
    total_ops.store(0, std::memory_order_relaxed);
    // Wait for all workers to be ready
    while (ready_count.load(std::memory_order_acquire) != nb_threads) {}
    go.store(true, std::memory_order_release);
    // Wait for all workers to finish the hot loop
    while (ready_count.load(std::memory_order_acquire) != 0) {}
    go.store(false, std::memory_order_release);
    items_processed += total_ops.load(std::memory_order_relaxed);
  }
  // Report once, with the whole-run total: reporting a single iteration's
  // count here would under-report throughput.
  state.SetItemsProcessed(items_processed);

  done.store(true, std::memory_order_release);
  go.store(true, std::memory_order_release); // unblock workers
  for (auto &t : workers) {
    t.join();
  }
  ddprof::AllocationTracker::allocation_tracking_free();
}

// short lived threads
BENCHMARK(BM_ShortLived_NoTracking)->MeasureProcessCPUTime()->UseRealTime();
BENCHMARK(BM_ShortLived_Tracking)->MeasureProcessCPUTime()->UseRealTime();
Expand All @@ -322,4 +393,7 @@ BENCHMARK(BM_ShortLived_Tracking)->MeasureProcessCPUTime()->UseRealTime();
BENCHMARK(BM_LongLived_NoTracking)->MeasureProcessCPUTime();
BENCHMARK(BM_LongLived_Tracking)->MeasureProcessCPUTime();

// raw TLS access: pthread_getspecific vs __thread initial-exec
BENCHMARK(BM_GetTlState)->MeasureProcessCPUTime()->UseRealTime();

} // namespace ddprof
19 changes: 10 additions & 9 deletions test/allocation_tracker_fork_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ void *thread_check_null_tls(void *arg) {
}

int run_child(void *parent_state) {
// After fork, the __thread buffer is inherited: the child's main thread
// sees the initialized state at the same virtual address.
// After fork, pthread thread-specific data is inherited by the child's main
// thread: get_tl_state() returns the same pointer as the parent had.
auto *child_inherited = AllocationTracker::get_tl_state();
CHECK_OR_EXIT(child_inherited == parent_state,
"expected inherited TLS %p, got %p", parent_state,
Expand Down Expand Up @@ -81,14 +81,15 @@ int main() {
const LogHandle log_handle(LL_NOTICE);
LG_NTC("allocation_tracker_fork_test starting");

// Before any init, main thread's TLS must be zero-initialized by libc,
// so get_tl_state() should return NULL (initialized == false).
// Before key initialization, get_tl_state() must return NULL.
auto *pre_init = AllocationTracker::get_tl_state();
CHECK_OR_RETURN(pre_init == nullptr,
"main thread TLS not zero-initialized before init (got %p)",
CHECK_OR_RETURN(pre_init == nullptr, "expected NULL before key init (got %p)",
static_cast<void *>(pre_init));

// Verify the same zero-initialization contract on a new thread
// Initialize the pthread key so TLS operations work.
AllocationTracker::ensure_key_initialized();

// After key init but before init_tl_state(), all threads return NULL.
{
pthread_t thread;
int thread_result = -1;
Expand All @@ -97,10 +98,10 @@ int main() {
"pthread_create failed (pre-fork thread)");
pthread_join(thread, nullptr);
CHECK_OR_RETURN(thread_result == 0,
"pre-fork thread TLS was not zero-initialized");
"pre-fork thread TLS was not NULL before init");
}

// Create TLS in parent
// Create TLS state in parent.
auto *parent_state = AllocationTracker::init_tl_state();
CHECK_OR_RETURN(parent_state != nullptr,
"parent init_tl_state() returned NULL");
Expand Down
Loading