Skip to content
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
07dca23
First commit; declare libdatadog ext api
gyuheon0h Oct 16, 2025
441b302
Dummy frame emitted
gyuheon0h Oct 20, 2025
a2d4dbe
Include internal structures (at least try to))
gyuheon0h Oct 21, 2025
ab81e8f
Can walk the stack, but no func names yet
gyuheon0h Oct 21, 2025
434f51f
Add safety checks when walking stack
gyuheon0h Oct 21, 2025
3bc90d8
Get C frames, and also add some AI generated sanity check tests
gyuheon0h Oct 27, 2025
c996800
Clean up tests
gyuheon0h Oct 28, 2025
e008db7
Use all safety checks
gyuheon0h Oct 28, 2025
f4c2e7b
Category specific runtime callback
gyuheon0h Oct 30, 2025
c1af1a3
Move runtime stuff to own file
gyuheon0h Nov 12, 2025
2f39282
Try to fix issues across diff ruby versions
gyuheon0h Nov 17, 2025
e8c14ee
remove early continue
gyuheon0h Nov 17, 2025
1daed6f
3.3 issues
gyuheon0h Nov 18, 2025
0b8401e
Fallback to common headers is MJIT headers fail?
gyuheon0h Nov 19, 2025
86d25f9
Move into own extension
gyuheon0h Nov 22, 2025
7e6dd2a
Move into prof extension
gyuheon0h Nov 24, 2025
060f872
Clean up
gyuheon0h Nov 24, 2025
ce79b31
Gate behind flag
gyuheon0h Nov 25, 2025
db8cb2b
Respond to code cleanliness comments; still need to fix logic/impl
gyuheon0h Nov 25, 2025
348db9e
No need to do minimal utf8 checking
gyuheon0h Nov 26, 2025
17609db
Register callback before starting CT
gyuheon0h Nov 26, 2025
aa014d0
Don't run separate script for runtime stacks spec
gyuheon0h Dec 1, 2025
7ce96e1
Remove env flag for rt stacks
gyuheon0h Dec 1, 2025
3be8580
Clean up iseq body validation logic
gyuheon0h Dec 4, 2025
6c8e8f1
Remove double frame skipping bug and unsafe dereference
gyuheon0h Dec 4, 2025
103dd46
Further clean up and small logic fixes
gyuheon0h Dec 5, 2025
01ff759
Respond to review (spec clean up, redundant steps removed, etc)
gyuheon0h Dec 10, 2025
acc5cf1
Just register on native side always, add comment on future work to mo…
gyuheon0h Dec 11, 2025
4dfcbe2
Lets reuse profiling api to traverse frames
gyuheon0h Dec 11, 2025
2e677b9
Gate for linux only, combine string validation utils, define max stac…
gyuheon0h Dec 12, 2025
fbd6ebd
Emit placeholder frames for no runtime stacks and truncation
gyuheon0h Dec 12, 2025
998cca7
Use rb_typeddata_is_kind_of and dont touch rb_thread_t
gyuheon0h Dec 12, 2025
5ab8327
Dont access directly and private ruby stuff
gyuheon0h Dec 12, 2025
c79e6fa
Reduce mincore call
gyuheon0h Dec 15, 2025
1f6b623
improve placeholder clarity
gyuheon0h Dec 15, 2025
4feafcc
Raise exception on registration failure
gyuheon0h Dec 15, 2025
27e0681
Fail w error message if profiling not supported for spec, fix truncat…
gyuheon0h Dec 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,14 @@ namespace :spec do
t.pattern = CORE_WITH_LIBDATADOG_API.join(', ')
t.rspec_opts = args.to_a.join(' ')
end.tap do |t|
Rake::Task[t.name].enhance(["compile:libdatadog_api.#{RUBY_VERSION[/\d+.\d+/]}_#{RUBY_PLATFORM}"])
# Core with libdatadog is dependent on profiling because crashtracking runtime stacks logic
# lives in the profiling extension (they share same logic accessing Ruby internals)
Rake::Task[t.name].enhance(
[
"compile:libdatadog_api.#{RUBY_VERSION[/\d+.\d+/]}_#{RUBY_PLATFORM}",
"compile:datadog_profiling_native_extension.#{RUBY_VERSION}_#{RUBY_PLATFORM}"
]
)
end
# rubocop:enable Style/MultilineBlockChain

Expand Down Expand Up @@ -326,7 +333,7 @@ namespace :spec do
rescue => e
# Compilation failed (likely unsupported Ruby version) - tests will skip gracefully
puts "Warning: libdatadog_api compilation failed: #{e.class}: #{e}"
puts "DSM tests will be skipped for this Ruby version"
puts 'DSM tests will be skipped for this Ruby version'
end

DSM_ENABLED_LIBRARIES.each do |task_name|
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
// NOTE: This file is a part of the profiling native extension even though the
// runtime stacks feature is consumed by the crashtracker. The profiling
// extension already carries all the Ruby VM private header access and build
// plumbing required to safely poke at internal structures. Sharing that setup
// avoids duplicating another native extension with the same (fragile) access
// patterns, and keeps the overall install/build surface area smaller.
#include "extconf.h"

#ifdef RUBY_MJIT_HEADER
// Pick up internal structures from the private Ruby MJIT header file
#include RUBY_MJIT_HEADER
#else
// The MJIT header was introduced on 2.6 and removed on 3.3; for other Rubies we rely on
// the datadog-ruby_core_source gem to get access to private VM headers.

// We can't do anything about warnings in VM headers, so we just use this technique to suppress them.
// See https://nelkinda.com/blog/suppress-warnings-in-gcc-and-clang/#d11e364 for details.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wattributes"
#pragma GCC diagnostic ignored "-Wpragmas"
#pragma GCC diagnostic ignored "-Wexpansion-to-defined"
#include <vm_core.h>
#pragma GCC diagnostic pop

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#include <iseq.h>
#pragma GCC diagnostic pop

#include <ruby.h>

#ifndef NO_RACTOR_HEADER_INCLUDE
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#include <ractor_core.h>
#pragma GCC diagnostic pop
#endif
#endif

#include <datadog/crashtracker.h>
#include "datadog_ruby_common.h"
#include "private_vm_api_access.h"
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>

// Typed-data descriptor of Ruby Thread objects. It is captured once in
// crashtracking_runtime_stacks_init() from the thread that loads the extension, and is later
// used by the crash-time callback to confirm that the object it got really is a Thread.
// Stays NULL when the descriptor could not be captured.
static const rb_data_type_t *crashtracker_thread_data_type = NULL;

// Forward declaration of the callback handed to the crashtracker; receives the function it
// must call once per runtime stack frame it wants to report.
static void ruby_runtime_stack_callback(
void (*emit_frame)(const ddog_crasht_RuntimeStackFrame*)
);

// Use a fixed, preallocated buffer for crash-time runtime stacks to avoid
// heap allocation in the signal/crash path.
//
// The capacity is an enum constant rather than a `static const int`: in C only the former is an
// integer constant expression, so it can size the array directly and the frame limit passed to
// the stack walker can never drift away from the real buffer size.
enum { RUNTIME_STACK_MAX_FRAMES = 512 };
static frame_info runtime_stack_buffer[RUNTIME_STACK_MAX_FRAMES];

// Raw syscall number used to invoke mincore through syscall() in is_pointer_readable().
// NOTE(review): 0x1B (27) and 0xE8 (232) match the Linux mincore numbers for x86_64 and
// aarch64; other kernels use different numbers, and SYS_MINCORE stays undefined on any other
// architecture -- confirm this file is only built where these values apply.
#if defined(__x86_64__)
# define SYS_MINCORE 0x1B
#elif defined(__aarch64__)
# define SYS_MINCORE 0xE8
#endif

// Explicit prototype for the variadic syscall() entry point used below.
long syscall(long number, ...);

// Rounds `x` down to the nearest multiple of `align`.
// `align` is expected to be a power of two (the result is `x` with the low `align - 1` bits cleared).
static inline uintptr_t align_down(uintptr_t x, uintptr_t align) {
  const uintptr_t low_bits_mask = align - 1u;
  const uintptr_t offset_within_block = x & low_bits_mask;
  return x - offset_within_block;
}

// This function is not necessarily Ruby specific. This will be moved to
// `libdatadog` in the future as a shared utility function.
//
// Returns true when every page overlapping [ptr, ptr + size) is reported as mapped by the
// mincore syscall, and false when `ptr` is NULL, `size` is 0, the range wraps around the end of
// the address space, or the kernel reports any page as unmapped (ENOMEM/EFAULT).
//
// The whole range is probed, in batches of at most PAGES_PER_PROBE pages, so the residency
// vector stays a small fixed-size stack array and nothing is allocated in the crash path.
static inline bool is_pointer_readable(const void *ptr, size_t size) {
  if (!ptr || size == 0) return false;

  const uintptr_t addr = (uintptr_t)ptr;
  const uintptr_t last_byte = addr + (size - 1u);
  // Unsigned wrap-around means the requested range runs past the top of the address space
  if (last_byte < addr) return false;

  uintptr_t page_size = (uintptr_t)sysconf(_SC_PAGESIZE);
  // fallback for weird value; 0 or not a power of two
  if (page_size == 0 || (page_size & (page_size - 1u))) {
    page_size = 4096;
  }

  const uintptr_t first_page = align_down(addr, page_size);
  const uintptr_t last_page = align_down(last_byte, page_size);

  enum { PAGES_PER_PROBE = 16 };
  // mincore writes one byte per probed page; the contents are not inspected, only the return code
  unsigned char vec[PAGES_PER_PROBE];

  uintptr_t pages_left = (last_page - first_page) / page_size + 1u;
  uintptr_t cursor = first_page;

  // Retry budget for EAGAIN, shared across all batches so the loop is always bounded
  int retries = 5;

  while (pages_left > 0) {
    const uintptr_t batch = pages_left < PAGES_PER_PROBE ? pages_left : (uintptr_t)PAGES_PER_PROBE;
    long rc = syscall(SYS_MINCORE, (void*)cursor, (size_t)(batch * page_size), vec);

    if (rc != 0) {
      int e = errno;
      if (e == ENOMEM || e == EFAULT) {
        return false;
      }

      if (e == EAGAIN && retries-- > 0) {
        continue;
      }

      // Unknown errno, we assume mapped to avoid cascading faults in crash path
      return true;
    }

    cursor += batch * page_size;
    pages_left -= batch;
  }

  return true;
}

// Wraps a NUL-terminated C string in a ddog_CharSlice without copying it.
// A NULL input yields the empty slice (NULL pointer, zero length).
static inline ddog_CharSlice char_slice_from_cstr(const char *cstr) {
  const size_t length = (cstr != NULL) ? strlen(cstr) : 0;
  return (ddog_CharSlice){.ptr = cstr, .len = length};
}

// Heuristically validate a Ruby string VALUE before dereferencing it from crash context:
// 1) ensure it is actually a String heap object (no immediates/symbols)
// 2) confirm both the common header (RBasic) and the string payload (RString) live in
//    readable pages to avoid faulting while inspecting potentially corrupted memory
// 3) enforce upper bound for lengths we expect to emit (file names, function names)
//
// Returns true only when all three checks pass. The checks are ordered so that no field of the
// object is read before the memory holding that field has been probed.
static bool is_reasonable_string_size(VALUE str) {
  if (str == Qnil) return false;

  // Immediates (fixnums, static symbols, true/false, ...) are tagged values, not pointers:
  // they are never strings and must not be handed to the page probe as an address.
  if (RB_SPECIAL_CONST_P(str)) return false;

  // For a non-immediate VALUE the value itself is the object's address, so casting to void* is
  // equivalent to RBASIC(str); verify the object header is readable before touching its flags.
  if (!is_pointer_readable((const void *)str, sizeof(struct RBasic))) return false;

  // Reads the type bits from the (now known readable) header
  if (!RB_TYPE_P(str, T_STRING)) return false;

  // For strings, we need to check the full RString structure
  if (!is_pointer_readable((const void *)str, sizeof(struct RString))) return false;

  long len = RSTRING_LEN(str);

  if (len < 0) return false; // Negative length, probably corrupted
  if (len > 1024) return false; // > 1KB path/function name, sus

  return true;
}

// Returns a C string for a Ruby String VALUE that is safe to read from crash context, or a
// static placeholder describing why the VALUE could not be used:
//   "<nil>"         - the VALUE is nil
//   "<not_string>"  - the VALUE is an immediate or a heap object of another type
//   "<corrupted>"   - the object header/body is unreadable or the length is implausible
//   "<null>"        - the string's data pointer is NULL
//   "<unreadable>"  - the string's bytes are not in readable memory
// The returned pointer is never NULL.
static const char* safe_string_ptr(VALUE str) {
  if (str == Qnil) return "<nil>";

  // Immediates carry their type in the VALUE itself; they are not pointers and not strings
  if (RB_SPECIAL_CONST_P(str)) return "<not_string>";

  // RB_TYPE_P reads the object header of heap objects, so make sure the header is readable
  // before asking for the type.
  if (!is_pointer_readable((const void *)str, sizeof(struct RBasic))) return "<corrupted>";
  if (!RB_TYPE_P(str, T_STRING)) return "<not_string>";

  // Validate the VALUE first before touching any of its internals
  if (!is_reasonable_string_size(str)) return "<corrupted>";

  long len = RSTRING_LEN(str);
  const char *ptr = RSTRING_PTR(str);

  if (!ptr) return "<null>";

  // Callers measure the result with strlen(), which reads up to and including the terminator,
  // so probe len + 1 bytes. `len` was bounded to [0, 1024] above, so the addition cannot overflow.
  if (!is_pointer_readable(ptr, (size_t)len + 1u)) return "<unreadable>";

  return ptr;
}

// Collect the crashing thread's frames via ddtrace_rb_profile_frames into a static buffer, then emit
// them newest-first. If corruption is detected, emit placeholder names so the crash report still
// completes. We lean on the Ruby VM helpers we already use for profiling and rely on crashtracker's
// safety nets so a failure here should not impact customers.
//
// `emit_frame` is supplied by the crashtracker and is called once per frame; the frame struct and
// the strings it points to are only guaranteed to be valid for the duration of that call.
// Nothing is emitted when the thread cannot be validated or no frames were captured.
static void ruby_runtime_stack_callback(
  void (*emit_frame)(const ddog_crasht_RuntimeStackFrame*)
) {
  // Without the Thread data type captured at init time the thread object cannot be validated
  if (crashtracker_thread_data_type == NULL) return;

  // Grab the Ruby thread we crashed on; crashtracker only runs once.
  VALUE current_thread = rb_thread_current();
  if (current_thread == Qnil) return;

  // rb_check_typeddata raises TypeError on a mismatch, and raising a Ruby exception from the
  // crash path is not acceptable. rb_typeddata_is_kind_of performs the same check but only
  // reports the result, so once it passes the rb_check_typeddata call below cannot raise.
  if (!rb_typeddata_is_kind_of(current_thread, crashtracker_thread_data_type)) return;

  rb_thread_t *th = (rb_thread_t *) rb_check_typeddata(current_thread, crashtracker_thread_data_type);
  if (!th || !is_pointer_readable(th, sizeof(*th))) return;

  // Use the profiling helper to gather frames into our static buffer.
  int frame_count = ddtrace_rb_profile_frames(
    current_thread,
    0,
    RUNTIME_STACK_MAX_FRAMES,
    runtime_stack_buffer
  );

  if (frame_count <= 0) return;

  // Walk the buffer from its last entry back to the first
  for (int i = frame_count - 1; i >= 0; i--) {
    frame_info *info = &runtime_stack_buffer[i];

    if (info->is_ruby_frame) {
      const rb_iseq_t *iseq = (const rb_iseq_t *)info->as.ruby_frame.iseq;
      const char *function_name = "<unknown>";
      const char *file_name = "<unknown>";

      // Only ask the VM about the iseq when the structure itself is in readable memory;
      // otherwise the "<unknown>" placeholders are reported.
      if (iseq && is_pointer_readable(iseq, sizeof(rb_iseq_t))) {
        VALUE name = rb_iseq_base_label(iseq);
        if (name != Qnil) {
          function_name = safe_string_ptr(name);
        }

        VALUE filename = rb_iseq_path(iseq);
        if (filename != Qnil) {
          file_name = safe_string_ptr(filename);
        }
      }

      ddog_crasht_RuntimeStackFrame frame = {
        .type_name = char_slice_from_cstr(NULL),
        .function = char_slice_from_cstr(function_name),
        .file = char_slice_from_cstr(file_name),
        .line = info->as.ruby_frame.line,
        .column = 0
      };

      emit_frame(&frame);
    } else {
      // Native (C) frames have no iseq: report the method name when it can be resolved and
      // generic placeholders for everything else.
      const char *function_name = "<C method>";
      const char *file_name = "<C extension>";

      if (info->as.native_frame.method_id) {
        const char *method_name = rb_id2name(info->as.native_frame.method_id);
        // is_pointer_readable rejects NULL, so an unresolvable ID keeps the placeholder.
        // Names that are empty or not terminated within 256 bytes are rejected as well.
        if (is_pointer_readable(method_name, 256)) {
          size_t method_name_len = strnlen(method_name, 256);
          if (method_name_len > 0 && method_name_len < 256) {
            function_name = method_name;
          }
        }
      }

      ddog_crasht_RuntimeStackFrame frame = {
        .type_name = char_slice_from_cstr(NULL),
        .function = char_slice_from_cstr(function_name),
        .file = char_slice_from_cstr(file_name),
        .line = 0,
        .column = 0
      };

      emit_frame(&frame);
    }
  }
}

// Called while the profiling native extension is being initialized.
// Remembers the typed-data descriptor of Thread objects (taken from the current thread) and then
// registers the runtime stack callback with the crashtracker. If the descriptor cannot be
// obtained the function returns early and the callback is NOT registered.
void crashtracking_runtime_stacks_init(void) {
  const bool descriptor_missing = (crashtracker_thread_data_type == NULL);

  if (descriptor_missing) {
    VALUE thread = rb_thread_current();
    if (thread == Qnil) return;

    // Stays NULL (and we bail out) when the thread object exposes no descriptor
    crashtracker_thread_data_type = RTYPEDDATA_TYPE(thread);
    if (crashtracker_thread_data_type == NULL) return;
  }

  // Register immediately so Ruby doesn't need to manage this explicitly.
  ddog_crasht_register_runtime_frame_callback(ruby_runtime_stack_callback);
}

2 changes: 2 additions & 0 deletions ext/datadog_profiling_native_extension/profiling.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ void collectors_thread_context_init(VALUE profiling_module);
void encoded_profile_init(VALUE profiling_module);
void http_transport_init(VALUE profiling_module);
void stack_recorder_init(VALUE profiling_module);
void crashtracking_runtime_stacks_init(void);

static VALUE native_working_p(VALUE self);
static VALUE _native_grab_gvl_and_raise(DDTRACE_UNUSED VALUE _self, VALUE exception_class, VALUE test_message, VALUE test_message_arg, VALUE release_gvl);
Expand Down Expand Up @@ -65,6 +66,7 @@ void DDTRACE_EXPORT Init_datadog_profiling_native_extension(void) {
encoded_profile_init(profiling_module);
http_transport_init(profiling_module);
stack_recorder_init(profiling_module);
crashtracking_runtime_stacks_init();
unsafe_api_calls_check_init();

// Hosts methods used for testing the native code using RSpec
Expand Down
68 changes: 64 additions & 4 deletions spec/datadog/core/crashtracking/component_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -198,25 +198,24 @@
let(:parsed_request) { JSON.parse(request.body, symbolize_names: true) }
let(:crash_report) { parsed_request.fetch(:payload).first }
let(:crash_report_message) { JSON.parse(crash_report.fetch(:message), symbolize_names: true) }
let(:crash_report_experimental) { crash_report_message.fetch(:experimental) }
let(:stack_trace) { crash_report_message.fetch(:error).fetch(:stack).fetch(:frames) }

# NOTE: If any of these tests seem flaky, the `upload_timeout_seconds` may need to be raised (or otherwise
# we need to tweak libdatadog to not need such high timeouts).

[
[:fiddle, "rb_fiddle_free", proc { Fiddle.free(42) }],
[:signal, "rb_f_kill", proc { Process.kill("SEGV", Process.pid) }],
[:fiddle, 'rb_fiddle_free', proc { Fiddle.free(42) }],
[:signal, 'rb_f_kill', proc { Process.kill('SEGV', Process.pid) }],
].each do |trigger_name, function, trigger|
it "reports crashes via http when app crashes with #{trigger_name}" do
expect_in_fork(fork_expectations: fork_expectations, timeout_seconds: 15) do
crash_tracker = build_crashtracker(agent_base_url: agent_base_url)
crash_tracker.start
trigger.call
end

expect(stack_trace).to match(array_including(hash_including(function: function)))
expect(stack_trace.size).to be > 10

expect(crash_report[:tags]).to include('si_signo:11', 'si_signo_human_readable:SIGSEGV')

expect(crash_report_message[:metadata]).to include(
Expand Down Expand Up @@ -318,6 +317,67 @@
end
end
end

describe 'Ruby and C method runtime stack capture' do
  let(:runtime_stack) { crash_report_experimental[:runtime_stack] }

  # Crashes a forked process from inside a stack that interleaves Ruby methods/blocks with
  # C-implemented methods, then checks that the reported runtime stack names both kinds of frames.
  it 'captures both Ruby and C method frames in mixed stacks' do
    expect_in_fork(fork_expectations: fork_expectations, timeout_seconds: 15) do
      crash_stack_helper_class = Class.new do
        def top_level_ruby_method
          ruby_method_with_c_calls
        end

        def ruby_method_with_c_calls
          'hello world'.gsub('world') do |_match|
            {a: 1, b: 2}.each do |_key, _value|
              Fiddle.free(42)
            end
          end
        end
      end

      crash_tracker = build_crashtracker(agent_base_url: agent_base_url)
      crash_tracker.start

      crash_stack_helper_class.new.top_level_ruby_method
    end

    frames = runtime_stack[:frames]

    # 'free' is the crashing call (Fiddle.free); the other names are the Ruby methods and the
    # C-implemented methods (gsub, each) that were on the stack when the crash happened.
    %w[free ruby_method_with_c_calls top_level_ruby_method gsub each].each do |function|
      expect(frames).to include(hash_including(function: function))
    end
  end
end
end
end

Expand Down
Loading