diff --git a/Rakefile b/Rakefile index 3f549150c05..2e830227c50 100644 --- a/Rakefile +++ b/Rakefile @@ -230,7 +230,14 @@ namespace :spec do t.pattern = CORE_WITH_LIBDATADOG_API.join(', ') t.rspec_opts = args.to_a.join(' ') end.tap do |t| - Rake::Task[t.name].enhance(["compile:libdatadog_api.#{RUBY_VERSION[/\d+.\d+/]}_#{RUBY_PLATFORM}"]) + # Core with libdatadog is dependent on profiling because crashtracking runtime stacks logic + # lives in the profiling extension (they share same logic accessing Ruby internals) + Rake::Task[t.name].enhance( + [ + "compile:libdatadog_api.#{RUBY_VERSION[/\d+.\d+/]}_#{RUBY_PLATFORM}", + "compile:datadog_profiling_native_extension.#{RUBY_VERSION}_#{RUBY_PLATFORM}" + ] + ) end # rubocop:enable Style/MultilineBlockChain @@ -326,7 +333,7 @@ namespace :spec do rescue => e # Compilation failed (likely unsupported Ruby version) - tests will skip gracefully puts "Warning: libdatadog_api compilation failed: #{e.class}: #{e}" - puts "DSM tests will be skipped for this Ruby version" + puts 'DSM tests will be skipped for this Ruby version' end DSM_ENABLED_LIBRARIES.each do |task_name| diff --git a/ext/datadog_profiling_native_extension/crashtracking_runtime_stacks.c b/ext/datadog_profiling_native_extension/crashtracking_runtime_stacks.c new file mode 100644 index 00000000000..220fe6df6c7 --- /dev/null +++ b/ext/datadog_profiling_native_extension/crashtracking_runtime_stacks.c @@ -0,0 +1,239 @@ +// NOTE: This file is a part of the profiling native extension even though the +// runtime stacks feature is consumed by the crashtracker. The profiling +// extension already carries all the Ruby VM private header access and build +// plumbing required to safely poke at internal structures. Sharing that setup +// avoids duplicating another native extension with the same (fragile) access +// patterns, and keeps the overall install/build surface area smaller. 
+// +// This also means that this functionality depends on the profiling native extension +// being available and built +#include "extconf.h" + +#if defined(__linux__) + +#include +#include "datadog_ruby_common.h" +#include "private_vm_api_access.h" +#include +#include +#include +#include + +static const rb_data_type_t *crashtracker_thread_data_type = NULL; + +static void ruby_runtime_stack_callback( + void (*emit_frame)(const ddog_crasht_RuntimeStackFrame*) +); + +// Use a fixed, preallocated buffer for crash-time runtime stacks to avoid +// heap allocation in the signal/crash path. +#define RUNTIME_STACK_MAX_FRAMES 512 +static frame_info runtime_stack_buffer[RUNTIME_STACK_MAX_FRAMES]; + +#if defined(__x86_64__) +# define SYS_MINCORE 0x1B +#elif defined(__aarch64__) +# define SYS_MINCORE 0xE8 +#endif + +long syscall(long number, ...); + +// Round x down to a multiple of align; align must be a power of two +static inline uintptr_t align_down(uintptr_t x, uintptr_t align) { + return x & ~(align - 1u); +} + +static uintptr_t cached_page_size = 0; + +// TODO: This function is not necessarily Ruby specific. This will be moved to +// `libdatadog` in the future as a shared utility function. 
+static inline bool is_pointer_readable(const void *ptr, size_t size) { + if (!ptr || size == 0) return false; + + uintptr_t page_size = cached_page_size; + + const uintptr_t start = align_down((uintptr_t)ptr, page_size); + const uintptr_t end = ((uintptr_t)ptr + size - 1u); + const uintptr_t last = align_down(end, page_size); + + // Pages to probe: 1, or 2 when the first and last byte fall on different pages (vec holds 2 entries) + size_t pages = 1u + (last != start); + if (pages > 2u) pages = 2u; + + unsigned char vec[2]; + + int retries = 5; + for (;;) { + size_t len = pages * (size_t)page_size; + long rc = syscall(SYS_MINCORE, (void*)start, len, vec); + + if (rc == 0) { + return true; + } + + int e = errno; + if (e == ENOMEM || e == EFAULT) { + return false; + } + + if (e == EAGAIN && retries-- > 0) { + continue; + } + + // Unknown errno, we assume mapped to avoid cascading faults in crash path + return true; + } +} + +static inline ddog_CharSlice char_slice_from_cstr(const char *cstr) { + if (cstr == NULL) { + return (ddog_CharSlice){.ptr = NULL, .len = 0}; + } + return (ddog_CharSlice){.ptr = cstr, .len = strlen(cstr)}; +} + +static ddog_CharSlice safe_string_value(VALUE str) { + if (str == Qnil) return DDOG_CHARSLICE_C(""); + + // Validate that the RString struct itself is readable before inspecting its type and length + if (!is_pointer_readable((const void *)str, sizeof(struct RString))) return DDOG_CHARSLICE_C(""); + if (!RB_TYPE_P(str, T_STRING)) return DDOG_CHARSLICE_C(""); + + long len = RSTRING_LEN(str); + if (len < 0 || len > 1024) return DDOG_CHARSLICE_C(""); + + const char *ptr = RSTRING_PTR(str); + if (!ptr) return DDOG_CHARSLICE_C(""); + + if (!is_pointer_readable(ptr, len > 0 ? 
len : 1)) return DDOG_CHARSLICE_C(""); + + return (ddog_CharSlice){.ptr = ptr, .len = (size_t)len}; +} + +static void emit_placeholder_frame( + void (*emit_frame)(const ddog_crasht_RuntimeStackFrame*), + const char *label +) { + ddog_crasht_RuntimeStackFrame frame = { + .type_name = DDOG_CHARSLICE_C(""), + .function = char_slice_from_cstr(label), + .file = DDOG_CHARSLICE_C(""), + .line = 0, + .column = 0 + }; + emit_frame(&frame); +} + +// Collect the crashing thread's frames via ddtrace_rb_profile_frames into a static buffer, then emit +// them newest-first. If corruption is detected, emit placeholder frames so the crash report still +// completes. We lean on the Ruby VM helpers we already use for profiling and rely on crashtracker's +// safety nets so a failure here should not impact customers. +static void ruby_runtime_stack_callback( + void (*emit_frame)(const ddog_crasht_RuntimeStackFrame*) +) { + // Grab the Ruby thread we crashed on; crashtracker only runs once. + VALUE current_thread = rb_thread_current(); + if (current_thread == Qnil) return; + + if (crashtracker_thread_data_type == NULL) return; + + if (!rb_typeddata_is_kind_of(current_thread, crashtracker_thread_data_type)) { + emit_placeholder_frame(emit_frame, ""); + return; + } + + // Use the profiling helper to gather frames into our static buffer. 
+ int frame_count = ddtrace_rb_profile_frames( + current_thread, + 0, + RUNTIME_STACK_MAX_FRAMES, + runtime_stack_buffer + ); + + if (frame_count <= 0) { + emit_placeholder_frame(emit_frame, ""); + return; + } + + for (int i = frame_count - 1; i >= 0; i--) { + frame_info *info = &runtime_stack_buffer[i]; + + if (info->is_ruby_frame) { + const void *iseq = (const void *)info->as.ruby_frame.iseq; + ddog_CharSlice function_slice = DDOG_CHARSLICE_C(""); + ddog_CharSlice file_slice = DDOG_CHARSLICE_C(""); + + if (iseq && is_pointer_readable(iseq, sizeof_rb_iseq_t())) { + function_slice = safe_string_value(ddtrace_iseq_base_label(iseq)); + file_slice = safe_string_value(ddtrace_iseq_path(iseq)); + } + + ddog_crasht_RuntimeStackFrame frame = { + .type_name = DDOG_CHARSLICE_C(""), + .function = function_slice, + .file = file_slice, + .line = info->as.ruby_frame.line, + .column = 0 + }; + + emit_frame(&frame); + } else { + ddog_CharSlice function_slice = DDOG_CHARSLICE_C(""); + ddog_CharSlice file_slice = DDOG_CHARSLICE_C(""); + + if (info->as.native_frame.method_id) { + const char *method_name = rb_id2name(info->as.native_frame.method_id); + if (is_pointer_readable(method_name, 256)) { + size_t method_name_len = strnlen(method_name, 256); + if (method_name_len > 0 && method_name_len < 256) { + function_slice = char_slice_from_cstr(method_name); + } + } + } + + ddog_crasht_RuntimeStackFrame frame = { + .type_name = DDOG_CHARSLICE_C(""), + .function = function_slice, + .file = file_slice, + .line = 0, + .column = 0 + }; + + emit_frame(&frame); + } + } + + if (frame_count == RUNTIME_STACK_MAX_FRAMES) { + emit_placeholder_frame(emit_frame, ""); + } +} + +void crashtracking_runtime_stacks_init(void) { + cached_page_size = (uintptr_t)sysconf(_SC_PAGESIZE); + if (cached_page_size == 0 || (cached_page_size & (cached_page_size - 1u))) { + cached_page_size = 4096; + } + + if (crashtracker_thread_data_type == NULL) { + VALUE current_thread = rb_thread_current(); + if (current_thread 
== Qnil) { + rb_raise(rb_eRuntimeError, "crashtracking_runtime_stacks_init: rb_thread_current returned Qnil"); + } + + const rb_data_type_t *thread_data_type = RTYPEDDATA_TYPE(current_thread); + if (!thread_data_type) { + rb_raise(rb_eRuntimeError, "crashtracking_runtime_stacks_init: RTYPEDDATA_TYPE returned NULL"); + } + + crashtracker_thread_data_type = thread_data_type; + } + + // Register immediately so Ruby doesn't need to manage this explicitly. + ddog_crasht_register_runtime_frame_callback(ruby_runtime_stack_callback); +} + +#else +// Keep the init symbol to satisfy linkage on non-Linux platforms, but do nothing +void crashtracking_runtime_stacks_init(void) {} +#endif + diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.c b/ext/datadog_profiling_native_extension/private_vm_api_access.c index dd846376987..5b093a59ccb 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.c +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.c @@ -76,6 +76,18 @@ rb_nativethread_id_t pthread_id_for(VALUE thread) { #endif } +size_t sizeof_rb_iseq_t(void) { + return sizeof(rb_iseq_t); +} + +VALUE ddtrace_iseq_base_label(const void *iseq) { + return rb_iseq_base_label((const rb_iseq_t *)iseq); +} + +VALUE ddtrace_iseq_path(const void *iseq) { + return rb_iseq_path((const rb_iseq_t *)iseq); +} + // Queries if the current thread is the owner of the global VM lock. 
// // @ivoanjo: Ruby has a similarly-named `ruby_thread_has_gvl_p` but that API is insufficient for our needs because it can diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.h b/ext/datadog_profiling_native_extension/private_vm_api_access.h index 5757ed0bf93..ac95b90b13d 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.h +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.h @@ -47,6 +47,10 @@ VALUE thread_name_for(VALUE thread); int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *stack_buffer); +size_t sizeof_rb_iseq_t(void); +VALUE ddtrace_iseq_base_label(const void *iseq); +VALUE ddtrace_iseq_path(const void *iseq); + // Returns true if the current thread belongs to the main Ractor or if Ruby has no Ractor support bool ddtrace_rb_ractor_main_p(void); diff --git a/ext/datadog_profiling_native_extension/profiling.c b/ext/datadog_profiling_native_extension/profiling.c index bcf97709c14..0daa3372873 100644 --- a/ext/datadog_profiling_native_extension/profiling.c +++ b/ext/datadog_profiling_native_extension/profiling.c @@ -23,6 +23,7 @@ void collectors_thread_context_init(VALUE profiling_module); void encoded_profile_init(VALUE profiling_module); void http_transport_init(VALUE profiling_module); void stack_recorder_init(VALUE profiling_module); +void crashtracking_runtime_stacks_init(void); static VALUE native_working_p(VALUE self); static VALUE _native_grab_gvl_and_raise(DDTRACE_UNUSED VALUE _self, VALUE exception_class, VALUE test_message, VALUE test_message_arg, VALUE release_gvl); @@ -65,6 +66,7 @@ void DDTRACE_EXPORT Init_datadog_profiling_native_extension(void) { encoded_profile_init(profiling_module); http_transport_init(profiling_module); stack_recorder_init(profiling_module); + crashtracking_runtime_stacks_init(); unsafe_api_calls_check_init(); // Hosts methods used for testing the native code using RSpec diff --git a/spec/datadog/core/crashtracking/component_spec.rb 
b/spec/datadog/core/crashtracking/component_spec.rb index a165161bd5e..537f7aa1ff3 100644 --- a/spec/datadog/core/crashtracking/component_spec.rb +++ b/spec/datadog/core/crashtracking/component_spec.rb @@ -198,14 +198,15 @@ let(:parsed_request) { JSON.parse(request.body, symbolize_names: true) } let(:crash_report) { parsed_request.fetch(:payload).first } let(:crash_report_message) { JSON.parse(crash_report.fetch(:message), symbolize_names: true) } + let(:crash_report_experimental) { crash_report_message.fetch(:experimental) } let(:stack_trace) { crash_report_message.fetch(:error).fetch(:stack).fetch(:frames) } # NOTE: If any of these tests seem flaky, the `upload_timeout_seconds` may need to be raised (or otherwise # we need to tweak libdatadog to not need such high timeouts). [ - [:fiddle, "rb_fiddle_free", proc { Fiddle.free(42) }], - [:signal, "rb_f_kill", proc { Process.kill("SEGV", Process.pid) }], + [:fiddle, 'rb_fiddle_free', proc { Fiddle.free(42) }], + [:signal, 'rb_f_kill', proc { Process.kill('SEGV', Process.pid) }], ].each do |trigger_name, function, trigger| it "reports crashes via http when app crashes with #{trigger_name}" do expect_in_fork(fork_expectations: fork_expectations, timeout_seconds: 15) do @@ -213,10 +214,8 @@ crash_tracker.start trigger.call end - expect(stack_trace).to match(array_including(hash_including(function: function))) expect(stack_trace.size).to be > 10 - expect(crash_report[:tags]).to include('si_signo:11', 'si_signo_human_readable:SIGSEGV') expect(crash_report_message[:metadata]).to include( @@ -318,6 +317,71 @@ end end end + + describe 'Ruby and C method runtime stack capture' do + let(:runtime_stack) { crash_report_experimental[:runtime_stack] } + + before do + raise 'This spec requires profiling (native extension not available)' unless Datadog::Profiling.supported? 
+ end + + it 'captures both Ruby and C method frames in mixed stacks' do + expect_in_fork(fork_expectations: fork_expectations, timeout_seconds: 15) do + crash_stack_helper_class = Class.new do + def top_level_ruby_method + ruby_method_with_c_calls + end + + def ruby_method_with_c_calls + 'hello world'.gsub('world') do |_match| + {a: 1, b: 2}.each do |_key, _value| + Fiddle.free(42) + end + end + end + end + + crash_tracker = build_crashtracker(agent_base_url: agent_base_url) + crash_tracker.start + + crash_stack_helper_class.new.top_level_ruby_method + end + + frames = runtime_stack[:frames] + + # Check that the crashing function is captured + expect(frames).to include( + hash_including( + function: 'free' + ) + ) + + # Sanity check some frames + expect(frames).to include( + hash_including( + function: 'ruby_method_with_c_calls' + ) + ) + + expect(frames).to include( + hash_including( + function: 'top_level_ruby_method' + ) + ) + + expect(frames).to include( + hash_including( + function: 'each' + ) + ) + + expect(frames).to include( + hash_including( + function: 'gsub' + ) + ) + end + end end end diff --git a/spec/datadog/core/telemetry/integration/telemetry_spec.rb b/spec/datadog/core/telemetry/integration/telemetry_spec.rb index fdc640605fe..1bad9a2e97a 100644 --- a/spec/datadog/core/telemetry/integration/telemetry_spec.rb +++ b/spec/datadog/core/telemetry/integration/telemetry_spec.rb @@ -89,7 +89,7 @@ # The most common unsupported reason is failure to load profiling # C extension due to it not having been compiled - we get that in # some CI configurations. - expect(Datadog::Profiling).to receive(:unsupported_reason).and_return(nil) + expect(Datadog::Profiling).to receive(:unsupported_reason).at_least(:once).and_return(nil) end end