Skip to content

Commit 781bfc3

Browse files
fix(logs): add logs flush on crash (#1404)
* Add logs-on-crash * skip crashpad test on macos * add straight-to-disk writing of log envelopes if crashing * CHANGELOG.md * restore thread join for normal shutdown * directly take log item from httpserver log for crashpad test * update CHANGELOG.md * we know which envelope has the log * we **don't** know which envelope has the log * add more general request fetch from httpserver function * add logs return value checks * fix tsan global data race on logs validation * move database-path cleanup into the cmake build cache fixture this eliminates boilerplate noise in the tests but also reduces the chance of forgetting it in a test. * in the crash-safe logs flush detach the batcher thread and don't wake it since we flush anyway. * don't clean for unit-test crash_marker * don't detach in crash-safe but spin-lock in its flusher to acquire flushing. * add clean parameter to cmake fixture * ensure logs flushing in crash handlers happens within the handler sync bounds. * bound the spin lock in the logs flusher during crash-safe mode * exclude crashed_last_run unit-test from cleaning database paths * use an unsigned integer for the sleep_time of the crash-safe logs flusher spinlock * provide cross-platform cpu_relax and replace syscall sleep with spinners. * isolate `crash_marker` unit test and eliminate exceptions to database cleaning in the pytest configuration * platform path special case in crash_marker unit test * crashed_last_run can no longer assume that a database path exists this means when it does its clearing a TEST_CHECK cannot assume that a directory even exists (in which case clear_crash_marker would actually fail versus the case were only the marker file didn't exist). * clean up log statements * bump CI python to 3.12 * fix CHANGELOG.md --------- Co-authored-by: Mischan Toosarani-Hausberger <[email protected]>
1 parent e041a2b commit 781bfc3

21 files changed

+413
-170
lines changed

.github/workflows/benchmark.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737

3838
- uses: actions/setup-python@v5
3939
with:
40-
python-version: "3.11"
40+
python-version: "3.12"
4141
cache: "pip"
4242

4343
- name: Install Linux dependencies

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ jobs:
189189
submodules: recursive
190190
- uses: actions/setup-python@v5
191191
with:
192-
python-version: ${{ !env['SYSTEM_PYTHON'] && '3.11' || '' }}
192+
python-version: ${{ !env['SYSTEM_PYTHON'] && '3.12' || '' }}
193193
cache: "pip"
194194

195195
- name: Installing Linux Dependencies

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## Unreleased
4+
5+
**Fixes**:
6+
7+
- Add logs flush on crash. This is not available for macOS with the `crashpad` backend. ([#1404](https://github.com/getsentry/sentry-native/pull/1404))
8+
39
## 0.11.3
410

511
**Features**:

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ sentry_target_sources_cwd(sentry
88
sentry_boot.h
99
sentry_core.c
1010
sentry_core.h
11+
sentry_cpu_relax.h
1112
sentry_database.c
1213
sentry_database.h
1314
sentry_envelope.c

src/backends/sentry_backend_breakpad.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ extern "C" {
88
#include "sentry_database.h"
99
#include "sentry_envelope.h"
1010
#include "sentry_logger.h"
11+
#include "sentry_logs.h"
1112
#include "sentry_options.h"
1213
#ifdef SENTRY_PLATFORM_WINDOWS
1314
# include "sentry_os.h"
@@ -125,6 +126,11 @@ breakpad_backend_callback(const google_breakpad::MinidumpDescriptor &descriptor,
125126
event, "level", sentry__value_new_level(SENTRY_LEVEL_FATAL));
126127

127128
SENTRY_WITH_OPTIONS (options) {
129+
// Flush logs in a crash-safe manner before crash handling
130+
if (options->enable_logs) {
131+
sentry__logs_flush_crash_safe();
132+
}
133+
128134
sentry__write_crash_marker(options);
129135

130136
bool should_handle = true;

src/backends/sentry_backend_crashpad.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@ extern "C" {
55
#include "sentry_attachment.h"
66
#include "sentry_backend.h"
77
#include "sentry_core.h"
8+
#include "sentry_cpu_relax.h"
89
#include "sentry_database.h"
910
#include "sentry_envelope.h"
1011
#include "sentry_logger.h"
12+
#include "sentry_logs.h"
1113
#include "sentry_options.h"
1214
#ifdef SENTRY_PLATFORM_WINDOWS
1315
# include "sentry_os.h"
@@ -287,6 +289,7 @@ flush_scope_from_handler(
287289
while (!state->scope_flush.compare_exchange_strong(
288290
expected, true, std::memory_order_acquire)) {
289291
expected = false;
292+
sentry__cpu_relax();
290293
}
291294

292295
// now we are the sole flusher and can flush into the crash event
@@ -305,6 +308,7 @@ static bool
305308
sentry__crashpad_handler(int signum, siginfo_t *info, ucontext_t *user_context)
306309
{
307310
sentry__page_allocator_enable();
311+
sentry__enter_signal_handler();
308312
# endif
309313
// Disable logging during crash handling if the option is set
310314
SENTRY_WITH_OPTIONS (options) {
@@ -318,6 +322,10 @@ sentry__crashpad_handler(int signum, siginfo_t *info, ucontext_t *user_context)
318322
bool should_dump = true;
319323

320324
SENTRY_WITH_OPTIONS (options) {
325+
// Flush logs in a crash-safe manner before crash handling
326+
if (options->enable_logs) {
327+
sentry__logs_flush_crash_safe();
328+
}
321329
auto state = static_cast<crashpad_state_t *>(options->backend->data);
322330
sentry_value_t crash_event
323331
= sentry__value_new_event_with_id(&state->crash_event_id);
@@ -404,6 +412,10 @@ sentry__crashpad_handler(int signum, siginfo_t *info, ucontext_t *user_context)
404412
# endif
405413
}
406414

415+
# ifndef SENTRY_PLATFORM_WINDOWS
416+
sentry__leave_signal_handler();
417+
# endif
418+
407419
// we did not "handle" the signal, so crashpad should do that.
408420
return false;
409421
}

src/backends/sentry_backend_inproc.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "sentry_database.h"
88
#include "sentry_envelope.h"
99
#include "sentry_logger.h"
10+
#include "sentry_logs.h"
1011
#include "sentry_options.h"
1112
#if defined(SENTRY_PLATFORM_WINDOWS)
1213
# include "sentry_os.h"
@@ -555,6 +556,10 @@ handle_ucontext(const sentry_ucontext_t *uctx)
555556
#endif
556557

557558
SENTRY_WITH_OPTIONS (options) {
559+
// Flush logs in a crash-safe manner before crash handling
560+
if (options->enable_logs) {
561+
sentry__logs_flush_crash_safe();
562+
}
558563
#ifdef SENTRY_PLATFORM_LINUX
559564
// On Linux (and thus Android) CLR/Mono converts signals provoked by
560565
// AOT/JIT-generated native code into managed code exceptions. In these

src/sentry_cpu_relax.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#ifndef SENTRY_CPU_RELAX_H_INCLUDED
2+
#define SENTRY_CPU_RELAX_H_INCLUDED
3+
4+
#if defined(_MSC_VER)
5+
# include <intrin.h>
6+
# if defined(_M_IX86) || defined(_M_X64)
7+
# define sentry__cpu_relax() _mm_pause()
8+
# elif defined(_M_ARM) || defined(_M_ARM64)
9+
# define sentry__cpu_relax() __yield()
10+
# else
11+
# define sentry__cpu_relax() __nop()
12+
# endif
13+
#else
14+
# if defined(__i386__) || defined(__x86_64__) || defined(__amd64__)
15+
# define sentry__cpu_relax() __asm__ __volatile__("pause")
16+
# elif defined(__aarch64__) || defined(__arm__)
17+
# define sentry__cpu_relax() __asm__ __volatile__("yield")
18+
# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
19+
# define sentry__cpu_relax() __asm__ __volatile__("or 27,27,27")
20+
# elif defined(__riscv)
21+
# if defined(__riscv_zihintpause)
22+
# define sentry__cpu_relax() __asm__ __volatile__("pause")
23+
# else
24+
# define sentry__cpu_relax() __asm__ __volatile__("nop")
25+
# endif
26+
# else
27+
# define sentry__cpu_relax() (void)0
28+
# endif
29+
#endif
30+
31+
#endif // SENTRY_CPU_RELAX_H_INCLUDED

src/sentry_logs.c

Lines changed: 89 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,28 @@
11
#include "sentry_logs.h"
22
#include "sentry_core.h"
3+
#include "sentry_cpu_relax.h"
34
#include "sentry_envelope.h"
45
#include "sentry_options.h"
56
#include "sentry_os.h"
67
#include "sentry_scope.h"
78
#include "sentry_sync.h"
8-
#if defined(SENTRY_PLATFORM_UNIX) || defined(SENTRY_PLATFORM_NX)
9-
# include "sentry_unix_spinlock.h"
10-
#endif
119
#include <stdarg.h>
1210
#include <string.h>
1311

1412
#ifdef SENTRY_UNITTEST
1513
# define QUEUE_LENGTH 5
16-
# ifdef SENTRY_PLATFORM_WINDOWS
17-
# include <windows.h>
18-
# define sleep_ms(MILLISECONDS) Sleep(MILLISECONDS)
19-
# else
20-
# include <unistd.h>
21-
# define sleep_ms(MILLISECONDS) usleep(MILLISECONDS * 1000)
22-
# endif
2314
#else
2415
# define QUEUE_LENGTH 100
2516
#endif
2617
#define FLUSH_TIMER 5
2718

19+
#ifdef SENTRY_PLATFORM_WINDOWS
20+
# include <windows.h>
21+
# define sleep_ms(MILLISECONDS) Sleep(MILLISECONDS)
22+
#else
23+
# include <unistd.h>
24+
# define sleep_ms(MILLISECONDS) usleep(MILLISECONDS * 1000)
25+
#endif
2826
/**
2927
* Thread lifecycle states for the logs batching thread.
3028
*/
@@ -87,13 +85,49 @@ check_for_flush_condition(void)
8785
return sentry__atomic_fetch(&current_buf->index) >= QUEUE_LENGTH;
8886
}
8987

88+
// Use a sleep spinner around a monotonic timer so we don't syscall sleep from
89+
// a signal handler. While this is strictly needed only there, there is no
90+
// reason not to use the same implementation across platforms.
9091
static void
91-
flush_logs_queue(void)
92+
crash_safe_sleep_ms(uint64_t delay_ms)
9293
{
93-
const long already_flushing
94-
= sentry__atomic_store(&g_logs_state.flushing, 1);
95-
if (already_flushing) {
96-
return;
94+
const uint64_t start = sentry__monotonic_time();
95+
const uint64_t end = start + delay_ms;
96+
while (sentry__monotonic_time() < end) {
97+
for (int i = 0; i < 64; i++) {
98+
sentry__cpu_relax();
99+
}
100+
}
101+
}
102+
103+
static void
104+
flush_logs_queue(bool crash_safe)
105+
{
106+
if (crash_safe) {
107+
// In crash-safe mode, spin lock with timeout and backoff
108+
int attempts = 0;
109+
while (!sentry__atomic_compare_swap(&g_logs_state.flushing, 0, 1)) {
110+
const int max_attempts = 200;
111+
if (++attempts > max_attempts) {
112+
SENTRY_WARN("flush_logs_queue: timeout waiting for flushing "
113+
"lock in crash-safe mode");
114+
return;
115+
}
116+
117+
// backoff max-wait with max_attempts = 200 based sleep slots:
118+
// 9ms + 450ms + 1010ms = 1500ish ms
119+
const uint32_t sleep_time = (attempts < 10) ? 1
120+
: (attempts < 100) ? 5
121+
: 10;
122+
crash_safe_sleep_ms(sleep_time);
123+
}
124+
} else {
125+
// Normal mode: try once and return if already flushing
126+
const long already_flushing
127+
= sentry__atomic_store(&g_logs_state.flushing, 1);
128+
if (already_flushing) {
129+
return;
130+
}
97131
}
98132
do {
99133
// prep both buffers
@@ -115,10 +149,7 @@ flush_logs_queue(void)
115149

116150
// Wait for all in-flight producers of the old buffer
117151
while (sentry__atomic_fetch(&old_buf->adding) > 0) {
118-
// TODO currently only on unix
119-
#ifdef SENTRY_PLATFORM_UNIX
120152
sentry__cpu_relax();
121-
#endif
122153
}
123154

124155
long n = sentry__atomic_store(&old_buf->index, 0);
@@ -139,8 +170,17 @@ flush_logs_queue(void)
139170

140171
sentry_envelope_t *envelope = sentry__envelope_new();
141172
sentry__envelope_add_logs(envelope, logs);
173+
142174
SENTRY_WITH_OPTIONS (options) {
143-
sentry__capture_envelope(options->transport, envelope);
175+
if (crash_safe) {
176+
// Write directly to disk to avoid transport queuing during
177+
// crash
178+
sentry__run_write_envelope(options->run, envelope);
179+
sentry_envelope_free(envelope);
180+
} else {
181+
// Normal operation: use transport for HTTP transmission
182+
sentry__capture_envelope(options->transport, envelope);
183+
}
144184
}
145185
sentry_value_decref(logs);
146186
}
@@ -260,20 +300,20 @@ batching_thread_func(void *data)
260300
case 0:
261301
#ifdef SENTRY_PLATFORM_WINDOWS
262302
if (GetLastError() == ERROR_TIMEOUT) {
263-
SENTRY_DEBUG("Logs flushed by timeout");
303+
SENTRY_TRACE("Logs flushed by timeout");
264304
break;
265305
}
266306
#endif
267-
SENTRY_DEBUG("Logs flushed by filled buffer");
307+
SENTRY_TRACE("Logs flushed by filled buffer");
268308
break;
269309
#ifdef SENTRY_PLATFORM_UNIX
270310
case ETIMEDOUT:
271-
SENTRY_DEBUG("Logs flushed by timeout");
311+
SENTRY_TRACE("Logs flushed by timeout");
272312
break;
273313
#endif
274314
#ifdef SENTRY_PLATFORM_WINDOWS
275315
case 1:
276-
SENTRY_DEBUG("Logs flushed by filled buffer");
316+
SENTRY_TRACE("Logs flushed by filled buffer");
277317
break;
278318
#endif
279319
default:
@@ -282,7 +322,7 @@ batching_thread_func(void *data)
282322
}
283323

284324
// Try to flush logs
285-
flush_logs_queue();
325+
flush_logs_queue(false);
286326
}
287327

288328
sentry__mutex_unlock(&task_lock);
@@ -833,13 +873,37 @@ sentry__logs_shutdown(uint64_t timeout)
833873
sentry__thread_join(g_logs_state.batching_thread);
834874

835875
// Perform final flush to ensure any remaining logs are sent
836-
flush_logs_queue();
876+
flush_logs_queue(false);
837877

838878
sentry__thread_free(&g_logs_state.batching_thread);
839879

840880
SENTRY_DEBUG("logs system shutdown complete");
841881
}
842882

883+
void
884+
sentry__logs_flush_crash_safe(void)
885+
{
886+
SENTRY_DEBUG("crash-safe logs flush");
887+
888+
// Check if logs system is initialized
889+
const long state = sentry__atomic_fetch(&g_logs_state.thread_state);
890+
if (state == SENTRY_LOGS_THREAD_STOPPED) {
891+
return;
892+
}
893+
894+
// Signal the thread to stop but don't wait, since the crash-safe flush
895+
// will spin-lock on flushing anyway.
896+
sentry__atomic_store(
897+
&g_logs_state.thread_state, (long)SENTRY_LOGS_THREAD_STOPPED);
898+
899+
// Perform crash-safe flush directly to disk to avoid transport queuing
900+
// This is safe because we're in a crash scenario and the main thread
901+
// is likely dead or dying anyway
902+
flush_logs_queue(true);
903+
904+
SENTRY_DEBUG("crash-safe logs flush complete");
905+
}
906+
843907
#ifdef SENTRY_UNITTEST
844908
/**
845909
* Wait for the logs batching thread to be ready.

src/sentry_logs.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,13 @@ void sentry__logs_startup(void);
1616
*/
1717
void sentry__logs_shutdown(uint64_t timeout);
1818

19+
/**
20+
* Crash-safe logs flush that avoids thread synchronization.
21+
* This should be used during crash handling to flush logs without
22+
* waiting for the batching thread to shut down cleanly.
23+
*/
24+
void sentry__logs_flush_crash_safe(void);
25+
1926
#ifdef SENTRY_UNITTEST
2027
int populate_message_parameters(
2128
sentry_value_t attributes, const char *message, va_list args);

0 commit comments

Comments
 (0)