diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 5a11eefd7e3..c9dff932061 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -297,15 +297,17 @@ ACTOR Future clusterWatchDatabase(ClusterControllerData* cluster, // really don't want to have to start over loop choose { when(wait(recoveryCore)) {} - when(wait(waitFailureClient( - iMaster.waitFailure, - db->masterRegistrationCount - ? SERVER_KNOBS->MASTER_FAILURE_REACTION_TIME - : (now() - recoveryStart) * SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY, - db->masterRegistrationCount ? -SERVER_KNOBS->MASTER_FAILURE_REACTION_TIME / - SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY - : SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY) || - db->forceMasterFailure.onTrigger())) { + when(wait( + traceAfter(waitFailureClient( + iMaster.waitFailure, + db->masterRegistrationCount + ? SERVER_KNOBS->MASTER_FAILURE_REACTION_TIME + : (now() - recoveryStart) * SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY, + db->masterRegistrationCount ? -SERVER_KNOBS->MASTER_FAILURE_REACTION_TIME / + SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY + : SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY), + "CCWDBMasterFailed") || + traceAfter(db->forceMasterFailure.onTrigger(), "CCWDBForceMasterFailureTriggered"))) { break; } when(wait(db->serverInfo->onChange())) {} diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index e08b98fdafb..4d5e11da672 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -475,33 +475,36 @@ ACTOR Future TagPartitionedLogSystem::onError_internal(TagPartitionedLogSy for (auto& it : self->tLogs) { for (auto& t : it->logServers) { if (t->get().present()) { - failed.push_back( - waitFailureClient(t->get().interf().waitFailure, - SERVER_KNOBS->TLOG_TIMEOUT, - -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); + failed.push_back(waitFailureClient(t->get().interf().waitFailure, + /* failureReactionTime */ SERVER_KNOBS->TLOG_TIMEOUT, + /* failureReactionSlope */ -SERVER_KNOBS->TLOG_TIMEOUT / + SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /* trace */ true, + /* traceMsg */ "TLogFailed"_sr)); } else { changes.push_back(t->onChange()); } } for (auto& t : it->logRouters) { if (t->get().present()) { - failed.push_back( - waitFailureClient(t->get().interf().waitFailure, - SERVER_KNOBS->TLOG_TIMEOUT, - -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); + failed.push_back(waitFailureClient(t->get().interf().waitFailure, + /* failureReactionTime */ SERVER_KNOBS->TLOG_TIMEOUT, + /* failureReactionSlope */ -SERVER_KNOBS->TLOG_TIMEOUT / + SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /* trace */ true, + /* traceMsg */ "LogRouterFailed"_sr)); } else { changes.push_back(t->onChange()); } } for (const auto& worker : it->backupWorkers) { if (worker->get().present()) { - backupFailed.push_back( - waitFailureClient(worker->get().interf().waitFailure, - SERVER_KNOBS->BACKUP_TIMEOUT, - -SERVER_KNOBS->BACKUP_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); + backupFailed.push_back(waitFailureClient(worker->get().interf().waitFailure, + /* failureReactionTime */ SERVER_KNOBS->BACKUP_TIMEOUT, + /* failureReactionSlope */ -SERVER_KNOBS->BACKUP_TIMEOUT / + SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /* trace */ true, + /* traceMsg */ "BackupWorkerFailed"_sr)); } else { changes.push_back(worker->onChange()); } @@ -514,10 +517,11 @@ ACTOR Future TagPartitionedLogSystem::onError_internal(TagPartitionedLogSy for (auto& t : it->logRouters) { if (t->get().present()) { failed.push_back(waitFailureClient(t->get().interf().waitFailure, - SERVER_KNOBS->TLOG_TIMEOUT, - -SERVER_KNOBS->TLOG_TIMEOUT / + /* failureReactionTime */ SERVER_KNOBS->TLOG_TIMEOUT, + /* failureReactionSlope */ -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); + /* trace */ true, + /* traceMsg */ "OldLogRouterFailed"_sr)); } else { changes.push_back(t->onChange()); } @@ -526,11 +530,13 @@ ACTOR Future TagPartitionedLogSystem::onError_internal(TagPartitionedLogSy // Monitor changes of backup workers for old epochs. for (const auto& worker : old.tLogs[0]->backupWorkers) { if (worker->get().present()) { - backupFailed.push_back(waitFailureClient(worker->get().interf().waitFailure, - SERVER_KNOBS->BACKUP_TIMEOUT, - -SERVER_KNOBS->BACKUP_TIMEOUT / - SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); + backupFailed.push_back( + waitFailureClient(worker->get().interf().waitFailure, + /* failureReactionTime */ SERVER_KNOBS->BACKUP_TIMEOUT, + /* failureReactionSlope */ -SERVER_KNOBS->BACKUP_TIMEOUT / + SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /* trace */ true, + /* traceMsg */ "OldBackupWorkerFailed"_sr)); } else { changes.push_back(worker->onChange()); } @@ -546,7 +552,8 @@ ACTOR Future TagPartitionedLogSystem::onError_internal(TagPartitionedLogSy changes.push_back(self->backupWorkerChanged.onTrigger()); ASSERT(failed.size() >= 1); - wait(quorum(changes, 1) || tagError(quorum(failed, 1), tlog_failed()) || + wait(quorum(changes, 1) || + tagError(traceAfter(quorum(failed, 1), "TPLSOnErrorLogSystemFailed"), tlog_failed()) || tagError(quorum(backupFailed, 1), backup_worker_failed())); } } diff --git a/fdbserver/WaitFailure.actor.cpp b/fdbserver/WaitFailure.actor.cpp index 267d01496ce..3ef1432bf87 100644 --- a/fdbserver/WaitFailure.actor.cpp +++ b/fdbserver/WaitFailure.actor.cpp @@ -42,6 +42,7 @@ ACTOR Future waitFailureClient(RequestStream> waitFailu double reactionTime, double reactionSlope, bool trace, + Optional> traceMsg, TaskPriority taskID) { loop { try { @@ -50,9 +51,12 @@ ACTOR Future waitFailureClient(RequestStream> waitFailu wait(waitFailure.getReplyUnlessFailedFor(ReplyPromise(), reactionTime, reactionSlope, taskID)); if (!x.present()) { if (trace) { - TraceEvent("WaitFailureClient") - .detail("FailedEndpoint", waitFailure.getEndpoint().getPrimaryAddress().toString()) + TraceEvent te("WaitFailureClient"); + te.detail("FailedEndpoint", waitFailure.getEndpoint().getPrimaryAddress().toString()) .detail("Token", waitFailure.getEndpoint().token); + if (traceMsg.present()) { + te.detail("Context", traceMsg.get()); + } } return Void(); } @@ -72,7 +76,12 @@ ACTOR Future waitFailureClientStrict(RequestStream> wai double failureReactionTime, TaskPriority taskID) { loop { - wait(waitFailureClient(waitFailure, 0, 0, false, taskID)); + wait(waitFailureClient(waitFailure, + /* reactionTime */ 0, + /* reactionSlope */ 0, + /* trace */ false, + /* traceMsg */ Optional>(), + taskID)); wait(delay(failureReactionTime, taskID) || IFailureMonitor::failureMonitor().onStateEqual(waitFailure.getEndpoint(), FailureStatus(false))); if (IFailureMonitor::failureMonitor().getState(waitFailure.getEndpoint()).isFailed()) { diff --git a/fdbserver/include/fdbserver/WaitFailure.h b/fdbserver/include/fdbserver/WaitFailure.h index 3baaf972933..e64c81c3ada 100644 --- a/fdbserver/include/fdbserver/WaitFailure.h +++ b/fdbserver/include/fdbserver/WaitFailure.h @@ -32,6 +32,7 @@ Future waitFailureClient(const RequestStream>& waitFail double const& failureReactionTime = 0, double const& failureReactionSlope = 0, bool const& trace = false, + Optional> const& traceMsg = Optional>(), TaskPriority const& taskID = TaskPriority::DefaultEndpoint); // talks to a wait failure server, returns Void on failure, reaction time is always waited