Skip to content

Commit 06f9d6a

Browse files
fix: mark remote prover as unhealthy in monitor (#1848)
1 parent bbe2e3f commit 06f9d6a

3 files changed

Lines changed: 24 additions & 8 deletions

File tree

bin/network-monitor/assets/index.js

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -367,8 +367,20 @@ function updateDisplay() {
367367
rpcService?.details?.RpcStatus?.block_producer_status?.chain_tip ??
368368
null;
369369

370+
// Compute effective health for a service, considering all signals for remote provers.
371+
const isServiceHealthy = (s) => {
372+
if (s.details && s.details.RemoteProverStatus) {
373+
const statusOk = s.status === 'Healthy';
374+
const testOk = s.testStatus == null || s.testStatus === 'Healthy';
375+
const probeResult = grpcWebProbeResults.get(s.details.RemoteProverStatus.url);
376+
const probeOk = !probeResult || probeResult.ok;
377+
return statusOk && testOk && probeOk;
378+
}
379+
return s.status === 'Healthy';
380+
};
381+
370382
// Count healthy vs unhealthy services
371-
const healthyServices = processedServices.filter(s => s.status === 'Healthy').length;
383+
const healthyServices = processedServices.filter(isServiceHealthy).length;
372384
const totalServices = processedServices.length;
373385
const allHealthy = healthyServices === totalServices;
374386

@@ -388,7 +400,7 @@ function updateDisplay() {
388400

389401
// Generate status cards
390402
const serviceCardsHtml = processedServices.map(service => {
391-
const isHealthy = service.status === 'Healthy';
403+
const isHealthy = isServiceHealthy(service);
392404
const statusColor = isHealthy ? '#22C55D' : '#ff5500';
393405
const statusIcon = isHealthy ? '✓' : '✗';
394406
const numOrDash = value => isHealthy ? (value?.toLocaleString?.() ?? value ?? '-') : '-';

bin/network-monitor/src/faucet.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ pub async fn run_faucet_test_task(
102102
let mut success_count = 0u64;
103103
let mut failure_count = 0u64;
104104
let mut last_tx_id = None;
105+
let mut last_error: Option<String>;
105106
let mut faucet_metadata = None;
106107

107108
let mut interval = tokio::time::interval(test_interval);
@@ -118,11 +119,13 @@ pub async fn run_faucet_test_task(
118119
Ok((minted_tokens, metadata)) => {
119120
success_count += 1;
120121
last_tx_id = Some(minted_tokens.tx_id.clone());
122+
last_error = None;
121123
faucet_metadata = Some(metadata);
122124
info!("Faucet test successful: tx_id={}", minted_tokens.tx_id);
123125
},
124126
Err(e) => {
125127
failure_count += 1;
128+
last_error = Some(format!("{e:#}"));
126129
warn!("Faucet test failed: {}", e);
127130
},
128131
}
@@ -140,13 +143,13 @@ pub async fn run_faucet_test_task(
140143

141144
let status = ServiceStatus {
142145
name: "Faucet".to_string(),
143-
status: if success_count > 0 || failure_count == 0 {
144-
Status::Healthy
145-
} else {
146+
status: if last_error.is_some() {
146147
Status::Unhealthy
148+
} else {
149+
Status::Healthy
147150
},
148151
last_checked: current_time,
149-
error: None,
152+
error: last_error.clone(),
150153
details: ServiceDetails::FaucetTest(test_details),
151154
};
152155

bin/network-monitor/src/status.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -590,10 +590,11 @@ pub(crate) async fn check_remote_prover_status(
590590
// Use the new method to convert gRPC status to domain type
591591
let remote_prover_details = RemoteProverStatusDetails::from_proxy_status(status, url);
592592

593-
// Determine overall health based on worker statuses
593+
// Determine overall health based on worker statuses.
594+
// All workers must be healthy for the prover to be considered healthy.
594595
let overall_health = if remote_prover_details.workers.is_empty() {
595596
Status::Unknown
596-
} else if remote_prover_details.workers.iter().any(|w| w.status == Status::Healthy) {
597+
} else if remote_prover_details.workers.iter().all(|w| w.status == Status::Healthy) {
597598
Status::Healthy
598599
} else {
599600
Status::Unhealthy

0 commit comments

Comments
 (0)