@@ -277,38 +277,43 @@ ERT_CLIB_SUBMODULE("queue", m) {
277
277
int (current_status), std::nullopt);
278
278
}
279
279
280
- std::optional<std::string> msg = std::nullopt;
281
-
282
- if ((current_status & JOB_QUEUE_RUNNING) &&
283
- (node->status_file && !(fs::exists (node->status_file )))) {
284
- // it's running, but not confirmed running.
285
- time_t runtime = time (nullptr ) - node->sim_start ;
286
- if (runtime >= MAX_CONFIRMED_WAIT) {
287
- std::string error_msg = fmt::format (
288
- " max_confirm_wait ({}) has passed since sim_start"
289
- " without success; {} is assumed dead (attempt {})" ,
290
- MAX_CONFIRMED_WAIT, node->job_name , node->submit_attempt );
291
- logger->info (error_msg);
292
- msg = error_msg;
293
- job_status_type new_status = JOB_QUEUE_DO_KILL_NODE_FAILURE;
294
- job_queue_node_set_status (node, new_status);
280
+ std::optional<std::string> error_msg = std::nullopt;
281
+
282
+ if (current_status & JOB_QUEUE_RUNNING && !node->confirmed_running ) {
283
+ node->confirmed_running =
284
+ node->status_file && fs::exists (node->status_file );
285
+
286
+ if (!node->confirmed_running ) {
287
+ if ((time (nullptr ) - node->sim_start ) >= MAX_CONFIRMED_WAIT) {
288
+ error_msg = fmt::format (
289
+ " max_confirm_wait ({}) has passed since sim_start"
290
+ " without success; {} is assumed dead (attempt {})" ,
291
+ MAX_CONFIRMED_WAIT, node->job_name ,
292
+ node->submit_attempt );
293
+ logger->info (error_msg.value ());
294
+ job_queue_node_set_status (node,
295
+ JOB_QUEUE_DO_KILL_NODE_FAILURE);
296
+ current_status = JOB_QUEUE_DO_KILL_NODE_FAILURE;
297
+ }
295
298
}
296
299
}
297
300
298
- current_status = job_queue_node_get_status (node);
299
301
if (current_status & JOB_QUEUE_CAN_UPDATE_STATUS) {
300
302
job_status_type new_status =
301
303
queue_driver_get_status (driver, node->job_data );
304
+
302
305
if (new_status == JOB_QUEUE_EXIT)
303
306
job_queue_node_fscanf_EXIT (node);
307
+
304
308
job_queue_node_set_status (node, new_status);
305
- current_status = job_queue_node_get_status (node) ;
309
+ current_status = new_status ;
306
310
}
307
- if (node->fail_message .has_value () and !msg.has_value ())
308
- msg = node->fail_message ;
311
+
312
+ if (node->fail_message .has_value () and !error_msg.has_value ())
313
+ error_msg = node->fail_message ;
309
314
310
315
pthread_mutex_unlock (&node->data_mutex );
311
316
return std::make_pair<int , std::optional<std::string>>(
312
- int (current_status), std::move (msg ));
317
+ int (current_status), std::move (error_msg ));
313
318
});
314
319
}
0 commit comments