Skip to content

Commit bcafbcb

Browse files
committed
Refactor refresh function to store confirmed running state
1 parent daae973 commit bcafbcb

File tree

2 files changed

+26
-20
lines changed

2 files changed

+26
-20
lines changed

src/clib/lib/include/ert/job_queue/job_node.hpp

+1
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@ struct job_queue_node_struct {
4343
/** The commandline arguments. */
4444
char **argv;
4545
int queue_index = 0;
46+
bool confirmed_running = false;
4647

4748
std::optional<std::string> fail_message{};
4849

src/clib/lib/job_queue/job_node.cpp

+25 -20
Original file line number | Diff line number | Diff line change
@@ -277,38 +277,43 @@ ERT_CLIB_SUBMODULE("queue", m) {
277277
int(current_status), std::nullopt);
278278
}
279279

280-
std::optional<std::string> msg = std::nullopt;
281-
282-
if ((current_status & JOB_QUEUE_RUNNING) &&
283-
(node->status_file && !(fs::exists(node->status_file)))) {
284-
// it's running, but not confirmed running.
285-
time_t runtime = time(nullptr) - node->sim_start;
286-
if (runtime >= MAX_CONFIRMED_WAIT) {
287-
std::string error_msg = fmt::format(
288-
"max_confirm_wait ({}) has passed since sim_start"
289-
"without success; {} is assumed dead (attempt {})",
290-
MAX_CONFIRMED_WAIT, node->job_name, node->submit_attempt);
291-
logger->info(error_msg);
292-
msg = error_msg;
293-
job_status_type new_status = JOB_QUEUE_DO_KILL_NODE_FAILURE;
294-
job_queue_node_set_status(node, new_status);
280+
std::optional<std::string> error_msg = std::nullopt;
281+
282+
if (current_status & JOB_QUEUE_RUNNING && !node->confirmed_running) {
283+
node->confirmed_running =
284+
node->status_file && fs::exists(node->status_file);
285+
286+
if (!node->confirmed_running) {
287+
if ((time(nullptr) - node->sim_start) >= MAX_CONFIRMED_WAIT) {
288+
error_msg = fmt::format(
289+
"max_confirm_wait ({}) has passed since sim_start"
290+
"without success; {} is assumed dead (attempt {})",
291+
MAX_CONFIRMED_WAIT, node->job_name,
292+
node->submit_attempt);
293+
logger->info(error_msg.value());
294+
job_queue_node_set_status(node,
295+
JOB_QUEUE_DO_KILL_NODE_FAILURE);
296+
current_status = JOB_QUEUE_DO_KILL_NODE_FAILURE;
297+
}
295298
}
296299
}
297300

298-
current_status = job_queue_node_get_status(node);
299301
if (current_status & JOB_QUEUE_CAN_UPDATE_STATUS) {
300302
job_status_type new_status =
301303
queue_driver_get_status(driver, node->job_data);
304+
302305
if (new_status == JOB_QUEUE_EXIT)
303306
job_queue_node_fscanf_EXIT(node);
307+
304308
job_queue_node_set_status(node, new_status);
305-
current_status = job_queue_node_get_status(node);
309+
current_status = new_status;
306310
}
307-
if (node->fail_message.has_value() and !msg.has_value())
308-
msg = node->fail_message;
311+
312+
if (node->fail_message.has_value() and !error_msg.has_value())
313+
error_msg = node->fail_message;
309314

310315
pthread_mutex_unlock(&node->data_mutex);
311316
return std::make_pair<int, std::optional<std::string>>(
312-
int(current_status), std::move(msg));
317+
int(current_status), std::move(error_msg));
313318
});
314319
}

0 commit comments

Comments
 (0)