From a72a71102a67cb28a1c4f388c90e956da0e5053b Mon Sep 17 00:00:00 2001 From: Matt Warkentin Date: Tue, 7 Oct 2025 09:25:49 -0600 Subject: [PATCH 1/3] Fix answer text in scorer logging --- NEWS.md | 5 +++++ R/translate-events.R | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6fe9bed..2605d04 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # vitals (development version) +* The log viewer previously reported the solver's response as the answer provided + to the scorer. However, these two texts can differ when post-processing of + the solver's response is performed. This is now fixed in the log + viewer (#166, #169 by @mattwarkentin). + * Fixed bug where non-default grading systems in model-graded evals would result in scores being wiped during logging (#139). diff --git a/R/translate-events.R b/R/translate-events.R index 259f386..213a41e 100644 --- a/R/translate-events.R +++ b/R/translate-events.R @@ -690,8 +690,6 @@ create_scoring_model_event <- function(turn, sample, timestamp) { } create_score_event <- function(turn, sample, timestamp) { - solver_chat <- sample$solver_chat[[1]] - solver_turn <- solver_chat$last_turn() scorer_user_turn <- sample$scorer_chat[[1]]$get_turns()[[1]] list(list( @@ -700,7 +698,7 @@ create_score_event <- function(turn, sample, timestamp) { event = "score", score = list( value = "C", - answer = solver_turn@text, + answer = sample$result, explanation = turn@text, metadata = list( grading = list( From 6af2a2757ac2985c185317a0185dd39cb71de6b6 Mon Sep 17 00:00:00 2001 From: Matt Warkentin Date: Fri, 10 Oct 2025 13:32:39 -0600 Subject: [PATCH 2/3] Fix hard-coding of score in scorer event logging --- R/translate-events.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/translate-events.R b/R/translate-events.R index 213a41e..26d386b 100644 --- a/R/translate-events.R +++ b/R/translate-events.R @@ -697,7 +697,7 @@ create_score_event <- function(turn, sample, timestamp) { working_start = attr(turn, "working_start"), event = "score", score = list( - value = "C", + value = sample$score, answer = sample$result, explanation = turn@text, metadata = list( From 2c310841709c3eb10955a07a3d9ab069abdc819a Mon Sep 17 00:00:00 2001 From: simonpcouch Date: Mon, 20 Oct 2025 07:21:38 -0500 Subject: [PATCH 3/3] handle numeric solver result case --- R/translate-events.R | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/R/translate-events.R b/R/translate-events.R index 26d386b..bec2131 100644 --- a/R/translate-events.R +++ b/R/translate-events.R @@ -291,7 +291,11 @@ create_tool_event <- function(turn, tool_result, timestamps) { id = tool_result@request@id, `function` = tool_result@request@name, arguments = tool_result@request@arguments, - result = if (!is.null(tool_result@error)) as.character(tool_result@error) else collapse_tool_result(tool_result), + result = if (!is.null(tool_result@error)) { + as.character(tool_result@error) + } else { + collapse_tool_result(tool_result) + }, events = list(), completed = events_timestamp(timestamp), working_time = attr(turn, "working_time") @@ -443,11 +447,20 @@ create_model_event <- function(turn, sample) { } else { # Handle tool results that may contain image objects lapply(msg$content, function(item) { - if (is.list(item) && identical(item$type, "image") && !is.null(item$source)) { + if ( + is.list(item) && + identical(item$type, "image") && + !is.null(item$source) + ) { # Convert image object to ContentImage format list( type = "image", - image = paste0("data:", item$source$media_type, ";base64,", item$source$data) + image = paste0( + "data:", + item$source$media_type, + ";base64,", + item$source$data + ) ) } else { item @@ -456,7 +469,11 @@ create_model_event <- function(turn, sample) { }, # This depends specifically on previous helpers using # `as_character()` on conditions to extract error messages - is_error = if (is.character(msg$content)) grepl("Error in", msg$content) else FALSE + is_error = if (is.character(msg$content)) { + grepl("Error in", msg$content) + } else { + FALSE + } )) )) } else if (msg$role == "user") { @@ -488,11 +505,20 @@ create_model_event <- function(turn, sample) { # Handle content that may contain image objects processed_content <- if (is.list(msg$content)) { lapply(msg$content, function(item) { - if (is.list(item) && identical(item$type, "image") && !is.null(item$source)) { + if ( + is.list(item) && + identical(item$type, "image") && + !is.null(item$source) + ) { # Convert image object to ContentImage format list( type = "image", - image = paste0("data:", item$source$media_type, ";base64,", item$source$data) + image = paste0( + "data:", + item$source$media_type, + ";base64,", + item$source$data + ) ) } else { item @@ -501,7 +527,7 @@ create_model_event <- function(turn, sample) { } else { msg$content } - + return(list( role = "assistant", content = processed_content @@ -698,7 +724,7 @@ create_score_event <- function(turn, sample, timestamp) { event = "score", score = list( value = sample$score, - answer = sample$result, + answer = as.character(sample$result), explanation = turn@text, metadata = list( grading = list(