From 24e243242365010fb08c256f441b6ab8af8139d2 Mon Sep 17 00:00:00 2001 From: simonpcouch Date: Tue, 29 Apr 2025 10:41:39 -0500 Subject: [PATCH 1/5] add tool to run queries against data frames --- DESCRIPTION | 1 + R/tool-query.R | 48 ++++++++ R/utils.R | 2 + man/btw_register_tools.Rd | 17 ++- man/btw_tool_env_query_data_frame.Rd | 19 ++++ tests/testthat/_snaps/tool-docs.new.md | 148 +++++++++++++++++++++++++ tests/testthat/_snaps/tool-query.md | 18 +++ tests/testthat/test-tool-query.R | 17 +++ 8 files changed, 265 insertions(+), 5 deletions(-) create mode 100644 R/tool-query.R create mode 100644 man/btw_tool_env_query_data_frame.Rd create mode 100644 tests/testthat/_snaps/tool-docs.new.md create mode 100644 tests/testthat/_snaps/tool-query.md create mode 100644 tests/testthat/test-tool-query.R diff --git a/DESCRIPTION b/DESCRIPTION index 530f268f..3f96b415 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,6 +22,7 @@ BugReports: https://github.com/posit-dev/btw/issues Imports: cli, clipr, + DBI, dplyr, ellmer (>= 0.1.1.9000), fs, diff --git a/R/tool-query.R b/R/tool-query.R new file mode 100644 index 00000000..89675a54 --- /dev/null +++ b/R/tool-query.R @@ -0,0 +1,48 @@ +#' Perform a SQL query on the data, and return the results as JSON. +#' +#' @param query A DuckDB SQL query; must be a SELECT statement. +# TODO: should any `get`table data frame work here? +#' @param data_frame The name of the data frame. +#' @return The results of the query as a JSON string. +btw_tool_env_query_data_frame <- function(query, data_frame) { + d <- get(data_frame) + conn <- btw_connection() + + if (!DBI::dbExistsTable(conn, data_frame)) { + duckdb::duckdb_register(conn, data_frame, d, experimental = FALSE) + } + + res <- DBI::dbGetQuery(conn, query) + + btw_tool_env_describe_data_frame(res, format = "json", dims = c(Inf, Inf)) +} + +.btw_add_to_tools( + name = "btw_tool_env_query_data_frame", + group = "env", + tool = function() { + ellmer::tool( + btw_tool_env_query_data_frame, + .name = "btw_tool_env_query_data_frame", + .description = + "Run a DuckDB SQL query against a data frame. + Use this tool instead of btw_tool_env_describe_data_frame to run more + targeted queries, e.g. calculating statistics on specific columns.", + query = ellmer::type_string("A DuckDB SQL query, as a string."), + data_frame = ellmer::type_string("The name of the data frame, as a string.") + ) + } +) + +btw_connect <- function() { + # TODO: also check if the connection is active + if (is.null(btw_env$conn)) { + btw_env$conn <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") + } +} + +btw_connection <- function() { + btw_connect() + + btw_env$conn +} diff --git a/R/utils.R b/R/utils.R index ca7d416e..7c5b98df 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,5 @@ +btw_env <- new_environment() + pandoc_convert <- function(path, ..., from = "html", to = "markdown") { tmp_file <- withr::local_tempfile() diff --git a/man/btw_register_tools.Rd b/man/btw_register_tools.Rd index ecc756f9..b6611379 100644 --- a/man/btw_register_tools.Rd +++ b/man/btw_register_tools.Rd @@ -29,12 +29,19 @@ to the tools:\tabular{lll}{ btw_tool_docs_vignette \tab docs \tab Get a package vignette in plain text. \cr btw_tool_env_describe_data_frame \tab env \tab Show the data frame or table or get information about the structure of a data frame or table. \cr btw_tool_env_describe_environment \tab env \tab List and describe items in an environment. \cr - btw_tool_files_list_files \tab files \tab List files in the current working directory or in subfolders in the current project directory. \cr - btw_tool_files_read_text_file \tab files \tab Read an entire text file. \cr - btw_tool_ide_read_current_editor \tab ide \tab Read the contents of the editor that is currently open in the user's IDE. \cr - btw_tool_session_package_info \tab session \tab Verify that a specific package is installed, or find out which packages are in use in the current session. \cr - btw_tool_session_platform_info \tab session \tab Describes the R version, operating system, language and locale settings for the user's system. \cr + btw_tool_env_query_data_frame \tab env \tab Run a DuckDB SQL query against a data frame. \cr } + + +\if{html}{\out{
}}\preformatted{ Use this tool instead of btw_tool_env_describe_data_frame to run more + targeted queries, e.g. | +}\if{html}{\out{
}} + +| btw_tool_files_list_files | files | List files in the current working directory or in subfolders in the current project directory. | +| btw_tool_files_read_text_file | files | Read an entire text file. | +| btw_tool_ide_read_current_editor | ide | Read the contents of the editor that is currently open in the user's IDE. | +| btw_tool_session_package_info | session | Verify that a specific package is installed, or find out which packages are in use in the current session. | +| btw_tool_session_platform_info | session | Describes the R version, operating system, language and locale settings for the user's system. | } \examples{ # requires an ANTHROPIC_API_KEY diff --git a/man/btw_tool_env_query_data_frame.Rd b/man/btw_tool_env_query_data_frame.Rd new file mode 100644 index 00000000..8b332ff9 --- /dev/null +++ b/man/btw_tool_env_query_data_frame.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tool-query.R +\name{btw_tool_env_query_data_frame} +\alias{btw_tool_env_query_data_frame} +\title{Perform a SQL query on the data, and return the results as JSON.} +\usage{ +btw_tool_env_query_data_frame(query, data_frame) +} +\arguments{ +\item{query}{A DuckDB SQL query; must be a SELECT statement.} + +\item{data_frame}{The name of the data frame.} +} +\value{ +The results of the query as a JSON string. +} +\description{ +Perform a SQL query on the data, and return the results as JSON. +} diff --git a/tests/testthat/_snaps/tool-docs.new.md b/tests/testthat/_snaps/tool-docs.new.md new file mode 100644 index 00000000..64251e8c --- /dev/null +++ b/tests/testthat/_snaps/tool-docs.new.md @@ -0,0 +1,148 @@ +# btw_tool_docs_help_page() works + + Code + cli::cat_line(res) + Output + ## `help(package = "stats", "rnorm")` + + ### The Normal Distribution + + #### Description + + Density, distribution function, quantile function and random generation + for the normal distribution with mean equal to `mean` and standard + deviation equal to `sd`. + + #### Usage + + ``` R + dnorm(x, mean = 0, sd = 1, log = FALSE) + pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE) + qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE) + rnorm(n, mean = 0, sd = 1) + ``` + + #### Arguments + + | | | + |----|----| + | `x`, `q` | vector of quantiles. | + | `p` | vector of probabilities. | + | `n` | number of observations. If `length(n) > 1`, the length is taken to be the number required. | + | `mean` | vector of means. | + | `sd` | vector of standard deviations. | + | `log`, `log.p` | logical; if TRUE, probabilities p are given as log(p). | + | `lower.tail` | logical; if TRUE (default), probabilities are `P[X \le x]` otherwise, `P[X > x]`. | + + #### Details + + If `mean` or `sd` are not specified they assume the default values of + `0` and `1`, respectively. + + The normal distribution has density + + ` f(x) = \frac{1}{\sqrt{2\pi}\sigma} e^{-(x-\mu)^2/2\sigma^2}` + + where `\mu` is the mean of the distribution and `\sigma` the standard + deviation. + + #### Value + + `dnorm` gives the density, `pnorm` gives the distribution function, + `qnorm` gives the quantile function, and `rnorm` generates random + deviates. + + The length of the result is determined by `n` for `rnorm`, and is the + maximum of the lengths of the numerical arguments for the other + functions. + + The numerical arguments other than `n` are recycled to the length of the + result. Only the first elements of the logical arguments are used. + + For `sd = 0` this gives the limit as `sd` decreases to 0, a point mass + at `mu`. `sd < 0` is an error and returns `NaN`. + + #### Source + + For `pnorm`, based on + + Cody, W. D. (1993) Algorithm 715: SPECFUN – A portable FORTRAN package + of special function routines and test drivers. *ACM Transactions on + Mathematical Software* **19**, 22–32. + + For `qnorm`, the code is based on a C translation of + + Wichura, M. J. (1988) Algorithm AS 241: The percentage points of the + normal distribution. *Applied Statistics*, **37**, 477–484; + [doi:10.2307/2347330](https://doi.org/10.2307/2347330). + + which provides precise results up to about 16 digits for `log.p=FALSE`. + For log scale probabilities in the extreme tails, since + **R** version 4.1.0, extensively since 4.3.0, + asymptotic expansions are used which have been derived and explored in + + Maechler, M. (2022) Asymptotic tail formulas for gaussian quantiles; + [DPQ](https://CRAN.R-project.org/package=DPQ) + vignette + . + + For `rnorm`, see RNG for how to select the algorithm and for references + to the supplied methods. + + #### References + + Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) *The New S + Language*. Wadsworth & Brooks/Cole. + + Johnson, N. L., Kotz, S. and Balakrishnan, N. (1995) *Continuous + Univariate Distributions*, volume 1, chapter 13. Wiley, New York. + + #### See Also + + Distributions for other standard distributions, including `dlnorm` for + the *Log*normal distribution. + + #### Examples + + ``` R + require(graphics) + + dnorm(0) == 1/sqrt(2*pi) + dnorm(1) == exp(-1/2)/sqrt(2*pi) + dnorm(1) == 1/sqrt(2*pi*exp(1)) + + ## Using "log = TRUE" for an extended range : + par(mfrow = c(2,1)) + plot(function(x) dnorm(x, log = TRUE), -60, 50, + main = "log { Normal density }") + curve(log(dnorm(x)), add = TRUE, col = "red", lwd = 2) + mtext("dnorm(x, log=TRUE)", adj = 0) + mtext("log(dnorm(x))", col = "red", adj = 1) + + plot(function(x) pnorm(x, log.p = TRUE), -50, 10, + main = "log { Normal Cumulative }") + curve(log(pnorm(x)), add = TRUE, col = "red", lwd = 2) + mtext("pnorm(x, log=TRUE)", adj = 0) + mtext("log(pnorm(x))", col = "red", adj = 1) + + ## if you want the so-called 'error function' + erf <- function(x) 2 * pnorm(x * sqrt(2)) - 1 + ## (see Abramowitz and Stegun 29.2.29) + ## and the so-called 'complementary error function' + erfc <- function(x) 2 * pnorm(x * sqrt(2), lower = FALSE) + ## and the inverses + erfinv <- function (x) qnorm((1 + x)/2)/sqrt(2) + erfcinv <- function (x) qnorm(x/2, lower = FALSE)/sqrt(2) + ``` + +# btw_tool_docs_help_page() with multiple help topics + + Code + btw_tool_docs_help_page("filter") + Condition + Error in `btw_tool_docs_help_page()`: + ! Topic "filter" matched 2 different topics. + i Choose one or submit individual tool calls for each topic. + * {"topic":"filter", "package_name":"stats"} + * {"topic":"filter", "package_name":"dplyr"} + diff --git a/tests/testthat/_snaps/tool-query.md b/tests/testthat/_snaps/tool-query.md new file mode 100644 index 00000000..edf12398 --- /dev/null +++ b/tests/testthat/_snaps/tool-query.md @@ -0,0 +1,18 @@ +# btw_tool_env_query_data_frame() works + + Code + btw_tool_env_query_data_frame("SELECT mpg FROM mtcars LIMIT 5;", "mtcars") + Output + [1] "```json" + [2] "[\n {\"mpg\":21},\n {\"mpg\":21},\n {\"mpg\":22.8},\n {\"mpg\":21.4},\n {\"mpg\":18.7}\n]" + [3] "```" + +--- + + Code + btw_tool_env_query_data_frame("SELECT mpg FROM mtcars LIMIT 5;", "mtcars") + Output + [1] "```json" + [2] "[\n {\"mpg\":21},\n {\"mpg\":21},\n {\"mpg\":22.8},\n {\"mpg\":21.4},\n {\"mpg\":18.7}\n]" + [3] "```" + diff --git a/tests/testthat/test-tool-query.R b/tests/testthat/test-tool-query.R new file mode 100644 index 00000000..f6a4bd33 --- /dev/null +++ b/tests/testthat/test-tool-query.R @@ -0,0 +1,17 @@ +test_that("btw_tool_env_query_data_frame() works", { + # can run a simple query + expect_snapshot( + btw_tool_env_query_data_frame( + "SELECT mpg FROM mtcars LIMIT 5;", + "mtcars" + ) + ) + + # can run a query against the same table twice + expect_snapshot( + btw_tool_env_query_data_frame( + "SELECT mpg FROM mtcars LIMIT 5;", + "mtcars" + ) + ) +}) From 24add82ce5f340a9c7182e4c51a6779ef703e9dd Mon Sep 17 00:00:00 2001 From: simonpcouch Date: Tue, 29 Apr 2025 10:42:59 -0500 Subject: [PATCH 2/5] delete new snaps --- tests/testthat/_snaps/tool-docs.new.md | 148 ------------------------- 1 file changed, 148 deletions(-) delete mode 100644 tests/testthat/_snaps/tool-docs.new.md diff --git a/tests/testthat/_snaps/tool-docs.new.md b/tests/testthat/_snaps/tool-docs.new.md deleted file mode 100644 index 64251e8c..00000000 --- a/tests/testthat/_snaps/tool-docs.new.md +++ /dev/null @@ -1,148 +0,0 @@ -# btw_tool_docs_help_page() works - - Code - cli::cat_line(res) - Output - ## `help(package = "stats", "rnorm")` - - ### The Normal Distribution - - #### Description - - Density, distribution function, quantile function and random generation - for the normal distribution with mean equal to `mean` and standard - deviation equal to `sd`. - - #### Usage - - ``` R - dnorm(x, mean = 0, sd = 1, log = FALSE) - pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE) - qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE) - rnorm(n, mean = 0, sd = 1) - ``` - - #### Arguments - - | | | - |----|----| - | `x`, `q` | vector of quantiles. | - | `p` | vector of probabilities. | - | `n` | number of observations. If `length(n) > 1`, the length is taken to be the number required. | - | `mean` | vector of means. | - | `sd` | vector of standard deviations. | - | `log`, `log.p` | logical; if TRUE, probabilities p are given as log(p). | - | `lower.tail` | logical; if TRUE (default), probabilities are `P[X \le x]` otherwise, `P[X > x]`. | - - #### Details - - If `mean` or `sd` are not specified they assume the default values of - `0` and `1`, respectively. - - The normal distribution has density - - ` f(x) = \frac{1}{\sqrt{2\pi}\sigma} e^{-(x-\mu)^2/2\sigma^2}` - - where `\mu` is the mean of the distribution and `\sigma` the standard - deviation. - - #### Value - - `dnorm` gives the density, `pnorm` gives the distribution function, - `qnorm` gives the quantile function, and `rnorm` generates random - deviates. - - The length of the result is determined by `n` for `rnorm`, and is the - maximum of the lengths of the numerical arguments for the other - functions. - - The numerical arguments other than `n` are recycled to the length of the - result. Only the first elements of the logical arguments are used. - - For `sd = 0` this gives the limit as `sd` decreases to 0, a point mass - at `mu`. `sd < 0` is an error and returns `NaN`. - - #### Source - - For `pnorm`, based on - - Cody, W. D. (1993) Algorithm 715: SPECFUN – A portable FORTRAN package - of special function routines and test drivers. *ACM Transactions on - Mathematical Software* **19**, 22–32. - - For `qnorm`, the code is based on a C translation of - - Wichura, M. J. (1988) Algorithm AS 241: The percentage points of the - normal distribution. *Applied Statistics*, **37**, 477–484; - [doi:10.2307/2347330](https://doi.org/10.2307/2347330). - - which provides precise results up to about 16 digits for `log.p=FALSE`. - For log scale probabilities in the extreme tails, since - **R** version 4.1.0, extensively since 4.3.0, - asymptotic expansions are used which have been derived and explored in - - Maechler, M. (2022) Asymptotic tail formulas for gaussian quantiles; - [DPQ](https://CRAN.R-project.org/package=DPQ) - vignette - . - - For `rnorm`, see RNG for how to select the algorithm and for references - to the supplied methods. - - #### References - - Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) *The New S - Language*. Wadsworth & Brooks/Cole. - - Johnson, N. L., Kotz, S. and Balakrishnan, N. (1995) *Continuous - Univariate Distributions*, volume 1, chapter 13. Wiley, New York. - - #### See Also - - Distributions for other standard distributions, including `dlnorm` for - the *Log*normal distribution. - - #### Examples - - ``` R - require(graphics) - - dnorm(0) == 1/sqrt(2*pi) - dnorm(1) == exp(-1/2)/sqrt(2*pi) - dnorm(1) == 1/sqrt(2*pi*exp(1)) - - ## Using "log = TRUE" for an extended range : - par(mfrow = c(2,1)) - plot(function(x) dnorm(x, log = TRUE), -60, 50, - main = "log { Normal density }") - curve(log(dnorm(x)), add = TRUE, col = "red", lwd = 2) - mtext("dnorm(x, log=TRUE)", adj = 0) - mtext("log(dnorm(x))", col = "red", adj = 1) - - plot(function(x) pnorm(x, log.p = TRUE), -50, 10, - main = "log { Normal Cumulative }") - curve(log(pnorm(x)), add = TRUE, col = "red", lwd = 2) - mtext("pnorm(x, log=TRUE)", adj = 0) - mtext("log(pnorm(x))", col = "red", adj = 1) - - ## if you want the so-called 'error function' - erf <- function(x) 2 * pnorm(x * sqrt(2)) - 1 - ## (see Abramowitz and Stegun 29.2.29) - ## and the so-called 'complementary error function' - erfc <- function(x) 2 * pnorm(x * sqrt(2), lower = FALSE) - ## and the inverses - erfinv <- function (x) qnorm((1 + x)/2)/sqrt(2) - erfcinv <- function (x) qnorm(x/2, lower = FALSE)/sqrt(2) - ``` - -# btw_tool_docs_help_page() with multiple help topics - - Code - btw_tool_docs_help_page("filter") - Condition - Error in `btw_tool_docs_help_page()`: - ! Topic "filter" matched 2 different topics. - i Choose one or submit individual tool calls for each topic. - * {"topic":"filter", "package_name":"stats"} - * {"topic":"filter", "package_name":"dplyr"} - From dd1780ebce05fcfda568ec90bc3ec6afd3886f7f Mon Sep 17 00:00:00 2001 From: simonpcouch Date: Tue, 29 Apr 2025 10:45:18 -0500 Subject: [PATCH 3/5] remove outdated TODO --- R/tool-query.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/tool-query.R b/R/tool-query.R index 89675a54..80e1a175 100644 --- a/R/tool-query.R +++ b/R/tool-query.R @@ -1,7 +1,6 @@ #' Perform a SQL query on the data, and return the results as JSON. #' #' @param query A DuckDB SQL query; must be a SELECT statement. -# TODO: should any `get`table data frame work here? #' @param data_frame The name of the data frame. #' @return The results of the query as a JSON string. btw_tool_env_query_data_frame <- function(query, data_frame) { From d352184bf888bfcbdb0d79d987cd4edeeb7fe363 Mon Sep 17 00:00:00 2001 From: simonpcouch Date: Tue, 29 Apr 2025 10:45:51 -0500 Subject: [PATCH 4/5] add duckdb to Imports --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 3f96b415..463ca866 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,6 +24,7 @@ Imports: clipr, DBI, dplyr, + duckdb, ellmer (>= 0.1.1.9000), fs, jsonlite, From 16c166c81d1cfdbc1515b105b95c06add2eff320 Mon Sep 17 00:00:00 2001 From: simonpcouch Date: Tue, 29 Apr 2025 13:46:17 -0500 Subject: [PATCH 5/5] `btw_env` -> `.globals` [no ci] --- R/tool-query.R | 6 +++--- R/utils.R | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/tool-query.R b/R/tool-query.R index 80e1a175..61b583f4 100644 --- a/R/tool-query.R +++ b/R/tool-query.R @@ -35,13 +35,13 @@ btw_tool_env_query_data_frame <- function(query, data_frame) { btw_connect <- function() { # TODO: also check if the connection is active - if (is.null(btw_env$conn)) { - btw_env$conn <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") + if (is.null(.globals$conn)) { + .globals$conn <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") } } btw_connection <- function() { btw_connect() - btw_env$conn + .globals$conn } diff --git a/R/utils.R b/R/utils.R index 7c5b98df..cf47706a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,4 +1,4 @@ -btw_env <- new_environment() +.globals <- new_environment() pandoc_convert <- function(path, ..., from = "html", to = "markdown") { tmp_file <- withr::local_tempfile()