From 85fc1dab0ad0f1aacbb516e50f379701d7b929cc Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 6 Nov 2024 14:38:26 -0500 Subject: [PATCH 1/7] [python/r] Enforce dataframe domain lower bound == 0 --- apis/python/src/tiledbsoma/_dataframe.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index a7fb3aba9a..95e8af6669 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -295,6 +295,16 @@ def create( slot_core_max_domain, extent, saturated_md ) + if index_column_name == "soma_joinid": + if slot_core_current_domain[0] != 0: + raise ValueError( + f"domain for soma_joinid must have lower bound of 0; got {slot_core_current_domain[0]}" + ) + if slot_core_max_domain[0] != 0: + raise ValueError( + f"maxdomain for soma_joinid must have lower bound of 0; got {slot_core_max_domain[0]}" + ) + # Here is our Arrow data API for communicating schema info between # Python/R and C++ libtiledbsoma: # From 11b7882cc2e1c62f8c86026eb05688b657ce706d Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 6 Nov 2024 14:43:58 -0500 Subject: [PATCH 2/7] python unit-test cases --- apis/python/tests/test_dataframe.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index fe4f445afe..f2fca11033 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -1864,3 +1864,31 @@ def test_presence_matrix(tmp_path): actual = soma_df.read().concat().to_pandas() assert actual.equals(df) + + +def test_lower_bound_on_somajoinid_domain(tmp_path): + uri = tmp_path.as_posix() + + schema = pa.schema( + [ + ("soma_joinid", pa.int64()), + ("mystring", pa.string()), + ("myint", pa.int32()), + ("myfloat", pa.float32()), + ] + ) + + with pytest.raises(ValueError): + soma.DataFrame.create( + uri, + schema=schema, + domain=[[2, 99]], + ) + + soma.DataFrame.create( + uri, + schema=schema, + domain=[[0, 99]], + ) + + assert soma.DataFrame.exists(uri) From b454d323386ac86207f9bdc3bcb4d99054e40718 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 6 Nov 2024 14:54:48 -0500 Subject: [PATCH 3/7] R-side assertion --- apis/r/R/SOMADataFrame.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/apis/r/R/SOMADataFrame.R b/apis/r/R/SOMADataFrame.R index f649f2b063..e30e5fe741 100644 --- a/apis/r/R/SOMADataFrame.R +++ b/apis/r/R/SOMADataFrame.R @@ -65,6 +65,11 @@ SOMADataFrame <- R6::R6Class( length(attr_column_names) > 0 ) + if ("soma_joinid" %in% index_column_names && !is.null(domain)) { + lower <- domain[["soma_joinid"]][1] + stopifnot("The lower bound for soma_joinid domain must be 0" = lower == 0) + } + # Parse the tiledb/create/ subkeys of the platform_config into a handy, # typed, queryable data structure. tiledb_create_options <- TileDBCreateOptions$new(platform_config) From baf6ad1e949ee54dc54e15800ff752c7a6956cd7 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 6 Nov 2024 15:04:14 -0500 Subject: [PATCH 4/7] R-side unit tests --- apis/r/tests/testthat/test-SOMADataFrame.R | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/apis/r/tests/testthat/test-SOMADataFrame.R b/apis/r/tests/testthat/test-SOMADataFrame.R index 9888be5def..168b9f5233 100644 --- a/apis/r/tests/testthat/test-SOMADataFrame.R +++ b/apis/r/tests/testthat/test-SOMADataFrame.R @@ -177,6 +177,41 @@ test_that("Basic mechanics with default index_column_names", { gc() }) +test_that("soma_joinid domain lower bound must be zero", { + uri <- withr::local_tempdir("soma-dataframe-soma-joinid-domain-lower-bound") + + index_column_names <- c("soma_joinid") + + asch <- arrow::schema( + arrow::field("soma_joinid", arrow::int64(), nullable = FALSE), + arrow::field("mystring", arrow::large_utf8(), nullable = FALSE), + arrow::field("myint", arrow::int32(), nullable = FALSE), + arrow::field("myfloat", arrow::float32(), nullable = FALSE) + ) + + expect_error( + SOMADataFrameCreate( + uri, + asch, + index_column_names = index_column_names, + domain = list(soma_joinid=c(2, 99)) + ) + ) + + expect_no_condition( + SOMADataFrameCreate( + uri, + asch, + index_column_names = index_column_names, + domain = list(soma_joinid=c(0, 99)) + ) + ) + + sdf <- SOMADataFrameOpen(uri) + expect_true(sdf$exists()) + sdf$close() +}) + test_that("creation with all supported dimension data types", { skip_if(!extended_tests()) sch <- arrow::schema( From da5bb4956d5013abc8f1c9bc71b2ba0d52a82ffc Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 6 Nov 2024 20:31:42 +0000 Subject: [PATCH 5/7] update a few existing unit-test cases --- apis/r/tests/testthat/test-SOMADataFrame.R | 6 +++--- apis/r/tests/testthat/test-shape.R | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/apis/r/tests/testthat/test-SOMADataFrame.R b/apis/r/tests/testthat/test-SOMADataFrame.R index 168b9f5233..fbba8e6345 100644 --- a/apis/r/tests/testthat/test-SOMADataFrame.R +++ b/apis/r/tests/testthat/test-SOMADataFrame.R @@ -640,9 +640,9 @@ test_that("SOMADataFrame timestamped ops", { arrow::field("valdbl", arrow::float64(), nullable = FALSE) ) if (dir.exists(uri)) unlink(uri, recursive = TRUE) - sdf <- SOMADataFrameCreate(uri = uri, schema = sch, domain = list(soma_joinid = c(1, 100))) + sdf <- SOMADataFrameCreate(uri = uri, schema = sch, domain = list(soma_joinid = c(0, 99))) rb1 <- arrow::record_batch( - soma_joinid = bit64::as.integer64(1L:3L), + soma_joinid = bit64::as.integer64(0L:2L), valint = 1L:3L, valdbl = 100 * (1:3), schema = sch @@ -660,7 +660,7 @@ test_that("SOMADataFrame timestamped ops", { t20 <- Sys.time() sdf <- SOMADataFrameOpen(uri = uri, mode = "WRITE") rb2 <- arrow::record_batch( - soma_joinid = bit64::as.integer64(4L:6L), + soma_joinid = bit64::as.integer64(3L:5L), valint = 4L:6L, valdbl = 100 * (4:6), schema = sch diff --git a/apis/r/tests/testthat/test-shape.R b/apis/r/tests/testthat/test-shape.R index 0f142a0f7b..3abae60ed0 100644 --- a/apis/r/tests/testthat/test-shape.R +++ b/apis/r/tests/testthat/test-shape.R @@ -327,14 +327,14 @@ test_that("SOMADataFrame domain mods", { index_column_names <- c("soma_joinid", "mystring", "myint", "myfloat") domain_for_create <- list( - soma_joinid = c(1, 4), + soma_joinid = c(0, 3), mystring = NULL, myint = c(20, 50), myfloat = c(0.0, 6.0) ) table <- arrow::arrow_table( - soma_joinid = 1L:4L, + soma_joinid = 0L:3L, mystring = c("a", "b", "a", "b"), myint = c(20, 30, 40, 50), myfloat = c(1.0, 2.5, 4.0, 5.5), @@ -355,7 +355,7 @@ test_that("SOMADataFrame domain mods", { # Check "expand" to same new_domain <- list( - soma_joinid = c(1, 4), + soma_joinid = c(0, 3), mystring = NULL, myint = c(20, 50), myfloat = c(0.0, 6.0) @@ -364,7 +364,7 @@ test_that("SOMADataFrame domain mods", { # Shrink new_domain <- list( - soma_joinid = c(1, 3), + soma_joinid = c(0, 2), mystring = NULL, myint = c(20, 50), myfloat = c(0.0, 6.0) @@ -372,7 +372,7 @@ test_that("SOMADataFrame domain mods", { expect_error(sdf$change_domain(new_domain)) new_domain <- list( - soma_joinid = c(1, 4), + soma_joinid = c(0, 3), mystring = NULL, myint = c(20, 40), myfloat = c(0.0, 6.0) @@ -380,7 +380,7 @@ test_that("SOMADataFrame domain mods", { expect_error(sdf$change_domain(new_domain)) new_domain <- list( - soma_joinid = c(1, 4), + soma_joinid = c(0, 3), mystring = NULL, myint = c(20, 50), myfloat = c(2.0, 6.0) @@ -389,7 +389,7 @@ test_that("SOMADataFrame domain mods", { # String domain cannot be specified new_domain <- list( - soma_joinid = c(1, 4), + soma_joinid = c(0, 3), mystring = c("a", "z"), myint = c(20, 50), myfloat = c(0.0, 6.0) @@ -400,7 +400,7 @@ test_that("SOMADataFrame domain mods", { # String domain cannot be specified new_domain <- list( - soma_joinid = c(1, 9), + soma_joinid = c(0, 8), mystring = c("", ""), myint = c(20, 50), myfloat = c(0.0, 10.0) From 46b53d992bb327fb3f50ccd24e4987eccac3ceb3 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 6 Nov 2024 18:05:51 -0500 Subject: [PATCH 6/7] R-side code-review feedback [skip ci] --- apis/r/R/SOMADataFrame.R | 6 +++++- apis/r/tests/testthat/test-SOMADataFrame.R | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/apis/r/R/SOMADataFrame.R b/apis/r/R/SOMADataFrame.R index e30e5fe741..a7b24d702a 100644 --- a/apis/r/R/SOMADataFrame.R +++ b/apis/r/R/SOMADataFrame.R @@ -67,7 +67,11 @@ SOMADataFrame <- R6::R6Class( if ("soma_joinid" %in% index_column_names && !is.null(domain)) { lower <- domain[["soma_joinid"]][1] - stopifnot("The lower bound for soma_joinid domain must be 0" = lower == 0) + upper <- domain[["soma_joinid"]][2] + stopifnot( + "The lower bound for soma_joinid domain must be 0" = lower == 0, + "The upper bound for soma_joinid domain must be >= 0" = upper >= 0 + ) } # Parse the tiledb/create/ subkeys of the platform_config into a handy, diff --git a/apis/r/tests/testthat/test-SOMADataFrame.R b/apis/r/tests/testthat/test-SOMADataFrame.R index fbba8e6345..e9aa748b9e 100644 --- a/apis/r/tests/testthat/test-SOMADataFrame.R +++ b/apis/r/tests/testthat/test-SOMADataFrame.R @@ -198,6 +198,15 @@ test_that("soma_joinid domain lower bound must be zero", { ) ) + expect_error( + SOMADataFrameCreate( + uri, + asch, + index_column_names = index_column_names, + domain = list(soma_joinid=c(0, -1)) + ) + ) + expect_no_condition( SOMADataFrameCreate( uri, From b1f55f348e9b8062cc3e984b4d73520ae76b8110 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 6 Nov 2024 18:08:25 -0500 Subject: [PATCH 7/7] Python-side code-review feedback --- apis/python/src/tiledbsoma/_dataframe.py | 10 ++++++---- apis/python/tests/test_dataframe.py | 9 ++++++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 95e8af6669..c91c43dca1 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -296,13 +296,15 @@ def create( ) if index_column_name == "soma_joinid": - if slot_core_current_domain[0] != 0: + lower = slot_core_current_domain[0] + upper = slot_core_current_domain[1] + if lower != 0: raise ValueError( - f"domain for soma_joinid must have lower bound of 0; got {slot_core_current_domain[0]}" + f"domain for soma_joinid must have lower bound of 0; got {lower}" ) - if slot_core_max_domain[0] != 0: + if upper < 0: raise ValueError( - f"maxdomain for soma_joinid must have lower bound of 0; got {slot_core_max_domain[0]}" + f"domain for soma_joinid must have upper bound >= 0; got {upper}" ) # Here is our Arrow data API for communicating schema info between diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index f2fca11033..b1f5103323 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -1866,7 +1866,7 @@ def test_presence_matrix(tmp_path): assert actual.equals(df) -def test_lower_bound_on_somajoinid_domain(tmp_path): +def test_bounds_on_somajoinid_domain(tmp_path): uri = tmp_path.as_posix() schema = pa.schema( @@ -1885,6 +1885,13 @@ def test_lower_bound_on_somajoinid_domain(tmp_path): domain=[[2, 99]], ) + with pytest.raises(ValueError): + soma.DataFrame.create( + uri, + schema=schema, + domain=[[0, -1]], + ) + soma.DataFrame.create( uri, schema=schema,