Skip to content

Commit

Permalink
[c++] Handle add/drop/re-add of enumerated types (#3567)
Browse files Browse the repository at this point in the history
* [c++] Handle add/drop/re-add of enumerated types

* add unit-test case

* fix CI

* fix CI
  • Loading branch information
johnkerl authored Jan 16, 2025
1 parent 8a2b9d4 commit 270b222
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 19 deletions.
35 changes: 35 additions & 0 deletions apis/python/tests/test_update_dataframes.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,3 +307,38 @@ def test_update_non_null_to_null(tmp_path, conftest_pbmc3k_adata, separate_inges
conftest_pbmc3k_adata.obs["myfloat"] = np.nan
# We need nan_safe since pd.NA != pd.NA
verify_updates(uri, conftest_pbmc3k_adata.obs, conftest_pbmc3k_adata.var)


def test_enmr_add_drop_readd(tmp_path, conftest_pbmc3k_adata):
uri = tmp_path.as_posix()

# Add
tiledbsoma.io.from_anndata(uri, conftest_pbmc3k_adata, measurement_name="RNA")

with tiledbsoma.Experiment.open(uri, "r") as exp:
schema = exp.obs.schema
assert "louvain" in schema.names
field = schema.field("louvain")
assert pa.types.is_dictionary(field.type)

# Drop
with tiledbsoma.Experiment.open(uri, "r") as exp:
obs = exp.obs.read().concat().to_pandas()
obs.drop(columns=["louvain"], inplace=True)

with tiledbsoma.Experiment.open(uri, "w") as exp:
tiledbsoma.io.update_obs(exp, obs)

with tiledbsoma.Experiment.open(uri, "r") as exp:
schema = exp.obs.schema
assert "louvain" not in schema.names

# Re-add
with tiledbsoma.Experiment.open(uri, "w") as exp:
# Most importantly, we're implicitly checking for no throw here.
tiledbsoma.io.update_obs(exp, conftest_pbmc3k_adata.obs)

with tiledbsoma.Experiment.open(uri, "r") as exp:
schema = exp.obs.schema
assert "louvain" in schema.names
assert pa.types.is_dictionary(field.type)
101 changes: 82 additions & 19 deletions libtiledbsoma/src/soma/soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
*/

#include "soma_dataframe.h"
#include <format>
#include "../utils/logger.h"

namespace tiledbsoma {
Expand Down Expand Up @@ -107,7 +108,9 @@ void SOMADataFrame::update_dataframe_schema(
std::vector<std::string> drop_attrs,
std::map<std::string, std::string> add_attrs,
std::map<std::string, std::pair<std::string, bool>> add_enmrs) {
ArraySchemaEvolution se(*ctx->tiledb_ctx());
const auto& tctx = *ctx->tiledb_ctx();
tiledb::Array array(tctx, uri, TILEDB_READ);
ArraySchemaEvolution se(tctx);
for (auto key_name : drop_attrs) {
LOG_DEBUG(std::format(
"[SOMADataFrame::update_dataframe_schema] drop col name {}",
Expand All @@ -118,16 +121,14 @@ void SOMADataFrame::update_dataframe_schema(
auto [attr_name, attr_type] = add_attr;

Attribute attr(
*ctx->tiledb_ctx(),
attr_name,
ArrowAdapter::to_tiledb_format(attr_type));
tctx, attr_name, ArrowAdapter::to_tiledb_format(attr_type));

if (ArrowAdapter::arrow_is_var_length_type(attr_type.c_str())) {
attr.set_cell_val_num(TILEDB_VAR_NUM);
}

FilterList filter_list(*ctx->tiledb_ctx());
filter_list.add_filter(Filter(*ctx->tiledb_ctx(), TILEDB_FILTER_ZSTD));
FilterList filter_list(tctx);
filter_list.add_filter(Filter(tctx, TILEDB_FILTER_ZSTD));
attr.set_filter_list(filter_list);

// An update can create (or drop) columns, or mutate existing
Expand Down Expand Up @@ -168,9 +169,10 @@ void SOMADataFrame::update_dataframe_schema(
// o ordered: bool

auto enmr_it = add_enmrs.find(attr_name);
bool has_enmr = enmr_it != add_enmrs.end();
bool request_has_enmr = enmr_it != add_enmrs.end();
bool array_already_has_it = false;

if (has_enmr) {
if (request_has_enmr) {
auto [enmr_type, ordered] = enmr_it->second;
LOG_DEBUG(std::format(
"[SOMADataFrame::update_dataframe_schema] add col name {} "
Expand All @@ -180,17 +182,78 @@ void SOMADataFrame::update_dataframe_schema(
attr_type,
enmr_type,
ordered));
se.add_enumeration(Enumeration::create_empty(
*ctx->tiledb_ctx(),
attr_name,
ArrowAdapter::to_tiledb_format(enmr_type),
enmr_type == "u" || enmr_type == "z" || enmr_type == "U" ||
enmr_type == "Z" ?
TILEDB_VAR_NUM :
1,
ordered));
AttributeExperimental::set_enumeration_name(
*ctx->tiledb_ctx(), attr, attr_name);

// First we need to check if the array has (ever had) an enumeration
// of this type in its history.
//
// Note that in core, attrs have names and enumerations have
// names, and the names need not match -- and multiple attrs can
// use the same enumeration. For tiledbsoma, though, we don't take
// advantage of this flexibility: we use the same name for an attr
// and its enumeration (if if has one).
//
// It's quite possible the array had this name as an enumerated
// column, and then that column was dropped, and it's now being
// re-added.
//
// * We need to check if this is the case.
// * If it is, we need to check if the enumeration is compatible.
try {
const auto existing_enmr = ArrayExperimental::get_enumeration(
tctx, array, attr_name);
auto existing_type = ArrowAdapter::tdb_to_arrow_type(
existing_enmr.type());
auto existing_ordered = existing_enmr.ordered();

// It's there but it'll cause a problem further down in core.
// We cannot continue without intervention, but let's at least
// be as clear as possible why we can't continue.
if (ordered != existing_ordered || enmr_type != existing_type) {
throw TileDBSOMAError(std::format(
"[SOMADataFrame::update_dataframe_schema] requested "
"enumeration [type='{}', ordered={}] incompatible with "
"existing [type='{}', ordered={}]",
enmr_type,
ordered,
existing_type,
existing_ordered));
}
array_already_has_it = true;

} catch (const std::exception& e) {
// Oddly, catch (tiledb::TileDBError& e) did not enter this
// block in unit test.
//
// Make sure the exception was for the reason we're considering.
// If it was for some other reason, fail.
const std::string msg = e.what();
if (msg.find("already exists in this ArraySchema") !=
std::string::npos) {
array_already_has_it = true;
} else if (
msg.find("Unable to check if unknown enumeration is "
"loaded. No enumeration named") !=
std::string::npos) {
array_already_has_it = false;
} else {
throw(e);
}
}

if (!array_already_has_it) {
se.add_enumeration(Enumeration::create_empty(
tctx,
attr_name,
ArrowAdapter::to_tiledb_format(enmr_type),
enmr_type == "u" || enmr_type == "z" || enmr_type == "U" ||
enmr_type == "Z" ?
TILEDB_VAR_NUM :
1,
ordered));
AttributeExperimental::set_enumeration_name(
tctx, attr, attr_name);
}

} else {
LOG_DEBUG(std::format(
"[SOMADataFrame::update_dataframe_schema] add col name {} type "
Expand Down

0 comments on commit 270b222

Please sign in to comment.