Skip to content

Commit

Permalink
Migrated N-dimensional string dataset validator to ritsuko.
Browse files Browse the repository at this point in the history
  • Loading branch information
LTLA committed Jan 17, 2024
1 parent cdb23ab commit 9b70a80
Showing 1 changed file with 1 addition and 86 deletions.
87 changes: 1 addition & 86 deletions include/takane/dense_array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,91 +47,6 @@ inline bool is_transposed(const H5::Group& ghandle) {
return ritsuko::hdf5::load_scalar_numeric_attribute<int32_t>(attr) != 0;
}

/**
 * Scan an N-dimensional string dataset block-by-block and reject NULL pointers.
 *
 * The function is a no-op when the dataset's datatype is not a variable-length
 * string, or when any entry of `data_extent` is zero (nothing to read).
 *
 * @param dhandle Handle to the HDF5 dataset to check.
 * @param data_extent Extent of the dataset along each dimension.
 * @param buffer_size Target number of elements to read per block. The block
 * starts at one chunk (or a single element for non-chunked layouts) and is
 * only grown while it stays within this limit, so a chunk larger than
 * `buffer_size` is still read in one piece.
 *
 * @throws std::runtime_error if any element read from the dataset is a NULL pointer.
 */
inline void validate_string_contents(const H5::DataSet& dhandle, const std::vector<hsize_t>& data_extent, hsize_t buffer_size) {
    auto stype = dhandle.getDataType();
    if (!stype.isVariableStr()) {
        return;
    }

    // An empty dataset has no elements to inspect.
    for (auto ex : data_extent) {
        if (ex == 0) {
            return;
        }
    }

    // Default to a 1x1x...x1 block unless the dataset reports a chunked layout.
    hsize_t ndims = data_extent.size();
    std::vector<hsize_t> chunk_extent(ndims, 1);
    auto cplist = dhandle.getCreatePlist();
    if (cplist.getLayout() == H5D_CHUNKED) {
        cplist.getChunk(chunk_extent.size(), chunk_extent.data());
    }

    // Scaling up the block size as much as possible. We start from the
    // fastest-changing dimension (i.e., the last one in HDF5) and increase it,
    // and then we move onto the next-fastest dimension, and so on until the
    // buffer size is exhausted.
    auto block_extent = chunk_extent;
    hsize_t block_size = 1;
    for (hsize_t d = 0; d < ndims; ++d) {
        block_extent[d] = std::min(block_extent[d], data_extent[d]); // should be a no-op, but we do this just in case.
        block_size *= block_extent[d];
    }

    for (hsize_t i = ndims; i > 0; --i) {
        // Kept as hsize_t: storing the quotient in an int would narrow it for
        // large buffer sizes. As block_extent[d] <= block_size, the product
        // below cannot exceed buffer_size and so cannot overflow.
        hsize_t multiple = buffer_size / block_size;
        if (multiple <= 1) {
            break;
        }
        auto d = i - 1;
        block_size /= block_extent[d];
        block_extent[d] = std::min(data_extent[d], block_extent[d] * multiple);
        block_size *= block_extent[d];
    }

    // Now iterating through the array.
    std::vector<hsize_t> starts(ndims), counts(block_extent.begin(), block_extent.end());
    std::vector<char*> buffer(block_size);
    hsize_t buffer_length = block_size; // number of elements in the current (possibly truncated) block.

    H5::DataSpace mspace, dspace(ndims, data_extent.data());
    bool finished = false;

    while (!finished) {
        buffer.resize(buffer_length);
        dspace.selectHyperslab(H5S_SELECT_SET, counts.data(), starts.data());
        mspace.setExtentSimple(ndims, counts.data());

        // NOTE(review): the cleaner is created before the read, and 'buffer' is
        // not cleared between iterations, so if read() throws the buffer may
        // still hold pointers from the previous block - confirm that the
        // cleaner tolerates this.
        [[maybe_unused]] ritsuko::hdf5::VariableStringCleaner stream(stype.getId(), mspace.getId(), buffer.data());
        dhandle.read(buffer.data(), stype, mspace, dspace);
        for (auto x : buffer) {
            if (x == nullptr) {
                throw std::runtime_error("detected NULL pointer in a variable-length string dataset");
            }
        }

        // Attempting a shift from the last dimension as this is the fastest-changing.
        for (hsize_t i = ndims; i > 0; --i) {
            auto d = i - 1;
            starts[d] += block_extent[d];
            if (starts[d] >= data_extent[d]) {
                if (d == 0) {
                    // Ran off the end of the slowest-changing dimension: all blocks visited.
                    finished = true;
                } else {
                    // Wrap this dimension back to its start, restore its full
                    // block width, and carry over to the next-slower dimension.
                    starts[d] = 0;
                    buffer_length /= counts[d];
                    counts[d] = std::min(data_extent[d], block_extent[d]);
                    buffer_length *= counts[d];
                }
            } else {
                // Successfully advanced along this dimension; truncate the
                // block if it would overhang the end of the dataset.
                buffer_length /= counts[d];
                counts[d] = std::min(data_extent[d] - starts[d], block_extent[d]);
                buffer_length *= counts[d];
                break;
            }
        }
    }
}

}
/**
* @endcond
Expand Down Expand Up @@ -179,7 +94,7 @@ inline void validate(const std::filesystem::path& path, const ObjectMetadata& me
if (!ritsuko::hdf5::is_utf8_string(dhandle)) {
throw std::runtime_error("expected string array to have a datatype that can be represented by a UTF-8 encoded string");
}
internal::validate_string_contents(dhandle, extents, options.hdf5_buffer_size);
ritsuko::hdf5::validate_nd_string_dataset(dhandle, extents, options.hdf5_buffer_size);
} else {
throw std::runtime_error("unknown array type '" + type + "'");
}
Expand Down

0 comments on commit 9b70a80

Please sign in to comment.