Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support ritsuko's custom variable length string arrays in HDF5. #18

Merged
merged 3 commits into from
Mar 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/specifications/build.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
library(knitr)
dir.create("compiled", showWarnings=FALSE)

for (v in c("1.0", "1.1", "1.2", "1.3")) {
for (v in c("1.0", "1.1", "1.2", "1.3", "1.4")) {
.version <- package_version(v)
knitr::knit("hdf5.Rmd", output=file.path("compiled", paste0("hdf5-", v, ".md")))
}
Expand Down
33 changes: 33 additions & 0 deletions docs/specifications/hdf5.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,39 @@ This should be interpreted as a boolean where a non-zero value specifies that we
}
```

```{r, results="asis", echo=FALSE}
if (.version >= package_version("1.4")) {
cat('### Variable length string arrays

Arrays of strings can be stored in [**ritsuko**\'s custom variable length string (VLS) array](https://github.com/ArtifactDB/ritsuko).
This is represented as a HDF5 group (`**/`) with the following attributes:

- `uzuki_object`, a scalar string dataset containing the value `"vector"`.
This should use a datatype that can be represented by a UTF-8 encoded string.
- `uzuki_type`, a scalar string dataset containing `"vls"`.

This group should contain the `**/data` and `**/heap` datasets.

- The `**/data` dataset should be a 1-dimensional or scalar dataset of a compound datatype of 2 members, `"offset"` and `"length"`.
Each member should be of a datatype that can be represented by an unsigned 64-bit integer.
If the dataset is scalar, the length of the VLS array is defined as 1.
- The `**/heap` dataset should be a 1-dimensional dataset of unsigned 8-bit integers.

Each entry of `**/data` refers to a slice `[offset, offset + length)` of the `**/heap` dataset.
This slice defines a variable-length UTF-8 encoded string of `length` bytes, unless the slice contains a null terminator, in which case the string ends immediately before the first null byte.
Pointers may be in any order, overlapping or non-contiguous, as long as `[offset, offset + length)` lies within the boundaries of the heap.

A `missing-value-placeholder` attribute on the `**/data` dataset may be present, defining a placeholder for missing values.
The attribute should be a scalar and should be of any HDF5 string datatype that can be represented by a UTF-8 encoded string.
An entry of `**/data` should be considered as missing if its corresponding string is equal to the placeholder.

The group may also contain `**/names`, a 1-dimensional string dataset of length equal to `**/data`.
This should use a datatype that can be represented by a UTF-8 encoded string.
If `**/data` is a scalar, `**/names` should have length 1.
')
}
```

### Nothing

A "nothing" (a.k.a., "null", "none") value is represented as a HDF5 group with the following attributes:
Expand Down
43 changes: 43 additions & 0 deletions include/uzuki2/parse_hdf5.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "ritsuko/ritsuko.hpp"
#include "ritsuko/hdf5/hdf5.hpp"
#include "ritsuko/hdf5/vls/vls.hpp"

/**
* @file parse_hdf5.hpp
Expand Down Expand Up @@ -286,6 +287,48 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
present.insert(std::move(x));
}

} else if (vector_type == "vls" && !version.lt(1, 4)) {
ritsuko::hdf5::vls::validate_pointer_datatype(dhandle.getCompType(), 64, 64);
auto hhandle = ritsuko::hdf5::vls::open_heap(handle, "heap");
auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(dhandle, "missing-value-placeholder");

auto ptr = Provisioner::new_String(len, named, is_scalar, StringVector::NONE);
output.reset(ptr);

if (is_scalar) {
ritsuko::hdf5::vls::Pointer<uint64_t, uint64_t> vlsptr;
dhandle.read(&vlsptr, ritsuko::hdf5::vls::define_pointer_datatype<uint64_t, uint64_t>());

hsize_t len = vlsptr.length;
H5::DataSpace mspace(1, &len);
hsize_t offset = vlsptr.offset;
hsize_t hlen = ritsuko::hdf5::get_1d_length(hhandle, false);
H5::DataSpace dspace(1, &hlen);
dspace.selectHyperslab(H5S_SELECT_SET, &len, &offset);

std::vector<uint8_t> buffer(vlsptr.length);
hhandle.read(buffer.data(), H5::PredType::NATIVE_UINT8, mspace, dspace);
auto cptr = reinterpret_cast<const char*>(buffer.data());
std::string str(cptr, cptr + ritsuko::hdf5::find_string_length(cptr, vlsptr.length));

if (missingness.has_value() && str == *missingness) {
ptr->set_missing(0);
} else {
ptr->set(0, std::move(str));
}

} else {
ritsuko::hdf5::vls::Stream1dArray<uint64_t, uint64_t> stream(&dhandle, &hhandle, len, buffer_size);
for (hsize_t i = 0; i < len; ++i, stream.next()) {
auto x = stream.steal();
if (missingness.has_value() && x == *missingness) {
ptr->set_missing(i);
} else {
ptr->set(i, std::move(x));
}
}
}

} else if (vector_type == "string" || (version.equals(1, 0) && (vector_type == "date" || vector_type == "date-time"))) {
StringVector::Format format = StringVector::NONE;
if (version.equals(1, 0)) {
Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ add_executable(
src/factor.cpp
src/number.cpp
src/string.cpp
src/vls.cpp
src/date.cpp
src/datetime.cpp
src/external.cpp
Expand Down
2 changes: 2 additions & 0 deletions tests/src/string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,3 +235,5 @@ TEST(JsonStringTest, CheckError) {
*** See integer.cpp for vector error tests. ***
***********************************************/
}


10 changes: 1 addition & 9 deletions tests/src/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,7 @@ H5::DataSet create_dataset(const H5::Group& parent, const std::string& name, con
}

auto dhandle = parent.createDataSet(name, dtype, dspace, cplist);

if constexpr(std::is_same<T, int>::value) {
dhandle.write(values.data(), H5::PredType::NATIVE_INT);
} else if constexpr(std::is_same<T, double>::value) {
dhandle.write(values.data(), H5::PredType::NATIVE_DOUBLE);
} else {
throw std::runtime_error("unknown type!");
}

dhandle.write(values.data(), ritsuko::hdf5::as_numeric_datatype<T>());
return dhandle;
}

Expand Down
191 changes: 191 additions & 0 deletions tests/src/vls.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#include <gtest/gtest.h>
#include <gmock/gmock.h>

#include "uzuki2/parse_hdf5.hpp"

#include "utils.h"

// Writes a ten-string VLS array to disk, then checks that it is parsed back
// correctly, that a string missing-value placeholder is honoured, and that a
// placeholder attribute of a non-string datatype is rejected.
TEST(Hdf5VlsTest, Basic) {
    const char* path = "TEST-vls.h5";
    std::string heap = "abcdefghijklmno";
    size_t nlen = 10;

    {
        // Pointers are packed back-to-back along the heap, alternating
        // between lengths of 1 and 2 for some interesting differences.
        std::vector<ritsuko::hdf5::vls::Pointer<uint64_t, uint64_t> > pointers(nlen);
        size_t position = 0;
        size_t index = 0;
        for (auto& current : pointers) {
            const size_t width = (index % 2) + 1;
            current.offset = position;
            current.length = width;
            position += width;
            ++index;
        }

        H5::H5File fhandle(path, H5F_ACC_TRUNC);
        auto ghandle = vector_opener(fhandle, "blub", "vls");
        add_version(ghandle, "1.4");

        // The heap is written first, followed by the pointers into it.
        auto heap_handle = create_dataset(ghandle, "heap", heap.size(), H5::PredType::NATIVE_UINT8);
        auto heap_bytes = reinterpret_cast<const unsigned char*>(heap.c_str());
        heap_handle.write(heap_bytes, H5::PredType::NATIVE_UCHAR);

        auto ptype = ritsuko::hdf5::vls::define_pointer_datatype<uint64_t, uint64_t>();
        auto pointer_handle = create_dataset(ghandle, "data", pointers.size(), ptype);
        pointer_handle.write(pointers.data(), ptype);
    }

    // Check that it works correctly.
    {
        auto loaded = load_hdf5(path, "blub");
        EXPECT_EQ(loaded->type(), uzuki2::STRING);

        auto strings = static_cast<const DefaultStringVector*>(loaded.get());
        EXPECT_EQ(strings->size(), nlen);

        std::vector<std::string> expected { "a", "bc", "d", "ef", "g", "hi", "j", "kl", "m", "no" };
        EXPECT_EQ(strings->base.values, expected);
    }

    // Adding a missing value placeholder; "hi" matches the string at index 5.
    {
        {
            H5::H5File fhandle(path, H5F_ACC_RDWR);
            auto dhandle = fhandle.openDataSet("blub/data");
            H5::StrType stype(0, H5T_VARIABLE);
            auto ahandle = dhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR);
            ahandle.write(stype, std::string("hi"));
        }

        auto loaded = load_hdf5(path, "blub");
        EXPECT_EQ(loaded->type(), uzuki2::STRING);
        auto strings = static_cast<const DefaultStringVector*>(loaded.get());
        EXPECT_EQ(strings->base.values[5], "ich bin missing"); // the test's missing placeholder.

        // Adding the wrong missing value placeholder, i.e., an integer
        // attribute in place of the string one.
        {
            H5::H5File fhandle(path, H5F_ACC_RDWR);
            auto dhandle = fhandle.openDataSet("blub/data");
            dhandle.removeAttr("missing-value-placeholder");
            dhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT, H5S_SCALAR);
        }
        expect_hdf5_error(path, "blub", "string datatype");

        // Removing for the next checks.
        {
            H5::H5File fhandle(path, H5F_ACC_RDWR);
            auto dhandle = fhandle.openDataSet("blub/data");
            dhandle.removeAttr("missing-value-placeholder");
        }
    }
}

// Exercises the rejection paths for VLS arrays: pointers that fall outside
// the heap, pointer members of an unsupported integer type, and files that
// do not declare a version supporting the "vls" vector type.
TEST(Hdf5VlsTest, Failures) {
    auto path = "TEST-vls.h5";
    size_t nlen = 10;

    // Shortening the heap to check that we perform bounds checks on the pointers.
    {
        H5::H5File handle(path, H5F_ACC_TRUNC);
        auto ghandle = vector_opener(handle, "blub", "vls");
        add_version(ghandle, "1.4");

        // A zero-length heap, so every pointer below refers past its end.
        hsize_t zero = 0;
        H5::DataSpace hspace(1, &zero);
        ghandle.createDataSet("heap", H5::PredType::NATIVE_UINT8, hspace);

        std::vector<ritsuko::hdf5::vls::Pointer<uint64_t, uint64_t> > pointers(nlen);
        for (size_t i = 0; i < nlen; ++i) {
            pointers[i].offset = i;
            pointers[i].length = 1;
        }
        auto ptype = ritsuko::hdf5::vls::define_pointer_datatype<uint64_t, uint64_t>();
        auto phandle = create_dataset(ghandle, "data", pointers.size(), ptype);
        phandle.write(pointers.data(), ptype);
    }
    expect_hdf5_error(path, "blub", "out of range");

    // Checking that we check for 64-bit unsigned integer types.
    {
        H5::H5File handle(path, H5F_ACC_RDWR);
        auto ghandle = handle.openGroup("blub");
        ghandle.unlink("data");

        // Replace the pointer dataset with one whose members are signed ints.
        std::vector<ritsuko::hdf5::vls::Pointer<int, int> > pointers(3);
        for (auto& p : pointers) {
            p.offset = 0;
            p.length = 0;
        }
        hsize_t plen = pointers.size();
        H5::DataSpace pspace(1, &plen);
        auto ptype = ritsuko::hdf5::vls::define_pointer_datatype<int, int>();
        auto phandle = ghandle.createDataSet("data", ptype, pspace);
        phandle.write(pointers.data(), ptype);
    }
    expect_hdf5_error(path, "blub", "64-bit unsigned integer");

    // Checking that this only works in the latest version: once the version
    // attribute is removed, "vls" should be reported as an unknown vector type.
    {
        H5::H5File handle(path, H5F_ACC_RDWR);
        auto vhandle = handle.openGroup("blub");
        vhandle.removeAttr("uzuki_version");
    }
    expect_hdf5_error(path, "blub", "unknown vector type");
}

// Checks parsing of a scalar VLS dataset: a plain ten-byte string, a heap of
// null bytes (early termination), and an empty-string missing placeholder.
TEST(Hdf5VlsTest, Scalar) {
    auto path = "TEST-vls.h5";
    std::string heap = "abcdefghijklmno";

    {
        H5::H5File handle(path, H5F_ACC_TRUNC);
        auto ghandle = vector_opener(handle, "blub", "vls");
        add_version(ghandle, "1.4");

        auto hhandle = create_dataset(ghandle, "heap", heap.size(), H5::PredType::NATIVE_UINT8);
        const unsigned char* hptr = reinterpret_cast<const unsigned char*>(heap.c_str());
        hhandle.write(hptr, H5::PredType::NATIVE_UCHAR);

        // The pointer members are stored as 8-bit unsigned integers here;
        // the load below is expected to succeed with this narrower type.
        ritsuko::hdf5::vls::Pointer<uint8_t, uint8_t> ptr;
        ptr.offset = 0;
        ptr.length = 10;
        auto ptype = ritsuko::hdf5::vls::define_pointer_datatype<uint8_t, uint8_t>();
        auto phandle = ghandle.createDataSet("data", ptype, H5S_SCALAR);
        phandle.write(&ptr, ptype);
    }
    {
        auto parsed = load_hdf5(path, "blub");
        EXPECT_EQ(parsed->type(), uzuki2::STRING);
        auto sptr = static_cast<const DefaultStringVector*>(parsed.get());
        EXPECT_EQ(sptr->size(), 1);
        EXPECT_EQ(sptr->base.values.front(), "abcdefghij");
    }

    // Checking that it works correctly with early termination: the heap is
    // overwritten with zeros, so the slice starts with a null terminator.
    {
        H5::H5File handle(path, H5F_ACC_RDWR);
        auto ghandle = handle.openGroup("blub");
        auto hhandle = ghandle.openDataSet("heap");
        std::vector<uint8_t> replacement(heap.size());
        hhandle.write(replacement.data(), H5::PredType::NATIVE_UINT8);
    }
    {
        auto parsed = load_hdf5(path, "blub");
        EXPECT_EQ(parsed->type(), uzuki2::STRING);
        auto sptr = static_cast<const DefaultStringVector*>(parsed.get());
        EXPECT_EQ(sptr->size(), 1);
        EXPECT_EQ(sptr->base.values.front(), "");
    }

    // Checking that scalar works correctly with missing values.
    {
        H5::H5File handle(path, H5F_ACC_RDWR);
        auto ghandle = handle.openGroup("blub");
        auto dhandle = ghandle.openDataSet("data");

        const size_t placeholder_width = 10;
        H5::StrType stype(0, placeholder_width);
        auto ahandle = dhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR);

        // The fixed-length write path reads the full width of the datatype
        // from the string's buffer, so we supply a zero-filled buffer of that
        // width rather than an empty string with a single valid byte. This
        // is still an empty string once null-terminated.
        ahandle.write(stype, std::string(placeholder_width, '\0'));
    }
    {
        auto parsed = load_hdf5(path, "blub");
        EXPECT_EQ(parsed->type(), uzuki2::STRING);
        auto sptr = static_cast<const DefaultStringVector*>(parsed.get());
        EXPECT_EQ(sptr->size(), 1);
        EXPECT_EQ(sptr->base.values.front(), "ich bin missing");
    }
}