Skip to content

Commit 1cdb564

Browse files
authoredMar 2, 2025··
Support ritsuko's custom variable length string arrays for HDF5. (#18)
This patches over HDF5's deficiencies for storing many variable length strings without resorting to an external object.
1 parent 22b7b7b commit 1cdb564

File tree

7 files changed

+272
-10
lines changed

7 files changed

+272
-10
lines changed
 

‎docs/specifications/build.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
library(knitr)
22
dir.create("compiled", showWarnings=FALSE)
33

4-
for (v in c("1.0", "1.1", "1.2", "1.3")) {
4+
for (v in c("1.0", "1.1", "1.2", "1.3", "1.4")) {
55
.version <- package_version(v)
66
knitr::knit("hdf5.Rmd", output=file.path("compiled", paste0("hdf5-", v, ".md")))
77
}

‎docs/specifications/hdf5.Rmd

+33
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,39 @@ This should be interpreted as a boolean where a non-zero value specifies that we
213213
}
214214
```
215215

216+
```{r, results="asis", echo=FALSE}
217+
if (.version >= package_version("1.4")) {
218+
cat('### Variable length string arrays
219+
220+
Arrays of strings can be stored in [**ritsuko**\'s custom variable length string (VLS) array](https://github.com/ArtifactDB/ritsuko).
221+
This is represented as a HDF5 group (`**/`) with the following attributes:
222+
223+
- `uzuki_object`, a scalar string dataset containing the value `"vector"`.
224+
This should use a datatype that can be represented by a UTF-8 encoded string.
225+
- `uzuki_type`, a scalar string dataset containing `"vls"`.
226+
227+
This group should contain the `**/data` and `**/heap` datasets.
228+
229+
- The `**/data` dataset should be a 1-dimensional or scalar dataset of a compound datatype of 2 members, `"offset"` and `"length"`.
230+
Each member should be of a datatype that can be represented by an unsigned 64-bit integer.
231+
If the dataset is scalar, the length of the VLS array is defined as 1.
232+
- The `**/heap` dataset should be a 1-dimensional dataset of unsigned 8-bit integers.
233+
234+
Each entry of `**/data` refers to a slice `[offset, offset + length)` of the `**/heap` dataset.
235+
This slice defines a variable-length UTF-8 encoded string of `length` bytes - unless the slice contains a null terminator, in which case the string ends immediately before the first null.
236+
The slices referenced by the entries of `**/data` may be in any order, overlapping or non-contiguous, as long as each `[offset, offset + length)` lies within the boundaries of the heap.
237+
238+
A `missing-value-placeholder` attribute on the `**/data` dataset may be present, defining a placeholder for missing values.
239+
The attribute should be a scalar and should be of any HDF5 string datatype that can be represented by a UTF-8 encoded string.
240+
An entry of `**/data` should be considered as missing if its corresponding string is equal to the placeholder.
241+
242+
The group may also contain `**/names`, a 1-dimensional string dataset of length equal to that of `**/data`.
243+
This should use a datatype that can be represented by a UTF-8 encoded string.
244+
If `**/data` is a scalar, `**/names` should have length 1.
245+
')
246+
}
247+
```
248+
216249
### Nothing
217250

218251
A "nothing" (a.k.a., "null", "none") value is represented as a HDF5 group with the following attributes:

‎include/uzuki2/parse_hdf5.hpp

+43
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
#include "ritsuko/ritsuko.hpp"
2222
#include "ritsuko/hdf5/hdf5.hpp"
23+
#include "ritsuko/hdf5/vls/vls.hpp"
2324

2425
/**
2526
* @file parse_hdf5.hpp
@@ -286,6 +287,48 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
286287
present.insert(std::move(x));
287288
}
288289

290+
} else if (vector_type == "vls" && !version.lt(1, 4)) {
291+
ritsuko::hdf5::vls::validate_pointer_datatype(dhandle.getCompType(), 64, 64);
292+
auto hhandle = ritsuko::hdf5::vls::open_heap(handle, "heap");
293+
auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(dhandle, "missing-value-placeholder");
294+
295+
auto ptr = Provisioner::new_String(len, named, is_scalar, StringVector::NONE);
296+
output.reset(ptr);
297+
298+
if (is_scalar) {
299+
ritsuko::hdf5::vls::Pointer<uint64_t, uint64_t> vlsptr;
300+
dhandle.read(&vlsptr, ritsuko::hdf5::vls::define_pointer_datatype<uint64_t, uint64_t>());
301+
302+
hsize_t len = vlsptr.length;
303+
H5::DataSpace mspace(1, &len);
304+
hsize_t offset = vlsptr.offset;
305+
hsize_t hlen = ritsuko::hdf5::get_1d_length(hhandle, false);
306+
H5::DataSpace dspace(1, &hlen);
307+
dspace.selectHyperslab(H5S_SELECT_SET, &len, &offset);
308+
309+
std::vector<uint8_t> buffer(vlsptr.length);
310+
hhandle.read(buffer.data(), H5::PredType::NATIVE_UINT8, mspace, dspace);
311+
auto cptr = reinterpret_cast<const char*>(buffer.data());
312+
std::string str(cptr, cptr + ritsuko::hdf5::find_string_length(cptr, vlsptr.length));
313+
314+
if (missingness.has_value() && str == *missingness) {
315+
ptr->set_missing(0);
316+
} else {
317+
ptr->set(0, std::move(str));
318+
}
319+
320+
} else {
321+
ritsuko::hdf5::vls::Stream1dArray<uint64_t, uint64_t> stream(&dhandle, &hhandle, len, buffer_size);
322+
for (hsize_t i = 0; i < len; ++i, stream.next()) {
323+
auto x = stream.steal();
324+
if (missingness.has_value() && x == *missingness) {
325+
ptr->set_missing(i);
326+
} else {
327+
ptr->set(i, std::move(x));
328+
}
329+
}
330+
}
331+
289332
} else if (vector_type == "string" || (version.equals(1, 0) && (vector_type == "date" || vector_type == "date-time"))) {
290333
StringVector::Format format = StringVector::NONE;
291334
if (version.equals(1, 0)) {

‎tests/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ add_executable(
2323
src/factor.cpp
2424
src/number.cpp
2525
src/string.cpp
26+
src/vls.cpp
2627
src/date.cpp
2728
src/datetime.cpp
2829
src/external.cpp

‎tests/src/string.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -235,3 +235,5 @@ TEST(JsonStringTest, CheckError) {
235235
*** See integer.cpp for vector error tests. ***
236236
***********************************************/
237237
}
238+
239+

‎tests/src/utils.h

+1-9
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,7 @@ H5::DataSet create_dataset(const H5::Group& parent, const std::string& name, con
9999
}
100100

101101
auto dhandle = parent.createDataSet(name, dtype, dspace, cplist);
102-
103-
if constexpr(std::is_same<T, int>::value) {
104-
dhandle.write(values.data(), H5::PredType::NATIVE_INT);
105-
} else if constexpr(std::is_same<T, double>::value) {
106-
dhandle.write(values.data(), H5::PredType::NATIVE_DOUBLE);
107-
} else {
108-
throw std::runtime_error("unknown type!");
109-
}
110-
102+
dhandle.write(values.data(), ritsuko::hdf5::as_numeric_datatype<T>());
111103
return dhandle;
112104
}
113105

‎tests/src/vls.cpp

+191
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
#include <gtest/gtest.h>
2+
#include <gmock/gmock.h>
3+
4+
#include "uzuki2/parse_hdf5.hpp"
5+
6+
#include "utils.h"
7+
8+
TEST(Hdf5VlsTest, Basic) {
9+
auto path = "TEST-vls.h5";
10+
std::string heap = "abcdefghijklmno";
11+
size_t nlen = 10;
12+
13+
{
14+
H5::H5File handle(path, H5F_ACC_TRUNC);
15+
auto vhandle = vector_opener(handle, "blub", "vls");
16+
add_version(vhandle, "1.4");
17+
18+
auto hhandle = create_dataset(vhandle, "heap", heap.size(), H5::PredType::NATIVE_UINT8);
19+
const unsigned char* hptr = reinterpret_cast<const unsigned char*>(heap.c_str());
20+
hhandle.write(hptr, H5::PredType::NATIVE_UCHAR);
21+
22+
std::vector<ritsuko::hdf5::vls::Pointer<uint64_t, uint64_t> > pointers(nlen);
23+
size_t n = 0;
24+
for (size_t i = 0; i < nlen; ++i) {
25+
pointers[i].offset = n;
26+
size_t count = (i % 2) + 1; // for some interesting differences.
27+
pointers[i].length = count;
28+
n += count;
29+
}
30+
auto ptype = ritsuko::hdf5::vls::define_pointer_datatype<uint64_t, uint64_t>();
31+
auto phandle = create_dataset(vhandle, "data", pointers.size(), ptype);
32+
phandle.write(pointers.data(), ptype);
33+
}
34+
35+
// Check that it works correctly.
36+
{
37+
auto parsed = load_hdf5(path, "blub");
38+
EXPECT_EQ(parsed->type(), uzuki2::STRING);
39+
auto sptr = static_cast<const DefaultStringVector*>(parsed.get());
40+
EXPECT_EQ(sptr->size(), nlen);
41+
std::vector<std::string> expected { "a", "bc", "d", "ef", "g", "hi", "j", "kl", "m", "no" };
42+
EXPECT_EQ(sptr->base.values, expected);
43+
}
44+
45+
// Adding a missing value placeholder.
46+
{
47+
{
48+
H5::H5File handle(path, H5F_ACC_RDWR);
49+
auto vhandle = handle.openDataSet("blub/data");
50+
H5::StrType stype(0, H5T_VARIABLE);
51+
auto ahandle = vhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR);
52+
ahandle.write(stype, std::string("hi"));
53+
}
54+
55+
auto parsed = load_hdf5(path, "blub");
56+
EXPECT_EQ(parsed->type(), uzuki2::STRING);
57+
auto sptr = static_cast<const DefaultStringVector*>(parsed.get());
58+
EXPECT_EQ(sptr->base.values[5], "ich bin missing"); // the test's missing placeholder.
59+
60+
// Adding the wrong missing value placeholder.
61+
{
62+
H5::H5File handle(path, H5F_ACC_RDWR);
63+
auto vhandle = handle.openDataSet("blub/data");
64+
vhandle.removeAttr("missing-value-placeholder");
65+
vhandle.createAttribute("missing-value-placeholder", H5::PredType::NATIVE_INT, H5S_SCALAR);
66+
}
67+
expect_hdf5_error(path, "blub", "string datatype");
68+
69+
// Removing for the next checks.
70+
{
71+
H5::H5File handle(path, H5F_ACC_RDWR);
72+
auto vhandle = handle.openDataSet("blub/data");
73+
vhandle.removeAttr("missing-value-placeholder");
74+
}
75+
}
76+
}
77+
78+
TEST(Hdf5VlsTest, Failures) {
79+
auto path = "TEST-vls.h5";
80+
std::string heap = "abcdefghijklmno";
81+
size_t nlen = 10;
82+
83+
// Shortening the heap to check that we perform bounds checks on the pointers.
84+
{
85+
H5::H5File handle(path, H5F_ACC_TRUNC);
86+
auto ghandle = vector_opener(handle, "blub", "vls");
87+
add_version(ghandle, "1.4");
88+
89+
hsize_t zero = 0;
90+
H5::DataSpace hspace(1, &zero);
91+
ghandle.createDataSet("heap", H5::PredType::NATIVE_UINT8, hspace);
92+
93+
std::vector<ritsuko::hdf5::vls::Pointer<uint64_t, uint64_t> > pointers(nlen);
94+
for (size_t i = 0; i < nlen; ++i) {
95+
pointers[i].offset = i;
96+
pointers[i].length = 1;
97+
}
98+
auto ptype = ritsuko::hdf5::vls::define_pointer_datatype<uint64_t, uint64_t>();
99+
auto phandle = create_dataset(ghandle, "data", pointers.size(), ptype);
100+
phandle.write(pointers.data(), ptype);
101+
}
102+
expect_hdf5_error(path, "blub", "out of range");
103+
104+
// Checking that we check for 64-bit unsigned integer types.
105+
{
106+
H5::H5File handle(path, H5F_ACC_RDWR);
107+
auto ghandle = handle.openGroup("blub");
108+
ghandle.unlink("data");
109+
110+
std::vector<ritsuko::hdf5::vls::Pointer<int, int> > pointers(3);
111+
for (auto& p : pointers) {
112+
p.offset = 0;
113+
p.length = 0;
114+
}
115+
hsize_t plen = pointers.size();
116+
H5::DataSpace pspace(1, &plen);
117+
auto ptype = ritsuko::hdf5::vls::define_pointer_datatype<int, int>();
118+
auto phandle = ghandle.createDataSet("data", ptype, pspace);
119+
phandle.write(pointers.data(), ptype);
120+
}
121+
expect_hdf5_error(path, "blub", "64-bit unsigned integer");
122+
123+
// Checking that this only works in the latest version.
124+
{
125+
H5::H5File handle(path, H5F_ACC_RDWR);
126+
auto vhandle = handle.openGroup("blub");
127+
vhandle.removeAttr("uzuki_version");
128+
}
129+
expect_hdf5_error(path, "blub", "unknown vector type");
130+
}
131+
132+
TEST(Hdf5VlsTest, Scalar) {
133+
auto path = "TEST-vls.h5";
134+
std::string heap = "abcdefghijklmno";
135+
136+
{
137+
H5::H5File handle(path, H5F_ACC_TRUNC);
138+
auto ghandle = vector_opener(handle, "blub", "vls");
139+
add_version(ghandle, "1.4");
140+
141+
auto hhandle = create_dataset(ghandle, "heap", heap.size(), H5::PredType::NATIVE_UINT8);
142+
const unsigned char* hptr = reinterpret_cast<const unsigned char*>(heap.c_str());
143+
hhandle.write(hptr, H5::PredType::NATIVE_UCHAR);
144+
145+
ritsuko::hdf5::vls::Pointer<uint8_t, uint8_t> ptr;
146+
ptr.offset = 0; ptr.length = 10;
147+
auto ptype = ritsuko::hdf5::vls::define_pointer_datatype<uint8_t, uint8_t>();
148+
auto phandle = ghandle.createDataSet("data", ptype, H5S_SCALAR);
149+
phandle.write(&ptr, ptype);
150+
}
151+
{
152+
auto parsed = load_hdf5(path, "blub");
153+
EXPECT_EQ(parsed->type(), uzuki2::STRING);
154+
auto sptr = static_cast<const DefaultStringVector*>(parsed.get());
155+
EXPECT_EQ(sptr->size(), 1);
156+
EXPECT_EQ(sptr->base.values.front(), "abcdefghij");
157+
}
158+
159+
// Checking that it works correctly with early termination.
160+
{
161+
H5::H5File handle(path, H5F_ACC_RDWR);
162+
auto ghandle = handle.openGroup("blub");
163+
auto hhandle = ghandle.openDataSet("heap");
164+
std::vector<uint8_t> replacement(heap.size());
165+
hhandle.write(replacement.data(), H5::PredType::NATIVE_UINT8);
166+
}
167+
{
168+
auto parsed = load_hdf5(path, "blub");
169+
EXPECT_EQ(parsed->type(), uzuki2::STRING);
170+
auto sptr = static_cast<const DefaultStringVector*>(parsed.get());
171+
EXPECT_EQ(sptr->size(), 1);
172+
EXPECT_EQ(sptr->base.values.front(), "");
173+
}
174+
175+
// Checking that scalar works correctly with missing values.
176+
{
177+
H5::H5File handle(path, H5F_ACC_RDWR);
178+
auto ghandle = handle.openGroup("blub");
179+
auto dhandle = ghandle.openDataSet("data");
180+
H5::StrType stype(0, 10);
181+
auto ahandle = dhandle.createAttribute("missing-value-placeholder", stype, H5S_SCALAR);
182+
ahandle.write(stype, std::string{});
183+
}
184+
{
185+
auto parsed = load_hdf5(path, "blub");
186+
EXPECT_EQ(parsed->type(), uzuki2::STRING);
187+
auto sptr = static_cast<const DefaultStringVector*>(parsed.get());
188+
EXPECT_EQ(sptr->size(), 1);
189+
EXPECT_EQ(sptr->base.values.front(), "ich bin missing");
190+
}
191+
}

0 commit comments

Comments
 (0)
Please sign in to comment.