-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add pack and unpack of a more compact serialization format (#68)
* Store the original size argument in struct binary_fuse{8,16}_s ... in preparation of a more compact serialization format: All other parameters except for the Seed are derived from the size parameter. The drawback is that this format is sensitive to changes of binary_fuse8_allocate(). Due to alignment, this does not need any more space on 64-bit. (There were 5 32-bit values in between two 64-bit values.) Yet formally, this is a breaking change of the in-core format, which should not be used to store information across versions. See follow-up commits for new compact serialization formats. * Add {xor,binary_fuse}{8,16}_{pack,unpack} serialization formats. Rationale: As mentioned in the previous commit, for binary_fuse filters, we do not need to save values derived from the size, saving 5 x sizeof(uint32_t). For both filter implementations, we add a bitmap to indicate non-zero fingerprint values. This adds 1/{8,16} of the fingerprint array size, but saves one or two bytes for each zero fingerprint. The net result is a packed format which cannot be compressed further by zlib for the bundled unit tests. Note that this format is incompatible with the existing _serialize() format and, in the case of binary_fuse, sensitive to changes of the derived parameters in _allocate. Interface: We add _pack_bytes() to match _serialization_bytes(). _pack() and _unpack() match _serialize() and _deserialize(). The existing _{de,}serialize() interfaces take a buffer pointer only and thus implicitly assume that the buffer will be of sufficient size. For the new functions, we add a size_t parameter indicating the size of the buffer and check its bounds in the implementation. _pack returns the used size or zero for "does not fit", so when called with a buffer of arbitrary size, the used space or error condition can be determined without an additional call to _pack_bytes(), avoiding duplicate work. Implementation: We add some XOR_bitf_* macros to address words and individual bits of bitfields. 
The XOR_ser and XOR_deser macros contain the otherwise repeated code for bounds checking and the actual serialization. Because the implementations for the 8- and 16-bit words are equal except for the data type, we add macros and create the actual functions by expanding the macros with the possible data types. Alternatives considered: Compared to _{de,}serialize(), the new functions need to copy individual fingerprint words rather than the whole array at once, which is less efficient. Therefore, an implementation using Duff's Device with branchless code was attempted but dismissed because avoiding out-of-bounds access would require an over-allocated buffer. * Adjust unit tests to new _{un,}pack() interface. To exercise the new code without too much of a change to the existing unit test, we change the signature of _{un,}serialize_gen() to take an additional (const) size_t argument, which we ignore for _{de,}serialize(). We add to the reported metrics absolute and relative size information for the "in-core" and "wire" format, the latter jointly referring to _{de,}serialize() and _{un,}pack(). * Document the new _{un,}pack() interface * tuning the wording and adding a spaceusage benchmark * changing the wording. * rewording. * explicit casts --------- Co-authored-by: Daniel Lemire <[email protected]>
- Loading branch information
Showing
7 changed files
with
498 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,5 @@ | ||
# The bench executable is built from bench.c and linked with xor_singleheader.
add_executable(bench bench.c) | ||
target_link_libraries(bench PUBLIC xor_singleheader) | ||
|
||
# The spaceusage executable is built from spaceusage.c and linked with xor_singleheader.
add_executable(spaceusage spaceusage.c) | ||
target_link_libraries(spaceusage PUBLIC xor_singleheader) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
#include "binaryfusefilter.h" | ||
#include "xorfilter.h" | ||
#include <stdlib.h> | ||
#include <iso646.h> | ||
|
||
/* Byte counts reported for one filter, one per serialization format. */
typedef struct {
  size_t standard; /* value of the filter's *_serialization_bytes() */
  size_t pack;     /* value of the filter's *_pack_bytes() */
} sizes;
|
||
sizes fuse16(size_t n) { | ||
binary_fuse16_t filter = {0}; | ||
if (! binary_fuse16_allocate(n, &filter)) { | ||
printf("allocation failed\n"); | ||
return (sizes) {0, 0}; | ||
} | ||
uint64_t* big_set = malloc(n * sizeof(uint64_t)); | ||
for(size_t i = 0; i < n; i++) { | ||
big_set[i] = i; | ||
} | ||
bool is_ok = binary_fuse16_populate(big_set, n, &filter); | ||
if(! is_ok ) { | ||
printf("populating failed\n"); | ||
} | ||
free(big_set); | ||
sizes s = { | ||
.standard = binary_fuse16_serialization_bytes(&filter), | ||
.pack = binary_fuse16_pack_bytes(&filter) | ||
}; | ||
binary_fuse16_free(&filter); | ||
return s; | ||
} | ||
|
||
sizes fuse8(size_t n) { | ||
binary_fuse8_t filter = {0}; | ||
if (! binary_fuse8_allocate(n, &filter)) { | ||
printf("allocation failed\n"); | ||
return (sizes) {0, 0}; | ||
} | ||
uint64_t* big_set = malloc(n * sizeof(uint64_t)); | ||
for(size_t i = 0; i < n; i++) { | ||
big_set[i] = i; | ||
} | ||
bool is_ok = binary_fuse8_populate(big_set, n, &filter); | ||
if(! is_ok ) { | ||
printf("populating failed\n"); | ||
} | ||
free(big_set); | ||
sizes s = { | ||
.standard = binary_fuse8_serialization_bytes(&filter), | ||
.pack = binary_fuse8_pack_bytes(&filter) | ||
}; | ||
binary_fuse8_free(&filter); | ||
return s; | ||
} | ||
|
||
sizes xor16(size_t n) { | ||
xor16_t filter = {0}; | ||
if (! xor16_allocate(n, &filter)) { | ||
printf("allocation failed\n"); | ||
return (sizes) {0, 0}; | ||
} | ||
uint64_t* big_set = malloc(n * sizeof(uint64_t)); | ||
for(size_t i = 0; i < n; i++) { | ||
big_set[i] = i; | ||
} | ||
bool is_ok = xor16_populate(big_set, n, &filter); | ||
if(! is_ok ) { | ||
printf("populating failed\n"); | ||
} | ||
free(big_set); | ||
sizes s = { | ||
.standard = xor16_serialization_bytes(&filter), | ||
.pack = xor16_pack_bytes(&filter) | ||
}; | ||
xor16_free(&filter); | ||
return s; | ||
} | ||
|
||
sizes xor8(size_t n) { | ||
xor8_t filter = {0}; | ||
if (! xor8_allocate(n, &filter)) { | ||
printf("allocation failed\n"); | ||
return (sizes) {0, 0}; | ||
} | ||
uint64_t* big_set = malloc(n * sizeof(uint64_t)); | ||
for(size_t i = 0; i < n; i++) { | ||
big_set[i] = i; | ||
} | ||
bool is_ok = xor8_populate(big_set, n, &filter); | ||
if(! is_ok ) { | ||
printf("populating failed\n"); | ||
} | ||
free(big_set); | ||
sizes s = { | ||
.standard = xor8_serialization_bytes(&filter), | ||
.pack = xor8_pack_bytes(&filter) | ||
}; | ||
xor8_free(&filter); | ||
|
||
return s; | ||
} | ||
|
||
int main() { | ||
for (size_t n = 10; n <= 10000000; n *= 2) { | ||
printf("%-10zu ", n); // Align number to 10 characters wide | ||
sizes f16 = fuse16(n); | ||
sizes f8 = fuse8(n); | ||
sizes x16 = xor16(n); | ||
sizes x8 = xor8(n); | ||
|
||
printf("fuse16: %5.2f %5.2f ", (double)f16.standard * 8.0 / n, (double)f16.pack * 8.0 / n); | ||
printf("fuse8: %5.2f %5.2f ", (double)f8.standard * 8.0 / n, (double)f8.pack * 8.0 / n); | ||
printf("xor16: %5.2f %5.2f ", (double)x16.standard * 8.0 / n, (double)x16.pack * 8.0 / n); | ||
printf("xor8: %5.2f %5.2f ", (double)x8.standard * 8.0 / n, (double)x8.pack * 8.0 / n); | ||
printf("\n"); | ||
} | ||
return EXIT_SUCCESS; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.