diff --git a/CMakeLists.txt b/CMakeLists.txt index 24382a95..c51b1566 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,11 @@ if(ENABLE_ROARING_TESTS AND BASH AND NOT EMSCRIPTEN) else() message(STATUS "Amalgamation tests disabled") endif() +option(ROARING_OPENZL "Enable OpenZL integration" OFF) +if(ROARING_OPENZL) + add_subdirectory(openzl) +endif() + option(ENABLE_ROARING_MICROBENCHMARKS "Enable microbenchmarks" OFF) if(ENABLE_ROARING_MICROBENCHMARKS) add_subdirectory(microbenchmarks) diff --git a/README.md b/README.md index bef8a3d9..70495d5e 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ Portable Roaring bitmaps in C (and C++) with full support for your favorite comp - [Mailing list/discussion group](#mailing-listdiscussion-group) - [Contributing](#contributing) - [References about Roaring](#references-about-roaring) +- [OpenZL Integration](#openzl-integration) # Introduction @@ -1189,6 +1190,27 @@ A compiler or static-analyzer warning is not a bug. Do not report such cases as [![Star History Chart](https://api.star-history.com/svg?repos=RoaringBitmap/CRoaring&type=Date)](https://www.star-history.com/#RoaringBitmap/CRoaring&Date) +# OpenZL Integration + +The `openzl/` subdirectory provides an [OpenZL](https://github.com/facebook/openzl) integration with an SDDL description of the [Roaring Bitmap serialization format](https://github.com/RoaringBitmap/RoaringFormatSpec). This enables format-aware compression of serialized Roaring Bitmap data using OpenZL's specialized compressor. 
+ +To enable OpenZL support, set the `ROARING_OPENZL` flag during configuration: + +``` +cmake -DROARING_OPENZL=ON -B build +``` + +The SDDL file (`openzl/roaring.sddl`) describes the portable Roaring Bitmap binary format, including: + +- Cookie-based format detection (with and without run containers) +- Container descriptors (key + cardinality) +- All three container types: array, bitset, and run-length encoded +- Offset headers for random access + +See the [SDDL documentation](https://openzl.org/sddl/) for more on the format description language. + + + # References about Roaring - Daniel Lemire, Owen Kaser, Nathan Kurz, Luca Deri, Chris O'Hara, François Saint-Jacques, Gregory Ssi-Yan-Kai, Roaring Bitmaps: Implementation of an Optimized Software Library, Software: Practice and Experience Volume 48, Issue 4 April 2018 Pages 867-895 [arXiv:1709.07821](https://arxiv.org/abs/1709.07821) diff --git a/openzl/CMakeLists.txt b/openzl/CMakeLists.txt new file mode 100644 index 00000000..a530084f --- /dev/null +++ b/openzl/CMakeLists.txt @@ -0,0 +1,13 @@ +set(OPENZL_BUILD_TOOLS ON CACHE BOOL "Build OpenZL tools (needed for SDDL compiler)" FORCE) + +CPMAddPackage( + NAME openzl + GIT_REPOSITORY https://github.com/facebook/openzl.git + GIT_TAG 35300cebf0bca276fae7034aa511045df0de1936 +) + +if(ENABLE_ROARING_TESTS) + add_subdirectory(test) +endif() + +add_subdirectory(compression_benchmark) diff --git a/openzl/compression_benchmark/CMakeLists.txt b/openzl/compression_benchmark/CMakeLists.txt new file mode 100644 index 00000000..3db95b67 --- /dev/null +++ b/openzl/compression_benchmark/CMakeLists.txt @@ -0,0 +1,34 @@ +set(SDDL_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/../roaring.sddl") +set(SDDL_COMPILED "${CMAKE_CURRENT_BINARY_DIR}/roaring_sddl.bin") +set(SDDL_HEADER "${CMAKE_CURRENT_BINARY_DIR}/roaring_sddl.h") + +# Compile the SDDL source to binary using the sddl_compiler tool. 
+add_custom_command(
+    OUTPUT ${SDDL_COMPILED}
+    COMMAND sddl_compiler < ${SDDL_SOURCE} > ${SDDL_COMPILED}  # reads stdin, writes stdout
+    DEPENDS ${SDDL_SOURCE} sddl_compiler
+    COMMENT "Compiling roaring.sddl"
+)
+
+# Generate a C header embedding the compiled SDDL as a byte array.
+add_custom_command(
+    OUTPUT ${SDDL_HEADER}
+    COMMAND ${CMAKE_COMMAND}
+        -DINPUT_FILE=${SDDL_COMPILED}
+        -DOUTPUT_FILE=${SDDL_HEADER}
+        -DARRAY_NAME=roaring_sddl_compiled -DSIZE_NAME=roaring_sddl_compiled_size
+        -DGUARD_NAME=ROARING_SDDL_H
+        -P ${CMAKE_CURRENT_SOURCE_DIR}/bin2header.cmake
+    DEPENDS ${SDDL_COMPILED} ${CMAKE_CURRENT_SOURCE_DIR}/bin2header.cmake  # regenerate when the script changes
+    COMMENT "Generating roaring_sddl.h"
+    VERBATIM
+)
+
+add_executable(compression_benchmark compression_benchmark.c ${SDDL_HEADER})  # header listed so its rule runs first
+target_link_libraries(compression_benchmark PRIVATE roaring openzl sddl_profile)
+target_include_directories(compression_benchmark PRIVATE
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/cpp
+    ${CMAKE_CURRENT_BINARY_DIR}  # location of the generated roaring_sddl.h
+    ${openzl_SOURCE_DIR}
+)
diff --git a/openzl/compression_benchmark/bin2header.cmake b/openzl/compression_benchmark/bin2header.cmake
new file mode 100644
index 00000000..dbc9e2c1
--- /dev/null
+++ b/openzl/compression_benchmark/bin2header.cmake
@@ -0,0 +1,38 @@
+file(READ "${INPUT_FILE}" CONTENT HEX)  # whole file as a lowercase hex string
+string(LENGTH "${CONTENT}" HEX_LEN)
+math(EXPR BYTE_COUNT "${HEX_LEN} / 2")  # two hex digits per input byte
+
+set(OUTPUT "/* Auto-generated — do not edit. 
*/\n")
+string(APPEND OUTPUT "#ifndef ${GUARD_NAME}\n")
+string(APPEND OUTPUT "#define ${GUARD_NAME}\n\n")
+string(APPEND OUTPUT "#include <stddef.h>\n\n")  # provides size_t used by ${SIZE_NAME}
+string(APPEND OUTPUT "static const unsigned char ${ARRAY_NAME}[] = {\n")
+
+set(POS 0)  # offset into the hex string
+set(LINE " ")  # text of the row being assembled
+set(COL 0)  # bytes already placed on the current row
+while(POS LESS HEX_LEN)
+    string(SUBSTRING "${CONTENT}" ${POS} 2 BYTE)
+    math(EXPR POS "${POS} + 2")
+    if(POS LESS HEX_LEN)
+        string(APPEND LINE "0x${BYTE}, ")
+    else()
+        string(APPEND LINE "0x${BYTE}")  # last byte: no trailing comma
+    endif()
+    math(EXPR COL "${COL} + 1")
+    if(COL EQUAL 12)  # wrap after 12 bytes per row
+        string(APPEND OUTPUT "${LINE}\n")
+        set(LINE " ")
+        set(COL 0)
+    endif()
+endwhile()
+
+if(COL GREATER 0)  # flush a partially filled final row
+    string(APPEND OUTPUT "${LINE}\n")
+endif()
+
+string(APPEND OUTPUT "};\n\n")
+string(APPEND OUTPUT "static const size_t ${SIZE_NAME} = ${BYTE_COUNT};\n\n")
+string(APPEND OUTPUT "#endif /* ${GUARD_NAME} */\n")
+
+file(WRITE "${OUTPUT_FILE}" "${OUTPUT}")
diff --git a/openzl/compression_benchmark/compression_benchmark.c b/openzl/compression_benchmark/compression_benchmark.c
new file mode 100644
index 00000000..e73ea5cc
--- /dev/null
+++ b/openzl/compression_benchmark/compression_benchmark.c
@@ -0,0 +1,320 @@
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "../../benchmarks/numbersfromtextfiles.h"
+#include "roaring_sddl.h"
+
+static const char *datadir[] = {
+    "census-income", "census-income_srt", "census1881",
+    "census1881_srt", "uscensus2000", "weather_sept_85",
+    "weather_sept_85_srt", "wikileaks-noquotes", "wikileaks-noquotes_srt"};
+
+#define NUM_DATASETS (sizeof(datadir) / sizeof(datadir[0]))
+
+static double now_seconds(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
+}
+
+#define MIN_TIME 0.5
+
+/*
+ * Serialize all bitmaps in a dataset into one contiguous buffer.
+ * Caller must free the returned buffer. 
+ */ +static char *serialize_dataset(const char *basedir, const char *name, + bool runoptimize, size_t *out_size) { + char path[4096]; + snprintf(path, sizeof(path), "%s/%s", basedir, name); + + size_t count = 0; + size_t *howmany = NULL; + uint32_t **numbers = read_all_integer_files(path, ".txt", &howmany, &count); + if (numbers == NULL || count == 0) { + *out_size = 0; + return NULL; + } + + size_t total = 0; + for (size_t i = 0; i < count; i++) { + roaring_bitmap_t *bm = roaring_bitmap_of_ptr(howmany[i], numbers[i]); + if (runoptimize) roaring_bitmap_run_optimize(bm); + total += roaring_bitmap_portable_size_in_bytes(bm); + roaring_bitmap_free(bm); + } + + char *buf = (char *)malloc(total); + size_t offset = 0; + for (size_t i = 0; i < count; i++) { + roaring_bitmap_t *bm = roaring_bitmap_of_ptr(howmany[i], numbers[i]); + if (runoptimize) roaring_bitmap_run_optimize(bm); + offset += roaring_bitmap_portable_serialize(bm, buf + offset); + roaring_bitmap_free(bm); + } + + for (size_t i = 0; i < count; i++) free(numbers[i]); + free(numbers); + free(howmany); + + *out_size = total; + return buf; +} + +typedef struct { + size_t compressed_size; + double compress_speed_mbs; + double decompress_speed_mbs; +} bench_result_t; + +/* + * Compress, verify roundtrip, then time compress/decompress. + * Returns non-zero on verification failure. 
+ */ +static int bench_generic(const char *serialized, size_t serialized_size, + bench_result_t *result) { + size_t bound = ZL_compressBound(serialized_size); + char *compressed = (char *)malloc(bound); + + ZL_CCtx *cctx = ZL_CCtx_create(); + (void)ZL_CCtx_setParameter(cctx, ZL_CParam_formatVersion, + ZL_MAX_FORMAT_VERSION); + + ZL_Report r = + ZL_CCtx_compress(cctx, compressed, bound, serialized, serialized_size); + if (ZL_isError(r)) { + fprintf(stderr, "generic compress failed: %s\n", + ZL_ErrorCode_toString(ZL_errorCode(r))); + ZL_CCtx_free(cctx); + free(compressed); + return 1; + } + size_t csz = ZL_validResult(r); + + /* Verify roundtrip. */ + char *decompressed = (char *)malloc(serialized_size); + ZL_Report dr = + ZL_decompress(decompressed, serialized_size, compressed, csz); + if (ZL_isError(dr)) { + fprintf(stderr, "generic roundtrip: decompress failed\n"); + free(decompressed); + ZL_CCtx_free(cctx); + free(compressed); + return 1; + } + if (ZL_validResult(dr) != serialized_size || + memcmp(serialized, decompressed, serialized_size) != 0) { + fprintf(stderr, "generic roundtrip: data mismatch\n"); + free(decompressed); + ZL_CCtx_free(cctx); + free(compressed); + return 1; + } + free(decompressed); + + /* Time compression. */ + int iters = 0; + double elapsed = 0; + while (elapsed < MIN_TIME) { + double t0 = now_seconds(); + (void)ZL_CCtx_compress(cctx, compressed, bound, serialized, + serialized_size); + elapsed += now_seconds() - t0; + iters++; + } + ZL_CCtx_free(cctx); + + /* Time decompression. 
*/ + decompressed = (char *)malloc(serialized_size); + int diters = 0; + double delapsed = 0; + while (delapsed < MIN_TIME) { + double t0 = now_seconds(); + (void)ZL_decompress(decompressed, serialized_size, compressed, csz); + delapsed += now_seconds() - t0; + diters++; + } + free(decompressed); + free(compressed); + + result->compressed_size = csz; + result->compress_speed_mbs = + (double)serialized_size * iters / elapsed / (1024.0 * 1024.0); + result->decompress_speed_mbs = + (double)serialized_size * diters / delapsed / (1024.0 * 1024.0); + return 0; +} + +/* + * Compress with SDDL, verify roundtrip, then time compress/decompress. + * Uses the no-run SDDL profile. For run-format data where the SDDL + * description misparses the header, permissive mode falls back to + * generic compression. + */ +static int bench_sddl(const char *serialized, size_t serialized_size, + bench_result_t *result) { + size_t bound = ZL_compressBound(serialized_size); + char *compressed = (char *)malloc(bound); + + ZL_Compressor *comp = ZL_Compressor_create(); + ZL_RESULT_OF(ZL_GraphID) + gr = ZL_SDDL_setupProfile(comp, roaring_sddl_compiled, + roaring_sddl_compiled_size); + if (ZL_RES_isError(gr)) { + fprintf(stderr, "SDDL setupProfile failed\n"); + ZL_Compressor_free(comp); + free(compressed); + return 1; + } + (void)ZL_Compressor_selectStartingGraphID(comp, ZL_RES_value(gr)); + + ZL_CCtx *cctx = ZL_CCtx_create(); + (void)ZL_CCtx_setParameter(cctx, ZL_CParam_formatVersion, + ZL_MAX_FORMAT_VERSION); + (void)ZL_CCtx_setParameter(cctx, ZL_CParam_permissiveCompression, 1); + (void)ZL_CCtx_refCompressor(cctx, comp); + + ZL_Report r = + ZL_CCtx_compress(cctx, compressed, bound, serialized, serialized_size); + if (ZL_isError(r)) { + fprintf(stderr, "SDDL compress failed: %s\n", + ZL_ErrorCode_toString(ZL_errorCode(r))); + ZL_CCtx_free(cctx); + ZL_Compressor_free(comp); + free(compressed); + return 1; + } + size_t csz = ZL_validResult(r); + + /* Verify roundtrip. 
*/ + char *decompressed = (char *)malloc(serialized_size); + ZL_Report dr = + ZL_decompress(decompressed, serialized_size, compressed, csz); + if (ZL_isError(dr)) { + fprintf(stderr, "SDDL roundtrip: decompress failed\n"); + free(decompressed); + ZL_CCtx_free(cctx); + ZL_Compressor_free(comp); + free(compressed); + return 1; + } + if (ZL_validResult(dr) != serialized_size || + memcmp(serialized, decompressed, serialized_size) != 0) { + fprintf(stderr, "SDDL roundtrip: data mismatch\n"); + free(decompressed); + ZL_CCtx_free(cctx); + ZL_Compressor_free(comp); + free(compressed); + return 1; + } + free(decompressed); + + /* Time compression. */ + int iters = 0; + double elapsed = 0; + while (elapsed < MIN_TIME) { + double t0 = now_seconds(); + (void)ZL_CCtx_compress(cctx, compressed, bound, serialized, + serialized_size); + elapsed += now_seconds() - t0; + iters++; + } + ZL_CCtx_free(cctx); + ZL_Compressor_free(comp); + + /* Time decompression. */ + decompressed = (char *)malloc(serialized_size); + int diters = 0; + double delapsed = 0; + while (delapsed < MIN_TIME) { + double t0 = now_seconds(); + (void)ZL_decompress(decompressed, serialized_size, compressed, csz); + delapsed += now_seconds() - t0; + diters++; + } + free(decompressed); + free(compressed); + + result->compressed_size = csz; + result->compress_speed_mbs = + (double)serialized_size * iters / elapsed / (1024.0 * 1024.0); + result->decompress_speed_mbs = + (double)serialized_size * diters / delapsed / (1024.0 * 1024.0); + return 0; +} + +static void print_header(void) { + printf(" %-25s %10s %10s %10s %7s %7s %12s %12s %12s %12s\n", "dataset", + "serial", "generic", "sddl", "g-ratio", "s-ratio", "c:gen MB/s", + "c:sddl MB/s", "d:gen MB/s", "d:sddl MB/s"); + printf(" %-25s %10s %10s %10s %7s %7s %12s %12s %12s %12s\n", "-------", + "------", "-------", "----", "-------", "-------", "----------", + "-----------", "----------", "-----------"); +} + +static void print_row(const char *name, size_t 
serialized_size, + bench_result_t *gen, bench_result_t *sddl) { + double ratio_g = (double)gen->compressed_size / (double)serialized_size; + if (sddl->compressed_size > 0) { + double ratio_s = + (double)sddl->compressed_size / (double)serialized_size; + printf( + " %-25s %10zu %10zu %10zu %7.3f %7.3f %10.1f %11.1f %10.1f " + "%11.1f\n", + name, serialized_size, gen->compressed_size, sddl->compressed_size, + ratio_g, ratio_s, gen->compress_speed_mbs, sddl->compress_speed_mbs, + gen->decompress_speed_mbs, sddl->decompress_speed_mbs); + } else { + printf( + " %-25s %10zu %10zu %10s %7.3f %7s %10.1f %11s %10.1f %11s\n", + name, serialized_size, gen->compressed_size, "ERR", ratio_g, "ERR", + gen->compress_speed_mbs, "ERR", gen->decompress_speed_mbs, "ERR"); + } +} + +int main(int argc, char **argv) { + const char *basedir = "benchmarks/realdata"; + if (argc > 1) basedir = argv[1]; + + for (int pass = 0; pass < 2; pass++) { + bool runopt = (pass == 1); + printf("%s run optimization:\n", runopt ? "With" : "Without"); + print_header(); + + for (size_t d = 0; d < NUM_DATASETS; d++) { + size_t sz = 0; + char *serialized = + serialize_dataset(basedir, datadir[d], runopt, &sz); + if (!serialized || sz == 0) { + fprintf(stderr, " %-25s (no data)\n", datadir[d]); + continue; + } + + bench_result_t gen = {0}, sddl = {0}; + if (bench_generic(serialized, sz, &gen) != 0) { + fprintf(stderr, " %-25s generic FAILED\n", datadir[d]); + free(serialized); + return 1; + } + if (bench_sddl(serialized, sz, &sddl) != 0) { + /* SDDL may fail for run-optimized data; show ERR column. 
*/ + sddl.compressed_size = 0; + } + print_row(datadir[d], sz, &gen, &sddl); + + free(serialized); + } + printf("\n"); + } + + return 0; +} diff --git a/openzl/roaring.sddl b/openzl/roaring.sddl new file mode 100644 index 00000000..a1579a17 --- /dev/null +++ b/openzl/roaring.sddl @@ -0,0 +1,197 @@ +# Roaring Bitmap — Portable Format (SDDL v0.6) +# +# Roaring bitmaps are compressed bitmaps that store sets of 32-bit unsigned +# integers using a two-level structure. 32-bit integers are split into a 16-bit +# "key" (most significant bits) and a 16-bit value (least significant bits). +# Each key maps to one container of three possible types: +# - Array container: sorted UInt16 list, used when cardinality <= 4096 +# - Bitset container: dense 8KB bitset, used when cardinality > 4096 +# - Run container: sorted list of (start, length) run pairs +# +# The initial 16-bit cookie selects between two top-level formats: +# 12346 (NO_RUN_CONTAINER): no run containers; a 32-bit padding word follows +# 12347 (WITH_RUNS): run containers may be present; a run-bitset follows +# +# License: Apache-2.0 +# Source: https://github.com/RoaringBitmap/RoaringFormatSpec + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +var COOKIE_NO_RUNS = 12346 +var COOKIE_WITH_RUNS = 12347 +var ARRAY_CARDINALITY_THRESHOLD = 4096 # containers with cardinality <= this are array containers +var NO_OFFSET_THRESHOLD = 4 # offset header present when num_containers >= this + +# --------------------------------------------------------------------------- +# Enumerations +# --------------------------------------------------------------------------- + +enum CookieType { + NO_RUNS = 12346, + WITH_RUNS = 12347 +} + +# --------------------------------------------------------------------------- +# Sub-records +# --------------------------------------------------------------------------- + +# A single 
run: a start value and a run length (only the length is stored minus-1 encoded).
+# The run covers the closed interval [start, start + count_minus_1], all within
+# the 16-bit value space, so the end must not exceed 65535.
+Record Run() = {
+    start: UInt16LE, # first value in the run (16-bit, within container), stored as-is
+    count_minus_1: UInt16LE # length of run minus 1; run covers [start, start + count_minus_1]
+    expect start + count_minus_1 <= 65535 # run must not overflow the 16-bit value space
+}
+
+# Run container: a list of non-overlapping, sorted runs of consecutive 16-bit integers.
+# Space-efficient when values cluster into long consecutive ranges.
+#
+# Ordering invariant across runs (for each run i > 0):
+#   runs[i].start > runs[i-1].start + runs[i-1].count_minus_1
+#
+# i.e. runs are strictly increasing and non-overlapping: the start of run i must
+# be strictly greater than the last value of run i-1.
+#
+# In SDDL v0.6 this cross-element constraint is expressed with a `where` clause
+# using `_index` to compare each run with the preceding one. The predicate is
+# skipped for the first element (index 0) since there is no predecessor.
+Record RunContainer() = {
+    num_runs: UInt16LE, # number of runs in this container
+    runs: Run[num_runs]
+        where _index == 0 or runs[_index].start > runs[_index - 1].start + runs[_index - 1].count_minus_1
+    # [requires scan — num_runs from local field; where clause references adjacent elements]
+}
+
+# Array container: a sorted array of 16-bit integers.
+# Used when cardinality <= 4096 (at most 8 KB, so never larger than a full bitset).
+#
+# Ordering invariant: values are strictly increasing (no duplicates, sorted).
+# In SDDL v0.6 this is expressed with a `where` clause using `_index` to compare
+# each element against its predecessor. The predicate is skipped for index 0. 
+Record ArrayContainer(num_values) = { + values: UInt16LE[num_values] + where _index == 0 or values[_index] > values[_index - 1] + # [requires scan — num_values is a parameter, but where clause references local data] +} + +# Bitset container: a dense 65536-bit (8 KB) bitset. +# Bit i is set if the value i is present in this container. +# Used when cardinality > 4096. +Record BitsetContainer() = { + bitset: Bytes(8192) # 2^16 bits = 8192 bytes [instant-parse] +} + +# Per-container metadata entry in the descriptive header. +# Stored for every container, before the actual container data. +Record ContainerMeta() = { + key: UInt16LE, # 16 most-significant bits shared by all values in this container + cardinality_minus_1: UInt16LE # actual cardinality = cardinality_minus_1 + 1 (range 1..65536) +} + +# --------------------------------------------------------------------------- +# Top-level header variants (selected by the cookie) +# --------------------------------------------------------------------------- + +# Header used when cookie == 12346 (no run containers). +# The full 32-bit cookie word is completed by two zero bytes, then a 32-bit +# container count follows. +Record HeaderNoRuns() = { + _: Bytes(2), # two zero bytes completing the 32-bit cookie (0x0000) + num_containers: UInt32LE # number of containers in the bitmap +} + +# Header used when cookie == 12347 (run containers possible). +# The upper 16 bits of the cookie word are the container count minus 1, followed +# by a run-bitset whose byte length = ceil(num_containers / 8). +# Bit i of the bitset (LSB-first) is 1 if container i is a run container. 
+Record HeaderWithRuns() = { + num_containers_minus_1: UInt16LE, # num_containers = this + 1 + run_bitset: Bytes((num_containers_minus_1 + 1 + 7) / 8) # ceil(N/8) bytes [requires scan] +} + +# --------------------------------------------------------------------------- +# Root-level parse +# --------------------------------------------------------------------------- +# +# The file begins with a 16-bit cookie that determines: +# a) which header format to read, and +# b) whether any container may be a run container. +# +# After the header, num_containers ContainerMeta entries are read. +# +# An optional offset table (one UInt32LE per container, giving byte offsets from +# stream start) follows when: +# - cookie == NO_RUNS (always), OR +# - num_containers >= NO_OFFSET_THRESHOLD (4) +# +# Finally, num_containers containers are read in order. The container type for +# container i is determined by: +# 1. If cookie == WITH_RUNS AND bit i of the run_bitset is set → RunContainer +# 2. Else if cardinality_i <= 4096 → ArrayContainer +# 3. Else → BitsetContainer + +cookie: UInt16LE +expect cookie in CookieType # must be 12346 or 12347 + +# Read the appropriate header variant based on the cookie. +when cookie == COOKIE_NO_RUNS { + header_no_runs: HeaderNoRuns +} +when cookie == COOKIE_WITH_RUNS { + header_with_runs: HeaderWithRuns +} + +# Resolve num_containers from whichever header was parsed. +var num_containers = switch cookie { + case COOKIE_NO_RUNS: header_no_runs.num_containers, + default: header_with_runs.num_containers_minus_1 + 1 +} + +# Descriptive header: one ContainerMeta per container. +container_meta: ContainerMeta[num_containers] # [requires scan — num_containers from parsed data] + +# Optional byte-offset table. +# Present when there are no run containers OR when num_containers >= 4. +# Each entry is a UInt32LE byte offset measured from the beginning of the stream. 
+when (cookie == COOKIE_NO_RUNS) or (num_containers >= NO_OFFSET_THRESHOLD) { + offsets: UInt32LE[num_containers] +} + +# Container data. +# Each container is one of RunContainer, ArrayContainer, or BitsetContainer. +# Because the type depends on per-container state (the run_bitset and the +# individual cardinality values), this section requires scanning. +# +# The Union below dispatches on an integer kind computed per container: +# kind 1 → RunContainer +# kind 2 → ArrayContainer +# kind 3 → BitsetContainer +# +# Note: SDDL does not support per-element index expressions in top-level +# repeated unions directly, so the three container types are described +# individually below for documentation purposes. A custom parser or +# a function-graph extension is recommended for the full dispatch logic. +# +# The layout rules for each container type are: +# +# RunContainer — consumed when run_bitset bit i == 1 (WITH_RUNS only) +# num_runs: UInt16LE +# runs: Run[num_runs] +# +# ArrayContainer — consumed when cardinality_i <= 4096 +# values: UInt16LE[cardinality_i] +# +# BitsetContainer — consumed when cardinality_i > 4096 +# bitset: Bytes(8192) # always exactly 8 KB + +# The Union type definitions below allow OpenZL to understand each container +# shape for compression purposes even when dispatched externally. 
+
+Union Container(kind, cardinality) = {
+    case 1: RunContainer,
+    case 2: ArrayContainer(cardinality),
+    case 3: BitsetContainer
+}
diff --git a/openzl/test/CMakeLists.txt b/openzl/test/CMakeLists.txt
new file mode 100644
index 00000000..ab78e6fd
--- /dev/null
+++ b/openzl/test/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_executable(openzl_roundtrip_test openzl_roundtrip_test.c)
+target_link_libraries(openzl_roundtrip_test PRIVATE roaring openzl)
+target_include_directories(openzl_roundtrip_test PRIVATE
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/cpp
+)
+add_test(NAME openzl_roundtrip_test COMMAND openzl_roundtrip_test)  # target name expands to the built binary
diff --git a/openzl/test/openzl_roundtrip_test.c b/openzl/test/openzl_roundtrip_test.c
new file mode 100644
index 00000000..63b57754
--- /dev/null
+++ b/openzl/test/openzl_roundtrip_test.c
@@ -0,0 +1,212 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/*
+ * Serialize a roaring bitmap, compress it with OpenZL, decompress,
+ * and verify the round-tripped data produces an identical bitmap.
+ */
+static int roundtrip(roaring_bitmap_t *bitmap, const char *label) {
+    /* Portable-serialize the bitmap. */
+    uint32_t serialized_size = roaring_bitmap_portable_size_in_bytes(bitmap);
+    char *serialized = (char *)malloc(serialized_size);
+    if (!serialized) {
+        fprintf(stderr, "[%s] malloc(serialized) failed\n", label);
+        return 1;
+    }
+    roaring_bitmap_portable_serialize(bitmap, serialized);
+
+    /* Compress with OpenZL. 
*/ + ZL_CCtx *cctx = ZL_CCtx_create(); + if (!cctx) { + fprintf(stderr, "[%s] ZL_CCtx_create failed\n", label); + free(serialized); + return 1; + } + (void)ZL_CCtx_setParameter(cctx, ZL_CParam_formatVersion, + ZL_MAX_FORMAT_VERSION); + + size_t compress_bound = ZL_compressBound(serialized_size); + char *compressed = (char *)malloc(compress_bound); + if (!compressed) { + fprintf(stderr, "[%s] malloc(compressed) failed\n", label); + ZL_CCtx_free(cctx); + free(serialized); + return 1; + } + + ZL_Report creport = ZL_CCtx_compress(cctx, compressed, compress_bound, + serialized, serialized_size); + if (ZL_isError(creport)) { + fprintf(stderr, "[%s] ZL_CCtx_compress failed: %s\n", label, + ZL_ErrorCode_toString(ZL_errorCode(creport))); + free(compressed); + ZL_CCtx_free(cctx); + free(serialized); + return 1; + } + size_t compressed_size = ZL_validResult(creport); + + printf("[%s] serialized %u bytes -> compressed %zu bytes (%.1f%%)\n", label, + serialized_size, compressed_size, + 100.0 * (double)compressed_size / (double)serialized_size); + + ZL_CCtx_free(cctx); + + /* Decompress with OpenZL. 
*/ + ZL_Report dreport_size = + ZL_getDecompressedSize(compressed, compressed_size); + if (ZL_isError(dreport_size)) { + fprintf(stderr, "[%s] ZL_getDecompressedSize failed: %s\n", label, + ZL_ErrorCode_toString(ZL_errorCode(dreport_size))); + free(compressed); + free(serialized); + return 1; + } + size_t decompressed_size = ZL_validResult(dreport_size); + + char *decompressed = (char *)malloc(decompressed_size); + if (!decompressed) { + fprintf(stderr, "[%s] malloc(decompressed) failed\n", label); + free(compressed); + free(serialized); + return 1; + } + + ZL_Report dreport = ZL_decompress(decompressed, decompressed_size, + compressed, compressed_size); + if (ZL_isError(dreport)) { + fprintf(stderr, "[%s] ZL_decompress failed: %s\n", label, + ZL_ErrorCode_toString(ZL_errorCode(dreport))); + free(decompressed); + free(compressed); + free(serialized); + return 1; + } + + free(compressed); + + /* Verify byte-for-byte equality. */ + if (decompressed_size != serialized_size) { + fprintf(stderr, "[%s] size mismatch: expected %u, got %zu\n", label, + serialized_size, decompressed_size); + free(decompressed); + free(serialized); + return 1; + } + if (memcmp(serialized, decompressed, serialized_size) != 0) { + fprintf(stderr, "[%s] decompressed data differs from original\n", + label); + free(decompressed); + free(serialized); + return 1; + } + + /* Deserialize the round-tripped bytes and verify bitmap equality. 
*/ + roaring_bitmap_t *recovered = roaring_bitmap_portable_deserialize_safe( + decompressed, decompressed_size); + free(decompressed); + free(serialized); + + if (!recovered) { + fprintf(stderr, "[%s] portable_deserialize_safe returned NULL\n", + label); + return 1; + } + if (!roaring_bitmap_equals(bitmap, recovered)) { + fprintf(stderr, "[%s] recovered bitmap differs from original\n", label); + roaring_bitmap_free(recovered); + return 1; + } + + roaring_bitmap_free(recovered); + printf("[%s] PASSED\n", label); + return 0; +} + +int main(void) { + int failures = 0; + + /* 1. Sparse bitmap (array containers only). */ + { + roaring_bitmap_t *bm = roaring_bitmap_create(); + roaring_bitmap_add(bm, 1); + roaring_bitmap_add(bm, 100); + roaring_bitmap_add(bm, 1000); + roaring_bitmap_add(bm, 10000); + roaring_bitmap_add(bm, 100000); + roaring_bitmap_add(bm, 1000000); + failures += roundtrip(bm, "sparse"); + roaring_bitmap_free(bm); + } + + /* 2. Dense bitmap (bitset containers). */ + { + roaring_bitmap_t *bm = roaring_bitmap_create(); + for (uint32_t i = 0; i < 100000; i++) { + roaring_bitmap_add(bm, 3 * i); + } + failures += roundtrip(bm, "dense"); + roaring_bitmap_free(bm); + } + + /* 3. Run-optimized bitmap with long consecutive runs. */ + { + roaring_bitmap_t *bm = roaring_bitmap_create(); + roaring_bitmap_add_range_closed(bm, 0, 49999); + roaring_bitmap_add_range_closed(bm, 100000, 199999); + roaring_bitmap_add_range_closed(bm, 500000, 599999); + roaring_bitmap_run_optimize(bm); + failures += roundtrip(bm, "runs"); + roaring_bitmap_free(bm); + } + + /* 4. Mixed containers: sparse + dense + runs in one bitmap. 
*/
+    {
+        roaring_bitmap_t *bm = roaring_bitmap_create();
+        /* Container 0: sparse (array) */
+        roaring_bitmap_add(bm, 10);
+        roaring_bitmap_add(bm, 200);
+        roaring_bitmap_add(bm, 3000);
+        /* Container 1 (high16 = 1): dense, every other value (bitset) */
+        for (uint32_t i = 0; i < 25000; i++) {
+            roaring_bitmap_add(bm, (1 << 16) + 2 * i); /* no runs to merge */
+        }
+        /* Container 2 (high16 = 2): consecutive run */
+        roaring_bitmap_add_range_closed(bm, 2 * (1 << 16),
+                                        2 * (1 << 16) + 9999);
+        roaring_bitmap_run_optimize(bm);
+        failures += roundtrip(bm, "mixed");
+        roaring_bitmap_free(bm);
+    }
+
+    /* 5. Empty bitmap. */
+    {
+        roaring_bitmap_t *bm = roaring_bitmap_create();
+        failures += roundtrip(bm, "empty");
+        roaring_bitmap_free(bm);
+    }
+
+    /* 6. Large bitmap with many containers. */
+    {
+        roaring_bitmap_t *bm = roaring_bitmap_create();
+        for (uint32_t i = 0; i < 1000000; i += 7) { /* stride 7: no adjacent values */
+            roaring_bitmap_add(bm, i);
+        }
+        roaring_bitmap_run_optimize(bm);
+        failures += roundtrip(bm, "large_with_runs");
+        roaring_bitmap_free(bm);
+    }
+
+    if (failures) {
+        fprintf(stderr, "\n%d test(s) FAILED\n", failures);
+    } else {
+        printf("\nAll tests PASSED\n");
+    }
+    return failures ? EXIT_FAILURE : EXIT_SUCCESS; /* one failure per case */
+}