From d908c03f8633f2322943b167ed034688a8aceee1 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Wed, 20 May 2026 15:53:51 -0400 Subject: [PATCH 1/2] fixing support for big endian (IO) --- README.md | 5 +-- cpp/roaring/roaring64map.hh | 10 ++++-- include/roaring/portability.h | 49 ++++++++++++++++++++++++++ include/roaring/roaring.h | 33 ++++-------------- include/roaring/roaring64.h | 14 +++----- src/containers/array.c | 15 ++++++++ src/containers/bitset.c | 15 ++++++++ src/containers/run.c | 28 +++++++++++++-- src/roaring.c | 19 +++++++++-- src/roaring64.c | 12 +++++-- src/roaring_array.c | 53 +++++++++++++++++++---------- tests/cpp_example2.cpp | 5 --- tests/cpp_unit.cpp | 11 ------ tests/format_portability_unit.c | 10 +----- tests/realdata_unit.c | 5 --- tests/roaring64_serialization.cpp | 5 --- tests/robust_deserialization_unit.c | 5 --- tests/toplevel_unit.c | 18 ++++------ 18 files changed, 191 insertions(+), 121 deletions(-) diff --git a/README.md b/README.md index 4916da9b3..c9dd275f5 100644 --- a/README.md +++ b/README.md @@ -88,15 +88,12 @@ of the latest hardware. Roaring bitmaps are already available on a variety of pl # Requirements - Linux, macOS, FreeBSD, Windows (MSYS2 and Microsoft Visual studio). -- We test the library with ARM, x64/x86 and POWER processors. We only support little endian systems (big endian systems are vanishingly rare). +- We test the library with ARM, x64/x86 and POWER processors. We support big endian systems. - Recent C compiler supporting the C11 standard (GCC 7 or better, LLVM 8 or better (clang), Xcode 11 or better, Microsoft Visual Studio 2022 or better, Intel oneAPI Compiler 2023.2 or better), there is also an optional C++ class that requires a C++ compiler supporting the C++11 standard. We support [Fil-C, the memory-safe C/C++ compiler](https://fil-c.org). - CMake (to contribute to the project, users can rely on amalgamation/unity builds if they do not wish to use CMake). - The CMake system assumes that git is available. 
- Under x64 systems, the library provides runtime dispatch so that optimized functions are called based on the detected CPU features. It works with GCC, clang (version 9 and up) and Visual Studio (2017 and up). Other systems (e.g., ARM) do not need runtime dispatch. -Hardly anyone has access to an actual big-endian system. Nevertheless, -We support big-endian systems such as IBM s390x through emulators---except for -IO serialization which is only supported on little-endian systems (see [issue 423](https://github.com/RoaringBitmap/CRoaring/issues/423)). # Quick Start diff --git a/cpp/roaring/roaring64map.hh b/cpp/roaring/roaring64map.hh index 1521783bd..b14129ce1 100644 --- a/cpp/roaring/roaring64map.hh +++ b/cpp/roaring/roaring64map.hh @@ -1141,13 +1141,15 @@ class Roaring64Map { const char *orig = buf; // push map size uint64_t map_size = roarings.size(); - std::memcpy(buf, &map_size, sizeof(uint64_t)); + uint64_t map_size_le = croaring_htole64(map_size); + std::memcpy(buf, &map_size_le, sizeof(uint64_t)); buf += sizeof(uint64_t); std::for_each(roarings.cbegin(), roarings.cend(), [&buf, portable]( const std::pair &map_entry) { // push map key - std::memcpy(buf, &map_entry.first, sizeof(uint32_t)); + uint32_t key_le = croaring_htole32(map_entry.first); + std::memcpy(buf, &key_le, sizeof(uint32_t)); // ^-- Note: `*((uint32_t*)buf) = map_entry.first;` is // undefined @@ -1175,11 +1177,13 @@ class Roaring64Map { // get map size uint64_t map_size; std::memcpy(&map_size, buf, sizeof(uint64_t)); + map_size = croaring_letoh64(map_size); buf += sizeof(uint64_t); for (uint64_t lcv = 0; lcv < map_size; lcv++) { // get map key uint32_t key; std::memcpy(&key, buf, sizeof(uint32_t)); + key = croaring_letoh32(key); // ^-- Note: `uint32_t key = *((uint32_t*)buf);` is undefined buf += sizeof(uint32_t); @@ -1209,6 +1213,7 @@ class Roaring64Map { } uint64_t map_size; std::memcpy(&map_size, buf, sizeof(uint64_t)); + map_size = croaring_letoh64(map_size); buf += sizeof(uint64_t); 
maxbytes -= sizeof(uint64_t); for (uint64_t lcv = 0; lcv < map_size; lcv++) { @@ -1217,6 +1222,7 @@ class Roaring64Map { } uint32_t key; std::memcpy(&key, buf, sizeof(uint32_t)); + key = croaring_letoh32(key); // ^-- Note: `uint32_t key = *((uint32_t*)buf);` is undefined buf += sizeof(uint32_t); diff --git a/include/roaring/portability.h b/include/roaring/portability.h index e258d454d..8ec422630 100644 --- a/include/roaring/portability.h +++ b/include/roaring/portability.h @@ -462,6 +462,55 @@ static inline int roaring_hamming(uint64_t x) { #define croaring_be64toh(x) croaring_htobe64(x) // End of host <-> big endian conversion. +// Host <-> little-endian conversion helpers. +// +// The CRoaring "portable" serialization format (and the regular +// roaring_bitmap_serialize / Roaring64Map::write formats which build on it) +// is defined to be little-endian on the wire. Code that reads or writes +// multi-byte integers to such buffers must convert between host and +// little-endian byte order. On little-endian hosts these are no-ops; on +// big-endian hosts they swap bytes. +// +// The "frozen" format is intentionally non-portable and uses native byte +// order; it must not use these helpers. 
+#if CROARING_IS_BIG_ENDIAN + +static inline uint16_t croaring_bswap16(uint16_t x) { + return (uint16_t)((x << 8) | (x >> 8)); +} + +static inline uint32_t croaring_bswap32(uint32_t x) { + return ((x & 0x000000FFU) << 24) | ((x & 0x0000FF00U) << 8) | + ((x & 0x00FF0000U) >> 8) | ((x & 0xFF000000U) >> 24); +} + +static inline uint64_t croaring_bswap64(uint64_t x) { + return ((x & 0x00000000000000FFULL) << 56) | + ((x & 0x000000000000FF00ULL) << 40) | + ((x & 0x0000000000FF0000ULL) << 24) | + ((x & 0x00000000FF000000ULL) << 8) | + ((x & 0x000000FF00000000ULL) >> 8) | + ((x & 0x0000FF0000000000ULL) >> 24) | + ((x & 0x00FF000000000000ULL) >> 40) | + ((x & 0xFF00000000000000ULL) >> 56); +} + +#define croaring_htole16(x) croaring_bswap16(x) +#define croaring_htole32(x) croaring_bswap32(x) +#define croaring_htole64(x) croaring_bswap64(x) + +#else // CROARING_IS_BIG_ENDIAN + +#define croaring_htole16(x) (x) +#define croaring_htole32(x) (x) +#define croaring_htole64(x) (x) + +#endif // CROARING_IS_BIG_ENDIAN + +#define croaring_letoh16(x) croaring_htole16(x) +#define croaring_letoh32(x) croaring_htole32(x) +#define croaring_letoh64(x) croaring_htole64(x) + // Defines for the possible CROARING atomic implementations #define CROARING_ATOMIC_IMPL_NONE 1 #define CROARING_ATOMIC_IMPL_CPP 2 diff --git a/include/roaring/roaring.h b/include/roaring/roaring.h index 790fcae06..9d651ff44 100644 --- a/include/roaring/roaring.h +++ b/include/roaring/roaring.h @@ -636,10 +636,6 @@ size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r); * * Returns how many bytes written, should be `roaring_bitmap_size_in_bytes(r)`. * - * This function is endian-sensitive. If you have a big-endian system (e.g., a - * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. - * * When serializing data to a file, we recommend that you also use * checksums so that, at deserialization, you can be confident * that you are recovering the correct data. 
@@ -652,10 +648,6 @@ size_t roaring_bitmap_serialize(const roaring_bitmap_t *r, char *buf); * (See `roaring_bitmap_portable_deserialize()` if you want a format that's * compatible with Java and Go implementations). * - * This function is endian-sensitive. If you have a big-endian system (e.g., a - * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. - * * The returned pointer may be NULL in case of errors. */ roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf); @@ -666,10 +658,6 @@ roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf); * (See `roaring_bitmap_portable_deserialize_safe()` if you want a format that's * compatible with Java and Go implementations). * - * This function is endian-sensitive. If you have a big-endian system (e.g., a - * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. - * * The difference with `roaring_bitmap_deserialize()` is that this function * checks that the input buffer is a valid bitmap. If the buffer is too small, * NULL is returned. @@ -705,10 +693,6 @@ size_t roaring_bitmap_size_in_bytes(const roaring_bitmap_t *r); * This is meant to be compatible with the Java and Go versions: * https://github.com/RoaringBitmap/RoaringFormatSpec * - * This function is endian-sensitive. If you have a big-endian system (e.g., a - * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. - * * The returned pointer may be NULL in case of errors. */ roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf); @@ -742,10 +726,6 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf); * corresponds to the serialized bitmap. The CRoaring library does not provide * checksumming. * - * This function is endian-sensitive. 
If you have a big-endian system (e.g., a - * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. - * * The returned pointer may be NULL in case of errors. */ roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf, @@ -769,7 +749,8 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf, * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. + * compatible with little-endian systems. It is not a bug, it is by design, + * since the format imitates C memory layout of roaring_bitmap_t. * * The returned pointer may be NULL in case of errors. */ @@ -803,10 +784,6 @@ size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *r); * This is meant to be compatible with the Java and Go versions: * https://github.com/RoaringBitmap/RoaringFormatSpec * - * This function is endian-sensitive. If you have a big-endian system (e.g., a - * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. - * * When serializing data to a file, we recommend that you also use * checksums so that, at deserialization, you can be confident * that you are recovering the correct data. @@ -843,7 +820,8 @@ size_t roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t *r); * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. + * compatible with little-endian systems. 
This is not a bug, it is by design, + * since the format imitates C memory layout of roaring_bitmap_t. * * When serializing data to a file, we recommend that you also use * checksums so that, at deserialization, you can be confident @@ -864,7 +842,8 @@ void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *r, char *buf); * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. + * compatible with little-endian systems. This is not a bug, it is by design, + * since the format imitates C memory layout of roaring_bitmap_t. */ const roaring_bitmap_t *roaring_bitmap_frozen_view(const char *buf, size_t length); diff --git a/include/roaring/roaring64.h b/include/roaring/roaring64.h index 7d6bef836..0838c6fa1 100644 --- a/include/roaring/roaring64.h +++ b/include/roaring/roaring64.h @@ -583,10 +583,6 @@ size_t roaring64_bitmap_portable_size_in_bytes(const roaring64_bitmap_t *r); * This is meant to be compatible with other languages: * https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations * - * This function is endian-sensitive. If you have a big-endian system (e.g., a - * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. - * * When serializing data to a file, we recommend that you also use * checksums so that, at deserialization, you can be confident * that you are recovering the correct data. @@ -631,10 +627,6 @@ size_t roaring64_bitmap_portable_deserialize_size(const char *buf, * We also recommend that you use checksums to check that serialized data * corresponds to the serialized bitmap. The CRoaring library does not provide * checksumming. - * - * This function is endian-sensitive. If you have a big-endian system (e.g., a - * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. 
*/ roaring64_bitmap_t *roaring64_bitmap_portable_deserialize_safe(const char *buf, size_t maxbytes); @@ -663,7 +655,8 @@ size_t roaring64_bitmap_frozen_size_in_bytes(const roaring64_bitmap_t *r); * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. + * compatible with little-endian systems. This is not a bug, it is by design, + * since the format imitates C memory layout of roaring64_bitmap_t. */ size_t roaring64_bitmap_frozen_serialize(const roaring64_bitmap_t *r, char *buf); @@ -681,7 +674,8 @@ size_t roaring64_bitmap_frozen_serialize(const roaring64_bitmap_t *r, * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. + * compatible with little-endian systems. This is not a bug, it is by design, + * since the format imitates C memory layout of roaring64_bitmap_t. 
*/ roaring64_bitmap_t *roaring64_bitmap_frozen_view(const char *buf, size_t maxbytes); diff --git a/src/containers/array.c b/src/containers/array.c index 817f1a693..aee0f9351 100644 --- a/src/containers/array.c +++ b/src/containers/array.c @@ -510,7 +510,14 @@ int32_t array_container_number_of_runs(const array_container_t *ac) { * */ int32_t array_container_write(const array_container_t *container, char *buf) { +#if CROARING_IS_BIG_ENDIAN + for (int32_t i = 0; i < container->cardinality; ++i) { + uint16_t v_le = croaring_htole16(container->array[i]); + memcpy(buf + i * sizeof(uint16_t), &v_le, sizeof(uint16_t)); + } +#else memcpy(buf, container->array, container->cardinality * sizeof(uint16_t)); +#endif return array_container_size_in_bytes(container); } @@ -543,7 +550,15 @@ int32_t array_container_read(int32_t cardinality, array_container_t *container, array_container_grow(container, cardinality, false); } container->cardinality = cardinality; +#if CROARING_IS_BIG_ENDIAN + for (int32_t i = 0; i < cardinality; ++i) { + uint16_t v_le; + memcpy(&v_le, buf + i * sizeof(uint16_t), sizeof(uint16_t)); + container->array[i] = croaring_letoh16(v_le); + } +#else memcpy(container->array, buf, container->cardinality * sizeof(uint16_t)); +#endif return array_container_size_in_bytes(container); } diff --git a/src/containers/bitset.c b/src/containers/bitset.c index 4b1076eef..f91062af4 100644 --- a/src/containers/bitset.c +++ b/src/containers/bitset.c @@ -1048,7 +1048,14 @@ int bitset_container_number_of_runs(bitset_container_t *bc) { int32_t bitset_container_write(const bitset_container_t *container, char *buf) { +#if CROARING_IS_BIG_ENDIAN + for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) { + uint64_t w_le = croaring_htole64(container->words[i]); + memcpy(buf + i * sizeof(uint64_t), &w_le, sizeof(uint64_t)); + } +#else memcpy(buf, container->words, BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t)); +#endif return bitset_container_size_in_bytes(container); } @@ 
-1056,7 +1063,15 @@ int32_t bitset_container_write(const bitset_container_t *container, int32_t bitset_container_read(int32_t cardinality, bitset_container_t *container, const char *buf) { container->cardinality = cardinality; +#if CROARING_IS_BIG_ENDIAN + for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) { + uint64_t w_le; + memcpy(&w_le, buf + i * sizeof(uint64_t), sizeof(uint64_t)); + container->words[i] = croaring_letoh64(w_le); + } +#else memcpy(container->words, buf, BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t)); +#endif return bitset_container_size_in_bytes(container); } diff --git a/src/containers/run.c b/src/containers/run.c index b99818cff..ca1f64d8b 100644 --- a/src/containers/run.c +++ b/src/containers/run.c @@ -717,9 +717,21 @@ bool run_container_validate(const run_container_t *run, const char **reason) { int32_t run_container_write(const run_container_t *container, char *buf) { uint16_t cast_16 = container->n_runs; - memcpy(buf, &cast_16, sizeof(uint16_t)); + uint16_t n_runs_le = croaring_htole16(cast_16); + memcpy(buf, &n_runs_le, sizeof(uint16_t)); +#if CROARING_IS_BIG_ENDIAN + char *out = buf + sizeof(uint16_t); + for (int32_t i = 0; i < container->n_runs; ++i) { + uint16_t v_le = croaring_htole16(container->runs[i].value); + uint16_t l_le = croaring_htole16(container->runs[i].length); + memcpy(out, &v_le, sizeof(uint16_t)); + memcpy(out + sizeof(uint16_t), &l_le, sizeof(uint16_t)); + out += sizeof(rle16_t); + } +#else memcpy(buf + sizeof(uint16_t), container->runs, container->n_runs * sizeof(rle16_t)); +#endif return run_container_size_in_bytes(container); } @@ -728,12 +740,24 @@ int32_t run_container_read(int32_t cardinality, run_container_t *container, (void)cardinality; uint16_t cast_16; memcpy(&cast_16, buf, sizeof(uint16_t)); - container->n_runs = cast_16; + container->n_runs = croaring_letoh16(cast_16); if (container->n_runs > container->capacity) run_container_grow(container, container->n_runs, false); if (container->n_runs 
> 0) { +#if CROARING_IS_BIG_ENDIAN + const char *in = buf + sizeof(uint16_t); + for (int32_t i = 0; i < container->n_runs; ++i) { + uint16_t v_le, l_le; + memcpy(&v_le, in, sizeof(uint16_t)); + memcpy(&l_le, in + sizeof(uint16_t), sizeof(uint16_t)); + container->runs[i].value = croaring_letoh16(v_le); + container->runs[i].length = croaring_letoh16(l_le); + in += sizeof(rle16_t); + } +#else memcpy(container->runs, buf + sizeof(uint16_t), container->n_runs * sizeof(rle16_t)); +#endif } return run_container_size_in_bytes(container); } diff --git a/src/roaring.c b/src/roaring.c index 1caedc4f9..f0e3b1e04 100644 --- a/src/roaring.c +++ b/src/roaring.c @@ -1519,9 +1519,18 @@ size_t roaring_bitmap_serialize(const roaring_bitmap_t *r, char *buf) { return roaring_bitmap_portable_serialize(r, buf + 1) + 1; } else { buf[0] = CROARING_SERIALIZATION_ARRAY_UINT32; - memcpy(buf + 1, &cardinality, sizeof(uint32_t)); - roaring_bitmap_to_uint32_array( - r, (uint32_t *)(buf + 1 + sizeof(uint32_t))); + uint32_t card_le = croaring_htole32((uint32_t)cardinality); + memcpy(buf + 1, &card_le, sizeof(uint32_t)); + uint32_t *out = (uint32_t *)(buf + 1 + sizeof(uint32_t)); + roaring_bitmap_to_uint32_array(r, out); +#if CROARING_IS_BIG_ENDIAN + for (uint64_t i = 0; i < cardinality; ++i) { + uint32_t v; + memcpy(&v, out + i, sizeof(uint32_t)); + v = croaring_htole32(v); + memcpy(out + i, &v, sizeof(uint32_t)); + } +#endif return 1 + (size_t)sizeasarray; } } @@ -1580,6 +1589,7 @@ roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf) { uint32_t card; memcpy(&card, bufaschar + 1, sizeof(uint32_t)); + card = croaring_letoh32(card); const uint32_t *elems = (const uint32_t *)(bufaschar + 1 + sizeof(uint32_t)); @@ -1593,6 +1603,7 @@ roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf) { // elems may not be aligned, read with memcpy uint32_t elem; memcpy(&elem, elems + i, sizeof(elem)); + elem = croaring_letoh32(elem); roaring_bitmap_add_bulk(bitmap, &context, elem); } return bitmap; 
@@ -1618,6 +1629,7 @@ roaring_bitmap_t *roaring_bitmap_deserialize_safe(const void *buf, /* This looks like a compressed set of uint32_t elements */ uint32_t card; memcpy(&card, bufaschar + 1, sizeof(uint32_t)); + card = croaring_letoh32(card); // Check the buffer is big enough to contain card uint32_t elements if (maxbytes < 1 + sizeof(uint32_t) + card * sizeof(uint32_t)) { @@ -1636,6 +1648,7 @@ roaring_bitmap_t *roaring_bitmap_deserialize_safe(const void *buf, // elems may not be aligned, read with memcpy uint32_t elem; memcpy((char *)&elem, (char *)(elems + i), sizeof(elem)); + elem = croaring_letoh32(elem); roaring_bitmap_add_bulk(bitmap, &context, elem); } return bitmap; diff --git a/src/roaring64.c b/src/roaring64.c index 9cefecca2..aa0187396 100644 --- a/src/roaring64.c +++ b/src/roaring64.c @@ -2175,7 +2175,8 @@ size_t roaring64_bitmap_portable_serialize(const roaring64_bitmap_t *r, // Write as uint64 the distinct number of "buckets", where a bucket is // defined as the most significant 32 bits of an element. uint64_t high32_count = count_high32(r); - memcpy(buf, &high32_count, sizeof(high32_count)); + uint64_t high32_count_le = croaring_htole64(high32_count); + memcpy(buf, &high32_count_le, sizeof(high32_count_le)); buf += sizeof(high32_count); art_iterator_t it = art_init_iterator((art_t *)&r->art, /*first=*/true); @@ -2190,7 +2191,8 @@ size_t roaring64_bitmap_portable_serialize(const roaring64_bitmap_t *r, if (bitmap32 != NULL) { // Write as uint32 the most significant 32 bits of the // bucket. - memcpy(buf, &prev_high32, sizeof(prev_high32)); + uint32_t prev_high32_le = croaring_htole32(prev_high32); + memcpy(buf, &prev_high32_le, sizeof(prev_high32_le)); buf += sizeof(prev_high32); // Write the 32-bit Roaring bitmaps representing the least @@ -2221,7 +2223,8 @@ size_t roaring64_bitmap_portable_serialize(const roaring64_bitmap_t *r, if (bitmap32 != NULL) { // Write as uint32 the most significant 32 bits of the bucket. 
- memcpy(buf, &prev_high32, sizeof(prev_high32)); + uint32_t prev_high32_le = croaring_htole32(prev_high32); + memcpy(buf, &prev_high32_le, sizeof(prev_high32_le)); buf += sizeof(prev_high32); // Write the 32-bit Roaring bitmaps representing the least @@ -2248,6 +2251,7 @@ size_t roaring64_bitmap_portable_deserialize_size(const char *buf, return 0; } memcpy(&buckets, buf, sizeof(buckets)); + buckets = croaring_letoh64(buckets); buf += sizeof(buckets); read_bytes += sizeof(buckets); @@ -2294,6 +2298,7 @@ roaring64_bitmap_t *roaring64_bitmap_portable_deserialize_safe( return NULL; } memcpy(&buckets, buf, sizeof(buckets)); + buckets = croaring_letoh64(buckets); buf += sizeof(buckets); read_bytes += sizeof(buckets); @@ -2313,6 +2318,7 @@ roaring64_bitmap_t *roaring64_bitmap_portable_deserialize_safe( return NULL; } memcpy(&high32, buf, sizeof(high32)); + high32 = croaring_letoh32(high32); buf += sizeof(high32); read_bytes += sizeof(high32); // High 32 bits must be strictly increasing. diff --git a/src/roaring_array.c b/src/roaring_array.c index 423c4fe61..3b6224e21 100644 --- a/src/roaring_array.c +++ b/src/roaring_array.c @@ -464,15 +464,17 @@ size_t ra_portable_size_in_bytes(const roaring_array_t *ra) { return count; } -// This function is endian-sensitive. +// The portable serialization format is little-endian. On big-endian hosts we +// byte-swap multi-byte fields before writing them to the buffer. 
size_t ra_portable_serialize(const roaring_array_t *ra, char *buf) { char *initbuf = buf; uint32_t startOffset = 0; bool hasrun = ra_has_run_container(ra); if (hasrun) { uint32_t cookie = SERIAL_COOKIE | ((uint32_t)(ra->size - 1) << 16); - memcpy(buf, &cookie, sizeof(cookie)); - buf += sizeof(cookie); + uint32_t cookie_le = croaring_htole32(cookie); + memcpy(buf, &cookie_le, sizeof(cookie_le)); + buf += sizeof(cookie_le); uint32_t s = (ra->size + 7) / 8; memset(buf, 0, s); for (int32_t i = 0; i < ra->size; ++i) { @@ -489,30 +491,34 @@ size_t ra_portable_serialize(const roaring_array_t *ra, char *buf) { } } else { // backwards compatibility uint32_t cookie = SERIAL_COOKIE_NO_RUNCONTAINER; - - memcpy(buf, &cookie, sizeof(cookie)); - buf += sizeof(cookie); - memcpy(buf, &ra->size, sizeof(ra->size)); - buf += sizeof(ra->size); + uint32_t cookie_le = croaring_htole32(cookie); + memcpy(buf, &cookie_le, sizeof(cookie_le)); + buf += sizeof(cookie_le); + uint32_t size_le = croaring_htole32((uint32_t)ra->size); + memcpy(buf, &size_le, sizeof(size_le)); + buf += sizeof(size_le); startOffset = 4 + 4 + 4 * ra->size + 4 * ra->size; } for (int32_t k = 0; k < ra->size; ++k) { - memcpy(buf, &ra->keys[k], sizeof(ra->keys[k])); - buf += sizeof(ra->keys[k]); + uint16_t key_le = croaring_htole16(ra->keys[k]); + memcpy(buf, &key_le, sizeof(key_le)); + buf += sizeof(key_le); // get_cardinality returns a value in [1,1<<16], subtracting one // we get [0,1<<16 - 1] which fits in 16 bits uint16_t card = (uint16_t)(container_get_cardinality(ra->containers[k], ra->typecodes[k]) - 1); - memcpy(buf, &card, sizeof(card)); - buf += sizeof(card); + uint16_t card_le = croaring_htole16(card); + memcpy(buf, &card_le, sizeof(card_le)); + buf += sizeof(card_le); } if ((!hasrun) || (ra->size >= NO_OFFSET_THRESHOLD)) { // writing the containers offsets for (int32_t k = 0; k < ra->size; k++) { - memcpy(buf, &startOffset, sizeof(startOffset)); - buf += sizeof(startOffset); + uint32_t off_le = 
croaring_htole32(startOffset); + memcpy(buf, &off_le, sizeof(off_le)); + buf += sizeof(off_le); startOffset = startOffset + container_size_in_bytes(ra->containers[k], ra->typecodes[k]); @@ -536,6 +542,7 @@ size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) { if (bytestotal > maxbytes) return 0; uint32_t cookie; memcpy(&cookie, buf, sizeof(int32_t)); + cookie = croaring_letoh32(cookie); buf += sizeof(uint32_t); if ((cookie & 0xFFFF) != SERIAL_COOKIE && cookie != SERIAL_COOKIE_NO_RUNCONTAINER) { @@ -548,7 +555,9 @@ size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) { else { bytestotal += sizeof(int32_t); if (bytestotal > maxbytes) return 0; - memcpy(&size, buf, sizeof(int32_t)); + uint32_t size_le; + memcpy(&size_le, buf, sizeof(int32_t)); + size = (int32_t)croaring_letoh32(size_le); buf += sizeof(uint32_t); } if (size > (1 << 16) || size < 0) { @@ -577,6 +586,7 @@ size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) { for (int32_t k = 0; k < size; ++k) { uint16_t tmp; memcpy(&tmp, keyscards + 4 * k + 2, sizeof(tmp)); + tmp = croaring_letoh16(tmp); uint32_t thiscard = tmp + 1; bool isbitmap = (thiscard > DEFAULT_MAX_SIZE); bool isrun = false; @@ -597,6 +607,7 @@ size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) { if (bytestotal > maxbytes) return 0; uint16_t n_runs; memcpy(&n_runs, buf, sizeof(uint16_t)); + n_runs = croaring_letoh16(n_runs); buf += sizeof(uint16_t); size_t containersize = n_runs * sizeof(rle16_t); bytestotal += containersize; @@ -617,7 +628,8 @@ size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) { // cannot be found. If it returns true, readbytes is populated by how many bytes // were read, we have that *readbytes <= maxbytes. // -// This function is endian-sensitive. +// The portable serialization format is little-endian. On big-endian hosts we +// byte-swap multi-byte fields after reading them from the buffer. 
bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const size_t maxbytes, size_t *readbytes) { *readbytes = sizeof(int32_t); // for cookie @@ -627,6 +639,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, } uint32_t cookie; memcpy(&cookie, buf, sizeof(int32_t)); + cookie = croaring_letoh32(cookie); buf += sizeof(uint32_t); if ((cookie & 0xFFFF) != SERIAL_COOKIE && cookie != SERIAL_COOKIE_NO_RUNCONTAINER) { @@ -643,7 +656,9 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, // Ran out of bytes while reading second part of the cookie. return false; } - memcpy(&size, buf, sizeof(int32_t)); + uint32_t size_le; + memcpy(&size_le, buf, sizeof(int32_t)); + size = (int32_t)croaring_letoh32(size_le); buf += sizeof(uint32_t); } if (size < 0) { @@ -685,7 +700,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, for (int32_t k = 0; k < size; ++k) { uint16_t tmp; memcpy(&tmp, keyscards + 4 * k, sizeof(tmp)); - answer->keys[k] = tmp; + answer->keys[k] = croaring_letoh16(tmp); } if ((!hasrun) || (size >= NO_OFFSET_THRESHOLD)) { *readbytes += size * 4; @@ -703,6 +718,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, for (int32_t k = 0; k < size; ++k) { uint16_t tmp; memcpy(&tmp, keyscards + 4 * k + 2, sizeof(tmp)); + tmp = croaring_letoh16(tmp); uint32_t thiscard = tmp + 1; bool isbitmap = (thiscard > DEFAULT_MAX_SIZE); bool isrun = false; @@ -746,6 +762,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, } uint16_t n_runs; memcpy(&n_runs, buf, sizeof(uint16_t)); + n_runs = croaring_letoh16(n_runs); size_t containersize = n_runs * sizeof(rle16_t); *readbytes += containersize; if (*readbytes > maxbytes) { // data is corrupted? 
diff --git a/tests/cpp_example2.cpp b/tests/cpp_example2.cpp index 02f580843..0d97003bc 100644 --- a/tests/cpp_example2.cpp +++ b/tests/cpp_example2.cpp @@ -66,10 +66,6 @@ int main() { // we can compute intersection two-by-two Roaring i1_2 = r1 & r2; -#if CROARING_IS_BIG_ENDIAN - printf( - "We omit serialization tests because you have a big endian system.\n"); -#else // we can write a bitmap to a pointer and recover it later uint32_t expectedsize = r1.getSizeInBytes(); char *serializedbytes = new char[expectedsize]; @@ -97,7 +93,6 @@ int main() { ++counter; } // counter == t.cardinality() -#endif // we can move iterators to skip values const uint32_t manyvalues[] = {2, 3, 4, 7, 8}; Roaring rogue(5, manyvalues); diff --git a/tests/cpp_unit.cpp b/tests/cpp_unit.cpp index dde4f8b09..1197d5450 100644 --- a/tests/cpp_unit.cpp +++ b/tests/cpp_unit.cpp @@ -672,11 +672,9 @@ DEFINE_TEST(test_example_cpp_true) { test_example_cpp(true); } DEFINE_TEST(test_example_cpp_false) { test_example_cpp(false); } -#if !CROARING_IS_BIG_ENDIAN DEFINE_TEST(test_example_cpp_64_true) { test_example_cpp_64(true); } DEFINE_TEST(test_example_cpp_64_false) { test_example_cpp_64(false); } -#endif DEFINE_TEST(test_run_compression_cpp_64_true) { test_run_compression_cpp_64(true); @@ -2104,10 +2102,6 @@ DEFINE_TEST(test_cpp_remove_run_compression) { // Returns true on success, false on exception. 
bool test64Deserialize(const std::string &filename) { -#if CROARING_IS_BIG_ENDIAN - (void)filename; - printf("Big-endian IO unsupported.\n"); -#else // CROARING_IS_BIG_ENDIAN std::ifstream in(TEST_DATA_DIR + filename, std::ios::binary); std::vector buf1(std::istreambuf_iterator(in), {}); printf("Reading %lu bytes\n", (unsigned long)buf1.size()); @@ -2127,7 +2121,6 @@ bool test64Deserialize(const std::string &filename) { for (size_t i = 0; i < buf1.size(); ++i) { assert_true(buf1[i] == buf2[i]); } -#endif // CROARING_IS_BIG_ENDIAN return true; } @@ -2231,14 +2224,12 @@ int main() { cmocka_unit_test(test_bitmap_of_32), cmocka_unit_test(test_bitmap_of_64), cmocka_unit_test(serial_test), -#if !CROARING_IS_BIG_ENDIAN cmocka_unit_test(test_example_true), cmocka_unit_test(test_example_false), cmocka_unit_test(test_example_cpp_true), cmocka_unit_test(test_example_cpp_false), cmocka_unit_test(test_example_cpp_64_true), cmocka_unit_test(test_example_cpp_64_false), -#endif cmocka_unit_test(test_cpp_add_remove_checked), cmocka_unit_test(test_cpp_add_remove_checked_64), cmocka_unit_test(test_cpp_add_range), @@ -2281,7 +2272,6 @@ int main() { cmocka_unit_test(test_cpp_flip_64), cmocka_unit_test(test_cpp_flip_closed_64), cmocka_unit_test(test_combinatoric_flip_many_64), -#if !CROARING_IS_BIG_ENDIAN cmocka_unit_test(test_cpp_deserialize_64_empty), cmocka_unit_test(test_cpp_deserialize_64_32bit_vals), cmocka_unit_test(test_cpp_deserialize_64_spread_vals), @@ -2293,7 +2283,6 @@ int main() { cmocka_unit_test(test_cpp_deserialize_64_invalid_size), cmocka_unit_test(test_cpp_deserialize_64_key_too_small), #endif -#endif // !CROARING_IS_BIG_ENDIAN cmocka_unit_test(issue316), cmocka_unit_test(test_issue304), cmocka_unit_test(issue_336), diff --git a/tests/format_portability_unit.c b/tests/format_portability_unit.c index 8d26d3c9d..eb37c4eef 100644 --- a/tests/format_portability_unit.c +++ b/tests/format_portability_unit.c @@ -72,9 +72,7 @@ static inline void test_deserialize(char* 
filename) { free(input_buffer); roaring_bitmap_free(bitmap); } -#if CROARING_IS_BIG_ENDIAN -// port the test below. -#else + DEFINE_TEST(test_deserialize_portable_norun) { char filename[1024]; @@ -92,18 +90,12 @@ DEFINE_TEST(test_deserialize_portable_wrun) { test_deserialize(filename); } -#endif int main() { tellmeall(); -#if CROARING_IS_BIG_ENDIAN - printf("Big-endian IO unsupported.\n"); - return EXIT_SUCCESS; -#else const struct CMUnitTest tests[] = { cmocka_unit_test(test_deserialize_portable_norun), cmocka_unit_test(test_deserialize_portable_wrun), }; return cmocka_run_group_tests(tests, NULL, NULL); -#endif } diff --git a/tests/realdata_unit.c b/tests/realdata_unit.c index 03fef944e..7e8ecf32e 100644 --- a/tests/realdata_unit.c +++ b/tests/realdata_unit.c @@ -43,10 +43,6 @@ const char *datadir[] = { "weather_sept_85_srt", "wikileaks-noquotes", "wikileaks-noquotes_srt"}; bool serialize_correctly(roaring_bitmap_t *r) { -#if CROARING_IS_BIG_ENDIAN - (void)r; - return r; -#else uint32_t expectedsize = roaring_bitmap_portable_size_in_bytes(r); char *serialized = (char *)malloc(expectedsize); if (serialized == NULL) { @@ -73,7 +69,6 @@ bool serialize_correctly(roaring_bitmap_t *r) { } roaring_bitmap_free(r2); return true; -#endif } // arrays expected to both be sorted. 
diff --git a/tests/roaring64_serialization.cpp b/tests/roaring64_serialization.cpp index f7518365a..2850e9522 100644 --- a/tests/roaring64_serialization.cpp +++ b/tests/roaring64_serialization.cpp @@ -178,10 +178,6 @@ DEFINE_TEST(test_64deseroverlappingupper32) { } // namespace int main() { -#if CROARING_IS_BIG_ENDIAN - printf("Big-endian IO is unsupported.\n"); - return 0; -#else const struct CMUnitTest tests[] = { cmocka_unit_test(test_64map32bitvals), cmocka_unit_test(test_64mapempty), @@ -195,5 +191,4 @@ int main() { cmocka_unit_test(test_64deseroverlappingupper32), }; return cmocka_run_group_tests(tests, NULL, NULL); -#endif // CROARING_IS_BIG_ENDIAN } diff --git a/tests/robust_deserialization_unit.c b/tests/robust_deserialization_unit.c index bb695cb58..ff4f543f5 100644 --- a/tests/robust_deserialization_unit.c +++ b/tests/robust_deserialization_unit.c @@ -388,10 +388,6 @@ DEFINE_TEST(deserialize_bitset_incorrect_cardinality) { int main() { tellmeall(); -#if CROARING_IS_BIG_ENDIAN - printf("Big-endian IO unsupported.\n"); - return EXIT_SUCCESS; -#else const struct CMUnitTest tests[] = { cmocka_unit_test(test_robust_deserialize1), cmocka_unit_test(test_robust_deserialize2), @@ -416,5 +412,4 @@ int main() { }; return cmocka_run_group_tests(tests, NULL, NULL); -#endif } diff --git a/tests/toplevel_unit.c b/tests/toplevel_unit.c index b5ce09dc8..58907f729 100644 --- a/tests/toplevel_unit.c +++ b/tests/toplevel_unit.c @@ -219,7 +219,6 @@ DEFINE_TEST(is_really_empty) { roaring_bitmap_free(bm); } -#if !CROARING_IS_BIG_ENDIAN // https://github.com/Ezibenroc/PyRoaringBitMap/issues/124 DEFINE_TEST(PyRoaringBitMap124) { // adversarial test case @@ -238,10 +237,15 @@ DEFINE_TEST(PyRoaringBitMap124) { roaring_bitmap_internal_validate(deserialized_bitmap, &reason_failure)); roaring_bitmap_free(deserialized_bitmap); +#if !CROARING_IS_BIG_ENDIAN + // The frozen format is intentionally non-portable (native byte order). 
+ // On big-endian hosts the input here (little-endian) is interpreted in + // native byte order, so whether it passes the frozen view validation is + // platform-dependent; we only assert the little-endian behavior. const roaring_bitmap_t *r2 = roaring_bitmap_frozen_view(data, length); assert_true(r2 == NULL); -} #endif +} DEFINE_TEST(inplaceorwide) { uint64_t end = 4294901761; @@ -307,7 +311,6 @@ bool check_serialization(roaring_bitmap_t *bitmap) { return ret; } -#if !CROARING_IS_BIG_ENDIAN DEFINE_TEST(issue245) { roaring_bitmap_t *bitmap = roaring_bitmap_create(); const uint32_t targetEntries = 2048; @@ -336,7 +339,6 @@ DEFINE_TEST(issue245) { } roaring_bitmap_free(bitmap); } -#endif DEFINE_TEST(issue208) { roaring_bitmap_t *r = roaring_bitmap_create(); @@ -5517,10 +5519,8 @@ int main() { cmocka_unit_test(test_contains_range_PyRoaringBitMap_issue81), cmocka_unit_test(issue316), cmocka_unit_test(issue288), -#if !CROARING_IS_BIG_ENDIAN cmocka_unit_test(PyRoaringBitMap124), cmocka_unit_test(issue245), -#endif cmocka_unit_test(issue208), cmocka_unit_test(issue208b), cmocka_unit_test(range_contains), @@ -5538,10 +5538,8 @@ int main() { cmocka_unit_test(test_stress_memory_false), cmocka_unit_test(check_interval), cmocka_unit_test(test_uint32_iterator_true), -#if !CROARING_IS_BIG_ENDIAN cmocka_unit_test(test_example_true), cmocka_unit_test(test_example_false), -#endif cmocka_unit_test(test_clear), cmocka_unit_test(can_copy_empty_true), cmocka_unit_test(can_copy_empty_false), @@ -5573,10 +5571,8 @@ int main() { cmocka_unit_test(test_iterate_empty), cmocka_unit_test(test_iterate_withbitmap), cmocka_unit_test(test_iterate_withrun), -#if !CROARING_IS_BIG_ENDIAN cmocka_unit_test(test_serialize), cmocka_unit_test(test_portable_serialize), -#endif cmocka_unit_test(test_add), cmocka_unit_test(test_add_checked), cmocka_unit_test(test_remove_checked), @@ -5674,14 +5670,12 @@ int main() { cmocka_unit_test(test_remove_range), cmocka_unit_test(test_remove_many), cmocka_unit_test(test_range_cardinality), -#if
!CROARING_IS_BIG_ENDIAN cmocka_unit_test(test_frozen_serialization), cmocka_unit_test(test_frozen_serialization_max_containers), #if ROARING_UNSAFE_FROZEN_TESTS cmocka_unit_test(test_portable_deserialize_frozen), #endif // ROARING_UNSAFE_FROZEN_TESTS cmocka_unit_test(issue_15jan2024), -#endif }; return cmocka_run_group_tests(tests, NULL, NULL); From 4a62a0a1b3bcf31229b85a9a47e5a3397bfb71ce Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Wed, 20 May 2026 16:05:25 -0400 Subject: [PATCH 2/2] lint --- cpp/roaring/roaring.hh | 2 +- include/roaring/portability.h | 4 ++-- include/roaring/roaring.h | 6 +++--- include/roaring/roaring64.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/roaring/roaring.hh b/cpp/roaring/roaring.hh index b5a4c57f2..1e2e9bd71 100644 --- a/cpp/roaring/roaring.hh +++ b/cpp/roaring/roaring.hh @@ -282,7 +282,7 @@ class Roaring { return api::roaring_bitmap_remove_range_closed(&roaring, min, max); } - /** + /** * Keep only values in the half-open interval [min, max). * Equivalent to two consecutive removeRange calls. 
*/ diff --git a/include/roaring/portability.h b/include/roaring/portability.h index 8ec422630..08c3397a2 100644 --- a/include/roaring/portability.h +++ b/include/roaring/portability.h @@ -526,13 +526,13 @@ static inline uint64_t croaring_bswap64(uint64_t x) { #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_CPP #endif //__has_include() #else - // We lack __has_include to check: +// We lack __has_include to check: #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_CPP #endif //__has_include #elif __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__) #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_C #elif CROARING_REGULAR_VISUAL_STUDIO - // https://www.technetworkhub.com/c11-atomics-in-visual-studio-2022-version-17/ +// https://www.technetworkhub.com/c11-atomics-in-visual-studio-2022-version-17/ #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_C_WINDOWS #endif #endif // !defined(CROARING_ATOMIC_IMPL) diff --git a/include/roaring/roaring.h b/include/roaring/roaring.h index 9d651ff44..244ab72a1 100644 --- a/include/roaring/roaring.h +++ b/include/roaring/roaring.h @@ -749,7 +749,7 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf, * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. It is not a bug, it is by design, + * compatible with little-endian systems. It is not a bug, it is by design, * since the format imitates C memory layout of roaring_bitmap_t. * * The returned pointer may be NULL in case of errors. @@ -820,7 +820,7 @@ size_t roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t *r); * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. This is not a bug, it is by design, + * compatible with little-endian systems. 
This is not a bug, it is by design, *since the format imitates C memory layout * * When serializing data to a file, we recommend that you also use @@ -842,7 +842,7 @@ void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *r, char *buf); * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. This is not a bug, it is by design, + * compatible with little-endian systems. This is not a bug, it is by design, *since the format imitates C memory layout of roaring_bitmap_t. */ const roaring_bitmap_t *roaring_bitmap_frozen_view(const char *buf, diff --git a/include/roaring/roaring64.h b/include/roaring/roaring64.h index 0838c6fa1..193ce8d84 100644 --- a/include/roaring/roaring64.h +++ b/include/roaring/roaring64.h @@ -655,7 +655,7 @@ size_t roaring64_bitmap_frozen_size_in_bytes(const roaring64_bitmap_t *r); * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. This is not a bug, it is by design, + * compatible with little-endian systems. This is not a bug, it is by design, * since the format imitates C memory layout of roaring64_bitmap_t. */ size_t roaring64_bitmap_frozen_serialize(const roaring64_bitmap_t *r, @@ -674,7 +674,7 @@ size_t roaring64_bitmap_frozen_serialize(const roaring64_bitmap_t *r, * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not - * compatible with little-endian systems. This is not a bug, it is by design, + * compatible with little-endian systems. This is not a bug, it is by design, * since the format imitates C memory layout of roaring64_bitmap_t. */ roaring64_bitmap_t *roaring64_bitmap_frozen_view(const char *buf,