diff options
author | robot-piglet <robot-piglet@yandex-team.com> | 2024-10-05 22:20:55 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2024-10-05 22:32:53 +0300 |
commit | 2eec23b402e467c4997cae5f4fea19abad3195a6 (patch) | |
tree | 7bd2721e854318523c28c0915c235407f78cdfef /contrib | |
parent | c3797467c201a8de261c9da5475b2386197cb4b0 (diff) | |
download | ydb-2eec23b402e467c4997cae5f4fea19abad3195a6.tar.gz |
Intermediate changes
commit_hash:89e9af18f838d98d46029c1a50881e032a4efa72
Diffstat (limited to 'contrib')
-rw-r--r-- | contrib/libs/croaring/README.md | 15 | ||||
-rw-r--r-- | contrib/libs/croaring/include/roaring/roaring.h | 23 | ||||
-rw-r--r-- | contrib/libs/croaring/include/roaring/roaring64.h | 19 | ||||
-rw-r--r-- | contrib/libs/croaring/include/roaring/roaring_version.h | 4 | ||||
-rw-r--r-- | contrib/libs/croaring/src/bitset_util.c | 31 | ||||
-rw-r--r-- | contrib/libs/croaring/src/roaring.c | 2 | ||||
-rw-r--r-- | contrib/libs/croaring/src/roaring64.c | 25 | ||||
-rw-r--r-- | contrib/libs/croaring/ya.make | 5 |
8 files changed, 88 insertions, 36 deletions
diff --git a/contrib/libs/croaring/README.md b/contrib/libs/croaring/README.md index f582d31154..eb5ee92752 100644 --- a/contrib/libs/croaring/README.md +++ b/contrib/libs/croaring/README.md @@ -410,10 +410,17 @@ int main() { // we can write a bitmap to a pointer and recover it later uint32_t expectedsize = roaring_bitmap_portable_size_in_bytes(r1); char *serializedbytes = malloc(expectedsize); + // When serializing data to a file, we recommend that you also use + // checksums so that, at deserialization, you can be confident + // that you are recovering the correct data. roaring_bitmap_portable_serialize(r1, serializedbytes); // Note: it is expected that the input follows the specification // https://github.com/RoaringBitmap/RoaringFormatSpec // otherwise the result may be unusable. + // The 'roaring_bitmap_portable_deserialize_safe' function will not read + // beyond expectedsize bytes. + // We recommend you further use checksums to make sure that the input is from + // serialized data. roaring_bitmap_t *t = roaring_bitmap_portable_deserialize_safe(serializedbytes, expectedsize); if(t == NULL) { return EXIT_FAILURE; } const char *reason = NULL; @@ -428,7 +435,11 @@ int main() { roaring_bitmap_portable_deserialize_size(serializedbytes, expectedsize); assert(sizeofbitmap == expectedsize); // sizeofbitmap would be zero if no bitmap were found - // we can also read the bitmap "safely" by specifying a byte size limit: + // We can also read the bitmap "safely" by specifying a byte size limit. + // The 'roaring_bitmap_portable_deserialize_safe' function will not read + // beyond expectedsize bytes. + // We recommend you further use checksums to make sure that the input is from + // serialized data. t = roaring_bitmap_portable_deserialize_safe(serializedbytes, expectedsize); if(t == NULL) { printf("Problem during deserialization.\n"); @@ -500,6 +511,8 @@ We also support efficient 64-bit compressed bitmaps in C: roaring64_bitmap_free(r2); ``` +The API is similar to the conventional 32-bit bitmaps. Please see +the header file `roaring64.h` (compare with `roaring.h`). # Conventional bitsets (C) diff --git a/contrib/libs/croaring/include/roaring/roaring.h b/contrib/libs/croaring/include/roaring/roaring.h index 295bc3cb33..4256ad0ca5 100644 --- a/contrib/libs/croaring/include/roaring/roaring.h +++ b/contrib/libs/croaring/include/roaring/roaring.h @@ -552,6 +552,10 @@ size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r); * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not * compatible with little-endian systems. + * + * When serializing data to a file, we recommend that you also use + * checksums so that, at deserialization, you can be confident + * that you are recovering the correct data. */ size_t roaring_bitmap_serialize(const roaring_bitmap_t *r, char *buf); @@ -615,7 +619,10 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf); * https://github.com/RoaringBitmap/RoaringFormatSpec * * The function itself is safe in the sense that it will not cause buffer - * overflows. However, for correct operations, it is assumed that the bitmap + * overflows: it will not read beyond the scope of the provided buffer + * (buf,maxbytes). + * + * However, for correct operations, it is assumed that the bitmap * read was once serialized from a valid bitmap (i.e., it follows the format * specification). If you provided an incorrect input (garbage), then the bitmap * read may not be in a valid state and following operations may not lead to @@ -625,8 +632,10 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf); * but not for random inputs. * * You may use roaring_bitmap_internal_validate to check the validity of the - * bitmap prior to using it. You may also use other strategies to check for - * corrupted inputs (e.g., checksums). + * bitmap prior to using it. + * + * We recommend that you use checksums to check that serialized data corresponds + * to a serialized bitmap. * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not @@ -688,6 +697,10 @@ size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *r); * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not * compatible with little-endian systems. + * + * When serializing data to a file, we recommend that you also use + * checksums so that, at deserialization, you can be confident + * that you are recovering the correct data. */ size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *r, char *buf); @@ -722,6 +735,10 @@ size_t roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t *r); * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not * compatible with little-endian systems. + * + * When serializing data to a file, we recommend that you also use + * checksums so that, at deserialization, you can be confident + * that you are recovering the correct data. */ void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *r, char *buf); diff --git a/contrib/libs/croaring/include/roaring/roaring64.h b/contrib/libs/croaring/include/roaring/roaring64.h index fd89feb5e0..c1f574d605 100644 --- a/contrib/libs/croaring/include/roaring/roaring64.h +++ b/contrib/libs/croaring/include/roaring/roaring64.h @@ -1,13 +1,13 @@ #ifndef ROARING64_H #define ROARING64_H -#include <roaring.h> #include <stdbool.h> #include <stddef.h> #include <stdint.h> #include <roaring/memory.h> #include <roaring/portability.h> +#include <roaring/roaring.h> #include <roaring/roaring_types.h> #ifdef __cplusplus @@ -511,6 +511,10 @@ size_t roaring64_bitmap_portable_size_in_bytes(const roaring64_bitmap_t *r); * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not * compatible with little-endian systems. + * + * When serializing data to a file, we recommend that you also use + * checksums so that, at deserialization, you can be confident + * that you are recovering the correct data. */ size_t roaring64_bitmap_portable_serialize(const roaring64_bitmap_t *r, char *buf); @@ -525,14 +529,17 @@ size_t roaring64_bitmap_portable_deserialize_size(const char *buf, size_t maxbytes); /** - * Read a bitmap from a serialized buffer safely (reading up to maxbytes). + * Read a bitmap from a serialized buffer (reading up to maxbytes). * In case of failure, NULL is returned. * * This is meant to be compatible with other languages * https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations * * The function itself is safe in the sense that it will not cause buffer - * overflows. However, for correct operations, it is assumed that the bitmap + * overflows: it will not read beyond the scope of the provided buffer + * (buf,maxbytes). + * + * However, for correct operations, it is assumed that the bitmap * read was once serialized from a valid bitmap (i.e., it follows the format * specification). If you provided an incorrect input (garbage), then the bitmap * read may not be in a valid state and following operations may not lead to @@ -541,6 +548,12 @@ size_t roaring64_bitmap_portable_deserialize_size(const char *buf, * order. This is is guaranteed to happen when serializing an existing bitmap, * but not for random inputs. * + * You may use roaring64_bitmap_internal_validate to check the validity of the + * bitmap prior to using it. + * + * We recommend that you use checksums to check that serialized data corresponds + * to a serialized bitmap. + * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not * compatible with little-endian systems. diff --git a/contrib/libs/croaring/include/roaring/roaring_version.h b/contrib/libs/croaring/include/roaring/roaring_version.h index 0d136bbbd3..d40681f4f6 100644 --- a/contrib/libs/croaring/include/roaring/roaring_version.h +++ b/contrib/libs/croaring/include/roaring/roaring_version.h @@ -2,11 +2,11 @@ // /include/roaring/roaring_version.h automatically generated by release.py, do not change by hand #ifndef ROARING_INCLUDE_ROARING_VERSION #define ROARING_INCLUDE_ROARING_VERSION -#define ROARING_VERSION "4.1.2" +#define ROARING_VERSION "4.1.6" enum { ROARING_VERSION_MAJOR = 4, ROARING_VERSION_MINOR = 1, - ROARING_VERSION_REVISION = 2 + ROARING_VERSION_REVISION = 6 }; #endif // ROARING_INCLUDE_ROARING_VERSION // clang-format on
\ No newline at end of file diff --git a/contrib/libs/croaring/src/bitset_util.c b/contrib/libs/croaring/src/bitset_util.c index 0ae7d92582..6bc12b44b4 100644 --- a/contrib/libs/croaring/src/bitset_util.c +++ b/contrib/libs/croaring/src/bitset_util.c @@ -613,16 +613,13 @@ size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length, for (; (i < length) && (out < safeout); ++i) { uint64_t w = words[i]; while ((w != 0) && (out < safeout)) { - uint64_t t = - w & (~w + 1); // on x64, should compile to BLSI (careful: the - // Intel compiler seems to fail) int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT uint32_t val = r + base; memcpy(out, &val, sizeof(uint32_t)); // should be compiled as a MOV on x64 out++; - w ^= t; + w &= (w - 1); } base += 64; } @@ -667,15 +664,12 @@ size_t bitset_extract_setbits_avx512_uint16(const uint64_t *array, for (; (i < length) && (out < safeout); ++i) { uint64_t w = array[i]; while ((w != 0) && (out < safeout)) { - uint64_t t = - w & (~w + 1); // on x64, should compile to BLSI (careful: the - // Intel compiler seems to fail) int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT uint32_t val = r + base; memcpy(out, &val, sizeof(uint16_t)); out++; - w ^= t; + w &= (w - 1); } base += 64; } @@ -725,16 +719,13 @@ size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length, for (; (i < length) && (out < safeout); ++i) { uint64_t w = words[i]; while ((w != 0) && (out < safeout)) { - uint64_t t = - w & (~w + 1); // on x64, should compile to BLSI (careful: the - // Intel compiler seems to fail) int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT uint32_t val = r + base; memcpy(out, &val, sizeof(uint32_t)); // should be compiled as a MOV on x64 out++; - w ^= t; + w &= (w - 1); } base += 64; } @@ -749,16 +740,13 @@ size_t bitset_extract_setbits(const uint64_t *words, size_t length, for (size_t i = 0; i < length; ++i) { uint64_t w = words[i]; while (w != 0) { - uint64_t t = - w & (~w + 1); // on x64, should compile to BLSI (careful: the - // Intel compiler seems to fail) int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT uint32_t val = r + base; memcpy(out + outpos, &val, sizeof(uint32_t)); // should be compiled as a MOV on x64 outpos++; - w ^= t; + w &= (w - 1); } base += 64; } @@ -772,10 +760,9 @@ size_t bitset_extract_intersection_setbits_uint16( for (size_t i = 0; i < length; ++i) { uint64_t w = words1[i] & words2[i]; while (w != 0) { - uint64_t t = w & (~w + 1); int r = roaring_trailing_zeroes(w); out[outpos++] = (uint16_t)(r + base); - w ^= t; + w &= (w - 1); } base += 64; } @@ -836,11 +823,10 @@ size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length, for (; (i < length) && (out < safeout); ++i) { uint64_t w = words[i]; while ((w != 0) && (out < safeout)) { - uint64_t t = w & (~w + 1); int r = roaring_trailing_zeroes(w); *out = (uint16_t)(r + base); out++; - w ^= t; + w &= (w - 1); } base += 64; } @@ -864,10 +850,9 @@ size_t bitset_extract_setbits_uint16(const uint64_t *words, size_t length, for (size_t i = 0; i < length; ++i) { uint64_t w = words[i]; while (w != 0) { - uint64_t t = w & (~w + 1); int r = roaring_trailing_zeroes(w); out[outpos++] = (uint16_t)(r + base); - w ^= t; + w &= (w - 1); } base += 64; } @@ -1158,4 +1143,4 @@ void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length) { #endif #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop -#endif
\ No newline at end of file +#endif diff --git a/contrib/libs/croaring/src/roaring.c b/contrib/libs/croaring/src/roaring.c index e3847bae92..5a71fd39c3 100644 --- a/contrib/libs/croaring/src/roaring.c +++ b/contrib/libs/croaring/src/roaring.c @@ -3319,7 +3319,7 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize_frozen(const char *buf) { bool roaring_bitmap_to_bitset(const roaring_bitmap_t *r, bitset_t *bitset) { uint32_t max_value = roaring_bitmap_maximum(r); - size_t new_array_size = (size_t)(((uint64_t)max_value + 63) / 64); + size_t new_array_size = (size_t)(max_value / 64 + 1); bool resize_ok = bitset_resize(bitset, new_array_size, true); if (!resize_ok) { return false; diff --git a/contrib/libs/croaring/src/roaring64.c b/contrib/libs/croaring/src/roaring64.c index e63d3d965c..914faefe0b 100644 --- a/contrib/libs/croaring/src/roaring64.c +++ b/contrib/libs/croaring/src/roaring64.c @@ -1954,6 +1954,7 @@ roaring64_bitmap_t *roaring64_bitmap_portable_deserialize_safe( roaring64_bitmap_t *r = roaring64_bitmap_create(); // Iterate through buckets ordered by increasing keys. + int64_t previous_high32 = -1; for (uint64_t bucket = 0; bucket < buckets; ++bucket) { // Read as uint32 the most significant 32 bits of the bucket. uint32_t high32; @@ -1964,6 +1965,12 @@ roaring64_bitmap_t *roaring64_bitmap_portable_deserialize_safe( memcpy(&high32, buf, sizeof(high32)); buf += sizeof(high32); read_bytes += sizeof(high32); + // High 32 bits must be strictly increasing. + if (high32 <= previous_high32) { + roaring64_bitmap_free(r); + return NULL; + } + previous_high32 = high32; // Read the 32-bit Roaring bitmaps representing the least significant // bits of a set of elements. @@ -1983,6 +1990,24 @@ roaring64_bitmap_t *roaring64_bitmap_portable_deserialize_safe( buf += bitmap32_size; read_bytes += bitmap32_size; + // While we don't attempt to validate much, we must ensure that there + // is no duplication in the high 48 bits - inserting into the ART + // assumes (or UB) no duplicate keys. The top 32 bits must be unique + // because we check for strict increasing values of high32, but we + // must also ensure the top 16 bits within each 32-bit bitmap are also + // at least unique (we ensure they're strictly increasing as well, + // which they must be for a _valid_ bitmap, since it's cheaper to check) + int32_t last_bitmap_key = -1; + for (int i = 0; i < bitmap32->high_low_container.size; i++) { + uint16_t key = bitmap32->high_low_container.keys[i]; + if (key <= last_bitmap_key) { + roaring_bitmap_free(bitmap32); + roaring64_bitmap_free(r); + return NULL; + } + last_bitmap_key = key; + } + // Insert all containers of the 32-bit bitmap into the 64-bit bitmap. move_from_roaring32_offset(r, bitmap32, high32); roaring_bitmap_free(bitmap32); diff --git a/contrib/libs/croaring/ya.make b/contrib/libs/croaring/ya.make index 55acab5673..c180e97069 100644 --- a/contrib/libs/croaring/ya.make +++ b/contrib/libs/croaring/ya.make @@ -10,13 +10,12 @@ LICENSE( LICENSE_TEXTS(.yandex_meta/licenses.list.txt) -VERSION(4.1.2) +VERSION(4.1.6) -ORIGINAL_SOURCE(https://github.com/RoaringBitmap/CRoaring/archive/v4.1.2.tar.gz) +ORIGINAL_SOURCE(https://github.com/RoaringBitmap/CRoaring/archive/v4.1.6.tar.gz) ADDINCL( GLOBAL contrib/libs/croaring/include - contrib/libs/croaring/include/roaring ) NO_COMPILER_WARNINGS() |