diff options
author | Ivan Blinkov <ivan@blinkov.ru> | 2022-02-10 16:47:10 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:47:10 +0300 |
commit | 1aeb9a455974457866f78722ad98114bafc84e8a (patch) | |
tree | e4340eaf1668684d83a0a58c36947c5def5350ad /contrib/libs/hyperscan/src/rose/rose_build_long_lit.cpp | |
parent | bd5ef432f5cfb1e18851381329d94665a4c22470 (diff) | |
download | ydb-1aeb9a455974457866f78722ad98114bafc84e8a.tar.gz |
Restoring authorship annotation for Ivan Blinkov <ivan@blinkov.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/rose/rose_build_long_lit.cpp')
-rw-r--r-- | contrib/libs/hyperscan/src/rose/rose_build_long_lit.cpp | 904 |
1 files changed, 452 insertions, 452 deletions
diff --git a/contrib/libs/hyperscan/src/rose/rose_build_long_lit.cpp b/contrib/libs/hyperscan/src/rose/rose_build_long_lit.cpp index 45a2eb270d..f9c3b0f840 100644 --- a/contrib/libs/hyperscan/src/rose/rose_build_long_lit.cpp +++ b/contrib/libs/hyperscan/src/rose/rose_build_long_lit.cpp @@ -1,452 +1,452 @@ -/* - * Copyright (c) 2016-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "rose_build_long_lit.h" - -#include "rose_build_engine_blob.h" -#include "rose_build_impl.h" -#include "stream_long_lit_hash.h" -#include "util/bytecode_ptr.h" -#include "util/bitutils.h" -#include "util/verify_types.h" -#include "util/compile_context.h" - -#include <algorithm> -#include <numeric> - -using namespace std; - -namespace ue2 { - -/** \brief Minimum size for a non-empty hash table. Must be a power of two. */ -static constexpr size_t MIN_HASH_TABLE_SIZE = 128; - -/** \brief Maximum load factor (between zero and one) for a hash table. */ -static constexpr double MAX_HASH_TABLE_LOAD = 0.7; - -/** \brief Minimum size (in bits) for a bloom filter. Must be a power of two. */ -static constexpr u32 MIN_BLOOM_FILTER_SIZE = 256; - -/** \brief Maximum load factor (between zero and one) for a bloom filter. */ -static constexpr double MAX_BLOOM_FILTER_LOAD = 0.25; - -struct LongLitModeInfo { - u32 num_literals = 0; //!< Number of strings for this mode. - u32 hashed_positions = 0; //!< Number of hashable string positions. -}; - -struct LongLitInfo { - LongLitModeInfo caseful; - LongLitModeInfo nocase; -}; - -static -u32 roundUpToPowerOfTwo(u32 x) { - assert(x != 0); - u32 bits = lg2(x - 1) + 1; - assert(bits < 32); - return 1U << bits; -} - -static -LongLitInfo analyzeLongLits(const vector<ue2_case_string> &lits, - size_t max_len) { - LongLitInfo info; - - for (const auto &lit : lits) { - auto &lit_info = lit.nocase ? info.nocase : info.caseful; - assert(lit.s.size() > max_len); - lit_info.num_literals++; - lit_info.hashed_positions += lit.s.size() - max_len; - } - - DEBUG_PRINTF("case: hashed %u positions\n", info.caseful.hashed_positions); - DEBUG_PRINTF("nocase: hashed %u positions\n", info.nocase.hashed_positions); - - return info; -} - -static -void addToBloomFilter(vector<u8> &bloom, const u8 *substr, bool nocase) { - const u32 num_keys = verify_u32(bloom.size() * 8); - const u32 key_mask = (1U << lg2(num_keys)) -1; - - const auto hash_functions = { bloomHash_1, bloomHash_2, bloomHash_3 }; - for (const auto &hash_func : hash_functions) { - u32 hash = hash_func(substr, nocase); - u32 key = hash & key_mask; - DEBUG_PRINTF("set key %u (of %zu)\n", key, bloom.size() * 8); - bloom[key / 8] |= 1U << (key % 8); - } -} - -static -size_t bloomOccupancy(const vector<u8> &bloom) { - return accumulate(begin(bloom), end(bloom), 0, - [](const size_t &sum, const u8 &elem) { - return sum + popcount32(elem); - }); -} - -static -double bloomLoad(const vector<u8> &bloom) { - return (double)bloomOccupancy(bloom) / (double)(bloom.size() * 8); -} - -static -vector<u8> buildBloomFilter(const vector<ue2_case_string> &lits, size_t max_len, - size_t num_entries, bool nocase) { - assert(num_entries % 8 == 0); - assert((num_entries & (num_entries - 1)) == 0); // Must be power of two. - - vector<u8> bloom(num_entries / 8, 0); - - if (!num_entries) { - return bloom; - } - - for (const auto &lit : lits) { - if (nocase != lit.nocase) { - continue; - } - for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) { - const u8 *substr = (const u8 *)lit.s.c_str() + offset; - addToBloomFilter(bloom, substr, nocase); - } - } - - DEBUG_PRINTF("%s bloom filter occupancy %zu of %zu entries\n", - nocase ? "nocase" : "caseful", bloomOccupancy(bloom), - num_entries); - - return bloom; -} - - -static -vector<u8> makeBloomFilter(const vector<ue2_case_string> &lits, - size_t max_len, bool nocase) { - vector<u8> bloom; - - size_t num_entries = MIN_BLOOM_FILTER_SIZE; - for (;;) { - bloom = buildBloomFilter(lits, max_len, num_entries, nocase); - DEBUG_PRINTF("built %s bloom for %zu entries: load %f\n", - nocase ? "nocase" : "caseful", num_entries, - bloomLoad(bloom)); - if (bloomLoad(bloom) < MAX_BLOOM_FILTER_LOAD) { - break; - } - num_entries *= 2; - } - return bloom; -} - -static UNUSED -size_t hashTableOccupancy(const vector<RoseLongLitHashEntry> &tab) { - return count_if(begin(tab), end(tab), [](const RoseLongLitHashEntry &ent) { - return ent.str_offset != 0; - }); -} - -static UNUSED -double hashTableLoad(const vector<RoseLongLitHashEntry> &tab) { - return (double)hashTableOccupancy(tab) / (double)(tab.size()); -} - -using LitOffsetVector = small_vector<pair<u32, u32>, 1>; - -static -vector<RoseLongLitHashEntry> buildHashTable( - size_t max_len, const vector<u32> &litToOffsetVal, - const map<u32, LitOffsetVector> &hashToLitOffPairs, - size_t numEntries) { - vector<RoseLongLitHashEntry> tab(numEntries, {0,0}); - - if (!numEntries) { - return tab; - } - - for (const auto &m : hashToLitOffPairs) { - u32 hash = m.first; - const LitOffsetVector &d = m.second; - - u32 bucket = hash % numEntries; - - // Placement via linear probing. - for (const auto &lit_offset : d) { - while (tab[bucket].str_offset != 0) { - bucket++; - if (bucket == numEntries) { - bucket = 0; - } - } - - u32 lit_id = lit_offset.first; - u32 offset = lit_offset.second; - - DEBUG_PRINTF("hash 0x%08x lit_id %u offset %u bucket %u\n", hash, - lit_id, offset, bucket); - - auto &entry = tab[bucket]; - entry.str_offset = verify_u32(litToOffsetVal.at(lit_id)); - assert(entry.str_offset != 0); - entry.str_len = offset + max_len; - } - } - - DEBUG_PRINTF("hash table occupancy %zu of %zu entries\n", - hashTableOccupancy(tab), numEntries); - - return tab; -} - -static -map<u32, LitOffsetVector> computeLitHashes(const vector<ue2_case_string> &lits, - size_t max_len, bool nocase) { - map<u32, LitOffsetVector> hashToLitOffPairs; - - for (u32 lit_id = 0; lit_id < lits.size(); lit_id++) { - const ue2_case_string &lit = lits[lit_id]; - if (nocase != lit.nocase) { - continue; - } - for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) { - const u8 *substr = (const u8 *)lit.s.c_str() + offset; - u32 hash = hashLongLiteral(substr, max_len, lit.nocase); - hashToLitOffPairs[hash].emplace_back(lit_id, offset); - } - } - - for (auto &m : hashToLitOffPairs) { - LitOffsetVector &d = m.second; - if (d.size() == 1) { - continue; - } - - // Sort by (offset, string) so that we'll be able to remove identical - // string prefixes. - stable_sort(begin(d), end(d), - [&](const pair<u32, u32> &a, const pair<u32, u32> &b) { - const auto &str_a = lits[a.first].s; - const auto &str_b = lits[b.first].s; - return tie(a.second, str_a) < tie(b.second, str_b); - }); - - // Remove entries that point to the same literal prefix. - d.erase(unique(begin(d), end(d), - [&](const pair<u32, u32> &a, const pair<u32, u32> &b) { - if (a.second != b.second) { - return false; - } - const auto &str_a = lits[a.first].s; - const auto &str_b = lits[b.first].s; - const size_t len = max_len + a.second; - return equal(begin(str_a), begin(str_a) + len, - begin(str_b)); - }), - end(d)); - - // Sort d by distance of the residual string (len minus our depth into - // the string). We need to put the 'furthest back' string first. - stable_sort(begin(d), end(d), - [](const pair<u32, u32> &a, const pair<u32, u32> &b) { - if (a.second != b.second) { - return a.second > b.second; /* longest is first */ - } - return a.first < b.first; - }); - } - - return hashToLitOffPairs; -} - -static -vector<RoseLongLitHashEntry> makeHashTable(const vector<ue2_case_string> &lits, - size_t max_len, - const vector<u32> &litToOffsetVal, - u32 numPositions, bool nocase) { - // Compute lit substring hashes. - const auto hashToLitOffPairs = computeLitHashes(lits, max_len, nocase); - - // Compute the size of the hash table: we need enough entries to satisfy - // our max load constraint, and it must be a power of two. - size_t num_entries = (double)numPositions / MAX_HASH_TABLE_LOAD + 1; - num_entries = roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE, num_entries)); - - auto tab = buildHashTable(max_len, litToOffsetVal, hashToLitOffPairs, - num_entries); - DEBUG_PRINTF("built %s hash table for %zu entries: load %f\n", - nocase ? "nocase" : "caseful", num_entries, - hashTableLoad(tab)); - assert(hashTableLoad(tab) < MAX_HASH_TABLE_LOAD); - - return tab; -} - -static -vector<u8> buildLits(const vector<ue2_case_string> &lits, u32 baseOffset, - vector<u32> &litToOffsetVal) { - vector<u8> blob; - litToOffsetVal.resize(lits.size(), 0); - - u32 lit_id = 0; - for (const auto &lit : lits) { - u32 offset = baseOffset + verify_u32(blob.size()); - blob.insert(blob.end(), begin(lit.s), end(lit.s)); - litToOffsetVal[lit_id] = offset; - lit_id++; - } - - DEBUG_PRINTF("built %zu bytes of strings\n", blob.size()); - return blob; -} - -u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob, - vector<ue2_case_string> &lits, - size_t longLitLengthThreshold, - size_t *historyRequired, - size_t *longLitStreamStateRequired) { - // Work in terms of history requirement (i.e. literal len - 1). - const size_t max_len = longLitLengthThreshold - 1; - - // We should only be building the long literal hash table in streaming mode. - if (!build.cc.streaming) { - return 0; - } - - if (lits.empty()) { - DEBUG_PRINTF("no long literals\n"); - return 0; - } - - // The last char of each literal is trimmed as we're not interested in full - // matches, only partial matches. - for (auto &lit : lits) { - assert(!lit.s.empty()); - lit.s.pop_back(); - } - - // Sort by caseful/caseless and in lexicographical order. - stable_sort(begin(lits), end(lits), [](const ue2_case_string &a, - const ue2_case_string &b) { - if (a.nocase != b.nocase) { - return a.nocase < b.nocase; - } - return a.s < b.s; - }); - - // Find literals that are prefixes of other literals (including - // duplicates). Note that we iterate in reverse, since we want to retain - // only the longest string from a set of prefixes. - auto it = unique(lits.rbegin(), lits.rend(), [](const ue2_case_string &a, - const ue2_case_string &b) { - return a.nocase == b.nocase && a.s.size() >= b.s.size() && - equal(b.s.begin(), b.s.end(), a.s.begin()); - }); - - // Erase dupes found by unique(). - lits.erase(lits.begin(), it.base()); - - LongLitInfo info = analyzeLongLits(lits, max_len); - - vector<u32> litToOffsetVal; - const size_t headerSize = ROUNDUP_16(sizeof(RoseLongLitTable)); - vector<u8> lit_blob = buildLits(lits, headerSize, litToOffsetVal); - - // Build caseful bloom filter and hash table. - vector<u8> bloom_case; - vector<RoseLongLitHashEntry> tab_case; - if (info.caseful.num_literals) { - bloom_case = makeBloomFilter(lits, max_len, false); - tab_case = makeHashTable(lits, max_len, litToOffsetVal, - info.caseful.hashed_positions, false); - } - - // Build nocase bloom filter and hash table. - vector<u8> bloom_nocase; - vector<RoseLongLitHashEntry> tab_nocase; - if (info.nocase.num_literals) { - bloom_nocase = makeBloomFilter(lits, max_len, true); - tab_nocase = makeHashTable(lits, max_len, litToOffsetVal, - info.nocase.hashed_positions, true); - } - - size_t wholeLitTabSize = ROUNDUP_16(byte_length(lit_blob)); - size_t htOffsetCase = headerSize + wholeLitTabSize; - size_t htOffsetNocase = htOffsetCase + byte_length(tab_case); - size_t bloomOffsetCase = htOffsetNocase + byte_length(tab_nocase); - size_t bloomOffsetNocase = bloomOffsetCase + byte_length(bloom_case); - - size_t tabSize = ROUNDUP_16(bloomOffsetNocase + byte_length(bloom_nocase)); - - // need to add +2 to both of these to allow space for the actual largest - // value as well as handling the fact that we add one to the space when - // storing out a position to allow zero to mean "no stream state value" - u8 streamBitsCase = lg2(roundUpToPowerOfTwo(tab_case.size() + 2)); - u8 streamBitsNocase = lg2(roundUpToPowerOfTwo(tab_nocase.size() + 2)); - u32 tot_state_bytes = ROUNDUP_N(streamBitsCase + streamBitsNocase, 8) / 8; - - auto table = make_zeroed_bytecode_ptr<char>(tabSize, 16); - assert(table); // otherwise would have thrown std::bad_alloc - - // Fill in the RoseLongLitTable header structure. - RoseLongLitTable *header = (RoseLongLitTable *)(table.get()); - header->size = verify_u32(tabSize); - header->maxLen = verify_u8(max_len); // u8 so doesn't matter; won't go > 255 - header->caseful.hashOffset = verify_u32(htOffsetCase); - header->caseful.hashBits = lg2(tab_case.size()); - header->caseful.streamStateBits = streamBitsCase; - header->caseful.bloomOffset = verify_u32(bloomOffsetCase); - header->caseful.bloomBits = lg2(bloom_case.size() * 8); - header->nocase.hashOffset = verify_u32(htOffsetNocase); - header->nocase.hashBits = lg2(tab_nocase.size()); - header->nocase.streamStateBits = streamBitsNocase; - header->nocase.bloomOffset = verify_u32(bloomOffsetNocase); - header->nocase.bloomBits = lg2(bloom_nocase.size() * 8); - assert(tot_state_bytes < sizeof(u64a)); - header->streamStateBytes = verify_u8(tot_state_bytes); // u8 - - // Copy in the literal strings, hash tables and bloom filters, - copy_bytes(table.get() + headerSize, lit_blob); - copy_bytes(table.get() + htOffsetCase, tab_case); - copy_bytes(table.get() + bloomOffsetCase, bloom_case); - copy_bytes(table.get() + htOffsetNocase, tab_nocase); - copy_bytes(table.get() + bloomOffsetNocase, bloom_nocase); - - DEBUG_PRINTF("built streaming table, size=%zu\n", tabSize); - DEBUG_PRINTF("requires %zu bytes of history\n", max_len); - DEBUG_PRINTF("requires %u bytes of stream state\n", tot_state_bytes); - - *historyRequired = max(*historyRequired, max_len); - *longLitStreamStateRequired = tot_state_bytes; - - return blob.add(table); -} - -} // namespace ue2 +/* + * Copyright (c) 2016-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "rose_build_long_lit.h" + +#include "rose_build_engine_blob.h" +#include "rose_build_impl.h" +#include "stream_long_lit_hash.h" +#include "util/bytecode_ptr.h" +#include "util/bitutils.h" +#include "util/verify_types.h" +#include "util/compile_context.h" + +#include <algorithm> +#include <numeric> + +using namespace std; + +namespace ue2 { + +/** \brief Minimum size for a non-empty hash table. Must be a power of two. */ +static constexpr size_t MIN_HASH_TABLE_SIZE = 128; + +/** \brief Maximum load factor (between zero and one) for a hash table. */ +static constexpr double MAX_HASH_TABLE_LOAD = 0.7; + +/** \brief Minimum size (in bits) for a bloom filter. Must be a power of two. */ +static constexpr u32 MIN_BLOOM_FILTER_SIZE = 256; + +/** \brief Maximum load factor (between zero and one) for a bloom filter. */ +static constexpr double MAX_BLOOM_FILTER_LOAD = 0.25; + +struct LongLitModeInfo { + u32 num_literals = 0; //!< Number of strings for this mode. + u32 hashed_positions = 0; //!< Number of hashable string positions. +}; + +struct LongLitInfo { + LongLitModeInfo caseful; + LongLitModeInfo nocase; +}; + +static +u32 roundUpToPowerOfTwo(u32 x) { + assert(x != 0); + u32 bits = lg2(x - 1) + 1; + assert(bits < 32); + return 1U << bits; +} + +static +LongLitInfo analyzeLongLits(const vector<ue2_case_string> &lits, + size_t max_len) { + LongLitInfo info; + + for (const auto &lit : lits) { + auto &lit_info = lit.nocase ? info.nocase : info.caseful; + assert(lit.s.size() > max_len); + lit_info.num_literals++; + lit_info.hashed_positions += lit.s.size() - max_len; + } + + DEBUG_PRINTF("case: hashed %u positions\n", info.caseful.hashed_positions); + DEBUG_PRINTF("nocase: hashed %u positions\n", info.nocase.hashed_positions); + + return info; +} + +static +void addToBloomFilter(vector<u8> &bloom, const u8 *substr, bool nocase) { + const u32 num_keys = verify_u32(bloom.size() * 8); + const u32 key_mask = (1U << lg2(num_keys)) -1; + + const auto hash_functions = { bloomHash_1, bloomHash_2, bloomHash_3 }; + for (const auto &hash_func : hash_functions) { + u32 hash = hash_func(substr, nocase); + u32 key = hash & key_mask; + DEBUG_PRINTF("set key %u (of %zu)\n", key, bloom.size() * 8); + bloom[key / 8] |= 1U << (key % 8); + } +} + +static +size_t bloomOccupancy(const vector<u8> &bloom) { + return accumulate(begin(bloom), end(bloom), 0, + [](const size_t &sum, const u8 &elem) { + return sum + popcount32(elem); + }); +} + +static +double bloomLoad(const vector<u8> &bloom) { + return (double)bloomOccupancy(bloom) / (double)(bloom.size() * 8); +} + +static +vector<u8> buildBloomFilter(const vector<ue2_case_string> &lits, size_t max_len, + size_t num_entries, bool nocase) { + assert(num_entries % 8 == 0); + assert((num_entries & (num_entries - 1)) == 0); // Must be power of two. + + vector<u8> bloom(num_entries / 8, 0); + + if (!num_entries) { + return bloom; + } + + for (const auto &lit : lits) { + if (nocase != lit.nocase) { + continue; + } + for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) { + const u8 *substr = (const u8 *)lit.s.c_str() + offset; + addToBloomFilter(bloom, substr, nocase); + } + } + + DEBUG_PRINTF("%s bloom filter occupancy %zu of %zu entries\n", + nocase ? "nocase" : "caseful", bloomOccupancy(bloom), + num_entries); + + return bloom; +} + + +static +vector<u8> makeBloomFilter(const vector<ue2_case_string> &lits, + size_t max_len, bool nocase) { + vector<u8> bloom; + + size_t num_entries = MIN_BLOOM_FILTER_SIZE; + for (;;) { + bloom = buildBloomFilter(lits, max_len, num_entries, nocase); + DEBUG_PRINTF("built %s bloom for %zu entries: load %f\n", + nocase ? "nocase" : "caseful", num_entries, + bloomLoad(bloom)); + if (bloomLoad(bloom) < MAX_BLOOM_FILTER_LOAD) { + break; + } + num_entries *= 2; + } + return bloom; +} + +static UNUSED +size_t hashTableOccupancy(const vector<RoseLongLitHashEntry> &tab) { + return count_if(begin(tab), end(tab), [](const RoseLongLitHashEntry &ent) { + return ent.str_offset != 0; + }); +} + +static UNUSED +double hashTableLoad(const vector<RoseLongLitHashEntry> &tab) { + return (double)hashTableOccupancy(tab) / (double)(tab.size()); +} + +using LitOffsetVector = small_vector<pair<u32, u32>, 1>; + +static +vector<RoseLongLitHashEntry> buildHashTable( + size_t max_len, const vector<u32> &litToOffsetVal, + const map<u32, LitOffsetVector> &hashToLitOffPairs, + size_t numEntries) { + vector<RoseLongLitHashEntry> tab(numEntries, {0,0}); + + if (!numEntries) { + return tab; + } + + for (const auto &m : hashToLitOffPairs) { + u32 hash = m.first; + const LitOffsetVector &d = m.second; + + u32 bucket = hash % numEntries; + + // Placement via linear probing. + for (const auto &lit_offset : d) { + while (tab[bucket].str_offset != 0) { + bucket++; + if (bucket == numEntries) { + bucket = 0; + } + } + + u32 lit_id = lit_offset.first; + u32 offset = lit_offset.second; + + DEBUG_PRINTF("hash 0x%08x lit_id %u offset %u bucket %u\n", hash, + lit_id, offset, bucket); + + auto &entry = tab[bucket]; + entry.str_offset = verify_u32(litToOffsetVal.at(lit_id)); + assert(entry.str_offset != 0); + entry.str_len = offset + max_len; + } + } + + DEBUG_PRINTF("hash table occupancy %zu of %zu entries\n", + hashTableOccupancy(tab), numEntries); + + return tab; +} + +static +map<u32, LitOffsetVector> computeLitHashes(const vector<ue2_case_string> &lits, + size_t max_len, bool nocase) { + map<u32, LitOffsetVector> hashToLitOffPairs; + + for (u32 lit_id = 0; lit_id < lits.size(); lit_id++) { + const ue2_case_string &lit = lits[lit_id]; + if (nocase != lit.nocase) { + continue; + } + for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) { + const u8 *substr = (const u8 *)lit.s.c_str() + offset; + u32 hash = hashLongLiteral(substr, max_len, lit.nocase); + hashToLitOffPairs[hash].emplace_back(lit_id, offset); + } + } + + for (auto &m : hashToLitOffPairs) { + LitOffsetVector &d = m.second; + if (d.size() == 1) { + continue; + } + + // Sort by (offset, string) so that we'll be able to remove identical + // string prefixes. + stable_sort(begin(d), end(d), + [&](const pair<u32, u32> &a, const pair<u32, u32> &b) { + const auto &str_a = lits[a.first].s; + const auto &str_b = lits[b.first].s; + return tie(a.second, str_a) < tie(b.second, str_b); + }); + + // Remove entries that point to the same literal prefix. + d.erase(unique(begin(d), end(d), + [&](const pair<u32, u32> &a, const pair<u32, u32> &b) { + if (a.second != b.second) { + return false; + } + const auto &str_a = lits[a.first].s; + const auto &str_b = lits[b.first].s; + const size_t len = max_len + a.second; + return equal(begin(str_a), begin(str_a) + len, + begin(str_b)); + }), + end(d)); + + // Sort d by distance of the residual string (len minus our depth into + // the string). We need to put the 'furthest back' string first. + stable_sort(begin(d), end(d), + [](const pair<u32, u32> &a, const pair<u32, u32> &b) { + if (a.second != b.second) { + return a.second > b.second; /* longest is first */ + } + return a.first < b.first; + }); + } + + return hashToLitOffPairs; +} + +static +vector<RoseLongLitHashEntry> makeHashTable(const vector<ue2_case_string> &lits, + size_t max_len, + const vector<u32> &litToOffsetVal, + u32 numPositions, bool nocase) { + // Compute lit substring hashes. + const auto hashToLitOffPairs = computeLitHashes(lits, max_len, nocase); + + // Compute the size of the hash table: we need enough entries to satisfy + // our max load constraint, and it must be a power of two. + size_t num_entries = (double)numPositions / MAX_HASH_TABLE_LOAD + 1; + num_entries = roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE, num_entries)); + + auto tab = buildHashTable(max_len, litToOffsetVal, hashToLitOffPairs, + num_entries); + DEBUG_PRINTF("built %s hash table for %zu entries: load %f\n", + nocase ? "nocase" : "caseful", num_entries, + hashTableLoad(tab)); + assert(hashTableLoad(tab) < MAX_HASH_TABLE_LOAD); + + return tab; +} + +static +vector<u8> buildLits(const vector<ue2_case_string> &lits, u32 baseOffset, + vector<u32> &litToOffsetVal) { + vector<u8> blob; + litToOffsetVal.resize(lits.size(), 0); + + u32 lit_id = 0; + for (const auto &lit : lits) { + u32 offset = baseOffset + verify_u32(blob.size()); + blob.insert(blob.end(), begin(lit.s), end(lit.s)); + litToOffsetVal[lit_id] = offset; + lit_id++; + } + + DEBUG_PRINTF("built %zu bytes of strings\n", blob.size()); + return blob; +} + +u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob, + vector<ue2_case_string> &lits, + size_t longLitLengthThreshold, + size_t *historyRequired, + size_t *longLitStreamStateRequired) { + // Work in terms of history requirement (i.e. literal len - 1). + const size_t max_len = longLitLengthThreshold - 1; + + // We should only be building the long literal hash table in streaming mode. + if (!build.cc.streaming) { + return 0; + } + + if (lits.empty()) { + DEBUG_PRINTF("no long literals\n"); + return 0; + } + + // The last char of each literal is trimmed as we're not interested in full + // matches, only partial matches. + for (auto &lit : lits) { + assert(!lit.s.empty()); + lit.s.pop_back(); + } + + // Sort by caseful/caseless and in lexicographical order. + stable_sort(begin(lits), end(lits), [](const ue2_case_string &a, + const ue2_case_string &b) { + if (a.nocase != b.nocase) { + return a.nocase < b.nocase; + } + return a.s < b.s; + }); + + // Find literals that are prefixes of other literals (including + // duplicates). Note that we iterate in reverse, since we want to retain + // only the longest string from a set of prefixes. + auto it = unique(lits.rbegin(), lits.rend(), [](const ue2_case_string &a, + const ue2_case_string &b) { + return a.nocase == b.nocase && a.s.size() >= b.s.size() && + equal(b.s.begin(), b.s.end(), a.s.begin()); + }); + + // Erase dupes found by unique(). + lits.erase(lits.begin(), it.base()); + + LongLitInfo info = analyzeLongLits(lits, max_len); + + vector<u32> litToOffsetVal; + const size_t headerSize = ROUNDUP_16(sizeof(RoseLongLitTable)); + vector<u8> lit_blob = buildLits(lits, headerSize, litToOffsetVal); + + // Build caseful bloom filter and hash table. + vector<u8> bloom_case; + vector<RoseLongLitHashEntry> tab_case; + if (info.caseful.num_literals) { + bloom_case = makeBloomFilter(lits, max_len, false); + tab_case = makeHashTable(lits, max_len, litToOffsetVal, + info.caseful.hashed_positions, false); + } + + // Build nocase bloom filter and hash table. + vector<u8> bloom_nocase; + vector<RoseLongLitHashEntry> tab_nocase; + if (info.nocase.num_literals) { + bloom_nocase = makeBloomFilter(lits, max_len, true); + tab_nocase = makeHashTable(lits, max_len, litToOffsetVal, + info.nocase.hashed_positions, true); + } + + size_t wholeLitTabSize = ROUNDUP_16(byte_length(lit_blob)); + size_t htOffsetCase = headerSize + wholeLitTabSize; + size_t htOffsetNocase = htOffsetCase + byte_length(tab_case); + size_t bloomOffsetCase = htOffsetNocase + byte_length(tab_nocase); + size_t bloomOffsetNocase = bloomOffsetCase + byte_length(bloom_case); + + size_t tabSize = ROUNDUP_16(bloomOffsetNocase + byte_length(bloom_nocase)); + + // need to add +2 to both of these to allow space for the actual largest + // value as well as handling the fact that we add one to the space when + // storing out a position to allow zero to mean "no stream state value" + u8 streamBitsCase = lg2(roundUpToPowerOfTwo(tab_case.size() + 2)); + u8 streamBitsNocase = lg2(roundUpToPowerOfTwo(tab_nocase.size() + 2)); + u32 tot_state_bytes = ROUNDUP_N(streamBitsCase + streamBitsNocase, 8) / 8; + + auto table = make_zeroed_bytecode_ptr<char>(tabSize, 16); + assert(table); // otherwise would have thrown std::bad_alloc + + // Fill in the RoseLongLitTable header structure. + RoseLongLitTable *header = (RoseLongLitTable *)(table.get()); + header->size = verify_u32(tabSize); + header->maxLen = verify_u8(max_len); // u8 so doesn't matter; won't go > 255 + header->caseful.hashOffset = verify_u32(htOffsetCase); + header->caseful.hashBits = lg2(tab_case.size()); + header->caseful.streamStateBits = streamBitsCase; + header->caseful.bloomOffset = verify_u32(bloomOffsetCase); + header->caseful.bloomBits = lg2(bloom_case.size() * 8); + header->nocase.hashOffset = verify_u32(htOffsetNocase); + header->nocase.hashBits = lg2(tab_nocase.size()); + header->nocase.streamStateBits = streamBitsNocase; + header->nocase.bloomOffset = verify_u32(bloomOffsetNocase); + header->nocase.bloomBits = lg2(bloom_nocase.size() * 8); + assert(tot_state_bytes < sizeof(u64a)); + header->streamStateBytes = verify_u8(tot_state_bytes); // u8 + + // Copy in the literal strings, hash tables and bloom filters, + copy_bytes(table.get() + headerSize, lit_blob); + copy_bytes(table.get() + htOffsetCase, tab_case); + copy_bytes(table.get() + bloomOffsetCase, bloom_case); + copy_bytes(table.get() + htOffsetNocase, tab_nocase); + copy_bytes(table.get() + bloomOffsetNocase, bloom_nocase); + + DEBUG_PRINTF("built streaming table, size=%zu\n", tabSize); + DEBUG_PRINTF("requires %zu bytes of history\n", max_len); + DEBUG_PRINTF("requires %u bytes of stream state\n", tot_state_bytes); + + *historyRequired = max(*historyRequired, max_len); + *longLitStreamStateRequired = tot_state_bytes; + + return blob.add(table); +} + +} // namespace ue2 |