diff options
author | artyasen <artyasen@yandex-team.com> | 2025-01-10 10:43:42 +0300 |
---|---|---|
committer | artyasen <artyasen@yandex-team.com> | 2025-01-10 11:54:42 +0300 |
commit | 215178650c519a89f29a47f942fe2be607a91e58 (patch) | |
tree | 58313a1490bf7d70dde00647915e468ae55e710d /library/cpp | |
parent | 2677f7fd48473bd66e77dbf330dc9065db086e9f (diff) | |
download | ydb-215178650c519a89f29a47f942fe2be607a91e58.tar.gz |
HyperScan literal compilation
add literal compilation
commit_hash:29f6f2d5c4ec11ceb61add67bc4e697194a4efff
Diffstat (limited to 'library/cpp')
-rw-r--r-- | library/cpp/regex/hyperscan/hyperscan.cpp | 107 | ||||
-rw-r--r-- | library/cpp/regex/hyperscan/hyperscan.h | 26 | ||||
-rw-r--r-- | library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp | 59 |
3 files changed, 192 insertions, 0 deletions
diff --git a/library/cpp/regex/hyperscan/hyperscan.cpp b/library/cpp/regex/hyperscan/hyperscan.cpp index 0a4bfcb9ec..bfb9d7b5ff 100644 --- a/library/cpp/regex/hyperscan/hyperscan.cpp +++ b/library/cpp/regex/hyperscan/hyperscan.cpp @@ -96,6 +96,27 @@ namespace NHyperscan { return db; } + TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, hs_platform_info_t* platform) { + hs_database_t* rawDb = nullptr; + hs_compile_error_t* rawCompileErr = nullptr; + hs_error_t status = hs_compile_lit( + literal.data(), + flags, + literal.size(), + HS_MODE_BLOCK, + platform, + &rawDb, + &rawCompileErr); + TDatabase db(rawDb); + NHyperscan::TCompileError compileError(rawCompileErr); + if (status != HS_SUCCESS) { + ythrow TCompileException() + << "Failed to compile literal: " << literal << ". " + << "Error message (hyperscan): " << compileError->message; + } + return db; + } + TDatabase CompileMulti( const TVector<const char*>& regexs, const TVector<unsigned int>& flags, @@ -150,6 +171,61 @@ namespace NHyperscan { return db; } + TDatabase CompileMultiLiteral( + const TVector<const char*>& literals, + const TVector<unsigned int>& flags, + const TVector<unsigned int>& ids, + const TVector<size_t>& lens, + hs_platform_info_t* platform) + { + unsigned int count = literals.size(); + if (flags.size() != count) { + ythrow yexception() + << "Mismatch of sizes vectors passed to CompileMultiLiteral. " + << "size(literals) = " << literals.size() << ". " + << "size(flags) = " << flags.size() << "."; + } + if (ids.size() != count) { + ythrow yexception() + << "Mismatch of sizes vectors passed to CompileMultiLiteral. " + << "size(literals) = " << literals.size() << ". " + << "size(ids) = " << ids.size() << "."; + } + if (lens.size() != count) { + ythrow yexception() + << "Mismatch of sizes vectors passed to CompileMultiLiteral. " + << "size(literals) = " << literals.size() << ". " + << "size(lens) = " << lens.size() << "."; + } + hs_database_t* rawDb = nullptr; + hs_compile_error_t* rawCompileErr = nullptr; + hs_error_t status = hs_compile_lit_multi( + literals.data(), + flags.data(), + ids.data(), + lens.data(), + count, + HS_MODE_BLOCK, + platform, + &rawDb, + &rawCompileErr); + TDatabase db(rawDb); + NHyperscan::TCompileError compileError(rawCompileErr); + if (status != HS_SUCCESS) { + if (compileError->expression >= 0) { + const char* literal = literals[compileError->expression]; + ythrow TCompileException() + << "Failed to compile literal: " << literal << ". " + << "Error message (hyperscan): " << compileError->message; + } else { + ythrow TCompileException() + << "Failed to compile multiple literals. " + << "Error message (hyperscan): " << compileError->message; + } + } + return db; + } + bool Matches( const TDatabase& db, const TScratch& scratch, @@ -180,6 +256,16 @@ namespace NHyperscan { return NPrivate::Compile(regex, flags, &platformInfo); } + TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags) { + auto platformInfo = NPrivate::MakeCurrentPlatformInfo(); + return NPrivate::CompileLiteral(literal, flags, &platformInfo); + } + + TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, TCPUFeatures cpuFeatures) { + auto platformInfo = NPrivate::MakePlatformInfo(cpuFeatures); + return NPrivate::CompileLiteral(literal, flags, &platformInfo); + } + TDatabase CompileMulti( const TVector<const char*>& regexs, const TVector<unsigned int>& flags, @@ -201,6 +287,27 @@ namespace NHyperscan { return NPrivate::CompileMulti(regexs, flags, ids, &platformInfo, extendedParameters); } + TDatabase CompileMultiLiteral( + const TVector<const char*>& literals, + const TVector<unsigned int>& flags, + const TVector<unsigned int>& ids, + const TVector<size_t>& lens) + { + auto platformInfo = NPrivate::MakeCurrentPlatformInfo(); + return NPrivate::CompileMultiLiteral(literals, flags, ids, lens, &platformInfo); + } + + TDatabase CompileMultiLiteral( + const TVector<const char*>& literals, + const TVector<unsigned int>& flags, + const TVector<unsigned int>& ids, + const TVector<size_t>& lens, + TCPUFeatures cpuFeatures) + { + auto platformInfo = NPrivate::MakePlatformInfo(cpuFeatures); + return NPrivate::CompileMultiLiteral(literals, flags, ids, lens, &platformInfo); + } + TScratch MakeScratch(const TDatabase& db) { hs_scratch_t* rawScratch = nullptr; hs_error_t status = Singleton<NPrivate::TImpl>()->AllocScratch(db.Get(), &rawScratch); diff --git a/library/cpp/regex/hyperscan/hyperscan.h b/library/cpp/regex/hyperscan/hyperscan.h index eae82fa384..7f8c877b07 100644 --- a/library/cpp/regex/hyperscan/hyperscan.h +++ b/library/cpp/regex/hyperscan/hyperscan.h @@ -60,6 +60,8 @@ namespace NHyperscan { TDatabase Compile(const TStringBuf& regex, unsigned int flags, hs_platform_info_t* platform); + TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, hs_platform_info_t* platform); + TDatabase CompileMulti( const TVector<const char*>& regexs, const TVector<unsigned int>& flags, @@ -67,6 +69,13 @@ namespace NHyperscan { hs_platform_info_t* platform, const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr); + TDatabase CompileMultiLiteral( + const TVector<const char*>& literals, + const TVector<unsigned int>& flags, + const TVector<unsigned int>& ids, + const TVector<size_t>& lens, + hs_platform_info_t* platform); + // We need to parametrize Scan and Matches functions for testing purposes template<typename TCallback> void Scan( @@ -118,6 +127,10 @@ namespace NHyperscan { TDatabase Compile(const TStringBuf& regex, unsigned int flags, TCPUFeatures cpuFeatures); + TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags); + + TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, TCPUFeatures cpuFeatures); + TDatabase CompileMulti( const TVector<const char*>& regexs, const TVector<unsigned int>& flags, @@ -131,6 +144,19 @@ namespace NHyperscan { TCPUFeatures cpuFeatures, const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr); + TDatabase CompileMultiLiteral( + const TVector<const char*>& literals, + const TVector<unsigned int>& flags, + const TVector<unsigned int>& ids, + const TVector<size_t>& lens); + + TDatabase CompileMultiLiteral( + const TVector<const char*>& literals, + const TVector<unsigned int>& flags, + const TVector<unsigned int>& ids, + const TVector<size_t>& lens, + TCPUFeatures cpuFeatures); + TScratch MakeScratch(const TDatabase& db); void GrowScratch(TScratch& scratch, const TDatabase& db); diff --git a/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp index 75cd0bcc89..063ca3dd03 100644 --- a/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp +++ b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp @@ -27,6 +27,22 @@ Y_UNIT_TEST_SUITE(HyperscanWrappers) { UNIT_ASSERT_EQUAL(foundId, 0); } + Y_UNIT_TEST(CompileLiteralAndScan) { + TDatabase db = CompileLiteral("a.c?[)", HS_FLAG_SINGLEMATCH); + TScratch scratch = MakeScratch(db); + + unsigned int foundId = 42; + auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) { + foundId = id; + }; + NHyperscan::Scan( + db, + scratch, + "a.c?[)", + callback); + UNIT_ASSERT_EQUAL(foundId, 0); + } + Y_UNIT_TEST(Matches) { NHyperscan::TDatabase db = NHyperscan::Compile( "a.c", @@ -71,6 +87,49 @@ Y_UNIT_TEST_SUITE(HyperscanWrappers) { UNIT_ASSERT(foundIds.contains(241)); } + Y_UNIT_TEST(MultiLiteral) { + static const TVector<TString> LITERALS = { + "foo.", + "bar.", + }; + NHyperscan::TDatabase db = NHyperscan::CompileMultiLiteral( + { + LITERALS[0].c_str(), + LITERALS[1].c_str(), + }, + { + HS_FLAG_SINGLEMATCH, + HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS, + }, + { + 42, + 241, + }, + { + LITERALS[0].size(), + LITERALS[1].size(), + }); + NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db); + + UNIT_ASSERT(NHyperscan::Matches(db, scratch, "foo.")); + UNIT_ASSERT(NHyperscan::Matches(db, scratch, "bar.")); + UNIT_ASSERT(NHyperscan::Matches(db, scratch, "BAR.")); + UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "FOO.")); + + TSet<unsigned int> foundIds; + auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) { + foundIds.insert(id); + }; + NHyperscan::Scan( + db, + scratch, + "foo.BaR.", + callback); + UNIT_ASSERT_EQUAL(foundIds.size(), 2); + UNIT_ASSERT(foundIds.contains(42)); + UNIT_ASSERT(foundIds.contains(241)); + } + // https://ml.yandex-team.ru/thread/2370000002965712422/ Y_UNIT_TEST(MultiRegression) { NHyperscan::CompileMulti( |