aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp
diff options
context:
space:
mode:
author artyasen <artyasen@yandex-team.com> 2025-01-10 10:43:42 +0300
committer artyasen <artyasen@yandex-team.com> 2025-01-10 11:54:42 +0300
commit 215178650c519a89f29a47f942fe2be607a91e58 (patch)
tree 58313a1490bf7d70dde00647915e468ae55e710d /library/cpp
parent 2677f7fd48473bd66e77dbf330dc9065db086e9f (diff)
download ydb-215178650c519a89f29a47f942fe2be607a91e58.tar.gz
HyperScan literal compilation
add literal compilation commit_hash:29f6f2d5c4ec11ceb61add67bc4e697194a4efff
Diffstat (limited to 'library/cpp')
-rw-r--r-- library/cpp/regex/hyperscan/hyperscan.cpp 107
-rw-r--r-- library/cpp/regex/hyperscan/hyperscan.h 26
-rw-r--r-- library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp 59
3 files changed, 192 insertions, 0 deletions
diff --git a/library/cpp/regex/hyperscan/hyperscan.cpp b/library/cpp/regex/hyperscan/hyperscan.cpp
index 0a4bfcb9ec..bfb9d7b5ff 100644
--- a/library/cpp/regex/hyperscan/hyperscan.cpp
+++ b/library/cpp/regex/hyperscan/hyperscan.cpp
@@ -96,6 +96,27 @@ namespace NHyperscan {
return db;
}
+ // Compiles `literal` with hs_compile_lit in HS_MODE_BLOCK for the given `platform`.
+ // The literal is passed as (data, size), so its length is taken from the TStringBuf.
+ // Throws TCompileException when Hyperscan does not report HS_SUCCESS.
+ TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, hs_platform_info_t* platform) {
+     // Out-parameters filled in by hs_compile_lit.
+     hs_database_t* dbPtr = nullptr;
+     hs_compile_error_t* errPtr = nullptr;
+
+     const hs_error_t rc = hs_compile_lit(
+         literal.data(), flags, literal.size(),
+         HS_MODE_BLOCK, platform,
+         &dbPtr, &errPtr);
+
+     // Hand both raw pointers to their wrapper objects before looking at the status.
+     TDatabase result(dbPtr);
+     NHyperscan::TCompileError error(errPtr);
+
+     if (rc == HS_SUCCESS) {
+         return result;
+     }
+
+     ythrow TCompileException()
+         << "Failed to compile literal: " << literal << ". "
+         << "Error message (hyperscan): " << error->message;
+ }
+
TDatabase CompileMulti(
const TVector<const char*>& regexs,
const TVector<unsigned int>& flags,
@@ -150,6 +171,61 @@ namespace NHyperscan {
return db;
}
+ // Compiles several literals into one HS_MODE_BLOCK database via hs_compile_lit_multi.
+ //
+ // literals[i] points at lens[i] bytes; flags[i] and ids[i] are passed to Hyperscan
+ // alongside it. All four vectors must have the same size, otherwise yexception is
+ // thrown. Throws TCompileException when Hyperscan does not report HS_SUCCESS.
+ TDatabase CompileMultiLiteral(
+     const TVector<const char*>& literals,
+     const TVector<unsigned int>& flags,
+     const TVector<unsigned int>& ids,
+     const TVector<size_t>& lens,
+     hs_platform_info_t* platform)
+ {
+     unsigned int count = literals.size();
+     if (flags.size() != count) {
+         ythrow yexception()
+             << "Mismatch of sizes vectors passed to CompileMultiLiteral. "
+             << "size(literals) = " << literals.size() << ". "
+             << "size(flags) = " << flags.size() << ".";
+     }
+     if (ids.size() != count) {
+         ythrow yexception()
+             << "Mismatch of sizes vectors passed to CompileMultiLiteral. "
+             << "size(literals) = " << literals.size() << ". "
+             << "size(ids) = " << ids.size() << ".";
+     }
+     if (lens.size() != count) {
+         ythrow yexception()
+             << "Mismatch of sizes vectors passed to CompileMultiLiteral. "
+             << "size(literals) = " << literals.size() << ". "
+             << "size(lens) = " << lens.size() << ".";
+     }
+     hs_database_t* rawDb = nullptr;
+     hs_compile_error_t* rawCompileErr = nullptr;
+     hs_error_t status = hs_compile_lit_multi(
+         literals.data(),
+         flags.data(),
+         ids.data(),
+         lens.data(),
+         count,
+         HS_MODE_BLOCK,
+         platform,
+         &rawDb,
+         &rawCompileErr);
+     TDatabase db(rawDb);
+     NHyperscan::TCompileError compileError(rawCompileErr);
+     if (status != HS_SUCCESS) {
+         // A negative index means the error is not tied to one literal; the upper
+         // bound is checked as well so a bogus index can never read past the vectors.
+         const auto failedIndex = compileError->expression;
+         if (failedIndex >= 0 && static_cast<size_t>(failedIndex) < literals.size()) {
+             // Literals are length-delimited: they may contain '\0' or lack a
+             // terminator, so print exactly lens[i] bytes instead of streaming the
+             // raw pointer as a C string.
+             const TStringBuf literal(literals[failedIndex], lens[failedIndex]);
+             ythrow TCompileException()
+                 << "Failed to compile literal: " << literal << ". "
+                 << "Error message (hyperscan): " << compileError->message;
+         } else {
+             ythrow TCompileException()
+                 << "Failed to compile multiple literals. "
+                 << "Error message (hyperscan): " << compileError->message;
+         }
+     }
+     return db;
+ }
+
bool Matches(
const TDatabase& db,
const TScratch& scratch,
@@ -180,6 +256,16 @@ namespace NHyperscan {
return NPrivate::Compile(regex, flags, &platformInfo);
}
+ // Compiles `literal` for the platform description returned by
+ // NPrivate::MakeCurrentPlatformInfo().
+ TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags) {
+     auto currentPlatform = NPrivate::MakeCurrentPlatformInfo();
+     TDatabase compiled = NPrivate::CompileLiteral(literal, flags, &currentPlatform);
+     return compiled;
+ }
+
+ // Compiles `literal` for the platform description built by
+ // NPrivate::MakePlatformInfo(cpuFeatures) instead of the current machine's one.
+ TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, TCPUFeatures cpuFeatures) {
+     auto targetPlatform = NPrivate::MakePlatformInfo(cpuFeatures);
+     TDatabase compiled = NPrivate::CompileLiteral(literal, flags, &targetPlatform);
+     return compiled;
+ }
+
TDatabase CompileMulti(
const TVector<const char*>& regexs,
const TVector<unsigned int>& flags,
@@ -201,6 +287,27 @@ namespace NHyperscan {
return NPrivate::CompileMulti(regexs, flags, ids, &platformInfo, extendedParameters);
}
+ // Compiles a set of literals for the platform description returned by
+ // NPrivate::MakeCurrentPlatformInfo(); all arguments are forwarded unchanged to
+ // NPrivate::CompileMultiLiteral.
+ TDatabase CompileMultiLiteral(
+     const TVector<const char*>& literals,
+     const TVector<unsigned int>& flags,
+     const TVector<unsigned int>& ids,
+     const TVector<size_t>& lens)
+ {
+     auto currentPlatform = NPrivate::MakeCurrentPlatformInfo();
+     TDatabase compiled = NPrivate::CompileMultiLiteral(
+         literals, flags, ids, lens, &currentPlatform);
+     return compiled;
+ }
+
+ // Compiles a set of literals for the platform description built by
+ // NPrivate::MakePlatformInfo(cpuFeatures); the remaining arguments are forwarded
+ // unchanged to NPrivate::CompileMultiLiteral.
+ TDatabase CompileMultiLiteral(
+     const TVector<const char*>& literals,
+     const TVector<unsigned int>& flags,
+     const TVector<unsigned int>& ids,
+     const TVector<size_t>& lens,
+     TCPUFeatures cpuFeatures)
+ {
+     auto targetPlatform = NPrivate::MakePlatformInfo(cpuFeatures);
+     TDatabase compiled = NPrivate::CompileMultiLiteral(
+         literals, flags, ids, lens, &targetPlatform);
+     return compiled;
+ }
+
TScratch MakeScratch(const TDatabase& db) {
hs_scratch_t* rawScratch = nullptr;
hs_error_t status = Singleton<NPrivate::TImpl>()->AllocScratch(db.Get(), &rawScratch);
diff --git a/library/cpp/regex/hyperscan/hyperscan.h b/library/cpp/regex/hyperscan/hyperscan.h
index eae82fa384..7f8c877b07 100644
--- a/library/cpp/regex/hyperscan/hyperscan.h
+++ b/library/cpp/regex/hyperscan/hyperscan.h
@@ -60,6 +60,8 @@ namespace NHyperscan {
TDatabase Compile(const TStringBuf& regex, unsigned int flags, hs_platform_info_t* platform);
+ // Compiles `literal` via hs_compile_lit in HS_MODE_BLOCK for the given `platform`.
+ // Throws TCompileException if compilation does not return HS_SUCCESS.
+ TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, hs_platform_info_t* platform);
+
TDatabase CompileMulti(
const TVector<const char*>& regexs,
const TVector<unsigned int>& flags,
@@ -67,6 +69,13 @@ namespace NHyperscan {
hs_platform_info_t* platform,
const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr);
+ // Compiles several literals into one database via hs_compile_lit_multi in
+ // HS_MODE_BLOCK for the given `platform`. `literals`, `flags`, `ids` and `lens`
+ // must all have the same size, otherwise yexception is thrown; lens[i] is the
+ // byte length passed for literals[i]. Throws TCompileException if compilation
+ // does not return HS_SUCCESS.
+ TDatabase CompileMultiLiteral(
+ const TVector<const char*>& literals,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ const TVector<size_t>& lens,
+ hs_platform_info_t* platform);
+
// We need to parametrize Scan and Matches functions for testing purposes
template<typename TCallback>
void Scan(
@@ -118,6 +127,10 @@ namespace NHyperscan {
TDatabase Compile(const TStringBuf& regex, unsigned int flags, TCPUFeatures cpuFeatures);
+ // Compiles `literal` as a literal (not a regex) database, using the platform
+ // info produced by NPrivate::MakeCurrentPlatformInfo().
+ TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags);
+
+ // Compiles `literal` as a literal (not a regex) database, using the platform
+ // info produced by NPrivate::MakePlatformInfo(cpuFeatures).
+ TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, TCPUFeatures cpuFeatures);
+
TDatabase CompileMulti(
const TVector<const char*>& regexs,
const TVector<unsigned int>& flags,
@@ -131,6 +144,19 @@ namespace NHyperscan {
TCPUFeatures cpuFeatures,
const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr);
+ // Compiles several literals into one database, using the platform info produced
+ // by NPrivate::MakeCurrentPlatformInfo(). The four vectors are parallel and must
+ // have equal sizes; lens[i] is the byte length of literals[i].
+ TDatabase CompileMultiLiteral(
+ const TVector<const char*>& literals,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ const TVector<size_t>& lens);
+
+ // Compiles several literals into one database, using the platform info produced
+ // by NPrivate::MakePlatformInfo(cpuFeatures). The four vectors are parallel and
+ // must have equal sizes; lens[i] is the byte length of literals[i].
+ TDatabase CompileMultiLiteral(
+ const TVector<const char*>& literals,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ const TVector<size_t>& lens,
+ TCPUFeatures cpuFeatures);
+
TScratch MakeScratch(const TDatabase& db);
void GrowScratch(TScratch& scratch, const TDatabase& db);
diff --git a/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
index 75cd0bcc89..063ca3dd03 100644
--- a/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
+++ b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
@@ -27,6 +27,22 @@ Y_UNIT_TEST_SUITE(HyperscanWrappers) {
UNIT_ASSERT_EQUAL(foundId, 0);
}
+ // Compiles a string made of regex metacharacters as a literal and checks that
+ // scanning the very same text reports a match with id 0.
+ Y_UNIT_TEST(CompileLiteralAndScan) {
+     const char* const text = "a.c?[)";
+
+     TDatabase db = CompileLiteral(text, HS_FLAG_SINGLEMATCH);
+     TScratch scratch = MakeScratch(db);
+
+     // Sentinel that differs from the expected id, so a missing callback is detected.
+     unsigned int matchedId = 42;
+     auto onMatch = [&matchedId](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+         matchedId = id;
+     };
+
+     NHyperscan::Scan(db, scratch, text, onMatch);
+
+     UNIT_ASSERT_EQUAL(matchedId, 0);
+ }
+
Y_UNIT_TEST(Matches) {
NHyperscan::TDatabase db = NHyperscan::Compile(
"a.c",
@@ -71,6 +87,49 @@ Y_UNIT_TEST_SUITE(HyperscanWrappers) {
UNIT_ASSERT(foundIds.contains(241));
}
+ // Compiles two literals with different flags and ids, then checks case handling,
+ // literal (non-wildcard) treatment of '.', and the ids reported by Scan.
+ Y_UNIT_TEST(MultiLiteral) {
+     static const TVector<TString> LITERALS = {
+         "foo.",
+         "bar.",
+     };
+     NHyperscan::TDatabase db = NHyperscan::CompileMultiLiteral(
+         {
+             LITERALS[0].c_str(),
+             LITERALS[1].c_str(),
+         },
+         {
+             HS_FLAG_SINGLEMATCH,
+             HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
+         },
+         {
+             42,
+             241,
+         },
+         {
+             LITERALS[0].size(),
+             LITERALS[1].size(),
+         });
+     NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
+
+     UNIT_ASSERT(NHyperscan::Matches(db, scratch, "foo."));
+     UNIT_ASSERT(NHyperscan::Matches(db, scratch, "bar."));
+     // Only the second literal was compiled with HS_FLAG_CASELESS.
+     UNIT_ASSERT(NHyperscan::Matches(db, scratch, "BAR."));
+     UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "FOO."));
+     // The trailing '.' must be matched verbatim, not as a regex wildcard.
+     UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "fooX"));
+     UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "barX"));
+
+     TSet<unsigned int> foundIds;
+     auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+         foundIds.insert(id);
+     };
+     NHyperscan::Scan(
+         db,
+         scratch,
+         "foo.BaR.",
+         callback);
+     UNIT_ASSERT_EQUAL(foundIds.size(), 2);
+     UNIT_ASSERT(foundIds.contains(42));
+     UNIT_ASSERT(foundIds.contains(241));
+ }
+
// https://ml.yandex-team.ru/thread/2370000002965712422/
Y_UNIT_TEST(MultiRegression) {
NHyperscan::CompileMulti(