From 215178650c519a89f29a47f942fe2be607a91e58 Mon Sep 17 00:00:00 2001
From: artyasen <artyasen@yandex-team.com>
Date: Fri, 10 Jan 2025 10:43:42 +0300
Subject: HyperScan literal compilation

add literal compilation
commit_hash:29f6f2d5c4ec11ceb61add67bc4e697194a4efff
---
 library/cpp/regex/hyperscan/hyperscan.cpp       | 107 ++++++++++++++++++++++++
 library/cpp/regex/hyperscan/hyperscan.h         |  26 ++++++
 library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp |  59 +++++++++++++
 3 files changed, 192 insertions(+)

(limited to 'library/cpp/regex/hyperscan')

diff --git a/library/cpp/regex/hyperscan/hyperscan.cpp b/library/cpp/regex/hyperscan/hyperscan.cpp
index 0a4bfcb9ec..bfb9d7b5ff 100644
--- a/library/cpp/regex/hyperscan/hyperscan.cpp
+++ b/library/cpp/regex/hyperscan/hyperscan.cpp
@@ -96,6 +96,27 @@ namespace NHyperscan {
             return db;
         }
 
+        // Compiles `literal` with hs_compile_lit in HS_MODE_BLOCK for the given platform.
+        // The buffer is passed as pointer + size, exactly as TStringBuf exposes it.
+        // Throws TCompileException carrying the hyperscan message if compilation fails.
+        TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, hs_platform_info_t* platform) {
+            hs_database_t* database = nullptr;
+            hs_compile_error_t* error = nullptr;
+            const hs_error_t rc = hs_compile_lit(
+                    literal.data(), flags, literal.size(),
+                    HS_MODE_BLOCK, platform,
+                    &database, &error);
+            // Wrap both raw pointers before looking at the status, so they are wrapped on the throw path too.
+            TDatabase result(database);
+            NHyperscan::TCompileError errorHolder(error);
+            if (rc == HS_SUCCESS) {
+                return result;
+            }
+            ythrow TCompileException()
+                    << "Failed to compile literal: " << literal << ". "
+                    << "Error message (hyperscan): " << errorHolder->message;
+        }
+
         TDatabase CompileMulti(
                 const TVector<const char*>& regexs,
                 const TVector<unsigned int>& flags,
@@ -150,6 +171,61 @@ namespace NHyperscan {
             return db;
         }
 
+        // Compiles all literals into one HS_MODE_BLOCK database via hs_compile_lit_multi.
+        // literals[i] is lens[i] bytes long, is compiled with flags[i] and is registered under ids[i].
+        // Throws yexception if the vectors differ in size, TCompileException if compilation fails.
+        TDatabase CompileMultiLiteral(
+            const TVector<const char*>& literals,
+            const TVector<unsigned int>& flags,
+            const TVector<unsigned int>& ids,
+            const TVector<size_t>& lens,
+            hs_platform_info_t* platform)
+        {
+            unsigned int count = literals.size();
+            if (flags.size() != count) {
+                ythrow yexception()
+                        << "Mismatch of sizes vectors passed to CompileMultiLiteral. "
+                        << "size(literals) = " << literals.size() << ". "
+                        << "size(flags) = " << flags.size() << ".";
+            }
+            if (ids.size() != count) {
+                ythrow yexception()
+                        << "Mismatch of sizes vectors passed to CompileMultiLiteral. "
+                        << "size(literals) = " << literals.size() << ". "
+                        << "size(ids) = " << ids.size() << ".";
+            }
+            if (lens.size() != count) {
+                ythrow yexception()
+                        << "Mismatch of sizes vectors passed to CompileMultiLiteral. "
+                        << "size(literals) = " << literals.size() << ". "
+                        << "size(lens) = " << lens.size() << ".";
+            }
+            hs_database_t* rawDb = nullptr;
+            hs_compile_error_t* rawCompileErr = nullptr;
+            hs_error_t status = hs_compile_lit_multi(
+                    literals.data(), flags.data(), ids.data(), lens.data(),
+                    count, HS_MODE_BLOCK, platform,
+                    &rawDb, &rawCompileErr);
+            TDatabase db(rawDb);
+            NHyperscan::TCompileError compileError(rawCompileErr);
+            if (status != HS_SUCCESS) {
+                if (compileError->expression >= 0) {
+                    // Literals come with explicit lengths and need not be NUL-terminated,
+                    // so quote the failing one via pointer + length instead of as a C string.
+                    const size_t failedIdx = compileError->expression;
+                    const TStringBuf literal(literals[failedIdx], lens[failedIdx]);
+                    ythrow TCompileException()
+                            << "Failed to compile literal: " << literal << ". "
+                            << "Error message (hyperscan): " << compileError->message;
+                } else {
+                    ythrow TCompileException()
+                            << "Failed to compile multiple literals. "
+                            << "Error message (hyperscan): " << compileError->message;
+                }
+            }
+            return db;
+        }
+
         bool Matches(
                 const TDatabase& db,
                 const TScratch& scratch,
@@ -180,6 +256,16 @@ namespace NHyperscan {
         return NPrivate::Compile(regex, flags, &platformInfo);
     }
 
+    TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags) {
+        auto currentPlatform = NPrivate::MakeCurrentPlatformInfo(); // no CPU features given by the caller
+        return NPrivate::CompileLiteral(literal, flags, &currentPlatform);
+    }
+
+    TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, TCPUFeatures cpuFeatures) {
+        auto requestedPlatform = NPrivate::MakePlatformInfo(cpuFeatures); // platform info built from cpuFeatures
+        return NPrivate::CompileLiteral(literal, flags, &requestedPlatform);
+    }
+
     TDatabase CompileMulti(
             const TVector<const char*>& regexs,
             const TVector<unsigned int>& flags,
@@ -201,6 +287,27 @@ namespace NHyperscan {
         return NPrivate::CompileMulti(regexs, flags, ids, &platformInfo, extendedParameters);
     }
 
+    TDatabase CompileMultiLiteral(
+        const TVector<const char*>& literals,
+        const TVector<unsigned int>& flags,
+        const TVector<unsigned int>& ids,
+        const TVector<size_t>& lens)
+    {
+        auto currentPlatform = NPrivate::MakeCurrentPlatformInfo(); // no CPU features given by the caller
+        return NPrivate::CompileMultiLiteral(literals, flags, ids, lens, &currentPlatform);
+    }
+
+    TDatabase CompileMultiLiteral(
+        const TVector<const char*>& literals,
+        const TVector<unsigned int>& flags,
+        const TVector<unsigned int>& ids,
+        const TVector<size_t>& lens,
+        TCPUFeatures cpuFeatures)
+    {
+        auto requestedPlatform = NPrivate::MakePlatformInfo(cpuFeatures); // platform info built from cpuFeatures
+        return NPrivate::CompileMultiLiteral(literals, flags, ids, lens, &requestedPlatform);
+    }
+
     TScratch MakeScratch(const TDatabase& db) {
         hs_scratch_t* rawScratch = nullptr;
         hs_error_t status = Singleton<NPrivate::TImpl>()->AllocScratch(db.Get(), &rawScratch);
diff --git a/library/cpp/regex/hyperscan/hyperscan.h b/library/cpp/regex/hyperscan/hyperscan.h
index eae82fa384..7f8c877b07 100644
--- a/library/cpp/regex/hyperscan/hyperscan.h
+++ b/library/cpp/regex/hyperscan/hyperscan.h
@@ -60,6 +60,8 @@ namespace NHyperscan {
 
         TDatabase Compile(const TStringBuf& regex, unsigned int flags, hs_platform_info_t* platform);
 
+        TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, hs_platform_info_t* platform); // hs_compile_lit in HS_MODE_BLOCK; throws TCompileException on failure
+
         TDatabase CompileMulti(
             const TVector<const char*>& regexs,
             const TVector<unsigned int>& flags,
@@ -67,6 +69,13 @@ namespace NHyperscan {
             hs_platform_info_t* platform,
             const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr);
 
+        TDatabase CompileMultiLiteral( // hs_compile_lit_multi in HS_MODE_BLOCK; throws TCompileException on failure
+            const TVector<const char*>& literals, // literals[i] points at lens[i] bytes
+            const TVector<unsigned int>& flags, // flags[i] is passed for literals[i]
+            const TVector<unsigned int>& ids, // ids[i] is passed for literals[i]
+            const TVector<size_t>& lens, // all four vectors must have the same size, otherwise yexception is thrown
+            hs_platform_info_t* platform); // forwarded to hs_compile_lit_multi
+
         // We need to parametrize Scan and Matches functions for testing purposes
         template<typename TCallback>
         void Scan(
@@ -118,6 +127,10 @@ namespace NHyperscan {
 
     TDatabase Compile(const TStringBuf& regex, unsigned int flags, TCPUFeatures cpuFeatures);
 
+    TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags); // uses NPrivate::MakeCurrentPlatformInfo() as the target platform
+
+    TDatabase CompileLiteral(const TStringBuf& literal, unsigned int flags, TCPUFeatures cpuFeatures); // uses NPrivate::MakePlatformInfo(cpuFeatures) as the target platform
+
     TDatabase CompileMulti(
         const TVector<const char*>& regexs,
         const TVector<unsigned int>& flags,
@@ -131,6 +144,19 @@ namespace NHyperscan {
         TCPUFeatures cpuFeatures,
         const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr);
 
+    TDatabase CompileMultiLiteral( // uses NPrivate::MakeCurrentPlatformInfo() as the target platform
+        const TVector<const char*>& literals, // literals[i] points at lens[i] bytes
+        const TVector<unsigned int>& flags, // flags[i] is passed for literals[i]
+        const TVector<unsigned int>& ids, // ids[i] is passed for literals[i]
+        const TVector<size_t>& lens); // all four vectors must have the same size, otherwise yexception is thrown
+
+    TDatabase CompileMultiLiteral( // uses NPrivate::MakePlatformInfo(cpuFeatures) as the target platform
+        const TVector<const char*>& literals, // literals[i] points at lens[i] bytes
+        const TVector<unsigned int>& flags, // flags[i] is passed for literals[i]
+        const TVector<unsigned int>& ids, // ids[i] is passed for literals[i]
+        const TVector<size_t>& lens, // all four vectors must have the same size, otherwise yexception is thrown
+        TCPUFeatures cpuFeatures); // converted to platform info before compilation
+
     TScratch MakeScratch(const TDatabase& db);
 
     void GrowScratch(TScratch& scratch, const TDatabase& db);
diff --git a/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
index 75cd0bcc89..063ca3dd03 100644
--- a/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
+++ b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
@@ -27,6 +27,22 @@ Y_UNIT_TEST_SUITE(HyperscanWrappers) {
         UNIT_ASSERT_EQUAL(foundId, 0);
     }
 
+    // A literal made of regex metacharacters must compile and be found in identical input.
+    Y_UNIT_TEST(CompileLiteralAndScan) {
+        TDatabase literalDb = CompileLiteral("a.c?[)", HS_FLAG_SINGLEMATCH);
+        TScratch scanScratch = MakeScratch(literalDb);
+
+        // Sentinel value: the match callback is expected to overwrite it.
+        unsigned int reportedId = 42;
+        auto onMatch = [&reportedId](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+            reportedId = id;
+        };
+
+        NHyperscan::Scan(literalDb, scanScratch, "a.c?[)", onMatch);
+        // No id is passed to CompileLiteral; the match is expected to be reported as 0.
+        UNIT_ASSERT_EQUAL(reportedId, 0);
+    }
+
     Y_UNIT_TEST(Matches) {
         NHyperscan::TDatabase db = NHyperscan::Compile(
             "a.c",
@@ -71,6 +87,49 @@ Y_UNIT_TEST_SUITE(HyperscanWrappers) {
         UNIT_ASSERT(foundIds.contains(241));
     }
 
+    // Two literals with different flags: only the second one is compiled with HS_FLAG_CASELESS.
+    Y_UNIT_TEST(MultiLiteral) {
+        static const TVector<TString> LITERALS = {
+             "foo.",
+             "bar.",
+        };
+        const TVector<const char*> patterns = {
+            LITERALS[0].c_str(),
+            LITERALS[1].c_str(),
+        };
+        const TVector<unsigned int> patternFlags = {
+            HS_FLAG_SINGLEMATCH,
+            HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
+        };
+        const TVector<unsigned int> patternIds = {
+            42,
+            241,
+        };
+        const TVector<size_t> patternLens = {
+            LITERALS[0].size(),
+            LITERALS[1].size(),
+        };
+        NHyperscan::TDatabase db =
+            NHyperscan::CompileMultiLiteral(patterns, patternFlags, patternIds, patternLens);
+        NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
+
+        // "bar." is caseless, "foo." is not, so "FOO." must not match.
+        UNIT_ASSERT(NHyperscan::Matches(db, scratch, "foo."));
+        UNIT_ASSERT(NHyperscan::Matches(db, scratch, "bar."));
+        UNIT_ASSERT(NHyperscan::Matches(db, scratch, "BAR."));
+        UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "FOO."));
+
+        // Scan one buffer holding both literals and collect every reported id.
+        TSet<unsigned int> seenIds;
+        auto collectId = [&seenIds](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+            seenIds.insert(id);
+        };
+        NHyperscan::Scan(db, scratch, "foo.BaR.", collectId);
+        UNIT_ASSERT_EQUAL(seenIds.size(), 2);
+        UNIT_ASSERT(seenIds.contains(42));
+        UNIT_ASSERT(seenIds.contains(241));
+    }
+
     // https://ml.yandex-team.ru/thread/2370000002965712422/
     Y_UNIT_TEST(MultiRegression) {
         NHyperscan::CompileMulti(
-- 
cgit v1.2.3