aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/regex/hyperscan
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/hyperscan
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex/hyperscan')
-rw-r--r--library/cpp/regex/hyperscan/hyperscan.cpp282
-rw-r--r--library/cpp/regex/hyperscan/hyperscan.h160
-rw-r--r--library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp231
-rw-r--r--library/cpp/regex/hyperscan/ut/ya.make13
-rw-r--r--library/cpp/regex/hyperscan/ya.make19
5 files changed, 705 insertions, 0 deletions
diff --git a/library/cpp/regex/hyperscan/hyperscan.cpp b/library/cpp/regex/hyperscan/hyperscan.cpp
new file mode 100644
index 0000000000..ba321f9c29
--- /dev/null
+++ b/library/cpp/regex/hyperscan/hyperscan.cpp
@@ -0,0 +1,282 @@
+#include "hyperscan.h"
+
+#include <contrib/libs/hyperscan/runtime_core2/hs_common.h>
+#include <contrib/libs/hyperscan/runtime_core2/hs_runtime.h>
+#include <contrib/libs/hyperscan/runtime_corei7/hs_common.h>
+#include <contrib/libs/hyperscan/runtime_corei7/hs_runtime.h>
+#include <contrib/libs/hyperscan/runtime_avx2/hs_common.h>
+#include <contrib/libs/hyperscan/runtime_avx2/hs_runtime.h>
+#include <contrib/libs/hyperscan/runtime_avx512/hs_common.h>
+#include <contrib/libs/hyperscan/runtime_avx512/hs_runtime.h>
+
+#include <util/generic/singleton.h>
+
+namespace NHyperscan {
+ // Owns a char buffer and releases it with free(); used below to hold the
+ // bytes produced by SerializeDatabase.
+ using TSerializedDatabase = THolder<char, TDeleter<decltype(&free), &free>>;
+
+ // Owns a hs_compile_error_t and releases it with hs_free_compile_error().
+ using TCompileError = THolder<hs_compile_error_t, TDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>;
+
+ namespace NPrivate {
+ // Picks the most capable runtime whose CPU features NX86 reports as present,
+ // probing from AVX512 downwards; Core2 is the fallback when nothing better
+ // is detected.
+ ERuntime DetectCurrentRuntime() {
+     if (NX86::HaveAVX512F() && NX86::HaveAVX512BW()) {
+         return ERuntime::AVX512;
+     }
+     if (NX86::HaveAVX() && NX86::HaveAVX2()) {
+         return ERuntime::AVX2;
+     }
+     if (NX86::HaveSSE42() && NX86::HavePOPCNT()) {
+         return ERuntime::Corei7;
+     }
+     return ERuntime::Core2;
+ }
+
+ // Returns the cpu_features mask to compile with for the given runtime.
+ // Core2 and Corei7 map to an empty mask. An unrecognized value trips
+ // Y_ASSERT and is then treated like Core2/Corei7.
+ TCPUFeatures RuntimeCpuFeatures(ERuntime runtime) {
+     switch (runtime) {
+         case ERuntime::AVX512:
+             return CPU_FEATURES_AVX512;
+         case ERuntime::AVX2:
+             return CPU_FEATURES_AVX2;
+         case ERuntime::Corei7:
+         case ERuntime::Core2:
+             return 0;
+         default:
+             Y_ASSERT(false);
+             return 0;
+     }
+ }
+
+ // Builds a platform descriptor with generic tuning and the supplied feature
+ // mask; the two remaining fields are set to zero.
+ hs_platform_info_t MakePlatformInfo(TCPUFeatures cpuFeatures) {
+     return hs_platform_info_t{HS_TUNE_FAMILY_GENERIC, cpuFeatures, 0, 0};
+ }
+
+ // Platform descriptor matching the runtime detected on the current machine.
+ hs_platform_info_t MakeCurrentPlatformInfo() {
+     const ERuntime currentRuntime = DetectCurrentRuntime();
+     const TCPUFeatures features = RuntimeCpuFeatures(currentRuntime);
+     return MakePlatformInfo(features);
+ }
+
+ // Binds the four entry points (scratch allocation, scan, serialize,
+ // deserialize) to the build of the library that matches `runtime`.
+ // An unrecognized value trips Y_ASSERT and then uses the Core2 entry points.
+ TImpl::TImpl(ERuntime runtime) {
+     switch (runtime) {
+         case ERuntime::AVX512:
+             AllocScratch = avx512_hs_alloc_scratch;
+             Scan = avx512_hs_scan;
+             SerializeDatabase = avx512_hs_serialize_database;
+             DeserializeDatabase = avx512_hs_deserialize_database;
+             break;
+         case ERuntime::AVX2:
+             AllocScratch = avx2_hs_alloc_scratch;
+             Scan = avx2_hs_scan;
+             SerializeDatabase = avx2_hs_serialize_database;
+             DeserializeDatabase = avx2_hs_deserialize_database;
+             break;
+         case ERuntime::Corei7:
+             AllocScratch = corei7_hs_alloc_scratch;
+             Scan = corei7_hs_scan;
+             SerializeDatabase = corei7_hs_serialize_database;
+             DeserializeDatabase = corei7_hs_deserialize_database;
+             break;
+         default:
+             Y_ASSERT(false);
+             [[fallthrough]];
+         case ERuntime::Core2:
+             AllocScratch = core2_hs_alloc_scratch;
+             Scan = core2_hs_scan;
+             SerializeDatabase = core2_hs_serialize_database;
+             DeserializeDatabase = core2_hs_deserialize_database;
+             break;
+     }
+ }
+
+ // Compiles a single regex into a block-mode database for `platform`.
+ //
+ // regex    - pattern text; need not be NUL-terminated.
+ // flags    - HS_FLAG_* bits passed through to hs_compile().
+ // platform - target platform description passed through to hs_compile().
+ //
+ // Throws TCompileException with the library's error message when
+ // compilation fails.
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags, hs_platform_info_t* platform) {
+     // hs_compile() reads a NUL-terminated C string, whereas a TStringBuf is
+     // just a (pointer, length) view that may point into the middle of a
+     // larger buffer. Copy it into an owning string to get a terminator.
+     const TString pattern(regex);
+     hs_database_t* rawDb = nullptr;
+     hs_compile_error_t* rawCompileErr = nullptr;
+     hs_error_t status = hs_compile(
+         pattern.c_str(),
+         flags,
+         HS_MODE_BLOCK,
+         platform,
+         &rawDb,
+         &rawCompileErr);
+     // Take ownership right away so both objects are released on every path.
+     TDatabase db(rawDb);
+     NHyperscan::TCompileError compileError(rawCompileErr);
+     if (status != HS_SUCCESS) {
+         // Do not dereference a missing error object while reporting a failure.
+         const char* details = compileError ? compileError->message : "unknown error";
+         ythrow TCompileException()
+             << "Failed to compile regex: " << regex << ". "
+             << "Error message (hyperscan): " << details;
+     }
+     return db;
+ }
+
+ // Compiles several regexes into one block-mode database for `platform`.
+ //
+ // regexs, flags and ids are parallel arrays and must have equal sizes;
+ // extendedParameters is optional, but when given must have the same size too.
+ // Throws yexception on a size mismatch and TCompileException when the
+ // library rejects the patterns (naming the offending regex when the library
+ // reports its index).
+ TDatabase CompileMulti(
+     const TVector<const char*>& regexs,
+     const TVector<unsigned int>& flags,
+     const TVector<unsigned int>& ids,
+     hs_platform_info_t* platform,
+     const TVector<const hs_expr_ext_t*>* extendedParameters) {
+     const unsigned int count = regexs.size();
+
+     // Validate that every parallel array matches the number of patterns.
+     if (count != flags.size()) {
+         ythrow yexception()
+             << "Mismatch of sizes vectors passed to CompileMulti. "
+             << "size(regexs) = " << regexs.size() << ". "
+             << "size(flags) = " << flags.size() << ".";
+     }
+     if (count != ids.size()) {
+         ythrow yexception()
+             << "Mismatch of sizes vectors passed to CompileMulti. "
+             << "size(regexs) = " << regexs.size() << ". "
+             << "size(ids) = " << ids.size() << ".";
+     }
+     if (extendedParameters && count != extendedParameters->size()) {
+         ythrow yexception()
+             << "Mismatch of sizes vectors passed to CompileMulti. "
+             << "size(regexs) = " << regexs.size() << ". "
+             << "size(extendedParameters) = " << extendedParameters->size() << ".";
+     }
+
+     const hs_expr_ext_t* const* extParams = extendedParameters ? extendedParameters->data() : nullptr;
+     hs_database_t* compiledDb = nullptr;
+     hs_compile_error_t* errorInfo = nullptr;
+     const hs_error_t rc = hs_compile_ext_multi(
+         regexs.data(),
+         flags.data(),
+         ids.data(),
+         extParams,
+         count,
+         HS_MODE_BLOCK,
+         platform,
+         &compiledDb,
+         &errorInfo);
+
+     // Take ownership before any throw so nothing leaks.
+     TDatabase db(compiledDb);
+     NHyperscan::TCompileError compileError(errorInfo);
+     if (rc == HS_SUCCESS) {
+         return db;
+     }
+
+     // A negative expression index means the error is not tied to one pattern.
+     if (compileError->expression < 0) {
+         ythrow TCompileException()
+             << "Failed to compile multiple regexs. "
+             << "Error message (hyperscan): " << compileError->message;
+     }
+     const char* failedRegex = regexs[compileError->expression];
+     ythrow TCompileException()
+         << "Failed to compile regex: " << failedRegex << ". "
+         << "Error message (hyperscan): " << compileError->message;
+ }
+
+ // Returns true when at least one pattern of `db` matches somewhere in `text`,
+ // scanning with the entry points of `impl`. The scan is cut short on the
+ // first reported match.
+ bool Matches(
+     const TDatabase& db,
+     const TScratch& scratch,
+     const TStringBuf& text,
+     const TImpl& impl) {
+     bool matched = false;
+     auto onMatch = [&matched](unsigned int /* id */, unsigned long long /* from */, unsigned long long /* to */) {
+         matched = true;
+         return 1; // stop scan
+     };
+     Scan(db, scratch, text, onMatch, impl);
+     return matched;
+ }
+ } // namespace NPrivate
+
+ // Compiles `regex` for the CPU features detected on the current machine.
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags) {
+     hs_platform_info_t currentPlatform = NPrivate::MakeCurrentPlatformInfo();
+     return NPrivate::Compile(regex, flags, &currentPlatform);
+ }
+
+ // Compiles `regex` for an explicitly specified CPU feature mask.
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags, TCPUFeatures cpuFeatures) {
+     hs_platform_info_t targetPlatform = NPrivate::MakePlatformInfo(cpuFeatures);
+     return NPrivate::Compile(regex, flags, &targetPlatform);
+ }
+
+ // Compiles several regexes into one database for the CPU features detected
+ // on the current machine.
+ TDatabase CompileMulti(
+     const TVector<const char*>& regexs,
+     const TVector<unsigned int>& flags,
+     const TVector<unsigned int>& ids,
+     const TVector<const hs_expr_ext_t*>* extendedParameters)
+ {
+     hs_platform_info_t currentPlatform = NPrivate::MakeCurrentPlatformInfo();
+     return NPrivate::CompileMulti(regexs, flags, ids, &currentPlatform, extendedParameters);
+ }
+
+ // Compiles several regexes into one database for an explicitly specified
+ // CPU feature mask.
+ TDatabase CompileMulti(
+     const TVector<const char*>& regexs,
+     const TVector<unsigned int>& flags,
+     const TVector<unsigned int>& ids,
+     TCPUFeatures cpuFeatures,
+     const TVector<const hs_expr_ext_t*>* extendedParameters)
+ {
+     hs_platform_info_t targetPlatform = NPrivate::MakePlatformInfo(cpuFeatures);
+     return NPrivate::CompileMulti(regexs, flags, ids, &targetPlatform, extendedParameters);
+ }
+
+ // Allocates a fresh scratch space for `db` through the process-wide runtime.
+ // Throws yexception when allocation fails.
+ TScratch MakeScratch(const TDatabase& db) {
+     hs_scratch_t* allocated = nullptr;
+     const hs_error_t rc = Singleton<NPrivate::TImpl>()->AllocScratch(db.Get(), &allocated);
+     // Own the pointer before checking the status so it is released on throw.
+     NHyperscan::TScratch owner(allocated);
+     if (rc != HS_SUCCESS) {
+         ythrow yexception() << "Failed to make scratch for hyperscan database";
+     }
+     return owner;
+ }
+
+ // Makes an existing scratch space usable with `db` as well, by passing the
+ // current raw pointer back to AllocScratch.
+ //
+ // If the call hands back a different pointer, the holder drops the old one
+ // without freeing it and adopts the new one. Throws yexception when the call
+ // does not succeed.
+ void GrowScratch(TScratch& scratch, const TDatabase& db) {
+     hs_scratch_t* rawScratch = scratch.Get();
+     hs_error_t status = Singleton<NPrivate::TImpl>()->AllocScratch(db.Get(), &rawScratch);
+     // Sync ownership before looking at the status, so the holder never keeps
+     // a pointer the library has already replaced.
+     if (rawScratch != scratch.Get()) {
+         Y_UNUSED(scratch.Release()); // freed by hs_alloc_scratch
+         scratch.Reset(rawScratch);
+     }
+     if (status != HS_SUCCESS) {
+         ythrow yexception() << "Failed to grow scratch for hyperscan database";
+     }
+ }
+
+ // Produces an independent copy of `scratch`. Throws yexception on failure.
+ // NOTE(review): this calls hs_clone_scratch directly, whereas the other
+ // scratch helpers go through the TImpl function table - confirm that mixing
+ // the two is intended.
+ TScratch CloneScratch(const TScratch& scratch) {
+     hs_scratch_t* cloned = nullptr;
+     const hs_error_t rc = hs_clone_scratch(scratch.Get(), &cloned);
+     TScratch owner(cloned);
+     if (rc != HS_SUCCESS) {
+         ythrow yexception() << "Failed to clone scratch for hyperscan database";
+     }
+     return owner;
+ }
+
+ // Returns true when any pattern of `db` matches in `text`, using the
+ // process-wide runtime.
+ bool Matches(
+     const TDatabase& db,
+     const TScratch& scratch,
+     const TStringBuf& text)
+ {
+     const NPrivate::TImpl& runtime = *Singleton<NPrivate::TImpl>();
+     return NPrivate::Matches(db, scratch, text, runtime);
+ }
+
+ // Serializes `db` into a byte string through the process-wide runtime.
+ // The buffer handed back by the library is released with free() once its
+ // contents have been copied. Throws yexception on failure.
+ TString Serialize(const TDatabase& db) {
+     char* rawBytes = nullptr;
+     size_t byteCount = 0;
+     const hs_error_t rc = Singleton<NPrivate::TImpl>()->SerializeDatabase(
+         db.Get(),
+         &rawBytes,
+         &byteCount);
+     TSerializedDatabase buffer(rawBytes);
+     if (rc != HS_SUCCESS) {
+         ythrow yexception() << "Failed to serialize hyperscan database";
+     }
+     return TString(buffer.Get(), byteCount);
+ }
+
+ // Rebuilds a database from bytes produced by Serialize, using the
+ // process-wide runtime. Throws yexception on failure, with a dedicated
+ // message when the library reports HS_DB_PLATFORM_ERROR.
+ TDatabase Deserialize(const TStringBuf& serialization) {
+     hs_database_t* restored = nullptr;
+     const hs_error_t rc = Singleton<NPrivate::TImpl>()->DeserializeDatabase(
+         serialization.begin(),
+         serialization.size(),
+         &restored);
+     TDatabase db(restored);
+     if (rc == HS_SUCCESS) {
+         return db;
+     }
+     if (rc == HS_DB_PLATFORM_ERROR) {
+         ythrow yexception() << "Serialized Hyperscan database is incompatible with current CPU";
+     }
+     ythrow yexception() << "Failed to deserialize hyperscan database";
+ }
+}
diff --git a/library/cpp/regex/hyperscan/hyperscan.h b/library/cpp/regex/hyperscan/hyperscan.h
new file mode 100644
index 0000000000..1c8f404389
--- /dev/null
+++ b/library/cpp/regex/hyperscan/hyperscan.h
@@ -0,0 +1,160 @@
+#pragma once
+
+#include <contrib/libs/hyperscan/src/hs.h>
+
+#include <util/generic/ptr.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+#include <util/system/cpu_id.h>
+
+namespace NHyperscan {
+ // Integer type of hs_platform_info_t::cpu_features.
+ using TCPUFeatures = decltype(hs_platform_info_t::cpu_features);
+ // Feature mask for compiling a database that targets AVX2.
+ constexpr TCPUFeatures CPU_FEATURES_AVX2 = HS_CPU_FEATURES_AVX2;
+ // Feature mask for compiling a database that targets AVX512; it includes the AVX2 bit as well.
+ constexpr TCPUFeatures CPU_FEATURES_AVX512 = HS_CPU_FEATURES_AVX512 | HS_CPU_FEATURES_AVX2;
+
+ // Deleter policy for THolder that forwards destruction to a free function
+ // supplied as a template argument (for example hs_free_database).
+ template<typename TNativeDeleter, TNativeDeleter NativeDeleter>
+ class TDeleter {
+ public:
+     // Releases `object` by calling the wrapped native function on it.
+     template<typename TObject>
+     static void Destroy(TObject* object) {
+         NativeDeleter(object);
+     }
+ };
+
+ // Owning handle for a compiled database; released with hs_free_database().
+ using TDatabase = THolder<hs_database_t, TDeleter<decltype(&hs_free_database), &hs_free_database>>;
+
+ // Owning handle for a scratch space; released with hs_free_scratch().
+ using TScratch = THolder<hs_scratch_t, TDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
+
+ // Thrown by Compile/CompileMulti when the library rejects a pattern.
+ class TCompileException : public yexception {
+ };
+
+
+ namespace NPrivate {
+ // Builds of the scanning runtime that this wrapper can dispatch to.
+ // The unit tests walk these values in ascending order and stop at the
+ // runtime detected on the current machine.
+ enum class ERuntime {
+ Core2 = 0,
+ Corei7 = 1,
+ AVX2 = 2,
+ AVX512 = 3
+ };
+
+ // Returns the most capable runtime supported by the CPU this process runs on.
+ ERuntime DetectCurrentRuntime();
+
+ // Returns the cpu_features mask to compile with for `runtime`.
+ TCPUFeatures RuntimeCpuFeatures(ERuntime runtime);
+
+ // Builds a platform descriptor with generic tuning and the given feature mask.
+ hs_platform_info_t MakePlatformInfo(TCPUFeatures cpuFeatures);
+
+ // Table of entry points for one build of the scanning runtime.
+ // The constructor fills every pointer according to the chosen ERuntime.
+ struct TImpl {
+ // Allocates (or grows) a scratch space for `db`.
+ hs_error_t (*AllocScratch)(const hs_database_t* db, hs_scratch_t** scratch);
+
+ // Scans `length` bytes of `data`, reporting matches through `onEvent`
+ // with `userCtx` passed back to the handler.
+ hs_error_t (*Scan)(const hs_database_t* db, const char* data,
+ unsigned length, unsigned flags, hs_scratch_t* scratch,
+ match_event_handler onEvent, void* userCtx);
+
+ // Serializes `db` into a newly allocated byte buffer.
+ hs_error_t (*SerializeDatabase)(const hs_database_t* db, char** bytes, size_t* serialized_length);
+
+ // Rebuilds a database from serialized bytes.
+ hs_error_t (*DeserializeDatabase)(const char* bytes, size_t length, hs_database_t** info);
+
+ // Binds to the runtime detected on the current machine.
+ TImpl() : TImpl(DetectCurrentRuntime()) {}
+
+ // Binds to an explicitly chosen runtime.
+ explicit TImpl(ERuntime runtime);
+ };
+
+ // Compiles one regex into a block-mode database for `platform`.
+ // Throws TCompileException when the pattern is rejected.
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags, hs_platform_info_t* platform);
+
+ // Compiles several regexes into one block-mode database for `platform`.
+ // regexs, flags, ids (and extendedParameters, when given) must have equal
+ // sizes, otherwise yexception is thrown. Throws TCompileException when a
+ // pattern is rejected.
+ TDatabase CompileMulti(
+ const TVector<const char*>& regexs,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ hs_platform_info_t* platform,
+ const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr);
+
+ // We need to parametrize Scan and Matches functions for testing purposes
+ //
+ // Scans `text` against `db` with the entry points of `impl` and invokes
+ // `callback(id, from, to)` for every reported match.
+ //
+ // If the callback returns int, that value is handed back to the scanner
+ // as-is; any other return type is ignored and 0 is returned to the scanner.
+ // A scan that ends with HS_SCAN_TERMINATED is not treated as an error.
+ //
+ // Throws yexception when the text does not fit the scanner's length
+ // parameter or when the scan fails.
+ template<typename TCallback>
+ void Scan(
+     const TDatabase& db,
+     const TScratch& scratch,
+     const TStringBuf& text,
+     TCallback& callback, // applied to index of matched regex
+     const TImpl& impl
+ ) {
+     // Adapts the C-style event handler signature to the user callback,
+     // which travels through the opaque context pointer.
+     struct TCallbackWrapper {
+         static int EventHandler(
+             unsigned int id,
+             unsigned long long from,
+             unsigned long long to,
+             unsigned int flags,
+             void* ctx) {
+             Y_UNUSED(flags);
+             // ctx is the address of the callback passed below as void*.
+             TCallback& callback2 = *static_cast<TCallback*>(ctx);
+             if constexpr (std::is_same_v<int, std::invoke_result_t<TCallback, unsigned int, unsigned long long, unsigned long long>>) {
+                 return callback2(id, from, to);
+             } else {
+                 callback2(id, from, to);
+                 return 0;
+             }
+         }
+     };
+     // The scan entry point takes the length as `unsigned`; refuse inputs
+     // that would be silently truncated instead of scanning only a prefix.
+     const unsigned int length = static_cast<unsigned int>(text.size());
+     if (length != text.size()) {
+         ythrow yexception() << "Text is too long to scan: " << text.size() << " bytes";
+     }
+     unsigned int flags = 0; // unused at present
+     hs_error_t status = impl.Scan(
+         db.Get(),
+         text.begin(),
+         length,
+         flags,
+         scratch.Get(),
+         &TCallbackWrapper::EventHandler,
+         &callback);
+     if (status != HS_SUCCESS && status != HS_SCAN_TERMINATED) {
+         ythrow yexception() << "Failed to scan against text: " << text;
+     }
+ }
+
+ // Returns true when at least one pattern of `db` matches in `text`,
+ // scanning with the entry points of `impl`.
+ bool Matches(
+ const TDatabase& db,
+ const TScratch& scratch,
+ const TStringBuf& text,
+ const TImpl& impl);
+ }
+
+ // Compiles `regex` for the CPU features detected on the current machine.
+ // Throws TCompileException when the pattern is rejected.
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags);
+
+ // Same as above, but compiles for an explicitly given CPU feature mask.
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags, TCPUFeatures cpuFeatures);
+
+ // Compiles several regexes into one database for the current machine.
+ // regexs, flags, ids (and extendedParameters, when given) must have equal
+ // sizes, otherwise yexception is thrown.
+ TDatabase CompileMulti(
+ const TVector<const char*>& regexs,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr);
+
+ // Same as above, but compiles for an explicitly given CPU feature mask.
+ TDatabase CompileMulti(
+ const TVector<const char*>& regexs,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ TCPUFeatures cpuFeatures,
+ const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr);
+
+ // Allocates a scratch space for `db`; throws yexception on failure.
+ TScratch MakeScratch(const TDatabase& db);
+
+ // Makes `scratch` usable with `db` too; throws yexception on failure.
+ void GrowScratch(TScratch& scratch, const TDatabase& db);
+
+ // Returns a copy of `scratch`; throws yexception on failure.
+ TScratch CloneScratch(const TScratch& scratch);
+
+ // Scans `text` against `db` using the process-wide runtime and invokes
+ // `callback(id, from, to)` for every reported match.
+ template<typename TCallback>
+ void Scan(
+     const TDatabase& db,
+     const TScratch& scratch,
+     const TStringBuf& text,
+     TCallback& callback // applied to index of matched regex
+ ) {
+     const NPrivate::TImpl& runtime = *Singleton<NPrivate::TImpl>();
+     NPrivate::Scan<TCallback>(db, scratch, text, callback, runtime);
+ }
+
+ // Returns true when at least one pattern of `db` matches in `text`.
+ bool Matches(
+ const TDatabase& db,
+ const TScratch& scratch,
+ const TStringBuf& text);
+
+ // Serializes `db` into a byte string; throws yexception on failure.
+ TString Serialize(const TDatabase& db);
+
+ // Rebuilds a database from bytes produced by Serialize; throws yexception
+ // on failure, including when the bytes are incompatible with the current CPU.
+ TDatabase Deserialize(const TStringBuf& serialization);
+}
diff --git a/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
new file mode 100644
index 0000000000..9caa53f2e7
--- /dev/null
+++ b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
@@ -0,0 +1,231 @@
+#include <library/cpp/regex/hyperscan/hyperscan.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/set.h>
+
+#include <array>
+#include <algorithm>
+
+Y_UNIT_TEST_SUITE(HyperscanWrappers) {
+ using namespace NHyperscan;
+ using namespace NHyperscan::NPrivate;
+
+ // Compiles a single pattern and checks that the scan callback is invoked:
+ // the id starts as the sentinel 42 and is expected to be overwritten with 0.
+ Y_UNIT_TEST(CompileAndScan) {
+     TDatabase database = NHyperscan::Compile("a.c", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+     TScratch scratchSpace = NHyperscan::MakeScratch(database);
+
+     unsigned int reportedId = 42;
+     auto onMatch = [&reportedId](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+         reportedId = id;
+     };
+     NHyperscan::Scan(database, scratchSpace, "abc", onMatch);
+     UNIT_ASSERT_EQUAL(reportedId, 0);
+ }
+
+ // Matches() reports a hit for text containing the pattern and none otherwise.
+ Y_UNIT_TEST(Matches) {
+     const unsigned int compileFlags = HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH;
+     NHyperscan::TDatabase database = NHyperscan::Compile("a.c", compileFlags);
+     NHyperscan::TScratch scratchSpace = NHyperscan::MakeScratch(database);
+     UNIT_ASSERT(NHyperscan::Matches(database, scratchSpace, "abc"));
+     UNIT_ASSERT(!NHyperscan::Matches(database, scratchSpace, "foo"));
+ }
+
+ // Two patterns in one database: "foo" is case-sensitive (id 42) and "bar"
+ // is caseless (id 241). Checks both Matches() and the ids seen by Scan().
+ Y_UNIT_TEST(Multi) {
+     NHyperscan::TDatabase database = NHyperscan::CompileMulti(
+         {"foo", "bar"},
+         {
+             HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
+             HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
+         },
+         {42, 241});
+     NHyperscan::TScratch scratchSpace = NHyperscan::MakeScratch(database);
+
+     UNIT_ASSERT(NHyperscan::Matches(database, scratchSpace, "foo"));
+     UNIT_ASSERT(NHyperscan::Matches(database, scratchSpace, "bar"));
+     UNIT_ASSERT(NHyperscan::Matches(database, scratchSpace, "BAR"));
+     UNIT_ASSERT(!NHyperscan::Matches(database, scratchSpace, "FOO"));
+
+     // Collect every id reported while scanning text that hits both patterns.
+     TSet<unsigned int> seenIds;
+     auto collect = [&seenIds](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+         seenIds.insert(id);
+     };
+     NHyperscan::Scan(database, scratchSpace, "fooBaR", collect);
+     UNIT_ASSERT_EQUAL(seenIds.size(), 2);
+     UNIT_ASSERT(seenIds.contains(42));
+     UNIT_ASSERT(seenIds.contains(241));
+ }
+
+ // https://ml.yandex-team.ru/thread/2370000002965712422/
+ // Regression check: compiling this UTF-8 pattern through CompileMulti must
+ // complete without throwing; the resulting database is discarded.
+ Y_UNIT_TEST(MultiRegression) {
+     NHyperscan::CompileMulti(
+         {"aa.bb/cc.dd"},
+         {HS_FLAG_UTF8},
+         {0});
+ }
+
+ // Round-trips a database through Serialize/Deserialize and checks that the
+ // restored copy still matches after the original has been destroyed.
+ Y_UNIT_TEST(Serialize) {
+     NHyperscan::TDatabase original = NHyperscan::Compile("foo", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+     TString bytes = NHyperscan::Serialize(original);
+     original.Reset();
+
+     TDatabase restored = NHyperscan::Deserialize(bytes);
+     NHyperscan::TScratch scratchSpace = NHyperscan::MakeScratch(restored);
+
+     UNIT_ASSERT(NHyperscan::Matches(restored, scratchSpace, "foo"));
+     UNIT_ASSERT(!NHyperscan::Matches(restored, scratchSpace, "FOO"));
+ }
+
+ // A scratch made for one database and then grown for a second one must be
+ // usable with both.
+ Y_UNIT_TEST(GrowScratch) {
+     NHyperscan::TDatabase smallDb = NHyperscan::Compile("foo", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+     NHyperscan::TDatabase largeDb = NHyperscan::Compile(
+         "longer\\w\\w\\wpattern",
+         HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_UTF8);
+
+     NHyperscan::TScratch sharedScratch = NHyperscan::MakeScratch(smallDb);
+     NHyperscan::GrowScratch(sharedScratch, largeDb);
+
+     UNIT_ASSERT(NHyperscan::Matches(smallDb, sharedScratch, "foo"));
+     UNIT_ASSERT(NHyperscan::Matches(largeDb, sharedScratch, "longerWWWpattern"));
+ }
+
+ // A cloned scratch must stay usable after the scratch it was cloned from
+ // has been destroyed.
+ Y_UNIT_TEST(CloneScratch) {
+     NHyperscan::TDatabase database = NHyperscan::Compile("foo", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+     NHyperscan::TScratch source = NHyperscan::MakeScratch(database);
+     NHyperscan::TScratch copy = NHyperscan::CloneScratch(source);
+     source.Reset();
+     UNIT_ASSERT(NHyperscan::Matches(database, copy, "foo"));
+ }
+
+ // Fixture for the cross-platform test: one plain case-sensitive pattern.
+ class TSimpleSingleRegex {
+ public:
+     // Compiles the pattern for the given CPU feature mask.
+     static TDatabase Compile(TCPUFeatures cpuFeatures) {
+         const unsigned int compileFlags = HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH;
+         return NHyperscan::Compile("foo", compileFlags, cpuFeatures);
+     }
+
+     // Checks matching behaviour of `db` when scanned through `impl`.
+     static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
+         NHyperscan::TScratch scratchSpace = NHyperscan::MakeScratch(db);
+         UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratchSpace, "foo", impl));
+         UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratchSpace, "FOO", impl));
+     }
+ };
+
+ // This regex uses AVX2 instructions on long (>70) texts.
+ // It crashes when compiled for a machine with AVX2 and run on a machine without it.
+ class TAvx2SingleRegex {
+ public:
+ // Compiles the pattern for the given CPU feature mask.
+ static TDatabase Compile(TCPUFeatures cpuFeatures) {
+ auto regex = "[ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё]+"
+ "[.][\\-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz]{2,5}";
+ unsigned int flags = HS_FLAG_UTF8 | HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY;
+ return NHyperscan::Compile(regex, flags, cpuFeatures);
+ }
+ // Scans two long texts through `impl`: one containing a match for the
+ // pattern and one that lacks the ".bar" suffix and must not match.
+ static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
+ NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
+ UNIT_ASSERT(NHyperscan::NPrivate::Matches(
+ db,
+ scratch,
+ "_________________________________________________________________"
+ "фу.bar"
+ "_________________________________________________________________",
+ impl));
+ UNIT_ASSERT(!NHyperscan::NPrivate::Matches(
+ db,
+ scratch,
+ "_________________________________________________________________"
+ "фу"
+ "_________________________________________________________________",
+ impl));
+ }
+ };
+
+ // Fixture for the cross-platform test: two patterns in one database,
+ // "foo" case-sensitive (id 42) and "bar" caseless (id 241).
+ class TSimpleMultiRegex {
+ public:
+     // Compiles both patterns for the given CPU feature mask.
+     static TDatabase Compile(TCPUFeatures cpuFeatures) {
+         return NHyperscan::CompileMulti(
+             {"foo", "bar"},
+             {
+                 HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
+                 HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
+             },
+             {42, 241},
+             cpuFeatures);
+     }
+
+     // Checks Matches() and the ids reported by Scan() when going through `impl`.
+     static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
+         NHyperscan::TScratch scratchSpace = NHyperscan::MakeScratch(db);
+
+         UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratchSpace, "foo", impl));
+         UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratchSpace, "bar", impl));
+         UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratchSpace, "BAR", impl));
+         UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratchSpace, "FOO", impl));
+
+         TSet<unsigned int> seenIds;
+         auto collect = [&seenIds](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+             seenIds.insert(id);
+         };
+         NHyperscan::NPrivate::Scan(db, scratchSpace, "fooBaR", collect, impl);
+         UNIT_ASSERT_EQUAL(seenIds.size(), 2);
+         UNIT_ASSERT(seenIds.contains(42));
+         UNIT_ASSERT(seenIds.contains(241));
+     }
+ };
+
+ // For every runtime up to and including the one detected on this machine,
+ // compiles the fixture's pattern for that runtime's feature mask and checks
+ // it through that runtime's entry points.
+ //
+ // Regex must provide static Compile(TCPUFeatures) and
+ // Check(const TDatabase&, const TImpl&).
+ template <class Regex>
+ void TestCrossPlatformCompile() {
+     const std::array<ERuntime, 4> runtimes = {
+         ERuntime::Core2,
+         ERuntime::Corei7,
+         ERuntime::AVX2,
+         ERuntime::AVX512
+     };
+
+     // Unfortunately, we cannot emulate runtimes with more capabilities than current machine.
+     const auto currentRuntimeIter = std::find(runtimes.cbegin(), runtimes.cend(), DetectCurrentRuntime());
+     // Must be a real test assertion rather than Y_ASSERT: if the check were
+     // compiled out, the loop below would run up to and dereference cend().
+     UNIT_ASSERT(currentRuntimeIter != runtimes.cend());
+
+     for (auto targetRuntime = runtimes.cbegin(); targetRuntime <= currentRuntimeIter; ++targetRuntime) {
+         auto db = Regex::Compile(RuntimeCpuFeatures(*targetRuntime));
+         Regex::Check(db, NHyperscan::NPrivate::TImpl{*targetRuntime});
+     }
+ }
+
+ // Runs the cross-platform compile check once for each of the three fixtures.
+ Y_UNIT_TEST(CrossPlatformCompile) {
+ TestCrossPlatformCompile<TSimpleSingleRegex>();
+ TestCrossPlatformCompile<TAvx2SingleRegex>();
+ TestCrossPlatformCompile<TSimpleMultiRegex>();
+ }
+}
diff --git a/library/cpp/regex/hyperscan/ut/ya.make b/library/cpp/regex/hyperscan/ut/ya.make
new file mode 100644
index 0000000000..da67b88672
--- /dev/null
+++ b/library/cpp/regex/hyperscan/ut/ya.make
@@ -0,0 +1,13 @@
+UNITTEST()
+
+PEERDIR(
+ library/cpp/regex/hyperscan
+)
+
+OWNER(g:antiinfra)
+
+SRCS(
+ hyperscan_ut.cpp
+)
+
+END()
diff --git a/library/cpp/regex/hyperscan/ya.make b/library/cpp/regex/hyperscan/ya.make
new file mode 100644
index 0000000000..e99130ae18
--- /dev/null
+++ b/library/cpp/regex/hyperscan/ya.make
@@ -0,0 +1,19 @@
+LIBRARY()
+
+OWNER(g:antiinfra)
+
+PEERDIR(
+ contrib/libs/hyperscan
+ contrib/libs/hyperscan/runtime_core2
+ contrib/libs/hyperscan/runtime_corei7
+ contrib/libs/hyperscan/runtime_avx2
+ contrib/libs/hyperscan/runtime_avx512
+)
+
+SRCS(
+ hyperscan.cpp
+)
+
+END()
+
+RECURSE_FOR_TESTS(ut)