aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
diff options
context:
space:
mode:
author Devtools Arcadia <arcadia-devtools@yandex-team.ru> 2022-02-07 18:08:42 +0300
committer Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> 2022-02-07 18:08:42 +0300
commit 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
download ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp')
-rw-r--r-- library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp 231
1 files changed, 231 insertions, 0 deletions
diff --git a/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
new file mode 100644
index 0000000000..9caa53f2e7
--- /dev/null
+++ b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
@@ -0,0 +1,231 @@
+#include <library/cpp/regex/hyperscan/hyperscan.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/set.h>
+
+#include <array>
+#include <algorithm>
+
+Y_UNIT_TEST_SUITE(HyperscanWrappers) {
+ using namespace NHyperscan;
+ using namespace NHyperscan::NPrivate;
+
+    Y_UNIT_TEST(CompileAndScan) {
+        // Compile a single pattern and allocate scratch space for scanning with it.
+        NHyperscan::TDatabase database = NHyperscan::Compile("a.c", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+        NHyperscan::TScratch scratchSpace = NHyperscan::MakeScratch(database);
+
+        // Start from a sentinel value so the final check can only pass if the callback ran.
+        unsigned int reportedId = 42;
+        auto onMatch = [&reportedId](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+            reportedId = id;
+        };
+        NHyperscan::Scan(database, scratchSpace, "abc", onMatch);
+
+        // The test expects the match to be reported with id 0.
+        UNIT_ASSERT_EQUAL(reportedId, 0);
+    }
+
+    Y_UNIT_TEST(Matches) {
+        // Same pattern as in CompileAndScan, exercised through the boolean Matches() helper.
+        TDatabase database = NHyperscan::Compile("a.c", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+        TScratch scratchSpace = NHyperscan::MakeScratch(database);
+
+        // "abc" fits the pattern "a.c".
+        const bool hitOnAbc = NHyperscan::Matches(database, scratchSpace, "abc");
+        UNIT_ASSERT(hitOnAbc);
+
+        // "foo" does not.
+        const bool hitOnFoo = NHyperscan::Matches(database, scratchSpace, "foo");
+        UNIT_ASSERT(!hitOnFoo);
+    }
+
+    Y_UNIT_TEST(Multi) {
+        // Two patterns with explicit ids 42 and 241; the second flag set additionally
+        // carries HS_FLAG_CASELESS.
+        TDatabase database = NHyperscan::CompileMulti(
+            {"foo", "bar"},
+            {HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH, HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS},
+            {42, 241});
+        TScratch scratchSpace = NHyperscan::MakeScratch(database);
+
+        // "bar" is expected to match in any case, "foo" only in lower case.
+        UNIT_ASSERT(NHyperscan::Matches(database, scratchSpace, "foo"));
+        UNIT_ASSERT(NHyperscan::Matches(database, scratchSpace, "bar"));
+        UNIT_ASSERT(NHyperscan::Matches(database, scratchSpace, "BAR"));
+        UNIT_ASSERT(!NHyperscan::Matches(database, scratchSpace, "FOO"));
+
+        // Scan a text containing both patterns and record every id passed to the callback.
+        TSet<unsigned int> reportedIds;
+        auto collectId = [&reportedIds](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+            reportedIds.insert(id);
+        };
+        NHyperscan::Scan(database, scratchSpace, "fooBaR", collectId);
+
+        // Both ids given to CompileMulti must have been reported, and nothing else.
+        UNIT_ASSERT_EQUAL(reportedIds.size(), 2);
+        UNIT_ASSERT(reportedIds.contains(42));
+        UNIT_ASSERT(reportedIds.contains(241));
+    }
+
+ // https://ml.yandex-team.ru/thread/2370000002965712422/
+    Y_UNIT_TEST(MultiRegression) {
+        // Only the CompileMulti call itself is exercised here: the returned database is
+        // discarded and nothing is scanned.
+        NHyperscan::CompileMulti({"aa.bb/cc.dd"}, {HS_FLAG_UTF8}, {0});
+    }
+
+    Y_UNIT_TEST(Serialize) {
+        TDatabase source = NHyperscan::Compile("foo", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+        TString blob = Serialize(source);
+        // Drop the source database so that only the serialized form is left to restore from.
+        source.Reset();
+
+        TDatabase restored = Deserialize(blob);
+        TScratch scratchSpace = NHyperscan::MakeScratch(restored);
+
+        // The restored database must behave like the original pattern (case-sensitive "foo").
+        UNIT_ASSERT(NHyperscan::Matches(restored, scratchSpace, "foo"));
+        UNIT_ASSERT(!NHyperscan::Matches(restored, scratchSpace, "FOO"));
+    }
+
+    Y_UNIT_TEST(GrowScratch) {
+        TDatabase shortDb = NHyperscan::Compile("foo", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+        TDatabase longDb = NHyperscan::Compile(
+            "longer\\w\\w\\wpattern",
+            HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_UTF8);
+
+        // The scratch is created for the first database only and then grown for the second.
+        TScratch sharedScratch = NHyperscan::MakeScratch(shortDb);
+        NHyperscan::GrowScratch(sharedScratch, longDb);
+
+        // After growing, the same scratch is used with both databases.
+        UNIT_ASSERT(NHyperscan::Matches(shortDb, sharedScratch, "foo"));
+        UNIT_ASSERT(NHyperscan::Matches(longDb, sharedScratch, "longerWWWpattern"));
+    }
+
+    Y_UNIT_TEST(CloneScratch) {
+        TDatabase database = NHyperscan::Compile("foo", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+
+        TScratch sourceScratch = NHyperscan::MakeScratch(database);
+        TScratch clonedScratch = NHyperscan::CloneScratch(sourceScratch);
+        // Release the source scratch before scanning: the clone has to work without it.
+        sourceScratch.Reset();
+
+        UNIT_ASSERT(NHyperscan::Matches(database, clonedScratch, "foo"));
+    }
+
+    // Fixture for TestCrossPlatformCompile: a single case-sensitive "foo" pattern.
+    struct TSimpleSingleRegex {
+        // Compiles the pattern for the given set of CPU features.
+        static TDatabase Compile(TCPUFeatures cpuFeatures) {
+            return NHyperscan::Compile(
+                "foo",
+                HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
+                cpuFeatures);
+        }
+
+        // Scans with the supplied implementation: "foo" must match, "FOO" must not.
+        static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
+            TScratch scratchSpace = NHyperscan::MakeScratch(db);
+
+            const bool hitLowerCase = NHyperscan::NPrivate::Matches(db, scratchSpace, "foo", impl);
+            UNIT_ASSERT(hitLowerCase);
+
+            const bool hitUpperCase = NHyperscan::NPrivate::Matches(db, scratchSpace, "FOO", impl);
+            UNIT_ASSERT(!hitUpperCase);
+        }
+    };
+
+ // This regex uses AVX2 instructions on long (>70) texts.
+ // It crashes when compiled for a machine with AVX2 and run on a machine without it.
+    struct TAvx2SingleRegex {
+        // Compiles the pattern (a run of Cyrillic letters, a dot, then 2-5 Latin letters
+        // or dashes) for the given set of CPU features.
+        static TDatabase Compile(TCPUFeatures cpuFeatures) {
+            const char* const pattern =
+                "[ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё]+"
+                "[.][\\-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz]{2,5}";
+            const unsigned int compileFlags =
+                HS_FLAG_UTF8 | HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY;
+            return NHyperscan::Compile(pattern, compileFlags, cpuFeatures);
+        }
+
+        // Scans two long, underscore-padded texts with the supplied implementation.
+        static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
+            TScratch scratchSpace = NHyperscan::MakeScratch(db);
+
+            // The text containing "фу.bar" must match.
+            const bool hitWithSuffix = NHyperscan::NPrivate::Matches(
+                db,
+                scratchSpace,
+                "_________________________________________________________________"
+                "фу.bar"
+                "_________________________________________________________________",
+                impl);
+            UNIT_ASSERT(hitWithSuffix);
+
+            // The text containing only "фу" (no dot and Latin part) must not match.
+            const bool hitWithoutSuffix = NHyperscan::NPrivate::Matches(
+                db,
+                scratchSpace,
+                "_________________________________________________________________"
+                "фу"
+                "_________________________________________________________________",
+                impl);
+            UNIT_ASSERT(!hitWithoutSuffix);
+        }
+    };
+
+    // Fixture for TestCrossPlatformCompile: the two-pattern database also used by the Multi test.
+    struct TSimpleMultiRegex {
+        // Compiles "foo" (id 42) and "bar" (id 241) for the given set of CPU features;
+        // the second flag set additionally carries HS_FLAG_CASELESS.
+        static TDatabase Compile(TCPUFeatures cpuFeatures) {
+            return NHyperscan::CompileMulti(
+                {"foo", "bar"},
+                {HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH, HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS},
+                {42, 241},
+                cpuFeatures);
+        }
+
+        // Repeats the checks of the Multi test, scanning with the supplied implementation.
+        static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
+            TScratch scratchSpace = NHyperscan::MakeScratch(db);
+
+            // "bar" is expected to match in any case, "foo" only in lower case.
+            UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratchSpace, "foo", impl));
+            UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratchSpace, "bar", impl));
+            UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratchSpace, "BAR", impl));
+            UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratchSpace, "FOO", impl));
+
+            // Scan a text containing both patterns and record every id passed to the callback.
+            TSet<unsigned int> reportedIds;
+            auto collectId = [&reportedIds](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+                reportedIds.insert(id);
+            };
+            NHyperscan::NPrivate::Scan(db, scratchSpace, "fooBaR", collectId, impl);
+
+            // Both ids given to CompileMulti must have been reported, and nothing else.
+            UNIT_ASSERT_EQUAL(reportedIds.size(), 2);
+            UNIT_ASSERT(reportedIds.contains(42));
+            UNIT_ASSERT(reportedIds.contains(241));
+        }
+    };
+
+    // Compiles Regex for every runtime from the first listed one up to and including the
+    // runtime detected on the current machine, and checks each resulting database with
+    // the implementation of that same runtime.
+    //
+    // Regex must provide two static functions:
+    //   TDatabase Compile(TCPUFeatures cpuFeatures);
+    //   void Check(const TDatabase& db, const TImpl& impl);
+    template <class Regex>
+    void TestCrossPlatformCompile() {
+        // The loop below relies on this list being ordered from the least to the most
+        // capable runtime.
+        const std::array<ERuntime, 4> runtimes = {
+            ERuntime::Core2,
+            ERuntime::Corei7,
+            ERuntime::AVX2,
+            ERuntime::AVX512,
+        };
+
+        // Unfortunately, we cannot emulate runtimes with more capabilities than the current machine,
+        // so only the runtimes up to the detected one are exercised.
+        const auto currentRuntimeIter = std::find(runtimes.cbegin(), runtimes.cend(), DetectCurrentRuntime());
+
+        // This has to be a real test assertion rather than Y_ASSERT: Y_ASSERT is compiled out
+        // of release builds, and with an unlisted runtime the loop would then run up to and
+        // dereference the end iterator.
+        UNIT_ASSERT_C(
+            currentRuntimeIter != runtimes.cend(),
+            "detected runtime is missing from the list of known runtimes");
+
+        // Safe to advance: the assertion above guarantees currentRuntimeIter is not cend().
+        const auto stopIter = currentRuntimeIter + 1;
+        for (auto targetRuntime = runtimes.cbegin(); targetRuntime != stopIter; ++targetRuntime) {
+            auto db = Regex::Compile(RuntimeCpuFeatures(*targetRuntime));
+            Regex::Check(db, NHyperscan::NPrivate::TImpl{*targetRuntime});
+        }
+    }
+
+    Y_UNIT_TEST(CrossPlatformCompile) {
+        // Single-pattern fixture.
+        TestCrossPlatformCompile<TSimpleSingleRegex>();
+
+        // Fixture whose scan is documented to use AVX2 instructions on long texts.
+        TestCrossPlatformCompile<TAvx2SingleRegex>();
+
+        // Multi-pattern fixture.
+        TestCrossPlatformCompile<TSimpleMultiRegex>();
+    }
+}