diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp')
-rw-r--r-- | library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp | 231 |
1 files changed, 231 insertions, 0 deletions
diff --git a/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp new file mode 100644 index 0000000000..9caa53f2e7 --- /dev/null +++ b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp @@ -0,0 +1,231 @@ +#include <library/cpp/regex/hyperscan/hyperscan.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/set.h> + +#include <array> +#include <algorithm> + +Y_UNIT_TEST_SUITE(HyperscanWrappers) { + using namespace NHyperscan; + using namespace NHyperscan::NPrivate; + + Y_UNIT_TEST(CompileAndScan) { + TDatabase db = Compile("a.c", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH); + TScratch scratch = MakeScratch(db); + + unsigned int foundId = 42; + auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) { + foundId = id; + }; + NHyperscan::Scan( + db, + scratch, + "abc", + callback); + UNIT_ASSERT_EQUAL(foundId, 0); + } + + Y_UNIT_TEST(Matches) { + NHyperscan::TDatabase db = NHyperscan::Compile( + "a.c", + HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH); + NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db); + UNIT_ASSERT(NHyperscan::Matches(db, scratch, "abc")); + UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "foo")); + } + + Y_UNIT_TEST(Multi) { + NHyperscan::TDatabase db = NHyperscan::CompileMulti( + { + "foo", + "bar", + }, + { + HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH, + HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS, + }, + { + 42, + 241, + }); + NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db); + + UNIT_ASSERT(NHyperscan::Matches(db, scratch, "foo")); + UNIT_ASSERT(NHyperscan::Matches(db, scratch, "bar")); + UNIT_ASSERT(NHyperscan::Matches(db, scratch, "BAR")); + UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "FOO")); + + TSet<unsigned int> foundIds; + auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) { + foundIds.insert(id); + }; + NHyperscan::Scan( + db, + scratch, + "fooBaR", + callback); + UNIT_ASSERT_EQUAL(foundIds.size(), 2); + UNIT_ASSERT(foundIds.contains(42)); + UNIT_ASSERT(foundIds.contains(241)); + } + + // https://ml.yandex-team.ru/thread/2370000002965712422/ + Y_UNIT_TEST(MultiRegression) { + NHyperscan::CompileMulti( + { + "aa.bb/cc.dd", + }, + { + HS_FLAG_UTF8, + }, + { + 0, + }); + } + + Y_UNIT_TEST(Serialize) { + NHyperscan::TDatabase db = NHyperscan::Compile( + "foo", + HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH); + TString serialization = Serialize(db); + db.Reset(); + TDatabase db2 = Deserialize(serialization); + NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db2); + + UNIT_ASSERT(NHyperscan::Matches(db2, scratch, "foo")); + UNIT_ASSERT(!NHyperscan::Matches(db2, scratch, "FOO")); + } + + Y_UNIT_TEST(GrowScratch) { + NHyperscan::TDatabase db1 = NHyperscan::Compile( + "foo", + HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH); + NHyperscan::TDatabase db2 = NHyperscan::Compile( + "longer\\w\\w\\wpattern", + HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_UTF8); + NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db1); + NHyperscan::GrowScratch(scratch, db2); + UNIT_ASSERT(NHyperscan::Matches(db1, scratch, "foo")); + UNIT_ASSERT(NHyperscan::Matches(db2, scratch, "longerWWWpattern")); + } + + Y_UNIT_TEST(CloneScratch) { + NHyperscan::TDatabase db = NHyperscan::Compile( + "foo", + HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH); + NHyperscan::TScratch scratch1 = NHyperscan::MakeScratch(db); + NHyperscan::TScratch scratch2 = NHyperscan::CloneScratch(scratch1); + scratch1.Reset(); + UNIT_ASSERT(NHyperscan::Matches(db, scratch2, "foo")); + } + + class TSimpleSingleRegex { + public: + static TDatabase Compile(TCPUFeatures cpuFeatures) { + return NHyperscan::Compile("foo", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH, cpuFeatures); + } + static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) { + NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db); + UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "foo", impl)); + UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratch, "FOO", impl)); + } + }; + + // This regex uses AVX2 instructions on long (>70) texts. + // It crushes when compiled for machine with AVX2 and run on machine without it. + class TAvx2SingleRegex { + public: + static TDatabase Compile(TCPUFeatures cpuFeatures) { + auto regex = "[ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё]+" + "[.][\\-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz]{2,5}"; + unsigned int flags = HS_FLAG_UTF8 | HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY; + return NHyperscan::Compile(regex, flags, cpuFeatures); + } + static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) { + NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db); + UNIT_ASSERT(NHyperscan::NPrivate::Matches( + db, + scratch, + "_________________________________________________________________" + "фу.bar" + "_________________________________________________________________", + impl)); + UNIT_ASSERT(!NHyperscan::NPrivate::Matches( + db, + scratch, + "_________________________________________________________________" + "фу" + "_________________________________________________________________", + impl)); + } + }; + + class TSimpleMultiRegex { + public: + static TDatabase Compile(TCPUFeatures cpuFeatures) { + return NHyperscan::CompileMulti( + { + "foo", + "bar", + }, + { + HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH, + HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS, + }, + { + 42, + 241, + }, + cpuFeatures); + } + static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) { + NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db); + + UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "foo", impl)); + UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "bar", impl)); + UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "BAR", impl)); + UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratch, "FOO", impl)); + + TSet<unsigned int> foundIds; + auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) { + foundIds.insert(id); + }; + NHyperscan::NPrivate::Scan( + db, + scratch, + "fooBaR", + callback, + impl); + UNIT_ASSERT_EQUAL(foundIds.size(), 2); + UNIT_ASSERT(foundIds.contains(42)); + UNIT_ASSERT(foundIds.contains(241)); + } + }; + + template <class Regex> + void TestCrossPlatformCompile() { + const std::array<ERuntime, 4> runtimes = { + ERuntime::Core2, + ERuntime::Corei7, + ERuntime::AVX2, + ERuntime::AVX512 + }; + + // Unfortunately, we cannot emulate runtimes with more capabilities than current machine. + auto currentRuntimeIter = std::find(runtimes.cbegin(), runtimes.cend(), DetectCurrentRuntime()); + Y_ASSERT(currentRuntimeIter != runtimes.cend()); + + for (auto targetRuntime = runtimes.cbegin(); targetRuntime <= currentRuntimeIter; ++targetRuntime) { + auto db = Regex::Compile(RuntimeCpuFeatures(*targetRuntime)); + Regex::Check(db, NHyperscan::NPrivate::TImpl{*targetRuntime}); + } + } + + Y_UNIT_TEST(CrossPlatformCompile) { + TestCrossPlatformCompile<TSimpleSingleRegex>(); + TestCrossPlatformCompile<TAvx2SingleRegex>(); + TestCrossPlatformCompile<TSimpleMultiRegex>(); + } +} |