aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/regex
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex')
-rw-r--r--library/cpp/regex/hyperscan/hyperscan.cpp282
-rw-r--r--library/cpp/regex/hyperscan/hyperscan.h160
-rw-r--r--library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp231
-rw-r--r--library/cpp/regex/hyperscan/ut/ya.make13
-rw-r--r--library/cpp/regex/hyperscan/ya.make19
-rw-r--r--library/cpp/regex/pcre/README.md59
-rw-r--r--library/cpp/regex/pcre/benchmark/main.cpp80
-rw-r--r--library/cpp/regex/pcre/benchmark/ya.make14
-rw-r--r--library/cpp/regex/pcre/pcre.cpp1
-rw-r--r--library/cpp/regex/pcre/pcre.h191
-rw-r--r--library/cpp/regex/pcre/pcre_ut.cpp89
-rw-r--r--library/cpp/regex/pcre/pcre_ut_base.h38
-rw-r--r--library/cpp/regex/pcre/regexp.cpp317
-rw-r--r--library/cpp/regex/pcre/regexp.h63
-rw-r--r--library/cpp/regex/pcre/regexp_ut.cpp103
-rw-r--r--library/cpp/regex/pcre/traits.h99
-rw-r--r--library/cpp/regex/pcre/ut/ya.make10
-rw-r--r--library/cpp/regex/pcre/ya.make23
-rw-r--r--library/cpp/regex/pire/extraencodings.cpp81
-rw-r--r--library/cpp/regex/pire/inline/ya.make22
-rw-r--r--library/cpp/regex/pire/pcre2pire.cpp110
-rw-r--r--library/cpp/regex/pire/pcre2pire.h19
-rw-r--r--library/cpp/regex/pire/pire.h76
-rw-r--r--library/cpp/regex/pire/regexp.h337
-rw-r--r--library/cpp/regex/pire/ut/regexp_ut.cpp318
-rw-r--r--library/cpp/regex/pire/ut/ya.make44
-rw-r--r--library/cpp/regex/pire/ya.make40
-rw-r--r--library/cpp/regex/ya.make14
28 files changed, 2853 insertions, 0 deletions
diff --git a/library/cpp/regex/hyperscan/hyperscan.cpp b/library/cpp/regex/hyperscan/hyperscan.cpp
new file mode 100644
index 0000000000..ba321f9c29
--- /dev/null
+++ b/library/cpp/regex/hyperscan/hyperscan.cpp
@@ -0,0 +1,282 @@
+#include "hyperscan.h"
+
+#include <contrib/libs/hyperscan/runtime_core2/hs_common.h>
+#include <contrib/libs/hyperscan/runtime_core2/hs_runtime.h>
+#include <contrib/libs/hyperscan/runtime_corei7/hs_common.h>
+#include <contrib/libs/hyperscan/runtime_corei7/hs_runtime.h>
+#include <contrib/libs/hyperscan/runtime_avx2/hs_common.h>
+#include <contrib/libs/hyperscan/runtime_avx2/hs_runtime.h>
+#include <contrib/libs/hyperscan/runtime_avx512/hs_common.h>
+#include <contrib/libs/hyperscan/runtime_avx512/hs_runtime.h>
+
+#include <util/generic/singleton.h>
+
+namespace NHyperscan {
+ using TSerializedDatabase = THolder<char, TDeleter<decltype(&free), &free>>;
+
+ using TCompileError = THolder<hs_compile_error_t, TDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>;
+
+ namespace NPrivate {
+ ERuntime DetectCurrentRuntime() {
+ if (NX86::HaveAVX512F() && NX86::HaveAVX512BW()) {
+ return ERuntime::AVX512;
+ } else if (NX86::HaveAVX() && NX86::HaveAVX2()) {
+ return ERuntime::AVX2;
+ } else if (NX86::HaveSSE42() && NX86::HavePOPCNT()) {
+ return ERuntime::Corei7;
+ } else {
+ return ERuntime::Core2;
+ }
+ }
+
+ TCPUFeatures RuntimeCpuFeatures(ERuntime runtime) {
+ switch (runtime) {
+ default:
+ Y_ASSERT(false);
+ [[fallthrough]];
+ case ERuntime::Core2:
+ case ERuntime::Corei7:
+ return 0;
+ case ERuntime::AVX2:
+ return CPU_FEATURES_AVX2;
+ case ERuntime::AVX512:
+ return CPU_FEATURES_AVX512;
+ }
+ }
+
+ hs_platform_info_t MakePlatformInfo(TCPUFeatures cpuFeatures) {
+ hs_platform_info_t platformInfo{HS_TUNE_FAMILY_GENERIC, cpuFeatures, 0, 0};
+ return platformInfo;
+ }
+
+ hs_platform_info_t MakeCurrentPlatformInfo() {
+ return MakePlatformInfo(RuntimeCpuFeatures(DetectCurrentRuntime()));
+ }
+
+ TImpl::TImpl(ERuntime runtime) {
+ switch (runtime) {
+ default:
+ Y_ASSERT(false);
+ [[fallthrough]];
+ case ERuntime::Core2:
+ AllocScratch = core2_hs_alloc_scratch;
+ Scan = core2_hs_scan;
+ SerializeDatabase = core2_hs_serialize_database;
+ DeserializeDatabase = core2_hs_deserialize_database;
+ break;
+ case ERuntime::Corei7:
+ AllocScratch = corei7_hs_alloc_scratch;
+ Scan = corei7_hs_scan;
+ SerializeDatabase = corei7_hs_serialize_database;
+ DeserializeDatabase = corei7_hs_deserialize_database;
+ break;
+ case ERuntime::AVX2:
+ AllocScratch = avx2_hs_alloc_scratch;
+ Scan = avx2_hs_scan;
+ SerializeDatabase = avx2_hs_serialize_database;
+ DeserializeDatabase = avx2_hs_deserialize_database;
+ break;
+ case ERuntime::AVX512:
+ AllocScratch = avx512_hs_alloc_scratch;
+ Scan = avx512_hs_scan;
+ SerializeDatabase = avx512_hs_serialize_database;
+ DeserializeDatabase = avx512_hs_deserialize_database;
+ }
+ }
+
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags, hs_platform_info_t* platform) {
+ hs_database_t* rawDb = nullptr;
+ hs_compile_error_t* rawCompileErr = nullptr;
+ hs_error_t status = hs_compile(
+ regex.begin(),
+ flags,
+ HS_MODE_BLOCK,
+ platform,
+ &rawDb,
+ &rawCompileErr);
+ TDatabase db(rawDb);
+ NHyperscan::TCompileError compileError(rawCompileErr);
+ if (status != HS_SUCCESS) {
+ ythrow TCompileException()
+ << "Failed to compile regex: " << regex << ". "
+ << "Error message (hyperscan): " << compileError->message;
+ }
+ return db;
+ }
+
+ TDatabase CompileMulti(
+ const TVector<const char*>& regexs,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ hs_platform_info_t* platform,
+ const TVector<const hs_expr_ext_t*>* extendedParameters) {
+ unsigned int count = regexs.size();
+ if (flags.size() != count) {
+ ythrow yexception()
+ << "Mismatch of sizes vectors passed to CompileMulti. "
+ << "size(regexs) = " << regexs.size() << ". "
+ << "size(flags) = " << flags.size() << ".";
+ }
+ if (ids.size() != count) {
+ ythrow yexception()
+ << "Mismatch of sizes vectors passed to CompileMulti. "
+ << "size(regexs) = " << regexs.size() << ". "
+ << "size(ids) = " << ids.size() << ".";
+ }
+ if (extendedParameters && extendedParameters->size() != count) {
+ ythrow yexception()
+ << "Mismatch of sizes vectors passed to CompileMulti. "
+ << "size(regexs) = " << regexs.size() << ". "
+ << "size(extendedParameters) = " << extendedParameters->size() << ".";
+ }
+ hs_database_t* rawDb = nullptr;
+ hs_compile_error_t* rawCompileErr = nullptr;
+ hs_error_t status = hs_compile_ext_multi(
+ regexs.data(),
+ flags.data(),
+ ids.data(),
+ extendedParameters ? extendedParameters->data() : nullptr,
+ count,
+ HS_MODE_BLOCK,
+ platform,
+ &rawDb,
+ &rawCompileErr);
+ TDatabase db(rawDb);
+ NHyperscan::TCompileError compileError(rawCompileErr);
+ if (status != HS_SUCCESS) {
+ if (compileError->expression >= 0) {
+ const char* regex = regexs[compileError->expression];
+ ythrow TCompileException()
+ << "Failed to compile regex: " << regex << ". "
+ << "Error message (hyperscan): " << compileError->message;
+ } else {
+ ythrow TCompileException()
+ << "Failed to compile multiple regexs. "
+ << "Error message (hyperscan): " << compileError->message;
+ }
+ }
+ return db;
+ }
+
+ bool Matches(
+ const TDatabase& db,
+ const TScratch& scratch,
+ const TStringBuf& text,
+ const TImpl& impl) {
+ bool result = false;
+ auto callback = [&](unsigned int /* id */, unsigned long long /* from */, unsigned long long /* to */) {
+ result = true;
+ return 1; // stop scan
+ };
+ Scan(
+ db,
+ scratch,
+ text,
+ callback,
+ impl);
+ return result;
+ }
+ } // namespace NPrivate
+
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags) {
+ auto platformInfo = NPrivate::MakeCurrentPlatformInfo();
+ return NPrivate::Compile(regex, flags, &platformInfo);
+ }
+
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags, TCPUFeatures cpuFeatures) {
+ auto platformInfo = NPrivate::MakePlatformInfo(cpuFeatures);
+ return NPrivate::Compile(regex, flags, &platformInfo);
+ }
+
+ TDatabase CompileMulti(
+ const TVector<const char*>& regexs,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ const TVector<const hs_expr_ext_t*>* extendedParameters)
+ {
+ auto platformInfo = NPrivate::MakeCurrentPlatformInfo();
+ return NPrivate::CompileMulti(regexs, flags, ids, &platformInfo, extendedParameters);
+ }
+
+ TDatabase CompileMulti(
+ const TVector<const char*>& regexs,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ TCPUFeatures cpuFeatures,
+ const TVector<const hs_expr_ext_t*>* extendedParameters)
+ {
+ auto platformInfo = NPrivate::MakePlatformInfo(cpuFeatures);
+ return NPrivate::CompileMulti(regexs, flags, ids, &platformInfo, extendedParameters);
+ }
+
+ TScratch MakeScratch(const TDatabase& db) {
+ hs_scratch_t* rawScratch = nullptr;
+ hs_error_t status = Singleton<NPrivate::TImpl>()->AllocScratch(db.Get(), &rawScratch);
+ NHyperscan::TScratch scratch(rawScratch);
+ if (status != HS_SUCCESS) {
+ ythrow yexception() << "Failed to make scratch for hyperscan database";
+ }
+ return scratch;
+ }
+
+ void GrowScratch(TScratch& scratch, const TDatabase& db) {
+ hs_scratch_t* rawScratch = scratch.Get();
+ hs_error_t status = Singleton<NPrivate::TImpl>()->AllocScratch(db.Get(), &rawScratch);
+ if (rawScratch != scratch.Get()) {
+ Y_UNUSED(scratch.Release()); // freed by hs_alloc_scratch
+ scratch.Reset(rawScratch);
+ }
+ if (status != HS_SUCCESS) {
+ ythrow yexception() << "Failed to make grow scratch for hyperscan database";
+ }
+ }
+
+ TScratch CloneScratch(const TScratch& scratch) {
+ hs_scratch_t* rawScratch = nullptr;
+ hs_error_t status = hs_clone_scratch(scratch.Get(), &rawScratch);
+ TScratch scratchCopy(rawScratch);
+ if (status != HS_SUCCESS) {
+ ythrow yexception() << "Failed to clone scratch for hyperscan database";
+ }
+ return scratchCopy;
+ }
+
+ bool Matches(
+ const TDatabase& db,
+ const TScratch& scratch,
+ const TStringBuf& text)
+ {
+ return NPrivate::Matches(db, scratch, text, *Singleton<NPrivate::TImpl>());
+ }
+
+ TString Serialize(const TDatabase& db) {
+ char* databaseBytes = nullptr;
+ size_t databaseLength;
+ hs_error_t status = Singleton<NPrivate::TImpl>()->SerializeDatabase(
+ db.Get(),
+ &databaseBytes,
+ &databaseLength);
+ TSerializedDatabase serialization(databaseBytes);
+ if (status != HS_SUCCESS) {
+ ythrow yexception() << "Failed to serialize hyperscan database";
+ }
+ return TString(serialization.Get(), databaseLength);
+ }
+
+ TDatabase Deserialize(const TStringBuf& serialization) {
+ hs_database_t* rawDb = nullptr;
+ hs_error_t status = Singleton<NPrivate::TImpl>()->DeserializeDatabase(
+ serialization.begin(),
+ serialization.size(),
+ &rawDb);
+ TDatabase db(rawDb);
+ if (status != HS_SUCCESS) {
+ if (status == HS_DB_PLATFORM_ERROR) {
+ ythrow yexception() << "Serialized Hyperscan database is incompatible with current CPU";
+ } else {
+ ythrow yexception() << "Failed to deserialize hyperscan database";
+ }
+ }
+ return db;
+ }
+}
diff --git a/library/cpp/regex/hyperscan/hyperscan.h b/library/cpp/regex/hyperscan/hyperscan.h
new file mode 100644
index 0000000000..1c8f404389
--- /dev/null
+++ b/library/cpp/regex/hyperscan/hyperscan.h
@@ -0,0 +1,160 @@
+#pragma once
+
+#include <contrib/libs/hyperscan/src/hs.h>
+
+#include <util/generic/ptr.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+#include <util/system/cpu_id.h>
+
+namespace NHyperscan {
+ using TCPUFeatures = decltype(hs_platform_info_t::cpu_features);
+ constexpr TCPUFeatures CPU_FEATURES_AVX2 = HS_CPU_FEATURES_AVX2;
+ constexpr TCPUFeatures CPU_FEATURES_AVX512 = HS_CPU_FEATURES_AVX512 | HS_CPU_FEATURES_AVX2;
+
+ template<typename TNativeDeleter, TNativeDeleter NativeDeleter>
+ class TDeleter {
+ public:
+ template<typename T>
+ static void Destroy(T* ptr) {
+ NativeDeleter(ptr);
+ }
+ };
+
+ using TDatabase = THolder<hs_database_t, TDeleter<decltype(&hs_free_database), &hs_free_database>>;
+
+ using TScratch = THolder<hs_scratch_t, TDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
+
+ class TCompileException : public yexception {
+ };
+
+
+ namespace NPrivate {
+ enum class ERuntime {
+ Core2 = 0,
+ Corei7 = 1,
+ AVX2 = 2,
+ AVX512 = 3
+ };
+
+ ERuntime DetectCurrentRuntime();
+
+ TCPUFeatures RuntimeCpuFeatures(ERuntime runtime);
+
+ hs_platform_info_t MakePlatformInfo(TCPUFeatures cpuFeatures);
+
+ struct TImpl {
+ hs_error_t (*AllocScratch)(const hs_database_t* db, hs_scratch_t** scratch);
+
+ hs_error_t (*Scan)(const hs_database_t* db, const char* data,
+ unsigned length, unsigned flags, hs_scratch_t* scratch,
+ match_event_handler onEvent, void* userCtx);
+
+ hs_error_t (*SerializeDatabase)(const hs_database_t* db, char** bytes, size_t* serialized_length);
+
+ hs_error_t (*DeserializeDatabase)(const char* bytes, size_t length, hs_database_t** info);
+
+ TImpl() : TImpl(DetectCurrentRuntime()) {}
+
+ explicit TImpl(ERuntime runtime);
+ };
+
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags, hs_platform_info_t* platform);
+
+ TDatabase CompileMulti(
+ const TVector<const char*>& regexs,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ hs_platform_info_t* platform,
+ const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr);
+
+ // We need to parametrize Scan and Matches functions for testing purposes
+ template<typename TCallback>
+ void Scan(
+ const TDatabase& db,
+ const TScratch& scratch,
+ const TStringBuf& text,
+ TCallback& callback, // applied to index of matched regex
+ const TImpl& impl
+ ) {
+ struct TCallbackWrapper {
+ static int EventHandler(
+ unsigned int id,
+ unsigned long long from,
+ unsigned long long to,
+ unsigned int flags,
+ void* ctx) {
+ Y_UNUSED(flags);
+ TCallback& callback2 = *reinterpret_cast<TCallback*>(ctx);
+ if constexpr (std::is_same_v<int, std::invoke_result_t<TCallback, unsigned int, unsigned long long, unsigned long long>>) {
+ return callback2(id, from, to);
+ } else {
+ callback2(id, from, to);
+ return 0;
+ }
+ }
+ };
+ unsigned int flags = 0; // unused at present
+ hs_error_t status = impl.Scan(
+ db.Get(),
+ text.begin(),
+ text.size(),
+ flags,
+ scratch.Get(),
+ &TCallbackWrapper::EventHandler,
+ &callback);
+ if (status != HS_SUCCESS && status != HS_SCAN_TERMINATED) {
+ ythrow yexception() << "Failed to scan against text: " << text;
+ }
+ }
+
+ bool Matches(
+ const TDatabase& db,
+ const TScratch& scratch,
+ const TStringBuf& text,
+ const TImpl& impl);
+ }
+
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags);
+
+ TDatabase Compile(const TStringBuf& regex, unsigned int flags, TCPUFeatures cpuFeatures);
+
+ TDatabase CompileMulti(
+ const TVector<const char*>& regexs,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr);
+
+ TDatabase CompileMulti(
+ const TVector<const char*>& regexs,
+ const TVector<unsigned int>& flags,
+ const TVector<unsigned int>& ids,
+ TCPUFeatures cpuFeatures,
+ const TVector<const hs_expr_ext_t*>* extendedParameters = nullptr);
+
+ TScratch MakeScratch(const TDatabase& db);
+
+ void GrowScratch(TScratch& scratch, const TDatabase& db);
+
+ TScratch CloneScratch(const TScratch& scratch);
+
+ template<typename TCallback>
+ void Scan(
+ const TDatabase& db,
+ const TScratch& scratch,
+ const TStringBuf& text,
+ TCallback& callback // applied to index of matched regex
+ ) {
+ NPrivate::Scan<TCallback>(db, scratch, text, callback, *Singleton<NPrivate::TImpl>());
+ }
+
+ bool Matches(
+ const TDatabase& db,
+ const TScratch& scratch,
+ const TStringBuf& text);
+
+ TString Serialize(const TDatabase& db);
+
+ TDatabase Deserialize(const TStringBuf& serialization);
+}
diff --git a/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
new file mode 100644
index 0000000000..9caa53f2e7
--- /dev/null
+++ b/library/cpp/regex/hyperscan/ut/hyperscan_ut.cpp
@@ -0,0 +1,231 @@
+#include <library/cpp/regex/hyperscan/hyperscan.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/set.h>
+
+#include <array>
+#include <algorithm>
+
+Y_UNIT_TEST_SUITE(HyperscanWrappers) {
+ using namespace NHyperscan;
+ using namespace NHyperscan::NPrivate;
+
+ Y_UNIT_TEST(CompileAndScan) {
+ TDatabase db = Compile("a.c", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+ TScratch scratch = MakeScratch(db);
+
+ unsigned int foundId = 42;
+ auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+ foundId = id;
+ };
+ NHyperscan::Scan(
+ db,
+ scratch,
+ "abc",
+ callback);
+ UNIT_ASSERT_EQUAL(foundId, 0);
+ }
+
+ Y_UNIT_TEST(Matches) {
+ NHyperscan::TDatabase db = NHyperscan::Compile(
+ "a.c",
+ HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+ NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
+ UNIT_ASSERT(NHyperscan::Matches(db, scratch, "abc"));
+ UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "foo"));
+ }
+
+ Y_UNIT_TEST(Multi) {
+ NHyperscan::TDatabase db = NHyperscan::CompileMulti(
+ {
+ "foo",
+ "bar",
+ },
+ {
+ HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
+ HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
+ },
+ {
+ 42,
+ 241,
+ });
+ NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
+
+ UNIT_ASSERT(NHyperscan::Matches(db, scratch, "foo"));
+ UNIT_ASSERT(NHyperscan::Matches(db, scratch, "bar"));
+ UNIT_ASSERT(NHyperscan::Matches(db, scratch, "BAR"));
+ UNIT_ASSERT(!NHyperscan::Matches(db, scratch, "FOO"));
+
+ TSet<unsigned int> foundIds;
+ auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+ foundIds.insert(id);
+ };
+ NHyperscan::Scan(
+ db,
+ scratch,
+ "fooBaR",
+ callback);
+ UNIT_ASSERT_EQUAL(foundIds.size(), 2);
+ UNIT_ASSERT(foundIds.contains(42));
+ UNIT_ASSERT(foundIds.contains(241));
+ }
+
+ // https://ml.yandex-team.ru/thread/2370000002965712422/
+ Y_UNIT_TEST(MultiRegression) {
+ NHyperscan::CompileMulti(
+ {
+ "aa.bb/cc.dd",
+ },
+ {
+ HS_FLAG_UTF8,
+ },
+ {
+ 0,
+ });
+ }
+
+ Y_UNIT_TEST(Serialize) {
+ NHyperscan::TDatabase db = NHyperscan::Compile(
+ "foo",
+ HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+ TString serialization = Serialize(db);
+ db.Reset();
+ TDatabase db2 = Deserialize(serialization);
+ NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db2);
+
+ UNIT_ASSERT(NHyperscan::Matches(db2, scratch, "foo"));
+ UNIT_ASSERT(!NHyperscan::Matches(db2, scratch, "FOO"));
+ }
+
+ Y_UNIT_TEST(GrowScratch) {
+ NHyperscan::TDatabase db1 = NHyperscan::Compile(
+ "foo",
+ HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+ NHyperscan::TDatabase db2 = NHyperscan::Compile(
+ "longer\\w\\w\\wpattern",
+ HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_UTF8);
+ NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db1);
+ NHyperscan::GrowScratch(scratch, db2);
+ UNIT_ASSERT(NHyperscan::Matches(db1, scratch, "foo"));
+ UNIT_ASSERT(NHyperscan::Matches(db2, scratch, "longerWWWpattern"));
+ }
+
+ Y_UNIT_TEST(CloneScratch) {
+ NHyperscan::TDatabase db = NHyperscan::Compile(
+ "foo",
+ HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH);
+ NHyperscan::TScratch scratch1 = NHyperscan::MakeScratch(db);
+ NHyperscan::TScratch scratch2 = NHyperscan::CloneScratch(scratch1);
+ scratch1.Reset();
+ UNIT_ASSERT(NHyperscan::Matches(db, scratch2, "foo"));
+ }
+
+ class TSimpleSingleRegex {
+ public:
+ static TDatabase Compile(TCPUFeatures cpuFeatures) {
+ return NHyperscan::Compile("foo", HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH, cpuFeatures);
+ }
+ static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
+ NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
+ UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "foo", impl));
+ UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratch, "FOO", impl));
+ }
+ };
+
+ // This regex uses AVX2 instructions on long (>70) texts.
+ // It crushes when compiled for machine with AVX2 and run on machine without it.
+ class TAvx2SingleRegex {
+ public:
+ static TDatabase Compile(TCPUFeatures cpuFeatures) {
+ auto regex = "[ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё]+"
+ "[.][\\-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz]{2,5}";
+ unsigned int flags = HS_FLAG_UTF8 | HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY;
+ return NHyperscan::Compile(regex, flags, cpuFeatures);
+ }
+ static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
+ NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
+ UNIT_ASSERT(NHyperscan::NPrivate::Matches(
+ db,
+ scratch,
+ "_________________________________________________________________"
+ "фу.bar"
+ "_________________________________________________________________",
+ impl));
+ UNIT_ASSERT(!NHyperscan::NPrivate::Matches(
+ db,
+ scratch,
+ "_________________________________________________________________"
+ "фу"
+ "_________________________________________________________________",
+ impl));
+ }
+ };
+
+ class TSimpleMultiRegex {
+ public:
+ static TDatabase Compile(TCPUFeatures cpuFeatures) {
+ return NHyperscan::CompileMulti(
+ {
+ "foo",
+ "bar",
+ },
+ {
+ HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH,
+ HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS,
+ },
+ {
+ 42,
+ 241,
+ },
+ cpuFeatures);
+ }
+ static void Check(const TDatabase& db, const NHyperscan::NPrivate::TImpl& impl) {
+ NHyperscan::TScratch scratch = NHyperscan::MakeScratch(db);
+
+ UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "foo", impl));
+ UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "bar", impl));
+ UNIT_ASSERT(NHyperscan::NPrivate::Matches(db, scratch, "BAR", impl));
+ UNIT_ASSERT(!NHyperscan::NPrivate::Matches(db, scratch, "FOO", impl));
+
+ TSet<unsigned int> foundIds;
+ auto callback = [&](unsigned int id, unsigned long long /* from */, unsigned long long /* to */) {
+ foundIds.insert(id);
+ };
+ NHyperscan::NPrivate::Scan(
+ db,
+ scratch,
+ "fooBaR",
+ callback,
+ impl);
+ UNIT_ASSERT_EQUAL(foundIds.size(), 2);
+ UNIT_ASSERT(foundIds.contains(42));
+ UNIT_ASSERT(foundIds.contains(241));
+ }
+ };
+
+ template <class Regex>
+ void TestCrossPlatformCompile() {
+ const std::array<ERuntime, 4> runtimes = {
+ ERuntime::Core2,
+ ERuntime::Corei7,
+ ERuntime::AVX2,
+ ERuntime::AVX512
+ };
+
+ // Unfortunately, we cannot emulate runtimes with more capabilities than current machine.
+ auto currentRuntimeIter = std::find(runtimes.cbegin(), runtimes.cend(), DetectCurrentRuntime());
+ Y_ASSERT(currentRuntimeIter != runtimes.cend());
+
+ for (auto targetRuntime = runtimes.cbegin(); targetRuntime <= currentRuntimeIter; ++targetRuntime) {
+ auto db = Regex::Compile(RuntimeCpuFeatures(*targetRuntime));
+ Regex::Check(db, NHyperscan::NPrivate::TImpl{*targetRuntime});
+ }
+ }
+
+ Y_UNIT_TEST(CrossPlatformCompile) {
+ TestCrossPlatformCompile<TSimpleSingleRegex>();
+ TestCrossPlatformCompile<TAvx2SingleRegex>();
+ TestCrossPlatformCompile<TSimpleMultiRegex>();
+ }
+}
diff --git a/library/cpp/regex/hyperscan/ut/ya.make b/library/cpp/regex/hyperscan/ut/ya.make
new file mode 100644
index 0000000000..da67b88672
--- /dev/null
+++ b/library/cpp/regex/hyperscan/ut/ya.make
@@ -0,0 +1,13 @@
+UNITTEST()
+
+PEERDIR(
+ library/cpp/regex/hyperscan
+)
+
+OWNER(g:antiinfra)
+
+SRCS(
+ hyperscan_ut.cpp
+)
+
+END()
diff --git a/library/cpp/regex/hyperscan/ya.make b/library/cpp/regex/hyperscan/ya.make
new file mode 100644
index 0000000000..e99130ae18
--- /dev/null
+++ b/library/cpp/regex/hyperscan/ya.make
@@ -0,0 +1,19 @@
+LIBRARY()
+
+OWNER(g:antiinfra)
+
+PEERDIR(
+ contrib/libs/hyperscan
+ contrib/libs/hyperscan/runtime_core2
+ contrib/libs/hyperscan/runtime_corei7
+ contrib/libs/hyperscan/runtime_avx2
+ contrib/libs/hyperscan/runtime_avx512
+)
+
+SRCS(
+ hyperscan.cpp
+)
+
+END()
+
+RECURSE_FOR_TESTS(ut)
diff --git a/library/cpp/regex/pcre/README.md b/library/cpp/regex/pcre/README.md
new file mode 100644
index 0000000000..b5b09a3715
--- /dev/null
+++ b/library/cpp/regex/pcre/README.md
@@ -0,0 +1,59 @@
+# About
+This is a PCRE library wrapper which provides unified interface for UTF-8, UTF-16 and UTF-32 strings matching and optimization control.
+
+# Rationale
+Many Arcadia related libraries (telfinder, lemmer etc.) provides only UTF-16 interfaces, because this is way faster for cyrillic texts. Any algorithm that is working with such libraries and regular expressions must use `WideToUTF8` and `UTF8ToWide` at the borderline between regular expression and UTF-18 interface. This leads us to great performance penalty.
+This library allows us to erase these charset conversions.
+
+# Interface
+
+Before starting with interface details, let's consider simplest library usage example:
+`UNIT_ASSERT(NPcre::TPcre<wchar16>(u"ba+d").Matches(TWtringBuf(u"baaad")));`
+
+Here we see regular expression construction for UTF-16 charset:
+
+`NPcre::TPcre<wchar16>(u"ba+d")`
+
+and matching of the subject string `baaad` against this pattern:
+
+`.Matches(TWtringBuf(u"baaad"))`;
+
+Let's consider both of them in details.
+
+## Construction
+`NPcre::TPcre` class accepts single template parameter: `TCharType`. Currently supported char types are `char`, `wchar16` and `wchar32`. Additional char types traits can be defined in `traits.h`
+
+Constructor accepts three arguments. Two of them are optional:
+1. Zero-terminated string on characters with pattern
+2. Optimization type. The default value is `NPcre::EOptimize::None` which means no pattern optimization. Another possible value is `NPcre::EOptimize::Study` which will take some time at construction stage but could give up to 4x speed boost. And the last but not the least is `NPcre::EOptimize::JIT` which performs JIT optimization which could take significant time but could give up to 10x speed boost.
+3. Regular expressions compile flags. We don't want to reimplement every constant from PCRE library, so they are passed as they are. Full list of compile flags can be found [here](https://www.pcre.org/original/doc/html/pcre_compile2.html), but for most cases `PCRE_UTF8 | PCRE_UCP` will be enough. The default value is `0`.
+
+## Matching
+{% note tip %}
+Two words on PCRE workspaces. Workspace is memory area where PCRE stores information about back references and capturing groups. If passed workspace size is not enough, PCRE will allocate bigger workspace in heap. For simple matching and string searching of string without back references, workspace is not required and this library provides separate functions that won't waste space on workspace and this could save ≈0.5% of CPU TIME on simple patterns.
+For regular expressions with capturing groups, recommended workspace size is `(capturing groups count + 1)`.
+{% endnote %}
+
+In the example above matching function `Matches` returns boolean indicating that subject string matched pattern and accepts two arguments:
+1. `TBasicStringBuf<TCharType>` with subject string
+2. Regular expression execute flags. We don't want to reimplement every constant from PCRE library, so they are passed as they are. Full list of compile flags can be found [here](https://www.pcre.org/original/doc/html/pcre_exec.html). For most cases `0` will be just fine and this is the default value.
+
+## Searching
+Function `Find` accepts the same arguments as `Match` and returns `TMaybe<NPcre::TPcreMatch>` which contains pair of ints with start and end offsets of string found. Check result for `Defined` to ensure that pattern was found in subject string.
+
+## Capturing
+The last member function of `NPcre::TPcre` is `Capture` which searches for pattern and returns capturing group.
+
+### Return value
+Return value is `NPcre::TPcreMatches` which is alias for `TVector<NPcre::TPcreMatch>`.
+Vector will be empty if pattern wasn't found in subject string.
+If pattern was found, first element will contain start and end offsets of string found.
+All other elements will contains start and end offsets of capturing groups in order they appeared in regular expression.
+{% note tip %}
+If some capturing group not matched subject string, but some of consequent capturing groups did, this capturing group will present as `-1, -1` pair.
+For example: calling `Capture` on pattern `(a)(?:(b)c|b(d))` against subject string `zabda` will return `[{1,4},{1,2},{-1,-1},{3,4}]` because capturing group `(b)` wasn't matched.
+{% endnote %}
+### Arguments
+1. `TBasicStringBuf<TCharType>` with subject string
+2. Regular expression execute flags.
+3. Initial workspace size. Default value is `16` but if pattern contains more than 16 capturing groups, this function will reallocate workspace with bigger size.
diff --git a/library/cpp/regex/pcre/benchmark/main.cpp b/library/cpp/regex/pcre/benchmark/main.cpp
new file mode 100644
index 0000000000..3c11ef4f29
--- /dev/null
+++ b/library/cpp/regex/pcre/benchmark/main.cpp
@@ -0,0 +1,80 @@
+#include <benchmark/benchmark.h>
+
+#include <library/cpp/regex/pcre/pcre.h>
+
+#include <util/charset/wide.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+
+static TStringBuf SimplePattern = "[-.\\w]+@(?:[a-z\\d]{2,}\\.)+[a-z]{2,6}";
+static TStringBuf ComplexPattern = R"((?:(?:\r\n)?[ \t])*(?:(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*)|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*:(?:(?:\r\n)?[ \t])*(?:(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*)(?:,\s*(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*))*)?;\s*))";
+
+static constexpr size_t HaystacksCount = 32;
+static constexpr size_t MinPrefix = 1024;
+
+static TVector<TString> GenerateHaystacks() {
+ // Generate long randomized haystacks to prevent cache hit
+ TVector<TString> result(Reserve(HaystacksCount));
+ for (size_t i = 0; i < HaystacksCount; ++i) {
+ result.push_back(TString::Join(ComplexPattern.SubString(MinPrefix + i, ComplexPattern.Size() - MinPrefix - i), ComplexPattern.SubString(0, MinPrefix + i)));
+ }
+ return result;
+}
+
+static const TVector<TString> Haystacks{GenerateHaystacks()};
+
+static const NPcre::TPcre<char> Simple{SimplePattern.Data()};
+static const NPcre::TPcre<char> SimpleStudy{SimplePattern.Data(), NPcre::EOptimize::Study};
+static const NPcre::TPcre<char> SimpleJIT{SimplePattern.Data(), NPcre::EOptimize::JIT};
+static const NPcre::TPcre<char> Complex{ComplexPattern.Data()};
+static const NPcre::TPcre<char> ComplexStudy{ComplexPattern.Data(), NPcre::EOptimize::Study};
+static const NPcre::TPcre<char> ComplexJIT{ComplexPattern.Data(), NPcre::EOptimize::JIT};
+
+static void Benchmark(benchmark::State& state, const NPcre::TPcre<char>& pattern) {
+ for (auto _ : state) {
+ for (size_t i = 0; i < HaystacksCount; ++i) {
+ // Force string reallocation, so there will be no chance for cache hit of any type
+ benchmark::DoNotOptimize(pattern.Matches(TString{i, 'a'} + Haystacks[i]));
+ }
+ }
+}
+
+static void BenchmarkSimplePatternJIT(benchmark::State& state) {
+ Benchmark(state, SimpleJIT);
+}
+
+static void BenchmarkSimplePatternStudy(benchmark::State& state) {
+ Benchmark(state, SimpleStudy);
+}
+
+static void BenchmarkSimplePattern(benchmark::State& state) {
+ Benchmark(state, Simple);
+}
+
+BENCHMARK(BenchmarkSimplePatternJIT)->Iterations(1);
+BENCHMARK(BenchmarkSimplePatternStudy)->Iterations(1);
+BENCHMARK(BenchmarkSimplePattern)->Iterations(1);
+BENCHMARK(BenchmarkSimplePatternJIT);
+BENCHMARK(BenchmarkSimplePatternStudy);
+BENCHMARK(BenchmarkSimplePattern);
+
+static void BenchmarkComplexPatternJIT(benchmark::State& state) {
+ Benchmark(state, ComplexJIT);
+}
+
+static void BenchmarkComplexPatternStudy(benchmark::State& state) {
+ Benchmark(state, ComplexStudy);
+}
+
+static void BenchmarkComplexPattern(benchmark::State& state) {
+ Benchmark(state, Complex);
+}
+
+BENCHMARK(BenchmarkComplexPatternJIT)->Iterations(1);
+BENCHMARK(BenchmarkComplexPatternStudy)->Iterations(1);
+BENCHMARK(BenchmarkComplexPattern)->Iterations(1);
+BENCHMARK(BenchmarkComplexPatternJIT);
+BENCHMARK(BenchmarkComplexPatternStudy);
+BENCHMARK(BenchmarkComplexPattern);
+
diff --git a/library/cpp/regex/pcre/benchmark/ya.make b/library/cpp/regex/pcre/benchmark/ya.make
new file mode 100644
index 0000000000..7c30fae0a6
--- /dev/null
+++ b/library/cpp/regex/pcre/benchmark/ya.make
@@ -0,0 +1,14 @@
+G_BENCHMARK()
+
+OWNER(g:so)
+
+PEERDIR(
+ library/cpp/regex/pcre
+)
+
+SRCS(
+ main.cpp
+)
+
+END()
+
diff --git a/library/cpp/regex/pcre/pcre.cpp b/library/cpp/regex/pcre/pcre.cpp
new file mode 100644
index 0000000000..9e97d5f8f7
--- /dev/null
+++ b/library/cpp/regex/pcre/pcre.cpp
@@ -0,0 +1 @@
+#include "pcre.h"
diff --git a/library/cpp/regex/pcre/pcre.h b/library/cpp/regex/pcre/pcre.h
new file mode 100644
index 0000000000..82a9774f00
--- /dev/null
+++ b/library/cpp/regex/pcre/pcre.h
@@ -0,0 +1,191 @@
+#pragma once
+
+#include "traits.h"
+
+#include <library/cpp/containers/stack_array/stack_array.h>
+
+#include <util/generic/maybe.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+
+namespace NPcre {
+ //! Start and end offset for match group.
+ using TPcreMatch = std::pair<int, int>;
+
+ //! Full match result containing all capturing groups.
+ /*!
+ * At zero index we have whole matched string start and end offsets.
+ * All other elements will contain capturing groups positions.
+ * Non-captured capturing groups will have {-1, -1} offsets.
+ */
+ using TPcreMatches = TVector<TPcreMatch>;
+
+ //! Compiled pattern optimization strategy.
+ enum class EOptimize {
+ //! No optimization.
+ /*!
+ * Useful for non-reusable patterns where compile time matters.
+ */
+ None,
+ //! Basic optimization via |pcre_study|.
+ /*!
+ * Could give up to 4x match speed boost in exchange of increased
+ * construction time. Could not.
+ */
+ Study,
+ //! PCRE JIT optimization.
+ /*!
+ * Could give up to 10x match speed bust in exchange of significantly
+ * increased compile time. Also, for very complex patterns |pcre_exec|
+ * could return |PCRE_ERROR_JIT_STACKLIMIT|. See
+ * https://www.pcre.org/original/doc/html/pcrejit.html for details.
+ */
+ JIT
+ };
+
+ //! PCRE code container. Controls its life time and provides handy wrapper.
+ template <class TCharType>
+ class TPcre {
+ private:
+ using TCodeType = typename TPcreTraits<TCharType>::TCodeType;
+ using TExtraType = typename TPcreTraits<TCharType>::TExtraType;
+ using TStringType = typename TPcreTraits<TCharType>::TStringType;
+ using TTraits = TPcreTraits<TCharType>;
+ static constexpr size_t DefaultWorkspaceSize = 16;
+
+ public:
+ //! Compiles regexp into internal representation for future use.
+ /*!
+ * \param pattern Regular expression to be compiled.
+ * \param optimize If |EOptimize::JIT|, perform additional
+ * analysis, which will take extra time, but could
+ * speed up matching. |None| to omit optimization.
+ * \param compileFlags See https://www.pcre.org/original/doc/html/pcre_compile2.html
+ **/
+ TPcre(const TCharType* pattern, EOptimize optimize = EOptimize::None, int compileFlags = 0) {
+ int errcode;
+ const char* errptr;
+ int erroffset;
+ Code.Reset(TTraits::Compile((TStringType) pattern, compileFlags, &errcode, &errptr, &erroffset, nullptr));
+ if (!Code) {
+ ythrow yexception() << "Failed to compile pattern <" << pattern
+ << ">, because of error at pos " << erroffset
+ << ", error code " << errcode << ": " << errptr;
+ }
+ if (optimize != EOptimize::None) {
+ errptr = nullptr;
+ int options;
+ if (optimize == EOptimize::Study) {
+ options = 0;
+ } else {
+ options = PCRE_STUDY_JIT_COMPILE;
+ }
+ Extra.Reset(TTraits::Study(Code.Get(), options, &errptr));
+ if (errptr) {
+ ythrow yexception() << "Failed to study pattern <" << pattern << ">: " << errptr;
+ }
+ }
+ }
+
+ //! Check if compiled pattern matches string.
+ /*!
+ * \param string String to search in.
+ * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
+ * \param workspaceSize Amount of space which will be allocated for
+ * back references. PCRE could allocate more
+ * heap space is provided workspaceSize won't
+ * fit all of them.
+ * \returns |true| if there is a match.
+ */
+ bool Matches(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const {
+ Y_ASSERT(workspaceSize >= 0);
+ size_t ovecsize = workspaceSize * 3;
+ NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
+ return ConvertReturnCode(TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize));
+ }
+
+ //! Find compiled pattern in string.
+ /*!
+ * \param string String to search in.
+ * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
+ * \param workspaceSize Amount of space which will be allocated for
+ * back references. PCRE could allocate more
+ * heap space is provided workspaceSize won't
+ * fit all of them.
+ * \returns Start and end offsets pair if there is a
+ * match. |Nothing| otherwise.
+ */
+ Y_NO_SANITIZE("memory") TMaybe<TPcreMatch> Find(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const {
+ Y_ASSERT(workspaceSize >= 0);
+ size_t ovecsize = workspaceSize * 3;
+ NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
+ for (size_t i = 0; i < ovecsize; ++i) {
+ ovector[i] = -4;
+ }
+ int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize);
+ if (ConvertReturnCode(rc)) {
+ return MakeMaybe<TPcreMatch>(ovector[0], ovector[1]);
+ } else {
+ return Nothing();
+ }
+ }
+
+ //! Find and return all capturing groups in string.
+ /*!
+ * \param string String to search in.
+ * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
+ * \param initialWorkspaceSize Capturing groups vector initial size.
+ * Workspace will be grown and search will
+ * be repeated if there is not enough
+ * space.
+ * \returns List of capturing groups start and end
+ * offsets. First element will contain
+ * whole matched substring start and end
+ * offsets. For non-matched capturing
+ * groups, result will contain {-1, -1}
+ * pair.
+ * If pattern not found in string, result
+ * vector will be empty.
+ */
+ Y_NO_SANITIZE("memory") TPcreMatches Capture(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t initialWorkspaceSize = DefaultWorkspaceSize) const {
+ Y_ASSERT(initialWorkspaceSize > 0);
+ size_t ovecsize = (initialWorkspaceSize + 1) * 3;
+ while (true) {
+ NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
+ int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize);
+ if (rc > 0) {
+ TPcreMatches result(Reserve(rc >> 1));
+ for (int i = 0, pos = 0; i < rc; ++i) {
+ int start = ovector[pos++];
+ int end = ovector[pos++];
+ result.emplace_back(start, end);
+ }
+ return result;
+ } else if (rc == 0) {
+ ovecsize <<= 1;
+ } else if (rc == PCRE_ERROR_NOMATCH) {
+ return TPcreMatches{};
+ } else if (rc < 0) {
+ ythrow yexception() << "Error. RC = " << rc;
+ }
+ }
+ }
+
+ private:
+ TPcreCode<TCharType> Code;
+ TPcreExtra<TCharType> Extra;
+
+ private:
+ static inline bool ConvertReturnCode(int rc) {
+ if (rc >= 0) {
+ return true;
+ } else if (rc == PCRE_ERROR_NOMATCH) {
+ return false;
+ } else {
+ ythrow yexception() << "Error. RC = " << rc;
+ }
+ }
+ };
+}
+
diff --git a/library/cpp/regex/pcre/pcre_ut.cpp b/library/cpp/regex/pcre/pcre_ut.cpp
new file mode 100644
index 0000000000..84d06499ae
--- /dev/null
+++ b/library/cpp/regex/pcre/pcre_ut.cpp
@@ -0,0 +1,89 @@
+#include <library/cpp/regex/pcre/pcre.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+template <class T>
+inline IOutputStream& operator<<(IOutputStream& out, const TVector<T>& value) {
+ size_t size = value.size();
+ out << "[";
+ for (size_t i = 0; i < size; ++i) {
+ if (i) {
+ out << ",";
+ }
+ out << value[i];
+ }
+ out << "]";
+ return out;
+}
+
+template <class T, class U>
+inline IOutputStream& operator<<(IOutputStream& out, const std::pair<T, U>& value) {
+ out << "{" << value.first << "," << value.second << "}";
+ return out;
+}
+
+// char8_t
+#define OPTIMIZE NPcre::EOptimize::None
+#define TEST_NAME(S) S
+#define STRING(S) S
+#define CHAR_TYPE char
+#include "pcre_ut_base.h"
+
+#undef OPTIMIZE
+#define OPTIMIZE NPcre::EOptimize::Study
+#undef TEST_NAME
+#define TEST_NAME(S) S ## Study
+#include "pcre_ut_base.h"
+
+#undef OPTIMIZE
+#define OPTIMIZE NPcre::EOptimize::JIT
+#undef TEST_NAME
+#define TEST_NAME(S) S ## JIT
+#include "pcre_ut_base.h"
+
+// char16_t
+#undef OPTIMIZE
+#define OPTIMIZE NPcre::EOptimize::None
+#undef TEST_NAME
+#define TEST_NAME(S) S ## 16
+#undef STRING
+#define STRING(S) u ## S
+#undef CHAR_TYPE
+#define CHAR_TYPE wchar16
+#include "pcre_ut_base.h"
+
+#undef OPTIMIZE
+#define OPTIMIZE NPcre::EOptimize::Study
+#undef TEST_NAME
+#define TEST_NAME(S) S ## Study16
+#include "pcre_ut_base.h"
+
+#undef OPTIMIZE
+#define OPTIMIZE NPcre::EOptimize::JIT
+#undef TEST_NAME
+#define TEST_NAME(S) S ## JIT16
+#include "pcre_ut_base.h"
+
+// char32_t
+#undef OPTIMIZE
+#define OPTIMIZE NPcre::EOptimize::None
+#undef TEST_NAME
+#define TEST_NAME(S) S ## 32
+#undef STRING
+#define STRING(S) U ## S
+#undef CHAR_TYPE
+#define CHAR_TYPE wchar32
+#include "pcre_ut_base.h"
+
+#undef OPTIMIZE
+#define OPTIMIZE NPcre::EOptimize::Study
+#undef TEST_NAME
+#define TEST_NAME(S) S ## Study32
+#include "pcre_ut_base.h"
+
+#undef OPTIMIZE
+#define OPTIMIZE NPcre::EOptimize::JIT
+#undef TEST_NAME
+#define TEST_NAME(S) S ## JIT32
+#include "pcre_ut_base.h"
+
diff --git a/library/cpp/regex/pcre/pcre_ut_base.h b/library/cpp/regex/pcre/pcre_ut_base.h
new file mode 100644
index 0000000000..1d61d07b14
--- /dev/null
+++ b/library/cpp/regex/pcre/pcre_ut_base.h
@@ -0,0 +1,38 @@
+#define CHECK_MATCHES(EXPECTED, PATTERN, STR) \
+ UNIT_ASSERT(EXPECTED == NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Matches(STRING(STR))); \
+ UNIT_ASSERT(EXPECTED == NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Matches(STRING(STR), 0, 10));
+
+#define CHECK(A, B) UNIT_ASSERT_STRINGS_EQUAL(ToString(STRING(A)), ToString(B))
+
+#define CHECK_GROUPS(EXPECTED, PATTERN, STR) \
+ CHECK(EXPECTED, NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Find(STRING(STR))); \
+ CHECK(EXPECTED, NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Find(STRING(STR), 0, 10));
+
+Y_UNIT_TEST_SUITE(TEST_NAME(TestRegExp)) {
+ Y_UNIT_TEST(TestMatches) {
+ CHECK_MATCHES(true, "ю", "bюd");
+ CHECK_MATCHES(false, "c", "bюd");
+ CHECK_MATCHES(true, "(ю)(?:(b)c|bd)", "zюbda");
+ CHECK_MATCHES(false, "(ю)(?:(b)c|bd)", "bюd");
+ CHECK_MATCHES(true, "(abc|def)=\\g1", "abc=abc");
+ CHECK_MATCHES(true, "(abc|def)=\\g1", "def=def");
+ CHECK_MATCHES(false, "(abc|def)=\\g1", "abc=def");
+ }
+
+ Y_UNIT_TEST(TestGroups) {
+ CHECK_GROUPS("{1,2}", "a", "bad");
+ CHECK_GROUPS("(empty maybe)", "c", "bad");
+ CHECK_GROUPS("{1,4}", "(a)(?:(b)c|bd)", "zabda");
+ CHECK_GROUPS("(empty maybe)", "(a)(?:(b)c|bd)", "bad");
+ CHECK_GROUPS("{1,8}", "(abc|def)=\\g1", "aabc=abca");
+ CHECK_GROUPS("(empty maybe)", "(abc|def)=\\g1", "abc=def");
+ }
+
+ Y_UNIT_TEST(TestCapture) {
+ CHECK("[{1,2}]",NPcre::TPcre<CHAR_TYPE>(STRING("a"), OPTIMIZE).Capture(STRING("bad"), 0, 1));
+ CHECK("[]",NPcre::TPcre<CHAR_TYPE>(STRING("c"), OPTIMIZE).Capture(STRING("bad"), 0, 1));
+ CHECK("[{1,4},{1,2},{-1,-1},{3,4}]",NPcre::TPcre<CHAR_TYPE>(STRING("(a)(?:(b)c|b(d))"), OPTIMIZE).Capture(STRING("zabda"), 0, 1));
+ CHECK("[]",NPcre::TPcre<CHAR_TYPE>(STRING("(a)(?:(b)c|bd)"), OPTIMIZE).Capture(STRING("bad"), 0, 1));
+ }
+}
+
diff --git a/library/cpp/regex/pcre/regexp.cpp b/library/cpp/regex/pcre/regexp.cpp
new file mode 100644
index 0000000000..575c09cee4
--- /dev/null
+++ b/library/cpp/regex/pcre/regexp.cpp
@@ -0,0 +1,317 @@
+#include "regexp.h"
+
+#include <util/generic/string.h>
+#include <util/string/ascii.h>
+#include <util/system/defaults.h>
+
+#include <cstdlib>
+#include <util/generic/noncopyable.h>
+
+class TGlobalImpl : TNonCopyable {
+private:
+ const char* Str;
+ regmatch_t* Pmatch;
+ int Options;
+ int StrLen;
+ int StartOffset, NotEmptyOpts, MatchPos;
+ int MatchBuf[NMATCHES * 3];
+ pcre* PregComp;
+
+ enum StateCode {
+ TGI_EXIT,
+ TGI_CONTINUE,
+ TGI_WALKTHROUGH
+ };
+
+private:
+ void CopyResults(int count) {
+ for (int i = 0; i < count; i++) {
+ Pmatch[MatchPos].rm_so = MatchBuf[2 * i];
+ Pmatch[MatchPos].rm_eo = MatchBuf[2 * i + 1];
+ MatchPos++;
+ if (MatchPos >= NMATCHES) {
+ ythrow yexception() << "TRegExBase::Exec(): Not enough space in internal buffer.";
+ }
+ }
+ }
+
+ int DoPcreExec(int opts) {
+ int rc = pcre_exec(
+ PregComp, /* the compiled pattern */
+ nullptr, /* no extra data - we didn't study the pattern */
+ Str, /* the subject string */
+ StrLen, /* the length of the subject */
+ StartOffset, /* start at offset 0 in the subject */
+ opts, /* default options */
+ MatchBuf, /* output vector for substring information */
+ NMATCHES); /* number of elements in the output vector */
+
+ if (rc == 0) {
+ ythrow yexception() << "TRegExBase::Exec(): Not enough space in internal buffer.";
+ }
+
+ return rc;
+ }
+
+ StateCode CheckEmptyCase() {
+ if (MatchBuf[0] == MatchBuf[1]) { // founded an empty string
+ if (MatchBuf[0] == StrLen) { // at the end
+ return TGI_EXIT;
+ }
+ NotEmptyOpts = PCRE_NOTEMPTY | PCRE_ANCHORED; // trying to find non empty string
+ }
+ return TGI_WALKTHROUGH;
+ }
+
+ StateCode CheckNoMatch(int rc) {
+ if (rc == PCRE_ERROR_NOMATCH) {
+ if (NotEmptyOpts == 0) {
+ return TGI_EXIT;
+ }
+
+ MatchBuf[1] = StartOffset + 1; // we have failed to find non-empty-string. trying to find again shifting "previous match offset"
+ return TGI_CONTINUE;
+ }
+ return TGI_WALKTHROUGH;
+ }
+
+public:
+ TGlobalImpl(const char* st, regmatch_t& pma, int opts, pcre* pc_re)
+ : Str(st)
+ , Pmatch(&pma)
+ , Options(opts)
+ , StartOffset(0)
+ , NotEmptyOpts(0)
+ , MatchPos(0)
+ , PregComp(pc_re)
+ {
+ memset(Pmatch, -1, sizeof(regmatch_t) * NMATCHES);
+ StrLen = strlen(Str);
+ }
+
+ int ExecGlobal() {
+ StartOffset = 0;
+ int rc = DoPcreExec(Options);
+
+ if (rc < 0) {
+ return rc;
+ }
+ CopyResults(rc);
+ do {
+ NotEmptyOpts = 0;
+ StartOffset = MatchBuf[1];
+
+ if (CheckEmptyCase() == TGI_EXIT) {
+ return 0;
+ }
+
+ rc = DoPcreExec(NotEmptyOpts | Options);
+
+ switch (CheckNoMatch(rc)) {
+ case TGI_CONTINUE:
+ continue;
+ case TGI_EXIT:
+ return 0;
+ case TGI_WALKTHROUGH:
+ default:
+ break;
+ }
+
+ if (rc < 0) {
+ return rc;
+ }
+
+ CopyResults(rc);
+ } while (true);
+
+ return 0;
+ }
+
+private:
+};
+
+class TRegExBaseImpl: public TAtomicRefCount<TRegExBaseImpl> {
+ friend class TRegExBase;
+
+protected:
+ int CompileOptions;
+ TString RegExpr;
+ regex_t Preg;
+
+public:
+ TRegExBaseImpl()
+ : CompileOptions(0)
+ {
+ memset(&Preg, 0, sizeof(Preg));
+ }
+
+ TRegExBaseImpl(const TString& re, int cflags)
+ : CompileOptions(cflags)
+ , RegExpr(re)
+ {
+ int rc = regcomp(&Preg, re.data(), cflags);
+ if (rc) {
+ const size_t ERRBUF_SIZE = 100;
+ char errbuf[ERRBUF_SIZE];
+ regerror(rc, &Preg, errbuf, ERRBUF_SIZE);
+ Error = "Error: regular expression " + re + " is wrong: " + errbuf;
+ ythrow yexception() << "RegExp " << re << ": " << Error.data();
+ }
+ }
+
+ int Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches) const {
+ if (!RegExpr) {
+ ythrow yexception() << "Regular expression is not compiled";
+ }
+ if (!str) {
+ ythrow yexception() << "Empty string is passed to TRegExBaseImpl::Exec";
+ }
+ if ((eflags & REGEXP_GLOBAL) == 0) {
+ return regexec(&Preg, str, nmatches, pmatch, eflags);
+ } else {
+ int options = 0;
+ if ((eflags & REG_NOTBOL) != 0)
+ options |= PCRE_NOTBOL;
+ if ((eflags & REG_NOTEOL) != 0)
+ options |= PCRE_NOTEOL;
+
+ return TGlobalImpl(str, pmatch[0], options, (pcre*)Preg.re_pcre).ExecGlobal();
+ }
+ }
+
+ bool IsCompiled() {
+ return Preg.re_pcre;
+ }
+
+ ~TRegExBaseImpl() {
+ regfree(&Preg);
+ }
+
+private:
+ TString Error;
+};
+
+bool TRegExBase::IsCompiled() const {
+ return Impl && Impl->IsCompiled();
+}
+
+TRegExBase::TRegExBase(const char* re, int cflags) {
+ if (re) {
+ Compile(re, cflags);
+ }
+}
+
+TRegExBase::TRegExBase(const TString& re, int cflags) {
+ Compile(re, cflags);
+}
+
+TRegExBase::~TRegExBase() {
+}
+
+void TRegExBase::Compile(const TString& re, int cflags) {
+ Impl = new TRegExBaseImpl(re, cflags);
+}
+
+int TRegExBase::Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches) const {
+ if (!Impl)
+ ythrow yexception() << "!Regular expression is not compiled";
+ return Impl->Exec(str, pmatch, eflags, nmatches);
+}
+
+int TRegExBase::GetCompileOptions() const {
+ if (!Impl)
+ ythrow yexception() << "!Regular expression is not compiled";
+ return Impl->CompileOptions;
+}
+
+TString TRegExBase::GetRegExpr() const {
+ if (!Impl)
+ ythrow yexception() << "!Regular expression is not compiled";
+ return Impl->RegExpr;
+}
+
+TRegExMatch::TRegExMatch(const char* re, int cflags)
+ : TRegExBase(re, cflags)
+{
+}
+
+TRegExMatch::TRegExMatch(const TString& re, int cflags)
+ : TRegExBase(re, cflags)
+{
+}
+
+bool TRegExMatch::Match(const char* str) const {
+ return Exec(str, nullptr, 0, 0) == 0;
+}
+
+TRegExSubst::TRegExSubst(const char* re, int cflags)
+ : TRegExBase(re, cflags)
+ , Replacement(nullptr)
+{
+ memset(Brfs, 0, sizeof(TBackReferences) * NMATCHES);
+}
+
+TString TRegExSubst::Replace(const char* str, int eflags) {
+ TString s;
+ if (BrfsCount) {
+ if (Exec(str, PMatch, eflags) == 0) {
+ int i;
+ for (i = 0; i < BrfsCount; i++) {
+ s += TString(Replacement, Brfs[i].Beg, Brfs[i].End - Brfs[i].Beg);
+ if (Brfs[i].Refer >= 0 && Brfs[i].Refer < NMATCHES)
+ s += TString(str, PMatch[Brfs[i].Refer].rm_so, int(PMatch[Brfs[i].Refer].rm_eo - PMatch[Brfs[i].Refer].rm_so));
+ }
+ s += TString(Replacement, Brfs[i].Beg, Brfs[i].End - Brfs[i].Beg);
+ }
+ } else {
+ s = Replacement;
+ }
+ return s;
+}
+
+//***
+// ��� ������������ ������ aaa.$1.$$$$.$2.bbb.$$$ccc Brfs ����� �����:
+// {beg = 0, end = 4, Refer = 1} => "aaa." + $1_match
+// {beg = 6, end = 8, Refer = -1} => ".$"
+// {beg = 9, end = 10, Refer = -1} => "$"
+// {beg = 11, end = 12, Refer = 2} => "." + $2_match
+// {beg = 14, end = 20, Refer = -1} => ".bbb.$"
+// {beg = 21, end = 22, Refer = -1} => "$"
+// {beg = 22, end = 25, Refer = -1} => "ccc"
+// {beg = 0, end = 0, Refer = 0}
+//***
+int TRegExSubst::ParseReplacement(const char* repl) {
+ Replacement = repl;
+ if (!Replacement || *Replacement == 0)
+ return 0;
+ char* pos = (char*)Replacement;
+ char* pos1 = nullptr;
+ char* pos2 = nullptr;
+ int i = 0;
+ while (pos && *pos && i < NMATCHES) {
+ pos1 = strchr(pos, '$');
+ Brfs[i].Refer = -1;
+ pos2 = pos1;
+ if (pos1) {
+ pos2 = pos1 + 1;
+ while (IsAsciiDigit(*pos2))
+ pos2++;
+ if (pos2 > pos1 + 1) {
+ Brfs[i].Refer = atol(TString(Replacement, pos1 + 1 - Replacement, pos2 - (pos1 + 1)).data());
+ } else {
+ pos1++;
+ if (*pos2 == '$')
+ pos2++;
+ Brfs[i].Refer = -1;
+ }
+ }
+ Brfs[i].Beg = int(pos - (char*)Replacement);
+ Brfs[i].End = (pos1 == nullptr ? (int)strlen(Replacement) : int(pos1 - Replacement));
+ pos = pos2;
+ i++;
+ }
+ Brfs[i].Beg = Brfs[i].End = 0;
+ Brfs[i].Refer = -1;
+ BrfsCount = i;
+ return BrfsCount;
+}
diff --git a/library/cpp/regex/pcre/regexp.h b/library/cpp/regex/pcre/regexp.h
new file mode 100644
index 0000000000..bc610bd2f3
--- /dev/null
+++ b/library/cpp/regex/pcre/regexp.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include <sys/types.h>
+
+#include <util/system/defaults.h>
+#include <util/generic/string.h>
+#include <util/generic/yexception.h>
+
+#include <contrib/libs/pcre/pcre.h>
+#include <contrib/libs/pcre/pcreposix.h>
+
+//THIS CODE LOOKS LIKE A TRASH, BUT WORKS.
+
+#define NMATCHES 100
+#define REGEXP_GLOBAL 0x0080 // use this if you want to find all occurences
+
+class TRegExBaseImpl;
+
+class TRegExBase {
+protected:
+ TSimpleIntrusivePtr<TRegExBaseImpl> Impl;
+
+public:
+ TRegExBase(const char* regExpr = nullptr, int cflags = REG_EXTENDED);
+ TRegExBase(const TString& regExpr, int cflags = REG_EXTENDED);
+
+ virtual ~TRegExBase();
+
+ int Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches = NMATCHES) const;
+ void Compile(const TString& regExpr, int cflags = REG_EXTENDED);
+ bool IsCompiled() const;
+ int GetCompileOptions() const;
+ TString GetRegExpr() const;
+};
+
+class TRegExMatch: public TRegExBase {
+public:
+ TRegExMatch(const char* regExpr = nullptr, int cflags = REG_NOSUB | REG_EXTENDED);
+ TRegExMatch(const TString& regExpr, int cflags = REG_NOSUB | REG_EXTENDED);
+
+ bool Match(const char* str) const;
+};
+
+struct TBackReferences {
+ int Beg;
+ int End;
+ int Refer;
+};
+
+class TRegExSubst: public TRegExBase {
+private:
+ const char* Replacement;
+ regmatch_t PMatch[NMATCHES];
+
+ TBackReferences Brfs[NMATCHES];
+ int BrfsCount;
+
+public:
+ TRegExSubst(const char* regExpr = nullptr, int cflags = REG_EXTENDED);
+
+ TString Replace(const char* str, int eflags = 0);
+ int ParseReplacement(const char* replacement);
+};
diff --git a/library/cpp/regex/pcre/regexp_ut.cpp b/library/cpp/regex/pcre/regexp_ut.cpp
new file mode 100644
index 0000000000..5184e801cc
--- /dev/null
+++ b/library/cpp/regex/pcre/regexp_ut.cpp
@@ -0,0 +1,103 @@
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/string/strip.h>
+#include <library/cpp/regex/pcre/regexp.h>
+#include <util/stream/output.h>
+
+struct TRegTest {
+ const char* Regexp;
+ const char* Data;
+ const char* Result;
+ int CompileOptions;
+ int RunOptions;
+
+ TRegTest(const char* re, const char* text, const char* res, int copts = REG_EXTENDED, int ropts = 0)
+ : Regexp(re)
+ , Data(text)
+ , Result(res)
+ , CompileOptions(copts)
+ , RunOptions(ropts)
+ {
+ }
+};
+
+struct TSubstTest: public TRegTest {
+ const char* Replacement;
+ const char* Replacement2;
+
+ TSubstTest(const char* re, const char* text, const char* res, const char* repl, const char* repl2)
+ : TRegTest(re, text, res, REG_EXTENDED, REGEXP_GLOBAL)
+ , Replacement(repl)
+ , Replacement2(repl2)
+ {
+ }
+};
+
+const TRegTest REGTEST_DATA[] = {
+ TRegTest("test", "its a test and test string.", "6 10", REG_EXTENDED, 0),
+ TRegTest("test", "its a test and test string.", "6 10 15 19", REG_EXTENDED, REGEXP_GLOBAL),
+ TRegTest("test|[an]{0,0}", "test and test an test string tes", "0 4 4 4 5 5 6 6 7 7 8 8 9 13 13 13 14 14 15 15 16 16 17 21 21 21 22 22 23 23 24 24 25 25 26 26 27 27 28 28 29 29 30 30 31 31 32 32", REG_EXTENDED, REGEXP_GLOBAL),
+ TRegTest("test[an]{1,}", "test and test an test string tes", "NM", REG_EXTENDED, REGEXP_GLOBAL)};
+
+const TSubstTest SUBSTTEST_DATA[] = {
+ TSubstTest("([a-zA-Z]*[0-9]+) (_[a-z]+)", "Xxx123 534 ___124 bsd _A ZXC _L 141 _sd dsfg QWE123 _bbb", "141 XXX/_sd", "$1 XXX/$2", "$2$2$2 YY$1Y/$2")};
+
+class TRegexpTest: public TTestBase {
+private:
+ regmatch_t Matches[NMATCHES];
+
+private:
+ UNIT_TEST_SUITE(TRegexpTest);
+ UNIT_TEST(TestRe)
+ UNIT_TEST(TestSubst)
+ UNIT_TEST(TestOffEndOfBuffer);
+ UNIT_TEST_SUITE_END();
+
+ inline void TestRe() {
+ for (const auto& regTest : REGTEST_DATA) {
+ memset(Matches, 0, sizeof(Matches));
+ TString result;
+
+ TRegExBase re(regTest.Regexp, regTest.CompileOptions);
+ if (re.Exec(regTest.Data, Matches, regTest.RunOptions) == 0) {
+ for (auto& matche : Matches) {
+ if (matche.rm_so == -1) {
+ break;
+ }
+ result.append(Sprintf("%i %i ", matche.rm_so, matche.rm_eo));
+ }
+ } else {
+ result = "NM";
+ }
+ StripInPlace(result);
+ UNIT_ASSERT_VALUES_EQUAL(result, regTest.Result);
+ }
+ }
+
+ inline void TestSubst() {
+ for (const auto& substTest : SUBSTTEST_DATA) {
+ TRegExSubst subst(substTest.Regexp, substTest.CompileOptions);
+ subst.ParseReplacement(substTest.Replacement);
+ TString result = subst.Replace(substTest.Data, substTest.RunOptions);
+ UNIT_ASSERT_VALUES_EQUAL(result, substTest.Result);
+ TRegExSubst substCopy = subst;
+ subst.ParseReplacement(substTest.Replacement2);
+ TString newResult = subst.Replace(substTest.Data, substTest.RunOptions);
+ UNIT_ASSERT_VALUES_UNEQUAL(newResult.c_str(), result.c_str());
+ TString copyResult = substCopy.Replace(substTest.Data, substTest.RunOptions);
+ UNIT_ASSERT_VALUES_EQUAL(copyResult, result);
+ substCopy = subst;
+ copyResult = substCopy.Replace(substTest.Data, substTest.RunOptions);
+ UNIT_ASSERT_VALUES_EQUAL(copyResult, newResult);
+ }
+ }
+
+ void TestOffEndOfBuffer() {
+ const TString needle{".*[^./]gov[.].*"};
+ TRegExMatch re{needle, REG_UTF8};
+ const TString haystack{"fakty.ictv.ua"};
+ UNIT_ASSERT_VALUES_EQUAL(re.Match(haystack.c_str()), false);
+ }
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TRegexpTest);
diff --git a/library/cpp/regex/pcre/traits.h b/library/cpp/regex/pcre/traits.h
new file mode 100644
index 0000000000..e926bdd758
--- /dev/null
+++ b/library/cpp/regex/pcre/traits.h
@@ -0,0 +1,99 @@
+#pragma once
+
+#include <contrib/libs/pcre/pcre.h>
+
+#include <util/generic/ptr.h> // THolder
+#include <util/system/types.h> // wchar16, wchar32
+
+namespace NPcre {
+ template <class TCharType>
+ struct TPcreTraits;
+
+ template <>
+ struct TPcreTraits<char> {
+ using TCharType = char;
+ using TStringType = const char*;
+ using TCodeType = pcre;
+ using TExtraType = pcre_extra;
+ static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre_compile2;
+ static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre_study;
+ static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre_exec;
+ };
+
+ template <>
+ struct TPcreTraits<wchar16> {
+ using TCharType = wchar16;
+ using TStringType = PCRE_SPTR16;
+ using TCodeType = pcre16;
+ using TExtraType = pcre16_extra;
+ static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre16_compile2;
+ static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre16_study;
+ static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre16_exec;
+ };
+
+ template <>
+ struct TPcreTraits<wchar32> {
+ using TCharType = wchar32;
+ using TStringType = PCRE_SPTR32;
+ using TCodeType = pcre32;
+ using TExtraType = pcre32_extra;
+ static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre32_compile2;
+ static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre32_study;
+ static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre32_exec;
+ };
+
+ template <class TCharType>
+ struct TFreePcre;
+
+ template <>
+ struct TFreePcre<char> {
+ static inline void Destroy(void* ptr) noexcept {
+ pcre_free(ptr);
+ }
+ };
+
+ template <>
+ struct TFreePcre<wchar16> {
+ static inline void Destroy(void* ptr) noexcept {
+ pcre16_free(ptr);
+ }
+ };
+
+ template <>
+ struct TFreePcre<wchar32> {
+ static inline void Destroy(void* ptr) noexcept {
+ pcre32_free(ptr);
+ }
+ };
+
+ template <class TCharType>
+ struct TFreePcreExtra;
+
+ template <>
+ struct TFreePcreExtra<char> {
+ static inline void Destroy(pcre_extra* ptr) noexcept {
+ pcre_free_study(ptr);
+ }
+ };
+
+ template <>
+ struct TFreePcreExtra<wchar16> {
+ static inline void Destroy(pcre16_extra* ptr) noexcept {
+ pcre16_free_study(ptr);
+ }
+ };
+
+ template <>
+ struct TFreePcreExtra<wchar32> {
+ static inline void Destroy(pcre32_extra* ptr) noexcept {
+ pcre32_free_study(ptr);
+ }
+ };
+
+ template <typename TCharType>
+ using TPcreCode = THolder<typename TPcreTraits<TCharType>::TCodeType, TFreePcre<TCharType>>;
+
+ template <typename TCharType>
+ using TPcreExtra = THolder<typename TPcreTraits<TCharType>::TExtraType, TFreePcreExtra<TCharType>>;
+}
+
diff --git a/library/cpp/regex/pcre/ut/ya.make b/library/cpp/regex/pcre/ut/ya.make
new file mode 100644
index 0000000000..0721ef87c2
--- /dev/null
+++ b/library/cpp/regex/pcre/ut/ya.make
@@ -0,0 +1,10 @@
+UNITTEST_FOR(library/cpp/regex/pcre)
+
+OWNER(g:util)
+
+SRCS(
+ pcre_ut.cpp
+ regexp_ut.cpp
+)
+
+END()
diff --git a/library/cpp/regex/pcre/ya.make b/library/cpp/regex/pcre/ya.make
new file mode 100644
index 0000000000..d34911f103
--- /dev/null
+++ b/library/cpp/regex/pcre/ya.make
@@ -0,0 +1,23 @@
+LIBRARY()
+
+OWNER(g:util)
+
+PEERDIR(
+ contrib/libs/pcre
+ contrib/libs/pcre/pcre16
+ contrib/libs/pcre/pcre32
+ library/cpp/containers/stack_array
+)
+
+SRCS(
+ pcre.cpp
+ regexp.cpp
+)
+
+END()
+
+RECURSE_FOR_TESTS(
+ benchmark
+ ut
+)
+
diff --git a/library/cpp/regex/pire/extraencodings.cpp b/library/cpp/regex/pire/extraencodings.cpp
new file mode 100644
index 0000000000..2e507e4b67
--- /dev/null
+++ b/library/cpp/regex/pire/extraencodings.cpp
@@ -0,0 +1,81 @@
+#include <util/system/defaults.h>
+#include <util/system/yassert.h>
+#include <library/cpp/charset/codepage.h>
+#include <util/generic/singleton.h>
+#include <util/generic/yexception.h>
+#include <library/cpp/charset/doccodes.h>
+
+#include "pire.h"
+
+namespace NPire {
+ namespace {
+ // A one-byte encoding which is capable of transforming upper half of the character
+ // table to/from Unicode chars.
+ class TOneByte: public TEncoding {
+ public:
+ TOneByte(ECharset doccode) {
+ Table_ = CodePageByCharset(doccode)->unicode;
+ for (size_t i = 0; i < 256; ++i)
+ Reverse_.insert(std::make_pair(Table_[i], static_cast<char>(i)));
+ }
+
+ wchar32 FromLocal(const char*& begin, const char* end) const override {
+ if (begin != end)
+ return Table_[static_cast<unsigned char>(*begin++)];
+ else
+ ythrow yexception() << "EOF reached in Pire::OneByte::fromLocal()";
+ }
+
+ TString ToLocal(wchar32 c) const override {
+ THashMap<wchar32, char>::const_iterator i = Reverse_.find(c);
+ if (i != Reverse_.end())
+ return TString(1, i->second);
+ else
+ return TString();
+ }
+
+ void AppendDot(TFsm& fsm) const override {
+ fsm.AppendDot();
+ }
+
+ private:
+ const wchar32* Table_;
+ THashMap<wchar32, char> Reverse_;
+ };
+
+ template <unsigned N>
+ struct TOneByteHelper: public TOneByte {
+ inline TOneByteHelper()
+ : TOneByte((ECharset)N)
+ {
+ }
+ };
+ }
+
+ namespace NEncodings {
+ const NPire::TEncoding& Koi8r() {
+ return *Singleton<TOneByteHelper<CODES_KOI8>>();
+ }
+
+ const NPire::TEncoding& Cp1251() {
+ return *Singleton<TOneByteHelper<CODES_WIN>>();
+ }
+
+ const NPire::TEncoding& Get(ECharset encoding) {
+ switch (encoding) {
+ case CODES_WIN:
+ return Cp1251();
+ case CODES_KOI8:
+ return Koi8r();
+ case CODES_ASCII:
+ return NPire::NEncodings::Latin1();
+ case CODES_UTF8:
+ return NPire::NEncodings::Utf8();
+ default:
+ ythrow yexception() << "Pire::Encodings::get(ECharset): unknown encoding " << (int)encoding;
+ }
+ }
+
+ }
+
+}
diff --git a/library/cpp/regex/pire/inline/ya.make b/library/cpp/regex/pire/inline/ya.make
new file mode 100644
index 0000000000..d4850f7b45
--- /dev/null
+++ b/library/cpp/regex/pire/inline/ya.make
@@ -0,0 +1,22 @@
+PROGRAM(pire_inline)
+
+CFLAGS(-DPIRE_NO_CONFIG)
+
+OWNER(
+ g:util
+ davenger
+)
+
+PEERDIR(
+ ADDINCL library/cpp/regex/pire
+)
+
+SRCDIR(
+ contrib/libs/pire/pire
+)
+
+SRCS(
+ inline.l
+)
+
+END()
diff --git a/library/cpp/regex/pire/pcre2pire.cpp b/library/cpp/regex/pire/pcre2pire.cpp
new file mode 100644
index 0000000000..f788beb85f
--- /dev/null
+++ b/library/cpp/regex/pire/pcre2pire.cpp
@@ -0,0 +1,110 @@
+#include "pcre2pire.h"
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+
+TString Pcre2Pire(const TString& src) {
+ TVector<char> result;
+ result.reserve(src.size() + 1);
+
+ enum EState {
+ S_SIMPLE,
+ S_SLASH,
+ S_BRACE,
+ S_EXPECT_Q,
+ S_QUESTION,
+ S_P,
+ S_COMMA,
+ S_IN,
+ };
+
+ EState state = S_SIMPLE;
+
+ for (ui32 i = 0; i < src.size(); ++i) {
+ const char c = src[i];
+
+ switch (state) {
+ case S_SIMPLE:
+ if (c == '\\') {
+ state = S_SLASH;
+ } else if (c == '(') {
+ state = S_BRACE;
+ } else if (c == '*' || c == '?') {
+ state = S_EXPECT_Q;
+ result.push_back(c);
+ } else {
+ if (c == ')' && result.size() > 0 && result.back() == '(') {
+ // eliminating "()"
+ result.pop_back();
+ } else {
+ result.push_back(c);
+ }
+ }
+ break;
+ case S_SLASH:
+ state = S_SIMPLE;
+ if (c == ':' || c == '=' || c == '#' || c == '&') {
+ result.push_back(c);
+ } else {
+ result.push_back('\\');
+ --i;
+ }
+ break;
+ case S_BRACE:
+ if (c == '?') {
+ state = S_QUESTION;
+ } else {
+ state = S_COMMA;
+ --i;
+ }
+ break;
+ case S_EXPECT_Q:
+ state = S_SIMPLE;
+ if (c != '?') {
+ --i;
+ }
+ break;
+ case S_QUESTION:
+ if (c == 'P') {
+ state = S_P;
+ } else if (c == ':' || c == '=') {
+ state = S_COMMA;
+ } else {
+ ythrow yexception() << "Pcre to pire convertaion failed: unexpected symbol '" << c << "' at posiotion " << i << "!";
+ }
+ break;
+ case S_P:
+ if (c == '<') {
+ state = S_IN;
+ } else {
+ ythrow yexception() << "Pcre to pire convertaion failed: unexpected symbol '" << c << "' at posiotion " << i << "!";
+ }
+ break;
+ case S_IN:
+ if (c == '>') {
+ state = S_COMMA;
+ } else {
+ // nothing to do
+ }
+ break;
+ case S_COMMA:
+ state = S_SIMPLE;
+ if (c == ')') {
+ // nothing to do
+ } else {
+ result.push_back('(');
+ --i;
+ }
+ break;
+ default:
+ ythrow yexception() << "Pcre to pire convertaion failed: unexpected automata state!";
+ }
+ }
+
+ if (state != S_SIMPLE && state != S_EXPECT_Q) {
+ ythrow yexception() << "Pcre to pire convertaion failed: unexpected end of expression!";
+ }
+
+ result.push_back('\0');
+
+ return &result[0];
+}
diff --git a/library/cpp/regex/pire/pcre2pire.h b/library/cpp/regex/pire/pcre2pire.h
new file mode 100644
index 0000000000..46e45b9193
--- /dev/null
+++ b/library/cpp/regex/pire/pcre2pire.h
@@ -0,0 +1,19 @@
+#pragma once
+
+// Author: smikler@yandex-team.ru
+
+#include <util/generic/string.h>
+
+/* Converts pcre regular expression to pire compatible format:
+ * - replaces "\\#" with "#"
+ * - replaces "\\=" with "="
+ * - replaces "\\:" with ":"
+ * - removes "?P<...>"
+ * - removes "?:"
+ * - removes "()" recursively
+ * - replaces "??" with "?"
+ * - replaces "*?" with "*"
+ * NOTE:
+ * - Not fully tested!
+ */
+TString Pcre2Pire(const TString& src);
diff --git a/library/cpp/regex/pire/pire.h b/library/cpp/regex/pire/pire.h
new file mode 100644
index 0000000000..286fecd693
--- /dev/null
+++ b/library/cpp/regex/pire/pire.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#ifndef PIRE_NO_CONFIG
+#define PIRE_NO_CONFIG
+#endif
+
+#include <contrib/libs/pire/pire/pire.h>
+#include <contrib/libs/pire/pire/extra.h>
+
+#include <library/cpp/charset/doccodes.h>
+
+namespace NPire {
+ using TChar = Pire::Char;
+ using Pire::MaxChar;
+
+ // Scanner classes
+ using TScanner = Pire::Scanner;
+ using TNonrelocScanner = Pire::NonrelocScanner;
+ using TScannerNoMask = Pire::ScannerNoMask;
+ using TNonrelocScannerNoMask = Pire::NonrelocScannerNoMask;
+ using THalfFinalScanner = Pire::HalfFinalScanner;
+ using TNonrelocHalfFinalScanner = Pire::NonrelocHalfFinalScanner;
+ using THalfFinalScannerNoMask = Pire::HalfFinalScannerNoMask;
+ using TNonrelocHalfFinalScannerNoMask = Pire::NonrelocHalfFinalScannerNoMask;
+ using TSimpleScanner = Pire::SimpleScanner;
+ using TSlowScanner = Pire::SlowScanner;
+ using TCapturingScanner = Pire::CapturingScanner;
+ using TSlowCapturingScanner = Pire::SlowCapturingScanner;
+ using TCountingScanner = Pire::CountingScanner;
+
+ template <typename T1, typename T2>
+ using TScannerPair = Pire::ScannerPair<T1, T2>;
+
+ // Helper classes
+ using TFsm = Pire::Fsm;
+ using TLexer = Pire::Lexer;
+ using TTerm = Pire::Term;
+ using TEncoding = Pire::Encoding;
+ using TFeature = Pire::Feature;
+ using TFeaturePtr = Pire::Feature::Ptr;
+ using TError = Pire::Error;
+
+ // Helper functions
+ using Pire::LongestPrefix;
+ using Pire::LongestSuffix;
+ using Pire::Matches;
+ using Pire::MmappedScanner;
+ using Pire::Run;
+ using Pire::Runner;
+ using Pire::ShortestPrefix;
+ using Pire::ShortestSuffix;
+ using Pire::Step;
+
+ using namespace Pire::SpecialChar;
+ using namespace Pire::Consts;
+
+ namespace NFeatures {
+ using Pire::Features::AndNotSupport;
+ using Pire::Features::Capture;
+ using Pire::Features::CaseInsensitive;
+ using Pire::Features::GlueSimilarGlyphs;
+ }
+
+ namespace NEncodings {
+ using Pire::Encodings::Latin1;
+ using Pire::Encodings::Utf8;
+
+ const NPire::TEncoding& Koi8r();
+ const NPire::TEncoding& Cp1251();
+ const NPire::TEncoding& Get(ECharset encoding);
+ }
+
+ namespace NTokenTypes {
+ using namespace Pire::TokenTypes;
+ }
+}
diff --git a/library/cpp/regex/pire/regexp.h b/library/cpp/regex/pire/regexp.h
new file mode 100644
index 0000000000..94bba4064b
--- /dev/null
+++ b/library/cpp/regex/pire/regexp.h
@@ -0,0 +1,337 @@
+#pragma once
+
+#include "pire.h"
+
+#include <library/cpp/charset/doccodes.h>
+#include <library/cpp/charset/recyr.hh>
+#include <util/generic/maybe.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+
+namespace NRegExp {
+ struct TMatcher;
+
+ struct TFsmBase {
+ struct TOptions {
+ inline TOptions& SetCaseInsensitive(bool v) noexcept {
+ CaseInsensitive = v;
+ return *this;
+ }
+
+ inline TOptions& SetSurround(bool v) noexcept {
+ Surround = v;
+ return *this;
+ }
+
+ inline TOptions& SetCapture(size_t pos) noexcept {
+ CapturePos = pos;
+ return *this;
+ }
+
+ inline TOptions& SetCharset(ECharset charset) noexcept {
+ Charset = charset;
+ return *this;
+ }
+
+ inline TOptions& SetAndNotSupport(bool andNotSupport) noexcept {
+ AndNotSupport = andNotSupport;
+ return *this;
+ }
+
+ bool CaseInsensitive = false;
+ bool Surround = false;
+ TMaybe<size_t> CapturePos;
+ ECharset Charset = CODES_UNKNOWN;
+ bool AndNotSupport = false;
+ };
+
+ static inline NPire::TFsm Parse(const TStringBuf& regexp,
+ const TOptions& opts, const bool needDetermine = true) {
+ NPire::TLexer lexer;
+ if (opts.Charset == CODES_UNKNOWN) {
+ lexer.Assign(regexp.data(), regexp.data() + regexp.size());
+ } else {
+ TVector<wchar32> ucs4(regexp.size() + 1);
+ size_t inRead = 0;
+ size_t outWritten = 0;
+ int recodeRes = RecodeToUnicode(opts.Charset, regexp.data(), ucs4.data(),
+ regexp.size(), regexp.size(), inRead, outWritten);
+ Y_ASSERT(recodeRes == RECODE_OK);
+ Y_ASSERT(outWritten < ucs4.size());
+ ucs4[outWritten] = 0;
+
+ lexer.Assign(ucs4.begin(),
+ ucs4.begin() + std::char_traits<wchar32>::length(ucs4.data()));
+ }
+
+ if (opts.CaseInsensitive) {
+ lexer.AddFeature(NPire::NFeatures::CaseInsensitive());
+ }
+
+ if (opts.CapturePos) {
+ lexer.AddFeature(NPire::NFeatures::Capture(*opts.CapturePos));
+ }
+
+ if (opts.AndNotSupport) {
+ lexer.AddFeature(NPire::NFeatures::AndNotSupport());
+ }
+
+ switch (opts.Charset) {
+ case CODES_UNKNOWN:
+ break;
+ case CODES_UTF8:
+ lexer.SetEncoding(NPire::NEncodings::Utf8());
+ break;
+ case CODES_KOI8:
+ lexer.SetEncoding(NPire::NEncodings::Koi8r());
+ break;
+ default:
+ lexer.SetEncoding(NPire::NEncodings::Get(opts.Charset));
+ break;
+ }
+
+ NPire::TFsm ret = lexer.Parse();
+
+ if (opts.Surround) {
+ ret.Surround();
+ }
+
+ if (needDetermine) {
+ ret.Determine();
+ }
+
+ return ret;
+ }
+ };
+
+ template <class TScannerType>
+ class TFsmParser: public TFsmBase {
+ public:
+ typedef TScannerType TScanner;
+
+ public:
+ inline explicit TFsmParser(const TStringBuf& regexp,
+ const TOptions& opts = TOptions(), bool needDetermine = true)
+ : Scanner(Parse(regexp, opts, needDetermine).template Compile<TScanner>())
+ {
+ }
+
+ inline const TScanner& GetScanner() const noexcept {
+ return Scanner;
+ }
+
+ static inline TFsmParser False() {
+ return TFsmParser(NPire::TFsm::MakeFalse().Compile<TScanner>());
+ }
+
+ inline explicit TFsmParser(const TScanner& compiled)
+ : Scanner(compiled)
+ {
+ if (Scanner.Empty())
+ ythrow yexception() << "Can't create fsm with empty scanner";
+ }
+
+ private:
+ TScanner Scanner;
+ };
+
+ class TFsm: public TFsmParser<NPire::TNonrelocScanner> {
+ public:
+ inline explicit TFsm(const TStringBuf& regexp,
+ const TOptions& opts = TOptions())
+ : TFsmParser<TScanner>(regexp, opts)
+ {
+ }
+
+ inline TFsm(const TFsmParser<TScanner>& fsm)
+ : TFsmParser<TScanner>(fsm)
+ {
+ }
+
+ static inline TFsm Glue(const TFsm& l, const TFsm& r) {
+ return TFsm(TScanner::Glue(l.GetScanner(), r.GetScanner()));
+ }
+
+ inline explicit TFsm(const TScanner& compiled)
+ : TFsmParser<TScanner>(compiled)
+ {
+ }
+ };
+
+ static inline TFsm operator|(const TFsm& l, const TFsm& r) {
+ return TFsm::Glue(l, r);
+ }
+
+ struct TCapturingFsm : TFsmParser<NPire::TCapturingScanner> {
+ inline explicit TCapturingFsm(const TStringBuf& regexp,
+ TOptions opts = TOptions())
+ : TFsmParser<TScanner>(regexp,
+ opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1)) {
+ }
+
+ inline TCapturingFsm(const TFsmParser<TScanner>& fsm)
+ : TFsmParser<TScanner>(fsm)
+ {
+ }
+ };
+
+ struct TSlowCapturingFsm : TFsmParser<NPire::TSlowCapturingScanner> {
+ inline explicit TSlowCapturingFsm(const TStringBuf& regexp,
+ TOptions opts = TOptions())
+ : TFsmParser<TScanner>(regexp,
+ opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1), false) {
+ }
+
+ inline TSlowCapturingFsm(const TFsmParser<TScanner>& fsm)
+ : TFsmParser<TScanner>(fsm)
+ {
+ }
+ };
+
+ template <class TFsm>
+ class TMatcherBase {
+ public:
+ typedef typename TFsm::TScanner::State TState;
+
+ public:
+ inline explicit TMatcherBase(const TFsm& fsm)
+ : Fsm(fsm)
+ {
+ Fsm.GetScanner().Initialize(State);
+ }
+
+ inline bool Final() const noexcept {
+ return GetScanner().Final(GetState());
+ }
+
+ protected:
+ inline void Run(const char* data, size_t len, bool addBegin, bool addEnd) noexcept {
+ if (addBegin) {
+ NPire::Step(GetScanner(), State, NPire::BeginMark);
+ }
+ NPire::Run(GetScanner(), State, data, data + len);
+ if (addEnd) {
+ NPire::Step(GetScanner(), State, NPire::EndMark);
+ }
+ }
+
+ inline const typename TFsm::TScanner& GetScanner() const noexcept {
+ return Fsm.GetScanner();
+ }
+
+ inline const TState& GetState() const noexcept {
+ return State;
+ }
+
+ private:
+ const TFsm& Fsm;
+ TState State;
+ };
+
+ struct TMatcher : TMatcherBase<TFsm> {
+ inline explicit TMatcher(const TFsm& fsm)
+ : TMatcherBase<TFsm>(fsm)
+ {
+ }
+
+ inline TMatcher& Match(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
+ Run(data, len, addBegin, addEnd);
+ return *this;
+ }
+
+ inline TMatcher& Match(const TStringBuf& s, bool addBegin = false, bool addEnd = false) noexcept {
+ return Match(s.data(), s.size(), addBegin, addEnd);
+ }
+
+ inline const char* Find(const char* b, const char* e) noexcept {
+ return NPire::ShortestPrefix(GetScanner(), b, e);
+ }
+
+ typedef std::pair<const size_t*, const size_t*> TMatchedRegexps;
+
+ inline TMatchedRegexps MatchedRegexps() const noexcept {
+ return GetScanner().AcceptedRegexps(GetState());
+ }
+ };
+
+ class TSearcher: public TMatcherBase<TCapturingFsm> {
+ public:
+ inline explicit TSearcher(const TCapturingFsm& fsm)
+ : TMatcherBase<TCapturingFsm>(fsm)
+ {
+ }
+
+ inline bool Captured() const noexcept {
+ return GetState().Captured();
+ }
+
+ inline TSearcher& Search(const char* data, size_t len, bool addBegin = true, bool addEnd = true) noexcept {
+ Data = TStringBuf(data, len);
+ Run(data, len, addBegin, addEnd);
+ return *this;
+ }
+
+ inline TSearcher& Search(const TStringBuf& s) noexcept {
+ return Search(s.data(), s.size());
+ }
+
+ inline TStringBuf GetCaptured() const noexcept {
+ return TStringBuf(Data.data() + GetState().Begin() - 1,
+ Data.data() + GetState().End() - 1);
+ }
+
+ private:
+ TStringBuf Data;
+ };
+
+ class TSlowSearcher : TMatcherBase<TSlowCapturingFsm>{
+ public:
+ typedef typename TSlowCapturingFsm::TScanner::State TState;
+ inline explicit TSlowSearcher(const TSlowCapturingFsm& fsm)
+ : TMatcherBase<TSlowCapturingFsm>(fsm)
+ , HasCaptured(false)
+ {
+ }
+
+ inline bool Captured() const noexcept {
+ return HasCaptured;
+ }
+
+ inline TSlowSearcher& Search(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
+ TStringBuf textData(data, len);
+ Data = textData;
+ Run(Data.begin(), Data.size(), addBegin, addEnd);
+ return GetAns();
+ }
+
+ inline TSlowSearcher& Search(const TStringBuf& s) noexcept {
+ return Search(s.data(), s.size());
+ }
+
+ inline TStringBuf GetCaptured() const noexcept {
+ return Ans;
+ }
+
+ private:
+ TStringBuf Data;
+ TStringBuf Ans;
+ bool HasCaptured;
+
+ inline TSlowSearcher& GetAns() {
+ auto state = GetState();
+ Pire::SlowCapturingScanner::SingleState final;
+ if (!GetScanner().GetCapture(state, final)) {
+ HasCaptured = false;
+ } else {
+ if (!final.HasEnd()) {
+ final.SetEnd(Data.size());
+ }
+ Ans = TStringBuf(Data, final.GetBegin(), final.GetEnd() - final.GetBegin());
+ HasCaptured = true;
+ }
+ return *this;
+ }
+ };
+}
diff --git a/library/cpp/regex/pire/ut/regexp_ut.cpp b/library/cpp/regex/pire/ut/regexp_ut.cpp
new file mode 100644
index 0000000000..e7206de9ad
--- /dev/null
+++ b/library/cpp/regex/pire/ut/regexp_ut.cpp
@@ -0,0 +1,318 @@
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <library/cpp/regex/pire/regexp.h>
+#include <library/cpp/regex/pire/pcre2pire.h>
+
+Y_UNIT_TEST_SUITE(TRegExp) {
+ using namespace NRegExp;
+
+ Y_UNIT_TEST(False) {
+ UNIT_ASSERT(!TMatcher(TFsm::False()).Match("").Final());
+ UNIT_ASSERT(!TMatcher(TFsm::False()).Match(TStringBuf{}).Final());
+ }
+
+ Y_UNIT_TEST(Surround) {
+ UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
+ UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(false))).Match("aqwb").Final());
+ }
+
+ Y_UNIT_TEST(Boundaries) {
+ UNIT_ASSERT(!TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
+ UNIT_ASSERT(!TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
+ UNIT_ASSERT(TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+ UNIT_ASSERT(TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+ UNIT_ASSERT(!TMatcher(TFsm("qw$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+ UNIT_ASSERT(!TMatcher(TFsm("^qw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+
+ UNIT_ASSERT(TMatcher(TFsm("^aqwb$", TFsm::TOptions().SetSurround(true)))
+ .Match(TStringBuf("a"), true, false)
+ .Match(TStringBuf("q"), false, false)
+ .Match(TStringBuf("w"), false, false)
+ .Match(TStringBuf("b"), false, true)
+ .Final());
+ }
+
+ Y_UNIT_TEST(Case) {
+ UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true))).Match("Qw").Final());
+ UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false))).Match("Qw").Final());
+ }
+
+ Y_UNIT_TEST(UnicodeCase) {
+ UNIT_ASSERT(TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(true))).Match("Ab").Final());
+ UNIT_ASSERT(!TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(false))).Match("Ab").Final());
+ }
+
+ Y_UNIT_TEST(Utf) {
+ NRegExp::TFsmBase::TOptions opts;
+ opts.Charset = CODES_UTF8;
+ opts.Surround = true;
+ UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("wtf").Final());
+ UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("чзн").Final());
+ UNIT_ASSERT(TMatcher(TFsm("ч.*", opts)).Match("чзн").Final());
+ UNIT_ASSERT(!TMatcher(TFsm("чзн", opts)).Match("чзх").Final());
+ }
+
+ Y_UNIT_TEST(AndNot) {
+ NRegExp::TFsmBase::TOptions opts;
+ opts.AndNotSupport = true;
+ {
+ NRegExp::TFsm fsm(".*&~([0-9]*)", opts);
+ UNIT_ASSERT(TMatcher(fsm).Match("a2").Final());
+ UNIT_ASSERT(TMatcher(fsm).Match("ab").Final());
+ UNIT_ASSERT(TMatcher(fsm).Match("1a").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("12").Final());
+ }
+ {
+ NRegExp::TFsm fsm(".*&~(.*[0-9].*)", opts);
+ UNIT_ASSERT(TMatcher(fsm).Match("ab").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("a2").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("1a").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("12").Final());
+ }
+ {
+ NRegExp::TFsm fsm(
+ "((([a-z0-9_\\-]+[.])*[a-z0-9_\\-]+)"
+ "&~(\\d+[.]\\d+[.]\\d+[.]\\d+))(:\\d+)?",
+ TFsm::TOptions().SetCaseInsensitive(true).SetAndNotSupport(true)
+ );
+ UNIT_ASSERT(TMatcher(fsm).Match("yandex.ru").Final());
+ UNIT_ASSERT(TMatcher(fsm).Match("yandex").Final());
+ UNIT_ASSERT(TMatcher(fsm).Match("yandex:80").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1:8080").Final());
+ }
+ }
+
+ Y_UNIT_TEST(Glue) {
+ TFsm glued =
+ TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true)) |
+ TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false)) |
+ TFsm("abc", TFsm::TOptions().SetCaseInsensitive(false));
+ UNIT_ASSERT(TMatcher(glued).Match("Qw").Final());
+ UNIT_ASSERT(TMatcher(glued).Match("Qw").Final());
+ UNIT_ASSERT(TMatcher(glued).Match("abc").Final());
+ UNIT_ASSERT(!TMatcher(glued).Match("Abc").Final());
+ }
+
+ Y_UNIT_TEST(Capture1) {
+ TCapturingFsm fsm("here we have user_id=([a-z0-9]+);");
+
+ TSearcher searcher(fsm);
+ searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a"));
+ }
+
+ Y_UNIT_TEST(Capture2) {
+ TCapturingFsm fsm("w([abcdez]+)f");
+
+ TSearcher searcher(fsm);
+ searcher.Search("wabcdef");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("abcde"));
+ }
+
+ Y_UNIT_TEST(Capture3) {
+ TCapturingFsm fsm("http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)",
+ TFsm::TOptions().SetCapture(2));
+
+ TSearcher searcher(fsm);
+ searcher.Search("http://vkontakte.ru/id100500");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500"));
+ }
+
+ Y_UNIT_TEST(Capture4) {
+ TCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!",
+ TFsm::TOptions().SetCharset(CODES_UTF8));
+
+ TSearcher searcher(fsm);
+ searcher.Search(" Здравствуйте, Уважаемый (-ая)! ");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)"));
+ }
+
+ Y_UNIT_TEST(Capture5) {
+ TCapturingFsm fsm("away\\.php\\?to=http:([^\"])+\"");
+ TSearcher searcher(fsm);
+ searcher.Search("\"/away.php?to=http:some.addr\"&id=1");
+ UNIT_ASSERT(searcher.Captured());
+ //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr"));
+ }
+
+ Y_UNIT_TEST(Capture6) {
+ TCapturingFsm fsm("(/to-match-with)");
+ TSearcher searcher(fsm);
+ searcher.Search("/some/table/path/to-match-with");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("/to-match-with"));
+ }
+
+ Y_UNIT_TEST(Capture7) {
+ TCapturingFsm fsm("(pref.*suff)");
+ TSearcher searcher(fsm);
+ searcher.Search("ala pref bla suff cla");
+ UNIT_ASSERT(searcher.Captured());
+ //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref bla suff"));
+ }
+
+ Y_UNIT_TEST(CaptureXA) {
+ TCapturingFsm fsm(".*(xa).*");
+
+ TSearcher searcher(fsm);
+ searcher.Search("xa");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xa"));
+ }
+
+ Y_UNIT_TEST(CaptureWrongXX) {
+ TCapturingFsm fsm(".*(xx).*");
+
+ TSearcher searcher(fsm);
+ searcher.Search("xx");
+ UNIT_ASSERT(searcher.Captured());
+ // Surprise!
+ // TCapturingFsm uses a fast - O(|text|) - but incorrect algorithm.
+ // It works more or less for a particular class of regexps to which ".*(xx).*" does not belong.
+ // So it returns not the expected "xx" but just the second "x" instead.
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("x"));
+ }
+
+ Y_UNIT_TEST(CaptureRight1XX) {
+ TCapturingFsm fsm("[^x]+(xx).*");
+
+ TSearcher searcher(fsm);
+
+ searcher.Search("xxx");
+ UNIT_ASSERT(!searcher.Captured());
+ }
+
+ Y_UNIT_TEST(CaptureRight2XX) {
+ TCapturingFsm fsm("[^x]+(xx).*");
+
+ TSearcher searcher(fsm);
+
+ searcher.Search("axx");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
+ }
+
+ Y_UNIT_TEST(CaptureRight3XX) {
+ TCapturingFsm fsm("[^x]+(xx).*");
+
+ TSearcher searcher(fsm);
+
+ searcher.Search("axxb");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
+ }
+
+ Y_UNIT_TEST(SlowCaptureXX) {
+ TSlowCapturingFsm fsm(".*(xx).*");
+
+ TSlowSearcher searcher(fsm);
+ searcher.Search("xx");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
+ }
+
+ Y_UNIT_TEST(SlowCapture) {
+ TSlowCapturingFsm fsm("^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)",
+ TFsm::TOptions().SetCapture(2));
+ TSlowSearcher searcher(fsm);
+ searcher.Search("http://vkontakte.ru/id100500");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500"));
+ }
+
+ Y_UNIT_TEST(SlowCaptureGreedy) {
+ TSlowCapturingFsm fsm(".*(pref.*suff)");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("pref ala bla pref cla suff dla");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref cla suff"));
+ }
+
+ Y_UNIT_TEST(SlowCaptureNonGreedy) {
+ TSlowCapturingFsm fsm(".*?(pref.*suff)");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("pref ala bla pref cla suff dla");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref ala bla pref cla suff"));
+ }
+
+ Y_UNIT_TEST(SlowCapture2) {
+ TSlowCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!",
+ TFsm::TOptions().SetCharset(CODES_UTF8));
+
+ TSlowSearcher searcher(fsm);
+ searcher.Search(" Здравствуйте, Уважаемый (-ая)! ");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)"));
+ }
+
+ Y_UNIT_TEST(SlowCapture3) {
+ TSlowCapturingFsm fsm("here we have user_id=([a-z0-9]+);");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a"));
+ }
+
+ Y_UNIT_TEST(SlowCapture4) {
+ TSlowCapturingFsm fsm("away\\.php\\?to=http:([^\"]+)\"");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("\"/away.php?to=http:some.addr\"&id=1");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr"));
+ }
+
+ Y_UNIT_TEST(CapturedEmptySlow) {
+ TSlowCapturingFsm fsm("Comments=(.*)$");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("And Comments=");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf(""));
+ }
+
+ Y_UNIT_TEST(CaptureInOrFirst) {
+ TSlowCapturingFsm fsm("(A)|A");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("A");
+ UNIT_ASSERT(searcher.Captured());
+ }
+
+ Y_UNIT_TEST(CaptureInOrSecond) {
+ TSlowCapturingFsm fsm("A|(A)");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("A");
+ UNIT_ASSERT(!searcher.Captured());
+ }
+
+ Y_UNIT_TEST(CaptureOutside) {
+ TSlowCapturingFsm fsm("((ID=([0-9]+))?)");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("ID=");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf(""));
+ }
+
+ Y_UNIT_TEST(CaptureInside) {
+ TSlowCapturingFsm fsm("((ID=([0-9]+))?)",
+ TFsm::TOptions().SetCapture(2));
+ TSlowSearcher searcher(fsm);
+ searcher.Search("ID=");
+ UNIT_ASSERT(!searcher.Captured());
+ }
+
+ Y_UNIT_TEST(Pcre2PireTest) {
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)"), "(fake)");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)??"), "(fake)?");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)*?fake"), "(fake)*fake");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>fake)"), "(fake)");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("fake\\#"), "fake#");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>)fake"), "fake");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?P<field1>)(?P<field2>))"), "");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?:fake))"), "((fake))");
+ }
+}
diff --git a/library/cpp/regex/pire/ut/ya.make b/library/cpp/regex/pire/ut/ya.make
new file mode 100644
index 0000000000..8776695f40
--- /dev/null
+++ b/library/cpp/regex/pire/ut/ya.make
@@ -0,0 +1,44 @@
+# this test in not linked into build tree with ReCURSE and is built by unittest/library
+
+UNITTEST()
+
+OWNER(
+ g:util
+ davenger
+)
+
+SET(PIRETESTSDIR contrib/libs/pire/ut)
+
+CFLAGS(-DPIRE_NO_CONFIG)
+
+PEERDIR(
+ library/cpp/regex/pire
+)
+
+SRCDIR(
+ ${PIRETESTSDIR}
+)
+
+ADDINCL(
+ contrib/libs/pire/pire
+ contrib/libs/pire/ut
+)
+
+SRCS(
+ pire_ut.cpp
+ capture_ut.cpp
+ count_ut.cpp
+ glyph_ut.cpp
+ easy_ut.cpp
+ read_unicode_ut.cpp
+ regexp_ut.cpp
+ approx_matching_ut.cpp
+)
+
+SIZE(MEDIUM)
+
+TIMEOUT(600)
+
+PIRE_INLINE(inline_ut.cpp)
+
+END()
diff --git a/library/cpp/regex/pire/ya.make b/library/cpp/regex/pire/ya.make
new file mode 100644
index 0000000000..c857e6d18b
--- /dev/null
+++ b/library/cpp/regex/pire/ya.make
@@ -0,0 +1,40 @@
+LIBRARY()
+
+OWNER(
+ g:util
+ g:antiinfra
+ davenger
+ pg
+)
+
+CFLAGS(-DPIRE_NO_CONFIG)
+
+SRCDIR(contrib/libs/pire/pire)
+
+SRCS(
+ pcre2pire.cpp
+ classes.cpp
+ encoding.cpp
+ fsm.cpp
+ scanner_io.cpp
+ easy.cpp
+ scanners/null.cpp
+ extra/capture.cpp
+ extra/count.cpp
+ extra/glyphs.cpp
+ re_lexer.cpp
+ re_parser.y
+ read_unicode.cpp
+ extraencodings.cpp
+ approx_matching.cpp
+ half_final_fsm.cpp
+ minimize.h
+)
+
+PEERDIR(
+ library/cpp/charset
+)
+
+END()
+
+RECURSE_FOR_TESTS(ut)
diff --git a/library/cpp/regex/ya.make b/library/cpp/regex/ya.make
new file mode 100644
index 0000000000..15b0d1aeda
--- /dev/null
+++ b/library/cpp/regex/ya.make
@@ -0,0 +1,14 @@
+RECURSE(
+ glob
+ hyperscan
+ hyperscan/ut
+ libregex
+ pcre
+ pire
+ pire/inline
+ pire/ut
+ pire2hyperscan
+ pire2hyperscan/ut
+ regexp_classifier
+ regexp_classifier/ut
+)