aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/regex/pire
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/pire
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex/pire')
-rw-r--r--library/cpp/regex/pire/extraencodings.cpp81
-rw-r--r--library/cpp/regex/pire/inline/ya.make22
-rw-r--r--library/cpp/regex/pire/pcre2pire.cpp110
-rw-r--r--library/cpp/regex/pire/pcre2pire.h19
-rw-r--r--library/cpp/regex/pire/pire.h76
-rw-r--r--library/cpp/regex/pire/regexp.h337
-rw-r--r--library/cpp/regex/pire/ut/regexp_ut.cpp318
-rw-r--r--library/cpp/regex/pire/ut/ya.make44
-rw-r--r--library/cpp/regex/pire/ya.make40
9 files changed, 1047 insertions, 0 deletions
diff --git a/library/cpp/regex/pire/extraencodings.cpp b/library/cpp/regex/pire/extraencodings.cpp
new file mode 100644
index 0000000000..2e507e4b67
--- /dev/null
+++ b/library/cpp/regex/pire/extraencodings.cpp
@@ -0,0 +1,81 @@
+#include <util/system/defaults.h>
+#include <util/system/yassert.h>
+#include <library/cpp/charset/codepage.h>
+#include <util/generic/singleton.h>
+#include <util/generic/yexception.h>
+#include <library/cpp/charset/doccodes.h>
+
+#include "pire.h"
+
+namespace NPire {
+ namespace {
+ // A one-byte encoding which is capable of transforming upper half of the character
+ // table to/from Unicode chars.
+ // Single-byte encoding driven by a codepage table: every byte value maps to one
+ // Unicode code point, and a hash map provides the opposite direction.
+ class TOneByte: public TEncoding {
+ public:
+     // Picks up the byte -> Unicode table of the codepage registered for `doccode`
+     // and fills the reverse (Unicode -> byte) map from it.
+     TOneByte(ECharset doccode)
+         : Table_(CodePageByCharset(doccode)->unicode)
+     {
+         for (size_t byte = 0; byte != 256; ++byte) {
+             Reverse_.insert(std::make_pair(Table_[byte], static_cast<char>(byte)));
+         }
+     }
+
+     // Decodes one character and advances `begin` past it.
+     // Throws yexception when the input range is already exhausted.
+     wchar32 FromLocal(const char*& begin, const char* end) const override {
+         if (begin == end) {
+             ythrow yexception() << "EOF reached in Pire::OneByte::fromLocal()";
+         }
+         const unsigned char byte = static_cast<unsigned char>(*begin);
+         ++begin;
+         return Table_[byte];
+     }
+
+     // Encodes `c` as a one-character string; yields an empty string when the
+     // code point is absent from the reverse map.
+     TString ToLocal(wchar32 c) const override {
+         const auto found = Reverse_.find(c);
+         return found == Reverse_.end() ? TString() : TString(1, found->second);
+     }
+
+     // Delegates to the generic dot of the FSM.
+     void AppendDot(TFsm& fsm) const override {
+         fsm.AppendDot();
+     }
+
+ private:
+     const wchar32* Table_;            // byte -> Unicode table taken from the codepage
+     THashMap<wchar32, char> Reverse_; // Unicode -> byte lookup
+ };
+
+ // Default-constructible flavour of TOneByte bound to the charset whose numeric
+ // id is N.
+ template <unsigned N>
+ struct TOneByteHelper: public TOneByte {
+     inline TOneByteHelper()
+         : TOneByte(static_cast<ECharset>(N))
+     {
+     }
+ };
+ }
+
+ namespace NEncodings {
+     // Shared KOI8-R encoding object obtained through Singleton<>.
+     const NPire::TEncoding& Koi8r() {
+         const TOneByteHelper<CODES_KOI8>* instance = Singleton<TOneByteHelper<CODES_KOI8>>();
+         return *instance;
+     }
+
+     // Shared Windows-1251 encoding object obtained through Singleton<>.
+     const NPire::TEncoding& Cp1251() {
+         const TOneByteHelper<CODES_WIN>* instance = Singleton<TOneByteHelper<CODES_WIN>>();
+         return *instance;
+     }
+
+     // Maps an ECharset value onto the matching Pire encoding.
+     // Throws yexception for any charset other than the four handled below.
+     const NPire::TEncoding& Get(ECharset encoding) {
+         if (encoding == CODES_WIN) {
+             return Cp1251();
+         }
+         if (encoding == CODES_KOI8) {
+             return Koi8r();
+         }
+         if (encoding == CODES_ASCII) {
+             return NPire::NEncodings::Latin1();
+         }
+         if (encoding == CODES_UTF8) {
+             return NPire::NEncodings::Utf8();
+         }
+         ythrow yexception() << "Pire::Encodings::get(ECharset): unknown encoding " << (int)encoding;
+     }
+ }
+
+}
diff --git a/library/cpp/regex/pire/inline/ya.make b/library/cpp/regex/pire/inline/ya.make
new file mode 100644
index 0000000000..d4850f7b45
--- /dev/null
+++ b/library/cpp/regex/pire/inline/ya.make
@@ -0,0 +1,22 @@
+PROGRAM(pire_inline)
+
+CFLAGS(-DPIRE_NO_CONFIG)
+
+OWNER(
+ g:util
+ davenger
+)
+
+PEERDIR(
+ ADDINCL library/cpp/regex/pire
+)
+
+SRCDIR(
+ contrib/libs/pire/pire
+)
+
+SRCS(
+ inline.l
+)
+
+END()
diff --git a/library/cpp/regex/pire/pcre2pire.cpp b/library/cpp/regex/pire/pcre2pire.cpp
new file mode 100644
index 0000000000..f788beb85f
--- /dev/null
+++ b/library/cpp/regex/pire/pcre2pire.cpp
@@ -0,0 +1,110 @@
+#include "pcre2pire.h"
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+
+// Converts a PCRE pattern into a Pire-compatible one using a small state machine:
+//  - drops the backslash from "\:", "\=", "\#" and "\&";
+//  - copies every other escape sequence ("\(", "\\", "\?", ...) verbatim, so the
+//    escaped character is never mistaken for a group bracket or a quantifier;
+//  - strips the "?:", "?=" and "?P<name>" group prefixes;
+//  - removes groups that end up empty;
+//  - turns "*?" and "??" into "*" and "?".
+// Character classes are not treated specially.
+// Throws yexception on an unsupported "(?" construct or when the pattern ends in
+// the middle of an escape or a group prefix.
+TString Pcre2Pire(const TString& src) {
+    TVector<char> result;
+    // The output never gets longer than the input; reserving up front also
+    // guarantees that result.data() is valid for an empty result.
+    result.reserve(src.size() + 1);
+
+    enum EState {
+        S_SIMPLE,   // ordinary text
+        S_SLASH,    // just after a backslash
+        S_BRACE,    // just after an unescaped '('
+        S_EXPECT_Q, // just after '*' or '?': a following '?' is dropped
+        S_QUESTION, // just after "(?"
+        S_P,        // just after "(?P"
+        S_COMMA,    // group prefix consumed, '(' not emitted yet
+        S_IN,       // inside the "<name>" part of "(?P<name>"
+    };
+
+    EState state = S_SIMPLE;
+
+    // True when the output ends with an unescaped '('. Backslashes reach the
+    // output only as the first half of an escape pair, so the '(' is escaped
+    // exactly when an odd number of backslashes precedes it.
+    const auto endsWithGroupOpen = [&result]() {
+        if (result.empty() || result.back() != '(') {
+            return false;
+        }
+        size_t slashes = 0;
+        for (size_t pos = result.size() - 1; pos > 0 && result[pos - 1] == '\\'; --pos) {
+            ++slashes;
+        }
+        return slashes % 2 == 0;
+    };
+
+    for (size_t i = 0; i < src.size(); ++i) {
+        const char c = src[i];
+
+        switch (state) {
+            case S_SIMPLE:
+                if (c == '\\') {
+                    state = S_SLASH;
+                } else if (c == '(') {
+                    state = S_BRACE;
+                } else if (c == '*' || c == '?') {
+                    state = S_EXPECT_Q;
+                    result.push_back(c);
+                } else if (c == ')' && endsWithGroupOpen()) {
+                    // eliminating "()"
+                    result.pop_back();
+                } else {
+                    result.push_back(c);
+                }
+                break;
+            case S_SLASH:
+                state = S_SIMPLE;
+                if (c == ':' || c == '=' || c == '#' || c == '&') {
+                    result.push_back(c);
+                } else {
+                    // Keep the escape intact. Re-processing `c` as plain text would
+                    // treat "\(" as a group start, "\\" as a new escape and "\?"
+                    // as a quantifier.
+                    result.push_back('\\');
+                    result.push_back(c);
+                }
+                break;
+            case S_BRACE:
+                if (c == '?') {
+                    state = S_QUESTION;
+                } else {
+                    state = S_COMMA;
+                    --i; // re-process `c` in the new state
+                }
+                break;
+            case S_EXPECT_Q:
+                state = S_SIMPLE;
+                if (c != '?') {
+                    --i; // not a lazy quantifier: re-process `c` as plain text
+                }
+                break;
+            case S_QUESTION:
+                if (c == 'P') {
+                    state = S_P;
+                } else if (c == ':' || c == '=') {
+                    state = S_COMMA;
+                } else {
+                    ythrow yexception() << "Pcre to pire convertaion failed: unexpected symbol '" << c << "' at posiotion " << i << "!";
+                }
+                break;
+            case S_P:
+                if (c == '<') {
+                    state = S_IN;
+                } else {
+                    ythrow yexception() << "Pcre to pire convertaion failed: unexpected symbol '" << c << "' at posiotion " << i << "!";
+                }
+                break;
+            case S_IN:
+                // Skip the group name up to the closing '>'.
+                if (c == '>') {
+                    state = S_COMMA;
+                }
+                break;
+            case S_COMMA:
+                state = S_SIMPLE;
+                if (c != ')') {
+                    // The group is not empty: emit its '(' now and re-process `c`.
+                    result.push_back('(');
+                    --i;
+                }
+                break;
+            default:
+                ythrow yexception() << "Pcre to pire convertaion failed: unexpected automata state!";
+        }
+    }
+
+    if (state != S_SIMPLE && state != S_EXPECT_Q) {
+        ythrow yexception() << "Pcre to pire convertaion failed: unexpected end of expression!";
+    }
+
+    // Build from pointer + length so that an embedded NUL does not truncate the result.
+    return TString(result.data(), result.size());
+}
diff --git a/library/cpp/regex/pire/pcre2pire.h b/library/cpp/regex/pire/pcre2pire.h
new file mode 100644
index 0000000000..46e45b9193
--- /dev/null
+++ b/library/cpp/regex/pire/pcre2pire.h
@@ -0,0 +1,19 @@
+#pragma once
+
+// Author: smikler@yandex-team.ru
+
+#include <util/generic/string.h>
+
+/* Converts pcre regular expression to pire compatible format:
+ * - replaces "\\#" with "#"
+ * - replaces "\\=" with "="
+ * - replaces "\\:" with ":"
+ * - replaces "\\&" with "&"
+ * - removes "?P<...>"
+ * - removes "?:" and "?=" (the group itself is kept as a plain group)
+ * - removes "()" recursively
+ * - replaces "??" with "?"
+ * - replaces "*?" with "*"
+ * Throws yexception when "(?" is followed by anything other than ':', '=' or "P<",
+ * or when the expression ends inside an escape or a group prefix.
+ * NOTE:
+ * - Not fully tested!
+ */
+TString Pcre2Pire(const TString& src);
diff --git a/library/cpp/regex/pire/pire.h b/library/cpp/regex/pire/pire.h
new file mode 100644
index 0000000000..286fecd693
--- /dev/null
+++ b/library/cpp/regex/pire/pire.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#ifndef PIRE_NO_CONFIG
+#define PIRE_NO_CONFIG
+#endif
+
+#include <contrib/libs/pire/pire/pire.h>
+#include <contrib/libs/pire/pire/extra.h>
+
+#include <library/cpp/charset/doccodes.h>
+
+// T-prefixed aliases and re-exports of names from the contrib Pire library
+// (namespace Pire), plus declarations of extra encodings keyed by ECharset.
+namespace NPire {
+ using TChar = Pire::Char;
+ using Pire::MaxChar;
+
+ // Scanner classes
+ using TScanner = Pire::Scanner;
+ using TNonrelocScanner = Pire::NonrelocScanner;
+ using TScannerNoMask = Pire::ScannerNoMask;
+ using TNonrelocScannerNoMask = Pire::NonrelocScannerNoMask;
+ using THalfFinalScanner = Pire::HalfFinalScanner;
+ using TNonrelocHalfFinalScanner = Pire::NonrelocHalfFinalScanner;
+ using THalfFinalScannerNoMask = Pire::HalfFinalScannerNoMask;
+ using TNonrelocHalfFinalScannerNoMask = Pire::NonrelocHalfFinalScannerNoMask;
+ using TSimpleScanner = Pire::SimpleScanner;
+ using TSlowScanner = Pire::SlowScanner;
+ using TCapturingScanner = Pire::CapturingScanner;
+ using TSlowCapturingScanner = Pire::SlowCapturingScanner;
+ using TCountingScanner = Pire::CountingScanner;
+
+ // Pair of two scanners, re-exported as a template alias.
+ template <typename T1, typename T2>
+ using TScannerPair = Pire::ScannerPair<T1, T2>;
+
+ // Helper classes
+ using TFsm = Pire::Fsm;
+ using TLexer = Pire::Lexer;
+ using TTerm = Pire::Term;
+ using TEncoding = Pire::Encoding;
+ using TFeature = Pire::Feature;
+ using TFeaturePtr = Pire::Feature::Ptr;
+ using TError = Pire::Error;
+
+ // Helper functions
+ using Pire::LongestPrefix;
+ using Pire::LongestSuffix;
+ using Pire::Matches;
+ using Pire::MmappedScanner;
+ using Pire::Run;
+ using Pire::Runner;
+ using Pire::ShortestPrefix;
+ using Pire::ShortestSuffix;
+ using Pire::Step;
+
+ // Everything from Pire::SpecialChar and Pire::Consts becomes visible as NPire::<name>.
+ using namespace Pire::SpecialChar;
+ using namespace Pire::Consts;
+
+ // Lexer features re-exported from Pire::Features.
+ namespace NFeatures {
+ using Pire::Features::AndNotSupport;
+ using Pire::Features::Capture;
+ using Pire::Features::CaseInsensitive;
+ using Pire::Features::GlueSimilarGlyphs;
+ }
+
+ // Encodings: Latin1/Utf8 come from Pire itself; the functions declared below
+ // are only declared in this header.
+ namespace NEncodings {
+ using Pire::Encodings::Latin1;
+ using Pire::Encodings::Utf8;
+
+ const NPire::TEncoding& Koi8r();
+ const NPire::TEncoding& Cp1251();
+ // Looks an encoding up by its ECharset value.
+ const NPire::TEncoding& Get(ECharset encoding);
+ }
+
+ // Token type constants re-exported from Pire::TokenTypes.
+ namespace NTokenTypes {
+ using namespace Pire::TokenTypes;
+ }
+}
diff --git a/library/cpp/regex/pire/regexp.h b/library/cpp/regex/pire/regexp.h
new file mode 100644
index 0000000000..94bba4064b
--- /dev/null
+++ b/library/cpp/regex/pire/regexp.h
@@ -0,0 +1,337 @@
+#pragma once
+
+#include "pire.h"
+
+#include <library/cpp/charset/doccodes.h>
+#include <library/cpp/charset/recyr.hh>
+#include <util/generic/maybe.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+
+namespace NRegExp {
+ struct TMatcher;
+
+ // Shared parsing front end: turns a regexp string plus TOptions into an NPire::TFsm.
+ struct TFsmBase {
+     // Chainable set of parsing options; every setter returns *this.
+     struct TOptions {
+         inline TOptions& SetCaseInsensitive(bool v) noexcept {
+             CaseInsensitive = v;
+             return *this;
+         }
+
+         inline TOptions& SetSurround(bool v) noexcept {
+             Surround = v;
+             return *this;
+         }
+
+         inline TOptions& SetCapture(size_t pos) noexcept {
+             CapturePos = pos;
+             return *this;
+         }
+
+         inline TOptions& SetCharset(ECharset charset) noexcept {
+             Charset = charset;
+             return *this;
+         }
+
+         inline TOptions& SetAndNotSupport(bool andNotSupport) noexcept {
+             AndNotSupport = andNotSupport;
+             return *this;
+         }
+
+         bool CaseInsensitive = false;     // adds the CaseInsensitive lexer feature
+         bool Surround = false;            // calls Surround() on the parsed FSM
+         TMaybe<size_t> CapturePos;        // when set, adds the Capture feature for this position
+         ECharset Charset = CODES_UNKNOWN; // CODES_UNKNOWN: pattern bytes are passed to the lexer as is
+         bool AndNotSupport = false;       // adds the AndNotSupport lexer feature
+     };
+
+     // Parses `regexp` according to `opts` and returns the resulting FSM.
+     // When `needDetermine` is true the FSM is determinized before it is returned.
+     // For an explicit charset NPire::NEncodings::Get() may throw yexception.
+     static inline NPire::TFsm Parse(const TStringBuf& regexp,
+                                     const TOptions& opts, const bool needDetermine = true) {
+         NPire::TLexer lexer;
+         if (opts.Charset == CODES_UNKNOWN) {
+             lexer.Assign(regexp.data(), regexp.data() + regexp.size());
+         } else {
+             // One wchar32 per input byte is enough room for the recoded pattern.
+             TVector<wchar32> ucs4(regexp.size() + 1);
+             size_t inRead = 0;
+             size_t outWritten = 0;
+             // NOTE(review): the result is only checked via Y_ASSERT; if that macro is
+             // compiled out in release builds, a recoding failure goes unreported - confirm.
+             int recodeRes = RecodeToUnicode(opts.Charset, regexp.data(), ucs4.data(),
+                                             regexp.size(), regexp.size(), inRead, outWritten);
+             Y_ASSERT(recodeRes == RECODE_OK);
+             Y_ASSERT(outWritten < ucs4.size());
+
+             // Hand over exactly the code points that were written. Measuring the
+             // buffer up to the first zero would silently cut a pattern that contains
+             // a NUL character, unlike the CODES_UNKNOWN branch above.
+             lexer.Assign(ucs4.begin(), ucs4.begin() + outWritten);
+         }
+
+         if (opts.CaseInsensitive) {
+             lexer.AddFeature(NPire::NFeatures::CaseInsensitive());
+         }
+
+         if (opts.CapturePos) {
+             lexer.AddFeature(NPire::NFeatures::Capture(*opts.CapturePos));
+         }
+
+         if (opts.AndNotSupport) {
+             lexer.AddFeature(NPire::NFeatures::AndNotSupport());
+         }
+
+         switch (opts.Charset) {
+             case CODES_UNKNOWN:
+                 // Keep the encoding the lexer was constructed with.
+                 break;
+             case CODES_UTF8:
+                 lexer.SetEncoding(NPire::NEncodings::Utf8());
+                 break;
+             case CODES_KOI8:
+                 lexer.SetEncoding(NPire::NEncodings::Koi8r());
+                 break;
+             default:
+                 lexer.SetEncoding(NPire::NEncodings::Get(opts.Charset));
+                 break;
+         }
+
+         NPire::TFsm ret = lexer.Parse();
+
+         if (opts.Surround) {
+             ret.Surround();
+         }
+
+         if (needDetermine) {
+             ret.Determine();
+         }
+
+         return ret;
+     }
+ };
+
+ // Owns a scanner of type TScannerType, either compiled from a regexp or taken
+ // over from an already compiled scanner.
+ template <class TScannerType>
+ class TFsmParser: public TFsmBase {
+ public:
+     using TScanner = TScannerType;
+
+ public:
+     // Wraps an already compiled scanner; throws yexception when it is empty.
+     inline explicit TFsmParser(const TScanner& compiled)
+         : Scanner(compiled)
+     {
+         if (Scanner.Empty()) {
+             ythrow yexception() << "Can't create fsm with empty scanner";
+         }
+     }
+
+     // Parses `regexp` with `opts` and compiles the resulting FSM into a scanner.
+     inline explicit TFsmParser(const TStringBuf& regexp,
+                                const TOptions& opts = TOptions(), bool needDetermine = true)
+         : Scanner(Parse(regexp, opts, needDetermine).template Compile<TScanner>())
+     {
+     }
+
+     // Parser whose scanner is compiled from NPire::TFsm::MakeFalse().
+     static inline TFsmParser False() {
+         return TFsmParser(NPire::TFsm::MakeFalse().Compile<TScanner>());
+     }
+
+     inline const TScanner& GetScanner() const noexcept {
+         return Scanner;
+     }
+
+ private:
+     TScanner Scanner;
+ };
+
+ // Regexp compiled into NPire::TNonrelocScanner; several of them can be glued
+ // into one scanner.
+ class TFsm: public TFsmParser<NPire::TNonrelocScanner> {
+ public:
+     // Compiles `regexp` using `opts`.
+     inline explicit TFsm(const TStringBuf& regexp,
+                          const TOptions& opts = TOptions())
+         : TFsmParser<TScanner>(regexp, opts)
+     {
+     }
+
+     // Wraps an already compiled scanner (the base constructor rejects empty ones).
+     inline explicit TFsm(const TScanner& compiled)
+         : TFsmParser<TScanner>(compiled)
+     {
+     }
+
+     // Implicit conversion from the base parser type.
+     inline TFsm(const TFsmParser<TScanner>& fsm)
+         : TFsmParser<TScanner>(fsm)
+     {
+     }
+
+     // Builds a new TFsm from TScanner::Glue() applied to both scanners.
+     static inline TFsm Glue(const TFsm& l, const TFsm& r) {
+         const TScanner& left = l.GetScanner();
+         const TScanner& right = r.GetScanner();
+         return TFsm(TScanner::Glue(left, right));
+     }
+ };
+
+ // `a | b` is shorthand for TFsm::Glue(a, b).
+ static inline TFsm operator|(const TFsm& l, const TFsm& r) {
+     TFsm glued = TFsm::Glue(l, r);
+     return glued;
+ }
+
+ // Regexp compiled into NPire::TCapturingScanner. Surround is always switched
+ // on, and the capture position falls back to 1 when the caller did not set one.
+ struct TCapturingFsm : TFsmParser<NPire::TCapturingScanner> {
+     inline explicit TCapturingFsm(const TStringBuf& regexp,
+                                   TOptions opts = TOptions())
+         : TFsmParser<TScanner>(
+               regexp,
+               opts.SetSurround(true).CapturePos.Defined() ? opts : opts.SetCapture(1))
+     {
+     }
+
+     // Implicit conversion from the base parser type.
+     inline TCapturingFsm(const TFsmParser<TScanner>& fsm)
+         : TFsmParser<TScanner>(fsm)
+     {
+     }
+ };
+
+ // Regexp compiled into NPire::TSlowCapturingScanner. Surround is always
+ // switched on, the capture position falls back to 1 when unset, and the FSM
+ // is compiled without being determinized (needDetermine = false).
+ struct TSlowCapturingFsm : TFsmParser<NPire::TSlowCapturingScanner> {
+     inline explicit TSlowCapturingFsm(const TStringBuf& regexp,
+                                       TOptions opts = TOptions())
+         : TFsmParser<TScanner>(
+               regexp,
+               opts.SetSurround(true).CapturePos.Defined() ? opts : opts.SetCapture(1),
+               false)
+     {
+     }
+
+     // Implicit conversion from the base parser type.
+     inline TSlowCapturingFsm(const TFsmParser<TScanner>& fsm)
+         : TFsmParser<TScanner>(fsm)
+     {
+     }
+ };
+
+ // Holds the running scanner state for one fsm and feeds text into it.
+ // Only a reference to the fsm is stored, so the fsm has to outlive the matcher.
+ template <class TFsm>
+ class TMatcherBase {
+ public:
+     using TState = typename TFsm::TScanner::State;
+
+ public:
+     // Binds the matcher to `fsm` and puts the state into the scanner's initial state.
+     inline explicit TMatcherBase(const TFsm& fsm)
+         : Fsm(fsm)
+     {
+         GetScanner().Initialize(State);
+     }
+
+     // Whether the scanner reports the current state as final.
+     inline bool Final() const noexcept {
+         const TState& current = GetState();
+         return GetScanner().Final(current);
+     }
+
+ protected:
+     // Advances the state over [data, data + len), optionally stepping over
+     // NPire::BeginMark before the text and NPire::EndMark after it.
+     inline void Run(const char* data, size_t len, bool addBegin, bool addEnd) noexcept {
+         const typename TFsm::TScanner& scanner = GetScanner();
+         if (addBegin) {
+             NPire::Step(scanner, State, NPire::BeginMark);
+         }
+         const char* const textEnd = data + len;
+         NPire::Run(scanner, State, data, textEnd);
+         if (addEnd) {
+             NPire::Step(scanner, State, NPire::EndMark);
+         }
+     }
+
+     inline const typename TFsm::TScanner& GetScanner() const noexcept {
+         return Fsm.GetScanner();
+     }
+
+     inline const TState& GetState() const noexcept {
+         return State;
+     }
+
+ private:
+     const TFsm& Fsm; // not owned
+     TState State;
+ };
+
+ // Matcher over a TFsm. Match() calls can be chained to feed the text piece by
+ // piece; the state is carried over between calls.
+ struct TMatcher : TMatcherBase<TFsm> {
+     using TMatchedRegexps = std::pair<const size_t*, const size_t*>;
+
+     inline explicit TMatcher(const TFsm& fsm)
+         : TMatcherBase<TFsm>(fsm)
+     {
+     }
+
+     // Feeds `len` bytes starting at `data`; begin/end marks are added on request only.
+     inline TMatcher& Match(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
+         Run(data, len, addBegin, addEnd);
+         return *this;
+     }
+
+     // Same as above for a TStringBuf.
+     inline TMatcher& Match(const TStringBuf& s, bool addBegin = false, bool addEnd = false) noexcept {
+         Run(s.data(), s.size(), addBegin, addEnd);
+         return *this;
+     }
+
+     // Forwards to NPire::ShortestPrefix() over [b, e); does not touch the matcher state.
+     inline const char* Find(const char* b, const char* e) noexcept {
+         return NPire::ShortestPrefix(GetScanner(), b, e);
+     }
+
+     // Range returned by the scanner's AcceptedRegexps() for the current state.
+     inline TMatchedRegexps MatchedRegexps() const noexcept {
+         const TState& current = GetState();
+         return GetScanner().AcceptedRegexps(current);
+     }
+ };
+
+ // Searcher over a TCapturingFsm: runs the text through the capturing scanner
+ // and exposes the captured fragment as a view into the searched text.
+ class TSearcher: public TMatcherBase<TCapturingFsm> {
+ public:
+     inline explicit TSearcher(const TCapturingFsm& fsm)
+         : TMatcherBase<TCapturingFsm>(fsm)
+     {
+     }
+
+     // Whether the scanner state reports a capture.
+     inline bool Captured() const noexcept {
+         return GetState().Captured();
+     }
+
+     // Remembers the text and runs it through the scanner; begin/end marks are
+     // added by default.
+     inline TSearcher& Search(const char* data, size_t len, bool addBegin = true, bool addEnd = true) noexcept {
+         Data = TStringBuf(data, len);
+         Run(data, len, addBegin, addEnd);
+         return *this;
+     }
+
+     // Same as above for a TStringBuf, always with both marks.
+     inline TSearcher& Search(const TStringBuf& s) noexcept {
+         Data = TStringBuf(s.data(), s.size());
+         Run(s.data(), s.size(), true, true);
+         return *this;
+     }
+
+     // View into the last searched text between the state's Begin() and End()
+     // offsets, each shifted down by one.
+     // NOTE(review): presumably meaningful only when Captured() is true - confirm.
+     inline TStringBuf GetCaptured() const noexcept {
+         const char* const text = Data.data();
+         const char* const from = text + GetState().Begin() - 1;
+         const char* const to = text + GetState().End() - 1;
+         return TStringBuf(from, to);
+     }
+
+ private:
+     TStringBuf Data; // text passed to the last Search() call (not owned)
+ };
+
+ // Searcher over a TSlowCapturingFsm. After every Search() the capture is
+ // extracted from the scanner and kept as a view into the searched text.
+ //
+ // The base class is inherited publicly, like in TSearcher, so that Final() is
+ // reachable from outside as well.
+ class TSlowSearcher: public TMatcherBase<TSlowCapturingFsm> {
+ public:
+     using TState = typename TSlowCapturingFsm::TScanner::State;
+
+     inline explicit TSlowSearcher(const TSlowCapturingFsm& fsm)
+         : TMatcherBase<TSlowCapturingFsm>(fsm)
+         , HasCaptured(false)
+     {
+     }
+
+     // Whether the last Search() produced a capture.
+     inline bool Captured() const noexcept {
+         return HasCaptured;
+     }
+
+     // Runs `len` bytes starting at `data` through the scanner and refreshes the
+     // capture. Begin/end marks are NOT added unless requested (unlike TSearcher).
+     inline TSlowSearcher& Search(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
+         Data = TStringBuf(data, len);
+         Run(Data.begin(), Data.size(), addBegin, addEnd);
+         return GetAns();
+     }
+
+     // Same as above for a TStringBuf, without begin/end marks.
+     inline TSlowSearcher& Search(const TStringBuf& s) noexcept {
+         return Search(s.data(), s.size());
+     }
+
+     // Capture found by the last Search(); empty when nothing was captured.
+     inline TStringBuf GetCaptured() const noexcept {
+         return Ans;
+     }
+
+ private:
+     TStringBuf Data; // text passed to the last Search() call (not owned)
+     TStringBuf Ans;  // view into Data holding the capture
+     bool HasCaptured;
+
+     // Asks the scanner for the capture in the current state and updates
+     // Ans/HasCaptured accordingly.
+     inline TSlowSearcher& GetAns() {
+         auto state = GetState();
+         Pire::SlowCapturingScanner::SingleState captured;
+         if (!GetScanner().GetCapture(state, captured)) {
+             HasCaptured = false;
+             // Do not keep exposing the capture of an earlier search.
+             Ans = TStringBuf();
+         } else {
+             if (!captured.HasEnd()) {
+                 // The capture has no recorded end: it runs up to the end of the text.
+                 captured.SetEnd(Data.size());
+             }
+             Ans = TStringBuf(Data, captured.GetBegin(), captured.GetEnd() - captured.GetBegin());
+             HasCaptured = true;
+         }
+         return *this;
+     }
+ };
+}
diff --git a/library/cpp/regex/pire/ut/regexp_ut.cpp b/library/cpp/regex/pire/ut/regexp_ut.cpp
new file mode 100644
index 0000000000..e7206de9ad
--- /dev/null
+++ b/library/cpp/regex/pire/ut/regexp_ut.cpp
@@ -0,0 +1,318 @@
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <library/cpp/regex/pire/regexp.h>
+#include <library/cpp/regex/pire/pcre2pire.h>
+
+Y_UNIT_TEST_SUITE(TRegExp) {
+ using namespace NRegExp;
+
+ Y_UNIT_TEST(False) {
+ UNIT_ASSERT(!TMatcher(TFsm::False()).Match("").Final());
+ UNIT_ASSERT(!TMatcher(TFsm::False()).Match(TStringBuf{}).Final());
+ }
+
+ Y_UNIT_TEST(Surround) {
+ UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
+ UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(false))).Match("aqwb").Final());
+ }
+
+ Y_UNIT_TEST(Boundaries) {
+ UNIT_ASSERT(!TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
+ UNIT_ASSERT(!TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
+ UNIT_ASSERT(TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+ UNIT_ASSERT(TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+ UNIT_ASSERT(!TMatcher(TFsm("qw$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+ UNIT_ASSERT(!TMatcher(TFsm("^qw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+
+ UNIT_ASSERT(TMatcher(TFsm("^aqwb$", TFsm::TOptions().SetSurround(true)))
+ .Match(TStringBuf("a"), true, false)
+ .Match(TStringBuf("q"), false, false)
+ .Match(TStringBuf("w"), false, false)
+ .Match(TStringBuf("b"), false, true)
+ .Final());
+ }
+
+ Y_UNIT_TEST(Case) {
+ UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true))).Match("Qw").Final());
+ UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false))).Match("Qw").Final());
+ }
+
+ Y_UNIT_TEST(UnicodeCase) {
+ UNIT_ASSERT(TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(true))).Match("Ab").Final());
+ UNIT_ASSERT(!TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(false))).Match("Ab").Final());
+ }
+
+ Y_UNIT_TEST(Utf) {
+ NRegExp::TFsmBase::TOptions opts;
+ opts.Charset = CODES_UTF8;
+ opts.Surround = true;
+ UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("wtf").Final());
+ UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("чзн").Final());
+ UNIT_ASSERT(TMatcher(TFsm("ч.*", opts)).Match("чзн").Final());
+ UNIT_ASSERT(!TMatcher(TFsm("чзн", opts)).Match("чзх").Final());
+ }
+
+ Y_UNIT_TEST(AndNot) {
+ NRegExp::TFsmBase::TOptions opts;
+ opts.AndNotSupport = true;
+ {
+ NRegExp::TFsm fsm(".*&~([0-9]*)", opts);
+ UNIT_ASSERT(TMatcher(fsm).Match("a2").Final());
+ UNIT_ASSERT(TMatcher(fsm).Match("ab").Final());
+ UNIT_ASSERT(TMatcher(fsm).Match("1a").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("12").Final());
+ }
+ {
+ NRegExp::TFsm fsm(".*&~(.*[0-9].*)", opts);
+ UNIT_ASSERT(TMatcher(fsm).Match("ab").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("a2").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("1a").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("12").Final());
+ }
+ {
+ NRegExp::TFsm fsm(
+ "((([a-z0-9_\\-]+[.])*[a-z0-9_\\-]+)"
+ "&~(\\d+[.]\\d+[.]\\d+[.]\\d+))(:\\d+)?",
+ TFsm::TOptions().SetCaseInsensitive(true).SetAndNotSupport(true)
+ );
+ UNIT_ASSERT(TMatcher(fsm).Match("yandex.ru").Final());
+ UNIT_ASSERT(TMatcher(fsm).Match("yandex").Final());
+ UNIT_ASSERT(TMatcher(fsm).Match("yandex:80").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1").Final());
+ UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1:8080").Final());
+ }
+ }
+
+ // Glues three regexps into one scanner: case-insensitive "qw",
+ // case-sensitive "qw" and case-sensitive "abc".
+ Y_UNIT_TEST(Glue) {
+     TFsm glued =
+         TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true)) |
+         TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false)) |
+         TFsm("abc", TFsm::TOptions().SetCaseInsensitive(false));
+     // Mixed case is accepted through the case-insensitive alternative...
+     UNIT_ASSERT(TMatcher(glued).Match("Qw").Final());
+     // ...and the exact lowercase spelling is accepted too (this line used to
+     // repeat the "Qw" check, so the lowercase input was never exercised).
+     UNIT_ASSERT(TMatcher(glued).Match("qw").Final());
+     UNIT_ASSERT(TMatcher(glued).Match("abc").Final());
+     // "abc" is glued in case-sensitively only.
+     UNIT_ASSERT(!TMatcher(glued).Match("Abc").Final());
+ }
+
+ Y_UNIT_TEST(Capture1) {
+ TCapturingFsm fsm("here we have user_id=([a-z0-9]+);");
+
+ TSearcher searcher(fsm);
+ searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a"));
+ }
+
+ Y_UNIT_TEST(Capture2) {
+ TCapturingFsm fsm("w([abcdez]+)f");
+
+ TSearcher searcher(fsm);
+ searcher.Search("wabcdef");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("abcde"));
+ }
+
+ Y_UNIT_TEST(Capture3) {
+ TCapturingFsm fsm("http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)",
+ TFsm::TOptions().SetCapture(2));
+
+ TSearcher searcher(fsm);
+ searcher.Search("http://vkontakte.ru/id100500");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500"));
+ }
+
+ Y_UNIT_TEST(Capture4) {
+ TCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!",
+ TFsm::TOptions().SetCharset(CODES_UTF8));
+
+ TSearcher searcher(fsm);
+ searcher.Search(" Здравствуйте, Уважаемый (-ая)! ");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)"));
+ }
+
+ Y_UNIT_TEST(Capture5) {
+ TCapturingFsm fsm("away\\.php\\?to=http:([^\"])+\"");
+ TSearcher searcher(fsm);
+ searcher.Search("\"/away.php?to=http:some.addr\"&id=1");
+ UNIT_ASSERT(searcher.Captured());
+ //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr"));
+ }
+
+ Y_UNIT_TEST(Capture6) {
+ TCapturingFsm fsm("(/to-match-with)");
+ TSearcher searcher(fsm);
+ searcher.Search("/some/table/path/to-match-with");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("/to-match-with"));
+ }
+
+ Y_UNIT_TEST(Capture7) {
+ TCapturingFsm fsm("(pref.*suff)");
+ TSearcher searcher(fsm);
+ searcher.Search("ala pref bla suff cla");
+ UNIT_ASSERT(searcher.Captured());
+ //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref bla suff"));
+ }
+
+ Y_UNIT_TEST(CaptureXA) {
+ TCapturingFsm fsm(".*(xa).*");
+
+ TSearcher searcher(fsm);
+ searcher.Search("xa");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xa"));
+ }
+
+ Y_UNIT_TEST(CaptureWrongXX) {
+ TCapturingFsm fsm(".*(xx).*");
+
+ TSearcher searcher(fsm);
+ searcher.Search("xx");
+ UNIT_ASSERT(searcher.Captured());
+ // Surprise!
+ // TCapturingFsm uses a fast - O(|text|) - but incorrect algorithm.
+ // It works more or less for a particular class of regexps to which ".*(xx).*" does not belong.
+ // So it returns not the expected "xx" but just the second "x" instead.
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("x"));
+ }
+
+ Y_UNIT_TEST(CaptureRight1XX) {
+ TCapturingFsm fsm("[^x]+(xx).*");
+
+ TSearcher searcher(fsm);
+
+ searcher.Search("xxx");
+ UNIT_ASSERT(!searcher.Captured());
+ }
+
+ Y_UNIT_TEST(CaptureRight2XX) {
+ TCapturingFsm fsm("[^x]+(xx).*");
+
+ TSearcher searcher(fsm);
+
+ searcher.Search("axx");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
+ }
+
+ Y_UNIT_TEST(CaptureRight3XX) {
+ TCapturingFsm fsm("[^x]+(xx).*");
+
+ TSearcher searcher(fsm);
+
+ searcher.Search("axxb");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
+ }
+
+ Y_UNIT_TEST(SlowCaptureXX) {
+ TSlowCapturingFsm fsm(".*(xx).*");
+
+ TSlowSearcher searcher(fsm);
+ searcher.Search("xx");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
+ }
+
+ Y_UNIT_TEST(SlowCapture) {
+ TSlowCapturingFsm fsm("^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)",
+ TFsm::TOptions().SetCapture(2));
+ TSlowSearcher searcher(fsm);
+ searcher.Search("http://vkontakte.ru/id100500");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500"));
+ }
+
+ Y_UNIT_TEST(SlowCaptureGreedy) {
+ TSlowCapturingFsm fsm(".*(pref.*suff)");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("pref ala bla pref cla suff dla");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref cla suff"));
+ }
+
+ Y_UNIT_TEST(SlowCaptureNonGreedy) {
+ TSlowCapturingFsm fsm(".*?(pref.*suff)");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("pref ala bla pref cla suff dla");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref ala bla pref cla suff"));
+ }
+
+ Y_UNIT_TEST(SlowCapture2) {
+ TSlowCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!",
+ TFsm::TOptions().SetCharset(CODES_UTF8));
+
+ TSlowSearcher searcher(fsm);
+ searcher.Search(" Здравствуйте, Уважаемый (-ая)! ");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)"));
+ }
+
+ Y_UNIT_TEST(SlowCapture3) {
+ TSlowCapturingFsm fsm("here we have user_id=([a-z0-9]+);");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a"));
+ }
+
+ Y_UNIT_TEST(SlowCapture4) {
+ TSlowCapturingFsm fsm("away\\.php\\?to=http:([^\"]+)\"");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("\"/away.php?to=http:some.addr\"&id=1");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr"));
+ }
+
+ Y_UNIT_TEST(CapturedEmptySlow) {
+ TSlowCapturingFsm fsm("Comments=(.*)$");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("And Comments=");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf(""));
+ }
+
+ Y_UNIT_TEST(CaptureInOrFirst) {
+ TSlowCapturingFsm fsm("(A)|A");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("A");
+ UNIT_ASSERT(searcher.Captured());
+ }
+
+ Y_UNIT_TEST(CaptureInOrSecond) {
+ TSlowCapturingFsm fsm("A|(A)");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("A");
+ UNIT_ASSERT(!searcher.Captured());
+ }
+
+ Y_UNIT_TEST(CaptureOutside) {
+ TSlowCapturingFsm fsm("((ID=([0-9]+))?)");
+ TSlowSearcher searcher(fsm);
+ searcher.Search("ID=");
+ UNIT_ASSERT(searcher.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf(""));
+ }
+
+ Y_UNIT_TEST(CaptureInside) {
+ TSlowCapturingFsm fsm("((ID=([0-9]+))?)",
+ TFsm::TOptions().SetCapture(2));
+ TSlowSearcher searcher(fsm);
+ searcher.Search("ID=");
+ UNIT_ASSERT(!searcher.Captured());
+ }
+
+ Y_UNIT_TEST(Pcre2PireTest) {
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)"), "(fake)");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)??"), "(fake)?");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)*?fake"), "(fake)*fake");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>fake)"), "(fake)");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("fake\\#"), "fake#");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>)fake"), "fake");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?P<field1>)(?P<field2>))"), "");
+ UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?:fake))"), "((fake))");
+ }
+}
diff --git a/library/cpp/regex/pire/ut/ya.make b/library/cpp/regex/pire/ut/ya.make
new file mode 100644
index 0000000000..8776695f40
--- /dev/null
+++ b/library/cpp/regex/pire/ut/ya.make
@@ -0,0 +1,44 @@
+# this test is not linked into build tree with ReCURSE and is built by unittest/library
+
+UNITTEST()
+
+OWNER(
+ g:util
+ davenger
+)
+
+SET(PIRETESTSDIR contrib/libs/pire/ut)
+
+CFLAGS(-DPIRE_NO_CONFIG)
+
+PEERDIR(
+ library/cpp/regex/pire
+)
+
+SRCDIR(
+ ${PIRETESTSDIR}
+)
+
+ADDINCL(
+ contrib/libs/pire/pire
+ contrib/libs/pire/ut
+)
+
+SRCS(
+ pire_ut.cpp
+ capture_ut.cpp
+ count_ut.cpp
+ glyph_ut.cpp
+ easy_ut.cpp
+ read_unicode_ut.cpp
+ regexp_ut.cpp
+ approx_matching_ut.cpp
+)
+
+SIZE(MEDIUM)
+
+TIMEOUT(600)
+
+PIRE_INLINE(inline_ut.cpp)
+
+END()
diff --git a/library/cpp/regex/pire/ya.make b/library/cpp/regex/pire/ya.make
new file mode 100644
index 0000000000..c857e6d18b
--- /dev/null
+++ b/library/cpp/regex/pire/ya.make
@@ -0,0 +1,40 @@
+LIBRARY()
+
+OWNER(
+ g:util
+ g:antiinfra
+ davenger
+ pg
+)
+
+CFLAGS(-DPIRE_NO_CONFIG)
+
+SRCDIR(contrib/libs/pire/pire)
+
+SRCS(
+ pcre2pire.cpp
+ classes.cpp
+ encoding.cpp
+ fsm.cpp
+ scanner_io.cpp
+ easy.cpp
+ scanners/null.cpp
+ extra/capture.cpp
+ extra/count.cpp
+ extra/glyphs.cpp
+ re_lexer.cpp
+ re_parser.y
+ read_unicode.cpp
+ extraencodings.cpp
+ approx_matching.cpp
+ half_final_fsm.cpp
+ minimize.h
+)
+
+PEERDIR(
+ library/cpp/charset
+)
+
+END()
+
+RECURSE_FOR_TESTS(ut)