intermediate changes

ref:cde9a383711a11544ce7e107a78147fb96cc4029
author: Devtools Arcadia <arcadia-devtools@yandex-team.ru> 2022-02-07 18:08:42 +0300
committer: Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> 2022-02-07 18:08:42 +0300
commit: 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree: e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/pire
download: ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
9 files changed, 1047 insertions, 0 deletions
diff --git a/library/cpp/regex/pire/extraencodings.cpp b/library/cpp/regex/pire/extraencodings.cpp
new file mode 100644
index 0000000000..2e507e4b67
--- /dev/null
+++ b/library/cpp/regex/pire/extraencodings.cpp
@@ -0,0 +1,81 @@
+#include <util/system/defaults.h>
+#include <util/system/yassert.h>
+#include <library/cpp/charset/codepage.h>
+#include <util/generic/singleton.h>
+#include <util/generic/yexception.h>
+#include <library/cpp/charset/doccodes.h>
+
+#include "pire.h"
+
+namespace NPire {
+    namespace {
+        // Encoding for single-byte charsets. Decoding indexes the codepage's byte-to-Unicode
+        // table; encoding goes through a reverse map built from the same table.
+        class TOneByte: public TEncoding {
+        public:
+            // Picks up the Unicode table of `doccode` and indexes all 256 byte values by the
+            // code point they decode to.
+            TOneByte(ECharset doccode)
+                : Table_(CodePageByCharset(doccode)->unicode)
+            {
+                for (size_t byte = 0; byte != 256; ++byte) {
+                    Reverse_.insert(std::make_pair(Table_[byte], static_cast<char>(byte)));
+                }
+            }
+
+            // Decodes the byte at `begin` and advances `begin` past it.
+            // Throws yexception when the input range is empty.
+            wchar32 FromLocal(const char*& begin, const char* end) const override {
+                if (begin == end) {
+                    ythrow yexception() << "EOF reached in Pire::OneByte::fromLocal()";
+                }
+                const unsigned char byte = static_cast<unsigned char>(*begin);
+                ++begin;
+                return Table_[byte];
+            }
+
+            // Encodes `c` as a one-character string, or returns an empty string when the
+            // reverse map has no entry for it.
+            TString ToLocal(wchar32 c) const override {
+                const auto found = Reverse_.find(c);
+                return found == Reverse_.end() ? TString() : TString(1, found->second);
+            }
+
+            // Forwards to TFsm::AppendDot().
+            void AppendDot(TFsm& fsm) const override {
+                fsm.AppendDot();
+            }
+
+        private:
+            const wchar32* Table_;            // byte -> code point
+            THashMap<wchar32, char> Reverse_; // code point -> byte
+        };
+
+        // TOneByte with the charset fixed by the template argument N, which gives the class
+        // a default constructor (it is instantiated as Singleton<TOneByteHelper<...>>()).
+        template <unsigned N>
+        struct TOneByteHelper: public TOneByte {
+            TOneByteHelper()
+                : TOneByte(static_cast<ECharset>(N))
+            {
+            }
+        };
+    }
+
+    namespace NEncodings {
+        // KOI8-R encoding object obtained through Singleton<>.
+        const NPire::TEncoding& Koi8r() {
+            const TOneByteHelper<CODES_KOI8>* const instance = Singleton<TOneByteHelper<CODES_KOI8>>();
+            return *instance;
+        }
+
+        // Windows-1251 encoding object obtained through Singleton<>.
+        const NPire::TEncoding& Cp1251() {
+            const TOneByteHelper<CODES_WIN>* const instance = Singleton<TOneByteHelper<CODES_WIN>>();
+            return *instance;
+        }
+
+        // Maps a charset id to the matching Pire encoding.
+        // Throws yexception for any charset other than the four handled below.
+        const NPire::TEncoding& Get(ECharset encoding) {
+            if (encoding == CODES_WIN) {
+                return Cp1251();
+            }
+            if (encoding == CODES_KOI8) {
+                return Koi8r();
+            }
+            if (encoding == CODES_ASCII) {
+                return NPire::NEncodings::Latin1();
+            }
+            if (encoding == CODES_UTF8) {
+                return NPire::NEncodings::Utf8();
+            }
+            ythrow yexception() << "Pire::Encodings::get(ECharset): unknown encoding " << static_cast<int>(encoding);
+        }
+
+    }
+
+}
diff --git a/library/cpp/regex/pire/inline/ya.make b/library/cpp/regex/pire/inline/ya.make
new file mode 100644
index 0000000000..d4850f7b45
--- /dev/null
+++ b/library/cpp/regex/pire/inline/ya.make
@@ -0,0 +1,22 @@
+PROGRAM(pire_inline)
+
+CFLAGS(-DPIRE_NO_CONFIG)
+
+OWNER(
+    g:util
+    davenger
+)
+
+PEERDIR(
+    ADDINCL library/cpp/regex/pire
+)
+
+SRCDIR(
+    contrib/libs/pire/pire
+)
+
+SRCS(
+    inline.l
+)
+
+END()
diff --git a/library/cpp/regex/pire/pcre2pire.cpp b/library/cpp/regex/pire/pcre2pire.cpp
new file mode 100644
index 0000000000..f788beb85f
--- /dev/null
+++ b/library/cpp/regex/pire/pcre2pire.cpp
@@ -0,0 +1,110 @@
+#include "pcre2pire.h"
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+
+// Rewrites a PCRE pattern into a form acceptable for Pire.
+//
+// The conversion is a single left-to-right pass of a small state machine that
+//   - drops the backslash in front of ':', '=', '#' and '&';
+//   - copies every other escape sequence verbatim, so an escaped character
+//     (for example an escaped '(', '*', '?' or backslash) is never treated as syntax;
+//   - strips the group prefixes "?:", "?=" and "?P<name>";
+//   - removes groups that turn out to be empty;
+//   - removes a '?' that directly follows '*' or '?'.
+//
+// Character classes are not tracked: parentheses and quantifier characters
+// inside "[...]" are processed like anywhere else.
+//
+// Throws yexception when "(?" is followed by an unsupported construct, or when
+// the pattern ends inside an escape sequence or right after a group opening/prefix.
+TString Pcre2Pire(const TString& src) {
+    TVector<char> result;
+    result.reserve(src.size());
+
+    enum EState {
+        S_SIMPLE,   // ordinary text
+        S_SLASH,    // right after a backslash
+        S_BRACE,    // right after '('
+        S_EXPECT_Q, // right after '*' or '?': a following '?' is dropped
+        S_QUESTION, // right after "(?"
+        S_P,        // right after "(?P"
+        S_COMMA,    // group prefix is over: decides whether the pending '(' is emitted
+        S_IN,       // inside the name of "(?P<name>"
+    };
+
+    // True when the output ends with a '(' that opens a group, i.e. one that is
+    // not the second character of an escape sequence. Backslashes appear in the
+    // output only as escape prefixes, so the '(' is escaped exactly when it is
+    // preceded by an odd number of consecutive backslashes.
+    const auto endsWithGroupOpening = [&result]() {
+        if (result.empty() || result.back() != '(') {
+            return false;
+        }
+        size_t backslashes = 0;
+        for (size_t pos = result.size() - 1; pos > 0 && result[pos - 1] == '\\'; --pos) {
+            ++backslashes;
+        }
+        return backslashes % 2 == 0;
+    };
+
+    EState state = S_SIMPLE;
+
+    // "--i" below means "process the current character again in the new state".
+    for (ui32 i = 0; i < src.size(); ++i) {
+        const char c = src[i];
+
+        switch (state) {
+            case S_SIMPLE:
+                if (c == '\\') {
+                    state = S_SLASH;
+                } else if (c == '(') {
+                    state = S_BRACE;
+                } else if (c == '*' || c == '?') {
+                    state = S_EXPECT_Q;
+                    result.push_back(c);
+                } else if (c == ')' && endsWithGroupOpening()) {
+                    // eliminating "()"
+                    result.pop_back();
+                } else {
+                    result.push_back(c);
+                }
+                break;
+            case S_SLASH:
+                state = S_SIMPLE;
+                if (c == ':' || c == '=' || c == '#' || c == '&') {
+                    // Unescape: keep the character, drop the backslash.
+                    result.push_back(c);
+                } else {
+                    // Copy the escape sequence as is. The escaped character must not be
+                    // processed in S_SIMPLE again: an escaped '(' would open a group, an
+                    // escaped backslash would start a new escape, and an escaped '*' or
+                    // '?' would swallow a following '?'.
+                    result.push_back('\\');
+                    result.push_back(c);
+                }
+                break;
+            case S_BRACE:
+                if (c == '?') {
+                    state = S_QUESTION;
+                } else {
+                    state = S_COMMA;
+                    --i;
+                }
+                break;
+            case S_EXPECT_Q:
+                state = S_SIMPLE;
+                if (c != '?') {
+                    --i;
+                }
+                break;
+            case S_QUESTION:
+                if (c == 'P') {
+                    state = S_P;
+                } else if (c == ':' || c == '=') {
+                    state = S_COMMA;
+                } else {
+                    ythrow yexception() << "Pcre to pire convertaion failed: unexpected symbol '" << c << "' at posiotion " << i << "!";
+                }
+                break;
+            case S_P:
+                if (c == '<') {
+                    state = S_IN;
+                } else {
+                    ythrow yexception() << "Pcre to pire convertaion failed: unexpected symbol '" << c << "' at posiotion " << i << "!";
+                }
+                break;
+            case S_IN:
+                // Skip the group name up to the closing '>'.
+                if (c == '>') {
+                    state = S_COMMA;
+                }
+                break;
+            case S_COMMA:
+                state = S_SIMPLE;
+                if (c != ')') {
+                    // The group has content: emit its '(' and process c as ordinary text.
+                    result.push_back('(');
+                    --i;
+                }
+                // Otherwise the group is empty and neither '(' nor ')' is emitted.
+                break;
+            default:
+                ythrow yexception() << "Pcre to pire convertaion failed: unexpected automata state!";
+        }
+    }
+
+    if (state != S_SIMPLE && state != S_EXPECT_Q) {
+        ythrow yexception() << "Pcre to pire convertaion failed: unexpected end of expression!";
+    }
+
+    // Build the result from an explicit length so that an embedded '\0' does not
+    // cut the converted expression short.
+    return TString(result.data(), result.size());
+}
diff --git a/library/cpp/regex/pire/pcre2pire.h b/library/cpp/regex/pire/pcre2pire.h
new file mode 100644
index 0000000000..46e45b9193
--- /dev/null
+++ b/library/cpp/regex/pire/pcre2pire.h
@@ -0,0 +1,19 @@
+#pragma once
+
+// Author: smikler@yandex-team.ru
+
+#include <util/generic/string.h>
+
+/* Converts pcre regular expression to pire compatible format:
+ *   - replaces "\\#" with "#"
+ *   - replaces "\\=" with "="
+ *   - replaces "\\:" with ":"
+ *   - replaces "\\&" with "&"
+ *   - removes "?P<...>" right after an opening parenthesis
+ *   - removes "?:" and "?=" right after an opening parenthesis
+ *   - removes "()" recursively
+ *   - replaces "??" with "?"
+ *   - replaces "*?" with "*"
+ * Throws yexception if "(?" is followed by an unsupported construct, or if the
+ * expression ends inside an escape sequence or right after a group opening/prefix.
+ * NOTE:
+ *   - Not fully tested!
+ */
+TString Pcre2Pire(const TString& src);
diff --git a/library/cpp/regex/pire/pire.h b/library/cpp/regex/pire/pire.h
new file mode 100644
index 0000000000..286fecd693
--- /dev/null
+++ b/library/cpp/regex/pire/pire.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#ifndef PIRE_NO_CONFIG
+#define PIRE_NO_CONFIG
+#endif
+
+#include <contrib/libs/pire/pire/pire.h>
+#include <contrib/libs/pire/pire/extra.h>
+
+#include <library/cpp/charset/doccodes.h>
+
+// Re-exports names from the Pire namespace under Arcadia-style names (T-prefixed
+// types, N-prefixed namespaces) and declares the extra encodings added by this library.
+namespace NPire {
+    using TChar = Pire::Char;
+    using Pire::MaxChar;
+
+    // Scanner classes
+    using TScanner = Pire::Scanner;
+    using TNonrelocScanner = Pire::NonrelocScanner;
+    using TScannerNoMask = Pire::ScannerNoMask;
+    using TNonrelocScannerNoMask = Pire::NonrelocScannerNoMask;
+    using THalfFinalScanner = Pire::HalfFinalScanner;
+    using TNonrelocHalfFinalScanner = Pire::NonrelocHalfFinalScanner;
+    using THalfFinalScannerNoMask = Pire::HalfFinalScannerNoMask;
+    using TNonrelocHalfFinalScannerNoMask = Pire::NonrelocHalfFinalScannerNoMask;
+    using TSimpleScanner = Pire::SimpleScanner;
+    using TSlowScanner = Pire::SlowScanner;
+    using TCapturingScanner = Pire::CapturingScanner;
+    using TSlowCapturingScanner = Pire::SlowCapturingScanner;
+    using TCountingScanner = Pire::CountingScanner;
+
+    template <typename T1, typename T2>
+    using TScannerPair = Pire::ScannerPair<T1, T2>;
+
+    // Helper classes
+    using TFsm = Pire::Fsm;
+    using TLexer = Pire::Lexer;
+    using TTerm = Pire::Term;
+    using TEncoding = Pire::Encoding;
+    using TFeature = Pire::Feature;
+    using TFeaturePtr = Pire::Feature::Ptr;
+    using TError = Pire::Error;
+
+    // Helper functions
+    using Pire::LongestPrefix;
+    using Pire::LongestSuffix;
+    using Pire::Matches;
+    using Pire::MmappedScanner;
+    using Pire::Run;
+    using Pire::Runner;
+    using Pire::ShortestPrefix;
+    using Pire::ShortestSuffix;
+    using Pire::Step;
+
+    // Everything from Pire::SpecialChar and Pire::Consts is visible in NPire directly.
+    using namespace Pire::SpecialChar;
+    using namespace Pire::Consts;
+
+    // Lexer features
+    namespace NFeatures {
+        using Pire::Features::AndNotSupport;
+        using Pire::Features::Capture;
+        using Pire::Features::CaseInsensitive;
+        using Pire::Features::GlueSimilarGlyphs;
+    }
+
+    namespace NEncodings {
+        // Encodings provided by Pire itself
+        using Pire::Encodings::Latin1;
+        using Pire::Encodings::Utf8;
+
+        // Encodings declared here on top of the ones re-exported above
+        const NPire::TEncoding& Koi8r();
+        const NPire::TEncoding& Cp1251();
+        // Returns the encoding object for the given charset id
+        const NPire::TEncoding& Get(ECharset encoding);
+    }
+
+    namespace NTokenTypes {
+        using namespace Pire::TokenTypes;
+    }
+}
diff --git a/library/cpp/regex/pire/regexp.h b/library/cpp/regex/pire/regexp.h
new file mode 100644
index 0000000000..94bba4064b
--- /dev/null
+++ b/library/cpp/regex/pire/regexp.h
@@ -0,0 +1,337 @@
+#pragma once
+
+#include "pire.h"
+
+#include <library/cpp/charset/doccodes.h>
+#include <library/cpp/charset/recyr.hh>
+#include <util/generic/maybe.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+
+namespace NRegExp {
+    struct TMatcher;
+
+    // Common part of all fsm wrappers: parsing options and the regexp -> NPire::TFsm parser.
+    struct TFsmBase {
+        // Parsing options. Setters return *this so that they can be chained.
+        struct TOptions {
+            inline TOptions& SetCaseInsensitive(bool v) noexcept {
+                CaseInsensitive = v;
+                return *this;
+            }
+
+            inline TOptions& SetSurround(bool v) noexcept {
+                Surround = v;
+                return *this;
+            }
+
+            inline TOptions& SetCapture(size_t pos) noexcept {
+                CapturePos = pos;
+                return *this;
+            }
+
+            inline TOptions& SetCharset(ECharset charset) noexcept {
+                Charset = charset;
+                return *this;
+            }
+
+            inline TOptions& SetAndNotSupport(bool andNotSupport) noexcept {
+                AndNotSupport = andNotSupport;
+                return *this;
+            }
+
+            // Adds the CaseInsensitive lexer feature.
+            bool CaseInsensitive = false;
+            // Calls Surround() on the parsed fsm.
+            bool Surround = false;
+            // When set, adds the Capture lexer feature with this position.
+            TMaybe<size_t> CapturePos;
+            // CODES_UNKNOWN: the regexp bytes are passed to the lexer as is and no encoding
+            // is set. Otherwise the regexp is recoded to Unicode from this charset and the
+            // matching encoding is set on the lexer.
+            ECharset Charset = CODES_UNKNOWN;
+            // Adds the AndNotSupport lexer feature.
+            bool AndNotSupport = false;
+        };
+
+        // Parses `regexp` according to `opts` and returns the resulting fsm.
+        // The fsm is determined before returning unless `needDetermine` is false.
+        // Throws yexception if the regexp cannot be recoded from opts.Charset; errors
+        // raised by the lexer or by NPire::NEncodings::Get() propagate to the caller.
+        static inline NPire::TFsm Parse(const TStringBuf& regexp,
+                                        const TOptions& opts, const bool needDetermine = true) {
+            NPire::TLexer lexer;
+            if (opts.Charset == CODES_UNKNOWN) {
+                lexer.Assign(regexp.data(), regexp.data() + regexp.size());
+            } else {
+                // One spare element keeps the buffer non-empty even for an empty regexp.
+                TVector<wchar32> ucs4(regexp.size() + 1);
+                size_t inRead = 0;
+                size_t outWritten = 0;
+                const int recodeRes = RecodeToUnicode(opts.Charset, regexp.data(), ucs4.data(),
+                                                      regexp.size(), regexp.size(), inRead, outWritten);
+                // A failed recode must not be ignored in release builds: the lexer would
+                // otherwise be fed whatever part of the regexp happened to be decoded.
+                if (recodeRes != RECODE_OK) {
+                    ythrow yexception() << "Can't recode regexp to unicode from charset "
+                                        << static_cast<int>(opts.Charset) << ", recode result " << recodeRes;
+                }
+                Y_ASSERT(outWritten < ucs4.size());
+
+                // Use the reported length instead of searching for a terminating zero, so
+                // that a decoded U+0000 does not cut the regexp short (the CODES_UNKNOWN
+                // branch passes the whole range as well).
+                lexer.Assign(ucs4.begin(), ucs4.begin() + outWritten);
+            }
+
+            if (opts.CaseInsensitive) {
+                lexer.AddFeature(NPire::NFeatures::CaseInsensitive());
+            }
+
+            if (opts.CapturePos) {
+                lexer.AddFeature(NPire::NFeatures::Capture(*opts.CapturePos));
+            }
+
+            if (opts.AndNotSupport) {
+                lexer.AddFeature(NPire::NFeatures::AndNotSupport());
+            }
+
+            switch (opts.Charset) {
+                case CODES_UNKNOWN:
+                    break;
+                case CODES_UTF8:
+                    lexer.SetEncoding(NPire::NEncodings::Utf8());
+                    break;
+                case CODES_KOI8:
+                    lexer.SetEncoding(NPire::NEncodings::Koi8r());
+                    break;
+                default:
+                    lexer.SetEncoding(NPire::NEncodings::Get(opts.Charset));
+                    break;
+            }
+
+            NPire::TFsm ret = lexer.Parse();
+
+            if (opts.Surround) {
+                ret.Surround();
+            }
+
+            if (needDetermine) {
+                ret.Determine();
+            }
+
+            return ret;
+        }
+    };
+
+    // Holds a scanner of type TScannerType, either compiled from a regexp or given ready-made.
+    template <class TScannerType>
+    class TFsmParser: public TFsmBase {
+    public:
+        using TScanner = TScannerType;
+
+        // Parses `regexp` with `opts` and compiles the resulting fsm into a TScanner.
+        explicit TFsmParser(const TStringBuf& regexp,
+                            const TOptions& opts = TOptions(), bool needDetermine = true)
+            : Scanner(Parse(regexp, opts, needDetermine).template Compile<TScanner>())
+        {
+        }
+
+        // Wraps an already compiled scanner; throws yexception if the scanner is empty.
+        explicit TFsmParser(const TScanner& compiled)
+            : Scanner(compiled)
+        {
+            if (Scanner.Empty()) {
+                ythrow yexception() << "Can't create fsm with empty scanner";
+            }
+        }
+
+        // Wraps the scanner compiled from NPire::TFsm::MakeFalse().
+        static TFsmParser False() {
+            return TFsmParser(NPire::TFsm::MakeFalse().Compile<TScanner>());
+        }
+
+        const TScanner& GetScanner() const noexcept {
+            return Scanner;
+        }
+
+    private:
+        TScanner Scanner;
+    };
+
+    // Fsm compiled into NPire::TNonrelocScanner; several of them can be glued together.
+    class TFsm: public TFsmParser<NPire::TNonrelocScanner> {
+        using TBase = TFsmParser<NPire::TNonrelocScanner>;
+
+    public:
+        // Compiles `regexp` with `opts`.
+        explicit TFsm(const TStringBuf& regexp,
+                      const TOptions& opts = TOptions())
+            : TBase(regexp, opts)
+        {
+        }
+
+        // Wraps an already compiled scanner (see the TFsmParser constructor for the checks).
+        explicit TFsm(const TScanner& compiled)
+            : TBase(compiled)
+        {
+        }
+
+        // Implicit conversion from the base parser, e.g. from the result of False().
+        TFsm(const TFsmParser<TScanner>& fsm)
+            : TBase(fsm)
+        {
+        }
+
+        // Combines the scanners of `l` and `r` with TScanner::Glue().
+        static TFsm Glue(const TFsm& l, const TFsm& r) {
+            return TFsm(TScanner::Glue(l.GetScanner(), r.GetScanner()));
+        }
+    };
+
+    // Shorthand for TFsm::Glue(l, r).
+    static inline TFsm operator|(const TFsm& l, const TFsm& r) {
+        return TFsm::Glue(l, r);
+    }
+
+    // Fsm compiled into NPire::TCapturingScanner.
+    struct TCapturingFsm : TFsmParser<NPire::TCapturingScanner> {
+        // `opts` is taken by value and adjusted before parsing: Surround is always switched
+        // on, and if the caller has not set CapturePos, position 1 is captured.
+        inline explicit TCapturingFsm(const TStringBuf& regexp,
+                                      TOptions opts = TOptions())
+            : TFsmParser<TScanner>(regexp,
+                                   opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1)) {
+        }
+
+        // Implicit conversion from the base parser.
+        inline TCapturingFsm(const TFsmParser<TScanner>& fsm)
+            : TFsmParser<TScanner>(fsm)
+        {
+        }
+    };
+
+    // Fsm compiled into NPire::TSlowCapturingScanner.
+    struct TSlowCapturingFsm : TFsmParser<NPire::TSlowCapturingScanner> {
+        // `opts` is taken by value and adjusted before parsing: Surround is always switched
+        // on, and if the caller has not set CapturePos, position 1 is captured.
+        // The fsm is compiled without determinization (needDetermine is passed as false).
+        inline explicit TSlowCapturingFsm(const TStringBuf& regexp,
+                                          TOptions opts = TOptions())
+                : TFsmParser<TScanner>(regexp,
+                                       opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1), false) {
+        }
+
+        // Implicit conversion from the base parser.
+        inline TSlowCapturingFsm(const TFsmParser<TScanner>& fsm)
+                : TFsmParser<TScanner>(fsm)
+        {
+        }
+    };
+
+    // Runs the scanner of an fsm over text, keeping the scanner state between calls.
+    // Only a reference to the fsm is stored, so the fsm must outlive the matcher.
+    template <class TFsm>
+    class TMatcherBase {
+    public:
+        using TState = typename TFsm::TScanner::State;
+
+        // Binds to `fsm` and puts the state into the scanner's initial state.
+        explicit TMatcherBase(const TFsm& fsm)
+            : Fsm(fsm)
+        {
+            GetScanner().Initialize(State);
+        }
+
+        // Whether the current state is final for the scanner.
+        bool Final() const noexcept {
+            return GetScanner().Final(GetState());
+        }
+
+    protected:
+        // Feeds `len` bytes starting at `data` to the scanner, continuing from the current
+        // state. BeginMark is fed first when `addBegin` is set, EndMark last when `addEnd` is set.
+        void Run(const char* data, size_t len, bool addBegin, bool addEnd) noexcept {
+            const auto& scanner = GetScanner();
+            if (addBegin) {
+                NPire::Step(scanner, State, NPire::BeginMark);
+            }
+            NPire::Run(scanner, State, data, data + len);
+            if (addEnd) {
+                NPire::Step(scanner, State, NPire::EndMark);
+            }
+        }
+
+        const typename TFsm::TScanner& GetScanner() const noexcept {
+            return Fsm.GetScanner();
+        }
+
+        const TState& GetState() const noexcept {
+            return State;
+        }
+
+    private:
+        const TFsm& Fsm;
+        TState State;
+    };
+
+    // Matcher over TFsm. Match() calls can be chained to feed the text piece by piece.
+    struct TMatcher : TMatcherBase<TFsm> {
+        using TMatchedRegexps = std::pair<const size_t*, const size_t*>;
+
+        explicit TMatcher(const TFsm& fsm)
+            : TMatcherBase<TFsm>(fsm)
+        {
+        }
+
+        // Feeds `len` bytes at `data` to the scanner; begin/end marks are added on request.
+        TMatcher& Match(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
+            Run(data, len, addBegin, addEnd);
+            return *this;
+        }
+
+        // Same as above for a string buffer.
+        TMatcher& Match(const TStringBuf& s, bool addBegin = false, bool addEnd = false) noexcept {
+            const char* const data = s.data();
+            const size_t len = s.size();
+            return Match(data, len, addBegin, addEnd);
+        }
+
+        // Returns the result of NPire::ShortestPrefix() for the range [b, e).
+        const char* Find(const char* b, const char* e) noexcept {
+            return NPire::ShortestPrefix(GetScanner(), b, e);
+        }
+
+        // Returns what the scanner reports as accepted regexps for the current state.
+        TMatchedRegexps MatchedRegexps() const noexcept {
+            return GetScanner().AcceptedRegexps(GetState());
+        }
+    };
+
+    // Searcher over TCapturingFsm: runs the capturing scanner and exposes the captured text.
+    class TSearcher: public TMatcherBase<TCapturingFsm> {
+    public:
+        explicit TSearcher(const TCapturingFsm& fsm)
+            : TMatcherBase<TCapturingFsm>(fsm)
+        {
+        }
+
+        // Whether the current state reports a capture.
+        bool Captured() const noexcept {
+            return GetState().Captured();
+        }
+
+        // Remembers the text and feeds it to the scanner; begin/end marks are added by default.
+        TSearcher& Search(const char* data, size_t len, bool addBegin = true, bool addEnd = true) noexcept {
+            Data = TStringBuf(data, len);
+            Run(data, len, addBegin, addEnd);
+            return *this;
+        }
+
+        TSearcher& Search(const TStringBuf& s) noexcept {
+            const char* const data = s.data();
+            const size_t len = s.size();
+            return Search(data, len);
+        }
+
+        // Captured part of the text given to the last Search(): the state's Begin() and
+        // End() are turned into offsets in that text by subtracting one.
+        TStringBuf GetCaptured() const noexcept {
+            const auto& state = GetState();
+            const char* const text = Data.data();
+            const char* const from = text + state.Begin() - 1;
+            const char* const to = text + state.End() - 1;
+            return TStringBuf(from, to);
+        }
+
+    private:
+        TStringBuf Data;
+    };
+
+    // Searcher over TSlowCapturingFsm: runs the slow capturing scanner and exposes the
+    // captured text. The base class is inherited publicly, as in TSearcher, so that
+    // Final() is available to the users of this class too.
+    class TSlowSearcher: public TMatcherBase<TSlowCapturingFsm> {
+    public:
+        typedef typename TSlowCapturingFsm::TScanner::State TState;
+        inline explicit TSlowSearcher(const TSlowCapturingFsm& fsm)
+                : TMatcherBase<TSlowCapturingFsm>(fsm)
+                , HasCaptured(false)
+        {
+        }
+
+        // Whether the last Search() produced a capture.
+        inline bool Captured() const noexcept {
+            return HasCaptured;
+        }
+
+        // Remembers the text, feeds it to the scanner (begin/end marks are NOT added by
+        // default) and updates the capture result.
+        inline TSlowSearcher& Search(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
+            TStringBuf textData(data, len);
+            Data = textData;
+            Run(Data.begin(), Data.size(), addBegin, addEnd);
+            return GetAns();
+        }
+
+        inline TSlowSearcher& Search(const TStringBuf& s) noexcept {
+            return Search(s.data(), s.size());
+        }
+
+        // Text captured by the last Search(); empty when nothing was captured.
+        inline TStringBuf GetCaptured() const noexcept {
+            return Ans;
+        }
+
+    private:
+        TStringBuf Data;
+        TStringBuf Ans;
+        bool HasCaptured;
+
+        // Asks the scanner for the capture of the current state and stores the result.
+        inline TSlowSearcher& GetAns() {
+            auto state = GetState();
+            Pire::SlowCapturingScanner::SingleState final;
+            if (!GetScanner().GetCapture(state, final)) {
+                HasCaptured = false;
+                // Drop the capture of a previous search so that GetCaptured() never
+                // returns stale text.
+                Ans = TStringBuf();
+            } else {
+                // A capture without an end runs up to the end of the text.
+                if (!final.HasEnd()) {
+                    final.SetEnd(Data.size());
+                }
+                Ans = TStringBuf(Data, final.GetBegin(), final.GetEnd() - final.GetBegin());
+                HasCaptured = true;
+            }
+            return *this;
+        }
+    };
+}
diff --git a/library/cpp/regex/pire/ut/regexp_ut.cpp b/library/cpp/regex/pire/ut/regexp_ut.cpp
new file mode 100644
index 0000000000..e7206de9ad
--- /dev/null
+++ b/library/cpp/regex/pire/ut/regexp_ut.cpp
@@ -0,0 +1,318 @@
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <library/cpp/regex/pire/regexp.h>
+#include <library/cpp/regex/pire/pcre2pire.h>
+
+Y_UNIT_TEST_SUITE(TRegExp) {
+    using namespace NRegExp;
+
+    Y_UNIT_TEST(False) {
+        UNIT_ASSERT(!TMatcher(TFsm::False()).Match("").Final());
+        UNIT_ASSERT(!TMatcher(TFsm::False()).Match(TStringBuf{}).Final());
+    }
+
+    Y_UNIT_TEST(Surround) {
+        UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
+        UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(false))).Match("aqwb").Final());
+    }
+
+    Y_UNIT_TEST(Boundaries) {
+        UNIT_ASSERT(!TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
+        UNIT_ASSERT(!TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
+        UNIT_ASSERT(TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+        UNIT_ASSERT(TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+        UNIT_ASSERT(!TMatcher(TFsm("qw$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+        UNIT_ASSERT(!TMatcher(TFsm("^qw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
+
+        UNIT_ASSERT(TMatcher(TFsm("^aqwb$", TFsm::TOptions().SetSurround(true)))
+                        .Match(TStringBuf("a"), true, false)
+                        .Match(TStringBuf("q"), false, false)
+                        .Match(TStringBuf("w"), false, false)
+                        .Match(TStringBuf("b"), false, true)
+                        .Final());
+    }
+
+    Y_UNIT_TEST(Case) {
+        UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true))).Match("Qw").Final());
+        UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false))).Match("Qw").Final());
+    }
+
+    Y_UNIT_TEST(UnicodeCase) {
+        UNIT_ASSERT(TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(true))).Match("Ab").Final());
+        UNIT_ASSERT(!TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(false))).Match("Ab").Final());
+    }
+
+    Y_UNIT_TEST(Utf) {
+        NRegExp::TFsmBase::TOptions opts;
+        opts.Charset = CODES_UTF8;
+        opts.Surround = true;
+        UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("wtf").Final());
+        UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("чзн").Final());
+        UNIT_ASSERT(TMatcher(TFsm("ч.*", opts)).Match("чзн").Final());
+        UNIT_ASSERT(!TMatcher(TFsm("чзн", opts)).Match("чзх").Final());
+    }
+
+    Y_UNIT_TEST(AndNot) {
+        NRegExp::TFsmBase::TOptions opts;
+        opts.AndNotSupport = true;
+        {
+            NRegExp::TFsm fsm(".*&~([0-9]*)", opts);
+            UNIT_ASSERT(TMatcher(fsm).Match("a2").Final());
+            UNIT_ASSERT(TMatcher(fsm).Match("ab").Final());
+            UNIT_ASSERT(TMatcher(fsm).Match("1a").Final());
+            UNIT_ASSERT(!TMatcher(fsm).Match("12").Final());
+        }
+        {
+            NRegExp::TFsm fsm(".*&~(.*[0-9].*)", opts);
+            UNIT_ASSERT(TMatcher(fsm).Match("ab").Final());
+            UNIT_ASSERT(!TMatcher(fsm).Match("a2").Final());
+            UNIT_ASSERT(!TMatcher(fsm).Match("1a").Final());
+            UNIT_ASSERT(!TMatcher(fsm).Match("12").Final());
+        }
+        {
+            NRegExp::TFsm fsm(
+                "((([a-z0-9_\\-]+[.])*[a-z0-9_\\-]+)"
+                "&~(\\d+[.]\\d+[.]\\d+[.]\\d+))(:\\d+)?",
+                TFsm::TOptions().SetCaseInsensitive(true).SetAndNotSupport(true)
+            );
+            UNIT_ASSERT(TMatcher(fsm).Match("yandex.ru").Final());
+            UNIT_ASSERT(TMatcher(fsm).Match("yandex").Final());
+            UNIT_ASSERT(TMatcher(fsm).Match("yandex:80").Final());
+            UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1").Final());
+            UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1:8080").Final());
+        }
+    }
+
+    // Glues three fsms with operator| and checks that the result accepts a string
+    // whenever at least one of the glued regexps does.
+    Y_UNIT_TEST(Glue) {
+        TFsm glued =
+            TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true)) |
+            TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false)) |
+            TFsm("abc", TFsm::TOptions().SetCaseInsensitive(false));
+        // Only the case-insensitive "qw" can accept "Qw".
+        UNIT_ASSERT(TMatcher(glued).Match("Qw").Final());
+        // Exact-case input (this assertion used to repeat the "Qw" one).
+        UNIT_ASSERT(TMatcher(glued).Match("qw").Final());
+        UNIT_ASSERT(TMatcher(glued).Match("abc").Final());
+        // "abc" was glued as case-sensitive only.
+        UNIT_ASSERT(!TMatcher(glued).Match("Abc").Final());
+    }
+
+    Y_UNIT_TEST(Capture1) {
+        TCapturingFsm fsm("here we have user_id=([a-z0-9]+);");
+
+        TSearcher searcher(fsm);
+        searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a"));
+    }
+
+    Y_UNIT_TEST(Capture2) {
+        TCapturingFsm fsm("w([abcdez]+)f");
+
+        TSearcher searcher(fsm);
+        searcher.Search("wabcdef");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("abcde"));
+    }
+
+    Y_UNIT_TEST(Capture3) {
+        TCapturingFsm fsm("http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)",
+                          TFsm::TOptions().SetCapture(2));
+
+        TSearcher searcher(fsm);
+        searcher.Search("http://vkontakte.ru/id100500");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500"));
+    }
+
+    Y_UNIT_TEST(Capture4) {
+        TCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!",
+                          TFsm::TOptions().SetCharset(CODES_UTF8));
+
+        TSearcher searcher(fsm);
+        searcher.Search("   Здравствуйте, Уважаемый (-ая)!   ");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)"));
+    }
+
+    // The capturing group is under '+' here. Only the fact of a capture is verified:
+    // the check of the captured value below is commented out.
+    Y_UNIT_TEST(Capture5) {
+        TCapturingFsm fsm("away\\.php\\?to=http:([^\"])+\"");
+        TSearcher searcher(fsm);
+        searcher.Search("\"/away.php?to=http:some.addr\"&id=1");
+        UNIT_ASSERT(searcher.Captured());
+        //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr"));
+    }
+
+    Y_UNIT_TEST(Capture6) {
+        TCapturingFsm fsm("(/to-match-with)");
+        TSearcher searcher(fsm);
+        searcher.Search("/some/table/path/to-match-with");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("/to-match-with"));
+    }
+
+    // The capturing group contains ".*" here. Only the fact of a capture is verified:
+    // the check of the captured value below is commented out.
+    Y_UNIT_TEST(Capture7) {
+        TCapturingFsm fsm("(pref.*suff)");
+        TSearcher searcher(fsm);
+        searcher.Search("ala pref bla suff cla");
+        UNIT_ASSERT(searcher.Captured());
+        //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref bla suff"));
+    }
+
+    Y_UNIT_TEST(CaptureXA) {
+        TCapturingFsm fsm(".*(xa).*");
+
+        TSearcher searcher(fsm);
+        searcher.Search("xa");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xa"));
+    }
+
+    Y_UNIT_TEST(CaptureWrongXX) {
+        TCapturingFsm fsm(".*(xx).*");
+
+        TSearcher searcher(fsm);
+        searcher.Search("xx");
+        UNIT_ASSERT(searcher.Captured());
+        // Surprise!
+        // TCapturingFsm uses a fast - O(|text|) - but incorrect algorithm.
+        // It works more or less for a particular class of regexps to which ".*(xx).*" does not belong.
+        // So it returns not the expected "xx" but just the second "x" instead.
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("x"));
+    }
+
+    Y_UNIT_TEST(CaptureRight1XX) {
+        TCapturingFsm fsm("[^x]+(xx).*");
+
+        TSearcher searcher(fsm);
+
+        searcher.Search("xxx");
+        UNIT_ASSERT(!searcher.Captured());
+    }
+
+    Y_UNIT_TEST(CaptureRight2XX) {
+        TCapturingFsm fsm("[^x]+(xx).*");
+
+        TSearcher searcher(fsm);
+
+        searcher.Search("axx");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
+    }
+
+    Y_UNIT_TEST(CaptureRight3XX) {
+        TCapturingFsm fsm("[^x]+(xx).*");
+
+        TSearcher searcher(fsm);
+
+        searcher.Search("axxb");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
+    }
+
+    Y_UNIT_TEST(SlowCaptureXX) {
+        TSlowCapturingFsm fsm(".*(xx).*");
+
+        TSlowSearcher searcher(fsm);
+        searcher.Search("xx");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
+    }
+
+    Y_UNIT_TEST(SlowCapture) {
+        TSlowCapturingFsm fsm("^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)",
+                              TFsm::TOptions().SetCapture(2));
+        TSlowSearcher searcher(fsm);
+        searcher.Search("http://vkontakte.ru/id100500");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500"));
+    }
+
+    Y_UNIT_TEST(SlowCaptureGreedy) {
+        TSlowCapturingFsm fsm(".*(pref.*suff)");
+        TSlowSearcher searcher(fsm);
+        searcher.Search("pref ala bla pref cla suff dla");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref cla suff"));
+    }
+
+    Y_UNIT_TEST(SlowCaptureNonGreedy) {
+        TSlowCapturingFsm fsm(".*?(pref.*suff)");
+        TSlowSearcher searcher(fsm);
+        searcher.Search("pref ala bla pref cla suff dla");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref ala bla pref cla suff"));
+    }
+
+    Y_UNIT_TEST(SlowCapture2) {
+        TSlowCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!",
+                              TFsm::TOptions().SetCharset(CODES_UTF8));
+
+        TSlowSearcher searcher(fsm);
+        searcher.Search("   Здравствуйте, Уважаемый (-ая)!   ");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)"));
+    }
+
+    Y_UNIT_TEST(SlowCapture3) {
+        TSlowCapturingFsm fsm("here we have user_id=([a-z0-9]+);");
+        TSlowSearcher searcher(fsm);
+        searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a"));
+    }
+
+    Y_UNIT_TEST(SlowCapture4) {
+        TSlowCapturingFsm fsm("away\\.php\\?to=http:([^\"]+)\"");
+        TSlowSearcher searcher(fsm);
+        searcher.Search("\"/away.php?to=http:some.addr\"&id=1");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr"));
+    }
+
+    Y_UNIT_TEST(CapturedEmptySlow) {
+        TSlowCapturingFsm fsm("Comments=(.*)$");
+        TSlowSearcher searcher(fsm);
+        searcher.Search("And Comments=");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf(""));
+    }
+
+    Y_UNIT_TEST(CaptureInOrFirst) {
+        TSlowCapturingFsm fsm("(A)|A");
+        TSlowSearcher searcher(fsm);
+        searcher.Search("A");
+        UNIT_ASSERT(searcher.Captured());
+    }
+
+    Y_UNIT_TEST(CaptureInOrSecond) {
+        TSlowCapturingFsm fsm("A|(A)");
+        TSlowSearcher searcher(fsm);
+        searcher.Search("A");
+        UNIT_ASSERT(!searcher.Captured());
+    }
+
+    Y_UNIT_TEST(CaptureOutside) {
+        TSlowCapturingFsm fsm("((ID=([0-9]+))?)");
+        TSlowSearcher searcher(fsm);
+        searcher.Search("ID=");
+        UNIT_ASSERT(searcher.Captured());
+        UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf(""));
+    }
+
+    Y_UNIT_TEST(CaptureInside) {
+        TSlowCapturingFsm fsm("((ID=([0-9]+))?)",
+                              TFsm::TOptions().SetCapture(2));
+        TSlowSearcher searcher(fsm);
+        searcher.Search("ID=");
+        UNIT_ASSERT(!searcher.Captured());
+    }
+
+    Y_UNIT_TEST(Pcre2PireTest) {
+        UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)"), "(fake)");
+        UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)??"), "(fake)?");
+        UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)*?fake"), "(fake)*fake");
+        UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>fake)"), "(fake)");
+        UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("fake\\#"), "fake#");
+        UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>)fake"), "fake");
+        UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?P<field1>)(?P<field2>))"), "");
+        UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?:fake))"), "((fake))");
+    }
+}
diff --git a/library/cpp/regex/pire/ut/ya.make b/library/cpp/regex/pire/ut/ya.make
new file mode 100644
index 0000000000..8776695f40
--- /dev/null
+++ b/library/cpp/regex/pire/ut/ya.make
@@ -0,0 +1,44 @@
+# this test is not linked into build tree with ReCURSE and is built by unittest/library
+
+UNITTEST()
+
+OWNER(
+    g:util
+    davenger
+)
+
+SET(PIRETESTSDIR contrib/libs/pire/ut)
+
+CFLAGS(-DPIRE_NO_CONFIG)
+
+PEERDIR(
+    library/cpp/regex/pire
+)
+
+SRCDIR(
+    ${PIRETESTSDIR}
+)
+
+ADDINCL(
+    contrib/libs/pire/pire
+    contrib/libs/pire/ut
+)
+
+SRCS(
+    pire_ut.cpp
+    capture_ut.cpp
+    count_ut.cpp
+    glyph_ut.cpp
+    easy_ut.cpp
+    read_unicode_ut.cpp
+    regexp_ut.cpp
+    approx_matching_ut.cpp
+)
+
+SIZE(MEDIUM)
+
+TIMEOUT(600)
+
+PIRE_INLINE(inline_ut.cpp)
+
+END()
diff --git a/library/cpp/regex/pire/ya.make b/library/cpp/regex/pire/ya.make
new file mode 100644
index 0000000000..c857e6d18b
--- /dev/null
+++ b/library/cpp/regex/pire/ya.make
@@ -0,0 +1,40 @@
+LIBRARY()
+
+OWNER(
+    g:util
+    g:antiinfra
+    davenger
+    pg
+)
+
+CFLAGS(-DPIRE_NO_CONFIG)
+
+SRCDIR(contrib/libs/pire/pire)
+
+SRCS(
+    pcre2pire.cpp
+    classes.cpp
+    encoding.cpp
+    fsm.cpp
+    scanner_io.cpp
+    easy.cpp
+    scanners/null.cpp
+    extra/capture.cpp
+    extra/count.cpp
+    extra/glyphs.cpp
+    re_lexer.cpp
+    re_parser.y
+    read_unicode.cpp
+    extraencodings.cpp
+    approx_matching.cpp
+    half_final_fsm.cpp
+    minimize.h
+)
+
+PEERDIR(
+    library/cpp/charset
+)
+
+END()
+
+RECURSE_FOR_TESTS(ut)
author	Devtools Arcadia <arcadia-devtools@yandex-team.ru>	2022-02-07 18:08:42 +0300
committer	Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>	2022-02-07 18:08:42 +0300
commit	1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree	e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/pire
download	ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz