diff options
| author | Dmitry Potapov <[email protected]> | 2022-02-10 16:46:39 +0300 | 
|---|---|---|
| committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:46:39 +0300 | 
| commit | 536101ea75c9ff5df10d01c2f460b1f6e12311b3 (patch) | |
| tree | 115291277ad61b2cdcf5044d210fb103b5e1647e /library/cpp/regex/pire | |
| parent | 5036b5f2122001f9aef8a0e4cd85440d73ea6b9f (diff) | |
Restoring authorship annotation for Dmitry Potapov <[email protected]>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/regex/pire')
| -rw-r--r-- | library/cpp/regex/pire/regexp.h | 308 | ||||
| -rw-r--r-- | library/cpp/regex/pire/ut/regexp_ut.cpp | 56 | 
2 files changed, 182 insertions, 182 deletions
diff --git a/library/cpp/regex/pire/regexp.h b/library/cpp/regex/pire/regexp.h index 94bba4064b7..a517b47b710 100644 --- a/library/cpp/regex/pire/regexp.h +++ b/library/cpp/regex/pire/regexp.h @@ -5,36 +5,36 @@  #include <library/cpp/charset/doccodes.h>  #include <library/cpp/charset/recyr.hh>  #include <util/generic/maybe.h> -#include <util/generic/strbuf.h> +#include <util/generic/strbuf.h>   #include <util/generic/string.h> -#include <util/generic/vector.h> -#include <util/generic/yexception.h> - +#include <util/generic/vector.h>  +#include <util/generic/yexception.h>  +   namespace NRegExp {      struct TMatcher; -    struct TFsmBase { -        struct TOptions { +    struct TFsmBase {  +        struct TOptions {               inline TOptions& SetCaseInsensitive(bool v) noexcept { -                CaseInsensitive = v; -                return *this; -            } +                CaseInsensitive = v;  +                return *this;  +            }               inline TOptions& SetSurround(bool v) noexcept { -                Surround = v; -                return *this; -            } +                Surround = v;  +                return *this;  +            }               inline TOptions& SetCapture(size_t pos) noexcept { -                CapturePos = pos; -                return *this; +                CapturePos = pos;  +                return *this;               }              inline TOptions& SetCharset(ECharset charset) noexcept { -                Charset = charset; -                return *this; -            } - +                Charset = charset;  +                return *this;  +            }  +               inline TOptions& SetAndNotSupport(bool andNotSupport) noexcept {                  AndNotSupport = andNotSupport;                  return *this; @@ -45,14 +45,14 @@ namespace NRegExp {              TMaybe<size_t> CapturePos;              ECharset Charset = CODES_UNKNOWN;              bool AndNotSupport = false; -        }; - +        };  +           static inline NPire::TFsm Parse(const TStringBuf& regexp,                                          const TOptions& opts, const bool needDetermine = true) { -            NPire::TLexer lexer; -            if (opts.Charset == CODES_UNKNOWN) { +            NPire::TLexer lexer;  +            if (opts.Charset == CODES_UNKNOWN) {                   lexer.Assign(regexp.data(), regexp.data() + regexp.size()); -            } else { +            } else {                   TVector<wchar32> ucs4(regexp.size() + 1);                  size_t inRead = 0;                  size_t outWritten = 0; @@ -61,13 +61,13 @@ namespace NRegExp {                  Y_ASSERT(recodeRes == RECODE_OK);                  Y_ASSERT(outWritten < ucs4.size());                  ucs4[outWritten] = 0; - -                lexer.Assign(ucs4.begin(), +  +                lexer.Assign(ucs4.begin(),                                ucs4.begin() + std::char_traits<wchar32>::length(ucs4.data())); -            } - -            if (opts.CaseInsensitive) { -                lexer.AddFeature(NPire::NFeatures::CaseInsensitive()); +            }  +  +            if (opts.CaseInsensitive) {  +                lexer.AddFeature(NPire::NFeatures::CaseInsensitive());               }              if (opts.CapturePos) { @@ -78,7 +78,7 @@ namespace NRegExp {                  lexer.AddFeature(NPire::NFeatures::AndNotSupport());              } -            switch (opts.Charset) { +            switch (opts.Charset) {                   case CODES_UNKNOWN:                      break;                  case CODES_UTF8: @@ -90,76 +90,76 @@ namespace NRegExp {                  default:                      lexer.SetEncoding(NPire::NEncodings::Get(opts.Charset));                      break; -            } - -            NPire::TFsm ret = lexer.Parse(); - -            if (opts.Surround) { -                ret.Surround(); +            }  +  +            NPire::TFsm ret = lexer.Parse();  +  +            if (opts.Surround) {  +                ret.Surround();               }              if (needDetermine) {                  ret.Determine();              } -            return ret; -        } -    }; +            return ret;  +        }  +    };  -    template <class TScannerType> -    class TFsmParser: public TFsmBase { -    public: -        typedef TScannerType TScanner; +    template <class TScannerType>  +    class TFsmParser: public TFsmBase {  +    public:  +        typedef TScannerType TScanner;  -    public: +    public:           inline explicit TFsmParser(const TStringBuf& regexp,                                     const TOptions& opts = TOptions(), bool needDetermine = true)              : Scanner(Parse(regexp, opts, needDetermine).template Compile<TScanner>()) -        { -        } +        {  +        }           inline const TScanner& GetScanner() const noexcept { -            return Scanner; -        } - -        static inline TFsmParser False() { -            return TFsmParser(NPire::TFsm::MakeFalse().Compile<TScanner>()); -        } - -        inline explicit TFsmParser(const TScanner& compiled) -            : Scanner(compiled) -        { -            if (Scanner.Empty()) -                ythrow yexception() << "Can't create fsm with empty scanner"; -        } - -    private: -        TScanner Scanner; +            return Scanner;  +        }  + +        static inline TFsmParser False() {  +            return TFsmParser(NPire::TFsm::MakeFalse().Compile<TScanner>());  +        }  + +        inline explicit TFsmParser(const TScanner& compiled)  +            : Scanner(compiled)  +        {  +            if (Scanner.Empty())  +                ythrow yexception() << "Can't create fsm with empty scanner";  +        }  +  +    private:  +        TScanner Scanner;       }; -    class TFsm: public TFsmParser<NPire::TNonrelocScanner> { -    public: +    class TFsm: public TFsmParser<NPire::TNonrelocScanner> {  +    public:           inline explicit TFsm(const TStringBuf& regexp,                               const TOptions& opts = TOptions()) -            : TFsmParser<TScanner>(regexp, opts) -        { -        } - -        inline TFsm(const TFsmParser<TScanner>& fsm) -            : TFsmParser<TScanner>(fsm) -        { -        } - +            : TFsmParser<TScanner>(regexp, opts)  +        {  +        }  +  +        inline TFsm(const TFsmParser<TScanner>& fsm)  +            : TFsmParser<TScanner>(fsm)  +        {  +        }  +           static inline TFsm Glue(const TFsm& l, const TFsm& r) { -            return TFsm(TScanner::Glue(l.GetScanner(), r.GetScanner())); -        } - -        inline explicit TFsm(const TScanner& compiled) -            : TFsmParser<TScanner>(compiled) -        { -        } -    }; - +            return TFsm(TScanner::Glue(l.GetScanner(), r.GetScanner()));  +        }  +  +        inline explicit TFsm(const TScanner& compiled)  +            : TFsmParser<TScanner>(compiled)  +        {  +        }  +    };  +       static inline TFsm operator|(const TFsm& l, const TFsm& r) {          return TFsm::Glue(l, r);      } @@ -167,15 +167,15 @@ namespace NRegExp {      struct TCapturingFsm : TFsmParser<NPire::TCapturingScanner> {          inline explicit TCapturingFsm(const TStringBuf& regexp,                                        TOptions opts = TOptions()) -            : TFsmParser<TScanner>(regexp, +            : TFsmParser<TScanner>(regexp,                                      opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1)) { -        } +        }  -        inline TCapturingFsm(const TFsmParser<TScanner>& fsm) -            : TFsmParser<TScanner>(fsm) -        { -        } -    }; +        inline TCapturingFsm(const TFsmParser<TScanner>& fsm)  +            : TFsmParser<TScanner>(fsm)  +        {  +        }  +    };       struct TSlowCapturingFsm : TFsmParser<NPire::TSlowCapturingScanner> {          inline explicit TSlowCapturingFsm(const TStringBuf& regexp, @@ -190,101 +190,101 @@ namespace NRegExp {          }      }; -    template <class TFsm> -    class TMatcherBase { -    public: -        typedef typename TFsm::TScanner::State TState; +    template <class TFsm>  +    class TMatcherBase {  +    public:  +        typedef typename TFsm::TScanner::State TState;  -    public: -        inline explicit TMatcherBase(const TFsm& fsm) -            : Fsm(fsm) -        { -            Fsm.GetScanner().Initialize(State); -        } +    public:  +        inline explicit TMatcherBase(const TFsm& fsm)  +            : Fsm(fsm)  +        {  +            Fsm.GetScanner().Initialize(State);  +        }           inline bool Final() const noexcept { -            return GetScanner().Final(GetState()); -        } +            return GetScanner().Final(GetState());  +        }  -    protected: +    protected:           inline void Run(const char* data, size_t len, bool addBegin, bool addEnd) noexcept {              if (addBegin) { -                NPire::Step(GetScanner(), State, NPire::BeginMark); +                NPire::Step(GetScanner(), State, NPire::BeginMark);               } -            NPire::Run(GetScanner(), State, data, data + len); +            NPire::Run(GetScanner(), State, data, data + len);               if (addEnd) { -                NPire::Step(GetScanner(), State, NPire::EndMark); -            } -        } +                NPire::Step(GetScanner(), State, NPire::EndMark);  +            }  +        }           inline const typename TFsm::TScanner& GetScanner() const noexcept { -            return Fsm.GetScanner(); -        } +            return Fsm.GetScanner();  +        }           inline const TState& GetState() const noexcept { -            return State; -        } +            return State;  +        }  -    private: -        const TFsm& Fsm; -        TState State; +    private:  +        const TFsm& Fsm;  +        TState State;       }; - +       struct TMatcher : TMatcherBase<TFsm> { -        inline explicit TMatcher(const TFsm& fsm) -            : TMatcherBase<TFsm>(fsm) -        { -        } - +        inline explicit TMatcher(const TFsm& fsm)  +            : TMatcherBase<TFsm>(fsm)  +        {  +        }  +           inline TMatcher& Match(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {              Run(data, len, addBegin, addEnd); -            return *this; -        } - +            return *this;  +        }  +           inline TMatcher& Match(const TStringBuf& s, bool addBegin = false, bool addEnd = false) noexcept {              return Match(s.data(), s.size(), addBegin, addEnd); -        } - +        }  +           inline const char* Find(const char* b, const char* e) noexcept { -            return NPire::ShortestPrefix(GetScanner(), b, e); -        } - +            return NPire::ShortestPrefix(GetScanner(), b, e);  +        }  +           typedef std::pair<const size_t*, const size_t*> TMatchedRegexps; - +           inline TMatchedRegexps MatchedRegexps() const noexcept { -            return GetScanner().AcceptedRegexps(GetState()); -        } -    }; - -    class TSearcher: public TMatcherBase<TCapturingFsm> { -    public: -        inline explicit TSearcher(const TCapturingFsm& fsm) -            : TMatcherBase<TCapturingFsm>(fsm) -        { -        } - +            return GetScanner().AcceptedRegexps(GetState());  +        }  +    };  +  +    class TSearcher: public TMatcherBase<TCapturingFsm> {  +    public:  +        inline explicit TSearcher(const TCapturingFsm& fsm)  +            : TMatcherBase<TCapturingFsm>(fsm)  +        {  +        }  +           inline bool Captured() const noexcept { -            return GetState().Captured(); -        } - +            return GetState().Captured();  +        }  +           inline TSearcher& Search(const char* data, size_t len, bool addBegin = true, bool addEnd = true) noexcept { -            Data = TStringBuf(data, len); +            Data = TStringBuf(data, len);               Run(data, len, addBegin, addEnd); -            return *this; -        } - +            return *this;  +        }  +           inline TSearcher& Search(const TStringBuf& s) noexcept {              return Search(s.data(), s.size()); -        } - +        }  +           inline TStringBuf GetCaptured() const noexcept {              return TStringBuf(Data.data() + GetState().Begin() - 1,                                Data.data() + GetState().End() - 1); -        } - -    private: -        TStringBuf Data; -    }; +        }  +  +    private:  +        TStringBuf Data;  +    };       class TSlowSearcher : TMatcherBase<TSlowCapturingFsm>{      public: diff --git a/library/cpp/regex/pire/ut/regexp_ut.cpp b/library/cpp/regex/pire/ut/regexp_ut.cpp index e7206de9ad4..c8db34d9868 100644 --- a/library/cpp/regex/pire/ut/regexp_ut.cpp +++ b/library/cpp/regex/pire/ut/regexp_ut.cpp @@ -93,44 +93,44 @@ Y_UNIT_TEST_SUITE(TRegExp) {          UNIT_ASSERT(TMatcher(glued).Match("abc").Final());          UNIT_ASSERT(!TMatcher(glued).Match("Abc").Final());      } - +       Y_UNIT_TEST(Capture1) { -        TCapturingFsm fsm("here we have user_id=([a-z0-9]+);"); - -        TSearcher searcher(fsm); -        searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF"); -        UNIT_ASSERT(searcher.Captured()); +        TCapturingFsm fsm("here we have user_id=([a-z0-9]+);");  +  +        TSearcher searcher(fsm);  +        searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF");  +        UNIT_ASSERT(searcher.Captured());           UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a")); -    } - +    }  +       Y_UNIT_TEST(Capture2) { -        TCapturingFsm fsm("w([abcdez]+)f"); - -        TSearcher searcher(fsm); -        searcher.Search("wabcdef"); -        UNIT_ASSERT(searcher.Captured()); +        TCapturingFsm fsm("w([abcdez]+)f");  +  +        TSearcher searcher(fsm);  +        searcher.Search("wabcdef");  +        UNIT_ASSERT(searcher.Captured());           UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("abcde")); -    } - +    }  +       Y_UNIT_TEST(Capture3) { -        TCapturingFsm fsm("http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)", +        TCapturingFsm fsm("http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)",                             TFsm::TOptions().SetCapture(2)); - -        TSearcher searcher(fsm); -        searcher.Search("http://vkontakte.ru/id100500"); -        UNIT_ASSERT(searcher.Captured()); +  +        TSearcher searcher(fsm);  +        searcher.Search("http://vkontakte.ru/id100500");  +        UNIT_ASSERT(searcher.Captured());           UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500")); -    } - +    }  +       Y_UNIT_TEST(Capture4) { -        TCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!", +        TCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!",                             TFsm::TOptions().SetCharset(CODES_UTF8)); - -        TSearcher searcher(fsm); -        searcher.Search("   Здравствуйте, Уважаемый (-ая)!   "); -        UNIT_ASSERT(searcher.Captured()); +  +        TSearcher searcher(fsm);  +        searcher.Search("   Здравствуйте, Уважаемый (-ая)!   ");  +        UNIT_ASSERT(searcher.Captured());           UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)")); -    } +    }       Y_UNIT_TEST(Capture5) {          TCapturingFsm fsm("away\\.php\\?to=http:([^\"])+\"");  | 
