diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/pire | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex/pire')
-rw-r--r-- | library/cpp/regex/pire/extraencodings.cpp | 81 | ||||
-rw-r--r-- | library/cpp/regex/pire/inline/ya.make | 22 | ||||
-rw-r--r-- | library/cpp/regex/pire/pcre2pire.cpp | 110 | ||||
-rw-r--r-- | library/cpp/regex/pire/pcre2pire.h | 19 | ||||
-rw-r--r-- | library/cpp/regex/pire/pire.h | 76 | ||||
-rw-r--r-- | library/cpp/regex/pire/regexp.h | 337 | ||||
-rw-r--r-- | library/cpp/regex/pire/ut/regexp_ut.cpp | 318 | ||||
-rw-r--r-- | library/cpp/regex/pire/ut/ya.make | 44 | ||||
-rw-r--r-- | library/cpp/regex/pire/ya.make | 40 |
9 files changed, 1047 insertions, 0 deletions
diff --git a/library/cpp/regex/pire/extraencodings.cpp b/library/cpp/regex/pire/extraencodings.cpp new file mode 100644 index 0000000000..2e507e4b67 --- /dev/null +++ b/library/cpp/regex/pire/extraencodings.cpp @@ -0,0 +1,81 @@ +#include <util/system/defaults.h> +#include <util/system/yassert.h> +#include <library/cpp/charset/codepage.h> +#include <util/generic/singleton.h> +#include <util/generic/yexception.h> +#include <library/cpp/charset/doccodes.h> + +#include "pire.h" + +namespace NPire { + namespace { + // A one-byte encoding which is capable of transforming upper half of the character + // table to/from Unicode chars. + class TOneByte: public TEncoding { + public: + TOneByte(ECharset doccode) { + Table_ = CodePageByCharset(doccode)->unicode; + for (size_t i = 0; i < 256; ++i) + Reverse_.insert(std::make_pair(Table_[i], static_cast<char>(i))); + } + + wchar32 FromLocal(const char*& begin, const char* end) const override { + if (begin != end) + return Table_[static_cast<unsigned char>(*begin++)]; + else + ythrow yexception() << "EOF reached in Pire::OneByte::fromLocal()"; + } + + TString ToLocal(wchar32 c) const override { + THashMap<wchar32, char>::const_iterator i = Reverse_.find(c); + if (i != Reverse_.end()) + return TString(1, i->second); + else + return TString(); + } + + void AppendDot(TFsm& fsm) const override { + fsm.AppendDot(); + } + + private: + const wchar32* Table_; + THashMap<wchar32, char> Reverse_; + }; + + template <unsigned N> + struct TOneByteHelper: public TOneByte { + inline TOneByteHelper() + : TOneByte((ECharset)N) + { + } + }; + } + + namespace NEncodings { + const NPire::TEncoding& Koi8r() { + return *Singleton<TOneByteHelper<CODES_KOI8>>(); + } + + const NPire::TEncoding& Cp1251() { + return *Singleton<TOneByteHelper<CODES_WIN>>(); + } + + const NPire::TEncoding& Get(ECharset encoding) { + switch (encoding) { + case CODES_WIN: + return Cp1251(); + case CODES_KOI8: + return Koi8r(); + case CODES_ASCII: + return NPire::NEncodings::Latin1(); + case CODES_UTF8: + return NPire::NEncodings::Utf8(); + default: + ythrow yexception() << "Pire::Encodings::get(ECharset): unknown encoding " << (int)encoding; + } + } + + } + +} diff --git a/library/cpp/regex/pire/inline/ya.make b/library/cpp/regex/pire/inline/ya.make new file mode 100644 index 0000000000..d4850f7b45 --- /dev/null +++ b/library/cpp/regex/pire/inline/ya.make @@ -0,0 +1,22 @@ +PROGRAM(pire_inline) + +CFLAGS(-DPIRE_NO_CONFIG) + +OWNER( + g:util + davenger +) + +PEERDIR( + ADDINCL library/cpp/regex/pire +) + +SRCDIR( + contrib/libs/pire/pire +) + +SRCS( + inline.l +) + +END() diff --git a/library/cpp/regex/pire/pcre2pire.cpp b/library/cpp/regex/pire/pcre2pire.cpp new file mode 100644 index 0000000000..f788beb85f --- /dev/null +++ b/library/cpp/regex/pire/pcre2pire.cpp @@ -0,0 +1,110 @@ +#include "pcre2pire.h" +#include <util/generic/vector.h> +#include <util/generic/yexception.h> + +TString Pcre2Pire(const TString& src) { + TVector<char> result; + result.reserve(src.size() + 1); + + enum EState { + S_SIMPLE, + S_SLASH, + S_BRACE, + S_EXPECT_Q, + S_QUESTION, + S_P, + S_COMMA, + S_IN, + }; + + EState state = S_SIMPLE; + + for (ui32 i = 0; i < src.size(); ++i) { + const char c = src[i]; + + switch (state) { + case S_SIMPLE: + if (c == '\\') { + state = S_SLASH; + } else if (c == '(') { + state = S_BRACE; + } else if (c == '*' || c == '?') { + state = S_EXPECT_Q; + result.push_back(c); + } else { + if (c == ')' && result.size() > 0 && result.back() == '(') { + // eliminating "()" + result.pop_back(); + } else { + result.push_back(c); + } + } + break; + case S_SLASH: + state = S_SIMPLE; + if (c == ':' || c == '=' || c == '#' || c == '&') { + result.push_back(c); + } else { + result.push_back('\\'); + --i; + } + break; + case S_BRACE: + if (c == '?') { + state = S_QUESTION; + } else { + state = S_COMMA; + --i; + } + break; + case S_EXPECT_Q: + state = S_SIMPLE; + if (c != '?') { + --i; + } + break; + case S_QUESTION: + if (c == 'P') { + state = S_P; + } else if (c == ':' || c == '=') { + state = S_COMMA; + } else { + ythrow yexception() << "Pcre to pire convertaion failed: unexpected symbol '" << c << "' at posiotion " << i << "!"; + } + break; + case S_P: + if (c == '<') { + state = S_IN; + } else { + ythrow yexception() << "Pcre to pire convertaion failed: unexpected symbol '" << c << "' at posiotion " << i << "!"; + } + break; + case S_IN: + if (c == '>') { + state = S_COMMA; + } else { + // nothing to do + } + break; + case S_COMMA: + state = S_SIMPLE; + if (c == ')') { + // nothing to do + } else { + result.push_back('('); + --i; + } + break; + default: + ythrow yexception() << "Pcre to pire convertaion failed: unexpected automata state!"; + } + } + + if (state != S_SIMPLE && state != S_EXPECT_Q) { + ythrow yexception() << "Pcre to pire convertaion failed: unexpected end of expression!"; + } + + result.push_back('\0'); + + return &result[0]; +} diff --git a/library/cpp/regex/pire/pcre2pire.h b/library/cpp/regex/pire/pcre2pire.h new file mode 100644 index 0000000000..46e45b9193 --- /dev/null +++ b/library/cpp/regex/pire/pcre2pire.h @@ -0,0 +1,19 @@ +#pragma once + +// Author: smikler@yandex-team.ru + +#include <util/generic/string.h> + +/* Converts pcre regular expression to pire compatible format: + * - replaces "\\#" with "#" + * - replaces "\\=" with "=" + * - replaces "\\:" with ":" + * - removes "?P<...>" + * - removes "?:" + * - removes "()" recursively + * - replaces "??" with "?" + * - replaces "*?" with "*" + * NOTE: + * - Not fully tested! + */ +TString Pcre2Pire(const TString& src); diff --git a/library/cpp/regex/pire/pire.h b/library/cpp/regex/pire/pire.h new file mode 100644 index 0000000000..286fecd693 --- /dev/null +++ b/library/cpp/regex/pire/pire.h @@ -0,0 +1,76 @@ +#pragma once + +#ifndef PIRE_NO_CONFIG +#define PIRE_NO_CONFIG +#endif + +#include <contrib/libs/pire/pire/pire.h> +#include <contrib/libs/pire/pire/extra.h> + +#include <library/cpp/charset/doccodes.h> + +namespace NPire { + using TChar = Pire::Char; + using Pire::MaxChar; + + // Scanner classes + using TScanner = Pire::Scanner; + using TNonrelocScanner = Pire::NonrelocScanner; + using TScannerNoMask = Pire::ScannerNoMask; + using TNonrelocScannerNoMask = Pire::NonrelocScannerNoMask; + using THalfFinalScanner = Pire::HalfFinalScanner; + using TNonrelocHalfFinalScanner = Pire::NonrelocHalfFinalScanner; + using THalfFinalScannerNoMask = Pire::HalfFinalScannerNoMask; + using TNonrelocHalfFinalScannerNoMask = Pire::NonrelocHalfFinalScannerNoMask; + using TSimpleScanner = Pire::SimpleScanner; + using TSlowScanner = Pire::SlowScanner; + using TCapturingScanner = Pire::CapturingScanner; + using TSlowCapturingScanner = Pire::SlowCapturingScanner; + using TCountingScanner = Pire::CountingScanner; + + template <typename T1, typename T2> + using TScannerPair = Pire::ScannerPair<T1, T2>; + + // Helper classes + using TFsm = Pire::Fsm; + using TLexer = Pire::Lexer; + using TTerm = Pire::Term; + using TEncoding = Pire::Encoding; + using TFeature = Pire::Feature; + using TFeaturePtr = Pire::Feature::Ptr; + using TError = Pire::Error; + + // Helper functions + using Pire::LongestPrefix; + using Pire::LongestSuffix; + using Pire::Matches; + using Pire::MmappedScanner; + using Pire::Run; + using Pire::Runner; + using Pire::ShortestPrefix; + using Pire::ShortestSuffix; + using Pire::Step; + + using namespace Pire::SpecialChar; + using namespace Pire::Consts; + + namespace NFeatures { + using Pire::Features::AndNotSupport; + using Pire::Features::Capture; + using Pire::Features::CaseInsensitive; + using Pire::Features::GlueSimilarGlyphs; + } + + namespace NEncodings { + using Pire::Encodings::Latin1; + using Pire::Encodings::Utf8; + + const NPire::TEncoding& Koi8r(); + const NPire::TEncoding& Cp1251(); + const NPire::TEncoding& Get(ECharset encoding); + } + + namespace NTokenTypes { + using namespace Pire::TokenTypes; + } +} diff --git a/library/cpp/regex/pire/regexp.h b/library/cpp/regex/pire/regexp.h new file mode 100644 index 0000000000..94bba4064b --- /dev/null +++ b/library/cpp/regex/pire/regexp.h @@ -0,0 +1,337 @@ +#pragma once + +#include "pire.h" + +#include <library/cpp/charset/doccodes.h> +#include <library/cpp/charset/recyr.hh> +#include <util/generic/maybe.h> +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/generic/yexception.h> + +namespace NRegExp { + struct TMatcher; + + struct TFsmBase { + struct TOptions { + inline TOptions& SetCaseInsensitive(bool v) noexcept { + CaseInsensitive = v; + return *this; + } + + inline TOptions& SetSurround(bool v) noexcept { + Surround = v; + return *this; + } + + inline TOptions& SetCapture(size_t pos) noexcept { + CapturePos = pos; + return *this; + } + + inline TOptions& SetCharset(ECharset charset) noexcept { + Charset = charset; + return *this; + } + + inline TOptions& SetAndNotSupport(bool andNotSupport) noexcept { + AndNotSupport = andNotSupport; + return *this; + } + + bool CaseInsensitive = false; + bool Surround = false; + TMaybe<size_t> CapturePos; + ECharset Charset = CODES_UNKNOWN; + bool AndNotSupport = false; + }; + + static inline NPire::TFsm Parse(const TStringBuf& regexp, + const TOptions& opts, const bool needDetermine = true) { + NPire::TLexer lexer; + if (opts.Charset == CODES_UNKNOWN) { + lexer.Assign(regexp.data(), regexp.data() + regexp.size()); + } else { + TVector<wchar32> ucs4(regexp.size() + 1); + size_t inRead = 0; + size_t outWritten = 0; + int recodeRes = RecodeToUnicode(opts.Charset, regexp.data(), ucs4.data(), + regexp.size(), regexp.size(), inRead, outWritten); + Y_ASSERT(recodeRes == RECODE_OK); + Y_ASSERT(outWritten < ucs4.size()); + ucs4[outWritten] = 0; + + lexer.Assign(ucs4.begin(), + ucs4.begin() + std::char_traits<wchar32>::length(ucs4.data())); + } + + if (opts.CaseInsensitive) { + lexer.AddFeature(NPire::NFeatures::CaseInsensitive()); + } + + if (opts.CapturePos) { + lexer.AddFeature(NPire::NFeatures::Capture(*opts.CapturePos)); + } + + if (opts.AndNotSupport) { + lexer.AddFeature(NPire::NFeatures::AndNotSupport()); + } + + switch (opts.Charset) { + case CODES_UNKNOWN: + break; + case CODES_UTF8: + lexer.SetEncoding(NPire::NEncodings::Utf8()); + break; + case CODES_KOI8: + lexer.SetEncoding(NPire::NEncodings::Koi8r()); + break; + default: + lexer.SetEncoding(NPire::NEncodings::Get(opts.Charset)); + break; + } + + NPire::TFsm ret = lexer.Parse(); + + if (opts.Surround) { + ret.Surround(); + } + + if (needDetermine) { + ret.Determine(); + } + + return ret; + } + }; + + template <class TScannerType> + class TFsmParser: public TFsmBase { + public: + typedef TScannerType TScanner; + + public: + inline explicit TFsmParser(const TStringBuf& regexp, + const TOptions& opts = TOptions(), bool needDetermine = true) + : Scanner(Parse(regexp, opts, needDetermine).template Compile<TScanner>()) + { + } + + inline const TScanner& GetScanner() const noexcept { + return Scanner; + } + + static inline TFsmParser False() { + return TFsmParser(NPire::TFsm::MakeFalse().Compile<TScanner>()); + } + + inline explicit TFsmParser(const TScanner& compiled) + : Scanner(compiled) + { + if (Scanner.Empty()) + ythrow yexception() << "Can't create fsm with empty scanner"; + } + + private: + TScanner Scanner; + }; + + class TFsm: public TFsmParser<NPire::TNonrelocScanner> { + public: + inline explicit TFsm(const TStringBuf& regexp, + const TOptions& opts = TOptions()) + : TFsmParser<TScanner>(regexp, opts) + { + } + + inline TFsm(const TFsmParser<TScanner>& fsm) + : TFsmParser<TScanner>(fsm) + { + } + + static inline TFsm Glue(const TFsm& l, const TFsm& r) { + return TFsm(TScanner::Glue(l.GetScanner(), r.GetScanner())); + } + + inline explicit TFsm(const TScanner& compiled) + : TFsmParser<TScanner>(compiled) + { + } + }; + + static inline TFsm operator|(const TFsm& l, const TFsm& r) { + return TFsm::Glue(l, r); + } + + struct TCapturingFsm : TFsmParser<NPire::TCapturingScanner> { + inline explicit TCapturingFsm(const TStringBuf& regexp, + TOptions opts = TOptions()) + : TFsmParser<TScanner>(regexp, + opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1)) { + } + + inline TCapturingFsm(const TFsmParser<TScanner>& fsm) + : TFsmParser<TScanner>(fsm) + { + } + }; + + struct TSlowCapturingFsm : TFsmParser<NPire::TSlowCapturingScanner> { + inline explicit TSlowCapturingFsm(const TStringBuf& regexp, + TOptions opts = TOptions()) + : TFsmParser<TScanner>(regexp, + opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1), false) { + } + + inline TSlowCapturingFsm(const TFsmParser<TScanner>& fsm) + : TFsmParser<TScanner>(fsm) + { + } + }; + + template <class TFsm> + class TMatcherBase { + public: + typedef typename TFsm::TScanner::State TState; + + public: + inline explicit TMatcherBase(const TFsm& fsm) + : Fsm(fsm) + { + Fsm.GetScanner().Initialize(State); + } + + inline bool Final() const noexcept { + return GetScanner().Final(GetState()); + } + + protected: + inline void Run(const char* data, size_t len, bool addBegin, bool addEnd) noexcept { + if (addBegin) { + NPire::Step(GetScanner(), State, NPire::BeginMark); + } + NPire::Run(GetScanner(), State, data, data + len); + if (addEnd) { + NPire::Step(GetScanner(), State, NPire::EndMark); + } + } + + inline const typename TFsm::TScanner& GetScanner() const noexcept { + return Fsm.GetScanner(); + } + + inline const TState& GetState() const noexcept { + return State; + } + + private: + const TFsm& Fsm; + TState State; + }; + + struct TMatcher : TMatcherBase<TFsm> { + inline explicit TMatcher(const TFsm& fsm) + : TMatcherBase<TFsm>(fsm) + { + } + + inline TMatcher& Match(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept { + Run(data, len, addBegin, addEnd); + return *this; + } + + inline TMatcher& Match(const TStringBuf& s, bool addBegin = false, bool addEnd = false) noexcept { + return Match(s.data(), s.size(), addBegin, addEnd); + } + + inline const char* Find(const char* b, const char* e) noexcept { + return NPire::ShortestPrefix(GetScanner(), b, e); + } + + typedef std::pair<const size_t*, const size_t*> TMatchedRegexps; + + inline TMatchedRegexps MatchedRegexps() const noexcept { + return GetScanner().AcceptedRegexps(GetState()); + } + }; + + class TSearcher: public TMatcherBase<TCapturingFsm> { + public: + inline explicit TSearcher(const TCapturingFsm& fsm) + : TMatcherBase<TCapturingFsm>(fsm) + { + } + + inline bool Captured() const noexcept { + return GetState().Captured(); + } + + inline TSearcher& Search(const char* data, size_t len, bool addBegin = true, bool addEnd = true) noexcept { + Data = TStringBuf(data, len); + Run(data, len, addBegin, addEnd); + return *this; + } + + inline TSearcher& Search(const TStringBuf& s) noexcept { + return Search(s.data(), s.size()); + } + + inline TStringBuf GetCaptured() const noexcept { + return TStringBuf(Data.data() + GetState().Begin() - 1, + Data.data() + GetState().End() - 1); + } + + private: + TStringBuf Data; + }; + + class TSlowSearcher : TMatcherBase<TSlowCapturingFsm>{ + public: + typedef typename TSlowCapturingFsm::TScanner::State TState; + inline explicit TSlowSearcher(const TSlowCapturingFsm& fsm) + : TMatcherBase<TSlowCapturingFsm>(fsm) + , HasCaptured(false) + { + } + + inline bool Captured() const noexcept { + return HasCaptured; + } + + inline TSlowSearcher& Search(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept { + TStringBuf textData(data, len); + Data = textData; + Run(Data.begin(), Data.size(), addBegin, addEnd); + return GetAns(); + } + + inline TSlowSearcher& Search(const TStringBuf& s) noexcept { + return Search(s.data(), s.size()); + } + + inline TStringBuf GetCaptured() const noexcept { + return Ans; + } + + private: + TStringBuf Data; + TStringBuf Ans; + bool HasCaptured; + + inline TSlowSearcher& GetAns() { + auto state = GetState(); + Pire::SlowCapturingScanner::SingleState final; + if (!GetScanner().GetCapture(state, final)) { + HasCaptured = false; + } else { + if (!final.HasEnd()) { + final.SetEnd(Data.size()); + } + Ans = TStringBuf(Data, final.GetBegin(), final.GetEnd() - final.GetBegin()); + HasCaptured = true; + } + return *this; + } + }; +} diff --git a/library/cpp/regex/pire/ut/regexp_ut.cpp b/library/cpp/regex/pire/ut/regexp_ut.cpp new file mode 100644 index 0000000000..e7206de9ad --- /dev/null +++ b/library/cpp/regex/pire/ut/regexp_ut.cpp @@ -0,0 +1,318 @@ +#include <library/cpp/testing/unittest/registar.h> + +#include <library/cpp/regex/pire/regexp.h> +#include <library/cpp/regex/pire/pcre2pire.h> + +Y_UNIT_TEST_SUITE(TRegExp) { + using namespace NRegExp; + + Y_UNIT_TEST(False) { + UNIT_ASSERT(!TMatcher(TFsm::False()).Match("").Final()); + UNIT_ASSERT(!TMatcher(TFsm::False()).Match(TStringBuf{}).Final()); + } + + Y_UNIT_TEST(Surround) { + UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final()); + UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(false))).Match("aqwb").Final()); + } + + Y_UNIT_TEST(Boundaries) { + UNIT_ASSERT(!TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final()); + UNIT_ASSERT(!TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final()); + UNIT_ASSERT(TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final()); + UNIT_ASSERT(TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final()); + UNIT_ASSERT(!TMatcher(TFsm("qw$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final()); + UNIT_ASSERT(!TMatcher(TFsm("^qw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final()); + + UNIT_ASSERT(TMatcher(TFsm("^aqwb$", TFsm::TOptions().SetSurround(true))) + .Match(TStringBuf("a"), true, false) + .Match(TStringBuf("q"), false, false) + .Match(TStringBuf("w"), false, false) + .Match(TStringBuf("b"), false, true) + .Final()); + } + + Y_UNIT_TEST(Case) { + UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true))).Match("Qw").Final()); + UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false))).Match("Qw").Final()); + } + + Y_UNIT_TEST(UnicodeCase) { + UNIT_ASSERT(TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(true))).Match("Ab").Final()); + UNIT_ASSERT(!TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(false))).Match("Ab").Final()); + } + + Y_UNIT_TEST(Utf) { + NRegExp::TFsmBase::TOptions opts; + opts.Charset = CODES_UTF8; + opts.Surround = true; + UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("wtf").Final()); + UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("чзн").Final()); + UNIT_ASSERT(TMatcher(TFsm("ч.*", opts)).Match("чзн").Final()); + UNIT_ASSERT(!TMatcher(TFsm("чзн", opts)).Match("чзх").Final()); + } + + Y_UNIT_TEST(AndNot) { + NRegExp::TFsmBase::TOptions opts; + opts.AndNotSupport = true; + { + NRegExp::TFsm fsm(".*&~([0-9]*)", opts); + UNIT_ASSERT(TMatcher(fsm).Match("a2").Final()); + UNIT_ASSERT(TMatcher(fsm).Match("ab").Final()); + UNIT_ASSERT(TMatcher(fsm).Match("1a").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("12").Final()); + } + { + NRegExp::TFsm fsm(".*&~(.*[0-9].*)", opts); + UNIT_ASSERT(TMatcher(fsm).Match("ab").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("a2").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("1a").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("12").Final()); + } + { + NRegExp::TFsm fsm( + "((([a-z0-9_\\-]+[.])*[a-z0-9_\\-]+)" + "&~(\\d+[.]\\d+[.]\\d+[.]\\d+))(:\\d+)?", + TFsm::TOptions().SetCaseInsensitive(true).SetAndNotSupport(true) + ); + UNIT_ASSERT(TMatcher(fsm).Match("yandex.ru").Final()); + UNIT_ASSERT(TMatcher(fsm).Match("yandex").Final()); + UNIT_ASSERT(TMatcher(fsm).Match("yandex:80").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1:8080").Final()); + } + } + + Y_UNIT_TEST(Glue) { + TFsm glued = + TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true)) | + TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false)) | + TFsm("abc", TFsm::TOptions().SetCaseInsensitive(false)); + UNIT_ASSERT(TMatcher(glued).Match("Qw").Final()); + UNIT_ASSERT(TMatcher(glued).Match("Qw").Final()); + UNIT_ASSERT(TMatcher(glued).Match("abc").Final()); + UNIT_ASSERT(!TMatcher(glued).Match("Abc").Final()); + } + + Y_UNIT_TEST(Capture1) { + TCapturingFsm fsm("here we have user_id=([a-z0-9]+);"); + + TSearcher searcher(fsm); + searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a")); + } + + Y_UNIT_TEST(Capture2) { + TCapturingFsm fsm("w([abcdez]+)f"); + + TSearcher searcher(fsm); + searcher.Search("wabcdef"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("abcde")); + } + + Y_UNIT_TEST(Capture3) { + TCapturingFsm fsm("http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)", + TFsm::TOptions().SetCapture(2)); + + TSearcher searcher(fsm); + searcher.Search("http://vkontakte.ru/id100500"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500")); + } + + Y_UNIT_TEST(Capture4) { + TCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!", + TFsm::TOptions().SetCharset(CODES_UTF8)); + + TSearcher searcher(fsm); + searcher.Search(" Здравствуйте, Уважаемый (-ая)! "); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)")); + } + + Y_UNIT_TEST(Capture5) { + TCapturingFsm fsm("away\\.php\\?to=http:([^\"])+\""); + TSearcher searcher(fsm); + searcher.Search("\"/away.php?to=http:some.addr\"&id=1"); + UNIT_ASSERT(searcher.Captured()); + //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr")); + } + + Y_UNIT_TEST(Capture6) { + TCapturingFsm fsm("(/to-match-with)"); + TSearcher searcher(fsm); + searcher.Search("/some/table/path/to-match-with"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("/to-match-with")); + } + + Y_UNIT_TEST(Capture7) { + TCapturingFsm fsm("(pref.*suff)"); + TSearcher searcher(fsm); + searcher.Search("ala pref bla suff cla"); + UNIT_ASSERT(searcher.Captured()); + //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref bla suff")); + } + + Y_UNIT_TEST(CaptureXA) { + TCapturingFsm fsm(".*(xa).*"); + + TSearcher searcher(fsm); + searcher.Search("xa"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xa")); + } + + Y_UNIT_TEST(CaptureWrongXX) { + TCapturingFsm fsm(".*(xx).*"); + + TSearcher searcher(fsm); + searcher.Search("xx"); + UNIT_ASSERT(searcher.Captured()); + // Surprise! + // TCapturingFsm uses a fast - O(|text|) - but incorrect algorithm. + // It works more or less for a particular class of regexps to which ".*(xx).*" does not belong. + // So it returns not the expected "xx" but just the second "x" instead. + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("x")); + } + + Y_UNIT_TEST(CaptureRight1XX) { + TCapturingFsm fsm("[^x]+(xx).*"); + + TSearcher searcher(fsm); + + searcher.Search("xxx"); + UNIT_ASSERT(!searcher.Captured()); + } + + Y_UNIT_TEST(CaptureRight2XX) { + TCapturingFsm fsm("[^x]+(xx).*"); + + TSearcher searcher(fsm); + + searcher.Search("axx"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx")); + } + + Y_UNIT_TEST(CaptureRight3XX) { + TCapturingFsm fsm("[^x]+(xx).*"); + + TSearcher searcher(fsm); + + searcher.Search("axxb"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx")); + } + + Y_UNIT_TEST(SlowCaptureXX) { + TSlowCapturingFsm fsm(".*(xx).*"); + + TSlowSearcher searcher(fsm); + searcher.Search("xx"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx")); + } + + Y_UNIT_TEST(SlowCapture) { + TSlowCapturingFsm fsm("^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)", + TFsm::TOptions().SetCapture(2)); + TSlowSearcher searcher(fsm); + searcher.Search("http://vkontakte.ru/id100500"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500")); + } + + Y_UNIT_TEST(SlowCaptureGreedy) { + TSlowCapturingFsm fsm(".*(pref.*suff)"); + TSlowSearcher searcher(fsm); + searcher.Search("pref ala bla pref cla suff dla"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref cla suff")); + } + + Y_UNIT_TEST(SlowCaptureNonGreedy) { + TSlowCapturingFsm fsm(".*?(pref.*suff)"); + TSlowSearcher searcher(fsm); + searcher.Search("pref ala bla pref cla suff dla"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref ala bla pref cla suff")); + } + + Y_UNIT_TEST(SlowCapture2) { + TSlowCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!", + TFsm::TOptions().SetCharset(CODES_UTF8)); + + TSlowSearcher searcher(fsm); + searcher.Search(" Здравствуйте, Уважаемый (-ая)! "); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)")); + } + + Y_UNIT_TEST(SlowCapture3) { + TSlowCapturingFsm fsm("here we have user_id=([a-z0-9]+);"); + TSlowSearcher searcher(fsm); + searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a")); + } + + Y_UNIT_TEST(SlowCapture4) { + TSlowCapturingFsm fsm("away\\.php\\?to=http:([^\"]+)\""); + TSlowSearcher searcher(fsm); + searcher.Search("\"/away.php?to=http:some.addr\"&id=1"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr")); + } + + Y_UNIT_TEST(CapturedEmptySlow) { + TSlowCapturingFsm fsm("Comments=(.*)$"); + TSlowSearcher searcher(fsm); + searcher.Search("And Comments="); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("")); + } + + Y_UNIT_TEST(CaptureInOrFirst) { + TSlowCapturingFsm fsm("(A)|A"); + TSlowSearcher searcher(fsm); + searcher.Search("A"); + UNIT_ASSERT(searcher.Captured()); + } + + Y_UNIT_TEST(CaptureInOrSecond) { + TSlowCapturingFsm fsm("A|(A)"); + TSlowSearcher searcher(fsm); + searcher.Search("A"); + UNIT_ASSERT(!searcher.Captured()); + } + + Y_UNIT_TEST(CaptureOutside) { + TSlowCapturingFsm fsm("((ID=([0-9]+))?)"); + TSlowSearcher searcher(fsm); + searcher.Search("ID="); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("")); + } + + Y_UNIT_TEST(CaptureInside) { + TSlowCapturingFsm fsm("((ID=([0-9]+))?)", + TFsm::TOptions().SetCapture(2)); + TSlowSearcher searcher(fsm); + searcher.Search("ID="); + UNIT_ASSERT(!searcher.Captured()); + } + + Y_UNIT_TEST(Pcre2PireTest) { + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)"), "(fake)"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)??"), "(fake)?"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)*?fake"), "(fake)*fake"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>fake)"), "(fake)"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("fake\\#"), "fake#"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>)fake"), "fake"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?P<field1>)(?P<field2>))"), ""); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?:fake))"), "((fake))"); + } +} diff --git a/library/cpp/regex/pire/ut/ya.make b/library/cpp/regex/pire/ut/ya.make new file mode 100644 index 0000000000..8776695f40 --- /dev/null +++ b/library/cpp/regex/pire/ut/ya.make @@ -0,0 +1,44 @@ +# this test in not linked into build tree with ReCURSE and is built by unittest/library + +UNITTEST() + +OWNER( + g:util + davenger +) + +SET(PIRETESTSDIR contrib/libs/pire/ut) + +CFLAGS(-DPIRE_NO_CONFIG) + +PEERDIR( + library/cpp/regex/pire +) + +SRCDIR( + ${PIRETESTSDIR} +) + +ADDINCL( + contrib/libs/pire/pire + contrib/libs/pire/ut +) + +SRCS( + pire_ut.cpp + capture_ut.cpp + count_ut.cpp + glyph_ut.cpp + easy_ut.cpp + read_unicode_ut.cpp + regexp_ut.cpp + approx_matching_ut.cpp +) + +SIZE(MEDIUM) + +TIMEOUT(600) + +PIRE_INLINE(inline_ut.cpp) + +END() diff --git a/library/cpp/regex/pire/ya.make b/library/cpp/regex/pire/ya.make new file mode 100644 index 0000000000..c857e6d18b --- /dev/null +++ b/library/cpp/regex/pire/ya.make @@ -0,0 +1,40 @@ +LIBRARY() + +OWNER( + g:util + g:antiinfra + davenger + pg +) + +CFLAGS(-DPIRE_NO_CONFIG) + +SRCDIR(contrib/libs/pire/pire) + +SRCS( + pcre2pire.cpp + classes.cpp + encoding.cpp + fsm.cpp + scanner_io.cpp + easy.cpp + scanners/null.cpp + extra/capture.cpp + extra/count.cpp + extra/glyphs.cpp + re_lexer.cpp + re_parser.y + read_unicode.cpp + extraencodings.cpp + approx_matching.cpp + half_final_fsm.cpp + minimize.h +) + +PEERDIR( + library/cpp/charset +) + +END() + +RECURSE_FOR_TESTS(ut) |