aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/regex/pire/regexp.h
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/pire/regexp.h
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex/pire/regexp.h')
-rw-r--r--library/cpp/regex/pire/regexp.h337
1 files changed, 337 insertions, 0 deletions
diff --git a/library/cpp/regex/pire/regexp.h b/library/cpp/regex/pire/regexp.h
new file mode 100644
index 00000000000..94bba4064b7
--- /dev/null
+++ b/library/cpp/regex/pire/regexp.h
@@ -0,0 +1,337 @@
+#pragma once
+
+#include "pire.h"
+
+#include <library/cpp/charset/doccodes.h>
+#include <library/cpp/charset/recyr.hh>
+#include <util/generic/maybe.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+
+namespace NRegExp {
+ struct TMatcher;
+
+ struct TFsmBase {
+ struct TOptions {
+ inline TOptions& SetCaseInsensitive(bool v) noexcept {
+ CaseInsensitive = v;
+ return *this;
+ }
+
+ inline TOptions& SetSurround(bool v) noexcept {
+ Surround = v;
+ return *this;
+ }
+
+ inline TOptions& SetCapture(size_t pos) noexcept {
+ CapturePos = pos;
+ return *this;
+ }
+
+ inline TOptions& SetCharset(ECharset charset) noexcept {
+ Charset = charset;
+ return *this;
+ }
+
+ inline TOptions& SetAndNotSupport(bool andNotSupport) noexcept {
+ AndNotSupport = andNotSupport;
+ return *this;
+ }
+
+ bool CaseInsensitive = false;
+ bool Surround = false;
+ TMaybe<size_t> CapturePos;
+ ECharset Charset = CODES_UNKNOWN;
+ bool AndNotSupport = false;
+ };
+
+ static inline NPire::TFsm Parse(const TStringBuf& regexp,
+ const TOptions& opts, const bool needDetermine = true) {
+ NPire::TLexer lexer;
+ if (opts.Charset == CODES_UNKNOWN) {
+ lexer.Assign(regexp.data(), regexp.data() + regexp.size());
+ } else {
+ TVector<wchar32> ucs4(regexp.size() + 1);
+ size_t inRead = 0;
+ size_t outWritten = 0;
+ int recodeRes = RecodeToUnicode(opts.Charset, regexp.data(), ucs4.data(),
+ regexp.size(), regexp.size(), inRead, outWritten);
+ Y_ASSERT(recodeRes == RECODE_OK);
+ Y_ASSERT(outWritten < ucs4.size());
+ ucs4[outWritten] = 0;
+
+ lexer.Assign(ucs4.begin(),
+ ucs4.begin() + std::char_traits<wchar32>::length(ucs4.data()));
+ }
+
+ if (opts.CaseInsensitive) {
+ lexer.AddFeature(NPire::NFeatures::CaseInsensitive());
+ }
+
+ if (opts.CapturePos) {
+ lexer.AddFeature(NPire::NFeatures::Capture(*opts.CapturePos));
+ }
+
+ if (opts.AndNotSupport) {
+ lexer.AddFeature(NPire::NFeatures::AndNotSupport());
+ }
+
+ switch (opts.Charset) {
+ case CODES_UNKNOWN:
+ break;
+ case CODES_UTF8:
+ lexer.SetEncoding(NPire::NEncodings::Utf8());
+ break;
+ case CODES_KOI8:
+ lexer.SetEncoding(NPire::NEncodings::Koi8r());
+ break;
+ default:
+ lexer.SetEncoding(NPire::NEncodings::Get(opts.Charset));
+ break;
+ }
+
+ NPire::TFsm ret = lexer.Parse();
+
+ if (opts.Surround) {
+ ret.Surround();
+ }
+
+ if (needDetermine) {
+ ret.Determine();
+ }
+
+ return ret;
+ }
+ };
+
+ template <class TScannerType>
+ class TFsmParser: public TFsmBase {
+ public:
+ typedef TScannerType TScanner;
+
+ public:
+ inline explicit TFsmParser(const TStringBuf& regexp,
+ const TOptions& opts = TOptions(), bool needDetermine = true)
+ : Scanner(Parse(regexp, opts, needDetermine).template Compile<TScanner>())
+ {
+ }
+
+ inline const TScanner& GetScanner() const noexcept {
+ return Scanner;
+ }
+
+ static inline TFsmParser False() {
+ return TFsmParser(NPire::TFsm::MakeFalse().Compile<TScanner>());
+ }
+
+ inline explicit TFsmParser(const TScanner& compiled)
+ : Scanner(compiled)
+ {
+ if (Scanner.Empty())
+ ythrow yexception() << "Can't create fsm with empty scanner";
+ }
+
+ private:
+ TScanner Scanner;
+ };
+
+ class TFsm: public TFsmParser<NPire::TNonrelocScanner> {
+ public:
+ inline explicit TFsm(const TStringBuf& regexp,
+ const TOptions& opts = TOptions())
+ : TFsmParser<TScanner>(regexp, opts)
+ {
+ }
+
+ inline TFsm(const TFsmParser<TScanner>& fsm)
+ : TFsmParser<TScanner>(fsm)
+ {
+ }
+
+ static inline TFsm Glue(const TFsm& l, const TFsm& r) {
+ return TFsm(TScanner::Glue(l.GetScanner(), r.GetScanner()));
+ }
+
+ inline explicit TFsm(const TScanner& compiled)
+ : TFsmParser<TScanner>(compiled)
+ {
+ }
+ };
+
+ static inline TFsm operator|(const TFsm& l, const TFsm& r) {
+ return TFsm::Glue(l, r);
+ }
+
+ struct TCapturingFsm : TFsmParser<NPire::TCapturingScanner> {
+ inline explicit TCapturingFsm(const TStringBuf& regexp,
+ TOptions opts = TOptions())
+ : TFsmParser<TScanner>(regexp,
+ opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1)) {
+ }
+
+ inline TCapturingFsm(const TFsmParser<TScanner>& fsm)
+ : TFsmParser<TScanner>(fsm)
+ {
+ }
+ };
+
+ struct TSlowCapturingFsm : TFsmParser<NPire::TSlowCapturingScanner> {
+ inline explicit TSlowCapturingFsm(const TStringBuf& regexp,
+ TOptions opts = TOptions())
+ : TFsmParser<TScanner>(regexp,
+ opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1), false) {
+ }
+
+ inline TSlowCapturingFsm(const TFsmParser<TScanner>& fsm)
+ : TFsmParser<TScanner>(fsm)
+ {
+ }
+ };
+
+ template <class TFsm>
+ class TMatcherBase {
+ public:
+ typedef typename TFsm::TScanner::State TState;
+
+ public:
+ inline explicit TMatcherBase(const TFsm& fsm)
+ : Fsm(fsm)
+ {
+ Fsm.GetScanner().Initialize(State);
+ }
+
+ inline bool Final() const noexcept {
+ return GetScanner().Final(GetState());
+ }
+
+ protected:
+ inline void Run(const char* data, size_t len, bool addBegin, bool addEnd) noexcept {
+ if (addBegin) {
+ NPire::Step(GetScanner(), State, NPire::BeginMark);
+ }
+ NPire::Run(GetScanner(), State, data, data + len);
+ if (addEnd) {
+ NPire::Step(GetScanner(), State, NPire::EndMark);
+ }
+ }
+
+ inline const typename TFsm::TScanner& GetScanner() const noexcept {
+ return Fsm.GetScanner();
+ }
+
+ inline const TState& GetState() const noexcept {
+ return State;
+ }
+
+ private:
+ const TFsm& Fsm;
+ TState State;
+ };
+
+ struct TMatcher : TMatcherBase<TFsm> {
+ inline explicit TMatcher(const TFsm& fsm)
+ : TMatcherBase<TFsm>(fsm)
+ {
+ }
+
+ inline TMatcher& Match(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
+ Run(data, len, addBegin, addEnd);
+ return *this;
+ }
+
+ inline TMatcher& Match(const TStringBuf& s, bool addBegin = false, bool addEnd = false) noexcept {
+ return Match(s.data(), s.size(), addBegin, addEnd);
+ }
+
+ inline const char* Find(const char* b, const char* e) noexcept {
+ return NPire::ShortestPrefix(GetScanner(), b, e);
+ }
+
+ typedef std::pair<const size_t*, const size_t*> TMatchedRegexps;
+
+ inline TMatchedRegexps MatchedRegexps() const noexcept {
+ return GetScanner().AcceptedRegexps(GetState());
+ }
+ };
+
+ class TSearcher: public TMatcherBase<TCapturingFsm> {
+ public:
+ inline explicit TSearcher(const TCapturingFsm& fsm)
+ : TMatcherBase<TCapturingFsm>(fsm)
+ {
+ }
+
+ inline bool Captured() const noexcept {
+ return GetState().Captured();
+ }
+
+ inline TSearcher& Search(const char* data, size_t len, bool addBegin = true, bool addEnd = true) noexcept {
+ Data = TStringBuf(data, len);
+ Run(data, len, addBegin, addEnd);
+ return *this;
+ }
+
+ inline TSearcher& Search(const TStringBuf& s) noexcept {
+ return Search(s.data(), s.size());
+ }
+
+ inline TStringBuf GetCaptured() const noexcept {
+ return TStringBuf(Data.data() + GetState().Begin() - 1,
+ Data.data() + GetState().End() - 1);
+ }
+
+ private:
+ TStringBuf Data;
+ };
+
+ class TSlowSearcher : TMatcherBase<TSlowCapturingFsm>{
+ public:
+ typedef typename TSlowCapturingFsm::TScanner::State TState;
+ inline explicit TSlowSearcher(const TSlowCapturingFsm& fsm)
+ : TMatcherBase<TSlowCapturingFsm>(fsm)
+ , HasCaptured(false)
+ {
+ }
+
+ inline bool Captured() const noexcept {
+ return HasCaptured;
+ }
+
+ inline TSlowSearcher& Search(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept {
+ TStringBuf textData(data, len);
+ Data = textData;
+ Run(Data.begin(), Data.size(), addBegin, addEnd);
+ return GetAns();
+ }
+
+ inline TSlowSearcher& Search(const TStringBuf& s) noexcept {
+ return Search(s.data(), s.size());
+ }
+
+ inline TStringBuf GetCaptured() const noexcept {
+ return Ans;
+ }
+
+ private:
+ TStringBuf Data;
+ TStringBuf Ans;
+ bool HasCaptured;
+
+ inline TSlowSearcher& GetAns() {
+ auto state = GetState();
+ Pire::SlowCapturingScanner::SingleState final;
+ if (!GetScanner().GetCapture(state, final)) {
+ HasCaptured = false;
+ } else {
+ if (!final.HasEnd()) {
+ final.SetEnd(Data.size());
+ }
+ Ans = TStringBuf(Data, final.GetBegin(), final.GetEnd() - final.GetBegin());
+ HasCaptured = true;
+ }
+ return *this;
+ }
+ };
+}