diff options
author | thegeorg <[email protected]> | 2025-07-28 23:26:41 +0300 |
---|---|---|
committer | thegeorg <[email protected]> | 2025-07-28 23:42:24 +0300 |
commit | 2e3c965f3bac5a35f0ce39c7ae4e81bac879542f (patch) | |
tree | 7de46185d4d1bc4b1808109d95f72b914fa4adce /library/cpp/regex | |
parent | 48b676b29c3a9750b57dfb8c1c3c6aaa5e056a66 (diff) |
library/cpp/pire: Tab to space
```
$ ya-subst '\t' ' '
$ ya-subst '\s+$' ''
```
commit_hash:402b2f02694cb6fae6bd903d31e429e1cc9a5d4b
Diffstat (limited to 'library/cpp/regex')
52 files changed, 9875 insertions, 9875 deletions
diff --git a/library/cpp/regex/pire/inline/inline.l b/library/cpp/regex/pire/inline/inline.l index a198ab9f978..72c1e32c6ab 100644 --- a/library/cpp/regex/pire/inline/inline.l +++ b/library/cpp/regex/pire/inline/inline.l @@ -47,27 +47,27 @@ static int isatty(int) { return 0; } class Die { public: - Die() { - Msg = filename.empty() ? "pire_inline" : (filename + ":" + ToString(line) + ":"); - } + Die() { + Msg = filename.empty() ? "pire_inline" : (filename + ":" + ToString(line) + ":"); + } - template<class T> - Die& operator << (const T& t) { - Msg += ToString(t); - return *this; - } + template<class T> + Die& operator << (const T& t) { + Msg += ToString(t); + return *this; + } - ~Die() { - fprintf(stderr, "%s\n", Msg.c_str()); - exit(1); - } + ~Die() { + fprintf(stderr, "%s\n", Msg.c_str()); + exit(1); + } private: - ystring Msg; + ystring Msg; }; Die DieHelper() { - return Die(); + return Die(); } void putChar(char c) { putc(c, yyout); } @@ -90,115 +90,115 @@ void eatComment(void (*action)(char)); <INITIAL>"PIRE_REGEXP"[:space:]*"(" { BEGIN(Regexp); args.clear(); args.push_back(ystring()); } <Regexp>"\""([^\"]|\\.)*"\"" { - ystring& s = args.back(); - const char* p; - for (p = yytext + 1; *p && p[1]; ++p) { - if (*p == '\\') { - ++p; - if (!*p) - Die() << "string ends with a backslash"; - else if (*p == '\'' || *p == '\"' || *p == '\\') - s.push_back(*p); - else if (*p == 'n') - s.push_back('\n'); - else if (*p == 't') - s.push_back('\t'); - else if (isdigit(*p)) { - const char* beg = p; - while (isdigit(*p)) - ++p; - s.push_back(strtol(ystring(beg, p).c_str(), 0, 8)); - } else if (*p == 'x') { - const char* beg = p; - while (isdigit(*p) || (*p > 'a' && *p <= 'f') || (*p > 'A' && *p < 'F')) - ++p; - s.push_back(strtol(ystring(beg, p).c_str(), 0, 16)); - } else - Die() << "unknown escape sequence (\\" << *p << ")"; - } else - s.push_back(*p); - } - if (!*p) - Die() << "string ends with a backslash"; + ystring& s = args.back(); + const char* p; + for (p = yytext + 1; *p && p[1]; ++p) { + if (*p == '\\') { + ++p; + if (!*p) + Die() << "string ends with a backslash"; + else if (*p == '\'' || *p == '\"' || *p == '\\') + s.push_back(*p); + else if (*p == 'n') + s.push_back('\n'); + else if (*p == 't') + s.push_back('\t'); + else if (isdigit(*p)) { + const char* beg = p; + while (isdigit(*p)) + ++p; + s.push_back(strtol(ystring(beg, p).c_str(), 0, 8)); + } else if (*p == 'x') { + const char* beg = p; + while (isdigit(*p) || (*p > 'a' && *p <= 'f') || (*p > 'A' && *p < 'F')) + ++p; + s.push_back(strtol(ystring(beg, p).c_str(), 0, 16)); + } else + Die() << "unknown escape sequence (\\" << *p << ")"; + } else + s.push_back(*p); + } + if (!*p) + Die() << "string ends with a backslash"; } <Regexp>[ \t] {} <Regexp>\n { ++line; } <Regexp>"," { args.push_back(ystring()); } <Regexp>")" { - if (args.size() & 1 || args.empty()) - Die() << "Usage: PIRE_REGEXP(\"regexp1\", \"flags1\" [, \"regexp2\", \"flags2\" [,...] ])"; - - bool first = true; - Pire::Scanner sc; - ystring pattern; - for (auto i = args.begin(), ie = args.end(); i != ie; i += 2) { - - Pire::Lexer lexer(i->c_str(), i->c_str() + i->size()); - bool surround = false; - bool greedy = false; - bool reverse = false; - for (const char* option = (i+1)->c_str(); *option; ++option) { - if (*option == 'i') - lexer.AddFeature(Pire::Features::CaseInsensitive()); - else if (*option == 'u') - lexer.SetEncoding(Pire::Encodings::Utf8()); - else if (*option == 's') - surround = true; - else if (*option == 'a') - lexer.AddFeature(Pire::Features::AndNotSupport()); - else if (*option == 'g') - greedy = true; - else if (*option == 'r') - reverse = true; - else - Die() << "unknown option " << *option << ""; - } - - Pire::Fsm fsm; - try { - fsm = lexer.Parse(); - } - catch (std::exception& e) { - Die() << "" << filename << ":" << line << ": " << e.what() << ""; - } - if (reverse) - fsm.Reverse(); - if (greedy && surround) - Die() << "greedy and surround options are incompatible"; - if (greedy) - fsm = ~fsm.Surrounded() + fsm; - else if (surround) - fsm.Surround(); - - Pire::Scanner tsc(fsm); - if (first) { - pattern = *i; - first = false; - tsc.Swap(sc); - } else { - sc = Pire::Scanner::Glue(sc, tsc); - pattern += " | "; - pattern += *i; - } - } - - BufferOutput buf; - AlignedOutput stream(&buf); - Save(&stream, sc); - - fprintf(yyout, "Pire::MmappedScanner<Pire::Scanner>(PIRE_LITERAL( // %s \n \"", pattern.c_str()); - size_t pos = 5; - for (auto i = buf.Buffer().Begin(), ie = buf.Buffer().End(); i != ie; ++i) { - pos += fprintf(yyout, "\\x%02X", static_cast<unsigned char>(*i)); - if (pos >= 78) { - fprintf(yyout, "\"\n \""); - pos = 5; - } - } - fprintf(yyout, "\"), %u)\n#line %d \"%s\"\n", - (unsigned int) buf.Buffer().Size(), line, filename.c_str()); - BEGIN(INITIAL); + if (args.size() & 1 || args.empty()) + Die() << "Usage: PIRE_REGEXP(\"regexp1\", \"flags1\" [, \"regexp2\", \"flags2\" [,...] ])"; + + bool first = true; + Pire::Scanner sc; + ystring pattern; + for (auto i = args.begin(), ie = args.end(); i != ie; i += 2) { + + Pire::Lexer lexer(i->c_str(), i->c_str() + i->size()); + bool surround = false; + bool greedy = false; + bool reverse = false; + for (const char* option = (i+1)->c_str(); *option; ++option) { + if (*option == 'i') + lexer.AddFeature(Pire::Features::CaseInsensitive()); + else if (*option == 'u') + lexer.SetEncoding(Pire::Encodings::Utf8()); + else if (*option == 's') + surround = true; + else if (*option == 'a') + lexer.AddFeature(Pire::Features::AndNotSupport()); + else if (*option == 'g') + greedy = true; + else if (*option == 'r') + reverse = true; + else + Die() << "unknown option " << *option << ""; + } + + Pire::Fsm fsm; + try { + fsm = lexer.Parse(); + } + catch (std::exception& e) { + Die() << "" << filename << ":" << line << ": " << e.what() << ""; + } + if (reverse) + fsm.Reverse(); + if (greedy && surround) + Die() << "greedy and surround options are incompatible"; + if (greedy) + fsm = ~fsm.Surrounded() + fsm; + else if (surround) + fsm.Surround(); + + Pire::Scanner tsc(fsm); + if (first) { + pattern = *i; + first = false; + tsc.Swap(sc); + } else { + sc = Pire::Scanner::Glue(sc, tsc); + pattern += " | "; + pattern += *i; + } + } + + BufferOutput buf; + AlignedOutput stream(&buf); + Save(&stream, sc); + + fprintf(yyout, "Pire::MmappedScanner<Pire::Scanner>(PIRE_LITERAL( // %s \n \"", pattern.c_str()); + size_t pos = 5; + for (auto i = buf.Buffer().Begin(), ie = buf.Buffer().End(); i != ie; ++i) { + pos += fprintf(yyout, "\\x%02X", static_cast<unsigned char>(*i)); + if (pos >= 78) { + fprintf(yyout, "\"\n \""); + pos = 5; + } + } + fprintf(yyout, "\"), %u)\n#line %d \"%s\"\n", + (unsigned int) buf.Buffer().Size(), line, filename.c_str()); + BEGIN(INITIAL); } <INITIAL>. { putc(*yytext, yyout); } @@ -209,26 +209,26 @@ void eatComment(void (*action)(char)); void eatComment(void (*action)(char)) { - int c; - action('/'); action('*'); - for (;;) { - while ((c = yyinput()) != EOF && c != '*') { - if (c == '\n') - ++line; - action(c); - } - if (c == '*') { - action(c); - while ((c = yyinput()) == '*') - action(c); - if (c == '/') { - action(c); - break; - } - } - if (c == EOF) - Die() << "EOF in comment"; - } + int c; + action('/'); action('*'); + for (;;) { + while ((c = yyinput()) != EOF && c != '*') { + if (c == '\n') + ++line; + action(c); + } + if (c == '*') { + action(c); + while ((c = yyinput()) == '*') + action(c); + if (c == '/') { + action(c); + break; + } + } + if (c == EOF) + Die() << "EOF in comment"; + } } int yywrap() { return 1; } @@ -236,37 +236,37 @@ int yywrap() { return 1; } int main(int argc, char** argv) { - // Suppress warnings - static_cast<void>(&yy_fatal_error); - static_cast<void>(&yyunput); - - - try { - const char* outfile = 0; - if (argc >= 3 && !strcmp(argv[1], "-o")) { - outfile = argv[2]; - argv += 2, argc -= 2; - } - if (argc == 2) - filename = ystring(argv[1]); - else if (argc > 2) - Die() << "usage: pire_inline [-o outfile] [infile]"; - - yyin = stdin, yyout = stdout; - if (outfile && (yyout = fopen(outfile, "w")) == NULL) - Die() << "cannot open file " << outfile << " for writing"; - if (!filename.empty()) { - if ((yyin = fopen(filename.c_str(), "r")) == NULL) - Die() << "cannot open file " << filename.c_str() << "\n"; - } else - filename = "(stdin)"; - - - yylex(); - return 0; - } - catch (std::exception& e) { - fprintf(stderr, "%s\n", e.what()); - return 1; - } + // Suppress warnings + static_cast<void>(&yy_fatal_error); + static_cast<void>(&yyunput); + + + try { + const char* outfile = 0; + if (argc >= 3 && !strcmp(argv[1], "-o")) { + outfile = argv[2]; + argv += 2, argc -= 2; + } + if (argc == 2) + filename = ystring(argv[1]); + else if (argc > 2) + Die() << "usage: pire_inline [-o outfile] [infile]"; + + yyin = stdin, yyout = stdout; + if (outfile && (yyout = fopen(outfile, "w")) == NULL) + Die() << "cannot open file " << outfile << " for writing"; + if (!filename.empty()) { + if ((yyin = fopen(filename.c_str(), "r")) == NULL) + Die() << "cannot open file " << filename.c_str() << "\n"; + } else + filename = "(stdin)"; + + + yylex(); + return 0; + } + catch (std::exception& e) { + fprintf(stderr, "%s\n", e.what()); + return 1; + } } diff --git a/library/cpp/regex/pire/pire/align.h b/library/cpp/regex/pire/pire/align.h index 9e482f1b44a..595b8cbde83 100644 --- a/library/cpp/regex/pire/pire/align.h +++ b/library/cpp/regex/pire/pire/align.h @@ -12,7 +12,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -31,73 +31,73 @@ #include "platform.h" namespace Pire { - - namespace Impl { - - template<class T> - inline T AlignUp(T t, size_t bound) - { - return (T) (((size_t) t + (bound-1)) & ~(bound-1)); - } - - template<class T> - inline T AlignDown(T t, size_t bound) - { - return (T) ((size_t) t & ~(bound-1)); - } - - inline void AlignSave(yostream* s, size_t size) - { - size_t tail = AlignUp(size, sizeof(size_t)) - size; - if (tail) { - static const char buf[sizeof(MaxSizeWord)] = {0}; - SavePodArray(s, buf, tail); - } - } - - inline void AlignLoad(yistream* s, size_t size) - { - size_t tail = AlignUp(size, sizeof(size_t)) - size; - if (tail) { - char buf[sizeof(MaxSizeWord)]; - LoadPodArray(s, buf, tail); - } - } - - template<class T> - inline void AlignedSaveArray(yostream* s, const T* array, size_t count) - { - SavePodArray(s, array, count); - AlignSave(s, sizeof(*array) * count); - } - - template<class T> - inline void AlignedLoadArray(yistream* s, T* array, size_t count) - { - LoadPodArray(s, array, count); - AlignLoad(s, sizeof(*array) * count); - } - - template<class T> - inline bool IsAligned(T t, size_t bound) - { - return ((size_t) t & (bound-1)) == 0; - } - - inline const void* AlignPtr(const size_t*& p, size_t& size) - { - if (!IsAligned(p, sizeof(size_t))) { - const size_t* next = AlignUp(p, sizeof(size_t)); - if (next > p+size) - throw Error("EOF reached in NPire::Impl::align"); - size -= (next - p); - p = next; - } - return (const void*) p; - } - - } - + + namespace Impl { + + template<class T> + inline T AlignUp(T t, size_t bound) + { + return (T) (((size_t) t + (bound-1)) & ~(bound-1)); + } + + template<class T> + inline T AlignDown(T t, size_t bound) + { + return (T) ((size_t) t & ~(bound-1)); + } + + inline void AlignSave(yostream* s, size_t size) + { + size_t tail = AlignUp(size, sizeof(size_t)) - size; + if (tail) { + static const char buf[sizeof(MaxSizeWord)] = {0}; + SavePodArray(s, buf, tail); + } + } + + inline void AlignLoad(yistream* s, size_t size) + { + size_t tail = AlignUp(size, sizeof(size_t)) - size; + if (tail) { + char buf[sizeof(MaxSizeWord)]; + LoadPodArray(s, buf, tail); + } + } + + template<class T> + inline void AlignedSaveArray(yostream* s, const T* array, size_t count) + { + SavePodArray(s, array, count); + AlignSave(s, sizeof(*array) * count); + } + + template<class T> + inline void AlignedLoadArray(yistream* s, T* array, size_t count) + { + LoadPodArray(s, array, count); + AlignLoad(s, sizeof(*array) * count); + } + + template<class T> + inline bool IsAligned(T t, size_t bound) + { + return ((size_t) t & (bound-1)) == 0; + } + + inline const void* AlignPtr(const size_t*& p, size_t& size) + { + if (!IsAligned(p, sizeof(size_t))) { + const size_t* next = AlignUp(p, sizeof(size_t)); + if (next > p+size) + throw Error("EOF reached in NPire::Impl::align"); + size -= (next - p); + p = next; + } + return (const void*) p; + } + + } + } #endif diff --git a/library/cpp/regex/pire/pire/any.h b/library/cpp/regex/pire/pire/any.h index 671a4832381..4e968857dc8 100644 --- a/library/cpp/regex/pire/pire/any.h +++ b/library/cpp/regex/pire/pire/any.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -34,98 +34,98 @@ namespace Pire { class Any { public: - Any() = default; - - Any(const Any& any) - { - if (any.h) - h = any.h->Duplicate(); - } - - Any& operator= (Any any) - { - any.Swap(*this); - return *this; - } - - template <class T> - Any(const T& t) - : h(new Holder<T>(t)) - { - } - - bool Empty() const { - return !h; - } - template <class T> - bool IsA() const { - return h && h->IsA(typeid(T)); - } - - template <class T> - T& As() - { - if (h && IsA<T>()) - return *reinterpret_cast<T*>(h->Ptr()); - else - throw Pire::Error("type mismatch"); - } - - template <class T> - const T& As() const - { - if (h && IsA<T>()) - return *reinterpret_cast<const T*>(h->Ptr()); - else - throw Pire::Error("type mismatch"); - } - - void Swap(Any& a) noexcept { - DoSwap(h, a.h); - } + Any() = default; + + Any(const Any& any) + { + if (any.h) + h = any.h->Duplicate(); + } + + Any& operator= (Any any) + { + any.Swap(*this); + return *this; + } + + template <class T> + Any(const T& t) + : h(new Holder<T>(t)) + { + } + + bool Empty() const { + return !h; + } + template <class T> + bool IsA() const { + return h && h->IsA(typeid(T)); + } + + template <class T> + T& As() + { + if (h && IsA<T>()) + return *reinterpret_cast<T*>(h->Ptr()); + else + throw Pire::Error("type mismatch"); + } + + template <class T> + const T& As() const + { + if (h && IsA<T>()) + return *reinterpret_cast<const T*>(h->Ptr()); + else + throw Pire::Error("type mismatch"); + } + + void Swap(Any& a) noexcept { + DoSwap(h, a.h); + } private: - struct AbstractHolder { - virtual ~AbstractHolder() { - } - virtual THolder<AbstractHolder> Duplicate() const = 0; - virtual bool IsA(const std::type_info& id) const = 0; - virtual void* Ptr() = 0; - virtual const void* Ptr() const = 0; - }; - - template <class T> - struct Holder: public AbstractHolder { - Holder(T t) - : d(t) - { - } - THolder<AbstractHolder> Duplicate() const { - return THolder<AbstractHolder>(new Holder<T>(d)); - } - bool IsA(const std::type_info& id) const { - return id == typeid(T); - } - void* Ptr() { - return &d; - } - const void* Ptr() const { - return &d; - } - private: - T d; - }; - - THolder<AbstractHolder> h; + struct AbstractHolder { + virtual ~AbstractHolder() { + } + virtual THolder<AbstractHolder> Duplicate() const = 0; + virtual bool IsA(const std::type_info& id) const = 0; + virtual void* Ptr() = 0; + virtual const void* Ptr() const = 0; + }; + + template <class T> + struct Holder: public AbstractHolder { + Holder(T t) + : d(t) + { + } + THolder<AbstractHolder> Duplicate() const { + return THolder<AbstractHolder>(new Holder<T>(d)); + } + bool IsA(const std::type_info& id) const { + return id == typeid(T); + } + void* Ptr() { + return &d; + } + const void* Ptr() const { + return &d; + } + private: + T d; + }; + + THolder<AbstractHolder> h; }; } namespace std { - inline void swap(Pire::Any& a, Pire::Any& b) { - a.Swap(b); - } + inline void swap(Pire::Any& a, Pire::Any& b) { + a.Swap(b); + } } #endif diff --git a/library/cpp/regex/pire/pire/approx_matching.cpp b/library/cpp/regex/pire/pire/approx_matching.cpp index 23f74ca01df..fb8adf1885d 100644 --- a/library/cpp/regex/pire/pire/approx_matching.cpp +++ b/library/cpp/regex/pire/pire/approx_matching.cpp @@ -23,72 +23,72 @@ #include "approx_matching.h" namespace Pire { - Fsm CreateApproxFsm(const Fsm& regexp, size_t distance) { - Fsm approxFsm = regexp; + Fsm CreateApproxFsm(const Fsm& regexp, size_t distance) { + Fsm approxFsm = regexp; - TVector<TSet<Char>> outgoingLettersTable(regexp.Size()); - for (size_t state = 0; state < regexp.Size(); ++state) { - outgoingLettersTable[state] = regexp.OutgoingLetters(state); - } + TVector<TSet<Char>> outgoingLettersTable(regexp.Size()); + for (size_t state = 0; state < regexp.Size(); ++state) { + outgoingLettersTable[state] = regexp.OutgoingLetters(state); + } - TVector<TMap<Char, Fsm::StatesSet>> destinationsTable(regexp.Size()); - for (size_t state = 0; state < regexp.Size(); ++state) { - for (Char letter : outgoingLettersTable[state]) { - destinationsTable[state][letter] = regexp.Destinations(state, letter); - } - } + TVector<TMap<Char, Fsm::StatesSet>> destinationsTable(regexp.Size()); + for (size_t state = 0; state < regexp.Size(); ++state) { + for (Char letter : outgoingLettersTable[state]) { + destinationsTable[state][letter] = regexp.Destinations(state, letter); + } + } - for (size_t fsmIdx = 0; fsmIdx < distance; ++fsmIdx) { - approxFsm.Import(regexp); - const auto shift = fsmIdx * regexp.Size(); + for (size_t fsmIdx = 0; fsmIdx < distance; ++fsmIdx) { + approxFsm.Import(regexp); + const auto shift = fsmIdx * regexp.Size(); - for (size_t state = 0; state < regexp.Size(); ++state) { - for (Char letter : outgoingLettersTable[state]) { - for (size_t to : destinationsTable[state][letter]) { - for (Char ch = 0; ch < MaxChar; ++ch) { - if (!approxFsm.Connected(state + shift, to + shift, ch)) { - approxFsm.Connect(state + shift, to + shift + regexp.Size(), ch); - } - } + for (size_t state = 0; state < regexp.Size(); ++state) { + for (Char letter : outgoingLettersTable[state]) { + for (size_t to : destinationsTable[state][letter]) { + for (Char ch = 0; ch < MaxChar; ++ch) { + if (!approxFsm.Connected(state + shift, to + shift, ch)) { + approxFsm.Connect(state + shift, to + shift + regexp.Size(), ch); + } + } - approxFsm.Connect(state + shift, to + shift + regexp.Size(), Epsilon); - } + approxFsm.Connect(state + shift, to + shift + regexp.Size(), Epsilon); + } - for (Char ch = 0; ch < MaxChar; ++ch) { - approxFsm.Connect(state + shift, state + shift + regexp.Size(), ch); - } - } + for (Char ch = 0; ch < MaxChar; ++ch) { + approxFsm.Connect(state + shift, state + shift + regexp.Size(), ch); + } + } - if (regexp.IsFinal(state)) { - approxFsm.SetFinal(state + shift + regexp.Size(), true); - } - } - } + if (regexp.IsFinal(state)) { + approxFsm.SetFinal(state + shift + regexp.Size(), true); + } + } + } - size_t maxState = (distance > 0) ? approxFsm.Size() - regexp.Size() : 0; - for (size_t state = 0; state < maxState; ++state) { - size_t currentDist = state / regexp.Size(); - size_t intState = state % regexp.Size(); + size_t maxState = (distance > 0) ? approxFsm.Size() - regexp.Size() : 0; + for (size_t state = 0; state < maxState; ++state) { + size_t currentDist = state / regexp.Size(); + size_t intState = state % regexp.Size(); - for (Char firstLetter : outgoingLettersTable[intState]) { - for (size_t firstDest : destinationsTable[intState][firstLetter]) { - for (Char secondLetter : outgoingLettersTable[firstDest]) { - for (size_t secondDest : destinationsTable[firstDest][secondLetter]) { - if (secondDest != intState || firstDest != intState) { - approxFsm.Resize(approxFsm.Size() + 1); + for (Char firstLetter : outgoingLettersTable[intState]) { + for (size_t firstDest : destinationsTable[intState][firstLetter]) { + for (Char secondLetter : outgoingLettersTable[firstDest]) { + for (size_t secondDest : destinationsTable[firstDest][secondLetter]) { + if (secondDest != intState || firstDest != intState) { + approxFsm.Resize(approxFsm.Size() + 1); - size_t to = secondDest + (currentDist + 1) * regexp.Size(); - size_t middle = approxFsm.Size() - 1; + size_t to = secondDest + (currentDist + 1) * regexp.Size(); + size_t middle = approxFsm.Size() - 1; - approxFsm.Connect(state, middle, secondLetter); - approxFsm.Connect(middle, to, firstLetter); - } - } - } - } - } - } + approxFsm.Connect(state, middle, secondLetter); + approxFsm.Connect(middle, to, firstLetter); + } + } + } + } + } + } - return approxFsm; - } + return approxFsm; + } } diff --git a/library/cpp/regex/pire/pire/approx_matching.h b/library/cpp/regex/pire/pire/approx_matching.h index fc2a9fd61c1..c20d68ce6ac 100644 --- a/library/cpp/regex/pire/pire/approx_matching.h +++ b/library/cpp/regex/pire/pire/approx_matching.h @@ -24,5 +24,5 @@ #include "fsm.h" namespace Pire { - Fsm CreateApproxFsm(const Fsm& regexp, size_t distance); + Fsm CreateApproxFsm(const Fsm& regexp, size_t distance); } diff --git a/library/cpp/regex/pire/pire/classes.cpp b/library/cpp/regex/pire/pire/classes.cpp index 7dd531ab3e1..3558e775994 100644 --- a/library/cpp/regex/pire/pire/classes.cpp +++ b/library/cpp/regex/pire/pire/classes.cpp @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -32,120 +32,120 @@ namespace Pire { namespace { - class CharClassesTable: private NonCopyable { - private: - class CharClass { - public: - CharClass() {} - explicit CharClass(wchar32 ch) { m_bounds.push_back(ymake_pair(ch, ch)); } - CharClass(wchar32 lower, wchar32 upper) { m_bounds.push_back(ymake_pair(lower, upper)); } - - CharClass& operator |= (const CharClass& cc) - { - std::copy(cc.m_bounds.begin(), cc.m_bounds.end(), std::back_inserter(m_bounds)); - return *this; - } - - CharClass operator | (const CharClass& cc) const - { - CharClass r(*this); - r |= cc; - return r; - } - - TSet<wchar32> ToSet() const - { - TSet<wchar32> ret; - for (auto&& bound : m_bounds) - for (wchar32 c = bound.first; c <= bound.second; ++c) - ret.insert(c); - return ret; - } - - private: - TVector<ypair<wchar32, wchar32> > m_bounds; - }; - - public: - bool Has(wchar32 wc) const - { - return (m_classes.find(to_lower(wc & ~ControlMask)) != m_classes.end()); - } - - TSet<wchar32> Get(wchar32 wc) const - { - auto it = m_classes.find(to_lower(wc & ~ControlMask)); - if (it == m_classes.end()) - throw Error("Unknown character class"); - return it->second.ToSet(); - } - - CharClassesTable() - { - m_classes['l'] = CharClass('A', 'Z') | CharClass('a', 'z'); - m_classes['c'] - = CharClass(0x0410, 0x044F) // Russian capital A to Russan capital YA, Russian small A to Russian small YA - | CharClass(0x0401) // Russian capital Yo - | CharClass(0x0451) // Russian small Yo - ; - - m_classes['w'] = m_classes['l'] | m_classes['c']; - m_classes['d'] = CharClass('0', '9'); - m_classes['s'] - = CharClass(' ') | CharClass('\t') | CharClass('\r') | CharClass('\n') - | CharClass(0x00A0) // Non-breaking space - ; - - // A few special classes which do not have any negation - m_classes['n'] = CharClass('\n'); - m_classes['r'] = CharClass('\r'); - m_classes['t'] = CharClass('\t'); - } - - TMap<wchar32, CharClass> m_classes; - }; - - class CharClassesImpl: public Feature { - public: - CharClassesImpl(): m_table(Singleton<CharClassesTable>()) {} - int Priority() const { return 10; } - - void Alter(Term& t) - { - if (t.Value().IsA<Term::CharacterRange>()) { - const Term::CharacterRange& range = t.Value().As<Term::CharacterRange>(); - typedef Term::CharacterRange::first_type CharSet; - const CharSet& old = range.first; - CharSet altered; - bool pos = false; - bool neg = false; - for (auto&& i : old) - if (i.size() == 1 && (i[0] & ControlMask) == Control && m_table->Has(i[0])) { - if (is_upper(i[0] & ~ControlMask)) - neg = true; - else - pos = true; - - TSet<wchar32> klass = m_table->Get(i[0]); - for (auto&& j : klass) - altered.insert(Term::String(1, j)); - } else - altered.insert(i); - - if (neg && (pos || range.second)) - Error("Positive and negative character ranges mixed"); - t = Term(t.Type(), Term::CharacterRange(altered, neg || range.second)); - } - } - - private: - CharClassesTable* m_table; - }; + class CharClassesTable: private NonCopyable { + private: + class CharClass { + public: + CharClass() {} + explicit CharClass(wchar32 ch) { m_bounds.push_back(ymake_pair(ch, ch)); } + CharClass(wchar32 lower, wchar32 upper) { m_bounds.push_back(ymake_pair(lower, upper)); } + + CharClass& operator |= (const CharClass& cc) + { + std::copy(cc.m_bounds.begin(), cc.m_bounds.end(), std::back_inserter(m_bounds)); + return *this; + } + + CharClass operator | (const CharClass& cc) const + { + CharClass r(*this); + r |= cc; + return r; + } + + TSet<wchar32> ToSet() const + { + TSet<wchar32> ret; + for (auto&& bound : m_bounds) + for (wchar32 c = bound.first; c <= bound.second; ++c) + ret.insert(c); + return ret; + } + + private: + TVector<ypair<wchar32, wchar32> > m_bounds; + }; + + public: + bool Has(wchar32 wc) const + { + return (m_classes.find(to_lower(wc & ~ControlMask)) != m_classes.end()); + } + + TSet<wchar32> Get(wchar32 wc) const + { + auto it = m_classes.find(to_lower(wc & ~ControlMask)); + if (it == m_classes.end()) + throw Error("Unknown character class"); + return it->second.ToSet(); + } + + CharClassesTable() + { + m_classes['l'] = CharClass('A', 'Z') | CharClass('a', 'z'); + m_classes['c'] + = CharClass(0x0410, 0x044F) // Russian capital A to Russan capital YA, Russian small A to Russian small YA + | CharClass(0x0401) // Russian capital Yo + | CharClass(0x0451) // Russian small Yo + ; + + m_classes['w'] = m_classes['l'] | m_classes['c']; + m_classes['d'] = CharClass('0', '9'); + m_classes['s'] + = CharClass(' ') | CharClass('\t') | CharClass('\r') | CharClass('\n') + | CharClass(0x00A0) // Non-breaking space + ; + + // A few special classes which do not have any negation + m_classes['n'] = CharClass('\n'); + m_classes['r'] = CharClass('\r'); + m_classes['t'] = CharClass('\t'); + } + + TMap<wchar32, CharClass> m_classes; + }; + + class CharClassesImpl: public Feature { + public: + CharClassesImpl(): m_table(Singleton<CharClassesTable>()) {} + int Priority() const { return 10; } + + void Alter(Term& t) + { + if (t.Value().IsA<Term::CharacterRange>()) { + const Term::CharacterRange& range = t.Value().As<Term::CharacterRange>(); + typedef Term::CharacterRange::first_type CharSet; + const CharSet& old = range.first; + CharSet altered; + bool pos = false; + bool neg = false; + for (auto&& i : old) + if (i.size() == 1 && (i[0] & ControlMask) == Control && m_table->Has(i[0])) { + if (is_upper(i[0] & ~ControlMask)) + neg = true; + else + pos = true; + + TSet<wchar32> klass = m_table->Get(i[0]); + for (auto&& j : klass) + altered.insert(Term::String(1, j)); + } else + altered.insert(i); + + if (neg && (pos || range.second)) + Error("Positive and negative character ranges mixed"); + t = Term(t.Type(), Term::CharacterRange(altered, neg || range.second)); + } + } + + private: + CharClassesTable* m_table; + }; } namespace Features { - Feature::Ptr CharClasses() { return Feature::Ptr(new CharClassesImpl); } + Feature::Ptr CharClasses() { return Feature::Ptr(new CharClassesImpl); } } } diff --git a/library/cpp/regex/pire/pire/defs.h b/library/cpp/regex/pire/pire/defs.h index 894cc780b72..18570cd5bfe 100644 --- a/library/cpp/regex/pire/pire/defs.h +++ b/library/cpp/regex/pire/pire/defs.h @@ -37,73 +37,73 @@ namespace Pire { #ifdef PIRE_DEBUG -# define PIRE_IFDEBUG(x) x +# define PIRE_IFDEBUG(x) x #else -# define PIRE_IFDEBUG(x) +# define PIRE_IFDEBUG(x) #endif #ifdef PIRE_CHECKED -# define PIRE_IF_CHECKED(e) e +# define PIRE_IF_CHECKED(e) e #else -# define PIRE_IF_CHECKED(e) +# define PIRE_IF_CHECKED(e) #endif - typedef unsigned short Char; + typedef unsigned short Char; - namespace SpecialChar { - enum { - Epsilon = 257, - BeginMark = 258, - EndMark = 259, + namespace SpecialChar { + enum { + Epsilon = 257, + BeginMark = 258, + EndMark = 259, - // Actual size of input alphabet - MaxCharUnaligned = 260, + // Actual size of input alphabet + MaxCharUnaligned = 260, - // Size of letter transition tables, must be a multiple of the machine word size - MaxChar = (MaxCharUnaligned + (sizeof(void*)-1)) & ~(sizeof(void*)-1) - }; - } + // Size of letter transition tables, must be a multiple of the machine word size + MaxChar = (MaxCharUnaligned + (sizeof(void*)-1)) & ~(sizeof(void*)-1) + }; + } - using namespace SpecialChar; + using namespace SpecialChar; - namespace Impl { + namespace Impl { #ifndef PIRE_WORDS_BIGENDIAN - inline size_t ToLittleEndian(size_t val) { return val; } + inline size_t ToLittleEndian(size_t val) { return val; } #else - template<unsigned N> - inline size_t SwapBytes(size_t val) - { - static const size_t Mask = (1 << (N/2)) - 1; - return ((SwapBytes<N/2>(val) & Mask) << (N/2)) | SwapBytes<N/2>(val >> (N/2)); - } + template<unsigned N> + inline size_t SwapBytes(size_t val) + { + static const size_t Mask = (1 << (N/2)) - 1; + return ((SwapBytes<N/2>(val) & Mask) << (N/2)) | SwapBytes<N/2>(val >> (N/2)); + } - template<> - inline size_t SwapBytes<8>(size_t val) { return val & 0xFF; } + template<> + inline size_t SwapBytes<8>(size_t val) { return val & 0xFF; } - inline size_t ToLittleEndian(size_t val) { return SwapBytes<sizeof(val)*8>(val); } + inline size_t ToLittleEndian(size_t val) { return SwapBytes<sizeof(val)*8>(val); } #endif struct Struct { void* p; }; - } + } } #ifndef PIRE_ALIGNED_DECL -# if defined(PIRE_HAVE_ALIGNAS) -# define PIRE_ALIGNED_DECL(x) alignas(::Pire::Impl::Struct) static const char x[] -# elif defined(PIRE_HAVE_ATTR_ALIGNED) -# define PIRE_ALIGNED_DECL(x) static const char x[] __attribute__((aligned(sizeof(void*)))) -# elif defined(PIRE_HAVE_DECLSPEC_ALIGN) -# define PIRE_ALIGNED_DECL(x) __declspec(align(8)) static const char x[] -# endif +# if defined(PIRE_HAVE_ALIGNAS) +# define PIRE_ALIGNED_DECL(x) alignas(::Pire::Impl::Struct) static const char x[] +# elif defined(PIRE_HAVE_ATTR_ALIGNED) +# define PIRE_ALIGNED_DECL(x) static const char x[] __attribute__((aligned(sizeof(void*)))) +# elif defined(PIRE_HAVE_DECLSPEC_ALIGN) +# define PIRE_ALIGNED_DECL(x) __declspec(align(8)) static const char x[] +# endif #endif #ifndef PIRE_LITERAL -# if defined(PIRE_HAVE_LAMBDAS) -# define PIRE_LITERAL(data) ([]() -> const char* { PIRE_ALIGNED_DECL(__pire_regexp__) = data; return __pire_regexp__; })() -# elif defined(PIRE_HAVE_SCOPED_EXPR) -# define PIRE_LITERAL(data) ({ PIRE_ALIGNED_DECL(__pire_regexp__) = data; __pire_regexp__; }) -# endif +# if defined(PIRE_HAVE_LAMBDAS) +# define PIRE_LITERAL(data) ([]() -> const char* { PIRE_ALIGNED_DECL(__pire_regexp__) = data; return __pire_regexp__; })() +# elif defined(PIRE_HAVE_SCOPED_EXPR) +# define PIRE_LITERAL(data) ({ PIRE_ALIGNED_DECL(__pire_regexp__) = data; __pire_regexp__; }) +# endif #endif #endif diff --git a/library/cpp/regex/pire/pire/determine.h b/library/cpp/regex/pire/pire/determine.h index 96ee1b52749..d60304a265d 100644 --- a/library/cpp/regex/pire/pire/determine.h +++ b/library/cpp/regex/pire/pire/determine.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -29,117 +29,117 @@ #include "partition.h" namespace Pire { - namespace Impl { - - /** - * An interface of a determination task. - * You don't have to derive from this class; it is just a start point template. - */ - class DetermineTask { - private: - struct ImplementationSpecific1; - struct ImplementationSpecific2; - - public: - /// A type representing a new state (may be a set of old states, a pair of them, etc...) - typedef ImplementationSpecific1 State; - - /// A type of letter equivalence classes table. - typedef Partition<char, ImplementationSpecific2> LettersTbl; - - /// A container used for storing map of states to thier indices. - typedef TMap<State, size_t> InvStates; - - /// Should return used letters' partition. - const LettersTbl& Letters() const; - - /// Should return initial state (surprise!) - State Initial() const; - - /// Should calculate next state, given the current state and a letter. - State Next(State state, Char letter) const; - - /// Should return true iff the state need to be processed. - bool IsRequired(const State& /*state*/) const { return true; } - - /// Called when the set of new states is closed. - void AcceptStates(const TVector<State>& newstates); - - /// Called for each transition from one new state to another. - void Connect(size_t from, size_t to, Char letter); - - typedef bool Result; - Result Success() { return true; } - Result Failure() { return false; } - }; - - /** - * A helper function for FSM determining and all determine-like algorithms - * like scanners' agglutination. - * - * Given an indirectly specified automaton (through Task::Initial() and Task::Next() - * functions, see above), performs a breadth-first traversal, finding and enumerating - * all effectively reachable states. Then passes all found states and transitions - * between them back to the task. - * - * Initial state is always placed at zero position. - * - * Please note that the function does not take care of any payload (including final flags); - * it is the task's responsibility to agglutinate them properly. - * - * Returns task.Succeed() if everything was done; task.Failure() if maximum limit of state count was reached. - */ - template<class Task> - typename Task::Result Determine(Task& task, size_t maxSize) - { - typedef typename Task::State State; - typedef typename Task::InvStates InvStates; - typedef TDeque< TVector<size_t> > TransitionTable; - - TVector<State> states; - InvStates invstates; - TransitionTable transitions; - TVector<size_t> stateIndices; - - states.push_back(task.Initial()); - invstates.insert(typename InvStates::value_type(states[0], 0)); - - for (size_t stateIdx = 0; stateIdx < states.size(); ++stateIdx) { - if (!task.IsRequired(states[stateIdx])) - continue; - TransitionTable::value_type row(task.Letters().Size()); - for (auto&& letter : task.Letters()) { - State newState = task.Next(states[stateIdx], letter.first); - auto i = invstates.find(newState); - if (i == invstates.end()) { - if (!maxSize--) - return task.Failure(); - i = invstates.insert(typename InvStates::value_type(newState, states.size())).first; - states.push_back(newState); - } - row[letter.second.first] = i->second; - } - transitions.push_back(row); - stateIndices.push_back(stateIdx); - } - - TVector<Char> invletters(task.Letters().Size()); - for (auto&& letter : task.Letters()) - invletters[letter.second.first] = letter.first; - - task.AcceptStates(states); - size_t from = 0; - for (TransitionTable::iterator i = transitions.begin(), ie = transitions.end(); i != ie; ++i, ++from) { - TVector<Char>::iterator l = invletters.begin(); - for (TransitionTable::value_type::iterator j = i->begin(), je = i->end(); j != je; ++j, ++l) - task.Connect(stateIndices[from], *j, *l); - } - return task.Success(); - } - - // Faster transition table representation for determined FSM - typedef TVector<size_t> DeterminedTransitions; - } + namespace Impl { + + /** + * An interface of a determination task. + * You don't have to derive from this class; it is just a start point template. + */ + class DetermineTask { + private: + struct ImplementationSpecific1; + struct ImplementationSpecific2; + + public: + /// A type representing a new state (may be a set of old states, a pair of them, etc...) + typedef ImplementationSpecific1 State; + + /// A type of letter equivalence classes table. + typedef Partition<char, ImplementationSpecific2> LettersTbl; + + /// A container used for storing map of states to thier indices. + typedef TMap<State, size_t> InvStates; + + /// Should return used letters' partition. + const LettersTbl& Letters() const; + + /// Should return initial state (surprise!) + State Initial() const; + + /// Should calculate next state, given the current state and a letter. + State Next(State state, Char letter) const; + + /// Should return true iff the state need to be processed. + bool IsRequired(const State& /*state*/) const { return true; } + + /// Called when the set of new states is closed. + void AcceptStates(const TVector<State>& newstates); + + /// Called for each transition from one new state to another. + void Connect(size_t from, size_t to, Char letter); + + typedef bool Result; + Result Success() { return true; } + Result Failure() { return false; } + }; + + /** + * A helper function for FSM determining and all determine-like algorithms + * like scanners' agglutination. + * + * Given an indirectly specified automaton (through Task::Initial() and Task::Next() + * functions, see above), performs a breadth-first traversal, finding and enumerating + * all effectively reachable states. Then passes all found states and transitions + * between them back to the task. + * + * Initial state is always placed at zero position. + * + * Please note that the function does not take care of any payload (including final flags); + * it is the task's responsibility to agglutinate them properly. + * + * Returns task.Succeed() if everything was done; task.Failure() if maximum limit of state count was reached. + */ + template<class Task> + typename Task::Result Determine(Task& task, size_t maxSize) + { + typedef typename Task::State State; + typedef typename Task::InvStates InvStates; + typedef TDeque< TVector<size_t> > TransitionTable; + + TVector<State> states; + InvStates invstates; + TransitionTable transitions; + TVector<size_t> stateIndices; + + states.push_back(task.Initial()); + invstates.insert(typename InvStates::value_type(states[0], 0)); + + for (size_t stateIdx = 0; stateIdx < states.size(); ++stateIdx) { + if (!task.IsRequired(states[stateIdx])) + continue; + TransitionTable::value_type row(task.Letters().Size()); + for (auto&& letter : task.Letters()) { + State newState = task.Next(states[stateIdx], letter.first); + auto i = invstates.find(newState); + if (i == invstates.end()) { + if (!maxSize--) + return task.Failure(); + i = invstates.insert(typename InvStates::value_type(newState, states.size())).first; + states.push_back(newState); + } + row[letter.second.first] = i->second; + } + transitions.push_back(row); + stateIndices.push_back(stateIdx); + } + + TVector<Char> invletters(task.Letters().Size()); + for (auto&& letter : task.Letters()) + invletters[letter.second.first] = letter.first; + + task.AcceptStates(states); + size_t from = 0; + for (TransitionTable::iterator i = transitions.begin(), ie = transitions.end(); i != ie; ++i, ++from) { + TVector<Char>::iterator l = invletters.begin(); + for (TransitionTable::value_type::iterator j = i->begin(), je = i->end(); j != je; ++j, ++l) + task.Connect(stateIndices[from], *j, *l); + } + return task.Success(); + } + + // Faster transition table representation for determined FSM + typedef TVector<size_t> DeterminedTransitions; + } } #endif diff --git a/library/cpp/regex/pire/pire/easy.cpp b/library/cpp/regex/pire/pire/easy.cpp index bcb56c693bb..8fd4d255f5e 100644 --- a/library/cpp/regex/pire/pire/easy.cpp +++ b/library/cpp/regex/pire/pire/easy.cpp @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -26,7 +26,7 @@ namespace Pire { const Option<const Encoding&> UTF8(&Pire::Encodings::Utf8); const Option<const Encoding&> LATIN1(&Pire::Encodings::Latin1); - + const Option<Feature::Ptr> I(&Pire::Features::CaseInsensitive); const Option<Feature::Ptr> ANDNOT(&Pire::Features::AndNotSupport); diff --git a/library/cpp/regex/pire/pire/easy.h b/library/cpp/regex/pire/pire/easy.h index 6434cd6f223..2fa3cbd5256 100644 --- a/library/cpp/regex/pire/pire/easy.h +++ b/library/cpp/regex/pire/pire/easy.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -57,76 +57,76 @@ #include "vbitset.h" namespace Pire { - + template<class Arg> class Option; class Options { public: - Options(): m_encoding(&Pire::Encodings::Latin1()) {} - ~Options() { Clear(); } - - void Add(const Pire::Encoding& encoding) { m_encoding = &encoding; } - void Add(Feature::Ptr&& feature) { m_features.push_back(std::move(feature)); } - - struct Proxy { - Options* const o; - /*implicit*/ Proxy(Options* opts): o(opts) {} - }; - operator Proxy() { return Proxy(this); } - - Options(Options& o): m_encoding(o.m_encoding) { m_features.swap(o.m_features); } - Options& operator = (Options& o) { m_encoding = o.m_encoding; m_features = std::move(o.m_features); o.Clear(); return *this; } - - Options(Proxy p): m_encoding(p.o->m_encoding) { m_features.swap(p.o->m_features); } - Options& operator = (Proxy p) { m_encoding = p.o->m_encoding; m_features = std::move(p.o->m_features); p.o->Clear(); return *this; } - - void Apply(Lexer& lexer) - { - lexer.SetEncoding(*m_encoding); - for (auto&& i : m_features) { - lexer.AddFeature(i); - i = 0; - } - m_features.clear(); - } - - template<class ArgT> - /*implicit*/ Options(const Option<ArgT>& option); - - const Pire::Encoding& Encoding() const { return *m_encoding; } + Options(): m_encoding(&Pire::Encodings::Latin1()) {} + ~Options() { Clear(); } + + void Add(const Pire::Encoding& encoding) { m_encoding = &encoding; } + void Add(Feature::Ptr&& feature) { m_features.push_back(std::move(feature)); } + + struct Proxy { + Options* const o; + /*implicit*/ Proxy(Options* opts): o(opts) {} + }; + operator Proxy() { return Proxy(this); } + + Options(Options& o): m_encoding(o.m_encoding) { m_features.swap(o.m_features); } + Options& operator = (Options& o) { m_encoding = o.m_encoding; m_features = std::move(o.m_features); o.Clear(); return *this; } + + Options(Proxy p): m_encoding(p.o->m_encoding) { m_features.swap(p.o->m_features); } + Options& operator = (Proxy p) { m_encoding = p.o->m_encoding; m_features = std::move(p.o->m_features); p.o->Clear(); return *this; } + + void Apply(Lexer& lexer) + { + lexer.SetEncoding(*m_encoding); + for (auto&& i : m_features) { + lexer.AddFeature(i); + i = 0; + } + m_features.clear(); + } + + template<class ArgT> + /*implicit*/ Options(const Option<ArgT>& option); + + const Pire::Encoding& Encoding() const { return *m_encoding; } private: - const Pire::Encoding* m_encoding; - TVector<Feature::Ptr> m_features; - - void Clear() - { - m_features.clear(); - } + const Pire::Encoding* m_encoding; + TVector<Feature::Ptr> m_features; + + void Clear() + { + m_features.clear(); + } }; template<class Arg> class Option { public: - typedef Arg (*Ctor)(); - - Option(Ctor ctor): m_ctor(ctor) {} - - friend Options operator | (Options::Proxy options, const Option<Arg>& self) - { - Options ret(options); - ret.Add((*self.m_ctor)()); - return ret; - } - - template<class Arg2> - friend Options operator | (const Option<Arg2>& a, const Option<Arg>& b) - { - return Options() | a | b; - } + typedef Arg (*Ctor)(); + + Option(Ctor ctor): m_ctor(ctor) {} + + friend Options operator | (Options::Proxy options, const Option<Arg>& self) + { + Options ret(options); + ret.Add((*self.m_ctor)()); + return ret; + } + + template<class Arg2> + friend Options operator | (const Option<Arg2>& a, const Option<Arg>& b) + { + return Options() | a | b; + } private: - Ctor m_ctor; + Ctor m_ctor; }; @@ -139,109 +139,109 @@ extern const Option<Feature::Ptr> ANDNOT; class Regexp { public: - template<class Pattern> - explicit Regexp(Pattern pattern, Options options = Options()) - { - Init(PatternBounds(pattern), options); - } - - template<class Pattern, class Arg> - Regexp(Pattern pattern, Option<Arg> option) - { - Init(PatternBounds(pattern), Options() | option); - } - - explicit Regexp(Scanner sc): m_scanner(sc) {} - explicit Regexp(SlowScanner ssc): m_slow(ssc) {} - - bool Matches(TStringBuf buf) const - { - if (!m_scanner.Empty()) - return Runner(m_scanner).Begin().Run(buf).End(); - else - return Runner(m_slow).Begin().Run(buf).End(); - } - - bool Matches(const char* begin, const char* end) const - { - return Matches(TStringBuf(begin, end)); - } - - /// A helper class allowing '==~' operator for regexps - class MatchProxy { - public: - MatchProxy(const Regexp& re): m_re(&re) {} - friend bool operator == (const char* str, const MatchProxy& re) { return re.m_re->Matches(str); } - friend bool operator == (const ystring& str, const MatchProxy& re) { return re.m_re->Matches(str); } - - private: - const Regexp* m_re; - }; - MatchProxy operator ~() const { return MatchProxy(*this); } - + template<class Pattern> + explicit Regexp(Pattern pattern, Options options = Options()) + { + Init(PatternBounds(pattern), options); + } + + template<class Pattern, class Arg> + Regexp(Pattern pattern, Option<Arg> option) + { + Init(PatternBounds(pattern), Options() | option); + } + + explicit Regexp(Scanner sc): m_scanner(sc) {} + explicit Regexp(SlowScanner ssc): m_slow(ssc) {} + + bool Matches(TStringBuf buf) const + { + if (!m_scanner.Empty()) + return Runner(m_scanner).Begin().Run(buf).End(); + else + return Runner(m_slow).Begin().Run(buf).End(); + } + + bool Matches(const char* begin, const char* end) const + { + return Matches(TStringBuf(begin, end)); + } + + /// A helper class allowing '==~' operator for regexps + class MatchProxy { + public: + MatchProxy(const Regexp& re): m_re(&re) {} + friend bool operator == (const char* str, const MatchProxy& re) { return re.m_re->Matches(str); } + friend bool operator == (const ystring& str, const MatchProxy& re) { return re.m_re->Matches(str); } + + private: + const Regexp* m_re; + }; + MatchProxy operator ~() const { return MatchProxy(*this); } + private: - Scanner m_scanner; - SlowScanner m_slow; - - ypair<const char*, const char*> PatternBounds(const ystring& pattern) - { - static const char c = 0; - return pattern.empty() ? ymake_pair(&c, &c) : ymake_pair(pattern.c_str(), pattern.c_str() + pattern.size()); - } - - ypair<const char*, const char*> PatternBounds(const char* pattern) - { - return ymake_pair(pattern, pattern + strlen(pattern)); - } - - void Init(ypair<const char*, const char*> rawPattern, Options options) - { - TVector<wchar32> pattern; - options.Encoding().FromLocal(rawPattern.first, rawPattern.second, std::back_inserter(pattern)); - - Lexer lexer(pattern); - options.Apply(lexer); - Fsm fsm = lexer.Parse(); - - if (!BeginsWithCircumflex(fsm)) - fsm.PrependAnything(); - fsm.AppendAnything(); - - if (fsm.Determine()) - m_scanner = fsm.Compile<Scanner>(); - else - m_slow = fsm.Compile<SlowScanner>(); - } - - static bool BeginsWithCircumflex(const Fsm& fsm) - { - typedef Fsm::StatesSet Set; - TDeque<size_t> queue; - BitSet handled(fsm.Size()); - - queue.push_back(fsm.Initial()); - handled.Set(fsm.Initial()); - - while (!queue.empty()) { - Set s = fsm.Destinations(queue.front(), SpecialChar::Epsilon); - for (auto&& i : s) { - if (!handled.Test(i)) { - handled.Set(i); - queue.push_back(i); - } - } - - TSet<Char> lets = fsm.OutgoingLetters(queue.front()); - lets.erase(SpecialChar::Epsilon); - lets.erase(SpecialChar::BeginMark); - if (!lets.empty()) - return false; - - queue.pop_front(); - } - - return true; - } + Scanner m_scanner; + SlowScanner m_slow; + + ypair<const char*, const char*> PatternBounds(const ystring& pattern) + { + static const char c = 0; + return pattern.empty() ? ymake_pair(&c, &c) : ymake_pair(pattern.c_str(), pattern.c_str() + pattern.size()); + } + + ypair<const char*, const char*> PatternBounds(const char* pattern) + { + return ymake_pair(pattern, pattern + strlen(pattern)); + } + + void Init(ypair<const char*, const char*> rawPattern, Options options) + { + TVector<wchar32> pattern; + options.Encoding().FromLocal(rawPattern.first, rawPattern.second, std::back_inserter(pattern)); + + Lexer lexer(pattern); + options.Apply(lexer); + Fsm fsm = lexer.Parse(); + + if (!BeginsWithCircumflex(fsm)) + fsm.PrependAnything(); + fsm.AppendAnything(); + + if (fsm.Determine()) + m_scanner = fsm.Compile<Scanner>(); + else + m_slow = fsm.Compile<SlowScanner>(); + } + + static bool BeginsWithCircumflex(const Fsm& fsm) + { + typedef Fsm::StatesSet Set; + TDeque<size_t> queue; + BitSet handled(fsm.Size()); + + queue.push_back(fsm.Initial()); + handled.Set(fsm.Initial()); + + while (!queue.empty()) { + Set s = fsm.Destinations(queue.front(), SpecialChar::Epsilon); + for (auto&& i : s) { + if (!handled.Test(i)) { + handled.Set(i); + queue.push_back(i); + } + } + + TSet<Char> lets = fsm.OutgoingLetters(queue.front()); + lets.erase(SpecialChar::Epsilon); + lets.erase(SpecialChar::BeginMark); + if (!lets.empty()) + return false; + + queue.pop_front(); + } + + return true; + } }; }; diff --git a/library/cpp/regex/pire/pire/encoding.cpp b/library/cpp/regex/pire/pire/encoding.cpp index d5000c31464..1a2ac6872f3 100644 --- a/library/cpp/regex/pire/pire/encoding.cpp +++ b/library/cpp/regex/pire/pire/encoding.cpp @@ -37,97 +37,97 @@ namespace Pire { namespace { - class Latin1: public Encoding { - public: - Latin1() : Encoding() {} - - wchar32 FromLocal(const char*& begin, const char* end) const - { - if (begin == end) - throw Error("EOF reached in Pire::Latin1::fromLocal()"); - else if (static_cast<unsigned char>(*begin) >= 0x80) - throw Error("Pire::Latin1::fromLocal(): wrong character encountered (>=0x80)"); - else - return (wchar32) *begin++; - } - - ystring ToLocal(wchar32 ch) const - { - if (ch < 0x80) - return ystring(1, (char) ch); - else - return ystring(); - } - - void AppendDot(Fsm& fsm) const { fsm.AppendDot(); } - }; - - namespace UtfRanges { - - static const size_t MaxLen = 4; - static const size_t First[MaxLen][2] = { - {0x00, 0x80}, - {0xC0, 0xE0}, - {0xE0, 0xF0}, - {0xF0, 0xF8} - }; - static const size_t Next[2] = {0x80, 0xC0}; - } - - - class Utf8: public Encoding { - public: - Utf8() : Encoding() {} - - wchar32 FromLocal(const char*& begin, const char* end) const - { - wchar32 rune; - size_t len; - if (SafeReadUTF8Char(rune, len, reinterpret_cast<const unsigned char*>(begin), reinterpret_cast<const unsigned char*>(end)) != RECODE_OK) - throw Error("Error reading UTF8 sequence"); - begin += len; - return rune; - } - - ystring ToLocal(wchar32 c) const - { - ystring ret(UTF8RuneLenByUCS(c), ' '); - size_t len; - unsigned char* p = (unsigned char*) &*ret.begin(); - if (SafeWriteUTF8Char(c, len, p, p + ret.size()) != RECODE_OK) - Y_ASSERT(!"Pire::UTF8::toLocal(): Internal error"); - return ret; - } - - void AppendDot(Fsm& fsm) const - { - size_t last = fsm.Resize(fsm.Size() + UtfRanges::MaxLen); - for (size_t i = 0; i < UtfRanges::MaxLen; ++i) - for (size_t letter = UtfRanges::First[i][0]; letter < UtfRanges::First[i][1]; ++letter) - fsm.ConnectFinal(fsm.Size() - i - 1, letter); - for (size_t i = 0; i < UtfRanges::MaxLen - 1; ++i) - for (size_t letter = UtfRanges::Next[0]; letter < UtfRanges::Next[1]; ++letter) - fsm.Connect(last + i, last + i + 1, letter); - fsm.ClearFinal(); - fsm.SetFinal(fsm.Size() - 1, true); - fsm.SetIsDetermined(false); - } - }; + class Latin1: public Encoding { + public: + Latin1() : Encoding() {} + + wchar32 FromLocal(const char*& begin, const char* end) const + { + if (begin == end) + throw Error("EOF reached in Pire::Latin1::fromLocal()"); + else if (static_cast<unsigned char>(*begin) >= 0x80) + throw Error("Pire::Latin1::fromLocal(): wrong character encountered (>=0x80)"); + else + return (wchar32) *begin++; + } + + ystring ToLocal(wchar32 ch) const + { + if (ch < 0x80) + return ystring(1, (char) ch); + else + return ystring(); + } + + void AppendDot(Fsm& fsm) const { fsm.AppendDot(); } + }; + + namespace UtfRanges { + + static const size_t MaxLen = 4; + static const size_t First[MaxLen][2] = { + {0x00, 0x80}, + {0xC0, 0xE0}, + {0xE0, 0xF0}, + {0xF0, 0xF8} + }; + static const size_t Next[2] = {0x80, 0xC0}; + } + + + class Utf8: public Encoding { + public: + Utf8() : Encoding() {} + + wchar32 FromLocal(const char*& begin, const char* end) const + { + wchar32 rune; + size_t len; + if (SafeReadUTF8Char(rune, len, reinterpret_cast<const unsigned char*>(begin), reinterpret_cast<const unsigned char*>(end)) != RECODE_OK) + throw Error("Error reading UTF8 sequence"); + begin += len; + return rune; + } + + ystring ToLocal(wchar32 c) const + { + ystring ret(UTF8RuneLenByUCS(c), ' '); + size_t len; + unsigned char* p = (unsigned char*) &*ret.begin(); + if (SafeWriteUTF8Char(c, len, p, p + ret.size()) != RECODE_OK) + Y_ASSERT(!"Pire::UTF8::toLocal(): Internal error"); + return ret; + } + + void AppendDot(Fsm& fsm) const + { + size_t last = fsm.Resize(fsm.Size() + UtfRanges::MaxLen); + for (size_t i = 0; i < UtfRanges::MaxLen; ++i) + for (size_t letter = UtfRanges::First[i][0]; letter < UtfRanges::First[i][1]; ++letter) + fsm.ConnectFinal(fsm.Size() - i - 1, letter); + for (size_t i = 0; i < UtfRanges::MaxLen - 1; ++i) + for (size_t letter = UtfRanges::Next[0]; letter < UtfRanges::Next[1]; ++letter) + fsm.Connect(last + i, last + i + 1, letter); + fsm.ClearFinal(); + fsm.SetFinal(fsm.Size() - 1, true); + fsm.SetIsDetermined(false); + } + }; } namespace Encodings { - const Encoding& Utf8() - { - static const Pire::Utf8 utf8; - return utf8; - } - - const Encoding& Latin1() - { - static const Pire::Latin1 latin1; - return latin1; - } + const Encoding& Utf8() + { + static const Pire::Utf8 utf8; + return utf8; + } + + const Encoding& Latin1() + { + static const Pire::Latin1 latin1; + return latin1; + } } diff --git a/library/cpp/regex/pire/pire/encoding.h b/library/cpp/regex/pire/pire/encoding.h index 5009d18cdac..fceab3b9751 100644 --- a/library/cpp/regex/pire/pire/encoding.h +++ b/library/cpp/regex/pire/pire/encoding.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -34,34 +34,34 @@ class Fsm; class Encoding { public: - virtual ~Encoding() {} - - /// Should read bytes from @p begin and return the corresponding Unicode - /// character, advancing @p begin. - virtual wchar32 FromLocal(const char*& begin, const char* end) const = 0; - - /// Opposite to FromLocal(), transforms given Unicode character into - /// the string in the encoding. - virtual ystring ToLocal(wchar32 c) const = 0; - - /// Given the FSM, should append the representation of a dot in the ecoding - /// to that FSM. - virtual void AppendDot(Fsm&) const = 0; - - template<class OutputIter> - OutputIter FromLocal(const char* begin, const char* end, OutputIter iter) const - { - while (begin != end) { - *iter = FromLocal(begin, end); - ++iter; - } - return iter; - } + virtual ~Encoding() {} + + /// Should read bytes from @p begin and return the corresponding Unicode + /// character, advancing @p begin. + virtual wchar32 FromLocal(const char*& begin, const char* end) const = 0; + + /// Opposite to FromLocal(), transforms given Unicode character into + /// the string in the encoding. + virtual ystring ToLocal(wchar32 c) const = 0; + + /// Given the FSM, should append the representation of a dot in the ecoding + /// to that FSM. + virtual void AppendDot(Fsm&) const = 0; + + template<class OutputIter> + OutputIter FromLocal(const char* begin, const char* end, OutputIter iter) const + { + while (begin != end) { + *iter = FromLocal(begin, end); + ++iter; + } + return iter; + } }; namespace Encodings { - const Encoding& Latin1(); - const Encoding& Utf8(); + const Encoding& Latin1(); + const Encoding& Utf8(); }; diff --git a/library/cpp/regex/pire/pire/extra.h b/library/cpp/regex/pire/pire/extra.h index 1ee9eee9eea..b7bcf9e4544 100644 --- a/library/cpp/regex/pire/pire/extra.h +++ b/library/cpp/regex/pire/pire/extra.h @@ -12,7 +12,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the diff --git a/library/cpp/regex/pire/pire/extra/capture.cpp b/library/cpp/regex/pire/pire/extra/capture.cpp index fb4cdf6d815..47ec60d7f30 100644 --- a/library/cpp/regex/pire/pire/extra/capture.cpp +++ b/library/cpp/regex/pire/pire/extra/capture.cpp @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -26,110 +26,110 @@ #include "capture.h" namespace Pire { - + namespace { - class CaptureImpl: public Feature { - public: - CaptureImpl(size_t pos) - : State(0) - , Pos(pos) - , Level(0) - , StateRepetition(NoRepetition) - {} - - bool Accepts(wchar32 c) const { return c == '(' || c == '+' || c == '*' || c == '?' || c == '{'; } - Term Lex() - { - wchar32 c = GetChar(); - if (!Accepts(c)) - Error("How did we get here?!.."); - if (c != '(') { - wchar32 next = PeekChar(); - if (next == '?') { - StateRepetition = NonGreedyRepetition; - GetChar(); - } - else - StateRepetition = GreedyRepetition; - } - else if (State == 0 && Pos > 1) - --Pos; - else if (State == 0 && Pos == 1) { - State = 1; - Level = 0; - } else if (State == 1) { - ++Level; - } - if (c == '(') - return Term(TokenTypes::Open); - else if (c == '+') - return Term::Repetition(1, Inf); - else if (c == '*') - return Term::Repetition(0, Inf); - else if (c == '?') - return Term::Repetition(0, 1); - else { - UngetChar(c); - return Term(0); - } - } - - void Parenthesized(Fsm& fsm) - { - if (StateRepetition != NoRepetition) { - bool greedy = (StateRepetition == GreedyRepetition); - SetRepetitionMark(fsm, greedy); - StateRepetition = NoRepetition; - } else if (State == 1 && Level == 0) { - SetCaptureMark(fsm); - State = 2; - } else if (State == 1 && Level > 0) - --Level; - } - private: - unsigned State; - size_t Pos; - size_t Level; - RepetitionTypes StateRepetition; + class CaptureImpl: public Feature { + public: + CaptureImpl(size_t pos) + : State(0) + , Pos(pos) + , Level(0) + , StateRepetition(NoRepetition) + {} + + bool Accepts(wchar32 c) const { return c == '(' || c == '+' || c == '*' || c == '?' || c == '{'; } + Term Lex() + { + wchar32 c = GetChar(); + if (!Accepts(c)) + Error("How did we get here?!.."); + if (c != '(') { + wchar32 next = PeekChar(); + if (next == '?') { + StateRepetition = NonGreedyRepetition; + GetChar(); + } + else + StateRepetition = GreedyRepetition; + } + else if (State == 0 && Pos > 1) + --Pos; + else if (State == 0 && Pos == 1) { + State = 1; + Level = 0; + } else if (State == 1) { + ++Level; + } + if (c == '(') + return Term(TokenTypes::Open); + else if (c == '+') + return Term::Repetition(1, Inf); + else if (c == '*') + return Term::Repetition(0, Inf); + else if (c == '?') + return Term::Repetition(0, 1); + else { + UngetChar(c); + return Term(0); + } + } - void SetRepetitionMark(Fsm& fsm, bool greedy) - { - fsm.Resize(fsm.Size() + 1); - fsm.ConnectFinal(fsm.Size() - 1); + void Parenthesized(Fsm& fsm) + { + if (StateRepetition != NoRepetition) { + bool greedy = (StateRepetition == GreedyRepetition); + SetRepetitionMark(fsm, greedy); + StateRepetition = NoRepetition; + } else if (State == 1 && Level == 0) { + SetCaptureMark(fsm); + State = 2; + } else if (State == 1 && Level > 0) + --Level; + } + private: + unsigned State; + size_t Pos; + size_t Level; + RepetitionTypes StateRepetition; - for (size_t state = 0; state < fsm.Size() - 1; ++state) - if (fsm.IsFinal(state)) - if (greedy) - fsm.SetOutput(state, fsm.Size() - 1, SlowCapturingScanner::EndRepetition); - else - fsm.SetOutput(state, fsm.Size() - 1, SlowCapturingScanner::EndNonGreedyRepetition); - fsm.ClearFinal(); - fsm.SetFinal(fsm.Size() - 1, true); - fsm.SetIsDetermined(false); - } + void SetRepetitionMark(Fsm& fsm, bool greedy) + { + fsm.Resize(fsm.Size() + 1); + fsm.ConnectFinal(fsm.Size() - 1); - void SetCaptureMark(Fsm& fsm) - { - fsm.Resize(fsm.Size() + 2); - fsm.Connect(fsm.Size() - 2, fsm.Initial()); - fsm.ConnectFinal(fsm.Size() - 1); + for (size_t state = 0; state < fsm.Size() - 1; ++state) + if (fsm.IsFinal(state)) + if (greedy) + fsm.SetOutput(state, fsm.Size() - 1, SlowCapturingScanner::EndRepetition); + else + fsm.SetOutput(state, fsm.Size() - 1, SlowCapturingScanner::EndNonGreedyRepetition); + fsm.ClearFinal(); + fsm.SetFinal(fsm.Size() - 1, true); + fsm.SetIsDetermined(false); + } - fsm.SetOutput(fsm.Size() - 2, fsm.Initial(), CapturingScanner::BeginCapture); - for (size_t state = 0; state < fsm.Size() - 2; ++state) - if (fsm.IsFinal(state)) - fsm.SetOutput(state, fsm.Size() - 1, CapturingScanner::EndCapture); + void SetCaptureMark(Fsm& fsm) + { + fsm.Resize(fsm.Size() + 2); + fsm.Connect(fsm.Size() - 2, fsm.Initial()); + fsm.ConnectFinal(fsm.Size() - 1); - fsm.SetInitial(fsm.Size() - 2); - fsm.ClearFinal(); - fsm.SetFinal(fsm.Size() - 1, true); - fsm.SetIsDetermined(false); - } - - void FinishBuild() {} - }; + fsm.SetOutput(fsm.Size() - 2, fsm.Initial(), CapturingScanner::BeginCapture); + for (size_t state = 0; state < fsm.Size() - 2; ++state) + if (fsm.IsFinal(state)) + fsm.SetOutput(state, fsm.Size() - 1, CapturingScanner::EndCapture); + + fsm.SetInitial(fsm.Size() - 2); + fsm.ClearFinal(); + fsm.SetFinal(fsm.Size() - 1, true); + fsm.SetIsDetermined(false); + } + + void FinishBuild() {} + }; } - + namespace Features { - Feature::Ptr Capture(size_t pos) { return Feature::Ptr(new CaptureImpl(pos)); } + Feature::Ptr Capture(size_t pos) { return Feature::Ptr(new CaptureImpl(pos)); } }; } diff --git a/library/cpp/regex/pire/pire/extra/capture.h b/library/cpp/regex/pire/pire/extra/capture.h index 8ed6bc7f801..b4dab8fd235 100644 --- a/library/cpp/regex/pire/pire/extra/capture.h +++ b/library/cpp/regex/pire/pire/extra/capture.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -48,121 +48,121 @@ namespace Pire { */ class CapturingScanner: public LoadedScanner { public: - enum { - NoAction = 0, - BeginCapture = 1, - EndCapture = 2, - - FinalFlag = 1 - }; - - class State { - public: - bool Captured() const { return (m_begin != npos) && (m_end != npos); } - size_t Begin() const { return m_begin; } - size_t End() const { return m_end; } - private: - static const size_t npos = static_cast<size_t>(-1); - size_t m_state; - size_t m_begin; - size_t m_end; - size_t m_counter; - friend class CapturingScanner; + enum { + NoAction = 0, + BeginCapture = 1, + EndCapture = 2, + + FinalFlag = 1 + }; + + class State { + public: + bool Captured() const { return (m_begin != npos) && (m_end != npos); } + size_t Begin() const { return m_begin; } + size_t End() const { return m_end; } + private: + static const size_t npos = static_cast<size_t>(-1); + size_t m_state; + size_t m_begin; + size_t m_end; + size_t m_counter; + friend class CapturingScanner; #ifdef PIRE_DEBUG - friend yostream& operator << (yostream& s, const State& state) - { - s << state.m_state; - if (state.m_begin != State::npos || state.m_end != npos) { - s << " ["; - if (state.m_begin != State::npos) - s << 'b'; - if (state.m_end != State::npos) - s << 'e'; - s << "]"; - } - return s; - } + friend yostream& operator << (yostream& s, const State& state) + { + s << state.m_state; + if (state.m_begin != State::npos || state.m_end != npos) { + s << " ["; + if (state.m_begin != State::npos) + s << 'b'; + if (state.m_end != State::npos) + s << 'e'; + s << "]"; + } + return s; + } #endif - }; - - void Initialize(State& state) const - { - state.m_state = m.initial; - state.m_begin = state.m_end = State::npos; - state.m_counter = 0; - } - - void TakeAction(State& s, Action a) const - { - if ((a & BeginCapture) && !s.Captured()) - s.m_begin = s.m_counter - 1; - else if (a & EndCapture) { - if (s.m_end == State::npos) - s.m_end = s.m_counter - 1; - } - } - - Char Translate(Char ch) const - { - return m_letters[static_cast<size_t>(ch)]; - } - - Action NextTranslated(State& s, unsigned char c) const - { - Transition x = reinterpret_cast<const Transition*>(s.m_state)[c]; - s.m_state += SignExtend(x.shift); - ++s.m_counter; - - return x.action; - } - - Action Next(State& s, Char c) const - { - return NextTranslated(s, Translate(c)); - } - - Action Next(const State& current, State& n, Char c) const - { - n = current; - return Next(n, c); - } - - bool CanStop(const State& s) const - { - return Final(s); - } - - bool Final(const State& s) const { return m_tags[(reinterpret_cast<Transition*>(s.m_state) - m_jumps) / m.lettersCount] & FinalFlag; } - - bool Dead(const State&) const { return false; } - - CapturingScanner() {} - CapturingScanner(const CapturingScanner& s): LoadedScanner(s) {} - explicit CapturingScanner(Fsm& fsm, size_t distance = 0) - { - if (distance) { - fsm = CreateApproxFsm(fsm, distance); - } - fsm.Canonize(); - Init(fsm.Size(), fsm.Letters(), fsm.Initial()); - BuildScanner(fsm, *this); - } - - void Swap(CapturingScanner& s) { LoadedScanner::Swap(s); } - CapturingScanner& operator = (const CapturingScanner& s) { CapturingScanner(s).Swap(*this); return *this; } - - size_t StateIndex(const State& s) const { return StateIdx(s.m_state); } + }; + + void Initialize(State& state) const + { + state.m_state = m.initial; + state.m_begin = state.m_end = State::npos; + state.m_counter = 0; + } + + void TakeAction(State& s, Action a) const + { + if ((a & BeginCapture) && !s.Captured()) + s.m_begin = s.m_counter - 1; + else if (a & EndCapture) { + if (s.m_end == State::npos) + s.m_end = s.m_counter - 1; + } + } + + Char Translate(Char ch) const + { + return m_letters[static_cast<size_t>(ch)]; + } + + Action NextTranslated(State& s, unsigned char c) const + { + Transition x = reinterpret_cast<const Transition*>(s.m_state)[c]; + s.m_state += SignExtend(x.shift); + ++s.m_counter; + + return x.action; + } + + Action Next(State& s, Char c) const + { + return NextTranslated(s, Translate(c)); + } + + Action Next(const State& current, State& n, Char c) const + { + n = current; + return Next(n, c); + } + + bool CanStop(const State& s) const + { + return Final(s); + } + + bool Final(const State& s) const { return m_tags[(reinterpret_cast<Transition*>(s.m_state) - m_jumps) / m.lettersCount] & FinalFlag; } + + bool Dead(const State&) const { return false; } + + CapturingScanner() {} + CapturingScanner(const CapturingScanner& s): LoadedScanner(s) {} + explicit CapturingScanner(Fsm& fsm, size_t distance = 0) + { + if (distance) { + fsm = CreateApproxFsm(fsm, distance); + } + fsm.Canonize(); + Init(fsm.Size(), fsm.Letters(), fsm.Initial()); + BuildScanner(fsm, *this); + } + + void Swap(CapturingScanner& s) { LoadedScanner::Swap(s); } + CapturingScanner& operator = (const CapturingScanner& s) { CapturingScanner(s).Swap(*this); return *this; } + + size_t StateIndex(const State& s) const { return StateIdx(s.m_state); } private: - friend void BuildScanner<CapturingScanner>(const Fsm&, CapturingScanner&); + friend void BuildScanner<CapturingScanner>(const Fsm&, CapturingScanner&); }; enum RepetitionTypes { // They are sorted by their priorities - NonGreedyRepetition, - NoRepetition, - GreedyRepetition, + NonGreedyRepetition, + NoRepetition, + GreedyRepetition, }; /** @@ -171,419 +171,419 @@ enum RepetitionTypes { // They are sorted by their priorities */ class SlowCapturingScanner : public SlowScanner { public: - enum { - Nothing = 0, - BeginCapture = 1, - EndCapture = 2, - EndRepetition = 4, - EndNonGreedyRepetition = 8, - - FinalFlag = 1, - }; - - const ui32 ActionsCapture = BeginCapture | EndCapture; - - class SingleState { - public: - bool Captured() const - { - return (m_begin != m_npos && m_end != m_npos); - } - - bool HasBegin() const - { - return (m_begin != m_npos); - } - - bool HasEnd() const - { - return (m_end != m_npos); - } - - SingleState(size_t num = 0) - { - m_state = num; - m_begin = m_npos; - m_end = m_npos; - } - - void SetBegin(size_t pos) - { - if (m_begin == m_npos) - m_begin = pos; - } - - void SetEnd(size_t pos) - { - if (m_end == m_npos) - m_end = pos; - } - - size_t Begin() const - { - return GetBegin(); - } - - size_t End() const - { - return GetEnd(); - } - - size_t GetBegin() const - { - return m_begin; - } - - size_t GetEnd() const - { - return m_end; - } - - size_t GetNum() const - { - return m_state; - } - - private: - size_t m_state; - size_t m_begin; - size_t m_end; - static const size_t m_npos = static_cast<size_t>(-1); - }; - - class State { - public: - State() - : m_strpos(0) - , m_matched(false) {} - - size_t GetPos() const - { - return m_strpos; - } - - const SingleState& GetState(size_t pos) const - { - return m_states[pos]; - } - - void SetPos(size_t newPos) - { - m_strpos = newPos; - } - - void PushState(SingleState& st) - { - m_states.push_back(st); - } - - void PushState(const SingleState& st) - { - m_states.push_back(st); - } - - size_t GetSize() const - { - return m_states.size(); - } - - const TVector<SingleState>& GetStates() const - { - return m_states; - } - - bool IsMatched() const - { - return m_matched; - } - - const SingleState& GetMatched() const - { - return m_match; - } - - void AddMatch(const SingleState& Matched) - { - m_matched = true; - m_match = Matched; - } - - private: - TVector<SingleState> m_states; - size_t m_strpos; - bool m_matched; - SingleState m_match; - }; - - class Transition { - private: - unsigned long m_stateto; - Action m_action; - - public: - unsigned long GetState() const - { - return m_stateto; - } - - Action GetAction() const - { - return m_action; - } - - Transition(unsigned long state, Action act = 0) - : m_stateto(state) - , m_action(act) - { - } - }; - - class PriorityStates { - private: - TVector<SingleState> m_nonGreedy; - TVector<SingleState> m_nothing; - TVector<SingleState> m_greedy; - - public: - void Push(const SingleState& st, RepetitionTypes repetition) - { - Get(repetition).push_back(st); - } - - TVector<SingleState>& Get(RepetitionTypes repetition) - { - switch (repetition) { - case NonGreedyRepetition: - return m_nonGreedy; - case NoRepetition: - return m_nothing; - case GreedyRepetition: - return m_greedy; - } - } - - const TVector<SingleState>& Get(RepetitionTypes repetition) const - { - switch (repetition) { - case NonGreedyRepetition: - return m_nonGreedy; - case NoRepetition: - return m_nothing; - case GreedyRepetition: - return m_greedy; - } - } - }; - - SlowScanner::State GetNextStates(const SingleState& cur, Char letter) const - { - SlowScanner::State st(GetSize()); - st.states.push_back(cur.GetNum()); - SlowScanner::State nextState(GetSize()); - SlowScanner::NextTranslated(st, nextState, letter); - return nextState; - } - - size_t GetPosition(const SingleState& state, Char letter) const - { - return state.GetNum() * GetLettersCount() + letter; - } - - void NextStates(const SingleState& state, Char letter, TVector<Transition>& nextStates) const - { - if (IsMmaped()) { - const size_t* pos = GetJumpPos() + GetPosition(state, letter); - size_t posBegin = pos[0]; - size_t posEnd = pos[1]; - for (size_t i = posBegin; i < posEnd; ++i) - nextStates.emplace_back(GetJump(i), GetAction(i)); - } else { - size_t num = GetPosition(state, letter); - const auto& jumpVec = GetJumpsVec(num); - const auto& actionVec = GetActionsVec(num); - for (size_t i = 0; i < jumpVec.size(); ++i) - nextStates.emplace_back(jumpVec[i], actionVec[i]); - } - } - - void InsertStates(const PriorityStates& states, TVector<SingleState>& nonGreedy, TVector<SingleState>& nothing, TVector<SingleState>& greedy) const - { - for (auto& greed : {ymake_pair(&nonGreedy, NonGreedyRepetition), ymake_pair(¬hing, NoRepetition), ymake_pair(&greedy, GreedyRepetition)}) { - auto& vec = greed.first; - auto& tag = greed.second; - vec->insert(vec->end(), states.Get(tag).begin(), states.Get(tag).end()); - } - } - - void NextAndGetToGroups(PriorityStates& states, const SingleState& cur, - Char letter, size_t pos, TVector<bool>& used) const - { - TVector<Transition> nextStates; - NextStates(cur, letter, nextStates); - for (const auto& trans : nextStates) { - size_t st = trans.GetState(); - if (used[st]) - continue; - used[st] = true; - SingleState state(st); - const auto& action = trans.GetAction(); - state.SetBegin(cur.GetBegin()); - state.SetEnd(cur.GetEnd()); - if (action & BeginCapture && !cur.HasBegin()) { - state.SetBegin(pos); - } - if (action & EndCapture && !cur.HasEnd()) { - state.SetEnd(pos); - } - PriorityStates statesInside; - NextAndGetToGroups(statesInside, state, Translate(Epsilon), pos, used); - statesInside.Push(state, NoRepetition); - if (action & EndNonGreedyRepetition) { - auto& nongreedy = states.Get(NonGreedyRepetition); - InsertStates(statesInside, nongreedy, nongreedy, nongreedy); - } - else if (!(action & EndRepetition)) - InsertStates(statesInside, states.Get(NonGreedyRepetition), states.Get(NoRepetition), states.Get(GreedyRepetition)); - else { - auto& greedy = states.Get(GreedyRepetition); - InsertStates(statesInside, greedy, greedy, greedy); - } - } - } - - bool Captured(const SingleState& st, bool& matched) const - { - matched = false; - if (IsFinal(st.GetNum())) { - matched = true; - if (st.HasBegin()) - return true; - } - TVector<Transition> nextStates; - NextStates(st, Translate(EndMark), nextStates); - for (const auto& trans : nextStates) - { - size_t state = trans.GetState(); - if (IsFinal(state)) { - matched = true; - if (st.HasBegin() || (trans.GetAction() & ActionsCapture)) - return true; - } else { // After EndMark there can be Epsilon-transitions to the Final State - TVector<Transition> epsilonTrans; - SingleState newSt(state); - NextStates(newSt, Translate(Epsilon), epsilonTrans); - for (auto new_trans : epsilonTrans) { - size_t fin = new_trans.GetState(); - if (IsFinal(fin)) { - matched = true; - if (st.HasBegin() || (trans.GetAction() & ActionsCapture)) - return true; - } - } - } - } - return false; - } - - bool Matched(const SingleState& st) const - { - bool matched; - Captured(st, matched); - return matched; - } - - bool GetCapture(const State& st, SingleState& final) const - { - size_t pos = 0; - bool matched = false; - bool ans = false; - while (pos < st.GetSize() && !matched) { - ans = Captured(st.GetState(pos), matched); - ++pos; - } - if (matched) { - final = st.GetState(pos - 1); - return ans; - } else { - if (st.IsMatched()) { - final = st.GetMatched(); - return true; - } - return false; - } - } - - bool PushState(State& nlist, const SingleState& state) const - { - nlist.PushState(state); - if (Matched(state)) { - nlist.AddMatch(state); - return true; - } - return false; - } - - void UpdateNList(State& nlist, const PriorityStates& states) const - { - static constexpr std::array<RepetitionTypes, 3> m_type_by_priority{{NonGreedyRepetition, NoRepetition, GreedyRepetition}}; - for (const auto type : m_type_by_priority) { - for (const auto& state : states.Get(type)) { - if (PushState(nlist, state)) // Because we have strict priorities, after matching some state, we can be sure, that not states after will be better - return; - } - } - } - - void Initialize(State& nlist) const - { - PriorityStates states; - SingleState init(GetStart()); - TVector<bool> used(GetSize()); - NextAndGetToGroups(states, init, Translate(BeginMark), 0, used); - NextAndGetToGroups(states, 0, Translate(BeginMark), 0, used); - UpdateNList(nlist, states); - } - - Action NextTranslated(State& clist, Char letter) const - { - State nlist; - if (clist.IsMatched()) - nlist.AddMatch(clist.GetMatched()); - nlist.SetPos(clist.GetPos() + 1); - size_t strpos = nlist.GetPos(); - TVector<bool> used(GetSize()); - size_t pos = 0; - while (pos < clist.GetSize()) { - PriorityStates states; - NextAndGetToGroups(states, clist.GetState(pos), letter, strpos, used); - UpdateNList(nlist, states); - ++pos; - } - DoSwap(clist, nlist); - return 0; - } - - void TakeAction(State&, Action) const {} - - Action Next(State& st, Char letter) const - { - return NextTranslated(st, Translate(letter)); - } + enum { + Nothing = 0, + BeginCapture = 1, + EndCapture = 2, + EndRepetition = 4, + EndNonGreedyRepetition = 8, + + FinalFlag = 1, + }; + + const ui32 ActionsCapture = BeginCapture | EndCapture; + + class SingleState { + public: + bool Captured() const + { + return (m_begin != m_npos && m_end != m_npos); + } + + bool HasBegin() const + { + return (m_begin != m_npos); + } + + bool HasEnd() const + { + return (m_end != m_npos); + } + + SingleState(size_t num = 0) + { + m_state = num; + m_begin = m_npos; + m_end = m_npos; + } + + void SetBegin(size_t pos) + { + if (m_begin == m_npos) + m_begin = pos; + } + + void SetEnd(size_t pos) + { + if (m_end == m_npos) + m_end = pos; + } + + size_t Begin() const + { + return GetBegin(); + } + + size_t End() const + { + return GetEnd(); + } + + size_t GetBegin() const + { + return m_begin; + } + + size_t GetEnd() const + { + return m_end; + } + + size_t GetNum() const + { + return m_state; + } + + private: + size_t m_state; + size_t m_begin; + size_t m_end; + static const size_t m_npos = static_cast<size_t>(-1); + }; + + class State { + public: + State() + : m_strpos(0) + , m_matched(false) {} + + size_t GetPos() const + { + return m_strpos; + } + + const SingleState& GetState(size_t pos) const + { + return m_states[pos]; + } + + void SetPos(size_t newPos) + { + m_strpos = newPos; + } + + void PushState(SingleState& st) + { + m_states.push_back(st); + } + + void PushState(const SingleState& st) + { + m_states.push_back(st); + } + + size_t GetSize() const + { + return m_states.size(); + } + + const TVector<SingleState>& GetStates() const + { + return m_states; + } + + bool IsMatched() const + { + return m_matched; + } + + const SingleState& GetMatched() const + { + return m_match; + } + + void AddMatch(const SingleState& Matched) + { + m_matched = true; + m_match = Matched; + } + + private: + TVector<SingleState> m_states; + size_t m_strpos; + bool m_matched; + SingleState m_match; + }; + + class Transition { + private: + unsigned long m_stateto; + Action m_action; + + public: + unsigned long GetState() const + { + return m_stateto; + } + + Action GetAction() const + { + return m_action; + } + + Transition(unsigned long state, Action act = 0) + : m_stateto(state) + , m_action(act) + { + } + }; + + class PriorityStates { + private: + TVector<SingleState> m_nonGreedy; + TVector<SingleState> m_nothing; + TVector<SingleState> m_greedy; + + public: + void Push(const SingleState& st, RepetitionTypes repetition) + { + Get(repetition).push_back(st); + } + + TVector<SingleState>& Get(RepetitionTypes repetition) + { + switch (repetition) { + case NonGreedyRepetition: + return m_nonGreedy; + case NoRepetition: + return m_nothing; + case GreedyRepetition: + return m_greedy; + } + } + + const TVector<SingleState>& Get(RepetitionTypes repetition) const + { + switch (repetition) { + case NonGreedyRepetition: + return m_nonGreedy; + case NoRepetition: + return m_nothing; + case GreedyRepetition: + return m_greedy; + } + } + }; + + SlowScanner::State GetNextStates(const SingleState& cur, Char letter) const + { + SlowScanner::State st(GetSize()); + st.states.push_back(cur.GetNum()); + SlowScanner::State nextState(GetSize()); + SlowScanner::NextTranslated(st, nextState, letter); + return nextState; + } + + size_t GetPosition(const SingleState& state, Char letter) const + { + return state.GetNum() * GetLettersCount() + letter; + } + + void NextStates(const SingleState& state, Char letter, TVector<Transition>& nextStates) const + { + if (IsMmaped()) { + const size_t* pos = GetJumpPos() + GetPosition(state, letter); + size_t posBegin = pos[0]; + size_t posEnd = pos[1]; + for (size_t i = posBegin; i < posEnd; ++i) + nextStates.emplace_back(GetJump(i), GetAction(i)); + } else { + size_t num = GetPosition(state, letter); + const auto& jumpVec = GetJumpsVec(num); + const auto& actionVec = GetActionsVec(num); + for (size_t i = 0; i < jumpVec.size(); ++i) + nextStates.emplace_back(jumpVec[i], actionVec[i]); + } + } + + void InsertStates(const PriorityStates& states, TVector<SingleState>& nonGreedy, TVector<SingleState>& nothing, TVector<SingleState>& greedy) const + { + for (auto& greed : {ymake_pair(&nonGreedy, NonGreedyRepetition), ymake_pair(¬hing, NoRepetition), ymake_pair(&greedy, GreedyRepetition)}) { + auto& vec = greed.first; + auto& tag = greed.second; + vec->insert(vec->end(), states.Get(tag).begin(), states.Get(tag).end()); + } + } + + void NextAndGetToGroups(PriorityStates& states, const SingleState& cur, + Char letter, size_t pos, TVector<bool>& used) const + { + TVector<Transition> nextStates; + NextStates(cur, letter, nextStates); + for (const auto& trans : nextStates) { + size_t st = trans.GetState(); + if (used[st]) + continue; + used[st] = true; + SingleState state(st); + const auto& action = trans.GetAction(); + state.SetBegin(cur.GetBegin()); + state.SetEnd(cur.GetEnd()); + if (action & BeginCapture && !cur.HasBegin()) { + state.SetBegin(pos); + } + if (action & EndCapture && !cur.HasEnd()) { + state.SetEnd(pos); + } + PriorityStates statesInside; + NextAndGetToGroups(statesInside, state, Translate(Epsilon), pos, used); + statesInside.Push(state, NoRepetition); + if (action & EndNonGreedyRepetition) { + auto& nongreedy = states.Get(NonGreedyRepetition); + InsertStates(statesInside, nongreedy, nongreedy, nongreedy); + } + else if (!(action & EndRepetition)) + InsertStates(statesInside, states.Get(NonGreedyRepetition), states.Get(NoRepetition), states.Get(GreedyRepetition)); + else { + auto& greedy = states.Get(GreedyRepetition); + InsertStates(statesInside, greedy, greedy, greedy); + } + } + } + + bool Captured(const SingleState& st, bool& matched) const + { + matched = false; + if (IsFinal(st.GetNum())) { + matched = true; + if (st.HasBegin()) + return true; + } + TVector<Transition> nextStates; + NextStates(st, Translate(EndMark), nextStates); + for (const auto& trans : nextStates) + { + size_t state = trans.GetState(); + if (IsFinal(state)) { + matched = true; + if (st.HasBegin() || (trans.GetAction() & ActionsCapture)) + return true; + } else { // After EndMark there can be Epsilon-transitions to the Final State + TVector<Transition> epsilonTrans; + SingleState newSt(state); + NextStates(newSt, Translate(Epsilon), epsilonTrans); + for (auto new_trans : epsilonTrans) { + size_t fin = new_trans.GetState(); + if (IsFinal(fin)) { + matched = true; + if (st.HasBegin() || (trans.GetAction() & ActionsCapture)) + return true; + } + } + } + } + return false; + } + + bool Matched(const SingleState& st) const + { + bool matched; + Captured(st, matched); + return matched; + } + + bool GetCapture(const State& st, SingleState& final) const + { + size_t pos = 0; + bool matched = false; + bool ans = false; + while (pos < st.GetSize() && !matched) { + ans = Captured(st.GetState(pos), matched); + ++pos; + } + if (matched) { + final = st.GetState(pos - 1); + return ans; + } else { + if (st.IsMatched()) { + final = st.GetMatched(); + return true; + } + return false; + } + } + + bool PushState(State& nlist, const SingleState& state) const + { + nlist.PushState(state); + if (Matched(state)) { + nlist.AddMatch(state); + return true; + } + return false; + } + + void UpdateNList(State& nlist, const PriorityStates& states) const + { + static constexpr std::array<RepetitionTypes, 3> m_type_by_priority{{NonGreedyRepetition, NoRepetition, GreedyRepetition}}; + for (const auto type : m_type_by_priority) { + for (const auto& state : states.Get(type)) { + if (PushState(nlist, state)) // Because we have strict priorities, after matching some state, we can be sure, that not states after will be better + return; + } + } + } + + void Initialize(State& nlist) const + { + PriorityStates states; + SingleState init(GetStart()); + TVector<bool> used(GetSize()); + NextAndGetToGroups(states, init, Translate(BeginMark), 0, used); + NextAndGetToGroups(states, 0, Translate(BeginMark), 0, used); + UpdateNList(nlist, states); + } + + Action NextTranslated(State& clist, Char letter) const + { + State nlist; + if (clist.IsMatched()) + nlist.AddMatch(clist.GetMatched()); + nlist.SetPos(clist.GetPos() + 1); + size_t strpos = nlist.GetPos(); + TVector<bool> used(GetSize()); + size_t pos = 0; + while (pos < clist.GetSize()) { + PriorityStates states; + NextAndGetToGroups(states, clist.GetState(pos), letter, strpos, used); + UpdateNList(nlist, states); + ++pos; + } + DoSwap(clist, nlist); + return 0; + } + + void TakeAction(State&, Action) const {} + + Action Next(State& st, Char letter) const + { + return NextTranslated(st, Translate(letter)); + } public: - SlowCapturingScanner() - : SlowScanner(true) - { - } - - SlowCapturingScanner(Fsm& fsm, size_t distance = 0) - : SlowScanner(fsm, true, false, distance) - { - } + SlowCapturingScanner() + : SlowScanner(true) + { + } + + SlowCapturingScanner(Fsm& fsm, size_t distance = 0) + : SlowScanner(fsm, true, false, distance) + { + } }; namespace Features { - Feature::Ptr Capture(size_t pos); + Feature::Ptr Capture(size_t pos); } } diff --git a/library/cpp/regex/pire/pire/extra/count.cpp b/library/cpp/regex/pire/pire/extra/count.cpp index 2fb7c8d0613..df3d520400a 100644 --- a/library/cpp/regex/pire/pire/extra/count.cpp +++ b/library/cpp/regex/pire/pire/extra/count.cpp @@ -44,221 +44,221 @@ class CountingFsmTask; class CountingFsm { public: - typedef Fsm::LettersTbl LettersTbl; - - enum { - NotMatched = 1 << 0, - Matched = 1 << 1, - Separated = 1 << 2, - }; - - explicit CountingFsm(Fsm re, Fsm sep) - : mFsm(std::move(re)) - { - mFsm.Canonize(); - const auto reMatchedStates = mFsm.Finals(); - - sep.Canonize(); - for (size_t state = 0; state < sep.Size(); ++state) { - sep.SetTag(state, Separated); - } - mFsm += sep; - - mReInitial = mFsm.Initial(); - const auto allowEmptySeparator = sep.IsFinal(sep.Initial()); - for (auto reMatchedState : reMatchedStates) { - mFsm.SetTag(reMatchedState, Matched); - if (allowEmptySeparator) { - mFsm.SetFinal(reMatchedState, true); - } - } - - mFsm.PrependAnything(); - mFsm.RemoveEpsilons(); - } - - const LettersTbl& Letters() const { - return mFsm.Letters(); - } - - const Fsm& Determined() const { - return mDetermined; - } - - Action Output(size_t from, Char letter) const { - const auto& row = mActions[from]; - const auto it = row.find(letter); - if (it != row.end()) { - return it->second; - } else { - return 0; - } - } - - bool Simple() const { - return mSimple; - } - - bool Determine(); - void Minimize(); + typedef Fsm::LettersTbl LettersTbl; + + enum { + NotMatched = 1 << 0, + Matched = 1 << 1, + Separated = 1 << 2, + }; + + explicit CountingFsm(Fsm re, Fsm sep) + : mFsm(std::move(re)) + { + mFsm.Canonize(); + const auto reMatchedStates = mFsm.Finals(); + + sep.Canonize(); + for (size_t state = 0; state < sep.Size(); ++state) { + sep.SetTag(state, Separated); + } + mFsm += sep; + + mReInitial = mFsm.Initial(); + const auto allowEmptySeparator = sep.IsFinal(sep.Initial()); + for (auto reMatchedState : reMatchedStates) { + mFsm.SetTag(reMatchedState, Matched); + if (allowEmptySeparator) { + mFsm.SetFinal(reMatchedState, true); + } + } + + mFsm.PrependAnything(); + mFsm.RemoveEpsilons(); + } + + const LettersTbl& Letters() const { + return mFsm.Letters(); + } + + const Fsm& Determined() const { + return mDetermined; + } + + Action Output(size_t from, Char letter) const { + const auto& row = mActions[from]; + const auto it = row.find(letter); + if (it != row.end()) { + return it->second; + } else { + return 0; + } + } + + bool Simple() const { + return mSimple; + } + + bool Determine(); + void Minimize(); private: - void SwapTaskOutputs(CountingFsmTask& task); + void SwapTaskOutputs(CountingFsmTask& task); private: - Fsm mFsm; - size_t mReInitial; - Fsm mDetermined; - TransitionTagTable mActions; - bool mSimple; + Fsm mFsm; + size_t mReInitial; + Fsm mDetermined; + TransitionTagTable mActions; + bool mSimple; }; class CountingFsmTask { public: - typedef Fsm::LettersTbl LettersTbl; + typedef Fsm::LettersTbl LettersTbl; - virtual ~CountingFsmTask() {} + virtual ~CountingFsmTask() {} - void Connect(size_t from, size_t to, Char letter) { - mNewFsm.Connect(from, to, letter); - } + void Connect(size_t from, size_t to, Char letter) { + mNewFsm.Connect(from, to, letter); + } - typedef bool Result; + typedef bool Result; - static Result Success() { - return true; - } + static Result Success() { + return true; + } - static Result Failure() { - return false; - } + static Result Failure() { + return false; + } - Fsm& Output() { - return mNewFsm; - } + Fsm& Output() { + return mNewFsm; + } - TransitionTagTable& Actions() { - return mNewActions; - } + TransitionTagTable& Actions() { + return mNewActions; + } protected: - void ResizeOutput(size_t size) { - mNewFsm.Resize(size); - mNewActions.resize(size); - } + void ResizeOutput(size_t size) { + mNewFsm.Resize(size); + mNewActions.resize(size); + } private: - Fsm mNewFsm; - TransitionTagTable mNewActions; + Fsm mNewFsm; + TransitionTagTable mNewActions; }; class StateLessForMinimize { public: - StateLessForMinimize(const CountingFsm& fsm) : mFsm(fsm) {} - - bool operator()(size_t first, size_t second) const { - for (auto&& lettersEl : mFsm.Letters()) { - const auto letter = lettersEl.first; - if (mFsm.Output(first, letter) != mFsm.Output(second, letter)) { - return mFsm.Output(first, letter) < mFsm.Output(second, letter); - } - } - return false; - } + StateLessForMinimize(const CountingFsm& fsm) : mFsm(fsm) {} + + bool operator()(size_t first, size_t second) const { + for (auto&& lettersEl : mFsm.Letters()) { + const auto letter = lettersEl.first; + if (mFsm.Output(first, letter) != mFsm.Output(second, letter)) { + return mFsm.Output(first, letter) < mFsm.Output(second, letter); + } + } + return false; + } private: - const CountingFsm& mFsm; + const CountingFsm& mFsm; }; class CountingFsmMinimizeTask : public CountingFsmTask { public: - explicit CountingFsmMinimizeTask(const CountingFsm& fsm) - : mFsm(fsm) - , reversedTransitions(fsm.Determined().Size()) - , StateClass(fsm.Determined().Size()) - , Classes(0) - { - TMap<size_t, size_t, StateLessForMinimize> stateClassMap = TMap<size_t, size_t, StateLessForMinimize>(StateLessForMinimize(mFsm)); - for (size_t state = 0; state < mFsm.Determined().Size(); ++state) { - if (stateClassMap.find(state) == stateClassMap.end()) { - stateClassMap[state] = Classes++; - } - StateClass[state] = stateClassMap[state]; - reversedTransitions[state].resize(mFsm.Letters().Size()); - } - - for (size_t state = 0; state < mFsm.Determined().Size(); ++state) { - TSet<ypair<Char, size_t>> usedTransitions; - for (const auto& letter : mFsm.Letters()) { - const auto destination = Next(state, letter.first); - const auto letterId = letter.second.first; - if (usedTransitions.find(ymake_pair(letterId, destination)) == usedTransitions.end()) { - usedTransitions.insert(ymake_pair(letterId, destination)); - reversedTransitions[destination][letterId].push_back(state); - } - } - } - } - - TVector<size_t>& GetStateClass() { return StateClass; } - - size_t& GetClassesNumber() { return Classes; } - - size_t LettersCount() const { - return mFsm.Letters().Size(); - } - - bool IsDetermined() const { - return mFsm.Determined().IsDetermined(); - } - - size_t Size() const { - return mFsm.Determined().Size(); - } - - const TVector<size_t>& Previous(size_t state, size_t letter) const { - return reversedTransitions[state][letter]; - } - - void AcceptStates() { - ResizeOutput(Classes); - auto& newFsm = Output(); - auto& newActions = Actions(); - newFsm.SetFinal(0, false); - - // Unite equality classes into new states - for (size_t from = 0; from != Size(); ++from) { - const auto fromMinimized = StateClass[from]; - for (auto&& letter : mFsm.Letters()) { - const auto representative = letter.first; - const auto next = Next(from, representative); - const auto nextMinimized = StateClass[next]; - Connect(fromMinimized, nextMinimized, representative); - const auto outputs = mFsm.Output(from, representative); - if (outputs) { - newActions[fromMinimized][representative] = outputs; - } - } - if (mFsm.Determined().IsFinal(from)) { - newFsm.SetFinal(fromMinimized, true); - } - } - newFsm.SetInitial(StateClass[mFsm.Determined().Initial()]); - newFsm.SetIsDetermined(true); - } + explicit CountingFsmMinimizeTask(const CountingFsm& fsm) + : mFsm(fsm) + , reversedTransitions(fsm.Determined().Size()) + , StateClass(fsm.Determined().Size()) + , Classes(0) + { + TMap<size_t, size_t, StateLessForMinimize> stateClassMap = TMap<size_t, size_t, StateLessForMinimize>(StateLessForMinimize(mFsm)); + for (size_t state = 0; state < mFsm.Determined().Size(); ++state) { + if (stateClassMap.find(state) == stateClassMap.end()) { + stateClassMap[state] = Classes++; + } + StateClass[state] = stateClassMap[state]; + reversedTransitions[state].resize(mFsm.Letters().Size()); + } + + for (size_t state = 0; state < mFsm.Determined().Size(); ++state) { + TSet<ypair<Char, size_t>> usedTransitions; + for (const auto& letter : mFsm.Letters()) { + const auto destination = Next(state, letter.first); + const auto letterId = letter.second.first; + if (usedTransitions.find(ymake_pair(letterId, destination)) == usedTransitions.end()) { + usedTransitions.insert(ymake_pair(letterId, destination)); + reversedTransitions[destination][letterId].push_back(state); + } + } + } + } + + TVector<size_t>& GetStateClass() { return StateClass; } + + size_t& GetClassesNumber() { return Classes; } + + size_t LettersCount() const { + return mFsm.Letters().Size(); + } + + bool IsDetermined() const { + return mFsm.Determined().IsDetermined(); + } + + size_t Size() const { + return mFsm.Determined().Size(); + } + + const TVector<size_t>& Previous(size_t state, size_t letter) const { + return reversedTransitions[state][letter]; + } + + void AcceptStates() { + ResizeOutput(Classes); + auto& newFsm = Output(); + auto& newActions = Actions(); + newFsm.SetFinal(0, false); + + // Unite equality classes into new states + for (size_t from = 0; from != Size(); ++from) { + const auto fromMinimized = StateClass[from]; + for (auto&& letter : mFsm.Letters()) { + const auto representative = letter.first; + const auto next = Next(from, representative); + const auto nextMinimized = StateClass[next]; + Connect(fromMinimized, nextMinimized, representative); + const auto outputs = mFsm.Output(from, representative); + if (outputs) { + newActions[fromMinimized][representative] = outputs; + } + } + if (mFsm.Determined().IsFinal(from)) { + newFsm.SetFinal(fromMinimized, true); + } + } + newFsm.SetInitial(StateClass[mFsm.Determined().Initial()]); + newFsm.SetIsDetermined(true); + } private: - const CountingFsm& mFsm; - TVector<TVector<TVector<size_t>>> reversedTransitions; - TVector<size_t> StateClass; - size_t Classes; - - size_t Next(size_t state, Char letter) const { - const auto& tos = mFsm.Determined().Destinations(state, letter); - Y_ASSERT(tos.size() == 1); - return *tos.begin(); - } + const CountingFsm& mFsm; + TVector<TVector<TVector<size_t>>> reversedTransitions; + TVector<size_t> StateClass; + size_t Classes; + + size_t Next(size_t state, Char letter) const { + const auto& tos = mFsm.Determined().Destinations(state, letter); + Y_ASSERT(tos.size() == 1); + return *tos.begin(); + } }; typedef size_t RawState; @@ -267,612 +267,612 @@ typedef TSet<TaggedState> StateGroup; struct DeterminedState { public: - StateGroup matched; - StateGroup unmatched; - StateGroup separated; - StateGroup lagging; + StateGroup matched; + StateGroup unmatched; + StateGroup separated; + StateGroup lagging; }; bool operator < (const DeterminedState& left, const DeterminedState& right) { - auto asTuple = [](const DeterminedState& state) { - return std::tie(state.matched, state.unmatched, state.separated, state.lagging); - }; + auto asTuple = [](const DeterminedState& state) { + return std::tie(state.matched, state.unmatched, state.separated, state.lagging); + }; - return asTuple(left) < asTuple(right); + return asTuple(left) < asTuple(right); } bool InvalidCharRange(const TVector<Char>& range) { - for (const auto letter : range) { - if (letter < MaxCharUnaligned && letter != 256) { - return false; - } - } - return true; + for (const auto letter : range) { + if (letter < MaxCharUnaligned && letter != 256) { + return false; + } + } + return true; } class BasicCountingFsmDetermineTask : public CountingFsmTask { public: - using CountingFsmTask::LettersTbl; - typedef DeterminedState State; - typedef TMap<State, size_t> InvStates; - - explicit BasicCountingFsmDetermineTask(const Fsm& fsm, RawState reInitial) - : mFsm(fsm) - , mReInitial{reInitial} - { - mDeadStates = fsm.DeadStates(); - for (auto&& letter : fsm.Letters()) { - if (InvalidCharRange(letter.second.second)) { - mInvalidLetters.insert(letter.first); - } - } - } - - const LettersTbl& Letters() const { - return mFsm.Letters(); - } - - State Initial() const { - return State{StateGroup{}, InitialGroup(), StateGroup{}, StateGroup{}}; - } - - bool IsRequired(const State& state) const { - Y_UNUSED(state); - return true; - } - - State Next(const State& state, Char letter) const { - if (mInvalidLetters.count(letter) != 0) { - AddAction(state, letter, CountingFsm::NotMatched); - return Initial(); - } - - auto next = PrepareNextState(state, letter); - AddAction(state, letter, CalculateTransitionTag(state, next)); - PostProcessNextState(next); - NormalizeState(next); - - return next; - } - - void AcceptStates(const TVector<State>& states) - { - ResizeOutput(states.size()); - auto& newFsm = Output(); - auto& newActions = Actions(); - newFsm.SetInitial(0); - newFsm.SetIsDetermined(true); - - for (size_t ns = 0; ns < states.size(); ++ns) { - const auto& state = states[ns]; - newFsm.SetFinal(ns, HasFinals(state.unmatched)); - - auto outputIt = mActionByState.find(state); - if (outputIt != mActionByState.end()) { - newActions[ns].swap(outputIt->second); - } - } - } + using CountingFsmTask::LettersTbl; + typedef DeterminedState State; + typedef TMap<State, size_t> InvStates; + + explicit BasicCountingFsmDetermineTask(const Fsm& fsm, RawState reInitial) + : mFsm(fsm) + , mReInitial{reInitial} + { + mDeadStates = fsm.DeadStates(); + for (auto&& letter : fsm.Letters()) { + if (InvalidCharRange(letter.second.second)) { + mInvalidLetters.insert(letter.first); + } + } + } + + const LettersTbl& Letters() const { + return mFsm.Letters(); + } + + State Initial() const { + return State{StateGroup{}, InitialGroup(), StateGroup{}, StateGroup{}}; + } + + bool IsRequired(const State& state) const { + Y_UNUSED(state); + return true; + } + + State Next(const State& state, Char letter) const { + if (mInvalidLetters.count(letter) != 0) { + AddAction(state, letter, CountingFsm::NotMatched); + return Initial(); + } + + auto next = PrepareNextState(state, letter); + AddAction(state, letter, CalculateTransitionTag(state, next)); + PostProcessNextState(next); + NormalizeState(next); + + return next; + } + + void AcceptStates(const TVector<State>& states) + { + ResizeOutput(states.size()); + auto& newFsm = Output(); + auto& newActions = Actions(); + newFsm.SetInitial(0); + newFsm.SetIsDetermined(true); + + for (size_t ns = 0; ns < states.size(); ++ns) { + const auto& state = states[ns]; + newFsm.SetFinal(ns, HasFinals(state.unmatched)); + + auto outputIt = mActionByState.find(state); + if (outputIt != mActionByState.end()) { + newActions[ns].swap(outputIt->second); + } + } + } protected: - void SplitDestinations(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const StateGroup& source, Char letter) const { - for (const auto& state : source) { - MakeTaggedStates(matched, unmatched, separated, mFsm.Destinations(state.first, letter), state.second); - if (mFsm.IsFinal(state.first)) { - // Implicit epsilon transitions from final states to reInitial after matching separator - MakeTaggedStates(separated, separated, separated, mFsm.Destinations(mReInitial, letter), CountingFsm::Separated); - } - } - } - - Action CalculateTransitionTagImpl(const State& dest) const { - Action result = 0; - if (!dest.matched.empty()) { - result = AdvancedCountingScanner::IncrementAction; - } else if (dest.unmatched.empty()) { - if (!dest.separated.empty()) { - for (const auto& state : dest.separated) { - if (state.second == CountingFsm::Matched) { - result = AdvancedCountingScanner::IncrementAction; - } - } - } else { - result = AdvancedCountingScanner::ResetAction; - for (const auto& state : dest.lagging) { - if (state.second != CountingFsm::NotMatched) { - result |= AdvancedCountingScanner::IncrementAction; - } - } - } - } - return result; - } - - unsigned long TagsOfGroup(const StateGroup& group) const { - unsigned long result = 0; - for (const auto& state : group) { - result |= state.second; - } - return result; - } - - void SplitGroupByTag(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const StateGroup& source, bool useFsmTag) const { - for (const auto& state : source) { - auto tag = useFsmTag ? mFsm.Tag(state.first) : state.second; - if (tag == CountingFsm::Matched) { - matched.insert(state); - } else if (tag == CountingFsm::Separated) { - separated.insert(state); - } else { - unmatched.insert(state); - } - } - } - - void UpdateLaggingStates(State& state, bool moveToLagging) const { - if (!state.matched.empty()) { - if (moveToLagging) { - state.lagging.insert(state.unmatched.cbegin(), state.unmatched.cend()); - state.lagging.insert(state.separated.cbegin(), state.separated.cend()); - } - state.unmatched.clear(); - state.separated.clear(); - } - if (state.unmatched.empty() && !state.separated.empty()) { - const auto unmatchedTags = TagsOfGroup(state.separated); - if ((unmatchedTags & CountingFsm::Matched) && (unmatchedTags != CountingFsm::Matched)) { - StateGroup separatedMatched; - for (const auto& separatedState : state.separated) { - if (separatedState.second == CountingFsm::Matched) { - separatedMatched.insert(separatedState); - } else if (moveToLagging) { - state.lagging.insert(separatedState); - } - } - state.separated.swap(separatedMatched); - } - } - } - - void RemoveDuplicateLaggingStates(State& state) const { - const auto statesToRemove = GetRawStates({state.matched, state.unmatched, state.separated}, 0); - const auto unmatchedStatesToRemove = GetRawStates({state.lagging}, CountingFsm::NotMatched); - - StateGroup newLagging; - for (const auto& taggedState : state.lagging) { - if (statesToRemove.count(taggedState.first) == 0) { - if (taggedState.second != CountingFsm::NotMatched || unmatchedStatesToRemove.count(taggedState.first) == 0) { - newLagging.insert(taggedState); - } - } - } - state.lagging.swap(newLagging); - } - - void RemoveDuplicateSeparatedStates(State& state) const { - if (state.separated.empty()) { - return; - } - const auto statesToRemove = GetRawStates({state.matched, state.unmatched}, 0); - RemoveRawStates(state.separated, statesToRemove); - } - - void NormalizeState(State& state) const { - if (!state.matched.empty()) { - Y_ASSERT(state.unmatched.empty()); - state.unmatched.swap(state.matched); - } - - if (state.unmatched.empty() && !state.separated.empty()) { - state.unmatched.swap(state.separated); - } - - if (state.unmatched.empty() && !state.lagging.empty()) { - State groups; - SplitGroupByTag(groups.matched, groups.unmatched, groups.separated, state.lagging, false); - if (!groups.matched.empty()) { - state.unmatched.swap(groups.matched); - state.separated.swap(groups.separated); - state.lagging.swap(groups.unmatched); - } else if (!groups.separated.empty()) { - state.unmatched.swap(groups.separated); - state.lagging.swap(groups.unmatched); - } else { - state.unmatched.swap(groups.unmatched); - state.lagging.swap(groups.matched); // just clear - } - } - } + void SplitDestinations(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const StateGroup& source, Char letter) const { + for (const auto& state : source) { + MakeTaggedStates(matched, unmatched, separated, mFsm.Destinations(state.first, letter), state.second); + if (mFsm.IsFinal(state.first)) { + // Implicit epsilon transitions from final states to reInitial after matching separator + MakeTaggedStates(separated, separated, separated, mFsm.Destinations(mReInitial, letter), CountingFsm::Separated); + } + } + } + + Action CalculateTransitionTagImpl(const State& dest) const { + Action result = 0; + if (!dest.matched.empty()) { + result = AdvancedCountingScanner::IncrementAction; + } else if (dest.unmatched.empty()) { + if (!dest.separated.empty()) { + for (const auto& state : dest.separated) { + if (state.second == CountingFsm::Matched) { + result = AdvancedCountingScanner::IncrementAction; + } + } + } else { + result = AdvancedCountingScanner::ResetAction; + for (const auto& state : dest.lagging) { + if (state.second != CountingFsm::NotMatched) { + result |= AdvancedCountingScanner::IncrementAction; + } + } + } + } + return result; + } + + unsigned long TagsOfGroup(const StateGroup& group) const { + unsigned long result = 0; + for (const auto& state : group) { + result |= state.second; + } + return result; + } + + void SplitGroupByTag(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const StateGroup& source, bool useFsmTag) const { + for (const auto& state : source) { + auto tag = useFsmTag ? mFsm.Tag(state.first) : state.second; + if (tag == CountingFsm::Matched) { + matched.insert(state); + } else if (tag == CountingFsm::Separated) { + separated.insert(state); + } else { + unmatched.insert(state); + } + } + } + + void UpdateLaggingStates(State& state, bool moveToLagging) const { + if (!state.matched.empty()) { + if (moveToLagging) { + state.lagging.insert(state.unmatched.cbegin(), state.unmatched.cend()); + state.lagging.insert(state.separated.cbegin(), state.separated.cend()); + } + state.unmatched.clear(); + state.separated.clear(); + } + if (state.unmatched.empty() && !state.separated.empty()) { + const auto unmatchedTags = TagsOfGroup(state.separated); + if ((unmatchedTags & CountingFsm::Matched) && (unmatchedTags != CountingFsm::Matched)) { + StateGroup separatedMatched; + for (const auto& separatedState : state.separated) { + if (separatedState.second == CountingFsm::Matched) { + separatedMatched.insert(separatedState); + } else if (moveToLagging) { + state.lagging.insert(separatedState); + } + } + state.separated.swap(separatedMatched); + } + } + } + + void RemoveDuplicateLaggingStates(State& state) const { + const auto statesToRemove = GetRawStates({state.matched, state.unmatched, state.separated}, 0); + const auto unmatchedStatesToRemove = GetRawStates({state.lagging}, CountingFsm::NotMatched); + + StateGroup newLagging; + for (const auto& taggedState : state.lagging) { + if (statesToRemove.count(taggedState.first) == 0) { + if (taggedState.second != CountingFsm::NotMatched || unmatchedStatesToRemove.count(taggedState.first) == 0) { + newLagging.insert(taggedState); + } + } + } + state.lagging.swap(newLagging); + } + + void RemoveDuplicateSeparatedStates(State& state) const { + if (state.separated.empty()) { + return; + } + const auto statesToRemove = GetRawStates({state.matched, state.unmatched}, 0); + RemoveRawStates(state.separated, statesToRemove); + } + + void NormalizeState(State& state) const { + if (!state.matched.empty()) { + Y_ASSERT(state.unmatched.empty()); + state.unmatched.swap(state.matched); + } + + if (state.unmatched.empty() && !state.separated.empty()) { + state.unmatched.swap(state.separated); + } + + if (state.unmatched.empty() && !state.lagging.empty()) { + State groups; + SplitGroupByTag(groups.matched, groups.unmatched, groups.separated, state.lagging, false); + if (!groups.matched.empty()) { + state.unmatched.swap(groups.matched); + state.separated.swap(groups.separated); + state.lagging.swap(groups.unmatched); + } else if (!groups.separated.empty()) { + state.unmatched.swap(groups.separated); + state.lagging.swap(groups.unmatched); + } else { + state.unmatched.swap(groups.unmatched); + state.lagging.swap(groups.matched); // just clear + } + } + } private: - virtual State PrepareNextState(const State& state, Char letter) const = 0; - - virtual void PostProcessNextState(State& next) const = 0; - - virtual Action CalculateTransitionTag(const State& source, const State& dest) const { - Y_UNUSED(source); - return CalculateTransitionTagImpl(dest); - } - - virtual StateGroup InitialGroup() const { - return StateGroup{TaggedState{mFsm.Initial(), CountingFsm::NotMatched}}; - } - - void AddAction(State from, Char letter, unsigned long value) const { - if (!value) { - return; - } - TransitionTagRow& row = mActionByState[from]; - row[letter] = value; - } - - void MakeTaggedStates(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const Fsm::StatesSet& destinations, unsigned long sourceTag) const { - for (const auto destState : destinations) { - if (mDeadStates.count(destState) == 0) { - const auto destTag = mFsm.Tag(destState); - if (sourceTag != CountingFsm::Matched && destTag == CountingFsm::Matched) { - matched.insert(ymake_pair(destState, destTag)); - } else if (sourceTag == CountingFsm::Separated || destTag == CountingFsm::Separated) { - separated.insert(ymake_pair(destState, CountingFsm::Separated)); - } else { - unmatched.insert(ymake_pair(destState, sourceTag)); - } - } - } - } - - bool HasFinals(const StateGroup& states) const { - for (const auto& state : states) { - if (mFsm.IsFinal(state.first)) { - return true; - } - } - return false; - } - - Fsm::StatesSet GetRawStates(const TVector<std::reference_wrapper<const StateGroup>> groups, unsigned long excludedTags) const { - Fsm::StatesSet result; - for (const auto& group : groups) { - for (const auto& taggedState : group.get()) { - if (!(taggedState.second & excludedTags)) { - result.insert(taggedState.first); - } - } - } - return result; - } - - void RemoveRawStates(StateGroup& group, const Fsm::StatesSet& states) const { - StateGroup removing; - for (const auto& taggedState : group) { - if (states.count(taggedState.first) != 0) { - removing.insert(taggedState); - } - } - for (const auto& taggedState : removing) { - group.erase(taggedState); - } - } + virtual State PrepareNextState(const State& state, Char letter) const = 0; + + virtual void PostProcessNextState(State& next) const = 0; + + virtual Action CalculateTransitionTag(const State& source, const State& dest) const { + Y_UNUSED(source); + return CalculateTransitionTagImpl(dest); + } + + virtual StateGroup InitialGroup() const { + return StateGroup{TaggedState{mFsm.Initial(), CountingFsm::NotMatched}}; + } + + void AddAction(State from, Char letter, unsigned long value) const { + if (!value) { + return; + } + TransitionTagRow& row = mActionByState[from]; + row[letter] = value; + } + + void MakeTaggedStates(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const Fsm::StatesSet& destinations, unsigned long sourceTag) const { + for (const auto destState : destinations) { + if (mDeadStates.count(destState) == 0) { + const auto destTag = mFsm.Tag(destState); + if (sourceTag != CountingFsm::Matched && destTag == CountingFsm::Matched) { + matched.insert(ymake_pair(destState, destTag)); + } else if (sourceTag == CountingFsm::Separated || destTag == CountingFsm::Separated) { + separated.insert(ymake_pair(destState, CountingFsm::Separated)); + } else { + unmatched.insert(ymake_pair(destState, sourceTag)); + } + } + } + } + + bool HasFinals(const StateGroup& states) const { + for (const auto& state : states) { + if (mFsm.IsFinal(state.first)) { + return true; + } + } + return false; + } + + Fsm::StatesSet GetRawStates(const TVector<std::reference_wrapper<const StateGroup>> groups, unsigned long excludedTags) const { + Fsm::StatesSet result; + for (const auto& group : groups) { + for (const auto& taggedState : group.get()) { + if (!(taggedState.second & excludedTags)) { + result.insert(taggedState.first); + } + } + } + return result; + } + + void RemoveRawStates(StateGroup& group, const Fsm::StatesSet& states) const { + StateGroup removing; + for (const auto& taggedState : group) { + if (states.count(taggedState.first) != 0) { + removing.insert(taggedState); + } + } + for (const auto& taggedState : removing) { + group.erase(taggedState); + } + } private: - const Fsm& mFsm; - RawState mReInitial; - Fsm::StatesSet mDeadStates; - TSet<Char> mInvalidLetters; + const Fsm& mFsm; + RawState mReInitial; + Fsm::StatesSet mDeadStates; + TSet<Char> mInvalidLetters; - mutable TMap<State, TransitionTagRow> mActionByState; + mutable TMap<State, TransitionTagRow> mActionByState; }; class CountingFsmDetermineTask : public BasicCountingFsmDetermineTask { public: - using BasicCountingFsmDetermineTask::State; - using BasicCountingFsmDetermineTask::LettersTbl; - using BasicCountingFsmDetermineTask::InvStates; + using BasicCountingFsmDetermineTask::State; + using BasicCountingFsmDetermineTask::LettersTbl; + using BasicCountingFsmDetermineTask::InvStates; - explicit CountingFsmDetermineTask(const Fsm& fsm, RawState reInitial) - : BasicCountingFsmDetermineTask{fsm, reInitial} - {} + explicit CountingFsmDetermineTask(const Fsm& fsm, RawState reInitial) + : BasicCountingFsmDetermineTask{fsm, reInitial} + {} private: - State PrepareNextState(const State& state, Char letter) const override { - State next; - SplitDestinations(next.matched, next.unmatched, next.separated, state.unmatched, letter); - SplitDestinations(next.separated, next.separated, next.separated, state.separated, letter); - SplitDestinations(next.lagging, next.lagging, next.lagging, state.lagging, letter); - return next; - } - - void PostProcessNextState(State& next) const override { - UpdateLaggingStates(next, true); - RemoveDuplicateLaggingStates(next); - RemoveDuplicateSeparatedStates(next); - } + State PrepareNextState(const State& state, Char letter) const override { + State next; + SplitDestinations(next.matched, next.unmatched, next.separated, state.unmatched, letter); + SplitDestinations(next.separated, next.separated, next.separated, state.separated, letter); + SplitDestinations(next.lagging, next.lagging, next.lagging, state.lagging, letter); + return next; + } + + void PostProcessNextState(State& next) const override { + UpdateLaggingStates(next, true); + RemoveDuplicateLaggingStates(next); + RemoveDuplicateSeparatedStates(next); + } }; class SimpleCountingFsmDetermineTask : public BasicCountingFsmDetermineTask { public: - using BasicCountingFsmDetermineTask::State; - using BasicCountingFsmDetermineTask::LettersTbl; - using BasicCountingFsmDetermineTask::InvStates; + using BasicCountingFsmDetermineTask::State; + using BasicCountingFsmDetermineTask::LettersTbl; + using BasicCountingFsmDetermineTask::InvStates; - static constexpr unsigned long MixedTags = CountingFsm::Separated | CountingFsm::Matched; + static constexpr unsigned long MixedTags = CountingFsm::Separated | CountingFsm::Matched; - SimpleCountingFsmDetermineTask(const Fsm& fsm, RawState reInitial) - : BasicCountingFsmDetermineTask{fsm, reInitial} - , mStartState{reInitial, CountingFsm::NotMatched} - {} + SimpleCountingFsmDetermineTask(const Fsm& fsm, RawState reInitial) + : BasicCountingFsmDetermineTask{fsm, reInitial} + , mStartState{reInitial, CountingFsm::NotMatched} + {} private: - State PrepareNextState(const State& state, Char letter) const override { - State next; - auto from = state; - const auto fromIsEmpty = IsEmptyState(from); - if (fromIsEmpty) { - from.unmatched.insert(mStartState); - } - Y_ASSERT(IsValidState(from)); - - SplitDestinations(next.matched, next.unmatched, next.separated, from.unmatched, letter); - if (next.matched.empty() && !next.separated.empty()) { - if (next.unmatched.empty()) { - SplitSeparatedByFsmTag(next); - if (next.separated.size() > 1) { - RemoveDuplicateSeparatedStates(next); - } - if (next.unmatched.empty()) { - next.unmatched.swap(next.separated); - } - } else { - ChooseOneSeparatedState(next); - } - } - if (next.matched.empty() && next.separated.empty() && !from.separated.empty()) { - if (!next.unmatched.empty()) { - ChooseOneDestState(next.separated, from.separated, letter); - } else { - SplitDestinations(next.matched, next.unmatched, next.separated, from.separated, letter); - if (next.matched.empty() && !next.separated.empty()) { - SplitSeparatedByFsmTag(next); - } - } - ChooseOneSeparatedState(next); - } - if (!fromIsEmpty && IsEmptyState(next)) { - ChooseOneDestState(next.lagging, StateGroup{mStartState}, letter); - } - - return next; - } - - void PostProcessNextState(State& next) const override { - if (!next.lagging.empty()) { - next.unmatched.swap(next.lagging); - } - UpdateLaggingStates(next, false); - RemoveDuplicateSeparatedStates(next); - } - - Action CalculateTransitionTag(const State& source, const State& dest) const override { - Action tag = CalculateTransitionTagImpl(dest); - if (!((TagsOfGroup(source.unmatched) | TagsOfGroup(source.separated)) & MixedTags)) { - tag &= AdvancedCountingScanner::IncrementAction; - } - return tag; - } - - StateGroup InitialGroup() const override { - return StateGroup{}; - } - - bool IsEmptyState(const State& state) const { - return state.matched.empty() && state.unmatched.empty() && state.separated.empty() && state.lagging.empty(); - } - - bool IsValidState(const State& state) const { - return state.matched.empty() && state.unmatched.size() <= 1 && state.separated.size() <= 1 && state.lagging.empty(); - } - - void SplitSeparatedByFsmTag(State& state) const { - Y_ASSERT(state.unmatched.empty()); - StateGroup separated; - separated.swap(state.separated); - SplitGroupByTag(state.matched, state.unmatched, state.separated, separated, true); - } - - void ChooseOneDestState(StateGroup& dest, const StateGroup& source, Char letter) const { - State destState; - SplitDestinations(destState.matched, destState.unmatched, destState.separated, source, letter); - if (!destState.matched.empty()) { - dest.swap(destState.matched); - } else if (!destState.separated.empty()) { - dest.swap(destState.separated); - } else if (!destState.unmatched.empty()) { - dest.swap(destState.unmatched); - } - } - - void ChooseOneSeparatedState(State& state) const { - if (state.separated.size() <= 1) { - return; - } - RemoveDuplicateSeparatedStates(state); - State splitted; - SplitGroupByTag(splitted.matched, splitted.unmatched, splitted.separated, state.separated, true); - if (!splitted.separated.empty()) { - state.separated.swap(splitted.separated); - } else if (!splitted.matched.empty()) { - state.separated.swap(splitted.matched); - } - } + State PrepareNextState(const State& state, Char letter) const override { + State next; + auto from = state; + const auto fromIsEmpty = IsEmptyState(from); + if (fromIsEmpty) { + from.unmatched.insert(mStartState); + } + Y_ASSERT(IsValidState(from)); + + SplitDestinations(next.matched, next.unmatched, next.separated, from.unmatched, letter); + if (next.matched.empty() && !next.separated.empty()) { + if (next.unmatched.empty()) { + SplitSeparatedByFsmTag(next); + if (next.separated.size() > 1) { + RemoveDuplicateSeparatedStates(next); + } + if (next.unmatched.empty()) { + next.unmatched.swap(next.separated); + } + } else { + ChooseOneSeparatedState(next); + } + } + if (next.matched.empty() && next.separated.empty() && !from.separated.empty()) { + if (!next.unmatched.empty()) { + ChooseOneDestState(next.separated, from.separated, letter); + } else { + SplitDestinations(next.matched, next.unmatched, next.separated, from.separated, letter); + if (next.matched.empty() && !next.separated.empty()) { + SplitSeparatedByFsmTag(next); + } + } + ChooseOneSeparatedState(next); + } + if (!fromIsEmpty && IsEmptyState(next)) { + ChooseOneDestState(next.lagging, StateGroup{mStartState}, letter); + } + + return next; + } + + void PostProcessNextState(State& next) const override { + if (!next.lagging.empty()) { + next.unmatched.swap(next.lagging); + } + UpdateLaggingStates(next, false); + RemoveDuplicateSeparatedStates(next); + } + + Action CalculateTransitionTag(const State& source, const State& dest) const override { + Action tag = CalculateTransitionTagImpl(dest); + if (!((TagsOfGroup(source.unmatched) | TagsOfGroup(source.separated)) & MixedTags)) { + tag &= AdvancedCountingScanner::IncrementAction; + } + return tag; + } + + StateGroup InitialGroup() const override { + return StateGroup{}; + } + + bool IsEmptyState(const State& state) const { + return state.matched.empty() && state.unmatched.empty() && state.separated.empty() && state.lagging.empty(); + } + + bool IsValidState(const State& state) const { + return state.matched.empty() && state.unmatched.size() <= 1 && state.separated.size() <= 1 && state.lagging.empty(); + } + + void SplitSeparatedByFsmTag(State& state) const { + Y_ASSERT(state.unmatched.empty()); + StateGroup separated; + separated.swap(state.separated); + SplitGroupByTag(state.matched, state.unmatched, state.separated, separated, true); + } + + void ChooseOneDestState(StateGroup& dest, const StateGroup& source, Char letter) const { + State destState; + SplitDestinations(destState.matched, destState.unmatched, destState.separated, source, letter); + if (!destState.matched.empty()) { + dest.swap(destState.matched); + } else if (!destState.separated.empty()) { + dest.swap(destState.separated); + } else if (!destState.unmatched.empty()) { + dest.swap(destState.unmatched); + } + } + + void ChooseOneSeparatedState(State& state) const { + if (state.separated.size() <= 1) { + return; + } + RemoveDuplicateSeparatedStates(state); + State splitted; + SplitGroupByTag(splitted.matched, splitted.unmatched, splitted.separated, state.separated, true); + if (!splitted.separated.empty()) { + state.separated.swap(splitted.separated); + } else if (!splitted.matched.empty()) { + state.separated.swap(splitted.matched); + } + } private: - TaggedState mStartState; + TaggedState mStartState; }; bool CountingFsm::Determine() { - CountingFsmDetermineTask task{mFsm, mReInitial}; - size_t maxSize = mFsm.Size() * 4096; - if (Pire::Impl::Determine(task, maxSize)) { - SwapTaskOutputs(task); - mSimple = false; - } else { - SimpleCountingFsmDetermineTask simpleTask{mFsm, mReInitial}; - if (Pire::Impl::Determine(simpleTask, std::numeric_limits<size_t>::max())) { - SwapTaskOutputs(simpleTask); - mSimple = true; - } else { - return false; - } - } - return true; + CountingFsmDetermineTask task{mFsm, mReInitial}; + size_t maxSize = mFsm.Size() * 4096; + if (Pire::Impl::Determine(task, maxSize)) { + SwapTaskOutputs(task); + mSimple = false; + } else { + SimpleCountingFsmDetermineTask simpleTask{mFsm, mReInitial}; + if (Pire::Impl::Determine(simpleTask, std::numeric_limits<size_t>::max())) { + SwapTaskOutputs(simpleTask); + mSimple = true; + } else { + return false; + } + } + return true; } void CountingFsm::Minimize() { - CountingFsmMinimizeTask task{*this}; - Pire::Impl::Minimize(task); - SwapTaskOutputs(task); + CountingFsmMinimizeTask task{*this}; + Pire::Impl::Minimize(task); + SwapTaskOutputs(task); } void CountingFsm::SwapTaskOutputs(CountingFsmTask& task) { - task.Output().Swap(mDetermined); - task.Actions().swap(mActions); + task.Output().Swap(mDetermined); + task.Actions().swap(mActions); } } namespace { - Pire::Fsm FsmForDot() { Pire::Fsm f; f.AppendDot(); return f; } - Pire::Fsm FsmForChar(Pire::Char c) { Pire::Fsm f; f.AppendSpecial(c); return f; } + Pire::Fsm FsmForDot() { Pire::Fsm f; f.AppendDot(); return f; } + Pire::Fsm FsmForChar(Pire::Char c) { Pire::Fsm f; f.AppendSpecial(c); return f; } } CountingScanner::CountingScanner(const Fsm& re, const Fsm& sep) { - Fsm res = re; - res.Surround(); - Fsm sep_re = ((sep & ~res) /* | Fsm()*/) + re; - sep_re.Determine(); - - Fsm dup = sep_re; - for (size_t i = 0; i < dup.Size(); ++i) - dup.SetTag(i, Matched); - size_t oldsize = sep_re.Size(); - sep_re.Import(dup); - for (Fsm::FinalTable::const_iterator i = sep_re.Finals().begin(), ie = sep_re.Finals().end(); i != ie; ++i) - if (*i < oldsize) - sep_re.Connect(*i, oldsize + *i); - - sep_re |= (FsmForDot() | FsmForChar(Pire::BeginMark) | FsmForChar(Pire::EndMark)); - - // Make a full Cartesian product of two sep_res - sep_re.Determine(); - sep_re.Unsparse(); - TSet<size_t> dead = sep_re.DeadStates(); - - PIRE_IFDEBUG(Cdbg << "=== Original FSM ===" << Endl << sep_re << ">>> " << sep_re.Size() << " states, dead: [" << Join(dead.begin(), dead.end(), ", ") << "]" << Endl); - - Fsm sq; - - typedef ypair<size_t, size_t> NewState; - TVector<NewState> states; - TMap<NewState, size_t> invstates; - - states.push_back(NewState(sep_re.Initial(), sep_re.Initial())); - invstates.insert(ymake_pair(states.back(), states.size() - 1)); - - // TODO: this loop reminds me a general determination task... - for (size_t curstate = 0; curstate < states.size(); ++curstate) { - - unsigned long tag = sep_re.Tag(states[curstate].first); - if (tag) - sq.SetTag(curstate, tag); - sq.SetFinal(curstate, sep_re.IsFinal(states[curstate].first)); - - PIRE_IFDEBUG(Cdbg << "State " << curstate << " = (" << states[curstate].first << ", " << states[curstate].second << ")" << Endl); - for (Fsm::LettersTbl::ConstIterator lit = sep_re.Letters().Begin(), lie = sep_re.Letters().End(); lit != lie; ++lit) { - - Char letter = lit->first; - - const Fsm::StatesSet& mr = sep_re.Destinations(states[curstate].first, letter); - const Fsm::StatesSet& br = sep_re.Destinations(states[curstate].second, letter); - - if (mr.size() != 1) - Y_ASSERT(!"Wrong transition size for main"); - if (br.size() != 1) - Y_ASSERT(!"Wrong transition size for backup"); - - NewState ns(*mr.begin(), *br.begin()); - PIRE_IFDEBUG(NewState savedNs = ns); - unsigned long outputs = 0; - - PIRE_IFDEBUG(ystring dbgout); - if (dead.find(ns.first) != dead.end()) { - PIRE_IFDEBUG(dbgout = ((sep_re.Tag(ns.first) & Matched) ? ", ++cur" : ", max <- cur")); - outputs = DeadFlag | (sep_re.Tag(ns.first) & Matched); - ns.first = ns.second; - } - if (sep_re.IsFinal(ns.first) || (sep_re.IsFinal(ns.second) && !(sep_re.Tag(ns.first) & Matched))) - ns.second = sep_re.Initial(); - - PIRE_IFDEBUG(if (ns != savedNs) Cdbg << "Diverted transition to (" << savedNs.first << ", " << savedNs.second << ") on " << (char) letter << " to (" << ns.first << ", " << ns.second << ")" << dbgout << Endl); - - TMap<NewState, size_t>::iterator nsi = invstates.find(ns); - if (nsi == invstates.end()) { - PIRE_IFDEBUG(Cdbg << "New state " << states.size() << " = (" << ns.first << ", " << ns.second << ")" << Endl); - states.push_back(ns); - nsi = invstates.insert(ymake_pair(states.back(), states.size() - 1)).first; - sq.Resize(states.size()); - } - - for (TVector<Char>::const_iterator li = lit->second.second.begin(), le = lit->second.second.end(); li != le; ++li) - sq.Connect(curstate, nsi->second, *li); - if (outputs) - sq.SetOutput(curstate, nsi->second, outputs); - } - } - - sq.Determine(); - - PIRE_IFDEBUG(Cdbg << "=== FSM ===" << Endl << sq << Endl); - Init(sq.Size(), sq.Letters(), sq.Initial(), 1); - BuildScanner(sq, *this); + Fsm res = re; + res.Surround(); + Fsm sep_re = ((sep & ~res) /* | Fsm()*/) + re; + sep_re.Determine(); + + Fsm dup = sep_re; + for (size_t i = 0; i < dup.Size(); ++i) + dup.SetTag(i, Matched); + size_t oldsize = sep_re.Size(); + sep_re.Import(dup); + for (Fsm::FinalTable::const_iterator i = sep_re.Finals().begin(), ie = sep_re.Finals().end(); i != ie; ++i) + if (*i < oldsize) + sep_re.Connect(*i, oldsize + *i); + + sep_re |= (FsmForDot() | FsmForChar(Pire::BeginMark) | FsmForChar(Pire::EndMark)); + + // Make a full Cartesian product of two sep_res + sep_re.Determine(); + sep_re.Unsparse(); + TSet<size_t> dead = sep_re.DeadStates(); + + PIRE_IFDEBUG(Cdbg << "=== Original FSM ===" << Endl << sep_re << ">>> " << sep_re.Size() << " states, dead: [" << Join(dead.begin(), dead.end(), ", ") << "]" << Endl); + + Fsm sq; + + typedef ypair<size_t, size_t> NewState; + TVector<NewState> states; + TMap<NewState, size_t> invstates; + + states.push_back(NewState(sep_re.Initial(), sep_re.Initial())); + invstates.insert(ymake_pair(states.back(), states.size() - 1)); + + // TODO: this loop reminds me a general determination task... + for (size_t curstate = 0; curstate < states.size(); ++curstate) { + + unsigned long tag = sep_re.Tag(states[curstate].first); + if (tag) + sq.SetTag(curstate, tag); + sq.SetFinal(curstate, sep_re.IsFinal(states[curstate].first)); + + PIRE_IFDEBUG(Cdbg << "State " << curstate << " = (" << states[curstate].first << ", " << states[curstate].second << ")" << Endl); + for (Fsm::LettersTbl::ConstIterator lit = sep_re.Letters().Begin(), lie = sep_re.Letters().End(); lit != lie; ++lit) { + + Char letter = lit->first; + + const Fsm::StatesSet& mr = sep_re.Destinations(states[curstate].first, letter); + const Fsm::StatesSet& br = sep_re.Destinations(states[curstate].second, letter); + + if (mr.size() != 1) + Y_ASSERT(!"Wrong transition size for main"); + if (br.size() != 1) + Y_ASSERT(!"Wrong transition size for backup"); + + NewState ns(*mr.begin(), *br.begin()); + PIRE_IFDEBUG(NewState savedNs = ns); + unsigned long outputs = 0; + + PIRE_IFDEBUG(ystring dbgout); + if (dead.find(ns.first) != dead.end()) { + PIRE_IFDEBUG(dbgout = ((sep_re.Tag(ns.first) & Matched) ? ", ++cur" : ", max <- cur")); + outputs = DeadFlag | (sep_re.Tag(ns.first) & Matched); + ns.first = ns.second; + } + if (sep_re.IsFinal(ns.first) || (sep_re.IsFinal(ns.second) && !(sep_re.Tag(ns.first) & Matched))) + ns.second = sep_re.Initial(); + + PIRE_IFDEBUG(if (ns != savedNs) Cdbg << "Diverted transition to (" << savedNs.first << ", " << savedNs.second << ") on " << (char) letter << " to (" << ns.first << ", " << ns.second << ")" << dbgout << Endl); + + TMap<NewState, size_t>::iterator nsi = invstates.find(ns); + if (nsi == invstates.end()) { + PIRE_IFDEBUG(Cdbg << "New state " << states.size() << " = (" << ns.first << ", " << ns.second << ")" << Endl); + states.push_back(ns); + nsi = invstates.insert(ymake_pair(states.back(), states.size() - 1)).first; + sq.Resize(states.size()); + } + + for (TVector<Char>::const_iterator li = lit->second.second.begin(), le = lit->second.second.end(); li != le; ++li) + sq.Connect(curstate, nsi->second, *li); + if (outputs) + sq.SetOutput(curstate, nsi->second, outputs); + } + } + + sq.Determine(); + + PIRE_IFDEBUG(Cdbg << "=== FSM ===" << Endl << sq << Endl); + Init(sq.Size(), sq.Letters(), sq.Initial(), 1); + BuildScanner(sq, *this); } namespace Impl { template <class AdvancedScanner> AdvancedScanner MakeAdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple) { - Impl::CountingFsm countingFsm{re, sep}; - if (!countingFsm.Determine()) { - throw Error("regexp pattern too complicated"); - } - countingFsm.Minimize(); - if (simple) { - *simple = countingFsm.Simple(); - } - - const auto& determined = countingFsm.Determined(); - const auto& letters = countingFsm.Letters(); - - AdvancedScanner scanner; - scanner.Init(determined.Size(), letters, determined.Initial(), 1); - for (size_t from = 0; from != determined.Size(); ++from) { - for (auto&& lettersEl : letters) { - const auto letter = lettersEl.first; - const auto& tos = determined.Destinations(from, letter); - Y_ASSERT(tos.size() == 1); - scanner.SetJump(from, letter, *tos.begin(), scanner.RemapAction(countingFsm.Output(from, letter))); - } - } - return scanner; + Impl::CountingFsm countingFsm{re, sep}; + if (!countingFsm.Determine()) { + throw Error("regexp pattern too complicated"); + } + countingFsm.Minimize(); + if (simple) { + *simple = countingFsm.Simple(); + } + + const auto& determined = countingFsm.Determined(); + const auto& letters = countingFsm.Letters(); + + AdvancedScanner scanner; + scanner.Init(determined.Size(), letters, determined.Initial(), 1); + for (size_t from = 0; from != determined.Size(); ++from) { + for (auto&& lettersEl : letters) { + const auto letter = lettersEl.first; + const auto& tos = determined.Destinations(from, letter); + Y_ASSERT(tos.size() == 1); + scanner.SetJump(from, letter, *tos.begin(), scanner.RemapAction(countingFsm.Output(from, letter))); + } + } + return scanner; } } // namespace Impl AdvancedCountingScanner::AdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple) - : AdvancedCountingScanner(Impl::MakeAdvancedCountingScanner<AdvancedCountingScanner>(re, sep, simple)) + : AdvancedCountingScanner(Impl::MakeAdvancedCountingScanner<AdvancedCountingScanner>(re, sep, simple)) { } NoGlueLimitCountingScanner::NoGlueLimitCountingScanner(const Fsm& re, const Fsm& sep, bool* simple) - : NoGlueLimitCountingScanner(Impl::MakeAdvancedCountingScanner<NoGlueLimitCountingScanner>(re, sep, simple)) + : NoGlueLimitCountingScanner(Impl::MakeAdvancedCountingScanner<NoGlueLimitCountingScanner>(re, sep, simple)) { } @@ -882,101 +882,101 @@ namespace Impl { template<class Scanner> class CountingScannerGlueTask: public ScannerGlueCommon<Scanner> { public: - using typename ScannerGlueCommon<Scanner>::State; - using TAction = typename Scanner::Action; - using InternalState = typename Scanner::InternalState; - typedef TMap<State, size_t> InvStates; - - CountingScannerGlueTask(const Scanner& lhs, const Scanner& rhs) - : ScannerGlueCommon<Scanner>(lhs, rhs, LettersEquality<Scanner>(lhs.m_letters, rhs.m_letters)) - { - } - - void AcceptStates(const TVector<State>& states) - { - States = states; - this->SetSc(THolder<Scanner>(new Scanner)); - this->Sc().Init(states.size(), this->Letters(), 0, this->Lhs().RegexpsCount() + this->Rhs().RegexpsCount()); - - for (size_t i = 0; i < states.size(); ++i) - this->Sc().SetTag(i, this->Lhs().m_tags[this->Lhs().StateIdx(states[i].first)] | (this->Rhs().m_tags[this->Rhs().StateIdx(states[i].second)] << 3)); - } - - void Connect(size_t from, size_t to, Char letter) - { - this->Sc().SetJump(from, letter, to, - Action(this->Lhs(), States[from].first, letter) | (Action(this->Rhs(), States[from].second, letter) << this->Lhs().RegexpsCount())); - } + using typename ScannerGlueCommon<Scanner>::State; + using TAction = typename Scanner::Action; + using InternalState = typename Scanner::InternalState; + typedef TMap<State, size_t> InvStates; + + CountingScannerGlueTask(const Scanner& lhs, const Scanner& rhs) + : ScannerGlueCommon<Scanner>(lhs, rhs, LettersEquality<Scanner>(lhs.m_letters, rhs.m_letters)) + { + } + + void AcceptStates(const TVector<State>& states) + { + States = states; + this->SetSc(THolder<Scanner>(new Scanner)); + this->Sc().Init(states.size(), this->Letters(), 0, this->Lhs().RegexpsCount() + this->Rhs().RegexpsCount()); + + for (size_t i = 0; i < states.size(); ++i) + this->Sc().SetTag(i, this->Lhs().m_tags[this->Lhs().StateIdx(states[i].first)] | (this->Rhs().m_tags[this->Rhs().StateIdx(states[i].second)] << 3)); + } + + void Connect(size_t from, size_t to, Char letter) + { + this->Sc().SetJump(from, letter, to, + Action(this->Lhs(), States[from].first, letter) | (Action(this->Rhs(), States[from].second, letter) << this->Lhs().RegexpsCount())); + } protected: - TVector<State> States; - TAction Action(const Scanner& sc, InternalState state, Char letter) const - { - size_t state_index = sc.StateIdx(state); - size_t transition_index = sc.TransitionIndex(state_index, letter); - const auto& tr = sc.m_jumps[transition_index]; - return tr.action; - } + TVector<State> States; + TAction Action(const Scanner& sc, InternalState state, Char letter) const + { + size_t state_index = sc.StateIdx(state); + size_t transition_index = sc.TransitionIndex(state_index, letter); + const auto& tr = sc.m_jumps[transition_index]; + return tr.action; + } }; class NoGlueLimitCountingScannerGlueTask : public CountingScannerGlueTask<NoGlueLimitCountingScanner> { public: - using ActionIndex = NoGlueLimitCountingScanner::ActionIndex; - struct TGlueAction { - TVector<ActionIndex> resets; - TVector<ActionIndex> increments; - bool operator<(const TGlueAction& rhs) const { - return std::tie(increments, resets) < std::tie(rhs.increments, rhs.resets); - } - }; - using TGlueMap = TMap<TGlueAction, ActionIndex>; - - NoGlueLimitCountingScannerGlueTask(const NoGlueLimitCountingScanner& lhs, const NoGlueLimitCountingScanner& rhs) - : CountingScannerGlueTask(lhs, rhs) - { - } - - void Connect(size_t from, size_t to, Char letter) - { - TGlueAction glue_action; - this->Lhs().GetActions(Action(this->Lhs(), States[from].first, letter), 0, - std::back_inserter(glue_action.resets), std::back_inserter(glue_action.increments)); - this->Rhs().GetActions(Action(this->Rhs(), States[from].second, letter), this->Lhs().RegexpsCount(), - std::back_inserter(glue_action.resets), std::back_inserter(glue_action.increments)); - Y_ASSERT( - std::is_sorted(glue_action.increments.begin(), glue_action.increments.end()) && - std::is_sorted(glue_action.resets.begin(), glue_action.resets.end()) - ); - - if (glue_action.increments.empty() && glue_action.resets.empty()) { - this->Sc().SetJump(from, letter, to, 0); - return; - } - - auto action_iter = glue_map_.find(glue_action); - if (action_iter == glue_map_.end()) { - glue_map_[glue_action] = glue_actions_.size(); - for (const auto& ids : {glue_action.resets, glue_action.increments}) { - glue_actions_.push_back(ids.size()); - std::copy(ids.begin(), ids.end(), std::back_inserter(glue_actions_)); - } - } - - this->Sc().SetJump(from, letter, to, glue_map_[glue_action]); - } - - // Return type is same as in parent class - // TODO: Maybe return by value to use move semantic? - const NoGlueLimitCountingScanner& Success() - { - glue_actions_[0] = glue_actions_.size(); - Sc().AcceptActions(glue_actions_); - return Sc(); - } + using ActionIndex = NoGlueLimitCountingScanner::ActionIndex; + struct TGlueAction { + TVector<ActionIndex> resets; + TVector<ActionIndex> increments; + bool operator<(const TGlueAction& rhs) const { + return std::tie(increments, resets) < std::tie(rhs.increments, rhs.resets); + } + }; + using TGlueMap = TMap<TGlueAction, ActionIndex>; + + NoGlueLimitCountingScannerGlueTask(const NoGlueLimitCountingScanner& lhs, const NoGlueLimitCountingScanner& rhs) + : CountingScannerGlueTask(lhs, rhs) + { + } + + void Connect(size_t from, size_t to, Char letter) + { + TGlueAction glue_action; + this->Lhs().GetActions(Action(this->Lhs(), States[from].first, letter), 0, + std::back_inserter(glue_action.resets), std::back_inserter(glue_action.increments)); + this->Rhs().GetActions(Action(this->Rhs(), States[from].second, letter), this->Lhs().RegexpsCount(), + std::back_inserter(glue_action.resets), std::back_inserter(glue_action.increments)); + Y_ASSERT( + std::is_sorted(glue_action.increments.begin(), glue_action.increments.end()) && + std::is_sorted(glue_action.resets.begin(), glue_action.resets.end()) + ); + + if (glue_action.increments.empty() && glue_action.resets.empty()) { + this->Sc().SetJump(from, letter, to, 0); + return; + } + + auto action_iter = glue_map_.find(glue_action); + if (action_iter == glue_map_.end()) { + glue_map_[glue_action] = glue_actions_.size(); + for (const auto& ids : {glue_action.resets, glue_action.increments}) { + glue_actions_.push_back(ids.size()); + std::copy(ids.begin(), ids.end(), std::back_inserter(glue_actions_)); + } + } + + this->Sc().SetJump(from, letter, to, glue_map_[glue_action]); + } + + // Return type is same as in parent class + // TODO: Maybe return by value to use move semantic? + const NoGlueLimitCountingScanner& Success() + { + glue_actions_[0] = glue_actions_.size(); + Sc().AcceptActions(glue_actions_); + return Sc(); + } private: - TGlueMap glue_map_; - TVector<ActionIndex> glue_actions_ = {1}; + TGlueMap glue_map_; + TVector<ActionIndex> glue_actions_ = {1}; }; @@ -984,85 +984,85 @@ private: CountingScanner CountingScanner::Glue(const CountingScanner& lhs, const CountingScanner& rhs, size_t maxSize /* = 0 */) { - if (lhs.RegexpsCount() + rhs.RegexpsCount() > MAX_RE_COUNT) { - return CountingScanner(); - } - static constexpr size_t DefMaxSize = 250000; - Impl::CountingScannerGlueTask<CountingScanner> task(lhs, rhs); - return Impl::Determine(task, maxSize ? maxSize : DefMaxSize); + if (lhs.RegexpsCount() + rhs.RegexpsCount() > MAX_RE_COUNT) { + return CountingScanner(); + } + static constexpr size_t DefMaxSize = 250000; + Impl::CountingScannerGlueTask<CountingScanner> task(lhs, rhs); + return Impl::Determine(task, maxSize ? maxSize : DefMaxSize); } AdvancedCountingScanner AdvancedCountingScanner::Glue(const AdvancedCountingScanner& lhs, const AdvancedCountingScanner& rhs, size_t maxSize /* = 0 */) { - if (lhs.RegexpsCount() + rhs.RegexpsCount() > MAX_RE_COUNT) { - return AdvancedCountingScanner(); - } - static constexpr size_t DefMaxSize = 250000; - Impl::CountingScannerGlueTask<AdvancedCountingScanner> task(lhs, rhs); - return Impl::Determine(task, maxSize ? maxSize : DefMaxSize); + if (lhs.RegexpsCount() + rhs.RegexpsCount() > MAX_RE_COUNT) { + return AdvancedCountingScanner(); + } + static constexpr size_t DefMaxSize = 250000; + Impl::CountingScannerGlueTask<AdvancedCountingScanner> task(lhs, rhs); + return Impl::Determine(task, maxSize ? maxSize : DefMaxSize); } NoGlueLimitCountingScanner NoGlueLimitCountingScanner::Glue(const NoGlueLimitCountingScanner& lhs, const NoGlueLimitCountingScanner& rhs, size_t maxSize /* = 0 */) { - static constexpr size_t DefMaxSize = 250000; - Impl::NoGlueLimitCountingScannerGlueTask task(lhs, rhs); - return Impl::Determine(task, maxSize ? maxSize : DefMaxSize); + static constexpr size_t DefMaxSize = 250000; + Impl::NoGlueLimitCountingScannerGlueTask task(lhs, rhs); + return Impl::Determine(task, maxSize ? maxSize : DefMaxSize); } // Should Save(), Load() and Mmap() functions return stream/pointer in aligned state? // Now they don't because tests don't require it. void NoGlueLimitCountingScanner::Save(yostream* s) const { - Y_ASSERT(!AdvancedScannerCompatibilityMode); - LoadedScanner::Save(s, ScannerIOTypes::NoGlueLimitCountingScanner); - if (Actions) { - SavePodArray(s, Actions, *Actions); - } else { - const ActionIndex zeroSize = 0; - SavePodType(s, zeroSize); - } + Y_ASSERT(!AdvancedScannerCompatibilityMode); + LoadedScanner::Save(s, ScannerIOTypes::NoGlueLimitCountingScanner); + if (Actions) { + SavePodArray(s, Actions, *Actions); + } else { + const ActionIndex zeroSize = 0; + SavePodType(s, zeroSize); + } } void NoGlueLimitCountingScanner::Load(yistream* s) { - ui32 type; - LoadedScanner::Load(s, &type); - ActionIndex actionsSize; - if (type == ScannerIOTypes::NoGlueLimitCountingScanner) { - LoadPodType(s, actionsSize); - - if (actionsSize == 0) { - ActionsBuffer.reset(); - Actions = nullptr; - } else { - ActionsBuffer = TActionsBuffer(new ActionIndex[actionsSize]); - ActionsBuffer[0] = actionsSize; - LoadPodArray(s, &ActionsBuffer[1], actionsSize - 1); - Actions = ActionsBuffer.get(); - } - } else { - Y_ASSERT(type == ScannerIOTypes::LoadedScanner); - AdvancedScannerCompatibilityMode = true; - } + ui32 type; + LoadedScanner::Load(s, &type); + ActionIndex actionsSize; + if (type == ScannerIOTypes::NoGlueLimitCountingScanner) { + LoadPodType(s, actionsSize); + + if (actionsSize == 0) { + ActionsBuffer.reset(); + Actions = nullptr; + } else { + ActionsBuffer = TActionsBuffer(new ActionIndex[actionsSize]); + ActionsBuffer[0] = actionsSize; + LoadPodArray(s, &ActionsBuffer[1], actionsSize - 1); + Actions = ActionsBuffer.get(); + } + } else { + Y_ASSERT(type == ScannerIOTypes::LoadedScanner); + AdvancedScannerCompatibilityMode = true; + } } const void* NoGlueLimitCountingScanner::Mmap(const void* ptr, size_t size) { - NoGlueLimitCountingScanner scanner; - ui32 type; - auto p = static_cast<const size_t*> (scanner.LoadedScanner::Mmap(ptr, size, &type)); - - if (type == ScannerIOTypes::NoGlueLimitCountingScanner) { - scanner.Actions = reinterpret_cast<const ActionIndex*>(p); - if (*scanner.Actions == 0) { - scanner.Actions = nullptr; - Impl::AdvancePtr(p, size, sizeof(ActionIndex)); - } else { - Impl::AdvancePtr(p, size, *scanner.Actions * sizeof(ActionIndex)); - } - } else { - Y_ASSERT(type == ScannerIOTypes::LoadedScanner); - scanner.AdvancedScannerCompatibilityMode = true; - } - Swap(scanner); - return static_cast<const void*>(p); + NoGlueLimitCountingScanner scanner; + ui32 type; + auto p = static_cast<const size_t*> (scanner.LoadedScanner::Mmap(ptr, size, &type)); + + if (type == ScannerIOTypes::NoGlueLimitCountingScanner) { + scanner.Actions = reinterpret_cast<const ActionIndex*>(p); + if (*scanner.Actions == 0) { + scanner.Actions = nullptr; + Impl::AdvancePtr(p, size, sizeof(ActionIndex)); + } else { + Impl::AdvancePtr(p, size, *scanner.Actions * sizeof(ActionIndex)); + } + } else { + Y_ASSERT(type == ScannerIOTypes::LoadedScanner); + scanner.AdvancedScannerCompatibilityMode = true; + } + Swap(scanner); + return static_cast<const void*>(p); } } diff --git a/library/cpp/regex/pire/pire/extra/count.h b/library/cpp/regex/pire/pire/extra/count.h index 9032a7054a6..53ef98c8dd2 100644 --- a/library/cpp/regex/pire/pire/extra/count.h +++ b/library/cpp/regex/pire/pire/extra/count.h @@ -33,81 +33,81 @@ namespace Pire { class Fsm; namespace Impl { - template<class T> - class ScannerGlueCommon; + template<class T> + class ScannerGlueCommon; - template<class T> - class CountingScannerGlueTask; + template<class T> + class CountingScannerGlueTask; - class NoGlueLimitCountingScannerGlueTask; + class NoGlueLimitCountingScannerGlueTask; - template <class AdvancedScanner> - AdvancedScanner MakeAdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple); + template <class AdvancedScanner> + AdvancedScanner MakeAdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple); }; template<size_t I> class IncrementPerformer { public: - template<typename State, typename Action> - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - static void Do(State& s, Action mask) - { - if (mask & (1 << (I - 1))) { - Increment(s); - } - IncrementPerformer<I - 1>::Do(s, mask); - } + template<typename State, typename Action> + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + static void Do(State& s, Action mask) + { + if (mask & (1 << (I - 1))) { + Increment(s); + } + IncrementPerformer<I - 1>::Do(s, mask); + } private: - template<typename State> - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - static void Increment(State& s) - { - ++s.m_current[I - 1]; - } + template<typename State> + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + static void Increment(State& s) + { + ++s.m_current[I - 1]; + } }; template<> class IncrementPerformer<0> { public: - template<typename State, typename Action> - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - static void Do(State&, Action) - { - } + template<typename State, typename Action> + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + static void Do(State&, Action) + { + } }; template<size_t I> class ResetPerformer { public: - template<typename State, typename Action> - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - static void Do(State& s, Action mask) - { - if (mask & (1 << (LoadedScanner::MAX_RE_COUNT + (I - 1))) && s.m_current[I - 1]) { - Reset(s); - } - ResetPerformer<I - 1>::Do(s, mask); - } + template<typename State, typename Action> + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + static void Do(State& s, Action mask) + { + if (mask & (1 << (LoadedScanner::MAX_RE_COUNT + (I - 1))) && s.m_current[I - 1]) { + Reset(s); + } + ResetPerformer<I - 1>::Do(s, mask); + } private: - template<typename State> - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - static void Reset(State& s) - { - s.m_total[I - 1] = ymax(s.m_total[I - 1], s.m_current[I - 1]); - s.m_current[I - 1] = 0; - } + template<typename State> + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + static void Reset(State& s) + { + s.m_total[I - 1] = ymax(s.m_total[I - 1], s.m_current[I - 1]); + s.m_current[I - 1] = 0; + } }; template<> class ResetPerformer<0> { public: - template<typename State, typename Action> - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - static void Do(State&, Action) - { - } + template<typename State, typename Action> + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + static void Do(State&, Action) + { + } }; /** @@ -118,385 +118,385 @@ public: template<class DerivedScanner, class State> class BaseCountingScanner: public LoadedScanner { public: - enum { - IncrementAction = 1, - ResetAction = 2, + enum { + IncrementAction = 1, + ResetAction = 2, - FinalFlag = 0, - DeadFlag = 1, - }; + FinalFlag = 0, + DeadFlag = 1, + }; - void Initialize(State& state) const - { - state.m_state = m.initial; - memset(&state.m_current, 0, sizeof(state.m_current)); - memset(&state.m_total, 0, sizeof(state.m_total)); - state.m_updatedMask = 0; - } + void Initialize(State& state) const + { + state.m_state = m.initial; + memset(&state.m_current, 0, sizeof(state.m_current)); + memset(&state.m_total, 0, sizeof(state.m_total)); + state.m_updatedMask = 0; + } - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - void TakeAction(State& s, Action a) const - { - static_cast<const DerivedScanner*>(this)->template TakeActionImpl<MAX_RE_COUNT>(s, a); - } + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + void TakeAction(State& s, Action a) const + { + static_cast<const DerivedScanner*>(this)->template TakeActionImpl<MAX_RE_COUNT>(s, a); + } - bool CanStop(const State&) const { return false; } + bool CanStop(const State&) const { return false; } - Char Translate(Char ch) const - { - return m_letters[static_cast<size_t>(ch)]; - } + Char Translate(Char ch) const + { + return m_letters[static_cast<size_t>(ch)]; + } - Action NextTranslated(State& s, Char c) const - { - Transition x = reinterpret_cast<const Transition*>(s.m_state)[c]; - s.m_state += SignExtend(x.shift); - return x.action; - } + Action NextTranslated(State& s, Char c) const + { + Transition x = reinterpret_cast<const Transition*>(s.m_state)[c]; + s.m_state += SignExtend(x.shift); + return x.action; + } - Action Next(State& s, Char c) const - { - return NextTranslated(s, Translate(c)); - } + Action Next(State& s, Char c) const + { + return NextTranslated(s, Translate(c)); + } - Action Next(const State& current, State& n, Char c) const - { - n = current; - return Next(n, c); - } + Action Next(const State& current, State& n, Char c) const + { + n = current; + return Next(n, c); + } - bool Final(const State& /*state*/) const { return false; } + bool Final(const State& /*state*/) const { return false; } - bool Dead(const State&) const { return false; } + bool Dead(const State&) const { return false; } - using LoadedScanner::Swap; + using LoadedScanner::Swap; - size_t StateIndex(const State& s) const { return StateIdx(s.m_state); } + size_t StateIndex(const State& s) const { return StateIdx(s.m_state); } protected: - using LoadedScanner::Init; - using LoadedScanner::InternalState; - - template<size_t ActualReCount> - void PerformIncrement(State& s, Action mask) const - { - if (mask) { - IncrementPerformer<ActualReCount>::Do(s, mask); - s.m_updatedMask |= ((size_t)mask) << MAX_RE_COUNT; - } - } - - template<size_t ActualReCount> - void PerformReset(State& s, Action mask) const - { - mask &= s.m_updatedMask; - if (mask) { - ResetPerformer<ActualReCount>::Do(s, mask); - s.m_updatedMask &= (Action)~mask; - } - } - - void Next(InternalState& s, Char c) const - { - Transition x = reinterpret_cast<const Transition*>(s)[Translate(c)]; - s += SignExtend(x.shift); - } + using LoadedScanner::Init; + using LoadedScanner::InternalState; + + template<size_t ActualReCount> + void PerformIncrement(State& s, Action mask) const + { + if (mask) { + IncrementPerformer<ActualReCount>::Do(s, mask); + s.m_updatedMask |= ((size_t)mask) << MAX_RE_COUNT; + } + } + + template<size_t ActualReCount> + void PerformReset(State& s, Action mask) const + { + mask &= s.m_updatedMask; + if (mask) { + ResetPerformer<ActualReCount>::Do(s, mask); + s.m_updatedMask &= (Action)~mask; + } + } + + void Next(InternalState& s, Char c) const + { + Transition x = reinterpret_cast<const Transition*>(s)[Translate(c)]; + s += SignExtend(x.shift); + } }; template <size_t MAX_RE_COUNT> class CountingState { public: - size_t Result(int i) const { return ymax(m_current[i], m_total[i]); } + size_t Result(int i) const { return ymax(m_current[i], m_total[i]); } private: - using InternalState = LoadedScanner::InternalState; - InternalState m_state; - ui32 m_current[MAX_RE_COUNT]; - ui32 m_total[MAX_RE_COUNT]; - size_t m_updatedMask; + using InternalState = LoadedScanner::InternalState; + InternalState m_state; + ui32 m_current[MAX_RE_COUNT]; + ui32 m_total[MAX_RE_COUNT]; + size_t m_updatedMask; - template <class DerivedScanner, class State> - friend class BaseCountingScanner; + template <class DerivedScanner, class State> + friend class BaseCountingScanner; - template<size_t I> - friend class IncrementPerformer; + template<size_t I> + friend class IncrementPerformer; - template<size_t I> - friend class ResetPerformer; + template<size_t I> + friend class ResetPerformer; #ifdef PIRE_DEBUG - friend yostream& operator << (yostream& s, const State& state) - { - s << state.m_state << " ( "; - for (size_t i = 0; i < MAX_RE_COUNT; ++i) - s << state.m_current[i] << '/' << state.m_total[i] << ' '; - return s << ')'; - } + friend yostream& operator << (yostream& s, const State& state) + { + s << state.m_state << " ( "; + for (size_t i = 0; i < MAX_RE_COUNT; ++i) + s << state.m_current[i] << '/' << state.m_total[i] << ' '; + return s << ')'; + } #endif }; class CountingScanner : public BaseCountingScanner<CountingScanner, CountingState<LoadedScanner::MAX_RE_COUNT>> { public: - using State = CountingState<MAX_RE_COUNT>; - enum { - Matched = 2, - }; - - CountingScanner() {} - CountingScanner(const Fsm& re, const Fsm& sep); - - static CountingScanner Glue(const CountingScanner& a, const CountingScanner& b, size_t maxSize = 0); - - template<size_t ActualReCount> - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - void TakeActionImpl(State& s, Action a) const - { - if (a & IncrementMask) - PerformIncrement<ActualReCount>(s, a); - if (a & ResetMask) - PerformReset<ActualReCount>(s, a); - } + using State = CountingState<MAX_RE_COUNT>; + enum { + Matched = 2, + }; + + CountingScanner() {} + CountingScanner(const Fsm& re, const Fsm& sep); + + static CountingScanner Glue(const CountingScanner& a, const CountingScanner& b, size_t maxSize = 0); + + template<size_t ActualReCount> + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + void TakeActionImpl(State& s, Action a) const + { + if (a & IncrementMask) + PerformIncrement<ActualReCount>(s, a); + if (a & ResetMask) + PerformReset<ActualReCount>(s, a); + } private: - Action RemapAction(Action action) - { - if (action == (Matched | DeadFlag)) - return 1; - else if (action == DeadFlag) - return 1 << MAX_RE_COUNT; - else - return 0; - } - - friend void BuildScanner<CountingScanner>(const Fsm&, CountingScanner&); - friend class Impl::ScannerGlueCommon<CountingScanner>; - friend class Impl::CountingScannerGlueTask<CountingScanner>; + Action RemapAction(Action action) + { + if (action == (Matched | DeadFlag)) + return 1; + else if (action == DeadFlag) + return 1 << MAX_RE_COUNT; + else + return 0; + } + + friend void BuildScanner<CountingScanner>(const Fsm&, CountingScanner&); + friend class Impl::ScannerGlueCommon<CountingScanner>; + friend class Impl::CountingScannerGlueTask<CountingScanner>; }; class AdvancedCountingScanner : public BaseCountingScanner<AdvancedCountingScanner, CountingState<LoadedScanner::MAX_RE_COUNT>> { public: - using State = CountingState<MAX_RE_COUNT>; + using State = CountingState<MAX_RE_COUNT>; - AdvancedCountingScanner() {} - AdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple = nullptr); + AdvancedCountingScanner() {} + AdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple = nullptr); - static AdvancedCountingScanner Glue(const AdvancedCountingScanner& a, const AdvancedCountingScanner& b, size_t maxSize = 0); + static AdvancedCountingScanner Glue(const AdvancedCountingScanner& a, const AdvancedCountingScanner& b, size_t maxSize = 0); - template<size_t ActualReCount> - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - void TakeActionImpl(State& s, Action a) const - { - if (a & ResetMask) { - PerformReset<ActualReCount>(s, a); - } - if (a & IncrementMask) { - PerformIncrement<ActualReCount>(s, a); - } - } + template<size_t ActualReCount> + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + void TakeActionImpl(State& s, Action a) const + { + if (a & ResetMask) { + PerformReset<ActualReCount>(s, a); + } + if (a & IncrementMask) { + PerformIncrement<ActualReCount>(s, a); + } + } private: - Action RemapAction(Action action) - { - Action result = 0; - if (action & ResetAction) { - result = 1 << MAX_RE_COUNT; - } - if (action & IncrementAction) { - result |= 1; - } - return result; - } - - friend class Impl::ScannerGlueCommon<AdvancedCountingScanner>; - friend class Impl::CountingScannerGlueTask<AdvancedCountingScanner>; - friend AdvancedCountingScanner Impl::MakeAdvancedCountingScanner<AdvancedCountingScanner>(const Fsm&, const Fsm&, bool*); + Action RemapAction(Action action) + { + Action result = 0; + if (action & ResetAction) { + result = 1 << MAX_RE_COUNT; + } + if (action & IncrementAction) { + result |= 1; + } + return result; + } + + friend class Impl::ScannerGlueCommon<AdvancedCountingScanner>; + friend class Impl::CountingScannerGlueTask<AdvancedCountingScanner>; + friend AdvancedCountingScanner Impl::MakeAdvancedCountingScanner<AdvancedCountingScanner>(const Fsm&, const Fsm&, bool*); }; class NoGlueLimitCountingState { public: - size_t Result(int i) const { return ymax(m_current[i], m_total[i]); } + size_t Result(int i) const { return ymax(m_current[i], m_total[i]); } void Initialize(size_t initial, size_t regexpsCount) { - m_state = initial; - m_current.assign(regexpsCount, 0); - m_total.assign(regexpsCount, 0); - } - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - void Reset(size_t regexpId) { - m_current[regexpId] = 0; - } - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - void Increment(size_t regexp_id) { - ++m_current[regexp_id]; - m_total[regexp_id] = ymax(m_total[regexp_id], m_current[regexp_id]); - } - - template<size_t I> - friend class IncrementPerformer; - - template<size_t I> - friend class ResetPerformer; + m_state = initial; + m_current.assign(regexpsCount, 0); + m_total.assign(regexpsCount, 0); + } + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + void Reset(size_t regexpId) { + m_current[regexpId] = 0; + } + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + void Increment(size_t regexp_id) { + ++m_current[regexp_id]; + m_total[regexp_id] = ymax(m_total[regexp_id], m_current[regexp_id]); + } + + template<size_t I> + friend class IncrementPerformer; + + template<size_t I> + friend class ResetPerformer; private: - LoadedScanner::InternalState m_state; - TVector<ui32> m_current; - TVector<ui32> m_total; + LoadedScanner::InternalState m_state; + TVector<ui32> m_current; + TVector<ui32> m_total; - template <class DerivedScanner, class State> - friend class BaseCountingScanner; + template <class DerivedScanner, class State> + friend class BaseCountingScanner; #ifdef PIRE_DEBUG - yostream& operator << (yostream& s, const State& state) - { - s << state.m_state << " ( "; - for (size_t i = 0; i < state.m_current.size(); ++i) - s << state.m_current[i] << '/' << state.m_total[i] << ' '; - return s << ')'; - } + yostream& operator << (yostream& s, const State& state) + { + s << state.m_state << " ( "; + for (size_t i = 0; i < state.m_current.size(); ++i) + s << state.m_current[i] << '/' << state.m_total[i] << ' '; + return s << ')'; + } #endif }; class NoGlueLimitCountingScanner : public BaseCountingScanner<NoGlueLimitCountingScanner, NoGlueLimitCountingState> { public: - using State = NoGlueLimitCountingState; - using ActionIndex = ui32; - using TActionsBuffer = std::unique_ptr<ActionIndex[]>; + using State = NoGlueLimitCountingState; + using ActionIndex = ui32; + using TActionsBuffer = std::unique_ptr<ActionIndex[]>; private: - TActionsBuffer ActionsBuffer; - const ActionIndex* Actions = nullptr; - bool AdvancedScannerCompatibilityMode = false; + TActionsBuffer ActionsBuffer; + const ActionIndex* Actions = nullptr; + bool AdvancedScannerCompatibilityMode = false; public: - NoGlueLimitCountingScanner() = default; - NoGlueLimitCountingScanner(const Fsm& re, const Fsm& sep, bool* simple = nullptr); - NoGlueLimitCountingScanner(const NoGlueLimitCountingScanner& rhs) - : BaseCountingScanner(rhs) - , AdvancedScannerCompatibilityMode(rhs.AdvancedScannerCompatibilityMode) - { - if (rhs.ActionsBuffer) { - Y_ASSERT(rhs.Actions); - ActionsBuffer = TActionsBuffer(new ActionIndex [*rhs.Actions]); - std::copy_n(rhs.ActionsBuffer.get(), *rhs.Actions, ActionsBuffer.get()); - Actions = ActionsBuffer.get(); - } else { - Actions = rhs.Actions; - } - } - - NoGlueLimitCountingScanner(NoGlueLimitCountingScanner&& other) : BaseCountingScanner() { - Swap(other); - } - - NoGlueLimitCountingScanner& operator=(NoGlueLimitCountingScanner rhs) { - Swap(rhs); - return *this; - } - - void Swap(NoGlueLimitCountingScanner& s) { - LoadedScanner::Swap(s); - DoSwap(ActionsBuffer, s.ActionsBuffer); - DoSwap(Actions, s.Actions); - DoSwap(AdvancedScannerCompatibilityMode, s.AdvancedScannerCompatibilityMode); - } - - void Initialize(State& state) const - { - state.Initialize(m.initial, RegexpsCount()); - } - - template <size_t ActualReCount> - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - void TakeActionImpl(State& s, Action a) const - { - if (!a) { - return; - } - if (AdvancedScannerCompatibilityMode) { - AdvancedScannerTakeActionImpl<ActualReCount>(s, a); - return; - } - // Note: it's important to perform resets before increments, - // as it's possible for one repetition group to stop and another begin at the same symbol - if (Actions) { - auto action = Actions + a; - for (auto reset_count = *action++; reset_count--;) { - s.Reset(*action++); - } - for (auto inc_count = *action++; inc_count--;) { - s.Increment(*action++); - } - } else { - Y_ASSERT(RegexpsCount() == 1); - if (a & ResetAction) { - s.Reset(0); - } - if (a & IncrementAction) { - s.Increment(0); - } - } - } - - void Save(yostream* s) const; - - void Load(yistream* s); - - const void* Mmap(const void* ptr, size_t size); - - static NoGlueLimitCountingScanner Glue(const NoGlueLimitCountingScanner& a, const NoGlueLimitCountingScanner& b, size_t maxSize = 0); + NoGlueLimitCountingScanner() = default; + NoGlueLimitCountingScanner(const Fsm& re, const Fsm& sep, bool* simple = nullptr); + NoGlueLimitCountingScanner(const NoGlueLimitCountingScanner& rhs) + : BaseCountingScanner(rhs) + , AdvancedScannerCompatibilityMode(rhs.AdvancedScannerCompatibilityMode) + { + if (rhs.ActionsBuffer) { + Y_ASSERT(rhs.Actions); + ActionsBuffer = TActionsBuffer(new ActionIndex [*rhs.Actions]); + std::copy_n(rhs.ActionsBuffer.get(), *rhs.Actions, ActionsBuffer.get()); + Actions = ActionsBuffer.get(); + } else { + Actions = rhs.Actions; + } + } + + NoGlueLimitCountingScanner(NoGlueLimitCountingScanner&& other) : BaseCountingScanner() { + Swap(other); + } + + NoGlueLimitCountingScanner& operator=(NoGlueLimitCountingScanner rhs) { + Swap(rhs); + return *this; + } + + void Swap(NoGlueLimitCountingScanner& s) { + LoadedScanner::Swap(s); + DoSwap(ActionsBuffer, s.ActionsBuffer); + DoSwap(Actions, s.Actions); + DoSwap(AdvancedScannerCompatibilityMode, s.AdvancedScannerCompatibilityMode); + } + + void Initialize(State& state) const + { + state.Initialize(m.initial, RegexpsCount()); + } + + template <size_t ActualReCount> + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + void TakeActionImpl(State& s, Action a) const + { + if (!a) { + return; + } + if (AdvancedScannerCompatibilityMode) { + AdvancedScannerTakeActionImpl<ActualReCount>(s, a); + return; + } + // Note: it's important to perform resets before increments, + // as it's possible for one repetition group to stop and another begin at the same symbol + if (Actions) { + auto action = Actions + a; + for (auto reset_count = *action++; reset_count--;) { + s.Reset(*action++); + } + for (auto inc_count = *action++; inc_count--;) { + s.Increment(*action++); + } + } else { + Y_ASSERT(RegexpsCount() == 1); + if (a & ResetAction) { + s.Reset(0); + } + if (a & IncrementAction) { + s.Increment(0); + } + } + } + + void Save(yostream* s) const; + + void Load(yistream* s); + + const void* Mmap(const void* ptr, size_t size); + + static NoGlueLimitCountingScanner Glue(const NoGlueLimitCountingScanner& a, const NoGlueLimitCountingScanner& b, size_t maxSize = 0); private: - Action RemapAction(Action action) - { - return action; - } - - template <class Iterator> - void GetActions(Action a, ActionIndex id_shift, Iterator output_resets, Iterator output_increments) const { - if (!a) { - return; - } - if (!Actions) { - if (a & ResetAction) { - *output_resets++ = id_shift; - } - if (a & NoGlueLimitCountingScanner::IncrementAction) { - *output_increments++ = id_shift; - } - return; - } - auto action = Actions + a; - for (auto output : {output_resets, output_increments}) { - for (auto count = *action++; count--;) { - *output++ = *action++ + id_shift; - } - } - } - - void AcceptActions(const TVector<ActionIndex>& actions) { - Y_ASSERT(!Actions); - Y_ASSERT(!actions.empty()); - Y_ASSERT(actions[0] == actions.size()); - - ActionsBuffer = TActionsBuffer(new ActionIndex[actions.size()]); - std::copy(actions.begin(), actions.end(), ActionsBuffer.get()); - Actions = ActionsBuffer.get(); - } - - template <size_t ActualReCount> - void AdvancedScannerTakeActionImpl(State& s, Action a) const { - if (a & ResetMask) { - ResetPerformer<ActualReCount>::Do(s, a); - } - if (a & IncrementMask) { - IncrementPerformer<ActualReCount>::Do(s, a); - } - } - - friend class Impl::ScannerGlueCommon<NoGlueLimitCountingScanner>; - friend class Impl::CountingScannerGlueTask<NoGlueLimitCountingScanner>; - friend class Impl::NoGlueLimitCountingScannerGlueTask; - friend NoGlueLimitCountingScanner Impl::MakeAdvancedCountingScanner<NoGlueLimitCountingScanner>(const Fsm&, const Fsm&, bool*); + Action RemapAction(Action action) + { + return action; + } + + template <class Iterator> + void GetActions(Action a, ActionIndex id_shift, Iterator output_resets, Iterator output_increments) const { + if (!a) { + return; + } + if (!Actions) { + if (a & ResetAction) { + *output_resets++ = id_shift; + } + if (a & NoGlueLimitCountingScanner::IncrementAction) { + *output_increments++ = id_shift; + } + return; + } + auto action = Actions + a; + for (auto output : {output_resets, output_increments}) { + for (auto count = *action++; count--;) { + *output++ = *action++ + id_shift; + } + } + } + + void AcceptActions(const TVector<ActionIndex>& actions) { + Y_ASSERT(!Actions); + Y_ASSERT(!actions.empty()); + Y_ASSERT(actions[0] == actions.size()); + + ActionsBuffer = TActionsBuffer(new ActionIndex[actions.size()]); + std::copy(actions.begin(), actions.end(), ActionsBuffer.get()); + Actions = ActionsBuffer.get(); + } + + template <size_t ActualReCount> + void AdvancedScannerTakeActionImpl(State& s, Action a) const { + if (a & ResetMask) { + ResetPerformer<ActualReCount>::Do(s, a); + } + if (a & IncrementMask) { + IncrementPerformer<ActualReCount>::Do(s, a); + } + } + + friend class Impl::ScannerGlueCommon<NoGlueLimitCountingScanner>; + friend class Impl::CountingScannerGlueTask<NoGlueLimitCountingScanner>; + friend class Impl::NoGlueLimitCountingScannerGlueTask; + friend NoGlueLimitCountingScanner Impl::MakeAdvancedCountingScanner<NoGlueLimitCountingScanner>(const Fsm&, const Fsm&, bool*); }; } diff --git a/library/cpp/regex/pire/pire/extra/glyphs.cpp b/library/cpp/regex/pire/pire/extra/glyphs.cpp index fc16e5dae78..e44872cad7c 100644 --- a/library/cpp/regex/pire/pire/extra/glyphs.cpp +++ b/library/cpp/regex/pire/pire/extra/glyphs.cpp @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -38,106 +38,106 @@ namespace Pire { namespace { - /* - * A class providing a function which returns a character - * whose glyph resembles that of given char, if any; - * otherwise returns given char itself. - */ - class GlyphTable { - private: - TList< TVector<wchar32> > m_classes; - TMap<wchar32, TVector<wchar32>*> m_map; - - struct GlyphClass { - TVector<wchar32>* m_class; - TMap<wchar32, TVector<wchar32>*> *m_map; - - GlyphClass& operator << (wchar32 c) - { - m_class->push_back(c); - m_map->insert(ymake_pair(c, m_class)); - return *this; - } - }; - - GlyphClass Class() - { - GlyphClass cl; - m_classes.push_back(TVector<wchar32>()); - cl.m_class = &m_classes.back(); - cl.m_map = &m_map; - return cl; - } - - public: - - const TVector<wchar32>& Klass(wchar32 x) const - { - TMap<wchar32, TVector<wchar32>*>::const_iterator i = m_map.find(x); - if (i != m_map.end()) - return *i->second; - else - return DefaultValue< TVector<wchar32> >(); - } - - GlyphTable() - { - Class() << 'A' << 0x0410; - Class() << 'B' << 0x0412; - Class() << 'C' << 0x0421; - Class() << 'E' << 0x0415 << 0x0401; - Class() << 'H' << 0x041D; - Class() << 'K' << 0x041A; - Class() << 'M' << 0x041C; - Class() << 'O' << 0x041E; - Class() << 'P' << 0x0420; - Class() << 'T' << 0x0422; - Class() << 'X' << 0x0425; - - Class() << 'a' << 0x0430; - Class() << 'c' << 0x0441; - Class() << 'e' << 0x0435 << 0x0451; - Class() << 'm' << 0x0442; - Class() << 'o' << 0x043E; - Class() << 'p' << 0x0440; - Class() << 'u' << 0x0438; - Class() << 'x' << 0x0445; - Class() << 'y' << 0x0443; - } - }; - - class GlueSimilarGlyphsImpl: public Feature { - public: - GlueSimilarGlyphsImpl(): m_table(Singleton<GlyphTable>()) {} - int Priority() const { return 9; } - - void Alter(Term& t) - { - if (t.Value().IsA<Term::CharacterRange>()) { - const Term::CharacterRange& range = t.Value().As<Term::CharacterRange>(); - typedef Term::CharacterRange::first_type CharSet; - const CharSet& old = range.first; - CharSet altered; - for (auto&& i : old) { - const TVector<wchar32>* klass = 0; - if (i.size() == 1 && !(klass = &m_table->Klass(i[0]))->empty()) - for (auto&& j : *klass) - altered.insert(Term::String(1, j)); - else - altered.insert(i); - } - - t = Term(t.Type(), Term::CharacterRange(altered, range.second)); - } - } - - private: - GlyphTable* m_table; - }; + /* + * A class providing a function which returns a character + * whose glyph resembles that of given char, if any; + * otherwise returns given char itself. + */ + class GlyphTable { + private: + TList< TVector<wchar32> > m_classes; + TMap<wchar32, TVector<wchar32>*> m_map; + + struct GlyphClass { + TVector<wchar32>* m_class; + TMap<wchar32, TVector<wchar32>*> *m_map; + + GlyphClass& operator << (wchar32 c) + { + m_class->push_back(c); + m_map->insert(ymake_pair(c, m_class)); + return *this; + } + }; + + GlyphClass Class() + { + GlyphClass cl; + m_classes.push_back(TVector<wchar32>()); + cl.m_class = &m_classes.back(); + cl.m_map = &m_map; + return cl; + } + + public: + + const TVector<wchar32>& Klass(wchar32 x) const + { + TMap<wchar32, TVector<wchar32>*>::const_iterator i = m_map.find(x); + if (i != m_map.end()) + return *i->second; + else + return DefaultValue< TVector<wchar32> >(); + } + + GlyphTable() + { + Class() << 'A' << 0x0410; + Class() << 'B' << 0x0412; + Class() << 'C' << 0x0421; + Class() << 'E' << 0x0415 << 0x0401; + Class() << 'H' << 0x041D; + Class() << 'K' << 0x041A; + Class() << 'M' << 0x041C; + Class() << 'O' << 0x041E; + Class() << 'P' << 0x0420; + Class() << 'T' << 0x0422; + Class() << 'X' << 0x0425; + + Class() << 'a' << 0x0430; + Class() << 'c' << 0x0441; + Class() << 'e' << 0x0435 << 0x0451; + Class() << 'm' << 0x0442; + Class() << 'o' << 0x043E; + Class() << 'p' << 0x0440; + Class() << 'u' << 0x0438; + Class() << 'x' << 0x0445; + Class() << 'y' << 0x0443; + } + }; + + class GlueSimilarGlyphsImpl: public Feature { + public: + GlueSimilarGlyphsImpl(): m_table(Singleton<GlyphTable>()) {} + int Priority() const { return 9; } + + void Alter(Term& t) + { + if (t.Value().IsA<Term::CharacterRange>()) { + const Term::CharacterRange& range = t.Value().As<Term::CharacterRange>(); + typedef Term::CharacterRange::first_type CharSet; + const CharSet& old = range.first; + CharSet altered; + for (auto&& i : old) { + const TVector<wchar32>* klass = 0; + if (i.size() == 1 && !(klass = &m_table->Klass(i[0]))->empty()) + for (auto&& j : *klass) + altered.insert(Term::String(1, j)); + else + altered.insert(i); + } + + t = Term(t.Type(), Term::CharacterRange(altered, range.second)); + } + } + + private: + GlyphTable* m_table; + }; } namespace Features { - Feature::Ptr GlueSimilarGlyphs() { return Feature::Ptr(new GlueSimilarGlyphsImpl); } + Feature::Ptr GlueSimilarGlyphs() { return Feature::Ptr(new GlueSimilarGlyphsImpl); } } } diff --git a/library/cpp/regex/pire/pire/extra/glyphs.h b/library/cpp/regex/pire/pire/extra/glyphs.h index 678b9e15c4a..57fb1ce37cd 100644 --- a/library/cpp/regex/pire/pire/extra/glyphs.h +++ b/library/cpp/regex/pire/pire/extra/glyphs.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -29,12 +29,12 @@ namespace Pire { class Feature; namespace Features { - /** - * A feature which tells Pire not to distinguish latin - * and cyrillic letters having identical shapes - * (e.g. latin A and cyrillic A). - */ - Feature::Ptr GlueSimilarGlyphs(); + /** + * A feature which tells Pire not to distinguish latin + * and cyrillic letters having identical shapes + * (e.g. latin A and cyrillic A). + */ + Feature::Ptr GlueSimilarGlyphs(); } } diff --git a/library/cpp/regex/pire/pire/fsm.cpp b/library/cpp/regex/pire/pire/fsm.cpp index 24b4a9ab086..69956438178 100644 --- a/library/cpp/regex/pire/pire/fsm.cpp +++ b/library/cpp/regex/pire/pire/fsm.cpp @@ -45,250 +45,250 @@ namespace Pire { ystring CharDump(Char c) { - char buf[8]; - if (c == '"') - return ystring("\\\""); - else if (c == '[' || c == ']' || c == '-' || c == '^') { - snprintf(buf, sizeof(buf)-1, "\\\\%c", c); - return ystring(buf); - } else if (c >= 32 && c < 127) - return ystring(1, static_cast<char>(c)); - else if (c == '\n') - return ystring("\\\\n"); - else if (c == '\t') - return ystring("\\\\t"); - else if (c == '\r') - return ystring("\\\\r"); - else if (c < 256) { - snprintf(buf, sizeof(buf)-1, "\\\\%03o", static_cast<int>(c)); - return ystring(buf); - } else if (c == Epsilon) - return ystring("<Epsilon>"); - else if (c == BeginMark) - return ystring("<Begin>"); - else if (c == EndMark) - return ystring("<End>"); - else - return ystring("<?" "?" "?>"); + char buf[8]; + if (c == '"') + return ystring("\\\""); + else if (c == '[' || c == ']' || c == '-' || c == '^') { + snprintf(buf, sizeof(buf)-1, "\\\\%c", c); + return ystring(buf); + } else if (c >= 32 && c < 127) + return ystring(1, static_cast<char>(c)); + else if (c == '\n') + return ystring("\\\\n"); + else if (c == '\t') + return ystring("\\\\t"); + else if (c == '\r') + return ystring("\\\\r"); + else if (c < 256) { + snprintf(buf, sizeof(buf)-1, "\\\\%03o", static_cast<int>(c)); + return ystring(buf); + } else if (c == Epsilon) + return ystring("<Epsilon>"); + else if (c == BeginMark) + return ystring("<Begin>"); + else if (c == EndMark) + return ystring("<End>"); + else + return ystring("<?" "?" "?>"); } void Fsm::DumpState(yostream& s, size_t state) const { - // Fill in a 'row': Q -> exp(V) (for current state) - TVector< ybitset<MaxChar> > row(Size()); - for (auto&& transition : m_transitions[state]) - for (auto&& transitionState : transition.second) { - if (transitionState >= Size()) { - std::cerr << "WTF?! Transition from " << state << " on letter " << transition.first << " leads to non-existing state " << transitionState << "\n"; - Y_ASSERT(false); - } - if (Letters().Contains(transition.first)) { - const TVector<Char>& letters = Letters().Klass(Letters().Representative(transition.first)); - for (auto&& letter : letters) - row[transitionState].set(letter); - } else - row[transitionState].set(transition.first); - } - - bool statePrinted = false; - // Display each destination state - for (auto rit = row.begin(), rie = row.end(); rit != rie; ++rit) { - unsigned begin = 0, end = 0; - - ystring delimiter; - ystring label; - if (rit->test(Epsilon)) { - label += delimiter + CharDump(Epsilon); - delimiter = " "; - } - if (rit->test(BeginMark)) { - label += delimiter + CharDump(BeginMark); - delimiter = " "; - } - if (rit->test(EndMark)) { - label += delimiter + CharDump(EndMark); - delimiter = " "; - } - unsigned count = 0; - for (unsigned i = 0; i < 256; ++i) - if (rit->test(i)) - ++count; - if (count != 0 && count != 256) { - label += delimiter + "["; - bool complementary = (count > 128); - if (count > 128) - label += "^"; - while (begin < 256) { - for (begin = end; begin < 256 && (rit->test(begin) == complementary); ++begin) - ; - for (end = begin; end < 256 && (rit->test(end) == !complementary); ++end) - ; - if (begin + 1 == end) { - label += CharDump(begin); - delimiter = " "; - } else if (begin != end) { - label += CharDump(begin) + "-" + (CharDump(end-1)); - delimiter = " "; - } - } - label += "]"; - delimiter = " "; - } else if (count == 256) { - label += delimiter + "."; - delimiter = " "; - } - if (!label.empty()) { - if (!statePrinted) { - s << " " << state << "[shape=\"" << (IsFinal(state) ? "double" : "") << "circle\",label=\"" << state; - auto ti = tags.find(state); - if (ti != tags.end()) - s << " (tags: " << ti->second << ")"; - s << "\"]\n"; - if (Initial() == state) - s << " \"initial\" -> " << state << '\n'; - statePrinted = true; - } - s << " " << state << " -> " << std::distance(row.begin(), rit) << "[label=\"" << label; - - // Display outputs - auto oit = outputs.find(state); - if (oit != outputs.end()) { - auto oit2 = oit->second.find(std::distance(row.begin(), rit)); - if (oit2 == oit->second.end()) - ; - else { - TVector<int> payload; - for (unsigned i = 0; i < sizeof(oit2->second) * 8; ++i) - if (oit2->second & (1ul << i)) - payload.push_back(i); - if (!payload.empty()) - s << " (outputs: " << Join(payload.begin(), payload.end(), ", ") << ")"; - } - } - - s << "\"]\n"; - } - } - - if (statePrinted) - s << '\n'; + // Fill in a 'row': Q -> exp(V) (for current state) + TVector< ybitset<MaxChar> > row(Size()); + for (auto&& transition : m_transitions[state]) + for (auto&& transitionState : transition.second) { + if (transitionState >= Size()) { + std::cerr << "WTF?! Transition from " << state << " on letter " << transition.first << " leads to non-existing state " << transitionState << "\n"; + Y_ASSERT(false); + } + if (Letters().Contains(transition.first)) { + const TVector<Char>& letters = Letters().Klass(Letters().Representative(transition.first)); + for (auto&& letter : letters) + row[transitionState].set(letter); + } else + row[transitionState].set(transition.first); + } + + bool statePrinted = false; + // Display each destination state + for (auto rit = row.begin(), rie = row.end(); rit != rie; ++rit) { + unsigned begin = 0, end = 0; + + ystring delimiter; + ystring label; + if (rit->test(Epsilon)) { + label += delimiter + CharDump(Epsilon); + delimiter = " "; + } + if (rit->test(BeginMark)) { + label += delimiter + CharDump(BeginMark); + delimiter = " "; + } + if (rit->test(EndMark)) { + label += delimiter + CharDump(EndMark); + delimiter = " "; + } + unsigned count = 0; + for (unsigned i = 0; i < 256; ++i) + if (rit->test(i)) + ++count; + if (count != 0 && count != 256) { + label += delimiter + "["; + bool complementary = (count > 128); + if (count > 128) + label += "^"; + while (begin < 256) { + for (begin = end; begin < 256 && (rit->test(begin) == complementary); ++begin) + ; + for (end = begin; end < 256 && (rit->test(end) == !complementary); ++end) + ; + if (begin + 1 == end) { + label += CharDump(begin); + delimiter = " "; + } else if (begin != end) { + label += CharDump(begin) + "-" + (CharDump(end-1)); + delimiter = " "; + } + } + label += "]"; + delimiter = " "; + } else if (count == 256) { + label += delimiter + "."; + delimiter = " "; + } + if (!label.empty()) { + if (!statePrinted) { + s << " " << state << "[shape=\"" << (IsFinal(state) ? "double" : "") << "circle\",label=\"" << state; + auto ti = tags.find(state); + if (ti != tags.end()) + s << " (tags: " << ti->second << ")"; + s << "\"]\n"; + if (Initial() == state) + s << " \"initial\" -> " << state << '\n'; + statePrinted = true; + } + s << " " << state << " -> " << std::distance(row.begin(), rit) << "[label=\"" << label; + + // Display outputs + auto oit = outputs.find(state); + if (oit != outputs.end()) { + auto oit2 = oit->second.find(std::distance(row.begin(), rit)); + if (oit2 == oit->second.end()) + ; + else { + TVector<int> payload; + for (unsigned i = 0; i < sizeof(oit2->second) * 8; ++i) + if (oit2->second & (1ul << i)) + payload.push_back(i); + if (!payload.empty()) + s << " (outputs: " << Join(payload.begin(), payload.end(), ", ") << ")"; + } + } + + s << "\"]\n"; + } + } + + if (statePrinted) + s << '\n'; } void Fsm::DumpTo(yostream& s, const ystring& name) const { - s << "digraph {\n \"initial\"[shape=\"plaintext\",label=\"" << name << "\"]\n\n"; - for (size_t state = 0; state < Size(); ++state) { - DumpState(s, state); - } - s << "}\n\n"; + s << "digraph {\n \"initial\"[shape=\"plaintext\",label=\"" << name << "\"]\n\n"; + for (size_t state = 0; state < Size(); ++state) { + DumpState(s, state); + } + s << "}\n\n"; } yostream& operator << (yostream& s, const Fsm& fsm) { fsm.DumpTo(s); return s; } namespace { - template<class Vector> void resizeVector(Vector& v, size_t s) { v.resize(s); } + template<class Vector> void resizeVector(Vector& v, size_t s) { v.resize(s); } } Fsm::Fsm(): - m_transitions(1), - initial(0), - letters(m_transitions), - m_sparsed(false), - determined(false), - isAlternative(false) + m_transitions(1), + initial(0), + letters(m_transitions), + m_sparsed(false), + determined(false), + isAlternative(false) { - m_final.insert(0); + m_final.insert(0); } Fsm Fsm::MakeFalse() { - Fsm f; - f.SetFinal(0, false); - return f; + Fsm f; + f.SetFinal(0, false); + return f; } Char Fsm::Translate(Char c) const { - if (!m_sparsed || c == Epsilon) - return c; - else - return Letters().Representative(c); + if (!m_sparsed || c == Epsilon) + return c; + else + return Letters().Representative(c); } bool Fsm::Connected(size_t from, size_t to, Char c) const { - auto it = m_transitions[from].find(Translate(c)); - return (it != m_transitions[from].end() && it->second.find(to) != it->second.end()); + auto it = m_transitions[from].find(Translate(c)); + return (it != m_transitions[from].end() && it->second.find(to) != it->second.end()); } bool Fsm::Connected(size_t from, size_t to) const { - for (auto i = m_transitions[from].begin(), ie = m_transitions[from].end(); i != ie; ++i) - if (i->second.find(to) != i->second.end()) - return true; - return false; + for (auto i = m_transitions[from].begin(), ie = m_transitions[from].end(); i != ie; ++i) + if (i->second.find(to) != i->second.end()) + return true; + return false; } const Fsm::StatesSet& Fsm::Destinations(size_t from, Char c) const { - auto i = m_transitions[from].find(Translate(c)); - return (i != m_transitions[from].end()) ? i->second : DefaultValue<StatesSet>(); + auto i = m_transitions[from].find(Translate(c)); + return (i != m_transitions[from].end()) ? i->second : DefaultValue<StatesSet>(); } TSet<Char> Fsm::OutgoingLetters(size_t state) const { - TSet<Char> ret; - for (auto&& i : m_transitions[state]) - ret.insert(i.first); - return ret; + TSet<Char> ret; + for (auto&& i : m_transitions[state]) + ret.insert(i.first); + return ret; } size_t Fsm::Resize(size_t newSize) { - size_t ret = Size(); - m_transitions.resize(newSize); - return ret; + size_t ret = Size(); + m_transitions.resize(newSize); + return ret; } void Fsm::Swap(Fsm& fsm) { - DoSwap(m_transitions, fsm.m_transitions); - DoSwap(initial, fsm.initial); - DoSwap(m_final, fsm.m_final); - DoSwap(letters, fsm.letters); - DoSwap(determined, fsm.determined); - DoSwap(outputs, fsm.outputs); - DoSwap(tags, fsm.tags); - DoSwap(isAlternative, fsm.isAlternative); + DoSwap(m_transitions, fsm.m_transitions); + DoSwap(initial, fsm.initial); + DoSwap(m_final, fsm.m_final); + DoSwap(letters, fsm.letters); + DoSwap(determined, fsm.determined); + DoSwap(outputs, fsm.outputs); + DoSwap(tags, fsm.tags); + DoSwap(isAlternative, fsm.isAlternative); } void Fsm::SetFinal(size_t state, bool final) { - if (final) - m_final.insert(state); - else - m_final.erase(state); + if (final) + m_final.insert(state); + else + m_final.erase(state); } Fsm& Fsm::AppendDot() { - Resize(Size() + 1); - for (size_t letter = 0; letter != (1 << (sizeof(char)*8)); ++letter) - ConnectFinal(Size() - 1, letter); - ClearFinal(); - SetFinal(Size() - 1, true); - determined = false; + Resize(Size() + 1); + for (size_t letter = 0; letter != (1 << (sizeof(char)*8)); ++letter) + ConnectFinal(Size() - 1, letter); + ClearFinal(); + SetFinal(Size() - 1, true); + determined = false; return *this; } Fsm& Fsm::Append(char c) { - Resize(Size() + 1); - ConnectFinal(Size() - 1, static_cast<unsigned char>(c)); - ClearFinal(); - SetFinal(Size() - 1, true); - determined = false; + Resize(Size() + 1); + ConnectFinal(Size() - 1, static_cast<unsigned char>(c)); + ClearFinal(); + SetFinal(Size() - 1, true); + determined = false; return *this; } @@ -301,78 +301,78 @@ Fsm& Fsm::Append(const ystring& str) Fsm& Fsm::AppendSpecial(Char c) { - Resize(Size() + 1); - ConnectFinal(Size() - 1, c); - ClearFinal(); - SetFinal(Size() - 1, true); - determined = false; + Resize(Size() + 1); + ConnectFinal(Size() - 1, c); + ClearFinal(); + SetFinal(Size() - 1, true); + determined = false; return *this; } Fsm& Fsm::AppendStrings(const TVector<ystring>& strings) { - for (auto&& i : strings) - if (i.empty()) - throw Error("None of strings passed to appendStrings() can be empty"); - - Resize(Size() + 1); - size_t end = Size() - 1; - - // A local transitions table: (oldstate, char) -> newstate. - // Valid for all letters in given strings except final ones, - // which are always connected to the end state. - - // NB: since each FSM contains at least one state, - // state #0 cannot appear in LTRs. Thus we can use this - // criteria to test whether a transition has been created or not. - typedef ypair<size_t, char> Transition; - TMap<char, size_t> startLtr; - TMap<Transition, size_t> ltr; - - // A presense of a transition in this set indicates that - // a that transition already points somewhere (either to end - // or somewhere else). Another attempt to create such transition - // will clear `determined flag. - TSet<Transition> usedTransitions; - TSet<char> usedFirsts; - - for (const auto& str : strings) { - if (str.size() > 1) { - - // First letter: all previously final states are connected to the new state - size_t& firstJump = startLtr[str[0]]; - if (!firstJump) { - firstJump = Resize(Size() + 1); - ConnectFinal(firstJump, static_cast<unsigned char>(str[0])); - determined = determined && (usedFirsts.find(str[0]) != usedFirsts.end()); - } - - // All other letters except last one - size_t state = firstJump; - for (auto cit = str.begin() + 1, cie = str.end() - 1; cit != cie; ++cit) { - size_t& newState = ltr[ymake_pair(state, *cit)]; - if (!newState) { - newState = Resize(Size() + 1); - Connect(state, newState, static_cast<unsigned char>(*cit)); - determined = determined && (usedTransitions.find(ymake_pair(state, *cit)) != usedTransitions.end()); - } - state = newState; - } - - // The last letter: connect the current state to end - unsigned char last = static_cast<unsigned char>(*(str.end() - 1)); - Connect(state, end, last); - determined = determined && (usedTransitions.find(ymake_pair(state, last)) != usedTransitions.end()); - - } else { - // The single letter: connect all the previously final states to end - ConnectFinal(end, static_cast<unsigned char>(str[0])); - determined = determined && (usedFirsts.find(str[0]) != usedFirsts.end()); - } - } - - ClearFinal(); - SetFinal(end, true); + for (auto&& i : strings) + if (i.empty()) + throw Error("None of strings passed to appendStrings() can be empty"); + + Resize(Size() + 1); + size_t end = Size() - 1; + + // A local transitions table: (oldstate, char) -> newstate. + // Valid for all letters in given strings except final ones, + // which are always connected to the end state. + + // NB: since each FSM contains at least one state, + // state #0 cannot appear in LTRs. Thus we can use this + // criteria to test whether a transition has been created or not. + typedef ypair<size_t, char> Transition; + TMap<char, size_t> startLtr; + TMap<Transition, size_t> ltr; + + // A presense of a transition in this set indicates that + // a that transition already points somewhere (either to end + // or somewhere else). Another attempt to create such transition + // will clear `determined flag. + TSet<Transition> usedTransitions; + TSet<char> usedFirsts; + + for (const auto& str : strings) { + if (str.size() > 1) { + + // First letter: all previously final states are connected to the new state + size_t& firstJump = startLtr[str[0]]; + if (!firstJump) { + firstJump = Resize(Size() + 1); + ConnectFinal(firstJump, static_cast<unsigned char>(str[0])); + determined = determined && (usedFirsts.find(str[0]) != usedFirsts.end()); + } + + // All other letters except last one + size_t state = firstJump; + for (auto cit = str.begin() + 1, cie = str.end() - 1; cit != cie; ++cit) { + size_t& newState = ltr[ymake_pair(state, *cit)]; + if (!newState) { + newState = Resize(Size() + 1); + Connect(state, newState, static_cast<unsigned char>(*cit)); + determined = determined && (usedTransitions.find(ymake_pair(state, *cit)) != usedTransitions.end()); + } + state = newState; + } + + // The last letter: connect the current state to end + unsigned char last = static_cast<unsigned char>(*(str.end() - 1)); + Connect(state, end, last); + determined = determined && (usedTransitions.find(ymake_pair(state, last)) != usedTransitions.end()); + + } else { + // The single letter: connect all the previously final states to end + ConnectFinal(end, static_cast<unsigned char>(str[0])); + determined = determined && (usedFirsts.find(str[0]) != usedFirsts.end()); + } + } + + ClearFinal(); + SetFinal(end, true); return *this; } @@ -382,384 +382,384 @@ void Fsm::Import(const Fsm& rhs) // PIRE_IFDEBUG(LOG_DEBUG("fsm") << "=== Left-hand side ===\n" << *this); // PIRE_IFDEBUG(LOG_DEBUG("fsm") << "=== Right-hand side ===\n" << rhs); - size_t oldsize = Resize(Size() + rhs.Size()); - - for (auto&& outer : m_transitions) { - for (auto&& letter : letters) { - auto targets = outer.find(letter.first); - if (targets == outer.end()) - continue; - for (auto&& character : letter.second.second) - if (character != letter.first) - outer.insert(ymake_pair(character, targets->second)); - } - } - - auto dest = m_transitions.begin() + oldsize; - for (auto outer = rhs.m_transitions.begin(), outerEnd = rhs.m_transitions.end(); outer != outerEnd; ++outer, ++dest) { - for (auto&& inner : *outer) { - TSet<size_t> targets; - std::transform(inner.second.begin(), inner.second.end(), std::inserter(targets, targets.begin()), - std::bind2nd(std::plus<size_t>(), oldsize)); - dest->insert(ymake_pair(inner.first, targets)); - } - - for (auto&& letter : rhs.letters) { - auto targets = dest->find(letter.first); - if (targets == dest->end()) - continue; - for (auto&& character : letter.second.second) - if (character != letter.first) - dest->insert(ymake_pair(character, targets->second)); - } - } - - // Import outputs - for (auto&& output : rhs.outputs) { - auto& dest = outputs[output.first + oldsize]; - for (auto&& element : output.second) - dest.insert(ymake_pair(element.first + oldsize, element.second)); - } - - // Import tags - for (auto&& tag : rhs.tags) - tags.insert(ymake_pair(tag.first + oldsize, tag.second)); - - letters = LettersTbl(LettersEquality(m_transitions)); + size_t oldsize = Resize(Size() + rhs.Size()); + + for (auto&& outer : m_transitions) { + for (auto&& letter : letters) { + auto targets = outer.find(letter.first); + if (targets == outer.end()) + continue; + for (auto&& character : letter.second.second) + if (character != letter.first) + outer.insert(ymake_pair(character, targets->second)); + } + } + + auto dest = m_transitions.begin() + oldsize; + for (auto outer = rhs.m_transitions.begin(), outerEnd = rhs.m_transitions.end(); outer != outerEnd; ++outer, ++dest) { + for (auto&& inner : *outer) { + TSet<size_t> targets; + std::transform(inner.second.begin(), inner.second.end(), std::inserter(targets, targets.begin()), + std::bind2nd(std::plus<size_t>(), oldsize)); + dest->insert(ymake_pair(inner.first, targets)); + } + + for (auto&& letter : rhs.letters) { + auto targets = dest->find(letter.first); + if (targets == dest->end()) + continue; + for (auto&& character : letter.second.second) + if (character != letter.first) + dest->insert(ymake_pair(character, targets->second)); + } + } + + // Import outputs + for (auto&& output : rhs.outputs) { + auto& dest = outputs[output.first + oldsize]; + for (auto&& element : output.second) + dest.insert(ymake_pair(element.first + oldsize, element.second)); + } + + // Import tags + for (auto&& tag : rhs.tags) + tags.insert(ymake_pair(tag.first + oldsize, tag.second)); + + letters = LettersTbl(LettersEquality(m_transitions)); } void Fsm::Connect(size_t from, size_t to, Char c /* = Epsilon */) { - m_transitions[from][c].insert(to); - ClearHints(); + m_transitions[from][c].insert(to); + ClearHints(); } void Fsm::ConnectFinal(size_t to, Char c /* = Epsilon */) { - for (auto&& final : m_final) - Connect(final, to, c); - ClearHints(); + for (auto&& final : m_final) + Connect(final, to, c); + ClearHints(); } void Fsm::Disconnect(size_t from, size_t to, Char c) { - auto i = m_transitions[from].find(c); - if (i != m_transitions[from].end()) - i->second.erase(to); - ClearHints(); + auto i = m_transitions[from].find(c); + if (i != m_transitions[from].end()) + i->second.erase(to); + ClearHints(); } void Fsm::Disconnect(size_t from, size_t to) { - for (auto&& i : m_transitions[from]) - i.second.erase(to); - ClearHints(); + for (auto&& i : m_transitions[from]) + i.second.erase(to); + ClearHints(); } unsigned long Fsm::Output(size_t from, size_t to) const { - auto i = outputs.find(from); - if (i == outputs.end()) - return 0; - auto j = i->second.find(to); - if (j == i->second.end()) - return 0; - else - return j->second; + auto i = outputs.find(from); + if (i == outputs.end()) + return 0; + auto j = i->second.find(to); + if (j == i->second.end()) + return 0; + else + return j->second; } Fsm& Fsm::operator += (const Fsm& rhs) { - size_t lhsSize = Size(); - Import(rhs); + size_t lhsSize = Size(); + Import(rhs); - const TransitionRow& row = m_transitions[lhsSize + rhs.initial]; + const TransitionRow& row = m_transitions[lhsSize + rhs.initial]; - for (auto&& outer : row) - for (auto&& inner : outer.second) - ConnectFinal(inner, outer.first); + for (auto&& outer : row) + for (auto&& inner : outer.second) + ConnectFinal(inner, outer.first); - auto out = rhs.outputs.find(rhs.initial); - if (out != rhs.outputs.end()) - for (auto&& toAndOutput : out->second) { - for (auto&& final : m_final) - outputs[final].insert(ymake_pair(toAndOutput.first + lhsSize, toAndOutput.second)); - } + auto out = rhs.outputs.find(rhs.initial); + if (out != rhs.outputs.end()) + for (auto&& toAndOutput : out->second) { + for (auto&& final : m_final) + outputs[final].insert(ymake_pair(toAndOutput.first + lhsSize, toAndOutput.second)); + } - ClearFinal(); - for (auto&& letter : rhs.m_final) - SetFinal(letter + lhsSize, true); - determined = false; + ClearFinal(); + for (auto&& letter : rhs.m_final) + SetFinal(letter + lhsSize, true); + determined = false; - ClearHints(); - PIRE_IFDEBUG(Cdbg << "=== After addition ===" << Endl << *this << Endl); + ClearHints(); + PIRE_IFDEBUG(Cdbg << "=== After addition ===" << Endl << *this << Endl); - return *this; + return *this; } Fsm& Fsm::operator |= (const Fsm& rhs) { - size_t lhsSize = Size(); - - Import(rhs); - for (auto&& final : rhs.m_final) - m_final.insert(final + lhsSize); - - if (!isAlternative && !rhs.isAlternative) { - Resize(Size() + 1); - Connect(Size() - 1, initial); - Connect(Size() - 1, lhsSize + rhs.initial); - initial = Size() - 1; - } else if (isAlternative && !rhs.isAlternative) { - Connect(initial, lhsSize + rhs.initial, Epsilon); - } else if (!isAlternative && rhs.isAlternative) { - Connect(lhsSize + rhs.initial, initial, Epsilon); - initial = rhs.initial + lhsSize; - } else if (isAlternative && rhs.isAlternative) { - const StatesSet& tos = rhs.Destinations(rhs.initial, Epsilon); - for (auto&& to : tos) { - Connect(initial, to + lhsSize, Epsilon); - Disconnect(rhs.initial + lhsSize, to + lhsSize, Epsilon); - } - } - - determined = false; - isAlternative = true; - return *this; + size_t lhsSize = Size(); + + Import(rhs); + for (auto&& final : rhs.m_final) + m_final.insert(final + lhsSize); + + if (!isAlternative && !rhs.isAlternative) { + Resize(Size() + 1); + Connect(Size() - 1, initial); + Connect(Size() - 1, lhsSize + rhs.initial); + initial = Size() - 1; + } else if (isAlternative && !rhs.isAlternative) { + Connect(initial, lhsSize + rhs.initial, Epsilon); + } else if (!isAlternative && rhs.isAlternative) { + Connect(lhsSize + rhs.initial, initial, Epsilon); + initial = rhs.initial + lhsSize; + } else if (isAlternative && rhs.isAlternative) { + const StatesSet& tos = rhs.Destinations(rhs.initial, Epsilon); + for (auto&& to : tos) { + Connect(initial, to + lhsSize, Epsilon); + Disconnect(rhs.initial + lhsSize, to + lhsSize, Epsilon); + } + } + + determined = false; + isAlternative = true; + return *this; } Fsm& Fsm::operator &= (const Fsm& rhs) { - Fsm rhs2(rhs); - Complement(); - rhs2.Complement(); - *this |= rhs2; - Complement(); - return *this; + Fsm rhs2(rhs); + Complement(); + rhs2.Complement(); + *this |= rhs2; + Complement(); + return *this; } Fsm& Fsm::Iterate() { - PIRE_IFDEBUG(Cdbg << "Iterating:" << Endl << *this << Endl); - Resize(Size() + 2); + PIRE_IFDEBUG(Cdbg << "Iterating:" << Endl << *this << Endl); + Resize(Size() + 2); - Connect(Size() - 2, Size() - 1); - Connect(Size() - 2, initial); - ConnectFinal(initial); - ConnectFinal(Size() - 1); + Connect(Size() - 2, Size() - 1); + Connect(Size() - 2, initial); + ConnectFinal(initial); + ConnectFinal(Size() - 1); - ClearFinal(); - SetFinal(Size() - 1, true); - initial = Size() - 2; + ClearFinal(); + SetFinal(Size() - 1, true); + initial = Size() - 2; - determined = false; + determined = false; - PIRE_IFDEBUG(Cdbg << "Iterated:" << Endl << *this << Endl); - return *this; + PIRE_IFDEBUG(Cdbg << "Iterated:" << Endl << *this << Endl); + return *this; } Fsm& Fsm::Complement() { - if (!Determine()) - throw Error("Regexp pattern too complicated"); - Minimize(); - Resize(Size() + 1); - for (size_t i = 0; i < Size(); ++i) - if (!IsFinal(i)) - Connect(i, Size() - 1); - ClearFinal(); - SetFinal(Size() - 1, true); - determined = false; - - return *this; + if (!Determine()) + throw Error("Regexp pattern too complicated"); + Minimize(); + Resize(Size() + 1); + for (size_t i = 0; i < Size(); ++i) + if (!IsFinal(i)) + Connect(i, Size() - 1); + ClearFinal(); + SetFinal(Size() - 1, true); + determined = false; + + return *this; } Fsm Fsm::operator *(size_t count) const { - Fsm ret; - while (count--) - ret += *this; - return ret; + Fsm ret; + while (count--) + ret += *this; + return ret; } void Fsm::MakePrefix() { - RemoveDeadEnds(); - for (size_t i = 0; i < Size(); ++i) - if (!m_transitions[i].empty()) - m_final.insert(i); - ClearHints(); + RemoveDeadEnds(); + for (size_t i = 0; i < Size(); ++i) + if (!m_transitions[i].empty()) + m_final.insert(i); + ClearHints(); } void Fsm::MakeSuffix() { - for (size_t i = 0; i < Size(); ++i) - if (i != initial) - Connect(initial, i); - ClearHints(); + for (size_t i = 0; i < Size(); ++i) + if (i != initial) + Connect(initial, i); + ClearHints(); } Fsm& Fsm::Reverse() { - Fsm out; - out.Resize(Size() + 1); - out.letters = Letters(); - - // Invert transitions - for (size_t from = 0; from < Size(); ++from) - for (auto&& i : m_transitions[from]) - for (auto&& j : i.second) - out.Connect(j, from, i.first); - - // Invert initial and final states - out.m_final.clear(); - out.SetFinal(initial, true); - for (auto i : m_final) - out.Connect(Size(), i, Epsilon); - out.SetInitial(Size()); - - // Invert outputs - for (auto&& i : outputs) - for (auto&& j : i.second) - out.SetOutput(j.first, i.first, j.second); - - // Preserve tags (although thier semantics are usually heavily broken at this point) - out.tags = tags; - - // Apply - Swap(out); - return *this; + Fsm out; + out.Resize(Size() + 1); + out.letters = Letters(); + + // Invert transitions + for (size_t from = 0; from < Size(); ++from) + for (auto&& i : m_transitions[from]) + for (auto&& j : i.second) + out.Connect(j, from, i.first); + + // Invert initial and final states + out.m_final.clear(); + out.SetFinal(initial, true); + for (auto i : m_final) + out.Connect(Size(), i, Epsilon); + out.SetInitial(Size()); + + // Invert outputs + for (auto&& i : outputs) + for (auto&& j : i.second) + out.SetOutput(j.first, i.first, j.second); + + // Preserve tags (although thier semantics are usually heavily broken at this point) + out.tags = tags; + + // Apply + Swap(out); + return *this; } TSet<size_t> Fsm::DeadStates() const { - TSet<size_t> res; - - for (int invert = 0; invert <= 1; ++invert) { - Fsm digraph; - digraph.Resize(Size()); - for (TransitionTable::const_iterator j = m_transitions.begin(), je = m_transitions.end(); j != je; ++j) { - for (TransitionRow::const_iterator k = j->begin(), ke = j->end(); k != ke; ++k) { - for (StatesSet::const_iterator toSt = k->second.begin(), toSte = k->second.end(); toSt != toSte; ++toSt) { - // We only care if the states are connected or not regerdless through what letter - if (invert) { - // Build an FSM with inverted transitions - digraph.Connect(*toSt, j - m_transitions.begin(), 0); - } else { - digraph.Connect(j - m_transitions.begin(), *toSt, 0); - } - } - } - } - - TVector<bool> unchecked(Size(), true); - TVector<bool> useless(Size(), true); - TDeque<size_t> queue; - - // Put all final (or initial) states into queue, marking them useful - for (size_t i = 0; i < Size(); ++i) - if ((invert && IsFinal(i)) || (!invert && Initial() == i)) { - useless[i] = false; - queue.push_back(i); - } - - // Do the breadth-first search, marking all states - // from which already marked states are reachable - while (!queue.empty()) { - size_t to = queue.front(); - queue.pop_front(); - - // All the states that are connected to this state in the transition matrix are useful - const StatesSet& connections = (digraph.m_transitions[to])[0]; - for (auto&& fr : connections) { - // Enqueue the state for further traversal if it hasnt been already checked - if (unchecked[fr] && useless[fr]) { - useless[fr] = false; - queue.push_back(fr); - } - } - - // Now we consider this state checked - unchecked[to] = false; - } - - for (size_t i = 0; i < Size(); ++i) { - if (useless[i]) { - res.insert(i); - } - } - } - - return res; + TSet<size_t> res; + + for (int invert = 0; invert <= 1; ++invert) { + Fsm digraph; + digraph.Resize(Size()); + for (TransitionTable::const_iterator j = m_transitions.begin(), je = m_transitions.end(); j != je; ++j) { + for (TransitionRow::const_iterator k = j->begin(), ke = j->end(); k != ke; ++k) { + for (StatesSet::const_iterator toSt = k->second.begin(), toSte = k->second.end(); toSt != toSte; ++toSt) { + // We only care if the states are connected or not regerdless through what letter + if (invert) { + // Build an FSM with inverted transitions + digraph.Connect(*toSt, j - m_transitions.begin(), 0); + } else { + digraph.Connect(j - m_transitions.begin(), *toSt, 0); + } + } + } + } + + TVector<bool> unchecked(Size(), true); + TVector<bool> useless(Size(), true); + TDeque<size_t> queue; + + // Put all final (or initial) states into queue, marking them useful + for (size_t i = 0; i < Size(); ++i) + if ((invert && IsFinal(i)) || (!invert && Initial() == i)) { + useless[i] = false; + queue.push_back(i); + } + + // Do the breadth-first search, marking all states + // from which already marked states are reachable + while (!queue.empty()) { + size_t to = queue.front(); + queue.pop_front(); + + // All the states that are connected to this state in the transition matrix are useful + const StatesSet& connections = (digraph.m_transitions[to])[0]; + for (auto&& fr : connections) { + // Enqueue the state for further traversal if it hasnt been already checked + if (unchecked[fr] && useless[fr]) { + useless[fr] = false; + queue.push_back(fr); + } + } + + // Now we consider this state checked + unchecked[to] = false; + } + + for (size_t i = 0; i < Size(); ++i) { + if (useless[i]) { + res.insert(i); + } + } + } + + return res; } void Fsm::RemoveDeadEnds() { - PIRE_IFDEBUG(Cdbg << "Removing dead ends on:" << Endl << *this << Endl); - - TSet<size_t> dead = DeadStates(); - // Erase all useless states - for (auto&& i : dead) { - PIRE_IFDEBUG(Cdbg << "Removing useless state " << i << Endl); - m_transitions[i].clear(); - for (auto&& j : m_transitions) - for (auto&& k : j) - k.second.erase(i); - } - ClearHints(); - - PIRE_IFDEBUG(Cdbg << "Result:" << Endl << *this << Endl); + PIRE_IFDEBUG(Cdbg << "Removing dead ends on:" << Endl << *this << Endl); + + TSet<size_t> dead = DeadStates(); + // Erase all useless states + for (auto&& i : dead) { + PIRE_IFDEBUG(Cdbg << "Removing useless state " << i << Endl); + m_transitions[i].clear(); + for (auto&& j : m_transitions) + for (auto&& k : j) + k.second.erase(i); + } + ClearHints(); + + PIRE_IFDEBUG(Cdbg << "Result:" << Endl << *this << Endl); } // This method is one step of Epsilon-connection removal algorithm. // It merges transitions, tags, and outputs of 'to' state into 'from' state void Fsm::MergeEpsilonConnection(size_t from, size_t to) { - unsigned long frEpsOutput = 0; - bool fsEpsOutputExists = false; - - // Is there an output for 'from'->'to' transition? - if (outputs.find(from) != outputs.end() && outputs[from].find(to) != outputs[from].end()) { - frEpsOutput = outputs[from][to]; - fsEpsOutputExists = true; - } - - // Merge transitions from 'to' state into transitions from 'from' state - for (auto&& transition : m_transitions[to]) { - TSet<size_t> connStates; - std::copy(transition.second.begin(), transition.second.end(), - std::inserter(m_transitions[from][transition.first], m_transitions[from][transition.first].end())); - - // If there is an output of the 'from'->'to' connection it has to be set to all - // new connections that were merged from 'to' state - if (fsEpsOutputExists) { - // Compute the set of states that are reachable from 'to' state - std::copy(transition.second.begin(), transition.second.end(), std::inserter(connStates, connStates.end())); - - // For each of these states add an output equal to the Epsilon-connection output - for (auto&& newConnSt : connStates) { - outputs[from][newConnSt] |= frEpsOutput; - } - } - } - - // Mark 'from' state final if 'to' state is final - if (IsFinal(to)) - SetFinal(from, true); - - // Combine tags - auto ti = tags.find(to); - if (ti != tags.end()) - tags[from] |= ti->second; - - // Merge all 'to' into 'from' outputs: - // outputs[from][i] |= (outputs[from][to] | outputs[to][i]) - auto toOit = outputs.find(to); - if (toOit != outputs.end()) { - for (auto&& output : toOit->second) { - outputs[from][output.first] |= (frEpsOutput | output.second); - } - } + unsigned long frEpsOutput = 0; + bool fsEpsOutputExists = false; + + // Is there an output for 'from'->'to' transition? + if (outputs.find(from) != outputs.end() && outputs[from].find(to) != outputs[from].end()) { + frEpsOutput = outputs[from][to]; + fsEpsOutputExists = true; + } + + // Merge transitions from 'to' state into transitions from 'from' state + for (auto&& transition : m_transitions[to]) { + TSet<size_t> connStates; + std::copy(transition.second.begin(), transition.second.end(), + std::inserter(m_transitions[from][transition.first], m_transitions[from][transition.first].end())); + + // If there is an output of the 'from'->'to' connection it has to be set to all + // new connections that were merged from 'to' state + if (fsEpsOutputExists) { + // Compute the set of states that are reachable from 'to' state + std::copy(transition.second.begin(), transition.second.end(), std::inserter(connStates, connStates.end())); + + // For each of these states add an output equal to the Epsilon-connection output + for (auto&& newConnSt : connStates) { + outputs[from][newConnSt] |= frEpsOutput; + } + } + } + + // Mark 'from' state final if 'to' state is final + if (IsFinal(to)) + SetFinal(from, true); + + // Combine tags + auto ti = tags.find(to); + if (ti != tags.end()) + tags[from] |= ti->second; + + // Merge all 'to' into 'from' outputs: + // outputs[from][i] |= (outputs[from][to] | outputs[to][i]) + auto toOit = outputs.find(to); + if (toOit != outputs.end()) { + for (auto&& output : toOit->second) { + outputs[from][output.first] |= (frEpsOutput | output.second); + } + } } // Assuming the epsilon transitions is possible from 'from' to 'thru', @@ -768,467 +768,467 @@ void Fsm::MergeEpsilonConnection(size_t from, size_t to) // Updates inverse map of epsilon transitions as well. void Fsm::ShortCutEpsilon(size_t from, size_t thru, TVector< TSet<size_t> >& inveps) { - PIRE_IFDEBUG(Cdbg << "In Fsm::ShortCutEpsilon(" << from << ", " << thru << ")\n"); - const StatesSet& to = Destinations(thru, Epsilon); - Outputs::iterator outIt = outputs.find(from); - unsigned long fromThruOut = Output(from, thru); - for (auto&& toElement : to) { - PIRE_IFDEBUG(Cdbg << "Epsilon connecting " << from << " --> " << thru << " --> " << toElement << "\n"); - Connect(from, toElement, Epsilon); - inveps[toElement].insert(from); - if (outIt != outputs.end()) - outIt->second[toElement] |= (fromThruOut | Output(thru, toElement)); - } + PIRE_IFDEBUG(Cdbg << "In Fsm::ShortCutEpsilon(" << from << ", " << thru << ")\n"); + const StatesSet& to = Destinations(thru, Epsilon); + Outputs::iterator outIt = outputs.find(from); + unsigned long fromThruOut = Output(from, thru); + for (auto&& toElement : to) { + PIRE_IFDEBUG(Cdbg << "Epsilon connecting " << from << " --> " << thru << " --> " << toElement << "\n"); + Connect(from, toElement, Epsilon); + inveps[toElement].insert(from); + if (outIt != outputs.end()) + outIt->second[toElement] |= (fromThruOut | Output(thru, toElement)); + } } // Removes all Epsilon-connections by iterating though states and merging each Epsilon-connection // effects from 'to' state into 'from' state void Fsm::RemoveEpsilons() { - Unsparse(); - - // Build inverse map of epsilon transitions - TVector< TSet<size_t> > inveps(Size()); // We have to use TSet<> here since we want it sorted - for (size_t from = 0; from != Size(); ++from) { - const StatesSet& tos = Destinations(from, Epsilon); - for (auto&& to : tos) - inveps[to].insert(from); - } - - // Make a transitive closure of all epsilon transitions (Floyd-Warshall algorithm) - // (if there exists an epsilon-path between two states, epsilon-connect them directly) - for (size_t thru = 0; thru != Size(); ++thru) - for (auto&& from : inveps[thru]) - // inveps[thru] may alter during loop body, hence we cannot cache ivneps[thru].end() - if (from != thru) - ShortCutEpsilon(from, thru, inveps); - - PIRE_IFDEBUG(Cdbg << "=== After epsilons shortcut\n" << *this << Endl); - - // Iterate through all epsilon-connected state pairs, merging states together - for (size_t from = 0; from != Size(); ++from) { - const StatesSet& to = Destinations(from, Epsilon); - for (auto&& toElement : to) - if (toElement != from) - MergeEpsilonConnection(from, toElement); // it's a NOP if to == from, so don't waste time - } - - PIRE_IFDEBUG(Cdbg << "=== After epsilons merged\n" << *this << Endl); - - // Drop all epsilon transitions - for (auto&& i : m_transitions) - i.erase(Epsilon); - - Sparse(); - ClearHints(); + Unsparse(); + + // Build inverse map of epsilon transitions + TVector< TSet<size_t> > inveps(Size()); // We have to use TSet<> here since we want it sorted + for (size_t from = 0; from != Size(); ++from) { + const StatesSet& tos = Destinations(from, Epsilon); + for (auto&& to : tos) + inveps[to].insert(from); + } + + // Make a transitive closure of all epsilon transitions (Floyd-Warshall algorithm) + // (if there exists an epsilon-path between two states, epsilon-connect them directly) + for (size_t thru = 0; thru != Size(); ++thru) + for (auto&& from : inveps[thru]) + // inveps[thru] may alter during loop body, hence we cannot cache ivneps[thru].end() + if (from != thru) + ShortCutEpsilon(from, thru, inveps); + + PIRE_IFDEBUG(Cdbg << "=== After epsilons shortcut\n" << *this << Endl); + + // Iterate through all epsilon-connected state pairs, merging states together + for (size_t from = 0; from != Size(); ++from) { + const StatesSet& to = Destinations(from, Epsilon); + for (auto&& toElement : to) + if (toElement != from) + MergeEpsilonConnection(from, toElement); // it's a NOP if to == from, so don't waste time + } + + PIRE_IFDEBUG(Cdbg << "=== After epsilons merged\n" << *this << Endl); + + // Drop all epsilon transitions + for (auto&& i : m_transitions) + i.erase(Epsilon); + + Sparse(); + ClearHints(); } bool Fsm::LettersEquality::operator()(Char a, Char b) const { - for (auto&& outer : *m_tbl) { - auto ia = outer.find(a); - auto ib = outer.find(b); - if (ia == outer.end() && ib == outer.end()) - continue; - else if (ia == outer.end() || ib == outer.end() || ia->second != ib->second) { - return false; - } - } - return true; + for (auto&& outer : *m_tbl) { + auto ia = outer.find(a); + auto ib = outer.find(b); + if (ia == outer.end() && ib == outer.end()) + continue; + else if (ia == outer.end() || ib == outer.end() || ia->second != ib->second) { + return false; + } + } + return true; } void Fsm::Sparse(bool needEpsilons /* = false */) { - letters = LettersTbl(LettersEquality(m_transitions)); - for (unsigned letter = 0; letter < MaxChar; ++letter) - if (letter != Epsilon || needEpsilons) - letters.Append(letter); + letters = LettersTbl(LettersEquality(m_transitions)); + for (unsigned letter = 0; letter < MaxChar; ++letter) + if (letter != Epsilon || needEpsilons) + letters.Append(letter); - m_sparsed = true; - PIRE_IFDEBUG(Cdbg << "Letter classes = " << letters << Endl); + m_sparsed = true; + PIRE_IFDEBUG(Cdbg << "Letter classes = " << letters << Endl); } void Fsm::Unsparse() { - for (auto&& letter : letters) - for (auto&& i : m_transitions) - for (auto&& j : letter.second.second) - i[j] = i[letter.first]; - m_sparsed = false; + for (auto&& letter : letters) + for (auto&& i : m_transitions) + for (auto&& j : letter.second.second) + i[j] = i[letter.first]; + m_sparsed = false; } // Returns a set of 'terminal states', which are those of the final states, // from which a transition to themselves on any letter is possible. TSet<size_t> Fsm::TerminalStates() const { - TSet<size_t> terminals; - for (auto&& final : m_final) { - bool ok = true; - for (auto&& letter : letters) { - auto dests = m_transitions[final].find(letter.first); - ok = ok && (dests != m_transitions[final].end() && dests->second.find(final) != dests->second.end()); - } - if (ok) - terminals.insert(final); - } - return terminals; + TSet<size_t> terminals; + for (auto&& final : m_final) { + bool ok = true; + for (auto&& letter : letters) { + auto dests = m_transitions[final].find(letter.first); + ok = ok && (dests != m_transitions[final].end() && dests->second.find(final) != dests->second.end()); + } + if (ok) + terminals.insert(final); + } + return terminals; } namespace Impl { class FsmDetermineTask { public: - typedef TVector<size_t> State; - typedef Fsm::LettersTbl LettersTbl; - typedef TMap<State, size_t> InvStates; - - FsmDetermineTask(const Fsm& fsm) - : mFsm(fsm) - , mTerminals(fsm.TerminalStates()) - { - PIRE_IFDEBUG(Cdbg << "Terminal states: [" << Join(mTerminals.begin(), mTerminals.end(), ", ") << "]" << Endl); - } - const LettersTbl& Letters() const { return mFsm.letters; } - - State Initial() const { return State(1, mFsm.initial); } - bool IsRequired(const State& state) const - { - for (auto&& i : state) - if (mTerminals.find(i) != mTerminals.end()) - return false; - return true; - } - - State Next(const State& state, Char letter) const - { - State next; - next.reserve(20); - for (auto&& from : state) { - const auto& part = mFsm.Destinations(from, letter); - std::copy(part.begin(), part.end(), std::back_inserter(next)); - } - - std::sort(next.begin(), next.end()); - next.erase(std::unique(next.begin(), next.end()), next.end()); - PIRE_IFDEBUG(Cdbg << "Returning transition [" << Join(state.begin(), state.end(), ", ") << "] --" << letter - << "--> [" << Join(next.begin(), next.end(), ", ") << "]" << Endl); - return next; - } - - void AcceptStates(const TVector<State>& states) - { - mNewFsm.Resize(states.size()); - mNewFsm.initial = 0; - mNewFsm.determined = true; - mNewFsm.letters = Letters(); - mNewFsm.m_final.clear(); - for (size_t ns = 0; ns < states.size(); ++ns) { - PIRE_IFDEBUG(Cdbg << "State " << ns << " = [" << Join(states[ns].begin(), states[ns].end(), ", ") << "]" << Endl); - for (auto&& j : states[ns]) { - - // If it was a terminal state, connect it to itself - if (mTerminals.find(j) != mTerminals.end()) { - for (auto&& letter : Letters()) - mNewFsm.Connect(ns, ns, letter.first); - mNewTerminals.insert(ns); - PIRE_IFDEBUG(Cdbg << "State " << ns << " becomes terminal because of old state " << j << Endl); - } - } - for (auto&& j : states[ns]) { - // If any state containing in our one is marked final, mark the new state final as well - if (mFsm.IsFinal(j)) { - PIRE_IFDEBUG(Cdbg << "State " << ns << " becomes final because of old state " << j << Endl); - mNewFsm.SetFinal(ns, true); - if (mFsm.tags.empty()) - // Weve got no tags and already know that the state is final, - // hence weve done with this state and got nothing more to do. - break; - } - - // Bitwise OR all tags in states - auto ti = mFsm.tags.find(j); - if (ti != mFsm.tags.end()) { - PIRE_IFDEBUG(Cdbg << "State " << ns << " carries tag " << ti->second << " because of old state " << j << Endl); - mNewFsm.tags[ns] |= ti->second; - } - } - } - // For each old state, prepare a list of new state it is contained in - typedef TMap< size_t, TVector<size_t> > Old2New; - Old2New old2new; - for (size_t ns = 0; ns < states.size(); ++ns) - for (auto&& j : states[ns]) - old2new[j].push_back(ns); - - // Copy all outputs - for (auto&& i : mFsm.outputs) { - for (auto&& j : i.second) { - auto from = old2new.find(i.first); - auto to = old2new.find(j.first); - if (from != old2new.end() && to != old2new.end()) { - for (auto&& k : from->second) - for (auto&& l : to->second) - mNewFsm.outputs[k][l] |= j.second; - } - } - } - PIRE_IFDEBUG(Cdbg << "New terminals = [" << Join(mNewTerminals.begin(), mNewTerminals.end(), ",") << "]" << Endl); - } - - void Connect(size_t from, size_t to, Char letter) - { - PIRE_IFDEBUG(Cdbg << "Connecting " << from << " --" << letter << "--> " << to << Endl); - Y_ASSERT(mNewTerminals.find(from) == mNewTerminals.end()); - mNewFsm.Connect(from, to, letter); - } - typedef bool Result; - - Result Success() { - Fsm::Outputs oldOutputs; - // remove redundant outputs - oldOutputs.swap(mNewFsm.outputs); - for (size_t from = 0; from < mNewFsm.Size(); ++from) { - auto fromOutput = oldOutputs.find(from); - if (fromOutput == oldOutputs.end()) - continue; - const auto& newTransitionsRow = mNewFsm.m_transitions[from]; - for (auto&& row : newTransitionsRow) { - for (auto&& stateIt : row.second) { - auto toOutput = fromOutput->second.find(stateIt); - if (toOutput != fromOutput->second.end()) { - mNewFsm.outputs[from].insert(*toOutput); - } - } - } - } - return true; - } - - Result Failure() { return false; } - - Fsm& Output() { return mNewFsm; } + typedef TVector<size_t> State; + typedef Fsm::LettersTbl LettersTbl; + typedef TMap<State, size_t> InvStates; + + FsmDetermineTask(const Fsm& fsm) + : mFsm(fsm) + , mTerminals(fsm.TerminalStates()) + { + PIRE_IFDEBUG(Cdbg << "Terminal states: [" << Join(mTerminals.begin(), mTerminals.end(), ", ") << "]" << Endl); + } + const LettersTbl& Letters() const { return mFsm.letters; } + + State Initial() const { return State(1, mFsm.initial); } + bool IsRequired(const State& state) const + { + for (auto&& i : state) + if (mTerminals.find(i) != mTerminals.end()) + return false; + return true; + } + + State Next(const State& state, Char letter) const + { + State next; + next.reserve(20); + for (auto&& from : state) { + const auto& part = mFsm.Destinations(from, letter); + std::copy(part.begin(), part.end(), std::back_inserter(next)); + } + + std::sort(next.begin(), next.end()); + next.erase(std::unique(next.begin(), next.end()), next.end()); + PIRE_IFDEBUG(Cdbg << "Returning transition [" << Join(state.begin(), state.end(), ", ") << "] --" << letter + << "--> [" << Join(next.begin(), next.end(), ", ") << "]" << Endl); + return next; + } + + void AcceptStates(const TVector<State>& states) + { + mNewFsm.Resize(states.size()); + mNewFsm.initial = 0; + mNewFsm.determined = true; + mNewFsm.letters = Letters(); + mNewFsm.m_final.clear(); + for (size_t ns = 0; ns < states.size(); ++ns) { + PIRE_IFDEBUG(Cdbg << "State " << ns << " = [" << Join(states[ns].begin(), states[ns].end(), ", ") << "]" << Endl); + for (auto&& j : states[ns]) { + + // If it was a terminal state, connect it to itself + if (mTerminals.find(j) != mTerminals.end()) { + for (auto&& letter : Letters()) + mNewFsm.Connect(ns, ns, letter.first); + mNewTerminals.insert(ns); + PIRE_IFDEBUG(Cdbg << "State " << ns << " becomes terminal because of old state " << j << Endl); + } + } + for (auto&& j : states[ns]) { + // If any state containing in our one is marked final, mark the new state final as well + if (mFsm.IsFinal(j)) { + PIRE_IFDEBUG(Cdbg << "State " << ns << " becomes final because of old state " << j << Endl); + mNewFsm.SetFinal(ns, true); + if (mFsm.tags.empty()) + // Weve got no tags and already know that the state is final, + // hence weve done with this state and got nothing more to do. + break; + } + + // Bitwise OR all tags in states + auto ti = mFsm.tags.find(j); + if (ti != mFsm.tags.end()) { + PIRE_IFDEBUG(Cdbg << "State " << ns << " carries tag " << ti->second << " because of old state " << j << Endl); + mNewFsm.tags[ns] |= ti->second; + } + } + } + // For each old state, prepare a list of new state it is contained in + typedef TMap< size_t, TVector<size_t> > Old2New; + Old2New old2new; + for (size_t ns = 0; ns < states.size(); ++ns) + for (auto&& j : states[ns]) + old2new[j].push_back(ns); + + // Copy all outputs + for (auto&& i : mFsm.outputs) { + for (auto&& j : i.second) { + auto from = old2new.find(i.first); + auto to = old2new.find(j.first); + if (from != old2new.end() && to != old2new.end()) { + for (auto&& k : from->second) + for (auto&& l : to->second) + mNewFsm.outputs[k][l] |= j.second; + } + } + } + PIRE_IFDEBUG(Cdbg << "New terminals = [" << Join(mNewTerminals.begin(), mNewTerminals.end(), ",") << "]" << Endl); + } + + void Connect(size_t from, size_t to, Char letter) + { + PIRE_IFDEBUG(Cdbg << "Connecting " << from << " --" << letter << "--> " << to << Endl); + Y_ASSERT(mNewTerminals.find(from) == mNewTerminals.end()); + mNewFsm.Connect(from, to, letter); + } + typedef bool Result; + + Result Success() { + Fsm::Outputs oldOutputs; + // remove redundant outputs + oldOutputs.swap(mNewFsm.outputs); + for (size_t from = 0; from < mNewFsm.Size(); ++from) { + auto fromOutput = oldOutputs.find(from); + if (fromOutput == oldOutputs.end()) + continue; + const auto& newTransitionsRow = mNewFsm.m_transitions[from]; + for (auto&& row : newTransitionsRow) { + for (auto&& stateIt : row.second) { + auto toOutput = fromOutput->second.find(stateIt); + if (toOutput != fromOutput->second.end()) { + mNewFsm.outputs[from].insert(*toOutput); + } + } + } + } + return true; + } + + Result Failure() { return false; } + + Fsm& Output() { return mNewFsm; } private: - const Fsm& mFsm; - Fsm mNewFsm; - TSet<size_t> mTerminals; - TSet<size_t> mNewTerminals; + const Fsm& mFsm; + Fsm mNewFsm; + TSet<size_t> mTerminals; + TSet<size_t> mNewTerminals; }; } bool Fsm::Determine(size_t maxsize /* = 0 */) { - static const unsigned MaxSize = 200000; - if (determined) - return true; - - PIRE_IFDEBUG(Cdbg << "=== Initial ===" << Endl << *this << Endl); - - RemoveEpsilons(); - PIRE_IFDEBUG(Cdbg << "=== After all epsilons removed" << Endl << *this << Endl); - - Impl::FsmDetermineTask task(*this); - if (Pire::Impl::Determine(task, maxsize ? maxsize : MaxSize)) { - task.Output().Swap(*this); - PIRE_IFDEBUG(Cdbg << "=== Determined ===" << Endl << *this << Endl); - return true; - } else - return false; + static const unsigned MaxSize = 200000; + if (determined) + return true; + + PIRE_IFDEBUG(Cdbg << "=== Initial ===" << Endl << *this << Endl); + + RemoveEpsilons(); + PIRE_IFDEBUG(Cdbg << "=== After all epsilons removed" << Endl << *this << Endl); + + Impl::FsmDetermineTask task(*this); + if (Pire::Impl::Determine(task, maxsize ? maxsize : MaxSize)) { + task.Output().Swap(*this); + PIRE_IFDEBUG(Cdbg << "=== Determined ===" << Endl << *this << Endl); + return true; + } else + return false; } namespace Impl { class FsmMinimizeTask { public: - explicit FsmMinimizeTask(const Fsm& fsm) - : mFsm(fsm) - , reversedTransitions(fsm.Size()) - , StateClass(fsm.Size()) - , Classes(0) - { - Y_ASSERT(mFsm.IsDetermined()); - - TMap<bool, size_t> FinalStateClassMap; - - for (size_t state = 0; state < mFsm.Size(); ++state) { - reversedTransitions[state].resize(mFsm.Letters().Size()); - if (FinalStateClassMap.find(mFsm.IsFinal(state)) == FinalStateClassMap.end()) { - FinalStateClassMap[mFsm.IsFinal(state)] = Classes++; - } - StateClass[state] = FinalStateClassMap[mFsm.IsFinal(state)]; - } - - for (size_t state = 0; state < mFsm.Size(); ++state) { - TSet<ypair<Char, size_t>> usedTransitions; - for (const auto& transition : mFsm.m_transitions[state]) { - Y_ASSERT(transition.second.size() == 1); - auto destination = *transition.second.begin(); - auto letter = mFsm.Letters().Index(transition.first); - if (usedTransitions.find(ymake_pair(letter, destination)) == usedTransitions.end()) { - usedTransitions.insert(ymake_pair(letter, destination)); - reversedTransitions[destination][letter].push_back(state); - } - } - } - } - - TVector<size_t>& GetStateClass() { return StateClass; } - - size_t& GetClassesNumber() { return Classes; } - - size_t LettersCount() const { - return mFsm.Letters().Size(); - } - - bool IsDetermined() const { - return mFsm.IsDetermined(); - } - - size_t Size() const { - return mFsm.Size(); - } - - const TVector<size_t>& Previous(size_t state, size_t letter) const { - return reversedTransitions[state][letter]; - } - - void AcceptStates() { - mNewFsm.Resize(Classes); - mNewFsm.letters = mFsm.letters; - mNewFsm.determined = mFsm.determined; - mNewFsm.m_sparsed = mFsm.m_sparsed; - mNewFsm.SetFinal(0, false); - - // Unite equality classes into new states - size_t fromIdx = 0; - for (auto from = mFsm.m_transitions.begin(), fromEnd = mFsm.m_transitions.end(); from != fromEnd; ++from, ++fromIdx) { - size_t dest = StateClass[fromIdx]; - PIRE_IFDEBUG(Cdbg << "[min] State " << fromIdx << " becomes state " << dest << Endl); - for (auto&& letter : *from) { - Y_ASSERT(letter.second.size() == 1 || !"FSM::minimize(): FSM not deterministic"); - mNewFsm.Connect(dest, StateClass[*letter.second.begin()], letter.first); - } - if (mFsm.IsFinal(fromIdx)) { - mNewFsm.SetFinal(dest, true); - PIRE_IFDEBUG(Cdbg << "[min] New state " << dest << " becomes final because of old state " << fromIdx << Endl); - } - - // Append tags - auto ti = mFsm.tags.find(fromIdx); - if (ti != mFsm.tags.end()) { - mNewFsm.tags[dest] |= ti->second; - PIRE_IFDEBUG(Cdbg << "[min] New state " << dest << " carries tag " << ti->second << " because of old state " << fromIdx << Endl); - } - } - mNewFsm.initial = StateClass[mFsm.initial]; - - // Restore outputs - for (auto&& output : mFsm.outputs) - for (auto&& output2 : output.second) - mNewFsm.outputs[StateClass[output.first]].insert(ymake_pair(StateClass[output2.first], output2.second)); - } - - typedef bool Result; - - Result Success() { - return true; - } - - Result Failure() { - return false; - } - - Fsm& Output() { - return mNewFsm; - } + explicit FsmMinimizeTask(const Fsm& fsm) + : mFsm(fsm) + , reversedTransitions(fsm.Size()) + , StateClass(fsm.Size()) + , Classes(0) + { + Y_ASSERT(mFsm.IsDetermined()); + + TMap<bool, size_t> FinalStateClassMap; + + for (size_t state = 0; state < mFsm.Size(); ++state) { + reversedTransitions[state].resize(mFsm.Letters().Size()); + if (FinalStateClassMap.find(mFsm.IsFinal(state)) == FinalStateClassMap.end()) { + FinalStateClassMap[mFsm.IsFinal(state)] = Classes++; + } + StateClass[state] = FinalStateClassMap[mFsm.IsFinal(state)]; + } + + for (size_t state = 0; state < mFsm.Size(); ++state) { + TSet<ypair<Char, size_t>> usedTransitions; + for (const auto& transition : mFsm.m_transitions[state]) { + Y_ASSERT(transition.second.size() == 1); + auto destination = *transition.second.begin(); + auto letter = mFsm.Letters().Index(transition.first); + if (usedTransitions.find(ymake_pair(letter, destination)) == usedTransitions.end()) { + usedTransitions.insert(ymake_pair(letter, destination)); + reversedTransitions[destination][letter].push_back(state); + } + } + } + } + + TVector<size_t>& GetStateClass() { return StateClass; } + + size_t& GetClassesNumber() { return Classes; } + + size_t LettersCount() const { + return mFsm.Letters().Size(); + } + + bool IsDetermined() const { + return mFsm.IsDetermined(); + } + + size_t Size() const { + return mFsm.Size(); + } + + const TVector<size_t>& Previous(size_t state, size_t letter) const { + return reversedTransitions[state][letter]; + } + + void AcceptStates() { + mNewFsm.Resize(Classes); + mNewFsm.letters = mFsm.letters; + mNewFsm.determined = mFsm.determined; + mNewFsm.m_sparsed = mFsm.m_sparsed; + mNewFsm.SetFinal(0, false); + + // Unite equality classes into new states + size_t fromIdx = 0; + for (auto from = mFsm.m_transitions.begin(), fromEnd = mFsm.m_transitions.end(); from != fromEnd; ++from, ++fromIdx) { + size_t dest = StateClass[fromIdx]; + PIRE_IFDEBUG(Cdbg << "[min] State " << fromIdx << " becomes state " << dest << Endl); + for (auto&& letter : *from) { + Y_ASSERT(letter.second.size() == 1 || !"FSM::minimize(): FSM not deterministic"); + mNewFsm.Connect(dest, StateClass[*letter.second.begin()], letter.first); + } + if (mFsm.IsFinal(fromIdx)) { + mNewFsm.SetFinal(dest, true); + PIRE_IFDEBUG(Cdbg << "[min] New state " << dest << " becomes final because of old state " << fromIdx << Endl); + } + + // Append tags + auto ti = mFsm.tags.find(fromIdx); + if (ti != mFsm.tags.end()) { + mNewFsm.tags[dest] |= ti->second; + PIRE_IFDEBUG(Cdbg << "[min] New state " << dest << " carries tag " << ti->second << " because of old state " << fromIdx << Endl); + } + } + mNewFsm.initial = StateClass[mFsm.initial]; + + // Restore outputs + for (auto&& output : mFsm.outputs) + for (auto&& output2 : output.second) + mNewFsm.outputs[StateClass[output.first]].insert(ymake_pair(StateClass[output2.first], output2.second)); + } + + typedef bool Result; + + Result Success() { + return true; + } + + Result Failure() { + return false; + } + + Fsm& Output() { + return mNewFsm; + } private: - const Fsm& mFsm; - Fsm mNewFsm; - TVector<TVector<TVector<size_t>>> reversedTransitions; - TVector<size_t> StateClass; - size_t Classes; + const Fsm& mFsm; + Fsm mNewFsm; + TVector<TVector<TVector<size_t>>> reversedTransitions; + TVector<size_t> StateClass; + size_t Classes; }; } void Fsm::Minimize() { - // Minimization algorithm is only applicable to a determined FSM. - Y_ASSERT(determined); + // Minimization algorithm is only applicable to a determined FSM. + Y_ASSERT(determined); - Impl::FsmMinimizeTask task{*this}; - if (Pire::Impl::Minimize(task)) { - task.Output().Swap(*this); - } + Impl::FsmMinimizeTask task{*this}; + if (Pire::Impl::Minimize(task)) { + task.Output().Swap(*this); + } } Fsm& Fsm::Canonize(size_t maxSize /* = 0 */) { - if (!IsDetermined()) { - if (!Determine(maxSize)) - throw Error("regexp pattern too complicated"); - } - Minimize(); - return *this; + if (!IsDetermined()) { + if (!Determine(maxSize)) + throw Error("regexp pattern too complicated"); + } + Minimize(); + return *this; } void Fsm::PrependAnything() { - size_t newstate = Size(); - Resize(Size() + 1); - for (size_t letter = 0; letter < MaxChar; ++letter) - Connect(newstate, newstate, letter); + size_t newstate = Size(); + Resize(Size() + 1); + for (size_t letter = 0; letter < MaxChar; ++letter) + Connect(newstate, newstate, letter); - Connect(newstate, initial); - initial = newstate; + Connect(newstate, initial); + initial = newstate; - determined = false; + determined = false; } void Fsm::AppendAnything() { - size_t newstate = Size(); - Resize(Size() + 1); - for (size_t letter = 0; letter < MaxChar; ++letter) - Connect(newstate, newstate, letter); + size_t newstate = Size(); + Resize(Size() + 1); + for (size_t letter = 0; letter < MaxChar; ++letter) + Connect(newstate, newstate, letter); - ConnectFinal(newstate); - ClearFinal(); - SetFinal(newstate, 1); + ConnectFinal(newstate); + ClearFinal(); + SetFinal(newstate, 1); - determined = false; + determined = false; } Fsm& Fsm::Surround() { - PrependAnything(); - AppendAnything(); - return *this; + PrependAnything(); + AppendAnything(); + return *this; } void Fsm::Divert(size_t from, size_t to, size_t dest) { - if (to == dest) - return; - - // Assign the output - auto oi = outputs.find(from); - if (oi != outputs.end()) { - auto oi2 = oi->second.find(to); - if (oi2 != oi->second.end()) { - unsigned long output = oi2->second; - oi->second.erase(oi2); - oi->second.insert(ymake_pair(dest, output)); - } - } - - // Assign the transition - for (auto&& i : m_transitions[from]) { - auto di = i.second.find(to); - if (di != i.second.end()) { - i.second.erase(di); - i.second.insert(dest); - } - } - - ClearHints(); + if (to == dest) + return; + + // Assign the output + auto oi = outputs.find(from); + if (oi != outputs.end()) { + auto oi2 = oi->second.find(to); + if (oi2 != oi->second.end()) { + unsigned long output = oi2->second; + oi->second.erase(oi2); + oi->second.insert(ymake_pair(dest, output)); + } + } + + // Assign the transition + for (auto&& i : m_transitions[from]) { + auto di = i.second.find(to); + if (di != i.second.end()) { + i.second.erase(di); + i.second.insert(dest); + } + } + + ClearHints(); } diff --git a/library/cpp/regex/pire/pire/fsm.h b/library/cpp/regex/pire/pire/fsm.h index fce84616d94..64ca6dd7c19 100644 --- a/library/cpp/regex/pire/pire/fsm.h +++ b/library/cpp/regex/pire/pire/fsm.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -32,252 +32,252 @@ namespace Pire { - namespace Impl { - class FsmDetermineTask; - class FsmMinimizeTask; + namespace Impl { + class FsmDetermineTask; + class FsmMinimizeTask; class HalfFinalDetermineTask; - } - - /// A Flying Spaghetti Monster... no, just a Finite State Machine. - class Fsm { - public: - typedef ybitset<MaxChar> Charset; - - Fsm(); - void Swap(Fsm& fsm); - - static Fsm MakeFalse(); - - /// Current number of states - size_t Size() const { return m_transitions.size(); } - - Fsm& Append(char c); - Fsm& Append(const ystring& str); - Fsm& AppendSpecial(Char c); - - /// Efficiently appends a union of passed strings to FSM. - /// Used for ranges (e.g. [a-z]), character classes (e.g. \w, \d) - /// and case-insensitive comparison of multibyte characters, - /// when one string represents a lowercase variant of a character, - /// while another string represents its uppercase variant. - Fsm& AppendStrings(const TVector<ystring>& strings); - - /// Appends a part matching a single byte (any). - Fsm& AppendDot(); - - /// Appends and prepends the FSM with the iterated dot (see above). - Fsm& Surround(); // returns *this - Fsm Surrounded() const { Fsm copy(*this); copy.Surround(); return copy; } - - Fsm& operator += (const Fsm& rhs); ///< Concatenation - Fsm& operator |= (const Fsm& rhs); ///< Alternation - Fsm& operator &= (const Fsm& rhs); ///< Conjunction - Fsm& Iterate(); ///< Klene star - Fsm& Complement(); ///< Complementation - Fsm& operator *= (size_t count) { *this = *this * count; return *this; } - - Fsm operator + (const Fsm& rhs) const { Fsm a(*this); return a += rhs; } - Fsm operator | (const Fsm& rhs) const { Fsm a(*this); return a |= rhs; } - Fsm operator & (const Fsm& rhs) const { Fsm a(*this); return a &= rhs; } - Fsm operator * () const { Fsm a(*this); return a.Iterate(); } - Fsm operator ~ () const { Fsm a(*this); return a.Complement(); } - Fsm operator * (size_t count) const; - - // === Raw FSM construction === - - /// Connects two states with given transition - void Connect(size_t from, size_t to, Char c = Epsilon); - - /// Removes given character from the specified transition. - void Disconnect(size_t from, size_t to, Char c); - - /// Completely removes given transition - void Disconnect(size_t from, size_t to); - - /// Creates an FSM which matches any prefix of any word current FSM matches. - void MakePrefix(); - - /// Creates an FSM which matches any suffix of any word current FSM matches. - void MakeSuffix(); - - /// Does the one way part of Surround(). - void PrependAnything(); - void AppendAnything(); - - /// Creates an FSM which matches reversed strings matched by current FSM. - Fsm& Reverse(); - - /// Returns a set of states from which no final states are reachable - TSet<size_t> DeadStates() const; - - /// Removes all dead end paths from FSM - void RemoveDeadEnds(); - - /// Determines and minimizes the FSM if neccessary. Returns *this. - Fsm& Canonize(size_t maxSize = 0); - - template<class Scanner> - Scanner Compile(size_t distance = 0); - - void DumpState(yostream& s, size_t state) const; - void DumpTo(yostream& s, const ystring& name = "") const; - - typedef TSet<size_t> StatesSet; - typedef TMap<size_t, StatesSet> TransitionRow; - typedef TVector<TransitionRow> TransitionTable; - - struct LettersEquality { - LettersEquality(const Fsm::TransitionTable& tbl): m_tbl(&tbl) {} - bool operator()(Char a, Char b) const; - private: - const Fsm::TransitionTable* m_tbl; - }; - - typedef TSet<size_t> FinalTable; - typedef Partition<Char, LettersEquality> LettersTbl; - - - /* - * A very low level FSM building interface. - * - * It is generally unwise to call any of these functions unless you are building - * your own scanner, your own ecoding or exaclty know what you are doing. - */ - unsigned long Tag(size_t state) const { Tags::const_iterator i = tags.find(state); return (i == tags.end()) ? 0 : i->second; } - void SetTag(size_t state, unsigned long tag) { tags[state] = tag; } - - unsigned long Output(size_t from, size_t to) const; - void SetOutput(size_t from, size_t to, unsigned long output) { outputs[from][to] = output; } - void ClearOutputs() { outputs.clear(); } - - const FinalTable& Finals() const { return m_final; } - bool IsFinal(size_t state) const { return m_final.find(state) != m_final.end(); } - void SetFinal(size_t size, bool final); - void ClearFinal() { m_final.clear(); } - - /// Removes all espilon transitions from the FSM. Does not change the FSMs language. - void RemoveEpsilons(); - - /// Resize FSM to newSize states. Returns old size. - size_t Resize(size_t newSize); - - /// Imports foreign transition table - void Import(const Fsm& rhs); - - /// Connects all final state with given state - void ConnectFinal(size_t to, Char c = Epsilon); - - /// Diverts all transition between two given states to @p dest, preserving outputs - void Divert(size_t from, size_t to, size_t dest); - - /// Checks whether two states are connected using given letter. - bool Connected(size_t from, size_t to, Char c) const; - - /// Returns a set of letters on which a transition from the specified state exists - TSet<Char> OutgoingLetters(size_t state) const; - - /// Returns a set of states where a transition from the given state using the given letter is possible - const StatesSet& Destinations(size_t from, Char letter) const; - - /// Checks whether two states are connected using any letter. - bool Connected(size_t from, size_t to) const; - size_t Initial() const { return initial; } - void SetInitial(size_t init) { initial = init; } - - const LettersTbl& Letters() const { return letters; } - - /// Determines the FSM. - /// Breaks FSM invariant of having a single final state, so high-level FSM building - /// functions (i.e. Append(), operator+(), etc...) no longer can be applied to the FSM - /// until the invariants have been manually restored. - /// return value: successful? - bool Determine(size_t maxsize = 0); - bool IsDetermined() const { return determined; } - void SetIsDetermined(bool det) { determined = det; } - - /// Minimizes amount of states in the regexp. - /// Requires a determined FSM. - void Minimize(); - - - /// Builds letters equivalence classes - void Sparse(bool needEpsilons = false); - - /// Unpacks all letters equivalence classs back into transitions table - void Unsparse(); - - private: - - /// Transitions table :: Q x V -> exp(Q) - TransitionTable m_transitions; - - /// Initial state - size_t initial; - - /// Final states. - FinalTable m_final; - - LettersTbl letters; - - /// Does 'letters' make sense? - bool m_sparsed; - - /// Is the FSM already determined? - bool determined; - - /// Output - typedef TMap< size_t, TMap<size_t, unsigned long> > Outputs; - Outputs outputs; - - typedef TMap<size_t, unsigned long> Tags; - Tags tags; - - /// Heuristics hit: true iff this FSM is a union of two other FSMs - bool isAlternative; - - void ShortCutEpsilon(size_t from, size_t thru, TVector< TSet<size_t> >& inveps); ///< internal - void MergeEpsilonConnection(size_t from, size_t to); ///< internal - - TSet<size_t> TerminalStates() const; - - Char Translate(Char c) const; - - void ClearHints() { isAlternative = false; } - - friend class Impl::FsmDetermineTask; - friend class Impl::FsmMinimizeTask; + } + + /// A Flying Spaghetti Monster... no, just a Finite State Machine. + class Fsm { + public: + typedef ybitset<MaxChar> Charset; + + Fsm(); + void Swap(Fsm& fsm); + + static Fsm MakeFalse(); + + /// Current number of states + size_t Size() const { return m_transitions.size(); } + + Fsm& Append(char c); + Fsm& Append(const ystring& str); + Fsm& AppendSpecial(Char c); + + /// Efficiently appends a union of passed strings to FSM. + /// Used for ranges (e.g. [a-z]), character classes (e.g. \w, \d) + /// and case-insensitive comparison of multibyte characters, + /// when one string represents a lowercase variant of a character, + /// while another string represents its uppercase variant. + Fsm& AppendStrings(const TVector<ystring>& strings); + + /// Appends a part matching a single byte (any). + Fsm& AppendDot(); + + /// Appends and prepends the FSM with the iterated dot (see above). + Fsm& Surround(); // returns *this + Fsm Surrounded() const { Fsm copy(*this); copy.Surround(); return copy; } + + Fsm& operator += (const Fsm& rhs); ///< Concatenation + Fsm& operator |= (const Fsm& rhs); ///< Alternation + Fsm& operator &= (const Fsm& rhs); ///< Conjunction + Fsm& Iterate(); ///< Klene star + Fsm& Complement(); ///< Complementation + Fsm& operator *= (size_t count) { *this = *this * count; return *this; } + + Fsm operator + (const Fsm& rhs) const { Fsm a(*this); return a += rhs; } + Fsm operator | (const Fsm& rhs) const { Fsm a(*this); return a |= rhs; } + Fsm operator & (const Fsm& rhs) const { Fsm a(*this); return a &= rhs; } + Fsm operator * () const { Fsm a(*this); return a.Iterate(); } + Fsm operator ~ () const { Fsm a(*this); return a.Complement(); } + Fsm operator * (size_t count) const; + + // === Raw FSM construction === + + /// Connects two states with given transition + void Connect(size_t from, size_t to, Char c = Epsilon); + + /// Removes given character from the specified transition. + void Disconnect(size_t from, size_t to, Char c); + + /// Completely removes given transition + void Disconnect(size_t from, size_t to); + + /// Creates an FSM which matches any prefix of any word current FSM matches. + void MakePrefix(); + + /// Creates an FSM which matches any suffix of any word current FSM matches. + void MakeSuffix(); + + /// Does the one way part of Surround(). + void PrependAnything(); + void AppendAnything(); + + /// Creates an FSM which matches reversed strings matched by current FSM. + Fsm& Reverse(); + + /// Returns a set of states from which no final states are reachable + TSet<size_t> DeadStates() const; + + /// Removes all dead end paths from FSM + void RemoveDeadEnds(); + + /// Determines and minimizes the FSM if neccessary. Returns *this. + Fsm& Canonize(size_t maxSize = 0); + + template<class Scanner> + Scanner Compile(size_t distance = 0); + + void DumpState(yostream& s, size_t state) const; + void DumpTo(yostream& s, const ystring& name = "") const; + + typedef TSet<size_t> StatesSet; + typedef TMap<size_t, StatesSet> TransitionRow; + typedef TVector<TransitionRow> TransitionTable; + + struct LettersEquality { + LettersEquality(const Fsm::TransitionTable& tbl): m_tbl(&tbl) {} + bool operator()(Char a, Char b) const; + private: + const Fsm::TransitionTable* m_tbl; + }; + + typedef TSet<size_t> FinalTable; + typedef Partition<Char, LettersEquality> LettersTbl; + + + /* + * A very low level FSM building interface. + * + * It is generally unwise to call any of these functions unless you are building + * your own scanner, your own ecoding or exaclty know what you are doing. + */ + unsigned long Tag(size_t state) const { Tags::const_iterator i = tags.find(state); return (i == tags.end()) ? 0 : i->second; } + void SetTag(size_t state, unsigned long tag) { tags[state] = tag; } + + unsigned long Output(size_t from, size_t to) const; + void SetOutput(size_t from, size_t to, unsigned long output) { outputs[from][to] = output; } + void ClearOutputs() { outputs.clear(); } + + const FinalTable& Finals() const { return m_final; } + bool IsFinal(size_t state) const { return m_final.find(state) != m_final.end(); } + void SetFinal(size_t size, bool final); + void ClearFinal() { m_final.clear(); } + + /// Removes all espilon transitions from the FSM. Does not change the FSMs language. + void RemoveEpsilons(); + + /// Resize FSM to newSize states. Returns old size. + size_t Resize(size_t newSize); + + /// Imports foreign transition table + void Import(const Fsm& rhs); + + /// Connects all final state with given state + void ConnectFinal(size_t to, Char c = Epsilon); + + /// Diverts all transition between two given states to @p dest, preserving outputs + void Divert(size_t from, size_t to, size_t dest); + + /// Checks whether two states are connected using given letter. + bool Connected(size_t from, size_t to, Char c) const; + + /// Returns a set of letters on which a transition from the specified state exists + TSet<Char> OutgoingLetters(size_t state) const; + + /// Returns a set of states where a transition from the given state using the given letter is possible + const StatesSet& Destinations(size_t from, Char letter) const; + + /// Checks whether two states are connected using any letter. + bool Connected(size_t from, size_t to) const; + size_t Initial() const { return initial; } + void SetInitial(size_t init) { initial = init; } + + const LettersTbl& Letters() const { return letters; } + + /// Determines the FSM. + /// Breaks FSM invariant of having a single final state, so high-level FSM building + /// functions (i.e. Append(), operator+(), etc...) no longer can be applied to the FSM + /// until the invariants have been manually restored. + /// return value: successful? + bool Determine(size_t maxsize = 0); + bool IsDetermined() const { return determined; } + void SetIsDetermined(bool det) { determined = det; } + + /// Minimizes amount of states in the regexp. + /// Requires a determined FSM. + void Minimize(); + + + /// Builds letters equivalence classes + void Sparse(bool needEpsilons = false); + + /// Unpacks all letters equivalence classs back into transitions table + void Unsparse(); + + private: + + /// Transitions table :: Q x V -> exp(Q) + TransitionTable m_transitions; + + /// Initial state + size_t initial; + + /// Final states. + FinalTable m_final; + + LettersTbl letters; + + /// Does 'letters' make sense? + bool m_sparsed; + + /// Is the FSM already determined? + bool determined; + + /// Output + typedef TMap< size_t, TMap<size_t, unsigned long> > Outputs; + Outputs outputs; + + typedef TMap<size_t, unsigned long> Tags; + Tags tags; + + /// Heuristics hit: true iff this FSM is a union of two other FSMs + bool isAlternative; + + void ShortCutEpsilon(size_t from, size_t thru, TVector< TSet<size_t> >& inveps); ///< internal + void MergeEpsilonConnection(size_t from, size_t to); ///< internal + + TSet<size_t> TerminalStates() const; + + Char Translate(Char c) const; + + void ClearHints() { isAlternative = false; } + + friend class Impl::FsmDetermineTask; + friend class Impl::FsmMinimizeTask; friend class Impl::HalfFinalDetermineTask; - }; - - template<class Scanner> - void BuildScanner(const Fsm& fsm, Scanner& r) - { - TSet<size_t> dead; - if (Scanner::DeadFlag) - dead = fsm.DeadStates(); - - for (size_t state = 0; state < fsm.Size(); ++state) - r.SetTag(state, typename Scanner::Tag(fsm.Tag(state) - | (fsm.IsFinal(state) ? Scanner::FinalFlag : 0) - | ((dead.find(state) != dead.end()) ? Scanner::DeadFlag : 0))); - - for (size_t from = 0; from != fsm.Size(); ++from) - for (Fsm::LettersTbl::ConstIterator lit = fsm.Letters().Begin(), lie = fsm.Letters().End(); lit != lie; ++lit) { - const Fsm::StatesSet& tos = fsm.Destinations(from, lit->first); - for (Fsm::StatesSet::const_iterator to = tos.begin(), toEnd = tos.end(); to != toEnd; ++to) - r.SetJump(from, lit->first, *to, r.RemapAction(fsm.Output(from, *to))); - } - - r.FinishBuild(); - } - - template<class Scanner> - inline Scanner Fsm::Compile(size_t distance) - { - return Scanner(*this, distance); - } - - yostream& operator << (yostream&, const Fsm&); + }; + + template<class Scanner> + void BuildScanner(const Fsm& fsm, Scanner& r) + { + TSet<size_t> dead; + if (Scanner::DeadFlag) + dead = fsm.DeadStates(); + + for (size_t state = 0; state < fsm.Size(); ++state) + r.SetTag(state, typename Scanner::Tag(fsm.Tag(state) + | (fsm.IsFinal(state) ? Scanner::FinalFlag : 0) + | ((dead.find(state) != dead.end()) ? Scanner::DeadFlag : 0))); + + for (size_t from = 0; from != fsm.Size(); ++from) + for (Fsm::LettersTbl::ConstIterator lit = fsm.Letters().Begin(), lie = fsm.Letters().End(); lit != lie; ++lit) { + const Fsm::StatesSet& tos = fsm.Destinations(from, lit->first); + for (Fsm::StatesSet::const_iterator to = tos.begin(), toEnd = tos.end(); to != toEnd; ++to) + r.SetJump(from, lit->first, *to, r.RemapAction(fsm.Output(from, *to))); + } + + r.FinishBuild(); + } + + template<class Scanner> + inline Scanner Fsm::Compile(size_t distance) + { + return Scanner(*this, distance); + } + + yostream& operator << (yostream&, const Fsm&); } #endif diff --git a/library/cpp/regex/pire/pire/glue.h b/library/cpp/regex/pire/pire/glue.h index d8d6cb00e5b..3308a58863d 100644 --- a/library/cpp/regex/pire/pire/glue.h +++ b/library/cpp/regex/pire/pire/glue.h @@ -12,7 +12,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -45,7 +45,7 @@ public: private: typename Scanner::Letter* m_lhs; typename Scanner::Letter* m_rhs; -}; +}; // This lookup table is used instead of std::map. // The key idea is to specify size which is a power of 2 in order to use >> and | instead of @@ -54,113 +54,113 @@ private: template <size_t N, class State> class GluedStateLookupTable { public: - static const size_t MaxSize = N; - typedef ypair<State, State> key_type; - typedef size_t mapped_type; - typedef ypair<key_type, mapped_type> value_type; - typedef value_type* iterator; - typedef const value_type* const_iterator; - - GluedStateLookupTable() - : mMap(new value_type[N]) - , mFilled(N, false) - {} - - ~GluedStateLookupTable() = default; - - const_iterator end() const { - return mMap.Get() + MaxSize; - } - // Note that in fact mMap is sparsed and traditional [begin,end) - // traversal is unavailable; hence no begin() method here. - // end() is only valid for comparing with find() result. - const_iterator find(const key_type& st) const { - size_t ind = Search(st); - return mFilled[ind] ? (mMap.Get() + ind) : end(); - } - - ypair<iterator, bool> insert(const value_type& v) { - size_t ind = Search(v.first); - if (!mFilled[ind]) { - mMap[ind] = v; - mFilled[ind] = true; - return ymake_pair(mMap.Get() + ind, true); - } else - return ymake_pair(mMap.Get() + ind, false); - } + static const size_t MaxSize = N; + typedef ypair<State, State> key_type; + typedef size_t mapped_type; + typedef ypair<key_type, mapped_type> value_type; + typedef value_type* iterator; + typedef const value_type* const_iterator; + + GluedStateLookupTable() + : mMap(new value_type[N]) + , mFilled(N, false) + {} + + ~GluedStateLookupTable() = default; + + const_iterator end() const { + return mMap.Get() + MaxSize; + } + // Note that in fact mMap is sparsed and traditional [begin,end) + // traversal is unavailable; hence no begin() method here. + // end() is only valid for comparing with find() result. + const_iterator find(const key_type& st) const { + size_t ind = Search(st); + return mFilled[ind] ? (mMap.Get() + ind) : end(); + } + + ypair<iterator, bool> insert(const value_type& v) { + size_t ind = Search(v.first); + if (!mFilled[ind]) { + mMap[ind] = v; + mFilled[ind] = true; + return ymake_pair(mMap.Get() + ind, true); + } else + return ymake_pair(mMap.Get() + ind, false); + } private: - size_t Search(const key_type& st) const { - size_t startInd = (Hash(st) % N); - for (size_t ind = startInd; ind != (startInd + N - 1) % N; ind = (ind + 1) % N) { - if (!mFilled[ind] || mMap[ind].first == st) { - return ind; - } - } - return (size_t)-1; - } - - static size_t Hash(const key_type& st) { - return size_t((st.first >> 2) ^ (st.second >> 4) ^ (st.second << 10)); - } - - TArrayHolder<value_type> mMap; - TVector<bool> mFilled; - - // Noncopyable - GluedStateLookupTable(const GluedStateLookupTable&); - GluedStateLookupTable& operator = (const GluedStateLookupTable&); + size_t Search(const key_type& st) const { + size_t startInd = (Hash(st) % N); + for (size_t ind = startInd; ind != (startInd + N - 1) % N; ind = (ind + 1) % N) { + if (!mFilled[ind] || mMap[ind].first == st) { + return ind; + } + } + return (size_t)-1; + } + + static size_t Hash(const key_type& st) { + return size_t((st.first >> 2) ^ (st.second >> 4) ^ (st.second << 10)); + } + + TArrayHolder<value_type> mMap; + TVector<bool> mFilled; + + // Noncopyable + GluedStateLookupTable(const GluedStateLookupTable&); + GluedStateLookupTable& operator = (const GluedStateLookupTable&); }; template<class Scanner> class ScannerGlueCommon { public: - typedef Partition< Char, Impl::LettersEquality<Scanner> > LettersTbl; + typedef Partition< Char, Impl::LettersEquality<Scanner> > LettersTbl; - typedef ypair<typename Scanner::InternalState, typename Scanner::InternalState> State; - ScannerGlueCommon(const Scanner& lhs, const Scanner& rhs, const LettersTbl& letters) - : m_lhs(lhs) - , m_rhs(rhs) - , m_letters(letters) - { - // Form a new letters partition - for (unsigned ch = 0; ch < MaxChar; ++ch) - if (ch != Epsilon) - m_letters.Append(ch); - } + typedef ypair<typename Scanner::InternalState, typename Scanner::InternalState> State; + ScannerGlueCommon(const Scanner& lhs, const Scanner& rhs, const LettersTbl& letters) + : m_lhs(lhs) + , m_rhs(rhs) + , m_letters(letters) + { + // Form a new letters partition + for (unsigned ch = 0; ch < MaxChar; ++ch) + if (ch != Epsilon) + m_letters.Append(ch); + } - const LettersTbl& Letters() const { return m_letters; } + const LettersTbl& Letters() const { return m_letters; } - const Scanner& Lhs() const { return m_lhs; } - const Scanner& Rhs() const { return m_rhs; } + const Scanner& Lhs() const { return m_lhs; } + const Scanner& Rhs() const { return m_rhs; } - State Initial() const { return State(Lhs().m.initial, Rhs().m.initial); } + State Initial() const { return State(Lhs().m.initial, Rhs().m.initial); } - State Next(State state, Char letter) const - { - Lhs().Next(state.first, letter); - Rhs().Next(state.second, letter); - return state; - } + State Next(State state, Char letter) const + { + Lhs().Next(state.first, letter); + Rhs().Next(state.second, letter); + return state; + } - bool IsRequired(const State& /*state*/) const { return true; } + bool IsRequired(const State& /*state*/) const { return true; } - typedef Scanner Result; - const Scanner& Success() const { return *m_result; } - Scanner Failure() const { return Scanner(); } + typedef Scanner Result; + const Scanner& Success() const { return *m_result; } + Scanner Failure() const { return Scanner(); } protected: - Scanner& Sc() { return *m_result; } - void SetSc(THolder<Scanner>&& sc) { m_result = std::move(sc); } + Scanner& Sc() { return *m_result; } + void SetSc(THolder<Scanner>&& sc) { m_result = std::move(sc); } private: - const Scanner& m_lhs; - const Scanner& m_rhs; - LettersTbl m_letters; - THolder<Scanner> m_result; + const Scanner& m_lhs; + const Scanner& m_rhs; + LettersTbl m_letters; + THolder<Scanner> m_result; }; -} +} } #endif diff --git a/library/cpp/regex/pire/pire/half_final_fsm.cpp b/library/cpp/regex/pire/pire/half_final_fsm.cpp index e45d03b9e2c..9ce22eda0e1 100644 --- a/library/cpp/regex/pire/pire/half_final_fsm.cpp +++ b/library/cpp/regex/pire/pire/half_final_fsm.cpp @@ -3,335 +3,335 @@ #include "half_final_fsm.h" namespace Pire { - size_t HalfFinalFsm::MaxCountDepth = 10; - - void HalfFinalFsm::MakeScanner() { - fsm.Canonize(); - bool allowHalfFinals = AllowHalfFinals(); - if (!allowHalfFinals) { - MakeHalfFinal(); - return; - } - DisconnectFinals(true); - } - - bool HalfFinalFsm::AllowHalfFinals() { - fsm.Canonize(); - for (size_t state = 0; state < fsm.Size(); ++state) { - if (fsm.IsFinal(state)) { - for (const auto& let : fsm.Letters()) { - bool hasFinalTransition = fsm.Destinations(state, let.first).empty(); - for (const auto& to : fsm.Destinations(state, let.first)) { - if (fsm.IsFinal(to)) { - hasFinalTransition = true; - } - } - if (!hasFinalTransition) { - return false; - } - } - } - } - return true; - } - - void HalfFinalFsm::MakeHalfFinal() { - fsm.Unsparse(); - const auto newFinal = fsm.Size(); - fsm.Resize(newFinal + 1); - for (unsigned letter = 0; letter < MaxChar; ++letter) { - if (letter != Epsilon) { - fsm.Connect(newFinal, newFinal, letter); - } - } - for (size_t state = 0; state < fsm.Size(); ++state) { - bool hasFinalTransitions = false; - for (const auto& to : fsm.Destinations(state, EndMark)) { - if (fsm.IsFinal(to)) { - hasFinalTransitions = true; - break; - } - } - if (hasFinalTransitions) { - Fsm::StatesSet destinations = fsm.Destinations(state, EndMark); - for (const auto& to : destinations) { - fsm.Disconnect(state, to, EndMark); - } - fsm.Connect(state, newFinal, EndMark); - } - } - fsm.ClearFinal(); - fsm.SetFinal(newFinal, true); - fsm.Sparse(); - } - - void HalfFinalFsm::DisconnectFinals(bool allowIntersects) { - fsm.Unsparse(); - for (size_t state = 0; state != fsm.Size(); ++state) { - fsm.SetTag(state, 0); - if (fsm.IsFinal(state)) { - for (unsigned letter = 0; letter < MaxChar; ++letter) { - Fsm::StatesSet destinations = fsm.Destinations(state, letter); - for (const auto& to : destinations) { - fsm.Disconnect(state, to, letter); - } - } - if (!allowIntersects) { - fsm.Connect(state, fsm.Initial()); - } - } - } - if (allowIntersects) { - fsm.PrependAnything(); - } - fsm.Sparse(); - fsm.SetIsDetermined(false); - fsm.Canonize(); - } - - void HalfFinalFsm::MakeNonGreedyCounter(bool allowIntersects /* = true */, bool simplify /* = true */) { - fsm.Canonize(); - fsm.PrependAnything(); - fsm.RemoveDeadEnds(); - fsm.Canonize(); - if (!allowIntersects || simplify) { - DisconnectFinals(allowIntersects); - } - } - - void HalfFinalFsm::MakeGreedyCounter(bool simplify /* = true */) { - fsm.Canonize(); - fsm.RemoveDeadEnds(); - size_t determineFactor = MaxCountDepth; - if (simplify) { - determineFactor = 1; - } - Determine(determineFactor); - if (simplify) { - fsm.Minimize(); - } - fsm.RemoveDeadEnds(); - } - - namespace Impl { - - class HalfFinalDetermineState { - public: - HalfFinalDetermineState(const Fsm& fsm, bool initial = false, size_t lastFinalCount = 0) - : mFsm(fsm) - , ToAdd(0) - , LastFinalCount(lastFinalCount) - { - if (initial) { - FinishBuild(1); - } - } - - HalfFinalDetermineState Next(Char letter, size_t maxCount) const { - HalfFinalDetermineState next(mFsm, false, LastFinalCount); - for (const auto& state : States) { - for (const auto& nextState : mFsm.Destinations(state.State, letter)) { - next.AddState(nextState, state.Count, state.ReachedFinal); - } - } - next.FinishBuild(maxCount, States.back().Count); - if (letter == EndMark) { - next.ToAdd += next.LastFinalCount; - next.LastFinalCount = 0; - next.States.clear(); - next.AddState(mFsm.Initial(), 0, false, true); - return next; - } - return next; - } - - void CopyData(Fsm& newFsm, size_t index) const { - if (ToAdd) { - newFsm.SetFinal(index, true); - newFsm.SetTag(index, ToAdd); - } - } - - bool operator<(const HalfFinalDetermineState& otherState) const { - if (ToAdd != otherState.ToAdd) { - return ToAdd < otherState.ToAdd; - } - if (LastFinalCount != otherState.LastFinalCount) { - return LastFinalCount < otherState.LastFinalCount; - } - return States < otherState.States; - } - - struct StateHolder { - size_t State; - size_t Count; - bool ReachedFinal; - - bool operator<(const StateHolder& other) const { - if (State != other.State) { - return State < other.State; - } - if (Count != other.Count) { - return Count < other.Count; - } - return ReachedFinal < other.ReachedFinal; - } - }; - - private: - const Fsm& mFsm; - TVector<StateHolder> States; - size_t ToAdd; - size_t LastFinalCount; - - void AddState(size_t state, size_t count, bool reachedFinal, bool force = false) { - size_t newLastFinalCount = LastFinalCount; - if (mFsm.IsFinal(state) && !reachedFinal) { - ++count; - reachedFinal = true; - newLastFinalCount = count; - } - for (const auto& addedState : States) { - if (addedState.State == state) { - return; - } - } - if (States.empty() || !mFsm.IsFinal(States.back().State) || force) { - States.push_back({state, count, reachedFinal}); - LastFinalCount = newLastFinalCount; - } - } - - void FinishBuild(size_t maxCount, size_t lastCount = 0) { - if (!States.empty() && mFsm.IsFinal(States.back().State)) { - lastCount = States.back().Count; - } - AddState(mFsm.Initial(), lastCount, false, true); - LastFinalCount = std::min(LastFinalCount, maxCount); - size_t minCount = States[0].Count; - for (auto& state : States) { - if (state.Count > maxCount) { - state.Count = maxCount; - } - minCount = std::min(state.Count, minCount); - } - ToAdd = minCount; - for (auto& state : States) { - state.Count -= minCount; - } - LastFinalCount -= minCount; - } - }; - - class HalfFinalDetermineTask { - public: - typedef HalfFinalDetermineState State; - typedef Fsm::LettersTbl LettersTbl; - typedef TMap<State, size_t> InvStates; - - HalfFinalDetermineTask(const Fsm& fsm, size_t maxCount) - : mFsm(fsm) - , MaxCount(maxCount) - { - size_t oldSize = mFsm.Size(); - mFsm.Import(fsm); - mFsm.Unsparse(); - for (size_t state = 0; state < mFsm.Size(); ++state) { - for (Char letter = 0; letter < MaxChar; ++letter) { - Fsm::StatesSet destinations = mFsm.Destinations(state, letter); - for (const auto destination : destinations) { - size_t newDestination = destination % oldSize; - if (letter == EndMark) { - newDestination += oldSize; - } - if (destination != newDestination) { - mFsm.Disconnect(state, destination, letter); - mFsm.Connect(state, newDestination, letter); - } - } - } - if (mFsm.Destinations(state, EndMark).size() == 0) { - mFsm.Connect(state, oldSize + mFsm.Initial(), EndMark); - } - } - mFsm.Sparse(); - } - - const LettersTbl& Letters() const { return mFsm.Letters(); } - - State Initial() const { - return State(mFsm, true); - } - - State Next(const State& state, Char letter) const { - return state.Next(letter, MaxCount); - } - - bool IsRequired(const State& /*state*/) const { return true; } - - void AcceptStates(const TVector<State>& newStates) { - mNewFsm.Resize(newStates.size()); - mNewFsm.SetInitial(0); - mNewFsm.SetIsDetermined(true); - mNewFsm.letters = Letters(); - mNewFsm.ClearFinal(); - for (size_t i = 0; i < newStates.size(); i++) { - newStates[i].CopyData(mNewFsm, i); - } - } - - void Connect(size_t from, size_t to, Char letter) { - Y_ASSERT(mNewFsm.Destinations(from, letter).size() == 0); - mNewFsm.Connect(from, to, letter); - } - - typedef bool Result; - - Result Success() { return true; } - - Result Failure() { return false; } - - Fsm& Output() { return mNewFsm; } - - void SetMaxCount(size_t maxCount) { - MaxCount = maxCount; - } - - private: - Fsm mFsm; - size_t MaxCount; - Fsm mNewFsm; - }; - } - - void HalfFinalFsm::Determine(size_t depth) { - static const unsigned MaxSize = 200000; - - Impl::HalfFinalDetermineTask task(fsm, depth); - if (!Pire::Impl::Determine(task, MaxSize)) { - task.SetMaxCount(1); - Pire::Impl::Determine(task, MaxSize); - } - - task.Output().Swap(fsm); - } - - size_t HalfFinalFsm::GetCount(size_t state) const { - if (fsm.IsFinal(state)) { - if (fsm.Tag(state)) { - return fsm.Tag(state); - } else { - return 1; - } - } - return 0; - } - - size_t HalfFinalFsm::GetTotalCount() const { - size_t count = 0; - for (size_t state = 0; state < fsm.Size(); ++state) { - count += GetCount(state); - } - return count; - } + size_t HalfFinalFsm::MaxCountDepth = 10; + + void HalfFinalFsm::MakeScanner() { + fsm.Canonize(); + bool allowHalfFinals = AllowHalfFinals(); + if (!allowHalfFinals) { + MakeHalfFinal(); + return; + } + DisconnectFinals(true); + } + + bool HalfFinalFsm::AllowHalfFinals() { + fsm.Canonize(); + for (size_t state = 0; state < fsm.Size(); ++state) { + if (fsm.IsFinal(state)) { + for (const auto& let : fsm.Letters()) { + bool hasFinalTransition = fsm.Destinations(state, let.first).empty(); + for (const auto& to : fsm.Destinations(state, let.first)) { + if (fsm.IsFinal(to)) { + hasFinalTransition = true; + } + } + if (!hasFinalTransition) { + return false; + } + } + } + } + return true; + } + + void HalfFinalFsm::MakeHalfFinal() { + fsm.Unsparse(); + const auto newFinal = fsm.Size(); + fsm.Resize(newFinal + 1); + for (unsigned letter = 0; letter < MaxChar; ++letter) { + if (letter != Epsilon) { + fsm.Connect(newFinal, newFinal, letter); + } + } + for (size_t state = 0; state < fsm.Size(); ++state) { + bool hasFinalTransitions = false; + for (const auto& to : fsm.Destinations(state, EndMark)) { + if (fsm.IsFinal(to)) { + hasFinalTransitions = true; + break; + } + } + if (hasFinalTransitions) { + Fsm::StatesSet destinations = fsm.Destinations(state, EndMark); + for (const auto& to : destinations) { + fsm.Disconnect(state, to, EndMark); + } + fsm.Connect(state, newFinal, EndMark); + } + } + fsm.ClearFinal(); + fsm.SetFinal(newFinal, true); + fsm.Sparse(); + } + + void HalfFinalFsm::DisconnectFinals(bool allowIntersects) { + fsm.Unsparse(); + for (size_t state = 0; state != fsm.Size(); ++state) { + fsm.SetTag(state, 0); + if (fsm.IsFinal(state)) { + for (unsigned letter = 0; letter < MaxChar; ++letter) { + Fsm::StatesSet destinations = fsm.Destinations(state, letter); + for (const auto& to : destinations) { + fsm.Disconnect(state, to, letter); + } + } + if (!allowIntersects) { + fsm.Connect(state, fsm.Initial()); + } + } + } + if (allowIntersects) { + fsm.PrependAnything(); + } + fsm.Sparse(); + fsm.SetIsDetermined(false); + fsm.Canonize(); + } + + void HalfFinalFsm::MakeNonGreedyCounter(bool allowIntersects /* = true */, bool simplify /* = true */) { + fsm.Canonize(); + fsm.PrependAnything(); + fsm.RemoveDeadEnds(); + fsm.Canonize(); + if (!allowIntersects || simplify) { + DisconnectFinals(allowIntersects); + } + } + + void HalfFinalFsm::MakeGreedyCounter(bool simplify /* = true */) { + fsm.Canonize(); + fsm.RemoveDeadEnds(); + size_t determineFactor = MaxCountDepth; + if (simplify) { + determineFactor = 1; + } + Determine(determineFactor); + if (simplify) { + fsm.Minimize(); + } + fsm.RemoveDeadEnds(); + } + + namespace Impl { + + class HalfFinalDetermineState { + public: + HalfFinalDetermineState(const Fsm& fsm, bool initial = false, size_t lastFinalCount = 0) + : mFsm(fsm) + , ToAdd(0) + , LastFinalCount(lastFinalCount) + { + if (initial) { + FinishBuild(1); + } + } + + HalfFinalDetermineState Next(Char letter, size_t maxCount) const { + HalfFinalDetermineState next(mFsm, false, LastFinalCount); + for (const auto& state : States) { + for (const auto& nextState : mFsm.Destinations(state.State, letter)) { + next.AddState(nextState, state.Count, state.ReachedFinal); + } + } + next.FinishBuild(maxCount, States.back().Count); + if (letter == EndMark) { + next.ToAdd += next.LastFinalCount; + next.LastFinalCount = 0; + next.States.clear(); + next.AddState(mFsm.Initial(), 0, false, true); + return next; + } + return next; + } + + void CopyData(Fsm& newFsm, size_t index) const { + if (ToAdd) { + newFsm.SetFinal(index, true); + newFsm.SetTag(index, ToAdd); + } + } + + bool operator<(const HalfFinalDetermineState& otherState) const { + if (ToAdd != otherState.ToAdd) { + return ToAdd < otherState.ToAdd; + } + if (LastFinalCount != otherState.LastFinalCount) { + return LastFinalCount < otherState.LastFinalCount; + } + return States < otherState.States; + } + + struct StateHolder { + size_t State; + size_t Count; + bool ReachedFinal; + + bool operator<(const StateHolder& other) const { + if (State != other.State) { + return State < other.State; + } + if (Count != other.Count) { + return Count < other.Count; + } + return ReachedFinal < other.ReachedFinal; + } + }; + + private: + const Fsm& mFsm; + TVector<StateHolder> States; + size_t ToAdd; + size_t LastFinalCount; + + void AddState(size_t state, size_t count, bool reachedFinal, bool force = false) { + size_t newLastFinalCount = LastFinalCount; + if (mFsm.IsFinal(state) && !reachedFinal) { + ++count; + reachedFinal = true; + newLastFinalCount = count; + } + for (const auto& addedState : States) { + if (addedState.State == state) { + return; + } + } + if (States.empty() || !mFsm.IsFinal(States.back().State) || force) { + States.push_back({state, count, reachedFinal}); + LastFinalCount = newLastFinalCount; + } + } + + void FinishBuild(size_t maxCount, size_t lastCount = 0) { + if (!States.empty() && mFsm.IsFinal(States.back().State)) { + lastCount = States.back().Count; + } + AddState(mFsm.Initial(), lastCount, false, true); + LastFinalCount = std::min(LastFinalCount, maxCount); + size_t minCount = States[0].Count; + for (auto& state : States) { + if (state.Count > maxCount) { + state.Count = maxCount; + } + minCount = std::min(state.Count, minCount); + } + ToAdd = minCount; + for (auto& state : States) { + state.Count -= minCount; + } + LastFinalCount -= minCount; + } + }; + + class HalfFinalDetermineTask { + public: + typedef HalfFinalDetermineState State; + typedef Fsm::LettersTbl LettersTbl; + typedef TMap<State, size_t> InvStates; + + HalfFinalDetermineTask(const Fsm& fsm, size_t maxCount) + : mFsm(fsm) + , MaxCount(maxCount) + { + size_t oldSize = mFsm.Size(); + mFsm.Import(fsm); + mFsm.Unsparse(); + for (size_t state = 0; state < mFsm.Size(); ++state) { + for (Char letter = 0; letter < MaxChar; ++letter) { + Fsm::StatesSet destinations = mFsm.Destinations(state, letter); + for (const auto destination : destinations) { + size_t newDestination = destination % oldSize; + if (letter == EndMark) { + newDestination += oldSize; + } + if (destination != newDestination) { + mFsm.Disconnect(state, destination, letter); + mFsm.Connect(state, newDestination, letter); + } + } + } + if (mFsm.Destinations(state, EndMark).size() == 0) { + mFsm.Connect(state, oldSize + mFsm.Initial(), EndMark); + } + } + mFsm.Sparse(); + } + + const LettersTbl& Letters() const { return mFsm.Letters(); } + + State Initial() const { + return State(mFsm, true); + } + + State Next(const State& state, Char letter) const { + return state.Next(letter, MaxCount); + } + + bool IsRequired(const State& /*state*/) const { return true; } + + void AcceptStates(const TVector<State>& newStates) { + mNewFsm.Resize(newStates.size()); + mNewFsm.SetInitial(0); + mNewFsm.SetIsDetermined(true); + mNewFsm.letters = Letters(); + mNewFsm.ClearFinal(); + for (size_t i = 0; i < newStates.size(); i++) { + newStates[i].CopyData(mNewFsm, i); + } + } + + void Connect(size_t from, size_t to, Char letter) { + Y_ASSERT(mNewFsm.Destinations(from, letter).size() == 0); + mNewFsm.Connect(from, to, letter); + } + + typedef bool Result; + + Result Success() { return true; } + + Result Failure() { return false; } + + Fsm& Output() { return mNewFsm; } + + void SetMaxCount(size_t maxCount) { + MaxCount = maxCount; + } + + private: + Fsm mFsm; + size_t MaxCount; + Fsm mNewFsm; + }; + } + + void HalfFinalFsm::Determine(size_t depth) { + static const unsigned MaxSize = 200000; + + Impl::HalfFinalDetermineTask task(fsm, depth); + if (!Pire::Impl::Determine(task, MaxSize)) { + task.SetMaxCount(1); + Pire::Impl::Determine(task, MaxSize); + } + + task.Output().Swap(fsm); + } + + size_t HalfFinalFsm::GetCount(size_t state) const { + if (fsm.IsFinal(state)) { + if (fsm.Tag(state)) { + return fsm.Tag(state); + } else { + return 1; + } + } + return 0; + } + + size_t HalfFinalFsm::GetTotalCount() const { + size_t count = 0; + for (size_t state = 0; state < fsm.Size(); ++state) { + count += GetCount(state); + } + return count; + } } diff --git a/library/cpp/regex/pire/pire/half_final_fsm.h b/library/cpp/regex/pire/pire/half_final_fsm.h index 83828f8bb37..1742d999895 100644 --- a/library/cpp/regex/pire/pire/half_final_fsm.h +++ b/library/cpp/regex/pire/pire/half_final_fsm.h @@ -2,47 +2,47 @@ #include "defs.h" namespace Pire { - class HalfFinalFsm { - public: - HalfFinalFsm(const Fsm& sourceFsm) : fsm(sourceFsm) {} + class HalfFinalFsm { + public: + HalfFinalFsm(const Fsm& sourceFsm) : fsm(sourceFsm) {} - void MakeScanner(); + void MakeScanner(); - /// Non greedy counter without allowed intersects works correctly on all regexps - /// Non simplified non greedy counter with allowed intersects counts number of positions in string, - /// on which ends at least one substring that matches regexp - /// Simplified non greedy counter with allowed intersects does not always work correctly, - /// but has fewer number of states and more regexps can be glued into single scanner - void MakeNonGreedyCounter(bool allowIntersects = true, bool simplify = true); + /// Non greedy counter without allowed intersects works correctly on all regexps + /// Non simplified non greedy counter with allowed intersects counts number of positions in string, + /// on which ends at least one substring that matches regexp + /// Simplified non greedy counter with allowed intersects does not always work correctly, + /// but has fewer number of states and more regexps can be glued into single scanner + void MakeNonGreedyCounter(bool allowIntersects = true, bool simplify = true); - // Simplified counter does not work correctly on all regexps, but has less number of states - // and allows to glue larger number of scanners into one within the same size limit - void MakeGreedyCounter(bool simplify = true); + // Simplified counter does not work correctly on all regexps, but has less number of states + // and allows to glue larger number of scanners into one within the same size limit + void MakeGreedyCounter(bool simplify = true); - const Fsm& GetFsm() const { return fsm; } + const Fsm& GetFsm() const { return fsm; } - template<class Scanner> - Scanner Compile() const; + template<class Scanner> + Scanner Compile() const; - size_t GetCount(size_t state) const; + size_t GetCount(size_t state) const; - size_t GetTotalCount() const; + size_t GetTotalCount() const; - static size_t MaxCountDepth; - private: - Fsm fsm; + static size_t MaxCountDepth; + private: + Fsm fsm; - bool AllowHalfFinals(); + bool AllowHalfFinals(); - void MakeHalfFinal(); + void MakeHalfFinal(); - void DisconnectFinals(bool allowIntersects); + void DisconnectFinals(bool allowIntersects); - void Determine(size_t depth = MaxCountDepth); - }; + void Determine(size_t depth = MaxCountDepth); + }; - template<class Scanner> - Scanner HalfFinalFsm::Compile() const { - auto scanner = Scanner(*this); - } + template<class Scanner> + Scanner HalfFinalFsm::Compile() const { + auto scanner = Scanner(*this); + } } diff --git a/library/cpp/regex/pire/pire/minimize.h b/library/cpp/regex/pire/pire/minimize.h index d58c5ce79ea..38f65985c78 100644 --- a/library/cpp/regex/pire/pire/minimize.h +++ b/library/cpp/regex/pire/pire/minimize.h @@ -5,149 +5,149 @@ #include "partition.h" namespace Pire { - namespace Impl { + namespace Impl { - /** - * An interface of a minimization task. - * You don't have to derive from this class; it is just a start point template. - */ - class MinimizeTask { - private: - struct ImplementationSpecific1; + /** + * An interface of a minimization task. + * You don't have to derive from this class; it is just a start point template. + */ + class MinimizeTask { + private: + struct ImplementationSpecific1; - public: - // States must be represented by size_t. + public: + // States must be represented by size_t. - /// States must be initially divided into some equivalence classes. - /// If states are in the same equivalence class, they may be merged without loosing state specific info. - /// Equivalence classes must have indexes from 0 to (Classes - 1). - /// The algorithm will modify equivalent classes and in the end - /// all states in the same equivalent class can be merged into one state - TVector<size_t>& GetStateClass() { return StateClass; } - - /// Returns number of equivalent classes - size_t& GetClassesNumber() { return Classes; } - - /// Should return number of letter classes - size_t LettersCount() const; - - /// Should return true if FSM is determined. - bool IsDetermined() const; - - /// Should return number of states. - size_t Size() const; - - /// Should calculate vector of previous states by, given the current state and incoming letter class index. - const TVector<size_t>& Previous(size_t state, size_t letter) const; - - /// Called when states equivalent classes are formed, and written in StateClass. - void AcceptStates(); - - typedef bool Result; - - Result Success() { return true; } - - Result Failure() { return false; } - - private: - TVector<size_t> StateClass; - - size_t Classes; - }; - - // Minimizes Determined FSM using Hopcroft algorithm, works in O(Size * log(Size) * MaxChar) time, - // requires O(Size * MaxChar * sizof(size_t)) memory. - template<class Task> - typename Task::Result Minimize(Task& task) - { - // Minimization algorithm is only applicable to a determined FSM. - if (!task.IsDetermined()) { - return task.Failure(); - } - - typedef ypair<size_t, size_t> ClassLetter; - - TVector<ybitset<MaxChar>> queuedClasses(task.Size()); - - TDeque<ClassLetter> classesToProcess; - - TVector<TVector<size_t>> classStates(task.Size()); - - TVector<size_t>& stateClass = task.GetStateClass(); - - for (size_t state = 0; state < task.Size(); ++state) { - classStates[stateClass[state]].push_back(state); - } - - for (size_t classIndex = 0; classIndex < task.GetClassesNumber(); ++classIndex) { - for (size_t letter = 0; letter < task.LettersCount(); ++letter) { - classesToProcess.push_back(ymake_pair(classIndex, letter)); - queuedClasses[classIndex][letter] = 1; - } - } - - TVector<size_t> classChange(task.Size()); - TVector<TVector<size_t>> removedStates(task.Size()); - - while (classesToProcess.size()) { - const auto currentClass = classesToProcess.front().first; - const auto currentLetter = classesToProcess.front().second; - classesToProcess.pop_front(); - queuedClasses[currentClass][currentLetter] = 0; - TVector<size_t> splittedClasses; - - for (const auto& classState : classStates[currentClass]) { - for (const auto& state: task.Previous(classState, currentLetter)) { - if (classChange[stateClass[state]] != task.GetClassesNumber()) { - classChange[stateClass[state]] = task.GetClassesNumber(); - splittedClasses.push_back(stateClass[state]); - } - removedStates[stateClass[state]].push_back(state); - } - } - - - for (const auto& splittedClass : splittedClasses) { - if (removedStates[splittedClass].size() == classStates[splittedClass].size()) { - classChange[splittedClass] = 0; - removedStates[splittedClass].clear(); - continue; - } - - const auto newClass = task.GetClassesNumber()++; - classChange[splittedClass] = newClass; - std::swap(classStates[newClass], removedStates[splittedClass]); - for (const auto& state : classStates[newClass]) { - stateClass[state] = newClass; - } - - auto iter = classStates[splittedClass].begin(); - for (const auto state : classStates[splittedClass]) { - if (stateClass[state] == splittedClass) { - *iter = state; - ++iter; - } - } - classStates[splittedClass].erase(iter, classStates[splittedClass].end()); - - for (size_t letter = 0; letter < task.LettersCount(); ++letter) { - if (queuedClasses[splittedClass][letter] - || classStates[splittedClass].size() > classStates[newClass].size()) { - - queuedClasses[newClass][letter] = 1; - classesToProcess.push_back(ymake_pair(newClass, letter)); - } else { - queuedClasses[splittedClass][letter] = 1; - classesToProcess.push_back(ymake_pair(splittedClass, letter)); - } - } - } - } - - task.AcceptStates(); - return task.Success(); - } - } + /// States must be initially divided into some equivalence classes. + /// If states are in the same equivalence class, they may be merged without loosing state specific info. + /// Equivalence classes must have indexes from 0 to (Classes - 1). + /// The algorithm will modify equivalent classes and in the end + /// all states in the same equivalent class can be merged into one state + TVector<size_t>& GetStateClass() { return StateClass; } + + /// Returns number of equivalent classes + size_t& GetClassesNumber() { return Classes; } + + /// Should return number of letter classes + size_t LettersCount() const; + + /// Should return true if FSM is determined. + bool IsDetermined() const; + + /// Should return number of states. + size_t Size() const; + + /// Should calculate vector of previous states by, given the current state and incoming letter class index. + const TVector<size_t>& Previous(size_t state, size_t letter) const; + + /// Called when states equivalent classes are formed, and written in StateClass. + void AcceptStates(); + + typedef bool Result; + + Result Success() { return true; } + + Result Failure() { return false; } + + private: + TVector<size_t> StateClass; + + size_t Classes; + }; + + // Minimizes Determined FSM using Hopcroft algorithm, works in O(Size * log(Size) * MaxChar) time, + // requires O(Size * MaxChar * sizof(size_t)) memory. + template<class Task> + typename Task::Result Minimize(Task& task) + { + // Minimization algorithm is only applicable to a determined FSM. + if (!task.IsDetermined()) { + return task.Failure(); + } + + typedef ypair<size_t, size_t> ClassLetter; + + TVector<ybitset<MaxChar>> queuedClasses(task.Size()); + + TDeque<ClassLetter> classesToProcess; + + TVector<TVector<size_t>> classStates(task.Size()); + + TVector<size_t>& stateClass = task.GetStateClass(); + + for (size_t state = 0; state < task.Size(); ++state) { + classStates[stateClass[state]].push_back(state); + } + + for (size_t classIndex = 0; classIndex < task.GetClassesNumber(); ++classIndex) { + for (size_t letter = 0; letter < task.LettersCount(); ++letter) { + classesToProcess.push_back(ymake_pair(classIndex, letter)); + queuedClasses[classIndex][letter] = 1; + } + } + + TVector<size_t> classChange(task.Size()); + TVector<TVector<size_t>> removedStates(task.Size()); + + while (classesToProcess.size()) { + const auto currentClass = classesToProcess.front().first; + const auto currentLetter = classesToProcess.front().second; + classesToProcess.pop_front(); + queuedClasses[currentClass][currentLetter] = 0; + TVector<size_t> splittedClasses; + + for (const auto& classState : classStates[currentClass]) { + for (const auto& state: task.Previous(classState, currentLetter)) { + if (classChange[stateClass[state]] != task.GetClassesNumber()) { + classChange[stateClass[state]] = task.GetClassesNumber(); + splittedClasses.push_back(stateClass[state]); + } + removedStates[stateClass[state]].push_back(state); + } + } + + + for (const auto& splittedClass : splittedClasses) { + if (removedStates[splittedClass].size() == classStates[splittedClass].size()) { + classChange[splittedClass] = 0; + removedStates[splittedClass].clear(); + continue; + } + + const auto newClass = task.GetClassesNumber()++; + classChange[splittedClass] = newClass; + std::swap(classStates[newClass], removedStates[splittedClass]); + for (const auto& state : classStates[newClass]) { + stateClass[state] = newClass; + } + + auto iter = classStates[splittedClass].begin(); + for (const auto state : classStates[splittedClass]) { + if (stateClass[state] == splittedClass) { + *iter = state; + ++iter; + } + } + classStates[splittedClass].erase(iter, classStates[splittedClass].end()); + + for (size_t letter = 0; letter < task.LettersCount(); ++letter) { + if (queuedClasses[splittedClass][letter] + || classStates[splittedClass].size() > classStates[newClass].size()) { + + queuedClasses[newClass][letter] = 1; + classesToProcess.push_back(ymake_pair(newClass, letter)); + } else { + queuedClasses[splittedClass][letter] = 1; + classesToProcess.push_back(ymake_pair(splittedClass, letter)); + } + } + } + } + + task.AcceptStates(); + return task.Success(); + } + } } #endif diff --git a/library/cpp/regex/pire/pire/partition.h b/library/cpp/regex/pire/pire/partition.h index ae8ae1cc8c6..b0585219989 100644 --- a/library/cpp/regex/pire/pire/partition.h +++ b/library/cpp/regex/pire/pire/partition.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -37,154 +37,154 @@ namespace Pire { template<class T, class Eq> class Partition { private: - typedef TMap< T, ypair< size_t, TVector<T> > > Set; + typedef TMap< T, ypair< size_t, TVector<T> > > Set; public: - Partition(const Eq& eq) - : m_eq(eq) - , m_maxidx(0) - { - } - - /// Appends a new item into partition, creating new equivalience class if neccessary. - void Append(const T& t) { - DoAppend(m_set, t); - } - - typedef typename Set::const_iterator ConstIterator; - - ConstIterator Begin() const { - return m_set.begin(); - } - ConstIterator begin() const { - return m_set.begin(); - } - ConstIterator End() const { - return m_set.end(); - } - ConstIterator end() const { - return m_set.end(); - } - size_t Size() const { - return m_set.size(); - } - bool Empty() const { - return m_set.empty(); - } - - /// Returns an item equal to @p t. It is guaranteed that: - /// - representative(a) equals representative(b) iff a is equivalent to b; - /// - representative(a) is equivalent to a. - const T& Representative(const T& t) const - { - auto it = m_inv.find(t); - if (it != m_inv.end()) - return it->second; - else - return DefaultValue<T>(); - } - - bool Contains(const T& t) const - { - return m_inv.find(t) != m_inv.end(); - } - - /// Returns an index of set containing @p t. It is guaranteed that: - /// - index(a) equals index(b) iff a is equivalent to b; - /// - 0 <= index(a) < size(). - size_t Index(const T& t) const - { - auto it = m_inv.find(t); - if (it == m_inv.end()) - throw Error("Partition::index(): attempted to obtain an index of nonexistent item"); - auto it2 = m_set.find(it->second); - Y_ASSERT(it2 != m_set.end()); - return it2->second.first; - } - /// Returns the whole equivalence class of @p t (i.e. item @p i - /// is returned iff representative(i) == representative(t)). - const TVector<T>& Klass(const T& t) const - { - auto it = m_inv.find(t); - if (it == m_inv.end()) - throw Error("Partition::index(): attempted to obtain an index of nonexistent item"); - auto it2 = m_set.find(it->second); - Y_ASSERT(it2 != m_set.end()); - return it2->second.second; - } - - bool operator == (const Partition& rhs) const { return m_set == rhs.m_set; } - bool operator != (const Partition& rhs) const { return !(*this == rhs); } - - /// Splits the current sets into smaller ones, using given equivalence relation. - /// Requires given relation imply previous one (set either in ctor or - /// in preceeding calls to split()), but performs faster. - /// Replaces previous relation with given one. - void Split(const Eq& eq) - { - m_eq = eq; - - for (auto&& element : m_set) - if (element.second.second.size() > 1) { - TVector<T>& v = element.second.second; - auto bound = std::partition(v.begin(), v.end(), std::bind2nd(m_eq, v[0])); - if (bound == v.end()) - continue; - - Set delta; - for (auto it = bound, ie = v.end(); it != ie; ++it) - DoAppend(delta, *it); - - v.erase(bound, v.end()); - m_set.insert(delta.begin(), delta.end()); - } - } + Partition(const Eq& eq) + : m_eq(eq) + , m_maxidx(0) + { + } + + /// Appends a new item into partition, creating new equivalience class if neccessary. + void Append(const T& t) { + DoAppend(m_set, t); + } + + typedef typename Set::const_iterator ConstIterator; + + ConstIterator Begin() const { + return m_set.begin(); + } + ConstIterator begin() const { + return m_set.begin(); + } + ConstIterator End() const { + return m_set.end(); + } + ConstIterator end() const { + return m_set.end(); + } + size_t Size() const { + return m_set.size(); + } + bool Empty() const { + return m_set.empty(); + } + + /// Returns an item equal to @p t. It is guaranteed that: + /// - representative(a) equals representative(b) iff a is equivalent to b; + /// - representative(a) is equivalent to a. + const T& Representative(const T& t) const + { + auto it = m_inv.find(t); + if (it != m_inv.end()) + return it->second; + else + return DefaultValue<T>(); + } + + bool Contains(const T& t) const + { + return m_inv.find(t) != m_inv.end(); + } + + /// Returns an index of set containing @p t. It is guaranteed that: + /// - index(a) equals index(b) iff a is equivalent to b; + /// - 0 <= index(a) < size(). + size_t Index(const T& t) const + { + auto it = m_inv.find(t); + if (it == m_inv.end()) + throw Error("Partition::index(): attempted to obtain an index of nonexistent item"); + auto it2 = m_set.find(it->second); + Y_ASSERT(it2 != m_set.end()); + return it2->second.first; + } + /// Returns the whole equivalence class of @p t (i.e. item @p i + /// is returned iff representative(i) == representative(t)). + const TVector<T>& Klass(const T& t) const + { + auto it = m_inv.find(t); + if (it == m_inv.end()) + throw Error("Partition::index(): attempted to obtain an index of nonexistent item"); + auto it2 = m_set.find(it->second); + Y_ASSERT(it2 != m_set.end()); + return it2->second.second; + } + + bool operator == (const Partition& rhs) const { return m_set == rhs.m_set; } + bool operator != (const Partition& rhs) const { return !(*this == rhs); } + + /// Splits the current sets into smaller ones, using given equivalence relation. + /// Requires given relation imply previous one (set either in ctor or + /// in preceeding calls to split()), but performs faster. + /// Replaces previous relation with given one. + void Split(const Eq& eq) + { + m_eq = eq; + + for (auto&& element : m_set) + if (element.second.second.size() > 1) { + TVector<T>& v = element.second.second; + auto bound = std::partition(v.begin(), v.end(), std::bind2nd(m_eq, v[0])); + if (bound == v.end()) + continue; + + Set delta; + for (auto it = bound, ie = v.end(); it != ie; ++it) + DoAppend(delta, *it); + + v.erase(bound, v.end()); + m_set.insert(delta.begin(), delta.end()); + } + } private: - Eq m_eq; - Set m_set; - TMap<T, T> m_inv; - size_t m_maxidx; - - void DoAppend(Set& set, const T& t) - { - auto it = set.begin(); - auto end = set.end(); - for (; it != end; ++it) - if (m_eq(it->first, t)) { - it->second.second.push_back(t); - m_inv[t] = it->first; - break; - } - - if (it == end) { - // Begin new set - TVector<T> v(1, t); - set.insert(ymake_pair(t, ymake_pair(m_maxidx++, v))); - m_inv[t] = t; - } - } + Eq m_eq; + Set m_set; + TMap<T, T> m_inv; + size_t m_maxidx; + + void DoAppend(Set& set, const T& t) + { + auto it = set.begin(); + auto end = set.end(); + for (; it != end; ++it) + if (m_eq(it->first, t)) { + it->second.second.push_back(t); + m_inv[t] = it->first; + break; + } + + if (it == end) { + // Begin new set + TVector<T> v(1, t); + set.insert(ymake_pair(t, ymake_pair(m_maxidx++, v))); + m_inv[t] = t; + } + } }; // Mainly for debugging template<class T, class Eq> yostream& operator << (yostream& stream, const Partition<T, Eq>& partition) { - stream << "Partition {\n"; - for (auto&& partitionElement : partition) { - stream << " Class " << partitionElement.second.first << " \"" << partitionElement.first << "\" { "; - bool first = false; - for (auto&& element : partitionElement.second.second) { - if (first) - stream << ", "; - else - first = true; - stream << element; - } - stream << " }\n"; - } - stream << "}"; - return stream; + stream << "Partition {\n"; + for (auto&& partitionElement : partition) { + stream << " Class " << partitionElement.second.first << " \"" << partitionElement.first << "\" { "; + bool first = false; + for (auto&& element : partitionElement.second.second) { + if (first) + stream << ", "; + else + first = true; + stream << element; + } + stream << " }\n"; + } + stream << "}"; + return stream; } } diff --git a/library/cpp/regex/pire/pire/pire.h b/library/cpp/regex/pire/pire/pire.h index 305d70703a8..f036ce14f84 100644 --- a/library/cpp/regex/pire/pire/pire.h +++ b/library/cpp/regex/pire/pire/pire.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the diff --git a/library/cpp/regex/pire/pire/re_lexer.cpp b/library/cpp/regex/pire/pire/re_lexer.cpp index 456b015d834..509e2508887 100644 --- a/library/cpp/regex/pire/pire/re_lexer.cpp +++ b/library/cpp/regex/pire/pire/re_lexer.cpp @@ -241,8 +241,8 @@ namespace { for (; ch != End && ch != (Control | ']'); ch = CorrectChar(GetChar(), controls)) { if (ch == (Control | 'x')) { UngetChar(ch); - firstUnicode = true; - unicodeSymbol = ReadUnicodeCharacter(); + firstUnicode = true; + unicodeSymbol = ReadUnicodeCharacter(); } else { firstUnicode = false; } diff --git a/library/cpp/regex/pire/pire/re_lexer.h b/library/cpp/regex/pire/pire/re_lexer.h index 279f67e2c5a..d52ed207ddf 100644 --- a/library/cpp/regex/pire/pire/re_lexer.h +++ b/library/cpp/regex/pire/pire/re_lexer.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -52,18 +52,18 @@ using namespace Consts; namespace TokenTypes { enum { - None = 0, - Letters, - Count, - Dot, - Open, - Close, - Or, - And, - Not, - BeginMark, - EndMark, - End + None = 0, + Letters, + Count, + Dot, + Open, + Close, + Or, + And, + Not, + BeginMark, + EndMark, + End }; } @@ -74,31 +74,31 @@ enum { */ class Term { public: - typedef TVector<wchar32> String; - typedef TSet<String> Strings; + typedef TVector<wchar32> String; + typedef TSet<String> Strings; - typedef ypair<int, int> RepetitionCount; - typedef ypair<Strings, bool> CharacterRange; + typedef ypair<int, int> RepetitionCount; + typedef ypair<Strings, bool> CharacterRange; - struct DotTag {}; - struct BeginTag {}; - struct EndTag {}; + struct DotTag {}; + struct BeginTag {}; + struct EndTag {}; - Term(int type): m_type(type) {} - template<class T> Term(int type, T t): m_type(type), m_value(t) {} - Term(int type, const Any& value): m_type(type), m_value(value) {} + Term(int type): m_type(type) {} + template<class T> Term(int type, T t): m_type(type), m_value(t) {} + Term(int type, const Any& value): m_type(type), m_value(value) {} - static Term Character(wchar32 c); - static Term Repetition(int lower, int upper); - static Term Dot(); - static Term BeginMark(); - static Term EndMark(); + static Term Character(wchar32 c); + static Term Repetition(int lower, int upper); + static Term Dot(); + static Term BeginMark(); + static Term EndMark(); - int Type() const { return m_type; } - const Any& Value() const { return m_value; } + int Type() const { return m_type; } + const Any& Value() const { return m_value; } private: - int m_type; - Any m_value; + int m_type; + Any m_value; }; class Feature; @@ -108,78 +108,78 @@ class Feature; */ class Lexer { public: - // One-size-fits-all constructor set. - Lexer() - : m_encoding(&Encodings::Latin1()) - { InstallDefaultFeatures(); } - - explicit Lexer(const char* str) - : m_encoding(&Encodings::Latin1()) - { - InstallDefaultFeatures(); - Assign(str, str + strlen(str)); - } - template<class T> explicit Lexer(const T& t) - : m_encoding(&Encodings::Latin1()) - { - InstallDefaultFeatures(); - Assign(t.begin(), t.end()); - } - - template<class Iter> Lexer(Iter begin, Iter end) - : m_encoding(&Encodings::Latin1()) - { - InstallDefaultFeatures(); - Assign(begin, end); - } - ~Lexer(); - - template<class Iter> void Assign(Iter begin, Iter end) - { - m_input.clear(); - std::copy(begin, end, std::back_inserter(m_input)); - } - - /// The main lexer function. Extracts and returns the next term in input sequence. - Term Lex(); - /// Installs an additional lexer feature. - /// We declare both lvalue and rvalue reference types to fix some linker errors. - Lexer& AddFeature(THolder<Feature>& a); - Lexer& AddFeature(THolder<Feature>&& a); - - const Pire::Encoding& Encoding() const { return *m_encoding; } - Lexer& SetEncoding(const Pire::Encoding& encoding) { m_encoding = &encoding; return *this; } - void SetError(const char* msg) { errmsg = msg; } - void SetError(ystring msg) { errmsg = msg; } - ystring& GetError() { return errmsg; } - - Any& Retval() { return m_retval; } - - Fsm Parse(); - - void Parenthesized(Fsm& fsm); + // One-size-fits-all constructor set. + Lexer() + : m_encoding(&Encodings::Latin1()) + { InstallDefaultFeatures(); } + + explicit Lexer(const char* str) + : m_encoding(&Encodings::Latin1()) + { + InstallDefaultFeatures(); + Assign(str, str + strlen(str)); + } + template<class T> explicit Lexer(const T& t) + : m_encoding(&Encodings::Latin1()) + { + InstallDefaultFeatures(); + Assign(t.begin(), t.end()); + } + + template<class Iter> Lexer(Iter begin, Iter end) + : m_encoding(&Encodings::Latin1()) + { + InstallDefaultFeatures(); + Assign(begin, end); + } + ~Lexer(); + + template<class Iter> void Assign(Iter begin, Iter end) + { + m_input.clear(); + std::copy(begin, end, std::back_inserter(m_input)); + } + + /// The main lexer function. Extracts and returns the next term in input sequence. + Term Lex(); + /// Installs an additional lexer feature. + /// We declare both lvalue and rvalue reference types to fix some linker errors. + Lexer& AddFeature(THolder<Feature>& a); + Lexer& AddFeature(THolder<Feature>&& a); + + const Pire::Encoding& Encoding() const { return *m_encoding; } + Lexer& SetEncoding(const Pire::Encoding& encoding) { m_encoding = &encoding; return *this; } + void SetError(const char* msg) { errmsg = msg; } + void SetError(ystring msg) { errmsg = msg; } + ystring& GetError() { return errmsg; } + + Any& Retval() { return m_retval; } + + Fsm Parse(); + + void Parenthesized(Fsm& fsm); private: - Term DoLex(); + Term DoLex(); - wchar32 GetChar(); - wchar32 PeekChar(); - void UngetChar(wchar32 c); + wchar32 GetChar(); + wchar32 PeekChar(); + void UngetChar(wchar32 c); - void Error(const char* msg) { throw Pire::Error(msg); } + void Error(const char* msg) { throw Pire::Error(msg); } - void InstallDefaultFeatures(); + void InstallDefaultFeatures(); - TDeque<wchar32> m_input; - const Pire::Encoding* m_encoding; - TVector<THolder<Feature>> m_features; - Any m_retval; - ystring errmsg; + TDeque<wchar32> m_input; + const Pire::Encoding* m_encoding; + TVector<THolder<Feature>> m_features; + Any m_retval; + ystring errmsg; - friend class Feature; + friend class Feature; - Lexer(const Lexer&); - Lexer& operator = (const Lexer&); + Lexer(const Lexer&); + Lexer& operator = (const Lexer&); }; /** @@ -188,55 +188,55 @@ private: */ class Feature { public: - /// Precedence of features. The less the priority, the earlier - /// will Lex() be called, and the later will Alter() and Parenthesized() be called. - virtual int Priority() const { return 50; } - - /// Lexer will call this function to check whether the feature - /// wants to handle the next part of the input sequence in its - /// specific way. If it does not, features Lex() will not be called. - virtual bool Accepts(wchar32 /*c*/) const { return false; } - /// Should eat up some part of the input sequence, handle it - /// somehow and produce a terminal. - virtual Term Lex() { return Term(0); } - - /// This function recieves a shiny new terminal, and the feature - /// has a chance to hack it somehow if it wants. - virtual void Alter(Term&) {} - /// This function recieves a parenthesized part of a pattern, and the feature - /// has a chance to hack it somehow if it wants (its the way to implement - /// those perl-style (?@#$%:..) clauses). - virtual void Parenthesized(Fsm&) {} - - using Ptr = THolder<Feature>; - - virtual ~Feature() = default; + /// Precedence of features. The less the priority, the earlier + /// will Lex() be called, and the later will Alter() and Parenthesized() be called. + virtual int Priority() const { return 50; } + + /// Lexer will call this function to check whether the feature + /// wants to handle the next part of the input sequence in its + /// specific way. If it does not, features Lex() will not be called. + virtual bool Accepts(wchar32 /*c*/) const { return false; } + /// Should eat up some part of the input sequence, handle it + /// somehow and produce a terminal. + virtual Term Lex() { return Term(0); } + + /// This function recieves a shiny new terminal, and the feature + /// has a chance to hack it somehow if it wants. + virtual void Alter(Term&) {} + /// This function recieves a parenthesized part of a pattern, and the feature + /// has a chance to hack it somehow if it wants (its the way to implement + /// those perl-style (?@#$%:..) clauses). + virtual void Parenthesized(Fsm&) {} + + using Ptr = THolder<Feature>; + + virtual ~Feature() = default; protected: - // These functions are exposed versions of the corresponding lexer functions. - const Pire::Encoding& Encoding() const { return m_lexer->Encoding(); } - wchar32 GetChar() { return m_lexer->GetChar(); } - wchar32 PeekChar() { return m_lexer->PeekChar(); } - void UngetChar(wchar32 c) { m_lexer->UngetChar(c); } - wchar32 CorrectChar(wchar32 c, const char* controls); - void Error(const char* msg) { m_lexer->Error(msg); } + // These functions are exposed versions of the corresponding lexer functions. + const Pire::Encoding& Encoding() const { return m_lexer->Encoding(); } + wchar32 GetChar() { return m_lexer->GetChar(); } + wchar32 PeekChar() { return m_lexer->PeekChar(); } + void UngetChar(wchar32 c) { m_lexer->UngetChar(c); } + wchar32 CorrectChar(wchar32 c, const char* controls); + void Error(const char* msg) { m_lexer->Error(msg); } private: - friend class Lexer; - Lexer* m_lexer; + friend class Lexer; + Lexer* m_lexer; }; namespace Features { - /// Disables case sensitivity - Feature::Ptr CaseInsensitive(); - - /** - * Adds two more operations: - * (pattern1)&(pattern2) -- matches those strings which match both /pattern1/ and /pattern2/; - * ~(pattern) -- matches those strings which do not match /pattern/. - */ - Feature::Ptr AndNotSupport(); + /// Disables case sensitivity + Feature::Ptr CaseInsensitive(); + + /** + * Adds two more operations: + * (pattern1)&(pattern2) -- matches those strings which match both /pattern1/ and /pattern2/; + * ~(pattern) -- matches those strings which do not match /pattern/. + */ + Feature::Ptr AndNotSupport(); } } diff --git a/library/cpp/regex/pire/pire/read_unicode.cpp b/library/cpp/regex/pire/pire/read_unicode.cpp index 6278ad500aa..6422144c82c 100644 --- a/library/cpp/regex/pire/pire/read_unicode.cpp +++ b/library/cpp/regex/pire/pire/read_unicode.cpp @@ -26,58 +26,58 @@ #include <library/cpp/regex/pire/pire/re_lexer.h> namespace Pire { - wchar32 UnicodeReader::ReadUnicodeCharacter() { - ystring hexStr; - GetChar(); - wchar32 ch = PeekChar(); + wchar32 UnicodeReader::ReadUnicodeCharacter() { + ystring hexStr; + GetChar(); + wchar32 ch = PeekChar(); - if (ch == '{') { - GetChar(); - hexStr = ReadHexDigit( - [](wchar32 ch, size_t numAdded) -> bool { return ch == End || (numAdded != 0 && ch == '}'); }); - ch = GetChar(); - if (ch != '}') { - Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x{...\" sequence should be closed by \"}\""); - } - } else { - hexStr = ReadHexDigit([](wchar32, size_t numAdded) -> bool { return numAdded == 2; }); - if (hexStr.size() != 2) { - Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x...\" sequence should contain two symbols"); - } - } - return HexToDec(hexStr); - } + if (ch == '{') { + GetChar(); + hexStr = ReadHexDigit( + [](wchar32 ch, size_t numAdded) -> bool { return ch == End || (numAdded != 0 && ch == '}'); }); + ch = GetChar(); + if (ch != '}') { + Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x{...\" sequence should be closed by \"}\""); + } + } else { + hexStr = ReadHexDigit([](wchar32, size_t numAdded) -> bool { return numAdded == 2; }); + if (hexStr.size() != 2) { + Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x...\" sequence should contain two symbols"); + } + } + return HexToDec(hexStr); + } - bool UnicodeReader::IsHexDigit(wchar32 ch) { - return ch < 256 && std::isxdigit(ch) != 0; - } + bool UnicodeReader::IsHexDigit(wchar32 ch) { + return ch < 256 && std::isxdigit(ch) != 0; + } - ystring UnicodeReader::ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop) { - ystring result; - wchar32 ch = GetChar(); - while (!shouldStop(ch, result.size())) { - if (!IsHexDigit(ch)) { - Error("Pire::UnicodeReader::ReadHexDigit(): \"\\x...\" sequence contains non-valid hex number"); - } - result.push_back(ch); - ch = GetChar(); - } - UngetChar(ch); - return result; - } + ystring UnicodeReader::ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop) { + ystring result; + wchar32 ch = GetChar(); + while (!shouldStop(ch, result.size())) { + if (!IsHexDigit(ch)) { + Error("Pire::UnicodeReader::ReadHexDigit(): \"\\x...\" sequence contains non-valid hex number"); + } + result.push_back(ch); + ch = GetChar(); + } + UngetChar(ch); + return result; + } - wchar32 UnicodeReader::HexToDec(const ystring &hexStr) { - wchar32 converted; - try { - converted = std::stoul(hexStr, 0, 16); - } catch (std::out_of_range &) { - converted = MAX_UNICODE + 1; - } - if (converted > MAX_UNICODE) { - Error("Pire::UnicodeReader::HexToDec(): hex number in \"\\x...\" sequence is too large"); - } - return converted; - } + wchar32 UnicodeReader::HexToDec(const ystring &hexStr) { + wchar32 converted; + try { + converted = std::stoul(hexStr, 0, 16); + } catch (std::out_of_range &) { + converted = MAX_UNICODE + 1; + } + if (converted > MAX_UNICODE) { + Error("Pire::UnicodeReader::HexToDec(): hex number in \"\\x...\" sequence is too large"); + } + return converted; + } } diff --git a/library/cpp/regex/pire/pire/read_unicode.h b/library/cpp/regex/pire/pire/read_unicode.h index ea3d7599ebb..3c48dfe2453 100644 --- a/library/cpp/regex/pire/pire/read_unicode.h +++ b/library/cpp/regex/pire/pire/read_unicode.h @@ -24,17 +24,17 @@ #include <library/cpp/regex/pire/pire/re_lexer.h> namespace Pire { - class UnicodeReader : public Feature { - public: - wchar32 ReadUnicodeCharacter(); + class UnicodeReader : public Feature { + public: + wchar32 ReadUnicodeCharacter(); - private: - static const wchar32 MAX_UNICODE = 0x10FFFF; + private: + static const wchar32 MAX_UNICODE = 0x10FFFF; - bool IsHexDigit(wchar32 ch); - ystring ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop); - wchar32 HexToDec(const ystring& hexStr); - }; + bool IsHexDigit(wchar32 ch); + ystring ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop); + wchar32 HexToDec(const ystring& hexStr); + }; } diff --git a/library/cpp/regex/pire/pire/run.h b/library/cpp/regex/pire/pire/run.h index 2c536f7c3a2..905f6c32236 100644 --- a/library/cpp/regex/pire/pire/run.h +++ b/library/cpp/regex/pire/pire/run.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -35,17 +35,17 @@ namespace Pire { - template<class Scanner> - struct StDumper { - StDumper(const Scanner& sc, typename Scanner::State st): m_sc(&sc), m_st(st) {} - void Dump(yostream& stream) const { stream << m_sc->StateIndex(m_st) << (m_sc->Final(m_st) ? " [final]" : ""); } - private: - const Scanner* m_sc; - typename Scanner::State m_st; - }; - - template<class Scanner> StDumper<Scanner> StDump(const Scanner& sc, typename Scanner::State st) { return StDumper<Scanner>(sc, st); } - template<class Scanner> yostream& operator << (yostream& stream, const StDumper<Scanner>& stdump) { stdump.Dump(stream); return stream; } + template<class Scanner> + struct StDumper { + StDumper(const Scanner& sc, typename Scanner::State st): m_sc(&sc), m_st(st) {} + void Dump(yostream& stream) const { stream << m_sc->StateIndex(m_st) << (m_sc->Final(m_st) ? " [final]" : ""); } + private: + const Scanner* m_sc; + typename Scanner::State m_st; + }; + + template<class Scanner> StDumper<Scanner> StDump(const Scanner& sc, typename Scanner::State st) { return StDumper<Scanner>(sc, st); } + template<class Scanner> yostream& operator << (yostream& stream, const StDumper<Scanner>& stdump) { stdump.Dump(stream); return stream; } } namespace Pire { @@ -54,53 +54,53 @@ template<class Scanner> PIRE_FORCED_INLINE PIRE_HOT_FUNCTION void Step(const Scanner& scanner, typename Scanner::State& state, Char ch) { - Y_ASSERT(ch < MaxCharUnaligned); - typename Scanner::Action a = scanner.Next(state, ch); - scanner.TakeAction(state, a); + Y_ASSERT(ch < MaxCharUnaligned); + typename Scanner::Action a = scanner.Next(state, ch); + scanner.TakeAction(state, a); } namespace Impl { - enum Action { Continue, Stop }; - - template<class Scanner> - struct RunPred { - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - Action operator()(const Scanner&, const typename Scanner::State&, const char*) const { return Continue; } - }; - - template<class Scanner> - struct ShortestPrefixPred { - explicit ShortestPrefixPred(const char*& pos): m_pos(&pos) {} - - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - Action operator()(const Scanner& sc, const typename Scanner::State& st, const char* pos) const - { - if (sc.Final(st)) { - *m_pos = pos; - return Stop; - } else { - return (sc.Dead(st) ? Stop : Continue); - } - } - private: - const char** m_pos; - }; - - template<class Scanner> - struct LongestPrefixPred { - explicit LongestPrefixPred(const char*& pos): m_pos(&pos) {} - - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - Action operator()(const Scanner& sc, const typename Scanner::State& st, const char* pos) const - { - if (sc.Final(st)) - *m_pos = pos; - return (sc.Dead(st) ? Stop : Continue); - } - private: - const char** m_pos; - }; + enum Action { Continue, Stop }; + + template<class Scanner> + struct RunPred { + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + Action operator()(const Scanner&, const typename Scanner::State&, const char*) const { return Continue; } + }; + + template<class Scanner> + struct ShortestPrefixPred { + explicit ShortestPrefixPred(const char*& pos): m_pos(&pos) {} + + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + Action operator()(const Scanner& sc, const typename Scanner::State& st, const char* pos) const + { + if (sc.Final(st)) { + *m_pos = pos; + return Stop; + } else { + return (sc.Dead(st) ? Stop : Continue); + } + } + private: + const char** m_pos; + }; + + template<class Scanner> + struct LongestPrefixPred { + explicit LongestPrefixPred(const char*& pos): m_pos(&pos) {} + + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + Action operator()(const Scanner& sc, const typename Scanner::State& st, const char* pos) const + { + if (sc.Final(st)) + *m_pos = pos; + return (sc.Dead(st) ? Stop : Continue); + } + private: + const char** m_pos; + }; } @@ -108,125 +108,125 @@ namespace Impl { namespace Impl { - template<class Scanner, class Pred> - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - Action SafeRunChunk(const Scanner& scanner, typename Scanner::State& state, const size_t* p, size_t pos, size_t size, Pred pred) - { - Y_ASSERT(pos <= sizeof(size_t)); - Y_ASSERT(size <= sizeof(size_t)); - Y_ASSERT(pos + size <= sizeof(size_t)); - - if (PIRE_UNLIKELY(size == 0)) - return Continue; - - const char* ptr = (const char*) p + pos; - for (; size--; ++ptr) { - Step(scanner, state, (unsigned char) *ptr); - if (pred(scanner, state, ptr + 1) == Stop) - return Stop; - } - return Continue; - } - - /// Effectively runs a scanner on a short data chunk, fit completely into one machine word. - template<class Scanner, class Pred> - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - Action RunChunk(const Scanner& scanner, typename Scanner::State& state, const size_t* p, size_t pos, size_t size, Pred pred) - { - Y_ASSERT(pos <= sizeof(size_t)); - Y_ASSERT(size <= sizeof(size_t)); - Y_ASSERT(pos + size <= sizeof(size_t)); - - if (PIRE_UNLIKELY(size == 0)) - return Continue; - - size_t chunk = Impl::ToLittleEndian(*p) >> 8*pos; - const char* ptr = (const char*) p + pos + size + 1; - - for (size_t i = size; i != 0; --i) { - Step(scanner, state, chunk & 0xFF); - if (pred(scanner, state, ptr - i) == Stop) - return Stop; - chunk >>= 8; - } - return Continue; - } - - template<class Scanner> - struct AlignedRunner { - - // Generic version for LongestPrefix()/ShortestPrefix() impelementations - template<class Pred> - static inline PIRE_HOT_FUNCTION - Action RunAligned(const Scanner& scanner, typename Scanner::State& state, const size_t* begin, const size_t* end, Pred stop) - { - typename Scanner::State st = state; - Action ret = Continue; - for (; begin != end && (ret = RunChunk(scanner, st, begin, 0, sizeof(void*), stop)) == Continue; ++begin) - ; - state = st; - return ret; - } - - // A special version for Run() impelementation that skips predicate checks - static inline PIRE_HOT_FUNCTION - Action RunAligned(const Scanner& scanner, typename Scanner::State& state, const size_t* begin, const size_t* end, RunPred<Scanner>) - { - typename Scanner::State st = state; - for (; begin != end; ++begin) { - size_t chunk = *begin; - for (size_t i = sizeof(chunk); i != 0; --i) { - Step(scanner, st, chunk & 0xFF); - chunk >>= 8; - } - } - state = st; - return Continue; - } - }; - - /// The main function: runs a scanner through given memory range. - template<class Scanner, class Pred> - inline void DoRun(const Scanner& scanner, typename Scanner::State& st, TStringBuf str, Pred pred) - { - - const size_t* head = reinterpret_cast<const size_t*>((reinterpret_cast<uintptr_t>(str.begin())) & ~(sizeof(size_t)-1)); - const size_t* tail = reinterpret_cast<const size_t*>((reinterpret_cast<uintptr_t>(str.end())) & ~(sizeof(size_t)-1)); - - size_t headSize = (sizeof(size_t) - (str.begin() - (const char*)head)); // The distance from @p begin to the end of the word containing @p begin - size_t tailSize = str.end() - (const char*) tail; // The distance from the beginning of the word containing @p end to the @p end - - Y_ASSERT(headSize >= 1 && headSize <= sizeof(size_t)); - Y_ASSERT(tailSize < sizeof(size_t)); - - if (head == tail) { - Impl::SafeRunChunk(scanner, st, head, sizeof(size_t) - headSize, str.end() - str.begin(), pred); - return; - } - - // st is passed by reference to this function. If we use it directly on each step the compiler will have to - // update it in memory because of pointer aliasing assumptions. Copying it into a local var allows the - // compiler to store it in a register. This saves some instructions and cycles - typename Scanner::State state = st; - - if (str.begin() != (const char*) head) { - if (Impl::RunChunk(scanner, state, head, sizeof(size_t) - headSize, headSize, pred) == Stop) { - st = state; - return; - } - ++head; - } - - if (Impl::AlignedRunner<Scanner>::RunAligned(scanner, state, head, tail, pred) == Stop) { - st = state; - return; - } - - if (tailSize) - Impl::SafeRunChunk(scanner, state, tail, 0, tailSize, pred); - - st = state; - } + template<class Scanner, class Pred> + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + Action SafeRunChunk(const Scanner& scanner, typename Scanner::State& state, const size_t* p, size_t pos, size_t size, Pred pred) + { + Y_ASSERT(pos <= sizeof(size_t)); + Y_ASSERT(size <= sizeof(size_t)); + Y_ASSERT(pos + size <= sizeof(size_t)); + + if (PIRE_UNLIKELY(size == 0)) + return Continue; + + const char* ptr = (const char*) p + pos; + for (; size--; ++ptr) { + Step(scanner, state, (unsigned char) *ptr); + if (pred(scanner, state, ptr + 1) == Stop) + return Stop; + } + return Continue; + } + + /// Effectively runs a scanner on a short data chunk, fit completely into one machine word. + template<class Scanner, class Pred> + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + Action RunChunk(const Scanner& scanner, typename Scanner::State& state, const size_t* p, size_t pos, size_t size, Pred pred) + { + Y_ASSERT(pos <= sizeof(size_t)); + Y_ASSERT(size <= sizeof(size_t)); + Y_ASSERT(pos + size <= sizeof(size_t)); + + if (PIRE_UNLIKELY(size == 0)) + return Continue; + + size_t chunk = Impl::ToLittleEndian(*p) >> 8*pos; + const char* ptr = (const char*) p + pos + size + 1; + + for (size_t i = size; i != 0; --i) { + Step(scanner, state, chunk & 0xFF); + if (pred(scanner, state, ptr - i) == Stop) + return Stop; + chunk >>= 8; + } + return Continue; + } + + template<class Scanner> + struct AlignedRunner { + + // Generic version for LongestPrefix()/ShortestPrefix() impelementations + template<class Pred> + static inline PIRE_HOT_FUNCTION + Action RunAligned(const Scanner& scanner, typename Scanner::State& state, const size_t* begin, const size_t* end, Pred stop) + { + typename Scanner::State st = state; + Action ret = Continue; + for (; begin != end && (ret = RunChunk(scanner, st, begin, 0, sizeof(void*), stop)) == Continue; ++begin) + ; + state = st; + return ret; + } + + // A special version for Run() impelementation that skips predicate checks + static inline PIRE_HOT_FUNCTION + Action RunAligned(const Scanner& scanner, typename Scanner::State& state, const size_t* begin, const size_t* end, RunPred<Scanner>) + { + typename Scanner::State st = state; + for (; begin != end; ++begin) { + size_t chunk = *begin; + for (size_t i = sizeof(chunk); i != 0; --i) { + Step(scanner, st, chunk & 0xFF); + chunk >>= 8; + } + } + state = st; + return Continue; + } + }; + + /// The main function: runs a scanner through given memory range. + template<class Scanner, class Pred> + inline void DoRun(const Scanner& scanner, typename Scanner::State& st, TStringBuf str, Pred pred) + { + + const size_t* head = reinterpret_cast<const size_t*>((reinterpret_cast<uintptr_t>(str.begin())) & ~(sizeof(size_t)-1)); + const size_t* tail = reinterpret_cast<const size_t*>((reinterpret_cast<uintptr_t>(str.end())) & ~(sizeof(size_t)-1)); + + size_t headSize = (sizeof(size_t) - (str.begin() - (const char*)head)); // The distance from @p begin to the end of the word containing @p begin + size_t tailSize = str.end() - (const char*) tail; // The distance from the beginning of the word containing @p end to the @p end + + Y_ASSERT(headSize >= 1 && headSize <= sizeof(size_t)); + Y_ASSERT(tailSize < sizeof(size_t)); + + if (head == tail) { + Impl::SafeRunChunk(scanner, st, head, sizeof(size_t) - headSize, str.end() - str.begin(), pred); + return; + } + + // st is passed by reference to this function. If we use it directly on each step the compiler will have to + // update it in memory because of pointer aliasing assumptions. Copying it into a local var allows the + // compiler to store it in a register. This saves some instructions and cycles + typename Scanner::State state = st; + + if (str.begin() != (const char*) head) { + if (Impl::RunChunk(scanner, state, head, sizeof(size_t) - headSize, headSize, pred) == Stop) { + st = state; + return; + } + ++head; + } + + if (Impl::AlignedRunner<Scanner>::RunAligned(scanner, state, head, tail, pred) == Stop) { + st = state; + return; + } + + if (tailSize) + Impl::SafeRunChunk(scanner, state, tail, 0, tailSize, pred); + + st = state; + } } @@ -235,52 +235,52 @@ namespace Impl { template<class Scanner1, class Scanner2> inline void Run(const Scanner1& scanner1, const Scanner2& scanner2, typename Scanner1::State& state1, typename Scanner2::State& state2, TStringBuf str) { - typedef ScannerPair<Scanner1, Scanner2> Scanners; - Scanners pair(scanner1, scanner2); - typename Scanners::State states(state1, state2); - Run(pair, states, str); - state1 = states.first; - state2 = states.second; + typedef ScannerPair<Scanner1, Scanner2> Scanners; + Scanners pair(scanner1, scanner2); + typename Scanners::State states(state1, state2); + Run(pair, states, str); + state1 = states.first; + state2 = states.second; } #else namespace Impl { - /// A debug version of all Run() methods. - template<class Scanner, class Pred> - inline void DoRun(const Scanner& scanner, typename Scanner::State& state, const char* begin, const char* end, Pred pred) - { - Cdbg << "Running regexp on string " << ystring(begin, ymin(end - begin, static_cast<ptrdiff_t>(100u))) << Endl; - Cdbg << "Initial state " << StDump(scanner, state) << Endl; - - if (pred(scanner, state, begin) == Stop) { - Cdbg << " exiting" << Endl; - return; - } - - for (; begin != end; ++begin) { - Step(scanner, state, (unsigned char)*begin); - Cdbg << *begin << " => state " << StDump(scanner, state) << Endl; - if (pred(scanner, state, begin + 1) == Stop) { - Cdbg << " exiting" << Endl; - return; - } - } - } + /// A debug version of all Run() methods. + template<class Scanner, class Pred> + inline void DoRun(const Scanner& scanner, typename Scanner::State& state, const char* begin, const char* end, Pred pred) + { + Cdbg << "Running regexp on string " << ystring(begin, ymin(end - begin, static_cast<ptrdiff_t>(100u))) << Endl; + Cdbg << "Initial state " << StDump(scanner, state) << Endl; + + if (pred(scanner, state, begin) == Stop) { + Cdbg << " exiting" << Endl; + return; + } + + for (; begin != end; ++begin) { + Step(scanner, state, (unsigned char)*begin); + Cdbg << *begin << " => state " << StDump(scanner, state) << Endl; + if (pred(scanner, state, begin + 1) == Stop) { + Cdbg << " exiting" << Endl; + return; + } + } + } } #endif - + template<class Scanner> void Run(const Scanner& sc, typename Scanner::State& st, TStringBuf str) { - Impl::DoRun(sc, st, str, Impl::RunPred<Scanner>()); + Impl::DoRun(sc, st, str, Impl::RunPred<Scanner>()); } template<class Scanner> void Run(const Scanner& sc, typename Scanner::State& st, const char* begin, const char* end) { - Run(sc, st, TStringBuf(begin, end)); + Run(sc, st, TStringBuf(begin, end)); } /// Returns default constructed string_view{} if there is no matching prefix @@ -288,25 +288,25 @@ void Run(const Scanner& sc, typename Scanner::State& st, const char* begin, cons template<class Scanner> std::string_view LongestPrefix(const Scanner& sc, std::string_view str, bool throughBeginMark = false, bool throughEndMark = false) { - typename Scanner::State st; - sc.Initialize(st); - if (throughBeginMark) - Pire::Step(sc, st, BeginMark); - const char* pos = (sc.Final(st) ? str.data() : nullptr); - Impl::DoRun(sc, st, str, Impl::LongestPrefixPred<Scanner>(pos)); - if (throughEndMark) { - Pire::Step(sc, st, EndMark); - if (sc.Final(st)) - pos = str.data() + str.size(); - } - return pos ? str.substr(0, pos - str.data()) : std::string_view{}; + typename Scanner::State st; + sc.Initialize(st); + if (throughBeginMark) + Pire::Step(sc, st, BeginMark); + const char* pos = (sc.Final(st) ? str.data() : nullptr); + Impl::DoRun(sc, st, str, Impl::LongestPrefixPred<Scanner>(pos)); + if (throughEndMark) { + Pire::Step(sc, st, EndMark); + if (sc.Final(st)) + pos = str.data() + str.size(); + } + return pos ? str.substr(0, pos - str.data()) : std::string_view{}; } template<class Scanner> const char* LongestPrefix(const Scanner& sc, const char* begin, const char* end, bool throughBeginMark = false, bool throughEndMark = false) { - auto prefix = LongestPrefix(sc, std::string_view(begin, end - begin), throughBeginMark, throughEndMark); - return prefix.data() + prefix.size(); + auto prefix = LongestPrefix(sc, std::string_view(begin, end - begin), throughBeginMark, throughEndMark); + return prefix.data() + prefix.size(); } /// Returns default constructed string_view{} if there is no matching prefix @@ -314,20 +314,20 @@ const char* LongestPrefix(const Scanner& sc, const char* begin, const char* end, template<class Scanner> std::string_view ShortestPrefix(const Scanner& sc, std::string_view str, bool throughBeginMark = false, bool throughEndMark = false) { - typename Scanner::State st; - sc.Initialize(st); - if (throughBeginMark) - Pire::Step(sc, st, BeginMark); - if (sc.Final(st)) - return str.substr(0, 0); - const char* pos = nullptr; - Impl::DoRun(sc, st, str, Impl::ShortestPrefixPred<Scanner>(pos)); - if (throughEndMark) { - Pire::Step(sc, st, EndMark); - if (sc.Final(st) && !pos) - pos = str.data() + str.size(); - } - return pos ? str.substr(0, pos - str.data()) : std::string_view{}; + typename Scanner::State st; + sc.Initialize(st); + if (throughBeginMark) + Pire::Step(sc, st, BeginMark); + if (sc.Final(st)) + return str.substr(0, 0); + const char* pos = nullptr; + Impl::DoRun(sc, st, str, Impl::ShortestPrefixPred<Scanner>(pos)); + if (throughEndMark) { + Pire::Step(sc, st, EndMark); + if (sc.Final(st) && !pos) + pos = str.data() + str.size(); + } + return pos ? str.substr(0, pos - str.data()) : std::string_view{}; } template<class Scanner> @@ -337,7 +337,7 @@ const char* ShortestPrefix(const Scanner& sc, const char* begin, const char* end return prefix.data() + prefix.size(); } - + /// The same as above, but scans string in reverse direction /// (consider using Fsm::Reverse() for using in this function). /// Returns default constructed string_view{} if there is no matching suffix @@ -345,35 +345,35 @@ const char* ShortestPrefix(const Scanner& sc, const char* begin, const char* end template<class Scanner> inline std::string_view LongestSuffix(const Scanner& scanner, std::string_view str, bool throughEndMark = false, bool throughBeginMark = false) { - typename Scanner::State state; - scanner.Initialize(state); - if (throughEndMark) - Step(scanner, state, EndMark); - PIRE_IFDEBUG(Cdbg << "Running LongestSuffix on string " << ystring(str) << Endl); - PIRE_IFDEBUG(Cdbg << "Initial state " << StDump(scanner, state) << Endl); - - std::string_view suffix{}; - auto begin = str.data() + str.size(); - while (begin != str.data() && !scanner.Dead(state)) { - if (scanner.Final(state)) - suffix = str.substr(begin - str.data()); - --begin; - Step(scanner, state, (unsigned char)*begin); - PIRE_IFDEBUG(Cdbg << *begin << " => state " << StDump(scanner, state) << Endl); - } - if (scanner.Final(state)) - suffix = str.substr(begin - str.data()); - if (throughBeginMark) { - Step(scanner, state, BeginMark); - if (scanner.Final(state)) - suffix = str.substr(begin - str.data()); - } - return suffix; + typename Scanner::State state; + scanner.Initialize(state); + if (throughEndMark) + Step(scanner, state, EndMark); + PIRE_IFDEBUG(Cdbg << "Running LongestSuffix on string " << ystring(str) << Endl); + PIRE_IFDEBUG(Cdbg << "Initial state " << StDump(scanner, state) << Endl); + + std::string_view suffix{}; + auto begin = str.data() + str.size(); + while (begin != str.data() && !scanner.Dead(state)) { + if (scanner.Final(state)) + suffix = str.substr(begin - str.data()); + --begin; + Step(scanner, state, (unsigned char)*begin); + PIRE_IFDEBUG(Cdbg << *begin << " => state " << StDump(scanner, state) << Endl); + } + if (scanner.Final(state)) + suffix = str.substr(begin - str.data()); + if (throughBeginMark) { + Step(scanner, state, BeginMark); + if (scanner.Final(state)) + suffix = str.substr(begin - str.data()); + } + return suffix; } template<class Scanner> inline const char* LongestSuffix(const Scanner& scanner, const char* rbegin, const char* rend, bool throughEndMark = false, bool throughBeginMark = false) { - auto suffix = LongestSuffix(scanner, std::string_view(rend + 1, rbegin - rend), throughEndMark, throughBeginMark); + auto suffix = LongestSuffix(scanner, std::string_view(rend + 1, rbegin - rend), throughEndMark, throughBeginMark); return suffix.data() ? suffix.data() - 1 : nullptr; } @@ -383,52 +383,52 @@ inline const char* LongestSuffix(const Scanner& scanner, const char* rbegin, con template<class Scanner> inline std::string_view ShortestSuffix(const Scanner& scanner, std::string_view str, bool throughEndMark = false, bool throughBeginMark = false) { - auto begin = str.data() + str.size(); - typename Scanner::State state; - scanner.Initialize(state); - if (throughEndMark) - Step(scanner, state, EndMark); - PIRE_IFDEBUG(Cdbg << "Running ShortestSuffix on string " << ystring(str) << Endl); - PIRE_IFDEBUG(Cdbg << "Initial state " << StDump(scanner, state) << Endl); - - while (begin != str.data() && !scanner.Final(state) && !scanner.Dead(state)) { - --begin; - scanner.Next(state, (unsigned char)*begin); - PIRE_IFDEBUG(Cdbg << *rbegin << " => state " << StDump(scanner, state) << Endl); - } - if (throughBeginMark) - Step(scanner, state, BeginMark); - return scanner.Final(state) ? str.substr(begin - str.data()) : std::string_view{}; + auto begin = str.data() + str.size(); + typename Scanner::State state; + scanner.Initialize(state); + if (throughEndMark) + Step(scanner, state, EndMark); + PIRE_IFDEBUG(Cdbg << "Running ShortestSuffix on string " << ystring(str) << Endl); + PIRE_IFDEBUG(Cdbg << "Initial state " << StDump(scanner, state) << Endl); + + while (begin != str.data() && !scanner.Final(state) && !scanner.Dead(state)) { + --begin; + scanner.Next(state, (unsigned char)*begin); + PIRE_IFDEBUG(Cdbg << *rbegin << " => state " << StDump(scanner, state) << Endl); + } + if (throughBeginMark) + Step(scanner, state, BeginMark); + return scanner.Final(state) ? str.substr(begin - str.data()) : std::string_view{}; } template<class Scanner> inline const char* ShortestSuffix(const Scanner& scanner, const char* rbegin, const char* rend, bool throughEndMark = false, bool throughBeginMark = false) { - auto suffix = ShortestSuffix(scanner, std::string_view(rend + 1, rbegin - rend), throughEndMark, throughBeginMark); - return suffix.data() ? suffix.data() - 1 : nullptr; + auto suffix = ShortestSuffix(scanner, std::string_view(rend + 1, rbegin - rend), throughEndMark, throughBeginMark); + return suffix.data() ? suffix.data() - 1 : nullptr; } template<class Scanner> class RunHelper { public: - RunHelper(const Scanner& sc, typename Scanner::State st): Sc(&sc), St(st) {} - explicit RunHelper(const Scanner& sc): Sc(&sc) { Sc->Initialize(St); } + RunHelper(const Scanner& sc, typename Scanner::State st): Sc(&sc), St(st) {} + explicit RunHelper(const Scanner& sc): Sc(&sc) { Sc->Initialize(St); } - RunHelper<Scanner>& Step(Char letter) { Pire::Step(*Sc, St, letter); return *this; } - RunHelper<Scanner>& Run(TStringBuf str) { Pire::Run(*Sc, St, str); return *this; } - RunHelper<Scanner>& Run(const char* begin, const char* end) { return Run(TStringBuf(begin, end)); } - RunHelper<Scanner>& Run(const char* begin, size_t size) { return Run(TStringBuf(begin, begin + size)); } - RunHelper<Scanner>& Begin() { return Step(BeginMark); } - RunHelper<Scanner>& End() { return Step(EndMark); } + RunHelper<Scanner>& Step(Char letter) { Pire::Step(*Sc, St, letter); return *this; } + RunHelper<Scanner>& Run(TStringBuf str) { Pire::Run(*Sc, St, str); return *this; } + RunHelper<Scanner>& Run(const char* begin, const char* end) { return Run(TStringBuf(begin, end)); } + RunHelper<Scanner>& Run(const char* begin, size_t size) { return Run(TStringBuf(begin, begin + size)); } + RunHelper<Scanner>& Begin() { return Step(BeginMark); } + RunHelper<Scanner>& End() { return Step(EndMark); } - const typename Scanner::State& State() const { return St; } - struct Tag {}; - operator const Tag*() const { return Sc->Final(St) ? (const Tag*) this : 0; } - bool operator ! () const { return !Sc->Final(St); } + const typename Scanner::State& State() const { return St; } + struct Tag {}; + operator const Tag*() const { return Sc->Final(St) ? (const Tag*) this : 0; } + bool operator ! () const { return !Sc->Final(St); } private: - const Scanner* Sc; - typename Scanner::State St; + const Scanner* Sc; + typename Scanner::State St; }; template<class Scanner> @@ -442,22 +442,22 @@ RunHelper<Scanner> Runner(const Scanner& sc, typename Scanner::State st) { retur template<class Scanner> bool Matches(const Scanner& scanner, TStringBuf str) { - return Runner(scanner).Run(str); + return Runner(scanner).Run(str); } template<class Scanner> bool Matches(const Scanner& scanner, const char* begin, const char* end) { - return Runner(scanner).Run(TStringBuf(begin, end)); + return Runner(scanner).Run(TStringBuf(begin, end)); } /// Constructs an inline scanner in one statement template<class Scanner> Scanner MmappedScanner(const char* ptr, size_t size) { - Scanner s; - s.Mmap(ptr, size); - return s; + Scanner s; + s.Mmap(ptr, size); + return s; } } diff --git a/library/cpp/regex/pire/pire/scanner_io.cpp b/library/cpp/regex/pire/pire/scanner_io.cpp index 65cf9a1a93f..af7cfde3d48 100644 --- a/library/cpp/regex/pire/pire/scanner_io.cpp +++ b/library/cpp/regex/pire/pire/scanner_io.cpp @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -31,187 +31,187 @@ #include "align.h" namespace Pire { - + void SimpleScanner::Save(yostream* s) const { - SavePodType(s, Header(ScannerIOTypes::SimpleScanner, sizeof(m))); - Impl::AlignSave(s, sizeof(Header)); - Locals mc = m; - mc.initial -= reinterpret_cast<size_t>(m_transitions); - SavePodType(s, mc); - Impl::AlignSave(s, sizeof(mc)); - SavePodType(s, Empty()); - Impl::AlignSave(s, sizeof(Empty())); - if (!Empty()) { - Y_ASSERT(m_buffer); - Impl::AlignedSaveArray(s, m_buffer.Get(), BufSize()); - } + SavePodType(s, Header(ScannerIOTypes::SimpleScanner, sizeof(m))); + Impl::AlignSave(s, sizeof(Header)); + Locals mc = m; + mc.initial -= reinterpret_cast<size_t>(m_transitions); + SavePodType(s, mc); + Impl::AlignSave(s, sizeof(mc)); + SavePodType(s, Empty()); + Impl::AlignSave(s, sizeof(Empty())); + if (!Empty()) { + Y_ASSERT(m_buffer); + Impl::AlignedSaveArray(s, m_buffer.Get(), BufSize()); + } } void SimpleScanner::Load(yistream* s) { - SimpleScanner sc; - Impl::ValidateHeader(s, ScannerIOTypes::SimpleScanner, sizeof(sc.m)); - LoadPodType(s, sc.m); - Impl::AlignLoad(s, sizeof(sc.m)); - bool empty; - LoadPodType(s, empty); - Impl::AlignLoad(s, sizeof(empty)); - if (empty) { - sc.Alias(Null()); - } else { - sc.m_buffer = BufferType(new char[sc.BufSize()]); - Impl::AlignedLoadArray(s, sc.m_buffer.Get(), sc.BufSize()); - sc.Markup(sc.m_buffer.Get()); - sc.m.initial += reinterpret_cast<size_t>(sc.m_transitions); - } - Swap(sc); + SimpleScanner sc; + Impl::ValidateHeader(s, ScannerIOTypes::SimpleScanner, sizeof(sc.m)); + LoadPodType(s, sc.m); + Impl::AlignLoad(s, sizeof(sc.m)); + bool empty; + LoadPodType(s, empty); + Impl::AlignLoad(s, sizeof(empty)); + if (empty) { + sc.Alias(Null()); + } else { + sc.m_buffer = BufferType(new char[sc.BufSize()]); + Impl::AlignedLoadArray(s, sc.m_buffer.Get(), sc.BufSize()); + sc.Markup(sc.m_buffer.Get()); + sc.m.initial += reinterpret_cast<size_t>(sc.m_transitions); + } + Swap(sc); } void SlowScanner::Save(yostream* s) const { - SavePodType(s, Header(ScannerIOTypes::SlowScanner, sizeof(m))); - Impl::AlignSave(s, sizeof(Header)); - SavePodType(s, m); - Impl::AlignSave(s, sizeof(m)); - SavePodType(s, Empty()); - Impl::AlignSave(s, sizeof(Empty())); - if (!Empty()) { - Y_ASSERT(!m_vec.empty()); - Impl::AlignedSaveArray(s, m_letters, MaxChar); - Impl::AlignedSaveArray(s, m_finals, m.statesCount); - - size_t c = 0; - SavePodType<size_t>(s, 0); - for (auto&& i : m_vec) { - size_t n = c + i.size(); - SavePodType(s, n); - c = n; - } - Impl::AlignSave(s, (m_vec.size() + 1) * sizeof(size_t)); - - size_t size = 0; - for (auto&& i : m_vec) - if (!i.empty()) { - SavePodArray(s, &(i)[0], i.size()); - size += sizeof(unsigned) * i.size(); - } - Impl::AlignSave(s, size); - if (need_actions) { - size_t pos = 0; - for (TVector< TVector< Action > >::const_iterator i = m_actionsvec.begin(), ie = m_actionsvec.end(); i != ie; ++i) - if (!i->empty()) { - SavePodArray(s, &(*i)[0], i->size()); - pos += sizeof(Action) * i->size(); - } - Impl::AlignSave(s, pos); - } - } + SavePodType(s, Header(ScannerIOTypes::SlowScanner, sizeof(m))); + Impl::AlignSave(s, sizeof(Header)); + SavePodType(s, m); + Impl::AlignSave(s, sizeof(m)); + SavePodType(s, Empty()); + Impl::AlignSave(s, sizeof(Empty())); + if (!Empty()) { + Y_ASSERT(!m_vec.empty()); + Impl::AlignedSaveArray(s, m_letters, MaxChar); + Impl::AlignedSaveArray(s, m_finals, m.statesCount); + + size_t c = 0; + SavePodType<size_t>(s, 0); + for (auto&& i : m_vec) { + size_t n = c + i.size(); + SavePodType(s, n); + c = n; + } + Impl::AlignSave(s, (m_vec.size() + 1) * sizeof(size_t)); + + size_t size = 0; + for (auto&& i : m_vec) + if (!i.empty()) { + SavePodArray(s, &(i)[0], i.size()); + size += sizeof(unsigned) * i.size(); + } + Impl::AlignSave(s, size); + if (need_actions) { + size_t pos = 0; + for (TVector< TVector< Action > >::const_iterator i = m_actionsvec.begin(), ie = m_actionsvec.end(); i != ie; ++i) + if (!i->empty()) { + SavePodArray(s, &(*i)[0], i->size()); + pos += sizeof(Action) * i->size(); + } + Impl::AlignSave(s, pos); + } + } } void SlowScanner::Load(yistream* s) { - SlowScanner sc; - Impl::ValidateHeader(s, ScannerIOTypes::SlowScanner, sizeof(sc.m)); - LoadPodType(s, sc.m); - Impl::AlignLoad(s, sizeof(sc.m)); - bool empty; - LoadPodType(s, empty); - Impl::AlignLoad(s, sizeof(empty)); - sc.need_actions = need_actions; - if (empty) { - sc.Alias(Null()); - } else { - sc.m_vec.resize(sc.m.lettersCount * sc.m.statesCount); - if (sc.need_actions) - sc.m_actionsvec.resize(sc.m.lettersCount * sc.m.statesCount); - sc.m_vecptr = &sc.m_vec; - - sc.alloc(sc.m_letters, MaxChar); - Impl::AlignedLoadArray(s, sc.m_letters, MaxChar); - - sc.alloc(sc.m_finals, sc.m.statesCount); - Impl::AlignedLoadArray(s, sc.m_finals, sc.m.statesCount); - - size_t c; - LoadPodType(s, c); - auto act = sc.m_actionsvec.begin(); - for (auto&& i : sc.m_vec) { - size_t n; - LoadPodType(s, n); - i.resize(n - c); - if (sc.need_actions) { - act->resize(n - c); - ++act; - } - c = n; - } - Impl::AlignLoad(s, (m_vec.size() + 1) * sizeof(size_t)); - - size_t size = 0; - for (auto&& i : sc.m_vec) - if (!i.empty()) { - LoadPodArray(s, &(i)[0], i.size()); - size += sizeof(unsigned) * i.size(); - } - Impl::AlignLoad(s, size); - size_t actSize = 0; - if (sc.need_actions) { - for (auto&& i : sc.m_actionsvec) { - if (!i.empty()) { - LoadPodArray(s, &(i)[0], i.size()); - actSize += sizeof(Action) * i.size(); - } - } - Impl::AlignLoad(s, actSize); - } - } - Swap(sc); + SlowScanner sc; + Impl::ValidateHeader(s, ScannerIOTypes::SlowScanner, sizeof(sc.m)); + LoadPodType(s, sc.m); + Impl::AlignLoad(s, sizeof(sc.m)); + bool empty; + LoadPodType(s, empty); + Impl::AlignLoad(s, sizeof(empty)); + sc.need_actions = need_actions; + if (empty) { + sc.Alias(Null()); + } else { + sc.m_vec.resize(sc.m.lettersCount * sc.m.statesCount); + if (sc.need_actions) + sc.m_actionsvec.resize(sc.m.lettersCount * sc.m.statesCount); + sc.m_vecptr = &sc.m_vec; + + sc.alloc(sc.m_letters, MaxChar); + Impl::AlignedLoadArray(s, sc.m_letters, MaxChar); + + sc.alloc(sc.m_finals, sc.m.statesCount); + Impl::AlignedLoadArray(s, sc.m_finals, sc.m.statesCount); + + size_t c; + LoadPodType(s, c); + auto act = sc.m_actionsvec.begin(); + for (auto&& i : sc.m_vec) { + size_t n; + LoadPodType(s, n); + i.resize(n - c); + if (sc.need_actions) { + act->resize(n - c); + ++act; + } + c = n; + } + Impl::AlignLoad(s, (m_vec.size() + 1) * sizeof(size_t)); + + size_t size = 0; + for (auto&& i : sc.m_vec) + if (!i.empty()) { + LoadPodArray(s, &(i)[0], i.size()); + size += sizeof(unsigned) * i.size(); + } + Impl::AlignLoad(s, size); + size_t actSize = 0; + if (sc.need_actions) { + for (auto&& i : sc.m_actionsvec) { + if (!i.empty()) { + LoadPodArray(s, &(i)[0], i.size()); + actSize += sizeof(Action) * i.size(); + } + } + Impl::AlignLoad(s, actSize); + } + } + Swap(sc); } void LoadedScanner::Save(yostream* s) const { - Save(s, ScannerIOTypes::LoadedScanner); + Save(s, ScannerIOTypes::LoadedScanner); } void LoadedScanner::Save(yostream* s, ui32 type) const { - Y_ASSERT(type == ScannerIOTypes::LoadedScanner || type == ScannerIOTypes::NoGlueLimitCountingScanner); - SavePodType(s, Header(type, sizeof(m))); - Impl::AlignSave(s, sizeof(Header)); - Locals mc = m; - mc.initial -= reinterpret_cast<size_t>(m_jumps); - SavePodType(s, mc); - Impl::AlignSave(s, sizeof(mc)); - - Impl::AlignedSaveArray(s, m_letters, MaxChar); - Impl::AlignedSaveArray(s, m_jumps, m.statesCount * m.lettersCount); - Impl::AlignedSaveArray(s, m_tags, m.statesCount); + Y_ASSERT(type == ScannerIOTypes::LoadedScanner || type == ScannerIOTypes::NoGlueLimitCountingScanner); + SavePodType(s, Header(type, sizeof(m))); + Impl::AlignSave(s, sizeof(Header)); + Locals mc = m; + mc.initial -= reinterpret_cast<size_t>(m_jumps); + SavePodType(s, mc); + Impl::AlignSave(s, sizeof(mc)); + + Impl::AlignedSaveArray(s, m_letters, MaxChar); + Impl::AlignedSaveArray(s, m_jumps, m.statesCount * m.lettersCount); + Impl::AlignedSaveArray(s, m_tags, m.statesCount); } void LoadedScanner::Load(yistream* s) { - Load(s, nullptr); + Load(s, nullptr); } void LoadedScanner::Load(yistream* s, ui32* type) { - LoadedScanner sc; - Header header = Impl::ValidateHeader(s, ScannerIOTypes::LoadedScanner, sizeof(sc.m)); - if (type) { - *type = header.Type; - } - LoadPodType(s, sc.m); - Impl::AlignLoad(s, sizeof(sc.m)); - sc.m_buffer = BufferType(new char[sc.BufSize()]); - sc.Markup(sc.m_buffer.Get()); - Impl::AlignedLoadArray(s, sc.m_letters, MaxChar); - Impl::AlignedLoadArray(s, sc.m_jumps, sc.m.statesCount * sc.m.lettersCount); - if (header.Version == Header::RE_VERSION_WITH_MACTIONS) { - TVector<Action> actions(sc.m.statesCount * sc.m.lettersCount); - Impl::AlignedLoadArray(s, actions.data(), actions.size()); - } - Impl::AlignedLoadArray(s, sc.m_tags, sc.m.statesCount); - sc.m.initial += reinterpret_cast<size_t>(sc.m_jumps); - Swap(sc); + LoadedScanner sc; + Header header = Impl::ValidateHeader(s, ScannerIOTypes::LoadedScanner, sizeof(sc.m)); + if (type) { + *type = header.Type; + } + LoadPodType(s, sc.m); + Impl::AlignLoad(s, sizeof(sc.m)); + sc.m_buffer = BufferType(new char[sc.BufSize()]); + sc.Markup(sc.m_buffer.Get()); + Impl::AlignedLoadArray(s, sc.m_letters, MaxChar); + Impl::AlignedLoadArray(s, sc.m_jumps, sc.m.statesCount * sc.m.lettersCount); + if (header.Version == Header::RE_VERSION_WITH_MACTIONS) { + TVector<Action> actions(sc.m.statesCount * sc.m.lettersCount); + Impl::AlignedLoadArray(s, actions.data(), actions.size()); + } + Impl::AlignedLoadArray(s, sc.m_tags, sc.m.statesCount); + sc.m.initial += reinterpret_cast<size_t>(sc.m_jumps); + Swap(sc); } } diff --git a/library/cpp/regex/pire/pire/scanners/common.h b/library/cpp/regex/pire/pire/scanners/common.h index 4cffca5072f..59b4dcd9699 100644 --- a/library/cpp/regex/pire/pire/scanners/common.h +++ b/library/cpp/regex/pire/pire/scanners/common.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -30,94 +30,94 @@ #include <library/cpp/regex/pire/pire/platform.h> namespace Pire { - namespace ScannerIOTypes { - enum { - NoScanner = 0, - Scanner = 1, - SimpleScanner = 2, - SlowScanner = 3, - LoadedScanner = 4, - NoGlueLimitCountingScanner = 5, - }; - } + namespace ScannerIOTypes { + enum { + NoScanner = 0, + Scanner = 1, + SimpleScanner = 2, + SlowScanner = 3, + LoadedScanner = 4, + NoGlueLimitCountingScanner = 5, + }; + } - struct Header { - ui32 Magic; - ui32 Version; - ui32 PtrSize; - ui32 MaxWordSize; - ui32 Type; - ui32 HdrSize; + struct Header { + ui32 Magic; + ui32 Version; + ui32 PtrSize; + ui32 MaxWordSize; + ui32 Type; + ui32 HdrSize; - static const ui32 MAGIC = 0x45524950; // "PIRE" on litte-endian - static const ui32 RE_VERSION = 7; // Should be incremented each time when the format of serialized scanner changes - static const ui32 RE_VERSION_WITH_MACTIONS = 6; // LoadedScanner with m_actions, which is ignored + static const ui32 MAGIC = 0x45524950; // "PIRE" on litte-endian + static const ui32 RE_VERSION = 7; // Should be incremented each time when the format of serialized scanner changes + static const ui32 RE_VERSION_WITH_MACTIONS = 6; // LoadedScanner with m_actions, which is ignored - explicit Header(ui32 type, size_t hdrsize) - : Magic(MAGIC) - , Version(RE_VERSION) - , PtrSize(sizeof(void*)) - , MaxWordSize(sizeof(Impl::MaxSizeWord)) - , Type(type) - , HdrSize((ui32)hdrsize) - {} + explicit Header(ui32 type, size_t hdrsize) + : Magic(MAGIC) + , Version(RE_VERSION) + , PtrSize(sizeof(void*)) + , MaxWordSize(sizeof(Impl::MaxSizeWord)) + , Type(type) + , HdrSize((ui32)hdrsize) + {} - void Validate(ui32 type, size_t hdrsize) const - { - if (Magic != MAGIC || PtrSize != sizeof(void*) || MaxWordSize != sizeof(Impl::MaxSizeWord)) - throw Error("Serialized regexp incompatible with your system"); - if (Version != RE_VERSION && Version != RE_VERSION_WITH_MACTIONS) - throw Error("You are trying to used an incompatible version of a serialized regexp"); - if (type != ScannerIOTypes::NoScanner && type != Type && - !(type == ScannerIOTypes::LoadedScanner && Type == ScannerIOTypes::NoGlueLimitCountingScanner)) { - throw Error("Serialized regexp incompatible with your system"); - } - if (hdrsize != 0 && HdrSize != hdrsize) - throw Error("Serialized regexp incompatible with your system"); - } - }; + void Validate(ui32 type, size_t hdrsize) const + { + if (Magic != MAGIC || PtrSize != sizeof(void*) || MaxWordSize != sizeof(Impl::MaxSizeWord)) + throw Error("Serialized regexp incompatible with your system"); + if (Version != RE_VERSION && Version != RE_VERSION_WITH_MACTIONS) + throw Error("You are trying to used an incompatible version of a serialized regexp"); + if (type != ScannerIOTypes::NoScanner && type != Type && + !(type == ScannerIOTypes::LoadedScanner && Type == ScannerIOTypes::NoGlueLimitCountingScanner)) { + throw Error("Serialized regexp incompatible with your system"); + } + if (hdrsize != 0 && HdrSize != hdrsize) + throw Error("Serialized regexp incompatible with your system"); + } + }; - namespace Impl { - inline const void* AdvancePtr(const size_t*& ptr, size_t& size, size_t delta) - { - ptr = (const size_t*) ((const char*) ptr + delta); - size -= delta; - return (const void*) ptr; - } + namespace Impl { + inline const void* AdvancePtr(const size_t*& ptr, size_t& size, size_t delta) + { + ptr = (const size_t*) ((const char*) ptr + delta); + size -= delta; + return (const void*) ptr; + } - template<class T> - inline void MapPtr(T*& field, size_t count, const size_t*& p, size_t& size) - { - if (size < count * sizeof(*field)) - throw Error("EOF reached while mapping Pire::SlowScanner"); - field = (T*) p; - Impl::AdvancePtr(p, size, count * sizeof(*field)); - Impl::AlignPtr(p, size); - } + template<class T> + inline void MapPtr(T*& field, size_t count, const size_t*& p, size_t& size) + { + if (size < count * sizeof(*field)) + throw Error("EOF reached while mapping Pire::SlowScanner"); + field = (T*) p; + Impl::AdvancePtr(p, size, count * sizeof(*field)); + Impl::AlignPtr(p, size); + } - inline void CheckAlign(const void* ptr, size_t bound = sizeof(size_t)) - { - if (!IsAligned(ptr, bound)) - throw Error("Tried to mmap scanner at misaligned address"); - } + inline void CheckAlign(const void* ptr, size_t bound = sizeof(size_t)) + { + if (!IsAligned(ptr, bound)) + throw Error("Tried to mmap scanner at misaligned address"); + } - inline Header ValidateHeader(const size_t*& ptr, size_t& size, ui32 type, size_t hdrsize) - { - const Header* hdr; - MapPtr(hdr, 1, ptr, size); - hdr->Validate(type, hdrsize); - return *hdr; - } + inline Header ValidateHeader(const size_t*& ptr, size_t& size, ui32 type, size_t hdrsize) + { + const Header* hdr; + MapPtr(hdr, 1, ptr, size); + hdr->Validate(type, hdrsize); + return *hdr; + } - inline Header ValidateHeader(yistream* s, ui32 type, size_t hdrsize) - { - Header hdr(ScannerIOTypes::NoScanner, 0); - LoadPodType(s, hdr); - AlignLoad(s, sizeof(hdr)); - hdr.Validate(type, hdrsize); - return hdr; - } - } + inline Header ValidateHeader(yistream* s, ui32 type, size_t hdrsize) + { + Header hdr(ScannerIOTypes::NoScanner, 0); + LoadPodType(s, hdr); + AlignLoad(s, sizeof(hdr)); + hdr.Validate(type, hdrsize); + return hdr; + } + } } #endif diff --git a/library/cpp/regex/pire/pire/scanners/half_final.h b/library/cpp/regex/pire/pire/scanners/half_final.h index ea47a0118f3..b9c152b7d42 100644 --- a/library/cpp/regex/pire/pire/scanners/half_final.h +++ b/library/cpp/regex/pire/pire/scanners/half_final.h @@ -32,198 +32,198 @@ namespace Impl { template<typename Relocation, typename Shortcutting> class HalfFinalScanner : public Scanner<Relocation, Shortcutting> { public: - typedef typename Impl::Scanner<Relocation, Shortcutting> Scanner; - - HalfFinalScanner() : Scanner() {} - - explicit HalfFinalScanner(Fsm fsm_, size_t distance = 0) { - if (distance) { - fsm_ = CreateApproxFsm(fsm_, distance); - } - HalfFinalFsm fsm(fsm_); - fsm.MakeScanner(); - Scanner::Init(fsm.GetFsm().Size(), fsm.GetFsm().Letters(), fsm.GetFsm().Finals().size(), fsm.GetFsm().Initial(), 1); - BuildScanner(fsm.GetFsm(), *this); - } - - explicit HalfFinalScanner(const HalfFinalFsm& fsm) { - Scanner::Init(fsm.GetFsm().Size(), fsm.GetFsm().Letters(), fsm.GetTotalCount(), fsm.GetFsm().Initial(), 1); - BuildScanner(fsm.GetFsm(), *this); - BuildFinals(fsm); - } - - typedef typename Scanner::ScannerRowHeader ScannerRowHeader; - typedef typename Scanner::Action Action; - - class State { - public: - typedef TVector<size_t>::const_iterator IdsIterator; - - State() : ScannerState(0) {} - - State(const typename Scanner::State& otherState) : ScannerState(otherState) {} - - void GetMatchedRegexpsIds() { - MatchedRegexpsIds.clear(); - for (size_t i = 0; i < MatchedRegexps.size(); i++) { - if (MatchedRegexps[i]) { - MatchedRegexpsIds.push_back(i); - } - } - } - - IdsIterator IdsBegin() const { - return MatchedRegexpsIds.cbegin(); - } - - IdsIterator IdsEnd() const { - return MatchedRegexpsIds.cend(); - } - - bool operator==(const State& other) const { - return ScannerState == other.ScannerState && MatchedRegexps == other.MatchedRegexps; - } - - bool operator!=(const State& other) const { - return ScannerState != other.ScannerState || MatchedRegexps != other.MatchedRegexps; - } - - size_t Result(size_t regexp_id) const { - return MatchedRegexps[regexp_id]; - } - - void Save(yostream* s) const { - SavePodType(s, Pire::Header(5, sizeof(size_t))); - Impl::AlignSave(s, sizeof(Pire::Header)); - auto stateSizePair = ymake_pair(ScannerState, MatchedRegexps.size()); - SavePodType(s, stateSizePair); - Impl::AlignSave(s, sizeof(ypair<size_t, size_t>)); - Y_ASSERT(0); - } - - void Load(yistream* s) { - Impl::ValidateHeader(s, 5, sizeof(size_t)); - ypair<size_t, size_t> stateSizePair; - LoadPodType(s, stateSizePair); - Impl::AlignLoad(s, sizeof(ypair<size_t, size_t>)); - ScannerState = stateSizePair.first; - MatchedRegexps.clear(); - MatchedRegexps.resize(stateSizePair.second); - } - - private: - TVector<size_t> MatchedRegexpsIds; - typename Scanner::State ScannerState; - TVector<size_t> MatchedRegexps; - - friend class HalfFinalScanner<Relocation, Shortcutting>; - }; - - - /// Checks whether specified state is in any of the final sets - bool Final(const State& state) const { return Scanner::Final(state.ScannerState); } - - /// Checks whether specified state is 'dead' (i.e. scanner will never - /// reach any final state from current one) - bool Dead(const State& state) const { return Scanner::Dead(state.ScannerState); } - - typedef ypair<typename State::IdsIterator, typename State::IdsIterator> AcceptedRegexpsType; - - AcceptedRegexpsType AcceptedRegexps(State& state) const { - state.GetMatchedRegexpsIds(); - return ymake_pair(state.IdsBegin(), state.IdsEnd()); - } - - /// Returns an initial state for this scanner - void Initialize(State& state) const { - state.ScannerState = Scanner::m.initial; - state.MatchedRegexps.clear(); - state.MatchedRegexps.resize(Scanner::m.regexpsCount); - TakeAction(state, 0); - } - - Action NextTranslated(State& state, Char letter) const { - return Scanner::NextTranslated(state.ScannerState, letter); - } - - /// Handles one character - Action Next(State& state, Char c) const { - return Scanner::NextTranslated(state.ScannerState, Scanner::Translate(c)); - } - - void TakeAction(State& state, Action) const { - if (Final(state)) { - size_t idx = StateIndex(state); - const size_t *it = Scanner::m_final + Scanner::m_finalIndex[idx]; - while (*it != Scanner::End) { - state.MatchedRegexps[*it]++; - ++it; - } - } - } - - HalfFinalScanner(const HalfFinalScanner& s) : Scanner(s) {} - - HalfFinalScanner(const Scanner& s) : Scanner(s) {} - - HalfFinalScanner(HalfFinalScanner&& s) : Scanner(s) {} - - HalfFinalScanner(Scanner&& s) : Scanner(s) {} - - template<class AnotherRelocation> - HalfFinalScanner(const HalfFinalScanner<AnotherRelocation, Shortcutting>& s) - : Scanner(s) {} - - template<class AnotherRelocation> - HalfFinalScanner(const Impl::Scanner<AnotherRelocation, Shortcutting>& s) : Scanner(s) {} - - void Swap(HalfFinalScanner& s) { - Scanner::Swap(s); - } - - HalfFinalScanner& operator=(const HalfFinalScanner& s) { - HalfFinalScanner(s).Swap(*this); - return *this; - } - - size_t StateIndex(const State& s) const { - return Scanner::StateIndex(s.ScannerState); - } - - /** - * Agglutinates two scanners together, producing a larger scanner. - * Checking a string against that scanner effectively checks them against both agglutinated regexps - * (detailed information about matched regexps can be obtained with AcceptedRegexps()). - * - * Returns default-constructed scanner in case of failure - * (consult Scanner::Empty() to find out whether the operation was successful). - */ - static HalfFinalScanner Glue(const HalfFinalScanner& a, const HalfFinalScanner& b, size_t maxSize = 0) { - return Scanner::Glue(a, b, maxSize); - } - - ScannerRowHeader& Header(const State& s) { return Scanner::Header(s.ScannerState); } - - const ScannerRowHeader& Header(const State& s) const { return Scanner::Header(s.ScannerState); } + typedef typename Impl::Scanner<Relocation, Shortcutting> Scanner; + + HalfFinalScanner() : Scanner() {} + + explicit HalfFinalScanner(Fsm fsm_, size_t distance = 0) { + if (distance) { + fsm_ = CreateApproxFsm(fsm_, distance); + } + HalfFinalFsm fsm(fsm_); + fsm.MakeScanner(); + Scanner::Init(fsm.GetFsm().Size(), fsm.GetFsm().Letters(), fsm.GetFsm().Finals().size(), fsm.GetFsm().Initial(), 1); + BuildScanner(fsm.GetFsm(), *this); + } + + explicit HalfFinalScanner(const HalfFinalFsm& fsm) { + Scanner::Init(fsm.GetFsm().Size(), fsm.GetFsm().Letters(), fsm.GetTotalCount(), fsm.GetFsm().Initial(), 1); + BuildScanner(fsm.GetFsm(), *this); + BuildFinals(fsm); + } + + typedef typename Scanner::ScannerRowHeader ScannerRowHeader; + typedef typename Scanner::Action Action; + + class State { + public: + typedef TVector<size_t>::const_iterator IdsIterator; + + State() : ScannerState(0) {} + + State(const typename Scanner::State& otherState) : ScannerState(otherState) {} + + void GetMatchedRegexpsIds() { + MatchedRegexpsIds.clear(); + for (size_t i = 0; i < MatchedRegexps.size(); i++) { + if (MatchedRegexps[i]) { + MatchedRegexpsIds.push_back(i); + } + } + } + + IdsIterator IdsBegin() const { + return MatchedRegexpsIds.cbegin(); + } + + IdsIterator IdsEnd() const { + return MatchedRegexpsIds.cend(); + } + + bool operator==(const State& other) const { + return ScannerState == other.ScannerState && MatchedRegexps == other.MatchedRegexps; + } + + bool operator!=(const State& other) const { + return ScannerState != other.ScannerState || MatchedRegexps != other.MatchedRegexps; + } + + size_t Result(size_t regexp_id) const { + return MatchedRegexps[regexp_id]; + } + + void Save(yostream* s) const { + SavePodType(s, Pire::Header(5, sizeof(size_t))); + Impl::AlignSave(s, sizeof(Pire::Header)); + auto stateSizePair = ymake_pair(ScannerState, MatchedRegexps.size()); + SavePodType(s, stateSizePair); + Impl::AlignSave(s, sizeof(ypair<size_t, size_t>)); + Y_ASSERT(0); + } + + void Load(yistream* s) { + Impl::ValidateHeader(s, 5, sizeof(size_t)); + ypair<size_t, size_t> stateSizePair; + LoadPodType(s, stateSizePair); + Impl::AlignLoad(s, sizeof(ypair<size_t, size_t>)); + ScannerState = stateSizePair.first; + MatchedRegexps.clear(); + MatchedRegexps.resize(stateSizePair.second); + } + + private: + TVector<size_t> MatchedRegexpsIds; + typename Scanner::State ScannerState; + TVector<size_t> MatchedRegexps; + + friend class HalfFinalScanner<Relocation, Shortcutting>; + }; + + + /// Checks whether specified state is in any of the final sets + bool Final(const State& state) const { return Scanner::Final(state.ScannerState); } + + /// Checks whether specified state is 'dead' (i.e. scanner will never + /// reach any final state from current one) + bool Dead(const State& state) const { return Scanner::Dead(state.ScannerState); } + + typedef ypair<typename State::IdsIterator, typename State::IdsIterator> AcceptedRegexpsType; + + AcceptedRegexpsType AcceptedRegexps(State& state) const { + state.GetMatchedRegexpsIds(); + return ymake_pair(state.IdsBegin(), state.IdsEnd()); + } + + /// Returns an initial state for this scanner + void Initialize(State& state) const { + state.ScannerState = Scanner::m.initial; + state.MatchedRegexps.clear(); + state.MatchedRegexps.resize(Scanner::m.regexpsCount); + TakeAction(state, 0); + } + + Action NextTranslated(State& state, Char letter) const { + return Scanner::NextTranslated(state.ScannerState, letter); + } + + /// Handles one character + Action Next(State& state, Char c) const { + return Scanner::NextTranslated(state.ScannerState, Scanner::Translate(c)); + } + + void TakeAction(State& state, Action) const { + if (Final(state)) { + size_t idx = StateIndex(state); + const size_t *it = Scanner::m_final + Scanner::m_finalIndex[idx]; + while (*it != Scanner::End) { + state.MatchedRegexps[*it]++; + ++it; + } + } + } + + HalfFinalScanner(const HalfFinalScanner& s) : Scanner(s) {} + + HalfFinalScanner(const Scanner& s) : Scanner(s) {} + + HalfFinalScanner(HalfFinalScanner&& s) : Scanner(s) {} + + HalfFinalScanner(Scanner&& s) : Scanner(s) {} + + template<class AnotherRelocation> + HalfFinalScanner(const HalfFinalScanner<AnotherRelocation, Shortcutting>& s) + : Scanner(s) {} + + template<class AnotherRelocation> + HalfFinalScanner(const Impl::Scanner<AnotherRelocation, Shortcutting>& s) : Scanner(s) {} + + void Swap(HalfFinalScanner& s) { + Scanner::Swap(s); + } + + HalfFinalScanner& operator=(const HalfFinalScanner& s) { + HalfFinalScanner(s).Swap(*this); + return *this; + } + + size_t StateIndex(const State& s) const { + return Scanner::StateIndex(s.ScannerState); + } + + /** + * Agglutinates two scanners together, producing a larger scanner. + * Checking a string against that scanner effectively checks them against both agglutinated regexps + * (detailed information about matched regexps can be obtained with AcceptedRegexps()). + * + * Returns default-constructed scanner in case of failure + * (consult Scanner::Empty() to find out whether the operation was successful). + */ + static HalfFinalScanner Glue(const HalfFinalScanner& a, const HalfFinalScanner& b, size_t maxSize = 0) { + return Scanner::Glue(a, b, maxSize); + } + + ScannerRowHeader& Header(const State& s) { return Scanner::Header(s.ScannerState); } + + const ScannerRowHeader& Header(const State& s) const { return Scanner::Header(s.ScannerState); } private: - void BuildFinals(const HalfFinalFsm& fsm) { - Y_ASSERT(Scanner::m_buffer); - Y_ASSERT(fsm.GetFsm().Size() == Scanner::Size()); - auto finalWriter = Scanner::m_final; - for (size_t state = 0; state < Scanner::Size(); ++state) { - Scanner::m_finalIndex[state] = finalWriter - Scanner::m_final; - for (size_t i = 0; i < fsm.GetCount(state); i++) { - *finalWriter++ = 0; - } - *finalWriter++ = static_cast<size_t>(-1); - } - } - - template<class Scanner> - friend void Pire::BuildScanner(const Fsm&, Scanner&); - - typedef State InternalState; // Needed for agglutination + void BuildFinals(const HalfFinalFsm& fsm) { + Y_ASSERT(Scanner::m_buffer); + Y_ASSERT(fsm.GetFsm().Size() == Scanner::Size()); + auto finalWriter = Scanner::m_final; + for (size_t state = 0; state < Scanner::Size(); ++state) { + Scanner::m_finalIndex[state] = finalWriter - Scanner::m_final; + for (size_t i = 0; i < fsm.GetCount(state); i++) { + *finalWriter++ = 0; + } + *finalWriter++ = static_cast<size_t>(-1); + } + } + + template<class Scanner> + friend void Pire::BuildScanner(const Fsm&, Scanner&); + + typedef State InternalState; // Needed for agglutination }; } @@ -243,13 +243,13 @@ typedef Impl::HalfFinalScanner<Impl::Nonrelocatable, Impl::NoShortcuts> Nonreloc namespace std { - inline void swap(Pire::HalfFinalScanner& a, Pire::HalfFinalScanner& b) { - a.Swap(b); - } + inline void swap(Pire::HalfFinalScanner& a, Pire::HalfFinalScanner& b) { + a.Swap(b); + } - inline void swap(Pire::NonrelocHalfFinalScanner& a, Pire::NonrelocHalfFinalScanner& b) { - a.Swap(b); - } + inline void swap(Pire::NonrelocHalfFinalScanner& a, Pire::NonrelocHalfFinalScanner& b) { + a.Swap(b); + } } #endif diff --git a/library/cpp/regex/pire/pire/scanners/loaded.h b/library/cpp/regex/pire/pire/scanners/loaded.h index 1c5c99c9be9..c505b1c9efb 100644 --- a/library/cpp/regex/pire/pire/scanners/loaded.h +++ b/library/cpp/regex/pire/pire/scanners/loaded.h @@ -53,237 +53,237 @@ namespace Pire { */ class LoadedScanner { public: - typedef ui8 Letter; - typedef ui32 Action; - typedef ui8 Tag; + typedef ui8 Letter; + typedef ui32 Action; + typedef ui8 Tag; - typedef size_t InternalState; + typedef size_t InternalState; - union Transition { - size_t raw; // alignment hint for compiler - struct { - ui32 shift; - Action action; - }; - }; + union Transition { + size_t raw; // alignment hint for compiler + struct { + ui32 shift; + Action action; + }; + }; - // Override in subclass, if neccessary - enum { - FinalFlag = 0, - DeadFlag = 0 - }; + // Override in subclass, if neccessary + enum { + FinalFlag = 0, + DeadFlag = 0 + }; - static const size_t MAX_RE_COUNT = 16; + static const size_t MAX_RE_COUNT = 16; protected: - LoadedScanner() { Alias(Null()); } - - LoadedScanner(const LoadedScanner& s): m(s.m) - { - if (s.m_buffer) { - m_buffer = BufferType(new char [BufSize()]); - memcpy(m_buffer.Get(), s.m_buffer.Get(), BufSize()); - Markup(m_buffer.Get()); - m.initial = (InternalState)m_jumps + (s.m.initial - (InternalState)s.m_jumps); - } else { - Alias(s); - } - } - - void Swap(LoadedScanner& s) - { - DoSwap(m_buffer, s.m_buffer); - DoSwap(m.statesCount, s.m.statesCount); - DoSwap(m.lettersCount, s.m.lettersCount); - DoSwap(m.regexpsCount, s.m.regexpsCount); - DoSwap(m.initial, s.m.initial); - DoSwap(m_letters, s.m_letters); - DoSwap(m_jumps, s.m_jumps); - DoSwap(m_tags, s.m_tags); - } - - LoadedScanner& operator = (const LoadedScanner& s) { LoadedScanner(s).Swap(*this); return *this; } - LoadedScanner (LoadedScanner&& other) : LoadedScanner() { - Swap(other); - } - LoadedScanner& operator=(LoadedScanner&& other) { - Swap(other); - return *this; - } + LoadedScanner() { Alias(Null()); } + + LoadedScanner(const LoadedScanner& s): m(s.m) + { + if (s.m_buffer) { + m_buffer = BufferType(new char [BufSize()]); + memcpy(m_buffer.Get(), s.m_buffer.Get(), BufSize()); + Markup(m_buffer.Get()); + m.initial = (InternalState)m_jumps + (s.m.initial - (InternalState)s.m_jumps); + } else { + Alias(s); + } + } + + void Swap(LoadedScanner& s) + { + DoSwap(m_buffer, s.m_buffer); + DoSwap(m.statesCount, s.m.statesCount); + DoSwap(m.lettersCount, s.m.lettersCount); + DoSwap(m.regexpsCount, s.m.regexpsCount); + DoSwap(m.initial, s.m.initial); + DoSwap(m_letters, s.m_letters); + DoSwap(m_jumps, s.m_jumps); + DoSwap(m_tags, s.m_tags); + } + + LoadedScanner& operator = (const LoadedScanner& s) { LoadedScanner(s).Swap(*this); return *this; } + LoadedScanner (LoadedScanner&& other) : LoadedScanner() { + Swap(other); + } + LoadedScanner& operator=(LoadedScanner&& other) { + Swap(other); + return *this; + } public: - size_t Size() const { return m.statesCount; } - - bool Empty() const { return m_jumps == Null().m_jumps; } - - size_t RegexpsCount() const { return Empty() ? 0 : m.regexpsCount; } - - size_t LettersCount() const { return m.lettersCount; } - - const void* Mmap(const void* ptr, size_t size) { - return Mmap(ptr, size, nullptr); - } - - const void* Mmap(const void* ptr, size_t size, ui32* type) - { - Impl::CheckAlign(ptr); - LoadedScanner s; - const size_t* p = reinterpret_cast<const size_t*>(ptr); - Header header = Impl::ValidateHeader(p, size, ScannerIOTypes::LoadedScanner, sizeof(s.m)); - if (type) { - *type = header.Type; - } - - Locals* locals; - Impl::MapPtr(locals, 1, p, size); - memcpy(&s.m, locals, sizeof(s.m)); - - Impl::MapPtr(s.m_letters, MaxChar, p, size); - Impl::MapPtr(s.m_jumps, s.m.statesCount * s.m.lettersCount, p, size); - if (header.Version == Header::RE_VERSION_WITH_MACTIONS) { - Action* actions = 0; - Impl::MapPtr(actions, s.m.statesCount * s.m.lettersCount, p, size); - } - Impl::MapPtr(s.m_tags, s.m.statesCount, p, size); - - s.m.initial += reinterpret_cast<size_t>(s.m_jumps); - Swap(s); - - return (const void*) p; - } - - void Save(yostream*, ui32 type) const; - void Save(yostream*) const; - void Load(yistream*, ui32* type); - void Load(yistream*); - - template<class Eq> - void Init(size_t states, const Partition<Char, Eq>& letters, size_t startState, size_t regexpsCount = 1) - { - m.statesCount = states; - m.lettersCount = letters.Size(); - m.regexpsCount = regexpsCount; - m_buffer = BufferType(new char[BufSize()]); - memset(m_buffer.Get(), 0, BufSize()); - Markup(m_buffer.Get()); - - m.initial = reinterpret_cast<size_t>(m_jumps + startState * m.lettersCount); - - // Build letter translation table - Fill(m_letters, m_letters + MaxChar, 0); - for (auto&& letter : letters) - for (auto&& character : letter.second.second) - m_letters[character] = letter.second.first; - } - - size_t StateSize() const - { - return m.lettersCount * sizeof(*m_jumps); - } - - size_t TransitionIndex(size_t state, Char c) const - { - return state * m.lettersCount + m_letters[c]; - } - - void SetJump(size_t oldState, Char c, size_t newState, Action action) - { - Y_ASSERT(m_buffer); - Y_ASSERT(oldState < m.statesCount); - Y_ASSERT(newState < m.statesCount); - - size_t shift = (newState - oldState) * StateSize(); - Transition tr; - tr.shift = (ui32)shift; - tr.action = action; - m_jumps[TransitionIndex(oldState, c)] = tr; - } - - Action RemapAction(Action action) { return action; } - - void SetInitial(size_t state) { Y_ASSERT(m_buffer); m.initial = reinterpret_cast<size_t>(m_jumps + state * m.lettersCount); } - void SetTag(size_t state, Tag tag) { Y_ASSERT(m_buffer); m_tags[state] = tag; } - void FinishBuild() {} - - size_t StateIdx(InternalState s) const - { - return (reinterpret_cast<Transition*>(s) - m_jumps) / m.lettersCount; - } - - i64 SignExtend(i32 i) const { return i; } - - size_t BufSize() const - { - return - MaxChar * sizeof(*m_letters) - + m.statesCount * StateSize() - + m.statesCount * sizeof(*m_tags) - ; - } + size_t Size() const { return m.statesCount; } + + bool Empty() const { return m_jumps == Null().m_jumps; } + + size_t RegexpsCount() const { return Empty() ? 0 : m.regexpsCount; } + + size_t LettersCount() const { return m.lettersCount; } + + const void* Mmap(const void* ptr, size_t size) { + return Mmap(ptr, size, nullptr); + } + + const void* Mmap(const void* ptr, size_t size, ui32* type) + { + Impl::CheckAlign(ptr); + LoadedScanner s; + const size_t* p = reinterpret_cast<const size_t*>(ptr); + Header header = Impl::ValidateHeader(p, size, ScannerIOTypes::LoadedScanner, sizeof(s.m)); + if (type) { + *type = header.Type; + } + + Locals* locals; + Impl::MapPtr(locals, 1, p, size); + memcpy(&s.m, locals, sizeof(s.m)); + + Impl::MapPtr(s.m_letters, MaxChar, p, size); + Impl::MapPtr(s.m_jumps, s.m.statesCount * s.m.lettersCount, p, size); + if (header.Version == Header::RE_VERSION_WITH_MACTIONS) { + Action* actions = 0; + Impl::MapPtr(actions, s.m.statesCount * s.m.lettersCount, p, size); + } + Impl::MapPtr(s.m_tags, s.m.statesCount, p, size); + + s.m.initial += reinterpret_cast<size_t>(s.m_jumps); + Swap(s); + + return (const void*) p; + } + + void Save(yostream*, ui32 type) const; + void Save(yostream*) const; + void Load(yistream*, ui32* type); + void Load(yistream*); + + template<class Eq> + void Init(size_t states, const Partition<Char, Eq>& letters, size_t startState, size_t regexpsCount = 1) + { + m.statesCount = states; + m.lettersCount = letters.Size(); + m.regexpsCount = regexpsCount; + m_buffer = BufferType(new char[BufSize()]); + memset(m_buffer.Get(), 0, BufSize()); + Markup(m_buffer.Get()); + + m.initial = reinterpret_cast<size_t>(m_jumps + startState * m.lettersCount); + + // Build letter translation table + Fill(m_letters, m_letters + MaxChar, 0); + for (auto&& letter : letters) + for (auto&& character : letter.second.second) + m_letters[character] = letter.second.first; + } + + size_t StateSize() const + { + return m.lettersCount * sizeof(*m_jumps); + } + + size_t TransitionIndex(size_t state, Char c) const + { + return state * m.lettersCount + m_letters[c]; + } + + void SetJump(size_t oldState, Char c, size_t newState, Action action) + { + Y_ASSERT(m_buffer); + Y_ASSERT(oldState < m.statesCount); + Y_ASSERT(newState < m.statesCount); + + size_t shift = (newState - oldState) * StateSize(); + Transition tr; + tr.shift = (ui32)shift; + tr.action = action; + m_jumps[TransitionIndex(oldState, c)] = tr; + } + + Action RemapAction(Action action) { return action; } + + void SetInitial(size_t state) { Y_ASSERT(m_buffer); m.initial = reinterpret_cast<size_t>(m_jumps + state * m.lettersCount); } + void SetTag(size_t state, Tag tag) { Y_ASSERT(m_buffer); m_tags[state] = tag; } + void FinishBuild() {} + + size_t StateIdx(InternalState s) const + { + return (reinterpret_cast<Transition*>(s) - m_jumps) / m.lettersCount; + } + + i64 SignExtend(i32 i) const { return i; } + + size_t BufSize() const + { + return + MaxChar * sizeof(*m_letters) + + m.statesCount * StateSize() + + m.statesCount * sizeof(*m_tags) + ; + } protected: - static const Action IncrementMask = (1 << MAX_RE_COUNT) - 1; - static const Action ResetMask = IncrementMask << MAX_RE_COUNT; + static const Action IncrementMask = (1 << MAX_RE_COUNT) - 1; + static const Action ResetMask = IncrementMask << MAX_RE_COUNT; - // TODO: maybe, put fields in private section and provide data accessors + // TODO: maybe, put fields in private section and provide data accessors - struct Locals { - ui32 statesCount; - ui32 lettersCount; - ui32 regexpsCount; - size_t initial; - } m; + struct Locals { + ui32 statesCount; + ui32 lettersCount; + ui32 regexpsCount; + size_t initial; + } m; - using BufferType = TArrayHolder<char>; - BufferType m_buffer; + using BufferType = TArrayHolder<char>; + BufferType m_buffer; - Letter* m_letters; - Transition* m_jumps; - Tag* m_tags; + Letter* m_letters; + Transition* m_jumps; + Tag* m_tags; - virtual ~LoadedScanner(); + virtual ~LoadedScanner(); private: - explicit LoadedScanner(Fsm& fsm, size_t distance = 0) - { - if (distance) { - fsm = CreateApproxFsm(fsm, distance); - } - fsm.Canonize(); - Init(fsm.Size(), fsm.Letters(), fsm.Initial()); - BuildScanner(fsm, *this); - } - - inline static const LoadedScanner& Null() - { - static const LoadedScanner n = Fsm::MakeFalse().Compile<LoadedScanner>(); - return n; - } - - void Markup(void* buf) - { - m_letters = reinterpret_cast<Letter*>(buf); - m_jumps = reinterpret_cast<Transition*>(m_letters + MaxChar); - m_tags = reinterpret_cast<Tag*>(m_jumps + m.statesCount * m.lettersCount); - } - - void Alias(const LoadedScanner& s) - { - memcpy(&m, &s.m, sizeof(m)); - m_buffer = 0; - m_letters = s.m_letters; - m_jumps = s.m_jumps; - m_tags = s.m_tags; - } - - template<class Eq> - LoadedScanner(size_t states, const Partition<Char, Eq>& letters, size_t startState, size_t regexpsCount = 1) - { - Init(states, letters, startState, regexpsCount); - } - - friend class Fsm; + explicit LoadedScanner(Fsm& fsm, size_t distance = 0) + { + if (distance) { + fsm = CreateApproxFsm(fsm, distance); + } + fsm.Canonize(); + Init(fsm.Size(), fsm.Letters(), fsm.Initial()); + BuildScanner(fsm, *this); + } + + inline static const LoadedScanner& Null() + { + static const LoadedScanner n = Fsm::MakeFalse().Compile<LoadedScanner>(); + return n; + } + + void Markup(void* buf) + { + m_letters = reinterpret_cast<Letter*>(buf); + m_jumps = reinterpret_cast<Transition*>(m_letters + MaxChar); + m_tags = reinterpret_cast<Tag*>(m_jumps + m.statesCount * m.lettersCount); + } + + void Alias(const LoadedScanner& s) + { + memcpy(&m, &s.m, sizeof(m)); + m_buffer = 0; + m_letters = s.m_letters; + m_jumps = s.m_jumps; + m_tags = s.m_tags; + } + + template<class Eq> + LoadedScanner(size_t states, const Partition<Char, Eq>& letters, size_t startState, size_t regexpsCount = 1) + { + Init(states, letters, startState, regexpsCount); + } + + friend class Fsm; }; inline LoadedScanner::~LoadedScanner() = default; diff --git a/library/cpp/regex/pire/pire/scanners/multi.h b/library/cpp/regex/pire/pire/scanners/multi.h index 172f700ec92..ba45b86d940 100644 --- a/library/cpp/regex/pire/pire/scanners/multi.h +++ b/library/cpp/regex/pire/pire/scanners/multi.h @@ -44,42 +44,42 @@ namespace Pire { namespace Impl { - inline static ssize_t SignExtend(i32 i) { return i; } - template<class T> - class ScannerGlueCommon; + inline static ssize_t SignExtend(i32 i) { return i; } + template<class T> + class ScannerGlueCommon; - template<class T> - class ScannerGlueTask; + template<class T> + class ScannerGlueTask; - // This strategy allows to mmap() saved representation of a scanner. This is achieved by - // storing shifts instead of addresses in the transition table. - struct Relocatable { - static const size_t Signature = 1; - // Please note that Transition size is hardcoded as 32 bits. - // This limits size of transition table to 4G, but compresses - // it twice compared to 64-bit transitions. In future Transition - // can be made a template parameter if this is a concern. - typedef ui32 Transition; + // This strategy allows to mmap() saved representation of a scanner. This is achieved by + // storing shifts instead of addresses in the transition table. + struct Relocatable { + static const size_t Signature = 1; + // Please note that Transition size is hardcoded as 32 bits. + // This limits size of transition table to 4G, but compresses + // it twice compared to 64-bit transitions. In future Transition + // can be made a template parameter if this is a concern. + typedef ui32 Transition; - typedef const void* RetvalForMmap; + typedef const void* RetvalForMmap; - static size_t Go(size_t state, Transition shift) { return state + SignExtend(shift); } - static Transition Diff(size_t from, size_t to) { return static_cast<Transition>(to - from); } - }; + static size_t Go(size_t state, Transition shift) { return state + SignExtend(shift); } + static Transition Diff(size_t from, size_t to) { return static_cast<Transition>(to - from); } + }; - // With this strategy the transition table stores addresses. This makes the scanner faster - // compared to mmap()-ed - struct Nonrelocatable { - static const size_t Signature = 2; - typedef size_t Transition; + // With this strategy the transition table stores addresses. This makes the scanner faster + // compared to mmap()-ed + struct Nonrelocatable { + static const size_t Signature = 2; + typedef size_t Transition; - // Generates a compile-time error if Scanner<Nonrelocatable>::Mmap() - // (which is unsupported) is mistakenly called - typedef struct {} RetvalForMmap; + // Generates a compile-time error if Scanner<Nonrelocatable>::Mmap() + // (which is unsupported) is mistakenly called + typedef struct {} RetvalForMmap; - static size_t Go(size_t /*state*/, Transition shift) { return shift; } - static Transition Diff(size_t /*from*/, size_t to) { return to; } - }; + static size_t Go(size_t /*state*/, Transition shift) { return shift; } + static Transition Diff(size_t /*from*/, size_t to) { return to; } + }; // Scanner implementation parametrized by @@ -88,728 +88,728 @@ namespace Impl { template<class Relocation, class Shortcutting> class Scanner { protected: - enum { - FinalFlag = 1, - DeadFlag = 2, - Flags = FinalFlag | DeadFlag - }; + enum { + FinalFlag = 1, + DeadFlag = 2, + Flags = FinalFlag | DeadFlag + }; - static const size_t End = static_cast<size_t>(-1); + static const size_t End = static_cast<size_t>(-1); public: - typedef typename Relocation::Transition Transition; - - typedef ui16 Letter; - typedef ui32 Action; - typedef ui8 Tag; - - /// Some properties of the particular state. - struct CommonRowHeader { - size_t Flags; ///< Holds FinalFlag, DeadFlag, etc... - - CommonRowHeader(): Flags(0) {} - - template <class OtherCommonRowHeader> - CommonRowHeader& operator =(const OtherCommonRowHeader& other) - { - Flags = other.Flags; - return *this; - } - }; - - typedef typename Shortcutting::template ExtendedRowHeader<Scanner> ScannerRowHeader; - - Scanner() { Alias(Null()); } - - explicit Scanner(Fsm& fsm, size_t distance = 0) - { - if (distance) { - fsm = CreateApproxFsm(fsm, distance); - } - fsm.Canonize(); - Init(fsm.Size(), fsm.Letters(), fsm.Finals().size(), fsm.Initial(), 1); - BuildScanner(fsm, *this); - } - - - size_t Size() const { return m.statesCount; } - bool Empty() const { return m_transitions == Null().m_transitions; } - - typedef size_t State; - - size_t RegexpsCount() const { return Empty() ? 0 : m.regexpsCount; } - size_t LettersCount() const { return m.lettersCount; } - - /// Checks whether specified state is in any of the final sets - bool Final(const State& state) const { return (Header(state).Common.Flags & FinalFlag) != 0; } - - /// Checks whether specified state is 'dead' (i.e. scanner will never - /// reach any final state from current one) - bool Dead(const State& state) const { return (Header(state).Common.Flags & DeadFlag) != 0; } - - ypair<const size_t*, const size_t*> AcceptedRegexps(const State& state) const - { - size_t idx = (state - reinterpret_cast<size_t>(m_transitions)) / - (RowSize() * sizeof(Transition)); - const size_t* b = m_final + m_finalIndex[idx]; - const size_t* e = b; - while (*e != End) - ++e; - return ymake_pair(b, e); - } - - /// Returns an initial state for this scanner - void Initialize(State& state) const { state = m.initial; } - - Char Translate(Char ch) const - { - return m_letters[static_cast<size_t>(ch)]; - } - - /// Handles one letter - Action NextTranslated(State& state, Char letter) const - { - PIRE_IFDEBUG( - Y_ASSERT(state >= (size_t)m_transitions); - Y_ASSERT(state < (size_t)(m_transitions + RowSize()*Size())); - Y_ASSERT((state - (size_t)m_transitions) % (RowSize()*sizeof(Transition)) == 0); - ); - - state = Relocation::Go(state, reinterpret_cast<const Transition*>(state)[letter]); - - PIRE_IFDEBUG( - Y_ASSERT(state >= (size_t)m_transitions); - Y_ASSERT(state < (size_t)(m_transitions + RowSize()*Size())); - Y_ASSERT((state - (size_t)m_transitions) % (RowSize()*sizeof(Transition)) == 0); - ); - - return 0; - } - - /// Handles one character - Action Next(State& state, Char c) const - { - return NextTranslated(state, Translate(c)); - } - - void TakeAction(State&, Action) const {} - - Scanner(const Scanner& s): m(s.m) - { - if (!s.m_buffer) { - // Empty or mmap()-ed scanner - Alias(s); - } else { - // In-memory scanner - DeepCopy(s); - } - } - - Scanner(Scanner&& s) - { - Alias(Null()); - Swap(s); - } - - template<class AnotherRelocation> - Scanner(const Scanner<AnotherRelocation, Shortcutting>& s) - { - if (s.Empty()) - Alias(Null()); - else - DeepCopy(s); - } - - void Swap(Scanner& s) - { - Y_ASSERT(m.relocationSignature == s.m.relocationSignature); - Y_ASSERT(m.shortcuttingSignature == s.m.shortcuttingSignature); - DoSwap(m_buffer, s.m_buffer); - DoSwap(m.statesCount, s.m.statesCount); - DoSwap(m.lettersCount, s.m.lettersCount); - DoSwap(m.regexpsCount, s.m.regexpsCount); - DoSwap(m.initial, s.m.initial); - DoSwap(m_letters, s.m_letters); - DoSwap(m.finalTableSize, s.m.finalTableSize); - DoSwap(m_final, s.m_final); - DoSwap(m_finalIndex, s.m_finalIndex); - DoSwap(m_transitions, s.m_transitions); - } - - Scanner& operator = (const Scanner& s) { Scanner(s).Swap(*this); return *this; } - - /* - * Constructs the scanner from mmap()-ed memory range, returning a pointer - * to unconsumed part of the buffer. - */ - typename Relocation::RetvalForMmap Mmap(const void* ptr, size_t size) - { - Impl::CheckAlign(ptr, sizeof(size_t)); - Scanner s; - - const size_t* p = reinterpret_cast<const size_t*>(ptr); - Impl::ValidateHeader(p, size, ScannerIOTypes::Scanner, sizeof(m)); - if (size < sizeof(s.m)) - throw Error("EOF reached while mapping Pire::Scanner"); - - memcpy(&s.m, p, sizeof(s.m)); - if (s.m.relocationSignature != Relocation::Signature) - throw Error("Type mismatch while mmapping Pire::Scanner"); - Impl::AdvancePtr(p, size, sizeof(s.m)); - Impl::AlignPtr(p, size); - - if (Shortcutting::Signature != s.m.shortcuttingSignature) - throw Error("This scanner has different shortcutting type"); - - bool empty = *((const bool*) p); - Impl::AdvancePtr(p, size, sizeof(empty)); - Impl::AlignPtr(p, size); - - if (empty) - s.Alias(Null()); - else { - if (size < s.BufSize()) - throw Error("EOF reached while mapping NPire::Scanner"); - s.Markup(const_cast<size_t*>(p)); - Impl::AdvancePtr(p, size, s.BufSize()); - s.m.initial += reinterpret_cast<size_t>(s.m_transitions); - } - - Swap(s); - return Impl::AlignPtr(p, size); - } - - size_t StateIndex(State s) const - { - return (s - reinterpret_cast<size_t>(m_transitions)) / (RowSize() * sizeof(Transition)); - } - - /** - * Agglutinates two scanners together, producing a larger scanner. - * Checkig a string against that scanner effectively checks them against both agglutinated regexps - * (detailed information about matched regexps can be obtained with AcceptedRegexps()). - * - * Returns default-constructed scanner in case of failure - * (consult Scanner::Empty() to find out whether the operation was successful). - */ - static Scanner Glue(const Scanner& a, const Scanner& b, size_t maxSize = 0); - - // Returns the size of the memory buffer used (or required) by scanner. - size_t BufSize() const - { - return AlignUp( - MaxChar * sizeof(Letter) // Letters translation table - + m.finalTableSize * sizeof(size_t) // Final table - + m.statesCount * sizeof(size_t) // Final index - + RowSize() * m.statesCount * sizeof(Transition), // Transitions table - sizeof(size_t)); - } - - void Save(yostream*) const; - void Load(yistream*); - - ScannerRowHeader& Header(State s) { return *(ScannerRowHeader*) s; } - const ScannerRowHeader& Header(State s) const { return *(const ScannerRowHeader*) s; } + typedef typename Relocation::Transition Transition; + + typedef ui16 Letter; + typedef ui32 Action; + typedef ui8 Tag; + + /// Some properties of the particular state. + struct CommonRowHeader { + size_t Flags; ///< Holds FinalFlag, DeadFlag, etc... + + CommonRowHeader(): Flags(0) {} + + template <class OtherCommonRowHeader> + CommonRowHeader& operator =(const OtherCommonRowHeader& other) + { + Flags = other.Flags; + return *this; + } + }; + + typedef typename Shortcutting::template ExtendedRowHeader<Scanner> ScannerRowHeader; + + Scanner() { Alias(Null()); } + + explicit Scanner(Fsm& fsm, size_t distance = 0) + { + if (distance) { + fsm = CreateApproxFsm(fsm, distance); + } + fsm.Canonize(); + Init(fsm.Size(), fsm.Letters(), fsm.Finals().size(), fsm.Initial(), 1); + BuildScanner(fsm, *this); + } + + + size_t Size() const { return m.statesCount; } + bool Empty() const { return m_transitions == Null().m_transitions; } + + typedef size_t State; + + size_t RegexpsCount() const { return Empty() ? 0 : m.regexpsCount; } + size_t LettersCount() const { return m.lettersCount; } + + /// Checks whether specified state is in any of the final sets + bool Final(const State& state) const { return (Header(state).Common.Flags & FinalFlag) != 0; } + + /// Checks whether specified state is 'dead' (i.e. scanner will never + /// reach any final state from current one) + bool Dead(const State& state) const { return (Header(state).Common.Flags & DeadFlag) != 0; } + + ypair<const size_t*, const size_t*> AcceptedRegexps(const State& state) const + { + size_t idx = (state - reinterpret_cast<size_t>(m_transitions)) / + (RowSize() * sizeof(Transition)); + const size_t* b = m_final + m_finalIndex[idx]; + const size_t* e = b; + while (*e != End) + ++e; + return ymake_pair(b, e); + } + + /// Returns an initial state for this scanner + void Initialize(State& state) const { state = m.initial; } + + Char Translate(Char ch) const + { + return m_letters[static_cast<size_t>(ch)]; + } + + /// Handles one letter + Action NextTranslated(State& state, Char letter) const + { + PIRE_IFDEBUG( + Y_ASSERT(state >= (size_t)m_transitions); + Y_ASSERT(state < (size_t)(m_transitions + RowSize()*Size())); + Y_ASSERT((state - (size_t)m_transitions) % (RowSize()*sizeof(Transition)) == 0); + ); + + state = Relocation::Go(state, reinterpret_cast<const Transition*>(state)[letter]); + + PIRE_IFDEBUG( + Y_ASSERT(state >= (size_t)m_transitions); + Y_ASSERT(state < (size_t)(m_transitions + RowSize()*Size())); + Y_ASSERT((state - (size_t)m_transitions) % (RowSize()*sizeof(Transition)) == 0); + ); + + return 0; + } + + /// Handles one character + Action Next(State& state, Char c) const + { + return NextTranslated(state, Translate(c)); + } + + void TakeAction(State&, Action) const {} + + Scanner(const Scanner& s): m(s.m) + { + if (!s.m_buffer) { + // Empty or mmap()-ed scanner + Alias(s); + } else { + // In-memory scanner + DeepCopy(s); + } + } + + Scanner(Scanner&& s) + { + Alias(Null()); + Swap(s); + } + + template<class AnotherRelocation> + Scanner(const Scanner<AnotherRelocation, Shortcutting>& s) + { + if (s.Empty()) + Alias(Null()); + else + DeepCopy(s); + } + + void Swap(Scanner& s) + { + Y_ASSERT(m.relocationSignature == s.m.relocationSignature); + Y_ASSERT(m.shortcuttingSignature == s.m.shortcuttingSignature); + DoSwap(m_buffer, s.m_buffer); + DoSwap(m.statesCount, s.m.statesCount); + DoSwap(m.lettersCount, s.m.lettersCount); + DoSwap(m.regexpsCount, s.m.regexpsCount); + DoSwap(m.initial, s.m.initial); + DoSwap(m_letters, s.m_letters); + DoSwap(m.finalTableSize, s.m.finalTableSize); + DoSwap(m_final, s.m_final); + DoSwap(m_finalIndex, s.m_finalIndex); + DoSwap(m_transitions, s.m_transitions); + } + + Scanner& operator = (const Scanner& s) { Scanner(s).Swap(*this); return *this; } + + /* + * Constructs the scanner from mmap()-ed memory range, returning a pointer + * to unconsumed part of the buffer. + */ + typename Relocation::RetvalForMmap Mmap(const void* ptr, size_t size) + { + Impl::CheckAlign(ptr, sizeof(size_t)); + Scanner s; + + const size_t* p = reinterpret_cast<const size_t*>(ptr); + Impl::ValidateHeader(p, size, ScannerIOTypes::Scanner, sizeof(m)); + if (size < sizeof(s.m)) + throw Error("EOF reached while mapping Pire::Scanner"); + + memcpy(&s.m, p, sizeof(s.m)); + if (s.m.relocationSignature != Relocation::Signature) + throw Error("Type mismatch while mmapping Pire::Scanner"); + Impl::AdvancePtr(p, size, sizeof(s.m)); + Impl::AlignPtr(p, size); + + if (Shortcutting::Signature != s.m.shortcuttingSignature) + throw Error("This scanner has different shortcutting type"); + + bool empty = *((const bool*) p); + Impl::AdvancePtr(p, size, sizeof(empty)); + Impl::AlignPtr(p, size); + + if (empty) + s.Alias(Null()); + else { + if (size < s.BufSize()) + throw Error("EOF reached while mapping NPire::Scanner"); + s.Markup(const_cast<size_t*>(p)); + Impl::AdvancePtr(p, size, s.BufSize()); + s.m.initial += reinterpret_cast<size_t>(s.m_transitions); + } + + Swap(s); + return Impl::AlignPtr(p, size); + } + + size_t StateIndex(State s) const + { + return (s - reinterpret_cast<size_t>(m_transitions)) / (RowSize() * sizeof(Transition)); + } + + /** + * Agglutinates two scanners together, producing a larger scanner. + * Checkig a string against that scanner effectively checks them against both agglutinated regexps + * (detailed information about matched regexps can be obtained with AcceptedRegexps()). + * + * Returns default-constructed scanner in case of failure + * (consult Scanner::Empty() to find out whether the operation was successful). + */ + static Scanner Glue(const Scanner& a, const Scanner& b, size_t maxSize = 0); + + // Returns the size of the memory buffer used (or required) by scanner. + size_t BufSize() const + { + return AlignUp( + MaxChar * sizeof(Letter) // Letters translation table + + m.finalTableSize * sizeof(size_t) // Final table + + m.statesCount * sizeof(size_t) // Final index + + RowSize() * m.statesCount * sizeof(Transition), // Transitions table + sizeof(size_t)); + } + + void Save(yostream*) const; + void Load(yistream*); + + ScannerRowHeader& Header(State s) { return *(ScannerRowHeader*) s; } + const ScannerRowHeader& Header(State s) const { return *(const ScannerRowHeader*) s; } protected: - struct Locals { - ui32 statesCount; - ui32 lettersCount; - ui32 regexpsCount; - size_t initial; - ui32 finalTableSize; - size_t relocationSignature; - size_t shortcuttingSignature; - } m; - - using BufferType = TArrayHolder<char>; - BufferType m_buffer; - Letter* m_letters; - - size_t* m_final; - size_t* m_finalIndex; - - Transition* m_transitions; - - inline static const Scanner& Null() - { - static const Scanner n = Fsm::MakeFalse().Compile< Scanner<Relocation, Shortcutting> >(); - - return n; - } - - // Returns transition row size in Transition's. Row size_in bytes should be a multiple of sizeof(MaxSizeWord) - size_t RowSize() const { return AlignUp(m.lettersCount + HEADER_SIZE, sizeof(MaxSizeWord)/sizeof(Transition)); } - - static const size_t HEADER_SIZE = sizeof(ScannerRowHeader) / sizeof(Transition); - PIRE_STATIC_ASSERT(sizeof(ScannerRowHeader) % sizeof(Transition) == 0); - - template<class Eq> - void Init(size_t states, const Partition<Char, Eq>& letters, size_t finalStatesCount, size_t startState, size_t regexpsCount = 1) - { - std::memset(&m, 0, sizeof(m)); - m.relocationSignature = Relocation::Signature; - m.shortcuttingSignature = Shortcutting::Signature; - m.statesCount = states; - m.lettersCount = letters.Size(); - m.regexpsCount = regexpsCount; - m.finalTableSize = finalStatesCount + states; - - m_buffer = BufferType(new char[BufSize() + sizeof(size_t)]); - memset(m_buffer.Get(), 0, BufSize() + sizeof(size_t)); - Markup(AlignUp(m_buffer.Get(), sizeof(size_t))); - - for (size_t i = 0; i != Size(); ++i) - Header(IndexToState(i)) = ScannerRowHeader(); - - m.initial = reinterpret_cast<size_t>(m_transitions + startState * RowSize()); - - // Build letter translation table - for (auto&& letter : letters) - for (auto&& character : letter.second.second) - m_letters[character] = letter.second.first + HEADER_SIZE; - } - - /* - * Initializes pointers depending on buffer start, letters and states count - */ - void Markup(void* ptr) - { - Impl::CheckAlign(ptr, sizeof(size_t)); - m_letters = reinterpret_cast<Letter*>(ptr); - m_final = reinterpret_cast<size_t*>(m_letters + MaxChar); - m_finalIndex = reinterpret_cast<size_t*>(m_final + m.finalTableSize); - m_transitions = reinterpret_cast<Transition*>(m_finalIndex + m.statesCount); - } - - // Makes a shallow ("weak") copy of the given scanner. - // The copied scanner does not maintain lifetime of the original's entrails. - void Alias(const Scanner<Relocation, Shortcutting>& s) - { - memcpy(&m, &s.m, sizeof(m)); - m_buffer.Reset(); - m_letters = s.m_letters; - m_final = s.m_final; - m_finalIndex = s.m_finalIndex; - m_transitions = s.m_transitions; - } - - template<class AnotherRelocation> - void DeepCopy(const Scanner<AnotherRelocation, Shortcutting>& s) - { - // Don't want memory leaks, but we cannot free the buffer because there might be aliased instances - Y_ASSERT(m_buffer == nullptr); - - // Ensure that specializations of Scanner across different Relocations do not touch its Locals - static_assert(sizeof(m) == sizeof(s.m), "sizeof(m) == sizeof(s.m)"); - memcpy(&m, &s.m, sizeof(s.m)); - m.relocationSignature = Relocation::Signature; - m.shortcuttingSignature = Shortcutting::Signature; - m_buffer = BufferType(new char[BufSize() + sizeof(size_t)]); - std::memset(m_buffer.Get(), 0, BufSize() + sizeof(size_t)); - Markup(AlignUp(m_buffer.Get(), sizeof(size_t))); - - // Values in letter-to-leterclass table take into account row header size - for (size_t c = 0; c < MaxChar; ++c) { - m_letters[c] = s.m_letters[c] - s.HEADER_SIZE + HEADER_SIZE; - Y_ASSERT(c == Epsilon || m_letters[c] >= HEADER_SIZE); - Y_ASSERT(c == Epsilon || m_letters[c] < RowSize()); - } - memcpy(m_final, s.m_final, m.finalTableSize * sizeof(*m_final)); - memcpy(m_finalIndex, s.m_finalIndex, m.statesCount * sizeof(*m_finalIndex)); - - m.initial = IndexToState(s.StateIndex(s.m.initial)); - - for (size_t st = 0; st != m.statesCount; ++st) { - size_t oldstate = s.IndexToState(st); - size_t newstate = IndexToState(st); - Header(newstate) = s.Header(oldstate); - const typename Scanner<AnotherRelocation, Shortcutting>::Transition* os - = reinterpret_cast<const typename Scanner<AnotherRelocation, Shortcutting>::Transition*>(oldstate); - Transition* ns = reinterpret_cast<Transition*>(newstate); - - for (size_t let = 0; let != LettersCount(); ++let) { - size_t destIndex = s.StateIndex(AnotherRelocation::Go(oldstate, os[let + s.HEADER_SIZE])); - Transition tr = Relocation::Diff(newstate, IndexToState(destIndex)); - ns[let + HEADER_SIZE] = tr; - Y_ASSERT(Relocation::Go(newstate, tr) >= (size_t)m_transitions); - Y_ASSERT(Relocation::Go(newstate, tr) < (size_t)(m_transitions + RowSize()*Size())); - } - } - } - - - size_t IndexToState(size_t stateIndex) const - { - return reinterpret_cast<size_t>(m_transitions + stateIndex * RowSize()); - } - - void SetJump(size_t oldState, Char c, size_t newState, unsigned long /*payload*/ = 0) - { - Y_ASSERT(m_buffer); - Y_ASSERT(oldState < m.statesCount); - Y_ASSERT(newState < m.statesCount); - - m_transitions[oldState * RowSize() + m_letters[c]] - = Relocation::Diff(IndexToState(oldState), IndexToState(newState)); - } - - unsigned long RemapAction(unsigned long action) { return action; } - - void SetInitial(size_t state) - { - Y_ASSERT(m_buffer); - m.initial = IndexToState(state); - } - - void SetTag(size_t state, size_t value) - { - Y_ASSERT(m_buffer); - Header(IndexToState(state)).Common.Flags = value; - } - - // Fill shortcut masks for all the states - void BuildShortcuts() - { - Y_ASSERT(m_buffer); - - // Build the mapping from letter classes to characters - TVector< TVector<char> > letters(RowSize()); - for (unsigned ch = 0; ch != 1 << (sizeof(char)*8); ++ch) - letters[m_letters[ch]].push_back(ch); - - // Loop through all states in the transition table and - // check if it is possible to setup shortcuts - for (size_t i = 0; i != Size(); ++i) { - State st = IndexToState(i); - ScannerRowHeader& header = Header(st); - Shortcutting::SetNoExit(header); - size_t ind = 0; - size_t let = HEADER_SIZE; - for (; let != LettersCount() + HEADER_SIZE; ++let) { - // Check if the transition is not the same state - if (Relocation::Go(st, reinterpret_cast<const Transition*>(st)[let]) != st) { - if (ind + letters[let].size() > Shortcutting::ExitMaskCount) - break; - // For each character setup a mask - for (auto&& character : letters[let]) { - Shortcutting::SetMask(header, ind, character); - ++ind; - } - } - } - - if (let != LettersCount() + HEADER_SIZE) { - // Not enough space in ExitMasks, so reset all masks (which leads to bypassing the optimization) - Shortcutting::SetNoShortcut(header); - } - // Fill the rest of the shortcut masks with the last used mask - Shortcutting::FinishMasks(header, ind); - } - } - - // Fills final states table and builds shortcuts if possible - void FinishBuild() - { - Y_ASSERT(m_buffer); - auto finalWriter = m_final; - for (size_t state = 0; state != Size(); ++state) { - m_finalIndex[state] = finalWriter - m_final; - if (Header(IndexToState(state)).Common.Flags & FinalFlag) - *finalWriter++ = 0; - *finalWriter++ = static_cast<size_t>(-1); - } - BuildShortcuts(); - } - - size_t AcceptedRegexpsCount(size_t idx) const - { - const size_t* b = m_final + m_finalIndex[idx]; - const size_t* e = b; - while (*e != End) - ++e; - return e - b; - } - - template <class Scanner> - friend void Pire::BuildScanner(const Fsm&, Scanner&); - - typedef State InternalState; // Needed for agglutination - friend class ScannerGlueCommon<Scanner>; - friend class ScannerGlueTask<Scanner>; - - template<class AnotherRelocation, class AnotherShortcutting> - friend class Scanner; + struct Locals { + ui32 statesCount; + ui32 lettersCount; + ui32 regexpsCount; + size_t initial; + ui32 finalTableSize; + size_t relocationSignature; + size_t shortcuttingSignature; + } m; + + using BufferType = TArrayHolder<char>; + BufferType m_buffer; + Letter* m_letters; + + size_t* m_final; + size_t* m_finalIndex; + + Transition* m_transitions; + + inline static const Scanner& Null() + { + static const Scanner n = Fsm::MakeFalse().Compile< Scanner<Relocation, Shortcutting> >(); + + return n; + } + + // Returns transition row size in Transition's. Row size_in bytes should be a multiple of sizeof(MaxSizeWord) + size_t RowSize() const { return AlignUp(m.lettersCount + HEADER_SIZE, sizeof(MaxSizeWord)/sizeof(Transition)); } + + static const size_t HEADER_SIZE = sizeof(ScannerRowHeader) / sizeof(Transition); + PIRE_STATIC_ASSERT(sizeof(ScannerRowHeader) % sizeof(Transition) == 0); + + template<class Eq> + void Init(size_t states, const Partition<Char, Eq>& letters, size_t finalStatesCount, size_t startState, size_t regexpsCount = 1) + { + std::memset(&m, 0, sizeof(m)); + m.relocationSignature = Relocation::Signature; + m.shortcuttingSignature = Shortcutting::Signature; + m.statesCount = states; + m.lettersCount = letters.Size(); + m.regexpsCount = regexpsCount; + m.finalTableSize = finalStatesCount + states; + + m_buffer = BufferType(new char[BufSize() + sizeof(size_t)]); + memset(m_buffer.Get(), 0, BufSize() + sizeof(size_t)); + Markup(AlignUp(m_buffer.Get(), sizeof(size_t))); + + for (size_t i = 0; i != Size(); ++i) + Header(IndexToState(i)) = ScannerRowHeader(); + + m.initial = reinterpret_cast<size_t>(m_transitions + startState * RowSize()); + + // Build letter translation table + for (auto&& letter : letters) + for (auto&& character : letter.second.second) + m_letters[character] = letter.second.first + HEADER_SIZE; + } + + /* + * Initializes pointers depending on buffer start, letters and states count + */ + void Markup(void* ptr) + { + Impl::CheckAlign(ptr, sizeof(size_t)); + m_letters = reinterpret_cast<Letter*>(ptr); + m_final = reinterpret_cast<size_t*>(m_letters + MaxChar); + m_finalIndex = reinterpret_cast<size_t*>(m_final + m.finalTableSize); + m_transitions = reinterpret_cast<Transition*>(m_finalIndex + m.statesCount); + } + + // Makes a shallow ("weak") copy of the given scanner. + // The copied scanner does not maintain lifetime of the original's entrails. + void Alias(const Scanner<Relocation, Shortcutting>& s) + { + memcpy(&m, &s.m, sizeof(m)); + m_buffer.Reset(); + m_letters = s.m_letters; + m_final = s.m_final; + m_finalIndex = s.m_finalIndex; + m_transitions = s.m_transitions; + } + + template<class AnotherRelocation> + void DeepCopy(const Scanner<AnotherRelocation, Shortcutting>& s) + { + // Don't want memory leaks, but we cannot free the buffer because there might be aliased instances + Y_ASSERT(m_buffer == nullptr); + + // Ensure that specializations of Scanner across different Relocations do not touch its Locals + static_assert(sizeof(m) == sizeof(s.m), "sizeof(m) == sizeof(s.m)"); + memcpy(&m, &s.m, sizeof(s.m)); + m.relocationSignature = Relocation::Signature; + m.shortcuttingSignature = Shortcutting::Signature; + m_buffer = BufferType(new char[BufSize() + sizeof(size_t)]); + std::memset(m_buffer.Get(), 0, BufSize() + sizeof(size_t)); + Markup(AlignUp(m_buffer.Get(), sizeof(size_t))); + + // Values in letter-to-leterclass table take into account row header size + for (size_t c = 0; c < MaxChar; ++c) { + m_letters[c] = s.m_letters[c] - s.HEADER_SIZE + HEADER_SIZE; + Y_ASSERT(c == Epsilon || m_letters[c] >= HEADER_SIZE); + Y_ASSERT(c == Epsilon || m_letters[c] < RowSize()); + } + memcpy(m_final, s.m_final, m.finalTableSize * sizeof(*m_final)); + memcpy(m_finalIndex, s.m_finalIndex, m.statesCount * sizeof(*m_finalIndex)); + + m.initial = IndexToState(s.StateIndex(s.m.initial)); + + for (size_t st = 0; st != m.statesCount; ++st) { + size_t oldstate = s.IndexToState(st); + size_t newstate = IndexToState(st); + Header(newstate) = s.Header(oldstate); + const typename Scanner<AnotherRelocation, Shortcutting>::Transition* os + = reinterpret_cast<const typename Scanner<AnotherRelocation, Shortcutting>::Transition*>(oldstate); + Transition* ns = reinterpret_cast<Transition*>(newstate); + + for (size_t let = 0; let != LettersCount(); ++let) { + size_t destIndex = s.StateIndex(AnotherRelocation::Go(oldstate, os[let + s.HEADER_SIZE])); + Transition tr = Relocation::Diff(newstate, IndexToState(destIndex)); + ns[let + HEADER_SIZE] = tr; + Y_ASSERT(Relocation::Go(newstate, tr) >= (size_t)m_transitions); + Y_ASSERT(Relocation::Go(newstate, tr) < (size_t)(m_transitions + RowSize()*Size())); + } + } + } + + + size_t IndexToState(size_t stateIndex) const + { + return reinterpret_cast<size_t>(m_transitions + stateIndex * RowSize()); + } + + void SetJump(size_t oldState, Char c, size_t newState, unsigned long /*payload*/ = 0) + { + Y_ASSERT(m_buffer); + Y_ASSERT(oldState < m.statesCount); + Y_ASSERT(newState < m.statesCount); + + m_transitions[oldState * RowSize() + m_letters[c]] + = Relocation::Diff(IndexToState(oldState), IndexToState(newState)); + } + + unsigned long RemapAction(unsigned long action) { return action; } + + void SetInitial(size_t state) + { + Y_ASSERT(m_buffer); + m.initial = IndexToState(state); + } + + void SetTag(size_t state, size_t value) + { + Y_ASSERT(m_buffer); + Header(IndexToState(state)).Common.Flags = value; + } + + // Fill shortcut masks for all the states + void BuildShortcuts() + { + Y_ASSERT(m_buffer); + + // Build the mapping from letter classes to characters + TVector< TVector<char> > letters(RowSize()); + for (unsigned ch = 0; ch != 1 << (sizeof(char)*8); ++ch) + letters[m_letters[ch]].push_back(ch); + + // Loop through all states in the transition table and + // check if it is possible to setup shortcuts + for (size_t i = 0; i != Size(); ++i) { + State st = IndexToState(i); + ScannerRowHeader& header = Header(st); + Shortcutting::SetNoExit(header); + size_t ind = 0; + size_t let = HEADER_SIZE; + for (; let != LettersCount() + HEADER_SIZE; ++let) { + // Check if the transition is not the same state + if (Relocation::Go(st, reinterpret_cast<const Transition*>(st)[let]) != st) { + if (ind + letters[let].size() > Shortcutting::ExitMaskCount) + break; + // For each character setup a mask + for (auto&& character : letters[let]) { + Shortcutting::SetMask(header, ind, character); + ++ind; + } + } + } + + if (let != LettersCount() + HEADER_SIZE) { + // Not enough space in ExitMasks, so reset all masks (which leads to bypassing the optimization) + Shortcutting::SetNoShortcut(header); + } + // Fill the rest of the shortcut masks with the last used mask + Shortcutting::FinishMasks(header, ind); + } + } + + // Fills final states table and builds shortcuts if possible + void FinishBuild() + { + Y_ASSERT(m_buffer); + auto finalWriter = m_final; + for (size_t state = 0; state != Size(); ++state) { + m_finalIndex[state] = finalWriter - m_final; + if (Header(IndexToState(state)).Common.Flags & FinalFlag) + *finalWriter++ = 0; + *finalWriter++ = static_cast<size_t>(-1); + } + BuildShortcuts(); + } + + size_t AcceptedRegexpsCount(size_t idx) const + { + const size_t* b = m_final + m_finalIndex[idx]; + const size_t* e = b; + while (*e != End) + ++e; + return e - b; + } + + template <class Scanner> + friend void Pire::BuildScanner(const Fsm&, Scanner&); + + typedef State InternalState; // Needed for agglutination + friend class ScannerGlueCommon<Scanner>; + friend class ScannerGlueTask<Scanner>; + + template<class AnotherRelocation, class AnotherShortcutting> + friend class Scanner; friend struct ScannerSaver; #ifndef PIRE_DEBUG - friend struct AlignedRunner< Scanner<Relocation, Shortcutting> >; + friend struct AlignedRunner< Scanner<Relocation, Shortcutting> >; #endif }; // Helper class for Save/Load partial specialization struct ScannerSaver { - template<class Shortcutting> - static void SaveScanner(const Scanner<Relocatable, Shortcutting>& scanner, yostream* s) - { - typedef Scanner<Relocatable, Shortcutting> ScannerType; - - typename ScannerType::Locals mc = scanner.m; - mc.initial -= reinterpret_cast<size_t>(scanner.m_transitions); - SavePodType(s, Pire::Header(ScannerIOTypes::Scanner, sizeof(mc))); - Impl::AlignSave(s, sizeof(Pire::Header)); - SavePodType(s, mc); - Impl::AlignSave(s, sizeof(mc)); - SavePodType(s, scanner.Empty()); - Impl::AlignSave(s, sizeof(scanner.Empty())); - if (!scanner.Empty()) - Impl::AlignedSaveArray(s, scanner.m_buffer.Get(), scanner.BufSize()); - } - - template<class Shortcutting> - static void LoadScanner(Scanner<Relocatable, Shortcutting>& scanner, yistream* s) - { - typedef Scanner<Relocatable, Shortcutting> ScannerType; - - Scanner<Relocatable, Shortcutting> sc; - Impl::ValidateHeader(s, ScannerIOTypes::Scanner, sizeof(sc.m)); - LoadPodType(s, sc.m); - Impl::AlignLoad(s, sizeof(sc.m)); - if (Shortcutting::Signature != sc.m.shortcuttingSignature) - throw Error("This scanner has different shortcutting type"); - bool empty; - LoadPodType(s, empty); - Impl::AlignLoad(s, sizeof(empty)); - - if (empty) { - sc.Alias(ScannerType::Null()); - } else { - sc.m_buffer = TArrayHolder<char>(new char[sc.BufSize()]); - Impl::AlignedLoadArray(s, sc.m_buffer.Get(), sc.BufSize()); - sc.Markup(sc.m_buffer.Get()); - sc.m.initial += reinterpret_cast<size_t>(sc.m_transitions); - } - scanner.Swap(sc); - } - - // TODO: implement more effective serialization - // of nonrelocatable scanner if necessary - - template<class Shortcutting> - static void SaveScanner(const Scanner<Nonrelocatable, Shortcutting>& scanner, yostream* s) - { - Scanner<Relocatable, Shortcutting>(scanner).Save(s); - } - - template<class Shortcutting> - static void LoadScanner(Scanner<Nonrelocatable, Shortcutting>& scanner, yistream* s) - { - Scanner<Relocatable, Shortcutting> rs; - rs.Load(s); - Scanner<Nonrelocatable, Shortcutting>(rs).Swap(scanner); - } + template<class Shortcutting> + static void SaveScanner(const Scanner<Relocatable, Shortcutting>& scanner, yostream* s) + { + typedef Scanner<Relocatable, Shortcutting> ScannerType; + + typename ScannerType::Locals mc = scanner.m; + mc.initial -= reinterpret_cast<size_t>(scanner.m_transitions); + SavePodType(s, Pire::Header(ScannerIOTypes::Scanner, sizeof(mc))); + Impl::AlignSave(s, sizeof(Pire::Header)); + SavePodType(s, mc); + Impl::AlignSave(s, sizeof(mc)); + SavePodType(s, scanner.Empty()); + Impl::AlignSave(s, sizeof(scanner.Empty())); + if (!scanner.Empty()) + Impl::AlignedSaveArray(s, scanner.m_buffer.Get(), scanner.BufSize()); + } + + template<class Shortcutting> + static void LoadScanner(Scanner<Relocatable, Shortcutting>& scanner, yistream* s) + { + typedef Scanner<Relocatable, Shortcutting> ScannerType; + + Scanner<Relocatable, Shortcutting> sc; + Impl::ValidateHeader(s, ScannerIOTypes::Scanner, sizeof(sc.m)); + LoadPodType(s, sc.m); + Impl::AlignLoad(s, sizeof(sc.m)); + if (Shortcutting::Signature != sc.m.shortcuttingSignature) + throw Error("This scanner has different shortcutting type"); + bool empty; + LoadPodType(s, empty); + Impl::AlignLoad(s, sizeof(empty)); + + if (empty) { + sc.Alias(ScannerType::Null()); + } else { + sc.m_buffer = TArrayHolder<char>(new char[sc.BufSize()]); + Impl::AlignedLoadArray(s, sc.m_buffer.Get(), sc.BufSize()); + sc.Markup(sc.m_buffer.Get()); + sc.m.initial += reinterpret_cast<size_t>(sc.m_transitions); + } + scanner.Swap(sc); + } + + // TODO: implement more effective serialization + // of nonrelocatable scanner if necessary + + template<class Shortcutting> + static void SaveScanner(const Scanner<Nonrelocatable, Shortcutting>& scanner, yostream* s) + { + Scanner<Relocatable, Shortcutting>(scanner).Save(s); + } + + template<class Shortcutting> + static void LoadScanner(Scanner<Nonrelocatable, Shortcutting>& scanner, yistream* s) + { + Scanner<Relocatable, Shortcutting> rs; + rs.Load(s); + Scanner<Nonrelocatable, Shortcutting>(rs).Swap(scanner); + } }; template<class Relocation, class Shortcutting> void Scanner<Relocation, Shortcutting>::Save(yostream* s) const { - ScannerSaver::SaveScanner(*this, s); + ScannerSaver::SaveScanner(*this, s); } template<class Relocation, class Shortcutting> void Scanner<Relocation, Shortcutting>::Load(yistream* s) { - ScannerSaver::LoadScanner(*this, s); + ScannerSaver::LoadScanner(*this, s); } // Shortcutting policy that checks state exit masks template <size_t MaskCount> class ExitMasks { private: - enum { - NO_SHORTCUT_MASK = 1, // the state doesn't have shortcuts - NO_EXIT_MASK = 2 // the state has only transtions to itself (we can stop the scan) - }; - - template<class ScannerRowHeader, unsigned N> - struct MaskCheckerBase { - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - bool Check(const ScannerRowHeader& hdr, size_t alignOffset, Word chunk) - { - Word mask = CheckBytes(hdr.Mask(N, alignOffset), chunk); - for (int i = N-1; i >= 0; --i) { - mask = Or(mask, CheckBytes(hdr.Mask(i, alignOffset), chunk)); - } - return !IsAnySet(mask); - } - - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - const Word* DoRun(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end) - { - for (; begin != end && Check(hdr, alignOffset, ToLittleEndian(*begin)); ++begin) {} - return begin; - } - }; - - template<class ScannerRowHeader, unsigned N, unsigned Nmax> - struct MaskChecker : MaskCheckerBase<ScannerRowHeader, N> { - typedef MaskCheckerBase<ScannerRowHeader, N> Base; - typedef MaskChecker<ScannerRowHeader, N+1, Nmax> Next; - - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - const Word* Run(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end) - { - if (hdr.Mask(N) == hdr.Mask(N + 1)) - return Base::DoRun(hdr, alignOffset, begin, end); - else - return Next::Run(hdr, alignOffset, begin, end); - } - }; - - template<class ScannerRowHeader, unsigned N> - struct MaskChecker<ScannerRowHeader, N, N> : MaskCheckerBase<ScannerRowHeader, N> { - typedef MaskCheckerBase<ScannerRowHeader, N> Base; - - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - const Word* Run(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end) - { - return Base::DoRun(hdr, alignOffset, begin, end); - } - }; - - // Compares the ExitMask[0] value without SSE reads which seems to be more optimal - template <class Relocation> - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - bool CheckFirstMask(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state, size_t val) - { - return (scanner.Header(state).Mask(0) == val); - } + enum { + NO_SHORTCUT_MASK = 1, // the state doesn't have shortcuts + NO_EXIT_MASK = 2 // the state has only transtions to itself (we can stop the scan) + }; + + template<class ScannerRowHeader, unsigned N> + struct MaskCheckerBase { + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + bool Check(const ScannerRowHeader& hdr, size_t alignOffset, Word chunk) + { + Word mask = CheckBytes(hdr.Mask(N, alignOffset), chunk); + for (int i = N-1; i >= 0; --i) { + mask = Or(mask, CheckBytes(hdr.Mask(i, alignOffset), chunk)); + } + return !IsAnySet(mask); + } + + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + const Word* DoRun(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end) + { + for (; begin != end && Check(hdr, alignOffset, ToLittleEndian(*begin)); ++begin) {} + return begin; + } + }; + + template<class ScannerRowHeader, unsigned N, unsigned Nmax> + struct MaskChecker : MaskCheckerBase<ScannerRowHeader, N> { + typedef MaskCheckerBase<ScannerRowHeader, N> Base; + typedef MaskChecker<ScannerRowHeader, N+1, Nmax> Next; + + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + const Word* Run(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end) + { + if (hdr.Mask(N) == hdr.Mask(N + 1)) + return Base::DoRun(hdr, alignOffset, begin, end); + else + return Next::Run(hdr, alignOffset, begin, end); + } + }; + + template<class ScannerRowHeader, unsigned N> + struct MaskChecker<ScannerRowHeader, N, N> : MaskCheckerBase<ScannerRowHeader, N> { + typedef MaskCheckerBase<ScannerRowHeader, N> Base; + + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + const Word* Run(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end) + { + return Base::DoRun(hdr, alignOffset, begin, end); + } + }; + + // Compares the ExitMask[0] value without SSE reads which seems to be more optimal + template <class Relocation> + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + bool CheckFirstMask(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state, size_t val) + { + return (scanner.Header(state).Mask(0) == val); + } public: - static const size_t ExitMaskCount = MaskCount; - static const size_t Signature = 0x2000 + MaskCount; - - template <class Scanner> - struct ExtendedRowHeader { - private: - /// In order to allow transition table to be aligned at sizeof(size_t) instead of - /// sizeof(Word) and still be able to read Masks at Word-aligned addresses each mask - /// occupies 2x space and only properly aligned part of it is read - enum { - SizeTInMaxSizeWord = sizeof(MaxSizeWord) / sizeof(size_t), - MaskSizeInSizeT = 2 * SizeTInMaxSizeWord, - }; - - public: - static const size_t ExitMaskCount = MaskCount; - - inline - const Word& Mask(size_t i, size_t alignOffset) const - { - Y_ASSERT(i < ExitMaskCount); - Y_ASSERT(alignOffset < SizeTInMaxSizeWord); - const Word* p = (const Word*)(ExitMasksArray + alignOffset + MaskSizeInSizeT * i); - Y_ASSERT(IsAligned(p, sizeof(Word))); - return *p; - } - - PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - size_t Mask(size_t i) const - { - Y_ASSERT(i < ExitMaskCount); - return ExitMasksArray[MaskSizeInSizeT*i]; - } - - void SetMask(size_t i, size_t val) - { - for (size_t j = 0; j < MaskSizeInSizeT; ++j) - ExitMasksArray[MaskSizeInSizeT*i + j] = val; - } - - ExtendedRowHeader() - { - for (size_t i = 0; i < ExitMaskCount; ++i) - SetMask(i, NO_SHORTCUT_MASK); - } - - template <class OtherScanner> - ExtendedRowHeader& operator =(const ExtendedRowHeader<OtherScanner>& other) - { - PIRE_STATIC_ASSERT(ExitMaskCount == ExtendedRowHeader<OtherScanner>::ExitMaskCount); - Common = other.Common; - for (size_t i = 0; i < ExitMaskCount; ++i) - SetMask(i, other.Mask(i)); - return *this; - } - - private: - /// If this state loops for all letters except particular set - /// (common thing when matching something like /.*[Aa]/), - /// each ExitMask contains that letter in each byte of size_t. - /// - /// These masks are most commonly used for fast forwarding through parts - /// of the string matching /.*/ somewhere in the middle regexp. - size_t ExitMasksArray[ExitMaskCount * MaskSizeInSizeT]; - - public: - typename Scanner::CommonRowHeader Common; - }; - - template <class Header> - static void SetNoExit(Header& header) - { - header.SetMask(0, NO_EXIT_MASK); - } - - template <class Header> - static void SetNoShortcut(Header& header) - { - header.SetMask(0, NO_SHORTCUT_MASK); - } - - template <class Header> - static void SetMask(Header& header, size_t ind, char c) - { - header.SetMask(ind, FillSizeT(c)); - } - - template <class Header> - static void FinishMasks(Header& header, size_t ind) - { - if (ind == 0) - ind = 1; - // Fill the rest of the shortcut masks with the last used mask - size_t lastMask = header.Mask(ind - 1); - while (ind != ExitMaskCount) { - header.SetMask(ind, lastMask); - ++ind; - } - } - - template <class Relocation> - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - bool NoExit(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state) - { - return CheckFirstMask(scanner, state, NO_EXIT_MASK); - } - - template <class Relocation> - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - bool NoShortcut(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state) - { - return CheckFirstMask(scanner, state, NO_SHORTCUT_MASK); - } - - template <class Relocation> - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - const Word* Run(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state, size_t alignOffset, const Word* begin, const Word* end) - { - return MaskChecker<typename Scanner<Relocation, ExitMasks<MaskCount> >::ScannerRowHeader, 0, MaskCount - 1>::Run(scanner.Header(state), alignOffset, begin, end); - } + static const size_t ExitMaskCount = MaskCount; + static const size_t Signature = 0x2000 + MaskCount; + + template <class Scanner> + struct ExtendedRowHeader { + private: + /// In order to allow transition table to be aligned at sizeof(size_t) instead of + /// sizeof(Word) and still be able to read Masks at Word-aligned addresses each mask + /// occupies 2x space and only properly aligned part of it is read + enum { + SizeTInMaxSizeWord = sizeof(MaxSizeWord) / sizeof(size_t), + MaskSizeInSizeT = 2 * SizeTInMaxSizeWord, + }; + + public: + static const size_t ExitMaskCount = MaskCount; + + inline + const Word& Mask(size_t i, size_t alignOffset) const + { + Y_ASSERT(i < ExitMaskCount); + Y_ASSERT(alignOffset < SizeTInMaxSizeWord); + const Word* p = (const Word*)(ExitMasksArray + alignOffset + MaskSizeInSizeT * i); + Y_ASSERT(IsAligned(p, sizeof(Word))); + return *p; + } + + PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + size_t Mask(size_t i) const + { + Y_ASSERT(i < ExitMaskCount); + return ExitMasksArray[MaskSizeInSizeT*i]; + } + + void SetMask(size_t i, size_t val) + { + for (size_t j = 0; j < MaskSizeInSizeT; ++j) + ExitMasksArray[MaskSizeInSizeT*i + j] = val; + } + + ExtendedRowHeader() + { + for (size_t i = 0; i < ExitMaskCount; ++i) + SetMask(i, NO_SHORTCUT_MASK); + } + + template <class OtherScanner> + ExtendedRowHeader& operator =(const ExtendedRowHeader<OtherScanner>& other) + { + PIRE_STATIC_ASSERT(ExitMaskCount == ExtendedRowHeader<OtherScanner>::ExitMaskCount); + Common = other.Common; + for (size_t i = 0; i < ExitMaskCount; ++i) + SetMask(i, other.Mask(i)); + return *this; + } + + private: + /// If this state loops for all letters except particular set + /// (common thing when matching something like /.*[Aa]/), + /// each ExitMask contains that letter in each byte of size_t. + /// + /// These masks are most commonly used for fast forwarding through parts + /// of the string matching /.*/ somewhere in the middle regexp. + size_t ExitMasksArray[ExitMaskCount * MaskSizeInSizeT]; + + public: + typename Scanner::CommonRowHeader Common; + }; + + template <class Header> + static void SetNoExit(Header& header) + { + header.SetMask(0, NO_EXIT_MASK); + } + + template <class Header> + static void SetNoShortcut(Header& header) + { + header.SetMask(0, NO_SHORTCUT_MASK); + } + + template <class Header> + static void SetMask(Header& header, size_t ind, char c) + { + header.SetMask(ind, FillSizeT(c)); + } + + template <class Header> + static void FinishMasks(Header& header, size_t ind) + { + if (ind == 0) + ind = 1; + // Fill the rest of the shortcut masks with the last used mask + size_t lastMask = header.Mask(ind - 1); + while (ind != ExitMaskCount) { + header.SetMask(ind, lastMask); + ++ind; + } + } + + template <class Relocation> + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + bool NoExit(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state) + { + return CheckFirstMask(scanner, state, NO_EXIT_MASK); + } + + template <class Relocation> + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + bool NoShortcut(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state) + { + return CheckFirstMask(scanner, state, NO_SHORTCUT_MASK); + } + + template <class Relocation> + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + const Word* Run(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state, size_t alignOffset, const Word* begin, const Word* end) + { + return MaskChecker<typename Scanner<Relocation, ExitMasks<MaskCount> >::ScannerRowHeader, 0, MaskCount - 1>::Run(scanner.Header(state), alignOffset, begin, end); + } }; @@ -817,57 +817,57 @@ public: // Shortcutting policy that doesn't do shortcuts struct NoShortcuts { - static const size_t ExitMaskCount = 0; - static const size_t Signature = 0x1000; - - template <class Scanner> - struct ExtendedRowHeader { - typename Scanner::CommonRowHeader Common; - - template <class OtherScanner> - ExtendedRowHeader& operator =(const ExtendedRowHeader<OtherScanner>& other) - { - PIRE_STATIC_ASSERT(sizeof(ExtendedRowHeader) == sizeof(ExtendedRowHeader<OtherScanner>)); - Common = other.Common; - return *this; - } - }; - - template <class Header> - static void SetNoExit(Header&) {} - - template <class Header> - static void SetNoShortcut(Header&) {} - - template <class Header> - static void SetMask(Header&, size_t, char) {} - - template <class Header> - static void FinishMasks(Header&, size_t) {} - - template <class Relocation> - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - bool NoExit(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State) - { - // Cannot exit prematurely - return false; - } - - template <class Relocation> - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - bool NoShortcut(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State) - { - // There's no shortcut regardless of the state - return true; - } - - template <class Relocation> - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - const Word* Run(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State, size_t, const Word* begin, const Word*) - { - // Stop shortcutting right at the beginning - return begin; - } + static const size_t ExitMaskCount = 0; + static const size_t Signature = 0x1000; + + template <class Scanner> + struct ExtendedRowHeader { + typename Scanner::CommonRowHeader Common; + + template <class OtherScanner> + ExtendedRowHeader& operator =(const ExtendedRowHeader<OtherScanner>& other) + { + PIRE_STATIC_ASSERT(sizeof(ExtendedRowHeader) == sizeof(ExtendedRowHeader<OtherScanner>)); + Common = other.Common; + return *this; + } + }; + + template <class Header> + static void SetNoExit(Header&) {} + + template <class Header> + static void SetNoShortcut(Header&) {} + + template <class Header> + static void SetMask(Header&, size_t, char) {} + + template <class Header> + static void FinishMasks(Header&, size_t) {} + + template <class Relocation> + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + bool NoExit(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State) + { + // Cannot exit prematurely + return false; + } + + template <class Relocation> + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + bool NoShortcut(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State) + { + // There's no shortcut regardless of the state + return true; + } + + template <class Relocation> + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + const Word* Run(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State, size_t, const Word* begin, const Word*) + { + // Stop shortcutting right at the beginning + return begin; + } }; #ifndef PIRE_DEBUG @@ -877,120 +877,120 @@ struct NoShortcuts { // Manually unrolled code proves to be faster template <class Scanner, unsigned Count> struct MultiChunk { - // Process Word-sized chunk which consist of >=1 size_t-sized chunks - template<class Pred> - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - Action Process(const Scanner& scanner, typename Scanner::State& state, const size_t* p, Pred pred) - { - if (RunChunk(scanner, state, p, 0, sizeof(void*), pred) == Continue) - return MultiChunk<Scanner, Count-1>::Process(scanner, state, ++p, pred); - else - return Stop; - } + // Process Word-sized chunk which consist of >=1 size_t-sized chunks + template<class Pred> + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + Action Process(const Scanner& scanner, typename Scanner::State& state, const size_t* p, Pred pred) + { + if (RunChunk(scanner, state, p, 0, sizeof(void*), pred) == Continue) + return MultiChunk<Scanner, Count-1>::Process(scanner, state, ++p, pred); + else + return Stop; + } }; template <class Scanner> struct MultiChunk<Scanner, 0> { - // Process Word-sized chunk which consist of >=1 size_t-sized chunks - template<class Pred> - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - Action Process(const Scanner&, typename Scanner::State, const size_t*, Pred) - { - return Continue; - } + // Process Word-sized chunk which consist of >=1 size_t-sized chunks + template<class Pred> + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + Action Process(const Scanner&, typename Scanner::State, const size_t*, Pred) + { + return Continue; + } }; // Efficiently runs a scanner through size_t-aligned memory range template<class Relocation, class Shortcutting> struct AlignedRunner< Scanner<Relocation, Shortcutting> > { private: - typedef Scanner<Relocation, Shortcutting> ScannerType; - - // Processes Word-sized chuck of memory (depending on the platform a Word might - // consist of multiple size_t chuncks) - template <class Pred> - static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION - Action RunMultiChunk(const ScannerType& scanner, typename ScannerType::State& st, const size_t* begin, Pred pred) - { - return MultiChunk<ScannerType, sizeof(Word)/sizeof(size_t)>::Process(scanner, st, begin, pred); - } - - // Asserts if the scanner changes state while processing the byte range that is - // supposed to be skipped by a shortcut - static void ValidateSkip(const ScannerType& scanner, typename ScannerType::State st, const char* begin, const char* end) - { - typename ScannerType::State stateBefore = st; - for (const char* pos = begin; pos != end; ++pos) { - Step(scanner, st, (unsigned char)*pos); - Y_ASSERT(st == stateBefore); - } - } + typedef Scanner<Relocation, Shortcutting> ScannerType; + + // Processes Word-sized chuck of memory (depending on the platform a Word might + // consist of multiple size_t chuncks) + template <class Pred> + static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION + Action RunMultiChunk(const ScannerType& scanner, typename ScannerType::State& st, const size_t* begin, Pred pred) + { + return MultiChunk<ScannerType, sizeof(Word)/sizeof(size_t)>::Process(scanner, st, begin, pred); + } + + // Asserts if the scanner changes state while processing the byte range that is + // supposed to be skipped by a shortcut + static void ValidateSkip(const ScannerType& scanner, typename ScannerType::State st, const char* begin, const char* end) + { + typename ScannerType::State stateBefore = st; + for (const char* pos = begin; pos != end; ++pos) { + Step(scanner, st, (unsigned char)*pos); + Y_ASSERT(st == stateBefore); + } + } public: - template<class Pred> - static inline PIRE_HOT_FUNCTION - Action RunAligned(const ScannerType& scanner, typename ScannerType::State& st, const size_t* begin, const size_t* end , Pred pred) - { - typename ScannerType::State state = st; - const Word* head = AlignUp((const Word*) begin, sizeof(Word)); - const Word* tail = AlignDown((const Word*) end, sizeof(Word)); - for (; begin != (const size_t*) head && begin != end; ++begin) - if (RunChunk(scanner, state, begin, 0, sizeof(void*), pred) == Stop) { - st = state; - return Stop; - } - - if (begin == end) { - st = state; - return Continue; - } - if (Shortcutting::NoExit(scanner, state)) { - st = state; - return pred(scanner, state, ((const char*) end)); - } - - // Row size should be a multiple of MaxSizeWord size. Then alignOffset is the same for any state - Y_ASSERT((scanner.RowSize()*sizeof(typename ScannerType::Transition)) % sizeof(MaxSizeWord) == 0); - size_t alignOffset = (AlignUp((size_t)scanner.m_transitions, sizeof(Word)) - (size_t)scanner.m_transitions) / sizeof(size_t); - - bool noShortcut = Shortcutting::NoShortcut(scanner, state); - - while (true) { - // Do normal processing until a shortcut is possible - while (noShortcut && head != tail) { - if (RunMultiChunk(scanner, state, (const size_t*)head, pred) == Stop) { - st = state; - return Stop; - } - ++head; - noShortcut = Shortcutting::NoShortcut(scanner, state); - } - if (head == tail) - break; - - if (Shortcutting::NoExit(scanner, state)) { - st = state; - return pred(scanner, state, ((const char*) end)); - } - - // Do fast forwarding while it is possible - const Word* skipEnd = Shortcutting::Run(scanner, state, alignOffset, head, tail); - PIRE_IF_CHECKED(ValidateSkip(scanner, state, (const char*)head, (const char*)skipEnd)); - head = skipEnd; - noShortcut = true; - } - - for (size_t* p = (size_t*) tail; p != end; ++p) { - if (RunChunk(scanner, state, p, 0, sizeof(void*), pred) == Stop) { - st = state; - return Stop; - } - } - - st = state; - return Continue; - } + template<class Pred> + static inline PIRE_HOT_FUNCTION + Action RunAligned(const ScannerType& scanner, typename ScannerType::State& st, const size_t* begin, const size_t* end , Pred pred) + { + typename ScannerType::State state = st; + const Word* head = AlignUp((const Word*) begin, sizeof(Word)); + const Word* tail = AlignDown((const Word*) end, sizeof(Word)); + for (; begin != (const size_t*) head && begin != end; ++begin) + if (RunChunk(scanner, state, begin, 0, sizeof(void*), pred) == Stop) { + st = state; + return Stop; + } + + if (begin == end) { + st = state; + return Continue; + } + if (Shortcutting::NoExit(scanner, state)) { + st = state; + return pred(scanner, state, ((const char*) end)); + } + + // Row size should be a multiple of MaxSizeWord size. Then alignOffset is the same for any state + Y_ASSERT((scanner.RowSize()*sizeof(typename ScannerType::Transition)) % sizeof(MaxSizeWord) == 0); + size_t alignOffset = (AlignUp((size_t)scanner.m_transitions, sizeof(Word)) - (size_t)scanner.m_transitions) / sizeof(size_t); + + bool noShortcut = Shortcutting::NoShortcut(scanner, state); + + while (true) { + // Do normal processing until a shortcut is possible + while (noShortcut && head != tail) { + if (RunMultiChunk(scanner, state, (const size_t*)head, pred) == Stop) { + st = state; + return Stop; + } + ++head; + noShortcut = Shortcutting::NoShortcut(scanner, state); + } + if (head == tail) + break; + + if (Shortcutting::NoExit(scanner, state)) { + st = state; + return pred(scanner, state, ((const char*) end)); + } + + // Do fast forwarding while it is possible + const Word* skipEnd = Shortcutting::Run(scanner, state, alignOffset, head, tail); + PIRE_IF_CHECKED(ValidateSkip(scanner, state, (const char*)head, (const char*)skipEnd)); + head = skipEnd; + noShortcut = true; + } + + for (size_t* p = (size_t*) tail; p != end; ++p) { + if (RunChunk(scanner, state, p, 0, sizeof(void*), pred) == Stop) { + st = state; + return Stop; + } + } + + st = state; + return Continue; + } }; #endif @@ -998,64 +998,64 @@ public: template<class Scanner> class ScannerGlueTask: public ScannerGlueCommon<Scanner> { public: - typedef ScannerGlueCommon<Scanner> Base; - typedef typename Base::State State; - using Base::Lhs; - using Base::Rhs; - using Base::Sc; - using Base::Letters; - - typedef GluedStateLookupTable<256*1024, typename Scanner::State> InvStates; - - ScannerGlueTask(const Scanner& lhs, const Scanner& rhs) - : ScannerGlueCommon<Scanner>(lhs, rhs, LettersEquality<Scanner>(lhs.m_letters, rhs.m_letters)) - { - } - - void AcceptStates(const TVector<State>& states) - { - // Make up a new scanner and fill in the final table - - size_t finalTableSize = 0; - for (auto&& i : states) - finalTableSize += RangeLen(Lhs().AcceptedRegexps(i.first)) + RangeLen(Rhs().AcceptedRegexps(i.second)); - this->SetSc(THolder<Scanner>(new Scanner)); - Sc().Init(states.size(), Letters(), finalTableSize, size_t(0), Lhs().RegexpsCount() + Rhs().RegexpsCount()); - - auto finalWriter = Sc().m_final; - for (size_t state = 0; state != states.size(); ++state) { - Sc().m_finalIndex[state] = finalWriter - Sc().m_final; - finalWriter = Shift(Lhs().AcceptedRegexps(states[state].first), 0, finalWriter); - finalWriter = Shift(Rhs().AcceptedRegexps(states[state].second), Lhs().RegexpsCount(), finalWriter); - *finalWriter++ = static_cast<size_t>(-1); - - Sc().SetTag(state, ((Lhs().Final(states[state].first) || Rhs().Final(states[state].second)) ? Scanner::FinalFlag : 0) - | ((Lhs().Dead(states[state].first) && Rhs().Dead(states[state].second)) ? Scanner::DeadFlag : 0)); - } - } - - void Connect(size_t from, size_t to, Char letter) { Sc().SetJump(from, letter, to); } - - const Scanner& Success() - { - Sc().BuildShortcuts(); - return Sc(); - } + typedef ScannerGlueCommon<Scanner> Base; + typedef typename Base::State State; + using Base::Lhs; + using Base::Rhs; + using Base::Sc; + using Base::Letters; + + typedef GluedStateLookupTable<256*1024, typename Scanner::State> InvStates; + + ScannerGlueTask(const Scanner& lhs, const Scanner& rhs) + : ScannerGlueCommon<Scanner>(lhs, rhs, LettersEquality<Scanner>(lhs.m_letters, rhs.m_letters)) + { + } + + void AcceptStates(const TVector<State>& states) + { + // Make up a new scanner and fill in the final table + + size_t finalTableSize = 0; + for (auto&& i : states) + finalTableSize += RangeLen(Lhs().AcceptedRegexps(i.first)) + RangeLen(Rhs().AcceptedRegexps(i.second)); + this->SetSc(THolder<Scanner>(new Scanner)); + Sc().Init(states.size(), Letters(), finalTableSize, size_t(0), Lhs().RegexpsCount() + Rhs().RegexpsCount()); + + auto finalWriter = Sc().m_final; + for (size_t state = 0; state != states.size(); ++state) { + Sc().m_finalIndex[state] = finalWriter - Sc().m_final; + finalWriter = Shift(Lhs().AcceptedRegexps(states[state].first), 0, finalWriter); + finalWriter = Shift(Rhs().AcceptedRegexps(states[state].second), Lhs().RegexpsCount(), finalWriter); + *finalWriter++ = static_cast<size_t>(-1); + + Sc().SetTag(state, ((Lhs().Final(states[state].first) || Rhs().Final(states[state].second)) ? Scanner::FinalFlag : 0) + | ((Lhs().Dead(states[state].first) && Rhs().Dead(states[state].second)) ? Scanner::DeadFlag : 0)); + } + } + + void Connect(size_t from, size_t to, Char letter) { Sc().SetJump(from, letter, to); } + + const Scanner& Success() + { + Sc().BuildShortcuts(); + return Sc(); + } private: - template<class Iter> - size_t RangeLen(ypair<Iter, Iter> range) const - { - return std::distance(range.first, range.second); - } - - template<class Iter, class OutIter> - OutIter Shift(ypair<Iter, Iter> range, size_t shift, OutIter out) const - { - for (; range.first != range.second; ++range.first, ++out) - *out = *range.first + shift; - return out; - } + template<class Iter> + size_t RangeLen(ypair<Iter, Iter> range) const + { + return std::distance(range.first, range.second); + } + + template<class Iter, class OutIter> + OutIter Shift(ypair<Iter, Iter> range, size_t shift, OutIter out) const + { + for (; range.first != range.second; ++range.first, ++out) + *out = *range.first + shift; + return out; + } }; } @@ -1064,35 +1064,35 @@ private: template<class Relocation, class Shortcutting> struct StDumper< Impl::Scanner<Relocation, Shortcutting> > { - typedef Impl::Scanner<Relocation, Shortcutting> ScannerType; + typedef Impl::Scanner<Relocation, Shortcutting> ScannerType; - StDumper(const ScannerType& sc, typename ScannerType::State st): m_sc(&sc), m_st(st) {} + StDumper(const ScannerType& sc, typename ScannerType::State st): m_sc(&sc), m_st(st) {} - void Dump(yostream& stream) const - { - stream << m_sc->StateIndex(m_st); - if (m_sc->Final(m_st)) - stream << " [final]"; - if (m_sc->Dead(m_st)) - stream << " [dead]"; - } + void Dump(yostream& stream) const + { + stream << m_sc->StateIndex(m_st); + if (m_sc->Final(m_st)) + stream << " [final]"; + if (m_sc->Dead(m_st)) + stream << " [dead]"; + } private: - const ScannerType* m_sc; - typename ScannerType::State m_st; + const ScannerType* m_sc; + typename ScannerType::State m_st; }; template<class Relocation, class Shortcutting> Impl::Scanner<Relocation, Shortcutting> Impl::Scanner<Relocation, Shortcutting>::Glue(const Impl::Scanner<Relocation, Shortcutting>& lhs, const Impl::Scanner<Relocation, Shortcutting>& rhs, size_t maxSize /* = 0 */) { - if (lhs.Empty()) - return rhs; - if (rhs.Empty()) - return lhs; - - static const size_t DefMaxSize = 80000; - Impl::ScannerGlueTask< Impl::Scanner<Relocation, Shortcutting> > task(lhs, rhs); - return Impl::Determine(task, maxSize ? maxSize : DefMaxSize); + if (lhs.Empty()) + return rhs; + if (rhs.Empty()) + return lhs; + + static const size_t DefMaxSize = 80000; + Impl::ScannerGlueTask< Impl::Scanner<Relocation, Shortcutting> > task(lhs, rhs); + return Impl::Determine(task, maxSize ? maxSize : DefMaxSize); } @@ -1118,13 +1118,13 @@ typedef Impl::Scanner<Impl::Nonrelocatable, Impl::NoShortcuts> NonrelocScannerNo } namespace std { - inline void swap(Pire::Scanner& a, Pire::Scanner& b) { - a.Swap(b); - } + inline void swap(Pire::Scanner& a, Pire::Scanner& b) { + a.Swap(b); + } - inline void swap(Pire::NonrelocScanner& a, Pire::NonrelocScanner& b) { - a.Swap(b); - } + inline void swap(Pire::NonrelocScanner& a, Pire::NonrelocScanner& b) { + a.Swap(b); + } } diff --git a/library/cpp/regex/pire/pire/scanners/pair.h b/library/cpp/regex/pire/pire/scanners/pair.h index c12338a2a06..1c96e5dc0da 100644 --- a/library/cpp/regex/pire/pire/scanners/pair.h +++ b/library/cpp/regex/pire/pire/scanners/pair.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -30,70 +30,70 @@ namespace Pire { * If you need to run two scanners on the same string, using ScannerPair * is usually faster then running those scanners sequentially. */ - template<class Scanner1, class Scanner2> - class ScannerPair { - public: - typedef ypair<typename Scanner1::State, typename Scanner2::State> State; - typedef ypair<typename Scanner1::Action, typename Scanner2::Action> Action; - - ScannerPair() - : m_scanner1() - , m_scanner2() - { - } - ScannerPair(const Scanner1& s1, const Scanner2& s2) - : m_scanner1(&s1) - , m_scanner2(&s2) - { - } - - void Initialize(State& state) const - { - m_scanner1->Initialize(state.first); - m_scanner2->Initialize(state.second); - } - - Action Next(State& state, Char ch) const - { - return ymake_pair( - m_scanner1->Next(state.first, ch), - m_scanner2->Next(state.second, ch) - ); - } - - void TakeAction(State& s, Action a) const - { - m_scanner1->TakeAction(s.first, a.first); - m_scanner2->TakeAction(s.second, a.second); - } - - bool Final(const State& state) const - { - return m_scanner1->Final(state.first) || m_scanner2->Final(state.second); - } - - bool Dead(const State& state) const - { - return m_scanner1->Dead(state.first) && m_scanner2->Dead(state.second); - } - - ypair<size_t, size_t> StateIndex(const State& state) const - { - return ymake_pair(m_scanner1->StateIndex(state.first), m_scanner2->StateIndex(state.second)); - } - - Scanner1& First() { return *m_scanner1; } - Scanner2& Second() { return *m_scanner2; } - - const Scanner1& First() const { return *m_scanner1; } - const Scanner2& Second() const { return *m_scanner2; } - - private: - const Scanner1* m_scanner1; - const Scanner2* m_scanner2; - }; - - + template<class Scanner1, class Scanner2> + class ScannerPair { + public: + typedef ypair<typename Scanner1::State, typename Scanner2::State> State; + typedef ypair<typename Scanner1::Action, typename Scanner2::Action> Action; + + ScannerPair() + : m_scanner1() + , m_scanner2() + { + } + ScannerPair(const Scanner1& s1, const Scanner2& s2) + : m_scanner1(&s1) + , m_scanner2(&s2) + { + } + + void Initialize(State& state) const + { + m_scanner1->Initialize(state.first); + m_scanner2->Initialize(state.second); + } + + Action Next(State& state, Char ch) const + { + return ymake_pair( + m_scanner1->Next(state.first, ch), + m_scanner2->Next(state.second, ch) + ); + } + + void TakeAction(State& s, Action a) const + { + m_scanner1->TakeAction(s.first, a.first); + m_scanner2->TakeAction(s.second, a.second); + } + + bool Final(const State& state) const + { + return m_scanner1->Final(state.first) || m_scanner2->Final(state.second); + } + + bool Dead(const State& state) const + { + return m_scanner1->Dead(state.first) && m_scanner2->Dead(state.second); + } + + ypair<size_t, size_t> StateIndex(const State& state) const + { + return ymake_pair(m_scanner1->StateIndex(state.first), m_scanner2->StateIndex(state.second)); + } + + Scanner1& First() { return *m_scanner1; } + Scanner2& Second() { return *m_scanner2; } + + const Scanner1& First() const { return *m_scanner1; } + const Scanner2& Second() const { return *m_scanner2; } + + private: + const Scanner1* m_scanner1; + const Scanner2* m_scanner2; + }; + + } #endif diff --git a/library/cpp/regex/pire/pire/scanners/simple.h b/library/cpp/regex/pire/pire/scanners/simple.h index 9a5978dda21..85a77dcf728 100644 --- a/library/cpp/regex/pire/pire/scanners/simple.h +++ b/library/cpp/regex/pire/pire/scanners/simple.h @@ -39,220 +39,220 @@ namespace Pire { */ class SimpleScanner { private: - static const size_t STATE_ROW_SIZE = MaxChar + 1; // All characters + 1 element to store final state flag + static const size_t STATE_ROW_SIZE = MaxChar + 1; // All characters + 1 element to store final state flag public: - typedef size_t Transition; - typedef ui16 Letter; - typedef ui32 Action; - typedef ui8 Tag; - - SimpleScanner() { Alias(Null()); } - - explicit SimpleScanner(Fsm& fsm, size_t distance = 0); - - size_t Size() const { return m.statesCount; } - bool Empty() const { return m_transitions == Null().m_transitions; } - - typedef size_t State; - - size_t RegexpsCount() const { return Empty() ? 0 : 1; } - size_t LettersCount() const { return MaxChar; } - - /// Checks whether specified state is in any of the final sets - bool Final(const State& state) const { return *(((const Transition*) state) - 1) != 0; } - - bool Dead(const State&) const { return false; } - - ypair<const size_t*, const size_t*> AcceptedRegexps(const State& s) const { - return Final(s) ? Accept() : Deny(); - } - - /// returns an initial state for this scanner - void Initialize(State& state) const { state = m.initial; } - - /// Handles one characters - Action Next(State& state, Char c) const - { - Transition shift = reinterpret_cast<const Transition*>(state)[c]; - state += shift; - return 0; - } - - bool TakeAction(State&, Action) const { return false; } - - SimpleScanner(const SimpleScanner& s): m(s.m) - { - if (!s.m_buffer) { - // Empty or mmap()-ed scanner, just copy pointers - m_buffer = 0; - m_transitions = s.m_transitions; - } else { - // In-memory scanner, perform deep copy - m_buffer = BufferType(new char[BufSize()]); - memcpy(m_buffer.Get(), s.m_buffer.Get(), BufSize()); - Markup(m_buffer.Get()); - - m.initial += (m_transitions - s.m_transitions) * sizeof(Transition); - } - } - - // Makes a shallow ("weak") copy of the given scanner. - // The copied scanner does not maintain lifetime of the original's entrails. - void Alias(const SimpleScanner& s) - { - m = s.m; - m_buffer.Reset(); - m_transitions = s.m_transitions; - } - - void Swap(SimpleScanner& s) - { - DoSwap(m_buffer, s.m_buffer); - DoSwap(m.statesCount, s.m.statesCount); - DoSwap(m.initial, s.m.initial); - DoSwap(m_transitions, s.m_transitions); - } - - SimpleScanner& operator = (const SimpleScanner& s) { SimpleScanner(s).Swap(*this); return *this; } - - ~SimpleScanner() = default; - - /* - * Constructs the scanner from mmap()-ed memory range, returning a pointer - * to unconsumed part of the buffer. - */ - const void* Mmap(const void* ptr, size_t size) - { - Impl::CheckAlign(ptr); - SimpleScanner s; - - const size_t* p = reinterpret_cast<const size_t*>(ptr); - Impl::ValidateHeader(p, size, ScannerIOTypes::SimpleScanner, sizeof(m)); - if (size < sizeof(s.m)) - throw Error("EOF reached while mapping NPire::Scanner"); - - memcpy(&s.m, p, sizeof(s.m)); - Impl::AdvancePtr(p, size, sizeof(s.m)); - Impl::AlignPtr(p, size); - - bool empty = *((const bool*) p); - Impl::AdvancePtr(p, size, sizeof(empty)); - Impl::AlignPtr(p, size); - - if (empty) - s.Alias(Null()); - else { - if (size < s.BufSize()) - throw Error("EOF reached while mapping NPire::Scanner"); - s.Markup(const_cast<size_t*>(p)); - s.m.initial += reinterpret_cast<size_t>(s.m_transitions); - - Swap(s); - Impl::AdvancePtr(p, size, BufSize()); - } - return Impl::AlignPtr(p, size); - } - - size_t StateIndex(State s) const - { - return (s - reinterpret_cast<size_t>(m_transitions)) / (STATE_ROW_SIZE * sizeof(Transition)); - } - - // Returns the size of the memory buffer used (or required) by scanner. - size_t BufSize() const - { - return STATE_ROW_SIZE * m.statesCount * sizeof(Transition); // Transitions table - } - - void Save(yostream*) const; - void Load(yistream*); + typedef size_t Transition; + typedef ui16 Letter; + typedef ui32 Action; + typedef ui8 Tag; + + SimpleScanner() { Alias(Null()); } + + explicit SimpleScanner(Fsm& fsm, size_t distance = 0); + + size_t Size() const { return m.statesCount; } + bool Empty() const { return m_transitions == Null().m_transitions; } + + typedef size_t State; + + size_t RegexpsCount() const { return Empty() ? 0 : 1; } + size_t LettersCount() const { return MaxChar; } + + /// Checks whether specified state is in any of the final sets + bool Final(const State& state) const { return *(((const Transition*) state) - 1) != 0; } + + bool Dead(const State&) const { return false; } + + ypair<const size_t*, const size_t*> AcceptedRegexps(const State& s) const { + return Final(s) ? Accept() : Deny(); + } + + /// returns an initial state for this scanner + void Initialize(State& state) const { state = m.initial; } + + /// Handles one characters + Action Next(State& state, Char c) const + { + Transition shift = reinterpret_cast<const Transition*>(state)[c]; + state += shift; + return 0; + } + + bool TakeAction(State&, Action) const { return false; } + + SimpleScanner(const SimpleScanner& s): m(s.m) + { + if (!s.m_buffer) { + // Empty or mmap()-ed scanner, just copy pointers + m_buffer = 0; + m_transitions = s.m_transitions; + } else { + // In-memory scanner, perform deep copy + m_buffer = BufferType(new char[BufSize()]); + memcpy(m_buffer.Get(), s.m_buffer.Get(), BufSize()); + Markup(m_buffer.Get()); + + m.initial += (m_transitions - s.m_transitions) * sizeof(Transition); + } + } + + // Makes a shallow ("weak") copy of the given scanner. + // The copied scanner does not maintain lifetime of the original's entrails. + void Alias(const SimpleScanner& s) + { + m = s.m; + m_buffer.Reset(); + m_transitions = s.m_transitions; + } + + void Swap(SimpleScanner& s) + { + DoSwap(m_buffer, s.m_buffer); + DoSwap(m.statesCount, s.m.statesCount); + DoSwap(m.initial, s.m.initial); + DoSwap(m_transitions, s.m_transitions); + } + + SimpleScanner& operator = (const SimpleScanner& s) { SimpleScanner(s).Swap(*this); return *this; } + + ~SimpleScanner() = default; + + /* + * Constructs the scanner from mmap()-ed memory range, returning a pointer + * to unconsumed part of the buffer. + */ + const void* Mmap(const void* ptr, size_t size) + { + Impl::CheckAlign(ptr); + SimpleScanner s; + + const size_t* p = reinterpret_cast<const size_t*>(ptr); + Impl::ValidateHeader(p, size, ScannerIOTypes::SimpleScanner, sizeof(m)); + if (size < sizeof(s.m)) + throw Error("EOF reached while mapping NPire::Scanner"); + + memcpy(&s.m, p, sizeof(s.m)); + Impl::AdvancePtr(p, size, sizeof(s.m)); + Impl::AlignPtr(p, size); + + bool empty = *((const bool*) p); + Impl::AdvancePtr(p, size, sizeof(empty)); + Impl::AlignPtr(p, size); + + if (empty) + s.Alias(Null()); + else { + if (size < s.BufSize()) + throw Error("EOF reached while mapping NPire::Scanner"); + s.Markup(const_cast<size_t*>(p)); + s.m.initial += reinterpret_cast<size_t>(s.m_transitions); + + Swap(s); + Impl::AdvancePtr(p, size, BufSize()); + } + return Impl::AlignPtr(p, size); + } + + size_t StateIndex(State s) const + { + return (s - reinterpret_cast<size_t>(m_transitions)) / (STATE_ROW_SIZE * sizeof(Transition)); + } + + // Returns the size of the memory buffer used (or required) by scanner. + size_t BufSize() const + { + return STATE_ROW_SIZE * m.statesCount * sizeof(Transition); // Transitions table + } + + void Save(yostream*) const; + void Load(yistream*); protected: - struct Locals { - size_t statesCount; - size_t initial; - } m; - - using BufferType = TArrayHolder<char>; - BufferType m_buffer; - - Transition* m_transitions; - - inline static const SimpleScanner& Null() - { - static const SimpleScanner n = Fsm::MakeFalse().Compile<SimpleScanner>(); - return n; - } - - static ypair<const size_t*, const size_t*> Accept() - { - static size_t v[1] = { 0 }; - return ymake_pair(v, v + 1); - } - - static ypair<const size_t*, const size_t*> Deny() - { - static size_t v[1] = { 0 }; - return ymake_pair(v, v); - } - - /* - * Initializes pointers depending on buffer start, letters and states count - */ - void Markup(void* ptr) - { - m_transitions = reinterpret_cast<Transition*>(ptr); - } - - void SetJump(size_t oldState, Char c, size_t newState) - { - Y_ASSERT(m_buffer); - Y_ASSERT(oldState < m.statesCount); - Y_ASSERT(newState < m.statesCount); - m_transitions[oldState * STATE_ROW_SIZE + 1 + c] - = (((newState - oldState) * STATE_ROW_SIZE) * sizeof(Transition)); - } - - unsigned long RemapAction(unsigned long action) { return action; } - - void SetInitial(size_t state) - { - Y_ASSERT(m_buffer); - m.initial = reinterpret_cast<size_t>(m_transitions + state * STATE_ROW_SIZE + 1); - } - - void SetTag(size_t state, size_t tag) - { - Y_ASSERT(m_buffer); - m_transitions[state * STATE_ROW_SIZE] = tag; - } + struct Locals { + size_t statesCount; + size_t initial; + } m; + + using BufferType = TArrayHolder<char>; + BufferType m_buffer; + + Transition* m_transitions; + + inline static const SimpleScanner& Null() + { + static const SimpleScanner n = Fsm::MakeFalse().Compile<SimpleScanner>(); + return n; + } + + static ypair<const size_t*, const size_t*> Accept() + { + static size_t v[1] = { 0 }; + return ymake_pair(v, v + 1); + } + + static ypair<const size_t*, const size_t*> Deny() + { + static size_t v[1] = { 0 }; + return ymake_pair(v, v); + } + + /* + * Initializes pointers depending on buffer start, letters and states count + */ + void Markup(void* ptr) + { + m_transitions = reinterpret_cast<Transition*>(ptr); + } + + void SetJump(size_t oldState, Char c, size_t newState) + { + Y_ASSERT(m_buffer); + Y_ASSERT(oldState < m.statesCount); + Y_ASSERT(newState < m.statesCount); + m_transitions[oldState * STATE_ROW_SIZE + 1 + c] + = (((newState - oldState) * STATE_ROW_SIZE) * sizeof(Transition)); + } + + unsigned long RemapAction(unsigned long action) { return action; } + + void SetInitial(size_t state) + { + Y_ASSERT(m_buffer); + m.initial = reinterpret_cast<size_t>(m_transitions + state * STATE_ROW_SIZE + 1); + } + + void SetTag(size_t state, size_t tag) + { + Y_ASSERT(m_buffer); + m_transitions[state * STATE_ROW_SIZE] = tag; + } }; inline SimpleScanner::SimpleScanner(Fsm& fsm, size_t distance) { - if (distance) { - fsm = CreateApproxFsm(fsm, distance); - } - fsm.Canonize(); - - m.statesCount = fsm.Size(); - m_buffer = BufferType(new char[BufSize()]); - memset(m_buffer.Get(), 0, BufSize()); - Markup(m_buffer.Get()); - m.initial = reinterpret_cast<size_t>(m_transitions + fsm.Initial() * STATE_ROW_SIZE + 1); - for (size_t state = 0; state < fsm.Size(); ++state) - SetTag(state, fsm.Tag(state) | (fsm.IsFinal(state) ? 1 : 0)); - - for (size_t from = 0; from != fsm.Size(); ++from) - for (auto&& i : fsm.Letters()) { - const auto& tos = fsm.Destinations(from, i.first); - if (tos.empty()) - continue; - for (auto&& l : i.second.second) - for (auto&& to : tos) - SetJump(from, l, to); - } + if (distance) { + fsm = CreateApproxFsm(fsm, distance); + } + fsm.Canonize(); + + m.statesCount = fsm.Size(); + m_buffer = BufferType(new char[BufSize()]); + memset(m_buffer.Get(), 0, BufSize()); + Markup(m_buffer.Get()); + m.initial = reinterpret_cast<size_t>(m_transitions + fsm.Initial() * STATE_ROW_SIZE + 1); + for (size_t state = 0; state < fsm.Size(); ++state) + SetTag(state, fsm.Tag(state) | (fsm.IsFinal(state) ? 1 : 0)); + + for (size_t from = 0; from != fsm.Size(); ++from) + for (auto&& i : fsm.Letters()) { + const auto& tos = fsm.Destinations(from, i.first); + if (tos.empty()) + continue; + for (auto&& l : i.second.second) + for (auto&& to : tos) + SetJump(from, l, to); + } } diff --git a/library/cpp/regex/pire/pire/scanners/slow.h b/library/cpp/regex/pire/pire/scanners/slow.h index 5f90e4d5dde..0c26499d17e 100644 --- a/library/cpp/regex/pire/pire/scanners/slow.h +++ b/library/cpp/regex/pire/pire/scanners/slow.h @@ -51,380 +51,380 @@ namespace Pire { */ class SlowScanner { public: - typedef size_t Transition; - typedef ui16 Letter; - typedef ui32 Action; - typedef ui8 Tag; + typedef size_t Transition; + typedef ui16 Letter; + typedef ui32 Action; + typedef ui8 Tag; - enum { - FinalFlag = 1, - DeadFlag = 0 - }; + enum { + FinalFlag = 1, + DeadFlag = 0 + }; - struct State { - TVector<unsigned> states; - BitSet flags; + struct State { + TVector<unsigned> states; + BitSet flags; - State() {} - State(size_t size): flags(size) { states.reserve(size); } - void Swap(State& s) { states.swap(s.states); flags.Swap(s.flags); } + State() {} + State(size_t size): flags(size) { states.reserve(size); } + void Swap(State& s) { states.swap(s.states); flags.Swap(s.flags); } #ifdef PIRE_DEBUG - friend yostream& operator << (yostream& stream, const State& state) { return stream << Join(state.states.begin(), state.states.end(), ", "); } + friend yostream& operator << (yostream& stream, const State& state) { return stream << Join(state.states.begin(), state.states.end(), ", "); } #endif - }; - - SlowScanner(bool needActions = false) { - Alias(Null()); - need_actions = needActions; - } - - size_t GetLettersCount() const {return m.lettersCount; }; - - size_t Size() const { return GetSize(); } - size_t GetSize() const { return m.statesCount; } - bool Empty() const { return m_finals == Null().m_finals; } - - size_t Id() const {return (size_t) -1;} - size_t RegexpsCount() const { return Empty() ? 0 : 1; } - - void Initialize(State& state) const - { - state.states.clear(); - state.states.reserve(m.statesCount); - state.states.push_back(m.start); - BitSet(m.statesCount).Swap(state.flags); - } - - Char Translate(Char ch) const - { - return m_letters[static_cast<size_t>(ch)]; - } - - Action NextTranslated(const State& current, State& next, Char l) const - { - next.flags.Clear(); - next.states.clear(); - for (auto&& state : current.states) { - const unsigned* begin = 0; - const unsigned* end = 0; - if (!m_vecptr) { - const size_t* pos = m_jumpPos + state * m.lettersCount + l; - begin = m_jumps + pos[0]; - end = m_jumps + pos[1]; - } else { - const auto& v = (*m_vecptr)[state * m.lettersCount + l]; - if (!v.empty()) { - begin = &v[0]; - end = &v[0] + v.size(); - } - } - - for (; begin != end; ++begin) - if (!next.flags.Test(*begin)) { - next.flags.Set(*begin); - next.states.push_back(*begin); - } - } - - return 0; - } - - Action Next(const State& current, State& next, Char c) const - { - return NextTranslated(current, next, Translate(c)); - } - - bool TakeAction(State&, Action) const { return false; } - - Action NextTranslated(State& s, Char l) const - { - State dest(m.statesCount); - Action a = NextTranslated(s, dest, l); - s.Swap(dest); - return a; - } - - Action Next(State& s, Char c) const - { - return NextTranslated(s, Translate(c)); - } - - bool Final(const State& s) const - { - for (auto&& state : s.states) - if (m_finals[state]) - return true; - return false; - } - - bool Dead(const State&) const - { - return false; - } - - ypair<const size_t*, const size_t*> AcceptedRegexps(const State& s) const { - return Final(s) ? Accept() : Deny(); - } - - bool CanStop(const State& s) const { - return Final(s); - } - - const void* Mmap(const void* ptr, size_t size) - { - Impl::CheckAlign(ptr); - SlowScanner s; - const size_t* p = reinterpret_cast<const size_t*>(ptr); - - Impl::ValidateHeader(p, size, ScannerIOTypes::SlowScanner, sizeof(s.m)); - Locals* locals; - Impl::MapPtr(locals, 1, p, size); - memcpy(&s.m, locals, sizeof(s.m)); - - bool empty = *((const bool*) p); - Impl::AdvancePtr(p, size, sizeof(empty)); - Impl::AlignPtr(p, size); - - if (empty) - s.Alias(Null()); - else { - s.m_vecptr = 0; - Impl::MapPtr(s.m_letters, MaxChar, p, size); - Impl::MapPtr(s.m_finals, s.m.statesCount, p, size); - Impl::MapPtr(s.m_jumpPos, s.m.statesCount * s.m.lettersCount + 1, p, size); - Impl::MapPtr(s.m_jumps, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size); - if (need_actions) - Impl::MapPtr(s.m_actions, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size); - Swap(s); - } - return (const void*) p; - } - - void Swap(SlowScanner& s) - { - DoSwap(m_finals, s.m_finals); - DoSwap(m_jumps, s.m_jumps); - DoSwap(m_actions, s.m_actions); - DoSwap(m_jumpPos, s.m_jumpPos); - DoSwap(m.statesCount, s.m.statesCount); - DoSwap(m.lettersCount, s.m.lettersCount); - DoSwap(m.start, s.m.start); - DoSwap(m_letters, s.m_letters); - DoSwap(m_pool, s.m_pool); - DoSwap(m_vec, s.m_vec); - - DoSwap(m_vecptr, s.m_vecptr); - DoSwap(need_actions, s.need_actions); - DoSwap(m_actionsvec, s.m_actionsvec); - if (m_vecptr == &s.m_vec) - m_vecptr = &m_vec; - if (s.m_vecptr == &m_vec) - s.m_vecptr = &s.m_vec; - } - - SlowScanner(const SlowScanner& s) - : m(s.m) - , m_vec(s.m_vec) - , need_actions(s.need_actions) - , m_actionsvec(s.m_actionsvec) - { - if (s.m_vec.empty()) { - // Empty or mmap()-ed scanner, just copy pointers - m_finals = s.m_finals; - m_jumps = s.m_jumps; - m_actions = s.m_actions; - m_jumpPos = s.m_jumpPos; - m_letters = s.m_letters; - m_vecptr = 0; - } else { - // In-memory scanner, perform deep copy - alloc(m_letters, MaxChar); - memcpy(m_letters, s.m_letters, sizeof(*m_letters) * MaxChar); - m_jumps = 0; - m_jumpPos = 0; - m_actions = 0; - alloc(m_finals, m.statesCount); - memcpy(m_finals, s.m_finals, sizeof(*m_finals) * m.statesCount); - m_vecptr = &m_vec; - } - } - - explicit SlowScanner(Fsm& fsm, bool needActions = false, bool removeEpsilons = true, size_t distance = 0) - : need_actions(needActions) - { - if (distance) { - fsm = CreateApproxFsm(fsm, distance); - } - if (removeEpsilons) - fsm.RemoveEpsilons(); - fsm.Sparse(!removeEpsilons); - - m.statesCount = fsm.Size(); - m.lettersCount = fsm.Letters().Size(); - - m_vec.resize(m.statesCount * m.lettersCount); - if (need_actions) - m_actionsvec.resize(m.statesCount * m.lettersCount); - m_vecptr = &m_vec; - alloc(m_letters, MaxChar); - m_jumps = 0; - m_actions = 0; - m_jumpPos = 0; - alloc(m_finals, m.statesCount); - - // Build letter translation table - Fill(m_letters, m_letters + MaxChar, 0); - for (auto&& letter : fsm.Letters()) - for (auto&& character : letter.second.second) - m_letters[character] = letter.second.first; - - m.start = fsm.Initial(); - BuildScanner(fsm, *this); - } - - - SlowScanner& operator = (const SlowScanner& s) { SlowScanner(s).Swap(*this); return *this; } - - ~SlowScanner() - { - for (auto&& i : m_pool) - free(i); - } - - void Save(yostream*) const; - void Load(yistream*); - - const State& StateIndex(const State& s) const { return s; } + }; + + SlowScanner(bool needActions = false) { + Alias(Null()); + need_actions = needActions; + } + + size_t GetLettersCount() const {return m.lettersCount; }; + + size_t Size() const { return GetSize(); } + size_t GetSize() const { return m.statesCount; } + bool Empty() const { return m_finals == Null().m_finals; } + + size_t Id() const {return (size_t) -1;} + size_t RegexpsCount() const { return Empty() ? 0 : 1; } + + void Initialize(State& state) const + { + state.states.clear(); + state.states.reserve(m.statesCount); + state.states.push_back(m.start); + BitSet(m.statesCount).Swap(state.flags); + } + + Char Translate(Char ch) const + { + return m_letters[static_cast<size_t>(ch)]; + } + + Action NextTranslated(const State& current, State& next, Char l) const + { + next.flags.Clear(); + next.states.clear(); + for (auto&& state : current.states) { + const unsigned* begin = 0; + const unsigned* end = 0; + if (!m_vecptr) { + const size_t* pos = m_jumpPos + state * m.lettersCount + l; + begin = m_jumps + pos[0]; + end = m_jumps + pos[1]; + } else { + const auto& v = (*m_vecptr)[state * m.lettersCount + l]; + if (!v.empty()) { + begin = &v[0]; + end = &v[0] + v.size(); + } + } + + for (; begin != end; ++begin) + if (!next.flags.Test(*begin)) { + next.flags.Set(*begin); + next.states.push_back(*begin); + } + } + + return 0; + } + + Action Next(const State& current, State& next, Char c) const + { + return NextTranslated(current, next, Translate(c)); + } + + bool TakeAction(State&, Action) const { return false; } + + Action NextTranslated(State& s, Char l) const + { + State dest(m.statesCount); + Action a = NextTranslated(s, dest, l); + s.Swap(dest); + return a; + } + + Action Next(State& s, Char c) const + { + return NextTranslated(s, Translate(c)); + } + + bool Final(const State& s) const + { + for (auto&& state : s.states) + if (m_finals[state]) + return true; + return false; + } + + bool Dead(const State&) const + { + return false; + } + + ypair<const size_t*, const size_t*> AcceptedRegexps(const State& s) const { + return Final(s) ? Accept() : Deny(); + } + + bool CanStop(const State& s) const { + return Final(s); + } + + const void* Mmap(const void* ptr, size_t size) + { + Impl::CheckAlign(ptr); + SlowScanner s; + const size_t* p = reinterpret_cast<const size_t*>(ptr); + + Impl::ValidateHeader(p, size, ScannerIOTypes::SlowScanner, sizeof(s.m)); + Locals* locals; + Impl::MapPtr(locals, 1, p, size); + memcpy(&s.m, locals, sizeof(s.m)); + + bool empty = *((const bool*) p); + Impl::AdvancePtr(p, size, sizeof(empty)); + Impl::AlignPtr(p, size); + + if (empty) + s.Alias(Null()); + else { + s.m_vecptr = 0; + Impl::MapPtr(s.m_letters, MaxChar, p, size); + Impl::MapPtr(s.m_finals, s.m.statesCount, p, size); + Impl::MapPtr(s.m_jumpPos, s.m.statesCount * s.m.lettersCount + 1, p, size); + Impl::MapPtr(s.m_jumps, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size); + if (need_actions) + Impl::MapPtr(s.m_actions, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size); + Swap(s); + } + return (const void*) p; + } + + void Swap(SlowScanner& s) + { + DoSwap(m_finals, s.m_finals); + DoSwap(m_jumps, s.m_jumps); + DoSwap(m_actions, s.m_actions); + DoSwap(m_jumpPos, s.m_jumpPos); + DoSwap(m.statesCount, s.m.statesCount); + DoSwap(m.lettersCount, s.m.lettersCount); + DoSwap(m.start, s.m.start); + DoSwap(m_letters, s.m_letters); + DoSwap(m_pool, s.m_pool); + DoSwap(m_vec, s.m_vec); + + DoSwap(m_vecptr, s.m_vecptr); + DoSwap(need_actions, s.need_actions); + DoSwap(m_actionsvec, s.m_actionsvec); + if (m_vecptr == &s.m_vec) + m_vecptr = &m_vec; + if (s.m_vecptr == &m_vec) + s.m_vecptr = &s.m_vec; + } + + SlowScanner(const SlowScanner& s) + : m(s.m) + , m_vec(s.m_vec) + , need_actions(s.need_actions) + , m_actionsvec(s.m_actionsvec) + { + if (s.m_vec.empty()) { + // Empty or mmap()-ed scanner, just copy pointers + m_finals = s.m_finals; + m_jumps = s.m_jumps; + m_actions = s.m_actions; + m_jumpPos = s.m_jumpPos; + m_letters = s.m_letters; + m_vecptr = 0; + } else { + // In-memory scanner, perform deep copy + alloc(m_letters, MaxChar); + memcpy(m_letters, s.m_letters, sizeof(*m_letters) * MaxChar); + m_jumps = 0; + m_jumpPos = 0; + m_actions = 0; + alloc(m_finals, m.statesCount); + memcpy(m_finals, s.m_finals, sizeof(*m_finals) * m.statesCount); + m_vecptr = &m_vec; + } + } + + explicit SlowScanner(Fsm& fsm, bool needActions = false, bool removeEpsilons = true, size_t distance = 0) + : need_actions(needActions) + { + if (distance) { + fsm = CreateApproxFsm(fsm, distance); + } + if (removeEpsilons) + fsm.RemoveEpsilons(); + fsm.Sparse(!removeEpsilons); + + m.statesCount = fsm.Size(); + m.lettersCount = fsm.Letters().Size(); + + m_vec.resize(m.statesCount * m.lettersCount); + if (need_actions) + m_actionsvec.resize(m.statesCount * m.lettersCount); + m_vecptr = &m_vec; + alloc(m_letters, MaxChar); + m_jumps = 0; + m_actions = 0; + m_jumpPos = 0; + alloc(m_finals, m.statesCount); + + // Build letter translation table + Fill(m_letters, m_letters + MaxChar, 0); + for (auto&& letter : fsm.Letters()) + for (auto&& character : letter.second.second) + m_letters[character] = letter.second.first; + + m.start = fsm.Initial(); + BuildScanner(fsm, *this); + } + + + SlowScanner& operator = (const SlowScanner& s) { SlowScanner(s).Swap(*this); return *this; } + + ~SlowScanner() + { + for (auto&& i : m_pool) + free(i); + } + + void Save(yostream*) const; + void Load(yistream*); + + const State& StateIndex(const State& s) const { return s; } protected: - bool IsMmaped() const - { - return (!m_vecptr); - } - - size_t GetJump(size_t pos) const - { - return m_jumps[pos]; - } - - Action& GetAction(size_t pos) const - { - return m_actions[pos]; - } - - const TVector<Action>& GetActionsVec(size_t from) const - { - return m_actionsvec[from]; - } - - const TVector<unsigned int>& GetJumpsVec(size_t from) const - { - return m_vec[from]; - } - - size_t* GetJumpPos() const - { - return m_jumpPos; - } - - size_t GetStart() const - { - return m.start; - } - - bool IsFinal(size_t pos) const - { - return m_finals[pos]; - } + bool IsMmaped() const + { + return (!m_vecptr); + } + + size_t GetJump(size_t pos) const + { + return m_jumps[pos]; + } + + Action& GetAction(size_t pos) const + { + return m_actions[pos]; + } + + const TVector<Action>& GetActionsVec(size_t from) const + { + return m_actionsvec[from]; + } + + const TVector<unsigned int>& GetJumpsVec(size_t from) const + { + return m_vec[from]; + } + + size_t* GetJumpPos() const + { + return m_jumpPos; + } + + size_t GetStart() const + { + return m.start; + } + + bool IsFinal(size_t pos) const + { + return m_finals[pos]; + } private: - struct Locals { - size_t statesCount; - size_t lettersCount; - size_t start; - } m; - - bool* m_finals; - unsigned* m_jumps; - Action* m_actions; - size_t* m_jumpPos; - size_t* m_letters; - - TVector<void*> m_pool; - TVector< TVector<unsigned> > m_vec, *m_vecptr; - - bool need_actions; - TVector<TVector<Action>> m_actionsvec; - static const SlowScanner& Null(); - - template<class T> void alloc(T*& p, size_t size) - { - p = static_cast<T*>(malloc(size * sizeof(T))); - memset(p, 0, size * sizeof(T)); - m_pool.push_back(p); - } - - void Alias(const SlowScanner& s) - { - memcpy(&m, &s.m, sizeof(m)); - m_vec.clear(); - need_actions = s.need_actions; - m_actionsvec.clear(); - m_finals = s.m_finals; - m_jumps = s.m_jumps; - m_actions = s.m_actions; - m_jumpPos = s.m_jumpPos; - m_letters = s.m_letters; - m_vecptr = s.m_vecptr; - m_pool.clear(); - } - - void SetJump(size_t oldState, Char c, size_t newState, unsigned long action) - { - Y_ASSERT(!m_vec.empty()); - Y_ASSERT(oldState < m.statesCount); - Y_ASSERT(newState < m.statesCount); - - size_t idx = oldState * m.lettersCount + m_letters[c]; - m_vec[idx].push_back(newState); - if (need_actions) - m_actionsvec[idx].push_back(action); - } - - unsigned long RemapAction(unsigned long action) { return action; } - - void SetInitial(size_t state) { m.start = state; } - void SetTag(size_t state, ui8 tag) { m_finals[state] = (tag != 0); } - - void FinishBuild() {} - - static ypair<const size_t*, const size_t*> Accept() - { - static size_t v[1] = { 0 }; - - return ymake_pair(v, v + 1); - } - - static ypair<const size_t*, const size_t*> Deny() - { - static size_t v[1] = { 0 }; - return ymake_pair(v, v); - } - - friend void BuildScanner<SlowScanner>(const Fsm&, SlowScanner&); + struct Locals { + size_t statesCount; + size_t lettersCount; + size_t start; + } m; + + bool* m_finals; + unsigned* m_jumps; + Action* m_actions; + size_t* m_jumpPos; + size_t* m_letters; + + TVector<void*> m_pool; + TVector< TVector<unsigned> > m_vec, *m_vecptr; + + bool need_actions; + TVector<TVector<Action>> m_actionsvec; + static const SlowScanner& Null(); + + template<class T> void alloc(T*& p, size_t size) + { + p = static_cast<T*>(malloc(size * sizeof(T))); + memset(p, 0, size * sizeof(T)); + m_pool.push_back(p); + } + + void Alias(const SlowScanner& s) + { + memcpy(&m, &s.m, sizeof(m)); + m_vec.clear(); + need_actions = s.need_actions; + m_actionsvec.clear(); + m_finals = s.m_finals; + m_jumps = s.m_jumps; + m_actions = s.m_actions; + m_jumpPos = s.m_jumpPos; + m_letters = s.m_letters; + m_vecptr = s.m_vecptr; + m_pool.clear(); + } + + void SetJump(size_t oldState, Char c, size_t newState, unsigned long action) + { + Y_ASSERT(!m_vec.empty()); + Y_ASSERT(oldState < m.statesCount); + Y_ASSERT(newState < m.statesCount); + + size_t idx = oldState * m.lettersCount + m_letters[c]; + m_vec[idx].push_back(newState); + if (need_actions) + m_actionsvec[idx].push_back(action); + } + + unsigned long RemapAction(unsigned long action) { return action; } + + void SetInitial(size_t state) { m.start = state; } + void SetTag(size_t state, ui8 tag) { m_finals[state] = (tag != 0); } + + void FinishBuild() {} + + static ypair<const size_t*, const size_t*> Accept() + { + static size_t v[1] = { 0 }; + + return ymake_pair(v, v + 1); + } + + static ypair<const size_t*, const size_t*> Deny() + { + static size_t v[1] = { 0 }; + return ymake_pair(v, v); + } + + friend void BuildScanner<SlowScanner>(const Fsm&, SlowScanner&); }; template<> inline SlowScanner Fsm::Compile(size_t distance) { - return SlowScanner(*this, false, true, distance); + return SlowScanner(*this, false, true, distance); } inline const SlowScanner& SlowScanner::Null() { - static const SlowScanner n = Fsm::MakeFalse().Compile<SlowScanner>(); - return n; + static const SlowScanner n = Fsm::MakeFalse().Compile<SlowScanner>(); + return n; } #ifndef PIRE_DEBUG @@ -433,18 +433,18 @@ inline const SlowScanner& SlowScanner::Null() template<> inline void Run<SlowScanner>(const SlowScanner& scanner, SlowScanner::State& state, TStringBuf str) { - SlowScanner::State temp; - scanner.Initialize(temp); - - SlowScanner::State* src = &state; - SlowScanner::State* dest = &temp; - - for (auto it = str.begin(); it != str.end(); ++it) { - scanner.Next(*src, *dest, static_cast<unsigned char>(*it)); - DoSwap(src, dest); - } - if (src != &state) - state = *src; + SlowScanner::State temp; + scanner.Initialize(temp); + + SlowScanner::State* src = &state; + SlowScanner::State* dest = &temp; + + for (auto it = str.begin(); it != str.end(); ++it) { + scanner.Next(*src, *dest, static_cast<unsigned char>(*it)); + DoSwap(src, dest); + } + if (src != &state) + state = *src; } #endif diff --git a/library/cpp/regex/pire/pire/static_assert.h b/library/cpp/regex/pire/pire/static_assert.h index 90dd0ff4f01..5d671a1624f 100644 --- a/library/cpp/regex/pire/pire/static_assert.h +++ b/library/cpp/regex/pire/pire/static_assert.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -24,7 +24,7 @@ #define PIRE_ASSERT_H_INCLUDED namespace Pire { namespace Impl { - + // A static (compile-tile) assertion. // The idea was shamelessly borrowed from Boost. template<bool x> struct StaticAssertion; diff --git a/library/cpp/regex/pire/pire/stub/singleton.h b/library/cpp/regex/pire/pire/stub/singleton.h index f24e9244607..99fb6578f19 100644 --- a/library/cpp/regex/pire/pire/stub/singleton.h +++ b/library/cpp/regex/pire/pire/stub/singleton.h @@ -3,6 +3,6 @@ namespace Pire { template<class T> const T& DefaultValue() { - return Default<T>(); + return Default<T>(); } } diff --git a/library/cpp/regex/pire/pire/vbitset.h b/library/cpp/regex/pire/pire/vbitset.h index e255031b070..62b85aa05c3 100644 --- a/library/cpp/regex/pire/pire/vbitset.h +++ b/library/cpp/regex/pire/pire/vbitset.h @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -40,78 +40,78 @@ namespace Pire { /// A bitset with variable width class BitSet { public: - typedef size_t value_type; - typedef size_t* pointer; - typedef size_t& reference; - typedef const size_t& const_reference; - - class const_iterator; - - BitSet() - : m_data(1, 1) - { - } - BitSet(size_t size) - : m_data(RoundUp(size + 1) + 1) - , m_size(size) - { - m_data[RoundDown(size)] |= (1U << Remainder(size)); - } - - void Swap(BitSet& s) - { - m_data.swap(s.m_data); - DoSwap(m_size, s.m_size); - } - - /// Sets the specified bit to 1. - void Set(size_t pos) { - m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] |= (1U << Remainder(pos)); - } - - /// Resets the specified bit to 0. - void Reset(size_t pos) { - m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] &= ~(1U << Remainder(pos)); - } - - /// Checks whether the specified bit is set to 1. - bool Test(size_t pos) const { - return (m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] & (1U << Remainder(pos))) != 0; - } - - size_t Size() const { - return m_size; - } - - void Resize(size_t newsize) - { - m_data.resize(RoundUp(newsize + 1)); - if (Remainder(newsize) && !m_data.empty()) - m_data[m_data.size() - 1] &= ((1U << Remainder(newsize)) - 1); // Clear tail - m_data[RoundDown(newsize)] |= (1U << Remainder(newsize)); - } - - /// Resets all bits to 0. - void Clear() { memset(&m_data[0], 0, m_data.size() * sizeof(ContainerType)); } + typedef size_t value_type; + typedef size_t* pointer; + typedef size_t& reference; + typedef const size_t& const_reference; + + class const_iterator; + + BitSet() + : m_data(1, 1) + { + } + BitSet(size_t size) + : m_data(RoundUp(size + 1) + 1) + , m_size(size) + { + m_data[RoundDown(size)] |= (1U << Remainder(size)); + } + + void Swap(BitSet& s) + { + m_data.swap(s.m_data); + DoSwap(m_size, s.m_size); + } + + /// Sets the specified bit to 1. + void Set(size_t pos) { + m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] |= (1U << Remainder(pos)); + } + + /// Resets the specified bit to 0. + void Reset(size_t pos) { + m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] &= ~(1U << Remainder(pos)); + } + + /// Checks whether the specified bit is set to 1. + bool Test(size_t pos) const { + return (m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] & (1U << Remainder(pos))) != 0; + } + + size_t Size() const { + return m_size; + } + + void Resize(size_t newsize) + { + m_data.resize(RoundUp(newsize + 1)); + if (Remainder(newsize) && !m_data.empty()) + m_data[m_data.size() - 1] &= ((1U << Remainder(newsize)) - 1); // Clear tail + m_data[RoundDown(newsize)] |= (1U << Remainder(newsize)); + } + + /// Resets all bits to 0. + void Clear() { memset(&m_data[0], 0, m_data.size() * sizeof(ContainerType)); } private: - typedef unsigned char ContainerType; - static const size_t ItemSize = sizeof(ContainerType) * 8; - TVector<ContainerType> m_data; - size_t m_size; + typedef unsigned char ContainerType; + static const size_t ItemSize = sizeof(ContainerType) * 8; + TVector<ContainerType> m_data; + size_t m_size; - static size_t RoundUp(size_t x) { return x / ItemSize + ((x % ItemSize) ? 1 : 0); } - static size_t RoundDown(size_t x) { return x / ItemSize; } - static size_t Remainder(size_t x) { return x % ItemSize; } + static size_t RoundUp(size_t x) { return x / ItemSize + ((x % ItemSize) ? 1 : 0); } + static size_t RoundDown(size_t x) { return x / ItemSize; } + static size_t Remainder(size_t x) { return x % ItemSize; } #ifdef _DEBUG - size_t CheckSize(size_t size) const - { - if (size < m_size) - return size; - else - throw Error("BitSet: subscript out of range"); - } + size_t CheckSize(size_t size) const + { + if (size < m_size) + return size; + else + throw Error("BitSet: subscript out of range"); + } #endif }; diff --git a/library/cpp/regex/pire/ut/approx_matching_ut.cpp b/library/cpp/regex/pire/ut/approx_matching_ut.cpp index 0454b46a868..f8a85c271f4 100644 --- a/library/cpp/regex/pire/ut/approx_matching_ut.cpp +++ b/library/cpp/regex/pire/ut/approx_matching_ut.cpp @@ -24,356 +24,356 @@ #include "common.h" Y_UNIT_TEST_SUITE(ApproxMatchingTest) { - Pire::Fsm BuildFsm(const char *str) - { - Pire::Lexer lexer; - TVector<wchar32> ucs4; - - lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4)); - lexer.Assign(ucs4.begin(), ucs4.end()); - return lexer.Parse(); - } - - Y_UNIT_TEST(Simple) { - auto fsm = BuildFsm("^ab$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("ab"); - ACCEPTS("ax"); - ACCEPTS("xb"); - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("xab"); - ACCEPTS("axb"); - ACCEPTS("abx"); - ACCEPTS("aab"); - DENIES("xy"); - DENIES("abcd"); - DENIES("xabx"); - DENIES(""); - } - - fsm = BuildFsm("^ab$"); - APPROXIMATE_SCANNER(fsm, 2) { - ACCEPTS("ab"); - ACCEPTS("xy"); - ACCEPTS(""); - ACCEPTS("axbx"); - DENIES("xxabx"); - DENIES("xbxxx"); - } - } - - Y_UNIT_TEST(SpecialSymbols) { - auto fsm = BuildFsm("^.*ab$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("ab"); - ACCEPTS("xxxxab"); - ACCEPTS("xxxxabab"); - DENIES("xxxx"); - DENIES("abxxxx"); - } - - fsm = BuildFsm("^[a-c]$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("c"); - ACCEPTS("/"); - ACCEPTS(""); - ACCEPTS("ax"); - DENIES("xx"); - DENIES("abc"); - } - - fsm = BuildFsm("^x{4}$"); - APPROXIMATE_SCANNER(fsm, 2) { - DENIES ("x"); - ACCEPTS("xx"); - ACCEPTS("xxx"); - ACCEPTS("xxxx"); - ACCEPTS("xxxxx"); - ACCEPTS("xxxxxx"); - DENIES ("xxxxxxx"); - ACCEPTS("xxyy"); - ACCEPTS("xxyyx"); - ACCEPTS("xxxxyz"); - DENIES("xyyy"); - } - - fsm = BuildFsm("^(a|b)$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("x"); - ACCEPTS(""); - ACCEPTS("ax"); - DENIES("abc"); - DENIES("xx"); - } - - fsm = BuildFsm("^(ab|cd)$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("ab"); - ACCEPTS("cd"); - ACCEPTS("ax"); - ACCEPTS("xd"); - ACCEPTS("abx"); - ACCEPTS("a"); - DENIES("abcd"); - DENIES("xx"); - DENIES(""); - } - - fsm = BuildFsm("^[a-c]{3}$"); - APPROXIMATE_SCANNER(fsm, 2) { - ACCEPTS("abc"); - ACCEPTS("aaa"); - ACCEPTS("a"); - ACCEPTS("ax"); - ACCEPTS("abxcx"); - DENIES("x"); - DENIES(""); - DENIES("xaxx"); - } - - fsm = BuildFsm("^\\x{61}$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("a"); - ACCEPTS("x"); - ACCEPTS(""); - ACCEPTS("ax"); - DENIES("axx"); - DENIES("xx"); - } - - fsm = BuildFsm("^a.bc$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("axxbc"); - ACCEPTS("abc"); - ACCEPTS("xabc"); - ACCEPTS("xaxbc"); - DENIES("bc"); - DENIES("abcx"); - } - } - - Y_UNIT_TEST(TestSurrounded) { - auto fsm = BuildFsm("abc").Surround(); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("abc"); - ACCEPTS("xabcx"); - ACCEPTS("xabx"); - ACCEPTS("axc"); - ACCEPTS("bac"); - DENIES("a"); - DENIES("xaxxxx"); - } - - fsm = BuildFsm("^abc$").Surround(); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("abc"); - ACCEPTS("abcx"); - ACCEPTS("xabc"); - ACCEPTS("axc"); - ACCEPTS("bac"); - DENIES("xabx"); - DENIES("axx"); - } - } - - Y_UNIT_TEST(GlueFsm) { - auto fsm = BuildFsm("^a$") | BuildFsm("^b$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS(""); - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("x"); - ACCEPTS("ab"); - DENIES("abb"); - } - - fsm = BuildFsm("^[a-b]$") | BuildFsm("^c{2}$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("cc"); - ACCEPTS("x"); - ACCEPTS("xa"); - ACCEPTS("c"); - ACCEPTS("xc"); - ACCEPTS("cxc"); - ACCEPTS(""); - } - } - - enum MutateOperation { - Begin, - Substitute = Begin, - Delete, - Insert, - End - }; - - ystring ChangeText(const ystring& text, int operation, int pos) - { - auto changedText = text; - switch (operation) { - case MutateOperation::Substitute: - changedText[pos] = 'x'; - break; - case MutateOperation::Delete: - changedText.erase(pos, 1); - break; - case MutateOperation::Insert: - changedText.insert(pos, 1, 'x'); - break; - } - - return changedText; - } - - Y_UNIT_TEST(StressTest) { - ystring text; - for (size_t letter = 0; letter < 10; ++letter) { - text += ystring(3, letter + 'a'); - } - const ystring regexp = "^" + text + "$"; - auto fsm = BuildFsm(regexp.data()); - - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS(text); - - for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { - for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) { - auto changedText = ChangeText(text, operation, pos); - ACCEPTS(changedText); - } - } - } - - APPROXIMATE_SCANNER(fsm, 0) { - ACCEPTS(text); - - for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { - for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) { - auto changedText = ChangeText(text, operation, pos); - DENIES(changedText); - } - } - } - - APPROXIMATE_SCANNER(fsm, 2) { - ACCEPTS(text); - - for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight - size_t posRight = text.size() - posLeft - 1; - for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) { - for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) { - auto changedText = ChangeText(text, operationRight, posRight); - changedText = ChangeText(changedText, operationLeft, posLeft); - ACCEPTS(changedText); - } - } - } - } - - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS(text); - - for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight - size_t posRight = text.size() - posLeft - 1; - for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) { - for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) { - auto changedText = ChangeText(text, operationRight, posRight); - changedText = ChangeText(changedText, operationLeft, posLeft); - DENIES(changedText); - } - } - } - } - } - - Y_UNIT_TEST(SwapLetters) { - auto fsm = BuildFsm("^abc$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("bac"); - ACCEPTS("acb"); - DENIES("cba"); - DENIES("bax"); - } - - fsm = BuildFsm("^abcd$"); - APPROXIMATE_SCANNER(fsm, 2) { - ACCEPTS("bacd"); - ACCEPTS("acbd"); - ACCEPTS("baxd"); - ACCEPTS("badc"); - ACCEPTS("bcad"); - ACCEPTS("bcda"); - DENIES("xcbx"); - DENIES("baxx"); - DENIES("ba"); - DENIES("cdab"); - } - - fsm = BuildFsm("^abc$"); - APPROXIMATE_SCANNER(fsm, 0) { - ACCEPTS("abc"); - DENIES("bac"); - } - - fsm = BuildFsm("^[a-c][1-3]$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("a3"); - ACCEPTS("c"); - ACCEPTS("1"); - ACCEPTS("1a"); - ACCEPTS("3b"); - DENIES("4a"); - } - - fsm = BuildFsm("^.*abc$"); - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS("ab"); - ACCEPTS("xxxxbac"); - DENIES("xxxxa"); - DENIES("xxxxcb"); - } - } - - Y_UNIT_TEST(SwapStressTest){ - ystring text; - for (size_t letter = 0; letter < 30; ++letter) { - text += ystring(1, (letter % 26) + 'a'); - } - const ystring regexp = "^" + text + "$"; - auto fsm = BuildFsm(regexp.data()); - auto changedText = text; - - APPROXIMATE_SCANNER(fsm, 1) { - ACCEPTS(text); - - for (size_t pos = 0; pos < text.size() - 1; ++pos) { - changedText[pos] = text[pos + 1]; - changedText[pos + 1] = text[pos]; - ACCEPTS(changedText); - changedText[pos] = text[pos]; - changedText[pos + 1] = text[pos + 1]; - } - } - - APPROXIMATE_SCANNER(fsm, 0) { - ACCEPTS(text); - - for (size_t pos = 0; pos < text.size() - 1; ++pos) { - changedText[pos] = text[pos + 1]; - changedText[pos + 1] = text[pos]; - DENIES(changedText); - changedText[pos] = text[pos]; - changedText[pos + 1] = text[pos + 1]; - } - } - } + Pire::Fsm BuildFsm(const char *str) + { + Pire::Lexer lexer; + TVector<wchar32> ucs4; + + lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4)); + lexer.Assign(ucs4.begin(), ucs4.end()); + return lexer.Parse(); + } + + Y_UNIT_TEST(Simple) { + auto fsm = BuildFsm("^ab$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("ab"); + ACCEPTS("ax"); + ACCEPTS("xb"); + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("xab"); + ACCEPTS("axb"); + ACCEPTS("abx"); + ACCEPTS("aab"); + DENIES("xy"); + DENIES("abcd"); + DENIES("xabx"); + DENIES(""); + } + + fsm = BuildFsm("^ab$"); + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS("ab"); + ACCEPTS("xy"); + ACCEPTS(""); + ACCEPTS("axbx"); + DENIES("xxabx"); + DENIES("xbxxx"); + } + } + + Y_UNIT_TEST(SpecialSymbols) { + auto fsm = BuildFsm("^.*ab$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("ab"); + ACCEPTS("xxxxab"); + ACCEPTS("xxxxabab"); + DENIES("xxxx"); + DENIES("abxxxx"); + } + + fsm = BuildFsm("^[a-c]$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("c"); + ACCEPTS("/"); + ACCEPTS(""); + ACCEPTS("ax"); + DENIES("xx"); + DENIES("abc"); + } + + fsm = BuildFsm("^x{4}$"); + APPROXIMATE_SCANNER(fsm, 2) { + DENIES ("x"); + ACCEPTS("xx"); + ACCEPTS("xxx"); + ACCEPTS("xxxx"); + ACCEPTS("xxxxx"); + ACCEPTS("xxxxxx"); + DENIES ("xxxxxxx"); + ACCEPTS("xxyy"); + ACCEPTS("xxyyx"); + ACCEPTS("xxxxyz"); + DENIES("xyyy"); + } + + fsm = BuildFsm("^(a|b)$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("x"); + ACCEPTS(""); + ACCEPTS("ax"); + DENIES("abc"); + DENIES("xx"); + } + + fsm = BuildFsm("^(ab|cd)$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("ab"); + ACCEPTS("cd"); + ACCEPTS("ax"); + ACCEPTS("xd"); + ACCEPTS("abx"); + ACCEPTS("a"); + DENIES("abcd"); + DENIES("xx"); + DENIES(""); + } + + fsm = BuildFsm("^[a-c]{3}$"); + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS("abc"); + ACCEPTS("aaa"); + ACCEPTS("a"); + ACCEPTS("ax"); + ACCEPTS("abxcx"); + DENIES("x"); + DENIES(""); + DENIES("xaxx"); + } + + fsm = BuildFsm("^\\x{61}$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("x"); + ACCEPTS(""); + ACCEPTS("ax"); + DENIES("axx"); + DENIES("xx"); + } + + fsm = BuildFsm("^a.bc$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("axxbc"); + ACCEPTS("abc"); + ACCEPTS("xabc"); + ACCEPTS("xaxbc"); + DENIES("bc"); + DENIES("abcx"); + } + } + + Y_UNIT_TEST(TestSurrounded) { + auto fsm = BuildFsm("abc").Surround(); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("abc"); + ACCEPTS("xabcx"); + ACCEPTS("xabx"); + ACCEPTS("axc"); + ACCEPTS("bac"); + DENIES("a"); + DENIES("xaxxxx"); + } + + fsm = BuildFsm("^abc$").Surround(); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("abc"); + ACCEPTS("abcx"); + ACCEPTS("xabc"); + ACCEPTS("axc"); + ACCEPTS("bac"); + DENIES("xabx"); + DENIES("axx"); + } + } + + Y_UNIT_TEST(GlueFsm) { + auto fsm = BuildFsm("^a$") | BuildFsm("^b$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(""); + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("x"); + ACCEPTS("ab"); + DENIES("abb"); + } + + fsm = BuildFsm("^[a-b]$") | BuildFsm("^c{2}$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("cc"); + ACCEPTS("x"); + ACCEPTS("xa"); + ACCEPTS("c"); + ACCEPTS("xc"); + ACCEPTS("cxc"); + ACCEPTS(""); + } + } + + enum MutateOperation { + Begin, + Substitute = Begin, + Delete, + Insert, + End + }; + + ystring ChangeText(const ystring& text, int operation, int pos) + { + auto changedText = text; + switch (operation) { + case MutateOperation::Substitute: + changedText[pos] = 'x'; + break; + case MutateOperation::Delete: + changedText.erase(pos, 1); + break; + case MutateOperation::Insert: + changedText.insert(pos, 1, 'x'); + break; + } + + return changedText; + } + + Y_UNIT_TEST(StressTest) { + ystring text; + for (size_t letter = 0; letter < 10; ++letter) { + text += ystring(3, letter + 'a'); + } + const ystring regexp = "^" + text + "$"; + auto fsm = BuildFsm(regexp.data()); + + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(text); + + for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { + for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) { + auto changedText = ChangeText(text, operation, pos); + ACCEPTS(changedText); + } + } + } + + APPROXIMATE_SCANNER(fsm, 0) { + ACCEPTS(text); + + for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { + for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) { + auto changedText = ChangeText(text, operation, pos); + DENIES(changedText); + } + } + } + + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS(text); + + for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight + size_t posRight = text.size() - posLeft - 1; + for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) { + for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) { + auto changedText = ChangeText(text, operationRight, posRight); + changedText = ChangeText(changedText, operationLeft, posLeft); + ACCEPTS(changedText); + } + } + } + } + + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(text); + + for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight + size_t posRight = text.size() - posLeft - 1; + for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) { + for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) { + auto changedText = ChangeText(text, operationRight, posRight); + changedText = ChangeText(changedText, operationLeft, posLeft); + DENIES(changedText); + } + } + } + } + } + + Y_UNIT_TEST(SwapLetters) { + auto fsm = BuildFsm("^abc$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("bac"); + ACCEPTS("acb"); + DENIES("cba"); + DENIES("bax"); + } + + fsm = BuildFsm("^abcd$"); + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS("bacd"); + ACCEPTS("acbd"); + ACCEPTS("baxd"); + ACCEPTS("badc"); + ACCEPTS("bcad"); + ACCEPTS("bcda"); + DENIES("xcbx"); + DENIES("baxx"); + DENIES("ba"); + DENIES("cdab"); + } + + fsm = BuildFsm("^abc$"); + APPROXIMATE_SCANNER(fsm, 0) { + ACCEPTS("abc"); + DENIES("bac"); + } + + fsm = BuildFsm("^[a-c][1-3]$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a3"); + ACCEPTS("c"); + ACCEPTS("1"); + ACCEPTS("1a"); + ACCEPTS("3b"); + DENIES("4a"); + } + + fsm = BuildFsm("^.*abc$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("ab"); + ACCEPTS("xxxxbac"); + DENIES("xxxxa"); + DENIES("xxxxcb"); + } + } + + Y_UNIT_TEST(SwapStressTest){ + ystring text; + for (size_t letter = 0; letter < 30; ++letter) { + text += ystring(1, (letter % 26) + 'a'); + } + const ystring regexp = "^" + text + "$"; + auto fsm = BuildFsm(regexp.data()); + auto changedText = text; + + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(text); + + for (size_t pos = 0; pos < text.size() - 1; ++pos) { + changedText[pos] = text[pos + 1]; + changedText[pos + 1] = text[pos]; + ACCEPTS(changedText); + changedText[pos] = text[pos]; + changedText[pos + 1] = text[pos + 1]; + } + } + + APPROXIMATE_SCANNER(fsm, 0) { + ACCEPTS(text); + + for (size_t pos = 0; pos < text.size() - 1; ++pos) { + changedText[pos] = text[pos + 1]; + changedText[pos + 1] = text[pos]; + DENIES(changedText); + changedText[pos] = text[pos]; + changedText[pos + 1] = text[pos + 1]; + } + } + } } diff --git a/library/cpp/regex/pire/ut/capture_ut.cpp b/library/cpp/regex/pire/ut/capture_ut.cpp index 3d339c56019..7303ac6b0e8 100644 --- a/library/cpp/regex/pire/ut/capture_ut.cpp +++ b/library/cpp/regex/pire/ut/capture_ut.cpp @@ -32,268 +32,268 @@ Y_UNIT_TEST_SUITE(TestPireCapture) { - using Pire::CapturingScanner; - using Pire::SlowCapturingScanner; - typedef Pire::CapturingScanner::State State; - - CapturingScanner Compile(const char* regexp, int index) - { - Pire::Lexer lexer; - - lexer.Assign(regexp, regexp + strlen(regexp)); - lexer.AddFeature(Pire::Features::CaseInsensitive()); - lexer.AddFeature(Pire::Features::Capture((size_t) index)); - - Pire::Fsm fsm = lexer.Parse(); - - fsm.Surround(); - fsm.Determine(); - return fsm.Compile<Pire::CapturingScanner>(); - } - - SlowCapturingScanner SlowCompile(const char* regexp, int index, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) - { - Pire::Lexer lexer; - lexer.AddFeature(Pire::Features::Capture(static_cast<size_t>(index))); - lexer.SetEncoding(encoding); - TVector<wchar32> ucs4; - encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4)); - lexer.Assign(ucs4.begin(), ucs4.end()); - Pire::Fsm fsm = lexer.Parse(); - fsm.Surround(); - return fsm.Compile<Pire::SlowCapturingScanner>(); - } - - State RunRegexp(const CapturingScanner& scanner, const char* str) - { - State state; - scanner.Initialize(state); - Step(scanner, state, Pire::BeginMark); - Run(scanner, state, str, str + strlen(str)); - Step(scanner, state, Pire::EndMark); - return state; - } - - SlowCapturingScanner::State RunRegexp(const SlowCapturingScanner& scanner, const char* str) - { - SlowCapturingScanner::State state; - scanner.Initialize(state); - Run(scanner, state, str, str + strlen(str)); - return state; - } - - ystring Captured(const State& state, const char* str) - { - if (state.Captured()) - return ystring(str + state.Begin() - 1, str + state.End() - 1); - else - return ystring(); - } - - Y_UNIT_TEST(Trivial) - { - CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1); - State state; - const char* str; - - str = "google_id = 'abcde';"; - state = RunRegexp(scanner, str); - UNIT_ASSERT(state.Captured()); - UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); - - str = "var google_id = 'abcde'; eval(google_id);"; - state = RunRegexp(scanner, str); - UNIT_ASSERT(state.Captured()); - UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); - - str = "google_id != 'abcde';"; - state = RunRegexp(scanner, str); - UNIT_ASSERT(!state.Captured()); - } - - Y_UNIT_TEST(Sequential) - { - CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1); - State state; - const char* str; - - str = "google_id = 'abcde'; google_id = 'xyz';"; - state = RunRegexp(scanner, str); - UNIT_ASSERT(state.Captured()); - UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("abcde")); - - str = "var google_id = 'abc de'; google_id = 'xyz';"; - state = RunRegexp(scanner, str); - UNIT_ASSERT(state.Captured()); - UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("xyz")); - } - - Y_UNIT_TEST(NegatedTerminator) - { - CapturingScanner scanner = Compile("=(\\d+)[^\\d]", 1); - State state; - const char* str; - - str = "=12345;"; - state = RunRegexp(scanner, str); - UNIT_ASSERT(state.Captured()); - UNIT_ASSERT_EQUAL(Captured(state, str), ystring("12345")); - } - - Y_UNIT_TEST(Serialization) - { - const char* regex = "google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;"; - CapturingScanner scanner2 = Compile(regex, 1); - SlowCapturingScanner slowScanner2 = SlowCompile(regex, 1); - BufferOutput wbuf, wbuf2; - ::Save(&wbuf, scanner2); - ::Save(&wbuf2, slowScanner2); - - MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); - MemoryInput rbuf2(wbuf2.Buffer().Data(), wbuf2.Buffer().Size()); - CapturingScanner scanner; - SlowCapturingScanner slowScanner; - ::Load(&rbuf, scanner); - ::Load(&rbuf2, slowScanner); - - State state; - SlowCapturingScanner::State slowState; - const char* str; - - str = "google_id = 'abcde';"; - state = RunRegexp(scanner, str); - slowState = RunRegexp(slowScanner, str); - UNIT_ASSERT(state.Captured()); - UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); - SlowCapturingScanner::SingleState final; - UNIT_ASSERT(slowScanner.GetCapture(slowState, final)); - ystring ans(str, final.GetBegin(), final.GetEnd() - final.GetBegin()); - UNIT_ASSERT_EQUAL(ans, ystring("abcde")); - - str = "google_id != 'abcde';"; - state = RunRegexp(scanner, str); - slowState = RunRegexp(slowScanner, str); - UNIT_ASSERT(!state.Captured()); - UNIT_ASSERT(!slowScanner.GetCapture(slowState, final)); - - CapturingScanner scanner3; - const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); - TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); - const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); - memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); - const void* tail = scanner3.Mmap(ptr, wbuf.Buffer().Size()); - UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); - - str = "google_id = 'abcde';"; - state = RunRegexp(scanner3, str); - UNIT_ASSERT(state.Captured()); - UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); - - str = "google_id != 'abcde';"; - state = RunRegexp(scanner3, str); - UNIT_ASSERT(!state.Captured()); - - ptr = (const void*) ((const char*) wbuf.Buffer().Data() + 1); - try { - scanner3.Mmap(ptr, wbuf.Buffer().Size()); - UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping"); - } - catch (Pire::Error&) {} - - for (size_t offset = 1; offset < MaxTestOffset; ++offset) { - ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset; - memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); - try { - scanner3.Mmap(ptr, wbuf.Buffer().Size()); - if (offset % sizeof(size_t) != 0) { - UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping"); - } else { - str = "google_id = 'abcde';"; - state = RunRegexp(scanner3, str); - UNIT_ASSERT(state.Captured()); - } - } - catch (Pire::Error&) {} - } - } - - Y_UNIT_TEST(Empty) - { - Pire::CapturingScanner sc; - UNIT_ASSERT(sc.Empty()); - - UNIT_CHECKPOINT(); RunRegexp(sc, "a string"); // Just should not crash - - // Test Save/Load/Mmap - BufferOutput wbuf; - ::Save(&wbuf, sc); - - MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); - Pire::CapturingScanner sc3; - ::Load(&rbuf, sc3); - UNIT_CHECKPOINT(); RunRegexp(sc3, "a string"); - - const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); - TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); - const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); - memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); - - Pire::CapturingScanner sc4; - const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size()); - UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); - UNIT_CHECKPOINT(); RunRegexp(sc4, "a string"); - } - - void MakeSlowCapturingTest(const char* regexp, const char* text, size_t position, bool ans, const ystring& captured = ystring(""), const Pire::Encoding& encoding = Pire::Encodings::Utf8()) - { - Pire::SlowCapturingScanner sc = SlowCompile(regexp, position, encoding); - SlowCapturingScanner::State st = RunRegexp(sc, text); - SlowCapturingScanner::SingleState fin; - bool ifCaptured = sc.GetCapture(st, fin); - if (ans) { - UNIT_ASSERT(ifCaptured); - ystring answer(text, fin.GetBegin(), fin.GetEnd() - fin.GetBegin()); - UNIT_ASSERT_EQUAL(answer, captured); - } else { - UNIT_ASSERT(!ifCaptured); - } - } - - Y_UNIT_TEST(SlowCapturingNonGreedy) - { - const char* regexp = ".*?(pref.*suff)"; - const char* text = "pref ala bla pref cla suff dla"; - MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref ala bla pref cla suff")); - } - - Y_UNIT_TEST(SlowCaptureGreedy) - { - const char* regexp = ".*(pref.*suff)"; - const char* text = "pref ala bla pref cla suff dla"; - MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref cla suff")); - } - - Y_UNIT_TEST(SlowCaptureInOr) - { - const char* regexp = "(A)|A"; - const char* text = "A"; - MakeSlowCapturingTest(regexp, text, 1, true, ystring("A")); - const char* regexp2 = "A|(A)"; - MakeSlowCapturingTest(regexp2, text, 1, false); - } - - Y_UNIT_TEST(SlowCapturing) - { - const char* regexp = "^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)"; - const char* text = "http://vkontakte.ru/id100500"; - MakeSlowCapturingTest(regexp, text, 2, true, ystring("100500")); - } - - Y_UNIT_TEST(Utf_8) - { - const char* regexp = "\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, ((\\s|\\w|[()]|-)+)!"; - const char* text =" \xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, \xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)! "; - const char* ans = "\xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)"; - MakeSlowCapturingTest(regexp, text, 1, true, ystring(ans)); - } + using Pire::CapturingScanner; + using Pire::SlowCapturingScanner; + typedef Pire::CapturingScanner::State State; + + CapturingScanner Compile(const char* regexp, int index) + { + Pire::Lexer lexer; + + lexer.Assign(regexp, regexp + strlen(regexp)); + lexer.AddFeature(Pire::Features::CaseInsensitive()); + lexer.AddFeature(Pire::Features::Capture((size_t) index)); + + Pire::Fsm fsm = lexer.Parse(); + + fsm.Surround(); + fsm.Determine(); + return fsm.Compile<Pire::CapturingScanner>(); + } + + SlowCapturingScanner SlowCompile(const char* regexp, int index, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) + { + Pire::Lexer lexer; + lexer.AddFeature(Pire::Features::Capture(static_cast<size_t>(index))); + lexer.SetEncoding(encoding); + TVector<wchar32> ucs4; + encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4)); + lexer.Assign(ucs4.begin(), ucs4.end()); + Pire::Fsm fsm = lexer.Parse(); + fsm.Surround(); + return fsm.Compile<Pire::SlowCapturingScanner>(); + } + + State RunRegexp(const CapturingScanner& scanner, const char* str) + { + State state; + scanner.Initialize(state); + Step(scanner, state, Pire::BeginMark); + Run(scanner, state, str, str + strlen(str)); + Step(scanner, state, Pire::EndMark); + return state; + } + + SlowCapturingScanner::State RunRegexp(const SlowCapturingScanner& scanner, const char* str) + { + SlowCapturingScanner::State state; + scanner.Initialize(state); + Run(scanner, state, str, str + strlen(str)); + return state; + } + + ystring Captured(const State& state, const char* str) + { + if (state.Captured()) + return ystring(str + state.Begin() - 1, str + state.End() - 1); + else + return ystring(); + } + + Y_UNIT_TEST(Trivial) + { + CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1); + State state; + const char* str; + + str = "google_id = 'abcde';"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); + + str = "var google_id = 'abcde'; eval(google_id);"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); + + str = "google_id != 'abcde';"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(!state.Captured()); + } + + Y_UNIT_TEST(Sequential) + { + CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1); + State state; + const char* str; + + str = "google_id = 'abcde'; google_id = 'xyz';"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("abcde")); + + str = "var google_id = 'abc de'; google_id = 'xyz';"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("xyz")); + } + + Y_UNIT_TEST(NegatedTerminator) + { + CapturingScanner scanner = Compile("=(\\d+)[^\\d]", 1); + State state; + const char* str; + + str = "=12345;"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_EQUAL(Captured(state, str), ystring("12345")); + } + + Y_UNIT_TEST(Serialization) + { + const char* regex = "google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;"; + CapturingScanner scanner2 = Compile(regex, 1); + SlowCapturingScanner slowScanner2 = SlowCompile(regex, 1); + BufferOutput wbuf, wbuf2; + ::Save(&wbuf, scanner2); + ::Save(&wbuf2, slowScanner2); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + MemoryInput rbuf2(wbuf2.Buffer().Data(), wbuf2.Buffer().Size()); + CapturingScanner scanner; + SlowCapturingScanner slowScanner; + ::Load(&rbuf, scanner); + ::Load(&rbuf2, slowScanner); + + State state; + SlowCapturingScanner::State slowState; + const char* str; + + str = "google_id = 'abcde';"; + state = RunRegexp(scanner, str); + slowState = RunRegexp(slowScanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); + SlowCapturingScanner::SingleState final; + UNIT_ASSERT(slowScanner.GetCapture(slowState, final)); + ystring ans(str, final.GetBegin(), final.GetEnd() - final.GetBegin()); + UNIT_ASSERT_EQUAL(ans, ystring("abcde")); + + str = "google_id != 'abcde';"; + state = RunRegexp(scanner, str); + slowState = RunRegexp(slowScanner, str); + UNIT_ASSERT(!state.Captured()); + UNIT_ASSERT(!slowScanner.GetCapture(slowState, final)); + + CapturingScanner scanner3; + const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); + TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); + const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + const void* tail = scanner3.Mmap(ptr, wbuf.Buffer().Size()); + UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); + + str = "google_id = 'abcde';"; + state = RunRegexp(scanner3, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); + + str = "google_id != 'abcde';"; + state = RunRegexp(scanner3, str); + UNIT_ASSERT(!state.Captured()); + + ptr = (const void*) ((const char*) wbuf.Buffer().Data() + 1); + try { + scanner3.Mmap(ptr, wbuf.Buffer().Size()); + UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping"); + } + catch (Pire::Error&) {} + + for (size_t offset = 1; offset < MaxTestOffset; ++offset) { + ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset; + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + try { + scanner3.Mmap(ptr, wbuf.Buffer().Size()); + if (offset % sizeof(size_t) != 0) { + UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping"); + } else { + str = "google_id = 'abcde';"; + state = RunRegexp(scanner3, str); + UNIT_ASSERT(state.Captured()); + } + } + catch (Pire::Error&) {} + } + } + + Y_UNIT_TEST(Empty) + { + Pire::CapturingScanner sc; + UNIT_ASSERT(sc.Empty()); + + UNIT_CHECKPOINT(); RunRegexp(sc, "a string"); // Just should not crash + + // Test Save/Load/Mmap + BufferOutput wbuf; + ::Save(&wbuf, sc); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + Pire::CapturingScanner sc3; + ::Load(&rbuf, sc3); + UNIT_CHECKPOINT(); RunRegexp(sc3, "a string"); + + const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); + TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); + const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + + Pire::CapturingScanner sc4; + const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size()); + UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); + UNIT_CHECKPOINT(); RunRegexp(sc4, "a string"); + } + + void MakeSlowCapturingTest(const char* regexp, const char* text, size_t position, bool ans, const ystring& captured = ystring(""), const Pire::Encoding& encoding = Pire::Encodings::Utf8()) + { + Pire::SlowCapturingScanner sc = SlowCompile(regexp, position, encoding); + SlowCapturingScanner::State st = RunRegexp(sc, text); + SlowCapturingScanner::SingleState fin; + bool ifCaptured = sc.GetCapture(st, fin); + if (ans) { + UNIT_ASSERT(ifCaptured); + ystring answer(text, fin.GetBegin(), fin.GetEnd() - fin.GetBegin()); + UNIT_ASSERT_EQUAL(answer, captured); + } else { + UNIT_ASSERT(!ifCaptured); + } + } + + Y_UNIT_TEST(SlowCapturingNonGreedy) + { + const char* regexp = ".*?(pref.*suff)"; + const char* text = "pref ala bla pref cla suff dla"; + MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref ala bla pref cla suff")); + } + + Y_UNIT_TEST(SlowCaptureGreedy) + { + const char* regexp = ".*(pref.*suff)"; + const char* text = "pref ala bla pref cla suff dla"; + MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref cla suff")); + } + + Y_UNIT_TEST(SlowCaptureInOr) + { + const char* regexp = "(A)|A"; + const char* text = "A"; + MakeSlowCapturingTest(regexp, text, 1, true, ystring("A")); + const char* regexp2 = "A|(A)"; + MakeSlowCapturingTest(regexp2, text, 1, false); + } + + Y_UNIT_TEST(SlowCapturing) + { + const char* regexp = "^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)"; + const char* text = "http://vkontakte.ru/id100500"; + MakeSlowCapturingTest(regexp, text, 2, true, ystring("100500")); + } + + Y_UNIT_TEST(Utf_8) + { + const char* regexp = "\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, ((\\s|\\w|[()]|-)+)!"; + const char* text =" \xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, \xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)! "; + const char* ans = "\xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)"; + MakeSlowCapturingTest(regexp, text, 1, true, ystring(ans)); + } } diff --git a/library/cpp/regex/pire/ut/common.h b/library/cpp/regex/pire/ut/common.h index d79eedafb73..e88a2affc8d 100644 --- a/library/cpp/regex/pire/ut/common.h +++ b/library/cpp/regex/pire/ut/common.h @@ -2,7 +2,7 @@ * common.h -- * * Copyright (c) 2007-2010, Dmitry Prokoptsev <[email protected]>, - * Alexander Gololobov <[email protected]> + * Alexander Gololobov <[email protected]> * * This file is part of Pire, the Perl Incompatible * Regular Expressions library. @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -39,83 +39,83 @@ using namespace Pire; inline Pire::Fsm ParseRegexp(const char* str, const char* options = "", const Pire::Encoding** enc = 0) { - Pire::Lexer lexer; - TVector<wchar32> ucs4; - - bool surround = true; - for (; *options; ++options) { - if (*options == 'i') - lexer.AddFeature(Pire::Features::CaseInsensitive()); - else if (*options == 'u') - lexer.SetEncoding(Pire::Encodings::Utf8()); - else if (*options == 'n') - surround = false; - else if (*options == 'a') - lexer.AddFeature(Pire::Features::AndNotSupport()); - else - throw std::invalid_argument("Unknown option: " + ystring(1, *options)); - } - - if (enc) - *enc = &lexer.Encoding(); - - lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4)); - lexer.Assign(ucs4.begin(), ucs4.end()); - - Pire::Fsm fsm = lexer.Parse(); - if (surround) - fsm.Surround(); - return fsm; + Pire::Lexer lexer; + TVector<wchar32> ucs4; + + bool surround = true; + for (; *options; ++options) { + if (*options == 'i') + lexer.AddFeature(Pire::Features::CaseInsensitive()); + else if (*options == 'u') + lexer.SetEncoding(Pire::Encodings::Utf8()); + else if (*options == 'n') + surround = false; + else if (*options == 'a') + lexer.AddFeature(Pire::Features::AndNotSupport()); + else + throw std::invalid_argument("Unknown option: " + ystring(1, *options)); + } + + if (enc) + *enc = &lexer.Encoding(); + + lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4)); + lexer.Assign(ucs4.begin(), ucs4.end()); + + Pire::Fsm fsm = lexer.Parse(); + if (surround) + fsm.Surround(); + return fsm; } inline bool HasError(const char* regexp) { - try { - ParseRegexp(regexp); - return false; - } catch (Pire::Error& ex) { - return true; - } + try { + ParseRegexp(regexp); + return false; + } catch (Pire::Error& ex) { + return true; + } } struct Scanners { - Pire::Scanner fast; - Pire::NonrelocScanner nonreloc; - Pire::SimpleScanner simple; - Pire::SlowScanner slow; - Pire::ScannerNoMask fastNoMask; - Pire::NonrelocScannerNoMask nonrelocNoMask; - Pire::HalfFinalScanner halfFinal; - Pire::HalfFinalScannerNoMask halfFinalNoMask; - Pire::NonrelocHalfFinalScanner nonrelocHalfFinal; - Pire::NonrelocHalfFinalScannerNoMask nonrelocHalfFinalNoMask; - - Scanners(const Pire::Fsm& fsm, size_t distance = 0) - : fast(Pire::Fsm(fsm).Compile<Pire::Scanner>(distance)) - , nonreloc(Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>(distance)) - , simple(Pire::Fsm(fsm).Compile<Pire::SimpleScanner>(distance)) - , slow(Pire::Fsm(fsm).Compile<Pire::SlowScanner>(distance)) - , fastNoMask(Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>(distance)) - , nonrelocNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>(distance)) - , halfFinal(Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>(distance)) - , halfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>(distance)) - , nonrelocHalfFinal(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>(distance)) - , nonrelocHalfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>(distance)) - {} - - Scanners(const char* str, const char* options = "") - { - Pire::Fsm fsm = ParseRegexp(str, options); - fast = Pire::Fsm(fsm).Compile<Pire::Scanner>(); - nonreloc = Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>(); - simple = Pire::Fsm(fsm).Compile<Pire::SimpleScanner>(); - slow = Pire::Fsm(fsm).Compile<Pire::SlowScanner>(); - fastNoMask = Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>(); - nonrelocNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>(); - halfFinal = Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>(); - halfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>(); - nonrelocHalfFinal = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>(); - nonrelocHalfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>(); - } + Pire::Scanner fast; + Pire::NonrelocScanner nonreloc; + Pire::SimpleScanner simple; + Pire::SlowScanner slow; + Pire::ScannerNoMask fastNoMask; + Pire::NonrelocScannerNoMask nonrelocNoMask; + Pire::HalfFinalScanner halfFinal; + Pire::HalfFinalScannerNoMask halfFinalNoMask; + Pire::NonrelocHalfFinalScanner nonrelocHalfFinal; + Pire::NonrelocHalfFinalScannerNoMask nonrelocHalfFinalNoMask; + + Scanners(const Pire::Fsm& fsm, size_t distance = 0) + : fast(Pire::Fsm(fsm).Compile<Pire::Scanner>(distance)) + , nonreloc(Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>(distance)) + , simple(Pire::Fsm(fsm).Compile<Pire::SimpleScanner>(distance)) + , slow(Pire::Fsm(fsm).Compile<Pire::SlowScanner>(distance)) + , fastNoMask(Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>(distance)) + , nonrelocNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>(distance)) + , halfFinal(Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>(distance)) + , halfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>(distance)) + , nonrelocHalfFinal(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>(distance)) + , nonrelocHalfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>(distance)) + {} + + Scanners(const char* str, const char* options = "") + { + Pire::Fsm fsm = ParseRegexp(str, options); + fast = Pire::Fsm(fsm).Compile<Pire::Scanner>(); + nonreloc = Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>(); + simple = Pire::Fsm(fsm).Compile<Pire::SimpleScanner>(); + slow = Pire::Fsm(fsm).Compile<Pire::SlowScanner>(); + fastNoMask = Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>(); + nonrelocNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>(); + halfFinal = Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>(); + halfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>(); + nonrelocHalfFinal = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>(); + nonrelocHalfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>(); + } }; #ifdef PIRE_DEBUG @@ -123,33 +123,33 @@ struct Scanners { template <class Scanner> inline ystring DbgState(const Scanner& scanner, typename Scanner::State state) { - return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring()); + return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring()); } /* inline ystring DbgState(const Pire::SimpleScanner& scanner, Pire::SimpleScanner::State state) { - return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring()); + return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring()); } */ inline ystring DbgState(const Pire::SlowScanner& scanner, const Pire::SlowScanner::State& state) { - return ystring("(") + Join(state.states.begin(), state.states.end(), ", ") + ystring(")") + (scanner.Final(state) ? ystring(" [final]") : ystring()); + return ystring("(") + Join(state.states.begin(), state.states.end(), ", ") + ystring(")") + (scanner.Final(state) ? ystring(" [final]") : ystring()); } template<class Scanner> void DbgRun(const Scanner& scanner, typename Scanner::State& state, const char* begin, const char* end) { - for (; begin != end; ++begin) { - char tmp[8]; - if (*begin >= 32) { - tmp[0] = *begin; - tmp[1] = 0; - } else - snprintf(tmp, sizeof(tmp)-1, "\\%03o", (unsigned char) *begin); - std::clog << DbgState(scanner, state) << " --[" << tmp << "]--> "; - scanner.Next(state, (unsigned char) *begin); - std::clog << DbgState(scanner, state) << "\n"; - } + for (; begin != end; ++begin) { + char tmp[8]; + if (*begin >= 32) { + tmp[0] = *begin; + tmp[1] = 0; + } else + snprintf(tmp, sizeof(tmp)-1, "\\%03o", (unsigned char) *begin); + std::clog << DbgState(scanner, state) << " --[" << tmp << "]--> "; + scanner.Next(state, (unsigned char) *begin); + std::clog << DbgState(scanner, state) << "\n"; + } } #define Run DbgRun @@ -158,34 +158,34 @@ void DbgRun(const Scanner& scanner, typename Scanner::State& state, const char* template<class Scanner> typename Scanner::State RunRegexp(const Scanner& scanner, const ystring& str) { - PIRE_IFDEBUG(std::clog << "--- checking against " << str << "\n"); - - typename Scanner::State state; - scanner.Initialize(state); - Step(scanner, state, BeginMark); - Run(scanner, state, str.c_str(), str.c_str() + str.length()); - Step(scanner, state, EndMark); - return state; + PIRE_IFDEBUG(std::clog << "--- checking against " << str << "\n"); + + typename Scanner::State state; + scanner.Initialize(state); + Step(scanner, state, BeginMark); + Run(scanner, state, str.c_str(), str.c_str() + str.length()); + Step(scanner, state, EndMark); + return state; } template<class Scanner> typename Scanner::State RunRegexp(const Scanner& scanner, const char* str) { - return RunRegexp(scanner, ystring(str)); + return RunRegexp(scanner, ystring(str)); } template<class Scanner> bool Matches(const Scanner& scanner, const ystring& str) { - auto state = RunRegexp(scanner, str); - auto result = scanner.AcceptedRegexps(state); - return result.first != result.second; + auto state = RunRegexp(scanner, str); + auto result = scanner.AcceptedRegexps(state); + return result.first != result.second; } template<class Scanner> bool Matches(const Scanner& scanner, const char* str) { - return Matches(scanner, ystring(str)); + return Matches(scanner, ystring(str)); } #define SCANNER(fsm) for (Scanners m_scanners(fsm), *m_flag = &m_scanners; m_flag; m_flag = 0) @@ -193,32 +193,32 @@ bool Matches(const Scanner& scanner, const char* str) #define REGEXP(pattern) for (Scanners m_scanners(pattern), *m_flag = &m_scanners; m_flag; m_flag = 0) #define REGEXP2(pattern,flags) for (Scanners m_scanners(pattern, flags), *m_flag = &m_scanners; m_flag; m_flag = 0) #define ACCEPTS(str) \ - do {\ - UNIT_ASSERT(Matches(m_scanners.fast, str));\ + do {\ + UNIT_ASSERT(Matches(m_scanners.fast, str));\ UNIT_ASSERT(Matches(m_scanners.nonreloc, str));\ - UNIT_ASSERT(Matches(m_scanners.simple, str));\ - UNIT_ASSERT(Matches(m_scanners.slow, str));\ - UNIT_ASSERT(Matches(m_scanners.fastNoMask, str));\ - UNIT_ASSERT(Matches(m_scanners.nonrelocNoMask, str));\ - UNIT_ASSERT(Matches(m_scanners.halfFinal, str));\ - UNIT_ASSERT(Matches(m_scanners.halfFinalNoMask, str));\ - UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinal, str));\ - UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinalNoMask, str));\ - } while (false) + UNIT_ASSERT(Matches(m_scanners.simple, str));\ + UNIT_ASSERT(Matches(m_scanners.slow, str));\ + UNIT_ASSERT(Matches(m_scanners.fastNoMask, str));\ + UNIT_ASSERT(Matches(m_scanners.nonrelocNoMask, str));\ + UNIT_ASSERT(Matches(m_scanners.halfFinal, str));\ + UNIT_ASSERT(Matches(m_scanners.halfFinalNoMask, str));\ + UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinal, str));\ + UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinalNoMask, str));\ + } while (false) #define DENIES(str) \ - do {\ - UNIT_ASSERT(!Matches(m_scanners.fast, str));\ + do {\ + UNIT_ASSERT(!Matches(m_scanners.fast, str));\ UNIT_ASSERT(!Matches(m_scanners.nonreloc, str));\ - UNIT_ASSERT(!Matches(m_scanners.simple, str));\ - UNIT_ASSERT(!Matches(m_scanners.slow, str));\ - UNIT_ASSERT(!Matches(m_scanners.fastNoMask, str));\ - UNIT_ASSERT(!Matches(m_scanners.nonrelocNoMask, str));\ - UNIT_ASSERT(!Matches(m_scanners.halfFinal, str));\ - UNIT_ASSERT(!Matches(m_scanners.halfFinalNoMask, str));\ - UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinal, str));\ - UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinalNoMask, str));\ - } while (false) + UNIT_ASSERT(!Matches(m_scanners.simple, str));\ + UNIT_ASSERT(!Matches(m_scanners.slow, str));\ + UNIT_ASSERT(!Matches(m_scanners.fastNoMask, str));\ + UNIT_ASSERT(!Matches(m_scanners.nonrelocNoMask, str));\ + UNIT_ASSERT(!Matches(m_scanners.halfFinal, str));\ + UNIT_ASSERT(!Matches(m_scanners.halfFinalNoMask, str));\ + UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinal, str));\ + UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinalNoMask, str));\ + } while (false) #endif diff --git a/library/cpp/regex/pire/ut/count_ut.cpp b/library/cpp/regex/pire/ut/count_ut.cpp index ffe7943fcc6..0db72a4ad56 100644 --- a/library/cpp/regex/pire/ut/count_ut.cpp +++ b/library/cpp/regex/pire/ut/count_ut.cpp @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -33,551 +33,551 @@ Y_UNIT_TEST_SUITE(TestCount) { - Pire::Fsm MkFsm(const char* regexp, const Pire::Encoding& encoding) - { - Pire::Lexer lex; - lex.SetEncoding(encoding); - TVector<wchar32> ucs4; - encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4)); - lex.Assign(ucs4.begin(), ucs4.end()); - return lex.Parse(); - } - - template<class Scanner> - typename Scanner::State InitializedState(const Scanner& scanner) - { - typename Scanner::State state; - scanner.Initialize(state); - return state; - } - - template<class Scanner> - typename Scanner::State Run(const Scanner& scanner, const char* text, size_t len =-1) - { - if (len == (size_t)-1) len = strlen(text); - auto state = InitializedState(scanner); - Pire::Step(scanner, state, Pire::BeginMark); - Pire::Run(scanner, state, text, text + len); - Pire::Step(scanner, state, Pire::EndMark); - return state; - } - - template<class Scanner> - size_t CountOne(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) - { - const auto regexpFsm = MkFsm(regexp, encoding); - const auto separatorFsm = MkFsm(separator, encoding); - return Run(Scanner{regexpFsm, separatorFsm}, text, len).Result(0); - } - - size_t Count(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) - { - const auto regexpFsm = MkFsm(regexp, encoding); - const auto separatorFsm = MkFsm(separator, encoding); - auto countingResult = Run(Pire::CountingScanner{regexpFsm, separatorFsm}, text, len).Result(0); - auto newResult = Run(Pire::AdvancedCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0); - if (strcmp(separator, ".*") == 0) { - HalfFinalFsm fsm(regexpFsm); - fsm.MakeGreedyCounter(true); - auto halfFinalSimpleResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0); - fsm = HalfFinalFsm(regexpFsm); - fsm.MakeGreedyCounter(false); - auto halfFinalCorrectResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0); - UNIT_ASSERT_EQUAL(halfFinalSimpleResult, halfFinalCorrectResult); - UNIT_ASSERT_EQUAL(halfFinalSimpleResult, countingResult); - } - UNIT_ASSERT_EQUAL(countingResult, newResult); - auto noGlueLimitResult = Run(Pire::NoGlueLimitCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0); - UNIT_ASSERT_EQUAL(countingResult, noGlueLimitResult); - return newResult; - } - - Y_UNIT_TEST(Count) - { - UNIT_ASSERT_EQUAL(Count("[a-z]+", "\\s", "abc def, abc def ghi, abc"), size_t(3)); - char aaa[] = "abc def\0 abc\0 def ghi, abc"; - UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa), Pire::Encodings::Latin1()), size_t(6)); - UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa)), size_t(6)); - UNIT_ASSERT_EQUAL(Count("\\w", "", "abc abcdef abcd abcdefgh ac"), size_t(8)); - UNIT_ASSERT_EQUAL(Count("http", ".*", "http://aaa, http://bbb, something in the middle, http://ccc, end"), size_t(3)); - UNIT_ASSERT_EQUAL(Count("abc", ".*", "abcabcabcabc"), size_t(4)); - UNIT_ASSERT_EQUAL(Count("[\320\220-\320\257\320\260-\321\217]+", "\\s+", " \320\257\320\275\320\264\320\265\320\272\321\201 " - "\320\237\320\276\320\262\320\265\321\200\320\275\321\203\321\202\321\214 \320\222\320\276\320\271\321\202\320\270\302\240" - "\320\262\302\240\320\277\320\276\321\207\321\202\321\203 \302\251\302\240" "1997\342\200\224" "2008 " - "\302\253\320\257\320\275\320\264\320\265\320\272\321\201\302\273 \320\224\320\270\320\267\320\260\320\271\320\275\302" - "\240\342\200\224\302\240\320\241\321\202\321\203\320\264\320\270\321\217 \320\220\321\200\321\202\320\265\320\274\320\270" - "\321\217\302\240\320\233\320\265\320\261\320\265\320\264\320\265\320\262\320\260\012\012"), size_t(5)); - UNIT_ASSERT_EQUAL(Count("\321\201\320\265\320\272\321\201", ".*", - "\320\277\320\276\321\200\320\275\320\276, \320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 " - "\320\264\320\265\321\202\320\270, \320\264\320\265\321\202\320\270 \320\277\320\276\321\200\320\275\320\276 " - "\320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265. " - "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265 \320\262\320\270\320\264\320\265\320\276 " - "\320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270. \320\264\320\265\321\202\320\270 " - "\320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265\320\276 " - "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265!<br> " - "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\264\320\273\321\217 \320\277\320\276\320\264 " - "\321\201\320\265\320\272\321\201\320\260 \320\277\320\260\321\200\320\276\320\271 " - "\321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201 \320\270\321\211\320\265\320\274 " - "\320\272\320\260\320\271\321\204\320\276\320\274. \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 " - "\320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 " - "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277\320\260\321\200\320\276\320\271 " - "\320\270\321\211\320\265\320\274 \321\201 \320\264\320\273\321\217 \321\201\320\265\320\272\321\201\320\260!<br> " - "\321\202\320\270\321\202\321\214\320\272\320\270 \320\261\320\276\320\273\321\214\321\210\320\270\320\265. " - "\320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 \320\264\320\265\321\202\320\270!<br> " - "\320\270\321\211\320\265\320\274 \321\201 \320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 " - "\321\201\320\265\320\272\321\201\320\260\320\277\320\260\321\200\320\276\320\271 \320\264\320\273\321\217 " - "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271! " - "\320\261\320\276\320\273\321\214\321\210\320\270\320\265 \321\202\320\270\321\202\321\214\320\272\320\270, " - "\320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 \321\201\320\270\321\201\321\202\320\265\320\274\320\260 " - "\320\264\320\273\321\217 \320\276\320\277\320\276\321\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202" - "\320\265\320\273\321\214\320\275\320\260\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\264" - "\320\273\321\217 \320\270\321\211\320\265\320\274 \321\201\320\265\320\272\321\201\320\260 \320\272\320\260\320\271\321\204" - "\320\276\320\274 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275" - "\320\276\320\271 \320\277\320\276\320\264 \320\277\320\260\321\200\320\276\320\271 \321\201. \320\276\320\277\320\276\321" - "\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202\320\265\320\273\321\214\320\275\320\260\321\217 \321" - "\201\320\270\321\201\321\202\320\265\320\274\320\260 \320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 " - "\320\264\320\273\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\261\320\265\321\201\320\277" - "\320\273\320\260\321\202\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265" - "\320\276 \320\264\320\265\321\202\320\270. \320\276\321\204\320\270\321\206\320\265\321\200\321\213 \320\277\320\276\321" - "\200\320\275\320\276 \321\204\320\276\321\202\320\276 \320\263\320\265\320\270, \320\270\321\211\320\265\320\274 \321\201" - "\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277" - "\320\276 \320\277\320\260\321\200\320\276\320\271 \321\201\320\265\320\272\321\201\320\260 \320\264\320\273\321\217 \321\201 " - "\320\272\320\260\320\271\321\204\320\276\320\274. \320\277\320\276\320\264 \320\264\320\273\321\217 \320\272\320\260\320\271" - "\321\204\320\276\320\274 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201\320\265\320\272\321\201" - "\320\260 \320\277\320\260\321\200\320\276\320\271 \321\201 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\270" - "\321\211\320\265\320\274? \320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202" - "\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270, \320\264\320\265\321\202" - "\320\270 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265"), - size_t(6)); - UNIT_ASSERT_EQUAL(Count("<a[^>]*>[^<]*</a>", "([^<]|<br\\s?/?>)*", "\321\200\320\275\320\276</a><br />" - "<a href=\"http://wapspzk.1sweethost.com//22.html\">\320\264\320\265\321\210\320\265\320\262\321\213\320\265 \320\277\320\276" - "\321\200\320\275\320\276 \321\204\320\270\320\273\321\214\320\274\321\213</a><br /><a href=\"http://wapspzk.1sweethost.com//23.html\">" - "\321\201\320\265\320\272\321\201 \321\210\320\276\320\277 \321\200\320\276\321\201\320\270\321\202\320\260</a><br />" - "<a href=\"http://wapspzk.1sweethost.com//24.html\">\320\263\320\276\320\273\321\213\320\265 \320\264\320\265\320\262\321\203" - "\321\210\320\272\320\270 \321\203\320\273\320\270\321\206\320\260</a><br /><a href=\"http://wapspzk.1sweethost.com//25.html\">" - "\321\202\321\200\320\260\321\205\320\275\321\203\321\202\321\214 \320\274\320\260\320\274\320\260\321\210\320\270</a><br />" - "<a href=\"http://wapspzk.1sweethost.com//26.html\">\320\277\320\270\320\267\320\264\320\260 \321\204\321\200\320\270\321\201" - "\320\272\320\265</a><br /><a href=\"http://wapspzk.1sweethost.com//27.html\">\320\261\320\265\321\201\320\277\320\273\320\260" - "\321\202\320\275\320\276</a><br /><a href=\"http://wapspzk.1sweethost.com//33.html\">\321\201\320\276\321\206\320\270\320\276" - "\320\273\320\276\320\263\320\270\321\207\320\265\321\201\320\272\320\270\320\271 \320\260\320\275\320\260\320\273\320\270\320" - "\267 \320\274\320\276\320\264\320\265\320\273\320\265\320\271 \321\201\320\265\320\272\321\201\321\203\320\260\320\273\321\214" - "\320\275\320\276\320\263\320\276 \320\277\320\276\320\262\320\265\320\264\320\265\320\275\320\270\321\217</a>\321\217"), size_t(7)); - UNIT_ASSERT(CountOne<Pire::CountingScanner>("a", "b", "aaa") != size_t(3)); - UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("a", "b", "aaa"), size_t(1)); - UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("[a-z\320\260-\321\217]+", " +", - " \320\260\320\260\320\220 abc def \320\260 cd"), - size_t(4)); // Pire::CountingScanner returns 1 here, since it enters a dead state - } - - Y_UNIT_TEST(CountWithoutSeparator) - { - UNIT_ASSERT_EQUAL(Count("a", "", "aa aaa"), size_t(3)); - } - - Y_UNIT_TEST(CountGreedy) - { - const auto& enc = Pire::Encodings::Latin1(); - char text[] = "wwwsswwwsssswwws"; - UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3)); - UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3)); - UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3)); - UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3)); - } - - Y_UNIT_TEST(CountRepeating) - { - char text[] = "abbabbabbabbat"; - UNIT_ASSERT_EQUAL(Count("abba", ".*", text, sizeof(text), Pire::Encodings::Latin1()), size_t(2)); - } - - template<class Scanner> - void CountGlueOne() - { - const auto& enc = Pire::Encodings::Utf8(); - auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); - auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); - auto sc = Scanner::Glue(sc1, sc2); - auto st = Run(sc, "abc defg 123 jklmn 4567 opqrst"); - UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); - UNIT_ASSERT_EQUAL(st.Result(1), size_t(2)); - } - - Y_UNIT_TEST(CountGlue) - { - CountGlueOne<Pire::CountingScanner>(); - CountGlueOne<Pire::AdvancedCountingScanner>(); - CountGlueOne<Pire::NoGlueLimitCountingScanner>(); - } - - template <class Scanner> - void CountManyGluesOne(size_t maxRegexps) { - const auto& encoding = Pire::Encodings::Utf8(); - auto text = "abcdbaa aa"; - TVector<ypair<std::string, std::string>> tasks = { - {"a", ".*"}, - {"b", ".*"}, - {"c", ".*"}, - {"ba", ".*"}, - {"ab",".*"}, - }; - TVector<size_t> answers = {5, 2, 1, 1, 1}; - Scanner scanner; - size_t regexpsCount = 0; - for (; regexpsCount < maxRegexps; ++regexpsCount) { - const auto& task = tasks[regexpsCount % tasks.size()]; - const auto regexpFsm = MkFsm(task.first.c_str(), encoding); - const auto separatorFsm = MkFsm(task.second.c_str(), encoding); - Scanner nextScanner(regexpFsm, separatorFsm); - auto glue = Scanner::Glue(scanner, nextScanner); - if (glue.Empty()) { - break; - } - scanner = std::move(glue); - } - auto state = Run(scanner, text); - for (size_t i = 0; i < regexpsCount; ++i) { - UNIT_ASSERT_EQUAL(state.Result(i), answers[i % answers.size()]); - } - } - - Y_UNIT_TEST(CountManyGlues) - { - CountManyGluesOne<Pire::CountingScanner>(20); - CountManyGluesOne<Pire::AdvancedCountingScanner>(20); - CountManyGluesOne<Pire::NoGlueLimitCountingScanner>(50); - } - - template<class Scanner> - void CountBoundariesOne() - { - const char* strings[] = { "abcdef", "abc def", "defcba", "wxyz abc", "a", "123" }; - - const auto& enc = Pire::Encodings::Utf8(); - Scanner sc(MkFsm("^[a-z]+$", enc), MkFsm("(.|^|$)*", enc)); - auto st = InitializedState(sc); - for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) { - Pire::Step(sc, st, Pire::BeginMark); - Pire::Run(sc, st, strings[i], strings[i] + strlen(strings[i])); - Pire::Step(sc, st, Pire::EndMark); - } - UNIT_ASSERT_EQUAL(st.Result(0), size_t(3)); - - const auto& enc2 = Pire::Encodings::Latin1(); - Scanner sc2(MkFsm("[a-z]", enc2), MkFsm(".*", enc2)); - auto st2 = InitializedState(sc2); - for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) { - Pire::Step(sc2, st2, Pire::BeginMark); - Pire::Run(sc2, st2, strings[i], strings[i] + strlen(strings[i])); - Pire::Step(sc2, st2, Pire::EndMark); - } - UNIT_ASSERT_EQUAL(st2.Result(0), size_t(7)); - } - - Y_UNIT_TEST(CountBoundaries) - { - CountBoundariesOne<Pire::CountingScanner>(); - CountBoundariesOne<Pire::AdvancedCountingScanner>(); - CountBoundariesOne<Pire::NoGlueLimitCountingScanner>(); - } - - template<class Scanner> - void SerializationOne() - { - const auto& enc = Pire::Encodings::Latin1(); - auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); - auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); - auto sc = Scanner::Glue(sc1, sc2); - - BufferOutput wbuf; - ::Save(&wbuf, sc); - - MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); - Scanner sc3; - ::Load(&rbuf, sc3); - - auto st = Run(sc3, "abc defg 123 jklmn 4567 opqrst"); - UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); - UNIT_ASSERT_EQUAL(st.Result(1), size_t(2)); - - const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); - TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); - - // Test mmap-ing at various alignments - for (size_t offset = 0; offset < MaxTestOffset; ++offset) { - const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset; - memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); - try { - Scanner sc4; - const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size()); - - if (offset % sizeof(size_t) != 0) { - UNIT_ASSERT(!"CountingScanner failed to check for misaligned mmaping"); - } else { - UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); - - st = Run(sc4, "abc defg 123 jklmn 4567 opqrst"); - UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); - UNIT_ASSERT_EQUAL(st.Result(1), size_t(2)); - } - } - catch (Pire::Error&) {} - } - } - - Y_UNIT_TEST(Serialization) - { - SerializationOne<Pire::CountingScanner>(); - SerializationOne<Pire::AdvancedCountingScanner>(); - SerializationOne<Pire::NoGlueLimitCountingScanner>(); - } - - template<class Scanner> - void Serialization_v6_compatibilityOne() - { - const auto& enc = Pire::Encodings::Latin1(); - auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); - auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); - auto sc = Scanner::Glue(sc1, sc2); - - BufferOutput wbuf; - ::Save(&wbuf, sc); - - // Patched scanner is a scanner of RE_VERSION 6. - // The patched scanner is concatenated with original scanner to - // make sure all content of patched scanner is consumed. - - const size_t ALIGNMENT = sizeof(size_t); - size_t actions_size = - sc.Size() * - sc.LettersCount() * - sizeof(typename Scanner::Action); - UNIT_ASSERT_EQUAL(actions_size % ALIGNMENT, 0); - size_t tags_size = sc.Size() * sizeof(typename Scanner::Tag); - const char* src = wbuf.Buffer().Data(); - size_t src_size = wbuf.Buffer().Size(); - size_t patched_size = src_size + actions_size; - size_t bytes_before_actions = src_size - tags_size; - const int fill_char = 0x42; - - TVector<char> buf2(patched_size + src_size + 2 * ALIGNMENT); - char* dst = reinterpret_cast<char*>(Pire::Impl::AlignUp(&buf2[0], ALIGNMENT)); - char* patched = dst; - - // Insert dummy m_actions between m_jumps and m_tags. - memcpy(patched, src, bytes_before_actions); // copy members before m_actions - memset(patched + bytes_before_actions, fill_char, actions_size); // m_actions - memcpy(patched + bytes_before_actions + actions_size, - src + bytes_before_actions, - tags_size); // m_tags - // Set version to 6 - // order of fields in header: magic, version, ... - ui32* version_in_patched = reinterpret_cast<ui32*>(patched) + 1; - UNIT_ASSERT_EQUAL(*version_in_patched, Pire::Header::RE_VERSION); - *version_in_patched = Pire::Header::RE_VERSION_WITH_MACTIONS; - - // write normal scanner after patched one - char* normal = Pire::Impl::AlignUp(patched + patched_size, ALIGNMENT); - memcpy(normal, src, src_size); - char* dst_end = Pire::Impl::AlignUp(normal + src_size, ALIGNMENT); - size_t dst_size = dst_end - dst; - - // test loading from stream - { - MemoryInput rbuf(dst, dst_size); - Scanner sc_patched, sc_normal; - ::Load(&rbuf, sc_patched); - ::Load(&rbuf, sc_normal); - auto st_patched = Run(sc_patched, - "abc defg 123 jklmn 4567 opqrst"); - UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4)); - UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2)); - auto st_normal = Run(sc_normal, - "abc defg 123 jklmn 4567 opqrst"); - UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4)); - UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2)); - } - - // test loading using Mmap - { - Scanner sc_patched, sc_normal; - const void* tail = sc_patched.Mmap(patched, patched_size); - UNIT_ASSERT_EQUAL(tail, normal); - const void* tail2 = sc_normal.Mmap(tail, src_size); - UNIT_ASSERT_EQUAL(tail2, dst_end); - auto st_patched = Run(sc_patched, - "abc defg 123 jklmn 4567 opqrst"); - UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4)); - UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2)); - auto st_normal = Run(sc_normal, - "abc defg 123 jklmn 4567 opqrst"); - UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4)); - UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2)); - } - } - - Y_UNIT_TEST(Serialization_v6_compatibility) - { - Serialization_v6_compatibilityOne<Pire::CountingScanner>(); - Serialization_v6_compatibilityOne<Pire::AdvancedCountingScanner>(); - // NoGlueLimitCountingScanner is not v6_compatible - } - - Y_UNIT_TEST(NoGlueLimitScannerCompatibilityWithAdvancedScanner) { - const auto& enc = Pire::Encodings::Latin1(); - auto sc1 = AdvancedCountingScanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); - auto sc2 = AdvancedCountingScanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); - auto sc = AdvancedCountingScanner::Glue(sc1, sc2); - - BufferOutput wbuf; - ::Save(&wbuf, sc); - - TVector<char> buf2(wbuf.Buffer().Size()); - memcpy(buf2.data(), wbuf.Buffer().Data(), wbuf.Buffer().Size()); - - // test loading from stream - { - MemoryInput rbuf(buf2.data(), buf2.size()); - NoGlueLimitCountingScanner scanner; - ::Load(&rbuf, scanner); - auto state = Run(scanner, - "abc defg 123 jklmn 4567 opqrst"); - UNIT_ASSERT_EQUAL(state.Result(0), size_t(4)); - UNIT_ASSERT_EQUAL(state.Result(1), size_t(2)); - } - - // test loading using Mmap - { - NoGlueLimitCountingScanner scanner; - const void* tail = scanner.Mmap(buf2.data(), buf2.size()); - UNIT_ASSERT_EQUAL(tail, buf2.data() + buf2.size()); - auto state = Run(scanner, - "abc defg 123 jklmn 4567 opqrst"); - UNIT_ASSERT_EQUAL(state.Result(0), size_t(4)); - UNIT_ASSERT_EQUAL(state.Result(1), size_t(2)); - } - } - - template<class Scanner> - void EmptyOne() - { - Scanner sc; - UNIT_ASSERT(sc.Empty()); - - UNIT_CHECKPOINT(); Run(sc, "a string"); // Just should not crash - - // Test glueing empty - const auto& enc = Pire::Encodings::Latin1(); - auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); - auto sc2 = Scanner::Glue(sc, Scanner::Glue(sc, sc1)); - auto st = Run(sc2, "abc defg 123 jklmn 4567 opqrst"); - UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); - - // Test Save/Load/Mmap - BufferOutput wbuf; - ::Save(&wbuf, sc); - - MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); - Pire::CountingScanner sc3; - ::Load(&rbuf, sc3); - UNIT_CHECKPOINT(); Run(sc3, "a string"); - - const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); - TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); - const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); - memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); - - Scanner sc4; - const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size()); - UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); - UNIT_CHECKPOINT(); Run(sc4, "a string"); - } - - Y_UNIT_TEST(Empty) - { - EmptyOne<Pire::CountingScanner>(); - EmptyOne<Pire::AdvancedCountingScanner>(); - EmptyOne<Pire::NoGlueLimitCountingScanner>(); - } - - template<typename Scanner> - TVector<Scanner> MakeHalfFinalCount(const char* regexp, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) { - TVector<Scanner> scanners(6); - const auto regexpFsm = MkFsm(regexp, encoding); - HalfFinalFsm fsm(regexpFsm); - fsm.MakeGreedyCounter(true); - scanners[0] = Scanner(fsm); - fsm = HalfFinalFsm(regexpFsm); - fsm.MakeGreedyCounter(false); - scanners[1] = Scanner(fsm); - fsm = HalfFinalFsm(regexpFsm); - fsm.MakeNonGreedyCounter(true, true); - scanners[2] = Scanner(fsm); - fsm = HalfFinalFsm(regexpFsm); - fsm.MakeNonGreedyCounter(true, false); - scanners[3] = Scanner(fsm); - fsm = HalfFinalFsm(regexpFsm); - fsm.MakeNonGreedyCounter(false); - scanners[4] = Scanner(fsm); - scanners[5] = scanners[0]; - for (size_t i = 1; i < 5; i++) { - scanners[5] = Scanner::Glue(scanners[5], scanners[i]); - } - return scanners; - } - - template<typename Scanner> - void HalfFinalCount(TVector<Scanner> scanners, const char* text, TVector<size_t> result) { - for (size_t i = 0; i < 5; i++) { - UNIT_ASSERT_EQUAL(Run(scanners[i], text, -1).Result(0), result[i]); - } - auto state = Run(scanners[5], text, -1); - for (size_t i = 0; i < 5; i++) { - UNIT_ASSERT_EQUAL(state.Result(i), result[i]); - } - } - - template<typename Scanner> - void TestHalfFinalCount() { - HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+"), "abbabbbabbbbbb", {3, 3, 3, 11, 3}); - HalfFinalCount(MakeHalfFinalCount<Scanner>("(ab)+"), "ababbababbab", {3, 3, 5, 5, 5}); - HalfFinalCount(MakeHalfFinalCount<Scanner>("(abab)+"), "ababababab", {1, 1, 4, 4, 2}); - HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbb", {1, 10, 10, 10, 10}); - HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbb", {1, 10, 11, 11, 11}); - HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbc", {1, 1, 10, 11, 10}); - HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbbc", {1, 1, 11, 12, 11}); - HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbbc", {1, 1, 8, 9, 8}); - HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbb", {1, 8, 8, 8, 8}); - HalfFinalCount(MakeHalfFinalCount<Scanner>("a[a-z]+c|b"), "abeeeebeeeeeeeeeceeaeebeeeaeecceebeeaeebeeb", {2, 4, 7, 9, 7}); - } - - Y_UNIT_TEST(HalfFinal) - { - TestHalfFinalCount<Pire::HalfFinalScanner>(); - TestHalfFinalCount<Pire::NonrelocHalfFinalScanner>(); - TestHalfFinalCount<Pire::HalfFinalScannerNoMask>(); - TestHalfFinalCount<Pire::NonrelocHalfFinalScannerNoMask>(); - } - - template<typename Scanner> - void TestHalfFinalSerialization() { - auto oldScanners = MakeHalfFinalCount<Scanner>("(\\w\\w)+"); - BufferOutput wbuf; - for (size_t i = 0; i < 6; i++) { - ::Save(&wbuf, oldScanners[i]); - } - - MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); - TVector<Scanner> scanners(6); - for (size_t i = 0; i < 6; i++) { - ::Load(&rbuf, scanners[i]); - } - - HalfFinalCount(scanners, "ab abbb ababa a", {3, 3, 8, 8, 5}); - } - - Y_UNIT_TEST(HalfFinalSerialization) - { - TestHalfFinalSerialization<Pire::HalfFinalScanner>(); - TestHalfFinalSerialization<Pire::HalfFinalScannerNoMask>(); - } + Pire::Fsm MkFsm(const char* regexp, const Pire::Encoding& encoding) + { + Pire::Lexer lex; + lex.SetEncoding(encoding); + TVector<wchar32> ucs4; + encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4)); + lex.Assign(ucs4.begin(), ucs4.end()); + return lex.Parse(); + } + + template<class Scanner> + typename Scanner::State InitializedState(const Scanner& scanner) + { + typename Scanner::State state; + scanner.Initialize(state); + return state; + } + + template<class Scanner> + typename Scanner::State Run(const Scanner& scanner, const char* text, size_t len =-1) + { + if (len == (size_t)-1) len = strlen(text); + auto state = InitializedState(scanner); + Pire::Step(scanner, state, Pire::BeginMark); + Pire::Run(scanner, state, text, text + len); + Pire::Step(scanner, state, Pire::EndMark); + return state; + } + + template<class Scanner> + size_t CountOne(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) + { + const auto regexpFsm = MkFsm(regexp, encoding); + const auto separatorFsm = MkFsm(separator, encoding); + return Run(Scanner{regexpFsm, separatorFsm}, text, len).Result(0); + } + + size_t Count(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) + { + const auto regexpFsm = MkFsm(regexp, encoding); + const auto separatorFsm = MkFsm(separator, encoding); + auto countingResult = Run(Pire::CountingScanner{regexpFsm, separatorFsm}, text, len).Result(0); + auto newResult = Run(Pire::AdvancedCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0); + if (strcmp(separator, ".*") == 0) { + HalfFinalFsm fsm(regexpFsm); + fsm.MakeGreedyCounter(true); + auto halfFinalSimpleResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0); + fsm = HalfFinalFsm(regexpFsm); + fsm.MakeGreedyCounter(false); + auto halfFinalCorrectResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0); + UNIT_ASSERT_EQUAL(halfFinalSimpleResult, halfFinalCorrectResult); + UNIT_ASSERT_EQUAL(halfFinalSimpleResult, countingResult); + } + UNIT_ASSERT_EQUAL(countingResult, newResult); + auto noGlueLimitResult = Run(Pire::NoGlueLimitCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0); + UNIT_ASSERT_EQUAL(countingResult, noGlueLimitResult); + return newResult; + } + + Y_UNIT_TEST(Count) + { + UNIT_ASSERT_EQUAL(Count("[a-z]+", "\\s", "abc def, abc def ghi, abc"), size_t(3)); + char aaa[] = "abc def\0 abc\0 def ghi, abc"; + UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa), Pire::Encodings::Latin1()), size_t(6)); + UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa)), size_t(6)); + UNIT_ASSERT_EQUAL(Count("\\w", "", "abc abcdef abcd abcdefgh ac"), size_t(8)); + UNIT_ASSERT_EQUAL(Count("http", ".*", "http://aaa, http://bbb, something in the middle, http://ccc, end"), size_t(3)); + UNIT_ASSERT_EQUAL(Count("abc", ".*", "abcabcabcabc"), size_t(4)); + UNIT_ASSERT_EQUAL(Count("[\320\220-\320\257\320\260-\321\217]+", "\\s+", " \320\257\320\275\320\264\320\265\320\272\321\201 " + "\320\237\320\276\320\262\320\265\321\200\320\275\321\203\321\202\321\214 \320\222\320\276\320\271\321\202\320\270\302\240" + "\320\262\302\240\320\277\320\276\321\207\321\202\321\203 \302\251\302\240" "1997\342\200\224" "2008 " + "\302\253\320\257\320\275\320\264\320\265\320\272\321\201\302\273 \320\224\320\270\320\267\320\260\320\271\320\275\302" + "\240\342\200\224\302\240\320\241\321\202\321\203\320\264\320\270\321\217 \320\220\321\200\321\202\320\265\320\274\320\270" + "\321\217\302\240\320\233\320\265\320\261\320\265\320\264\320\265\320\262\320\260\012\012"), size_t(5)); + UNIT_ASSERT_EQUAL(Count("\321\201\320\265\320\272\321\201", ".*", + "\320\277\320\276\321\200\320\275\320\276, \320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 " + "\320\264\320\265\321\202\320\270, \320\264\320\265\321\202\320\270 \320\277\320\276\321\200\320\275\320\276 " + "\320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265. " + "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265 \320\262\320\270\320\264\320\265\320\276 " + "\320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270. \320\264\320\265\321\202\320\270 " + "\320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265\320\276 " + "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265!<br> " + "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\264\320\273\321\217 \320\277\320\276\320\264 " + "\321\201\320\265\320\272\321\201\320\260 \320\277\320\260\321\200\320\276\320\271 " + "\321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201 \320\270\321\211\320\265\320\274 " + "\320\272\320\260\320\271\321\204\320\276\320\274. \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 " + "\320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 " + "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277\320\260\321\200\320\276\320\271 " + "\320\270\321\211\320\265\320\274 \321\201 \320\264\320\273\321\217 \321\201\320\265\320\272\321\201\320\260!<br> " + "\321\202\320\270\321\202\321\214\320\272\320\270 \320\261\320\276\320\273\321\214\321\210\320\270\320\265. " + "\320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 \320\264\320\265\321\202\320\270!<br> " + "\320\270\321\211\320\265\320\274 \321\201 \320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 " + "\321\201\320\265\320\272\321\201\320\260\320\277\320\260\321\200\320\276\320\271 \320\264\320\273\321\217 " + "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271! " + "\320\261\320\276\320\273\321\214\321\210\320\270\320\265 \321\202\320\270\321\202\321\214\320\272\320\270, " + "\320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 \321\201\320\270\321\201\321\202\320\265\320\274\320\260 " + "\320\264\320\273\321\217 \320\276\320\277\320\276\321\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202" + "\320\265\320\273\321\214\320\275\320\260\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\264" + "\320\273\321\217 \320\270\321\211\320\265\320\274 \321\201\320\265\320\272\321\201\320\260 \320\272\320\260\320\271\321\204" + "\320\276\320\274 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275" + "\320\276\320\271 \320\277\320\276\320\264 \320\277\320\260\321\200\320\276\320\271 \321\201. \320\276\320\277\320\276\321" + "\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202\320\265\320\273\321\214\320\275\320\260\321\217 \321" + "\201\320\270\321\201\321\202\320\265\320\274\320\260 \320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 " + "\320\264\320\273\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\261\320\265\321\201\320\277" + "\320\273\320\260\321\202\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265" + "\320\276 \320\264\320\265\321\202\320\270. \320\276\321\204\320\270\321\206\320\265\321\200\321\213 \320\277\320\276\321" + "\200\320\275\320\276 \321\204\320\276\321\202\320\276 \320\263\320\265\320\270, \320\270\321\211\320\265\320\274 \321\201" + "\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277" + "\320\276 \320\277\320\260\321\200\320\276\320\271 \321\201\320\265\320\272\321\201\320\260 \320\264\320\273\321\217 \321\201 " + "\320\272\320\260\320\271\321\204\320\276\320\274. \320\277\320\276\320\264 \320\264\320\273\321\217 \320\272\320\260\320\271" + "\321\204\320\276\320\274 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201\320\265\320\272\321\201" + "\320\260 \320\277\320\260\321\200\320\276\320\271 \321\201 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\270" + "\321\211\320\265\320\274? \320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202" + "\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270, \320\264\320\265\321\202" + "\320\270 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265"), + size_t(6)); + UNIT_ASSERT_EQUAL(Count("<a[^>]*>[^<]*</a>", "([^<]|<br\\s?/?>)*", "\321\200\320\275\320\276</a><br />" + "<a href=\"http://wapspzk.1sweethost.com//22.html\">\320\264\320\265\321\210\320\265\320\262\321\213\320\265 \320\277\320\276" + "\321\200\320\275\320\276 \321\204\320\270\320\273\321\214\320\274\321\213</a><br /><a href=\"http://wapspzk.1sweethost.com//23.html\">" + "\321\201\320\265\320\272\321\201 \321\210\320\276\320\277 \321\200\320\276\321\201\320\270\321\202\320\260</a><br />" + "<a href=\"http://wapspzk.1sweethost.com//24.html\">\320\263\320\276\320\273\321\213\320\265 \320\264\320\265\320\262\321\203" + "\321\210\320\272\320\270 \321\203\320\273\320\270\321\206\320\260</a><br /><a href=\"http://wapspzk.1sweethost.com//25.html\">" + "\321\202\321\200\320\260\321\205\320\275\321\203\321\202\321\214 \320\274\320\260\320\274\320\260\321\210\320\270</a><br />" + "<a href=\"http://wapspzk.1sweethost.com//26.html\">\320\277\320\270\320\267\320\264\320\260 \321\204\321\200\320\270\321\201" + "\320\272\320\265</a><br /><a href=\"http://wapspzk.1sweethost.com//27.html\">\320\261\320\265\321\201\320\277\320\273\320\260" + "\321\202\320\275\320\276</a><br /><a href=\"http://wapspzk.1sweethost.com//33.html\">\321\201\320\276\321\206\320\270\320\276" + "\320\273\320\276\320\263\320\270\321\207\320\265\321\201\320\272\320\270\320\271 \320\260\320\275\320\260\320\273\320\270\320" + "\267 \320\274\320\276\320\264\320\265\320\273\320\265\320\271 \321\201\320\265\320\272\321\201\321\203\320\260\320\273\321\214" + "\320\275\320\276\320\263\320\276 \320\277\320\276\320\262\320\265\320\264\320\265\320\275\320\270\321\217</a>\321\217"), size_t(7)); + UNIT_ASSERT(CountOne<Pire::CountingScanner>("a", "b", "aaa") != size_t(3)); + UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("a", "b", "aaa"), size_t(1)); + UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("[a-z\320\260-\321\217]+", " +", + " \320\260\320\260\320\220 abc def \320\260 cd"), + size_t(4)); // Pire::CountingScanner returns 1 here, since it enters a dead state + } + + Y_UNIT_TEST(CountWithoutSeparator) + { + UNIT_ASSERT_EQUAL(Count("a", "", "aa aaa"), size_t(3)); + } + + Y_UNIT_TEST(CountGreedy) + { + const auto& enc = Pire::Encodings::Latin1(); + char text[] = "wwwsswwwsssswwws"; + UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3)); + UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3)); + UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3)); + UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3)); + } + + Y_UNIT_TEST(CountRepeating) + { + char text[] = "abbabbabbabbat"; + UNIT_ASSERT_EQUAL(Count("abba", ".*", text, sizeof(text), Pire::Encodings::Latin1()), size_t(2)); + } + + template<class Scanner> + void CountGlueOne() + { + const auto& enc = Pire::Encodings::Utf8(); + auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); + auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); + auto sc = Scanner::Glue(sc1, sc2); + auto st = Run(sc, "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st.Result(1), size_t(2)); + } + + Y_UNIT_TEST(CountGlue) + { + CountGlueOne<Pire::CountingScanner>(); + CountGlueOne<Pire::AdvancedCountingScanner>(); + CountGlueOne<Pire::NoGlueLimitCountingScanner>(); + } + + template <class Scanner> + void CountManyGluesOne(size_t maxRegexps) { + const auto& encoding = Pire::Encodings::Utf8(); + auto text = "abcdbaa aa"; + TVector<ypair<std::string, std::string>> tasks = { + {"a", ".*"}, + {"b", ".*"}, + {"c", ".*"}, + {"ba", ".*"}, + {"ab",".*"}, + }; + TVector<size_t> answers = {5, 2, 1, 1, 1}; + Scanner scanner; + size_t regexpsCount = 0; + for (; regexpsCount < maxRegexps; ++regexpsCount) { + const auto& task = tasks[regexpsCount % tasks.size()]; + const auto regexpFsm = MkFsm(task.first.c_str(), encoding); + const auto separatorFsm = MkFsm(task.second.c_str(), encoding); + Scanner nextScanner(regexpFsm, separatorFsm); + auto glue = Scanner::Glue(scanner, nextScanner); + if (glue.Empty()) { + break; + } + scanner = std::move(glue); + } + auto state = Run(scanner, text); + for (size_t i = 0; i < regexpsCount; ++i) { + UNIT_ASSERT_EQUAL(state.Result(i), answers[i % answers.size()]); + } + } + + Y_UNIT_TEST(CountManyGlues) + { + CountManyGluesOne<Pire::CountingScanner>(20); + CountManyGluesOne<Pire::AdvancedCountingScanner>(20); + CountManyGluesOne<Pire::NoGlueLimitCountingScanner>(50); + } + + template<class Scanner> + void CountBoundariesOne() + { + const char* strings[] = { "abcdef", "abc def", "defcba", "wxyz abc", "a", "123" }; + + const auto& enc = Pire::Encodings::Utf8(); + Scanner sc(MkFsm("^[a-z]+$", enc), MkFsm("(.|^|$)*", enc)); + auto st = InitializedState(sc); + for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) { + Pire::Step(sc, st, Pire::BeginMark); + Pire::Run(sc, st, strings[i], strings[i] + strlen(strings[i])); + Pire::Step(sc, st, Pire::EndMark); + } + UNIT_ASSERT_EQUAL(st.Result(0), size_t(3)); + + const auto& enc2 = Pire::Encodings::Latin1(); + Scanner sc2(MkFsm("[a-z]", enc2), MkFsm(".*", enc2)); + auto st2 = InitializedState(sc2); + for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) { + Pire::Step(sc2, st2, Pire::BeginMark); + Pire::Run(sc2, st2, strings[i], strings[i] + strlen(strings[i])); + Pire::Step(sc2, st2, Pire::EndMark); + } + UNIT_ASSERT_EQUAL(st2.Result(0), size_t(7)); + } + + Y_UNIT_TEST(CountBoundaries) + { + CountBoundariesOne<Pire::CountingScanner>(); + CountBoundariesOne<Pire::AdvancedCountingScanner>(); + CountBoundariesOne<Pire::NoGlueLimitCountingScanner>(); + } + + template<class Scanner> + void SerializationOne() + { + const auto& enc = Pire::Encodings::Latin1(); + auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); + auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); + auto sc = Scanner::Glue(sc1, sc2); + + BufferOutput wbuf; + ::Save(&wbuf, sc); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + Scanner sc3; + ::Load(&rbuf, sc3); + + auto st = Run(sc3, "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st.Result(1), size_t(2)); + + const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); + TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); + + // Test mmap-ing at various alignments + for (size_t offset = 0; offset < MaxTestOffset; ++offset) { + const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset; + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + try { + Scanner sc4; + const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size()); + + if (offset % sizeof(size_t) != 0) { + UNIT_ASSERT(!"CountingScanner failed to check for misaligned mmaping"); + } else { + UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); + + st = Run(sc4, "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st.Result(1), size_t(2)); + } + } + catch (Pire::Error&) {} + } + } + + Y_UNIT_TEST(Serialization) + { + SerializationOne<Pire::CountingScanner>(); + SerializationOne<Pire::AdvancedCountingScanner>(); + SerializationOne<Pire::NoGlueLimitCountingScanner>(); + } + + template<class Scanner> + void Serialization_v6_compatibilityOne() + { + const auto& enc = Pire::Encodings::Latin1(); + auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); + auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); + auto sc = Scanner::Glue(sc1, sc2); + + BufferOutput wbuf; + ::Save(&wbuf, sc); + + // Patched scanner is a scanner of RE_VERSION 6. + // The patched scanner is concatenated with original scanner to + // make sure all content of patched scanner is consumed. + + const size_t ALIGNMENT = sizeof(size_t); + size_t actions_size = + sc.Size() * + sc.LettersCount() * + sizeof(typename Scanner::Action); + UNIT_ASSERT_EQUAL(actions_size % ALIGNMENT, 0); + size_t tags_size = sc.Size() * sizeof(typename Scanner::Tag); + const char* src = wbuf.Buffer().Data(); + size_t src_size = wbuf.Buffer().Size(); + size_t patched_size = src_size + actions_size; + size_t bytes_before_actions = src_size - tags_size; + const int fill_char = 0x42; + + TVector<char> buf2(patched_size + src_size + 2 * ALIGNMENT); + char* dst = reinterpret_cast<char*>(Pire::Impl::AlignUp(&buf2[0], ALIGNMENT)); + char* patched = dst; + + // Insert dummy m_actions between m_jumps and m_tags. + memcpy(patched, src, bytes_before_actions); // copy members before m_actions + memset(patched + bytes_before_actions, fill_char, actions_size); // m_actions + memcpy(patched + bytes_before_actions + actions_size, + src + bytes_before_actions, + tags_size); // m_tags + // Set version to 6 + // order of fields in header: magic, version, ... + ui32* version_in_patched = reinterpret_cast<ui32*>(patched) + 1; + UNIT_ASSERT_EQUAL(*version_in_patched, Pire::Header::RE_VERSION); + *version_in_patched = Pire::Header::RE_VERSION_WITH_MACTIONS; + + // write normal scanner after patched one + char* normal = Pire::Impl::AlignUp(patched + patched_size, ALIGNMENT); + memcpy(normal, src, src_size); + char* dst_end = Pire::Impl::AlignUp(normal + src_size, ALIGNMENT); + size_t dst_size = dst_end - dst; + + // test loading from stream + { + MemoryInput rbuf(dst, dst_size); + Scanner sc_patched, sc_normal; + ::Load(&rbuf, sc_patched); + ::Load(&rbuf, sc_normal); + auto st_patched = Run(sc_patched, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2)); + auto st_normal = Run(sc_normal, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2)); + } + + // test loading using Mmap + { + Scanner sc_patched, sc_normal; + const void* tail = sc_patched.Mmap(patched, patched_size); + UNIT_ASSERT_EQUAL(tail, normal); + const void* tail2 = sc_normal.Mmap(tail, src_size); + UNIT_ASSERT_EQUAL(tail2, dst_end); + auto st_patched = Run(sc_patched, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2)); + auto st_normal = Run(sc_normal, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2)); + } + } + + Y_UNIT_TEST(Serialization_v6_compatibility) + { + Serialization_v6_compatibilityOne<Pire::CountingScanner>(); + Serialization_v6_compatibilityOne<Pire::AdvancedCountingScanner>(); + // NoGlueLimitCountingScanner is not v6_compatible + } + + Y_UNIT_TEST(NoGlueLimitScannerCompatibilityWithAdvancedScanner) { + const auto& enc = Pire::Encodings::Latin1(); + auto sc1 = AdvancedCountingScanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); + auto sc2 = AdvancedCountingScanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); + auto sc = AdvancedCountingScanner::Glue(sc1, sc2); + + BufferOutput wbuf; + ::Save(&wbuf, sc); + + TVector<char> buf2(wbuf.Buffer().Size()); + memcpy(buf2.data(), wbuf.Buffer().Data(), wbuf.Buffer().Size()); + + // test loading from stream + { + MemoryInput rbuf(buf2.data(), buf2.size()); + NoGlueLimitCountingScanner scanner; + ::Load(&rbuf, scanner); + auto state = Run(scanner, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(state.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(state.Result(1), size_t(2)); + } + + // test loading using Mmap + { + NoGlueLimitCountingScanner scanner; + const void* tail = scanner.Mmap(buf2.data(), buf2.size()); + UNIT_ASSERT_EQUAL(tail, buf2.data() + buf2.size()); + auto state = Run(scanner, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(state.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(state.Result(1), size_t(2)); + } + } + + template<class Scanner> + void EmptyOne() + { + Scanner sc; + UNIT_ASSERT(sc.Empty()); + + UNIT_CHECKPOINT(); Run(sc, "a string"); // Just should not crash + + // Test glueing empty + const auto& enc = Pire::Encodings::Latin1(); + auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); + auto sc2 = Scanner::Glue(sc, Scanner::Glue(sc, sc1)); + auto st = Run(sc2, "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); + + // Test Save/Load/Mmap + BufferOutput wbuf; + ::Save(&wbuf, sc); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + Pire::CountingScanner sc3; + ::Load(&rbuf, sc3); + UNIT_CHECKPOINT(); Run(sc3, "a string"); + + const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); + TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); + const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + + Scanner sc4; + const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size()); + UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); + UNIT_CHECKPOINT(); Run(sc4, "a string"); + } + + Y_UNIT_TEST(Empty) + { + EmptyOne<Pire::CountingScanner>(); + EmptyOne<Pire::AdvancedCountingScanner>(); + EmptyOne<Pire::NoGlueLimitCountingScanner>(); + } + + template<typename Scanner> + TVector<Scanner> MakeHalfFinalCount(const char* regexp, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) { + TVector<Scanner> scanners(6); + const auto regexpFsm = MkFsm(regexp, encoding); + HalfFinalFsm fsm(regexpFsm); + fsm.MakeGreedyCounter(true); + scanners[0] = Scanner(fsm); + fsm = HalfFinalFsm(regexpFsm); + fsm.MakeGreedyCounter(false); + scanners[1] = Scanner(fsm); + fsm = HalfFinalFsm(regexpFsm); + fsm.MakeNonGreedyCounter(true, true); + scanners[2] = Scanner(fsm); + fsm = HalfFinalFsm(regexpFsm); + fsm.MakeNonGreedyCounter(true, false); + scanners[3] = Scanner(fsm); + fsm = HalfFinalFsm(regexpFsm); + fsm.MakeNonGreedyCounter(false); + scanners[4] = Scanner(fsm); + scanners[5] = scanners[0]; + for (size_t i = 1; i < 5; i++) { + scanners[5] = Scanner::Glue(scanners[5], scanners[i]); + } + return scanners; + } + + template<typename Scanner> + void HalfFinalCount(TVector<Scanner> scanners, const char* text, TVector<size_t> result) { + for (size_t i = 0; i < 5; i++) { + UNIT_ASSERT_EQUAL(Run(scanners[i], text, -1).Result(0), result[i]); + } + auto state = Run(scanners[5], text, -1); + for (size_t i = 0; i < 5; i++) { + UNIT_ASSERT_EQUAL(state.Result(i), result[i]); + } + } + + template<typename Scanner> + void TestHalfFinalCount() { + HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+"), "abbabbbabbbbbb", {3, 3, 3, 11, 3}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("(ab)+"), "ababbababbab", {3, 3, 5, 5, 5}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("(abab)+"), "ababababab", {1, 1, 4, 4, 2}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbb", {1, 10, 10, 10, 10}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbb", {1, 10, 11, 11, 11}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbc", {1, 1, 10, 11, 10}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbbc", {1, 1, 11, 12, 11}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbbc", {1, 1, 8, 9, 8}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbb", {1, 8, 8, 8, 8}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("a[a-z]+c|b"), "abeeeebeeeeeeeeeceeaeebeeeaeecceebeeaeebeeb", {2, 4, 7, 9, 7}); + } + + Y_UNIT_TEST(HalfFinal) + { + TestHalfFinalCount<Pire::HalfFinalScanner>(); + TestHalfFinalCount<Pire::NonrelocHalfFinalScanner>(); + TestHalfFinalCount<Pire::HalfFinalScannerNoMask>(); + TestHalfFinalCount<Pire::NonrelocHalfFinalScannerNoMask>(); + } + + template<typename Scanner> + void TestHalfFinalSerialization() { + auto oldScanners = MakeHalfFinalCount<Scanner>("(\\w\\w)+"); + BufferOutput wbuf; + for (size_t i = 0; i < 6; i++) { + ::Save(&wbuf, oldScanners[i]); + } + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + TVector<Scanner> scanners(6); + for (size_t i = 0; i < 6; i++) { + ::Load(&rbuf, scanners[i]); + } + + HalfFinalCount(scanners, "ab abbb ababa a", {3, 3, 8, 8, 5}); + } + + Y_UNIT_TEST(HalfFinalSerialization) + { + TestHalfFinalSerialization<Pire::HalfFinalScanner>(); + TestHalfFinalSerialization<Pire::HalfFinalScannerNoMask>(); + } } diff --git a/library/cpp/regex/pire/ut/easy_ut.cpp b/library/cpp/regex/pire/ut/easy_ut.cpp index 5f0f8303fce..a1a4b688582 100644 --- a/library/cpp/regex/pire/ut/easy_ut.cpp +++ b/library/cpp/regex/pire/ut/easy_ut.cpp @@ -31,27 +31,27 @@ #include <easy.h> Y_UNIT_TEST_SUITE(TestPireEasy) { - + Y_UNIT_TEST(Match) { - Pire::Regexp re("(foo|bar)+", Pire::I); - UNIT_ASSERT("prefix fOoBaR suffix" ==~ re); - UNIT_ASSERT(!("bla bla bla" ==~ re)); + Pire::Regexp re("(foo|bar)+", Pire::I); + UNIT_ASSERT("prefix fOoBaR suffix" ==~ re); + UNIT_ASSERT(!("bla bla bla" ==~ re)); } Y_UNIT_TEST(Utf8) { - Pire::Regexp re("^.$", Pire::I | Pire::UTF8); - UNIT_ASSERT("\x41" ==~ re); - UNIT_ASSERT(!("\x81" ==~ re)); + Pire::Regexp re("^.$", Pire::I | Pire::UTF8); + UNIT_ASSERT("\x41" ==~ re); + UNIT_ASSERT(!("\x81" ==~ re)); } Y_UNIT_TEST(TwoFeatures) { - Pire::Regexp re("^(a.c&.b.)$", Pire::I | Pire::ANDNOT); - UNIT_ASSERT("abc" ==~ re); - UNIT_ASSERT("ABC" ==~ re); - UNIT_ASSERT(!("adc" ==~ re)); + Pire::Regexp re("^(a.c&.b.)$", Pire::I | Pire::ANDNOT); + UNIT_ASSERT("abc" ==~ re); + UNIT_ASSERT("ABC" ==~ re); + UNIT_ASSERT(!("adc" ==~ re)); } - + } diff --git a/library/cpp/regex/pire/ut/glyph_ut.cpp b/library/cpp/regex/pire/ut/glyph_ut.cpp index 05ef56b01bc..3955029266d 100644 --- a/library/cpp/regex/pire/ut/glyph_ut.cpp +++ b/library/cpp/regex/pire/ut/glyph_ut.cpp @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -28,36 +28,36 @@ Y_UNIT_TEST_SUITE(Glyphs) { - Pire::Fsm ParseFsm(const char* regexp) - { - TVector<wchar32> ucs4; - Pire::Encodings::Utf8().FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4)); - return Pire::Lexer(ucs4).SetEncoding(Pire::Encodings::Utf8()).AddFeature(Pire::Features::GlueSimilarGlyphs()).Parse().Surround(); - } + Pire::Fsm ParseFsm(const char* regexp) + { + TVector<wchar32> ucs4; + Pire::Encodings::Utf8().FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4)); + return Pire::Lexer(ucs4).SetEncoding(Pire::Encodings::Utf8()).AddFeature(Pire::Features::GlueSimilarGlyphs()).Parse().Surround(); + } #define NOGL_REGEXP(str) REGEXP2(str, "u") #define GL_REGEXP(str) SCANNER(ParseFsm(str)) - Y_UNIT_TEST(Glyphs) - { - NOGL_REGEXP("regexp") { - ACCEPTS("regexp"); - DENIES("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); - } - - GL_REGEXP("regexp") { - ACCEPTS("regexp"); - ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); - } - - NOGL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") { - DENIES("regexp"); - ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); - } - - GL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") { - ACCEPTS("regexp"); - ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); - } - } + Y_UNIT_TEST(Glyphs) + { + NOGL_REGEXP("regexp") { + ACCEPTS("regexp"); + DENIES("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); + } + + GL_REGEXP("regexp") { + ACCEPTS("regexp"); + ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); + } + + NOGL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") { + DENIES("regexp"); + ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); + } + + GL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") { + ACCEPTS("regexp"); + ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); + } + } } diff --git a/library/cpp/regex/pire/ut/inline_ut.cpp b/library/cpp/regex/pire/ut/inline_ut.cpp index 3ba31dfaa86..055c5b28bf9 100644 --- a/library/cpp/regex/pire/ut/inline_ut.cpp +++ b/library/cpp/regex/pire/ut/inline_ut.cpp @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -32,60 +32,60 @@ Y_UNIT_TEST_SUITE(TestPireInline) { template<class Scanner> typename Scanner::State RunRegexp(const Scanner& scanner, const char* str) { - typename Scanner::State state; - scanner.Initialize(state); - Step(scanner, state, Pire::BeginMark); - Run(scanner, state, str, str + strlen(str)); - Step(scanner, state, Pire::EndMark); - return state; + typename Scanner::State state; + scanner.Initialize(state); + Step(scanner, state, Pire::BeginMark); + Run(scanner, state, str, str + strlen(str)); + Step(scanner, state, Pire::EndMark); + return state; } template<class Scanner> bool Matches(const Scanner& scanner, const char* str) { - return scanner.Final(RunRegexp(scanner, str)); + return scanner.Final(RunRegexp(scanner, str)); } template<class Scanner> bool Matches2(const Scanner& scanner, const char* str) { - return Pire::Matches(scanner, str); + return Pire::Matches(scanner, str); } bool ParticularMatch(Pire::Scanner& sc, Pire::Scanner::State st, size_t idx) { - std::pair<const size_t*, const size_t*> p = sc.AcceptedRegexps(st); - return std::distance(p.first, p.second) == 1 && *p.first == idx; + std::pair<const size_t*, const size_t*> p = sc.AcceptedRegexps(st); + return std::distance(p.first, p.second) == 1 && *p.first == idx; } Y_UNIT_TEST(Inline) { - Pire::Scanner scanner = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "is"); - UNIT_ASSERT(Matches(scanner, "http://domain.vasya.ru/")); - UNIT_ASSERT(Matches(scanner, "prefix http://domain.vasya.ru/")); - UNIT_ASSERT(!Matches(scanner, "http://127.0.0.1/")); + Pire::Scanner scanner = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "is"); + UNIT_ASSERT(Matches(scanner, "http://domain.vasya.ru/")); + UNIT_ASSERT(Matches(scanner, "prefix http://domain.vasya.ru/")); + UNIT_ASSERT(!Matches(scanner, "http://127.0.0.1/")); - Pire::Scanner scanner2 = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "i"); - UNIT_ASSERT(Matches2(scanner2, "http://domain.vasya.ru/")); - UNIT_ASSERT(!Matches2(scanner2, "prefix http://domain.vasya.ru/")); - UNIT_ASSERT(!Matches2(scanner2, "http://127.0.0.1/")); + Pire::Scanner scanner2 = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "i"); + UNIT_ASSERT(Matches2(scanner2, "http://domain.vasya.ru/")); + UNIT_ASSERT(!Matches2(scanner2, "prefix http://domain.vasya.ru/")); + UNIT_ASSERT(!Matches2(scanner2, "http://127.0.0.1/")); } - + Y_UNIT_TEST(InlineGlue) { - // Check whether pire_inline handles comments as well: - - /* - a C-style comment outside a regexp; */ - Pire::Scanner sc = PIRE_REGEXP( - "foo", "", /* - a C-style comment inside a regexp; */ - "bar", "", // - a C++-style comment inside a regexp; - "baz", "" - ); - // - a C++-style comment outside a regexp. - UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("foo").State(), 0)); - UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("bar").State(), 1)); - UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("baz").State(), 2)); - UNIT_ASSERT(!Matches2(sc, "xxx")); + // Check whether pire_inline handles comments as well: + + /* - a C-style comment outside a regexp; */ + Pire::Scanner sc = PIRE_REGEXP( + "foo", "", /* - a C-style comment inside a regexp; */ + "bar", "", // - a C++-style comment inside a regexp; + "baz", "" + ); + // - a C++-style comment outside a regexp. + UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("foo").State(), 0)); + UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("bar").State(), 1)); + UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("baz").State(), 2)); + UNIT_ASSERT(!Matches2(sc, "xxx")); } } diff --git a/library/cpp/regex/pire/ut/pire_ut.cpp b/library/cpp/regex/pire/ut/pire_ut.cpp index 13f3f2ec717..5fa66c36545 100644 --- a/library/cpp/regex/pire/ut/pire_ut.cpp +++ b/library/cpp/regex/pire/ut/pire_ut.cpp @@ -37,237 +37,237 @@ Y_UNIT_TEST_SUITE(TestPire) { Y_UNIT_TEST(String) { - REGEXP("abc") { - ACCEPTS("def abc ghi"); - ACCEPTS("abc"); - DENIES ("def abd ghi"); - } + REGEXP("abc") { + ACCEPTS("def abc ghi"); + ACCEPTS("abc"); + DENIES ("def abd ghi"); + } } Y_UNIT_TEST(Boundaries) { - REGEXP("^abc") { - ACCEPTS("abc ghi"); - DENIES ("def abc"); - } + REGEXP("^abc") { + ACCEPTS("abc ghi"); + DENIES ("def abc"); + } - REGEXP("abc$") { - DENIES ("abc ghi"); - ACCEPTS("def abc"); - } + REGEXP("abc$") { + DENIES ("abc ghi"); + ACCEPTS("def abc"); + } } Y_UNIT_TEST(Primitives) { - REGEXP("abc|def") { - ACCEPTS("def"); - ACCEPTS("abc"); - DENIES ("deb"); - } - - REGEXP("ad*e") { - ACCEPTS("xaez"); - ACCEPTS("xadez"); - ACCEPTS("xaddez"); - ACCEPTS("xadddddddddddddddddddddddez"); - DENIES ("xafez"); - } - - REGEXP("ad+e") { - DENIES ("xaez"); - ACCEPTS("xadez"); - ACCEPTS("xaddez"); - ACCEPTS("xadddddddddddddddddddddddez"); - DENIES ("xafez"); - } - - REGEXP("ad?e") { - ACCEPTS("xaez"); - ACCEPTS("xadez"); - DENIES ("xaddez"); - DENIES ("xafez"); - } - - REGEXP("a.{1}e") { - ACCEPTS("axe"); - DENIES ("ae"); - DENIES ("axye"); - } + REGEXP("abc|def") { + ACCEPTS("def"); + ACCEPTS("abc"); + DENIES ("deb"); + } + + REGEXP("ad*e") { + ACCEPTS("xaez"); + ACCEPTS("xadez"); + ACCEPTS("xaddez"); + ACCEPTS("xadddddddddddddddddddddddez"); + DENIES ("xafez"); + } + + REGEXP("ad+e") { + DENIES ("xaez"); + ACCEPTS("xadez"); + ACCEPTS("xaddez"); + ACCEPTS("xadddddddddddddddddddddddez"); + DENIES ("xafez"); + } + + REGEXP("ad?e") { + ACCEPTS("xaez"); + ACCEPTS("xadez"); + DENIES ("xaddez"); + DENIES ("xafez"); + } + + REGEXP("a.{1}e") { + ACCEPTS("axe"); + DENIES ("ae"); + DENIES ("axye"); + } } void TestMassAlternatives(const char* pattern) { - REGEXP(pattern) { - ACCEPTS("abc"); - ACCEPTS("def"); - ACCEPTS("ghi"); - ACCEPTS("klm"); - DENIES ("aei"); - DENIES ("klc"); - } + REGEXP(pattern) { + ACCEPTS("abc"); + ACCEPTS("def"); + ACCEPTS("ghi"); + ACCEPTS("klm"); + DENIES ("aei"); + DENIES ("klc"); + } } Y_UNIT_TEST(MassAlternatives) { - TestMassAlternatives("((abc|def)|ghi)|klm"); + TestMassAlternatives("((abc|def)|ghi)|klm"); - TestMassAlternatives("(abc|def)|(ghi|klm)"); + TestMassAlternatives("(abc|def)|(ghi|klm)"); - TestMassAlternatives("abc|(def|(ghi|klm))"); + TestMassAlternatives("abc|(def|(ghi|klm))"); - TestMassAlternatives("abc|(def|ghi)|klm"); + TestMassAlternatives("abc|(def|ghi)|klm"); } Y_UNIT_TEST(Composition) { - REGEXP("^/([^\\\\/]|\\\\.)*/[a-z]*$") { - ACCEPTS("/regexp/i"); - ACCEPTS("/regexp2/"); - DENIES ("regexp"); + REGEXP("^/([^\\\\/]|\\\\.)*/[a-z]*$") { + ACCEPTS("/regexp/i"); + ACCEPTS("/regexp2/"); + DENIES ("regexp"); - ACCEPTS("/dir\\/file/"); - DENIES ("/dir/file/"); + ACCEPTS("/dir\\/file/"); + DENIES ("/dir/file/"); - ACCEPTS("/dir\\\\/"); - DENIES ("/dir\\\\/file/"); - } + ACCEPTS("/dir\\\\/"); + DENIES ("/dir\\\\/file/"); + } - REGEXP("Head(Inner)*Tail") { - ACCEPTS("HeadInnerTail"); - ACCEPTS("HeadInnerInnerTail"); - DENIES ("HeadInneInnerTail"); - ACCEPTS("HeadTail"); - } + REGEXP("Head(Inner)*Tail") { + ACCEPTS("HeadInnerTail"); + ACCEPTS("HeadInnerInnerTail"); + DENIES ("HeadInneInnerTail"); + ACCEPTS("HeadTail"); + } } Y_UNIT_TEST(Repetition) { - REGEXP("^x{3,6}$") { - DENIES ("xx"); - ACCEPTS("xxx"); - ACCEPTS("xxxx"); - ACCEPTS("xxxxx"); - ACCEPTS("xxxxxx"); - DENIES ("xxxxxxx"); - } - - REGEXP("^x{3,}$") { - DENIES ("xx"); - ACCEPTS("xxx"); - ACCEPTS("xxxx"); - ACCEPTS("xxxxxxxxxxx"); - ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); - } - - REGEXP("^x{3}$") { - DENIES ("x"); - DENIES ("xx"); - ACCEPTS("xxx"); - DENIES ("xxxx"); - DENIES ("xxxxx"); - DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); - } - - REGEXP("x.{3,10}$") { - for (size_t size = 0; size < 20; ++size) { - ystring str = ystring(size*2, 'b') + "x" + ystring(size, 'e'); - if (size >= 3 && size <= 10) - ACCEPTS(str.c_str()); - else - DENIES(str.c_str()); - } - } + REGEXP("^x{3,6}$") { + DENIES ("xx"); + ACCEPTS("xxx"); + ACCEPTS("xxxx"); + ACCEPTS("xxxxx"); + ACCEPTS("xxxxxx"); + DENIES ("xxxxxxx"); + } + + REGEXP("^x{3,}$") { + DENIES ("xx"); + ACCEPTS("xxx"); + ACCEPTS("xxxx"); + ACCEPTS("xxxxxxxxxxx"); + ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + } + + REGEXP("^x{3}$") { + DENIES ("x"); + DENIES ("xx"); + ACCEPTS("xxx"); + DENIES ("xxxx"); + DENIES ("xxxxx"); + DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + } + + REGEXP("x.{3,10}$") { + for (size_t size = 0; size < 20; ++size) { + ystring str = ystring(size*2, 'b') + "x" + ystring(size, 'e'); + if (size >= 3 && size <= 10) + ACCEPTS(str.c_str()); + else + DENIES(str.c_str()); + } + } } Y_UNIT_TEST(UTF8) { - REGEXP2("^.$", "u") { - // A single-byte sequence 0xxx xxxx - ACCEPTS("\x41"); - DENIES ("\x81"); - - // A two-byte sequence: 110x xxxx | 10xx xxxx - ACCEPTS("\xC1\x81"); - DENIES ("\xC1"); - DENIES ("\xC1\x41"); - DENIES ("\xC1\xC2"); - DENIES ("\xC1\x81\x82"); - - // A three-byte sequence: 1110 xxxx | 10xx xxxx | 10xx xxxx - ACCEPTS("\xE1\x81\x82"); - DENIES ("\xE1"); - DENIES ("\xE1\x42"); - DENIES ("\xE1\x42\x43"); - DENIES ("\xE1\xC2\xC3"); - DENIES ("\xE1\x82"); - DENIES ("\xE1\x82\x83\x84"); - - // A four-byte sequence: 1111 0xxx | 10xx xxxx | 10xx xxxx | 10xx xxxx - ACCEPTS("\xF1\x81\x82\x83"); - } - - REGEXP2("x\xD0\xA4y", "u") ACCEPTS("x\xD0\xA4y"); + REGEXP2("^.$", "u") { + // A single-byte sequence 0xxx xxxx + ACCEPTS("\x41"); + DENIES ("\x81"); + + // A two-byte sequence: 110x xxxx | 10xx xxxx + ACCEPTS("\xC1\x81"); + DENIES ("\xC1"); + DENIES ("\xC1\x41"); + DENIES ("\xC1\xC2"); + DENIES ("\xC1\x81\x82"); + + // A three-byte sequence: 1110 xxxx | 10xx xxxx | 10xx xxxx + ACCEPTS("\xE1\x81\x82"); + DENIES ("\xE1"); + DENIES ("\xE1\x42"); + DENIES ("\xE1\x42\x43"); + DENIES ("\xE1\xC2\xC3"); + DENIES ("\xE1\x82"); + DENIES ("\xE1\x82\x83\x84"); + + // A four-byte sequence: 1111 0xxx | 10xx xxxx | 10xx xxxx | 10xx xxxx + ACCEPTS("\xF1\x81\x82\x83"); + } + + REGEXP2("x\xD0\xA4y", "u") ACCEPTS("x\xD0\xA4y"); } Y_UNIT_TEST(AndNot) { - REGEXP2("<([0-9]+&~123&~456)>", "a") { - ACCEPTS("<111>"); - ACCEPTS("<124>"); - DENIES ("<123>"); - DENIES ("<456>"); - DENIES ("<abc>"); - } + REGEXP2("<([0-9]+&~123&~456)>", "a") { + ACCEPTS("<111>"); + ACCEPTS("<124>"); + DENIES ("<123>"); + DENIES ("<456>"); + DENIES ("<abc>"); + } - REGEXP2("[0-9]+\\&1+", "a") { - DENIES("111"); - ACCEPTS("123&111"); - } + REGEXP2("[0-9]+\\&1+", "a") { + DENIES("111"); + ACCEPTS("123&111"); + } } Y_UNIT_TEST(Empty) { - Scanners s("\\s*", "n"); - Pire::Scanner::State state; - s.fast.Initialize(state); - UNIT_ASSERT(s.fast.Final(state)); - Pire::SimpleScanner::State stateSF; - s.simple.Initialize(stateSF); - UNIT_ASSERT(s.simple.Final(stateSF)); + Scanners s("\\s*", "n"); + Pire::Scanner::State state; + s.fast.Initialize(state); + UNIT_ASSERT(s.fast.Final(state)); + Pire::SimpleScanner::State stateSF; + s.simple.Initialize(stateSF); + UNIT_ASSERT(s.simple.Final(stateSF)); } Y_UNIT_TEST(Misc) { - REGEXP2("^[^\\s=/>]*$", "n") ACCEPTS("a"); - REGEXP("\\t") ACCEPTS("\t"); + REGEXP2("^[^\\s=/>]*$", "n") ACCEPTS("a"); + REGEXP("\\t") ACCEPTS("\t"); - SCANNER(ParseRegexp(".*") & ~ParseRegexp(".*http.*")) { - ACCEPTS("str"); - DENIES("str_http"); - } + SCANNER(ParseRegexp(".*") & ~ParseRegexp(".*http.*")) { + ACCEPTS("str"); + DENIES("str_http"); + } - SCANNER(~Pire::Fsm()) ACCEPTS("str"); + SCANNER(~Pire::Fsm()) ACCEPTS("str"); } Y_UNIT_TEST(Ranges) { - REGEXP("a\\W") { - ACCEPTS("a,"); - DENIES("ab"); - } + REGEXP("a\\W") { + ACCEPTS("a,"); + DENIES("ab"); + } - try { - REGEXP("abc[def") {} - UNIT_ASSERT(!"Should report syntax error"); - } - catch (Pire::Error&) {} + try { + REGEXP("abc[def") {} + UNIT_ASSERT(!"Should report syntax error"); + } + catch (Pire::Error&) {} } Y_UNIT_TEST(Reverse) { - SCANNER(ParseRegexp("abcdef").Reverse()) { - ACCEPTS("fedcba"); - DENIES ("abcdef"); - } + SCANNER(ParseRegexp("abcdef").Reverse()) { + ACCEPTS("fedcba"); + DENIES ("abcdef"); + } } #if defined(__GNUC__) @@ -277,480 +277,480 @@ Y_UNIT_TEST(Reverse) Y_UNIT_TEST(PrefixSuffix) { - static const char* pattern = "-->"; - Pire::Fsm fsm = ParseRegexp(pattern, "n"); - Pire::Scanner ngsc = (~Pire::Fsm::MakeFalse() + fsm).Compile<Pire::Scanner>(); - Pire::Scanner gsc = (~fsm.Surrounded() + fsm).Compile<Pire::Scanner>(); - Pire::Scanner rsc = fsm.Reverse().Compile<Pire::Scanner>(); - - static const char* text = "1234567890 --> middle --> end"; - const char* end = Pire::LongestPrefix(gsc, text, text + strlen(text)); - UNIT_ASSERT_EQUAL(end, text + 14); - const char* begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1; - UNIT_ASSERT_EQUAL(begin, text + 11); - auto view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(gsc, text)); - UNIT_ASSERT_EQUAL(view.data(), text + 11); - UNIT_ASSERT_EQUAL(view.size(), 3); - - end = Pire::LongestPrefix(ngsc, text, text + strlen(text)); - UNIT_ASSERT_EQUAL(end, text + 25); - begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1; - UNIT_ASSERT_EQUAL(begin, text + 22); - view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(ngsc, text)); - UNIT_ASSERT_EQUAL(view.data(), text + 22); - UNIT_ASSERT_EQUAL(view.size(), 3); - - end = Pire::ShortestPrefix(gsc, text, text + strlen(text)); - UNIT_ASSERT_EQUAL(end, text + 14); - begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1; - UNIT_ASSERT_EQUAL(begin, text + 11); - view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(gsc, text)); - UNIT_ASSERT_EQUAL(view.data(), text + 11); - UNIT_ASSERT_EQUAL(view.size(), 3); - - end = Pire::ShortestPrefix(ngsc, text, text + strlen(text)); - UNIT_ASSERT_EQUAL(end, text + 14); - begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1; - UNIT_ASSERT_EQUAL(begin, text + 11); - view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(ngsc, text)); - UNIT_ASSERT_EQUAL(view.data(), text + 11); - UNIT_ASSERT_EQUAL(view.size(), 3); + static const char* pattern = "-->"; + Pire::Fsm fsm = ParseRegexp(pattern, "n"); + Pire::Scanner ngsc = (~Pire::Fsm::MakeFalse() + fsm).Compile<Pire::Scanner>(); + Pire::Scanner gsc = (~fsm.Surrounded() + fsm).Compile<Pire::Scanner>(); + Pire::Scanner rsc = fsm.Reverse().Compile<Pire::Scanner>(); + + static const char* text = "1234567890 --> middle --> end"; + const char* end = Pire::LongestPrefix(gsc, text, text + strlen(text)); + UNIT_ASSERT_EQUAL(end, text + 14); + const char* begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1; + UNIT_ASSERT_EQUAL(begin, text + 11); + auto view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(gsc, text)); + UNIT_ASSERT_EQUAL(view.data(), text + 11); + UNIT_ASSERT_EQUAL(view.size(), 3); + + end = Pire::LongestPrefix(ngsc, text, text + strlen(text)); + UNIT_ASSERT_EQUAL(end, text + 25); + begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1; + UNIT_ASSERT_EQUAL(begin, text + 22); + view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(ngsc, text)); + UNIT_ASSERT_EQUAL(view.data(), text + 22); + UNIT_ASSERT_EQUAL(view.size(), 3); + + end = Pire::ShortestPrefix(gsc, text, text + strlen(text)); + UNIT_ASSERT_EQUAL(end, text + 14); + begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1; + UNIT_ASSERT_EQUAL(begin, text + 11); + view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(gsc, text)); + UNIT_ASSERT_EQUAL(view.data(), text + 11); + UNIT_ASSERT_EQUAL(view.size(), 3); + + end = Pire::ShortestPrefix(ngsc, text, text + strlen(text)); + UNIT_ASSERT_EQUAL(end, text + 14); + begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1; + UNIT_ASSERT_EQUAL(begin, text + 11); + view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(ngsc, text)); + UNIT_ASSERT_EQUAL(view.data(), text + 11); + UNIT_ASSERT_EQUAL(view.size(), 3); } #if defined(__GNUC__) #pragma GCC diagnostic pop #endif Y_UNIT_TEST(PrefixSuffixEmptyView) { - const std::string_view empty{}; - auto checkAnswer = [](std::string_view answer) { - return !answer.data() && answer.size() == 0; - }; - - TVector<ystring> patterns = { - "", - "a", - ".*", - "a.*", - ".*a" - }; - - for (const auto& pattern: patterns) { - Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); - UNIT_ASSERT_C(checkAnswer(Pire::ShortestPrefix(sc, empty)), pattern); - UNIT_ASSERT_C(checkAnswer(Pire::LongestPrefix(sc, empty)), pattern); - UNIT_ASSERT_C(checkAnswer(Pire::ShortestSuffix(sc, empty)), pattern); - UNIT_ASSERT_C(checkAnswer(Pire::LongestSuffix(sc, empty)), pattern); - } + const std::string_view empty{}; + auto checkAnswer = [](std::string_view answer) { + return !answer.data() && answer.size() == 0; + }; + + TVector<ystring> patterns = { + "", + "a", + ".*", + "a.*", + ".*a" + }; + + for (const auto& pattern: patterns) { + Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); + UNIT_ASSERT_C(checkAnswer(Pire::ShortestPrefix(sc, empty)), pattern); + UNIT_ASSERT_C(checkAnswer(Pire::LongestPrefix(sc, empty)), pattern); + UNIT_ASSERT_C(checkAnswer(Pire::ShortestSuffix(sc, empty)), pattern); + UNIT_ASSERT_C(checkAnswer(Pire::LongestSuffix(sc, empty)), pattern); + } } namespace { - ssize_t LongestPrefixLen(const char* pattern, const char* str) - { - Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); - const char* end = Pire::LongestPrefix(sc, str, str + strlen(str)); - return end ? end - str : -1; - } - - ssize_t ShortestPrefixLen(const char* pattern, const char* str) - { - Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); - const char* end = Pire::ShortestPrefix(sc, str, str + strlen(str)); - return end ? end - str : -1; - } - - ssize_t LongestSuffixLen(const char* pattern, const char* str) - { - Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); - const char* rbegin = str + strlen(str) - 1; - const char* rend = Pire::LongestSuffix(sc, rbegin, str - 1); - return rend ? rbegin - rend : -1; - } - - ssize_t ShortestSuffixLen(const char* pattern, const char* str) { - Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); - const char* rbegin = str + strlen(str) - 1; - const char* rend = Pire::ShortestSuffix(sc, rbegin, str - 1); - return rend ? rbegin - rend : -1; - } + ssize_t LongestPrefixLen(const char* pattern, const char* str) + { + Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); + const char* end = Pire::LongestPrefix(sc, str, str + strlen(str)); + return end ? end - str : -1; + } + + ssize_t ShortestPrefixLen(const char* pattern, const char* str) + { + Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); + const char* end = Pire::ShortestPrefix(sc, str, str + strlen(str)); + return end ? end - str : -1; + } + + ssize_t LongestSuffixLen(const char* pattern, const char* str) + { + Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); + const char* rbegin = str + strlen(str) - 1; + const char* rend = Pire::LongestSuffix(sc, rbegin, str - 1); + return rend ? rbegin - rend : -1; + } + + ssize_t ShortestSuffixLen(const char* pattern, const char* str) { + Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); + const char* rbegin = str + strlen(str) - 1; + const char* rend = Pire::ShortestSuffix(sc, rbegin, str - 1); + return rend ? rbegin - rend : -1; + } } Y_UNIT_TEST(ScanBoundaries) { - struct Case { - ystring pattern; - ystring text; - ssize_t shortestPrefixLen; - ssize_t longestPrefixLen; - - ystring ToString() const { - return ystring("Pattern: ") + pattern + ", text: " + text; - } - }; - - TVector <Case> cases = { - { - "a*", - "", - 0, - 0, - }, - { - "a", - "", - -1, - -1, - }, - { - "fixed", - "fixed prefix", - 5, - 5, - }, - { - "fixed", - "a fixed nonexistent prefix", - -1, - -1, - }, - { - "a*", - "aaabbb", - 0, - 3, - }, - { - "a*", - "bbbbbb", - 0, - 0, - }, - { - "a*", - "aaaaaa", - 0, - 6, - }, - { - "aa*", - "aaabbb", - 1, - 3, - }, - { - "a*a", - "aaaaaa", - 1, - 6, - }, - { - ".*a", - "bbbba", - 5, - 5, - }, - { - ".*", - "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-", - 0, - 80, - }, - { - ".*a", - "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a", - 81, - 81, - }, - { - ".*a", - "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" - "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a", - 81, - 162, - }, - { - ".*b", - "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-", - -1, - -1, - }, - { - ".*a.*", - "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" - "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b", - 81, - 162, - }, - { - ".*a.*b", - "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" - "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b", - 162, - 162, - }, - { - "1.*a.*", - "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" - "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b", - 81, - 162, - }, - { - "a+", - "bbbbbb", - -1, - -1, - }, - }; - - for (const auto& test: cases) { - UNIT_ASSERT_EQUAL_C(ShortestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.shortestPrefixLen, test.ToString()); - UNIT_ASSERT_EQUAL_C(LongestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.longestPrefixLen, test.ToString()); - auto reversed = test.text; - ReverseInPlace(reversed); - UNIT_ASSERT_EQUAL_C(ShortestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.shortestPrefixLen, test.ToString()); - UNIT_ASSERT_EQUAL_C(LongestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.longestPrefixLen, test.ToString()); - } + struct Case { + ystring pattern; + ystring text; + ssize_t shortestPrefixLen; + ssize_t longestPrefixLen; + + ystring ToString() const { + return ystring("Pattern: ") + pattern + ", text: " + text; + } + }; + + TVector <Case> cases = { + { + "a*", + "", + 0, + 0, + }, + { + "a", + "", + -1, + -1, + }, + { + "fixed", + "fixed prefix", + 5, + 5, + }, + { + "fixed", + "a fixed nonexistent prefix", + -1, + -1, + }, + { + "a*", + "aaabbb", + 0, + 3, + }, + { + "a*", + "bbbbbb", + 0, + 0, + }, + { + "a*", + "aaaaaa", + 0, + 6, + }, + { + "aa*", + "aaabbb", + 1, + 3, + }, + { + "a*a", + "aaaaaa", + 1, + 6, + }, + { + ".*a", + "bbbba", + 5, + 5, + }, + { + ".*", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-", + 0, + 80, + }, + { + ".*a", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a", + 81, + 81, + }, + { + ".*a", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a", + 81, + 162, + }, + { + ".*b", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-", + -1, + -1, + }, + { + ".*a.*", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b", + 81, + 162, + }, + { + ".*a.*b", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b", + 162, + 162, + }, + { + "1.*a.*", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b", + 81, + 162, + }, + { + "a+", + "bbbbbb", + -1, + -1, + }, + }; + + for (const auto& test: cases) { + UNIT_ASSERT_EQUAL_C(ShortestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.shortestPrefixLen, test.ToString()); + UNIT_ASSERT_EQUAL_C(LongestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.longestPrefixLen, test.ToString()); + auto reversed = test.text; + ReverseInPlace(reversed); + UNIT_ASSERT_EQUAL_C(ShortestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.shortestPrefixLen, test.ToString()); + UNIT_ASSERT_EQUAL_C(LongestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.longestPrefixLen, test.ToString()); + } } Y_UNIT_TEST(ScanTermination) { - Pire::Scanner sc = Pire::Lexer("aaa").Parse().Compile<Pire::Scanner>(); - // Scanning must terminate at first dead state. If it does not, - // we will pass through the end of our string and end up with segfault. - const char str[] = "aaab"; - const char* p = Pire::LongestPrefix(sc, &str[0], &str[0] + sizeof(str)); - UNIT_ASSERT(p == &str[0] + 3); + Pire::Scanner sc = Pire::Lexer("aaa").Parse().Compile<Pire::Scanner>(); + // Scanning must terminate at first dead state. If it does not, + // we will pass through the end of our string and end up with segfault. + const char str[] = "aaab"; + const char* p = Pire::LongestPrefix(sc, &str[0], &str[0] + sizeof(str)); + UNIT_ASSERT(p == &str[0] + 3); } struct BasicMmapTest { - template <class Scanner> - static void Match(Scanner& sc, const void* ptr, size_t sz, const char* str) - { - try { - sc.Mmap(ptr, sz); - if (!Pire::Impl::IsAligned(ptr, sizeof(size_t))) { - UNIT_ASSERT(!"Failed to check for misaligned mmaping"); - } else { - UNIT_ASSERT(Matches(sc, str)); - } - } - catch (Pire::Error&) {} - } + template <class Scanner> + static void Match(Scanner& sc, const void* ptr, size_t sz, const char* str) + { + try { + sc.Mmap(ptr, sz); + if (!Pire::Impl::IsAligned(ptr, sizeof(size_t))) { + UNIT_ASSERT(!"Failed to check for misaligned mmaping"); + } else { + UNIT_ASSERT(Matches(sc, str)); + } + } + catch (Pire::Error&) {} + } }; template <class Sc1, class Sc2> void TestCopyingHelper() { - Pire::Fsm fsm = ParseRegexp("^r$", ""); - Sc1 sc1(Pire::Fsm(fsm).Compile<Sc1>()); + Pire::Fsm fsm = ParseRegexp("^r$", ""); + Sc1 sc1(Pire::Fsm(fsm).Compile<Sc1>()); - // Test copy ctor - UNIT_ASSERT(Matches(Sc2(sc1), "r")); - UNIT_ASSERT(!Matches(Sc2(sc1), "p")); + // Test copy ctor + UNIT_ASSERT(Matches(Sc2(sc1), "r")); + UNIT_ASSERT(!Matches(Sc2(sc1), "p")); - // Test '=' operator - Sc2 sc2; - sc2 = sc1; - UNIT_ASSERT(Matches(sc2, "r")); - UNIT_ASSERT(!Matches(sc2, "p")); + // Test '=' operator + Sc2 sc2; + sc2 = sc1; + UNIT_ASSERT(Matches(sc2, "r")); + UNIT_ASSERT(!Matches(sc2, "p")); } template <class Sc1, class Sc2> void TestCopying() { - TestCopyingHelper<Sc1, Sc2>(); - TestCopyingHelper<Sc2, Sc1>(); + TestCopyingHelper<Sc1, Sc2>(); + TestCopyingHelper<Sc2, Sc1>(); } Y_UNIT_TEST(Copying) { - TestCopying<Pire::Scanner, Pire::NonrelocScanner>(); - TestCopying<Pire::ScannerNoMask, Pire::NonrelocScannerNoMask>(); - TestCopying<Pire::HalfFinalScanner, Pire::NonrelocHalfFinalScanner>(); - TestCopying<Pire::HalfFinalScannerNoMask, Pire::NonrelocHalfFinalScannerNoMask>(); + TestCopying<Pire::Scanner, Pire::NonrelocScanner>(); + TestCopying<Pire::ScannerNoMask, Pire::NonrelocScannerNoMask>(); + TestCopying<Pire::HalfFinalScanner, Pire::NonrelocHalfFinalScanner>(); + TestCopying<Pire::HalfFinalScannerNoMask, Pire::NonrelocHalfFinalScannerNoMask>(); } template<class Scanner> void MatchScanner(Scanner& scanner) { - UNIT_ASSERT(Matches(scanner, "regexp")); - UNIT_ASSERT(!Matches(scanner, "regxp")); - UNIT_ASSERT(!Matches(scanner, "regexp t")); + UNIT_ASSERT(Matches(scanner, "regexp")); + UNIT_ASSERT(!Matches(scanner, "regxp")); + UNIT_ASSERT(!Matches(scanner, "regexp t")); } template<class Scanner> void LoadAndMatchScanner(MemoryInput& rbuf, Scanner& scanner) { - Load(&rbuf, scanner); - MatchScanner(scanner); + Load(&rbuf, scanner); + MatchScanner(scanner); } template<class Scanner> const char* MmapAndMatchScanner(Scanner& scanner, const char* ptr, size_t size) { - const char* ptr2 = (const char*)scanner.Mmap(ptr, size); - MatchScanner(scanner); - return ptr2; + const char* ptr2 = (const char*)scanner.Mmap(ptr, size); + MatchScanner(scanner); + return ptr2; } Y_UNIT_TEST(Serialization) { - Scanners s("^regexp$"); - - BufferOutput wbuf; - Save(&wbuf, s.fast); - Save(&wbuf, s.simple); - Save(&wbuf, s.slow); - Save(&wbuf, s.fastNoMask); - Save(&wbuf, s.nonreloc); - Save(&wbuf, s.nonrelocNoMask); - Save(&wbuf, s.halfFinal); - Save(&wbuf, s.halfFinalNoMask); - Save(&wbuf, s.nonrelocHalfFinal); - Save(&wbuf, s.nonrelocHalfFinalNoMask); - - MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); - LoadAndMatchScanner(rbuf, s.fast); - LoadAndMatchScanner(rbuf, s.simple); - LoadAndMatchScanner(rbuf, s.slow); - LoadAndMatchScanner(rbuf, s.fastNoMask); - LoadAndMatchScanner(rbuf, s.nonreloc); - LoadAndMatchScanner(rbuf, s.nonrelocNoMask); - LoadAndMatchScanner(rbuf, s.halfFinal); - LoadAndMatchScanner(rbuf, s.halfFinalNoMask); - LoadAndMatchScanner(rbuf, s.nonrelocHalfFinal); - LoadAndMatchScanner(rbuf, s.nonrelocHalfFinalNoMask); - - Pire::Scanner fast; - Pire::SimpleScanner simple; - Pire::SlowScanner slow; - Pire::ScannerNoMask fastNoMask; - Pire::HalfFinalScanner halfFinal; - Pire::HalfFinalScannerNoMask halfFinalNoMask; - Pire::Scanner fast1; - Pire::ScannerNoMask fastNoMask1; - Pire::HalfFinalScanner halfFinal1; - Pire::HalfFinalScannerNoMask halfFinalNoMask1; - const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); - TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); - const char* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); - const char* end = ptr + wbuf.Buffer().Size(); - memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); - - const char* ptr2 = 0; - ptr2 = MmapAndMatchScanner(fast, ptr, end - ptr); - size_t fastSize = ptr2 - ptr; - ptr = ptr2; - ptr2 = MmapAndMatchScanner(simple, ptr, end - ptr); - size_t simpleSize = ptr2 - ptr; - ptr = ptr2; - ptr = MmapAndMatchScanner(slow, ptr, end - ptr); - ptr = MmapAndMatchScanner(fastNoMask, ptr, end - ptr); - // Nonreloc-s are saved as Scaner-s, so read them again - ptr = MmapAndMatchScanner(fast1, ptr, end - ptr); - ptr = MmapAndMatchScanner(fastNoMask1, ptr, end - ptr); - - ptr = MmapAndMatchScanner(halfFinal, ptr, end - ptr); - ptr = MmapAndMatchScanner(halfFinalNoMask, ptr, end - ptr); - ptr = MmapAndMatchScanner(halfFinal1, ptr, end - ptr); - ptr = MmapAndMatchScanner(halfFinalNoMask1, ptr, end - ptr); - UNIT_ASSERT_EQUAL(ptr, end); - - for (size_t offset = 1; offset < MaxTestOffset; ++offset) { - ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset; - end = ptr + wbuf.Buffer().Size(); - memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); - BasicMmapTest::Match(fast, ptr, end - ptr, "regexp"); - ptr = ptr + fastSize; - BasicMmapTest::Match(simple, ptr, end - ptr, "regexp"); - ptr = ptr + simpleSize; - BasicMmapTest::Match(slow, ptr, end - ptr, "regexp"); - } + Scanners s("^regexp$"); + + BufferOutput wbuf; + Save(&wbuf, s.fast); + Save(&wbuf, s.simple); + Save(&wbuf, s.slow); + Save(&wbuf, s.fastNoMask); + Save(&wbuf, s.nonreloc); + Save(&wbuf, s.nonrelocNoMask); + Save(&wbuf, s.halfFinal); + Save(&wbuf, s.halfFinalNoMask); + Save(&wbuf, s.nonrelocHalfFinal); + Save(&wbuf, s.nonrelocHalfFinalNoMask); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + LoadAndMatchScanner(rbuf, s.fast); + LoadAndMatchScanner(rbuf, s.simple); + LoadAndMatchScanner(rbuf, s.slow); + LoadAndMatchScanner(rbuf, s.fastNoMask); + LoadAndMatchScanner(rbuf, s.nonreloc); + LoadAndMatchScanner(rbuf, s.nonrelocNoMask); + LoadAndMatchScanner(rbuf, s.halfFinal); + LoadAndMatchScanner(rbuf, s.halfFinalNoMask); + LoadAndMatchScanner(rbuf, s.nonrelocHalfFinal); + LoadAndMatchScanner(rbuf, s.nonrelocHalfFinalNoMask); + + Pire::Scanner fast; + Pire::SimpleScanner simple; + Pire::SlowScanner slow; + Pire::ScannerNoMask fastNoMask; + Pire::HalfFinalScanner halfFinal; + Pire::HalfFinalScannerNoMask halfFinalNoMask; + Pire::Scanner fast1; + Pire::ScannerNoMask fastNoMask1; + Pire::HalfFinalScanner halfFinal1; + Pire::HalfFinalScannerNoMask halfFinalNoMask1; + const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); + TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); + const char* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); + const char* end = ptr + wbuf.Buffer().Size(); + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + + const char* ptr2 = 0; + ptr2 = MmapAndMatchScanner(fast, ptr, end - ptr); + size_t fastSize = ptr2 - ptr; + ptr = ptr2; + ptr2 = MmapAndMatchScanner(simple, ptr, end - ptr); + size_t simpleSize = ptr2 - ptr; + ptr = ptr2; + ptr = MmapAndMatchScanner(slow, ptr, end - ptr); + ptr = MmapAndMatchScanner(fastNoMask, ptr, end - ptr); + // Nonreloc-s are saved as Scaner-s, so read them again + ptr = MmapAndMatchScanner(fast1, ptr, end - ptr); + ptr = MmapAndMatchScanner(fastNoMask1, ptr, end - ptr); + + ptr = MmapAndMatchScanner(halfFinal, ptr, end - ptr); + ptr = MmapAndMatchScanner(halfFinalNoMask, ptr, end - ptr); + ptr = MmapAndMatchScanner(halfFinal1, ptr, end - ptr); + ptr = MmapAndMatchScanner(halfFinalNoMask1, ptr, end - ptr); + UNIT_ASSERT_EQUAL(ptr, end); + + for (size_t offset = 1; offset < MaxTestOffset; ++offset) { + ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset; + end = ptr + wbuf.Buffer().Size(); + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + BasicMmapTest::Match(fast, ptr, end - ptr, "regexp"); + ptr = ptr + fastSize; + BasicMmapTest::Match(simple, ptr, end - ptr, "regexp"); + ptr = ptr + simpleSize; + BasicMmapTest::Match(slow, ptr, end - ptr, "regexp"); + } } Y_UNIT_TEST(TestShortcuts) { - REGEXP("aaa") { - ACCEPTS("......................................aaa............."); - DENIES ("......................................aab............."); - DENIES ("......................................................"); - } - REGEXP("[ab]{3}") { - ACCEPTS("......................................aaa............."); - ACCEPTS("......................................aab............."); - ACCEPTS("......................................bbb............."); - DENIES ("......................................................"); - } - REGEXP2("\xD0\xB0", "u") { - ACCEPTS("......................................\xD0\xB0..............."); - ACCEPTS("...................................\xD0\xB0.................."); - ACCEPTS("................................\xD0\xB0....................."); - } + REGEXP("aaa") { + ACCEPTS("......................................aaa............."); + DENIES ("......................................aab............."); + DENIES ("......................................................"); + } + REGEXP("[ab]{3}") { + ACCEPTS("......................................aaa............."); + ACCEPTS("......................................aab............."); + ACCEPTS("......................................bbb............."); + DENIES ("......................................................"); + } + REGEXP2("\xD0\xB0", "u") { + ACCEPTS("......................................\xD0\xB0..............."); + ACCEPTS("...................................\xD0\xB0.................."); + ACCEPTS("................................\xD0\xB0....................."); + } } template<class Scanner> void TestGlue() { - Scanner sc1 = ParseRegexp("aaa").Compile<Scanner>(); - Scanner sc2 = ParseRegexp("bbb").Compile<Scanner>(); - Scanner glued = Scanner::Glue(sc1, sc2); - UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(2)); - - auto state = RunRegexp(glued, "aaa"); - auto res = glued.AcceptedRegexps(state); - UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1)); - UNIT_ASSERT_EQUAL(*res.first, size_t(0)); - - state = RunRegexp(glued, "bbb"); - res = glued.AcceptedRegexps(state); - UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1)); - UNIT_ASSERT_EQUAL(*res.first, size_t(1)); - - state = RunRegexp(glued, "aaabbb"); - res = glued.AcceptedRegexps(state); - UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(2)); - UNIT_ASSERT_EQUAL(res.first[0], size_t(0)); - UNIT_ASSERT_EQUAL(res.first[1], size_t(1)); - - state = RunRegexp(glued, "ccc"); - res = glued.AcceptedRegexps(state); - UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(0)); - - Scanner sc3 = ParseRegexp("ccc").Compile<Scanner>(); - glued = Scanner::Glue(sc3, glued); - UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(3)); - - state = RunRegexp(glued, "ccc"); - res = glued.AcceptedRegexps(state); - UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1)); - UNIT_ASSERT_EQUAL(res.first[0], size_t(0)); - Scanner sc4 = Scanner::Glue( - ParseRegexp("a", "n").Compile<Scanner>(), - ParseRegexp("c", "n").Compile<Scanner>() - ); - state = RunRegexp(sc4, "ac"); - res = sc4.AcceptedRegexps(state); - UNIT_ASSERT(res.second == res.first); - state = RunRegexp(sc4, "ac"); - UNIT_ASSERT(!sc4.Final(state)); + Scanner sc1 = ParseRegexp("aaa").Compile<Scanner>(); + Scanner sc2 = ParseRegexp("bbb").Compile<Scanner>(); + Scanner glued = Scanner::Glue(sc1, sc2); + UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(2)); + + auto state = RunRegexp(glued, "aaa"); + auto res = glued.AcceptedRegexps(state); + UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1)); + UNIT_ASSERT_EQUAL(*res.first, size_t(0)); + + state = RunRegexp(glued, "bbb"); + res = glued.AcceptedRegexps(state); + UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1)); + UNIT_ASSERT_EQUAL(*res.first, size_t(1)); + + state = RunRegexp(glued, "aaabbb"); + res = glued.AcceptedRegexps(state); + UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(2)); + UNIT_ASSERT_EQUAL(res.first[0], size_t(0)); + UNIT_ASSERT_EQUAL(res.first[1], size_t(1)); + + state = RunRegexp(glued, "ccc"); + res = glued.AcceptedRegexps(state); + UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(0)); + + Scanner sc3 = ParseRegexp("ccc").Compile<Scanner>(); + glued = Scanner::Glue(sc3, glued); + UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(3)); + + state = RunRegexp(glued, "ccc"); + res = glued.AcceptedRegexps(state); + UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1)); + UNIT_ASSERT_EQUAL(res.first[0], size_t(0)); + Scanner sc4 = Scanner::Glue( + ParseRegexp("a", "n").Compile<Scanner>(), + ParseRegexp("c", "n").Compile<Scanner>() + ); + state = RunRegexp(sc4, "ac"); + res = sc4.AcceptedRegexps(state); + UNIT_ASSERT(res.second == res.first); + state = RunRegexp(sc4, "ac"); + UNIT_ASSERT(!sc4.Final(state)); } Y_UNIT_TEST(Glue) { - TestGlue<Pire::Scanner>(); - TestGlue<Pire::NonrelocScanner>(); - TestGlue<Pire::ScannerNoMask>(); - TestGlue<Pire::NonrelocScannerNoMask>(); - TestGlue<Pire::HalfFinalScanner>(); - TestGlue<Pire::NonrelocHalfFinalScanner>(); - TestGlue<Pire::HalfFinalScannerNoMask>(); - TestGlue<Pire::NonrelocHalfFinalScannerNoMask>(); + TestGlue<Pire::Scanner>(); + TestGlue<Pire::NonrelocScanner>(); + TestGlue<Pire::ScannerNoMask>(); + TestGlue<Pire::NonrelocScannerNoMask>(); + TestGlue<Pire::HalfFinalScanner>(); + TestGlue<Pire::NonrelocHalfFinalScanner>(); + TestGlue<Pire::HalfFinalScannerNoMask>(); + TestGlue<Pire::NonrelocHalfFinalScannerNoMask>(); } Y_UNIT_TEST(Slow) { - Pire::SlowScanner sc = ParseRegexp("a.{30}$", "").Compile<Pire::SlowScanner>(); - // 123456789012345678901234567890 - UNIT_ASSERT( Matches(sc, "....a..............................")); - UNIT_ASSERT(!Matches(sc, "....a...............................")); - UNIT_ASSERT(!Matches(sc, "....a.............................")); + Pire::SlowScanner sc = ParseRegexp("a.{30}$", "").Compile<Pire::SlowScanner>(); + // 123456789012345678901234567890 + UNIT_ASSERT( Matches(sc, "....a..............................")); + UNIT_ASSERT(!Matches(sc, "....a...............................")); + UNIT_ASSERT(!Matches(sc, "....a.............................")); } struct astring: private std::vector<char> { @@ -775,33 +775,33 @@ struct astring: private std::vector<char> { Y_UNIT_TEST(Aligned) { - using ystring = astring; - - UNIT_ASSERT(Pire::Impl::IsAligned(ystring("x").c_str(), sizeof(void*))); - - REGEXP("xy") { - // Short string with aligned head - ACCEPTS(ystring("xy").c_str()); - DENIES (ystring("yz").c_str()); - // Short string, unaligned - ACCEPTS(ystring(".xy").c_str() + 1); - DENIES (ystring(".yz").c_str() + 1); - // Short string with aligned tail - ACCEPTS((ystring(sizeof(void*) - 2, '.') + "xy").c_str() + sizeof(void*) - 2); - DENIES ((ystring(sizeof(void*) - 2, '.') + "yz").c_str() + sizeof(void*) - 2); - } - - REGEXP("abcde") { - // Everything aligned, match occurs in the middle - ACCEPTS(ystring("ZZZZZabcdeZZZZZZ").c_str()); - DENIES (ystring("ZZZZZabcdfZZZZZZ").c_str()); - // Unaligned head - ACCEPTS(ystring(".ZabcdeZZZ").c_str() + 1); - DENIES (ystring(".ZxbcdeZZZ").c_str() + 1); - // Unaligned tail - ACCEPTS(ystring("ZZZZZZZZZZZZZabcde").c_str()); - DENIES (ystring("ZZZZZZZZZZZZZabcdf").c_str()); - } + using ystring = astring; + + UNIT_ASSERT(Pire::Impl::IsAligned(ystring("x").c_str(), sizeof(void*))); + + REGEXP("xy") { + // Short string with aligned head + ACCEPTS(ystring("xy").c_str()); + DENIES (ystring("yz").c_str()); + // Short string, unaligned + ACCEPTS(ystring(".xy").c_str() + 1); + DENIES (ystring(".yz").c_str() + 1); + // Short string with aligned tail + ACCEPTS((ystring(sizeof(void*) - 2, '.') + "xy").c_str() + sizeof(void*) - 2); + DENIES ((ystring(sizeof(void*) - 2, '.') + "yz").c_str() + sizeof(void*) - 2); + } + + REGEXP("abcde") { + // Everything aligned, match occurs in the middle + ACCEPTS(ystring("ZZZZZabcdeZZZZZZ").c_str()); + DENIES (ystring("ZZZZZabcdfZZZZZZ").c_str()); + // Unaligned head + ACCEPTS(ystring(".ZabcdeZZZ").c_str() + 1); + DENIES (ystring(".ZxbcdeZZZ").c_str() + 1); + // Unaligned tail + ACCEPTS(ystring("ZZZZZZZZZZZZZabcde").c_str()); + DENIES (ystring("ZZZZZZZZZZZZZabcdf").c_str()); + } } #undef Run @@ -809,73 +809,73 @@ Y_UNIT_TEST(Aligned) template <class Scanner> void BasicTestEmptySaveLoadMmap() { - Scanner sc; - UNIT_ASSERT(sc.Empty()); - UNIT_ASSERT_EQUAL(sc.RegexpsCount(), size_t(0)); - UNIT_CHECKPOINT(); Pire::Runner(sc).Begin().Run("a string", 7).End(); // should not crash + Scanner sc; + UNIT_ASSERT(sc.Empty()); + UNIT_ASSERT_EQUAL(sc.RegexpsCount(), size_t(0)); + UNIT_CHECKPOINT(); Pire::Runner(sc).Begin().Run("a string", 7).End(); // should not crash - BufferOutput wbuf; - UNIT_CHECKPOINT(); Save(&wbuf, sc); + BufferOutput wbuf; + UNIT_CHECKPOINT(); Save(&wbuf, sc); - MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); - Scanner sc3; - /*UNIT_CHECKPOINT();*/ Load(&rbuf, sc3); - UNIT_ASSERT(sc3.Empty()); - UNIT_CHECKPOINT(); Pire::Runner(sc3).Begin().Run("a string", 7).End(); + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + Scanner sc3; + /*UNIT_CHECKPOINT();*/ Load(&rbuf, sc3); + UNIT_ASSERT(sc3.Empty()); + UNIT_CHECKPOINT(); Pire::Runner(sc3).Begin().Run("a string", 7).End(); - Scanner sc4; - /*UNIT_CHECKPOINT();*/ const char* ptr = (const char*) sc4.Mmap(wbuf.Buffer().Data(), wbuf.Buffer().Size()); - UNIT_ASSERT(ptr == wbuf.Buffer().Data() + wbuf.Buffer().Size()); - UNIT_ASSERT(sc4.Empty()); - UNIT_CHECKPOINT(); Pire::Runner(sc4).Begin().Run("a string", 7).End(); + Scanner sc4; + /*UNIT_CHECKPOINT();*/ const char* ptr = (const char*) sc4.Mmap(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + UNIT_ASSERT(ptr == wbuf.Buffer().Data() + wbuf.Buffer().Size()); + UNIT_ASSERT(sc4.Empty()); + UNIT_CHECKPOINT(); Pire::Runner(sc4).Begin().Run("a string", 7).End(); } Y_UNIT_TEST(EmptyScanner) { - // Tests for Scanner - BasicTestEmptySaveLoadMmap<Pire::Scanner>(); - BasicTestEmptySaveLoadMmap<Pire::ScannerNoMask>(); - BasicTestEmptySaveLoadMmap<Pire::HalfFinalScanner>(); - BasicTestEmptySaveLoadMmap<Pire::HalfFinalScannerNoMask>(); - - Pire::Scanner sc; - Pire::Scanner scsc = Pire::Scanner::Glue(sc, sc); - UNIT_ASSERT(scsc.Empty()); - UNIT_ASSERT_EQUAL(scsc.RegexpsCount(), size_t(0)); - UNIT_CHECKPOINT(); Pire::Runner(scsc).Begin().Run("a string", 7).End(); - - Pire::Scanner sc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>(); - UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1)); - UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End(); - UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(scsc, sc2).RegexpsCount(), size_t(1)); - UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(scsc, sc2)).Begin().Run("a string", 7).End(); - UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc).RegexpsCount(), size_t(1)); - UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc)).Begin().Run("a string", 7).End(); - - // Tests for NonrelocScanner - Pire::NonrelocScanner nsc; - UNIT_ASSERT(nsc.Empty()); - UNIT_ASSERT_EQUAL(nsc.RegexpsCount(), size_t(0)); - UNIT_CHECKPOINT(); Pire::Runner(nsc).Begin().Run("a string", 7).End(); - - Pire::NonrelocScanner nsc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>(); - UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1)); - UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End(); - - { - BufferOutput wbuf; - UNIT_CHECKPOINT(); Save(&wbuf, nsc); - - MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); - Pire::NonrelocScanner nsc3; - /*UNIT_CHECKPOINT();*/ Load(&rbuf, nsc3); - UNIT_ASSERT(nsc3.Empty()); - UNIT_CHECKPOINT(); Pire::Runner(nsc3).Begin().Run("a string", 7).End(); - } - - BasicTestEmptySaveLoadMmap<Pire::SimpleScanner>(); - - BasicTestEmptySaveLoadMmap<Pire::SlowScanner>(); + // Tests for Scanner + BasicTestEmptySaveLoadMmap<Pire::Scanner>(); + BasicTestEmptySaveLoadMmap<Pire::ScannerNoMask>(); + BasicTestEmptySaveLoadMmap<Pire::HalfFinalScanner>(); + BasicTestEmptySaveLoadMmap<Pire::HalfFinalScannerNoMask>(); + + Pire::Scanner sc; + Pire::Scanner scsc = Pire::Scanner::Glue(sc, sc); + UNIT_ASSERT(scsc.Empty()); + UNIT_ASSERT_EQUAL(scsc.RegexpsCount(), size_t(0)); + UNIT_CHECKPOINT(); Pire::Runner(scsc).Begin().Run("a string", 7).End(); + + Pire::Scanner sc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>(); + UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1)); + UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End(); + UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(scsc, sc2).RegexpsCount(), size_t(1)); + UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(scsc, sc2)).Begin().Run("a string", 7).End(); + UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc).RegexpsCount(), size_t(1)); + UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc)).Begin().Run("a string", 7).End(); + + // Tests for NonrelocScanner + Pire::NonrelocScanner nsc; + UNIT_ASSERT(nsc.Empty()); + UNIT_ASSERT_EQUAL(nsc.RegexpsCount(), size_t(0)); + UNIT_CHECKPOINT(); Pire::Runner(nsc).Begin().Run("a string", 7).End(); + + Pire::NonrelocScanner nsc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>(); + UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1)); + UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End(); + + { + BufferOutput wbuf; + UNIT_CHECKPOINT(); Save(&wbuf, nsc); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + Pire::NonrelocScanner nsc3; + /*UNIT_CHECKPOINT();*/ Load(&rbuf, nsc3); + UNIT_ASSERT(nsc3.Empty()); + UNIT_CHECKPOINT(); Pire::Runner(nsc3).Begin().Run("a string", 7).End(); + } + + BasicTestEmptySaveLoadMmap<Pire::SimpleScanner>(); + + BasicTestEmptySaveLoadMmap<Pire::SlowScanner>(); } Y_UNIT_TEST(NullPointer) diff --git a/library/cpp/regex/pire/ut/read_unicode_ut.cpp b/library/cpp/regex/pire/ut/read_unicode_ut.cpp index 17569096873..16f627d9da8 100644 --- a/library/cpp/regex/pire/ut/read_unicode_ut.cpp +++ b/library/cpp/regex/pire/ut/read_unicode_ut.cpp @@ -26,282 +26,282 @@ #include "common.h" Y_UNIT_TEST_SUITE(ReadUnicodeTest) { - ystring CreateStringWithZeroSymbol(const char* str, size_t pos) { - ystring result = str; - Y_ASSERT(pos < result.size()); - result[pos] = '\0'; - return result; - } - - Y_UNIT_TEST(ZeroSymbol) - { - REGEXP("\\x{0}") { - ACCEPTS(CreateStringWithZeroSymbol("a", 0)); - ACCEPTS(CreateStringWithZeroSymbol("some text", 3)); - DENIES("string without zero"); - } - - REGEXP("the\\x00middle") { - ACCEPTS(CreateStringWithZeroSymbol("in the middle", 6)); - DENIES(CreateStringWithZeroSymbol("in the middle", 5)); - DENIES("in the middle"); - } - } - - Y_UNIT_TEST(SymbolsByCodes) - { - REGEXP("\\x{41}") { - ACCEPTS("A"); - ACCEPTS("tAst string"); - DENIES("test string"); - } - - REGEXP("\\x26abc") { - ACCEPTS("&abc;"); - DENIES("test &ab"); - DENIES("without"); - } - } - - Y_UNIT_TEST(ErrorsWhileCompiling) - { - UNIT_ASSERT(HasError("\\x")); - UNIT_ASSERT(HasError("\\x0")); - UNIT_ASSERT(HasError("\\xfu")); - UNIT_ASSERT(HasError("\\xs1")); - UNIT_ASSERT(HasError("\\x 0")); - UNIT_ASSERT(HasError("\\x0 ")); - - UNIT_ASSERT(HasError("\\x{2A1")); - UNIT_ASSERT(HasError("\\x{")); - UNIT_ASSERT(HasError("\\x}")); - UNIT_ASSERT(HasError("\\x2}")); - UNIT_ASSERT(HasError("\\x{{3}")); - UNIT_ASSERT(HasError("\\x{2a{5}")); - - UNIT_ASSERT(HasError("\\x{}")); - UNIT_ASSERT(HasError("\\x{+3}")); - UNIT_ASSERT(HasError("\\x{-3}")); - UNIT_ASSERT(HasError("\\x{ 2F}")); - UNIT_ASSERT(HasError("\\x{2A F}")); - UNIT_ASSERT(HasError("\\x{2Arft}")); - UNIT_ASSERT(HasError("\\x{110000}")); - - UNIT_ASSERT(!HasError("\\x{fB1}")); - UNIT_ASSERT(!HasError("\\x00")); - UNIT_ASSERT(!HasError("\\x{10FFFF}")); - } - - Y_UNIT_TEST(OneCharacterRange) - { - SCANNER("[\\x{61}]") { - ACCEPTS("a"); - ACCEPTS("bac"); - DENIES("test"); - } - - SCANNER("[\\x3f]") { - ACCEPTS("?"); - ACCEPTS("test?"); - DENIES("test"); - } - } - - Y_UNIT_TEST(CharacterRange) { - REGEXP("[\\x{61}\\x62\\x{3f}\\x26]") { - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("?"); - ACCEPTS("acd"); - ACCEPTS("bcd"); - ACCEPTS("cd?"); - ACCEPTS("ab?"); - DENIES("cd"); - } - - REGEXP("[\\x{61}-\\x{63}]") { - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("c"); - ACCEPTS("qwertya"); - DENIES("d"); - } - - REGEXP("[\\x61-\\x61]") { - ACCEPTS("a"); - ACCEPTS("qwertya"); - DENIES("b"); - } - - REGEXP("[\\x26\\x{61}-\\x{62}\\x{3f}]") { - ACCEPTS("&"); - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("?"); - ACCEPTS("ade"); - ACCEPTS("ab?"); - DENIES("d"); - } - - REGEXP("[\\x{41}-\\x{42}\\x{61}-\\x{62}]") { - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("A"); - ACCEPTS("B"); - DENIES("c"); - DENIES("C"); - } - - REGEXP("[\\x{41}-\\x{42}][\\x{61}-\\x{62}]") { - ACCEPTS("Aa"); - ACCEPTS("Ab"); - ACCEPTS("Ba"); - ACCEPTS("Bb"); - DENIES("a"); - DENIES("b"); - DENIES("A"); - DENIES("B"); - DENIES("ab"); - DENIES("AB"); - DENIES("Ca"); - } - } - - Y_UNIT_TEST(RangeExcludeCharacters) { - REGEXP("[^\\x{61}]") { - ACCEPTS("b"); - ACCEPTS("c"); - ACCEPTS("aba"); - DENIES("a"); - DENIES("aaa"); - } - - REGEXP("[^\\x{61}-\\x{7a}]") { - ACCEPTS("A"); - ACCEPTS("123"); - ACCEPTS("acb1"); - DENIES("a"); - DENIES("abcxyz"); - } - } - - Y_UNIT_TEST(MixedRange) { - REGEXP("[\\x{61}B]") { - ACCEPTS("a"); - ACCEPTS("B"); - ACCEPTS("atestB"); - DENIES("test"); - } - - REGEXP("[^\\x{61}A]") { - ACCEPTS("b"); - ACCEPTS("B"); - ACCEPTS("atestB"); - DENIES("a"); - DENIES("A"); - DENIES("aaAA"); - } - - REGEXP("[0-9][\\x{61}-\\x{62}A-B]") { - ACCEPTS("0a"); - ACCEPTS("1A"); - ACCEPTS("5b"); - ACCEPTS("9B"); - ACCEPTS("1atestB"); - ACCEPTS("2Atest"); - DENIES("aB"); - DENIES("testb"); - DENIES("test"); - } - - REGEXP("[\\x{61}-c]") { - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("c"); - ACCEPTS("testb"); - DENIES("d"); - } - - REGEXP("[^a-\\x{7a}]") { - ACCEPTS("A"); - ACCEPTS("123"); - ACCEPTS("acb1"); - DENIES("a"); - DENIES("abcxyz"); - } - - REGEXP("[\\x{41}-Ba-\\x{62}]") { - ACCEPTS("a"); - ACCEPTS("b"); - ACCEPTS("A"); - ACCEPTS("B"); - DENIES("c"); - DENIES("C"); - } - } - - Y_UNIT_TEST(CompilingRange) - { - UNIT_ASSERT(HasError("[\\x41")); - UNIT_ASSERT(HasError("[\\xfq]")); - UNIT_ASSERT(HasError("[\\x{01}-]")); - - UNIT_ASSERT(!HasError("[\\x{10FFFF}]")); - UNIT_ASSERT(!HasError("[\\x{00}]")); - UNIT_ASSERT(!HasError("[\\x{abc}-\\x{FFF}]")); - - UNIT_ASSERT(!HasError("[^\\xFF]")); - UNIT_ASSERT(!HasError("[^\\x{FF}-\\x{FF0}]")); - UNIT_ASSERT(!HasError("[-\\x{01}]")); - } - - Y_UNIT_TEST(UnicodeRepetition) - { - REGEXP("^\\x{78}{3,6}$") { - DENIES ("xx"); - ACCEPTS("xxx"); - ACCEPTS("xxxx"); - ACCEPTS("xxxxx"); - ACCEPTS("xxxxxx"); - DENIES ("xxxxxxx"); - } - - REGEXP("^x{3,}$") { - DENIES ("xx"); - ACCEPTS("xxx"); - ACCEPTS("xxxx"); - ACCEPTS("xxxxxxxxxxx"); - ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); - } - - REGEXP("^\\x{78}{3}$") { - DENIES ("x"); - DENIES ("xx"); - ACCEPTS("xxx"); - DENIES ("xxxx"); - DENIES ("xxxxx"); - DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); - } - - REGEXP("^([\\x{78}-\\x{79}]){2}$") { - DENIES("x"); - DENIES("y"); - ACCEPTS("xx"); - ACCEPTS("xy"); - ACCEPTS("yx"); - ACCEPTS("yy"); - DENIES("xxy"); - DENIES("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); - } - } - - Y_UNIT_TEST(AnyUnicodeCodepointIsAllowed) - { - REGEXP("[\\x{0}-\\x{77}\\x{79}-\\x{10ffff}]") { - ACCEPTS("w"); - DENIES ("x"); - ACCEPTS("y"); - } - } + ystring CreateStringWithZeroSymbol(const char* str, size_t pos) { + ystring result = str; + Y_ASSERT(pos < result.size()); + result[pos] = '\0'; + return result; + } + + Y_UNIT_TEST(ZeroSymbol) + { + REGEXP("\\x{0}") { + ACCEPTS(CreateStringWithZeroSymbol("a", 0)); + ACCEPTS(CreateStringWithZeroSymbol("some text", 3)); + DENIES("string without zero"); + } + + REGEXP("the\\x00middle") { + ACCEPTS(CreateStringWithZeroSymbol("in the middle", 6)); + DENIES(CreateStringWithZeroSymbol("in the middle", 5)); + DENIES("in the middle"); + } + } + + Y_UNIT_TEST(SymbolsByCodes) + { + REGEXP("\\x{41}") { + ACCEPTS("A"); + ACCEPTS("tAst string"); + DENIES("test string"); + } + + REGEXP("\\x26abc") { + ACCEPTS("&abc;"); + DENIES("test &ab"); + DENIES("without"); + } + } + + Y_UNIT_TEST(ErrorsWhileCompiling) + { + UNIT_ASSERT(HasError("\\x")); + UNIT_ASSERT(HasError("\\x0")); + UNIT_ASSERT(HasError("\\xfu")); + UNIT_ASSERT(HasError("\\xs1")); + UNIT_ASSERT(HasError("\\x 0")); + UNIT_ASSERT(HasError("\\x0 ")); + + UNIT_ASSERT(HasError("\\x{2A1")); + UNIT_ASSERT(HasError("\\x{")); + UNIT_ASSERT(HasError("\\x}")); + UNIT_ASSERT(HasError("\\x2}")); + UNIT_ASSERT(HasError("\\x{{3}")); + UNIT_ASSERT(HasError("\\x{2a{5}")); + + UNIT_ASSERT(HasError("\\x{}")); + UNIT_ASSERT(HasError("\\x{+3}")); + UNIT_ASSERT(HasError("\\x{-3}")); + UNIT_ASSERT(HasError("\\x{ 2F}")); + UNIT_ASSERT(HasError("\\x{2A F}")); + UNIT_ASSERT(HasError("\\x{2Arft}")); + UNIT_ASSERT(HasError("\\x{110000}")); + + UNIT_ASSERT(!HasError("\\x{fB1}")); + UNIT_ASSERT(!HasError("\\x00")); + UNIT_ASSERT(!HasError("\\x{10FFFF}")); + } + + Y_UNIT_TEST(OneCharacterRange) + { + SCANNER("[\\x{61}]") { + ACCEPTS("a"); + ACCEPTS("bac"); + DENIES("test"); + } + + SCANNER("[\\x3f]") { + ACCEPTS("?"); + ACCEPTS("test?"); + DENIES("test"); + } + } + + Y_UNIT_TEST(CharacterRange) { + REGEXP("[\\x{61}\\x62\\x{3f}\\x26]") { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("?"); + ACCEPTS("acd"); + ACCEPTS("bcd"); + ACCEPTS("cd?"); + ACCEPTS("ab?"); + DENIES("cd"); + } + + REGEXP("[\\x{61}-\\x{63}]") { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("c"); + ACCEPTS("qwertya"); + DENIES("d"); + } + + REGEXP("[\\x61-\\x61]") { + ACCEPTS("a"); + ACCEPTS("qwertya"); + DENIES("b"); + } + + REGEXP("[\\x26\\x{61}-\\x{62}\\x{3f}]") { + ACCEPTS("&"); + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("?"); + ACCEPTS("ade"); + ACCEPTS("ab?"); + DENIES("d"); + } + + REGEXP("[\\x{41}-\\x{42}\\x{61}-\\x{62}]") { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("A"); + ACCEPTS("B"); + DENIES("c"); + DENIES("C"); + } + + REGEXP("[\\x{41}-\\x{42}][\\x{61}-\\x{62}]") { + ACCEPTS("Aa"); + ACCEPTS("Ab"); + ACCEPTS("Ba"); + ACCEPTS("Bb"); + DENIES("a"); + DENIES("b"); + DENIES("A"); + DENIES("B"); + DENIES("ab"); + DENIES("AB"); + DENIES("Ca"); + } + } + + Y_UNIT_TEST(RangeExcludeCharacters) { + REGEXP("[^\\x{61}]") { + ACCEPTS("b"); + ACCEPTS("c"); + ACCEPTS("aba"); + DENIES("a"); + DENIES("aaa"); + } + + REGEXP("[^\\x{61}-\\x{7a}]") { + ACCEPTS("A"); + ACCEPTS("123"); + ACCEPTS("acb1"); + DENIES("a"); + DENIES("abcxyz"); + } + } + + Y_UNIT_TEST(MixedRange) { + REGEXP("[\\x{61}B]") { + ACCEPTS("a"); + ACCEPTS("B"); + ACCEPTS("atestB"); + DENIES("test"); + } + + REGEXP("[^\\x{61}A]") { + ACCEPTS("b"); + ACCEPTS("B"); + ACCEPTS("atestB"); + DENIES("a"); + DENIES("A"); + DENIES("aaAA"); + } + + REGEXP("[0-9][\\x{61}-\\x{62}A-B]") { + ACCEPTS("0a"); + ACCEPTS("1A"); + ACCEPTS("5b"); + ACCEPTS("9B"); + ACCEPTS("1atestB"); + ACCEPTS("2Atest"); + DENIES("aB"); + DENIES("testb"); + DENIES("test"); + } + + REGEXP("[\\x{61}-c]") { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("c"); + ACCEPTS("testb"); + DENIES("d"); + } + + REGEXP("[^a-\\x{7a}]") { + ACCEPTS("A"); + ACCEPTS("123"); + ACCEPTS("acb1"); + DENIES("a"); + DENIES("abcxyz"); + } + + REGEXP("[\\x{41}-Ba-\\x{62}]") { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("A"); + ACCEPTS("B"); + DENIES("c"); + DENIES("C"); + } + } + + Y_UNIT_TEST(CompilingRange) + { + UNIT_ASSERT(HasError("[\\x41")); + UNIT_ASSERT(HasError("[\\xfq]")); + UNIT_ASSERT(HasError("[\\x{01}-]")); + + UNIT_ASSERT(!HasError("[\\x{10FFFF}]")); + UNIT_ASSERT(!HasError("[\\x{00}]")); + UNIT_ASSERT(!HasError("[\\x{abc}-\\x{FFF}]")); + + UNIT_ASSERT(!HasError("[^\\xFF]")); + UNIT_ASSERT(!HasError("[^\\x{FF}-\\x{FF0}]")); + UNIT_ASSERT(!HasError("[-\\x{01}]")); + } + + Y_UNIT_TEST(UnicodeRepetition) + { + REGEXP("^\\x{78}{3,6}$") { + DENIES ("xx"); + ACCEPTS("xxx"); + ACCEPTS("xxxx"); + ACCEPTS("xxxxx"); + ACCEPTS("xxxxxx"); + DENIES ("xxxxxxx"); + } + + REGEXP("^x{3,}$") { + DENIES ("xx"); + ACCEPTS("xxx"); + ACCEPTS("xxxx"); + ACCEPTS("xxxxxxxxxxx"); + ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + } + + REGEXP("^\\x{78}{3}$") { + DENIES ("x"); + DENIES ("xx"); + ACCEPTS("xxx"); + DENIES ("xxxx"); + DENIES ("xxxxx"); + DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + } + + REGEXP("^([\\x{78}-\\x{79}]){2}$") { + DENIES("x"); + DENIES("y"); + ACCEPTS("xx"); + ACCEPTS("xy"); + ACCEPTS("yx"); + ACCEPTS("yy"); + DENIES("xxy"); + DENIES("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + } + } + + Y_UNIT_TEST(AnyUnicodeCodepointIsAllowed) + { + REGEXP("[\\x{0}-\\x{77}\\x{79}-\\x{10ffff}]") { + ACCEPTS("w"); + DENIES ("x"); + ACCEPTS("y"); + } + } } |