summaryrefslogtreecommitdiffstats
path: root/library/cpp/regex
diff options
context:
space:
mode:
authorthegeorg <[email protected]>2025-07-28 23:26:41 +0300
committerthegeorg <[email protected]>2025-07-28 23:42:24 +0300
commit2e3c965f3bac5a35f0ce39c7ae4e81bac879542f (patch)
tree7de46185d4d1bc4b1808109d95f72b914fa4adce /library/cpp/regex
parent48b676b29c3a9750b57dfb8c1c3c6aaa5e056a66 (diff)
library/cpp/pire: Tab to space
``` $ ya-subst '\t' ' ' $ ya-subst '\s+$' '' ``` commit_hash:402b2f02694cb6fae6bd903d31e429e1cc9a5d4b
Diffstat (limited to 'library/cpp/regex')
-rw-r--r--library/cpp/regex/pire/inline/inline.l340
-rw-r--r--library/cpp/regex/pire/pire/align.h136
-rw-r--r--library/cpp/regex/pire/pire/any.h170
-rw-r--r--library/cpp/regex/pire/pire/approx_matching.cpp112
-rw-r--r--library/cpp/regex/pire/pire/approx_matching.h2
-rw-r--r--library/cpp/regex/pire/pire/classes.cpp222
-rw-r--r--library/cpp/regex/pire/pire/defs.h82
-rw-r--r--library/cpp/regex/pire/pire/determine.h224
-rw-r--r--library/cpp/regex/pire/pire/easy.cpp4
-rw-r--r--library/cpp/regex/pire/pire/easy.h320
-rw-r--r--library/cpp/regex/pire/pire/encoding.cpp174
-rw-r--r--library/cpp/regex/pire/pire/encoding.h52
-rw-r--r--library/cpp/regex/pire/pire/extra.h2
-rw-r--r--library/cpp/regex/pire/pire/extra/capture.cpp196
-rw-r--r--library/cpp/regex/pire/pire/extra/capture.h1030
-rw-r--r--library/cpp/regex/pire/pire/extra/count.cpp1740
-rw-r--r--library/cpp/regex/pire/pire/extra/count.h726
-rw-r--r--library/cpp/regex/pire/pire/extra/glyphs.cpp196
-rw-r--r--library/cpp/regex/pire/pire/extra/glyphs.h14
-rw-r--r--library/cpp/regex/pire/pire/fsm.cpp1868
-rw-r--r--library/cpp/regex/pire/pire/fsm.h490
-rw-r--r--library/cpp/regex/pire/pire/glue.h178
-rw-r--r--library/cpp/regex/pire/pire/half_final_fsm.cpp662
-rw-r--r--library/cpp/regex/pire/pire/half_final_fsm.h60
-rw-r--r--library/cpp/regex/pire/pire/minimize.h280
-rw-r--r--library/cpp/regex/pire/pire/partition.h278
-rw-r--r--library/cpp/regex/pire/pire/pire.h2
-rw-r--r--library/cpp/regex/pire/pire/re_lexer.cpp4
-rw-r--r--library/cpp/regex/pire/pire/re_lexer.h274
-rw-r--r--library/cpp/regex/pire/pire/read_unicode.cpp96
-rw-r--r--library/cpp/regex/pire/pire/read_unicode.h18
-rw-r--r--library/cpp/regex/pire/pire/run.h590
-rw-r--r--library/cpp/regex/pire/pire/scanner_io.cpp308
-rw-r--r--library/cpp/regex/pire/pire/scanners/common.h160
-rw-r--r--library/cpp/regex/pire/pire/scanners/half_final.h392
-rw-r--r--library/cpp/regex/pire/pire/scanners/loaded.h426
-rw-r--r--library/cpp/regex/pire/pire/scanners/multi.h1900
-rw-r--r--library/cpp/regex/pire/pire/scanners/pair.h130
-rw-r--r--library/cpp/regex/pire/pire/scanners/simple.h412
-rw-r--r--library/cpp/regex/pire/pire/scanners/slow.h734
-rw-r--r--library/cpp/regex/pire/pire/static_assert.h4
-rw-r--r--library/cpp/regex/pire/pire/stub/singleton.h2
-rw-r--r--library/cpp/regex/pire/pire/vbitset.h136
-rw-r--r--library/cpp/regex/pire/ut/approx_matching_ut.cpp704
-rw-r--r--library/cpp/regex/pire/ut/capture_ut.cpp528
-rw-r--r--library/cpp/regex/pire/ut/common.h244
-rw-r--r--library/cpp/regex/pire/ut/count_ut.cpp1096
-rw-r--r--library/cpp/regex/pire/ut/easy_ut.cpp24
-rw-r--r--library/cpp/regex/pire/ut/glyph_ut.cpp58
-rw-r--r--library/cpp/regex/pire/ut/inline_ut.cpp66
-rw-r--r--library/cpp/regex/pire/ut/pire_ut.cpp1330
-rw-r--r--library/cpp/regex/pire/ut/read_unicode_ut.cpp554
52 files changed, 9875 insertions, 9875 deletions
diff --git a/library/cpp/regex/pire/inline/inline.l b/library/cpp/regex/pire/inline/inline.l
index a198ab9f978..72c1e32c6ab 100644
--- a/library/cpp/regex/pire/inline/inline.l
+++ b/library/cpp/regex/pire/inline/inline.l
@@ -47,27 +47,27 @@ static int isatty(int) { return 0; }
class Die {
public:
- Die() {
- Msg = filename.empty() ? "pire_inline" : (filename + ":" + ToString(line) + ":");
- }
+ Die() {
+ Msg = filename.empty() ? "pire_inline" : (filename + ":" + ToString(line) + ":");
+ }
- template<class T>
- Die& operator << (const T& t) {
- Msg += ToString(t);
- return *this;
- }
+ template<class T>
+ Die& operator << (const T& t) {
+ Msg += ToString(t);
+ return *this;
+ }
- ~Die() {
- fprintf(stderr, "%s\n", Msg.c_str());
- exit(1);
- }
+ ~Die() {
+ fprintf(stderr, "%s\n", Msg.c_str());
+ exit(1);
+ }
private:
- ystring Msg;
+ ystring Msg;
};
Die DieHelper() {
- return Die();
+ return Die();
}
void putChar(char c) { putc(c, yyout); }
@@ -90,115 +90,115 @@ void eatComment(void (*action)(char));
<INITIAL>"PIRE_REGEXP"[:space:]*"(" { BEGIN(Regexp); args.clear(); args.push_back(ystring()); }
<Regexp>"\""([^\"]|\\.)*"\"" {
- ystring& s = args.back();
- const char* p;
- for (p = yytext + 1; *p && p[1]; ++p) {
- if (*p == '\\') {
- ++p;
- if (!*p)
- Die() << "string ends with a backslash";
- else if (*p == '\'' || *p == '\"' || *p == '\\')
- s.push_back(*p);
- else if (*p == 'n')
- s.push_back('\n');
- else if (*p == 't')
- s.push_back('\t');
- else if (isdigit(*p)) {
- const char* beg = p;
- while (isdigit(*p))
- ++p;
- s.push_back(strtol(ystring(beg, p).c_str(), 0, 8));
- } else if (*p == 'x') {
- const char* beg = p;
- while (isdigit(*p) || (*p > 'a' && *p <= 'f') || (*p > 'A' && *p < 'F'))
- ++p;
- s.push_back(strtol(ystring(beg, p).c_str(), 0, 16));
- } else
- Die() << "unknown escape sequence (\\" << *p << ")";
- } else
- s.push_back(*p);
- }
- if (!*p)
- Die() << "string ends with a backslash";
+ ystring& s = args.back();
+ const char* p;
+ for (p = yytext + 1; *p && p[1]; ++p) {
+ if (*p == '\\') {
+ ++p;
+ if (!*p)
+ Die() << "string ends with a backslash";
+ else if (*p == '\'' || *p == '\"' || *p == '\\')
+ s.push_back(*p);
+ else if (*p == 'n')
+ s.push_back('\n');
+ else if (*p == 't')
+ s.push_back('\t');
+ else if (isdigit(*p)) {
+ const char* beg = p;
+ while (isdigit(*p))
+ ++p;
+ s.push_back(strtol(ystring(beg, p).c_str(), 0, 8));
+ } else if (*p == 'x') {
+ const char* beg = p;
+ while (isdigit(*p) || (*p > 'a' && *p <= 'f') || (*p > 'A' && *p < 'F'))
+ ++p;
+ s.push_back(strtol(ystring(beg, p).c_str(), 0, 16));
+ } else
+ Die() << "unknown escape sequence (\\" << *p << ")";
+ } else
+ s.push_back(*p);
+ }
+ if (!*p)
+ Die() << "string ends with a backslash";
}
<Regexp>[ \t] {}
<Regexp>\n { ++line; }
<Regexp>"," { args.push_back(ystring()); }
<Regexp>")" {
- if (args.size() & 1 || args.empty())
- Die() << "Usage: PIRE_REGEXP(\"regexp1\", \"flags1\" [, \"regexp2\", \"flags2\" [,...] ])";
-
- bool first = true;
- Pire::Scanner sc;
- ystring pattern;
- for (auto i = args.begin(), ie = args.end(); i != ie; i += 2) {
-
- Pire::Lexer lexer(i->c_str(), i->c_str() + i->size());
- bool surround = false;
- bool greedy = false;
- bool reverse = false;
- for (const char* option = (i+1)->c_str(); *option; ++option) {
- if (*option == 'i')
- lexer.AddFeature(Pire::Features::CaseInsensitive());
- else if (*option == 'u')
- lexer.SetEncoding(Pire::Encodings::Utf8());
- else if (*option == 's')
- surround = true;
- else if (*option == 'a')
- lexer.AddFeature(Pire::Features::AndNotSupport());
- else if (*option == 'g')
- greedy = true;
- else if (*option == 'r')
- reverse = true;
- else
- Die() << "unknown option " << *option << "";
- }
-
- Pire::Fsm fsm;
- try {
- fsm = lexer.Parse();
- }
- catch (std::exception& e) {
- Die() << "" << filename << ":" << line << ": " << e.what() << "";
- }
- if (reverse)
- fsm.Reverse();
- if (greedy && surround)
- Die() << "greedy and surround options are incompatible";
- if (greedy)
- fsm = ~fsm.Surrounded() + fsm;
- else if (surround)
- fsm.Surround();
-
- Pire::Scanner tsc(fsm);
- if (first) {
- pattern = *i;
- first = false;
- tsc.Swap(sc);
- } else {
- sc = Pire::Scanner::Glue(sc, tsc);
- pattern += " | ";
- pattern += *i;
- }
- }
-
- BufferOutput buf;
- AlignedOutput stream(&buf);
- Save(&stream, sc);
-
- fprintf(yyout, "Pire::MmappedScanner<Pire::Scanner>(PIRE_LITERAL( // %s \n \"", pattern.c_str());
- size_t pos = 5;
- for (auto i = buf.Buffer().Begin(), ie = buf.Buffer().End(); i != ie; ++i) {
- pos += fprintf(yyout, "\\x%02X", static_cast<unsigned char>(*i));
- if (pos >= 78) {
- fprintf(yyout, "\"\n \"");
- pos = 5;
- }
- }
- fprintf(yyout, "\"), %u)\n#line %d \"%s\"\n",
- (unsigned int) buf.Buffer().Size(), line, filename.c_str());
- BEGIN(INITIAL);
+ if (args.size() & 1 || args.empty())
+ Die() << "Usage: PIRE_REGEXP(\"regexp1\", \"flags1\" [, \"regexp2\", \"flags2\" [,...] ])";
+
+ bool first = true;
+ Pire::Scanner sc;
+ ystring pattern;
+ for (auto i = args.begin(), ie = args.end(); i != ie; i += 2) {
+
+ Pire::Lexer lexer(i->c_str(), i->c_str() + i->size());
+ bool surround = false;
+ bool greedy = false;
+ bool reverse = false;
+ for (const char* option = (i+1)->c_str(); *option; ++option) {
+ if (*option == 'i')
+ lexer.AddFeature(Pire::Features::CaseInsensitive());
+ else if (*option == 'u')
+ lexer.SetEncoding(Pire::Encodings::Utf8());
+ else if (*option == 's')
+ surround = true;
+ else if (*option == 'a')
+ lexer.AddFeature(Pire::Features::AndNotSupport());
+ else if (*option == 'g')
+ greedy = true;
+ else if (*option == 'r')
+ reverse = true;
+ else
+ Die() << "unknown option " << *option << "";
+ }
+
+ Pire::Fsm fsm;
+ try {
+ fsm = lexer.Parse();
+ }
+ catch (std::exception& e) {
+ Die() << "" << filename << ":" << line << ": " << e.what() << "";
+ }
+ if (reverse)
+ fsm.Reverse();
+ if (greedy && surround)
+ Die() << "greedy and surround options are incompatible";
+ if (greedy)
+ fsm = ~fsm.Surrounded() + fsm;
+ else if (surround)
+ fsm.Surround();
+
+ Pire::Scanner tsc(fsm);
+ if (first) {
+ pattern = *i;
+ first = false;
+ tsc.Swap(sc);
+ } else {
+ sc = Pire::Scanner::Glue(sc, tsc);
+ pattern += " | ";
+ pattern += *i;
+ }
+ }
+
+ BufferOutput buf;
+ AlignedOutput stream(&buf);
+ Save(&stream, sc);
+
+ fprintf(yyout, "Pire::MmappedScanner<Pire::Scanner>(PIRE_LITERAL( // %s \n \"", pattern.c_str());
+ size_t pos = 5;
+ for (auto i = buf.Buffer().Begin(), ie = buf.Buffer().End(); i != ie; ++i) {
+ pos += fprintf(yyout, "\\x%02X", static_cast<unsigned char>(*i));
+ if (pos >= 78) {
+ fprintf(yyout, "\"\n \"");
+ pos = 5;
+ }
+ }
+ fprintf(yyout, "\"), %u)\n#line %d \"%s\"\n",
+ (unsigned int) buf.Buffer().Size(), line, filename.c_str());
+ BEGIN(INITIAL);
}
<INITIAL>. { putc(*yytext, yyout); }
@@ -209,26 +209,26 @@ void eatComment(void (*action)(char));
void eatComment(void (*action)(char))
{
- int c;
- action('/'); action('*');
- for (;;) {
- while ((c = yyinput()) != EOF && c != '*') {
- if (c == '\n')
- ++line;
- action(c);
- }
- if (c == '*') {
- action(c);
- while ((c = yyinput()) == '*')
- action(c);
- if (c == '/') {
- action(c);
- break;
- }
- }
- if (c == EOF)
- Die() << "EOF in comment";
- }
+ int c;
+ action('/'); action('*');
+ for (;;) {
+ while ((c = yyinput()) != EOF && c != '*') {
+ if (c == '\n')
+ ++line;
+ action(c);
+ }
+ if (c == '*') {
+ action(c);
+ while ((c = yyinput()) == '*')
+ action(c);
+ if (c == '/') {
+ action(c);
+ break;
+ }
+ }
+ if (c == EOF)
+ Die() << "EOF in comment";
+ }
}
int yywrap() { return 1; }
@@ -236,37 +236,37 @@ int yywrap() { return 1; }
int main(int argc, char** argv)
{
- // Suppress warnings
- static_cast<void>(&yy_fatal_error);
- static_cast<void>(&yyunput);
-
-
- try {
- const char* outfile = 0;
- if (argc >= 3 && !strcmp(argv[1], "-o")) {
- outfile = argv[2];
- argv += 2, argc -= 2;
- }
- if (argc == 2)
- filename = ystring(argv[1]);
- else if (argc > 2)
- Die() << "usage: pire_inline [-o outfile] [infile]";
-
- yyin = stdin, yyout = stdout;
- if (outfile && (yyout = fopen(outfile, "w")) == NULL)
- Die() << "cannot open file " << outfile << " for writing";
- if (!filename.empty()) {
- if ((yyin = fopen(filename.c_str(), "r")) == NULL)
- Die() << "cannot open file " << filename.c_str() << "\n";
- } else
- filename = "(stdin)";
-
-
- yylex();
- return 0;
- }
- catch (std::exception& e) {
- fprintf(stderr, "%s\n", e.what());
- return 1;
- }
+ // Suppress warnings
+ static_cast<void>(&yy_fatal_error);
+ static_cast<void>(&yyunput);
+
+
+ try {
+ const char* outfile = 0;
+ if (argc >= 3 && !strcmp(argv[1], "-o")) {
+ outfile = argv[2];
+ argv += 2, argc -= 2;
+ }
+ if (argc == 2)
+ filename = ystring(argv[1]);
+ else if (argc > 2)
+ Die() << "usage: pire_inline [-o outfile] [infile]";
+
+ yyin = stdin, yyout = stdout;
+ if (outfile && (yyout = fopen(outfile, "w")) == NULL)
+ Die() << "cannot open file " << outfile << " for writing";
+ if (!filename.empty()) {
+ if ((yyin = fopen(filename.c_str(), "r")) == NULL)
+ Die() << "cannot open file " << filename.c_str() << "\n";
+ } else
+ filename = "(stdin)";
+
+
+ yylex();
+ return 0;
+ }
+ catch (std::exception& e) {
+ fprintf(stderr, "%s\n", e.what());
+ return 1;
+ }
}
diff --git a/library/cpp/regex/pire/pire/align.h b/library/cpp/regex/pire/pire/align.h
index 9e482f1b44a..595b8cbde83 100644
--- a/library/cpp/regex/pire/pire/align.h
+++ b/library/cpp/regex/pire/pire/align.h
@@ -12,7 +12,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -31,73 +31,73 @@
#include "platform.h"
namespace Pire {
-
- namespace Impl {
-
- template<class T>
- inline T AlignUp(T t, size_t bound)
- {
- return (T) (((size_t) t + (bound-1)) & ~(bound-1));
- }
-
- template<class T>
- inline T AlignDown(T t, size_t bound)
- {
- return (T) ((size_t) t & ~(bound-1));
- }
-
- inline void AlignSave(yostream* s, size_t size)
- {
- size_t tail = AlignUp(size, sizeof(size_t)) - size;
- if (tail) {
- static const char buf[sizeof(MaxSizeWord)] = {0};
- SavePodArray(s, buf, tail);
- }
- }
-
- inline void AlignLoad(yistream* s, size_t size)
- {
- size_t tail = AlignUp(size, sizeof(size_t)) - size;
- if (tail) {
- char buf[sizeof(MaxSizeWord)];
- LoadPodArray(s, buf, tail);
- }
- }
-
- template<class T>
- inline void AlignedSaveArray(yostream* s, const T* array, size_t count)
- {
- SavePodArray(s, array, count);
- AlignSave(s, sizeof(*array) * count);
- }
-
- template<class T>
- inline void AlignedLoadArray(yistream* s, T* array, size_t count)
- {
- LoadPodArray(s, array, count);
- AlignLoad(s, sizeof(*array) * count);
- }
-
- template<class T>
- inline bool IsAligned(T t, size_t bound)
- {
- return ((size_t) t & (bound-1)) == 0;
- }
-
- inline const void* AlignPtr(const size_t*& p, size_t& size)
- {
- if (!IsAligned(p, sizeof(size_t))) {
- const size_t* next = AlignUp(p, sizeof(size_t));
- if (next > p+size)
- throw Error("EOF reached in NPire::Impl::align");
- size -= (next - p);
- p = next;
- }
- return (const void*) p;
- }
-
- }
-
+
+ namespace Impl {
+
+ template<class T>
+ inline T AlignUp(T t, size_t bound)
+ {
+ return (T) (((size_t) t + (bound-1)) & ~(bound-1));
+ }
+
+ template<class T>
+ inline T AlignDown(T t, size_t bound)
+ {
+ return (T) ((size_t) t & ~(bound-1));
+ }
+
+ inline void AlignSave(yostream* s, size_t size)
+ {
+ size_t tail = AlignUp(size, sizeof(size_t)) - size;
+ if (tail) {
+ static const char buf[sizeof(MaxSizeWord)] = {0};
+ SavePodArray(s, buf, tail);
+ }
+ }
+
+ inline void AlignLoad(yistream* s, size_t size)
+ {
+ size_t tail = AlignUp(size, sizeof(size_t)) - size;
+ if (tail) {
+ char buf[sizeof(MaxSizeWord)];
+ LoadPodArray(s, buf, tail);
+ }
+ }
+
+ template<class T>
+ inline void AlignedSaveArray(yostream* s, const T* array, size_t count)
+ {
+ SavePodArray(s, array, count);
+ AlignSave(s, sizeof(*array) * count);
+ }
+
+ template<class T>
+ inline void AlignedLoadArray(yistream* s, T* array, size_t count)
+ {
+ LoadPodArray(s, array, count);
+ AlignLoad(s, sizeof(*array) * count);
+ }
+
+ template<class T>
+ inline bool IsAligned(T t, size_t bound)
+ {
+ return ((size_t) t & (bound-1)) == 0;
+ }
+
+ inline const void* AlignPtr(const size_t*& p, size_t& size)
+ {
+ if (!IsAligned(p, sizeof(size_t))) {
+ const size_t* next = AlignUp(p, sizeof(size_t));
+ if (next > p+size)
+ throw Error("EOF reached in NPire::Impl::align");
+ size -= (next - p);
+ p = next;
+ }
+ return (const void*) p;
+ }
+
+ }
+
}
#endif
diff --git a/library/cpp/regex/pire/pire/any.h b/library/cpp/regex/pire/pire/any.h
index 671a4832381..4e968857dc8 100644
--- a/library/cpp/regex/pire/pire/any.h
+++ b/library/cpp/regex/pire/pire/any.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -34,98 +34,98 @@ namespace Pire {
class Any {
public:
- Any() = default;
-
- Any(const Any& any)
- {
- if (any.h)
- h = any.h->Duplicate();
- }
-
- Any& operator= (Any any)
- {
- any.Swap(*this);
- return *this;
- }
-
- template <class T>
- Any(const T& t)
- : h(new Holder<T>(t))
- {
- }
-
- bool Empty() const {
- return !h;
- }
- template <class T>
- bool IsA() const {
- return h && h->IsA(typeid(T));
- }
-
- template <class T>
- T& As()
- {
- if (h && IsA<T>())
- return *reinterpret_cast<T*>(h->Ptr());
- else
- throw Pire::Error("type mismatch");
- }
-
- template <class T>
- const T& As() const
- {
- if (h && IsA<T>())
- return *reinterpret_cast<const T*>(h->Ptr());
- else
- throw Pire::Error("type mismatch");
- }
-
- void Swap(Any& a) noexcept {
- DoSwap(h, a.h);
- }
+ Any() = default;
+
+ Any(const Any& any)
+ {
+ if (any.h)
+ h = any.h->Duplicate();
+ }
+
+ Any& operator= (Any any)
+ {
+ any.Swap(*this);
+ return *this;
+ }
+
+ template <class T>
+ Any(const T& t)
+ : h(new Holder<T>(t))
+ {
+ }
+
+ bool Empty() const {
+ return !h;
+ }
+ template <class T>
+ bool IsA() const {
+ return h && h->IsA(typeid(T));
+ }
+
+ template <class T>
+ T& As()
+ {
+ if (h && IsA<T>())
+ return *reinterpret_cast<T*>(h->Ptr());
+ else
+ throw Pire::Error("type mismatch");
+ }
+
+ template <class T>
+ const T& As() const
+ {
+ if (h && IsA<T>())
+ return *reinterpret_cast<const T*>(h->Ptr());
+ else
+ throw Pire::Error("type mismatch");
+ }
+
+ void Swap(Any& a) noexcept {
+ DoSwap(h, a.h);
+ }
private:
- struct AbstractHolder {
- virtual ~AbstractHolder() {
- }
- virtual THolder<AbstractHolder> Duplicate() const = 0;
- virtual bool IsA(const std::type_info& id) const = 0;
- virtual void* Ptr() = 0;
- virtual const void* Ptr() const = 0;
- };
-
- template <class T>
- struct Holder: public AbstractHolder {
- Holder(T t)
- : d(t)
- {
- }
- THolder<AbstractHolder> Duplicate() const {
- return THolder<AbstractHolder>(new Holder<T>(d));
- }
- bool IsA(const std::type_info& id) const {
- return id == typeid(T);
- }
- void* Ptr() {
- return &d;
- }
- const void* Ptr() const {
- return &d;
- }
- private:
- T d;
- };
-
- THolder<AbstractHolder> h;
+ struct AbstractHolder {
+ virtual ~AbstractHolder() {
+ }
+ virtual THolder<AbstractHolder> Duplicate() const = 0;
+ virtual bool IsA(const std::type_info& id) const = 0;
+ virtual void* Ptr() = 0;
+ virtual const void* Ptr() const = 0;
+ };
+
+ template <class T>
+ struct Holder: public AbstractHolder {
+ Holder(T t)
+ : d(t)
+ {
+ }
+ THolder<AbstractHolder> Duplicate() const {
+ return THolder<AbstractHolder>(new Holder<T>(d));
+ }
+ bool IsA(const std::type_info& id) const {
+ return id == typeid(T);
+ }
+ void* Ptr() {
+ return &d;
+ }
+ const void* Ptr() const {
+ return &d;
+ }
+ private:
+ T d;
+ };
+
+ THolder<AbstractHolder> h;
};
}
namespace std {
- inline void swap(Pire::Any& a, Pire::Any& b) {
- a.Swap(b);
- }
+ inline void swap(Pire::Any& a, Pire::Any& b) {
+ a.Swap(b);
+ }
}
#endif
diff --git a/library/cpp/regex/pire/pire/approx_matching.cpp b/library/cpp/regex/pire/pire/approx_matching.cpp
index 23f74ca01df..fb8adf1885d 100644
--- a/library/cpp/regex/pire/pire/approx_matching.cpp
+++ b/library/cpp/regex/pire/pire/approx_matching.cpp
@@ -23,72 +23,72 @@
#include "approx_matching.h"
namespace Pire {
- Fsm CreateApproxFsm(const Fsm& regexp, size_t distance) {
- Fsm approxFsm = regexp;
+ Fsm CreateApproxFsm(const Fsm& regexp, size_t distance) {
+ Fsm approxFsm = regexp;
- TVector<TSet<Char>> outgoingLettersTable(regexp.Size());
- for (size_t state = 0; state < regexp.Size(); ++state) {
- outgoingLettersTable[state] = regexp.OutgoingLetters(state);
- }
+ TVector<TSet<Char>> outgoingLettersTable(regexp.Size());
+ for (size_t state = 0; state < regexp.Size(); ++state) {
+ outgoingLettersTable[state] = regexp.OutgoingLetters(state);
+ }
- TVector<TMap<Char, Fsm::StatesSet>> destinationsTable(regexp.Size());
- for (size_t state = 0; state < regexp.Size(); ++state) {
- for (Char letter : outgoingLettersTable[state]) {
- destinationsTable[state][letter] = regexp.Destinations(state, letter);
- }
- }
+ TVector<TMap<Char, Fsm::StatesSet>> destinationsTable(regexp.Size());
+ for (size_t state = 0; state < regexp.Size(); ++state) {
+ for (Char letter : outgoingLettersTable[state]) {
+ destinationsTable[state][letter] = regexp.Destinations(state, letter);
+ }
+ }
- for (size_t fsmIdx = 0; fsmIdx < distance; ++fsmIdx) {
- approxFsm.Import(regexp);
- const auto shift = fsmIdx * regexp.Size();
+ for (size_t fsmIdx = 0; fsmIdx < distance; ++fsmIdx) {
+ approxFsm.Import(regexp);
+ const auto shift = fsmIdx * regexp.Size();
- for (size_t state = 0; state < regexp.Size(); ++state) {
- for (Char letter : outgoingLettersTable[state]) {
- for (size_t to : destinationsTable[state][letter]) {
- for (Char ch = 0; ch < MaxChar; ++ch) {
- if (!approxFsm.Connected(state + shift, to + shift, ch)) {
- approxFsm.Connect(state + shift, to + shift + regexp.Size(), ch);
- }
- }
+ for (size_t state = 0; state < regexp.Size(); ++state) {
+ for (Char letter : outgoingLettersTable[state]) {
+ for (size_t to : destinationsTable[state][letter]) {
+ for (Char ch = 0; ch < MaxChar; ++ch) {
+ if (!approxFsm.Connected(state + shift, to + shift, ch)) {
+ approxFsm.Connect(state + shift, to + shift + regexp.Size(), ch);
+ }
+ }
- approxFsm.Connect(state + shift, to + shift + regexp.Size(), Epsilon);
- }
+ approxFsm.Connect(state + shift, to + shift + regexp.Size(), Epsilon);
+ }
- for (Char ch = 0; ch < MaxChar; ++ch) {
- approxFsm.Connect(state + shift, state + shift + regexp.Size(), ch);
- }
- }
+ for (Char ch = 0; ch < MaxChar; ++ch) {
+ approxFsm.Connect(state + shift, state + shift + regexp.Size(), ch);
+ }
+ }
- if (regexp.IsFinal(state)) {
- approxFsm.SetFinal(state + shift + regexp.Size(), true);
- }
- }
- }
+ if (regexp.IsFinal(state)) {
+ approxFsm.SetFinal(state + shift + regexp.Size(), true);
+ }
+ }
+ }
- size_t maxState = (distance > 0) ? approxFsm.Size() - regexp.Size() : 0;
- for (size_t state = 0; state < maxState; ++state) {
- size_t currentDist = state / regexp.Size();
- size_t intState = state % regexp.Size();
+ size_t maxState = (distance > 0) ? approxFsm.Size() - regexp.Size() : 0;
+ for (size_t state = 0; state < maxState; ++state) {
+ size_t currentDist = state / regexp.Size();
+ size_t intState = state % regexp.Size();
- for (Char firstLetter : outgoingLettersTable[intState]) {
- for (size_t firstDest : destinationsTable[intState][firstLetter]) {
- for (Char secondLetter : outgoingLettersTable[firstDest]) {
- for (size_t secondDest : destinationsTable[firstDest][secondLetter]) {
- if (secondDest != intState || firstDest != intState) {
- approxFsm.Resize(approxFsm.Size() + 1);
+ for (Char firstLetter : outgoingLettersTable[intState]) {
+ for (size_t firstDest : destinationsTable[intState][firstLetter]) {
+ for (Char secondLetter : outgoingLettersTable[firstDest]) {
+ for (size_t secondDest : destinationsTable[firstDest][secondLetter]) {
+ if (secondDest != intState || firstDest != intState) {
+ approxFsm.Resize(approxFsm.Size() + 1);
- size_t to = secondDest + (currentDist + 1) * regexp.Size();
- size_t middle = approxFsm.Size() - 1;
+ size_t to = secondDest + (currentDist + 1) * regexp.Size();
+ size_t middle = approxFsm.Size() - 1;
- approxFsm.Connect(state, middle, secondLetter);
- approxFsm.Connect(middle, to, firstLetter);
- }
- }
- }
- }
- }
- }
+ approxFsm.Connect(state, middle, secondLetter);
+ approxFsm.Connect(middle, to, firstLetter);
+ }
+ }
+ }
+ }
+ }
+ }
- return approxFsm;
- }
+ return approxFsm;
+ }
}
diff --git a/library/cpp/regex/pire/pire/approx_matching.h b/library/cpp/regex/pire/pire/approx_matching.h
index fc2a9fd61c1..c20d68ce6ac 100644
--- a/library/cpp/regex/pire/pire/approx_matching.h
+++ b/library/cpp/regex/pire/pire/approx_matching.h
@@ -24,5 +24,5 @@
#include "fsm.h"
namespace Pire {
- Fsm CreateApproxFsm(const Fsm& regexp, size_t distance);
+ Fsm CreateApproxFsm(const Fsm& regexp, size_t distance);
}
diff --git a/library/cpp/regex/pire/pire/classes.cpp b/library/cpp/regex/pire/pire/classes.cpp
index 7dd531ab3e1..3558e775994 100644
--- a/library/cpp/regex/pire/pire/classes.cpp
+++ b/library/cpp/regex/pire/pire/classes.cpp
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -32,120 +32,120 @@ namespace Pire {
namespace {
- class CharClassesTable: private NonCopyable {
- private:
- class CharClass {
- public:
- CharClass() {}
- explicit CharClass(wchar32 ch) { m_bounds.push_back(ymake_pair(ch, ch)); }
- CharClass(wchar32 lower, wchar32 upper) { m_bounds.push_back(ymake_pair(lower, upper)); }
-
- CharClass& operator |= (const CharClass& cc)
- {
- std::copy(cc.m_bounds.begin(), cc.m_bounds.end(), std::back_inserter(m_bounds));
- return *this;
- }
-
- CharClass operator | (const CharClass& cc) const
- {
- CharClass r(*this);
- r |= cc;
- return r;
- }
-
- TSet<wchar32> ToSet() const
- {
- TSet<wchar32> ret;
- for (auto&& bound : m_bounds)
- for (wchar32 c = bound.first; c <= bound.second; ++c)
- ret.insert(c);
- return ret;
- }
-
- private:
- TVector<ypair<wchar32, wchar32> > m_bounds;
- };
-
- public:
- bool Has(wchar32 wc) const
- {
- return (m_classes.find(to_lower(wc & ~ControlMask)) != m_classes.end());
- }
-
- TSet<wchar32> Get(wchar32 wc) const
- {
- auto it = m_classes.find(to_lower(wc & ~ControlMask));
- if (it == m_classes.end())
- throw Error("Unknown character class");
- return it->second.ToSet();
- }
-
- CharClassesTable()
- {
- m_classes['l'] = CharClass('A', 'Z') | CharClass('a', 'z');
- m_classes['c']
- = CharClass(0x0410, 0x044F) // Russian capital A to Russan capital YA, Russian small A to Russian small YA
- | CharClass(0x0401) // Russian capital Yo
- | CharClass(0x0451) // Russian small Yo
- ;
-
- m_classes['w'] = m_classes['l'] | m_classes['c'];
- m_classes['d'] = CharClass('0', '9');
- m_classes['s']
- = CharClass(' ') | CharClass('\t') | CharClass('\r') | CharClass('\n')
- | CharClass(0x00A0) // Non-breaking space
- ;
-
- // A few special classes which do not have any negation
- m_classes['n'] = CharClass('\n');
- m_classes['r'] = CharClass('\r');
- m_classes['t'] = CharClass('\t');
- }
-
- TMap<wchar32, CharClass> m_classes;
- };
-
- class CharClassesImpl: public Feature {
- public:
- CharClassesImpl(): m_table(Singleton<CharClassesTable>()) {}
- int Priority() const { return 10; }
-
- void Alter(Term& t)
- {
- if (t.Value().IsA<Term::CharacterRange>()) {
- const Term::CharacterRange& range = t.Value().As<Term::CharacterRange>();
- typedef Term::CharacterRange::first_type CharSet;
- const CharSet& old = range.first;
- CharSet altered;
- bool pos = false;
- bool neg = false;
- for (auto&& i : old)
- if (i.size() == 1 && (i[0] & ControlMask) == Control && m_table->Has(i[0])) {
- if (is_upper(i[0] & ~ControlMask))
- neg = true;
- else
- pos = true;
-
- TSet<wchar32> klass = m_table->Get(i[0]);
- for (auto&& j : klass)
- altered.insert(Term::String(1, j));
- } else
- altered.insert(i);
-
- if (neg && (pos || range.second))
- Error("Positive and negative character ranges mixed");
- t = Term(t.Type(), Term::CharacterRange(altered, neg || range.second));
- }
- }
-
- private:
- CharClassesTable* m_table;
- };
+ class CharClassesTable: private NonCopyable {
+ private:
+ class CharClass {
+ public:
+ CharClass() {}
+ explicit CharClass(wchar32 ch) { m_bounds.push_back(ymake_pair(ch, ch)); }
+ CharClass(wchar32 lower, wchar32 upper) { m_bounds.push_back(ymake_pair(lower, upper)); }
+
+ CharClass& operator |= (const CharClass& cc)
+ {
+ std::copy(cc.m_bounds.begin(), cc.m_bounds.end(), std::back_inserter(m_bounds));
+ return *this;
+ }
+
+ CharClass operator | (const CharClass& cc) const
+ {
+ CharClass r(*this);
+ r |= cc;
+ return r;
+ }
+
+ TSet<wchar32> ToSet() const
+ {
+ TSet<wchar32> ret;
+ for (auto&& bound : m_bounds)
+ for (wchar32 c = bound.first; c <= bound.second; ++c)
+ ret.insert(c);
+ return ret;
+ }
+
+ private:
+ TVector<ypair<wchar32, wchar32> > m_bounds;
+ };
+
+ public:
+ bool Has(wchar32 wc) const
+ {
+ return (m_classes.find(to_lower(wc & ~ControlMask)) != m_classes.end());
+ }
+
+ TSet<wchar32> Get(wchar32 wc) const
+ {
+ auto it = m_classes.find(to_lower(wc & ~ControlMask));
+ if (it == m_classes.end())
+ throw Error("Unknown character class");
+ return it->second.ToSet();
+ }
+
+ CharClassesTable()
+ {
+ m_classes['l'] = CharClass('A', 'Z') | CharClass('a', 'z');
+ m_classes['c']
+ = CharClass(0x0410, 0x044F) // Russian capital A to Russan capital YA, Russian small A to Russian small YA
+ | CharClass(0x0401) // Russian capital Yo
+ | CharClass(0x0451) // Russian small Yo
+ ;
+
+ m_classes['w'] = m_classes['l'] | m_classes['c'];
+ m_classes['d'] = CharClass('0', '9');
+ m_classes['s']
+ = CharClass(' ') | CharClass('\t') | CharClass('\r') | CharClass('\n')
+ | CharClass(0x00A0) // Non-breaking space
+ ;
+
+ // A few special classes which do not have any negation
+ m_classes['n'] = CharClass('\n');
+ m_classes['r'] = CharClass('\r');
+ m_classes['t'] = CharClass('\t');
+ }
+
+ TMap<wchar32, CharClass> m_classes;
+ };
+
+ class CharClassesImpl: public Feature {
+ public:
+ CharClassesImpl(): m_table(Singleton<CharClassesTable>()) {}
+ int Priority() const { return 10; }
+
+ void Alter(Term& t)
+ {
+ if (t.Value().IsA<Term::CharacterRange>()) {
+ const Term::CharacterRange& range = t.Value().As<Term::CharacterRange>();
+ typedef Term::CharacterRange::first_type CharSet;
+ const CharSet& old = range.first;
+ CharSet altered;
+ bool pos = false;
+ bool neg = false;
+ for (auto&& i : old)
+ if (i.size() == 1 && (i[0] & ControlMask) == Control && m_table->Has(i[0])) {
+ if (is_upper(i[0] & ~ControlMask))
+ neg = true;
+ else
+ pos = true;
+
+ TSet<wchar32> klass = m_table->Get(i[0]);
+ for (auto&& j : klass)
+ altered.insert(Term::String(1, j));
+ } else
+ altered.insert(i);
+
+ if (neg && (pos || range.second))
+ Error("Positive and negative character ranges mixed");
+ t = Term(t.Type(), Term::CharacterRange(altered, neg || range.second));
+ }
+ }
+
+ private:
+ CharClassesTable* m_table;
+ };
}
namespace Features {
- Feature::Ptr CharClasses() { return Feature::Ptr(new CharClassesImpl); }
+ Feature::Ptr CharClasses() { return Feature::Ptr(new CharClassesImpl); }
}
}
diff --git a/library/cpp/regex/pire/pire/defs.h b/library/cpp/regex/pire/pire/defs.h
index 894cc780b72..18570cd5bfe 100644
--- a/library/cpp/regex/pire/pire/defs.h
+++ b/library/cpp/regex/pire/pire/defs.h
@@ -37,73 +37,73 @@
namespace Pire {
#ifdef PIRE_DEBUG
-# define PIRE_IFDEBUG(x) x
+# define PIRE_IFDEBUG(x) x
#else
-# define PIRE_IFDEBUG(x)
+# define PIRE_IFDEBUG(x)
#endif
#ifdef PIRE_CHECKED
-# define PIRE_IF_CHECKED(e) e
+# define PIRE_IF_CHECKED(e) e
#else
-# define PIRE_IF_CHECKED(e)
+# define PIRE_IF_CHECKED(e)
#endif
- typedef unsigned short Char;
+ typedef unsigned short Char;
- namespace SpecialChar {
- enum {
- Epsilon = 257,
- BeginMark = 258,
- EndMark = 259,
+ namespace SpecialChar {
+ enum {
+ Epsilon = 257,
+ BeginMark = 258,
+ EndMark = 259,
- // Actual size of input alphabet
- MaxCharUnaligned = 260,
+ // Actual size of input alphabet
+ MaxCharUnaligned = 260,
- // Size of letter transition tables, must be a multiple of the machine word size
- MaxChar = (MaxCharUnaligned + (sizeof(void*)-1)) & ~(sizeof(void*)-1)
- };
- }
+ // Size of letter transition tables, must be a multiple of the machine word size
+ MaxChar = (MaxCharUnaligned + (sizeof(void*)-1)) & ~(sizeof(void*)-1)
+ };
+ }
- using namespace SpecialChar;
+ using namespace SpecialChar;
- namespace Impl {
+ namespace Impl {
#ifndef PIRE_WORDS_BIGENDIAN
- inline size_t ToLittleEndian(size_t val) { return val; }
+ inline size_t ToLittleEndian(size_t val) { return val; }
#else
- template<unsigned N>
- inline size_t SwapBytes(size_t val)
- {
- static const size_t Mask = (1 << (N/2)) - 1;
- return ((SwapBytes<N/2>(val) & Mask) << (N/2)) | SwapBytes<N/2>(val >> (N/2));
- }
+ template<unsigned N>
+ inline size_t SwapBytes(size_t val)
+ {
+ static const size_t Mask = (1 << (N/2)) - 1;
+ return ((SwapBytes<N/2>(val) & Mask) << (N/2)) | SwapBytes<N/2>(val >> (N/2));
+ }
- template<>
- inline size_t SwapBytes<8>(size_t val) { return val & 0xFF; }
+ template<>
+ inline size_t SwapBytes<8>(size_t val) { return val & 0xFF; }
- inline size_t ToLittleEndian(size_t val) { return SwapBytes<sizeof(val)*8>(val); }
+ inline size_t ToLittleEndian(size_t val) { return SwapBytes<sizeof(val)*8>(val); }
#endif
struct Struct { void* p; };
- }
+ }
}
#ifndef PIRE_ALIGNED_DECL
-# if defined(PIRE_HAVE_ALIGNAS)
-# define PIRE_ALIGNED_DECL(x) alignas(::Pire::Impl::Struct) static const char x[]
-# elif defined(PIRE_HAVE_ATTR_ALIGNED)
-# define PIRE_ALIGNED_DECL(x) static const char x[] __attribute__((aligned(sizeof(void*))))
-# elif defined(PIRE_HAVE_DECLSPEC_ALIGN)
-# define PIRE_ALIGNED_DECL(x) __declspec(align(8)) static const char x[]
-# endif
+# if defined(PIRE_HAVE_ALIGNAS)
+# define PIRE_ALIGNED_DECL(x) alignas(::Pire::Impl::Struct) static const char x[]
+# elif defined(PIRE_HAVE_ATTR_ALIGNED)
+# define PIRE_ALIGNED_DECL(x) static const char x[] __attribute__((aligned(sizeof(void*))))
+# elif defined(PIRE_HAVE_DECLSPEC_ALIGN)
+# define PIRE_ALIGNED_DECL(x) __declspec(align(8)) static const char x[]
+# endif
#endif
#ifndef PIRE_LITERAL
-# if defined(PIRE_HAVE_LAMBDAS)
-# define PIRE_LITERAL(data) ([]() -> const char* { PIRE_ALIGNED_DECL(__pire_regexp__) = data; return __pire_regexp__; })()
-# elif defined(PIRE_HAVE_SCOPED_EXPR)
-# define PIRE_LITERAL(data) ({ PIRE_ALIGNED_DECL(__pire_regexp__) = data; __pire_regexp__; })
-# endif
+# if defined(PIRE_HAVE_LAMBDAS)
+# define PIRE_LITERAL(data) ([]() -> const char* { PIRE_ALIGNED_DECL(__pire_regexp__) = data; return __pire_regexp__; })()
+# elif defined(PIRE_HAVE_SCOPED_EXPR)
+# define PIRE_LITERAL(data) ({ PIRE_ALIGNED_DECL(__pire_regexp__) = data; __pire_regexp__; })
+# endif
#endif
#endif
diff --git a/library/cpp/regex/pire/pire/determine.h b/library/cpp/regex/pire/pire/determine.h
index 96ee1b52749..d60304a265d 100644
--- a/library/cpp/regex/pire/pire/determine.h
+++ b/library/cpp/regex/pire/pire/determine.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -29,117 +29,117 @@
#include "partition.h"
namespace Pire {
- namespace Impl {
-
- /**
- * An interface of a determination task.
- * You don't have to derive from this class; it is just a start point template.
- */
- class DetermineTask {
- private:
- struct ImplementationSpecific1;
- struct ImplementationSpecific2;
-
- public:
- /// A type representing a new state (may be a set of old states, a pair of them, etc...)
- typedef ImplementationSpecific1 State;
-
- /// A type of letter equivalence classes table.
- typedef Partition<char, ImplementationSpecific2> LettersTbl;
-
- /// A container used for storing map of states to thier indices.
- typedef TMap<State, size_t> InvStates;
-
- /// Should return used letters' partition.
- const LettersTbl& Letters() const;
-
- /// Should return initial state (surprise!)
- State Initial() const;
-
- /// Should calculate next state, given the current state and a letter.
- State Next(State state, Char letter) const;
-
- /// Should return true iff the state need to be processed.
- bool IsRequired(const State& /*state*/) const { return true; }
-
- /// Called when the set of new states is closed.
- void AcceptStates(const TVector<State>& newstates);
-
- /// Called for each transition from one new state to another.
- void Connect(size_t from, size_t to, Char letter);
-
- typedef bool Result;
- Result Success() { return true; }
- Result Failure() { return false; }
- };
-
- /**
- * A helper function for FSM determining and all determine-like algorithms
- * like scanners' agglutination.
- *
- * Given an indirectly specified automaton (through Task::Initial() and Task::Next()
- * functions, see above), performs a breadth-first traversal, finding and enumerating
- * all effectively reachable states. Then passes all found states and transitions
- * between them back to the task.
- *
- * Initial state is always placed at zero position.
- *
- * Please note that the function does not take care of any payload (including final flags);
- * it is the task's responsibility to agglutinate them properly.
- *
- * Returns task.Succeed() if everything was done; task.Failure() if maximum limit of state count was reached.
- */
- template<class Task>
- typename Task::Result Determine(Task& task, size_t maxSize)
- {
- typedef typename Task::State State;
- typedef typename Task::InvStates InvStates;
- typedef TDeque< TVector<size_t> > TransitionTable;
-
- TVector<State> states;
- InvStates invstates;
- TransitionTable transitions;
- TVector<size_t> stateIndices;
-
- states.push_back(task.Initial());
- invstates.insert(typename InvStates::value_type(states[0], 0));
-
- for (size_t stateIdx = 0; stateIdx < states.size(); ++stateIdx) {
- if (!task.IsRequired(states[stateIdx]))
- continue;
- TransitionTable::value_type row(task.Letters().Size());
- for (auto&& letter : task.Letters()) {
- State newState = task.Next(states[stateIdx], letter.first);
- auto i = invstates.find(newState);
- if (i == invstates.end()) {
- if (!maxSize--)
- return task.Failure();
- i = invstates.insert(typename InvStates::value_type(newState, states.size())).first;
- states.push_back(newState);
- }
- row[letter.second.first] = i->second;
- }
- transitions.push_back(row);
- stateIndices.push_back(stateIdx);
- }
-
- TVector<Char> invletters(task.Letters().Size());
- for (auto&& letter : task.Letters())
- invletters[letter.second.first] = letter.first;
-
- task.AcceptStates(states);
- size_t from = 0;
- for (TransitionTable::iterator i = transitions.begin(), ie = transitions.end(); i != ie; ++i, ++from) {
- TVector<Char>::iterator l = invletters.begin();
- for (TransitionTable::value_type::iterator j = i->begin(), je = i->end(); j != je; ++j, ++l)
- task.Connect(stateIndices[from], *j, *l);
- }
- return task.Success();
- }
-
- // Faster transition table representation for determined FSM
- typedef TVector<size_t> DeterminedTransitions;
- }
+ namespace Impl {
+
+ /**
+ * An interface of a determination task.
+ * You don't have to derive from this class; it is just a start point template.
+ */
+ class DetermineTask {
+ private:
+ struct ImplementationSpecific1;
+ struct ImplementationSpecific2;
+
+ public:
+ /// A type representing a new state (may be a set of old states, a pair of them, etc...)
+ typedef ImplementationSpecific1 State;
+
+ /// A type of letter equivalence classes table.
+ typedef Partition<char, ImplementationSpecific2> LettersTbl;
+
+ /// A container used for storing map of states to thier indices.
+ typedef TMap<State, size_t> InvStates;
+
+ /// Should return used letters' partition.
+ const LettersTbl& Letters() const;
+
+ /// Should return initial state (surprise!)
+ State Initial() const;
+
+ /// Should calculate next state, given the current state and a letter.
+ State Next(State state, Char letter) const;
+
+ /// Should return true iff the state need to be processed.
+ bool IsRequired(const State& /*state*/) const { return true; }
+
+ /// Called when the set of new states is closed.
+ void AcceptStates(const TVector<State>& newstates);
+
+ /// Called for each transition from one new state to another.
+ void Connect(size_t from, size_t to, Char letter);
+
+ typedef bool Result;
+ Result Success() { return true; }
+ Result Failure() { return false; }
+ };
+
+ /**
+ * A helper function for FSM determining and all determine-like algorithms
+ * like scanners' agglutination.
+ *
+ * Given an indirectly specified automaton (through Task::Initial() and Task::Next()
+ * functions, see above), performs a breadth-first traversal, finding and enumerating
+ * all effectively reachable states. Then passes all found states and transitions
+ * between them back to the task.
+ *
+ * Initial state is always placed at zero position.
+ *
+ * Please note that the function does not take care of any payload (including final flags);
+ * it is the task's responsibility to agglutinate them properly.
+ *
+ * Returns task.Succeed() if everything was done; task.Failure() if maximum limit of state count was reached.
+ */
+ template<class Task>
+ typename Task::Result Determine(Task& task, size_t maxSize)
+ {
+ typedef typename Task::State State;
+ typedef typename Task::InvStates InvStates;
+ typedef TDeque< TVector<size_t> > TransitionTable;
+
+ TVector<State> states;
+ InvStates invstates;
+ TransitionTable transitions;
+ TVector<size_t> stateIndices;
+
+ states.push_back(task.Initial());
+ invstates.insert(typename InvStates::value_type(states[0], 0));
+
+ for (size_t stateIdx = 0; stateIdx < states.size(); ++stateIdx) {
+ if (!task.IsRequired(states[stateIdx]))
+ continue;
+ TransitionTable::value_type row(task.Letters().Size());
+ for (auto&& letter : task.Letters()) {
+ State newState = task.Next(states[stateIdx], letter.first);
+ auto i = invstates.find(newState);
+ if (i == invstates.end()) {
+ if (!maxSize--)
+ return task.Failure();
+ i = invstates.insert(typename InvStates::value_type(newState, states.size())).first;
+ states.push_back(newState);
+ }
+ row[letter.second.first] = i->second;
+ }
+ transitions.push_back(row);
+ stateIndices.push_back(stateIdx);
+ }
+
+ TVector<Char> invletters(task.Letters().Size());
+ for (auto&& letter : task.Letters())
+ invletters[letter.second.first] = letter.first;
+
+ task.AcceptStates(states);
+ size_t from = 0;
+ for (TransitionTable::iterator i = transitions.begin(), ie = transitions.end(); i != ie; ++i, ++from) {
+ TVector<Char>::iterator l = invletters.begin();
+ for (TransitionTable::value_type::iterator j = i->begin(), je = i->end(); j != je; ++j, ++l)
+ task.Connect(stateIndices[from], *j, *l);
+ }
+ return task.Success();
+ }
+
+ // Faster transition table representation for determined FSM
+ typedef TVector<size_t> DeterminedTransitions;
+ }
}
#endif
diff --git a/library/cpp/regex/pire/pire/easy.cpp b/library/cpp/regex/pire/pire/easy.cpp
index bcb56c693bb..8fd4d255f5e 100644
--- a/library/cpp/regex/pire/pire/easy.cpp
+++ b/library/cpp/regex/pire/pire/easy.cpp
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -26,7 +26,7 @@ namespace Pire {
const Option<const Encoding&> UTF8(&Pire::Encodings::Utf8);
const Option<const Encoding&> LATIN1(&Pire::Encodings::Latin1);
-
+
const Option<Feature::Ptr> I(&Pire::Features::CaseInsensitive);
const Option<Feature::Ptr> ANDNOT(&Pire::Features::AndNotSupport);
diff --git a/library/cpp/regex/pire/pire/easy.h b/library/cpp/regex/pire/pire/easy.h
index 6434cd6f223..2fa3cbd5256 100644
--- a/library/cpp/regex/pire/pire/easy.h
+++ b/library/cpp/regex/pire/pire/easy.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -57,76 +57,76 @@
#include "vbitset.h"
namespace Pire {
-
+
template<class Arg> class Option;
class Options {
public:
- Options(): m_encoding(&Pire::Encodings::Latin1()) {}
- ~Options() { Clear(); }
-
- void Add(const Pire::Encoding& encoding) { m_encoding = &encoding; }
- void Add(Feature::Ptr&& feature) { m_features.push_back(std::move(feature)); }
-
- struct Proxy {
- Options* const o;
- /*implicit*/ Proxy(Options* opts): o(opts) {}
- };
- operator Proxy() { return Proxy(this); }
-
- Options(Options& o): m_encoding(o.m_encoding) { m_features.swap(o.m_features); }
- Options& operator = (Options& o) { m_encoding = o.m_encoding; m_features = std::move(o.m_features); o.Clear(); return *this; }
-
- Options(Proxy p): m_encoding(p.o->m_encoding) { m_features.swap(p.o->m_features); }
- Options& operator = (Proxy p) { m_encoding = p.o->m_encoding; m_features = std::move(p.o->m_features); p.o->Clear(); return *this; }
-
- void Apply(Lexer& lexer)
- {
- lexer.SetEncoding(*m_encoding);
- for (auto&& i : m_features) {
- lexer.AddFeature(i);
- i = 0;
- }
- m_features.clear();
- }
-
- template<class ArgT>
- /*implicit*/ Options(const Option<ArgT>& option);
-
- const Pire::Encoding& Encoding() const { return *m_encoding; }
+ Options(): m_encoding(&Pire::Encodings::Latin1()) {}
+ ~Options() { Clear(); }
+
+ void Add(const Pire::Encoding& encoding) { m_encoding = &encoding; }
+ void Add(Feature::Ptr&& feature) { m_features.push_back(std::move(feature)); }
+
+ struct Proxy {
+ Options* const o;
+ /*implicit*/ Proxy(Options* opts): o(opts) {}
+ };
+ operator Proxy() { return Proxy(this); }
+
+ Options(Options& o): m_encoding(o.m_encoding) { m_features.swap(o.m_features); }
+ Options& operator = (Options& o) { m_encoding = o.m_encoding; m_features = std::move(o.m_features); o.Clear(); return *this; }
+
+ Options(Proxy p): m_encoding(p.o->m_encoding) { m_features.swap(p.o->m_features); }
+ Options& operator = (Proxy p) { m_encoding = p.o->m_encoding; m_features = std::move(p.o->m_features); p.o->Clear(); return *this; }
+
+ void Apply(Lexer& lexer)
+ {
+ lexer.SetEncoding(*m_encoding);
+ for (auto&& i : m_features) {
+ lexer.AddFeature(i);
+ i = 0;
+ }
+ m_features.clear();
+ }
+
+ template<class ArgT>
+ /*implicit*/ Options(const Option<ArgT>& option);
+
+ const Pire::Encoding& Encoding() const { return *m_encoding; }
private:
- const Pire::Encoding* m_encoding;
- TVector<Feature::Ptr> m_features;
-
- void Clear()
- {
- m_features.clear();
- }
+ const Pire::Encoding* m_encoding;
+ TVector<Feature::Ptr> m_features;
+
+ void Clear()
+ {
+ m_features.clear();
+ }
};
template<class Arg>
class Option {
public:
- typedef Arg (*Ctor)();
-
- Option(Ctor ctor): m_ctor(ctor) {}
-
- friend Options operator | (Options::Proxy options, const Option<Arg>& self)
- {
- Options ret(options);
- ret.Add((*self.m_ctor)());
- return ret;
- }
-
- template<class Arg2>
- friend Options operator | (const Option<Arg2>& a, const Option<Arg>& b)
- {
- return Options() | a | b;
- }
+ typedef Arg (*Ctor)();
+
+ Option(Ctor ctor): m_ctor(ctor) {}
+
+ friend Options operator | (Options::Proxy options, const Option<Arg>& self)
+ {
+ Options ret(options);
+ ret.Add((*self.m_ctor)());
+ return ret;
+ }
+
+ template<class Arg2>
+ friend Options operator | (const Option<Arg2>& a, const Option<Arg>& b)
+ {
+ return Options() | a | b;
+ }
private:
- Ctor m_ctor;
+ Ctor m_ctor;
};
@@ -139,109 +139,109 @@ extern const Option<Feature::Ptr> ANDNOT;
class Regexp {
public:
- template<class Pattern>
- explicit Regexp(Pattern pattern, Options options = Options())
- {
- Init(PatternBounds(pattern), options);
- }
-
- template<class Pattern, class Arg>
- Regexp(Pattern pattern, Option<Arg> option)
- {
- Init(PatternBounds(pattern), Options() | option);
- }
-
- explicit Regexp(Scanner sc): m_scanner(sc) {}
- explicit Regexp(SlowScanner ssc): m_slow(ssc) {}
-
- bool Matches(TStringBuf buf) const
- {
- if (!m_scanner.Empty())
- return Runner(m_scanner).Begin().Run(buf).End();
- else
- return Runner(m_slow).Begin().Run(buf).End();
- }
-
- bool Matches(const char* begin, const char* end) const
- {
- return Matches(TStringBuf(begin, end));
- }
-
- /// A helper class allowing '==~' operator for regexps
- class MatchProxy {
- public:
- MatchProxy(const Regexp& re): m_re(&re) {}
- friend bool operator == (const char* str, const MatchProxy& re) { return re.m_re->Matches(str); }
- friend bool operator == (const ystring& str, const MatchProxy& re) { return re.m_re->Matches(str); }
-
- private:
- const Regexp* m_re;
- };
- MatchProxy operator ~() const { return MatchProxy(*this); }
-
+ template<class Pattern>
+ explicit Regexp(Pattern pattern, Options options = Options())
+ {
+ Init(PatternBounds(pattern), options);
+ }
+
+ template<class Pattern, class Arg>
+ Regexp(Pattern pattern, Option<Arg> option)
+ {
+ Init(PatternBounds(pattern), Options() | option);
+ }
+
+ explicit Regexp(Scanner sc): m_scanner(sc) {}
+ explicit Regexp(SlowScanner ssc): m_slow(ssc) {}
+
+ bool Matches(TStringBuf buf) const
+ {
+ if (!m_scanner.Empty())
+ return Runner(m_scanner).Begin().Run(buf).End();
+ else
+ return Runner(m_slow).Begin().Run(buf).End();
+ }
+
+ bool Matches(const char* begin, const char* end) const
+ {
+ return Matches(TStringBuf(begin, end));
+ }
+
+ /// A helper class allowing '==~' operator for regexps
+ class MatchProxy {
+ public:
+ MatchProxy(const Regexp& re): m_re(&re) {}
+ friend bool operator == (const char* str, const MatchProxy& re) { return re.m_re->Matches(str); }
+ friend bool operator == (const ystring& str, const MatchProxy& re) { return re.m_re->Matches(str); }
+
+ private:
+ const Regexp* m_re;
+ };
+ MatchProxy operator ~() const { return MatchProxy(*this); }
+
private:
- Scanner m_scanner;
- SlowScanner m_slow;
-
- ypair<const char*, const char*> PatternBounds(const ystring& pattern)
- {
- static const char c = 0;
- return pattern.empty() ? ymake_pair(&c, &c) : ymake_pair(pattern.c_str(), pattern.c_str() + pattern.size());
- }
-
- ypair<const char*, const char*> PatternBounds(const char* pattern)
- {
- return ymake_pair(pattern, pattern + strlen(pattern));
- }
-
- void Init(ypair<const char*, const char*> rawPattern, Options options)
- {
- TVector<wchar32> pattern;
- options.Encoding().FromLocal(rawPattern.first, rawPattern.second, std::back_inserter(pattern));
-
- Lexer lexer(pattern);
- options.Apply(lexer);
- Fsm fsm = lexer.Parse();
-
- if (!BeginsWithCircumflex(fsm))
- fsm.PrependAnything();
- fsm.AppendAnything();
-
- if (fsm.Determine())
- m_scanner = fsm.Compile<Scanner>();
- else
- m_slow = fsm.Compile<SlowScanner>();
- }
-
- static bool BeginsWithCircumflex(const Fsm& fsm)
- {
- typedef Fsm::StatesSet Set;
- TDeque<size_t> queue;
- BitSet handled(fsm.Size());
-
- queue.push_back(fsm.Initial());
- handled.Set(fsm.Initial());
-
- while (!queue.empty()) {
- Set s = fsm.Destinations(queue.front(), SpecialChar::Epsilon);
- for (auto&& i : s) {
- if (!handled.Test(i)) {
- handled.Set(i);
- queue.push_back(i);
- }
- }
-
- TSet<Char> lets = fsm.OutgoingLetters(queue.front());
- lets.erase(SpecialChar::Epsilon);
- lets.erase(SpecialChar::BeginMark);
- if (!lets.empty())
- return false;
-
- queue.pop_front();
- }
-
- return true;
- }
+ Scanner m_scanner;
+ SlowScanner m_slow;
+
+ ypair<const char*, const char*> PatternBounds(const ystring& pattern)
+ {
+ static const char c = 0;
+ return pattern.empty() ? ymake_pair(&c, &c) : ymake_pair(pattern.c_str(), pattern.c_str() + pattern.size());
+ }
+
+ ypair<const char*, const char*> PatternBounds(const char* pattern)
+ {
+ return ymake_pair(pattern, pattern + strlen(pattern));
+ }
+
+ void Init(ypair<const char*, const char*> rawPattern, Options options)
+ {
+ TVector<wchar32> pattern;
+ options.Encoding().FromLocal(rawPattern.first, rawPattern.second, std::back_inserter(pattern));
+
+ Lexer lexer(pattern);
+ options.Apply(lexer);
+ Fsm fsm = lexer.Parse();
+
+ if (!BeginsWithCircumflex(fsm))
+ fsm.PrependAnything();
+ fsm.AppendAnything();
+
+ if (fsm.Determine())
+ m_scanner = fsm.Compile<Scanner>();
+ else
+ m_slow = fsm.Compile<SlowScanner>();
+ }
+
+ static bool BeginsWithCircumflex(const Fsm& fsm)
+ {
+ typedef Fsm::StatesSet Set;
+ TDeque<size_t> queue;
+ BitSet handled(fsm.Size());
+
+ queue.push_back(fsm.Initial());
+ handled.Set(fsm.Initial());
+
+ while (!queue.empty()) {
+ Set s = fsm.Destinations(queue.front(), SpecialChar::Epsilon);
+ for (auto&& i : s) {
+ if (!handled.Test(i)) {
+ handled.Set(i);
+ queue.push_back(i);
+ }
+ }
+
+ TSet<Char> lets = fsm.OutgoingLetters(queue.front());
+ lets.erase(SpecialChar::Epsilon);
+ lets.erase(SpecialChar::BeginMark);
+ if (!lets.empty())
+ return false;
+
+ queue.pop_front();
+ }
+
+ return true;
+ }
};
};
diff --git a/library/cpp/regex/pire/pire/encoding.cpp b/library/cpp/regex/pire/pire/encoding.cpp
index d5000c31464..1a2ac6872f3 100644
--- a/library/cpp/regex/pire/pire/encoding.cpp
+++ b/library/cpp/regex/pire/pire/encoding.cpp
@@ -37,97 +37,97 @@ namespace Pire {
namespace {
- class Latin1: public Encoding {
- public:
- Latin1() : Encoding() {}
-
- wchar32 FromLocal(const char*& begin, const char* end) const
- {
- if (begin == end)
- throw Error("EOF reached in Pire::Latin1::fromLocal()");
- else if (static_cast<unsigned char>(*begin) >= 0x80)
- throw Error("Pire::Latin1::fromLocal(): wrong character encountered (>=0x80)");
- else
- return (wchar32) *begin++;
- }
-
- ystring ToLocal(wchar32 ch) const
- {
- if (ch < 0x80)
- return ystring(1, (char) ch);
- else
- return ystring();
- }
-
- void AppendDot(Fsm& fsm) const { fsm.AppendDot(); }
- };
-
- namespace UtfRanges {
-
- static const size_t MaxLen = 4;
- static const size_t First[MaxLen][2] = {
- {0x00, 0x80},
- {0xC0, 0xE0},
- {0xE0, 0xF0},
- {0xF0, 0xF8}
- };
- static const size_t Next[2] = {0x80, 0xC0};
- }
-
-
- class Utf8: public Encoding {
- public:
- Utf8() : Encoding() {}
-
- wchar32 FromLocal(const char*& begin, const char* end) const
- {
- wchar32 rune;
- size_t len;
- if (SafeReadUTF8Char(rune, len, reinterpret_cast<const unsigned char*>(begin), reinterpret_cast<const unsigned char*>(end)) != RECODE_OK)
- throw Error("Error reading UTF8 sequence");
- begin += len;
- return rune;
- }
-
- ystring ToLocal(wchar32 c) const
- {
- ystring ret(UTF8RuneLenByUCS(c), ' ');
- size_t len;
- unsigned char* p = (unsigned char*) &*ret.begin();
- if (SafeWriteUTF8Char(c, len, p, p + ret.size()) != RECODE_OK)
- Y_ASSERT(!"Pire::UTF8::toLocal(): Internal error");
- return ret;
- }
-
- void AppendDot(Fsm& fsm) const
- {
- size_t last = fsm.Resize(fsm.Size() + UtfRanges::MaxLen);
- for (size_t i = 0; i < UtfRanges::MaxLen; ++i)
- for (size_t letter = UtfRanges::First[i][0]; letter < UtfRanges::First[i][1]; ++letter)
- fsm.ConnectFinal(fsm.Size() - i - 1, letter);
- for (size_t i = 0; i < UtfRanges::MaxLen - 1; ++i)
- for (size_t letter = UtfRanges::Next[0]; letter < UtfRanges::Next[1]; ++letter)
- fsm.Connect(last + i, last + i + 1, letter);
- fsm.ClearFinal();
- fsm.SetFinal(fsm.Size() - 1, true);
- fsm.SetIsDetermined(false);
- }
- };
+ class Latin1: public Encoding {
+ public:
+ Latin1() : Encoding() {}
+
+ wchar32 FromLocal(const char*& begin, const char* end) const
+ {
+ if (begin == end)
+ throw Error("EOF reached in Pire::Latin1::fromLocal()");
+ else if (static_cast<unsigned char>(*begin) >= 0x80)
+ throw Error("Pire::Latin1::fromLocal(): wrong character encountered (>=0x80)");
+ else
+ return (wchar32) *begin++;
+ }
+
+ ystring ToLocal(wchar32 ch) const
+ {
+ if (ch < 0x80)
+ return ystring(1, (char) ch);
+ else
+ return ystring();
+ }
+
+ void AppendDot(Fsm& fsm) const { fsm.AppendDot(); }
+ };
+
+ namespace UtfRanges {
+
+ static const size_t MaxLen = 4;
+ static const size_t First[MaxLen][2] = {
+ {0x00, 0x80},
+ {0xC0, 0xE0},
+ {0xE0, 0xF0},
+ {0xF0, 0xF8}
+ };
+ static const size_t Next[2] = {0x80, 0xC0};
+ }
+
+
+ class Utf8: public Encoding {
+ public:
+ Utf8() : Encoding() {}
+
+ wchar32 FromLocal(const char*& begin, const char* end) const
+ {
+ wchar32 rune;
+ size_t len;
+ if (SafeReadUTF8Char(rune, len, reinterpret_cast<const unsigned char*>(begin), reinterpret_cast<const unsigned char*>(end)) != RECODE_OK)
+ throw Error("Error reading UTF8 sequence");
+ begin += len;
+ return rune;
+ }
+
+ ystring ToLocal(wchar32 c) const
+ {
+ ystring ret(UTF8RuneLenByUCS(c), ' ');
+ size_t len;
+ unsigned char* p = (unsigned char*) &*ret.begin();
+ if (SafeWriteUTF8Char(c, len, p, p + ret.size()) != RECODE_OK)
+ Y_ASSERT(!"Pire::UTF8::toLocal(): Internal error");
+ return ret;
+ }
+
+ void AppendDot(Fsm& fsm) const
+ {
+ size_t last = fsm.Resize(fsm.Size() + UtfRanges::MaxLen);
+ for (size_t i = 0; i < UtfRanges::MaxLen; ++i)
+ for (size_t letter = UtfRanges::First[i][0]; letter < UtfRanges::First[i][1]; ++letter)
+ fsm.ConnectFinal(fsm.Size() - i - 1, letter);
+ for (size_t i = 0; i < UtfRanges::MaxLen - 1; ++i)
+ for (size_t letter = UtfRanges::Next[0]; letter < UtfRanges::Next[1]; ++letter)
+ fsm.Connect(last + i, last + i + 1, letter);
+ fsm.ClearFinal();
+ fsm.SetFinal(fsm.Size() - 1, true);
+ fsm.SetIsDetermined(false);
+ }
+ };
}
namespace Encodings {
- const Encoding& Utf8()
- {
- static const Pire::Utf8 utf8;
- return utf8;
- }
-
- const Encoding& Latin1()
- {
- static const Pire::Latin1 latin1;
- return latin1;
- }
+ const Encoding& Utf8()
+ {
+ static const Pire::Utf8 utf8;
+ return utf8;
+ }
+
+ const Encoding& Latin1()
+ {
+ static const Pire::Latin1 latin1;
+ return latin1;
+ }
}
diff --git a/library/cpp/regex/pire/pire/encoding.h b/library/cpp/regex/pire/pire/encoding.h
index 5009d18cdac..fceab3b9751 100644
--- a/library/cpp/regex/pire/pire/encoding.h
+++ b/library/cpp/regex/pire/pire/encoding.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -34,34 +34,34 @@ class Fsm;
class Encoding {
public:
- virtual ~Encoding() {}
-
- /// Should read bytes from @p begin and return the corresponding Unicode
- /// character, advancing @p begin.
- virtual wchar32 FromLocal(const char*& begin, const char* end) const = 0;
-
- /// Opposite to FromLocal(), transforms given Unicode character into
- /// the string in the encoding.
- virtual ystring ToLocal(wchar32 c) const = 0;
-
- /// Given the FSM, should append the representation of a dot in the ecoding
- /// to that FSM.
- virtual void AppendDot(Fsm&) const = 0;
-
- template<class OutputIter>
- OutputIter FromLocal(const char* begin, const char* end, OutputIter iter) const
- {
- while (begin != end) {
- *iter = FromLocal(begin, end);
- ++iter;
- }
- return iter;
- }
+ virtual ~Encoding() {}
+
+ /// Should read bytes from @p begin and return the corresponding Unicode
+ /// character, advancing @p begin.
+ virtual wchar32 FromLocal(const char*& begin, const char* end) const = 0;
+
+ /// Opposite to FromLocal(), transforms given Unicode character into
+ /// the string in the encoding.
+ virtual ystring ToLocal(wchar32 c) const = 0;
+
+ /// Given the FSM, should append the representation of a dot in the ecoding
+ /// to that FSM.
+ virtual void AppendDot(Fsm&) const = 0;
+
+ template<class OutputIter>
+ OutputIter FromLocal(const char* begin, const char* end, OutputIter iter) const
+ {
+ while (begin != end) {
+ *iter = FromLocal(begin, end);
+ ++iter;
+ }
+ return iter;
+ }
};
namespace Encodings {
- const Encoding& Latin1();
- const Encoding& Utf8();
+ const Encoding& Latin1();
+ const Encoding& Utf8();
};
diff --git a/library/cpp/regex/pire/pire/extra.h b/library/cpp/regex/pire/pire/extra.h
index 1ee9eee9eea..b7bcf9e4544 100644
--- a/library/cpp/regex/pire/pire/extra.h
+++ b/library/cpp/regex/pire/pire/extra.h
@@ -12,7 +12,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
diff --git a/library/cpp/regex/pire/pire/extra/capture.cpp b/library/cpp/regex/pire/pire/extra/capture.cpp
index fb4cdf6d815..47ec60d7f30 100644
--- a/library/cpp/regex/pire/pire/extra/capture.cpp
+++ b/library/cpp/regex/pire/pire/extra/capture.cpp
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -26,110 +26,110 @@
#include "capture.h"
namespace Pire {
-
+
namespace {
- class CaptureImpl: public Feature {
- public:
- CaptureImpl(size_t pos)
- : State(0)
- , Pos(pos)
- , Level(0)
- , StateRepetition(NoRepetition)
- {}
-
- bool Accepts(wchar32 c) const { return c == '(' || c == '+' || c == '*' || c == '?' || c == '{'; }
- Term Lex()
- {
- wchar32 c = GetChar();
- if (!Accepts(c))
- Error("How did we get here?!..");
- if (c != '(') {
- wchar32 next = PeekChar();
- if (next == '?') {
- StateRepetition = NonGreedyRepetition;
- GetChar();
- }
- else
- StateRepetition = GreedyRepetition;
- }
- else if (State == 0 && Pos > 1)
- --Pos;
- else if (State == 0 && Pos == 1) {
- State = 1;
- Level = 0;
- } else if (State == 1) {
- ++Level;
- }
- if (c == '(')
- return Term(TokenTypes::Open);
- else if (c == '+')
- return Term::Repetition(1, Inf);
- else if (c == '*')
- return Term::Repetition(0, Inf);
- else if (c == '?')
- return Term::Repetition(0, 1);
- else {
- UngetChar(c);
- return Term(0);
- }
- }
-
- void Parenthesized(Fsm& fsm)
- {
- if (StateRepetition != NoRepetition) {
- bool greedy = (StateRepetition == GreedyRepetition);
- SetRepetitionMark(fsm, greedy);
- StateRepetition = NoRepetition;
- } else if (State == 1 && Level == 0) {
- SetCaptureMark(fsm);
- State = 2;
- } else if (State == 1 && Level > 0)
- --Level;
- }
- private:
- unsigned State;
- size_t Pos;
- size_t Level;
- RepetitionTypes StateRepetition;
+ class CaptureImpl: public Feature {
+ public:
+ CaptureImpl(size_t pos)
+ : State(0)
+ , Pos(pos)
+ , Level(0)
+ , StateRepetition(NoRepetition)
+ {}
+
+ bool Accepts(wchar32 c) const { return c == '(' || c == '+' || c == '*' || c == '?' || c == '{'; }
+ Term Lex()
+ {
+ wchar32 c = GetChar();
+ if (!Accepts(c))
+ Error("How did we get here?!..");
+ if (c != '(') {
+ wchar32 next = PeekChar();
+ if (next == '?') {
+ StateRepetition = NonGreedyRepetition;
+ GetChar();
+ }
+ else
+ StateRepetition = GreedyRepetition;
+ }
+ else if (State == 0 && Pos > 1)
+ --Pos;
+ else if (State == 0 && Pos == 1) {
+ State = 1;
+ Level = 0;
+ } else if (State == 1) {
+ ++Level;
+ }
+ if (c == '(')
+ return Term(TokenTypes::Open);
+ else if (c == '+')
+ return Term::Repetition(1, Inf);
+ else if (c == '*')
+ return Term::Repetition(0, Inf);
+ else if (c == '?')
+ return Term::Repetition(0, 1);
+ else {
+ UngetChar(c);
+ return Term(0);
+ }
+ }
- void SetRepetitionMark(Fsm& fsm, bool greedy)
- {
- fsm.Resize(fsm.Size() + 1);
- fsm.ConnectFinal(fsm.Size() - 1);
+ void Parenthesized(Fsm& fsm)
+ {
+ if (StateRepetition != NoRepetition) {
+ bool greedy = (StateRepetition == GreedyRepetition);
+ SetRepetitionMark(fsm, greedy);
+ StateRepetition = NoRepetition;
+ } else if (State == 1 && Level == 0) {
+ SetCaptureMark(fsm);
+ State = 2;
+ } else if (State == 1 && Level > 0)
+ --Level;
+ }
+ private:
+ unsigned State;
+ size_t Pos;
+ size_t Level;
+ RepetitionTypes StateRepetition;
- for (size_t state = 0; state < fsm.Size() - 1; ++state)
- if (fsm.IsFinal(state))
- if (greedy)
- fsm.SetOutput(state, fsm.Size() - 1, SlowCapturingScanner::EndRepetition);
- else
- fsm.SetOutput(state, fsm.Size() - 1, SlowCapturingScanner::EndNonGreedyRepetition);
- fsm.ClearFinal();
- fsm.SetFinal(fsm.Size() - 1, true);
- fsm.SetIsDetermined(false);
- }
+ void SetRepetitionMark(Fsm& fsm, bool greedy)
+ {
+ fsm.Resize(fsm.Size() + 1);
+ fsm.ConnectFinal(fsm.Size() - 1);
- void SetCaptureMark(Fsm& fsm)
- {
- fsm.Resize(fsm.Size() + 2);
- fsm.Connect(fsm.Size() - 2, fsm.Initial());
- fsm.ConnectFinal(fsm.Size() - 1);
+ for (size_t state = 0; state < fsm.Size() - 1; ++state)
+ if (fsm.IsFinal(state))
+ if (greedy)
+ fsm.SetOutput(state, fsm.Size() - 1, SlowCapturingScanner::EndRepetition);
+ else
+ fsm.SetOutput(state, fsm.Size() - 1, SlowCapturingScanner::EndNonGreedyRepetition);
+ fsm.ClearFinal();
+ fsm.SetFinal(fsm.Size() - 1, true);
+ fsm.SetIsDetermined(false);
+ }
- fsm.SetOutput(fsm.Size() - 2, fsm.Initial(), CapturingScanner::BeginCapture);
- for (size_t state = 0; state < fsm.Size() - 2; ++state)
- if (fsm.IsFinal(state))
- fsm.SetOutput(state, fsm.Size() - 1, CapturingScanner::EndCapture);
+ void SetCaptureMark(Fsm& fsm)
+ {
+ fsm.Resize(fsm.Size() + 2);
+ fsm.Connect(fsm.Size() - 2, fsm.Initial());
+ fsm.ConnectFinal(fsm.Size() - 1);
- fsm.SetInitial(fsm.Size() - 2);
- fsm.ClearFinal();
- fsm.SetFinal(fsm.Size() - 1, true);
- fsm.SetIsDetermined(false);
- }
-
- void FinishBuild() {}
- };
+ fsm.SetOutput(fsm.Size() - 2, fsm.Initial(), CapturingScanner::BeginCapture);
+ for (size_t state = 0; state < fsm.Size() - 2; ++state)
+ if (fsm.IsFinal(state))
+ fsm.SetOutput(state, fsm.Size() - 1, CapturingScanner::EndCapture);
+
+ fsm.SetInitial(fsm.Size() - 2);
+ fsm.ClearFinal();
+ fsm.SetFinal(fsm.Size() - 1, true);
+ fsm.SetIsDetermined(false);
+ }
+
+ void FinishBuild() {}
+ };
}
-
+
namespace Features {
- Feature::Ptr Capture(size_t pos) { return Feature::Ptr(new CaptureImpl(pos)); }
+ Feature::Ptr Capture(size_t pos) { return Feature::Ptr(new CaptureImpl(pos)); }
};
}
diff --git a/library/cpp/regex/pire/pire/extra/capture.h b/library/cpp/regex/pire/pire/extra/capture.h
index 8ed6bc7f801..b4dab8fd235 100644
--- a/library/cpp/regex/pire/pire/extra/capture.h
+++ b/library/cpp/regex/pire/pire/extra/capture.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -48,121 +48,121 @@ namespace Pire {
*/
class CapturingScanner: public LoadedScanner {
public:
- enum {
- NoAction = 0,
- BeginCapture = 1,
- EndCapture = 2,
-
- FinalFlag = 1
- };
-
- class State {
- public:
- bool Captured() const { return (m_begin != npos) && (m_end != npos); }
- size_t Begin() const { return m_begin; }
- size_t End() const { return m_end; }
- private:
- static const size_t npos = static_cast<size_t>(-1);
- size_t m_state;
- size_t m_begin;
- size_t m_end;
- size_t m_counter;
- friend class CapturingScanner;
+ enum {
+ NoAction = 0,
+ BeginCapture = 1,
+ EndCapture = 2,
+
+ FinalFlag = 1
+ };
+
+ class State {
+ public:
+ bool Captured() const { return (m_begin != npos) && (m_end != npos); }
+ size_t Begin() const { return m_begin; }
+ size_t End() const { return m_end; }
+ private:
+ static const size_t npos = static_cast<size_t>(-1);
+ size_t m_state;
+ size_t m_begin;
+ size_t m_end;
+ size_t m_counter;
+ friend class CapturingScanner;
#ifdef PIRE_DEBUG
- friend yostream& operator << (yostream& s, const State& state)
- {
- s << state.m_state;
- if (state.m_begin != State::npos || state.m_end != npos) {
- s << " [";
- if (state.m_begin != State::npos)
- s << 'b';
- if (state.m_end != State::npos)
- s << 'e';
- s << "]";
- }
- return s;
- }
+ friend yostream& operator << (yostream& s, const State& state)
+ {
+ s << state.m_state;
+ if (state.m_begin != State::npos || state.m_end != npos) {
+ s << " [";
+ if (state.m_begin != State::npos)
+ s << 'b';
+ if (state.m_end != State::npos)
+ s << 'e';
+ s << "]";
+ }
+ return s;
+ }
#endif
- };
-
- void Initialize(State& state) const
- {
- state.m_state = m.initial;
- state.m_begin = state.m_end = State::npos;
- state.m_counter = 0;
- }
-
- void TakeAction(State& s, Action a) const
- {
- if ((a & BeginCapture) && !s.Captured())
- s.m_begin = s.m_counter - 1;
- else if (a & EndCapture) {
- if (s.m_end == State::npos)
- s.m_end = s.m_counter - 1;
- }
- }
-
- Char Translate(Char ch) const
- {
- return m_letters[static_cast<size_t>(ch)];
- }
-
- Action NextTranslated(State& s, unsigned char c) const
- {
- Transition x = reinterpret_cast<const Transition*>(s.m_state)[c];
- s.m_state += SignExtend(x.shift);
- ++s.m_counter;
-
- return x.action;
- }
-
- Action Next(State& s, Char c) const
- {
- return NextTranslated(s, Translate(c));
- }
-
- Action Next(const State& current, State& n, Char c) const
- {
- n = current;
- return Next(n, c);
- }
-
- bool CanStop(const State& s) const
- {
- return Final(s);
- }
-
- bool Final(const State& s) const { return m_tags[(reinterpret_cast<Transition*>(s.m_state) - m_jumps) / m.lettersCount] & FinalFlag; }
-
- bool Dead(const State&) const { return false; }
-
- CapturingScanner() {}
- CapturingScanner(const CapturingScanner& s): LoadedScanner(s) {}
- explicit CapturingScanner(Fsm& fsm, size_t distance = 0)
- {
- if (distance) {
- fsm = CreateApproxFsm(fsm, distance);
- }
- fsm.Canonize();
- Init(fsm.Size(), fsm.Letters(), fsm.Initial());
- BuildScanner(fsm, *this);
- }
-
- void Swap(CapturingScanner& s) { LoadedScanner::Swap(s); }
- CapturingScanner& operator = (const CapturingScanner& s) { CapturingScanner(s).Swap(*this); return *this; }
-
- size_t StateIndex(const State& s) const { return StateIdx(s.m_state); }
+ };
+
+ void Initialize(State& state) const
+ {
+ state.m_state = m.initial;
+ state.m_begin = state.m_end = State::npos;
+ state.m_counter = 0;
+ }
+
+ void TakeAction(State& s, Action a) const
+ {
+ if ((a & BeginCapture) && !s.Captured())
+ s.m_begin = s.m_counter - 1;
+ else if (a & EndCapture) {
+ if (s.m_end == State::npos)
+ s.m_end = s.m_counter - 1;
+ }
+ }
+
+ Char Translate(Char ch) const
+ {
+ return m_letters[static_cast<size_t>(ch)];
+ }
+
+ Action NextTranslated(State& s, unsigned char c) const
+ {
+ Transition x = reinterpret_cast<const Transition*>(s.m_state)[c];
+ s.m_state += SignExtend(x.shift);
+ ++s.m_counter;
+
+ return x.action;
+ }
+
+ Action Next(State& s, Char c) const
+ {
+ return NextTranslated(s, Translate(c));
+ }
+
+ Action Next(const State& current, State& n, Char c) const
+ {
+ n = current;
+ return Next(n, c);
+ }
+
+ bool CanStop(const State& s) const
+ {
+ return Final(s);
+ }
+
+ bool Final(const State& s) const { return m_tags[(reinterpret_cast<Transition*>(s.m_state) - m_jumps) / m.lettersCount] & FinalFlag; }
+
+ bool Dead(const State&) const { return false; }
+
+ CapturingScanner() {}
+ CapturingScanner(const CapturingScanner& s): LoadedScanner(s) {}
+ explicit CapturingScanner(Fsm& fsm, size_t distance = 0)
+ {
+ if (distance) {
+ fsm = CreateApproxFsm(fsm, distance);
+ }
+ fsm.Canonize();
+ Init(fsm.Size(), fsm.Letters(), fsm.Initial());
+ BuildScanner(fsm, *this);
+ }
+
+ void Swap(CapturingScanner& s) { LoadedScanner::Swap(s); }
+ CapturingScanner& operator = (const CapturingScanner& s) { CapturingScanner(s).Swap(*this); return *this; }
+
+ size_t StateIndex(const State& s) const { return StateIdx(s.m_state); }
private:
- friend void BuildScanner<CapturingScanner>(const Fsm&, CapturingScanner&);
+ friend void BuildScanner<CapturingScanner>(const Fsm&, CapturingScanner&);
};
enum RepetitionTypes { // They are sorted by their priorities
- NonGreedyRepetition,
- NoRepetition,
- GreedyRepetition,
+ NonGreedyRepetition,
+ NoRepetition,
+ GreedyRepetition,
};
/**
@@ -171,419 +171,419 @@ enum RepetitionTypes { // They are sorted by their priorities
*/
class SlowCapturingScanner : public SlowScanner {
public:
- enum {
- Nothing = 0,
- BeginCapture = 1,
- EndCapture = 2,
- EndRepetition = 4,
- EndNonGreedyRepetition = 8,
-
- FinalFlag = 1,
- };
-
- const ui32 ActionsCapture = BeginCapture | EndCapture;
-
- class SingleState {
- public:
- bool Captured() const
- {
- return (m_begin != m_npos && m_end != m_npos);
- }
-
- bool HasBegin() const
- {
- return (m_begin != m_npos);
- }
-
- bool HasEnd() const
- {
- return (m_end != m_npos);
- }
-
- SingleState(size_t num = 0)
- {
- m_state = num;
- m_begin = m_npos;
- m_end = m_npos;
- }
-
- void SetBegin(size_t pos)
- {
- if (m_begin == m_npos)
- m_begin = pos;
- }
-
- void SetEnd(size_t pos)
- {
- if (m_end == m_npos)
- m_end = pos;
- }
-
- size_t Begin() const
- {
- return GetBegin();
- }
-
- size_t End() const
- {
- return GetEnd();
- }
-
- size_t GetBegin() const
- {
- return m_begin;
- }
-
- size_t GetEnd() const
- {
- return m_end;
- }
-
- size_t GetNum() const
- {
- return m_state;
- }
-
- private:
- size_t m_state;
- size_t m_begin;
- size_t m_end;
- static const size_t m_npos = static_cast<size_t>(-1);
- };
-
- class State {
- public:
- State()
- : m_strpos(0)
- , m_matched(false) {}
-
- size_t GetPos() const
- {
- return m_strpos;
- }
-
- const SingleState& GetState(size_t pos) const
- {
- return m_states[pos];
- }
-
- void SetPos(size_t newPos)
- {
- m_strpos = newPos;
- }
-
- void PushState(SingleState& st)
- {
- m_states.push_back(st);
- }
-
- void PushState(const SingleState& st)
- {
- m_states.push_back(st);
- }
-
- size_t GetSize() const
- {
- return m_states.size();
- }
-
- const TVector<SingleState>& GetStates() const
- {
- return m_states;
- }
-
- bool IsMatched() const
- {
- return m_matched;
- }
-
- const SingleState& GetMatched() const
- {
- return m_match;
- }
-
- void AddMatch(const SingleState& Matched)
- {
- m_matched = true;
- m_match = Matched;
- }
-
- private:
- TVector<SingleState> m_states;
- size_t m_strpos;
- bool m_matched;
- SingleState m_match;
- };
-
- class Transition {
- private:
- unsigned long m_stateto;
- Action m_action;
-
- public:
- unsigned long GetState() const
- {
- return m_stateto;
- }
-
- Action GetAction() const
- {
- return m_action;
- }
-
- Transition(unsigned long state, Action act = 0)
- : m_stateto(state)
- , m_action(act)
- {
- }
- };
-
- class PriorityStates {
- private:
- TVector<SingleState> m_nonGreedy;
- TVector<SingleState> m_nothing;
- TVector<SingleState> m_greedy;
-
- public:
- void Push(const SingleState& st, RepetitionTypes repetition)
- {
- Get(repetition).push_back(st);
- }
-
- TVector<SingleState>& Get(RepetitionTypes repetition)
- {
- switch (repetition) {
- case NonGreedyRepetition:
- return m_nonGreedy;
- case NoRepetition:
- return m_nothing;
- case GreedyRepetition:
- return m_greedy;
- }
- }
-
- const TVector<SingleState>& Get(RepetitionTypes repetition) const
- {
- switch (repetition) {
- case NonGreedyRepetition:
- return m_nonGreedy;
- case NoRepetition:
- return m_nothing;
- case GreedyRepetition:
- return m_greedy;
- }
- }
- };
-
- SlowScanner::State GetNextStates(const SingleState& cur, Char letter) const
- {
- SlowScanner::State st(GetSize());
- st.states.push_back(cur.GetNum());
- SlowScanner::State nextState(GetSize());
- SlowScanner::NextTranslated(st, nextState, letter);
- return nextState;
- }
-
- size_t GetPosition(const SingleState& state, Char letter) const
- {
- return state.GetNum() * GetLettersCount() + letter;
- }
-
- void NextStates(const SingleState& state, Char letter, TVector<Transition>& nextStates) const
- {
- if (IsMmaped()) {
- const size_t* pos = GetJumpPos() + GetPosition(state, letter);
- size_t posBegin = pos[0];
- size_t posEnd = pos[1];
- for (size_t i = posBegin; i < posEnd; ++i)
- nextStates.emplace_back(GetJump(i), GetAction(i));
- } else {
- size_t num = GetPosition(state, letter);
- const auto& jumpVec = GetJumpsVec(num);
- const auto& actionVec = GetActionsVec(num);
- for (size_t i = 0; i < jumpVec.size(); ++i)
- nextStates.emplace_back(jumpVec[i], actionVec[i]);
- }
- }
-
- void InsertStates(const PriorityStates& states, TVector<SingleState>& nonGreedy, TVector<SingleState>& nothing, TVector<SingleState>& greedy) const
- {
- for (auto& greed : {ymake_pair(&nonGreedy, NonGreedyRepetition), ymake_pair(&nothing, NoRepetition), ymake_pair(&greedy, GreedyRepetition)}) {
- auto& vec = greed.first;
- auto& tag = greed.second;
- vec->insert(vec->end(), states.Get(tag).begin(), states.Get(tag).end());
- }
- }
-
- void NextAndGetToGroups(PriorityStates& states, const SingleState& cur,
- Char letter, size_t pos, TVector<bool>& used) const
- {
- TVector<Transition> nextStates;
- NextStates(cur, letter, nextStates);
- for (const auto& trans : nextStates) {
- size_t st = trans.GetState();
- if (used[st])
- continue;
- used[st] = true;
- SingleState state(st);
- const auto& action = trans.GetAction();
- state.SetBegin(cur.GetBegin());
- state.SetEnd(cur.GetEnd());
- if (action & BeginCapture && !cur.HasBegin()) {
- state.SetBegin(pos);
- }
- if (action & EndCapture && !cur.HasEnd()) {
- state.SetEnd(pos);
- }
- PriorityStates statesInside;
- NextAndGetToGroups(statesInside, state, Translate(Epsilon), pos, used);
- statesInside.Push(state, NoRepetition);
- if (action & EndNonGreedyRepetition) {
- auto& nongreedy = states.Get(NonGreedyRepetition);
- InsertStates(statesInside, nongreedy, nongreedy, nongreedy);
- }
- else if (!(action & EndRepetition))
- InsertStates(statesInside, states.Get(NonGreedyRepetition), states.Get(NoRepetition), states.Get(GreedyRepetition));
- else {
- auto& greedy = states.Get(GreedyRepetition);
- InsertStates(statesInside, greedy, greedy, greedy);
- }
- }
- }
-
- bool Captured(const SingleState& st, bool& matched) const
- {
- matched = false;
- if (IsFinal(st.GetNum())) {
- matched = true;
- if (st.HasBegin())
- return true;
- }
- TVector<Transition> nextStates;
- NextStates(st, Translate(EndMark), nextStates);
- for (const auto& trans : nextStates)
- {
- size_t state = trans.GetState();
- if (IsFinal(state)) {
- matched = true;
- if (st.HasBegin() || (trans.GetAction() & ActionsCapture))
- return true;
- } else { // After EndMark there can be Epsilon-transitions to the Final State
- TVector<Transition> epsilonTrans;
- SingleState newSt(state);
- NextStates(newSt, Translate(Epsilon), epsilonTrans);
- for (auto new_trans : epsilonTrans) {
- size_t fin = new_trans.GetState();
- if (IsFinal(fin)) {
- matched = true;
- if (st.HasBegin() || (trans.GetAction() & ActionsCapture))
- return true;
- }
- }
- }
- }
- return false;
- }
-
- bool Matched(const SingleState& st) const
- {
- bool matched;
- Captured(st, matched);
- return matched;
- }
-
- bool GetCapture(const State& st, SingleState& final) const
- {
- size_t pos = 0;
- bool matched = false;
- bool ans = false;
- while (pos < st.GetSize() && !matched) {
- ans = Captured(st.GetState(pos), matched);
- ++pos;
- }
- if (matched) {
- final = st.GetState(pos - 1);
- return ans;
- } else {
- if (st.IsMatched()) {
- final = st.GetMatched();
- return true;
- }
- return false;
- }
- }
-
- bool PushState(State& nlist, const SingleState& state) const
- {
- nlist.PushState(state);
- if (Matched(state)) {
- nlist.AddMatch(state);
- return true;
- }
- return false;
- }
-
- void UpdateNList(State& nlist, const PriorityStates& states) const
- {
- static constexpr std::array<RepetitionTypes, 3> m_type_by_priority{{NonGreedyRepetition, NoRepetition, GreedyRepetition}};
- for (const auto type : m_type_by_priority) {
- for (const auto& state : states.Get(type)) {
- if (PushState(nlist, state)) // Because we have strict priorities, after matching some state, we can be sure, that not states after will be better
- return;
- }
- }
- }
-
- void Initialize(State& nlist) const
- {
- PriorityStates states;
- SingleState init(GetStart());
- TVector<bool> used(GetSize());
- NextAndGetToGroups(states, init, Translate(BeginMark), 0, used);
- NextAndGetToGroups(states, 0, Translate(BeginMark), 0, used);
- UpdateNList(nlist, states);
- }
-
- Action NextTranslated(State& clist, Char letter) const
- {
- State nlist;
- if (clist.IsMatched())
- nlist.AddMatch(clist.GetMatched());
- nlist.SetPos(clist.GetPos() + 1);
- size_t strpos = nlist.GetPos();
- TVector<bool> used(GetSize());
- size_t pos = 0;
- while (pos < clist.GetSize()) {
- PriorityStates states;
- NextAndGetToGroups(states, clist.GetState(pos), letter, strpos, used);
- UpdateNList(nlist, states);
- ++pos;
- }
- DoSwap(clist, nlist);
- return 0;
- }
-
- void TakeAction(State&, Action) const {}
-
- Action Next(State& st, Char letter) const
- {
- return NextTranslated(st, Translate(letter));
- }
+ enum {
+ Nothing = 0,
+ BeginCapture = 1,
+ EndCapture = 2,
+ EndRepetition = 4,
+ EndNonGreedyRepetition = 8,
+
+ FinalFlag = 1,
+ };
+
+ const ui32 ActionsCapture = BeginCapture | EndCapture;
+
+ class SingleState {
+ public:
+ bool Captured() const
+ {
+ return (m_begin != m_npos && m_end != m_npos);
+ }
+
+ bool HasBegin() const
+ {
+ return (m_begin != m_npos);
+ }
+
+ bool HasEnd() const
+ {
+ return (m_end != m_npos);
+ }
+
+ SingleState(size_t num = 0)
+ {
+ m_state = num;
+ m_begin = m_npos;
+ m_end = m_npos;
+ }
+
+ void SetBegin(size_t pos)
+ {
+ if (m_begin == m_npos)
+ m_begin = pos;
+ }
+
+ void SetEnd(size_t pos)
+ {
+ if (m_end == m_npos)
+ m_end = pos;
+ }
+
+ size_t Begin() const
+ {
+ return GetBegin();
+ }
+
+ size_t End() const
+ {
+ return GetEnd();
+ }
+
+ size_t GetBegin() const
+ {
+ return m_begin;
+ }
+
+ size_t GetEnd() const
+ {
+ return m_end;
+ }
+
+ size_t GetNum() const
+ {
+ return m_state;
+ }
+
+ private:
+ size_t m_state;
+ size_t m_begin;
+ size_t m_end;
+ static const size_t m_npos = static_cast<size_t>(-1);
+ };
+
+ class State {
+ public:
+ State()
+ : m_strpos(0)
+ , m_matched(false) {}
+
+ size_t GetPos() const
+ {
+ return m_strpos;
+ }
+
+ const SingleState& GetState(size_t pos) const
+ {
+ return m_states[pos];
+ }
+
+ void SetPos(size_t newPos)
+ {
+ m_strpos = newPos;
+ }
+
+ void PushState(SingleState& st)
+ {
+ m_states.push_back(st);
+ }
+
+ void PushState(const SingleState& st)
+ {
+ m_states.push_back(st);
+ }
+
+ size_t GetSize() const
+ {
+ return m_states.size();
+ }
+
+ const TVector<SingleState>& GetStates() const
+ {
+ return m_states;
+ }
+
+ bool IsMatched() const
+ {
+ return m_matched;
+ }
+
+ const SingleState& GetMatched() const
+ {
+ return m_match;
+ }
+
+ void AddMatch(const SingleState& Matched)
+ {
+ m_matched = true;
+ m_match = Matched;
+ }
+
+ private:
+ TVector<SingleState> m_states;
+ size_t m_strpos;
+ bool m_matched;
+ SingleState m_match;
+ };
+
+ class Transition {
+ private:
+ unsigned long m_stateto;
+ Action m_action;
+
+ public:
+ unsigned long GetState() const
+ {
+ return m_stateto;
+ }
+
+ Action GetAction() const
+ {
+ return m_action;
+ }
+
+ Transition(unsigned long state, Action act = 0)
+ : m_stateto(state)
+ , m_action(act)
+ {
+ }
+ };
+
+ class PriorityStates {
+ private:
+ TVector<SingleState> m_nonGreedy;
+ TVector<SingleState> m_nothing;
+ TVector<SingleState> m_greedy;
+
+ public:
+ void Push(const SingleState& st, RepetitionTypes repetition)
+ {
+ Get(repetition).push_back(st);
+ }
+
+ TVector<SingleState>& Get(RepetitionTypes repetition)
+ {
+ switch (repetition) {
+ case NonGreedyRepetition:
+ return m_nonGreedy;
+ case NoRepetition:
+ return m_nothing;
+ case GreedyRepetition:
+ return m_greedy;
+ }
+ }
+
+ const TVector<SingleState>& Get(RepetitionTypes repetition) const
+ {
+ switch (repetition) {
+ case NonGreedyRepetition:
+ return m_nonGreedy;
+ case NoRepetition:
+ return m_nothing;
+ case GreedyRepetition:
+ return m_greedy;
+ }
+ }
+ };
+
+ SlowScanner::State GetNextStates(const SingleState& cur, Char letter) const
+ {
+ SlowScanner::State st(GetSize());
+ st.states.push_back(cur.GetNum());
+ SlowScanner::State nextState(GetSize());
+ SlowScanner::NextTranslated(st, nextState, letter);
+ return nextState;
+ }
+
+ size_t GetPosition(const SingleState& state, Char letter) const
+ {
+ return state.GetNum() * GetLettersCount() + letter;
+ }
+
+ void NextStates(const SingleState& state, Char letter, TVector<Transition>& nextStates) const
+ {
+ if (IsMmaped()) {
+ const size_t* pos = GetJumpPos() + GetPosition(state, letter);
+ size_t posBegin = pos[0];
+ size_t posEnd = pos[1];
+ for (size_t i = posBegin; i < posEnd; ++i)
+ nextStates.emplace_back(GetJump(i), GetAction(i));
+ } else {
+ size_t num = GetPosition(state, letter);
+ const auto& jumpVec = GetJumpsVec(num);
+ const auto& actionVec = GetActionsVec(num);
+ for (size_t i = 0; i < jumpVec.size(); ++i)
+ nextStates.emplace_back(jumpVec[i], actionVec[i]);
+ }
+ }
+
+ void InsertStates(const PriorityStates& states, TVector<SingleState>& nonGreedy, TVector<SingleState>& nothing, TVector<SingleState>& greedy) const
+ {
+ for (auto& greed : {ymake_pair(&nonGreedy, NonGreedyRepetition), ymake_pair(&nothing, NoRepetition), ymake_pair(&greedy, GreedyRepetition)}) {
+ auto& vec = greed.first;
+ auto& tag = greed.second;
+ vec->insert(vec->end(), states.Get(tag).begin(), states.Get(tag).end());
+ }
+ }
+
+ void NextAndGetToGroups(PriorityStates& states, const SingleState& cur,
+ Char letter, size_t pos, TVector<bool>& used) const
+ {
+ TVector<Transition> nextStates;
+ NextStates(cur, letter, nextStates);
+ for (const auto& trans : nextStates) {
+ size_t st = trans.GetState();
+ if (used[st])
+ continue;
+ used[st] = true;
+ SingleState state(st);
+ const auto& action = trans.GetAction();
+ state.SetBegin(cur.GetBegin());
+ state.SetEnd(cur.GetEnd());
+ if (action & BeginCapture && !cur.HasBegin()) {
+ state.SetBegin(pos);
+ }
+ if (action & EndCapture && !cur.HasEnd()) {
+ state.SetEnd(pos);
+ }
+ PriorityStates statesInside;
+ NextAndGetToGroups(statesInside, state, Translate(Epsilon), pos, used);
+ statesInside.Push(state, NoRepetition);
+ if (action & EndNonGreedyRepetition) {
+ auto& nongreedy = states.Get(NonGreedyRepetition);
+ InsertStates(statesInside, nongreedy, nongreedy, nongreedy);
+ }
+ else if (!(action & EndRepetition))
+ InsertStates(statesInside, states.Get(NonGreedyRepetition), states.Get(NoRepetition), states.Get(GreedyRepetition));
+ else {
+ auto& greedy = states.Get(GreedyRepetition);
+ InsertStates(statesInside, greedy, greedy, greedy);
+ }
+ }
+ }
+
+ bool Captured(const SingleState& st, bool& matched) const
+ {
+ matched = false;
+ if (IsFinal(st.GetNum())) {
+ matched = true;
+ if (st.HasBegin())
+ return true;
+ }
+ TVector<Transition> nextStates;
+ NextStates(st, Translate(EndMark), nextStates);
+ for (const auto& trans : nextStates)
+ {
+ size_t state = trans.GetState();
+ if (IsFinal(state)) {
+ matched = true;
+ if (st.HasBegin() || (trans.GetAction() & ActionsCapture))
+ return true;
+ } else { // After EndMark there can be Epsilon-transitions to the Final State
+ TVector<Transition> epsilonTrans;
+ SingleState newSt(state);
+ NextStates(newSt, Translate(Epsilon), epsilonTrans);
+ for (auto new_trans : epsilonTrans) {
+ size_t fin = new_trans.GetState();
+ if (IsFinal(fin)) {
+ matched = true;
+ if (st.HasBegin() || (trans.GetAction() & ActionsCapture))
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ }
+
+ bool Matched(const SingleState& st) const
+ {
+ bool matched;
+ Captured(st, matched);
+ return matched;
+ }
+
+ bool GetCapture(const State& st, SingleState& final) const
+ {
+ size_t pos = 0;
+ bool matched = false;
+ bool ans = false;
+ while (pos < st.GetSize() && !matched) {
+ ans = Captured(st.GetState(pos), matched);
+ ++pos;
+ }
+ if (matched) {
+ final = st.GetState(pos - 1);
+ return ans;
+ } else {
+ if (st.IsMatched()) {
+ final = st.GetMatched();
+ return true;
+ }
+ return false;
+ }
+ }
+
+ bool PushState(State& nlist, const SingleState& state) const
+ {
+ nlist.PushState(state);
+ if (Matched(state)) {
+ nlist.AddMatch(state);
+ return true;
+ }
+ return false;
+ }
+
+ void UpdateNList(State& nlist, const PriorityStates& states) const
+ {
+ static constexpr std::array<RepetitionTypes, 3> m_type_by_priority{{NonGreedyRepetition, NoRepetition, GreedyRepetition}};
+ for (const auto type : m_type_by_priority) {
+ for (const auto& state : states.Get(type)) {
+ if (PushState(nlist, state)) // Because we have strict priorities, after matching some state, we can be sure, that not states after will be better
+ return;
+ }
+ }
+ }
+
+ void Initialize(State& nlist) const
+ {
+ PriorityStates states;
+ SingleState init(GetStart());
+ TVector<bool> used(GetSize());
+ NextAndGetToGroups(states, init, Translate(BeginMark), 0, used);
+ NextAndGetToGroups(states, 0, Translate(BeginMark), 0, used);
+ UpdateNList(nlist, states);
+ }
+
+ Action NextTranslated(State& clist, Char letter) const
+ {
+ State nlist;
+ if (clist.IsMatched())
+ nlist.AddMatch(clist.GetMatched());
+ nlist.SetPos(clist.GetPos() + 1);
+ size_t strpos = nlist.GetPos();
+ TVector<bool> used(GetSize());
+ size_t pos = 0;
+ while (pos < clist.GetSize()) {
+ PriorityStates states;
+ NextAndGetToGroups(states, clist.GetState(pos), letter, strpos, used);
+ UpdateNList(nlist, states);
+ ++pos;
+ }
+ DoSwap(clist, nlist);
+ return 0;
+ }
+
+ void TakeAction(State&, Action) const {}
+
+ Action Next(State& st, Char letter) const
+ {
+ return NextTranslated(st, Translate(letter));
+ }
public:
- SlowCapturingScanner()
- : SlowScanner(true)
- {
- }
-
- SlowCapturingScanner(Fsm& fsm, size_t distance = 0)
- : SlowScanner(fsm, true, false, distance)
- {
- }
+ SlowCapturingScanner()
+ : SlowScanner(true)
+ {
+ }
+
+ SlowCapturingScanner(Fsm& fsm, size_t distance = 0)
+ : SlowScanner(fsm, true, false, distance)
+ {
+ }
};
namespace Features {
- Feature::Ptr Capture(size_t pos);
+ Feature::Ptr Capture(size_t pos);
}
}
diff --git a/library/cpp/regex/pire/pire/extra/count.cpp b/library/cpp/regex/pire/pire/extra/count.cpp
index 2fb7c8d0613..df3d520400a 100644
--- a/library/cpp/regex/pire/pire/extra/count.cpp
+++ b/library/cpp/regex/pire/pire/extra/count.cpp
@@ -44,221 +44,221 @@ class CountingFsmTask;
class CountingFsm {
public:
- typedef Fsm::LettersTbl LettersTbl;
-
- enum {
- NotMatched = 1 << 0,
- Matched = 1 << 1,
- Separated = 1 << 2,
- };
-
- explicit CountingFsm(Fsm re, Fsm sep)
- : mFsm(std::move(re))
- {
- mFsm.Canonize();
- const auto reMatchedStates = mFsm.Finals();
-
- sep.Canonize();
- for (size_t state = 0; state < sep.Size(); ++state) {
- sep.SetTag(state, Separated);
- }
- mFsm += sep;
-
- mReInitial = mFsm.Initial();
- const auto allowEmptySeparator = sep.IsFinal(sep.Initial());
- for (auto reMatchedState : reMatchedStates) {
- mFsm.SetTag(reMatchedState, Matched);
- if (allowEmptySeparator) {
- mFsm.SetFinal(reMatchedState, true);
- }
- }
-
- mFsm.PrependAnything();
- mFsm.RemoveEpsilons();
- }
-
- const LettersTbl& Letters() const {
- return mFsm.Letters();
- }
-
- const Fsm& Determined() const {
- return mDetermined;
- }
-
- Action Output(size_t from, Char letter) const {
- const auto& row = mActions[from];
- const auto it = row.find(letter);
- if (it != row.end()) {
- return it->second;
- } else {
- return 0;
- }
- }
-
- bool Simple() const {
- return mSimple;
- }
-
- bool Determine();
- void Minimize();
+ typedef Fsm::LettersTbl LettersTbl;
+
+ enum {
+ NotMatched = 1 << 0,
+ Matched = 1 << 1,
+ Separated = 1 << 2,
+ };
+
+ explicit CountingFsm(Fsm re, Fsm sep)
+ : mFsm(std::move(re))
+ {
+ mFsm.Canonize();
+ const auto reMatchedStates = mFsm.Finals();
+
+ sep.Canonize();
+ for (size_t state = 0; state < sep.Size(); ++state) {
+ sep.SetTag(state, Separated);
+ }
+ mFsm += sep;
+
+ mReInitial = mFsm.Initial();
+ const auto allowEmptySeparator = sep.IsFinal(sep.Initial());
+ for (auto reMatchedState : reMatchedStates) {
+ mFsm.SetTag(reMatchedState, Matched);
+ if (allowEmptySeparator) {
+ mFsm.SetFinal(reMatchedState, true);
+ }
+ }
+
+ mFsm.PrependAnything();
+ mFsm.RemoveEpsilons();
+ }
+
+ const LettersTbl& Letters() const {
+ return mFsm.Letters();
+ }
+
+ const Fsm& Determined() const {
+ return mDetermined;
+ }
+
+ Action Output(size_t from, Char letter) const {
+ const auto& row = mActions[from];
+ const auto it = row.find(letter);
+ if (it != row.end()) {
+ return it->second;
+ } else {
+ return 0;
+ }
+ }
+
+ bool Simple() const {
+ return mSimple;
+ }
+
+ bool Determine();
+ void Minimize();
private:
- void SwapTaskOutputs(CountingFsmTask& task);
+ void SwapTaskOutputs(CountingFsmTask& task);
private:
- Fsm mFsm;
- size_t mReInitial;
- Fsm mDetermined;
- TransitionTagTable mActions;
- bool mSimple;
+ Fsm mFsm;
+ size_t mReInitial;
+ Fsm mDetermined;
+ TransitionTagTable mActions;
+ bool mSimple;
};
class CountingFsmTask {
public:
- typedef Fsm::LettersTbl LettersTbl;
+ typedef Fsm::LettersTbl LettersTbl;
- virtual ~CountingFsmTask() {}
+ virtual ~CountingFsmTask() {}
- void Connect(size_t from, size_t to, Char letter) {
- mNewFsm.Connect(from, to, letter);
- }
+ void Connect(size_t from, size_t to, Char letter) {
+ mNewFsm.Connect(from, to, letter);
+ }
- typedef bool Result;
+ typedef bool Result;
- static Result Success() {
- return true;
- }
+ static Result Success() {
+ return true;
+ }
- static Result Failure() {
- return false;
- }
+ static Result Failure() {
+ return false;
+ }
- Fsm& Output() {
- return mNewFsm;
- }
+ Fsm& Output() {
+ return mNewFsm;
+ }
- TransitionTagTable& Actions() {
- return mNewActions;
- }
+ TransitionTagTable& Actions() {
+ return mNewActions;
+ }
protected:
- void ResizeOutput(size_t size) {
- mNewFsm.Resize(size);
- mNewActions.resize(size);
- }
+ void ResizeOutput(size_t size) {
+ mNewFsm.Resize(size);
+ mNewActions.resize(size);
+ }
private:
- Fsm mNewFsm;
- TransitionTagTable mNewActions;
+ Fsm mNewFsm;
+ TransitionTagTable mNewActions;
};
class StateLessForMinimize {
public:
- StateLessForMinimize(const CountingFsm& fsm) : mFsm(fsm) {}
-
- bool operator()(size_t first, size_t second) const {
- for (auto&& lettersEl : mFsm.Letters()) {
- const auto letter = lettersEl.first;
- if (mFsm.Output(first, letter) != mFsm.Output(second, letter)) {
- return mFsm.Output(first, letter) < mFsm.Output(second, letter);
- }
- }
- return false;
- }
+ StateLessForMinimize(const CountingFsm& fsm) : mFsm(fsm) {}
+
+ bool operator()(size_t first, size_t second) const {
+ for (auto&& lettersEl : mFsm.Letters()) {
+ const auto letter = lettersEl.first;
+ if (mFsm.Output(first, letter) != mFsm.Output(second, letter)) {
+ return mFsm.Output(first, letter) < mFsm.Output(second, letter);
+ }
+ }
+ return false;
+ }
private:
- const CountingFsm& mFsm;
+ const CountingFsm& mFsm;
};
class CountingFsmMinimizeTask : public CountingFsmTask {
public:
- explicit CountingFsmMinimizeTask(const CountingFsm& fsm)
- : mFsm(fsm)
- , reversedTransitions(fsm.Determined().Size())
- , StateClass(fsm.Determined().Size())
- , Classes(0)
- {
- TMap<size_t, size_t, StateLessForMinimize> stateClassMap = TMap<size_t, size_t, StateLessForMinimize>(StateLessForMinimize(mFsm));
- for (size_t state = 0; state < mFsm.Determined().Size(); ++state) {
- if (stateClassMap.find(state) == stateClassMap.end()) {
- stateClassMap[state] = Classes++;
- }
- StateClass[state] = stateClassMap[state];
- reversedTransitions[state].resize(mFsm.Letters().Size());
- }
-
- for (size_t state = 0; state < mFsm.Determined().Size(); ++state) {
- TSet<ypair<Char, size_t>> usedTransitions;
- for (const auto& letter : mFsm.Letters()) {
- const auto destination = Next(state, letter.first);
- const auto letterId = letter.second.first;
- if (usedTransitions.find(ymake_pair(letterId, destination)) == usedTransitions.end()) {
- usedTransitions.insert(ymake_pair(letterId, destination));
- reversedTransitions[destination][letterId].push_back(state);
- }
- }
- }
- }
-
- TVector<size_t>& GetStateClass() { return StateClass; }
-
- size_t& GetClassesNumber() { return Classes; }
-
- size_t LettersCount() const {
- return mFsm.Letters().Size();
- }
-
- bool IsDetermined() const {
- return mFsm.Determined().IsDetermined();
- }
-
- size_t Size() const {
- return mFsm.Determined().Size();
- }
-
- const TVector<size_t>& Previous(size_t state, size_t letter) const {
- return reversedTransitions[state][letter];
- }
-
- void AcceptStates() {
- ResizeOutput(Classes);
- auto& newFsm = Output();
- auto& newActions = Actions();
- newFsm.SetFinal(0, false);
-
- // Unite equality classes into new states
- for (size_t from = 0; from != Size(); ++from) {
- const auto fromMinimized = StateClass[from];
- for (auto&& letter : mFsm.Letters()) {
- const auto representative = letter.first;
- const auto next = Next(from, representative);
- const auto nextMinimized = StateClass[next];
- Connect(fromMinimized, nextMinimized, representative);
- const auto outputs = mFsm.Output(from, representative);
- if (outputs) {
- newActions[fromMinimized][representative] = outputs;
- }
- }
- if (mFsm.Determined().IsFinal(from)) {
- newFsm.SetFinal(fromMinimized, true);
- }
- }
- newFsm.SetInitial(StateClass[mFsm.Determined().Initial()]);
- newFsm.SetIsDetermined(true);
- }
+ explicit CountingFsmMinimizeTask(const CountingFsm& fsm)
+ : mFsm(fsm)
+ , reversedTransitions(fsm.Determined().Size())
+ , StateClass(fsm.Determined().Size())
+ , Classes(0)
+ {
+ TMap<size_t, size_t, StateLessForMinimize> stateClassMap = TMap<size_t, size_t, StateLessForMinimize>(StateLessForMinimize(mFsm));
+ for (size_t state = 0; state < mFsm.Determined().Size(); ++state) {
+ if (stateClassMap.find(state) == stateClassMap.end()) {
+ stateClassMap[state] = Classes++;
+ }
+ StateClass[state] = stateClassMap[state];
+ reversedTransitions[state].resize(mFsm.Letters().Size());
+ }
+
+ for (size_t state = 0; state < mFsm.Determined().Size(); ++state) {
+ TSet<ypair<Char, size_t>> usedTransitions;
+ for (const auto& letter : mFsm.Letters()) {
+ const auto destination = Next(state, letter.first);
+ const auto letterId = letter.second.first;
+ if (usedTransitions.find(ymake_pair(letterId, destination)) == usedTransitions.end()) {
+ usedTransitions.insert(ymake_pair(letterId, destination));
+ reversedTransitions[destination][letterId].push_back(state);
+ }
+ }
+ }
+ }
+
+ TVector<size_t>& GetStateClass() { return StateClass; }
+
+ size_t& GetClassesNumber() { return Classes; }
+
+ size_t LettersCount() const {
+ return mFsm.Letters().Size();
+ }
+
+ bool IsDetermined() const {
+ return mFsm.Determined().IsDetermined();
+ }
+
+ size_t Size() const {
+ return mFsm.Determined().Size();
+ }
+
+ const TVector<size_t>& Previous(size_t state, size_t letter) const {
+ return reversedTransitions[state][letter];
+ }
+
+ void AcceptStates() {
+ ResizeOutput(Classes);
+ auto& newFsm = Output();
+ auto& newActions = Actions();
+ newFsm.SetFinal(0, false);
+
+ // Unite equality classes into new states
+ for (size_t from = 0; from != Size(); ++from) {
+ const auto fromMinimized = StateClass[from];
+ for (auto&& letter : mFsm.Letters()) {
+ const auto representative = letter.first;
+ const auto next = Next(from, representative);
+ const auto nextMinimized = StateClass[next];
+ Connect(fromMinimized, nextMinimized, representative);
+ const auto outputs = mFsm.Output(from, representative);
+ if (outputs) {
+ newActions[fromMinimized][representative] = outputs;
+ }
+ }
+ if (mFsm.Determined().IsFinal(from)) {
+ newFsm.SetFinal(fromMinimized, true);
+ }
+ }
+ newFsm.SetInitial(StateClass[mFsm.Determined().Initial()]);
+ newFsm.SetIsDetermined(true);
+ }
private:
- const CountingFsm& mFsm;
- TVector<TVector<TVector<size_t>>> reversedTransitions;
- TVector<size_t> StateClass;
- size_t Classes;
-
- size_t Next(size_t state, Char letter) const {
- const auto& tos = mFsm.Determined().Destinations(state, letter);
- Y_ASSERT(tos.size() == 1);
- return *tos.begin();
- }
+ const CountingFsm& mFsm;
+ TVector<TVector<TVector<size_t>>> reversedTransitions;
+ TVector<size_t> StateClass;
+ size_t Classes;
+
+ size_t Next(size_t state, Char letter) const {
+ const auto& tos = mFsm.Determined().Destinations(state, letter);
+ Y_ASSERT(tos.size() == 1);
+ return *tos.begin();
+ }
};
typedef size_t RawState;
@@ -267,612 +267,612 @@ typedef TSet<TaggedState> StateGroup;
struct DeterminedState {
public:
- StateGroup matched;
- StateGroup unmatched;
- StateGroup separated;
- StateGroup lagging;
+ StateGroup matched;
+ StateGroup unmatched;
+ StateGroup separated;
+ StateGroup lagging;
};
bool operator < (const DeterminedState& left, const DeterminedState& right) {
- auto asTuple = [](const DeterminedState& state) {
- return std::tie(state.matched, state.unmatched, state.separated, state.lagging);
- };
+ auto asTuple = [](const DeterminedState& state) {
+ return std::tie(state.matched, state.unmatched, state.separated, state.lagging);
+ };
- return asTuple(left) < asTuple(right);
+ return asTuple(left) < asTuple(right);
}
bool InvalidCharRange(const TVector<Char>& range) {
- for (const auto letter : range) {
- if (letter < MaxCharUnaligned && letter != 256) {
- return false;
- }
- }
- return true;
+ for (const auto letter : range) {
+ if (letter < MaxCharUnaligned && letter != 256) {
+ return false;
+ }
+ }
+ return true;
}
class BasicCountingFsmDetermineTask : public CountingFsmTask {
public:
- using CountingFsmTask::LettersTbl;
- typedef DeterminedState State;
- typedef TMap<State, size_t> InvStates;
-
- explicit BasicCountingFsmDetermineTask(const Fsm& fsm, RawState reInitial)
- : mFsm(fsm)
- , mReInitial{reInitial}
- {
- mDeadStates = fsm.DeadStates();
- for (auto&& letter : fsm.Letters()) {
- if (InvalidCharRange(letter.second.second)) {
- mInvalidLetters.insert(letter.first);
- }
- }
- }
-
- const LettersTbl& Letters() const {
- return mFsm.Letters();
- }
-
- State Initial() const {
- return State{StateGroup{}, InitialGroup(), StateGroup{}, StateGroup{}};
- }
-
- bool IsRequired(const State& state) const {
- Y_UNUSED(state);
- return true;
- }
-
- State Next(const State& state, Char letter) const {
- if (mInvalidLetters.count(letter) != 0) {
- AddAction(state, letter, CountingFsm::NotMatched);
- return Initial();
- }
-
- auto next = PrepareNextState(state, letter);
- AddAction(state, letter, CalculateTransitionTag(state, next));
- PostProcessNextState(next);
- NormalizeState(next);
-
- return next;
- }
-
- void AcceptStates(const TVector<State>& states)
- {
- ResizeOutput(states.size());
- auto& newFsm = Output();
- auto& newActions = Actions();
- newFsm.SetInitial(0);
- newFsm.SetIsDetermined(true);
-
- for (size_t ns = 0; ns < states.size(); ++ns) {
- const auto& state = states[ns];
- newFsm.SetFinal(ns, HasFinals(state.unmatched));
-
- auto outputIt = mActionByState.find(state);
- if (outputIt != mActionByState.end()) {
- newActions[ns].swap(outputIt->second);
- }
- }
- }
+ using CountingFsmTask::LettersTbl;
+ typedef DeterminedState State;
+ typedef TMap<State, size_t> InvStates;
+
+ explicit BasicCountingFsmDetermineTask(const Fsm& fsm, RawState reInitial)
+ : mFsm(fsm)
+ , mReInitial{reInitial}
+ {
+ mDeadStates = fsm.DeadStates();
+ for (auto&& letter : fsm.Letters()) {
+ if (InvalidCharRange(letter.second.second)) {
+ mInvalidLetters.insert(letter.first);
+ }
+ }
+ }
+
+ const LettersTbl& Letters() const {
+ return mFsm.Letters();
+ }
+
+ State Initial() const {
+ return State{StateGroup{}, InitialGroup(), StateGroup{}, StateGroup{}};
+ }
+
+ bool IsRequired(const State& state) const {
+ Y_UNUSED(state);
+ return true;
+ }
+
+ State Next(const State& state, Char letter) const {
+ if (mInvalidLetters.count(letter) != 0) {
+ AddAction(state, letter, CountingFsm::NotMatched);
+ return Initial();
+ }
+
+ auto next = PrepareNextState(state, letter);
+ AddAction(state, letter, CalculateTransitionTag(state, next));
+ PostProcessNextState(next);
+ NormalizeState(next);
+
+ return next;
+ }
+
+ void AcceptStates(const TVector<State>& states)
+ {
+ ResizeOutput(states.size());
+ auto& newFsm = Output();
+ auto& newActions = Actions();
+ newFsm.SetInitial(0);
+ newFsm.SetIsDetermined(true);
+
+ for (size_t ns = 0; ns < states.size(); ++ns) {
+ const auto& state = states[ns];
+ newFsm.SetFinal(ns, HasFinals(state.unmatched));
+
+ auto outputIt = mActionByState.find(state);
+ if (outputIt != mActionByState.end()) {
+ newActions[ns].swap(outputIt->second);
+ }
+ }
+ }
protected:
- void SplitDestinations(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const StateGroup& source, Char letter) const {
- for (const auto& state : source) {
- MakeTaggedStates(matched, unmatched, separated, mFsm.Destinations(state.first, letter), state.second);
- if (mFsm.IsFinal(state.first)) {
- // Implicit epsilon transitions from final states to reInitial after matching separator
- MakeTaggedStates(separated, separated, separated, mFsm.Destinations(mReInitial, letter), CountingFsm::Separated);
- }
- }
- }
-
- Action CalculateTransitionTagImpl(const State& dest) const {
- Action result = 0;
- if (!dest.matched.empty()) {
- result = AdvancedCountingScanner::IncrementAction;
- } else if (dest.unmatched.empty()) {
- if (!dest.separated.empty()) {
- for (const auto& state : dest.separated) {
- if (state.second == CountingFsm::Matched) {
- result = AdvancedCountingScanner::IncrementAction;
- }
- }
- } else {
- result = AdvancedCountingScanner::ResetAction;
- for (const auto& state : dest.lagging) {
- if (state.second != CountingFsm::NotMatched) {
- result |= AdvancedCountingScanner::IncrementAction;
- }
- }
- }
- }
- return result;
- }
-
- unsigned long TagsOfGroup(const StateGroup& group) const {
- unsigned long result = 0;
- for (const auto& state : group) {
- result |= state.second;
- }
- return result;
- }
-
- void SplitGroupByTag(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const StateGroup& source, bool useFsmTag) const {
- for (const auto& state : source) {
- auto tag = useFsmTag ? mFsm.Tag(state.first) : state.second;
- if (tag == CountingFsm::Matched) {
- matched.insert(state);
- } else if (tag == CountingFsm::Separated) {
- separated.insert(state);
- } else {
- unmatched.insert(state);
- }
- }
- }
-
- void UpdateLaggingStates(State& state, bool moveToLagging) const {
- if (!state.matched.empty()) {
- if (moveToLagging) {
- state.lagging.insert(state.unmatched.cbegin(), state.unmatched.cend());
- state.lagging.insert(state.separated.cbegin(), state.separated.cend());
- }
- state.unmatched.clear();
- state.separated.clear();
- }
- if (state.unmatched.empty() && !state.separated.empty()) {
- const auto unmatchedTags = TagsOfGroup(state.separated);
- if ((unmatchedTags & CountingFsm::Matched) && (unmatchedTags != CountingFsm::Matched)) {
- StateGroup separatedMatched;
- for (const auto& separatedState : state.separated) {
- if (separatedState.second == CountingFsm::Matched) {
- separatedMatched.insert(separatedState);
- } else if (moveToLagging) {
- state.lagging.insert(separatedState);
- }
- }
- state.separated.swap(separatedMatched);
- }
- }
- }
-
- void RemoveDuplicateLaggingStates(State& state) const {
- const auto statesToRemove = GetRawStates({state.matched, state.unmatched, state.separated}, 0);
- const auto unmatchedStatesToRemove = GetRawStates({state.lagging}, CountingFsm::NotMatched);
-
- StateGroup newLagging;
- for (const auto& taggedState : state.lagging) {
- if (statesToRemove.count(taggedState.first) == 0) {
- if (taggedState.second != CountingFsm::NotMatched || unmatchedStatesToRemove.count(taggedState.first) == 0) {
- newLagging.insert(taggedState);
- }
- }
- }
- state.lagging.swap(newLagging);
- }
-
- void RemoveDuplicateSeparatedStates(State& state) const {
- if (state.separated.empty()) {
- return;
- }
- const auto statesToRemove = GetRawStates({state.matched, state.unmatched}, 0);
- RemoveRawStates(state.separated, statesToRemove);
- }
-
- void NormalizeState(State& state) const {
- if (!state.matched.empty()) {
- Y_ASSERT(state.unmatched.empty());
- state.unmatched.swap(state.matched);
- }
-
- if (state.unmatched.empty() && !state.separated.empty()) {
- state.unmatched.swap(state.separated);
- }
-
- if (state.unmatched.empty() && !state.lagging.empty()) {
- State groups;
- SplitGroupByTag(groups.matched, groups.unmatched, groups.separated, state.lagging, false);
- if (!groups.matched.empty()) {
- state.unmatched.swap(groups.matched);
- state.separated.swap(groups.separated);
- state.lagging.swap(groups.unmatched);
- } else if (!groups.separated.empty()) {
- state.unmatched.swap(groups.separated);
- state.lagging.swap(groups.unmatched);
- } else {
- state.unmatched.swap(groups.unmatched);
- state.lagging.swap(groups.matched); // just clear
- }
- }
- }
+ void SplitDestinations(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const StateGroup& source, Char letter) const {
+ for (const auto& state : source) {
+ MakeTaggedStates(matched, unmatched, separated, mFsm.Destinations(state.first, letter), state.second);
+ if (mFsm.IsFinal(state.first)) {
+ // Implicit epsilon transitions from final states to reInitial after matching separator
+ MakeTaggedStates(separated, separated, separated, mFsm.Destinations(mReInitial, letter), CountingFsm::Separated);
+ }
+ }
+ }
+
+ Action CalculateTransitionTagImpl(const State& dest) const {
+ Action result = 0;
+ if (!dest.matched.empty()) {
+ result = AdvancedCountingScanner::IncrementAction;
+ } else if (dest.unmatched.empty()) {
+ if (!dest.separated.empty()) {
+ for (const auto& state : dest.separated) {
+ if (state.second == CountingFsm::Matched) {
+ result = AdvancedCountingScanner::IncrementAction;
+ }
+ }
+ } else {
+ result = AdvancedCountingScanner::ResetAction;
+ for (const auto& state : dest.lagging) {
+ if (state.second != CountingFsm::NotMatched) {
+ result |= AdvancedCountingScanner::IncrementAction;
+ }
+ }
+ }
+ }
+ return result;
+ }
+
+ unsigned long TagsOfGroup(const StateGroup& group) const {
+ unsigned long result = 0;
+ for (const auto& state : group) {
+ result |= state.second;
+ }
+ return result;
+ }
+
+ void SplitGroupByTag(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const StateGroup& source, bool useFsmTag) const {
+ for (const auto& state : source) {
+ auto tag = useFsmTag ? mFsm.Tag(state.first) : state.second;
+ if (tag == CountingFsm::Matched) {
+ matched.insert(state);
+ } else if (tag == CountingFsm::Separated) {
+ separated.insert(state);
+ } else {
+ unmatched.insert(state);
+ }
+ }
+ }
+
+ void UpdateLaggingStates(State& state, bool moveToLagging) const {
+ if (!state.matched.empty()) {
+ if (moveToLagging) {
+ state.lagging.insert(state.unmatched.cbegin(), state.unmatched.cend());
+ state.lagging.insert(state.separated.cbegin(), state.separated.cend());
+ }
+ state.unmatched.clear();
+ state.separated.clear();
+ }
+ if (state.unmatched.empty() && !state.separated.empty()) {
+ const auto unmatchedTags = TagsOfGroup(state.separated);
+ if ((unmatchedTags & CountingFsm::Matched) && (unmatchedTags != CountingFsm::Matched)) {
+ StateGroup separatedMatched;
+ for (const auto& separatedState : state.separated) {
+ if (separatedState.second == CountingFsm::Matched) {
+ separatedMatched.insert(separatedState);
+ } else if (moveToLagging) {
+ state.lagging.insert(separatedState);
+ }
+ }
+ state.separated.swap(separatedMatched);
+ }
+ }
+ }
+
+ void RemoveDuplicateLaggingStates(State& state) const {
+ const auto statesToRemove = GetRawStates({state.matched, state.unmatched, state.separated}, 0);
+ const auto unmatchedStatesToRemove = GetRawStates({state.lagging}, CountingFsm::NotMatched);
+
+ StateGroup newLagging;
+ for (const auto& taggedState : state.lagging) {
+ if (statesToRemove.count(taggedState.first) == 0) {
+ if (taggedState.second != CountingFsm::NotMatched || unmatchedStatesToRemove.count(taggedState.first) == 0) {
+ newLagging.insert(taggedState);
+ }
+ }
+ }
+ state.lagging.swap(newLagging);
+ }
+
+ void RemoveDuplicateSeparatedStates(State& state) const {
+ if (state.separated.empty()) {
+ return;
+ }
+ const auto statesToRemove = GetRawStates({state.matched, state.unmatched}, 0);
+ RemoveRawStates(state.separated, statesToRemove);
+ }
+
+ void NormalizeState(State& state) const {
+ if (!state.matched.empty()) {
+ Y_ASSERT(state.unmatched.empty());
+ state.unmatched.swap(state.matched);
+ }
+
+ if (state.unmatched.empty() && !state.separated.empty()) {
+ state.unmatched.swap(state.separated);
+ }
+
+ if (state.unmatched.empty() && !state.lagging.empty()) {
+ State groups;
+ SplitGroupByTag(groups.matched, groups.unmatched, groups.separated, state.lagging, false);
+ if (!groups.matched.empty()) {
+ state.unmatched.swap(groups.matched);
+ state.separated.swap(groups.separated);
+ state.lagging.swap(groups.unmatched);
+ } else if (!groups.separated.empty()) {
+ state.unmatched.swap(groups.separated);
+ state.lagging.swap(groups.unmatched);
+ } else {
+ state.unmatched.swap(groups.unmatched);
+ state.lagging.swap(groups.matched); // just clear
+ }
+ }
+ }
private:
- virtual State PrepareNextState(const State& state, Char letter) const = 0;
-
- virtual void PostProcessNextState(State& next) const = 0;
-
- virtual Action CalculateTransitionTag(const State& source, const State& dest) const {
- Y_UNUSED(source);
- return CalculateTransitionTagImpl(dest);
- }
-
- virtual StateGroup InitialGroup() const {
- return StateGroup{TaggedState{mFsm.Initial(), CountingFsm::NotMatched}};
- }
-
- void AddAction(State from, Char letter, unsigned long value) const {
- if (!value) {
- return;
- }
- TransitionTagRow& row = mActionByState[from];
- row[letter] = value;
- }
-
- void MakeTaggedStates(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const Fsm::StatesSet& destinations, unsigned long sourceTag) const {
- for (const auto destState : destinations) {
- if (mDeadStates.count(destState) == 0) {
- const auto destTag = mFsm.Tag(destState);
- if (sourceTag != CountingFsm::Matched && destTag == CountingFsm::Matched) {
- matched.insert(ymake_pair(destState, destTag));
- } else if (sourceTag == CountingFsm::Separated || destTag == CountingFsm::Separated) {
- separated.insert(ymake_pair(destState, CountingFsm::Separated));
- } else {
- unmatched.insert(ymake_pair(destState, sourceTag));
- }
- }
- }
- }
-
- bool HasFinals(const StateGroup& states) const {
- for (const auto& state : states) {
- if (mFsm.IsFinal(state.first)) {
- return true;
- }
- }
- return false;
- }
-
- Fsm::StatesSet GetRawStates(const TVector<std::reference_wrapper<const StateGroup>> groups, unsigned long excludedTags) const {
- Fsm::StatesSet result;
- for (const auto& group : groups) {
- for (const auto& taggedState : group.get()) {
- if (!(taggedState.second & excludedTags)) {
- result.insert(taggedState.first);
- }
- }
- }
- return result;
- }
-
- void RemoveRawStates(StateGroup& group, const Fsm::StatesSet& states) const {
- StateGroup removing;
- for (const auto& taggedState : group) {
- if (states.count(taggedState.first) != 0) {
- removing.insert(taggedState);
- }
- }
- for (const auto& taggedState : removing) {
- group.erase(taggedState);
- }
- }
+ virtual State PrepareNextState(const State& state, Char letter) const = 0;
+
+ virtual void PostProcessNextState(State& next) const = 0;
+
+ virtual Action CalculateTransitionTag(const State& source, const State& dest) const {
+ Y_UNUSED(source);
+ return CalculateTransitionTagImpl(dest);
+ }
+
+ virtual StateGroup InitialGroup() const {
+ return StateGroup{TaggedState{mFsm.Initial(), CountingFsm::NotMatched}};
+ }
+
+ void AddAction(State from, Char letter, unsigned long value) const {
+ if (!value) {
+ return;
+ }
+ TransitionTagRow& row = mActionByState[from];
+ row[letter] = value;
+ }
+
+ void MakeTaggedStates(StateGroup& matched, StateGroup& unmatched, StateGroup& separated, const Fsm::StatesSet& destinations, unsigned long sourceTag) const {
+ for (const auto destState : destinations) {
+ if (mDeadStates.count(destState) == 0) {
+ const auto destTag = mFsm.Tag(destState);
+ if (sourceTag != CountingFsm::Matched && destTag == CountingFsm::Matched) {
+ matched.insert(ymake_pair(destState, destTag));
+ } else if (sourceTag == CountingFsm::Separated || destTag == CountingFsm::Separated) {
+ separated.insert(ymake_pair(destState, CountingFsm::Separated));
+ } else {
+ unmatched.insert(ymake_pair(destState, sourceTag));
+ }
+ }
+ }
+ }
+
+ bool HasFinals(const StateGroup& states) const {
+ for (const auto& state : states) {
+ if (mFsm.IsFinal(state.first)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ Fsm::StatesSet GetRawStates(const TVector<std::reference_wrapper<const StateGroup>> groups, unsigned long excludedTags) const {
+ Fsm::StatesSet result;
+ for (const auto& group : groups) {
+ for (const auto& taggedState : group.get()) {
+ if (!(taggedState.second & excludedTags)) {
+ result.insert(taggedState.first);
+ }
+ }
+ }
+ return result;
+ }
+
+ void RemoveRawStates(StateGroup& group, const Fsm::StatesSet& states) const {
+ StateGroup removing;
+ for (const auto& taggedState : group) {
+ if (states.count(taggedState.first) != 0) {
+ removing.insert(taggedState);
+ }
+ }
+ for (const auto& taggedState : removing) {
+ group.erase(taggedState);
+ }
+ }
private:
- const Fsm& mFsm;
- RawState mReInitial;
- Fsm::StatesSet mDeadStates;
- TSet<Char> mInvalidLetters;
+ const Fsm& mFsm;
+ RawState mReInitial;
+ Fsm::StatesSet mDeadStates;
+ TSet<Char> mInvalidLetters;
- mutable TMap<State, TransitionTagRow> mActionByState;
+ mutable TMap<State, TransitionTagRow> mActionByState;
};
class CountingFsmDetermineTask : public BasicCountingFsmDetermineTask {
public:
- using BasicCountingFsmDetermineTask::State;
- using BasicCountingFsmDetermineTask::LettersTbl;
- using BasicCountingFsmDetermineTask::InvStates;
+ using BasicCountingFsmDetermineTask::State;
+ using BasicCountingFsmDetermineTask::LettersTbl;
+ using BasicCountingFsmDetermineTask::InvStates;
- explicit CountingFsmDetermineTask(const Fsm& fsm, RawState reInitial)
- : BasicCountingFsmDetermineTask{fsm, reInitial}
- {}
+ explicit CountingFsmDetermineTask(const Fsm& fsm, RawState reInitial)
+ : BasicCountingFsmDetermineTask{fsm, reInitial}
+ {}
private:
- State PrepareNextState(const State& state, Char letter) const override {
- State next;
- SplitDestinations(next.matched, next.unmatched, next.separated, state.unmatched, letter);
- SplitDestinations(next.separated, next.separated, next.separated, state.separated, letter);
- SplitDestinations(next.lagging, next.lagging, next.lagging, state.lagging, letter);
- return next;
- }
-
- void PostProcessNextState(State& next) const override {
- UpdateLaggingStates(next, true);
- RemoveDuplicateLaggingStates(next);
- RemoveDuplicateSeparatedStates(next);
- }
+ State PrepareNextState(const State& state, Char letter) const override {
+ State next;
+ SplitDestinations(next.matched, next.unmatched, next.separated, state.unmatched, letter);
+ SplitDestinations(next.separated, next.separated, next.separated, state.separated, letter);
+ SplitDestinations(next.lagging, next.lagging, next.lagging, state.lagging, letter);
+ return next;
+ }
+
+ void PostProcessNextState(State& next) const override {
+ UpdateLaggingStates(next, true);
+ RemoveDuplicateLaggingStates(next);
+ RemoveDuplicateSeparatedStates(next);
+ }
};
class SimpleCountingFsmDetermineTask : public BasicCountingFsmDetermineTask {
public:
- using BasicCountingFsmDetermineTask::State;
- using BasicCountingFsmDetermineTask::LettersTbl;
- using BasicCountingFsmDetermineTask::InvStates;
+ using BasicCountingFsmDetermineTask::State;
+ using BasicCountingFsmDetermineTask::LettersTbl;
+ using BasicCountingFsmDetermineTask::InvStates;
- static constexpr unsigned long MixedTags = CountingFsm::Separated | CountingFsm::Matched;
+ static constexpr unsigned long MixedTags = CountingFsm::Separated | CountingFsm::Matched;
- SimpleCountingFsmDetermineTask(const Fsm& fsm, RawState reInitial)
- : BasicCountingFsmDetermineTask{fsm, reInitial}
- , mStartState{reInitial, CountingFsm::NotMatched}
- {}
+ SimpleCountingFsmDetermineTask(const Fsm& fsm, RawState reInitial)
+ : BasicCountingFsmDetermineTask{fsm, reInitial}
+ , mStartState{reInitial, CountingFsm::NotMatched}
+ {}
private:
- State PrepareNextState(const State& state, Char letter) const override {
- State next;
- auto from = state;
- const auto fromIsEmpty = IsEmptyState(from);
- if (fromIsEmpty) {
- from.unmatched.insert(mStartState);
- }
- Y_ASSERT(IsValidState(from));
-
- SplitDestinations(next.matched, next.unmatched, next.separated, from.unmatched, letter);
- if (next.matched.empty() && !next.separated.empty()) {
- if (next.unmatched.empty()) {
- SplitSeparatedByFsmTag(next);
- if (next.separated.size() > 1) {
- RemoveDuplicateSeparatedStates(next);
- }
- if (next.unmatched.empty()) {
- next.unmatched.swap(next.separated);
- }
- } else {
- ChooseOneSeparatedState(next);
- }
- }
- if (next.matched.empty() && next.separated.empty() && !from.separated.empty()) {
- if (!next.unmatched.empty()) {
- ChooseOneDestState(next.separated, from.separated, letter);
- } else {
- SplitDestinations(next.matched, next.unmatched, next.separated, from.separated, letter);
- if (next.matched.empty() && !next.separated.empty()) {
- SplitSeparatedByFsmTag(next);
- }
- }
- ChooseOneSeparatedState(next);
- }
- if (!fromIsEmpty && IsEmptyState(next)) {
- ChooseOneDestState(next.lagging, StateGroup{mStartState}, letter);
- }
-
- return next;
- }
-
- void PostProcessNextState(State& next) const override {
- if (!next.lagging.empty()) {
- next.unmatched.swap(next.lagging);
- }
- UpdateLaggingStates(next, false);
- RemoveDuplicateSeparatedStates(next);
- }
-
- Action CalculateTransitionTag(const State& source, const State& dest) const override {
- Action tag = CalculateTransitionTagImpl(dest);
- if (!((TagsOfGroup(source.unmatched) | TagsOfGroup(source.separated)) & MixedTags)) {
- tag &= AdvancedCountingScanner::IncrementAction;
- }
- return tag;
- }
-
- StateGroup InitialGroup() const override {
- return StateGroup{};
- }
-
- bool IsEmptyState(const State& state) const {
- return state.matched.empty() && state.unmatched.empty() && state.separated.empty() && state.lagging.empty();
- }
-
- bool IsValidState(const State& state) const {
- return state.matched.empty() && state.unmatched.size() <= 1 && state.separated.size() <= 1 && state.lagging.empty();
- }
-
- void SplitSeparatedByFsmTag(State& state) const {
- Y_ASSERT(state.unmatched.empty());
- StateGroup separated;
- separated.swap(state.separated);
- SplitGroupByTag(state.matched, state.unmatched, state.separated, separated, true);
- }
-
- void ChooseOneDestState(StateGroup& dest, const StateGroup& source, Char letter) const {
- State destState;
- SplitDestinations(destState.matched, destState.unmatched, destState.separated, source, letter);
- if (!destState.matched.empty()) {
- dest.swap(destState.matched);
- } else if (!destState.separated.empty()) {
- dest.swap(destState.separated);
- } else if (!destState.unmatched.empty()) {
- dest.swap(destState.unmatched);
- }
- }
-
- void ChooseOneSeparatedState(State& state) const {
- if (state.separated.size() <= 1) {
- return;
- }
- RemoveDuplicateSeparatedStates(state);
- State splitted;
- SplitGroupByTag(splitted.matched, splitted.unmatched, splitted.separated, state.separated, true);
- if (!splitted.separated.empty()) {
- state.separated.swap(splitted.separated);
- } else if (!splitted.matched.empty()) {
- state.separated.swap(splitted.matched);
- }
- }
+ State PrepareNextState(const State& state, Char letter) const override {
+ State next;
+ auto from = state;
+ const auto fromIsEmpty = IsEmptyState(from);
+ if (fromIsEmpty) {
+ from.unmatched.insert(mStartState);
+ }
+ Y_ASSERT(IsValidState(from));
+
+ SplitDestinations(next.matched, next.unmatched, next.separated, from.unmatched, letter);
+ if (next.matched.empty() && !next.separated.empty()) {
+ if (next.unmatched.empty()) {
+ SplitSeparatedByFsmTag(next);
+ if (next.separated.size() > 1) {
+ RemoveDuplicateSeparatedStates(next);
+ }
+ if (next.unmatched.empty()) {
+ next.unmatched.swap(next.separated);
+ }
+ } else {
+ ChooseOneSeparatedState(next);
+ }
+ }
+ if (next.matched.empty() && next.separated.empty() && !from.separated.empty()) {
+ if (!next.unmatched.empty()) {
+ ChooseOneDestState(next.separated, from.separated, letter);
+ } else {
+ SplitDestinations(next.matched, next.unmatched, next.separated, from.separated, letter);
+ if (next.matched.empty() && !next.separated.empty()) {
+ SplitSeparatedByFsmTag(next);
+ }
+ }
+ ChooseOneSeparatedState(next);
+ }
+ if (!fromIsEmpty && IsEmptyState(next)) {
+ ChooseOneDestState(next.lagging, StateGroup{mStartState}, letter);
+ }
+
+ return next;
+ }
+
+ void PostProcessNextState(State& next) const override {
+ if (!next.lagging.empty()) {
+ next.unmatched.swap(next.lagging);
+ }
+ UpdateLaggingStates(next, false);
+ RemoveDuplicateSeparatedStates(next);
+ }
+
+ Action CalculateTransitionTag(const State& source, const State& dest) const override {
+ Action tag = CalculateTransitionTagImpl(dest);
+ if (!((TagsOfGroup(source.unmatched) | TagsOfGroup(source.separated)) & MixedTags)) {
+ tag &= AdvancedCountingScanner::IncrementAction;
+ }
+ return tag;
+ }
+
+ StateGroup InitialGroup() const override {
+ return StateGroup{};
+ }
+
+ bool IsEmptyState(const State& state) const {
+ return state.matched.empty() && state.unmatched.empty() && state.separated.empty() && state.lagging.empty();
+ }
+
+ bool IsValidState(const State& state) const {
+ return state.matched.empty() && state.unmatched.size() <= 1 && state.separated.size() <= 1 && state.lagging.empty();
+ }
+
+ void SplitSeparatedByFsmTag(State& state) const {
+ Y_ASSERT(state.unmatched.empty());
+ StateGroup separated;
+ separated.swap(state.separated);
+ SplitGroupByTag(state.matched, state.unmatched, state.separated, separated, true);
+ }
+
+ void ChooseOneDestState(StateGroup& dest, const StateGroup& source, Char letter) const {
+ State destState;
+ SplitDestinations(destState.matched, destState.unmatched, destState.separated, source, letter);
+ if (!destState.matched.empty()) {
+ dest.swap(destState.matched);
+ } else if (!destState.separated.empty()) {
+ dest.swap(destState.separated);
+ } else if (!destState.unmatched.empty()) {
+ dest.swap(destState.unmatched);
+ }
+ }
+
+ void ChooseOneSeparatedState(State& state) const {
+ if (state.separated.size() <= 1) {
+ return;
+ }
+ RemoveDuplicateSeparatedStates(state);
+ State splitted;
+ SplitGroupByTag(splitted.matched, splitted.unmatched, splitted.separated, state.separated, true);
+ if (!splitted.separated.empty()) {
+ state.separated.swap(splitted.separated);
+ } else if (!splitted.matched.empty()) {
+ state.separated.swap(splitted.matched);
+ }
+ }
private:
- TaggedState mStartState;
+ TaggedState mStartState;
};
bool CountingFsm::Determine() {
- CountingFsmDetermineTask task{mFsm, mReInitial};
- size_t maxSize = mFsm.Size() * 4096;
- if (Pire::Impl::Determine(task, maxSize)) {
- SwapTaskOutputs(task);
- mSimple = false;
- } else {
- SimpleCountingFsmDetermineTask simpleTask{mFsm, mReInitial};
- if (Pire::Impl::Determine(simpleTask, std::numeric_limits<size_t>::max())) {
- SwapTaskOutputs(simpleTask);
- mSimple = true;
- } else {
- return false;
- }
- }
- return true;
+ CountingFsmDetermineTask task{mFsm, mReInitial};
+ size_t maxSize = mFsm.Size() * 4096;
+ if (Pire::Impl::Determine(task, maxSize)) {
+ SwapTaskOutputs(task);
+ mSimple = false;
+ } else {
+ SimpleCountingFsmDetermineTask simpleTask{mFsm, mReInitial};
+ if (Pire::Impl::Determine(simpleTask, std::numeric_limits<size_t>::max())) {
+ SwapTaskOutputs(simpleTask);
+ mSimple = true;
+ } else {
+ return false;
+ }
+ }
+ return true;
}
void CountingFsm::Minimize() {
- CountingFsmMinimizeTask task{*this};
- Pire::Impl::Minimize(task);
- SwapTaskOutputs(task);
+ CountingFsmMinimizeTask task{*this};
+ Pire::Impl::Minimize(task);
+ SwapTaskOutputs(task);
}
void CountingFsm::SwapTaskOutputs(CountingFsmTask& task) {
- task.Output().Swap(mDetermined);
- task.Actions().swap(mActions);
+ task.Output().Swap(mDetermined);
+ task.Actions().swap(mActions);
}
}
namespace {
- Pire::Fsm FsmForDot() { Pire::Fsm f; f.AppendDot(); return f; }
- Pire::Fsm FsmForChar(Pire::Char c) { Pire::Fsm f; f.AppendSpecial(c); return f; }
+ Pire::Fsm FsmForDot() { Pire::Fsm f; f.AppendDot(); return f; }
+ Pire::Fsm FsmForChar(Pire::Char c) { Pire::Fsm f; f.AppendSpecial(c); return f; }
}
CountingScanner::CountingScanner(const Fsm& re, const Fsm& sep)
{
- Fsm res = re;
- res.Surround();
- Fsm sep_re = ((sep & ~res) /* | Fsm()*/) + re;
- sep_re.Determine();
-
- Fsm dup = sep_re;
- for (size_t i = 0; i < dup.Size(); ++i)
- dup.SetTag(i, Matched);
- size_t oldsize = sep_re.Size();
- sep_re.Import(dup);
- for (Fsm::FinalTable::const_iterator i = sep_re.Finals().begin(), ie = sep_re.Finals().end(); i != ie; ++i)
- if (*i < oldsize)
- sep_re.Connect(*i, oldsize + *i);
-
- sep_re |= (FsmForDot() | FsmForChar(Pire::BeginMark) | FsmForChar(Pire::EndMark));
-
- // Make a full Cartesian product of two sep_res
- sep_re.Determine();
- sep_re.Unsparse();
- TSet<size_t> dead = sep_re.DeadStates();
-
- PIRE_IFDEBUG(Cdbg << "=== Original FSM ===" << Endl << sep_re << ">>> " << sep_re.Size() << " states, dead: [" << Join(dead.begin(), dead.end(), ", ") << "]" << Endl);
-
- Fsm sq;
-
- typedef ypair<size_t, size_t> NewState;
- TVector<NewState> states;
- TMap<NewState, size_t> invstates;
-
- states.push_back(NewState(sep_re.Initial(), sep_re.Initial()));
- invstates.insert(ymake_pair(states.back(), states.size() - 1));
-
- // TODO: this loop reminds me a general determination task...
- for (size_t curstate = 0; curstate < states.size(); ++curstate) {
-
- unsigned long tag = sep_re.Tag(states[curstate].first);
- if (tag)
- sq.SetTag(curstate, tag);
- sq.SetFinal(curstate, sep_re.IsFinal(states[curstate].first));
-
- PIRE_IFDEBUG(Cdbg << "State " << curstate << " = (" << states[curstate].first << ", " << states[curstate].second << ")" << Endl);
- for (Fsm::LettersTbl::ConstIterator lit = sep_re.Letters().Begin(), lie = sep_re.Letters().End(); lit != lie; ++lit) {
-
- Char letter = lit->first;
-
- const Fsm::StatesSet& mr = sep_re.Destinations(states[curstate].first, letter);
- const Fsm::StatesSet& br = sep_re.Destinations(states[curstate].second, letter);
-
- if (mr.size() != 1)
- Y_ASSERT(!"Wrong transition size for main");
- if (br.size() != 1)
- Y_ASSERT(!"Wrong transition size for backup");
-
- NewState ns(*mr.begin(), *br.begin());
- PIRE_IFDEBUG(NewState savedNs = ns);
- unsigned long outputs = 0;
-
- PIRE_IFDEBUG(ystring dbgout);
- if (dead.find(ns.first) != dead.end()) {
- PIRE_IFDEBUG(dbgout = ((sep_re.Tag(ns.first) & Matched) ? ", ++cur" : ", max <- cur"));
- outputs = DeadFlag | (sep_re.Tag(ns.first) & Matched);
- ns.first = ns.second;
- }
- if (sep_re.IsFinal(ns.first) || (sep_re.IsFinal(ns.second) && !(sep_re.Tag(ns.first) & Matched)))
- ns.second = sep_re.Initial();
-
- PIRE_IFDEBUG(if (ns != savedNs) Cdbg << "Diverted transition to (" << savedNs.first << ", " << savedNs.second << ") on " << (char) letter << " to (" << ns.first << ", " << ns.second << ")" << dbgout << Endl);
-
- TMap<NewState, size_t>::iterator nsi = invstates.find(ns);
- if (nsi == invstates.end()) {
- PIRE_IFDEBUG(Cdbg << "New state " << states.size() << " = (" << ns.first << ", " << ns.second << ")" << Endl);
- states.push_back(ns);
- nsi = invstates.insert(ymake_pair(states.back(), states.size() - 1)).first;
- sq.Resize(states.size());
- }
-
- for (TVector<Char>::const_iterator li = lit->second.second.begin(), le = lit->second.second.end(); li != le; ++li)
- sq.Connect(curstate, nsi->second, *li);
- if (outputs)
- sq.SetOutput(curstate, nsi->second, outputs);
- }
- }
-
- sq.Determine();
-
- PIRE_IFDEBUG(Cdbg << "=== FSM ===" << Endl << sq << Endl);
- Init(sq.Size(), sq.Letters(), sq.Initial(), 1);
- BuildScanner(sq, *this);
+ Fsm res = re;
+ res.Surround();
+ Fsm sep_re = ((sep & ~res) /* | Fsm()*/) + re;
+ sep_re.Determine();
+
+ Fsm dup = sep_re;
+ for (size_t i = 0; i < dup.Size(); ++i)
+ dup.SetTag(i, Matched);
+ size_t oldsize = sep_re.Size();
+ sep_re.Import(dup);
+ for (Fsm::FinalTable::const_iterator i = sep_re.Finals().begin(), ie = sep_re.Finals().end(); i != ie; ++i)
+ if (*i < oldsize)
+ sep_re.Connect(*i, oldsize + *i);
+
+ sep_re |= (FsmForDot() | FsmForChar(Pire::BeginMark) | FsmForChar(Pire::EndMark));
+
+ // Make a full Cartesian product of two sep_res
+ sep_re.Determine();
+ sep_re.Unsparse();
+ TSet<size_t> dead = sep_re.DeadStates();
+
+ PIRE_IFDEBUG(Cdbg << "=== Original FSM ===" << Endl << sep_re << ">>> " << sep_re.Size() << " states, dead: [" << Join(dead.begin(), dead.end(), ", ") << "]" << Endl);
+
+ Fsm sq;
+
+ typedef ypair<size_t, size_t> NewState;
+ TVector<NewState> states;
+ TMap<NewState, size_t> invstates;
+
+ states.push_back(NewState(sep_re.Initial(), sep_re.Initial()));
+ invstates.insert(ymake_pair(states.back(), states.size() - 1));
+
+ // TODO: this loop reminds me a general determination task...
+ for (size_t curstate = 0; curstate < states.size(); ++curstate) {
+
+ unsigned long tag = sep_re.Tag(states[curstate].first);
+ if (tag)
+ sq.SetTag(curstate, tag);
+ sq.SetFinal(curstate, sep_re.IsFinal(states[curstate].first));
+
+ PIRE_IFDEBUG(Cdbg << "State " << curstate << " = (" << states[curstate].first << ", " << states[curstate].second << ")" << Endl);
+ for (Fsm::LettersTbl::ConstIterator lit = sep_re.Letters().Begin(), lie = sep_re.Letters().End(); lit != lie; ++lit) {
+
+ Char letter = lit->first;
+
+ const Fsm::StatesSet& mr = sep_re.Destinations(states[curstate].first, letter);
+ const Fsm::StatesSet& br = sep_re.Destinations(states[curstate].second, letter);
+
+ if (mr.size() != 1)
+ Y_ASSERT(!"Wrong transition size for main");
+ if (br.size() != 1)
+ Y_ASSERT(!"Wrong transition size for backup");
+
+ NewState ns(*mr.begin(), *br.begin());
+ PIRE_IFDEBUG(NewState savedNs = ns);
+ unsigned long outputs = 0;
+
+ PIRE_IFDEBUG(ystring dbgout);
+ if (dead.find(ns.first) != dead.end()) {
+ PIRE_IFDEBUG(dbgout = ((sep_re.Tag(ns.first) & Matched) ? ", ++cur" : ", max <- cur"));
+ outputs = DeadFlag | (sep_re.Tag(ns.first) & Matched);
+ ns.first = ns.second;
+ }
+ if (sep_re.IsFinal(ns.first) || (sep_re.IsFinal(ns.second) && !(sep_re.Tag(ns.first) & Matched)))
+ ns.second = sep_re.Initial();
+
+ PIRE_IFDEBUG(if (ns != savedNs) Cdbg << "Diverted transition to (" << savedNs.first << ", " << savedNs.second << ") on " << (char) letter << " to (" << ns.first << ", " << ns.second << ")" << dbgout << Endl);
+
+ TMap<NewState, size_t>::iterator nsi = invstates.find(ns);
+ if (nsi == invstates.end()) {
+ PIRE_IFDEBUG(Cdbg << "New state " << states.size() << " = (" << ns.first << ", " << ns.second << ")" << Endl);
+ states.push_back(ns);
+ nsi = invstates.insert(ymake_pair(states.back(), states.size() - 1)).first;
+ sq.Resize(states.size());
+ }
+
+ for (TVector<Char>::const_iterator li = lit->second.second.begin(), le = lit->second.second.end(); li != le; ++li)
+ sq.Connect(curstate, nsi->second, *li);
+ if (outputs)
+ sq.SetOutput(curstate, nsi->second, outputs);
+ }
+ }
+
+ sq.Determine();
+
+ PIRE_IFDEBUG(Cdbg << "=== FSM ===" << Endl << sq << Endl);
+ Init(sq.Size(), sq.Letters(), sq.Initial(), 1);
+ BuildScanner(sq, *this);
}
namespace Impl {
template <class AdvancedScanner>
AdvancedScanner MakeAdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple) {
- Impl::CountingFsm countingFsm{re, sep};
- if (!countingFsm.Determine()) {
- throw Error("regexp pattern too complicated");
- }
- countingFsm.Minimize();
- if (simple) {
- *simple = countingFsm.Simple();
- }
-
- const auto& determined = countingFsm.Determined();
- const auto& letters = countingFsm.Letters();
-
- AdvancedScanner scanner;
- scanner.Init(determined.Size(), letters, determined.Initial(), 1);
- for (size_t from = 0; from != determined.Size(); ++from) {
- for (auto&& lettersEl : letters) {
- const auto letter = lettersEl.first;
- const auto& tos = determined.Destinations(from, letter);
- Y_ASSERT(tos.size() == 1);
- scanner.SetJump(from, letter, *tos.begin(), scanner.RemapAction(countingFsm.Output(from, letter)));
- }
- }
- return scanner;
+ Impl::CountingFsm countingFsm{re, sep};
+ if (!countingFsm.Determine()) {
+ throw Error("regexp pattern too complicated");
+ }
+ countingFsm.Minimize();
+ if (simple) {
+ *simple = countingFsm.Simple();
+ }
+
+ const auto& determined = countingFsm.Determined();
+ const auto& letters = countingFsm.Letters();
+
+ AdvancedScanner scanner;
+ scanner.Init(determined.Size(), letters, determined.Initial(), 1);
+ for (size_t from = 0; from != determined.Size(); ++from) {
+ for (auto&& lettersEl : letters) {
+ const auto letter = lettersEl.first;
+ const auto& tos = determined.Destinations(from, letter);
+ Y_ASSERT(tos.size() == 1);
+ scanner.SetJump(from, letter, *tos.begin(), scanner.RemapAction(countingFsm.Output(from, letter)));
+ }
+ }
+ return scanner;
}
} // namespace Impl
AdvancedCountingScanner::AdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple)
- : AdvancedCountingScanner(Impl::MakeAdvancedCountingScanner<AdvancedCountingScanner>(re, sep, simple))
+ : AdvancedCountingScanner(Impl::MakeAdvancedCountingScanner<AdvancedCountingScanner>(re, sep, simple))
{
}
NoGlueLimitCountingScanner::NoGlueLimitCountingScanner(const Fsm& re, const Fsm& sep, bool* simple)
- : NoGlueLimitCountingScanner(Impl::MakeAdvancedCountingScanner<NoGlueLimitCountingScanner>(re, sep, simple))
+ : NoGlueLimitCountingScanner(Impl::MakeAdvancedCountingScanner<NoGlueLimitCountingScanner>(re, sep, simple))
{
}
@@ -882,101 +882,101 @@ namespace Impl {
template<class Scanner>
class CountingScannerGlueTask: public ScannerGlueCommon<Scanner> {
public:
- using typename ScannerGlueCommon<Scanner>::State;
- using TAction = typename Scanner::Action;
- using InternalState = typename Scanner::InternalState;
- typedef TMap<State, size_t> InvStates;
-
- CountingScannerGlueTask(const Scanner& lhs, const Scanner& rhs)
- : ScannerGlueCommon<Scanner>(lhs, rhs, LettersEquality<Scanner>(lhs.m_letters, rhs.m_letters))
- {
- }
-
- void AcceptStates(const TVector<State>& states)
- {
- States = states;
- this->SetSc(THolder<Scanner>(new Scanner));
- this->Sc().Init(states.size(), this->Letters(), 0, this->Lhs().RegexpsCount() + this->Rhs().RegexpsCount());
-
- for (size_t i = 0; i < states.size(); ++i)
- this->Sc().SetTag(i, this->Lhs().m_tags[this->Lhs().StateIdx(states[i].first)] | (this->Rhs().m_tags[this->Rhs().StateIdx(states[i].second)] << 3));
- }
-
- void Connect(size_t from, size_t to, Char letter)
- {
- this->Sc().SetJump(from, letter, to,
- Action(this->Lhs(), States[from].first, letter) | (Action(this->Rhs(), States[from].second, letter) << this->Lhs().RegexpsCount()));
- }
+ using typename ScannerGlueCommon<Scanner>::State;
+ using TAction = typename Scanner::Action;
+ using InternalState = typename Scanner::InternalState;
+ typedef TMap<State, size_t> InvStates;
+
+ CountingScannerGlueTask(const Scanner& lhs, const Scanner& rhs)
+ : ScannerGlueCommon<Scanner>(lhs, rhs, LettersEquality<Scanner>(lhs.m_letters, rhs.m_letters))
+ {
+ }
+
+ void AcceptStates(const TVector<State>& states)
+ {
+ States = states;
+ this->SetSc(THolder<Scanner>(new Scanner));
+ this->Sc().Init(states.size(), this->Letters(), 0, this->Lhs().RegexpsCount() + this->Rhs().RegexpsCount());
+
+ for (size_t i = 0; i < states.size(); ++i)
+ this->Sc().SetTag(i, this->Lhs().m_tags[this->Lhs().StateIdx(states[i].first)] | (this->Rhs().m_tags[this->Rhs().StateIdx(states[i].second)] << 3));
+ }
+
+ void Connect(size_t from, size_t to, Char letter)
+ {
+ this->Sc().SetJump(from, letter, to,
+ Action(this->Lhs(), States[from].first, letter) | (Action(this->Rhs(), States[from].second, letter) << this->Lhs().RegexpsCount()));
+ }
protected:
- TVector<State> States;
- TAction Action(const Scanner& sc, InternalState state, Char letter) const
- {
- size_t state_index = sc.StateIdx(state);
- size_t transition_index = sc.TransitionIndex(state_index, letter);
- const auto& tr = sc.m_jumps[transition_index];
- return tr.action;
- }
+ TVector<State> States;
+ TAction Action(const Scanner& sc, InternalState state, Char letter) const
+ {
+ size_t state_index = sc.StateIdx(state);
+ size_t transition_index = sc.TransitionIndex(state_index, letter);
+ const auto& tr = sc.m_jumps[transition_index];
+ return tr.action;
+ }
};
class NoGlueLimitCountingScannerGlueTask : public CountingScannerGlueTask<NoGlueLimitCountingScanner> {
public:
- using ActionIndex = NoGlueLimitCountingScanner::ActionIndex;
- struct TGlueAction {
- TVector<ActionIndex> resets;
- TVector<ActionIndex> increments;
- bool operator<(const TGlueAction& rhs) const {
- return std::tie(increments, resets) < std::tie(rhs.increments, rhs.resets);
- }
- };
- using TGlueMap = TMap<TGlueAction, ActionIndex>;
-
- NoGlueLimitCountingScannerGlueTask(const NoGlueLimitCountingScanner& lhs, const NoGlueLimitCountingScanner& rhs)
- : CountingScannerGlueTask(lhs, rhs)
- {
- }
-
- void Connect(size_t from, size_t to, Char letter)
- {
- TGlueAction glue_action;
- this->Lhs().GetActions(Action(this->Lhs(), States[from].first, letter), 0,
- std::back_inserter(glue_action.resets), std::back_inserter(glue_action.increments));
- this->Rhs().GetActions(Action(this->Rhs(), States[from].second, letter), this->Lhs().RegexpsCount(),
- std::back_inserter(glue_action.resets), std::back_inserter(glue_action.increments));
- Y_ASSERT(
- std::is_sorted(glue_action.increments.begin(), glue_action.increments.end()) &&
- std::is_sorted(glue_action.resets.begin(), glue_action.resets.end())
- );
-
- if (glue_action.increments.empty() && glue_action.resets.empty()) {
- this->Sc().SetJump(from, letter, to, 0);
- return;
- }
-
- auto action_iter = glue_map_.find(glue_action);
- if (action_iter == glue_map_.end()) {
- glue_map_[glue_action] = glue_actions_.size();
- for (const auto& ids : {glue_action.resets, glue_action.increments}) {
- glue_actions_.push_back(ids.size());
- std::copy(ids.begin(), ids.end(), std::back_inserter(glue_actions_));
- }
- }
-
- this->Sc().SetJump(from, letter, to, glue_map_[glue_action]);
- }
-
- // Return type is same as in parent class
- // TODO: Maybe return by value to use move semantic?
- const NoGlueLimitCountingScanner& Success()
- {
- glue_actions_[0] = glue_actions_.size();
- Sc().AcceptActions(glue_actions_);
- return Sc();
- }
+ using ActionIndex = NoGlueLimitCountingScanner::ActionIndex;
+ struct TGlueAction {
+ TVector<ActionIndex> resets;
+ TVector<ActionIndex> increments;
+ bool operator<(const TGlueAction& rhs) const {
+ return std::tie(increments, resets) < std::tie(rhs.increments, rhs.resets);
+ }
+ };
+ using TGlueMap = TMap<TGlueAction, ActionIndex>;
+
+ NoGlueLimitCountingScannerGlueTask(const NoGlueLimitCountingScanner& lhs, const NoGlueLimitCountingScanner& rhs)
+ : CountingScannerGlueTask(lhs, rhs)
+ {
+ }
+
+ void Connect(size_t from, size_t to, Char letter)
+ {
+ TGlueAction glue_action;
+ this->Lhs().GetActions(Action(this->Lhs(), States[from].first, letter), 0,
+ std::back_inserter(glue_action.resets), std::back_inserter(glue_action.increments));
+ this->Rhs().GetActions(Action(this->Rhs(), States[from].second, letter), this->Lhs().RegexpsCount(),
+ std::back_inserter(glue_action.resets), std::back_inserter(glue_action.increments));
+ Y_ASSERT(
+ std::is_sorted(glue_action.increments.begin(), glue_action.increments.end()) &&
+ std::is_sorted(glue_action.resets.begin(), glue_action.resets.end())
+ );
+
+ if (glue_action.increments.empty() && glue_action.resets.empty()) {
+ this->Sc().SetJump(from, letter, to, 0);
+ return;
+ }
+
+ auto action_iter = glue_map_.find(glue_action);
+ if (action_iter == glue_map_.end()) {
+ glue_map_[glue_action] = glue_actions_.size();
+ for (const auto& ids : {glue_action.resets, glue_action.increments}) {
+ glue_actions_.push_back(ids.size());
+ std::copy(ids.begin(), ids.end(), std::back_inserter(glue_actions_));
+ }
+ }
+
+ this->Sc().SetJump(from, letter, to, glue_map_[glue_action]);
+ }
+
+ // Return type is same as in parent class
+ // TODO: Maybe return by value to use move semantic?
+ const NoGlueLimitCountingScanner& Success()
+ {
+ glue_actions_[0] = glue_actions_.size();
+ Sc().AcceptActions(glue_actions_);
+ return Sc();
+ }
private:
- TGlueMap glue_map_;
- TVector<ActionIndex> glue_actions_ = {1};
+ TGlueMap glue_map_;
+ TVector<ActionIndex> glue_actions_ = {1};
};
@@ -984,85 +984,85 @@ private:
CountingScanner CountingScanner::Glue(const CountingScanner& lhs, const CountingScanner& rhs, size_t maxSize /* = 0 */)
{
- if (lhs.RegexpsCount() + rhs.RegexpsCount() > MAX_RE_COUNT) {
- return CountingScanner();
- }
- static constexpr size_t DefMaxSize = 250000;
- Impl::CountingScannerGlueTask<CountingScanner> task(lhs, rhs);
- return Impl::Determine(task, maxSize ? maxSize : DefMaxSize);
+ if (lhs.RegexpsCount() + rhs.RegexpsCount() > MAX_RE_COUNT) {
+ return CountingScanner();
+ }
+ static constexpr size_t DefMaxSize = 250000;
+ Impl::CountingScannerGlueTask<CountingScanner> task(lhs, rhs);
+ return Impl::Determine(task, maxSize ? maxSize : DefMaxSize);
}
AdvancedCountingScanner AdvancedCountingScanner::Glue(const AdvancedCountingScanner& lhs, const AdvancedCountingScanner& rhs, size_t maxSize /* = 0 */)
{
- if (lhs.RegexpsCount() + rhs.RegexpsCount() > MAX_RE_COUNT) {
- return AdvancedCountingScanner();
- }
- static constexpr size_t DefMaxSize = 250000;
- Impl::CountingScannerGlueTask<AdvancedCountingScanner> task(lhs, rhs);
- return Impl::Determine(task, maxSize ? maxSize : DefMaxSize);
+ if (lhs.RegexpsCount() + rhs.RegexpsCount() > MAX_RE_COUNT) {
+ return AdvancedCountingScanner();
+ }
+ static constexpr size_t DefMaxSize = 250000;
+ Impl::CountingScannerGlueTask<AdvancedCountingScanner> task(lhs, rhs);
+ return Impl::Determine(task, maxSize ? maxSize : DefMaxSize);
}
NoGlueLimitCountingScanner NoGlueLimitCountingScanner::Glue(const NoGlueLimitCountingScanner& lhs, const NoGlueLimitCountingScanner& rhs, size_t maxSize /* = 0 */)
{
- static constexpr size_t DefMaxSize = 250000;
- Impl::NoGlueLimitCountingScannerGlueTask task(lhs, rhs);
- return Impl::Determine(task, maxSize ? maxSize : DefMaxSize);
+ static constexpr size_t DefMaxSize = 250000;
+ Impl::NoGlueLimitCountingScannerGlueTask task(lhs, rhs);
+ return Impl::Determine(task, maxSize ? maxSize : DefMaxSize);
}
// Should Save(), Load() and Mmap() functions return stream/pointer in aligned state?
// Now they don't because tests don't require it.
void NoGlueLimitCountingScanner::Save(yostream* s) const {
- Y_ASSERT(!AdvancedScannerCompatibilityMode);
- LoadedScanner::Save(s, ScannerIOTypes::NoGlueLimitCountingScanner);
- if (Actions) {
- SavePodArray(s, Actions, *Actions);
- } else {
- const ActionIndex zeroSize = 0;
- SavePodType(s, zeroSize);
- }
+ Y_ASSERT(!AdvancedScannerCompatibilityMode);
+ LoadedScanner::Save(s, ScannerIOTypes::NoGlueLimitCountingScanner);
+ if (Actions) {
+ SavePodArray(s, Actions, *Actions);
+ } else {
+ const ActionIndex zeroSize = 0;
+ SavePodType(s, zeroSize);
+ }
}
void NoGlueLimitCountingScanner::Load(yistream* s) {
- ui32 type;
- LoadedScanner::Load(s, &type);
- ActionIndex actionsSize;
- if (type == ScannerIOTypes::NoGlueLimitCountingScanner) {
- LoadPodType(s, actionsSize);
-
- if (actionsSize == 0) {
- ActionsBuffer.reset();
- Actions = nullptr;
- } else {
- ActionsBuffer = TActionsBuffer(new ActionIndex[actionsSize]);
- ActionsBuffer[0] = actionsSize;
- LoadPodArray(s, &ActionsBuffer[1], actionsSize - 1);
- Actions = ActionsBuffer.get();
- }
- } else {
- Y_ASSERT(type == ScannerIOTypes::LoadedScanner);
- AdvancedScannerCompatibilityMode = true;
- }
+ ui32 type;
+ LoadedScanner::Load(s, &type);
+ ActionIndex actionsSize;
+ if (type == ScannerIOTypes::NoGlueLimitCountingScanner) {
+ LoadPodType(s, actionsSize);
+
+ if (actionsSize == 0) {
+ ActionsBuffer.reset();
+ Actions = nullptr;
+ } else {
+ ActionsBuffer = TActionsBuffer(new ActionIndex[actionsSize]);
+ ActionsBuffer[0] = actionsSize;
+ LoadPodArray(s, &ActionsBuffer[1], actionsSize - 1);
+ Actions = ActionsBuffer.get();
+ }
+ } else {
+ Y_ASSERT(type == ScannerIOTypes::LoadedScanner);
+ AdvancedScannerCompatibilityMode = true;
+ }
}
const void* NoGlueLimitCountingScanner::Mmap(const void* ptr, size_t size) {
- NoGlueLimitCountingScanner scanner;
- ui32 type;
- auto p = static_cast<const size_t*> (scanner.LoadedScanner::Mmap(ptr, size, &type));
-
- if (type == ScannerIOTypes::NoGlueLimitCountingScanner) {
- scanner.Actions = reinterpret_cast<const ActionIndex*>(p);
- if (*scanner.Actions == 0) {
- scanner.Actions = nullptr;
- Impl::AdvancePtr(p, size, sizeof(ActionIndex));
- } else {
- Impl::AdvancePtr(p, size, *scanner.Actions * sizeof(ActionIndex));
- }
- } else {
- Y_ASSERT(type == ScannerIOTypes::LoadedScanner);
- scanner.AdvancedScannerCompatibilityMode = true;
- }
- Swap(scanner);
- return static_cast<const void*>(p);
+ NoGlueLimitCountingScanner scanner;
+ ui32 type;
+ auto p = static_cast<const size_t*> (scanner.LoadedScanner::Mmap(ptr, size, &type));
+
+ if (type == ScannerIOTypes::NoGlueLimitCountingScanner) {
+ scanner.Actions = reinterpret_cast<const ActionIndex*>(p);
+ if (*scanner.Actions == 0) {
+ scanner.Actions = nullptr;
+ Impl::AdvancePtr(p, size, sizeof(ActionIndex));
+ } else {
+ Impl::AdvancePtr(p, size, *scanner.Actions * sizeof(ActionIndex));
+ }
+ } else {
+ Y_ASSERT(type == ScannerIOTypes::LoadedScanner);
+ scanner.AdvancedScannerCompatibilityMode = true;
+ }
+ Swap(scanner);
+ return static_cast<const void*>(p);
}
}
diff --git a/library/cpp/regex/pire/pire/extra/count.h b/library/cpp/regex/pire/pire/extra/count.h
index 9032a7054a6..53ef98c8dd2 100644
--- a/library/cpp/regex/pire/pire/extra/count.h
+++ b/library/cpp/regex/pire/pire/extra/count.h
@@ -33,81 +33,81 @@ namespace Pire {
class Fsm;
namespace Impl {
- template<class T>
- class ScannerGlueCommon;
+ template<class T>
+ class ScannerGlueCommon;
- template<class T>
- class CountingScannerGlueTask;
+ template<class T>
+ class CountingScannerGlueTask;
- class NoGlueLimitCountingScannerGlueTask;
+ class NoGlueLimitCountingScannerGlueTask;
- template <class AdvancedScanner>
- AdvancedScanner MakeAdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple);
+ template <class AdvancedScanner>
+ AdvancedScanner MakeAdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple);
};
template<size_t I>
class IncrementPerformer {
public:
- template<typename State, typename Action>
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- static void Do(State& s, Action mask)
- {
- if (mask & (1 << (I - 1))) {
- Increment(s);
- }
- IncrementPerformer<I - 1>::Do(s, mask);
- }
+ template<typename State, typename Action>
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ static void Do(State& s, Action mask)
+ {
+ if (mask & (1 << (I - 1))) {
+ Increment(s);
+ }
+ IncrementPerformer<I - 1>::Do(s, mask);
+ }
private:
- template<typename State>
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- static void Increment(State& s)
- {
- ++s.m_current[I - 1];
- }
+ template<typename State>
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ static void Increment(State& s)
+ {
+ ++s.m_current[I - 1];
+ }
};
template<>
class IncrementPerformer<0> {
public:
- template<typename State, typename Action>
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- static void Do(State&, Action)
- {
- }
+ template<typename State, typename Action>
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ static void Do(State&, Action)
+ {
+ }
};
template<size_t I>
class ResetPerformer {
public:
- template<typename State, typename Action>
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- static void Do(State& s, Action mask)
- {
- if (mask & (1 << (LoadedScanner::MAX_RE_COUNT + (I - 1))) && s.m_current[I - 1]) {
- Reset(s);
- }
- ResetPerformer<I - 1>::Do(s, mask);
- }
+ template<typename State, typename Action>
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ static void Do(State& s, Action mask)
+ {
+ if (mask & (1 << (LoadedScanner::MAX_RE_COUNT + (I - 1))) && s.m_current[I - 1]) {
+ Reset(s);
+ }
+ ResetPerformer<I - 1>::Do(s, mask);
+ }
private:
- template<typename State>
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- static void Reset(State& s)
- {
- s.m_total[I - 1] = ymax(s.m_total[I - 1], s.m_current[I - 1]);
- s.m_current[I - 1] = 0;
- }
+ template<typename State>
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ static void Reset(State& s)
+ {
+ s.m_total[I - 1] = ymax(s.m_total[I - 1], s.m_current[I - 1]);
+ s.m_current[I - 1] = 0;
+ }
};
template<>
class ResetPerformer<0> {
public:
- template<typename State, typename Action>
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- static void Do(State&, Action)
- {
- }
+ template<typename State, typename Action>
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ static void Do(State&, Action)
+ {
+ }
};
/**
@@ -118,385 +118,385 @@ public:
template<class DerivedScanner, class State>
class BaseCountingScanner: public LoadedScanner {
public:
- enum {
- IncrementAction = 1,
- ResetAction = 2,
+ enum {
+ IncrementAction = 1,
+ ResetAction = 2,
- FinalFlag = 0,
- DeadFlag = 1,
- };
+ FinalFlag = 0,
+ DeadFlag = 1,
+ };
- void Initialize(State& state) const
- {
- state.m_state = m.initial;
- memset(&state.m_current, 0, sizeof(state.m_current));
- memset(&state.m_total, 0, sizeof(state.m_total));
- state.m_updatedMask = 0;
- }
+ void Initialize(State& state) const
+ {
+ state.m_state = m.initial;
+ memset(&state.m_current, 0, sizeof(state.m_current));
+ memset(&state.m_total, 0, sizeof(state.m_total));
+ state.m_updatedMask = 0;
+ }
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- void TakeAction(State& s, Action a) const
- {
- static_cast<const DerivedScanner*>(this)->template TakeActionImpl<MAX_RE_COUNT>(s, a);
- }
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ void TakeAction(State& s, Action a) const
+ {
+ static_cast<const DerivedScanner*>(this)->template TakeActionImpl<MAX_RE_COUNT>(s, a);
+ }
- bool CanStop(const State&) const { return false; }
+ bool CanStop(const State&) const { return false; }
- Char Translate(Char ch) const
- {
- return m_letters[static_cast<size_t>(ch)];
- }
+ Char Translate(Char ch) const
+ {
+ return m_letters[static_cast<size_t>(ch)];
+ }
- Action NextTranslated(State& s, Char c) const
- {
- Transition x = reinterpret_cast<const Transition*>(s.m_state)[c];
- s.m_state += SignExtend(x.shift);
- return x.action;
- }
+ Action NextTranslated(State& s, Char c) const
+ {
+ Transition x = reinterpret_cast<const Transition*>(s.m_state)[c];
+ s.m_state += SignExtend(x.shift);
+ return x.action;
+ }
- Action Next(State& s, Char c) const
- {
- return NextTranslated(s, Translate(c));
- }
+ Action Next(State& s, Char c) const
+ {
+ return NextTranslated(s, Translate(c));
+ }
- Action Next(const State& current, State& n, Char c) const
- {
- n = current;
- return Next(n, c);
- }
+ Action Next(const State& current, State& n, Char c) const
+ {
+ n = current;
+ return Next(n, c);
+ }
- bool Final(const State& /*state*/) const { return false; }
+ bool Final(const State& /*state*/) const { return false; }
- bool Dead(const State&) const { return false; }
+ bool Dead(const State&) const { return false; }
- using LoadedScanner::Swap;
+ using LoadedScanner::Swap;
- size_t StateIndex(const State& s) const { return StateIdx(s.m_state); }
+ size_t StateIndex(const State& s) const { return StateIdx(s.m_state); }
protected:
- using LoadedScanner::Init;
- using LoadedScanner::InternalState;
-
- template<size_t ActualReCount>
- void PerformIncrement(State& s, Action mask) const
- {
- if (mask) {
- IncrementPerformer<ActualReCount>::Do(s, mask);
- s.m_updatedMask |= ((size_t)mask) << MAX_RE_COUNT;
- }
- }
-
- template<size_t ActualReCount>
- void PerformReset(State& s, Action mask) const
- {
- mask &= s.m_updatedMask;
- if (mask) {
- ResetPerformer<ActualReCount>::Do(s, mask);
- s.m_updatedMask &= (Action)~mask;
- }
- }
-
- void Next(InternalState& s, Char c) const
- {
- Transition x = reinterpret_cast<const Transition*>(s)[Translate(c)];
- s += SignExtend(x.shift);
- }
+ using LoadedScanner::Init;
+ using LoadedScanner::InternalState;
+
+ template<size_t ActualReCount>
+ void PerformIncrement(State& s, Action mask) const
+ {
+ if (mask) {
+ IncrementPerformer<ActualReCount>::Do(s, mask);
+ s.m_updatedMask |= ((size_t)mask) << MAX_RE_COUNT;
+ }
+ }
+
+ template<size_t ActualReCount>
+ void PerformReset(State& s, Action mask) const
+ {
+ mask &= s.m_updatedMask;
+ if (mask) {
+ ResetPerformer<ActualReCount>::Do(s, mask);
+ s.m_updatedMask &= (Action)~mask;
+ }
+ }
+
+ void Next(InternalState& s, Char c) const
+ {
+ Transition x = reinterpret_cast<const Transition*>(s)[Translate(c)];
+ s += SignExtend(x.shift);
+ }
};
template <size_t MAX_RE_COUNT>
class CountingState {
public:
- size_t Result(int i) const { return ymax(m_current[i], m_total[i]); }
+ size_t Result(int i) const { return ymax(m_current[i], m_total[i]); }
private:
- using InternalState = LoadedScanner::InternalState;
- InternalState m_state;
- ui32 m_current[MAX_RE_COUNT];
- ui32 m_total[MAX_RE_COUNT];
- size_t m_updatedMask;
+ using InternalState = LoadedScanner::InternalState;
+ InternalState m_state;
+ ui32 m_current[MAX_RE_COUNT];
+ ui32 m_total[MAX_RE_COUNT];
+ size_t m_updatedMask;
- template <class DerivedScanner, class State>
- friend class BaseCountingScanner;
+ template <class DerivedScanner, class State>
+ friend class BaseCountingScanner;
- template<size_t I>
- friend class IncrementPerformer;
+ template<size_t I>
+ friend class IncrementPerformer;
- template<size_t I>
- friend class ResetPerformer;
+ template<size_t I>
+ friend class ResetPerformer;
#ifdef PIRE_DEBUG
- friend yostream& operator << (yostream& s, const State& state)
- {
- s << state.m_state << " ( ";
- for (size_t i = 0; i < MAX_RE_COUNT; ++i)
- s << state.m_current[i] << '/' << state.m_total[i] << ' ';
- return s << ')';
- }
+ friend yostream& operator << (yostream& s, const State& state)
+ {
+ s << state.m_state << " ( ";
+ for (size_t i = 0; i < MAX_RE_COUNT; ++i)
+ s << state.m_current[i] << '/' << state.m_total[i] << ' ';
+ return s << ')';
+ }
#endif
};
class CountingScanner : public BaseCountingScanner<CountingScanner, CountingState<LoadedScanner::MAX_RE_COUNT>> {
public:
- using State = CountingState<MAX_RE_COUNT>;
- enum {
- Matched = 2,
- };
-
- CountingScanner() {}
- CountingScanner(const Fsm& re, const Fsm& sep);
-
- static CountingScanner Glue(const CountingScanner& a, const CountingScanner& b, size_t maxSize = 0);
-
- template<size_t ActualReCount>
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- void TakeActionImpl(State& s, Action a) const
- {
- if (a & IncrementMask)
- PerformIncrement<ActualReCount>(s, a);
- if (a & ResetMask)
- PerformReset<ActualReCount>(s, a);
- }
+ using State = CountingState<MAX_RE_COUNT>;
+ enum {
+ Matched = 2,
+ };
+
+ CountingScanner() {}
+ CountingScanner(const Fsm& re, const Fsm& sep);
+
+ static CountingScanner Glue(const CountingScanner& a, const CountingScanner& b, size_t maxSize = 0);
+
+ template<size_t ActualReCount>
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ void TakeActionImpl(State& s, Action a) const
+ {
+ if (a & IncrementMask)
+ PerformIncrement<ActualReCount>(s, a);
+ if (a & ResetMask)
+ PerformReset<ActualReCount>(s, a);
+ }
private:
- Action RemapAction(Action action)
- {
- if (action == (Matched | DeadFlag))
- return 1;
- else if (action == DeadFlag)
- return 1 << MAX_RE_COUNT;
- else
- return 0;
- }
-
- friend void BuildScanner<CountingScanner>(const Fsm&, CountingScanner&);
- friend class Impl::ScannerGlueCommon<CountingScanner>;
- friend class Impl::CountingScannerGlueTask<CountingScanner>;
+ Action RemapAction(Action action)
+ {
+ if (action == (Matched | DeadFlag))
+ return 1;
+ else if (action == DeadFlag)
+ return 1 << MAX_RE_COUNT;
+ else
+ return 0;
+ }
+
+ friend void BuildScanner<CountingScanner>(const Fsm&, CountingScanner&);
+ friend class Impl::ScannerGlueCommon<CountingScanner>;
+ friend class Impl::CountingScannerGlueTask<CountingScanner>;
};
class AdvancedCountingScanner : public BaseCountingScanner<AdvancedCountingScanner, CountingState<LoadedScanner::MAX_RE_COUNT>> {
public:
- using State = CountingState<MAX_RE_COUNT>;
+ using State = CountingState<MAX_RE_COUNT>;
- AdvancedCountingScanner() {}
- AdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple = nullptr);
+ AdvancedCountingScanner() {}
+ AdvancedCountingScanner(const Fsm& re, const Fsm& sep, bool* simple = nullptr);
- static AdvancedCountingScanner Glue(const AdvancedCountingScanner& a, const AdvancedCountingScanner& b, size_t maxSize = 0);
+ static AdvancedCountingScanner Glue(const AdvancedCountingScanner& a, const AdvancedCountingScanner& b, size_t maxSize = 0);
- template<size_t ActualReCount>
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- void TakeActionImpl(State& s, Action a) const
- {
- if (a & ResetMask) {
- PerformReset<ActualReCount>(s, a);
- }
- if (a & IncrementMask) {
- PerformIncrement<ActualReCount>(s, a);
- }
- }
+ template<size_t ActualReCount>
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ void TakeActionImpl(State& s, Action a) const
+ {
+ if (a & ResetMask) {
+ PerformReset<ActualReCount>(s, a);
+ }
+ if (a & IncrementMask) {
+ PerformIncrement<ActualReCount>(s, a);
+ }
+ }
private:
- Action RemapAction(Action action)
- {
- Action result = 0;
- if (action & ResetAction) {
- result = 1 << MAX_RE_COUNT;
- }
- if (action & IncrementAction) {
- result |= 1;
- }
- return result;
- }
-
- friend class Impl::ScannerGlueCommon<AdvancedCountingScanner>;
- friend class Impl::CountingScannerGlueTask<AdvancedCountingScanner>;
- friend AdvancedCountingScanner Impl::MakeAdvancedCountingScanner<AdvancedCountingScanner>(const Fsm&, const Fsm&, bool*);
+ Action RemapAction(Action action)
+ {
+ Action result = 0;
+ if (action & ResetAction) {
+ result = 1 << MAX_RE_COUNT;
+ }
+ if (action & IncrementAction) {
+ result |= 1;
+ }
+ return result;
+ }
+
+ friend class Impl::ScannerGlueCommon<AdvancedCountingScanner>;
+ friend class Impl::CountingScannerGlueTask<AdvancedCountingScanner>;
+ friend AdvancedCountingScanner Impl::MakeAdvancedCountingScanner<AdvancedCountingScanner>(const Fsm&, const Fsm&, bool*);
};
class NoGlueLimitCountingState {
public:
- size_t Result(int i) const { return ymax(m_current[i], m_total[i]); }
+ size_t Result(int i) const { return ymax(m_current[i], m_total[i]); }
void Initialize(size_t initial, size_t regexpsCount) {
- m_state = initial;
- m_current.assign(regexpsCount, 0);
- m_total.assign(regexpsCount, 0);
- }
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- void Reset(size_t regexpId) {
- m_current[regexpId] = 0;
- }
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- void Increment(size_t regexp_id) {
- ++m_current[regexp_id];
- m_total[regexp_id] = ymax(m_total[regexp_id], m_current[regexp_id]);
- }
-
- template<size_t I>
- friend class IncrementPerformer;
-
- template<size_t I>
- friend class ResetPerformer;
+ m_state = initial;
+ m_current.assign(regexpsCount, 0);
+ m_total.assign(regexpsCount, 0);
+ }
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ void Reset(size_t regexpId) {
+ m_current[regexpId] = 0;
+ }
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ void Increment(size_t regexp_id) {
+ ++m_current[regexp_id];
+ m_total[regexp_id] = ymax(m_total[regexp_id], m_current[regexp_id]);
+ }
+
+ template<size_t I>
+ friend class IncrementPerformer;
+
+ template<size_t I>
+ friend class ResetPerformer;
private:
- LoadedScanner::InternalState m_state;
- TVector<ui32> m_current;
- TVector<ui32> m_total;
+ LoadedScanner::InternalState m_state;
+ TVector<ui32> m_current;
+ TVector<ui32> m_total;
- template <class DerivedScanner, class State>
- friend class BaseCountingScanner;
+ template <class DerivedScanner, class State>
+ friend class BaseCountingScanner;
#ifdef PIRE_DEBUG
- yostream& operator << (yostream& s, const State& state)
- {
- s << state.m_state << " ( ";
- for (size_t i = 0; i < state.m_current.size(); ++i)
- s << state.m_current[i] << '/' << state.m_total[i] << ' ';
- return s << ')';
- }
+ yostream& operator << (yostream& s, const State& state)
+ {
+ s << state.m_state << " ( ";
+ for (size_t i = 0; i < state.m_current.size(); ++i)
+ s << state.m_current[i] << '/' << state.m_total[i] << ' ';
+ return s << ')';
+ }
#endif
};
class NoGlueLimitCountingScanner : public BaseCountingScanner<NoGlueLimitCountingScanner, NoGlueLimitCountingState> {
public:
- using State = NoGlueLimitCountingState;
- using ActionIndex = ui32;
- using TActionsBuffer = std::unique_ptr<ActionIndex[]>;
+ using State = NoGlueLimitCountingState;
+ using ActionIndex = ui32;
+ using TActionsBuffer = std::unique_ptr<ActionIndex[]>;
private:
- TActionsBuffer ActionsBuffer;
- const ActionIndex* Actions = nullptr;
- bool AdvancedScannerCompatibilityMode = false;
+ TActionsBuffer ActionsBuffer;
+ const ActionIndex* Actions = nullptr;
+ bool AdvancedScannerCompatibilityMode = false;
public:
- NoGlueLimitCountingScanner() = default;
- NoGlueLimitCountingScanner(const Fsm& re, const Fsm& sep, bool* simple = nullptr);
- NoGlueLimitCountingScanner(const NoGlueLimitCountingScanner& rhs)
- : BaseCountingScanner(rhs)
- , AdvancedScannerCompatibilityMode(rhs.AdvancedScannerCompatibilityMode)
- {
- if (rhs.ActionsBuffer) {
- Y_ASSERT(rhs.Actions);
- ActionsBuffer = TActionsBuffer(new ActionIndex [*rhs.Actions]);
- std::copy_n(rhs.ActionsBuffer.get(), *rhs.Actions, ActionsBuffer.get());
- Actions = ActionsBuffer.get();
- } else {
- Actions = rhs.Actions;
- }
- }
-
- NoGlueLimitCountingScanner(NoGlueLimitCountingScanner&& other) : BaseCountingScanner() {
- Swap(other);
- }
-
- NoGlueLimitCountingScanner& operator=(NoGlueLimitCountingScanner rhs) {
- Swap(rhs);
- return *this;
- }
-
- void Swap(NoGlueLimitCountingScanner& s) {
- LoadedScanner::Swap(s);
- DoSwap(ActionsBuffer, s.ActionsBuffer);
- DoSwap(Actions, s.Actions);
- DoSwap(AdvancedScannerCompatibilityMode, s.AdvancedScannerCompatibilityMode);
- }
-
- void Initialize(State& state) const
- {
- state.Initialize(m.initial, RegexpsCount());
- }
-
- template <size_t ActualReCount>
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- void TakeActionImpl(State& s, Action a) const
- {
- if (!a) {
- return;
- }
- if (AdvancedScannerCompatibilityMode) {
- AdvancedScannerTakeActionImpl<ActualReCount>(s, a);
- return;
- }
- // Note: it's important to perform resets before increments,
- // as it's possible for one repetition group to stop and another begin at the same symbol
- if (Actions) {
- auto action = Actions + a;
- for (auto reset_count = *action++; reset_count--;) {
- s.Reset(*action++);
- }
- for (auto inc_count = *action++; inc_count--;) {
- s.Increment(*action++);
- }
- } else {
- Y_ASSERT(RegexpsCount() == 1);
- if (a & ResetAction) {
- s.Reset(0);
- }
- if (a & IncrementAction) {
- s.Increment(0);
- }
- }
- }
-
- void Save(yostream* s) const;
-
- void Load(yistream* s);
-
- const void* Mmap(const void* ptr, size_t size);
-
- static NoGlueLimitCountingScanner Glue(const NoGlueLimitCountingScanner& a, const NoGlueLimitCountingScanner& b, size_t maxSize = 0);
+ NoGlueLimitCountingScanner() = default;
+ NoGlueLimitCountingScanner(const Fsm& re, const Fsm& sep, bool* simple = nullptr);
+ NoGlueLimitCountingScanner(const NoGlueLimitCountingScanner& rhs)
+ : BaseCountingScanner(rhs)
+ , AdvancedScannerCompatibilityMode(rhs.AdvancedScannerCompatibilityMode)
+ {
+ if (rhs.ActionsBuffer) {
+ Y_ASSERT(rhs.Actions);
+ ActionsBuffer = TActionsBuffer(new ActionIndex [*rhs.Actions]);
+ std::copy_n(rhs.ActionsBuffer.get(), *rhs.Actions, ActionsBuffer.get());
+ Actions = ActionsBuffer.get();
+ } else {
+ Actions = rhs.Actions;
+ }
+ }
+
+ NoGlueLimitCountingScanner(NoGlueLimitCountingScanner&& other) : BaseCountingScanner() {
+ Swap(other);
+ }
+
+ NoGlueLimitCountingScanner& operator=(NoGlueLimitCountingScanner rhs) {
+ Swap(rhs);
+ return *this;
+ }
+
+ void Swap(NoGlueLimitCountingScanner& s) {
+ LoadedScanner::Swap(s);
+ DoSwap(ActionsBuffer, s.ActionsBuffer);
+ DoSwap(Actions, s.Actions);
+ DoSwap(AdvancedScannerCompatibilityMode, s.AdvancedScannerCompatibilityMode);
+ }
+
+ void Initialize(State& state) const
+ {
+ state.Initialize(m.initial, RegexpsCount());
+ }
+
+ template <size_t ActualReCount>
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ void TakeActionImpl(State& s, Action a) const
+ {
+ if (!a) {
+ return;
+ }
+ if (AdvancedScannerCompatibilityMode) {
+ AdvancedScannerTakeActionImpl<ActualReCount>(s, a);
+ return;
+ }
+ // Note: it's important to perform resets before increments,
+ // as it's possible for one repetition group to stop and another begin at the same symbol
+ if (Actions) {
+ auto action = Actions + a;
+ for (auto reset_count = *action++; reset_count--;) {
+ s.Reset(*action++);
+ }
+ for (auto inc_count = *action++; inc_count--;) {
+ s.Increment(*action++);
+ }
+ } else {
+ Y_ASSERT(RegexpsCount() == 1);
+ if (a & ResetAction) {
+ s.Reset(0);
+ }
+ if (a & IncrementAction) {
+ s.Increment(0);
+ }
+ }
+ }
+
+ void Save(yostream* s) const;
+
+ void Load(yistream* s);
+
+ const void* Mmap(const void* ptr, size_t size);
+
+ static NoGlueLimitCountingScanner Glue(const NoGlueLimitCountingScanner& a, const NoGlueLimitCountingScanner& b, size_t maxSize = 0);
private:
- Action RemapAction(Action action)
- {
- return action;
- }
-
- template <class Iterator>
- void GetActions(Action a, ActionIndex id_shift, Iterator output_resets, Iterator output_increments) const {
- if (!a) {
- return;
- }
- if (!Actions) {
- if (a & ResetAction) {
- *output_resets++ = id_shift;
- }
- if (a & NoGlueLimitCountingScanner::IncrementAction) {
- *output_increments++ = id_shift;
- }
- return;
- }
- auto action = Actions + a;
- for (auto output : {output_resets, output_increments}) {
- for (auto count = *action++; count--;) {
- *output++ = *action++ + id_shift;
- }
- }
- }
-
- void AcceptActions(const TVector<ActionIndex>& actions) {
- Y_ASSERT(!Actions);
- Y_ASSERT(!actions.empty());
- Y_ASSERT(actions[0] == actions.size());
-
- ActionsBuffer = TActionsBuffer(new ActionIndex[actions.size()]);
- std::copy(actions.begin(), actions.end(), ActionsBuffer.get());
- Actions = ActionsBuffer.get();
- }
-
- template <size_t ActualReCount>
- void AdvancedScannerTakeActionImpl(State& s, Action a) const {
- if (a & ResetMask) {
- ResetPerformer<ActualReCount>::Do(s, a);
- }
- if (a & IncrementMask) {
- IncrementPerformer<ActualReCount>::Do(s, a);
- }
- }
-
- friend class Impl::ScannerGlueCommon<NoGlueLimitCountingScanner>;
- friend class Impl::CountingScannerGlueTask<NoGlueLimitCountingScanner>;
- friend class Impl::NoGlueLimitCountingScannerGlueTask;
- friend NoGlueLimitCountingScanner Impl::MakeAdvancedCountingScanner<NoGlueLimitCountingScanner>(const Fsm&, const Fsm&, bool*);
+ Action RemapAction(Action action)
+ {
+ return action;
+ }
+
+ template <class Iterator>
+ void GetActions(Action a, ActionIndex id_shift, Iterator output_resets, Iterator output_increments) const {
+ if (!a) {
+ return;
+ }
+ if (!Actions) {
+ if (a & ResetAction) {
+ *output_resets++ = id_shift;
+ }
+ if (a & NoGlueLimitCountingScanner::IncrementAction) {
+ *output_increments++ = id_shift;
+ }
+ return;
+ }
+ auto action = Actions + a;
+ for (auto output : {output_resets, output_increments}) {
+ for (auto count = *action++; count--;) {
+ *output++ = *action++ + id_shift;
+ }
+ }
+ }
+
+ void AcceptActions(const TVector<ActionIndex>& actions) {
+ Y_ASSERT(!Actions);
+ Y_ASSERT(!actions.empty());
+ Y_ASSERT(actions[0] == actions.size());
+
+ ActionsBuffer = TActionsBuffer(new ActionIndex[actions.size()]);
+ std::copy(actions.begin(), actions.end(), ActionsBuffer.get());
+ Actions = ActionsBuffer.get();
+ }
+
+ template <size_t ActualReCount>
+ void AdvancedScannerTakeActionImpl(State& s, Action a) const {
+ if (a & ResetMask) {
+ ResetPerformer<ActualReCount>::Do(s, a);
+ }
+ if (a & IncrementMask) {
+ IncrementPerformer<ActualReCount>::Do(s, a);
+ }
+ }
+
+ friend class Impl::ScannerGlueCommon<NoGlueLimitCountingScanner>;
+ friend class Impl::CountingScannerGlueTask<NoGlueLimitCountingScanner>;
+ friend class Impl::NoGlueLimitCountingScannerGlueTask;
+ friend NoGlueLimitCountingScanner Impl::MakeAdvancedCountingScanner<NoGlueLimitCountingScanner>(const Fsm&, const Fsm&, bool*);
};
}
diff --git a/library/cpp/regex/pire/pire/extra/glyphs.cpp b/library/cpp/regex/pire/pire/extra/glyphs.cpp
index fc16e5dae78..e44872cad7c 100644
--- a/library/cpp/regex/pire/pire/extra/glyphs.cpp
+++ b/library/cpp/regex/pire/pire/extra/glyphs.cpp
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -38,106 +38,106 @@ namespace Pire {
namespace {
- /*
- * A class providing a function which returns a character
- * whose glyph resembles that of given char, if any;
- * otherwise returns given char itself.
- */
- class GlyphTable {
- private:
- TList< TVector<wchar32> > m_classes;
- TMap<wchar32, TVector<wchar32>*> m_map;
-
- struct GlyphClass {
- TVector<wchar32>* m_class;
- TMap<wchar32, TVector<wchar32>*> *m_map;
-
- GlyphClass& operator << (wchar32 c)
- {
- m_class->push_back(c);
- m_map->insert(ymake_pair(c, m_class));
- return *this;
- }
- };
-
- GlyphClass Class()
- {
- GlyphClass cl;
- m_classes.push_back(TVector<wchar32>());
- cl.m_class = &m_classes.back();
- cl.m_map = &m_map;
- return cl;
- }
-
- public:
-
- const TVector<wchar32>& Klass(wchar32 x) const
- {
- TMap<wchar32, TVector<wchar32>*>::const_iterator i = m_map.find(x);
- if (i != m_map.end())
- return *i->second;
- else
- return DefaultValue< TVector<wchar32> >();
- }
-
- GlyphTable()
- {
- Class() << 'A' << 0x0410;
- Class() << 'B' << 0x0412;
- Class() << 'C' << 0x0421;
- Class() << 'E' << 0x0415 << 0x0401;
- Class() << 'H' << 0x041D;
- Class() << 'K' << 0x041A;
- Class() << 'M' << 0x041C;
- Class() << 'O' << 0x041E;
- Class() << 'P' << 0x0420;
- Class() << 'T' << 0x0422;
- Class() << 'X' << 0x0425;
-
- Class() << 'a' << 0x0430;
- Class() << 'c' << 0x0441;
- Class() << 'e' << 0x0435 << 0x0451;
- Class() << 'm' << 0x0442;
- Class() << 'o' << 0x043E;
- Class() << 'p' << 0x0440;
- Class() << 'u' << 0x0438;
- Class() << 'x' << 0x0445;
- Class() << 'y' << 0x0443;
- }
- };
-
- class GlueSimilarGlyphsImpl: public Feature {
- public:
- GlueSimilarGlyphsImpl(): m_table(Singleton<GlyphTable>()) {}
- int Priority() const { return 9; }
-
- void Alter(Term& t)
- {
- if (t.Value().IsA<Term::CharacterRange>()) {
- const Term::CharacterRange& range = t.Value().As<Term::CharacterRange>();
- typedef Term::CharacterRange::first_type CharSet;
- const CharSet& old = range.first;
- CharSet altered;
- for (auto&& i : old) {
- const TVector<wchar32>* klass = 0;
- if (i.size() == 1 && !(klass = &m_table->Klass(i[0]))->empty())
- for (auto&& j : *klass)
- altered.insert(Term::String(1, j));
- else
- altered.insert(i);
- }
-
- t = Term(t.Type(), Term::CharacterRange(altered, range.second));
- }
- }
-
- private:
- GlyphTable* m_table;
- };
+ /*
+ * A class providing a function which returns a character
+ * whose glyph resembles that of given char, if any;
+ * otherwise returns given char itself.
+ */
+ class GlyphTable {
+ private:
+ TList< TVector<wchar32> > m_classes;
+ TMap<wchar32, TVector<wchar32>*> m_map;
+
+ struct GlyphClass {
+ TVector<wchar32>* m_class;
+ TMap<wchar32, TVector<wchar32>*> *m_map;
+
+ GlyphClass& operator << (wchar32 c)
+ {
+ m_class->push_back(c);
+ m_map->insert(ymake_pair(c, m_class));
+ return *this;
+ }
+ };
+
+ GlyphClass Class()
+ {
+ GlyphClass cl;
+ m_classes.push_back(TVector<wchar32>());
+ cl.m_class = &m_classes.back();
+ cl.m_map = &m_map;
+ return cl;
+ }
+
+ public:
+
+ const TVector<wchar32>& Klass(wchar32 x) const
+ {
+ TMap<wchar32, TVector<wchar32>*>::const_iterator i = m_map.find(x);
+ if (i != m_map.end())
+ return *i->second;
+ else
+ return DefaultValue< TVector<wchar32> >();
+ }
+
+ GlyphTable()
+ {
+ Class() << 'A' << 0x0410;
+ Class() << 'B' << 0x0412;
+ Class() << 'C' << 0x0421;
+ Class() << 'E' << 0x0415 << 0x0401;
+ Class() << 'H' << 0x041D;
+ Class() << 'K' << 0x041A;
+ Class() << 'M' << 0x041C;
+ Class() << 'O' << 0x041E;
+ Class() << 'P' << 0x0420;
+ Class() << 'T' << 0x0422;
+ Class() << 'X' << 0x0425;
+
+ Class() << 'a' << 0x0430;
+ Class() << 'c' << 0x0441;
+ Class() << 'e' << 0x0435 << 0x0451;
+ Class() << 'm' << 0x0442;
+ Class() << 'o' << 0x043E;
+ Class() << 'p' << 0x0440;
+ Class() << 'u' << 0x0438;
+ Class() << 'x' << 0x0445;
+ Class() << 'y' << 0x0443;
+ }
+ };
+
+ class GlueSimilarGlyphsImpl: public Feature {
+ public:
+ GlueSimilarGlyphsImpl(): m_table(Singleton<GlyphTable>()) {}
+ int Priority() const { return 9; }
+
+ void Alter(Term& t)
+ {
+ if (t.Value().IsA<Term::CharacterRange>()) {
+ const Term::CharacterRange& range = t.Value().As<Term::CharacterRange>();
+ typedef Term::CharacterRange::first_type CharSet;
+ const CharSet& old = range.first;
+ CharSet altered;
+ for (auto&& i : old) {
+ const TVector<wchar32>* klass = 0;
+ if (i.size() == 1 && !(klass = &m_table->Klass(i[0]))->empty())
+ for (auto&& j : *klass)
+ altered.insert(Term::String(1, j));
+ else
+ altered.insert(i);
+ }
+
+ t = Term(t.Type(), Term::CharacterRange(altered, range.second));
+ }
+ }
+
+ private:
+ GlyphTable* m_table;
+ };
}
namespace Features {
- Feature::Ptr GlueSimilarGlyphs() { return Feature::Ptr(new GlueSimilarGlyphsImpl); }
+ Feature::Ptr GlueSimilarGlyphs() { return Feature::Ptr(new GlueSimilarGlyphsImpl); }
}
}
diff --git a/library/cpp/regex/pire/pire/extra/glyphs.h b/library/cpp/regex/pire/pire/extra/glyphs.h
index 678b9e15c4a..57fb1ce37cd 100644
--- a/library/cpp/regex/pire/pire/extra/glyphs.h
+++ b/library/cpp/regex/pire/pire/extra/glyphs.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -29,12 +29,12 @@ namespace Pire {
class Feature;
namespace Features {
- /**
- * A feature which tells Pire not to distinguish latin
- * and cyrillic letters having identical shapes
- * (e.g. latin A and cyrillic A).
- */
- Feature::Ptr GlueSimilarGlyphs();
+ /**
+ * A feature which tells Pire not to distinguish latin
+ * and cyrillic letters having identical shapes
+ * (e.g. latin A and cyrillic A).
+ */
+ Feature::Ptr GlueSimilarGlyphs();
}
}
diff --git a/library/cpp/regex/pire/pire/fsm.cpp b/library/cpp/regex/pire/pire/fsm.cpp
index 24b4a9ab086..69956438178 100644
--- a/library/cpp/regex/pire/pire/fsm.cpp
+++ b/library/cpp/regex/pire/pire/fsm.cpp
@@ -45,250 +45,250 @@ namespace Pire {
ystring CharDump(Char c)
{
- char buf[8];
- if (c == '"')
- return ystring("\\\"");
- else if (c == '[' || c == ']' || c == '-' || c == '^') {
- snprintf(buf, sizeof(buf)-1, "\\\\%c", c);
- return ystring(buf);
- } else if (c >= 32 && c < 127)
- return ystring(1, static_cast<char>(c));
- else if (c == '\n')
- return ystring("\\\\n");
- else if (c == '\t')
- return ystring("\\\\t");
- else if (c == '\r')
- return ystring("\\\\r");
- else if (c < 256) {
- snprintf(buf, sizeof(buf)-1, "\\\\%03o", static_cast<int>(c));
- return ystring(buf);
- } else if (c == Epsilon)
- return ystring("<Epsilon>");
- else if (c == BeginMark)
- return ystring("<Begin>");
- else if (c == EndMark)
- return ystring("<End>");
- else
- return ystring("<?" "?" "?>");
+ char buf[8];
+ if (c == '"')
+ return ystring("\\\"");
+ else if (c == '[' || c == ']' || c == '-' || c == '^') {
+ snprintf(buf, sizeof(buf)-1, "\\\\%c", c);
+ return ystring(buf);
+ } else if (c >= 32 && c < 127)
+ return ystring(1, static_cast<char>(c));
+ else if (c == '\n')
+ return ystring("\\\\n");
+ else if (c == '\t')
+ return ystring("\\\\t");
+ else if (c == '\r')
+ return ystring("\\\\r");
+ else if (c < 256) {
+ snprintf(buf, sizeof(buf)-1, "\\\\%03o", static_cast<int>(c));
+ return ystring(buf);
+ } else if (c == Epsilon)
+ return ystring("<Epsilon>");
+ else if (c == BeginMark)
+ return ystring("<Begin>");
+ else if (c == EndMark)
+ return ystring("<End>");
+ else
+ return ystring("<?" "?" "?>");
}
void Fsm::DumpState(yostream& s, size_t state) const
{
- // Fill in a 'row': Q -> exp(V) (for current state)
- TVector< ybitset<MaxChar> > row(Size());
- for (auto&& transition : m_transitions[state])
- for (auto&& transitionState : transition.second) {
- if (transitionState >= Size()) {
- std::cerr << "WTF?! Transition from " << state << " on letter " << transition.first << " leads to non-existing state " << transitionState << "\n";
- Y_ASSERT(false);
- }
- if (Letters().Contains(transition.first)) {
- const TVector<Char>& letters = Letters().Klass(Letters().Representative(transition.first));
- for (auto&& letter : letters)
- row[transitionState].set(letter);
- } else
- row[transitionState].set(transition.first);
- }
-
- bool statePrinted = false;
- // Display each destination state
- for (auto rit = row.begin(), rie = row.end(); rit != rie; ++rit) {
- unsigned begin = 0, end = 0;
-
- ystring delimiter;
- ystring label;
- if (rit->test(Epsilon)) {
- label += delimiter + CharDump(Epsilon);
- delimiter = " ";
- }
- if (rit->test(BeginMark)) {
- label += delimiter + CharDump(BeginMark);
- delimiter = " ";
- }
- if (rit->test(EndMark)) {
- label += delimiter + CharDump(EndMark);
- delimiter = " ";
- }
- unsigned count = 0;
- for (unsigned i = 0; i < 256; ++i)
- if (rit->test(i))
- ++count;
- if (count != 0 && count != 256) {
- label += delimiter + "[";
- bool complementary = (count > 128);
- if (count > 128)
- label += "^";
- while (begin < 256) {
- for (begin = end; begin < 256 && (rit->test(begin) == complementary); ++begin)
- ;
- for (end = begin; end < 256 && (rit->test(end) == !complementary); ++end)
- ;
- if (begin + 1 == end) {
- label += CharDump(begin);
- delimiter = " ";
- } else if (begin != end) {
- label += CharDump(begin) + "-" + (CharDump(end-1));
- delimiter = " ";
- }
- }
- label += "]";
- delimiter = " ";
- } else if (count == 256) {
- label += delimiter + ".";
- delimiter = " ";
- }
- if (!label.empty()) {
- if (!statePrinted) {
- s << " " << state << "[shape=\"" << (IsFinal(state) ? "double" : "") << "circle\",label=\"" << state;
- auto ti = tags.find(state);
- if (ti != tags.end())
- s << " (tags: " << ti->second << ")";
- s << "\"]\n";
- if (Initial() == state)
- s << " \"initial\" -> " << state << '\n';
- statePrinted = true;
- }
- s << " " << state << " -> " << std::distance(row.begin(), rit) << "[label=\"" << label;
-
- // Display outputs
- auto oit = outputs.find(state);
- if (oit != outputs.end()) {
- auto oit2 = oit->second.find(std::distance(row.begin(), rit));
- if (oit2 == oit->second.end())
- ;
- else {
- TVector<int> payload;
- for (unsigned i = 0; i < sizeof(oit2->second) * 8; ++i)
- if (oit2->second & (1ul << i))
- payload.push_back(i);
- if (!payload.empty())
- s << " (outputs: " << Join(payload.begin(), payload.end(), ", ") << ")";
- }
- }
-
- s << "\"]\n";
- }
- }
-
- if (statePrinted)
- s << '\n';
+ // Fill in a 'row': Q -> exp(V) (for current state)
+ TVector< ybitset<MaxChar> > row(Size());
+ for (auto&& transition : m_transitions[state])
+ for (auto&& transitionState : transition.second) {
+ if (transitionState >= Size()) {
+ std::cerr << "WTF?! Transition from " << state << " on letter " << transition.first << " leads to non-existing state " << transitionState << "\n";
+ Y_ASSERT(false);
+ }
+ if (Letters().Contains(transition.first)) {
+ const TVector<Char>& letters = Letters().Klass(Letters().Representative(transition.first));
+ for (auto&& letter : letters)
+ row[transitionState].set(letter);
+ } else
+ row[transitionState].set(transition.first);
+ }
+
+ bool statePrinted = false;
+ // Display each destination state
+ for (auto rit = row.begin(), rie = row.end(); rit != rie; ++rit) {
+ unsigned begin = 0, end = 0;
+
+ ystring delimiter;
+ ystring label;
+ if (rit->test(Epsilon)) {
+ label += delimiter + CharDump(Epsilon);
+ delimiter = " ";
+ }
+ if (rit->test(BeginMark)) {
+ label += delimiter + CharDump(BeginMark);
+ delimiter = " ";
+ }
+ if (rit->test(EndMark)) {
+ label += delimiter + CharDump(EndMark);
+ delimiter = " ";
+ }
+ unsigned count = 0;
+ for (unsigned i = 0; i < 256; ++i)
+ if (rit->test(i))
+ ++count;
+ if (count != 0 && count != 256) {
+ label += delimiter + "[";
+ bool complementary = (count > 128);
+ if (count > 128)
+ label += "^";
+ while (begin < 256) {
+ for (begin = end; begin < 256 && (rit->test(begin) == complementary); ++begin)
+ ;
+ for (end = begin; end < 256 && (rit->test(end) == !complementary); ++end)
+ ;
+ if (begin + 1 == end) {
+ label += CharDump(begin);
+ delimiter = " ";
+ } else if (begin != end) {
+ label += CharDump(begin) + "-" + (CharDump(end-1));
+ delimiter = " ";
+ }
+ }
+ label += "]";
+ delimiter = " ";
+ } else if (count == 256) {
+ label += delimiter + ".";
+ delimiter = " ";
+ }
+ if (!label.empty()) {
+ if (!statePrinted) {
+ s << " " << state << "[shape=\"" << (IsFinal(state) ? "double" : "") << "circle\",label=\"" << state;
+ auto ti = tags.find(state);
+ if (ti != tags.end())
+ s << " (tags: " << ti->second << ")";
+ s << "\"]\n";
+ if (Initial() == state)
+ s << " \"initial\" -> " << state << '\n';
+ statePrinted = true;
+ }
+ s << " " << state << " -> " << std::distance(row.begin(), rit) << "[label=\"" << label;
+
+ // Display outputs
+ auto oit = outputs.find(state);
+ if (oit != outputs.end()) {
+ auto oit2 = oit->second.find(std::distance(row.begin(), rit));
+ if (oit2 == oit->second.end())
+ ;
+ else {
+ TVector<int> payload;
+ for (unsigned i = 0; i < sizeof(oit2->second) * 8; ++i)
+ if (oit2->second & (1ul << i))
+ payload.push_back(i);
+ if (!payload.empty())
+ s << " (outputs: " << Join(payload.begin(), payload.end(), ", ") << ")";
+ }
+ }
+
+ s << "\"]\n";
+ }
+ }
+
+ if (statePrinted)
+ s << '\n';
}
void Fsm::DumpTo(yostream& s, const ystring& name) const
{
- s << "digraph {\n \"initial\"[shape=\"plaintext\",label=\"" << name << "\"]\n\n";
- for (size_t state = 0; state < Size(); ++state) {
- DumpState(s, state);
- }
- s << "}\n\n";
+ s << "digraph {\n \"initial\"[shape=\"plaintext\",label=\"" << name << "\"]\n\n";
+ for (size_t state = 0; state < Size(); ++state) {
+ DumpState(s, state);
+ }
+ s << "}\n\n";
}
yostream& operator << (yostream& s, const Fsm& fsm) { fsm.DumpTo(s); return s; }
namespace {
- template<class Vector> void resizeVector(Vector& v, size_t s) { v.resize(s); }
+ template<class Vector> void resizeVector(Vector& v, size_t s) { v.resize(s); }
}
Fsm::Fsm():
- m_transitions(1),
- initial(0),
- letters(m_transitions),
- m_sparsed(false),
- determined(false),
- isAlternative(false)
+ m_transitions(1),
+ initial(0),
+ letters(m_transitions),
+ m_sparsed(false),
+ determined(false),
+ isAlternative(false)
{
- m_final.insert(0);
+ m_final.insert(0);
}
Fsm Fsm::MakeFalse()
{
- Fsm f;
- f.SetFinal(0, false);
- return f;
+ Fsm f;
+ f.SetFinal(0, false);
+ return f;
}
Char Fsm::Translate(Char c) const
{
- if (!m_sparsed || c == Epsilon)
- return c;
- else
- return Letters().Representative(c);
+ if (!m_sparsed || c == Epsilon)
+ return c;
+ else
+ return Letters().Representative(c);
}
bool Fsm::Connected(size_t from, size_t to, Char c) const
{
- auto it = m_transitions[from].find(Translate(c));
- return (it != m_transitions[from].end() && it->second.find(to) != it->second.end());
+ auto it = m_transitions[from].find(Translate(c));
+ return (it != m_transitions[from].end() && it->second.find(to) != it->second.end());
}
bool Fsm::Connected(size_t from, size_t to) const
{
- for (auto i = m_transitions[from].begin(), ie = m_transitions[from].end(); i != ie; ++i)
- if (i->second.find(to) != i->second.end())
- return true;
- return false;
+ for (auto i = m_transitions[from].begin(), ie = m_transitions[from].end(); i != ie; ++i)
+ if (i->second.find(to) != i->second.end())
+ return true;
+ return false;
}
const Fsm::StatesSet& Fsm::Destinations(size_t from, Char c) const
{
- auto i = m_transitions[from].find(Translate(c));
- return (i != m_transitions[from].end()) ? i->second : DefaultValue<StatesSet>();
+ auto i = m_transitions[from].find(Translate(c));
+ return (i != m_transitions[from].end()) ? i->second : DefaultValue<StatesSet>();
}
TSet<Char> Fsm::OutgoingLetters(size_t state) const
{
- TSet<Char> ret;
- for (auto&& i : m_transitions[state])
- ret.insert(i.first);
- return ret;
+ TSet<Char> ret;
+ for (auto&& i : m_transitions[state])
+ ret.insert(i.first);
+ return ret;
}
size_t Fsm::Resize(size_t newSize)
{
- size_t ret = Size();
- m_transitions.resize(newSize);
- return ret;
+ size_t ret = Size();
+ m_transitions.resize(newSize);
+ return ret;
}
void Fsm::Swap(Fsm& fsm)
{
- DoSwap(m_transitions, fsm.m_transitions);
- DoSwap(initial, fsm.initial);
- DoSwap(m_final, fsm.m_final);
- DoSwap(letters, fsm.letters);
- DoSwap(determined, fsm.determined);
- DoSwap(outputs, fsm.outputs);
- DoSwap(tags, fsm.tags);
- DoSwap(isAlternative, fsm.isAlternative);
+ DoSwap(m_transitions, fsm.m_transitions);
+ DoSwap(initial, fsm.initial);
+ DoSwap(m_final, fsm.m_final);
+ DoSwap(letters, fsm.letters);
+ DoSwap(determined, fsm.determined);
+ DoSwap(outputs, fsm.outputs);
+ DoSwap(tags, fsm.tags);
+ DoSwap(isAlternative, fsm.isAlternative);
}
void Fsm::SetFinal(size_t state, bool final)
{
- if (final)
- m_final.insert(state);
- else
- m_final.erase(state);
+ if (final)
+ m_final.insert(state);
+ else
+ m_final.erase(state);
}
Fsm& Fsm::AppendDot()
{
- Resize(Size() + 1);
- for (size_t letter = 0; letter != (1 << (sizeof(char)*8)); ++letter)
- ConnectFinal(Size() - 1, letter);
- ClearFinal();
- SetFinal(Size() - 1, true);
- determined = false;
+ Resize(Size() + 1);
+ for (size_t letter = 0; letter != (1 << (sizeof(char)*8)); ++letter)
+ ConnectFinal(Size() - 1, letter);
+ ClearFinal();
+ SetFinal(Size() - 1, true);
+ determined = false;
return *this;
}
Fsm& Fsm::Append(char c)
{
- Resize(Size() + 1);
- ConnectFinal(Size() - 1, static_cast<unsigned char>(c));
- ClearFinal();
- SetFinal(Size() - 1, true);
- determined = false;
+ Resize(Size() + 1);
+ ConnectFinal(Size() - 1, static_cast<unsigned char>(c));
+ ClearFinal();
+ SetFinal(Size() - 1, true);
+ determined = false;
return *this;
}
@@ -301,78 +301,78 @@ Fsm& Fsm::Append(const ystring& str)
Fsm& Fsm::AppendSpecial(Char c)
{
- Resize(Size() + 1);
- ConnectFinal(Size() - 1, c);
- ClearFinal();
- SetFinal(Size() - 1, true);
- determined = false;
+ Resize(Size() + 1);
+ ConnectFinal(Size() - 1, c);
+ ClearFinal();
+ SetFinal(Size() - 1, true);
+ determined = false;
return *this;
}
Fsm& Fsm::AppendStrings(const TVector<ystring>& strings)
{
- for (auto&& i : strings)
- if (i.empty())
- throw Error("None of strings passed to appendStrings() can be empty");
-
- Resize(Size() + 1);
- size_t end = Size() - 1;
-
- // A local transitions table: (oldstate, char) -> newstate.
- // Valid for all letters in given strings except final ones,
- // which are always connected to the end state.
-
- // NB: since each FSM contains at least one state,
- // state #0 cannot appear in LTRs. Thus we can use this
- // criteria to test whether a transition has been created or not.
- typedef ypair<size_t, char> Transition;
- TMap<char, size_t> startLtr;
- TMap<Transition, size_t> ltr;
-
- // A presense of a transition in this set indicates that
- // a that transition already points somewhere (either to end
- // or somewhere else). Another attempt to create such transition
- // will clear `determined flag.
- TSet<Transition> usedTransitions;
- TSet<char> usedFirsts;
-
- for (const auto& str : strings) {
- if (str.size() > 1) {
-
- // First letter: all previously final states are connected to the new state
- size_t& firstJump = startLtr[str[0]];
- if (!firstJump) {
- firstJump = Resize(Size() + 1);
- ConnectFinal(firstJump, static_cast<unsigned char>(str[0]));
- determined = determined && (usedFirsts.find(str[0]) != usedFirsts.end());
- }
-
- // All other letters except last one
- size_t state = firstJump;
- for (auto cit = str.begin() + 1, cie = str.end() - 1; cit != cie; ++cit) {
- size_t& newState = ltr[ymake_pair(state, *cit)];
- if (!newState) {
- newState = Resize(Size() + 1);
- Connect(state, newState, static_cast<unsigned char>(*cit));
- determined = determined && (usedTransitions.find(ymake_pair(state, *cit)) != usedTransitions.end());
- }
- state = newState;
- }
-
- // The last letter: connect the current state to end
- unsigned char last = static_cast<unsigned char>(*(str.end() - 1));
- Connect(state, end, last);
- determined = determined && (usedTransitions.find(ymake_pair(state, last)) != usedTransitions.end());
-
- } else {
- // The single letter: connect all the previously final states to end
- ConnectFinal(end, static_cast<unsigned char>(str[0]));
- determined = determined && (usedFirsts.find(str[0]) != usedFirsts.end());
- }
- }
-
- ClearFinal();
- SetFinal(end, true);
+ for (auto&& i : strings)
+ if (i.empty())
+ throw Error("None of strings passed to appendStrings() can be empty");
+
+ Resize(Size() + 1);
+ size_t end = Size() - 1;
+
+ // A local transitions table: (oldstate, char) -> newstate.
+ // Valid for all letters in given strings except final ones,
+ // which are always connected to the end state.
+
+ // NB: since each FSM contains at least one state,
+ // state #0 cannot appear in LTRs. Thus we can use this
+ // criteria to test whether a transition has been created or not.
+ typedef ypair<size_t, char> Transition;
+ TMap<char, size_t> startLtr;
+ TMap<Transition, size_t> ltr;
+
+ // A presense of a transition in this set indicates that
+ // a that transition already points somewhere (either to end
+ // or somewhere else). Another attempt to create such transition
+ // will clear `determined flag.
+ TSet<Transition> usedTransitions;
+ TSet<char> usedFirsts;
+
+ for (const auto& str : strings) {
+ if (str.size() > 1) {
+
+ // First letter: all previously final states are connected to the new state
+ size_t& firstJump = startLtr[str[0]];
+ if (!firstJump) {
+ firstJump = Resize(Size() + 1);
+ ConnectFinal(firstJump, static_cast<unsigned char>(str[0]));
+ determined = determined && (usedFirsts.find(str[0]) != usedFirsts.end());
+ }
+
+ // All other letters except last one
+ size_t state = firstJump;
+ for (auto cit = str.begin() + 1, cie = str.end() - 1; cit != cie; ++cit) {
+ size_t& newState = ltr[ymake_pair(state, *cit)];
+ if (!newState) {
+ newState = Resize(Size() + 1);
+ Connect(state, newState, static_cast<unsigned char>(*cit));
+ determined = determined && (usedTransitions.find(ymake_pair(state, *cit)) != usedTransitions.end());
+ }
+ state = newState;
+ }
+
+ // The last letter: connect the current state to end
+ unsigned char last = static_cast<unsigned char>(*(str.end() - 1));
+ Connect(state, end, last);
+ determined = determined && (usedTransitions.find(ymake_pair(state, last)) != usedTransitions.end());
+
+ } else {
+ // The single letter: connect all the previously final states to end
+ ConnectFinal(end, static_cast<unsigned char>(str[0]));
+ determined = determined && (usedFirsts.find(str[0]) != usedFirsts.end());
+ }
+ }
+
+ ClearFinal();
+ SetFinal(end, true);
return *this;
}
@@ -382,384 +382,384 @@ void Fsm::Import(const Fsm& rhs)
// PIRE_IFDEBUG(LOG_DEBUG("fsm") << "=== Left-hand side ===\n" << *this);
// PIRE_IFDEBUG(LOG_DEBUG("fsm") << "=== Right-hand side ===\n" << rhs);
- size_t oldsize = Resize(Size() + rhs.Size());
-
- for (auto&& outer : m_transitions) {
- for (auto&& letter : letters) {
- auto targets = outer.find(letter.first);
- if (targets == outer.end())
- continue;
- for (auto&& character : letter.second.second)
- if (character != letter.first)
- outer.insert(ymake_pair(character, targets->second));
- }
- }
-
- auto dest = m_transitions.begin() + oldsize;
- for (auto outer = rhs.m_transitions.begin(), outerEnd = rhs.m_transitions.end(); outer != outerEnd; ++outer, ++dest) {
- for (auto&& inner : *outer) {
- TSet<size_t> targets;
- std::transform(inner.second.begin(), inner.second.end(), std::inserter(targets, targets.begin()),
- std::bind2nd(std::plus<size_t>(), oldsize));
- dest->insert(ymake_pair(inner.first, targets));
- }
-
- for (auto&& letter : rhs.letters) {
- auto targets = dest->find(letter.first);
- if (targets == dest->end())
- continue;
- for (auto&& character : letter.second.second)
- if (character != letter.first)
- dest->insert(ymake_pair(character, targets->second));
- }
- }
-
- // Import outputs
- for (auto&& output : rhs.outputs) {
- auto& dest = outputs[output.first + oldsize];
- for (auto&& element : output.second)
- dest.insert(ymake_pair(element.first + oldsize, element.second));
- }
-
- // Import tags
- for (auto&& tag : rhs.tags)
- tags.insert(ymake_pair(tag.first + oldsize, tag.second));
-
- letters = LettersTbl(LettersEquality(m_transitions));
+ size_t oldsize = Resize(Size() + rhs.Size());
+
+ for (auto&& outer : m_transitions) {
+ for (auto&& letter : letters) {
+ auto targets = outer.find(letter.first);
+ if (targets == outer.end())
+ continue;
+ for (auto&& character : letter.second.second)
+ if (character != letter.first)
+ outer.insert(ymake_pair(character, targets->second));
+ }
+ }
+
+ auto dest = m_transitions.begin() + oldsize;
+ for (auto outer = rhs.m_transitions.begin(), outerEnd = rhs.m_transitions.end(); outer != outerEnd; ++outer, ++dest) {
+ for (auto&& inner : *outer) {
+ TSet<size_t> targets;
+ std::transform(inner.second.begin(), inner.second.end(), std::inserter(targets, targets.begin()),
+ std::bind2nd(std::plus<size_t>(), oldsize));
+ dest->insert(ymake_pair(inner.first, targets));
+ }
+
+ for (auto&& letter : rhs.letters) {
+ auto targets = dest->find(letter.first);
+ if (targets == dest->end())
+ continue;
+ for (auto&& character : letter.second.second)
+ if (character != letter.first)
+ dest->insert(ymake_pair(character, targets->second));
+ }
+ }
+
+ // Import outputs
+ for (auto&& output : rhs.outputs) {
+ auto& dest = outputs[output.first + oldsize];
+ for (auto&& element : output.second)
+ dest.insert(ymake_pair(element.first + oldsize, element.second));
+ }
+
+ // Import tags
+ for (auto&& tag : rhs.tags)
+ tags.insert(ymake_pair(tag.first + oldsize, tag.second));
+
+ letters = LettersTbl(LettersEquality(m_transitions));
}
// Adds a transition from state `from` to state `to` on character `c`
// (Epsilon when the argument is omitted, as noted in the signature).
// `to` is inserted into the destination set stored at m_transitions[from][c];
// presumably operator[] creates that set on first use -- confirm against the
// TransitionRow typedef. ClearHints() is called on every invocation.
void Fsm::Connect(size_t from, size_t to, Char c /* = Epsilon */)
{
- m_transitions[from][c].insert(to);
- ClearHints();
+ m_transitions[from][c].insert(to);
+ ClearHints();
}
// Connects every state currently listed in m_final to state `to` on
// character `c` (Epsilon when omitted) by calling Connect() once per final
// state. The set of final states itself is not modified here.
// ClearHints() is called once more at the end, so hints are reset even
// when m_final is empty.
void Fsm::ConnectFinal(size_t to, Char c /* = Epsilon */)
{
- for (auto&& final : m_final)
- Connect(final, to, c);
- ClearHints();
+ for (auto&& final : m_final)
+ Connect(final, to, c);
+ ClearHints();
}
// Removes the transition from `from` to `to` on character `c`, if any.
// The row of `from` is searched for `c`; when an entry exists, `to` is erased
// from its destination set (the entry for `c` is kept even if the set becomes
// empty). When there is no entry for `c`, nothing is removed.
// ClearHints() is called in both cases.
void Fsm::Disconnect(size_t from, size_t to, Char c)
{
- auto i = m_transitions[from].find(c);
- if (i != m_transitions[from].end())
- i->second.erase(to);
- ClearHints();
+ auto i = m_transitions[from].find(c);
+ if (i != m_transitions[from].end())
+ i->second.erase(to);
+ ClearHints();
}
// Removes every transition from `from` to `to`, regardless of the character:
// `to` is erased from the destination set of each character present in the
// row of `from`. Entries whose set becomes empty are left in the row.
// ClearHints() is called afterwards.
void Fsm::Disconnect(size_t from, size_t to)
{
- for (auto&& i : m_transitions[from])
- i.second.erase(to);
- ClearHints();
+ for (auto&& i : m_transitions[from])
+ i.second.erase(to);
+ ClearHints();
}
// Returns the output value recorded for the `from` -> `to` pair, or 0 when
// `outputs` has no entry for `from` or that entry has no value for `to`.
// Only find() is used, so the lookup never inserts into `outputs`.
unsigned long Fsm::Output(size_t from, size_t to) const
{
- auto i = outputs.find(from);
- if (i == outputs.end())
- return 0;
- auto j = i->second.find(to);
- if (j == i->second.end())
- return 0;
- else
- return j->second;
+ auto i = outputs.find(from);
+ if (i == outputs.end())
+ return 0;
+ auto j = i->second.find(to);
+ if (j == i->second.end())
+ return 0;
+ else
+ return j->second;
}
// Appends `rhs` to this FSM (concatenation) and returns *this.
//
// Steps, in order:
//  1. Remember the current size and Import(rhs); afterwards a state `s` of
//     rhs is addressed as `s + lhsSize` in this FSM.
//  2. Take the row of the imported rhs initial state and, for every
//     (character, target) in it, connect each current final state to that
//     target on the same character. The finals thus act as the rhs start.
//  3. If rhs has outputs attached to its initial state, copy them (with the
//     target shifted by lhsSize) onto every current final state.
//  4. Replace the final set with the shifted final states of rhs.
//  5. Mark the FSM as not determined and clear hints.
//
// NOTE(review): `row` is a reference into m_transitions that stays alive
// while ConnectFinal() inserts into other rows; whether a final state can
// coincide with the imported initial state is not visible here -- verify.
Fsm& Fsm::operator += (const Fsm& rhs)
{
- size_t lhsSize = Size();
- Import(rhs);
+ size_t lhsSize = Size();
+ Import(rhs);
- const TransitionRow& row = m_transitions[lhsSize + rhs.initial];
+ const TransitionRow& row = m_transitions[lhsSize + rhs.initial];
- for (auto&& outer : row)
- for (auto&& inner : outer.second)
- ConnectFinal(inner, outer.first);
+ for (auto&& outer : row)
+ for (auto&& inner : outer.second)
+ ConnectFinal(inner, outer.first);
- auto out = rhs.outputs.find(rhs.initial);
- if (out != rhs.outputs.end())
- for (auto&& toAndOutput : out->second) {
- for (auto&& final : m_final)
- outputs[final].insert(ymake_pair(toAndOutput.first + lhsSize, toAndOutput.second));
- }
+ auto out = rhs.outputs.find(rhs.initial);
+ if (out != rhs.outputs.end())
+ for (auto&& toAndOutput : out->second) {
+ for (auto&& final : m_final)
+ outputs[final].insert(ymake_pair(toAndOutput.first + lhsSize, toAndOutput.second));
+ }
- ClearFinal();
- for (auto&& letter : rhs.m_final)
- SetFinal(letter + lhsSize, true);
- determined = false;
+ ClearFinal();
+ for (auto&& letter : rhs.m_final)
+ SetFinal(letter + lhsSize, true);
+ determined = false;
- ClearHints();
- PIRE_IFDEBUG(Cdbg << "=== After addition ===" << Endl << *this << Endl);
+ ClearHints();
+ PIRE_IFDEBUG(Cdbg << "=== After addition ===" << Endl << *this << Endl);
- return *this;
+ return *this;
}
// Merges `rhs` into this FSM as an alternative (union) and returns *this.
//
// rhs is imported first, so its state `s` becomes `s + lhsSize`; its final
// states (shifted) are added to m_final while the existing finals are kept.
// How the two initial states are joined depends on the isAlternative flags:
//  - neither is an alternative: a fresh state is appended, epsilon-connected
//    to both initial states, and becomes the new initial state;
//  - only this FSM is: its initial state gets an epsilon transition to the
//    imported rhs initial state;
//  - only rhs is: the imported rhs initial state gets an epsilon transition
//    to the current initial state and becomes the initial state;
//  - both are: every epsilon destination of rhs.initial is re-attached to the
//    current initial state and detached from the imported rhs initial state.
// Afterwards the FSM is marked as not determined and as an alternative.
Fsm& Fsm::operator |= (const Fsm& rhs)
{
- size_t lhsSize = Size();
-
- Import(rhs);
- for (auto&& final : rhs.m_final)
- m_final.insert(final + lhsSize);
-
- if (!isAlternative && !rhs.isAlternative) {
- Resize(Size() + 1);
- Connect(Size() - 1, initial);
- Connect(Size() - 1, lhsSize + rhs.initial);
- initial = Size() - 1;
- } else if (isAlternative && !rhs.isAlternative) {
- Connect(initial, lhsSize + rhs.initial, Epsilon);
- } else if (!isAlternative && rhs.isAlternative) {
- Connect(lhsSize + rhs.initial, initial, Epsilon);
- initial = rhs.initial + lhsSize;
- } else if (isAlternative && rhs.isAlternative) {
- const StatesSet& tos = rhs.Destinations(rhs.initial, Epsilon);
- for (auto&& to : tos) {
- Connect(initial, to + lhsSize, Epsilon);
- Disconnect(rhs.initial + lhsSize, to + lhsSize, Epsilon);
- }
- }
-
- determined = false;
- isAlternative = true;
- return *this;
+ size_t lhsSize = Size();
+
+ Import(rhs);
+ for (auto&& final : rhs.m_final)
+ m_final.insert(final + lhsSize);
+
+ if (!isAlternative && !rhs.isAlternative) {
+ Resize(Size() + 1);
+ Connect(Size() - 1, initial);
+ Connect(Size() - 1, lhsSize + rhs.initial);
+ initial = Size() - 1;
+ } else if (isAlternative && !rhs.isAlternative) {
+ Connect(initial, lhsSize + rhs.initial, Epsilon);
+ } else if (!isAlternative && rhs.isAlternative) {
+ Connect(lhsSize + rhs.initial, initial, Epsilon);
+ initial = rhs.initial + lhsSize;
+ } else if (isAlternative && rhs.isAlternative) {
+ const StatesSet& tos = rhs.Destinations(rhs.initial, Epsilon);
+ for (auto&& to : tos) {
+ Connect(initial, to + lhsSize, Epsilon);
+ Disconnect(rhs.initial + lhsSize, to + lhsSize, Epsilon);
+ }
+ }
+
+ determined = false;
+ isAlternative = true;
+ return *this;
}
// Intersects this FSM with `rhs` and returns *this.
// Implemented through complements: both operands are complemented (rhs via
// a local copy, so the argument is untouched), united with operator |=, and
// the result is complemented again.
// Each Complement() call determinizes its operand and throws Error when that
// fails, so this operator can throw as well.
Fsm& Fsm::operator &= (const Fsm& rhs)
{
- Fsm rhs2(rhs);
- Complement();
- rhs2.Complement();
- *this |= rhs2;
- Complement();
- return *this;
+ Fsm rhs2(rhs);
+ Complement();
+ rhs2.Complement();
+ *this |= rhs2;
+ Complement();
+ return *this;
}
// Wraps this FSM into a repetition and returns *this.
// Two states are appended: Size()-2 becomes the new initial state and
// Size()-1 the only final state. Epsilon transitions are added
//   new initial -> new final      (allows skipping the original FSM),
//   new initial -> old initial    (enters the original FSM),
//   old finals  -> old initial    (allows another pass),
//   old finals  -> new final      (leaves the loop).
// The FSM is marked as not determined afterwards.
// NOTE(review): this matches the usual Kleene-star construction -- confirm
// that "zero or more repetitions" is the intended contract.
Fsm& Fsm::Iterate()
{
- PIRE_IFDEBUG(Cdbg << "Iterating:" << Endl << *this << Endl);
- Resize(Size() + 2);
+ PIRE_IFDEBUG(Cdbg << "Iterating:" << Endl << *this << Endl);
+ Resize(Size() + 2);
- Connect(Size() - 2, Size() - 1);
- Connect(Size() - 2, initial);
- ConnectFinal(initial);
- ConnectFinal(Size() - 1);
+ Connect(Size() - 2, Size() - 1);
+ Connect(Size() - 2, initial);
+ ConnectFinal(initial);
+ ConnectFinal(Size() - 1);
- ClearFinal();
- SetFinal(Size() - 1, true);
- initial = Size() - 2;
+ ClearFinal();
+ SetFinal(Size() - 1, true);
+ initial = Size() - 2;
- determined = false;
+ determined = false;
- PIRE_IFDEBUG(Cdbg << "Iterated:" << Endl << *this << Endl);
- return *this;
+ PIRE_IFDEBUG(Cdbg << "Iterated:" << Endl << *this << Endl);
+ return *this;
}
// Replaces this FSM with its complement and returns *this.
// The FSM is determinized first (throws Error("Regexp pattern too
// complicated") if Determine() reports failure) and minimized. One state is
// then appended, every state that is not final gets an epsilon transition
// to it (Connect() is called with its default character), and it becomes
// the only final state. The FSM is marked as not determined afterwards.
// The loop runs after Resize(), so it also visits the appended state itself.
Fsm& Fsm::Complement()
{
- if (!Determine())
- throw Error("Regexp pattern too complicated");
- Minimize();
- Resize(Size() + 1);
- for (size_t i = 0; i < Size(); ++i)
- if (!IsFinal(i))
- Connect(i, Size() - 1);
- ClearFinal();
- SetFinal(Size() - 1, true);
- determined = false;
-
- return *this;
+ if (!Determine())
+ throw Error("Regexp pattern too complicated");
+ Minimize();
+ Resize(Size() + 1);
+ for (size_t i = 0; i < Size(); ++i)
+ if (!IsFinal(i))
+ Connect(i, Size() - 1);
+ ClearFinal();
+ SetFinal(Size() - 1, true);
+ determined = false;
+
+ return *this;
}
// Returns a new FSM built by appending *this to a default-constructed Fsm
// `count` times with operator +=. *this is not modified.
// For count == 0 the default-constructed Fsm is returned unchanged.
Fsm Fsm::operator *(size_t count) const
{
- Fsm ret;
- while (count--)
- ret += *this;
- return ret;
+ Fsm ret;
+ while (count--)
+ ret += *this;
+ return ret;
}
// Makes the FSM accept prefixes: after RemoveDeadEnds(), every state whose
// transition row is non-empty is added to the final set. Existing final
// states are kept. ClearHints() is called afterwards.
// Dead states have their rows cleared by RemoveDeadEnds(), so they are not
// picked up by the non-empty-row test.
void Fsm::MakePrefix()
{
- RemoveDeadEnds();
- for (size_t i = 0; i < Size(); ++i)
- if (!m_transitions[i].empty())
- m_final.insert(i);
- ClearHints();
+ RemoveDeadEnds();
+ for (size_t i = 0; i < Size(); ++i)
+ if (!m_transitions[i].empty())
+ m_final.insert(i);
+ ClearHints();
}
// Makes the FSM accept suffixes: the initial state gets an epsilon
// transition (Connect()'s default character) to every other state, so
// matching may start from any state. ClearHints() is called afterwards.
void Fsm::MakeSuffix()
{
- for (size_t i = 0; i < Size(); ++i)
- if (i != initial)
- Connect(initial, i);
- ClearHints();
+ for (size_t i = 0; i < Size(); ++i)
+ if (i != initial)
+ Connect(initial, i);
+ ClearHints();
}
// Replaces this FSM with its reversal and returns *this.
// A temporary FSM `out` with Size() + 1 states is built:
//  - it reuses the current letter table;
//  - every transition from -> to on a character becomes to -> from on the
//    same character;
//  - the old initial state becomes the only final state;
//  - the extra state (index Size() of this FSM) is epsilon-connected to every
//    old final state and becomes the initial state;
//  - every output (from, to, value) is stored as (to, from, value);
//  - tags are copied unchanged.
// Finally `out` is swapped into *this.
Fsm& Fsm::Reverse()
{
- Fsm out;
- out.Resize(Size() + 1);
- out.letters = Letters();
-
- // Invert transitions
- for (size_t from = 0; from < Size(); ++from)
- for (auto&& i : m_transitions[from])
- for (auto&& j : i.second)
- out.Connect(j, from, i.first);
-
- // Invert initial and final states
- out.m_final.clear();
- out.SetFinal(initial, true);
- for (auto i : m_final)
- out.Connect(Size(), i, Epsilon);
- out.SetInitial(Size());
-
- // Invert outputs
- for (auto&& i : outputs)
- for (auto&& j : i.second)
- out.SetOutput(j.first, i.first, j.second);
-
- // Preserve tags (although thier semantics are usually heavily broken at this point)
- out.tags = tags;
-
- // Apply
- Swap(out);
- return *this;
+ Fsm out;
+ out.Resize(Size() + 1);
+ out.letters = Letters();
+
+ // Invert transitions
+ for (size_t from = 0; from < Size(); ++from)
+ for (auto&& i : m_transitions[from])
+ for (auto&& j : i.second)
+ out.Connect(j, from, i.first);
+
+ // Invert initial and final states
+ out.m_final.clear();
+ out.SetFinal(initial, true);
+ for (auto i : m_final)
+ out.Connect(Size(), i, Epsilon);
+ out.SetInitial(Size());
+
+ // Invert outputs
+ for (auto&& i : outputs)
+ for (auto&& j : i.second)
+ out.SetOutput(j.first, i.first, j.second);
+
+ // Preserve tags (although thier semantics are usually heavily broken at this point)
+ out.tags = tags;
+
+ // Apply
+ Swap(out);
+ return *this;
}
// Returns the set of states that are useless for matching. A state is
// reported when it fails at least one of two reachability checks, run as two
// passes of the same loop:
//  - invert == 0: a helper graph with the same edges as this FSM is searched
//    breadth-first from Initial(); states never reached are reported.
//  - invert == 1: the helper graph has all edges reversed and the search
//    starts from every final state; states never reached (i.e. states from
//    which no final state can be reached) are reported.
// The helper graph is an Fsm whose transitions all use character 0, because
// only connectivity matters here. *this is not modified.
TSet<size_t> Fsm::DeadStates() const
{
- TSet<size_t> res;
-
- for (int invert = 0; invert <= 1; ++invert) {
- Fsm digraph;
- digraph.Resize(Size());
- for (TransitionTable::const_iterator j = m_transitions.begin(), je = m_transitions.end(); j != je; ++j) {
- for (TransitionRow::const_iterator k = j->begin(), ke = j->end(); k != ke; ++k) {
- for (StatesSet::const_iterator toSt = k->second.begin(), toSte = k->second.end(); toSt != toSte; ++toSt) {
- // We only care if the states are connected or not regerdless through what letter
- if (invert) {
- // Build an FSM with inverted transitions
- digraph.Connect(*toSt, j - m_transitions.begin(), 0);
- } else {
- digraph.Connect(j - m_transitions.begin(), *toSt, 0);
- }
- }
- }
- }
-
- TVector<bool> unchecked(Size(), true);
- TVector<bool> useless(Size(), true);
- TDeque<size_t> queue;
-
- // Put all final (or initial) states into queue, marking them useful
- for (size_t i = 0; i < Size(); ++i)
- if ((invert && IsFinal(i)) || (!invert && Initial() == i)) {
- useless[i] = false;
- queue.push_back(i);
- }
-
- // Do the breadth-first search, marking all states
- // from which already marked states are reachable
- while (!queue.empty()) {
- size_t to = queue.front();
- queue.pop_front();
-
- // All the states that are connected to this state in the transition matrix are useful
- const StatesSet& connections = (digraph.m_transitions[to])[0];
- for (auto&& fr : connections) {
- // Enqueue the state for further traversal if it hasnt been already checked
- if (unchecked[fr] && useless[fr]) {
- useless[fr] = false;
- queue.push_back(fr);
- }
- }
-
- // Now we consider this state checked
- unchecked[to] = false;
- }
-
- for (size_t i = 0; i < Size(); ++i) {
- if (useless[i]) {
- res.insert(i);
- }
- }
- }
-
- return res;
+ TSet<size_t> res;
+
+ for (int invert = 0; invert <= 1; ++invert) {
+ Fsm digraph;
+ digraph.Resize(Size());
+ for (TransitionTable::const_iterator j = m_transitions.begin(), je = m_transitions.end(); j != je; ++j) {
+ for (TransitionRow::const_iterator k = j->begin(), ke = j->end(); k != ke; ++k) {
+ for (StatesSet::const_iterator toSt = k->second.begin(), toSte = k->second.end(); toSt != toSte; ++toSt) {
+ // We only care if the states are connected or not regerdless through what letter
+ if (invert) {
+ // Build an FSM with inverted transitions
+ digraph.Connect(*toSt, j - m_transitions.begin(), 0);
+ } else {
+ digraph.Connect(j - m_transitions.begin(), *toSt, 0);
+ }
+ }
+ }
+ }
+
+ TVector<bool> unchecked(Size(), true);
+ TVector<bool> useless(Size(), true);
+ TDeque<size_t> queue;
+
+ // Put all final (or initial) states into queue, marking them useful
+ for (size_t i = 0; i < Size(); ++i)
+ if ((invert && IsFinal(i)) || (!invert && Initial() == i)) {
+ useless[i] = false;
+ queue.push_back(i);
+ }
+
+ // Do the breadth-first search, marking all states
+ // from which already marked states are reachable
+ while (!queue.empty()) {
+ size_t to = queue.front();
+ queue.pop_front();
+
+ // All the states that are connected to this state in the transition matrix are useful
+ const StatesSet& connections = (digraph.m_transitions[to])[0];
+ for (auto&& fr : connections) {
+ // Enqueue the state for further traversal if it hasnt been already checked
+ if (unchecked[fr] && useless[fr]) {
+ useless[fr] = false;
+ queue.push_back(fr);
+ }
+ }
+
+ // Now we consider this state checked
+ unchecked[to] = false;
+ }
+
+ for (size_t i = 0; i < Size(); ++i) {
+ if (useless[i]) {
+ res.insert(i);
+ }
+ }
+ }
+
+ return res;
}
// Detaches every state reported by DeadStates() from the FSM: the state's
// own transition row is cleared and the state is erased from every
// destination set in every row. The states are not removed from the table
// (no resize or renumbering happens here), so Size() and the indices of the
// remaining states stay the same. ClearHints() is called afterwards.
void Fsm::RemoveDeadEnds()
{
- PIRE_IFDEBUG(Cdbg << "Removing dead ends on:" << Endl << *this << Endl);
-
- TSet<size_t> dead = DeadStates();
- // Erase all useless states
- for (auto&& i : dead) {
- PIRE_IFDEBUG(Cdbg << "Removing useless state " << i << Endl);
- m_transitions[i].clear();
- for (auto&& j : m_transitions)
- for (auto&& k : j)
- k.second.erase(i);
- }
- ClearHints();
-
- PIRE_IFDEBUG(Cdbg << "Result:" << Endl << *this << Endl);
+ PIRE_IFDEBUG(Cdbg << "Removing dead ends on:" << Endl << *this << Endl);
+
+ TSet<size_t> dead = DeadStates();
+ // Erase all useless states
+ for (auto&& i : dead) {
+ PIRE_IFDEBUG(Cdbg << "Removing useless state " << i << Endl);
+ m_transitions[i].clear();
+ for (auto&& j : m_transitions)
+ for (auto&& k : j)
+ k.second.erase(i);
+ }
+ ClearHints();
+
+ PIRE_IFDEBUG(Cdbg << "Result:" << Endl << *this << Endl);
}
// This method is one step of Epsilon-connection removal algorithm.
// It merges transitions, tags, and outputs of 'to' state into 'from' state
//
// In detail, for an existing epsilon connection 'from' -> 'to':
//  - every (character, targets) entry of 'to' is copied into the row of
//    'from' (targets are added to whatever 'from' already has);
//  - if an output is recorded for 'from' -> 'to', it is OR-ed into the
//    output of 'from' -> target for each copied target;
//  - 'from' becomes final when 'to' is final;
//  - the tag of 'to', if any, is OR-ed into the tag of 'from';
//  - every output of 'to' is OR-ed, together with the 'from' -> 'to' output
//    (0 when absent), into the corresponding output of 'from'.
// The state 'to' itself and the epsilon transition are left untouched.
void Fsm::MergeEpsilonConnection(size_t from, size_t to)
{
- unsigned long frEpsOutput = 0;
- bool fsEpsOutputExists = false;
-
- // Is there an output for 'from'->'to' transition?
- if (outputs.find(from) != outputs.end() && outputs[from].find(to) != outputs[from].end()) {
- frEpsOutput = outputs[from][to];
- fsEpsOutputExists = true;
- }
-
- // Merge transitions from 'to' state into transitions from 'from' state
- for (auto&& transition : m_transitions[to]) {
- TSet<size_t> connStates;
- std::copy(transition.second.begin(), transition.second.end(),
- std::inserter(m_transitions[from][transition.first], m_transitions[from][transition.first].end()));
-
- // If there is an output of the 'from'->'to' connection it has to be set to all
- // new connections that were merged from 'to' state
- if (fsEpsOutputExists) {
- // Compute the set of states that are reachable from 'to' state
- std::copy(transition.second.begin(), transition.second.end(), std::inserter(connStates, connStates.end()));
-
- // For each of these states add an output equal to the Epsilon-connection output
- for (auto&& newConnSt : connStates) {
- outputs[from][newConnSt] |= frEpsOutput;
- }
- }
- }
-
- // Mark 'from' state final if 'to' state is final
- if (IsFinal(to))
- SetFinal(from, true);
-
- // Combine tags
- auto ti = tags.find(to);
- if (ti != tags.end())
- tags[from] |= ti->second;
-
- // Merge all 'to' into 'from' outputs:
- // outputs[from][i] |= (outputs[from][to] | outputs[to][i])
- auto toOit = outputs.find(to);
- if (toOit != outputs.end()) {
- for (auto&& output : toOit->second) {
- outputs[from][output.first] |= (frEpsOutput | output.second);
- }
- }
+ unsigned long frEpsOutput = 0;
+ bool fsEpsOutputExists = false;
+
+ // Is there an output for 'from'->'to' transition?
+ if (outputs.find(from) != outputs.end() && outputs[from].find(to) != outputs[from].end()) {
+ frEpsOutput = outputs[from][to];
+ fsEpsOutputExists = true;
+ }
+
+ // Merge transitions from 'to' state into transitions from 'from' state
+ for (auto&& transition : m_transitions[to]) {
+ TSet<size_t> connStates;
+ std::copy(transition.second.begin(), transition.second.end(),
+ std::inserter(m_transitions[from][transition.first], m_transitions[from][transition.first].end()));
+
+ // If there is an output of the 'from'->'to' connection it has to be set to all
+ // new connections that were merged from 'to' state
+ if (fsEpsOutputExists) {
+ // Compute the set of states that are reachable from 'to' state
+ std::copy(transition.second.begin(), transition.second.end(), std::inserter(connStates, connStates.end()));
+
+ // For each of these states add an output equal to the Epsilon-connection output
+ for (auto&& newConnSt : connStates) {
+ outputs[from][newConnSt] |= frEpsOutput;
+ }
+ }
+ }
+
+ // Mark 'from' state final if 'to' state is final
+ if (IsFinal(to))
+ SetFinal(from, true);
+
+ // Combine tags
+ auto ti = tags.find(to);
+ if (ti != tags.end())
+ tags[from] |= ti->second;
+
+ // Merge all 'to' into 'from' outputs:
+ // outputs[from][i] |= (outputs[from][to] | outputs[to][i])
+ auto toOit = outputs.find(to);
+ if (toOit != outputs.end()) {
+ for (auto&& output : toOit->second) {
+ outputs[from][output.first] |= (frEpsOutput | output.second);
+ }
+ }
}
// Assuming the epsilon transitions is possible from 'from' to 'thru',
@@ -768,467 +768,467 @@ void Fsm::MergeEpsilonConnection(size_t from, size_t to)
// Updates inverse map of epsilon transitions as well.
// For every state reachable from `thru` by a single epsilon transition, adds
// a direct epsilon transition from `from` to that state and records `from`
// in inveps[state], the caller's inverse map of epsilon transitions.
// Outputs are propagated only when `outputs` already has an entry for
// `from` (looked up once, before the loop): the new connection then gets
// Output(from, thru) | Output(thru, state) OR-ed into it.
void Fsm::ShortCutEpsilon(size_t from, size_t thru, TVector< TSet<size_t> >& inveps)
{
- PIRE_IFDEBUG(Cdbg << "In Fsm::ShortCutEpsilon(" << from << ", " << thru << ")\n");
- const StatesSet& to = Destinations(thru, Epsilon);
- Outputs::iterator outIt = outputs.find(from);
- unsigned long fromThruOut = Output(from, thru);
- for (auto&& toElement : to) {
- PIRE_IFDEBUG(Cdbg << "Epsilon connecting " << from << " --> " << thru << " --> " << toElement << "\n");
- Connect(from, toElement, Epsilon);
- inveps[toElement].insert(from);
- if (outIt != outputs.end())
- outIt->second[toElement] |= (fromThruOut | Output(thru, toElement));
- }
+ PIRE_IFDEBUG(Cdbg << "In Fsm::ShortCutEpsilon(" << from << ", " << thru << ")\n");
+ const StatesSet& to = Destinations(thru, Epsilon);
+ Outputs::iterator outIt = outputs.find(from);
+ unsigned long fromThruOut = Output(from, thru);
+ for (auto&& toElement : to) {
+ PIRE_IFDEBUG(Cdbg << "Epsilon connecting " << from << " --> " << thru << " --> " << toElement << "\n");
+ Connect(from, toElement, Epsilon);
+ inveps[toElement].insert(from);
+ if (outIt != outputs.end())
+ outIt->second[toElement] |= (fromThruOut | Output(thru, toElement));
+ }
}
// Removes all Epsilon-connections by iterating though states and merging each Epsilon-connection
// effects from 'to' state into 'from' state
//
// Phases, in order:
//  1. Unsparse() the transition table.
//  2. Build `inveps`, mapping each state to the states that reach it by one
//     epsilon transition.
//  3. Close the epsilon relation transitively: for each `thru` and each of
//     its epsilon predecessors `from`, ShortCutEpsilon(from, thru, ...) adds
//     direct epsilon links from `from` to the epsilon successors of `thru`.
//  4. For every epsilon pair from -> to (from != to), merge the transitions,
//     finality, tags and outputs of `to` into `from`.
//  5. Erase the Epsilon entry from every row, then Sparse() and ClearHints().
// NOTE(review): phase 3 iterates inveps[thru] while ShortCutEpsilon() may
// insert into inveps sets; the existing comment acknowledges this -- verify
// that inveps[thru] itself is only ever extended in an iteration-safe way.
void Fsm::RemoveEpsilons()
{
- Unsparse();
-
- // Build inverse map of epsilon transitions
- TVector< TSet<size_t> > inveps(Size()); // We have to use TSet<> here since we want it sorted
- for (size_t from = 0; from != Size(); ++from) {
- const StatesSet& tos = Destinations(from, Epsilon);
- for (auto&& to : tos)
- inveps[to].insert(from);
- }
-
- // Make a transitive closure of all epsilon transitions (Floyd-Warshall algorithm)
- // (if there exists an epsilon-path between two states, epsilon-connect them directly)
- for (size_t thru = 0; thru != Size(); ++thru)
- for (auto&& from : inveps[thru])
- // inveps[thru] may alter during loop body, hence we cannot cache ivneps[thru].end()
- if (from != thru)
- ShortCutEpsilon(from, thru, inveps);
-
- PIRE_IFDEBUG(Cdbg << "=== After epsilons shortcut\n" << *this << Endl);
-
- // Iterate through all epsilon-connected state pairs, merging states together
- for (size_t from = 0; from != Size(); ++from) {
- const StatesSet& to = Destinations(from, Epsilon);
- for (auto&& toElement : to)
- if (toElement != from)
- MergeEpsilonConnection(from, toElement); // it's a NOP if to == from, so don't waste time
- }
-
- PIRE_IFDEBUG(Cdbg << "=== After epsilons merged\n" << *this << Endl);
-
- // Drop all epsilon transitions
- for (auto&& i : m_transitions)
- i.erase(Epsilon);
-
- Sparse();
- ClearHints();
+ Unsparse();
+
+ // Build inverse map of epsilon transitions
+ TVector< TSet<size_t> > inveps(Size()); // We have to use TSet<> here since we want it sorted
+ for (size_t from = 0; from != Size(); ++from) {
+ const StatesSet& tos = Destinations(from, Epsilon);
+ for (auto&& to : tos)
+ inveps[to].insert(from);
+ }
+
+ // Make a transitive closure of all epsilon transitions (Floyd-Warshall algorithm)
+ // (if there exists an epsilon-path between two states, epsilon-connect them directly)
+ for (size_t thru = 0; thru != Size(); ++thru)
+ for (auto&& from : inveps[thru])
+ // inveps[thru] may alter during loop body, hence we cannot cache ivneps[thru].end()
+ if (from != thru)
+ ShortCutEpsilon(from, thru, inveps);
+
+ PIRE_IFDEBUG(Cdbg << "=== After epsilons shortcut\n" << *this << Endl);
+
+ // Iterate through all epsilon-connected state pairs, merging states together
+ for (size_t from = 0; from != Size(); ++from) {
+ const StatesSet& to = Destinations(from, Epsilon);
+ for (auto&& toElement : to)
+ if (toElement != from)
+ MergeEpsilonConnection(from, toElement); // it's a NOP if to == from, so don't waste time
+ }
+
+ PIRE_IFDEBUG(Cdbg << "=== After epsilons merged\n" << *this << Endl);
+
+ // Drop all epsilon transitions
+ for (auto&& i : m_transitions)
+ i.erase(Epsilon);
+
+ Sparse();
+ ClearHints();
}
// Tells whether characters `a` and `b` are indistinguishable in the
// transition table referenced by m_tbl. Returns true only if, in every row,
// either neither character has an entry or both have entries with equal
// destination sets; returns false at the first row where they differ.
bool Fsm::LettersEquality::operator()(Char a, Char b) const
{
- for (auto&& outer : *m_tbl) {
- auto ia = outer.find(a);
- auto ib = outer.find(b);
- if (ia == outer.end() && ib == outer.end())
- continue;
- else if (ia == outer.end() || ib == outer.end() || ia->second != ib->second) {
- return false;
- }
- }
- return true;
+ for (auto&& outer : *m_tbl) {
+ auto ia = outer.find(a);
+ auto ib = outer.find(b);
+ if (ia == outer.end() && ib == outer.end())
+ continue;
+ else if (ia == outer.end() || ib == outer.end() || ia->second != ib->second) {
+ return false;
+ }
+ }
+ return true;
}
// Rebuilds the `letters` table from the current transitions: a fresh
// LettersTbl is created with a LettersEquality comparator over
// m_transitions, and every character in [0, MaxChar) is appended to it.
// Epsilon is skipped unless `needEpsilons` is true (default false, as noted
// in the signature). Sets m_sparsed to true.
// NOTE(review): presumably LettersTbl::Append() groups characters that the
// comparator reports as equal into one class -- confirm in its definition.
void Fsm::Sparse(bool needEpsilons /* = false */)
{
- letters = LettersTbl(LettersEquality(m_transitions));
- for (unsigned letter = 0; letter < MaxChar; ++letter)
- if (letter != Epsilon || needEpsilons)
- letters.Append(letter);
+ letters = LettersTbl(LettersEquality(m_transitions));
+ for (unsigned letter = 0; letter < MaxChar; ++letter)
+ if (letter != Epsilon || needEpsilons)
+ letters.Append(letter);
- m_sparsed = true;
- PIRE_IFDEBUG(Cdbg << "Letter classes = " << letters << Endl);
+ m_sparsed = true;
+ PIRE_IFDEBUG(Cdbg << "Letter classes = " << letters << Endl);
}
// Expands the transition table so that every character has its own entry:
// for each entry of `letters` and each row, the destination set stored under
// the entry's key (letter.first) is copied to every character listed in
// letter.second.second. Sets m_sparsed to false.
// Both sides of the assignment use operator[], so a row that has no entry
// for the key presumably gains empty entries -- confirm against the
// TransitionRow type.
void Fsm::Unsparse()
{
- for (auto&& letter : letters)
- for (auto&& i : m_transitions)
- for (auto&& j : letter.second.second)
- i[j] = i[letter.first];
- m_sparsed = false;
+ for (auto&& letter : letters)
+ for (auto&& i : m_transitions)
+ for (auto&& j : letter.second.second)
+ i[j] = i[letter.first];
+ m_sparsed = false;
}
// Returns a set of 'terminal states', which are those of the final states,
// from which a transition to themselves on any letter is possible.
//
// "Any" means "every" here: a final state qualifies only if, for each entry
// of `letters`, its row has a destination set for that entry's key
// (letter.first) and the set contains the state itself. A single missing
// self-transition disqualifies the state. The FSM is not modified.
TSet<size_t> Fsm::TerminalStates() const
{
- TSet<size_t> terminals;
- for (auto&& final : m_final) {
- bool ok = true;
- for (auto&& letter : letters) {
- auto dests = m_transitions[final].find(letter.first);
- ok = ok && (dests != m_transitions[final].end() && dests->second.find(final) != dests->second.end());
- }
- if (ok)
- terminals.insert(final);
- }
- return terminals;
+ TSet<size_t> terminals;
+ for (auto&& final : m_final) {
+ bool ok = true;
+ for (auto&& letter : letters) {
+ auto dests = m_transitions[final].find(letter.first);
+ ok = ok && (dests != m_transitions[final].end() && dests->second.find(final) != dests->second.end());
+ }
+ if (ok)
+ terminals.insert(final);
+ }
+ return terminals;
}
// Task object passed to Pire::Impl::Determine() by Fsm::Determine(). It
// describes the source FSM to the generic algorithm and collects the
// resulting deterministic FSM. A `State` of the new FSM is a vector of
// source state indices; Next() always returns it sorted and without
// duplicates.
// NOTE(review): the order in which Pire::Impl::Determine() invokes these
// members is not visible here -- the summary below is per member only.
//
//  Letters()       letter table of the source FSM.
//  Initial()       the one-element state { source initial state }.
//  IsRequired(s)   false if `s` contains a terminal state of the source FSM
//                  (as computed by Fsm::TerminalStates() in the constructor),
//                  true otherwise.
//  Next(s, c)      union of Destinations(from, c) over all `from` in `s`.
//  AcceptStates()  sizes the new FSM to states.size(), makes state 0 initial,
//                  marks it determined and copies the letter table. For each
//                  new state: if it contains an old terminal state, it gets a
//                  self-transition on every letter and is remembered in
//                  mNewTerminals; if it contains an old final state, it
//                  becomes final (the scan stops early when the source FSM
//                  has no tags at all); tags of contained old states are
//                  OR-ed together. Finally every old output i -> j is OR-ed
//                  into new outputs k -> l for all new states k containing i
//                  and l containing j.
//  Connect()       adds one transition to the new FSM; asserts that `from`
//                  is not one of the new terminal states.
//  Success()       keeps only those new outputs whose (from, to) pair is an
//                  actual transition of the new FSM, then returns true.
//  Failure()       returns false.
//  Output()        the FSM being built.
namespace Impl {
class FsmDetermineTask {
public:
- typedef TVector<size_t> State;
- typedef Fsm::LettersTbl LettersTbl;
- typedef TMap<State, size_t> InvStates;
-
- FsmDetermineTask(const Fsm& fsm)
- : mFsm(fsm)
- , mTerminals(fsm.TerminalStates())
- {
- PIRE_IFDEBUG(Cdbg << "Terminal states: [" << Join(mTerminals.begin(), mTerminals.end(), ", ") << "]" << Endl);
- }
- const LettersTbl& Letters() const { return mFsm.letters; }
-
- State Initial() const { return State(1, mFsm.initial); }
- bool IsRequired(const State& state) const
- {
- for (auto&& i : state)
- if (mTerminals.find(i) != mTerminals.end())
- return false;
- return true;
- }
-
- State Next(const State& state, Char letter) const
- {
- State next;
- next.reserve(20);
- for (auto&& from : state) {
- const auto& part = mFsm.Destinations(from, letter);
- std::copy(part.begin(), part.end(), std::back_inserter(next));
- }
-
- std::sort(next.begin(), next.end());
- next.erase(std::unique(next.begin(), next.end()), next.end());
- PIRE_IFDEBUG(Cdbg << "Returning transition [" << Join(state.begin(), state.end(), ", ") << "] --" << letter
- << "--> [" << Join(next.begin(), next.end(), ", ") << "]" << Endl);
- return next;
- }
-
- void AcceptStates(const TVector<State>& states)
- {
- mNewFsm.Resize(states.size());
- mNewFsm.initial = 0;
- mNewFsm.determined = true;
- mNewFsm.letters = Letters();
- mNewFsm.m_final.clear();
- for (size_t ns = 0; ns < states.size(); ++ns) {
- PIRE_IFDEBUG(Cdbg << "State " << ns << " = [" << Join(states[ns].begin(), states[ns].end(), ", ") << "]" << Endl);
- for (auto&& j : states[ns]) {
-
- // If it was a terminal state, connect it to itself
- if (mTerminals.find(j) != mTerminals.end()) {
- for (auto&& letter : Letters())
- mNewFsm.Connect(ns, ns, letter.first);
- mNewTerminals.insert(ns);
- PIRE_IFDEBUG(Cdbg << "State " << ns << " becomes terminal because of old state " << j << Endl);
- }
- }
- for (auto&& j : states[ns]) {
- // If any state containing in our one is marked final, mark the new state final as well
- if (mFsm.IsFinal(j)) {
- PIRE_IFDEBUG(Cdbg << "State " << ns << " becomes final because of old state " << j << Endl);
- mNewFsm.SetFinal(ns, true);
- if (mFsm.tags.empty())
- // Weve got no tags and already know that the state is final,
- // hence weve done with this state and got nothing more to do.
- break;
- }
-
- // Bitwise OR all tags in states
- auto ti = mFsm.tags.find(j);
- if (ti != mFsm.tags.end()) {
- PIRE_IFDEBUG(Cdbg << "State " << ns << " carries tag " << ti->second << " because of old state " << j << Endl);
- mNewFsm.tags[ns] |= ti->second;
- }
- }
- }
- // For each old state, prepare a list of new state it is contained in
- typedef TMap< size_t, TVector<size_t> > Old2New;
- Old2New old2new;
- for (size_t ns = 0; ns < states.size(); ++ns)
- for (auto&& j : states[ns])
- old2new[j].push_back(ns);
-
- // Copy all outputs
- for (auto&& i : mFsm.outputs) {
- for (auto&& j : i.second) {
- auto from = old2new.find(i.first);
- auto to = old2new.find(j.first);
- if (from != old2new.end() && to != old2new.end()) {
- for (auto&& k : from->second)
- for (auto&& l : to->second)
- mNewFsm.outputs[k][l] |= j.second;
- }
- }
- }
- PIRE_IFDEBUG(Cdbg << "New terminals = [" << Join(mNewTerminals.begin(), mNewTerminals.end(), ",") << "]" << Endl);
- }
-
- void Connect(size_t from, size_t to, Char letter)
- {
- PIRE_IFDEBUG(Cdbg << "Connecting " << from << " --" << letter << "--> " << to << Endl);
- Y_ASSERT(mNewTerminals.find(from) == mNewTerminals.end());
- mNewFsm.Connect(from, to, letter);
- }
- typedef bool Result;
-
- Result Success() {
- Fsm::Outputs oldOutputs;
- // remove redundant outputs
- oldOutputs.swap(mNewFsm.outputs);
- for (size_t from = 0; from < mNewFsm.Size(); ++from) {
- auto fromOutput = oldOutputs.find(from);
- if (fromOutput == oldOutputs.end())
- continue;
- const auto& newTransitionsRow = mNewFsm.m_transitions[from];
- for (auto&& row : newTransitionsRow) {
- for (auto&& stateIt : row.second) {
- auto toOutput = fromOutput->second.find(stateIt);
- if (toOutput != fromOutput->second.end()) {
- mNewFsm.outputs[from].insert(*toOutput);
- }
- }
- }
- }
- return true;
- }
-
- Result Failure() { return false; }
-
- Fsm& Output() { return mNewFsm; }
+ typedef TVector<size_t> State;
+ typedef Fsm::LettersTbl LettersTbl;
+ typedef TMap<State, size_t> InvStates;
+
+ FsmDetermineTask(const Fsm& fsm)
+ : mFsm(fsm)
+ , mTerminals(fsm.TerminalStates())
+ {
+ PIRE_IFDEBUG(Cdbg << "Terminal states: [" << Join(mTerminals.begin(), mTerminals.end(), ", ") << "]" << Endl);
+ }
+ const LettersTbl& Letters() const { return mFsm.letters; }
+
+ State Initial() const { return State(1, mFsm.initial); }
+ bool IsRequired(const State& state) const
+ {
+ for (auto&& i : state)
+ if (mTerminals.find(i) != mTerminals.end())
+ return false;
+ return true;
+ }
+
+ State Next(const State& state, Char letter) const
+ {
+ State next;
+ next.reserve(20);
+ for (auto&& from : state) {
+ const auto& part = mFsm.Destinations(from, letter);
+ std::copy(part.begin(), part.end(), std::back_inserter(next));
+ }
+
+ std::sort(next.begin(), next.end());
+ next.erase(std::unique(next.begin(), next.end()), next.end());
+ PIRE_IFDEBUG(Cdbg << "Returning transition [" << Join(state.begin(), state.end(), ", ") << "] --" << letter
+ << "--> [" << Join(next.begin(), next.end(), ", ") << "]" << Endl);
+ return next;
+ }
+
+ void AcceptStates(const TVector<State>& states)
+ {
+ mNewFsm.Resize(states.size());
+ mNewFsm.initial = 0;
+ mNewFsm.determined = true;
+ mNewFsm.letters = Letters();
+ mNewFsm.m_final.clear();
+ for (size_t ns = 0; ns < states.size(); ++ns) {
+ PIRE_IFDEBUG(Cdbg << "State " << ns << " = [" << Join(states[ns].begin(), states[ns].end(), ", ") << "]" << Endl);
+ for (auto&& j : states[ns]) {
+
+ // If it was a terminal state, connect it to itself
+ if (mTerminals.find(j) != mTerminals.end()) {
+ for (auto&& letter : Letters())
+ mNewFsm.Connect(ns, ns, letter.first);
+ mNewTerminals.insert(ns);
+ PIRE_IFDEBUG(Cdbg << "State " << ns << " becomes terminal because of old state " << j << Endl);
+ }
+ }
+ for (auto&& j : states[ns]) {
+ // If any state containing in our one is marked final, mark the new state final as well
+ if (mFsm.IsFinal(j)) {
+ PIRE_IFDEBUG(Cdbg << "State " << ns << " becomes final because of old state " << j << Endl);
+ mNewFsm.SetFinal(ns, true);
+ if (mFsm.tags.empty())
+ // Weve got no tags and already know that the state is final,
+ // hence weve done with this state and got nothing more to do.
+ break;
+ }
+
+ // Bitwise OR all tags in states
+ auto ti = mFsm.tags.find(j);
+ if (ti != mFsm.tags.end()) {
+ PIRE_IFDEBUG(Cdbg << "State " << ns << " carries tag " << ti->second << " because of old state " << j << Endl);
+ mNewFsm.tags[ns] |= ti->second;
+ }
+ }
+ }
+ // For each old state, prepare a list of new state it is contained in
+ typedef TMap< size_t, TVector<size_t> > Old2New;
+ Old2New old2new;
+ for (size_t ns = 0; ns < states.size(); ++ns)
+ for (auto&& j : states[ns])
+ old2new[j].push_back(ns);
+
+ // Copy all outputs
+ for (auto&& i : mFsm.outputs) {
+ for (auto&& j : i.second) {
+ auto from = old2new.find(i.first);
+ auto to = old2new.find(j.first);
+ if (from != old2new.end() && to != old2new.end()) {
+ for (auto&& k : from->second)
+ for (auto&& l : to->second)
+ mNewFsm.outputs[k][l] |= j.second;
+ }
+ }
+ }
+ PIRE_IFDEBUG(Cdbg << "New terminals = [" << Join(mNewTerminals.begin(), mNewTerminals.end(), ",") << "]" << Endl);
+ }
+
+ void Connect(size_t from, size_t to, Char letter)
+ {
+ PIRE_IFDEBUG(Cdbg << "Connecting " << from << " --" << letter << "--> " << to << Endl);
+ Y_ASSERT(mNewTerminals.find(from) == mNewTerminals.end());
+ mNewFsm.Connect(from, to, letter);
+ }
+ typedef bool Result;
+
+ Result Success() {
+ Fsm::Outputs oldOutputs;
+ // remove redundant outputs
+ oldOutputs.swap(mNewFsm.outputs);
+ for (size_t from = 0; from < mNewFsm.Size(); ++from) {
+ auto fromOutput = oldOutputs.find(from);
+ if (fromOutput == oldOutputs.end())
+ continue;
+ const auto& newTransitionsRow = mNewFsm.m_transitions[from];
+ for (auto&& row : newTransitionsRow) {
+ for (auto&& stateIt : row.second) {
+ auto toOutput = fromOutput->second.find(stateIt);
+ if (toOutput != fromOutput->second.end()) {
+ mNewFsm.outputs[from].insert(*toOutput);
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+ Result Failure() { return false; }
+
+ Fsm& Output() { return mNewFsm; }
private:
- const Fsm& mFsm;
- Fsm mNewFsm;
- TSet<size_t> mTerminals;
- TSet<size_t> mNewTerminals;
+ const Fsm& mFsm;
+ Fsm mNewFsm;
+ TSet<size_t> mTerminals;
+ TSet<size_t> mNewTerminals;
};
}
bool Fsm::Determine(size_t maxsize /* = 0 */)
{
- static const unsigned MaxSize = 200000;
- if (determined)
- return true;
-
- PIRE_IFDEBUG(Cdbg << "=== Initial ===" << Endl << *this << Endl);
-
- RemoveEpsilons();
- PIRE_IFDEBUG(Cdbg << "=== After all epsilons removed" << Endl << *this << Endl);
-
- Impl::FsmDetermineTask task(*this);
- if (Pire::Impl::Determine(task, maxsize ? maxsize : MaxSize)) {
- task.Output().Swap(*this);
- PIRE_IFDEBUG(Cdbg << "=== Determined ===" << Endl << *this << Endl);
- return true;
- } else
- return false;
+ static const unsigned MaxSize = 200000; // limit handed to Impl::Determine when the caller passes maxsize == 0
+ if (determined)
+ return true; // already determined, nothing to do
+
+ PIRE_IFDEBUG(Cdbg << "=== Initial ===" << Endl << *this << Endl);
+
+ RemoveEpsilons(); // done in place, so it stays in effect even if determinization fails below
+ PIRE_IFDEBUG(Cdbg << "=== After all epsilons removed" << Endl << *this << Endl);
+
+ Impl::FsmDetermineTask task(*this);
+ if (Pire::Impl::Determine(task, maxsize ? maxsize : MaxSize)) {
+ task.Output().Swap(*this); // adopt the automaton built by the task
+ PIRE_IFDEBUG(Cdbg << "=== Determined ===" << Endl << *this << Endl);
+ return true;
+ } else
+ return false; // nothing is swapped in on this path
}
namespace Impl {
class FsmMinimizeTask {
public:
- explicit FsmMinimizeTask(const Fsm& fsm)
- : mFsm(fsm)
- , reversedTransitions(fsm.Size())
- , StateClass(fsm.Size())
- , Classes(0)
- {
- Y_ASSERT(mFsm.IsDetermined());
-
- TMap<bool, size_t> FinalStateClassMap;
-
- for (size_t state = 0; state < mFsm.Size(); ++state) {
- reversedTransitions[state].resize(mFsm.Letters().Size());
- if (FinalStateClassMap.find(mFsm.IsFinal(state)) == FinalStateClassMap.end()) {
- FinalStateClassMap[mFsm.IsFinal(state)] = Classes++;
- }
- StateClass[state] = FinalStateClassMap[mFsm.IsFinal(state)];
- }
-
- for (size_t state = 0; state < mFsm.Size(); ++state) {
- TSet<ypair<Char, size_t>> usedTransitions;
- for (const auto& transition : mFsm.m_transitions[state]) {
- Y_ASSERT(transition.second.size() == 1);
- auto destination = *transition.second.begin();
- auto letter = mFsm.Letters().Index(transition.first);
- if (usedTransitions.find(ymake_pair(letter, destination)) == usedTransitions.end()) {
- usedTransitions.insert(ymake_pair(letter, destination));
- reversedTransitions[destination][letter].push_back(state);
- }
- }
- }
- }
-
- TVector<size_t>& GetStateClass() { return StateClass; }
-
- size_t& GetClassesNumber() { return Classes; }
-
- size_t LettersCount() const {
- return mFsm.Letters().Size();
- }
-
- bool IsDetermined() const {
- return mFsm.IsDetermined();
- }
-
- size_t Size() const {
- return mFsm.Size();
- }
-
- const TVector<size_t>& Previous(size_t state, size_t letter) const {
- return reversedTransitions[state][letter];
- }
-
- void AcceptStates() {
- mNewFsm.Resize(Classes);
- mNewFsm.letters = mFsm.letters;
- mNewFsm.determined = mFsm.determined;
- mNewFsm.m_sparsed = mFsm.m_sparsed;
- mNewFsm.SetFinal(0, false);
-
- // Unite equality classes into new states
- size_t fromIdx = 0;
- for (auto from = mFsm.m_transitions.begin(), fromEnd = mFsm.m_transitions.end(); from != fromEnd; ++from, ++fromIdx) {
- size_t dest = StateClass[fromIdx];
- PIRE_IFDEBUG(Cdbg << "[min] State " << fromIdx << " becomes state " << dest << Endl);
- for (auto&& letter : *from) {
- Y_ASSERT(letter.second.size() == 1 || !"FSM::minimize(): FSM not deterministic");
- mNewFsm.Connect(dest, StateClass[*letter.second.begin()], letter.first);
- }
- if (mFsm.IsFinal(fromIdx)) {
- mNewFsm.SetFinal(dest, true);
- PIRE_IFDEBUG(Cdbg << "[min] New state " << dest << " becomes final because of old state " << fromIdx << Endl);
- }
-
- // Append tags
- auto ti = mFsm.tags.find(fromIdx);
- if (ti != mFsm.tags.end()) {
- mNewFsm.tags[dest] |= ti->second;
- PIRE_IFDEBUG(Cdbg << "[min] New state " << dest << " carries tag " << ti->second << " because of old state " << fromIdx << Endl);
- }
- }
- mNewFsm.initial = StateClass[mFsm.initial];
-
- // Restore outputs
- for (auto&& output : mFsm.outputs)
- for (auto&& output2 : output.second)
- mNewFsm.outputs[StateClass[output.first]].insert(ymake_pair(StateClass[output2.first], output2.second));
- }
-
- typedef bool Result;
-
- Result Success() {
- return true;
- }
-
- Result Failure() {
- return false;
- }
-
- Fsm& Output() {
- return mNewFsm;
- }
+ explicit FsmMinimizeTask(const Fsm& fsm) // Seeds the final/non-final state partition and indexes the incoming transitions of a determined FSM.
+ : mFsm(fsm)
+ , reversedTransitions(fsm.Size())
+ , StateClass(fsm.Size())
+ , Classes(0)
+ {
+ Y_ASSERT(mFsm.IsDetermined());
+
+ TMap<bool, size_t> FinalStateClassMap; // IsFinal(state) -> class number, numbered in order of first occurrence
+
+ for (size_t state = 0; state < mFsm.Size(); ++state) {
+ reversedTransitions[state].resize(mFsm.Letters().Size()); // one predecessor list per entry of the letters table
+ if (FinalStateClassMap.find(mFsm.IsFinal(state)) == FinalStateClassMap.end()) {
+ FinalStateClassMap[mFsm.IsFinal(state)] = Classes++;
+ }
+ StateClass[state] = FinalStateClassMap[mFsm.IsFinal(state)];
+ }
+
+ for (size_t state = 0; state < mFsm.Size(); ++state) {
+ TSet<ypair<Char, size_t>> usedTransitions; // (letter index, destination) pairs already recorded for this state
+ for (const auto& transition : mFsm.m_transitions[state]) {
+ Y_ASSERT(transition.second.size() == 1); // each row entry of a determined FSM must have exactly one destination
+ auto destination = *transition.second.begin();
+ auto letter = mFsm.Letters().Index(transition.first); // presumably the index of the letter's equivalence class -- confirm against Partition::Index
+ if (usedTransitions.find(ymake_pair(letter, destination)) == usedTransitions.end()) {
+ usedTransitions.insert(ymake_pair(letter, destination));
+ reversedTransitions[destination][letter].push_back(state); // state reaches destination on this letter index
+ }
+ }
+ }
+ }
+
+ TVector<size_t>& GetStateClass() { return StateClass; } // mutable access to the state -> class table (indexed by old state)
+
+ size_t& GetClassesNumber() { return Classes; } // mutable access to the class counter; AcceptStates() sizes the new FSM from it
+
+ size_t LettersCount() const {
+ return mFsm.Letters().Size();
+ }
+
+ bool IsDetermined() const {
+ return mFsm.IsDetermined();
+ }
+
+ size_t Size() const {
+ return mFsm.Size();
+ }
+
+ const TVector<size_t>& Previous(size_t state, size_t letter) const { // States recorded by the constructor as having a transition into `state` on letter index `letter`.
+ return reversedTransitions[state][letter];
+ }
+
+ void AcceptStates() { // Builds mNewFsm with one state per class number stored in StateClass.
+ mNewFsm.Resize(Classes);
+ mNewFsm.letters = mFsm.letters;
+ mNewFsm.determined = mFsm.determined;
+ mNewFsm.m_sparsed = mFsm.m_sparsed;
+ mNewFsm.SetFinal(0, false);
+
+ // Unite equality classes into new states
+ size_t fromIdx = 0;
+ for (auto from = mFsm.m_transitions.begin(), fromEnd = mFsm.m_transitions.end(); from != fromEnd; ++from, ++fromIdx) { // fromIdx is the old state whose transition row is *from
+ size_t dest = StateClass[fromIdx];
+ PIRE_IFDEBUG(Cdbg << "[min] State " << fromIdx << " becomes state " << dest << Endl);
+ for (auto&& letter : *from) {
+ Y_ASSERT(letter.second.size() == 1 || !"FSM::minimize(): FSM not deterministic");
+ mNewFsm.Connect(dest, StateClass[*letter.second.begin()], letter.first); // re-target the transition at the classes of both endpoints
+ }
+ if (mFsm.IsFinal(fromIdx)) {
+ mNewFsm.SetFinal(dest, true);
+ PIRE_IFDEBUG(Cdbg << "[min] New state " << dest << " becomes final because of old state " << fromIdx << Endl);
+ }
+
+ // Append tags
+ auto ti = mFsm.tags.find(fromIdx);
+ if (ti != mFsm.tags.end()) {
+ mNewFsm.tags[dest] |= ti->second; // tags of all old states merged into dest are OR-ed together
+ PIRE_IFDEBUG(Cdbg << "[min] New state " << dest << " carries tag " << ti->second << " because of old state " << fromIdx << Endl);
+ }
+ }
+ mNewFsm.initial = StateClass[mFsm.initial];
+
+ // Restore outputs
+ for (auto&& output : mFsm.outputs)
+ for (auto&& output2 : output.second)
+ mNewFsm.outputs[StateClass[output.first]].insert(ymake_pair(StateClass[output2.first], output2.second)); // NOTE(review): if TMap::insert follows std::map semantics, the first output seen for a class pair wins -- confirm
+ }
+
+ typedef bool Result;
+
+ Result Success() {
+ return true;
+ }
+
+ Result Failure() {
+ return false;
+ }
+
+ Fsm& Output() {
+ return mNewFsm;
+ }
private:
- const Fsm& mFsm;
- Fsm mNewFsm;
- TVector<TVector<TVector<size_t>>> reversedTransitions;
- TVector<size_t> StateClass;
- size_t Classes;
+ const Fsm& mFsm;
+ Fsm mNewFsm;
+ TVector<TVector<TVector<size_t>>> reversedTransitions;
+ TVector<size_t> StateClass;
+ size_t Classes;
};
}
void Fsm::Minimize()
{
- // Minimization algorithm is only applicable to a determined FSM.
- Y_ASSERT(determined);
+ // Minimization algorithm is only applicable to a determined FSM.
+ Y_ASSERT(determined);
- Impl::FsmMinimizeTask task{*this};
- if (Pire::Impl::Minimize(task)) {
- task.Output().Swap(*this);
- }
+ Impl::FsmMinimizeTask task{*this};
+ if (Pire::Impl::Minimize(task)) {
+ task.Output().Swap(*this); // replace this FSM with the one built by the task
+ }
}
Fsm& Fsm::Canonize(size_t maxSize /* = 0 */)
{
- if (!IsDetermined()) {
- if (!Determine(maxSize))
- throw Error("regexp pattern too complicated");
- }
- Minimize();
- return *this;
+ if (!IsDetermined()) {
+ if (!Determine(maxSize))
+ throw Error("regexp pattern too complicated"); // raised when Determine() returns false
+ }
+ Minimize(); // runs whether or not determinization was needed above
+ return *this;
}
void Fsm::PrependAnything()
{
- size_t newstate = Size();
- Resize(Size() + 1);
- for (size_t letter = 0; letter < MaxChar; ++letter)
- Connect(newstate, newstate, letter);
+ size_t newstate = Size(); // index the state appended by Resize() will get
+ Resize(Size() + 1);
+ for (size_t letter = 0; letter < MaxChar; ++letter)
+ Connect(newstate, newstate, letter); // self-loop on every letter value below MaxChar
- Connect(newstate, initial);
- initial = newstate;
+ Connect(newstate, initial); // no letter given, so Connect's default (Epsilon) is used
+ initial = newstate;
- determined = false;
+ determined = false; // the FSM must be determined again after this change
}
void Fsm::AppendAnything()
{
- size_t newstate = Size();
- Resize(Size() + 1);
- for (size_t letter = 0; letter < MaxChar; ++letter)
- Connect(newstate, newstate, letter);
+ size_t newstate = Size(); // index the state appended by Resize() will get
+ Resize(Size() + 1);
+ for (size_t letter = 0; letter < MaxChar; ++letter)
+ Connect(newstate, newstate, letter); // self-loop on every letter value below MaxChar
- ConnectFinal(newstate);
- ClearFinal();
- SetFinal(newstate, 1);
+ ConnectFinal(newstate); // no letter given, so ConnectFinal's default (Epsilon) is used
+ ClearFinal();
+ SetFinal(newstate, 1); // after ClearFinal() the new state is the only final state
- determined = false;
+ determined = false; // the FSM must be determined again after this change
}
Fsm& Fsm::Surround()
{
- PrependAnything();
- AppendAnything();
- return *this;
+ PrependAnything();
+ AppendAnything();
+ return *this;
}
void Fsm::Divert(size_t from, size_t to, size_t dest)
{
- if (to == dest)
- return;
-
- // Assign the output
- auto oi = outputs.find(from);
- if (oi != outputs.end()) {
- auto oi2 = oi->second.find(to);
- if (oi2 != oi->second.end()) {
- unsigned long output = oi2->second;
- oi->second.erase(oi2);
- oi->second.insert(ymake_pair(dest, output));
- }
- }
-
- // Assign the transition
- for (auto&& i : m_transitions[from]) {
- auto di = i.second.find(to);
- if (di != i.second.end()) {
- i.second.erase(di);
- i.second.insert(dest);
- }
- }
-
- ClearHints();
+ if (to == dest)
+ return; // nothing to divert
+
+ // Assign the output
+ auto oi = outputs.find(from);
+ if (oi != outputs.end()) {
+ auto oi2 = oi->second.find(to);
+ if (oi2 != oi->second.end()) {
+ unsigned long output = oi2->second; // saved before erase() invalidates oi2
+ oi->second.erase(oi2);
+ oi->second.insert(ymake_pair(dest, output)); // NOTE(review): if TMap::insert follows std::map semantics, an existing (from, dest) output is kept and this one is dropped -- confirm
+ }
+ }
+
+ // Assign the transition
+ for (auto&& i : m_transitions[from]) { // i.second is the destination set of one row entry
+ auto di = i.second.find(to);
+ if (di != i.second.end()) {
+ i.second.erase(di);
+ i.second.insert(dest);
+ }
+ }
+
+ ClearHints();
}
diff --git a/library/cpp/regex/pire/pire/fsm.h b/library/cpp/regex/pire/pire/fsm.h
index fce84616d94..64ca6dd7c19 100644
--- a/library/cpp/regex/pire/pire/fsm.h
+++ b/library/cpp/regex/pire/pire/fsm.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -32,252 +32,252 @@
namespace Pire {
- namespace Impl {
- class FsmDetermineTask;
- class FsmMinimizeTask;
+ namespace Impl {
+ class FsmDetermineTask;
+ class FsmMinimizeTask;
class HalfFinalDetermineTask;
- }
-
- /// A Flying Spaghetti Monster... no, just a Finite State Machine.
- class Fsm {
- public:
- typedef ybitset<MaxChar> Charset;
-
- Fsm();
- void Swap(Fsm& fsm);
-
- static Fsm MakeFalse();
-
- /// Current number of states
- size_t Size() const { return m_transitions.size(); }
-
- Fsm& Append(char c);
- Fsm& Append(const ystring& str);
- Fsm& AppendSpecial(Char c);
-
- /// Efficiently appends a union of passed strings to FSM.
- /// Used for ranges (e.g. [a-z]), character classes (e.g. \w, \d)
- /// and case-insensitive comparison of multibyte characters,
- /// when one string represents a lowercase variant of a character,
- /// while another string represents its uppercase variant.
- Fsm& AppendStrings(const TVector<ystring>& strings);
-
- /// Appends a part matching a single byte (any).
- Fsm& AppendDot();
-
- /// Appends and prepends the FSM with the iterated dot (see above).
- Fsm& Surround(); // returns *this
- Fsm Surrounded() const { Fsm copy(*this); copy.Surround(); return copy; }
-
- Fsm& operator += (const Fsm& rhs); ///< Concatenation
- Fsm& operator |= (const Fsm& rhs); ///< Alternation
- Fsm& operator &= (const Fsm& rhs); ///< Conjunction
- Fsm& Iterate(); ///< Klene star
- Fsm& Complement(); ///< Complementation
- Fsm& operator *= (size_t count) { *this = *this * count; return *this; }
-
- Fsm operator + (const Fsm& rhs) const { Fsm a(*this); return a += rhs; }
- Fsm operator | (const Fsm& rhs) const { Fsm a(*this); return a |= rhs; }
- Fsm operator & (const Fsm& rhs) const { Fsm a(*this); return a &= rhs; }
- Fsm operator * () const { Fsm a(*this); return a.Iterate(); }
- Fsm operator ~ () const { Fsm a(*this); return a.Complement(); }
- Fsm operator * (size_t count) const;
-
- // === Raw FSM construction ===
-
- /// Connects two states with given transition
- void Connect(size_t from, size_t to, Char c = Epsilon);
-
- /// Removes given character from the specified transition.
- void Disconnect(size_t from, size_t to, Char c);
-
- /// Completely removes given transition
- void Disconnect(size_t from, size_t to);
-
- /// Creates an FSM which matches any prefix of any word current FSM matches.
- void MakePrefix();
-
- /// Creates an FSM which matches any suffix of any word current FSM matches.
- void MakeSuffix();
-
- /// Does the one way part of Surround().
- void PrependAnything();
- void AppendAnything();
-
- /// Creates an FSM which matches reversed strings matched by current FSM.
- Fsm& Reverse();
-
- /// Returns a set of states from which no final states are reachable
- TSet<size_t> DeadStates() const;
-
- /// Removes all dead end paths from FSM
- void RemoveDeadEnds();
-
- /// Determines and minimizes the FSM if neccessary. Returns *this.
- Fsm& Canonize(size_t maxSize = 0);
-
- template<class Scanner>
- Scanner Compile(size_t distance = 0);
-
- void DumpState(yostream& s, size_t state) const;
- void DumpTo(yostream& s, const ystring& name = "") const;
-
- typedef TSet<size_t> StatesSet;
- typedef TMap<size_t, StatesSet> TransitionRow;
- typedef TVector<TransitionRow> TransitionTable;
-
- struct LettersEquality {
- LettersEquality(const Fsm::TransitionTable& tbl): m_tbl(&tbl) {}
- bool operator()(Char a, Char b) const;
- private:
- const Fsm::TransitionTable* m_tbl;
- };
-
- typedef TSet<size_t> FinalTable;
- typedef Partition<Char, LettersEquality> LettersTbl;
-
-
- /*
- * A very low level FSM building interface.
- *
- * It is generally unwise to call any of these functions unless you are building
- * your own scanner, your own ecoding or exaclty know what you are doing.
- */
- unsigned long Tag(size_t state) const { Tags::const_iterator i = tags.find(state); return (i == tags.end()) ? 0 : i->second; }
- void SetTag(size_t state, unsigned long tag) { tags[state] = tag; }
-
- unsigned long Output(size_t from, size_t to) const;
- void SetOutput(size_t from, size_t to, unsigned long output) { outputs[from][to] = output; }
- void ClearOutputs() { outputs.clear(); }
-
- const FinalTable& Finals() const { return m_final; }
- bool IsFinal(size_t state) const { return m_final.find(state) != m_final.end(); }
- void SetFinal(size_t size, bool final);
- void ClearFinal() { m_final.clear(); }
-
- /// Removes all espilon transitions from the FSM. Does not change the FSMs language.
- void RemoveEpsilons();
-
- /// Resize FSM to newSize states. Returns old size.
- size_t Resize(size_t newSize);
-
- /// Imports foreign transition table
- void Import(const Fsm& rhs);
-
- /// Connects all final state with given state
- void ConnectFinal(size_t to, Char c = Epsilon);
-
- /// Diverts all transition between two given states to @p dest, preserving outputs
- void Divert(size_t from, size_t to, size_t dest);
-
- /// Checks whether two states are connected using given letter.
- bool Connected(size_t from, size_t to, Char c) const;
-
- /// Returns a set of letters on which a transition from the specified state exists
- TSet<Char> OutgoingLetters(size_t state) const;
-
- /// Returns a set of states where a transition from the given state using the given letter is possible
- const StatesSet& Destinations(size_t from, Char letter) const;
-
- /// Checks whether two states are connected using any letter.
- bool Connected(size_t from, size_t to) const;
- size_t Initial() const { return initial; }
- void SetInitial(size_t init) { initial = init; }
-
- const LettersTbl& Letters() const { return letters; }
-
- /// Determines the FSM.
- /// Breaks FSM invariant of having a single final state, so high-level FSM building
- /// functions (i.e. Append(), operator+(), etc...) no longer can be applied to the FSM
- /// until the invariants have been manually restored.
- /// return value: successful?
- bool Determine(size_t maxsize = 0);
- bool IsDetermined() const { return determined; }
- void SetIsDetermined(bool det) { determined = det; }
-
- /// Minimizes amount of states in the regexp.
- /// Requires a determined FSM.
- void Minimize();
-
-
- /// Builds letters equivalence classes
- void Sparse(bool needEpsilons = false);
-
- /// Unpacks all letters equivalence classs back into transitions table
- void Unsparse();
-
- private:
-
- /// Transitions table :: Q x V -> exp(Q)
- TransitionTable m_transitions;
-
- /// Initial state
- size_t initial;
-
- /// Final states.
- FinalTable m_final;
-
- LettersTbl letters;
-
- /// Does 'letters' make sense?
- bool m_sparsed;
-
- /// Is the FSM already determined?
- bool determined;
-
- /// Output
- typedef TMap< size_t, TMap<size_t, unsigned long> > Outputs;
- Outputs outputs;
-
- typedef TMap<size_t, unsigned long> Tags;
- Tags tags;
-
- /// Heuristics hit: true iff this FSM is a union of two other FSMs
- bool isAlternative;
-
- void ShortCutEpsilon(size_t from, size_t thru, TVector< TSet<size_t> >& inveps); ///< internal
- void MergeEpsilonConnection(size_t from, size_t to); ///< internal
-
- TSet<size_t> TerminalStates() const;
-
- Char Translate(Char c) const;
-
- void ClearHints() { isAlternative = false; }
-
- friend class Impl::FsmDetermineTask;
- friend class Impl::FsmMinimizeTask;
+ }
+
+ /// A Flying Spaghetti Monster... no, just a Finite State Machine.
+ class Fsm {
+ public:
+ typedef ybitset<MaxChar> Charset;
+
+ Fsm();
+ void Swap(Fsm& fsm);
+
+ static Fsm MakeFalse();
+
+ /// Current number of states
+ size_t Size() const { return m_transitions.size(); }
+
+ Fsm& Append(char c);
+ Fsm& Append(const ystring& str);
+ Fsm& AppendSpecial(Char c);
+
+ /// Efficiently appends a union of passed strings to FSM.
+ /// Used for ranges (e.g. [a-z]), character classes (e.g. \w, \d)
+ /// and case-insensitive comparison of multibyte characters,
+ /// when one string represents a lowercase variant of a character,
+ /// while another string represents its uppercase variant.
+ Fsm& AppendStrings(const TVector<ystring>& strings);
+
+ /// Appends a part matching a single byte (any).
+ Fsm& AppendDot();
+
+ /// Appends and prepends the FSM with the iterated dot (see above).
+ Fsm& Surround(); // returns *this
+ Fsm Surrounded() const { Fsm copy(*this); copy.Surround(); return copy; }
+
+ Fsm& operator += (const Fsm& rhs); ///< Concatenation
+ Fsm& operator |= (const Fsm& rhs); ///< Alternation
+ Fsm& operator &= (const Fsm& rhs); ///< Conjunction
+ Fsm& Iterate(); ///< Kleene star
+ Fsm& Complement(); ///< Complementation
+ Fsm& operator *= (size_t count) { *this = *this * count; return *this; }
+
+ Fsm operator + (const Fsm& rhs) const { Fsm a(*this); return a += rhs; }
+ Fsm operator | (const Fsm& rhs) const { Fsm a(*this); return a |= rhs; }
+ Fsm operator & (const Fsm& rhs) const { Fsm a(*this); return a &= rhs; }
+ Fsm operator * () const { Fsm a(*this); return a.Iterate(); }
+ Fsm operator ~ () const { Fsm a(*this); return a.Complement(); }
+ Fsm operator * (size_t count) const;
+
+ // === Raw FSM construction ===
+
+ /// Connects two states with given transition
+ void Connect(size_t from, size_t to, Char c = Epsilon);
+
+ /// Removes given character from the specified transition.
+ void Disconnect(size_t from, size_t to, Char c);
+
+ /// Completely removes given transition
+ void Disconnect(size_t from, size_t to);
+
+ /// Creates an FSM which matches any prefix of any word current FSM matches.
+ void MakePrefix();
+
+ /// Creates an FSM which matches any suffix of any word current FSM matches.
+ void MakeSuffix();
+
+ /// Does the one way part of Surround().
+ void PrependAnything();
+ void AppendAnything();
+
+ /// Creates an FSM which matches reversed strings matched by current FSM.
+ Fsm& Reverse();
+
+ /// Returns a set of states from which no final states are reachable
+ TSet<size_t> DeadStates() const;
+
+ /// Removes all dead end paths from FSM
+ void RemoveDeadEnds();
+
+ /// Determines and minimizes the FSM if necessary. Returns *this.
+ Fsm& Canonize(size_t maxSize = 0);
+
+ template<class Scanner>
+ Scanner Compile(size_t distance = 0);
+
+ void DumpState(yostream& s, size_t state) const;
+ void DumpTo(yostream& s, const ystring& name = "") const;
+
+ typedef TSet<size_t> StatesSet;
+ typedef TMap<size_t, StatesSet> TransitionRow;
+ typedef TVector<TransitionRow> TransitionTable;
+
+ struct LettersEquality {
+ LettersEquality(const Fsm::TransitionTable& tbl): m_tbl(&tbl) {}
+ bool operator()(Char a, Char b) const;
+ private:
+ const Fsm::TransitionTable* m_tbl;
+ };
+
+ typedef TSet<size_t> FinalTable;
+ typedef Partition<Char, LettersEquality> LettersTbl;
+
+
+ /*
+ * A very low level FSM building interface.
+ *
+ * It is generally unwise to call any of these functions unless you are building
+ * your own scanner, your own encoding, or know exactly what you are doing.
+ */
+ unsigned long Tag(size_t state) const { Tags::const_iterator i = tags.find(state); return (i == tags.end()) ? 0 : i->second; }
+ void SetTag(size_t state, unsigned long tag) { tags[state] = tag; }
+
+ unsigned long Output(size_t from, size_t to) const;
+ void SetOutput(size_t from, size_t to, unsigned long output) { outputs[from][to] = output; }
+ void ClearOutputs() { outputs.clear(); }
+
+ const FinalTable& Finals() const { return m_final; }
+ bool IsFinal(size_t state) const { return m_final.find(state) != m_final.end(); }
+ void SetFinal(size_t size, bool final);
+ void ClearFinal() { m_final.clear(); }
+
+ /// Removes all epsilon transitions from the FSM. Does not change the FSM's language.
+ void RemoveEpsilons();
+
+ /// Resize FSM to newSize states. Returns old size.
+ size_t Resize(size_t newSize);
+
+ /// Imports foreign transition table
+ void Import(const Fsm& rhs);
+
+ /// Connects all final states with the given state
+ void ConnectFinal(size_t to, Char c = Epsilon);
+
+ /// Diverts all transitions between two given states to @p dest, preserving outputs
+ void Divert(size_t from, size_t to, size_t dest);
+
+ /// Checks whether two states are connected using given letter.
+ bool Connected(size_t from, size_t to, Char c) const;
+
+ /// Returns a set of letters on which a transition from the specified state exists
+ TSet<Char> OutgoingLetters(size_t state) const;
+
+ /// Returns a set of states where a transition from the given state using the given letter is possible
+ const StatesSet& Destinations(size_t from, Char letter) const;
+
+ /// Checks whether two states are connected using any letter.
+ bool Connected(size_t from, size_t to) const;
+ size_t Initial() const { return initial; }
+ void SetInitial(size_t init) { initial = init; }
+
+ const LettersTbl& Letters() const { return letters; }
+
+ /// Determines the FSM.
+ /// Breaks FSM invariant of having a single final state, so high-level FSM building
+ /// functions (i.e. Append(), operator+(), etc...) no longer can be applied to the FSM
+ /// until the invariants have been manually restored.
+ /// return value: successful?
+ bool Determine(size_t maxsize = 0);
+ bool IsDetermined() const { return determined; }
+ void SetIsDetermined(bool det) { determined = det; }
+
+ /// Minimizes amount of states in the regexp.
+ /// Requires a determined FSM.
+ void Minimize();
+
+
+ /// Builds letters equivalence classes
+ void Sparse(bool needEpsilons = false);
+
+ /// Unpacks all letter equivalence classes back into the transitions table
+ void Unsparse();
+
+ private:
+
+ /// Transitions table :: Q x V -> exp(Q)
+ TransitionTable m_transitions;
+
+ /// Initial state
+ size_t initial;
+
+ /// Final states.
+ FinalTable m_final;
+
+ LettersTbl letters;
+
+ /// Does 'letters' make sense?
+ bool m_sparsed;
+
+ /// Is the FSM already determined?
+ bool determined;
+
+ /// Output
+ typedef TMap< size_t, TMap<size_t, unsigned long> > Outputs;
+ Outputs outputs;
+
+ typedef TMap<size_t, unsigned long> Tags;
+ Tags tags;
+
+ /// Heuristics hint: true iff this FSM is a union of two other FSMs
+ bool isAlternative;
+
+ void ShortCutEpsilon(size_t from, size_t thru, TVector< TSet<size_t> >& inveps); ///< internal
+ void MergeEpsilonConnection(size_t from, size_t to); ///< internal
+
+ TSet<size_t> TerminalStates() const;
+
+ Char Translate(Char c) const;
+
+ void ClearHints() { isAlternative = false; }
+
+ friend class Impl::FsmDetermineTask;
+ friend class Impl::FsmMinimizeTask;
friend class Impl::HalfFinalDetermineTask;
- };
-
- template<class Scanner>
- void BuildScanner(const Fsm& fsm, Scanner& r)
- {
- TSet<size_t> dead;
- if (Scanner::DeadFlag)
- dead = fsm.DeadStates();
-
- for (size_t state = 0; state < fsm.Size(); ++state)
- r.SetTag(state, typename Scanner::Tag(fsm.Tag(state)
- | (fsm.IsFinal(state) ? Scanner::FinalFlag : 0)
- | ((dead.find(state) != dead.end()) ? Scanner::DeadFlag : 0)));
-
- for (size_t from = 0; from != fsm.Size(); ++from)
- for (Fsm::LettersTbl::ConstIterator lit = fsm.Letters().Begin(), lie = fsm.Letters().End(); lit != lie; ++lit) {
- const Fsm::StatesSet& tos = fsm.Destinations(from, lit->first);
- for (Fsm::StatesSet::const_iterator to = tos.begin(), toEnd = tos.end(); to != toEnd; ++to)
- r.SetJump(from, lit->first, *to, r.RemapAction(fsm.Output(from, *to)));
- }
-
- r.FinishBuild();
- }
-
- template<class Scanner>
- inline Scanner Fsm::Compile(size_t distance)
- {
- return Scanner(*this, distance);
- }
-
- yostream& operator << (yostream&, const Fsm&);
+ };
+
+ template<class Scanner>
+ void BuildScanner(const Fsm& fsm, Scanner& r) // Copies per-state tags/flags and all transitions of fsm into scanner r, then calls r.FinishBuild().
+ {
+ TSet<size_t> dead;
+ if (Scanner::DeadFlag) // dead states are computed only when the scanner's DeadFlag is non-zero
+ dead = fsm.DeadStates();
+
+ for (size_t state = 0; state < fsm.Size(); ++state)
+ r.SetTag(state, typename Scanner::Tag(fsm.Tag(state) // the FSM's own tag, OR-ed with the flags below
+ | (fsm.IsFinal(state) ? Scanner::FinalFlag : 0)
+ | ((dead.find(state) != dead.end()) ? Scanner::DeadFlag : 0)));
+
+ for (size_t from = 0; from != fsm.Size(); ++from)
+ for (Fsm::LettersTbl::ConstIterator lit = fsm.Letters().Begin(), lie = fsm.Letters().End(); lit != lie; ++lit) { // one iteration per entry of the letters table
+ const Fsm::StatesSet& tos = fsm.Destinations(from, lit->first);
+ for (Fsm::StatesSet::const_iterator to = tos.begin(), toEnd = tos.end(); to != toEnd; ++to)
+ r.SetJump(from, lit->first, *to, r.RemapAction(fsm.Output(from, *to))); // the FSM output is translated by the scanner's RemapAction() before being stored
+ }
+
+ r.FinishBuild();
+ }
+
+ template<class Scanner>
+ inline Scanner Fsm::Compile(size_t distance)
+ {
+ return Scanner(*this, distance);
+ }
+
+ yostream& operator << (yostream&, const Fsm&);
}
#endif
diff --git a/library/cpp/regex/pire/pire/glue.h b/library/cpp/regex/pire/pire/glue.h
index d8d6cb00e5b..3308a58863d 100644
--- a/library/cpp/regex/pire/pire/glue.h
+++ b/library/cpp/regex/pire/pire/glue.h
@@ -12,7 +12,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -45,7 +45,7 @@ public:
private:
typename Scanner::Letter* m_lhs;
typename Scanner::Letter* m_rhs;
-};
+};
// This lookup table is used instead of std::map.
// The key idea is to specify size which is a power of 2 in order to use >> and | instead of
@@ -54,113 +54,113 @@ private:
template <size_t N, class State>
class GluedStateLookupTable {
public:
- static const size_t MaxSize = N;
- typedef ypair<State, State> key_type;
- typedef size_t mapped_type;
- typedef ypair<key_type, mapped_type> value_type;
- typedef value_type* iterator;
- typedef const value_type* const_iterator;
-
- GluedStateLookupTable()
- : mMap(new value_type[N])
- , mFilled(N, false)
- {}
-
- ~GluedStateLookupTable() = default;
-
- const_iterator end() const {
- return mMap.Get() + MaxSize;
- }
- // Note that in fact mMap is sparsed and traditional [begin,end)
- // traversal is unavailable; hence no begin() method here.
- // end() is only valid for comparing with find() result.
- const_iterator find(const key_type& st) const {
- size_t ind = Search(st);
- return mFilled[ind] ? (mMap.Get() + ind) : end();
- }
-
- ypair<iterator, bool> insert(const value_type& v) {
- size_t ind = Search(v.first);
- if (!mFilled[ind]) {
- mMap[ind] = v;
- mFilled[ind] = true;
- return ymake_pair(mMap.Get() + ind, true);
- } else
- return ymake_pair(mMap.Get() + ind, false);
- }
+ static const size_t MaxSize = N;
+ typedef ypair<State, State> key_type;
+ typedef size_t mapped_type;
+ typedef ypair<key_type, mapped_type> value_type;
+ typedef value_type* iterator;
+ typedef const value_type* const_iterator;
+
+ GluedStateLookupTable()
+ : mMap(new value_type[N])
+ , mFilled(N, false)
+ {}
+
+ ~GluedStateLookupTable() = default;
+
+ const_iterator end() const {
+ return mMap.Get() + MaxSize;
+ }
+ // Note that in fact mMap is sparsed and traditional [begin,end)
+ // traversal is unavailable; hence no begin() method here.
+ // end() is only valid for comparing with find() result.
+ const_iterator find(const key_type& st) const { // Returns a pointer to the slot holding st, or end() when the probed slot is unfilled.
+ size_t ind = Search(st);
+ return mFilled[ind] ? (mMap.Get() + ind) : end(); // NOTE(review): ind is used unchecked although Search() can return (size_t)-1
+ }
+
+ ypair<iterator, bool> insert(const value_type& v) { // Stores v if its key's slot is unfilled; returns the slot and whether a store happened.
+ size_t ind = Search(v.first);
+ if (!mFilled[ind]) { // NOTE(review): ind is used unchecked although Search() can return (size_t)-1
+ mMap[ind] = v;
+ mFilled[ind] = true;
+ return ymake_pair(mMap.Get() + ind, true);
+ } else
+ return ymake_pair(mMap.Get() + ind, false); // key already present: the stored value is left untouched
+ }
private:
- size_t Search(const key_type& st) const {
- size_t startInd = (Hash(st) % N);
- for (size_t ind = startInd; ind != (startInd + N - 1) % N; ind = (ind + 1) % N) {
- if (!mFilled[ind] || mMap[ind].first == st) {
- return ind;
- }
- }
- return (size_t)-1;
- }
-
- static size_t Hash(const key_type& st) {
- return size_t((st.first >> 2) ^ (st.second >> 4) ^ (st.second << 10));
- }
-
- TArrayHolder<value_type> mMap;
- TVector<bool> mFilled;
-
- // Noncopyable
- GluedStateLookupTable(const GluedStateLookupTable&);
- GluedStateLookupTable& operator = (const GluedStateLookupTable&);
+ size_t Search(const key_type& st) const { // Probes slots one by one from Hash(st) % N, wrapping around; returns the first slot that is unfilled or already holds st.
+ size_t startInd = (Hash(st) % N);
+ for (size_t ind = startInd; ind != (startInd + N - 1) % N; ind = (ind + 1) % N) { // NOTE(review): the loop stops before the slot preceding startInd, so that slot is never examined
+ if (!mFilled[ind] || mMap[ind].first == st) {
+ return ind;
+ }
+ }
+ return (size_t)-1; // no usable slot among the probed ones; find() and insert() do not check for this value
+ }
+
+ static size_t Hash(const key_type& st) { // XOR-mixes shifted copies of both key components; Search() reduces the result modulo N.
+ return size_t((st.first >> 2) ^ (st.second >> 4) ^ (st.second << 10));
+ }
+
+ TArrayHolder<value_type> mMap;
+ TVector<bool> mFilled;
+
+ // Noncopyable
+ GluedStateLookupTable(const GluedStateLookupTable&);
+ GluedStateLookupTable& operator = (const GluedStateLookupTable&);
};
template<class Scanner>
class ScannerGlueCommon { // Common part of a task that combines two scanners; a state is a pair of their internal states.
public:
- typedef Partition< Char, Impl::LettersEquality<Scanner> > LettersTbl;
+ typedef Partition< Char, Impl::LettersEquality<Scanner> > LettersTbl; // Partition of letters into equivalence classes.
- typedef ypair<typename Scanner::InternalState, typename Scanner::InternalState> State;
- ScannerGlueCommon(const Scanner& lhs, const Scanner& rhs, const LettersTbl& letters)
- : m_lhs(lhs)
- , m_rhs(rhs)
- , m_letters(letters)
- {
- // Form a new letters partition
- for (unsigned ch = 0; ch < MaxChar; ++ch)
- if (ch != Epsilon)
- m_letters.Append(ch);
- }
+ typedef ypair<typename Scanner::InternalState, typename Scanner::InternalState> State; // (lhs state, rhs state)
+ ScannerGlueCommon(const Scanner& lhs, const Scanner& rhs, const LettersTbl& letters) // Keeps references to both scanners and a copy of `letters`.
+ : m_lhs(lhs)
+ , m_rhs(rhs)
+ , m_letters(letters)
+ {
+ // Form a new letters partition: append every character except Epsilon.
+ for (unsigned ch = 0; ch < MaxChar; ++ch)
+ if (ch != Epsilon)
+ m_letters.Append(ch);
+ }
- const LettersTbl& Letters() const { return m_letters; }
+ const LettersTbl& Letters() const { return m_letters; } // The partition built in the constructor.
- const Scanner& Lhs() const { return m_lhs; }
- const Scanner& Rhs() const { return m_rhs; }
+ const Scanner& Lhs() const { return m_lhs; } // First source scanner.
+ const Scanner& Rhs() const { return m_rhs; } // Second source scanner.
- State Initial() const { return State(Lhs().m.initial, Rhs().m.initial); }
+ State Initial() const { return State(Lhs().m.initial, Rhs().m.initial); } // Pair of both scanners' initial states.
- State Next(State state, Char letter) const
- {
- Lhs().Next(state.first, letter);
- Rhs().Next(state.second, letter);
- return state;
- }
+ State Next(State state, Char letter) const // Steps each component with its own scanner on `letter` and returns the resulting pair.
+ {
+ Lhs().Next(state.first, letter);
+ Rhs().Next(state.second, letter);
+ return state;
+ }
- bool IsRequired(const State& /*state*/) const { return true; }
+ bool IsRequired(const State& /*state*/) const { return true; } // Every state is reported as required.
- typedef Scanner Result;
- const Scanner& Success() const { return *m_result; }
- Scanner Failure() const { return Scanner(); }
+ typedef Scanner Result;
+ const Scanner& Success() const { return *m_result; } // Dereferences m_result; it must have been set via SetSc() first.
+ Scanner Failure() const { return Scanner(); } // A default-constructed scanner.
protected:
- Scanner& Sc() { return *m_result; }
- void SetSc(THolder<Scanner>&& sc) { m_result = std::move(sc); }
+ Scanner& Sc() { return *m_result; } // Mutable access to the result scanner.
+ void SetSc(THolder<Scanner>&& sc) { m_result = std::move(sc); } // Takes ownership of the result scanner.
private:
- const Scanner& m_lhs;
- const Scanner& m_rhs;
- LettersTbl m_letters;
- THolder<Scanner> m_result;
+ const Scanner& m_lhs; // Held by reference: the source scanner must outlive this object.
+ const Scanner& m_rhs; // Held by reference: the source scanner must outlive this object.
+ LettersTbl m_letters;
+ THolder<Scanner> m_result; // Owned result; empty until SetSc() is called.
};
-}
+}
}
#endif
diff --git a/library/cpp/regex/pire/pire/half_final_fsm.cpp b/library/cpp/regex/pire/pire/half_final_fsm.cpp
index e45d03b9e2c..9ce22eda0e1 100644
--- a/library/cpp/regex/pire/pire/half_final_fsm.cpp
+++ b/library/cpp/regex/pire/pire/half_final_fsm.cpp
@@ -3,335 +3,335 @@
#include "half_final_fsm.h"
namespace Pire {
- size_t HalfFinalFsm::MaxCountDepth = 10;
-
- void HalfFinalFsm::MakeScanner() {
- fsm.Canonize();
- bool allowHalfFinals = AllowHalfFinals();
- if (!allowHalfFinals) {
- MakeHalfFinal();
- return;
- }
- DisconnectFinals(true);
- }
-
- bool HalfFinalFsm::AllowHalfFinals() {
- fsm.Canonize();
- for (size_t state = 0; state < fsm.Size(); ++state) {
- if (fsm.IsFinal(state)) {
- for (const auto& let : fsm.Letters()) {
- bool hasFinalTransition = fsm.Destinations(state, let.first).empty();
- for (const auto& to : fsm.Destinations(state, let.first)) {
- if (fsm.IsFinal(to)) {
- hasFinalTransition = true;
- }
- }
- if (!hasFinalTransition) {
- return false;
- }
- }
- }
- }
- return true;
- }
-
- void HalfFinalFsm::MakeHalfFinal() {
- fsm.Unsparse();
- const auto newFinal = fsm.Size();
- fsm.Resize(newFinal + 1);
- for (unsigned letter = 0; letter < MaxChar; ++letter) {
- if (letter != Epsilon) {
- fsm.Connect(newFinal, newFinal, letter);
- }
- }
- for (size_t state = 0; state < fsm.Size(); ++state) {
- bool hasFinalTransitions = false;
- for (const auto& to : fsm.Destinations(state, EndMark)) {
- if (fsm.IsFinal(to)) {
- hasFinalTransitions = true;
- break;
- }
- }
- if (hasFinalTransitions) {
- Fsm::StatesSet destinations = fsm.Destinations(state, EndMark);
- for (const auto& to : destinations) {
- fsm.Disconnect(state, to, EndMark);
- }
- fsm.Connect(state, newFinal, EndMark);
- }
- }
- fsm.ClearFinal();
- fsm.SetFinal(newFinal, true);
- fsm.Sparse();
- }
-
- void HalfFinalFsm::DisconnectFinals(bool allowIntersects) {
- fsm.Unsparse();
- for (size_t state = 0; state != fsm.Size(); ++state) {
- fsm.SetTag(state, 0);
- if (fsm.IsFinal(state)) {
- for (unsigned letter = 0; letter < MaxChar; ++letter) {
- Fsm::StatesSet destinations = fsm.Destinations(state, letter);
- for (const auto& to : destinations) {
- fsm.Disconnect(state, to, letter);
- }
- }
- if (!allowIntersects) {
- fsm.Connect(state, fsm.Initial());
- }
- }
- }
- if (allowIntersects) {
- fsm.PrependAnything();
- }
- fsm.Sparse();
- fsm.SetIsDetermined(false);
- fsm.Canonize();
- }
-
- void HalfFinalFsm::MakeNonGreedyCounter(bool allowIntersects /* = true */, bool simplify /* = true */) {
- fsm.Canonize();
- fsm.PrependAnything();
- fsm.RemoveDeadEnds();
- fsm.Canonize();
- if (!allowIntersects || simplify) {
- DisconnectFinals(allowIntersects);
- }
- }
-
- void HalfFinalFsm::MakeGreedyCounter(bool simplify /* = true */) {
- fsm.Canonize();
- fsm.RemoveDeadEnds();
- size_t determineFactor = MaxCountDepth;
- if (simplify) {
- determineFactor = 1;
- }
- Determine(determineFactor);
- if (simplify) {
- fsm.Minimize();
- }
- fsm.RemoveDeadEnds();
- }
-
- namespace Impl {
-
- class HalfFinalDetermineState {
- public:
- HalfFinalDetermineState(const Fsm& fsm, bool initial = false, size_t lastFinalCount = 0)
- : mFsm(fsm)
- , ToAdd(0)
- , LastFinalCount(lastFinalCount)
- {
- if (initial) {
- FinishBuild(1);
- }
- }
-
- HalfFinalDetermineState Next(Char letter, size_t maxCount) const {
- HalfFinalDetermineState next(mFsm, false, LastFinalCount);
- for (const auto& state : States) {
- for (const auto& nextState : mFsm.Destinations(state.State, letter)) {
- next.AddState(nextState, state.Count, state.ReachedFinal);
- }
- }
- next.FinishBuild(maxCount, States.back().Count);
- if (letter == EndMark) {
- next.ToAdd += next.LastFinalCount;
- next.LastFinalCount = 0;
- next.States.clear();
- next.AddState(mFsm.Initial(), 0, false, true);
- return next;
- }
- return next;
- }
-
- void CopyData(Fsm& newFsm, size_t index) const {
- if (ToAdd) {
- newFsm.SetFinal(index, true);
- newFsm.SetTag(index, ToAdd);
- }
- }
-
- bool operator<(const HalfFinalDetermineState& otherState) const {
- if (ToAdd != otherState.ToAdd) {
- return ToAdd < otherState.ToAdd;
- }
- if (LastFinalCount != otherState.LastFinalCount) {
- return LastFinalCount < otherState.LastFinalCount;
- }
- return States < otherState.States;
- }
-
- struct StateHolder {
- size_t State;
- size_t Count;
- bool ReachedFinal;
-
- bool operator<(const StateHolder& other) const {
- if (State != other.State) {
- return State < other.State;
- }
- if (Count != other.Count) {
- return Count < other.Count;
- }
- return ReachedFinal < other.ReachedFinal;
- }
- };
-
- private:
- const Fsm& mFsm;
- TVector<StateHolder> States;
- size_t ToAdd;
- size_t LastFinalCount;
-
- void AddState(size_t state, size_t count, bool reachedFinal, bool force = false) {
- size_t newLastFinalCount = LastFinalCount;
- if (mFsm.IsFinal(state) && !reachedFinal) {
- ++count;
- reachedFinal = true;
- newLastFinalCount = count;
- }
- for (const auto& addedState : States) {
- if (addedState.State == state) {
- return;
- }
- }
- if (States.empty() || !mFsm.IsFinal(States.back().State) || force) {
- States.push_back({state, count, reachedFinal});
- LastFinalCount = newLastFinalCount;
- }
- }
-
- void FinishBuild(size_t maxCount, size_t lastCount = 0) {
- if (!States.empty() && mFsm.IsFinal(States.back().State)) {
- lastCount = States.back().Count;
- }
- AddState(mFsm.Initial(), lastCount, false, true);
- LastFinalCount = std::min(LastFinalCount, maxCount);
- size_t minCount = States[0].Count;
- for (auto& state : States) {
- if (state.Count > maxCount) {
- state.Count = maxCount;
- }
- minCount = std::min(state.Count, minCount);
- }
- ToAdd = minCount;
- for (auto& state : States) {
- state.Count -= minCount;
- }
- LastFinalCount -= minCount;
- }
- };
-
- class HalfFinalDetermineTask {
- public:
- typedef HalfFinalDetermineState State;
- typedef Fsm::LettersTbl LettersTbl;
- typedef TMap<State, size_t> InvStates;
-
- HalfFinalDetermineTask(const Fsm& fsm, size_t maxCount)
- : mFsm(fsm)
- , MaxCount(maxCount)
- {
- size_t oldSize = mFsm.Size();
- mFsm.Import(fsm);
- mFsm.Unsparse();
- for (size_t state = 0; state < mFsm.Size(); ++state) {
- for (Char letter = 0; letter < MaxChar; ++letter) {
- Fsm::StatesSet destinations = mFsm.Destinations(state, letter);
- for (const auto destination : destinations) {
- size_t newDestination = destination % oldSize;
- if (letter == EndMark) {
- newDestination += oldSize;
- }
- if (destination != newDestination) {
- mFsm.Disconnect(state, destination, letter);
- mFsm.Connect(state, newDestination, letter);
- }
- }
- }
- if (mFsm.Destinations(state, EndMark).size() == 0) {
- mFsm.Connect(state, oldSize + mFsm.Initial(), EndMark);
- }
- }
- mFsm.Sparse();
- }
-
- const LettersTbl& Letters() const { return mFsm.Letters(); }
-
- State Initial() const {
- return State(mFsm, true);
- }
-
- State Next(const State& state, Char letter) const {
- return state.Next(letter, MaxCount);
- }
-
- bool IsRequired(const State& /*state*/) const { return true; }
-
- void AcceptStates(const TVector<State>& newStates) {
- mNewFsm.Resize(newStates.size());
- mNewFsm.SetInitial(0);
- mNewFsm.SetIsDetermined(true);
- mNewFsm.letters = Letters();
- mNewFsm.ClearFinal();
- for (size_t i = 0; i < newStates.size(); i++) {
- newStates[i].CopyData(mNewFsm, i);
- }
- }
-
- void Connect(size_t from, size_t to, Char letter) {
- Y_ASSERT(mNewFsm.Destinations(from, letter).size() == 0);
- mNewFsm.Connect(from, to, letter);
- }
-
- typedef bool Result;
-
- Result Success() { return true; }
-
- Result Failure() { return false; }
-
- Fsm& Output() { return mNewFsm; }
-
- void SetMaxCount(size_t maxCount) {
- MaxCount = maxCount;
- }
-
- private:
- Fsm mFsm;
- size_t MaxCount;
- Fsm mNewFsm;
- };
- }
-
- void HalfFinalFsm::Determine(size_t depth) {
- static const unsigned MaxSize = 200000;
-
- Impl::HalfFinalDetermineTask task(fsm, depth);
- if (!Pire::Impl::Determine(task, MaxSize)) {
- task.SetMaxCount(1);
- Pire::Impl::Determine(task, MaxSize);
- }
-
- task.Output().Swap(fsm);
- }
-
- size_t HalfFinalFsm::GetCount(size_t state) const {
- if (fsm.IsFinal(state)) {
- if (fsm.Tag(state)) {
- return fsm.Tag(state);
- } else {
- return 1;
- }
- }
- return 0;
- }
-
- size_t HalfFinalFsm::GetTotalCount() const {
- size_t count = 0;
- for (size_t state = 0; state < fsm.Size(); ++state) {
- count += GetCount(state);
- }
- return count;
- }
+ size_t HalfFinalFsm::MaxCountDepth = 10; // Count depth handed to Determine() by MakeGreedyCounter() when simplify is false.
+
+ // Prepares the automaton for scanning: if AllowHalfFinals() holds, the
+ // final states are disconnected; otherwise MakeHalfFinal() is applied.
+ void HalfFinalFsm::MakeScanner() {
+ fsm.Canonize();
+ if (AllowHalfFinals()) {
+ DisconnectFinals(true);
+ } else {
+ MakeHalfFinal();
+ }
+ }
+
+ // Returns true when, for every final state and every letter class, the
+ // state either has no transition on that letter or has at least one
+ // transition leading to a final state.
+ bool HalfFinalFsm::AllowHalfFinals() {
+ fsm.Canonize();
+ for (size_t from = 0; from < fsm.Size(); ++from) {
+ if (!fsm.IsFinal(from)) {
+ continue;
+ }
+ for (const auto& letterClass : fsm.Letters()) {
+ const auto& targets = fsm.Destinations(from, letterClass.first);
+ // No transition at all counts as acceptable.
+ bool staysFinal = targets.empty();
+ for (const auto& target : targets) {
+ staysFinal = staysFinal || fsm.IsFinal(target);
+ }
+ if (!staysFinal) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ // Adds one extra state that loops to itself on every non-Epsilon letter,
+ // reroutes the EndMark transitions of every state that could reach a final
+ // state on EndMark to that extra state, and makes it the only final state.
+ void HalfFinalFsm::MakeHalfFinal() {
+ fsm.Unsparse();
+ const auto sink = fsm.Size();
+ fsm.Resize(sink + 1);
+ for (unsigned ch = 0; ch < MaxChar; ++ch) {
+ if (ch == Epsilon) {
+ continue;
+ }
+ fsm.Connect(sink, sink, ch);
+ }
+ for (size_t from = 0; from < fsm.Size(); ++from) {
+ bool reachesFinal = false;
+ for (const auto& target : fsm.Destinations(from, EndMark)) {
+ if (fsm.IsFinal(target)) {
+ reachesFinal = true;
+ break;
+ }
+ }
+ if (!reachesFinal) {
+ continue;
+ }
+ // Work on a copy of the destination set while disconnecting.
+ const Fsm::StatesSet oldTargets = fsm.Destinations(from, EndMark);
+ for (const auto& target : oldTargets) {
+ fsm.Disconnect(from, target, EndMark);
+ }
+ fsm.Connect(from, sink, EndMark);
+ }
+ fsm.ClearFinal();
+ fsm.SetFinal(sink, true);
+ fsm.Sparse();
+ }
+
+ // Clears every state's tag and removes all outgoing transitions of final
+ // states. With allowIntersects == false each final state is reconnected to
+ // the initial state; otherwise PrependAnything() is applied afterwards.
+ void HalfFinalFsm::DisconnectFinals(bool allowIntersects) {
+ fsm.Unsparse();
+ for (size_t st = 0; st != fsm.Size(); ++st) {
+ fsm.SetTag(st, 0);
+ if (!fsm.IsFinal(st)) {
+ continue;
+ }
+ for (unsigned ch = 0; ch < MaxChar; ++ch) {
+ // Work on a copy of the destination set while disconnecting.
+ const Fsm::StatesSet outgoing = fsm.Destinations(st, ch);
+ for (const auto& target : outgoing) {
+ fsm.Disconnect(st, target, ch);
+ }
+ }
+ if (!allowIntersects) {
+ fsm.Connect(st, fsm.Initial());
+ }
+ }
+ if (allowIntersects) {
+ fsm.PrependAnything();
+ }
+ fsm.Sparse();
+ fsm.SetIsDetermined(false);
+ fsm.Canonize();
+ }
+
+ // Builds the non-greedy counter automaton. Final states are disconnected
+ // unless intersections are allowed and no simplification was requested.
+ void HalfFinalFsm::MakeNonGreedyCounter(bool allowIntersects /* = true */, bool simplify /* = true */) {
+ fsm.Canonize();
+ fsm.PrependAnything();
+ fsm.RemoveDeadEnds();
+ fsm.Canonize();
+ const bool keepFinalsConnected = allowIntersects && !simplify;
+ if (keepFinalsConnected) {
+ return;
+ }
+ DisconnectFinals(allowIntersects);
+ }
+
+ // Builds the greedy counter automaton. With simplify == true the count
+ // depth is 1 and the result is minimized; otherwise MaxCountDepth is used.
+ void HalfFinalFsm::MakeGreedyCounter(bool simplify /* = true */) {
+ fsm.Canonize();
+ fsm.RemoveDeadEnds();
+ const size_t determineFactor = simplify ? 1 : MaxCountDepth;
+ Determine(determineFactor);
+ if (simplify) {
+ fsm.Minimize();
+ }
+ fsm.RemoveDeadEnds();
+ }
+
+ namespace Impl {
+
+ class HalfFinalDetermineState { // One state of the determinized automaton: a list of source-FSM states, each with its own count.
+ public:
+ HalfFinalDetermineState(const Fsm& fsm, bool initial = false, size_t lastFinalCount = 0) // With initial == true the state is completed right away via FinishBuild(1).
+ : mFsm(fsm)
+ , ToAdd(0)
+ , LastFinalCount(lastFinalCount)
+ {
+ if (initial) {
+ FinishBuild(1);
+ }
+ }
+ // Returns the state reached from this one on `letter`; `maxCount` caps the stored counts.
+ HalfFinalDetermineState Next(Char letter, size_t maxCount) const {
+ HalfFinalDetermineState next(mFsm, false, LastFinalCount);
+ for (const auto& state : States) {
+ for (const auto& nextState : mFsm.Destinations(state.State, letter)) {
+ next.AddState(nextState, state.Count, state.ReachedFinal);
+ }
+ }
+ next.FinishBuild(maxCount, States.back().Count); // NOTE(review): States.back() assumes this state's list is non-empty -- confirm.
+ if (letter == EndMark) { // On EndMark: fold the pending final count into ToAdd and restart from the initial state only.
+ next.ToAdd += next.LastFinalCount;
+ next.LastFinalCount = 0;
+ next.States.clear();
+ next.AddState(mFsm.Initial(), 0, false, true);
+ return next; // Same value as the return below.
+ }
+ return next;
+ }
+ // Marks state `index` of newFsm as final and tags it with ToAdd, but only when ToAdd is non-zero.
+ void CopyData(Fsm& newFsm, size_t index) const {
+ if (ToAdd) {
+ newFsm.SetFinal(index, true);
+ newFsm.SetTag(index, ToAdd);
+ }
+ }
+ // Ordering by ToAdd, then LastFinalCount, then the state list.
+ bool operator<(const HalfFinalDetermineState& otherState) const {
+ if (ToAdd != otherState.ToAdd) {
+ return ToAdd < otherState.ToAdd;
+ }
+ if (LastFinalCount != otherState.LastFinalCount) {
+ return LastFinalCount < otherState.LastFinalCount;
+ }
+ return States < otherState.States;
+ }
+ // One tracked source state; ordered by State, then Count, then ReachedFinal.
+ struct StateHolder {
+ size_t State; // index of the state in the source FSM
+ size_t Count; // count carried by this state, relative to ToAdd after FinishBuild()
+ bool ReachedFinal; // set once a final state has been counted on this path (see AddState)
+
+ bool operator<(const StateHolder& other) const {
+ if (State != other.State) {
+ return State < other.State;
+ }
+ if (Count != other.Count) {
+ return Count < other.Count;
+ }
+ return ReachedFinal < other.ReachedFinal;
+ }
+ };
+
+ private:
+ const Fsm& mFsm; // source automaton; held by reference
+ TVector<StateHolder> States;
+ size_t ToAdd; // minimum count factored out of all states by FinishBuild(); becomes the tag in CopyData()
+ size_t LastFinalCount;
+ // Appends `state` unless it is already listed; a state is only appended while the last listed state is non-final, or when `force` is set.
+ void AddState(size_t state, size_t count, bool reachedFinal, bool force = false) {
+ size_t newLastFinalCount = LastFinalCount;
+ if (mFsm.IsFinal(state) && !reachedFinal) { // first final state on this path: bump the count
+ ++count;
+ reachedFinal = true;
+ newLastFinalCount = count;
+ }
+ for (const auto& addedState : States) {
+ if (addedState.State == state) {
+ return; // already listed; the first occurrence wins
+ }
+ }
+ if (States.empty() || !mFsm.IsFinal(States.back().State) || force) {
+ States.push_back({state, count, reachedFinal});
+ LastFinalCount = newLastFinalCount; // only updated when the state was actually appended
+ }
+ }
+ // Force-adds the initial state, clamps all counts to maxCount and factors their minimum out into ToAdd.
+ void FinishBuild(size_t maxCount, size_t lastCount = 0) {
+ if (!States.empty() && mFsm.IsFinal(States.back().State)) {
+ lastCount = States.back().Count; // a trailing final state overrides the caller-supplied count
+ }
+ AddState(mFsm.Initial(), lastCount, false, true);
+ LastFinalCount = std::min(LastFinalCount, maxCount);
+ size_t minCount = States[0].Count;
+ for (auto& state : States) {
+ if (state.Count > maxCount) {
+ state.Count = maxCount;
+ }
+ minCount = std::min(state.Count, minCount);
+ }
+ ToAdd = minCount;
+ for (auto& state : States) {
+ state.Count -= minCount;
+ }
+ LastFinalCount -= minCount; // NOTE(review): unsigned subtraction; assumes LastFinalCount >= minCount -- confirm.
+ }
+ };
+
+ class HalfFinalDetermineTask { // Task object passed to Pire::Impl::Determine() by HalfFinalFsm::Determine().
+ public:
+ typedef HalfFinalDetermineState State;
+ typedef Fsm::LettersTbl LettersTbl;
+ typedef TMap<State, size_t> InvStates; // maps a state to a size_t; relies on State::operator<
+ // Copies `fsm`, imports it a second time and rewires transitions between the two halves (see below).
+ HalfFinalDetermineTask(const Fsm& fsm, size_t maxCount)
+ : mFsm(fsm)
+ , MaxCount(maxCount)
+ {
+ size_t oldSize = mFsm.Size(); // size before the import
+ mFsm.Import(fsm); // presumably appends a second copy of the states -- TODO confirm against Fsm::Import
+ mFsm.Unsparse();
+ for (size_t state = 0; state < mFsm.Size(); ++state) {
+ for (Char letter = 0; letter < MaxChar; ++letter) {
+ Fsm::StatesSet destinations = mFsm.Destinations(state, letter); // copy: the transitions are edited below
+ for (const auto destination : destinations) {
+ size_t newDestination = destination % oldSize; // index within [0, oldSize)
+ if (letter == EndMark) {
+ newDestination += oldSize; // EndMark transitions lead into [oldSize, 2 * oldSize)
+ }
+ if (destination != newDestination) {
+ mFsm.Disconnect(state, destination, letter);
+ mFsm.Connect(state, newDestination, letter);
+ }
+ }
+ }
+ if (mFsm.Destinations(state, EndMark).size() == 0) { // give every state an EndMark transition
+ mFsm.Connect(state, oldSize + mFsm.Initial(), EndMark);
+ }
+ }
+ mFsm.Sparse();
+ }
+
+ const LettersTbl& Letters() const { return mFsm.Letters(); }
+
+ State Initial() const { // a state built with initial == true
+ return State(mFsm, true);
+ }
+
+ State Next(const State& state, Char letter) const { // delegates to State::Next with the current MaxCount
+ return state.Next(letter, MaxCount);
+ }
+
+ bool IsRequired(const State& /*state*/) const { return true; } // every state is reported as required
+ // Sizes the output FSM to newStates.size(), resets its finals and lets each state copy its data in.
+ void AcceptStates(const TVector<State>& newStates) {
+ mNewFsm.Resize(newStates.size());
+ mNewFsm.SetInitial(0);
+ mNewFsm.SetIsDetermined(true);
+ mNewFsm.letters = Letters();
+ mNewFsm.ClearFinal();
+ for (size_t i = 0; i < newStates.size(); i++) {
+ newStates[i].CopyData(mNewFsm, i);
+ }
+ }
+ // Adds one transition to the output FSM; asserts that `from` has none on `letter` yet.
+ void Connect(size_t from, size_t to, Char letter) {
+ Y_ASSERT(mNewFsm.Destinations(from, letter).size() == 0);
+ mNewFsm.Connect(from, to, letter);
+ }
+
+ typedef bool Result;
+
+ Result Success() { return true; }
+
+ Result Failure() { return false; }
+
+ Fsm& Output() { return mNewFsm; } // the FSM filled in by AcceptStates() and Connect()
+
+ void SetMaxCount(size_t maxCount) { // changes the cap used by subsequent Next() calls
+ MaxCount = maxCount;
+ }
+
+ private:
+ Fsm mFsm; // rewired working copy of the source automaton
+ size_t MaxCount;
+ Fsm mNewFsm; // output automaton
+ };
+ }
+
+ // Determinizes the automaton with the given count depth and replaces fsm
+ // with the task's output. If the first attempt fails, it is repeated once
+ // with a count depth of 1.
+ void HalfFinalFsm::Determine(size_t depth) {
+ static const unsigned StateLimit = 200000;
+
+ Impl::HalfFinalDetermineTask determineTask(fsm, depth);
+ const bool built = Pire::Impl::Determine(determineTask, StateLimit);
+ if (!built) {
+ // The outcome of the retry is not checked.
+ determineTask.SetMaxCount(1);
+ Pire::Impl::Determine(determineTask, StateLimit);
+ }
+
+ determineTask.Output().Swap(fsm);
+ }
+
+ // Returns 0 for a non-final state; for a final state returns its tag,
+ // or 1 when the tag is zero.
+ size_t HalfFinalFsm::GetCount(size_t state) const {
+ if (!fsm.IsFinal(state)) {
+ return 0;
+ }
+ const auto tag = fsm.Tag(state);
+ return tag ? static_cast<size_t>(tag) : 1;
+ }
+
+ // Sums GetCount() over all states of the automaton.
+ size_t HalfFinalFsm::GetTotalCount() const {
+ size_t total = 0;
+ const size_t stateCount = fsm.Size();
+ for (size_t idx = 0; idx != stateCount; ++idx) {
+ total += GetCount(idx);
+ }
+ return total;
+ }
}
diff --git a/library/cpp/regex/pire/pire/half_final_fsm.h b/library/cpp/regex/pire/pire/half_final_fsm.h
index 83828f8bb37..1742d999895 100644
--- a/library/cpp/regex/pire/pire/half_final_fsm.h
+++ b/library/cpp/regex/pire/pire/half_final_fsm.h
@@ -2,47 +2,47 @@
#include "defs.h"
namespace Pire {
- class HalfFinalFsm {
- public:
- HalfFinalFsm(const Fsm& sourceFsm) : fsm(sourceFsm) {}
+ class HalfFinalFsm { // Wraps a private copy of an Fsm and transforms it into scanner/counter form.
+ public:
+ HalfFinalFsm(const Fsm& sourceFsm) : fsm(sourceFsm) {} // Copies sourceFsm; the original is left untouched.
- void MakeScanner();
+ void MakeScanner(); // Canonizes, then either disconnects the finals or introduces a single new final state.
- /// Non greedy counter without allowed intersects works correctly on all regexps
- /// Non simplified non greedy counter with allowed intersects counts number of positions in string,
- /// on which ends at least one substring that matches regexp
- /// Simplified non greedy counter with allowed intersects does not always work correctly,
- /// but has fewer number of states and more regexps can be glued into single scanner
- void MakeNonGreedyCounter(bool allowIntersects = true, bool simplify = true);
+ /// A non-greedy counter without allowed intersections works correctly on all regexps.
+ /// A non-simplified non-greedy counter with allowed intersections counts the positions in the string
+ /// at which at least one substring matching the regexp ends.
+ /// A simplified non-greedy counter with allowed intersections does not always work correctly,
+ /// but has fewer states, so more regexps can be glued into a single scanner.
+ void MakeNonGreedyCounter(bool allowIntersects = true, bool simplify = true);
- // Simplified counter does not work correctly on all regexps, but has less number of states
- // and allows to glue larger number of scanners into one within the same size limit
- void MakeGreedyCounter(bool simplify = true);
+ // A simplified counter does not work correctly on all regexps, but has fewer states
+ // and allows a larger number of scanners to be glued into one within the same size limit.
+ void MakeGreedyCounter(bool simplify = true);
- const Fsm& GetFsm() const { return fsm; }
+ const Fsm& GetFsm() const { return fsm; } // Read-only access to the (possibly transformed) automaton.
- template<class Scanner>
- Scanner Compile() const;
+ template<class Scanner>
+ Scanner Compile() const; // Constructs a Scanner from this object.
- size_t GetCount(size_t state) const;
+ size_t GetCount(size_t state) const; // 0 for a non-final state; otherwise the state's tag, or 1 if the tag is zero.
- size_t GetTotalCount() const;
+ size_t GetTotalCount() const; // Sum of GetCount() over all states.
- static size_t MaxCountDepth;
- private:
- Fsm fsm;
+ static size_t MaxCountDepth; // Default depth argument of Determine().
+ private:
+ Fsm fsm;
- bool AllowHalfFinals();
+ bool AllowHalfFinals(); // Decides which branch MakeScanner() takes.
- void MakeHalfFinal();
+ void MakeHalfFinal(); // Introduces one new state and makes it the only final state.
- void DisconnectFinals(bool allowIntersects);
+ void DisconnectFinals(bool allowIntersects); // Removes all outgoing transitions of final states.
- void Determine(size_t depth = MaxCountDepth);
- };
+ void Determine(size_t depth = MaxCountDepth); // Replaces fsm with its determinized form.
+ };
- template<class Scanner>
- Scanner HalfFinalFsm::Compile() const {
- auto scanner = Scanner(*this);
- }
+ /// Builds a Scanner from this automaton and returns it.
+ /// @tparam Scanner a type constructible from a const HalfFinalFsm&.
+ template<class Scanner>
+ Scanner HalfFinalFsm::Compile() const {
+ // The scanner must actually be returned: flowing off the end of a
+ // non-void function is undefined behaviour.
+ return Scanner(*this);
+ }
}
diff --git a/library/cpp/regex/pire/pire/minimize.h b/library/cpp/regex/pire/pire/minimize.h
index d58c5ce79ea..38f65985c78 100644
--- a/library/cpp/regex/pire/pire/minimize.h
+++ b/library/cpp/regex/pire/pire/minimize.h
@@ -5,149 +5,149 @@
#include "partition.h"
namespace Pire {
- namespace Impl {
+ namespace Impl {
- /**
- * An interface of a minimization task.
- * You don't have to derive from this class; it is just a start point template.
- */
- class MinimizeTask {
- private:
- struct ImplementationSpecific1;
+ /**
+ * An interface of a minimization task.
+ * You don't have to derive from this class; it is just a start point template.
+ */
+ class MinimizeTask {
+ private:
+ struct ImplementationSpecific1;
- public:
- // States must be represented by size_t.
+ public:
+ // States must be represented by size_t.
- /// States must be initially divided into some equivalence classes.
- /// If states are in the same equivalence class, they may be merged without loosing state specific info.
- /// Equivalence classes must have indexes from 0 to (Classes - 1).
- /// The algorithm will modify equivalent classes and in the end
- /// all states in the same equivalent class can be merged into one state
- TVector<size_t>& GetStateClass() { return StateClass; }
-
- /// Returns number of equivalent classes
- size_t& GetClassesNumber() { return Classes; }
-
- /// Should return number of letter classes
- size_t LettersCount() const;
-
- /// Should return true if FSM is determined.
- bool IsDetermined() const;
-
- /// Should return number of states.
- size_t Size() const;
-
- /// Should calculate vector of previous states by, given the current state and incoming letter class index.
- const TVector<size_t>& Previous(size_t state, size_t letter) const;
-
- /// Called when states equivalent classes are formed, and written in StateClass.
- void AcceptStates();
-
- typedef bool Result;
-
- Result Success() { return true; }
-
- Result Failure() { return false; }
-
- private:
- TVector<size_t> StateClass;
-
- size_t Classes;
- };
-
- // Minimizes Determined FSM using Hopcroft algorithm, works in O(Size * log(Size) * MaxChar) time,
- // requires O(Size * MaxChar * sizof(size_t)) memory.
- template<class Task>
- typename Task::Result Minimize(Task& task)
- {
- // Minimization algorithm is only applicable to a determined FSM.
- if (!task.IsDetermined()) {
- return task.Failure();
- }
-
- typedef ypair<size_t, size_t> ClassLetter;
-
- TVector<ybitset<MaxChar>> queuedClasses(task.Size());
-
- TDeque<ClassLetter> classesToProcess;
-
- TVector<TVector<size_t>> classStates(task.Size());
-
- TVector<size_t>& stateClass = task.GetStateClass();
-
- for (size_t state = 0; state < task.Size(); ++state) {
- classStates[stateClass[state]].push_back(state);
- }
-
- for (size_t classIndex = 0; classIndex < task.GetClassesNumber(); ++classIndex) {
- for (size_t letter = 0; letter < task.LettersCount(); ++letter) {
- classesToProcess.push_back(ymake_pair(classIndex, letter));
- queuedClasses[classIndex][letter] = 1;
- }
- }
-
- TVector<size_t> classChange(task.Size());
- TVector<TVector<size_t>> removedStates(task.Size());
-
- while (classesToProcess.size()) {
- const auto currentClass = classesToProcess.front().first;
- const auto currentLetter = classesToProcess.front().second;
- classesToProcess.pop_front();
- queuedClasses[currentClass][currentLetter] = 0;
- TVector<size_t> splittedClasses;
-
- for (const auto& classState : classStates[currentClass]) {
- for (const auto& state: task.Previous(classState, currentLetter)) {
- if (classChange[stateClass[state]] != task.GetClassesNumber()) {
- classChange[stateClass[state]] = task.GetClassesNumber();
- splittedClasses.push_back(stateClass[state]);
- }
- removedStates[stateClass[state]].push_back(state);
- }
- }
-
-
- for (const auto& splittedClass : splittedClasses) {
- if (removedStates[splittedClass].size() == classStates[splittedClass].size()) {
- classChange[splittedClass] = 0;
- removedStates[splittedClass].clear();
- continue;
- }
-
- const auto newClass = task.GetClassesNumber()++;
- classChange[splittedClass] = newClass;
- std::swap(classStates[newClass], removedStates[splittedClass]);
- for (const auto& state : classStates[newClass]) {
- stateClass[state] = newClass;
- }
-
- auto iter = classStates[splittedClass].begin();
- for (const auto state : classStates[splittedClass]) {
- if (stateClass[state] == splittedClass) {
- *iter = state;
- ++iter;
- }
- }
- classStates[splittedClass].erase(iter, classStates[splittedClass].end());
-
- for (size_t letter = 0; letter < task.LettersCount(); ++letter) {
- if (queuedClasses[splittedClass][letter]
- || classStates[splittedClass].size() > classStates[newClass].size()) {
-
- queuedClasses[newClass][letter] = 1;
- classesToProcess.push_back(ymake_pair(newClass, letter));
- } else {
- queuedClasses[splittedClass][letter] = 1;
- classesToProcess.push_back(ymake_pair(splittedClass, letter));
- }
- }
- }
- }
-
- task.AcceptStates();
- return task.Success();
- }
- }
+ /// States must be initially divided into some equivalence classes.
+ /// If states are in the same equivalence class, they may be merged without losing state-specific info.
+ /// Equivalence classes must have indexes from 0 to (Classes - 1).
+ /// The algorithm will modify the equivalence classes, and in the end
+ /// all states in the same equivalence class can be merged into one state.
+ TVector<size_t>& GetStateClass() { return StateClass; }
+
+ /// Returns the number of equivalence classes (by reference: Minimize() increments it when it splits a class).
+ size_t& GetClassesNumber() { return Classes; }
+
+ /// Should return number of letter classes
+ size_t LettersCount() const;
+
+ /// Should return true if FSM is determined.
+ bool IsDetermined() const;
+
+ /// Should return number of states.
+ size_t Size() const;
+
+ /// Should return the vector of previous states for the given state and incoming letter class index.
+ const TVector<size_t>& Previous(size_t state, size_t letter) const;
+
+ /// Called once the state equivalence classes are formed and written to StateClass.
+ void AcceptStates();
+
+ typedef bool Result;
+
+ Result Success() { return true; }
+
+ Result Failure() { return false; }
+
+ private:
+ TVector<size_t> StateClass; // StateClass[state] is the index of the class the state belongs to
+
+ size_t Classes; // current number of equivalence classes
+ };
+
+ // Minimizes a determined FSM using the Hopcroft algorithm; works in O(Size * log(Size) * MaxChar) time,
+ // requires O(Size * MaxChar * sizeof(size_t)) memory.
+ template<class Task>
+ typename Task::Result Minimize(Task& task)
+ {
+ // Minimization algorithm is only applicable to a determined FSM.
+ if (!task.IsDetermined()) {
+ return task.Failure();
+ }
+
+ typedef ypair<size_t, size_t> ClassLetter; // work item: (class index, letter class index)
+ // queuedClasses[c][l] is set while work item (c, l) sits in classesToProcess.
+ TVector<ybitset<MaxChar>> queuedClasses(task.Size()); // NOTE(review): assumes task.LettersCount() <= MaxChar -- confirm.
+
+ TDeque<ClassLetter> classesToProcess;
+ // classStates[c] lists the states currently assigned to class c.
+ TVector<TVector<size_t>> classStates(task.Size());
+
+ TVector<size_t>& stateClass = task.GetStateClass();
+
+ for (size_t state = 0; state < task.Size(); ++state) {
+ classStates[stateClass[state]].push_back(state);
+ }
+ // Initially every (class, letter) pair is queued.
+ for (size_t classIndex = 0; classIndex < task.GetClassesNumber(); ++classIndex) {
+ for (size_t letter = 0; letter < task.LettersCount(); ++letter) {
+ classesToProcess.push_back(ymake_pair(classIndex, letter));
+ queuedClasses[classIndex][letter] = 1;
+ }
+ }
+ // classChange[c]: per-class marker used to detect the first time class c is touched in a round.
+ TVector<size_t> classChange(task.Size());
+ TVector<TVector<size_t>> removedStates(task.Size()); // removedStates[c]: states of c that lead into the current class on the current letter
+
+ while (classesToProcess.size()) {
+ const auto currentClass = classesToProcess.front().first;
+ const auto currentLetter = classesToProcess.front().second;
+ classesToProcess.pop_front();
+ queuedClasses[currentClass][currentLetter] = 0;
+ TVector<size_t> splittedClasses; // classes with at least one state collected in this round
+ // Collect, per class, the predecessors of currentClass on currentLetter.
+ for (const auto& classState : classStates[currentClass]) {
+ for (const auto& state: task.Previous(classState, currentLetter)) {
+ if (classChange[stateClass[state]] != task.GetClassesNumber()) { // first hit of this class in this round
+ classChange[stateClass[state]] = task.GetClassesNumber();
+ splittedClasses.push_back(stateClass[state]);
+ }
+ removedStates[stateClass[state]].push_back(state);
+ }
+ }
+
+
+ for (const auto& splittedClass : splittedClasses) {
+ if (removedStates[splittedClass].size() == classStates[splittedClass].size()) { // whole class was hit: nothing to split
+ classChange[splittedClass] = 0;
+ removedStates[splittedClass].clear();
+ continue;
+ }
+ // Move the collected states into a freshly numbered class.
+ const auto newClass = task.GetClassesNumber()++;
+ classChange[splittedClass] = newClass;
+ std::swap(classStates[newClass], removedStates[splittedClass]);
+ for (const auto& state : classStates[newClass]) {
+ stateClass[state] = newClass;
+ }
+ // Compact the old class in place, keeping only the states that still belong to it.
+ auto iter = classStates[splittedClass].begin();
+ for (const auto state : classStates[splittedClass]) {
+ if (stateClass[state] == splittedClass) {
+ *iter = state;
+ ++iter;
+ }
+ }
+ classStates[splittedClass].erase(iter, classStates[splittedClass].end());
+ // If (splittedClass, letter) is already queued, or the old class is the larger part, queue the new class; otherwise queue the old one.
+ for (size_t letter = 0; letter < task.LettersCount(); ++letter) {
+ if (queuedClasses[splittedClass][letter]
+ || classStates[splittedClass].size() > classStates[newClass].size()) {
+
+ queuedClasses[newClass][letter] = 1;
+ classesToProcess.push_back(ymake_pair(newClass, letter));
+ } else {
+ queuedClasses[splittedClass][letter] = 1;
+ classesToProcess.push_back(ymake_pair(splittedClass, letter));
+ }
+ }
+ }
+ }
+ // The final classes are now in task.GetStateClass(); let the task consume them.
+ task.AcceptStates();
+ return task.Success();
+ }
+ }
}
#endif
diff --git a/library/cpp/regex/pire/pire/partition.h b/library/cpp/regex/pire/pire/partition.h
index ae8ae1cc8c6..b0585219989 100644
--- a/library/cpp/regex/pire/pire/partition.h
+++ b/library/cpp/regex/pire/pire/partition.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -37,154 +37,154 @@ namespace Pire {
template<class T, class Eq>
class Partition {
private:
- typedef TMap< T, ypair< size_t, TVector<T> > > Set;
+ typedef TMap< T, ypair< size_t, TVector<T> > > Set;
public:
- Partition(const Eq& eq)
- : m_eq(eq)
- , m_maxidx(0)
- {
- }
-
- /// Appends a new item into partition, creating new equivalience class if neccessary.
- void Append(const T& t) {
- DoAppend(m_set, t);
- }
-
- typedef typename Set::const_iterator ConstIterator;
-
- ConstIterator Begin() const {
- return m_set.begin();
- }
- ConstIterator begin() const {
- return m_set.begin();
- }
- ConstIterator End() const {
- return m_set.end();
- }
- ConstIterator end() const {
- return m_set.end();
- }
- size_t Size() const {
- return m_set.size();
- }
- bool Empty() const {
- return m_set.empty();
- }
-
- /// Returns an item equal to @p t. It is guaranteed that:
- /// - representative(a) equals representative(b) iff a is equivalent to b;
- /// - representative(a) is equivalent to a.
- const T& Representative(const T& t) const
- {
- auto it = m_inv.find(t);
- if (it != m_inv.end())
- return it->second;
- else
- return DefaultValue<T>();
- }
-
- bool Contains(const T& t) const
- {
- return m_inv.find(t) != m_inv.end();
- }
-
- /// Returns an index of set containing @p t. It is guaranteed that:
- /// - index(a) equals index(b) iff a is equivalent to b;
- /// - 0 <= index(a) < size().
- size_t Index(const T& t) const
- {
- auto it = m_inv.find(t);
- if (it == m_inv.end())
- throw Error("Partition::index(): attempted to obtain an index of nonexistent item");
- auto it2 = m_set.find(it->second);
- Y_ASSERT(it2 != m_set.end());
- return it2->second.first;
- }
- /// Returns the whole equivalence class of @p t (i.e. item @p i
- /// is returned iff representative(i) == representative(t)).
- const TVector<T>& Klass(const T& t) const
- {
- auto it = m_inv.find(t);
- if (it == m_inv.end())
- throw Error("Partition::index(): attempted to obtain an index of nonexistent item");
- auto it2 = m_set.find(it->second);
- Y_ASSERT(it2 != m_set.end());
- return it2->second.second;
- }
-
- bool operator == (const Partition& rhs) const { return m_set == rhs.m_set; }
- bool operator != (const Partition& rhs) const { return !(*this == rhs); }
-
- /// Splits the current sets into smaller ones, using given equivalence relation.
- /// Requires given relation imply previous one (set either in ctor or
- /// in preceeding calls to split()), but performs faster.
- /// Replaces previous relation with given one.
- void Split(const Eq& eq)
- {
- m_eq = eq;
-
- for (auto&& element : m_set)
- if (element.second.second.size() > 1) {
- TVector<T>& v = element.second.second;
- auto bound = std::partition(v.begin(), v.end(), std::bind2nd(m_eq, v[0]));
- if (bound == v.end())
- continue;
-
- Set delta;
- for (auto it = bound, ie = v.end(); it != ie; ++it)
- DoAppend(delta, *it);
-
- v.erase(bound, v.end());
- m_set.insert(delta.begin(), delta.end());
- }
- }
+ Partition(const Eq& eq)
+ : m_eq(eq)
+ , m_maxidx(0)
+ {
+ }
+
+ /// Appends a new item into partition, creating new equivalience class if neccessary.
+ void Append(const T& t) {
+ DoAppend(m_set, t);
+ }
+
+ typedef typename Set::const_iterator ConstIterator;
+
+ ConstIterator Begin() const {
+ return m_set.begin();
+ }
+ ConstIterator begin() const {
+ return m_set.begin();
+ }
+ ConstIterator End() const {
+ return m_set.end();
+ }
+ ConstIterator end() const {
+ return m_set.end();
+ }
+ size_t Size() const {
+ return m_set.size();
+ }
+ bool Empty() const {
+ return m_set.empty();
+ }
+
+ /// Returns an item equal to @p t. It is guaranteed that:
+ /// - representative(a) equals representative(b) iff a is equivalent to b;
+ /// - representative(a) is equivalent to a.
+ const T& Representative(const T& t) const
+ {
+ auto it = m_inv.find(t);
+ if (it != m_inv.end())
+ return it->second;
+ else
+ return DefaultValue<T>();
+ }
+
+ bool Contains(const T& t) const
+ {
+ return m_inv.find(t) != m_inv.end();
+ }
+
+ /// Returns an index of set containing @p t. It is guaranteed that:
+ /// - index(a) equals index(b) iff a is equivalent to b;
+ /// - 0 <= index(a) < size().
+ size_t Index(const T& t) const
+ {
+ auto it = m_inv.find(t);
+ if (it == m_inv.end())
+ throw Error("Partition::index(): attempted to obtain an index of nonexistent item");
+ auto it2 = m_set.find(it->second);
+ Y_ASSERT(it2 != m_set.end());
+ return it2->second.first;
+ }
+ /// Returns the whole equivalence class of @p t (i.e. item @p i
+ /// is returned iff representative(i) == representative(t)).
+ const TVector<T>& Klass(const T& t) const
+ {
+ auto it = m_inv.find(t);
+ if (it == m_inv.end())
+ throw Error("Partition::index(): attempted to obtain an index of nonexistent item");
+ auto it2 = m_set.find(it->second);
+ Y_ASSERT(it2 != m_set.end());
+ return it2->second.second;
+ }
+
+ bool operator == (const Partition& rhs) const { return m_set == rhs.m_set; }
+ bool operator != (const Partition& rhs) const { return !(*this == rhs); }
+
+ /// Splits the current sets into smaller ones, using given equivalence relation.
+ /// Requires given relation imply previous one (set either in ctor or
+ /// in preceeding calls to split()), but performs faster.
+ /// Replaces previous relation with given one.
+ void Split(const Eq& eq)
+ {
+ m_eq = eq;
+
+ for (auto&& element : m_set)
+ if (element.second.second.size() > 1) {
+ TVector<T>& v = element.second.second;
+ auto bound = std::partition(v.begin(), v.end(), std::bind2nd(m_eq, v[0]));
+ if (bound == v.end())
+ continue;
+
+ Set delta;
+ for (auto it = bound, ie = v.end(); it != ie; ++it)
+ DoAppend(delta, *it);
+
+ v.erase(bound, v.end());
+ m_set.insert(delta.begin(), delta.end());
+ }
+ }
private:
- Eq m_eq;
- Set m_set;
- TMap<T, T> m_inv;
- size_t m_maxidx;
-
- void DoAppend(Set& set, const T& t)
- {
- auto it = set.begin();
- auto end = set.end();
- for (; it != end; ++it)
- if (m_eq(it->first, t)) {
- it->second.second.push_back(t);
- m_inv[t] = it->first;
- break;
- }
-
- if (it == end) {
- // Begin new set
- TVector<T> v(1, t);
- set.insert(ymake_pair(t, ymake_pair(m_maxidx++, v)));
- m_inv[t] = t;
- }
- }
+ Eq m_eq;
+ Set m_set;
+ TMap<T, T> m_inv;
+ size_t m_maxidx;
+
+ void DoAppend(Set& set, const T& t)
+ {
+ auto it = set.begin();
+ auto end = set.end();
+ for (; it != end; ++it)
+ if (m_eq(it->first, t)) {
+ it->second.second.push_back(t);
+ m_inv[t] = it->first;
+ break;
+ }
+
+ if (it == end) {
+ // Begin new set
+ TVector<T> v(1, t);
+ set.insert(ymake_pair(t, ymake_pair(m_maxidx++, v)));
+ m_inv[t] = t;
+ }
+ }
};
// Mainly for debugging
template<class T, class Eq>
yostream& operator << (yostream& stream, const Partition<T, Eq>& partition)
{
- stream << "Partition {\n";
- for (auto&& partitionElement : partition) {
- stream << " Class " << partitionElement.second.first << " \"" << partitionElement.first << "\" { ";
- bool first = false;
- for (auto&& element : partitionElement.second.second) {
- if (first)
- stream << ", ";
- else
- first = true;
- stream << element;
- }
- stream << " }\n";
- }
- stream << "}";
- return stream;
+ stream << "Partition {\n";
+ for (auto&& partitionElement : partition) {
+ stream << " Class " << partitionElement.second.first << " \"" << partitionElement.first << "\" { ";
+ bool first = false;
+ for (auto&& element : partitionElement.second.second) {
+ if (first)
+ stream << ", ";
+ else
+ first = true;
+ stream << element;
+ }
+ stream << " }\n";
+ }
+ stream << "}";
+ return stream;
}
}
diff --git a/library/cpp/regex/pire/pire/pire.h b/library/cpp/regex/pire/pire/pire.h
index 305d70703a8..f036ce14f84 100644
--- a/library/cpp/regex/pire/pire/pire.h
+++ b/library/cpp/regex/pire/pire/pire.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
diff --git a/library/cpp/regex/pire/pire/re_lexer.cpp b/library/cpp/regex/pire/pire/re_lexer.cpp
index 456b015d834..509e2508887 100644
--- a/library/cpp/regex/pire/pire/re_lexer.cpp
+++ b/library/cpp/regex/pire/pire/re_lexer.cpp
@@ -241,8 +241,8 @@ namespace {
for (; ch != End && ch != (Control | ']'); ch = CorrectChar(GetChar(), controls)) {
if (ch == (Control | 'x')) {
UngetChar(ch);
- firstUnicode = true;
- unicodeSymbol = ReadUnicodeCharacter();
+ firstUnicode = true;
+ unicodeSymbol = ReadUnicodeCharacter();
} else {
firstUnicode = false;
}
diff --git a/library/cpp/regex/pire/pire/re_lexer.h b/library/cpp/regex/pire/pire/re_lexer.h
index 279f67e2c5a..d52ed207ddf 100644
--- a/library/cpp/regex/pire/pire/re_lexer.h
+++ b/library/cpp/regex/pire/pire/re_lexer.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -52,18 +52,18 @@ using namespace Consts;
namespace TokenTypes {
enum {
- None = 0,
- Letters,
- Count,
- Dot,
- Open,
- Close,
- Or,
- And,
- Not,
- BeginMark,
- EndMark,
- End
+ None = 0,
+ Letters,
+ Count,
+ Dot,
+ Open,
+ Close,
+ Or,
+ And,
+ Not,
+ BeginMark,
+ EndMark,
+ End
};
}
@@ -74,31 +74,31 @@ enum {
*/
class Term {
public:
- typedef TVector<wchar32> String;
- typedef TSet<String> Strings;
+ typedef TVector<wchar32> String;
+ typedef TSet<String> Strings;
- typedef ypair<int, int> RepetitionCount;
- typedef ypair<Strings, bool> CharacterRange;
+ typedef ypair<int, int> RepetitionCount;
+ typedef ypair<Strings, bool> CharacterRange;
- struct DotTag {};
- struct BeginTag {};
- struct EndTag {};
+ struct DotTag {};
+ struct BeginTag {};
+ struct EndTag {};
- Term(int type): m_type(type) {}
- template<class T> Term(int type, T t): m_type(type), m_value(t) {}
- Term(int type, const Any& value): m_type(type), m_value(value) {}
+ Term(int type): m_type(type) {}
+ template<class T> Term(int type, T t): m_type(type), m_value(t) {}
+ Term(int type, const Any& value): m_type(type), m_value(value) {}
- static Term Character(wchar32 c);
- static Term Repetition(int lower, int upper);
- static Term Dot();
- static Term BeginMark();
- static Term EndMark();
+ static Term Character(wchar32 c);
+ static Term Repetition(int lower, int upper);
+ static Term Dot();
+ static Term BeginMark();
+ static Term EndMark();
- int Type() const { return m_type; }
- const Any& Value() const { return m_value; }
+ int Type() const { return m_type; }
+ const Any& Value() const { return m_value; }
private:
- int m_type;
- Any m_value;
+ int m_type;
+ Any m_value;
};
class Feature;
@@ -108,78 +108,78 @@ class Feature;
*/
class Lexer {
public:
- // One-size-fits-all constructor set.
- Lexer()
- : m_encoding(&Encodings::Latin1())
- { InstallDefaultFeatures(); }
-
- explicit Lexer(const char* str)
- : m_encoding(&Encodings::Latin1())
- {
- InstallDefaultFeatures();
- Assign(str, str + strlen(str));
- }
- template<class T> explicit Lexer(const T& t)
- : m_encoding(&Encodings::Latin1())
- {
- InstallDefaultFeatures();
- Assign(t.begin(), t.end());
- }
-
- template<class Iter> Lexer(Iter begin, Iter end)
- : m_encoding(&Encodings::Latin1())
- {
- InstallDefaultFeatures();
- Assign(begin, end);
- }
- ~Lexer();
-
- template<class Iter> void Assign(Iter begin, Iter end)
- {
- m_input.clear();
- std::copy(begin, end, std::back_inserter(m_input));
- }
-
- /// The main lexer function. Extracts and returns the next term in input sequence.
- Term Lex();
- /// Installs an additional lexer feature.
- /// We declare both lvalue and rvalue reference types to fix some linker errors.
- Lexer& AddFeature(THolder<Feature>& a);
- Lexer& AddFeature(THolder<Feature>&& a);
-
- const Pire::Encoding& Encoding() const { return *m_encoding; }
- Lexer& SetEncoding(const Pire::Encoding& encoding) { m_encoding = &encoding; return *this; }
- void SetError(const char* msg) { errmsg = msg; }
- void SetError(ystring msg) { errmsg = msg; }
- ystring& GetError() { return errmsg; }
-
- Any& Retval() { return m_retval; }
-
- Fsm Parse();
-
- void Parenthesized(Fsm& fsm);
+ // One-size-fits-all constructor set.
+ Lexer()
+ : m_encoding(&Encodings::Latin1())
+ { InstallDefaultFeatures(); }
+
+ explicit Lexer(const char* str)
+ : m_encoding(&Encodings::Latin1())
+ {
+ InstallDefaultFeatures();
+ Assign(str, str + strlen(str));
+ }
+ template<class T> explicit Lexer(const T& t)
+ : m_encoding(&Encodings::Latin1())
+ {
+ InstallDefaultFeatures();
+ Assign(t.begin(), t.end());
+ }
+
+ template<class Iter> Lexer(Iter begin, Iter end)
+ : m_encoding(&Encodings::Latin1())
+ {
+ InstallDefaultFeatures();
+ Assign(begin, end);
+ }
+ ~Lexer();
+
+ template<class Iter> void Assign(Iter begin, Iter end)
+ {
+ m_input.clear();
+ std::copy(begin, end, std::back_inserter(m_input));
+ }
+
+ /// The main lexer function. Extracts and returns the next term in input sequence.
+ Term Lex();
+ /// Installs an additional lexer feature.
+ /// We declare both lvalue and rvalue reference types to fix some linker errors.
+ Lexer& AddFeature(THolder<Feature>& a);
+ Lexer& AddFeature(THolder<Feature>&& a);
+
+ const Pire::Encoding& Encoding() const { return *m_encoding; }
+ Lexer& SetEncoding(const Pire::Encoding& encoding) { m_encoding = &encoding; return *this; }
+ void SetError(const char* msg) { errmsg = msg; }
+ void SetError(ystring msg) { errmsg = msg; }
+ ystring& GetError() { return errmsg; }
+
+ Any& Retval() { return m_retval; }
+
+ Fsm Parse();
+
+ void Parenthesized(Fsm& fsm);
private:
- Term DoLex();
+ Term DoLex();
- wchar32 GetChar();
- wchar32 PeekChar();
- void UngetChar(wchar32 c);
+ wchar32 GetChar();
+ wchar32 PeekChar();
+ void UngetChar(wchar32 c);
- void Error(const char* msg) { throw Pire::Error(msg); }
+ void Error(const char* msg) { throw Pire::Error(msg); }
- void InstallDefaultFeatures();
+ void InstallDefaultFeatures();
- TDeque<wchar32> m_input;
- const Pire::Encoding* m_encoding;
- TVector<THolder<Feature>> m_features;
- Any m_retval;
- ystring errmsg;
+ TDeque<wchar32> m_input;
+ const Pire::Encoding* m_encoding;
+ TVector<THolder<Feature>> m_features;
+ Any m_retval;
+ ystring errmsg;
- friend class Feature;
+ friend class Feature;
- Lexer(const Lexer&);
- Lexer& operator = (const Lexer&);
+ Lexer(const Lexer&);
+ Lexer& operator = (const Lexer&);
};
/**
@@ -188,55 +188,55 @@ private:
*/
class Feature {
public:
- /// Precedence of features. The less the priority, the earlier
- /// will Lex() be called, and the later will Alter() and Parenthesized() be called.
- virtual int Priority() const { return 50; }
-
- /// Lexer will call this function to check whether the feature
- /// wants to handle the next part of the input sequence in its
- /// specific way. If it does not, features Lex() will not be called.
- virtual bool Accepts(wchar32 /*c*/) const { return false; }
- /// Should eat up some part of the input sequence, handle it
- /// somehow and produce a terminal.
- virtual Term Lex() { return Term(0); }
-
- /// This function recieves a shiny new terminal, and the feature
- /// has a chance to hack it somehow if it wants.
- virtual void Alter(Term&) {}
- /// This function recieves a parenthesized part of a pattern, and the feature
- /// has a chance to hack it somehow if it wants (its the way to implement
- /// those perl-style (?@#$%:..) clauses).
- virtual void Parenthesized(Fsm&) {}
-
- using Ptr = THolder<Feature>;
-
- virtual ~Feature() = default;
+ /// Precedence of features. The less the priority, the earlier
+ /// will Lex() be called, and the later will Alter() and Parenthesized() be called.
+ virtual int Priority() const { return 50; }
+
+ /// Lexer will call this function to check whether the feature
+ /// wants to handle the next part of the input sequence in its
+ /// specific way. If it does not, features Lex() will not be called.
+ virtual bool Accepts(wchar32 /*c*/) const { return false; }
+ /// Should eat up some part of the input sequence, handle it
+ /// somehow and produce a terminal.
+ virtual Term Lex() { return Term(0); }
+
+ /// This function recieves a shiny new terminal, and the feature
+ /// has a chance to hack it somehow if it wants.
+ virtual void Alter(Term&) {}
+ /// This function recieves a parenthesized part of a pattern, and the feature
+ /// has a chance to hack it somehow if it wants (its the way to implement
+ /// those perl-style (?@#$%:..) clauses).
+ virtual void Parenthesized(Fsm&) {}
+
+ using Ptr = THolder<Feature>;
+
+ virtual ~Feature() = default;
protected:
- // These functions are exposed versions of the corresponding lexer functions.
- const Pire::Encoding& Encoding() const { return m_lexer->Encoding(); }
- wchar32 GetChar() { return m_lexer->GetChar(); }
- wchar32 PeekChar() { return m_lexer->PeekChar(); }
- void UngetChar(wchar32 c) { m_lexer->UngetChar(c); }
- wchar32 CorrectChar(wchar32 c, const char* controls);
- void Error(const char* msg) { m_lexer->Error(msg); }
+ // These functions are exposed versions of the corresponding lexer functions.
+ const Pire::Encoding& Encoding() const { return m_lexer->Encoding(); }
+ wchar32 GetChar() { return m_lexer->GetChar(); }
+ wchar32 PeekChar() { return m_lexer->PeekChar(); }
+ void UngetChar(wchar32 c) { m_lexer->UngetChar(c); }
+ wchar32 CorrectChar(wchar32 c, const char* controls);
+ void Error(const char* msg) { m_lexer->Error(msg); }
private:
- friend class Lexer;
- Lexer* m_lexer;
+ friend class Lexer;
+ Lexer* m_lexer;
};
namespace Features {
- /// Disables case sensitivity
- Feature::Ptr CaseInsensitive();
-
- /**
- * Adds two more operations:
- * (pattern1)&(pattern2) -- matches those strings which match both /pattern1/ and /pattern2/;
- * ~(pattern) -- matches those strings which do not match /pattern/.
- */
- Feature::Ptr AndNotSupport();
+ /// Disables case sensitivity
+ Feature::Ptr CaseInsensitive();
+
+ /**
+ * Adds two more operations:
+ * (pattern1)&(pattern2) -- matches those strings which match both /pattern1/ and /pattern2/;
+ * ~(pattern) -- matches those strings which do not match /pattern/.
+ */
+ Feature::Ptr AndNotSupport();
}
}
diff --git a/library/cpp/regex/pire/pire/read_unicode.cpp b/library/cpp/regex/pire/pire/read_unicode.cpp
index 6278ad500aa..6422144c82c 100644
--- a/library/cpp/regex/pire/pire/read_unicode.cpp
+++ b/library/cpp/regex/pire/pire/read_unicode.cpp
@@ -26,58 +26,58 @@
#include <library/cpp/regex/pire/pire/re_lexer.h>
namespace Pire {
- wchar32 UnicodeReader::ReadUnicodeCharacter() {
- ystring hexStr;
- GetChar();
- wchar32 ch = PeekChar();
+ wchar32 UnicodeReader::ReadUnicodeCharacter() {
+ ystring hexStr;
+ GetChar();
+ wchar32 ch = PeekChar();
- if (ch == '{') {
- GetChar();
- hexStr = ReadHexDigit(
- [](wchar32 ch, size_t numAdded) -> bool { return ch == End || (numAdded != 0 && ch == '}'); });
- ch = GetChar();
- if (ch != '}') {
- Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x{...\" sequence should be closed by \"}\"");
- }
- } else {
- hexStr = ReadHexDigit([](wchar32, size_t numAdded) -> bool { return numAdded == 2; });
- if (hexStr.size() != 2) {
- Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x...\" sequence should contain two symbols");
- }
- }
- return HexToDec(hexStr);
- }
+ if (ch == '{') {
+ GetChar();
+ hexStr = ReadHexDigit(
+ [](wchar32 ch, size_t numAdded) -> bool { return ch == End || (numAdded != 0 && ch == '}'); });
+ ch = GetChar();
+ if (ch != '}') {
+ Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x{...\" sequence should be closed by \"}\"");
+ }
+ } else {
+ hexStr = ReadHexDigit([](wchar32, size_t numAdded) -> bool { return numAdded == 2; });
+ if (hexStr.size() != 2) {
+ Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x...\" sequence should contain two symbols");
+ }
+ }
+ return HexToDec(hexStr);
+ }
- bool UnicodeReader::IsHexDigit(wchar32 ch) {
- return ch < 256 && std::isxdigit(ch) != 0;
- }
+ bool UnicodeReader::IsHexDigit(wchar32 ch) {
+ return ch < 256 && std::isxdigit(ch) != 0;
+ }
- ystring UnicodeReader::ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop) {
- ystring result;
- wchar32 ch = GetChar();
- while (!shouldStop(ch, result.size())) {
- if (!IsHexDigit(ch)) {
- Error("Pire::UnicodeReader::ReadHexDigit(): \"\\x...\" sequence contains non-valid hex number");
- }
- result.push_back(ch);
- ch = GetChar();
- }
- UngetChar(ch);
- return result;
- }
+ ystring UnicodeReader::ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop) {
+ ystring result;
+ wchar32 ch = GetChar();
+ while (!shouldStop(ch, result.size())) {
+ if (!IsHexDigit(ch)) {
+ Error("Pire::UnicodeReader::ReadHexDigit(): \"\\x...\" sequence contains non-valid hex number");
+ }
+ result.push_back(ch);
+ ch = GetChar();
+ }
+ UngetChar(ch);
+ return result;
+ }
- wchar32 UnicodeReader::HexToDec(const ystring &hexStr) {
- wchar32 converted;
- try {
- converted = std::stoul(hexStr, 0, 16);
- } catch (std::out_of_range &) {
- converted = MAX_UNICODE + 1;
- }
- if (converted > MAX_UNICODE) {
- Error("Pire::UnicodeReader::HexToDec(): hex number in \"\\x...\" sequence is too large");
- }
- return converted;
- }
+ wchar32 UnicodeReader::HexToDec(const ystring &hexStr) {
+ wchar32 converted;
+ try {
+ converted = std::stoul(hexStr, 0, 16);
+ } catch (std::out_of_range &) {
+ converted = MAX_UNICODE + 1;
+ }
+ if (converted > MAX_UNICODE) {
+ Error("Pire::UnicodeReader::HexToDec(): hex number in \"\\x...\" sequence is too large");
+ }
+ return converted;
+ }
}
diff --git a/library/cpp/regex/pire/pire/read_unicode.h b/library/cpp/regex/pire/pire/read_unicode.h
index ea3d7599ebb..3c48dfe2453 100644
--- a/library/cpp/regex/pire/pire/read_unicode.h
+++ b/library/cpp/regex/pire/pire/read_unicode.h
@@ -24,17 +24,17 @@
#include <library/cpp/regex/pire/pire/re_lexer.h>
namespace Pire {
- class UnicodeReader : public Feature {
- public:
- wchar32 ReadUnicodeCharacter();
+ class UnicodeReader : public Feature {
+ public:
+ wchar32 ReadUnicodeCharacter();
- private:
- static const wchar32 MAX_UNICODE = 0x10FFFF;
+ private:
+ static const wchar32 MAX_UNICODE = 0x10FFFF;
- bool IsHexDigit(wchar32 ch);
- ystring ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop);
- wchar32 HexToDec(const ystring& hexStr);
- };
+ bool IsHexDigit(wchar32 ch);
+ ystring ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop);
+ wchar32 HexToDec(const ystring& hexStr);
+ };
}
diff --git a/library/cpp/regex/pire/pire/run.h b/library/cpp/regex/pire/pire/run.h
index 2c536f7c3a2..905f6c32236 100644
--- a/library/cpp/regex/pire/pire/run.h
+++ b/library/cpp/regex/pire/pire/run.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -35,17 +35,17 @@
namespace Pire {
- template<class Scanner>
- struct StDumper {
- StDumper(const Scanner& sc, typename Scanner::State st): m_sc(&sc), m_st(st) {}
- void Dump(yostream& stream) const { stream << m_sc->StateIndex(m_st) << (m_sc->Final(m_st) ? " [final]" : ""); }
- private:
- const Scanner* m_sc;
- typename Scanner::State m_st;
- };
-
- template<class Scanner> StDumper<Scanner> StDump(const Scanner& sc, typename Scanner::State st) { return StDumper<Scanner>(sc, st); }
- template<class Scanner> yostream& operator << (yostream& stream, const StDumper<Scanner>& stdump) { stdump.Dump(stream); return stream; }
+ template<class Scanner>
+ struct StDumper {
+ StDumper(const Scanner& sc, typename Scanner::State st): m_sc(&sc), m_st(st) {}
+ void Dump(yostream& stream) const { stream << m_sc->StateIndex(m_st) << (m_sc->Final(m_st) ? " [final]" : ""); }
+ private:
+ const Scanner* m_sc;
+ typename Scanner::State m_st;
+ };
+
+ template<class Scanner> StDumper<Scanner> StDump(const Scanner& sc, typename Scanner::State st) { return StDumper<Scanner>(sc, st); }
+ template<class Scanner> yostream& operator << (yostream& stream, const StDumper<Scanner>& stdump) { stdump.Dump(stream); return stream; }
}
namespace Pire {
@@ -54,53 +54,53 @@ template<class Scanner>
PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
void Step(const Scanner& scanner, typename Scanner::State& state, Char ch)
{
- Y_ASSERT(ch < MaxCharUnaligned);
- typename Scanner::Action a = scanner.Next(state, ch);
- scanner.TakeAction(state, a);
+ Y_ASSERT(ch < MaxCharUnaligned);
+ typename Scanner::Action a = scanner.Next(state, ch);
+ scanner.TakeAction(state, a);
}
namespace Impl {
- enum Action { Continue, Stop };
-
- template<class Scanner>
- struct RunPred {
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- Action operator()(const Scanner&, const typename Scanner::State&, const char*) const { return Continue; }
- };
-
- template<class Scanner>
- struct ShortestPrefixPred {
- explicit ShortestPrefixPred(const char*& pos): m_pos(&pos) {}
-
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- Action operator()(const Scanner& sc, const typename Scanner::State& st, const char* pos) const
- {
- if (sc.Final(st)) {
- *m_pos = pos;
- return Stop;
- } else {
- return (sc.Dead(st) ? Stop : Continue);
- }
- }
- private:
- const char** m_pos;
- };
-
- template<class Scanner>
- struct LongestPrefixPred {
- explicit LongestPrefixPred(const char*& pos): m_pos(&pos) {}
-
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- Action operator()(const Scanner& sc, const typename Scanner::State& st, const char* pos) const
- {
- if (sc.Final(st))
- *m_pos = pos;
- return (sc.Dead(st) ? Stop : Continue);
- }
- private:
- const char** m_pos;
- };
+ enum Action { Continue, Stop };
+
+ template<class Scanner>
+ struct RunPred {
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ Action operator()(const Scanner&, const typename Scanner::State&, const char*) const { return Continue; }
+ };
+
+ template<class Scanner>
+ struct ShortestPrefixPred {
+ explicit ShortestPrefixPred(const char*& pos): m_pos(&pos) {}
+
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ Action operator()(const Scanner& sc, const typename Scanner::State& st, const char* pos) const
+ {
+ if (sc.Final(st)) {
+ *m_pos = pos;
+ return Stop;
+ } else {
+ return (sc.Dead(st) ? Stop : Continue);
+ }
+ }
+ private:
+ const char** m_pos;
+ };
+
+ template<class Scanner>
+ struct LongestPrefixPred {
+ explicit LongestPrefixPred(const char*& pos): m_pos(&pos) {}
+
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ Action operator()(const Scanner& sc, const typename Scanner::State& st, const char* pos) const
+ {
+ if (sc.Final(st))
+ *m_pos = pos;
+ return (sc.Dead(st) ? Stop : Continue);
+ }
+ private:
+ const char** m_pos;
+ };
}
@@ -108,125 +108,125 @@ namespace Impl {
namespace Impl {
- template<class Scanner, class Pred>
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- Action SafeRunChunk(const Scanner& scanner, typename Scanner::State& state, const size_t* p, size_t pos, size_t size, Pred pred)
- {
- Y_ASSERT(pos <= sizeof(size_t));
- Y_ASSERT(size <= sizeof(size_t));
- Y_ASSERT(pos + size <= sizeof(size_t));
-
- if (PIRE_UNLIKELY(size == 0))
- return Continue;
-
- const char* ptr = (const char*) p + pos;
- for (; size--; ++ptr) {
- Step(scanner, state, (unsigned char) *ptr);
- if (pred(scanner, state, ptr + 1) == Stop)
- return Stop;
- }
- return Continue;
- }
-
- /// Effectively runs a scanner on a short data chunk, fit completely into one machine word.
- template<class Scanner, class Pred>
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- Action RunChunk(const Scanner& scanner, typename Scanner::State& state, const size_t* p, size_t pos, size_t size, Pred pred)
- {
- Y_ASSERT(pos <= sizeof(size_t));
- Y_ASSERT(size <= sizeof(size_t));
- Y_ASSERT(pos + size <= sizeof(size_t));
-
- if (PIRE_UNLIKELY(size == 0))
- return Continue;
-
- size_t chunk = Impl::ToLittleEndian(*p) >> 8*pos;
- const char* ptr = (const char*) p + pos + size + 1;
-
- for (size_t i = size; i != 0; --i) {
- Step(scanner, state, chunk & 0xFF);
- if (pred(scanner, state, ptr - i) == Stop)
- return Stop;
- chunk >>= 8;
- }
- return Continue;
- }
-
- template<class Scanner>
- struct AlignedRunner {
-
- // Generic version for LongestPrefix()/ShortestPrefix() impelementations
- template<class Pred>
- static inline PIRE_HOT_FUNCTION
- Action RunAligned(const Scanner& scanner, typename Scanner::State& state, const size_t* begin, const size_t* end, Pred stop)
- {
- typename Scanner::State st = state;
- Action ret = Continue;
- for (; begin != end && (ret = RunChunk(scanner, st, begin, 0, sizeof(void*), stop)) == Continue; ++begin)
- ;
- state = st;
- return ret;
- }
-
- // A special version for Run() impelementation that skips predicate checks
- static inline PIRE_HOT_FUNCTION
- Action RunAligned(const Scanner& scanner, typename Scanner::State& state, const size_t* begin, const size_t* end, RunPred<Scanner>)
- {
- typename Scanner::State st = state;
- for (; begin != end; ++begin) {
- size_t chunk = *begin;
- for (size_t i = sizeof(chunk); i != 0; --i) {
- Step(scanner, st, chunk & 0xFF);
- chunk >>= 8;
- }
- }
- state = st;
- return Continue;
- }
- };
-
- /// The main function: runs a scanner through given memory range.
- template<class Scanner, class Pred>
- inline void DoRun(const Scanner& scanner, typename Scanner::State& st, TStringBuf str, Pred pred)
- {
-
- const size_t* head = reinterpret_cast<const size_t*>((reinterpret_cast<uintptr_t>(str.begin())) & ~(sizeof(size_t)-1));
- const size_t* tail = reinterpret_cast<const size_t*>((reinterpret_cast<uintptr_t>(str.end())) & ~(sizeof(size_t)-1));
-
- size_t headSize = (sizeof(size_t) - (str.begin() - (const char*)head)); // The distance from @p begin to the end of the word containing @p begin
- size_t tailSize = str.end() - (const char*) tail; // The distance from the beginning of the word containing @p end to the @p end
-
- Y_ASSERT(headSize >= 1 && headSize <= sizeof(size_t));
- Y_ASSERT(tailSize < sizeof(size_t));
-
- if (head == tail) {
- Impl::SafeRunChunk(scanner, st, head, sizeof(size_t) - headSize, str.end() - str.begin(), pred);
- return;
- }
-
- // st is passed by reference to this function. If we use it directly on each step the compiler will have to
- // update it in memory because of pointer aliasing assumptions. Copying it into a local var allows the
- // compiler to store it in a register. This saves some instructions and cycles
- typename Scanner::State state = st;
-
- if (str.begin() != (const char*) head) {
- if (Impl::RunChunk(scanner, state, head, sizeof(size_t) - headSize, headSize, pred) == Stop) {
- st = state;
- return;
- }
- ++head;
- }
-
- if (Impl::AlignedRunner<Scanner>::RunAligned(scanner, state, head, tail, pred) == Stop) {
- st = state;
- return;
- }
-
- if (tailSize)
- Impl::SafeRunChunk(scanner, state, tail, 0, tailSize, pred);
-
- st = state;
- }
+ template<class Scanner, class Pred>
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ Action SafeRunChunk(const Scanner& scanner, typename Scanner::State& state, const size_t* p, size_t pos, size_t size, Pred pred)
+ {
+ Y_ASSERT(pos <= sizeof(size_t));
+ Y_ASSERT(size <= sizeof(size_t));
+ Y_ASSERT(pos + size <= sizeof(size_t));
+
+ if (PIRE_UNLIKELY(size == 0))
+ return Continue;
+
+ const char* ptr = (const char*) p + pos;
+ for (; size--; ++ptr) {
+ Step(scanner, state, (unsigned char) *ptr);
+ if (pred(scanner, state, ptr + 1) == Stop)
+ return Stop;
+ }
+ return Continue;
+ }
+
+ /// Effectively runs a scanner on a short data chunk, fit completely into one machine word.
+ template<class Scanner, class Pred>
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ Action RunChunk(const Scanner& scanner, typename Scanner::State& state, const size_t* p, size_t pos, size_t size, Pred pred)
+ {
+ Y_ASSERT(pos <= sizeof(size_t));
+ Y_ASSERT(size <= sizeof(size_t));
+ Y_ASSERT(pos + size <= sizeof(size_t));
+
+ if (PIRE_UNLIKELY(size == 0))
+ return Continue;
+
+ size_t chunk = Impl::ToLittleEndian(*p) >> 8*pos;
+ const char* ptr = (const char*) p + pos + size + 1;
+
+ for (size_t i = size; i != 0; --i) {
+ Step(scanner, state, chunk & 0xFF);
+ if (pred(scanner, state, ptr - i) == Stop)
+ return Stop;
+ chunk >>= 8;
+ }
+ return Continue;
+ }
+
+ template<class Scanner>
+ struct AlignedRunner {
+
+ // Generic version for LongestPrefix()/ShortestPrefix() impelementations
+ template<class Pred>
+ static inline PIRE_HOT_FUNCTION
+ Action RunAligned(const Scanner& scanner, typename Scanner::State& state, const size_t* begin, const size_t* end, Pred stop)
+ {
+ typename Scanner::State st = state;
+ Action ret = Continue;
+ for (; begin != end && (ret = RunChunk(scanner, st, begin, 0, sizeof(void*), stop)) == Continue; ++begin)
+ ;
+ state = st;
+ return ret;
+ }
+
+ // A special version for Run() impelementation that skips predicate checks
+ static inline PIRE_HOT_FUNCTION
+ Action RunAligned(const Scanner& scanner, typename Scanner::State& state, const size_t* begin, const size_t* end, RunPred<Scanner>)
+ {
+ typename Scanner::State st = state;
+ for (; begin != end; ++begin) {
+ size_t chunk = *begin;
+ for (size_t i = sizeof(chunk); i != 0; --i) {
+ Step(scanner, st, chunk & 0xFF);
+ chunk >>= 8;
+ }
+ }
+ state = st;
+ return Continue;
+ }
+ };
+
+ /// The main function: runs a scanner through given memory range.
+ template<class Scanner, class Pred>
+ inline void DoRun(const Scanner& scanner, typename Scanner::State& st, TStringBuf str, Pred pred)
+ {
+
+ const size_t* head = reinterpret_cast<const size_t*>((reinterpret_cast<uintptr_t>(str.begin())) & ~(sizeof(size_t)-1));
+ const size_t* tail = reinterpret_cast<const size_t*>((reinterpret_cast<uintptr_t>(str.end())) & ~(sizeof(size_t)-1));
+
+ size_t headSize = (sizeof(size_t) - (str.begin() - (const char*)head)); // The distance from @p begin to the end of the word containing @p begin
+ size_t tailSize = str.end() - (const char*) tail; // The distance from the beginning of the word containing @p end to the @p end
+
+ Y_ASSERT(headSize >= 1 && headSize <= sizeof(size_t));
+ Y_ASSERT(tailSize < sizeof(size_t));
+
+ if (head == tail) {
+ Impl::SafeRunChunk(scanner, st, head, sizeof(size_t) - headSize, str.end() - str.begin(), pred);
+ return;
+ }
+
+ // st is passed by reference to this function. If we use it directly on each step the compiler will have to
+ // update it in memory because of pointer aliasing assumptions. Copying it into a local var allows the
+ // compiler to store it in a register. This saves some instructions and cycles
+ typename Scanner::State state = st;
+
+ if (str.begin() != (const char*) head) {
+ if (Impl::RunChunk(scanner, state, head, sizeof(size_t) - headSize, headSize, pred) == Stop) {
+ st = state;
+ return;
+ }
+ ++head;
+ }
+
+ if (Impl::AlignedRunner<Scanner>::RunAligned(scanner, state, head, tail, pred) == Stop) {
+ st = state;
+ return;
+ }
+
+ if (tailSize)
+ Impl::SafeRunChunk(scanner, state, tail, 0, tailSize, pred);
+
+ st = state;
+ }
}
@@ -235,52 +235,52 @@ namespace Impl {
template<class Scanner1, class Scanner2>
inline void Run(const Scanner1& scanner1, const Scanner2& scanner2, typename Scanner1::State& state1, typename Scanner2::State& state2, TStringBuf str)
{
- typedef ScannerPair<Scanner1, Scanner2> Scanners;
- Scanners pair(scanner1, scanner2);
- typename Scanners::State states(state1, state2);
- Run(pair, states, str);
- state1 = states.first;
- state2 = states.second;
+ typedef ScannerPair<Scanner1, Scanner2> Scanners;
+ Scanners pair(scanner1, scanner2); // wrap both scanners so a single Run() call drives them together
+ typename Scanners::State states(state1, state2); // combined state seeded from the two caller-supplied states
+ Run(pair, states, str);
+ state1 = states.first; // copy the resulting states back to the caller
+ state2 = states.second;
}
#else
namespace Impl {
- /// A debug version of all Run() methods.
- template<class Scanner, class Pred>
- inline void DoRun(const Scanner& scanner, typename Scanner::State& state, const char* begin, const char* end, Pred pred)
- {
- Cdbg << "Running regexp on string " << ystring(begin, ymin(end - begin, static_cast<ptrdiff_t>(100u))) << Endl;
- Cdbg << "Initial state " << StDump(scanner, state) << Endl;
-
- if (pred(scanner, state, begin) == Stop) {
- Cdbg << " exiting" << Endl;
- return;
- }
-
- for (; begin != end; ++begin) {
- Step(scanner, state, (unsigned char)*begin);
- Cdbg << *begin << " => state " << StDump(scanner, state) << Endl;
- if (pred(scanner, state, begin + 1) == Stop) {
- Cdbg << " exiting" << Endl;
- return;
- }
- }
- }
+ /// A debug version of all Run() methods.
+ template<class Scanner, class Pred>
+ inline void DoRun(const Scanner& scanner, typename Scanner::State& state, const char* begin, const char* end, Pred pred) // NOTE(review): the DoRun() callers visible in this file pass a single TStringBuf instead of (begin, end) — confirm this debug branch still compiles
+ {
+ Cdbg << "Running regexp on string " << ystring(begin, ymin(end - begin, static_cast<ptrdiff_t>(100u))) << Endl; // logs at most the first 100 bytes of the input
+ Cdbg << "Initial state " << StDump(scanner, state) << Endl;
+
+ if (pred(scanner, state, begin) == Stop) { // the predicate is consulted once before any byte is consumed
+ Cdbg << " exiting" << Endl;
+ return;
+ }
+
+ for (; begin != end; ++begin) { // byte-at-a-time scan; the predicate is consulted after every step
+ Step(scanner, state, (unsigned char)*begin);
+ Cdbg << *begin << " => state " << StDump(scanner, state) << Endl;
+ if (pred(scanner, state, begin + 1) == Stop) { // begin + 1 is the position just past the consumed byte
+ Cdbg << " exiting" << Endl;
+ return;
+ }
+ }
+ }
}
#endif
-
+
template<class Scanner>
void Run(const Scanner& sc, typename Scanner::State& st, TStringBuf str)
{
- Impl::DoRun(sc, st, str, Impl::RunPred<Scanner>());
+ Impl::DoRun(sc, st, str, Impl::RunPred<Scanner>()); // plain run over str, updating st in place; delegates to DoRun with the RunPred predicate
}
template<class Scanner>
void Run(const Scanner& sc, typename Scanner::State& st, const char* begin, const char* end)
{
- Run(sc, st, TStringBuf(begin, end));
+ Run(sc, st, TStringBuf(begin, end)); // pointer-pair convenience overload: wraps [begin, end) in a TStringBuf and forwards
}
/// Returns default constructed string_view{} if there is no matching prefix
@@ -288,25 +288,25 @@ void Run(const Scanner& sc, typename Scanner::State& st, const char* begin, cons
template<class Scanner>
std::string_view LongestPrefix(const Scanner& sc, std::string_view str, bool throughBeginMark = false, bool throughEndMark = false)
{
- typename Scanner::State st;
- sc.Initialize(st);
- if (throughBeginMark)
- Pire::Step(sc, st, BeginMark);
- const char* pos = (sc.Final(st) ? str.data() : nullptr);
- Impl::DoRun(sc, st, str, Impl::LongestPrefixPred<Scanner>(pos));
- if (throughEndMark) {
- Pire::Step(sc, st, EndMark);
- if (sc.Final(st))
- pos = str.data() + str.size();
- }
- return pos ? str.substr(0, pos - str.data()) : std::string_view{};
+ typename Scanner::State st;
+ sc.Initialize(st);
+ if (throughBeginMark) // optionally feed the BeginMark letter before any input byte
+ Pire::Step(sc, st, BeginMark);
+ const char* pos = (sc.Final(st) ? str.data() : nullptr); // a final state before any input means the empty prefix already matches
+ Impl::DoRun(sc, st, str, Impl::LongestPrefixPred<Scanner>(pos)); // presumably the predicate updates pos during the scan — its definition is outside this chunk, TODO confirm
+ if (throughEndMark) { // optionally feed the EndMark letter after the scan
+ Pire::Step(sc, st, EndMark);
+ if (sc.Final(st))
+ pos = str.data() + str.size(); // final after EndMark: the prefix is reported as the whole string
+ }
+ return pos ? str.substr(0, pos - str.data()) : std::string_view{}; // null pos means no match: default-constructed view
}
template<class Scanner>
const char* LongestPrefix(const Scanner& sc, const char* begin, const char* end, bool throughBeginMark = false, bool throughEndMark = false)
{
- auto prefix = LongestPrefix(sc, std::string_view(begin, end - begin), throughBeginMark, throughEndMark);
- return prefix.data() + prefix.size();
+ auto prefix = LongestPrefix(sc, std::string_view(begin, end - begin), throughBeginMark, throughEndMark); // pointer-pair wrapper around the string_view overload
+ return prefix.data() + prefix.size(); // one past the matched prefix; a default-constructed (no match) view yields a null pointer here
}
/// Returns default constructed string_view{} if there is no matching prefix
@@ -314,20 +314,20 @@ const char* LongestPrefix(const Scanner& sc, const char* begin, const char* end,
template<class Scanner>
std::string_view ShortestPrefix(const Scanner& sc, std::string_view str, bool throughBeginMark = false, bool throughEndMark = false)
{
- typename Scanner::State st;
- sc.Initialize(st);
- if (throughBeginMark)
- Pire::Step(sc, st, BeginMark);
- if (sc.Final(st))
- return str.substr(0, 0);
- const char* pos = nullptr;
- Impl::DoRun(sc, st, str, Impl::ShortestPrefixPred<Scanner>(pos));
- if (throughEndMark) {
- Pire::Step(sc, st, EndMark);
- if (sc.Final(st) && !pos)
- pos = str.data() + str.size();
- }
- return pos ? str.substr(0, pos - str.data()) : std::string_view{};
+ typename Scanner::State st;
+ sc.Initialize(st);
+ if (throughBeginMark) // optionally feed the BeginMark letter before any input byte
+ Pire::Step(sc, st, BeginMark);
+ if (sc.Final(st))
+ return str.substr(0, 0); // already final: the shortest prefix is the empty one (non-null data, zero length)
+ const char* pos = nullptr;
+ Impl::DoRun(sc, st, str, Impl::ShortestPrefixPred<Scanner>(pos)); // presumably the predicate records the first match position in pos — its definition is outside this chunk, TODO confirm
+ if (throughEndMark) {
+ Pire::Step(sc, st, EndMark);
+ if (sc.Final(st) && !pos) // only used when the scan itself recorded no match
+ pos = str.data() + str.size();
+ }
+ return pos ? str.substr(0, pos - str.data()) : std::string_view{}; // null pos means no match: default-constructed view
}
template<class Scanner>
@@ -337,7 +337,7 @@ const char* ShortestPrefix(const Scanner& sc, const char* begin, const char* end
return prefix.data() + prefix.size();
}
-
+
/// The same as above, but scans string in reverse direction
/// (consider using Fsm::Reverse() for using in this function).
/// Returns default constructed string_view{} if there is no matching suffix
@@ -345,35 +345,35 @@ const char* ShortestPrefix(const Scanner& sc, const char* begin, const char* end
template<class Scanner>
inline std::string_view LongestSuffix(const Scanner& scanner, std::string_view str, bool throughEndMark = false, bool throughBeginMark = false)
{
- typename Scanner::State state;
- scanner.Initialize(state);
- if (throughEndMark)
- Step(scanner, state, EndMark);
- PIRE_IFDEBUG(Cdbg << "Running LongestSuffix on string " << ystring(str) << Endl);
- PIRE_IFDEBUG(Cdbg << "Initial state " << StDump(scanner, state) << Endl);
-
- std::string_view suffix{};
- auto begin = str.data() + str.size();
- while (begin != str.data() && !scanner.Dead(state)) {
- if (scanner.Final(state))
- suffix = str.substr(begin - str.data());
- --begin;
- Step(scanner, state, (unsigned char)*begin);
- PIRE_IFDEBUG(Cdbg << *begin << " => state " << StDump(scanner, state) << Endl);
- }
- if (scanner.Final(state))
- suffix = str.substr(begin - str.data());
- if (throughBeginMark) {
- Step(scanner, state, BeginMark);
- if (scanner.Final(state))
- suffix = str.substr(begin - str.data());
- }
- return suffix;
+ typename Scanner::State state;
+ scanner.Initialize(state);
+ if (throughEndMark) // the string is scanned backwards, so EndMark is fed first
+ Step(scanner, state, EndMark);
+ PIRE_IFDEBUG(Cdbg << "Running LongestSuffix on string " << ystring(str) << Endl);
+ PIRE_IFDEBUG(Cdbg << "Initial state " << StDump(scanner, state) << Endl);
+
+ std::string_view suffix{}; // stays default-constructed when nothing matches
+ auto begin = str.data() + str.size(); // cursor starts one past the last byte and moves towards str.data()
+ while (begin != str.data() && !scanner.Dead(state)) { // stop early once the scanner reports a dead state
+ if (scanner.Final(state))
+ suffix = str.substr(begin - str.data()); // remember the longest match seen so far
+ --begin;
+ Step(scanner, state, (unsigned char)*begin);
+ PIRE_IFDEBUG(Cdbg << *begin << " => state " << StDump(scanner, state) << Endl);
+ }
+ if (scanner.Final(state)) // account for the state reached after the last consumed byte
+ suffix = str.substr(begin - str.data());
+ if (throughBeginMark) {
+ Step(scanner, state, BeginMark);
+ if (scanner.Final(state))
+ suffix = str.substr(begin - str.data());
+ }
+ return suffix;
}
template<class Scanner>
inline const char* LongestSuffix(const Scanner& scanner, const char* rbegin, const char* rend, bool throughEndMark = false, bool throughBeginMark = false) {
- auto suffix = LongestSuffix(scanner, std::string_view(rend + 1, rbegin - rend), throughEndMark, throughBeginMark);
+ auto suffix = LongestSuffix(scanner, std::string_view(rend + 1, rbegin - rend), throughEndMark, throughBeginMark); // the scanned bytes are rend + 1 .. rbegin inclusive
return suffix.data() ? suffix.data() - 1 : nullptr;
}
@@ -383,52 +383,52 @@ inline const char* LongestSuffix(const Scanner& scanner, const char* rbegin, con
template<class Scanner>
inline std::string_view ShortestSuffix(const Scanner& scanner, std::string_view str, bool throughEndMark = false, bool throughBeginMark = false)
{
- auto begin = str.data() + str.size();
- typename Scanner::State state;
- scanner.Initialize(state);
- if (throughEndMark)
- Step(scanner, state, EndMark);
- PIRE_IFDEBUG(Cdbg << "Running ShortestSuffix on string " << ystring(str) << Endl);
- PIRE_IFDEBUG(Cdbg << "Initial state " << StDump(scanner, state) << Endl);
-
- while (begin != str.data() && !scanner.Final(state) && !scanner.Dead(state)) {
- --begin;
- scanner.Next(state, (unsigned char)*begin);
- PIRE_IFDEBUG(Cdbg << *rbegin << " => state " << StDump(scanner, state) << Endl);
- }
- if (throughBeginMark)
- Step(scanner, state, BeginMark);
- return scanner.Final(state) ? str.substr(begin - str.data()) : std::string_view{};
+ auto begin = str.data() + str.size(); // cursor starts one past the last byte and moves towards str.data()
+ typename Scanner::State state;
+ scanner.Initialize(state);
+ if (throughEndMark) // the string is scanned backwards, so EndMark is fed first
+ Step(scanner, state, EndMark);
+ PIRE_IFDEBUG(Cdbg << "Running ShortestSuffix on string " << ystring(str) << Endl);
+ PIRE_IFDEBUG(Cdbg << "Initial state " << StDump(scanner, state) << Endl);
+
+ while (begin != str.data() && !scanner.Final(state) && !scanner.Dead(state)) { // stop at the first final (or dead) state
+ --begin;
+ scanner.Next(state, (unsigned char)*begin);
+ PIRE_IFDEBUG(Cdbg << *begin << " => state " << StDump(scanner, state) << Endl); // trace the byte just consumed; the cursor in this overload is `begin` (no `rbegin` is declared here)
+ }
+ if (throughBeginMark)
+ Step(scanner, state, BeginMark);
+ return scanner.Final(state) ? str.substr(begin - str.data()) : std::string_view{}; // default-constructed view when no suffix matches
}
template<class Scanner>
inline const char* ShortestSuffix(const Scanner& scanner, const char* rbegin, const char* rend, bool throughEndMark = false, bool throughBeginMark = false) {
- auto suffix = ShortestSuffix(scanner, std::string_view(rend + 1, rbegin - rend), throughEndMark, throughBeginMark);
- return suffix.data() ? suffix.data() - 1 : nullptr;
+ auto suffix = ShortestSuffix(scanner, std::string_view(rend + 1, rbegin - rend), throughEndMark, throughBeginMark); // the scanned bytes are rend + 1 .. rbegin inclusive
+ return suffix.data() ? suffix.data() - 1 : nullptr; // one byte before the suffix start, or nullptr when the view has no data (no match)
}
template<class Scanner>
class RunHelper {
public:
- RunHelper(const Scanner& sc, typename Scanner::State st): Sc(&sc), St(st) {}
- explicit RunHelper(const Scanner& sc): Sc(&sc) { Sc->Initialize(St); }
+ RunHelper(const Scanner& sc, typename Scanner::State st): Sc(&sc), St(st) {} // continue from a caller-supplied state
+ explicit RunHelper(const Scanner& sc): Sc(&sc) { Sc->Initialize(St); } // start from the scanner's initial state
- RunHelper<Scanner>& Step(Char letter) { Pire::Step(*Sc, St, letter); return *this; }
- RunHelper<Scanner>& Run(TStringBuf str) { Pire::Run(*Sc, St, str); return *this; }
- RunHelper<Scanner>& Run(const char* begin, const char* end) { return Run(TStringBuf(begin, end)); }
- RunHelper<Scanner>& Run(const char* begin, size_t size) { return Run(TStringBuf(begin, begin + size)); }
- RunHelper<Scanner>& Begin() { return Step(BeginMark); }
- RunHelper<Scanner>& End() { return Step(EndMark); }
+ RunHelper<Scanner>& Step(Char letter) { Pire::Step(*Sc, St, letter); return *this; } // every mutator returns *this so calls can be chained
+ RunHelper<Scanner>& Run(TStringBuf str) { Pire::Run(*Sc, St, str); return *this; }
+ RunHelper<Scanner>& Run(const char* begin, const char* end) { return Run(TStringBuf(begin, end)); }
+ RunHelper<Scanner>& Run(const char* begin, size_t size) { return Run(TStringBuf(begin, begin + size)); }
+ RunHelper<Scanner>& Begin() { return Step(BeginMark); } // feeds the BeginMark letter
+ RunHelper<Scanner>& End() { return Step(EndMark); } // feeds the EndMark letter
- const typename Scanner::State& State() const { return St; }
- struct Tag {};
- operator const Tag*() const { return Sc->Final(St) ? (const Tag*) this : 0; }
- bool operator ! () const { return !Sc->Final(St); }
+ const typename Scanner::State& State() const { return St; }
+ struct Tag {};
+ operator const Tag*() const { return Sc->Final(St) ? (const Tag*) this : 0; } // boolean-like test: non-null exactly when the current state is final
+ bool operator ! () const { return !Sc->Final(St); }
private:
- const Scanner* Sc;
- typename Scanner::State St;
+ const Scanner* Sc; // address of the scanner passed to the constructor; the scanner itself is not copied
+ typename Scanner::State St; // current state, stored by value
};
template<class Scanner>
@@ -442,22 +442,22 @@ RunHelper<Scanner> Runner(const Scanner& sc, typename Scanner::State st) { retur
template<class Scanner>
bool Matches(const Scanner& scanner, TStringBuf str)
{
- return Runner(scanner).Run(str);
+ return Runner(scanner).Run(str); // the returned RunHelper converts to true exactly when the state after the run is final
}
template<class Scanner>
bool Matches(const Scanner& scanner, const char* begin, const char* end)
{
- return Runner(scanner).Run(TStringBuf(begin, end));
+ return Runner(scanner).Run(TStringBuf(begin, end)); // pointer-pair variant: same check over [begin, end)
}
/// Constructs an inline scanner in one statement
template<class Scanner>
Scanner MmappedScanner(const char* ptr, size_t size)
{
- Scanner s;
- s.Mmap(ptr, size);
- return s;
+ Scanner s; // default-construct, then map it onto the caller's buffer
+ s.Mmap(ptr, size); // NOTE(review): presumably the scanner keeps referring to [ptr, ptr + size) afterwards — confirm the lifetime requirement against Mmap()
+ return s;
}
}
diff --git a/library/cpp/regex/pire/pire/scanner_io.cpp b/library/cpp/regex/pire/pire/scanner_io.cpp
index 65cf9a1a93f..af7cfde3d48 100644
--- a/library/cpp/regex/pire/pire/scanner_io.cpp
+++ b/library/cpp/regex/pire/pire/scanner_io.cpp
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -31,187 +31,187 @@
#include "align.h"
namespace Pire {
-
+
void SimpleScanner::Save(yostream* s) const
{
- SavePodType(s, Header(ScannerIOTypes::SimpleScanner, sizeof(m)));
- Impl::AlignSave(s, sizeof(Header));
- Locals mc = m;
- mc.initial -= reinterpret_cast<size_t>(m_transitions);
- SavePodType(s, mc);
- Impl::AlignSave(s, sizeof(mc));
- SavePodType(s, Empty());
- Impl::AlignSave(s, sizeof(Empty()));
- if (!Empty()) {
- Y_ASSERT(m_buffer);
- Impl::AlignedSaveArray(s, m_buffer.Get(), BufSize());
- }
+ SavePodType(s, Header(ScannerIOTypes::SimpleScanner, sizeof(m))); // stream layout: header, locals, empty flag, then the buffer
+ Impl::AlignSave(s, sizeof(Header));
+ Locals mc = m; // adjust a copy so the live scanner is left untouched
+ mc.initial -= reinterpret_cast<size_t>(m_transitions); // store `initial` relative to m_transitions; Load() adds the new base back
+ SavePodType(s, mc);
+ Impl::AlignSave(s, sizeof(mc));
+ SavePodType(s, Empty());
+ Impl::AlignSave(s, sizeof(Empty()));
+ if (!Empty()) { // an empty scanner has no buffer to write
+ Y_ASSERT(m_buffer);
+ Impl::AlignedSaveArray(s, m_buffer.Get(), BufSize());
+ }
}
void SimpleScanner::Load(yistream* s)
{
- SimpleScanner sc;
- Impl::ValidateHeader(s, ScannerIOTypes::SimpleScanner, sizeof(sc.m));
- LoadPodType(s, sc.m);
- Impl::AlignLoad(s, sizeof(sc.m));
- bool empty;
- LoadPodType(s, empty);
- Impl::AlignLoad(s, sizeof(empty));
- if (empty) {
- sc.Alias(Null());
- } else {
- sc.m_buffer = BufferType(new char[sc.BufSize()]);
- Impl::AlignedLoadArray(s, sc.m_buffer.Get(), sc.BufSize());
- sc.Markup(sc.m_buffer.Get());
- sc.m.initial += reinterpret_cast<size_t>(sc.m_transitions);
- }
- Swap(sc);
+ SimpleScanner sc; // load into a temporary; *this is replaced only by the final Swap()
+ Impl::ValidateHeader(s, ScannerIOTypes::SimpleScanner, sizeof(sc.m));
+ LoadPodType(s, sc.m);
+ Impl::AlignLoad(s, sizeof(sc.m));
+ bool empty;
+ LoadPodType(s, empty);
+ Impl::AlignLoad(s, sizeof(empty));
+ if (empty) {
+ sc.Alias(Null()); // an empty scanner carries no buffer in the stream
+ } else {
+ sc.m_buffer = BufferType(new char[sc.BufSize()]); // buffer size is derived from the locals just read
+ Impl::AlignedLoadArray(s, sc.m_buffer.Get(), sc.BufSize());
+ sc.Markup(sc.m_buffer.Get());
+ sc.m.initial += reinterpret_cast<size_t>(sc.m_transitions); // undo the relative encoding applied by Save()
+ }
+ Swap(sc);
}
void SlowScanner::Save(yostream* s) const
{
- SavePodType(s, Header(ScannerIOTypes::SlowScanner, sizeof(m)));
- Impl::AlignSave(s, sizeof(Header));
- SavePodType(s, m);
- Impl::AlignSave(s, sizeof(m));
- SavePodType(s, Empty());
- Impl::AlignSave(s, sizeof(Empty()));
- if (!Empty()) {
- Y_ASSERT(!m_vec.empty());
- Impl::AlignedSaveArray(s, m_letters, MaxChar);
- Impl::AlignedSaveArray(s, m_finals, m.statesCount);
-
- size_t c = 0;
- SavePodType<size_t>(s, 0);
- for (auto&& i : m_vec) {
- size_t n = c + i.size();
- SavePodType(s, n);
- c = n;
- }
- Impl::AlignSave(s, (m_vec.size() + 1) * sizeof(size_t));
-
- size_t size = 0;
- for (auto&& i : m_vec)
- if (!i.empty()) {
- SavePodArray(s, &(i)[0], i.size());
- size += sizeof(unsigned) * i.size();
- }
- Impl::AlignSave(s, size);
- if (need_actions) {
- size_t pos = 0;
- for (TVector< TVector< Action > >::const_iterator i = m_actionsvec.begin(), ie = m_actionsvec.end(); i != ie; ++i)
- if (!i->empty()) {
- SavePodArray(s, &(*i)[0], i->size());
- pos += sizeof(Action) * i->size();
- }
- Impl::AlignSave(s, pos);
- }
- }
+ SavePodType(s, Header(ScannerIOTypes::SlowScanner, sizeof(m))); // stream layout: header, locals, empty flag, then the tables below
+ Impl::AlignSave(s, sizeof(Header));
+ SavePodType(s, m);
+ Impl::AlignSave(s, sizeof(m));
+ SavePodType(s, Empty());
+ Impl::AlignSave(s, sizeof(Empty()));
+ if (!Empty()) {
+ Y_ASSERT(!m_vec.empty());
+ Impl::AlignedSaveArray(s, m_letters, MaxChar);
+ Impl::AlignedSaveArray(s, m_finals, m.statesCount);
+
+ size_t c = 0; // running total of elements written so far
+ SavePodType<size_t>(s, 0); // offset table starts with 0 ...
+ for (auto&& i : m_vec) {
+ size_t n = c + i.size();
+ SavePodType(s, n); // ... followed by one cumulative end offset per m_vec entry
+ c = n;
+ }
+ Impl::AlignSave(s, (m_vec.size() + 1) * sizeof(size_t)); // pad after the m_vec.size() + 1 offsets
+
+ size_t size = 0; // bytes of transition data written, needed for the padding below
+ for (auto&& i : m_vec)
+ if (!i.empty()) {
+ SavePodArray(s, &(i)[0], i.size());
+ size += sizeof(unsigned) * i.size();
+ }
+ Impl::AlignSave(s, size);
+ if (need_actions) { // the action table is written only when actions are enabled
+ size_t pos = 0;
+ for (TVector< TVector< Action > >::const_iterator i = m_actionsvec.begin(), ie = m_actionsvec.end(); i != ie; ++i)
+ if (!i->empty()) {
+ SavePodArray(s, &(*i)[0], i->size());
+ pos += sizeof(Action) * i->size();
+ }
+ Impl::AlignSave(s, pos);
+ }
+ }
}
void SlowScanner::Load(yistream* s)
{
- SlowScanner sc;
- Impl::ValidateHeader(s, ScannerIOTypes::SlowScanner, sizeof(sc.m));
- LoadPodType(s, sc.m);
- Impl::AlignLoad(s, sizeof(sc.m));
- bool empty;
- LoadPodType(s, empty);
- Impl::AlignLoad(s, sizeof(empty));
- sc.need_actions = need_actions;
- if (empty) {
- sc.Alias(Null());
- } else {
- sc.m_vec.resize(sc.m.lettersCount * sc.m.statesCount);
- if (sc.need_actions)
- sc.m_actionsvec.resize(sc.m.lettersCount * sc.m.statesCount);
- sc.m_vecptr = &sc.m_vec;
-
- sc.alloc(sc.m_letters, MaxChar);
- Impl::AlignedLoadArray(s, sc.m_letters, MaxChar);
-
- sc.alloc(sc.m_finals, sc.m.statesCount);
- Impl::AlignedLoadArray(s, sc.m_finals, sc.m.statesCount);
-
- size_t c;
- LoadPodType(s, c);
- auto act = sc.m_actionsvec.begin();
- for (auto&& i : sc.m_vec) {
- size_t n;
- LoadPodType(s, n);
- i.resize(n - c);
- if (sc.need_actions) {
- act->resize(n - c);
- ++act;
- }
- c = n;
- }
- Impl::AlignLoad(s, (m_vec.size() + 1) * sizeof(size_t));
-
- size_t size = 0;
- for (auto&& i : sc.m_vec)
- if (!i.empty()) {
- LoadPodArray(s, &(i)[0], i.size());
- size += sizeof(unsigned) * i.size();
- }
- Impl::AlignLoad(s, size);
- size_t actSize = 0;
- if (sc.need_actions) {
- for (auto&& i : sc.m_actionsvec) {
- if (!i.empty()) {
- LoadPodArray(s, &(i)[0], i.size());
- actSize += sizeof(Action) * i.size();
- }
- }
- Impl::AlignLoad(s, actSize);
- }
- }
- Swap(sc);
+ SlowScanner sc; // load into a temporary; *this is replaced only by the final Swap()
+ Impl::ValidateHeader(s, ScannerIOTypes::SlowScanner, sizeof(sc.m));
+ LoadPodType(s, sc.m);
+ Impl::AlignLoad(s, sizeof(sc.m));
+ bool empty;
+ LoadPodType(s, empty);
+ Impl::AlignLoad(s, sizeof(empty));
+ sc.need_actions = need_actions; // need_actions is taken from *this, not read from the stream
+ if (empty) {
+ sc.Alias(Null());
+ } else {
+ sc.m_vec.resize(sc.m.lettersCount * sc.m.statesCount); // one transition list per (letter, state) pair
+ if (sc.need_actions)
+ sc.m_actionsvec.resize(sc.m.lettersCount * sc.m.statesCount);
+ sc.m_vecptr = &sc.m_vec;
+
+ sc.alloc(sc.m_letters, MaxChar);
+ Impl::AlignedLoadArray(s, sc.m_letters, MaxChar);
+
+ sc.alloc(sc.m_finals, sc.m.statesCount);
+ Impl::AlignedLoadArray(s, sc.m_finals, sc.m.statesCount);
+
+ size_t c; // previous cumulative offset; the first value in the table is the leading 0 written by Save()
+ LoadPodType(s, c);
+ auto act = sc.m_actionsvec.begin();
+ for (auto&& i : sc.m_vec) {
+ size_t n;
+ LoadPodType(s, n);
+ i.resize(n - c); // the difference of consecutive offsets is this entry's length
+ if (sc.need_actions) {
+ act->resize(n - c);
+ ++act;
+ }
+ c = n;
+ }
+ Impl::AlignLoad(s, (sc.m_vec.size() + 1) * sizeof(size_t)); // padding must be computed from the table just read (sc.m_vec), not from this->m_vec, which still describes the scanner being replaced
+
+ size_t size = 0; // bytes of transition data read, needed for the padding below
+ for (auto&& i : sc.m_vec)
+ if (!i.empty()) {
+ LoadPodArray(s, &(i)[0], i.size());
+ size += sizeof(unsigned) * i.size();
+ }
+ Impl::AlignLoad(s, size);
+ size_t actSize = 0;
+ if (sc.need_actions) { // mirrors the optional action table written by Save()
+ for (auto&& i : sc.m_actionsvec) {
+ if (!i.empty()) {
+ LoadPodArray(s, &(i)[0], i.size());
+ actSize += sizeof(Action) * i.size();
+ }
+ }
+ Impl::AlignLoad(s, actSize);
+ }
+ }
+ Swap(sc);
}
void LoadedScanner::Save(yostream* s) const {
- Save(s, ScannerIOTypes::LoadedScanner);
+ Save(s, ScannerIOTypes::LoadedScanner); // default overload: tags the stream with the LoadedScanner type id
}
void LoadedScanner::Save(yostream* s, ui32 type) const
{
- Y_ASSERT(type == ScannerIOTypes::LoadedScanner || type == ScannerIOTypes::NoGlueLimitCountingScanner);
- SavePodType(s, Header(type, sizeof(m)));
- Impl::AlignSave(s, sizeof(Header));
- Locals mc = m;
- mc.initial -= reinterpret_cast<size_t>(m_jumps);
- SavePodType(s, mc);
- Impl::AlignSave(s, sizeof(mc));
-
- Impl::AlignedSaveArray(s, m_letters, MaxChar);
- Impl::AlignedSaveArray(s, m_jumps, m.statesCount * m.lettersCount);
- Impl::AlignedSaveArray(s, m_tags, m.statesCount);
+ Y_ASSERT(type == ScannerIOTypes::LoadedScanner || type == ScannerIOTypes::NoGlueLimitCountingScanner); // only these two type ids are accepted here
+ SavePodType(s, Header(type, sizeof(m)));
+ Impl::AlignSave(s, sizeof(Header));
+ Locals mc = m; // adjust a copy so the live scanner is left untouched
+ mc.initial -= reinterpret_cast<size_t>(m_jumps); // store `initial` relative to m_jumps; Load() adds the new base back
+ SavePodType(s, mc);
+ Impl::AlignSave(s, sizeof(mc));
+
+ Impl::AlignedSaveArray(s, m_letters, MaxChar); // three tables follow: letters, jumps, tags
+ Impl::AlignedSaveArray(s, m_jumps, m.statesCount * m.lettersCount);
+ Impl::AlignedSaveArray(s, m_tags, m.statesCount);
}
void LoadedScanner::Load(yistream* s) {
- Load(s, nullptr);
+ Load(s, nullptr); // nullptr: the caller does not need the stored type id reported back
}
void LoadedScanner::Load(yistream* s, ui32* type)
{
- LoadedScanner sc;
- Header header = Impl::ValidateHeader(s, ScannerIOTypes::LoadedScanner, sizeof(sc.m));
- if (type) {
- *type = header.Type;
- }
- LoadPodType(s, sc.m);
- Impl::AlignLoad(s, sizeof(sc.m));
- sc.m_buffer = BufferType(new char[sc.BufSize()]);
- sc.Markup(sc.m_buffer.Get());
- Impl::AlignedLoadArray(s, sc.m_letters, MaxChar);
- Impl::AlignedLoadArray(s, sc.m_jumps, sc.m.statesCount * sc.m.lettersCount);
- if (header.Version == Header::RE_VERSION_WITH_MACTIONS) {
- TVector<Action> actions(sc.m.statesCount * sc.m.lettersCount);
- Impl::AlignedLoadArray(s, actions.data(), actions.size());
- }
- Impl::AlignedLoadArray(s, sc.m_tags, sc.m.statesCount);
- sc.m.initial += reinterpret_cast<size_t>(sc.m_jumps);
- Swap(sc);
+ LoadedScanner sc; // load into a temporary; *this is replaced only by the final Swap()
+ Header header = Impl::ValidateHeader(s, ScannerIOTypes::LoadedScanner, sizeof(sc.m));
+ if (type) { // optional out-parameter: report the type id found in the stream
+ *type = header.Type;
+ }
+ LoadPodType(s, sc.m);
+ Impl::AlignLoad(s, sizeof(sc.m));
+ sc.m_buffer = BufferType(new char[sc.BufSize()]); // buffer size is derived from the locals just read
+ sc.Markup(sc.m_buffer.Get());
+ Impl::AlignedLoadArray(s, sc.m_letters, MaxChar);
+ Impl::AlignedLoadArray(s, sc.m_jumps, sc.m.statesCount * sc.m.lettersCount);
+ if (header.Version == Header::RE_VERSION_WITH_MACTIONS) { // older format: an actions table sits here; it is read into a temporary and discarded
+ TVector<Action> actions(sc.m.statesCount * sc.m.lettersCount);
+ Impl::AlignedLoadArray(s, actions.data(), actions.size());
+ }
+ Impl::AlignedLoadArray(s, sc.m_tags, sc.m.statesCount);
+ sc.m.initial += reinterpret_cast<size_t>(sc.m_jumps); // undo the relative encoding applied by Save()
+ Swap(sc);
}
}
diff --git a/library/cpp/regex/pire/pire/scanners/common.h b/library/cpp/regex/pire/pire/scanners/common.h
index 4cffca5072f..59b4dcd9699 100644
--- a/library/cpp/regex/pire/pire/scanners/common.h
+++ b/library/cpp/regex/pire/pire/scanners/common.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -30,94 +30,94 @@
#include <library/cpp/regex/pire/pire/platform.h>
namespace Pire {
- namespace ScannerIOTypes {
- enum {
- NoScanner = 0,
- Scanner = 1,
- SimpleScanner = 2,
- SlowScanner = 3,
- LoadedScanner = 4,
- NoGlueLimitCountingScanner = 5,
- };
- }
+ namespace ScannerIOTypes { // type ids stored in Header::Type of a serialized scanner
+ enum {
+ NoScanner = 0, // Header::Validate() skips the type check when this is the expected type
+ Scanner = 1,
+ SimpleScanner = 2,
+ SlowScanner = 3,
+ LoadedScanner = 4,
+ NoGlueLimitCountingScanner = 5, // Header::Validate() also accepts this when LoadedScanner is expected
+ };
+ }
- struct Header {
- ui32 Magic;
- ui32 Version;
- ui32 PtrSize;
- ui32 MaxWordSize;
- ui32 Type;
- ui32 HdrSize;
+ struct Header { // fixed-size prefix written in front of every serialized scanner
+ ui32 Magic;
+ ui32 Version;
+ ui32 PtrSize; // sizeof(void*) of the writer
+ ui32 MaxWordSize; // sizeof(Impl::MaxSizeWord) of the writer
+ ui32 Type; // one of the ScannerIOTypes values
+ ui32 HdrSize; // size of the scanner-specific locals struct, as passed by the writer
- static const ui32 MAGIC = 0x45524950; // "PIRE" on litte-endian
- static const ui32 RE_VERSION = 7; // Should be incremented each time when the format of serialized scanner changes
- static const ui32 RE_VERSION_WITH_MACTIONS = 6; // LoadedScanner with m_actions, which is ignored
+ static const ui32 MAGIC = 0x45524950; // "PIRE" on little-endian
+ static const ui32 RE_VERSION = 7; // Should be incremented each time when the format of serialized scanner changes
+ static const ui32 RE_VERSION_WITH_MACTIONS = 6; // LoadedScanner with m_actions, which is ignored
- explicit Header(ui32 type, size_t hdrsize)
- : Magic(MAGIC)
- , Version(RE_VERSION)
- , PtrSize(sizeof(void*))
- , MaxWordSize(sizeof(Impl::MaxSizeWord))
- , Type(type)
- , HdrSize((ui32)hdrsize)
- {}
+ explicit Header(ui32 type, size_t hdrsize) // builds a header describing the current build
+ : Magic(MAGIC)
+ , Version(RE_VERSION)
+ , PtrSize(sizeof(void*))
+ , MaxWordSize(sizeof(Impl::MaxSizeWord))
+ , Type(type)
+ , HdrSize((ui32)hdrsize)
+ {}
- void Validate(ui32 type, size_t hdrsize) const
- {
- if (Magic != MAGIC || PtrSize != sizeof(void*) || MaxWordSize != sizeof(Impl::MaxSizeWord))
- throw Error("Serialized regexp incompatible with your system");
- if (Version != RE_VERSION && Version != RE_VERSION_WITH_MACTIONS)
- throw Error("You are trying to used an incompatible version of a serialized regexp");
- if (type != ScannerIOTypes::NoScanner && type != Type &&
- !(type == ScannerIOTypes::LoadedScanner && Type == ScannerIOTypes::NoGlueLimitCountingScanner)) {
- throw Error("Serialized regexp incompatible with your system");
- }
- if (hdrsize != 0 && HdrSize != hdrsize)
- throw Error("Serialized regexp incompatible with your system");
- }
- };
+ void Validate(ui32 type, size_t hdrsize) const // throws Error when this header does not match the expected type/size or the current build
+ {
+ if (Magic != MAGIC || PtrSize != sizeof(void*) || MaxWordSize != sizeof(Impl::MaxSizeWord))
+ throw Error("Serialized regexp incompatible with your system");
+ if (Version != RE_VERSION && Version != RE_VERSION_WITH_MACTIONS) // both the current and the previous format version are accepted
+ throw Error("You are trying to used an incompatible version of a serialized regexp");
+ if (type != ScannerIOTypes::NoScanner && type != Type && // NoScanner as the expected type disables the type check
+ !(type == ScannerIOTypes::LoadedScanner && Type == ScannerIOTypes::NoGlueLimitCountingScanner)) {
+ throw Error("Serialized regexp incompatible with your system");
+ }
+ if (hdrsize != 0 && HdrSize != hdrsize) // an expected size of 0 disables the size check
+ throw Error("Serialized regexp incompatible with your system");
+ }
+ };
- namespace Impl {
- inline const void* AdvancePtr(const size_t*& ptr, size_t& size, size_t delta)
- {
- ptr = (const size_t*) ((const char*) ptr + delta);
- size -= delta;
- return (const void*) ptr;
- }
+ namespace Impl {
+ inline const void* AdvancePtr(const size_t*& ptr, size_t& size, size_t delta) // moves ptr forward by delta bytes and shrinks the remaining size accordingly
+ {
+ ptr = (const size_t*) ((const char*) ptr + delta);
+ size -= delta; // no bounds check here; MapPtr() checks before calling
+ return (const void*) ptr;
+ }
- template<class T>
- inline void MapPtr(T*& field, size_t count, const size_t*& p, size_t& size)
- {
- if (size < count * sizeof(*field))
- throw Error("EOF reached while mapping Pire::SlowScanner");
- field = (T*) p;
- Impl::AdvancePtr(p, size, count * sizeof(*field));
- Impl::AlignPtr(p, size);
- }
+ template<class T>
+ inline void MapPtr(T*& field, size_t count, const size_t*& p, size_t& size) // points field at the next count elements of the mapped region, then advances and realigns p
+ {
+ if (size < count * sizeof(*field)) // NOTE(review): the multiplication is not checked for overflow — confirm count is trusted
+ throw Error("EOF reached while mapping Pire::SlowScanner");
+ field = (T*) p;
+ Impl::AdvancePtr(p, size, count * sizeof(*field));
+ Impl::AlignPtr(p, size);
+ }
- inline void CheckAlign(const void* ptr, size_t bound = sizeof(size_t))
- {
- if (!IsAligned(ptr, bound))
- throw Error("Tried to mmap scanner at misaligned address");
- }
+ inline void CheckAlign(const void* ptr, size_t bound = sizeof(size_t)) // throws Error unless IsAligned(ptr, bound) holds
+ {
+ if (!IsAligned(ptr, bound))
+ throw Error("Tried to mmap scanner at misaligned address");
+ }
- inline Header ValidateHeader(const size_t*& ptr, size_t& size, ui32 type, size_t hdrsize)
- {
- const Header* hdr;
- MapPtr(hdr, 1, ptr, size);
- hdr->Validate(type, hdrsize);
- return *hdr;
- }
+ inline Header ValidateHeader(const size_t*& ptr, size_t& size, ui32 type, size_t hdrsize) // memory-mapped variant: maps one Header at ptr, validates it and returns a copy
+ {
+ const Header* hdr;
+ MapPtr(hdr, 1, ptr, size);
+ hdr->Validate(type, hdrsize);
+ return *hdr;
+ }
- inline Header ValidateHeader(yistream* s, ui32 type, size_t hdrsize)
- {
- Header hdr(ScannerIOTypes::NoScanner, 0);
- LoadPodType(s, hdr);
- AlignLoad(s, sizeof(hdr));
- hdr.Validate(type, hdrsize);
- return hdr;
- }
- }
+ inline Header ValidateHeader(yistream* s, ui32 type, size_t hdrsize) // stream variant: reads one Header from s, validates it and returns it
+ {
+ Header hdr(ScannerIOTypes::NoScanner, 0); // placeholder values, overwritten by the load below
+ LoadPodType(s, hdr);
+ AlignLoad(s, sizeof(hdr));
+ hdr.Validate(type, hdrsize);
+ return hdr;
+ }
+ }
}
#endif
diff --git a/library/cpp/regex/pire/pire/scanners/half_final.h b/library/cpp/regex/pire/pire/scanners/half_final.h
index ea47a0118f3..b9c152b7d42 100644
--- a/library/cpp/regex/pire/pire/scanners/half_final.h
+++ b/library/cpp/regex/pire/pire/scanners/half_final.h
@@ -32,198 +32,198 @@ namespace Impl {
template<typename Relocation, typename Shortcutting>
class HalfFinalScanner : public Scanner<Relocation, Shortcutting> {
public:
- typedef typename Impl::Scanner<Relocation, Shortcutting> Scanner;
-
- HalfFinalScanner() : Scanner() {}
-
- explicit HalfFinalScanner(Fsm fsm_, size_t distance = 0) {
- if (distance) {
- fsm_ = CreateApproxFsm(fsm_, distance);
- }
- HalfFinalFsm fsm(fsm_);
- fsm.MakeScanner();
- Scanner::Init(fsm.GetFsm().Size(), fsm.GetFsm().Letters(), fsm.GetFsm().Finals().size(), fsm.GetFsm().Initial(), 1);
- BuildScanner(fsm.GetFsm(), *this);
- }
-
- explicit HalfFinalScanner(const HalfFinalFsm& fsm) {
- Scanner::Init(fsm.GetFsm().Size(), fsm.GetFsm().Letters(), fsm.GetTotalCount(), fsm.GetFsm().Initial(), 1);
- BuildScanner(fsm.GetFsm(), *this);
- BuildFinals(fsm);
- }
-
- typedef typename Scanner::ScannerRowHeader ScannerRowHeader;
- typedef typename Scanner::Action Action;
-
- class State {
- public:
- typedef TVector<size_t>::const_iterator IdsIterator;
-
- State() : ScannerState(0) {}
-
- State(const typename Scanner::State& otherState) : ScannerState(otherState) {}
-
- void GetMatchedRegexpsIds() {
- MatchedRegexpsIds.clear();
- for (size_t i = 0; i < MatchedRegexps.size(); i++) {
- if (MatchedRegexps[i]) {
- MatchedRegexpsIds.push_back(i);
- }
- }
- }
-
- IdsIterator IdsBegin() const {
- return MatchedRegexpsIds.cbegin();
- }
-
- IdsIterator IdsEnd() const {
- return MatchedRegexpsIds.cend();
- }
-
- bool operator==(const State& other) const {
- return ScannerState == other.ScannerState && MatchedRegexps == other.MatchedRegexps;
- }
-
- bool operator!=(const State& other) const {
- return ScannerState != other.ScannerState || MatchedRegexps != other.MatchedRegexps;
- }
-
- size_t Result(size_t regexp_id) const {
- return MatchedRegexps[regexp_id];
- }
-
- void Save(yostream* s) const {
- SavePodType(s, Pire::Header(5, sizeof(size_t)));
- Impl::AlignSave(s, sizeof(Pire::Header));
- auto stateSizePair = ymake_pair(ScannerState, MatchedRegexps.size());
- SavePodType(s, stateSizePair);
- Impl::AlignSave(s, sizeof(ypair<size_t, size_t>));
- Y_ASSERT(0);
- }
-
- void Load(yistream* s) {
- Impl::ValidateHeader(s, 5, sizeof(size_t));
- ypair<size_t, size_t> stateSizePair;
- LoadPodType(s, stateSizePair);
- Impl::AlignLoad(s, sizeof(ypair<size_t, size_t>));
- ScannerState = stateSizePair.first;
- MatchedRegexps.clear();
- MatchedRegexps.resize(stateSizePair.second);
- }
-
- private:
- TVector<size_t> MatchedRegexpsIds;
- typename Scanner::State ScannerState;
- TVector<size_t> MatchedRegexps;
-
- friend class HalfFinalScanner<Relocation, Shortcutting>;
- };
-
-
- /// Checks whether specified state is in any of the final sets
- bool Final(const State& state) const { return Scanner::Final(state.ScannerState); }
-
- /// Checks whether specified state is 'dead' (i.e. scanner will never
- /// reach any final state from current one)
- bool Dead(const State& state) const { return Scanner::Dead(state.ScannerState); }
-
- typedef ypair<typename State::IdsIterator, typename State::IdsIterator> AcceptedRegexpsType;
-
- AcceptedRegexpsType AcceptedRegexps(State& state) const {
- state.GetMatchedRegexpsIds();
- return ymake_pair(state.IdsBegin(), state.IdsEnd());
- }
-
- /// Returns an initial state for this scanner
- void Initialize(State& state) const {
- state.ScannerState = Scanner::m.initial;
- state.MatchedRegexps.clear();
- state.MatchedRegexps.resize(Scanner::m.regexpsCount);
- TakeAction(state, 0);
- }
-
- Action NextTranslated(State& state, Char letter) const {
- return Scanner::NextTranslated(state.ScannerState, letter);
- }
-
- /// Handles one character
- Action Next(State& state, Char c) const {
- return Scanner::NextTranslated(state.ScannerState, Scanner::Translate(c));
- }
-
- void TakeAction(State& state, Action) const {
- if (Final(state)) {
- size_t idx = StateIndex(state);
- const size_t *it = Scanner::m_final + Scanner::m_finalIndex[idx];
- while (*it != Scanner::End) {
- state.MatchedRegexps[*it]++;
- ++it;
- }
- }
- }
-
- HalfFinalScanner(const HalfFinalScanner& s) : Scanner(s) {}
-
- HalfFinalScanner(const Scanner& s) : Scanner(s) {}
-
- HalfFinalScanner(HalfFinalScanner&& s) : Scanner(s) {}
-
- HalfFinalScanner(Scanner&& s) : Scanner(s) {}
-
- template<class AnotherRelocation>
- HalfFinalScanner(const HalfFinalScanner<AnotherRelocation, Shortcutting>& s)
- : Scanner(s) {}
-
- template<class AnotherRelocation>
- HalfFinalScanner(const Impl::Scanner<AnotherRelocation, Shortcutting>& s) : Scanner(s) {}
-
- void Swap(HalfFinalScanner& s) {
- Scanner::Swap(s);
- }
-
- HalfFinalScanner& operator=(const HalfFinalScanner& s) {
- HalfFinalScanner(s).Swap(*this);
- return *this;
- }
-
- size_t StateIndex(const State& s) const {
- return Scanner::StateIndex(s.ScannerState);
- }
-
- /**
- * Agglutinates two scanners together, producing a larger scanner.
- * Checking a string against that scanner effectively checks them against both agglutinated regexps
- * (detailed information about matched regexps can be obtained with AcceptedRegexps()).
- *
- * Returns default-constructed scanner in case of failure
- * (consult Scanner::Empty() to find out whether the operation was successful).
- */
- static HalfFinalScanner Glue(const HalfFinalScanner& a, const HalfFinalScanner& b, size_t maxSize = 0) {
- return Scanner::Glue(a, b, maxSize);
- }
-
- ScannerRowHeader& Header(const State& s) { return Scanner::Header(s.ScannerState); }
-
- const ScannerRowHeader& Header(const State& s) const { return Scanner::Header(s.ScannerState); }
+ typedef typename Impl::Scanner<Relocation, Shortcutting> Scanner;
+
+ HalfFinalScanner() : Scanner() {}
+
+ explicit HalfFinalScanner(Fsm fsm_, size_t distance = 0) {
+ if (distance) {
+ fsm_ = CreateApproxFsm(fsm_, distance);
+ }
+ HalfFinalFsm fsm(fsm_);
+ fsm.MakeScanner();
+ Scanner::Init(fsm.GetFsm().Size(), fsm.GetFsm().Letters(), fsm.GetFsm().Finals().size(), fsm.GetFsm().Initial(), 1);
+ BuildScanner(fsm.GetFsm(), *this);
+ }
+
+ explicit HalfFinalScanner(const HalfFinalFsm& fsm) {
+ Scanner::Init(fsm.GetFsm().Size(), fsm.GetFsm().Letters(), fsm.GetTotalCount(), fsm.GetFsm().Initial(), 1);
+ BuildScanner(fsm.GetFsm(), *this);
+ BuildFinals(fsm);
+ }
+
+ typedef typename Scanner::ScannerRowHeader ScannerRowHeader;
+ typedef typename Scanner::Action Action;
+
+ class State {
+ public:
+ typedef TVector<size_t>::const_iterator IdsIterator;
+
+ State() : ScannerState(0) {}
+
+ State(const typename Scanner::State& otherState) : ScannerState(otherState) {}
+
+ void GetMatchedRegexpsIds() {
+ MatchedRegexpsIds.clear();
+ for (size_t i = 0; i < MatchedRegexps.size(); i++) {
+ if (MatchedRegexps[i]) {
+ MatchedRegexpsIds.push_back(i);
+ }
+ }
+ }
+
+ IdsIterator IdsBegin() const {
+ return MatchedRegexpsIds.cbegin();
+ }
+
+ IdsIterator IdsEnd() const {
+ return MatchedRegexpsIds.cend();
+ }
+
+ bool operator==(const State& other) const {
+ return ScannerState == other.ScannerState && MatchedRegexps == other.MatchedRegexps;
+ }
+
+ bool operator!=(const State& other) const {
+ return ScannerState != other.ScannerState || MatchedRegexps != other.MatchedRegexps;
+ }
+
+ size_t Result(size_t regexp_id) const {
+ return MatchedRegexps[regexp_id];
+ }
+
+ void Save(yostream* s) const {
+ SavePodType(s, Pire::Header(5, sizeof(size_t)));
+ Impl::AlignSave(s, sizeof(Pire::Header));
+ auto stateSizePair = ymake_pair(ScannerState, MatchedRegexps.size());
+ SavePodType(s, stateSizePair);
+ Impl::AlignSave(s, sizeof(ypair<size_t, size_t>));
+ Y_ASSERT(0);
+ }
+
+ void Load(yistream* s) {
+ Impl::ValidateHeader(s, 5, sizeof(size_t));
+ ypair<size_t, size_t> stateSizePair;
+ LoadPodType(s, stateSizePair);
+ Impl::AlignLoad(s, sizeof(ypair<size_t, size_t>));
+ ScannerState = stateSizePair.first;
+ MatchedRegexps.clear();
+ MatchedRegexps.resize(stateSizePair.second);
+ }
+
+ private:
+ TVector<size_t> MatchedRegexpsIds;
+ typename Scanner::State ScannerState;
+ TVector<size_t> MatchedRegexps;
+
+ friend class HalfFinalScanner<Relocation, Shortcutting>;
+ };
+
+
+ /// Checks whether specified state is in any of the final sets
+ bool Final(const State& state) const { return Scanner::Final(state.ScannerState); }
+
+ /// Checks whether specified state is 'dead' (i.e. scanner will never
+ /// reach any final state from current one)
+ bool Dead(const State& state) const { return Scanner::Dead(state.ScannerState); }
+
+ typedef ypair<typename State::IdsIterator, typename State::IdsIterator> AcceptedRegexpsType;
+
+ AcceptedRegexpsType AcceptedRegexps(State& state) const {
+ state.GetMatchedRegexpsIds();
+ return ymake_pair(state.IdsBegin(), state.IdsEnd());
+ }
+
+ /// Returns an initial state for this scanner
+ void Initialize(State& state) const {
+ state.ScannerState = Scanner::m.initial;
+ state.MatchedRegexps.clear();
+ state.MatchedRegexps.resize(Scanner::m.regexpsCount);
+ TakeAction(state, 0);
+ }
+
+ Action NextTranslated(State& state, Char letter) const {
+ return Scanner::NextTranslated(state.ScannerState, letter);
+ }
+
+ /// Handles one character
+ Action Next(State& state, Char c) const {
+ return Scanner::NextTranslated(state.ScannerState, Scanner::Translate(c));
+ }
+
+ void TakeAction(State& state, Action) const {
+ if (Final(state)) {
+ size_t idx = StateIndex(state);
+ const size_t *it = Scanner::m_final + Scanner::m_finalIndex[idx];
+ while (*it != Scanner::End) {
+ state.MatchedRegexps[*it]++;
+ ++it;
+ }
+ }
+ }
+
+ HalfFinalScanner(const HalfFinalScanner& s) : Scanner(s) {}
+
+ HalfFinalScanner(const Scanner& s) : Scanner(s) {}
+
+ HalfFinalScanner(HalfFinalScanner&& s) : Scanner(s) {}
+
+ HalfFinalScanner(Scanner&& s) : Scanner(s) {}
+
+ template<class AnotherRelocation>
+ HalfFinalScanner(const HalfFinalScanner<AnotherRelocation, Shortcutting>& s)
+ : Scanner(s) {}
+
+ template<class AnotherRelocation>
+ HalfFinalScanner(const Impl::Scanner<AnotherRelocation, Shortcutting>& s) : Scanner(s) {}
+
+ void Swap(HalfFinalScanner& s) {
+ Scanner::Swap(s);
+ }
+
+ HalfFinalScanner& operator=(const HalfFinalScanner& s) {
+ HalfFinalScanner(s).Swap(*this);
+ return *this;
+ }
+
+ size_t StateIndex(const State& s) const {
+ return Scanner::StateIndex(s.ScannerState);
+ }
+
+ /**
+ * Agglutinates two scanners together, producing a larger scanner.
+ * Checking a string against that scanner effectively checks them against both agglutinated regexps
+ * (detailed information about matched regexps can be obtained with AcceptedRegexps()).
+ *
+ * Returns default-constructed scanner in case of failure
+ * (consult Scanner::Empty() to find out whether the operation was successful).
+ */
+ static HalfFinalScanner Glue(const HalfFinalScanner& a, const HalfFinalScanner& b, size_t maxSize = 0) {
+ return Scanner::Glue(a, b, maxSize);
+ }
+
+ ScannerRowHeader& Header(const State& s) { return Scanner::Header(s.ScannerState); }
+
+ const ScannerRowHeader& Header(const State& s) const { return Scanner::Header(s.ScannerState); }
private:
- void BuildFinals(const HalfFinalFsm& fsm) {
- Y_ASSERT(Scanner::m_buffer);
- Y_ASSERT(fsm.GetFsm().Size() == Scanner::Size());
- auto finalWriter = Scanner::m_final;
- for (size_t state = 0; state < Scanner::Size(); ++state) {
- Scanner::m_finalIndex[state] = finalWriter - Scanner::m_final;
- for (size_t i = 0; i < fsm.GetCount(state); i++) {
- *finalWriter++ = 0;
- }
- *finalWriter++ = static_cast<size_t>(-1);
- }
- }
-
- template<class Scanner>
- friend void Pire::BuildScanner(const Fsm&, Scanner&);
-
- typedef State InternalState; // Needed for agglutination
+ void BuildFinals(const HalfFinalFsm& fsm) {
+ Y_ASSERT(Scanner::m_buffer);
+ Y_ASSERT(fsm.GetFsm().Size() == Scanner::Size());
+ auto finalWriter = Scanner::m_final;
+ for (size_t state = 0; state < Scanner::Size(); ++state) {
+ Scanner::m_finalIndex[state] = finalWriter - Scanner::m_final;
+ for (size_t i = 0; i < fsm.GetCount(state); i++) {
+ *finalWriter++ = 0;
+ }
+ *finalWriter++ = static_cast<size_t>(-1);
+ }
+ }
+
+ template<class Scanner>
+ friend void Pire::BuildScanner(const Fsm&, Scanner&);
+
+ typedef State InternalState; // Needed for agglutination
};
}
@@ -243,13 +243,13 @@ typedef Impl::HalfFinalScanner<Impl::Nonrelocatable, Impl::NoShortcuts> Nonreloc
namespace std {
- inline void swap(Pire::HalfFinalScanner& a, Pire::HalfFinalScanner& b) {
- a.Swap(b);
- }
+ inline void swap(Pire::HalfFinalScanner& a, Pire::HalfFinalScanner& b) {
+ a.Swap(b);
+ }
- inline void swap(Pire::NonrelocHalfFinalScanner& a, Pire::NonrelocHalfFinalScanner& b) {
- a.Swap(b);
- }
+ inline void swap(Pire::NonrelocHalfFinalScanner& a, Pire::NonrelocHalfFinalScanner& b) {
+ a.Swap(b);
+ }
}
#endif
diff --git a/library/cpp/regex/pire/pire/scanners/loaded.h b/library/cpp/regex/pire/pire/scanners/loaded.h
index 1c5c99c9be9..c505b1c9efb 100644
--- a/library/cpp/regex/pire/pire/scanners/loaded.h
+++ b/library/cpp/regex/pire/pire/scanners/loaded.h
@@ -53,237 +53,237 @@ namespace Pire {
*/
class LoadedScanner {
public:
- typedef ui8 Letter;
- typedef ui32 Action;
- typedef ui8 Tag;
+ typedef ui8 Letter;
+ typedef ui32 Action;
+ typedef ui8 Tag;
- typedef size_t InternalState;
+ typedef size_t InternalState;
- union Transition {
- size_t raw; // alignment hint for compiler
- struct {
- ui32 shift;
- Action action;
- };
- };
+ union Transition {
+ size_t raw; // alignment hint for compiler
+ struct {
+ ui32 shift;
+ Action action;
+ };
+ };
- // Override in subclass, if neccessary
- enum {
- FinalFlag = 0,
- DeadFlag = 0
- };
+ // Override in subclass, if neccessary
+ enum {
+ FinalFlag = 0,
+ DeadFlag = 0
+ };
- static const size_t MAX_RE_COUNT = 16;
+ static const size_t MAX_RE_COUNT = 16;
protected:
- LoadedScanner() { Alias(Null()); }
-
- LoadedScanner(const LoadedScanner& s): m(s.m)
- {
- if (s.m_buffer) {
- m_buffer = BufferType(new char [BufSize()]);
- memcpy(m_buffer.Get(), s.m_buffer.Get(), BufSize());
- Markup(m_buffer.Get());
- m.initial = (InternalState)m_jumps + (s.m.initial - (InternalState)s.m_jumps);
- } else {
- Alias(s);
- }
- }
-
- void Swap(LoadedScanner& s)
- {
- DoSwap(m_buffer, s.m_buffer);
- DoSwap(m.statesCount, s.m.statesCount);
- DoSwap(m.lettersCount, s.m.lettersCount);
- DoSwap(m.regexpsCount, s.m.regexpsCount);
- DoSwap(m.initial, s.m.initial);
- DoSwap(m_letters, s.m_letters);
- DoSwap(m_jumps, s.m_jumps);
- DoSwap(m_tags, s.m_tags);
- }
-
- LoadedScanner& operator = (const LoadedScanner& s) { LoadedScanner(s).Swap(*this); return *this; }
- LoadedScanner (LoadedScanner&& other) : LoadedScanner() {
- Swap(other);
- }
- LoadedScanner& operator=(LoadedScanner&& other) {
- Swap(other);
- return *this;
- }
+ LoadedScanner() { Alias(Null()); }
+
+ LoadedScanner(const LoadedScanner& s): m(s.m)
+ {
+ if (s.m_buffer) {
+ m_buffer = BufferType(new char [BufSize()]);
+ memcpy(m_buffer.Get(), s.m_buffer.Get(), BufSize());
+ Markup(m_buffer.Get());
+ m.initial = (InternalState)m_jumps + (s.m.initial - (InternalState)s.m_jumps);
+ } else {
+ Alias(s);
+ }
+ }
+
+ void Swap(LoadedScanner& s)
+ {
+ DoSwap(m_buffer, s.m_buffer);
+ DoSwap(m.statesCount, s.m.statesCount);
+ DoSwap(m.lettersCount, s.m.lettersCount);
+ DoSwap(m.regexpsCount, s.m.regexpsCount);
+ DoSwap(m.initial, s.m.initial);
+ DoSwap(m_letters, s.m_letters);
+ DoSwap(m_jumps, s.m_jumps);
+ DoSwap(m_tags, s.m_tags);
+ }
+
+ LoadedScanner& operator = (const LoadedScanner& s) { LoadedScanner(s).Swap(*this); return *this; }
+ LoadedScanner (LoadedScanner&& other) : LoadedScanner() {
+ Swap(other);
+ }
+ LoadedScanner& operator=(LoadedScanner&& other) {
+ Swap(other);
+ return *this;
+ }
public:
- size_t Size() const { return m.statesCount; }
-
- bool Empty() const { return m_jumps == Null().m_jumps; }
-
- size_t RegexpsCount() const { return Empty() ? 0 : m.regexpsCount; }
-
- size_t LettersCount() const { return m.lettersCount; }
-
- const void* Mmap(const void* ptr, size_t size) {
- return Mmap(ptr, size, nullptr);
- }
-
- const void* Mmap(const void* ptr, size_t size, ui32* type)
- {
- Impl::CheckAlign(ptr);
- LoadedScanner s;
- const size_t* p = reinterpret_cast<const size_t*>(ptr);
- Header header = Impl::ValidateHeader(p, size, ScannerIOTypes::LoadedScanner, sizeof(s.m));
- if (type) {
- *type = header.Type;
- }
-
- Locals* locals;
- Impl::MapPtr(locals, 1, p, size);
- memcpy(&s.m, locals, sizeof(s.m));
-
- Impl::MapPtr(s.m_letters, MaxChar, p, size);
- Impl::MapPtr(s.m_jumps, s.m.statesCount * s.m.lettersCount, p, size);
- if (header.Version == Header::RE_VERSION_WITH_MACTIONS) {
- Action* actions = 0;
- Impl::MapPtr(actions, s.m.statesCount * s.m.lettersCount, p, size);
- }
- Impl::MapPtr(s.m_tags, s.m.statesCount, p, size);
-
- s.m.initial += reinterpret_cast<size_t>(s.m_jumps);
- Swap(s);
-
- return (const void*) p;
- }
-
- void Save(yostream*, ui32 type) const;
- void Save(yostream*) const;
- void Load(yistream*, ui32* type);
- void Load(yistream*);
-
- template<class Eq>
- void Init(size_t states, const Partition<Char, Eq>& letters, size_t startState, size_t regexpsCount = 1)
- {
- m.statesCount = states;
- m.lettersCount = letters.Size();
- m.regexpsCount = regexpsCount;
- m_buffer = BufferType(new char[BufSize()]);
- memset(m_buffer.Get(), 0, BufSize());
- Markup(m_buffer.Get());
-
- m.initial = reinterpret_cast<size_t>(m_jumps + startState * m.lettersCount);
-
- // Build letter translation table
- Fill(m_letters, m_letters + MaxChar, 0);
- for (auto&& letter : letters)
- for (auto&& character : letter.second.second)
- m_letters[character] = letter.second.first;
- }
-
- size_t StateSize() const
- {
- return m.lettersCount * sizeof(*m_jumps);
- }
-
- size_t TransitionIndex(size_t state, Char c) const
- {
- return state * m.lettersCount + m_letters[c];
- }
-
- void SetJump(size_t oldState, Char c, size_t newState, Action action)
- {
- Y_ASSERT(m_buffer);
- Y_ASSERT(oldState < m.statesCount);
- Y_ASSERT(newState < m.statesCount);
-
- size_t shift = (newState - oldState) * StateSize();
- Transition tr;
- tr.shift = (ui32)shift;
- tr.action = action;
- m_jumps[TransitionIndex(oldState, c)] = tr;
- }
-
- Action RemapAction(Action action) { return action; }
-
- void SetInitial(size_t state) { Y_ASSERT(m_buffer); m.initial = reinterpret_cast<size_t>(m_jumps + state * m.lettersCount); }
- void SetTag(size_t state, Tag tag) { Y_ASSERT(m_buffer); m_tags[state] = tag; }
- void FinishBuild() {}
-
- size_t StateIdx(InternalState s) const
- {
- return (reinterpret_cast<Transition*>(s) - m_jumps) / m.lettersCount;
- }
-
- i64 SignExtend(i32 i) const { return i; }
-
- size_t BufSize() const
- {
- return
- MaxChar * sizeof(*m_letters)
- + m.statesCount * StateSize()
- + m.statesCount * sizeof(*m_tags)
- ;
- }
+ size_t Size() const { return m.statesCount; }
+
+ bool Empty() const { return m_jumps == Null().m_jumps; }
+
+ size_t RegexpsCount() const { return Empty() ? 0 : m.regexpsCount; }
+
+ size_t LettersCount() const { return m.lettersCount; }
+
+ const void* Mmap(const void* ptr, size_t size) {
+ return Mmap(ptr, size, nullptr);
+ }
+
+ const void* Mmap(const void* ptr, size_t size, ui32* type)
+ {
+ Impl::CheckAlign(ptr);
+ LoadedScanner s;
+ const size_t* p = reinterpret_cast<const size_t*>(ptr);
+ Header header = Impl::ValidateHeader(p, size, ScannerIOTypes::LoadedScanner, sizeof(s.m));
+ if (type) {
+ *type = header.Type;
+ }
+
+ Locals* locals;
+ Impl::MapPtr(locals, 1, p, size);
+ memcpy(&s.m, locals, sizeof(s.m));
+
+ Impl::MapPtr(s.m_letters, MaxChar, p, size);
+ Impl::MapPtr(s.m_jumps, s.m.statesCount * s.m.lettersCount, p, size);
+ if (header.Version == Header::RE_VERSION_WITH_MACTIONS) {
+ Action* actions = 0;
+ Impl::MapPtr(actions, s.m.statesCount * s.m.lettersCount, p, size);
+ }
+ Impl::MapPtr(s.m_tags, s.m.statesCount, p, size);
+
+ s.m.initial += reinterpret_cast<size_t>(s.m_jumps);
+ Swap(s);
+
+ return (const void*) p;
+ }
+
+ void Save(yostream*, ui32 type) const;
+ void Save(yostream*) const;
+ void Load(yistream*, ui32* type);
+ void Load(yistream*);
+
+ template<class Eq>
+ void Init(size_t states, const Partition<Char, Eq>& letters, size_t startState, size_t regexpsCount = 1)
+ {
+ m.statesCount = states;
+ m.lettersCount = letters.Size();
+ m.regexpsCount = regexpsCount;
+ m_buffer = BufferType(new char[BufSize()]);
+ memset(m_buffer.Get(), 0, BufSize());
+ Markup(m_buffer.Get());
+
+ m.initial = reinterpret_cast<size_t>(m_jumps + startState * m.lettersCount);
+
+ // Build letter translation table
+ Fill(m_letters, m_letters + MaxChar, 0);
+ for (auto&& letter : letters)
+ for (auto&& character : letter.second.second)
+ m_letters[character] = letter.second.first;
+ }
+
+ size_t StateSize() const
+ {
+ return m.lettersCount * sizeof(*m_jumps);
+ }
+
+ size_t TransitionIndex(size_t state, Char c) const
+ {
+ return state * m.lettersCount + m_letters[c];
+ }
+
+ void SetJump(size_t oldState, Char c, size_t newState, Action action)
+ {
+ Y_ASSERT(m_buffer);
+ Y_ASSERT(oldState < m.statesCount);
+ Y_ASSERT(newState < m.statesCount);
+
+ size_t shift = (newState - oldState) * StateSize();
+ Transition tr;
+ tr.shift = (ui32)shift;
+ tr.action = action;
+ m_jumps[TransitionIndex(oldState, c)] = tr;
+ }
+
+ Action RemapAction(Action action) { return action; }
+
+ void SetInitial(size_t state) { Y_ASSERT(m_buffer); m.initial = reinterpret_cast<size_t>(m_jumps + state * m.lettersCount); }
+ void SetTag(size_t state, Tag tag) { Y_ASSERT(m_buffer); m_tags[state] = tag; }
+ void FinishBuild() {}
+
+ size_t StateIdx(InternalState s) const
+ {
+ return (reinterpret_cast<Transition*>(s) - m_jumps) / m.lettersCount;
+ }
+
+ i64 SignExtend(i32 i) const { return i; }
+
+ size_t BufSize() const
+ {
+ return
+ MaxChar * sizeof(*m_letters)
+ + m.statesCount * StateSize()
+ + m.statesCount * sizeof(*m_tags)
+ ;
+ }
protected:
- static const Action IncrementMask = (1 << MAX_RE_COUNT) - 1;
- static const Action ResetMask = IncrementMask << MAX_RE_COUNT;
+ static const Action IncrementMask = (1 << MAX_RE_COUNT) - 1;
+ static const Action ResetMask = IncrementMask << MAX_RE_COUNT;
- // TODO: maybe, put fields in private section and provide data accessors
+ // TODO: maybe, put fields in private section and provide data accessors
- struct Locals {
- ui32 statesCount;
- ui32 lettersCount;
- ui32 regexpsCount;
- size_t initial;
- } m;
+ struct Locals {
+ ui32 statesCount;
+ ui32 lettersCount;
+ ui32 regexpsCount;
+ size_t initial;
+ } m;
- using BufferType = TArrayHolder<char>;
- BufferType m_buffer;
+ using BufferType = TArrayHolder<char>;
+ BufferType m_buffer;
- Letter* m_letters;
- Transition* m_jumps;
- Tag* m_tags;
+ Letter* m_letters;
+ Transition* m_jumps;
+ Tag* m_tags;
- virtual ~LoadedScanner();
+ virtual ~LoadedScanner();
private:
- explicit LoadedScanner(Fsm& fsm, size_t distance = 0)
- {
- if (distance) {
- fsm = CreateApproxFsm(fsm, distance);
- }
- fsm.Canonize();
- Init(fsm.Size(), fsm.Letters(), fsm.Initial());
- BuildScanner(fsm, *this);
- }
-
- inline static const LoadedScanner& Null()
- {
- static const LoadedScanner n = Fsm::MakeFalse().Compile<LoadedScanner>();
- return n;
- }
-
- void Markup(void* buf)
- {
- m_letters = reinterpret_cast<Letter*>(buf);
- m_jumps = reinterpret_cast<Transition*>(m_letters + MaxChar);
- m_tags = reinterpret_cast<Tag*>(m_jumps + m.statesCount * m.lettersCount);
- }
-
- void Alias(const LoadedScanner& s)
- {
- memcpy(&m, &s.m, sizeof(m));
- m_buffer = 0;
- m_letters = s.m_letters;
- m_jumps = s.m_jumps;
- m_tags = s.m_tags;
- }
-
- template<class Eq>
- LoadedScanner(size_t states, const Partition<Char, Eq>& letters, size_t startState, size_t regexpsCount = 1)
- {
- Init(states, letters, startState, regexpsCount);
- }
-
- friend class Fsm;
+ explicit LoadedScanner(Fsm& fsm, size_t distance = 0)
+ {
+ if (distance) {
+ fsm = CreateApproxFsm(fsm, distance);
+ }
+ fsm.Canonize();
+ Init(fsm.Size(), fsm.Letters(), fsm.Initial());
+ BuildScanner(fsm, *this);
+ }
+
+ inline static const LoadedScanner& Null()
+ {
+ static const LoadedScanner n = Fsm::MakeFalse().Compile<LoadedScanner>();
+ return n;
+ }
+
+ void Markup(void* buf)
+ {
+ m_letters = reinterpret_cast<Letter*>(buf);
+ m_jumps = reinterpret_cast<Transition*>(m_letters + MaxChar);
+ m_tags = reinterpret_cast<Tag*>(m_jumps + m.statesCount * m.lettersCount);
+ }
+
+ void Alias(const LoadedScanner& s)
+ {
+ memcpy(&m, &s.m, sizeof(m));
+ m_buffer = 0;
+ m_letters = s.m_letters;
+ m_jumps = s.m_jumps;
+ m_tags = s.m_tags;
+ }
+
+ template<class Eq>
+ LoadedScanner(size_t states, const Partition<Char, Eq>& letters, size_t startState, size_t regexpsCount = 1)
+ {
+ Init(states, letters, startState, regexpsCount);
+ }
+
+ friend class Fsm;
};
inline LoadedScanner::~LoadedScanner() = default;
diff --git a/library/cpp/regex/pire/pire/scanners/multi.h b/library/cpp/regex/pire/pire/scanners/multi.h
index 172f700ec92..ba45b86d940 100644
--- a/library/cpp/regex/pire/pire/scanners/multi.h
+++ b/library/cpp/regex/pire/pire/scanners/multi.h
@@ -44,42 +44,42 @@ namespace Pire {
namespace Impl {
- inline static ssize_t SignExtend(i32 i) { return i; }
- template<class T>
- class ScannerGlueCommon;
+ inline static ssize_t SignExtend(i32 i) { return i; }
+ template<class T>
+ class ScannerGlueCommon;
- template<class T>
- class ScannerGlueTask;
+ template<class T>
+ class ScannerGlueTask;
- // This strategy allows to mmap() saved representation of a scanner. This is achieved by
- // storing shifts instead of addresses in the transition table.
- struct Relocatable {
- static const size_t Signature = 1;
- // Please note that Transition size is hardcoded as 32 bits.
- // This limits size of transition table to 4G, but compresses
- // it twice compared to 64-bit transitions. In future Transition
- // can be made a template parameter if this is a concern.
- typedef ui32 Transition;
+ // This strategy allows to mmap() saved representation of a scanner. This is achieved by
+ // storing shifts instead of addresses in the transition table.
+ struct Relocatable {
+ static const size_t Signature = 1;
+ // Please note that Transition size is hardcoded as 32 bits.
+ // This limits size of transition table to 4G, but compresses
+ // it twice compared to 64-bit transitions. In future Transition
+ // can be made a template parameter if this is a concern.
+ typedef ui32 Transition;
- typedef const void* RetvalForMmap;
+ typedef const void* RetvalForMmap;
- static size_t Go(size_t state, Transition shift) { return state + SignExtend(shift); }
- static Transition Diff(size_t from, size_t to) { return static_cast<Transition>(to - from); }
- };
+ static size_t Go(size_t state, Transition shift) { return state + SignExtend(shift); }
+ static Transition Diff(size_t from, size_t to) { return static_cast<Transition>(to - from); }
+ };
- // With this strategy the transition table stores addresses. This makes the scanner faster
- // compared to mmap()-ed
- struct Nonrelocatable {
- static const size_t Signature = 2;
- typedef size_t Transition;
+ // With this strategy the transition table stores addresses. This makes the scanner faster
+ // compared to mmap()-ed
+ struct Nonrelocatable {
+ static const size_t Signature = 2;
+ typedef size_t Transition;
- // Generates a compile-time error if Scanner<Nonrelocatable>::Mmap()
- // (which is unsupported) is mistakenly called
- typedef struct {} RetvalForMmap;
+ // Generates a compile-time error if Scanner<Nonrelocatable>::Mmap()
+ // (which is unsupported) is mistakenly called
+ typedef struct {} RetvalForMmap;
- static size_t Go(size_t /*state*/, Transition shift) { return shift; }
- static Transition Diff(size_t /*from*/, size_t to) { return to; }
- };
+ static size_t Go(size_t /*state*/, Transition shift) { return shift; }
+ static Transition Diff(size_t /*from*/, size_t to) { return to; }
+ };
// Scanner implementation parametrized by
@@ -88,728 +88,728 @@ namespace Impl {
template<class Relocation, class Shortcutting>
class Scanner {
protected:
- enum {
- FinalFlag = 1,
- DeadFlag = 2,
- Flags = FinalFlag | DeadFlag
- };
+ enum {
+ FinalFlag = 1,
+ DeadFlag = 2,
+ Flags = FinalFlag | DeadFlag
+ };
- static const size_t End = static_cast<size_t>(-1);
+ static const size_t End = static_cast<size_t>(-1);
public:
- typedef typename Relocation::Transition Transition;
-
- typedef ui16 Letter;
- typedef ui32 Action;
- typedef ui8 Tag;
-
- /// Some properties of the particular state.
- struct CommonRowHeader {
- size_t Flags; ///< Holds FinalFlag, DeadFlag, etc...
-
- CommonRowHeader(): Flags(0) {}
-
- template <class OtherCommonRowHeader>
- CommonRowHeader& operator =(const OtherCommonRowHeader& other)
- {
- Flags = other.Flags;
- return *this;
- }
- };
-
- typedef typename Shortcutting::template ExtendedRowHeader<Scanner> ScannerRowHeader;
-
- Scanner() { Alias(Null()); }
-
- explicit Scanner(Fsm& fsm, size_t distance = 0)
- {
- if (distance) {
- fsm = CreateApproxFsm(fsm, distance);
- }
- fsm.Canonize();
- Init(fsm.Size(), fsm.Letters(), fsm.Finals().size(), fsm.Initial(), 1);
- BuildScanner(fsm, *this);
- }
-
-
- size_t Size() const { return m.statesCount; }
- bool Empty() const { return m_transitions == Null().m_transitions; }
-
- typedef size_t State;
-
- size_t RegexpsCount() const { return Empty() ? 0 : m.regexpsCount; }
- size_t LettersCount() const { return m.lettersCount; }
-
- /// Checks whether specified state is in any of the final sets
- bool Final(const State& state) const { return (Header(state).Common.Flags & FinalFlag) != 0; }
-
- /// Checks whether specified state is 'dead' (i.e. scanner will never
- /// reach any final state from current one)
- bool Dead(const State& state) const { return (Header(state).Common.Flags & DeadFlag) != 0; }
-
- ypair<const size_t*, const size_t*> AcceptedRegexps(const State& state) const
- {
- size_t idx = (state - reinterpret_cast<size_t>(m_transitions)) /
- (RowSize() * sizeof(Transition));
- const size_t* b = m_final + m_finalIndex[idx];
- const size_t* e = b;
- while (*e != End)
- ++e;
- return ymake_pair(b, e);
- }
-
- /// Returns an initial state for this scanner
- void Initialize(State& state) const { state = m.initial; }
-
- Char Translate(Char ch) const
- {
- return m_letters[static_cast<size_t>(ch)];
- }
-
- /// Handles one letter
- Action NextTranslated(State& state, Char letter) const
- {
- PIRE_IFDEBUG(
- Y_ASSERT(state >= (size_t)m_transitions);
- Y_ASSERT(state < (size_t)(m_transitions + RowSize()*Size()));
- Y_ASSERT((state - (size_t)m_transitions) % (RowSize()*sizeof(Transition)) == 0);
- );
-
- state = Relocation::Go(state, reinterpret_cast<const Transition*>(state)[letter]);
-
- PIRE_IFDEBUG(
- Y_ASSERT(state >= (size_t)m_transitions);
- Y_ASSERT(state < (size_t)(m_transitions + RowSize()*Size()));
- Y_ASSERT((state - (size_t)m_transitions) % (RowSize()*sizeof(Transition)) == 0);
- );
-
- return 0;
- }
-
- /// Handles one character
- Action Next(State& state, Char c) const
- {
- return NextTranslated(state, Translate(c));
- }
-
- void TakeAction(State&, Action) const {}
-
- Scanner(const Scanner& s): m(s.m)
- {
- if (!s.m_buffer) {
- // Empty or mmap()-ed scanner
- Alias(s);
- } else {
- // In-memory scanner
- DeepCopy(s);
- }
- }
-
- Scanner(Scanner&& s)
- {
- Alias(Null());
- Swap(s);
- }
-
- template<class AnotherRelocation>
- Scanner(const Scanner<AnotherRelocation, Shortcutting>& s)
- {
- if (s.Empty())
- Alias(Null());
- else
- DeepCopy(s);
- }
-
- void Swap(Scanner& s)
- {
- Y_ASSERT(m.relocationSignature == s.m.relocationSignature);
- Y_ASSERT(m.shortcuttingSignature == s.m.shortcuttingSignature);
- DoSwap(m_buffer, s.m_buffer);
- DoSwap(m.statesCount, s.m.statesCount);
- DoSwap(m.lettersCount, s.m.lettersCount);
- DoSwap(m.regexpsCount, s.m.regexpsCount);
- DoSwap(m.initial, s.m.initial);
- DoSwap(m_letters, s.m_letters);
- DoSwap(m.finalTableSize, s.m.finalTableSize);
- DoSwap(m_final, s.m_final);
- DoSwap(m_finalIndex, s.m_finalIndex);
- DoSwap(m_transitions, s.m_transitions);
- }
-
- Scanner& operator = (const Scanner& s) { Scanner(s).Swap(*this); return *this; }
-
- /*
- * Constructs the scanner from mmap()-ed memory range, returning a pointer
- * to unconsumed part of the buffer.
- */
- typename Relocation::RetvalForMmap Mmap(const void* ptr, size_t size)
- {
- Impl::CheckAlign(ptr, sizeof(size_t));
- Scanner s;
-
- const size_t* p = reinterpret_cast<const size_t*>(ptr);
- Impl::ValidateHeader(p, size, ScannerIOTypes::Scanner, sizeof(m));
- if (size < sizeof(s.m))
- throw Error("EOF reached while mapping Pire::Scanner");
-
- memcpy(&s.m, p, sizeof(s.m));
- if (s.m.relocationSignature != Relocation::Signature)
- throw Error("Type mismatch while mmapping Pire::Scanner");
- Impl::AdvancePtr(p, size, sizeof(s.m));
- Impl::AlignPtr(p, size);
-
- if (Shortcutting::Signature != s.m.shortcuttingSignature)
- throw Error("This scanner has different shortcutting type");
-
- bool empty = *((const bool*) p);
- Impl::AdvancePtr(p, size, sizeof(empty));
- Impl::AlignPtr(p, size);
-
- if (empty)
- s.Alias(Null());
- else {
- if (size < s.BufSize())
- throw Error("EOF reached while mapping NPire::Scanner");
- s.Markup(const_cast<size_t*>(p));
- Impl::AdvancePtr(p, size, s.BufSize());
- s.m.initial += reinterpret_cast<size_t>(s.m_transitions);
- }
-
- Swap(s);
- return Impl::AlignPtr(p, size);
- }
-
- size_t StateIndex(State s) const
- {
- return (s - reinterpret_cast<size_t>(m_transitions)) / (RowSize() * sizeof(Transition));
- }
-
- /**
- * Agglutinates two scanners together, producing a larger scanner.
- * Checkig a string against that scanner effectively checks them against both agglutinated regexps
- * (detailed information about matched regexps can be obtained with AcceptedRegexps()).
- *
- * Returns default-constructed scanner in case of failure
- * (consult Scanner::Empty() to find out whether the operation was successful).
- */
- static Scanner Glue(const Scanner& a, const Scanner& b, size_t maxSize = 0);
-
- // Returns the size of the memory buffer used (or required) by scanner.
- size_t BufSize() const
- {
- return AlignUp(
- MaxChar * sizeof(Letter) // Letters translation table
- + m.finalTableSize * sizeof(size_t) // Final table
- + m.statesCount * sizeof(size_t) // Final index
- + RowSize() * m.statesCount * sizeof(Transition), // Transitions table
- sizeof(size_t));
- }
-
- void Save(yostream*) const;
- void Load(yistream*);
-
- ScannerRowHeader& Header(State s) { return *(ScannerRowHeader*) s; }
- const ScannerRowHeader& Header(State s) const { return *(const ScannerRowHeader*) s; }
+ typedef typename Relocation::Transition Transition;
+
+ typedef ui16 Letter;
+ typedef ui32 Action;
+ typedef ui8 Tag;
+
+ /// Some properties of the particular state.
+ struct CommonRowHeader {
+ size_t Flags; ///< Holds FinalFlag, DeadFlag, etc...
+
+ CommonRowHeader(): Flags(0) {}
+
+ template <class OtherCommonRowHeader>
+ CommonRowHeader& operator =(const OtherCommonRowHeader& other)
+ {
+ Flags = other.Flags;
+ return *this;
+ }
+ };
+
+ typedef typename Shortcutting::template ExtendedRowHeader<Scanner> ScannerRowHeader;
+
+ Scanner() { Alias(Null()); }
+
+ explicit Scanner(Fsm& fsm, size_t distance = 0)
+ {
+ if (distance) {
+ fsm = CreateApproxFsm(fsm, distance);
+ }
+ fsm.Canonize();
+ Init(fsm.Size(), fsm.Letters(), fsm.Finals().size(), fsm.Initial(), 1);
+ BuildScanner(fsm, *this);
+ }
+
+
+ size_t Size() const { return m.statesCount; }
+ bool Empty() const { return m_transitions == Null().m_transitions; }
+
+ typedef size_t State;
+
+ size_t RegexpsCount() const { return Empty() ? 0 : m.regexpsCount; }
+ size_t LettersCount() const { return m.lettersCount; }
+
+ /// Checks whether specified state is in any of the final sets
+ bool Final(const State& state) const { return (Header(state).Common.Flags & FinalFlag) != 0; }
+
+ /// Checks whether specified state is 'dead' (i.e. scanner will never
+ /// reach any final state from current one)
+ bool Dead(const State& state) const { return (Header(state).Common.Flags & DeadFlag) != 0; }
+
+ ypair<const size_t*, const size_t*> AcceptedRegexps(const State& state) const
+ {
+ size_t idx = (state - reinterpret_cast<size_t>(m_transitions)) /
+ (RowSize() * sizeof(Transition));
+ const size_t* b = m_final + m_finalIndex[idx];
+ const size_t* e = b;
+ while (*e != End)
+ ++e;
+ return ymake_pair(b, e);
+ }
+
+ /// Returns an initial state for this scanner
+ void Initialize(State& state) const { state = m.initial; }
+
+ Char Translate(Char ch) const
+ {
+ return m_letters[static_cast<size_t>(ch)];
+ }
+
+ /// Handles one letter
+ Action NextTranslated(State& state, Char letter) const
+ {
+ PIRE_IFDEBUG(
+ Y_ASSERT(state >= (size_t)m_transitions);
+ Y_ASSERT(state < (size_t)(m_transitions + RowSize()*Size()));
+ Y_ASSERT((state - (size_t)m_transitions) % (RowSize()*sizeof(Transition)) == 0);
+ );
+
+ state = Relocation::Go(state, reinterpret_cast<const Transition*>(state)[letter]);
+
+ PIRE_IFDEBUG(
+ Y_ASSERT(state >= (size_t)m_transitions);
+ Y_ASSERT(state < (size_t)(m_transitions + RowSize()*Size()));
+ Y_ASSERT((state - (size_t)m_transitions) % (RowSize()*sizeof(Transition)) == 0);
+ );
+
+ return 0;
+ }
+
+ /// Handles one character
+ Action Next(State& state, Char c) const
+ {
+ return NextTranslated(state, Translate(c));
+ }
+
+ void TakeAction(State&, Action) const {}
+
+ Scanner(const Scanner& s): m(s.m)
+ {
+ if (!s.m_buffer) {
+ // Empty or mmap()-ed scanner
+ Alias(s);
+ } else {
+ // In-memory scanner
+ DeepCopy(s);
+ }
+ }
+
+ Scanner(Scanner&& s)
+ {
+ Alias(Null());
+ Swap(s);
+ }
+
+ template<class AnotherRelocation>
+ Scanner(const Scanner<AnotherRelocation, Shortcutting>& s)
+ {
+ if (s.Empty())
+ Alias(Null());
+ else
+ DeepCopy(s);
+ }
+
+ void Swap(Scanner& s)
+ {
+ Y_ASSERT(m.relocationSignature == s.m.relocationSignature);
+ Y_ASSERT(m.shortcuttingSignature == s.m.shortcuttingSignature);
+ DoSwap(m_buffer, s.m_buffer);
+ DoSwap(m.statesCount, s.m.statesCount);
+ DoSwap(m.lettersCount, s.m.lettersCount);
+ DoSwap(m.regexpsCount, s.m.regexpsCount);
+ DoSwap(m.initial, s.m.initial);
+ DoSwap(m_letters, s.m_letters);
+ DoSwap(m.finalTableSize, s.m.finalTableSize);
+ DoSwap(m_final, s.m_final);
+ DoSwap(m_finalIndex, s.m_finalIndex);
+ DoSwap(m_transitions, s.m_transitions);
+ }
+
+ Scanner& operator = (const Scanner& s) { Scanner(s).Swap(*this); return *this; }
+
+ /*
+ * Constructs the scanner from mmap()-ed memory range, returning a pointer
+ * to unconsumed part of the buffer.
+ */
+ typename Relocation::RetvalForMmap Mmap(const void* ptr, size_t size)
+ {
+ Impl::CheckAlign(ptr, sizeof(size_t));
+ Scanner s;
+
+ const size_t* p = reinterpret_cast<const size_t*>(ptr);
+ Impl::ValidateHeader(p, size, ScannerIOTypes::Scanner, sizeof(m));
+ if (size < sizeof(s.m))
+ throw Error("EOF reached while mapping Pire::Scanner");
+
+ memcpy(&s.m, p, sizeof(s.m));
+ if (s.m.relocationSignature != Relocation::Signature)
+ throw Error("Type mismatch while mmapping Pire::Scanner");
+ Impl::AdvancePtr(p, size, sizeof(s.m));
+ Impl::AlignPtr(p, size);
+
+ if (Shortcutting::Signature != s.m.shortcuttingSignature)
+ throw Error("This scanner has different shortcutting type");
+
+ bool empty = *((const bool*) p);
+ Impl::AdvancePtr(p, size, sizeof(empty));
+ Impl::AlignPtr(p, size);
+
+ if (empty)
+ s.Alias(Null());
+ else {
+ if (size < s.BufSize())
+ throw Error("EOF reached while mapping NPire::Scanner");
+ s.Markup(const_cast<size_t*>(p));
+ Impl::AdvancePtr(p, size, s.BufSize());
+ s.m.initial += reinterpret_cast<size_t>(s.m_transitions);
+ }
+
+ Swap(s);
+ return Impl::AlignPtr(p, size);
+ }
+
+ size_t StateIndex(State s) const
+ {
+ return (s - reinterpret_cast<size_t>(m_transitions)) / (RowSize() * sizeof(Transition));
+ }
+
+ /**
+ * Agglutinates two scanners together, producing a larger scanner.
+ * Checkig a string against that scanner effectively checks them against both agglutinated regexps
+ * (detailed information about matched regexps can be obtained with AcceptedRegexps()).
+ *
+ * Returns default-constructed scanner in case of failure
+ * (consult Scanner::Empty() to find out whether the operation was successful).
+ */
+ static Scanner Glue(const Scanner& a, const Scanner& b, size_t maxSize = 0);
+
+ // Returns the size of the memory buffer used (or required) by scanner.
+ size_t BufSize() const
+ {
+ return AlignUp(
+ MaxChar * sizeof(Letter) // Letters translation table
+ + m.finalTableSize * sizeof(size_t) // Final table
+ + m.statesCount * sizeof(size_t) // Final index
+ + RowSize() * m.statesCount * sizeof(Transition), // Transitions table
+ sizeof(size_t));
+ }
+
+ void Save(yostream*) const;
+ void Load(yistream*);
+
+ ScannerRowHeader& Header(State s) { return *(ScannerRowHeader*) s; }
+ const ScannerRowHeader& Header(State s) const { return *(const ScannerRowHeader*) s; }
protected:
- struct Locals {
- ui32 statesCount;
- ui32 lettersCount;
- ui32 regexpsCount;
- size_t initial;
- ui32 finalTableSize;
- size_t relocationSignature;
- size_t shortcuttingSignature;
- } m;
-
- using BufferType = TArrayHolder<char>;
- BufferType m_buffer;
- Letter* m_letters;
-
- size_t* m_final;
- size_t* m_finalIndex;
-
- Transition* m_transitions;
-
- inline static const Scanner& Null()
- {
- static const Scanner n = Fsm::MakeFalse().Compile< Scanner<Relocation, Shortcutting> >();
-
- return n;
- }
-
- // Returns transition row size in Transition's. Row size_in bytes should be a multiple of sizeof(MaxSizeWord)
- size_t RowSize() const { return AlignUp(m.lettersCount + HEADER_SIZE, sizeof(MaxSizeWord)/sizeof(Transition)); }
-
- static const size_t HEADER_SIZE = sizeof(ScannerRowHeader) / sizeof(Transition);
- PIRE_STATIC_ASSERT(sizeof(ScannerRowHeader) % sizeof(Transition) == 0);
-
- template<class Eq>
- void Init(size_t states, const Partition<Char, Eq>& letters, size_t finalStatesCount, size_t startState, size_t regexpsCount = 1)
- {
- std::memset(&m, 0, sizeof(m));
- m.relocationSignature = Relocation::Signature;
- m.shortcuttingSignature = Shortcutting::Signature;
- m.statesCount = states;
- m.lettersCount = letters.Size();
- m.regexpsCount = regexpsCount;
- m.finalTableSize = finalStatesCount + states;
-
- m_buffer = BufferType(new char[BufSize() + sizeof(size_t)]);
- memset(m_buffer.Get(), 0, BufSize() + sizeof(size_t));
- Markup(AlignUp(m_buffer.Get(), sizeof(size_t)));
-
- for (size_t i = 0; i != Size(); ++i)
- Header(IndexToState(i)) = ScannerRowHeader();
-
- m.initial = reinterpret_cast<size_t>(m_transitions + startState * RowSize());
-
- // Build letter translation table
- for (auto&& letter : letters)
- for (auto&& character : letter.second.second)
- m_letters[character] = letter.second.first + HEADER_SIZE;
- }
-
- /*
- * Initializes pointers depending on buffer start, letters and states count
- */
- void Markup(void* ptr)
- {
- Impl::CheckAlign(ptr, sizeof(size_t));
- m_letters = reinterpret_cast<Letter*>(ptr);
- m_final = reinterpret_cast<size_t*>(m_letters + MaxChar);
- m_finalIndex = reinterpret_cast<size_t*>(m_final + m.finalTableSize);
- m_transitions = reinterpret_cast<Transition*>(m_finalIndex + m.statesCount);
- }
-
- // Makes a shallow ("weak") copy of the given scanner.
- // The copied scanner does not maintain lifetime of the original's entrails.
- void Alias(const Scanner<Relocation, Shortcutting>& s)
- {
- memcpy(&m, &s.m, sizeof(m));
- m_buffer.Reset();
- m_letters = s.m_letters;
- m_final = s.m_final;
- m_finalIndex = s.m_finalIndex;
- m_transitions = s.m_transitions;
- }
-
- template<class AnotherRelocation>
- void DeepCopy(const Scanner<AnotherRelocation, Shortcutting>& s)
- {
- // Don't want memory leaks, but we cannot free the buffer because there might be aliased instances
- Y_ASSERT(m_buffer == nullptr);
-
- // Ensure that specializations of Scanner across different Relocations do not touch its Locals
- static_assert(sizeof(m) == sizeof(s.m), "sizeof(m) == sizeof(s.m)");
- memcpy(&m, &s.m, sizeof(s.m));
- m.relocationSignature = Relocation::Signature;
- m.shortcuttingSignature = Shortcutting::Signature;
- m_buffer = BufferType(new char[BufSize() + sizeof(size_t)]);
- std::memset(m_buffer.Get(), 0, BufSize() + sizeof(size_t));
- Markup(AlignUp(m_buffer.Get(), sizeof(size_t)));
-
- // Values in letter-to-leterclass table take into account row header size
- for (size_t c = 0; c < MaxChar; ++c) {
- m_letters[c] = s.m_letters[c] - s.HEADER_SIZE + HEADER_SIZE;
- Y_ASSERT(c == Epsilon || m_letters[c] >= HEADER_SIZE);
- Y_ASSERT(c == Epsilon || m_letters[c] < RowSize());
- }
- memcpy(m_final, s.m_final, m.finalTableSize * sizeof(*m_final));
- memcpy(m_finalIndex, s.m_finalIndex, m.statesCount * sizeof(*m_finalIndex));
-
- m.initial = IndexToState(s.StateIndex(s.m.initial));
-
- for (size_t st = 0; st != m.statesCount; ++st) {
- size_t oldstate = s.IndexToState(st);
- size_t newstate = IndexToState(st);
- Header(newstate) = s.Header(oldstate);
- const typename Scanner<AnotherRelocation, Shortcutting>::Transition* os
- = reinterpret_cast<const typename Scanner<AnotherRelocation, Shortcutting>::Transition*>(oldstate);
- Transition* ns = reinterpret_cast<Transition*>(newstate);
-
- for (size_t let = 0; let != LettersCount(); ++let) {
- size_t destIndex = s.StateIndex(AnotherRelocation::Go(oldstate, os[let + s.HEADER_SIZE]));
- Transition tr = Relocation::Diff(newstate, IndexToState(destIndex));
- ns[let + HEADER_SIZE] = tr;
- Y_ASSERT(Relocation::Go(newstate, tr) >= (size_t)m_transitions);
- Y_ASSERT(Relocation::Go(newstate, tr) < (size_t)(m_transitions + RowSize()*Size()));
- }
- }
- }
-
-
- size_t IndexToState(size_t stateIndex) const
- {
- return reinterpret_cast<size_t>(m_transitions + stateIndex * RowSize());
- }
-
- void SetJump(size_t oldState, Char c, size_t newState, unsigned long /*payload*/ = 0)
- {
- Y_ASSERT(m_buffer);
- Y_ASSERT(oldState < m.statesCount);
- Y_ASSERT(newState < m.statesCount);
-
- m_transitions[oldState * RowSize() + m_letters[c]]
- = Relocation::Diff(IndexToState(oldState), IndexToState(newState));
- }
-
- unsigned long RemapAction(unsigned long action) { return action; }
-
- void SetInitial(size_t state)
- {
- Y_ASSERT(m_buffer);
- m.initial = IndexToState(state);
- }
-
- void SetTag(size_t state, size_t value)
- {
- Y_ASSERT(m_buffer);
- Header(IndexToState(state)).Common.Flags = value;
- }
-
- // Fill shortcut masks for all the states
- void BuildShortcuts()
- {
- Y_ASSERT(m_buffer);
-
- // Build the mapping from letter classes to characters
- TVector< TVector<char> > letters(RowSize());
- for (unsigned ch = 0; ch != 1 << (sizeof(char)*8); ++ch)
- letters[m_letters[ch]].push_back(ch);
-
- // Loop through all states in the transition table and
- // check if it is possible to setup shortcuts
- for (size_t i = 0; i != Size(); ++i) {
- State st = IndexToState(i);
- ScannerRowHeader& header = Header(st);
- Shortcutting::SetNoExit(header);
- size_t ind = 0;
- size_t let = HEADER_SIZE;
- for (; let != LettersCount() + HEADER_SIZE; ++let) {
- // Check if the transition is not the same state
- if (Relocation::Go(st, reinterpret_cast<const Transition*>(st)[let]) != st) {
- if (ind + letters[let].size() > Shortcutting::ExitMaskCount)
- break;
- // For each character setup a mask
- for (auto&& character : letters[let]) {
- Shortcutting::SetMask(header, ind, character);
- ++ind;
- }
- }
- }
-
- if (let != LettersCount() + HEADER_SIZE) {
- // Not enough space in ExitMasks, so reset all masks (which leads to bypassing the optimization)
- Shortcutting::SetNoShortcut(header);
- }
- // Fill the rest of the shortcut masks with the last used mask
- Shortcutting::FinishMasks(header, ind);
- }
- }
-
- // Fills final states table and builds shortcuts if possible
- void FinishBuild()
- {
- Y_ASSERT(m_buffer);
- auto finalWriter = m_final;
- for (size_t state = 0; state != Size(); ++state) {
- m_finalIndex[state] = finalWriter - m_final;
- if (Header(IndexToState(state)).Common.Flags & FinalFlag)
- *finalWriter++ = 0;
- *finalWriter++ = static_cast<size_t>(-1);
- }
- BuildShortcuts();
- }
-
- size_t AcceptedRegexpsCount(size_t idx) const
- {
- const size_t* b = m_final + m_finalIndex[idx];
- const size_t* e = b;
- while (*e != End)
- ++e;
- return e - b;
- }
-
- template <class Scanner>
- friend void Pire::BuildScanner(const Fsm&, Scanner&);
-
- typedef State InternalState; // Needed for agglutination
- friend class ScannerGlueCommon<Scanner>;
- friend class ScannerGlueTask<Scanner>;
-
- template<class AnotherRelocation, class AnotherShortcutting>
- friend class Scanner;
+ struct Locals {
+ ui32 statesCount;
+ ui32 lettersCount;
+ ui32 regexpsCount;
+ size_t initial;
+ ui32 finalTableSize;
+ size_t relocationSignature;
+ size_t shortcuttingSignature;
+ } m;
+
+ using BufferType = TArrayHolder<char>;
+ BufferType m_buffer;
+ Letter* m_letters;
+
+ size_t* m_final;
+ size_t* m_finalIndex;
+
+ Transition* m_transitions;
+
+ inline static const Scanner& Null()
+ {
+ static const Scanner n = Fsm::MakeFalse().Compile< Scanner<Relocation, Shortcutting> >();
+
+ return n;
+ }
+
+ // Returns transition row size in Transition's. Row size_in bytes should be a multiple of sizeof(MaxSizeWord)
+ size_t RowSize() const { return AlignUp(m.lettersCount + HEADER_SIZE, sizeof(MaxSizeWord)/sizeof(Transition)); }
+
+ static const size_t HEADER_SIZE = sizeof(ScannerRowHeader) / sizeof(Transition);
+ PIRE_STATIC_ASSERT(sizeof(ScannerRowHeader) % sizeof(Transition) == 0);
+
+ template<class Eq>
+ void Init(size_t states, const Partition<Char, Eq>& letters, size_t finalStatesCount, size_t startState, size_t regexpsCount = 1)
+ {
+ std::memset(&m, 0, sizeof(m));
+ m.relocationSignature = Relocation::Signature;
+ m.shortcuttingSignature = Shortcutting::Signature;
+ m.statesCount = states;
+ m.lettersCount = letters.Size();
+ m.regexpsCount = regexpsCount;
+ m.finalTableSize = finalStatesCount + states;
+
+ m_buffer = BufferType(new char[BufSize() + sizeof(size_t)]);
+ memset(m_buffer.Get(), 0, BufSize() + sizeof(size_t));
+ Markup(AlignUp(m_buffer.Get(), sizeof(size_t)));
+
+ for (size_t i = 0; i != Size(); ++i)
+ Header(IndexToState(i)) = ScannerRowHeader();
+
+ m.initial = reinterpret_cast<size_t>(m_transitions + startState * RowSize());
+
+ // Build letter translation table
+ for (auto&& letter : letters)
+ for (auto&& character : letter.second.second)
+ m_letters[character] = letter.second.first + HEADER_SIZE;
+ }
+
+ /*
+ * Initializes pointers depending on buffer start, letters and states count
+ */
+ void Markup(void* ptr)
+ {
+ Impl::CheckAlign(ptr, sizeof(size_t));
+ m_letters = reinterpret_cast<Letter*>(ptr);
+ m_final = reinterpret_cast<size_t*>(m_letters + MaxChar);
+ m_finalIndex = reinterpret_cast<size_t*>(m_final + m.finalTableSize);
+ m_transitions = reinterpret_cast<Transition*>(m_finalIndex + m.statesCount);
+ }
+
+ // Makes a shallow ("weak") copy of the given scanner.
+ // The copied scanner does not maintain lifetime of the original's entrails.
+ void Alias(const Scanner<Relocation, Shortcutting>& s)
+ {
+ memcpy(&m, &s.m, sizeof(m));
+ m_buffer.Reset();
+ m_letters = s.m_letters;
+ m_final = s.m_final;
+ m_finalIndex = s.m_finalIndex;
+ m_transitions = s.m_transitions;
+ }
+
+ template<class AnotherRelocation>
+ void DeepCopy(const Scanner<AnotherRelocation, Shortcutting>& s)
+ {
+ // Don't want memory leaks, but we cannot free the buffer because there might be aliased instances
+ Y_ASSERT(m_buffer == nullptr);
+
+ // Ensure that specializations of Scanner across different Relocations do not touch its Locals
+ static_assert(sizeof(m) == sizeof(s.m), "sizeof(m) == sizeof(s.m)");
+ memcpy(&m, &s.m, sizeof(s.m));
+ m.relocationSignature = Relocation::Signature;
+ m.shortcuttingSignature = Shortcutting::Signature;
+ m_buffer = BufferType(new char[BufSize() + sizeof(size_t)]);
+ std::memset(m_buffer.Get(), 0, BufSize() + sizeof(size_t));
+ Markup(AlignUp(m_buffer.Get(), sizeof(size_t)));
+
+ // Values in letter-to-leterclass table take into account row header size
+ for (size_t c = 0; c < MaxChar; ++c) {
+ m_letters[c] = s.m_letters[c] - s.HEADER_SIZE + HEADER_SIZE;
+ Y_ASSERT(c == Epsilon || m_letters[c] >= HEADER_SIZE);
+ Y_ASSERT(c == Epsilon || m_letters[c] < RowSize());
+ }
+ memcpy(m_final, s.m_final, m.finalTableSize * sizeof(*m_final));
+ memcpy(m_finalIndex, s.m_finalIndex, m.statesCount * sizeof(*m_finalIndex));
+
+ m.initial = IndexToState(s.StateIndex(s.m.initial));
+
+ for (size_t st = 0; st != m.statesCount; ++st) {
+ size_t oldstate = s.IndexToState(st);
+ size_t newstate = IndexToState(st);
+ Header(newstate) = s.Header(oldstate);
+ const typename Scanner<AnotherRelocation, Shortcutting>::Transition* os
+ = reinterpret_cast<const typename Scanner<AnotherRelocation, Shortcutting>::Transition*>(oldstate);
+ Transition* ns = reinterpret_cast<Transition*>(newstate);
+
+ for (size_t let = 0; let != LettersCount(); ++let) {
+ size_t destIndex = s.StateIndex(AnotherRelocation::Go(oldstate, os[let + s.HEADER_SIZE]));
+ Transition tr = Relocation::Diff(newstate, IndexToState(destIndex));
+ ns[let + HEADER_SIZE] = tr;
+ Y_ASSERT(Relocation::Go(newstate, tr) >= (size_t)m_transitions);
+ Y_ASSERT(Relocation::Go(newstate, tr) < (size_t)(m_transitions + RowSize()*Size()));
+ }
+ }
+ }
+
+
+ size_t IndexToState(size_t stateIndex) const
+ {
+ return reinterpret_cast<size_t>(m_transitions + stateIndex * RowSize());
+ }
+
+ void SetJump(size_t oldState, Char c, size_t newState, unsigned long /*payload*/ = 0)
+ {
+ Y_ASSERT(m_buffer);
+ Y_ASSERT(oldState < m.statesCount);
+ Y_ASSERT(newState < m.statesCount);
+
+ m_transitions[oldState * RowSize() + m_letters[c]]
+ = Relocation::Diff(IndexToState(oldState), IndexToState(newState));
+ }
+
+ unsigned long RemapAction(unsigned long action) { return action; }
+
+ void SetInitial(size_t state)
+ {
+ Y_ASSERT(m_buffer);
+ m.initial = IndexToState(state);
+ }
+
+ void SetTag(size_t state, size_t value)
+ {
+ Y_ASSERT(m_buffer);
+ Header(IndexToState(state)).Common.Flags = value;
+ }
+
+ // Fill shortcut masks for all the states
+ void BuildShortcuts()
+ {
+ Y_ASSERT(m_buffer);
+
+ // Build the mapping from letter classes to characters
+ TVector< TVector<char> > letters(RowSize());
+ for (unsigned ch = 0; ch != 1 << (sizeof(char)*8); ++ch)
+ letters[m_letters[ch]].push_back(ch);
+
+ // Loop through all states in the transition table and
+ // check if it is possible to setup shortcuts
+ for (size_t i = 0; i != Size(); ++i) {
+ State st = IndexToState(i);
+ ScannerRowHeader& header = Header(st);
+ Shortcutting::SetNoExit(header);
+ size_t ind = 0;
+ size_t let = HEADER_SIZE;
+ for (; let != LettersCount() + HEADER_SIZE; ++let) {
+ // Check if the transition is not the same state
+ if (Relocation::Go(st, reinterpret_cast<const Transition*>(st)[let]) != st) {
+ if (ind + letters[let].size() > Shortcutting::ExitMaskCount)
+ break;
+ // For each character setup a mask
+ for (auto&& character : letters[let]) {
+ Shortcutting::SetMask(header, ind, character);
+ ++ind;
+ }
+ }
+ }
+
+ if (let != LettersCount() + HEADER_SIZE) {
+ // Not enough space in ExitMasks, so reset all masks (which leads to bypassing the optimization)
+ Shortcutting::SetNoShortcut(header);
+ }
+ // Fill the rest of the shortcut masks with the last used mask
+ Shortcutting::FinishMasks(header, ind);
+ }
+ }
+
+ // Fills final states table and builds shortcuts if possible
+ void FinishBuild()
+ {
+ Y_ASSERT(m_buffer);
+ auto finalWriter = m_final;
+ for (size_t state = 0; state != Size(); ++state) {
+ m_finalIndex[state] = finalWriter - m_final;
+ if (Header(IndexToState(state)).Common.Flags & FinalFlag)
+ *finalWriter++ = 0;
+ *finalWriter++ = static_cast<size_t>(-1);
+ }
+ BuildShortcuts();
+ }
+
+ size_t AcceptedRegexpsCount(size_t idx) const
+ {
+ const size_t* b = m_final + m_finalIndex[idx];
+ const size_t* e = b;
+ while (*e != End)
+ ++e;
+ return e - b;
+ }
+
+ template <class Scanner>
+ friend void Pire::BuildScanner(const Fsm&, Scanner&);
+
+ typedef State InternalState; // Needed for agglutination
+ friend class ScannerGlueCommon<Scanner>;
+ friend class ScannerGlueTask<Scanner>;
+
+ template<class AnotherRelocation, class AnotherShortcutting>
+ friend class Scanner;
friend struct ScannerSaver;
#ifndef PIRE_DEBUG
- friend struct AlignedRunner< Scanner<Relocation, Shortcutting> >;
+ friend struct AlignedRunner< Scanner<Relocation, Shortcutting> >;
#endif
};
// Helper class for Save/Load partial specialization
struct ScannerSaver {
- template<class Shortcutting>
- static void SaveScanner(const Scanner<Relocatable, Shortcutting>& scanner, yostream* s)
- {
- typedef Scanner<Relocatable, Shortcutting> ScannerType;
-
- typename ScannerType::Locals mc = scanner.m;
- mc.initial -= reinterpret_cast<size_t>(scanner.m_transitions);
- SavePodType(s, Pire::Header(ScannerIOTypes::Scanner, sizeof(mc)));
- Impl::AlignSave(s, sizeof(Pire::Header));
- SavePodType(s, mc);
- Impl::AlignSave(s, sizeof(mc));
- SavePodType(s, scanner.Empty());
- Impl::AlignSave(s, sizeof(scanner.Empty()));
- if (!scanner.Empty())
- Impl::AlignedSaveArray(s, scanner.m_buffer.Get(), scanner.BufSize());
- }
-
- template<class Shortcutting>
- static void LoadScanner(Scanner<Relocatable, Shortcutting>& scanner, yistream* s)
- {
- typedef Scanner<Relocatable, Shortcutting> ScannerType;
-
- Scanner<Relocatable, Shortcutting> sc;
- Impl::ValidateHeader(s, ScannerIOTypes::Scanner, sizeof(sc.m));
- LoadPodType(s, sc.m);
- Impl::AlignLoad(s, sizeof(sc.m));
- if (Shortcutting::Signature != sc.m.shortcuttingSignature)
- throw Error("This scanner has different shortcutting type");
- bool empty;
- LoadPodType(s, empty);
- Impl::AlignLoad(s, sizeof(empty));
-
- if (empty) {
- sc.Alias(ScannerType::Null());
- } else {
- sc.m_buffer = TArrayHolder<char>(new char[sc.BufSize()]);
- Impl::AlignedLoadArray(s, sc.m_buffer.Get(), sc.BufSize());
- sc.Markup(sc.m_buffer.Get());
- sc.m.initial += reinterpret_cast<size_t>(sc.m_transitions);
- }
- scanner.Swap(sc);
- }
-
- // TODO: implement more effective serialization
- // of nonrelocatable scanner if necessary
-
- template<class Shortcutting>
- static void SaveScanner(const Scanner<Nonrelocatable, Shortcutting>& scanner, yostream* s)
- {
- Scanner<Relocatable, Shortcutting>(scanner).Save(s);
- }
-
- template<class Shortcutting>
- static void LoadScanner(Scanner<Nonrelocatable, Shortcutting>& scanner, yistream* s)
- {
- Scanner<Relocatable, Shortcutting> rs;
- rs.Load(s);
- Scanner<Nonrelocatable, Shortcutting>(rs).Swap(scanner);
- }
+ template<class Shortcutting>
+ static void SaveScanner(const Scanner<Relocatable, Shortcutting>& scanner, yostream* s)
+ {
+ typedef Scanner<Relocatable, Shortcutting> ScannerType;
+
+ typename ScannerType::Locals mc = scanner.m;
+ mc.initial -= reinterpret_cast<size_t>(scanner.m_transitions);
+ SavePodType(s, Pire::Header(ScannerIOTypes::Scanner, sizeof(mc)));
+ Impl::AlignSave(s, sizeof(Pire::Header));
+ SavePodType(s, mc);
+ Impl::AlignSave(s, sizeof(mc));
+ SavePodType(s, scanner.Empty());
+ Impl::AlignSave(s, sizeof(scanner.Empty()));
+ if (!scanner.Empty())
+ Impl::AlignedSaveArray(s, scanner.m_buffer.Get(), scanner.BufSize());
+ }
+
+ template<class Shortcutting>
+ static void LoadScanner(Scanner<Relocatable, Shortcutting>& scanner, yistream* s)
+ {
+ typedef Scanner<Relocatable, Shortcutting> ScannerType;
+
+ Scanner<Relocatable, Shortcutting> sc;
+ Impl::ValidateHeader(s, ScannerIOTypes::Scanner, sizeof(sc.m));
+ LoadPodType(s, sc.m);
+ Impl::AlignLoad(s, sizeof(sc.m));
+ if (Shortcutting::Signature != sc.m.shortcuttingSignature)
+ throw Error("This scanner has different shortcutting type");
+ bool empty;
+ LoadPodType(s, empty);
+ Impl::AlignLoad(s, sizeof(empty));
+
+ if (empty) {
+ sc.Alias(ScannerType::Null());
+ } else {
+ sc.m_buffer = TArrayHolder<char>(new char[sc.BufSize()]);
+ Impl::AlignedLoadArray(s, sc.m_buffer.Get(), sc.BufSize());
+ sc.Markup(sc.m_buffer.Get());
+ sc.m.initial += reinterpret_cast<size_t>(sc.m_transitions);
+ }
+ scanner.Swap(sc);
+ }
+
+ // TODO: implement more effective serialization
+ // of nonrelocatable scanner if necessary
+
+ template<class Shortcutting>
+ static void SaveScanner(const Scanner<Nonrelocatable, Shortcutting>& scanner, yostream* s)
+ {
+ Scanner<Relocatable, Shortcutting>(scanner).Save(s);
+ }
+
+ template<class Shortcutting>
+ static void LoadScanner(Scanner<Nonrelocatable, Shortcutting>& scanner, yistream* s)
+ {
+ Scanner<Relocatable, Shortcutting> rs;
+ rs.Load(s);
+ Scanner<Nonrelocatable, Shortcutting>(rs).Swap(scanner);
+ }
};
template<class Relocation, class Shortcutting>
void Scanner<Relocation, Shortcutting>::Save(yostream* s) const
{
- ScannerSaver::SaveScanner(*this, s);
+ ScannerSaver::SaveScanner(*this, s);
}
template<class Relocation, class Shortcutting>
void Scanner<Relocation, Shortcutting>::Load(yistream* s)
{
- ScannerSaver::LoadScanner(*this, s);
+ ScannerSaver::LoadScanner(*this, s);
}
// Shortcutting policy that checks state exit masks
template <size_t MaskCount>
class ExitMasks {
private:
- enum {
- NO_SHORTCUT_MASK = 1, // the state doesn't have shortcuts
- NO_EXIT_MASK = 2 // the state has only transtions to itself (we can stop the scan)
- };
-
- template<class ScannerRowHeader, unsigned N>
- struct MaskCheckerBase {
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- bool Check(const ScannerRowHeader& hdr, size_t alignOffset, Word chunk)
- {
- Word mask = CheckBytes(hdr.Mask(N, alignOffset), chunk);
- for (int i = N-1; i >= 0; --i) {
- mask = Or(mask, CheckBytes(hdr.Mask(i, alignOffset), chunk));
- }
- return !IsAnySet(mask);
- }
-
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- const Word* DoRun(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end)
- {
- for (; begin != end && Check(hdr, alignOffset, ToLittleEndian(*begin)); ++begin) {}
- return begin;
- }
- };
-
- template<class ScannerRowHeader, unsigned N, unsigned Nmax>
- struct MaskChecker : MaskCheckerBase<ScannerRowHeader, N> {
- typedef MaskCheckerBase<ScannerRowHeader, N> Base;
- typedef MaskChecker<ScannerRowHeader, N+1, Nmax> Next;
-
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- const Word* Run(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end)
- {
- if (hdr.Mask(N) == hdr.Mask(N + 1))
- return Base::DoRun(hdr, alignOffset, begin, end);
- else
- return Next::Run(hdr, alignOffset, begin, end);
- }
- };
-
- template<class ScannerRowHeader, unsigned N>
- struct MaskChecker<ScannerRowHeader, N, N> : MaskCheckerBase<ScannerRowHeader, N> {
- typedef MaskCheckerBase<ScannerRowHeader, N> Base;
-
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- const Word* Run(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end)
- {
- return Base::DoRun(hdr, alignOffset, begin, end);
- }
- };
-
- // Compares the ExitMask[0] value without SSE reads which seems to be more optimal
- template <class Relocation>
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- bool CheckFirstMask(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state, size_t val)
- {
- return (scanner.Header(state).Mask(0) == val);
- }
+ enum {
+ NO_SHORTCUT_MASK = 1, // the state doesn't have shortcuts
+ NO_EXIT_MASK = 2 // the state has only transitions to itself (we can stop the scan)
+ };
+
+ template<class ScannerRowHeader, unsigned N>
+ struct MaskCheckerBase {
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ bool Check(const ScannerRowHeader& hdr, size_t alignOffset, Word chunk)
+ {
+ Word mask = CheckBytes(hdr.Mask(N, alignOffset), chunk);
+ for (int i = N-1; i >= 0; --i) {
+ mask = Or(mask, CheckBytes(hdr.Mask(i, alignOffset), chunk));
+ }
+ return !IsAnySet(mask);
+ }
+
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ const Word* DoRun(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end)
+ {
+ for (; begin != end && Check(hdr, alignOffset, ToLittleEndian(*begin)); ++begin) {}
+ return begin;
+ }
+ };
+
+ template<class ScannerRowHeader, unsigned N, unsigned Nmax>
+ struct MaskChecker : MaskCheckerBase<ScannerRowHeader, N> {
+ typedef MaskCheckerBase<ScannerRowHeader, N> Base;
+ typedef MaskChecker<ScannerRowHeader, N+1, Nmax> Next;
+
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ const Word* Run(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end)
+ {
+ if (hdr.Mask(N) == hdr.Mask(N + 1))
+ return Base::DoRun(hdr, alignOffset, begin, end);
+ else
+ return Next::Run(hdr, alignOffset, begin, end);
+ }
+ };
+
+ template<class ScannerRowHeader, unsigned N>
+ struct MaskChecker<ScannerRowHeader, N, N> : MaskCheckerBase<ScannerRowHeader, N> {
+ typedef MaskCheckerBase<ScannerRowHeader, N> Base;
+
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ const Word* Run(const ScannerRowHeader& hdr, size_t alignOffset, const Word* begin, const Word* end)
+ {
+ return Base::DoRun(hdr, alignOffset, begin, end);
+ }
+ };
+
+ // Compares the ExitMask[0] value without SSE reads which seems to be more optimal
+ template <class Relocation>
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ bool CheckFirstMask(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state, size_t val)
+ {
+ return (scanner.Header(state).Mask(0) == val);
+ }
public:
- static const size_t ExitMaskCount = MaskCount;
- static const size_t Signature = 0x2000 + MaskCount;
-
- template <class Scanner>
- struct ExtendedRowHeader {
- private:
- /// In order to allow transition table to be aligned at sizeof(size_t) instead of
- /// sizeof(Word) and still be able to read Masks at Word-aligned addresses each mask
- /// occupies 2x space and only properly aligned part of it is read
- enum {
- SizeTInMaxSizeWord = sizeof(MaxSizeWord) / sizeof(size_t),
- MaskSizeInSizeT = 2 * SizeTInMaxSizeWord,
- };
-
- public:
- static const size_t ExitMaskCount = MaskCount;
-
- inline
- const Word& Mask(size_t i, size_t alignOffset) const
- {
- Y_ASSERT(i < ExitMaskCount);
- Y_ASSERT(alignOffset < SizeTInMaxSizeWord);
- const Word* p = (const Word*)(ExitMasksArray + alignOffset + MaskSizeInSizeT * i);
- Y_ASSERT(IsAligned(p, sizeof(Word)));
- return *p;
- }
-
- PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- size_t Mask(size_t i) const
- {
- Y_ASSERT(i < ExitMaskCount);
- return ExitMasksArray[MaskSizeInSizeT*i];
- }
-
- void SetMask(size_t i, size_t val)
- {
- for (size_t j = 0; j < MaskSizeInSizeT; ++j)
- ExitMasksArray[MaskSizeInSizeT*i + j] = val;
- }
-
- ExtendedRowHeader()
- {
- for (size_t i = 0; i < ExitMaskCount; ++i)
- SetMask(i, NO_SHORTCUT_MASK);
- }
-
- template <class OtherScanner>
- ExtendedRowHeader& operator =(const ExtendedRowHeader<OtherScanner>& other)
- {
- PIRE_STATIC_ASSERT(ExitMaskCount == ExtendedRowHeader<OtherScanner>::ExitMaskCount);
- Common = other.Common;
- for (size_t i = 0; i < ExitMaskCount; ++i)
- SetMask(i, other.Mask(i));
- return *this;
- }
-
- private:
- /// If this state loops for all letters except particular set
- /// (common thing when matching something like /.*[Aa]/),
- /// each ExitMask contains that letter in each byte of size_t.
- ///
- /// These masks are most commonly used for fast forwarding through parts
- /// of the string matching /.*/ somewhere in the middle regexp.
- size_t ExitMasksArray[ExitMaskCount * MaskSizeInSizeT];
-
- public:
- typename Scanner::CommonRowHeader Common;
- };
-
- template <class Header>
- static void SetNoExit(Header& header)
- {
- header.SetMask(0, NO_EXIT_MASK);
- }
-
- template <class Header>
- static void SetNoShortcut(Header& header)
- {
- header.SetMask(0, NO_SHORTCUT_MASK);
- }
-
- template <class Header>
- static void SetMask(Header& header, size_t ind, char c)
- {
- header.SetMask(ind, FillSizeT(c));
- }
-
- template <class Header>
- static void FinishMasks(Header& header, size_t ind)
- {
- if (ind == 0)
- ind = 1;
- // Fill the rest of the shortcut masks with the last used mask
- size_t lastMask = header.Mask(ind - 1);
- while (ind != ExitMaskCount) {
- header.SetMask(ind, lastMask);
- ++ind;
- }
- }
-
- template <class Relocation>
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- bool NoExit(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state)
- {
- return CheckFirstMask(scanner, state, NO_EXIT_MASK);
- }
-
- template <class Relocation>
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- bool NoShortcut(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state)
- {
- return CheckFirstMask(scanner, state, NO_SHORTCUT_MASK);
- }
-
- template <class Relocation>
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- const Word* Run(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state, size_t alignOffset, const Word* begin, const Word* end)
- {
- return MaskChecker<typename Scanner<Relocation, ExitMasks<MaskCount> >::ScannerRowHeader, 0, MaskCount - 1>::Run(scanner.Header(state), alignOffset, begin, end);
- }
+ static const size_t ExitMaskCount = MaskCount;
+ static const size_t Signature = 0x2000 + MaskCount;
+
+ template <class Scanner>
+ struct ExtendedRowHeader {
+ private:
+ /// In order to allow transition table to be aligned at sizeof(size_t) instead of
+ /// sizeof(Word) and still be able to read Masks at Word-aligned addresses each mask
+ /// occupies 2x space and only properly aligned part of it is read
+ enum {
+ SizeTInMaxSizeWord = sizeof(MaxSizeWord) / sizeof(size_t),
+ MaskSizeInSizeT = 2 * SizeTInMaxSizeWord,
+ };
+
+ public:
+ static const size_t ExitMaskCount = MaskCount;
+
+ inline
+ const Word& Mask(size_t i, size_t alignOffset) const
+ {
+ Y_ASSERT(i < ExitMaskCount);
+ Y_ASSERT(alignOffset < SizeTInMaxSizeWord);
+ const Word* p = (const Word*)(ExitMasksArray + alignOffset + MaskSizeInSizeT * i);
+ Y_ASSERT(IsAligned(p, sizeof(Word)));
+ return *p;
+ }
+
+ PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ size_t Mask(size_t i) const
+ {
+ Y_ASSERT(i < ExitMaskCount);
+ return ExitMasksArray[MaskSizeInSizeT*i];
+ }
+
+ void SetMask(size_t i, size_t val)
+ {
+ for (size_t j = 0; j < MaskSizeInSizeT; ++j)
+ ExitMasksArray[MaskSizeInSizeT*i + j] = val;
+ }
+
+ ExtendedRowHeader()
+ {
+ for (size_t i = 0; i < ExitMaskCount; ++i)
+ SetMask(i, NO_SHORTCUT_MASK);
+ }
+
+ template <class OtherScanner>
+ ExtendedRowHeader& operator =(const ExtendedRowHeader<OtherScanner>& other)
+ {
+ PIRE_STATIC_ASSERT(ExitMaskCount == ExtendedRowHeader<OtherScanner>::ExitMaskCount);
+ Common = other.Common;
+ for (size_t i = 0; i < ExitMaskCount; ++i)
+ SetMask(i, other.Mask(i));
+ return *this;
+ }
+
+ private:
+ /// If this state loops for all letters except particular set
+ /// (common thing when matching something like /.*[Aa]/),
+ /// each ExitMask contains that letter in each byte of size_t.
+ ///
+ /// These masks are most commonly used for fast forwarding through parts
+ /// of the string matching /.*/ somewhere in the middle of a regexp.
+ size_t ExitMasksArray[ExitMaskCount * MaskSizeInSizeT];
+
+ public:
+ typename Scanner::CommonRowHeader Common;
+ };
+
+ template <class Header>
+ static void SetNoExit(Header& header)
+ {
+ header.SetMask(0, NO_EXIT_MASK);
+ }
+
+ template <class Header>
+ static void SetNoShortcut(Header& header)
+ {
+ header.SetMask(0, NO_SHORTCUT_MASK);
+ }
+
+ template <class Header>
+ static void SetMask(Header& header, size_t ind, char c)
+ {
+ header.SetMask(ind, FillSizeT(c));
+ }
+
+ template <class Header>
+ static void FinishMasks(Header& header, size_t ind)
+ {
+ if (ind == 0)
+ ind = 1;
+ // Fill the rest of the shortcut masks with the last used mask
+ size_t lastMask = header.Mask(ind - 1);
+ while (ind != ExitMaskCount) {
+ header.SetMask(ind, lastMask);
+ ++ind;
+ }
+ }
+
+ template <class Relocation>
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ bool NoExit(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state)
+ {
+ return CheckFirstMask(scanner, state, NO_EXIT_MASK);
+ }
+
+ template <class Relocation>
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ bool NoShortcut(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state)
+ {
+ return CheckFirstMask(scanner, state, NO_SHORTCUT_MASK);
+ }
+
+ template <class Relocation>
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ const Word* Run(const Scanner<Relocation, ExitMasks<MaskCount> >& scanner, typename Scanner<Relocation, ExitMasks<MaskCount> >::State state, size_t alignOffset, const Word* begin, const Word* end)
+ {
+ return MaskChecker<typename Scanner<Relocation, ExitMasks<MaskCount> >::ScannerRowHeader, 0, MaskCount - 1>::Run(scanner.Header(state), alignOffset, begin, end);
+ }
};
@@ -817,57 +817,57 @@ public:
// Shortcutting policy that doesn't do shortcuts
struct NoShortcuts {
- static const size_t ExitMaskCount = 0;
- static const size_t Signature = 0x1000;
-
- template <class Scanner>
- struct ExtendedRowHeader {
- typename Scanner::CommonRowHeader Common;
-
- template <class OtherScanner>
- ExtendedRowHeader& operator =(const ExtendedRowHeader<OtherScanner>& other)
- {
- PIRE_STATIC_ASSERT(sizeof(ExtendedRowHeader) == sizeof(ExtendedRowHeader<OtherScanner>));
- Common = other.Common;
- return *this;
- }
- };
-
- template <class Header>
- static void SetNoExit(Header&) {}
-
- template <class Header>
- static void SetNoShortcut(Header&) {}
-
- template <class Header>
- static void SetMask(Header&, size_t, char) {}
-
- template <class Header>
- static void FinishMasks(Header&, size_t) {}
-
- template <class Relocation>
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- bool NoExit(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State)
- {
- // Cannot exit prematurely
- return false;
- }
-
- template <class Relocation>
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- bool NoShortcut(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State)
- {
- // There's no shortcut regardless of the state
- return true;
- }
-
- template <class Relocation>
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- const Word* Run(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State, size_t, const Word* begin, const Word*)
- {
- // Stop shortcutting right at the beginning
- return begin;
- }
+ static const size_t ExitMaskCount = 0;
+ static const size_t Signature = 0x1000;
+
+ template <class Scanner>
+ struct ExtendedRowHeader {
+ typename Scanner::CommonRowHeader Common;
+
+ template <class OtherScanner>
+ ExtendedRowHeader& operator =(const ExtendedRowHeader<OtherScanner>& other)
+ {
+ PIRE_STATIC_ASSERT(sizeof(ExtendedRowHeader) == sizeof(ExtendedRowHeader<OtherScanner>));
+ Common = other.Common;
+ return *this;
+ }
+ };
+
+ template <class Header>
+ static void SetNoExit(Header&) {}
+
+ template <class Header>
+ static void SetNoShortcut(Header&) {}
+
+ template <class Header>
+ static void SetMask(Header&, size_t, char) {}
+
+ template <class Header>
+ static void FinishMasks(Header&, size_t) {}
+
+ template <class Relocation>
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ bool NoExit(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State)
+ {
+ // Cannot exit prematurely
+ return false;
+ }
+
+ template <class Relocation>
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ bool NoShortcut(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State)
+ {
+ // There's no shortcut regardless of the state
+ return true;
+ }
+
+ template <class Relocation>
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ const Word* Run(const Scanner<Relocation, NoShortcuts>&, typename Scanner<Relocation, NoShortcuts>::State, size_t, const Word* begin, const Word*)
+ {
+ // Stop shortcutting right at the beginning
+ return begin;
+ }
};
#ifndef PIRE_DEBUG
@@ -877,120 +877,120 @@ struct NoShortcuts {
// Manually unrolled code proves to be faster
template <class Scanner, unsigned Count>
struct MultiChunk {
- // Process Word-sized chunk which consist of >=1 size_t-sized chunks
- template<class Pred>
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- Action Process(const Scanner& scanner, typename Scanner::State& state, const size_t* p, Pred pred)
- {
- if (RunChunk(scanner, state, p, 0, sizeof(void*), pred) == Continue)
- return MultiChunk<Scanner, Count-1>::Process(scanner, state, ++p, pred);
- else
- return Stop;
- }
+ // Process Word-sized chunk which consists of >=1 size_t-sized chunks
+ template<class Pred>
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ Action Process(const Scanner& scanner, typename Scanner::State& state, const size_t* p, Pred pred)
+ {
+ if (RunChunk(scanner, state, p, 0, sizeof(void*), pred) == Continue)
+ return MultiChunk<Scanner, Count-1>::Process(scanner, state, ++p, pred);
+ else
+ return Stop;
+ }
};
template <class Scanner>
struct MultiChunk<Scanner, 0> {
- // Process Word-sized chunk which consist of >=1 size_t-sized chunks
- template<class Pred>
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- Action Process(const Scanner&, typename Scanner::State, const size_t*, Pred)
- {
- return Continue;
- }
+ // Process Word-sized chunk which consists of >=1 size_t-sized chunks
+ template<class Pred>
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ Action Process(const Scanner&, typename Scanner::State, const size_t*, Pred)
+ {
+ return Continue;
+ }
};
// Efficiently runs a scanner through size_t-aligned memory range
template<class Relocation, class Shortcutting>
struct AlignedRunner< Scanner<Relocation, Shortcutting> > {
private:
- typedef Scanner<Relocation, Shortcutting> ScannerType;
-
- // Processes Word-sized chuck of memory (depending on the platform a Word might
- // consist of multiple size_t chuncks)
- template <class Pred>
- static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
- Action RunMultiChunk(const ScannerType& scanner, typename ScannerType::State& st, const size_t* begin, Pred pred)
- {
- return MultiChunk<ScannerType, sizeof(Word)/sizeof(size_t)>::Process(scanner, st, begin, pred);
- }
-
- // Asserts if the scanner changes state while processing the byte range that is
- // supposed to be skipped by a shortcut
- static void ValidateSkip(const ScannerType& scanner, typename ScannerType::State st, const char* begin, const char* end)
- {
- typename ScannerType::State stateBefore = st;
- for (const char* pos = begin; pos != end; ++pos) {
- Step(scanner, st, (unsigned char)*pos);
- Y_ASSERT(st == stateBefore);
- }
- }
+ typedef Scanner<Relocation, Shortcutting> ScannerType;
+
+ // Processes Word-sized chunk of memory (depending on the platform a Word might
+ // consist of multiple size_t chunks)
+ template <class Pred>
+ static PIRE_FORCED_INLINE PIRE_HOT_FUNCTION
+ Action RunMultiChunk(const ScannerType& scanner, typename ScannerType::State& st, const size_t* begin, Pred pred)
+ {
+ return MultiChunk<ScannerType, sizeof(Word)/sizeof(size_t)>::Process(scanner, st, begin, pred);
+ }
+
+ // Asserts if the scanner changes state while processing the byte range that is
+ // supposed to be skipped by a shortcut
+ static void ValidateSkip(const ScannerType& scanner, typename ScannerType::State st, const char* begin, const char* end)
+ {
+ typename ScannerType::State stateBefore = st;
+ for (const char* pos = begin; pos != end; ++pos) {
+ Step(scanner, st, (unsigned char)*pos);
+ Y_ASSERT(st == stateBefore);
+ }
+ }
public:
- template<class Pred>
- static inline PIRE_HOT_FUNCTION
- Action RunAligned(const ScannerType& scanner, typename ScannerType::State& st, const size_t* begin, const size_t* end , Pred pred)
- {
- typename ScannerType::State state = st;
- const Word* head = AlignUp((const Word*) begin, sizeof(Word));
- const Word* tail = AlignDown((const Word*) end, sizeof(Word));
- for (; begin != (const size_t*) head && begin != end; ++begin)
- if (RunChunk(scanner, state, begin, 0, sizeof(void*), pred) == Stop) {
- st = state;
- return Stop;
- }
-
- if (begin == end) {
- st = state;
- return Continue;
- }
- if (Shortcutting::NoExit(scanner, state)) {
- st = state;
- return pred(scanner, state, ((const char*) end));
- }
-
- // Row size should be a multiple of MaxSizeWord size. Then alignOffset is the same for any state
- Y_ASSERT((scanner.RowSize()*sizeof(typename ScannerType::Transition)) % sizeof(MaxSizeWord) == 0);
- size_t alignOffset = (AlignUp((size_t)scanner.m_transitions, sizeof(Word)) - (size_t)scanner.m_transitions) / sizeof(size_t);
-
- bool noShortcut = Shortcutting::NoShortcut(scanner, state);
-
- while (true) {
- // Do normal processing until a shortcut is possible
- while (noShortcut && head != tail) {
- if (RunMultiChunk(scanner, state, (const size_t*)head, pred) == Stop) {
- st = state;
- return Stop;
- }
- ++head;
- noShortcut = Shortcutting::NoShortcut(scanner, state);
- }
- if (head == tail)
- break;
-
- if (Shortcutting::NoExit(scanner, state)) {
- st = state;
- return pred(scanner, state, ((const char*) end));
- }
-
- // Do fast forwarding while it is possible
- const Word* skipEnd = Shortcutting::Run(scanner, state, alignOffset, head, tail);
- PIRE_IF_CHECKED(ValidateSkip(scanner, state, (const char*)head, (const char*)skipEnd));
- head = skipEnd;
- noShortcut = true;
- }
-
- for (size_t* p = (size_t*) tail; p != end; ++p) {
- if (RunChunk(scanner, state, p, 0, sizeof(void*), pred) == Stop) {
- st = state;
- return Stop;
- }
- }
-
- st = state;
- return Continue;
- }
+ template<class Pred>
+ static inline PIRE_HOT_FUNCTION
+ Action RunAligned(const ScannerType& scanner, typename ScannerType::State& st, const size_t* begin, const size_t* end , Pred pred)
+ {
+ typename ScannerType::State state = st;
+ const Word* head = AlignUp((const Word*) begin, sizeof(Word));
+ const Word* tail = AlignDown((const Word*) end, sizeof(Word));
+ for (; begin != (const size_t*) head && begin != end; ++begin)
+ if (RunChunk(scanner, state, begin, 0, sizeof(void*), pred) == Stop) {
+ st = state;
+ return Stop;
+ }
+
+ if (begin == end) {
+ st = state;
+ return Continue;
+ }
+ if (Shortcutting::NoExit(scanner, state)) {
+ st = state;
+ return pred(scanner, state, ((const char*) end));
+ }
+
+ // Row size should be a multiple of MaxSizeWord size. Then alignOffset is the same for any state
+ Y_ASSERT((scanner.RowSize()*sizeof(typename ScannerType::Transition)) % sizeof(MaxSizeWord) == 0);
+ size_t alignOffset = (AlignUp((size_t)scanner.m_transitions, sizeof(Word)) - (size_t)scanner.m_transitions) / sizeof(size_t);
+
+ bool noShortcut = Shortcutting::NoShortcut(scanner, state);
+
+ while (true) {
+ // Do normal processing until a shortcut is possible
+ while (noShortcut && head != tail) {
+ if (RunMultiChunk(scanner, state, (const size_t*)head, pred) == Stop) {
+ st = state;
+ return Stop;
+ }
+ ++head;
+ noShortcut = Shortcutting::NoShortcut(scanner, state);
+ }
+ if (head == tail)
+ break;
+
+ if (Shortcutting::NoExit(scanner, state)) {
+ st = state;
+ return pred(scanner, state, ((const char*) end));
+ }
+
+ // Do fast forwarding while it is possible
+ const Word* skipEnd = Shortcutting::Run(scanner, state, alignOffset, head, tail);
+ PIRE_IF_CHECKED(ValidateSkip(scanner, state, (const char*)head, (const char*)skipEnd));
+ head = skipEnd;
+ noShortcut = true;
+ }
+
+ for (size_t* p = (size_t*) tail; p != end; ++p) {
+ if (RunChunk(scanner, state, p, 0, sizeof(void*), pred) == Stop) {
+ st = state;
+ return Stop;
+ }
+ }
+
+ st = state;
+ return Continue;
+ }
};
#endif
@@ -998,64 +998,64 @@ public:
template<class Scanner>
class ScannerGlueTask: public ScannerGlueCommon<Scanner> {
public:
- typedef ScannerGlueCommon<Scanner> Base;
- typedef typename Base::State State;
- using Base::Lhs;
- using Base::Rhs;
- using Base::Sc;
- using Base::Letters;
-
- typedef GluedStateLookupTable<256*1024, typename Scanner::State> InvStates;
-
- ScannerGlueTask(const Scanner& lhs, const Scanner& rhs)
- : ScannerGlueCommon<Scanner>(lhs, rhs, LettersEquality<Scanner>(lhs.m_letters, rhs.m_letters))
- {
- }
-
- void AcceptStates(const TVector<State>& states)
- {
- // Make up a new scanner and fill in the final table
-
- size_t finalTableSize = 0;
- for (auto&& i : states)
- finalTableSize += RangeLen(Lhs().AcceptedRegexps(i.first)) + RangeLen(Rhs().AcceptedRegexps(i.second));
- this->SetSc(THolder<Scanner>(new Scanner));
- Sc().Init(states.size(), Letters(), finalTableSize, size_t(0), Lhs().RegexpsCount() + Rhs().RegexpsCount());
-
- auto finalWriter = Sc().m_final;
- for (size_t state = 0; state != states.size(); ++state) {
- Sc().m_finalIndex[state] = finalWriter - Sc().m_final;
- finalWriter = Shift(Lhs().AcceptedRegexps(states[state].first), 0, finalWriter);
- finalWriter = Shift(Rhs().AcceptedRegexps(states[state].second), Lhs().RegexpsCount(), finalWriter);
- *finalWriter++ = static_cast<size_t>(-1);
-
- Sc().SetTag(state, ((Lhs().Final(states[state].first) || Rhs().Final(states[state].second)) ? Scanner::FinalFlag : 0)
- | ((Lhs().Dead(states[state].first) && Rhs().Dead(states[state].second)) ? Scanner::DeadFlag : 0));
- }
- }
-
- void Connect(size_t from, size_t to, Char letter) { Sc().SetJump(from, letter, to); }
-
- const Scanner& Success()
- {
- Sc().BuildShortcuts();
- return Sc();
- }
+ typedef ScannerGlueCommon<Scanner> Base;
+ typedef typename Base::State State;
+ using Base::Lhs;
+ using Base::Rhs;
+ using Base::Sc;
+ using Base::Letters;
+
+ typedef GluedStateLookupTable<256*1024, typename Scanner::State> InvStates;
+
+ ScannerGlueTask(const Scanner& lhs, const Scanner& rhs)
+ : ScannerGlueCommon<Scanner>(lhs, rhs, LettersEquality<Scanner>(lhs.m_letters, rhs.m_letters))
+ {
+ }
+
+ void AcceptStates(const TVector<State>& states)
+ {
+ // Make up a new scanner and fill in the final table
+
+ size_t finalTableSize = 0;
+ for (auto&& i : states)
+ finalTableSize += RangeLen(Lhs().AcceptedRegexps(i.first)) + RangeLen(Rhs().AcceptedRegexps(i.second));
+ this->SetSc(THolder<Scanner>(new Scanner));
+ Sc().Init(states.size(), Letters(), finalTableSize, size_t(0), Lhs().RegexpsCount() + Rhs().RegexpsCount());
+
+ auto finalWriter = Sc().m_final;
+ for (size_t state = 0; state != states.size(); ++state) {
+ Sc().m_finalIndex[state] = finalWriter - Sc().m_final;
+ finalWriter = Shift(Lhs().AcceptedRegexps(states[state].first), 0, finalWriter);
+ finalWriter = Shift(Rhs().AcceptedRegexps(states[state].second), Lhs().RegexpsCount(), finalWriter);
+ *finalWriter++ = static_cast<size_t>(-1);
+
+ Sc().SetTag(state, ((Lhs().Final(states[state].first) || Rhs().Final(states[state].second)) ? Scanner::FinalFlag : 0)
+ | ((Lhs().Dead(states[state].first) && Rhs().Dead(states[state].second)) ? Scanner::DeadFlag : 0));
+ }
+ }
+
+ void Connect(size_t from, size_t to, Char letter) { Sc().SetJump(from, letter, to); }
+
+ const Scanner& Success()
+ {
+ Sc().BuildShortcuts();
+ return Sc();
+ }
private:
- template<class Iter>
- size_t RangeLen(ypair<Iter, Iter> range) const
- {
- return std::distance(range.first, range.second);
- }
-
- template<class Iter, class OutIter>
- OutIter Shift(ypair<Iter, Iter> range, size_t shift, OutIter out) const
- {
- for (; range.first != range.second; ++range.first, ++out)
- *out = *range.first + shift;
- return out;
- }
+ template<class Iter>
+ size_t RangeLen(ypair<Iter, Iter> range) const
+ {
+ return std::distance(range.first, range.second);
+ }
+
+ template<class Iter, class OutIter>
+ OutIter Shift(ypair<Iter, Iter> range, size_t shift, OutIter out) const
+ {
+ for (; range.first != range.second; ++range.first, ++out)
+ *out = *range.first + shift;
+ return out;
+ }
};
}
@@ -1064,35 +1064,35 @@ private:
template<class Relocation, class Shortcutting>
struct StDumper< Impl::Scanner<Relocation, Shortcutting> > {
- typedef Impl::Scanner<Relocation, Shortcutting> ScannerType;
+ typedef Impl::Scanner<Relocation, Shortcutting> ScannerType;
- StDumper(const ScannerType& sc, typename ScannerType::State st): m_sc(&sc), m_st(st) {}
+ StDumper(const ScannerType& sc, typename ScannerType::State st): m_sc(&sc), m_st(st) {}
- void Dump(yostream& stream) const
- {
- stream << m_sc->StateIndex(m_st);
- if (m_sc->Final(m_st))
- stream << " [final]";
- if (m_sc->Dead(m_st))
- stream << " [dead]";
- }
+ void Dump(yostream& stream) const
+ {
+ stream << m_sc->StateIndex(m_st);
+ if (m_sc->Final(m_st))
+ stream << " [final]";
+ if (m_sc->Dead(m_st))
+ stream << " [dead]";
+ }
private:
- const ScannerType* m_sc;
- typename ScannerType::State m_st;
+ const ScannerType* m_sc;
+ typename ScannerType::State m_st;
};
template<class Relocation, class Shortcutting>
Impl::Scanner<Relocation, Shortcutting> Impl::Scanner<Relocation, Shortcutting>::Glue(const Impl::Scanner<Relocation, Shortcutting>& lhs, const Impl::Scanner<Relocation, Shortcutting>& rhs, size_t maxSize /* = 0 */)
{
- if (lhs.Empty())
- return rhs;
- if (rhs.Empty())
- return lhs;
-
- static const size_t DefMaxSize = 80000;
- Impl::ScannerGlueTask< Impl::Scanner<Relocation, Shortcutting> > task(lhs, rhs);
- return Impl::Determine(task, maxSize ? maxSize : DefMaxSize);
+ if (lhs.Empty())
+ return rhs;
+ if (rhs.Empty())
+ return lhs;
+
+ static const size_t DefMaxSize = 80000;
+ Impl::ScannerGlueTask< Impl::Scanner<Relocation, Shortcutting> > task(lhs, rhs);
+ return Impl::Determine(task, maxSize ? maxSize : DefMaxSize);
}
@@ -1118,13 +1118,13 @@ typedef Impl::Scanner<Impl::Nonrelocatable, Impl::NoShortcuts> NonrelocScannerNo
}
namespace std {
- inline void swap(Pire::Scanner& a, Pire::Scanner& b) {
- a.Swap(b);
- }
+ inline void swap(Pire::Scanner& a, Pire::Scanner& b) {
+ a.Swap(b);
+ }
- inline void swap(Pire::NonrelocScanner& a, Pire::NonrelocScanner& b) {
- a.Swap(b);
- }
+ inline void swap(Pire::NonrelocScanner& a, Pire::NonrelocScanner& b) {
+ a.Swap(b);
+ }
}
diff --git a/library/cpp/regex/pire/pire/scanners/pair.h b/library/cpp/regex/pire/pire/scanners/pair.h
index c12338a2a06..1c96e5dc0da 100644
--- a/library/cpp/regex/pire/pire/scanners/pair.h
+++ b/library/cpp/regex/pire/pire/scanners/pair.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -30,70 +30,70 @@ namespace Pire {
* If you need to run two scanners on the same string, using ScannerPair
* is usually faster then running those scanners sequentially.
*/
- template<class Scanner1, class Scanner2>
- class ScannerPair {
- public:
- typedef ypair<typename Scanner1::State, typename Scanner2::State> State;
- typedef ypair<typename Scanner1::Action, typename Scanner2::Action> Action;
-
- ScannerPair()
- : m_scanner1()
- , m_scanner2()
- {
- }
- ScannerPair(const Scanner1& s1, const Scanner2& s2)
- : m_scanner1(&s1)
- , m_scanner2(&s2)
- {
- }
-
- void Initialize(State& state) const
- {
- m_scanner1->Initialize(state.first);
- m_scanner2->Initialize(state.second);
- }
-
- Action Next(State& state, Char ch) const
- {
- return ymake_pair(
- m_scanner1->Next(state.first, ch),
- m_scanner2->Next(state.second, ch)
- );
- }
-
- void TakeAction(State& s, Action a) const
- {
- m_scanner1->TakeAction(s.first, a.first);
- m_scanner2->TakeAction(s.second, a.second);
- }
-
- bool Final(const State& state) const
- {
- return m_scanner1->Final(state.first) || m_scanner2->Final(state.second);
- }
-
- bool Dead(const State& state) const
- {
- return m_scanner1->Dead(state.first) && m_scanner2->Dead(state.second);
- }
-
- ypair<size_t, size_t> StateIndex(const State& state) const
- {
- return ymake_pair(m_scanner1->StateIndex(state.first), m_scanner2->StateIndex(state.second));
- }
-
- Scanner1& First() { return *m_scanner1; }
- Scanner2& Second() { return *m_scanner2; }
-
- const Scanner1& First() const { return *m_scanner1; }
- const Scanner2& Second() const { return *m_scanner2; }
-
- private:
- const Scanner1* m_scanner1;
- const Scanner2* m_scanner2;
- };
-
-
+ template<class Scanner1, class Scanner2>
+ class ScannerPair {
+ public:
+ typedef ypair<typename Scanner1::State, typename Scanner2::State> State;
+ typedef ypair<typename Scanner1::Action, typename Scanner2::Action> Action;
+
+ ScannerPair()
+ : m_scanner1()
+ , m_scanner2()
+ {
+ }
+ ScannerPair(const Scanner1& s1, const Scanner2& s2)
+ : m_scanner1(&s1)
+ , m_scanner2(&s2)
+ {
+ }
+
+ void Initialize(State& state) const
+ {
+ m_scanner1->Initialize(state.first);
+ m_scanner2->Initialize(state.second);
+ }
+
+ Action Next(State& state, Char ch) const
+ {
+ return ymake_pair(
+ m_scanner1->Next(state.first, ch),
+ m_scanner2->Next(state.second, ch)
+ );
+ }
+
+ void TakeAction(State& s, Action a) const
+ {
+ m_scanner1->TakeAction(s.first, a.first);
+ m_scanner2->TakeAction(s.second, a.second);
+ }
+
+ bool Final(const State& state) const
+ {
+ return m_scanner1->Final(state.first) || m_scanner2->Final(state.second);
+ }
+
+ bool Dead(const State& state) const
+ {
+ return m_scanner1->Dead(state.first) && m_scanner2->Dead(state.second);
+ }
+
+ ypair<size_t, size_t> StateIndex(const State& state) const
+ {
+ return ymake_pair(m_scanner1->StateIndex(state.first), m_scanner2->StateIndex(state.second));
+ }
+
+ Scanner1& First() { return *m_scanner1; }
+ Scanner2& Second() { return *m_scanner2; }
+
+ const Scanner1& First() const { return *m_scanner1; }
+ const Scanner2& Second() const { return *m_scanner2; }
+
+ private:
+ const Scanner1* m_scanner1;
+ const Scanner2* m_scanner2;
+ };
+
+
}
#endif
diff --git a/library/cpp/regex/pire/pire/scanners/simple.h b/library/cpp/regex/pire/pire/scanners/simple.h
index 9a5978dda21..85a77dcf728 100644
--- a/library/cpp/regex/pire/pire/scanners/simple.h
+++ b/library/cpp/regex/pire/pire/scanners/simple.h
@@ -39,220 +39,220 @@ namespace Pire {
*/
class SimpleScanner {
private:
- static const size_t STATE_ROW_SIZE = MaxChar + 1; // All characters + 1 element to store final state flag
+ static const size_t STATE_ROW_SIZE = MaxChar + 1; // All characters + 1 element to store final state flag
public:
- typedef size_t Transition;
- typedef ui16 Letter;
- typedef ui32 Action;
- typedef ui8 Tag;
-
- SimpleScanner() { Alias(Null()); }
-
- explicit SimpleScanner(Fsm& fsm, size_t distance = 0);
-
- size_t Size() const { return m.statesCount; }
- bool Empty() const { return m_transitions == Null().m_transitions; }
-
- typedef size_t State;
-
- size_t RegexpsCount() const { return Empty() ? 0 : 1; }
- size_t LettersCount() const { return MaxChar; }
-
- /// Checks whether specified state is in any of the final sets
- bool Final(const State& state) const { return *(((const Transition*) state) - 1) != 0; }
-
- bool Dead(const State&) const { return false; }
-
- ypair<const size_t*, const size_t*> AcceptedRegexps(const State& s) const {
- return Final(s) ? Accept() : Deny();
- }
-
- /// returns an initial state for this scanner
- void Initialize(State& state) const { state = m.initial; }
-
- /// Handles one characters
- Action Next(State& state, Char c) const
- {
- Transition shift = reinterpret_cast<const Transition*>(state)[c];
- state += shift;
- return 0;
- }
-
- bool TakeAction(State&, Action) const { return false; }
-
- SimpleScanner(const SimpleScanner& s): m(s.m)
- {
- if (!s.m_buffer) {
- // Empty or mmap()-ed scanner, just copy pointers
- m_buffer = 0;
- m_transitions = s.m_transitions;
- } else {
- // In-memory scanner, perform deep copy
- m_buffer = BufferType(new char[BufSize()]);
- memcpy(m_buffer.Get(), s.m_buffer.Get(), BufSize());
- Markup(m_buffer.Get());
-
- m.initial += (m_transitions - s.m_transitions) * sizeof(Transition);
- }
- }
-
- // Makes a shallow ("weak") copy of the given scanner.
- // The copied scanner does not maintain lifetime of the original's entrails.
- void Alias(const SimpleScanner& s)
- {
- m = s.m;
- m_buffer.Reset();
- m_transitions = s.m_transitions;
- }
-
- void Swap(SimpleScanner& s)
- {
- DoSwap(m_buffer, s.m_buffer);
- DoSwap(m.statesCount, s.m.statesCount);
- DoSwap(m.initial, s.m.initial);
- DoSwap(m_transitions, s.m_transitions);
- }
-
- SimpleScanner& operator = (const SimpleScanner& s) { SimpleScanner(s).Swap(*this); return *this; }
-
- ~SimpleScanner() = default;
-
- /*
- * Constructs the scanner from mmap()-ed memory range, returning a pointer
- * to unconsumed part of the buffer.
- */
- const void* Mmap(const void* ptr, size_t size)
- {
- Impl::CheckAlign(ptr);
- SimpleScanner s;
-
- const size_t* p = reinterpret_cast<const size_t*>(ptr);
- Impl::ValidateHeader(p, size, ScannerIOTypes::SimpleScanner, sizeof(m));
- if (size < sizeof(s.m))
- throw Error("EOF reached while mapping NPire::Scanner");
-
- memcpy(&s.m, p, sizeof(s.m));
- Impl::AdvancePtr(p, size, sizeof(s.m));
- Impl::AlignPtr(p, size);
-
- bool empty = *((const bool*) p);
- Impl::AdvancePtr(p, size, sizeof(empty));
- Impl::AlignPtr(p, size);
-
- if (empty)
- s.Alias(Null());
- else {
- if (size < s.BufSize())
- throw Error("EOF reached while mapping NPire::Scanner");
- s.Markup(const_cast<size_t*>(p));
- s.m.initial += reinterpret_cast<size_t>(s.m_transitions);
-
- Swap(s);
- Impl::AdvancePtr(p, size, BufSize());
- }
- return Impl::AlignPtr(p, size);
- }
-
- size_t StateIndex(State s) const
- {
- return (s - reinterpret_cast<size_t>(m_transitions)) / (STATE_ROW_SIZE * sizeof(Transition));
- }
-
- // Returns the size of the memory buffer used (or required) by scanner.
- size_t BufSize() const
- {
- return STATE_ROW_SIZE * m.statesCount * sizeof(Transition); // Transitions table
- }
-
- void Save(yostream*) const;
- void Load(yistream*);
+ typedef size_t Transition;
+ typedef ui16 Letter;
+ typedef ui32 Action;
+ typedef ui8 Tag;
+
+ SimpleScanner() { Alias(Null()); }
+
+ explicit SimpleScanner(Fsm& fsm, size_t distance = 0);
+
+ size_t Size() const { return m.statesCount; }
+ bool Empty() const { return m_transitions == Null().m_transitions; }
+
+ typedef size_t State;
+
+ size_t RegexpsCount() const { return Empty() ? 0 : 1; }
+ size_t LettersCount() const { return MaxChar; }
+
+ /// Checks whether specified state is in any of the final sets
+ bool Final(const State& state) const { return *(((const Transition*) state) - 1) != 0; }
+
+ bool Dead(const State&) const { return false; }
+
+ ypair<const size_t*, const size_t*> AcceptedRegexps(const State& s) const {
+ return Final(s) ? Accept() : Deny();
+ }
+
+ /// returns an initial state for this scanner
+ void Initialize(State& state) const { state = m.initial; }
+
+ /// Handles one characters
+ Action Next(State& state, Char c) const
+ {
+ Transition shift = reinterpret_cast<const Transition*>(state)[c];
+ state += shift;
+ return 0;
+ }
+
+ bool TakeAction(State&, Action) const { return false; }
+
+ SimpleScanner(const SimpleScanner& s): m(s.m)
+ {
+ if (!s.m_buffer) {
+ // Empty or mmap()-ed scanner, just copy pointers
+ m_buffer = 0;
+ m_transitions = s.m_transitions;
+ } else {
+ // In-memory scanner, perform deep copy
+ m_buffer = BufferType(new char[BufSize()]);
+ memcpy(m_buffer.Get(), s.m_buffer.Get(), BufSize());
+ Markup(m_buffer.Get());
+
+ m.initial += (m_transitions - s.m_transitions) * sizeof(Transition);
+ }
+ }
+
+ // Makes a shallow ("weak") copy of the given scanner.
+ // The copied scanner does not maintain lifetime of the original's entrails.
+ void Alias(const SimpleScanner& s)
+ {
+ m = s.m;
+ m_buffer.Reset();
+ m_transitions = s.m_transitions;
+ }
+
+ void Swap(SimpleScanner& s)
+ {
+ DoSwap(m_buffer, s.m_buffer);
+ DoSwap(m.statesCount, s.m.statesCount);
+ DoSwap(m.initial, s.m.initial);
+ DoSwap(m_transitions, s.m_transitions);
+ }
+
+ SimpleScanner& operator = (const SimpleScanner& s) { SimpleScanner(s).Swap(*this); return *this; }
+
+ ~SimpleScanner() = default;
+
+ /*
+ * Constructs the scanner from mmap()-ed memory range, returning a pointer
+ * to unconsumed part of the buffer.
+ */
+ const void* Mmap(const void* ptr, size_t size)
+ {
+ Impl::CheckAlign(ptr);
+ SimpleScanner s;
+
+ const size_t* p = reinterpret_cast<const size_t*>(ptr);
+ Impl::ValidateHeader(p, size, ScannerIOTypes::SimpleScanner, sizeof(m));
+ if (size < sizeof(s.m))
+ throw Error("EOF reached while mapping NPire::Scanner");
+
+ memcpy(&s.m, p, sizeof(s.m));
+ Impl::AdvancePtr(p, size, sizeof(s.m));
+ Impl::AlignPtr(p, size);
+
+ bool empty = *((const bool*) p);
+ Impl::AdvancePtr(p, size, sizeof(empty));
+ Impl::AlignPtr(p, size);
+
+ if (empty)
+ s.Alias(Null());
+ else {
+ if (size < s.BufSize())
+ throw Error("EOF reached while mapping NPire::Scanner");
+ s.Markup(const_cast<size_t*>(p));
+ s.m.initial += reinterpret_cast<size_t>(s.m_transitions);
+
+ Swap(s);
+ Impl::AdvancePtr(p, size, BufSize());
+ }
+ return Impl::AlignPtr(p, size);
+ }
+
+ size_t StateIndex(State s) const
+ {
+ return (s - reinterpret_cast<size_t>(m_transitions)) / (STATE_ROW_SIZE * sizeof(Transition));
+ }
+
+ // Returns the size of the memory buffer used (or required) by scanner.
+ size_t BufSize() const
+ {
+ return STATE_ROW_SIZE * m.statesCount * sizeof(Transition); // Transitions table
+ }
+
+ void Save(yostream*) const;
+ void Load(yistream*);
protected:
- struct Locals {
- size_t statesCount;
- size_t initial;
- } m;
-
- using BufferType = TArrayHolder<char>;
- BufferType m_buffer;
-
- Transition* m_transitions;
-
- inline static const SimpleScanner& Null()
- {
- static const SimpleScanner n = Fsm::MakeFalse().Compile<SimpleScanner>();
- return n;
- }
-
- static ypair<const size_t*, const size_t*> Accept()
- {
- static size_t v[1] = { 0 };
- return ymake_pair(v, v + 1);
- }
-
- static ypair<const size_t*, const size_t*> Deny()
- {
- static size_t v[1] = { 0 };
- return ymake_pair(v, v);
- }
-
- /*
- * Initializes pointers depending on buffer start, letters and states count
- */
- void Markup(void* ptr)
- {
- m_transitions = reinterpret_cast<Transition*>(ptr);
- }
-
- void SetJump(size_t oldState, Char c, size_t newState)
- {
- Y_ASSERT(m_buffer);
- Y_ASSERT(oldState < m.statesCount);
- Y_ASSERT(newState < m.statesCount);
- m_transitions[oldState * STATE_ROW_SIZE + 1 + c]
- = (((newState - oldState) * STATE_ROW_SIZE) * sizeof(Transition));
- }
-
- unsigned long RemapAction(unsigned long action) { return action; }
-
- void SetInitial(size_t state)
- {
- Y_ASSERT(m_buffer);
- m.initial = reinterpret_cast<size_t>(m_transitions + state * STATE_ROW_SIZE + 1);
- }
-
- void SetTag(size_t state, size_t tag)
- {
- Y_ASSERT(m_buffer);
- m_transitions[state * STATE_ROW_SIZE] = tag;
- }
+ struct Locals {
+ size_t statesCount;
+ size_t initial;
+ } m;
+
+ using BufferType = TArrayHolder<char>;
+ BufferType m_buffer;
+
+ Transition* m_transitions;
+
+ inline static const SimpleScanner& Null()
+ {
+ static const SimpleScanner n = Fsm::MakeFalse().Compile<SimpleScanner>();
+ return n;
+ }
+
+ static ypair<const size_t*, const size_t*> Accept()
+ {
+ static size_t v[1] = { 0 };
+ return ymake_pair(v, v + 1);
+ }
+
+ static ypair<const size_t*, const size_t*> Deny()
+ {
+ static size_t v[1] = { 0 };
+ return ymake_pair(v, v);
+ }
+
+ /*
+ * Initializes pointers depending on buffer start, letters and states count
+ */
+ void Markup(void* ptr)
+ {
+ m_transitions = reinterpret_cast<Transition*>(ptr);
+ }
+
+ void SetJump(size_t oldState, Char c, size_t newState)
+ {
+ Y_ASSERT(m_buffer);
+ Y_ASSERT(oldState < m.statesCount);
+ Y_ASSERT(newState < m.statesCount);
+ m_transitions[oldState * STATE_ROW_SIZE + 1 + c]
+ = (((newState - oldState) * STATE_ROW_SIZE) * sizeof(Transition));
+ }
+
+ unsigned long RemapAction(unsigned long action) { return action; }
+
+ void SetInitial(size_t state)
+ {
+ Y_ASSERT(m_buffer);
+ m.initial = reinterpret_cast<size_t>(m_transitions + state * STATE_ROW_SIZE + 1);
+ }
+
+ void SetTag(size_t state, size_t tag)
+ {
+ Y_ASSERT(m_buffer);
+ m_transitions[state * STATE_ROW_SIZE] = tag;
+ }
};
inline SimpleScanner::SimpleScanner(Fsm& fsm, size_t distance)
{
- if (distance) {
- fsm = CreateApproxFsm(fsm, distance);
- }
- fsm.Canonize();
-
- m.statesCount = fsm.Size();
- m_buffer = BufferType(new char[BufSize()]);
- memset(m_buffer.Get(), 0, BufSize());
- Markup(m_buffer.Get());
- m.initial = reinterpret_cast<size_t>(m_transitions + fsm.Initial() * STATE_ROW_SIZE + 1);
- for (size_t state = 0; state < fsm.Size(); ++state)
- SetTag(state, fsm.Tag(state) | (fsm.IsFinal(state) ? 1 : 0));
-
- for (size_t from = 0; from != fsm.Size(); ++from)
- for (auto&& i : fsm.Letters()) {
- const auto& tos = fsm.Destinations(from, i.first);
- if (tos.empty())
- continue;
- for (auto&& l : i.second.second)
- for (auto&& to : tos)
- SetJump(from, l, to);
- }
+ if (distance) {
+ fsm = CreateApproxFsm(fsm, distance);
+ }
+ fsm.Canonize();
+
+ m.statesCount = fsm.Size();
+ m_buffer = BufferType(new char[BufSize()]);
+ memset(m_buffer.Get(), 0, BufSize());
+ Markup(m_buffer.Get());
+ m.initial = reinterpret_cast<size_t>(m_transitions + fsm.Initial() * STATE_ROW_SIZE + 1);
+ for (size_t state = 0; state < fsm.Size(); ++state)
+ SetTag(state, fsm.Tag(state) | (fsm.IsFinal(state) ? 1 : 0));
+
+ for (size_t from = 0; from != fsm.Size(); ++from)
+ for (auto&& i : fsm.Letters()) {
+ const auto& tos = fsm.Destinations(from, i.first);
+ if (tos.empty())
+ continue;
+ for (auto&& l : i.second.second)
+ for (auto&& to : tos)
+ SetJump(from, l, to);
+ }
}
diff --git a/library/cpp/regex/pire/pire/scanners/slow.h b/library/cpp/regex/pire/pire/scanners/slow.h
index 5f90e4d5dde..0c26499d17e 100644
--- a/library/cpp/regex/pire/pire/scanners/slow.h
+++ b/library/cpp/regex/pire/pire/scanners/slow.h
@@ -51,380 +51,380 @@ namespace Pire {
*/
class SlowScanner {
public:
- typedef size_t Transition;
- typedef ui16 Letter;
- typedef ui32 Action;
- typedef ui8 Tag;
+ typedef size_t Transition;
+ typedef ui16 Letter;
+ typedef ui32 Action;
+ typedef ui8 Tag;
- enum {
- FinalFlag = 1,
- DeadFlag = 0
- };
+ enum {
+ FinalFlag = 1,
+ DeadFlag = 0
+ };
- struct State {
- TVector<unsigned> states;
- BitSet flags;
+ struct State {
+ TVector<unsigned> states;
+ BitSet flags;
- State() {}
- State(size_t size): flags(size) { states.reserve(size); }
- void Swap(State& s) { states.swap(s.states); flags.Swap(s.flags); }
+ State() {}
+ State(size_t size): flags(size) { states.reserve(size); }
+ void Swap(State& s) { states.swap(s.states); flags.Swap(s.flags); }
#ifdef PIRE_DEBUG
- friend yostream& operator << (yostream& stream, const State& state) { return stream << Join(state.states.begin(), state.states.end(), ", "); }
+ friend yostream& operator << (yostream& stream, const State& state) { return stream << Join(state.states.begin(), state.states.end(), ", "); }
#endif
- };
-
- SlowScanner(bool needActions = false) {
- Alias(Null());
- need_actions = needActions;
- }
-
- size_t GetLettersCount() const {return m.lettersCount; };
-
- size_t Size() const { return GetSize(); }
- size_t GetSize() const { return m.statesCount; }
- bool Empty() const { return m_finals == Null().m_finals; }
-
- size_t Id() const {return (size_t) -1;}
- size_t RegexpsCount() const { return Empty() ? 0 : 1; }
-
- void Initialize(State& state) const
- {
- state.states.clear();
- state.states.reserve(m.statesCount);
- state.states.push_back(m.start);
- BitSet(m.statesCount).Swap(state.flags);
- }
-
- Char Translate(Char ch) const
- {
- return m_letters[static_cast<size_t>(ch)];
- }
-
- Action NextTranslated(const State& current, State& next, Char l) const
- {
- next.flags.Clear();
- next.states.clear();
- for (auto&& state : current.states) {
- const unsigned* begin = 0;
- const unsigned* end = 0;
- if (!m_vecptr) {
- const size_t* pos = m_jumpPos + state * m.lettersCount + l;
- begin = m_jumps + pos[0];
- end = m_jumps + pos[1];
- } else {
- const auto& v = (*m_vecptr)[state * m.lettersCount + l];
- if (!v.empty()) {
- begin = &v[0];
- end = &v[0] + v.size();
- }
- }
-
- for (; begin != end; ++begin)
- if (!next.flags.Test(*begin)) {
- next.flags.Set(*begin);
- next.states.push_back(*begin);
- }
- }
-
- return 0;
- }
-
- Action Next(const State& current, State& next, Char c) const
- {
- return NextTranslated(current, next, Translate(c));
- }
-
- bool TakeAction(State&, Action) const { return false; }
-
- Action NextTranslated(State& s, Char l) const
- {
- State dest(m.statesCount);
- Action a = NextTranslated(s, dest, l);
- s.Swap(dest);
- return a;
- }
-
- Action Next(State& s, Char c) const
- {
- return NextTranslated(s, Translate(c));
- }
-
- bool Final(const State& s) const
- {
- for (auto&& state : s.states)
- if (m_finals[state])
- return true;
- return false;
- }
-
- bool Dead(const State&) const
- {
- return false;
- }
-
- ypair<const size_t*, const size_t*> AcceptedRegexps(const State& s) const {
- return Final(s) ? Accept() : Deny();
- }
-
- bool CanStop(const State& s) const {
- return Final(s);
- }
-
- const void* Mmap(const void* ptr, size_t size)
- {
- Impl::CheckAlign(ptr);
- SlowScanner s;
- const size_t* p = reinterpret_cast<const size_t*>(ptr);
-
- Impl::ValidateHeader(p, size, ScannerIOTypes::SlowScanner, sizeof(s.m));
- Locals* locals;
- Impl::MapPtr(locals, 1, p, size);
- memcpy(&s.m, locals, sizeof(s.m));
-
- bool empty = *((const bool*) p);
- Impl::AdvancePtr(p, size, sizeof(empty));
- Impl::AlignPtr(p, size);
-
- if (empty)
- s.Alias(Null());
- else {
- s.m_vecptr = 0;
- Impl::MapPtr(s.m_letters, MaxChar, p, size);
- Impl::MapPtr(s.m_finals, s.m.statesCount, p, size);
- Impl::MapPtr(s.m_jumpPos, s.m.statesCount * s.m.lettersCount + 1, p, size);
- Impl::MapPtr(s.m_jumps, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size);
- if (need_actions)
- Impl::MapPtr(s.m_actions, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size);
- Swap(s);
- }
- return (const void*) p;
- }
-
- void Swap(SlowScanner& s)
- {
- DoSwap(m_finals, s.m_finals);
- DoSwap(m_jumps, s.m_jumps);
- DoSwap(m_actions, s.m_actions);
- DoSwap(m_jumpPos, s.m_jumpPos);
- DoSwap(m.statesCount, s.m.statesCount);
- DoSwap(m.lettersCount, s.m.lettersCount);
- DoSwap(m.start, s.m.start);
- DoSwap(m_letters, s.m_letters);
- DoSwap(m_pool, s.m_pool);
- DoSwap(m_vec, s.m_vec);
-
- DoSwap(m_vecptr, s.m_vecptr);
- DoSwap(need_actions, s.need_actions);
- DoSwap(m_actionsvec, s.m_actionsvec);
- if (m_vecptr == &s.m_vec)
- m_vecptr = &m_vec;
- if (s.m_vecptr == &m_vec)
- s.m_vecptr = &s.m_vec;
- }
-
- SlowScanner(const SlowScanner& s)
- : m(s.m)
- , m_vec(s.m_vec)
- , need_actions(s.need_actions)
- , m_actionsvec(s.m_actionsvec)
- {
- if (s.m_vec.empty()) {
- // Empty or mmap()-ed scanner, just copy pointers
- m_finals = s.m_finals;
- m_jumps = s.m_jumps;
- m_actions = s.m_actions;
- m_jumpPos = s.m_jumpPos;
- m_letters = s.m_letters;
- m_vecptr = 0;
- } else {
- // In-memory scanner, perform deep copy
- alloc(m_letters, MaxChar);
- memcpy(m_letters, s.m_letters, sizeof(*m_letters) * MaxChar);
- m_jumps = 0;
- m_jumpPos = 0;
- m_actions = 0;
- alloc(m_finals, m.statesCount);
- memcpy(m_finals, s.m_finals, sizeof(*m_finals) * m.statesCount);
- m_vecptr = &m_vec;
- }
- }
-
- explicit SlowScanner(Fsm& fsm, bool needActions = false, bool removeEpsilons = true, size_t distance = 0)
- : need_actions(needActions)
- {
- if (distance) {
- fsm = CreateApproxFsm(fsm, distance);
- }
- if (removeEpsilons)
- fsm.RemoveEpsilons();
- fsm.Sparse(!removeEpsilons);
-
- m.statesCount = fsm.Size();
- m.lettersCount = fsm.Letters().Size();
-
- m_vec.resize(m.statesCount * m.lettersCount);
- if (need_actions)
- m_actionsvec.resize(m.statesCount * m.lettersCount);
- m_vecptr = &m_vec;
- alloc(m_letters, MaxChar);
- m_jumps = 0;
- m_actions = 0;
- m_jumpPos = 0;
- alloc(m_finals, m.statesCount);
-
- // Build letter translation table
- Fill(m_letters, m_letters + MaxChar, 0);
- for (auto&& letter : fsm.Letters())
- for (auto&& character : letter.second.second)
- m_letters[character] = letter.second.first;
-
- m.start = fsm.Initial();
- BuildScanner(fsm, *this);
- }
-
-
- SlowScanner& operator = (const SlowScanner& s) { SlowScanner(s).Swap(*this); return *this; }
-
- ~SlowScanner()
- {
- for (auto&& i : m_pool)
- free(i);
- }
-
- void Save(yostream*) const;
- void Load(yistream*);
-
- const State& StateIndex(const State& s) const { return s; }
+ };
+
+ SlowScanner(bool needActions = false) {
+ Alias(Null());
+ need_actions = needActions;
+ }
+
+ size_t GetLettersCount() const {return m.lettersCount; };
+
+ size_t Size() const { return GetSize(); }
+ size_t GetSize() const { return m.statesCount; }
+ bool Empty() const { return m_finals == Null().m_finals; }
+
+ size_t Id() const {return (size_t) -1;}
+ size_t RegexpsCount() const { return Empty() ? 0 : 1; }
+
+ void Initialize(State& state) const
+ {
+ state.states.clear();
+ state.states.reserve(m.statesCount);
+ state.states.push_back(m.start);
+ BitSet(m.statesCount).Swap(state.flags);
+ }
+
+ Char Translate(Char ch) const
+ {
+ return m_letters[static_cast<size_t>(ch)];
+ }
+
+ Action NextTranslated(const State& current, State& next, Char l) const
+ {
+ next.flags.Clear();
+ next.states.clear();
+ for (auto&& state : current.states) {
+ const unsigned* begin = 0;
+ const unsigned* end = 0;
+ if (!m_vecptr) {
+ const size_t* pos = m_jumpPos + state * m.lettersCount + l;
+ begin = m_jumps + pos[0];
+ end = m_jumps + pos[1];
+ } else {
+ const auto& v = (*m_vecptr)[state * m.lettersCount + l];
+ if (!v.empty()) {
+ begin = &v[0];
+ end = &v[0] + v.size();
+ }
+ }
+
+ for (; begin != end; ++begin)
+ if (!next.flags.Test(*begin)) {
+ next.flags.Set(*begin);
+ next.states.push_back(*begin);
+ }
+ }
+
+ return 0;
+ }
+
+ Action Next(const State& current, State& next, Char c) const
+ {
+ return NextTranslated(current, next, Translate(c));
+ }
+
+ bool TakeAction(State&, Action) const { return false; }
+
+ Action NextTranslated(State& s, Char l) const
+ {
+ State dest(m.statesCount);
+ Action a = NextTranslated(s, dest, l);
+ s.Swap(dest);
+ return a;
+ }
+
+ Action Next(State& s, Char c) const
+ {
+ return NextTranslated(s, Translate(c));
+ }
+
+ bool Final(const State& s) const
+ {
+ for (auto&& state : s.states)
+ if (m_finals[state])
+ return true;
+ return false;
+ }
+
+ bool Dead(const State&) const
+ {
+ return false;
+ }
+
+ ypair<const size_t*, const size_t*> AcceptedRegexps(const State& s) const {
+ return Final(s) ? Accept() : Deny();
+ }
+
+ bool CanStop(const State& s) const {
+ return Final(s);
+ }
+
+ const void* Mmap(const void* ptr, size_t size)
+ {
+ Impl::CheckAlign(ptr);
+ SlowScanner s;
+ const size_t* p = reinterpret_cast<const size_t*>(ptr);
+
+ Impl::ValidateHeader(p, size, ScannerIOTypes::SlowScanner, sizeof(s.m));
+ Locals* locals;
+ Impl::MapPtr(locals, 1, p, size);
+ memcpy(&s.m, locals, sizeof(s.m));
+
+ bool empty = *((const bool*) p);
+ Impl::AdvancePtr(p, size, sizeof(empty));
+ Impl::AlignPtr(p, size);
+
+ if (empty)
+ s.Alias(Null());
+ else {
+ s.m_vecptr = 0;
+ Impl::MapPtr(s.m_letters, MaxChar, p, size);
+ Impl::MapPtr(s.m_finals, s.m.statesCount, p, size);
+ Impl::MapPtr(s.m_jumpPos, s.m.statesCount * s.m.lettersCount + 1, p, size);
+ Impl::MapPtr(s.m_jumps, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size);
+ if (need_actions)
+ Impl::MapPtr(s.m_actions, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size);
+ Swap(s);
+ }
+ return (const void*) p;
+ }
+
+ void Swap(SlowScanner& s)
+ {
+ DoSwap(m_finals, s.m_finals);
+ DoSwap(m_jumps, s.m_jumps);
+ DoSwap(m_actions, s.m_actions);
+ DoSwap(m_jumpPos, s.m_jumpPos);
+ DoSwap(m.statesCount, s.m.statesCount);
+ DoSwap(m.lettersCount, s.m.lettersCount);
+ DoSwap(m.start, s.m.start);
+ DoSwap(m_letters, s.m_letters);
+ DoSwap(m_pool, s.m_pool);
+ DoSwap(m_vec, s.m_vec);
+
+ DoSwap(m_vecptr, s.m_vecptr);
+ DoSwap(need_actions, s.need_actions);
+ DoSwap(m_actionsvec, s.m_actionsvec);
+ if (m_vecptr == &s.m_vec)
+ m_vecptr = &m_vec;
+ if (s.m_vecptr == &m_vec)
+ s.m_vecptr = &s.m_vec;
+ }
+
+ SlowScanner(const SlowScanner& s)
+ : m(s.m)
+ , m_vec(s.m_vec)
+ , need_actions(s.need_actions)
+ , m_actionsvec(s.m_actionsvec)
+ {
+ if (s.m_vec.empty()) {
+ // Empty or mmap()-ed scanner, just copy pointers
+ m_finals = s.m_finals;
+ m_jumps = s.m_jumps;
+ m_actions = s.m_actions;
+ m_jumpPos = s.m_jumpPos;
+ m_letters = s.m_letters;
+ m_vecptr = 0;
+ } else {
+ // In-memory scanner, perform deep copy
+ alloc(m_letters, MaxChar);
+ memcpy(m_letters, s.m_letters, sizeof(*m_letters) * MaxChar);
+ m_jumps = 0;
+ m_jumpPos = 0;
+ m_actions = 0;
+ alloc(m_finals, m.statesCount);
+ memcpy(m_finals, s.m_finals, sizeof(*m_finals) * m.statesCount);
+ m_vecptr = &m_vec;
+ }
+ }
+
+ explicit SlowScanner(Fsm& fsm, bool needActions = false, bool removeEpsilons = true, size_t distance = 0)
+ : need_actions(needActions)
+ {
+ if (distance) {
+ fsm = CreateApproxFsm(fsm, distance);
+ }
+ if (removeEpsilons)
+ fsm.RemoveEpsilons();
+ fsm.Sparse(!removeEpsilons);
+
+ m.statesCount = fsm.Size();
+ m.lettersCount = fsm.Letters().Size();
+
+ m_vec.resize(m.statesCount * m.lettersCount);
+ if (need_actions)
+ m_actionsvec.resize(m.statesCount * m.lettersCount);
+ m_vecptr = &m_vec;
+ alloc(m_letters, MaxChar);
+ m_jumps = 0;
+ m_actions = 0;
+ m_jumpPos = 0;
+ alloc(m_finals, m.statesCount);
+
+ // Build letter translation table
+ Fill(m_letters, m_letters + MaxChar, 0);
+ for (auto&& letter : fsm.Letters())
+ for (auto&& character : letter.second.second)
+ m_letters[character] = letter.second.first;
+
+ m.start = fsm.Initial();
+ BuildScanner(fsm, *this);
+ }
+
+
+ SlowScanner& operator = (const SlowScanner& s) { SlowScanner(s).Swap(*this); return *this; }
+
+ ~SlowScanner()
+ {
+ for (auto&& i : m_pool)
+ free(i);
+ }
+
+ void Save(yostream*) const;
+ void Load(yistream*);
+
+ const State& StateIndex(const State& s) const { return s; }
protected:
- bool IsMmaped() const
- {
- return (!m_vecptr);
- }
-
- size_t GetJump(size_t pos) const
- {
- return m_jumps[pos];
- }
-
- Action& GetAction(size_t pos) const
- {
- return m_actions[pos];
- }
-
- const TVector<Action>& GetActionsVec(size_t from) const
- {
- return m_actionsvec[from];
- }
-
- const TVector<unsigned int>& GetJumpsVec(size_t from) const
- {
- return m_vec[from];
- }
-
- size_t* GetJumpPos() const
- {
- return m_jumpPos;
- }
-
- size_t GetStart() const
- {
- return m.start;
- }
-
- bool IsFinal(size_t pos) const
- {
- return m_finals[pos];
- }
+ bool IsMmaped() const
+ {
+ return (!m_vecptr);
+ }
+
+ size_t GetJump(size_t pos) const
+ {
+ return m_jumps[pos];
+ }
+
+ Action& GetAction(size_t pos) const
+ {
+ return m_actions[pos];
+ }
+
+ const TVector<Action>& GetActionsVec(size_t from) const
+ {
+ return m_actionsvec[from];
+ }
+
+ const TVector<unsigned int>& GetJumpsVec(size_t from) const
+ {
+ return m_vec[from];
+ }
+
+ size_t* GetJumpPos() const
+ {
+ return m_jumpPos;
+ }
+
+ size_t GetStart() const
+ {
+ return m.start;
+ }
+
+ bool IsFinal(size_t pos) const
+ {
+ return m_finals[pos];
+ }
private:
- struct Locals {
- size_t statesCount;
- size_t lettersCount;
- size_t start;
- } m;
-
- bool* m_finals;
- unsigned* m_jumps;
- Action* m_actions;
- size_t* m_jumpPos;
- size_t* m_letters;
-
- TVector<void*> m_pool;
- TVector< TVector<unsigned> > m_vec, *m_vecptr;
-
- bool need_actions;
- TVector<TVector<Action>> m_actionsvec;
- static const SlowScanner& Null();
-
- template<class T> void alloc(T*& p, size_t size)
- {
- p = static_cast<T*>(malloc(size * sizeof(T)));
- memset(p, 0, size * sizeof(T));
- m_pool.push_back(p);
- }
-
- void Alias(const SlowScanner& s)
- {
- memcpy(&m, &s.m, sizeof(m));
- m_vec.clear();
- need_actions = s.need_actions;
- m_actionsvec.clear();
- m_finals = s.m_finals;
- m_jumps = s.m_jumps;
- m_actions = s.m_actions;
- m_jumpPos = s.m_jumpPos;
- m_letters = s.m_letters;
- m_vecptr = s.m_vecptr;
- m_pool.clear();
- }
-
- void SetJump(size_t oldState, Char c, size_t newState, unsigned long action)
- {
- Y_ASSERT(!m_vec.empty());
- Y_ASSERT(oldState < m.statesCount);
- Y_ASSERT(newState < m.statesCount);
-
- size_t idx = oldState * m.lettersCount + m_letters[c];
- m_vec[idx].push_back(newState);
- if (need_actions)
- m_actionsvec[idx].push_back(action);
- }
-
- unsigned long RemapAction(unsigned long action) { return action; }
-
- void SetInitial(size_t state) { m.start = state; }
- void SetTag(size_t state, ui8 tag) { m_finals[state] = (tag != 0); }
-
- void FinishBuild() {}
-
- static ypair<const size_t*, const size_t*> Accept()
- {
- static size_t v[1] = { 0 };
-
- return ymake_pair(v, v + 1);
- }
-
- static ypair<const size_t*, const size_t*> Deny()
- {
- static size_t v[1] = { 0 };
- return ymake_pair(v, v);
- }
-
- friend void BuildScanner<SlowScanner>(const Fsm&, SlowScanner&);
+ struct Locals {
+ size_t statesCount;
+ size_t lettersCount;
+ size_t start;
+ } m;
+
+ bool* m_finals;
+ unsigned* m_jumps;
+ Action* m_actions;
+ size_t* m_jumpPos;
+ size_t* m_letters;
+
+ TVector<void*> m_pool;
+ TVector< TVector<unsigned> > m_vec, *m_vecptr;
+
+ bool need_actions;
+ TVector<TVector<Action>> m_actionsvec;
+ static const SlowScanner& Null();
+
+ template<class T> void alloc(T*& p, size_t size)
+ {
+ p = static_cast<T*>(malloc(size * sizeof(T)));
+ memset(p, 0, size * sizeof(T));
+ m_pool.push_back(p);
+ }
+
+ void Alias(const SlowScanner& s)
+ {
+ memcpy(&m, &s.m, sizeof(m));
+ m_vec.clear();
+ need_actions = s.need_actions;
+ m_actionsvec.clear();
+ m_finals = s.m_finals;
+ m_jumps = s.m_jumps;
+ m_actions = s.m_actions;
+ m_jumpPos = s.m_jumpPos;
+ m_letters = s.m_letters;
+ m_vecptr = s.m_vecptr;
+ m_pool.clear();
+ }
+
+ void SetJump(size_t oldState, Char c, size_t newState, unsigned long action)
+ {
+ Y_ASSERT(!m_vec.empty());
+ Y_ASSERT(oldState < m.statesCount);
+ Y_ASSERT(newState < m.statesCount);
+
+ size_t idx = oldState * m.lettersCount + m_letters[c];
+ m_vec[idx].push_back(newState);
+ if (need_actions)
+ m_actionsvec[idx].push_back(action);
+ }
+
+ unsigned long RemapAction(unsigned long action) { return action; }
+
+ void SetInitial(size_t state) { m.start = state; }
+ void SetTag(size_t state, ui8 tag) { m_finals[state] = (tag != 0); }
+
+ void FinishBuild() {}
+
+ static ypair<const size_t*, const size_t*> Accept()
+ {
+ static size_t v[1] = { 0 };
+
+ return ymake_pair(v, v + 1);
+ }
+
+ static ypair<const size_t*, const size_t*> Deny()
+ {
+ static size_t v[1] = { 0 };
+ return ymake_pair(v, v);
+ }
+
+ friend void BuildScanner<SlowScanner>(const Fsm&, SlowScanner&);
};
template<>
inline SlowScanner Fsm::Compile(size_t distance) {
- return SlowScanner(*this, false, true, distance);
+ return SlowScanner(*this, false, true, distance);
}
inline const SlowScanner& SlowScanner::Null()
{
- static const SlowScanner n = Fsm::MakeFalse().Compile<SlowScanner>();
- return n;
+ static const SlowScanner n = Fsm::MakeFalse().Compile<SlowScanner>();
+ return n;
}
#ifndef PIRE_DEBUG
@@ -433,18 +433,18 @@ inline const SlowScanner& SlowScanner::Null()
template<>
inline void Run<SlowScanner>(const SlowScanner& scanner, SlowScanner::State& state, TStringBuf str)
{
- SlowScanner::State temp;
- scanner.Initialize(temp);
-
- SlowScanner::State* src = &state;
- SlowScanner::State* dest = &temp;
-
- for (auto it = str.begin(); it != str.end(); ++it) {
- scanner.Next(*src, *dest, static_cast<unsigned char>(*it));
- DoSwap(src, dest);
- }
- if (src != &state)
- state = *src;
+ SlowScanner::State temp;
+ scanner.Initialize(temp);
+
+ SlowScanner::State* src = &state;
+ SlowScanner::State* dest = &temp;
+
+ for (auto it = str.begin(); it != str.end(); ++it) {
+ scanner.Next(*src, *dest, static_cast<unsigned char>(*it));
+ DoSwap(src, dest);
+ }
+ if (src != &state)
+ state = *src;
}
#endif
diff --git a/library/cpp/regex/pire/pire/static_assert.h b/library/cpp/regex/pire/pire/static_assert.h
index 90dd0ff4f01..5d671a1624f 100644
--- a/library/cpp/regex/pire/pire/static_assert.h
+++ b/library/cpp/regex/pire/pire/static_assert.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -24,7 +24,7 @@
#define PIRE_ASSERT_H_INCLUDED
namespace Pire { namespace Impl {
-
+
// A static (compile-tile) assertion.
// The idea was shamelessly borrowed from Boost.
template<bool x> struct StaticAssertion;
diff --git a/library/cpp/regex/pire/pire/stub/singleton.h b/library/cpp/regex/pire/pire/stub/singleton.h
index f24e9244607..99fb6578f19 100644
--- a/library/cpp/regex/pire/pire/stub/singleton.h
+++ b/library/cpp/regex/pire/pire/stub/singleton.h
@@ -3,6 +3,6 @@
namespace Pire {
template<class T>
const T& DefaultValue() {
- return Default<T>();
+ return Default<T>();
}
}
diff --git a/library/cpp/regex/pire/pire/vbitset.h b/library/cpp/regex/pire/pire/vbitset.h
index e255031b070..62b85aa05c3 100644
--- a/library/cpp/regex/pire/pire/vbitset.h
+++ b/library/cpp/regex/pire/pire/vbitset.h
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -40,78 +40,78 @@ namespace Pire {
/// A bitset with variable width
class BitSet {
public:
- typedef size_t value_type;
- typedef size_t* pointer;
- typedef size_t& reference;
- typedef const size_t& const_reference;
-
- class const_iterator;
-
- BitSet()
- : m_data(1, 1)
- {
- }
- BitSet(size_t size)
- : m_data(RoundUp(size + 1) + 1)
- , m_size(size)
- {
- m_data[RoundDown(size)] |= (1U << Remainder(size));
- }
-
- void Swap(BitSet& s)
- {
- m_data.swap(s.m_data);
- DoSwap(m_size, s.m_size);
- }
-
- /// Sets the specified bit to 1.
- void Set(size_t pos) {
- m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] |= (1U << Remainder(pos));
- }
-
- /// Resets the specified bit to 0.
- void Reset(size_t pos) {
- m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] &= ~(1U << Remainder(pos));
- }
-
- /// Checks whether the specified bit is set to 1.
- bool Test(size_t pos) const {
- return (m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] & (1U << Remainder(pos))) != 0;
- }
-
- size_t Size() const {
- return m_size;
- }
-
- void Resize(size_t newsize)
- {
- m_data.resize(RoundUp(newsize + 1));
- if (Remainder(newsize) && !m_data.empty())
- m_data[m_data.size() - 1] &= ((1U << Remainder(newsize)) - 1); // Clear tail
- m_data[RoundDown(newsize)] |= (1U << Remainder(newsize));
- }
-
- /// Resets all bits to 0.
- void Clear() { memset(&m_data[0], 0, m_data.size() * sizeof(ContainerType)); }
+ typedef size_t value_type;
+ typedef size_t* pointer;
+ typedef size_t& reference;
+ typedef const size_t& const_reference;
+
+ class const_iterator;
+
+ BitSet()
+ : m_data(1, 1)
+ {
+ }
+ BitSet(size_t size)
+ : m_data(RoundUp(size + 1) + 1)
+ , m_size(size)
+ {
+ m_data[RoundDown(size)] |= (1U << Remainder(size));
+ }
+
+ void Swap(BitSet& s)
+ {
+ m_data.swap(s.m_data);
+ DoSwap(m_size, s.m_size);
+ }
+
+ /// Sets the specified bit to 1.
+ void Set(size_t pos) {
+ m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] |= (1U << Remainder(pos));
+ }
+
+ /// Resets the specified bit to 0.
+ void Reset(size_t pos) {
+ m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] &= ~(1U << Remainder(pos));
+ }
+
+ /// Checks whether the specified bit is set to 1.
+ bool Test(size_t pos) const {
+ return (m_data[RoundDown(VBITSET_CHECK_SIZE(pos))] & (1U << Remainder(pos))) != 0;
+ }
+
+ size_t Size() const {
+ return m_size;
+ }
+
+ void Resize(size_t newsize)
+ {
+ m_data.resize(RoundUp(newsize + 1));
+ if (Remainder(newsize) && !m_data.empty())
+ m_data[m_data.size() - 1] &= ((1U << Remainder(newsize)) - 1); // Clear tail
+ m_data[RoundDown(newsize)] |= (1U << Remainder(newsize));
+ }
+
+ /// Resets all bits to 0.
+ void Clear() { memset(&m_data[0], 0, m_data.size() * sizeof(ContainerType)); }
private:
- typedef unsigned char ContainerType;
- static const size_t ItemSize = sizeof(ContainerType) * 8;
- TVector<ContainerType> m_data;
- size_t m_size;
+ typedef unsigned char ContainerType;
+ static const size_t ItemSize = sizeof(ContainerType) * 8;
+ TVector<ContainerType> m_data;
+ size_t m_size;
- static size_t RoundUp(size_t x) { return x / ItemSize + ((x % ItemSize) ? 1 : 0); }
- static size_t RoundDown(size_t x) { return x / ItemSize; }
- static size_t Remainder(size_t x) { return x % ItemSize; }
+ static size_t RoundUp(size_t x) { return x / ItemSize + ((x % ItemSize) ? 1 : 0); }
+ static size_t RoundDown(size_t x) { return x / ItemSize; }
+ static size_t Remainder(size_t x) { return x % ItemSize; }
#ifdef _DEBUG
- size_t CheckSize(size_t size) const
- {
- if (size < m_size)
- return size;
- else
- throw Error("BitSet: subscript out of range");
- }
+ size_t CheckSize(size_t size) const
+ {
+ if (size < m_size)
+ return size;
+ else
+ throw Error("BitSet: subscript out of range");
+ }
#endif
};
diff --git a/library/cpp/regex/pire/ut/approx_matching_ut.cpp b/library/cpp/regex/pire/ut/approx_matching_ut.cpp
index 0454b46a868..f8a85c271f4 100644
--- a/library/cpp/regex/pire/ut/approx_matching_ut.cpp
+++ b/library/cpp/regex/pire/ut/approx_matching_ut.cpp
@@ -24,356 +24,356 @@
#include "common.h"
Y_UNIT_TEST_SUITE(ApproxMatchingTest) {
- Pire::Fsm BuildFsm(const char *str)
- {
- Pire::Lexer lexer;
- TVector<wchar32> ucs4;
-
- lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4));
- lexer.Assign(ucs4.begin(), ucs4.end());
- return lexer.Parse();
- }
-
- Y_UNIT_TEST(Simple) {
- auto fsm = BuildFsm("^ab$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("ab");
- ACCEPTS("ax");
- ACCEPTS("xb");
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("xab");
- ACCEPTS("axb");
- ACCEPTS("abx");
- ACCEPTS("aab");
- DENIES("xy");
- DENIES("abcd");
- DENIES("xabx");
- DENIES("");
- }
-
- fsm = BuildFsm("^ab$");
- APPROXIMATE_SCANNER(fsm, 2) {
- ACCEPTS("ab");
- ACCEPTS("xy");
- ACCEPTS("");
- ACCEPTS("axbx");
- DENIES("xxabx");
- DENIES("xbxxx");
- }
- }
-
- Y_UNIT_TEST(SpecialSymbols) {
- auto fsm = BuildFsm("^.*ab$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("ab");
- ACCEPTS("xxxxab");
- ACCEPTS("xxxxabab");
- DENIES("xxxx");
- DENIES("abxxxx");
- }
-
- fsm = BuildFsm("^[a-c]$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("c");
- ACCEPTS("/");
- ACCEPTS("");
- ACCEPTS("ax");
- DENIES("xx");
- DENIES("abc");
- }
-
- fsm = BuildFsm("^x{4}$");
- APPROXIMATE_SCANNER(fsm, 2) {
- DENIES ("x");
- ACCEPTS("xx");
- ACCEPTS("xxx");
- ACCEPTS("xxxx");
- ACCEPTS("xxxxx");
- ACCEPTS("xxxxxx");
- DENIES ("xxxxxxx");
- ACCEPTS("xxyy");
- ACCEPTS("xxyyx");
- ACCEPTS("xxxxyz");
- DENIES("xyyy");
- }
-
- fsm = BuildFsm("^(a|b)$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("x");
- ACCEPTS("");
- ACCEPTS("ax");
- DENIES("abc");
- DENIES("xx");
- }
-
- fsm = BuildFsm("^(ab|cd)$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("ab");
- ACCEPTS("cd");
- ACCEPTS("ax");
- ACCEPTS("xd");
- ACCEPTS("abx");
- ACCEPTS("a");
- DENIES("abcd");
- DENIES("xx");
- DENIES("");
- }
-
- fsm = BuildFsm("^[a-c]{3}$");
- APPROXIMATE_SCANNER(fsm, 2) {
- ACCEPTS("abc");
- ACCEPTS("aaa");
- ACCEPTS("a");
- ACCEPTS("ax");
- ACCEPTS("abxcx");
- DENIES("x");
- DENIES("");
- DENIES("xaxx");
- }
-
- fsm = BuildFsm("^\\x{61}$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a");
- ACCEPTS("x");
- ACCEPTS("");
- ACCEPTS("ax");
- DENIES("axx");
- DENIES("xx");
- }
-
- fsm = BuildFsm("^a.bc$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("axxbc");
- ACCEPTS("abc");
- ACCEPTS("xabc");
- ACCEPTS("xaxbc");
- DENIES("bc");
- DENIES("abcx");
- }
- }
-
- Y_UNIT_TEST(TestSurrounded) {
- auto fsm = BuildFsm("abc").Surround();
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("abc");
- ACCEPTS("xabcx");
- ACCEPTS("xabx");
- ACCEPTS("axc");
- ACCEPTS("bac");
- DENIES("a");
- DENIES("xaxxxx");
- }
-
- fsm = BuildFsm("^abc$").Surround();
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("abc");
- ACCEPTS("abcx");
- ACCEPTS("xabc");
- ACCEPTS("axc");
- ACCEPTS("bac");
- DENIES("xabx");
- DENIES("axx");
- }
- }
-
- Y_UNIT_TEST(GlueFsm) {
- auto fsm = BuildFsm("^a$") | BuildFsm("^b$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("");
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("x");
- ACCEPTS("ab");
- DENIES("abb");
- }
-
- fsm = BuildFsm("^[a-b]$") | BuildFsm("^c{2}$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("cc");
- ACCEPTS("x");
- ACCEPTS("xa");
- ACCEPTS("c");
- ACCEPTS("xc");
- ACCEPTS("cxc");
- ACCEPTS("");
- }
- }
-
- enum MutateOperation {
- Begin,
- Substitute = Begin,
- Delete,
- Insert,
- End
- };
-
- ystring ChangeText(const ystring& text, int operation, int pos)
- {
- auto changedText = text;
- switch (operation) {
- case MutateOperation::Substitute:
- changedText[pos] = 'x';
- break;
- case MutateOperation::Delete:
- changedText.erase(pos, 1);
- break;
- case MutateOperation::Insert:
- changedText.insert(pos, 1, 'x');
- break;
- }
-
- return changedText;
- }
-
- Y_UNIT_TEST(StressTest) {
- ystring text;
- for (size_t letter = 0; letter < 10; ++letter) {
- text += ystring(3, letter + 'a');
- }
- const ystring regexp = "^" + text + "$";
- auto fsm = BuildFsm(regexp.data());
-
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS(text);
-
- for (size_t pos = 0; pos < regexp.size() - 2; ++pos) {
- for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
- auto changedText = ChangeText(text, operation, pos);
- ACCEPTS(changedText);
- }
- }
- }
-
- APPROXIMATE_SCANNER(fsm, 0) {
- ACCEPTS(text);
-
- for (size_t pos = 0; pos < regexp.size() - 2; ++pos) {
- for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
- auto changedText = ChangeText(text, operation, pos);
- DENIES(changedText);
- }
- }
- }
-
- APPROXIMATE_SCANNER(fsm, 2) {
- ACCEPTS(text);
-
- for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
- size_t posRight = text.size() - posLeft - 1;
- for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
- for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
- auto changedText = ChangeText(text, operationRight, posRight);
- changedText = ChangeText(changedText, operationLeft, posLeft);
- ACCEPTS(changedText);
- }
- }
- }
- }
-
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS(text);
-
- for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
- size_t posRight = text.size() - posLeft - 1;
- for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
- for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
- auto changedText = ChangeText(text, operationRight, posRight);
- changedText = ChangeText(changedText, operationLeft, posLeft);
- DENIES(changedText);
- }
- }
- }
- }
- }
-
- Y_UNIT_TEST(SwapLetters) {
- auto fsm = BuildFsm("^abc$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("bac");
- ACCEPTS("acb");
- DENIES("cba");
- DENIES("bax");
- }
-
- fsm = BuildFsm("^abcd$");
- APPROXIMATE_SCANNER(fsm, 2) {
- ACCEPTS("bacd");
- ACCEPTS("acbd");
- ACCEPTS("baxd");
- ACCEPTS("badc");
- ACCEPTS("bcad");
- ACCEPTS("bcda");
- DENIES("xcbx");
- DENIES("baxx");
- DENIES("ba");
- DENIES("cdab");
- }
-
- fsm = BuildFsm("^abc$");
- APPROXIMATE_SCANNER(fsm, 0) {
- ACCEPTS("abc");
- DENIES("bac");
- }
-
- fsm = BuildFsm("^[a-c][1-3]$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a3");
- ACCEPTS("c");
- ACCEPTS("1");
- ACCEPTS("1a");
- ACCEPTS("3b");
- DENIES("4a");
- }
-
- fsm = BuildFsm("^.*abc$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("ab");
- ACCEPTS("xxxxbac");
- DENIES("xxxxa");
- DENIES("xxxxcb");
- }
- }
-
- Y_UNIT_TEST(SwapStressTest){
- ystring text;
- for (size_t letter = 0; letter < 30; ++letter) {
- text += ystring(1, (letter % 26) + 'a');
- }
- const ystring regexp = "^" + text + "$";
- auto fsm = BuildFsm(regexp.data());
- auto changedText = text;
-
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS(text);
-
- for (size_t pos = 0; pos < text.size() - 1; ++pos) {
- changedText[pos] = text[pos + 1];
- changedText[pos + 1] = text[pos];
- ACCEPTS(changedText);
- changedText[pos] = text[pos];
- changedText[pos + 1] = text[pos + 1];
- }
- }
-
- APPROXIMATE_SCANNER(fsm, 0) {
- ACCEPTS(text);
-
- for (size_t pos = 0; pos < text.size() - 1; ++pos) {
- changedText[pos] = text[pos + 1];
- changedText[pos + 1] = text[pos];
- DENIES(changedText);
- changedText[pos] = text[pos];
- changedText[pos + 1] = text[pos + 1];
- }
- }
- }
+ Pire::Fsm BuildFsm(const char *str)
+ {
+ Pire::Lexer lexer;
+ TVector<wchar32> ucs4;
+
+ lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4));
+ lexer.Assign(ucs4.begin(), ucs4.end());
+ return lexer.Parse();
+ }
+
+ Y_UNIT_TEST(Simple) {
+ auto fsm = BuildFsm("^ab$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("ab");
+ ACCEPTS("ax");
+ ACCEPTS("xb");
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("xab");
+ ACCEPTS("axb");
+ ACCEPTS("abx");
+ ACCEPTS("aab");
+ DENIES("xy");
+ DENIES("abcd");
+ DENIES("xabx");
+ DENIES("");
+ }
+
+ fsm = BuildFsm("^ab$");
+ APPROXIMATE_SCANNER(fsm, 2) {
+ ACCEPTS("ab");
+ ACCEPTS("xy");
+ ACCEPTS("");
+ ACCEPTS("axbx");
+ DENIES("xxabx");
+ DENIES("xbxxx");
+ }
+ }
+
+ Y_UNIT_TEST(SpecialSymbols) {
+ auto fsm = BuildFsm("^.*ab$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("ab");
+ ACCEPTS("xxxxab");
+ ACCEPTS("xxxxabab");
+ DENIES("xxxx");
+ DENIES("abxxxx");
+ }
+
+ fsm = BuildFsm("^[a-c]$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("/");
+ ACCEPTS("");
+ ACCEPTS("ax");
+ DENIES("xx");
+ DENIES("abc");
+ }
+
+ fsm = BuildFsm("^x{4}$");
+ APPROXIMATE_SCANNER(fsm, 2) {
+ DENIES ("x");
+ ACCEPTS("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxx");
+ ACCEPTS("xxxxxx");
+ DENIES ("xxxxxxx");
+ ACCEPTS("xxyy");
+ ACCEPTS("xxyyx");
+ ACCEPTS("xxxxyz");
+ DENIES("xyyy");
+ }
+
+ fsm = BuildFsm("^(a|b)$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("x");
+ ACCEPTS("");
+ ACCEPTS("ax");
+ DENIES("abc");
+ DENIES("xx");
+ }
+
+ fsm = BuildFsm("^(ab|cd)$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("ab");
+ ACCEPTS("cd");
+ ACCEPTS("ax");
+ ACCEPTS("xd");
+ ACCEPTS("abx");
+ ACCEPTS("a");
+ DENIES("abcd");
+ DENIES("xx");
+ DENIES("");
+ }
+
+ fsm = BuildFsm("^[a-c]{3}$");
+ APPROXIMATE_SCANNER(fsm, 2) {
+ ACCEPTS("abc");
+ ACCEPTS("aaa");
+ ACCEPTS("a");
+ ACCEPTS("ax");
+ ACCEPTS("abxcx");
+ DENIES("x");
+ DENIES("");
+ DENIES("xaxx");
+ }
+
+ fsm = BuildFsm("^\\x{61}$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("x");
+ ACCEPTS("");
+ ACCEPTS("ax");
+ DENIES("axx");
+ DENIES("xx");
+ }
+
+ fsm = BuildFsm("^a.bc$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("axxbc");
+ ACCEPTS("abc");
+ ACCEPTS("xabc");
+ ACCEPTS("xaxbc");
+ DENIES("bc");
+ DENIES("abcx");
+ }
+ }
+
+ Y_UNIT_TEST(TestSurrounded) {
+ auto fsm = BuildFsm("abc").Surround();
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("abc");
+ ACCEPTS("xabcx");
+ ACCEPTS("xabx");
+ ACCEPTS("axc");
+ ACCEPTS("bac");
+ DENIES("a");
+ DENIES("xaxxxx");
+ }
+
+ fsm = BuildFsm("^abc$").Surround();
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("abc");
+ ACCEPTS("abcx");
+ ACCEPTS("xabc");
+ ACCEPTS("axc");
+ ACCEPTS("bac");
+ DENIES("xabx");
+ DENIES("axx");
+ }
+ }
+
+ Y_UNIT_TEST(GlueFsm) {
+ auto fsm = BuildFsm("^a$") | BuildFsm("^b$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("");
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("x");
+ ACCEPTS("ab");
+ DENIES("abb");
+ }
+
+ fsm = BuildFsm("^[a-b]$") | BuildFsm("^c{2}$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("cc");
+ ACCEPTS("x");
+ ACCEPTS("xa");
+ ACCEPTS("c");
+ ACCEPTS("xc");
+ ACCEPTS("cxc");
+ ACCEPTS("");
+ }
+ }
+
+ enum MutateOperation {
+ Begin,
+ Substitute = Begin,
+ Delete,
+ Insert,
+ End
+ };
+
+ ystring ChangeText(const ystring& text, int operation, int pos)
+ {
+ auto changedText = text;
+ switch (operation) {
+ case MutateOperation::Substitute:
+ changedText[pos] = 'x';
+ break;
+ case MutateOperation::Delete:
+ changedText.erase(pos, 1);
+ break;
+ case MutateOperation::Insert:
+ changedText.insert(pos, 1, 'x');
+ break;
+ }
+
+ return changedText;
+ }
+
+ Y_UNIT_TEST(StressTest) {
+ ystring text;
+ for (size_t letter = 0; letter < 10; ++letter) {
+ text += ystring(3, letter + 'a');
+ }
+ const ystring regexp = "^" + text + "$";
+ auto fsm = BuildFsm(regexp.data());
+
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < regexp.size() - 2; ++pos) {
+ for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
+ auto changedText = ChangeText(text, operation, pos);
+ ACCEPTS(changedText);
+ }
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 0) {
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < regexp.size() - 2; ++pos) {
+ for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
+ auto changedText = ChangeText(text, operation, pos);
+ DENIES(changedText);
+ }
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 2) {
+ ACCEPTS(text);
+
+ for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
+ size_t posRight = text.size() - posLeft - 1;
+ for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
+ for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
+ auto changedText = ChangeText(text, operationRight, posRight);
+ changedText = ChangeText(changedText, operationLeft, posLeft);
+ ACCEPTS(changedText);
+ }
+ }
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS(text);
+
+ for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
+ size_t posRight = text.size() - posLeft - 1;
+ for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
+ for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
+ auto changedText = ChangeText(text, operationRight, posRight);
+ changedText = ChangeText(changedText, operationLeft, posLeft);
+ DENIES(changedText);
+ }
+ }
+ }
+ }
+ }
+
+ Y_UNIT_TEST(SwapLetters) {
+ auto fsm = BuildFsm("^abc$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("bac");
+ ACCEPTS("acb");
+ DENIES("cba");
+ DENIES("bax");
+ }
+
+ fsm = BuildFsm("^abcd$");
+ APPROXIMATE_SCANNER(fsm, 2) {
+ ACCEPTS("bacd");
+ ACCEPTS("acbd");
+ ACCEPTS("baxd");
+ ACCEPTS("badc");
+ ACCEPTS("bcad");
+ ACCEPTS("bcda");
+ DENIES("xcbx");
+ DENIES("baxx");
+ DENIES("ba");
+ DENIES("cdab");
+ }
+
+ fsm = BuildFsm("^abc$");
+ APPROXIMATE_SCANNER(fsm, 0) {
+ ACCEPTS("abc");
+ DENIES("bac");
+ }
+
+ fsm = BuildFsm("^[a-c][1-3]$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a3");
+ ACCEPTS("c");
+ ACCEPTS("1");
+ ACCEPTS("1a");
+ ACCEPTS("3b");
+ DENIES("4a");
+ }
+
+ fsm = BuildFsm("^.*abc$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("ab");
+ ACCEPTS("xxxxbac");
+ DENIES("xxxxa");
+ DENIES("xxxxcb");
+ }
+ }
+
+ Y_UNIT_TEST(SwapStressTest){
+ ystring text;
+ for (size_t letter = 0; letter < 30; ++letter) {
+ text += ystring(1, (letter % 26) + 'a');
+ }
+ const ystring regexp = "^" + text + "$";
+ auto fsm = BuildFsm(regexp.data());
+ auto changedText = text;
+
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < text.size() - 1; ++pos) {
+ changedText[pos] = text[pos + 1];
+ changedText[pos + 1] = text[pos];
+ ACCEPTS(changedText);
+ changedText[pos] = text[pos];
+ changedText[pos + 1] = text[pos + 1];
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 0) {
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < text.size() - 1; ++pos) {
+ changedText[pos] = text[pos + 1];
+ changedText[pos + 1] = text[pos];
+ DENIES(changedText);
+ changedText[pos] = text[pos];
+ changedText[pos + 1] = text[pos + 1];
+ }
+ }
+ }
}
diff --git a/library/cpp/regex/pire/ut/capture_ut.cpp b/library/cpp/regex/pire/ut/capture_ut.cpp
index 3d339c56019..7303ac6b0e8 100644
--- a/library/cpp/regex/pire/ut/capture_ut.cpp
+++ b/library/cpp/regex/pire/ut/capture_ut.cpp
@@ -32,268 +32,268 @@
Y_UNIT_TEST_SUITE(TestPireCapture) {
- using Pire::CapturingScanner;
- using Pire::SlowCapturingScanner;
- typedef Pire::CapturingScanner::State State;
-
- CapturingScanner Compile(const char* regexp, int index)
- {
- Pire::Lexer lexer;
-
- lexer.Assign(regexp, regexp + strlen(regexp));
- lexer.AddFeature(Pire::Features::CaseInsensitive());
- lexer.AddFeature(Pire::Features::Capture((size_t) index));
-
- Pire::Fsm fsm = lexer.Parse();
-
- fsm.Surround();
- fsm.Determine();
- return fsm.Compile<Pire::CapturingScanner>();
- }
-
- SlowCapturingScanner SlowCompile(const char* regexp, int index, const Pire::Encoding& encoding = Pire::Encodings::Utf8())
- {
- Pire::Lexer lexer;
- lexer.AddFeature(Pire::Features::Capture(static_cast<size_t>(index)));
- lexer.SetEncoding(encoding);
- TVector<wchar32> ucs4;
- encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4));
- lexer.Assign(ucs4.begin(), ucs4.end());
- Pire::Fsm fsm = lexer.Parse();
- fsm.Surround();
- return fsm.Compile<Pire::SlowCapturingScanner>();
- }
-
- State RunRegexp(const CapturingScanner& scanner, const char* str)
- {
- State state;
- scanner.Initialize(state);
- Step(scanner, state, Pire::BeginMark);
- Run(scanner, state, str, str + strlen(str));
- Step(scanner, state, Pire::EndMark);
- return state;
- }
-
- SlowCapturingScanner::State RunRegexp(const SlowCapturingScanner& scanner, const char* str)
- {
- SlowCapturingScanner::State state;
- scanner.Initialize(state);
- Run(scanner, state, str, str + strlen(str));
- return state;
- }
-
- ystring Captured(const State& state, const char* str)
- {
- if (state.Captured())
- return ystring(str + state.Begin() - 1, str + state.End() - 1);
- else
- return ystring();
- }
-
- Y_UNIT_TEST(Trivial)
- {
- CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1);
- State state;
- const char* str;
-
- str = "google_id = 'abcde';";
- state = RunRegexp(scanner, str);
- UNIT_ASSERT(state.Captured());
- UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
-
- str = "var google_id = 'abcde'; eval(google_id);";
- state = RunRegexp(scanner, str);
- UNIT_ASSERT(state.Captured());
- UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
-
- str = "google_id != 'abcde';";
- state = RunRegexp(scanner, str);
- UNIT_ASSERT(!state.Captured());
- }
-
- Y_UNIT_TEST(Sequential)
- {
- CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1);
- State state;
- const char* str;
-
- str = "google_id = 'abcde'; google_id = 'xyz';";
- state = RunRegexp(scanner, str);
- UNIT_ASSERT(state.Captured());
- UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("abcde"));
-
- str = "var google_id = 'abc de'; google_id = 'xyz';";
- state = RunRegexp(scanner, str);
- UNIT_ASSERT(state.Captured());
- UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("xyz"));
- }
-
- Y_UNIT_TEST(NegatedTerminator)
- {
- CapturingScanner scanner = Compile("=(\\d+)[^\\d]", 1);
- State state;
- const char* str;
-
- str = "=12345;";
- state = RunRegexp(scanner, str);
- UNIT_ASSERT(state.Captured());
- UNIT_ASSERT_EQUAL(Captured(state, str), ystring("12345"));
- }
-
- Y_UNIT_TEST(Serialization)
- {
- const char* regex = "google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;";
- CapturingScanner scanner2 = Compile(regex, 1);
- SlowCapturingScanner slowScanner2 = SlowCompile(regex, 1);
- BufferOutput wbuf, wbuf2;
- ::Save(&wbuf, scanner2);
- ::Save(&wbuf2, slowScanner2);
-
- MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
- MemoryInput rbuf2(wbuf2.Buffer().Data(), wbuf2.Buffer().Size());
- CapturingScanner scanner;
- SlowCapturingScanner slowScanner;
- ::Load(&rbuf, scanner);
- ::Load(&rbuf2, slowScanner);
-
- State state;
- SlowCapturingScanner::State slowState;
- const char* str;
-
- str = "google_id = 'abcde';";
- state = RunRegexp(scanner, str);
- slowState = RunRegexp(slowScanner, str);
- UNIT_ASSERT(state.Captured());
- UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
- SlowCapturingScanner::SingleState final;
- UNIT_ASSERT(slowScanner.GetCapture(slowState, final));
- ystring ans(str, final.GetBegin(), final.GetEnd() - final.GetBegin());
- UNIT_ASSERT_EQUAL(ans, ystring("abcde"));
-
- str = "google_id != 'abcde';";
- state = RunRegexp(scanner, str);
- slowState = RunRegexp(slowScanner, str);
- UNIT_ASSERT(!state.Captured());
- UNIT_ASSERT(!slowScanner.GetCapture(slowState, final));
-
- CapturingScanner scanner3;
- const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
- TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
- const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
- memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
- const void* tail = scanner3.Mmap(ptr, wbuf.Buffer().Size());
- UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
-
- str = "google_id = 'abcde';";
- state = RunRegexp(scanner3, str);
- UNIT_ASSERT(state.Captured());
- UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
-
- str = "google_id != 'abcde';";
- state = RunRegexp(scanner3, str);
- UNIT_ASSERT(!state.Captured());
-
- ptr = (const void*) ((const char*) wbuf.Buffer().Data() + 1);
- try {
- scanner3.Mmap(ptr, wbuf.Buffer().Size());
- UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping");
- }
- catch (Pire::Error&) {}
-
- for (size_t offset = 1; offset < MaxTestOffset; ++offset) {
- ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
- memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
- try {
- scanner3.Mmap(ptr, wbuf.Buffer().Size());
- if (offset % sizeof(size_t) != 0) {
- UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping");
- } else {
- str = "google_id = 'abcde';";
- state = RunRegexp(scanner3, str);
- UNIT_ASSERT(state.Captured());
- }
- }
- catch (Pire::Error&) {}
- }
- }
-
- Y_UNIT_TEST(Empty)
- {
- Pire::CapturingScanner sc;
- UNIT_ASSERT(sc.Empty());
-
- UNIT_CHECKPOINT(); RunRegexp(sc, "a string"); // Just should not crash
-
- // Test Save/Load/Mmap
- BufferOutput wbuf;
- ::Save(&wbuf, sc);
-
- MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
- Pire::CapturingScanner sc3;
- ::Load(&rbuf, sc3);
- UNIT_CHECKPOINT(); RunRegexp(sc3, "a string");
-
- const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
- TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
- const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
- memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
-
- Pire::CapturingScanner sc4;
- const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
- UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
- UNIT_CHECKPOINT(); RunRegexp(sc4, "a string");
- }
-
- void MakeSlowCapturingTest(const char* regexp, const char* text, size_t position, bool ans, const ystring& captured = ystring(""), const Pire::Encoding& encoding = Pire::Encodings::Utf8())
- {
- Pire::SlowCapturingScanner sc = SlowCompile(regexp, position, encoding);
- SlowCapturingScanner::State st = RunRegexp(sc, text);
- SlowCapturingScanner::SingleState fin;
- bool ifCaptured = sc.GetCapture(st, fin);
- if (ans) {
- UNIT_ASSERT(ifCaptured);
- ystring answer(text, fin.GetBegin(), fin.GetEnd() - fin.GetBegin());
- UNIT_ASSERT_EQUAL(answer, captured);
- } else {
- UNIT_ASSERT(!ifCaptured);
- }
- }
-
- Y_UNIT_TEST(SlowCapturingNonGreedy)
- {
- const char* regexp = ".*?(pref.*suff)";
- const char* text = "pref ala bla pref cla suff dla";
- MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref ala bla pref cla suff"));
- }
-
- Y_UNIT_TEST(SlowCaptureGreedy)
- {
- const char* regexp = ".*(pref.*suff)";
- const char* text = "pref ala bla pref cla suff dla";
- MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref cla suff"));
- }
-
- Y_UNIT_TEST(SlowCaptureInOr)
- {
- const char* regexp = "(A)|A";
- const char* text = "A";
- MakeSlowCapturingTest(regexp, text, 1, true, ystring("A"));
- const char* regexp2 = "A|(A)";
- MakeSlowCapturingTest(regexp2, text, 1, false);
- }
-
- Y_UNIT_TEST(SlowCapturing)
- {
- const char* regexp = "^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)";
- const char* text = "http://vkontakte.ru/id100500";
- MakeSlowCapturingTest(regexp, text, 2, true, ystring("100500"));
- }
-
- Y_UNIT_TEST(Utf_8)
- {
- const char* regexp = "\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, ((\\s|\\w|[()]|-)+)!";
- const char* text =" \xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, \xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)! ";
- const char* ans = "\xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)";
- MakeSlowCapturingTest(regexp, text, 1, true, ystring(ans));
- }
+ using Pire::CapturingScanner;
+ using Pire::SlowCapturingScanner;
+ typedef Pire::CapturingScanner::State State;
+
+ CapturingScanner Compile(const char* regexp, int index)
+ {
+ Pire::Lexer lexer;
+
+ lexer.Assign(regexp, regexp + strlen(regexp));
+ lexer.AddFeature(Pire::Features::CaseInsensitive());
+ lexer.AddFeature(Pire::Features::Capture((size_t) index));
+
+ Pire::Fsm fsm = lexer.Parse();
+
+ fsm.Surround();
+ fsm.Determine();
+ return fsm.Compile<Pire::CapturingScanner>();
+ }
+
+ SlowCapturingScanner SlowCompile(const char* regexp, int index, const Pire::Encoding& encoding = Pire::Encodings::Utf8())
+ {
+ Pire::Lexer lexer;
+ lexer.AddFeature(Pire::Features::Capture(static_cast<size_t>(index)));
+ lexer.SetEncoding(encoding);
+ TVector<wchar32> ucs4;
+ encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4));
+ lexer.Assign(ucs4.begin(), ucs4.end());
+ Pire::Fsm fsm = lexer.Parse();
+ fsm.Surround();
+ return fsm.Compile<Pire::SlowCapturingScanner>();
+ }
+
+ State RunRegexp(const CapturingScanner& scanner, const char* str)
+ {
+ State state;
+ scanner.Initialize(state);
+ Step(scanner, state, Pire::BeginMark);
+ Run(scanner, state, str, str + strlen(str));
+ Step(scanner, state, Pire::EndMark);
+ return state;
+ }
+
+ SlowCapturingScanner::State RunRegexp(const SlowCapturingScanner& scanner, const char* str)
+ {
+ SlowCapturingScanner::State state;
+ scanner.Initialize(state);
+ Run(scanner, state, str, str + strlen(str));
+ return state;
+ }
+
+ ystring Captured(const State& state, const char* str)
+ {
+ if (state.Captured())
+ return ystring(str + state.Begin() - 1, str + state.End() - 1);
+ else
+ return ystring();
+ }
+
+ Y_UNIT_TEST(Trivial)
+ {
+ CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1);
+ State state;
+ const char* str;
+
+ str = "google_id = 'abcde';";
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
+
+ str = "var google_id = 'abcde'; eval(google_id);";
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
+
+ str = "google_id != 'abcde';";
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(!state.Captured());
+ }
+
+ Y_UNIT_TEST(Sequential)
+ {
+ CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1);
+ State state;
+ const char* str;
+
+ str = "google_id = 'abcde'; google_id = 'xyz';";
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("abcde"));
+
+ str = "var google_id = 'abc de'; google_id = 'xyz';";
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("xyz"));
+ }
+
+ Y_UNIT_TEST(NegatedTerminator)
+ {
+ CapturingScanner scanner = Compile("=(\\d+)[^\\d]", 1);
+ State state;
+ const char* str;
+
+ str = "=12345;";
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_EQUAL(Captured(state, str), ystring("12345"));
+ }
+
+ Y_UNIT_TEST(Serialization)
+ {
+ const char* regex = "google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;";
+ CapturingScanner scanner2 = Compile(regex, 1);
+ SlowCapturingScanner slowScanner2 = SlowCompile(regex, 1);
+ BufferOutput wbuf, wbuf2;
+ ::Save(&wbuf, scanner2);
+ ::Save(&wbuf2, slowScanner2);
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ MemoryInput rbuf2(wbuf2.Buffer().Data(), wbuf2.Buffer().Size());
+ CapturingScanner scanner;
+ SlowCapturingScanner slowScanner;
+ ::Load(&rbuf, scanner);
+ ::Load(&rbuf2, slowScanner);
+
+ State state;
+ SlowCapturingScanner::State slowState;
+ const char* str;
+
+ str = "google_id = 'abcde';";
+ state = RunRegexp(scanner, str);
+ slowState = RunRegexp(slowScanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
+ SlowCapturingScanner::SingleState final;
+ UNIT_ASSERT(slowScanner.GetCapture(slowState, final));
+ ystring ans(str, final.GetBegin(), final.GetEnd() - final.GetBegin());
+ UNIT_ASSERT_EQUAL(ans, ystring("abcde"));
+
+ str = "google_id != 'abcde';";
+ state = RunRegexp(scanner, str);
+ slowState = RunRegexp(slowScanner, str);
+ UNIT_ASSERT(!state.Captured());
+ UNIT_ASSERT(!slowScanner.GetCapture(slowState, final));
+
+ CapturingScanner scanner3;
+ const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
+ TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
+ const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ const void* tail = scanner3.Mmap(ptr, wbuf.Buffer().Size());
+ UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
+
+ str = "google_id = 'abcde';";
+ state = RunRegexp(scanner3, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
+
+ str = "google_id != 'abcde';";
+ state = RunRegexp(scanner3, str);
+ UNIT_ASSERT(!state.Captured());
+
+ ptr = (const void*) ((const char*) wbuf.Buffer().Data() + 1);
+ try {
+ scanner3.Mmap(ptr, wbuf.Buffer().Size());
+ UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping");
+ }
+ catch (Pire::Error&) {}
+
+ for (size_t offset = 1; offset < MaxTestOffset; ++offset) {
+ ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ try {
+ scanner3.Mmap(ptr, wbuf.Buffer().Size());
+ if (offset % sizeof(size_t) != 0) {
+ UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping");
+ } else {
+ str = "google_id = 'abcde';";
+ state = RunRegexp(scanner3, str);
+ UNIT_ASSERT(state.Captured());
+ }
+ }
+ catch (Pire::Error&) {}
+ }
+ }
+
+ Y_UNIT_TEST(Empty)
+ {
+ Pire::CapturingScanner sc;
+ UNIT_ASSERT(sc.Empty());
+
+ UNIT_CHECKPOINT(); RunRegexp(sc, "a string"); // Just should not crash
+
+ // Test Save/Load/Mmap
+ BufferOutput wbuf;
+ ::Save(&wbuf, sc);
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ Pire::CapturingScanner sc3;
+ ::Load(&rbuf, sc3);
+ UNIT_CHECKPOINT(); RunRegexp(sc3, "a string");
+
+ const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
+ TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
+ const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+
+ Pire::CapturingScanner sc4;
+ const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
+ UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
+ UNIT_CHECKPOINT(); RunRegexp(sc4, "a string");
+ }
+
+ void MakeSlowCapturingTest(const char* regexp, const char* text, size_t position, bool ans, const ystring& captured = ystring(""), const Pire::Encoding& encoding = Pire::Encodings::Utf8())
+ {
+ Pire::SlowCapturingScanner sc = SlowCompile(regexp, position, encoding);
+ SlowCapturingScanner::State st = RunRegexp(sc, text);
+ SlowCapturingScanner::SingleState fin;
+ bool ifCaptured = sc.GetCapture(st, fin);
+ if (ans) {
+ UNIT_ASSERT(ifCaptured);
+ ystring answer(text, fin.GetBegin(), fin.GetEnd() - fin.GetBegin());
+ UNIT_ASSERT_EQUAL(answer, captured);
+ } else {
+ UNIT_ASSERT(!ifCaptured);
+ }
+ }
+
+ Y_UNIT_TEST(SlowCapturingNonGreedy)
+ {
+ const char* regexp = ".*?(pref.*suff)";
+ const char* text = "pref ala bla pref cla suff dla";
+ MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref ala bla pref cla suff"));
+ }
+
+ Y_UNIT_TEST(SlowCaptureGreedy)
+ {
+ const char* regexp = ".*(pref.*suff)";
+ const char* text = "pref ala bla pref cla suff dla";
+ MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref cla suff"));
+ }
+
+ Y_UNIT_TEST(SlowCaptureInOr)
+ {
+ const char* regexp = "(A)|A";
+ const char* text = "A";
+ MakeSlowCapturingTest(regexp, text, 1, true, ystring("A"));
+ const char* regexp2 = "A|(A)";
+ MakeSlowCapturingTest(regexp2, text, 1, false);
+ }
+
+ Y_UNIT_TEST(SlowCapturing)
+ {
+ const char* regexp = "^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)";
+ const char* text = "http://vkontakte.ru/id100500";
+ MakeSlowCapturingTest(regexp, text, 2, true, ystring("100500"));
+ }
+
+ Y_UNIT_TEST(Utf_8)
+ {
+ const char* regexp = "\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, ((\\s|\\w|[()]|-)+)!";
+ const char* text =" \xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, \xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)! ";
+ const char* ans = "\xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)";
+ MakeSlowCapturingTest(regexp, text, 1, true, ystring(ans));
+ }
}
diff --git a/library/cpp/regex/pire/ut/common.h b/library/cpp/regex/pire/ut/common.h
index d79eedafb73..e88a2affc8d 100644
--- a/library/cpp/regex/pire/ut/common.h
+++ b/library/cpp/regex/pire/ut/common.h
@@ -2,7 +2,7 @@
* common.h --
*
* Copyright (c) 2007-2010, Dmitry Prokoptsev <[email protected]>,
- * Alexander Gololobov <[email protected]>
+ * Alexander Gololobov <[email protected]>
*
* This file is part of Pire, the Perl Incompatible
* Regular Expressions library.
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -39,83 +39,83 @@ using namespace Pire;
inline Pire::Fsm ParseRegexp(const char* str, const char* options = "", const Pire::Encoding** enc = 0)
{
- Pire::Lexer lexer;
- TVector<wchar32> ucs4;
-
- bool surround = true;
- for (; *options; ++options) {
- if (*options == 'i')
- lexer.AddFeature(Pire::Features::CaseInsensitive());
- else if (*options == 'u')
- lexer.SetEncoding(Pire::Encodings::Utf8());
- else if (*options == 'n')
- surround = false;
- else if (*options == 'a')
- lexer.AddFeature(Pire::Features::AndNotSupport());
- else
- throw std::invalid_argument("Unknown option: " + ystring(1, *options));
- }
-
- if (enc)
- *enc = &lexer.Encoding();
-
- lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4));
- lexer.Assign(ucs4.begin(), ucs4.end());
-
- Pire::Fsm fsm = lexer.Parse();
- if (surround)
- fsm.Surround();
- return fsm;
+ Pire::Lexer lexer;
+ TVector<wchar32> ucs4;
+
+ bool surround = true;
+ for (; *options; ++options) {
+ if (*options == 'i')
+ lexer.AddFeature(Pire::Features::CaseInsensitive());
+ else if (*options == 'u')
+ lexer.SetEncoding(Pire::Encodings::Utf8());
+ else if (*options == 'n')
+ surround = false;
+ else if (*options == 'a')
+ lexer.AddFeature(Pire::Features::AndNotSupport());
+ else
+ throw std::invalid_argument("Unknown option: " + ystring(1, *options));
+ }
+
+ if (enc)
+ *enc = &lexer.Encoding();
+
+ lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4));
+ lexer.Assign(ucs4.begin(), ucs4.end());
+
+ Pire::Fsm fsm = lexer.Parse();
+ if (surround)
+ fsm.Surround();
+ return fsm;
}
inline bool HasError(const char* regexp) {
- try {
- ParseRegexp(regexp);
- return false;
- } catch (Pire::Error& ex) {
- return true;
- }
+ try {
+ ParseRegexp(regexp);
+ return false;
+ } catch (Pire::Error& ex) {
+ return true;
+ }
}
struct Scanners {
- Pire::Scanner fast;
- Pire::NonrelocScanner nonreloc;
- Pire::SimpleScanner simple;
- Pire::SlowScanner slow;
- Pire::ScannerNoMask fastNoMask;
- Pire::NonrelocScannerNoMask nonrelocNoMask;
- Pire::HalfFinalScanner halfFinal;
- Pire::HalfFinalScannerNoMask halfFinalNoMask;
- Pire::NonrelocHalfFinalScanner nonrelocHalfFinal;
- Pire::NonrelocHalfFinalScannerNoMask nonrelocHalfFinalNoMask;
-
- Scanners(const Pire::Fsm& fsm, size_t distance = 0)
- : fast(Pire::Fsm(fsm).Compile<Pire::Scanner>(distance))
- , nonreloc(Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>(distance))
- , simple(Pire::Fsm(fsm).Compile<Pire::SimpleScanner>(distance))
- , slow(Pire::Fsm(fsm).Compile<Pire::SlowScanner>(distance))
- , fastNoMask(Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>(distance))
- , nonrelocNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>(distance))
- , halfFinal(Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>(distance))
- , halfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>(distance))
- , nonrelocHalfFinal(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>(distance))
- , nonrelocHalfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>(distance))
- {}
-
- Scanners(const char* str, const char* options = "")
- {
- Pire::Fsm fsm = ParseRegexp(str, options);
- fast = Pire::Fsm(fsm).Compile<Pire::Scanner>();
- nonreloc = Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>();
- simple = Pire::Fsm(fsm).Compile<Pire::SimpleScanner>();
- slow = Pire::Fsm(fsm).Compile<Pire::SlowScanner>();
- fastNoMask = Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>();
- nonrelocNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>();
- halfFinal = Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>();
- halfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>();
- nonrelocHalfFinal = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>();
- nonrelocHalfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>();
- }
+ Pire::Scanner fast;
+ Pire::NonrelocScanner nonreloc;
+ Pire::SimpleScanner simple;
+ Pire::SlowScanner slow;
+ Pire::ScannerNoMask fastNoMask;
+ Pire::NonrelocScannerNoMask nonrelocNoMask;
+ Pire::HalfFinalScanner halfFinal;
+ Pire::HalfFinalScannerNoMask halfFinalNoMask;
+ Pire::NonrelocHalfFinalScanner nonrelocHalfFinal;
+ Pire::NonrelocHalfFinalScannerNoMask nonrelocHalfFinalNoMask;
+
+ Scanners(const Pire::Fsm& fsm, size_t distance = 0)
+ : fast(Pire::Fsm(fsm).Compile<Pire::Scanner>(distance))
+ , nonreloc(Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>(distance))
+ , simple(Pire::Fsm(fsm).Compile<Pire::SimpleScanner>(distance))
+ , slow(Pire::Fsm(fsm).Compile<Pire::SlowScanner>(distance))
+ , fastNoMask(Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>(distance))
+ , nonrelocNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>(distance))
+ , halfFinal(Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>(distance))
+ , halfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>(distance))
+ , nonrelocHalfFinal(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>(distance))
+ , nonrelocHalfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>(distance))
+ {}
+
+ Scanners(const char* str, const char* options = "")
+ {
+ Pire::Fsm fsm = ParseRegexp(str, options);
+ fast = Pire::Fsm(fsm).Compile<Pire::Scanner>();
+ nonreloc = Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>();
+ simple = Pire::Fsm(fsm).Compile<Pire::SimpleScanner>();
+ slow = Pire::Fsm(fsm).Compile<Pire::SlowScanner>();
+ fastNoMask = Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>();
+ nonrelocNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>();
+ halfFinal = Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>();
+ halfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>();
+ nonrelocHalfFinal = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>();
+ nonrelocHalfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>();
+ }
};
#ifdef PIRE_DEBUG
@@ -123,33 +123,33 @@ struct Scanners {
template <class Scanner>
inline ystring DbgState(const Scanner& scanner, typename Scanner::State state)
{
- return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring());
+ return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring());
}
/*
inline ystring DbgState(const Pire::SimpleScanner& scanner, Pire::SimpleScanner::State state)
{
- return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring());
+ return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring());
}
*/
inline ystring DbgState(const Pire::SlowScanner& scanner, const Pire::SlowScanner::State& state)
{
- return ystring("(") + Join(state.states.begin(), state.states.end(), ", ") + ystring(")") + (scanner.Final(state) ? ystring(" [final]") : ystring());
+ return ystring("(") + Join(state.states.begin(), state.states.end(), ", ") + ystring(")") + (scanner.Final(state) ? ystring(" [final]") : ystring());
}
template<class Scanner>
void DbgRun(const Scanner& scanner, typename Scanner::State& state, const char* begin, const char* end)
{
- for (; begin != end; ++begin) {
- char tmp[8];
- if (*begin >= 32) {
- tmp[0] = *begin;
- tmp[1] = 0;
- } else
- snprintf(tmp, sizeof(tmp)-1, "\\%03o", (unsigned char) *begin);
- std::clog << DbgState(scanner, state) << " --[" << tmp << "]--> ";
- scanner.Next(state, (unsigned char) *begin);
- std::clog << DbgState(scanner, state) << "\n";
- }
+ for (; begin != end; ++begin) {
+ char tmp[8];
+ if (*begin >= 32) {
+ tmp[0] = *begin;
+ tmp[1] = 0;
+ } else
+ snprintf(tmp, sizeof(tmp)-1, "\\%03o", (unsigned char) *begin);
+ std::clog << DbgState(scanner, state) << " --[" << tmp << "]--> ";
+ scanner.Next(state, (unsigned char) *begin);
+ std::clog << DbgState(scanner, state) << "\n";
+ }
}
#define Run DbgRun
@@ -158,34 +158,34 @@ void DbgRun(const Scanner& scanner, typename Scanner::State& state, const char*
template<class Scanner>
typename Scanner::State RunRegexp(const Scanner& scanner, const ystring& str)
{
- PIRE_IFDEBUG(std::clog << "--- checking against " << str << "\n");
-
- typename Scanner::State state;
- scanner.Initialize(state);
- Step(scanner, state, BeginMark);
- Run(scanner, state, str.c_str(), str.c_str() + str.length());
- Step(scanner, state, EndMark);
- return state;
+ PIRE_IFDEBUG(std::clog << "--- checking against " << str << "\n");
+
+ typename Scanner::State state;
+ scanner.Initialize(state);
+ Step(scanner, state, BeginMark);
+ Run(scanner, state, str.c_str(), str.c_str() + str.length());
+ Step(scanner, state, EndMark);
+ return state;
}
template<class Scanner>
typename Scanner::State RunRegexp(const Scanner& scanner, const char* str)
{
- return RunRegexp(scanner, ystring(str));
+ return RunRegexp(scanner, ystring(str));
}
template<class Scanner>
bool Matches(const Scanner& scanner, const ystring& str)
{
- auto state = RunRegexp(scanner, str);
- auto result = scanner.AcceptedRegexps(state);
- return result.first != result.second;
+ auto state = RunRegexp(scanner, str);
+ auto result = scanner.AcceptedRegexps(state);
+ return result.first != result.second;
}
template<class Scanner>
bool Matches(const Scanner& scanner, const char* str)
{
- return Matches(scanner, ystring(str));
+ return Matches(scanner, ystring(str));
}
#define SCANNER(fsm) for (Scanners m_scanners(fsm), *m_flag = &m_scanners; m_flag; m_flag = 0)
@@ -193,32 +193,32 @@ bool Matches(const Scanner& scanner, const char* str)
#define REGEXP(pattern) for (Scanners m_scanners(pattern), *m_flag = &m_scanners; m_flag; m_flag = 0)
#define REGEXP2(pattern,flags) for (Scanners m_scanners(pattern, flags), *m_flag = &m_scanners; m_flag; m_flag = 0)
#define ACCEPTS(str) \
- do {\
- UNIT_ASSERT(Matches(m_scanners.fast, str));\
+ do {\
+ UNIT_ASSERT(Matches(m_scanners.fast, str));\
UNIT_ASSERT(Matches(m_scanners.nonreloc, str));\
- UNIT_ASSERT(Matches(m_scanners.simple, str));\
- UNIT_ASSERT(Matches(m_scanners.slow, str));\
- UNIT_ASSERT(Matches(m_scanners.fastNoMask, str));\
- UNIT_ASSERT(Matches(m_scanners.nonrelocNoMask, str));\
- UNIT_ASSERT(Matches(m_scanners.halfFinal, str));\
- UNIT_ASSERT(Matches(m_scanners.halfFinalNoMask, str));\
- UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinal, str));\
- UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinalNoMask, str));\
- } while (false)
+ UNIT_ASSERT(Matches(m_scanners.simple, str));\
+ UNIT_ASSERT(Matches(m_scanners.slow, str));\
+ UNIT_ASSERT(Matches(m_scanners.fastNoMask, str));\
+ UNIT_ASSERT(Matches(m_scanners.nonrelocNoMask, str));\
+ UNIT_ASSERT(Matches(m_scanners.halfFinal, str));\
+ UNIT_ASSERT(Matches(m_scanners.halfFinalNoMask, str));\
+ UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinal, str));\
+ UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinalNoMask, str));\
+ } while (false)
#define DENIES(str) \
- do {\
- UNIT_ASSERT(!Matches(m_scanners.fast, str));\
+ do {\
+ UNIT_ASSERT(!Matches(m_scanners.fast, str));\
UNIT_ASSERT(!Matches(m_scanners.nonreloc, str));\
- UNIT_ASSERT(!Matches(m_scanners.simple, str));\
- UNIT_ASSERT(!Matches(m_scanners.slow, str));\
- UNIT_ASSERT(!Matches(m_scanners.fastNoMask, str));\
- UNIT_ASSERT(!Matches(m_scanners.nonrelocNoMask, str));\
- UNIT_ASSERT(!Matches(m_scanners.halfFinal, str));\
- UNIT_ASSERT(!Matches(m_scanners.halfFinalNoMask, str));\
- UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinal, str));\
- UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinalNoMask, str));\
- } while (false)
+ UNIT_ASSERT(!Matches(m_scanners.simple, str));\
+ UNIT_ASSERT(!Matches(m_scanners.slow, str));\
+ UNIT_ASSERT(!Matches(m_scanners.fastNoMask, str));\
+ UNIT_ASSERT(!Matches(m_scanners.nonrelocNoMask, str));\
+ UNIT_ASSERT(!Matches(m_scanners.halfFinal, str));\
+ UNIT_ASSERT(!Matches(m_scanners.halfFinalNoMask, str));\
+ UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinal, str));\
+ UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinalNoMask, str));\
+ } while (false)
#endif
diff --git a/library/cpp/regex/pire/ut/count_ut.cpp b/library/cpp/regex/pire/ut/count_ut.cpp
index ffe7943fcc6..0db72a4ad56 100644
--- a/library/cpp/regex/pire/ut/count_ut.cpp
+++ b/library/cpp/regex/pire/ut/count_ut.cpp
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -33,551 +33,551 @@
Y_UNIT_TEST_SUITE(TestCount) {
- Pire::Fsm MkFsm(const char* regexp, const Pire::Encoding& encoding)
- {
- Pire::Lexer lex;
- lex.SetEncoding(encoding);
- TVector<wchar32> ucs4;
- encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4));
- lex.Assign(ucs4.begin(), ucs4.end());
- return lex.Parse();
- }
-
- template<class Scanner>
- typename Scanner::State InitializedState(const Scanner& scanner)
- {
- typename Scanner::State state;
- scanner.Initialize(state);
- return state;
- }
-
- template<class Scanner>
- typename Scanner::State Run(const Scanner& scanner, const char* text, size_t len =-1)
- {
- if (len == (size_t)-1) len = strlen(text);
- auto state = InitializedState(scanner);
- Pire::Step(scanner, state, Pire::BeginMark);
- Pire::Run(scanner, state, text, text + len);
- Pire::Step(scanner, state, Pire::EndMark);
- return state;
- }
-
- template<class Scanner>
- size_t CountOne(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8())
- {
- const auto regexpFsm = MkFsm(regexp, encoding);
- const auto separatorFsm = MkFsm(separator, encoding);
- return Run(Scanner{regexpFsm, separatorFsm}, text, len).Result(0);
- }
-
- size_t Count(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8())
- {
- const auto regexpFsm = MkFsm(regexp, encoding);
- const auto separatorFsm = MkFsm(separator, encoding);
- auto countingResult = Run(Pire::CountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
- auto newResult = Run(Pire::AdvancedCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
- if (strcmp(separator, ".*") == 0) {
- HalfFinalFsm fsm(regexpFsm);
- fsm.MakeGreedyCounter(true);
- auto halfFinalSimpleResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0);
- fsm = HalfFinalFsm(regexpFsm);
- fsm.MakeGreedyCounter(false);
- auto halfFinalCorrectResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0);
- UNIT_ASSERT_EQUAL(halfFinalSimpleResult, halfFinalCorrectResult);
- UNIT_ASSERT_EQUAL(halfFinalSimpleResult, countingResult);
- }
- UNIT_ASSERT_EQUAL(countingResult, newResult);
- auto noGlueLimitResult = Run(Pire::NoGlueLimitCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
- UNIT_ASSERT_EQUAL(countingResult, noGlueLimitResult);
- return newResult;
- }
-
- Y_UNIT_TEST(Count)
- {
- UNIT_ASSERT_EQUAL(Count("[a-z]+", "\\s", "abc def, abc def ghi, abc"), size_t(3));
- char aaa[] = "abc def\0 abc\0 def ghi, abc";
- UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa), Pire::Encodings::Latin1()), size_t(6));
- UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa)), size_t(6));
- UNIT_ASSERT_EQUAL(Count("\\w", "", "abc abcdef abcd abcdefgh ac"), size_t(8));
- UNIT_ASSERT_EQUAL(Count("http", ".*", "http://aaa, http://bbb, something in the middle, http://ccc, end"), size_t(3));
- UNIT_ASSERT_EQUAL(Count("abc", ".*", "abcabcabcabc"), size_t(4));
- UNIT_ASSERT_EQUAL(Count("[\320\220-\320\257\320\260-\321\217]+", "\\s+", " \320\257\320\275\320\264\320\265\320\272\321\201 "
- "\320\237\320\276\320\262\320\265\321\200\320\275\321\203\321\202\321\214 \320\222\320\276\320\271\321\202\320\270\302\240"
- "\320\262\302\240\320\277\320\276\321\207\321\202\321\203 \302\251\302\240" "1997\342\200\224" "2008 "
- "\302\253\320\257\320\275\320\264\320\265\320\272\321\201\302\273 \320\224\320\270\320\267\320\260\320\271\320\275\302"
- "\240\342\200\224\302\240\320\241\321\202\321\203\320\264\320\270\321\217 \320\220\321\200\321\202\320\265\320\274\320\270"
- "\321\217\302\240\320\233\320\265\320\261\320\265\320\264\320\265\320\262\320\260\012\012"), size_t(5));
- UNIT_ASSERT_EQUAL(Count("\321\201\320\265\320\272\321\201", ".*",
- "\320\277\320\276\321\200\320\275\320\276, \320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 "
- "\320\264\320\265\321\202\320\270, \320\264\320\265\321\202\320\270 \320\277\320\276\321\200\320\275\320\276 "
- "\320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265. "
- "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265 \320\262\320\270\320\264\320\265\320\276 "
- "\320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270. \320\264\320\265\321\202\320\270 "
- "\320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265\320\276 "
- "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265!<br> "
- "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\264\320\273\321\217 \320\277\320\276\320\264 "
- "\321\201\320\265\320\272\321\201\320\260 \320\277\320\260\321\200\320\276\320\271 "
- "\321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201 \320\270\321\211\320\265\320\274 "
- "\320\272\320\260\320\271\321\204\320\276\320\274. \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 "
- "\320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 "
- "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277\320\260\321\200\320\276\320\271 "
- "\320\270\321\211\320\265\320\274 \321\201 \320\264\320\273\321\217 \321\201\320\265\320\272\321\201\320\260!<br> "
- "\321\202\320\270\321\202\321\214\320\272\320\270 \320\261\320\276\320\273\321\214\321\210\320\270\320\265. "
- "\320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 \320\264\320\265\321\202\320\270!<br> "
- "\320\270\321\211\320\265\320\274 \321\201 \320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 "
- "\321\201\320\265\320\272\321\201\320\260\320\277\320\260\321\200\320\276\320\271 \320\264\320\273\321\217 "
- "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271! "
- "\320\261\320\276\320\273\321\214\321\210\320\270\320\265 \321\202\320\270\321\202\321\214\320\272\320\270, "
- "\320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 \321\201\320\270\321\201\321\202\320\265\320\274\320\260 "
- "\320\264\320\273\321\217 \320\276\320\277\320\276\321\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202"
- "\320\265\320\273\321\214\320\275\320\260\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\264"
- "\320\273\321\217 \320\270\321\211\320\265\320\274 \321\201\320\265\320\272\321\201\320\260 \320\272\320\260\320\271\321\204"
- "\320\276\320\274 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275"
- "\320\276\320\271 \320\277\320\276\320\264 \320\277\320\260\321\200\320\276\320\271 \321\201. \320\276\320\277\320\276\321"
- "\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202\320\265\320\273\321\214\320\275\320\260\321\217 \321"
- "\201\320\270\321\201\321\202\320\265\320\274\320\260 \320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 "
- "\320\264\320\273\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\261\320\265\321\201\320\277"
- "\320\273\320\260\321\202\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265"
- "\320\276 \320\264\320\265\321\202\320\270. \320\276\321\204\320\270\321\206\320\265\321\200\321\213 \320\277\320\276\321"
- "\200\320\275\320\276 \321\204\320\276\321\202\320\276 \320\263\320\265\320\270, \320\270\321\211\320\265\320\274 \321\201"
- "\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277"
- "\320\276 \320\277\320\260\321\200\320\276\320\271 \321\201\320\265\320\272\321\201\320\260 \320\264\320\273\321\217 \321\201 "
- "\320\272\320\260\320\271\321\204\320\276\320\274. \320\277\320\276\320\264 \320\264\320\273\321\217 \320\272\320\260\320\271"
- "\321\204\320\276\320\274 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201\320\265\320\272\321\201"
- "\320\260 \320\277\320\260\321\200\320\276\320\271 \321\201 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\270"
- "\321\211\320\265\320\274? \320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202"
- "\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270, \320\264\320\265\321\202"
- "\320\270 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265"),
- size_t(6));
- UNIT_ASSERT_EQUAL(Count("<a[^>]*>[^<]*</a>", "([^<]|<br\\s?/?>)*", "\321\200\320\275\320\276</a><br />"
- "<a href=\"http://wapspzk.1sweethost.com//22.html\">\320\264\320\265\321\210\320\265\320\262\321\213\320\265 \320\277\320\276"
- "\321\200\320\275\320\276 \321\204\320\270\320\273\321\214\320\274\321\213</a><br /><a href=\"http://wapspzk.1sweethost.com//23.html\">"
- "\321\201\320\265\320\272\321\201 \321\210\320\276\320\277 \321\200\320\276\321\201\320\270\321\202\320\260</a><br />"
- "<a href=\"http://wapspzk.1sweethost.com//24.html\">\320\263\320\276\320\273\321\213\320\265 \320\264\320\265\320\262\321\203"
- "\321\210\320\272\320\270 \321\203\320\273\320\270\321\206\320\260</a><br /><a href=\"http://wapspzk.1sweethost.com//25.html\">"
- "\321\202\321\200\320\260\321\205\320\275\321\203\321\202\321\214 \320\274\320\260\320\274\320\260\321\210\320\270</a><br />"
- "<a href=\"http://wapspzk.1sweethost.com//26.html\">\320\277\320\270\320\267\320\264\320\260 \321\204\321\200\320\270\321\201"
- "\320\272\320\265</a><br /><a href=\"http://wapspzk.1sweethost.com//27.html\">\320\261\320\265\321\201\320\277\320\273\320\260"
- "\321\202\320\275\320\276</a><br /><a href=\"http://wapspzk.1sweethost.com//33.html\">\321\201\320\276\321\206\320\270\320\276"
- "\320\273\320\276\320\263\320\270\321\207\320\265\321\201\320\272\320\270\320\271 \320\260\320\275\320\260\320\273\320\270\320"
- "\267 \320\274\320\276\320\264\320\265\320\273\320\265\320\271 \321\201\320\265\320\272\321\201\321\203\320\260\320\273\321\214"
- "\320\275\320\276\320\263\320\276 \320\277\320\276\320\262\320\265\320\264\320\265\320\275\320\270\321\217</a>\321\217"), size_t(7));
- UNIT_ASSERT(CountOne<Pire::CountingScanner>("a", "b", "aaa") != size_t(3));
- UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("a", "b", "aaa"), size_t(1));
- UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("[a-z\320\260-\321\217]+", " +",
- " \320\260\320\260\320\220 abc def \320\260 cd"),
- size_t(4)); // Pire::CountingScanner returns 1 here, since it enters a dead state
- }
-
- Y_UNIT_TEST(CountWithoutSeparator)
- {
- UNIT_ASSERT_EQUAL(Count("a", "", "aa aaa"), size_t(3));
- }
-
- Y_UNIT_TEST(CountGreedy)
- {
- const auto& enc = Pire::Encodings::Latin1();
- char text[] = "wwwsswwwsssswwws";
- UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3));
- UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3));
- UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3));
- UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3));
- }
-
- Y_UNIT_TEST(CountRepeating)
- {
- char text[] = "abbabbabbabbat";
- UNIT_ASSERT_EQUAL(Count("abba", ".*", text, sizeof(text), Pire::Encodings::Latin1()), size_t(2));
- }
-
- template<class Scanner>
- void CountGlueOne()
- {
- const auto& enc = Pire::Encodings::Utf8();
- auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
- auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
- auto sc = Scanner::Glue(sc1, sc2);
- auto st = Run(sc, "abc defg 123 jklmn 4567 opqrst");
- UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
- UNIT_ASSERT_EQUAL(st.Result(1), size_t(2));
- }
-
- Y_UNIT_TEST(CountGlue)
- {
- CountGlueOne<Pire::CountingScanner>();
- CountGlueOne<Pire::AdvancedCountingScanner>();
- CountGlueOne<Pire::NoGlueLimitCountingScanner>();
- }
-
- template <class Scanner>
- void CountManyGluesOne(size_t maxRegexps) {
- const auto& encoding = Pire::Encodings::Utf8();
- auto text = "abcdbaa aa";
- TVector<ypair<std::string, std::string>> tasks = {
- {"a", ".*"},
- {"b", ".*"},
- {"c", ".*"},
- {"ba", ".*"},
- {"ab",".*"},
- };
- TVector<size_t> answers = {5, 2, 1, 1, 1};
- Scanner scanner;
- size_t regexpsCount = 0;
- for (; regexpsCount < maxRegexps; ++regexpsCount) {
- const auto& task = tasks[regexpsCount % tasks.size()];
- const auto regexpFsm = MkFsm(task.first.c_str(), encoding);
- const auto separatorFsm = MkFsm(task.second.c_str(), encoding);
- Scanner nextScanner(regexpFsm, separatorFsm);
- auto glue = Scanner::Glue(scanner, nextScanner);
- if (glue.Empty()) {
- break;
- }
- scanner = std::move(glue);
- }
- auto state = Run(scanner, text);
- for (size_t i = 0; i < regexpsCount; ++i) {
- UNIT_ASSERT_EQUAL(state.Result(i), answers[i % answers.size()]);
- }
- }
-
- Y_UNIT_TEST(CountManyGlues)
- {
- CountManyGluesOne<Pire::CountingScanner>(20);
- CountManyGluesOne<Pire::AdvancedCountingScanner>(20);
- CountManyGluesOne<Pire::NoGlueLimitCountingScanner>(50);
- }
-
- template<class Scanner>
- void CountBoundariesOne()
- {
- const char* strings[] = { "abcdef", "abc def", "defcba", "wxyz abc", "a", "123" };
-
- const auto& enc = Pire::Encodings::Utf8();
- Scanner sc(MkFsm("^[a-z]+$", enc), MkFsm("(.|^|$)*", enc));
- auto st = InitializedState(sc);
- for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) {
- Pire::Step(sc, st, Pire::BeginMark);
- Pire::Run(sc, st, strings[i], strings[i] + strlen(strings[i]));
- Pire::Step(sc, st, Pire::EndMark);
- }
- UNIT_ASSERT_EQUAL(st.Result(0), size_t(3));
-
- const auto& enc2 = Pire::Encodings::Latin1();
- Scanner sc2(MkFsm("[a-z]", enc2), MkFsm(".*", enc2));
- auto st2 = InitializedState(sc2);
- for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) {
- Pire::Step(sc2, st2, Pire::BeginMark);
- Pire::Run(sc2, st2, strings[i], strings[i] + strlen(strings[i]));
- Pire::Step(sc2, st2, Pire::EndMark);
- }
- UNIT_ASSERT_EQUAL(st2.Result(0), size_t(7));
- }
-
- Y_UNIT_TEST(CountBoundaries)
- {
- CountBoundariesOne<Pire::CountingScanner>();
- CountBoundariesOne<Pire::AdvancedCountingScanner>();
- CountBoundariesOne<Pire::NoGlueLimitCountingScanner>();
- }
-
- template<class Scanner>
- void SerializationOne()
- {
- const auto& enc = Pire::Encodings::Latin1();
- auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
- auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
- auto sc = Scanner::Glue(sc1, sc2);
-
- BufferOutput wbuf;
- ::Save(&wbuf, sc);
-
- MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
- Scanner sc3;
- ::Load(&rbuf, sc3);
-
- auto st = Run(sc3, "abc defg 123 jklmn 4567 opqrst");
- UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
- UNIT_ASSERT_EQUAL(st.Result(1), size_t(2));
-
- const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
- TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
-
- // Test mmap-ing at various alignments
- for (size_t offset = 0; offset < MaxTestOffset; ++offset) {
- const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
- memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
- try {
- Scanner sc4;
- const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
-
- if (offset % sizeof(size_t) != 0) {
- UNIT_ASSERT(!"CountingScanner failed to check for misaligned mmaping");
- } else {
- UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
-
- st = Run(sc4, "abc defg 123 jklmn 4567 opqrst");
- UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
- UNIT_ASSERT_EQUAL(st.Result(1), size_t(2));
- }
- }
- catch (Pire::Error&) {}
- }
- }
-
- Y_UNIT_TEST(Serialization)
- {
- SerializationOne<Pire::CountingScanner>();
- SerializationOne<Pire::AdvancedCountingScanner>();
- SerializationOne<Pire::NoGlueLimitCountingScanner>();
- }
-
- template<class Scanner>
- void Serialization_v6_compatibilityOne()
- {
- const auto& enc = Pire::Encodings::Latin1();
- auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
- auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
- auto sc = Scanner::Glue(sc1, sc2);
-
- BufferOutput wbuf;
- ::Save(&wbuf, sc);
-
- // Patched scanner is a scanner of RE_VERSION 6.
- // The patched scanner is concatenated with original scanner to
- // make sure all content of patched scanner is consumed.
-
- const size_t ALIGNMENT = sizeof(size_t);
- size_t actions_size =
- sc.Size() *
- sc.LettersCount() *
- sizeof(typename Scanner::Action);
- UNIT_ASSERT_EQUAL(actions_size % ALIGNMENT, 0);
- size_t tags_size = sc.Size() * sizeof(typename Scanner::Tag);
- const char* src = wbuf.Buffer().Data();
- size_t src_size = wbuf.Buffer().Size();
- size_t patched_size = src_size + actions_size;
- size_t bytes_before_actions = src_size - tags_size;
- const int fill_char = 0x42;
-
- TVector<char> buf2(patched_size + src_size + 2 * ALIGNMENT);
- char* dst = reinterpret_cast<char*>(Pire::Impl::AlignUp(&buf2[0], ALIGNMENT));
- char* patched = dst;
-
- // Insert dummy m_actions between m_jumps and m_tags.
- memcpy(patched, src, bytes_before_actions); // copy members before m_actions
- memset(patched + bytes_before_actions, fill_char, actions_size); // m_actions
- memcpy(patched + bytes_before_actions + actions_size,
- src + bytes_before_actions,
- tags_size); // m_tags
- // Set version to 6
- // order of fields in header: magic, version, ...
- ui32* version_in_patched = reinterpret_cast<ui32*>(patched) + 1;
- UNIT_ASSERT_EQUAL(*version_in_patched, Pire::Header::RE_VERSION);
- *version_in_patched = Pire::Header::RE_VERSION_WITH_MACTIONS;
-
- // write normal scanner after patched one
- char* normal = Pire::Impl::AlignUp(patched + patched_size, ALIGNMENT);
- memcpy(normal, src, src_size);
- char* dst_end = Pire::Impl::AlignUp(normal + src_size, ALIGNMENT);
- size_t dst_size = dst_end - dst;
-
- // test loading from stream
- {
- MemoryInput rbuf(dst, dst_size);
- Scanner sc_patched, sc_normal;
- ::Load(&rbuf, sc_patched);
- ::Load(&rbuf, sc_normal);
- auto st_patched = Run(sc_patched,
- "abc defg 123 jklmn 4567 opqrst");
- UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4));
- UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2));
- auto st_normal = Run(sc_normal,
- "abc defg 123 jklmn 4567 opqrst");
- UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4));
- UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2));
- }
-
- // test loading using Mmap
- {
- Scanner sc_patched, sc_normal;
- const void* tail = sc_patched.Mmap(patched, patched_size);
- UNIT_ASSERT_EQUAL(tail, normal);
- const void* tail2 = sc_normal.Mmap(tail, src_size);
- UNIT_ASSERT_EQUAL(tail2, dst_end);
- auto st_patched = Run(sc_patched,
- "abc defg 123 jklmn 4567 opqrst");
- UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4));
- UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2));
- auto st_normal = Run(sc_normal,
- "abc defg 123 jklmn 4567 opqrst");
- UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4));
- UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2));
- }
- }
-
- Y_UNIT_TEST(Serialization_v6_compatibility)
- {
- Serialization_v6_compatibilityOne<Pire::CountingScanner>();
- Serialization_v6_compatibilityOne<Pire::AdvancedCountingScanner>();
- // NoGlueLimitCountingScanner is not v6_compatible
- }
-
- Y_UNIT_TEST(NoGlueLimitScannerCompatibilityWithAdvancedScanner) {
- const auto& enc = Pire::Encodings::Latin1();
- auto sc1 = AdvancedCountingScanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
- auto sc2 = AdvancedCountingScanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
- auto sc = AdvancedCountingScanner::Glue(sc1, sc2);
-
- BufferOutput wbuf;
- ::Save(&wbuf, sc);
-
- TVector<char> buf2(wbuf.Buffer().Size());
- memcpy(buf2.data(), wbuf.Buffer().Data(), wbuf.Buffer().Size());
-
- // test loading from stream
- {
- MemoryInput rbuf(buf2.data(), buf2.size());
- NoGlueLimitCountingScanner scanner;
- ::Load(&rbuf, scanner);
- auto state = Run(scanner,
- "abc defg 123 jklmn 4567 opqrst");
- UNIT_ASSERT_EQUAL(state.Result(0), size_t(4));
- UNIT_ASSERT_EQUAL(state.Result(1), size_t(2));
- }
-
- // test loading using Mmap
- {
- NoGlueLimitCountingScanner scanner;
- const void* tail = scanner.Mmap(buf2.data(), buf2.size());
- UNIT_ASSERT_EQUAL(tail, buf2.data() + buf2.size());
- auto state = Run(scanner,
- "abc defg 123 jklmn 4567 opqrst");
- UNIT_ASSERT_EQUAL(state.Result(0), size_t(4));
- UNIT_ASSERT_EQUAL(state.Result(1), size_t(2));
- }
- }
-
- template<class Scanner>
- void EmptyOne()
- {
- Scanner sc;
- UNIT_ASSERT(sc.Empty());
-
- UNIT_CHECKPOINT(); Run(sc, "a string"); // Just should not crash
-
- // Test glueing empty
- const auto& enc = Pire::Encodings::Latin1();
- auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
- auto sc2 = Scanner::Glue(sc, Scanner::Glue(sc, sc1));
- auto st = Run(sc2, "abc defg 123 jklmn 4567 opqrst");
- UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
-
- // Test Save/Load/Mmap
- BufferOutput wbuf;
- ::Save(&wbuf, sc);
-
- MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
- Pire::CountingScanner sc3;
- ::Load(&rbuf, sc3);
- UNIT_CHECKPOINT(); Run(sc3, "a string");
-
- const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
- TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
- const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
- memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
-
- Scanner sc4;
- const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
- UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
- UNIT_CHECKPOINT(); Run(sc4, "a string");
- }
-
- Y_UNIT_TEST(Empty)
- {
- EmptyOne<Pire::CountingScanner>();
- EmptyOne<Pire::AdvancedCountingScanner>();
- EmptyOne<Pire::NoGlueLimitCountingScanner>();
- }
-
- template<typename Scanner>
- TVector<Scanner> MakeHalfFinalCount(const char* regexp, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) {
- TVector<Scanner> scanners(6);
- const auto regexpFsm = MkFsm(regexp, encoding);
- HalfFinalFsm fsm(regexpFsm);
- fsm.MakeGreedyCounter(true);
- scanners[0] = Scanner(fsm);
- fsm = HalfFinalFsm(regexpFsm);
- fsm.MakeGreedyCounter(false);
- scanners[1] = Scanner(fsm);
- fsm = HalfFinalFsm(regexpFsm);
- fsm.MakeNonGreedyCounter(true, true);
- scanners[2] = Scanner(fsm);
- fsm = HalfFinalFsm(regexpFsm);
- fsm.MakeNonGreedyCounter(true, false);
- scanners[3] = Scanner(fsm);
- fsm = HalfFinalFsm(regexpFsm);
- fsm.MakeNonGreedyCounter(false);
- scanners[4] = Scanner(fsm);
- scanners[5] = scanners[0];
- for (size_t i = 1; i < 5; i++) {
- scanners[5] = Scanner::Glue(scanners[5], scanners[i]);
- }
- return scanners;
- }
-
- template<typename Scanner>
- void HalfFinalCount(TVector<Scanner> scanners, const char* text, TVector<size_t> result) {
- for (size_t i = 0; i < 5; i++) {
- UNIT_ASSERT_EQUAL(Run(scanners[i], text, -1).Result(0), result[i]);
- }
- auto state = Run(scanners[5], text, -1);
- for (size_t i = 0; i < 5; i++) {
- UNIT_ASSERT_EQUAL(state.Result(i), result[i]);
- }
- }
-
- template<typename Scanner>
- void TestHalfFinalCount() {
- HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+"), "abbabbbabbbbbb", {3, 3, 3, 11, 3});
- HalfFinalCount(MakeHalfFinalCount<Scanner>("(ab)+"), "ababbababbab", {3, 3, 5, 5, 5});
- HalfFinalCount(MakeHalfFinalCount<Scanner>("(abab)+"), "ababababab", {1, 1, 4, 4, 2});
- HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbb", {1, 10, 10, 10, 10});
- HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbb", {1, 10, 11, 11, 11});
- HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbc", {1, 1, 10, 11, 10});
- HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbbc", {1, 1, 11, 12, 11});
- HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbbc", {1, 1, 8, 9, 8});
- HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbb", {1, 8, 8, 8, 8});
- HalfFinalCount(MakeHalfFinalCount<Scanner>("a[a-z]+c|b"), "abeeeebeeeeeeeeeceeaeebeeeaeecceebeeaeebeeb", {2, 4, 7, 9, 7});
- }
-
- Y_UNIT_TEST(HalfFinal)
- {
- TestHalfFinalCount<Pire::HalfFinalScanner>();
- TestHalfFinalCount<Pire::NonrelocHalfFinalScanner>();
- TestHalfFinalCount<Pire::HalfFinalScannerNoMask>();
- TestHalfFinalCount<Pire::NonrelocHalfFinalScannerNoMask>();
- }
-
- template<typename Scanner>
- void TestHalfFinalSerialization() {
- auto oldScanners = MakeHalfFinalCount<Scanner>("(\\w\\w)+");
- BufferOutput wbuf;
- for (size_t i = 0; i < 6; i++) {
- ::Save(&wbuf, oldScanners[i]);
- }
-
- MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
- TVector<Scanner> scanners(6);
- for (size_t i = 0; i < 6; i++) {
- ::Load(&rbuf, scanners[i]);
- }
-
- HalfFinalCount(scanners, "ab abbb ababa a", {3, 3, 8, 8, 5});
- }
-
- Y_UNIT_TEST(HalfFinalSerialization)
- {
- TestHalfFinalSerialization<Pire::HalfFinalScanner>();
- TestHalfFinalSerialization<Pire::HalfFinalScannerNoMask>();
- }
+ Pire::Fsm MkFsm(const char* regexp, const Pire::Encoding& encoding) // Test helper: parses `regexp` with a Pire::Lexer configured for `encoding` and returns the parsed Fsm.
+ {
+ Pire::Lexer lex;
+ lex.SetEncoding(encoding);
+ TVector<wchar32> ucs4; // wchar32 form of the pattern, filled by FromLocal() below
+ encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4)); // strlen: the pattern is read up to its first NUL
+ lex.Assign(ucs4.begin(), ucs4.end());
+ return lex.Parse();
+ }
+
+ template<class Scanner>
+ typename Scanner::State InitializedState(const Scanner& scanner) // Returns a new Scanner::State after passing it through scanner.Initialize().
+ {
+ typename Scanner::State state;
+ scanner.Initialize(state);
+ return state;
+ }
+
+ template<class Scanner>
+ typename Scanner::State Run(const Scanner& scanner, const char* text, size_t len =-1) // Runs `scanner` over text[0, len) bracketed by BeginMark/EndMark steps and returns the final state.
+ {
+ if (len == (size_t)-1) len = strlen(text); // the default len (-1 converted to size_t) means "treat text as NUL-terminated"
+ auto state = InitializedState(scanner);
+ Pire::Step(scanner, state, Pire::BeginMark);
+ Pire::Run(scanner, state, text, text + len);
+ Pire::Step(scanner, state, Pire::EndMark);
+ return state;
+ }
+
+ template<class Scanner>
+ size_t CountOne(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) // Builds one Scanner from the regexp/separator pair, runs it over `text` and returns Result(0).
+ {
+ const auto regexpFsm = MkFsm(regexp, encoding);
+ const auto separatorFsm = MkFsm(separator, encoding);
+ return Run(Scanner{regexpFsm, separatorFsm}, text, len).Result(0); // len is forwarded unchanged, so -1 still means strlen(text) inside Run()
+ }
+
+ size_t Count(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) // Counts with several scanner types, asserts that they agree, and returns the AdvancedCountingScanner result.
+ {
+ const auto regexpFsm = MkFsm(regexp, encoding);
+ const auto separatorFsm = MkFsm(separator, encoding);
+ auto countingResult = Run(Pire::CountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
+ auto newResult = Run(Pire::AdvancedCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
+ if (strcmp(separator, ".*") == 0) { // the HalfFinalScanner cross-check runs only when the separator is literally ".*"
+ HalfFinalFsm fsm(regexpFsm);
+ fsm.MakeGreedyCounter(true);
+ auto halfFinalSimpleResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0);
+ fsm = HalfFinalFsm(regexpFsm); // rebuilt from the regexp Fsm before the second MakeGreedyCounter call
+ fsm.MakeGreedyCounter(false);
+ auto halfFinalCorrectResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0);
+ UNIT_ASSERT_EQUAL(halfFinalSimpleResult, halfFinalCorrectResult);
+ UNIT_ASSERT_EQUAL(halfFinalSimpleResult, countingResult);
+ }
+ UNIT_ASSERT_EQUAL(countingResult, newResult);
+ auto noGlueLimitResult = Run(Pire::NoGlueLimitCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
+ UNIT_ASSERT_EQUAL(countingResult, noGlueLimitResult);
+ return newResult;
+ }
+
+ Y_UNIT_TEST(Count) // Expected match counts for ASCII, NUL-containing and multi-byte (octal-escaped) inputs.
+ {
+ UNIT_ASSERT_EQUAL(Count("[a-z]+", "\\s", "abc def, abc def ghi, abc"), size_t(3));
+ char aaa[] = "abc def\0 abc\0 def ghi, abc"; // contains embedded NULs, so the calls below pass the length explicitly
+ UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa), Pire::Encodings::Latin1()), size_t(6)); // sizeof(aaa) also covers the terminating NUL
+ UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa)), size_t(6));
+ UNIT_ASSERT_EQUAL(Count("\\w", "", "abc abcdef abcd abcdefgh ac"), size_t(8));
+ UNIT_ASSERT_EQUAL(Count("http", ".*", "http://aaa, http://bbb, something in the middle, http://ccc, end"), size_t(3));
+ UNIT_ASSERT_EQUAL(Count("abc", ".*", "abcabcabcabc"), size_t(4));
+ UNIT_ASSERT_EQUAL(Count("[\320\220-\320\257\320\260-\321\217]+", "\\s+", " \320\257\320\275\320\264\320\265\320\272\321\201 "
+ "\320\237\320\276\320\262\320\265\321\200\320\275\321\203\321\202\321\214 \320\222\320\276\320\271\321\202\320\270\302\240"
+ "\320\262\302\240\320\277\320\276\321\207\321\202\321\203 \302\251\302\240" "1997\342\200\224" "2008 "
+ "\302\253\320\257\320\275\320\264\320\265\320\272\321\201\302\273 \320\224\320\270\320\267\320\260\320\271\320\275\302"
+ "\240\342\200\224\302\240\320\241\321\202\321\203\320\264\320\270\321\217 \320\220\321\200\321\202\320\265\320\274\320\270"
+ "\321\217\302\240\320\233\320\265\320\261\320\265\320\264\320\265\320\262\320\260\012\012"), size_t(5));
+ UNIT_ASSERT_EQUAL(Count("\321\201\320\265\320\272\321\201", ".*",
+ "\320\277\320\276\321\200\320\275\320\276, \320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 "
+ "\320\264\320\265\321\202\320\270, \320\264\320\265\321\202\320\270 \320\277\320\276\321\200\320\275\320\276 "
+ "\320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265. "
+ "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265 \320\262\320\270\320\264\320\265\320\276 "
+ "\320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270. \320\264\320\265\321\202\320\270 "
+ "\320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265\320\276 "
+ "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265!<br> "
+ "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\264\320\273\321\217 \320\277\320\276\320\264 "
+ "\321\201\320\265\320\272\321\201\320\260 \320\277\320\260\321\200\320\276\320\271 "
+ "\321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201 \320\270\321\211\320\265\320\274 "
+ "\320\272\320\260\320\271\321\204\320\276\320\274. \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 "
+ "\320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 "
+ "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277\320\260\321\200\320\276\320\271 "
+ "\320\270\321\211\320\265\320\274 \321\201 \320\264\320\273\321\217 \321\201\320\265\320\272\321\201\320\260!<br> "
+ "\321\202\320\270\321\202\321\214\320\272\320\270 \320\261\320\276\320\273\321\214\321\210\320\270\320\265. "
+ "\320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 \320\264\320\265\321\202\320\270!<br> "
+ "\320\270\321\211\320\265\320\274 \321\201 \320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 "
+ "\321\201\320\265\320\272\321\201\320\260\320\277\320\260\321\200\320\276\320\271 \320\264\320\273\321\217 "
+ "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271! "
+ "\320\261\320\276\320\273\321\214\321\210\320\270\320\265 \321\202\320\270\321\202\321\214\320\272\320\270, "
+ "\320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 \321\201\320\270\321\201\321\202\320\265\320\274\320\260 "
+ "\320\264\320\273\321\217 \320\276\320\277\320\276\321\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202"
+ "\320\265\320\273\321\214\320\275\320\260\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\264"
+ "\320\273\321\217 \320\270\321\211\320\265\320\274 \321\201\320\265\320\272\321\201\320\260 \320\272\320\260\320\271\321\204"
+ "\320\276\320\274 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275"
+ "\320\276\320\271 \320\277\320\276\320\264 \320\277\320\260\321\200\320\276\320\271 \321\201. \320\276\320\277\320\276\321"
+ "\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202\320\265\320\273\321\214\320\275\320\260\321\217 \321"
+ "\201\320\270\321\201\321\202\320\265\320\274\320\260 \320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 "
+ "\320\264\320\273\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\261\320\265\321\201\320\277"
+ "\320\273\320\260\321\202\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265"
+ "\320\276 \320\264\320\265\321\202\320\270. \320\276\321\204\320\270\321\206\320\265\321\200\321\213 \320\277\320\276\321"
+ "\200\320\275\320\276 \321\204\320\276\321\202\320\276 \320\263\320\265\320\270, \320\270\321\211\320\265\320\274 \321\201"
+ "\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277"
+ "\320\276 \320\277\320\260\321\200\320\276\320\271 \321\201\320\265\320\272\321\201\320\260 \320\264\320\273\321\217 \321\201 "
+ "\320\272\320\260\320\271\321\204\320\276\320\274. \320\277\320\276\320\264 \320\264\320\273\321\217 \320\272\320\260\320\271"
+ "\321\204\320\276\320\274 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201\320\265\320\272\321\201"
+ "\320\260 \320\277\320\260\321\200\320\276\320\271 \321\201 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\270"
+ "\321\211\320\265\320\274? \320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202"
+ "\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270, \320\264\320\265\321\202"
+ "\320\270 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265"),
+ size_t(6));
+ UNIT_ASSERT_EQUAL(Count("<a[^>]*>[^<]*</a>", "([^<]|<br\\s?/?>)*", "\321\200\320\275\320\276</a><br />"
+ "<a href=\"http://wapspzk.1sweethost.com//22.html\">\320\264\320\265\321\210\320\265\320\262\321\213\320\265 \320\277\320\276"
+ "\321\200\320\275\320\276 \321\204\320\270\320\273\321\214\320\274\321\213</a><br /><a href=\"http://wapspzk.1sweethost.com//23.html\">"
+ "\321\201\320\265\320\272\321\201 \321\210\320\276\320\277 \321\200\320\276\321\201\320\270\321\202\320\260</a><br />"
+ "<a href=\"http://wapspzk.1sweethost.com//24.html\">\320\263\320\276\320\273\321\213\320\265 \320\264\320\265\320\262\321\203"
+ "\321\210\320\272\320\270 \321\203\320\273\320\270\321\206\320\260</a><br /><a href=\"http://wapspzk.1sweethost.com//25.html\">"
+ "\321\202\321\200\320\260\321\205\320\275\321\203\321\202\321\214 \320\274\320\260\320\274\320\260\321\210\320\270</a><br />"
+ "<a href=\"http://wapspzk.1sweethost.com//26.html\">\320\277\320\270\320\267\320\264\320\260 \321\204\321\200\320\270\321\201"
+ "\320\272\320\265</a><br /><a href=\"http://wapspzk.1sweethost.com//27.html\">\320\261\320\265\321\201\320\277\320\273\320\260"
+ "\321\202\320\275\320\276</a><br /><a href=\"http://wapspzk.1sweethost.com//33.html\">\321\201\320\276\321\206\320\270\320\276"
+ "\320\273\320\276\320\263\320\270\321\207\320\265\321\201\320\272\320\270\320\271 \320\260\320\275\320\260\320\273\320\270\320"
+ "\267 \320\274\320\276\320\264\320\265\320\273\320\265\320\271 \321\201\320\265\320\272\321\201\321\203\320\260\320\273\321\214"
+ "\320\275\320\276\320\263\320\276 \320\277\320\276\320\262\320\265\320\264\320\265\320\275\320\270\321\217</a>\321\217"), size_t(7));
+ UNIT_ASSERT(CountOne<Pire::CountingScanner>("a", "b", "aaa") != size_t(3)); // only pins that CountingScanner does not report 3 here; the exact value is left open
+ UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("a", "b", "aaa"), size_t(1));
+ UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("[a-z\320\260-\321\217]+", " +",
+ " \320\260\320\260\320\220 abc def \320\260 cd"),
+ size_t(4)); // Pire::CountingScanner returns 1 here, since it enters a dead state
+ }
+
+ Y_UNIT_TEST(CountWithoutSeparator) // Counting with an empty separator pattern.
+ {
+ UNIT_ASSERT_EQUAL(Count("a", "", "aa aaa"), size_t(3)); // Count() additionally asserts that the scanner types it runs agree
+ }
+
+ Y_UNIT_TEST(CountGreedy) // Same expected count whether ".{1,6}" is given as the separator or appended to the regexp.
+ {
+ const auto& enc = Pire::Encodings::Latin1();
+ char text[] = "wwwsswwwsssswwws";
+ UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3)); // sizeof(text) includes the terminating NUL
+ UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3));
+ UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3));
+ UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3));
+ }
+
+ Y_UNIT_TEST(CountRepeating) // "abba" occurrences in the text share their boundary 'a'; the expected count is 2.
+ {
+ char text[] = "abbabbabbabbat";
+ UNIT_ASSERT_EQUAL(Count("abba", ".*", text, sizeof(text), Pire::Encodings::Latin1()), size_t(2)); // sizeof(text) includes the terminating NUL
+ }
+
+ template<class Scanner>
+ void CountGlueOne() // Glues a letters scanner and a digits scanner and checks both per-regexp results.
+ {
+ const auto& enc = Pire::Encodings::Utf8();
+ auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
+ auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
+ auto sc = Scanner::Glue(sc1, sc2);
+ auto st = Run(sc, "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); // Result(0): sc1, the first Glue argument ([a-z]+)
+ UNIT_ASSERT_EQUAL(st.Result(1), size_t(2)); // Result(1): sc2, the second Glue argument ([0-9]+)
+ }
+
+ Y_UNIT_TEST(CountGlue) // Runs CountGlueOne for each counting scanner type.
+ {
+ CountGlueOne<Pire::CountingScanner>();
+ CountGlueOne<Pire::AdvancedCountingScanner>();
+ CountGlueOne<Pire::NoGlueLimitCountingScanner>();
+ }
+
+ template <class Scanner>
+ void CountManyGluesOne(size_t maxRegexps) { // Glues up to maxRegexps scanners one at a time and checks every per-regexp result.
+ const auto& encoding = Pire::Encodings::Utf8();
+ auto text = "abcdbaa aa";
+ TVector<ypair<std::string, std::string>> tasks = { // (regexp, separator) pairs, reused cyclically by the loop below
+ {"a", ".*"},
+ {"b", ".*"},
+ {"c", ".*"},
+ {"ba", ".*"},
+ {"ab",".*"},
+ };
+ TVector<size_t> answers = {5, 2, 1, 1, 1}; // expected Result for the task at the same index
+ Scanner scanner;
+ size_t regexpsCount = 0;
+ for (; regexpsCount < maxRegexps; ++regexpsCount) {
+ const auto& task = tasks[regexpsCount % tasks.size()];
+ const auto regexpFsm = MkFsm(task.first.c_str(), encoding);
+ const auto separatorFsm = MkFsm(task.second.c_str(), encoding);
+ Scanner nextScanner(regexpFsm, separatorFsm);
+ auto glue = Scanner::Glue(scanner, nextScanner);
+ if (glue.Empty()) { // presumably Glue() signals "too large to glue" with an empty scanner - TODO confirm
+ break; // regexpsCount then stays at the number of scanners glued so far
+ }
+ scanner = std::move(glue);
+ }
+ auto state = Run(scanner, text);
+ for (size_t i = 0; i < regexpsCount; ++i) {
+ UNIT_ASSERT_EQUAL(state.Result(i), answers[i % answers.size()]);
+ }
+ }
+
+ Y_UNIT_TEST(CountManyGlues) // Runs CountManyGluesOne per scanner type; NoGlueLimitCountingScanner gets a larger maxRegexps.
+ {
+ CountManyGluesOne<Pire::CountingScanner>(20);
+ CountManyGluesOne<Pire::AdvancedCountingScanner>(20);
+ CountManyGluesOne<Pire::NoGlueLimitCountingScanner>(50);
+ }
+
+ template<class Scanner>
+ void CountBoundariesOne() // Feeds several strings, each wrapped in BeginMark/EndMark, through one state and checks the accumulated count.
+ {
+ const char* strings[] = { "abcdef", "abc def", "defcba", "wxyz abc", "a", "123" };
+
+ const auto& enc = Pire::Encodings::Utf8();
+ Scanner sc(MkFsm("^[a-z]+$", enc), MkFsm("(.|^|$)*", enc));
+ auto st = InitializedState(sc); // initialized once; the same state is carried across all strings
+ for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) {
+ Pire::Step(sc, st, Pire::BeginMark);
+ Pire::Run(sc, st, strings[i], strings[i] + strlen(strings[i]));
+ Pire::Step(sc, st, Pire::EndMark);
+ }
+ UNIT_ASSERT_EQUAL(st.Result(0), size_t(3));
+
+ const auto& enc2 = Pire::Encodings::Latin1();
+ Scanner sc2(MkFsm("[a-z]", enc2), MkFsm(".*", enc2)); // second pass: same strings, no anchors in the pattern
+ auto st2 = InitializedState(sc2);
+ for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) {
+ Pire::Step(sc2, st2, Pire::BeginMark);
+ Pire::Run(sc2, st2, strings[i], strings[i] + strlen(strings[i]));
+ Pire::Step(sc2, st2, Pire::EndMark);
+ }
+ UNIT_ASSERT_EQUAL(st2.Result(0), size_t(7));
+ }
+
+ Y_UNIT_TEST(CountBoundaries) // Runs CountBoundariesOne for each counting scanner type.
+ {
+ CountBoundariesOne<Pire::CountingScanner>();
+ CountBoundariesOne<Pire::AdvancedCountingScanner>();
+ CountBoundariesOne<Pire::NoGlueLimitCountingScanner>();
+ }
+
+ template<class Scanner>
+ void SerializationOne() // Save/Load round trip of a glued scanner, then Mmap() from a buffer at a range of byte offsets.
+ {
+ const auto& enc = Pire::Encodings::Latin1();
+ auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
+ auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
+ auto sc = Scanner::Glue(sc1, sc2);
+
+ BufferOutput wbuf;
+ ::Save(&wbuf, sc);
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ Scanner sc3;
+ ::Load(&rbuf, sc3);
+
+ auto st = Run(sc3, "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st.Result(1), size_t(2));
+
+ const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
+ TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); // extra room for the AlignUp adjustment plus the largest tested offset
+
+ // Test mmap-ing at various alignments
+ for (size_t offset = 0; offset < MaxTestOffset; ++offset) {
+ const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ try {
+ Scanner sc4;
+ const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
+
+ if (offset % sizeof(size_t) != 0) { // reached only if Mmap() returned normally for a misaligned pointer
+ UNIT_ASSERT(!"CountingScanner failed to check for misaligned mmaping"); // NOTE(review): the message names CountingScanner for every Scanner type
+ } else {
+ UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); // Mmap() is expected to return the address just past the bytes it consumed
+
+ st = Run(sc4, "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st.Result(1), size_t(2));
+ }
+ }
+ catch (Pire::Error&) {} // NOTE(review): a Pire::Error is swallowed for every offset, aligned ones included - presumably meant only for misaligned Mmap(); confirm
+ }
+ }
+
+ Y_UNIT_TEST(Serialization) // Runs SerializationOne for each counting scanner type.
+ {
+ SerializationOne<Pire::CountingScanner>();
+ SerializationOne<Pire::AdvancedCountingScanner>();
+ SerializationOne<Pire::NoGlueLimitCountingScanner>();
+ }
+
+ template<class Scanner>
+ void Serialization_v6_compatibilityOne() // Builds a version-6-style image by patching a saved scanner, then loads it via Load() and Mmap().
+ {
+ const auto& enc = Pire::Encodings::Latin1();
+ auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
+ auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
+ auto sc = Scanner::Glue(sc1, sc2);
+
+ BufferOutput wbuf;
+ ::Save(&wbuf, sc);
+
+ // Patched scanner is a scanner of RE_VERSION 6.
+ // The patched scanner is concatenated with original scanner to
+ // make sure all content of patched scanner is consumed.
+
+ const size_t ALIGNMENT = sizeof(size_t);
+ size_t actions_size =
+ sc.Size() *
+ sc.LettersCount() *
+ sizeof(typename Scanner::Action);
+ UNIT_ASSERT_EQUAL(actions_size % ALIGNMENT, 0); // the layout arithmetic below relies on the inserted region being a multiple of ALIGNMENT
+ size_t tags_size = sc.Size() * sizeof(typename Scanner::Tag);
+ const char* src = wbuf.Buffer().Data();
+ size_t src_size = wbuf.Buffer().Size();
+ size_t patched_size = src_size + actions_size;
+ size_t bytes_before_actions = src_size - tags_size; // assumes the tags are the last tags_size bytes of the saved image
+ const int fill_char = 0x42;
+
+ TVector<char> buf2(patched_size + src_size + 2 * ALIGNMENT);
+ char* dst = reinterpret_cast<char*>(Pire::Impl::AlignUp(&buf2[0], ALIGNMENT));
+ char* patched = dst;
+
+ // Insert dummy m_actions between m_jumps and m_tags.
+ memcpy(patched, src, bytes_before_actions); // copy members before m_actions
+ memset(patched + bytes_before_actions, fill_char, actions_size); // m_actions
+ memcpy(patched + bytes_before_actions + actions_size,
+ src + bytes_before_actions,
+ tags_size); // m_tags
+ // Set version to 6
+ // order of fields in header: magic, version, ...
+ ui32* version_in_patched = reinterpret_cast<ui32*>(patched) + 1;
+ UNIT_ASSERT_EQUAL(*version_in_patched, Pire::Header::RE_VERSION); // checks that the second ui32 of the image really is the version field before overwriting it
+ *version_in_patched = Pire::Header::RE_VERSION_WITH_MACTIONS;
+
+ // write normal scanner after patched one
+ char* normal = Pire::Impl::AlignUp(patched + patched_size, ALIGNMENT);
+ memcpy(normal, src, src_size);
+ char* dst_end = Pire::Impl::AlignUp(normal + src_size, ALIGNMENT);
+ size_t dst_size = dst_end - dst; // total bytes spanned by the patched image followed by the unmodified one
+
+ // test loading from stream
+ {
+ MemoryInput rbuf(dst, dst_size);
+ Scanner sc_patched, sc_normal;
+ ::Load(&rbuf, sc_patched);
+ ::Load(&rbuf, sc_normal); // loads correctly only if the first Load() consumed exactly the patched image
+ auto st_patched = Run(sc_patched,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2));
+ auto st_normal = Run(sc_normal,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2));
+ }
+
+ // test loading using Mmap
+ {
+ Scanner sc_patched, sc_normal;
+ const void* tail = sc_patched.Mmap(patched, patched_size);
+ UNIT_ASSERT_EQUAL(tail, normal); // the returned tail must point at the start of the second image
+ const void* tail2 = sc_normal.Mmap(tail, src_size);
+ UNIT_ASSERT_EQUAL(tail2, dst_end);
+ auto st_patched = Run(sc_patched,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2));
+ auto st_normal = Run(sc_normal,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2));
+ }
+ }
+
+ Y_UNIT_TEST(Serialization_v6_compatibility) // Runs the v6 compatibility check for the two scanner types that support it.
+ {
+ Serialization_v6_compatibilityOne<Pire::CountingScanner>();
+ Serialization_v6_compatibilityOne<Pire::AdvancedCountingScanner>();
+ // NoGlueLimitCountingScanner is not v6_compatible
+ }
+
+ Y_UNIT_TEST(NoGlueLimitScannerCompatibilityWithAdvancedScanner) { // An image saved by AdvancedCountingScanner must load into NoGlueLimitCountingScanner.
+ const auto& enc = Pire::Encodings::Latin1();
+ auto sc1 = AdvancedCountingScanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
+ auto sc2 = AdvancedCountingScanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
+ auto sc = AdvancedCountingScanner::Glue(sc1, sc2);
+
+ BufferOutput wbuf;
+ ::Save(&wbuf, sc);
+
+ TVector<char> buf2(wbuf.Buffer().Size());
+ memcpy(buf2.data(), wbuf.Buffer().Data(), wbuf.Buffer().Size()); // NOTE(review): unlike the other Mmap tests there is no AlignUp here - presumably relies on the vector's allocation alignment; confirm
+
+ // test loading from stream
+ {
+ MemoryInput rbuf(buf2.data(), buf2.size());
+ NoGlueLimitCountingScanner scanner;
+ ::Load(&rbuf, scanner);
+ auto state = Run(scanner,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(state.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(state.Result(1), size_t(2));
+ }
+
+ // test loading using Mmap
+ {
+ NoGlueLimitCountingScanner scanner;
+ const void* tail = scanner.Mmap(buf2.data(), buf2.size());
+ UNIT_ASSERT_EQUAL(tail, buf2.data() + buf2.size()); // the whole buffer must be consumed
+ auto state = Run(scanner,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(state.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(state.Result(1), size_t(2));
+ }
+ }
+
+ template<class Scanner>
+ void EmptyOne() // A default-constructed scanner: Empty(), safe to run, usable in Glue(), and survives Save/Load/Mmap.
+ {
+ Scanner sc;
+ UNIT_ASSERT(sc.Empty());
+
+ UNIT_CHECKPOINT(); Run(sc, "a string"); // Just should not crash
+
+ // Test glueing empty
+ const auto& enc = Pire::Encodings::Latin1();
+ auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
+ auto sc2 = Scanner::Glue(sc, Scanner::Glue(sc, sc1)); // the empty scanner is glued in twice; only sc1 contributes a result
+ auto st = Run(sc2, "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
+
+ // Test Save/Load/Mmap
+ BufferOutput wbuf;
+ ::Save(&wbuf, sc);
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ Pire::CountingScanner sc3; // NOTE(review): fixed to CountingScanner, not the template parameter Scanner - confirm whether cross-type loading is intended
+ ::Load(&rbuf, sc3);
+ UNIT_CHECKPOINT(); Run(sc3, "a string");
+
+ const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
+ TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
+ const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); // only the aligned position is exercised in this test
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+
+ Scanner sc4;
+ const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
+ UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
+ UNIT_CHECKPOINT(); Run(sc4, "a string");
+ }
+
+ Y_UNIT_TEST(Empty)
+ {
+ EmptyOne<Pire::CountingScanner>();
+ EmptyOne<Pire::AdvancedCountingScanner>();
+ EmptyOne<Pire::NoGlueLimitCountingScanner>();
+ }
+
+ template<typename Scanner>
+ TVector<Scanner> MakeHalfFinalCount(const char* regexp, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) {
+ TVector<Scanner> scanners(6);
+ const auto regexpFsm = MkFsm(regexp, encoding);
+ HalfFinalFsm fsm(regexpFsm);
+ fsm.MakeGreedyCounter(true);
+ scanners[0] = Scanner(fsm);
+ fsm = HalfFinalFsm(regexpFsm);
+ fsm.MakeGreedyCounter(false);
+ scanners[1] = Scanner(fsm);
+ fsm = HalfFinalFsm(regexpFsm);
+ fsm.MakeNonGreedyCounter(true, true);
+ scanners[2] = Scanner(fsm);
+ fsm = HalfFinalFsm(regexpFsm);
+ fsm.MakeNonGreedyCounter(true, false);
+ scanners[3] = Scanner(fsm);
+ fsm = HalfFinalFsm(regexpFsm);
+ fsm.MakeNonGreedyCounter(false);
+ scanners[4] = Scanner(fsm);
+ scanners[5] = scanners[0];
+ for (size_t i = 1; i < 5; i++) {
+ scanners[5] = Scanner::Glue(scanners[5], scanners[i]);
+ }
+ return scanners;
+ }
+
+ template<typename Scanner>
+ void HalfFinalCount(TVector<Scanner> scanners, const char* text, TVector<size_t> result) {
+ for (size_t i = 0; i < 5; i++) {
+ UNIT_ASSERT_EQUAL(Run(scanners[i], text, -1).Result(0), result[i]);
+ }
+ auto state = Run(scanners[5], text, -1);
+ for (size_t i = 0; i < 5; i++) {
+ UNIT_ASSERT_EQUAL(state.Result(i), result[i]);
+ }
+ }
+
+ template<typename Scanner>
+ void TestHalfFinalCount() {
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+"), "abbabbbabbbbbb", {3, 3, 3, 11, 3});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("(ab)+"), "ababbababbab", {3, 3, 5, 5, 5});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("(abab)+"), "ababababab", {1, 1, 4, 4, 2});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbb", {1, 10, 10, 10, 10});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbb", {1, 10, 11, 11, 11});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbc", {1, 1, 10, 11, 10});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbbc", {1, 1, 11, 12, 11});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbbc", {1, 1, 8, 9, 8});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbb", {1, 8, 8, 8, 8});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("a[a-z]+c|b"), "abeeeebeeeeeeeeeceeaeebeeeaeecceebeeaeebeeb", {2, 4, 7, 9, 7});
+ }
+
+ Y_UNIT_TEST(HalfFinal)
+ {
+ TestHalfFinalCount<Pire::HalfFinalScanner>();
+ TestHalfFinalCount<Pire::NonrelocHalfFinalScanner>();
+ TestHalfFinalCount<Pire::HalfFinalScannerNoMask>();
+ TestHalfFinalCount<Pire::NonrelocHalfFinalScannerNoMask>();
+ }
+
+ template<typename Scanner>
+ void TestHalfFinalSerialization() {
+ auto oldScanners = MakeHalfFinalCount<Scanner>("(\\w\\w)+");
+ BufferOutput wbuf;
+ for (size_t i = 0; i < 6; i++) {
+ ::Save(&wbuf, oldScanners[i]);
+ }
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ TVector<Scanner> scanners(6);
+ for (size_t i = 0; i < 6; i++) {
+ ::Load(&rbuf, scanners[i]);
+ }
+
+ HalfFinalCount(scanners, "ab abbb ababa a", {3, 3, 8, 8, 5});
+ }
+
+ Y_UNIT_TEST(HalfFinalSerialization)
+ {
+ TestHalfFinalSerialization<Pire::HalfFinalScanner>();
+ TestHalfFinalSerialization<Pire::HalfFinalScannerNoMask>();
+ }
}
diff --git a/library/cpp/regex/pire/ut/easy_ut.cpp b/library/cpp/regex/pire/ut/easy_ut.cpp
index 5f0f8303fce..a1a4b688582 100644
--- a/library/cpp/regex/pire/ut/easy_ut.cpp
+++ b/library/cpp/regex/pire/ut/easy_ut.cpp
@@ -31,27 +31,27 @@
#include <easy.h>
Y_UNIT_TEST_SUITE(TestPireEasy) {
-
+
Y_UNIT_TEST(Match)
{
- Pire::Regexp re("(foo|bar)+", Pire::I);
- UNIT_ASSERT("prefix fOoBaR suffix" ==~ re);
- UNIT_ASSERT(!("bla bla bla" ==~ re));
+ Pire::Regexp re("(foo|bar)+", Pire::I);
+ UNIT_ASSERT("prefix fOoBaR suffix" ==~ re);
+ UNIT_ASSERT(!("bla bla bla" ==~ re));
}
Y_UNIT_TEST(Utf8)
{
- Pire::Regexp re("^.$", Pire::I | Pire::UTF8);
- UNIT_ASSERT("\x41" ==~ re);
- UNIT_ASSERT(!("\x81" ==~ re));
+ Pire::Regexp re("^.$", Pire::I | Pire::UTF8);
+ UNIT_ASSERT("\x41" ==~ re);
+ UNIT_ASSERT(!("\x81" ==~ re));
}
Y_UNIT_TEST(TwoFeatures)
{
- Pire::Regexp re("^(a.c&.b.)$", Pire::I | Pire::ANDNOT);
- UNIT_ASSERT("abc" ==~ re);
- UNIT_ASSERT("ABC" ==~ re);
- UNIT_ASSERT(!("adc" ==~ re));
+ Pire::Regexp re("^(a.c&.b.)$", Pire::I | Pire::ANDNOT);
+ UNIT_ASSERT("abc" ==~ re);
+ UNIT_ASSERT("ABC" ==~ re);
+ UNIT_ASSERT(!("adc" ==~ re));
}
-
+
}
diff --git a/library/cpp/regex/pire/ut/glyph_ut.cpp b/library/cpp/regex/pire/ut/glyph_ut.cpp
index 05ef56b01bc..3955029266d 100644
--- a/library/cpp/regex/pire/ut/glyph_ut.cpp
+++ b/library/cpp/regex/pire/ut/glyph_ut.cpp
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -28,36 +28,36 @@
Y_UNIT_TEST_SUITE(Glyphs) {
- Pire::Fsm ParseFsm(const char* regexp)
- {
- TVector<wchar32> ucs4;
- Pire::Encodings::Utf8().FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4));
- return Pire::Lexer(ucs4).SetEncoding(Pire::Encodings::Utf8()).AddFeature(Pire::Features::GlueSimilarGlyphs()).Parse().Surround();
- }
+ Pire::Fsm ParseFsm(const char* regexp)
+ {
+ TVector<wchar32> ucs4;
+ Pire::Encodings::Utf8().FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4));
+ return Pire::Lexer(ucs4).SetEncoding(Pire::Encodings::Utf8()).AddFeature(Pire::Features::GlueSimilarGlyphs()).Parse().Surround();
+ }
#define NOGL_REGEXP(str) REGEXP2(str, "u")
#define GL_REGEXP(str) SCANNER(ParseFsm(str))
- Y_UNIT_TEST(Glyphs)
- {
- NOGL_REGEXP("regexp") {
- ACCEPTS("regexp");
- DENIES("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
- }
-
- GL_REGEXP("regexp") {
- ACCEPTS("regexp");
- ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
- }
-
- NOGL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") {
- DENIES("regexp");
- ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
- }
-
- GL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") {
- ACCEPTS("regexp");
- ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
- }
- }
+ Y_UNIT_TEST(Glyphs)
+ {
+ NOGL_REGEXP("regexp") {
+ ACCEPTS("regexp");
+ DENIES("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
+ }
+
+ GL_REGEXP("regexp") {
+ ACCEPTS("regexp");
+ ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
+ }
+
+ NOGL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") {
+ DENIES("regexp");
+ ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
+ }
+
+ GL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") {
+ ACCEPTS("regexp");
+ ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
+ }
+ }
}
diff --git a/library/cpp/regex/pire/ut/inline_ut.cpp b/library/cpp/regex/pire/ut/inline_ut.cpp
index 3ba31dfaa86..055c5b28bf9 100644
--- a/library/cpp/regex/pire/ut/inline_ut.cpp
+++ b/library/cpp/regex/pire/ut/inline_ut.cpp
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -32,60 +32,60 @@ Y_UNIT_TEST_SUITE(TestPireInline) {
template<class Scanner>
typename Scanner::State RunRegexp(const Scanner& scanner, const char* str)
{
- typename Scanner::State state;
- scanner.Initialize(state);
- Step(scanner, state, Pire::BeginMark);
- Run(scanner, state, str, str + strlen(str));
- Step(scanner, state, Pire::EndMark);
- return state;
+ typename Scanner::State state;
+ scanner.Initialize(state);
+ Step(scanner, state, Pire::BeginMark);
+ Run(scanner, state, str, str + strlen(str));
+ Step(scanner, state, Pire::EndMark);
+ return state;
}
template<class Scanner>
bool Matches(const Scanner& scanner, const char* str)
{
- return scanner.Final(RunRegexp(scanner, str));
+ return scanner.Final(RunRegexp(scanner, str));
}
template<class Scanner>
bool Matches2(const Scanner& scanner, const char* str)
{
- return Pire::Matches(scanner, str);
+ return Pire::Matches(scanner, str);
}
bool ParticularMatch(Pire::Scanner& sc, Pire::Scanner::State st, size_t idx)
{
- std::pair<const size_t*, const size_t*> p = sc.AcceptedRegexps(st);
- return std::distance(p.first, p.second) == 1 && *p.first == idx;
+ std::pair<const size_t*, const size_t*> p = sc.AcceptedRegexps(st);
+ return std::distance(p.first, p.second) == 1 && *p.first == idx;
}
Y_UNIT_TEST(Inline)
{
- Pire::Scanner scanner = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "is");
- UNIT_ASSERT(Matches(scanner, "http://domain.vasya.ru/"));
- UNIT_ASSERT(Matches(scanner, "prefix http://domain.vasya.ru/"));
- UNIT_ASSERT(!Matches(scanner, "http://127.0.0.1/"));
+ Pire::Scanner scanner = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "is");
+ UNIT_ASSERT(Matches(scanner, "http://domain.vasya.ru/"));
+ UNIT_ASSERT(Matches(scanner, "prefix http://domain.vasya.ru/"));
+ UNIT_ASSERT(!Matches(scanner, "http://127.0.0.1/"));
- Pire::Scanner scanner2 = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "i");
- UNIT_ASSERT(Matches2(scanner2, "http://domain.vasya.ru/"));
- UNIT_ASSERT(!Matches2(scanner2, "prefix http://domain.vasya.ru/"));
- UNIT_ASSERT(!Matches2(scanner2, "http://127.0.0.1/"));
+ Pire::Scanner scanner2 = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "i");
+ UNIT_ASSERT(Matches2(scanner2, "http://domain.vasya.ru/"));
+ UNIT_ASSERT(!Matches2(scanner2, "prefix http://domain.vasya.ru/"));
+ UNIT_ASSERT(!Matches2(scanner2, "http://127.0.0.1/"));
}
-
+
Y_UNIT_TEST(InlineGlue)
{
- // Check whether pire_inline handles comments as well:
-
- /* - a C-style comment outside a regexp; */
- Pire::Scanner sc = PIRE_REGEXP(
- "foo", "", /* - a C-style comment inside a regexp; */
- "bar", "", // - a C++-style comment inside a regexp;
- "baz", ""
- );
- // - a C++-style comment outside a regexp.
- UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("foo").State(), 0));
- UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("bar").State(), 1));
- UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("baz").State(), 2));
- UNIT_ASSERT(!Matches2(sc, "xxx"));
+ // Check whether pire_inline handles comments as well:
+
+ /* - a C-style comment outside a regexp; */
+ Pire::Scanner sc = PIRE_REGEXP(
+ "foo", "", /* - a C-style comment inside a regexp; */
+ "bar", "", // - a C++-style comment inside a regexp;
+ "baz", ""
+ );
+ // - a C++-style comment outside a regexp.
+ UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("foo").State(), 0));
+ UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("bar").State(), 1));
+ UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("baz").State(), 2));
+ UNIT_ASSERT(!Matches2(sc, "xxx"));
}
}
diff --git a/library/cpp/regex/pire/ut/pire_ut.cpp b/library/cpp/regex/pire/ut/pire_ut.cpp
index 13f3f2ec717..5fa66c36545 100644
--- a/library/cpp/regex/pire/ut/pire_ut.cpp
+++ b/library/cpp/regex/pire/ut/pire_ut.cpp
@@ -37,237 +37,237 @@ Y_UNIT_TEST_SUITE(TestPire) {
Y_UNIT_TEST(String)
{
- REGEXP("abc") {
- ACCEPTS("def abc ghi");
- ACCEPTS("abc");
- DENIES ("def abd ghi");
- }
+ REGEXP("abc") {
+ ACCEPTS("def abc ghi");
+ ACCEPTS("abc");
+ DENIES ("def abd ghi");
+ }
}
Y_UNIT_TEST(Boundaries)
{
- REGEXP("^abc") {
- ACCEPTS("abc ghi");
- DENIES ("def abc");
- }
+ REGEXP("^abc") {
+ ACCEPTS("abc ghi");
+ DENIES ("def abc");
+ }
- REGEXP("abc$") {
- DENIES ("abc ghi");
- ACCEPTS("def abc");
- }
+ REGEXP("abc$") {
+ DENIES ("abc ghi");
+ ACCEPTS("def abc");
+ }
}
Y_UNIT_TEST(Primitives)
{
- REGEXP("abc|def") {
- ACCEPTS("def");
- ACCEPTS("abc");
- DENIES ("deb");
- }
-
- REGEXP("ad*e") {
- ACCEPTS("xaez");
- ACCEPTS("xadez");
- ACCEPTS("xaddez");
- ACCEPTS("xadddddddddddddddddddddddez");
- DENIES ("xafez");
- }
-
- REGEXP("ad+e") {
- DENIES ("xaez");
- ACCEPTS("xadez");
- ACCEPTS("xaddez");
- ACCEPTS("xadddddddddddddddddddddddez");
- DENIES ("xafez");
- }
-
- REGEXP("ad?e") {
- ACCEPTS("xaez");
- ACCEPTS("xadez");
- DENIES ("xaddez");
- DENIES ("xafez");
- }
-
- REGEXP("a.{1}e") {
- ACCEPTS("axe");
- DENIES ("ae");
- DENIES ("axye");
- }
+ REGEXP("abc|def") {
+ ACCEPTS("def");
+ ACCEPTS("abc");
+ DENIES ("deb");
+ }
+
+ REGEXP("ad*e") {
+ ACCEPTS("xaez");
+ ACCEPTS("xadez");
+ ACCEPTS("xaddez");
+ ACCEPTS("xadddddddddddddddddddddddez");
+ DENIES ("xafez");
+ }
+
+ REGEXP("ad+e") {
+ DENIES ("xaez");
+ ACCEPTS("xadez");
+ ACCEPTS("xaddez");
+ ACCEPTS("xadddddddddddddddddddddddez");
+ DENIES ("xafez");
+ }
+
+ REGEXP("ad?e") {
+ ACCEPTS("xaez");
+ ACCEPTS("xadez");
+ DENIES ("xaddez");
+ DENIES ("xafez");
+ }
+
+ REGEXP("a.{1}e") {
+ ACCEPTS("axe");
+ DENIES ("ae");
+ DENIES ("axye");
+ }
}
void TestMassAlternatives(const char* pattern) {
- REGEXP(pattern) {
- ACCEPTS("abc");
- ACCEPTS("def");
- ACCEPTS("ghi");
- ACCEPTS("klm");
- DENIES ("aei");
- DENIES ("klc");
- }
+ REGEXP(pattern) {
+ ACCEPTS("abc");
+ ACCEPTS("def");
+ ACCEPTS("ghi");
+ ACCEPTS("klm");
+ DENIES ("aei");
+ DENIES ("klc");
+ }
}
Y_UNIT_TEST(MassAlternatives)
{
- TestMassAlternatives("((abc|def)|ghi)|klm");
+ TestMassAlternatives("((abc|def)|ghi)|klm");
- TestMassAlternatives("(abc|def)|(ghi|klm)");
+ TestMassAlternatives("(abc|def)|(ghi|klm)");
- TestMassAlternatives("abc|(def|(ghi|klm))");
+ TestMassAlternatives("abc|(def|(ghi|klm))");
- TestMassAlternatives("abc|(def|ghi)|klm");
+ TestMassAlternatives("abc|(def|ghi)|klm");
}
Y_UNIT_TEST(Composition)
{
- REGEXP("^/([^\\\\/]|\\\\.)*/[a-z]*$") {
- ACCEPTS("/regexp/i");
- ACCEPTS("/regexp2/");
- DENIES ("regexp");
+ REGEXP("^/([^\\\\/]|\\\\.)*/[a-z]*$") {
+ ACCEPTS("/regexp/i");
+ ACCEPTS("/regexp2/");
+ DENIES ("regexp");
- ACCEPTS("/dir\\/file/");
- DENIES ("/dir/file/");
+ ACCEPTS("/dir\\/file/");
+ DENIES ("/dir/file/");
- ACCEPTS("/dir\\\\/");
- DENIES ("/dir\\\\/file/");
- }
+ ACCEPTS("/dir\\\\/");
+ DENIES ("/dir\\\\/file/");
+ }
- REGEXP("Head(Inner)*Tail") {
- ACCEPTS("HeadInnerTail");
- ACCEPTS("HeadInnerInnerTail");
- DENIES ("HeadInneInnerTail");
- ACCEPTS("HeadTail");
- }
+ REGEXP("Head(Inner)*Tail") {
+ ACCEPTS("HeadInnerTail");
+ ACCEPTS("HeadInnerInnerTail");
+ DENIES ("HeadInneInnerTail");
+ ACCEPTS("HeadTail");
+ }
}
Y_UNIT_TEST(Repetition)
{
- REGEXP("^x{3,6}$") {
- DENIES ("xx");
- ACCEPTS("xxx");
- ACCEPTS("xxxx");
- ACCEPTS("xxxxx");
- ACCEPTS("xxxxxx");
- DENIES ("xxxxxxx");
- }
-
- REGEXP("^x{3,}$") {
- DENIES ("xx");
- ACCEPTS("xxx");
- ACCEPTS("xxxx");
- ACCEPTS("xxxxxxxxxxx");
- ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
- }
-
- REGEXP("^x{3}$") {
- DENIES ("x");
- DENIES ("xx");
- ACCEPTS("xxx");
- DENIES ("xxxx");
- DENIES ("xxxxx");
- DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
- }
-
- REGEXP("x.{3,10}$") {
- for (size_t size = 0; size < 20; ++size) {
- ystring str = ystring(size*2, 'b') + "x" + ystring(size, 'e');
- if (size >= 3 && size <= 10)
- ACCEPTS(str.c_str());
- else
- DENIES(str.c_str());
- }
- }
+ REGEXP("^x{3,6}$") {
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxx");
+ ACCEPTS("xxxxxx");
+ DENIES ("xxxxxxx");
+ }
+
+ REGEXP("^x{3,}$") {
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxxxxxxxx");
+ ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+
+ REGEXP("^x{3}$") {
+ DENIES ("x");
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ DENIES ("xxxx");
+ DENIES ("xxxxx");
+ DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+
+ REGEXP("x.{3,10}$") {
+ for (size_t size = 0; size < 20; ++size) {
+ ystring str = ystring(size*2, 'b') + "x" + ystring(size, 'e');
+ if (size >= 3 && size <= 10)
+ ACCEPTS(str.c_str());
+ else
+ DENIES(str.c_str());
+ }
+ }
}
Y_UNIT_TEST(UTF8)
{
- REGEXP2("^.$", "u") {
- // A single-byte sequence 0xxx xxxx
- ACCEPTS("\x41");
- DENIES ("\x81");
-
- // A two-byte sequence: 110x xxxx | 10xx xxxx
- ACCEPTS("\xC1\x81");
- DENIES ("\xC1");
- DENIES ("\xC1\x41");
- DENIES ("\xC1\xC2");
- DENIES ("\xC1\x81\x82");
-
- // A three-byte sequence: 1110 xxxx | 10xx xxxx | 10xx xxxx
- ACCEPTS("\xE1\x81\x82");
- DENIES ("\xE1");
- DENIES ("\xE1\x42");
- DENIES ("\xE1\x42\x43");
- DENIES ("\xE1\xC2\xC3");
- DENIES ("\xE1\x82");
- DENIES ("\xE1\x82\x83\x84");
-
- // A four-byte sequence: 1111 0xxx | 10xx xxxx | 10xx xxxx | 10xx xxxx
- ACCEPTS("\xF1\x81\x82\x83");
- }
-
- REGEXP2("x\xD0\xA4y", "u") ACCEPTS("x\xD0\xA4y");
+ REGEXP2("^.$", "u") {
+ // A single-byte sequence 0xxx xxxx
+ ACCEPTS("\x41");
+ DENIES ("\x81");
+
+ // A two-byte sequence: 110x xxxx | 10xx xxxx
+ ACCEPTS("\xC1\x81");
+ DENIES ("\xC1");
+ DENIES ("\xC1\x41");
+ DENIES ("\xC1\xC2");
+ DENIES ("\xC1\x81\x82");
+
+ // A three-byte sequence: 1110 xxxx | 10xx xxxx | 10xx xxxx
+ ACCEPTS("\xE1\x81\x82");
+ DENIES ("\xE1");
+ DENIES ("\xE1\x42");
+ DENIES ("\xE1\x42\x43");
+ DENIES ("\xE1\xC2\xC3");
+ DENIES ("\xE1\x82");
+ DENIES ("\xE1\x82\x83\x84");
+
+ // A four-byte sequence: 1111 0xxx | 10xx xxxx | 10xx xxxx | 10xx xxxx
+ ACCEPTS("\xF1\x81\x82\x83");
+ }
+
+ REGEXP2("x\xD0\xA4y", "u") ACCEPTS("x\xD0\xA4y");
}
Y_UNIT_TEST(AndNot)
{
- REGEXP2("<([0-9]+&~123&~456)>", "a") {
- ACCEPTS("<111>");
- ACCEPTS("<124>");
- DENIES ("<123>");
- DENIES ("<456>");
- DENIES ("<abc>");
- }
+ REGEXP2("<([0-9]+&~123&~456)>", "a") {
+ ACCEPTS("<111>");
+ ACCEPTS("<124>");
+ DENIES ("<123>");
+ DENIES ("<456>");
+ DENIES ("<abc>");
+ }
- REGEXP2("[0-9]+\\&1+", "a") {
- DENIES("111");
- ACCEPTS("123&111");
- }
+ REGEXP2("[0-9]+\\&1+", "a") {
+ DENIES("111");
+ ACCEPTS("123&111");
+ }
}
Y_UNIT_TEST(Empty)
{
- Scanners s("\\s*", "n");
- Pire::Scanner::State state;
- s.fast.Initialize(state);
- UNIT_ASSERT(s.fast.Final(state));
- Pire::SimpleScanner::State stateSF;
- s.simple.Initialize(stateSF);
- UNIT_ASSERT(s.simple.Final(stateSF));
+ Scanners s("\\s*", "n");
+ Pire::Scanner::State state;
+ s.fast.Initialize(state);
+ UNIT_ASSERT(s.fast.Final(state));
+ Pire::SimpleScanner::State stateSF;
+ s.simple.Initialize(stateSF);
+ UNIT_ASSERT(s.simple.Final(stateSF));
}
Y_UNIT_TEST(Misc)
{
- REGEXP2("^[^\\s=/>]*$", "n") ACCEPTS("a");
- REGEXP("\\t") ACCEPTS("\t");
+ REGEXP2("^[^\\s=/>]*$", "n") ACCEPTS("a");
+ REGEXP("\\t") ACCEPTS("\t");
- SCANNER(ParseRegexp(".*") & ~ParseRegexp(".*http.*")) {
- ACCEPTS("str");
- DENIES("str_http");
- }
+ SCANNER(ParseRegexp(".*") & ~ParseRegexp(".*http.*")) {
+ ACCEPTS("str");
+ DENIES("str_http");
+ }
- SCANNER(~Pire::Fsm()) ACCEPTS("str");
+ SCANNER(~Pire::Fsm()) ACCEPTS("str");
}
Y_UNIT_TEST(Ranges)
{
- REGEXP("a\\W") {
- ACCEPTS("a,");
- DENIES("ab");
- }
+ REGEXP("a\\W") {
+ ACCEPTS("a,");
+ DENIES("ab");
+ }
- try {
- REGEXP("abc[def") {}
- UNIT_ASSERT(!"Should report syntax error");
- }
- catch (Pire::Error&) {}
+ try {
+ REGEXP("abc[def") {}
+ UNIT_ASSERT(!"Should report syntax error");
+ }
+ catch (Pire::Error&) {}
}
Y_UNIT_TEST(Reverse)
{
- SCANNER(ParseRegexp("abcdef").Reverse()) {
- ACCEPTS("fedcba");
- DENIES ("abcdef");
- }
+ SCANNER(ParseRegexp("abcdef").Reverse()) {
+ ACCEPTS("fedcba");
+ DENIES ("abcdef");
+ }
}
#if defined(__GNUC__)
@@ -277,480 +277,480 @@ Y_UNIT_TEST(Reverse)
Y_UNIT_TEST(PrefixSuffix)
{
- static const char* pattern = "-->";
- Pire::Fsm fsm = ParseRegexp(pattern, "n");
- Pire::Scanner ngsc = (~Pire::Fsm::MakeFalse() + fsm).Compile<Pire::Scanner>();
- Pire::Scanner gsc = (~fsm.Surrounded() + fsm).Compile<Pire::Scanner>();
- Pire::Scanner rsc = fsm.Reverse().Compile<Pire::Scanner>();
-
- static const char* text = "1234567890 --> middle --> end";
- const char* end = Pire::LongestPrefix(gsc, text, text + strlen(text));
- UNIT_ASSERT_EQUAL(end, text + 14);
- const char* begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1;
- UNIT_ASSERT_EQUAL(begin, text + 11);
- auto view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(gsc, text));
- UNIT_ASSERT_EQUAL(view.data(), text + 11);
- UNIT_ASSERT_EQUAL(view.size(), 3);
-
- end = Pire::LongestPrefix(ngsc, text, text + strlen(text));
- UNIT_ASSERT_EQUAL(end, text + 25);
- begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1;
- UNIT_ASSERT_EQUAL(begin, text + 22);
- view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(ngsc, text));
- UNIT_ASSERT_EQUAL(view.data(), text + 22);
- UNIT_ASSERT_EQUAL(view.size(), 3);
-
- end = Pire::ShortestPrefix(gsc, text, text + strlen(text));
- UNIT_ASSERT_EQUAL(end, text + 14);
- begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1;
- UNIT_ASSERT_EQUAL(begin, text + 11);
- view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(gsc, text));
- UNIT_ASSERT_EQUAL(view.data(), text + 11);
- UNIT_ASSERT_EQUAL(view.size(), 3);
-
- end = Pire::ShortestPrefix(ngsc, text, text + strlen(text));
- UNIT_ASSERT_EQUAL(end, text + 14);
- begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1;
- UNIT_ASSERT_EQUAL(begin, text + 11);
- view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(ngsc, text));
- UNIT_ASSERT_EQUAL(view.data(), text + 11);
- UNIT_ASSERT_EQUAL(view.size(), 3);
+ static const char* pattern = "-->";
+ Pire::Fsm fsm = ParseRegexp(pattern, "n");
+ Pire::Scanner ngsc = (~Pire::Fsm::MakeFalse() + fsm).Compile<Pire::Scanner>();
+ Pire::Scanner gsc = (~fsm.Surrounded() + fsm).Compile<Pire::Scanner>();
+ Pire::Scanner rsc = fsm.Reverse().Compile<Pire::Scanner>();
+
+ static const char* text = "1234567890 --> middle --> end";
+ const char* end = Pire::LongestPrefix(gsc, text, text + strlen(text));
+ UNIT_ASSERT_EQUAL(end, text + 14);
+ const char* begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1;
+ UNIT_ASSERT_EQUAL(begin, text + 11);
+ auto view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(gsc, text));
+ UNIT_ASSERT_EQUAL(view.data(), text + 11);
+ UNIT_ASSERT_EQUAL(view.size(), 3);
+
+ end = Pire::LongestPrefix(ngsc, text, text + strlen(text));
+ UNIT_ASSERT_EQUAL(end, text + 25);
+ begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1;
+ UNIT_ASSERT_EQUAL(begin, text + 22);
+ view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(ngsc, text));
+ UNIT_ASSERT_EQUAL(view.data(), text + 22);
+ UNIT_ASSERT_EQUAL(view.size(), 3);
+
+ end = Pire::ShortestPrefix(gsc, text, text + strlen(text));
+ UNIT_ASSERT_EQUAL(end, text + 14);
+ begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1;
+ UNIT_ASSERT_EQUAL(begin, text + 11);
+ view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(gsc, text));
+ UNIT_ASSERT_EQUAL(view.data(), text + 11);
+ UNIT_ASSERT_EQUAL(view.size(), 3);
+
+ end = Pire::ShortestPrefix(ngsc, text, text + strlen(text));
+ UNIT_ASSERT_EQUAL(end, text + 14);
+ begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1;
+ UNIT_ASSERT_EQUAL(begin, text + 11);
+ view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(ngsc, text));
+ UNIT_ASSERT_EQUAL(view.data(), text + 11);
+ UNIT_ASSERT_EQUAL(view.size(), 3);
}
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
Y_UNIT_TEST(PrefixSuffixEmptyView) {
- const std::string_view empty{};
- auto checkAnswer = [](std::string_view answer) {
- return !answer.data() && answer.size() == 0;
- };
-
- TVector<ystring> patterns = {
- "",
- "a",
- ".*",
- "a.*",
- ".*a"
- };
-
- for (const auto& pattern: patterns) {
- Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
- UNIT_ASSERT_C(checkAnswer(Pire::ShortestPrefix(sc, empty)), pattern);
- UNIT_ASSERT_C(checkAnswer(Pire::LongestPrefix(sc, empty)), pattern);
- UNIT_ASSERT_C(checkAnswer(Pire::ShortestSuffix(sc, empty)), pattern);
- UNIT_ASSERT_C(checkAnswer(Pire::LongestSuffix(sc, empty)), pattern);
- }
+ const std::string_view empty{};
+ auto checkAnswer = [](std::string_view answer) {
+ return !answer.data() && answer.size() == 0;
+ };
+
+ TVector<ystring> patterns = {
+ "",
+ "a",
+ ".*",
+ "a.*",
+ ".*a"
+ };
+
+ for (const auto& pattern: patterns) {
+ Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
+ UNIT_ASSERT_C(checkAnswer(Pire::ShortestPrefix(sc, empty)), pattern);
+ UNIT_ASSERT_C(checkAnswer(Pire::LongestPrefix(sc, empty)), pattern);
+ UNIT_ASSERT_C(checkAnswer(Pire::ShortestSuffix(sc, empty)), pattern);
+ UNIT_ASSERT_C(checkAnswer(Pire::LongestSuffix(sc, empty)), pattern);
+ }
}
namespace {
- ssize_t LongestPrefixLen(const char* pattern, const char* str)
- {
- Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
- const char* end = Pire::LongestPrefix(sc, str, str + strlen(str));
- return end ? end - str : -1;
- }
-
- ssize_t ShortestPrefixLen(const char* pattern, const char* str)
- {
- Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
- const char* end = Pire::ShortestPrefix(sc, str, str + strlen(str));
- return end ? end - str : -1;
- }
-
- ssize_t LongestSuffixLen(const char* pattern, const char* str)
- {
- Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
- const char* rbegin = str + strlen(str) - 1;
- const char* rend = Pire::LongestSuffix(sc, rbegin, str - 1);
- return rend ? rbegin - rend : -1;
- }
-
- ssize_t ShortestSuffixLen(const char* pattern, const char* str) {
- Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
- const char* rbegin = str + strlen(str) - 1;
- const char* rend = Pire::ShortestSuffix(sc, rbegin, str - 1);
- return rend ? rbegin - rend : -1;
- }
+ ssize_t LongestPrefixLen(const char* pattern, const char* str)
+ {
+ Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
+ const char* end = Pire::LongestPrefix(sc, str, str + strlen(str));
+ return end ? end - str : -1;
+ }
+
+ ssize_t ShortestPrefixLen(const char* pattern, const char* str)
+ {
+ Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
+ const char* end = Pire::ShortestPrefix(sc, str, str + strlen(str));
+ return end ? end - str : -1;
+ }
+
+ ssize_t LongestSuffixLen(const char* pattern, const char* str)
+ {
+ Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
+ const char* rbegin = str + strlen(str) - 1;
+ const char* rend = Pire::LongestSuffix(sc, rbegin, str - 1);
+ return rend ? rbegin - rend : -1;
+ }
+
+ ssize_t ShortestSuffixLen(const char* pattern, const char* str) {
+ Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
+ const char* rbegin = str + strlen(str) - 1;
+ const char* rend = Pire::ShortestSuffix(sc, rbegin, str - 1);
+ return rend ? rbegin - rend : -1;
+ }
}
Y_UNIT_TEST(ScanBoundaries)
{
- struct Case {
- ystring pattern;
- ystring text;
- ssize_t shortestPrefixLen;
- ssize_t longestPrefixLen;
-
- ystring ToString() const {
- return ystring("Pattern: ") + pattern + ", text: " + text;
- }
- };
-
- TVector <Case> cases = {
- {
- "a*",
- "",
- 0,
- 0,
- },
- {
- "a",
- "",
- -1,
- -1,
- },
- {
- "fixed",
- "fixed prefix",
- 5,
- 5,
- },
- {
- "fixed",
- "a fixed nonexistent prefix",
- -1,
- -1,
- },
- {
- "a*",
- "aaabbb",
- 0,
- 3,
- },
- {
- "a*",
- "bbbbbb",
- 0,
- 0,
- },
- {
- "a*",
- "aaaaaa",
- 0,
- 6,
- },
- {
- "aa*",
- "aaabbb",
- 1,
- 3,
- },
- {
- "a*a",
- "aaaaaa",
- 1,
- 6,
- },
- {
- ".*a",
- "bbbba",
- 5,
- 5,
- },
- {
- ".*",
- "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-",
- 0,
- 80,
- },
- {
- ".*a",
- "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a",
- 81,
- 81,
- },
- {
- ".*a",
- "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
- "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a",
- 81,
- 162,
- },
- {
- ".*b",
- "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-",
- -1,
- -1,
- },
- {
- ".*a.*",
- "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
- "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
- 81,
- 162,
- },
- {
- ".*a.*b",
- "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
- "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
- 162,
- 162,
- },
- {
- "1.*a.*",
- "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
- "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
- 81,
- 162,
- },
- {
- "a+",
- "bbbbbb",
- -1,
- -1,
- },
- };
-
- for (const auto& test: cases) {
- UNIT_ASSERT_EQUAL_C(ShortestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.shortestPrefixLen, test.ToString());
- UNIT_ASSERT_EQUAL_C(LongestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.longestPrefixLen, test.ToString());
- auto reversed = test.text;
- ReverseInPlace(reversed);
- UNIT_ASSERT_EQUAL_C(ShortestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.shortestPrefixLen, test.ToString());
- UNIT_ASSERT_EQUAL_C(LongestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.longestPrefixLen, test.ToString());
- }
+ struct Case {
+ ystring pattern;
+ ystring text;
+ ssize_t shortestPrefixLen;
+ ssize_t longestPrefixLen;
+
+ ystring ToString() const {
+ return ystring("Pattern: ") + pattern + ", text: " + text;
+ }
+ };
+
+ TVector <Case> cases = {
+ {
+ "a*",
+ "",
+ 0,
+ 0,
+ },
+ {
+ "a",
+ "",
+ -1,
+ -1,
+ },
+ {
+ "fixed",
+ "fixed prefix",
+ 5,
+ 5,
+ },
+ {
+ "fixed",
+ "a fixed nonexistent prefix",
+ -1,
+ -1,
+ },
+ {
+ "a*",
+ "aaabbb",
+ 0,
+ 3,
+ },
+ {
+ "a*",
+ "bbbbbb",
+ 0,
+ 0,
+ },
+ {
+ "a*",
+ "aaaaaa",
+ 0,
+ 6,
+ },
+ {
+ "aa*",
+ "aaabbb",
+ 1,
+ 3,
+ },
+ {
+ "a*a",
+ "aaaaaa",
+ 1,
+ 6,
+ },
+ {
+ ".*a",
+ "bbbba",
+ 5,
+ 5,
+ },
+ {
+ ".*",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-",
+ 0,
+ 80,
+ },
+ {
+ ".*a",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a",
+ 81,
+ 81,
+ },
+ {
+ ".*a",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a",
+ 81,
+ 162,
+ },
+ {
+ ".*b",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-",
+ -1,
+ -1,
+ },
+ {
+ ".*a.*",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
+ 81,
+ 162,
+ },
+ {
+ ".*a.*b",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
+ 162,
+ 162,
+ },
+ {
+ "1.*a.*",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
+ 81,
+ 162,
+ },
+ {
+ "a+",
+ "bbbbbb",
+ -1,
+ -1,
+ },
+ };
+
+ for (const auto& test: cases) {
+ UNIT_ASSERT_EQUAL_C(ShortestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.shortestPrefixLen, test.ToString());
+ UNIT_ASSERT_EQUAL_C(LongestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.longestPrefixLen, test.ToString());
+ auto reversed = test.text;
+ ReverseInPlace(reversed);
+ UNIT_ASSERT_EQUAL_C(ShortestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.shortestPrefixLen, test.ToString());
+ UNIT_ASSERT_EQUAL_C(LongestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.longestPrefixLen, test.ToString());
+ }
}
Y_UNIT_TEST(ScanTermination)
{
- Pire::Scanner sc = Pire::Lexer("aaa").Parse().Compile<Pire::Scanner>();
- // Scanning must terminate at first dead state. If it does not,
- // we will pass through the end of our string and end up with segfault.
- const char str[] = "aaab";
- const char* p = Pire::LongestPrefix(sc, &str[0], &str[0] + sizeof(str));
- UNIT_ASSERT(p == &str[0] + 3);
+ Pire::Scanner sc = Pire::Lexer("aaa").Parse().Compile<Pire::Scanner>();
+ // Scanning must terminate at first dead state. If it does not,
+ // we will pass through the end of our string and end up with segfault.
+ const char str[] = "aaab";
+ const char* p = Pire::LongestPrefix(sc, &str[0], &str[0] + sizeof(str));
+ UNIT_ASSERT(p == &str[0] + 3);
}
struct BasicMmapTest {
- template <class Scanner>
- static void Match(Scanner& sc, const void* ptr, size_t sz, const char* str)
- {
- try {
- sc.Mmap(ptr, sz);
- if (!Pire::Impl::IsAligned(ptr, sizeof(size_t))) {
- UNIT_ASSERT(!"Failed to check for misaligned mmaping");
- } else {
- UNIT_ASSERT(Matches(sc, str));
- }
- }
- catch (Pire::Error&) {}
- }
+ template <class Scanner>
+ static void Match(Scanner& sc, const void* ptr, size_t sz, const char* str)
+ {
+ try {
+ sc.Mmap(ptr, sz);
+ if (!Pire::Impl::IsAligned(ptr, sizeof(size_t))) {
+ UNIT_ASSERT(!"Failed to check for misaligned mmaping");
+ } else {
+ UNIT_ASSERT(Matches(sc, str));
+ }
+ }
+ catch (Pire::Error&) {}
+ }
};
template <class Sc1, class Sc2>
void TestCopyingHelper()
{
- Pire::Fsm fsm = ParseRegexp("^r$", "");
- Sc1 sc1(Pire::Fsm(fsm).Compile<Sc1>());
+ Pire::Fsm fsm = ParseRegexp("^r$", "");
+ Sc1 sc1(Pire::Fsm(fsm).Compile<Sc1>());
- // Test copy ctor
- UNIT_ASSERT(Matches(Sc2(sc1), "r"));
- UNIT_ASSERT(!Matches(Sc2(sc1), "p"));
+ // Test copy ctor
+ UNIT_ASSERT(Matches(Sc2(sc1), "r"));
+ UNIT_ASSERT(!Matches(Sc2(sc1), "p"));
- // Test '=' operator
- Sc2 sc2;
- sc2 = sc1;
- UNIT_ASSERT(Matches(sc2, "r"));
- UNIT_ASSERT(!Matches(sc2, "p"));
+ // Test '=' operator
+ Sc2 sc2;
+ sc2 = sc1;
+ UNIT_ASSERT(Matches(sc2, "r"));
+ UNIT_ASSERT(!Matches(sc2, "p"));
}
template <class Sc1, class Sc2>
void TestCopying()
{
- TestCopyingHelper<Sc1, Sc2>();
- TestCopyingHelper<Sc2, Sc1>();
+ TestCopyingHelper<Sc1, Sc2>();
+ TestCopyingHelper<Sc2, Sc1>();
}
Y_UNIT_TEST(Copying)
{
- TestCopying<Pire::Scanner, Pire::NonrelocScanner>();
- TestCopying<Pire::ScannerNoMask, Pire::NonrelocScannerNoMask>();
- TestCopying<Pire::HalfFinalScanner, Pire::NonrelocHalfFinalScanner>();
- TestCopying<Pire::HalfFinalScannerNoMask, Pire::NonrelocHalfFinalScannerNoMask>();
+ TestCopying<Pire::Scanner, Pire::NonrelocScanner>();
+ TestCopying<Pire::ScannerNoMask, Pire::NonrelocScannerNoMask>();
+ TestCopying<Pire::HalfFinalScanner, Pire::NonrelocHalfFinalScanner>();
+ TestCopying<Pire::HalfFinalScannerNoMask, Pire::NonrelocHalfFinalScannerNoMask>();
}
template<class Scanner>
void MatchScanner(Scanner& scanner) {
- UNIT_ASSERT(Matches(scanner, "regexp"));
- UNIT_ASSERT(!Matches(scanner, "regxp"));
- UNIT_ASSERT(!Matches(scanner, "regexp t"));
+ UNIT_ASSERT(Matches(scanner, "regexp"));
+ UNIT_ASSERT(!Matches(scanner, "regxp"));
+ UNIT_ASSERT(!Matches(scanner, "regexp t"));
}
template<class Scanner>
void LoadAndMatchScanner(MemoryInput& rbuf, Scanner& scanner) {
- Load(&rbuf, scanner);
- MatchScanner(scanner);
+ Load(&rbuf, scanner);
+ MatchScanner(scanner);
}
template<class Scanner>
const char* MmapAndMatchScanner(Scanner& scanner, const char* ptr, size_t size) {
- const char* ptr2 = (const char*)scanner.Mmap(ptr, size);
- MatchScanner(scanner);
- return ptr2;
+ const char* ptr2 = (const char*)scanner.Mmap(ptr, size);
+ MatchScanner(scanner);
+ return ptr2;
}
Y_UNIT_TEST(Serialization)
{
- Scanners s("^regexp$");
-
- BufferOutput wbuf;
- Save(&wbuf, s.fast);
- Save(&wbuf, s.simple);
- Save(&wbuf, s.slow);
- Save(&wbuf, s.fastNoMask);
- Save(&wbuf, s.nonreloc);
- Save(&wbuf, s.nonrelocNoMask);
- Save(&wbuf, s.halfFinal);
- Save(&wbuf, s.halfFinalNoMask);
- Save(&wbuf, s.nonrelocHalfFinal);
- Save(&wbuf, s.nonrelocHalfFinalNoMask);
-
- MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
- LoadAndMatchScanner(rbuf, s.fast);
- LoadAndMatchScanner(rbuf, s.simple);
- LoadAndMatchScanner(rbuf, s.slow);
- LoadAndMatchScanner(rbuf, s.fastNoMask);
- LoadAndMatchScanner(rbuf, s.nonreloc);
- LoadAndMatchScanner(rbuf, s.nonrelocNoMask);
- LoadAndMatchScanner(rbuf, s.halfFinal);
- LoadAndMatchScanner(rbuf, s.halfFinalNoMask);
- LoadAndMatchScanner(rbuf, s.nonrelocHalfFinal);
- LoadAndMatchScanner(rbuf, s.nonrelocHalfFinalNoMask);
-
- Pire::Scanner fast;
- Pire::SimpleScanner simple;
- Pire::SlowScanner slow;
- Pire::ScannerNoMask fastNoMask;
- Pire::HalfFinalScanner halfFinal;
- Pire::HalfFinalScannerNoMask halfFinalNoMask;
- Pire::Scanner fast1;
- Pire::ScannerNoMask fastNoMask1;
- Pire::HalfFinalScanner halfFinal1;
- Pire::HalfFinalScannerNoMask halfFinalNoMask1;
- const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
- TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
- const char* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
- const char* end = ptr + wbuf.Buffer().Size();
- memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
-
- const char* ptr2 = 0;
- ptr2 = MmapAndMatchScanner(fast, ptr, end - ptr);
- size_t fastSize = ptr2 - ptr;
- ptr = ptr2;
- ptr2 = MmapAndMatchScanner(simple, ptr, end - ptr);
- size_t simpleSize = ptr2 - ptr;
- ptr = ptr2;
- ptr = MmapAndMatchScanner(slow, ptr, end - ptr);
- ptr = MmapAndMatchScanner(fastNoMask, ptr, end - ptr);
- // Nonreloc-s are saved as Scaner-s, so read them again
- ptr = MmapAndMatchScanner(fast1, ptr, end - ptr);
- ptr = MmapAndMatchScanner(fastNoMask1, ptr, end - ptr);
-
- ptr = MmapAndMatchScanner(halfFinal, ptr, end - ptr);
- ptr = MmapAndMatchScanner(halfFinalNoMask, ptr, end - ptr);
- ptr = MmapAndMatchScanner(halfFinal1, ptr, end - ptr);
- ptr = MmapAndMatchScanner(halfFinalNoMask1, ptr, end - ptr);
- UNIT_ASSERT_EQUAL(ptr, end);
-
- for (size_t offset = 1; offset < MaxTestOffset; ++offset) {
- ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
- end = ptr + wbuf.Buffer().Size();
- memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
- BasicMmapTest::Match(fast, ptr, end - ptr, "regexp");
- ptr = ptr + fastSize;
- BasicMmapTest::Match(simple, ptr, end - ptr, "regexp");
- ptr = ptr + simpleSize;
- BasicMmapTest::Match(slow, ptr, end - ptr, "regexp");
- }
+ Scanners s("^regexp$");
+
+ BufferOutput wbuf;
+ Save(&wbuf, s.fast);
+ Save(&wbuf, s.simple);
+ Save(&wbuf, s.slow);
+ Save(&wbuf, s.fastNoMask);
+ Save(&wbuf, s.nonreloc);
+ Save(&wbuf, s.nonrelocNoMask);
+ Save(&wbuf, s.halfFinal);
+ Save(&wbuf, s.halfFinalNoMask);
+ Save(&wbuf, s.nonrelocHalfFinal);
+ Save(&wbuf, s.nonrelocHalfFinalNoMask);
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ LoadAndMatchScanner(rbuf, s.fast);
+ LoadAndMatchScanner(rbuf, s.simple);
+ LoadAndMatchScanner(rbuf, s.slow);
+ LoadAndMatchScanner(rbuf, s.fastNoMask);
+ LoadAndMatchScanner(rbuf, s.nonreloc);
+ LoadAndMatchScanner(rbuf, s.nonrelocNoMask);
+ LoadAndMatchScanner(rbuf, s.halfFinal);
+ LoadAndMatchScanner(rbuf, s.halfFinalNoMask);
+ LoadAndMatchScanner(rbuf, s.nonrelocHalfFinal);
+ LoadAndMatchScanner(rbuf, s.nonrelocHalfFinalNoMask);
+
+ Pire::Scanner fast;
+ Pire::SimpleScanner simple;
+ Pire::SlowScanner slow;
+ Pire::ScannerNoMask fastNoMask;
+ Pire::HalfFinalScanner halfFinal;
+ Pire::HalfFinalScannerNoMask halfFinalNoMask;
+ Pire::Scanner fast1;
+ Pire::ScannerNoMask fastNoMask1;
+ Pire::HalfFinalScanner halfFinal1;
+ Pire::HalfFinalScannerNoMask halfFinalNoMask1;
+ const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
+ TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
+ const char* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
+ const char* end = ptr + wbuf.Buffer().Size();
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+
+ const char* ptr2 = 0;
+ ptr2 = MmapAndMatchScanner(fast, ptr, end - ptr);
+ size_t fastSize = ptr2 - ptr;
+ ptr = ptr2;
+ ptr2 = MmapAndMatchScanner(simple, ptr, end - ptr);
+ size_t simpleSize = ptr2 - ptr;
+ ptr = ptr2;
+ ptr = MmapAndMatchScanner(slow, ptr, end - ptr);
+ ptr = MmapAndMatchScanner(fastNoMask, ptr, end - ptr);
+ // Nonreloc-s are saved as Scaner-s, so read them again
+ ptr = MmapAndMatchScanner(fast1, ptr, end - ptr);
+ ptr = MmapAndMatchScanner(fastNoMask1, ptr, end - ptr);
+
+ ptr = MmapAndMatchScanner(halfFinal, ptr, end - ptr);
+ ptr = MmapAndMatchScanner(halfFinalNoMask, ptr, end - ptr);
+ ptr = MmapAndMatchScanner(halfFinal1, ptr, end - ptr);
+ ptr = MmapAndMatchScanner(halfFinalNoMask1, ptr, end - ptr);
+ UNIT_ASSERT_EQUAL(ptr, end);
+
+ for (size_t offset = 1; offset < MaxTestOffset; ++offset) {
+ ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
+ end = ptr + wbuf.Buffer().Size();
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ BasicMmapTest::Match(fast, ptr, end - ptr, "regexp");
+ ptr = ptr + fastSize;
+ BasicMmapTest::Match(simple, ptr, end - ptr, "regexp");
+ ptr = ptr + simpleSize;
+ BasicMmapTest::Match(slow, ptr, end - ptr, "regexp");
+ }
}
Y_UNIT_TEST(TestShortcuts)
{
- REGEXP("aaa") {
- ACCEPTS("......................................aaa.............");
- DENIES ("......................................aab.............");
- DENIES ("......................................................");
- }
- REGEXP("[ab]{3}") {
- ACCEPTS("......................................aaa.............");
- ACCEPTS("......................................aab.............");
- ACCEPTS("......................................bbb.............");
- DENIES ("......................................................");
- }
- REGEXP2("\xD0\xB0", "u") {
- ACCEPTS("......................................\xD0\xB0...............");
- ACCEPTS("...................................\xD0\xB0..................");
- ACCEPTS("................................\xD0\xB0.....................");
- }
+ REGEXP("aaa") {
+ ACCEPTS("......................................aaa.............");
+ DENIES ("......................................aab.............");
+ DENIES ("......................................................");
+ }
+ REGEXP("[ab]{3}") {
+ ACCEPTS("......................................aaa.............");
+ ACCEPTS("......................................aab.............");
+ ACCEPTS("......................................bbb.............");
+ DENIES ("......................................................");
+ }
+ REGEXP2("\xD0\xB0", "u") {
+ ACCEPTS("......................................\xD0\xB0...............");
+ ACCEPTS("...................................\xD0\xB0..................");
+ ACCEPTS("................................\xD0\xB0.....................");
+ }
}
template<class Scanner>
void TestGlue()
{
- Scanner sc1 = ParseRegexp("aaa").Compile<Scanner>();
- Scanner sc2 = ParseRegexp("bbb").Compile<Scanner>();
- Scanner glued = Scanner::Glue(sc1, sc2);
- UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(2));
-
- auto state = RunRegexp(glued, "aaa");
- auto res = glued.AcceptedRegexps(state);
- UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
- UNIT_ASSERT_EQUAL(*res.first, size_t(0));
-
- state = RunRegexp(glued, "bbb");
- res = glued.AcceptedRegexps(state);
- UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
- UNIT_ASSERT_EQUAL(*res.first, size_t(1));
-
- state = RunRegexp(glued, "aaabbb");
- res = glued.AcceptedRegexps(state);
- UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(2));
- UNIT_ASSERT_EQUAL(res.first[0], size_t(0));
- UNIT_ASSERT_EQUAL(res.first[1], size_t(1));
-
- state = RunRegexp(glued, "ccc");
- res = glued.AcceptedRegexps(state);
- UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(0));
-
- Scanner sc3 = ParseRegexp("ccc").Compile<Scanner>();
- glued = Scanner::Glue(sc3, glued);
- UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(3));
-
- state = RunRegexp(glued, "ccc");
- res = glued.AcceptedRegexps(state);
- UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
- UNIT_ASSERT_EQUAL(res.first[0], size_t(0));
- Scanner sc4 = Scanner::Glue(
- ParseRegexp("a", "n").Compile<Scanner>(),
- ParseRegexp("c", "n").Compile<Scanner>()
- );
- state = RunRegexp(sc4, "ac");
- res = sc4.AcceptedRegexps(state);
- UNIT_ASSERT(res.second == res.first);
- state = RunRegexp(sc4, "ac");
- UNIT_ASSERT(!sc4.Final(state));
+ Scanner sc1 = ParseRegexp("aaa").Compile<Scanner>();
+ Scanner sc2 = ParseRegexp("bbb").Compile<Scanner>();
+ Scanner glued = Scanner::Glue(sc1, sc2);
+ UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(2));
+
+ auto state = RunRegexp(glued, "aaa");
+ auto res = glued.AcceptedRegexps(state);
+ UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
+ UNIT_ASSERT_EQUAL(*res.first, size_t(0));
+
+ state = RunRegexp(glued, "bbb");
+ res = glued.AcceptedRegexps(state);
+ UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
+ UNIT_ASSERT_EQUAL(*res.first, size_t(1));
+
+ state = RunRegexp(glued, "aaabbb");
+ res = glued.AcceptedRegexps(state);
+ UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(2));
+ UNIT_ASSERT_EQUAL(res.first[0], size_t(0));
+ UNIT_ASSERT_EQUAL(res.first[1], size_t(1));
+
+ state = RunRegexp(glued, "ccc");
+ res = glued.AcceptedRegexps(state);
+ UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(0));
+
+ Scanner sc3 = ParseRegexp("ccc").Compile<Scanner>();
+ glued = Scanner::Glue(sc3, glued);
+ UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(3));
+
+ state = RunRegexp(glued, "ccc");
+ res = glued.AcceptedRegexps(state);
+ UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
+ UNIT_ASSERT_EQUAL(res.first[0], size_t(0));
+ Scanner sc4 = Scanner::Glue(
+ ParseRegexp("a", "n").Compile<Scanner>(),
+ ParseRegexp("c", "n").Compile<Scanner>()
+ );
+ state = RunRegexp(sc4, "ac");
+ res = sc4.AcceptedRegexps(state);
+ UNIT_ASSERT(res.second == res.first);
+ state = RunRegexp(sc4, "ac");
+ UNIT_ASSERT(!sc4.Final(state));
}
Y_UNIT_TEST(Glue)
{
- TestGlue<Pire::Scanner>();
- TestGlue<Pire::NonrelocScanner>();
- TestGlue<Pire::ScannerNoMask>();
- TestGlue<Pire::NonrelocScannerNoMask>();
- TestGlue<Pire::HalfFinalScanner>();
- TestGlue<Pire::NonrelocHalfFinalScanner>();
- TestGlue<Pire::HalfFinalScannerNoMask>();
- TestGlue<Pire::NonrelocHalfFinalScannerNoMask>();
+ TestGlue<Pire::Scanner>();
+ TestGlue<Pire::NonrelocScanner>();
+ TestGlue<Pire::ScannerNoMask>();
+ TestGlue<Pire::NonrelocScannerNoMask>();
+ TestGlue<Pire::HalfFinalScanner>();
+ TestGlue<Pire::NonrelocHalfFinalScanner>();
+ TestGlue<Pire::HalfFinalScannerNoMask>();
+ TestGlue<Pire::NonrelocHalfFinalScannerNoMask>();
}
Y_UNIT_TEST(Slow)
{
- Pire::SlowScanner sc = ParseRegexp("a.{30}$", "").Compile<Pire::SlowScanner>();
- // 123456789012345678901234567890
- UNIT_ASSERT( Matches(sc, "....a.............................."));
- UNIT_ASSERT(!Matches(sc, "....a..............................."));
- UNIT_ASSERT(!Matches(sc, "....a............................."));
+ Pire::SlowScanner sc = ParseRegexp("a.{30}$", "").Compile<Pire::SlowScanner>();
+ // 123456789012345678901234567890
+ UNIT_ASSERT( Matches(sc, "....a.............................."));
+ UNIT_ASSERT(!Matches(sc, "....a..............................."));
+ UNIT_ASSERT(!Matches(sc, "....a............................."));
}
struct astring: private std::vector<char> {
@@ -775,33 +775,33 @@ struct astring: private std::vector<char> {
Y_UNIT_TEST(Aligned)
{
- using ystring = astring;
-
- UNIT_ASSERT(Pire::Impl::IsAligned(ystring("x").c_str(), sizeof(void*)));
-
- REGEXP("xy") {
- // Short string with aligned head
- ACCEPTS(ystring("xy").c_str());
- DENIES (ystring("yz").c_str());
- // Short string, unaligned
- ACCEPTS(ystring(".xy").c_str() + 1);
- DENIES (ystring(".yz").c_str() + 1);
- // Short string with aligned tail
- ACCEPTS((ystring(sizeof(void*) - 2, '.') + "xy").c_str() + sizeof(void*) - 2);
- DENIES ((ystring(sizeof(void*) - 2, '.') + "yz").c_str() + sizeof(void*) - 2);
- }
-
- REGEXP("abcde") {
- // Everything aligned, match occurs in the middle
- ACCEPTS(ystring("ZZZZZabcdeZZZZZZ").c_str());
- DENIES (ystring("ZZZZZabcdfZZZZZZ").c_str());
- // Unaligned head
- ACCEPTS(ystring(".ZabcdeZZZ").c_str() + 1);
- DENIES (ystring(".ZxbcdeZZZ").c_str() + 1);
- // Unaligned tail
- ACCEPTS(ystring("ZZZZZZZZZZZZZabcde").c_str());
- DENIES (ystring("ZZZZZZZZZZZZZabcdf").c_str());
- }
+ using ystring = astring;
+
+ UNIT_ASSERT(Pire::Impl::IsAligned(ystring("x").c_str(), sizeof(void*)));
+
+ REGEXP("xy") {
+ // Short string with aligned head
+ ACCEPTS(ystring("xy").c_str());
+ DENIES (ystring("yz").c_str());
+ // Short string, unaligned
+ ACCEPTS(ystring(".xy").c_str() + 1);
+ DENIES (ystring(".yz").c_str() + 1);
+ // Short string with aligned tail
+ ACCEPTS((ystring(sizeof(void*) - 2, '.') + "xy").c_str() + sizeof(void*) - 2);
+ DENIES ((ystring(sizeof(void*) - 2, '.') + "yz").c_str() + sizeof(void*) - 2);
+ }
+
+ REGEXP("abcde") {
+ // Everything aligned, match occurs in the middle
+ ACCEPTS(ystring("ZZZZZabcdeZZZZZZ").c_str());
+ DENIES (ystring("ZZZZZabcdfZZZZZZ").c_str());
+ // Unaligned head
+ ACCEPTS(ystring(".ZabcdeZZZ").c_str() + 1);
+ DENIES (ystring(".ZxbcdeZZZ").c_str() + 1);
+ // Unaligned tail
+ ACCEPTS(ystring("ZZZZZZZZZZZZZabcde").c_str());
+ DENIES (ystring("ZZZZZZZZZZZZZabcdf").c_str());
+ }
}
#undef Run
@@ -809,73 +809,73 @@ Y_UNIT_TEST(Aligned)
template <class Scanner>
void BasicTestEmptySaveLoadMmap()
{
- Scanner sc;
- UNIT_ASSERT(sc.Empty());
- UNIT_ASSERT_EQUAL(sc.RegexpsCount(), size_t(0));
- UNIT_CHECKPOINT(); Pire::Runner(sc).Begin().Run("a string", 7).End(); // should not crash
+ Scanner sc;
+ UNIT_ASSERT(sc.Empty());
+ UNIT_ASSERT_EQUAL(sc.RegexpsCount(), size_t(0));
+ UNIT_CHECKPOINT(); Pire::Runner(sc).Begin().Run("a string", 7).End(); // should not crash
- BufferOutput wbuf;
- UNIT_CHECKPOINT(); Save(&wbuf, sc);
+ BufferOutput wbuf;
+ UNIT_CHECKPOINT(); Save(&wbuf, sc);
- MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
- Scanner sc3;
- /*UNIT_CHECKPOINT();*/ Load(&rbuf, sc3);
- UNIT_ASSERT(sc3.Empty());
- UNIT_CHECKPOINT(); Pire::Runner(sc3).Begin().Run("a string", 7).End();
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ Scanner sc3;
+ /*UNIT_CHECKPOINT();*/ Load(&rbuf, sc3);
+ UNIT_ASSERT(sc3.Empty());
+ UNIT_CHECKPOINT(); Pire::Runner(sc3).Begin().Run("a string", 7).End();
- Scanner sc4;
- /*UNIT_CHECKPOINT();*/ const char* ptr = (const char*) sc4.Mmap(wbuf.Buffer().Data(), wbuf.Buffer().Size());
- UNIT_ASSERT(ptr == wbuf.Buffer().Data() + wbuf.Buffer().Size());
- UNIT_ASSERT(sc4.Empty());
- UNIT_CHECKPOINT(); Pire::Runner(sc4).Begin().Run("a string", 7).End();
+ Scanner sc4;
+ /*UNIT_CHECKPOINT();*/ const char* ptr = (const char*) sc4.Mmap(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ UNIT_ASSERT(ptr == wbuf.Buffer().Data() + wbuf.Buffer().Size());
+ UNIT_ASSERT(sc4.Empty());
+ UNIT_CHECKPOINT(); Pire::Runner(sc4).Begin().Run("a string", 7).End();
}
Y_UNIT_TEST(EmptyScanner)
{
- // Tests for Scanner
- BasicTestEmptySaveLoadMmap<Pire::Scanner>();
- BasicTestEmptySaveLoadMmap<Pire::ScannerNoMask>();
- BasicTestEmptySaveLoadMmap<Pire::HalfFinalScanner>();
- BasicTestEmptySaveLoadMmap<Pire::HalfFinalScannerNoMask>();
-
- Pire::Scanner sc;
- Pire::Scanner scsc = Pire::Scanner::Glue(sc, sc);
- UNIT_ASSERT(scsc.Empty());
- UNIT_ASSERT_EQUAL(scsc.RegexpsCount(), size_t(0));
- UNIT_CHECKPOINT(); Pire::Runner(scsc).Begin().Run("a string", 7).End();
-
- Pire::Scanner sc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>();
- UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1));
- UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End();
- UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(scsc, sc2).RegexpsCount(), size_t(1));
- UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(scsc, sc2)).Begin().Run("a string", 7).End();
- UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc).RegexpsCount(), size_t(1));
- UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc)).Begin().Run("a string", 7).End();
-
- // Tests for NonrelocScanner
- Pire::NonrelocScanner nsc;
- UNIT_ASSERT(nsc.Empty());
- UNIT_ASSERT_EQUAL(nsc.RegexpsCount(), size_t(0));
- UNIT_CHECKPOINT(); Pire::Runner(nsc).Begin().Run("a string", 7).End();
-
- Pire::NonrelocScanner nsc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>();
- UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1));
- UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End();
-
- {
- BufferOutput wbuf;
- UNIT_CHECKPOINT(); Save(&wbuf, nsc);
-
- MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
- Pire::NonrelocScanner nsc3;
- /*UNIT_CHECKPOINT();*/ Load(&rbuf, nsc3);
- UNIT_ASSERT(nsc3.Empty());
- UNIT_CHECKPOINT(); Pire::Runner(nsc3).Begin().Run("a string", 7).End();
- }
-
- BasicTestEmptySaveLoadMmap<Pire::SimpleScanner>();
-
- BasicTestEmptySaveLoadMmap<Pire::SlowScanner>();
+ // Tests for Scanner
+ BasicTestEmptySaveLoadMmap<Pire::Scanner>();
+ BasicTestEmptySaveLoadMmap<Pire::ScannerNoMask>();
+ BasicTestEmptySaveLoadMmap<Pire::HalfFinalScanner>();
+ BasicTestEmptySaveLoadMmap<Pire::HalfFinalScannerNoMask>();
+
+ Pire::Scanner sc;
+ Pire::Scanner scsc = Pire::Scanner::Glue(sc, sc);
+ UNIT_ASSERT(scsc.Empty());
+ UNIT_ASSERT_EQUAL(scsc.RegexpsCount(), size_t(0));
+ UNIT_CHECKPOINT(); Pire::Runner(scsc).Begin().Run("a string", 7).End();
+
+ Pire::Scanner sc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>();
+ UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1));
+ UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End();
+ UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(scsc, sc2).RegexpsCount(), size_t(1));
+ UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(scsc, sc2)).Begin().Run("a string", 7).End();
+ UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc).RegexpsCount(), size_t(1));
+ UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc)).Begin().Run("a string", 7).End();
+
+ // Tests for NonrelocScanner
+ Pire::NonrelocScanner nsc;
+ UNIT_ASSERT(nsc.Empty());
+ UNIT_ASSERT_EQUAL(nsc.RegexpsCount(), size_t(0));
+ UNIT_CHECKPOINT(); Pire::Runner(nsc).Begin().Run("a string", 7).End();
+
+ Pire::NonrelocScanner nsc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>();
+ UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1));
+ UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End();
+
+ {
+ BufferOutput wbuf;
+ UNIT_CHECKPOINT(); Save(&wbuf, nsc);
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ Pire::NonrelocScanner nsc3;
+ /*UNIT_CHECKPOINT();*/ Load(&rbuf, nsc3);
+ UNIT_ASSERT(nsc3.Empty());
+ UNIT_CHECKPOINT(); Pire::Runner(nsc3).Begin().Run("a string", 7).End();
+ }
+
+ BasicTestEmptySaveLoadMmap<Pire::SimpleScanner>();
+
+ BasicTestEmptySaveLoadMmap<Pire::SlowScanner>();
}
Y_UNIT_TEST(NullPointer)
diff --git a/library/cpp/regex/pire/ut/read_unicode_ut.cpp b/library/cpp/regex/pire/ut/read_unicode_ut.cpp
index 17569096873..16f627d9da8 100644
--- a/library/cpp/regex/pire/ut/read_unicode_ut.cpp
+++ b/library/cpp/regex/pire/ut/read_unicode_ut.cpp
@@ -26,282 +26,282 @@
#include "common.h"
Y_UNIT_TEST_SUITE(ReadUnicodeTest) {
- ystring CreateStringWithZeroSymbol(const char* str, size_t pos) {
- ystring result = str;
- Y_ASSERT(pos < result.size());
- result[pos] = '\0';
- return result;
- }
-
- Y_UNIT_TEST(ZeroSymbol)
- {
- REGEXP("\\x{0}") {
- ACCEPTS(CreateStringWithZeroSymbol("a", 0));
- ACCEPTS(CreateStringWithZeroSymbol("some text", 3));
- DENIES("string without zero");
- }
-
- REGEXP("the\\x00middle") {
- ACCEPTS(CreateStringWithZeroSymbol("in the middle", 6));
- DENIES(CreateStringWithZeroSymbol("in the middle", 5));
- DENIES("in the middle");
- }
- }
-
- Y_UNIT_TEST(SymbolsByCodes)
- {
- REGEXP("\\x{41}") {
- ACCEPTS("A");
- ACCEPTS("tAst string");
- DENIES("test string");
- }
-
- REGEXP("\\x26abc") {
- ACCEPTS("&abc;");
- DENIES("test &ab");
- DENIES("without");
- }
- }
-
- Y_UNIT_TEST(ErrorsWhileCompiling)
- {
- UNIT_ASSERT(HasError("\\x"));
- UNIT_ASSERT(HasError("\\x0"));
- UNIT_ASSERT(HasError("\\xfu"));
- UNIT_ASSERT(HasError("\\xs1"));
- UNIT_ASSERT(HasError("\\x 0"));
- UNIT_ASSERT(HasError("\\x0 "));
-
- UNIT_ASSERT(HasError("\\x{2A1"));
- UNIT_ASSERT(HasError("\\x{"));
- UNIT_ASSERT(HasError("\\x}"));
- UNIT_ASSERT(HasError("\\x2}"));
- UNIT_ASSERT(HasError("\\x{{3}"));
- UNIT_ASSERT(HasError("\\x{2a{5}"));
-
- UNIT_ASSERT(HasError("\\x{}"));
- UNIT_ASSERT(HasError("\\x{+3}"));
- UNIT_ASSERT(HasError("\\x{-3}"));
- UNIT_ASSERT(HasError("\\x{ 2F}"));
- UNIT_ASSERT(HasError("\\x{2A F}"));
- UNIT_ASSERT(HasError("\\x{2Arft}"));
- UNIT_ASSERT(HasError("\\x{110000}"));
-
- UNIT_ASSERT(!HasError("\\x{fB1}"));
- UNIT_ASSERT(!HasError("\\x00"));
- UNIT_ASSERT(!HasError("\\x{10FFFF}"));
- }
-
- Y_UNIT_TEST(OneCharacterRange)
- {
- SCANNER("[\\x{61}]") {
- ACCEPTS("a");
- ACCEPTS("bac");
- DENIES("test");
- }
-
- SCANNER("[\\x3f]") {
- ACCEPTS("?");
- ACCEPTS("test?");
- DENIES("test");
- }
- }
-
- Y_UNIT_TEST(CharacterRange) {
- REGEXP("[\\x{61}\\x62\\x{3f}\\x26]") {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("?");
- ACCEPTS("acd");
- ACCEPTS("bcd");
- ACCEPTS("cd?");
- ACCEPTS("ab?");
- DENIES("cd");
- }
-
- REGEXP("[\\x{61}-\\x{63}]") {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("c");
- ACCEPTS("qwertya");
- DENIES("d");
- }
-
- REGEXP("[\\x61-\\x61]") {
- ACCEPTS("a");
- ACCEPTS("qwertya");
- DENIES("b");
- }
-
- REGEXP("[\\x26\\x{61}-\\x{62}\\x{3f}]") {
- ACCEPTS("&");
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("?");
- ACCEPTS("ade");
- ACCEPTS("ab?");
- DENIES("d");
- }
-
- REGEXP("[\\x{41}-\\x{42}\\x{61}-\\x{62}]") {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("A");
- ACCEPTS("B");
- DENIES("c");
- DENIES("C");
- }
-
- REGEXP("[\\x{41}-\\x{42}][\\x{61}-\\x{62}]") {
- ACCEPTS("Aa");
- ACCEPTS("Ab");
- ACCEPTS("Ba");
- ACCEPTS("Bb");
- DENIES("a");
- DENIES("b");
- DENIES("A");
- DENIES("B");
- DENIES("ab");
- DENIES("AB");
- DENIES("Ca");
- }
- }
-
- Y_UNIT_TEST(RangeExcludeCharacters) {
- REGEXP("[^\\x{61}]") {
- ACCEPTS("b");
- ACCEPTS("c");
- ACCEPTS("aba");
- DENIES("a");
- DENIES("aaa");
- }
-
- REGEXP("[^\\x{61}-\\x{7a}]") {
- ACCEPTS("A");
- ACCEPTS("123");
- ACCEPTS("acb1");
- DENIES("a");
- DENIES("abcxyz");
- }
- }
-
- Y_UNIT_TEST(MixedRange) {
- REGEXP("[\\x{61}B]") {
- ACCEPTS("a");
- ACCEPTS("B");
- ACCEPTS("atestB");
- DENIES("test");
- }
-
- REGEXP("[^\\x{61}A]") {
- ACCEPTS("b");
- ACCEPTS("B");
- ACCEPTS("atestB");
- DENIES("a");
- DENIES("A");
- DENIES("aaAA");
- }
-
- REGEXP("[0-9][\\x{61}-\\x{62}A-B]") {
- ACCEPTS("0a");
- ACCEPTS("1A");
- ACCEPTS("5b");
- ACCEPTS("9B");
- ACCEPTS("1atestB");
- ACCEPTS("2Atest");
- DENIES("aB");
- DENIES("testb");
- DENIES("test");
- }
-
- REGEXP("[\\x{61}-c]") {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("c");
- ACCEPTS("testb");
- DENIES("d");
- }
-
- REGEXP("[^a-\\x{7a}]") {
- ACCEPTS("A");
- ACCEPTS("123");
- ACCEPTS("acb1");
- DENIES("a");
- DENIES("abcxyz");
- }
-
- REGEXP("[\\x{41}-Ba-\\x{62}]") {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("A");
- ACCEPTS("B");
- DENIES("c");
- DENIES("C");
- }
- }
-
- Y_UNIT_TEST(CompilingRange)
- {
- UNIT_ASSERT(HasError("[\\x41"));
- UNIT_ASSERT(HasError("[\\xfq]"));
- UNIT_ASSERT(HasError("[\\x{01}-]"));
-
- UNIT_ASSERT(!HasError("[\\x{10FFFF}]"));
- UNIT_ASSERT(!HasError("[\\x{00}]"));
- UNIT_ASSERT(!HasError("[\\x{abc}-\\x{FFF}]"));
-
- UNIT_ASSERT(!HasError("[^\\xFF]"));
- UNIT_ASSERT(!HasError("[^\\x{FF}-\\x{FF0}]"));
- UNIT_ASSERT(!HasError("[-\\x{01}]"));
- }
-
- Y_UNIT_TEST(UnicodeRepetition)
- {
- REGEXP("^\\x{78}{3,6}$") {
- DENIES ("xx");
- ACCEPTS("xxx");
- ACCEPTS("xxxx");
- ACCEPTS("xxxxx");
- ACCEPTS("xxxxxx");
- DENIES ("xxxxxxx");
- }
-
- REGEXP("^x{3,}$") {
- DENIES ("xx");
- ACCEPTS("xxx");
- ACCEPTS("xxxx");
- ACCEPTS("xxxxxxxxxxx");
- ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
- }
-
- REGEXP("^\\x{78}{3}$") {
- DENIES ("x");
- DENIES ("xx");
- ACCEPTS("xxx");
- DENIES ("xxxx");
- DENIES ("xxxxx");
- DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
- }
-
- REGEXP("^([\\x{78}-\\x{79}]){2}$") {
- DENIES("x");
- DENIES("y");
- ACCEPTS("xx");
- ACCEPTS("xy");
- ACCEPTS("yx");
- ACCEPTS("yy");
- DENIES("xxy");
- DENIES("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
- }
- }
-
- Y_UNIT_TEST(AnyUnicodeCodepointIsAllowed)
- {
- REGEXP("[\\x{0}-\\x{77}\\x{79}-\\x{10ffff}]") {
- ACCEPTS("w");
- DENIES ("x");
- ACCEPTS("y");
- }
- }
+ ystring CreateStringWithZeroSymbol(const char* str, size_t pos) {
+ ystring result = str;
+ Y_ASSERT(pos < result.size());
+ result[pos] = '\0';
+ return result;
+ }
+
+ Y_UNIT_TEST(ZeroSymbol)
+ {
+ REGEXP("\\x{0}") {
+ ACCEPTS(CreateStringWithZeroSymbol("a", 0));
+ ACCEPTS(CreateStringWithZeroSymbol("some text", 3));
+ DENIES("string without zero");
+ }
+
+ REGEXP("the\\x00middle") {
+ ACCEPTS(CreateStringWithZeroSymbol("in the middle", 6));
+ DENIES(CreateStringWithZeroSymbol("in the middle", 5));
+ DENIES("in the middle");
+ }
+ }
+
+ Y_UNIT_TEST(SymbolsByCodes)
+ {
+ REGEXP("\\x{41}") {
+ ACCEPTS("A");
+ ACCEPTS("tAst string");
+ DENIES("test string");
+ }
+
+ REGEXP("\\x26abc") {
+ ACCEPTS("&abc;");
+ DENIES("test &ab");
+ DENIES("without");
+ }
+ }
+
+ Y_UNIT_TEST(ErrorsWhileCompiling)
+ {
+ UNIT_ASSERT(HasError("\\x"));
+ UNIT_ASSERT(HasError("\\x0"));
+ UNIT_ASSERT(HasError("\\xfu"));
+ UNIT_ASSERT(HasError("\\xs1"));
+ UNIT_ASSERT(HasError("\\x 0"));
+ UNIT_ASSERT(HasError("\\x0 "));
+
+ UNIT_ASSERT(HasError("\\x{2A1"));
+ UNIT_ASSERT(HasError("\\x{"));
+ UNIT_ASSERT(HasError("\\x}"));
+ UNIT_ASSERT(HasError("\\x2}"));
+ UNIT_ASSERT(HasError("\\x{{3}"));
+ UNIT_ASSERT(HasError("\\x{2a{5}"));
+
+ UNIT_ASSERT(HasError("\\x{}"));
+ UNIT_ASSERT(HasError("\\x{+3}"));
+ UNIT_ASSERT(HasError("\\x{-3}"));
+ UNIT_ASSERT(HasError("\\x{ 2F}"));
+ UNIT_ASSERT(HasError("\\x{2A F}"));
+ UNIT_ASSERT(HasError("\\x{2Arft}"));
+ UNIT_ASSERT(HasError("\\x{110000}"));
+
+ UNIT_ASSERT(!HasError("\\x{fB1}"));
+ UNIT_ASSERT(!HasError("\\x00"));
+ UNIT_ASSERT(!HasError("\\x{10FFFF}"));
+ }
+
+ Y_UNIT_TEST(OneCharacterRange)
+ {
+ SCANNER("[\\x{61}]") {
+ ACCEPTS("a");
+ ACCEPTS("bac");
+ DENIES("test");
+ }
+
+ SCANNER("[\\x3f]") {
+ ACCEPTS("?");
+ ACCEPTS("test?");
+ DENIES("test");
+ }
+ }
+
+ Y_UNIT_TEST(CharacterRange) {
+ REGEXP("[\\x{61}\\x62\\x{3f}\\x26]") {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("?");
+ ACCEPTS("acd");
+ ACCEPTS("bcd");
+ ACCEPTS("cd?");
+ ACCEPTS("ab?");
+ DENIES("cd");
+ }
+
+ REGEXP("[\\x{61}-\\x{63}]") {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("qwertya");
+ DENIES("d");
+ }
+
+ REGEXP("[\\x61-\\x61]") {
+ ACCEPTS("a");
+ ACCEPTS("qwertya");
+ DENIES("b");
+ }
+
+ REGEXP("[\\x26\\x{61}-\\x{62}\\x{3f}]") {
+ ACCEPTS("&");
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("?");
+ ACCEPTS("ade");
+ ACCEPTS("ab?");
+ DENIES("d");
+ }
+
+ REGEXP("[\\x{41}-\\x{42}\\x{61}-\\x{62}]") {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("A");
+ ACCEPTS("B");
+ DENIES("c");
+ DENIES("C");
+ }
+
+ REGEXP("[\\x{41}-\\x{42}][\\x{61}-\\x{62}]") {
+ ACCEPTS("Aa");
+ ACCEPTS("Ab");
+ ACCEPTS("Ba");
+ ACCEPTS("Bb");
+ DENIES("a");
+ DENIES("b");
+ DENIES("A");
+ DENIES("B");
+ DENIES("ab");
+ DENIES("AB");
+ DENIES("Ca");
+ }
+ }
+
+ Y_UNIT_TEST(RangeExcludeCharacters) {
+ REGEXP("[^\\x{61}]") {
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("aba");
+ DENIES("a");
+ DENIES("aaa");
+ }
+
+ REGEXP("[^\\x{61}-\\x{7a}]") {
+ ACCEPTS("A");
+ ACCEPTS("123");
+ ACCEPTS("acb1");
+ DENIES("a");
+ DENIES("abcxyz");
+ }
+ }
+
+ Y_UNIT_TEST(MixedRange) {
+ REGEXP("[\\x{61}B]") {
+ ACCEPTS("a");
+ ACCEPTS("B");
+ ACCEPTS("atestB");
+ DENIES("test");
+ }
+
+ REGEXP("[^\\x{61}A]") {
+ ACCEPTS("b");
+ ACCEPTS("B");
+ ACCEPTS("atestB");
+ DENIES("a");
+ DENIES("A");
+ DENIES("aaAA");
+ }
+
+ REGEXP("[0-9][\\x{61}-\\x{62}A-B]") {
+ ACCEPTS("0a");
+ ACCEPTS("1A");
+ ACCEPTS("5b");
+ ACCEPTS("9B");
+ ACCEPTS("1atestB");
+ ACCEPTS("2Atest");
+ DENIES("aB");
+ DENIES("testb");
+ DENIES("test");
+ }
+
+ REGEXP("[\\x{61}-c]") {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("testb");
+ DENIES("d");
+ }
+
+ REGEXP("[^a-\\x{7a}]") {
+ ACCEPTS("A");
+ ACCEPTS("123");
+ ACCEPTS("acb1");
+ DENIES("a");
+ DENIES("abcxyz");
+ }
+
+ REGEXP("[\\x{41}-Ba-\\x{62}]") {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("A");
+ ACCEPTS("B");
+ DENIES("c");
+ DENIES("C");
+ }
+ }
+
+ Y_UNIT_TEST(CompilingRange)
+ {
+ UNIT_ASSERT(HasError("[\\x41"));
+ UNIT_ASSERT(HasError("[\\xfq]"));
+ UNIT_ASSERT(HasError("[\\x{01}-]"));
+
+ UNIT_ASSERT(!HasError("[\\x{10FFFF}]"));
+ UNIT_ASSERT(!HasError("[\\x{00}]"));
+ UNIT_ASSERT(!HasError("[\\x{abc}-\\x{FFF}]"));
+
+ UNIT_ASSERT(!HasError("[^\\xFF]"));
+ UNIT_ASSERT(!HasError("[^\\x{FF}-\\x{FF0}]"));
+ UNIT_ASSERT(!HasError("[-\\x{01}]"));
+ }
+
+ Y_UNIT_TEST(UnicodeRepetition)
+ {
+ REGEXP("^\\x{78}{3,6}$") {
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxx");
+ ACCEPTS("xxxxxx");
+ DENIES ("xxxxxxx");
+ }
+
+ REGEXP("^x{3,}$") {
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxxxxxxxx");
+ ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+
+ REGEXP("^\\x{78}{3}$") {
+ DENIES ("x");
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ DENIES ("xxxx");
+ DENIES ("xxxxx");
+ DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+
+ REGEXP("^([\\x{78}-\\x{79}]){2}$") {
+ DENIES("x");
+ DENIES("y");
+ ACCEPTS("xx");
+ ACCEPTS("xy");
+ ACCEPTS("yx");
+ ACCEPTS("yy");
+ DENIES("xxy");
+ DENIES("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+ }
+
+ Y_UNIT_TEST(AnyUnicodeCodepointIsAllowed)
+ {
+ REGEXP("[\\x{0}-\\x{77}\\x{79}-\\x{10ffff}]") {
+ ACCEPTS("w");
+ DENIES ("x");
+ ACCEPTS("y");
+ }
+ }
}