aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/pire
diff options
context:
space:
mode:
authormonster <monster@ydb.tech>2022-07-07 14:41:37 +0300
committermonster <monster@ydb.tech>2022-07-07 14:41:37 +0300
commit06e5c21a835c0e923506c4ff27929f34e00761c2 (patch)
tree75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /contrib/libs/pire
parent03f024c4412e3aa613bb543cf1660176320ba8f4 (diff)
downloadydb-06e5c21a835c0e923506c4ff27929f34e00761c2.tar.gz
fix ya.make
Diffstat (limited to 'contrib/libs/pire')
-rw-r--r--contrib/libs/pire/Makefile.am2
-rw-r--r--contrib/libs/pire/README6
-rw-r--r--contrib/libs/pire/configure.ac47
-rw-r--r--contrib/libs/pire/pire/Makefile.am121
-rw-r--r--contrib/libs/pire/ut/approx_matching_ut.cpp379
-rw-r--r--contrib/libs/pire/ut/capture_ut.cpp299
-rw-r--r--contrib/libs/pire/ut/common.h224
-rw-r--r--contrib/libs/pire/ut/count_ut.cpp583
-rw-r--r--contrib/libs/pire/ut/easy_ut.cpp (renamed from contrib/libs/pire/pire/fwd.h)47
-rw-r--r--contrib/libs/pire/ut/glyph_ut.cpp63
-rw-r--r--contrib/libs/pire/ut/inline_ut.cpp91
-rw-r--r--contrib/libs/pire/ut/pire_ut.cpp888
-rw-r--r--contrib/libs/pire/ut/read_unicode_ut.cpp298
-rw-r--r--contrib/libs/pire/ut/stub/cppunit.h14
14 files changed, 2870 insertions, 192 deletions
diff --git a/contrib/libs/pire/Makefile.am b/contrib/libs/pire/Makefile.am
deleted file mode 100644
index a9e8908fb6..0000000000
--- a/contrib/libs/pire/Makefile.am
+++ /dev/null
@@ -1,2 +0,0 @@
-ACLOCAL_AMFLAGS = -I m4
-SUBDIRS = pire tests pkg samples
diff --git a/contrib/libs/pire/README b/contrib/libs/pire/README
deleted file mode 100644
index 1791486f8e..0000000000
--- a/contrib/libs/pire/README
+++ /dev/null
@@ -1,6 +0,0 @@
-This is PIRE, Perl Incompatible Regular Expressions library.
-
-For detailed information about what it is, how to build and use it,
-see http://wiki.yandex-team.ru/DmitrijjProkopcev/pire .
-
-Please report bugs to dprokoptsev@yandex-team.ru or davenger@yandex-team.ru.
diff --git a/contrib/libs/pire/configure.ac b/contrib/libs/pire/configure.ac
deleted file mode 100644
index 49f235129c..0000000000
--- a/contrib/libs/pire/configure.ac
+++ /dev/null
@@ -1,47 +0,0 @@
-AC_PREREQ([2.63])
-AC_INIT([pire], [0.0.2], [dprokoptsev@yandex-team.ru])
-AM_INIT_AUTOMAKE([foreign -Wall])
-AC_CONFIG_SRCDIR([pire/classes.cpp])
-AC_CONFIG_HEADERS([config.h])
-AC_CONFIG_MACRO_DIR([m4])
-
-AC_LANG_CPLUSPLUS
-
-# Require neccessary binaries to build ourselves
-AC_PROG_CXX
-AC_PROG_CC
-AC_PROG_LEX
-AC_PROG_YACC
-AC_PROG_LIBTOOL
-
-# Check for cppunit
-AM_PATH_CPPUNIT([0.0.0],[with_unittests=yes],[
- AC_WARN([cppunit not found. Unit tests will not compile and run.])
- with_unittests=no
-])
-AM_CONDITIONAL([WITH_UNITTESTS], [test x"$with_unittests" = xyes])
-
-# Just for conscience' sake
-AC_CHECK_HEADERS([stdlib.h string.h sys/time.h])
-AC_HEADER_STDBOOL
-AC_C_INLINE
-AC_TYPE_SIZE_T
-AC_CHECK_TYPES([ptrdiff_t])
-AC_FUNC_ERROR_AT_LINE
-AC_FUNC_MALLOC
-AC_CHECK_FUNCS([memset strchr])
-
-# Require little-endian platform
-AC_C_BIGENDIAN
-if test x"$ac_cv_c_bigendian" = xyes; then
- AC_ERROR([pire has not been ported to big-endian platforms yet.])
-fi
-
-# Optional features
-AC_ARG_ENABLE([extra], AS_HELP_STRING([--enable-extra], [Add extra functionality (capturing scanner, etc...)]))
-AC_ARG_ENABLE([debug], AS_HELP_STRING([--enable-debug], [Make Pire dump all constructed FSMs to std::clog (useless unless debugging Pire)]))
-AM_CONDITIONAL([ENABLE_EXTRA], [test x"$enable_extra" = xyes])
-AM_CONDITIONAL([ENABLE_DEBUG], [test x"$enable_debug" = xyes])
-
-AC_CONFIG_FILES([Makefile pire/Makefile tests/Makefile pkg/Makefile samples/Makefile samples/bench/Makefile])
-AC_OUTPUT
diff --git a/contrib/libs/pire/pire/Makefile.am b/contrib/libs/pire/pire/Makefile.am
deleted file mode 100644
index 09ef211704..0000000000
--- a/contrib/libs/pire/pire/Makefile.am
+++ /dev/null
@@ -1,121 +0,0 @@
-
-AM_CXXFLAGS = -Wall
-if ENABLE_DEBUG
-AM_CXXFLAGS += -DPIRE_DEBUG
-endif
-if ENABLE_CHECKED
-AM_CXXFLAGS += -DPIRE_CHECKED
-endif
-
-lib_LTLIBRARIES = libpire.la
-libpire_la_SOURCES = \
- align.h \
- any.h \
- classes.cpp \
- defs.h \
- determine.h \
- encoding.cpp \
- encoding.h \
- extra.h \
- fsm.cpp \
- fsm.h \
- fwd.h \
- glue.cpp \
- glue.h \
- minimize.h \
- half_final_fsm.cpp \
- half_final_fsm.h \
- partition.h \
- pire.h \
- re_lexer.cpp \
- re_lexer.h \
- run.h \
- scanner_io.cpp \
- vbitset.h \
- re_parser.ypp \
- scanners/half_final.h \
- scanners/loaded.h \
- scanners/multi.h \
- scanners/slow.h \
- scanners/simple.h \
- scanners/common.h \
- scanners/pair.h \
- stub/stl.h \
- stub/lexical_cast.h \
- stub/saveload.h \
- stub/singleton.h \
- stub/utf8.cpp \
- stub/utf8.h \
- stub/noncopyable.h \
- stub/codepage_h.h \
- stub/doccodes_h.h \
- stub/unidata_h.h \
- stub/unidata_cpp.h
-
-if ENABLE_EXTRA
-libpire_la_SOURCES += \
- extra/capture.cpp \
- extra/capture.h \
- extra/count.cpp \
- extra/count.h \
- extra/glyphs.cpp \
- extra/glyphs.h
-endif
-
-pire_hdrdir = $(includedir)/pire
-pire_hdr_HEADERS = \
- align.h \
- any.h \
- defs.h \
- determine.h \
- encoding.h \
- extra.h \
- fsm.h \
- fwd.h \
- glue.h \
- minimize.h \
- half_final_fsm.h \
- partition.h \
- pire.h \
- re_lexer.h \
- re_parser.h \
- run.h \
- static_assert.h \
- vbitset.h
-
-if ENABLE_EXTRA
-pire_extradir = $(includedir)/pire/extra
-pire_extra_HEADERS = \
- extra/capture.h \
- extra/count.h \
- extra/glyphs.h
-endif
-
-pire_scannersdir = $(includedir)/pire/scanners
-pire_scanners_HEADERS = \
- scanners/common.h \
- scanners/half_final.h \
- scanners/multi.h \
- scanners/slow.h \
- scanners/simple.h \
- scanners/loaded.h \
- scanners/pair.h
-
-pire_stubdir = $(includedir)/pire/stub
-pire_stub_HEADERS = \
- stub/stl.h \
- stub/defaults.h \
- stub/singleton.h \
- stub/saveload.h \
- stub/lexical_cast.h
-
-bin_PROGRAMS = pire_inline
-
-pire_inline_SOURCES = inline.lpp stub/hacks.h stub/memstreams.h
-pire_inline_LDADD = libpire.la
-
-BUILT_SOURCES = re_parser.h re_parser.cpp
-CLEANFILES = re_parser.h re_parser.cpp
-
-AM_YFLAGS = -d
-
diff --git a/contrib/libs/pire/ut/approx_matching_ut.cpp b/contrib/libs/pire/ut/approx_matching_ut.cpp
new file mode 100644
index 0000000000..3b4cb972f6
--- /dev/null
+++ b/contrib/libs/pire/ut/approx_matching_ut.cpp
@@ -0,0 +1,379 @@
+/*
+ * approx_matching_ut.cpp --
+ *
+ * Copyright (c) 2019 YANDEX LLC, Karina Usmanova <usmanova.karin@yandex.ru>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+
+#include <contrib/libs/pire/pire/pire.h>
+#include "common.h"
+
+Y_UNIT_TEST_SUITE(ApproxMatchingTest) {
+ // Parses `str` with a default-constructed lexer and returns the resulting Fsm.
+ Pire::Fsm BuildFsm(const char *str) {
+     Pire::Lexer parser;
+     TVector<wchar32> codepoints;
+     const char* const strEnd = str + strlen(str);
+     parser.Encoding().FromLocal(str, strEnd, std::back_inserter(codepoints));
+     parser.Assign(codepoints.begin(), codepoints.end());
+     return parser.Parse();
+ }
+
+ Y_UNIT_TEST(Simple) { // anchored literal "ab", checked at distance 1 and then at distance 2
+ auto fsm = BuildFsm("^ab$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("ab"); // exact match
+ ACCEPTS("ax");
+ ACCEPTS("xb");
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("xab");
+ ACCEPTS("axb");
+ ACCEPTS("abx");
+ ACCEPTS("aab");
+ DENIES("xy"); // both characters differ from "ab"
+ DENIES("abcd");
+ DENIES("xabx");
+ DENIES("");
+ }
+
+ fsm = BuildFsm("^ab$"); // same pattern again, now compiled with distance 2
+ APPROXIMATE_SCANNER(fsm, 2) {
+ ACCEPTS("ab");
+ ACCEPTS("xy"); // denied at distance 1 above
+ ACCEPTS(""); // denied at distance 1 above
+ ACCEPTS("axbx");
+ DENIES("xxabx");
+ DENIES("xbxxx");
+ }
+ }
+
+ Y_UNIT_TEST(SpecialSymbols) { // approximate matching against patterns that use regexp metacharacters
+ auto fsm = BuildFsm("^.*ab$"); // arbitrary prefix followed by "ab"
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("ab");
+ ACCEPTS("xxxxab");
+ ACCEPTS("xxxxabab");
+ DENIES("xxxx");
+ DENIES("abxxxx");
+ }
+
+ fsm = BuildFsm("^[a-c]$"); // single character class
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("/");
+ ACCEPTS("");
+ ACCEPTS("ax");
+ DENIES("xx");
+ DENIES("abc");
+ }
+
+ fsm = BuildFsm("^x{4}$"); // fixed repetition count
+ APPROXIMATE_SCANNER(fsm, 2) {
+ DENIES ("x");
+ ACCEPTS("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxx");
+ ACCEPTS("xxxxxx");
+ DENIES ("xxxxxxx");
+ ACCEPTS("xxyy");
+ ACCEPTS("xxyyx");
+ ACCEPTS("xxxxyz");
+ DENIES("xyyy");
+ }
+
+ fsm = BuildFsm("^(a|b)$"); // alternation of single characters
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("x");
+ ACCEPTS("");
+ ACCEPTS("ax");
+ DENIES("abc");
+ DENIES("xx");
+ }
+
+ fsm = BuildFsm("^(ab|cd)$"); // alternation of two-character literals
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("ab");
+ ACCEPTS("cd");
+ ACCEPTS("ax");
+ ACCEPTS("xd");
+ ACCEPTS("abx");
+ ACCEPTS("a");
+ DENIES("abcd");
+ DENIES("xx");
+ DENIES("");
+ }
+
+ fsm = BuildFsm("^[a-c]{3}$"); // character class with a repetition count
+ APPROXIMATE_SCANNER(fsm, 2) {
+ ACCEPTS("abc");
+ ACCEPTS("aaa");
+ ACCEPTS("a");
+ ACCEPTS("ax");
+ ACCEPTS("abxcx");
+ DENIES("x");
+ DENIES("");
+ DENIES("xaxx");
+ }
+
+ fsm = BuildFsm("^\\x{61}$"); // hex escape; 0x61 is 'a'
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("x");
+ ACCEPTS("");
+ ACCEPTS("ax");
+ DENIES("axx");
+ DENIES("xx");
+ }
+
+ fsm = BuildFsm("^a.bc$"); // single wildcard inside a literal
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("axxbc");
+ ACCEPTS("abc");
+ ACCEPTS("xabc");
+ ACCEPTS("xaxbc");
+ DENIES("bc");
+ DENIES("abcx");
+ }
+ }
+
+ Y_UNIT_TEST(TestSurrounded) { // approximate matching on FSMs that went through Surround()
+ auto fsm = BuildFsm("abc").Surround(); // unanchored pattern
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("abc");
+ ACCEPTS("xabcx");
+ ACCEPTS("xabx");
+ ACCEPTS("axc");
+ ACCEPTS("bac");
+ DENIES("a");
+ DENIES("xaxxxx");
+ }
+
+ fsm = BuildFsm("^abc$").Surround(); // explicitly anchored pattern, surrounded as well
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("abc");
+ ACCEPTS("abcx");
+ ACCEPTS("xabc");
+ ACCEPTS("axc");
+ ACCEPTS("bac");
+ DENIES("xabx"); // accepted by the unanchored variant above
+ DENIES("axx");
+ }
+ }
+
+ Y_UNIT_TEST(GlueFsm) { // approximate matching on FSMs combined with operator|
+ auto fsm = BuildFsm("^a$") | BuildFsm("^b$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("");
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("x");
+ ACCEPTS("ab");
+ DENIES("abb");
+ }
+
+ fsm = BuildFsm("^[a-b]$") | BuildFsm("^c{2}$"); // operands of different lengths
+ APPROXIMATE_SCANNER(fsm, 1) { // this group contains positive checks only
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("cc");
+ ACCEPTS("x");
+ ACCEPTS("xa");
+ ACCEPTS("c");
+ ACCEPTS("xc");
+ ACCEPTS("cxc");
+ ACCEPTS("");
+ }
+ }
+
+ enum MutateOperation { // single-character mutations applied by ChangeText()
+ Begin, // inclusive lower bound for loops over the operations
+ Substitute = Begin, // overwrite one character
+ Delete, // erase one character
+ Insert, // insert one character
+ End // exclusive upper bound for loops over the operations
+ };
+
+ // Returns a copy of `text` with one mutation applied at index `pos`:
+ // Substitute overwrites that character with 'x', Delete erases it and
+ // Insert places an 'x' at that index. Any other `operation` value
+ // returns the copy unchanged.
+ ystring ChangeText(const ystring& text, int operation, int pos)
+ {
+     ystring mutated = text;
+     if (operation == MutateOperation::Substitute) {
+         mutated[pos] = 'x';
+     } else if (operation == MutateOperation::Delete) {
+         mutated.erase(pos, 1);
+     } else if (operation == MutateOperation::Insert) {
+         mutated.insert(pos, 1, 'x');
+     }
+
+     return mutated;
+ }
+
+ Y_UNIT_TEST(StressTest) { // every single mutation passes at distance 1 and fails at 0; every pair passes at 2 and fails at 1
+ ystring text;
+ for (size_t letter = 0; letter < 10; ++letter) { // ten iterations, letters 'a' onwards
+ text += ystring(3, letter + 'a'); // presumably appends three copies of the letter, as with std::string(count, ch)
+ }
+ const ystring regexp = "^" + text + "$";
+ auto fsm = BuildFsm(regexp.Data());
+
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { // regexp.size() - 2 == text.size(): the "^" and "$" are excluded
+ for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
+ auto changedText = ChangeText(text, operation, pos);
+ ACCEPTS(changedText);
+ }
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 0) { // distance 0: the same single mutations must all be rejected
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { // same bound as above, i.e. every index of text
+ for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
+ auto changedText = ChangeText(text, operation, pos);
+ DENIES(changedText);
+ }
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 2) { // distance 2: one mutation in each half of the text
+ ACCEPTS(text);
+
+ for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
+ size_t posRight = text.size() - posLeft - 1; // mirror position counted from the end
+ for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
+ for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
+ auto changedText = ChangeText(text, operationRight, posRight); // right side first, while posRight still indexes the unmodified text
+ changedText = ChangeText(changedText, operationLeft, posLeft);
+ ACCEPTS(changedText);
+ }
+ }
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 1) { // distance 1: the same double mutations must all be rejected
+ ACCEPTS(text);
+
+ for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
+ size_t posRight = text.size() - posLeft - 1;
+ for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
+ for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
+ auto changedText = ChangeText(text, operationRight, posRight);
+ changedText = ChangeText(changedText, operationLeft, posLeft);
+ DENIES(changedText);
+ }
+ }
+ }
+ }
+ }
+
+ Y_UNIT_TEST(SwapLetters) { // inputs that differ from the pattern by swapped adjacent letters
+ auto fsm = BuildFsm("^abc$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("bac"); // "ab" swapped
+ ACCEPTS("acb"); // "bc" swapped
+ DENIES("cba");
+ DENIES("bax");
+ }
+
+ fsm = BuildFsm("^abcd$");
+ APPROXIMATE_SCANNER(fsm, 2) {
+ ACCEPTS("bacd");
+ ACCEPTS("acbd");
+ ACCEPTS("baxd");
+ ACCEPTS("badc");
+ ACCEPTS("bcad");
+ ACCEPTS("bcda");
+ DENIES("xcbx");
+ DENIES("baxx");
+ DENIES("ba");
+ DENIES("cdab");
+ }
+
+ fsm = BuildFsm("^abc$");
+ APPROXIMATE_SCANNER(fsm, 0) { // distance 0: a swap is not tolerated
+ ACCEPTS("abc");
+ DENIES("bac");
+ }
+
+ fsm = BuildFsm("^[a-c][1-3]$"); // two different character classes
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a3");
+ ACCEPTS("c");
+ ACCEPTS("1");
+ ACCEPTS("1a");
+ ACCEPTS("3b");
+ DENIES("4a");
+ }
+
+ fsm = BuildFsm("^.*abc$"); // arbitrary prefix before the literal
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("ab");
+ ACCEPTS("xxxxbac");
+ DENIES("xxxxa");
+ DENIES("xxxxcb");
+ }
+ }
+
+ Y_UNIT_TEST(SwapStressTest){ // every adjacent swap in a 30-letter text passes at distance 1 and fails at distance 0
+ ystring text;
+ for (size_t letter = 0; letter < 30; ++letter) {
+ text += ystring(1, (letter % 26) + 'a'); // wraps around to 'a' after 'z'
+ }
+ const ystring regexp = "^" + text + "$";
+ auto fsm = BuildFsm(regexp.Data());
+ auto changedText = text; // scratch copy; each iteration swaps two letters and then restores them
+
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < text.size() - 1; ++pos) {
+ changedText[pos] = text[pos + 1]; // swap positions pos and pos + 1
+ changedText[pos + 1] = text[pos];
+ ACCEPTS(changedText);
+ changedText[pos] = text[pos]; // undo the swap before the next iteration
+ changedText[pos + 1] = text[pos + 1];
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 0) { // distance 0: the same swaps must be rejected
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < text.size() - 1; ++pos) {
+ changedText[pos] = text[pos + 1];
+ changedText[pos + 1] = text[pos];
+ DENIES(changedText);
+ changedText[pos] = text[pos];
+ changedText[pos + 1] = text[pos + 1];
+ }
+ }
+ }
+}
diff --git a/contrib/libs/pire/ut/capture_ut.cpp b/contrib/libs/pire/ut/capture_ut.cpp
new file mode 100644
index 0000000000..3d339c5601
--- /dev/null
+++ b/contrib/libs/pire/ut/capture_ut.cpp
@@ -0,0 +1,299 @@
+/*
+ * capture_ut.cpp --
+ *
+ * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
+ * Alexander Gololobov <agololobov@gmail.com>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+
+#include <stub/hacks.h>
+#include <stub/saveload.h>
+#include <stub/utf8.h>
+#include <stub/memstreams.h>
+#include "stub/cppunit.h"
+#include <pire.h>
+#include <extra.h>
+#include <string.h>
+
+Y_UNIT_TEST_SUITE(TestPireCapture) {
+
+ using Pire::CapturingScanner;
+ using Pire::SlowCapturingScanner;
+ typedef Pire::CapturingScanner::State State;
+
+ // Builds a case-insensitive CapturingScanner for `regexp` with the Capture
+ // feature set to `index`; the Fsm goes through Surround() and Determine() first.
+ CapturingScanner Compile(const char* regexp, int index)
+ {
+     Pire::Lexer lexer;
+     lexer.Assign(regexp, regexp + strlen(regexp));
+     lexer.AddFeature(Pire::Features::CaseInsensitive());
+     lexer.AddFeature(Pire::Features::Capture(static_cast<size_t>(index)));
+
+     Pire::Fsm parsed = lexer.Parse();
+     parsed.Surround();
+     parsed.Determine();
+     return parsed.Compile<Pire::CapturingScanner>();
+ }
+
+ // Builds a SlowCapturingScanner with the Capture feature set to `index`; `regexp` is decoded with `encoding`.
+ SlowCapturingScanner SlowCompile(const char* regexp, int index, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) {
+     Pire::Lexer lexer;
+     lexer.AddFeature(Pire::Features::Capture(static_cast<size_t>(index)));
+     lexer.SetEncoding(encoding);
+     TVector<wchar32> codepoints;
+     encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(codepoints));
+     lexer.Assign(codepoints.begin(), codepoints.end());
+     Pire::Fsm parsed = lexer.Parse();
+     parsed.Surround();
+     return parsed.Compile<Pire::SlowCapturingScanner>();
+ }
+
+ // Steps `scanner` through BeginMark, runs it over `str`, steps through EndMark; returns the final state.
+ State RunRegexp(const CapturingScanner& scanner, const char* str) {
+     State current;
+     scanner.Initialize(current);
+     Step(scanner, current, Pire::BeginMark);
+     Run(scanner, current, str, str + strlen(str));
+     Step(scanner, current, Pire::EndMark);
+     return current;
+ }
+
+ // Runs `scanner` over `str` (no BeginMark/EndMark steps here) and returns the final state.
+ SlowCapturingScanner::State RunRegexp(const SlowCapturingScanner& scanner, const char* str) {
+     SlowCapturingScanner::State current;
+     scanner.Initialize(current);
+     Run(scanner, current, str, str + strlen(str));
+     return current;
+ }
+
+ // Returns str[Begin() - 1, End() - 1) when `state` holds a capture, otherwise an empty string.
+ ystring Captured(const State& state, const char* str)
+ {
+     if (!state.Captured())
+         return ystring();
+     return ystring(str + state.Begin() - 1, str + state.End() - 1);
+ }
+
+ Y_UNIT_TEST(Trivial) // captures group 1 of a google_id assignment, at the start and in the middle of the input
+ {
+ CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1);
+ State state;
+ const char* str;
+
+ str = "google_id = 'abcde';";
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
+
+ str = "var google_id = 'abcde'; eval(google_id);"; // match surrounded by other text
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
+
+ str = "google_id != 'abcde';"; // "!=" does not fit the pattern, so nothing is captured
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(!state.Captured());
+ }
+
+ Y_UNIT_TEST(Sequential) // input with two candidate assignments: checks which one ends up captured
+ {
+ CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1);
+ State state;
+ const char* str;
+
+ str = "google_id = 'abcde'; google_id = 'xyz';"; // both assignments fit; the first is expected
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("abcde"));
+
+ str = "var google_id = 'abc de'; google_id = 'xyz';"; // the space in 'abc de' is outside [a-z0-9], so the second is expected
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("xyz"));
+ }
+
+ Y_UNIT_TEST(NegatedTerminator) // capture group followed by a negated character class
+ {
+ CapturingScanner scanner = Compile("=(\\d+)[^\\d]", 1);
+ State state;
+ const char* str;
+
+ str = "=12345;"; // ';' is the non-digit terminator
+ state = RunRegexp(scanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_EQUAL(Captured(state, str), ystring("12345"));
+ }
+
+ Y_UNIT_TEST(Serialization) // Save/Load and Mmap round-trips of capturing scanners, plus rejection of misaligned Mmap
+ {
+ const char* regex = "google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;";
+ CapturingScanner scanner2 = Compile(regex, 1);
+ SlowCapturingScanner slowScanner2 = SlowCompile(regex, 1);
+ BufferOutput wbuf, wbuf2;
+ ::Save(&wbuf, scanner2);
+ ::Save(&wbuf2, slowScanner2);
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ MemoryInput rbuf2(wbuf2.Buffer().Data(), wbuf2.Buffer().Size());
+ CapturingScanner scanner; // default-constructed, then filled by Load()
+ SlowCapturingScanner slowScanner;
+ ::Load(&rbuf, scanner);
+ ::Load(&rbuf2, slowScanner);
+
+ State state;
+ SlowCapturingScanner::State slowState;
+ const char* str;
+
+ str = "google_id = 'abcde';"; // the loaded scanners must capture as the originals would
+ state = RunRegexp(scanner, str);
+ slowState = RunRegexp(slowScanner, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
+ SlowCapturingScanner::SingleState final;
+ UNIT_ASSERT(slowScanner.GetCapture(slowState, final));
+ ystring ans(str, final.GetBegin(), final.GetEnd() - final.GetBegin());
+ UNIT_ASSERT_EQUAL(ans, ystring("abcde"));
+
+ str = "google_id != 'abcde';"; // non-matching input: neither scanner may report a capture
+ state = RunRegexp(scanner, str);
+ slowState = RunRegexp(slowScanner, str);
+ UNIT_ASSERT(!state.Captured());
+ UNIT_ASSERT(!slowScanner.GetCapture(slowState, final));
+
+ CapturingScanner scanner3;
+ const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
+ TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); // room for the image, alignment slack and the largest test offset
+ const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ const void* tail = scanner3.Mmap(ptr, wbuf.Buffer().Size());
+ UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); // Mmap must return the address just past the image
+
+ str = "google_id = 'abcde';";
+ state = RunRegexp(scanner3, str);
+ UNIT_ASSERT(state.Captured());
+ UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
+
+ str = "google_id != 'abcde';";
+ state = RunRegexp(scanner3, str);
+ UNIT_ASSERT(!state.Captured());
+
+ ptr = (const void*) ((const char*) wbuf.Buffer().Data() + 1); // one byte past the buffer start
+ try {
+ scanner3.Mmap(ptr, wbuf.Buffer().Size());
+ UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping"); // reached only if Mmap did not throw
+ }
+ catch (Pire::Error&) {} // expected outcome
+
+ for (size_t offset = 1; offset < MaxTestOffset; ++offset) {
+ ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ try {
+ scanner3.Mmap(ptr, wbuf.Buffer().Size());
+ if (offset % sizeof(size_t) != 0) { // Mmap returned although the offset is not a multiple of sizeof(size_t)
+ UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping");
+ } else {
+ str = "google_id = 'abcde';";
+ state = RunRegexp(scanner3, str);
+ UNIT_ASSERT(state.Captured());
+ }
+ }
+ catch (Pire::Error&) {} // NOTE(review): also swallows a Pire::Error thrown for an offset that is a multiple of sizeof(size_t)
+ }
+ }
+
+ Y_UNIT_TEST(Empty) // a default-constructed CapturingScanner must survive running, Save/Load and Mmap
+ {
+ Pire::CapturingScanner sc;
+ UNIT_ASSERT(sc.Empty());
+
+ UNIT_CHECKPOINT(); RunRegexp(sc, "a string"); // Just should not crash
+
+ // Test Save/Load/Mmap
+ BufferOutput wbuf;
+ ::Save(&wbuf, sc);
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ Pire::CapturingScanner sc3;
+ ::Load(&rbuf, sc3);
+ UNIT_CHECKPOINT(); RunRegexp(sc3, "a string"); // result is not inspected
+
+ const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
+ TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
+ const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+
+ Pire::CapturingScanner sc4;
+ const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
+ UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); // Mmap must return the address just past the image
+ UNIT_CHECKPOINT(); RunRegexp(sc4, "a string");
+ }
+
+ // Compiles `regexp` via SlowCompile(regexp, position, encoding), runs it over `text` and asserts that a capture is reported iff `ans`; when it is, the captured text must equal `captured`.
+ void MakeSlowCapturingTest(const char* regexp, const char* text, size_t position, bool ans, const ystring& captured = ystring(""), const Pire::Encoding& encoding = Pire::Encodings::Utf8()) {
+     Pire::SlowCapturingScanner scanner = SlowCompile(regexp, position, encoding);
+     SlowCapturingScanner::State finalState = RunRegexp(scanner, text);
+     SlowCapturingScanner::SingleState capture;
+     const bool gotCapture = scanner.GetCapture(finalState, capture);
+     if (!ans) {
+         UNIT_ASSERT(!gotCapture);
+         return;
+     }
+     UNIT_ASSERT(gotCapture);
+     ystring answer(text, capture.GetBegin(), capture.GetEnd() - capture.GetBegin());
+     UNIT_ASSERT_EQUAL(answer, captured);
+ }
+
+ Y_UNIT_TEST(SlowCapturingNonGreedy) // with a non-greedy ".*?" prefix the capture starts at the first "pref"
+ {
+ const char* regexp = ".*?(pref.*suff)";
+ const char* text = "pref ala bla pref cla suff dla";
+ MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref ala bla pref cla suff"));
+ }
+
+ Y_UNIT_TEST(SlowCaptureGreedy) // with a greedy ".*" prefix the capture starts at the last "pref"
+ {
+ const char* regexp = ".*(pref.*suff)";
+ const char* text = "pref ala bla pref cla suff dla";
+ MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref cla suff"));
+ }
+
+ Y_UNIT_TEST(SlowCaptureInOr) // capture group in one branch of an alternation whose branches both match
+ {
+ const char* regexp = "(A)|A"; // group in the first branch: a capture is expected
+ const char* text = "A";
+ MakeSlowCapturingTest(regexp, text, 1, true, ystring("A"));
+ const char* regexp2 = "A|(A)"; // group in the second branch: no capture is expected
+ MakeSlowCapturingTest(regexp2, text, 1, false);
+ }
+
+ Y_UNIT_TEST(SlowCapturing) // captures group 2 (the numeric id) of a URL pattern that has three groups
+ {
+ const char* regexp = "^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)";
+ const char* text = "http://vkontakte.ru/id100500";
+ MakeSlowCapturingTest(regexp, text, 2, true, ystring("100500"));
+ }
+
+ Y_UNIT_TEST(Utf_8) // pattern and text contain multi-byte sequences written as \x escapes; uses the default Utf8 encoding
+ {
+ const char* regexp = "\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, ((\\s|\\w|[()]|-)+)!";
+ const char* text =" \xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, \xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)! ";
+ const char* ans = "\xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)"; // expected content of group 1
+ MakeSlowCapturingTest(regexp, text, 1, true, ystring(ans));
+ }
+}
diff --git a/contrib/libs/pire/ut/common.h b/contrib/libs/pire/ut/common.h
new file mode 100644
index 0000000000..d79eedafb7
--- /dev/null
+++ b/contrib/libs/pire/ut/common.h
@@ -0,0 +1,224 @@
+/*
+ * common.h --
+ *
+ * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
+ * Alexander Gololobov <agololobov@gmail.com>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+
+#ifndef PIRE_TEST_COMMON_H_INCLUDED
+#define PIRE_TEST_COMMON_H_INCLUDED
+
+#include <stdio.h>
+#include <pire.h>
+#include <stub/stl.h>
+#include <stub/defaults.h>
+#include <stub/lexical_cast.h>
+#include "stub/cppunit.h"
+
+using namespace Pire;
+
+/*****************************************************************************
+* Helpers
+*****************************************************************************/
+
+// Parses `str` into an Fsm. Each character of `options` is a flag: 'i' adds
+// case-insensitivity, 'u' selects UTF-8, 'a' adds and-not support, 'n' skips the
+// final Surround(); any other character throws std::invalid_argument. When `enc`
+// is non-null, the address of the lexer's encoding is stored through it.
+inline Pire::Fsm ParseRegexp(const char* str, const char* options = "", const Pire::Encoding** enc = 0)
+{
+    Pire::Lexer lexer;
+    bool wantSurround = true;
+
+    for (const char* flag = options; *flag; ++flag) {
+        switch (*flag) {
+        case 'i': lexer.AddFeature(Pire::Features::CaseInsensitive()); break;
+        case 'u': lexer.SetEncoding(Pire::Encodings::Utf8()); break;
+        case 'n': wantSurround = false; break;
+        case 'a': lexer.AddFeature(Pire::Features::AndNotSupport()); break;
+        default: throw std::invalid_argument("Unknown option: " + ystring(1, *flag));
+        }
+    }
+    if (enc)
+        *enc = &lexer.Encoding();
+
+    TVector<wchar32> codepoints;
+    lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(codepoints));
+    lexer.Assign(codepoints.begin(), codepoints.end());
+
+    Pire::Fsm parsed = lexer.Parse();
+    if (wantSurround)
+        parsed.Surround();
+    return parsed;
+}
+
+inline bool HasError(const char* regexp) {
+ try {
+ ParseRegexp(regexp);
+ return false;
+ } catch (Pire::Error& ex) {
+ return true;
+ }
+}
+
+struct Scanners { // one compiled scanner of each type under test; ACCEPTS/DENIES check all ten members
+ Pire::Scanner fast;
+ Pire::NonrelocScanner nonreloc;
+ Pire::SimpleScanner simple;
+ Pire::SlowScanner slow;
+ Pire::ScannerNoMask fastNoMask;
+ Pire::NonrelocScannerNoMask nonrelocNoMask;
+ Pire::HalfFinalScanner halfFinal;
+ Pire::HalfFinalScannerNoMask halfFinalNoMask;
+ Pire::NonrelocHalfFinalScanner nonrelocHalfFinal;
+ Pire::NonrelocHalfFinalScannerNoMask nonrelocHalfFinalNoMask;
+
+ Scanners(const Pire::Fsm& fsm, size_t distance = 0) // every member compiles its own copy of `fsm`, passing `distance` to Compile()
+ : fast(Pire::Fsm(fsm).Compile<Pire::Scanner>(distance))
+ , nonreloc(Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>(distance))
+ , simple(Pire::Fsm(fsm).Compile<Pire::SimpleScanner>(distance))
+ , slow(Pire::Fsm(fsm).Compile<Pire::SlowScanner>(distance))
+ , fastNoMask(Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>(distance))
+ , nonrelocNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>(distance))
+ , halfFinal(Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>(distance))
+ , halfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>(distance))
+ , nonrelocHalfFinal(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>(distance))
+ , nonrelocHalfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>(distance))
+ {}
+
+ Scanners(const char* str, const char* options = "") // parses `str` with ParseRegexp(str, options), then compiles without a distance argument
+ {
+ Pire::Fsm fsm = ParseRegexp(str, options);
+ fast = Pire::Fsm(fsm).Compile<Pire::Scanner>(); // members were default-constructed and are assigned here
+ nonreloc = Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>();
+ simple = Pire::Fsm(fsm).Compile<Pire::SimpleScanner>();
+ slow = Pire::Fsm(fsm).Compile<Pire::SlowScanner>();
+ fastNoMask = Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>();
+ nonrelocNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>();
+ halfFinal = Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>();
+ halfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>();
+ nonrelocHalfFinal = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>();
+ nonrelocHalfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>();
+ }
+};
+
+#ifdef PIRE_DEBUG
+
+template <class Scanner> // debug-only helper (inside #ifdef PIRE_DEBUG)
+inline ystring DbgState(const Scanner& scanner, typename Scanner::State state)
+{
+ return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring()); // state index, tagged " [final]" when Final(state) holds
+}
+/*
+inline ystring DbgState(const Pire::SimpleScanner& scanner, Pire::SimpleScanner::State state)
+{
+ return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring());
+}
+*/
+inline ystring DbgState(const Pire::SlowScanner& scanner, const Pire::SlowScanner::State& state) // SlowScanner overload: formats the whole state.states range
+{
+ return ystring("(") + Join(state.states.begin(), state.states.end(), ", ") + ystring(")") + (scanner.Final(state) ? ystring(" [final]") : ystring()); // "(s1, s2, ...)" plus " [final]" when Final(state) holds
+}
+
+template<class Scanner> // tracing replacement for Run(); the macro after this function maps Run to it in PIRE_DEBUG builds
+void DbgRun(const Scanner& scanner, typename Scanner::State& state, const char* begin, const char* end)
+{
+ for (; begin != end; ++begin) {
+ char tmp[8]; // printable form of the current input character
+ if (*begin >= 32) { // compared as plain char; values from 32 upwards are shown verbatim
+ tmp[0] = *begin;
+ tmp[1] = 0;
+ } else
+ snprintf(tmp, sizeof(tmp)-1, "\\%03o", (unsigned char) *begin); // everything else becomes a backslash plus three octal digits
+ std::clog << DbgState(scanner, state) << " --[" << tmp << "]--> "; // state before the transition
+ scanner.Next(state, (unsigned char) *begin);
+ std::clog << DbgState(scanner, state) << "\n"; // state after the transition
+ }
+}
+
+#define Run DbgRun
+#endif
+
+// Initializes a state, steps through BeginMark, runs over [c_str(), c_str() + length()) and steps through EndMark.
+template<class Scanner>
+typename Scanner::State RunRegexp(const Scanner& scanner, const ystring& str)
+{
+    PIRE_IFDEBUG(std::clog << "--- checking against " << str << "\n");
+    typename Scanner::State current;
+    scanner.Initialize(current);
+    Step(scanner, current, BeginMark);
+    Run(scanner, current, str.c_str(), str.c_str() + str.length());
+    Step(scanner, current, EndMark);
+    return current;
+}
+
+template<class Scanner> // C-string convenience overload
+typename Scanner::State RunRegexp(const Scanner& scanner, const char* str)
+{
+ return RunRegexp(scanner, ystring(str)); // forwards to the ystring overload
+}
+
+// True when the state reached on `str` yields a non-empty AcceptedRegexps() range.
+template<class Scanner>
+bool Matches(const Scanner& scanner, const ystring& str) {
+    auto finalState = RunRegexp(scanner, str);
+    auto accepted = scanner.AcceptedRegexps(finalState);
+    return accepted.first != accepted.second;
+}
+
+template<class Scanner> // C-string convenience overload
+bool Matches(const Scanner& scanner, const char* str)
+{
+ return Matches(scanner, ystring(str)); // forwards to the ystring overload
+}
+
+#define SCANNER(fsm) for (Scanners m_scanners(fsm), *m_flag = &m_scanners; m_flag; m_flag = 0) // one-pass for loop: declares m_scanners for the block that follows; m_flag = 0 ends it
+#define APPROXIMATE_SCANNER(fsm, distance) for (Scanners m_scanners(fsm, distance), *m_flag = &m_scanners; m_flag; m_flag = 0) // same, passing `distance` to the Scanners constructor
+#define REGEXP(pattern) for (Scanners m_scanners(pattern), *m_flag = &m_scanners; m_flag; m_flag = 0) // same, constructing Scanners from `pattern` alone
+#define REGEXP2(pattern,flags) for (Scanners m_scanners(pattern, flags), *m_flag = &m_scanners; m_flag; m_flag = 0) // same, constructing Scanners from `pattern` and `flags`
+#define ACCEPTS(str) /* asserts that every scanner in m_scanners matches str; needs an enclosing SCANNER/REGEXP-style block */ \
+ do {\
+ UNIT_ASSERT(Matches(m_scanners.fast, str));\
+ UNIT_ASSERT(Matches(m_scanners.nonreloc, str));\
+ UNIT_ASSERT(Matches(m_scanners.simple, str));\
+ UNIT_ASSERT(Matches(m_scanners.slow, str));\
+ UNIT_ASSERT(Matches(m_scanners.fastNoMask, str));\
+ UNIT_ASSERT(Matches(m_scanners.nonrelocNoMask, str));\
+ UNIT_ASSERT(Matches(m_scanners.halfFinal, str));\
+ UNIT_ASSERT(Matches(m_scanners.halfFinalNoMask, str));\
+ UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinal, str));\
+ UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinalNoMask, str));\
+ } while (false)
+
+#define DENIES(str) /* asserts that no scanner in m_scanners matches str; needs an enclosing SCANNER/REGEXP-style block */ \
+ do {\
+ UNIT_ASSERT(!Matches(m_scanners.fast, str));\
+ UNIT_ASSERT(!Matches(m_scanners.nonreloc, str));\
+ UNIT_ASSERT(!Matches(m_scanners.simple, str));\
+ UNIT_ASSERT(!Matches(m_scanners.slow, str));\
+ UNIT_ASSERT(!Matches(m_scanners.fastNoMask, str));\
+ UNIT_ASSERT(!Matches(m_scanners.nonrelocNoMask, str));\
+ UNIT_ASSERT(!Matches(m_scanners.halfFinal, str));\
+ UNIT_ASSERT(!Matches(m_scanners.halfFinalNoMask, str));\
+ UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinal, str));\
+ UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinalNoMask, str));\
+ } while (false)
+
+
+#endif
diff --git a/contrib/libs/pire/ut/count_ut.cpp b/contrib/libs/pire/ut/count_ut.cpp
new file mode 100644
index 0000000000..ffe7943fcc
--- /dev/null
+++ b/contrib/libs/pire/ut/count_ut.cpp
@@ -0,0 +1,583 @@
+/*
+ * count_ut.cpp --
+ *
+ * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
+ * Alexander Gololobov <agololobov@gmail.com>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+
+#include <stub/hacks.h>
+#include <stub/saveload.h>
+#include <stub/utf8.h>
+#include <stub/memstreams.h>
+#include "stub/cppunit.h"
+#include <pire.h>
+#include <extra.h>
+#include <string.h>
+
+
+Y_UNIT_TEST_SUITE(TestCount) {
+
+ Pire::Fsm MkFsm(const char* regexp, const Pire::Encoding& encoding) // Decodes regexp with encoding, then lexes and parses it into an Fsm.
+ {
+ TVector<wchar32> codepoints; // regexp converted by encoding.FromLocal
+ encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(codepoints));
+ Pire::Lexer lexer;
+ lexer.SetEncoding(encoding);
+ lexer.Assign(codepoints.begin(), codepoints.end());
+ return lexer.Parse();
+ }
+
+ template<class Scanner> // Returns a State that scanner.Initialize() has been applied to.
+ typename Scanner::State InitializedState(const Scanner& scanner)
+ {
+ typename Scanner::State fresh; // default-constructed; filled in by Initialize below
+ scanner.Initialize(fresh);
+ return fresh;
+ }
+
+ template<class Scanner> // Feeds BeginMark, text[0, len) and EndMark to scanner; returns the resulting state.
+ typename Scanner::State Run(const Scanner& scanner, const char* text, size_t len =-1)
+ {
+ const char* const textEnd = text + (len == (size_t)-1 ? strlen(text) : len); // len == -1 means: measure text with strlen
+ typename Scanner::State current = InitializedState(scanner);
+ Pire::Step(scanner, current, Pire::BeginMark);
+ Pire::Run(scanner, current, text, textEnd);
+ Pire::Step(scanner, current, Pire::EndMark);
+ return current;
+ }
+
+ template<class Scanner> // Builds a Scanner from (regexp, separator), runs it over text and returns Result(0).
+ size_t CountOne(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8())
+ {
+ const Pire::Fsm patternFsm = MkFsm(regexp, encoding);
+ const Pire::Fsm delimiterFsm = MkFsm(separator, encoding);
+ return Run(Scanner{patternFsm, delimiterFsm}, text, len).Result(0);
+ }
+
+ size_t Count(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) // Counts with several scanner types, asserts they agree, returns the AdvancedCountingScanner result.
+ {
+ const auto regexpFsm = MkFsm(regexp, encoding);
+ const auto separatorFsm = MkFsm(separator, encoding);
+ auto countingResult = Run(Pire::CountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
+ auto newResult = Run(Pire::AdvancedCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
+ if (strcmp(separator, ".*") == 0) { // HalfFinalScanner is built from the regexp FSM alone, so it is only cross-checked when the separator is literally ".*"
+ HalfFinalFsm fsm(regexpFsm);
+ fsm.MakeGreedyCounter(true);
+ auto halfFinalSimpleResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0);
+ fsm = HalfFinalFsm(regexpFsm); // start again from the untouched regexp FSM before applying the other counter mode
+ fsm.MakeGreedyCounter(false);
+ auto halfFinalCorrectResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0);
+ UNIT_ASSERT_EQUAL(halfFinalSimpleResult, halfFinalCorrectResult);
+ UNIT_ASSERT_EQUAL(halfFinalSimpleResult, countingResult);
+ }
+ UNIT_ASSERT_EQUAL(countingResult, newResult);
+ auto noGlueLimitResult = Run(Pire::NoGlueLimitCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
+ UNIT_ASSERT_EQUAL(countingResult, noGlueLimitResult);
+ return newResult;
+ }
+
+ Y_UNIT_TEST(Count) // Expected match counts for ASCII, embedded-NUL and multi-byte (octal-escaped) inputs.
+ {
+ UNIT_ASSERT_EQUAL(Count("[a-z]+", "\\s", "abc def, abc def ghi, abc"), size_t(3));
+ char aaa[] = "abc def\0 abc\0 def ghi, abc"; // contains embedded NULs; sizeof(aaa) below passes them and the terminating NUL as input
+ UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa), Pire::Encodings::Latin1()), size_t(6));
+ UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa)), size_t(6));
+ UNIT_ASSERT_EQUAL(Count("\\w", "", "abc abcdef abcd abcdefgh ac"), size_t(8));
+ UNIT_ASSERT_EQUAL(Count("http", ".*", "http://aaa, http://bbb, something in the middle, http://ccc, end"), size_t(3));
+ UNIT_ASSERT_EQUAL(Count("abc", ".*", "abcabcabcabc"), size_t(4));
+ UNIT_ASSERT_EQUAL(Count("[\320\220-\320\257\320\260-\321\217]+", "\\s+", " \320\257\320\275\320\264\320\265\320\272\321\201 "
+ "\320\237\320\276\320\262\320\265\321\200\320\275\321\203\321\202\321\214 \320\222\320\276\320\271\321\202\320\270\302\240"
+ "\320\262\302\240\320\277\320\276\321\207\321\202\321\203 \302\251\302\240" "1997\342\200\224" "2008 "
+ "\302\253\320\257\320\275\320\264\320\265\320\272\321\201\302\273 \320\224\320\270\320\267\320\260\320\271\320\275\302"
+ "\240\342\200\224\302\240\320\241\321\202\321\203\320\264\320\270\321\217 \320\220\321\200\321\202\320\265\320\274\320\270"
+ "\321\217\302\240\320\233\320\265\320\261\320\265\320\264\320\265\320\262\320\260\012\012"), size_t(5));
+ UNIT_ASSERT_EQUAL(Count("\321\201\320\265\320\272\321\201", ".*",
+ "\320\277\320\276\321\200\320\275\320\276, \320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 "
+ "\320\264\320\265\321\202\320\270, \320\264\320\265\321\202\320\270 \320\277\320\276\321\200\320\275\320\276 "
+ "\320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265. "
+ "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265 \320\262\320\270\320\264\320\265\320\276 "
+ "\320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270. \320\264\320\265\321\202\320\270 "
+ "\320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265\320\276 "
+ "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265!<br> "
+ "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\264\320\273\321\217 \320\277\320\276\320\264 "
+ "\321\201\320\265\320\272\321\201\320\260 \320\277\320\260\321\200\320\276\320\271 "
+ "\321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201 \320\270\321\211\320\265\320\274 "
+ "\320\272\320\260\320\271\321\204\320\276\320\274. \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 "
+ "\320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 "
+ "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277\320\260\321\200\320\276\320\271 "
+ "\320\270\321\211\320\265\320\274 \321\201 \320\264\320\273\321\217 \321\201\320\265\320\272\321\201\320\260!<br> "
+ "\321\202\320\270\321\202\321\214\320\272\320\270 \320\261\320\276\320\273\321\214\321\210\320\270\320\265. "
+ "\320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 \320\264\320\265\321\202\320\270!<br> "
+ "\320\270\321\211\320\265\320\274 \321\201 \320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 "
+ "\321\201\320\265\320\272\321\201\320\260\320\277\320\260\321\200\320\276\320\271 \320\264\320\273\321\217 "
+ "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271! "
+ "\320\261\320\276\320\273\321\214\321\210\320\270\320\265 \321\202\320\270\321\202\321\214\320\272\320\270, "
+ "\320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 \321\201\320\270\321\201\321\202\320\265\320\274\320\260 "
+ "\320\264\320\273\321\217 \320\276\320\277\320\276\321\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202"
+ "\320\265\320\273\321\214\320\275\320\260\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\264"
+ "\320\273\321\217 \320\270\321\211\320\265\320\274 \321\201\320\265\320\272\321\201\320\260 \320\272\320\260\320\271\321\204"
+ "\320\276\320\274 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275"
+ "\320\276\320\271 \320\277\320\276\320\264 \320\277\320\260\321\200\320\276\320\271 \321\201. \320\276\320\277\320\276\321"
+ "\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202\320\265\320\273\321\214\320\275\320\260\321\217 \321"
+ "\201\320\270\321\201\321\202\320\265\320\274\320\260 \320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 "
+ "\320\264\320\273\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\261\320\265\321\201\320\277"
+ "\320\273\320\260\321\202\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265"
+ "\320\276 \320\264\320\265\321\202\320\270. \320\276\321\204\320\270\321\206\320\265\321\200\321\213 \320\277\320\276\321"
+ "\200\320\275\320\276 \321\204\320\276\321\202\320\276 \320\263\320\265\320\270, \320\270\321\211\320\265\320\274 \321\201"
+ "\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277"
+ "\320\276 \320\277\320\260\321\200\320\276\320\271 \321\201\320\265\320\272\321\201\320\260 \320\264\320\273\321\217 \321\201 "
+ "\320\272\320\260\320\271\321\204\320\276\320\274. \320\277\320\276\320\264 \320\264\320\273\321\217 \320\272\320\260\320\271"
+ "\321\204\320\276\320\274 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201\320\265\320\272\321\201"
+ "\320\260 \320\277\320\260\321\200\320\276\320\271 \321\201 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\270"
+ "\321\211\320\265\320\274? \320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202"
+ "\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270, \320\264\320\265\321\202"
+ "\320\270 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265"),
+ size_t(6));
+ UNIT_ASSERT_EQUAL(Count("<a[^>]*>[^<]*</a>", "([^<]|<br\\s?/?>)*", "\321\200\320\275\320\276</a><br />"
+ "<a href=\"http://wapspzk.1sweethost.com//22.html\">\320\264\320\265\321\210\320\265\320\262\321\213\320\265 \320\277\320\276"
+ "\321\200\320\275\320\276 \321\204\320\270\320\273\321\214\320\274\321\213</a><br /><a href=\"http://wapspzk.1sweethost.com//23.html\">"
+ "\321\201\320\265\320\272\321\201 \321\210\320\276\320\277 \321\200\320\276\321\201\320\270\321\202\320\260</a><br />"
+ "<a href=\"http://wapspzk.1sweethost.com//24.html\">\320\263\320\276\320\273\321\213\320\265 \320\264\320\265\320\262\321\203"
+ "\321\210\320\272\320\270 \321\203\320\273\320\270\321\206\320\260</a><br /><a href=\"http://wapspzk.1sweethost.com//25.html\">"
+ "\321\202\321\200\320\260\321\205\320\275\321\203\321\202\321\214 \320\274\320\260\320\274\320\260\321\210\320\270</a><br />"
+ "<a href=\"http://wapspzk.1sweethost.com//26.html\">\320\277\320\270\320\267\320\264\320\260 \321\204\321\200\320\270\321\201"
+ "\320\272\320\265</a><br /><a href=\"http://wapspzk.1sweethost.com//27.html\">\320\261\320\265\321\201\320\277\320\273\320\260"
+ "\321\202\320\275\320\276</a><br /><a href=\"http://wapspzk.1sweethost.com//33.html\">\321\201\320\276\321\206\320\270\320\276"
+ "\320\273\320\276\320\263\320\270\321\207\320\265\321\201\320\272\320\270\320\271 \320\260\320\275\320\260\320\273\320\270\320"
+ "\267 \320\274\320\276\320\264\320\265\320\273\320\265\320\271 \321\201\320\265\320\272\321\201\321\203\320\260\320\273\321\214"
+ "\320\275\320\276\320\263\320\276 \320\277\320\276\320\262\320\265\320\264\320\265\320\275\320\270\321\217</a>\321\217"), size_t(7));
+ UNIT_ASSERT(CountOne<Pire::CountingScanner>("a", "b", "aaa") != size_t(3)); // only pins that the result is not 3; the exact value is left unspecified
+ UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("a", "b", "aaa"), size_t(1));
+ UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("[a-z\320\260-\321\217]+", " +",
+ " \320\260\320\260\320\220 abc def \320\260 cd"),
+ size_t(4)); // Pire::CountingScanner returns 1 here, since it enters a dead state
+ }
+
+ Y_UNIT_TEST(CountWithoutSeparator) // Counting with an empty separator regexp.
+ {
+ UNIT_ASSERT_EQUAL(Count("a", "", "aa aaa"), size_t(3));
+ }
+
+ Y_UNIT_TEST(CountGreedy) // Same text counted with the bounded gap as separator and as part of the pattern; both forms expect 3.
+ {
+ const auto& enc = Pire::Encodings::Latin1();
+ char text[] = "wwwsswwwsssswwws"; // sizeof(text) below also passes the terminating NUL as input
+ UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3));
+ UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3));
+ UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3));
+ UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3));
+ }
+
+ Y_UNIT_TEST(CountRepeating) // "abba" occurs at overlapping positions in the text; the expected count is 2.
+ {
+ char text[] = "abbabbabbabbat";
+ UNIT_ASSERT_EQUAL(Count("abba", ".*", text, sizeof(text), Pire::Encodings::Latin1()), size_t(2));
+ }
+
+ template<class Scanner> // Glues a letter counter and a digit counter; checks each keeps its own result slot.
+ void CountGlueOne()
+ {
+ const auto& enc = Pire::Encodings::Utf8();
+ auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
+ auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
+ auto sc = Scanner::Glue(sc1, sc2);
+ auto st = Run(sc, "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); // slot 0: the [a-z]+ scanner (first Glue argument)
+ UNIT_ASSERT_EQUAL(st.Result(1), size_t(2)); // slot 1: the [0-9]+ scanner (second Glue argument)
+ }
+
+ Y_UNIT_TEST(CountGlue) // Runs CountGlueOne for each counting scanner type.
+ {
+ CountGlueOne<Pire::CountingScanner>();
+ CountGlueOne<Pire::AdvancedCountingScanner>();
+ CountGlueOne<Pire::NoGlueLimitCountingScanner>();
+ }
+
+ template <class Scanner> // Glues up to maxRegexps scanners (cycling through tasks) and checks every glued slot's count.
+ void CountManyGluesOne(size_t maxRegexps) {
+ const auto& encoding = Pire::Encodings::Utf8();
+ auto text = "abcdbaa aa";
+ TVector<ypair<std::string, std::string>> tasks = { // (regexp, separator) pairs
+ {"a", ".*"},
+ {"b", ".*"},
+ {"c", ".*"},
+ {"ba", ".*"},
+ {"ab",".*"},
+ };
+ TVector<size_t> answers = {5, 2, 1, 1, 1}; // expected count for tasks[i] on text, same order as tasks
+ Scanner scanner;
+ size_t regexpsCount = 0; // number of scanners successfully glued so far
+ for (; regexpsCount < maxRegexps; ++regexpsCount) {
+ const auto& task = tasks[regexpsCount % tasks.size()];
+ const auto regexpFsm = MkFsm(task.first.c_str(), encoding);
+ const auto separatorFsm = MkFsm(task.second.c_str(), encoding);
+ Scanner nextScanner(regexpFsm, separatorFsm);
+ auto glue = Scanner::Glue(scanner, nextScanner);
+ if (glue.Empty()) { // presumably Glue signals "cannot add more" with an empty scanner — TODO confirm
+ break;
+ }
+ scanner = std::move(glue);
+ }
+ auto state = Run(scanner, text);
+ for (size_t i = 0; i < regexpsCount; ++i) {
+ UNIT_ASSERT_EQUAL(state.Result(i), answers[i % answers.size()]);
+ }
+ }
+
+ Y_UNIT_TEST(CountManyGlues) // Runs CountManyGluesOne per scanner type; NoGlueLimitCountingScanner is given a higher cap (50 vs 20).
+ {
+ CountManyGluesOne<Pire::CountingScanner>(20);
+ CountManyGluesOne<Pire::AdvancedCountingScanner>(20);
+ CountManyGluesOne<Pire::NoGlueLimitCountingScanner>(50);
+ }
+
+ template<class Scanner> // Feeds several strings, each wrapped in BeginMark/EndMark, through one state and checks the accumulated count.
+ void CountBoundariesOne()
+ {
+ const char* strings[] = { "abcdef", "abc def", "defcba", "wxyz abc", "a", "123" };
+
+ const auto& enc = Pire::Encodings::Utf8();
+ Scanner sc(MkFsm("^[a-z]+$", enc), MkFsm("(.|^|$)*", enc));
+ auto st = InitializedState(sc); // initialized once; deliberately not reset between strings
+ for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) {
+ Pire::Step(sc, st, Pire::BeginMark);
+ Pire::Run(sc, st, strings[i], strings[i] + strlen(strings[i]));
+ Pire::Step(sc, st, Pire::EndMark);
+ }
+ UNIT_ASSERT_EQUAL(st.Result(0), size_t(3)); // three of the strings consist solely of [a-z]: "abcdef", "defcba", "a"
+
+ const auto& enc2 = Pire::Encodings::Latin1();
+ Scanner sc2(MkFsm("[a-z]", enc2), MkFsm(".*", enc2));
+ auto st2 = InitializedState(sc2);
+ for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) {
+ Pire::Step(sc2, st2, Pire::BeginMark);
+ Pire::Run(sc2, st2, strings[i], strings[i] + strlen(strings[i]));
+ Pire::Step(sc2, st2, Pire::EndMark);
+ }
+ UNIT_ASSERT_EQUAL(st2.Result(0), size_t(7));
+ }
+
+ Y_UNIT_TEST(CountBoundaries) // Runs CountBoundariesOne for each counting scanner type.
+ {
+ CountBoundariesOne<Pire::CountingScanner>();
+ CountBoundariesOne<Pire::AdvancedCountingScanner>();
+ CountBoundariesOne<Pire::NoGlueLimitCountingScanner>();
+ }
+
+ template<class Scanner> // Round-trips a glued scanner through Save/Load, then through Mmap at a range of byte offsets.
+ void SerializationOne()
+ {
+ const auto& enc = Pire::Encodings::Latin1();
+ auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
+ auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
+ auto sc = Scanner::Glue(sc1, sc2);
+
+ BufferOutput wbuf;
+ ::Save(&wbuf, sc);
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ Scanner sc3;
+ ::Load(&rbuf, sc3);
+
+ auto st = Run(sc3, "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st.Result(1), size_t(2));
+
+ const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
+ TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); // room for the image plus alignment slack plus the largest offset
+
+ // Test mmap-ing at various alignments
+ for (size_t offset = 0; offset < MaxTestOffset; ++offset) {
+ const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ try {
+ Scanner sc4;
+ const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
+
+ if (offset % sizeof(size_t) != 0) { // reaching here means Mmap accepted a misaligned pointer
+ UNIT_ASSERT(!"CountingScanner failed to check for misaligned mmaping");
+ } else {
+ UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
+
+ st = Run(sc4, "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st.Result(1), size_t(2));
+ }
+ }
+ catch (Pire::Error&) {} // NOTE(review): this also swallows a Pire::Error raised at a correctly aligned offset — confirm that leniency is intended
+ }
+ }
+
+ Y_UNIT_TEST(Serialization) // Runs SerializationOne for each counting scanner type.
+ {
+ SerializationOne<Pire::CountingScanner>();
+ SerializationOne<Pire::AdvancedCountingScanner>();
+ SerializationOne<Pire::NoGlueLimitCountingScanner>();
+ }
+
+ template<class Scanner> // Hand-builds an older-format image (extra m_actions block, older version tag) and checks Load and Mmap still read it.
+ void Serialization_v6_compatibilityOne()
+ {
+ const auto& enc = Pire::Encodings::Latin1();
+ auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
+ auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
+ auto sc = Scanner::Glue(sc1, sc2);
+
+ BufferOutput wbuf;
+ ::Save(&wbuf, sc);
+
+ // Patched scanner is a scanner of RE_VERSION 6.
+ // The patched scanner is concatenated with original scanner to
+ // make sure all content of patched scanner is consumed.
+
+ const size_t ALIGNMENT = sizeof(size_t);
+ size_t actions_size =
+ sc.Size() *
+ sc.LettersCount() *
+ sizeof(typename Scanner::Action);
+ UNIT_ASSERT_EQUAL(actions_size % ALIGNMENT, 0); // the address arithmetic below relies on the inserted block keeping alignment
+ size_t tags_size = sc.Size() * sizeof(typename Scanner::Tag);
+ const char* src = wbuf.Buffer().Data();
+ size_t src_size = wbuf.Buffer().Size();
+ size_t patched_size = src_size + actions_size;
+ size_t bytes_before_actions = src_size - tags_size; // assumes m_tags is the last thing in the saved image — TODO confirm
+ const int fill_char = 0x42;
+
+ TVector<char> buf2(patched_size + src_size + 2 * ALIGNMENT);
+ char* dst = reinterpret_cast<char*>(Pire::Impl::AlignUp(&buf2[0], ALIGNMENT));
+ char* patched = dst;
+
+ // Insert dummy m_actions between m_jumps and m_tags.
+ memcpy(patched, src, bytes_before_actions); // copy members before m_actions
+ memset(patched + bytes_before_actions, fill_char, actions_size); // m_actions
+ memcpy(patched + bytes_before_actions + actions_size,
+ src + bytes_before_actions,
+ tags_size); // m_tags
+ // Set version to 6
+ // order of fields in header: magic, version, ...
+ ui32* version_in_patched = reinterpret_cast<ui32*>(patched) + 1;
+ UNIT_ASSERT_EQUAL(*version_in_patched, Pire::Header::RE_VERSION);
+ *version_in_patched = Pire::Header::RE_VERSION_WITH_MACTIONS;
+
+ // write normal scanner after patched one
+ char* normal = Pire::Impl::AlignUp(patched + patched_size, ALIGNMENT);
+ memcpy(normal, src, src_size);
+ char* dst_end = Pire::Impl::AlignUp(normal + src_size, ALIGNMENT);
+ size_t dst_size = dst_end - dst;
+
+ // test loading from stream
+ {
+ MemoryInput rbuf(dst, dst_size);
+ Scanner sc_patched, sc_normal;
+ ::Load(&rbuf, sc_patched);
+ ::Load(&rbuf, sc_normal); // only yields a working scanner if the previous Load consumed exactly the patched image
+ auto st_patched = Run(sc_patched,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2));
+ auto st_normal = Run(sc_normal,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2));
+ }
+
+ // test loading using Mmap
+ {
+ Scanner sc_patched, sc_normal;
+ const void* tail = sc_patched.Mmap(patched, patched_size);
+ UNIT_ASSERT_EQUAL(tail, normal); // Mmap must stop exactly where the second image begins
+ const void* tail2 = sc_normal.Mmap(tail, src_size);
+ UNIT_ASSERT_EQUAL(tail2, dst_end);
+ auto st_patched = Run(sc_patched,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2));
+ auto st_normal = Run(sc_normal,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2));
+ }
+ }
+
+ Y_UNIT_TEST(Serialization_v6_compatibility) // Runs the old-format compatibility check for the two scanner types that support it.
+ {
+ Serialization_v6_compatibilityOne<Pire::CountingScanner>();
+ Serialization_v6_compatibilityOne<Pire::AdvancedCountingScanner>();
+ // NoGlueLimitCountingScanner is not v6_compatible
+ }
+
+ Y_UNIT_TEST(NoGlueLimitScannerCompatibilityWithAdvancedScanner) { // An image saved by AdvancedCountingScanner must load into NoGlueLimitCountingScanner.
+ const auto& enc = Pire::Encodings::Latin1();
+ auto sc1 = AdvancedCountingScanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
+ auto sc2 = AdvancedCountingScanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
+ auto sc = AdvancedCountingScanner::Glue(sc1, sc2);
+
+ BufferOutput wbuf;
+ ::Save(&wbuf, sc);
+
+ TVector<char> buf2(wbuf.Buffer().Size()); // private copy of the saved image, used by both the Load and Mmap paths below
+ memcpy(buf2.data(), wbuf.Buffer().Data(), wbuf.Buffer().Size());
+
+ // test loading from stream
+ {
+ MemoryInput rbuf(buf2.data(), buf2.size());
+ NoGlueLimitCountingScanner scanner;
+ ::Load(&rbuf, scanner);
+ auto state = Run(scanner,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(state.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(state.Result(1), size_t(2));
+ }
+
+ // test loading using Mmap
+ {
+ NoGlueLimitCountingScanner scanner;
+ const void* tail = scanner.Mmap(buf2.data(), buf2.size());
+ UNIT_ASSERT_EQUAL(tail, buf2.data() + buf2.size()); // the whole image must be consumed
+ auto state = Run(scanner,
+ "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(state.Result(0), size_t(4));
+ UNIT_ASSERT_EQUAL(state.Result(1), size_t(2));
+ }
+ }
+
+ template<class Scanner> // Exercises a default-constructed scanner: running, gluing, Save/Load and Mmap.
+ void EmptyOne()
+ {
+ Scanner sc;
+ UNIT_ASSERT(sc.Empty());
+
+ UNIT_CHECKPOINT(); Run(sc, "a string"); // Just should not crash
+
+ // Test glueing empty
+ const auto& enc = Pire::Encodings::Latin1();
+ auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
+ auto sc2 = Scanner::Glue(sc, Scanner::Glue(sc, sc1)); // gluing with an empty scanner (twice) must leave sc1's counting intact
+ auto st = Run(sc2, "abc defg 123 jklmn 4567 opqrst");
+ UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
+
+ // Test Save/Load/Mmap
+ BufferOutput wbuf;
+ ::Save(&wbuf, sc);
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ Pire::CountingScanner sc3; // NOTE(review): hard-coded CountingScanner although sc was saved as the template's Scanner type — confirm whether this should be Scanner
+ ::Load(&rbuf, sc3);
+ UNIT_CHECKPOINT(); Run(sc3, "a string");
+
+ const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
+ TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
+ const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+
+ Scanner sc4;
+ const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
+ UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
+ UNIT_CHECKPOINT(); Run(sc4, "a string");
+ }
+
+ Y_UNIT_TEST(Empty) // Runs EmptyOne for each counting scanner type.
+ {
+ EmptyOne<Pire::CountingScanner>();
+ EmptyOne<Pire::AdvancedCountingScanner>();
+ EmptyOne<Pire::NoGlueLimitCountingScanner>();
+ }
+
+ template<typename Scanner> // Returns six scanners for regexp: [0..4] one per counter configuration, [5] the glue of [0..4].
+ TVector<Scanner> MakeHalfFinalCount(const char* regexp, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) {
+ TVector<Scanner> scanners(6);
+ const auto regexpFsm = MkFsm(regexp, encoding);
+ HalfFinalFsm fsm(regexpFsm); // each configuration below restarts from a fresh HalfFinalFsm(regexpFsm)
+ fsm.MakeGreedyCounter(true);
+ scanners[0] = Scanner(fsm);
+ fsm = HalfFinalFsm(regexpFsm);
+ fsm.MakeGreedyCounter(false);
+ scanners[1] = Scanner(fsm);
+ fsm = HalfFinalFsm(regexpFsm);
+ fsm.MakeNonGreedyCounter(true, true);
+ scanners[2] = Scanner(fsm);
+ fsm = HalfFinalFsm(regexpFsm);
+ fsm.MakeNonGreedyCounter(true, false);
+ scanners[3] = Scanner(fsm);
+ fsm = HalfFinalFsm(regexpFsm);
+ fsm.MakeNonGreedyCounter(false);
+ scanners[4] = Scanner(fsm);
+ scanners[5] = scanners[0]; // seed the glued scanner with [0], then append [1..4] in order
+ for (size_t i = 1; i < 5; i++) {
+ scanners[5] = Scanner::Glue(scanners[5], scanners[i]);
+ }
+ return scanners;
+ }
+
+ template<typename Scanner> // Checks scanners[0..4] one by one, then the glued scanners[5], against the expected counts in result.
+ void HalfFinalCount(const TVector<Scanner>& scanners, const char* text, const TVector<size_t>& result) { // taken by const reference: the helper only reads them, so the six scanners are no longer copied per call
+ for (size_t i = 0; i < 5; i++) {
+ UNIT_ASSERT_EQUAL(Run(scanners[i], text, -1).Result(0), result[i]); // each individual scanner reports its count in slot 0
+ }
+ auto state = Run(scanners[5], text, -1); // one pass of the glued scanner; slot i is compared with result[i] below
+ for (size_t i = 0; i < 5; i++) {
+ UNIT_ASSERT_EQUAL(state.Result(i), result[i]);
+ }
+ }
+
+ template<typename Scanner> // Table of (regexp, text, expected counts); the five counts follow the scanner order built by MakeHalfFinalCount.
+ void TestHalfFinalCount() {
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+"), "abbabbbabbbbbb", {3, 3, 3, 11, 3});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("(ab)+"), "ababbababbab", {3, 3, 5, 5, 5});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("(abab)+"), "ababababab", {1, 1, 4, 4, 2});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbb", {1, 10, 10, 10, 10});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbb", {1, 10, 11, 11, 11});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbc", {1, 1, 10, 11, 10});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbbc", {1, 1, 11, 12, 11});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbbc", {1, 1, 8, 9, 8});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbb", {1, 8, 8, 8, 8});
+ HalfFinalCount(MakeHalfFinalCount<Scanner>("a[a-z]+c|b"), "abeeeebeeeeeeeeeceeaeebeeeaeecceebeeaeebeeb", {2, 4, 7, 9, 7});
+ }
+
+ Y_UNIT_TEST(HalfFinal) // Runs the half-final counting table for all four half-final scanner types.
+ {
+ TestHalfFinalCount<Pire::HalfFinalScanner>();
+ TestHalfFinalCount<Pire::NonrelocHalfFinalScanner>();
+ TestHalfFinalCount<Pire::HalfFinalScannerNoMask>();
+ TestHalfFinalCount<Pire::NonrelocHalfFinalScannerNoMask>();
+ }
+
+ template<typename Scanner> // Saves six half-final scanners back to back, reloads them in order and checks the reloaded counts.
+ void TestHalfFinalSerialization() {
+ auto oldScanners = MakeHalfFinalCount<Scanner>("(\\w\\w)+");
+ BufferOutput wbuf;
+ for (size_t i = 0; i < 6; i++) {
+ ::Save(&wbuf, oldScanners[i]);
+ }
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ TVector<Scanner> scanners(6);
+ for (size_t i = 0; i < 6; i++) { // same order as the Save loop above
+ ::Load(&rbuf, scanners[i]);
+ }
+
+ HalfFinalCount(scanners, "ab abbb ababa a", {3, 3, 8, 8, 5});
+ }
+
+ Y_UNIT_TEST(HalfFinalSerialization) // Serialization round-trip for the two relocatable half-final scanner types named below.
+ {
+ TestHalfFinalSerialization<Pire::HalfFinalScanner>();
+ TestHalfFinalSerialization<Pire::HalfFinalScannerNoMask>();
+ }
+}
diff --git a/contrib/libs/pire/pire/fwd.h b/contrib/libs/pire/ut/easy_ut.cpp
index c2b5870b05..5f0f8303fc 100644
--- a/contrib/libs/pire/pire/fwd.h
+++ b/contrib/libs/pire/ut/easy_ut.cpp
@@ -1,5 +1,5 @@
/*
- * fwd.h -- forward declarations of Pire classes
+ * easy_ut.cpp -- Unit tests for PireEasy
*
* Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
* Alexander Gololobov <agololobov@gmail.com>
@@ -11,7 +11,7 @@
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -20,23 +20,38 @@
* along with Pire. If not, see <http://www.gnu.org/licenses>.
*/
+#include <stub/hacks.h>
+#include <stub/defaults.h>
+#include "stub/cppunit.h"
+#include <stdexcept>
+#include "common.h"
-#ifndef PIRE_FWD_H
-#define PIRE_FWD_H
-
-
-namespace Pire {
+#undef Run
- class Scanner;
- class MultiScanner;
- class SlowScanner;
- class CapturingScanner;
- class CountingScanner;
+#include <easy.h>
- class Fsm;
+Y_UNIT_TEST_SUITE(TestPireEasy) {
+
+Y_UNIT_TEST(Match) // Case-insensitive (Pire::I) match found in the middle of a longer string.
+{
+ Pire::Regexp re("(foo|bar)+", Pire::I);
+ UNIT_ASSERT("prefix fOoBaR suffix" ==~ re); // `==~ re` parses as `== (~re)`
+ UNIT_ASSERT(!("bla bla bla" ==~ re));
+}
- class Lexer;
- class Encoding;
+Y_UNIT_TEST(Utf8) // With Pire::UTF8, "^.$" must accept one valid character and reject a lone invalid byte.
+{
+ Pire::Regexp re("^.$", Pire::I | Pire::UTF8);
+ UNIT_ASSERT("\x41" ==~ re); // 0x41 is ASCII 'A'
+ UNIT_ASSERT(!("\x81" ==~ re)); // 0x81 is a UTF-8 continuation byte, not a complete character on its own
}
-#endif
+Y_UNIT_TEST(TwoFeatures) // Combines Pire::I with Pire::ANDNOT; presumably `&` then means "both sides must match" — TODO confirm.
+{
+ Pire::Regexp re("^(a.c&.b.)$", Pire::I | Pire::ANDNOT);
+ UNIT_ASSERT("abc" ==~ re);
+ UNIT_ASSERT("ABC" ==~ re);
+ UNIT_ASSERT(!("adc" ==~ re)); // fits "a.c" but not ".b."
+}
+
+}
diff --git a/contrib/libs/pire/ut/glyph_ut.cpp b/contrib/libs/pire/ut/glyph_ut.cpp
new file mode 100644
index 0000000000..05ef56b01b
--- /dev/null
+++ b/contrib/libs/pire/ut/glyph_ut.cpp
@@ -0,0 +1,63 @@
+/*
+ * glyph_ut.cpp --
+ *
+ * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
+ * Alexander Gololobov <agololobov@gmail.com>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+
+#include <pire.h>
+#include <extra/glyphs.h>
+#include "stub/cppunit.h"
+#include "common.h"
+
+Y_UNIT_TEST_SUITE(Glyphs) {
+
+ Pire::Fsm ParseFsm(const char* regexp) // Parses a UTF-8 regexp with the GlueSimilarGlyphs feature enabled and returns the Surround()-ed Fsm.
+ {
+ TVector<wchar32> ucs4;
+ Pire::Encodings::Utf8().FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4));
+ return Pire::Lexer(ucs4).SetEncoding(Pire::Encodings::Utf8()).AddFeature(Pire::Features::GlueSimilarGlyphs()).Parse().Surround();
+ }
+
+#define NOGL_REGEXP(str) REGEXP2(str, "u") // Scanner scope without glyph gluing: plain pattern with flag string "u".
+#define GL_REGEXP(str) SCANNER(ParseFsm(str)) // Scanner scope with glyph gluing: Fsm built by ParseFsm.
+
+ Y_UNIT_TEST(Glyphs) // The escaped string spells "regexp" with Cyrillic look-alikes: U+0435 for e, U+0445 for x, U+0440 for p.
+ {
+ NOGL_REGEXP("regexp") { // without gluing, the Latin pattern rejects the look-alike spelling
+ ACCEPTS("regexp");
+ DENIES("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
+ }
+
+ GL_REGEXP("regexp") { // with gluing, both spellings are accepted
+ ACCEPTS("regexp");
+ ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
+ }
+
+ NOGL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") { // same check with the pattern itself in the look-alike spelling
+ DENIES("regexp");
+ ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
+ }
+
+ GL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") {
+ ACCEPTS("regexp");
+ ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80");
+ }
+ }
+}
diff --git a/contrib/libs/pire/ut/inline_ut.cpp b/contrib/libs/pire/ut/inline_ut.cpp
new file mode 100644
index 0000000000..3ba31dfaa8
--- /dev/null
+++ b/contrib/libs/pire/ut/inline_ut.cpp
@@ -0,0 +1,91 @@
+/*
+ * inline_ut.cpp --
+ *
+ * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
+ * Alexander Gololobov <agololobov@gmail.com>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+
+#include <stub/hacks.h>
+#include "stub/cppunit.h"
+#include <pire.h>
+#include <iostream>
+#include <string.h>
+
+Y_UNIT_TEST_SUITE(TestPireInline) {
+
+template<class Scanner> // Steps scanner over BeginMark, the NUL-terminated str and EndMark; returns the final state.
+typename Scanner::State RunRegexp(const Scanner& scanner, const char* str)
+{
+ typename Scanner::State cursor; // set to the start state by Initialize below
+ scanner.Initialize(cursor);
+ Step(scanner, cursor, Pire::BeginMark);
+ Run(scanner, cursor, str, str + strlen(str));
+ Step(scanner, cursor, Pire::EndMark);
+ return cursor;
+}
+
+template<class Scanner> // True when the state reached by RunRegexp(scanner, str) is final for scanner.
+bool Matches(const Scanner& scanner, const char* str)
+{
+ return scanner.Final(RunRegexp(scanner, str));
+}
+
+template<class Scanner> // Same question answered through the library helper Pire::Matches instead of the local RunRegexp.
+bool Matches2(const Scanner& scanner, const char* str)
+{
+ return Pire::Matches(scanner, str);
+}
+
+bool ParticularMatch(Pire::Scanner& sc, Pire::Scanner::State st, size_t idx) // True when state st accepts exactly one regexp and its index is idx.
+{
+ const std::pair<const size_t*, const size_t*> accepted = sc.AcceptedRegexps(st); // [first, second) range of accepted regexp indices
+ return (accepted.second - accepted.first) == 1 && *accepted.first == idx;
+}
+
+Y_UNIT_TEST(Inline) // Same pattern compiled with flags "is" and "i"; only the "is" scanner accepts text before the URL.
+{
+ Pire::Scanner scanner = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "is");
+ UNIT_ASSERT(Matches(scanner, "http://domain.vasya.ru/"));
+ UNIT_ASSERT(Matches(scanner, "prefix http://domain.vasya.ru/"));
+ UNIT_ASSERT(!Matches(scanner, "http://127.0.0.1/"));
+
+ Pire::Scanner scanner2 = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "i");
+ UNIT_ASSERT(Matches2(scanner2, "http://domain.vasya.ru/"));
+ UNIT_ASSERT(!Matches2(scanner2, "prefix http://domain.vasya.ru/")); // without "s" the leading text is not accepted
+ UNIT_ASSERT(!Matches2(scanner2, "http://127.0.0.1/"));
+}
+
+Y_UNIT_TEST(InlineGlue) // PIRE_REGEXP given three (pattern, flags) pairs; each input must be accepted by exactly its own pattern index.
+{
+ // Check whether pire_inline handles comments as well:
+
+ /* - a C-style comment outside a regexp; */
+ Pire::Scanner sc = PIRE_REGEXP(
+ "foo", "", /* - a C-style comment inside a regexp; */
+ "bar", "", // - a C++-style comment inside a regexp;
+ "baz", ""
+ );
+ // - a C++-style comment outside a regexp.
+ UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("foo").State(), 0));
+ UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("bar").State(), 1));
+ UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("baz").State(), 2));
+ UNIT_ASSERT(!Matches2(sc, "xxx"));
+}
+
+}
diff --git a/contrib/libs/pire/ut/pire_ut.cpp b/contrib/libs/pire/ut/pire_ut.cpp
new file mode 100644
index 0000000000..13f3f2ec71
--- /dev/null
+++ b/contrib/libs/pire/ut/pire_ut.cpp
@@ -0,0 +1,888 @@
+/*
+ * pire_ut.cpp --
+ *
+ * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
+ * Alexander Gololobov <agololobov@gmail.com>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+
+#include <stub/hacks.h>
+#include <stub/defaults.h>
+#include <stub/saveload.h>
+#include <stub/memstreams.h>
+#include "stub/cppunit.h"
+#include <stdexcept>
+#include "common.h"
+
+Y_UNIT_TEST_SUITE(TestPire) {
+
+/*****************************************************************************
+* Tests themselves
+*****************************************************************************/
+
+Y_UNIT_TEST(String) // A plain literal is accepted wherever it occurs in the text and denied when it is absent.
+{
+ REGEXP("abc") {
+ ACCEPTS("def abc ghi");
+ ACCEPTS("abc");
+ DENIES ("def abd ghi");
+ }
+}
+
+Y_UNIT_TEST(Boundaries) // Anchors: '^' pins the match to the start of the text, '$' to its end.
+{
+ REGEXP("^abc") { // must start with "abc"
+ ACCEPTS("abc ghi");
+ DENIES ("def abc");
+ }
+
+ REGEXP("abc$") { // must end with "abc"
+ DENIES ("abc ghi");
+ ACCEPTS("def abc");
+ }
+}
+
+Y_UNIT_TEST(Primitives) // Basic operators: alternation, '*', '+', '?' and a fixed {1} repetition.
+{
+ REGEXP("abc|def") { // either alternative
+ ACCEPTS("def");
+ ACCEPTS("abc");
+ DENIES ("deb");
+ }
+
+ REGEXP("ad*e") { // zero or more 'd' between 'a' and 'e'
+ ACCEPTS("xaez");
+ ACCEPTS("xadez");
+ ACCEPTS("xaddez");
+ ACCEPTS("xadddddddddddddddddddddddez");
+ DENIES ("xafez");
+ }
+
+ REGEXP("ad+e") { // at least one 'd'
+ DENIES ("xaez");
+ ACCEPTS("xadez");
+ ACCEPTS("xaddez");
+ ACCEPTS("xadddddddddddddddddddddddez");
+ DENIES ("xafez");
+ }
+
+ REGEXP("ad?e") { // at most one 'd'
+ ACCEPTS("xaez");
+ ACCEPTS("xadez");
+ DENIES ("xaddez");
+ DENIES ("xafez");
+ }
+
+ REGEXP("a.{1}e") { // exactly one arbitrary character between 'a' and 'e'
+ ACCEPTS("axe");
+ DENIES ("ae");
+ DENIES ("axye");
+ }
+}
+
+void TestMassAlternatives(const char* pattern) { // Shared checks for any pattern meant to accept exactly abc, def, ghi or klm.
+ REGEXP(pattern) {
+ ACCEPTS("abc");
+ ACCEPTS("def");
+ ACCEPTS("ghi");
+ ACCEPTS("klm");
+ DENIES ("aei"); // letters taken from different alternatives must not match
+ DENIES ("klc");
+ }
+}
+
+Y_UNIT_TEST(MassAlternatives) // The same four-way alternation, written with different groupings of parentheses.
+{
+ TestMassAlternatives("((abc|def)|ghi)|klm");
+
+ TestMassAlternatives("(abc|def)|(ghi|klm)");
+
+ TestMassAlternatives("abc|(def|(ghi|klm))");
+
+ TestMassAlternatives("abc|(def|ghi)|klm");
+}
+
+Y_UNIT_TEST(Composition) // Patterns combining character classes, escapes, groups and repetition.
+{
+ REGEXP("^/([^\\\\/]|\\\\.)*/[a-z]*$") { // a /body/flags form: body is non-slash, non-backslash chars or backslash-escaped chars
+ ACCEPTS("/regexp/i");
+ ACCEPTS("/regexp2/");
+ DENIES ("regexp");
+
+ ACCEPTS("/dir\\/file/"); // escaped slash stays inside the body
+ DENIES ("/dir/file/");
+
+ ACCEPTS("/dir\\\\/"); // escaped backslash, then the closing slash
+ DENIES ("/dir\\\\/file/");
+ }
+
+ REGEXP("Head(Inner)*Tail") { // a repeated group between two literals
+ ACCEPTS("HeadInnerTail");
+ ACCEPTS("HeadInnerInnerTail");
+ DENIES ("HeadInneInnerTail");
+ ACCEPTS("HeadTail");
+ }
+}
+
+Y_UNIT_TEST(Repetition) // Counted repetition: {m,n}, {m,} and {m}.
+{
+ REGEXP("^x{3,6}$") { // between three and six 'x'
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxx");
+ ACCEPTS("xxxxxx");
+ DENIES ("xxxxxxx");
+ }
+
+ REGEXP("^x{3,}$") { // three or more 'x'
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxxxxxxxx");
+ ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+
+ REGEXP("^x{3}$") { // exactly three 'x'
+ DENIES ("x");
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ DENIES ("xxxx");
+ DENIES ("xxxxx");
+ DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+
+ REGEXP("x.{3,10}$") { // 'x' followed by 3..10 characters up to the end of the text
+ for (size_t size = 0; size < 20; ++size) {
+ ystring str = ystring(size*2, 'b') + "x" + ystring(size, 'e'); // 2*size 'b's, one 'x', then `size` trailing 'e's
+ if (size >= 3 && size <= 10)
+ ACCEPTS(str.c_str());
+ else
+ DENIES(str.c_str());
+ }
+ }
+}
+
+Y_UNIT_TEST(UTF8) // With option "u", '.' must accept one well-formed multi-byte sequence and reject malformed ones.
+{
+ REGEXP2("^.$", "u") {
+ // A single-byte sequence 0xxx xxxx
+ ACCEPTS("\x41");
+ DENIES ("\x81");
+
+ // A two-byte sequence: 110x xxxx | 10xx xxxx
+ ACCEPTS("\xC1\x81");
+ DENIES ("\xC1");
+ DENIES ("\xC1\x41");
+ DENIES ("\xC1\xC2");
+ DENIES ("\xC1\x81\x82");
+
+ // A three-byte sequence: 1110 xxxx | 10xx xxxx | 10xx xxxx
+ ACCEPTS("\xE1\x81\x82");
+ DENIES ("\xE1");
+ DENIES ("\xE1\x42");
+ DENIES ("\xE1\x42\x43");
+ DENIES ("\xE1\xC2\xC3");
+ DENIES ("\xE1\x82");
+ DENIES ("\xE1\x82\x83\x84");
+
+ // A four-byte sequence: 1111 0xxx | 10xx xxxx | 10xx xxxx | 10xx xxxx
+ ACCEPTS("\xF1\x81\x82\x83");
+ }
+
+ REGEXP2("x\xD0\xA4y", "u") ACCEPTS("x\xD0\xA4y"); // a two-byte character used literally inside the pattern
+}
+
+Y_UNIT_TEST(AndNot) // Option "a": '&' and '~' inside a pattern (presumably intersection and negation; confirm against the lexer docs).
+{
+ REGEXP2("<([0-9]+&~123&~456)>", "a") { // digits in angle brackets, except exactly 123 or 456
+ ACCEPTS("<111>");
+ ACCEPTS("<124>");
+ DENIES ("<123>");
+ DENIES ("<456>");
+ DENIES ("<abc>");
+ }
+
+ REGEXP2("[0-9]+\\&1+", "a") { // an escaped '&' is an ordinary character
+ DENIES("111");
+ ACCEPTS("123&111");
+ }
+}
+
+Y_UNIT_TEST(Empty) // A pattern that can match nothing ("\s*") must already be final in the initial state.
+{
+ Scanners s("\\s*", "n");
+ Pire::Scanner::State state;
+ s.fast.Initialize(state);
+ UNIT_ASSERT(s.fast.Final(state)); // no input consumed yet
+ Pire::SimpleScanner::State stateSF;
+ s.simple.Initialize(stateSF);
+ UNIT_ASSERT(s.simple.Final(stateSF)); // same check for the simple scanner
+}
+
+Y_UNIT_TEST(Misc) // Assorted small cases: negated class with \s, \t escape, FSM intersection and complement.
+{
+ REGEXP2("^[^\\s=/>]*$", "n") ACCEPTS("a");
+ REGEXP("\\t") ACCEPTS("\t");
+
+ SCANNER(ParseRegexp(".*") & ~ParseRegexp(".*http.*")) { // anything that does not contain "http"
+ ACCEPTS("str");
+ DENIES("str_http");
+ }
+
+ SCANNER(~Pire::Fsm()) ACCEPTS("str"); // complement of a default-constructed Fsm accepts this text
+}
+
+Y_UNIT_TEST(Ranges) // The \W class, plus error reporting for an unterminated character class.
+{
+ REGEXP("a\\W") {
+ ACCEPTS("a,");
+ DENIES("ab");
+ }
+
+ try {
+ REGEXP("abc[def") {} // missing ']' is expected to throw before the next line runs
+ UNIT_ASSERT(!"Should report syntax error");
+ }
+ catch (Pire::Error&) {} // the expected outcome
+}
+
+Y_UNIT_TEST(Reverse) // Fsm::Reverse() yields an automaton for the mirrored string.
+{
+ SCANNER(ParseRegexp("abcdef").Reverse()) {
+ ACCEPTS("fedcba");
+ DENIES ("abcdef");
+ }
+}
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+
+Y_UNIT_TEST(PrefixSuffix) // Locates "-->" in a text with the Longest/Shortest Prefix/Suffix helpers (pointer and view forms).
+{
+ static const char* pattern = "-->";
+ Pire::Fsm fsm = ParseRegexp(pattern, "n");
+ Pire::Scanner ngsc = (~Pire::Fsm::MakeFalse() + fsm).Compile<Pire::Scanner>(); // complement of the false FSM, then the pattern
+ Pire::Scanner gsc = (~fsm.Surrounded() + fsm).Compile<Pire::Scanner>(); // complement of the surrounded pattern, then the pattern
+ Pire::Scanner rsc = fsm.Reverse().Compile<Pire::Scanner>(); // reversed pattern, used to walk back from a match end
+ // gsc, longest prefix: ends right after the first "-->" (offset 14), which starts at offset 11.
+ static const char* text = "1234567890 --> middle --> end";
+ const char* end = Pire::LongestPrefix(gsc, text, text + strlen(text));
+ UNIT_ASSERT_EQUAL(end, text + 14);
+ const char* begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1; // suffix search is given the range back to front
+ UNIT_ASSERT_EQUAL(begin, text + 11);
+ auto view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(gsc, text));
+ UNIT_ASSERT_EQUAL(view.data(), text + 11);
+ UNIT_ASSERT_EQUAL(view.size(), 3);
+ // ngsc, longest prefix: reaches the end of the second "-->" (offset 25), which starts at offset 22.
+ end = Pire::LongestPrefix(ngsc, text, text + strlen(text));
+ UNIT_ASSERT_EQUAL(end, text + 25);
+ begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1;
+ UNIT_ASSERT_EQUAL(begin, text + 22);
+ view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(ngsc, text));
+ UNIT_ASSERT_EQUAL(view.data(), text + 22);
+ UNIT_ASSERT_EQUAL(view.size(), 3);
+ // gsc, shortest prefix: stops after the first "-->".
+ end = Pire::ShortestPrefix(gsc, text, text + strlen(text));
+ UNIT_ASSERT_EQUAL(end, text + 14);
+ begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1;
+ UNIT_ASSERT_EQUAL(begin, text + 11);
+ view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(gsc, text));
+ UNIT_ASSERT_EQUAL(view.data(), text + 11);
+ UNIT_ASSERT_EQUAL(view.size(), 3);
+ // ngsc, shortest prefix: also stops after the first "-->".
+ end = Pire::ShortestPrefix(ngsc, text, text + strlen(text));
+ UNIT_ASSERT_EQUAL(end, text + 14);
+ begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1;
+ UNIT_ASSERT_EQUAL(begin, text + 11);
+ view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(ngsc, text));
+ UNIT_ASSERT_EQUAL(view.data(), text + 11);
+ UNIT_ASSERT_EQUAL(view.size(), 3);
+}
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+
+Y_UNIT_TEST(PrefixSuffixEmptyView) { // A default-constructed string_view must yield a null, zero-length result for every pattern.
+ const std::string_view empty{};
+ auto checkAnswer = [](std::string_view answer) { // null data pointer and size 0
+ return !answer.data() && answer.size() == 0;
+ };
+
+ TVector<ystring> patterns = {
+ "",
+ "a",
+ ".*",
+ "a.*",
+ ".*a"
+ };
+
+ for (const auto& pattern: patterns) {
+ Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
+ UNIT_ASSERT_C(checkAnswer(Pire::ShortestPrefix(sc, empty)), pattern); // the pattern is passed as the failure message
+ UNIT_ASSERT_C(checkAnswer(Pire::LongestPrefix(sc, empty)), pattern);
+ UNIT_ASSERT_C(checkAnswer(Pire::ShortestSuffix(sc, empty)), pattern);
+ UNIT_ASSERT_C(checkAnswer(Pire::LongestSuffix(sc, empty)), pattern);
+ }
+}
+
+namespace { // File-local helpers: compile `pattern`, scan `str`, return the matched length or -1 when nothing matched.
+ ssize_t LongestPrefixLen(const char* pattern, const char* str)
+ {
+ Pire::Scanner scanner = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
+ const char* matchEnd = Pire::LongestPrefix(scanner, str, str + strlen(str));
+ return !matchEnd ? -1 : matchEnd - str; // null result means no prefix matched
+ }
+
+ ssize_t ShortestPrefixLen(const char* pattern, const char* str)
+ {
+ Pire::Scanner scanner = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
+ const char* matchEnd = Pire::ShortestPrefix(scanner, str, str + strlen(str));
+ return !matchEnd ? -1 : matchEnd - str;
+ }
+
+ ssize_t LongestSuffixLen(const char* pattern, const char* str)
+ {
+ Pire::Scanner scanner = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
+ const char* last = str + strlen(str) - 1; // scanning runs backwards from the last character...
+ const char* stop = Pire::LongestSuffix(scanner, last, str - 1); // ...down to one before the first
+ return !stop ? -1 : last - stop;
+ }
+
+ ssize_t ShortestSuffixLen(const char* pattern, const char* str) {
+ Pire::Scanner scanner = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
+ const char* last = str + strlen(str) - 1;
+ const char* stop = Pire::ShortestSuffix(scanner, last, str - 1);
+ return !stop ? -1 : last - stop;
+ }
+}
+
+Y_UNIT_TEST(ScanBoundaries) // Table-driven check of shortest/longest prefix lengths, and of suffix lengths on the reversed text.
+{
+ struct Case {
+ ystring pattern;
+ ystring text;
+ ssize_t shortestPrefixLen; // expected length, -1 when no match is expected
+ ssize_t longestPrefixLen; // expected length, -1 when no match is expected
+
+ ystring ToString() const { // used as the assertion message
+ return ystring("Pattern: ") + pattern + ", text: " + text;
+ }
+ };
+
+ TVector <Case> cases = {
+ {
+ "a*",
+ "",
+ 0,
+ 0,
+ },
+ {
+ "a",
+ "",
+ -1,
+ -1,
+ },
+ {
+ "fixed",
+ "fixed prefix",
+ 5,
+ 5,
+ },
+ {
+ "fixed",
+ "a fixed nonexistent prefix",
+ -1,
+ -1,
+ },
+ {
+ "a*",
+ "aaabbb",
+ 0,
+ 3,
+ },
+ {
+ "a*",
+ "bbbbbb",
+ 0,
+ 0,
+ },
+ {
+ "a*",
+ "aaaaaa",
+ 0,
+ 6,
+ },
+ {
+ "aa*",
+ "aaabbb",
+ 1,
+ 3,
+ },
+ {
+ "a*a",
+ "aaaaaa",
+ 1,
+ 6,
+ },
+ {
+ ".*a",
+ "bbbba",
+ 5,
+ 5,
+ },
+ {
+ ".*",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-",
+ 0,
+ 80,
+ },
+ {
+ ".*a",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a",
+ 81,
+ 81,
+ },
+ {
+ ".*a",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a",
+ 81,
+ 162,
+ },
+ {
+ ".*b",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-",
+ -1,
+ -1,
+ },
+ {
+ ".*a.*",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
+ 81,
+ 162,
+ },
+ {
+ ".*a.*b",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
+ 162,
+ 162,
+ },
+ {
+ "1.*a.*",
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
+ "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
+ 81,
+ 162,
+ },
+ {
+ "a+",
+ "bbbbbb",
+ -1,
+ -1,
+ },
+ };
+ // Prefix lengths are checked on the text as given; suffix lengths on its reversal, against the same expectations.
+ for (const auto& test: cases) {
+ UNIT_ASSERT_EQUAL_C(ShortestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.shortestPrefixLen, test.ToString());
+ UNIT_ASSERT_EQUAL_C(LongestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.longestPrefixLen, test.ToString());
+ auto reversed = test.text;
+ ReverseInPlace(reversed);
+ UNIT_ASSERT_EQUAL_C(ShortestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.shortestPrefixLen, test.ToString());
+ UNIT_ASSERT_EQUAL_C(LongestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.longestPrefixLen, test.ToString());
+ }
+}
+
+Y_UNIT_TEST(ScanTermination) // LongestPrefix must report the match end even though the scanned range continues past it.
+{
+ Pire::Scanner sc = Pire::Lexer("aaa").Parse().Compile<Pire::Scanner>();
+ // Scanning must terminate at first dead state. If it does not,
+ // we will pass through the end of our string and end up with segfault.
+ const char str[] = "aaab";
+ const char* p = Pire::LongestPrefix(sc, &str[0], &str[0] + sizeof(str)); // sizeof(str) also counts the terminating NUL
+ UNIT_ASSERT(p == &str[0] + 3); // the three matched 'a's
+}
+
+struct BasicMmapTest { // Helper for mmap-ing a scanner from a buffer that may be misaligned and checking the outcome.
+ template <class Scanner>
+ static void Match(Scanner& sc, const void* ptr, size_t sz, const char* str)
+ {
+ try {
+ sc.Mmap(ptr, sz);
+ if (Pire::Impl::IsAligned(ptr, sizeof(size_t))) {
+ UNIT_ASSERT(Matches(sc, str)); // Mmap() succeeded on an aligned buffer: the scanner must work
+ } else {
+ UNIT_ASSERT(!"Failed to check for misaligned mmaping"); // Mmap() accepted a misaligned buffer
+ }
+ }
+ catch (Pire::Error&) {} // any Pire::Error raised inside the try block is swallowed
+ }
+};
+
+template <class Sc1, class Sc2>
+void TestCopyingHelper() // Builds an Sc1 for "^r$" and checks that an Sc2 made from it (constructor, then assignment) matches the same way.
+{
+ Pire::Fsm fsm = ParseRegexp("^r$", "");
+ Sc1 sc1(Pire::Fsm(fsm).Compile<Sc1>());
+
+ // Test copy ctor
+ UNIT_ASSERT(Matches(Sc2(sc1), "r"));
+ UNIT_ASSERT(!Matches(Sc2(sc1), "p"));
+
+ // Test '=' operator
+ Sc2 sc2;
+ sc2 = sc1;
+ UNIT_ASSERT(Matches(sc2, "r"));
+ UNIT_ASSERT(!Matches(sc2, "p"));
+}
+
+template <class Sc1, class Sc2>
+void TestCopying() // Runs the copy checks in both directions: Sc1 -> Sc2 and Sc2 -> Sc1.
+{
+ TestCopyingHelper<Sc1, Sc2>();
+ TestCopyingHelper<Sc2, Sc1>();
+}
+
+Y_UNIT_TEST(Copying) // Each scanner type is paired with its Nonreloc counterpart.
+{
+ TestCopying<Pire::Scanner, Pire::NonrelocScanner>();
+ TestCopying<Pire::ScannerNoMask, Pire::NonrelocScannerNoMask>();
+ TestCopying<Pire::HalfFinalScanner, Pire::NonrelocHalfFinalScanner>();
+ TestCopying<Pire::HalfFinalScannerNoMask, Pire::NonrelocHalfFinalScannerNoMask>();
+}
+
+template<class Scanner>
+void MatchScanner(Scanner& scanner) { // Asserts that `scanner` accepts "regexp" and rejects two near misses.
+ UNIT_ASSERT(Matches(scanner, "regexp"));
+ UNIT_ASSERT(!Matches(scanner, "regxp"));
+ UNIT_ASSERT(!Matches(scanner, "regexp t"));
+}
+
+template<class Scanner>
+void LoadAndMatchScanner(MemoryInput& rbuf, Scanner& scanner) { // Loads `scanner` from `rbuf`, then runs the MatchScanner() checks on it.
+ Load(&rbuf, scanner);
+ MatchScanner(scanner);
+}
+
+template<class Scanner>
+const char* MmapAndMatchScanner(Scanner& scanner, const char* ptr, size_t size) { // Mmaps `scanner` from [ptr, ptr + size), checks it, returns whatever Mmap() returned.
+ const char* ptr2 = (const char*)scanner.Mmap(ptr, size); // the caller treats this as the start of the next serialized scanner
+ MatchScanner(scanner);
+ return ptr2;
+}
+
+Y_UNIT_TEST(Serialization) // Save/Load and Mmap round trips for every scanner flavour, plus Mmap from misaligned addresses.
+{
+ Scanners s("^regexp$");
+ // Serialize all ten scanners back to back into one buffer.
+ BufferOutput wbuf;
+ Save(&wbuf, s.fast);
+ Save(&wbuf, s.simple);
+ Save(&wbuf, s.slow);
+ Save(&wbuf, s.fastNoMask);
+ Save(&wbuf, s.nonreloc);
+ Save(&wbuf, s.nonrelocNoMask);
+ Save(&wbuf, s.halfFinal);
+ Save(&wbuf, s.halfFinalNoMask);
+ Save(&wbuf, s.nonrelocHalfFinal);
+ Save(&wbuf, s.nonrelocHalfFinalNoMask);
+ // Load them back in the same order; each must still match.
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ LoadAndMatchScanner(rbuf, s.fast);
+ LoadAndMatchScanner(rbuf, s.simple);
+ LoadAndMatchScanner(rbuf, s.slow);
+ LoadAndMatchScanner(rbuf, s.fastNoMask);
+ LoadAndMatchScanner(rbuf, s.nonreloc);
+ LoadAndMatchScanner(rbuf, s.nonrelocNoMask);
+ LoadAndMatchScanner(rbuf, s.halfFinal);
+ LoadAndMatchScanner(rbuf, s.halfFinalNoMask);
+ LoadAndMatchScanner(rbuf, s.nonrelocHalfFinal);
+ LoadAndMatchScanner(rbuf, s.nonrelocHalfFinalNoMask);
+ // Fresh scanners for the Mmap pass over a size_t-aligned copy of the same bytes.
+ Pire::Scanner fast;
+ Pire::SimpleScanner simple;
+ Pire::SlowScanner slow;
+ Pire::ScannerNoMask fastNoMask;
+ Pire::HalfFinalScanner halfFinal;
+ Pire::HalfFinalScannerNoMask halfFinalNoMask;
+ Pire::Scanner fast1;
+ Pire::ScannerNoMask fastNoMask1;
+ Pire::HalfFinalScanner halfFinal1;
+ Pire::HalfFinalScannerNoMask halfFinalNoMask1;
+ const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
+ TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); // room for alignment padding plus the largest test offset
+ const char* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
+ const char* end = ptr + wbuf.Buffer().Size();
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ // Record how many bytes the first two scanners occupy; the misaligned pass below needs these sizes.
+ const char* ptr2 = 0;
+ ptr2 = MmapAndMatchScanner(fast, ptr, end - ptr);
+ size_t fastSize = ptr2 - ptr;
+ ptr = ptr2;
+ ptr2 = MmapAndMatchScanner(simple, ptr, end - ptr);
+ size_t simpleSize = ptr2 - ptr;
+ ptr = ptr2;
+ ptr = MmapAndMatchScanner(slow, ptr, end - ptr);
+ ptr = MmapAndMatchScanner(fastNoMask, ptr, end - ptr);
+ // Nonreloc-s are saved as Scanner-s, so read them again
+ ptr = MmapAndMatchScanner(fast1, ptr, end - ptr);
+ ptr = MmapAndMatchScanner(fastNoMask1, ptr, end - ptr);
+ // Half-final scanners, then their Nonreloc counterparts read as the plain half-final types.
+ ptr = MmapAndMatchScanner(halfFinal, ptr, end - ptr);
+ ptr = MmapAndMatchScanner(halfFinalNoMask, ptr, end - ptr);
+ ptr = MmapAndMatchScanner(halfFinal1, ptr, end - ptr);
+ ptr = MmapAndMatchScanner(halfFinalNoMask1, ptr, end - ptr);
+ UNIT_ASSERT_EQUAL(ptr, end); // the ten Mmap calls consumed the buffer exactly
+ // Repeat at every byte offset from the aligned base; BasicMmapTest::Match accepts either a Pire::Error or a working scanner.
+ for (size_t offset = 1; offset < MaxTestOffset; ++offset) {
+ ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
+ end = ptr + wbuf.Buffer().Size();
+ memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ BasicMmapTest::Match(fast, ptr, end - ptr, "regexp");
+ ptr = ptr + fastSize;
+ BasicMmapTest::Match(simple, ptr, end - ptr, "regexp");
+ ptr = ptr + simpleSize;
+ BasicMmapTest::Match(slow, ptr, end - ptr, "regexp");
+ }
+}
+
+Y_UNIT_TEST(TestShortcuts) // Matches buried in long runs of filler characters, at several positions.
+{
+ REGEXP("aaa") {
+ ACCEPTS("......................................aaa.............");
+ DENIES ("......................................aab.............");
+ DENIES ("......................................................");
+ }
+ REGEXP("[ab]{3}") {
+ ACCEPTS("......................................aaa.............");
+ ACCEPTS("......................................aab.............");
+ ACCEPTS("......................................bbb.............");
+ DENIES ("......................................................");
+ }
+ REGEXP2("\xD0\xB0", "u") { // a two-byte character placed at three different offsets
+ ACCEPTS("......................................\xD0\xB0...............");
+ ACCEPTS("...................................\xD0\xB0..................");
+ ACCEPTS("................................\xD0\xB0.....................");
+ }
+}
+
+template<class Scanner>
+void TestGlue() // Checks Scanner::Glue(): regexp count and which regexp indices are reported as accepted.
+{
+ Scanner sc1 = ParseRegexp("aaa").Compile<Scanner>();
+ Scanner sc2 = ParseRegexp("bbb").Compile<Scanner>();
+ Scanner glued = Scanner::Glue(sc1, sc2);
+ UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(2));
+ // Only "aaa" present: index 0 alone.
+ auto state = RunRegexp(glued, "aaa");
+ auto res = glued.AcceptedRegexps(state);
+ UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
+ UNIT_ASSERT_EQUAL(*res.first, size_t(0));
+ // Only "bbb" present: index 1 alone.
+ state = RunRegexp(glued, "bbb");
+ res = glued.AcceptedRegexps(state);
+ UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
+ UNIT_ASSERT_EQUAL(*res.first, size_t(1));
+ // Both present: indices 0 and 1, in that order.
+ state = RunRegexp(glued, "aaabbb");
+ res = glued.AcceptedRegexps(state);
+ UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(2));
+ UNIT_ASSERT_EQUAL(res.first[0], size_t(0));
+ UNIT_ASSERT_EQUAL(res.first[1], size_t(1));
+ // Neither present: empty range.
+ state = RunRegexp(glued, "ccc");
+ res = glued.AcceptedRegexps(state);
+ UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(0));
+ // Glue a third scanner in front of the already glued pair; it takes index 0.
+ Scanner sc3 = ParseRegexp("ccc").Compile<Scanner>();
+ glued = Scanner::Glue(sc3, glued);
+ UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(3));
+ // "ccc" is now accepted by index 0 only.
+ state = RunRegexp(glued, "ccc");
+ res = glued.AcceptedRegexps(state);
+ UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
+ UNIT_ASSERT_EQUAL(res.first[0], size_t(0));
+ Scanner sc4 = Scanner::Glue( // two patterns compiled with option "n": "ac" must satisfy neither
+ ParseRegexp("a", "n").Compile<Scanner>(),
+ ParseRegexp("c", "n").Compile<Scanner>()
+ );
+ state = RunRegexp(sc4, "ac");
+ res = sc4.AcceptedRegexps(state);
+ UNIT_ASSERT(res.second == res.first);
+ state = RunRegexp(sc4, "ac");
+ UNIT_ASSERT(!sc4.Final(state));
+}
+
+Y_UNIT_TEST(Glue) // Runs the TestGlue() checks for all eight multi-regexp scanner types.
+{
+ TestGlue<Pire::Scanner>();
+ TestGlue<Pire::NonrelocScanner>();
+ TestGlue<Pire::ScannerNoMask>();
+ TestGlue<Pire::NonrelocScannerNoMask>();
+ TestGlue<Pire::HalfFinalScanner>();
+ TestGlue<Pire::NonrelocHalfFinalScanner>();
+ TestGlue<Pire::HalfFinalScannerNoMask>();
+ TestGlue<Pire::NonrelocHalfFinalScannerNoMask>();
+}
+
+Y_UNIT_TEST(Slow) // SlowScanner on "a.{30}$": 'a' must be followed by exactly 30 characters up to the end.
+{
+ Pire::SlowScanner sc = ParseRegexp("a.{30}$", "").Compile<Pire::SlowScanner>();
+ // 123456789012345678901234567890
+ UNIT_ASSERT( Matches(sc, "....a.............................."));
+ UNIT_ASSERT(!Matches(sc, "....a...............................")); // one character too many
+ UNIT_ASSERT(!Matches(sc, "....a.............................")); // one character too few
+}
+
+struct astring: private std::vector<char> { // Test-only string kept in a std::vector<char> and always terminated by a NUL.
+ template <typename... A>
+ inline astring(A&&... a) {
+ const std::string text(std::forward<A>(a)...);
+ // Arguments go to std::string first; its characters are copied in and a terminator is appended.
+ assign(text.begin(), text.end());
+ push_back('\0');
+ }
+
+ inline char* c_str() noexcept {
+ return this->data();
+ }
+
+ friend astring operator+(astring l, const astring& r) {
+ const auto terminator = l.end() - 1; // position of l's trailing NUL
+ l.insert(terminator, r.begin(), r.end()); // r's own NUL is copied too, so l stays terminated at the joined text
+ return l;
+ }
+};
+
+Y_UNIT_TEST(Aligned) // Matching on buffers whose start or end falls on or off a pointer-size boundary.
+{
+ using ystring = astring; // within this test, ystring means the vector-backed astring
+ // Precondition for the cases below: the buffer handed out by c_str() starts pointer-aligned.
+ UNIT_ASSERT(Pire::Impl::IsAligned(ystring("x").c_str(), sizeof(void*)));
+
+ REGEXP("xy") {
+ // Short string with aligned head
+ ACCEPTS(ystring("xy").c_str());
+ DENIES (ystring("yz").c_str());
+ // Short string, unaligned
+ ACCEPTS(ystring(".xy").c_str() + 1);
+ DENIES (ystring(".yz").c_str() + 1);
+ // Short string with aligned tail
+ ACCEPTS((ystring(sizeof(void*) - 2, '.') + "xy").c_str() + sizeof(void*) - 2);
+ DENIES ((ystring(sizeof(void*) - 2, '.') + "yz").c_str() + sizeof(void*) - 2);
+ }
+
+ REGEXP("abcde") {
+ // Everything aligned, match occurs in the middle
+ ACCEPTS(ystring("ZZZZZabcdeZZZZZZ").c_str());
+ DENIES (ystring("ZZZZZabcdfZZZZZZ").c_str());
+ // Unaligned head
+ ACCEPTS(ystring(".ZabcdeZZZ").c_str() + 1);
+ DENIES (ystring(".ZxbcdeZZZ").c_str() + 1);
+ // Unaligned tail
+ ACCEPTS(ystring("ZZZZZZZZZZZZZabcde").c_str());
+ DENIES (ystring("ZZZZZZZZZZZZZabcdf").c_str());
+ }
+}
+
+#undef Run
+
+template <class Scanner>
+void BasicTestEmptySaveLoadMmap() // A default-constructed scanner must stay empty and runnable after Save, Load and Mmap.
+{
+ Scanner sc;
+ UNIT_ASSERT(sc.Empty());
+ UNIT_ASSERT_EQUAL(sc.RegexpsCount(), size_t(0));
+ UNIT_CHECKPOINT(); Pire::Runner(sc).Begin().Run("a string", 7).End(); // should not crash
+ // Serialize the empty scanner.
+ BufferOutput wbuf;
+ UNIT_CHECKPOINT(); Save(&wbuf, sc);
+ // Load path.
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ Scanner sc3;
+ /*UNIT_CHECKPOINT();*/ Load(&rbuf, sc3);
+ UNIT_ASSERT(sc3.Empty());
+ UNIT_CHECKPOINT(); Pire::Runner(sc3).Begin().Run("a string", 7).End();
+ // Mmap path: the returned pointer must land exactly at the end of the serialized bytes.
+ Scanner sc4;
+ /*UNIT_CHECKPOINT();*/ const char* ptr = (const char*) sc4.Mmap(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ UNIT_ASSERT(ptr == wbuf.Buffer().Data() + wbuf.Buffer().Size());
+ UNIT_ASSERT(sc4.Empty());
+ UNIT_CHECKPOINT(); Pire::Runner(sc4).Begin().Run("a string", 7).End();
+}
+
+Y_UNIT_TEST(EmptyScanner) // Empty (default-constructed) scanners must survive running, gluing and save/load/mmap round trips.
+{
+ // Tests for Scanner
+ BasicTestEmptySaveLoadMmap<Pire::Scanner>();
+ BasicTestEmptySaveLoadMmap<Pire::ScannerNoMask>();
+ BasicTestEmptySaveLoadMmap<Pire::HalfFinalScanner>();
+ BasicTestEmptySaveLoadMmap<Pire::HalfFinalScannerNoMask>();
+
+ Pire::Scanner sc;
+ Pire::Scanner scsc = Pire::Scanner::Glue(sc, sc); // empty glued with empty stays empty
+ UNIT_ASSERT(scsc.Empty());
+ UNIT_ASSERT_EQUAL(scsc.RegexpsCount(), size_t(0));
+ UNIT_CHECKPOINT(); Pire::Runner(scsc).Begin().Run("a string", 7).End();
+
+ Pire::Scanner sc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>();
+ UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1)); // an empty operand contributes no regexps
+ UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End();
+ UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(scsc, sc2).RegexpsCount(), size_t(1));
+ UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(scsc, sc2)).Begin().Run("a string", 7).End();
+ UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc).RegexpsCount(), size_t(1));
+ UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc)).Begin().Run("a string", 7).End();
+
+ // Tests for NonrelocScanner
+ Pire::NonrelocScanner nsc;
+ UNIT_ASSERT(nsc.Empty());
+ UNIT_ASSERT_EQUAL(nsc.RegexpsCount(), size_t(0));
+ UNIT_CHECKPOINT(); Pire::Runner(nsc).Begin().Run("a string", 7).End();
+
+ Pire::NonrelocScanner nsc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>();
+ UNIT_ASSERT_EQUAL(Pire::NonrelocScanner::Glue(nsc, nsc2).RegexpsCount(), size_t(1)); // empty + non-empty Nonreloc glue keeps exactly one regexp
+ UNIT_CHECKPOINT(); Pire::Runner(Pire::NonrelocScanner::Glue(nsc, nsc2)).Begin().Run("a string", 7).End();
+
+ { // Save/Load round trip of the empty NonrelocScanner
+ BufferOutput wbuf;
+ UNIT_CHECKPOINT(); Save(&wbuf, nsc);
+
+ MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
+ Pire::NonrelocScanner nsc3;
+ /*UNIT_CHECKPOINT();*/ Load(&rbuf, nsc3);
+ UNIT_ASSERT(nsc3.Empty());
+ UNIT_CHECKPOINT(); Pire::Runner(nsc3).Begin().Run("a string", 7).End();
+ }
+
+ BasicTestEmptySaveLoadMmap<Pire::SimpleScanner>();
+
+ BasicTestEmptySaveLoadMmap<Pire::SlowScanner>();
+}
+
+Y_UNIT_TEST(NullPointer) // Running over a null [begin, end) range must be harmless; the test only checks that the call completes.
+{
+ const char* null = 0;
+ Pire::Scanner sc = Pire::Fsm().Compile<Pire::Scanner>();
+ Pire::Runner(sc).Begin().Run(null, null).End();
+}
+
+}
diff --git a/contrib/libs/pire/ut/read_unicode_ut.cpp b/contrib/libs/pire/ut/read_unicode_ut.cpp
new file mode 100644
index 0000000000..f0433401c7
--- /dev/null
+++ b/contrib/libs/pire/ut/read_unicode_ut.cpp
@@ -0,0 +1,298 @@
+/*
+ * read_unicode_ut.cpp --
+ *
+ * Copyright (c) 2019 YANDEX LLC
+ * Author: Karina Usmanova <usmanova.karin@yandex.ru>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+
+#include <pire.h>
+#include "stub/cppunit.h"
+#include "common.h"
+
+Y_UNIT_TEST_SUITE(ReadUnicodeTest) {
+ ystring CreateStringWithZeroSymbol(const char* str, size_t pos) { // Copy of `str` whose character at index `pos` is replaced by a NUL byte.
+ ystring withZero = str;
+ Y_ASSERT(pos < withZero.size()); // `pos` has to address an existing character
+ withZero[pos] = '\0';
+ return withZero;
+ }
+
+ Y_UNIT_TEST(ZeroSymbol) // \x{0} and \x00 must match a NUL byte embedded in the text.
+ {
+ REGEXP("\\x{0}") {
+ ACCEPTS(CreateStringWithZeroSymbol("a", 0));
+ ACCEPTS(CreateStringWithZeroSymbol("some text", 3));
+ DENIES("string without zero");
+ }
+
+ REGEXP("the\\x00middle") {
+ ACCEPTS(CreateStringWithZeroSymbol("in the middle", 6)); // the space between the words becomes NUL
+ DENIES(CreateStringWithZeroSymbol("in the middle", 5)); // NUL in the wrong position
+ DENIES("in the middle");
+ }
+ }
+
+ Y_UNIT_TEST(SymbolsByCodes) // Hex escapes stand for the character with that code: \x{41} is 'A', \x26 is '&'.
+ {
+ REGEXP("\\x{41}") {
+ ACCEPTS("A");
+ ACCEPTS("tAst string");
+ DENIES("test string");
+ }
+
+ REGEXP("\\x26abc") {
+ ACCEPTS("&abc;");
+ DENIES("test &ab");
+ DENIES("without");
+ }
+ }
+
+ Y_UNIT_TEST(ErrorsWhileCompiling) // Malformed \x escapes must be reported as errors; well-formed ones must not.
+ {
+ UNIT_ASSERT(HasError("\\x")); // short form: missing or non-hex digits
+ UNIT_ASSERT(HasError("\\x0"));
+ UNIT_ASSERT(HasError("\\xfu"));
+ UNIT_ASSERT(HasError("\\xs1"));
+ UNIT_ASSERT(HasError("\\x 0"));
+ UNIT_ASSERT(HasError("\\x0 "));
+ // Unbalanced or misplaced braces.
+ UNIT_ASSERT(HasError("\\x{2A1"));
+ UNIT_ASSERT(HasError("\\x{"));
+ UNIT_ASSERT(HasError("\\x}"));
+ UNIT_ASSERT(HasError("\\x2}"));
+ UNIT_ASSERT(HasError("\\x{{3}"));
+ UNIT_ASSERT(HasError("\\x{2a{5}"));
+ // Invalid content inside the braces, or a code above 10FFFF.
+ UNIT_ASSERT(HasError("\\x{}"));
+ UNIT_ASSERT(HasError("\\x{+3}"));
+ UNIT_ASSERT(HasError("\\x{-3}"));
+ UNIT_ASSERT(HasError("\\x{ 2F}"));
+ UNIT_ASSERT(HasError("\\x{2A F}"));
+ UNIT_ASSERT(HasError("\\x{2Arft}"));
+ UNIT_ASSERT(HasError("\\x{110000}"));
+ // Well-formed escapes, including mixed-case hex digits and the 10FFFF upper bound.
+ UNIT_ASSERT(!HasError("\\x{fB1}"));
+ UNIT_ASSERT(!HasError("\\x00"));
+ UNIT_ASSERT(!HasError("\\x{10FFFF}"));
+ }
+
+ Y_UNIT_TEST(OneCharacterRange) // A character class holding a single hex-escaped character.
+ {
+ SCANNER("[\\x{61}]") { // 'a'
+ ACCEPTS("a");
+ ACCEPTS("bac");
+ DENIES("test");
+ }
+
+ SCANNER("[\\x3f]") { // '?'
+ ACCEPTS("?");
+ ACCEPTS("test?");
+ DENIES("test");
+ }
+ }
+
+ Y_UNIT_TEST(CharacterRange) { // Character classes built from hex escapes: lists, ranges and combinations of both.
+ REGEXP("[\\x{61}\\x62\\x{3f}\\x26]") { // list: a, b, ?, &
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("?");
+ ACCEPTS("acd");
+ ACCEPTS("bcd");
+ ACCEPTS("cd?");
+ ACCEPTS("ab?");
+ DENIES("cd");
+ }
+
+ REGEXP("[\\x{61}-\\x{63}]") { // range a-c
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("qwertya");
+ DENIES("d");
+ }
+
+ REGEXP("[\\x61-\\x61]") { // degenerate range with equal bounds
+ ACCEPTS("a");
+ ACCEPTS("qwertya");
+ DENIES("b");
+ }
+
+ REGEXP("[\\x26\\x{61}-\\x{62}\\x{3f}]") { // single characters mixed with a range
+ ACCEPTS("&");
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("?");
+ ACCEPTS("ade");
+ ACCEPTS("ab?");
+ DENIES("d");
+ }
+
+ REGEXP("[\\x{41}-\\x{42}\\x{61}-\\x{62}]") { // two ranges in one class: A-B and a-b
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("A");
+ ACCEPTS("B");
+ DENIES("c");
+ DENIES("C");
+ }
+
+ REGEXP("[\\x{41}-\\x{42}][\\x{61}-\\x{62}]") { // two consecutive classes: A-B followed by a-b
+ ACCEPTS("Aa");
+ ACCEPTS("Ab");
+ ACCEPTS("Ba");
+ ACCEPTS("Bb");
+ DENIES("a");
+ DENIES("b");
+ DENIES("A");
+ DENIES("B");
+ DENIES("ab");
+ DENIES("AB");
+ DENIES("Ca");
+ }
+ }
+
+ Y_UNIT_TEST(RangeExcludeCharacters) { // Negated classes with hex escapes: accepted when any character lies outside the excluded set.
+ REGEXP("[^\\x{61}]") { // anything but 'a'
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("aba");
+ DENIES("a");
+ DENIES("aaa");
+ }
+
+ REGEXP("[^\\x{61}-\\x{7a}]") { // anything outside a-z
+ ACCEPTS("A");
+ ACCEPTS("123");
+ ACCEPTS("acb1");
+ DENIES("a");
+ DENIES("abcxyz");
+ }
+ }
+
+ Y_UNIT_TEST(MixedRange) { // Classes mixing hex escapes with literal characters, including as range bounds.
+ REGEXP("[\\x{61}B]") { // 'a' or 'B'
+ ACCEPTS("a");
+ ACCEPTS("B");
+ ACCEPTS("atestB");
+ DENIES("test");
+ }
+
+ REGEXP("[^\\x{61}A]") { // anything but 'a' and 'A'
+ ACCEPTS("b");
+ ACCEPTS("B");
+ ACCEPTS("atestB");
+ DENIES("a");
+ DENIES("A");
+ DENIES("aaAA");
+ }
+
+ REGEXP("[0-9][\\x{61}-\\x{62}A-B]") { // a digit followed by one of a, b, A, B
+ ACCEPTS("0a");
+ ACCEPTS("1A");
+ ACCEPTS("5b");
+ ACCEPTS("9B");
+ ACCEPTS("1atestB");
+ ACCEPTS("2Atest");
+ DENIES("aB");
+ DENIES("testb");
+ DENIES("test");
+ }
+
+ REGEXP("[\\x{61}-c]") { // hex lower bound, literal upper bound
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("testb");
+ DENIES("d");
+ }
+
+ REGEXP("[^a-\\x{7a}]") { // literal lower bound, hex upper bound, negated
+ ACCEPTS("A");
+ ACCEPTS("123");
+ ACCEPTS("acb1");
+ DENIES("a");
+ DENIES("abcxyz");
+ }
+
+ REGEXP("[\\x{41}-Ba-\\x{62}]") { // two mixed-bound ranges: A-B and a-b
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("A");
+ ACCEPTS("B");
+ DENIES("c");
+ DENIES("C");
+ }
+ }
+
+ Y_UNIT_TEST(CompilingRange) // Which character classes with hex escapes are rejected and which are accepted.
+ {
+ UNIT_ASSERT(HasError("[\\x41")); // unterminated class
+ UNIT_ASSERT(HasError("[\\xfq]")); // non-hex digit
+ UNIT_ASSERT(HasError("[\\x{01}-]")); // range without an upper bound
+ // Valid single characters and ranges.
+ UNIT_ASSERT(!HasError("[\\x{10FFFF}]"));
+ UNIT_ASSERT(!HasError("[\\x{00}]"));
+ UNIT_ASSERT(!HasError("[\\x{abc}-\\x{FFF}]"));
+ // Valid negated classes, and a class starting with '-'.
+ UNIT_ASSERT(!HasError("[^\\xFF]"));
+ UNIT_ASSERT(!HasError("[^\\x{FF}-\\x{FF0}]"));
+ UNIT_ASSERT(!HasError("[-\\x{01}]"));
+ }
+
+ Y_UNIT_TEST(UnicodeRepetition) // Repetition counts applied to hex-escaped characters (\x{78} is 'x', \x{79} is 'y').
+ {
+ REGEXP("^\\x{78}{3,6}$") { // three to six 'x'
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxx");
+ ACCEPTS("xxxxxx");
+ DENIES ("xxxxxxx");
+ }
+
+ REGEXP("^x{3,}$") { // this pattern uses a literal 'x', not a hex escape
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxxxxxxxx");
+ ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+
+ REGEXP("^\\x{78}{3}$") { // exactly three 'x'
+ DENIES ("x");
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ DENIES ("xxxx");
+ DENIES ("xxxxx");
+ DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+
+ REGEXP("^([\\x{78}-\\x{79}]){2}$") { // exactly two characters, each 'x' or 'y'
+ DENIES("x");
+ DENIES("y");
+ ACCEPTS("xx");
+ ACCEPTS("xy");
+ ACCEPTS("yx");
+ ACCEPTS("yy");
+ DENIES("xxy");
+ DENIES("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+ }
+
+}
diff --git a/contrib/libs/pire/ut/stub/cppunit.h b/contrib/libs/pire/ut/stub/cppunit.h
new file mode 100644
index 0000000000..6d15ce0912
--- /dev/null
+++ b/contrib/libs/pire/ut/stub/cppunit.h
@@ -0,0 +1,14 @@
+#ifndef PIRE_STUB_CPPUNIT_H_INCLUDED
+#define PIRE_STUB_CPPUNIT_H_INCLUDED
+
+#include <library/cpp/testing/unittest/registar.h>
+#include <util/stream/mem.h>
+
+#define UNIT_CHECKPOINT() /* Expands to nothing, so checkpoint calls in the tests compile away under this stub. */
+
+typedef TMemoryInput MemoryInput; // Lets the tests keep writing `MemoryInput` while using TMemoryInput from util/stream/mem.h.
+
+#endif
+
+
+