diff options
author | monster <monster@ydb.tech> | 2022-07-07 14:41:37 +0300 |
---|---|---|
committer | monster <monster@ydb.tech> | 2022-07-07 14:41:37 +0300 |
commit | 06e5c21a835c0e923506c4ff27929f34e00761c2 (patch) | |
tree | 75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /contrib/libs/pire | |
parent | 03f024c4412e3aa613bb543cf1660176320ba8f4 (diff) | |
download | ydb-06e5c21a835c0e923506c4ff27929f34e00761c2.tar.gz |
fix ya.make
Diffstat (limited to 'contrib/libs/pire')
-rw-r--r-- | contrib/libs/pire/Makefile.am | 2 | ||||
-rw-r--r-- | contrib/libs/pire/README | 6 | ||||
-rw-r--r-- | contrib/libs/pire/configure.ac | 47 | ||||
-rw-r--r-- | contrib/libs/pire/pire/Makefile.am | 121 | ||||
-rw-r--r-- | contrib/libs/pire/ut/approx_matching_ut.cpp | 379 | ||||
-rw-r--r-- | contrib/libs/pire/ut/capture_ut.cpp | 299 | ||||
-rw-r--r-- | contrib/libs/pire/ut/common.h | 224 | ||||
-rw-r--r-- | contrib/libs/pire/ut/count_ut.cpp | 583 | ||||
-rw-r--r-- | contrib/libs/pire/ut/easy_ut.cpp (renamed from contrib/libs/pire/pire/fwd.h) | 47 | ||||
-rw-r--r-- | contrib/libs/pire/ut/glyph_ut.cpp | 63 | ||||
-rw-r--r-- | contrib/libs/pire/ut/inline_ut.cpp | 91 | ||||
-rw-r--r-- | contrib/libs/pire/ut/pire_ut.cpp | 888 | ||||
-rw-r--r-- | contrib/libs/pire/ut/read_unicode_ut.cpp | 298 | ||||
-rw-r--r-- | contrib/libs/pire/ut/stub/cppunit.h | 14 |
14 files changed, 2870 insertions, 192 deletions
diff --git a/contrib/libs/pire/Makefile.am b/contrib/libs/pire/Makefile.am deleted file mode 100644 index a9e8908fb6..0000000000 --- a/contrib/libs/pire/Makefile.am +++ /dev/null @@ -1,2 +0,0 @@ -ACLOCAL_AMFLAGS = -I m4 -SUBDIRS = pire tests pkg samples diff --git a/contrib/libs/pire/README b/contrib/libs/pire/README deleted file mode 100644 index 1791486f8e..0000000000 --- a/contrib/libs/pire/README +++ /dev/null @@ -1,6 +0,0 @@ -This is PIRE, Perl Incompatible Regular Expressions library. - -For detailed information about what it is, how to build and use it, -see http://wiki.yandex-team.ru/DmitrijjProkopcev/pire . - -Please report bugs to dprokoptsev@yandex-team.ru or davenger@yandex-team.ru. diff --git a/contrib/libs/pire/configure.ac b/contrib/libs/pire/configure.ac deleted file mode 100644 index 49f235129c..0000000000 --- a/contrib/libs/pire/configure.ac +++ /dev/null @@ -1,47 +0,0 @@ -AC_PREREQ([2.63]) -AC_INIT([pire], [0.0.2], [dprokoptsev@yandex-team.ru]) -AM_INIT_AUTOMAKE([foreign -Wall]) -AC_CONFIG_SRCDIR([pire/classes.cpp]) -AC_CONFIG_HEADERS([config.h]) -AC_CONFIG_MACRO_DIR([m4]) - -AC_LANG_CPLUSPLUS - -# Require neccessary binaries to build ourselves -AC_PROG_CXX -AC_PROG_CC -AC_PROG_LEX -AC_PROG_YACC -AC_PROG_LIBTOOL - -# Check for cppunit -AM_PATH_CPPUNIT([0.0.0],[with_unittests=yes],[ - AC_WARN([cppunit not found. Unit tests will not compile and run.]) - with_unittests=no -]) -AM_CONDITIONAL([WITH_UNITTESTS], [test x"$with_unittests" = xyes]) - -# Just for conscience' sake -AC_CHECK_HEADERS([stdlib.h string.h sys/time.h]) -AC_HEADER_STDBOOL -AC_C_INLINE -AC_TYPE_SIZE_T -AC_CHECK_TYPES([ptrdiff_t]) -AC_FUNC_ERROR_AT_LINE -AC_FUNC_MALLOC -AC_CHECK_FUNCS([memset strchr]) - -# Require little-endian platform -AC_C_BIGENDIAN -if test x"$ac_cv_c_bigendian" = xyes; then - AC_ERROR([pire has not been ported to big-endian platforms yet.]) -fi - -# Optional features -AC_ARG_ENABLE([extra], AS_HELP_STRING([--enable-extra], [Add extra functionality (capturing scanner, etc...)])) -AC_ARG_ENABLE([debug], AS_HELP_STRING([--enable-debug], [Make Pire dump all constructed FSMs to std::clog (useless unless debugging Pire)])) -AM_CONDITIONAL([ENABLE_EXTRA], [test x"$enable_extra" = xyes]) -AM_CONDITIONAL([ENABLE_DEBUG], [test x"$enable_debug" = xyes]) - -AC_CONFIG_FILES([Makefile pire/Makefile tests/Makefile pkg/Makefile samples/Makefile samples/bench/Makefile]) -AC_OUTPUT diff --git a/contrib/libs/pire/pire/Makefile.am b/contrib/libs/pire/pire/Makefile.am deleted file mode 100644 index 09ef211704..0000000000 --- a/contrib/libs/pire/pire/Makefile.am +++ /dev/null @@ -1,121 +0,0 @@ - -AM_CXXFLAGS = -Wall -if ENABLE_DEBUG -AM_CXXFLAGS += -DPIRE_DEBUG -endif -if ENABLE_CHECKED -AM_CXXFLAGS += -DPIRE_CHECKED -endif - -lib_LTLIBRARIES = libpire.la -libpire_la_SOURCES = \ - align.h \ - any.h \ - classes.cpp \ - defs.h \ - determine.h \ - encoding.cpp \ - encoding.h \ - extra.h \ - fsm.cpp \ - fsm.h \ - fwd.h \ - glue.cpp \ - glue.h \ - minimize.h \ - half_final_fsm.cpp \ - half_final_fsm.h \ - partition.h \ - pire.h \ - re_lexer.cpp \ - re_lexer.h \ - run.h \ - scanner_io.cpp \ - vbitset.h \ - re_parser.ypp \ - scanners/half_final.h \ - scanners/loaded.h \ - scanners/multi.h \ - scanners/slow.h \ - scanners/simple.h \ - scanners/common.h \ - scanners/pair.h \ - stub/stl.h \ - stub/lexical_cast.h \ - stub/saveload.h \ - stub/singleton.h \ - stub/utf8.cpp \ - stub/utf8.h \ - stub/noncopyable.h \ - stub/codepage_h.h \ - stub/doccodes_h.h \ - stub/unidata_h.h \ - stub/unidata_cpp.h - -if ENABLE_EXTRA -libpire_la_SOURCES += \ - extra/capture.cpp \ - extra/capture.h \ - extra/count.cpp \ - extra/count.h \ - extra/glyphs.cpp \ - extra/glyphs.h -endif - -pire_hdrdir = $(includedir)/pire -pire_hdr_HEADERS = \ - align.h \ - any.h \ - defs.h \ - determine.h \ - encoding.h \ - extra.h \ - fsm.h \ - fwd.h \ - glue.h \ - minimize.h \ - half_final_fsm.h \ - partition.h \ - pire.h \ - re_lexer.h \ - re_parser.h \ - run.h \ - static_assert.h \ - vbitset.h - -if ENABLE_EXTRA -pire_extradir = $(includedir)/pire/extra -pire_extra_HEADERS = \ - extra/capture.h \ - extra/count.h \ - extra/glyphs.h -endif - -pire_scannersdir = $(includedir)/pire/scanners -pire_scanners_HEADERS = \ - scanners/common.h \ - scanners/half_final.h \ - scanners/multi.h \ - scanners/slow.h \ - scanners/simple.h \ - scanners/loaded.h \ - scanners/pair.h - -pire_stubdir = $(includedir)/pire/stub -pire_stub_HEADERS = \ - stub/stl.h \ - stub/defaults.h \ - stub/singleton.h \ - stub/saveload.h \ - stub/lexical_cast.h - -bin_PROGRAMS = pire_inline - -pire_inline_SOURCES = inline.lpp stub/hacks.h stub/memstreams.h -pire_inline_LDADD = libpire.la - -BUILT_SOURCES = re_parser.h re_parser.cpp -CLEANFILES = re_parser.h re_parser.cpp - -AM_YFLAGS = -d - diff --git a/contrib/libs/pire/ut/approx_matching_ut.cpp b/contrib/libs/pire/ut/approx_matching_ut.cpp new file mode 100644 index 0000000000..3b4cb972f6 --- /dev/null +++ b/contrib/libs/pire/ut/approx_matching_ut.cpp @@ -0,0 +1,379 @@ +/* + * approx_matching_ut.cpp -- + * + * Copyright (c) 2019 YANDEX LLC, Karina Usmanova <usmanova.karin@yandex.ru> + * + * This file is part of Pire, the Perl Incompatible + * Regular Expressions library. + * + * Pire is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pire is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * You should have received a copy of the GNU Lesser Public License + * along with Pire. If not, see <http://www.gnu.org/licenses>. + */ + + +#include <contrib/libs/pire/pire/pire.h> +#include "common.h" + +Y_UNIT_TEST_SUITE(ApproxMatchingTest) { + Pire::Fsm BuildFsm(const char *str) + { + Pire::Lexer lexer; + TVector<wchar32> ucs4; + + lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4)); + lexer.Assign(ucs4.begin(), ucs4.end()); + return lexer.Parse(); + } + + Y_UNIT_TEST(Simple) { + auto fsm = BuildFsm("^ab$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("ab"); + ACCEPTS("ax"); + ACCEPTS("xb"); + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("xab"); + ACCEPTS("axb"); + ACCEPTS("abx"); + ACCEPTS("aab"); + DENIES("xy"); + DENIES("abcd"); + DENIES("xabx"); + DENIES(""); + } + + fsm = BuildFsm("^ab$"); + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS("ab"); + ACCEPTS("xy"); + ACCEPTS(""); + ACCEPTS("axbx"); + DENIES("xxabx"); + DENIES("xbxxx"); + } + } + + Y_UNIT_TEST(SpecialSymbols) { + auto fsm = BuildFsm("^.*ab$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("ab"); + ACCEPTS("xxxxab"); + ACCEPTS("xxxxabab"); + DENIES("xxxx"); + DENIES("abxxxx"); + } + + fsm = BuildFsm("^[a-c]$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("c"); + ACCEPTS("/"); + ACCEPTS(""); + ACCEPTS("ax"); + DENIES("xx"); + DENIES("abc"); + } + + fsm = BuildFsm("^x{4}$"); + APPROXIMATE_SCANNER(fsm, 2) { + DENIES ("x"); + ACCEPTS("xx"); + ACCEPTS("xxx"); + ACCEPTS("xxxx"); + ACCEPTS("xxxxx"); + ACCEPTS("xxxxxx"); + DENIES ("xxxxxxx"); + ACCEPTS("xxyy"); + ACCEPTS("xxyyx"); + ACCEPTS("xxxxyz"); + DENIES("xyyy"); + } + + fsm = BuildFsm("^(a|b)$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("x"); + ACCEPTS(""); + ACCEPTS("ax"); + DENIES("abc"); + DENIES("xx"); + } + + fsm = BuildFsm("^(ab|cd)$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("ab"); + ACCEPTS("cd"); + ACCEPTS("ax"); + ACCEPTS("xd"); + ACCEPTS("abx"); + ACCEPTS("a"); + DENIES("abcd"); + DENIES("xx"); + DENIES(""); + } + + fsm = BuildFsm("^[a-c]{3}$"); + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS("abc"); + ACCEPTS("aaa"); + ACCEPTS("a"); + ACCEPTS("ax"); + ACCEPTS("abxcx"); + DENIES("x"); + DENIES(""); + DENIES("xaxx"); + } + + fsm = BuildFsm("^\\x{61}$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("x"); + ACCEPTS(""); + ACCEPTS("ax"); + DENIES("axx"); + DENIES("xx"); + } + + fsm = BuildFsm("^a.bc$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("axxbc"); + ACCEPTS("abc"); + ACCEPTS("xabc"); + ACCEPTS("xaxbc"); + DENIES("bc"); + DENIES("abcx"); + } + } + + Y_UNIT_TEST(TestSurrounded) { + auto fsm = BuildFsm("abc").Surround(); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("abc"); + ACCEPTS("xabcx"); + ACCEPTS("xabx"); + ACCEPTS("axc"); + ACCEPTS("bac"); + DENIES("a"); + DENIES("xaxxxx"); + } + + fsm = BuildFsm("^abc$").Surround(); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("abc"); + ACCEPTS("abcx"); + ACCEPTS("xabc"); + ACCEPTS("axc"); + ACCEPTS("bac"); + DENIES("xabx"); + DENIES("axx"); + } + } + + Y_UNIT_TEST(GlueFsm) { + auto fsm = BuildFsm("^a$") | BuildFsm("^b$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(""); + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("x"); + ACCEPTS("ab"); + DENIES("abb"); + } + + fsm = BuildFsm("^[a-b]$") | BuildFsm("^c{2}$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("cc"); + ACCEPTS("x"); + ACCEPTS("xa"); + ACCEPTS("c"); + ACCEPTS("xc"); + ACCEPTS("cxc"); + ACCEPTS(""); + } + } + + enum MutateOperation { + Begin, + Substitute = Begin, + Delete, + Insert, + End + }; + + ystring ChangeText(const ystring& text, int operation, int pos) + { + auto changedText = text; + switch (operation) { + case MutateOperation::Substitute: + changedText[pos] = 'x'; + break; + case MutateOperation::Delete: + changedText.erase(pos, 1); + break; + case MutateOperation::Insert: + changedText.insert(pos, 1, 'x'); + break; + } + + return changedText; + } + + Y_UNIT_TEST(StressTest) { + ystring text; + for (size_t letter = 0; letter < 10; ++letter) { + text += ystring(3, letter + 'a'); + } + const ystring regexp = "^" + text + "$"; + auto fsm = BuildFsm(regexp.Data()); + + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(text); + + for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { + for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) { + auto changedText = ChangeText(text, operation, pos); + ACCEPTS(changedText); + } + } + } + + APPROXIMATE_SCANNER(fsm, 0) { + ACCEPTS(text); + + for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { + for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) { + auto changedText = ChangeText(text, operation, pos); + DENIES(changedText); + } + } + } + + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS(text); + + for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight + size_t posRight = text.size() - posLeft - 1; + for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) { + for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) { + auto changedText = ChangeText(text, operationRight, posRight); + changedText = ChangeText(changedText, operationLeft, posLeft); + ACCEPTS(changedText); + } + } + } + } + + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(text); + + for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight + size_t posRight = text.size() - posLeft - 1; + for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) { + for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) { + auto changedText = ChangeText(text, operationRight, posRight); + changedText = ChangeText(changedText, operationLeft, posLeft); + DENIES(changedText); + } + } + } + } + } + + Y_UNIT_TEST(SwapLetters) { + auto fsm = BuildFsm("^abc$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("bac"); + ACCEPTS("acb"); + DENIES("cba"); + DENIES("bax"); + } + + fsm = BuildFsm("^abcd$"); + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS("bacd"); + ACCEPTS("acbd"); + ACCEPTS("baxd"); + ACCEPTS("badc"); + ACCEPTS("bcad"); + ACCEPTS("bcda"); + DENIES("xcbx"); + DENIES("baxx"); + DENIES("ba"); + DENIES("cdab"); + } + + fsm = BuildFsm("^abc$"); + APPROXIMATE_SCANNER(fsm, 0) { + ACCEPTS("abc"); + DENIES("bac"); + } + + fsm = BuildFsm("^[a-c][1-3]$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a3"); + ACCEPTS("c"); + ACCEPTS("1"); + ACCEPTS("1a"); + ACCEPTS("3b"); + DENIES("4a"); + } + + fsm = BuildFsm("^.*abc$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("ab"); + ACCEPTS("xxxxbac"); + DENIES("xxxxa"); + DENIES("xxxxcb"); + } + } + + Y_UNIT_TEST(SwapStressTest){ + ystring text; + for (size_t letter = 0; letter < 30; ++letter) { + text += ystring(1, (letter % 26) + 'a'); + } + const ystring regexp = "^" + text + "$"; + auto fsm = BuildFsm(regexp.Data()); + auto changedText = text; + + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(text); + + for (size_t pos = 0; pos < text.size() - 1; ++pos) { + changedText[pos] = text[pos + 1]; + changedText[pos + 1] = text[pos]; + ACCEPTS(changedText); + changedText[pos] = text[pos]; + changedText[pos + 1] = text[pos + 1]; + } + } + + APPROXIMATE_SCANNER(fsm, 0) { + ACCEPTS(text); + + for (size_t pos = 0; pos < text.size() - 1; ++pos) { + changedText[pos] = text[pos + 1]; + changedText[pos + 1] = text[pos]; + DENIES(changedText); + changedText[pos] = text[pos]; + changedText[pos + 1] = text[pos + 1]; + } + } + } +} diff --git a/contrib/libs/pire/ut/capture_ut.cpp b/contrib/libs/pire/ut/capture_ut.cpp new file mode 100644 index 0000000000..3d339c5601 --- /dev/null +++ b/contrib/libs/pire/ut/capture_ut.cpp @@ -0,0 +1,299 @@ +/* + * capture_ut.cpp -- + * + * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>, + * Alexander Gololobov <agololobov@gmail.com> + * + * This file is part of Pire, the Perl Incompatible + * Regular Expressions library. + * + * Pire is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pire is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * You should have received a copy of the GNU Lesser Public License + * along with Pire. If not, see <http://www.gnu.org/licenses>. + */ + + +#include <stub/hacks.h> +#include <stub/saveload.h> +#include <stub/utf8.h> +#include <stub/memstreams.h> +#include "stub/cppunit.h" +#include <pire.h> +#include <extra.h> +#include <string.h> + +Y_UNIT_TEST_SUITE(TestPireCapture) { + + using Pire::CapturingScanner; + using Pire::SlowCapturingScanner; + typedef Pire::CapturingScanner::State State; + + CapturingScanner Compile(const char* regexp, int index) + { + Pire::Lexer lexer; + + lexer.Assign(regexp, regexp + strlen(regexp)); + lexer.AddFeature(Pire::Features::CaseInsensitive()); + lexer.AddFeature(Pire::Features::Capture((size_t) index)); + + Pire::Fsm fsm = lexer.Parse(); + + fsm.Surround(); + fsm.Determine(); + return fsm.Compile<Pire::CapturingScanner>(); + } + + SlowCapturingScanner SlowCompile(const char* regexp, int index, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) + { + Pire::Lexer lexer; + lexer.AddFeature(Pire::Features::Capture(static_cast<size_t>(index))); + lexer.SetEncoding(encoding); + TVector<wchar32> ucs4; + encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4)); + lexer.Assign(ucs4.begin(), ucs4.end()); + Pire::Fsm fsm = lexer.Parse(); + fsm.Surround(); + return fsm.Compile<Pire::SlowCapturingScanner>(); + } + + State RunRegexp(const CapturingScanner& scanner, const char* str) + { + State state; + scanner.Initialize(state); + Step(scanner, state, Pire::BeginMark); + Run(scanner, state, str, str + strlen(str)); + Step(scanner, state, Pire::EndMark); + return state; + } + + SlowCapturingScanner::State RunRegexp(const SlowCapturingScanner& scanner, const char* str) + { + SlowCapturingScanner::State state; + scanner.Initialize(state); + Run(scanner, state, str, str + strlen(str)); + return state; + } + + ystring Captured(const State& state, const char* str) + { + if (state.Captured()) + return ystring(str + state.Begin() - 1, str + state.End() - 1); + else + return ystring(); + } + + Y_UNIT_TEST(Trivial) + { + CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1); + State state; + const char* str; + + str = "google_id = 'abcde';"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); + + str = "var google_id = 'abcde'; eval(google_id);"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); + + str = "google_id != 'abcde';"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(!state.Captured()); + } + + Y_UNIT_TEST(Sequential) + { + CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1); + State state; + const char* str; + + str = "google_id = 'abcde'; google_id = 'xyz';"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("abcde")); + + str = "var google_id = 'abc de'; google_id = 'xyz';"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("xyz")); + } + + Y_UNIT_TEST(NegatedTerminator) + { + CapturingScanner scanner = Compile("=(\\d+)[^\\d]", 1); + State state; + const char* str; + + str = "=12345;"; + state = RunRegexp(scanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_EQUAL(Captured(state, str), ystring("12345")); + } + + Y_UNIT_TEST(Serialization) + { + const char* regex = "google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;"; + CapturingScanner scanner2 = Compile(regex, 1); + SlowCapturingScanner slowScanner2 = SlowCompile(regex, 1); + BufferOutput wbuf, wbuf2; + ::Save(&wbuf, scanner2); + ::Save(&wbuf2, slowScanner2); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + MemoryInput rbuf2(wbuf2.Buffer().Data(), wbuf2.Buffer().Size()); + CapturingScanner scanner; + SlowCapturingScanner slowScanner; + ::Load(&rbuf, scanner); + ::Load(&rbuf2, slowScanner); + + State state; + SlowCapturingScanner::State slowState; + const char* str; + + str = "google_id = 'abcde';"; + state = RunRegexp(scanner, str); + slowState = RunRegexp(slowScanner, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); + SlowCapturingScanner::SingleState final; + UNIT_ASSERT(slowScanner.GetCapture(slowState, final)); + ystring ans(str, final.GetBegin(), final.GetEnd() - final.GetBegin()); + UNIT_ASSERT_EQUAL(ans, ystring("abcde")); + + str = "google_id != 'abcde';"; + state = RunRegexp(scanner, str); + slowState = RunRegexp(slowScanner, str); + UNIT_ASSERT(!state.Captured()); + UNIT_ASSERT(!slowScanner.GetCapture(slowState, final)); + + CapturingScanner scanner3; + const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); + TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); + const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + const void* tail = scanner3.Mmap(ptr, wbuf.Buffer().Size()); + UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); + + str = "google_id = 'abcde';"; + state = RunRegexp(scanner3, str); + UNIT_ASSERT(state.Captured()); + UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde")); + + str = "google_id != 'abcde';"; + state = RunRegexp(scanner3, str); + UNIT_ASSERT(!state.Captured()); + + ptr = (const void*) ((const char*) wbuf.Buffer().Data() + 1); + try { + scanner3.Mmap(ptr, wbuf.Buffer().Size()); + UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping"); + } + catch (Pire::Error&) {} + + for (size_t offset = 1; offset < MaxTestOffset; ++offset) { + ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset; + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + try { + scanner3.Mmap(ptr, wbuf.Buffer().Size()); + if (offset % sizeof(size_t) != 0) { + UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping"); + } else { + str = "google_id = 'abcde';"; + state = RunRegexp(scanner3, str); + UNIT_ASSERT(state.Captured()); + } + } + catch (Pire::Error&) {} + } + } + + Y_UNIT_TEST(Empty) + { + Pire::CapturingScanner sc; + UNIT_ASSERT(sc.Empty()); + + UNIT_CHECKPOINT(); RunRegexp(sc, "a string"); // Just should not crash + + // Test Save/Load/Mmap + BufferOutput wbuf; + ::Save(&wbuf, sc); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + Pire::CapturingScanner sc3; + ::Load(&rbuf, sc3); + UNIT_CHECKPOINT(); RunRegexp(sc3, "a string"); + + const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); + TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); + const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + + Pire::CapturingScanner sc4; + const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size()); + UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); + UNIT_CHECKPOINT(); RunRegexp(sc4, "a string"); + } + + void MakeSlowCapturingTest(const char* regexp, const char* text, size_t position, bool ans, const ystring& captured = ystring(""), const Pire::Encoding& encoding = Pire::Encodings::Utf8()) + { + Pire::SlowCapturingScanner sc = SlowCompile(regexp, position, encoding); + SlowCapturingScanner::State st = RunRegexp(sc, text); + SlowCapturingScanner::SingleState fin; + bool ifCaptured = sc.GetCapture(st, fin); + if (ans) { + UNIT_ASSERT(ifCaptured); + ystring answer(text, fin.GetBegin(), fin.GetEnd() - fin.GetBegin()); + UNIT_ASSERT_EQUAL(answer, captured); + } else { + UNIT_ASSERT(!ifCaptured); + } + } + + Y_UNIT_TEST(SlowCapturingNonGreedy) + { + const char* regexp = ".*?(pref.*suff)"; + const char* text = "pref ala bla pref cla suff dla"; + MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref ala bla pref cla suff")); + } + + Y_UNIT_TEST(SlowCaptureGreedy) + { + const char* regexp = ".*(pref.*suff)"; + const char* text = "pref ala bla pref cla suff dla"; + MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref cla suff")); + } + + Y_UNIT_TEST(SlowCaptureInOr) + { + const char* regexp = "(A)|A"; + const char* text = "A"; + MakeSlowCapturingTest(regexp, text, 1, true, ystring("A")); + const char* regexp2 = "A|(A)"; + MakeSlowCapturingTest(regexp2, text, 1, false); + } + + Y_UNIT_TEST(SlowCapturing) + { + const char* regexp = "^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)"; + const char* text = "http://vkontakte.ru/id100500"; + MakeSlowCapturingTest(regexp, text, 2, true, ystring("100500")); + } + + Y_UNIT_TEST(Utf_8) + { + const char* regexp = "\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, ((\\s|\\w|[()]|-)+)!"; + const char* text =" \xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, \xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)! "; + const char* ans = "\xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)"; + MakeSlowCapturingTest(regexp, text, 1, true, ystring(ans)); + } +} diff --git a/contrib/libs/pire/ut/common.h b/contrib/libs/pire/ut/common.h new file mode 100644 index 0000000000..d79eedafb7 --- /dev/null +++ b/contrib/libs/pire/ut/common.h @@ -0,0 +1,224 @@ +/* + * common.h -- + * + * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>, + * Alexander Gololobov <agololobov@gmail.com> + * + * This file is part of Pire, the Perl Incompatible + * Regular Expressions library. + * + * Pire is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pire is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * You should have received a copy of the GNU Lesser Public License + * along with Pire. If not, see <http://www.gnu.org/licenses>. + */ + + +#ifndef PIRE_TEST_COMMON_H_INCLUDED +#define PIRE_TEST_COMMON_H_INCLUDED + +#include <stdio.h> +#include <pire.h> +#include <stub/stl.h> +#include <stub/defaults.h> +#include <stub/lexical_cast.h> +#include "stub/cppunit.h" + +using namespace Pire; + +/***************************************************************************** +* Helpers +*****************************************************************************/ + +inline Pire::Fsm ParseRegexp(const char* str, const char* options = "", const Pire::Encoding** enc = 0) +{ + Pire::Lexer lexer; + TVector<wchar32> ucs4; + + bool surround = true; + for (; *options; ++options) { + if (*options == 'i') + lexer.AddFeature(Pire::Features::CaseInsensitive()); + else if (*options == 'u') + lexer.SetEncoding(Pire::Encodings::Utf8()); + else if (*options == 'n') + surround = false; + else if (*options == 'a') + lexer.AddFeature(Pire::Features::AndNotSupport()); + else + throw std::invalid_argument("Unknown option: " + ystring(1, *options)); + } + + if (enc) + *enc = &lexer.Encoding(); + + lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4)); + lexer.Assign(ucs4.begin(), ucs4.end()); + + Pire::Fsm fsm = lexer.Parse(); + if (surround) + fsm.Surround(); + return fsm; +} + +inline bool HasError(const char* regexp) { + try { + ParseRegexp(regexp); + return false; + } catch (Pire::Error& ex) { + return true; + } +} + +struct Scanners { + Pire::Scanner fast; + Pire::NonrelocScanner nonreloc; + Pire::SimpleScanner simple; + Pire::SlowScanner slow; + Pire::ScannerNoMask fastNoMask; + Pire::NonrelocScannerNoMask nonrelocNoMask; + Pire::HalfFinalScanner halfFinal; + Pire::HalfFinalScannerNoMask halfFinalNoMask; + Pire::NonrelocHalfFinalScanner nonrelocHalfFinal; + Pire::NonrelocHalfFinalScannerNoMask nonrelocHalfFinalNoMask; + + Scanners(const Pire::Fsm& fsm, size_t distance = 0) + : fast(Pire::Fsm(fsm).Compile<Pire::Scanner>(distance)) + , nonreloc(Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>(distance)) + , simple(Pire::Fsm(fsm).Compile<Pire::SimpleScanner>(distance)) + , slow(Pire::Fsm(fsm).Compile<Pire::SlowScanner>(distance)) + , fastNoMask(Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>(distance)) + , nonrelocNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>(distance)) + , halfFinal(Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>(distance)) + , halfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>(distance)) + , nonrelocHalfFinal(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>(distance)) + , nonrelocHalfFinalNoMask(Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>(distance)) + {} + + Scanners(const char* str, const char* options = "") + { + Pire::Fsm fsm = ParseRegexp(str, options); + fast = Pire::Fsm(fsm).Compile<Pire::Scanner>(); + nonreloc = Pire::Fsm(fsm).Compile<Pire::NonrelocScanner>(); + simple = Pire::Fsm(fsm).Compile<Pire::SimpleScanner>(); + slow = Pire::Fsm(fsm).Compile<Pire::SlowScanner>(); + fastNoMask = Pire::Fsm(fsm).Compile<Pire::ScannerNoMask>(); + nonrelocNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocScannerNoMask>(); + halfFinal = Pire::Fsm(fsm).Compile<Pire::HalfFinalScanner>(); + halfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::HalfFinalScannerNoMask>(); + nonrelocHalfFinal = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScanner>(); + nonrelocHalfFinalNoMask = Pire::Fsm(fsm).Compile<Pire::NonrelocHalfFinalScannerNoMask>(); + } +}; + +#ifdef PIRE_DEBUG + +template <class Scanner> +inline ystring DbgState(const Scanner& scanner, typename Scanner::State state) +{ + return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring()); +} +/* +inline ystring DbgState(const Pire::SimpleScanner& scanner, Pire::SimpleScanner::State state) +{ + return ToString(scanner.StateIndex(state)) + (scanner.Final(state) ? ystring(" [final]") : ystring()); +} +*/ +inline ystring DbgState(const Pire::SlowScanner& scanner, const Pire::SlowScanner::State& state) +{ + return ystring("(") + Join(state.states.begin(), state.states.end(), ", ") + ystring(")") + (scanner.Final(state) ? ystring(" [final]") : ystring()); +} + +template<class Scanner> +void DbgRun(const Scanner& scanner, typename Scanner::State& state, const char* begin, const char* end) +{ + for (; begin != end; ++begin) { + char tmp[8]; + if (*begin >= 32) { + tmp[0] = *begin; + tmp[1] = 0; + } else + snprintf(tmp, sizeof(tmp)-1, "\\%03o", (unsigned char) *begin); + std::clog << DbgState(scanner, state) << " --[" << tmp << "]--> "; + scanner.Next(state, (unsigned char) *begin); + std::clog << DbgState(scanner, state) << "\n"; + } +} + +#define Run DbgRun +#endif + +template<class Scanner> +typename Scanner::State RunRegexp(const Scanner& scanner, const ystring& str) +{ + PIRE_IFDEBUG(std::clog << "--- checking against " << str << "\n"); + + typename Scanner::State state; + scanner.Initialize(state); + Step(scanner, state, BeginMark); + Run(scanner, state, str.c_str(), str.c_str() + str.length()); + Step(scanner, state, EndMark); + return state; +} + +template<class Scanner> +typename Scanner::State RunRegexp(const Scanner& scanner, const char* str) +{ + return RunRegexp(scanner, ystring(str)); +} + +template<class Scanner> +bool Matches(const Scanner& scanner, const ystring& str) +{ + auto state = RunRegexp(scanner, str); + auto result = scanner.AcceptedRegexps(state); + return result.first != result.second; +} + +template<class Scanner> +bool Matches(const Scanner& scanner, const char* str) +{ + return Matches(scanner, ystring(str)); +} + +#define SCANNER(fsm) for (Scanners m_scanners(fsm), *m_flag = &m_scanners; m_flag; m_flag = 0) +#define APPROXIMATE_SCANNER(fsm, distance) for (Scanners m_scanners(fsm, distance), *m_flag = &m_scanners; m_flag; m_flag = 0) +#define REGEXP(pattern) for (Scanners m_scanners(pattern), *m_flag = &m_scanners; m_flag; m_flag = 0) +#define REGEXP2(pattern,flags) for (Scanners m_scanners(pattern, flags), *m_flag = &m_scanners; m_flag; m_flag = 0) +#define ACCEPTS(str) \ + do {\ + UNIT_ASSERT(Matches(m_scanners.fast, str));\ + UNIT_ASSERT(Matches(m_scanners.nonreloc, str));\ + UNIT_ASSERT(Matches(m_scanners.simple, str));\ + UNIT_ASSERT(Matches(m_scanners.slow, str));\ + UNIT_ASSERT(Matches(m_scanners.fastNoMask, str));\ + UNIT_ASSERT(Matches(m_scanners.nonrelocNoMask, str));\ + UNIT_ASSERT(Matches(m_scanners.halfFinal, str));\ + UNIT_ASSERT(Matches(m_scanners.halfFinalNoMask, str));\ + UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinal, str));\ + UNIT_ASSERT(Matches(m_scanners.nonrelocHalfFinalNoMask, str));\ + } while (false) + +#define DENIES(str) \ + do {\ + UNIT_ASSERT(!Matches(m_scanners.fast, str));\ + UNIT_ASSERT(!Matches(m_scanners.nonreloc, str));\ + UNIT_ASSERT(!Matches(m_scanners.simple, str));\ + UNIT_ASSERT(!Matches(m_scanners.slow, str));\ + UNIT_ASSERT(!Matches(m_scanners.fastNoMask, str));\ + UNIT_ASSERT(!Matches(m_scanners.nonrelocNoMask, str));\ + UNIT_ASSERT(!Matches(m_scanners.halfFinal, str));\ + UNIT_ASSERT(!Matches(m_scanners.halfFinalNoMask, str));\ + UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinal, str));\ + UNIT_ASSERT(!Matches(m_scanners.nonrelocHalfFinalNoMask, str));\ + } while (false) + + +#endif diff --git a/contrib/libs/pire/ut/count_ut.cpp b/contrib/libs/pire/ut/count_ut.cpp new file mode 100644 index 0000000000..ffe7943fcc --- /dev/null +++ b/contrib/libs/pire/ut/count_ut.cpp @@ -0,0 +1,583 @@ +/* + * count_ut.cpp -- + * + * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>, + * Alexander Gololobov <agololobov@gmail.com> + * + * This file is part of Pire, the Perl Incompatible + * Regular Expressions library. + * + * Pire is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pire is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * You should have received a copy of the GNU Lesser Public License + * along with Pire. If not, see <http://www.gnu.org/licenses>. + */ + + +#include <stub/hacks.h> +#include <stub/saveload.h> +#include <stub/utf8.h> +#include <stub/memstreams.h> +#include "stub/cppunit.h" +#include <pire.h> +#include <extra.h> +#include <string.h> + + +Y_UNIT_TEST_SUITE(TestCount) { + + Pire::Fsm MkFsm(const char* regexp, const Pire::Encoding& encoding) + { + Pire::Lexer lex; + lex.SetEncoding(encoding); + TVector<wchar32> ucs4; + encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4)); + lex.Assign(ucs4.begin(), ucs4.end()); + return lex.Parse(); + } + + template<class Scanner> + typename Scanner::State InitializedState(const Scanner& scanner) + { + typename Scanner::State state; + scanner.Initialize(state); + return state; + } + + template<class Scanner> + typename Scanner::State Run(const Scanner& scanner, const char* text, size_t len =-1) + { + if (len == (size_t)-1) len = strlen(text); + auto state = InitializedState(scanner); + Pire::Step(scanner, state, Pire::BeginMark); + Pire::Run(scanner, state, text, text + len); + Pire::Step(scanner, state, Pire::EndMark); + return state; + } + + template<class Scanner> + size_t CountOne(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) + { + const auto regexpFsm = MkFsm(regexp, encoding); + const auto separatorFsm = MkFsm(separator, encoding); + return Run(Scanner{regexpFsm, separatorFsm}, text, len).Result(0); + } + + size_t Count(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) + { + const auto regexpFsm = MkFsm(regexp, encoding); + const auto separatorFsm = MkFsm(separator, encoding); + auto countingResult = Run(Pire::CountingScanner{regexpFsm, separatorFsm}, text, len).Result(0); + auto newResult = Run(Pire::AdvancedCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0); + if (strcmp(separator, ".*") == 0) { + HalfFinalFsm fsm(regexpFsm); + fsm.MakeGreedyCounter(true); + auto halfFinalSimpleResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0); + fsm = HalfFinalFsm(regexpFsm); + fsm.MakeGreedyCounter(false); + auto halfFinalCorrectResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0); + UNIT_ASSERT_EQUAL(halfFinalSimpleResult, halfFinalCorrectResult); + UNIT_ASSERT_EQUAL(halfFinalSimpleResult, countingResult); + } + UNIT_ASSERT_EQUAL(countingResult, newResult); + auto noGlueLimitResult = Run(Pire::NoGlueLimitCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0); + UNIT_ASSERT_EQUAL(countingResult, noGlueLimitResult); + return newResult; + } + + Y_UNIT_TEST(Count) + { + UNIT_ASSERT_EQUAL(Count("[a-z]+", "\\s", "abc def, abc def ghi, abc"), size_t(3)); + char aaa[] = "abc def\0 abc\0 def ghi, abc"; + UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa), Pire::Encodings::Latin1()), size_t(6)); + UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa)), size_t(6)); + UNIT_ASSERT_EQUAL(Count("\\w", "", "abc abcdef abcd abcdefgh ac"), size_t(8)); + UNIT_ASSERT_EQUAL(Count("http", ".*", "http://aaa, http://bbb, something in the middle, http://ccc, end"), size_t(3)); + UNIT_ASSERT_EQUAL(Count("abc", ".*", "abcabcabcabc"), size_t(4)); + UNIT_ASSERT_EQUAL(Count("[\320\220-\320\257\320\260-\321\217]+", "\\s+", " \320\257\320\275\320\264\320\265\320\272\321\201 " + "\320\237\320\276\320\262\320\265\321\200\320\275\321\203\321\202\321\214 \320\222\320\276\320\271\321\202\320\270\302\240" + "\320\262\302\240\320\277\320\276\321\207\321\202\321\203 \302\251\302\240" "1997\342\200\224" "2008 " + "\302\253\320\257\320\275\320\264\320\265\320\272\321\201\302\273 \320\224\320\270\320\267\320\260\320\271\320\275\302" + "\240\342\200\224\302\240\320\241\321\202\321\203\320\264\320\270\321\217 \320\220\321\200\321\202\320\265\320\274\320\270" + "\321\217\302\240\320\233\320\265\320\261\320\265\320\264\320\265\320\262\320\260\012\012"), size_t(5)); + UNIT_ASSERT_EQUAL(Count("\321\201\320\265\320\272\321\201", ".*", + "\320\277\320\276\321\200\320\275\320\276, \320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 " + "\320\264\320\265\321\202\320\270, \320\264\320\265\321\202\320\270 \320\277\320\276\321\200\320\275\320\276 " + "\320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265. " + "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265 \320\262\320\270\320\264\320\265\320\276 " + "\320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270. \320\264\320\265\321\202\320\270 " + "\320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265\320\276 " + "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265!<br> " + "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\264\320\273\321\217 \320\277\320\276\320\264 " + "\321\201\320\265\320\272\321\201\320\260 \320\277\320\260\321\200\320\276\320\271 " + "\321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201 \320\270\321\211\320\265\320\274 " + "\320\272\320\260\320\271\321\204\320\276\320\274. \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 " + "\320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 " + "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277\320\260\321\200\320\276\320\271 " + "\320\270\321\211\320\265\320\274 \321\201 \320\264\320\273\321\217 \321\201\320\265\320\272\321\201\320\260!<br> " + "\321\202\320\270\321\202\321\214\320\272\320\270 \320\261\320\276\320\273\321\214\321\210\320\270\320\265. " + "\320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 \320\264\320\265\321\202\320\270!<br> " + "\320\270\321\211\320\265\320\274 \321\201 \320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 " + "\321\201\320\265\320\272\321\201\320\260\320\277\320\260\321\200\320\276\320\271 \320\264\320\273\321\217 " + "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271! " + "\320\261\320\276\320\273\321\214\321\210\320\270\320\265 \321\202\320\270\321\202\321\214\320\272\320\270, " + "\320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 \321\201\320\270\321\201\321\202\320\265\320\274\320\260 " + "\320\264\320\273\321\217 \320\276\320\277\320\276\321\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202" + "\320\265\320\273\321\214\320\275\320\260\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\264" + "\320\273\321\217 \320\270\321\211\320\265\320\274 \321\201\320\265\320\272\321\201\320\260 \320\272\320\260\320\271\321\204" + "\320\276\320\274 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275" + "\320\276\320\271 \320\277\320\276\320\264 \320\277\320\260\321\200\320\276\320\271 \321\201. \320\276\320\277\320\276\321" + "\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202\320\265\320\273\321\214\320\275\320\260\321\217 \321" + "\201\320\270\321\201\321\202\320\265\320\274\320\260 \320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 " + "\320\264\320\273\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\261\320\265\321\201\320\277" + "\320\273\320\260\321\202\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265" + "\320\276 \320\264\320\265\321\202\320\270. \320\276\321\204\320\270\321\206\320\265\321\200\321\213 \320\277\320\276\321" + "\200\320\275\320\276 \321\204\320\276\321\202\320\276 \320\263\320\265\320\270, \320\270\321\211\320\265\320\274 \321\201" + "\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277" + "\320\276 \320\277\320\260\321\200\320\276\320\271 \321\201\320\265\320\272\321\201\320\260 \320\264\320\273\321\217 \321\201 " + "\320\272\320\260\320\271\321\204\320\276\320\274. \320\277\320\276\320\264 \320\264\320\273\321\217 \320\272\320\260\320\271" + "\321\204\320\276\320\274 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201\320\265\320\272\321\201" + "\320\260 \320\277\320\260\321\200\320\276\320\271 \321\201 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\270" + "\321\211\320\265\320\274? \320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202" + "\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270, \320\264\320\265\321\202" + "\320\270 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265"), + size_t(6)); + UNIT_ASSERT_EQUAL(Count("<a[^>]*>[^<]*</a>", "([^<]|<br\\s?/?>)*", "\321\200\320\275\320\276</a><br />" + "<a href=\"http://wapspzk.1sweethost.com//22.html\">\320\264\320\265\321\210\320\265\320\262\321\213\320\265 \320\277\320\276" + "\321\200\320\275\320\276 \321\204\320\270\320\273\321\214\320\274\321\213</a><br /><a href=\"http://wapspzk.1sweethost.com//23.html\">" + "\321\201\320\265\320\272\321\201 \321\210\320\276\320\277 \321\200\320\276\321\201\320\270\321\202\320\260</a><br />" + "<a href=\"http://wapspzk.1sweethost.com//24.html\">\320\263\320\276\320\273\321\213\320\265 \320\264\320\265\320\262\321\203" + "\321\210\320\272\320\270 \321\203\320\273\320\270\321\206\320\260</a><br /><a href=\"http://wapspzk.1sweethost.com//25.html\">" + "\321\202\321\200\320\260\321\205\320\275\321\203\321\202\321\214 \320\274\320\260\320\274\320\260\321\210\320\270</a><br />" + "<a href=\"http://wapspzk.1sweethost.com//26.html\">\320\277\320\270\320\267\320\264\320\260 \321\204\321\200\320\270\321\201" + "\320\272\320\265</a><br /><a href=\"http://wapspzk.1sweethost.com//27.html\">\320\261\320\265\321\201\320\277\320\273\320\260" + "\321\202\320\275\320\276</a><br /><a href=\"http://wapspzk.1sweethost.com//33.html\">\321\201\320\276\321\206\320\270\320\276" + "\320\273\320\276\320\263\320\270\321\207\320\265\321\201\320\272\320\270\320\271 \320\260\320\275\320\260\320\273\320\270\320" + "\267 \320\274\320\276\320\264\320\265\320\273\320\265\320\271 \321\201\320\265\320\272\321\201\321\203\320\260\320\273\321\214" + "\320\275\320\276\320\263\320\276 \320\277\320\276\320\262\320\265\320\264\320\265\320\275\320\270\321\217</a>\321\217"), size_t(7)); + UNIT_ASSERT(CountOne<Pire::CountingScanner>("a", "b", "aaa") != size_t(3)); + UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("a", "b", "aaa"), size_t(1)); + UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("[a-z\320\260-\321\217]+", " +", + " \320\260\320\260\320\220 abc def \320\260 cd"), + size_t(4)); // Pire::CountingScanner returns 1 here, since it enters a dead state + } + + Y_UNIT_TEST(CountWithoutSeparator) + { + UNIT_ASSERT_EQUAL(Count("a", "", "aa aaa"), size_t(3)); + } + + Y_UNIT_TEST(CountGreedy) + { + const auto& enc = Pire::Encodings::Latin1(); + char text[] = "wwwsswwwsssswwws"; + UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3)); + UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3)); + UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3)); + UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3)); + } + + Y_UNIT_TEST(CountRepeating) + { + char text[] = "abbabbabbabbat"; + UNIT_ASSERT_EQUAL(Count("abba", ".*", text, sizeof(text), Pire::Encodings::Latin1()), size_t(2)); + } + + template<class Scanner> + void CountGlueOne() + { + const auto& enc = Pire::Encodings::Utf8(); + auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); + auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); + auto sc = Scanner::Glue(sc1, sc2); + auto st = Run(sc, "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st.Result(1), size_t(2)); + } + + Y_UNIT_TEST(CountGlue) + { + CountGlueOne<Pire::CountingScanner>(); + CountGlueOne<Pire::AdvancedCountingScanner>(); + CountGlueOne<Pire::NoGlueLimitCountingScanner>(); + } + + template <class Scanner> + void CountManyGluesOne(size_t maxRegexps) { + const auto& encoding = Pire::Encodings::Utf8(); + auto text = "abcdbaa aa"; + TVector<ypair<std::string, std::string>> tasks = { + {"a", ".*"}, + {"b", ".*"}, + {"c", ".*"}, + {"ba", ".*"}, + {"ab",".*"}, + }; + TVector<size_t> answers = {5, 2, 1, 1, 1}; + Scanner scanner; + size_t regexpsCount = 0; + for (; regexpsCount < maxRegexps; ++regexpsCount) { + const auto& task = tasks[regexpsCount % tasks.size()]; + const auto regexpFsm = MkFsm(task.first.c_str(), encoding); + const auto separatorFsm = MkFsm(task.second.c_str(), encoding); + Scanner nextScanner(regexpFsm, separatorFsm); + auto glue = Scanner::Glue(scanner, nextScanner); + if (glue.Empty()) { + break; + } + scanner = std::move(glue); + } + auto state = Run(scanner, text); + for (size_t i = 0; i < regexpsCount; ++i) { + UNIT_ASSERT_EQUAL(state.Result(i), answers[i % answers.size()]); + } + } + + Y_UNIT_TEST(CountManyGlues) + { + CountManyGluesOne<Pire::CountingScanner>(20); + CountManyGluesOne<Pire::AdvancedCountingScanner>(20); + CountManyGluesOne<Pire::NoGlueLimitCountingScanner>(50); + } + + template<class Scanner> + void CountBoundariesOne() + { + const char* strings[] = { "abcdef", "abc def", "defcba", "wxyz abc", "a", "123" }; + + const auto& enc = Pire::Encodings::Utf8(); + Scanner sc(MkFsm("^[a-z]+$", enc), MkFsm("(.|^|$)*", enc)); + auto st = InitializedState(sc); + for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) { + Pire::Step(sc, st, Pire::BeginMark); + Pire::Run(sc, st, strings[i], strings[i] + strlen(strings[i])); + Pire::Step(sc, st, Pire::EndMark); + } + UNIT_ASSERT_EQUAL(st.Result(0), size_t(3)); + + const auto& enc2 = Pire::Encodings::Latin1(); + Scanner sc2(MkFsm("[a-z]", enc2), MkFsm(".*", enc2)); + auto st2 = InitializedState(sc2); + for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) { + Pire::Step(sc2, st2, Pire::BeginMark); + Pire::Run(sc2, st2, strings[i], strings[i] + strlen(strings[i])); + Pire::Step(sc2, st2, Pire::EndMark); + } + UNIT_ASSERT_EQUAL(st2.Result(0), size_t(7)); + } + + Y_UNIT_TEST(CountBoundaries) + { + CountBoundariesOne<Pire::CountingScanner>(); + CountBoundariesOne<Pire::AdvancedCountingScanner>(); + CountBoundariesOne<Pire::NoGlueLimitCountingScanner>(); + } + + template<class Scanner> + void SerializationOne() + { + const auto& enc = Pire::Encodings::Latin1(); + auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); + auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); + auto sc = Scanner::Glue(sc1, sc2); + + BufferOutput wbuf; + ::Save(&wbuf, sc); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + Scanner sc3; + ::Load(&rbuf, sc3); + + auto st = Run(sc3, "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st.Result(1), size_t(2)); + + const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); + TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); + + // Test mmap-ing at various alignments + for (size_t offset = 0; offset < MaxTestOffset; ++offset) { + const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset; + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + try { + Scanner sc4; + const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size()); + + if (offset % sizeof(size_t) != 0) { + UNIT_ASSERT(!"CountingScanner failed to check for misaligned mmaping"); + } else { + UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); + + st = Run(sc4, "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st.Result(1), size_t(2)); + } + } + catch (Pire::Error&) {} + } + } + + Y_UNIT_TEST(Serialization) + { + SerializationOne<Pire::CountingScanner>(); + SerializationOne<Pire::AdvancedCountingScanner>(); + SerializationOne<Pire::NoGlueLimitCountingScanner>(); + } + + template<class Scanner> + void Serialization_v6_compatibilityOne() + { + const auto& enc = Pire::Encodings::Latin1(); + auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); + auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); + auto sc = Scanner::Glue(sc1, sc2); + + BufferOutput wbuf; + ::Save(&wbuf, sc); + + // Patched scanner is a scanner of RE_VERSION 6. + // The patched scanner is concatenated with original scanner to + // make sure all content of patched scanner is consumed. + + const size_t ALIGNMENT = sizeof(size_t); + size_t actions_size = + sc.Size() * + sc.LettersCount() * + sizeof(typename Scanner::Action); + UNIT_ASSERT_EQUAL(actions_size % ALIGNMENT, 0); + size_t tags_size = sc.Size() * sizeof(typename Scanner::Tag); + const char* src = wbuf.Buffer().Data(); + size_t src_size = wbuf.Buffer().Size(); + size_t patched_size = src_size + actions_size; + size_t bytes_before_actions = src_size - tags_size; + const int fill_char = 0x42; + + TVector<char> buf2(patched_size + src_size + 2 * ALIGNMENT); + char* dst = reinterpret_cast<char*>(Pire::Impl::AlignUp(&buf2[0], ALIGNMENT)); + char* patched = dst; + + // Insert dummy m_actions between m_jumps and m_tags. + memcpy(patched, src, bytes_before_actions); // copy members before m_actions + memset(patched + bytes_before_actions, fill_char, actions_size); // m_actions + memcpy(patched + bytes_before_actions + actions_size, + src + bytes_before_actions, + tags_size); // m_tags + // Set version to 6 + // order of fields in header: magic, version, ... + ui32* version_in_patched = reinterpret_cast<ui32*>(patched) + 1; + UNIT_ASSERT_EQUAL(*version_in_patched, Pire::Header::RE_VERSION); + *version_in_patched = Pire::Header::RE_VERSION_WITH_MACTIONS; + + // write normal scanner after patched one + char* normal = Pire::Impl::AlignUp(patched + patched_size, ALIGNMENT); + memcpy(normal, src, src_size); + char* dst_end = Pire::Impl::AlignUp(normal + src_size, ALIGNMENT); + size_t dst_size = dst_end - dst; + + // test loading from stream + { + MemoryInput rbuf(dst, dst_size); + Scanner sc_patched, sc_normal; + ::Load(&rbuf, sc_patched); + ::Load(&rbuf, sc_normal); + auto st_patched = Run(sc_patched, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2)); + auto st_normal = Run(sc_normal, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2)); + } + + // test loading using Mmap + { + Scanner sc_patched, sc_normal; + const void* tail = sc_patched.Mmap(patched, patched_size); + UNIT_ASSERT_EQUAL(tail, normal); + const void* tail2 = sc_normal.Mmap(tail, src_size); + UNIT_ASSERT_EQUAL(tail2, dst_end); + auto st_patched = Run(sc_patched, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2)); + auto st_normal = Run(sc_normal, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2)); + } + } + + Y_UNIT_TEST(Serialization_v6_compatibility) + { + Serialization_v6_compatibilityOne<Pire::CountingScanner>(); + Serialization_v6_compatibilityOne<Pire::AdvancedCountingScanner>(); + // NoGlueLimitCountingScanner is not v6_compatible + } + + Y_UNIT_TEST(NoGlueLimitScannerCompatibilityWithAdvancedScanner) { + const auto& enc = Pire::Encodings::Latin1(); + auto sc1 = AdvancedCountingScanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); + auto sc2 = AdvancedCountingScanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc)); + auto sc = AdvancedCountingScanner::Glue(sc1, sc2); + + BufferOutput wbuf; + ::Save(&wbuf, sc); + + TVector<char> buf2(wbuf.Buffer().Size()); + memcpy(buf2.data(), wbuf.Buffer().Data(), wbuf.Buffer().Size()); + + // test loading from stream + { + MemoryInput rbuf(buf2.data(), buf2.size()); + NoGlueLimitCountingScanner scanner; + ::Load(&rbuf, scanner); + auto state = Run(scanner, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(state.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(state.Result(1), size_t(2)); + } + + // test loading using Mmap + { + NoGlueLimitCountingScanner scanner; + const void* tail = scanner.Mmap(buf2.data(), buf2.size()); + UNIT_ASSERT_EQUAL(tail, buf2.data() + buf2.size()); + auto state = Run(scanner, + "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(state.Result(0), size_t(4)); + UNIT_ASSERT_EQUAL(state.Result(1), size_t(2)); + } + } + + template<class Scanner> + void EmptyOne() + { + Scanner sc; + UNIT_ASSERT(sc.Empty()); + + UNIT_CHECKPOINT(); Run(sc, "a string"); // Just should not crash + + // Test glueing empty + const auto& enc = Pire::Encodings::Latin1(); + auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc)); + auto sc2 = Scanner::Glue(sc, Scanner::Glue(sc, sc1)); + auto st = Run(sc2, "abc defg 123 jklmn 4567 opqrst"); + UNIT_ASSERT_EQUAL(st.Result(0), size_t(4)); + + // Test Save/Load/Mmap + BufferOutput wbuf; + ::Save(&wbuf, sc); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + Pire::CountingScanner sc3; + ::Load(&rbuf, sc3); + UNIT_CHECKPOINT(); Run(sc3, "a string"); + + const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); + TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); + const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + + Scanner sc4; + const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size()); + UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size())); + UNIT_CHECKPOINT(); Run(sc4, "a string"); + } + + Y_UNIT_TEST(Empty) + { + EmptyOne<Pire::CountingScanner>(); + EmptyOne<Pire::AdvancedCountingScanner>(); + EmptyOne<Pire::NoGlueLimitCountingScanner>(); + } + + template<typename Scanner> + TVector<Scanner> MakeHalfFinalCount(const char* regexp, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) { + TVector<Scanner> scanners(6); + const auto regexpFsm = MkFsm(regexp, encoding); + HalfFinalFsm fsm(regexpFsm); + fsm.MakeGreedyCounter(true); + scanners[0] = Scanner(fsm); + fsm = HalfFinalFsm(regexpFsm); + fsm.MakeGreedyCounter(false); + scanners[1] = Scanner(fsm); + fsm = HalfFinalFsm(regexpFsm); + fsm.MakeNonGreedyCounter(true, true); + scanners[2] = Scanner(fsm); + fsm = HalfFinalFsm(regexpFsm); + fsm.MakeNonGreedyCounter(true, false); + scanners[3] = Scanner(fsm); + fsm = HalfFinalFsm(regexpFsm); + fsm.MakeNonGreedyCounter(false); + scanners[4] = Scanner(fsm); + scanners[5] = scanners[0]; + for (size_t i = 1; i < 5; i++) { + scanners[5] = Scanner::Glue(scanners[5], scanners[i]); + } + return scanners; + } + + template<typename Scanner> + void HalfFinalCount(TVector<Scanner> scanners, const char* text, TVector<size_t> result) { + for (size_t i = 0; i < 5; i++) { + UNIT_ASSERT_EQUAL(Run(scanners[i], text, -1).Result(0), result[i]); + } + auto state = Run(scanners[5], text, -1); + for (size_t i = 0; i < 5; i++) { + UNIT_ASSERT_EQUAL(state.Result(i), result[i]); + } + } + + template<typename Scanner> + void TestHalfFinalCount() { + HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+"), "abbabbbabbbbbb", {3, 3, 3, 11, 3}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("(ab)+"), "ababbababbab", {3, 3, 5, 5, 5}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("(abab)+"), "ababababab", {1, 1, 4, 4, 2}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbb", {1, 10, 10, 10, 10}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbb", {1, 10, 11, 11, 11}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbc", {1, 1, 10, 11, 10}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbbc", {1, 1, 11, 12, 11}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbbc", {1, 1, 8, 9, 8}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbb", {1, 8, 8, 8, 8}); + HalfFinalCount(MakeHalfFinalCount<Scanner>("a[a-z]+c|b"), "abeeeebeeeeeeeeeceeaeebeeeaeecceebeeaeebeeb", {2, 4, 7, 9, 7}); + } + + Y_UNIT_TEST(HalfFinal) + { + TestHalfFinalCount<Pire::HalfFinalScanner>(); + TestHalfFinalCount<Pire::NonrelocHalfFinalScanner>(); + TestHalfFinalCount<Pire::HalfFinalScannerNoMask>(); + TestHalfFinalCount<Pire::NonrelocHalfFinalScannerNoMask>(); + } + + template<typename Scanner> + void TestHalfFinalSerialization() { + auto oldScanners = MakeHalfFinalCount<Scanner>("(\\w\\w)+"); + BufferOutput wbuf; + for (size_t i = 0; i < 6; i++) { + ::Save(&wbuf, oldScanners[i]); + } + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + TVector<Scanner> scanners(6); + for (size_t i = 0; i < 6; i++) { + ::Load(&rbuf, scanners[i]); + } + + HalfFinalCount(scanners, "ab abbb ababa a", {3, 3, 8, 8, 5}); + } + + Y_UNIT_TEST(HalfFinalSerialization) + { + TestHalfFinalSerialization<Pire::HalfFinalScanner>(); + TestHalfFinalSerialization<Pire::HalfFinalScannerNoMask>(); + } +} diff --git a/contrib/libs/pire/pire/fwd.h b/contrib/libs/pire/ut/easy_ut.cpp index c2b5870b05..5f0f8303fc 100644 --- a/contrib/libs/pire/pire/fwd.h +++ b/contrib/libs/pire/ut/easy_ut.cpp @@ -1,5 +1,5 @@ /* - * fwd.h -- forward declarations of Pire classes + * easy_ut.cpp -- Unit tests for PireEasy * * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>, * Alexander Gololobov <agololobov@gmail.com> @@ -11,7 +11,7 @@ * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -20,23 +20,38 @@ * along with Pire. If not, see <http://www.gnu.org/licenses>. */ +#include <stub/hacks.h> +#include <stub/defaults.h> +#include "stub/cppunit.h" +#include <stdexcept> +#include "common.h" -#ifndef PIRE_FWD_H -#define PIRE_FWD_H - - -namespace Pire { +#undef Run - class Scanner; - class MultiScanner; - class SlowScanner; - class CapturingScanner; - class CountingScanner; +#include <easy.h> - class Fsm; +Y_UNIT_TEST_SUITE(TestPireEasy) { + +Y_UNIT_TEST(Match) +{ + Pire::Regexp re("(foo|bar)+", Pire::I); + UNIT_ASSERT("prefix fOoBaR suffix" ==~ re); + UNIT_ASSERT(!("bla bla bla" ==~ re)); +} - class Lexer; - class Encoding; +Y_UNIT_TEST(Utf8) +{ + Pire::Regexp re("^.$", Pire::I | Pire::UTF8); + UNIT_ASSERT("\x41" ==~ re); + UNIT_ASSERT(!("\x81" ==~ re)); } -#endif +Y_UNIT_TEST(TwoFeatures) +{ + Pire::Regexp re("^(a.c&.b.)$", Pire::I | Pire::ANDNOT); + UNIT_ASSERT("abc" ==~ re); + UNIT_ASSERT("ABC" ==~ re); + UNIT_ASSERT(!("adc" ==~ re)); +} + +} diff --git a/contrib/libs/pire/ut/glyph_ut.cpp b/contrib/libs/pire/ut/glyph_ut.cpp new file mode 100644 index 0000000000..05ef56b01b --- /dev/null +++ b/contrib/libs/pire/ut/glyph_ut.cpp @@ -0,0 +1,63 @@ +/* + * glyph_ut.cpp -- + * + * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>, + * Alexander Gololobov <agololobov@gmail.com> + * + * This file is part of Pire, the Perl Incompatible + * Regular Expressions library. + * + * Pire is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pire is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * You should have received a copy of the GNU Lesser Public License + * along with Pire. If not, see <http://www.gnu.org/licenses>. + */ + + +#include <pire.h> +#include <extra/glyphs.h> +#include "stub/cppunit.h" +#include "common.h" + +Y_UNIT_TEST_SUITE(Glyphs) { + + Pire::Fsm ParseFsm(const char* regexp) + { + TVector<wchar32> ucs4; + Pire::Encodings::Utf8().FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4)); + return Pire::Lexer(ucs4).SetEncoding(Pire::Encodings::Utf8()).AddFeature(Pire::Features::GlueSimilarGlyphs()).Parse().Surround(); + } + +#define NOGL_REGEXP(str) REGEXP2(str, "u") +#define GL_REGEXP(str) SCANNER(ParseFsm(str)) + + Y_UNIT_TEST(Glyphs) + { + NOGL_REGEXP("regexp") { + ACCEPTS("regexp"); + DENIES("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); + } + + GL_REGEXP("regexp") { + ACCEPTS("regexp"); + ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); + } + + NOGL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") { + DENIES("regexp"); + ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); + } + + GL_REGEXP("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80") { + ACCEPTS("regexp"); + ACCEPTS("r\xD0\xB5g\xD0\xB5\xD1\x85\xD1\x80"); + } + } +} diff --git a/contrib/libs/pire/ut/inline_ut.cpp b/contrib/libs/pire/ut/inline_ut.cpp new file mode 100644 index 0000000000..3ba31dfaa8 --- /dev/null +++ b/contrib/libs/pire/ut/inline_ut.cpp @@ -0,0 +1,91 @@ +/* + * inline_ut.cpp -- + * + * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>, + * Alexander Gololobov <agololobov@gmail.com> + * + * This file is part of Pire, the Perl Incompatible + * Regular Expressions library. + * + * Pire is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pire is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * You should have received a copy of the GNU Lesser Public License + * along with Pire. If not, see <http://www.gnu.org/licenses>. + */ + + +#include <stub/hacks.h> +#include "stub/cppunit.h" +#include <pire.h> +#include <iostream> +#include <string.h> + +Y_UNIT_TEST_SUITE(TestPireInline) { + +template<class Scanner> +typename Scanner::State RunRegexp(const Scanner& scanner, const char* str) +{ + typename Scanner::State state; + scanner.Initialize(state); + Step(scanner, state, Pire::BeginMark); + Run(scanner, state, str, str + strlen(str)); + Step(scanner, state, Pire::EndMark); + return state; +} + +template<class Scanner> +bool Matches(const Scanner& scanner, const char* str) +{ + return scanner.Final(RunRegexp(scanner, str)); +} + +template<class Scanner> +bool Matches2(const Scanner& scanner, const char* str) +{ + return Pire::Matches(scanner, str); +} + +bool ParticularMatch(Pire::Scanner& sc, Pire::Scanner::State st, size_t idx) +{ + std::pair<const size_t*, const size_t*> p = sc.AcceptedRegexps(st); + return std::distance(p.first, p.second) == 1 && *p.first == idx; +} + +Y_UNIT_TEST(Inline) +{ + Pire::Scanner scanner = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "is"); + UNIT_ASSERT(Matches(scanner, "http://domain.vasya.ru/")); + UNIT_ASSERT(Matches(scanner, "prefix http://domain.vasya.ru/")); + UNIT_ASSERT(!Matches(scanner, "http://127.0.0.1/")); + + Pire::Scanner scanner2 = PIRE_REGEXP("http://([a-z0-9]+\\.)+[a-z]{2,4}/?", "i"); + UNIT_ASSERT(Matches2(scanner2, "http://domain.vasya.ru/")); + UNIT_ASSERT(!Matches2(scanner2, "prefix http://domain.vasya.ru/")); + UNIT_ASSERT(!Matches2(scanner2, "http://127.0.0.1/")); +} + +Y_UNIT_TEST(InlineGlue) +{ + // Check whether pire_inline handles comments as well: + + /* - a C-style comment outside a regexp; */ + Pire::Scanner sc = PIRE_REGEXP( + "foo", "", /* - a C-style comment inside a regexp; */ + "bar", "", // - a C++-style comment inside a regexp; + "baz", "" + ); + // - a C++-style comment outside a regexp. + UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("foo").State(), 0)); + UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("bar").State(), 1)); + UNIT_ASSERT(ParticularMatch(sc, Pire::Runner(sc).Run("baz").State(), 2)); + UNIT_ASSERT(!Matches2(sc, "xxx")); +} + +} diff --git a/contrib/libs/pire/ut/pire_ut.cpp b/contrib/libs/pire/ut/pire_ut.cpp new file mode 100644 index 0000000000..13f3f2ec71 --- /dev/null +++ b/contrib/libs/pire/ut/pire_ut.cpp @@ -0,0 +1,888 @@ +/* + * pire_ut.cpp -- + * + * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>, + * Alexander Gololobov <agololobov@gmail.com> + * + * This file is part of Pire, the Perl Incompatible + * Regular Expressions library. + * + * Pire is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pire is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * You should have received a copy of the GNU Lesser Public License + * along with Pire. If not, see <http://www.gnu.org/licenses>. + */ + + +#include <stub/hacks.h> +#include <stub/defaults.h> +#include <stub/saveload.h> +#include <stub/memstreams.h> +#include "stub/cppunit.h" +#include <stdexcept> +#include "common.h" + +Y_UNIT_TEST_SUITE(TestPire) { + +/***************************************************************************** +* Tests themselves +*****************************************************************************/ + +Y_UNIT_TEST(String) +{ + REGEXP("abc") { + ACCEPTS("def abc ghi"); + ACCEPTS("abc"); + DENIES ("def abd ghi"); + } +} + +Y_UNIT_TEST(Boundaries) +{ + REGEXP("^abc") { + ACCEPTS("abc ghi"); + DENIES ("def abc"); + } + + REGEXP("abc$") { + DENIES ("abc ghi"); + ACCEPTS("def abc"); + } +} + +Y_UNIT_TEST(Primitives) +{ + REGEXP("abc|def") { + ACCEPTS("def"); + ACCEPTS("abc"); + DENIES ("deb"); + } + + REGEXP("ad*e") { + ACCEPTS("xaez"); + ACCEPTS("xadez"); + ACCEPTS("xaddez"); + ACCEPTS("xadddddddddddddddddddddddez"); + DENIES ("xafez"); + } + + REGEXP("ad+e") { + DENIES ("xaez"); + ACCEPTS("xadez"); + ACCEPTS("xaddez"); + ACCEPTS("xadddddddddddddddddddddddez"); + DENIES ("xafez"); + } + + REGEXP("ad?e") { + ACCEPTS("xaez"); + ACCEPTS("xadez"); + DENIES ("xaddez"); + DENIES ("xafez"); + } + + REGEXP("a.{1}e") { + ACCEPTS("axe"); + DENIES ("ae"); + DENIES ("axye"); + } +} + +void TestMassAlternatives(const char* pattern) { + REGEXP(pattern) { + ACCEPTS("abc"); + ACCEPTS("def"); + ACCEPTS("ghi"); + ACCEPTS("klm"); + DENIES ("aei"); + DENIES ("klc"); + } +} + +Y_UNIT_TEST(MassAlternatives) +{ + TestMassAlternatives("((abc|def)|ghi)|klm"); + + TestMassAlternatives("(abc|def)|(ghi|klm)"); + + TestMassAlternatives("abc|(def|(ghi|klm))"); + + TestMassAlternatives("abc|(def|ghi)|klm"); +} + +Y_UNIT_TEST(Composition) +{ + REGEXP("^/([^\\\\/]|\\\\.)*/[a-z]*$") { + ACCEPTS("/regexp/i"); + ACCEPTS("/regexp2/"); + DENIES ("regexp"); + + ACCEPTS("/dir\\/file/"); + DENIES ("/dir/file/"); + + ACCEPTS("/dir\\\\/"); + DENIES ("/dir\\\\/file/"); + } + + REGEXP("Head(Inner)*Tail") { + ACCEPTS("HeadInnerTail"); + ACCEPTS("HeadInnerInnerTail"); + DENIES ("HeadInneInnerTail"); + ACCEPTS("HeadTail"); + } +} + +Y_UNIT_TEST(Repetition) +{ + REGEXP("^x{3,6}$") { + DENIES ("xx"); + ACCEPTS("xxx"); + ACCEPTS("xxxx"); + ACCEPTS("xxxxx"); + ACCEPTS("xxxxxx"); + DENIES ("xxxxxxx"); + } + + REGEXP("^x{3,}$") { + DENIES ("xx"); + ACCEPTS("xxx"); + ACCEPTS("xxxx"); + ACCEPTS("xxxxxxxxxxx"); + ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + } + + REGEXP("^x{3}$") { + DENIES ("x"); + DENIES ("xx"); + ACCEPTS("xxx"); + DENIES ("xxxx"); + DENIES ("xxxxx"); + DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + } + + REGEXP("x.{3,10}$") { + for (size_t size = 0; size < 20; ++size) { + ystring str = ystring(size*2, 'b') + "x" + ystring(size, 'e'); + if (size >= 3 && size <= 10) + ACCEPTS(str.c_str()); + else + DENIES(str.c_str()); + } + } +} + +Y_UNIT_TEST(UTF8) +{ + REGEXP2("^.$", "u") { + // A single-byte sequence 0xxx xxxx + ACCEPTS("\x41"); + DENIES ("\x81"); + + // A two-byte sequence: 110x xxxx | 10xx xxxx + ACCEPTS("\xC1\x81"); + DENIES ("\xC1"); + DENIES ("\xC1\x41"); + DENIES ("\xC1\xC2"); + DENIES ("\xC1\x81\x82"); + + // A three-byte sequence: 1110 xxxx | 10xx xxxx | 10xx xxxx + ACCEPTS("\xE1\x81\x82"); + DENIES ("\xE1"); + DENIES ("\xE1\x42"); + DENIES ("\xE1\x42\x43"); + DENIES ("\xE1\xC2\xC3"); + DENIES ("\xE1\x82"); + DENIES ("\xE1\x82\x83\x84"); + + // A four-byte sequence: 1111 0xxx | 10xx xxxx | 10xx xxxx | 10xx xxxx + ACCEPTS("\xF1\x81\x82\x83"); + } + + REGEXP2("x\xD0\xA4y", "u") ACCEPTS("x\xD0\xA4y"); +} + +Y_UNIT_TEST(AndNot) +{ + REGEXP2("<([0-9]+&~123&~456)>", "a") { + ACCEPTS("<111>"); + ACCEPTS("<124>"); + DENIES ("<123>"); + DENIES ("<456>"); + DENIES ("<abc>"); + } + + REGEXP2("[0-9]+\\&1+", "a") { + DENIES("111"); + ACCEPTS("123&111"); + } +} + +Y_UNIT_TEST(Empty) +{ + Scanners s("\\s*", "n"); + Pire::Scanner::State state; + s.fast.Initialize(state); + UNIT_ASSERT(s.fast.Final(state)); + Pire::SimpleScanner::State stateSF; + s.simple.Initialize(stateSF); + UNIT_ASSERT(s.simple.Final(stateSF)); +} + +Y_UNIT_TEST(Misc) +{ + REGEXP2("^[^\\s=/>]*$", "n") ACCEPTS("a"); + REGEXP("\\t") ACCEPTS("\t"); + + SCANNER(ParseRegexp(".*") & ~ParseRegexp(".*http.*")) { + ACCEPTS("str"); + DENIES("str_http"); + } + + SCANNER(~Pire::Fsm()) ACCEPTS("str"); +} + +Y_UNIT_TEST(Ranges) +{ + REGEXP("a\\W") { + ACCEPTS("a,"); + DENIES("ab"); + } + + try { + REGEXP("abc[def") {} + UNIT_ASSERT(!"Should report syntax error"); + } + catch (Pire::Error&) {} +} + +Y_UNIT_TEST(Reverse) +{ + SCANNER(ParseRegexp("abcdef").Reverse()) { + ACCEPTS("fedcba"); + DENIES ("abcdef"); + } +} + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif + +Y_UNIT_TEST(PrefixSuffix) +{ + static const char* pattern = "-->"; + Pire::Fsm fsm = ParseRegexp(pattern, "n"); + Pire::Scanner ngsc = (~Pire::Fsm::MakeFalse() + fsm).Compile<Pire::Scanner>(); + Pire::Scanner gsc = (~fsm.Surrounded() + fsm).Compile<Pire::Scanner>(); + Pire::Scanner rsc = fsm.Reverse().Compile<Pire::Scanner>(); + + static const char* text = "1234567890 --> middle --> end"; + const char* end = Pire::LongestPrefix(gsc, text, text + strlen(text)); + UNIT_ASSERT_EQUAL(end, text + 14); + const char* begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1; + UNIT_ASSERT_EQUAL(begin, text + 11); + auto view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(gsc, text)); + UNIT_ASSERT_EQUAL(view.data(), text + 11); + UNIT_ASSERT_EQUAL(view.size(), 3); + + end = Pire::LongestPrefix(ngsc, text, text + strlen(text)); + UNIT_ASSERT_EQUAL(end, text + 25); + begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1; + UNIT_ASSERT_EQUAL(begin, text + 22); + view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(ngsc, text)); + UNIT_ASSERT_EQUAL(view.data(), text + 22); + UNIT_ASSERT_EQUAL(view.size(), 3); + + end = Pire::ShortestPrefix(gsc, text, text + strlen(text)); + UNIT_ASSERT_EQUAL(end, text + 14); + begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1; + UNIT_ASSERT_EQUAL(begin, text + 11); + view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(gsc, text)); + UNIT_ASSERT_EQUAL(view.data(), text + 11); + UNIT_ASSERT_EQUAL(view.size(), 3); + + end = Pire::ShortestPrefix(ngsc, text, text + strlen(text)); + UNIT_ASSERT_EQUAL(end, text + 14); + begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1; + UNIT_ASSERT_EQUAL(begin, text + 11); + view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(ngsc, text)); + UNIT_ASSERT_EQUAL(view.data(), text + 11); + UNIT_ASSERT_EQUAL(view.size(), 3); +} +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +Y_UNIT_TEST(PrefixSuffixEmptyView) { + const std::string_view empty{}; + auto checkAnswer = [](std::string_view answer) { + return !answer.data() && answer.size() == 0; + }; + + TVector<ystring> patterns = { + "", + "a", + ".*", + "a.*", + ".*a" + }; + + for (const auto& pattern: patterns) { + Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); + UNIT_ASSERT_C(checkAnswer(Pire::ShortestPrefix(sc, empty)), pattern); + UNIT_ASSERT_C(checkAnswer(Pire::LongestPrefix(sc, empty)), pattern); + UNIT_ASSERT_C(checkAnswer(Pire::ShortestSuffix(sc, empty)), pattern); + UNIT_ASSERT_C(checkAnswer(Pire::LongestSuffix(sc, empty)), pattern); + } +} + +namespace { + ssize_t LongestPrefixLen(const char* pattern, const char* str) + { + Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); + const char* end = Pire::LongestPrefix(sc, str, str + strlen(str)); + return end ? end - str : -1; + } + + ssize_t ShortestPrefixLen(const char* pattern, const char* str) + { + Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); + const char* end = Pire::ShortestPrefix(sc, str, str + strlen(str)); + return end ? end - str : -1; + } + + ssize_t LongestSuffixLen(const char* pattern, const char* str) + { + Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); + const char* rbegin = str + strlen(str) - 1; + const char* rend = Pire::LongestSuffix(sc, rbegin, str - 1); + return rend ? rbegin - rend : -1; + } + + ssize_t ShortestSuffixLen(const char* pattern, const char* str) { + Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>(); + const char* rbegin = str + strlen(str) - 1; + const char* rend = Pire::ShortestSuffix(sc, rbegin, str - 1); + return rend ? rbegin - rend : -1; + } +} + +Y_UNIT_TEST(ScanBoundaries) +{ + struct Case { + ystring pattern; + ystring text; + ssize_t shortestPrefixLen; + ssize_t longestPrefixLen; + + ystring ToString() const { + return ystring("Pattern: ") + pattern + ", text: " + text; + } + }; + + TVector <Case> cases = { + { + "a*", + "", + 0, + 0, + }, + { + "a", + "", + -1, + -1, + }, + { + "fixed", + "fixed prefix", + 5, + 5, + }, + { + "fixed", + "a fixed nonexistent prefix", + -1, + -1, + }, + { + "a*", + "aaabbb", + 0, + 3, + }, + { + "a*", + "bbbbbb", + 0, + 0, + }, + { + "a*", + "aaaaaa", + 0, + 6, + }, + { + "aa*", + "aaabbb", + 1, + 3, + }, + { + "a*a", + "aaaaaa", + 1, + 6, + }, + { + ".*a", + "bbbba", + 5, + 5, + }, + { + ".*", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-", + 0, + 80, + }, + { + ".*a", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a", + 81, + 81, + }, + { + ".*a", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a", + 81, + 162, + }, + { + ".*b", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-", + -1, + -1, + }, + { + ".*a.*", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b", + 81, + 162, + }, + { + ".*a.*b", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b", + 162, + 162, + }, + { + "1.*a.*", + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a" + "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b", + 81, + 162, + }, + { + "a+", + "bbbbbb", + -1, + -1, + }, + }; + + for (const auto& test: cases) { + UNIT_ASSERT_EQUAL_C(ShortestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.shortestPrefixLen, test.ToString()); + UNIT_ASSERT_EQUAL_C(LongestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.longestPrefixLen, test.ToString()); + auto reversed = test.text; + ReverseInPlace(reversed); + UNIT_ASSERT_EQUAL_C(ShortestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.shortestPrefixLen, test.ToString()); + UNIT_ASSERT_EQUAL_C(LongestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.longestPrefixLen, test.ToString()); + } +} + +Y_UNIT_TEST(ScanTermination) +{ + Pire::Scanner sc = Pire::Lexer("aaa").Parse().Compile<Pire::Scanner>(); + // Scanning must terminate at first dead state. If it does not, + // we will pass through the end of our string and end up with segfault. + const char str[] = "aaab"; + const char* p = Pire::LongestPrefix(sc, &str[0], &str[0] + sizeof(str)); + UNIT_ASSERT(p == &str[0] + 3); +} + +struct BasicMmapTest { + template <class Scanner> + static void Match(Scanner& sc, const void* ptr, size_t sz, const char* str) + { + try { + sc.Mmap(ptr, sz); + if (!Pire::Impl::IsAligned(ptr, sizeof(size_t))) { + UNIT_ASSERT(!"Failed to check for misaligned mmaping"); + } else { + UNIT_ASSERT(Matches(sc, str)); + } + } + catch (Pire::Error&) {} + } +}; + +template <class Sc1, class Sc2> +void TestCopyingHelper() +{ + Pire::Fsm fsm = ParseRegexp("^r$", ""); + Sc1 sc1(Pire::Fsm(fsm).Compile<Sc1>()); + + // Test copy ctor + UNIT_ASSERT(Matches(Sc2(sc1), "r")); + UNIT_ASSERT(!Matches(Sc2(sc1), "p")); + + // Test '=' operator + Sc2 sc2; + sc2 = sc1; + UNIT_ASSERT(Matches(sc2, "r")); + UNIT_ASSERT(!Matches(sc2, "p")); +} + +template <class Sc1, class Sc2> +void TestCopying() +{ + TestCopyingHelper<Sc1, Sc2>(); + TestCopyingHelper<Sc2, Sc1>(); +} + +Y_UNIT_TEST(Copying) +{ + TestCopying<Pire::Scanner, Pire::NonrelocScanner>(); + TestCopying<Pire::ScannerNoMask, Pire::NonrelocScannerNoMask>(); + TestCopying<Pire::HalfFinalScanner, Pire::NonrelocHalfFinalScanner>(); + TestCopying<Pire::HalfFinalScannerNoMask, Pire::NonrelocHalfFinalScannerNoMask>(); +} + +template<class Scanner> +void MatchScanner(Scanner& scanner) { + UNIT_ASSERT(Matches(scanner, "regexp")); + UNIT_ASSERT(!Matches(scanner, "regxp")); + UNIT_ASSERT(!Matches(scanner, "regexp t")); +} + +template<class Scanner> +void LoadAndMatchScanner(MemoryInput& rbuf, Scanner& scanner) { + Load(&rbuf, scanner); + MatchScanner(scanner); +} + +template<class Scanner> +const char* MmapAndMatchScanner(Scanner& scanner, const char* ptr, size_t size) { + const char* ptr2 = (const char*)scanner.Mmap(ptr, size); + MatchScanner(scanner); + return ptr2; +} + +Y_UNIT_TEST(Serialization) +{ + Scanners s("^regexp$"); + + BufferOutput wbuf; + Save(&wbuf, s.fast); + Save(&wbuf, s.simple); + Save(&wbuf, s.slow); + Save(&wbuf, s.fastNoMask); + Save(&wbuf, s.nonreloc); + Save(&wbuf, s.nonrelocNoMask); + Save(&wbuf, s.halfFinal); + Save(&wbuf, s.halfFinalNoMask); + Save(&wbuf, s.nonrelocHalfFinal); + Save(&wbuf, s.nonrelocHalfFinalNoMask); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + LoadAndMatchScanner(rbuf, s.fast); + LoadAndMatchScanner(rbuf, s.simple); + LoadAndMatchScanner(rbuf, s.slow); + LoadAndMatchScanner(rbuf, s.fastNoMask); + LoadAndMatchScanner(rbuf, s.nonreloc); + LoadAndMatchScanner(rbuf, s.nonrelocNoMask); + LoadAndMatchScanner(rbuf, s.halfFinal); + LoadAndMatchScanner(rbuf, s.halfFinalNoMask); + LoadAndMatchScanner(rbuf, s.nonrelocHalfFinal); + LoadAndMatchScanner(rbuf, s.nonrelocHalfFinalNoMask); + + Pire::Scanner fast; + Pire::SimpleScanner simple; + Pire::SlowScanner slow; + Pire::ScannerNoMask fastNoMask; + Pire::HalfFinalScanner halfFinal; + Pire::HalfFinalScannerNoMask halfFinalNoMask; + Pire::Scanner fast1; + Pire::ScannerNoMask fastNoMask1; + Pire::HalfFinalScanner halfFinal1; + Pire::HalfFinalScannerNoMask halfFinalNoMask1; + const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord); + TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset); + const char* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)); + const char* end = ptr + wbuf.Buffer().Size(); + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + + const char* ptr2 = 0; + ptr2 = MmapAndMatchScanner(fast, ptr, end - ptr); + size_t fastSize = ptr2 - ptr; + ptr = ptr2; + ptr2 = MmapAndMatchScanner(simple, ptr, end - ptr); + size_t simpleSize = ptr2 - ptr; + ptr = ptr2; + ptr = MmapAndMatchScanner(slow, ptr, end - ptr); + ptr = MmapAndMatchScanner(fastNoMask, ptr, end - ptr); + // Nonreloc-s are saved as Scaner-s, so read them again + ptr = MmapAndMatchScanner(fast1, ptr, end - ptr); + ptr = MmapAndMatchScanner(fastNoMask1, ptr, end - ptr); + + ptr = MmapAndMatchScanner(halfFinal, ptr, end - ptr); + ptr = MmapAndMatchScanner(halfFinalNoMask, ptr, end - ptr); + ptr = MmapAndMatchScanner(halfFinal1, ptr, end - ptr); + ptr = MmapAndMatchScanner(halfFinalNoMask1, ptr, end - ptr); + UNIT_ASSERT_EQUAL(ptr, end); + + for (size_t offset = 1; offset < MaxTestOffset; ++offset) { + ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset; + end = ptr + wbuf.Buffer().Size(); + memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size()); + BasicMmapTest::Match(fast, ptr, end - ptr, "regexp"); + ptr = ptr + fastSize; + BasicMmapTest::Match(simple, ptr, end - ptr, "regexp"); + ptr = ptr + simpleSize; + BasicMmapTest::Match(slow, ptr, end - ptr, "regexp"); + } +} + +Y_UNIT_TEST(TestShortcuts) +{ + REGEXP("aaa") { + ACCEPTS("......................................aaa............."); + DENIES ("......................................aab............."); + DENIES ("......................................................"); + } + REGEXP("[ab]{3}") { + ACCEPTS("......................................aaa............."); + ACCEPTS("......................................aab............."); + ACCEPTS("......................................bbb............."); + DENIES ("......................................................"); + } + REGEXP2("\xD0\xB0", "u") { + ACCEPTS("......................................\xD0\xB0..............."); + ACCEPTS("...................................\xD0\xB0.................."); + ACCEPTS("................................\xD0\xB0....................."); + } +} + +template<class Scanner> +void TestGlue() +{ + Scanner sc1 = ParseRegexp("aaa").Compile<Scanner>(); + Scanner sc2 = ParseRegexp("bbb").Compile<Scanner>(); + Scanner glued = Scanner::Glue(sc1, sc2); + UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(2)); + + auto state = RunRegexp(glued, "aaa"); + auto res = glued.AcceptedRegexps(state); + UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1)); + UNIT_ASSERT_EQUAL(*res.first, size_t(0)); + + state = RunRegexp(glued, "bbb"); + res = glued.AcceptedRegexps(state); + UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1)); + UNIT_ASSERT_EQUAL(*res.first, size_t(1)); + + state = RunRegexp(glued, "aaabbb"); + res = glued.AcceptedRegexps(state); + UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(2)); + UNIT_ASSERT_EQUAL(res.first[0], size_t(0)); + UNIT_ASSERT_EQUAL(res.first[1], size_t(1)); + + state = RunRegexp(glued, "ccc"); + res = glued.AcceptedRegexps(state); + UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(0)); + + Scanner sc3 = ParseRegexp("ccc").Compile<Scanner>(); + glued = Scanner::Glue(sc3, glued); + UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(3)); + + state = RunRegexp(glued, "ccc"); + res = glued.AcceptedRegexps(state); + UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1)); + UNIT_ASSERT_EQUAL(res.first[0], size_t(0)); + Scanner sc4 = Scanner::Glue( + ParseRegexp("a", "n").Compile<Scanner>(), + ParseRegexp("c", "n").Compile<Scanner>() + ); + state = RunRegexp(sc4, "ac"); + res = sc4.AcceptedRegexps(state); + UNIT_ASSERT(res.second == res.first); + state = RunRegexp(sc4, "ac"); + UNIT_ASSERT(!sc4.Final(state)); +} + +Y_UNIT_TEST(Glue) +{ + TestGlue<Pire::Scanner>(); + TestGlue<Pire::NonrelocScanner>(); + TestGlue<Pire::ScannerNoMask>(); + TestGlue<Pire::NonrelocScannerNoMask>(); + TestGlue<Pire::HalfFinalScanner>(); + TestGlue<Pire::NonrelocHalfFinalScanner>(); + TestGlue<Pire::HalfFinalScannerNoMask>(); + TestGlue<Pire::NonrelocHalfFinalScannerNoMask>(); +} + +Y_UNIT_TEST(Slow) +{ + Pire::SlowScanner sc = ParseRegexp("a.{30}$", "").Compile<Pire::SlowScanner>(); + // 123456789012345678901234567890 + UNIT_ASSERT( Matches(sc, "....a..............................")); + UNIT_ASSERT(!Matches(sc, "....a...............................")); + UNIT_ASSERT(!Matches(sc, "....a.............................")); +} + +struct astring: private std::vector<char> { + template <typename... A> + inline astring(A&&... a) { + std::string s(std::forward<A>(a)...); + + insert(end(), s.begin(), s.end()); + push_back(0); + } + + inline char* c_str() noexcept { + return data(); + } + + friend astring operator+(astring l, const astring& r) { + l.insert(l.end() - 1, r.begin(), r.end()); + + return l; + } +}; + +Y_UNIT_TEST(Aligned) +{ + using ystring = astring; + + UNIT_ASSERT(Pire::Impl::IsAligned(ystring("x").c_str(), sizeof(void*))); + + REGEXP("xy") { + // Short string with aligned head + ACCEPTS(ystring("xy").c_str()); + DENIES (ystring("yz").c_str()); + // Short string, unaligned + ACCEPTS(ystring(".xy").c_str() + 1); + DENIES (ystring(".yz").c_str() + 1); + // Short string with aligned tail + ACCEPTS((ystring(sizeof(void*) - 2, '.') + "xy").c_str() + sizeof(void*) - 2); + DENIES ((ystring(sizeof(void*) - 2, '.') + "yz").c_str() + sizeof(void*) - 2); + } + + REGEXP("abcde") { + // Everything aligned, match occurs in the middle + ACCEPTS(ystring("ZZZZZabcdeZZZZZZ").c_str()); + DENIES (ystring("ZZZZZabcdfZZZZZZ").c_str()); + // Unaligned head + ACCEPTS(ystring(".ZabcdeZZZ").c_str() + 1); + DENIES (ystring(".ZxbcdeZZZ").c_str() + 1); + // Unaligned tail + ACCEPTS(ystring("ZZZZZZZZZZZZZabcde").c_str()); + DENIES (ystring("ZZZZZZZZZZZZZabcdf").c_str()); + } +} + +#undef Run + +template <class Scanner> +void BasicTestEmptySaveLoadMmap() +{ + Scanner sc; + UNIT_ASSERT(sc.Empty()); + UNIT_ASSERT_EQUAL(sc.RegexpsCount(), size_t(0)); + UNIT_CHECKPOINT(); Pire::Runner(sc).Begin().Run("a string", 7).End(); // should not crash + + BufferOutput wbuf; + UNIT_CHECKPOINT(); Save(&wbuf, sc); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + Scanner sc3; + /*UNIT_CHECKPOINT();*/ Load(&rbuf, sc3); + UNIT_ASSERT(sc3.Empty()); + UNIT_CHECKPOINT(); Pire::Runner(sc3).Begin().Run("a string", 7).End(); + + Scanner sc4; + /*UNIT_CHECKPOINT();*/ const char* ptr = (const char*) sc4.Mmap(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + UNIT_ASSERT(ptr == wbuf.Buffer().Data() + wbuf.Buffer().Size()); + UNIT_ASSERT(sc4.Empty()); + UNIT_CHECKPOINT(); Pire::Runner(sc4).Begin().Run("a string", 7).End(); +} + +Y_UNIT_TEST(EmptyScanner) +{ + // Tests for Scanner + BasicTestEmptySaveLoadMmap<Pire::Scanner>(); + BasicTestEmptySaveLoadMmap<Pire::ScannerNoMask>(); + BasicTestEmptySaveLoadMmap<Pire::HalfFinalScanner>(); + BasicTestEmptySaveLoadMmap<Pire::HalfFinalScannerNoMask>(); + + Pire::Scanner sc; + Pire::Scanner scsc = Pire::Scanner::Glue(sc, sc); + UNIT_ASSERT(scsc.Empty()); + UNIT_ASSERT_EQUAL(scsc.RegexpsCount(), size_t(0)); + UNIT_CHECKPOINT(); Pire::Runner(scsc).Begin().Run("a string", 7).End(); + + Pire::Scanner sc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>(); + UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1)); + UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End(); + UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(scsc, sc2).RegexpsCount(), size_t(1)); + UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(scsc, sc2)).Begin().Run("a string", 7).End(); + UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc).RegexpsCount(), size_t(1)); + UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc)).Begin().Run("a string", 7).End(); + + // Tests for NonrelocScanner + Pire::NonrelocScanner nsc; + UNIT_ASSERT(nsc.Empty()); + UNIT_ASSERT_EQUAL(nsc.RegexpsCount(), size_t(0)); + UNIT_CHECKPOINT(); Pire::Runner(nsc).Begin().Run("a string", 7).End(); + + Pire::NonrelocScanner nsc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>(); + UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1)); + UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End(); + + { + BufferOutput wbuf; + UNIT_CHECKPOINT(); Save(&wbuf, nsc); + + MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size()); + Pire::NonrelocScanner nsc3; + /*UNIT_CHECKPOINT();*/ Load(&rbuf, nsc3); + UNIT_ASSERT(nsc3.Empty()); + UNIT_CHECKPOINT(); Pire::Runner(nsc3).Begin().Run("a string", 7).End(); + } + + BasicTestEmptySaveLoadMmap<Pire::SimpleScanner>(); + + BasicTestEmptySaveLoadMmap<Pire::SlowScanner>(); +} + +Y_UNIT_TEST(NullPointer) +{ + const char* null = 0; + Pire::Scanner sc = Pire::Fsm().Compile<Pire::Scanner>(); + Pire::Runner(sc).Begin().Run(null, null).End(); +} + +} diff --git a/contrib/libs/pire/ut/read_unicode_ut.cpp b/contrib/libs/pire/ut/read_unicode_ut.cpp new file mode 100644 index 0000000000..f0433401c7 --- /dev/null +++ b/contrib/libs/pire/ut/read_unicode_ut.cpp @@ -0,0 +1,298 @@ +/* + * unicode_range_ut.cpp -- + * + * Copyright (c) 2019 YANDEX LLC + * Author: Karina Usmanova <usmanova.karin@yandex.ru> + * + * This file is part of Pire, the Perl Incompatible + * Regular Expressions library. + * + * Pire is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pire is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * You should have received a copy of the GNU Lesser Public License + * along with Pire. If not, see <http://www.gnu.org/licenses>. + */ + + +#include <pire.h> +#include "stub/cppunit.h" +#include "common.h" + +Y_UNIT_TEST_SUITE(ReadUnicodeTest) { + ystring CreateStringWithZeroSymbol(const char* str, size_t pos) { + ystring result = str; + Y_ASSERT(pos < result.size()); + result[pos] = '\0'; + return result; + } + + Y_UNIT_TEST(ZeroSymbol) + { + REGEXP("\\x{0}") { + ACCEPTS(CreateStringWithZeroSymbol("a", 0)); + ACCEPTS(CreateStringWithZeroSymbol("some text", 3)); + DENIES("string without zero"); + } + + REGEXP("the\\x00middle") { + ACCEPTS(CreateStringWithZeroSymbol("in the middle", 6)); + DENIES(CreateStringWithZeroSymbol("in the middle", 5)); + DENIES("in the middle"); + } + } + + Y_UNIT_TEST(SymbolsByCodes) + { + REGEXP("\\x{41}") { + ACCEPTS("A"); + ACCEPTS("tAst string"); + DENIES("test string"); + } + + REGEXP("\\x26abc") { + ACCEPTS("&abc;"); + DENIES("test &ab"); + DENIES("without"); + } + } + + Y_UNIT_TEST(ErrorsWhileCompiling) + { + UNIT_ASSERT(HasError("\\x")); + UNIT_ASSERT(HasError("\\x0")); + UNIT_ASSERT(HasError("\\xfu")); + UNIT_ASSERT(HasError("\\xs1")); + UNIT_ASSERT(HasError("\\x 0")); + UNIT_ASSERT(HasError("\\x0 ")); + + UNIT_ASSERT(HasError("\\x{2A1")); + UNIT_ASSERT(HasError("\\x{")); + UNIT_ASSERT(HasError("\\x}")); + UNIT_ASSERT(HasError("\\x2}")); + UNIT_ASSERT(HasError("\\x{{3}")); + UNIT_ASSERT(HasError("\\x{2a{5}")); + + UNIT_ASSERT(HasError("\\x{}")); + UNIT_ASSERT(HasError("\\x{+3}")); + UNIT_ASSERT(HasError("\\x{-3}")); + UNIT_ASSERT(HasError("\\x{ 2F}")); + UNIT_ASSERT(HasError("\\x{2A F}")); + UNIT_ASSERT(HasError("\\x{2Arft}")); + UNIT_ASSERT(HasError("\\x{110000}")); + + UNIT_ASSERT(!HasError("\\x{fB1}")); + UNIT_ASSERT(!HasError("\\x00")); + UNIT_ASSERT(!HasError("\\x{10FFFF}")); + } + + Y_UNIT_TEST(OneCharacterRange) + { + SCANNER("[\\x{61}]") { + ACCEPTS("a"); + ACCEPTS("bac"); + DENIES("test"); + } + + SCANNER("[\\x3f]") { + ACCEPTS("?"); + ACCEPTS("test?"); + DENIES("test"); + } + } + + Y_UNIT_TEST(CharacterRange) { + REGEXP("[\\x{61}\\x62\\x{3f}\\x26]") { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("?"); + ACCEPTS("acd"); + ACCEPTS("bcd"); + ACCEPTS("cd?"); + ACCEPTS("ab?"); + DENIES("cd"); + } + + REGEXP("[\\x{61}-\\x{63}]") { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("c"); + ACCEPTS("qwertya"); + DENIES("d"); + } + + REGEXP("[\\x61-\\x61]") { + ACCEPTS("a"); + ACCEPTS("qwertya"); + DENIES("b"); + } + + REGEXP("[\\x26\\x{61}-\\x{62}\\x{3f}]") { + ACCEPTS("&"); + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("?"); + ACCEPTS("ade"); + ACCEPTS("ab?"); + DENIES("d"); + } + + REGEXP("[\\x{41}-\\x{42}\\x{61}-\\x{62}]") { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("A"); + ACCEPTS("B"); + DENIES("c"); + DENIES("C"); + } + + REGEXP("[\\x{41}-\\x{42}][\\x{61}-\\x{62}]") { + ACCEPTS("Aa"); + ACCEPTS("Ab"); + ACCEPTS("Ba"); + ACCEPTS("Bb"); + DENIES("a"); + DENIES("b"); + DENIES("A"); + DENIES("B"); + DENIES("ab"); + DENIES("AB"); + DENIES("Ca"); + } + } + + Y_UNIT_TEST(RangeExcludeCharacters) { + REGEXP("[^\\x{61}]") { + ACCEPTS("b"); + ACCEPTS("c"); + ACCEPTS("aba"); + DENIES("a"); + DENIES("aaa"); + } + + REGEXP("[^\\x{61}-\\x{7a}]") { + ACCEPTS("A"); + ACCEPTS("123"); + ACCEPTS("acb1"); + DENIES("a"); + DENIES("abcxyz"); + } + } + + Y_UNIT_TEST(MixedRange) { + REGEXP("[\\x{61}B]") { + ACCEPTS("a"); + ACCEPTS("B"); + ACCEPTS("atestB"); + DENIES("test"); + } + + REGEXP("[^\\x{61}A]") { + ACCEPTS("b"); + ACCEPTS("B"); + ACCEPTS("atestB"); + DENIES("a"); + DENIES("A"); + DENIES("aaAA"); + } + + REGEXP("[0-9][\\x{61}-\\x{62}A-B]") { + ACCEPTS("0a"); + ACCEPTS("1A"); + ACCEPTS("5b"); + ACCEPTS("9B"); + ACCEPTS("1atestB"); + ACCEPTS("2Atest"); + DENIES("aB"); + DENIES("testb"); + DENIES("test"); + } + + REGEXP("[\\x{61}-c]") { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("c"); + ACCEPTS("testb"); + DENIES("d"); + } + + REGEXP("[^a-\\x{7a}]") { + ACCEPTS("A"); + ACCEPTS("123"); + ACCEPTS("acb1"); + DENIES("a"); + DENIES("abcxyz"); + } + + REGEXP("[\\x{41}-Ba-\\x{62}]") { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("A"); + ACCEPTS("B"); + DENIES("c"); + DENIES("C"); + } + } + + Y_UNIT_TEST(CompilingRange) + { + UNIT_ASSERT(HasError("[\\x41")); + UNIT_ASSERT(HasError("[\\xfq]")); + UNIT_ASSERT(HasError("[\\x{01}-]")); + + UNIT_ASSERT(!HasError("[\\x{10FFFF}]")); + UNIT_ASSERT(!HasError("[\\x{00}]")); + UNIT_ASSERT(!HasError("[\\x{abc}-\\x{FFF}]")); + + UNIT_ASSERT(!HasError("[^\\xFF]")); + UNIT_ASSERT(!HasError("[^\\x{FF}-\\x{FF0}]")); + UNIT_ASSERT(!HasError("[-\\x{01}]")); + } + + Y_UNIT_TEST(UnicodeRepetition) + { + REGEXP("^\\x{78}{3,6}$") { + DENIES ("xx"); + ACCEPTS("xxx"); + ACCEPTS("xxxx"); + ACCEPTS("xxxxx"); + ACCEPTS("xxxxxx"); + DENIES ("xxxxxxx"); + } + + REGEXP("^x{3,}$") { + DENIES ("xx"); + ACCEPTS("xxx"); + ACCEPTS("xxxx"); + ACCEPTS("xxxxxxxxxxx"); + ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + } + + REGEXP("^\\x{78}{3}$") { + DENIES ("x"); + DENIES ("xx"); + ACCEPTS("xxx"); + DENIES ("xxxx"); + DENIES ("xxxxx"); + DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + } + + REGEXP("^([\\x{78}-\\x{79}]){2}$") { + DENIES("x"); + DENIES("y"); + ACCEPTS("xx"); + ACCEPTS("xy"); + ACCEPTS("yx"); + ACCEPTS("yy"); + DENIES("xxy"); + DENIES("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + } + } + +} diff --git a/contrib/libs/pire/ut/stub/cppunit.h b/contrib/libs/pire/ut/stub/cppunit.h new file mode 100644 index 0000000000..6d15ce0912 --- /dev/null +++ b/contrib/libs/pire/ut/stub/cppunit.h @@ -0,0 +1,14 @@ +#ifndef PIRE_STUB_CPPUNIT_H_INCLUDED +#define PIRE_STUB_CPPUNIT_H_INCLUDED + +#include <library/cpp/testing/unittest/registar.h> +#include <util/stream/mem.h> + +#define UNIT_CHECKPOINT() + +typedef TMemoryInput MemoryInput; + +#endif + + + |