diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/pire/ut/regexp_ut.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex/pire/ut/regexp_ut.cpp')
-rw-r--r-- | library/cpp/regex/pire/ut/regexp_ut.cpp | 318 |
1 files changed, 318 insertions, 0 deletions
diff --git a/library/cpp/regex/pire/ut/regexp_ut.cpp b/library/cpp/regex/pire/ut/regexp_ut.cpp new file mode 100644 index 0000000000..e7206de9ad --- /dev/null +++ b/library/cpp/regex/pire/ut/regexp_ut.cpp @@ -0,0 +1,318 @@ +#include <library/cpp/testing/unittest/registar.h> + +#include <library/cpp/regex/pire/regexp.h> +#include <library/cpp/regex/pire/pcre2pire.h> + +Y_UNIT_TEST_SUITE(TRegExp) { + using namespace NRegExp; + + Y_UNIT_TEST(False) { + UNIT_ASSERT(!TMatcher(TFsm::False()).Match("").Final()); + UNIT_ASSERT(!TMatcher(TFsm::False()).Match(TStringBuf{}).Final()); + } + + Y_UNIT_TEST(Surround) { + UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final()); + UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(false))).Match("aqwb").Final()); + } + + Y_UNIT_TEST(Boundaries) { + UNIT_ASSERT(!TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final()); + UNIT_ASSERT(!TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final()); + UNIT_ASSERT(TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final()); + UNIT_ASSERT(TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final()); + UNIT_ASSERT(!TMatcher(TFsm("qw$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final()); + UNIT_ASSERT(!TMatcher(TFsm("^qw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final()); + + UNIT_ASSERT(TMatcher(TFsm("^aqwb$", TFsm::TOptions().SetSurround(true))) + .Match(TStringBuf("a"), true, false) + .Match(TStringBuf("q"), false, false) + .Match(TStringBuf("w"), false, false) + .Match(TStringBuf("b"), false, true) + .Final()); + } + + Y_UNIT_TEST(Case) { + UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true))).Match("Qw").Final()); + UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false))).Match("Qw").Final()); + } + + Y_UNIT_TEST(UnicodeCase) { + UNIT_ASSERT(TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(true))).Match("Ab").Final()); + UNIT_ASSERT(!TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(false))).Match("Ab").Final()); + } + + Y_UNIT_TEST(Utf) { + NRegExp::TFsmBase::TOptions opts; + opts.Charset = CODES_UTF8; + opts.Surround = true; + UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("wtf").Final()); + UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("чзн").Final()); + UNIT_ASSERT(TMatcher(TFsm("ч.*", opts)).Match("чзн").Final()); + UNIT_ASSERT(!TMatcher(TFsm("чзн", opts)).Match("чзх").Final()); + } + + Y_UNIT_TEST(AndNot) { + NRegExp::TFsmBase::TOptions opts; + opts.AndNotSupport = true; + { + NRegExp::TFsm fsm(".*&~([0-9]*)", opts); + UNIT_ASSERT(TMatcher(fsm).Match("a2").Final()); + UNIT_ASSERT(TMatcher(fsm).Match("ab").Final()); + UNIT_ASSERT(TMatcher(fsm).Match("1a").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("12").Final()); + } + { + NRegExp::TFsm fsm(".*&~(.*[0-9].*)", opts); + UNIT_ASSERT(TMatcher(fsm).Match("ab").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("a2").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("1a").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("12").Final()); + } + { + NRegExp::TFsm fsm( + "((([a-z0-9_\\-]+[.])*[a-z0-9_\\-]+)" + "&~(\\d+[.]\\d+[.]\\d+[.]\\d+))(:\\d+)?", + TFsm::TOptions().SetCaseInsensitive(true).SetAndNotSupport(true) + ); + UNIT_ASSERT(TMatcher(fsm).Match("yandex.ru").Final()); + UNIT_ASSERT(TMatcher(fsm).Match("yandex").Final()); + UNIT_ASSERT(TMatcher(fsm).Match("yandex:80").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1").Final()); + UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1:8080").Final()); + } + } + + Y_UNIT_TEST(Glue) { + TFsm glued = + TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true)) | + TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false)) | + TFsm("abc", TFsm::TOptions().SetCaseInsensitive(false)); + UNIT_ASSERT(TMatcher(glued).Match("Qw").Final()); + UNIT_ASSERT(TMatcher(glued).Match("Qw").Final()); + UNIT_ASSERT(TMatcher(glued).Match("abc").Final()); + UNIT_ASSERT(!TMatcher(glued).Match("Abc").Final()); + } + + Y_UNIT_TEST(Capture1) { + TCapturingFsm fsm("here we have user_id=([a-z0-9]+);"); + + TSearcher searcher(fsm); + searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a")); + } + + Y_UNIT_TEST(Capture2) { + TCapturingFsm fsm("w([abcdez]+)f"); + + TSearcher searcher(fsm); + searcher.Search("wabcdef"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("abcde")); + } + + Y_UNIT_TEST(Capture3) { + TCapturingFsm fsm("http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)", + TFsm::TOptions().SetCapture(2)); + + TSearcher searcher(fsm); + searcher.Search("http://vkontakte.ru/id100500"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500")); + } + + Y_UNIT_TEST(Capture4) { + TCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!", + TFsm::TOptions().SetCharset(CODES_UTF8)); + + TSearcher searcher(fsm); + searcher.Search(" Здравствуйте, Уважаемый (-ая)! "); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)")); + } + + Y_UNIT_TEST(Capture5) { + TCapturingFsm fsm("away\\.php\\?to=http:([^\"])+\""); + TSearcher searcher(fsm); + searcher.Search("\"/away.php?to=http:some.addr\"&id=1"); + UNIT_ASSERT(searcher.Captured()); + //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr")); + } + + Y_UNIT_TEST(Capture6) { + TCapturingFsm fsm("(/to-match-with)"); + TSearcher searcher(fsm); + searcher.Search("/some/table/path/to-match-with"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("/to-match-with")); + } + + Y_UNIT_TEST(Capture7) { + TCapturingFsm fsm("(pref.*suff)"); + TSearcher searcher(fsm); + searcher.Search("ala pref bla suff cla"); + UNIT_ASSERT(searcher.Captured()); + //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref bla suff")); + } + + Y_UNIT_TEST(CaptureXA) { + TCapturingFsm fsm(".*(xa).*"); + + TSearcher searcher(fsm); + searcher.Search("xa"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xa")); + } + + Y_UNIT_TEST(CaptureWrongXX) { + TCapturingFsm fsm(".*(xx).*"); + + TSearcher searcher(fsm); + searcher.Search("xx"); + UNIT_ASSERT(searcher.Captured()); + // Surprise! + // TCapturingFsm uses a fast - O(|text|) - but incorrect algorithm. + // It works more or less for a particular class of regexps to which ".*(xx).*" does not belong. + // So it returns not the expected "xx" but just the second "x" instead. + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("x")); + } + + Y_UNIT_TEST(CaptureRight1XX) { + TCapturingFsm fsm("[^x]+(xx).*"); + + TSearcher searcher(fsm); + + searcher.Search("xxx"); + UNIT_ASSERT(!searcher.Captured()); + } + + Y_UNIT_TEST(CaptureRight2XX) { + TCapturingFsm fsm("[^x]+(xx).*"); + + TSearcher searcher(fsm); + + searcher.Search("axx"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx")); + } + + Y_UNIT_TEST(CaptureRight3XX) { + TCapturingFsm fsm("[^x]+(xx).*"); + + TSearcher searcher(fsm); + + searcher.Search("axxb"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx")); + } + + Y_UNIT_TEST(SlowCaptureXX) { + TSlowCapturingFsm fsm(".*(xx).*"); + + TSlowSearcher searcher(fsm); + searcher.Search("xx"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx")); + } + + Y_UNIT_TEST(SlowCapture) { + TSlowCapturingFsm fsm("^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)", + TFsm::TOptions().SetCapture(2)); + TSlowSearcher searcher(fsm); + searcher.Search("http://vkontakte.ru/id100500"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500")); + } + + Y_UNIT_TEST(SlowCaptureGreedy) { + TSlowCapturingFsm fsm(".*(pref.*suff)"); + TSlowSearcher searcher(fsm); + searcher.Search("pref ala bla pref cla suff dla"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref cla suff")); + } + + Y_UNIT_TEST(SlowCaptureNonGreedy) { + TSlowCapturingFsm fsm(".*?(pref.*suff)"); + TSlowSearcher searcher(fsm); + searcher.Search("pref ala bla pref cla suff dla"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref ala bla pref cla suff")); + } + + Y_UNIT_TEST(SlowCapture2) { + TSlowCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!", + TFsm::TOptions().SetCharset(CODES_UTF8)); + + TSlowSearcher searcher(fsm); + searcher.Search(" Здравствуйте, Уважаемый (-ая)! "); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)")); + } + + Y_UNIT_TEST(SlowCapture3) { + TSlowCapturingFsm fsm("here we have user_id=([a-z0-9]+);"); + TSlowSearcher searcher(fsm); + searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a")); + } + + Y_UNIT_TEST(SlowCapture4) { + TSlowCapturingFsm fsm("away\\.php\\?to=http:([^\"]+)\""); + TSlowSearcher searcher(fsm); + searcher.Search("\"/away.php?to=http:some.addr\"&id=1"); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr")); + } + + Y_UNIT_TEST(CapturedEmptySlow) { + TSlowCapturingFsm fsm("Comments=(.*)$"); + TSlowSearcher searcher(fsm); + searcher.Search("And Comments="); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("")); + } + + Y_UNIT_TEST(CaptureInOrFirst) { + TSlowCapturingFsm fsm("(A)|A"); + TSlowSearcher searcher(fsm); + searcher.Search("A"); + UNIT_ASSERT(searcher.Captured()); + } + + Y_UNIT_TEST(CaptureInOrSecond) { + TSlowCapturingFsm fsm("A|(A)"); + TSlowSearcher searcher(fsm); + searcher.Search("A"); + UNIT_ASSERT(!searcher.Captured()); + } + + Y_UNIT_TEST(CaptureOutside) { + TSlowCapturingFsm fsm("((ID=([0-9]+))?)"); + TSlowSearcher searcher(fsm); + searcher.Search("ID="); + UNIT_ASSERT(searcher.Captured()); + UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("")); + } + + Y_UNIT_TEST(CaptureInside) { + TSlowCapturingFsm fsm("((ID=([0-9]+))?)", + TFsm::TOptions().SetCapture(2)); + TSlowSearcher searcher(fsm); + searcher.Search("ID="); + UNIT_ASSERT(!searcher.Captured()); + } + + Y_UNIT_TEST(Pcre2PireTest) { + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)"), "(fake)"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)??"), "(fake)?"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)*?fake"), "(fake)*fake"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>fake)"), "(fake)"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("fake\\#"), "fake#"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>)fake"), "fake"); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?P<field1>)(?P<field2>))"), ""); + UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?:fake))"), "((fake))"); + } +} |