diff options
author | monster <monster@ydb.tech> | 2022-07-07 14:41:37 +0300 |
---|---|---|
committer | monster <monster@ydb.tech> | 2022-07-07 14:41:37 +0300 |
commit | 06e5c21a835c0e923506c4ff27929f34e00761c2 (patch) | |
tree | 75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /contrib/libs/pire/ut/approx_matching_ut.cpp | |
parent | 03f024c4412e3aa613bb543cf1660176320ba8f4 (diff) | |
download | ydb-06e5c21a835c0e923506c4ff27929f34e00761c2.tar.gz |
fix ya.make
Diffstat (limited to 'contrib/libs/pire/ut/approx_matching_ut.cpp')
-rw-r--r-- | contrib/libs/pire/ut/approx_matching_ut.cpp | 379 |
1 files changed, 379 insertions, 0 deletions
diff --git a/contrib/libs/pire/ut/approx_matching_ut.cpp b/contrib/libs/pire/ut/approx_matching_ut.cpp new file mode 100644 index 0000000000..3b4cb972f6 --- /dev/null +++ b/contrib/libs/pire/ut/approx_matching_ut.cpp @@ -0,0 +1,379 @@ +/* + * approx_matching_ut.cpp -- + * + * Copyright (c) 2019 YANDEX LLC, Karina Usmanova <usmanova.karin@yandex.ru> + * + * This file is part of Pire, the Perl Incompatible + * Regular Expressions library. + * + * Pire is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Pire is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * You should have received a copy of the GNU Lesser Public License + * along with Pire. If not, see <http://www.gnu.org/licenses>. + */ + + +#include <contrib/libs/pire/pire/pire.h> +#include "common.h" + +Y_UNIT_TEST_SUITE(ApproxMatchingTest) { + Pire::Fsm BuildFsm(const char *str) + { + Pire::Lexer lexer; + TVector<wchar32> ucs4; + + lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4)); + lexer.Assign(ucs4.begin(), ucs4.end()); + return lexer.Parse(); + } + + Y_UNIT_TEST(Simple) { + auto fsm = BuildFsm("^ab$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("ab"); + ACCEPTS("ax"); + ACCEPTS("xb"); + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("xab"); + ACCEPTS("axb"); + ACCEPTS("abx"); + ACCEPTS("aab"); + DENIES("xy"); + DENIES("abcd"); + DENIES("xabx"); + DENIES(""); + } + + fsm = BuildFsm("^ab$"); + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS("ab"); + ACCEPTS("xy"); + ACCEPTS(""); + ACCEPTS("axbx"); + DENIES("xxabx"); + DENIES("xbxxx"); + } + } + + Y_UNIT_TEST(SpecialSymbols) { + auto fsm = BuildFsm("^.*ab$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("ab"); + ACCEPTS("xxxxab"); + ACCEPTS("xxxxabab"); + DENIES("xxxx"); + DENIES("abxxxx"); + } + + fsm = BuildFsm("^[a-c]$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("c"); + ACCEPTS("/"); + ACCEPTS(""); + ACCEPTS("ax"); + DENIES("xx"); + DENIES("abc"); + } + + fsm = BuildFsm("^x{4}$"); + APPROXIMATE_SCANNER(fsm, 2) { + DENIES ("x"); + ACCEPTS("xx"); + ACCEPTS("xxx"); + ACCEPTS("xxxx"); + ACCEPTS("xxxxx"); + ACCEPTS("xxxxxx"); + DENIES ("xxxxxxx"); + ACCEPTS("xxyy"); + ACCEPTS("xxyyx"); + ACCEPTS("xxxxyz"); + DENIES("xyyy"); + } + + fsm = BuildFsm("^(a|b)$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("x"); + ACCEPTS(""); + ACCEPTS("ax"); + DENIES("abc"); + DENIES("xx"); + } + + fsm = BuildFsm("^(ab|cd)$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("ab"); + ACCEPTS("cd"); + ACCEPTS("ax"); + ACCEPTS("xd"); + ACCEPTS("abx"); + ACCEPTS("a"); + DENIES("abcd"); + DENIES("xx"); + DENIES(""); + } + + fsm = BuildFsm("^[a-c]{3}$"); + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS("abc"); + ACCEPTS("aaa"); + ACCEPTS("a"); + ACCEPTS("ax"); + ACCEPTS("abxcx"); + DENIES("x"); + DENIES(""); + DENIES("xaxx"); + } + + fsm = BuildFsm("^\\x{61}$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("x"); + ACCEPTS(""); + ACCEPTS("ax"); + DENIES("axx"); + DENIES("xx"); + } + + fsm = BuildFsm("^a.bc$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("axxbc"); + ACCEPTS("abc"); + ACCEPTS("xabc"); + ACCEPTS("xaxbc"); + DENIES("bc"); + DENIES("abcx"); + } + } + + Y_UNIT_TEST(TestSurrounded) { + auto fsm = BuildFsm("abc").Surround(); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("abc"); + ACCEPTS("xabcx"); + ACCEPTS("xabx"); + ACCEPTS("axc"); + ACCEPTS("bac"); + DENIES("a"); + DENIES("xaxxxx"); + } + + fsm = BuildFsm("^abc$").Surround(); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("abc"); + ACCEPTS("abcx"); + ACCEPTS("xabc"); + ACCEPTS("axc"); + ACCEPTS("bac"); + DENIES("xabx"); + DENIES("axx"); + } + } + + Y_UNIT_TEST(GlueFsm) { + auto fsm = BuildFsm("^a$") | BuildFsm("^b$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(""); + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("x"); + ACCEPTS("ab"); + DENIES("abb"); + } + + fsm = BuildFsm("^[a-b]$") | BuildFsm("^c{2}$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a"); + ACCEPTS("b"); + ACCEPTS("cc"); + ACCEPTS("x"); + ACCEPTS("xa"); + ACCEPTS("c"); + ACCEPTS("xc"); + ACCEPTS("cxc"); + ACCEPTS(""); + } + } + + enum MutateOperation { + Begin, + Substitute = Begin, + Delete, + Insert, + End + }; + + ystring ChangeText(const ystring& text, int operation, int pos) + { + auto changedText = text; + switch (operation) { + case MutateOperation::Substitute: + changedText[pos] = 'x'; + break; + case MutateOperation::Delete: + changedText.erase(pos, 1); + break; + case MutateOperation::Insert: + changedText.insert(pos, 1, 'x'); + break; + } + + return changedText; + } + + Y_UNIT_TEST(StressTest) { + ystring text; + for (size_t letter = 0; letter < 10; ++letter) { + text += ystring(3, letter + 'a'); + } + const ystring regexp = "^" + text + "$"; + auto fsm = BuildFsm(regexp.Data()); + + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(text); + + for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { + for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) { + auto changedText = ChangeText(text, operation, pos); + ACCEPTS(changedText); + } + } + } + + APPROXIMATE_SCANNER(fsm, 0) { + ACCEPTS(text); + + for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { + for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) { + auto changedText = ChangeText(text, operation, pos); + DENIES(changedText); + } + } + } + + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS(text); + + for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight + size_t posRight = text.size() - posLeft - 1; + for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) { + for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) { + auto changedText = ChangeText(text, operationRight, posRight); + changedText = ChangeText(changedText, operationLeft, posLeft); + ACCEPTS(changedText); + } + } + } + } + + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(text); + + for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight + size_t posRight = text.size() - posLeft - 1; + for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) { + for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) { + auto changedText = ChangeText(text, operationRight, posRight); + changedText = ChangeText(changedText, operationLeft, posLeft); + DENIES(changedText); + } + } + } + } + } + + Y_UNIT_TEST(SwapLetters) { + auto fsm = BuildFsm("^abc$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("bac"); + ACCEPTS("acb"); + DENIES("cba"); + DENIES("bax"); + } + + fsm = BuildFsm("^abcd$"); + APPROXIMATE_SCANNER(fsm, 2) { + ACCEPTS("bacd"); + ACCEPTS("acbd"); + ACCEPTS("baxd"); + ACCEPTS("badc"); + ACCEPTS("bcad"); + ACCEPTS("bcda"); + DENIES("xcbx"); + DENIES("baxx"); + DENIES("ba"); + DENIES("cdab"); + } + + fsm = BuildFsm("^abc$"); + APPROXIMATE_SCANNER(fsm, 0) { + ACCEPTS("abc"); + DENIES("bac"); + } + + fsm = BuildFsm("^[a-c][1-3]$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("a3"); + ACCEPTS("c"); + ACCEPTS("1"); + ACCEPTS("1a"); + ACCEPTS("3b"); + DENIES("4a"); + } + + fsm = BuildFsm("^.*abc$"); + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS("ab"); + ACCEPTS("xxxxbac"); + DENIES("xxxxa"); + DENIES("xxxxcb"); + } + } + + Y_UNIT_TEST(SwapStressTest){ + ystring text; + for (size_t letter = 0; letter < 30; ++letter) { + text += ystring(1, (letter % 26) + 'a'); + } + const ystring regexp = "^" + text + "$"; + auto fsm = BuildFsm(regexp.Data()); + auto changedText = text; + + APPROXIMATE_SCANNER(fsm, 1) { + ACCEPTS(text); + + for (size_t pos = 0; pos < text.size() - 1; ++pos) { + changedText[pos] = text[pos + 1]; + changedText[pos + 1] = text[pos]; + ACCEPTS(changedText); + changedText[pos] = text[pos]; + changedText[pos + 1] = text[pos + 1]; + } + } + + APPROXIMATE_SCANNER(fsm, 0) { + ACCEPTS(text); + + for (size_t pos = 0; pos < text.size() - 1; ++pos) { + changedText[pos] = text[pos + 1]; + changedText[pos + 1] = text[pos]; + DENIES(changedText); + changedText[pos] = text[pos]; + changedText[pos + 1] = text[pos + 1]; + } + } + } +} |