aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/pire/ut/approx_matching_ut.cpp
diff options
context:
space:
mode:
authormonster <monster@ydb.tech>2022-07-07 14:41:37 +0300
committermonster <monster@ydb.tech>2022-07-07 14:41:37 +0300
commit06e5c21a835c0e923506c4ff27929f34e00761c2 (patch)
tree75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /contrib/libs/pire/ut/approx_matching_ut.cpp
parent03f024c4412e3aa613bb543cf1660176320ba8f4 (diff)
downloadydb-06e5c21a835c0e923506c4ff27929f34e00761c2.tar.gz
fix ya.make
Diffstat (limited to 'contrib/libs/pire/ut/approx_matching_ut.cpp')
-rw-r--r--contrib/libs/pire/ut/approx_matching_ut.cpp379
1 files changed, 379 insertions, 0 deletions
diff --git a/contrib/libs/pire/ut/approx_matching_ut.cpp b/contrib/libs/pire/ut/approx_matching_ut.cpp
new file mode 100644
index 0000000000..3b4cb972f6
--- /dev/null
+++ b/contrib/libs/pire/ut/approx_matching_ut.cpp
@@ -0,0 +1,379 @@
+/*
+ * approx_matching_ut.cpp --
+ *
+ * Copyright (c) 2019 YANDEX LLC, Karina Usmanova <usmanova.karin@yandex.ru>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+
+#include <contrib/libs/pire/pire/pire.h>
+#include "common.h"
+
+Y_UNIT_TEST_SUITE(ApproxMatchingTest) {
+ Pire::Fsm BuildFsm(const char *str)
+ {
+ Pire::Lexer lexer;
+ TVector<wchar32> ucs4;
+
+ lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4));
+ lexer.Assign(ucs4.begin(), ucs4.end());
+ return lexer.Parse();
+ }
+
+ Y_UNIT_TEST(Simple) {
+ auto fsm = BuildFsm("^ab$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("ab");
+ ACCEPTS("ax");
+ ACCEPTS("xb");
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("xab");
+ ACCEPTS("axb");
+ ACCEPTS("abx");
+ ACCEPTS("aab");
+ DENIES("xy");
+ DENIES("abcd");
+ DENIES("xabx");
+ DENIES("");
+ }
+
+ fsm = BuildFsm("^ab$");
+ APPROXIMATE_SCANNER(fsm, 2) {
+ ACCEPTS("ab");
+ ACCEPTS("xy");
+ ACCEPTS("");
+ ACCEPTS("axbx");
+ DENIES("xxabx");
+ DENIES("xbxxx");
+ }
+ }
+
+ Y_UNIT_TEST(SpecialSymbols) {
+ auto fsm = BuildFsm("^.*ab$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("ab");
+ ACCEPTS("xxxxab");
+ ACCEPTS("xxxxabab");
+ DENIES("xxxx");
+ DENIES("abxxxx");
+ }
+
+ fsm = BuildFsm("^[a-c]$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("/");
+ ACCEPTS("");
+ ACCEPTS("ax");
+ DENIES("xx");
+ DENIES("abc");
+ }
+
+ fsm = BuildFsm("^x{4}$");
+ APPROXIMATE_SCANNER(fsm, 2) {
+ DENIES ("x");
+ ACCEPTS("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxx");
+ ACCEPTS("xxxxxx");
+ DENIES ("xxxxxxx");
+ ACCEPTS("xxyy");
+ ACCEPTS("xxyyx");
+ ACCEPTS("xxxxyz");
+ DENIES("xyyy");
+ }
+
+ fsm = BuildFsm("^(a|b)$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("x");
+ ACCEPTS("");
+ ACCEPTS("ax");
+ DENIES("abc");
+ DENIES("xx");
+ }
+
+ fsm = BuildFsm("^(ab|cd)$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("ab");
+ ACCEPTS("cd");
+ ACCEPTS("ax");
+ ACCEPTS("xd");
+ ACCEPTS("abx");
+ ACCEPTS("a");
+ DENIES("abcd");
+ DENIES("xx");
+ DENIES("");
+ }
+
+ fsm = BuildFsm("^[a-c]{3}$");
+ APPROXIMATE_SCANNER(fsm, 2) {
+ ACCEPTS("abc");
+ ACCEPTS("aaa");
+ ACCEPTS("a");
+ ACCEPTS("ax");
+ ACCEPTS("abxcx");
+ DENIES("x");
+ DENIES("");
+ DENIES("xaxx");
+ }
+
+ fsm = BuildFsm("^\\x{61}$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("x");
+ ACCEPTS("");
+ ACCEPTS("ax");
+ DENIES("axx");
+ DENIES("xx");
+ }
+
+ fsm = BuildFsm("^a.bc$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("axxbc");
+ ACCEPTS("abc");
+ ACCEPTS("xabc");
+ ACCEPTS("xaxbc");
+ DENIES("bc");
+ DENIES("abcx");
+ }
+ }
+
+ Y_UNIT_TEST(TestSurrounded) {
+ auto fsm = BuildFsm("abc").Surround();
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("abc");
+ ACCEPTS("xabcx");
+ ACCEPTS("xabx");
+ ACCEPTS("axc");
+ ACCEPTS("bac");
+ DENIES("a");
+ DENIES("xaxxxx");
+ }
+
+ fsm = BuildFsm("^abc$").Surround();
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("abc");
+ ACCEPTS("abcx");
+ ACCEPTS("xabc");
+ ACCEPTS("axc");
+ ACCEPTS("bac");
+ DENIES("xabx");
+ DENIES("axx");
+ }
+ }
+
+ Y_UNIT_TEST(GlueFsm) {
+ auto fsm = BuildFsm("^a$") | BuildFsm("^b$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("");
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("x");
+ ACCEPTS("ab");
+ DENIES("abb");
+ }
+
+ fsm = BuildFsm("^[a-b]$") | BuildFsm("^c{2}$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("cc");
+ ACCEPTS("x");
+ ACCEPTS("xa");
+ ACCEPTS("c");
+ ACCEPTS("xc");
+ ACCEPTS("cxc");
+ ACCEPTS("");
+ }
+ }
+
+ enum MutateOperation {
+ Begin,
+ Substitute = Begin,
+ Delete,
+ Insert,
+ End
+ };
+
+ ystring ChangeText(const ystring& text, int operation, int pos)
+ {
+ auto changedText = text;
+ switch (operation) {
+ case MutateOperation::Substitute:
+ changedText[pos] = 'x';
+ break;
+ case MutateOperation::Delete:
+ changedText.erase(pos, 1);
+ break;
+ case MutateOperation::Insert:
+ changedText.insert(pos, 1, 'x');
+ break;
+ }
+
+ return changedText;
+ }
+
+ Y_UNIT_TEST(StressTest) {
+ ystring text;
+ for (size_t letter = 0; letter < 10; ++letter) {
+ text += ystring(3, letter + 'a');
+ }
+ const ystring regexp = "^" + text + "$";
+ auto fsm = BuildFsm(regexp.Data());
+
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < regexp.size() - 2; ++pos) {
+ for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
+ auto changedText = ChangeText(text, operation, pos);
+ ACCEPTS(changedText);
+ }
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 0) {
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < regexp.size() - 2; ++pos) {
+ for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
+ auto changedText = ChangeText(text, operation, pos);
+ DENIES(changedText);
+ }
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 2) {
+ ACCEPTS(text);
+
+ for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
+ size_t posRight = text.size() - posLeft - 1;
+ for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
+ for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
+ auto changedText = ChangeText(text, operationRight, posRight);
+ changedText = ChangeText(changedText, operationLeft, posLeft);
+ ACCEPTS(changedText);
+ }
+ }
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS(text);
+
+ for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
+ size_t posRight = text.size() - posLeft - 1;
+ for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
+ for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
+ auto changedText = ChangeText(text, operationRight, posRight);
+ changedText = ChangeText(changedText, operationLeft, posLeft);
+ DENIES(changedText);
+ }
+ }
+ }
+ }
+ }
+
+ Y_UNIT_TEST(SwapLetters) {
+ auto fsm = BuildFsm("^abc$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("bac");
+ ACCEPTS("acb");
+ DENIES("cba");
+ DENIES("bax");
+ }
+
+ fsm = BuildFsm("^abcd$");
+ APPROXIMATE_SCANNER(fsm, 2) {
+ ACCEPTS("bacd");
+ ACCEPTS("acbd");
+ ACCEPTS("baxd");
+ ACCEPTS("badc");
+ ACCEPTS("bcad");
+ ACCEPTS("bcda");
+ DENIES("xcbx");
+ DENIES("baxx");
+ DENIES("ba");
+ DENIES("cdab");
+ }
+
+ fsm = BuildFsm("^abc$");
+ APPROXIMATE_SCANNER(fsm, 0) {
+ ACCEPTS("abc");
+ DENIES("bac");
+ }
+
+ fsm = BuildFsm("^[a-c][1-3]$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("a3");
+ ACCEPTS("c");
+ ACCEPTS("1");
+ ACCEPTS("1a");
+ ACCEPTS("3b");
+ DENIES("4a");
+ }
+
+ fsm = BuildFsm("^.*abc$");
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS("ab");
+ ACCEPTS("xxxxbac");
+ DENIES("xxxxa");
+ DENIES("xxxxcb");
+ }
+ }
+
+ Y_UNIT_TEST(SwapStressTest){
+ ystring text;
+ for (size_t letter = 0; letter < 30; ++letter) {
+ text += ystring(1, (letter % 26) + 'a');
+ }
+ const ystring regexp = "^" + text + "$";
+ auto fsm = BuildFsm(regexp.Data());
+ auto changedText = text;
+
+ APPROXIMATE_SCANNER(fsm, 1) {
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < text.size() - 1; ++pos) {
+ changedText[pos] = text[pos + 1];
+ changedText[pos + 1] = text[pos];
+ ACCEPTS(changedText);
+ changedText[pos] = text[pos];
+ changedText[pos + 1] = text[pos + 1];
+ }
+ }
+
+ APPROXIMATE_SCANNER(fsm, 0) {
+ ACCEPTS(text);
+
+ for (size_t pos = 0; pos < text.size() - 1; ++pos) {
+ changedText[pos] = text[pos + 1];
+ changedText[pos + 1] = text[pos];
+ DENIES(changedText);
+ changedText[pos] = text[pos];
+ changedText[pos + 1] = text[pos + 1];
+ }
+ }
+ }
+}