aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/regex/pire/ut/read_unicode_ut.cpp
diff options
context:
space:
mode:
authorthegeorg <thegeorg@yandex-team.com>2025-06-25 00:23:21 +0300
committerthegeorg <thegeorg@yandex-team.com>2025-06-25 00:38:34 +0300
commite7147783ae6a23ee6675fa9f8ca6f43c6af17bc3 (patch)
tree454e5df12108188dd07fff8193566892d22e5909 /library/cpp/regex/pire/ut/read_unicode_ut.cpp
parentebc5e196362b795c9a1ac8efa9d5a997cf07b1a4 (diff)
downloadydb-e7147783ae6a23ee6675fa9f8ca6f43c6af17bc3.tar.gz
pire was archived on GitHub, move the code into library/cpp/regex/pire
commit_hash:018daf4645e87c4e0b31e1191af4e75e48f6d958
Diffstat (limited to 'library/cpp/regex/pire/ut/read_unicode_ut.cpp')
-rw-r--r--library/cpp/regex/pire/ut/read_unicode_ut.cpp307
1 files changed, 307 insertions, 0 deletions
diff --git a/library/cpp/regex/pire/ut/read_unicode_ut.cpp b/library/cpp/regex/pire/ut/read_unicode_ut.cpp
new file mode 100644
index 00000000000..17569096873
--- /dev/null
+++ b/library/cpp/regex/pire/ut/read_unicode_ut.cpp
@@ -0,0 +1,307 @@
+/*
+ * read_unicode_ut.cpp --
+ *
+ * Copyright (c) 2019 YANDEX LLC
+ * Author: Karina Usmanova <usmanova.karin@yandex.ru>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+
+#include <pire.h>
+#include "stub/cppunit.h"
+#include "common.h"
+
+Y_UNIT_TEST_SUITE(ReadUnicodeTest) {
+ // Builds a copy of `str` whose character at index `pos` is overwritten with
+ // '\0'. `pos` has to be a valid index into `str`; this is only checked via
+ // Y_ASSERT.
+ ystring CreateStringWithZeroSymbol(const char* str, size_t pos) {
+ ystring withZero(str);
+ Y_ASSERT(withZero.size() > pos);
+ withZero[pos] = '\0';
+ return withZero;
+ }
+
+ // Checks that the character with code 0 can be written as \x{0} or \x00 and
+ // is matched against subjects that contain an embedded '\0'.
+ Y_UNIT_TEST(ZeroSymbol)
+ {
+ // Braced form: subjects containing a zero character are expected to match,
+ // a subject without one is not.
+ REGEXP("\\x{0}") {
+ ACCEPTS(CreateStringWithZeroSymbol("a", 0));
+ ACCEPTS(CreateStringWithZeroSymbol("some text", 3));
+ DENIES("string without zero");
+ }
+
+ // Two-digit form between literals: index 6 of "in the middle" is the space
+ // separating "the" and "middle", so only that replacement yields
+ // "the\0middle"; index 5 overwrites the 'e' instead.
+ REGEXP("the\\x00middle") {
+ ACCEPTS(CreateStringWithZeroSymbol("in the middle", 6));
+ DENIES(CreateStringWithZeroSymbol("in the middle", 5));
+ DENIES("in the middle");
+ }
+ }
+
+ // Checks that ordinary characters can be written by hexadecimal code, both
+ // in the braced form (\x{41}) and in the two-digit form (\x26).
+ Y_UNIT_TEST(SymbolsByCodes)
+ {
+ // 0x41 is 'A'; the expectations treat the pattern as a substring search
+ // ("tAst string" is accepted).
+ REGEXP("\\x{41}") {
+ ACCEPTS("A");
+ ACCEPTS("tAst string");
+ DENIES("test string");
+ }
+
+ // 0x26 is '&'. The following "abc" are themselves hex digits, so this case
+ // also pins that the unbraced escape takes exactly two digits and the rest
+ // is literal text.
+ REGEXP("\\x26abc") {
+ ACCEPTS("&abc;");
+ DENIES("test &ab");
+ DENIES("without");
+ }
+ }
+
+ // Enumerates malformed and well-formed \x escapes and asserts what HasError
+ // returns for each. HasError is defined outside this file; presumably it
+ // reports whether compiling the pattern fails -- confirm against common.h.
+ Y_UNIT_TEST(ErrorsWhileCompiling)
+ {
+ // Unbraced form that does not consist of exactly two hex digits: no
+ // digits, one digit, a non-hex character, or a space in either position.
+ UNIT_ASSERT(HasError("\\x"));
+ UNIT_ASSERT(HasError("\\x0"));
+ UNIT_ASSERT(HasError("\\xfu"));
+ UNIT_ASSERT(HasError("\\xs1"));
+ UNIT_ASSERT(HasError("\\x 0"));
+ UNIT_ASSERT(HasError("\\x0 "));
+
+ // Unbalanced, missing or repeated braces.
+ UNIT_ASSERT(HasError("\\x{2A1"));
+ UNIT_ASSERT(HasError("\\x{"));
+ UNIT_ASSERT(HasError("\\x}"));
+ UNIT_ASSERT(HasError("\\x2}"));
+ UNIT_ASSERT(HasError("\\x{{3}"));
+ UNIT_ASSERT(HasError("\\x{2a{5}"));
+
+ // Balanced braces with invalid content: empty, signed, containing spaces
+ // or non-hex letters, or a value above 0x10FFFF.
+ UNIT_ASSERT(HasError("\\x{}"));
+ UNIT_ASSERT(HasError("\\x{+3}"));
+ UNIT_ASSERT(HasError("\\x{-3}"));
+ UNIT_ASSERT(HasError("\\x{ 2F}"));
+ UNIT_ASSERT(HasError("\\x{2A F}"));
+ UNIT_ASSERT(HasError("\\x{2Arft}"));
+ UNIT_ASSERT(HasError("\\x{110000}"));
+
+ // Well-formed escapes: mixed-case hex digits, the two-digit zero, and the
+ // largest accepted value 0x10FFFF.
+ UNIT_ASSERT(!HasError("\\x{fB1}"));
+ UNIT_ASSERT(!HasError("\\x00"));
+ UNIT_ASSERT(!HasError("\\x{10FFFF}"));
+ }
+
+ // Checks a character class that contains a single character given by code.
+ // NOTE(review): this test uses SCANNER with a pattern string while every
+ // other test in this suite uses REGEXP -- confirm the two are
+ // interchangeable here or switch to REGEXP for consistency.
+ Y_UNIT_TEST(OneCharacterRange)
+ {
+ // Braced form: 0x61 is 'a'.
+ SCANNER("[\\x{61}]") {
+ ACCEPTS("a");
+ ACCEPTS("bac");
+ DENIES("test");
+ }
+
+ // Two-digit form: 0x3f is '?'.
+ SCANNER("[\\x3f]") {
+ ACCEPTS("?");
+ ACCEPTS("test?");
+ DENIES("test");
+ }
+ }
+
+ // Checks character classes built only from \x escapes: lists of single
+ // characters, ranges between two escapes, and combinations of both.
+ Y_UNIT_TEST(CharacterRange) {
+ // A list of four characters mixing both escape forms:
+ // 'a' (0x61), 'b' (0x62), '?' (0x3f) and '&' (0x26).
+ REGEXP("[\\x{61}\\x62\\x{3f}\\x26]") {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("?");
+ ACCEPTS("acd");
+ ACCEPTS("bcd");
+ ACCEPTS("cd?");
+ ACCEPTS("ab?");
+ DENIES("cd");
+ }
+
+ // A range with both bounds escaped: 'a'..'c'.
+ REGEXP("[\\x{61}-\\x{63}]") {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("qwertya");
+ DENIES("d");
+ }
+
+ // A degenerate range whose lower and upper bounds are the same character.
+ REGEXP("[\\x61-\\x61]") {
+ ACCEPTS("a");
+ ACCEPTS("qwertya");
+ DENIES("b");
+ }
+
+ // A single character, then a range, then another single character:
+ // '&', 'a'..'b', '?'.
+ REGEXP("[\\x26\\x{61}-\\x{62}\\x{3f}]") {
+ ACCEPTS("&");
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("?");
+ ACCEPTS("ade");
+ ACCEPTS("ab?");
+ DENIES("d");
+ }
+
+ // Two ranges inside one class: 'A'..'B' and 'a'..'b'.
+ REGEXP("[\\x{41}-\\x{42}\\x{61}-\\x{62}]") {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("A");
+ ACCEPTS("B");
+ DENIES("c");
+ DENIES("C");
+ }
+
+ // Two consecutive classes: one of 'A'/'B' immediately followed by one of
+ // 'a'/'b'; a single character or the wrong order is rejected.
+ REGEXP("[\\x{41}-\\x{42}][\\x{61}-\\x{62}]") {
+ ACCEPTS("Aa");
+ ACCEPTS("Ab");
+ ACCEPTS("Ba");
+ ACCEPTS("Bb");
+ DENIES("a");
+ DENIES("b");
+ DENIES("A");
+ DENIES("B");
+ DENIES("ab");
+ DENIES("AB");
+ DENIES("Ca");
+ }
+ }
+
+ // Checks negated character classes ([^...]) whose members are \x escapes.
+ Y_UNIT_TEST(RangeExcludeCharacters) {
+ // Anything except 'a' (0x61): a subject matches as soon as it contains one
+ // other character ("aba"), and is rejected when it consists of 'a' only.
+ REGEXP("[^\\x{61}]") {
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("aba");
+ DENIES("a");
+ DENIES("aaa");
+ }
+
+ // Anything outside 'a'..'z' (0x61..0x7a).
+ REGEXP("[^\\x{61}-\\x{7a}]") {
+ ACCEPTS("A");
+ ACCEPTS("123");
+ ACCEPTS("acb1");
+ DENIES("a");
+ DENIES("abcxyz");
+ }
+ }
+
+ // Checks character classes that combine \x escapes with literal characters,
+ // including ranges where only one of the two bounds is escaped.
+ Y_UNIT_TEST(MixedRange) {
+ // Escaped 'a' (0x61) next to a literal 'B'.
+ REGEXP("[\\x{61}B]") {
+ ACCEPTS("a");
+ ACCEPTS("B");
+ ACCEPTS("atestB");
+ DENIES("test");
+ }
+
+ // Negated class excluding escaped 'a' and literal 'A'.
+ REGEXP("[^\\x{61}A]") {
+ ACCEPTS("b");
+ ACCEPTS("B");
+ ACCEPTS("atestB");
+ DENIES("a");
+ DENIES("A");
+ DENIES("aaAA");
+ }
+
+ // A digit followed by one of 'a', 'b', 'A', 'B'; the second class mixes an
+ // escaped range with a literal range.
+ REGEXP("[0-9][\\x{61}-\\x{62}A-B]") {
+ ACCEPTS("0a");
+ ACCEPTS("1A");
+ ACCEPTS("5b");
+ ACCEPTS("9B");
+ ACCEPTS("1atestB");
+ ACCEPTS("2Atest");
+ DENIES("aB");
+ DENIES("testb");
+ DENIES("test");
+ }
+
+ // Range with an escaped lower bound and a literal upper bound: 'a'..'c'.
+ REGEXP("[\\x{61}-c]") {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("c");
+ ACCEPTS("testb");
+ DENIES("d");
+ }
+
+ // Negated range with a literal lower bound and an escaped upper bound:
+ // anything outside 'a'..'z' (0x7a).
+ REGEXP("[^a-\\x{7a}]") {
+ ACCEPTS("A");
+ ACCEPTS("123");
+ ACCEPTS("acb1");
+ DENIES("a");
+ DENIES("abcxyz");
+ }
+
+ // Two half-escaped ranges in one class: 'A' (0x41)..'B' and 'a'..'b' (0x62).
+ REGEXP("[\\x{41}-Ba-\\x{62}]") {
+ ACCEPTS("a");
+ ACCEPTS("b");
+ ACCEPTS("A");
+ ACCEPTS("B");
+ DENIES("c");
+ DENIES("C");
+ }
+ }
+
+ // Asserts what HasError returns for character classes that contain \x
+ // escapes. HasError is defined outside this file; presumably it reports
+ // whether compiling the pattern fails -- confirm against common.h.
+ Y_UNIT_TEST(CompilingRange)
+ {
+ // Expected errors: unterminated class, a non-hex digit in the escape, and
+ // a '-' left dangling after an escape.
+ UNIT_ASSERT(HasError("[\\x41"));
+ UNIT_ASSERT(HasError("[\\xfq]"));
+ UNIT_ASSERT(HasError("[\\x{01}-]"));
+
+ // Expected to compile: the largest code 0x10FFFF, code zero, and a range
+ // between two multi-digit codes.
+ UNIT_ASSERT(!HasError("[\\x{10FFFF}]"));
+ UNIT_ASSERT(!HasError("[\\x{00}]"));
+ UNIT_ASSERT(!HasError("[\\x{abc}-\\x{FFF}]"));
+
+ // Expected to compile: negated classes, and a '-' placed before an escape.
+ UNIT_ASSERT(!HasError("[^\\xFF]"));
+ UNIT_ASSERT(!HasError("[^\\x{FF}-\\x{FF0}]"));
+ UNIT_ASSERT(!HasError("[-\\x{01}]"));
+ }
+
+ // Checks that repetition quantifiers ({n,m}, {n,} and {n}) apply to a
+ // character written as a \x{..} escape and to a character class built from
+ // such escapes. 0x78 is 'x' and 0x79 is 'y'; every pattern is anchored with
+ // ^ and $ so the whole subject has to consist of the repeated character.
+ Y_UNIT_TEST(UnicodeRepetition)
+ {
+ // Bounded repetition: between three and six 'x'.
+ REGEXP("^\\x{78}{3,6}$") {
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxx");
+ ACCEPTS("xxxxxx");
+ DENIES ("xxxxxxx");
+ }
+
+ // Open-ended repetition: at least three 'x'. The character is written as
+ // an escape, like in the sibling cases, so that the {n,} quantifier is
+ // exercised right after \x{..} rather than after a plain literal.
+ REGEXP("^\\x{78}{3,}$") {
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ ACCEPTS("xxxx");
+ ACCEPTS("xxxxxxxxxxx");
+ ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+
+ // Exact repetition: exactly three 'x'.
+ REGEXP("^\\x{78}{3}$") {
+ DENIES ("x");
+ DENIES ("xx");
+ ACCEPTS("xxx");
+ DENIES ("xxxx");
+ DENIES ("xxxxx");
+ DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+
+ // Exact repetition of a grouped class: exactly two characters, each being
+ // 'x' or 'y'.
+ REGEXP("^([\\x{78}-\\x{79}]){2}$") {
+ DENIES("x");
+ DENIES("y");
+ ACCEPTS("xx");
+ ACCEPTS("xy");
+ ACCEPTS("yx");
+ ACCEPTS("yy");
+ DENIES("xxy");
+ DENIES("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+ }
+ }
+
+ // Checks a class made of two escaped ranges that together span codes 0
+ // through 0x10ffff with a single gap at 0x78 ('x').
+ Y_UNIT_TEST(AnyUnicodeCodepointIsAllowed)
+ {
+ // 'w' (0x77) and 'y' (0x79) are the last and first members around the gap;
+ // 'x' itself is the only character left out.
+ REGEXP("[\\x{0}-\\x{77}\\x{79}-\\x{10ffff}]") {
+ ACCEPTS("w");
+ DENIES ("x");
+ ACCEPTS("y");
+ }
+ }
+
+}