summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoratarasov5 <[email protected]>2025-08-14 10:29:14 +0300
committeratarasov5 <[email protected]>2025-08-14 10:44:46 +0300
commitf132db24bdf6dc23b539c87283d4c32d0bc4154b (patch)
treefd3542827091fb20f21fc3fa22bf7e47f757538e
parentc29c15d2a51aabf8c2c33010027c7726c2debfa2 (diff)
YQL-18878: Introduce Re2::IsValidRegexp
commit_hash:44a39f94ad6f9407cb9e493cbf88ef28b320586e
-rw-r--r--yql/essentials/docs/en/udf/list/re2.md5
-rw-r--r--yql/essentials/docs/ru/udf/list/re2.md5
-rw-r--r--yql/essentials/sql/v1/builtin.cpp2
-rw-r--r--yql/essentials/udfs/common/re2/re2_udf.cpp97
-rw-r--r--yql/essentials/udfs/common/re2/test/canondata/result.json5
-rw-r--r--yql/essentials/udfs/common/re2/test/canondata/test.test_IsValidRegexp_/results.txt108
-rw-r--r--yql/essentials/udfs/common/re2/test/cases/IsValidRegexp.sql15
7 files changed, 218 insertions, 19 deletions
diff --git a/yql/essentials/docs/en/udf/list/re2.md b/yql/essentials/docs/en/udf/list/re2.md
index f83dd976c13..2b66322bfd7 100644
--- a/yql/essentials/docs/en/udf/list/re2.md
+++ b/yql/essentials/docs/en/udf/list/re2.md
@@ -9,6 +9,7 @@ Re2::Capture(pattern:String, options:Struct<...>?) -> (string:String?) -> Struct
Re2::FindAndConsume(pattern:String, options:Struct<...>?) -> (string:String?) -> List<String>
Re2::Replace(pattern:String, options:Struct<...>?) -> (string:String?, replacement:String) -> String?
Re2::Count(pattern:String, options:Struct<...>?) -> (string:String?) -> Uint32
+Re2::IsValidRegexp(pattern:String?, options:Struct<...>?) -> Bool
Re2::Options([CaseSensitive:Bool?,DotNl:Bool?,Literal:Bool?,LogErrors:Bool?,LongestMatch:Bool?,MaxMem:Uint64?,NeverCapture:Bool?,NeverNl:Bool?,OneLine:Bool?,PerlClasses:Bool?,PosixSyntax:Bool?,Utf8:Bool?,WordBoundary:Bool?]) -> Struct<CaseSensitive:Bool,DotNl:Bool,Literal:Bool,LogErrors:Bool,LongestMatch:Bool,MaxMem:Uint64,NeverCapture:Bool,NeverNl:Bool,OneLine:Bool,PerlClasses:Bool,PosixSyntax:Bool,Utf8:Bool,WordBoundary:Bool>
```
@@ -86,6 +87,10 @@ Works as follows:
Returns the number of non-overlapping substrings of the input string that have matched the regular expression.
+## Re2::IsValidRegexp {#isvalidregexp}
+
+Checks whether the passed string is a valid regular expression pattern according to Re2 syntax. Returns `false` if the pattern is `NULL`. The optional `options` parameter lets you validate the pattern with the same parsing settings that other Re2 functions would use, keeping validation consistent with them.
+
## Re2::Options {#options}
Notes on Re2::Options from the official [repository](https://github.com/google/re2/blob/main/re2/re2.h#L595-L617)
diff --git a/yql/essentials/docs/ru/udf/list/re2.md b/yql/essentials/docs/ru/udf/list/re2.md
index 3c0906f30e1..c910ab625a6 100644
--- a/yql/essentials/docs/ru/udf/list/re2.md
+++ b/yql/essentials/docs/ru/udf/list/re2.md
@@ -9,6 +9,7 @@ Re2::Capture(pattern:String, options:Struct<...>?) -> (string:String?) -> Struct
Re2::FindAndConsume(pattern:String, options:Struct<...>?) -> (string:String?) -> List<String>
Re2::Replace(pattern:String, options:Struct<...>?) -> (string:String?, replacement:String) -> String?
Re2::Count(pattern:String, options:Struct<...>?) -> (string:String?) -> Uint32
+Re2::IsValidRegexp(pattern:String?, options:Struct<...>?) -> Bool
Re2::Options([CaseSensitive:Bool?,DotNl:Bool?,Literal:Bool?,LogErrors:Bool?,LongestMatch:Bool?,MaxMem:Uint64?,NeverCapture:Bool?,NeverNl:Bool?,OneLine:Bool?,PerlClasses:Bool?,PosixSyntax:Bool?,Utf8:Bool?,WordBoundary:Bool?]) -> Struct<CaseSensitive:Bool,DotNl:Bool,Literal:Bool,LogErrors:Bool,LongestMatch:Bool,MaxMem:Uint64,NeverCapture:Bool,NeverNl:Bool,OneLine:Bool,PerlClasses:Bool,PosixSyntax:Bool,Utf8:Bool,WordBoundary:Bool>
```
@@ -86,6 +87,10 @@ SELECT
Возвращает количество совпавших с регулярным выражением непересекающихся подстрок во входной строке.
+## Re2::IsValidRegexp {#isvalidregexp}
+
+Проверяет, является ли переданная строка корректным шаблоном регулярного выражения согласно синтаксису Re2. Если шаблон равен `NULL`, возвращает `false`. Опциональный параметр `options` позволяет валидировать шаблон с использованием тех же настроек парсинга, которые будут использоваться другими функциями Re2, обеспечивая согласованность валидации.
+
## Re2::Options {#options}
Пояснения к параметрам Re2::Options из официального [репозитория](https://github.com/google/re2/blob/main/re2/re2.h#L595-L617)
diff --git a/yql/essentials/sql/v1/builtin.cpp b/yql/essentials/sql/v1/builtin.cpp
index 995f31d7328..d2a368be60f 100644
--- a/yql/essentials/sql/v1/builtin.cpp
+++ b/yql/essentials/sql/v1/builtin.cpp
@@ -3476,7 +3476,7 @@ TNodePtr BuildBuiltinFunc(TContext& ctx, TPosition pos, TString name, const TVec
};
auto fullName = moduleName + "." + name;
return new TYqlTypeConfigUdf(pos, fullName, multiArgs, multiArgs.size() + 1);
- } else if (!(ns.StartsWith("re2") && lowerName == "options")) {
+ } else if (!(ns.StartsWith("re2") && (lowerName == "options" || lowerName == "isvalidregexp"))) {
auto newArgs = args;
if (ns.StartsWith("re2")) {
// convert run config is tuple of string and optional options
diff --git a/yql/essentials/udfs/common/re2/re2_udf.cpp b/yql/essentials/udfs/common/re2/re2_udf.cpp
index 8e5079e6e42..2f1f6dbb529 100644
--- a/yql/essentials/udfs/common/re2/re2_udf.cpp
+++ b/yql/essentials/udfs/common/re2/re2_udf.cpp
@@ -82,6 +82,24 @@ namespace {
ui32 Indices[EOptionsField::Count];
};
+    // Builds the RE2::Options for `pattern`: defaults from CreateDefaultOptions(), then the `posix` flag and
+    // UTF-8 vs Latin-1 encoding chosen by UTF8Detect(pattern). A non-empty `optionsValue` then drives every
+    // OPTIONS_MAP setter (element indices are taken from `schema`), after which log_errors is switched off.
+    RE2::Options ExtractOptions(std::string_view pattern, TUnboxedValuePod optionsValue, const TOptionsSchema& schema, bool posix) {
+        RE2::Options options = CreateDefaultOptions();
+        options.set_posix_syntax(posix);
+        const bool isUtf8 = UTF8Detect(pattern) == UTF8;
+        options.set_encoding(isUtf8 ? RE2::Options::Encoding::EncodingUTF8 : RE2::Options::Encoding::EncodingLatin1);
+        if (!optionsValue) {
+            return options;
+        }
+#define FIELD_HANDLE(name, index, type, defVal, setter, conv) options.setter(conv(optionsValue.GetElement(schema.Indices[index]).Get<type>()));
+        OPTIONS_MAP(FIELD_HANDLE)
+#undef FIELD_HANDLE
+        options.set_log_errors(false);
+        return options;
+    }
+
struct TRegexpGroups {
TVector<TString> Names;
TVector<ui32> Indexes;
@@ -182,22 +200,8 @@ namespace {
auto patternValue = runConfig.GetElement(0);
auto optionsValue = runConfig.GetElement(1);
const std::string_view pattern(patternValue.AsStringRef());
- RE2::Options options = CreateDefaultOptions();
-
- options.set_posix_syntax(posix);
- bool needUtf8 = (UTF8Detect(pattern) == UTF8);
- options.set_encoding(
- needUtf8
- ? RE2::Options::Encoding::EncodingUTF8
- : RE2::Options::Encoding::EncodingLatin1
- );
- if (optionsValue) {
-#define FIELD_HANDLE(name, index, type, defVal, setter, conv) options.setter(conv(optionsValue.GetElement(OptionsSchema_.Indices[index]).Get<type>()));
- OPTIONS_MAP(FIELD_HANDLE)
-#undef FIELD_HANDLE
- options.set_log_errors(false);
- }
+ RE2::Options options = ExtractOptions(pattern, optionsValue, OptionsSchema_, posix);
Regexp_ = std::make_unique<RE2>(StringPiece(pattern.data(), pattern.size()), options);
if (!Regexp_->ok() && ShouldFailOnInvalidRegexp(pattern, CurrentLangVersion_)) {
@@ -385,6 +389,61 @@ namespace {
}
};
+    // Boxed UDF body for `IsValidRegexp`; `posix` is forwarded to ExtractOptions as the POSIX-syntax flag.
+    template <bool posix>
+    class TIsValidRegexp: public TBoxedValue {
+    public:
+        // Non-const by-value parameter: with a `const` parameter the std::move below would degrade to a copy.
+        explicit TIsValidRegexp(TOptionsSchema optionsSchema)
+            : OptionsSchema_(std::move(optionsSchema))
+        {
+        }
+
+        // args[0]: optional pattern, args[1]: optional options struct. Empty pattern -> false, else RE2::ok().
+        TUnboxedValue Run(const IValueBuilder* valueBuilder, const TUnboxedValuePod* args) const override {
+            Y_UNUSED(valueBuilder);
+            if (!args[0]) {
+                return TUnboxedValuePod(false);
+            }
+            const std::string_view pattern(args[0].AsStringRef());
+            const RE2::Options options = ExtractOptions(pattern, args[1], OptionsSchema_, posix);
+            RE2 regexp(StringPiece(pattern.data(), pattern.size()), options);
+            return TUnboxedValuePod(regexp.ok());
+        }
+
+        static const ::NKikimr::NUdf::TStringRef& Name() {
+            static const auto name = ::NKikimr::NUdf::TStringRef::Of("IsValidRegexp");
+            return name;
+        }
+
+        // Returns false, leaving `builder` untouched, when `name` is not this function. Otherwise declares
+        // (optional string, optional options struct) -> bool with one optional argument and returns true.
+        static bool DeclareSignature(
+            const ::NKikimr::NUdf::TStringRef& name, ::NKikimr::NUdf::TType* userType,
+            ::NKikimr::NUdf::IFunctionTypeInfoBuilder& builder, bool typesOnly) {
+            Y_UNUSED(userType);
+            if (!(Name() == name)) {
+                return false;
+            }
+            TOptionsSchema optionsSchema = MakeOptionsSchema(builder);
+            auto optOptionsStructType = builder.Optional()->Item(optionsSchema.StructType).Build();
+            builder.Args()
+                ->Add(builder.Optional()->Item(builder.SimpleType<char*>()))
+                .Add(optOptionsStructType)
+                .Done()
+                .Returns(builder.SimpleType<bool>());
+            builder.OptionalArgs(1);
+            if (!typesOnly) {
+                builder.Implementation(new TIsValidRegexp(std::move(optionsSchema)));
+            }
+            builder.IsStrict();
+            return true;
+        }
+
+    private:
+        const TOptionsSchema OptionsSchema_;
+    };
+
SIMPLE_UDF_WITH_OPTIONAL_ARGS(TPatternFromLike, char*(char*, TOptional<char*>), 1) {
const std::string_view input(args[0].AsStringRef());
const bool hasEscape = bool(args[1]);
@@ -472,6 +531,7 @@ namespace {
sink.Add(TEscape::Name());
sink.Add(TPatternFromLike::Name());
sink.Add(TOptions::Name());
+ sink.Add(TIsValidRegexp<posix>::Name());
}
void BuildFunctionTypeInfo(
@@ -567,9 +627,10 @@ namespace {
builder.Implementation(new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::FIND_AND_CONSUME, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer()));
}
} else if (!(
- TEscape::DeclareSignature(name, userType, builder, typesOnly) ||
- TPatternFromLike::DeclareSignature(name, userType, builder, typesOnly) ||
- TOptions::DeclareSignature(name, userType, builder, typesOnly))) {
+ TEscape::DeclareSignature(name, userType, builder, typesOnly) ||
+ TPatternFromLike::DeclareSignature(name, userType, builder, typesOnly) ||
+ TOptions::DeclareSignature(name, userType, builder, typesOnly) ||
+ TIsValidRegexp<posix>::DeclareSignature(name, userType, builder, typesOnly))) {
builder.SetError(
TStringBuilder() << "Unknown function name: " << TString(name));
}
diff --git a/yql/essentials/udfs/common/re2/test/canondata/result.json b/yql/essentials/udfs/common/re2/test/canondata/result.json
index 75db00ebaf8..d0f4a21acb0 100644
--- a/yql/essentials/udfs/common/re2/test/canondata/result.json
+++ b/yql/essentials/udfs/common/re2/test/canondata/result.json
@@ -44,6 +44,11 @@
"uri": "file://test.test_InvalidRegexSuccess_2025.02_/results.txt"
}
],
+ "test.test[IsValidRegexp]": [
+ {
+ "uri": "file://test.test_IsValidRegexp_/results.txt"
+ }
+ ],
"test.test[LikeEscape]": [
{
"uri": "file://test.test_LikeEscape_/results.txt"
diff --git a/yql/essentials/udfs/common/re2/test/canondata/test.test_IsValidRegexp_/results.txt b/yql/essentials/udfs/common/re2/test/canondata/test.test_IsValidRegexp_/results.txt
new file mode 100644
index 00000000000..92bfcd609a2
--- /dev/null
+++ b/yql/essentials/udfs/common/re2/test/canondata/test.test_IsValidRegexp_/results.txt
@@ -0,0 +1,108 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "invalid_star_at_start";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "invalid_unclosed_bracket";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "valid_dot_metachar";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "valid_literal_string";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "valid_byte_default_encoding";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "invalid_byte_utf8_encoding";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "valid_byte_latin1_encoding";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "posix_invalid_non_capturing_group";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "re2_default_valid_non_capturing_group";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "null_string";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "null_options";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ %false;
+ %false;
+ %true;
+ %true;
+ %true;
+ %false;
+ %true;
+ %false;
+ %true;
+ %false;
+ %true
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/re2/test/cases/IsValidRegexp.sql b/yql/essentials/udfs/common/re2/test/cases/IsValidRegexp.sql
new file mode 100644
index 00000000000..876f13ccf73
--- /dev/null
+++ b/yql/essentials/udfs/common/re2/test/cases/IsValidRegexp.sql
@@ -0,0 +1,15 @@
+/* syntax version 1 */
+SELECT
+ Re2::IsValidRegexp("*") AS invalid_star_at_start, -- canonized result: false
+ Re2::IsValidRegexp("[") AS invalid_unclosed_bracket, -- canonized result: false
+ Re2::IsValidRegexp(".") AS valid_dot_metachar, -- canonized result: true
+ Re2::IsValidRegexp("abc") AS valid_literal_string, -- canonized result: true
+ Re2::IsValidRegexp("\xff") AS valid_byte_default_encoding, -- no options passed; canonized result: true
+ Re2::IsValidRegexp("\xff", Re2::Options(true as Utf8)) AS invalid_byte_utf8_encoding, -- Utf8 forced on; canonized result: false
+ Re2::IsValidRegexp("\xff", Re2::Options(false as Utf8)) AS valid_byte_latin1_encoding, -- Utf8 forced off; canonized result: true
+
+ Re2posix::IsValidRegexp("(?:abc)") AS posix_invalid_non_capturing_group, -- Re2posix module; canonized result: false
+ Re2::IsValidRegexp("(?:abc)") AS re2_default_valid_non_capturing_group, -- same pattern via Re2 module; canonized result: true
+
+ Re2::IsValidRegexp(null) as null_string, -- NULL pattern; canonized result: false
+ Re2::IsValidRegexp(".", null) AS null_options; -- NULL options; canonized result: true