diff options
author | atarasov5 <[email protected]> | 2025-08-14 10:29:14 +0300 |
---|---|---|
committer | atarasov5 <[email protected]> | 2025-08-14 10:44:46 +0300 |
commit | f132db24bdf6dc23b539c87283d4c32d0bc4154b (patch) | |
tree | fd3542827091fb20f21fc3fa22bf7e47f757538e | |
parent | c29c15d2a51aabf8c2c33010027c7726c2debfa2 (diff) |
YQL-18878: Introduce Re2::IsValidRegexp
commit_hash:44a39f94ad6f9407cb9e493cbf88ef28b320586e
7 files changed, 218 insertions, 19 deletions
diff --git a/yql/essentials/docs/en/udf/list/re2.md b/yql/essentials/docs/en/udf/list/re2.md index f83dd976c13..2b66322bfd7 100644 --- a/yql/essentials/docs/en/udf/list/re2.md +++ b/yql/essentials/docs/en/udf/list/re2.md @@ -9,6 +9,7 @@ Re2::Capture(pattern:String, options:Struct<...>?) -> (string:String?) -> Struct Re2::FindAndConsume(pattern:String, options:Struct<...>?) -> (string:String?) -> List<String> Re2::Replace(pattern:String, options:Struct<...>?) -> (string:String?, replacement:String) -> String? Re2::Count(pattern:String, options:Struct<...>?) -> (string:String?) -> Uint32 +Re2::IsValidRegexp(pattern:String?, options:Struct<...>?) -> Bool Re2::Options([CaseSensitive:Bool?,DotNl:Bool?,Literal:Bool?,LogErrors:Bool?,LongestMatch:Bool?,MaxMem:Uint64?,NeverCapture:Bool?,NeverNl:Bool?,OneLine:Bool?,PerlClasses:Bool?,PosixSyntax:Bool?,Utf8:Bool?,WordBoundary:Bool?]) -> Struct<CaseSensitive:Bool,DotNl:Bool,Literal:Bool,LogErrors:Bool,LongestMatch:Bool,MaxMem:Uint64,NeverCapture:Bool,NeverNl:Bool,OneLine:Bool,PerlClasses:Bool,PosixSyntax:Bool,Utf8:Bool,WordBoundary:Bool> ``` @@ -86,6 +87,10 @@ Works as follows: Returns the number of non-overlapping substrings of the input string that have matched the regular expression. +## Re2::IsValidRegexp {#isvalidregexp} + +Checks if the passed string is a valid regular expression pattern according to Re2 syntax. The optional `options` parameter allows you to validate the pattern using the same parsing settings that would be used by other Re2 functions, ensuring consistency in validation. + ## Re2::Options {#options} Notes on Re2::Options from the official [repository](https://github.com/google/re2/blob/main/re2/re2.h#L595-L617) diff --git a/yql/essentials/docs/ru/udf/list/re2.md b/yql/essentials/docs/ru/udf/list/re2.md index 3c0906f30e1..c910ab625a6 100644 --- a/yql/essentials/docs/ru/udf/list/re2.md +++ b/yql/essentials/docs/ru/udf/list/re2.md @@ -9,6 +9,7 @@ Re2::Capture(pattern:String, options:Struct<...>?) -> (string:String?) -> Struct Re2::FindAndConsume(pattern:String, options:Struct<...>?) -> (string:String?) -> List<String> Re2::Replace(pattern:String, options:Struct<...>?) -> (string:String?, replacement:String) -> String? Re2::Count(pattern:String, options:Struct<...>?) -> (string:String?) -> Uint32 +Re2::IsValidRegexp(pattern:String?, options:Struct<...>?) -> Bool Re2::Options([CaseSensitive:Bool?,DotNl:Bool?,Literal:Bool?,LogErrors:Bool?,LongestMatch:Bool?,MaxMem:Uint64?,NeverCapture:Bool?,NeverNl:Bool?,OneLine:Bool?,PerlClasses:Bool?,PosixSyntax:Bool?,Utf8:Bool?,WordBoundary:Bool?]) -> Struct<CaseSensitive:Bool,DotNl:Bool,Literal:Bool,LogErrors:Bool,LongestMatch:Bool,MaxMem:Uint64,NeverCapture:Bool,NeverNl:Bool,OneLine:Bool,PerlClasses:Bool,PosixSyntax:Bool,Utf8:Bool,WordBoundary:Bool> ``` @@ -86,6 +87,10 @@ SELECT Возвращает количество совпавших с регулярным выражением непересекающихся подстрок во входной строке. +## Re2::IsValidRegexp {#isvalidregexp} + +Проверяет, является ли переданная строка корректным шаблоном регулярного выражения согласно синтаксису Re2. Опциональный параметр `options` позволяет валидировать шаблон с использованием тех же настроек парсинга, которые будут использоваться другими функциями Re2, обеспечивая согласованность валидации. + ## Re2::Options {#options} Пояснения к параметрам Re2::Options из официального [репозитория](https://github.com/google/re2/blob/main/re2/re2.h#L595-L617) diff --git a/yql/essentials/sql/v1/builtin.cpp b/yql/essentials/sql/v1/builtin.cpp index 995f31d7328..d2a368be60f 100644 --- a/yql/essentials/sql/v1/builtin.cpp +++ b/yql/essentials/sql/v1/builtin.cpp @@ -3476,7 +3476,7 @@ TNodePtr BuildBuiltinFunc(TContext& ctx, TPosition pos, TString name, const TVec }; auto fullName = moduleName + "." + name; return new TYqlTypeConfigUdf(pos, fullName, multiArgs, multiArgs.size() + 1); - } else if (!(ns.StartsWith("re2") && lowerName == "options")) { + } else if (!(ns.StartsWith("re2") && (lowerName == "options" || lowerName == "isvalidregexp"))) { auto newArgs = args; if (ns.StartsWith("re2")) { // convert run config is tuple of string and optional options diff --git a/yql/essentials/udfs/common/re2/re2_udf.cpp b/yql/essentials/udfs/common/re2/re2_udf.cpp index 8e5079e6e42..2f1f6dbb529 100644 --- a/yql/essentials/udfs/common/re2/re2_udf.cpp +++ b/yql/essentials/udfs/common/re2/re2_udf.cpp @@ -82,6 +82,24 @@ namespace { ui32 Indices[EOptionsField::Count]; }; + RE2::Options ExtractOptions(std::string_view pattern, TUnboxedValuePod optionsValue, const TOptionsSchema& schema, bool posix) { + RE2::Options options = CreateDefaultOptions(); + + options.set_posix_syntax(posix); + bool needUtf8 = (UTF8Detect(pattern) == UTF8); + options.set_encoding( + needUtf8 + ? RE2::Options::Encoding::EncodingUTF8 + : RE2::Options::Encoding::EncodingLatin1); + if (optionsValue) { +#define FIELD_HANDLE(name, index, type, defVal, setter, conv) options.setter(conv(optionsValue.GetElement(schema.Indices[index]).Get<type>())); + OPTIONS_MAP(FIELD_HANDLE) +#undef FIELD_HANDLE + options.set_log_errors(false); + } + return options; + } + struct TRegexpGroups { TVector<TString> Names; TVector<ui32> Indexes; @@ -182,22 +200,8 @@ namespace { auto patternValue = runConfig.GetElement(0); auto optionsValue = runConfig.GetElement(1); const std::string_view pattern(patternValue.AsStringRef()); - RE2::Options options = CreateDefaultOptions(); - - options.set_posix_syntax(posix); - bool needUtf8 = (UTF8Detect(pattern) == UTF8); - options.set_encoding( - needUtf8 - ? RE2::Options::Encoding::EncodingUTF8 - : RE2::Options::Encoding::EncodingLatin1 - ); - if (optionsValue) { -#define FIELD_HANDLE(name, index, type, defVal, setter, conv) options.setter(conv(optionsValue.GetElement(OptionsSchema_.Indices[index]).Get<type>())); - OPTIONS_MAP(FIELD_HANDLE) -#undef FIELD_HANDLE - options.set_log_errors(false); - } + RE2::Options options = ExtractOptions(pattern, optionsValue, OptionsSchema_, posix); Regexp_ = std::make_unique<RE2>(StringPiece(pattern.data(), pattern.size()), options); if (!Regexp_->ok() && ShouldFailOnInvalidRegexp(pattern, CurrentLangVersion_)) { @@ -385,6 +389,61 @@ namespace { } }; + template <bool posix> + class TIsValidRegexp: public TBoxedValue { + public: + TIsValidRegexp(const TOptionsSchema optionsSchema) + : OptionsSchema_(std::move(optionsSchema)) + { + } + + TUnboxedValue Run( + const IValueBuilder* valueBuilder, + const TUnboxedValuePod* args) const override { + Y_UNUSED(valueBuilder); + if (!args[0]) { + return TUnboxedValuePod(false); + } + RE2::Options options = ExtractOptions(args[0].AsStringRef(), args[1], OptionsSchema_, posix); + RE2 regexp(args[0].AsStringRef(), options); + return TUnboxedValuePod(regexp.ok()); + } + + static const ::NKikimr::NUdf::TStringRef& Name() { + static auto name = ::NKikimr::NUdf::TStringRef::Of("IsValidRegexp"); + return name; + } + + static bool DeclareSignature( + const ::NKikimr::NUdf::TStringRef& name, + ::NKikimr::NUdf::TType* userType, + ::NKikimr::NUdf::IFunctionTypeInfoBuilder& builder, + bool typesOnly) { + Y_UNUSED(userType); + if (Name() == name) { + TOptionsSchema optionsSchema = MakeOptionsSchema(builder); + auto optOptionsStructType = builder.Optional()->Item(optionsSchema.StructType).Build(); + builder.Args() + ->Add(builder.Optional()->Item(builder.SimpleType<char*>())) + .Add(optOptionsStructType) + .Done() + .Returns(builder.SimpleType<bool>()); + + builder.OptionalArgs(1); + if (!typesOnly) { + builder.Implementation(new TIsValidRegexp(std::move(optionsSchema))); + } + builder.IsStrict(); + return true; + } else { + return false; + } + } + + private: + const TOptionsSchema OptionsSchema_; + }; + SIMPLE_UDF_WITH_OPTIONAL_ARGS(TPatternFromLike, char*(char*, TOptional<char*>), 1) { const std::string_view input(args[0].AsStringRef()); const bool hasEscape = bool(args[1]); @@ -472,6 +531,7 @@ namespace { sink.Add(TEscape::Name()); sink.Add(TPatternFromLike::Name()); sink.Add(TOptions::Name()); + sink.Add(TIsValidRegexp<posix>::Name()); } void BuildFunctionTypeInfo( @@ -567,9 +627,10 @@ namespace { builder.Implementation(new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::FIND_AND_CONSUME, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer())); } } else if (!( - TEscape::DeclareSignature(name, userType, builder, typesOnly) || - TPatternFromLike::DeclareSignature(name, userType, builder, typesOnly) || - TOptions::DeclareSignature(name, userType, builder, typesOnly))) { + TEscape::DeclareSignature(name, userType, builder, typesOnly) || + TPatternFromLike::DeclareSignature(name, userType, builder, typesOnly) || + TOptions::DeclareSignature(name, userType, builder, typesOnly) || + TIsValidRegexp<posix>::DeclareSignature(name, userType, builder, typesOnly))) { builder.SetError( TStringBuilder() << "Unknown function name: " << TString(name)); } diff --git a/yql/essentials/udfs/common/re2/test/canondata/result.json b/yql/essentials/udfs/common/re2/test/canondata/result.json index 75db00ebaf8..d0f4a21acb0 100644 --- a/yql/essentials/udfs/common/re2/test/canondata/result.json +++ b/yql/essentials/udfs/common/re2/test/canondata/result.json @@ -44,6 +44,11 @@ "uri": "file://test.test_InvalidRegexSuccess_2025.02_/results.txt" } ], + "test.test[IsValidRegexp]": [ + { + "uri": "file://test.test_IsValidRegexp_/results.txt" + } + ], "test.test[LikeEscape]": [ { "uri": "file://test.test_LikeEscape_/results.txt" diff --git a/yql/essentials/udfs/common/re2/test/canondata/test.test_IsValidRegexp_/results.txt b/yql/essentials/udfs/common/re2/test/canondata/test.test_IsValidRegexp_/results.txt new file mode 100644 index 00000000000..92bfcd609a2 --- /dev/null +++ b/yql/essentials/udfs/common/re2/test/canondata/test.test_IsValidRegexp_/results.txt @@ -0,0 +1,108 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "invalid_star_at_start"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "invalid_unclosed_bracket"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "valid_dot_metachar"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "valid_literal_string"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "valid_byte_default_encoding"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "invalid_byte_utf8_encoding"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "valid_byte_latin1_encoding"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "posix_invalid_non_capturing_group"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "re2_default_valid_non_capturing_group"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "null_string"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "null_options"; + [ + "DataType"; + "Bool" + ] + ] + ] + ] + ]; + "Data" = [ + [ + %false; + %false; + %true; + %true; + %true; + %false; + %true; + %false; + %true; + %false; + %true + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/re2/test/cases/IsValidRegexp.sql b/yql/essentials/udfs/common/re2/test/cases/IsValidRegexp.sql new file mode 100644 index 00000000000..876f13ccf73 --- /dev/null +++ b/yql/essentials/udfs/common/re2/test/cases/IsValidRegexp.sql @@ -0,0 +1,15 @@ +/* syntax version 1 */ +SELECT + Re2::IsValidRegexp("*") AS invalid_star_at_start, + Re2::IsValidRegexp("[") AS invalid_unclosed_bracket, + Re2::IsValidRegexp(".") AS valid_dot_metachar, + Re2::IsValidRegexp("abc") AS valid_literal_string, + Re2::IsValidRegexp("\xff") AS valid_byte_default_encoding, + Re2::IsValidRegexp("\xff", Re2::Options(true as Utf8)) AS invalid_byte_utf8_encoding, + Re2::IsValidRegexp("\xff", Re2::Options(false as Utf8)) AS valid_byte_latin1_encoding, + + Re2posix::IsValidRegexp("(?:abc)") AS posix_invalid_non_capturing_group, + Re2::IsValidRegexp("(?:abc)") AS re2_default_valid_non_capturing_group, + + Re2::IsValidRegexp(null) as null_string, + Re2::IsValidRegexp(".", null) AS null_options; |