diff options
author | vvvv <[email protected]> | 2025-10-06 13:26:25 +0300 |
---|---|---|
committer | vvvv <[email protected]> | 2025-10-06 14:06:25 +0300 |
commit | eca8ce9cb1613d5c983185c4e43c20651a9638aa (patch) | |
tree | 61ee5ae779948e61af9a7691d19eaa2c09869121 /yql/essentials/udfs/common/re2/re2_udf.cpp | |
parent | 4adf7eecae16a9b228b28cc5f64c27ef69ad5ec2 (diff) |
YQL-20086 udfs
init
commit_hash:f9684778bf1ea956965f2360b80b91edb7d4ffbe
Diffstat (limited to 'yql/essentials/udfs/common/re2/re2_udf.cpp')
-rw-r--r-- | yql/essentials/udfs/common/re2/re2_udf.cpp | 1034 |
1 files changed, 518 insertions, 516 deletions
diff --git a/yql/essentials/udfs/common/re2/re2_udf.cpp b/yql/essentials/udfs/common/re2/re2_udf.cpp index 2f1f6dbb529..b13d975cf35 100644 --- a/yql/essentials/udfs/common/re2/re2_udf.cpp +++ b/yql/essentials/udfs/common/re2/re2_udf.cpp @@ -15,14 +15,14 @@ using namespace NUdf; namespace { - template <typename T> - T Id(T x) { - return x; - } +template <typename T> +T Id(T x) { + return x; +} - re2::RE2::Options::Encoding EncodingFromBool(bool x) { - return x ? re2::RE2::Options::Encoding::EncodingUTF8 : re2::RE2::Options::Encoding::EncodingLatin1; - } +re2::RE2::Options::Encoding EncodingFromBool(bool x) { + return x ? re2::RE2::Options::Encoding::EncodingUTF8 : re2::RE2::Options::Encoding::EncodingLatin1; +} #define OPTIONS_MAP(xx) \ xx(Utf8, 0, bool, true, set_encoding, EncodingFromBool) \ @@ -39,309 +39,310 @@ namespace { xx(WordBoundary, 11, bool, false, set_word_boundary, Id) \ xx(OneLine, 12, bool, false, set_one_line, Id) - ui64 GetFailProbability() { - auto envResult = TryGetEnv("YQL_RE2_REGEXP_PROBABILITY_FAIL"); - if (!envResult) { - return 0; - } - ui64 result; - bool isValid = TryIntFromString<10, ui64>(envResult->data(), envResult->size(), result); - Y_ENSURE(isValid, TStringBuilder() << "Error while parsing YQL_RE2_REGEXP_PROBABILITY_FAIL. Actual value is: " << *envResult); - return result; +ui64 GetFailProbability() { + auto envResult = TryGetEnv("YQL_RE2_REGEXP_PROBABILITY_FAIL"); + if (!envResult) { + return 0; } + ui64 result; + bool isValid = TryIntFromString<10, ui64>(envResult->data(), envResult->size(), result); + Y_ENSURE(isValid, TStringBuilder() << "Error while parsing YQL_RE2_REGEXP_PROBABILITY_FAIL. Actual value is: " << *envResult); + return result; +} - bool ShouldFailOnInvalidRegexp(const std::string_view regexp, NYql::TLangVersion currentLangVersion) { - if (currentLangVersion >= NYql::MakeLangVersion(2025, 3)) { - return true; - } - THashType hash = GetStringHash(regexp) % 100; - static ui64 failProbability = GetFailProbability(); - return hash < failProbability; +bool ShouldFailOnInvalidRegexp(const std::string_view regexp, NYql::TLangVersion currentLangVersion) { + if (currentLangVersion >= NYql::MakeLangVersion(2025, 3)) { + return true; } + THashType hash = GetStringHash(regexp) % 100; + static ui64 failProbability = GetFailProbability(); + return hash < failProbability; +} - RE2::Options CreateDefaultOptions(){ - RE2::Options options; +RE2::Options CreateDefaultOptions() { + RE2::Options options; #define FIELD_HANDLE(name, index, type, defVal, setter, conv) options.setter(conv(defVal)); - OPTIONS_MAP(FIELD_HANDLE) + OPTIONS_MAP(FIELD_HANDLE) #undef FIELD_HANDLE - options.set_log_errors(false); - return options; - } - - TString FormatRegexpError(const RE2& Regexp) { - return TStringBuilder() << "Regexp compilation failed. Regexp: \"" << Regexp.pattern() << "\". Original error is: \"" << Regexp.error() << "\""; - } - - enum EOptionsField: ui32 { - OPTIONS_MAP(ENUM_VALUE_GEN) - Count - }; - - struct TOptionsSchema { - TType* StructType; - ui32 Indices[EOptionsField::Count]; - }; + options.set_log_errors(false); + return options; +} - RE2::Options ExtractOptions(std::string_view pattern, TUnboxedValuePod optionsValue, const TOptionsSchema& schema, bool posix) { - RE2::Options options = CreateDefaultOptions(); +TString FormatRegexpError(const RE2& Regexp) { + return TStringBuilder() << "Regexp compilation failed. Regexp: \"" << Regexp.pattern() << "\". Original error is: \"" << Regexp.error() << "\""; +} - options.set_posix_syntax(posix); - bool needUtf8 = (UTF8Detect(pattern) == UTF8); - options.set_encoding( - needUtf8 - ? RE2::Options::Encoding::EncodingUTF8 - : RE2::Options::Encoding::EncodingLatin1); - if (optionsValue) { +enum EOptionsField: ui32 { + OPTIONS_MAP(ENUM_VALUE_GEN) + Count +}; + +struct TOptionsSchema { + TType* StructType; + ui32 Indices[EOptionsField::Count]; +}; + +RE2::Options ExtractOptions(std::string_view pattern, TUnboxedValuePod optionsValue, const TOptionsSchema& schema, bool posix) { + RE2::Options options = CreateDefaultOptions(); + + options.set_posix_syntax(posix); + bool needUtf8 = (UTF8Detect(pattern) == UTF8); + options.set_encoding( + needUtf8 + ? RE2::Options::Encoding::EncodingUTF8 + : RE2::Options::Encoding::EncodingLatin1); + if (optionsValue) { #define FIELD_HANDLE(name, index, type, defVal, setter, conv) options.setter(conv(optionsValue.GetElement(schema.Indices[index]).Get<type>())); - OPTIONS_MAP(FIELD_HANDLE) + OPTIONS_MAP(FIELD_HANDLE) #undef FIELD_HANDLE - options.set_log_errors(false); - } - return options; + options.set_log_errors(false); } + return options; +} - struct TRegexpGroups { - TVector<TString> Names; - TVector<ui32> Indexes; +struct TRegexpGroups { + TVector<TString> Names; + TVector<ui32> Indexes; +}; + +class TRe2Udf: public TBoxedValue { +public: + enum EMode { + MATCH, + GREP, + CAPTURE, + REPLACE, + COUNT, + FIND_AND_CONSUME, }; - class TRe2Udf: public TBoxedValue { + template <bool posix> + class TFactory: public TBoxedValue { public: - enum EMode { - MATCH, - GREP, - CAPTURE, - REPLACE, - COUNT, - FIND_AND_CONSUME, - }; - - template <bool posix> - class TFactory: public TBoxedValue { - public: - TFactory( - EMode mode, - const TOptionsSchema& optionsSchema, - TSourcePosition pos, - NYql::TLangVersion currentlangVersion, - const TRegexpGroups& regexpGroups = TRegexpGroups()) - : Mode_(mode) - , OptionsSchema_(optionsSchema) - , Pos_(pos) - , RegexpGroups_(regexpGroups) - , CurrentLangVersion_(currentlangVersion) - { - } - - private: - TUnboxedValue Run( - const IValueBuilder* valueBuilder, - const TUnboxedValuePod* args) const override { - return TUnboxedValuePod( - new TRe2Udf( - valueBuilder, - args[0], - RegexpGroups_, - Mode_, - posix, - OptionsSchema_, - Pos_, - CurrentLangVersion_)); - } - - EMode Mode_; - const TOptionsSchema OptionsSchema_; - TSourcePosition Pos_; - const TRegexpGroups RegexpGroups_; - NYql::TLangVersion CurrentLangVersion_; - }; - - static const TStringRef& Name(EMode mode) { - static auto match = TStringRef::Of("Match"); - static auto grep = TStringRef::Of("Grep"); - static auto capture = TStringRef::Of("Capture"); - static auto replace = TStringRef::Of("Replace"); - static auto count = TStringRef::Of("Count"); - static auto findAndconsume = TStringRef::Of("FindAndConsume"); - - switch (mode) { - case EMode::MATCH: - return match; - case EMode::GREP: - return grep; - case EMode::CAPTURE: - return capture; - case EMode::REPLACE: - return replace; - case EMode::COUNT: - return count; - case EMode::FIND_AND_CONSUME: - return findAndconsume; - } - Y_ABORT("Unexpected mode"); - } - - TRe2Udf( - const IValueBuilder*, - const TUnboxedValuePod& runConfig, - const TRegexpGroups regexpGroups, + TFactory( EMode mode, - bool posix, const TOptionsSchema& optionsSchema, TSourcePosition pos, - NYql::TLangVersion currentLangVersion) - : RegexpGroups_(regexpGroups) - , Mode_(mode) - , Captured_() + NYql::TLangVersion currentlangVersion, + const TRegexpGroups& regexpGroups = TRegexpGroups()) + : Mode_(mode) , OptionsSchema_(optionsSchema) , Pos_(pos) - , CurrentLangVersion_(currentLangVersion) { - try { - auto patternValue = runConfig.GetElement(0); - auto optionsValue = runConfig.GetElement(1); - const std::string_view pattern(patternValue.AsStringRef()); + , RegexpGroups_(regexpGroups) + , CurrentLangVersion_(currentlangVersion) + { + } - RE2::Options options = ExtractOptions(pattern, optionsValue, OptionsSchema_, posix); - Regexp_ = std::make_unique<RE2>(StringPiece(pattern.data(), pattern.size()), options); + private: + TUnboxedValue Run( + const IValueBuilder* valueBuilder, + const TUnboxedValuePod* args) const override { + return TUnboxedValuePod( + new TRe2Udf( + valueBuilder, + args[0], + RegexpGroups_, + Mode_, + posix, + OptionsSchema_, + Pos_, + CurrentLangVersion_)); + } - if (!Regexp_->ok() && ShouldFailOnInvalidRegexp(pattern, CurrentLangVersion_)) { - throw yexception() << FormatRegexpError(*Regexp_); - } + EMode Mode_; + const TOptionsSchema OptionsSchema_; + TSourcePosition Pos_; + const TRegexpGroups RegexpGroups_; + NYql::TLangVersion CurrentLangVersion_; + }; - if (mode == EMode::CAPTURE) { - Captured_ = std::make_unique<StringPiece[]>(Regexp_->NumberOfCapturingGroups() + 1); - } + static const TStringRef& Name(EMode mode) { + static auto match = TStringRef::Of("Match"); + static auto grep = TStringRef::Of("Grep"); + static auto capture = TStringRef::Of("Capture"); + static auto replace = TStringRef::Of("Replace"); + static auto count = TStringRef::Of("Count"); + static auto findAndconsume = TStringRef::Of("FindAndConsume"); + + switch (mode) { + case EMode::MATCH: + return match; + case EMode::GREP: + return grep; + case EMode::CAPTURE: + return capture; + case EMode::REPLACE: + return replace; + case EMode::COUNT: + return count; + case EMode::FIND_AND_CONSUME: + return findAndconsume; + } + Y_ABORT("Unexpected mode"); + } + + TRe2Udf( + const IValueBuilder*, + const TUnboxedValuePod& runConfig, + const TRegexpGroups regexpGroups, + EMode mode, + bool posix, + const TOptionsSchema& optionsSchema, + TSourcePosition pos, + NYql::TLangVersion currentLangVersion) + : RegexpGroups_(regexpGroups) + , Mode_(mode) + , Captured_() + , OptionsSchema_(optionsSchema) + , Pos_(pos) + , CurrentLangVersion_(currentLangVersion) + { + try { + auto patternValue = runConfig.GetElement(0); + auto optionsValue = runConfig.GetElement(1); + const std::string_view pattern(patternValue.AsStringRef()); + + RE2::Options options = ExtractOptions(pattern, optionsValue, OptionsSchema_, posix); + Regexp_ = std::make_unique<RE2>(StringPiece(pattern.data(), pattern.size()), options); + + if (!Regexp_->ok() && ShouldFailOnInvalidRegexp(pattern, CurrentLangVersion_)) { + throw yexception() << FormatRegexpError(*Regexp_); + } - } catch (const std::exception& e) { - UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str()); + if (mode == EMode::CAPTURE) { + Captured_ = std::make_unique<StringPiece[]>(Regexp_->NumberOfCapturingGroups() + 1); } + + } catch (const std::exception& e) { + UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str()); } + } - private: - TUnboxedValue Run( - const IValueBuilder* valueBuilder, - const TUnboxedValuePod* args) const final try { - RE2::Anchor anchor = RE2::UNANCHORED; - if (args[0]) { - const std::string_view input(args[0].AsStringRef()); - const StringPiece piece(input.data(), input.size()); - - switch (Mode_) { - case MATCH: - anchor = RE2::ANCHOR_BOTH; - [[fallthrough]]; - case GREP: - return TUnboxedValuePod(Regexp_->Match(piece, 0, input.size(), anchor, nullptr, 0)); - case CAPTURE: { - const int count = Regexp_->NumberOfCapturingGroups() + 1; - TUnboxedValue* items = nullptr; - const auto result = valueBuilder->NewArray(RegexpGroups_.Names.size(), items); - if (Regexp_->Match(piece, 0, input.size(), anchor, Captured_.get(), count)) { - for (int i = 0; i < count; ++i) { - if (!Captured_[i].empty()) { - items[RegexpGroups_.Indexes[i]] = valueBuilder->SubString(args[0], std::distance(piece.begin(), Captured_[i].begin()), Captured_[i].size()); - } +private: + TUnboxedValue Run( + const IValueBuilder* valueBuilder, + const TUnboxedValuePod* args) const final try { + RE2::Anchor anchor = RE2::UNANCHORED; + if (args[0]) { + const std::string_view input(args[0].AsStringRef()); + const StringPiece piece(input.data(), input.size()); + + switch (Mode_) { + case MATCH: + anchor = RE2::ANCHOR_BOTH; + [[fallthrough]]; + case GREP: + return TUnboxedValuePod(Regexp_->Match(piece, 0, input.size(), anchor, nullptr, 0)); + case CAPTURE: { + const int count = Regexp_->NumberOfCapturingGroups() + 1; + TUnboxedValue* items = nullptr; + const auto result = valueBuilder->NewArray(RegexpGroups_.Names.size(), items); + if (Regexp_->Match(piece, 0, input.size(), anchor, Captured_.get(), count)) { + for (int i = 0; i < count; ++i) { + if (!Captured_[i].empty()) { + items[RegexpGroups_.Indexes[i]] = valueBuilder->SubString(args[0], std::distance(piece.begin(), Captured_[i].begin()), Captured_[i].size()); } - } else { - return BuildEmptyStruct(valueBuilder); - } - return result; - } - case REPLACE: { - const std::string_view rewriteRef(args[1].AsStringRef()); - const StringPiece rewrite(rewriteRef.data(), rewriteRef.size()); - TString rewriteError; - if (!Regexp_->CheckRewriteString(rewrite, &rewriteError)) { - UdfTerminate((TStringBuilder() << Pos_ << " [rewrite error] " << rewriteError).c_str()); } - std::string result(input); - RE2::GlobalReplace(&result, *Regexp_, rewrite); - return input == result ? TUnboxedValue(args[0]) : valueBuilder->NewString(result); + } else { + return BuildEmptyStruct(valueBuilder); } - case COUNT: { - std::string inputHolder(input); - const ui32 result = RE2::GlobalReplace(&inputHolder, *Regexp_, ""); - return TUnboxedValuePod(result); + return result; + } + case REPLACE: { + const std::string_view rewriteRef(args[1].AsStringRef()); + const StringPiece rewrite(rewriteRef.data(), rewriteRef.size()); + TString rewriteError; + if (!Regexp_->CheckRewriteString(rewrite, &rewriteError)) { + UdfTerminate((TStringBuilder() << Pos_ << " [rewrite error] " << rewriteError).c_str()); } - case FIND_AND_CONSUME: { - StringPiece text(piece); - std::vector<TUnboxedValue> matches; - for (StringPiece w; text.begin() < text.end() && RE2::FindAndConsume(&text, *Regexp_, &w);) { - if (w.size() == 0 && !text.empty()) { - text.remove_prefix(1); - } - matches.emplace_back(valueBuilder->SubString(args[0], std::distance(piece.begin(), w.begin()), w.size())); + std::string result(input); + RE2::GlobalReplace(&result, *Regexp_, rewrite); + return input == result ? TUnboxedValue(args[0]) : valueBuilder->NewString(result); + } + case COUNT: { + std::string inputHolder(input); + const ui32 result = RE2::GlobalReplace(&inputHolder, *Regexp_, ""); + return TUnboxedValuePod(result); + } + case FIND_AND_CONSUME: { + StringPiece text(piece); + std::vector<TUnboxedValue> matches; + for (StringPiece w; text.begin() < text.end() && RE2::FindAndConsume(&text, *Regexp_, &w);) { + if (w.size() == 0 && !text.empty()) { + text.remove_prefix(1); } - return valueBuilder->NewList(matches.data(), matches.size()); + matches.emplace_back(valueBuilder->SubString(args[0], std::distance(piece.begin(), w.begin()), w.size())); } + return valueBuilder->NewList(matches.data(), matches.size()); } - Y_ABORT("Unexpected mode"); - } else { - switch (Mode_) { - case MATCH: - case GREP: - return TUnboxedValuePod(false); - case CAPTURE: - return BuildEmptyStruct(valueBuilder); - case REPLACE: - return TUnboxedValuePod(); - case COUNT: - return TUnboxedValuePod::Zero(); - case FIND_AND_CONSUME: - return valueBuilder->NewEmptyList(); - } - Y_ABORT("Unexpected mode"); } - } catch (const std::exception& e) { - UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str()); - } - - std::unique_ptr<RE2> Regexp_; - const TRegexpGroups RegexpGroups_; - EMode Mode_; - std::unique_ptr<StringPiece[]> Captured_; - const TOptionsSchema OptionsSchema_; - TSourcePosition Pos_; - NYql::TLangVersion CurrentLangVersion_; - - TUnboxedValue BuildEmptyStruct(const IValueBuilder* valueBuilder) const { - TUnboxedValue* items = nullptr; - return valueBuilder->NewArray(RegexpGroups_.Names.size(), items); + Y_ABORT("Unexpected mode"); + } else { + switch (Mode_) { + case MATCH: + case GREP: + return TUnboxedValuePod(false); + case CAPTURE: + return BuildEmptyStruct(valueBuilder); + case REPLACE: + return TUnboxedValuePod(); + case COUNT: + return TUnboxedValuePod::Zero(); + case FIND_AND_CONSUME: + return valueBuilder->NewEmptyList(); + } + Y_ABORT("Unexpected mode"); } - }; + } catch (const std::exception& e) { + UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str()); + } - SIMPLE_STRICT_UDF(TEscape, char*(char*)) { - const std::string_view input(args[0].AsStringRef()); - const auto& result = RE2::QuoteMeta(StringPiece(input.data(), input.size())); - return input == result ? TUnboxedValue(args[0]) : valueBuilder->NewString(result); + std::unique_ptr<RE2> Regexp_; + const TRegexpGroups RegexpGroups_; + EMode Mode_; + std::unique_ptr<StringPiece[]> Captured_; + const TOptionsSchema OptionsSchema_; + TSourcePosition Pos_; + NYql::TLangVersion CurrentLangVersion_; + + TUnboxedValue BuildEmptyStruct(const IValueBuilder* valueBuilder) const { + TUnboxedValue* items = nullptr; + return valueBuilder->NewArray(RegexpGroups_.Names.size(), items); } +}; - TOptionsSchema MakeOptionsSchema(::NKikimr::NUdf::IFunctionTypeInfoBuilder& builder) { - TOptionsSchema ret; - auto structBuilder = builder.Struct(EOptionsField::Count); +SIMPLE_STRICT_UDF(TEscape, char*(char*)) { + const std::string_view input(args[0].AsStringRef()); + const auto& result = RE2::QuoteMeta(StringPiece(input.data(), input.size())); + return input == result ? TUnboxedValue(args[0]) : valueBuilder->NewString(result); +} + +TOptionsSchema MakeOptionsSchema(::NKikimr::NUdf::IFunctionTypeInfoBuilder& builder) { + TOptionsSchema ret; + auto structBuilder = builder.Struct(EOptionsField::Count); #define FIELD_HANDLE(name, index, type, ...) structBuilder->AddField<type>(TStringRef::Of(#name), &ret.Indices[index]); - OPTIONS_MAP(FIELD_HANDLE) + OPTIONS_MAP(FIELD_HANDLE) #undef FIELD_HANDLE - ret.StructType = structBuilder->Build(); - return ret; - } + ret.StructType = structBuilder->Build(); + return ret; +} - class TOptions: public TBoxedValue { - private: - const TOptionsSchema Schema_; +class TOptions: public TBoxedValue { +private: + const TOptionsSchema Schema_; - public: - TOptions(const TOptionsSchema& schema) - : Schema_(schema) - { - } +public: + TOptions(const TOptionsSchema& schema) + : Schema_(schema) + { + } - TUnboxedValue Run( - const IValueBuilder* valueBuilder, - const TUnboxedValuePod* args) const override { - TUnboxedValue* items = nullptr; - const auto result = valueBuilder->NewArray(EOptionsField::Count, items); + TUnboxedValue Run( + const IValueBuilder* valueBuilder, + const TUnboxedValuePod* args) const override { + TUnboxedValue* items = nullptr; + const auto result = valueBuilder->NewArray(EOptionsField::Count, items); #define FIELD_HANDLE(name, index, type, defVal, ...) \ { \ auto structIndex = Schema_.Indices[index]; \ @@ -352,294 +353,295 @@ namespace { } \ } - OPTIONS_MAP(FIELD_HANDLE) + OPTIONS_MAP(FIELD_HANDLE) #undef FIELD_HANDLE - return result; - } + return result; + } - static const ::NKikimr::NUdf::TStringRef& Name() { - static auto name = ::NKikimr::NUdf::TStringRef::Of("Options"); - return name; - } + static const ::NKikimr::NUdf::TStringRef& Name() { + static auto name = ::NKikimr::NUdf::TStringRef::Of("Options"); + return name; + } - static bool DeclareSignature( - const ::NKikimr::NUdf::TStringRef& name, - ::NKikimr::NUdf::TType* userType, - ::NKikimr::NUdf::IFunctionTypeInfoBuilder& builder, - bool typesOnly) { - Y_UNUSED(userType); - if (Name() == name) { - builder.IsStrict(); + static bool DeclareSignature( + const ::NKikimr::NUdf::TStringRef& name, + ::NKikimr::NUdf::TType* userType, + ::NKikimr::NUdf::IFunctionTypeInfoBuilder& builder, + bool typesOnly) { + Y_UNUSED(userType); + if (Name() == name) { + builder.IsStrict(); - auto argsBuilder = builder.Args(); + auto argsBuilder = builder.Args(); #define FIELD_HANDLE(name, index, type, ...) argsBuilder->Add<TOptional<type>>().Name(TStringRef::Of(#name)); - OPTIONS_MAP(FIELD_HANDLE) + OPTIONS_MAP(FIELD_HANDLE) #undef FIELD_HANDLE - auto optionsSchema = MakeOptionsSchema(builder); - builder.Returns(optionsSchema.StructType); - builder.OptionalArgs(EOptionsField::Count); - if (!typesOnly) { - builder.Implementation(new TOptions(optionsSchema)); - } - - return true; - } else { - return false; + auto optionsSchema = MakeOptionsSchema(builder); + builder.Returns(optionsSchema.StructType); + builder.OptionalArgs(EOptionsField::Count); + if (!typesOnly) { + builder.Implementation(new TOptions(optionsSchema)); } - } - }; - template <bool posix> - class TIsValidRegexp: public TBoxedValue { - public: - TIsValidRegexp(const TOptionsSchema optionsSchema) - : OptionsSchema_(std::move(optionsSchema)) - { + return true; + } else { + return false; } + } +}; + +template <bool posix> +class TIsValidRegexp: public TBoxedValue { +public: + TIsValidRegexp(const TOptionsSchema optionsSchema) + : OptionsSchema_(std::move(optionsSchema)) + { + } - TUnboxedValue Run( - const IValueBuilder* valueBuilder, - const TUnboxedValuePod* args) const override { - Y_UNUSED(valueBuilder); - if (!args[0]) { - return TUnboxedValuePod(false); - } - RE2::Options options = ExtractOptions(args[0].AsStringRef(), args[1], OptionsSchema_, posix); - RE2 regexp(args[0].AsStringRef(), options); - return TUnboxedValuePod(regexp.ok()); + TUnboxedValue Run( + const IValueBuilder* valueBuilder, + const TUnboxedValuePod* args) const override { + Y_UNUSED(valueBuilder); + if (!args[0]) { + return TUnboxedValuePod(false); } + RE2::Options options = ExtractOptions(args[0].AsStringRef(), args[1], OptionsSchema_, posix); + RE2 regexp(args[0].AsStringRef(), options); + return TUnboxedValuePod(regexp.ok()); + } - static const ::NKikimr::NUdf::TStringRef& Name() { - static auto name = ::NKikimr::NUdf::TStringRef::Of("IsValidRegexp"); - return name; - } + static const ::NKikimr::NUdf::TStringRef& Name() { + static auto name = ::NKikimr::NUdf::TStringRef::Of("IsValidRegexp"); + return name; + } - static bool DeclareSignature( - const ::NKikimr::NUdf::TStringRef& name, - ::NKikimr::NUdf::TType* userType, - ::NKikimr::NUdf::IFunctionTypeInfoBuilder& builder, - bool typesOnly) { - Y_UNUSED(userType); - if (Name() == name) { - TOptionsSchema optionsSchema = MakeOptionsSchema(builder); - auto optOptionsStructType = builder.Optional()->Item(optionsSchema.StructType).Build(); - builder.Args() - ->Add(builder.Optional()->Item(builder.SimpleType<char*>())) - .Add(optOptionsStructType) - .Done() - .Returns(builder.SimpleType<bool>()); - - builder.OptionalArgs(1); - if (!typesOnly) { - builder.Implementation(new TIsValidRegexp(std::move(optionsSchema))); - } - builder.IsStrict(); - return true; - } else { - return false; + static bool DeclareSignature( + const ::NKikimr::NUdf::TStringRef& name, + ::NKikimr::NUdf::TType* userType, + ::NKikimr::NUdf::IFunctionTypeInfoBuilder& builder, + bool typesOnly) { + Y_UNUSED(userType); + if (Name() == name) { + TOptionsSchema optionsSchema = MakeOptionsSchema(builder); + auto optOptionsStructType = builder.Optional()->Item(optionsSchema.StructType).Build(); + builder.Args() + ->Add(builder.Optional()->Item(builder.SimpleType<char*>())) + .Add(optOptionsStructType) + .Done() + .Returns(builder.SimpleType<bool>()); + + builder.OptionalArgs(1); + if (!typesOnly) { + builder.Implementation(new TIsValidRegexp(std::move(optionsSchema))); } + builder.IsStrict(); + return true; + } else { + return false; } + } - private: - const TOptionsSchema OptionsSchema_; - }; - - SIMPLE_UDF_WITH_OPTIONAL_ARGS(TPatternFromLike, char*(char*, TOptional<char*>), 1) { - const std::string_view input(args[0].AsStringRef()); - const bool hasEscape = bool(args[1]); - char escape = 0; - if (hasEscape) { - const std::string_view escapeRef(args[1].AsStringRef()); - if (escapeRef.size() != 1U) { - UdfTerminate((TStringBuilder() << GetPos() << " Escape should be single character").c_str()); - } - escape = escapeRef.front(); +private: + const TOptionsSchema OptionsSchema_; +}; + +SIMPLE_UDF_WITH_OPTIONAL_ARGS(TPatternFromLike, char*(char*, TOptional<char*>), 1) { + const std::string_view input(args[0].AsStringRef()); + const bool hasEscape = bool(args[1]); + char escape = 0; + if (hasEscape) { + const std::string_view escapeRef(args[1].AsStringRef()); + if (escapeRef.size() != 1U) { + UdfTerminate((TStringBuilder() << GetPos() << " Escape should be single character").c_str()); } - const TString escaped(RE2::QuoteMeta(StringPiece(input.data(), input.size()))); - - TStringBuilder result; - result << "(?s)"; - bool slash = false; - bool escapeOn = false; - - for (const char& c : escaped) { - switch (c) { - case '\\': - if (slash) { - result << "\\\\"; - } - slash = !slash; - break; - case '%': - if (escapeOn) { - result << "\\%"; - escapeOn = false; - } else { - result << ".*"; - } - slash = false; - break; - case '_': + escape = escapeRef.front(); + } + const TString escaped(RE2::QuoteMeta(StringPiece(input.data(), input.size()))); + + TStringBuilder result; + result << "(?s)"; + bool slash = false; + bool escapeOn = false; + + for (const char& c : escaped) { + switch (c) { + case '\\': + if (slash) { + result << "\\\\"; + } + slash = !slash; + break; + case '%': + if (escapeOn) { + result << "\\%"; + escapeOn = false; + } else { + result << ".*"; + } + slash = false; + break; + case '_': + if (escapeOn) { + result << "\\_"; + escapeOn = false; + } else { + result << '.'; + } + slash = false; + break; + default: + if (hasEscape && c == escape) { if (escapeOn) { - result << "\\_"; - escapeOn = false; - } else { - result << '.'; + result << RE2::QuoteMeta(StringPiece(&c, 1)); } - slash = false; - break; - default: - if (hasEscape && c == escape) { - if (escapeOn) { - result << RE2::QuoteMeta(StringPiece(&c, 1)); - } - escapeOn = !escapeOn; - } else { - if (slash) - result << '\\'; - result << c; - escapeOn = false; + escapeOn = !escapeOn; + } else { + if (slash) { + result << '\\'; } - slash = false; - break; - } + result << c; + escapeOn = false; + } + slash = false; + break; } - return valueBuilder->NewString(result); - } - - TType* MakeRunConfigType(IFunctionTypeInfoBuilder& builder, TType* optOptionsStructType) { - return builder.Tuple()->Add<char*>().Add(optOptionsStructType).Build(); } + return valueBuilder->NewString(result); +} - template <bool posix> - class TRe2Module: public IUdfModule { - public: - TStringRef Name() const { - return posix ? TStringRef::Of("Re2posix") : TStringRef::Of("Re2"); - } - - void CleanupOnTerminate() const final { - } - - void GetAllFunctions(IFunctionsSink& sink) const final { - sink.Add(TRe2Udf::Name(TRe2Udf::EMode::MATCH)); - sink.Add(TRe2Udf::Name(TRe2Udf::EMode::GREP)); - sink.Add(TRe2Udf::Name(TRe2Udf::EMode::CAPTURE))->SetTypeAwareness(); - sink.Add(TRe2Udf::Name(TRe2Udf::EMode::REPLACE)); - sink.Add(TRe2Udf::Name(TRe2Udf::EMode::COUNT)); - sink.Add(TRe2Udf::Name(TRe2Udf::EMode::FIND_AND_CONSUME)); - sink.Add(TEscape::Name()); - sink.Add(TPatternFromLike::Name()); - sink.Add(TOptions::Name()); - sink.Add(TIsValidRegexp<posix>::Name()); - } +TType* MakeRunConfigType(IFunctionTypeInfoBuilder& builder, TType* optOptionsStructType) { + return builder.Tuple()->Add<char*>().Add(optOptionsStructType).Build(); +} - void BuildFunctionTypeInfo( - const TStringRef& name, - TType* userType, - const TStringRef& typeConfig, - ui32 flags, - IFunctionTypeInfoBuilder& builder) const final try { - Y_UNUSED(userType); - TOptionsSchema optionsSchema = MakeOptionsSchema(builder); - auto optOptionsStructType = builder.Optional()->Item(optionsSchema.StructType).Build(); +template <bool posix> +class TRe2Module: public IUdfModule { +public: + TStringRef Name() const { + return posix ? TStringRef::Of("Re2posix") : TStringRef::Of("Re2"); + } - bool typesOnly = (flags & TFlags::TypesOnly); - bool isMatch = (TRe2Udf::Name(TRe2Udf::EMode::MATCH) == name); - bool isGrep = (TRe2Udf::Name(TRe2Udf::EMode::GREP) == name); - bool isCapture = (TRe2Udf::Name(TRe2Udf::EMode::CAPTURE) == name); - bool isReplace = (TRe2Udf::Name(TRe2Udf::EMode::REPLACE) == name); - bool isCount = (TRe2Udf::Name(TRe2Udf::EMode::COUNT) == name); - bool isFindAndConsume = (TRe2Udf::Name(TRe2Udf::FIND_AND_CONSUME) == name); + void CleanupOnTerminate() const final { + } - if (isMatch || isGrep) { - builder.SimpleSignature<bool(TOptional<char*>)>() - .RunConfig(MakeRunConfigType(builder, optOptionsStructType)); + void GetAllFunctions(IFunctionsSink& sink) const final { + sink.Add(TRe2Udf::Name(TRe2Udf::EMode::MATCH)); + sink.Add(TRe2Udf::Name(TRe2Udf::EMode::GREP)); + sink.Add(TRe2Udf::Name(TRe2Udf::EMode::CAPTURE))->SetTypeAwareness(); + sink.Add(TRe2Udf::Name(TRe2Udf::EMode::REPLACE)); + sink.Add(TRe2Udf::Name(TRe2Udf::EMode::COUNT)); + sink.Add(TRe2Udf::Name(TRe2Udf::EMode::FIND_AND_CONSUME)); + sink.Add(TEscape::Name()); + sink.Add(TPatternFromLike::Name()); + sink.Add(TOptions::Name()); + sink.Add(TIsValidRegexp<posix>::Name()); + } - if (!typesOnly) { - const auto mode = isMatch ? TRe2Udf::EMode::MATCH : TRe2Udf::EMode::GREP; - builder.Implementation(new TRe2Udf::TFactory<posix>(mode, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer())); - } - } else if (isCapture) { - TRegexpGroups groups; - auto optionalStringType = builder.Optional()->Item<char*>().Build(); - auto structBuilder = builder.Struct(); - RE2::Options options = CreateDefaultOptions(); - RE2 regexp(StringPiece(typeConfig.Data(), typeConfig.Size()), options); - if (!regexp.ok()) { - builder.SetError(FormatRegexpError(regexp)); - return; - } - const auto& groupNames = regexp.CapturingGroupNames(); - int groupCount = regexp.NumberOfCapturingGroups(); - if (groupCount >= 0) { - std::unordered_set<std::string_view> groupNamesSet; - int unnamedCount = 0; - ++groupCount; - groups.Indexes.resize(groupCount); - groups.Names.resize(groupCount); - for (int i = 0; i < groupCount; ++i) { - TString fieldName; - auto it = groupNames.find(i); - if (it != groupNames.end()) { - if (!groupNamesSet.insert(it->second).second) { - builder.SetError( - TStringBuilder() << "Regexp contains duplicate capturing group name: " << it->second); - return; - } - fieldName = it->second; - } else { - fieldName = "_" + ToString(unnamedCount); - ++unnamedCount; + void BuildFunctionTypeInfo( + const TStringRef& name, + TType* userType, + const TStringRef& typeConfig, + ui32 flags, + IFunctionTypeInfoBuilder& builder) const final try { + Y_UNUSED(userType); + TOptionsSchema optionsSchema = MakeOptionsSchema(builder); + auto optOptionsStructType = builder.Optional()->Item(optionsSchema.StructType).Build(); + + bool typesOnly = (flags & TFlags::TypesOnly); + bool isMatch = (TRe2Udf::Name(TRe2Udf::EMode::MATCH) == name); + bool isGrep = (TRe2Udf::Name(TRe2Udf::EMode::GREP) == name); + bool isCapture = (TRe2Udf::Name(TRe2Udf::EMode::CAPTURE) == name); + bool isReplace = (TRe2Udf::Name(TRe2Udf::EMode::REPLACE) == name); + bool isCount = (TRe2Udf::Name(TRe2Udf::EMode::COUNT) == name); + bool isFindAndConsume = (TRe2Udf::Name(TRe2Udf::FIND_AND_CONSUME) == name); + + if (isMatch || isGrep) { + builder.SimpleSignature<bool(TOptional<char*>)>() + .RunConfig(MakeRunConfigType(builder, optOptionsStructType)); + + if (!typesOnly) { + const auto mode = isMatch ? TRe2Udf::EMode::MATCH : TRe2Udf::EMode::GREP; + builder.Implementation(new TRe2Udf::TFactory<posix>(mode, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer())); + } + } else if (isCapture) { + TRegexpGroups groups; + auto optionalStringType = builder.Optional()->Item<char*>().Build(); + auto structBuilder = builder.Struct(); + RE2::Options options = CreateDefaultOptions(); + RE2 regexp(StringPiece(typeConfig.Data(), typeConfig.Size()), options); + if (!regexp.ok()) { + builder.SetError(FormatRegexpError(regexp)); + return; + } + const auto& groupNames = regexp.CapturingGroupNames(); + int groupCount = regexp.NumberOfCapturingGroups(); + if (groupCount >= 0) { + std::unordered_set<std::string_view> groupNamesSet; + int unnamedCount = 0; + ++groupCount; + groups.Indexes.resize(groupCount); + groups.Names.resize(groupCount); + for (int i = 0; i < groupCount; ++i) { + TString fieldName; + auto it = groupNames.find(i); + if (it != groupNames.end()) { + if (!groupNamesSet.insert(it->second).second) { + builder.SetError( + TStringBuilder() << "Regexp contains duplicate capturing group name: " << it->second); + return; } - groups.Names[i] = fieldName; - structBuilder->AddField(fieldName, optionalStringType, &groups.Indexes[i]); - } - builder.Args(1)->Add(optionalStringType).Done().Returns(structBuilder->Build()).RunConfig(MakeRunConfigType(builder, optOptionsStructType)); - - if (!typesOnly) { - builder.Implementation( - new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::CAPTURE, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer(), groups)); + fieldName = it->second; + } else { + fieldName = "_" + ToString(unnamedCount); + ++unnamedCount; } - - } else { - Y_ENSURE(regexp.ok()); - builder.SetError("Regexp contains no capturing groups"); + groups.Names[i] = fieldName; + structBuilder->AddField(fieldName, optionalStringType, &groups.Indexes[i]); } - } else if (isReplace) { - builder.SimpleSignature<TOptional<char*>(TOptional<char*>, char*)>() - .RunConfig(MakeRunConfigType(builder, optOptionsStructType)); + builder.Args(1)->Add(optionalStringType).Done().Returns(structBuilder->Build()).RunConfig(MakeRunConfigType(builder, optOptionsStructType)); if (!typesOnly) { - builder.Implementation(new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::REPLACE, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer())); + builder.Implementation( + new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::CAPTURE, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer(), groups)); } - } else if (isCount) { - builder.SimpleSignature<ui32(TOptional<char*>)>() - .RunConfig(MakeRunConfigType(builder, optOptionsStructType)); - if (!typesOnly) { - builder.Implementation(new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::COUNT, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer())); - } - } else if (isFindAndConsume) { - builder.SimpleSignature<TListType<char*>(TOptional<char*>)>() - .RunConfig(MakeRunConfigType(builder, optOptionsStructType)); - if (!typesOnly) { - builder.Implementation(new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::FIND_AND_CONSUME, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer())); - } - } else if (!( - TEscape::DeclareSignature(name, userType, builder, typesOnly) || - TPatternFromLike::DeclareSignature(name, userType, builder, typesOnly) || - TOptions::DeclareSignature(name, userType, builder, typesOnly) || - TIsValidRegexp<posix>::DeclareSignature(name, userType, builder, typesOnly))) { - builder.SetError( - TStringBuilder() << "Unknown function name: " << TString(name)); + } else { + Y_ENSURE(regexp.ok()); + builder.SetError("Regexp contains no capturing groups"); } - } catch (const std::exception& e) { - builder.SetError(CurrentExceptionMessage()); + } else if (isReplace) { + builder.SimpleSignature<TOptional<char*>(TOptional<char*>, char*)>() + .RunConfig(MakeRunConfigType(builder, optOptionsStructType)); + + if (!typesOnly) { + builder.Implementation(new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::REPLACE, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer())); + } + } else if (isCount) { + builder.SimpleSignature<ui32(TOptional<char*>)>() + .RunConfig(MakeRunConfigType(builder, optOptionsStructType)); + + if (!typesOnly) { + builder.Implementation(new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::COUNT, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer())); + } + } else if (isFindAndConsume) { + builder.SimpleSignature<TListType<char*>(TOptional<char*>)>() + .RunConfig(MakeRunConfigType(builder, optOptionsStructType)); + if (!typesOnly) { + builder.Implementation(new TRe2Udf::TFactory<posix>(TRe2Udf::EMode::FIND_AND_CONSUME, optionsSchema, builder.GetSourcePosition(), builder.GetCurrentLangVer())); + } + } else if (!( + TEscape::DeclareSignature(name, userType, builder, typesOnly) || + TPatternFromLike::DeclareSignature(name, userType, builder, typesOnly) || + TOptions::DeclareSignature(name, userType, builder, typesOnly) || + TIsValidRegexp<posix>::DeclareSignature(name, userType, builder, typesOnly))) { + builder.SetError( + TStringBuilder() << "Unknown function name: " << TString(name)); } - }; + } catch (const std::exception& e) { + builder.SetError(CurrentExceptionMessage()); + } +}; -} +} // namespace REGISTER_MODULES( TRe2Module<false>, |