summary refs log tree commit diff stats
path: root/yql/essentials/udfs/common/pire/pire_udf.cpp
diff options
context:
space:
mode:
author vvvv <[email protected]> 2025-10-06 13:26:25 +0300
committer vvvv <[email protected]> 2025-10-06 14:06:25 +0300
commit eca8ce9cb1613d5c983185c4e43c20651a9638aa (patch)
tree 61ee5ae779948e61af9a7691d19eaa2c09869121 /yql/essentials/udfs/common/pire/pire_udf.cpp
parent 4adf7eecae16a9b228b28cc5f64c27ef69ad5ec2 (diff)
YQL-20086 udfs
init commit_hash:f9684778bf1ea956965f2360b80b91edb7d4ffbe
Diffstat (limited to 'yql/essentials/udfs/common/pire/pire_udf.cpp')
-rw-r--r-- yql/essentials/udfs/common/pire/pire_udf.cpp 561
1 files changed, 283 insertions, 278 deletions
diff --git a/yql/essentials/udfs/common/pire/pire_udf.cpp b/yql/essentials/udfs/common/pire/pire_udf.cpp
index de2a75955e3..1357107a12b 100644
--- a/yql/essentials/udfs/common/pire/pire_udf.cpp
+++ b/yql/essentials/udfs/common/pire/pire_udf.cpp
@@ -14,345 +14,350 @@ using namespace NKikimr;
using namespace NUdf;
namespace {
- class TPireUdfBase: public TBoxedValue {
- protected:
- TPireUdfBase(TSourcePosition pos)
- : Pos_(pos)
- {}
-
- void SetCommonOptions(std::string_view& regex, TFsm::TOptions& options) {
- if (regex.size() >= 4U && regex.substr(0U, 4U) == "(?i)") {
- options.SetCaseInsensitive(true);
- regex.remove_prefix(4U);
- }
- if (UTF8Detect(regex) == UTF8) {
- options.SetCharset(CODES_UTF8);
- }
+class TPireUdfBase: public TBoxedValue {
+protected:
+ TPireUdfBase(TSourcePosition pos)
+ : Pos_(pos)
+ {
+ }
+
+ void SetCommonOptions(std::string_view& regex, TFsm::TOptions& options) {
+ if (regex.size() >= 4U && regex.substr(0U, 4U) == "(?i)") {
+ options.SetCaseInsensitive(true);
+ regex.remove_prefix(4U);
}
+ if (UTF8Detect(regex) == UTF8) {
+ options.SetCharset(CODES_UTF8);
+ }
+ }
- TSourcePosition Pos_;
- };
+ TSourcePosition Pos_;
+};
- class TPireMatch: public TPireUdfBase {
+class TPireMatch: public TPireUdfBase {
+public:
+ class TFactory: public TPireUdfBase {
public:
- class TFactory: public TPireUdfBase {
- public:
- TFactory(
- bool surroundMode,
- bool multiMode,
- TSourcePosition pos,
- size_t regexpsCount = 0)
- : TPireUdfBase(pos)
- , SurroundMode_(surroundMode)
- , MultiMode_(multiMode)
- , RegexpsCount_(regexpsCount)
- {
- }
-
- private:
- TUnboxedValue Run(
- const IValueBuilder* valueBuilder,
- const TUnboxedValuePod* args) const final {
- return TUnboxedValuePod(
- new TPireMatch(
- valueBuilder,
- args[0],
- SurroundMode_,
- MultiMode_,
- Pos_,
- RegexpsCount_));
- }
-
- bool SurroundMode_;
- bool MultiMode_;
- size_t RegexpsCount_;
- };
-
- static const TStringRef& Name(bool surroundMode, bool multiMode) {
- static auto match = TStringRef::Of("Match");
- static auto grep = TStringRef::Of("Grep");
- static auto multiMatch = TStringRef::Of("MultiMatch");
- static auto multiGrep = TStringRef::Of("MultiGrep");
- if (surroundMode) {
- return multiMode ? multiGrep : grep;
- } else {
- return multiMode ? multiMatch : match;
- }
- }
-
- TPireMatch(
- const IValueBuilder* valueBuilder,
- const TUnboxedValuePod& runConfig,
+ TFactory(
bool surroundMode,
bool multiMode,
TSourcePosition pos,
- size_t regexpsCount)
+ size_t regexpsCount = 0)
: TPireUdfBase(pos)
+ , SurroundMode_(surroundMode)
, MultiMode_(multiMode)
, RegexpsCount_(regexpsCount)
- , SurroundMode_(surroundMode)
{
- Y_UNUSED(valueBuilder);
- try {
- std::string_view regex(runConfig.AsStringRef());
- TFsm::TOptions options;
- options.SetSurround(surroundMode);
- SetCommonOptions(regex, options);
- if (multiMode) {
- std::vector<std::string_view> parts;
- StringSplitter(regex).Split('\n').AddTo(&parts);
- for (const auto& part : parts) {
- if (!part.empty()) {
- if (Fsm_) try {
+ }
+
+ private:
+ TUnboxedValue Run(
+ const IValueBuilder* valueBuilder,
+ const TUnboxedValuePod* args) const final {
+ return TUnboxedValuePod(
+ new TPireMatch(
+ valueBuilder,
+ args[0],
+ SurroundMode_,
+ MultiMode_,
+ Pos_,
+ RegexpsCount_));
+ }
+
+ bool SurroundMode_;
+ bool MultiMode_;
+ size_t RegexpsCount_;
+ };
+
+ static const TStringRef& Name(bool surroundMode, bool multiMode) {
+ static auto match = TStringRef::Of("Match");
+ static auto grep = TStringRef::Of("Grep");
+ static auto multiMatch = TStringRef::Of("MultiMatch");
+ static auto multiGrep = TStringRef::Of("MultiGrep");
+ if (surroundMode) {
+ return multiMode ? multiGrep : grep;
+ } else {
+ return multiMode ? multiMatch : match;
+ }
+ }
+
+ TPireMatch(
+ const IValueBuilder* valueBuilder,
+ const TUnboxedValuePod& runConfig,
+ bool surroundMode,
+ bool multiMode,
+ TSourcePosition pos,
+ size_t regexpsCount)
+ : TPireUdfBase(pos)
+ , MultiMode_(multiMode)
+ , RegexpsCount_(regexpsCount)
+ , SurroundMode_(surroundMode)
+ {
+ Y_UNUSED(valueBuilder);
+ try {
+ std::string_view regex(runConfig.AsStringRef());
+ TFsm::TOptions options;
+ options.SetSurround(surroundMode);
+ SetCommonOptions(regex, options);
+ if (multiMode) {
+ std::vector<std::string_view> parts;
+ StringSplitter(regex).Split('\n').AddTo(&parts);
+ for (const auto& part : parts) {
+ if (!part.empty()) {
+ if (Fsm_) {
+ try {
*Fsm_ = *Fsm_ | TFsm(TString(part), options);
} catch (const yexception&) {
UdfTerminate((TStringBuilder() << Pos_ << " Failed to glue up regexes, probably the finite state machine appeared to be too large").c_str());
- } else {
- Fsm_.Reset(new TFsm(TString(part), options));
}
+ } else {
+ Fsm_.Reset(new TFsm(TString(part), options));
}
}
- } else {
- Fsm_.Reset(new TFsm(TString(regex), options));
}
- } catch (const std::exception& e) {
- UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str());
+ } else {
+ Fsm_.Reset(new TFsm(TString(regex), options));
}
+ } catch (const std::exception& e) {
+ UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str());
}
+ }
- private:
- TUnboxedValue Run(
- const IValueBuilder* valueBuilder,
- const TUnboxedValuePod* args) const final try {
- TUnboxedValue* items = nullptr;
- TUnboxedValue tuple;
- size_t i = 0;
+private:
+ TUnboxedValue Run(
+ const IValueBuilder* valueBuilder,
+ const TUnboxedValuePod* args) const final try {
+ TUnboxedValue* items = nullptr;
+ TUnboxedValue tuple;
+ size_t i = 0;
- if (MultiMode_) {
- tuple = valueBuilder->NewArray(RegexpsCount_, items);
+ if (MultiMode_) {
+ tuple = valueBuilder->NewArray(RegexpsCount_, items);
- for (i = 0; i < RegexpsCount_; ++i) {
- items[i] = TUnboxedValuePod(false);
- }
+ for (i = 0; i < RegexpsCount_; ++i) {
+ items[i] = TUnboxedValuePod(false);
}
+ }
- if (args[0]) {
- const auto input = args[0].AsStringRef();
- TMatcher matcher(*Fsm_);
- const bool isMatch = matcher.Match(input.Data(), input.Size(), SurroundMode_, SurroundMode_).Final();
- if (MultiMode_) {
- if (isMatch) {
- const auto& matchedRegexps = matcher.MatchedRegexps();
- size_t matchesCount = matchedRegexps.second - matchedRegexps.first;
-
- for (i = 0; i < matchesCount; ++i) {
- items[matchedRegexps.first[i]] = TUnboxedValuePod(true);
- }
- }
- return tuple;
+ if (args[0]) {
+ const auto input = args[0].AsStringRef();
+ TMatcher matcher(*Fsm_);
+ const bool isMatch = matcher.Match(input.Data(), input.Size(), SurroundMode_, SurroundMode_).Final();
+ if (MultiMode_) {
+ if (isMatch) {
+ const auto& matchedRegexps = matcher.MatchedRegexps();
+ size_t matchesCount = matchedRegexps.second - matchedRegexps.first;
- } else {
- return TUnboxedValuePod(isMatch);
+ for (i = 0; i < matchesCount; ++i) {
+ items[matchedRegexps.first[i]] = TUnboxedValuePod(true);
+ }
}
+ return tuple;
} else {
- return MultiMode_ ? tuple : TUnboxedValue(TUnboxedValuePod(false));
- }
- } catch (const std::exception& e) {
- UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str());
- }
-
- private:
- TUniquePtr<TFsm> Fsm_;
- bool MultiMode_;
- size_t RegexpsCount_;
- bool SurroundMode_;
- };
-
- class TPireCapture: public TPireUdfBase {
- public:
- class TFactory: public TPireUdfBase {
- public:
- TFactory(TSourcePosition pos)
- : TPireUdfBase(pos)
- {}
-
- private:
- TUnboxedValue Run(const IValueBuilder*, const TUnboxedValuePod* args) const final try {
- return TUnboxedValuePod(new TPireCapture(args[0], Pos_));
- } catch (const std::exception& e) {
- UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str());
+ return TUnboxedValuePod(isMatch);
}
- };
- static const TStringRef& Name() {
- static auto name = TStringRef::Of("Capture");
- return name;
+ } else {
+ return MultiMode_ ? tuple : TUnboxedValue(TUnboxedValuePod(false));
}
-
- TPireCapture(const TUnboxedValuePod& runConfig, TSourcePosition pos)
+ } catch (const std::exception& e) {
+ UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str());
+ }
+
+private:
+ TUniquePtr<TFsm> Fsm_;
+ bool MultiMode_;
+ size_t RegexpsCount_;
+ bool SurroundMode_;
+};
+
+class TPireCapture: public TPireUdfBase {
+public:
+ class TFactory: public TPireUdfBase {
+ public:
+ TFactory(TSourcePosition pos)
: TPireUdfBase(pos)
{
- std::string_view regex(runConfig.AsStringRef());
- TFsm::TOptions options;
- SetCommonOptions(regex, options);
- Fsm_.Reset(new TSlowCapturingFsm(TString(regex), options));
}
private:
- TUnboxedValue Run(
- const IValueBuilder* valueBuilder,
- const TUnboxedValuePod* args) const final try {
- if (args[0]) {
- const std::string_view input = args[0].AsStringRef();
-
- TSlowSearcher searcher(*Fsm_);
- searcher.Search(input.data(), input.size());
-
- if (searcher.Captured()) {
- const auto& captured = searcher.GetCaptured();
- return valueBuilder->SubString(args[0], std::distance(input.begin(), captured.begin()), captured.length());
- }
- }
-
- return TUnboxedValue();
+ TUnboxedValue Run(const IValueBuilder*, const TUnboxedValuePod* args) const final try {
+ return TUnboxedValuePod(new TPireCapture(args[0], Pos_));
} catch (const std::exception& e) {
UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str());
}
-
- TUniquePtr<TSlowCapturingFsm> Fsm_;
};
- class TPireReplace: public TPireUdfBase {
- public:
- class TFactory: public TPireUdfBase {
- public:
- TFactory(TSourcePosition pos)
- : TPireUdfBase(pos)
- {}
-
- private:
- TUnboxedValue Run(const IValueBuilder*, const TUnboxedValuePod* args) const final try {
- return TUnboxedValuePod(new TPireReplace(args[0], Pos_));
- } catch (const std::exception& e) {
- UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str());
+ static const TStringRef& Name() {
+ static auto name = TStringRef::Of("Capture");
+ return name;
+ }
+
+ TPireCapture(const TUnboxedValuePod& runConfig, TSourcePosition pos)
+ : TPireUdfBase(pos)
+ {
+ std::string_view regex(runConfig.AsStringRef());
+ TFsm::TOptions options;
+ SetCommonOptions(regex, options);
+ Fsm_.Reset(new TSlowCapturingFsm(TString(regex), options));
+ }
+
+private:
+ TUnboxedValue Run(
+ const IValueBuilder* valueBuilder,
+ const TUnboxedValuePod* args) const final try {
+ if (args[0]) {
+ const std::string_view input = args[0].AsStringRef();
+
+ TSlowSearcher searcher(*Fsm_);
+ searcher.Search(input.data(), input.size());
+
+ if (searcher.Captured()) {
+ const auto& captured = searcher.GetCaptured();
+ return valueBuilder->SubString(args[0], std::distance(input.begin(), captured.begin()), captured.length());
}
- };
-
- static const TStringRef& Name() {
- static auto name = TStringRef::Of("Replace");
- return name;
}
- TPireReplace(const TUnboxedValuePod& runConfig, TSourcePosition pos)
+ return TUnboxedValue();
+ } catch (const std::exception& e) {
+ UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str());
+ }
+
+ TUniquePtr<TSlowCapturingFsm> Fsm_;
+};
+
+class TPireReplace: public TPireUdfBase {
+public:
+ class TFactory: public TPireUdfBase {
+ public:
+ TFactory(TSourcePosition pos)
: TPireUdfBase(pos)
{
- std::string_view regex(runConfig.AsStringRef());
- TFsm::TOptions options;
- SetCommonOptions(regex, options);
- Fsm_.Reset(new TSlowCapturingFsm(TString(regex), options));
}
private:
- TUnboxedValue Run(
- const IValueBuilder* valueBuilder,
- const TUnboxedValuePod* args) const final try {
- if (args[0]) {
- const std::string_view input(args[0].AsStringRef());
-
- TSlowSearcher s(*Fsm_);
- s.Search(input.data(), input.size());
- if (s.Captured()) {
- const auto& captured = s.GetCaptured();
- const TString replacement(args[1].AsStringRef());
- TString replaced(args[0].AsStringRef());
- replaced.replace(std::distance(input.begin(), captured.begin()), captured.length(), replacement);
- return valueBuilder->NewString(replaced);
- } else {
- return TUnboxedValue(args[0]);
- }
- } else {
- return TUnboxedValue();
- }
+ TUnboxedValue Run(const IValueBuilder*, const TUnboxedValuePod* args) const final try {
+ return TUnboxedValuePod(new TPireReplace(args[0], Pos_));
} catch (const std::exception& e) {
UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str());
}
-
- TUniquePtr<TSlowCapturingFsm> Fsm_;
};
- class TPireModule: public IUdfModule {
- public:
- TStringRef Name() const {
- return TStringRef::Of("Pire");
- }
-
- void CleanupOnTerminate() const final {
- }
-
- void GetAllFunctions(IFunctionsSink& sink) const final {
- sink.Add(TPireMatch::Name(true, true))->SetTypeAwareness();
- sink.Add(TPireMatch::Name(false, true))->SetTypeAwareness();
- sink.Add(TPireMatch::Name(true, false));
- sink.Add(TPireMatch::Name(false, false));
- sink.Add(TPireCapture::Name());
- sink.Add(TPireReplace::Name());
+ static const TStringRef& Name() {
+ static auto name = TStringRef::Of("Replace");
+ return name;
+ }
+
+ TPireReplace(const TUnboxedValuePod& runConfig, TSourcePosition pos)
+ : TPireUdfBase(pos)
+ {
+ std::string_view regex(runConfig.AsStringRef());
+ TFsm::TOptions options;
+ SetCommonOptions(regex, options);
+ Fsm_.Reset(new TSlowCapturingFsm(TString(regex), options));
+ }
+
+private:
+ TUnboxedValue Run(
+ const IValueBuilder* valueBuilder,
+ const TUnboxedValuePod* args) const final try {
+ if (args[0]) {
+ const std::string_view input(args[0].AsStringRef());
+
+ TSlowSearcher s(*Fsm_);
+ s.Search(input.data(), input.size());
+ if (s.Captured()) {
+ const auto& captured = s.GetCaptured();
+ const TString replacement(args[1].AsStringRef());
+ TString replaced(args[0].AsStringRef());
+ replaced.replace(std::distance(input.begin(), captured.begin()), captured.length(), replacement);
+ return valueBuilder->NewString(replaced);
+ } else {
+ return TUnboxedValue(args[0]);
+ }
+ } else {
+ return TUnboxedValue();
}
+ } catch (const std::exception& e) {
+ UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).c_str());
+ }
+
+ TUniquePtr<TSlowCapturingFsm> Fsm_;
+};
+
+class TPireModule: public IUdfModule {
+public:
+ TStringRef Name() const {
+ return TStringRef::Of("Pire");
+ }
+
+ void CleanupOnTerminate() const final {
+ }
+
+ void GetAllFunctions(IFunctionsSink& sink) const final {
+ sink.Add(TPireMatch::Name(true, true))->SetTypeAwareness();
+ sink.Add(TPireMatch::Name(false, true))->SetTypeAwareness();
+ sink.Add(TPireMatch::Name(true, false));
+ sink.Add(TPireMatch::Name(false, false));
+ sink.Add(TPireCapture::Name());
+ sink.Add(TPireReplace::Name());
+ }
+
+ void BuildFunctionTypeInfo(
+ const TStringRef& name,
+ TType*,
+ const TStringRef& typeConfig,
+ ui32 flags,
+ IFunctionTypeInfoBuilder& builder) const final try {
+ const bool typesOnly = (flags & TFlags::TypesOnly);
+ const bool isMatch = (TPireMatch::Name(false, false) == name);
+ const bool isGrep = (TPireMatch::Name(true, false) == name);
+ const bool isMultiMatch = (TPireMatch::Name(false, true) == name);
+ const bool isMultiGrep = (TPireMatch::Name(true, true) == name);
+
+ if (isMatch || isGrep) {
+ builder.SimpleSignature<bool(TOptional<char*>)>()
+ .RunConfig<const char*>();
+
+ if (!typesOnly) {
+ builder.Implementation(new TPireMatch::TFactory(isGrep, false, builder.GetSourcePosition()));
+ }
+ } else if (isMultiMatch || isMultiGrep) {
+ const auto boolType = builder.SimpleType<bool>();
+ const auto optionalStringType = builder.Optional()->Item<char*>().Build();
+ const std::string_view regexp(typeConfig);
+ const size_t regexpCount = std::count(regexp.begin(), regexp.end(), '\n') + 1;
+ const auto tuple = builder.Tuple();
+ for (size_t i = 0; i < regexpCount; ++i) {
+ tuple->Add(boolType);
+ }
+ const auto tupleType = tuple->Build();
+ builder.Args(1)->Add(optionalStringType).Done().Returns(tupleType).RunConfig<char*>();
- void BuildFunctionTypeInfo(
- const TStringRef& name,
- TType*,
- const TStringRef& typeConfig,
- ui32 flags,
- IFunctionTypeInfoBuilder& builder) const final try {
- const bool typesOnly = (flags & TFlags::TypesOnly);
- const bool isMatch = (TPireMatch::Name(false, false) == name);
- const bool isGrep = (TPireMatch::Name(true, false) == name);
- const bool isMultiMatch = (TPireMatch::Name(false, true) == name);
- const bool isMultiGrep = (TPireMatch::Name(true, true) == name);
-
- if (isMatch || isGrep) {
- builder.SimpleSignature<bool(TOptional<char*>)>()
- .RunConfig<const char*>();
-
- if (!typesOnly) {
- builder.Implementation(new TPireMatch::TFactory(isGrep, false, builder.GetSourcePosition()));
- }
- } else if (isMultiMatch || isMultiGrep) {
- const auto boolType = builder.SimpleType<bool>();
- const auto optionalStringType = builder.Optional()->Item<char*>().Build();
- const std::string_view regexp(typeConfig);
- const size_t regexpCount = std::count(regexp.begin(), regexp.end(), '\n') + 1;
- const auto tuple = builder.Tuple();
- for (size_t i = 0; i < regexpCount; ++i) {
- tuple->Add(boolType);
- }
- const auto tupleType = tuple->Build();
- builder.Args(1)->Add(optionalStringType).Done().Returns(tupleType).RunConfig<char*>();
-
- if (!typesOnly) {
- builder.Implementation(new TPireMatch::TFactory(isMultiGrep, true, builder.GetSourcePosition(), regexpCount));
- }
- } else if (TPireCapture::Name() == name) {
- builder.SimpleSignature<TOptional<char*>(TOptional<char*>)>()
- .RunConfig<char*>();
+ if (!typesOnly) {
+ builder.Implementation(new TPireMatch::TFactory(isMultiGrep, true, builder.GetSourcePosition(), regexpCount));
+ }
+ } else if (TPireCapture::Name() == name) {
+ builder.SimpleSignature<TOptional<char*>(TOptional<char*>)>()
+ .RunConfig<char*>();
- if (!typesOnly) {
- builder.Implementation(new TPireCapture::TFactory(builder.GetSourcePosition()));
- }
- } else if (TPireReplace::Name() == name) {
- builder.SimpleSignature<TOptional<char*>(TOptional<char*>, char*)>()
- .RunConfig<char*>();
+ if (!typesOnly) {
+ builder.Implementation(new TPireCapture::TFactory(builder.GetSourcePosition()));
+ }
+ } else if (TPireReplace::Name() == name) {
+ builder.SimpleSignature<TOptional<char*>(TOptional<char*>, char*)>()
+ .RunConfig<char*>();
- if (!typesOnly) {
- builder.Implementation(new TPireReplace::TFactory(builder.GetSourcePosition()));
- }
+ if (!typesOnly) {
+ builder.Implementation(new TPireReplace::TFactory(builder.GetSourcePosition()));
}
- } catch (const std::exception& e) {
- builder.SetError(CurrentExceptionMessage());
}
- };
+ } catch (const std::exception& e) {
+ builder.SetError(CurrentExceptionMessage());
+ }
+};
-}
+} // namespace
REGISTER_MODULES(TPireModule)