diff options
author | aneporada <[email protected]> | 2022-08-01 17:22:22 +0300 |
---|---|---|
committer | aneporada <[email protected]> | 2022-08-01 17:22:22 +0300 |
commit | 23b1156d8df573e6efc5c30099b7411bca59e1ad (patch) | |
tree | ad5a3f9b64540b8e3e9f9507e050abc74c46f8fe | |
parent | b59fea3f5d37f9694d59d9088c786bd8713d25d4 (diff) |
[] ExtractMembers optimizer for S3
6 files changed, 180 insertions, 108 deletions
diff --git a/ydb/library/yql/core/yql_opt_utils.cpp b/ydb/library/yql/core/yql_opt_utils.cpp index 9fc315badd0..9d88389d124 100644 --- a/ydb/library/yql/core/yql_opt_utils.cpp +++ b/ydb/library/yql/core/yql_opt_utils.cpp @@ -1407,4 +1407,60 @@ TStringBuf GetEmptyCollectionName(ETypeAnnotationKind kind) { return {}; } +namespace { + +ui8 GetTypeWeight(const TTypeAnnotationNode& type) { + switch (type.GetKind()) { + case ETypeAnnotationKind::Data: + switch (type.Cast<TDataExprType>()->GetSlot()) { + case NUdf::EDataSlot::Bool: + case NUdf::EDataSlot::Int8: + case NUdf::EDataSlot::Uint8: return 1; + + case NUdf::EDataSlot::Int16: + case NUdf::EDataSlot::Uint16: + case NUdf::EDataSlot::Date: return 2; + + case NUdf::EDataSlot::TzDate: return 3; + + case NUdf::EDataSlot::Int32: + case NUdf::EDataSlot::Uint32: + case NUdf::EDataSlot::Float: + case NUdf::EDataSlot::Datetime: return 4; + + case NUdf::EDataSlot::TzDatetime: return 5; + + case NUdf::EDataSlot::Int64: + case NUdf::EDataSlot::Uint64: + case NUdf::EDataSlot::Double: + case NUdf::EDataSlot::Timestamp: + case NUdf::EDataSlot::Interval: return 8; + + case NUdf::EDataSlot::TzTimestamp: return 9; + + case NUdf::EDataSlot::Decimal: return 15; + case NUdf::EDataSlot::Uuid: return 16; + + default: return 32; + } + case ETypeAnnotationKind::Optional: return 1 + GetTypeWeight(*type.Cast<TOptionalExprType>()->GetItemType()); + default: return 255; + } +} + +} // namespace + +const TItemExprType* GetLightColumn(const TStructExprType& type) { + ui8 weight = 255; + const TItemExprType* field = nullptr; + for (const auto& item : type.GetItems()) { + + if (const auto w = GetTypeWeight(*item->GetItemType()); w < weight) { + weight = w; + field = item; + } + } + return field; +} + } diff --git a/ydb/library/yql/core/yql_opt_utils.h b/ydb/library/yql/core/yql_opt_utils.h index eee4574bf4e..9f5d41a1a82 100644 --- a/ydb/library/yql/core/yql_opt_utils.h +++ b/ydb/library/yql/core/yql_opt_utils.h @@ -107,4 +107,6 @@ inline TStringBuf GetEmptyCollectionName(const TTypeAnnotationNode* type) { return GetEmptyCollectionName(type->GetKind()); } +const TItemExprType* GetLightColumn(const TStructExprType& type); + } diff --git a/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_physical_opt.cpp b/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_physical_opt.cpp index c37259936b7..229c80091a3 100644 --- a/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_physical_opt.cpp +++ b/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_physical_opt.cpp @@ -7,6 +7,7 @@ #include <ydb/library/yql/providers/common/provider/yql_data_provider_impl.h> #include <ydb/library/yql/providers/common/transform/yql_optimize.h> #include <ydb/library/yql/core/expr_nodes/yql_expr_nodes.h> +#include <ydb/library/yql/core/yql_opt_utils.h> #include <ydb/library/yql/utils/log/log.h> @@ -16,58 +17,6 @@ using namespace NNodes; namespace { -ui8 GetTypeWeight(const TTypeAnnotationNode& type) { - switch (type.GetKind()) { - case ETypeAnnotationKind::Data: - switch (type.Cast<TDataExprType>()->GetSlot()) { - case NUdf::EDataSlot::Bool: - case NUdf::EDataSlot::Int8: - case NUdf::EDataSlot::Uint8: return 1; - - case NUdf::EDataSlot::Int16: - case NUdf::EDataSlot::Uint16: - case NUdf::EDataSlot::Date: return 2; - - case NUdf::EDataSlot::TzDate: return 3; - - case NUdf::EDataSlot::Int32: - case NUdf::EDataSlot::Uint32: - case NUdf::EDataSlot::Float: - case NUdf::EDataSlot::Datetime: return 4; - - case NUdf::EDataSlot::TzDatetime: return 5; - - case NUdf::EDataSlot::Int64: - case NUdf::EDataSlot::Uint64: - case NUdf::EDataSlot::Double: - case NUdf::EDataSlot::Timestamp: - case NUdf::EDataSlot::Interval: return 8; - - case NUdf::EDataSlot::TzTimestamp: return 9; - - case NUdf::EDataSlot::Decimal: return 15; - case NUdf::EDataSlot::Uuid: return 16; - - default: return 32; - } - case ETypeAnnotationKind::Optional: return 1 + GetTypeWeight(*type.Cast<TOptionalExprType>()->GetItemType()); - default: return 255; - } -} - -TStringBuf GetLightColumn(const TStructExprType& type) { - ui8 weight = 255; - const TItemExprType* field = nullptr; - for (const auto& item : type.GetItems()) { - - if (const auto w = GetTypeWeight(*item->GetItemType()); w < weight) { - weight = w; - field = item; - } - } - return field->GetName(); -} - class TClickHousePhysicalOptProposalTransformer : public TOptimizeTransformerBase { public: TClickHousePhysicalOptProposalTransformer(TClickHouseState::TPtr state) @@ -86,7 +35,8 @@ public: if (!wide.Cast().Ref().GetTypeAnn()->Cast<TFlowExprType>()->GetItemType()->Cast<TMultiExprType>()->GetSize()) { const auto& read = maybe.Cast(); const auto structType = State_->Tables[std::make_pair(read.DataSource().Cluster().Value(), read.Table().Value())].ItemType; - auto columns = ctx.NewList(read.Pos(), {ctx.NewAtom(read.Pos(), GetLightColumn(*structType))}); + YQL_ENSURE(structType->GetSize()); + auto columns = ctx.NewList(read.Pos(), {ctx.NewAtom(read.Pos(), GetLightColumn(*structType)->GetName())}); return Build<TCoNarrowMap>(ctx, narrow.Cast().Pos()) .Input<TDqReadWideWrap>() .InitFrom(wide.Cast()) diff --git a/ydb/library/yql/providers/common/mkql/parser.cpp b/ydb/library/yql/providers/common/mkql/parser.cpp index 69a9fa37cc0..426eb84498d 100644 --- a/ydb/library/yql/providers/common/mkql/parser.cpp +++ b/ydb/library/yql/providers/common/mkql/parser.cpp @@ -122,10 +122,15 @@ TRuntimeNode BuildParseCall( const auto structType = static_cast<const TStructType*>(outputItemType); if (format == "raw") { - MKQL_ENSURE(1U == structType->GetMembersCount(), "Expected single column."); - bool isOptional; - const auto schemeType = UnpackOptionalData(structType->GetMemberType(0U), isOptional)->GetSchemeType(); + MKQL_ENSURE(1U >= structType->GetMembersCount(), "Expected at most one column."); auto parseLambda = [&](TRuntimeNode item) { + if (structType->GetMembersCount() == 0) { + return ctx.ProgramBuilder.NewStruct(outputItemType, {}); + } + + bool isOptional; + const auto schemeType = UnpackOptionalData(structType->GetMemberType(0U), isOptional)->GetSchemeType(); + TRuntimeNode converted; if (NUdf::TDataType<const char*>::Id == schemeType) { converted = isOptional ? ctx.ProgramBuilder.NewOptional(item) : item; diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_logical_opt.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_logical_opt.cpp index 18ad6fb96c6..1ffb82a4b09 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_logical_opt.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_logical_opt.cpp @@ -182,6 +182,7 @@ public: AddHandler(0, &TCoLeft::Match, HNDL(TrimReadWorld)); AddHandler(0, &TCoFlatMapBase::Match, HNDL(TryPrunePaths)); AddHandler(0, &TDqSourceWrap::Match, HNDL(ApplyPrunedPath)); + AddHandler(0, &TCoExtractMembers::Match, HNDL(ExtractMembersOverDqSource)); #undef HNDL } @@ -294,6 +295,114 @@ public: return node; } + TMaybeNode<TExprBase> ExtractMembersOverDqSource(TExprBase node, TExprContext& ctx) const { + const auto& extract = node.Cast<TCoExtractMembers>(); + const auto& maybeDqSource = extract.Input().Maybe<TDqSourceWrap>(); + if (!maybeDqSource) { + return node; + } + + const auto& dqSource = maybeDqSource.Cast(); + if (dqSource.DataSource().Category() != S3ProviderName) { + return node; + } + + const auto& maybeS3SourceSettings = dqSource.Input().Maybe<TS3SourceSettingsBase>(); + if (!maybeS3SourceSettings) { + return node; + } + + TSet<TStringBuf> extractMembers; + for (auto member : extract.Members()) { + extractMembers.insert(member.Value()); + } + + TMaybeNode<TExprBase> settings = dqSource.Settings(); + + TMaybeNode<TExprBase> newSettings = settings; + TExprNode::TPtr newPaths = maybeS3SourceSettings.Cast().Paths().Ptr(); + + if (settings) { + if (auto prunedPaths = GetSetting(settings.Cast().Ref(), "prunedPaths")) { + if (prunedPaths->ChildrenSize() > 1) { + // pruning in progress + return node; + } + } + + if (auto extraColumnsSetting = GetSetting(settings.Cast().Ref(), "extraColumns")) { + const TStructExprType* extraType = extraColumnsSetting->Tail().GetTypeAnn()->Cast<TListExprType>()->GetItemType()->Cast<TStructExprType>(); + auto extraTypeItems = extraType->GetItems(); + EraseIf(extraTypeItems, [&](const TItemExprType* item) { return !extractMembers.contains(item->GetName()); }); + if (extraTypeItems.size() < extraType->GetSize()) { + auto originalPaths = maybeS3SourceSettings.Cast().Paths().Ptr(); + auto originalExtra = extraColumnsSetting->TailPtr(); + YQL_ENSURE(originalExtra->IsCallable("AsList")); + YQL_ENSURE(originalPaths->IsList()); + YQL_ENSURE(originalPaths->ChildrenSize() == originalExtra->ChildrenSize()); + + TExprNodeList newPathItems; + TExprNodeList newExtraColumnsItems; + + for (const auto& path : maybeS3SourceSettings.Cast().Paths()) { + auto extra = path.ExtraColumns(); + YQL_ENSURE(TCoAsStruct::Match(extra.Raw())); + TExprNodeList children = extra.Ref().ChildrenList(); + EraseIf(children, [&](const TExprNode::TPtr& child) { return !extractMembers.contains(child->Head().Content()); }); + auto newStruct = ctx.ChangeChildren(extra.Ref(), std::move(children)); + newExtraColumnsItems.push_back(newStruct); + newPathItems.push_back(ctx.ChangeChild(path.Ref(), TS3Path::idx_ExtraColumns, std::move(newStruct))); + } + + newPaths = ctx.ChangeChildren(maybeS3SourceSettings.Cast().Paths().Ref(), std::move(newPathItems)); + TExprNode::TPtr newExtra = ctx.ChangeChildren(*originalExtra, std::move(newExtraColumnsItems)); + newSettings = TExprBase(extraTypeItems.empty() ? RemoveSetting(settings.Cast().Ref(), "extraColumns", ctx) : + ReplaceSetting(settings.Cast().Ref(), extraColumnsSetting->Pos(), "extraColumns", newExtra, ctx)); + } + } + + } + + const TStructExprType* outputRowType = node.Ref().GetTypeAnn()->Cast<TListExprType>()->GetItemType()->Cast<TStructExprType>(); + const TExprNode::TPtr outputRowTypeNode = ExpandType(dqSource.RowType().Pos(), *outputRowType, ctx); + + YQL_CLOG(INFO, ProviderS3) << "ExtractMembers over DqSource with " << maybeS3SourceSettings.Cast().CallableName(); + + if (maybeS3SourceSettings.Cast().CallableName() == TS3SourceSettings::CallableName()) { + return Build<TDqSourceWrap>(ctx, dqSource.Pos()) + .InitFrom(dqSource) + .Input<TS3SourceSettings>() + .InitFrom(dqSource.Input().Maybe<TS3SourceSettings>().Cast()) + .Paths(newPaths) + .Build() + .RowType(outputRowTypeNode) + .Settings(newSettings) + .Done(); + } + + const TStructExprType* readRowType = + dqSource.Input().Maybe<TS3ParseSettings>().Cast().RowType().Ref().GetTypeAnn()->Cast<TTypeExprType>()->GetType()->Cast<TStructExprType>(); + + if (outputRowType->GetSize() == 0 && readRowType->GetSize() != 0) { + auto item = GetLightColumn(*readRowType); + YQL_ENSURE(item); + readRowType = ctx.MakeType<TStructExprType>(TVector<const TItemExprType*>{item}); + } else { + readRowType = outputRowType; + } + + return Build<TDqSourceWrap>(ctx, dqSource.Pos()) + .InitFrom(dqSource) + .Input<TS3ParseSettings>() + .InitFrom(dqSource.Input().Maybe<TS3ParseSettings>().Cast()) + .Paths(newPaths) + .RowType(ExpandType(dqSource.Input().Pos(), *readRowType, ctx)) + .Build() + .RowType(outputRowTypeNode) + .Settings(newSettings) + .Done(); + } + private: const TS3State::TPtr State_; }; diff --git a/ydb/library/yql/providers/ydb/provider/yql_ydb_physical_opt.cpp b/ydb/library/yql/providers/ydb/provider/yql_ydb_physical_opt.cpp index 0ca6ab90782..a59751a8e05 100644 --- a/ydb/library/yql/providers/ydb/provider/yql_ydb_physical_opt.cpp +++ b/ydb/library/yql/providers/ydb/provider/yql_ydb_physical_opt.cpp @@ -8,6 +8,7 @@ #include <ydb/library/yql/providers/common/provider/yql_data_provider_impl.h> #include <ydb/library/yql/providers/common/transform/yql_optimize.h> #include <ydb/library/yql/core/expr_nodes/yql_expr_nodes.h> +#include <ydb/library/yql/core/yql_opt_utils.h> #include <ydb/library/yql/utils/log/log.h> @@ -17,58 +18,6 @@ using namespace NNodes; namespace { -ui8 GetTypeWeight(const TTypeAnnotationNode& type) { - switch (type.GetKind()) { - case ETypeAnnotationKind::Data: - switch (type.Cast<TDataExprType>()->GetSlot()) { - case NUdf::EDataSlot::Bool: - case NUdf::EDataSlot::Int8: - case NUdf::EDataSlot::Uint8: return 1; - - case NUdf::EDataSlot::Int16: - case NUdf::EDataSlot::Uint16: - case NUdf::EDataSlot::Date: return 2; - - case NUdf::EDataSlot::TzDate: return 3; - - case NUdf::EDataSlot::Int32: - case NUdf::EDataSlot::Uint32: - case NUdf::EDataSlot::Float: - case NUdf::EDataSlot::Datetime: return 4; - - case NUdf::EDataSlot::TzDatetime: return 5; - - case NUdf::EDataSlot::Int64: - case NUdf::EDataSlot::Uint64: - case NUdf::EDataSlot::Double: - case NUdf::EDataSlot::Timestamp: - case NUdf::EDataSlot::Interval: return 8; - - case NUdf::EDataSlot::TzTimestamp: return 9; - - case NUdf::EDataSlot::Decimal: return 15; - case NUdf::EDataSlot::Uuid: return 16; - - default: return 32; - } - case ETypeAnnotationKind::Optional: return 1 + GetTypeWeight(*type.Cast<TOptionalExprType>()->GetItemType()); - default: return 255; - } -} - -const TItemExprType* GetLightColumn(const TStructExprType& type) { - ui8 weight = 255; - const TItemExprType* field = nullptr; - for (const auto& item : type.GetItems()) { - - if (const auto w = GetTypeWeight(*item->GetItemType()); w < weight) { - weight = w; - field = item; - } - } - return field; -} - class TYdbPhysicalOptProposalTransformer : public TOptimizeTransformerBase { public: TYdbPhysicalOptProposalTransformer(TYdbState::TPtr state) @@ -119,6 +68,7 @@ public: if (!wide.Cast().Ref().GetTypeAnn()->Cast<TFlowExprType>()->GetItemType()->Cast<TMultiExprType>()->GetSize()) { const auto& read = maybe.Cast(); const auto structType = State_->Tables[std::make_pair(read.DataSource().Cluster().StringValue(), read.Table().StringValue())].ItemType; + YQL_ENSURE(structType->GetSize()); auto columns = ctx.NewList(read.Pos(), {ctx.NewAtom(read.Pos(), GetLightColumn(*structType)->GetName())}); return Build<TCoNarrowMap>(ctx, narrow.Cast().Pos()) .Input<TDqReadWideWrap>() |