summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoraneporada <[email protected]>2022-08-01 17:22:22 +0300
committeraneporada <[email protected]>2022-08-01 17:22:22 +0300
commit23b1156d8df573e6efc5c30099b7411bca59e1ad (patch)
treead5a3f9b64540b8e3e9f9507e050abc74c46f8fe
parentb59fea3f5d37f9694d59d9088c786bd8713d25d4 (diff)
[] ExtractMembers optimizer for S3
-rw-r--r--ydb/library/yql/core/yql_opt_utils.cpp56
-rw-r--r--ydb/library/yql/core/yql_opt_utils.h2
-rw-r--r--ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_physical_opt.cpp56
-rw-r--r--ydb/library/yql/providers/common/mkql/parser.cpp11
-rw-r--r--ydb/library/yql/providers/s3/provider/yql_s3_logical_opt.cpp109
-rw-r--r--ydb/library/yql/providers/ydb/provider/yql_ydb_physical_opt.cpp54
6 files changed, 180 insertions, 108 deletions
diff --git a/ydb/library/yql/core/yql_opt_utils.cpp b/ydb/library/yql/core/yql_opt_utils.cpp
index 9fc315badd0..9d88389d124 100644
--- a/ydb/library/yql/core/yql_opt_utils.cpp
+++ b/ydb/library/yql/core/yql_opt_utils.cpp
@@ -1407,4 +1407,60 @@ TStringBuf GetEmptyCollectionName(ETypeAnnotationKind kind) {
return {};
}
+namespace {
+
+ui8 GetTypeWeight(const TTypeAnnotationNode& type) {
+ switch (type.GetKind()) {
+ case ETypeAnnotationKind::Data:
+ switch (type.Cast<TDataExprType>()->GetSlot()) {
+ case NUdf::EDataSlot::Bool:
+ case NUdf::EDataSlot::Int8:
+ case NUdf::EDataSlot::Uint8: return 1;
+
+ case NUdf::EDataSlot::Int16:
+ case NUdf::EDataSlot::Uint16:
+ case NUdf::EDataSlot::Date: return 2;
+
+ case NUdf::EDataSlot::TzDate: return 3;
+
+ case NUdf::EDataSlot::Int32:
+ case NUdf::EDataSlot::Uint32:
+ case NUdf::EDataSlot::Float:
+ case NUdf::EDataSlot::Datetime: return 4;
+
+ case NUdf::EDataSlot::TzDatetime: return 5;
+
+ case NUdf::EDataSlot::Int64:
+ case NUdf::EDataSlot::Uint64:
+ case NUdf::EDataSlot::Double:
+ case NUdf::EDataSlot::Timestamp:
+ case NUdf::EDataSlot::Interval: return 8;
+
+ case NUdf::EDataSlot::TzTimestamp: return 9;
+
+ case NUdf::EDataSlot::Decimal: return 15;
+ case NUdf::EDataSlot::Uuid: return 16;
+
+ default: return 32;
+ }
+ case ETypeAnnotationKind::Optional: return 1 + GetTypeWeight(*type.Cast<TOptionalExprType>()->GetItemType());
+ default: return 255;
+ }
+}
+
+} // namespace
+
+const TItemExprType* GetLightColumn(const TStructExprType& type) {
+ ui8 weight = 255;
+ const TItemExprType* field = nullptr;
+ for (const auto& item : type.GetItems()) {
+
+ if (const auto w = GetTypeWeight(*item->GetItemType()); w < weight) {
+ weight = w;
+ field = item;
+ }
+ }
+ return field;
+}
+
}
diff --git a/ydb/library/yql/core/yql_opt_utils.h b/ydb/library/yql/core/yql_opt_utils.h
index eee4574bf4e..9f5d41a1a82 100644
--- a/ydb/library/yql/core/yql_opt_utils.h
+++ b/ydb/library/yql/core/yql_opt_utils.h
@@ -107,4 +107,6 @@ inline TStringBuf GetEmptyCollectionName(const TTypeAnnotationNode* type) {
return GetEmptyCollectionName(type->GetKind());
}
+const TItemExprType* GetLightColumn(const TStructExprType& type);
+
}
diff --git a/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_physical_opt.cpp b/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_physical_opt.cpp
index c37259936b7..229c80091a3 100644
--- a/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_physical_opt.cpp
+++ b/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_physical_opt.cpp
@@ -7,6 +7,7 @@
#include <ydb/library/yql/providers/common/provider/yql_data_provider_impl.h>
#include <ydb/library/yql/providers/common/transform/yql_optimize.h>
#include <ydb/library/yql/core/expr_nodes/yql_expr_nodes.h>
+#include <ydb/library/yql/core/yql_opt_utils.h>
#include <ydb/library/yql/utils/log/log.h>
@@ -16,58 +17,6 @@ using namespace NNodes;
namespace {
-ui8 GetTypeWeight(const TTypeAnnotationNode& type) {
- switch (type.GetKind()) {
- case ETypeAnnotationKind::Data:
- switch (type.Cast<TDataExprType>()->GetSlot()) {
- case NUdf::EDataSlot::Bool:
- case NUdf::EDataSlot::Int8:
- case NUdf::EDataSlot::Uint8: return 1;
-
- case NUdf::EDataSlot::Int16:
- case NUdf::EDataSlot::Uint16:
- case NUdf::EDataSlot::Date: return 2;
-
- case NUdf::EDataSlot::TzDate: return 3;
-
- case NUdf::EDataSlot::Int32:
- case NUdf::EDataSlot::Uint32:
- case NUdf::EDataSlot::Float:
- case NUdf::EDataSlot::Datetime: return 4;
-
- case NUdf::EDataSlot::TzDatetime: return 5;
-
- case NUdf::EDataSlot::Int64:
- case NUdf::EDataSlot::Uint64:
- case NUdf::EDataSlot::Double:
- case NUdf::EDataSlot::Timestamp:
- case NUdf::EDataSlot::Interval: return 8;
-
- case NUdf::EDataSlot::TzTimestamp: return 9;
-
- case NUdf::EDataSlot::Decimal: return 15;
- case NUdf::EDataSlot::Uuid: return 16;
-
- default: return 32;
- }
- case ETypeAnnotationKind::Optional: return 1 + GetTypeWeight(*type.Cast<TOptionalExprType>()->GetItemType());
- default: return 255;
- }
-}
-
-TStringBuf GetLightColumn(const TStructExprType& type) {
- ui8 weight = 255;
- const TItemExprType* field = nullptr;
- for (const auto& item : type.GetItems()) {
-
- if (const auto w = GetTypeWeight(*item->GetItemType()); w < weight) {
- weight = w;
- field = item;
- }
- }
- return field->GetName();
-}
-
class TClickHousePhysicalOptProposalTransformer : public TOptimizeTransformerBase {
public:
TClickHousePhysicalOptProposalTransformer(TClickHouseState::TPtr state)
@@ -86,7 +35,8 @@ public:
if (!wide.Cast().Ref().GetTypeAnn()->Cast<TFlowExprType>()->GetItemType()->Cast<TMultiExprType>()->GetSize()) {
const auto& read = maybe.Cast();
const auto structType = State_->Tables[std::make_pair(read.DataSource().Cluster().Value(), read.Table().Value())].ItemType;
- auto columns = ctx.NewList(read.Pos(), {ctx.NewAtom(read.Pos(), GetLightColumn(*structType))});
+ YQL_ENSURE(structType->GetSize());
+ auto columns = ctx.NewList(read.Pos(), {ctx.NewAtom(read.Pos(), GetLightColumn(*structType)->GetName())});
return Build<TCoNarrowMap>(ctx, narrow.Cast().Pos())
.Input<TDqReadWideWrap>()
.InitFrom(wide.Cast())
diff --git a/ydb/library/yql/providers/common/mkql/parser.cpp b/ydb/library/yql/providers/common/mkql/parser.cpp
index 69a9fa37cc0..426eb84498d 100644
--- a/ydb/library/yql/providers/common/mkql/parser.cpp
+++ b/ydb/library/yql/providers/common/mkql/parser.cpp
@@ -122,10 +122,15 @@ TRuntimeNode BuildParseCall(
const auto structType = static_cast<const TStructType*>(outputItemType);
if (format == "raw") {
- MKQL_ENSURE(1U == structType->GetMembersCount(), "Expected single column.");
- bool isOptional;
- const auto schemeType = UnpackOptionalData(structType->GetMemberType(0U), isOptional)->GetSchemeType();
+ MKQL_ENSURE(1U >= structType->GetMembersCount(), "Expected at most one column.");
auto parseLambda = [&](TRuntimeNode item) {
+ if (structType->GetMembersCount() == 0) {
+ return ctx.ProgramBuilder.NewStruct(outputItemType, {});
+ }
+
+ bool isOptional;
+ const auto schemeType = UnpackOptionalData(structType->GetMemberType(0U), isOptional)->GetSchemeType();
+
TRuntimeNode converted;
if (NUdf::TDataType<const char*>::Id == schemeType) {
converted = isOptional ? ctx.ProgramBuilder.NewOptional(item) : item;
diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_logical_opt.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_logical_opt.cpp
index 18ad6fb96c6..1ffb82a4b09 100644
--- a/ydb/library/yql/providers/s3/provider/yql_s3_logical_opt.cpp
+++ b/ydb/library/yql/providers/s3/provider/yql_s3_logical_opt.cpp
@@ -182,6 +182,7 @@ public:
AddHandler(0, &TCoLeft::Match, HNDL(TrimReadWorld));
AddHandler(0, &TCoFlatMapBase::Match, HNDL(TryPrunePaths));
AddHandler(0, &TDqSourceWrap::Match, HNDL(ApplyPrunedPath));
+ AddHandler(0, &TCoExtractMembers::Match, HNDL(ExtractMembersOverDqSource));
#undef HNDL
}
@@ -294,6 +295,114 @@ public:
return node;
}
+ TMaybeNode<TExprBase> ExtractMembersOverDqSource(TExprBase node, TExprContext& ctx) const {
+ const auto& extract = node.Cast<TCoExtractMembers>();
+ const auto& maybeDqSource = extract.Input().Maybe<TDqSourceWrap>();
+ if (!maybeDqSource) {
+ return node;
+ }
+
+ const auto& dqSource = maybeDqSource.Cast();
+ if (dqSource.DataSource().Category() != S3ProviderName) {
+ return node;
+ }
+
+ const auto& maybeS3SourceSettings = dqSource.Input().Maybe<TS3SourceSettingsBase>();
+ if (!maybeS3SourceSettings) {
+ return node;
+ }
+
+ TSet<TStringBuf> extractMembers;
+ for (auto member : extract.Members()) {
+ extractMembers.insert(member.Value());
+ }
+
+ TMaybeNode<TExprBase> settings = dqSource.Settings();
+
+ TMaybeNode<TExprBase> newSettings = settings;
+ TExprNode::TPtr newPaths = maybeS3SourceSettings.Cast().Paths().Ptr();
+
+ if (settings) {
+ if (auto prunedPaths = GetSetting(settings.Cast().Ref(), "prunedPaths")) {
+ if (prunedPaths->ChildrenSize() > 1) {
+ // pruning in progress
+ return node;
+ }
+ }
+
+ if (auto extraColumnsSetting = GetSetting(settings.Cast().Ref(), "extraColumns")) {
+ const TStructExprType* extraType = extraColumnsSetting->Tail().GetTypeAnn()->Cast<TListExprType>()->GetItemType()->Cast<TStructExprType>();
+ auto extraTypeItems = extraType->GetItems();
+ EraseIf(extraTypeItems, [&](const TItemExprType* item) { return !extractMembers.contains(item->GetName()); });
+ if (extraTypeItems.size() < extraType->GetSize()) {
+ auto originalPaths = maybeS3SourceSettings.Cast().Paths().Ptr();
+ auto originalExtra = extraColumnsSetting->TailPtr();
+ YQL_ENSURE(originalExtra->IsCallable("AsList"));
+ YQL_ENSURE(originalPaths->IsList());
+ YQL_ENSURE(originalPaths->ChildrenSize() == originalExtra->ChildrenSize());
+
+ TExprNodeList newPathItems;
+ TExprNodeList newExtraColumnsItems;
+
+ for (const auto& path : maybeS3SourceSettings.Cast().Paths()) {
+ auto extra = path.ExtraColumns();
+ YQL_ENSURE(TCoAsStruct::Match(extra.Raw()));
+ TExprNodeList children = extra.Ref().ChildrenList();
+ EraseIf(children, [&](const TExprNode::TPtr& child) { return !extractMembers.contains(child->Head().Content()); });
+ auto newStruct = ctx.ChangeChildren(extra.Ref(), std::move(children));
+ newExtraColumnsItems.push_back(newStruct);
+ newPathItems.push_back(ctx.ChangeChild(path.Ref(), TS3Path::idx_ExtraColumns, std::move(newStruct)));
+ }
+
+ newPaths = ctx.ChangeChildren(maybeS3SourceSettings.Cast().Paths().Ref(), std::move(newPathItems));
+ TExprNode::TPtr newExtra = ctx.ChangeChildren(*originalExtra, std::move(newExtraColumnsItems));
+ newSettings = TExprBase(extraTypeItems.empty() ? RemoveSetting(settings.Cast().Ref(), "extraColumns", ctx) :
+ ReplaceSetting(settings.Cast().Ref(), extraColumnsSetting->Pos(), "extraColumns", newExtra, ctx));
+ }
+ }
+
+ }
+
+ const TStructExprType* outputRowType = node.Ref().GetTypeAnn()->Cast<TListExprType>()->GetItemType()->Cast<TStructExprType>();
+ const TExprNode::TPtr outputRowTypeNode = ExpandType(dqSource.RowType().Pos(), *outputRowType, ctx);
+
+ YQL_CLOG(INFO, ProviderS3) << "ExtractMembers over DqSource with " << maybeS3SourceSettings.Cast().CallableName();
+
+ if (maybeS3SourceSettings.Cast().CallableName() == TS3SourceSettings::CallableName()) {
+ return Build<TDqSourceWrap>(ctx, dqSource.Pos())
+ .InitFrom(dqSource)
+ .Input<TS3SourceSettings>()
+ .InitFrom(dqSource.Input().Maybe<TS3SourceSettings>().Cast())
+ .Paths(newPaths)
+ .Build()
+ .RowType(outputRowTypeNode)
+ .Settings(newSettings)
+ .Done();
+ }
+
+ const TStructExprType* readRowType =
+ dqSource.Input().Maybe<TS3ParseSettings>().Cast().RowType().Ref().GetTypeAnn()->Cast<TTypeExprType>()->GetType()->Cast<TStructExprType>();
+
+ if (outputRowType->GetSize() == 0 && readRowType->GetSize() != 0) {
+ auto item = GetLightColumn(*readRowType);
+ YQL_ENSURE(item);
+ readRowType = ctx.MakeType<TStructExprType>(TVector<const TItemExprType*>{item});
+ } else {
+ readRowType = outputRowType;
+ }
+
+ return Build<TDqSourceWrap>(ctx, dqSource.Pos())
+ .InitFrom(dqSource)
+ .Input<TS3ParseSettings>()
+ .InitFrom(dqSource.Input().Maybe<TS3ParseSettings>().Cast())
+ .Paths(newPaths)
+ .RowType(ExpandType(dqSource.Input().Pos(), *readRowType, ctx))
+ .Build()
+ .RowType(outputRowTypeNode)
+ .Settings(newSettings)
+ .Done();
+ }
+
private:
const TS3State::TPtr State_;
};
diff --git a/ydb/library/yql/providers/ydb/provider/yql_ydb_physical_opt.cpp b/ydb/library/yql/providers/ydb/provider/yql_ydb_physical_opt.cpp
index 0ca6ab90782..a59751a8e05 100644
--- a/ydb/library/yql/providers/ydb/provider/yql_ydb_physical_opt.cpp
+++ b/ydb/library/yql/providers/ydb/provider/yql_ydb_physical_opt.cpp
@@ -8,6 +8,7 @@
#include <ydb/library/yql/providers/common/provider/yql_data_provider_impl.h>
#include <ydb/library/yql/providers/common/transform/yql_optimize.h>
#include <ydb/library/yql/core/expr_nodes/yql_expr_nodes.h>
+#include <ydb/library/yql/core/yql_opt_utils.h>
#include <ydb/library/yql/utils/log/log.h>
@@ -17,58 +18,6 @@ using namespace NNodes;
namespace {
-ui8 GetTypeWeight(const TTypeAnnotationNode& type) {
- switch (type.GetKind()) {
- case ETypeAnnotationKind::Data:
- switch (type.Cast<TDataExprType>()->GetSlot()) {
- case NUdf::EDataSlot::Bool:
- case NUdf::EDataSlot::Int8:
- case NUdf::EDataSlot::Uint8: return 1;
-
- case NUdf::EDataSlot::Int16:
- case NUdf::EDataSlot::Uint16:
- case NUdf::EDataSlot::Date: return 2;
-
- case NUdf::EDataSlot::TzDate: return 3;
-
- case NUdf::EDataSlot::Int32:
- case NUdf::EDataSlot::Uint32:
- case NUdf::EDataSlot::Float:
- case NUdf::EDataSlot::Datetime: return 4;
-
- case NUdf::EDataSlot::TzDatetime: return 5;
-
- case NUdf::EDataSlot::Int64:
- case NUdf::EDataSlot::Uint64:
- case NUdf::EDataSlot::Double:
- case NUdf::EDataSlot::Timestamp:
- case NUdf::EDataSlot::Interval: return 8;
-
- case NUdf::EDataSlot::TzTimestamp: return 9;
-
- case NUdf::EDataSlot::Decimal: return 15;
- case NUdf::EDataSlot::Uuid: return 16;
-
- default: return 32;
- }
- case ETypeAnnotationKind::Optional: return 1 + GetTypeWeight(*type.Cast<TOptionalExprType>()->GetItemType());
- default: return 255;
- }
-}
-
-const TItemExprType* GetLightColumn(const TStructExprType& type) {
- ui8 weight = 255;
- const TItemExprType* field = nullptr;
- for (const auto& item : type.GetItems()) {
-
- if (const auto w = GetTypeWeight(*item->GetItemType()); w < weight) {
- weight = w;
- field = item;
- }
- }
- return field;
-}
-
class TYdbPhysicalOptProposalTransformer : public TOptimizeTransformerBase {
public:
TYdbPhysicalOptProposalTransformer(TYdbState::TPtr state)
@@ -119,6 +68,7 @@ public:
if (!wide.Cast().Ref().GetTypeAnn()->Cast<TFlowExprType>()->GetItemType()->Cast<TMultiExprType>()->GetSize()) {
const auto& read = maybe.Cast();
const auto structType = State_->Tables[std::make_pair(read.DataSource().Cluster().StringValue(), read.Table().StringValue())].ItemType;
+ YQL_ENSURE(structType->GetSize());
auto columns = ctx.NewList(read.Pos(), {ctx.NewAtom(read.Pos(), GetLightColumn(*structType)->GetName())});
return Build<TCoNarrowMap>(ctx, narrow.Cast().Pos())
.Input<TDqReadWideWrap>()