diff options
author | Denis Khalikov <dennis.khalikov@gmail.com> | 2025-04-16 16:45:01 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-04-16 16:45:01 +0300 |
commit | a055fbabeecbd6bbb6a14d45ab576dc6c31ad51f (patch) | |
tree | a90b78302a8a05b9654319afc21ff722f47959ab | |
parent | 71fddc7cdb247cc22909d57e73a2e6b6ccb755f0 (diff) | |
download | ydb-a055fbabeecbd6bbb6a14d45ab576dc6c31ad51f.tar.gz |
Add predicate selectivity with Histogram (#14564) (#16982)
-rw-r--r-- | ydb/core/kqp/opt/kqp_statistics_transformer.cpp | 23 | ||||
-rw-r--r-- | ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp | 166 | ||||
-rw-r--r-- | ydb/library/yql/dq/opt/dq_opt_stat.h | 3 |
3 files changed, 178 insertions, 14 deletions
diff --git a/ydb/core/kqp/opt/kqp_statistics_transformer.cpp b/ydb/core/kqp/opt/kqp_statistics_transformer.cpp index fa9a7918897..b14b7ce27d2 100644 --- a/ydb/core/kqp/opt/kqp_statistics_transformer.cpp +++ b/ydb/core/kqp/opt/kqp_statistics_transformer.cpp @@ -495,7 +495,14 @@ public: size_t listSize = listPtr->ChildrenSize(); if (listSize == 3) { TString compSign = TString(listPtr->Child(0)->Content()); - TString attr = TString(listPtr->Child(1)->Content()); + auto left = listPtr->ChildPtr(1); + auto right = listPtr->ChildPtr(2); + if (IsConstantExpr(left) && OlapOppositeCompSigns.contains(compSign)) { + compSign = OlapOppositeCompSigns[compSign]; + std::swap(left, right); + } + + TString attr = TString(left->Content()); TExprContext dummyCtx; TPositionHandle dummyPos; @@ -511,12 +518,12 @@ public: .Name().Build(attr) .Done(); - auto value = TExprBase(listPtr->ChildPtr(2)); + auto value = TExprBase(right); if (listPtr->ChildPtr(2)->ChildrenSize() >= 2 && listPtr->ChildPtr(2)->ChildPtr(0)->Content() == "just") { value = TExprBase(listPtr->ChildPtr(2)->ChildPtr(1)); } if (OlapCompSigns.contains(compSign)) { - resSelectivity = this->ComputeComparisonSelectivity(member, value); + resSelectivity = this->ComputeInequalitySelectivity(member, value, OlapCompStrToEInequalityPredicate[compSign]); } else if (compSign == "eq") { resSelectivity = this->ComputeEqualitySelectivity(member, value); } else if (compSign == "neq") { @@ -549,6 +556,16 @@ private: "starts_with", "ends_with" }; + + THashMap<TString, EInequalityPredicateType> OlapCompStrToEInequalityPredicate = { + {"lt", EInequalityPredicateType::Less}, + {"lte", EInequalityPredicateType::LessOrEqual}, + {"gt", EInequalityPredicateType::GreaterOrEqual}, + {"gte", EInequalityPredicateType::GreaterOrEqual}, + }; + + THashMap<TString, TString> OlapOppositeCompSigns = {{"lt", "gt"}, {"lte", "gte"}, {"gt", "lt"}, + {"gte", "lte"}, {"eq", "neq"}, {"neq", "eq"}}; }; void InferStatisticsForOlapFilter(const TExprNode::TPtr& input, TTypeAnnotationContext* typeCtx) { diff --git a/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp b/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp index df9b2f5255d..01ddc779a0b 100644 --- a/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp @@ -11,7 +11,14 @@ namespace { using namespace NYql::NDq; THashSet<TString> PgInequalityPreds = { - "<", "<=", ">", ">="}; + "<", "<=", ">", ">=", "="}; + + THashMap<TString, EInequalityPredicateType> StringToInequalityPredicateMap{ + {"<", EInequalityPredicateType::Less}, + {"<=", EInequalityPredicateType::LessOrEqual}, + {">", EInequalityPredicateType::Greater}, + {">=", EInequalityPredicateType::GreaterOrEqual}, + {"=", EInequalityPredicateType::Equal}}; /** * Check if a callable is an attribute of some table @@ -56,8 +63,81 @@ namespace { } } - std::optional<ui32> EstimateCountMin(NYql::NNodes::TExprBase maybeLiteral, TString columnType, const std::shared_ptr<NKikimr::TCountMinSketch>& countMinSketch) { - if (auto maybeJust = maybeLiteral.Maybe<NYql::NNodes::TCoJust>() ) { + // Estimates number of rows based on histogram and predicate type. + template <typename T> + std::optional<ui64> EstimateInequalityPredicateByType(const std::shared_ptr<NKikimr::TEqWidthHistogramEstimator>& estimator, T val, + EInequalityPredicateType predicate) { + switch (predicate) { + case EInequalityPredicateType::Less: + return estimator->EstimateLess<T>(val); + case EInequalityPredicateType::LessOrEqual: + return estimator->EstimateLessOrEqual<T>(val); + case EInequalityPredicateType::Greater: + return estimator->EstimateGreater<T>(val); + case EInequalityPredicateType::GreaterOrEqual: + return estimator->EstimateGreaterOrEqual<T>(val); + case EInequalityPredicateType::Equal: + return estimator->EstimateEqual<T>(val); + } + return std::nullopt; + } + + // Returns an opposite predicate. + EInequalityPredicateType GetOppositePredicateType(EInequalityPredicateType predicate) { + switch (predicate) { + case EInequalityPredicateType::Less: + return EInequalityPredicateType::Greater; + case EInequalityPredicateType::Greater: + return EInequalityPredicateType::Less; + case EInequalityPredicateType::LessOrEqual: + return EInequalityPredicateType::GreaterOrEqual; + case EInequalityPredicateType::GreaterOrEqual: + return EInequalityPredicateType::LessOrEqual; + default: + Y_ABORT(); + } + } + + // Returns a number of rows based on predicate. + std::optional<ui64> EstimateInequalityPredicateByHistogram(NYql::NNodes::TExprBase maybeLiteral, const TString& columnType, + const std::shared_ptr<NKikimr::TEqWidthHistogramEstimator>& estimator, + EInequalityPredicateType predicate) { + if (auto maybeJust = maybeLiteral.Maybe<NYql::NNodes::TCoJust>()) { + maybeLiteral = maybeJust.Cast().Input(); + } + + if (maybeLiteral.Maybe<NYql::NNodes::TCoDataCtor>()) { + auto literal = maybeLiteral.Maybe<NYql::NNodes::TCoDataCtor>().Cast(); + auto value = literal.Literal().Value(); + if (columnType == "Uint32") { + ui32 val = FromString<ui32>(value); + return EstimateInequalityPredicateByType<ui32>(estimator, val, predicate); + } else if (columnType == "Int32") { + i32 val = FromString<i32>(value); + return EstimateInequalityPredicateByType<ui32>(estimator, val, predicate); + } else if (columnType == "Uint64") { + ui64 val = FromString<ui64>(value); + return EstimateInequalityPredicateByType<ui64>(estimator, val, predicate); + } else if (columnType == "Int64") { + i64 val = FromString<i64>(value); + return EstimateInequalityPredicateByType<i64>(estimator, val, predicate); + } else if (columnType == "Double") { + double val = FromString<double>(value); + return EstimateInequalityPredicateByType<double>(estimator, val, predicate); + } else if (columnType == "Date") { + ui16 val = FromString<ui16>(value); + return EstimateInequalityPredicateByType<ui16>(estimator, val, predicate); + } + // TODO: Add support for other types. + return std::nullopt; + } + + return std::nullopt; + } + + std::optional<ui32> EstimateCountMin(NYql::NNodes::TExprBase maybeLiteral, TString columnType, + const std::shared_ptr<NKikimr::TCountMinSketch>& countMinSketch) { + if (auto maybeJust = maybeLiteral.Maybe<NYql::NNodes::TCoJust>()) { maybeLiteral = maybeJust.Cast().Input(); } @@ -109,13 +189,12 @@ namespace { } else if (columnType == "Uuid") { const ui64* uuidData = reinterpret_cast<const ui64*>(value.data()); std::pair<ui64, ui64> v{}; - v.first = uuidData[0]; // low128 - v.second = uuidData[1]; // high128 + v.first = uuidData[0]; // low128 + v.second = uuidData[1]; // high128 return countMinSketch->Probe(reinterpret_cast<const char*>(&v), sizeof(v)); } else { return std::nullopt; } - } return std::nullopt; @@ -138,6 +217,43 @@ TExprNode::TPtr FindNode(const TExprBase& input) { return nullptr; } +double NYql::NDq::TPredicateSelectivityComputer::ComputeInequalitySelectivity(const TExprBase& left, const TExprBase& right, + EInequalityPredicateType predicate) { + if (IsAttribute(right) && IsConstantExprWithParams(left.Ptr())) { + return ComputeInequalitySelectivity(right, left, GetOppositePredicateType(predicate)); + } + + if (auto maybeMember = IsAttribute(left)) { + // It seems like this is not possible in current version. + if (IsAttribute(right)) { + return 0.3; + } else if (IsConstantExprWithParams(right.Ptr())) { + const TString attributeName = maybeMember.Get()->Name().StringValue(); + if (!IsConstantExpr(right.Ptr())) { + return DefaultSelectivity(Stats, attributeName); + } + + if (!Stats || !Stats->ColumnStatistics) { + return DefaultSelectivity(Stats, attributeName); + } + + if (auto histogramEstimator = Stats->ColumnStatistics->Data[attributeName].EqWidthHistogramEstimator) { + const auto columnType = Stats->ColumnStatistics->Data[attributeName].Type; + std::optional<ui64> estimation = EstimateInequalityPredicateByHistogram(right, columnType, histogramEstimator, predicate); + if (!estimation.has_value()) { + return DefaultSelectivity(Stats, attributeName); + } + // Should we compare the number of rows in histogram against `Nrows` and adjust `value` based on that. + Y_ASSERT(Stats->Nrows); + return estimation.value() / Stats->Nrows; + } + return 0.5; + } + } + + return 1.0; +} + double NYql::NDq::TPredicateSelectivityComputer::ComputeEqualitySelectivity(const TExprBase& left, const TExprBase& right) { if (IsAttribute(right) && IsConstantExprWithParams(left.Ptr())) { return ComputeEqualitySelectivity(right, left); @@ -169,7 +285,7 @@ double NYql::NDq::TPredicateSelectivityComputer::ComputeEqualitySelectivity(cons if (auto countMinSketch = Stats->ColumnStatistics->Data[attributeName].CountMinSketch; countMinSketch != nullptr) { auto columnType = Stats->ColumnStatistics->Data[attributeName].Type; - std::optional<ui32> countMinEstimation = EstimateCountMin(right, columnType, countMinSketch); + std::optional<ui32> countMinEstimation = EstimateCountMin(right, columnType, countMinSketch); if (!countMinEstimation.has_value()) { return DefaultSelectivity(Stats, attributeName); } @@ -271,6 +387,34 @@ double TPredicateSelectivityComputer::Compute( resSelectivity = ComputeEqualitySelectivity(left, right); } + else if (auto less = input.Maybe<TCoCmpLess>()) { + auto left = less.Cast().Left(); + auto right = less.Cast().Right(); + + resSelectivity = ComputeInequalitySelectivity(left, right, EInequalityPredicateType::Less); + } + + else if (auto less = input.Maybe<TCoCmpGreater>()) { + auto left = less.Cast().Left(); + auto right = less.Cast().Right(); + + resSelectivity = ComputeInequalitySelectivity(left, right, EInequalityPredicateType::Greater); + } + + else if (auto less = input.Maybe<TCoCmpLessOrEqual>()) { + auto left = less.Cast().Left(); + auto right = less.Cast().Right(); + + resSelectivity = ComputeInequalitySelectivity(left, right, EInequalityPredicateType::LessOrEqual); + } + + else if (auto less = input.Maybe<TCoCmpGreaterOrEqual>()) { + auto left = less.Cast().Left(); + auto right = less.Cast().Right(); + + resSelectivity = ComputeInequalitySelectivity(left, right, EInequalityPredicateType::GreaterOrEqual); + } + else if (input.Ptr()->IsCallable("PgResolvedOp") && input.Ptr()->ChildPtr(0)->Content()=="=") { auto left = TExprBase(input.Ptr()->ChildPtr(2)); auto right = TExprBase(input.Ptr()->ChildPtr(3)); @@ -306,17 +450,17 @@ double TPredicateSelectivityComputer::Compute( else if (input.Ptr()->IsCallable("PgResolvedOp") && PgInequalityPreds.contains(input.Ptr()->ChildPtr(0)->Content())){ auto left = TExprBase(input.Ptr()->ChildPtr(2)); auto right = TExprBase(input.Ptr()->ChildPtr(3)); - - resSelectivity = ComputeComparisonSelectivity(left, right); + resSelectivity = + ComputeInequalitySelectivity(left, right, StringToInequalityPredicateMap[input.Ptr()->ChildPtr(0)->Content()]); } // Process SqlIn - else if(input.Ptr()->IsCallable("SqlIn")) { + else if (input.Ptr()->IsCallable("SqlIn")) { auto list = input.Ptr()->ChildPtr(0); double tmpSelectivity = 0.0; auto lhs = TExprBase(input.Ptr()->ChildPtr(1)); - for (const auto& child: list->Children()) { + for (const auto& child : list->Children()) { TExprBase rhs = TExprBase(child); tmpSelectivity += ComputeEqualitySelectivity(lhs, rhs); } diff --git a/ydb/library/yql/dq/opt/dq_opt_stat.h b/ydb/library/yql/dq/opt/dq_opt_stat.h index 670f30c185f..d8ded84ae05 100644 --- a/ydb/library/yql/dq/opt/dq_opt_stat.h +++ b/ydb/library/yql/dq/opt/dq_opt_stat.h @@ -6,6 +6,7 @@ #include <yql/essentials/core/cbo/cbo_optimizer_new.h> namespace NYql::NDq { +enum class EInequalityPredicateType : ui8 { Less, LessOrEqual, Greater, GreaterOrEqual, Equal }; void InferStatisticsForFlatMap(const TExprNode::TPtr& input, TTypeAnnotationContext* typeCtx); void InferStatisticsForFilter(const TExprNode::TPtr& input, TTypeAnnotationContext* typeCtx); @@ -83,6 +84,8 @@ public: protected: double ComputeEqualitySelectivity(const NYql::NNodes::TExprBase& left, const NYql::NNodes::TExprBase& right); + double ComputeInequalitySelectivity(const NYql::NNodes::TExprBase& left, const NYql::NNodes::TExprBase& right, + EInequalityPredicateType predicate); double ComputeComparisonSelectivity(const NYql::NNodes::TExprBase& left, const NYql::NNodes::TExprBase& right); |