aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDenis Khalikov <dennis.khalikov@gmail.com>2025-04-16 16:45:01 +0300
committerGitHub <noreply@github.com>2025-04-16 16:45:01 +0300
commita055fbabeecbd6bbb6a14d45ab576dc6c31ad51f (patch)
treea90b78302a8a05b9654319afc21ff722f47959ab
parent71fddc7cdb247cc22909d57e73a2e6b6ccb755f0 (diff)
downloadydb-a055fbabeecbd6bbb6a14d45ab576dc6c31ad51f.tar.gz
Add predicate selectivity with Histogram (#14564) (#16982)
-rw-r--r--ydb/core/kqp/opt/kqp_statistics_transformer.cpp23
-rw-r--r--ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp166
-rw-r--r--ydb/library/yql/dq/opt/dq_opt_stat.h3
3 files changed, 178 insertions, 14 deletions
diff --git a/ydb/core/kqp/opt/kqp_statistics_transformer.cpp b/ydb/core/kqp/opt/kqp_statistics_transformer.cpp
index fa9a7918897..b14b7ce27d2 100644
--- a/ydb/core/kqp/opt/kqp_statistics_transformer.cpp
+++ b/ydb/core/kqp/opt/kqp_statistics_transformer.cpp
@@ -495,7 +495,14 @@ public:
size_t listSize = listPtr->ChildrenSize();
if (listSize == 3) {
TString compSign = TString(listPtr->Child(0)->Content());
- TString attr = TString(listPtr->Child(1)->Content());
+ auto left = listPtr->ChildPtr(1);
+ auto right = listPtr->ChildPtr(2);
+ if (IsConstantExpr(left) && OlapOppositeCompSigns.contains(compSign)) {
+ compSign = OlapOppositeCompSigns[compSign];
+ std::swap(left, right);
+ }
+
+ TString attr = TString(left->Content());
TExprContext dummyCtx;
TPositionHandle dummyPos;
@@ -511,12 +518,12 @@ public:
.Name().Build(attr)
.Done();
- auto value = TExprBase(listPtr->ChildPtr(2));
+ auto value = TExprBase(right);
if (listPtr->ChildPtr(2)->ChildrenSize() >= 2 && listPtr->ChildPtr(2)->ChildPtr(0)->Content() == "just") {
value = TExprBase(listPtr->ChildPtr(2)->ChildPtr(1));
}
if (OlapCompSigns.contains(compSign)) {
- resSelectivity = this->ComputeComparisonSelectivity(member, value);
+ resSelectivity = this->ComputeInequalitySelectivity(member, value, OlapCompStrToEInequalityPredicate[compSign]);
} else if (compSign == "eq") {
resSelectivity = this->ComputeEqualitySelectivity(member, value);
} else if (compSign == "neq") {
@@ -549,6 +556,16 @@ private:
"starts_with",
"ends_with"
};
+
+ THashMap<TString, EInequalityPredicateType> OlapCompStrToEInequalityPredicate = {
+ {"lt", EInequalityPredicateType::Less},
+ {"lte", EInequalityPredicateType::LessOrEqual},
+ {"gt", EInequalityPredicateType::GreaterOrEqual},
+ {"gte", EInequalityPredicateType::GreaterOrEqual},
+ };
+
+ THashMap<TString, TString> OlapOppositeCompSigns = {{"lt", "gt"}, {"lte", "gte"}, {"gt", "lt"},
+ {"gte", "lte"}, {"eq", "neq"}, {"neq", "eq"}};
};
void InferStatisticsForOlapFilter(const TExprNode::TPtr& input, TTypeAnnotationContext* typeCtx) {
diff --git a/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp b/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp
index df9b2f5255d..01ddc779a0b 100644
--- a/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp
+++ b/ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp
@@ -11,7 +11,14 @@ namespace {
using namespace NYql::NDq;
THashSet<TString> PgInequalityPreds = {
- "<", "<=", ">", ">="};
+ "<", "<=", ">", ">=", "="};
+
+ THashMap<TString, EInequalityPredicateType> StringToInequalityPredicateMap{
+ {"<", EInequalityPredicateType::Less},
+ {"<=", EInequalityPredicateType::LessOrEqual},
+ {">", EInequalityPredicateType::Greater},
+ {">=", EInequalityPredicateType::GreaterOrEqual},
+ {"=", EInequalityPredicateType::Equal}};
/**
* Check if a callable is an attribute of some table
@@ -56,8 +63,81 @@ namespace {
}
}
- std::optional<ui32> EstimateCountMin(NYql::NNodes::TExprBase maybeLiteral, TString columnType, const std::shared_ptr<NKikimr::TCountMinSketch>& countMinSketch) {
- if (auto maybeJust = maybeLiteral.Maybe<NYql::NNodes::TCoJust>() ) {
+ // Estimates number of rows based on histogram and predicate type.
+ template <typename T>
+ std::optional<ui64> EstimateInequalityPredicateByType(const std::shared_ptr<NKikimr::TEqWidthHistogramEstimator>& estimator, T val,
+ EInequalityPredicateType predicate) {
+ switch (predicate) {
+ case EInequalityPredicateType::Less:
+ return estimator->EstimateLess<T>(val);
+ case EInequalityPredicateType::LessOrEqual:
+ return estimator->EstimateLessOrEqual<T>(val);
+ case EInequalityPredicateType::Greater:
+ return estimator->EstimateGreater<T>(val);
+ case EInequalityPredicateType::GreaterOrEqual:
+ return estimator->EstimateGreaterOrEqual<T>(val);
+ case EInequalityPredicateType::Equal:
+ return estimator->EstimateEqual<T>(val);
+ }
+ return std::nullopt;
+ }
+
+ // Returns an opposite predicate.
+ EInequalityPredicateType GetOppositePredicateType(EInequalityPredicateType predicate) {
+ switch (predicate) {
+ case EInequalityPredicateType::Less:
+ return EInequalityPredicateType::Greater;
+ case EInequalityPredicateType::Greater:
+ return EInequalityPredicateType::Less;
+ case EInequalityPredicateType::LessOrEqual:
+ return EInequalityPredicateType::GreaterOrEqual;
+ case EInequalityPredicateType::GreaterOrEqual:
+ return EInequalityPredicateType::LessOrEqual;
+ default:
+ Y_ABORT();
+ }
+ }
+
+ // Returns a number of rows based on predicate.
+ std::optional<ui64> EstimateInequalityPredicateByHistogram(NYql::NNodes::TExprBase maybeLiteral, const TString& columnType,
+ const std::shared_ptr<NKikimr::TEqWidthHistogramEstimator>& estimator,
+ EInequalityPredicateType predicate) {
+ if (auto maybeJust = maybeLiteral.Maybe<NYql::NNodes::TCoJust>()) {
+ maybeLiteral = maybeJust.Cast().Input();
+ }
+
+ if (maybeLiteral.Maybe<NYql::NNodes::TCoDataCtor>()) {
+ auto literal = maybeLiteral.Maybe<NYql::NNodes::TCoDataCtor>().Cast();
+ auto value = literal.Literal().Value();
+ if (columnType == "Uint32") {
+ ui32 val = FromString<ui32>(value);
+ return EstimateInequalityPredicateByType<ui32>(estimator, val, predicate);
+ } else if (columnType == "Int32") {
+ i32 val = FromString<i32>(value);
+ return EstimateInequalityPredicateByType<ui32>(estimator, val, predicate);
+ } else if (columnType == "Uint64") {
+ ui64 val = FromString<ui64>(value);
+ return EstimateInequalityPredicateByType<ui64>(estimator, val, predicate);
+ } else if (columnType == "Int64") {
+ i64 val = FromString<i64>(value);
+ return EstimateInequalityPredicateByType<i64>(estimator, val, predicate);
+ } else if (columnType == "Double") {
+ double val = FromString<double>(value);
+ return EstimateInequalityPredicateByType<double>(estimator, val, predicate);
+ } else if (columnType == "Date") {
+ ui16 val = FromString<ui16>(value);
+ return EstimateInequalityPredicateByType<ui16>(estimator, val, predicate);
+ }
+ // TODO: Add support for other types.
+ return std::nullopt;
+ }
+
+ return std::nullopt;
+ }
+
+ std::optional<ui32> EstimateCountMin(NYql::NNodes::TExprBase maybeLiteral, TString columnType,
+ const std::shared_ptr<NKikimr::TCountMinSketch>& countMinSketch) {
+ if (auto maybeJust = maybeLiteral.Maybe<NYql::NNodes::TCoJust>()) {
maybeLiteral = maybeJust.Cast().Input();
}
@@ -109,13 +189,12 @@ namespace {
} else if (columnType == "Uuid") {
const ui64* uuidData = reinterpret_cast<const ui64*>(value.data());
std::pair<ui64, ui64> v{};
- v.first = uuidData[0]; // low128
- v.second = uuidData[1]; // high128
+ v.first = uuidData[0]; // low128
+ v.second = uuidData[1]; // high128
return countMinSketch->Probe(reinterpret_cast<const char*>(&v), sizeof(v));
} else {
return std::nullopt;
}
-
}
return std::nullopt;
@@ -138,6 +217,43 @@ TExprNode::TPtr FindNode(const TExprBase& input) {
return nullptr;
}
+double NYql::NDq::TPredicateSelectivityComputer::ComputeInequalitySelectivity(const TExprBase& left, const TExprBase& right,
+ EInequalityPredicateType predicate) {
+ if (IsAttribute(right) && IsConstantExprWithParams(left.Ptr())) {
+ return ComputeInequalitySelectivity(right, left, GetOppositePredicateType(predicate));
+ }
+
+ if (auto maybeMember = IsAttribute(left)) {
+ // It seems like this is not possible in current version.
+ if (IsAttribute(right)) {
+ return 0.3;
+ } else if (IsConstantExprWithParams(right.Ptr())) {
+ const TString attributeName = maybeMember.Get()->Name().StringValue();
+ if (!IsConstantExpr(right.Ptr())) {
+ return DefaultSelectivity(Stats, attributeName);
+ }
+
+ if (!Stats || !Stats->ColumnStatistics) {
+ return DefaultSelectivity(Stats, attributeName);
+ }
+
+ if (auto histogramEstimator = Stats->ColumnStatistics->Data[attributeName].EqWidthHistogramEstimator) {
+ const auto columnType = Stats->ColumnStatistics->Data[attributeName].Type;
+ std::optional<ui64> estimation = EstimateInequalityPredicateByHistogram(right, columnType, histogramEstimator, predicate);
+ if (!estimation.has_value()) {
+ return DefaultSelectivity(Stats, attributeName);
+ }
+ // Should we compare the number of rows in histogram against `Nrows` and adjust `value` based on that.
+ Y_ASSERT(Stats->Nrows);
+ return estimation.value() / Stats->Nrows;
+ }
+ return 0.5;
+ }
+ }
+
+ return 1.0;
+}
+
double NYql::NDq::TPredicateSelectivityComputer::ComputeEqualitySelectivity(const TExprBase& left, const TExprBase& right) {
if (IsAttribute(right) && IsConstantExprWithParams(left.Ptr())) {
return ComputeEqualitySelectivity(right, left);
@@ -169,7 +285,7 @@ double NYql::NDq::TPredicateSelectivityComputer::ComputeEqualitySelectivity(cons
if (auto countMinSketch = Stats->ColumnStatistics->Data[attributeName].CountMinSketch; countMinSketch != nullptr) {
auto columnType = Stats->ColumnStatistics->Data[attributeName].Type;
- std::optional<ui32> countMinEstimation = EstimateCountMin(right, columnType, countMinSketch);
+ std::optional<ui32> countMinEstimation = EstimateCountMin(right, columnType, countMinSketch);
if (!countMinEstimation.has_value()) {
return DefaultSelectivity(Stats, attributeName);
}
@@ -271,6 +387,34 @@ double TPredicateSelectivityComputer::Compute(
resSelectivity = ComputeEqualitySelectivity(left, right);
}
+ else if (auto less = input.Maybe<TCoCmpLess>()) {
+ auto left = less.Cast().Left();
+ auto right = less.Cast().Right();
+
+ resSelectivity = ComputeInequalitySelectivity(left, right, EInequalityPredicateType::Less);
+ }
+
+ else if (auto less = input.Maybe<TCoCmpGreater>()) {
+ auto left = less.Cast().Left();
+ auto right = less.Cast().Right();
+
+ resSelectivity = ComputeInequalitySelectivity(left, right, EInequalityPredicateType::Greater);
+ }
+
+ else if (auto less = input.Maybe<TCoCmpLessOrEqual>()) {
+ auto left = less.Cast().Left();
+ auto right = less.Cast().Right();
+
+ resSelectivity = ComputeInequalitySelectivity(left, right, EInequalityPredicateType::LessOrEqual);
+ }
+
+ else if (auto less = input.Maybe<TCoCmpGreaterOrEqual>()) {
+ auto left = less.Cast().Left();
+ auto right = less.Cast().Right();
+
+ resSelectivity = ComputeInequalitySelectivity(left, right, EInequalityPredicateType::GreaterOrEqual);
+ }
+
else if (input.Ptr()->IsCallable("PgResolvedOp") && input.Ptr()->ChildPtr(0)->Content()=="=") {
auto left = TExprBase(input.Ptr()->ChildPtr(2));
auto right = TExprBase(input.Ptr()->ChildPtr(3));
@@ -306,17 +450,17 @@ double TPredicateSelectivityComputer::Compute(
else if (input.Ptr()->IsCallable("PgResolvedOp") && PgInequalityPreds.contains(input.Ptr()->ChildPtr(0)->Content())){
auto left = TExprBase(input.Ptr()->ChildPtr(2));
auto right = TExprBase(input.Ptr()->ChildPtr(3));
-
- resSelectivity = ComputeComparisonSelectivity(left, right);
+ resSelectivity =
+ ComputeInequalitySelectivity(left, right, StringToInequalityPredicateMap[input.Ptr()->ChildPtr(0)->Content()]);
}
// Process SqlIn
- else if(input.Ptr()->IsCallable("SqlIn")) {
+ else if (input.Ptr()->IsCallable("SqlIn")) {
auto list = input.Ptr()->ChildPtr(0);
double tmpSelectivity = 0.0;
auto lhs = TExprBase(input.Ptr()->ChildPtr(1));
- for (const auto& child: list->Children()) {
+ for (const auto& child : list->Children()) {
TExprBase rhs = TExprBase(child);
tmpSelectivity += ComputeEqualitySelectivity(lhs, rhs);
}
diff --git a/ydb/library/yql/dq/opt/dq_opt_stat.h b/ydb/library/yql/dq/opt/dq_opt_stat.h
index 670f30c185f..d8ded84ae05 100644
--- a/ydb/library/yql/dq/opt/dq_opt_stat.h
+++ b/ydb/library/yql/dq/opt/dq_opt_stat.h
@@ -6,6 +6,7 @@
#include <yql/essentials/core/cbo/cbo_optimizer_new.h>
namespace NYql::NDq {
+enum class EInequalityPredicateType : ui8 { Less, LessOrEqual, Greater, GreaterOrEqual, Equal };
void InferStatisticsForFlatMap(const TExprNode::TPtr& input, TTypeAnnotationContext* typeCtx);
void InferStatisticsForFilter(const TExprNode::TPtr& input, TTypeAnnotationContext* typeCtx);
@@ -83,6 +84,8 @@ public:
protected:
double ComputeEqualitySelectivity(const NYql::NNodes::TExprBase& left, const NYql::NNodes::TExprBase& right);
+ double ComputeInequalitySelectivity(const NYql::NNodes::TExprBase& left, const NYql::NNodes::TExprBase& right,
+ EInequalityPredicateType predicate);
double ComputeComparisonSelectivity(const NYql::NNodes::TExprBase& left, const NYql::NNodes::TExprBase& right);