diff options
author | vvvv <vvvv@ydb.tech> | 2022-11-23 19:17:46 +0300 |
---|---|---|
committer | vvvv <vvvv@ydb.tech> | 2022-11-23 19:17:46 +0300 |
commit | a84c09d1d4783305b944430bffdbc32e8c7b4728 (patch) | |
tree | f44d0c96876d858cd6fa04f2feb2c6fd80e1b0c5 | |
parent | 3ddfb0470a13e7007289af879595658dd6a7eb7f (diff) | |
download | ydb-a84c09d1d4783305b944430bffdbc32e8c7b4728.tar.gz |
pushdown of filter for count/sum/avg/min/max
4 files changed, 269 insertions, 79 deletions
diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_block_agg.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_block_agg.cpp index f6acca37d35..e730a28c5df 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_block_agg.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_block_agg.cpp @@ -57,13 +57,22 @@ public: std::optional<ui64> filtered; if (FilterColumn_) { - arrow::BooleanArray arr(TArrowBlock::From(s.Values_[*FilterColumn_]).GetDatum().array());
- ui64 popCount = (ui64)arr.true_count(); - if (popCount == 0) { - continue; + auto filterDatum = TArrowBlock::From(s.Values_[*FilterColumn_]).GetDatum(); + if (filterDatum.is_scalar()) { + if (!filterDatum.scalar_as<arrow::BooleanScalar>().value) { + continue; + } + } else { + arrow::BooleanArray arr(filterDatum.array());
+ ui64 popCount = (ui64)arr.true_count(); + if (popCount == 0) { + continue; + } + + if (popCount < batchLength) { + filtered = popCount; + } } - - filtered = popCount; } s.HasValues_ = true; diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_count.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_count.cpp index c985b182206..5afa75b3df8 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_count.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_count.cpp @@ -36,15 +36,38 @@ public: } void AddMany(const NUdf::TUnboxedValue* columns, ui64 batchLength, std::optional<ui64> filtered) final { - Y_ENSURE(!filtered); const auto& datum = TArrowBlock::From(columns[ArgColumn_]).GetDatum(); if (datum.is_scalar()) { if (datum.scalar()->is_valid) { - State_ += batchLength; + State_ += filtered ? *filtered : batchLength; } } else { const auto& array = datum.array(); - State_ += array->length - array->GetNullCount(); + if (!filtered) { + State_ += array->length - array->GetNullCount(); + } else if (array->GetNullCount() == array->length) { + // all nulls + return; + } else if (array->GetNullCount() == 0) { + // no nulls + State_ += *filtered; + } else { + const auto& filterDatum = TArrowBlock::From(columns[*FilterColumn_]).GetDatum(); + // intersect masks from nulls and filter column + const auto& filterArray = filterDatum.array(); + MKQL_ENSURE(filterArray->GetNullCount() == 0, "Expected non-nullable bool column"); + auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); + auto filterBitmap = filterArray->GetValues<uint8_t>(1, 0); + auto state = State_; + for (ui32 i = 0; i < array->length; ++i) { + ui64 fullIndex = i + array->offset; + auto bit1 = ((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1); + auto bit2 = ((filterBitmap[fullIndex >> 3] >> (fullIndex & 0x07)) & 1); + state += bit1 & bit2; + } + + State_ = state; + } } } diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_minmax.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_minmax.cpp index 9cea6d1fbc7..a2f1d53f6ce 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_minmax.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_minmax.cpp @@ -34,7 +34,7 @@ public: } void AddMany(const NUdf::TUnboxedValue* columns, ui64 batchLength, std::optional<ui64> filtered) final { - Y_ENSURE(!filtered); + Y_UNUSED(batchLength); const auto& datum = TArrowBlock::From(columns[ArgColumn_]).GetDatum(); if (datum.is_scalar()) { if (datum.scalar()->is_valid) { @@ -50,23 +50,56 @@ public: return; } - IsValid_ = true; - TIn value = Value_; - if (array->GetNullCount() == 0) { - for (int64_t i = 0; i < len; ++i) { - value = UpdateMinMax<IsMin>(value, ptr[i]); + if (!filtered) { + IsValid_ = true; + TIn value = Value_; + if (array->GetNullCount() == 0) { + for (int64_t i = 0; i < len; ++i) { + value = UpdateMinMax<IsMin>(value, ptr[i]); + } + } else { + auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 + TIn mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + value = UpdateMinMax<IsMin>(value, TIn((ptr[i] & mask) | (value & ~mask))); + } } + + Value_ = value; } else { - auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); - for (int64_t i = 0; i < len; ++i) { - ui64 fullIndex = i + array->offset; - // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 - TIn mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); - value = UpdateMinMax<IsMin>(value, TIn((ptr[i] & mask) | (value & ~mask))); + const auto& filterDatum = TArrowBlock::From(columns[*FilterColumn_]).GetDatum(); + const auto& filterArray = filterDatum.array(); + MKQL_ENSURE(filterArray->GetNullCount() == 0, "Expected non-nullable bool column"); + auto filterBitmap = filterArray->template GetValues<uint8_t>(1, 0); + + TIn value = Value_; + if (array->GetNullCount() == 0) { + IsValid_ = true; + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + TIn filterMask = (((filterBitmap[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + value = UpdateMinMax<IsMin>(value, TIn((ptr[i] & filterMask) | (value & ~filterMask))); + } + } else { + ui64 count = 0; + auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 + TIn mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + TIn filterMask = (((filterBitmap[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + mask &= filterMask; + value = UpdateMinMax<IsMin>(value, TIn((ptr[i] & mask) | (value & ~mask))); + count += mask & 1; + } + + IsValid_ = IsValid_ || count > 0; } - } - Value_ = value; + Value_ = value; + } } } @@ -99,7 +132,6 @@ public: } void AddMany(const NUdf::TUnboxedValue* columns, ui64 batchLength, std::optional<ui64> filtered) final { - Y_ENSURE(!filtered); Y_UNUSED(batchLength); const auto& datum = TArrowBlock::From(columns[ArgColumn_]).GetDatum(); MKQL_ENSURE(datum.is_array(), "Expected array"); @@ -108,12 +140,28 @@ public: auto len = array->length; MKQL_ENSURE(array->GetNullCount() == 0, "Expected no nulls"); MKQL_ENSURE(len > 0, "Expected at least one value"); - TIn value = Value_; - for (int64_t i = 0; i < len; ++i) { - value = UpdateMinMax<IsMin>(value, ptr[i]); - } + if (!filtered) { + TIn value = Value_; + for (int64_t i = 0; i < len; ++i) { + value = UpdateMinMax<IsMin>(value, ptr[i]); + } - Value_ = value; + Value_ = value; + } else { + const auto& filterDatum = TArrowBlock::From(columns[*FilterColumn_]).GetDatum(); + const auto& filterArray = filterDatum.array(); + MKQL_ENSURE(filterArray->GetNullCount() == 0, "Expected non-nullable bool column"); + auto filterBitmap = filterArray->template GetValues<uint8_t>(1, 0); + + TIn value = Value_; + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + TIn filterMask = (((filterBitmap[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + value = UpdateMinMax<IsMin>(value, TIn((ptr[i] & filterMask) | (value & ~filterMask))); + } + + Value_ = value; + } } NUdf::TUnboxedValue Finish() final { @@ -140,7 +188,6 @@ public: } void AddMany(const NUdf::TUnboxedValue* columns, ui64 batchLength, std::optional<ui64> filtered) final { - Y_ENSURE(!filtered); const auto& datum = TArrowBlock::From(columns[ArgColumn_]).GetDatum(); if (datum.is_scalar()) { if (datum.scalar()->is_valid) { @@ -156,26 +203,61 @@ public: return; } - IsValid_ = true; - ui8 value = Value_; - if (array->GetNullCount() == 0) { - for (int64_t i = 0; i < len; ++i) { - ui64 fullIndex = i + array->offset; - ui8 in = ((ptr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1); - value = UpdateMinMax<IsMin>(value, in); + if (!filtered) { + IsValid_ = true; + ui8 value = Value_; + if (array->GetNullCount() == 0) { + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + ui8 in = ((ptr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1); + value = UpdateMinMax<IsMin>(value, in); + } + } else { + auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 + ui8 in = ((ptr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1); + ui8 mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - ui8(1); + value = UpdateMinMax<IsMin>(value, ui8((in & mask) | (value & ~mask))); + } } + + Value_ = value; } else { - auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); - for (int64_t i = 0; i < len; ++i) { - ui64 fullIndex = i + array->offset; - // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 - ui8 in = ((ptr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1); - ui8 mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - ui8(1); - value = UpdateMinMax<IsMin>(value, ui8((in & mask) | (value & ~mask))); + const auto& filterDatum = TArrowBlock::From(columns[*FilterColumn_]).GetDatum(); + const auto& filterArray = filterDatum.array(); + MKQL_ENSURE(filterArray->GetNullCount() == 0, "Expected non-nullable bool column"); + auto filterBitmap = filterArray->template GetValues<uint8_t>(1, 0); + + ui8 value = Value_; + if (array->GetNullCount() == 0) { + IsValid_ = true; + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + ui8 in = ((ptr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1); + ui8 filterMask = (((filterBitmap[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - ui8(1); + value = UpdateMinMax<IsMin>(value, ui8((in & filterMask) | (value & ~filterMask))); + } + } else { + ui64 count = 0; + auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 + ui8 in = ((ptr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1); + ui8 mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - ui8(1); + ui8 filterMask = (((filterBitmap[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - ui8(1); + mask &= filterMask; + value = UpdateMinMax<IsMin>(value, ui8((in & mask) | (value & ~mask))); + count += mask & 1; + } + + IsValid_ = IsValid_ || count > 0; } - } - Value_ = value; + Value_ = value; + } } } diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_sum.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_sum.cpp index dbf0d5a5edb..a4033fc71b4 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_sum.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_block_agg_sum.cpp @@ -20,11 +20,10 @@ public: } void AddMany(const NUdf::TUnboxedValue* columns, ui64 batchLength, std::optional<ui64> filtered) final { - Y_ENSURE(!filtered); const auto& datum = TArrowBlock::From(columns[ArgColumn_]).GetDatum(); if (datum.is_scalar()) { if (datum.scalar()->is_valid) { - Sum_ += batchLength * datum.scalar_as<TInScalar>().value; + Sum_ += (filtered ? *filtered : batchLength) * datum.scalar_as<TInScalar>().value; IsValid_ = true; } } else { @@ -36,23 +35,56 @@ public: return; } - IsValid_ = true; - TSum sum = Sum_; - if (array->GetNullCount() == 0) { - for (int64_t i = 0; i < len; ++i) { - sum += ptr[i]; + if (!filtered) { + IsValid_ = true; + TSum sum = Sum_; + if (array->GetNullCount() == 0) { + for (int64_t i = 0; i < len; ++i) { + sum += ptr[i]; + } + } else { + auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 + TIn mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + sum += (ptr[i] & mask); + } } + + Sum_ = sum; } else { - auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); - for (int64_t i = 0; i < len; ++i) { - ui64 fullIndex = i + array->offset; - // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 - TIn mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); - sum += (ptr[i] & mask); + const auto& filterDatum = TArrowBlock::From(columns[*FilterColumn_]).GetDatum(); + const auto& filterArray = filterDatum.array(); + MKQL_ENSURE(filterArray->GetNullCount() == 0, "Expected non-nullable bool column"); + auto filterBitmap = filterArray->template GetValues<uint8_t>(1, 0); + TSum sum = Sum_; + if (array->GetNullCount() == 0) { + IsValid_ = true; + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 + TIn filterMask = (((filterBitmap[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + sum += ptr[i] & filterMask; + } + } else { + ui64 count = 0; + auto nullBitmapPtr = array->template GetValues<uint8_t>(0, 0); + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 + TIn mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + TIn filterMask = (((filterBitmap[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + mask &= filterMask; + sum += (ptr[i] & mask); + count += mask & 1; + } + + IsValid_ = IsValid_ || count > 0; } - } - Sum_ = sum; + Sum_ = sum; + } } } @@ -80,7 +112,6 @@ public: } void AddMany(const NUdf::TUnboxedValue* columns, ui64 batchLength, std::optional<ui64> filtered) final { - Y_ENSURE(!filtered); Y_UNUSED(batchLength); const auto& datum = TArrowBlock::From(columns[ArgColumn_]).GetDatum(); MKQL_ENSURE(datum.is_array(), "Expected array"); @@ -91,8 +122,21 @@ public: MKQL_ENSURE(len > 0, "Expected at least one value"); TSum sum = Sum_; - for (int64_t i = 0; i < len; ++i) { - sum += ptr[i]; + if (!filtered) { + for (int64_t i = 0; i < len; ++i) { + sum += ptr[i]; + } + } else { + const auto& filterDatum = TArrowBlock::From(columns[*FilterColumn_]).GetDatum(); + const auto& filterArray = filterDatum.array(); + MKQL_ENSURE(filterArray->GetNullCount() == 0, "Expected non-nullable bool column"); + auto filterBitmap = filterArray->template GetValues<uint8_t>(1, 0); + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 + TIn filterMask = (((filterBitmap[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + sum += ptr[i] & filterMask; + } } Sum_ = sum; @@ -118,11 +162,10 @@ public: } void AddMany(const NUdf::TUnboxedValue* columns, ui64 batchLength, std::optional<ui64> filtered) final { - Y_ENSURE(!filtered); const auto& datum = TArrowBlock::From(columns[ArgColumn_]).GetDatum(); if (datum.is_scalar()) { if (datum.scalar()->is_valid) { - Sum_ += double(batchLength * datum.scalar_as<TInScalar>().value); + Sum_ += double((filtered ? *filtered : batchLength) * datum.scalar_as<TInScalar>().value); Count_ += batchLength; } } else { @@ -134,23 +177,56 @@ public: return; } - Count_ += count; - double sum = Sum_; - if (array->GetNullCount() == 0) { - for (int64_t i = 0; i < len; ++i) { - sum += double(ptr[i]); + if (!filtered) { + Count_ += count; + double sum = Sum_; + if (array->GetNullCount() == 0) { + for (int64_t i = 0; i < len; ++i) { + sum += double(ptr[i]); + } + } else { + auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 + TIn mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + sum += double(ptr[i] & mask); + } } + + Sum_ = sum; } else { - auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); - for (int64_t i = 0; i < len; ++i) { - ui64 fullIndex = i + array->offset; - // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 - TIn mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); - sum += double(ptr[i] & mask); + const auto& filterDatum = TArrowBlock::From(columns[*FilterColumn_]).GetDatum(); + const auto& filterArray = filterDatum.array(); + MKQL_ENSURE(filterArray->GetNullCount() == 0, "Expected non-nullable bool column"); + auto filterBitmap = filterArray->template GetValues<uint8_t>(1, 0); + + double sum = Sum_; + ui64 count = Count_; + if (array->GetNullCount() == 0) { + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 + TIn filterMask = (((filterBitmap[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + sum += double(ptr[i] & filterMask); + count += filterMask & 1; + } + } else { + auto nullBitmapPtr = array->GetValues<uint8_t>(0, 0); + for (int64_t i = 0; i < len; ++i) { + ui64 fullIndex = i + array->offset; + // bit 1 -> mask 0xFF..FF, bit 0 -> mask 0x00..00 + TIn mask = (((nullBitmapPtr[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + TIn filterMask = (((filterBitmap[fullIndex >> 3] >> (fullIndex & 0x07)) & 1) ^ 1) - TIn(1); + mask &= filterMask; + sum += double(ptr[i] & mask); + count += mask & 1; + } } - } - Sum_ = sum; + Sum_ = sum; + Count_ = count; + } } } |