aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/histogram/adaptive/block_histogram.cpp
diff options
context:
space:
mode:
authoraprudaev <aprudaev@yandex-team.ru>2022-02-10 16:49:58 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:49:58 +0300
commit29e66fe2ab37743577f88e6ab770defc4e6c1002 (patch)
tree332323481b37f79cf521451582f724fb13a7143a /library/cpp/histogram/adaptive/block_histogram.cpp
parent66f1a2f0600877a21fcc76f83c52904a58425f78 (diff)
downloadydb-29e66fe2ab37743577f88e6ab770defc4e6c1002.tar.gz
Restoring authorship annotation for <aprudaev@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/histogram/adaptive/block_histogram.cpp')
-rw-r--r--library/cpp/histogram/adaptive/block_histogram.cpp506
1 files changed, 253 insertions, 253 deletions
diff --git a/library/cpp/histogram/adaptive/block_histogram.cpp b/library/cpp/histogram/adaptive/block_histogram.cpp
index 6586d13ff6..b5c32cf7ff 100644
--- a/library/cpp/histogram/adaptive/block_histogram.cpp
+++ b/library/cpp/histogram/adaptive/block_histogram.cpp
@@ -1,141 +1,141 @@
#include "block_histogram.h"
-
+
#include <library/cpp/histogram/adaptive/protos/histo.pb.h>
-
+
#include <util/generic/algorithm.h>
-#include <util/generic/yexception.h>
-#include <util/generic/intrlist.h>
-#include <util/generic/ptr.h>
+#include <util/generic/yexception.h>
+#include <util/generic/intrlist.h>
+#include <util/generic/ptr.h>
#include <util/generic/queue.h>
#include <util/generic/ymath.h>
#include <util/string/printf.h>
-
-namespace {
- struct TEmpty {
- };
-
- class TSmartHeap {
- private:
+
+namespace {
+ struct TEmpty {
+ };
+
+ class TSmartHeap {
+ private:
TVector<ui32> A;
TVector<ui32> Pos;
const TVector<double>& Weights;
-
- public:
+
+ public:
TSmartHeap(const TVector<double>& weights)
- : A(weights.size())
- , Pos(weights.size())
- , Weights(weights)
- {
- for (ui32 i = 0; i < weights.size(); ++i) {
- A[i] = i;
- Pos[i] = i;
- }
- for (ui32 i = weights.size() / 2; i > 0; --i) {
- Down(i - 1);
- }
- }
-
- ui32 IdOfMin() {
- return A[0];
- }
-
- void Pop() {
- A[0] = A.back();
- Pos[A[0]] = 0;
- A.pop_back();
- Down(0);
- }
-
- void DownElement(ui32 id) {
- Down(Pos[id]);
- }
-
- private:
- void SwapPositions(ui32 x, ui32 y) {
+ : A(weights.size())
+ , Pos(weights.size())
+ , Weights(weights)
+ {
+ for (ui32 i = 0; i < weights.size(); ++i) {
+ A[i] = i;
+ Pos[i] = i;
+ }
+ for (ui32 i = weights.size() / 2; i > 0; --i) {
+ Down(i - 1);
+ }
+ }
+
+ ui32 IdOfMin() {
+ return A[0];
+ }
+
+ void Pop() {
+ A[0] = A.back();
+ Pos[A[0]] = 0;
+ A.pop_back();
+ Down(0);
+ }
+
+ void DownElement(ui32 id) {
+ Down(Pos[id]);
+ }
+
+ private:
+ void SwapPositions(ui32 x, ui32 y) {
std::swap(A[x], A[y]);
- Pos[A[x]] = x;
- Pos[A[y]] = y;
- }
-
- void Down(ui32 pos) {
- while (1) {
- ui32 left = pos * 2 + 1;
- ui32 right = pos * 2 + 2;
- ui32 min = pos;
- if (left < A.size() && Weights[A[min]] > Weights[A[left]])
- min = left;
- if (right < A.size() && Weights[A[min]] > Weights[A[right]])
- min = right;
- if (pos == min)
- break;
- SwapPositions(min, pos);
- pos = min;
- }
- }
- };
-
-}
-
-namespace NKiwiAggr {
+ Pos[A[x]] = x;
+ Pos[A[y]] = y;
+ }
+
+ void Down(ui32 pos) {
+ while (1) {
+ ui32 left = pos * 2 + 1;
+ ui32 right = pos * 2 + 2;
+ ui32 min = pos;
+ if (left < A.size() && Weights[A[min]] > Weights[A[left]])
+ min = left;
+ if (right < A.size() && Weights[A[min]] > Weights[A[right]])
+ min = right;
+ if (pos == min)
+ break;
+ SwapPositions(min, pos);
+ pos = min;
+ }
+ }
+ };
+
+}
+
+namespace NKiwiAggr {
///////////////////
// TBlockHistogram
///////////////////
-
+
TBlockHistogram::TBlockHistogram(EHistogramType type, TQualityFunction calcQuality,
size_t intervals, ui64 id, size_t shrinkSize)
: Type(type)
, CalcQuality(calcQuality)
, Intervals(intervals)
- , ShrinkSize(shrinkSize)
- , PrevSize(0)
- , Id(id)
- , Sum(0)
- , MinValue(0)
- , MaxValue(0)
- {
- CorrectShrinkSize();
- }
-
+ , ShrinkSize(shrinkSize)
+ , PrevSize(0)
+ , Id(id)
+ , Sum(0)
+ , MinValue(0)
+ , MaxValue(0)
+ {
+ CorrectShrinkSize();
+ }
+
void TBlockHistogram::Clear() {
PrevSize = 0;
Sum = 0.0;
MinValue = 0.0;
MaxValue = 0.0;
Bins.clear();
- }
-
+ }
+
void TBlockHistogram::Add(const THistoRec& rec) {
- if (!rec.HasId() || rec.GetId() == Id) {
- Add(rec.GetValue(), rec.GetWeight());
- }
- }
-
+ if (!rec.HasId() || rec.GetId() == Id) {
+ Add(rec.GetValue(), rec.GetWeight());
+ }
+ }
+
void TBlockHistogram::Add(double value, double weight) {
if (!IsValidFloat(value) || !IsValidFloat(weight)) {
ythrow yexception() << Sprintf("Histogram id %lu: bad value %f weight %f", Id, value, weight);
}
- if (weight <= 0.0) {
- return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden
- }
-
- if (Bins.empty()) {
- MinValue = value;
- MaxValue = value;
+ if (weight <= 0.0) {
+ return; // all zero-weighted values should be skipped because they don't affect the distribution, negative weights are forbidden
+ }
+
+ if (Bins.empty()) {
+ MinValue = value;
+ MaxValue = value;
} else {
MinValue = Min(MinValue, value);
MaxValue = Max(MaxValue, value);
- }
-
- Sum += weight;
-
+ }
+
+ Sum += weight;
+
if (Bins.size() > ShrinkSize) {
- SortAndShrink(Intervals * SHRINK_MULTIPLIER);
+ SortAndShrink(Intervals * SHRINK_MULTIPLIER);
}
-
- Bins.push_back(TWeightedValue(value, weight));
- }
-
+
+ Bins.push_back(TWeightedValue(value, weight));
+ }
+
void TBlockHistogram::Merge(const THistogram& histo, double multiplier) {
if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) {
fprintf(stderr, "Merging in histogram id %lu: skip bad histo with minvalue %f maxvalue %f\n", Id, histo.GetMinValue(), histo.GetMaxValue());
@@ -214,18 +214,18 @@ namespace NKiwiAggr {
break; // ok
default: // not ok
ythrow yexception() << "Attempt to parse TBlockHistogram from THistogram protobuf record of type = " << (ui32)histo.GetType();
- }
+ }
- if (histo.FreqSize() != histo.PositionSize()) {
+ if (histo.FreqSize() != histo.PositionSize()) {
ythrow yexception() << "Attempt to parse TBlockHistogram from THistogram protobuf record where FreqSize != PositionSize. FreqSize == " << (ui32)histo.FreqSize() << ", PositionSize == " << (ui32)histo.PositionSize();
- }
- Id = histo.GetId();
- Sum = 0;
- Intervals = Max(Intervals, histo.FreqSize());
- CorrectShrinkSize();
- Bins.resize(histo.FreqSize());
- PrevSize = Bins.size();
- for (size_t i = 0; i < histo.FreqSize(); ++i) {
+ }
+ Id = histo.GetId();
+ Sum = 0;
+ Intervals = Max(Intervals, histo.FreqSize());
+ CorrectShrinkSize();
+ Bins.resize(histo.FreqSize());
+ PrevSize = Bins.size();
+ for (size_t i = 0; i < histo.FreqSize(); ++i) {
double value = histo.GetPosition(i);
double weight = histo.GetFreq(i);
if (!IsValidFloat(value) || !IsValidFloat(weight)) {
@@ -234,92 +234,92 @@ namespace NKiwiAggr {
}
Bins[i].first = value;
Bins[i].second = weight;
- Sum += Bins[i].second;
- }
+ Sum += Bins[i].second;
+ }
if (!IsValidFloat(histo.GetMinValue()) || !IsValidFloat(histo.GetMaxValue())) {
ythrow yexception() << Sprintf("FromProto in histogram id %lu: skip bad histo with minvalue %f maxvalue %f", Id, histo.GetMinValue(), histo.GetMaxValue());
}
- MinValue = histo.GetMinValue();
- MaxValue = histo.GetMaxValue();
- }
-
+ MinValue = histo.GetMinValue();
+ MaxValue = histo.GetMaxValue();
+ }
+
void TBlockHistogram::ToProto(THistogram& histo) {
- histo.Clear();
+ histo.Clear();
histo.SetType(Type);
- histo.SetId(Id);
- if (Empty()) {
- return;
- }
-
- SortAndShrink(Intervals, true);
- histo.SetMinValue(MinValue);
- histo.SetMaxValue(MaxValue);
+ histo.SetId(Id);
+ if (Empty()) {
+ return;
+ }
+
+ SortAndShrink(Intervals, true);
+ histo.SetMinValue(MinValue);
+ histo.SetMaxValue(MaxValue);
for (TVector<TWeightedValue>::const_iterator it = Bins.begin(); it != Bins.end(); ++it) {
- histo.AddFreq(it->second);
- histo.AddPosition(it->first);
- }
- }
-
+ histo.AddFreq(it->second);
+ histo.AddPosition(it->first);
+ }
+ }
+
void TBlockHistogram::SetId(ui64 id) {
Id = id;
}
ui64 TBlockHistogram::GetId() {
- return Id;
- }
-
+ return Id;
+ }
+
bool TBlockHistogram::Empty() {
- return Bins.empty();
- }
-
+ return Bins.empty();
+ }
+
double TBlockHistogram::GetMinValue() {
return MinValue;
- }
-
+ }
+
double TBlockHistogram::GetMaxValue() {
return MaxValue;
- }
-
+ }
+
double TBlockHistogram::GetSum() {
- return Sum;
- }
-
+ return Sum;
+ }
+
void TBlockHistogram::SortAndShrink(size_t intervals, bool final) {
Y_VERIFY(intervals > 0);
-
+
if (Bins.size() <= intervals) {
- return;
+ return;
}
-
- if (Bins.size() >= Intervals * GREEDY_SHRINK_MULTIPLIER) {
- SortBins();
+
+ if (Bins.size() >= Intervals * GREEDY_SHRINK_MULTIPLIER) {
+ SortBins();
UniquifyBins();
- FastGreedyShrink(intervals);
-
- if (final) {
- SlowShrink(intervals);
- }
-
+ FastGreedyShrink(intervals);
+
+ if (final) {
+ SlowShrink(intervals);
+ }
+
} else {
- SortBins();
+ SortBins();
UniquifyBins();
- SlowShrink(intervals);
- }
- }
-
+ SlowShrink(intervals);
+ }
+ }
+
void TBlockHistogram::SortBins() {
- Sort(Bins.begin() + PrevSize, Bins.end());
- if (PrevSize != 0) {
+ Sort(Bins.begin() + PrevSize, Bins.end());
+ if (PrevSize != 0) {
TVector<TWeightedValue> temp(Bins.begin(), Bins.begin() + PrevSize);
std::merge(temp.begin(), temp.end(), Bins.begin() + PrevSize, Bins.end(), Bins.begin());
- }
- }
-
+ }
+ }
+
void TBlockHistogram::UniquifyBins() {
if (Bins.empty())
- return;
-
+ return;
+
auto it1 = Bins.begin();
auto it2 = Bins.begin();
while (++it2 != Bins.end()) {
@@ -327,124 +327,124 @@ namespace NKiwiAggr {
it1->second += it2->second;
} else {
*(++it1) = *it2;
- }
- }
-
+ }
+ }
+
Bins.erase(++it1, Bins.end());
- }
-
+ }
+
void TBlockHistogram::CorrectShrinkSize() {
ShrinkSize = Max(ShrinkSize, Intervals * (SHRINK_MULTIPLIER + GREEDY_SHRINK_MULTIPLIER));
}
void TBlockHistogram::SlowShrink(size_t intervals) {
- {
- size_t pos = 0;
- for (size_t i = 1; i < Bins.size(); ++i)
- if (Bins[i].first - Bins[pos].first < 1e-9) {
- Bins[pos].second += Bins[i].second;
- } else {
- ++pos;
- Bins[pos] = Bins[i];
- }
- Bins.resize(pos + 1);
- PrevSize = pos + 1;
- }
-
+ {
+ size_t pos = 0;
+ for (size_t i = 1; i < Bins.size(); ++i)
+ if (Bins[i].first - Bins[pos].first < 1e-9) {
+ Bins[pos].second += Bins[i].second;
+ } else {
+ ++pos;
+ Bins[pos] = Bins[i];
+ }
+ Bins.resize(pos + 1);
+ PrevSize = pos + 1;
+ }
+
if (Bins.size() <= intervals) {
- return;
- }
-
- typedef TIntrusiveListItem<TEmpty> TListItem;
-
- ui32 n = Bins.size() - 1;
- const ui32 end = (ui32)Bins.size();
-
- TArrayHolder<TListItem> listElementsHolder(new TListItem[end + 1]);
- TListItem* const bins = listElementsHolder.Get();
-
+ return;
+ }
+
+ typedef TIntrusiveListItem<TEmpty> TListItem;
+
+ ui32 n = Bins.size() - 1;
+ const ui32 end = (ui32)Bins.size();
+
+ TArrayHolder<TListItem> listElementsHolder(new TListItem[end + 1]);
+ TListItem* const bins = listElementsHolder.Get();
+
for (ui32 i = 1; i <= end; ++i) {
- bins[i].LinkAfter(&bins[i - 1]);
+ bins[i].LinkAfter(&bins[i - 1]);
}
-
+
TVector<double> pairWeights(n);
-
+
for (ui32 i = 0; i < n; ++i) {
pairWeights[i] = CalcQuality(Bins[i], Bins[i + 1]).first;
}
-
- TSmartHeap heap(pairWeights);
-
- while (n + 1 > intervals) {
- ui32 b = heap.IdOfMin();
- heap.Pop();
-
- ui32 a = (ui32)(bins[b].Prev() - bins);
- ui32 c = (ui32)(bins[b].Next() - bins);
- ui32 d = (ui32)(bins[b].Next()->Next() - bins);
+
+ TSmartHeap heap(pairWeights);
+
+ while (n + 1 > intervals) {
+ ui32 b = heap.IdOfMin();
+ heap.Pop();
+
+ ui32 a = (ui32)(bins[b].Prev() - bins);
+ ui32 c = (ui32)(bins[b].Next() - bins);
+ ui32 d = (ui32)(bins[b].Next()->Next() - bins);
Y_VERIFY(Bins[c].second != -1);
-
- double mass = Bins[b].second + Bins[c].second;
- Bins[c].first = (Bins[b].first * Bins[b].second + Bins[c].first * Bins[c].second) / mass;
- Bins[c].second = mass;
-
- bins[b].Unlink();
- Bins[b].second = -1;
-
- if (a != end) {
+
+ double mass = Bins[b].second + Bins[c].second;
+ Bins[c].first = (Bins[b].first * Bins[b].second + Bins[c].first * Bins[c].second) / mass;
+ Bins[c].second = mass;
+
+ bins[b].Unlink();
+ Bins[b].second = -1;
+
+ if (a != end) {
pairWeights[a] = CalcQuality(Bins[a], Bins[c]).first;
- heap.DownElement(a);
- }
-
- if (d != end && c + 1 != Bins.size()) {
+ heap.DownElement(a);
+ }
+
+ if (d != end && c + 1 != Bins.size()) {
pairWeights[c] = CalcQuality(Bins[c], Bins[d]).first;
- heap.DownElement(c);
- }
-
- --n;
- }
-
- size_t pos = 0;
- for (TListItem* it = bins[end].Next(); it != &bins[end]; it = it->Next()) {
- Bins[pos++] = Bins[it - bins];
- }
-
- Bins.resize(pos);
- PrevSize = pos;
+ heap.DownElement(c);
+ }
+
+ --n;
+ }
+
+ size_t pos = 0;
+ for (TListItem* it = bins[end].Next(); it != &bins[end]; it = it->Next()) {
+ Bins[pos++] = Bins[it - bins];
+ }
+
+ Bins.resize(pos);
+ PrevSize = pos;
Y_VERIFY(pos == intervals);
- }
-
+ }
+
double TBlockHistogram::GetSumInRange(double leftBound, double rightBound) {
Y_UNUSED(leftBound);
Y_UNUSED(rightBound);
ythrow yexception() << "Method is not implemented for TBlockHistogram";
- return 0;
- }
-
+ return 0;
+ }
+
double TBlockHistogram::GetSumAboveBound(double bound) {
Y_UNUSED(bound);
ythrow yexception() << "Method is not implemented for TBlockHistogram";
- return 0;
- }
-
+ return 0;
+ }
+
double TBlockHistogram::GetSumBelowBound(double bound) {
Y_UNUSED(bound);
ythrow yexception() << "Method is not implemented for TBlockHistogram";
- return 0;
- }
-
+ return 0;
+ }
+
double TBlockHistogram::CalcUpperBound(double sum) {
Y_UNUSED(sum);
ythrow yexception() << "Method is not implemented for TBlockHistogram";
- return 0;
- }
-
+ return 0;
+ }
+
double TBlockHistogram::CalcLowerBound(double sum) {
Y_UNUSED(sum);
ythrow yexception() << "Method is not implemented for TBlockHistogram";
- return 0;
- }
-
+ return 0;
+ }
+
double TBlockHistogram::CalcUpperBoundSafe(double sum) {
Y_UNUSED(sum);
ythrow yexception() << "Method is not implemented for TBlockHistogram";