diff options
author | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/lwtrace/mon/analytics/transform.h |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/lwtrace/mon/analytics/transform.h')
-rw-r--r-- | library/cpp/lwtrace/mon/analytics/transform.h | 204 |
1 files changed, 204 insertions, 0 deletions
diff --git a/library/cpp/lwtrace/mon/analytics/transform.h b/library/cpp/lwtrace/mon/analytics/transform.h new file mode 100644 index 00000000000..f7dc9adb5b9 --- /dev/null +++ b/library/cpp/lwtrace/mon/analytics/transform.h @@ -0,0 +1,204 @@ +#pragma once + +#include "data.h" + +namespace NAnalytics { + +template <class TSkip, class TX, class TY> +inline TTable Histogram(const TTable& in, TSkip skip, + const TString& xn_out, TX x_in, + const TString& yn_out, TY y_in, + double x1, double x2, double dx) +{ + long buckets = (x2 - x1) / dx; + TTable out; + TString yn_sum = yn_out + "_sum"; + TString yn_share = yn_out + "_share"; + double ysum = 0.0; + out.resize(buckets); + for (size_t i = 0; i < out.size(); i++) { + double lb = x1 + dx*i; + double ub = lb + dx; + out[i].Name = "[" + ToString(lb) + ";" + ToString(ub) + (ub==x2? "]": ")"); + out[i][xn_out] = (lb + ub) / 2; + out[i][yn_sum] = 0.0; + } + for (const auto& row : in) { + if (skip(row)) { + continue; + } + double x = x_in(row); + long i = (x - x1) / dx; + if (x == x2) { // Special hack to include right edge + i--; + } + double y = y_in(row); + ysum += y; + if (i >= 0 && i < buckets) { + out[i][yn_sum] = y + out[i].GetOrDefault(yn_sum, 0.0); + } + } + for (TRow& row : out) { + if (ysum != 0.0) { + row[yn_share] = row.GetOrDefault(yn_sum, 0.0) / ysum; + } + } + return out; +} + +inline TTable HistogramAll(const TTable& in, const TString& xn, double x1, double x2, double dx) +{ + long buckets = (dx == 0.0? 1: (x2 - x1) / dx); + TTable out; + THashMap<TString, double> colSum; + out.resize(buckets); + + TSet<TString> cols; + for (auto& row : in) { + for (auto& kv : row) { + cols.insert(kv.first); + } + } + cols.insert("_count"); + cols.erase(xn); + + for (const TString& col : cols) { + colSum[col] = 0.0; + } + + for (size_t i = 0; i < out.size(); i++) { + double lb = x1 + dx*i; + double ub = lb + dx; + TRow& row = out[i]; + row.Name = "[" + ToString(lb) + ";" + ToString(ub) + (ub==x2? "]": ")"); + row[xn] = (lb + ub) / 2; + for (const TString& col : cols) { + row[col + "_sum"] = 0.0; + } + } + for (const TRow& row_in : in) { + double x; + if (!row_in.Get(xn, x)) { + continue; + } + long i = (dx == 0.0? 0: (x - x1) / dx); + if (x == x2 && dx > 0.0) { // Special hack to include right edge + i--; + } + for (const auto& kv : row_in) { + const TString& yn = kv.first; + if (yn == xn) { + continue; + } + double y; + if (!row_in.Get(yn, y)) { + continue; + } + colSum[yn] += y; + if (i >= 0 && i < buckets) { + out[i][yn + "_cnt"] = out[i].GetOrDefault(yn + "_cnt") + 1; + out[i][yn + "_sum"] = out[i].GetOrDefault(yn + "_sum") + y; + if (out[i].contains(yn + "_min")) { + out[i][yn + "_min"] = Min(y, out[i].GetOrDefault(yn + "_min")); + } else { + out[i][yn + "_min"] = y; + } + if (out[i].contains(yn + "_max")) { + out[i][yn + "_max"] = Max(y, out[i].GetOrDefault(yn + "_max")); + } else { + out[i][yn + "_max"] = y; + } + } + } + colSum["_count"]++; + if (i >= 0 && i < buckets) { + out[i]["_count_sum"] = out[i].GetOrDefault("_count_sum") + 1; + } + } + for (TRow& row : out) { + for (const TString& col : cols) { + double ysum = colSum[col]; + if (col != "_count") { + if (row.GetOrDefault(col + "_cnt") != 0.0) { + row[col + "_avg"] = row.GetOrDefault(col + "_sum") / row.GetOrDefault(col + "_cnt"); + } + } + if (ysum != 0.0) { + row[col + "_share"] = row.GetOrDefault(col + "_sum") / ysum; + } + } + } + return out; +} + +inline TMatrix CovarianceMatrix(const TTable& in) +{ + TSet<TString> cols; + for (auto& row : in) { + for (auto& kv : row) { + cols.insert(kv.first); + } + } + + struct TAggregate { + size_t Idx = 0; + double Sum = 0; + size_t Count = 0; + double Mean = 0; + }; + + THashMap<TString, TAggregate> colAggr; + + size_t colCount = 0; + for (const TString& col : cols) { + TAggregate& aggr = colAggr[col]; + aggr.Idx = colCount++; + } + + for (const TRow& row : in) { + for (const auto& kv : row) { + const TString& xn = kv.first; + double x; + if (!row.Get(xn, x)) { + continue; + } + TAggregate& aggr = colAggr[xn]; + aggr.Sum += x; + aggr.Count++; + } + } + + for (auto& kv : colAggr) { + TAggregate& aggr = kv.second; + aggr.Mean = aggr.Sum / aggr.Count; + } + + TMatrix covCount(cols.size(), cols.size()); + TMatrix cov(cols.size(), cols.size()); + for (const TRow& row : in) { + for (const auto& kv1 : row) { + double x; + if (!row.Get(kv1.first, x)) { + continue; + } + TAggregate& xaggr = colAggr[kv1.first]; + for (const auto& kv2 : row) { + double y; + if (!row.Get(kv2.first, y)) { + continue; + } + TAggregate& yaggr = colAggr[kv2.first]; + covCount.Cell(xaggr.Idx, yaggr.Idx)++; + cov.Cell(xaggr.Idx, yaggr.Idx) += (x - xaggr.Mean) * (y - yaggr.Mean); + } + } + } + + for (size_t idx = 0; idx < cov.size(); idx++) { + cov[idx] /= covCount[idx]; + } + + return cov; +} + +} |