aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/linear_regression/linear_regression.h
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/linear_regression/linear_regression.h
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/linear_regression/linear_regression.h')
-rw-r--r--library/cpp/linear_regression/linear_regression.h342
1 files changed, 342 insertions, 0 deletions
diff --git a/library/cpp/linear_regression/linear_regression.h b/library/cpp/linear_regression/linear_regression.h
new file mode 100644
index 0000000000..e57de5ff6c
--- /dev/null
+++ b/library/cpp/linear_regression/linear_regression.h
@@ -0,0 +1,342 @@
+#pragma once
+
+#include "linear_model.h"
+#include "welford.h"
+
+#include <library/cpp/accurate_accumulate/accurate_accumulate.h>
+
+#include <util/generic/vector.h>
+#include <util/generic/hash.h>
+#include <util/generic/ymath.h>
+
+class TFastLinearRegressionSolver {
+private:
+ TKahanAccumulator<double> SumSquaredGoals;
+
+ TVector<double> LinearizedOLSMatrix;
+ TVector<double> OLSVector;
+
+public:
+ bool Add(const TVector<double>& features, const double goal, const double weight = 1.);
+ TLinearModel Solve() const;
+ double SumSquaredErrors() const;
+};
+
+class TLinearRegressionSolver {
+private:
+ double GoalsMean = 0.;
+ double GoalsDeviation = 0.;
+
+ TVector<double> FeatureMeans;
+ TVector<double> LastMeans;
+ TVector<double> NewMeans;
+ TVector<double> LinearizedOLSMatrix;
+
+ TVector<double> OLSVector;
+
+ TKahanAccumulator<double> SumWeights;
+
+public:
+ bool Add(const TVector<double>& features, const double goal, const double weight = 1.);
+ TLinearModel Solve() const;
+ double SumSquaredErrors() const;
+};
+
+template <typename TStoreType>
+class TTypedFastSLRSolver {
+private:
+ TStoreType SumFeatures = TStoreType();
+ TStoreType SumSquaredFeatures = TStoreType();
+
+ TStoreType SumGoals = TStoreType();
+ TStoreType SumSquaredGoals = TStoreType();
+
+ TStoreType SumProducts = TStoreType();
+
+ TStoreType SumWeights = TStoreType();
+
+public:
+ bool Add(const double feature, const double goal, const double weight = 1.) {
+ SumFeatures += feature * weight;
+ SumSquaredFeatures += feature * feature * weight;
+
+ SumGoals += goal * weight;
+ SumSquaredGoals += goal * goal * weight;
+
+ SumProducts += goal * feature * weight;
+
+ SumWeights += weight;
+
+ return true;
+ }
+
+ template <typename TFloatType>
+ void Solve(TFloatType& factor, TFloatType& intercept, const double regularizationParameter = 0.1) const {
+ if (!(double)SumGoals) {
+ factor = intercept = TFloatType();
+ return;
+ }
+
+ double productsDeviation, featuresDeviation;
+ SetupSolutionFactors(productsDeviation, featuresDeviation);
+
+ if (!featuresDeviation) {
+ factor = TFloatType();
+ intercept = (double)SumGoals / (double)SumWeights;
+ return;
+ }
+
+ factor = productsDeviation / (featuresDeviation + regularizationParameter);
+ intercept = (double)SumGoals / (double)SumWeights - factor * (double)SumFeatures / (double)SumWeights;
+ }
+
+ double SumSquaredErrors(const double regularizationParameter = 0.1) const {
+ if (!(double)SumWeights) {
+ return 0.;
+ }
+
+ const double sumGoalSquaredDeviations = (double)SumSquaredGoals - (double)SumGoals / (double)SumWeights * (double)SumGoals;
+
+ double productsDeviation, featuresDeviation;
+ SetupSolutionFactors(productsDeviation, featuresDeviation);
+ if (!featuresDeviation) {
+ return sumGoalSquaredDeviations;
+ }
+
+ const double factor = productsDeviation / (featuresDeviation + regularizationParameter);
+
+ const double sumSquaredErrors = factor * factor * featuresDeviation - 2 * factor * productsDeviation + sumGoalSquaredDeviations;
+ return Max(0., sumSquaredErrors);
+ }
+
+private:
+ void SetupSolutionFactors(double& productsDeviation, double& featuresDeviation) const {
+ if (!(double)SumWeights) {
+ productsDeviation = featuresDeviation = 0.;
+ return;
+ }
+
+ featuresDeviation = (double)SumSquaredFeatures - (double)SumFeatures / (double)SumWeights * (double)SumFeatures;
+ if (!featuresDeviation) {
+ return;
+ }
+ productsDeviation = (double)SumProducts - (double)SumFeatures / (double)SumWeights * (double)SumGoals;
+ }
+};
+
+using TFastSLRSolver = TTypedFastSLRSolver<double>;
+using TKahanSLRSolver = TTypedFastSLRSolver<TKahanAccumulator<double>>;
+
+class TSLRSolver {
+private:
+ double FeaturesMean = 0.;
+ double FeaturesDeviation = 0.;
+
+ double GoalsMean = 0.;
+ double GoalsDeviation = 0.;
+
+ TKahanAccumulator<double> SumWeights;
+
+ double Covariation = 0.;
+
+public:
+ bool Add(const double feature, const double goal, const double weight = 1.);
+
+ bool Add(const double* featuresBegin, const double* featuresEnd, const double* goalsBegin);
+ bool Add(const double* featuresBegin, const double* featuresEnd, const double* goalsBegin, const double* weightsBegin);
+
+ bool Add(const TVector<double>& features, const TVector<double>& goals) {
+ Y_ASSERT(features.size() == goals.size());
+ return Add(features.data(), features.data() + features.size(), goals.data());
+ }
+
+ bool Add(const TVector<double>& features, const TVector<double>& goals, const TVector<double>& weights) {
+ Y_ASSERT(features.size() == goals.size() && features.size() == weights.size());
+ return Add(features.data(), features.data() + features.size(), goals.data(), weights.data());
+ }
+
+ template <typename TFloatType>
+ void Solve(TFloatType& factor, TFloatType& intercept, const double regularizationParameter = 0.1) const {
+ if (!FeaturesDeviation) {
+ factor = 0.;
+ intercept = GoalsMean;
+ return;
+ }
+
+ factor = Covariation / (FeaturesDeviation + regularizationParameter);
+ intercept = GoalsMean - factor * FeaturesMean;
+ }
+
+ double SumSquaredErrors(const double regularizationParameter = 0.1) const;
+
+ double GetSumWeights() const {
+ return SumWeights.Get();
+ }
+};
+
+template <typename TSLRSolverType>
+class TTypedBestSLRSolver {
+private:
+ TVector<TSLRSolverType> SLRSolvers;
+
+public:
+ bool Add(const TVector<double>& features, const double goal, const double weight = 1.) {
+ if (SLRSolvers.empty()) {
+ SLRSolvers.resize(features.size());
+ }
+
+ for (size_t featureNumber = 0; featureNumber < features.size(); ++featureNumber) {
+ SLRSolvers[featureNumber].Add(features[featureNumber], goal, weight);
+ }
+
+ return true;
+ }
+
+ TLinearModel Solve(const double regularizationParameter = 0.1) const {
+ const TSLRSolverType* bestSolver = nullptr;
+ for (const TSLRSolverType& solver : SLRSolvers) {
+ if (!bestSolver || solver.SumSquaredErrors(regularizationParameter) < bestSolver->SumSquaredErrors(regularizationParameter)) {
+ bestSolver = &solver;
+ }
+ }
+
+ TVector<double> coefficients(SLRSolvers.size());
+ double intercept = 0.0;
+ if (bestSolver) {
+ bestSolver->Solve(coefficients[bestSolver - SLRSolvers.begin()], intercept, regularizationParameter);
+ }
+
+ TLinearModel model(std::move(coefficients), intercept);
+ return model;
+ }
+
+ double SumSquaredErrors(const double regularizationParameter = 0.1) const {
+ if (SLRSolvers.empty()) {
+ return 0.;
+ }
+
+ double sse = SLRSolvers.begin()->SumSquaredErrors(regularizationParameter);
+ for (const TSLRSolver& solver : SLRSolvers) {
+ sse = Min(solver.SumSquaredErrors(regularizationParameter), sse);
+ }
+ return sse;
+ }
+};
+
+using TFastBestSLRSolver = TTypedBestSLRSolver<TFastSLRSolver>;
+using TKahanBestSLRSolver = TTypedBestSLRSolver<TKahanSLRSolver>;
+using TBestSLRSolver = TTypedBestSLRSolver<TSLRSolver>;
+
+enum ETransformationType {
+ TT_IDENTITY,
+ TT_SIGMA,
+};
+
+struct TTransformationParameters {
+ double RegressionFactor = 1.;
+ double RegressionIntercept = 0.;
+
+ double FeatureOffset = 0.;
+ double FeatureNormalizer = 1.;
+
+ Y_SAVELOAD_DEFINE(RegressionFactor,
+ RegressionIntercept,
+ FeatureOffset,
+ FeatureNormalizer);
+};
+
+class TFeaturesTransformer {
+private:
+ ETransformationType TransformationType;
+ TTransformationParameters TransformationParameters;
+
+public:
+ Y_SAVELOAD_DEFINE(TransformationType, TransformationParameters);
+
+ TFeaturesTransformer() = default;
+
+ TFeaturesTransformer(const ETransformationType transformationType,
+ const TTransformationParameters transformationParameters)
+ : TransformationType(transformationType)
+ , TransformationParameters(transformationParameters)
+ {
+ }
+
+ double Transformation(const double value) const {
+ switch (TransformationType) {
+ case ETransformationType::TT_IDENTITY: {
+ return value;
+ }
+ case ETransformationType::TT_SIGMA: {
+ const double valueWithoutOffset = value - TransformationParameters.FeatureOffset;
+ const double transformedValue = valueWithoutOffset / (fabs(valueWithoutOffset) + TransformationParameters.FeatureNormalizer);
+ return TransformationParameters.RegressionIntercept + TransformationParameters.RegressionFactor * transformedValue;
+ }
+ }
+ Y_ASSERT(0);
+ return 0.;
+ }
+};
+
+class TFeaturesTransformerLearner {
+private:
+ struct TPoint {
+ float Argument;
+ float Target;
+ };
+
+ float MinimalArgument = Max<float>();
+ float MaximalArgument = Min<float>();
+
+ ETransformationType TransformationType;
+ TVector<TPoint> Points;
+
+public:
+ TFeaturesTransformerLearner(const ETransformationType transformationType)
+ : TransformationType(transformationType)
+ {
+ }
+
+ void Add(const float argument, const float target) {
+ Points.push_back(TPoint{argument, target});
+ MinimalArgument = Min(MinimalArgument, argument);
+ MaximalArgument = Max(MaximalArgument, argument);
+ }
+
+ TFeaturesTransformer Solve(const size_t iterationsCount = 100);
+};
+
+class TFastFeaturesTransformerLearner {
+private:
+ ETransformationType TransformationType;
+
+ struct TBucket {
+ TMeanCalculator ArgumentsMean;
+ TMeanCalculator TargetsMean;
+ };
+
+ THashMap<double, TBucket> Buckets;
+ double Step;
+
+public:
+ TFastFeaturesTransformerLearner(const ETransformationType transformationType, const double step = 0.1)
+ : TransformationType(transformationType)
+ , Step(step)
+ {
+ }
+
+ void Add(const float argument, const float target) {
+ TBucket& bucket = Buckets[round(argument / Step)];
+ bucket.ArgumentsMean.Add(argument);
+ bucket.TargetsMean.Add(target);
+ }
+
+ TFeaturesTransformer Solve(const size_t iterationsCount = 100) {
+ TFeaturesTransformerLearner learner(TransformationType);
+ for (auto&& argumentWithBucket : Buckets) {
+ const TBucket& bucket = argumentWithBucket.second;
+ learner.Add(bucket.ArgumentsMean.GetMean(), bucket.TargetsMean.GetMean());
+ }
+ return learner.Solve(iterationsCount);
+ }
+};