aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/linear_regression/linear_regression.h
diff options
context:
space:
mode:
authoralex-sh <alex-sh@yandex-team.ru>2022-02-10 16:50:03 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:50:03 +0300
commit3196904c9f5bf7aff7374eeadcb0671589581f61 (patch)
treed13114a178799aeb203a4b3b43dd7fb0c4f6975f /library/cpp/linear_regression/linear_regression.h
parentd154d11651ea533127249184148c3f023e2c6d0a (diff)
downloadydb-3196904c9f5bf7aff7374eeadcb0671589581f61.tar.gz
Restoring authorship annotation for <alex-sh@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/linear_regression/linear_regression.h')
-rw-r--r--library/cpp/linear_regression/linear_regression.h576
1 files changed, 288 insertions, 288 deletions
diff --git a/library/cpp/linear_regression/linear_regression.h b/library/cpp/linear_regression/linear_regression.h
index e57de5ff6c..44b0254d5d 100644
--- a/library/cpp/linear_regression/linear_regression.h
+++ b/library/cpp/linear_regression/linear_regression.h
@@ -1,342 +1,342 @@
-#pragma once
-
-#include "linear_model.h"
-#include "welford.h"
-
+#pragma once
+
+#include "linear_model.h"
+#include "welford.h"
+
#include <library/cpp/accurate_accumulate/accurate_accumulate.h>
-
-#include <util/generic/vector.h>
-#include <util/generic/hash.h>
+
+#include <util/generic/vector.h>
+#include <util/generic/hash.h>
#include <util/generic/ymath.h>
-
-class TFastLinearRegressionSolver {
-private:
- TKahanAccumulator<double> SumSquaredGoals;
-
+
+class TFastLinearRegressionSolver {
+private:
+ TKahanAccumulator<double> SumSquaredGoals;
+
TVector<double> LinearizedOLSMatrix;
TVector<double> OLSVector;
-public:
+public:
bool Add(const TVector<double>& features, const double goal, const double weight = 1.);
- TLinearModel Solve() const;
- double SumSquaredErrors() const;
-};
-
-class TLinearRegressionSolver {
-private:
- double GoalsMean = 0.;
- double GoalsDeviation = 0.;
-
+ TLinearModel Solve() const;
+ double SumSquaredErrors() const;
+};
+
+class TLinearRegressionSolver {
+private:
+ double GoalsMean = 0.;
+ double GoalsDeviation = 0.;
+
TVector<double> FeatureMeans;
TVector<double> LastMeans;
TVector<double> NewMeans;
TVector<double> LinearizedOLSMatrix;
-
+
TVector<double> OLSVector;
+
+ TKahanAccumulator<double> SumWeights;
- TKahanAccumulator<double> SumWeights;
-
-public:
+public:
bool Add(const TVector<double>& features, const double goal, const double weight = 1.);
- TLinearModel Solve() const;
- double SumSquaredErrors() const;
-};
-
-template <typename TStoreType>
-class TTypedFastSLRSolver {
-private:
- TStoreType SumFeatures = TStoreType();
- TStoreType SumSquaredFeatures = TStoreType();
-
- TStoreType SumGoals = TStoreType();
- TStoreType SumSquaredGoals = TStoreType();
-
- TStoreType SumProducts = TStoreType();
-
- TStoreType SumWeights = TStoreType();
-
-public:
+ TLinearModel Solve() const;
+ double SumSquaredErrors() const;
+};
+
+template <typename TStoreType>
+class TTypedFastSLRSolver {
+private:
+ TStoreType SumFeatures = TStoreType();
+ TStoreType SumSquaredFeatures = TStoreType();
+
+ TStoreType SumGoals = TStoreType();
+ TStoreType SumSquaredGoals = TStoreType();
+
+ TStoreType SumProducts = TStoreType();
+
+ TStoreType SumWeights = TStoreType();
+
+public:
bool Add(const double feature, const double goal, const double weight = 1.) {
- SumFeatures += feature * weight;
- SumSquaredFeatures += feature * feature * weight;
-
- SumGoals += goal * weight;
- SumSquaredGoals += goal * goal * weight;
-
- SumProducts += goal * feature * weight;
-
- SumWeights += weight;
+ SumFeatures += feature * weight;
+ SumSquaredFeatures += feature * feature * weight;
+
+ SumGoals += goal * weight;
+ SumSquaredGoals += goal * goal * weight;
+
+ SumProducts += goal * feature * weight;
+
+ SumWeights += weight;
return true;
- }
-
- template <typename TFloatType>
- void Solve(TFloatType& factor, TFloatType& intercept, const double regularizationParameter = 0.1) const {
+ }
+
+ template <typename TFloatType>
+ void Solve(TFloatType& factor, TFloatType& intercept, const double regularizationParameter = 0.1) const {
if (!(double)SumGoals) {
- factor = intercept = TFloatType();
- return;
- }
-
- double productsDeviation, featuresDeviation;
- SetupSolutionFactors(productsDeviation, featuresDeviation);
-
- if (!featuresDeviation) {
- factor = TFloatType();
+ factor = intercept = TFloatType();
+ return;
+ }
+
+ double productsDeviation, featuresDeviation;
+ SetupSolutionFactors(productsDeviation, featuresDeviation);
+
+ if (!featuresDeviation) {
+ factor = TFloatType();
intercept = (double)SumGoals / (double)SumWeights;
- return;
- }
-
- factor = productsDeviation / (featuresDeviation + regularizationParameter);
+ return;
+ }
+
+ factor = productsDeviation / (featuresDeviation + regularizationParameter);
intercept = (double)SumGoals / (double)SumWeights - factor * (double)SumFeatures / (double)SumWeights;
- }
-
- double SumSquaredErrors(const double regularizationParameter = 0.1) const {
+ }
+
+ double SumSquaredErrors(const double regularizationParameter = 0.1) const {
if (!(double)SumWeights) {
- return 0.;
- }
-
+ return 0.;
+ }
+
const double sumGoalSquaredDeviations = (double)SumSquaredGoals - (double)SumGoals / (double)SumWeights * (double)SumGoals;
-
- double productsDeviation, featuresDeviation;
- SetupSolutionFactors(productsDeviation, featuresDeviation);
- if (!featuresDeviation) {
- return sumGoalSquaredDeviations;
- }
-
- const double factor = productsDeviation / (featuresDeviation + regularizationParameter);
-
- const double sumSquaredErrors = factor * factor * featuresDeviation - 2 * factor * productsDeviation + sumGoalSquaredDeviations;
- return Max(0., sumSquaredErrors);
- }
-
-private:
- void SetupSolutionFactors(double& productsDeviation, double& featuresDeviation) const {
+
+ double productsDeviation, featuresDeviation;
+ SetupSolutionFactors(productsDeviation, featuresDeviation);
+ if (!featuresDeviation) {
+ return sumGoalSquaredDeviations;
+ }
+
+ const double factor = productsDeviation / (featuresDeviation + regularizationParameter);
+
+ const double sumSquaredErrors = factor * factor * featuresDeviation - 2 * factor * productsDeviation + sumGoalSquaredDeviations;
+ return Max(0., sumSquaredErrors);
+ }
+
+private:
+ void SetupSolutionFactors(double& productsDeviation, double& featuresDeviation) const {
if (!(double)SumWeights) {
- productsDeviation = featuresDeviation = 0.;
- return;
- }
-
+ productsDeviation = featuresDeviation = 0.;
+ return;
+ }
+
featuresDeviation = (double)SumSquaredFeatures - (double)SumFeatures / (double)SumWeights * (double)SumFeatures;
- if (!featuresDeviation) {
- return;
- }
+ if (!featuresDeviation) {
+ return;
+ }
productsDeviation = (double)SumProducts - (double)SumFeatures / (double)SumWeights * (double)SumGoals;
- }
-};
-
-using TFastSLRSolver = TTypedFastSLRSolver<double>;
+ }
+};
+
+using TFastSLRSolver = TTypedFastSLRSolver<double>;
using TKahanSLRSolver = TTypedFastSLRSolver<TKahanAccumulator<double>>;
-
-class TSLRSolver {
-private:
- double FeaturesMean = 0.;
- double FeaturesDeviation = 0.;
-
- double GoalsMean = 0.;
- double GoalsDeviation = 0.;
-
- TKahanAccumulator<double> SumWeights;
-
- double Covariation = 0.;
-
-public:
+
+class TSLRSolver {
+private:
+ double FeaturesMean = 0.;
+ double FeaturesDeviation = 0.;
+
+ double GoalsMean = 0.;
+ double GoalsDeviation = 0.;
+
+ TKahanAccumulator<double> SumWeights;
+
+ double Covariation = 0.;
+
+public:
bool Add(const double feature, const double goal, const double weight = 1.);
-
- bool Add(const double* featuresBegin, const double* featuresEnd, const double* goalsBegin);
- bool Add(const double* featuresBegin, const double* featuresEnd, const double* goalsBegin, const double* weightsBegin);
-
+
+ bool Add(const double* featuresBegin, const double* featuresEnd, const double* goalsBegin);
+ bool Add(const double* featuresBegin, const double* featuresEnd, const double* goalsBegin, const double* weightsBegin);
+
bool Add(const TVector<double>& features, const TVector<double>& goals) {
Y_ASSERT(features.size() == goals.size());
return Add(features.data(), features.data() + features.size(), goals.data());
- }
-
+ }
+
bool Add(const TVector<double>& features, const TVector<double>& goals, const TVector<double>& weights) {
Y_ASSERT(features.size() == goals.size() && features.size() == weights.size());
return Add(features.data(), features.data() + features.size(), goals.data(), weights.data());
- }
-
- template <typename TFloatType>
- void Solve(TFloatType& factor, TFloatType& intercept, const double regularizationParameter = 0.1) const {
- if (!FeaturesDeviation) {
- factor = 0.;
- intercept = GoalsMean;
- return;
- }
-
- factor = Covariation / (FeaturesDeviation + regularizationParameter);
- intercept = GoalsMean - factor * FeaturesMean;
- }
-
- double SumSquaredErrors(const double regularizationParameter = 0.1) const;
-
- double GetSumWeights() const {
- return SumWeights.Get();
- }
-};
-
-template <typename TSLRSolverType>
-class TTypedBestSLRSolver {
-private:
+ }
+
+ template <typename TFloatType>
+ void Solve(TFloatType& factor, TFloatType& intercept, const double regularizationParameter = 0.1) const {
+ if (!FeaturesDeviation) {
+ factor = 0.;
+ intercept = GoalsMean;
+ return;
+ }
+
+ factor = Covariation / (FeaturesDeviation + regularizationParameter);
+ intercept = GoalsMean - factor * FeaturesMean;
+ }
+
+ double SumSquaredErrors(const double regularizationParameter = 0.1) const;
+
+ double GetSumWeights() const {
+ return SumWeights.Get();
+ }
+};
+
+template <typename TSLRSolverType>
+class TTypedBestSLRSolver {
+private:
TVector<TSLRSolverType> SLRSolvers;
-public:
+public:
bool Add(const TVector<double>& features, const double goal, const double weight = 1.) {
- if (SLRSolvers.empty()) {
- SLRSolvers.resize(features.size());
- }
-
- for (size_t featureNumber = 0; featureNumber < features.size(); ++featureNumber) {
- SLRSolvers[featureNumber].Add(features[featureNumber], goal, weight);
- }
+ if (SLRSolvers.empty()) {
+ SLRSolvers.resize(features.size());
+ }
+
+ for (size_t featureNumber = 0; featureNumber < features.size(); ++featureNumber) {
+ SLRSolvers[featureNumber].Add(features[featureNumber], goal, weight);
+ }
return true;
- }
-
- TLinearModel Solve(const double regularizationParameter = 0.1) const {
- const TSLRSolverType* bestSolver = nullptr;
- for (const TSLRSolverType& solver : SLRSolvers) {
- if (!bestSolver || solver.SumSquaredErrors(regularizationParameter) < bestSolver->SumSquaredErrors(regularizationParameter)) {
- bestSolver = &solver;
- }
- }
-
+ }
+
+ TLinearModel Solve(const double regularizationParameter = 0.1) const {
+ const TSLRSolverType* bestSolver = nullptr;
+ for (const TSLRSolverType& solver : SLRSolvers) {
+ if (!bestSolver || solver.SumSquaredErrors(regularizationParameter) < bestSolver->SumSquaredErrors(regularizationParameter)) {
+ bestSolver = &solver;
+ }
+ }
+
TVector<double> coefficients(SLRSolvers.size());
double intercept = 0.0;
- if (bestSolver) {
+ if (bestSolver) {
bestSolver->Solve(coefficients[bestSolver - SLRSolvers.begin()], intercept, regularizationParameter);
- }
-
+ }
+
TLinearModel model(std::move(coefficients), intercept);
- return model;
- }
-
- double SumSquaredErrors(const double regularizationParameter = 0.1) const {
- if (SLRSolvers.empty()) {
- return 0.;
- }
-
- double sse = SLRSolvers.begin()->SumSquaredErrors(regularizationParameter);
- for (const TSLRSolver& solver : SLRSolvers) {
- sse = Min(solver.SumSquaredErrors(regularizationParameter), sse);
- }
- return sse;
- }
-};
-
-using TFastBestSLRSolver = TTypedBestSLRSolver<TFastSLRSolver>;
-using TKahanBestSLRSolver = TTypedBestSLRSolver<TKahanSLRSolver>;
-using TBestSLRSolver = TTypedBestSLRSolver<TSLRSolver>;
-
-enum ETransformationType {
- TT_IDENTITY,
- TT_SIGMA,
-};
-
-struct TTransformationParameters {
- double RegressionFactor = 1.;
- double RegressionIntercept = 0.;
-
- double FeatureOffset = 0.;
- double FeatureNormalizer = 1.;
-
+ return model;
+ }
+
+ double SumSquaredErrors(const double regularizationParameter = 0.1) const {
+ if (SLRSolvers.empty()) {
+ return 0.;
+ }
+
+ double sse = SLRSolvers.begin()->SumSquaredErrors(regularizationParameter);
+ for (const TSLRSolver& solver : SLRSolvers) {
+ sse = Min(solver.SumSquaredErrors(regularizationParameter), sse);
+ }
+ return sse;
+ }
+};
+
+using TFastBestSLRSolver = TTypedBestSLRSolver<TFastSLRSolver>;
+using TKahanBestSLRSolver = TTypedBestSLRSolver<TKahanSLRSolver>;
+using TBestSLRSolver = TTypedBestSLRSolver<TSLRSolver>;
+
+enum ETransformationType {
+ TT_IDENTITY,
+ TT_SIGMA,
+};
+
+struct TTransformationParameters {
+ double RegressionFactor = 1.;
+ double RegressionIntercept = 0.;
+
+ double FeatureOffset = 0.;
+ double FeatureNormalizer = 1.;
+
Y_SAVELOAD_DEFINE(RegressionFactor,
RegressionIntercept,
FeatureOffset,
FeatureNormalizer);
-};
-
-class TFeaturesTransformer {
-private:
- ETransformationType TransformationType;
- TTransformationParameters TransformationParameters;
-
-public:
+};
+
+class TFeaturesTransformer {
+private:
+ ETransformationType TransformationType;
+ TTransformationParameters TransformationParameters;
+
+public:
Y_SAVELOAD_DEFINE(TransformationType, TransformationParameters);
-
+
TFeaturesTransformer() = default;
-
- TFeaturesTransformer(const ETransformationType transformationType,
- const TTransformationParameters transformationParameters)
- : TransformationType(transformationType)
- , TransformationParameters(transformationParameters)
- {
- }
-
- double Transformation(const double value) const {
- switch (TransformationType) {
+
+ TFeaturesTransformer(const ETransformationType transformationType,
+ const TTransformationParameters transformationParameters)
+ : TransformationType(transformationType)
+ , TransformationParameters(transformationParameters)
+ {
+ }
+
+ double Transformation(const double value) const {
+ switch (TransformationType) {
case ETransformationType::TT_IDENTITY: {
- return value;
- }
+ return value;
+ }
case ETransformationType::TT_SIGMA: {
- const double valueWithoutOffset = value - TransformationParameters.FeatureOffset;
- const double transformedValue = valueWithoutOffset / (fabs(valueWithoutOffset) + TransformationParameters.FeatureNormalizer);
- return TransformationParameters.RegressionIntercept + TransformationParameters.RegressionFactor * transformedValue;
- }
- }
+ const double valueWithoutOffset = value - TransformationParameters.FeatureOffset;
+ const double transformedValue = valueWithoutOffset / (fabs(valueWithoutOffset) + TransformationParameters.FeatureNormalizer);
+ return TransformationParameters.RegressionIntercept + TransformationParameters.RegressionFactor * transformedValue;
+ }
+ }
Y_ASSERT(0);
- return 0.;
- }
-};
-
-class TFeaturesTransformerLearner {
-private:
- struct TPoint {
- float Argument;
- float Target;
- };
-
- float MinimalArgument = Max<float>();
- float MaximalArgument = Min<float>();
-
- ETransformationType TransformationType;
+ return 0.;
+ }
+};
+
+class TFeaturesTransformerLearner {
+private:
+ struct TPoint {
+ float Argument;
+ float Target;
+ };
+
+ float MinimalArgument = Max<float>();
+ float MaximalArgument = Min<float>();
+
+ ETransformationType TransformationType;
TVector<TPoint> Points;
-public:
- TFeaturesTransformerLearner(const ETransformationType transformationType)
- : TransformationType(transformationType)
- {
- }
-
- void Add(const float argument, const float target) {
- Points.push_back(TPoint{argument, target});
- MinimalArgument = Min(MinimalArgument, argument);
- MaximalArgument = Max(MaximalArgument, argument);
- }
-
- TFeaturesTransformer Solve(const size_t iterationsCount = 100);
-};
-
-class TFastFeaturesTransformerLearner {
-private:
- ETransformationType TransformationType;
-
- struct TBucket {
- TMeanCalculator ArgumentsMean;
- TMeanCalculator TargetsMean;
- };
-
+public:
+ TFeaturesTransformerLearner(const ETransformationType transformationType)
+ : TransformationType(transformationType)
+ {
+ }
+
+ void Add(const float argument, const float target) {
+ Points.push_back(TPoint{argument, target});
+ MinimalArgument = Min(MinimalArgument, argument);
+ MaximalArgument = Max(MaximalArgument, argument);
+ }
+
+ TFeaturesTransformer Solve(const size_t iterationsCount = 100);
+};
+
+class TFastFeaturesTransformerLearner {
+private:
+ ETransformationType TransformationType;
+
+ struct TBucket {
+ TMeanCalculator ArgumentsMean;
+ TMeanCalculator TargetsMean;
+ };
+
THashMap<double, TBucket> Buckets;
- double Step;
-
-public:
- TFastFeaturesTransformerLearner(const ETransformationType transformationType, const double step = 0.1)
- : TransformationType(transformationType)
- , Step(step)
- {
- }
-
- void Add(const float argument, const float target) {
- TBucket& bucket = Buckets[round(argument / Step)];
- bucket.ArgumentsMean.Add(argument);
- bucket.TargetsMean.Add(target);
- }
-
- TFeaturesTransformer Solve(const size_t iterationsCount = 100) {
- TFeaturesTransformerLearner learner(TransformationType);
- for (auto&& argumentWithBucket : Buckets) {
+ double Step;
+
+public:
+ TFastFeaturesTransformerLearner(const ETransformationType transformationType, const double step = 0.1)
+ : TransformationType(transformationType)
+ , Step(step)
+ {
+ }
+
+ void Add(const float argument, const float target) {
+ TBucket& bucket = Buckets[round(argument / Step)];
+ bucket.ArgumentsMean.Add(argument);
+ bucket.TargetsMean.Add(target);
+ }
+
+ TFeaturesTransformer Solve(const size_t iterationsCount = 100) {
+ TFeaturesTransformerLearner learner(TransformationType);
+ for (auto&& argumentWithBucket : Buckets) {
const TBucket& bucket = argumentWithBucket.second;
- learner.Add(bucket.ArgumentsMean.GetMean(), bucket.TargetsMean.GetMean());
- }
- return learner.Solve(iterationsCount);
- }
-};
+ learner.Add(bucket.ArgumentsMean.GetMean(), bucket.TargetsMean.GetMean());
+ }
+ return learner.Solve(iterationsCount);
+ }
+};