diff options
author | alex-sh <alex-sh@yandex-team.ru> | 2022-02-10 16:50:03 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:50:03 +0300 |
commit | 3196904c9f5bf7aff7374eeadcb0671589581f61 (patch) | |
tree | d13114a178799aeb203a4b3b43dd7fb0c4f6975f /library/cpp/linear_regression/linear_regression.h | |
parent | d154d11651ea533127249184148c3f023e2c6d0a (diff) | |
download | ydb-3196904c9f5bf7aff7374eeadcb0671589581f61.tar.gz |
Restoring authorship annotation for <alex-sh@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/linear_regression/linear_regression.h')
-rw-r--r-- | library/cpp/linear_regression/linear_regression.h | 576 |
1 files changed, 288 insertions, 288 deletions
diff --git a/library/cpp/linear_regression/linear_regression.h b/library/cpp/linear_regression/linear_regression.h index e57de5ff6c..44b0254d5d 100644 --- a/library/cpp/linear_regression/linear_regression.h +++ b/library/cpp/linear_regression/linear_regression.h @@ -1,342 +1,342 @@ -#pragma once - -#include "linear_model.h" -#include "welford.h" - +#pragma once + +#include "linear_model.h" +#include "welford.h" + #include <library/cpp/accurate_accumulate/accurate_accumulate.h> - -#include <util/generic/vector.h> -#include <util/generic/hash.h> + +#include <util/generic/vector.h> +#include <util/generic/hash.h> #include <util/generic/ymath.h> - -class TFastLinearRegressionSolver { -private: - TKahanAccumulator<double> SumSquaredGoals; - + +class TFastLinearRegressionSolver { +private: + TKahanAccumulator<double> SumSquaredGoals; + TVector<double> LinearizedOLSMatrix; TVector<double> OLSVector; -public: +public: bool Add(const TVector<double>& features, const double goal, const double weight = 1.); - TLinearModel Solve() const; - double SumSquaredErrors() const; -}; - -class TLinearRegressionSolver { -private: - double GoalsMean = 0.; - double GoalsDeviation = 0.; - + TLinearModel Solve() const; + double SumSquaredErrors() const; +}; + +class TLinearRegressionSolver { +private: + double GoalsMean = 0.; + double GoalsDeviation = 0.; + TVector<double> FeatureMeans; TVector<double> LastMeans; TVector<double> NewMeans; TVector<double> LinearizedOLSMatrix; - + TVector<double> OLSVector; + + TKahanAccumulator<double> SumWeights; - TKahanAccumulator<double> SumWeights; - -public: +public: bool Add(const TVector<double>& features, const double goal, const double weight = 1.); - TLinearModel Solve() const; - double SumSquaredErrors() const; -}; - -template <typename TStoreType> -class TTypedFastSLRSolver { -private: - TStoreType SumFeatures = TStoreType(); - TStoreType SumSquaredFeatures = TStoreType(); - - TStoreType SumGoals = TStoreType(); - TStoreType SumSquaredGoals = TStoreType(); - - TStoreType SumProducts = TStoreType(); - - TStoreType SumWeights = TStoreType(); - -public: + TLinearModel Solve() const; + double SumSquaredErrors() const; +}; + +template <typename TStoreType> +class TTypedFastSLRSolver { +private: + TStoreType SumFeatures = TStoreType(); + TStoreType SumSquaredFeatures = TStoreType(); + + TStoreType SumGoals = TStoreType(); + TStoreType SumSquaredGoals = TStoreType(); + + TStoreType SumProducts = TStoreType(); + + TStoreType SumWeights = TStoreType(); + +public: bool Add(const double feature, const double goal, const double weight = 1.) { - SumFeatures += feature * weight; - SumSquaredFeatures += feature * feature * weight; - - SumGoals += goal * weight; - SumSquaredGoals += goal * goal * weight; - - SumProducts += goal * feature * weight; - - SumWeights += weight; + SumFeatures += feature * weight; + SumSquaredFeatures += feature * feature * weight; + + SumGoals += goal * weight; + SumSquaredGoals += goal * goal * weight; + + SumProducts += goal * feature * weight; + + SumWeights += weight; return true; - } - - template <typename TFloatType> - void Solve(TFloatType& factor, TFloatType& intercept, const double regularizationParameter = 0.1) const { + } + + template <typename TFloatType> + void Solve(TFloatType& factor, TFloatType& intercept, const double regularizationParameter = 0.1) const { if (!(double)SumGoals) { - factor = intercept = TFloatType(); - return; - } - - double productsDeviation, featuresDeviation; - SetupSolutionFactors(productsDeviation, featuresDeviation); - - if (!featuresDeviation) { - factor = TFloatType(); + factor = intercept = TFloatType(); + return; + } + + double productsDeviation, featuresDeviation; + SetupSolutionFactors(productsDeviation, featuresDeviation); + + if (!featuresDeviation) { + factor = TFloatType(); intercept = (double)SumGoals / (double)SumWeights; - return; - } - - factor = productsDeviation / (featuresDeviation + regularizationParameter); + return; + } + + factor = productsDeviation / (featuresDeviation + regularizationParameter); intercept = (double)SumGoals / (double)SumWeights - factor * (double)SumFeatures / (double)SumWeights; - } - - double SumSquaredErrors(const double regularizationParameter = 0.1) const { + } + + double SumSquaredErrors(const double regularizationParameter = 0.1) const { if (!(double)SumWeights) { - return 0.; - } - + return 0.; + } + const double sumGoalSquaredDeviations = (double)SumSquaredGoals - (double)SumGoals / (double)SumWeights * (double)SumGoals; - - double productsDeviation, featuresDeviation; - SetupSolutionFactors(productsDeviation, featuresDeviation); - if (!featuresDeviation) { - return sumGoalSquaredDeviations; - } - - const double factor = productsDeviation / (featuresDeviation + regularizationParameter); - - const double sumSquaredErrors = factor * factor * featuresDeviation - 2 * factor * productsDeviation + sumGoalSquaredDeviations; - return Max(0., sumSquaredErrors); - } - -private: - void SetupSolutionFactors(double& productsDeviation, double& featuresDeviation) const { + + double productsDeviation, featuresDeviation; + SetupSolutionFactors(productsDeviation, featuresDeviation); + if (!featuresDeviation) { + return sumGoalSquaredDeviations; + } + + const double factor = productsDeviation / (featuresDeviation + regularizationParameter); + + const double sumSquaredErrors = factor * factor * featuresDeviation - 2 * factor * productsDeviation + sumGoalSquaredDeviations; + return Max(0., sumSquaredErrors); + } + +private: + void SetupSolutionFactors(double& productsDeviation, double& featuresDeviation) const { if (!(double)SumWeights) { - productsDeviation = featuresDeviation = 0.; - return; - } - + productsDeviation = featuresDeviation = 0.; + return; + } + featuresDeviation = (double)SumSquaredFeatures - (double)SumFeatures / (double)SumWeights * (double)SumFeatures; - if (!featuresDeviation) { - return; - } + if (!featuresDeviation) { + return; + } productsDeviation = (double)SumProducts - (double)SumFeatures / (double)SumWeights * (double)SumGoals; - } -}; - -using TFastSLRSolver = TTypedFastSLRSolver<double>; + } +}; + +using TFastSLRSolver = TTypedFastSLRSolver<double>; using TKahanSLRSolver = TTypedFastSLRSolver<TKahanAccumulator<double>>; - -class TSLRSolver { -private: - double FeaturesMean = 0.; - double FeaturesDeviation = 0.; - - double GoalsMean = 0.; - double GoalsDeviation = 0.; - - TKahanAccumulator<double> SumWeights; - - double Covariation = 0.; - -public: + +class TSLRSolver { +private: + double FeaturesMean = 0.; + double FeaturesDeviation = 0.; + + double GoalsMean = 0.; + double GoalsDeviation = 0.; + + TKahanAccumulator<double> SumWeights; + + double Covariation = 0.; + +public: bool Add(const double feature, const double goal, const double weight = 1.); - - bool Add(const double* featuresBegin, const double* featuresEnd, const double* goalsBegin); - bool Add(const double* featuresBegin, const double* featuresEnd, const double* goalsBegin, const double* weightsBegin); - + + bool Add(const double* featuresBegin, const double* featuresEnd, const double* goalsBegin); + bool Add(const double* featuresBegin, const double* featuresEnd, const double* goalsBegin, const double* weightsBegin); + bool Add(const TVector<double>& features, const TVector<double>& goals) { Y_ASSERT(features.size() == goals.size()); return Add(features.data(), features.data() + features.size(), goals.data()); - } - + } + bool Add(const TVector<double>& features, const TVector<double>& goals, const TVector<double>& weights) { Y_ASSERT(features.size() == goals.size() && features.size() == weights.size()); return Add(features.data(), features.data() + features.size(), goals.data(), weights.data()); - } - - template <typename TFloatType> - void Solve(TFloatType& factor, TFloatType& intercept, const double regularizationParameter = 0.1) const { - if (!FeaturesDeviation) { - factor = 0.; - intercept = GoalsMean; - return; - } - - factor = Covariation / (FeaturesDeviation + regularizationParameter); - intercept = GoalsMean - factor * FeaturesMean; - } - - double SumSquaredErrors(const double regularizationParameter = 0.1) const; - - double GetSumWeights() const { - return SumWeights.Get(); - } -}; - -template <typename TSLRSolverType> -class TTypedBestSLRSolver { -private: + } + + template <typename TFloatType> + void Solve(TFloatType& factor, TFloatType& intercept, const double regularizationParameter = 0.1) const { + if (!FeaturesDeviation) { + factor = 0.; + intercept = GoalsMean; + return; + } + + factor = Covariation / (FeaturesDeviation + regularizationParameter); + intercept = GoalsMean - factor * FeaturesMean; + } + + double SumSquaredErrors(const double regularizationParameter = 0.1) const; + + double GetSumWeights() const { + return SumWeights.Get(); + } +}; + +template <typename TSLRSolverType> +class TTypedBestSLRSolver { +private: TVector<TSLRSolverType> SLRSolvers; -public: +public: bool Add(const TVector<double>& features, const double goal, const double weight = 1.) { - if (SLRSolvers.empty()) { - SLRSolvers.resize(features.size()); - } - - for (size_t featureNumber = 0; featureNumber < features.size(); ++featureNumber) { - SLRSolvers[featureNumber].Add(features[featureNumber], goal, weight); - } + if (SLRSolvers.empty()) { + SLRSolvers.resize(features.size()); + } + + for (size_t featureNumber = 0; featureNumber < features.size(); ++featureNumber) { + SLRSolvers[featureNumber].Add(features[featureNumber], goal, weight); + } return true; - } - - TLinearModel Solve(const double regularizationParameter = 0.1) const { - const TSLRSolverType* bestSolver = nullptr; - for (const TSLRSolverType& solver : SLRSolvers) { - if (!bestSolver || solver.SumSquaredErrors(regularizationParameter) < bestSolver->SumSquaredErrors(regularizationParameter)) { - bestSolver = &solver; - } - } - + } + + TLinearModel Solve(const double regularizationParameter = 0.1) const { + const TSLRSolverType* bestSolver = nullptr; + for (const TSLRSolverType& solver : SLRSolvers) { + if (!bestSolver || solver.SumSquaredErrors(regularizationParameter) < bestSolver->SumSquaredErrors(regularizationParameter)) { + bestSolver = &solver; + } + } + TVector<double> coefficients(SLRSolvers.size()); double intercept = 0.0; - if (bestSolver) { + if (bestSolver) { bestSolver->Solve(coefficients[bestSolver - SLRSolvers.begin()], intercept, regularizationParameter); - } - + } + TLinearModel model(std::move(coefficients), intercept); - return model; - } - - double SumSquaredErrors(const double regularizationParameter = 0.1) const { - if (SLRSolvers.empty()) { - return 0.; - } - - double sse = SLRSolvers.begin()->SumSquaredErrors(regularizationParameter); - for (const TSLRSolver& solver : SLRSolvers) { - sse = Min(solver.SumSquaredErrors(regularizationParameter), sse); - } - return sse; - } -}; - -using TFastBestSLRSolver = TTypedBestSLRSolver<TFastSLRSolver>; -using TKahanBestSLRSolver = TTypedBestSLRSolver<TKahanSLRSolver>; -using TBestSLRSolver = TTypedBestSLRSolver<TSLRSolver>; - -enum ETransformationType { - TT_IDENTITY, - TT_SIGMA, -}; - -struct TTransformationParameters { - double RegressionFactor = 1.; - double RegressionIntercept = 0.; - - double FeatureOffset = 0.; - double FeatureNormalizer = 1.; - + return model; + } + + double SumSquaredErrors(const double regularizationParameter = 0.1) const { + if (SLRSolvers.empty()) { + return 0.; + } + + double sse = SLRSolvers.begin()->SumSquaredErrors(regularizationParameter); + for (const TSLRSolver& solver : SLRSolvers) { + sse = Min(solver.SumSquaredErrors(regularizationParameter), sse); + } + return sse; + } +}; + +using TFastBestSLRSolver = TTypedBestSLRSolver<TFastSLRSolver>; +using TKahanBestSLRSolver = TTypedBestSLRSolver<TKahanSLRSolver>; +using TBestSLRSolver = TTypedBestSLRSolver<TSLRSolver>; + +enum ETransformationType { + TT_IDENTITY, + TT_SIGMA, +}; + +struct TTransformationParameters { + double RegressionFactor = 1.; + double RegressionIntercept = 0.; + + double FeatureOffset = 0.; + double FeatureNormalizer = 1.; + Y_SAVELOAD_DEFINE(RegressionFactor, RegressionIntercept, FeatureOffset, FeatureNormalizer); -}; - -class TFeaturesTransformer { -private: - ETransformationType TransformationType; - TTransformationParameters TransformationParameters; - -public: +}; + +class TFeaturesTransformer { +private: + ETransformationType TransformationType; + TTransformationParameters TransformationParameters; + +public: Y_SAVELOAD_DEFINE(TransformationType, TransformationParameters); - + TFeaturesTransformer() = default; - - TFeaturesTransformer(const ETransformationType transformationType, - const TTransformationParameters transformationParameters) - : TransformationType(transformationType) - , TransformationParameters(transformationParameters) - { - } - - double Transformation(const double value) const { - switch (TransformationType) { + + TFeaturesTransformer(const ETransformationType transformationType, + const TTransformationParameters transformationParameters) + : TransformationType(transformationType) + , TransformationParameters(transformationParameters) + { + } + + double Transformation(const double value) const { + switch (TransformationType) { case ETransformationType::TT_IDENTITY: { - return value; - } + return value; + } case ETransformationType::TT_SIGMA: { - const double valueWithoutOffset = value - TransformationParameters.FeatureOffset; - const double transformedValue = valueWithoutOffset / (fabs(valueWithoutOffset) + TransformationParameters.FeatureNormalizer); - return TransformationParameters.RegressionIntercept + TransformationParameters.RegressionFactor * transformedValue; - } - } + const double valueWithoutOffset = value - TransformationParameters.FeatureOffset; + const double transformedValue = valueWithoutOffset / (fabs(valueWithoutOffset) + TransformationParameters.FeatureNormalizer); + return TransformationParameters.RegressionIntercept + TransformationParameters.RegressionFactor * transformedValue; + } + } Y_ASSERT(0); - return 0.; - } -}; - -class TFeaturesTransformerLearner { -private: - struct TPoint { - float Argument; - float Target; - }; - - float MinimalArgument = Max<float>(); - float MaximalArgument = Min<float>(); - - ETransformationType TransformationType; + return 0.; + } +}; + +class TFeaturesTransformerLearner { +private: + struct TPoint { + float Argument; + float Target; + }; + + float MinimalArgument = Max<float>(); + float MaximalArgument = Min<float>(); + + ETransformationType TransformationType; TVector<TPoint> Points; -public: - TFeaturesTransformerLearner(const ETransformationType transformationType) - : TransformationType(transformationType) - { - } - - void Add(const float argument, const float target) { - Points.push_back(TPoint{argument, target}); - MinimalArgument = Min(MinimalArgument, argument); - MaximalArgument = Max(MaximalArgument, argument); - } - - TFeaturesTransformer Solve(const size_t iterationsCount = 100); -}; - -class TFastFeaturesTransformerLearner { -private: - ETransformationType TransformationType; - - struct TBucket { - TMeanCalculator ArgumentsMean; - TMeanCalculator TargetsMean; - }; - +public: + TFeaturesTransformerLearner(const ETransformationType transformationType) + : TransformationType(transformationType) + { + } + + void Add(const float argument, const float target) { + Points.push_back(TPoint{argument, target}); + MinimalArgument = Min(MinimalArgument, argument); + MaximalArgument = Max(MaximalArgument, argument); + } + + TFeaturesTransformer Solve(const size_t iterationsCount = 100); +}; + +class TFastFeaturesTransformerLearner { +private: + ETransformationType TransformationType; + + struct TBucket { + TMeanCalculator ArgumentsMean; + TMeanCalculator TargetsMean; + }; + THashMap<double, TBucket> Buckets; - double Step; - -public: - TFastFeaturesTransformerLearner(const ETransformationType transformationType, const double step = 0.1) - : TransformationType(transformationType) - , Step(step) - { - } - - void Add(const float argument, const float target) { - TBucket& bucket = Buckets[round(argument / Step)]; - bucket.ArgumentsMean.Add(argument); - bucket.TargetsMean.Add(target); - } - - TFeaturesTransformer Solve(const size_t iterationsCount = 100) { - TFeaturesTransformerLearner learner(TransformationType); - for (auto&& argumentWithBucket : Buckets) { + double Step; + +public: + TFastFeaturesTransformerLearner(const ETransformationType transformationType, const double step = 0.1) + : TransformationType(transformationType) + , Step(step) + { + } + + void Add(const float argument, const float target) { + TBucket& bucket = Buckets[round(argument / Step)]; + bucket.ArgumentsMean.Add(argument); + bucket.TargetsMean.Add(target); + } + + TFeaturesTransformer Solve(const size_t iterationsCount = 100) { + TFeaturesTransformerLearner learner(TransformationType); + for (auto&& argumentWithBucket : Buckets) { const TBucket& bucket = argumentWithBucket.second; - learner.Add(bucket.ArgumentsMean.GetMean(), bucket.TargetsMean.GetMean()); - } - return learner.Solve(iterationsCount); - } -}; + learner.Add(bucket.ArgumentsMean.GetMean(), bucket.TargetsMean.GetMean()); + } + return learner.Solve(iterationsCount); + } +}; |