diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/accurate_accumulate | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/accurate_accumulate')
7 files changed, 366 insertions, 0 deletions
diff --git a/library/cpp/accurate_accumulate/accurate_accumulate.cpp b/library/cpp/accurate_accumulate/accurate_accumulate.cpp new file mode 100644 index 0000000000..20928f81f4 --- /dev/null +++ b/library/cpp/accurate_accumulate/accurate_accumulate.cpp @@ -0,0 +1 @@ +#include "accurate_accumulate.h" diff --git a/library/cpp/accurate_accumulate/accurate_accumulate.h b/library/cpp/accurate_accumulate/accurate_accumulate.h new file mode 100644 index 0000000000..dacced17e9 --- /dev/null +++ b/library/cpp/accurate_accumulate/accurate_accumulate.h @@ -0,0 +1,221 @@ +#pragma once + +#include <util/ysaveload.h> +#include <util/generic/vector.h> +#include <util/system/yassert.h> + +//! See more details here http://en.wikipedia.org/wiki/Kahan_summation_algorithm +template <typename TAccumulateType> +class TKahanAccumulator { +public: + using TValueType = TAccumulateType; + + template <typename TFloatType> + explicit TKahanAccumulator(const TFloatType x) + : Sum_(x) + , Compensation_() + { + } + + TKahanAccumulator() + : Sum_() + , Compensation_() + { + } + + template <typename TFloatType> + TKahanAccumulator& operator=(const TFloatType& rhs) { + Sum_ = TValueType(rhs); + Compensation_ = TValueType(); + return *this; + } + + TValueType Get() const { + return Sum_ + Compensation_; + } + + template <typename TFloatType> + inline operator TFloatType() const { + return Get(); + } + + template <typename TFloatType> + inline bool operator<(const TKahanAccumulator<TFloatType>& other) const { + return Get() < other.Get(); + } + + template <typename TFloatType> + inline bool operator<=(const TKahanAccumulator<TFloatType>& other) const { + return !(other < *this); + } + + template <typename TFloatType> + inline bool operator>(const TKahanAccumulator<TFloatType>& other) const { + return other < *this; + } + + template <typename TFloatType> + inline bool operator>=(const TKahanAccumulator<TFloatType>& other) const { + return !(*this < other); + } + + template <typename TFloatType> + inline TKahanAccumulator& operator+=(const TFloatType x) { + const TValueType y = TValueType(x) - Compensation_; + const TValueType t = Sum_ + y; + Compensation_ = (t - Sum_) - y; + Sum_ = t; + return *this; + } + + template <typename TFloatType> + inline TKahanAccumulator& operator-=(const TFloatType x) { + return *this += -TValueType(x); + } + + template <typename TFloatType> + inline TKahanAccumulator& operator*=(const TFloatType x) { + return *this = TValueType(*this) * TValueType(x); + } + + template <typename TFloatType> + inline TKahanAccumulator& operator/=(const TFloatType x) { + return *this = TValueType(*this) / TValueType(x); + } + + Y_SAVELOAD_DEFINE(Sum_, Compensation_) + +private: + TValueType Sum_; + TValueType Compensation_; +}; + +template <typename TAccumulateType, typename TFloatType> +inline const TKahanAccumulator<TAccumulateType> +operator+(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) { + return lhs += rhs; +} + +template <typename TAccumulateType, typename TFloatType> +inline const TKahanAccumulator<TAccumulateType> +operator-(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) { + return lhs -= rhs; +} + +template <typename TAccumulateType, typename TFloatType> +inline const TKahanAccumulator<TAccumulateType> +operator*(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) { + return lhs *= rhs; +} + +template <typename TAccumulateType, typename TFloatType> +inline const TKahanAccumulator<TAccumulateType> +operator/(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) { + return lhs /= rhs; +} + +template <typename TAccumulatorType, typename It> +static inline TAccumulatorType TypedFastAccumulate(It begin, It end) { + TAccumulatorType accumulator = TAccumulatorType(); + + for (; begin + 15 < end; begin += 16) { + accumulator += *(begin + 0) + + *(begin + 1) + + *(begin + 2) + + *(begin + 3) + + *(begin + 4) + + *(begin + 5) + + *(begin + 6) + + *(begin + 7) + + *(begin + 8) + + *(begin + 9) + + *(begin + 10) + + *(begin + 11) + + *(begin + 12) + + *(begin + 13) + + *(begin + 14) + + *(begin + 15); + } + for (; begin != end; ++begin) { + accumulator += *begin; + } + + return accumulator; +} + +template <class TOperation, typename TAccumulatorType, typename It1, typename It2> +static inline TAccumulatorType TypedFastInnerOperation(It1 begin1, It1 end1, It2 begin2) { + TAccumulatorType accumulator = TAccumulatorType(); + + const TOperation op; + for (; begin1 + 15 < end1; begin1 += 16, begin2 += 16) { + accumulator += op(*(begin1 + 0), *(begin2 + 0)) + + op(*(begin1 + 1), *(begin2 + 1)) + + op(*(begin1 + 2), *(begin2 + 2)) + + op(*(begin1 + 3), *(begin2 + 3)) + + op(*(begin1 + 4), *(begin2 + 4)) + + op(*(begin1 + 5), *(begin2 + 5)) + + op(*(begin1 + 6), *(begin2 + 6)) + + op(*(begin1 + 7), *(begin2 + 7)) + + op(*(begin1 + 8), *(begin2 + 8)) + + op(*(begin1 + 9), *(begin2 + 9)) + + op(*(begin1 + 10), *(begin2 + 10)) + + op(*(begin1 + 11), *(begin2 + 11)) + + op(*(begin1 + 12), *(begin2 + 12)) + + op(*(begin1 + 13), *(begin2 + 13)) + + op(*(begin1 + 14), *(begin2 + 14)) + + op(*(begin1 + 15), *(begin2 + 15)); + } + for (; begin1 != end1; ++begin1, ++begin2) { + accumulator += op(*begin1, *begin2); + } + + return accumulator; +} + +template <typename TAccumulatorType, typename It1, typename It2> +static inline TAccumulatorType TypedFastInnerProduct(It1 begin1, It1 end1, It2 begin2) { + return TypedFastInnerOperation<std::multiplies<>, TAccumulatorType>(begin1, end1, begin2); +} + +template <typename It> +static inline double FastAccumulate(It begin, It end) { + return TypedFastAccumulate<double>(begin, end); +} + +template <typename T> +static inline double FastAccumulate(const TVector<T>& sequence) { + return FastAccumulate(sequence.begin(), sequence.end()); +} + +template <typename It> +static inline double FastKahanAccumulate(It begin, It end) { + return TypedFastAccumulate<TKahanAccumulator<double>>(begin, end); +} + +template <typename T> +static inline double FastKahanAccumulate(const TVector<T>& sequence) { + return FastKahanAccumulate(sequence.begin(), sequence.end()); +} + +template <typename It1, typename It2> +static inline double FastInnerProduct(It1 begin1, It1 end1, It2 begin2) { + return TypedFastInnerProduct<double>(begin1, end1, begin2); +} + +template <typename T> +static inline double FastInnerProduct(const TVector<T>& lhs, const TVector<T>& rhs) { + Y_ASSERT(lhs.size() == rhs.size()); + return FastInnerProduct(lhs.begin(), lhs.end(), rhs.begin()); +} + +template <typename It1, typename It2> +static inline double FastKahanInnerProduct(It1 begin1, It1 end1, It2 begin2) { + return TypedFastInnerProduct<TKahanAccumulator<double>>(begin1, end1, begin2); +} + +template <typename T> +static inline double FastKahanInnerProduct(const TVector<T>& lhs, const TVector<T>& rhs) { + Y_ASSERT(lhs.size() == rhs.size()); + return FastKahanInnerProduct(lhs.begin(), lhs.end(), rhs.begin()); +} diff --git a/library/cpp/accurate_accumulate/benchmark/main.cpp b/library/cpp/accurate_accumulate/benchmark/main.cpp new file mode 100644 index 0000000000..3c5e6e775d --- /dev/null +++ b/library/cpp/accurate_accumulate/benchmark/main.cpp @@ -0,0 +1,97 @@ +#include <library/cpp/accurate_accumulate/accurate_accumulate.h> +#include <library/cpp/testing/benchmark/bench.h> + +#include <util/generic/algorithm.h> +#include <util/generic/singleton.h> +#include <util/generic/vector.h> +#include <util/generic/xrange.h> +#include <util/random/fast.h> + +namespace { + template <typename T, size_t N> + struct TNormalizedExamplesHolder { + TVector<T> Examples; + TNormalizedExamplesHolder() + : Examples(N) + { + TFastRng<ui64> prng{sizeof(T) * N * 42u}; + for (auto& x : Examples) { + x = prng.GenRandReal4(); + } + } + }; + + template <typename T, size_t N> + struct TExamplesHolder { + TVector<T> Examples; + TExamplesHolder() + : Examples(N) + { + TFastRng<ui64> prng{sizeof(T) * N * 42u + 100500u}; + for (auto& x : Examples) { + // operations with non-normalized floating point numbers are rumored to work slower + x = prng.GenRandReal4() + prng.Uniform(1024u); + } + } + }; +} + +#define DEFINE_BENCHMARK(type, count) \ + Y_CPU_BENCHMARK(SimpleNorm_##type##_##count, iface) { \ + const auto& examples = Default<TNormalizedExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + Y_DO_NOT_OPTIMIZE_AWAY( \ + (type)Accumulate(std::cbegin(examples), std::cend(examples), type{})); \ + } \ + } \ + \ + Y_CPU_BENCHMARK(KahanNorm_##type##_##count, iface) { \ + const auto& examples = Default<TNormalizedExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + Y_DO_NOT_OPTIMIZE_AWAY( \ + (type)Accumulate(std::cbegin(examples), std::cend(examples), TKahanAccumulator<type>{})); \ + } \ + } \ + \ + Y_CPU_BENCHMARK(Simple_##type##_##count, iface) { \ + const auto& examples = Default<TExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + Y_DO_NOT_OPTIMIZE_AWAY( \ + (type)Accumulate(std::cbegin(examples), std::cend(examples), type{})); \ + } \ + } \ + \ + Y_CPU_BENCHMARK(Kahan_##type##_##count, iface) { \ + const auto& examples = Default<TExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + Y_DO_NOT_OPTIMIZE_AWAY( \ + (type)Accumulate(std::cbegin(examples), std::cend(examples), TKahanAccumulator<type>{})); \ + } \ + } + +DEFINE_BENCHMARK(float, 2) +DEFINE_BENCHMARK(float, 4) +DEFINE_BENCHMARK(float, 8) +DEFINE_BENCHMARK(float, 16) +DEFINE_BENCHMARK(float, 32) +DEFINE_BENCHMARK(float, 64) +DEFINE_BENCHMARK(float, 128) +DEFINE_BENCHMARK(float, 256) +DEFINE_BENCHMARK(float, 512) +DEFINE_BENCHMARK(float, 1024) +DEFINE_BENCHMARK(double, 2) +DEFINE_BENCHMARK(double, 4) +DEFINE_BENCHMARK(double, 8) +DEFINE_BENCHMARK(double, 16) +DEFINE_BENCHMARK(double, 32) +DEFINE_BENCHMARK(double, 64) +DEFINE_BENCHMARK(double, 128) +DEFINE_BENCHMARK(double, 256) +DEFINE_BENCHMARK(double, 512) +DEFINE_BENCHMARK(double, 1024) + +#undef DEFINE_BENCHMARK diff --git a/library/cpp/accurate_accumulate/benchmark/metrics/main.py b/library/cpp/accurate_accumulate/benchmark/metrics/main.py new file mode 100644 index 0000000000..311fc219ce --- /dev/null +++ b/library/cpp/accurate_accumulate/benchmark/metrics/main.py @@ -0,0 +1,7 @@ +import yatest.common as yc + + +def test_export_metrics(metrics): + metrics.set_benchmark(yc.execute_benchmark( + 'library/cpp/accurate_accumulate/benchmark/benchmark', + threads=8)) diff --git a/library/cpp/accurate_accumulate/benchmark/metrics/ya.make b/library/cpp/accurate_accumulate/benchmark/metrics/ya.make new file mode 100644 index 0000000000..5d532e1479 --- /dev/null +++ b/library/cpp/accurate_accumulate/benchmark/metrics/ya.make @@ -0,0 +1,17 @@ +OWNER(yazevnul) + +PY2TEST() + +SIZE(LARGE) + +TAG( + ya:force_sandbox + sb:intel_e5_2660v1 + ya:fat +) + +TEST_SRCS(main.py) + +DEPENDS(library/cpp/accurate_accumulate/benchmark) + +END() diff --git a/library/cpp/accurate_accumulate/benchmark/ya.make b/library/cpp/accurate_accumulate/benchmark/ya.make new file mode 100644 index 0000000000..20fd877389 --- /dev/null +++ b/library/cpp/accurate_accumulate/benchmark/ya.make @@ -0,0 +1,13 @@ +OWNER(yazevnul) + +Y_BENCHMARK() + +SRCS( + main.cpp +) + +PEERDIR( + library/cpp/accurate_accumulate +) + +END() diff --git a/library/cpp/accurate_accumulate/ya.make b/library/cpp/accurate_accumulate/ya.make new file mode 100644 index 0000000000..82630d19be --- /dev/null +++ b/library/cpp/accurate_accumulate/ya.make @@ -0,0 +1,10 @@ +LIBRARY() + +OWNER(alex-sh) + +SRCS( + accurate_accumulate.h + accurate_accumulate.cpp +) + +END() |