aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/accurate_accumulate
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/accurate_accumulate
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/accurate_accumulate')
-rw-r--r--library/cpp/accurate_accumulate/accurate_accumulate.cpp1
-rw-r--r--library/cpp/accurate_accumulate/accurate_accumulate.h221
-rw-r--r--library/cpp/accurate_accumulate/benchmark/main.cpp97
-rw-r--r--library/cpp/accurate_accumulate/benchmark/metrics/main.py7
-rw-r--r--library/cpp/accurate_accumulate/benchmark/metrics/ya.make17
-rw-r--r--library/cpp/accurate_accumulate/benchmark/ya.make13
-rw-r--r--library/cpp/accurate_accumulate/ya.make10
7 files changed, 366 insertions, 0 deletions
diff --git a/library/cpp/accurate_accumulate/accurate_accumulate.cpp b/library/cpp/accurate_accumulate/accurate_accumulate.cpp
new file mode 100644
index 0000000000..20928f81f4
--- /dev/null
+++ b/library/cpp/accurate_accumulate/accurate_accumulate.cpp
@@ -0,0 +1 @@
+#include "accurate_accumulate.h"
diff --git a/library/cpp/accurate_accumulate/accurate_accumulate.h b/library/cpp/accurate_accumulate/accurate_accumulate.h
new file mode 100644
index 0000000000..dacced17e9
--- /dev/null
+++ b/library/cpp/accurate_accumulate/accurate_accumulate.h
@@ -0,0 +1,221 @@
+#pragma once
+
+#include <util/ysaveload.h>
+#include <util/generic/vector.h>
+#include <util/system/yassert.h>
+
+//! See more details here http://en.wikipedia.org/wiki/Kahan_summation_algorithm
+template <typename TAccumulateType>
+class TKahanAccumulator {
+public:
+ using TValueType = TAccumulateType;
+
+ template <typename TFloatType>
+ explicit TKahanAccumulator(const TFloatType x)
+ : Sum_(x)
+ , Compensation_()
+ {
+ }
+
+ TKahanAccumulator()
+ : Sum_()
+ , Compensation_()
+ {
+ }
+
+ template <typename TFloatType>
+ TKahanAccumulator& operator=(const TFloatType& rhs) {
+ Sum_ = TValueType(rhs);
+ Compensation_ = TValueType();
+ return *this;
+ }
+
+ TValueType Get() const {
+ return Sum_ + Compensation_;
+ }
+
+ template <typename TFloatType>
+ inline operator TFloatType() const {
+ return Get();
+ }
+
+ template <typename TFloatType>
+ inline bool operator<(const TKahanAccumulator<TFloatType>& other) const {
+ return Get() < other.Get();
+ }
+
+ template <typename TFloatType>
+ inline bool operator<=(const TKahanAccumulator<TFloatType>& other) const {
+ return !(other < *this);
+ }
+
+ template <typename TFloatType>
+ inline bool operator>(const TKahanAccumulator<TFloatType>& other) const {
+ return other < *this;
+ }
+
+ template <typename TFloatType>
+ inline bool operator>=(const TKahanAccumulator<TFloatType>& other) const {
+ return !(*this < other);
+ }
+
+ template <typename TFloatType>
+ inline TKahanAccumulator& operator+=(const TFloatType x) {
+ const TValueType y = TValueType(x) - Compensation_;
+ const TValueType t = Sum_ + y;
+ Compensation_ = (t - Sum_) - y;
+ Sum_ = t;
+ return *this;
+ }
+
+ template <typename TFloatType>
+ inline TKahanAccumulator& operator-=(const TFloatType x) {
+ return *this += -TValueType(x);
+ }
+
+ template <typename TFloatType>
+ inline TKahanAccumulator& operator*=(const TFloatType x) {
+ return *this = TValueType(*this) * TValueType(x);
+ }
+
+ template <typename TFloatType>
+ inline TKahanAccumulator& operator/=(const TFloatType x) {
+ return *this = TValueType(*this) / TValueType(x);
+ }
+
+ Y_SAVELOAD_DEFINE(Sum_, Compensation_)
+
+private:
+ TValueType Sum_;
+ TValueType Compensation_;
+};
+
+template <typename TAccumulateType, typename TFloatType>
+inline const TKahanAccumulator<TAccumulateType>
+operator+(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) {
+ return lhs += rhs;
+}
+
+template <typename TAccumulateType, typename TFloatType>
+inline const TKahanAccumulator<TAccumulateType>
+operator-(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) {
+ return lhs -= rhs;
+}
+
+template <typename TAccumulateType, typename TFloatType>
+inline const TKahanAccumulator<TAccumulateType>
+operator*(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) {
+ return lhs *= rhs;
+}
+
+template <typename TAccumulateType, typename TFloatType>
+inline const TKahanAccumulator<TAccumulateType>
+operator/(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) {
+ return lhs /= rhs;
+}
+
+template <typename TAccumulatorType, typename It>
+static inline TAccumulatorType TypedFastAccumulate(It begin, It end) {
+ TAccumulatorType accumulator = TAccumulatorType();
+
+ for (; begin + 15 < end; begin += 16) {
+ accumulator += *(begin + 0) +
+ *(begin + 1) +
+ *(begin + 2) +
+ *(begin + 3) +
+ *(begin + 4) +
+ *(begin + 5) +
+ *(begin + 6) +
+ *(begin + 7) +
+ *(begin + 8) +
+ *(begin + 9) +
+ *(begin + 10) +
+ *(begin + 11) +
+ *(begin + 12) +
+ *(begin + 13) +
+ *(begin + 14) +
+ *(begin + 15);
+ }
+ for (; begin != end; ++begin) {
+ accumulator += *begin;
+ }
+
+ return accumulator;
+}
+
+template <class TOperation, typename TAccumulatorType, typename It1, typename It2>
+static inline TAccumulatorType TypedFastInnerOperation(It1 begin1, It1 end1, It2 begin2) {
+ TAccumulatorType accumulator = TAccumulatorType();
+
+ const TOperation op;
+ for (; begin1 + 15 < end1; begin1 += 16, begin2 += 16) {
+ accumulator += op(*(begin1 + 0), *(begin2 + 0)) +
+ op(*(begin1 + 1), *(begin2 + 1)) +
+ op(*(begin1 + 2), *(begin2 + 2)) +
+ op(*(begin1 + 3), *(begin2 + 3)) +
+ op(*(begin1 + 4), *(begin2 + 4)) +
+ op(*(begin1 + 5), *(begin2 + 5)) +
+ op(*(begin1 + 6), *(begin2 + 6)) +
+ op(*(begin1 + 7), *(begin2 + 7)) +
+ op(*(begin1 + 8), *(begin2 + 8)) +
+ op(*(begin1 + 9), *(begin2 + 9)) +
+ op(*(begin1 + 10), *(begin2 + 10)) +
+ op(*(begin1 + 11), *(begin2 + 11)) +
+ op(*(begin1 + 12), *(begin2 + 12)) +
+ op(*(begin1 + 13), *(begin2 + 13)) +
+ op(*(begin1 + 14), *(begin2 + 14)) +
+ op(*(begin1 + 15), *(begin2 + 15));
+ }
+ for (; begin1 != end1; ++begin1, ++begin2) {
+ accumulator += op(*begin1, *begin2);
+ }
+
+ return accumulator;
+}
+
+template <typename TAccumulatorType, typename It1, typename It2>
+static inline TAccumulatorType TypedFastInnerProduct(It1 begin1, It1 end1, It2 begin2) {
+ return TypedFastInnerOperation<std::multiplies<>, TAccumulatorType>(begin1, end1, begin2);
+}
+
+template <typename It>
+static inline double FastAccumulate(It begin, It end) {
+ return TypedFastAccumulate<double>(begin, end);
+}
+
+template <typename T>
+static inline double FastAccumulate(const TVector<T>& sequence) {
+ return FastAccumulate(sequence.begin(), sequence.end());
+}
+
+template <typename It>
+static inline double FastKahanAccumulate(It begin, It end) {
+ return TypedFastAccumulate<TKahanAccumulator<double>>(begin, end);
+}
+
+template <typename T>
+static inline double FastKahanAccumulate(const TVector<T>& sequence) {
+ return FastKahanAccumulate(sequence.begin(), sequence.end());
+}
+
+template <typename It1, typename It2>
+static inline double FastInnerProduct(It1 begin1, It1 end1, It2 begin2) {
+ return TypedFastInnerProduct<double>(begin1, end1, begin2);
+}
+
+template <typename T>
+static inline double FastInnerProduct(const TVector<T>& lhs, const TVector<T>& rhs) {
+ Y_ASSERT(lhs.size() == rhs.size());
+ return FastInnerProduct(lhs.begin(), lhs.end(), rhs.begin());
+}
+
+template <typename It1, typename It2>
+static inline double FastKahanInnerProduct(It1 begin1, It1 end1, It2 begin2) {
+ return TypedFastInnerProduct<TKahanAccumulator<double>>(begin1, end1, begin2);
+}
+
+template <typename T>
+static inline double FastKahanInnerProduct(const TVector<T>& lhs, const TVector<T>& rhs) {
+ Y_ASSERT(lhs.size() == rhs.size());
+ return FastKahanInnerProduct(lhs.begin(), lhs.end(), rhs.begin());
+}
diff --git a/library/cpp/accurate_accumulate/benchmark/main.cpp b/library/cpp/accurate_accumulate/benchmark/main.cpp
new file mode 100644
index 0000000000..3c5e6e775d
--- /dev/null
+++ b/library/cpp/accurate_accumulate/benchmark/main.cpp
@@ -0,0 +1,97 @@
+#include <library/cpp/accurate_accumulate/accurate_accumulate.h>
+#include <library/cpp/testing/benchmark/bench.h>
+
+#include <util/generic/algorithm.h>
+#include <util/generic/singleton.h>
+#include <util/generic/vector.h>
+#include <util/generic/xrange.h>
+#include <util/random/fast.h>
+
+namespace {
+ template <typename T, size_t N>
+ struct TNormalizedExamplesHolder {
+ TVector<T> Examples;
+ TNormalizedExamplesHolder()
+ : Examples(N)
+ {
+ TFastRng<ui64> prng{sizeof(T) * N * 42u};
+ for (auto& x : Examples) {
+ x = prng.GenRandReal4();
+ }
+ }
+ };
+
+ template <typename T, size_t N>
+ struct TExamplesHolder {
+ TVector<T> Examples;
+ TExamplesHolder()
+ : Examples(N)
+ {
+ TFastRng<ui64> prng{sizeof(T) * N * 42u + 100500u};
+ for (auto& x : Examples) {
+ // operations with non-normalized floating point numbers are rumored to work slower
+ x = prng.GenRandReal4() + prng.Uniform(1024u);
+ }
+ }
+ };
+}
+
+#define DEFINE_BENCHMARK(type, count) \
+ Y_CPU_BENCHMARK(SimpleNorm_##type##_##count, iface) { \
+ const auto& examples = Default<TNormalizedExamplesHolder<type, count>>().Examples; \
+ for (const auto i : xrange(iface.Iterations())) { \
+ Y_UNUSED(i); \
+ Y_DO_NOT_OPTIMIZE_AWAY( \
+ (type)Accumulate(std::cbegin(examples), std::cend(examples), type{})); \
+ } \
+ } \
+ \
+ Y_CPU_BENCHMARK(KahanNorm_##type##_##count, iface) { \
+ const auto& examples = Default<TNormalizedExamplesHolder<type, count>>().Examples; \
+ for (const auto i : xrange(iface.Iterations())) { \
+ Y_UNUSED(i); \
+ Y_DO_NOT_OPTIMIZE_AWAY( \
+ (type)Accumulate(std::cbegin(examples), std::cend(examples), TKahanAccumulator<type>{})); \
+ } \
+ } \
+ \
+ Y_CPU_BENCHMARK(Simple_##type##_##count, iface) { \
+ const auto& examples = Default<TExamplesHolder<type, count>>().Examples; \
+ for (const auto i : xrange(iface.Iterations())) { \
+ Y_UNUSED(i); \
+ Y_DO_NOT_OPTIMIZE_AWAY( \
+ (type)Accumulate(std::cbegin(examples), std::cend(examples), type{})); \
+ } \
+ } \
+ \
+ Y_CPU_BENCHMARK(Kahan_##type##_##count, iface) { \
+ const auto& examples = Default<TExamplesHolder<type, count>>().Examples; \
+ for (const auto i : xrange(iface.Iterations())) { \
+ Y_UNUSED(i); \
+ Y_DO_NOT_OPTIMIZE_AWAY( \
+ (type)Accumulate(std::cbegin(examples), std::cend(examples), TKahanAccumulator<type>{})); \
+ } \
+ }
+
+DEFINE_BENCHMARK(float, 2)
+DEFINE_BENCHMARK(float, 4)
+DEFINE_BENCHMARK(float, 8)
+DEFINE_BENCHMARK(float, 16)
+DEFINE_BENCHMARK(float, 32)
+DEFINE_BENCHMARK(float, 64)
+DEFINE_BENCHMARK(float, 128)
+DEFINE_BENCHMARK(float, 256)
+DEFINE_BENCHMARK(float, 512)
+DEFINE_BENCHMARK(float, 1024)
+DEFINE_BENCHMARK(double, 2)
+DEFINE_BENCHMARK(double, 4)
+DEFINE_BENCHMARK(double, 8)
+DEFINE_BENCHMARK(double, 16)
+DEFINE_BENCHMARK(double, 32)
+DEFINE_BENCHMARK(double, 64)
+DEFINE_BENCHMARK(double, 128)
+DEFINE_BENCHMARK(double, 256)
+DEFINE_BENCHMARK(double, 512)
+DEFINE_BENCHMARK(double, 1024)
+
+#undef DEFINE_BENCHMARK
diff --git a/library/cpp/accurate_accumulate/benchmark/metrics/main.py b/library/cpp/accurate_accumulate/benchmark/metrics/main.py
new file mode 100644
index 0000000000..311fc219ce
--- /dev/null
+++ b/library/cpp/accurate_accumulate/benchmark/metrics/main.py
@@ -0,0 +1,7 @@
+import yatest.common as yc
+
+
+def test_export_metrics(metrics):
+ metrics.set_benchmark(yc.execute_benchmark(
+ 'library/cpp/accurate_accumulate/benchmark/benchmark',
+ threads=8))
diff --git a/library/cpp/accurate_accumulate/benchmark/metrics/ya.make b/library/cpp/accurate_accumulate/benchmark/metrics/ya.make
new file mode 100644
index 0000000000..5d532e1479
--- /dev/null
+++ b/library/cpp/accurate_accumulate/benchmark/metrics/ya.make
@@ -0,0 +1,17 @@
+OWNER(yazevnul)
+
+PY2TEST()
+
+SIZE(LARGE)
+
+TAG(
+ ya:force_sandbox
+ sb:intel_e5_2660v1
+ ya:fat
+)
+
+TEST_SRCS(main.py)
+
+DEPENDS(library/cpp/accurate_accumulate/benchmark)
+
+END()
diff --git a/library/cpp/accurate_accumulate/benchmark/ya.make b/library/cpp/accurate_accumulate/benchmark/ya.make
new file mode 100644
index 0000000000..20fd877389
--- /dev/null
+++ b/library/cpp/accurate_accumulate/benchmark/ya.make
@@ -0,0 +1,13 @@
+OWNER(yazevnul)
+
+Y_BENCHMARK()
+
+SRCS(
+ main.cpp
+)
+
+PEERDIR(
+ library/cpp/accurate_accumulate
+)
+
+END()
diff --git a/library/cpp/accurate_accumulate/ya.make b/library/cpp/accurate_accumulate/ya.make
new file mode 100644
index 0000000000..82630d19be
--- /dev/null
+++ b/library/cpp/accurate_accumulate/ya.make
@@ -0,0 +1,10 @@
+LIBRARY()
+
+OWNER(alex-sh)
+
+SRCS(
+ accurate_accumulate.h
+ accurate_accumulate.cpp
+)
+
+END()