intermediate changes

ref:cde9a383711a11544ce7e107a78147fb96cc4029
author: Devtools Arcadia <arcadia-devtools@yandex-team.ru> 2022-02-07 18:08:42 +0300
committer: Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> 2022-02-07 18:08:42 +0300
commit: 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree: e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/accurate_accumulate
download: ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
7 files changed, 366 insertions, 0 deletions
diff --git a/library/cpp/accurate_accumulate/accurate_accumulate.cpp b/library/cpp/accurate_accumulate/accurate_accumulate.cpp
new file mode 100644
index 0000000000..20928f81f4
--- /dev/null
+++ b/library/cpp/accurate_accumulate/accurate_accumulate.cpp
@@ -0,0 +1 @@
+#include "accurate_accumulate.h"
diff --git a/library/cpp/accurate_accumulate/accurate_accumulate.h b/library/cpp/accurate_accumulate/accurate_accumulate.h
new file mode 100644
index 0000000000..dacced17e9
--- /dev/null
+++ b/library/cpp/accurate_accumulate/accurate_accumulate.h
@@ -0,0 +1,221 @@
+#pragma once
+
+#include <util/ysaveload.h>
+#include <util/generic/vector.h>
+#include <util/system/yassert.h>
+
+//! See more details here http://en.wikipedia.org/wiki/Kahan_summation_algorithm
+template <typename TAccumulateType>
+class TKahanAccumulator {
+public:
+    using TValueType = TAccumulateType;
+
+    template <typename TFloatType>
+    explicit TKahanAccumulator(const TFloatType x)
+        : Sum_(x)
+        , Compensation_()
+    {
+    }
+
+    TKahanAccumulator()
+        : Sum_()
+        , Compensation_()
+    {
+    }
+
+    template <typename TFloatType>
+    TKahanAccumulator& operator=(const TFloatType& rhs) {
+        Sum_ = TValueType(rhs);
+        Compensation_ = TValueType();
+        return *this;
+    }
+
+    TValueType Get() const {
+        return Sum_ + Compensation_;
+    }
+
+    template <typename TFloatType>
+    inline operator TFloatType() const {
+        return Get();
+    }
+
+    template <typename TFloatType>
+    inline bool operator<(const TKahanAccumulator<TFloatType>& other) const {
+        return Get() < other.Get();
+    }
+
+    template <typename TFloatType>
+    inline bool operator<=(const TKahanAccumulator<TFloatType>& other) const {
+        return !(other < *this);
+    }
+
+    template <typename TFloatType>
+    inline bool operator>(const TKahanAccumulator<TFloatType>& other) const {
+        return other < *this;
+    }
+
+    template <typename TFloatType>
+    inline bool operator>=(const TKahanAccumulator<TFloatType>& other) const {
+        return !(*this < other);
+    }
+
+    template <typename TFloatType>
+    inline TKahanAccumulator& operator+=(const TFloatType x) {
+        const TValueType y = TValueType(x) - Compensation_;
+        const TValueType t = Sum_ + y;
+        Compensation_ = (t - Sum_) - y;
+        Sum_ = t;
+        return *this;
+    }
+
+    template <typename TFloatType>
+    inline TKahanAccumulator& operator-=(const TFloatType x) {
+        return *this += -TValueType(x);
+    }
+
+    template <typename TFloatType>
+    inline TKahanAccumulator& operator*=(const TFloatType x) {
+        return *this = TValueType(*this) * TValueType(x);
+    }
+
+    template <typename TFloatType>
+    inline TKahanAccumulator& operator/=(const TFloatType x) {
+        return *this = TValueType(*this) / TValueType(x);
+    }
+
+    Y_SAVELOAD_DEFINE(Sum_, Compensation_)
+
+private:
+    TValueType Sum_;
+    TValueType Compensation_;
+};
+
+template <typename TAccumulateType, typename TFloatType>
+inline const TKahanAccumulator<TAccumulateType>
+operator+(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) {
+    return lhs += rhs;
+}
+
+template <typename TAccumulateType, typename TFloatType>
+inline const TKahanAccumulator<TAccumulateType>
+operator-(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) {
+    return lhs -= rhs;
+}
+
+template <typename TAccumulateType, typename TFloatType>
+inline const TKahanAccumulator<TAccumulateType>
+operator*(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) {
+    return lhs *= rhs;
+}
+
+template <typename TAccumulateType, typename TFloatType>
+inline const TKahanAccumulator<TAccumulateType>
+operator/(TKahanAccumulator<TAccumulateType> lhs, const TFloatType rhs) {
+    return lhs /= rhs;
+}
+
+template <typename TAccumulatorType, typename It>
+static inline TAccumulatorType TypedFastAccumulate(It begin, It end) {
+    TAccumulatorType accumulator = TAccumulatorType();
+
+    for (; begin + 15 < end; begin += 16) {
+        accumulator += *(begin + 0) +
+                       *(begin + 1) +
+                       *(begin + 2) +
+                       *(begin + 3) +
+                       *(begin + 4) +
+                       *(begin + 5) +
+                       *(begin + 6) +
+                       *(begin + 7) +
+                       *(begin + 8) +
+                       *(begin + 9) +
+                       *(begin + 10) +
+                       *(begin + 11) +
+                       *(begin + 12) +
+                       *(begin + 13) +
+                       *(begin + 14) +
+                       *(begin + 15);
+    }
+    for (; begin != end; ++begin) {
+        accumulator += *begin;
+    }
+
+    return accumulator;
+}
+
+template <class TOperation, typename TAccumulatorType, typename It1, typename It2>
+static inline TAccumulatorType TypedFastInnerOperation(It1 begin1, It1 end1, It2 begin2) {
+    TAccumulatorType accumulator = TAccumulatorType();
+
+    const TOperation op;
+    for (; begin1 + 15 < end1; begin1 += 16, begin2 += 16) {
+        accumulator += op(*(begin1 + 0), *(begin2 + 0)) +
+                       op(*(begin1 + 1), *(begin2 + 1)) +
+                       op(*(begin1 + 2), *(begin2 + 2)) +
+                       op(*(begin1 + 3), *(begin2 + 3)) +
+                       op(*(begin1 + 4), *(begin2 + 4)) +
+                       op(*(begin1 + 5), *(begin2 + 5)) +
+                       op(*(begin1 + 6), *(begin2 + 6)) +
+                       op(*(begin1 + 7), *(begin2 + 7)) +
+                       op(*(begin1 + 8), *(begin2 + 8)) +
+                       op(*(begin1 + 9), *(begin2 + 9)) +
+                       op(*(begin1 + 10), *(begin2 + 10)) +
+                       op(*(begin1 + 11), *(begin2 + 11)) +
+                       op(*(begin1 + 12), *(begin2 + 12)) +
+                       op(*(begin1 + 13), *(begin2 + 13)) +
+                       op(*(begin1 + 14), *(begin2 + 14)) +
+                       op(*(begin1 + 15), *(begin2 + 15));
+    }
+    for (; begin1 != end1; ++begin1, ++begin2) {
+        accumulator += op(*begin1, *begin2);
+    }
+
+    return accumulator;
+}
+
+template <typename TAccumulatorType, typename It1, typename It2>
+static inline TAccumulatorType TypedFastInnerProduct(It1 begin1, It1 end1, It2 begin2) {
+    return TypedFastInnerOperation<std::multiplies<>, TAccumulatorType>(begin1, end1, begin2);
+}
+
+template <typename It>
+static inline double FastAccumulate(It begin, It end) {
+    return TypedFastAccumulate<double>(begin, end);
+}
+
+template <typename T>
+static inline double FastAccumulate(const TVector<T>& sequence) {
+    return FastAccumulate(sequence.begin(), sequence.end());
+}
+
+template <typename It>
+static inline double FastKahanAccumulate(It begin, It end) {
+    return TypedFastAccumulate<TKahanAccumulator<double>>(begin, end);
+}
+
+template <typename T>
+static inline double FastKahanAccumulate(const TVector<T>& sequence) {
+    return FastKahanAccumulate(sequence.begin(), sequence.end());
+}
+
+template <typename It1, typename It2>
+static inline double FastInnerProduct(It1 begin1, It1 end1, It2 begin2) {
+    return TypedFastInnerProduct<double>(begin1, end1, begin2);
+}
+
+template <typename T>
+static inline double FastInnerProduct(const TVector<T>& lhs, const TVector<T>& rhs) {
+    Y_ASSERT(lhs.size() == rhs.size());
+    return FastInnerProduct(lhs.begin(), lhs.end(), rhs.begin());
+}
+
+template <typename It1, typename It2>
+static inline double FastKahanInnerProduct(It1 begin1, It1 end1, It2 begin2) {
+    return TypedFastInnerProduct<TKahanAccumulator<double>>(begin1, end1, begin2);
+}
+
+template <typename T>
+static inline double FastKahanInnerProduct(const TVector<T>& lhs, const TVector<T>& rhs) {
+    Y_ASSERT(lhs.size() == rhs.size());
+    return FastKahanInnerProduct(lhs.begin(), lhs.end(), rhs.begin());
+}
diff --git a/library/cpp/accurate_accumulate/benchmark/main.cpp b/library/cpp/accurate_accumulate/benchmark/main.cpp
new file mode 100644
index 0000000000..3c5e6e775d
--- /dev/null
+++ b/library/cpp/accurate_accumulate/benchmark/main.cpp
@@ -0,0 +1,97 @@
+#include <library/cpp/accurate_accumulate/accurate_accumulate.h>
+#include <library/cpp/testing/benchmark/bench.h>
+
+#include <util/generic/algorithm.h>
+#include <util/generic/singleton.h>
+#include <util/generic/vector.h>
+#include <util/generic/xrange.h>
+#include <util/random/fast.h>
+
+namespace {
+    template <typename T, size_t N>
+    struct TNormalizedExamplesHolder {
+        TVector<T> Examples;
+        TNormalizedExamplesHolder()
+            : Examples(N)
+        {
+            TFastRng<ui64> prng{sizeof(T) * N * 42u};
+            for (auto& x : Examples) {
+                x = prng.GenRandReal4();
+            }
+        }
+    };
+
+    template <typename T, size_t N>
+    struct TExamplesHolder {
+        TVector<T> Examples;
+        TExamplesHolder()
+            : Examples(N)
+        {
+            TFastRng<ui64> prng{sizeof(T) * N * 42u + 100500u};
+            for (auto& x : Examples) {
+                // operations with non-normalized floating point numbers are rumored to work slower
+                x = prng.GenRandReal4() + prng.Uniform(1024u);
+            }
+        }
+    };
+}
+
+#define DEFINE_BENCHMARK(type, count)                                                                     \
+    Y_CPU_BENCHMARK(SimpleNorm_##type##_##count, iface) {                                                 \
+        const auto& examples = Default<TNormalizedExamplesHolder<type, count>>().Examples;                \
+        for (const auto i : xrange(iface.Iterations())) {                                                 \
+            Y_UNUSED(i);                                                                                  \
+            Y_DO_NOT_OPTIMIZE_AWAY(                                                                       \
+                (type)Accumulate(std::cbegin(examples), std::cend(examples), type{}));                    \
+        }                                                                                                 \
+    }                                                                                                     \
+                                                                                                          \
+    Y_CPU_BENCHMARK(KahanNorm_##type##_##count, iface) {                                                  \
+        const auto& examples = Default<TNormalizedExamplesHolder<type, count>>().Examples;                \
+        for (const auto i : xrange(iface.Iterations())) {                                                 \
+            Y_UNUSED(i);                                                                                  \
+            Y_DO_NOT_OPTIMIZE_AWAY(                                                                       \
+                (type)Accumulate(std::cbegin(examples), std::cend(examples), TKahanAccumulator<type>{})); \
+        }                                                                                                 \
+    }                                                                                                     \
+                                                                                                          \
+    Y_CPU_BENCHMARK(Simple_##type##_##count, iface) {                                                     \
+        const auto& examples = Default<TExamplesHolder<type, count>>().Examples;                          \
+        for (const auto i : xrange(iface.Iterations())) {                                                 \
+            Y_UNUSED(i);                                                                                  \
+            Y_DO_NOT_OPTIMIZE_AWAY(                                                                       \
+                (type)Accumulate(std::cbegin(examples), std::cend(examples), type{}));                    \
+        }                                                                                                 \
+    }                                                                                                     \
+                                                                                                          \
+    Y_CPU_BENCHMARK(Kahan_##type##_##count, iface) {                                                      \
+        const auto& examples = Default<TExamplesHolder<type, count>>().Examples;                          \
+        for (const auto i : xrange(iface.Iterations())) {                                                 \
+            Y_UNUSED(i);                                                                                  \
+            Y_DO_NOT_OPTIMIZE_AWAY(                                                                       \
+                (type)Accumulate(std::cbegin(examples), std::cend(examples), TKahanAccumulator<type>{})); \
+        }                                                                                                 \
+    }
+
+DEFINE_BENCHMARK(float, 2)
+DEFINE_BENCHMARK(float, 4)
+DEFINE_BENCHMARK(float, 8)
+DEFINE_BENCHMARK(float, 16)
+DEFINE_BENCHMARK(float, 32)
+DEFINE_BENCHMARK(float, 64)
+DEFINE_BENCHMARK(float, 128)
+DEFINE_BENCHMARK(float, 256)
+DEFINE_BENCHMARK(float, 512)
+DEFINE_BENCHMARK(float, 1024)
+DEFINE_BENCHMARK(double, 2)
+DEFINE_BENCHMARK(double, 4)
+DEFINE_BENCHMARK(double, 8)
+DEFINE_BENCHMARK(double, 16)
+DEFINE_BENCHMARK(double, 32)
+DEFINE_BENCHMARK(double, 64)
+DEFINE_BENCHMARK(double, 128)
+DEFINE_BENCHMARK(double, 256)
+DEFINE_BENCHMARK(double, 512)
+DEFINE_BENCHMARK(double, 1024)
+
+#undef DEFINE_BENCHMARK
diff --git a/library/cpp/accurate_accumulate/benchmark/metrics/main.py b/library/cpp/accurate_accumulate/benchmark/metrics/main.py
new file mode 100644
index 0000000000..311fc219ce
--- /dev/null
+++ b/library/cpp/accurate_accumulate/benchmark/metrics/main.py
@@ -0,0 +1,7 @@
+import yatest.common as yc
+
+
+def test_export_metrics(metrics):
+    metrics.set_benchmark(yc.execute_benchmark(
+        'library/cpp/accurate_accumulate/benchmark/benchmark',
+        threads=8))
diff --git a/library/cpp/accurate_accumulate/benchmark/metrics/ya.make b/library/cpp/accurate_accumulate/benchmark/metrics/ya.make
new file mode 100644
index 0000000000..5d532e1479
--- /dev/null
+++ b/library/cpp/accurate_accumulate/benchmark/metrics/ya.make
@@ -0,0 +1,17 @@
+OWNER(yazevnul)
+
+PY2TEST()
+
+SIZE(LARGE)
+
+TAG(
+    ya:force_sandbox
+    sb:intel_e5_2660v1
+    ya:fat
+)
+
+TEST_SRCS(main.py)
+
+DEPENDS(library/cpp/accurate_accumulate/benchmark)
+
+END()
diff --git a/library/cpp/accurate_accumulate/benchmark/ya.make b/library/cpp/accurate_accumulate/benchmark/ya.make
new file mode 100644
index 0000000000..20fd877389
--- /dev/null
+++ b/library/cpp/accurate_accumulate/benchmark/ya.make
@@ -0,0 +1,13 @@
+OWNER(yazevnul)
+
+Y_BENCHMARK()
+
+SRCS(
+    main.cpp
+)
+
+PEERDIR(
+    library/cpp/accurate_accumulate
+)
+
+END()
diff --git a/library/cpp/accurate_accumulate/ya.make b/library/cpp/accurate_accumulate/ya.make
new file mode 100644
index 0000000000..82630d19be
--- /dev/null
+++ b/library/cpp/accurate_accumulate/ya.make
@@ -0,0 +1,10 @@
+LIBRARY()
+
+OWNER(alex-sh)
+
+SRCS(
+    accurate_accumulate.h
+    accurate_accumulate.cpp
+)
+
+END()
author	Devtools Arcadia <arcadia-devtools@yandex-team.ru>	2022-02-07 18:08:42 +0300
committer	Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>	2022-02-07 18:08:42 +0300
commit	1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree	e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/accurate_accumulate
download	ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz