YQ Connector: move tests from yql to ydb (OSS)

Перенос папки с тестами на Коннектор из папки yql в папку ydb (синхронизируется с github).
author: vitalyisaev <vitalyisaev@ydb.tech> 2023-11-14 09:58:56 +0300
committer: vitalyisaev <vitalyisaev@ydb.tech> 2023-11-14 10:20:20 +0300
commit: c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch)
tree: cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/IO/readFloatText.h
parent: d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff)
download: ydb-c2b2dfd9827a400a8495e172a56343462e3ceb82.tar.gz
1 files changed, 596 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/IO/readFloatText.h b/contrib/clickhouse/src/IO/readFloatText.h
new file mode 100644
index 0000000000..da4719b8dc
--- /dev/null
+++ b/contrib/clickhouse/src/IO/readFloatText.h
@@ -0,0 +1,596 @@
+#pragma once
+#include <type_traits>
+#include <IO/ReadHelpers.h>
+#include <Core/Defines.h>
+#include <base/shift10.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <double-conversion/double-conversion.h>
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
+#endif
+#include <fast_float/fast_float.h>
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+/** Methods for reading floating point numbers from text with decimal representation.
+  * There are "precise", "fast" and "simple" implementations.
+  *
+  * Neither of methods support hexadecimal numbers (0xABC), binary exponent (1p100), leading plus sign.
+  *
+  * Precise method always returns a number that is the closest machine representable number to the input.
+  *
+  * Fast method is faster (up to 3 times) and usually return the same value,
+  *  but in rare cases result may differ by lest significant bit (for Float32)
+  *  and by up to two least significant bits (for Float64) from precise method.
+  * Also fast method may parse some garbage as some other unspecified garbage.
+  *
+  * Simple method is little faster for cases of parsing short (few digit) integers, but less precise and slower in other cases.
+  * It's not recommended to use simple method and it is left only for reference.
+  *
+  * For performance test, look at 'read_float_perf' test.
+  *
+  * For precision test.
+  * Parse all existing Float32 numbers:
+
+CREATE TABLE test.floats ENGINE = Log AS SELECT reinterpretAsFloat32(reinterpretAsString(toUInt32(number))) AS x FROM numbers(0x100000000);
+
+WITH
+    toFloat32(toString(x)) AS y,
+    reinterpretAsUInt32(reinterpretAsString(x)) AS bin_x,
+    reinterpretAsUInt32(reinterpretAsString(y)) AS bin_y,
+    abs(bin_x - bin_y) AS diff
+SELECT
+    diff,
+    count()
+FROM test.floats
+WHERE NOT isNaN(x)
+GROUP BY diff
+ORDER BY diff ASC
+LIMIT 100
+
+  * Here are the results:
+  *
+    Precise:
+    ┌─diff─┬────count()─┐
+    │    0 │ 4278190082 │
+    └──────┴────────────┘
+    (100% roundtrip property)
+
+    Fast:
+    ┌─diff─┬────count()─┐
+    │    0 │ 3685260580 │
+    │    1 │  592929502 │
+    └──────┴────────────┘
+    (The difference is 1 in least significant bit in 13.8% of numbers.)
+
+    Simple:
+    ┌─diff─┬────count()─┐
+    │    0 │ 2169879994 │
+    │    1 │ 1807178292 │
+    │    2 │  269505944 │
+    │    3 │   28826966 │
+    │    4 │    2566488 │
+    │    5 │     212878 │
+    │    6 │      18276 │
+    │    7 │       1214 │
+    │    8 │         30 │
+    └──────┴────────────┘
+
+  * Parse random Float64 numbers:
+
+WITH
+    rand64() AS bin_x,
+    reinterpretAsFloat64(reinterpretAsString(bin_x)) AS x,
+    toFloat64(toString(x)) AS y,
+    reinterpretAsUInt64(reinterpretAsString(y)) AS bin_y,
+    abs(bin_x - bin_y) AS diff
+SELECT
+    diff,
+    count()
+FROM numbers(100000000)
+WHERE NOT isNaN(x)
+GROUP BY diff
+ORDER BY diff ASC
+LIMIT 100
+
+  */
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_PARSE_NUMBER;
+}
+
+
+/// Returns true, iff parsed.
+bool parseInfinity(ReadBuffer & buf);
+bool parseNaN(ReadBuffer & buf);
+
+void assertInfinity(ReadBuffer & buf);
+void assertNaN(ReadBuffer & buf);
+
+
+template <bool throw_exception>
+bool assertOrParseInfinity(ReadBuffer & buf)
+{
+    if constexpr (throw_exception)
+    {
+        assertInfinity(buf);
+        return true;
+    }
+    else
+        return parseInfinity(buf);
+}
+
+template <bool throw_exception>
+bool assertOrParseNaN(ReadBuffer & buf)
+{
+    if constexpr (throw_exception)
+    {
+        assertNaN(buf);
+        return true;
+    }
+    else
+        return parseNaN(buf);
+}
+
+
+template <typename T, typename ReturnType>
+ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf)
+{
+    static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextPreciseImpl must be float or double");
+    static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII");
+
+    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+    /// Fast path (avoid copying) if the buffer have at least MAX_LENGTH bytes.
+    static constexpr int MAX_LENGTH = 316;
+
+    if (likely(!buf.eof() && buf.position() + MAX_LENGTH <= buf.buffer().end()))
+    {
+        auto * initial_position = buf.position();
+        auto res = fast_float::from_chars(initial_position, buf.buffer().end(), x);
+
+        if (unlikely(res.ec != std::errc()))
+        {
+            if constexpr (throw_exception)
+                throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value");
+            else
+                return ReturnType(false);
+        }
+
+        buf.position() += res.ptr - initial_position;
+
+        return ReturnType(true);
+    }
+    else
+    {
+        /// Slow path. Copy characters that may be present in floating point number to temporary buffer.
+        bool negative = false;
+
+        /// We check eof here because we can parse +inf +nan
+        while (!buf.eof())
+        {
+            switch (*buf.position())
+            {
+                case '+':
+                    ++buf.position();
+                    continue;
+
+                case '-':
+                {
+                    negative = true;
+                    ++buf.position();
+                    continue;
+                }
+
+                case 'i': [[fallthrough]];
+                case 'I':
+                {
+                    if (assertOrParseInfinity<throw_exception>(buf))
+                    {
+                        x = std::numeric_limits<T>::infinity();
+                        if (negative)
+                            x = -x;
+                        return ReturnType(true);
+                    }
+                    return ReturnType(false);
+                }
+
+                case 'n': [[fallthrough]];
+                case 'N':
+                {
+                    if (assertOrParseNaN<throw_exception>(buf))
+                    {
+                        x = std::numeric_limits<T>::quiet_NaN();
+                        if (negative)
+                            x = -x;
+                        return ReturnType(true);
+                    }
+                    return ReturnType(false);
+                }
+
+                default:
+                    break;
+            }
+
+            break;
+        }
+
+
+        char tmp_buf[MAX_LENGTH];
+        int num_copied_chars = 0;
+
+        while (!buf.eof() && num_copied_chars < MAX_LENGTH)
+        {
+            char c = *buf.position();
+            if (!(isNumericASCII(c) || c == '-' || c == '+' || c == '.' || c == 'e' || c == 'E'))
+                break;
+
+            tmp_buf[num_copied_chars] = c;
+            ++buf.position();
+            ++num_copied_chars;
+        }
+
+        auto res = fast_float::from_chars(tmp_buf, tmp_buf + num_copied_chars, x);
+
+        if (unlikely(res.ec != std::errc()))
+        {
+            if constexpr (throw_exception)
+                throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value");
+            else
+                return ReturnType(false);
+        }
+
+        if (negative)
+            x = -x;
+
+        return ReturnType(true);
+    }
+}
+
+
+// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
+static inline bool is_made_of_eight_digits_fast(uint64_t val) noexcept
+{
+    return (((val & 0xF0F0F0F0F0F0F0F0) | (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == 0x3333333333333333);
+}
+
+static inline bool is_made_of_eight_digits_fast(const char * chars) noexcept
+{
+    uint64_t val;
+    ::memcpy(&val, chars, 8);
+    return is_made_of_eight_digits_fast(val);
+}
+
+template <size_t N, typename T>
+static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf)
+{
+    /// In optimistic case we can skip bound checking for first loop.
+    if (buf.position() + N <= buf.buffer().end())
+    {
+        for (size_t i = 0; i < N; ++i)
+        {
+            if (isNumericASCII(*buf.position()))
+            {
+                x *= 10;
+                x += *buf.position() & 0x0F;
+                ++buf.position();
+            }
+            else
+                return;
+        }
+    }
+    else
+    {
+        for (size_t i = 0; i < N; ++i)
+        {
+            if (!buf.eof() && isNumericASCII(*buf.position()))
+            {
+                x *= 10;
+                x += *buf.position() & 0x0F;
+                ++buf.position();
+            }
+            else
+                return;
+        }
+    }
+
+    while (!buf.eof() && (buf.position() + 8 <= buf.buffer().end()) &&
+         is_made_of_eight_digits_fast(buf.position()))
+    {
+        buf.position() += 8;
+    }
+
+    while (!buf.eof() && isNumericASCII(*buf.position()))
+        ++buf.position();
+}
+
+
+template <typename T, typename ReturnType>
+ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
+{
+    static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
+    static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII");
+
+    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+    bool negative = false;
+    x = 0;
+    UInt64 before_point = 0;
+    UInt64 after_point = 0;
+    int after_point_exponent = 0;
+    int exponent = 0;
+
+    if (in.eof())
+    {
+        if constexpr (throw_exception)
+            throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value");
+        else
+            return false;
+    }
+
+    if (*in.position() == '-')
+    {
+        negative = true;
+        ++in.position();
+    }
+    else if (*in.position() == '+')
+        ++in.position();
+
+    auto count_after_sign = in.count();
+
+    constexpr int significant_digits = std::numeric_limits<UInt64>::digits10;
+    readUIntTextUpToNSignificantDigits<significant_digits>(before_point, in);
+
+    size_t read_digits = in.count() - count_after_sign;
+
+    if (unlikely(read_digits > significant_digits))
+    {
+        int before_point_additional_exponent = static_cast<int>(read_digits) - significant_digits;
+        x = static_cast<T>(shift10(before_point, before_point_additional_exponent));
+    }
+    else
+    {
+        x = before_point;
+
+        /// Shortcut for the common case when there is an integer that fit in Int64.
+        if (read_digits && (in.eof() || *in.position() < '.'))
+        {
+            if (negative)
+                x = -x;
+            return ReturnType(true);
+        }
+    }
+
+    if (checkChar('.', in))
+    {
+        auto after_point_count = in.count();
+
+        while (!in.eof() && *in.position() == '0')
+            ++in.position();
+
+        auto after_leading_zeros_count = in.count();
+        int after_point_num_leading_zeros = static_cast<int>(after_leading_zeros_count - after_point_count);
+
+        readUIntTextUpToNSignificantDigits<significant_digits>(after_point, in);
+        read_digits = in.count() - after_leading_zeros_count;
+        after_point_exponent = (read_digits > significant_digits ? -significant_digits : static_cast<int>(-read_digits)) - after_point_num_leading_zeros;
+    }
+
+    if (checkChar('e', in) || checkChar('E', in))
+    {
+        if (in.eof())
+        {
+            if constexpr (throw_exception)
+                throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent");
+            else
+                return false;
+        }
+
+        bool exponent_negative = false;
+        if (*in.position() == '-')
+        {
+            exponent_negative = true;
+            ++in.position();
+        }
+        else if (*in.position() == '+')
+        {
+            ++in.position();
+        }
+
+        readUIntTextUpToNSignificantDigits<4>(exponent, in);
+        if (exponent_negative)
+            exponent = -exponent;
+    }
+
+    if (after_point)
+        x += static_cast<T>(shift10(after_point, after_point_exponent));
+
+    if (exponent)
+        x = static_cast<T>(shift10(x, exponent));
+
+    if (negative)
+        x = -x;
+
+    auto num_characters_without_sign = in.count() - count_after_sign;
+
+    /// Denormals. At most one character is read before denormal and it is '-'.
+    if (num_characters_without_sign == 0)
+    {
+        if (in.eof())
+        {
+            if constexpr (throw_exception)
+                throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: no digits read");
+            else
+                return false;
+        }
+
+        if (*in.position() == '+')
+        {
+            ++in.position();
+            if (in.eof())
+            {
+                if constexpr (throw_exception)
+                    throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after plus sign");
+                else
+                    return false;
+            }
+            else if (negative)
+            {
+                if constexpr (throw_exception)
+                    throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: plus after minus sign");
+                else
+                    return false;
+            }
+        }
+
+        if (*in.position() == 'i' || *in.position() == 'I')
+        {
+            if (assertOrParseInfinity<throw_exception>(in))
+            {
+                x = std::numeric_limits<T>::infinity();
+                if (negative)
+                    x = -x;
+                return ReturnType(true);
+            }
+            return ReturnType(false);
+        }
+        else if (*in.position() == 'n' || *in.position() == 'N')
+        {
+            if (assertOrParseNaN<throw_exception>(in))
+            {
+                x = std::numeric_limits<T>::quiet_NaN();
+                if (negative)
+                    x = -x;
+                return ReturnType(true);
+            }
+            return ReturnType(false);
+        }
+    }
+
+    return ReturnType(true);
+}
+
+template <typename T, typename ReturnType>
+ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf)
+{
+    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+    bool negative = false;
+    x = 0;
+    bool after_point = false;
+    T power_of_ten = 1;
+
+    if (buf.eof())
+        throwReadAfterEOF();
+
+    while (!buf.eof())
+    {
+        switch (*buf.position())
+        {
+            case '+':
+                break;
+            case '-':
+                negative = true;
+                break;
+            case '.':
+                after_point = true;
+                break;
+            case '0': [[fallthrough]];
+            case '1': [[fallthrough]];
+            case '2': [[fallthrough]];
+            case '3': [[fallthrough]];
+            case '4': [[fallthrough]];
+            case '5': [[fallthrough]];
+            case '6': [[fallthrough]];
+            case '7': [[fallthrough]];
+            case '8': [[fallthrough]];
+            case '9':
+                if (after_point)
+                {
+                    power_of_ten /= 10;
+                    x += (*buf.position() - '0') * power_of_ten;
+                }
+                else
+                {
+                    x *= 10;
+                    x += *buf.position() - '0';
+                }
+                break;
+            case 'e': [[fallthrough]];
+            case 'E':
+            {
+                ++buf.position();
+                Int32 exponent = 0;
+                readIntText(exponent, buf);
+                x = shift10(x, exponent);
+                if (negative)
+                    x = -x;
+                return ReturnType(true);
+            }
+
+            case 'i': [[fallthrough]];
+            case 'I':
+            {
+                if (assertOrParseInfinity<throw_exception>(buf))
+                {
+                    x = std::numeric_limits<T>::infinity();
+                    if (negative)
+                        x = -x;
+                    return ReturnType(true);
+                }
+                return ReturnType(false);
+            }
+
+            case 'n': [[fallthrough]];
+            case 'N':
+            {
+                if (assertOrParseNaN<throw_exception>(buf))
+                {
+                    x = std::numeric_limits<T>::quiet_NaN();
+                    if (negative)
+                        x = -x;
+                    return ReturnType(true);
+                }
+                return ReturnType(false);
+            }
+
+            default:
+            {
+                if (negative)
+                    x = -x;
+                return ReturnType(true);
+            }
+        }
+        ++buf.position();
+    }
+
+    if (negative)
+        x = -x;
+
+    return ReturnType(true);
+}
+
+template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl<T, void>(x, in); }
+template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl<T, bool>(x, in); }
+
+template <typename T> void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl<T, void>(x, in); }
+template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool>(x, in); }
+
+template <typename T> void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl<T, void>(x, in); }
+template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl<T, bool>(x, in); }
+
+
+/// Implementation that is selected as default.
+
+template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); }
+template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); }
+
+}
author	vitalyisaev <vitalyisaev@ydb.tech>	2023-11-14 09:58:56 +0300
committer	vitalyisaev <vitalyisaev@ydb.tech>	2023-11-14 10:20:20 +0300
commit	c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch)
tree	cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/IO/readFloatText.h
parent	d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff)
download	ydb-c2b2dfd9827a400a8495e172a56343462e3ceb82.tar.gz