diff options
author | vitalyisaev <vitalyisaev@ydb.tech> | 2023-11-14 09:58:56 +0300 |
---|---|---|
committer | vitalyisaev <vitalyisaev@ydb.tech> | 2023-11-14 10:20:20 +0300 |
commit | c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch) | |
tree | cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/IO/readFloatText.h | |
parent | d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff) | |
download | ydb-c2b2dfd9827a400a8495e172a56343462e3ceb82.tar.gz |
YQ Connector: move tests from yql to ydb (OSS)
Перенос папки с тестами на Коннектор из папки yql в папку ydb (синхронизируется с github).
Diffstat (limited to 'contrib/clickhouse/src/IO/readFloatText.h')
-rw-r--r-- | contrib/clickhouse/src/IO/readFloatText.h | 596 |
1 files changed, 596 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/IO/readFloatText.h b/contrib/clickhouse/src/IO/readFloatText.h new file mode 100644 index 0000000000..da4719b8dc --- /dev/null +++ b/contrib/clickhouse/src/IO/readFloatText.h @@ -0,0 +1,596 @@ +#pragma once +#include <type_traits> +#include <IO/ReadHelpers.h> +#include <Core/Defines.h> +#include <base/shift10.h> +#include <Common/StringUtils/StringUtils.h> +#include <double-conversion/double-conversion.h> + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunneeded-internal-declaration" +#endif +#include <fast_float/fast_float.h> +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + +/** Methods for reading floating point numbers from text with decimal representation. + * There are "precise", "fast" and "simple" implementations. + * + * None of the methods supports hexadecimal numbers (0xABC), binary exponent (1p100), leading plus sign. + * + * Precise method always returns a number that is the closest machine representable number to the input. + * + * Fast method is faster (up to 3 times) and usually returns the same value, + * but in rare cases result may differ by least significant bit (for Float32) + * and by up to two least significant bits (for Float64) from precise method. + * Also fast method may parse some garbage as some other unspecified garbage. + * + * Simple method is a little faster for cases of parsing short (few digit) integers, but less precise and slower in other cases. + * It's not recommended to use simple method and it is left only for reference. + * + * For performance test, look at 'read_float_perf' test. + * + * For precision test. 
+ * Parse all existing Float32 numbers: + +CREATE TABLE test.floats ENGINE = Log AS SELECT reinterpretAsFloat32(reinterpretAsString(toUInt32(number))) AS x FROM numbers(0x100000000); + +WITH + toFloat32(toString(x)) AS y, + reinterpretAsUInt32(reinterpretAsString(x)) AS bin_x, + reinterpretAsUInt32(reinterpretAsString(y)) AS bin_y, + abs(bin_x - bin_y) AS diff +SELECT + diff, + count() +FROM test.floats +WHERE NOT isNaN(x) +GROUP BY diff +ORDER BY diff ASC +LIMIT 100 + + * Here are the results: + * + Precise: + ┌─diff─┬────count()─┐ + │ 0 │ 4278190082 │ + └──────┴────────────┘ + (100% roundtrip property) + + Fast: + ┌─diff─┬────count()─┐ + │ 0 │ 3685260580 │ + │ 1 │ 592929502 │ + └──────┴────────────┘ + (The difference is 1 in least significant bit in 13.8% of numbers.) + + Simple: + ┌─diff─┬────count()─┐ + │ 0 │ 2169879994 │ + │ 1 │ 1807178292 │ + │ 2 │ 269505944 │ + │ 3 │ 28826966 │ + │ 4 │ 2566488 │ + │ 5 │ 212878 │ + │ 6 │ 18276 │ + │ 7 │ 1214 │ + │ 8 │ 30 │ + └──────┴────────────┘ + + * Parse random Float64 numbers: + +WITH + rand64() AS bin_x, + reinterpretAsFloat64(reinterpretAsString(bin_x)) AS x, + toFloat64(toString(x)) AS y, + reinterpretAsUInt64(reinterpretAsString(y)) AS bin_y, + abs(bin_x - bin_y) AS diff +SELECT + diff, + count() +FROM numbers(100000000) +WHERE NOT isNaN(x) +GROUP BY diff +ORDER BY diff ASC +LIMIT 100 + + */ + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_NUMBER; +} + + +/// Returns true, iff parsed. 
+bool parseInfinity(ReadBuffer & buf); +bool parseNaN(ReadBuffer & buf); + +void assertInfinity(ReadBuffer & buf); +void assertNaN(ReadBuffer & buf); + + +template <bool throw_exception> +bool assertOrParseInfinity(ReadBuffer & buf) +{ + if constexpr (throw_exception) + { + assertInfinity(buf); + return true; + } + else + return parseInfinity(buf); +} + +template <bool throw_exception> +bool assertOrParseNaN(ReadBuffer & buf) +{ + if constexpr (throw_exception) + { + assertNaN(buf); + return true; + } + else + return parseNaN(buf); +} + + +template <typename T, typename ReturnType> +ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf) +{ + static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextPreciseImpl must be float or double"); + static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII"); + + static constexpr bool throw_exception = std::is_same_v<ReturnType, void>; + + /// Fast path (avoid copying) if the buffer have at least MAX_LENGTH bytes. + static constexpr int MAX_LENGTH = 316; + + if (likely(!buf.eof() && buf.position() + MAX_LENGTH <= buf.buffer().end())) + { + auto * initial_position = buf.position(); + auto res = fast_float::from_chars(initial_position, buf.buffer().end(), x); + + if (unlikely(res.ec != std::errc())) + { + if constexpr (throw_exception) + throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value"); + else + return ReturnType(false); + } + + buf.position() += res.ptr - initial_position; + + return ReturnType(true); + } + else + { + /// Slow path. Copy characters that may be present in floating point number to temporary buffer. 
+ bool negative = false; + + /// We check eof here because we can parse +inf +nan + while (!buf.eof()) + { + switch (*buf.position()) + { + case '+': + ++buf.position(); + continue; + + case '-': + { + negative = true; + ++buf.position(); + continue; + } + + case 'i': [[fallthrough]]; + case 'I': + { + if (assertOrParseInfinity<throw_exception>(buf)) + { + x = std::numeric_limits<T>::infinity(); + if (negative) + x = -x; + return ReturnType(true); + } + return ReturnType(false); + } + + case 'n': [[fallthrough]]; + case 'N': + { + if (assertOrParseNaN<throw_exception>(buf)) + { + x = std::numeric_limits<T>::quiet_NaN(); + if (negative) + x = -x; + return ReturnType(true); + } + return ReturnType(false); + } + + default: + break; + } + + break; + } + + + char tmp_buf[MAX_LENGTH]; + int num_copied_chars = 0; + + while (!buf.eof() && num_copied_chars < MAX_LENGTH) + { + char c = *buf.position(); + if (!(isNumericASCII(c) || c == '-' || c == '+' || c == '.' || c == 'e' || c == 'E')) + break; + + tmp_buf[num_copied_chars] = c; + ++buf.position(); + ++num_copied_chars; + } + + auto res = fast_float::from_chars(tmp_buf, tmp_buf + num_copied_chars, x); + + if (unlikely(res.ec != std::errc())) + { + if constexpr (throw_exception) + throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value"); + else + return ReturnType(false); + } + + if (negative) + x = -x; + + return ReturnType(true); + } +} + + +// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +static inline bool is_made_of_eight_digits_fast(uint64_t val) noexcept +{ + return (((val & 0xF0F0F0F0F0F0F0F0) | (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == 0x3333333333333333); +} + +static inline bool is_made_of_eight_digits_fast(const char * chars) noexcept +{ + uint64_t val; + ::memcpy(&val, chars, 8); + return is_made_of_eight_digits_fast(val); +} + +template <size_t N, typename T> +static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & 
buf) +{ + /// In optimistic case we can skip bound checking for first loop. + if (buf.position() + N <= buf.buffer().end()) + { + for (size_t i = 0; i < N; ++i) + { + if (isNumericASCII(*buf.position())) + { + x *= 10; + x += *buf.position() & 0x0F; + ++buf.position(); + } + else + return; + } + } + else + { + for (size_t i = 0; i < N; ++i) + { + if (!buf.eof() && isNumericASCII(*buf.position())) + { + x *= 10; + x += *buf.position() & 0x0F; + ++buf.position(); + } + else + return; + } + } + + while (!buf.eof() && (buf.position() + 8 <= buf.buffer().end()) && + is_made_of_eight_digits_fast(buf.position())) + { + buf.position() += 8; + } + + while (!buf.eof() && isNumericASCII(*buf.position())) + ++buf.position(); +} + + +template <typename T, typename ReturnType> +ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) +{ + static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double"); + static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' 
&& '"' < '.', "Layout of char is not like ASCII"); + + static constexpr bool throw_exception = std::is_same_v<ReturnType, void>; + + bool negative = false; + x = 0; + UInt64 before_point = 0; + UInt64 after_point = 0; + int after_point_exponent = 0; + int exponent = 0; + + if (in.eof()) + { + if constexpr (throw_exception) + throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value"); + else + return false; + } + + if (*in.position() == '-') + { + negative = true; + ++in.position(); + } + else if (*in.position() == '+') + ++in.position(); + + auto count_after_sign = in.count(); + + constexpr int significant_digits = std::numeric_limits<UInt64>::digits10; + readUIntTextUpToNSignificantDigits<significant_digits>(before_point, in); + + size_t read_digits = in.count() - count_after_sign; + + if (unlikely(read_digits > significant_digits)) + { + int before_point_additional_exponent = static_cast<int>(read_digits) - significant_digits; + x = static_cast<T>(shift10(before_point, before_point_additional_exponent)); + } + else + { + x = before_point; + + /// Shortcut for the common case when there is an integer that fit in Int64. + if (read_digits && (in.eof() || *in.position() < '.')) + { + if (negative) + x = -x; + return ReturnType(true); + } + } + + if (checkChar('.', in)) + { + auto after_point_count = in.count(); + + while (!in.eof() && *in.position() == '0') + ++in.position(); + + auto after_leading_zeros_count = in.count(); + int after_point_num_leading_zeros = static_cast<int>(after_leading_zeros_count - after_point_count); + + readUIntTextUpToNSignificantDigits<significant_digits>(after_point, in); + read_digits = in.count() - after_leading_zeros_count; + after_point_exponent = (read_digits > significant_digits ? 
-significant_digits : static_cast<int>(-read_digits)) - after_point_num_leading_zeros; + } + + if (checkChar('e', in) || checkChar('E', in)) + { + if (in.eof()) + { + if constexpr (throw_exception) + throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent"); + else + return false; + } + + bool exponent_negative = false; + if (*in.position() == '-') + { + exponent_negative = true; + ++in.position(); + } + else if (*in.position() == '+') + { + ++in.position(); + } + + readUIntTextUpToNSignificantDigits<4>(exponent, in); + if (exponent_negative) + exponent = -exponent; + } + + if (after_point) + x += static_cast<T>(shift10(after_point, after_point_exponent)); + + if (exponent) + x = static_cast<T>(shift10(x, exponent)); + + if (negative) + x = -x; + + auto num_characters_without_sign = in.count() - count_after_sign; + + /// Denormals. At most one character is read before denormal and it is '-'. + if (num_characters_without_sign == 0) + { + if (in.eof()) + { + if constexpr (throw_exception) + throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: no digits read"); + else + return false; + } + + if (*in.position() == '+') + { + ++in.position(); + if (in.eof()) + { + if constexpr (throw_exception) + throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after plus sign"); + else + return false; + } + else if (negative) + { + if constexpr (throw_exception) + throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: plus after minus sign"); + else + return false; + } + } + + if (*in.position() == 'i' || *in.position() == 'I') + { + if (assertOrParseInfinity<throw_exception>(in)) + { + x = std::numeric_limits<T>::infinity(); + if (negative) + x = -x; + return ReturnType(true); + } + return ReturnType(false); + } + else if (*in.position() == 'n' || *in.position() == 'N') + { + if 
(assertOrParseNaN<throw_exception>(in)) + { + x = std::numeric_limits<T>::quiet_NaN(); + if (negative) + x = -x; + return ReturnType(true); + } + return ReturnType(false); + } + } + + return ReturnType(true); +} + +template <typename T, typename ReturnType> +ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf) +{ + static constexpr bool throw_exception = std::is_same_v<ReturnType, void>; + + bool negative = false; + x = 0; + bool after_point = false; + T power_of_ten = 1; + + if (buf.eof()) + throwReadAfterEOF(); + + while (!buf.eof()) + { + switch (*buf.position()) + { + case '+': + break; + case '-': + negative = true; + break; + case '.': + after_point = true; + break; + case '0': [[fallthrough]]; + case '1': [[fallthrough]]; + case '2': [[fallthrough]]; + case '3': [[fallthrough]]; + case '4': [[fallthrough]]; + case '5': [[fallthrough]]; + case '6': [[fallthrough]]; + case '7': [[fallthrough]]; + case '8': [[fallthrough]]; + case '9': + if (after_point) + { + power_of_ten /= 10; + x += (*buf.position() - '0') * power_of_ten; + } + else + { + x *= 10; + x += *buf.position() - '0'; + } + break; + case 'e': [[fallthrough]]; + case 'E': + { + ++buf.position(); + Int32 exponent = 0; + readIntText(exponent, buf); + x = shift10(x, exponent); + if (negative) + x = -x; + return ReturnType(true); + } + + case 'i': [[fallthrough]]; + case 'I': + { + if (assertOrParseInfinity<throw_exception>(buf)) + { + x = std::numeric_limits<T>::infinity(); + if (negative) + x = -x; + return ReturnType(true); + } + return ReturnType(false); + } + + case 'n': [[fallthrough]]; + case 'N': + { + if (assertOrParseNaN<throw_exception>(buf)) + { + x = std::numeric_limits<T>::quiet_NaN(); + if (negative) + x = -x; + return ReturnType(true); + } + return ReturnType(false); + } + + default: + { + if (negative) + x = -x; + return ReturnType(true); + } + } + ++buf.position(); + } + + if (negative) + x = -x; + + return ReturnType(true); +} + +template <typename T> void 
readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl<T, void>(x, in); } +template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl<T, bool>(x, in); } + +template <typename T> void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl<T, void>(x, in); } +template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool>(x, in); } + +template <typename T> void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl<T, void>(x, in); } +template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl<T, bool>(x, in); } + + +/// Implementation that is selected as default. + +template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); } +template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); } + +} |