aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/IO/readFloatText.h
diff options
context:
space:
mode:
authorvitalyisaev <vitalyisaev@ydb.tech>2023-11-14 09:58:56 +0300
committervitalyisaev <vitalyisaev@ydb.tech>2023-11-14 10:20:20 +0300
commitc2b2dfd9827a400a8495e172a56343462e3ceb82 (patch)
treecd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/IO/readFloatText.h
parentd4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff)
downloadydb-c2b2dfd9827a400a8495e172a56343462e3ceb82.tar.gz
YQ Connector: move tests from yql to ydb (OSS)
Перенос папки с тестами на Коннектор из папки yql в папку ydb (синхронизируется с github).
Diffstat (limited to 'contrib/clickhouse/src/IO/readFloatText.h')
-rw-r--r--contrib/clickhouse/src/IO/readFloatText.h596
1 files changed, 596 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/IO/readFloatText.h b/contrib/clickhouse/src/IO/readFloatText.h
new file mode 100644
index 0000000000..da4719b8dc
--- /dev/null
+++ b/contrib/clickhouse/src/IO/readFloatText.h
@@ -0,0 +1,596 @@
+#pragma once
+#include <type_traits>
+#include <IO/ReadHelpers.h>
+#include <Core/Defines.h>
+#include <base/shift10.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <double-conversion/double-conversion.h>
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
+#endif
+#include <fast_float/fast_float.h>
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+/** Methods for reading floating point numbers from text with decimal representation.
+ * There are "precise", "fast" and "simple" implementations.
+ *
+ * Neither of methods support hexadecimal numbers (0xABC), binary exponent (1p100), leading plus sign.
+ *
+ * Precise method always returns a number that is the closest machine representable number to the input.
+ *
+ * Fast method is faster (up to 3 times) and usually return the same value,
+ * but in rare cases result may differ by lest significant bit (for Float32)
+ * and by up to two least significant bits (for Float64) from precise method.
+ * Also fast method may parse some garbage as some other unspecified garbage.
+ *
+ * Simple method is little faster for cases of parsing short (few digit) integers, but less precise and slower in other cases.
+ * It's not recommended to use simple method and it is left only for reference.
+ *
+ * For performance test, look at 'read_float_perf' test.
+ *
+ * For precision test.
+ * Parse all existing Float32 numbers:
+
+CREATE TABLE test.floats ENGINE = Log AS SELECT reinterpretAsFloat32(reinterpretAsString(toUInt32(number))) AS x FROM numbers(0x100000000);
+
+WITH
+ toFloat32(toString(x)) AS y,
+ reinterpretAsUInt32(reinterpretAsString(x)) AS bin_x,
+ reinterpretAsUInt32(reinterpretAsString(y)) AS bin_y,
+ abs(bin_x - bin_y) AS diff
+SELECT
+ diff,
+ count()
+FROM test.floats
+WHERE NOT isNaN(x)
+GROUP BY diff
+ORDER BY diff ASC
+LIMIT 100
+
+ * Here are the results:
+ *
+ Precise:
+ ┌─diff─┬────count()─┐
+ │ 0 │ 4278190082 │
+ └──────┴────────────┘
+ (100% roundtrip property)
+
+ Fast:
+ ┌─diff─┬────count()─┐
+ │ 0 │ 3685260580 │
+ │ 1 │ 592929502 │
+ └──────┴────────────┘
+ (The difference is 1 in least significant bit in 13.8% of numbers.)
+
+ Simple:
+ ┌─diff─┬────count()─┐
+ │ 0 │ 2169879994 │
+ │ 1 │ 1807178292 │
+ │ 2 │ 269505944 │
+ │ 3 │ 28826966 │
+ │ 4 │ 2566488 │
+ │ 5 │ 212878 │
+ │ 6 │ 18276 │
+ │ 7 │ 1214 │
+ │ 8 │ 30 │
+ └──────┴────────────┘
+
+ * Parse random Float64 numbers:
+
+WITH
+ rand64() AS bin_x,
+ reinterpretAsFloat64(reinterpretAsString(bin_x)) AS x,
+ toFloat64(toString(x)) AS y,
+ reinterpretAsUInt64(reinterpretAsString(y)) AS bin_y,
+ abs(bin_x - bin_y) AS diff
+SELECT
+ diff,
+ count()
+FROM numbers(100000000)
+WHERE NOT isNaN(x)
+GROUP BY diff
+ORDER BY diff ASC
+LIMIT 100
+
+ */
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_PARSE_NUMBER;
+}
+
+
+/// Returns true, iff parsed.
+bool parseInfinity(ReadBuffer & buf);
+bool parseNaN(ReadBuffer & buf);
+
+void assertInfinity(ReadBuffer & buf);
+void assertNaN(ReadBuffer & buf);
+
+
+template <bool throw_exception>
+bool assertOrParseInfinity(ReadBuffer & buf)
+{
+ if constexpr (throw_exception)
+ {
+ assertInfinity(buf);
+ return true;
+ }
+ else
+ return parseInfinity(buf);
+}
+
+template <bool throw_exception>
+bool assertOrParseNaN(ReadBuffer & buf)
+{
+ if constexpr (throw_exception)
+ {
+ assertNaN(buf);
+ return true;
+ }
+ else
+ return parseNaN(buf);
+}
+
+
+template <typename T, typename ReturnType>
+ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf)
+{
+ static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextPreciseImpl must be float or double");
+ static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII");
+
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ /// Fast path (avoid copying) if the buffer have at least MAX_LENGTH bytes.
+ static constexpr int MAX_LENGTH = 316;
+
+ if (likely(!buf.eof() && buf.position() + MAX_LENGTH <= buf.buffer().end()))
+ {
+ auto * initial_position = buf.position();
+ auto res = fast_float::from_chars(initial_position, buf.buffer().end(), x);
+
+ if (unlikely(res.ec != std::errc()))
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value");
+ else
+ return ReturnType(false);
+ }
+
+ buf.position() += res.ptr - initial_position;
+
+ return ReturnType(true);
+ }
+ else
+ {
+ /// Slow path. Copy characters that may be present in floating point number to temporary buffer.
+ bool negative = false;
+
+ /// We check eof here because we can parse +inf +nan
+ while (!buf.eof())
+ {
+ switch (*buf.position())
+ {
+ case '+':
+ ++buf.position();
+ continue;
+
+ case '-':
+ {
+ negative = true;
+ ++buf.position();
+ continue;
+ }
+
+ case 'i': [[fallthrough]];
+ case 'I':
+ {
+ if (assertOrParseInfinity<throw_exception>(buf))
+ {
+ x = std::numeric_limits<T>::infinity();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+
+ case 'n': [[fallthrough]];
+ case 'N':
+ {
+ if (assertOrParseNaN<throw_exception>(buf))
+ {
+ x = std::numeric_limits<T>::quiet_NaN();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+
+ default:
+ break;
+ }
+
+ break;
+ }
+
+
+ char tmp_buf[MAX_LENGTH];
+ int num_copied_chars = 0;
+
+ while (!buf.eof() && num_copied_chars < MAX_LENGTH)
+ {
+ char c = *buf.position();
+ if (!(isNumericASCII(c) || c == '-' || c == '+' || c == '.' || c == 'e' || c == 'E'))
+ break;
+
+ tmp_buf[num_copied_chars] = c;
+ ++buf.position();
+ ++num_copied_chars;
+ }
+
+ auto res = fast_float::from_chars(tmp_buf, tmp_buf + num_copied_chars, x);
+
+ if (unlikely(res.ec != std::errc()))
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value");
+ else
+ return ReturnType(false);
+ }
+
+ if (negative)
+ x = -x;
+
+ return ReturnType(true);
+ }
+}
+
+
+// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
+static inline bool is_made_of_eight_digits_fast(uint64_t val) noexcept
+{
+ return (((val & 0xF0F0F0F0F0F0F0F0) | (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == 0x3333333333333333);
+}
+
+static inline bool is_made_of_eight_digits_fast(const char * chars) noexcept
+{
+ uint64_t val;
+ ::memcpy(&val, chars, 8);
+ return is_made_of_eight_digits_fast(val);
+}
+
+template <size_t N, typename T>
+static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf)
+{
+ /// In optimistic case we can skip bound checking for first loop.
+ if (buf.position() + N <= buf.buffer().end())
+ {
+ for (size_t i = 0; i < N; ++i)
+ {
+ if (isNumericASCII(*buf.position()))
+ {
+ x *= 10;
+ x += *buf.position() & 0x0F;
+ ++buf.position();
+ }
+ else
+ return;
+ }
+ }
+ else
+ {
+ for (size_t i = 0; i < N; ++i)
+ {
+ if (!buf.eof() && isNumericASCII(*buf.position()))
+ {
+ x *= 10;
+ x += *buf.position() & 0x0F;
+ ++buf.position();
+ }
+ else
+ return;
+ }
+ }
+
+ while (!buf.eof() && (buf.position() + 8 <= buf.buffer().end()) &&
+ is_made_of_eight_digits_fast(buf.position()))
+ {
+ buf.position() += 8;
+ }
+
+ while (!buf.eof() && isNumericASCII(*buf.position()))
+ ++buf.position();
+}
+
+
+template <typename T, typename ReturnType>
+ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
+{
+ static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
+ static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII");
+
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ bool negative = false;
+ x = 0;
+ UInt64 before_point = 0;
+ UInt64 after_point = 0;
+ int after_point_exponent = 0;
+ int exponent = 0;
+
+ if (in.eof())
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value");
+ else
+ return false;
+ }
+
+ if (*in.position() == '-')
+ {
+ negative = true;
+ ++in.position();
+ }
+ else if (*in.position() == '+')
+ ++in.position();
+
+ auto count_after_sign = in.count();
+
+ constexpr int significant_digits = std::numeric_limits<UInt64>::digits10;
+ readUIntTextUpToNSignificantDigits<significant_digits>(before_point, in);
+
+ size_t read_digits = in.count() - count_after_sign;
+
+ if (unlikely(read_digits > significant_digits))
+ {
+ int before_point_additional_exponent = static_cast<int>(read_digits) - significant_digits;
+ x = static_cast<T>(shift10(before_point, before_point_additional_exponent));
+ }
+ else
+ {
+ x = before_point;
+
+ /// Shortcut for the common case when there is an integer that fit in Int64.
+ if (read_digits && (in.eof() || *in.position() < '.'))
+ {
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ }
+
+ if (checkChar('.', in))
+ {
+ auto after_point_count = in.count();
+
+ while (!in.eof() && *in.position() == '0')
+ ++in.position();
+
+ auto after_leading_zeros_count = in.count();
+ int after_point_num_leading_zeros = static_cast<int>(after_leading_zeros_count - after_point_count);
+
+ readUIntTextUpToNSignificantDigits<significant_digits>(after_point, in);
+ read_digits = in.count() - after_leading_zeros_count;
+ after_point_exponent = (read_digits > significant_digits ? -significant_digits : static_cast<int>(-read_digits)) - after_point_num_leading_zeros;
+ }
+
+ if (checkChar('e', in) || checkChar('E', in))
+ {
+ if (in.eof())
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent");
+ else
+ return false;
+ }
+
+ bool exponent_negative = false;
+ if (*in.position() == '-')
+ {
+ exponent_negative = true;
+ ++in.position();
+ }
+ else if (*in.position() == '+')
+ {
+ ++in.position();
+ }
+
+ readUIntTextUpToNSignificantDigits<4>(exponent, in);
+ if (exponent_negative)
+ exponent = -exponent;
+ }
+
+ if (after_point)
+ x += static_cast<T>(shift10(after_point, after_point_exponent));
+
+ if (exponent)
+ x = static_cast<T>(shift10(x, exponent));
+
+ if (negative)
+ x = -x;
+
+ auto num_characters_without_sign = in.count() - count_after_sign;
+
+ /// Denormals. At most one character is read before denormal and it is '-'.
+ if (num_characters_without_sign == 0)
+ {
+ if (in.eof())
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: no digits read");
+ else
+ return false;
+ }
+
+ if (*in.position() == '+')
+ {
+ ++in.position();
+ if (in.eof())
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after plus sign");
+ else
+ return false;
+ }
+ else if (negative)
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: plus after minus sign");
+ else
+ return false;
+ }
+ }
+
+ if (*in.position() == 'i' || *in.position() == 'I')
+ {
+ if (assertOrParseInfinity<throw_exception>(in))
+ {
+ x = std::numeric_limits<T>::infinity();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+ else if (*in.position() == 'n' || *in.position() == 'N')
+ {
+ if (assertOrParseNaN<throw_exception>(in))
+ {
+ x = std::numeric_limits<T>::quiet_NaN();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+ }
+
+ return ReturnType(true);
+}
+
+template <typename T, typename ReturnType>
+ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ bool negative = false;
+ x = 0;
+ bool after_point = false;
+ T power_of_ten = 1;
+
+ if (buf.eof())
+ throwReadAfterEOF();
+
+ while (!buf.eof())
+ {
+ switch (*buf.position())
+ {
+ case '+':
+ break;
+ case '-':
+ negative = true;
+ break;
+ case '.':
+ after_point = true;
+ break;
+ case '0': [[fallthrough]];
+ case '1': [[fallthrough]];
+ case '2': [[fallthrough]];
+ case '3': [[fallthrough]];
+ case '4': [[fallthrough]];
+ case '5': [[fallthrough]];
+ case '6': [[fallthrough]];
+ case '7': [[fallthrough]];
+ case '8': [[fallthrough]];
+ case '9':
+ if (after_point)
+ {
+ power_of_ten /= 10;
+ x += (*buf.position() - '0') * power_of_ten;
+ }
+ else
+ {
+ x *= 10;
+ x += *buf.position() - '0';
+ }
+ break;
+ case 'e': [[fallthrough]];
+ case 'E':
+ {
+ ++buf.position();
+ Int32 exponent = 0;
+ readIntText(exponent, buf);
+ x = shift10(x, exponent);
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+
+ case 'i': [[fallthrough]];
+ case 'I':
+ {
+ if (assertOrParseInfinity<throw_exception>(buf))
+ {
+ x = std::numeric_limits<T>::infinity();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+
+ case 'n': [[fallthrough]];
+ case 'N':
+ {
+ if (assertOrParseNaN<throw_exception>(buf))
+ {
+ x = std::numeric_limits<T>::quiet_NaN();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+
+ default:
+ {
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ }
+ ++buf.position();
+ }
+
+ if (negative)
+ x = -x;
+
+ return ReturnType(true);
+}
+
+template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl<T, void>(x, in); }
+template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl<T, bool>(x, in); }
+
+template <typename T> void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl<T, void>(x, in); }
+template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool>(x, in); }
+
+template <typename T> void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl<T, void>(x, in); }
+template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl<T, bool>(x, in); }
+
+
+/// Implementation that is selected as default.
+
+template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); }
+template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); }
+
+}