diff options
author | robot-piglet <robot-piglet@yandex-team.com> | 2024-11-14 15:56:23 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2024-11-14 16:10:31 +0300 |
commit | 6559ca8141e036ff577301dde8f75779e599d6e7 (patch) | |
tree | 9c1e6fcb169b4d60d75db4fe094b265d2ce26721 /yt | |
parent | bee451612e60e89d2052570a54940d318799479d (diff) | |
download | ydb-6559ca8141e036ff577301dde8f75779e599d6e7.tar.gz |
Fix various issues related to Decimal and Arrow
* Changelog entry
Type: fix
Component: proxy
Some fixes in decimal:
* Support parsing of nested `decimal128`/`decimal256` in Arrow.
* Fix bug in `decimal256` unversioned value representation --
before this change, Arrow parser for `decimal256(n, p)` was
always emitting strings of length 256 bits, even for n \<= 38,
which is incorrect in the representation of YT `decimal(n, p)`
type. Now it produces strings of variadic length (32, 64, 128
or 256 bits) depending on n, similar to `decimal128(n, p)`.
---
Pull Request resolved: <https://github.com/ytsaurus/ytsaurus/pull/942>
commit_hash:32e66c7eb4d996caf0893f97d269fb1930bc5f7a
Diffstat (limited to 'yt')
-rw-r--r-- | yt/yt/library/decimal/decimal.cpp | 20 | ||||
-rw-r--r-- | yt/yt/library/decimal/decimal.h | 3 | ||||
-rw-r--r-- | yt/yt/library/formats/arrow_parser.cpp | 77 |
3 files changed, 79 insertions, 21 deletions
diff --git a/yt/yt/library/decimal/decimal.cpp b/yt/yt/library/decimal/decimal.cpp index 3df4b44028..9a61ee585a 100644 --- a/yt/yt/library/decimal/decimal.cpp +++ b/yt/yt/library/decimal/decimal.cpp @@ -891,10 +891,28 @@ TStringBuf TDecimal::WriteBinary256(int precision, TValue256 value, char* buffer CheckDecimalIntBits<TValue256>(precision); YT_VERIFY(bufferLength >= resultLength); - DecimalIntegerToBinaryUnchecked(std::move(value), buffer); + DecimalIntegerToBinaryUnchecked(value, buffer); return TStringBuf{buffer, sizeof(TValue256)}; } +TStringBuf TDecimal::WriteBinary256Variadic(int precision, TValue256 value, char* buffer, size_t bufferLength) +{ + const size_t resultLength = GetValueBinarySize(precision); + switch (resultLength) { + case 4: + return WriteBinary32(precision, *reinterpret_cast<i32*>(value.Parts.data()), buffer, bufferLength); + case 8: + return WriteBinary64(precision, *reinterpret_cast<i64*>(value.Parts.data()), buffer, bufferLength); + case 16: + return WriteBinary128(precision, *reinterpret_cast<TValue128*>(value.Parts.data()), buffer, bufferLength); + case 32: + return WriteBinary256(precision, value, buffer, bufferLength); + default: + THROW_ERROR_EXCEPTION("Invalid precision %v", precision); + } +} + + template <typename T> Y_FORCE_INLINE void CheckBufferLength(int precision, size_t bufferLength) { diff --git a/yt/yt/library/decimal/decimal.h b/yt/yt/library/decimal/decimal.h index 27375d3904..1d28efe2ff 100644 --- a/yt/yt/library/decimal/decimal.h +++ b/yt/yt/library/decimal/decimal.h @@ -24,6 +24,7 @@ public: }; static_assert(sizeof(TValue128) == 2 * sizeof(ui64)); + //! Lower-endian representation of 256-bit decimal value. struct TValue256 { std::array<ui32, 8> Parts; @@ -64,6 +65,8 @@ public: // Writes either 32-bit, 64-bit or 128-bit binary value depending on precision, provided a TValue128. 
static TStringBuf WriteBinary128Variadic(int precision, TValue128 value, char* buffer, size_t bufferLength); + // Writes either 32-bit, 64-bit, 128-bit or 256-bit binary value depending on precision, provided a TValue256. + static TStringBuf WriteBinary256Variadic(int precision, TValue256 value, char* buffer, size_t bufferLength); static i32 ParseBinary32(int precision, TStringBuf buffer); static i64 ParseBinary64(int precision, TStringBuf buffer); diff --git a/yt/yt/library/formats/arrow_parser.cpp b/yt/yt/library/formats/arrow_parser.cpp index fb3846c1a6..e248240fd6 100644 --- a/yt/yt/library/formats/arrow_parser.cpp +++ b/yt/yt/library/formats/arrow_parser.cpp @@ -40,6 +40,28 @@ void ThrowOnError(const arrow::Status& status) } } +template <class TUnderlyingValueType> +TStringBuf SerializeDecimalBinary(const TStringBuf& value, int precision, char* buffer, size_t bufferLength) +{ + // NB: Arrow wire representation of Decimal128 is little-endian and (obviously) 128 bit, + // while YT in-memory representation of Decimal is big-endian, variadic-length of either 32 bit, 64 bit or 128 bit, + // and MSB-flipped to ensure lexical sorting order. + // Representation of Decimal256 is similar, but the upper limit for a length is 256 bit. 
+ TUnderlyingValueType decimalValue; + YT_VERIFY(value.size() == sizeof(decimalValue)); + std::memcpy(&decimalValue, value.data(), value.size()); + + TStringBuf decimalBinary; + if constexpr (std::is_same_v<TUnderlyingValueType, TDecimal::TValue128>) { + decimalBinary = TDecimal::WriteBinary128Variadic(precision, decimalValue, buffer, bufferLength); + } else if constexpr (std::is_same_v<TUnderlyingValueType, TDecimal::TValue256>) { + decimalBinary = TDecimal::WriteBinary256Variadic(precision, decimalValue, buffer, bufferLength); + } else { + static_assert(std::is_same_v<TUnderlyingValueType, TDecimal::TValue256>, "Unexpected decimal type"); + } + return decimalBinary; +} + //////////////////////////////////////////////////////////////////////////////// class TArraySimpleVisitor @@ -291,28 +313,12 @@ private: } template <class TUnderlyingValueType> - TUnversionedValue MakeDecimalBinaryValue(const TStringBuf& value, i64 columnId, int precision) + TUnversionedValue MakeDecimalBinaryValue(const TStringBuf& arrowValue, i64 columnId, int precision) { - // NB: Arrow wire representation of Decimal128 is little-endian and (obviously) 128 bit, - // while YT in-memory representation of Decimal is big-endian, variadic-length of either 32 bit, 64 bit or 128 bit, - // and MSB-flipped to ensure lexical sorting order. - // Representation of Decimal256 is similar, but only 256 bits. 
- TUnderlyingValueType decimalValue; - YT_VERIFY(value.size() == sizeof(decimalValue)); - std::memcpy(&decimalValue, value.data(), value.size()); - - const auto maxByteCount = sizeof(decimalValue); + const auto maxByteCount = sizeof(TUnderlyingValueType); char* buffer = BufferForStringLikeValues_->Preallocate(maxByteCount); - TStringBuf decimalBinary; - if constexpr (std::is_same_v<TUnderlyingValueType, TDecimal::TValue128>) { - decimalBinary = TDecimal::WriteBinary128Variadic(precision, decimalValue, buffer, maxByteCount); - } else if constexpr (std::is_same_v<TUnderlyingValueType, TDecimal::TValue256>) { - decimalBinary = TDecimal::WriteBinary256(precision, decimalValue, buffer, maxByteCount); - } else { - static_assert(std::is_same_v<TUnderlyingValueType, TDecimal::TValue256>, "Unexpected decimal type"); - } + auto decimalBinary = SerializeDecimalBinary<TUnderlyingValueType>(arrowValue, precision, buffer, maxByteCount); BufferForStringLikeValues_->Advance(decimalBinary.size()); - return MakeUnversionedStringValue(decimalBinary, columnId); } }; @@ -456,6 +462,20 @@ public: return ParseStruct(); } + arrow::Status Visit(const arrow::Decimal128Type& type) override + { + return ParseStringLikeArray<arrow::Decimal128Array>([&] (const TStringBuf& value) { + WriteDecimalBinary<TDecimal::TValue128>(value, type.precision()); + }); + } + + arrow::Status Visit(const arrow::Decimal256Type& type) override + { + return ParseStringLikeArray<arrow::Decimal256Array>([&] (const TStringBuf& value) { + WriteDecimalBinary<TDecimal::TValue256>(value, type.precision()); + }); + } + private: const int RowIndex_; @@ -506,12 +526,20 @@ private: template <typename ArrayType> arrow::Status ParseStringLikeArray() { + return ParseStringLikeArray<ArrayType>([&] (const TStringBuf& value) { + Writer_->WriteBinaryString(value); + }); + } + + template <typename ArrayType> + arrow::Status ParseStringLikeArray(auto writeStringValue) + { auto array = std::static_pointer_cast<ArrayType>(Array_); if 
(array->IsNull(RowIndex_)) { Writer_->WriteEntity(); } else { auto element = array->GetView(RowIndex_); - Writer_->WriteBinaryString(TStringBuf(element.data(), element.size())); + writeStringValue(TStringBuf(element.data(), element.size())); } return arrow::Status::OK(); } @@ -610,6 +638,15 @@ private: } return arrow::Status::OK(); } + + template <class TUnderlyingType> + void WriteDecimalBinary(TStringBuf arrowValue, int precision) + { + const auto maxByteCount = sizeof(TUnderlyingType); + char buffer[maxByteCount]; + auto decimalBinary = SerializeDecimalBinary<TUnderlyingType>(arrowValue, precision, buffer, maxByteCount); + Writer_->WriteBinaryString(decimalBinary); + } }; //////////////////////////////////////////////////////////////////////////////// |