// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "contrib/libs/apache/arrow_next/cpp/src/parquet/statistics.h" #include #include #include #include #include #include #include #include "contrib/libs/apache/arrow_next/cpp/src/arrow/array.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/type.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/type_traits.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/bit_run_reader.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/checked_cast.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/float16.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/logging.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/ubsan.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/visit_data_inline.h" #include "contrib/libs/apache/arrow_next/cpp/src/parquet/encoding.h" #include "contrib/libs/apache/arrow_next/cpp/src/parquet/exception.h" #include "contrib/libs/apache/arrow_next/cpp/src/parquet/platform.h" #include "contrib/libs/apache/arrow_next/cpp/src/parquet/schema.h" using arrow20::default_memory_pool; using arrow20::MemoryPool; using arrow20::internal::checked_cast; using arrow20::util::Float16; using arrow20::util::SafeCopy; using arrow20::util::SafeLoad; namespace parquet20 { namespace { // ---------------------------------------------------------------------- // Comparator implementations constexpr int value_length(int value_length, const ByteArray& value) { return value.len; } constexpr int value_length(int type_length, const FLBA& value) { return type_length; } // Static "constants" for normalizing float16 min/max values. These need to be expressed // as pointers because `Float16LogicalType` represents an FLBA. struct Float16Constants { static constexpr const uint8_t* lowest() { return lowest_.data(); } static constexpr const uint8_t* max() { return max_.data(); } static constexpr const uint8_t* positive_zero() { return positive_zero_.data(); } static constexpr const uint8_t* negative_zero() { return negative_zero_.data(); } private: using Bytes = std::array; static constexpr Bytes lowest_ = std::numeric_limits::lowest().ToLittleEndian(); static constexpr Bytes max_ = std::numeric_limits::max().ToLittleEndian(); static constexpr Bytes positive_zero_ = (+Float16::FromBits(0)).ToLittleEndian(); static constexpr Bytes negative_zero_ = (-Float16::FromBits(0)).ToLittleEndian(); }; template struct CompareHelper { using T = typename DType::c_type; static_assert(!std::is_unsigned::value || std::is_same::value, "T is an unsigned numeric"); constexpr static T DefaultMin() { return std::numeric_limits::max(); } constexpr static T DefaultMax() { return std::numeric_limits::lowest(); } // MSVC17 fix, isnan is not overloaded for IntegralType as per C++11 // standard requirements. template static ::arrow20::enable_if_t::value, T> Coalesce(T val, T fallback) { return std::isnan(val) ? fallback : val; } template static ::arrow20::enable_if_t::value, T> Coalesce( T val, T fallback) { return val; } static inline bool Compare(int type_length, const T& a, const T& b) { return a < b; } static T Min(int type_length, T a, T b) { return a < b ? a : b; } static T Max(int type_length, T a, T b) { return a < b ? b : a; } }; template struct UnsignedCompareHelperBase { using T = typename DType::c_type; using UCType = typename std::make_unsigned::type; static_assert(!std::is_same::value, "T is unsigned"); static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size"); // NOTE: according to the C++ spec, unsigned-to-signed conversion is // implementation-defined if the original value does not fit in the signed type // (i.e., two's complement cannot be assumed even on mainstream machines, // because the compiler may decide otherwise). Hence the use of `SafeCopy` // below for deterministic bit-casting. // (see "Integer conversions" in // https://en.cppreference.com/w/cpp/language/implicit_conversion) static const T DefaultMin() { return SafeCopy(std::numeric_limits::max()); } static const T DefaultMax() { return 0; } static T Coalesce(T val, T fallback) { return val; } static bool Compare(int type_length, T a, T b) { return SafeCopy(a) < SafeCopy(b); } static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; } static T Max(int type_length, T a, T b) { return Compare(type_length, a, b) ? b : a; } }; template <> struct CompareHelper : public UnsignedCompareHelperBase {}; template <> struct CompareHelper : public UnsignedCompareHelperBase {}; template struct CompareHelper { using T = typename Int96Type::c_type; using msb_type = typename std::conditional::type; static T DefaultMin() { uint32_t kMsbMax = SafeCopy(std::numeric_limits::max()); uint32_t kMax = std::numeric_limits::max(); return {kMax, kMax, kMsbMax}; } static T DefaultMax() { uint32_t kMsbMin = SafeCopy(std::numeric_limits::min()); uint32_t kMin = std::numeric_limits::min(); return {kMin, kMin, kMsbMin}; } static T Coalesce(T val, T fallback) { return val; } static inline bool Compare(int type_length, const T& a, const T& b) { if (a.value[2] != b.value[2]) { // Only the MSB bit is by Signed comparison. For little-endian, this is the // last bit of Int96 type. return SafeCopy(a.value[2]) < SafeCopy(b.value[2]); } else if (a.value[1] != b.value[1]) { return (a.value[1] < b.value[1]); } return (a.value[0] < b.value[0]); } static T Min(int type_length, const T& a, const T& b) { return Compare(0, a, b) ? a : b; } static T Max(int type_length, const T& a, const T& b) { return Compare(0, a, b) ? b : a; } }; template struct BinaryLikeComparer {}; template struct BinaryLikeComparer { static bool Compare(int type_length, const T& a, const T& b) { int a_length = value_length(type_length, a); int b_length = value_length(type_length, b); // Unsigned comparison is used for non-numeric types so straight // lexicographic comparison makes sense. (a.ptr is always unsigned).... return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr + b_length); } }; template struct BinaryLikeComparer { static bool Compare(int type_length, const T& a, const T& b) { // Is signed is used for integers encoded as big-endian twos // complement integers. (e.g. decimals). int a_length = value_length(type_length, a); int b_length = value_length(type_length, b); // At least of the lengths is zero. if (a_length == 0 || b_length == 0) { return a_length == 0 && b_length > 0; } int8_t first_a = *a.ptr; int8_t first_b = *b.ptr; // We can short circuit for different signed numbers or // for equal length bytes arrays that have different first bytes. // The equality requirement is necessary for sign extension cases. // 0xFF10 should be equal to 0x10 (due to big endian sign extension). if ((0x80 & first_a) != (0x80 & first_b) || (a_length == b_length && first_a != first_b)) { return first_a < first_b; } // When the lengths are unequal and the numbers are of the same // sign we need to do comparison by sign extending the shorter // value first, and once we get to equal sized arrays, lexicographical // unsigned comparison of everything but the first byte is sufficient. const uint8_t* a_start = a.ptr; const uint8_t* b_start = b.ptr; if (a_length != b_length) { const uint8_t* lead_start = nullptr; const uint8_t* lead_end = nullptr; if (a_length > b_length) { int lead_length = a_length - b_length; lead_start = a.ptr; lead_end = a.ptr + lead_length; a_start += lead_length; } else { DCHECK_LT(a_length, b_length); int lead_length = b_length - a_length; lead_start = b.ptr; lead_end = b.ptr + lead_length; b_start += lead_length; } // Compare extra bytes to the sign extension of the first // byte of the other number. uint8_t extension = first_a < 0 ? 0xFF : 0; bool not_equal = std::any_of(lead_start, lead_end, [extension](uint8_t a) { return extension != a; }); if (not_equal) { // Since sign extension are extrema values for unsigned bytes: // // Four cases exist: // negative values: // b is the longer value. // b must be the lesser value: return false // else: // a must be the lesser value: return true // // positive values: // b is the longer value. // values in b must be greater than a: return true // else: // values in a must be greater than b: return false bool negative_values = first_a < 0; bool b_longer = a_length < b_length; return negative_values != b_longer; } } else { a_start++; b_start++; } return std::lexicographical_compare(a_start, a.ptr + a_length, b_start, b.ptr + b_length); } }; template struct BinaryLikeCompareHelperBase { using T = typename DType::c_type; static T DefaultMin() { return {}; } static T DefaultMax() { return {}; } static T Coalesce(T val, T fallback) { return val; } static inline bool Compare(int type_length, const T& a, const T& b) { return BinaryLikeComparer::Compare(type_length, a, b); } static T Min(int type_length, const T& a, const T& b) { if (a.ptr == nullptr) return b; if (b.ptr == nullptr) return a; return Compare(type_length, a, b) ? a : b; } static T Max(int type_length, const T& a, const T& b) { if (a.ptr == nullptr) return b; if (b.ptr == nullptr) return a; return Compare(type_length, a, b) ? b : a; } }; template struct CompareHelper : public BinaryLikeCompareHelperBase {}; template struct CompareHelper : public BinaryLikeCompareHelperBase {}; template <> struct CompareHelper { using T = FLBA; static T DefaultMin() { return T{Float16Constants::max()}; } static T DefaultMax() { return T{Float16Constants::lowest()}; } static T Coalesce(T val, T fallback) { return (val.ptr == nullptr || Float16::FromLittleEndian(val.ptr).is_nan()) ? fallback : val; } static inline bool Compare(int type_length, const T& a, const T& b) { const auto lhs = Float16::FromLittleEndian(a.ptr); const auto rhs = Float16::FromLittleEndian(b.ptr); // NaN is handled here (same behavior as native float compare) return lhs < rhs; } static T Min(int type_length, const T& a, const T& b) { if (a.ptr == nullptr) return b; if (b.ptr == nullptr) return a; return Compare(type_length, a, b) ? a : b; } static T Max(int type_length, const T& a, const T& b) { if (a.ptr == nullptr) return b; if (b.ptr == nullptr) return a; return Compare(type_length, a, b) ? b : a; } }; using ::std::optional; template ::arrow20::enable_if_t::value, optional>> CleanStatistic(std::pair min_max, LogicalType::Type::type) { return min_max; } // In case of floating point types, the following rules are applied (as per // upstream parquet-mr): // - If any of min/max is NaN, return nothing. // - If min is 0.0f, replace with -0.0f // - If max is -0.0f, replace with 0.0f template ::arrow20::enable_if_t::value, optional>> CleanStatistic(std::pair min_max, LogicalType::Type::type) { T min = min_max.first; T max = min_max.second; // Ignore if one of the value is nan. if (std::isnan(min) || std::isnan(max)) { return ::std::nullopt; } if (min == std::numeric_limits::max() && max == std::numeric_limits::lowest()) { return ::std::nullopt; } T zero{}; if (min == zero && !std::signbit(min)) { min = -min; } if (max == zero && std::signbit(max)) { max = -max; } return {{min, max}}; } optional> CleanFloat16Statistic(std::pair min_max) { FLBA min_flba = min_max.first; FLBA max_flba = min_max.second; Float16 min = Float16::FromLittleEndian(min_flba.ptr); Float16 max = Float16::FromLittleEndian(max_flba.ptr); if (min.is_nan() || max.is_nan()) { return ::std::nullopt; } if (min == std::numeric_limits::max() && max == std::numeric_limits::lowest()) { return ::std::nullopt; } if (min.is_zero() && !min.signbit()) { min_flba = FLBA{Float16Constants::negative_zero()}; } if (max.is_zero() && max.signbit()) { max_flba = FLBA{Float16Constants::positive_zero()}; } return {{min_flba, max_flba}}; } optional> CleanStatistic(std::pair min_max, LogicalType::Type::type logical_type) { if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) { return ::std::nullopt; } if (logical_type == LogicalType::Type::FLOAT16) { return CleanFloat16Statistic(std::move(min_max)); } return min_max; } optional> CleanStatistic( std::pair min_max, LogicalType::Type::type) { if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) { return ::std::nullopt; } return min_max; } template struct RebindLogical { using DType = T; using c_type = typename DType::c_type; }; template <> struct RebindLogical { using DType = FLBAType; using c_type = DType::c_type; }; template class TypedComparatorImpl : virtual public TypedComparator::DType> { public: using T = typename RebindLogical::c_type; using Helper = CompareHelper; explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {} bool CompareInline(const T& a, const T& b) const { return Helper::Compare(type_length_, a, b); } bool Compare(const T& a, const T& b) const override { return CompareInline(a, b); } std::pair GetMinMax(const T* values, int64_t length) const override { DCHECK_GT(length, 0); T min = Helper::DefaultMin(); T max = Helper::DefaultMax(); for (int64_t i = 0; i < length; i++) { const auto val = SafeLoad(values + i); min = Helper::Min(type_length_, min, Helper::Coalesce(val, Helper::DefaultMin())); max = Helper::Max(type_length_, max, Helper::Coalesce(val, Helper::DefaultMax())); } return {min, max}; } std::pair GetMinMaxSpaced(const T* values, int64_t length, const uint8_t* valid_bits, int64_t valid_bits_offset) const override { DCHECK_GT(length, 0); T min = Helper::DefaultMin(); T max = Helper::DefaultMax(); ::arrow20::internal::VisitSetBitRunsVoid( valid_bits, valid_bits_offset, length, [&](int64_t position, int64_t length) { for (int64_t i = 0; i < length; i++) { const auto val = SafeLoad(values + i + position); min = Helper::Min(type_length_, min, Helper::Coalesce(val, Helper::DefaultMin())); max = Helper::Max(type_length_, max, Helper::Coalesce(val, Helper::DefaultMax())); } }); return {min, max}; } std::pair GetMinMax(const ::arrow20::Array& values) const override { ParquetException::NYI(values.type()->ToString()); } private: int type_length_; }; // ARROW-11675: A hand-written version of GetMinMax(), to work around // what looks like a MSVC code generation bug. // This does not seem to be required for GetMinMaxSpaced(). template <> std::pair TypedComparatorImpl::GetMinMax(const int32_t* values, int64_t length) const { DCHECK_GT(length, 0); const uint32_t* unsigned_values = reinterpret_cast(values); uint32_t min = std::numeric_limits::max(); uint32_t max = std::numeric_limits::lowest(); for (int64_t i = 0; i < length; i++) { const auto val = unsigned_values[i]; min = std::min(min, val); max = std::max(max, val); } return {SafeCopy(min), SafeCopy(max)}; } template std::pair GetMinMaxBinaryHelper( const TypedComparatorImpl& comparator, const ::arrow20::Array& values) { using Helper = CompareHelper; ByteArray min = Helper::DefaultMin(); ByteArray max = Helper::DefaultMax(); constexpr int type_length = -1; const auto valid_func = [&](ByteArray val) { min = Helper::Min(type_length, val, min); max = Helper::Max(type_length, val, max); }; const auto null_func = [&]() {}; if (::arrow20::is_binary_like(values.type_id())) { ::arrow20::VisitArraySpanInline<::arrow20::BinaryType>( *values.data(), std::move(valid_func), std::move(null_func)); } else { DCHECK(::arrow20::is_large_binary_like(values.type_id())); ::arrow20::VisitArraySpanInline<::arrow20::LargeBinaryType>( *values.data(), std::move(valid_func), std::move(null_func)); } return {min, max}; } template <> std::pair TypedComparatorImpl::GetMinMax( const ::arrow20::Array& values) const { return GetMinMaxBinaryHelper(*this, values); } template <> std::pair TypedComparatorImpl::GetMinMax( const ::arrow20::Array& values) const { return GetMinMaxBinaryHelper(*this, values); } LogicalType::Type::type LogicalTypeId(const ColumnDescriptor* descr) { if (const auto& logical_type = descr->logical_type()) { return logical_type->type(); } return LogicalType::Type::NONE; } LogicalType::Type::type LogicalTypeId(const Statistics& stats) { return LogicalTypeId(stats.descr()); } template class TypedStatisticsImpl : public TypedStatistics { public: using T = typename DType::c_type; // Create an empty stats. TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool) : descr_(descr), pool_(pool), min_buffer_(AllocateBuffer(pool_, 0)), max_buffer_(AllocateBuffer(pool_, 0)), logical_type_(LogicalTypeId(descr_)) { comparator_ = MakeComparator(descr); TypedStatisticsImpl::Reset(); } // Create stats from provided values. TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count, int64_t distinct_count) : pool_(default_memory_pool()), min_buffer_(AllocateBuffer(pool_, 0)), max_buffer_(AllocateBuffer(pool_, 0)) { TypedStatisticsImpl::IncrementNumValues(num_values); TypedStatisticsImpl::IncrementNullCount(null_count); SetDistinctCount(distinct_count); Copy(min, &min_, min_buffer_.get()); Copy(max, &max_, max_buffer_.get()); has_min_max_ = true; } // Create stats from a thrift Statistics object. TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, int64_t distinct_count, bool has_min_max, bool has_null_count, bool has_distinct_count, MemoryPool* pool) : TypedStatisticsImpl(descr, pool) { TypedStatisticsImpl::IncrementNumValues(num_values); if (has_null_count) { TypedStatisticsImpl::IncrementNullCount(null_count); } else { has_null_count_ = false; } if (has_distinct_count) { SetDistinctCount(distinct_count); } else { has_distinct_count_ = false; } if (!encoded_min.empty()) { PlainDecode(encoded_min, &min_); } if (!encoded_max.empty()) { PlainDecode(encoded_max, &max_); } has_min_max_ = has_min_max; } bool HasDistinctCount() const override { return has_distinct_count_; }; bool HasMinMax() const override { return has_min_max_; } bool HasNullCount() const override { return has_null_count_; }; void IncrementNullCount(int64_t n) override { statistics_.null_count += n; has_null_count_ = true; } void IncrementNumValues(int64_t n) override { num_values_ += n; } static bool IsMeaningfulLogicalType(LogicalType::Type::type type) { switch (type) { case LogicalType::Type::FLOAT16: return true; default: return false; } } bool Equals(const Statistics& raw_other) const override { if (physical_type() != raw_other.physical_type()) return false; const auto other_logical_type = LogicalTypeId(raw_other); // Only compare against logical types that influence the interpretation of the // physical type if (IsMeaningfulLogicalType(logical_type_)) { if (logical_type_ != other_logical_type) return false; } else if (IsMeaningfulLogicalType(other_logical_type)) { return false; } const auto& other = checked_cast(raw_other); if (has_min_max_ != other.has_min_max_) return false; if (has_min_max_) { if (!MinMaxEqual(other)) return false; } return null_count() == other.null_count() && distinct_count() == other.distinct_count() && num_values() == other.num_values(); } bool MinMaxEqual(const TypedStatisticsImpl& other) const; void Reset() override { ResetCounts(); ResetHasFlags(); } void SetMinMax(const T& arg_min, const T& arg_max) override { SetMinMaxPair({arg_min, arg_max}); } void Merge(const TypedStatistics& other) override { this->num_values_ += other.num_values(); // null_count is always valid when merging page statistics into // column chunk statistics. if (other.HasNullCount()) { this->statistics_.null_count += other.null_count(); } else { this->has_null_count_ = false; } if (has_distinct_count_ && other.HasDistinctCount() && (distinct_count() == 0 || other.distinct_count() == 0)) { // We can merge distinct counts if either side is zero. statistics_.distinct_count = std::max(statistics_.distinct_count, other.distinct_count()); } else { // Otherwise clear has_distinct_count_ as distinct count cannot be merged. this->has_distinct_count_ = false; } // Do not clear min/max here if the other side does not provide // min/max which may happen when other is an empty stats or all // its values are null and/or NaN. if (other.HasMinMax()) { SetMinMax(other.min(), other.max()); } } void Update(const T* values, int64_t num_values, int64_t null_count) override; void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_spaced_values, int64_t num_values, int64_t null_count) override; void Update(const ::arrow20::Array& values, bool update_counts) override { if (update_counts) { IncrementNullCount(values.null_count()); IncrementNumValues(values.length() - values.null_count()); } if (values.null_count() == values.length()) { return; } SetMinMaxPair(comparator_->GetMinMax(values)); } const T& min() const override { return min_; } const T& max() const override { return max_; } Type::type physical_type() const override { return descr_->physical_type(); } const ColumnDescriptor* descr() const override { return descr_; } std::string EncodeMin() const override { std::string s; if (HasMinMax()) this->PlainEncode(min_, &s); return s; } std::string EncodeMax() const override { std::string s; if (HasMinMax()) this->PlainEncode(max_, &s); return s; } EncodedStatistics Encode() override { EncodedStatistics s; if (HasMinMax()) { s.set_min(this->EncodeMin()); s.set_max(this->EncodeMax()); } if (HasNullCount()) { s.set_null_count(this->null_count()); // num_values_ is reliable and it means number of non-null values. s.all_null_value = num_values_ == 0; } if (HasDistinctCount()) { s.set_distinct_count(this->distinct_count()); } return s; } int64_t null_count() const override { return statistics_.null_count; } int64_t distinct_count() const override { return statistics_.distinct_count; } int64_t num_values() const override { return num_values_; } private: const ColumnDescriptor* descr_; bool has_min_max_ = false; bool has_null_count_ = false; bool has_distinct_count_ = false; T min_; T max_; ::arrow20::MemoryPool* pool_; // Number of non-null values. // Please note that num_values_ is reliable when has_null_count_ is set. // When has_null_count_ is not set, e.g. a page statistics created from // a statistics thrift message which doesn't have the optional null_count, // `num_values_` may include null values. int64_t num_values_ = 0; EncodedStatistics statistics_; std::shared_ptr> comparator_; std::shared_ptr min_buffer_, max_buffer_; LogicalType::Type::type logical_type_ = LogicalType::Type::NONE; void PlainEncode(const T& src, std::string* dst) const; void PlainDecode(const std::string& src, T* dst) const; void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; } void SetDistinctCount(int64_t n) { // distinct count can only be "set", and cannot be incremented. statistics_.distinct_count = n; has_distinct_count_ = true; } void ResetCounts() { this->statistics_.null_count = 0; this->statistics_.distinct_count = 0; this->num_values_ = 0; } void ResetHasFlags() { // has_min_max_ will only be set when it meets any valid value. this->has_min_max_ = false; // has_distinct_count_ will only be set once SetDistinctCount() // is called because distinct count calculation is not cheap and // disabled by default. this->has_distinct_count_ = false; // Null count calculation is cheap and enabled by default. this->has_null_count_ = true; } void SetMinMaxPair(std::pair min_max) { // CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN auto maybe_min_max = CleanStatistic(min_max, logical_type_); if (!maybe_min_max) return; auto min = maybe_min_max.value().first; auto max = maybe_min_max.value().second; if (!has_min_max_) { has_min_max_ = true; Copy(min, &min_, min_buffer_.get()); Copy(max, &max_, max_buffer_.get()); } else { Copy(comparator_->Compare(min_, min) ? min_ : min, &min_, min_buffer_.get()); Copy(comparator_->Compare(max_, max) ? max : max_, &max_, max_buffer_.get()); } } }; template <> inline bool TypedStatisticsImpl::MinMaxEqual( const TypedStatisticsImpl& other) const { uint32_t len = descr_->type_length(); return std::memcmp(min_.ptr, other.min_.ptr, len) == 0 && std::memcmp(max_.ptr, other.max_.ptr, len) == 0; } template bool TypedStatisticsImpl::MinMaxEqual( const TypedStatisticsImpl& other) const { return min_ == other.min_ && max_ == other.max_; } template <> inline void TypedStatisticsImpl::Copy(const FLBA& src, FLBA* dst, ResizableBuffer* buffer) { if (dst->ptr == src.ptr) return; uint32_t len = descr_->type_length(); PARQUET_THROW_NOT_OK(buffer->Resize(len, false)); std::memcpy(buffer->mutable_data(), src.ptr, len); *dst = FLBA(buffer->data()); } template <> inline void TypedStatisticsImpl::Copy(const ByteArray& src, ByteArray* dst, ResizableBuffer* buffer) { if (dst->ptr == src.ptr) return; PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false)); std::memcpy(buffer->mutable_data(), src.ptr, src.len); *dst = ByteArray(src.len, buffer->data()); } template void TypedStatisticsImpl::Update(const T* values, int64_t num_values, int64_t null_count) { DCHECK_GE(num_values, 0); DCHECK_GE(null_count, 0); IncrementNullCount(null_count); IncrementNumValues(num_values); if (num_values == 0) return; SetMinMaxPair(comparator_->GetMinMax(values, num_values)); } template void TypedStatisticsImpl::UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_spaced_values, int64_t num_values, int64_t null_count) { DCHECK_GE(num_values, 0); DCHECK_GE(null_count, 0); IncrementNullCount(null_count); IncrementNumValues(num_values); if (num_values == 0) return; SetMinMaxPair(comparator_->GetMinMaxSpaced(values, num_spaced_values, valid_bits, valid_bits_offset)); } template void TypedStatisticsImpl::PlainEncode(const T& src, std::string* dst) const { auto encoder = MakeTypedEncoder(Encoding::PLAIN, false, descr_, pool_); encoder->Put(&src, 1); auto buffer = encoder->FlushValues(); auto ptr = reinterpret_cast(buffer->data()); dst->assign(ptr, static_cast(buffer->size())); } template void TypedStatisticsImpl::PlainDecode(const std::string& src, T* dst) const { auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_); decoder->SetData(1, reinterpret_cast(src.c_str()), static_cast(src.size())); decoder->Decode(dst, 1); } template <> void TypedStatisticsImpl::PlainEncode(const T& src, std::string* dst) const { dst->assign(reinterpret_cast(src.ptr), src.len); } template <> void TypedStatisticsImpl::PlainDecode(const std::string& src, T* dst) const { dst->len = static_cast(src.size()); dst->ptr = reinterpret_cast(src.c_str()); } std::shared_ptr DoMakeComparator(Type::type physical_type, LogicalType::Type::type logical_type, SortOrder::type sort_order, int type_length) { if (SortOrder::SIGNED == sort_order) { switch (physical_type) { case Type::BOOLEAN: return std::make_shared>(); case Type::INT32: return std::make_shared>(); case Type::INT64: return std::make_shared>(); case Type::INT96: return std::make_shared>(); case Type::FLOAT: return std::make_shared>(); case Type::DOUBLE: return std::make_shared>(); case Type::BYTE_ARRAY: return std::make_shared>(); case Type::FIXED_LEN_BYTE_ARRAY: if (logical_type == LogicalType::Type::FLOAT16) { return std::make_shared>( type_length); } return std::make_shared>(type_length); default: ParquetException::NYI("Signed Compare not implemented"); } } else if (SortOrder::UNSIGNED == sort_order) { switch (physical_type) { case Type::INT32: return std::make_shared>(); case Type::INT64: return std::make_shared>(); case Type::INT96: return std::make_shared>(); case Type::BYTE_ARRAY: return std::make_shared>(); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_shared>(type_length); default: ParquetException::NYI("Unsigned Compare not implemented"); } } else { throw ParquetException("UNKNOWN Sort Order"); } return nullptr; } } // namespace // ---------------------------------------------------------------------- // Public factory functions std::shared_ptr Comparator::Make(Type::type physical_type, SortOrder::type sort_order, int type_length) { return DoMakeComparator(physical_type, LogicalType::Type::NONE, sort_order, type_length); } std::shared_ptr Comparator::Make(const ColumnDescriptor* descr) { return DoMakeComparator(descr->physical_type(), LogicalTypeId(descr), descr->sort_order(), descr->type_length()); } std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, ::arrow20::MemoryPool* pool) { switch (descr->physical_type()) { case Type::BOOLEAN: return std::make_shared>(descr, pool); case Type::INT32: return std::make_shared>(descr, pool); case Type::INT64: return std::make_shared>(descr, pool); case Type::FLOAT: return std::make_shared>(descr, pool); case Type::DOUBLE: return std::make_shared>(descr, pool); case Type::BYTE_ARRAY: return std::make_shared>(descr, pool); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_shared>(descr, pool); default: ParquetException::NYI("Statistics not implemented"); } } std::shared_ptr Statistics::Make(Type::type physical_type, const void* min, const void* max, int64_t num_values, int64_t null_count, int64_t distinct_count) { #define MAKE_STATS(CAP_TYPE, KLASS) \ case Type::CAP_TYPE: \ return std::make_shared>( \ *reinterpret_cast(min), \ *reinterpret_cast(max), num_values, null_count, \ distinct_count) switch (physical_type) { MAKE_STATS(BOOLEAN, BooleanType); MAKE_STATS(INT32, Int32Type); MAKE_STATS(INT64, Int64Type); MAKE_STATS(FLOAT, FloatType); MAKE_STATS(DOUBLE, DoubleType); MAKE_STATS(BYTE_ARRAY, ByteArrayType); MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType); default: break; } #undef MAKE_STATS DCHECK(false) << "Cannot reach here"; return nullptr; } std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, const EncodedStatistics* encoded_stats, int64_t num_values, ::arrow20::MemoryPool* pool) { DCHECK(encoded_stats != nullptr); return Make(descr, encoded_stats->min(), encoded_stats->max(), num_values, encoded_stats->null_count, encoded_stats->distinct_count, encoded_stats->has_min && encoded_stats->has_max, encoded_stats->has_null_count, encoded_stats->has_distinct_count, pool); } std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, int64_t distinct_count, bool has_min_max, bool has_null_count, bool has_distinct_count, ::arrow20::MemoryPool* pool) { #define MAKE_STATS(CAP_TYPE, KLASS) \ case Type::CAP_TYPE: \ return std::make_shared>( \ descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \ has_min_max, has_null_count, has_distinct_count, pool) switch (descr->physical_type()) { MAKE_STATS(BOOLEAN, BooleanType); MAKE_STATS(INT32, Int32Type); MAKE_STATS(INT64, Int64Type); MAKE_STATS(FLOAT, FloatType); MAKE_STATS(DOUBLE, DoubleType); MAKE_STATS(BYTE_ARRAY, ByteArrayType); MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType); default: break; } #undef MAKE_STATS DCHECK(false) << "Cannot reach here"; return nullptr; } } // namespace parquet20