// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "contrib/libs/apache/arrow_next/cpp/src/arrow/scalar.h" #include #include #include #include #include #include "contrib/libs/apache/arrow_next/cpp/src/arrow/array.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/array/util.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/buffer.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/compare.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/pretty_print.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/type.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/bitmap_ops.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/checked_cast.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/decimal.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/formatting.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/hashing.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/logging.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/time.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/unreachable.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/utf8.h" #include 
"contrib/libs/apache/arrow_next/cpp/src/arrow/util/value_parsing.h" #include "contrib/libs/apache/arrow_next/cpp/src/arrow/visit_scalar_inline.h" namespace arrow20 { using internal::checked_cast; using internal::checked_pointer_cast; bool Scalar::Equals(const Scalar& other, const EqualOptions& options) const { return ScalarEquals(*this, other, options); } bool Scalar::ApproxEquals(const Scalar& other, const EqualOptions& options) const { return ScalarApproxEquals(*this, other, options); } Status Scalar::Accept(ScalarVisitor* visitor) const { return VisitScalarInline(*this, visitor); } namespace { // Implementation of Scalar::hash() struct ScalarHashImpl { Status Visit(const NullScalar& s) { return Status::OK(); } template Status Visit(const internal::PrimitiveScalar& s) { return ValueHash(s); } Status Visit(const BaseBinaryScalar& s) { return BufferHash(*s.value); } template Status Visit(const TemporalScalar& s) { return ValueHash(s); } Status Visit(const DayTimeIntervalScalar& s) { return StdHash(s.value.days) & StdHash(s.value.milliseconds); } Status Visit(const MonthDayNanoIntervalScalar& s) { return StdHash(s.value.days) & StdHash(s.value.months) & StdHash(s.value.nanoseconds); } Status Visit(const Decimal32Scalar& s) { return StdHash(s.value.value()); } Status Visit(const Decimal64Scalar& s) { return StdHash(s.value.value()); } Status Visit(const Decimal128Scalar& s) { return StdHash(s.value.low_bits()) & StdHash(s.value.high_bits()); } Status Visit(const Decimal256Scalar& s) { Status status = Status::OK(); // endianness doesn't affect result for (uint64_t elem : s.value.native_endian_array()) { status &= StdHash(elem); } return status; } Status Visit(const BaseListScalar& s) { return ArrayHash(*s.value); } Status Visit(const StructScalar& s) { for (const auto& child : s.value) { AccumulateHashFrom(*child); } return Status::OK(); } Status Visit(const DictionaryScalar& s) { AccumulateHashFrom(*s.value.index); return Status::OK(); } Status Visit(const 
DenseUnionScalar& s) { // type_code is ignored when comparing for equality, so do not hash it either AccumulateHashFrom(*s.value); return Status::OK(); } Status Visit(const SparseUnionScalar& s) { // type_code is ignored when comparing for equality, so do not hash it either AccumulateHashFrom(*s.value[s.child_id]); return Status::OK(); } Status Visit(const RunEndEncodedScalar& s) { AccumulateHashFrom(*s.value); return Status::OK(); } Status Visit(const ExtensionScalar& s) { AccumulateHashFrom(*s.value); return Status::OK(); } template Status StdHash(const T& t) { static std::hash hash; hash_ ^= hash(t); return Status::OK(); } template Status ValueHash(const S& s) { return StdHash(s.value); } Status BufferHash(const Buffer& b) { hash_ ^= internal::ComputeStringHash<1>(b.data(), b.size()); return Status::OK(); } Status ArrayHash(const Array& a) { return ArrayHash(*a.data()); } Status ArrayHash(const ArraySpan& a, int64_t offset, int64_t length) { // Calculate null count within the range const auto* validity = a.buffers[0].data; int64_t null_count = 0; if (validity != NULLPTR) { if (offset == a.offset && length == a.length) { null_count = a.GetNullCount(); } else { null_count = length - internal::CountSetBits(validity, offset, length); } } RETURN_NOT_OK(StdHash(length) & StdHash(null_count)); if (null_count != 0) { // We can't visit values without unboxing the whole array, so only hash // the null bitmap for now. Only hash the null bitmap if the null count // is not 0 to ensure hash consistency. hash_ = internal::ComputeBitmapHash(validity, /*seed=*/hash_, /*bits_offset=*/offset, /*num_bits=*/length); } // Hash the relevant child arrays for each type taking offset and length // from the parent array into account if necessary. switch (a.type->id()) { case Type::STRUCT: for (const auto& child : a.child_data) { RETURN_NOT_OK(ArrayHash(child, offset, length)); } break; // TODO(GH-35830): Investigate what should be the correct behavior for // each nested type. 
default: // By default, just hash the arrays without considering // the offset and length of the parent. for (const auto& child : a.child_data) { RETURN_NOT_OK(ArrayHash(child)); } break; } return Status::OK(); } Status ArrayHash(const ArraySpan& a) { return ArrayHash(a, a.offset, a.length); } explicit ScalarHashImpl(const Scalar& scalar) : hash_(scalar.type->Hash()) { AccumulateHashFrom(scalar); } void AccumulateHashFrom(const Scalar& scalar) { // Note we already injected the type in ScalarHashImpl::ScalarHashImpl if (scalar.is_valid) { DCHECK_OK(VisitScalarInline(scalar, this)); } } size_t hash_; }; struct ScalarBoundsCheckImpl { int64_t min_value; int64_t max_value; int64_t actual_value = -1; bool ok = true; ScalarBoundsCheckImpl(int64_t min_value, int64_t max_value) : min_value(min_value), max_value(max_value) {} Status Visit(const Scalar&) { Unreachable(); return Status::NotImplemented(""); } template enable_if_integer Visit(const ScalarType& scalar) { actual_value = static_cast(scalar.value); ok = (actual_value >= min_value && actual_value <= max_value); return Status::OK(); } }; // Implementation of Scalar::Validate() and Scalar::ValidateFull() struct ScalarValidateImpl { const bool full_validation_; explicit ScalarValidateImpl(bool full_validation) : full_validation_(full_validation) { ::arrow20::util::InitializeUTF8(); } Status Validate(const Scalar& scalar) { if (!scalar.type) { return Status::Invalid("scalar lacks a type"); } return VisitScalarInline(scalar, this); } Status Visit(const NullScalar& s) { if (s.is_valid) { return Status::Invalid("null scalar should have is_valid = false"); } return Status::OK(); } template Status Visit(const internal::PrimitiveScalar& s) { return Status::OK(); } Status Visit(const BaseBinaryScalar& s) { return ValidateBinaryScalar(s); } Status Visit(const StringScalar& s) { return ValidateStringScalar(s); } Status Visit(const BinaryViewScalar& s) { return ValidateBinaryScalar(s); } Status Visit(const StringViewScalar& s) { 
return ValidateStringScalar(s); } Status Visit(const LargeBinaryScalar& s) { return ValidateBinaryScalar(s); } Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); } template Status CheckValueNotNull(const ScalarType& s) { if (!s.value) { return Status::Invalid(s.type->ToString(), " value is null"); } return Status::OK(); } Status Visit(const FixedSizeBinaryScalar& s) { const auto& byte_width = checked_cast(*s.type).byte_width(); RETURN_NOT_OK(CheckValueNotNull(s)); if (s.value->size() != byte_width) { return Status::Invalid(s.type->ToString(), " scalar should have a value of size ", byte_width, ", got ", s.value->size()); } return Status::OK(); } Status Visit(const Decimal32Scalar& s) { const auto& ty = checked_cast(*s.type); if (!s.value.FitsInPrecision(ty.precision())) { return Status::Invalid("Decimal value ", s.value.ToIntegerString(), " does not fit in precision of ", ty); } return Status::OK(); } Status Visit(const Decimal64Scalar& s) { const auto& ty = checked_cast(*s.type); if (!s.value.FitsInPrecision(ty.precision())) { return Status::Invalid("Decimal value ", s.value.ToIntegerString(), " does not fit in precision of ", ty); } return Status::OK(); } Status Visit(const Decimal128Scalar& s) { const auto& ty = checked_cast(*s.type); if (!s.value.FitsInPrecision(ty.precision())) { return Status::Invalid("Decimal value ", s.value.ToIntegerString(), " does not fit in precision of ", ty); } return Status::OK(); } Status Visit(const Decimal256Scalar& s) { const auto& ty = checked_cast(*s.type); if (!s.value.FitsInPrecision(ty.precision())) { return Status::Invalid("Decimal value ", s.value.ToIntegerString(), " does not fit in precision of ", ty); } return Status::OK(); } Status Visit(const BaseListScalar& s) { RETURN_NOT_OK(CheckValueNotNull(s)); const auto st = full_validation_ ? 
s.value->ValidateFull() : s.value->Validate(); if (!st.ok()) { return st.WithMessage(s.type->ToString(), " scalar fails validation for value: ", st.message()); } const auto& list_type = checked_cast(*s.type); const auto& value_type = *list_type.value_type(); if (!s.value->type()->Equals(value_type)) { return Status::Invalid(list_type.ToString(), " scalar should have a value of type ", value_type.ToString(), ", got ", s.value->type()->ToString()); } return Status::OK(); } Status Visit(const FixedSizeListScalar& s) { RETURN_NOT_OK(Visit(static_cast(s))); const auto& list_type = checked_cast(*s.type); if (s.value->length() != list_type.list_size()) { return Status::Invalid(s.type->ToString(), " scalar should have a child value of length ", list_type.list_size(), ", got ", s.value->length()); } return Status::OK(); } Status Visit(const StructScalar& s) { const int num_fields = s.type->num_fields(); const auto& fields = s.type->fields(); if (fields.size() != s.value.size()) { return Status::Invalid("non-null ", s.type->ToString(), " scalar should have ", num_fields, " child values, got ", s.value.size()); } for (int i = 0; i < num_fields; ++i) { const auto st = Validate(*s.value[i]); if (!st.ok()) { return st.WithMessage(s.type->ToString(), " scalar fails validation for child at index ", i, ": ", st.message()); } if (!s.value[i]->type->Equals(*fields[i]->type())) { return Status::Invalid( s.type->ToString(), " scalar should have a child value of type ", fields[i]->type()->ToString(), "at index ", i, ", got ", s.value[i]->type); } } return Status::OK(); } Status Visit(const DictionaryScalar& s) { const auto& dict_type = checked_cast(*s.type); // Validate index if (!s.value.index) { return Status::Invalid(s.type->ToString(), " scalar doesn't have an index value"); } { const auto st = Validate(*s.value.index); if (!st.ok()) { return st.WithMessage(s.type->ToString(), " scalar fails validation for index value: ", st.message()); } } if 
(!s.value.index->type->Equals(*dict_type.index_type())) { return Status::Invalid( s.type->ToString(), " scalar should have an index value of type ", dict_type.index_type()->ToString(), ", got ", s.value.index->type->ToString()); } if (s.is_valid && !s.value.index->is_valid) { return Status::Invalid("non-null ", s.type->ToString(), " scalar has null index value"); } if (!s.is_valid && s.value.index->is_valid) { return Status::Invalid("null ", s.type->ToString(), " scalar has non-null index value"); } // Validate dictionary if (!s.value.dictionary) { return Status::Invalid(s.type->ToString(), " scalar doesn't have a dictionary value"); } { const auto st = full_validation_ ? s.value.dictionary->ValidateFull() : s.value.dictionary->Validate(); if (!st.ok()) { return st.WithMessage( s.type->ToString(), " scalar fails validation for dictionary value: ", st.message()); } } if (!s.value.dictionary->type()->Equals(*dict_type.value_type())) { return Status::Invalid(s.type->ToString(), " scalar should have a dictionary value of type ", dict_type.value_type()->ToString(), ", got ", s.value.dictionary->type()->ToString()); } // Check index is in bounds if (full_validation_ && s.value.index->is_valid) { ScalarBoundsCheckImpl bounds_checker{0, s.value.dictionary->length() - 1}; RETURN_NOT_OK(VisitScalarInline(*s.value.index, &bounds_checker)); if (!bounds_checker.ok) { return Status::Invalid(s.type->ToString(), " scalar index value out of bounds: ", bounds_checker.actual_value); } } return Status::OK(); } Status ValidateValue(const Scalar& s, const Scalar& value) { const auto st = Validate(value); if (!st.ok()) { return st.WithMessage( s.type->ToString(), " scalar fails validation for underlying value: ", st.message()); } return Status::OK(); } Status ValidateDenseUnion(const DenseUnionScalar& s, int child_id) { const auto& union_type = checked_cast(*s.type); const auto& field_type = *union_type.field(child_id)->type(); if (!field_type.Equals(*s.value->type)) { return 
Status::Invalid(s.type->ToString(), " scalar with type code ", s.type_code, " should have an underlying value of type ", field_type.ToString(), ", got ", s.value->type->ToString()); } return ValidateValue(s, *s.value); } Status ValidateSparseUnion(const SparseUnionScalar& s) { const auto& union_type = checked_cast(*s.type); if (union_type.num_fields() != static_cast(s.value.size())) { return Status::Invalid("Sparse union scalar value had ", union_type.num_fields(), " fields but type has ", s.value.size(), " fields."); } for (int j = 0; j < union_type.num_fields(); ++j) { const auto& field_type = *union_type.field(j)->type(); const Scalar& field_value = *s.value[j]; if (!field_type.Equals(*field_value.type)) { return Status::Invalid(s.type->ToString(), " value for field ", union_type.field(j)->ToString(), " had incorrect type of ", field_value.type->ToString()); } RETURN_NOT_OK(ValidateValue(s, field_value)); } return Status::OK(); } Status Visit(const UnionScalar& s) { const int type_code = s.type_code; // avoid 8-bit int types for printing const auto& union_type = checked_cast(*s.type); const auto& child_ids = union_type.child_ids(); if (type_code < 0 || type_code >= static_cast(child_ids.size()) || child_ids[type_code] == UnionType::kInvalidChildId) { return Status::Invalid(s.type->ToString(), " scalar has invalid type code ", type_code); } if (union_type.id() == Type::DENSE_UNION) { return ValidateDenseUnion(checked_cast(s), child_ids[type_code]); } else { return ValidateSparseUnion(checked_cast(s)); } } Status Visit(const RunEndEncodedScalar& s) { const auto& ree_type = checked_cast(*s.type); if (!s.value) { return Status::Invalid(s.type->ToString(), " scalar doesn't have storage value"); } if (!s.is_valid && s.value->is_valid) { return Status::Invalid("null ", s.type->ToString(), " scalar has non-null storage value"); } if (s.is_valid && !s.value->is_valid) { return Status::Invalid("non-null ", s.type->ToString(), " scalar has null storage value"); } if 
(!ree_type.value_type()->Equals(*s.value->type)) { return Status::Invalid( ree_type.ToString(), " scalar should have an underlying value of type ", ree_type.value_type()->ToString(), ", got ", s.value->type->ToString()); } return ValidateValue(s, *s.value); } Status Visit(const ExtensionScalar& s) { if (!s.value) { return Status::Invalid(s.type->ToString(), " scalar doesn't have storage value"); } if (!s.is_valid && s.value->is_valid) { return Status::Invalid("null ", s.type->ToString(), " scalar has non-null storage value"); } if (s.is_valid && !s.value->is_valid) { return Status::Invalid("non-null ", s.type->ToString(), " scalar has null storage value"); } const auto st = Validate(*s.value); if (!st.ok()) { return st.WithMessage(s.type->ToString(), " scalar fails validation for storage value: ", st.message()); } return Status::OK(); } Status ValidateStringScalar(const BaseBinaryScalar& s) { RETURN_NOT_OK(ValidateBinaryScalar(s)); if (s.is_valid && full_validation_) { if (!::arrow20::util::ValidateUTF8(s.value->data(), s.value->size())) { return Status::Invalid(s.type->ToString(), " scalar contains invalid UTF8 data"); } } return Status::OK(); } Status ValidateBinaryScalar(const BaseBinaryScalar& s) { if (s.is_valid && !s.value) { return Status::Invalid(s.type->ToString(), " scalar is marked valid but doesn't have a value"); } if (!s.is_valid && s.value) { return Status::Invalid(s.type->ToString(), " scalar is marked null but has a value"); } return Status::OK(); } }; template void FillScalarScratchSpace(void* scratch_space, T const (&arr)[N]) { static_assert(sizeof(arr) <= internal::kScalarScratchSpaceSize); std::memcpy(scratch_space, arr, sizeof(arr)); } } // namespace size_t Scalar::hash() const { return ScalarHashImpl(*this).hash_; } Status Scalar::Validate() const { return ScalarValidateImpl(/*full_validation=*/false).Validate(*this); } Status Scalar::ValidateFull() const { return ScalarValidateImpl(/*full_validation=*/true).Validate(*this); } 
BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr type) : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {} void BinaryScalar::FillScratchSpace(uint8_t* scratch_space, const std::shared_ptr& value) { FillScalarScratchSpace( scratch_space, {int32_t(0), value ? static_cast(value->size()) : int32_t(0)}); } void BinaryViewScalar::FillScratchSpace(uint8_t* scratch_space, const std::shared_ptr& value) { static_assert(sizeof(BinaryViewType::c_type) <= internal::kScalarScratchSpaceSize); auto* view = new (scratch_space) BinaryViewType::c_type; if (value) { *view = util::ToBinaryView(std::string_view{*value}, 0, 0); } else { *view = {}; } } void LargeBinaryScalar::FillScratchSpace(uint8_t* scratch_space, const std::shared_ptr& value) { FillScalarScratchSpace( scratch_space, {int64_t(0), value ? static_cast(value->size()) : int64_t(0)}); } FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr value, std::shared_ptr type, bool is_valid) : BinaryScalar(std::move(value), std::move(type)) { ARROW_CHECK_EQ(checked_cast(*this->type).byte_width(), this->value->size()); this->is_valid = is_valid; } FixedSizeBinaryScalar::FixedSizeBinaryScalar(const std::shared_ptr& value, bool is_valid) : BinaryScalar(value, fixed_size_binary(static_cast(value->size()))) { this->is_valid = is_valid; } FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::string s, bool is_valid) : FixedSizeBinaryScalar(Buffer::FromString(std::move(s)), is_valid) {} BaseListScalar::BaseListScalar(std::shared_ptr value, std::shared_ptr type, bool is_valid) : Scalar{std::move(type), is_valid}, value(std::move(value)) { if (this->value) { ARROW_CHECK(this->type->field(0)->type()->Equals(this->value->type())); } } ListScalar::ListScalar(std::shared_ptr value, bool is_valid) : ListScalar(value, list(value->type()), is_valid) {} void ListScalar::FillScratchSpace(uint8_t* scratch_space, const std::shared_ptr& value) { FillScalarScratchSpace( scratch_space, {int32_t(0), value ? 
static_cast(value->length()) : int32_t(0)}); } LargeListScalar::LargeListScalar(std::shared_ptr value, bool is_valid) : LargeListScalar(value, large_list(value->type()), is_valid) {} void LargeListScalar::FillScratchSpace(uint8_t* scratch_space, const std::shared_ptr& value) { FillScalarScratchSpace(scratch_space, {int64_t(0), value ? value->length() : int64_t(0)}); } ListViewScalar::ListViewScalar(std::shared_ptr value, bool is_valid) : ListViewScalar(value, list_view(value->type()), is_valid) {} void ListViewScalar::FillScratchSpace(uint8_t* scratch_space, const std::shared_ptr& value) { FillScalarScratchSpace( scratch_space, {int32_t(0), value ? static_cast(value->length()) : int32_t(0)}); } LargeListViewScalar::LargeListViewScalar(std::shared_ptr value, bool is_valid) : LargeListViewScalar(value, large_list_view(value->type()), is_valid) {} void LargeListViewScalar::FillScratchSpace(uint8_t* scratch_space, const std::shared_ptr& value) { FillScalarScratchSpace(scratch_space, {int64_t(0), value ? value->length() : int64_t(0)}); } inline std::shared_ptr MakeMapType(const std::shared_ptr& pair_type) { ARROW_CHECK_EQ(pair_type->id(), Type::STRUCT); ARROW_CHECK_EQ(pair_type->num_fields(), 2); return map(pair_type->field(0)->type(), pair_type->field(1)->type()); } MapScalar::MapScalar(std::shared_ptr value, bool is_valid) : MapScalar(value, MakeMapType(value->type()), is_valid) {} void MapScalar::FillScratchSpace(uint8_t* scratch_space, const std::shared_ptr& value) { FillScalarScratchSpace( scratch_space, {int32_t(0), value ? 
static_cast(value->length()) : int32_t(0)}); } FixedSizeListScalar::FixedSizeListScalar(std::shared_ptr value, std::shared_ptr type, bool is_valid) : BaseListScalar(std::move(value), std::move(type), is_valid) { if (this->value) { ARROW_CHECK_EQ(this->value->length(), checked_cast(*this->type).list_size()); } } FixedSizeListScalar::FixedSizeListScalar(std::shared_ptr value, bool is_valid) : BaseListScalar( value, fixed_size_list(value->type(), static_cast(value->length())), is_valid) {} Result> StructScalar::Make( ScalarVector values, std::vector field_names) { if (values.size() != field_names.size()) { return Status::Invalid("Mismatching number of field names and child scalars"); } FieldVector fields(field_names.size()); for (size_t i = 0; i < fields.size(); ++i) { fields[i] = arrow20::field(std::move(field_names[i]), values[i]->type); } return std::make_shared(std::move(values), struct_(std::move(fields))); } Result> StructScalar::field(FieldRef ref) const { ARROW_ASSIGN_OR_RAISE(auto path, ref.FindOne(*type)); if (path.indices().size() != 1) { return Status::NotImplemented("retrieval of nested fields from StructScalar"); } auto index = path.indices()[0]; if (is_valid) { return value[index]; } else { const auto& struct_type = checked_cast(*this->type); const auto& field_type = struct_type.field(index)->type(); return MakeNullScalar(field_type); } } RunEndEncodedScalar::RunEndEncodedScalar(std::shared_ptr value, std::shared_ptr type) : Scalar{std::move(type), value->is_valid}, ArraySpanFillFromScalarScratchSpace(*this->type), value{std::move(value)} { ARROW_CHECK_EQ(this->type->id(), Type::RUN_END_ENCODED); } RunEndEncodedScalar::RunEndEncodedScalar(const std::shared_ptr& type) : RunEndEncodedScalar( MakeNullScalar(checked_cast(*type).value_type()), type) {} RunEndEncodedScalar::~RunEndEncodedScalar() = default; void RunEndEncodedScalar::FillScratchSpace(uint8_t* scratch_space, const DataType& type) { Type::type run_end = checked_cast(type).run_end_type()->id(); 
switch (run_end) { case Type::INT16: FillScalarScratchSpace(scratch_space, {int16_t(1)}); break; case Type::INT32: FillScalarScratchSpace(scratch_space, {int32_t(1)}); break; default: DCHECK_EQ(run_end, Type::INT64); FillScalarScratchSpace(scratch_space, {int64_t(1)}); } } DictionaryScalar::DictionaryScalar(std::shared_ptr type) : internal::PrimitiveScalarBase(std::move(type)), value{MakeNullScalar(checked_cast(*this->type).index_type()), MakeArrayOfNull(checked_cast(*this->type).value_type(), 0) .ValueOrDie()} {} Result> DictionaryScalar::GetEncodedValue() const { const auto& dict_type = checked_cast(*type); if (!is_valid) { return MakeNullScalar(dict_type.value_type()); } int64_t index_value = 0; switch (dict_type.index_type()->id()) { case Type::UINT8: index_value = static_cast(checked_cast(*value.index).value); break; case Type::INT8: index_value = static_cast(checked_cast(*value.index).value); break; case Type::UINT16: index_value = static_cast(checked_cast(*value.index).value); break; case Type::INT16: index_value = static_cast(checked_cast(*value.index).value); break; case Type::UINT32: index_value = static_cast(checked_cast(*value.index).value); break; case Type::INT32: index_value = static_cast(checked_cast(*value.index).value); break; case Type::UINT64: index_value = static_cast(checked_cast(*value.index).value); break; case Type::INT64: index_value = static_cast(checked_cast(*value.index).value); break; default: return Status::TypeError("Not implemented dictionary index type"); break; } return value.dictionary->GetScalar(index_value); } std::shared_ptr DictionaryScalar::Make(std::shared_ptr index, std::shared_ptr dict) { auto type = dictionary(index->type, dict->type()); auto is_valid = index->is_valid; return std::make_shared(ValueType{std::move(index), std::move(dict)}, std::move(type), is_valid); } Result TimestampScalar::FromISO8601(std::string_view iso8601, TimeUnit::type unit) { ValueType value; if (internal::ParseTimestampISO8601(iso8601.data(), 
iso8601.size(), unit, &value)) { return TimestampScalar{value, timestamp(unit)}; } return Status::Invalid("Couldn't parse ", iso8601, " as a timestamp"); } SparseUnionScalar::SparseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr type) : UnionScalar(std::move(type), type_code, /*is_valid=*/true), ArraySpanFillFromScalarScratchSpace(type_code), value(std::move(value)) { const auto child_ids = checked_cast(*this->type).child_ids(); if (type_code >= 0 && static_cast(type_code) < child_ids.size() && child_ids[type_code] != UnionType::kInvalidChildId) { this->child_id = child_ids[type_code]; // Fix nullness based on whether the selected child is null this->is_valid = this->value[this->child_id]->is_valid; } } std::shared_ptr SparseUnionScalar::FromValue(std::shared_ptr value, int field_index, std::shared_ptr type) { const auto& union_type = checked_cast(*type); int8_t type_code = union_type.type_codes()[field_index]; ScalarVector field_values; for (int i = 0; i < type->num_fields(); ++i) { if (i == field_index) { field_values.emplace_back(std::move(value)); } else { field_values.emplace_back(MakeNullScalar(type->field(i)->type())); } } return std::make_shared(field_values, type_code, std::move(type)); } void SparseUnionScalar::FillScratchSpace(uint8_t* scratch_space, int8_t type_code) { auto* union_scratch_space = reinterpret_cast(scratch_space); union_scratch_space->type_code = type_code; } void DenseUnionScalar::FillScratchSpace(uint8_t* scratch_space, int8_t type_code) { auto* union_scratch_space = reinterpret_cast(scratch_space); union_scratch_space->type_code = type_code; FillScalarScratchSpace(union_scratch_space->offsets, {int32_t(0), int32_t(1)}); } namespace { template using scalar_constructor_has_arrow_type = std::is_constructible::ScalarType, std::shared_ptr>; template using enable_if_scalar_constructor_has_arrow_type = typename std::enable_if::value, R>::type; template using enable_if_scalar_constructor_has_no_arrow_type = typename 
std::enable_if::value, R>::type; struct MakeNullImpl { template ::ScalarType> enable_if_scalar_constructor_has_arrow_type Visit(const T&) { out_ = std::make_shared(type_); return Status::OK(); } template ::ScalarType> enable_if_scalar_constructor_has_no_arrow_type Visit(const T&) { out_ = std::make_shared(); return Status::OK(); } Status Visit(const FixedSizeBinaryType& type) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, AllocateBuffer(type.byte_width())); // Avoid exposing past memory contents memset(value->mutable_data(), 0, value->size()); out_ = std::make_shared(std::move(value), type_, /*is_valid=*/false); return Status::OK(); } template ::ScalarType> Status VisitListLike(const T& type, int64_t list_size = 0) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, MakeArrayOfNull(type.value_type(), list_size)); out_ = std::make_shared(std::move(value), type_, /*is_valid=*/false); return Status::OK(); } Status Visit(const ListType& type) { return VisitListLike(type); } Status Visit(const LargeListType& type) { return VisitListLike(type); } Status Visit(const MapType& type) { return VisitListLike(type); } Status Visit(const ListViewType& type) { return VisitListLike(type); } Status Visit(const LargeListViewType& type) { return VisitListLike(type); } Status Visit(const FixedSizeListType& type) { return VisitListLike(type, type.list_size()); } Status Visit(const StructType& type) { ScalarVector field_values; for (int i = 0; i < type.num_fields(); ++i) { field_values.push_back(MakeNullScalar(type.field(i)->type())); } out_ = std::make_shared(std::move(field_values), type_, /*is_valid=*/false); return Status::OK(); } Status Visit(const SparseUnionType& type) { if (type.num_fields() == 0) { return Status::Invalid("Cannot make scalar of empty union type"); } ScalarVector field_values; for (int i = 0; i < type.num_fields(); ++i) { field_values.emplace_back(MakeNullScalar(type.field(i)->type())); } out_ = std::make_shared(std::move(field_values), type.type_codes()[0], type_); 
return Status::OK(); } Status Visit(const DenseUnionType& type) { if (type.num_fields() == 0) { return Status::Invalid("Cannot make scalar of empty union type"); } out_ = std::make_shared(MakeNullScalar(type.field(0)->type()), type.type_codes()[0], type_); return Status::OK(); } Status Visit(const RunEndEncodedType& type) { out_ = std::make_shared(type_); return Status::OK(); } Status Visit(const ExtensionType& type) { out_ = std::make_shared(MakeNullScalar(type.storage_type()), type_, /*is_valid=*/false); return Status::OK(); } std::shared_ptr Finish() && { // Should not fail. DCHECK_OK(VisitTypeInline(*type_, this)); return std::move(out_); } std::shared_ptr type_; std::shared_ptr out_; }; } // namespace std::shared_ptr MakeNullScalar(std::shared_ptr type) { return MakeNullImpl{std::move(type), nullptr}.Finish(); } std::string Scalar::ToString() const { if (!this->is_valid) { return "null"; } if (type->id() == Type::DICTIONARY) { auto dict_scalar = checked_cast(this); return dict_scalar->value.dictionary->ToString() + "[" + dict_scalar->value.index->ToString() + "]"; } auto maybe_repr = CastTo(utf8()); if (maybe_repr.ok()) { return checked_cast(*maybe_repr.ValueOrDie()).value->ToString(); } std::string result; std::shared_ptr as_array = *MakeArrayFromScalar(*this, 1); DCHECK_OK(PrettyPrint(*as_array, PrettyPrintOptions::Defaults(), &result)); return result; } struct ScalarParseImpl { template > Status Visit(const T& t) { typename internal::StringConverter::value_type value; if (!internal::ParseValue(t, s_.data(), s_.size(), &value)) { return Status::Invalid("error parsing '", s_, "' as scalar of type ", t); } return Finish(value); } Status Visit(const BinaryType&) { return FinishWithBuffer(); } Status Visit(const LargeBinaryType&) { return FinishWithBuffer(); } Status Visit(const FixedSizeBinaryType&) { return FinishWithBuffer(); } Status Visit(const DictionaryType& t) { ARROW_ASSIGN_OR_RAISE(auto value, Scalar::Parse(t.value_type(), s_)); return 
Finish(std::move(value)); } Status Visit(const DataType& t) { return Status::NotImplemented("parsing scalars of type ", t); } template Status Finish(Arg&& arg) { return MakeScalar(std::move(type_), std::forward(arg)).Value(&out_); } Status FinishWithBuffer() { return Finish(Buffer::FromString(std::string(s_))); } Result> Finish() && { RETURN_NOT_OK(VisitTypeInline(*type_, this)); return std::move(out_); } ScalarParseImpl(std::shared_ptr type, std::string_view s) : type_(std::move(type)), s_(s) {} std::shared_ptr type_; std::string_view s_; std::shared_ptr out_; }; Result> Scalar::Parse(const std::shared_ptr& type, std::string_view s) { return ScalarParseImpl{type, s}.Finish(); } namespace internal { Status CheckBufferLength(const FixedSizeBinaryType* t, const std::shared_ptr* b) { return t->byte_width() == (*b)->size() ? Status::OK() : Status::Invalid("buffer length ", (*b)->size(), " is not compatible with ", *t); } } // namespace internal namespace { // CastImpl(...) assumes `to` points to a non null scalar of the correct type with // uninitialized value // helper for StringFormatter template std::shared_ptr FormatToBuffer(Formatter&& formatter, const ScalarType& from) { if (!from.is_valid) { return Buffer::FromString("null"); } return formatter( from.value, [&](std::string_view v) { return Buffer::FromString(std::string(v)); }); } // error fallback template Result> CastImpl(const Scalar& from, std::shared_ptr to_type) { return Status::NotImplemented("casting scalars of type ", *from.type, " to type ", *to_type); } // numeric to numeric template enable_if_number>> CastImpl( const NumericScalar& from, std::shared_ptr to_type) { using ToScalar = typename TypeTraits::ScalarType; return std::make_shared(static_cast(from.value), std::move(to_type)); } // numeric to boolean template enable_if_boolean>> CastImpl( const NumericScalar& from, std::shared_ptr to_type) { constexpr auto zero = static_cast(0); return std::make_shared(from.value != zero, std::move(to_type)); } 
// boolean to numeric
// Converts the boolean value with a static_cast and wraps it in a new ToScalar.
template enable_if_number>> CastImpl(
    const BooleanScalar& from, std::shared_ptr to_type) {
  using ToScalar = typename TypeTraits::ScalarType;
  return std::make_shared(static_cast(from.value), std::move(to_type));
}

// numeric to temporal
// The enable_if condition limits which destination types select this
// overload. The value itself is carried over with a plain static_cast; no
// unit conversion is applied here.
template
typename std::enable_if::value &&
                            !std::is_same::value &&
                            !std::is_same::value,
                        Result>>::type
CastImpl(const NumericScalar& from, std::shared_ptr to_type) {
  using ToScalar = typename TypeTraits::ScalarType;
  return std::make_shared(static_cast(from.value), std::move(to_type));
}

// temporal to numeric
// Mirror image of the overload above: the stored value is static_cast to the
// destination representation without any unit conversion.
template
typename std::enable_if::value &&
                            std::is_base_of::value &&
                            !std::is_same::value &&
                            !std::is_same::value,
                        Result>>::type
CastImpl(const TemporalScalar& from, std::shared_ptr to_type) {
  using ToScalar = typename TypeTraits::ScalarType;
  return std::make_shared(static_cast(from.value), std::move(to_type));
}

// timestamp to timestamp
// Converts from.value from from.type to to_type with
// util::ConvertTimestampValue; a failure of that call is returned to the
// caller by ARROW_ASSIGN_OR_RAISE.
template enable_if_timestamp>> CastImpl(
    const TimestampScalar& from, std::shared_ptr to_type) {
  using ToScalar = typename TypeTraits::ScalarType;
  ARROW_ASSIGN_OR_RAISE(auto value,
                        util::ConvertTimestampValue(from.type, to_type, from.value));
  return std::make_shared(value, std::move(to_type));
}

// Returns a timestamp type built from the unit() of `type`. Used below so that
// util::ConvertTimestampValue can be reused for non-timestamp types.
template std::shared_ptr AsTimestampType(const std::shared_ptr& type) {
  return timestamp(checked_cast(*type).unit());
}

// duration to duration
// Both types are mapped through AsTimestampType so the timestamp conversion
// routine can do the value conversion.
template enable_if_duration>> CastImpl(
    const DurationScalar& from, std::shared_ptr to_type) {
  using ToScalar = typename TypeTraits::ScalarType;
  ARROW_ASSIGN_OR_RAISE(
      auto value, util::ConvertTimestampValue(AsTimestampType(from.type),
                                              AsTimestampType(to_type), from.value));
  return std::make_shared(value, std::move(to_type));
}

// time to time
// Same approach as duration to duration; the converted value is additionally
// static_cast before it is stored in the result scalar.
template ::ScalarType::TypeClass>
enable_if_time>> CastImpl(
    const TimeScalar& from, std::shared_ptr to_type) {
  using ToScalar = typename TypeTraits::ScalarType;
  ARROW_ASSIGN_OR_RAISE(
      auto value, util::ConvertTimestampValue(AsTimestampType(from.type),
                                              AsTimestampType(to_type), from.value));
  return std::make_shared(static_cast(value), std::move(to_type));
}

// Number of milliseconds in one day (24 * 60 * 60 * 1000).
constexpr int64_t kMillisecondsInDay = 86400000;

// date to date
// Date32Scalar source: the value is multiplied by kMillisecondsInDay.
template enable_if_t::value, Result>>
CastImpl(const Date32Scalar& from, std::shared_ptr to_type) {
  return std::make_shared(from.value * kMillisecondsInDay, std::move(to_type));
}

// Date64Scalar source: the value is divided by kMillisecondsInDay.
// NOTE(review): integer `/` truncates toward zero, so a negative value that is
// not a whole multiple of kMillisecondsInDay rounds toward zero rather than
// down -- confirm this is the intended result for pre-epoch dates.
template enable_if_t::value, Result>>
CastImpl(const Date64Scalar& from, std::shared_ptr to_type) {
  return std::make_shared(
      static_cast(from.value / kMillisecondsInDay), std::move(to_type));
}

// timestamp to date
// The timestamp is first converted to timestamp(TimeUnit::MILLI); the
// remainder modulo kMillisecondsInDay is then subtracted.
// NOTE(review): assuming `millis` is a signed integer, `%` truncates toward
// zero, so negative values are rounded toward zero instead of down to the
// previous day -- confirm.
template enable_if_t::value, Result>>
CastImpl(const TimestampScalar& from, std::shared_ptr to_type) {
  ARROW_ASSIGN_OR_RAISE(
      auto millis,
      util::ConvertTimestampValue(from.type, timestamp(TimeUnit::MILLI), from.value));
  return std::make_shared(millis - millis % kMillisecondsInDay,
                          std::move(to_type));
}

// Same conversion to milliseconds, followed by a division by
// kMillisecondsInDay (the truncation note above applies here as well).
template enable_if_t::value, Result>>
CastImpl(const TimestampScalar& from, std::shared_ptr to_type) {
  ARROW_ASSIGN_OR_RAISE(
      auto millis,
      util::ConvertTimestampValue(from.type, timestamp(TimeUnit::MILLI), from.value));
  return std::make_shared(static_cast(millis / kMillisecondsInDay),
                          std::move(to_type));
}

// date to timestamp
// Brings the date value to milliseconds (scaling by kMillisecondsInDay when
// the is_same check holds) and then converts it from
// timestamp(TimeUnit::MILLI) to `to_type`.
template enable_if_timestamp>> CastImpl(
    const DateScalar& from, std::shared_ptr to_type) {
  using ToScalar = typename TypeTraits::ScalarType;
  int64_t millis = from.value;
  if (std::is_same::value) {
    millis *= kMillisecondsInDay;
  }
  ARROW_ASSIGN_OR_RAISE(auto value, util::ConvertTimestampValue(
                                        timestamp(TimeUnit::MILLI), to_type, millis));
  return std::make_shared(value, std::move(to_type));
}

// string to any
// Delegates to Scalar::Parse on the string contents. The DCHECK only verifies
// that the pointer cast of the parsed scalar is non-null.
template Result> CastImpl(const StringScalar& from, std::shared_ptr to_type) {
  using ToScalar = typename TypeTraits::ScalarType;
  ARROW_ASSIGN_OR_RAISE(auto out, Scalar::Parse(std::move(to_type),
                                                std::string_view(*from.value)));
  DCHECK(checked_pointer_cast(out) != nullptr);
  return out;
}

// binary/large binary/large string to string
// The new scalar is constructed from from.value as-is, together with to_type.
template enable_if_t::value && std::is_base_of_v && !std::is_same::value, Result>>
CastImpl(const From& from, std::shared_ptr to_type) {
  return std::make_shared(from.value, std::move(to_type));
}

// formattable to string
// Formats the scalar through FormatToBuffer using a Formatter constructed
// from from.type.get().
template ,
          // note: Value unused but necessary to trigger SFINAE if Formatter is
          // undefined
          typename Value = typename Formatter::value_type>
typename std::enable_if_t::value, Result>>
CastImpl(const From& from, std::shared_ptr to_type) {
  return std::make_shared(FormatToBuffer(Formatter{from.type.get()}, from),
                          std::move(to_type));
}

// struct to string
// Produces "{name:type = value, ...}" with one entry per element of
// from.value, in order.
template typename std::enable_if_t::value, Result>>
CastImpl(const StructScalar& from, std::shared_ptr to_type) {
  std::stringstream ss;
  ss << '{';
  for (int i = 0; static_cast(i) < from.value.size(); i++) {
    if (i > 0) ss << ", ";  // separator between entries, none before the first
    ss << from.type->field(i)->name() << ':' << from.type->field(i)->type()->ToString()
       << " = " << from.value[i]->ToString();
  }
  ss << '}';
  return std::make_shared(Buffer::FromString(ss.str()), std::move(to_type));
}

// casts between variable-length and fixed-length list types
// Returns Status::Invalid when the value's length exceeds the maximum checked
// in the first branch, or when it differs from the list_size() of a fixed
// size list destination. Otherwise the same value is re-wrapped in the
// destination scalar type, keeping from.is_valid.
template std::enable_if_t::value && is_list_type::value, Result>>
CastImpl(const FromScalar& from, std::shared_ptr to_type) {
  // Only destinations whose offset_type is narrower than int64_t need the
  // length range check.
  if constexpr (sizeof(typename To::offset_type) < sizeof(int64_t)) {
    if (from.value->length() > std::numeric_limits::max()) {
      return Status::Invalid(from.type->ToString(), " too large to cast to ",
                             to_type->ToString());
    }
  }

  if constexpr (is_fixed_size_list_type::value) {
    const auto& fixed_size_list_type = checked_cast(*to_type);
    if (from.value->length() != fixed_size_list_type.list_size()) {
      return Status::Invalid("Cannot cast ", from.type->ToString(), " of length ",
                             from.value->length(), " to fixed size list of length ",
                             fixed_size_list_type.list_size());
    }
  }

  using ToScalar = typename TypeTraits::ScalarType;
  return std::make_shared(from.value, std::move(to_type), from.is_valid);
}

// list based types (list, large list and map (fixed sized list too)) to string
// Produces "<type>[v0, v1, ...]" where each element is obtained with
// GetScalar(i) and rendered with ToString(). A GetScalar failure is returned
// to the caller.
template typename std::enable_if_t::value, Result>>
CastImpl(const BaseListScalar& from, std::shared_ptr to_type) {
  std::stringstream ss;
  ss << from.type->ToString() << "[";
  for (int64_t i = 0; i < from.value->length(); i++) {
    if (i > 0) ss << ", ";
    ARROW_ASSIGN_OR_RAISE(auto value, from.value->GetScalar(i));
    ss << value->ToString();
  }
  ss << ']';
  return std::make_shared(Buffer::FromString(ss.str()), std::move(to_type));
}

// union types to string
// Produces "union{<field> = <value>}" for the currently selected child.
template typename std::enable_if_t::value, Result>>
CastImpl(const UnionScalar& from, std::shared_ptr to_type) {
  const auto& union_ty = checked_cast(*from.type);
  std::stringstream ss;
  const Scalar* selected_value;
  // The dense branch reads a single `value`; the other branch indexes
  // `value` by child_id.
  if (from.type->id() == Type::DENSE_UNION) {
    selected_value = checked_cast(from).value.get();
  } else {
    const auto& sparse_scalar = checked_cast(from);
    selected_value = sparse_scalar.value[sparse_scalar.child_id].get();
  }
  // child_ids() maps the scalar's type_code to the index of the field to print.
  ss << "union{" << union_ty.field(union_ty.child_ids()[from.type_code])->ToString()
     << " = " << selected_value->ToString() << '}';
  return std::make_shared(Buffer::FromString(ss.str()), std::move(to_type));
}

// State shared by the two visitors below: the source scalar, the destination
// type and the slot receiving the result. from_ and to_type_ are held by
// reference, so the referenced objects must outlive the visitor.
struct CastImplVisitor {
  // Error returned for source/destination pairs that have no cast.
  Status NotImplemented() {
    return Status::NotImplemented("cast to ", *to_type_, " from ", *from_.type);
  }

  const Scalar& from_;
  const std::shared_ptr& to_type_;
  std::shared_ptr out_ = nullptr;
};

// Second dispatch stage: visited with the source type, it downcasts from_ to
// the matching concrete scalar type and calls CastImpl, storing the result in
// out_.
template struct FromTypeVisitor : CastImplVisitor {
  using ToScalar = typename TypeTraits::ScalarType;

  FromTypeVisitor(const Scalar& from, const std::shared_ptr& to_type)
      : CastImplVisitor{from, to_type} {}

  // Generic case: forward to the CastImpl overload chosen for the downcast
  // scalar.
  template Status Visit(const FromType&) {
    ARROW_ASSIGN_OR_RAISE(
        out_, CastImpl(
                  checked_cast::ScalarType&>(from_), to_type_));
    return Status::OK();
  }

  // identity cast only for parameter free types
  // The result is rebuilt with MakeScalar from the source scalar's value.
  template typename std::enable_if_t::is_parameter_free, Status> Visit(
      const ToType&) {
    ARROW_ASSIGN_OR_RAISE(
        out_, MakeScalar(to_type_, checked_cast(from_).value));
    return Status::OK();
  }

  // Sources of these types are reported as not castable.
  Status Visit(const NullType&) { return NotImplemented(); }
  Status Visit(const DictionaryType&) { return NotImplemented(); }
  Status Visit(const ExtensionType&) { return NotImplemented(); }
};

// First dispatch stage: visited with the destination type, it hands over to a
// FromTypeVisitor to dispatch on the source type.
struct ToTypeVisitor : CastImplVisitor {
  ToTypeVisitor(const Scalar& from, const std::shared_ptr& to_type)
      : CastImplVisitor{from, to_type} {}

  // Generic case: run the second-stage visitor on the source type and take
  // over its result.
  template Status Visit(const ToType&) {
    FromTypeVisitor unpack_from_type{from_, to_type_};
    ARROW_RETURN_NOT_OK(VisitTypeInline(*from_.type, &unpack_from_type));
    out_ = std::move(unpack_from_type.out_);
    return Status::OK();
  }

  // A valid scalar cannot be cast to null.
  // NOTE(review): for an invalid source this returns OK while out_ is still
  // nullptr; Scalar::CastTo below only reaches this visitor for valid scalars.
  Status Visit(const NullType&) {
    if (from_.is_valid) {
      return Status::Invalid("attempting to cast non-null scalar to NullScalar");
    }
    return Status::OK();
  }

  // Destination is a dictionary: cast the source to the dictionary's value
  // type, build a one-element dictionary array from it, and point at it with
  // index 0 cast to the dictionary's index type.
  Status Visit(const DictionaryType& dict_type) {
    ARROW_ASSIGN_OR_RAISE(auto cast_value, from_.CastTo(dict_type.value_type()));
    ARROW_ASSIGN_OR_RAISE(auto dictionary, MakeArrayFromScalar(*cast_value, 1));
    ARROW_ASSIGN_OR_RAISE(auto index, Int32Scalar(0).CastTo(dict_type.index_type()));
    out_ = DictionaryScalar::Make(std::move(index), std::move(dictionary));
    return Status::OK();
  }

  Status Visit(const ExtensionType&) { return NotImplemented(); }

  // Runs the dispatch on *to_type_ and returns the scalar left in out_.
  // Rvalue-qualified, so it can only be invoked on an rvalue visitor.
  Result> Finish() && {
    ARROW_RETURN_NOT_OK(VisitTypeInline(*to_type_, this));
    return std::move(out_);
  }
};

}  // namespace

// Casts this scalar to the type `to`.
// A valid scalar goes through ToTypeVisitor, which returns the cast scalar or
// an error Status. A scalar that is not valid yields MakeNullScalar(to)
// without consulting the cast machinery.
Result> Scalar::CastTo(std::shared_ptr to) const {
  if (is_valid) {
    return ToTypeVisitor{*this, std::move(to)}.Finish();
  }
  return MakeNullScalar(std::move(to));
}

// Writes scalar.ToString() to the stream `*os`; `os` is dereferenced without a
// null check.
void PrintTo(const Scalar& scalar, std::ostream* os) { *os << scalar.ToString(); }

}  // namespace arrow20