#include "dense_union.h" #include #include namespace NYql::NUdf { namespace { struct TForwardScanResult { TVector MinOffsets; TVector MaxOffsets; TVector ChildHasOccurrence; size_t ForwardScanEnd = 0; }; TForwardScanResult ForwardScanOffsetsUntilAllMinsFound( TArrayRef typeCodes, TArrayRef valueOffsets, size_t numberOfTypeIds, size_t numberOfNonEmptyChildren) { TForwardScanResult result{ .MinOffsets = TVector(numberOfTypeIds, Max()), .MaxOffsets = TVector(numberOfTypeIds, Min()), .ChildHasOccurrence = TVector(numberOfTypeIds, false), }; size_t childrenFoundFromLeft = 0; size_t idx = 0; for (; idx < typeCodes.size() && childrenFoundFromLeft < numberOfNonEmptyChildren; ++idx) { const size_t typeCode = static_cast(static_cast(typeCodes[idx])); const ui64 offset = static_cast(valueOffsets[idx]); childrenFoundFromLeft += static_cast(!result.ChildHasOccurrence[typeCode]); result.ChildHasOccurrence[typeCode] = true; result.MinOffsets[typeCode] = Min(result.MinOffsets[typeCode], offset); result.MaxOffsets[typeCode] = Max(result.MaxOffsets[typeCode], offset); } result.ForwardScanEnd = idx; return result; } void BackwardScanCompleteMaxOffsets( TArrayRef typeCodes, TArrayRef valueOffsets, size_t forwardScanEnd, size_t numberOfNonEmptyChildren, TVector& maxOffsets) { TVector childFoundFromRight(maxOffsets.size(), false); size_t childrenFoundFromRight = 0; for (size_t idx = typeCodes.size(); idx > forwardScanEnd && childrenFoundFromRight < numberOfNonEmptyChildren; --idx) { const size_t typeCode = static_cast(static_cast(typeCodes[idx - 1])); const ui64 offset = static_cast(valueOffsets[idx - 1]); childrenFoundFromRight += static_cast(!childFoundFromRight[typeCode]); childFoundFromRight[typeCode] = true; maxOffsets[typeCode] = Max(maxOffsets[typeCode], offset); } } } // namespace TVector CalculateDenseUnionChildrenUsage(const arrow::ArrayData& data) { const size_t numberOfTypeIds = data.child_data.size(); const size_t numberOfNonEmptyChildren = CountIf(data.child_data, [](const std::shared_ptr& child) { Y_ENSURE(child, "Child data is null"); return child->length > 0; }); TVector result(numberOfTypeIds); const size_t length = static_cast(data.length); const TArrayRef typeCodes{data.GetValues(1), length}; const TArrayRef valueOffsets{data.GetValues(2), length}; auto minResult = ForwardScanOffsetsUntilAllMinsFound(typeCodes, valueOffsets, numberOfTypeIds, numberOfNonEmptyChildren); BackwardScanCompleteMaxOffsets(typeCodes, valueOffsets, minResult.ForwardScanEnd, numberOfNonEmptyChildren, minResult.MaxOffsets); for (size_t childIndex = 0; childIndex < numberOfTypeIds; ++childIndex) { if (minResult.ChildHasOccurrence[childIndex]) { result[childIndex] = { .Offset = minResult.MinOffsets[childIndex], .Length = minResult.MaxOffsets[childIndex] - minResult.MinOffsets[childIndex] + 1, }; } } return result; } void AdjustDenseUnionValueOffsets( TArrayRef valueOffsets, TArrayRef typeCodes, TArrayRef childUsage) { const bool needsAdjust = AnyOf(childUsage, [](const TDenseUnionChildUsage& usage) { return usage.Offset > 0; }); if (!needsAdjust) { return; } for (size_t rowIndex = 0; rowIndex < valueOffsets.size(); ++rowIndex) { const size_t typeCode = static_cast(static_cast(typeCodes[rowIndex])); valueOffsets[rowIndex] -= static_cast(childUsage[typeCode].Offset); } } } // namespace NYql::NUdf