1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
#include "dense_union.h"
#include <util/generic/algorithm.h>
#include <util/system/types.h>
namespace NYql::NUdf {
namespace {
struct TForwardScanResult {
TVector<ui64> MinOffsets;
TVector<ui64> MaxOffsets;
TVector<bool> ChildHasOccurrence;
size_t ForwardScanEnd = 0;
};
TForwardScanResult ForwardScanOffsetsUntilAllMinsFound(
TArrayRef<const i8> typeCodes,
TArrayRef<const i32> valueOffsets,
size_t numberOfTypeIds,
size_t numberOfNonEmptyChildren) {
TForwardScanResult result{
.MinOffsets = TVector<ui64>(numberOfTypeIds, Max<ui64>()),
.MaxOffsets = TVector<ui64>(numberOfTypeIds, Min<ui64>()),
.ChildHasOccurrence = TVector<bool>(numberOfTypeIds, false),
};
size_t childrenFoundFromLeft = 0;
size_t idx = 0;
for (; idx < typeCodes.size() && childrenFoundFromLeft < numberOfNonEmptyChildren; ++idx) {
const size_t typeCode = static_cast<size_t>(static_cast<ui8>(typeCodes[idx]));
const ui64 offset = static_cast<ui64>(valueOffsets[idx]);
childrenFoundFromLeft += static_cast<size_t>(!result.ChildHasOccurrence[typeCode]);
result.ChildHasOccurrence[typeCode] = true;
result.MinOffsets[typeCode] = Min(result.MinOffsets[typeCode], offset);
result.MaxOffsets[typeCode] = Max(result.MaxOffsets[typeCode], offset);
}
result.ForwardScanEnd = idx;
return result;
}
void BackwardScanCompleteMaxOffsets(
TArrayRef<const i8> typeCodes,
TArrayRef<const i32> valueOffsets,
size_t forwardScanEnd,
size_t numberOfNonEmptyChildren,
TVector<ui64>& maxOffsets) {
TVector<bool> childFoundFromRight(maxOffsets.size(), false);
size_t childrenFoundFromRight = 0;
for (size_t idx = typeCodes.size(); idx > forwardScanEnd && childrenFoundFromRight < numberOfNonEmptyChildren; --idx) {
const size_t typeCode = static_cast<size_t>(static_cast<ui8>(typeCodes[idx - 1]));
const ui64 offset = static_cast<ui64>(valueOffsets[idx - 1]);
childrenFoundFromRight += static_cast<size_t>(!childFoundFromRight[typeCode]);
childFoundFromRight[typeCode] = true;
maxOffsets[typeCode] = Max(maxOffsets[typeCode], offset);
}
}
} // namespace
TVector<TDenseUnionChildUsage> CalculateDenseUnionChildrenUsage(const arrow::ArrayData& data) {
const size_t numberOfTypeIds = data.child_data.size();
const size_t numberOfNonEmptyChildren = CountIf(data.child_data, [](const std::shared_ptr<arrow::ArrayData>& child) {
Y_ENSURE(child, "Child data is null");
return child->length > 0;
});
TVector<TDenseUnionChildUsage> result(numberOfTypeIds);
const size_t length = static_cast<size_t>(data.length);
const TArrayRef<const i8> typeCodes{data.GetValues<i8>(1), length};
const TArrayRef<const i32> valueOffsets{data.GetValues<i32>(2), length};
auto minResult = ForwardScanOffsetsUntilAllMinsFound(typeCodes, valueOffsets, numberOfTypeIds, numberOfNonEmptyChildren);
BackwardScanCompleteMaxOffsets(typeCodes, valueOffsets, minResult.ForwardScanEnd, numberOfNonEmptyChildren, minResult.MaxOffsets);
for (size_t childIndex = 0; childIndex < numberOfTypeIds; ++childIndex) {
if (minResult.ChildHasOccurrence[childIndex]) {
result[childIndex] = {
.Offset = minResult.MinOffsets[childIndex],
.Length = minResult.MaxOffsets[childIndex] - minResult.MinOffsets[childIndex] + 1,
};
}
}
return result;
}
void AdjustDenseUnionValueOffsets(
TArrayRef<i32> valueOffsets,
TArrayRef<const i8> typeCodes,
TArrayRef<const TDenseUnionChildUsage> childUsage) {
const bool needsAdjust = AnyOf(childUsage, [](const TDenseUnionChildUsage& usage) {
return usage.Offset > 0;
});
if (!needsAdjust) {
return;
}
for (size_t rowIndex = 0; rowIndex < valueOffsets.size(); ++rowIndex) {
const size_t typeCode = static_cast<size_t>(static_cast<ui8>(typeCodes[rowIndex]));
valueOffsets[rowIndex] -= static_cast<i32>(childUsage[typeCode].Offset);
}
}
} // namespace NYql::NUdf
|