summaryrefslogtreecommitdiffstats
path: root/contrib/libs/apache/arrow_next/cpp/src/arrow/array/array_dict.h
blob: 14c17798d3673f8431da0a84933291a97e52343c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#pragma clang system_header
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>

#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow20 {

// ----------------------------------------------------------------------
// DictionaryArray

/// \brief Array type for dictionary-encoded data with a
/// data-dependent dictionary
///
/// A dictionary array contains an array of non-negative integers (the
/// "dictionary indices") along with a data type containing a "dictionary"
/// corresponding to the distinct values represented in the data.
///
/// For example, the array
///
///   ["foo", "bar", "foo", "bar", "foo", "bar"]
///
/// with dictionary ["bar", "foo"], would have dictionary array representation
///
///   indices: [1, 0, 1, 0, 1, 0]
///   dictionary: ["bar", "foo"]
///
/// The indices in principle may be any integer type.
class ARROW_EXPORT DictionaryArray : public Array {
 public:
  using TypeClass = DictionaryType;

  explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);

  DictionaryArray(const std::shared_ptr<DataType>& type,
                  const std::shared_ptr<Array>& indices,
                  const std::shared_ptr<Array>& dictionary);

  /// \brief Construct DictionaryArray from dictionary and indices
  /// array and validate
  ///
  /// This function does the validation of the indices and input type. It checks if
  /// all indices are non-negative and smaller than the size of the dictionary.
  ///
  /// \param[in] type a dictionary type
  /// \param[in] dictionary the dictionary with same value type as the
  /// type object
  /// \param[in] indices an array of non-negative integers smaller than the
  /// size of the dictionary
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
      const std::shared_ptr<Array>& dictionary);

  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
    return FromArrays(::arrow20::dictionary(indices->type(), dictionary->type()), indices,
                      dictionary);
  }

  /// \brief Transpose this DictionaryArray
  ///
  /// This method constructs a new dictionary array with the given dictionary
  /// type, transposing indices using the transpose map.  The type and the
  /// transpose map are typically computed using DictionaryUnifier.
  ///
  /// \param[in] type the new type object
  /// \param[in] dictionary the new dictionary
  /// \param[in] transpose_map transposition array of this array's indices
  ///   into the target array's indices
  /// \param[in] pool a pool to allocate the array data from
  Result<std::shared_ptr<Array>> Transpose(
      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
      const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;

  Result<std::shared_ptr<Array>> Compact(MemoryPool* pool = default_memory_pool()) const;

  /// \brief Determine whether dictionary arrays may be compared without unification
  bool CanCompareIndices(const DictionaryArray& other) const;

  /// \brief Return the dictionary for this array, which is stored as
  /// a member of the ArrayData internal structure
  const std::shared_ptr<Array>& dictionary() const;
  const std::shared_ptr<Array>& indices() const;

  /// \brief Return the ith value of indices, cast to int64_t. Not recommended
  /// for use in performance-sensitive code. Does not validate whether the
  /// value is null or out-of-bounds.
  int64_t GetValueIndex(int64_t i) const;

  const DictionaryType* dict_type() const { return dict_type_; }

 private:
  void SetData(const std::shared_ptr<ArrayData>& data);
  const DictionaryType* dict_type_;
  std::shared_ptr<Array> indices_;

  // Lazily initialized when invoking dictionary()
  mutable std::shared_ptr<Array> dictionary_;
};

/// \brief Helper class for incremental dictionary unification
class ARROW_EXPORT DictionaryUnifier {
 public:
  virtual ~DictionaryUnifier() = default;

  /// \brief Construct a DictionaryUnifier
  /// \param[in] value_type the data type of the dictionaries
  /// \param[in] pool MemoryPool to use for memory allocations
  static Result<std::unique_ptr<DictionaryUnifier>> Make(
      std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());

  /// \brief Unify dictionaries across array chunks
  ///
  /// The dictionaries in the array chunks will be unified, their indices
  /// accordingly transposed.
  ///
  /// Only dictionaries with a primitive value type are currently supported.
  /// However, dictionaries nested inside a more complex type are correctly unified.
  static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
      const std::shared_ptr<ChunkedArray>& array,
      MemoryPool* pool = default_memory_pool());

  /// \brief Unify dictionaries across the chunks of each table column
  ///
  /// The dictionaries in each table column will be unified, their indices
  /// accordingly transposed.
  ///
  /// Only dictionaries with a primitive value type are currently supported.
  /// However, dictionaries nested inside a more complex type are correctly unified.
  static Result<std::shared_ptr<Table>> UnifyTable(
      const Table& table, MemoryPool* pool = default_memory_pool());

  /// \brief Append dictionary to the internal memo
  virtual Status Unify(const Array& dictionary) = 0;

  /// \brief Append dictionary and compute transpose indices
  /// \param[in] dictionary the dictionary values to unify
  /// \param[out] out_transpose a Buffer containing computed transpose indices
  /// as int32_t values equal in length to the passed dictionary. The value in
  /// each slot corresponds to the new index value for each original index
  /// for a DictionaryArray with the old dictionary
  virtual Status Unify(const Array& dictionary,
                       std::shared_ptr<Buffer>* out_transpose) = 0;

  /// \brief Return a result DictionaryType with the smallest possible index
  /// type to accommodate the unified dictionary. The unifier cannot be used
  /// after this is called
  virtual Status GetResult(std::shared_ptr<DataType>* out_type,
                           std::shared_ptr<Array>* out_dict) = 0;

  /// \brief Return a unified dictionary with the given index type.  If
  /// the index type is not large enough then an invalid status will be returned.
  /// The unifier cannot be used after this is called
  virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
                                        std::shared_ptr<Array>* out_dict) = 0;
};

}  // namespace arrow20