summaryrefslogtreecommitdiffstats
path: root/contrib/libs/apache/arrow_next/cpp/src/arrow/compute/key_hash_internal.h
blob: d3041ba0b58f9b4818858377a63241869db5f724 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#pragma clang system_header
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>

#include "contrib/libs/apache/arrow_next/cpp/src/arrow/compute/light_array_internal.h"
#include "contrib/libs/apache/arrow_next/cpp/src/arrow/compute/util.h"
#include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/simd.h"

namespace arrow20 {
namespace compute {

// Forward declarations only needed for making test functions a friend of the classes in
// this file.
//
enum class BloomFilterBuildStrategy;

// Implementations are based on xxh3 32-bit algorithm description from:
// https://github.com/Cyan4973/xxHash/blob/dev/doc/xxhash_spec.md
//
class ARROW_EXPORT Hashing32 {
  friend class TestVectorHash;
  template <typename T>
  friend void TestBloomLargeHashHelper(int64_t, int64_t, const std::vector<uint64_t>&,
                                       int64_t, int, T*);
  friend void TestBloomSmall(BloomFilterBuildStrategy, int64_t, int, bool, bool);

 public:
  static void HashMultiColumn(const std::vector<KeyColumnArray>& cols, LightContext* ctx,
                              uint32_t* out_hash);

  // Clarify the max temp stack usage for HashBatch, which might be necessary for the
  // caller to be aware of at compile time to reserve enough stack size in advance. The
  // HashBatch implementation uses one uint32 temp vector as a buffer for hash, one uint16
  // temp vector as a buffer for null indices and one uint32 temp vector as a buffer for
  // null hash, all are of size kMiniBatchLength. Plus extra kMiniBatchLength to cope with
  // stack padding and aligning.
  static constexpr auto kHashBatchTempStackUsage =
      (sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint32_t) + /*extra=*/1) *
      util::MiniBatch::kMiniBatchLength;

  static Status HashBatch(const ExecBatch& key_batch, uint32_t* hashes,
                          std::vector<KeyColumnArray>& column_arrays,
                          int64_t hardware_flags, util::TempVectorStack* temp_stack,
                          int64_t start_row, int64_t num_rows);

  static void HashFixed(int64_t hardware_flags, bool combine_hashes, uint32_t num_keys,
                        uint64_t key_length, const uint8_t* keys, uint32_t* hashes,
                        uint32_t* temp_hashes_for_combine);

 private:
  static const uint32_t PRIME32_1 = 0x9E3779B1;
  static const uint32_t PRIME32_2 = 0x85EBCA77;
  static const uint32_t PRIME32_3 = 0xC2B2AE3D;
  static const uint32_t PRIME32_4 = 0x27D4EB2F;
  static const uint32_t PRIME32_5 = 0x165667B1;
  static const uint32_t kCombineConst = 0x9e3779b9UL;
  static const int64_t kStripeSize = 4 * sizeof(uint32_t);

  static void HashVarLen(int64_t hardware_flags, bool combine_hashes, uint32_t num_rows,
                         const uint32_t* offsets, const uint8_t* concatenated_keys,
                         uint32_t* hashes, uint32_t* temp_hashes_for_combine);

  static void HashVarLen(int64_t hardware_flags, bool combine_hashes, uint32_t num_rows,
                         const uint64_t* offsets, const uint8_t* concatenated_keys,
                         uint32_t* hashes, uint32_t* temp_hashes_for_combine);

  static inline uint32_t Avalanche(uint32_t acc) {
    acc ^= (acc >> 15);
    acc *= PRIME32_2;
    acc ^= (acc >> 13);
    acc *= PRIME32_3;
    acc ^= (acc >> 16);
    return acc;
  }
  static inline uint32_t Round(uint32_t acc, uint32_t input);
  static inline uint32_t CombineAccumulators(uint32_t acc1, uint32_t acc2, uint32_t acc3,
                                             uint32_t acc4);
  static inline uint32_t CombineHashesImp(uint32_t previous_hash, uint32_t hash) {
    uint32_t next_hash = previous_hash ^ (hash + kCombineConst + (previous_hash << 6) +
                                          (previous_hash >> 2));
    return next_hash;
  }
  static inline void ProcessFullStripes(uint64_t num_stripes, const uint8_t* key,
                                        uint32_t* out_acc1, uint32_t* out_acc2,
                                        uint32_t* out_acc3, uint32_t* out_acc4);
  static inline void ProcessLastStripe(uint32_t mask1, uint32_t mask2, uint32_t mask3,
                                       uint32_t mask4, const uint8_t* last_stripe,
                                       uint32_t* acc1, uint32_t* acc2, uint32_t* acc3,
                                       uint32_t* acc4);
  static inline void StripeMask(int i, uint32_t* mask1, uint32_t* mask2, uint32_t* mask3,
                                uint32_t* mask4);
  template <bool T_COMBINE_HASHES>
  static void HashFixedLenImp(uint32_t num_rows, uint64_t key_length, const uint8_t* keys,
                              uint32_t* hashes);
  template <typename T, bool T_COMBINE_HASHES>
  static void HashVarLenImp(uint32_t num_rows, const T* offsets,
                            const uint8_t* concatenated_keys, uint32_t* hashes);
  template <bool T_COMBINE_HASHES>
  static void HashBitImp(int64_t bit_offset, uint32_t num_keys, const uint8_t* keys,
                         uint32_t* hashes);
  static void HashBit(bool combine_hashes, int64_t bit_offset, uint32_t num_keys,
                      const uint8_t* keys, uint32_t* hashes);
  template <bool T_COMBINE_HASHES, typename T>
  static void HashIntImp(uint32_t num_keys, const T* keys, uint32_t* hashes);
  static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t key_length,
                      const uint8_t* keys, uint32_t* hashes);

#if defined(ARROW_HAVE_RUNTIME_AVX2)
  static inline __m256i Avalanche_avx2(__m256i hash);
  static inline __m256i CombineHashesImp_avx2(__m256i previous_hash, __m256i hash);
  template <bool T_COMBINE_HASHES>
  static void AvalancheAll_avx2(uint32_t num_rows, uint32_t* hashes,
                                const uint32_t* hashes_temp_for_combine);
  static inline __m256i Round_avx2(__m256i acc, __m256i input);
  static inline uint64_t CombineAccumulators_avx2(__m256i acc);
  static inline __m256i StripeMask_avx2(int i, int j);
  template <bool two_equal_lengths>
  static inline __m256i ProcessStripes_avx2(int64_t num_stripes_A, int64_t num_stripes_B,
                                            __m256i mask_last_stripe, const uint8_t* keys,
                                            int64_t offset_A, int64_t offset_B);
  template <bool T_COMBINE_HASHES>
  static uint32_t HashFixedLenImp_avx2(uint32_t num_rows, uint64_t key_length,
                                       const uint8_t* keys, uint32_t* hashes,
                                       uint32_t* hashes_temp_for_combine);
  static uint32_t HashFixedLen_avx2(bool combine_hashes, uint32_t num_rows,
                                    uint64_t key_length, const uint8_t* keys,
                                    uint32_t* hashes, uint32_t* hashes_temp_for_combine);
  template <typename T, bool T_COMBINE_HASHES>
  static uint32_t HashVarLenImp_avx2(uint32_t num_rows, const T* offsets,
                                     const uint8_t* concatenated_keys, uint32_t* hashes,
                                     uint32_t* hashes_temp_for_combine);
  static uint32_t HashVarLen_avx2(bool combine_hashes, uint32_t num_rows,
                                  const uint32_t* offsets,
                                  const uint8_t* concatenated_keys, uint32_t* hashes,
                                  uint32_t* hashes_temp_for_combine);
  static uint32_t HashVarLen_avx2(bool combine_hashes, uint32_t num_rows,
                                  const uint64_t* offsets,
                                  const uint8_t* concatenated_keys, uint32_t* hashes,
                                  uint32_t* hashes_temp_for_combine);
#endif
};

class ARROW_EXPORT Hashing64 {
  friend class TestVectorHash;
  template <typename T>
  friend void TestBloomLargeHashHelper(int64_t, int64_t, const std::vector<uint64_t>&,
                                       int64_t, int, T*);
  friend void TestBloomSmall(BloomFilterBuildStrategy, int64_t, int, bool, bool);

 public:
  static void HashMultiColumn(const std::vector<KeyColumnArray>& cols, LightContext* ctx,
                              uint64_t* hashes);

  // Clarify the max temp stack usage for HashBatch, which might be necessary for the
  // caller to be aware of at compile time to reserve enough stack size in advance. The
  // HashBatch implementation uses one uint16 temp vector as a buffer for null indices and
  // one uint64 temp vector as a buffer for null hash, all are of size kMiniBatchLength.
  // Plus extra kMiniBatchLength to cope with stack padding and aligning.
  static constexpr auto kHashBatchTempStackUsage =
      (sizeof(uint16_t) + sizeof(uint64_t) + /*extra=*/1) *
      util::MiniBatch::kMiniBatchLength;

  static Status HashBatch(const ExecBatch& key_batch, uint64_t* hashes,
                          std::vector<KeyColumnArray>& column_arrays,
                          int64_t hardware_flags, util::TempVectorStack* temp_stack,
                          int64_t start_row, int64_t num_rows);

  static void HashFixed(bool combine_hashes, uint32_t num_keys, uint64_t key_length,
                        const uint8_t* keys, uint64_t* hashes);

 private:
  static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87ULL;
  static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;
  static const uint64_t PRIME64_3 = 0x165667B19E3779F9ULL;
  static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63ULL;
  static const uint64_t PRIME64_5 = 0x27D4EB2F165667C5ULL;
  static const uint32_t kCombineConst = 0x9e3779b9UL;
  static const int64_t kStripeSize = 4 * sizeof(uint64_t);

  static void HashVarLen(bool combine_hashes, uint32_t num_rows, const uint32_t* offsets,
                         const uint8_t* concatenated_keys, uint64_t* hashes);

  static void HashVarLen(bool combine_hashes, uint32_t num_rows, const uint64_t* offsets,
                         const uint8_t* concatenated_keys, uint64_t* hashes);

  static inline uint64_t Avalanche(uint64_t acc);
  static inline uint64_t Round(uint64_t acc, uint64_t input);
  static inline uint64_t CombineAccumulators(uint64_t acc1, uint64_t acc2, uint64_t acc3,
                                             uint64_t acc4);
  static inline uint64_t CombineHashesImp(uint64_t previous_hash, uint64_t hash) {
    uint64_t next_hash = previous_hash ^ (hash + kCombineConst + (previous_hash << 6) +
                                          (previous_hash >> 2));
    return next_hash;
  }
  static inline void ProcessFullStripes(uint64_t num_stripes, const uint8_t* key,
                                        uint64_t* out_acc1, uint64_t* out_acc2,
                                        uint64_t* out_acc3, uint64_t* out_acc4);
  static inline void ProcessLastStripe(uint64_t mask1, uint64_t mask2, uint64_t mask3,
                                       uint64_t mask4, const uint8_t* last_stripe,
                                       uint64_t* acc1, uint64_t* acc2, uint64_t* acc3,
                                       uint64_t* acc4);
  static inline void StripeMask(int i, uint64_t* mask1, uint64_t* mask2, uint64_t* mask3,
                                uint64_t* mask4);
  template <bool T_COMBINE_HASHES>
  static void HashFixedLenImp(uint32_t num_rows, uint64_t key_length, const uint8_t* keys,
                              uint64_t* hashes);
  template <typename T, bool T_COMBINE_HASHES>
  static void HashVarLenImp(uint32_t num_rows, const T* offsets,
                            const uint8_t* concatenated_keys, uint64_t* hashes);
  template <bool T_COMBINE_HASHES>
  static void HashBitImp(int64_t bit_offset, uint32_t num_keys, const uint8_t* keys,
                         uint64_t* hashes);
  static void HashBit(bool combine_hashes, int64_t bit_offset, uint32_t num_keys,
                      const uint8_t* keys, uint64_t* hashes);
  template <bool T_COMBINE_HASHES, typename T>
  static void HashIntImp(uint32_t num_keys, const T* keys, uint64_t* hashes);
  static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t key_length,
                      const uint8_t* keys, uint64_t* hashes);
};

}  // namespace compute
}  // namespace arrow20