aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorfixthgame <fixthgame@yandex-team.com>2023-10-12 18:09:19 +0300
committerfixthgame <fixthgame@yandex-team.com>2023-10-12 18:47:49 +0300
commitdeabfde11b4514e0221cb5fac13b072f152ccd8f (patch)
tree096e3b94493f09d56f738a4638636ae8378a483a
parentbba1559e3cdc2fa94aa9faeeaf8fb7323a6b060f (diff)
downloadydb-deabfde11b4514e0221cb5fac13b072f152ccd8f.tar.gz
SIMD Реализации для Westmere и Haswell
Unit test for SIMM Westmere and Haswell base.h Westmere and Haswell Simd
-rw-r--r--ydb/library/yql/minikql/comp_nodes/block_join/avx2/begin.h3
-rw-r--r--ydb/library/yql/minikql/comp_nodes/block_join/avx2/end.h1
-rw-r--r--ydb/library/yql/minikql/comp_nodes/block_join/avx2/simd.h324
-rw-r--r--ydb/library/yql/minikql/comp_nodes/block_join/sse42/begin.h3
-rw-r--r--ydb/library/yql/minikql/comp_nodes/block_join/sse42/end.h1
-rw-r--r--ydb/library/yql/minikql/comp_nodes/block_join/sse42/simd.h310
-rw-r--r--ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-x86_64.txt1
-rw-r--r--ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-aarch64.txt1
-rw-r--r--ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-x86_64.txt1
-rw-r--r--ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.windows-x86_64.txt1
-rw-r--r--ydb/library/yql/minikql/comp_nodes/ut/mkql_simd_ut.cpp176
-rw-r--r--ydb/library/yql/minikql/comp_nodes/ut/ya.make1
12 files changed, 823 insertions, 0 deletions
diff --git a/ydb/library/yql/minikql/comp_nodes/block_join/avx2/begin.h b/ydb/library/yql/minikql/comp_nodes/block_join/avx2/begin.h
new file mode 100644
index 00000000000..81d8f3ca346
--- /dev/null
+++ b/ydb/library/yql/minikql/comp_nodes/block_join/avx2/begin.h
@@ -0,0 +1,3 @@
+#pragma clang attribute push(__attribute__((target("avx2"))), apply_to=function)
+
+#include "simd.h" \ No newline at end of file
diff --git a/ydb/library/yql/minikql/comp_nodes/block_join/avx2/end.h b/ydb/library/yql/minikql/comp_nodes/block_join/avx2/end.h
new file mode 100644
index 00000000000..fcef763036d
--- /dev/null
+++ b/ydb/library/yql/minikql/comp_nodes/block_join/avx2/end.h
@@ -0,0 +1 @@
+#pragma clang attribute pop \ No newline at end of file
diff --git a/ydb/library/yql/minikql/comp_nodes/block_join/avx2/simd.h b/ydb/library/yql/minikql/comp_nodes/block_join/avx2/simd.h
new file mode 100644
index 00000000000..45beff01d33
--- /dev/null
+++ b/ydb/library/yql/minikql/comp_nodes/block_join/avx2/simd.h
@@ -0,0 +1,324 @@
+#pragma once
+
+#include <cstdint>
+#include <immintrin.h>
+
+namespace NKikimr {
+namespace NMiniKQL {
+namespace NBlockJoin {
+namespace NAVX2 {
+namespace NSIMD {
+
+template <typename T>
+struct TSimd8;
+
+template<typename Child>
+struct TBase {
+ __m256i Value;
+
+ inline TBase()
+ : Value{__m256i()} {
+ }
+
+ inline TBase(const __m256i value)
+ : Value(value) {
+ }
+
+ inline operator const __m256i&() const {
+ return this->Value;
+ }
+ inline operator __m256i&() {
+ return this->Value;
+ }
+
+ inline Child operator|(const Child other) const {
+ return _mm256_or_si256(*this, other);
+ }
+ inline Child operator&(const Child other) const {
+ return _mm256_and_si256(*this, other);
+ }
+ inline Child operator^(const Child other) const {
+ return _mm256_xor_si256(*this, other);
+ }
+ inline Child BitAndNot(const Child other) const {
+ return _mm256_andnot_si256(*this, other);
+ };
+ inline Child& operator|=(const Child other) {
+ auto cast = static_cast<Child*>(*this);
+ *cast = *cast | other;
+ return *cast;
+ }
+ inline Child& operator&=(const Child other) {
+ auto cast = static_cast<Child*>(*this);
+ *cast = *cast & other;
+ return *cast;
+ };
+ inline Child& operator^=(const Child other) {
+ auto cast = static_cast<Child*>(*this);
+ *cast = *cast ^ other;
+ return *cast;
+ };
+};
+
+template<typename T, typename Mask=TSimd8<bool>>
+struct TBase8: TBase<TSimd8<T>> {
+
+ inline TBase8()
+ : TBase<TSimd8<T>>()
+ {
+ }
+
+ inline TBase8(const __m256i _value)
+ : TBase<TSimd8<T>>(_value)
+ {
+ }
+
+ friend inline Mask operator==(const TSimd8<T> lhs, const TSimd8<T> rhs) {
+ return _mm256_cmpeq_epi8(lhs, rhs);
+ }
+
+ static const int SIZE = sizeof(TBase<T>::Value);
+};
+
+template<>
+struct TSimd8<bool>: TBase8<bool> {
+
+ inline TSimd8<bool>()
+ : TBase8()
+ {
+ }
+
+ inline TSimd8<bool>(const __m256i value)
+ : TBase8<bool>(value)
+ {
+ }
+
+ inline TSimd8<bool>(bool value)
+ : TBase8<bool>(Set(value))
+ {
+ }
+
+ static inline TSimd8<bool> Set(bool value) {
+ return _mm256_set1_epi8(ui8(-(!!value)));
+ }
+
+ inline bool Any() const {
+ return !_mm256_testz_si256(*this, *this);
+ }
+
+ inline TSimd8<bool> operator~() const {
+ return *this ^ true;
+ }
+};
+
+template<typename T>
+struct TBase8Numeric: TBase8<T> {
+
+ inline TBase8Numeric()
+ : TBase8<T>()
+ {
+ }
+ inline TBase8Numeric(const __m256i value)
+ : TBase8<T>(value)
+ {
+ }
+
+ static inline TSimd8<T> Set(T value) {
+ return _mm256_set1_epi8(value);
+ }
+ static inline TSimd8<T> Zero() {
+ return _mm256_setzero_si256();
+ }
+ static inline TSimd8<T> Load(const T values[32]) {
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
+ }
+
+ static inline TSimd8<T> Repeat16(
+ T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
+ ) {
+ return TSimd8<T>(
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15,
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15
+ );
+ }
+
+ inline void Store(T dst[32]) const {
+ return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this);
+ }
+
+ inline TSimd8<T> operator+(const TSimd8<T> other) const {
+ return _mm256_add_epi8(*this, other);
+ }
+ inline TSimd8<T> operator-(const TSimd8<T> other) const {
+ return _mm256_sub_epi8(*this, other);
+ }
+ inline TSimd8<T>& operator+=(const TSimd8<T> other) {
+ *this = *this + other;
+ return *static_cast<TSimd8<T>*>(this);
+ }
+ inline TSimd8<T>& operator-=(const TSimd8<T> other) {
+ *this = *this - other;
+ return *static_cast<TSimd8<T>*>(this);
+ }
+
+ // 0xFFu = 11111111 = 2^8 - 1
+ inline TSimd8<T> operator~() const {
+ return *this ^ 0xFFu;
+ }
+};
+
+template<>
+struct TSimd8<i8> : TBase8Numeric<i8> {
+ inline TSimd8()
+ : TBase8Numeric<i8>()
+ {
+ }
+ inline TSimd8(const __m256i value)
+ : TBase8Numeric<i8>(value)
+ {
+ }
+ inline TSimd8(i8 value)
+ : TSimd8(Set(value))
+ {
+ }
+ inline TSimd8(const i8 values[32])
+ : TSimd8(Load(values))
+ {
+ }
+ inline TSimd8(
+ i8 v0, i8 v1, i8 v2, i8 v3, i8 v4, i8 v5, i8 v6, i8 v7,
+ i8 v8, i8 v9, i8 v10, i8 v11, i8 v12, i8 v13, i8 v14, i8 v15,
+ i8 v16, i8 v17, i8 v18, i8 v19, i8 v20, i8 v21, i8 v22, i8 v23,
+ i8 v24, i8 v25, i8 v26, i8 v27, i8 v28, i8 v29, i8 v30, i8 v31
+ ) : TSimd8(_mm256_setr_epi8(
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15,
+ v16,v17,v18,v19,v20,v21,v22,v23,
+ v24,v25,v26,v27,v28,v29,v30,v31
+ ))
+ {
+ }
+
+ inline static TSimd8<i8> Repeat16(
+ i8 v0, i8 v1, i8 v2, i8 v3, i8 v4, i8 v5, i8 v6, i8 v7,
+ i8 v8, i8 v9, i8 v10, i8 v11, i8 v12, i8 v13, i8 v14, i8 v15
+ ) {
+ return TSimd8<i8>(
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15,
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15
+ );
+ }
+
+ inline TSimd8<i8> MaxValue(const TSimd8<i8> other) const {
+ return _mm256_max_epi8(*this, other);
+ }
+ inline TSimd8<i8> MinValue(const TSimd8<i8> other) const {
+ return _mm256_min_epi8(*this, other);
+ }
+ inline TSimd8<bool> operator>(const TSimd8<i8> other) const {
+ return _mm256_cmpgt_epi8(*this, other);
+ }
+ inline TSimd8<bool> operator<(const TSimd8<i8> other) const {
+ return _mm256_cmpgt_epi8(other, *this);
+ }
+};
+
+template<>
+struct TSimd8<ui8>: TBase8Numeric<ui8> {
+ inline TSimd8()
+ : TBase8Numeric<ui8>()
+ {
+ }
+ inline TSimd8(const __m256i _value)
+ : TBase8Numeric<ui8>(_value)
+ {
+ }
+ inline TSimd8(ui8 _value)
+ : TSimd8(Set(_value))
+ {
+ }
+ inline TSimd8(const ui8 values[32])
+ : TSimd8(Load(values))
+ {
+ }
+ inline TSimd8(
+ ui8 v0, ui8 v1, ui8 v2, ui8 v3, ui8 v4, ui8 v5, ui8 v6, ui8 v7,
+ ui8 v8, ui8 v9, ui8 v10, ui8 v11, ui8 v12, ui8 v13, ui8 v14, ui8 v15,
+ ui8 v16, ui8 v17, ui8 v18, ui8 v19, ui8 v20, ui8 v21, ui8 v22, ui8 v23,
+ ui8 v24, ui8 v25, ui8 v26, ui8 v27, ui8 v28, ui8 v29, ui8 v30, ui8 v31
+ ) : TSimd8(_mm256_setr_epi8(
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15,
+ v16,v17,v18,v19,v20,v21,v22,v23,
+ v24,v25,v26,v27,v28,v29,v30,v31
+ )) {}
+
+ inline static TSimd8<ui8> Repeat16(
+ ui8 v0, ui8 v1, ui8 v2, ui8 v3, ui8 v4, ui8 v5, ui8 v6, ui8 v7,
+ ui8 v8, ui8 v9, ui8 v10, ui8 v11, ui8 v12, ui8 v13, ui8 v14, ui8 v15
+ ) {
+ return TSimd8<ui8>(
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15,
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15
+ );
+ }
+
+ inline TSimd8<ui8> MaxValue(const TSimd8<ui8> other) const {
+ return _mm256_max_epu8(*this, other);
+ }
+ inline TSimd8<ui8> MinValue(const TSimd8<ui8> other) const {
+ return _mm256_min_epu8(other, *this);
+ }
+ inline TSimd8<bool> operator<=(const TSimd8<ui8> other) const {
+ return other.MaxValue(*this) == other;
+ }
+ inline TSimd8<bool> operator>=(const TSimd8<ui8> other) const {
+ return other.MinValue(*this) == other;
+ }
+
+ inline TSimd8<bool> BitsNotSet() const {
+ return *this == ui8(0);
+ }
+ inline TSimd8<bool> AnyBitsSet() const {
+ return ~this->BitsNotSet();
+ }
+ inline bool BitsNotSetAnywhere() const {
+ return _mm256_testz_si256(*this, *this);
+ }
+ inline bool AnyBitsSetAnywhere() const {
+ return !BitsNotSetAnywhere();
+ }
+ inline bool BitsNotSetAnywhere(TSimd8<ui8> bits) const {
+ return _mm256_testz_si256(*this, bits);
+ }
+ inline bool AnyBitsSetAnywhere(TSimd8<ui8> bits) const {
+ return !BitsNotSetAnywhere(bits);
+ }
+
+ template<int N>
+ inline TSimd8<ui8> Shr() const {
+ return TSimd8<ui8>(_mm256_srli_epi16(*this, N)) & ui8(0xFFu >> N);
+ }
+ template<int N>
+ inline TSimd8<ui8> Shl() const {
+ return TSimd8<ui8>(_mm256_slli_epi16(*this, N)) & ui8(0xFFu << N);
+ }
+
+ template<int N>
+ inline int GetBit() const {
+ return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N));
+ }
+};
+
+}
+}
+}
+}
+} \ No newline at end of file
diff --git a/ydb/library/yql/minikql/comp_nodes/block_join/sse42/begin.h b/ydb/library/yql/minikql/comp_nodes/block_join/sse42/begin.h
new file mode 100644
index 00000000000..00fffa0930b
--- /dev/null
+++ b/ydb/library/yql/minikql/comp_nodes/block_join/sse42/begin.h
@@ -0,0 +1,3 @@
+#pragma clang attribute push(__attribute__((target("sse4.2"))), apply_to=function)
+
+#include "simd.h" \ No newline at end of file
diff --git a/ydb/library/yql/minikql/comp_nodes/block_join/sse42/end.h b/ydb/library/yql/minikql/comp_nodes/block_join/sse42/end.h
new file mode 100644
index 00000000000..fcef763036d
--- /dev/null
+++ b/ydb/library/yql/minikql/comp_nodes/block_join/sse42/end.h
@@ -0,0 +1 @@
+#pragma clang attribute pop \ No newline at end of file
diff --git a/ydb/library/yql/minikql/comp_nodes/block_join/sse42/simd.h b/ydb/library/yql/minikql/comp_nodes/block_join/sse42/simd.h
new file mode 100644
index 00000000000..71201ce40a7
--- /dev/null
+++ b/ydb/library/yql/minikql/comp_nodes/block_join/sse42/simd.h
@@ -0,0 +1,310 @@
+#pragma once
+
+#include <cstdint>
+#include <immintrin.h>
+
+namespace NKikimr {
+namespace NMiniKQL {
+namespace NBlockJoin {
+namespace NSSE42 {
+namespace NSIMD {
+
+template <typename T>
+struct TSimd8;
+
+template<typename Child>
+struct TBase {
+ __m128i Value;
+
+ inline TBase()
+ : Value{__m128i()} {
+ }
+
+ inline TBase(const __m128i value)
+ : Value(value) {
+ }
+
+ inline operator const __m128i&() const {
+ return this->Value;
+ }
+ inline operator __m128i&() {
+ return this->Value;
+ }
+
+ inline Child operator|(const Child other) const {
+ return _mm_or_si128(*this, other);
+ }
+ inline Child operator&(const Child other) const {
+ return _mm_and_si128(*this, other);
+ }
+ inline Child operator^(const Child other) const {
+ return _mm_xor_si128(*this, other);
+ }
+ inline Child BitAndNot(const Child other) const {
+ return _mm_andnot_si128(*this, other);
+ };
+ inline Child& operator|=(const Child other) {
+ auto cast = static_cast<Child*>(*this);
+ *cast = *cast | other;
+ return *cast;
+ }
+ inline Child& operator&=(const Child other) {
+ auto cast = static_cast<Child*>(*this);
+ *cast = *cast & other;
+ return *cast;
+ };
+ inline Child& operator^=(const Child other) {
+ auto cast = static_cast<Child*>(*this);
+ *cast = *cast ^ other;
+ return *cast;
+ };
+};
+
+template<typename T, typename Mask=TSimd8<bool>>
+struct TBase8: TBase<TSimd8<T>> {
+
+ inline TBase8()
+ : TBase<TSimd8<T>>()
+ {
+ }
+
+ inline TBase8(const __m128i _value)
+ : TBase<TSimd8<T>>(_value)
+ {
+ }
+
+ friend inline Mask operator==(const TSimd8<T> lhs, const TSimd8<T> rhs) {
+ return _mm_cmpeq_epi8(lhs, rhs);
+ }
+
+ static const int SIZE = sizeof(TBase<T>::Value);
+};
+
+template<>
+struct TSimd8<bool>: TBase8<bool> {
+
+ inline TSimd8<bool>()
+ : TBase8()
+ {
+ }
+
+ inline TSimd8<bool>(const __m128i value)
+ : TBase8<bool>(value)
+ {
+ }
+
+ inline TSimd8<bool>(bool value)
+ : TBase8<bool>(Set(value))
+ {
+ }
+
+ static inline TSimd8<bool> Set(bool value) {
+ return _mm_set1_epi8(ui8(-(!!value)));
+ }
+
+ inline bool Any() const {
+ return !_mm_testz_si128(*this, *this);
+ }
+
+ inline TSimd8<bool> operator~() const {
+ return *this ^ true;
+ }
+};
+
+template<typename T>
+struct TBase8Numeric: TBase8<T> {
+
+ inline TBase8Numeric()
+ : TBase8<T>()
+ {
+ }
+ inline TBase8Numeric(const __m128i value)
+ : TBase8<T>(value)
+ {
+ }
+
+ static inline TSimd8<T> Set(T value) {
+ return _mm_set1_epi8(value);
+ }
+ static inline TSimd8<T> Zero() {
+ return _mm_setzero_si128();
+ }
+ static inline TSimd8<T> Load(const T values[16]) {
+ return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
+ }
+
+ static inline TSimd8<T> Repeat16(
+ T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
+ ) {
+ return TSimd8<T>(
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15
+ );
+ }
+
+ inline void Store(T dst[16]) const {
+ return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this);
+ }
+
+ inline TSimd8<T> operator+(const TSimd8<T> other) const {
+ return _mm_add_epi8(*this, other);
+ }
+ inline TSimd8<T> operator-(const TSimd8<T> other) const {
+ return _mm_sub_epi8(*this, other);
+ }
+ inline TSimd8<T>& operator+=(const TSimd8<T> other) {
+ *this = *this + other;
+ return *static_cast<TSimd8<T>*>(this);
+ }
+ inline TSimd8<T>& operator-=(const TSimd8<T> other) {
+ *this = *this - other;
+ return *static_cast<TSimd8<T>*>(this);
+ }
+
+ // 0xFFu = 11111111 = 2^8 - 1
+ inline TSimd8<T> operator~() const {
+ return *this ^ 0xFFu;
+ }
+};
+
+template<>
+struct TSimd8<i8> : TBase8Numeric<i8> {
+ inline TSimd8()
+ : TBase8Numeric<i8>()
+ {
+ }
+ inline TSimd8(const __m128i value)
+ : TBase8Numeric<i8>(value)
+ {
+ }
+ inline TSimd8(i8 value)
+ : TSimd8(Set(value))
+ {
+ }
+ inline TSimd8(const i8 values[16])
+ : TSimd8(Load(values))
+ {
+ }
+ inline TSimd8(
+ i8 v0, i8 v1, i8 v2, i8 v3, i8 v4, i8 v5, i8 v6, i8 v7,
+ i8 v8, i8 v9, i8 v10, i8 v11, i8 v12, i8 v13, i8 v14, i8 v15
+ ) : TSimd8(_mm_setr_epi8(
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15
+ ))
+ {
+ }
+
+ inline static TSimd8<i8> Repeat16(
+ i8 v0, i8 v1, i8 v2, i8 v3, i8 v4, i8 v5, i8 v6, i8 v7,
+ i8 v8, i8 v9, i8 v10, i8 v11, i8 v12, i8 v13, i8 v14, i8 v15
+ ) {
+ return TSimd8<i8>(
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15
+ );
+ }
+
+ inline TSimd8<i8> MaxValue(const TSimd8<i8> other) const {
+ return _mm_max_epi8(*this, other);
+ }
+ inline TSimd8<i8> MinValue(const TSimd8<i8> other) const {
+ return _mm_min_epi8(*this, other);
+ }
+ inline TSimd8<bool> operator>(const TSimd8<i8> other) const {
+ return _mm_cmpgt_epi8(*this, other);
+ }
+ inline TSimd8<bool> operator<(const TSimd8<i8> other) const {
+ return _mm_cmpgt_epi8(other, *this);
+ }
+};
+
+template<>
+struct TSimd8<ui8>: TBase8Numeric<ui8> {
+ inline TSimd8()
+ : TBase8Numeric<ui8>()
+ {
+ }
+ inline TSimd8(const __m128i _value)
+ : TBase8Numeric<ui8>(_value)
+ {
+ }
+ inline TSimd8(ui8 _value)
+ : TSimd8(Set(_value))
+ {
+ }
+ inline TSimd8(const ui8 values[16])
+ : TSimd8(Load(values))
+ {
+ }
+ inline TSimd8(
+ ui8 v0, ui8 v1, ui8 v2, ui8 v3, ui8 v4, ui8 v5, ui8 v6, ui8 v7,
+ ui8 v8, ui8 v9, ui8 v10, ui8 v11, ui8 v12, ui8 v13, ui8 v14, ui8 v15
+ ) : TSimd8(_mm_setr_epi8(
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15
+ )) {}
+
+ inline static TSimd8<ui8> Repeat16(
+ ui8 v0, ui8 v1, ui8 v2, ui8 v3, ui8 v4, ui8 v5, ui8 v6, ui8 v7,
+ ui8 v8, ui8 v9, ui8 v10, ui8 v11, ui8 v12, ui8 v13, ui8 v14, ui8 v15
+ ) {
+ return TSimd8<ui8>(
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10,v11,v12,v13,v14,v15
+ );
+ }
+
+ inline TSimd8<ui8> MaxValue(const TSimd8<ui8> other) const {
+ return _mm_max_epu8(*this, other);
+ }
+ inline TSimd8<ui8> MinValue(const TSimd8<ui8> other) const {
+ return _mm_min_epu8(other, *this);
+ }
+ inline TSimd8<bool> operator<=(const TSimd8<ui8> other) const {
+ return other.MaxValue(*this) == other;
+ }
+ inline TSimd8<bool> operator>=(const TSimd8<ui8> other) const {
+ return other.MinValue(*this) == other;
+ }
+
+ inline TSimd8<bool> BitsNotSet() const {
+ return *this == ui8(0);
+ }
+ inline TSimd8<bool> AnyBitsSet() const {
+ return ~this->BitsNotSet();
+ }
+ inline bool BitsNotSetAnywhere() const {
+ return _mm_testz_si128(*this, *this);
+ }
+ inline bool AnyBitsSetAnywhere() const {
+ return !BitsNotSetAnywhere();
+ }
+ inline bool BitsNotSetAnywhere(TSimd8<ui8> bits) const {
+ return _mm_testz_si128(*this, bits);
+ }
+ inline bool AnyBitsSetAnywhere(TSimd8<ui8> bits) const {
+ return !BitsNotSetAnywhere(bits);
+ }
+
+ template<int N>
+ inline TSimd8<ui8> Shr() const {
+ return TSimd8<ui8>(_mm_srli_epi16(*this, N)) & ui8(0xFFu >> N);
+ }
+ template<int N>
+ inline TSimd8<ui8> Shl() const {
+ return TSimd8<ui8>(_mm_slli_epi16(*this, N)) & ui8(0xFFu << N);
+ }
+
+ template<int N>
+ inline int GetBit() const {
+ return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N));
+ }
+};
+
+}
+}
+}
+}
+} \ No newline at end of file
diff --git a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-x86_64.txt b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-x86_64.txt
index 8d2f0c72220..46f14d57bbc 100644
--- a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-x86_64.txt
+++ b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-x86_64.txt
@@ -66,6 +66,7 @@ target_sources(ydb-library-yql-minikql-comp_nodes-ut PRIVATE
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_safe_circular_buffer_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_sort_ut.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_simd_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_switch_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_todict_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_variant_ut.cpp
diff --git a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-aarch64.txt b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-aarch64.txt
index 36c36db8dd0..51201f10e6c 100644
--- a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-aarch64.txt
+++ b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-aarch64.txt
@@ -69,6 +69,7 @@ target_sources(ydb-library-yql-minikql-comp_nodes-ut PRIVATE
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_safe_circular_buffer_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_sort_ut.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_simd_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_switch_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_todict_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_variant_ut.cpp
diff --git a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-x86_64.txt b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-x86_64.txt
index 3ba4890e0ce..8a15c2fc377 100644
--- a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-x86_64.txt
+++ b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-x86_64.txt
@@ -70,6 +70,7 @@ target_sources(ydb-library-yql-minikql-comp_nodes-ut PRIVATE
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_safe_circular_buffer_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_sort_ut.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_simd_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_switch_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_todict_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_variant_ut.cpp
diff --git a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.windows-x86_64.txt b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.windows-x86_64.txt
index e7c59de3ad1..d2c5ed73e1e 100644
--- a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.windows-x86_64.txt
+++ b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.windows-x86_64.txt
@@ -59,6 +59,7 @@ target_sources(ydb-library-yql-minikql-comp_nodes-ut PRIVATE
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_safe_circular_buffer_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_sort_ut.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_simd_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_switch_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_todict_ut.cpp
${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_variant_ut.cpp
diff --git a/ydb/library/yql/minikql/comp_nodes/ut/mkql_simd_ut.cpp b/ydb/library/yql/minikql/comp_nodes/ut/mkql_simd_ut.cpp
new file mode 100644
index 00000000000..70ebcb8a22c
--- /dev/null
+++ b/ydb/library/yql/minikql/comp_nodes/ut/mkql_simd_ut.cpp
@@ -0,0 +1,176 @@
+#include <library/cpp/testing/unittest/registar.h>
+#include <util/system/cpu_id.h>
+
+#if __AVX2__
+#include <ydb/library/yql/minikql/comp_nodes/block_join/avx2/begin.h>
+Y_UNIT_TEST_SUITE(TMiniKQLBlockJoinHaswell) {
+ using namespace NKikimr::NMiniKQL::NBlockJoin::NAVX2::NSIMD;
+ Y_UNIT_TEST(SimdBool) {
+ TSimd8<bool> tr(true);
+ TSimd8<bool> fal(false);
+ UNIT_ASSERT_EQUAL(tr.Any(), true);
+ UNIT_ASSERT_EQUAL(fal.Any(), false);
+ UNIT_ASSERT_UNEQUAL(tr.Any(), fal.Any());
+ UNIT_ASSERT_EQUAL(tr.Any(), (tr ^ fal).Any());
+ UNIT_ASSERT_EQUAL(fal.Any(), (tr ^ tr).Any());
+ UNIT_ASSERT_EQUAL(fal.Any(), (tr & fal).Any());
+ UNIT_ASSERT_EQUAL((~tr).Any(), fal.Any());
+ UNIT_ASSERT_EQUAL((~fal).Any(), tr.Any());
+
+ TSimd8<bool> bit_or = tr | fal;
+ UNIT_ASSERT_EQUAL(bit_or.Any(), tr.Any());
+
+ TSimd8<bool> tr_m(_mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1));
+ UNIT_ASSERT_EQUAL((tr_m == tr).Any(), TSimd8<bool>(true).Any());
+ }
+ Y_UNIT_TEST(SimdUInt) {
+ __m256i x = _mm256_set1_epi8(0U);
+ uint8_t arr[32];
+ for (auto &i : arr) {
+ i = 0;
+ }
+ TSimd8<uint8_t> a(x), b(arr), c(uint8_t(0));
+ UNIT_ASSERT_EQUAL((a == b).Any(), true);
+ UNIT_ASSERT_EQUAL((b == c).Any(), true);
+ UNIT_ASSERT_EQUAL((c == TSimd8<uint8_t>::Zero()).Any(), true);
+
+ a = TSimd8<uint8_t>(uint8_t(50));
+ b = TSimd8<uint8_t>(uint8_t(49));
+ UNIT_ASSERT_EQUAL((a.MaxValue(b) == a).Any(), true);
+ UNIT_ASSERT_EQUAL((a.MinValue(b) == b).Any(), true);
+ UNIT_ASSERT_EQUAL((a.MinValue(b) == a).Any(), false);
+
+ UNIT_ASSERT_EQUAL(c.BitsNotSet().Any(), true);
+ UNIT_ASSERT_EQUAL(a.BitsNotSet().Any(), false);
+ UNIT_ASSERT_EQUAL(a.AnyBitsSet().Any(), true);
+
+
+ TSimd8<uint8_t> a2(uint8_t(100));
+ TSimd8<uint8_t> a3(uint8_t(25));
+ UNIT_ASSERT_EQUAL((a.Shl<1>() == a2).Any(), true);
+ UNIT_ASSERT_EQUAL((a.Shr<1>() == a3).Any(), true);
+ UNIT_ASSERT_EQUAL((a.Shr<8>() == c).Any(), true);
+ }
+
+ Y_UNIT_TEST(SimdInt) {
+ __m256i x = _mm256_set1_epi8(0);
+ int8_t arr[32];
+ for (auto &i : arr) {
+ i = 0;
+ }
+ TSimd8<int8_t> a(x), b(arr), c(int8_t(0));
+ UNIT_ASSERT_EQUAL((a == b).Any(), true);
+ UNIT_ASSERT_EQUAL((b == c).Any(), true);
+ UNIT_ASSERT_EQUAL((c == TSimd8<int8_t>::Zero()).Any(), true);
+
+ a = TSimd8<int8_t>(int8_t(50));
+ b = TSimd8<int8_t>(int8_t(49));
+ UNIT_ASSERT_EQUAL((a.MaxValue(b) == a).Any(), true);
+ UNIT_ASSERT_EQUAL((a.MinValue(b) == b).Any(), true);
+ UNIT_ASSERT_EQUAL((a.MinValue(b) == a).Any(), false);
+
+
+ TSimd8<int8_t> a2(int8_t(5));
+ TSimd8<int8_t> a3(int8_t(25));
+ a = TSimd8<int8_t>(int8_t(15));
+ b = TSimd8<int8_t>(int8_t(10));
+ UNIT_ASSERT_EQUAL(((a + b) == a3).Any(), true);
+ UNIT_ASSERT_EQUAL(((a - b) == a2).Any(), true);
+ }
+}
+
+#include <ydb/library/yql/minikql/comp_nodes/block_join/avx2/end.h>
+#else
+Y_UNIT_TEST_SUITE(TMiniKQLBlockJoinHaswell) {
+ Y_UNIT_TEST(SimdBool) {}
+ Y_UNIT_TEST(SimdUInt) {}
+ Y_UNIT_TEST(SimdInt) {}
+}
+#endif
+
+#if __SSE4_2__
+#include <ydb/library/yql/minikql/comp_nodes/block_join/sse42/begin.h>
+Y_UNIT_TEST_SUITE(TMiniKQLBlockJoinWestmere) {
+ using namespace NKikimr::NMiniKQL::NBlockJoin::NSSE42::NSIMD;
+ Y_UNIT_TEST(SimdBool) {
+
+ TSimd8<bool> tr(true);
+ TSimd8<bool> fal(false);
+ UNIT_ASSERT_EQUAL(tr.Any(), true);
+ UNIT_ASSERT_EQUAL(fal.Any(), false);
+ UNIT_ASSERT_UNEQUAL(tr.Any(), fal.Any());
+ UNIT_ASSERT_EQUAL(tr.Any(), (tr ^ fal).Any());
+ UNIT_ASSERT_EQUAL(fal.Any(), (tr ^ tr).Any());
+ UNIT_ASSERT_EQUAL(fal.Any(), (tr & fal).Any());
+ UNIT_ASSERT_EQUAL((~tr).Any(), fal.Any());
+ UNIT_ASSERT_EQUAL((~fal).Any(), tr.Any());
+
+ TSimd8<bool> bit_or = tr | fal;
+ UNIT_ASSERT_EQUAL(bit_or.Any(), tr.Any());
+
+ TSimd8<bool> tr_m(_mm_set_epi32(-1, -1, -1, -1));
+ UNIT_ASSERT_EQUAL((tr_m == tr).Any(), TSimd8<bool>(true).Any());
+ }
+ Y_UNIT_TEST(SimdUInt) {
+ __m128i x = _mm_set1_epi8(0U);
+ uint8_t arr[16];
+ for (auto &i : arr) {
+ i = 0;
+ }
+ TSimd8<uint8_t> a(x), b(arr), c(uint8_t(0));
+ UNIT_ASSERT_EQUAL((a == b).Any(), true);
+ UNIT_ASSERT_EQUAL((b == c).Any(), true);
+ UNIT_ASSERT_EQUAL((c == TSimd8<uint8_t>::Zero()).Any(), true);
+
+ a = TSimd8<uint8_t>(uint8_t(50));
+ b = TSimd8<uint8_t>(uint8_t(49));
+ UNIT_ASSERT_EQUAL((a.MaxValue(b) == a).Any(), true);
+ UNIT_ASSERT_EQUAL((a.MinValue(b) == b).Any(), true);
+ UNIT_ASSERT_EQUAL((a.MinValue(b) == a).Any(), false);
+
+ UNIT_ASSERT_EQUAL(c.BitsNotSet().Any(), true);
+ UNIT_ASSERT_EQUAL(a.BitsNotSet().Any(), false);
+ UNIT_ASSERT_EQUAL(a.AnyBitsSet().Any(), true);
+
+
+ TSimd8<uint8_t> a2(uint8_t(100));
+ TSimd8<uint8_t> a3(uint8_t(25));
+ UNIT_ASSERT_EQUAL((a.Shl<1>() == a2).Any(), true);
+ UNIT_ASSERT_EQUAL((a.Shr<1>() == a3).Any(), true);
+ UNIT_ASSERT_EQUAL((a.Shr<8>() == c).Any(), true);
+ }
+
+ Y_UNIT_TEST(SimdInt) {
+ __m128i x = _mm_set1_epi8(0);
+ int8_t arr[16];
+ for (auto &i : arr) {
+ i = 0;
+ }
+ TSimd8<int8_t> a(x), b(arr), c(int8_t(0));
+ UNIT_ASSERT_EQUAL((a == b).Any(), true);
+ UNIT_ASSERT_EQUAL((b == c).Any(), true);
+ UNIT_ASSERT_EQUAL((c == TSimd8<int8_t>::Zero()).Any(), true);
+
+ a = TSimd8<int8_t>(int8_t(50));
+ b = TSimd8<int8_t>(int8_t(49));
+ UNIT_ASSERT_EQUAL((a.MaxValue(b) == a).Any(), true);
+ UNIT_ASSERT_EQUAL((a.MinValue(b) == b).Any(), true);
+ UNIT_ASSERT_EQUAL((a.MinValue(b) == a).Any(), false);
+
+
+ TSimd8<int8_t> a2(int8_t(5));
+ TSimd8<int8_t> a3(int8_t(25));
+ a = TSimd8<int8_t>(int8_t(15));
+ b = TSimd8<int8_t>(int8_t(10));
+ UNIT_ASSERT_EQUAL(((a + b) == a3).Any(), true);
+ UNIT_ASSERT_EQUAL(((a - b) == a2).Any(), true);
+ }
+}
+#include <ydb/library/yql/minikql/comp_nodes/block_join/sse42/end.h>
+#else
+Y_UNIT_TEST_SUITE(TMiniKQLBlockJoinWestmere) {
+ Y_UNIT_TEST(SimdBool) {}
+ Y_UNIT_TEST(SimdUInt) {}
+ Y_UNIT_TEST(SimdInt) {}
+}
+#endif \ No newline at end of file
diff --git a/ydb/library/yql/minikql/comp_nodes/ut/ya.make b/ydb/library/yql/minikql/comp_nodes/ut/ya.make
index a883fed1a10..9d7ad954076 100644
--- a/ydb/library/yql/minikql/comp_nodes/ut/ya.make
+++ b/ydb/library/yql/minikql/comp_nodes/ut/ya.make
@@ -48,6 +48,7 @@ SRCS(
mkql_match_recognize_nfa_ut.cpp
mkql_safe_circular_buffer_ut.cpp
mkql_sort_ut.cpp
+ mkql_simd_ut.cpp
mkql_switch_ut.cpp
mkql_todict_ut.cpp
mkql_variant_ut.cpp