diff options
author | vvvv <vvvv@yandex-team.com> | 2024-11-09 03:48:37 +0300 |
---|---|---|
committer | Vitaly Stoyan <vvvv@ydb.tech> | 2024-11-09 12:02:05 +0300 |
commit | ea0c2bd5b40e0be0147c40092f9d270ec11d015a (patch) | |
tree | ea1c768caad5d542e2ff8740e8ce5b55840037c4 /yql/essentials/minikql | |
parent | 2d0e7498c5e5f795c1a040623052b112691fac7e (diff) | |
download | ydb-ea0c2bd5b40e0be0147c40092f9d270ec11d015a.tar.gz |
Tune YDB <-> YQL deps
init
commit_hash:16572ab4e94aea4f7455c2ccb90b70ea99a412db
Diffstat (limited to 'yql/essentials/minikql')
16 files changed, 16 insertions, 2592 deletions
diff --git a/yql/essentials/minikql/comp_nodes/packed_tuple/hashes_calc.h b/yql/essentials/minikql/comp_nodes/packed_tuple/hashes_calc.h deleted file mode 100644 index a33679e4e5..0000000000 --- a/yql/essentials/minikql/comp_nodes/packed_tuple/hashes_calc.h +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once - -#include <contrib/ydb/library/yql/utils/simd/simd.h> - -namespace NKikimr { -namespace NMiniKQL { -namespace NPackedTuple { - - - -template <typename TTraits> -inline ui32 CalculateCRC32(const ui8 * data, ui32 size, ui32 hash = 0 ) { - - using TSimdI8 = typename TTraits::TSimdI8; - - while (size >= 8) { - hash = TSimdI8::CRC32u64(hash, ReadUnaligned<ui64>(data)); - size -= 8; - data += 8; - } - - switch(size) { - case 7: - hash = TSimdI8::CRC32u32(hash, ReadUnaligned<ui32>(data)); - data += 4; - [[fallthrough]]; - case 3: - hash = TSimdI8::CRC32u16(hash, ReadUnaligned<ui16>(data)); - data += 2; - [[fallthrough]]; - case 1: - hash = TSimdI8::CRC32u8(hash, ReadUnaligned<ui8>(data)); - break; - case 6: - hash = TSimdI8::CRC32u32(hash, ReadUnaligned<ui32>(data)); - data += 4; - [[fallthrough]]; - case 2: - hash = TSimdI8::CRC32u16(hash, ReadUnaligned<ui16>(data)); - break; - case 5: - hash = TSimdI8::CRC32u32(hash, ReadUnaligned<ui32>(data)); - data += 4; - hash = TSimdI8::CRC32u8(hash, ReadUnaligned<ui8>(data)); - break; - case 4: - hash = TSimdI8::CRC32u32(hash, ReadUnaligned<ui32>(data)); - break; - case 0: - break; - } - return hash; - -} -template -__attribute__((target("avx2"))) -ui32 CalculateCRC32<NSimd::TSimdAVX2Traits>(const ui8 * data, ui32 size, ui32 hash = 0 ); -template -__attribute__((target("sse4.2"))) -ui32 CalculateCRC32<NSimd::TSimdSSE42Traits>(const ui8 * data, ui32 size, ui32 hash = 0 ); -} - -} - -} diff --git a/yql/essentials/minikql/comp_nodes/packed_tuple/packed_tuple_ut.cpp b/yql/essentials/minikql/comp_nodes/packed_tuple/packed_tuple_ut.cpp deleted file mode 100644 index 25ac2a46c3..0000000000 --- a/yql/essentials/minikql/comp_nodes/packed_tuple/packed_tuple_ut.cpp +++ /dev/null @@ -1,899 +0,0 @@ -#include <yql/essentials/minikql/mkql_runtime_version.h> -#include <yql/essentials/minikql/comp_nodes/ut/mkql_computation_node_ut.h> -#include <library/cpp/testing/unittest/registar.h> - -#include <chrono> -#include <vector> -#include <set> -#include <random> - -#include <util/system/fs.h> -#include <util/system/compiler.h> -#include <util/stream/null.h> -#include <util/system/mem_info.h> - -#include <yql/essentials/minikql/comp_nodes/packed_tuple/hashes_calc.h> -#include <yql/essentials/minikql/comp_nodes/packed_tuple/tuple.h> - -#include <yql/essentials/minikql/comp_nodes/mkql_rh_hash.h> - -namespace NKikimr { -namespace NMiniKQL { -namespace NPackedTuple { - -using namespace std::chrono_literals; - -static volatile bool IsVerbose = false; -#define CTEST (IsVerbose ? Cerr : Cnull) - -namespace { - -template <typename TTraits> -void TestCalculateCRC32_Impl() { - std::mt19937_64 rng; // fixed-seed (0) prng - std::vector<ui64> v(1024); - std::generate(v.begin(), v.end(), rng); - - ui64 nanoseconds = 0; - ui64 totalBytes = 0; - ui32 hash = 0; - for (ui32 test = 0; test < 65535; ++test) { - ui32 bytes = rng() % (sizeof(v[0])*v.size()); - - std::chrono::steady_clock::time_point begin01 = std::chrono::steady_clock::now(); - hash = CalculateCRC32<TTraits>((const ui8 *) v.data(), bytes, hash); - std::chrono::steady_clock::time_point end01 = std::chrono::steady_clock::now(); - - nanoseconds += std::chrono::duration_cast<std::chrono::nanoseconds>(end01 - begin01).count(); - totalBytes += bytes; - } - CTEST << "Hash: " << hash << Endl; - UNIT_ASSERT_VALUES_EQUAL(hash, 80113928); - CTEST << "Data Size: " << totalBytes << Endl; - CTEST << "Time for hash: " << ((nanoseconds + 999)/1000) << "[microseconds]" << Endl; - CTEST << "Calculating speed: " << totalBytes / ((nanoseconds + 999)/1000) << "MB/sec" << Endl; -} -} - -Y_UNIT_TEST_SUITE(TestHash) { - -Y_UNIT_TEST(TestCalculateCRC32Fallback) { - TestCalculateCRC32_Impl<NSimd::TSimdFallbackTraits>(); -} - -Y_UNIT_TEST(TestCalculateCRC32SSE42) { - if (NX86::HaveSSE42()) - TestCalculateCRC32_Impl<NSimd::TSimdSSE42Traits>(); - else - CTEST << "Skipped SSE42 test\n"; -} - -Y_UNIT_TEST(TestCalculateCRC32AVX2) { - if (NX86::HaveAVX2()) - TestCalculateCRC32_Impl<NSimd::TSimdAVX2Traits>(); - else - CTEST << "Skipped AVX2 test\n"; -} - -} - -Y_UNIT_TEST_SUITE(TupleLayout) { -Y_UNIT_TEST(CreateLayout) { - - TColumnDesc kc1, kc2, pc1, pc2, pc3; - - kc1.Role = EColumnRole::Key; - kc1.DataSize = 8; - - kc2.Role = EColumnRole::Key; - kc2.DataSize = 4; - - pc1.Role = EColumnRole::Payload; - pc1.DataSize = 16; - - pc2.Role = EColumnRole::Payload; - pc2.DataSize = 4; - - pc3.Role = EColumnRole::Payload; - pc3.DataSize = 8; - - std::vector<TColumnDesc> columns{kc1, kc2, pc1, pc2, pc3}; - - auto tl = TTupleLayout::Create(columns); - UNIT_ASSERT(tl->TotalRowSize == 45); -} - -Y_UNIT_TEST(Pack) { - - TScopedAlloc alloc(__LOCATION__); - - TColumnDesc kc1, kc2, pc1, pc2; - - kc1.Role = EColumnRole::Key; - kc1.DataSize = 8; - - kc2.Role = EColumnRole::Key; - kc2.DataSize = 4; - - pc1.Role = EColumnRole::Payload; - pc1.DataSize = 8; - - pc2.Role = EColumnRole::Payload; - pc2.DataSize = 4; - - std::vector<TColumnDesc> columns{kc1, kc2, pc1, pc2}; - - auto tl = TTupleLayout::Create(columns); - UNIT_ASSERT(tl->TotalRowSize == 29); - - const ui64 NTuples1 = 10e6; - - const ui64 Tuples1DataBytes = (tl->TotalRowSize) * NTuples1; - - std::vector<ui64> col1(NTuples1, 0); - std::vector<ui32> col2(NTuples1, 0); - std::vector<ui64> col3(NTuples1, 0); - std::vector<ui32> col4(NTuples1, 0); - - std::vector<ui8> res(Tuples1DataBytes + 64, 0); - - for (ui32 i = 0; i < NTuples1; ++i) { - col1[i] = i; - col2[i] = i; - col3[i] = i; - col4[i] = i; - } - - const ui8* cols[4]; - - cols[0] = (ui8*) col1.data(); - cols[1] = (ui8*) col2.data(); - cols[2] = (ui8*) col3.data(); - cols[3] = (ui8*) col4.data(); - - std::chrono::steady_clock::time_point begin02 = std::chrono::steady_clock::now(); - - std::vector<ui8> colValid1((NTuples1 + 7)/8, ~0); - std::vector<ui8> colValid2((NTuples1 + 7)/8, ~0); - std::vector<ui8> colValid3((NTuples1 + 7)/8, ~0); - std::vector<ui8> colValid4((NTuples1 + 7)/8, ~0); - const ui8 *colsValid[4] = { - colValid1.data(), - colValid2.data(), - colValid3.data(), - colValid4.data(), - }; - - std::vector<ui8, TMKQLAllocator<ui8>> overflow; - tl->Pack(cols, colsValid, res.data(), overflow, 0, NTuples1); - std::chrono::steady_clock::time_point end02 = std::chrono::steady_clock::now(); - ui64 microseconds = std::chrono::duration_cast<std::chrono::microseconds>(end02 - begin02).count(); - if (microseconds == 0) microseconds = 1; - - CTEST << "Time for " << (NTuples1) << " transpose (external cycle)= " << microseconds << "[microseconds]" << Endl; - CTEST << "Data size = " << Tuples1DataBytes / (1024 * 1024) << "[MB]" << Endl; - CTEST << "Calculating speed = " << Tuples1DataBytes / microseconds << "MB/sec" << Endl; - CTEST << Endl; - - UNIT_ASSERT(true); - -} - -Y_UNIT_TEST(Unpack) { - - TScopedAlloc alloc(__LOCATION__); - - TColumnDesc kc1, kc2, pc1, pc2; - - kc1.Role = EColumnRole::Key; - kc1.DataSize = 8; - - kc2.Role = EColumnRole::Key; - kc2.DataSize = 4; - - pc1.Role = EColumnRole::Payload; - pc1.DataSize = 8; - - pc2.Role = EColumnRole::Payload; - pc2.DataSize = 4; - - std::vector<TColumnDesc> columns{kc1, kc2, pc1, pc2}; - - auto tl = TTupleLayout::Create(columns); - UNIT_ASSERT(tl->TotalRowSize == 29); - - const ui64 NTuples1 = 10e6; - - const ui64 Tuples1DataBytes = (tl->TotalRowSize) * NTuples1; - - std::vector<ui64> col1(NTuples1, 0); - std::vector<ui32> col2(NTuples1, 0); - std::vector<ui64> col3(NTuples1, 0); - std::vector<ui32> col4(NTuples1, 0); - - std::vector<ui8> res(Tuples1DataBytes + 64, 0); - - for (ui32 i = 0; i < NTuples1; ++i) { - col1[i] = i; - col2[i] = i; - col3[i] = i; - col4[i] = i; - } - - const ui8* cols[4]; - - cols[0] = (ui8*) col1.data(); - cols[1] = (ui8*) col2.data(); - cols[2] = (ui8*) col3.data(); - cols[3] = (ui8*) col4.data(); - - std::vector<ui8> colValid1((NTuples1 + 7)/8, ~0); - std::vector<ui8> colValid2((NTuples1 + 7)/8, ~0); - std::vector<ui8> colValid3((NTuples1 + 7)/8, ~0); - std::vector<ui8> colValid4((NTuples1 + 7)/8, ~0); - const ui8 *colsValid[4] = { - colValid1.data(), - colValid2.data(), - colValid3.data(), - colValid4.data(), - }; - - std::vector<ui8, TMKQLAllocator<ui8>> overflow; - tl->Pack(cols, colsValid, res.data(), overflow, 0, NTuples1); - - std::vector<ui64> col1_new(NTuples1, 0); - std::vector<ui32> col2_new(NTuples1, 0); - std::vector<ui64> col3_new(NTuples1, 0); - std::vector<ui32> col4_new(NTuples1, 0); - - ui8* cols_new[4]; - cols_new[0] = (ui8*) col1_new.data(); - cols_new[1] = (ui8*) col2_new.data(); - cols_new[2] = (ui8*) col3_new.data(); - cols_new[3] = (ui8*) col4_new.data(); - - std::vector<ui8> colValid1_new((NTuples1 + 7)/8, 0); - std::vector<ui8> colValid2_new((NTuples1 + 7)/8, 0); - std::vector<ui8> colValid3_new((NTuples1 + 7)/8, 0); - std::vector<ui8> colValid4_new((NTuples1 + 7)/8, 0); - - ui8 *colsValid_new[4] = { - colValid1_new.data(), - colValid2_new.data(), - colValid3_new.data(), - colValid4_new.data(), - }; - - std::chrono::steady_clock::time_point begin02 = std::chrono::steady_clock::now(); - tl->Unpack(cols_new, colsValid_new, res.data(), overflow, 0, NTuples1); - std::chrono::steady_clock::time_point end02 = std::chrono::steady_clock::now(); - ui64 microseconds = std::chrono::duration_cast<std::chrono::microseconds>(end02 - begin02).count(); - - if (microseconds == 0) microseconds = 1; - - CTEST << "Time for " << (NTuples1) << " transpose (external cycle)= " << microseconds << "[microseconds]" << Endl; - CTEST << "Data size = " << Tuples1DataBytes / (1024 * 1024) << "[MB]" << Endl; - CTEST << "Calculating speed = " << Tuples1DataBytes / microseconds << "MB/sec" << Endl; - CTEST << Endl; - - UNIT_ASSERT(std::memcmp(col1.data(), col1_new.data(), sizeof(ui64) * col1.size()) == 0); - UNIT_ASSERT(std::memcmp(col2.data(), col2_new.data(), sizeof(ui32) * col2.size()) == 0); - UNIT_ASSERT(std::memcmp(col3.data(), col3_new.data(), sizeof(ui64) * col3.size()) == 0); - UNIT_ASSERT(std::memcmp(col4.data(), col4_new.data(), sizeof(ui32) * col4.size()) == 0); - - UNIT_ASSERT(std::memcmp(colValid1.data(), colValid1_new.data(), colValid1.size()) == 0); - UNIT_ASSERT(std::memcmp(colValid2.data(), colValid2_new.data(), colValid2.size()) == 0); - UNIT_ASSERT(std::memcmp(colValid3.data(), colValid3_new.data(), colValid3.size()) == 0); - UNIT_ASSERT(std::memcmp(colValid4.data(), colValid4_new.data(), colValid4.size()) == 0); -} - -Y_UNIT_TEST(PackVarSize) { - - TScopedAlloc alloc(__LOCATION__); - - TColumnDesc kc1, kcv1, kcv2, kc2, pc1, pc2; - - kc1.Role = EColumnRole::Key; - kc1.DataSize = 8; - - kc2.Role = EColumnRole::Key; - kc2.DataSize = 4; - - pc1.Role = EColumnRole::Payload; - pc1.DataSize = 8; - - pc2.Role = EColumnRole::Payload; - pc2.DataSize = 4; - - kcv1.Role = EColumnRole::Key; - kcv1.DataSize = 8; - kcv1.SizeType = EColumnSizeType::Variable; - - kcv2.Role = EColumnRole::Key; - kcv2.DataSize = 16; - kcv2.SizeType = EColumnSizeType::Variable; - - pc1.Role = EColumnRole::Payload; - pc1.DataSize = 8; - - pc2.Role = EColumnRole::Payload; - pc2.DataSize = 4; - - std::vector<TColumnDesc> columns{kc1, kc2, kcv1, kcv2, pc1, pc2}; - - auto tl = TTupleLayout::Create(columns); - CTEST << "TotalRowSize = " << tl->TotalRowSize << Endl; - UNIT_ASSERT_VALUES_EQUAL(tl->TotalRowSize, 54); - - const ui64 NTuples1 = 3; - - const ui64 Tuples1DataBytes = (tl->TotalRowSize) * NTuples1; - - std::vector<ui64> col1(NTuples1, 0); - std::vector<ui32> col2(NTuples1, 0); - std::vector<ui64> col3(NTuples1, 0); - std::vector<ui32> col4(NTuples1, 0); - - std::vector<ui32> vcol1(1, 0); - - std::vector<ui8> vcol1data; - std::vector<ui32> vcol2(1, 0); - std::vector<ui8> vcol2data; - - std::vector<ui8> res(Tuples1DataBytes + 64, 0); - std::vector<TString> vcol1str { - "abc", - "ABCDEFGHIJKLMNO", - "ZYXWVUTSPR" - }; - std::vector<TString> vcol2str { - "ABC", - "abcdefghijklmno", - "zyxwvutspr" - }; - for (auto &&str: vcol1str) { - for (auto c: str) - vcol1data.push_back(c); - vcol1.push_back(vcol1data.size()); - } - UNIT_ASSERT_VALUES_EQUAL(vcol1.size(), NTuples1 + 1); - for (auto &&str: vcol2str) { - for (auto c: str) - vcol2data.push_back(c); - vcol2.push_back(vcol2data.size()); - } - UNIT_ASSERT_VALUES_EQUAL(vcol2.size(), NTuples1 + 1); - for (ui32 i = 0; i < NTuples1; ++i) { - col1[i] = (1ull<<(sizeof(col1[0])*8 - 4)) + i + 1; - col2[i] = (2ull<<(sizeof(col2[0])*8 - 4)) + i + 1; - col3[i] = (3ull<<(sizeof(col3[0])*8 - 4)) + i + 1; - col4[i] = (4ull<<(sizeof(col4[0])*8 - 4)) + i + 1; - } - - const ui8* cols[4 + 2*2]; - - cols[0] = (ui8*) col1.data(); - cols[1] = (ui8*) col2.data(); - cols[2] = (ui8*) vcol1.data(); - cols[3] = (ui8*) vcol1data.data(); - cols[4] = (ui8*) vcol2.data(); - cols[5] = (ui8*) vcol2data.data(); - cols[6] = (ui8*) col3.data(); - cols[7] = (ui8*) col4.data(); - - std::vector<ui8, TMKQLAllocator<ui8>> overflow; - std::vector<ui8> colValid((NTuples1 + 7)/8, ~0); - const ui8 *colsValid[8] = { - colValid.data(), - colValid.data(), - colValid.data(), - nullptr, - colValid.data(), - nullptr, - colValid.data(), - colValid.data(), - }; - - std::chrono::steady_clock::time_point begin02 = std::chrono::steady_clock::now(); - tl->Pack(cols, colsValid, res.data(), overflow, 0, NTuples1); - std::chrono::steady_clock::time_point end02 = std::chrono::steady_clock::now(); - ui64 microseconds = std::chrono::duration_cast<std::chrono::microseconds>(end02 - begin02).count(); - - if (microseconds == 0) - microseconds = 1; - - CTEST << "Time for " << (NTuples1) << " transpose (external cycle)= " << microseconds << "[microseconds]" << Endl; -#ifndef NDEBUG - CTEST << "Result size = " << Tuples1DataBytes << Endl; - CTEST << "Result = "; - for (ui32 i = 0; i < Tuples1DataBytes; ++i) - CTEST << int(res[i]) << ' '; - CTEST << Endl; - CTEST << "Overflow size = " << overflow.size() << Endl; - CTEST << "Overflow = "; - for (auto c: overflow) - CTEST << int(c) << ' '; - CTEST << Endl; -#endif - static const ui8 expected_data[54*3] = { - // row1 - - 0xe2,0x47,0x16,0x6c, // hash - 0x1, 0, 0, 0x20, // col1 - 0x1, 0, 0, 0, 0, 0, 0, 0x10, // col2 - 0x3, 0x61, 0x62, 0x63, 0, 0, 0, 0, 0, // vcol1 - 0x3, 0x41, 0x42, 0x43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // vcol2 - 0x3f, //NULL bitmap - 0x1, 0, 0, 0x40, // col3 - 0x1, 0, 0, 0, 0, 0, 0, 0x30, // col4 - // row2 - 0xc2, 0x1c, 0x1b, 0xa8, // hash - 0x2, 0, 0, 0x20, // col1 - 0x2, 0, 0, 0, 0, 0, 0, 0x10, // col2 - 0xff, 0, 0, 0, 0, 0xf, 0, 0, 0, // vcol1 [overflow offset, overflow size] - 0xf, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, // vcol2 - 0x3f, // NULL bitmap - 0x2, 0, 0, 0x40, // col3 - 0x2, 0, 0, 0, 0, 0, 0, 0x30, // col4 - // row3 - 0xfa, 0x49, 0x5, 0xe9, // hash - 0x3, 0, 0, 0x20, // col1 - 0x3, 0, 0, 0, 0, 0, 0, 0x10, // col2 - 0xff, 0xf, 0, 0, 0, 0xa, 0, 0, 0, // vcol1 [overflow offset, overflow size] - 0xa, 0x7a, 0x79, 0x78, 0x77, 0x76, 0x75, 0x74, 0x73, 0x70, 0x72, 0, 0, 0, 0, 0, // vcol2 - 0x3f, // NULL bitmap - 0x3, 0, 0, 0x40, // col3 - 0x3, 0, 0, 0, 0, 0, 0, 0x30, // col4 - }; - static const ui8 expected_overflow[25] = { - 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, - 0x5a, 0x59, 0x58, 0x57, 0x56, 0x55, 0x54, 0x53, 0x50, 0x52, - }; - UNIT_ASSERT_VALUES_EQUAL(sizeof(expected_data), tl->TotalRowSize*NTuples1); - UNIT_ASSERT_VALUES_EQUAL(overflow.size(), sizeof(expected_overflow)); - for (ui32 i = 0; i < sizeof(expected_data); ++i) - UNIT_ASSERT_VALUES_EQUAL(expected_data[i], res[i]); - for (ui32 i = 0; i < sizeof(expected_overflow); ++i) - UNIT_ASSERT_VALUES_EQUAL(expected_overflow[i], overflow[i]); -} - -Y_UNIT_TEST(UnpackVarSize) { - - TScopedAlloc alloc(__LOCATION__); - - TColumnDesc kc1, kcv1, kcv2, kc2, pc1, pc2; - - kc1.Role = EColumnRole::Key; - kc1.DataSize = 8; - - kc2.Role = EColumnRole::Key; - kc2.DataSize = 4; - - pc1.Role = EColumnRole::Payload; - pc1.DataSize = 8; - - pc2.Role = EColumnRole::Payload; - pc2.DataSize = 4; - - kcv1.Role = EColumnRole::Key; - kcv1.DataSize = 8; - kcv1.SizeType = EColumnSizeType::Variable; - - kcv2.Role = EColumnRole::Key; - kcv2.DataSize = 16; - kcv2.SizeType = EColumnSizeType::Variable; - - pc1.Role = EColumnRole::Payload; - pc1.DataSize = 8; - - pc2.Role = EColumnRole::Payload; - pc2.DataSize = 4; - - std::vector<TColumnDesc> columns{kc1, kc2, kcv1, kcv2, pc1, pc2}; - - auto tl = TTupleLayout::Create(columns); - CTEST << "TotalRowSize = " << tl->TotalRowSize << Endl; - UNIT_ASSERT_VALUES_EQUAL(tl->TotalRowSize, 54); - - const ui64 NTuples1 = 3; - - const ui64 Tuples1DataBytes = (tl->TotalRowSize) * NTuples1; - - std::vector<ui64> col1(NTuples1, 0); - std::vector<ui32> col2(NTuples1, 0); - std::vector<ui64> col3(NTuples1, 0); - std::vector<ui32> col4(NTuples1, 0); - - std::vector<ui32> vcol1(1, 0); - std::vector<ui8> vcol1data; - std::vector<ui32> vcol2(1, 0); - std::vector<ui8> vcol2data; - - std::vector<ui8> res(Tuples1DataBytes + 64, 0); - std::vector<TString> vcol1str { - "abc", - "ABCDEFGHIJKLMNO", - "ZYXWVUTSPR" - }; - std::vector<TString> vcol2str { - "ABC", - "abcdefghijklmno", - "zyxwvutspr" - }; - for (auto &&str: vcol1str) { - for (auto c: str) - vcol1data.push_back(c); - vcol1.push_back(vcol1data.size()); - } - UNIT_ASSERT_VALUES_EQUAL(vcol1.size(), NTuples1 + 1); - for (auto &&str: vcol2str) { - for (auto c: str) - vcol2data.push_back(c); - vcol2.push_back(vcol2data.size()); - } - UNIT_ASSERT_VALUES_EQUAL(vcol2.size(), NTuples1 + 1); - for (ui32 i = 0; i < NTuples1; ++i) { - col1[i] = (1ull<<(sizeof(col1[0])*8 - 4)) + i + 1; - col2[i] = (2ull<<(sizeof(col2[0])*8 - 4)) + i + 1; - col3[i] = (3ull<<(sizeof(col3[0])*8 - 4)) + i + 1; - col4[i] = (4ull<<(sizeof(col4[0])*8 - 4)) + i + 1; - } - - const ui8* cols[4 + 2*2]; - - cols[0] = (ui8*) col1.data(); - cols[1] = (ui8*) col2.data(); - cols[2] = (ui8*) vcol1.data(); - cols[3] = (ui8*) vcol1data.data(); - cols[4] = (ui8*) vcol2.data(); - cols[5] = (ui8*) vcol2data.data(); - cols[6] = (ui8*) col3.data(); - cols[7] = (ui8*) col4.data(); - - std::vector<ui8, TMKQLAllocator<ui8>> overflow; - std::vector<ui8> colValid((NTuples1 + 7)/8, ~0); - const ui8 *colsValid[8] = { - colValid.data(), - colValid.data(), - colValid.data(), - nullptr, - colValid.data(), - nullptr, - colValid.data(), - colValid.data(), - }; - - tl->Pack(cols, colsValid, res.data(), overflow, 0, NTuples1); - - std::vector<ui64> col1_new(NTuples1, 0); - std::vector<ui32> col2_new(NTuples1, 0); - std::vector<ui64> col3_new(NTuples1, 0); - std::vector<ui32> col4_new(NTuples1, 0); - - std::vector<ui32> vcol1_new(NTuples1 + 1, 0); - std::vector<ui8> vcol1data_new(vcol1data.size()); - std::vector<ui32> vcol2_new(NTuples1 + 1, 0); - std::vector<ui8> vcol2data_new(vcol2data.size()); - - ui8* cols_new[4 + 2 * 2]; - cols_new[0] = (ui8*) col1_new.data(); - cols_new[1] = (ui8*) col2_new.data(); - cols_new[2] = (ui8*) vcol1_new.data(); - cols_new[3] = (ui8*) vcol1data_new.data(); - cols_new[4] = (ui8*) vcol2_new.data(); - cols_new[5] = (ui8*) vcol2data_new.data(); - cols_new[6] = (ui8*) col3_new.data(); - cols_new[7] = (ui8*) col4_new.data(); - - std::vector<ui8> colValid1_new((NTuples1 + 7)/8, 0); - colValid1_new.back() = ~0; - std::vector<ui8> colValid2_new((NTuples1 + 7)/8, 0); - colValid2_new.back() = ~0; - std::vector<ui8> colValid3_new((NTuples1 + 7)/8, 0); - colValid3_new.back() = ~0; - std::vector<ui8> colValid4_new((NTuples1 + 7)/8, 0); - colValid4_new.back() = ~0; - std::vector<ui8> colValid5_new((NTuples1 + 7)/8, 0); - colValid5_new.back() = ~0; - std::vector<ui8> colValid6_new((NTuples1 + 7)/8, 0); - colValid6_new.back() = ~0; - - ui8 *colsValid_new[8] = { - colValid1_new.data(), - colValid2_new.data(), - colValid3_new.data(), - nullptr, - colValid4_new.data(), - nullptr, - colValid5_new.data(), - colValid6_new.data(), - }; - - std::chrono::steady_clock::time_point begin02 = std::chrono::steady_clock::now(); - tl->Unpack(cols_new, colsValid_new, res.data(), overflow, 0, NTuples1); - std::chrono::steady_clock::time_point end02 = std::chrono::steady_clock::now(); - ui64 microseconds = std::chrono::duration_cast<std::chrono::microseconds>(end02 - begin02).count(); - - if (microseconds == 0) - microseconds = 1; - - CTEST << "Time for " << (NTuples1) << " transpose (external cycle)= " << microseconds << "[microseconds]" << Endl; -#ifndef NDEBUG - CTEST << "Result size = " << Tuples1DataBytes << Endl; - CTEST << "Result = "; - for (ui32 i = 0; i < Tuples1DataBytes; ++i) - CTEST << int(res[i]) << ' '; - CTEST << Endl; - CTEST << "Overflow size = " << overflow.size() << Endl; - CTEST << "Overflow = "; - for (auto c: overflow) - CTEST << int(c) << ' '; - CTEST << Endl; -#endif - - UNIT_ASSERT(std::memcmp(cols[0], cols_new[0], sizeof(ui64) * col1.size()) == 0); - UNIT_ASSERT(std::memcmp(cols[1], cols_new[1], sizeof(ui32) * col2.size()) == 0); - UNIT_ASSERT(std::memcmp(cols[2], cols_new[2], sizeof(ui32) * vcol1.size()) == 0); - UNIT_ASSERT(std::memcmp(cols[3], cols_new[3], vcol1data.size()) == 0); - UNIT_ASSERT(std::memcmp(cols[4], cols_new[4], sizeof(ui32) * vcol2.size()) == 0); - UNIT_ASSERT(std::memcmp(cols[5], cols_new[5], vcol1data.size()) == 0); - UNIT_ASSERT(std::memcmp(cols[6], cols_new[6], sizeof(ui64) * col3.size()) == 0); - UNIT_ASSERT(std::memcmp(cols[7], cols_new[7], sizeof(ui32) * col4.size()) == 0); - - UNIT_ASSERT(std::memcmp(colValid.data(), colValid1_new.data(), colValid.size()) == 0); - UNIT_ASSERT(std::memcmp(colValid.data(), colValid2_new.data(), colValid.size()) == 0); - UNIT_ASSERT(std::memcmp(colValid.data(), colValid3_new.data(), colValid.size()) == 0); - UNIT_ASSERT(std::memcmp(colValid.data(), colValid4_new.data(), colValid.size()) == 0); - UNIT_ASSERT(std::memcmp(colValid.data(), colValid5_new.data(), colValid.size()) == 0); - UNIT_ASSERT(std::memcmp(colValid.data(), colValid6_new.data(), colValid.size()) == 0); -} - -Y_UNIT_TEST(PackVarSizeBig) { - - TScopedAlloc alloc(__LOCATION__); - - TColumnDesc kc1, kc2, kcv1; - - kc1.Role = EColumnRole::Key; - kc1.DataSize = 1; - - kc2.Role = EColumnRole::Key; - kc2.DataSize = 2; - - kcv1.Role = EColumnRole::Key; - kcv1.DataSize = 1000; - kcv1.SizeType = EColumnSizeType::Variable; - - std::vector<TColumnDesc> columns{kc1, kc2, kcv1 }; - - auto tl = TTupleLayout::Create(columns); - //CTEST << "TotalRowSize = " << tl->TotalRowSize << Endl; - UNIT_ASSERT_VALUES_EQUAL(tl->TotalRowSize, 263); - - const ui64 NTuples1 = 2; - - const ui64 Tuples1DataBytes = (tl->TotalRowSize) * NTuples1; - - std::vector<ui8> col1(NTuples1, 0); - std::vector<ui16> col2(NTuples1, 0); - - std::vector<ui32> vcol1(1, 0); - - std::vector<ui8> vcol1data; - - std::vector<ui8> res(Tuples1DataBytes + 64, 0); - std::vector<TString> vcol1str { - "zaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbb" - "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" - "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabcdefghijklnmorstuvwxy", - "zaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbb" - "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" - "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbrstuv", - }; - for (auto &&str: vcol1str) { - for (auto c: str) - vcol1data.push_back(c); - vcol1.push_back(vcol1data.size()); - } - UNIT_ASSERT_VALUES_EQUAL(vcol1.size(), NTuples1 + 1); - for (ui32 i = 0; i < NTuples1; ++i) { - col1[i] = (1ull<<(sizeof(col1[0])*8 - 4)) + i + 1; - col2[i] = (2ull<<(sizeof(col2[0])*8 - 4)) + i + 1; - } - - const ui8* cols[2 + 1*2]; - - cols[0] = (ui8*) col1.data(); - cols[1] = (ui8*) col2.data(); - cols[2] = (ui8*) vcol1.data(); - cols[3] = (ui8*) vcol1data.data(); - - std::vector<ui8> colValid((NTuples1 + 7)/8, ~0); - const ui8 *colsValid[2 + 1*2] = { - colValid.data(), - colValid.data(), - colValid.data(), - nullptr, - }; - std::vector<ui8, TMKQLAllocator<ui8>> overflow; - - std::chrono::steady_clock::time_point begin02 = std::chrono::steady_clock::now(); - tl->Pack(cols, colsValid, res.data(), overflow, 0, NTuples1); - std::chrono::steady_clock::time_point end02 = std::chrono::steady_clock::now(); - ui64 microseconds = std::chrono::duration_cast<std::chrono::microseconds>(end02 - begin02).count(); - - CTEST << "Time for " << (NTuples1) << " transpose (external cycle)= " << microseconds << "[microseconds]" << Endl; -#ifndef NDEBUG - CTEST << "Result size = " << Tuples1DataBytes << Endl; - CTEST << "Result = "; - for (ui32 i = 0; i < Tuples1DataBytes; ++i) - CTEST << int(res[i]) << ' '; - CTEST << Endl; - CTEST << "Overflow size = " << overflow.size() << Endl; - CTEST << "Overflow = "; - for (auto c: overflow) - CTEST << int(c) << ' '; - CTEST << Endl; -#endif - static const ui8 expected_data[263*2] = { - // row1 - 0xe1,0x22,0x63,0xf5, // hash - 0x11, // col1 - 0x1, 0x20, // col2 - 0xff, 0, 0, 0, 0, 0xb, 0, 0, 0, // vcol2 [ overflow offset, overflow size ] - 0x7a, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, - 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, - 0x7, // NULL bitmap - // row 2 - 0xab,0xa5,0x5f,0xd4, // hash - 0x12, // col1 - 0x2, 0x20, // col2 - 0xfe, 0x7a, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, - 0x61, 0x61, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, 0x62, - 0x62, 0x62, 0x72, 0x73, 0x74, 0x75, 0x76, - 0x7, // NULLs bitmap - }; - static const ui8 expected_overflow[11] = { - 0x6e, 0x6d, 0x6f, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, - }; - UNIT_ASSERT_VALUES_EQUAL(sizeof(expected_data), tl->TotalRowSize*NTuples1); - UNIT_ASSERT_VALUES_EQUAL(overflow.size(), sizeof(expected_overflow)); - for (ui32 i = 0; i < sizeof(expected_data); ++i) - UNIT_ASSERT_VALUES_EQUAL(expected_data[i], res[i]); - for (ui32 i = 0; i < sizeof(expected_overflow); ++i) - UNIT_ASSERT_VALUES_EQUAL(expected_overflow[i], overflow[i]); -} -Y_UNIT_TEST(PackIsValidFuzz) { - - TScopedAlloc alloc(__LOCATION__); - - std::mt19937 rng; // fixed-seed (0) prng - std::vector<TColumnDesc> columns; - std::vector<std::vector<ui8>> colsdata; - std::vector<const ui8*> colsptr; - std::vector<std::vector<ui8>> isValidData; - std::vector<const ui8*> isValidPtr; - - ui64 totalNanoseconds = 0; - ui64 totalSize = 0; - ui64 totalRows = 0; - for (ui32 test = 0; test < 10; ++test) { - ui32 rows = 1 + (rng() % 1000); - ui32 cols = 1 + (rng() % 100); - columns.resize(cols); - colsdata.resize(cols); - colsptr.resize(cols); - isValidData.resize(cols); - isValidPtr.resize(cols); - ui32 isValidSize = (rows + 7)/8; - totalRows += rows; - for (ui32 j = 0; j < cols; ++j) { - auto &col = columns[j]; - col.Role = (rng() % 10 < 1) ? EColumnRole::Key : EColumnRole::Payload; - col.DataSize = 1u <<(rng() % 16); - col.SizeType = EColumnSizeType::Fixed; - colsdata[j].resize(rows*col.DataSize); - colsptr[j] = colsdata[j].data(); - isValidData[j].resize(isValidSize); - isValidPtr[j] = isValidData[j].data(); - std::generate(isValidData[j].begin(), isValidData[j].end(), rng); - } - auto tl = TTupleLayout::Create(columns); - std::vector<ui8> res; - for (ui32 subtest = 0; subtest < 20; ++subtest) { - ui32 subRows = 1 + (rows ? rng() % (rows - 1) : 0); - ui32 off = subRows != rows ? rng() % (rows - subRows) : 0; - std::vector<ui8, TMKQLAllocator<ui8>> overflow; - totalSize += subRows*tl->TotalRowSize; - res.resize(subRows*tl->TotalRowSize); - - std::chrono::steady_clock::time_point begin01 = std::chrono::steady_clock::now(); - tl->Pack(colsptr.data(), isValidPtr.data(), res.data(), overflow, off, subRows); - std::chrono::steady_clock::time_point end01 = std::chrono::steady_clock::now(); - totalNanoseconds += std::chrono::duration_cast<std::chrono::nanoseconds>(end01 - begin01).count(); - - UNIT_ASSERT_VALUES_EQUAL(overflow.size(), 0); - auto resptr = res.data(); - for (ui32 row = 0; row < subRows; ++row, resptr += tl->TotalRowSize) { - for (ui32 j = 0; j < cols; ++j) { - auto &col = tl->Columns[j]; - UNIT_ASSERT_VALUES_EQUAL(((resptr[tl->BitmaskOffset + (j / 8)] >> (j % 8)) & 1), ((isValidData[col.OriginalIndex][(off + row) / 8] >> ((off + row) % 8)) & 1)); - } - } - } - } - - if (totalNanoseconds == 0) totalNanoseconds = 1; - - CTEST << "Time for " << totalRows << " transpose (external cycle)= " << (totalNanoseconds + 999)/1000 << "[microseconds]" << Endl; - CTEST << "Data size = " << totalSize / (1024 * 1024) << "[MB]" << Endl; - CTEST << "Calculating speed = " << totalSize / ((totalNanoseconds + 999)/1000) << "MB/sec" << Endl; - CTEST << Endl; -} -} - - - -} -} // namespace NMiniKQL -} // namespace NKikimr diff --git a/yql/essentials/minikql/comp_nodes/packed_tuple/packing.h b/yql/essentials/minikql/comp_nodes/packed_tuple/packing.h deleted file mode 100644 index 929e11fea4..0000000000 --- a/yql/essentials/minikql/comp_nodes/packed_tuple/packing.h +++ /dev/null @@ -1,424 +0,0 @@ -#include <util/system/unaligned_mem.h> -#include <contrib/ydb/library/yql/utils/simd/simd.h> - -namespace NKikimr { -namespace NMiniKQL { -namespace NPackedTuple { - -static void -PackTupleFallbackRowImpl(const ui8 *const src_cols[], ui8 *const dst_rows, - const size_t cols, const size_t size, - const size_t col_sizes[], const size_t offsets[], - const size_t tuple_size, const size_t start = 0) { - for (size_t row = 0; row != size; ++row) { - for (ui8 col = 0; col != cols; ++col) { - switch (col_sizes[col] * 8) { - -#define MULTY_8x4(...) \ - __VA_ARGS__(8); \ - __VA_ARGS__(16); \ - __VA_ARGS__(32); \ - __VA_ARGS__(64) - -#define CASE(bits) \ - case bits: \ - *reinterpret_cast<ui##bits *>(dst_rows + row * tuple_size + \ - offsets[col]) = \ - *reinterpret_cast<const ui##bits *>(src_cols[col] + \ - (start + row) * (bits / 8)); \ - break - - MULTY_8x4(CASE); - -#undef CASE -#undef MULTY_8x4 - - default: - memcpy(dst_rows + row * tuple_size + offsets[col], - src_cols[col] + (start + row) * col_sizes[col], - col_sizes[col]); - } - } - } -} - -static void -UnpackTupleFallbackRowImpl(const ui8 *const src_rows, ui8 *const dst_cols[], - const size_t cols, const size_t size, - const size_t col_sizes[], const size_t offsets[], - const size_t tuple_size, const size_t start = 0) { - for (size_t row = 0; row != size; ++row) { - for (ui8 col = 0; col != cols; ++col) { - switch (col_sizes[col] * 8) { - -#define MULTY_8x4(...) \ - __VA_ARGS__(8); \ - __VA_ARGS__(16); \ - __VA_ARGS__(32); \ - __VA_ARGS__(64) - -#define CASE(bits) \ - case bits: \ - *reinterpret_cast<ui##bits *>(dst_cols[col] + \ - (start + row) * (bits / 8)) = \ - *reinterpret_cast<const ui##bits *>(src_rows + row * tuple_size + \ - offsets[col]); \ - break - - MULTY_8x4(CASE); - -#undef CASE -#undef MULTY_8x4 - - default: - memcpy(dst_cols[col] + (start + row) * col_sizes[col], - src_rows + row * tuple_size + offsets[col], - col_sizes[col]); - } - } - } -} - -template <class ByteType> -Y_FORCE_INLINE static void -PackTupleFallbackTypedColImpl(const ui8 *const src_col, ui8 *const dst_rows, - const size_t size, const size_t tuple_size, - const size_t start = 0) { - static constexpr size_t BYTES = sizeof(ByteType); - for (size_t row = 0; row != size; ++row) { - WriteUnaligned<ByteType>( - dst_rows + row * tuple_size, - ReadUnaligned<ByteType>(src_col + (start + row) * BYTES)); - } -} - -template <class ByteType> -Y_FORCE_INLINE static void -UnpackTupleFallbackTypedColImpl(const ui8 *const src_rows, ui8 *const dst_col, - const size_t size, const size_t tuple_size, - const size_t start = 0) { - static constexpr size_t BYTES = sizeof(ByteType); - for (size_t row = 0; row != size; ++row) { - WriteUnaligned<ByteType>( - dst_col + (start + row) * BYTES, - ReadUnaligned<ByteType>(src_rows + row * tuple_size)); - } -} - -static void -PackTupleFallbackColImpl(const ui8 *const src_cols[], ui8 *const dst_rows, - const size_t cols, const size_t size, - const size_t col_sizes[], const size_t offsets[], - const size_t tuple_size, const size_t start = 0) { - for (ui8 col = 0; col != cols; ++col) { - switch (col_sizes[col] * 8) { - -#define MULTY_8x4(...) \ - __VA_ARGS__(8); \ - __VA_ARGS__(16); \ - __VA_ARGS__(32); \ - __VA_ARGS__(64) - -#define CASE(bits) \ - case bits: \ - PackTupleFallbackTypedColImpl<ui##bits>( \ - src_cols[col], dst_rows + offsets[col], size, tuple_size, start); \ - break - - MULTY_8x4(CASE); - -#undef CASE -#undef MULTY_8x4 - - default: - for (size_t row = 0; row != size; ++row) { - memcpy(dst_rows + row * tuple_size + offsets[col], - src_cols[col] + (start + row) * col_sizes[col], - col_sizes[col]); - } - } - } -} - -static void -UnpackTupleFallbackColImpl(const ui8 *const src_rows, ui8 *const dst_cols[], - const size_t cols, const size_t size, - const size_t col_sizes[], const size_t offsets[], - const size_t tuple_size, const size_t start = 0) { - for (ui8 col = 0; col != cols; ++col) { - switch (col_sizes[col] * 8) { - -#define MULTY_8x4(...) \ - __VA_ARGS__(8); \ - __VA_ARGS__(16); \ - __VA_ARGS__(32); \ - __VA_ARGS__(64) - -#define CASE(bits) \ - case bits: \ - UnpackTupleFallbackTypedColImpl<ui##bits>( \ - src_rows + offsets[col], dst_cols[col], size, tuple_size, start); \ - break - - MULTY_8x4(CASE); - -#undef CASE -#undef MULTY_8x4 - - default: - for (size_t row = 0; row != size; ++row) { - memcpy(dst_cols[col] + (start + row) * col_sizes[col], - src_rows + row * tuple_size + offsets[col], - col_sizes[col]); - } - } - } -} - -[[maybe_unused]] static void PackTupleFallbackBlockImpl( - const ui8 *const src_cols[], ui8 *const dst_rows, const size_t cols, - const size_t size, const size_t col_sizes[], const size_t offsets[], - const size_t tuple_size, const size_t block_rows, const size_t start = 0) { - - const size_t block_size = size / block_rows; - for (size_t block = 0; block != block_size; ++block) { - for (ui8 col = 0; col != cols; ++col) { - switch (col_sizes[col] * 8) { - -#define BLOCK_LOOP(...) \ - for (size_t block_i = 0; block_i != block_rows; ++block_i) { \ - const size_t row = block_rows * block + block_i; \ - __VA_ARGS__ \ - } - -#define MULTY_8x4(...) \ - __VA_ARGS__(8); \ - __VA_ARGS__(16); \ - __VA_ARGS__(32); \ - __VA_ARGS__(64) - -#define CASE(bits) \ - case bits: \ - PackTupleFallbackTypedColImpl<ui##bits>( \ - src_cols[col], \ - dst_rows + block * block_rows * tuple_size + offsets[col], \ - block_rows, tuple_size, start + block * block_rows); \ - break - - MULTY_8x4(CASE); - - default: - BLOCK_LOOP( - memcpy(dst_rows + row * tuple_size + offsets[col], - src_cols[col] + (start + row) * col_sizes[col], - col_sizes[col]);) - -#undef CASE -#undef MULTY_8x4 -#undef BLOCK_LOOP - } - } - } - - PackTupleFallbackColImpl( - src_cols, dst_rows + block_size * block_rows * tuple_size, cols, - size - block_size * block_rows, col_sizes, offsets, tuple_size, - start + block_size * block_rows); -} - -[[maybe_unused]] static void UnpackTupleFallbackBlockImpl( - const ui8 *const src_rows, ui8 *const dst_cols[], const size_t cols, - const size_t size, const size_t col_sizes[], const size_t offsets[], - const size_t tuple_size, const size_t block_rows, const size_t start = 0) { - - const size_t block_size = size / block_rows; - for (size_t block = 0; block != block_size; ++block) { - for (ui8 col = 0; col != cols; ++col) { - switch (col_sizes[col] * 8) { - -#define BLOCK_LOOP(...) \ - for (size_t block_i = 0; block_i != block_rows; ++block_i) { \ - const size_t row = block_rows * block + block_i; \ - __VA_ARGS__ \ - } - -#define MULTY_8x4(...) \ - __VA_ARGS__(8); \ - __VA_ARGS__(16); \ - __VA_ARGS__(32); \ - __VA_ARGS__(64) - -#define CASE(bits) \ - case bits: \ - UnpackTupleFallbackTypedColImpl<ui##bits>( \ - src_rows + block * block_rows * tuple_size + offsets[col], \ - dst_cols[col], block_rows, tuple_size, \ - start + block * block_rows); \ - break - - MULTY_8x4(CASE); - - default: - BLOCK_LOOP( - memcpy(dst_cols[col] + (start + row) * col_sizes[col], - src_rows + row * tuple_size + offsets[col], - col_sizes[col]);) - -#undef CASE -#undef MULTY_8x4 -#undef BLOCK_LOOP - } - } - } - - UnpackTupleFallbackColImpl(src_rows + block_size * block_rows * tuple_size, - dst_cols, cols, size - block_size * block_rows, - col_sizes, offsets, tuple_size, - start + block_size * block_rows); -} - -template <class TTraits> struct SIMDPack { - template <class T> using TSimd = typename TTraits::template TSimd8<T>; - - static TSimd<ui8> BuildTuplePerm(size_t col_size, size_t col_pad, - ui8 offset, ui8 ind, bool packing) { - ui8 perm[TSimd<ui8>::SIZE]; - std::memset(perm, 0x80, TSimd<ui8>::SIZE); - - size_t iters = std::max(size_t(1u), TSimd<ui8>::SIZE / (col_size + col_pad)); - while (iters--) { - for (size_t it = col_size; it; --it, ++offset, ++ind) { - if (packing) { - perm[offset] = ind; - } else { - perm[ind] = offset; - } - } - offset += col_pad; - } - - return TSimd<ui8>{perm}; - } - - template <ui8 TupleSize> static TSimd<ui8> TupleOr(TSimd<ui8> vec[]) { - return TupleOrImpl<TupleSize>(vec); - } - - template <ui8 TupleSize> static TSimd<ui8> TupleOrImpl(TSimd<ui8> vec[]) { - static constexpr ui8 Left = TupleSize / 2; - static constexpr ui8 Right = TupleSize - Left; - - return TupleOrImpl<Left>(vec) | TupleOrImpl<Right>(vec + Left); - } - - template <> TSimd<ui8> TupleOrImpl<0>(TSimd<ui8>[]) { std::abort(); } - - template <> TSimd<ui8> TupleOrImpl<1>(TSimd<ui8> vec[]) { return vec[0]; } - - template <> TSimd<ui8> TupleOrImpl<2>(TSimd<ui8> vec[]) { - return vec[0] | vec[1]; - } - - template <ui8 StoresPerLoad, ui8 Cols> - static void - PackTupleOrImpl(const ui8 *const src_cols[], ui8 *const dst_rows, - const size_t size, const size_t col_sizes[], - const size_t offsets[], const size_t tuple_size, - const TSimd<ui8> perms[], const size_t start = 0) { - static constexpr size_t kSIMD_Rem = sizeof(TSimd<ui8>) - StoresPerLoad; - const ui8 tuples_per_store = - std::max(size_t(1u), TSimd<ui8>::SIZE / tuple_size); - const size_t simd_iters = (size > kSIMD_Rem ? size - kSIMD_Rem : 0) / - (tuples_per_store * StoresPerLoad); - - TSimd<ui8> src_regs[Cols]; - TSimd<ui8> perm_regs[Cols]; - - const ui8 *srcs[Cols]; - std::memcpy(srcs, src_cols, sizeof(srcs)); - for (ui8 col = 0; col != Cols; ++col) { - srcs[col] += col_sizes[col] * start; - } - - auto dst = dst_rows; - ui8 *const end = dst_rows + simd_iters * tuples_per_store * - StoresPerLoad * tuple_size; - while (dst != end) { - for (ui8 col = 0; col != Cols; ++col) { - src_regs[col] = TSimd<ui8>(srcs[col]); - srcs[col] += col_sizes[col] * tuples_per_store * StoresPerLoad; - } - - for (ui8 iter = 0; iter != StoresPerLoad; ++iter) { - // shuffling each col bytes to the right positions - // then blending them together with 'or' - for (ui8 col = 0; col != Cols; ++col) { - perm_regs[col] = src_regs[col].Shuffle( - perms[col * StoresPerLoad + iter]); - } - - TupleOr<Cols>(perm_regs).Store(dst); - dst += tuple_size * tuples_per_store; - } - } - - PackTupleFallbackRowImpl(srcs, dst, Cols, - size - simd_iters * tuples_per_store * - StoresPerLoad, - col_sizes, offsets, tuple_size); - } - - template <ui8 LoadsPerStore, ui8 Cols> - static void - UnpackTupleOrImpl(const ui8 *const src_rows, ui8 *const dst_cols[], - size_t size, const size_t col_sizes[], - const size_t offsets[], const size_t tuple_size, - const TSimd<ui8> perms[], const size_t start = 0) { - static constexpr size_t kSIMD_Rem = sizeof(TSimd<ui8>) - LoadsPerStore; - const ui8 tuples_per_load = - std::max(size_t(1u), TSimd<ui8>::SIZE / tuple_size); - const size_t simd_iters = (size > kSIMD_Rem ? size - kSIMD_Rem : 0) / - (tuples_per_load * LoadsPerStore); - - TSimd<ui8> src_regs[LoadsPerStore]; - TSimd<ui8> perm_regs[LoadsPerStore]; - - auto src = src_rows; - const ui8 *const end = src_rows + simd_iters * tuples_per_load * - LoadsPerStore * tuple_size; - - ui8 *dsts[Cols]; - std::memcpy(dsts, dst_cols, sizeof(dsts)); - for (ui8 col = 0; col != Cols; ++col) { - dsts[col] += col_sizes[col] * start; - } - - while (src != end) { - for (ui8 iter = 0; iter != LoadsPerStore; ++iter) { - src_regs[iter] = TSimd<ui8>(src); - src += tuple_size * tuples_per_load; - } - - for (ui8 col = 0; col != Cols; ++col) { - // shuffling each col bytes to the right positions - // then blending them together with 'or' - for (ui8 iter = 0; iter != LoadsPerStore; ++iter) { - perm_regs[iter] = src_regs[iter].Shuffle( - perms[col * LoadsPerStore + iter]); - } - - TupleOr<LoadsPerStore>(perm_regs).Store(dsts[col]); - dsts[col] += col_sizes[col] * tuples_per_load * LoadsPerStore; - } - } - - UnpackTupleFallbackRowImpl(src, dsts, Cols, - size - simd_iters * tuples_per_load * - LoadsPerStore, - col_sizes, offsets, tuple_size); - } -}; - -} // namespace NPackedTuple -} // namespace NMiniKQL -} // namespace NKikimr diff --git a/yql/essentials/minikql/comp_nodes/packed_tuple/tuple.cpp b/yql/essentials/minikql/comp_nodes/packed_tuple/tuple.cpp deleted file mode 100644 index 8a51b8c123..0000000000 --- a/yql/essentials/minikql/comp_nodes/packed_tuple/tuple.cpp +++ /dev/null @@ -1,983 +0,0 @@ -#include "tuple.h" - -#include <algorithm> -#include <queue> - -#include <yql/essentials/minikql/mkql_node.h> -#include <yql/essentials/public/udf/udf_data_type.h> -#include <yql/essentials/public/udf/udf_types.h> -#include <yql/essentials/public/udf/udf_value.h> - -#include <util/generic/bitops.h> -#include <util/generic/buffer.h> - -#include "hashes_calc.h" -#include "packing.h" - -namespace NKikimr { -namespace NMiniKQL { -namespace NPackedTuple { - -namespace { - -// Transpose 8x8 bit-matrix packed in ui64 integer -Y_FORCE_INLINE ui64 transposeBitmatrix(ui64 x) { - if (x == 0xFFFFFFFFFFFFFFFFLL) { - return x; - } - - // a b A B aa bb AA BB - // c d C D cc dd CC DD - // -> - // a c A C aa cc AA CC - // b d B D bb dd BB DD - // a b A B aa bb AA BB // c d C D cc dd CC DD - // a c A C aa cc AA CC // b d B D bb dd BB DD - x = ((x & - 0b10101010'01010101'10101010'01010101'10101010'01010101'10101010'01010101ull)) | - ((x & - 0b01010101'00000000'01010101'00000000'01010101'00000000'01010101'00000000ull) >> - 7) | - ((x & - 0b00000000'10101010'00000000'10101010'00000000'10101010'00000000'10101010ull) - << 7); - // a1 a2 b1 b2 A1 A2 B1 B2 - // a3 a4 b3 b4 A3 A4 B3 B4 - // c1 c2 d1 d2 C1 C2 D1 D2 - // c3 c4 d3 d4 C3 C4 D3 D4 - // -> - // a1 a2 c1 c2 A1 A2 C1 C2 - // a3 a4 c3 c4 A3 A4 C3 C4 - // b1 b2 d1 d2 B1 B2 D1 D2 - // b3 b4 d3 d4 B3 B4 D3 D4 - // - // - // a1 a2 b1 b2 A1 A2 B1 B2 // a3 a4 b3 b4 A3 A4 B3 B4 // c1 c2 d1 d2 C1 C2 - // D1 D2 // c3 c4 d3 d4 C3 C4 D3 D4 - // -> - // a1 a2 c1 c2 A1 A2 C1 C2 // a3 a4 c3 c4 A3 A4 C3 C4 // b1 b2 d1 d2 B1 B2 - // D1 D2 // b3 b4 d3 d4 B3 B4 D3 D4 - x = ((x & - 0b1100110011001100'0011001100110011'1100110011001100'0011001100110011ull)) | - ((x & - 0b0011001100110011'0000000000000000'0011001100110011'0000000000000000ull) >> - 14) | - ((x & - 0b0000000000000000'1100110011001100'0000000000000000'1100110011001100ull) - << 14); - x = ((x & - 0b11110000111100001111000011110000'00001111000011110000111100001111ull)) | - ((x & - 0b00001111000011110000111100001111'00000000000000000000000000000000ull) >> - 28) | - ((x & - 0b00000000000000000000000000000000'11110000111100001111000011110000ull) - << 28); - return x; -} - -void transposeBitmatrix(ui8 dst[], const ui8 *src[], const size_t row_size) { - ui64 x = 0; - for (size_t ind = 0; ind != 8; ++ind) { - x |= ui64(*src[ind]) << (ind * 8); - } - - x = transposeBitmatrix(x); - - for (size_t ind = 0; ind != 8; ++ind) { - dst[ind * row_size] = x; - x >>= 8; - } -} - -void transposeBitmatrix(ui8 *dst[], const ui8 src[], const size_t row_size) { - ui64 x = 0; - for (size_t ind = 0; ind != 8; ++ind) { - x |= ui64(src[ind * row_size]) << (ind * 8); - } - - x = transposeBitmatrix(x); - - for (size_t ind = 0; ind != 8; ++ind) { - *dst[ind] = x; - x >>= 8; - } -} - -} // namespace - -THolder<TTupleLayout> -TTupleLayout::Create(const std::vector<TColumnDesc> &columns) { - - if (NX86::HaveAVX2()) - return MakeHolder<TTupleLayoutFallback<NSimd::TSimdAVX2Traits>>( - columns); - - if (NX86::HaveSSE42()) - return MakeHolder<TTupleLayoutFallback<NSimd::TSimdSSE42Traits>>( - columns); - - return MakeHolder<TTupleLayoutFallback<NSimd::TSimdFallbackTraits>>( - columns); -} - -template <typename TTraits> -TTupleLayoutFallback<TTraits>::TTupleLayoutFallback( - const std::vector<TColumnDesc> &columns) - : TTupleLayout(columns) { - - for (ui32 i = 0, idx = 0; i < OrigColumns.size(); ++i) { - auto &col = OrigColumns[i]; - - col.OriginalIndex = idx; - - if (col.SizeType == EColumnSizeType::Variable) { - // we cannot handle (rare) overflow strings unless we have at least - // space for header; size of inlined strings is limited to 254 - // bytes, limit maximum inline data size - col.DataSize = std::max<ui32>(1 + 2 * sizeof(ui32), - std::min<ui32>(255, col.DataSize)); - idx += 2; // Variable-size takes two columns: one for offsets, and - // another for payload - } else { - idx += 1; - } - - if (col.Role == EColumnRole::Key) { - KeyColumns.push_back(col); - } else { - PayloadColumns.push_back(col); - } - } - - KeyColumnsNum = KeyColumns.size(); - - auto ColumnDescLess = [](const TColumnDesc &a, const TColumnDesc &b) { - if (a.SizeType != b.SizeType) // Fixed first - return a.SizeType == EColumnSizeType::Fixed; - - if (a.DataSize == b.DataSize) - // relative order of (otherwise) same key columns must be preserved - return a.OriginalIndex < b.OriginalIndex; - - return a.DataSize < b.DataSize; - }; - - std::sort(KeyColumns.begin(), KeyColumns.end(), ColumnDescLess); - std::sort(PayloadColumns.begin(), PayloadColumns.end(), ColumnDescLess); - - KeyColumnsFixedEnd = 0; - - ui32 currOffset = 4; // crc32 hash in the beginning - KeyColumnsOffset = currOffset; - KeyColumnsFixedNum = KeyColumnsNum; - - for (ui32 i = 0; i < KeyColumnsNum; ++i) { - auto &col = KeyColumns[i]; - - if (col.SizeType == EColumnSizeType::Variable && - KeyColumnsFixedEnd == 0) { - KeyColumnsFixedEnd = currOffset; - KeyColumnsFixedNum = i; - } - - col.ColumnIndex = i; - col.Offset = currOffset; - Columns.push_back(col); - currOffset += col.DataSize; - } - - KeyColumnsEnd = currOffset; - - if (KeyColumnsFixedEnd == 0) // >= 4 if was ever assigned - KeyColumnsFixedEnd = KeyColumnsEnd; - - KeyColumnsSize = KeyColumnsEnd - KeyColumnsOffset; - BitmaskOffset = currOffset; - - BitmaskSize = (OrigColumns.size() + 7) / 8; - - currOffset += BitmaskSize; - BitmaskEnd = currOffset; - - PayloadOffset = currOffset; - - for (ui32 i = 0; i < PayloadColumns.size(); ++i) { - auto &col = PayloadColumns[i]; - col.ColumnIndex = KeyColumnsNum + i; - col.Offset = currOffset; - Columns.push_back(col); - currOffset += col.DataSize; - } - - PayloadEnd = currOffset; - PayloadSize = PayloadEnd - PayloadOffset; - - TotalRowSize = currOffset; - - for (auto &col : Columns) { - if (col.SizeType == EColumnSizeType::Variable) { - VariableColumns_.push_back(col); - } else if (IsPowerOf2(col.DataSize) && - col.DataSize < (1u << FixedPOTColumns_.size())) { - FixedPOTColumns_[CountTrailingZeroBits(col.DataSize)].push_back( - col); - } else { - FixedNPOTColumns_.push_back(col); - } - } - - /// TODO: dynamic configuration - BlockRows_ = 256; - const bool use_simd = true; - - std::vector<const TColumnDesc *> block_fallback; - std::queue<const TColumnDesc *> next_cols; - - size_t fixed_cols_left = - KeyColumnsFixedNum + - std::accumulate(PayloadColumns.begin(), PayloadColumns.end(), 0ul, - [](size_t prev, const auto &col) { - return prev + - (col.SizeType == EColumnSizeType::Fixed); - }); - - size_t prev_tuple_size; - size_t curr_tuple_size = 0; - - const auto manage_block_packing = [&](const std::vector<TColumnDesc> - &columns) { - for (size_t col_ind = 0; - col_ind != columns.size() && - columns[col_ind].SizeType == EColumnSizeType::Fixed;) { - --fixed_cols_left; - next_cols.push(&columns[col_ind]); - prev_tuple_size = curr_tuple_size; - curr_tuple_size = next_cols.back()->Offset + - next_cols.back()->DataSize - - next_cols.front()->Offset; - - ++col_ind; - if (curr_tuple_size >= TSimd<ui8>::SIZE || - next_cols.size() == kSIMDMaxCols || !fixed_cols_left) { - const bool oversize = curr_tuple_size > TSimd<ui8>::SIZE; - const size_t tuple_size = - oversize ? prev_tuple_size : curr_tuple_size; - const size_t tuple_cols = next_cols.size() - oversize; - - if (!use_simd || !tuple_cols || - (Columns.size() != next_cols.size() && - tuple_size < TSimd<ui8>::SIZE * 7 / 8) || - tuple_size > TSimd<ui8>::SIZE || - (!SIMDBlock_.empty() && - TotalRowSize - next_cols.front()->Offset < - TSimd<ui8>::SIZE)) { - block_fallback.push_back(next_cols.front()); - next_cols.pop(); - continue; - } - - SIMDDesc simd_desc; - simd_desc.Cols = tuple_cols; - simd_desc.PermMaskOffset = SIMDPermMasks_.size(); - simd_desc.RowOffset = next_cols.front()->Offset; - - const TColumnDesc *col_descs[kSIMDMaxCols]; - ui32 col_max_size = 0; - for (ui8 col_ind = 0; col_ind != simd_desc.Cols; ++col_ind) { - col_descs[col_ind] = next_cols.front(); - col_max_size = - std::max(col_max_size, col_descs[col_ind]->DataSize); - next_cols.pop(); - } - - simd_desc.InnerLoopIters = std::min( - size_t(kSIMDMaxInnerLoopSize), - (TSimd<ui8>::SIZE / col_max_size) / - std::max(size_t(1u), size_t(TSimd<ui8>::SIZE / TotalRowSize))); - - const auto tuples_per_register = - std::max(1u, TSimd<ui8>::SIZE / TotalRowSize); - - for (ui8 col_ind = 0; col_ind != simd_desc.Cols; ++col_ind) { - const auto &col_desc = col_descs[col_ind]; - const size_t offset = - col_desc->Offset - simd_desc.RowOffset; - - BlockFixedColsSizes_.push_back(col_desc->DataSize); - BlockColsOffsets_.push_back(offset); - BlockColumnsOrigInds_.push_back(col_desc->OriginalIndex); - } - - for (size_t packing_flag = 1; packing_flag != 3; - ++packing_flag) { - for (ui8 col_ind = 0; col_ind != simd_desc.Cols; - ++col_ind) { - const auto &col_desc = col_descs[col_ind]; - const size_t offset = - col_desc->Offset - simd_desc.RowOffset; - - for (ui8 ind = 0; ind != simd_desc.InnerLoopIters; - ++ind) { - SIMDPermMasks_.push_back( - SIMDPack<TTraits>::BuildTuplePerm( - col_desc->DataSize, - TotalRowSize - col_desc->DataSize, offset, - ind * col_desc->DataSize * - tuples_per_register, - packing_flag % 2)); - } - } - } - - SIMDBlock_.push_back(simd_desc); - } - } - - while (!next_cols.empty()) { - block_fallback.push_back(next_cols.front()); - next_cols.pop(); - } - }; - - manage_block_packing(KeyColumns); - manage_block_packing(PayloadColumns); - - for (const auto col_desc_p : block_fallback) { - BlockColsOffsets_.push_back(col_desc_p->Offset); - BlockFixedColsSizes_.push_back(col_desc_p->DataSize); - BlockColumnsOrigInds_.push_back(col_desc_p->OriginalIndex); - } -} - -// Columns (SoA) format: -// for fixed size: packed data -// for variable size: offset (ui32) into next column; size of colum is -// rowCount + 1 -// -// Row (AoS) format: -// fixed size: packed data -// variable size: -// assumes DataSize <= 255 && DataSize >= 1 + 2*4 -// if size of payload is less than col.DataSize: -// u8 one byte of size (0..254) -// u8 [size] data -// u8 [DataSize - 1 - size] padding -// if size of payload is greater than DataSize: -// u8 = 255 -// u32 = offset in overflow buffer -// u32 = size -// u8 [DataSize - 1 - 2*4] initial bytes of data -// Data is expected to be consistent with isValidBitmask (0 for fixed-size, -// empty for variable-size) -template <> -void TTupleLayoutFallback<NSimd::TSimdFallbackTraits>::Pack( - const ui8 **columns, const ui8 **isValidBitmask, ui8 *res, - std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, - ui32 count) const { - using TTraits = NSimd::TSimdFallbackTraits; - - std::vector<ui64> bitmaskMatrix(BitmaskSize); - - if (auto off = (start % 8)) { - auto bitmaskIdx = start / 8; - - for (ui32 j = Columns.size(); j--;) - bitmaskMatrix[j / 8] |= - ui64(isValidBitmask[Columns[j].OriginalIndex][bitmaskIdx]) - << ((j % 8) * 8); - - for (auto &m : bitmaskMatrix) { - m = transposeBitmatrix(m); - m >>= off * 8; - } - } - - for (; count--; ++start, res += TotalRowSize) { - ui32 hash = 0; - auto bitmaskIdx = start / 8; - - bool anyOverflow = false; - - for (ui32 i = KeyColumnsFixedNum; i < KeyColumns.size(); ++i) { - auto &col = KeyColumns[i]; - ui32 dataOffset = ReadUnaligned<ui32>(columns[col.OriginalIndex] + - sizeof(ui32) * start); - ui32 nextOffset = ReadUnaligned<ui32>(columns[col.OriginalIndex] + - sizeof(ui32) * (start + 1)); - auto size = nextOffset - dataOffset; - - if (size >= col.DataSize) { - anyOverflow = true; - break; - } - } - - if ((start % 8) == 0) { - std::fill(bitmaskMatrix.begin(), bitmaskMatrix.end(), 0); - for (ui32 j = Columns.size(); j--;) - bitmaskMatrix[j / 8] |= - ui64(isValidBitmask[Columns[j].OriginalIndex][bitmaskIdx]) - << ((j % 8) * 8); - for (auto &m : bitmaskMatrix) - m = transposeBitmatrix(m); - } - - for (ui32 j = 0; j < BitmaskSize; ++j) { - res[BitmaskOffset + j] = ui8(bitmaskMatrix[j]); - bitmaskMatrix[j] >>= 8; - } - - for (auto &col : FixedNPOTColumns_) { - std::memcpy(res + col.Offset, - columns[col.OriginalIndex] + start * col.DataSize, - col.DataSize); - } - -#define PackPOTColumn(POT) \ - for (auto &col : FixedPOTColumns_[POT]) { \ - std::memcpy(res + col.Offset, \ - columns[col.OriginalIndex] + start * (1u << POT), \ - 1u << POT); \ - } - - PackPOTColumn(0); - PackPOTColumn(1); - PackPOTColumn(2); - PackPOTColumn(3); - PackPOTColumn(4); -#undef PackPOTColumn - - for (auto &col : VariableColumns_) { - auto dataOffset = ReadUnaligned<ui32>(columns[col.OriginalIndex] + - sizeof(ui32) * start); - auto nextOffset = ReadUnaligned<ui32>(columns[col.OriginalIndex] + - sizeof(ui32) * (start + 1)); - auto size = nextOffset - dataOffset; - auto data = columns[col.OriginalIndex + 1] + dataOffset; - - if (size >= col.DataSize) { - res[col.Offset] = 255; - - ui32 prefixSize = (col.DataSize - 1 - 2 * sizeof(ui32)); - auto overflowSize = size - prefixSize; - auto overflowOffset = overflow.size(); - - overflow.resize(overflowOffset + overflowSize); - - WriteUnaligned<ui32>(res + col.Offset + 1 + 0 * sizeof(ui32), - overflowOffset); - WriteUnaligned<ui32>(res + col.Offset + 1 + 1 * sizeof(ui32), - overflowSize); - std::memcpy(res + col.Offset + 1 + 2 * sizeof(ui32), data, - prefixSize); - std::memcpy(overflow.data() + overflowOffset, data + prefixSize, - overflowSize); - } else { - Y_DEBUG_ABORT_UNLESS(size < 255); - res[col.Offset] = size; - std::memcpy(res + col.Offset + 1, data, size); - std::memset(res + col.Offset + 1 + size, 0, - col.DataSize - (size + 1)); - } - - if (anyOverflow && col.Role == EColumnRole::Key) { - hash = - CalculateCRC32<TTraits>((ui8 *)&size, sizeof(ui32), hash); - hash = CalculateCRC32<TTraits>(data, size, hash); - } - } - - // isValid bitmap is NOT included into hashed data - if (anyOverflow) { - hash = CalculateCRC32<TTraits>( - res + KeyColumnsOffset, KeyColumnsFixedEnd - KeyColumnsOffset, - hash); - } else { - hash = CalculateCRC32<TTraits>(res + KeyColumnsOffset, - KeyColumnsEnd - KeyColumnsOffset); - } - WriteUnaligned<ui32>(res, hash); - } -} - -template <> -void TTupleLayoutFallback<NSimd::TSimdFallbackTraits>::Unpack( - ui8 **columns, ui8 **isValidBitmask, const ui8 *res, - const std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, - ui32 count) const { - std::vector<ui64> bitmaskMatrix(BitmaskSize, 0); - - { - const auto bitmaskIdx = start / 8; - const auto bitmaskShift = start % 8; - const auto bitmaskIdxC = (start + count) / 8; - const auto bitmaskShiftC = (start + count) % 8; - - /// ready first bitmatrix bytes - for (ui32 j = Columns.size(); j--;) - bitmaskMatrix[j / 8] |= - (isValidBitmask[Columns[j].OriginalIndex][bitmaskIdx] & - ~(0xFF << bitmaskShift)) - << ((j % 8) * 8); - - /// ready last (which are same as above) bitmatrix bytes if needed - if (bitmaskIdx == bitmaskIdxC) - for (ui32 j = Columns.size(); j--;) - bitmaskMatrix[j / 8] |= - (isValidBitmask[Columns[j].OriginalIndex][bitmaskIdxC] & - (0xFF << bitmaskShiftC)) - << ((j % 8) * 8); - - for (auto &m : bitmaskMatrix) - m = transposeBitmatrix(m); - } - - for (auto ind = 0; ind != start % 8; ++ind) { - for (ui32 j = 0; j < BitmaskSize; ++j) { - bitmaskMatrix[j] |= - ui64( - (res - (start % 8 - ind) * TotalRowSize)[BitmaskOffset + j]) - << (ind * 8); - } - } - - for (; count--; ++start, res += TotalRowSize) { - const auto bitmaskIdx = start / 8; - const auto bitmaskShift = start % 8; - - for (ui32 j = 0; j < BitmaskSize; ++j) { - bitmaskMatrix[j] |= ui64(res[BitmaskOffset + j]) - << (bitmaskShift * 8); - } - - if (bitmaskShift == 7 || count == 0) { - for (auto &m : bitmaskMatrix) - m = transposeBitmatrix(m); - for (ui32 j = Columns.size(); j--;) - isValidBitmask[Columns[j].OriginalIndex][bitmaskIdx] = - ui8(bitmaskMatrix[j / 8] >> ((j % 8) * 8)); - std::fill(bitmaskMatrix.begin(), bitmaskMatrix.end(), 0); - - if (count && count < 8) { - /// ready last bitmatrix bytes - for (ui32 j = Columns.size(); j--;) - bitmaskMatrix[j / 8] |= - (isValidBitmask[Columns[j].OriginalIndex] - [bitmaskIdx + 1] & - (0xFF << count)) - << ((j % 8) * 8); - - for (auto &m : bitmaskMatrix) - m = transposeBitmatrix(m); - } - } - - for (auto &col : FixedNPOTColumns_) { - std::memcpy(columns[col.OriginalIndex] + start * col.DataSize, - res + col.Offset, col.DataSize); - } - -#define PackPOTColumn(POT) \ - for (auto &col : FixedPOTColumns_[POT]) { \ - std::memcpy(columns[col.OriginalIndex] + start * (1u << POT), \ - res + col.Offset, 1u << POT); \ - } - PackPOTColumn(0); - PackPOTColumn(1); - PackPOTColumn(2); - PackPOTColumn(3); - PackPOTColumn(4); -#undef PackPOTColumn - - for (auto &col : VariableColumns_) { - const auto dataOffset = ReadUnaligned<ui32>( - columns[col.OriginalIndex] + sizeof(ui32) * start); - auto *const data = columns[col.OriginalIndex + 1] + dataOffset; - - ui32 size = ReadUnaligned<ui8>(res + col.Offset); - - if (size < 255) { // embedded str - std::memcpy(data, res + col.Offset + 1, size); - } else { // overflow buffer used - const auto prefixSize = (col.DataSize - 1 - 2 * sizeof(ui32)); - const auto overflowOffset = ReadUnaligned<ui32>( - res + col.Offset + 1 + 0 * sizeof(ui32)); - const auto overflowSize = ReadUnaligned<ui32>( - res + col.Offset + 1 + 1 * sizeof(ui32)); - - std::memcpy(data, res + col.Offset + 1 + 2 * sizeof(ui32), - prefixSize); - std::memcpy(data + prefixSize, overflow.data() + overflowOffset, - overflowSize); - - size = prefixSize + overflowSize; - } - - WriteUnaligned<ui32>(columns[col.OriginalIndex] + - sizeof(ui32) * (start + 1), - dataOffset + size); - } - } -} - -#define MULTI_8_I(C, i) \ - C(i, 0) C(i, 1) C(i, 2) C(i, 3) C(i, 4) C(i, 5) C(i, 6) C(i, 7) -#define MULTI_8(C, A) \ - C(A, 0) C(A, 1) C(A, 2) C(A, 3) C(A, 4) C(A, 5) C(A, 6) C(A, 7) - -template <typename TTraits> -void TTupleLayoutFallback<TTraits>::Pack( - const ui8 **columns, const ui8 **isValidBitmask, ui8 *res, - std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, - ui32 count) const { - std::vector<const ui8 *> block_columns; - for (const auto col_ind : BlockColumnsOrigInds_) { - block_columns.push_back(columns[col_ind]); - } - - for (size_t row_ind = 0; row_ind < count; row_ind += BlockRows_) { - const size_t cur_block_size = std::min(count - row_ind, BlockRows_); - size_t cols_past = 0; - - for (const auto &simd_block : SIMDBlock_) { -#define CASE(i, j) \ - case i *kSIMDMaxCols + j: \ - SIMDPack<TTraits>::template PackTupleOrImpl<i + 1, j + 1>( \ - block_columns.data() + cols_past, res + simd_block.RowOffset, \ - cur_block_size, BlockFixedColsSizes_.data() + cols_past, \ - BlockColsOffsets_.data() + cols_past, TotalRowSize, \ - SIMDPermMasks_.data() + simd_block.PermMaskOffset, start); \ - break; - - switch ((simd_block.InnerLoopIters - 1) * kSIMDMaxCols + - simd_block.Cols - 1) { - MULTI_8(MULTI_8_I, CASE) - - default: - std::abort(); - } - -#undef CASE - - cols_past += simd_block.Cols; - } - - PackTupleFallbackColImpl( - block_columns.data() + cols_past, res, - BlockColsOffsets_.size() - cols_past, cur_block_size, - BlockFixedColsSizes_.data() + cols_past, - BlockColsOffsets_.data() + cols_past, TotalRowSize, start); - - for (ui32 cols_ind = 0; cols_ind < Columns.size(); cols_ind += 8) { - const ui8 *bitmasks[8]; - const size_t cols = std::min<size_t>(8ul, Columns.size() - cols_ind); - for (size_t ind = 0; ind != cols; ++ind) { - const auto &col = Columns[cols_ind + ind]; - bitmasks[ind] = isValidBitmask[col.OriginalIndex] + start / 8; - } - const ui8 ones_byte = 0xFF; - for (size_t ind = cols; ind != 8; ++ind) { - // dereferencable + all-ones fast path - bitmasks[ind] = &ones_byte; - } - - const auto advance_masks = [&] { - for (size_t ind = 0; ind != cols; ++ind) { - ++bitmasks[ind]; - } - }; - - const size_t first_full_byte = - std::min<size_t>((8ul - start) & 7, cur_block_size); - size_t block_row_ind = 0; - - const auto simple_mask_transpose = [&](const size_t until) { - for (; block_row_ind < until; ++block_row_ind) { - const auto shift = (start + block_row_ind) % 8; - - const auto new_res = res + block_row_ind * TotalRowSize; - const auto res = new_res; - - res[BitmaskOffset + cols_ind / 8] = 0; - for (size_t col_ind = 0; col_ind != cols; ++col_ind) { - res[BitmaskOffset + cols_ind / 8] |= - ((bitmasks[col_ind][0] >> shift) & 1u) << col_ind; - } - } - }; - - simple_mask_transpose(first_full_byte); - if (first_full_byte) { - advance_masks(); - } - - for (; block_row_ind + 7 < cur_block_size; block_row_ind += 8) { - transposeBitmatrix(res + block_row_ind * TotalRowSize + - BitmaskOffset + cols_ind / 8, - bitmasks, TotalRowSize); - advance_masks(); - } - - simple_mask_transpose(cur_block_size); - } - - for (size_t block_row_ind = 0; block_row_ind != cur_block_size; - ++block_row_ind) { - - const auto new_start = start + block_row_ind; - const auto start = new_start; - - const auto new_res = res + block_row_ind * TotalRowSize; - const auto res = new_res; - - ui32 hash = 0; - bool anyOverflow = false; - - for (ui32 i = KeyColumnsFixedNum; i < KeyColumns.size(); ++i) { - auto &col = KeyColumns[i]; - auto dataOffset = ReadUnaligned<ui32>( - columns[col.OriginalIndex] + sizeof(ui32) * start); - auto nextOffset = ReadUnaligned<ui32>( - columns[col.OriginalIndex] + sizeof(ui32) * (start + 1)); - auto size = nextOffset - dataOffset; - - if (size >= col.DataSize) { - anyOverflow = true; - break; - } - } - - for (auto &col : VariableColumns_) { - auto dataOffset = ReadUnaligned<ui32>( - columns[col.OriginalIndex] + sizeof(ui32) * start); - auto nextOffset = ReadUnaligned<ui32>( - columns[col.OriginalIndex] + sizeof(ui32) * (start + 1)); - auto size = nextOffset - dataOffset; - auto data = columns[col.OriginalIndex + 1] + dataOffset; - if (size >= col.DataSize) { - res[col.Offset] = 255; - - auto prefixSize = (col.DataSize - 1 - 2 * sizeof(ui32)); - auto overflowSize = size - prefixSize; - auto overflowOffset = overflow.size(); - - overflow.resize(overflowOffset + overflowSize); - - WriteUnaligned<ui32>(res + col.Offset + 1 + - 0 * sizeof(ui32), - overflowOffset); - WriteUnaligned<ui32>( - res + col.Offset + 1 + 1 * sizeof(ui32), overflowSize); - std::memcpy(res + col.Offset + 1 + 2 * sizeof(ui32), data, - prefixSize); - std::memcpy(overflow.data() + overflowOffset, - data + prefixSize, overflowSize); - } else { - Y_DEBUG_ABORT_UNLESS(size < 255); - res[col.Offset] = size; - std::memcpy(res + col.Offset + 1, data, size); - std::memset(res + col.Offset + 1 + size, 0, - col.DataSize - (size + 1)); - } - if (anyOverflow && col.Role == EColumnRole::Key) { - hash = CalculateCRC32<TTraits>((ui8 *)&size, sizeof(ui32), - hash); - hash = CalculateCRC32<TTraits>(data, size, hash); - } - } - - // isValid bitmap is NOT included into hashed data - if (anyOverflow) { - hash = CalculateCRC32<TTraits>( - res + KeyColumnsOffset, - KeyColumnsFixedEnd - KeyColumnsOffset, hash); - } else { - hash = CalculateCRC32<TTraits>( - res + KeyColumnsOffset, KeyColumnsEnd - KeyColumnsOffset); - } - WriteUnaligned<ui32>(res, hash); - } - - start += cur_block_size; - res += cur_block_size * TotalRowSize; - } -} - -template <typename TTraits> -void TTupleLayoutFallback<TTraits>::Unpack( - ui8 **columns, ui8 **isValidBitmask, const ui8 *res, - const std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, - ui32 count) const { - - std::vector<ui8 *> block_columns; - for (const auto col_ind : BlockColumnsOrigInds_) { - block_columns.push_back(columns[col_ind]); - } - - for (size_t row_ind = 0; row_ind < count; row_ind += BlockRows_) { - const size_t cur_block_size = std::min(count - row_ind, BlockRows_); - size_t cols_past = 0; - - for (const auto &simd_block : SIMDBlock_) { -#define CASE(i, j) \ - case i *kSIMDMaxCols + j: \ - SIMDPack<TTraits>::template UnpackTupleOrImpl<i + 1, j + 1>( \ - res + simd_block.RowOffset, block_columns.data() + cols_past, \ - cur_block_size, BlockFixedColsSizes_.data() + cols_past, \ - BlockColsOffsets_.data() + cols_past, TotalRowSize, \ - SIMDPermMasks_.data() + simd_block.PermMaskOffset + i * j, start); \ - break; - - switch ((simd_block.InnerLoopIters - 1) * kSIMDMaxCols + - simd_block.Cols - 1) { - MULTI_8(MULTI_8_I, CASE) - - default: - std::abort(); - } - -#undef CASE - - cols_past += simd_block.Cols; - } - - UnpackTupleFallbackColImpl( - res, block_columns.data() + cols_past, - BlockColsOffsets_.size() - cols_past, cur_block_size, - BlockFixedColsSizes_.data() + cols_past, - BlockColsOffsets_.data() + cols_past, TotalRowSize, start); - - for (ui32 cols_ind = 0; cols_ind < Columns.size(); cols_ind += 8) { - ui8 *bitmasks[8]; - const size_t cols = std::min<size_t>(8ul, Columns.size() - cols_ind); - for (size_t ind = 0; ind != cols; ++ind) { - const auto &col = Columns[cols_ind + ind]; - bitmasks[ind] = isValidBitmask[col.OriginalIndex] + start / 8; - } - ui8 trash_byte; - for (size_t ind = cols; ind != 8; ++ind) { - bitmasks[ind] = &trash_byte; // dereferencable - } - - const auto advance_masks = [&] { - for (size_t ind = 0; ind != cols; ++ind) { - ++bitmasks[ind]; - } - }; - - const size_t first_full_byte = - std::min<size_t>((8ul - start) & 7, cur_block_size); - size_t block_row_ind = 0; - - const auto simple_mask_transpose = [&](const size_t until) { - for (size_t col_ind = 0; - block_row_ind != until && col_ind != cols; ++col_ind) { - auto col_bitmask = - bitmasks[col_ind][0] & ~((0xFF << (block_row_ind & 7)) ^ - (0xFF << (until & 7))); - - for (size_t row_ind = block_row_ind; row_ind < until; - ++row_ind) { - const auto shift = (start + row_ind) % 8; - - const auto new_res = res + row_ind * TotalRowSize; - const auto res = new_res; - - col_bitmask |= - ((res[BitmaskOffset + cols_ind / 8] >> col_ind) & - 1u) - << shift; - } - - bitmasks[col_ind][0] = col_bitmask; - } - block_row_ind = until; - }; - - simple_mask_transpose(first_full_byte); - if (first_full_byte) { - advance_masks(); - } - - for (; block_row_ind + 7 < cur_block_size; block_row_ind += 8) { - transposeBitmatrix(bitmasks, - res + block_row_ind * TotalRowSize + - BitmaskOffset + cols_ind / 8, - TotalRowSize); - advance_masks(); - } - - simple_mask_transpose(cur_block_size); - } - - for (size_t block_row_ind = 0; block_row_ind != cur_block_size; - ++block_row_ind) { - - const auto new_start = start + block_row_ind; - const auto start = new_start; - - const auto new_res = res + block_row_ind * TotalRowSize; - const auto res = new_res; - - for (auto &col : VariableColumns_) { - const auto dataOffset = ReadUnaligned<ui32>( - columns[col.OriginalIndex] + sizeof(ui32) * start); - auto *const data = columns[col.OriginalIndex + 1] + dataOffset; - - ui32 size = ReadUnaligned<ui8>(res + col.Offset); - - if (size < 255) { // embedded str - std::memcpy(data, res + col.Offset + 1, size); - } else { // overflow buffer used - const auto prefixSize = - (col.DataSize - 1 - 2 * sizeof(ui32)); - const auto overflowOffset = ReadUnaligned<ui32>( - res + col.Offset + 1 + 0 * sizeof(ui32)); - const auto overflowSize = ReadUnaligned<ui32>( - res + col.Offset + 1 + 1 * sizeof(ui32)); - - std::memcpy(data, res + col.Offset + 1 + 2 * sizeof(ui32), - prefixSize); - std::memcpy(data + prefixSize, - overflow.data() + overflowOffset, overflowSize); - - size = prefixSize + overflowSize; - } - - WriteUnaligned<ui32>(columns[col.OriginalIndex] + - sizeof(ui32) * (start + 1), - dataOffset + size); - } - } - - start += cur_block_size; - res += cur_block_size * TotalRowSize; - } -} - -template __attribute__((target("avx2"))) void -TTupleLayoutFallback<NSimd::TSimdAVX2Traits>::Pack( - const ui8 **columns, const ui8 **isValidBitmask, ui8 *res, - std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, - ui32 count) const; -template __attribute__((target("sse4.2"))) void -TTupleLayoutFallback<NSimd::TSimdSSE42Traits>::Pack( - const ui8 **columns, const ui8 **isValidBitmask, ui8 *res, - std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, - ui32 count) const; - -template __attribute__((target("avx2"))) void -TTupleLayoutFallback<NSimd::TSimdAVX2Traits>::Unpack( - ui8 **columns, ui8 **isValidBitmask, const ui8 *res, - const std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, - ui32 count) const; -template __attribute__((target("sse4.2"))) void -TTupleLayoutFallback<NSimd::TSimdSSE42Traits>::Unpack( - ui8 **columns, ui8 **isValidBitmask, const ui8 *res, - const std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, - ui32 count) const; - -} // namespace NPackedTuple -} // namespace NMiniKQL -} // namespace NKikimr diff --git a/yql/essentials/minikql/comp_nodes/packed_tuple/tuple.h b/yql/essentials/minikql/comp_nodes/packed_tuple/tuple.h deleted file mode 100644 index 3c6b76750c..0000000000 --- a/yql/essentials/minikql/comp_nodes/packed_tuple/tuple.h +++ /dev/null @@ -1,136 +0,0 @@ -#pragma once - -#include <yql/essentials/minikql/mkql_node.h> -#include <yql/essentials/public/udf/udf_data_type.h> -#include <yql/essentials/public/udf/udf_types.h> - -#include <util/generic/buffer.h> - -#include <util/system/cpu_id.h> -#include <contrib/ydb/library/yql/utils/simd/simd.h> - -namespace NKikimr { -namespace NMiniKQL { -namespace NPackedTuple { - -// Defines if data type of particular column variable or fixed -enum class EColumnSizeType { Fixed, Variable }; - -// Defines if particular column is key column or payload column -enum class EColumnRole { Key, Payload }; - -// Describes layout and size of particular column -struct TColumnDesc { - ui32 ColumnIndex = 0; // Index of the column in particular layout - ui32 OriginalIndex = 0; // Index of the column in input representation - EColumnRole Role = EColumnRole::Payload; // Role of the particular column in - // tuple (Key or Payload) - EColumnSizeType SizeType = - EColumnSizeType::Fixed; // Fixed size or variable size column - ui32 DataSize = 0; // Size of the column in bytes for fixed size part - // Must be same for matching key columns - ui32 Offset = - 0; // Offset in bytes for column value from the beginning of tuple -}; - -// Defines in memory layout of tuple. -struct TTupleLayout { - std::vector<TColumnDesc> OrigColumns; // Columns description and order as - // passed during layout construction - std::vector<TColumnDesc> Columns; // Vector describing all columns in order - // corresponding to tuple layout - std::vector<TColumnDesc> KeyColumns; // Vector describing key columns - std::vector<TColumnDesc> - PayloadColumns; // Vector describing payload columns - ui32 KeyColumnsNum; // Total number of key columns - ui32 KeyColumnsSize; // Total size of all key columns in bytes - ui32 KeyColumnsOffset; // Start of row-packed keys data - ui32 KeyColumnsFixedEnd; // Offset in row-packed keys data of first variable - // key (can be same as KeyColumnsEnd, if there are - // none) - ui32 KeyColumnsFixedNum; // Number of fixed-size columns - ui32 KeyColumnsEnd; // First byte after key columns. Start of bitmask for - // row-based columns - ui32 BitmaskSize; // Size of bitmask for null values flag in columns - ui32 BitmaskOffset; // Offset of nulls bitmask. = KeyColumnsEnd - ui32 BitmaskEnd; // First byte after bitmask. = PayloadOffset - ui32 PayloadSize; // Total size in bytes of the payload columns - ui32 PayloadOffset; // Offset of payload values. = BitmaskEnd. - ui32 PayloadEnd; // First byte after payload - ui32 TotalRowSize; // Total size of bytes for packed row - - // Creates new tuple layout based on provided columns description. - static THolder<TTupleLayout> - Create(const std::vector<TColumnDesc> &columns); - - TTupleLayout(const std::vector<TColumnDesc> &columns) - : OrigColumns(columns) {} - virtual ~TTupleLayout() {} - - // Takes array of pointer to columns, array of validity bitmaps, - // outputs packed rows - virtual void Pack(const ui8 **columns, const ui8 **isValidBitmask, ui8 *res, - std::vector<ui8, TMKQLAllocator<ui8>> &overflow, - ui32 start, ui32 count) const = 0; - - // Takes packed rows, - // outputs array of pointer to columns, array of validity bitmaps - virtual void Unpack(ui8 **columns, ui8 **isValidBitmask, const ui8 *res, - const std::vector<ui8, TMKQLAllocator<ui8>> &overflow, - ui32 start, ui32 count) const = 0; -}; - -template <typename TTrait> struct TTupleLayoutFallback : public TTupleLayout { - - TTupleLayoutFallback(const std::vector<TColumnDesc> &columns); - - void Pack(const ui8 **columns, const ui8 **isValidBitmask, ui8 *res, - std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, - ui32 count) const override; - - void Unpack(ui8 **columns, ui8 **isValidBitmask, const ui8 *res, - const std::vector<ui8, TMKQLAllocator<ui8>> &overflow, - ui32 start, ui32 count) const override; - - private: - std::array<std::vector<TColumnDesc>, 5> - FixedPOTColumns_; // Fixed-size columns for power-of-two sizes from 1 to - // 16 bytes - std::vector<TColumnDesc> FixedNPOTColumns_; // Remaining fixed-size columns - std::vector<TColumnDesc> VariableColumns_; // Variable-size columns only - using TSimdI8 = typename TTrait::TSimdI8; - template <class T> using TSimd = typename TTrait::template TSimd8<T>; - - static constexpr ui8 kSIMDMaxCols = 8; - static constexpr ui8 kSIMDMaxInnerLoopSize = 8; - - size_t BlockRows_; // Estimated rows per cache block - std::vector<size_t> BlockColsOffsets_; - std::vector<size_t> BlockFixedColsSizes_; - std::vector<size_t> BlockColumnsOrigInds_; - - struct SIMDDesc { - ui8 InnerLoopIters; - ui8 Cols; - size_t PermMaskOffset; - size_t RowOffset; - }; - std::vector<SIMDDesc> SIMDBlock_; // SIMD iterations description - std::vector<TSimd<ui8>> SIMDPermMasks_; // SIMD precomputed masks -}; - -template <> -void TTupleLayoutFallback<NSimd::TSimdFallbackTraits>::Pack( - const ui8 **columns, const ui8 **isValidBitmask, ui8 *res, - std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, - ui32 count) const; - -template <> -void TTupleLayoutFallback<NSimd::TSimdFallbackTraits>::Unpack( - ui8 **columns, ui8 **isValidBitmask, const ui8 *res, - const std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, - ui32 count) const; - -} // namespace NPackedTuple -} // namespace NMiniKQL -} // namespace NKikimr diff --git a/yql/essentials/minikql/comp_nodes/packed_tuple/ut/ya.make b/yql/essentials/minikql/comp_nodes/packed_tuple/ut/ya.make deleted file mode 100644 index 4d27ab3b99..0000000000 --- a/yql/essentials/minikql/comp_nodes/packed_tuple/ut/ya.make +++ /dev/null @@ -1,41 +0,0 @@ -UNITTEST_FOR(yql/essentials/minikql/comp_nodes/packed_tuple) - -IF (SANITIZER_TYPE OR NOT OPENSOURCE) - REQUIREMENTS(ram:32) -ENDIF() - -IF (SANITIZER_TYPE == "thread" OR WITH_VALGRIND) - TIMEOUT(3600) - SIZE(LARGE) - TAG(ya:fat) -ELSE() - TIMEOUT(600) - SIZE(MEDIUM) -ENDIF() - - -SRCS( - packed_tuple_ut.cpp -) - -PEERDIR( - yql/essentials/public/udf - yql/essentials/public/udf/arrow - yql/essentials/public/udf/service/exception_policy - yql/essentials/sql/pg_dummy -) - -CFLAGS( - -mprfchw -) - -YQL_LAST_ABI_VERSION() - -IF (MKQL_RUNTIME_VERSION) - CFLAGS( - -DMKQL_RUNTIME_VERSION=$MKQL_RUNTIME_VERSION - ) -ENDIF() - - -END() diff --git a/yql/essentials/minikql/comp_nodes/packed_tuple/ya.make b/yql/essentials/minikql/comp_nodes/packed_tuple/ya.make deleted file mode 100644 index fa3ffbf859..0000000000 --- a/yql/essentials/minikql/comp_nodes/packed_tuple/ya.make +++ /dev/null @@ -1,28 +0,0 @@ -LIBRARY() - -SRCS( - tuple.cpp -) - -PEERDIR( - contrib/libs/apache/arrow - yql/essentials/types/binary_json - yql/essentials/minikql - yql/essentials/utils - yql/essentials/utils/log - library/cpp/digest/crc32c -) - -CFLAGS( - -mprfchw - -mavx2 - -DMKQL_DISABLE_CODEGEN -) - -YQL_LAST_ABI_VERSION() - -END() - -RECURSE_FOR_TESTS( - ut -) diff --git a/yql/essentials/minikql/comp_nodes/ya.make b/yql/essentials/minikql/comp_nodes/ya.make index 1b2cc49327..fa7a96b61a 100644 --- a/yql/essentials/minikql/comp_nodes/ya.make +++ b/yql/essentials/minikql/comp_nodes/ya.make @@ -13,7 +13,6 @@ END() RECURSE( llvm14 no_llvm - packed_tuple ) RECURSE_FOR_TESTS( diff --git a/yql/essentials/minikql/comp_nodes/ya.make.inc b/yql/essentials/minikql/comp_nodes/ya.make.inc index 89b315fa39..3c1531eac7 100644 --- a/yql/essentials/minikql/comp_nodes/ya.make.inc +++ b/yql/essentials/minikql/comp_nodes/ya.make.inc @@ -160,7 +160,6 @@ PEERDIR( yql/essentials/public/udf/arrow yql/essentials/parser/pg_wrapper/interface yql/essentials/utils - contrib/ydb/library/actors/core yql/essentials/public/issue/protos ) diff --git a/yql/essentials/minikql/computation/mkql_computation_pattern_cache_ut.cpp b/yql/essentials/minikql/computation/mkql_computation_pattern_cache_ut.cpp index 0a28de9162..400f77e9d1 100644 --- a/yql/essentials/minikql/computation/mkql_computation_pattern_cache_ut.cpp +++ b/yql/essentials/minikql/computation/mkql_computation_pattern_cache_ut.cpp @@ -11,7 +11,6 @@ #include <yql/essentials/minikql/computation/mkql_computation_node_impl.h> #include <yql/essentials/minikql/invoke_builtins/mkql_builtins.h> #include <yql/essentials/minikql/comp_nodes/mkql_factories.h> -#include <contrib/ydb/library/yql/dq/proto/dq_tasks.pb.h> #include <library/cpp/testing/unittest/registar.h> diff --git a/yql/essentials/minikql/computation/mkql_spiller_factory.h b/yql/essentials/minikql/computation/mkql_spiller_factory.h index c9c1bfab38..0d6dffe673 100644 --- a/yql/essentials/minikql/computation/mkql_spiller_factory.h +++ b/yql/essentials/minikql/computation/mkql_spiller_factory.h @@ -2,7 +2,9 @@ #include "mkql_spiller.h" -#include <contrib/ydb/library/yql/dq/actors/spilling/spilling_counters.h> +namespace NYql::NDq { +struct TSpillingTaskCounters; +} namespace NKikimr::NMiniKQL { @@ -11,7 +13,7 @@ class ISpillerFactory : private TNonCopyable public: virtual ISpiller::TPtr CreateSpiller() = 0; - virtual void SetTaskCounters(TIntrusivePtr<NYql::NDq::TSpillingTaskCounters> spillingTaskCounters) = 0; + virtual void SetTaskCounters(const TIntrusivePtr<NYql::NDq::TSpillingTaskCounters>& spillingTaskCounters) = 0; virtual ~ISpillerFactory(){} }; diff --git a/yql/essentials/minikql/computation/mock_spiller_factory_ut.h b/yql/essentials/minikql/computation/mock_spiller_factory_ut.h index 9ea74569b1..c053b2c52e 100644 --- a/yql/essentials/minikql/computation/mock_spiller_factory_ut.h +++ b/yql/essentials/minikql/computation/mock_spiller_factory_ut.h @@ -8,7 +8,7 @@ namespace NKikimr::NMiniKQL { class TMockSpillerFactory : public ISpillerFactory { public: - void SetTaskCounters(TIntrusivePtr<NYql::NDq::TSpillingTaskCounters> /*spillingTaskCounters*/) override { + void SetTaskCounters(const TIntrusivePtr<NYql::NDq::TSpillingTaskCounters>& /*spillingTaskCounters*/) override { } ISpiller::TPtr CreateSpiller() override { diff --git a/yql/essentials/minikql/computation/ut/ya.make.inc b/yql/essentials/minikql/computation/ut/ya.make.inc index 0b419140e2..4de4b13fc5 100644 --- a/yql/essentials/minikql/computation/ut/ya.make.inc +++ b/yql/essentials/minikql/computation/ut/ya.make.inc @@ -30,7 +30,6 @@ PEERDIR( library/cpp/threading/local_executor yql/essentials/parser/pg_wrapper yql/essentials/public/udf/service/exception_policy - contrib/ydb/library/yql/dq/proto ) YQL_LAST_ABI_VERSION() diff --git a/yql/essentials/minikql/invoke_builtins/mkql_builtins_convert.cpp b/yql/essentials/minikql/invoke_builtins/mkql_builtins_convert.cpp index 78d62d423d..0f2a676190 100644 --- a/yql/essentials/minikql/invoke_builtins/mkql_builtins_convert.cpp +++ b/yql/essentials/minikql/invoke_builtins/mkql_builtins_convert.cpp @@ -572,12 +572,13 @@ struct TStringConvert { }; NUdf::TUnboxedValuePod JsonToJsonDocument(const NUdf::TUnboxedValuePod value) { - auto binaryJson = NKikimr::NBinaryJson::SerializeToBinaryJson(value.AsStringRef()); - if (!binaryJson.IsSuccess()) { + auto maybeBinaryJson = NKikimr::NBinaryJson::SerializeToBinaryJson(value.AsStringRef()); + if (std::holds_alternative<TString>(maybeBinaryJson)) { // JSON parse error happened, return NULL return NUdf::TUnboxedValuePod(); } - return MakeString(TStringBuf(binaryJson->Data(), binaryJson->Size())); + const auto& binaryJson = std::get<NKikimr::NBinaryJson::TBinaryJson>(maybeBinaryJson); + return MakeString(TStringBuf(binaryJson.Data(), binaryJson.Size())); } struct TJsonToJsonDocumentConvert { diff --git a/yql/essentials/minikql/jsonpath/ut/test_base.cpp b/yql/essentials/minikql/jsonpath/ut/test_base.cpp index feceecddb1..c9b4f5669d 100644 --- a/yql/essentials/minikql/jsonpath/ut/test_base.cpp +++ b/yql/essentials/minikql/jsonpath/ut/test_base.cpp @@ -26,7 +26,7 @@ void TJsonPathTestBase::RunTestCase(const TString& rawJson, const TString& rawJs try { const auto unboxedValueJson = TValue(ParseJson(rawJson)); - const auto binaryJson = *SerializeToBinaryJson(rawJson);; + const auto binaryJson = std::get<TBinaryJson>(SerializeToBinaryJson(rawJson)); auto reader = TBinaryJsonReader::Make(binaryJson); auto binaryJsonRoot = TValue(reader->GetRootCursor()); @@ -77,7 +77,7 @@ void TJsonPathTestBase::RunRuntimeErrorTestCase(const TString& rawJson, const TS try { const auto unboxedValueJson = TValue(ParseJson(rawJson)); - const auto binaryJson = *SerializeToBinaryJson(rawJson); + const auto binaryJson = std::get<TBinaryJson>(SerializeToBinaryJson(rawJson)); auto reader = TBinaryJsonReader::Make(binaryJson); auto binaryJsonRoot = TValue(reader->GetRootCursor()); @@ -105,7 +105,7 @@ void TJsonPathTestBase::RunVariablesTestCase(const TString& rawJson, const THash try { const auto unboxedValueJson = TValue(ParseJson(rawJson)); - const auto binaryJson = *SerializeToBinaryJson(rawJson); + const auto binaryJson = std::get<TBinaryJson>(SerializeToBinaryJson(rawJson)); auto reader = TBinaryJsonReader::Make(binaryJson); auto binaryJsonRoot = TValue(reader->GetRootCursor()); @@ -120,7 +120,7 @@ void TJsonPathTestBase::RunVariablesTestCase(const TString& rawJson, const THash storage.reserve(variables.size()); readers.reserve(variables.size()); for (const auto& it : variables) { - storage.push_back(*SerializeToBinaryJson(it.second)); + storage.push_back(std::get<TBinaryJson>(SerializeToBinaryJson(it.second))); readers.push_back(TBinaryJsonReader::Make(storage.back())); binaryJsonVariables[it.first] = TValue(readers.back()->GetRootCursor()); } diff --git a/yql/essentials/minikql/mkql_type_ops.cpp b/yql/essentials/minikql/mkql_type_ops.cpp index 6b823dbdf5..11762e96c4 100644 --- a/yql/essentials/minikql/mkql_type_ops.cpp +++ b/yql/essentials/minikql/mkql_type_ops.cpp @@ -2518,11 +2518,12 @@ NUdf::TUnboxedValuePod ValueFromString(NUdf::EDataSlot type, NUdf::TStringRef bu case NUdf::EDataSlot::JsonDocument: { auto binaryJson = NKikimr::NBinaryJson::SerializeToBinaryJson(buf); - if (binaryJson.IsFail()) { + if (std::holds_alternative<TString>(binaryJson)) { // JSON parse error happened, return NULL return NUdf::TUnboxedValuePod(); } - return MakeString(TStringBuf(binaryJson->Data(), binaryJson->Size())); + const auto& value = std::get<NKikimr::NBinaryJson::TBinaryJson>(binaryJson); + return MakeString(TStringBuf(value.Data(), value.Size())); } case NUdf::EDataSlot::Decimal: |