diff options
author | fixthgame <fixthgame@yandex-team.com> | 2023-11-03 14:17:02 +0300 |
---|---|---|
committer | fixthgame <fixthgame@yandex-team.com> | 2023-11-03 14:43:57 +0300 |
commit | 95fcb69e4e7b744eed6fcf641bb0161dffbbfcaa (patch) | |
tree | 324589b8ccc975df6c4e182468722c05052d3a4e | |
parent | 60401e3c6fa09feb6f663b912d54566231eb0ef3 (diff) | |
download | ydb-95fcb69e4e7b744eed6fcf641bb0161dffbbfcaa.tar.gz |
TuplePackTest
more logs
add test
16 files changed, 375 insertions, 11 deletions
diff --git a/.mapping.json b/.mapping.json index 808cc52993..027f520e1f 100644 --- a/.mapping.json +++ b/.mapping.json @@ -8790,6 +8790,11 @@ "ydb/library/yql/utils/simd/CMakeLists.txt":"", "ydb/library/yql/utils/simd/CMakeLists.windows-x86_64.txt":"", "ydb/library/yql/utils/simd/exec/CMakeLists.txt":"", + "ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.darwin-x86_64.txt":"", + "ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.linux-aarch64.txt":"", + "ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.linux-x86_64.txt":"", + "ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.txt":"", + "ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.windows-x86_64.txt":"", "ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.darwin-x86_64.txt":"", "ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.linux-aarch64.txt":"", "ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.linux-x86_64.txt":"", diff --git a/ydb/library/yql/utils/simd/exec/CMakeLists.txt b/ydb/library/yql/utils/simd/exec/CMakeLists.txt index 1949327f1e..a3b4da0bea 100644 --- a/ydb/library/yql/utils/simd/exec/CMakeLists.txt +++ b/ydb/library/yql/utils/simd/exec/CMakeLists.txt @@ -6,4 +6,5 @@ # original buildsystem will not be accepted. +add_subdirectory(pack_tuple) add_subdirectory(stream_store) diff --git a/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.darwin-x86_64.txt b/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..ae045da3a7 --- /dev/null +++ b/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,31 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_executable(pack_tuple) +target_compile_options(pack_tuple PRIVATE + -mavx2 +) +target_link_libraries(pack_tuple PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-cpuid_check + yql-utils-simd +) +target_link_options(pack_tuple PRIVATE + -Wl,-platform_version,macos,11.0,11.0 + -fPIC + -fPIC +) +target_sources(pack_tuple PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/library/yql/utils/simd/exec/pack_tuple/main.cpp +) +target_allocator(pack_tuple + system_allocator +) +vcs_info(pack_tuple) diff --git a/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.linux-aarch64.txt b/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..2d36690667 --- /dev/null +++ b/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.linux-aarch64.txt @@ -0,0 +1,36 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_executable(pack_tuple) +target_compile_options(pack_tuple PRIVATE + -mavx2 +) +target_link_libraries(pack_tuple PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + yql-utils-simd +) +target_link_options(pack_tuple PRIVATE + -ldl + -lrt + -Wl,--no-as-needed + -fPIC + -fPIC + -lpthread + -lrt + -ldl +) +target_sources(pack_tuple PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/library/yql/utils/simd/exec/pack_tuple/main.cpp +) +target_allocator(pack_tuple + cpp-malloc-jemalloc +) +vcs_info(pack_tuple) diff --git a/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.linux-x86_64.txt b/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..6299f9b65a --- /dev/null +++ b/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.linux-x86_64.txt @@ -0,0 +1,38 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_executable(pack_tuple) +target_compile_options(pack_tuple PRIVATE + -mavx2 +) +target_link_libraries(pack_tuple PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + library-cpp-cpuid_check + yql-utils-simd +) +target_link_options(pack_tuple PRIVATE + -ldl + -lrt + -Wl,--no-as-needed + -fPIC + -fPIC + -lpthread + -lrt + -ldl +) +target_sources(pack_tuple PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/library/yql/utils/simd/exec/pack_tuple/main.cpp +) +target_allocator(pack_tuple + cpp-malloc-tcmalloc + libs-tcmalloc-no_percpu_cache +) +vcs_info(pack_tuple) diff --git a/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.txt b/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.windows-x86_64.txt b/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..37c13c73e8 --- /dev/null +++ b/ydb/library/yql/utils/simd/exec/pack_tuple/CMakeLists.windows-x86_64.txt @@ -0,0 +1,26 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_executable(pack_tuple) +target_compile_options(pack_tuple PRIVATE + -mavx2 +) +target_link_libraries(pack_tuple PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-cpuid_check + yql-utils-simd +) +target_sources(pack_tuple PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/library/yql/utils/simd/exec/pack_tuple/main.cpp +) +target_allocator(pack_tuple + system_allocator +) +vcs_info(pack_tuple) diff --git a/ydb/library/yql/utils/simd/exec/pack_tuple/main.cpp b/ydb/library/yql/utils/simd/exec/pack_tuple/main.cpp new file mode 100644 index 0000000000..ed9a4e0444 --- /dev/null +++ b/ydb/library/yql/utils/simd/exec/pack_tuple/main.cpp @@ -0,0 +1,175 @@ +#include <util/generic/ptr.h> +#include <util/system/cpu_id.h> +#include <util/system/types.h> + +#include <ydb/library/yql/utils/simd/simd.h> + +struct TPerfomancer { + TPerfomancer() = default; + + struct TWrapWorker { + virtual int PackTuple(bool log) = 0; + virtual ~TWrapWorker() = default; + }; + + template<typename TTraits> + struct TWorker : TWrapWorker { + template<typename T> + using TSimd = typename TTraits::template TSimd8<T>; + TWorker() = default; + + ui8* ShuffleMask(ui32 v[8]) { + ui8* det = new ui8[32]; + for (size_t i = 0; i < 32; i += 1) { + det[i] = v[i / 4] + i % 4; + } + return det; + } + + int PackTupleImpl(bool log = true) { + if (TTraits::Size != 32) + return 1; + const ui64 NTuples = 32 << 18; + const ui64 TupleSize = sizeof(ui32) + sizeof(ui64); + + ui32 *arrUi32 __attribute__((aligned(32))) = new ui32[NTuples]; + ui64 *arrUi64 __attribute__((aligned(32))) = new ui64[NTuples]; + + for (ui32 i = 0; i < NTuples; i++) { + arrUi32[i] = 2 * i; + } + + for (ui32 i = 0; i < NTuples; i++) { + arrUi64[i] = 2 * i + 1; + } + + TSimd<ui8> readReg1, readReg2, readReg1Fwd; + + TSimd<ui8> permReg11, permReg21; + TSimd<ui8> permReg12, permReg22; + + TSimd<ui8> permIdx11(ShuffleMask((ui32[8]) {0, 0, 0, 0, 1, 0, 0, 0})); + TSimd<ui8> permIdx12(ShuffleMask((ui32[8]) {0, 0, 0, 0, 3, 0, 0, 2})); + TSimd<ui8> permIdx1f(ShuffleMask((ui32[8]) {7, 7, 7, 7, 7, 6, 5, 4})); + + TSimd<ui8> permIdx21(ShuffleMask((ui32[8]) {0, 0, 3, 2, 0, 1, 0, 0})); + TSimd<ui8> permIdx22(ShuffleMask((ui32[8]) {0, 0, 7, 6, 0, 5, 4, 0})); + + ui32 val1[8], val2[8]; // val3[8]; + + using TReg = typename TTraits::TRegister; + TSimd<ui8> blended1, blended2; + + TReg *addr1 = (TReg*) arrUi32; + TReg *addr2 = (TReg*) arrUi64; + + std::chrono::steady_clock::time_point begin01 = + std::chrono::steady_clock::now(); + + ui64 accum1 = 0; + ui64 accum2 = 0; + ui64 accum3 = 0; + ui64 accum4 = 0; + + const int blendMask = 0b00110110; + + ui32 hash1 = 0; + ui32 hash2 = 0; + ui32 hash3 = 0; + ui32 hash4 = 0; + + for (ui32 i = 0; i < NTuples; i += 8) { + readReg1 = TSimd<ui8>((ui8*) addr1); + for (ui32 j = 0; j < 2; j++) { + + permReg11 = readReg1.Shuffle(permIdx11); + readReg2 = TSimd<ui8>((ui8*) addr2); + addr2++; + permReg21 = readReg2.Shuffle(permIdx21); + blended1 = permReg11.template Blend16<blendMask>(permReg21); + blended1.Store((ui8*) val1); + + hash1 = TSimd<ui8>::CRC32u32(0, val1[0]); + hash2 = TSimd<ui8>::CRC32u32(0, val1[3]); + + accum1 += hash1; + accum2 += hash2; + + permReg12 = readReg1.Shuffle(permIdx12); + permReg22 = readReg2.Shuffle(permIdx22); + blended2 = permReg12.template Blend16<blendMask>(permReg12); + blended2.Store((ui8*) val2); + + hash3 = TSimd<ui8>::CRC32u32(0, val2[0]); + hash4 = TSimd<ui8>::CRC32u32(0, val2[3]); + + accum3 += hash3; + accum4 += hash4; + + readReg1Fwd = readReg1.Shuffle(permIdx1f); + readReg1Fwd.Store((ui8*) &readReg1.Value); + + } + addr1++; + } + + Cerr << "Loaded col1 "; + readReg1.template Log<ui32>(Cerr); + Cerr << "Loaded col2 "; + readReg2.template Log<ui32>(Cerr);; + Cerr << "Permuted col1 "; + permReg11.template Log<ui32>(Cerr);; + Cerr << "Permuted col2 "; + permReg21.template Log<ui32>(Cerr); + Cerr << "Blended "; + blended1.template Log<ui32>(Cerr); + + + std::chrono::steady_clock::time_point end01 = + std::chrono::steady_clock::now(); + + ui64 microseconds = + std::chrono::duration_cast<std::chrono::microseconds>(end01 - begin01).count(); + if (log) { + Cerr << "Accum 1 2 hash: " << accum1 << " " << accum2 << " " << accum3 << " " << accum4 << " " + << hash1 << " " << hash2 << " " << hash3 << " " << hash4 << Endl; + Cerr << "Time for stream load = " << microseconds << "[microseconds]" + << Endl; + Cerr << "Data size = " << ((NTuples * TupleSize) / (1024 * 1024)) + << " [MB]" << Endl; + Cerr << "Stream load/save/accum speed = " + << (NTuples * TupleSize * 1000 * 1000) / + (1024 * 1024 * (microseconds + 1)) + << " MB/sec" << Endl; + Cerr << Endl; + } + delete[] arrUi32; + delete[] arrUi64; + + return 1; + } + + int PackTuple(bool log = true) override { + return PackTupleImpl(log); + } + + ~TWorker() = default; + }; + + template<typename TTraits> + THolder<TWrapWorker> Create() const { + return MakeHolder<TWorker<TTraits>>(); + }; +}; + +int main() { + if (!NX86::HaveAVX2()) + return 0; + + TPerfomancer tp; + auto worker = tp.Create<NSimd::TSimdAVX2Traits>(); + + bool fine = true; + fine &= worker->PackTuple(true); + return !fine; +}
\ No newline at end of file diff --git a/ydb/library/yql/utils/simd/exec/pack_tuple/ya.make b/ydb/library/yql/utils/simd/exec/pack_tuple/ya.make new file mode 100644 index 0000000000..61c6742c2a --- /dev/null +++ b/ydb/library/yql/utils/simd/exec/pack_tuple/ya.make @@ -0,0 +1,9 @@ +PROGRAM() + +SRCS(main.cpp) + +CFLAGS(-mavx2) + +PEERDIR(ydb/library/yql/utils/simd) + +END()
\ No newline at end of file diff --git a/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.darwin-x86_64.txt b/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.darwin-x86_64.txt index a99ac27d6f..e5df3ab627 100644 --- a/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.darwin-x86_64.txt +++ b/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.darwin-x86_64.txt @@ -12,6 +12,7 @@ target_link_libraries(stream_store PUBLIC contrib-libs-cxxsupp yutil library-cpp-cpuid_check + yql-utils-simd ) target_link_options(stream_store PRIVATE -Wl,-platform_version,macos,11.0,11.0 diff --git a/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.linux-aarch64.txt b/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.linux-aarch64.txt index 049d3baab7..d4dd5985c4 100644 --- a/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.linux-aarch64.txt +++ b/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.linux-aarch64.txt @@ -12,6 +12,7 @@ target_link_libraries(stream_store PUBLIC contrib-libs-linux-headers contrib-libs-cxxsupp yutil + yql-utils-simd ) target_link_options(stream_store PRIVATE -ldl diff --git a/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.linux-x86_64.txt b/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.linux-x86_64.txt index cbf6c43e02..2dbcba4a9e 100644 --- a/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.linux-x86_64.txt +++ b/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.linux-x86_64.txt @@ -13,6 +13,7 @@ target_link_libraries(stream_store PUBLIC contrib-libs-cxxsupp yutil library-cpp-cpuid_check + yql-utils-simd ) target_link_options(stream_store PRIVATE -ldl diff --git a/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.windows-x86_64.txt b/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.windows-x86_64.txt index a00f4d06e7..c5757a9974 100644 --- a/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.windows-x86_64.txt +++ b/ydb/library/yql/utils/simd/exec/stream_store/CMakeLists.windows-x86_64.txt @@ -12,6 +12,7 @@ target_link_libraries(stream_store PUBLIC contrib-libs-cxxsupp yutil library-cpp-cpuid_check + yql-utils-simd ) target_sources(stream_store PRIVATE ${CMAKE_SOURCE_DIR}/ydb/library/yql/utils/simd/exec/stream_store/main.cpp diff --git a/ydb/library/yql/utils/simd/exec/stream_store/main.cpp b/ydb/library/yql/utils/simd/exec/stream_store/main.cpp index 4a25a86852..b879bf6fd7 100644 --- a/ydb/library/yql/utils/simd/exec/stream_store/main.cpp +++ b/ydb/library/yql/utils/simd/exec/stream_store/main.cpp @@ -1,4 +1,3 @@ - #include <util/generic/ptr.h> #include <util/system/cpu_id.h> #include <util/system/types.h> @@ -9,7 +8,7 @@ struct TPerfomancer { TPerfomancer() = default; struct TWrapWorker { - virtual int StreamLoad(bool log) = 0; + virtual int StoreStream(bool log) = 0; virtual ~TWrapWorker() = default; }; @@ -29,7 +28,7 @@ struct TPerfomancer { } } - int StreamLoad(bool log = true) override { + int StoreStream(bool log = true) override { const size_t batch = 32 / TTraits::Size; const size_t batch_size = TTraits::Size / 8; size_t log_batch_size = 0; @@ -41,8 +40,15 @@ struct TPerfomancer { log_batch_size = 2; } - size_t size = (32LL << 21); - i64* buf __attribute__((aligned(32))) = new i64[size]; + const size_t size = (32LL << 21); + const size_t arrSize = size / 8; + + i64* buf __attribute__((aligned(32))) = new i64[arrSize]; + + for (size_t i = 0; i < arrSize; i += 1) { + buf[i] = 0; + } + i64 tmp[4]; for (size_t i = 0; i < 4; i += 1) { @@ -56,21 +62,18 @@ struct TPerfomancer { std::chrono::steady_clock::time_point begin01 = std::chrono::steady_clock::now(); - const size_t size_loop = size / 8; - - for (size_t i = 0; i < size_loop; i += 4) { + for (size_t i = 0; i < arrSize; i += 4) { for (size_t j = 0; j < batch; j += 1) { tmpSimd[j].StoreStream((i8*)(buf + i + j * batch_size)); } } - std::chrono::steady_clock::time_point end01 = std::chrono::steady_clock::now(); bool is_ok = true; - for (size_t i = 0; i < size_loop; i += 1) { + for (size_t i = 0; i < arrSize; i += 1) { if (buf[i] != i % 4) { is_ok = false; } @@ -94,6 +97,14 @@ struct TPerfomancer { return is_ok; } + ui8* ShuffleMask(ui32 v[8]) { + ui8* det = new ui8[32]; + for (size_t i = 0; i < 32; i += 1) { + det[i] = v[i / 4] + i % 4; + } + return det; + } + ~TWorker() = default; }; @@ -106,5 +117,8 @@ struct TPerfomancer { int main() { TPerfomancer tp; auto worker = NSimd::SelectSimdTraits(tp); - return !worker->StreamLoad(false); + + bool fine = true; + fine &= worker->StoreStream(true); + return !fine; }
\ No newline at end of file diff --git a/ydb/library/yql/utils/simd/exec/stream_store/ya.make b/ydb/library/yql/utils/simd/exec/stream_store/ya.make index 631001cbdc..704a7b6c0a 100644 --- a/ydb/library/yql/utils/simd/exec/stream_store/ya.make +++ b/ydb/library/yql/utils/simd/exec/stream_store/ya.make @@ -2,4 +2,6 @@ PROGRAM() SRCS(main.cpp) +PEERDIR(ydb/library/yql/utils/simd) + END()
\ No newline at end of file diff --git a/ydb/library/yql/utils/simd/exec/ya.make b/ydb/library/yql/utils/simd/exec/ya.make index 3fcf4d03e0..5d18536908 100644 --- a/ydb/library/yql/utils/simd/exec/ya.make +++ b/ydb/library/yql/utils/simd/exec/ya.make @@ -4,8 +4,13 @@ RUN( stream_store ) +RUN( + pack_tuple +) + DEPENDS( ydb/library/yql/utils/simd/exec/stream_store + ydb/library/yql/utils/simd/exec/pack_tuple ) PEERDIR( @@ -15,5 +20,6 @@ PEERDIR( END() RECURSE( + pack_tuple stream_store )
\ No newline at end of file |