diff options
author | aozeritsky <aozeritsky@ydb.tech> | 2023-07-11 19:14:37 +0300 |
---|---|---|
committer | aozeritsky <aozeritsky@ydb.tech> | 2023-07-11 19:14:37 +0300 |
commit | d37dee05b1b5e7f062a73c35e04503aa62d372d5 (patch) | |
tree | 0ae90d3ffd9c303ad62629de4b7f77c3809229b2 | |
parent | 0ac6cfd62e12fe842b8c0daf621ecf17255f0b17 (diff) | |
download | ydb-d37dee05b1b5e7f062a73c35e04503aa62d372d5.tar.gz |
Build transitive closure of equality classes
20 files changed, 303 insertions, 2 deletions
diff --git a/library/cpp/CMakeLists.darwin-x86_64.txt b/library/cpp/CMakeLists.darwin-x86_64.txt index 2e5648a15c..86cbdf3d7e 100644 --- a/library/cpp/CMakeLists.darwin-x86_64.txt +++ b/library/cpp/CMakeLists.darwin-x86_64.txt @@ -32,6 +32,7 @@ add_subdirectory(dbg_output) add_subdirectory(deprecated) add_subdirectory(diff) add_subdirectory(digest) +add_subdirectory(disjoint_sets) add_subdirectory(dns) add_subdirectory(enumbitset) add_subdirectory(execprofile) diff --git a/library/cpp/CMakeLists.linux-aarch64.txt b/library/cpp/CMakeLists.linux-aarch64.txt index 09cf4909bb..4f422c15bc 100644 --- a/library/cpp/CMakeLists.linux-aarch64.txt +++ b/library/cpp/CMakeLists.linux-aarch64.txt @@ -31,6 +31,7 @@ add_subdirectory(dbg_output) add_subdirectory(deprecated) add_subdirectory(diff) add_subdirectory(digest) +add_subdirectory(disjoint_sets) add_subdirectory(dns) add_subdirectory(enumbitset) add_subdirectory(execprofile) diff --git a/library/cpp/CMakeLists.linux-x86_64.txt b/library/cpp/CMakeLists.linux-x86_64.txt index 2e5648a15c..86cbdf3d7e 100644 --- a/library/cpp/CMakeLists.linux-x86_64.txt +++ b/library/cpp/CMakeLists.linux-x86_64.txt @@ -32,6 +32,7 @@ add_subdirectory(dbg_output) add_subdirectory(deprecated) add_subdirectory(diff) add_subdirectory(digest) +add_subdirectory(disjoint_sets) add_subdirectory(dns) add_subdirectory(enumbitset) add_subdirectory(execprofile) diff --git a/library/cpp/CMakeLists.windows-x86_64.txt b/library/cpp/CMakeLists.windows-x86_64.txt index 2e5648a15c..86cbdf3d7e 100644 --- a/library/cpp/CMakeLists.windows-x86_64.txt +++ b/library/cpp/CMakeLists.windows-x86_64.txt @@ -32,6 +32,7 @@ add_subdirectory(dbg_output) add_subdirectory(deprecated) add_subdirectory(diff) add_subdirectory(digest) +add_subdirectory(disjoint_sets) add_subdirectory(dns) add_subdirectory(enumbitset) add_subdirectory(execprofile) diff --git a/library/cpp/disjoint_sets/CMakeLists.darwin-x86_64.txt b/library/cpp/disjoint_sets/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..83a72be4c8 --- /dev/null +++ b/library/cpp/disjoint_sets/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(library-cpp-disjoint_sets) +target_link_libraries(library-cpp-disjoint_sets PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(library-cpp-disjoint_sets PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/disjoint_sets/disjoint_sets.cpp +) diff --git a/library/cpp/disjoint_sets/CMakeLists.linux-aarch64.txt b/library/cpp/disjoint_sets/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..66c7676cda --- /dev/null +++ b/library/cpp/disjoint_sets/CMakeLists.linux-aarch64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(library-cpp-disjoint_sets) +target_link_libraries(library-cpp-disjoint_sets PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(library-cpp-disjoint_sets PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/disjoint_sets/disjoint_sets.cpp +) diff --git a/library/cpp/disjoint_sets/CMakeLists.linux-x86_64.txt b/library/cpp/disjoint_sets/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..66c7676cda --- /dev/null +++ b/library/cpp/disjoint_sets/CMakeLists.linux-x86_64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(library-cpp-disjoint_sets) +target_link_libraries(library-cpp-disjoint_sets PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil +) +target_sources(library-cpp-disjoint_sets PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/disjoint_sets/disjoint_sets.cpp +) diff --git a/library/cpp/disjoint_sets/CMakeLists.txt b/library/cpp/disjoint_sets/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/disjoint_sets/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/disjoint_sets/CMakeLists.windows-x86_64.txt b/library/cpp/disjoint_sets/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..83a72be4c8 --- /dev/null +++ b/library/cpp/disjoint_sets/CMakeLists.windows-x86_64.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(library-cpp-disjoint_sets) +target_link_libraries(library-cpp-disjoint_sets PUBLIC + contrib-libs-cxxsupp + yutil +) +target_sources(library-cpp-disjoint_sets PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/disjoint_sets/disjoint_sets.cpp +) diff --git a/library/cpp/disjoint_sets/disjoint_sets.cpp b/library/cpp/disjoint_sets/disjoint_sets.cpp new file mode 100644 index 0000000000..5720e6c41a --- /dev/null +++ b/library/cpp/disjoint_sets/disjoint_sets.cpp @@ -0,0 +1 @@ +#include "disjoint_sets.h" diff --git a/library/cpp/disjoint_sets/disjoint_sets.h b/library/cpp/disjoint_sets/disjoint_sets.h new file mode 100644 index 0000000000..5768886704 --- /dev/null +++ b/library/cpp/disjoint_sets/disjoint_sets.h @@ -0,0 +1,81 @@ +#pragma once + +#include <util/system/yassert.h> +#include <util/generic/vector.h> + +// Implementation of disjoint-set data structure with union by rank and path compression. +// See http://en.wikipedia.org/wiki/Disjoint-set_data_structure +class TDisjointSets { +public: + using TElement = size_t; + +private: + mutable TVector<TElement> Parents; + TVector<size_t> Ranks; + TVector<size_t> Sizes; + size_t NumberOfSets; + +public: + TDisjointSets(size_t setCount) + : Parents(setCount) + , Ranks(setCount, 0) + , Sizes(setCount, 1) + , NumberOfSets(setCount) + { + for (size_t i = 0; i < setCount; ++i) + Parents[i] = i; + } + + TElement CanonicSetElement(TElement item) const { + if (Parents[item] != item) + Parents[item] = CanonicSetElement(Parents[item]); + return Parents[item]; + } + + size_t SizeOfSet(TElement item) const { + return Sizes[CanonicSetElement(item)]; + } + + size_t InitialSetCount() const { + return Parents.size(); + } + + size_t SetCount() const { + return NumberOfSets; + } + + void UnionSets(TElement item1, TElement item2) { + TElement canonic1 = CanonicSetElement(item1); + TElement canonic2 = CanonicSetElement(item2); + if (canonic1 == canonic2) + return; + + --NumberOfSets; + if (Ranks[canonic1] < Ranks[canonic2]) { + Parents[canonic1] = canonic2; + Sizes[canonic2] += Sizes[canonic1]; + } else { + Parents[canonic2] = canonic1; + Sizes[canonic1] += Sizes[canonic2]; + Ranks[canonic2] += Ranks[canonic1] == Ranks[canonic2] ? 1 : 0; + } + } + + void Expand(size_t setCount) { + if (setCount < Parents.size()) { + return; + } + + size_t prevSize = Parents.size(); + Parents.resize(setCount); + Ranks.resize(setCount); + Sizes.resize(setCount); + NumberOfSets += setCount - prevSize; + + for (size_t i = prevSize; i < setCount; ++i) { + Parents[i] = i; + Ranks[i] = 0; + Sizes[i] = 1; + } + } +}; diff --git a/library/cpp/disjoint_sets/ya.make b/library/cpp/disjoint_sets/ya.make new file mode 100644 index 0000000000..2104298baa --- /dev/null +++ b/library/cpp/disjoint_sets/ya.make @@ -0,0 +1,7 @@ +LIBRARY() + +SRCS( + disjoint_sets.cpp +) + +END() diff --git a/ydb/library/yql/core/cbo/CMakeLists.darwin-x86_64.txt b/ydb/library/yql/core/cbo/CMakeLists.darwin-x86_64.txt index 32062084c6..64f3490aa7 100644 --- a/ydb/library/yql/core/cbo/CMakeLists.darwin-x86_64.txt +++ b/ydb/library/yql/core/cbo/CMakeLists.darwin-x86_64.txt @@ -11,6 +11,7 @@ add_library(yql-core-cbo) target_link_libraries(yql-core-cbo PUBLIC contrib-libs-cxxsupp yutil + library-cpp-disjoint_sets ) target_sources(yql-core-cbo PRIVATE ${CMAKE_SOURCE_DIR}/ydb/library/yql/core/cbo/cbo_optimizer.cpp diff --git a/ydb/library/yql/core/cbo/CMakeLists.linux-aarch64.txt b/ydb/library/yql/core/cbo/CMakeLists.linux-aarch64.txt index e1eef0c49f..70a3d8682e 100644 --- a/ydb/library/yql/core/cbo/CMakeLists.linux-aarch64.txt +++ b/ydb/library/yql/core/cbo/CMakeLists.linux-aarch64.txt @@ -12,6 +12,7 @@ target_link_libraries(yql-core-cbo PUBLIC contrib-libs-linux-headers contrib-libs-cxxsupp yutil + library-cpp-disjoint_sets ) target_sources(yql-core-cbo PRIVATE ${CMAKE_SOURCE_DIR}/ydb/library/yql/core/cbo/cbo_optimizer.cpp diff --git a/ydb/library/yql/core/cbo/CMakeLists.linux-x86_64.txt b/ydb/library/yql/core/cbo/CMakeLists.linux-x86_64.txt index e1eef0c49f..70a3d8682e 100644 --- a/ydb/library/yql/core/cbo/CMakeLists.linux-x86_64.txt +++ b/ydb/library/yql/core/cbo/CMakeLists.linux-x86_64.txt @@ -12,6 +12,7 @@ target_link_libraries(yql-core-cbo PUBLIC contrib-libs-linux-headers contrib-libs-cxxsupp yutil + library-cpp-disjoint_sets ) target_sources(yql-core-cbo PRIVATE ${CMAKE_SOURCE_DIR}/ydb/library/yql/core/cbo/cbo_optimizer.cpp diff --git a/ydb/library/yql/core/cbo/CMakeLists.windows-x86_64.txt b/ydb/library/yql/core/cbo/CMakeLists.windows-x86_64.txt index 32062084c6..64f3490aa7 100644 --- a/ydb/library/yql/core/cbo/CMakeLists.windows-x86_64.txt +++ b/ydb/library/yql/core/cbo/CMakeLists.windows-x86_64.txt @@ -11,6 +11,7 @@ add_library(yql-core-cbo) target_link_libraries(yql-core-cbo PUBLIC contrib-libs-cxxsupp yutil + library-cpp-disjoint_sets ) target_sources(yql-core-cbo PRIVATE ${CMAKE_SOURCE_DIR}/ydb/library/yql/core/cbo/cbo_optimizer.cpp diff --git a/ydb/library/yql/core/cbo/cbo_optimizer.cpp b/ydb/library/yql/core/cbo/cbo_optimizer.cpp index d14f5a6f87..0008e45d3c 100644 --- a/ydb/library/yql/core/cbo/cbo_optimizer.cpp +++ b/ydb/library/yql/core/cbo/cbo_optimizer.cpp @@ -3,6 +3,10 @@ #include <array> #include <util/string/builder.h> +#include <util/generic/hash.h> +#include <util/generic/hash_set.h> + +#include <library/cpp/disjoint_sets/disjoint_sets.h> namespace NYql { @@ -111,7 +115,7 @@ TString IOptimizer::TOutput::ToString() const { return b; } -TString IOptimizer::TInput::ToString() { +TString IOptimizer::TInput::ToString() const { TStringBuilder b; b << "Rels: ["; for (ui32 i = 0; i < Rels.size(); ++i) { @@ -139,4 +143,55 @@ TString IOptimizer::TInput::ToString() { return b; } +void IOptimizer::TInput::Normalize() { + using TId = TDisjointSets::TElement; + + THashMap<TVarId, TId> var2id; + std::vector<TVarId> id2var; + TId curId = 1; + + for (auto& eq : EqClasses) { + for (auto& v : eq.Vars) { + auto& id = var2id[v]; + if (id == 0) { + id = curId; + id2var.resize(curId + 1); + id2var[curId] = v; + ++curId; + } + } + } + + TDisjointSets u(curId + 1); + for (auto& eq : EqClasses) { + Y_VERIFY(!eq.Vars.empty()); + + ui32 i = 0; + TId first = var2id[eq.Vars[i++]]; + for (; i < eq.Vars.size(); ++i) { + TId id = var2id[eq.Vars[i]]; + u.UnionSets(first, id); + } + } + + THashMap<TId, THashSet<TId>> eqClasses; + for (auto& eq : EqClasses) { + for (auto& var : eq.Vars) { + auto id = var2id[var]; + auto canonicalId = u.CanonicSetElement(id); + eqClasses[canonicalId].emplace(id); + } + } + EqClasses.clear(); + for (auto& [_, ids]: eqClasses) { + TEq eqClass; + eqClass.Vars.reserve(ids.size()); + for (auto id : ids) { + eqClass.Vars.emplace_back(id2var[id]); + } + std::sort(eqClass.Vars.begin(), eqClass.Vars.end()); + EqClasses.emplace_back(std::move(eqClass)); + } +} + } // namespace NYql diff --git a/ydb/library/yql/core/cbo/cbo_optimizer.h b/ydb/library/yql/core/cbo/cbo_optimizer.h index ad6db62ab8..4cd7884b11 100644 --- a/ydb/library/yql/core/cbo/cbo_optimizer.h +++ b/ydb/library/yql/core/cbo/cbo_optimizer.h @@ -29,7 +29,8 @@ struct IOptimizer { std::vector<TRel> Rels; std::vector<TEq> EqClasses; - TString ToString(); + TString ToString() const; + void Normalize(); }; enum class EJoinType { diff --git a/ydb/library/yql/core/cbo/ya.make b/ydb/library/yql/core/cbo/ya.make index 3a8fd2a503..fff500f853 100644 --- a/ydb/library/yql/core/cbo/ya.make +++ b/ydb/library/yql/core/cbo/ya.make @@ -2,5 +2,9 @@ LIBRARY() SRCS(cbo_optimizer.cpp) +PEERDIR( + library/cpp/disjoint_sets +) + END() diff --git a/ydb/library/yql/sql/pg/optimizer_ut.cpp b/ydb/library/yql/sql/pg/optimizer_ut.cpp index 2cff43bb21..0bcfa3aaf4 100644 --- a/ydb/library/yql/sql/pg/optimizer_ut.cpp +++ b/ydb/library/yql/sql/pg/optimizer_ut.cpp @@ -117,4 +117,61 @@ EqClasses: [[a,b,c]] UNIT_ASSERT_STRINGS_EQUAL(expected, str); } +Y_UNIT_TEST(InputNormalize) { + IOptimizer::TRel rel1 = {100000, 1000000, {{'a'}}}; + IOptimizer::TRel rel2 = {1000000, 9000009, {{'b'}}}; + IOptimizer::TRel rel3 = {10000, 9009, {{'c'}}}; + IOptimizer::TInput input = {{rel1, rel2, rel3}}; + + input.EqClasses.emplace_back(IOptimizer::TEq { + {{1, 1}, {2, 1}} + }); + input.EqClasses.emplace_back(IOptimizer::TEq { + {{2, 1}, {3, 1}} + }); + + TString expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]}, +{rows: 1000000,cost: 9000009,vars: [b]}, +{rows: 10000,cost: 9009,vars: [c]}] +EqClasses: [[a,b],[b,c]] +)__"; + UNIT_ASSERT_STRINGS_EQUAL(expected, input.ToString()); + + input.Normalize(); + + expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]}, +{rows: 1000000,cost: 9000009,vars: [b]}, +{rows: 10000,cost: 9009,vars: [c]}] +EqClasses: [[a,b,c]] +)__"; + UNIT_ASSERT_STRINGS_EQUAL(expected, input.ToString()); + + IOptimizer::TRel rel4 = {10001, 9009, {{'d'}}}; + IOptimizer::TInput input2 = {{rel1, rel2, rel3, rel4}}; + input2.EqClasses.emplace_back(IOptimizer::TEq { + {{1, 1}, {2, 1}} + }); + input2.EqClasses.emplace_back(IOptimizer::TEq { + {{4, 1}, {3, 1}} + }); + + expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]}, +{rows: 1000000,cost: 9000009,vars: [b]}, +{rows: 10000,cost: 9009,vars: [c]}, +{rows: 10001,cost: 9009,vars: [d]}] +EqClasses: [[a,b],[d,c]] +)__"; + UNIT_ASSERT_STRINGS_EQUAL(expected, input2.ToString()); + + input2.Normalize(); + + expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]}, +{rows: 1000000,cost: 9000009,vars: [b]}, +{rows: 10000,cost: 9009,vars: [c]}, +{rows: 10001,cost: 9009,vars: [d]}] +EqClasses: [[a,b],[c,d]] +)__"; + UNIT_ASSERT_STRINGS_EQUAL(expected, input2.ToString()); +} + } // Y_UNIT_TEST_SUITE(PgOptimizer) |