aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoraozeritsky <aozeritsky@ydb.tech>2023-07-11 19:14:37 +0300
committeraozeritsky <aozeritsky@ydb.tech>2023-07-11 19:14:37 +0300
commitd37dee05b1b5e7f062a73c35e04503aa62d372d5 (patch)
tree0ae90d3ffd9c303ad62629de4b7f77c3809229b2
parent0ac6cfd62e12fe842b8c0daf621ecf17255f0b17 (diff)
downloadydb-d37dee05b1b5e7f062a73c35e04503aa62d372d5.tar.gz
Build transitive closure of equality classes
-rw-r--r--library/cpp/CMakeLists.darwin-x86_64.txt1
-rw-r--r--library/cpp/CMakeLists.linux-aarch64.txt1
-rw-r--r--library/cpp/CMakeLists.linux-x86_64.txt1
-rw-r--r--library/cpp/CMakeLists.windows-x86_64.txt1
-rw-r--r--library/cpp/disjoint_sets/CMakeLists.darwin-x86_64.txt17
-rw-r--r--library/cpp/disjoint_sets/CMakeLists.linux-aarch64.txt18
-rw-r--r--library/cpp/disjoint_sets/CMakeLists.linux-x86_64.txt18
-rw-r--r--library/cpp/disjoint_sets/CMakeLists.txt17
-rw-r--r--library/cpp/disjoint_sets/CMakeLists.windows-x86_64.txt17
-rw-r--r--library/cpp/disjoint_sets/disjoint_sets.cpp1
-rw-r--r--library/cpp/disjoint_sets/disjoint_sets.h81
-rw-r--r--library/cpp/disjoint_sets/ya.make7
-rw-r--r--ydb/library/yql/core/cbo/CMakeLists.darwin-x86_64.txt1
-rw-r--r--ydb/library/yql/core/cbo/CMakeLists.linux-aarch64.txt1
-rw-r--r--ydb/library/yql/core/cbo/CMakeLists.linux-x86_64.txt1
-rw-r--r--ydb/library/yql/core/cbo/CMakeLists.windows-x86_64.txt1
-rw-r--r--ydb/library/yql/core/cbo/cbo_optimizer.cpp57
-rw-r--r--ydb/library/yql/core/cbo/cbo_optimizer.h3
-rw-r--r--ydb/library/yql/core/cbo/ya.make4
-rw-r--r--ydb/library/yql/sql/pg/optimizer_ut.cpp57
20 files changed, 303 insertions, 2 deletions
diff --git a/library/cpp/CMakeLists.darwin-x86_64.txt b/library/cpp/CMakeLists.darwin-x86_64.txt
index 2e5648a15c..86cbdf3d7e 100644
--- a/library/cpp/CMakeLists.darwin-x86_64.txt
+++ b/library/cpp/CMakeLists.darwin-x86_64.txt
@@ -32,6 +32,7 @@ add_subdirectory(dbg_output)
add_subdirectory(deprecated)
add_subdirectory(diff)
add_subdirectory(digest)
+add_subdirectory(disjoint_sets)
add_subdirectory(dns)
add_subdirectory(enumbitset)
add_subdirectory(execprofile)
diff --git a/library/cpp/CMakeLists.linux-aarch64.txt b/library/cpp/CMakeLists.linux-aarch64.txt
index 09cf4909bb..4f422c15bc 100644
--- a/library/cpp/CMakeLists.linux-aarch64.txt
+++ b/library/cpp/CMakeLists.linux-aarch64.txt
@@ -31,6 +31,7 @@ add_subdirectory(dbg_output)
add_subdirectory(deprecated)
add_subdirectory(diff)
add_subdirectory(digest)
+add_subdirectory(disjoint_sets)
add_subdirectory(dns)
add_subdirectory(enumbitset)
add_subdirectory(execprofile)
diff --git a/library/cpp/CMakeLists.linux-x86_64.txt b/library/cpp/CMakeLists.linux-x86_64.txt
index 2e5648a15c..86cbdf3d7e 100644
--- a/library/cpp/CMakeLists.linux-x86_64.txt
+++ b/library/cpp/CMakeLists.linux-x86_64.txt
@@ -32,6 +32,7 @@ add_subdirectory(dbg_output)
add_subdirectory(deprecated)
add_subdirectory(diff)
add_subdirectory(digest)
+add_subdirectory(disjoint_sets)
add_subdirectory(dns)
add_subdirectory(enumbitset)
add_subdirectory(execprofile)
diff --git a/library/cpp/CMakeLists.windows-x86_64.txt b/library/cpp/CMakeLists.windows-x86_64.txt
index 2e5648a15c..86cbdf3d7e 100644
--- a/library/cpp/CMakeLists.windows-x86_64.txt
+++ b/library/cpp/CMakeLists.windows-x86_64.txt
@@ -32,6 +32,7 @@ add_subdirectory(dbg_output)
add_subdirectory(deprecated)
add_subdirectory(diff)
add_subdirectory(digest)
+add_subdirectory(disjoint_sets)
add_subdirectory(dns)
add_subdirectory(enumbitset)
add_subdirectory(execprofile)
diff --git a/library/cpp/disjoint_sets/CMakeLists.darwin-x86_64.txt b/library/cpp/disjoint_sets/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..83a72be4c8
--- /dev/null
+++ b/library/cpp/disjoint_sets/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-disjoint_sets)
+target_link_libraries(library-cpp-disjoint_sets PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(library-cpp-disjoint_sets PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/disjoint_sets/disjoint_sets.cpp
+)
diff --git a/library/cpp/disjoint_sets/CMakeLists.linux-aarch64.txt b/library/cpp/disjoint_sets/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..66c7676cda
--- /dev/null
+++ b/library/cpp/disjoint_sets/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-disjoint_sets)
+target_link_libraries(library-cpp-disjoint_sets PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(library-cpp-disjoint_sets PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/disjoint_sets/disjoint_sets.cpp
+)
diff --git a/library/cpp/disjoint_sets/CMakeLists.linux-x86_64.txt b/library/cpp/disjoint_sets/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..66c7676cda
--- /dev/null
+++ b/library/cpp/disjoint_sets/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-disjoint_sets)
+target_link_libraries(library-cpp-disjoint_sets PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(library-cpp-disjoint_sets PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/disjoint_sets/disjoint_sets.cpp
+)
diff --git a/library/cpp/disjoint_sets/CMakeLists.txt b/library/cpp/disjoint_sets/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/disjoint_sets/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/disjoint_sets/CMakeLists.windows-x86_64.txt b/library/cpp/disjoint_sets/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..83a72be4c8
--- /dev/null
+++ b/library/cpp/disjoint_sets/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-disjoint_sets)
+target_link_libraries(library-cpp-disjoint_sets PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(library-cpp-disjoint_sets PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/disjoint_sets/disjoint_sets.cpp
+)
diff --git a/library/cpp/disjoint_sets/disjoint_sets.cpp b/library/cpp/disjoint_sets/disjoint_sets.cpp
new file mode 100644
index 0000000000..5720e6c41a
--- /dev/null
+++ b/library/cpp/disjoint_sets/disjoint_sets.cpp
@@ -0,0 +1 @@
+#include "disjoint_sets.h"
diff --git a/library/cpp/disjoint_sets/disjoint_sets.h b/library/cpp/disjoint_sets/disjoint_sets.h
new file mode 100644
index 0000000000..5768886704
--- /dev/null
+++ b/library/cpp/disjoint_sets/disjoint_sets.h
@@ -0,0 +1,81 @@
+#pragma once
+
+#include <util/system/yassert.h>
+#include <util/generic/vector.h>
+
+// Implementation of disjoint-set data structure with union by rank and path compression.
+// See http://en.wikipedia.org/wiki/Disjoint-set_data_structure
+class TDisjointSets {
+public:
+ using TElement = size_t;
+
+private:
+ mutable TVector<TElement> Parents;
+ TVector<size_t> Ranks;
+ TVector<size_t> Sizes;
+ size_t NumberOfSets;
+
+public:
+ TDisjointSets(size_t setCount)
+ : Parents(setCount)
+ , Ranks(setCount, 0)
+ , Sizes(setCount, 1)
+ , NumberOfSets(setCount)
+ {
+ for (size_t i = 0; i < setCount; ++i)
+ Parents[i] = i;
+ }
+
+ TElement CanonicSetElement(TElement item) const {
+ if (Parents[item] != item)
+ Parents[item] = CanonicSetElement(Parents[item]);
+ return Parents[item];
+ }
+
+ size_t SizeOfSet(TElement item) const {
+ return Sizes[CanonicSetElement(item)];
+ }
+
+ size_t InitialSetCount() const {
+ return Parents.size();
+ }
+
+ size_t SetCount() const {
+ return NumberOfSets;
+ }
+
+ void UnionSets(TElement item1, TElement item2) {
+ TElement canonic1 = CanonicSetElement(item1);
+ TElement canonic2 = CanonicSetElement(item2);
+ if (canonic1 == canonic2)
+ return;
+
+ --NumberOfSets;
+ if (Ranks[canonic1] < Ranks[canonic2]) {
+ Parents[canonic1] = canonic2;
+ Sizes[canonic2] += Sizes[canonic1];
+ } else {
+ Parents[canonic2] = canonic1;
+ Sizes[canonic1] += Sizes[canonic2];
+ Ranks[canonic2] += Ranks[canonic1] == Ranks[canonic2] ? 1 : 0;
+ }
+ }
+
+ void Expand(size_t setCount) {
+ if (setCount < Parents.size()) {
+ return;
+ }
+
+ size_t prevSize = Parents.size();
+ Parents.resize(setCount);
+ Ranks.resize(setCount);
+ Sizes.resize(setCount);
+ NumberOfSets += setCount - prevSize;
+
+ for (size_t i = prevSize; i < setCount; ++i) {
+ Parents[i] = i;
+ Ranks[i] = 0;
+ Sizes[i] = 1;
+ }
+ }
+};
diff --git a/library/cpp/disjoint_sets/ya.make b/library/cpp/disjoint_sets/ya.make
new file mode 100644
index 0000000000..2104298baa
--- /dev/null
+++ b/library/cpp/disjoint_sets/ya.make
@@ -0,0 +1,7 @@
+LIBRARY()
+
+SRCS(
+ disjoint_sets.cpp
+)
+
+END()
diff --git a/ydb/library/yql/core/cbo/CMakeLists.darwin-x86_64.txt b/ydb/library/yql/core/cbo/CMakeLists.darwin-x86_64.txt
index 32062084c6..64f3490aa7 100644
--- a/ydb/library/yql/core/cbo/CMakeLists.darwin-x86_64.txt
+++ b/ydb/library/yql/core/cbo/CMakeLists.darwin-x86_64.txt
@@ -11,6 +11,7 @@ add_library(yql-core-cbo)
target_link_libraries(yql-core-cbo PUBLIC
contrib-libs-cxxsupp
yutil
+ library-cpp-disjoint_sets
)
target_sources(yql-core-cbo PRIVATE
${CMAKE_SOURCE_DIR}/ydb/library/yql/core/cbo/cbo_optimizer.cpp
diff --git a/ydb/library/yql/core/cbo/CMakeLists.linux-aarch64.txt b/ydb/library/yql/core/cbo/CMakeLists.linux-aarch64.txt
index e1eef0c49f..70a3d8682e 100644
--- a/ydb/library/yql/core/cbo/CMakeLists.linux-aarch64.txt
+++ b/ydb/library/yql/core/cbo/CMakeLists.linux-aarch64.txt
@@ -12,6 +12,7 @@ target_link_libraries(yql-core-cbo PUBLIC
contrib-libs-linux-headers
contrib-libs-cxxsupp
yutil
+ library-cpp-disjoint_sets
)
target_sources(yql-core-cbo PRIVATE
${CMAKE_SOURCE_DIR}/ydb/library/yql/core/cbo/cbo_optimizer.cpp
diff --git a/ydb/library/yql/core/cbo/CMakeLists.linux-x86_64.txt b/ydb/library/yql/core/cbo/CMakeLists.linux-x86_64.txt
index e1eef0c49f..70a3d8682e 100644
--- a/ydb/library/yql/core/cbo/CMakeLists.linux-x86_64.txt
+++ b/ydb/library/yql/core/cbo/CMakeLists.linux-x86_64.txt
@@ -12,6 +12,7 @@ target_link_libraries(yql-core-cbo PUBLIC
contrib-libs-linux-headers
contrib-libs-cxxsupp
yutil
+ library-cpp-disjoint_sets
)
target_sources(yql-core-cbo PRIVATE
${CMAKE_SOURCE_DIR}/ydb/library/yql/core/cbo/cbo_optimizer.cpp
diff --git a/ydb/library/yql/core/cbo/CMakeLists.windows-x86_64.txt b/ydb/library/yql/core/cbo/CMakeLists.windows-x86_64.txt
index 32062084c6..64f3490aa7 100644
--- a/ydb/library/yql/core/cbo/CMakeLists.windows-x86_64.txt
+++ b/ydb/library/yql/core/cbo/CMakeLists.windows-x86_64.txt
@@ -11,6 +11,7 @@ add_library(yql-core-cbo)
target_link_libraries(yql-core-cbo PUBLIC
contrib-libs-cxxsupp
yutil
+ library-cpp-disjoint_sets
)
target_sources(yql-core-cbo PRIVATE
${CMAKE_SOURCE_DIR}/ydb/library/yql/core/cbo/cbo_optimizer.cpp
diff --git a/ydb/library/yql/core/cbo/cbo_optimizer.cpp b/ydb/library/yql/core/cbo/cbo_optimizer.cpp
index d14f5a6f87..0008e45d3c 100644
--- a/ydb/library/yql/core/cbo/cbo_optimizer.cpp
+++ b/ydb/library/yql/core/cbo/cbo_optimizer.cpp
@@ -3,6 +3,10 @@
#include <array>
#include <util/string/builder.h>
+#include <util/generic/hash.h>
+#include <util/generic/hash_set.h>
+
+#include <library/cpp/disjoint_sets/disjoint_sets.h>
namespace NYql {
@@ -111,7 +115,7 @@ TString IOptimizer::TOutput::ToString() const {
return b;
}
-TString IOptimizer::TInput::ToString() {
+TString IOptimizer::TInput::ToString() const {
TStringBuilder b;
b << "Rels: [";
for (ui32 i = 0; i < Rels.size(); ++i) {
@@ -139,4 +143,55 @@ TString IOptimizer::TInput::ToString() {
return b;
}
+void IOptimizer::TInput::Normalize() {
+ using TId = TDisjointSets::TElement;
+
+ THashMap<TVarId, TId> var2id;
+ std::vector<TVarId> id2var;
+ TId curId = 1;
+
+ for (auto& eq : EqClasses) {
+ for (auto& v : eq.Vars) {
+ auto& id = var2id[v];
+ if (id == 0) {
+ id = curId;
+ id2var.resize(curId + 1);
+ id2var[curId] = v;
+ ++curId;
+ }
+ }
+ }
+
+ TDisjointSets u(curId + 1);
+ for (auto& eq : EqClasses) {
+ Y_VERIFY(!eq.Vars.empty());
+
+ ui32 i = 0;
+ TId first = var2id[eq.Vars[i++]];
+ for (; i < eq.Vars.size(); ++i) {
+ TId id = var2id[eq.Vars[i]];
+ u.UnionSets(first, id);
+ }
+ }
+
+ THashMap<TId, THashSet<TId>> eqClasses;
+ for (auto& eq : EqClasses) {
+ for (auto& var : eq.Vars) {
+ auto id = var2id[var];
+ auto canonicalId = u.CanonicSetElement(id);
+ eqClasses[canonicalId].emplace(id);
+ }
+ }
+ EqClasses.clear();
+ for (auto& [_, ids]: eqClasses) {
+ TEq eqClass;
+ eqClass.Vars.reserve(ids.size());
+ for (auto id : ids) {
+ eqClass.Vars.emplace_back(id2var[id]);
+ }
+ std::sort(eqClass.Vars.begin(), eqClass.Vars.end());
+ EqClasses.emplace_back(std::move(eqClass));
+ }
+}
+
} // namespace NYql
diff --git a/ydb/library/yql/core/cbo/cbo_optimizer.h b/ydb/library/yql/core/cbo/cbo_optimizer.h
index ad6db62ab8..4cd7884b11 100644
--- a/ydb/library/yql/core/cbo/cbo_optimizer.h
+++ b/ydb/library/yql/core/cbo/cbo_optimizer.h
@@ -29,7 +29,8 @@ struct IOptimizer {
std::vector<TRel> Rels;
std::vector<TEq> EqClasses;
- TString ToString();
+ TString ToString() const;
+ void Normalize();
};
enum class EJoinType {
diff --git a/ydb/library/yql/core/cbo/ya.make b/ydb/library/yql/core/cbo/ya.make
index 3a8fd2a503..fff500f853 100644
--- a/ydb/library/yql/core/cbo/ya.make
+++ b/ydb/library/yql/core/cbo/ya.make
@@ -2,5 +2,9 @@ LIBRARY()
SRCS(cbo_optimizer.cpp)
+PEERDIR(
+ library/cpp/disjoint_sets
+)
+
END()
diff --git a/ydb/library/yql/sql/pg/optimizer_ut.cpp b/ydb/library/yql/sql/pg/optimizer_ut.cpp
index 2cff43bb21..0bcfa3aaf4 100644
--- a/ydb/library/yql/sql/pg/optimizer_ut.cpp
+++ b/ydb/library/yql/sql/pg/optimizer_ut.cpp
@@ -117,4 +117,61 @@ EqClasses: [[a,b,c]]
UNIT_ASSERT_STRINGS_EQUAL(expected, str);
}
+Y_UNIT_TEST(InputNormalize) {
+ IOptimizer::TRel rel1 = {100000, 1000000, {{'a'}}};
+ IOptimizer::TRel rel2 = {1000000, 9000009, {{'b'}}};
+ IOptimizer::TRel rel3 = {10000, 9009, {{'c'}}};
+ IOptimizer::TInput input = {{rel1, rel2, rel3}};
+
+ input.EqClasses.emplace_back(IOptimizer::TEq {
+ {{1, 1}, {2, 1}}
+ });
+ input.EqClasses.emplace_back(IOptimizer::TEq {
+ {{2, 1}, {3, 1}}
+ });
+
+ TString expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]},
+{rows: 1000000,cost: 9000009,vars: [b]},
+{rows: 10000,cost: 9009,vars: [c]}]
+EqClasses: [[a,b],[b,c]]
+)__";
+ UNIT_ASSERT_STRINGS_EQUAL(expected, input.ToString());
+
+ input.Normalize();
+
+ expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]},
+{rows: 1000000,cost: 9000009,vars: [b]},
+{rows: 10000,cost: 9009,vars: [c]}]
+EqClasses: [[a,b,c]]
+)__";
+ UNIT_ASSERT_STRINGS_EQUAL(expected, input.ToString());
+
+ IOptimizer::TRel rel4 = {10001, 9009, {{'d'}}};
+ IOptimizer::TInput input2 = {{rel1, rel2, rel3, rel4}};
+ input2.EqClasses.emplace_back(IOptimizer::TEq {
+ {{1, 1}, {2, 1}}
+ });
+ input2.EqClasses.emplace_back(IOptimizer::TEq {
+ {{4, 1}, {3, 1}}
+ });
+
+ expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]},
+{rows: 1000000,cost: 9000009,vars: [b]},
+{rows: 10000,cost: 9009,vars: [c]},
+{rows: 10001,cost: 9009,vars: [d]}]
+EqClasses: [[a,b],[d,c]]
+)__";
+ UNIT_ASSERT_STRINGS_EQUAL(expected, input2.ToString());
+
+ input2.Normalize();
+
+ expected = R"__(Rels: [{rows: 100000,cost: 1000000,vars: [a]},
+{rows: 1000000,cost: 9000009,vars: [b]},
+{rows: 10000,cost: 9009,vars: [c]},
+{rows: 10001,cost: 9009,vars: [d]}]
+EqClasses: [[a,b],[c,d]]
+)__";
+ UNIT_ASSERT_STRINGS_EQUAL(expected, input2.ToString());
+}
+
} // Y_UNIT_TEST_SUITE(PgOptimizer)