diff options
author | ivanmorozov <ivanmorozov@yandex-team.com> | 2023-07-25 13:14:34 +0300 |
---|---|---|
committer | root <root@qavm-2ed34686.qemu> | 2023-07-25 13:14:34 +0300 |
commit | 22d9f8e8361fb30d4a132e3fe8536550b13d0628 (patch) | |
tree | d53e4250eb5c32f67bebbc058fda076c68c4c149 | |
parent | 102ed5791ddc7a3c7859c5359bb5a209c6bb45dc (diff) | |
download | ydb-22d9f8e8361fb30d4a132e3fe8536550b13d0628.tar.gz |
use TStoreReplaceKey for store special record in long term (automatically reduce memory consumption)
-rw-r--r-- | ydb/core/formats/arrow/CMakeLists.darwin-x86_64.txt | 5 | ||||
-rw-r--r-- | ydb/core/formats/arrow/CMakeLists.linux-aarch64.txt | 5 | ||||
-rw-r--r-- | ydb/core/formats/arrow/CMakeLists.linux-x86_64.txt | 5 | ||||
-rw-r--r-- | ydb/core/formats/arrow/CMakeLists.windows-x86_64.txt | 5 | ||||
-rw-r--r-- | ydb/core/formats/arrow/arrow_helpers.cpp | 10 | ||||
-rw-r--r-- | ydb/core/formats/arrow/arrow_helpers.h | 6 | ||||
-rw-r--r-- | ydb/core/formats/arrow/permutations.cpp | 1 | ||||
-rw-r--r-- | ydb/core/formats/arrow/replace_key.cpp | 15 | ||||
-rw-r--r-- | ydb/core/formats/arrow/replace_key.h | 20 | ||||
-rw-r--r-- | ydb/core/formats/arrow/ya.make | 9 | ||||
-rw-r--r-- | ydb/core/tx/columnshard/engines/changes/mark.h | 2 | ||||
-rw-r--r-- | ydb/core/tx/columnshard/engines/changes/mark_granules.cpp | 2 | ||||
-rw-r--r-- | ydb/core/tx/columnshard/engines/changes/split_compaction.cpp | 2 | ||||
-rw-r--r-- | ydb/core/tx/columnshard/engines/granules_table.h | 3 | ||||
-rw-r--r-- | ydb/core/tx/columnshard/engines/predicate/container.h | 1 | ||||
-rw-r--r-- | ydb/core/tx/columnshard/engines/scheme/tier_info.h | 2 |
16 files changed, 58 insertions, 35 deletions
diff --git a/ydb/core/formats/arrow/CMakeLists.darwin-x86_64.txt b/ydb/core/formats/arrow/CMakeLists.darwin-x86_64.txt index 80f3745ad37..a2b306eb306 100644 --- a/ydb/core/formats/arrow/CMakeLists.darwin-x86_64.txt +++ b/ydb/core/formats/arrow/CMakeLists.darwin-x86_64.txt @@ -39,13 +39,14 @@ target_link_libraries(core-formats-arrow PUBLIC ) target_sources(core-formats-arrow PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_batch_builder.cpp - ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_helpers.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_filter.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_helpers.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/converter.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/custom_registry.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/merging_sorted_input_stream.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/permutations.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/program.cpp - ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/ssa_program_optimizer.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/replace_key.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/size_calcer.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/ssa_program_optimizer.cpp ) diff --git a/ydb/core/formats/arrow/CMakeLists.linux-aarch64.txt b/ydb/core/formats/arrow/CMakeLists.linux-aarch64.txt index 4b338876c23..9db7b6a92ae 100644 --- a/ydb/core/formats/arrow/CMakeLists.linux-aarch64.txt +++ b/ydb/core/formats/arrow/CMakeLists.linux-aarch64.txt @@ -40,13 +40,14 @@ target_link_libraries(core-formats-arrow PUBLIC ) target_sources(core-formats-arrow PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_batch_builder.cpp - ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_helpers.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_filter.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_helpers.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/converter.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/custom_registry.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/merging_sorted_input_stream.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/permutations.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/program.cpp - ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/ssa_program_optimizer.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/replace_key.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/size_calcer.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/ssa_program_optimizer.cpp ) diff --git a/ydb/core/formats/arrow/CMakeLists.linux-x86_64.txt b/ydb/core/formats/arrow/CMakeLists.linux-x86_64.txt index 4b338876c23..9db7b6a92ae 100644 --- a/ydb/core/formats/arrow/CMakeLists.linux-x86_64.txt +++ b/ydb/core/formats/arrow/CMakeLists.linux-x86_64.txt @@ -40,13 +40,14 @@ target_link_libraries(core-formats-arrow PUBLIC ) target_sources(core-formats-arrow PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_batch_builder.cpp - ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_helpers.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_filter.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_helpers.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/converter.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/custom_registry.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/merging_sorted_input_stream.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/permutations.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/program.cpp - ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/ssa_program_optimizer.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/replace_key.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/size_calcer.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/ssa_program_optimizer.cpp ) diff --git a/ydb/core/formats/arrow/CMakeLists.windows-x86_64.txt b/ydb/core/formats/arrow/CMakeLists.windows-x86_64.txt index 524a02842f8..ce82e9eac7e 100644 --- a/ydb/core/formats/arrow/CMakeLists.windows-x86_64.txt +++ b/ydb/core/formats/arrow/CMakeLists.windows-x86_64.txt @@ -39,13 +39,14 @@ target_link_libraries(core-formats-arrow PUBLIC ) target_sources(core-formats-arrow PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_batch_builder.cpp - ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_helpers.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_filter.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/arrow_helpers.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/converter.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/custom_registry.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/merging_sorted_input_stream.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/permutations.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/program.cpp - ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/ssa_program_optimizer.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/replace_key.cpp ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/size_calcer.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/formats/arrow/ssa_program_optimizer.cpp ) diff --git a/ydb/core/formats/arrow/arrow_helpers.cpp b/ydb/core/formats/arrow/arrow_helpers.cpp index b4837eae5d0..b757020f47f 100644 --- a/ydb/core/formats/arrow/arrow_helpers.cpp +++ b/ydb/core/formats/arrow/arrow_helpers.cpp @@ -501,16 +501,6 @@ std::vector<TString> ColumnNames(const std::shared_ptr<arrow::Schema>& schema) { return out; } -size_t LowerBound(const std::vector<TRawReplaceKey>& batchKeys, const TReplaceKey& key, size_t offset) { - Y_VERIFY(offset <= batchKeys.size()); - if (offset == batchKeys.size()) { - return offset; - } - auto start = batchKeys.begin() + offset; - auto it = std::lower_bound(start, batchKeys.end(), key.ToRaw()); - return it - batchKeys.begin(); -} - std::shared_ptr<arrow::UInt64Array> MakeUI64Array(ui64 value, i64 size) { auto res = arrow::MakeArrayFromScalar(arrow::UInt64Scalar(value), size); Y_VERIFY(res.ok()); diff --git a/ydb/core/formats/arrow/arrow_helpers.h b/ydb/core/formats/arrow/arrow_helpers.h index 26a2a93b4c6..13e2893c152 100644 --- a/ydb/core/formats/arrow/arrow_helpers.h +++ b/ydb/core/formats/arrow/arrow_helpers.h @@ -1,7 +1,6 @@ #pragma once #include "switch_type.h" #include "size_calcer.h" -#include "replace_key.h" #include <ydb/core/formats/factory.h> #include <ydb/core/scheme/scheme_tablecell.h> #include <library/cpp/json/writer/json_value.h> @@ -13,10 +12,6 @@ namespace NKikimr::NArrow { using TArrayVec = std::vector<std::shared_ptr<arrow::Array>>; -template<typename T> -class TReplaceKeyTemplate; -using TReplaceKey = TReplaceKeyTemplate<std::shared_ptr<TArrayVec>>; -using TRawReplaceKey = TReplaceKeyTemplate<const TArrayVec*>; std::shared_ptr<arrow::DataType> GetArrowType(NScheme::TTypeInfo typeInfo); std::shared_ptr<arrow::DataType> GetCSVArrowType(NScheme::TTypeInfo typeId); @@ -94,7 +89,6 @@ std::vector<std::shared_ptr<arrow::Array>> Finish(std::vector<std::unique_ptr<ar std::shared_ptr<arrow::UInt64Array> MakeUI64Array(ui64 value, i64 size); std::vector<TString> ColumnNames(const std::shared_ptr<arrow::Schema>& schema); -size_t LowerBound(const std::vector<TRawReplaceKey>& batchKeys, const TReplaceKey& key, size_t offset = 0); bool ReserveData(arrow::ArrayBuilder& builder, const size_t size); bool MergeBatchColumns(const std::vector<std::shared_ptr<arrow::RecordBatch>>& batches, std::shared_ptr<arrow::RecordBatch>& result, const std::vector<std::string>& columnsOrder = {}, const bool orderFieldsAreNecessary = true); diff --git a/ydb/core/formats/arrow/permutations.cpp b/ydb/core/formats/arrow/permutations.cpp index 51e188b5815..e1856b71a97 100644 --- a/ydb/core/formats/arrow/permutations.cpp +++ b/ydb/core/formats/arrow/permutations.cpp @@ -1,5 +1,6 @@ #include "arrow_helpers.h" #include "permutations.h" +#include "replace_key.h" #include <ydb/core/formats/arrow/common/validation.h> #include <contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h> diff --git a/ydb/core/formats/arrow/replace_key.cpp b/ydb/core/formats/arrow/replace_key.cpp new file mode 100644 index 00000000000..7a8ca9f1844 --- /dev/null +++ b/ydb/core/formats/arrow/replace_key.cpp @@ -0,0 +1,15 @@ +#include "replace_key.h" + +namespace NKikimr::NArrow { + +size_t TReplaceKeyHelper::LowerBound(const std::vector<TRawReplaceKey>& batchKeys, const TReplaceKey& key, size_t offset) { + Y_VERIFY(offset <= batchKeys.size()); + if (offset == batchKeys.size()) { + return offset; + } + auto start = batchKeys.begin() + offset; + auto it = std::lower_bound(start, batchKeys.end(), key.ToRaw()); + return it - batchKeys.begin(); +} + +} diff --git a/ydb/core/formats/arrow/replace_key.h b/ydb/core/formats/arrow/replace_key.h index 9e49a4a247b..fab522c7ab9 100644 --- a/ydb/core/formats/arrow/replace_key.h +++ b/ydb/core/formats/arrow/replace_key.h @@ -1,4 +1,5 @@ #pragma once +#include "arrow_helpers.h" #include "permutations.h" #include "common/validation.h" #include <ydb/core/base/defs.h> @@ -8,8 +9,6 @@ namespace NKikimr::NArrow { -bool IsGoodScalar(const std::shared_ptr<arrow::Scalar>& x); - using TArrayVec = std::vector<std::shared_ptr<arrow::Array>>; template<typename TArrayVecPtr> @@ -383,6 +382,22 @@ private: using TReplaceKey = TReplaceKeyTemplate<std::shared_ptr<TArrayVec>>; using TRawReplaceKey = TReplaceKeyTemplate<const TArrayVec*>; +class TStoreReplaceKey: public TReplaceKey { +private: + using TBase = TReplaceKey; +public: + TStoreReplaceKey(const TReplaceKey& baseKey) + : TBase(baseKey) + { + TBase::ShrinkToFit(); + } +}; + +class TReplaceKeyHelper { +public: + static size_t LowerBound(const std::vector<TRawReplaceKey>& batchKeys, const TReplaceKey& key, size_t offset); +}; + } template<> @@ -398,3 +413,4 @@ struct THash<NKikimr::NArrow::TRawReplaceKey> { return x.Hash(); } }; + diff --git a/ydb/core/formats/arrow/ya.make b/ydb/core/formats/arrow/ya.make index f53171a5927..37000a29807 100644 --- a/ydb/core/formats/arrow/ya.make +++ b/ydb/core/formats/arrow/ya.make @@ -35,20 +35,21 @@ YQL_LAST_ABI_VERSION() SRCS( arrow_batch_builder.cpp - arrow_helpers.cpp arrow_filter.cpp - converter.h + arrow_helpers.cpp converter.cpp + converter.h custom_registry.cpp input_stream.h merging_sorted_input_stream.cpp merging_sorted_input_stream.h one_batch_input_stream.h permutations.cpp - sort_cursor.h program.cpp - ssa_program_optimizer.cpp + replace_key.cpp size_calcer.cpp + sort_cursor.h + ssa_program_optimizer.cpp ) END() diff --git a/ydb/core/tx/columnshard/engines/changes/mark.h b/ydb/core/tx/columnshard/engines/changes/mark.h index d34260d65a8..c1aeeeaad62 100644 --- a/ydb/core/tx/columnshard/engines/changes/mark.h +++ b/ydb/core/tx/columnshard/engines/changes/mark.h @@ -1,6 +1,6 @@ #pragma once -#include <ydb/core/formats/arrow/arrow_helpers.h> +#include <ydb/core/formats/arrow/replace_key.h> #include <contrib/libs/apache/arrow/cpp/src/arrow/scalar.h> namespace NKikimr::NOlap { diff --git a/ydb/core/tx/columnshard/engines/changes/mark_granules.cpp b/ydb/core/tx/columnshard/engines/changes/mark_granules.cpp index 57ede5644a6..2ee755687f4 100644 --- a/ydb/core/tx/columnshard/engines/changes/mark_granules.cpp +++ b/ydb/core/tx/columnshard/engines/changes/mark_granules.cpp @@ -85,7 +85,7 @@ THashMap<ui64, std::shared_ptr<arrow::RecordBatch>> TMarksGranules::SliceIntoGra // Just take the number of elements in the key column for the last granule. ? effKey->num_rows() // Locate position of the next granule in the key. - : NArrow::LowerBound(keys, granules[i + 1].first.GetBorder(), offset); + : NArrow::TReplaceKeyHelper::LowerBound(keys, granules[i + 1].first.GetBorder(), offset); if (const i64 size = end - offset) { Y_VERIFY(out.emplace(granules[i].second, batch->Slice(offset, size)).second); diff --git a/ydb/core/tx/columnshard/engines/changes/split_compaction.cpp b/ydb/core/tx/columnshard/engines/changes/split_compaction.cpp index 6db2a5a988d..01118f3c379 100644 --- a/ydb/core/tx/columnshard/engines/changes/split_compaction.cpp +++ b/ydb/core/tx/columnshard/engines/changes/split_compaction.cpp @@ -232,7 +232,7 @@ std::vector<std::pair<NKikimr::NOlap::TMark, std::shared_ptr<arrow::RecordBatch> batchOffsets.push_back(0); for (const auto& border : borders) { - int offset = NArrow::LowerBound(keys, border, batchOffsets.back()); + int offset = NArrow::TReplaceKeyHelper::LowerBound(keys, border, batchOffsets.back()); Y_VERIFY(offset >= batchOffsets.back()); Y_VERIFY(offset <= batch->num_rows()); batchOffsets.push_back(offset); diff --git a/ydb/core/tx/columnshard/engines/granules_table.h b/ydb/core/tx/columnshard/engines/granules_table.h index b5561ac68df..5b333927cce 100644 --- a/ydb/core/tx/columnshard/engines/granules_table.h +++ b/ydb/core/tx/columnshard/engines/granules_table.h @@ -10,7 +10,7 @@ private: public: ui64 PathId; ui64 Granule; - NArrow::TReplaceKey Mark; + NArrow::TStoreReplaceKey Mark; TGranuleRecord(ui64 pathId, ui64 granule, const TSnapshot& createdAt, const NArrow::TReplaceKey& mark) : CreatedAt(createdAt) @@ -18,7 +18,6 @@ public: , Granule(granule) , Mark(mark) { - Mark.ShrinkToFit(); Y_VERIFY(Mark.Size()); } diff --git a/ydb/core/tx/columnshard/engines/predicate/container.h b/ydb/core/tx/columnshard/engines/predicate/container.h index b43f6fbf3f7..22991579d86 100644 --- a/ydb/core/tx/columnshard/engines/predicate/container.h +++ b/ydb/core/tx/columnshard/engines/predicate/container.h @@ -1,6 +1,7 @@ #pragma once #include "predicate.h" #include <ydb/core/formats/arrow/arrow_filter.h> +#include <ydb/core/formats/arrow/replace_key.h> #include <ydb/library/accessor/accessor.h> #include <contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h> #include <optional> diff --git a/ydb/core/tx/columnshard/engines/scheme/tier_info.h b/ydb/core/tx/columnshard/engines/scheme/tier_info.h index b05a1d76557..2271e4c87c6 100644 --- a/ydb/core/tx/columnshard/engines/scheme/tier_info.h +++ b/ydb/core/tx/columnshard/engines/scheme/tier_info.h @@ -6,6 +6,8 @@ #include <ydb/core/formats/arrow/compression/object.h> #include <ydb/core/tx/columnshard/common/scalars.h> #include <contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h> +#include <util/generic/set.h> +#include <util/generic/hash_set.h> namespace NKikimr::NOlap { |