diff options
author | galaxycrab <UgnineSirdis@ydb.tech> | 2022-07-19 19:37:37 +0300 |
---|---|---|
committer | galaxycrab <UgnineSirdis@ydb.tech> | 2022-07-19 19:37:37 +0300 |
commit | 64735435ab95ae8e9622d9016f4db18ea60459c0 (patch) | |
tree | 5f779acf373a542c2153c569f510b23bb05c7222 | |
parent | 5e561dc030f602c54a8549749d885b5f9547f31d (diff) | |
download | ydb-64735435ab95ae8e9622d9016f4db18ea60459c0.tar.gz |
Lighten S3 tasks params and source desc params
Rename to deprecated
Improve test
Read paths from tree
Store file tree in range params
18 files changed, 610 insertions, 34 deletions
diff --git a/CMakeLists.darwin.txt b/CMakeLists.darwin.txt index ec4fdd1cade..e9df897de34 100644 --- a/CMakeLists.darwin.txt +++ b/CMakeLists.darwin.txt @@ -778,6 +778,7 @@ add_subdirectory(ydb/library/yql/providers/pq/common) add_subdirectory(ydb/library/yql/providers/pq/expr_nodes) add_subdirectory(ydb/library/yql/providers/s3/provider) add_subdirectory(ydb/library/yql/providers/s3/expr_nodes) +add_subdirectory(ydb/library/yql/providers/s3/range_helpers) add_subdirectory(ydb/library/yql/providers/ydb/provider) add_subdirectory(ydb/public/lib/experimental) add_subdirectory(ydb/library/yql/providers/ydb/expr_nodes) @@ -1353,6 +1354,7 @@ add_subdirectory(ydb/library/yql/providers/common/http_gateway/mock) add_subdirectory(ydb/library/yql/providers/common/structured_token/ut) add_subdirectory(ydb/library/yql/providers/pq/gateway/dummy) add_subdirectory(ydb/library/yql/providers/s3/path_generator/ut) +add_subdirectory(ydb/library/yql/providers/s3/range_helpers/ut) add_subdirectory(ydb/library/yql/udfs/common/stat/ut) add_subdirectory(ydb/library/yql/udfs/common/topfreq/ut) add_subdirectory(ydb/public/sdk/cpp/client/extensions/discovery_mutator/ut) diff --git a/CMakeLists.linux.txt b/CMakeLists.linux.txt index 80027f208a4..872b06e43ed 100644 --- a/CMakeLists.linux.txt +++ b/CMakeLists.linux.txt @@ -782,6 +782,7 @@ add_subdirectory(ydb/library/yql/providers/pq/common) add_subdirectory(ydb/library/yql/providers/pq/expr_nodes) add_subdirectory(ydb/library/yql/providers/s3/provider) add_subdirectory(ydb/library/yql/providers/s3/expr_nodes) +add_subdirectory(ydb/library/yql/providers/s3/range_helpers) add_subdirectory(ydb/library/yql/providers/ydb/provider) add_subdirectory(ydb/public/lib/experimental) add_subdirectory(ydb/library/yql/providers/ydb/expr_nodes) @@ -1374,6 +1375,7 @@ add_subdirectory(ydb/library/yql/providers/common/http_gateway/mock) add_subdirectory(ydb/library/yql/providers/common/structured_token/ut) add_subdirectory(ydb/library/yql/providers/pq/gateway/dummy) add_subdirectory(ydb/library/yql/providers/s3/path_generator/ut) +add_subdirectory(ydb/library/yql/providers/s3/range_helpers/ut) add_subdirectory(ydb/library/yql/udfs/common/stat/ut) add_subdirectory(ydb/library/yql/udfs/common/topfreq/ut) add_subdirectory(ydb/public/sdk/cpp/client/extensions/discovery_mutator/ut) diff --git a/ydb/library/yql/providers/s3/actors/CMakeLists.linux.txt b/ydb/library/yql/providers/s3/actors/CMakeLists.linux.txt index 73edb9f3464..e828c67edbf 100644 --- a/ydb/library/yql/providers/s3/actors/CMakeLists.linux.txt +++ b/ydb/library/yql/providers/s3/actors/CMakeLists.linux.txt @@ -29,6 +29,7 @@ target_link_libraries(providers-s3-actors PUBLIC common-token_accessor-client providers-s3-compressors providers-s3-proto + providers-s3-range_helpers providers-s3-serializations yql-public-types clickhouse_client_udf diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp index 1d0cc239cdb..67ef5f0a8c3 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp @@ -33,8 +33,8 @@ #include <ydb/library/yql/providers/s3/compressors/factory.h> #include <ydb/library/yql/providers/s3/proto/range.pb.h> +#include <ydb/library/yql/providers/s3/range_helpers/path_list_reader.h> #include <ydb/library/yql/providers/s3/serializations/serialization_interval.h> -#include <ydb/library/yql/providers/common/provider/yql_provider_names.h> #include <library/cpp/actors/core/actor_bootstrapped.h> #include <library/cpp/actors/core/actor_coroutine.h> @@ -48,7 +48,8 @@ namespace NYql::NDq { -using namespace NActors; +using namespace ::NActors; +using namespace ::NYql::NS3Details; namespace { @@ -105,9 +106,6 @@ struct TEvPrivate { }; }; -using TPath = std::tuple<TString, size_t>; -using TPathList = std::vector<TPath>; - class TRetryParams { public: TRetryParams(const std::shared_ptr<NS3::TRetryConfig>& retryConfig) @@ -673,28 +671,10 @@ std::pair<NYql::NDq::IDqComputeActorAsyncInput*, IActor*> CreateS3ReadActor( const std::shared_ptr<NS3::TRetryConfig>& retryConfig) { const IFunctionRegistry& functionRegistry = *holderFactory.GetFunctionRegistry(); - std::unordered_map<TString, size_t> map(params.GetPath().size()); - for (auto i = 0; i < params.GetPath().size(); ++i) - map.emplace(params.GetPath().Get(i).GetPath(), params.GetPath().Get(i).GetSize()); TPathList paths; ui64 startPathIndex = 0; - if (const auto taskParamsIt = taskParams.find(S3ProviderName); taskParamsIt != taskParams.cend()) { - NS3::TRange range; - TStringInput input(taskParamsIt->second); - range.Load(&input); - startPathIndex = range.GetStartPathIndex(); - for (auto i = 0; i < range.GetPath().size(); ++i) { - const auto& path = range.GetPath().Get(i); - auto it = map.find(path); - YQL_ENSURE(it != map.end()); - paths.emplace_back(path, it->second); - } - } else { - for (auto i = 0; i < params.GetPath().size(); ++i) { - paths.emplace_back(params.GetPath().Get(i).GetPath(), params.GetPath().Get(i).GetSize()); - } - } + ReadPathsList(params, taskParams, paths, startPathIndex); const auto token = secureParams.Value(params.GetToken(), TString{}); const auto credentialsProviderFactory = CreateCredentialsProviderFactoryForStructuredToken(credentialsFactory, token); diff --git a/ydb/library/yql/providers/s3/proto/range.proto b/ydb/library/yql/providers/s3/proto/range.proto index ce018fbe9a4..975c0d559ec 100644 --- a/ydb/library/yql/providers/s3/proto/range.proto +++ b/ydb/library/yql/providers/s3/proto/range.proto @@ -4,6 +4,14 @@ option cc_enable_arenas = true; package NYql.NS3; message TRange { - repeated string Path = 2; + repeated string DeprecatedPath = 2; // deprecated uint64 StartPathIndex = 3; + repeated TPath Paths = 4; + + message TPath { + string Name = 1; + repeated TPath Children = 2; + uint64 Size = 3; + bool Read = 4; // Read this path + } } diff --git a/ydb/library/yql/providers/s3/proto/source.proto b/ydb/library/yql/providers/s3/proto/source.proto index 857ef69262a..232abb19cd1 100644 --- a/ydb/library/yql/providers/s3/proto/source.proto +++ b/ydb/library/yql/providers/s3/proto/source.proto @@ -11,7 +11,7 @@ message TPath { message TSource { string Url = 1; string Token = 2; - repeated TPath Path = 3; + repeated TPath DeprecatedPath = 3; // deprecated optional string RowType = 4; optional string Format = 5; map<string, string> Settings = 6; diff --git a/ydb/library/yql/providers/s3/provider/CMakeLists.txt b/ydb/library/yql/providers/s3/provider/CMakeLists.txt index 5bbc6501ecd..62694c69ca3 100644 --- a/ydb/library/yql/providers/s3/provider/CMakeLists.txt +++ b/ydb/library/yql/providers/s3/provider/CMakeLists.txt @@ -43,6 +43,7 @@ target_link_libraries(providers-s3-provider PUBLIC providers-result-expr_nodes providers-s3-expr_nodes providers-s3-proto + providers-s3-range_helpers library-yql-utils ) target_sources(providers-s3-provider PRIVATE diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp index 5d0b9c45a1c..2f3b05faa6f 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp @@ -10,6 +10,7 @@ #include <ydb/library/yql/providers/s3/proto/range.pb.h> #include <ydb/library/yql/providers/s3/proto/sink.pb.h> #include <ydb/library/yql/providers/s3/proto/source.pb.h> +#include <ydb/library/yql/providers/s3/range_helpers/file_tree_builder.h> #include <ydb/library/yql/utils/log/log.h> namespace NYql { @@ -18,6 +19,8 @@ using namespace NNodes; namespace { +using namespace NYql::NS3Details; + class TS3DqIntegration: public TDqIntegrationBase { public: TS3DqIntegration(TS3State::TPtr state) @@ -27,13 +30,16 @@ public: ui64 Partition(const TDqSettings&, size_t maxPartitions, const TExprNode& node, TVector<TString>& partitions, TString*, TExprContext&, bool) override { TString cluster; - std::vector<std::vector<TString>> parts; + std::vector<std::vector<std::pair<TString, ui64>>> parts; if (const TMaybeNode<TDqSource> source = &node) { cluster = source.Cast().DataSource().Cast<TS3DataSource>().Cluster().Value(); const auto settings = source.Cast().Settings().Cast<TS3SourceSettingsBase>(); parts.reserve(settings.Paths().Size()); for (auto i = 0u; i < settings.Paths().Size(); ++i) - parts.emplace_back(std::vector<TString>(1U, settings.Paths().Item(i).Path().StringValue())); + parts.emplace_back(1U, + std::pair( + settings.Paths().Item(i).Path().StringValue(), + FromString<ui64>(settings.Paths().Item(i).Size().Value()))); } if (maxPartitions && parts.size() > maxPartitions) { @@ -62,7 +68,9 @@ public: for (const auto& part : parts) { NS3::TRange range; range.SetStartPathIndex(startIdx); - std::for_each(part.cbegin(), part.cend(), [&range, &startIdx](const TString& path) { range.AddPath(path); ++startIdx; }); + TFileTreeBuilder builder; + std::for_each(part.cbegin(), part.cend(), [&builder, &startIdx](const std::pair<TString, ui64>& f) { builder.AddPath(f.first, f.second); ++startIdx; }); + builder.Save(&range); partitions.emplace_back(); TStringOutput out(partitions.back()); @@ -170,11 +178,6 @@ public: const auto& paths = settings.Paths(); YQL_ENSURE(paths.Size() > 0); const TStructExprType* extraColumnsType = paths.Item(0).ExtraColumns().Ref().GetTypeAnn()->Cast<TStructExprType>(); - for (auto i = 0U; i < paths.Size(); ++i) { - const auto p = srcDesc.AddPath(); - p->SetPath(paths.Item(i).Path().StringValue()); - p->SetSize(FromString<ui64>(paths.Item(i).Size().Value())); - } if (const auto mayParseSettings = settings.Maybe<TS3ParseSettings>()) { const auto parseSettings = mayParseSettings.Cast(); diff --git a/ydb/library/yql/providers/s3/range_helpers/CMakeLists.txt b/ydb/library/yql/providers/s3/range_helpers/CMakeLists.txt new file mode 100644 index 00000000000..7e22842e685 --- /dev/null +++ b/ydb/library/yql/providers/s3/range_helpers/CMakeLists.txt @@ -0,0 +1,24 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(providers-s3-range_helpers) +target_compile_options(providers-s3-range_helpers PRIVATE + -DUSE_CURRENT_UDF_ABI_VERSION +) +target_link_libraries(providers-s3-range_helpers PUBLIC + contrib-libs-cxxsupp + yutil + providers-common-provider + providers-s3-proto + library-yql-utils +) +target_sources(providers-s3-range_helpers PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp + ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp +) diff --git a/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp new file mode 100644 index 00000000000..166e097b984 --- /dev/null +++ b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp @@ -0,0 +1,41 @@ +#include "file_tree_builder.h" + +namespace NYql::NS3Details { + +void TFileTreeBuilder::AddPath(const TString& path, ui64 fileSize) { + const auto parts = SplitPath(path); + std::map<TString, TPath>* currentChildren = &Roots; + for (size_t i = 0, size = parts.size(); i < size; ++i) { + TPath& p = (*currentChildren)[parts[i]]; + if (i == size - 1) { // last + Y_VERIFY(p.FileSize == 0); + Y_VERIFY(!p.Read); + p.FileSize = fileSize; + p.Read = true; + } else { + currentChildren = &p.Children; + } + } +} + +void TFileTreeBuilder::Save(NS3::TRange* range) const { + for (const auto& [n, p] : Roots) { + SaveImpl(range->AddPaths(), n, p); + } +} + +void TFileTreeBuilder::SaveImpl(NS3::TRange::TPath* path, const TString& name, const TPath& srcPath) const { + path->SetName(name); + path->SetSize(srcPath.FileSize); + path->SetRead(srcPath.Read); + for (const auto& [n, p] : srcPath.Children) { + SaveImpl(path->AddChildren(), n, p); + } +} + +std::vector<TString> TFileTreeBuilder::SplitPath(const TString& path) { + std::vector<TString> parts = StringSplitter(path).Split('/'); + return parts; +} + +} // namespace NYql::NS3Details diff --git a/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.h b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.h new file mode 100644 index 00000000000..363c60f78bd --- /dev/null +++ b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.h @@ -0,0 +1,30 @@ +#pragma once +#include <ydb/library/yql/providers/s3/proto/range.pb.h> + +#include <util/generic/string.h> +#include <util/string/split.h> + +#include <map> + +namespace NYql::NS3Details { + +class TFileTreeBuilder { + struct TPath { + ui64 FileSize = 0; + bool Read = false; + std::map<TString, TPath> Children; + }; + +public: + void AddPath(const TString& path, ui64 fileSize); + void Save(NS3::TRange* range) const; + +private: + void SaveImpl(NS3::TRange::TPath* path, const TString& name, const TPath& srcPath) const; + static std::vector<TString> SplitPath(const TString& path); + +private: + std::map<TString, TPath> Roots; +}; + +} // namespace NYql::NS3Details diff --git a/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp new file mode 100644 index 00000000000..8644c9acc01 --- /dev/null +++ b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp @@ -0,0 +1,163 @@ +#include "file_tree_builder.h" +#include "path_list_reader.h" + +#include <ydb/library/yql/providers/common/provider/yql_provider_names.h> + +#include <library/cpp/testing/unittest/registar.h> + +namespace NYql::NS3Details { + +Y_UNIT_TEST_SUITE(S3FileTreeBuilderTest) { + THashMap<TString, TString> MakeParams(const NS3::TRange& range) { + TStringBuilder str; + range.Save(&str.Out); + THashMap<TString, TString> map; + map[S3ProviderName] = str; + return map; + } + + Y_UNIT_TEST(Simple) { + TFileTreeBuilder b; + b.AddPath("name", 42); + + NS3::TRange range; + b.Save(&range); + + UNIT_ASSERT_VALUES_EQUAL(range.PathsSize(), 1); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).GetName(), "name"); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).GetSize(), 42); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).ChildrenSize(), 0); + } + + Y_UNIT_TEST(Interesting) { + TFileTreeBuilder b; + b.AddPath("name", 42); + b.AddPath("root/folder/file", 100500); + b.AddPath("root2/file", 10); + b.AddPath("root2", 42); + b.AddPath("root/folder/other_file", 22); + b.AddPath("root/file/", 12); + + NS3::TRange range; + b.Save(&range); + + UNIT_ASSERT_VALUES_EQUAL(range.PathsSize(), 3); + + // name + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).GetName(), "name"); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).GetSize(), 42); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).ChildrenSize(), 0); + + // root + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetName(), "root"); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetSize(), 0); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).ChildrenSize(), 2); + + // root/file/ + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(0).ChildrenSize(), 1); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(0).GetName(), "file"); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(0).GetChildren(0).GetName(), ""); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(0).GetChildren(0).GetSize(), 12); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(0).GetChildren(0).ChildrenSize(), 0); + + // root/folder + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetName(), "folder"); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetSize(), 0); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).ChildrenSize(), 2); + + // root/folder/file + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(0).GetName(), "file"); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(0).GetSize(), 100500); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(0).ChildrenSize(), 0); + + // root/folder/other_file + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(1).GetName(), "other_file"); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(1).GetSize(), 22); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(1).ChildrenSize(), 0); + + // root2 + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).GetName(), "root2"); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).GetSize(), 42); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).ChildrenSize(), 1); + + // root2/file + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).GetChildren(0).GetName(), "file"); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).GetChildren(0).GetSize(), 10); + UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).GetChildren(0).ChildrenSize(), 0); + } + + Y_UNIT_TEST(PassesFileWithZeroSize) { + TFileTreeBuilder b; + b.AddPath("name", 0); + + NS3::TRange range; + b.Save(&range); + + TPathList paths; + ui64 startPathIndex = 0; + ReadPathsList({}, MakeParams(range), paths, startPathIndex); + + UNIT_ASSERT_VALUES_EQUAL(paths.size(), 1); + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "name"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 0); + } + + Y_UNIT_TEST(DeserializesManySlashes) { + TFileTreeBuilder b; + b.AddPath("a///b", 42); + + NS3::TRange range; + b.Save(&range); + + TPathList paths; + ui64 startPathIndex = 0; + ReadPathsList({}, MakeParams(range), paths, startPathIndex); + + UNIT_ASSERT_VALUES_EQUAL(paths.size(), 1); + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "a///b"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 42); + } + + Y_UNIT_TEST(DeserializesTrailingSlash) { + TFileTreeBuilder b; + b.AddPath("root/name//", 3); + b.AddPath("root/name/", 0); + + NS3::TRange range; + b.Save(&range); + + TPathList paths; + ui64 startPathIndex = 0; + ReadPathsList({}, MakeParams(range), paths, startPathIndex); + + UNIT_ASSERT_VALUES_EQUAL(paths.size(), 2); + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "root/name/"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 0); + + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[1]), "root/name//"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[1]), 3); + } + + Y_UNIT_TEST(DeserializesLeadingSlash) { + TFileTreeBuilder b; + b.AddPath("/root/name", 3); + b.AddPath("/", 42); + + NS3::TRange range; + b.Save(&range); + + TPathList paths; + ui64 startPathIndex = 0; + ReadPathsList({}, MakeParams(range), paths, startPathIndex); + + UNIT_ASSERT_VALUES_EQUAL(paths.size(), 2); + + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "/"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 42); + + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[1]), "/root/name"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[1]), 3); + } +} + +} // namespace NYql::NS3Details diff --git a/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp b/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp new file mode 100644 index 00000000000..4813dfa12ac --- /dev/null +++ b/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp @@ -0,0 +1,59 @@ +#include "path_list_reader.h" + +#include <ydb/library/yql/providers/common/provider/yql_provider_names.h> +#include <ydb/library/yql/utils/yql_panic.h> + +#include <util/stream/str.h> + +namespace NYql::NS3Details { + +static void BuildPathsFromTree(const google::protobuf::RepeatedPtrField<NYql::NS3::TRange::TPath>& children, TPathList& paths, TString& currentPath, size_t currentDepth = 0) { + if (children.empty()) { + return; + } + if (currentDepth) { + currentPath += '/'; + } + for (const auto& path : children) { + const size_t prevSize = currentPath.size(); + currentPath += path.GetName(); + if (path.GetRead()) { + paths.emplace_back(currentPath, path.GetSize()); + } + BuildPathsFromTree(path.GetChildren(), paths, currentPath, currentDepth + 1); + currentPath.resize(prevSize); + } +} + +void ReadPathsList(const NS3::TSource& sourceDesc, const THashMap<TString, TString>& taskParams, TPathList& paths, ui64& startPathIndex) { + if (const auto taskParamsIt = taskParams.find(S3ProviderName); taskParamsIt != taskParams.cend()) { + NS3::TRange range; + TStringInput input(taskParamsIt->second); + range.Load(&input); + startPathIndex = range.GetStartPathIndex(); + + // Modern way + if (range.PathsSize()) { + TString buf; + BuildPathsFromTree(range.GetPaths(), paths, buf); + return; + } + + std::unordered_map<TString, size_t> map(sourceDesc.GetDeprecatedPath().size()); + for (auto i = 0; i < sourceDesc.GetDeprecatedPath().size(); ++i) + map.emplace(sourceDesc.GetDeprecatedPath().Get(i).GetPath(), sourceDesc.GetDeprecatedPath().Get(i).GetSize()); + + for (auto i = 0; i < range.GetDeprecatedPath().size(); ++i) { + const auto& path = range.GetDeprecatedPath().Get(i); + auto it = map.find(path); + YQL_ENSURE(it != map.end()); + paths.emplace_back(path, it->second); + } + } else { + for (auto i = 0; i < sourceDesc.GetDeprecatedPath().size(); ++i) { + paths.emplace_back(sourceDesc.GetDeprecatedPath().Get(i).GetPath(), sourceDesc.GetDeprecatedPath().Get(i).GetSize()); + } + } +} + +} // namespace NYql::NS3Details diff --git a/ydb/library/yql/providers/s3/range_helpers/path_list_reader.h b/ydb/library/yql/providers/s3/range_helpers/path_list_reader.h new file mode 100644 index 00000000000..9be8c7814ba --- /dev/null +++ b/ydb/library/yql/providers/s3/range_helpers/path_list_reader.h @@ -0,0 +1,18 @@ +#pragma once +#include <ydb/library/yql/providers/s3/proto/source.pb.h> +#include <ydb/library/yql/providers/s3/proto/range.pb.h> + +#include <util/generic/hash.h> +#include <util/generic/string.h> + +#include <utility> +#include <vector> + +namespace NYql::NS3Details { + +using TPath = std::tuple<TString, size_t>; +using TPathList = std::vector<TPath>; + +void ReadPathsList(const NS3::TSource& sourceDesc, const THashMap<TString, TString>& taskParams, TPathList& paths, ui64& startPathIndex); + +} // namespace NYql::NS3Details diff --git a/ydb/library/yql/providers/s3/range_helpers/path_list_reader_ut.cpp b/ydb/library/yql/providers/s3/range_helpers/path_list_reader_ut.cpp new file mode 100644 index 00000000000..a54135e6561 --- /dev/null +++ b/ydb/library/yql/providers/s3/range_helpers/path_list_reader_ut.cpp @@ -0,0 +1,133 @@ +#include "path_list_reader.h" + +#include <ydb/library/yql/providers/common/provider/yql_provider_names.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/string/builder.h> + +namespace NYql::NS3Details { + +Y_UNIT_TEST_SUITE(PathListReaderTest) { + THashMap<TString, TString> MakeParams(const NS3::TRange& range) { + TStringBuilder str; + range.Save(&str.Out); + THashMap<TString, TString> map; + map[S3ProviderName] = str; + return map; + } + + Y_UNIT_TEST(ReadsFilesListFromSourceSettings) { + NS3::TSource src; + { + auto* p = src.AddDeprecatedPath(); + p->SetPath("my/path"); + p->SetSize(100500); + } + { + auto* p = src.AddDeprecatedPath(); + p->SetPath("other/path"); + p->SetSize(1); + } + + TPathList paths; + ui64 startPathIndex = 42; + ReadPathsList(src, {}, paths, startPathIndex); + + UNIT_ASSERT_VALUES_EQUAL(startPathIndex, 42); + UNIT_ASSERT_VALUES_EQUAL(paths.size(), 2); + + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "my/path"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 100500); + + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[1]), "other/path"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[1]), 1); + } + + Y_UNIT_TEST(ReadsFilesListFromParamsAndSourceSettings) { + NS3::TSource src; + { + auto* p = src.AddDeprecatedPath(); + p->SetPath("my/path"); + p->SetSize(100500); + } + { + auto* p = src.AddDeprecatedPath(); + p->SetPath("other/path"); + p->SetSize(1); + } + + NS3::TRange range; + range.SetStartPathIndex(42); + range.AddDeprecatedPath("my/path"); + + TPathList paths; + ui64 startPathIndex = 0; + ReadPathsList(src, MakeParams(range), paths, startPathIndex); + + UNIT_ASSERT_VALUES_EQUAL(startPathIndex, 42); + UNIT_ASSERT_VALUES_EQUAL(paths.size(), 1); + + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "my/path"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 100500); + } + + NYql::NS3::TRange::TPath* SetPath(NYql::NS3::TRange::TPath* path, const TString& name = {}, ui64 size = 0, bool read = false) { + path->SetName(name); + path->SetSize(size); + path->SetRead(read); + return path; + } + + Y_UNIT_TEST(ReadsFilesListFromTreeParams) { + NS3::TSource src; + { + auto* p = src.AddDeprecatedPath(); + p->SetPath("my/path"); + p->SetSize(100500); + } + + NS3::TRange range; + range.SetStartPathIndex(42); + range.AddDeprecatedPath("my/path"); // We shouldn't react on this + + { + auto* root = SetPath(range.AddPaths(), "root", 1, true); + { + auto* folder = SetPath(root->AddChildren(), "folder"); + SetPath(folder->AddChildren(), "f1", 42, true); + SetPath(folder->AddChildren(), "f2", 100500, true); + } + SetPath(root->AddChildren(), "f3", 0, true); + SetPath(root->AddChildren(), "nothing"); // Shouldn't be processed. + } + { + auto* root2 = SetPath(range.AddPaths(), "root2"); + SetPath(root2->AddChildren(), "f4", 42, true); + } + + TPathList paths; + ui64 startPathIndex = 0; + ReadPathsList(src, MakeParams(range), paths, startPathIndex); + + UNIT_ASSERT_VALUES_EQUAL(startPathIndex, 42); + UNIT_ASSERT_VALUES_EQUAL(paths.size(), 5); + + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "root"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 1); + + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[1]), "root/folder/f1"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[1]), 42); + + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[2]), "root/folder/f2"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[2]), 100500); + + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[3]), "root/f3"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[3]), 0); + + UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[4]), "root2/f4"); + UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[4]), 42); + } +} + +} // namespace NYql::NS3Details diff --git a/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.darwin.txt b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.darwin.txt new file mode 100644 index 00000000000..690425de126 --- /dev/null +++ b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.darwin.txt @@ -0,0 +1,46 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_executable(ydb-library-yql-providers-s3-range_helpers-ut) +target_compile_options(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE + -DUSE_CURRENT_UDF_ABI_VERSION +) +target_include_directories(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers +) +target_link_libraries(ydb-library-yql-providers-s3-range_helpers-ut PUBLIC + contrib-libs-cxxsupp + yutil + library-cpp-cpuid_check + cpp-testing-unittest_main + providers-s3-range_helpers + providers-common-provider +) +target_link_options(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE + -Wl,-no_deduplicate + -Wl,-sdk_version,10.15 + -fPIC + -fPIC +) +target_sources(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp + ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/path_list_reader_ut.cpp +) +add_test( + NAME + ydb-library-yql-providers-s3-range_helpers-ut + COMMAND + ydb-library-yql-providers-s3-range_helpers-ut + --print-before-suite + --print-before-test + --fork-tests + --print-times + --show-fails +) +vcs_info(ydb-library-yql-providers-s3-range_helpers-ut) diff --git a/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.linux.txt b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.linux.txt new file mode 100644 index 00000000000..df529a3d408 --- /dev/null +++ b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.linux.txt @@ -0,0 +1,52 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_executable(ydb-library-yql-providers-s3-range_helpers-ut) +target_compile_options(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE + -DUSE_CURRENT_UDF_ABI_VERSION +) +target_include_directories(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers +) +target_link_libraries(ydb-library-yql-providers-s3-range_helpers-ut PUBLIC + contrib-libs-cxxsupp + yutil + cpp-malloc-tcmalloc + libs-tcmalloc-no_percpu_cache + library-cpp-cpuid_check + cpp-testing-unittest_main + providers-s3-range_helpers + providers-common-provider +) +target_link_options(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE + -ldl + -lrt + -Wl,--no-as-needed + -fPIC + -fPIC + -lpthread + -lrt + -ldl +) +target_sources(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp + ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/path_list_reader_ut.cpp +) +add_test( + NAME + ydb-library-yql-providers-s3-range_helpers-ut + COMMAND + ydb-library-yql-providers-s3-range_helpers-ut + --print-before-suite + --print-before-test + --fork-tests + --print-times + --show-fails +) +vcs_info(ydb-library-yql-providers-s3-range_helpers-ut) diff --git a/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.txt b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.txt new file mode 100644 index 00000000000..fc7b1ee73ce --- /dev/null +++ b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.txt @@ -0,0 +1,13 @@ + +# This file was gererated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (APPLE) + include(CMakeLists.darwin.txt) +elseif (UNIX AND NOT APPLE) + include(CMakeLists.linux.txt) +endif() |