aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorgalaxycrab <UgnineSirdis@ydb.tech>2022-07-19 19:37:37 +0300
committergalaxycrab <UgnineSirdis@ydb.tech>2022-07-19 19:37:37 +0300
commit64735435ab95ae8e9622d9016f4db18ea60459c0 (patch)
tree5f779acf373a542c2153c569f510b23bb05c7222
parent5e561dc030f602c54a8549749d885b5f9547f31d (diff)
downloadydb-64735435ab95ae8e9622d9016f4db18ea60459c0.tar.gz
Lighten S3 tasks params and source desc params
Rename to deprecated Improve test Read paths from tree Store file tree in range params
-rw-r--r--CMakeLists.darwin.txt2
-rw-r--r--CMakeLists.linux.txt2
-rw-r--r--ydb/library/yql/providers/s3/actors/CMakeLists.linux.txt1
-rw-r--r--ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp28
-rw-r--r--ydb/library/yql/providers/s3/proto/range.proto10
-rw-r--r--ydb/library/yql/providers/s3/proto/source.proto2
-rw-r--r--ydb/library/yql/providers/s3/provider/CMakeLists.txt1
-rw-r--r--ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp19
-rw-r--r--ydb/library/yql/providers/s3/range_helpers/CMakeLists.txt24
-rw-r--r--ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp41
-rw-r--r--ydb/library/yql/providers/s3/range_helpers/file_tree_builder.h30
-rw-r--r--ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp163
-rw-r--r--ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp59
-rw-r--r--ydb/library/yql/providers/s3/range_helpers/path_list_reader.h18
-rw-r--r--ydb/library/yql/providers/s3/range_helpers/path_list_reader_ut.cpp133
-rw-r--r--ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.darwin.txt46
-rw-r--r--ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.linux.txt52
-rw-r--r--ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.txt13
18 files changed, 610 insertions, 34 deletions
diff --git a/CMakeLists.darwin.txt b/CMakeLists.darwin.txt
index ec4fdd1cade..e9df897de34 100644
--- a/CMakeLists.darwin.txt
+++ b/CMakeLists.darwin.txt
@@ -778,6 +778,7 @@ add_subdirectory(ydb/library/yql/providers/pq/common)
add_subdirectory(ydb/library/yql/providers/pq/expr_nodes)
add_subdirectory(ydb/library/yql/providers/s3/provider)
add_subdirectory(ydb/library/yql/providers/s3/expr_nodes)
+add_subdirectory(ydb/library/yql/providers/s3/range_helpers)
add_subdirectory(ydb/library/yql/providers/ydb/provider)
add_subdirectory(ydb/public/lib/experimental)
add_subdirectory(ydb/library/yql/providers/ydb/expr_nodes)
@@ -1353,6 +1354,7 @@ add_subdirectory(ydb/library/yql/providers/common/http_gateway/mock)
add_subdirectory(ydb/library/yql/providers/common/structured_token/ut)
add_subdirectory(ydb/library/yql/providers/pq/gateway/dummy)
add_subdirectory(ydb/library/yql/providers/s3/path_generator/ut)
+add_subdirectory(ydb/library/yql/providers/s3/range_helpers/ut)
add_subdirectory(ydb/library/yql/udfs/common/stat/ut)
add_subdirectory(ydb/library/yql/udfs/common/topfreq/ut)
add_subdirectory(ydb/public/sdk/cpp/client/extensions/discovery_mutator/ut)
diff --git a/CMakeLists.linux.txt b/CMakeLists.linux.txt
index 80027f208a4..872b06e43ed 100644
--- a/CMakeLists.linux.txt
+++ b/CMakeLists.linux.txt
@@ -782,6 +782,7 @@ add_subdirectory(ydb/library/yql/providers/pq/common)
add_subdirectory(ydb/library/yql/providers/pq/expr_nodes)
add_subdirectory(ydb/library/yql/providers/s3/provider)
add_subdirectory(ydb/library/yql/providers/s3/expr_nodes)
+add_subdirectory(ydb/library/yql/providers/s3/range_helpers)
add_subdirectory(ydb/library/yql/providers/ydb/provider)
add_subdirectory(ydb/public/lib/experimental)
add_subdirectory(ydb/library/yql/providers/ydb/expr_nodes)
@@ -1374,6 +1375,7 @@ add_subdirectory(ydb/library/yql/providers/common/http_gateway/mock)
add_subdirectory(ydb/library/yql/providers/common/structured_token/ut)
add_subdirectory(ydb/library/yql/providers/pq/gateway/dummy)
add_subdirectory(ydb/library/yql/providers/s3/path_generator/ut)
+add_subdirectory(ydb/library/yql/providers/s3/range_helpers/ut)
add_subdirectory(ydb/library/yql/udfs/common/stat/ut)
add_subdirectory(ydb/library/yql/udfs/common/topfreq/ut)
add_subdirectory(ydb/public/sdk/cpp/client/extensions/discovery_mutator/ut)
diff --git a/ydb/library/yql/providers/s3/actors/CMakeLists.linux.txt b/ydb/library/yql/providers/s3/actors/CMakeLists.linux.txt
index 73edb9f3464..e828c67edbf 100644
--- a/ydb/library/yql/providers/s3/actors/CMakeLists.linux.txt
+++ b/ydb/library/yql/providers/s3/actors/CMakeLists.linux.txt
@@ -29,6 +29,7 @@ target_link_libraries(providers-s3-actors PUBLIC
common-token_accessor-client
providers-s3-compressors
providers-s3-proto
+ providers-s3-range_helpers
providers-s3-serializations
yql-public-types
clickhouse_client_udf
diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp
index 1d0cc239cdb..67ef5f0a8c3 100644
--- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp
+++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp
@@ -33,8 +33,8 @@
#include <ydb/library/yql/providers/s3/compressors/factory.h>
#include <ydb/library/yql/providers/s3/proto/range.pb.h>
+#include <ydb/library/yql/providers/s3/range_helpers/path_list_reader.h>
#include <ydb/library/yql/providers/s3/serializations/serialization_interval.h>
-#include <ydb/library/yql/providers/common/provider/yql_provider_names.h>
#include <library/cpp/actors/core/actor_bootstrapped.h>
#include <library/cpp/actors/core/actor_coroutine.h>
@@ -48,7 +48,8 @@
namespace NYql::NDq {
-using namespace NActors;
+using namespace ::NActors;
+using namespace ::NYql::NS3Details;
namespace {
@@ -105,9 +106,6 @@ struct TEvPrivate {
};
};
-using TPath = std::tuple<TString, size_t>;
-using TPathList = std::vector<TPath>;
-
class TRetryParams {
public:
TRetryParams(const std::shared_ptr<NS3::TRetryConfig>& retryConfig)
@@ -673,28 +671,10 @@ std::pair<NYql::NDq::IDqComputeActorAsyncInput*, IActor*> CreateS3ReadActor(
const std::shared_ptr<NS3::TRetryConfig>& retryConfig)
{
const IFunctionRegistry& functionRegistry = *holderFactory.GetFunctionRegistry();
- std::unordered_map<TString, size_t> map(params.GetPath().size());
- for (auto i = 0; i < params.GetPath().size(); ++i)
- map.emplace(params.GetPath().Get(i).GetPath(), params.GetPath().Get(i).GetSize());
TPathList paths;
ui64 startPathIndex = 0;
- if (const auto taskParamsIt = taskParams.find(S3ProviderName); taskParamsIt != taskParams.cend()) {
- NS3::TRange range;
- TStringInput input(taskParamsIt->second);
- range.Load(&input);
- startPathIndex = range.GetStartPathIndex();
- for (auto i = 0; i < range.GetPath().size(); ++i) {
- const auto& path = range.GetPath().Get(i);
- auto it = map.find(path);
- YQL_ENSURE(it != map.end());
- paths.emplace_back(path, it->second);
- }
- } else {
- for (auto i = 0; i < params.GetPath().size(); ++i) {
- paths.emplace_back(params.GetPath().Get(i).GetPath(), params.GetPath().Get(i).GetSize());
- }
- }
+ ReadPathsList(params, taskParams, paths, startPathIndex);
const auto token = secureParams.Value(params.GetToken(), TString{});
const auto credentialsProviderFactory = CreateCredentialsProviderFactoryForStructuredToken(credentialsFactory, token);
diff --git a/ydb/library/yql/providers/s3/proto/range.proto b/ydb/library/yql/providers/s3/proto/range.proto
index ce018fbe9a4..975c0d559ec 100644
--- a/ydb/library/yql/providers/s3/proto/range.proto
+++ b/ydb/library/yql/providers/s3/proto/range.proto
@@ -4,6 +4,14 @@ option cc_enable_arenas = true;
package NYql.NS3;
message TRange {
- repeated string Path = 2;
+ repeated string DeprecatedPath = 2; // deprecated
uint64 StartPathIndex = 3;
+ repeated TPath Paths = 4;
+
+ message TPath {
+ string Name = 1;
+ repeated TPath Children = 2;
+ uint64 Size = 3;
+ bool Read = 4; // Read this path
+ }
}
diff --git a/ydb/library/yql/providers/s3/proto/source.proto b/ydb/library/yql/providers/s3/proto/source.proto
index 857ef69262a..232abb19cd1 100644
--- a/ydb/library/yql/providers/s3/proto/source.proto
+++ b/ydb/library/yql/providers/s3/proto/source.proto
@@ -11,7 +11,7 @@ message TPath {
message TSource {
string Url = 1;
string Token = 2;
- repeated TPath Path = 3;
+ repeated TPath DeprecatedPath = 3; // deprecated
optional string RowType = 4;
optional string Format = 5;
map<string, string> Settings = 6;
diff --git a/ydb/library/yql/providers/s3/provider/CMakeLists.txt b/ydb/library/yql/providers/s3/provider/CMakeLists.txt
index 5bbc6501ecd..62694c69ca3 100644
--- a/ydb/library/yql/providers/s3/provider/CMakeLists.txt
+++ b/ydb/library/yql/providers/s3/provider/CMakeLists.txt
@@ -43,6 +43,7 @@ target_link_libraries(providers-s3-provider PUBLIC
providers-result-expr_nodes
providers-s3-expr_nodes
providers-s3-proto
+ providers-s3-range_helpers
library-yql-utils
)
target_sources(providers-s3-provider PRIVATE
diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp
index 5d0b9c45a1c..2f3b05faa6f 100644
--- a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp
+++ b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp
@@ -10,6 +10,7 @@
#include <ydb/library/yql/providers/s3/proto/range.pb.h>
#include <ydb/library/yql/providers/s3/proto/sink.pb.h>
#include <ydb/library/yql/providers/s3/proto/source.pb.h>
+#include <ydb/library/yql/providers/s3/range_helpers/file_tree_builder.h>
#include <ydb/library/yql/utils/log/log.h>
namespace NYql {
@@ -18,6 +19,8 @@ using namespace NNodes;
namespace {
+using namespace NYql::NS3Details;
+
class TS3DqIntegration: public TDqIntegrationBase {
public:
TS3DqIntegration(TS3State::TPtr state)
@@ -27,13 +30,16 @@ public:
ui64 Partition(const TDqSettings&, size_t maxPartitions, const TExprNode& node, TVector<TString>& partitions, TString*, TExprContext&, bool) override {
TString cluster;
- std::vector<std::vector<TString>> parts;
+ std::vector<std::vector<std::pair<TString, ui64>>> parts;
if (const TMaybeNode<TDqSource> source = &node) {
cluster = source.Cast().DataSource().Cast<TS3DataSource>().Cluster().Value();
const auto settings = source.Cast().Settings().Cast<TS3SourceSettingsBase>();
parts.reserve(settings.Paths().Size());
for (auto i = 0u; i < settings.Paths().Size(); ++i)
- parts.emplace_back(std::vector<TString>(1U, settings.Paths().Item(i).Path().StringValue()));
+ parts.emplace_back(1U,
+ std::pair(
+ settings.Paths().Item(i).Path().StringValue(),
+ FromString<ui64>(settings.Paths().Item(i).Size().Value())));
}
if (maxPartitions && parts.size() > maxPartitions) {
@@ -62,7 +68,9 @@ public:
for (const auto& part : parts) {
NS3::TRange range;
range.SetStartPathIndex(startIdx);
- std::for_each(part.cbegin(), part.cend(), [&range, &startIdx](const TString& path) { range.AddPath(path); ++startIdx; });
+ TFileTreeBuilder builder;
+ std::for_each(part.cbegin(), part.cend(), [&builder, &startIdx](const std::pair<TString, ui64>& f) { builder.AddPath(f.first, f.second); ++startIdx; });
+ builder.Save(&range);
partitions.emplace_back();
TStringOutput out(partitions.back());
@@ -170,11 +178,6 @@ public:
const auto& paths = settings.Paths();
YQL_ENSURE(paths.Size() > 0);
const TStructExprType* extraColumnsType = paths.Item(0).ExtraColumns().Ref().GetTypeAnn()->Cast<TStructExprType>();
- for (auto i = 0U; i < paths.Size(); ++i) {
- const auto p = srcDesc.AddPath();
- p->SetPath(paths.Item(i).Path().StringValue());
- p->SetSize(FromString<ui64>(paths.Item(i).Size().Value()));
- }
if (const auto mayParseSettings = settings.Maybe<TS3ParseSettings>()) {
const auto parseSettings = mayParseSettings.Cast();
diff --git a/ydb/library/yql/providers/s3/range_helpers/CMakeLists.txt b/ydb/library/yql/providers/s3/range_helpers/CMakeLists.txt
new file mode 100644
index 00000000000..7e22842e685
--- /dev/null
+++ b/ydb/library/yql/providers/s3/range_helpers/CMakeLists.txt
@@ -0,0 +1,24 @@
+
+# This file was gererated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(providers-s3-range_helpers)
+target_compile_options(providers-s3-range_helpers PRIVATE
+ -DUSE_CURRENT_UDF_ABI_VERSION
+)
+target_link_libraries(providers-s3-range_helpers PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ providers-common-provider
+ providers-s3-proto
+ library-yql-utils
+)
+target_sources(providers-s3-range_helpers PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp
+)
diff --git a/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp
new file mode 100644
index 00000000000..166e097b984
--- /dev/null
+++ b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.cpp
@@ -0,0 +1,41 @@
+#include "file_tree_builder.h"
+
+namespace NYql::NS3Details {
+
+void TFileTreeBuilder::AddPath(const TString& path, ui64 fileSize) {
+ const auto parts = SplitPath(path);
+ std::map<TString, TPath>* currentChildren = &Roots;
+ for (size_t i = 0, size = parts.size(); i < size; ++i) {
+ TPath& p = (*currentChildren)[parts[i]];
+ if (i == size - 1) { // last
+ Y_VERIFY(p.FileSize == 0);
+ Y_VERIFY(!p.Read);
+ p.FileSize = fileSize;
+ p.Read = true;
+ } else {
+ currentChildren = &p.Children;
+ }
+ }
+}
+
+void TFileTreeBuilder::Save(NS3::TRange* range) const {
+ for (const auto& [n, p] : Roots) {
+ SaveImpl(range->AddPaths(), n, p);
+ }
+}
+
+void TFileTreeBuilder::SaveImpl(NS3::TRange::TPath* path, const TString& name, const TPath& srcPath) const {
+ path->SetName(name);
+ path->SetSize(srcPath.FileSize);
+ path->SetRead(srcPath.Read);
+ for (const auto& [n, p] : srcPath.Children) {
+ SaveImpl(path->AddChildren(), n, p);
+ }
+}
+
+std::vector<TString> TFileTreeBuilder::SplitPath(const TString& path) {
+ std::vector<TString> parts = StringSplitter(path).Split('/');
+ return parts;
+}
+
+} // namespace NYql::NS3Details
diff --git a/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.h b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.h
new file mode 100644
index 00000000000..363c60f78bd
--- /dev/null
+++ b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder.h
@@ -0,0 +1,30 @@
+#pragma once
+#include <ydb/library/yql/providers/s3/proto/range.pb.h>
+
+#include <util/generic/string.h>
+#include <util/string/split.h>
+
+#include <map>
+
+namespace NYql::NS3Details {
+
+class TFileTreeBuilder {
+ struct TPath {
+ ui64 FileSize = 0;
+ bool Read = false;
+ std::map<TString, TPath> Children;
+ };
+
+public:
+ void AddPath(const TString& path, ui64 fileSize);
+ void Save(NS3::TRange* range) const;
+
+private:
+ void SaveImpl(NS3::TRange::TPath* path, const TString& name, const TPath& srcPath) const;
+ static std::vector<TString> SplitPath(const TString& path);
+
+private:
+ std::map<TString, TPath> Roots;
+};
+
+} // namespace NYql::NS3Details
diff --git a/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp
new file mode 100644
index 00000000000..8644c9acc01
--- /dev/null
+++ b/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp
@@ -0,0 +1,163 @@
+#include "file_tree_builder.h"
+#include "path_list_reader.h"
+
+#include <ydb/library/yql/providers/common/provider/yql_provider_names.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+namespace NYql::NS3Details {
+
+Y_UNIT_TEST_SUITE(S3FileTreeBuilderTest) {
+ THashMap<TString, TString> MakeParams(const NS3::TRange& range) {
+ TStringBuilder str;
+ range.Save(&str.Out);
+ THashMap<TString, TString> map;
+ map[S3ProviderName] = str;
+ return map;
+ }
+
+ Y_UNIT_TEST(Simple) {
+ TFileTreeBuilder b;
+ b.AddPath("name", 42);
+
+ NS3::TRange range;
+ b.Save(&range);
+
+ UNIT_ASSERT_VALUES_EQUAL(range.PathsSize(), 1);
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).GetName(), "name");
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).GetSize(), 42);
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).ChildrenSize(), 0);
+ }
+
+ Y_UNIT_TEST(Interesting) {
+ TFileTreeBuilder b;
+ b.AddPath("name", 42);
+ b.AddPath("root/folder/file", 100500);
+ b.AddPath("root2/file", 10);
+ b.AddPath("root2", 42);
+ b.AddPath("root/folder/other_file", 22);
+ b.AddPath("root/file/", 12);
+
+ NS3::TRange range;
+ b.Save(&range);
+
+ UNIT_ASSERT_VALUES_EQUAL(range.PathsSize(), 3);
+
+ // name
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).GetName(), "name");
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).GetSize(), 42);
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(0).ChildrenSize(), 0);
+
+ // root
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetName(), "root");
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetSize(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).ChildrenSize(), 2);
+
+ // root/file/
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(0).ChildrenSize(), 1);
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(0).GetName(), "file");
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(0).GetChildren(0).GetName(), "");
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(0).GetChildren(0).GetSize(), 12);
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(0).GetChildren(0).ChildrenSize(), 0);
+
+ // root/folder
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetName(), "folder");
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetSize(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).ChildrenSize(), 2);
+
+ // root/folder/file
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(0).GetName(), "file");
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(0).GetSize(), 100500);
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(0).ChildrenSize(), 0);
+
+ // root/folder/other_file
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(1).GetName(), "other_file");
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(1).GetSize(), 22);
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(1).GetChildren(1).GetChildren(1).ChildrenSize(), 0);
+
+ // root2
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).GetName(), "root2");
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).GetSize(), 42);
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).ChildrenSize(), 1);
+
+ // root2/file
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).GetChildren(0).GetName(), "file");
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).GetChildren(0).GetSize(), 10);
+ UNIT_ASSERT_VALUES_EQUAL(range.GetPaths(2).GetChildren(0).ChildrenSize(), 0);
+ }
+
+ Y_UNIT_TEST(PassesFileWithZeroSize) {
+ TFileTreeBuilder b;
+ b.AddPath("name", 0);
+
+ NS3::TRange range;
+ b.Save(&range);
+
+ TPathList paths;
+ ui64 startPathIndex = 0;
+ ReadPathsList({}, MakeParams(range), paths, startPathIndex);
+
+ UNIT_ASSERT_VALUES_EQUAL(paths.size(), 1);
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "name");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 0);
+ }
+
+ Y_UNIT_TEST(DeserializesManySlashes) {
+ TFileTreeBuilder b;
+ b.AddPath("a///b", 42);
+
+ NS3::TRange range;
+ b.Save(&range);
+
+ TPathList paths;
+ ui64 startPathIndex = 0;
+ ReadPathsList({}, MakeParams(range), paths, startPathIndex);
+
+ UNIT_ASSERT_VALUES_EQUAL(paths.size(), 1);
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "a///b");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 42);
+ }
+
+ Y_UNIT_TEST(DeserializesTrailingSlash) {
+ TFileTreeBuilder b;
+ b.AddPath("root/name//", 3);
+ b.AddPath("root/name/", 0);
+
+ NS3::TRange range;
+ b.Save(&range);
+
+ TPathList paths;
+ ui64 startPathIndex = 0;
+ ReadPathsList({}, MakeParams(range), paths, startPathIndex);
+
+ UNIT_ASSERT_VALUES_EQUAL(paths.size(), 2);
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "root/name/");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 0);
+
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[1]), "root/name//");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[1]), 3);
+ }
+
+ Y_UNIT_TEST(DeserializesLeadingSlash) {
+ TFileTreeBuilder b;
+ b.AddPath("/root/name", 3);
+ b.AddPath("/", 42);
+
+ NS3::TRange range;
+ b.Save(&range);
+
+ TPathList paths;
+ ui64 startPathIndex = 0;
+ ReadPathsList({}, MakeParams(range), paths, startPathIndex);
+
+ UNIT_ASSERT_VALUES_EQUAL(paths.size(), 2);
+
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "/");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 42);
+
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[1]), "/root/name");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[1]), 3);
+ }
+}
+
+} // namespace NYql::NS3Details
diff --git a/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp b/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp
new file mode 100644
index 00000000000..4813dfa12ac
--- /dev/null
+++ b/ydb/library/yql/providers/s3/range_helpers/path_list_reader.cpp
@@ -0,0 +1,59 @@
+#include "path_list_reader.h"
+
+#include <ydb/library/yql/providers/common/provider/yql_provider_names.h>
+#include <ydb/library/yql/utils/yql_panic.h>
+
+#include <util/stream/str.h>
+
+namespace NYql::NS3Details {
+
+static void BuildPathsFromTree(const google::protobuf::RepeatedPtrField<NYql::NS3::TRange::TPath>& children, TPathList& paths, TString& currentPath, size_t currentDepth = 0) {
+ if (children.empty()) {
+ return;
+ }
+ if (currentDepth) {
+ currentPath += '/';
+ }
+ for (const auto& path : children) {
+ const size_t prevSize = currentPath.size();
+ currentPath += path.GetName();
+ if (path.GetRead()) {
+ paths.emplace_back(currentPath, path.GetSize());
+ }
+ BuildPathsFromTree(path.GetChildren(), paths, currentPath, currentDepth + 1);
+ currentPath.resize(prevSize);
+ }
+}
+
+void ReadPathsList(const NS3::TSource& sourceDesc, const THashMap<TString, TString>& taskParams, TPathList& paths, ui64& startPathIndex) {
+ if (const auto taskParamsIt = taskParams.find(S3ProviderName); taskParamsIt != taskParams.cend()) {
+ NS3::TRange range;
+ TStringInput input(taskParamsIt->second);
+ range.Load(&input);
+ startPathIndex = range.GetStartPathIndex();
+
+ // Modern way
+ if (range.PathsSize()) {
+ TString buf;
+ BuildPathsFromTree(range.GetPaths(), paths, buf);
+ return;
+ }
+
+ std::unordered_map<TString, size_t> map(sourceDesc.GetDeprecatedPath().size());
+ for (auto i = 0; i < sourceDesc.GetDeprecatedPath().size(); ++i)
+ map.emplace(sourceDesc.GetDeprecatedPath().Get(i).GetPath(), sourceDesc.GetDeprecatedPath().Get(i).GetSize());
+
+ for (auto i = 0; i < range.GetDeprecatedPath().size(); ++i) {
+ const auto& path = range.GetDeprecatedPath().Get(i);
+ auto it = map.find(path);
+ YQL_ENSURE(it != map.end());
+ paths.emplace_back(path, it->second);
+ }
+ } else {
+ for (auto i = 0; i < sourceDesc.GetDeprecatedPath().size(); ++i) {
+ paths.emplace_back(sourceDesc.GetDeprecatedPath().Get(i).GetPath(), sourceDesc.GetDeprecatedPath().Get(i).GetSize());
+ }
+ }
+}
+
+} // namespace NYql::NS3Details
diff --git a/ydb/library/yql/providers/s3/range_helpers/path_list_reader.h b/ydb/library/yql/providers/s3/range_helpers/path_list_reader.h
new file mode 100644
index 00000000000..9be8c7814ba
--- /dev/null
+++ b/ydb/library/yql/providers/s3/range_helpers/path_list_reader.h
@@ -0,0 +1,18 @@
+#pragma once
+#include <ydb/library/yql/providers/s3/proto/source.pb.h>
+#include <ydb/library/yql/providers/s3/proto/range.pb.h>
+
+#include <util/generic/hash.h>
+#include <util/generic/string.h>
+
+#include <utility>
+#include <vector>
+
+namespace NYql::NS3Details {
+
+using TPath = std::tuple<TString, size_t>;
+using TPathList = std::vector<TPath>;
+
+void ReadPathsList(const NS3::TSource& sourceDesc, const THashMap<TString, TString>& taskParams, TPathList& paths, ui64& startPathIndex);
+
+} // namespace NYql::NS3Details
diff --git a/ydb/library/yql/providers/s3/range_helpers/path_list_reader_ut.cpp b/ydb/library/yql/providers/s3/range_helpers/path_list_reader_ut.cpp
new file mode 100644
index 00000000000..a54135e6561
--- /dev/null
+++ b/ydb/library/yql/providers/s3/range_helpers/path_list_reader_ut.cpp
@@ -0,0 +1,133 @@
+#include "path_list_reader.h"
+
+#include <ydb/library/yql/providers/common/provider/yql_provider_names.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/string/builder.h>
+
+namespace NYql::NS3Details {
+
+Y_UNIT_TEST_SUITE(PathListReaderTest) {
+ THashMap<TString, TString> MakeParams(const NS3::TRange& range) {
+ TStringBuilder str;
+ range.Save(&str.Out);
+ THashMap<TString, TString> map;
+ map[S3ProviderName] = str;
+ return map;
+ }
+
+ Y_UNIT_TEST(ReadsFilesListFromSourceSettings) {
+ NS3::TSource src;
+ {
+ auto* p = src.AddDeprecatedPath();
+ p->SetPath("my/path");
+ p->SetSize(100500);
+ }
+ {
+ auto* p = src.AddDeprecatedPath();
+ p->SetPath("other/path");
+ p->SetSize(1);
+ }
+
+ TPathList paths;
+ ui64 startPathIndex = 42;
+ ReadPathsList(src, {}, paths, startPathIndex);
+
+ UNIT_ASSERT_VALUES_EQUAL(startPathIndex, 42);
+ UNIT_ASSERT_VALUES_EQUAL(paths.size(), 2);
+
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "my/path");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 100500);
+
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[1]), "other/path");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[1]), 1);
+ }
+
+ Y_UNIT_TEST(ReadsFilesListFromParamsAndSourceSettings) {
+ NS3::TSource src;
+ {
+ auto* p = src.AddDeprecatedPath();
+ p->SetPath("my/path");
+ p->SetSize(100500);
+ }
+ {
+ auto* p = src.AddDeprecatedPath();
+ p->SetPath("other/path");
+ p->SetSize(1);
+ }
+
+ NS3::TRange range;
+ range.SetStartPathIndex(42);
+ range.AddDeprecatedPath("my/path");
+
+ TPathList paths;
+ ui64 startPathIndex = 0;
+ ReadPathsList(src, MakeParams(range), paths, startPathIndex);
+
+ UNIT_ASSERT_VALUES_EQUAL(startPathIndex, 42);
+ UNIT_ASSERT_VALUES_EQUAL(paths.size(), 1);
+
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "my/path");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 100500);
+ }
+
+ NYql::NS3::TRange::TPath* SetPath(NYql::NS3::TRange::TPath* path, const TString& name = {}, ui64 size = 0, bool read = false) {
+ path->SetName(name);
+ path->SetSize(size);
+ path->SetRead(read);
+ return path;
+ }
+
+ Y_UNIT_TEST(ReadsFilesListFromTreeParams) {
+ NS3::TSource src;
+ {
+ auto* p = src.AddDeprecatedPath();
+ p->SetPath("my/path");
+ p->SetSize(100500);
+ }
+
+ NS3::TRange range;
+ range.SetStartPathIndex(42);
+ range.AddDeprecatedPath("my/path"); // We shouldn't react on this
+
+ {
+ auto* root = SetPath(range.AddPaths(), "root", 1, true);
+ {
+ auto* folder = SetPath(root->AddChildren(), "folder");
+ SetPath(folder->AddChildren(), "f1", 42, true);
+ SetPath(folder->AddChildren(), "f2", 100500, true);
+ }
+ SetPath(root->AddChildren(), "f3", 0, true);
+ SetPath(root->AddChildren(), "nothing"); // Shouldn't be processed.
+ }
+ {
+ auto* root2 = SetPath(range.AddPaths(), "root2");
+ SetPath(root2->AddChildren(), "f4", 42, true);
+ }
+
+ TPathList paths;
+ ui64 startPathIndex = 0;
+ ReadPathsList(src, MakeParams(range), paths, startPathIndex);
+
+ UNIT_ASSERT_VALUES_EQUAL(startPathIndex, 42);
+ UNIT_ASSERT_VALUES_EQUAL(paths.size(), 5);
+
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[0]), "root");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[0]), 1);
+
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[1]), "root/folder/f1");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[1]), 42);
+
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[2]), "root/folder/f2");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[2]), 100500);
+
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[3]), "root/f3");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[3]), 0);
+
+ UNIT_ASSERT_VALUES_EQUAL(std::get<TString>(paths[4]), "root2/f4");
+ UNIT_ASSERT_VALUES_EQUAL(std::get<ui64>(paths[4]), 42);
+ }
+}
+
+} // namespace NYql::NS3Details
diff --git a/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.darwin.txt b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.darwin.txt
new file mode 100644
index 00000000000..690425de126
--- /dev/null
+++ b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.darwin.txt
@@ -0,0 +1,46 @@
+
+# This file was gererated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_executable(ydb-library-yql-providers-s3-range_helpers-ut)
+target_compile_options(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE
+ -DUSE_CURRENT_UDF_ABI_VERSION
+)
+target_include_directories(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers
+)
+target_link_libraries(ydb-library-yql-providers-s3-range_helpers-ut PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-cpuid_check
+ cpp-testing-unittest_main
+ providers-s3-range_helpers
+ providers-common-provider
+)
+target_link_options(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE
+ -Wl,-no_deduplicate
+ -Wl,-sdk_version,10.15
+ -fPIC
+ -fPIC
+)
+target_sources(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/path_list_reader_ut.cpp
+)
+add_test(
+ NAME
+ ydb-library-yql-providers-s3-range_helpers-ut
+ COMMAND
+ ydb-library-yql-providers-s3-range_helpers-ut
+ --print-before-suite
+ --print-before-test
+ --fork-tests
+ --print-times
+ --show-fails
+)
+vcs_info(ydb-library-yql-providers-s3-range_helpers-ut)
diff --git a/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.linux.txt b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.linux.txt
new file mode 100644
index 00000000000..df529a3d408
--- /dev/null
+++ b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.linux.txt
@@ -0,0 +1,52 @@
+
+# This file was gererated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_executable(ydb-library-yql-providers-s3-range_helpers-ut)
+target_compile_options(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE
+ -DUSE_CURRENT_UDF_ABI_VERSION
+)
+target_include_directories(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers
+)
+target_link_libraries(ydb-library-yql-providers-s3-range_helpers-ut PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-malloc-tcmalloc
+ libs-tcmalloc-no_percpu_cache
+ library-cpp-cpuid_check
+ cpp-testing-unittest_main
+ providers-s3-range_helpers
+ providers-common-provider
+)
+target_link_options(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE
+ -ldl
+ -lrt
+ -Wl,--no-as-needed
+ -fPIC
+ -fPIC
+ -lpthread
+ -lrt
+ -ldl
+)
+target_sources(ydb-library-yql-providers-s3-range_helpers-ut PRIVATE
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/file_tree_builder_ut.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/library/yql/providers/s3/range_helpers/path_list_reader_ut.cpp
+)
+add_test(
+ NAME
+ ydb-library-yql-providers-s3-range_helpers-ut
+ COMMAND
+ ydb-library-yql-providers-s3-range_helpers-ut
+ --print-before-suite
+ --print-before-test
+ --fork-tests
+ --print-times
+ --show-fails
+)
+vcs_info(ydb-library-yql-providers-s3-range_helpers-ut)
diff --git a/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.txt b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.txt
new file mode 100644
index 00000000000..fc7b1ee73ce
--- /dev/null
+++ b/ydb/library/yql/providers/s3/range_helpers/ut/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+# This file was gererated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (APPLE)
+ include(CMakeLists.darwin.txt)
+elseif (UNIX AND NOT APPLE)
+ include(CMakeLists.linux.txt)
+endif()