summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorhcpp <[email protected]>2022-07-28 23:11:39 +0300
committerhcpp <[email protected]>2022-07-28 23:11:39 +0300
commit6680613e3d1f09a604866924b2a21855ffbeb249 (patch)
tree759589179c8e03c932397c8b4b14661405759cdf
parent75bd1e6d8a8845828be6d151670104c36ba572d1 (diff)
path generator improvements
-rw-r--r--ydb/library/yql/providers/s3/path_generator/ut/yql_generate_partitioning_rules_ut.cpp40
-rw-r--r--ydb/library/yql/providers/s3/path_generator/ut/yql_parse_partitioning_rules_ut.cpp30
-rw-r--r--ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.cpp690
-rw-r--r--ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.h100
4 files changed, 477 insertions, 383 deletions
diff --git a/ydb/library/yql/providers/s3/path_generator/ut/yql_generate_partitioning_rules_ut.cpp b/ydb/library/yql/providers/s3/path_generator/ut/yql_generate_partitioning_rules_ut.cpp
index 58cfa728eb8..213b13c86a5 100644
--- a/ydb/library/yql/providers/s3/path_generator/ut/yql_generate_partitioning_rules_ut.cpp
+++ b/ydb/library/yql/providers/s3/path_generator/ut/yql_generate_partitioning_rules_ut.cpp
@@ -6,7 +6,7 @@ namespace NYql::NPathGenerator {
Y_UNIT_TEST_SUITE(TGenerateTests) {
Y_UNIT_TEST(SuccessGenerate) {
- auto result = ParsePartitioningRules(R"(
+ auto generator = CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -17,7 +17,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
}
)", {"city", "code"});
- auto rules = ExpandPartitioningRules(result, 100);
+ auto rules = generator->GetRules();
UNIT_ASSERT_VALUES_EQUAL(rules.size(), 4);
UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/MSK/0/");
UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 2);
@@ -30,7 +30,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
}
Y_UNIT_TEST(SuccessGenerateInteger) {
- auto result = ParsePartitioningRules(R"(
+ auto generator = CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -43,7 +43,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
}
)", {"city", "code"});
- auto rules = ExpandPartitioningRules(result, 100);
+ auto rules = generator->GetRules();
UNIT_ASSERT_VALUES_EQUAL(rules.size(), 4);
UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/MSK/0/");
UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 2);
@@ -56,7 +56,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
}
Y_UNIT_TEST(SuccessGenerateIntegerWithDigits) {
- auto result = ParsePartitioningRules(R"(
+ auto generator = CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -70,7 +70,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
}
)", {"city", "code"});
- auto rules = ExpandPartitioningRules(result, 100);
+ auto rules = generator->GetRules();
UNIT_ASSERT_VALUES_EQUAL(rules.size(), 4);
UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/MSK/00000/");
UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 2);
@@ -83,7 +83,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
}
Y_UNIT_TEST(SuccessGenerateIntegerWithDigitsOverflow) {
- auto result = ParsePartitioningRules(R"(
+ UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -95,13 +95,11 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
"projection.code.digits" : 10000,
"storage.location.template" : "/${city}/${code}/"
}
- )", {"city", "code"});
-
- UNIT_ASSERT_EXCEPTION_CONTAINS(ExpandPartitioningRules(result, 100), yexception, "Digits cannot exceed 64, but received 10000");
+ )", {"city", "code"}), yexception, "Digits cannot exceed 64, but received 10000");
}
Y_UNIT_TEST(CheckLimit) {
- auto result = ParsePartitioningRules(R"(
+ UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -110,21 +108,19 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
"projection.code.values" : "0,1",
"storage.location.template" : "/${city}/${code}/"
}
- )", {"city", "code"});
-
- UNIT_ASSERT_EXCEPTION_CONTAINS(ExpandPartitioningRules(result, 2), yexception, "The limit on the number of paths has been reached: 2 of 2");
+ )", {"city", "code"}, 2), yexception, "The limit on the number of paths has been reached: 2 of 2");
}
Y_UNIT_TEST(CheckHiveFormat) {
- auto result = ParsePartitioningRules({}, {"city", "code", "device_id"});
- auto rules = ExpandPartitioningRules(result, 1);
+ auto generator = CreatePathGenerator({}, {"city", "code", "device_id"}, 1);
+ auto rules = generator->GetRules();
UNIT_ASSERT_VALUES_EQUAL(rules.size(), 1);
UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 0);
UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/city=${city}/code=${code}/device_id=${device_id}");
}
Y_UNIT_TEST(SuccessGenerateDateWith) {
- auto result = ParsePartitioningRules(R"(
+ auto generator = CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -139,7 +135,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
}
)", {"city", "code"});
- auto rules = ExpandPartitioningRules(result, 100);
+ auto rules = generator->GetRules();
UNIT_ASSERT_VALUES_EQUAL(rules.size(), 4);
UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/MSK/2010-01-01/");
UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 2);
@@ -152,7 +148,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
}
Y_UNIT_TEST(SuccessGenerateDateWithUnixtime) {
- auto result = ParsePartitioningRules(R"(
+ auto generator = CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -167,7 +163,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
}
)", {"city", "code"});
- auto rules = ExpandPartitioningRules(result, 100);
+ auto rules = generator->GetRules();
UNIT_ASSERT_VALUES_EQUAL(rules.size(), 2);
UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/MSK/1970-01-03/");
UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 2);
@@ -176,7 +172,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
}
Y_UNIT_TEST(SuccessGenerateDateWithNow) {
- auto result = ParsePartitioningRules(R"(
+ auto generator = CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -191,7 +187,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) {
}
)", {"city", "code"});
auto nowBefore = TInstant::Now();
- auto rules = ExpandPartitioningRules(result, 100);
+ auto rules = generator->GetRules();
auto nowAfter = TInstant::Now();
UNIT_ASSERT_VALUES_EQUAL(rules.size(), 2);
UNIT_ASSERT_GE(rules[0].Path, "/MSK/" + (nowBefore + TDuration::Days(1)).FormatLocalTime("%F") + "/");
diff --git a/ydb/library/yql/providers/s3/path_generator/ut/yql_parse_partitioning_rules_ut.cpp b/ydb/library/yql/providers/s3/path_generator/ut/yql_parse_partitioning_rules_ut.cpp
index 2cfbb77e010..bea34672ffa 100644
--- a/ydb/library/yql/providers/s3/path_generator/ut/yql_parse_partitioning_rules_ut.cpp
+++ b/ydb/library/yql/providers/s3/path_generator/ut/yql_parse_partitioning_rules_ut.cpp
@@ -6,7 +6,7 @@ namespace NYql::NPathGenerator {
Y_UNIT_TEST_SUITE(TParseTests) {
Y_UNIT_TEST(SuccessParseEnum) {
- auto result = ParsePartitioningRules(R"(
+ auto generator = CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -14,11 +14,12 @@ Y_UNIT_TEST_SUITE(TParseTests) {
"storage.location.template" : "/${city}/"
}
)", {"city"});
+ const auto& result = generator->GetConfig();
UNIT_ASSERT_VALUES_EQUAL(result.Enabled, true);
UNIT_ASSERT_VALUES_EQUAL(result.LocationTemplate, "/${city}/");
UNIT_ASSERT_VALUES_EQUAL(result.Rules.size(), 1);
const auto& rule = result.Rules.front();
- UNIT_ASSERT_VALUES_EQUAL(rule.Type, EType::ENUM);
+ UNIT_ASSERT_VALUES_EQUAL(rule.Type, IPathGenerator::EType::ENUM);
UNIT_ASSERT_VALUES_EQUAL(rule.Name, "city");
UNIT_ASSERT_VALUES_EQUAL(rule.Values.size(), 2);
UNIT_ASSERT_VALUES_EQUAL(rule.Values.front(), "MSK");
@@ -26,7 +27,7 @@ Y_UNIT_TEST_SUITE(TParseTests) {
}
Y_UNIT_TEST(ParseTwoEnabled) {
- UNIT_ASSERT_NO_EXCEPTION(ParsePartitioningRules(R"(
+ UNIT_ASSERT_NO_EXCEPTION(CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.enabled.type" : "enum",
@@ -37,7 +38,7 @@ Y_UNIT_TEST_SUITE(TParseTests) {
}
Y_UNIT_TEST(InvalidPartitioningTemplate) {
- UNIT_ASSERT_EXCEPTION_CONTAINS(ParsePartitioningRules(R"(
+ UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -48,7 +49,7 @@ Y_UNIT_TEST_SUITE(TParseTests) {
}
Y_UNIT_TEST(InvalidProjectionTemplate) {
- UNIT_ASSERT_EXCEPTION_CONTAINS(ParsePartitioningRules(R"(
+ UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -59,7 +60,7 @@ Y_UNIT_TEST_SUITE(TParseTests) {
}
Y_UNIT_TEST(InvalidTemplate) {
- UNIT_ASSERT_EXCEPTION_CONTAINS(ParsePartitioningRules(R"(
+ UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -70,7 +71,7 @@ Y_UNIT_TEST_SUITE(TParseTests) {
}
Y_UNIT_TEST(StartSubstition) {
- auto result = ParsePartitioningRules(R"(
+ auto generator = CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -78,11 +79,12 @@ Y_UNIT_TEST_SUITE(TParseTests) {
"storage.location.template" : "/${city}/${device_id}/"
}
)", {"city", "device_id"});
+ const auto& result = generator->GetConfig();
UNIT_ASSERT_VALUES_EQUAL(result.Enabled, true);
UNIT_ASSERT_VALUES_EQUAL(result.LocationTemplate, "/${city}/${device_id}/");
UNIT_ASSERT_VALUES_EQUAL(result.Rules.size(), 1);
const auto& rule = result.Rules.front();
- UNIT_ASSERT_VALUES_EQUAL(rule.Type, EType::ENUM);
+ UNIT_ASSERT_VALUES_EQUAL(rule.Type, IPathGenerator::EType::ENUM);
UNIT_ASSERT_VALUES_EQUAL(rule.Name, "city");
UNIT_ASSERT_VALUES_EQUAL(rule.Values.size(), 2);
UNIT_ASSERT_VALUES_EQUAL(rule.Values.front(), "MSK");
@@ -90,7 +92,7 @@ Y_UNIT_TEST_SUITE(TParseTests) {
}
Y_UNIT_TEST(InvalidValuesType) {
- UNIT_ASSERT_EXCEPTION_CONTAINS(ParsePartitioningRules(R"(
+ UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.city.type" : "enum",
@@ -101,7 +103,7 @@ Y_UNIT_TEST_SUITE(TParseTests) {
}
Y_UNIT_TEST(SuccessParseInteger) {
- auto result = ParsePartitioningRules(R"(
+ auto generator = CreatePathGenerator(R"(
{
"projection.enabled" : true,
"projection.id.type" : "integer",
@@ -110,18 +112,19 @@ Y_UNIT_TEST_SUITE(TParseTests) {
"storage.location.template" : "/${id}/"
}
)", {"id"});
+ const auto& result = generator->GetConfig();
UNIT_ASSERT_VALUES_EQUAL(result.Enabled, true);
UNIT_ASSERT_VALUES_EQUAL(result.LocationTemplate, "/${id}/");
UNIT_ASSERT_VALUES_EQUAL(result.Rules.size(), 1);
const auto& rule = result.Rules.front();
- UNIT_ASSERT_VALUES_EQUAL(rule.Type, EType::INTEGER);
+ UNIT_ASSERT_VALUES_EQUAL(rule.Type, IPathGenerator::EType::INTEGER);
UNIT_ASSERT_VALUES_EQUAL(rule.Name, "id");
UNIT_ASSERT_VALUES_EQUAL(rule.Min, 5);
UNIT_ASSERT_VALUES_EQUAL(rule.Max, 6);
}
Y_UNIT_TEST(SuccessDisableProjection) {
- auto result = ParsePartitioningRules(R"(
+ auto generator = CreatePathGenerator(R"(
{
"projection.enabled" : false,
"projection.id.type" : "integer",
@@ -130,11 +133,12 @@ Y_UNIT_TEST_SUITE(TParseTests) {
"storage.location.template" : "/${id}/"
}
)", {"id"});
+ const auto& result = generator->GetConfig();
UNIT_ASSERT_VALUES_EQUAL(result.Enabled, false);
UNIT_ASSERT_VALUES_EQUAL(result.LocationTemplate, "/${id}/");
UNIT_ASSERT_VALUES_EQUAL(result.Rules.size(), 1);
const auto& rule = result.Rules.front();
- UNIT_ASSERT_VALUES_EQUAL(rule.Type, EType::INTEGER);
+ UNIT_ASSERT_VALUES_EQUAL(rule.Type, IPathGenerator::EType::INTEGER);
UNIT_ASSERT_VALUES_EQUAL(rule.Name, "id");
UNIT_ASSERT_VALUES_EQUAL(rule.Min, 5);
UNIT_ASSERT_VALUES_EQUAL(rule.Max, 6);
diff --git a/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.cpp b/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.cpp
index 60877b38fdb..c5bec51f401 100644
--- a/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.cpp
+++ b/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.cpp
@@ -4,6 +4,7 @@
#include <regex>
#include <util/datetime/base.h>
#include <util/generic/serialized_enum.h>
+#include <util/string/cast.h>
#include <util/string/split.h>
namespace NYql::NPathGenerator {
@@ -65,167 +66,32 @@ i64 GetBoolOrThrow(const NSc::TValue& json, const TString& error) {
return json.IsTrue();
}
-EIntervalUnit ToIntervalUnit(const TString& unit) {
- const auto names = GetEnumNames<EIntervalUnit>();
+IPathGenerator::EIntervalUnit ToIntervalUnit(const TString& unit) {
+ const auto names = GetEnumNames<IPathGenerator::EIntervalUnit>();
for (const auto& name: names) {
if (name.second == unit) {
return name.first;
}
}
- ythrow yexception() << "Invalid projection scheme: unit " << unit << " must be one of " << GetEnumAllNames<EIntervalUnit>();
+ ythrow yexception() << "Invalid projection scheme: unit " << unit << " must be one of " << GetEnumAllNames<IPathGenerator::EIntervalUnit>();
}
-TMap<EType, TString> ToLowerType() {
- TMap<EType, TString> result;
- for (const auto& p: GetEnumNames<EType>()) {
+TMap<IPathGenerator::EType, TString> ToLowerType() {
+ TMap<IPathGenerator::EType, TString> result;
+ for (const auto& p: GetEnumNames<IPathGenerator::EType>()) {
result[p.first] = to_lower(p.second);
}
return result;
};
-EType ToType(const TString& type) {
- static TMap<EType, TString> enumNames{ToLowerType()};
+IPathGenerator::EType ToType(const TString& type) {
+ static TMap<IPathGenerator::EType, TString> enumNames{ToLowerType()};
for (const auto& name: enumNames) {
if (name.second == type) {
return name.first;
}
}
- ythrow yexception() << "Invalid projection scheme: type " << type << " must be one of " << to_lower(GetEnumAllNames<EType>());
-}
-
-void DoParseEnumType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection, ExplicitPartitioningConfig& config) {
- if (!projection.contains("values")) {
- ythrow yexception() << "Invalid projection scheme: values are required field for " << columnName << " " << type;
- }
- if (!projection.contains("type")) {
- ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type;
- }
- for (const auto& p: projection) {
- if (p.first == "type") {
- // already processed
- } else if (p.first == "values") {
- TString values = GetStringOrThrow(p.second, "The values must be a string");
- config.Rules.push_back(ColumnPartitioningConfig{.Type=ToType(type), .Name=columnName, .Values=StringSplitter(values).Split(',').ToList<TString>()});
- } else {
- ythrow yexception() << "Invalid projection scheme: enum element must include only type or values (as string) but got " << p.first;
- }
- }
-}
-
-void DoParseIntegerType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection, ExplicitPartitioningConfig& config) {
- if (!projection.contains("type")) {
- ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type;
- }
- if (!projection.contains("min")) {
- ythrow yexception() << "Invalid projection scheme: min are required field for " << columnName << " " << type;
- }
- if (!projection.contains("max")) {
- ythrow yexception() << "Invalid projection scheme: max are required field for " << columnName << " " << type;
- }
- ColumnPartitioningConfig columnConfig;
- columnConfig.Name = columnName;
- columnConfig.Type = ToType(type);
- for (const auto& p: projection) {
- if (p.first == "type") {
- // already processed
- } else if (p.first == "min") {
- columnConfig.Min = GetIntOrThrow(p.second, "The min must be a number");
- } else if (p.first == "max") {
- columnConfig.Max = GetIntOrThrow(p.second, "The max must be a number");
- } else if (p.first == "interval") {
- columnConfig.Interval = GetIntOrThrow(p.second, "The interval must be a number");
- } else if (p.first == "digits") {
- columnConfig.Digits = GetIntOrThrow(p.second, "The digits must be a number");
- } else {
- ythrow yexception() << "Invalid projection scheme: integer element must include only type, min, max, interval, digits but got " << p.first;
- }
- }
- config.Rules.push_back(columnConfig);
-}
-
-void DoParseDateType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection, ExplicitPartitioningConfig& config) {
- if (!projection.contains("type")) {
- ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type;
- }
- if (!projection.contains("min")) {
- ythrow yexception() << "Invalid projection scheme: min are required field for " << columnName << " " << type;
- }
- if (!projection.contains("max")) {
- ythrow yexception() << "Invalid projection scheme: max are required field for " << columnName << " " << type;
- }
- if (!projection.contains("format")) {
- ythrow yexception() << "Invalid projection scheme: format are required field for " << columnName << " " << type;
- }
- ColumnPartitioningConfig columnConfig;
- columnConfig.Name = columnName;
- columnConfig.Type = ToType(type);
- for (const auto& p: projection) {
- if (p.first == "type") {
- // already processed
- } else if (p.first == "min") {
- columnConfig.From = GetStringOrThrow(p.second, "The min must be a string");
- } else if (p.first == "max") {
- columnConfig.To = GetStringOrThrow(p.second, "The max must be a string");
- } else if (p.first == "format") {
- columnConfig.Format = GetStringOrThrow(p.second, "The format must be a string");
- } else if (p.first == "interval") {
- columnConfig.Interval = GetIntOrThrow(p.second, "The interval must be a number");
- } else if (p.first == "unit") {
- columnConfig.IntervalUnit = ToIntervalUnit(GetStringOrThrow(p.second, "The unit must be a string"));
- } else {
- ythrow yexception() << "Invalid projection scheme: date element must include only type, min, max, format, interval, unit but got " << p.first;
- }
- }
- config.Rules.push_back(columnConfig);
-}
-
-void DoParseColumn(const TString& columnName, const TMap<TString, NSc::TValue>& projection, ExplicitPartitioningConfig& config) {
- auto it = projection.find("type");
- if (it == projection.end()) {
- ythrow yexception() << "Invalid projection scheme: type element must exist for the column" << columnName;
- }
-
- const TString type = GetStringOrThrow(it->second, "The type must be a string for column " + columnName);
- if (type == "enum") {
- DoParseEnumType(columnName, type, projection, config);
- } else if (type == "integer") {
- DoParseIntegerType(columnName, type, projection, config);
- } else if (type == "date") {
- DoParseDateType(columnName, type, projection, config);
- } else {
- ythrow yexception() << "Invalid projection scheme: type element should be one of enum, date, integer but got " << type;
- }
-}
-
-void DoParseProjection(TMap<TString, TMap<TString, NSc::TValue>> projection, ExplicitPartitioningConfig& config) {
- for (const auto& p: projection) {
- DoParseColumn(p.first, p.second, config);
- }
-}
-
-void DoGenerate(const std::vector<ColumnPartitioningConfig>& rules,
- TString locationTemplate,
- std::vector<ColumnWithValue>& columnsWithValue,
- TMap<TString, std::vector<ColumnWithValue>>& result,
- int pathsLimit,
- TInstant now,
- size_t p = 0);
-
-void DoGenerateEnum(const std::vector<ColumnPartitioningConfig>& rules,
- TString locationTemplate,
- std::vector<ColumnWithValue>& columnsWithValue,
- TMap<TString, std::vector<ColumnWithValue>>& result,
- int pathsLimit,
- TInstant now,
- size_t p = 0) {
- const auto& rule = rules[p];
- for (const auto& value: rule.Values) {
- TString copyLocationTemplate = locationTemplate;
- ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", value);
- columnsWithValue.push_back(ColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::String, .Value=value});
- DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1);
- columnsWithValue.pop_back();
- }
+ ythrow yexception() << "Invalid projection scheme: type " << type << " must be one of " << to_lower(GetEnumAllNames<IPathGenerator::EType>());
}
std::string fmtInteger(int32_t width, int64_t value)
@@ -256,47 +122,26 @@ bool IsOverflow(uint64_t a, uint64_t b) {
return b > diff;
}
-void DoGenerateInteger(const std::vector<ColumnPartitioningConfig>& rules,
- TString locationTemplate,
- std::vector<ColumnWithValue>& columnsWithValue,
- TMap<TString, std::vector<ColumnWithValue>>& result,
- int pathsLimit,
- TInstant now,
- size_t p = 0) {
- const auto& rule = rules[p];
- for (int64_t i = rule.Min; i <= rule.Max; i += rule.Interval) {
- TString copyLocationTemplate = locationTemplate;
- ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", fmtInteger(rule.Digits, i));
- columnsWithValue.push_back(ColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::Int64, .Value=ToString(i)});
- DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1);
- columnsWithValue.pop_back();
-
- if (IsOverflow(i, rule.Interval)) {
- return; // correct overflow handling
- }
- }
-}
-
-TDuration FromUnit(int64_t interval, EIntervalUnit unit) {
+TDuration FromUnit(int64_t interval, IPathGenerator::EIntervalUnit unit) {
switch (unit) {
- case EIntervalUnit::MILLISECONDS:
+ case IPathGenerator::EIntervalUnit::MILLISECONDS:
return TDuration::MilliSeconds(interval);
- case EIntervalUnit::SECONDS:
+ case IPathGenerator::EIntervalUnit::SECONDS:
return TDuration::Seconds(interval);
- case EIntervalUnit::MINUTES:
+ case IPathGenerator::EIntervalUnit::MINUTES:
return TDuration::Minutes(interval);
- case EIntervalUnit::HOURS:
+ case IPathGenerator::EIntervalUnit::HOURS:
return TDuration::Hours(interval);
- case EIntervalUnit::DAYS:
+ case IPathGenerator::EIntervalUnit::DAYS:
return TDuration::Days(interval);
- case EIntervalUnit::WEEKS:
+ case IPathGenerator::EIntervalUnit::WEEKS:
return TDuration::Days(interval * 7);
- case EIntervalUnit::MONTHS:
+ case IPathGenerator::EIntervalUnit::MONTHS:
return TDuration::Seconds(interval * 2629746LL); /// Exactly 1/12 of a year.
- case EIntervalUnit::YEARS:
+ case IPathGenerator::EIntervalUnit::YEARS:
return TDuration::Seconds(interval * 31556952LL); /// The average length of a Gregorian year is equal to 365.2425 days
default:
- ythrow yexception() << "Only the " << GetEnumAllNames<EIntervalUnit>() << " units are supported but got " << unit;
+ ythrow yexception() << "Only the " << GetEnumAllNames<IPathGenerator::EIntervalUnit>() << " units are supported but got " << unit;
}
}
@@ -330,176 +175,415 @@ TInstant ParseDate(const TString& dateStr, const TInstant& now) {
return TInstant::ParseIso8601(dateStr);
}
-void DoGenerateDate(const std::vector<ColumnPartitioningConfig>& rules,
- TString locationTemplate,
- std::vector<ColumnWithValue>& columnsWithValue,
- TMap<TString, std::vector<ColumnWithValue>>& result,
- int pathsLimit,
- TInstant now,
- size_t p = 0) {
- const auto& rule = rules[p];
- const TInstant to = ParseDate(rule.To, now);
- const TDuration interval = FromUnit(rule.Interval, rule.IntervalUnit);
- for (TInstant current = ParseDate(rule.From, now); current <= to; current += interval) {
- TString copyLocationTemplate = locationTemplate;
- const TString time = current.FormatLocalTime(rule.Format.c_str());
- ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", time);
- columnsWithValue.push_back(ColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::String, .Value=time});
- DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1);
- columnsWithValue.pop_back();
+}
- if (IsOverflow(current.GetValue(), interval.GetValue())) {
- return; // correct overflow handling
- }
+struct TPathGenerator: public IPathGenerator {
+ TExplicitPartitioningConfig Config;
+ TRules Rules;
+ TMap<TString, TColumnPartitioningConfig> ColumnConfig; // column name -> config
+
+public:
+ TPathGenerator(const TString& projection, const std::vector<TString>& partitionedBy, size_t pathsLimit)
+ {
+ ParsePartitioningRules(projection, partitionedBy);
+ ExpandPartitioningRules(pathsLimit);
}
-}
-void DoGenerate(const std::vector<ColumnPartitioningConfig>& rules,
- TString locationTemplate,
- std::vector<ColumnWithValue>& columnsWithValue,
- TMap<TString, std::vector<ColumnWithValue>>& result,
- int pathsLimit,
- TInstant now,
- size_t p) {
- if (rules.size() == p) {
- if (result.size() == static_cast<uint>(pathsLimit)) {
- ythrow yexception() << "The limit on the number of paths has been reached: " << result.size() << " of " << pathsLimit;
- }
- result[locationTemplate] = columnsWithValue;
- return;
- }
-
- const auto& rule = rules[p];
- switch (rule.Type) {
- case EType::ENUM:
- DoGenerateEnum(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p);
- break;
- case EType::INTEGER:
- DoGenerateInteger(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p);
+ TString Format(const TStringBuf& columnName, const TStringBuf& dataValue) const override {
+ auto it = ColumnConfig.find(columnName);
+ if (it == ColumnConfig.end()) {
+ ythrow yexception() << columnName << " column not found in config";
+ }
+ const auto& config = it->second;
+ switch (config.Type) {
+ case IPathGenerator::EType::UNDEFINED: {
+ ythrow yexception() << columnName << " column has undefined type";
+ }
+ case IPathGenerator::EType::ENUM: {
+ if (Find(config.Values, dataValue) == config.Values.end()) {
+ ythrow yexception() << dataValue << " data not found as enum item";
+ }
+ return TString{dataValue};
+ }
+ case IPathGenerator::EType::INTEGER: {
+ int64_t value = 0;
+ if (!TryFromString(dataValue.Data(), dataValue.size(), value)) {
+ ythrow yexception() << dataValue << " data is not a int64";
+ }
+ return fmtInteger(config.Digits, value);
+ }
+ case IPathGenerator::EType::DATE: {
+ TInstant time = TInstant::Zero() /* ToInstant(dataValue) */;
+ return time.FormatLocalTime(config.Format.c_str());
+ }
break;
- case EType::DATE:
- DoGenerateDate(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p);
+ }
+ }
+
+ TString Parse(const TStringBuf& columnName, const TStringBuf& pathValue) const override {
+ auto it = ColumnConfig.find(columnName);
+ if (it == ColumnConfig.end()) {
+ ythrow yexception() << columnName << " column not found in config";
+ }
+ const auto& config = it->second;
+ switch (config.Type) {
+ case IPathGenerator::EType::UNDEFINED: {
+ ythrow yexception() << columnName << " column has undefined type";
+ }
+ case IPathGenerator::EType::ENUM: {
+ if (Find(config.Values, pathValue) == config.Values.end()) {
+ ythrow yexception() << pathValue << " value not found as enum item";
+ }
+ return TString{pathValue};
+ }
+ case IPathGenerator::EType::INTEGER: {
+ int64_t value = 0;
+ if (!TryFromString(pathValue.Data(), pathValue.size(), value)) {
+ ythrow yexception() << pathValue << " value is not a int64";
+ }
+ return std::to_string(value);
+ }
+ case IPathGenerator::EType::DATE: {
+ TInstant time = TInstant::Zero() /* ToInstant(dataValue) */;
+ return time.FormatLocalTime(config.Format.c_str());
+ }
break;
- default:
- ythrow yexception() << "Only the enum, integer, date types are supported but got " << to_lower(ToString(rule.Type));
+ }
}
-}
-void DoValidateTemplate(const ExplicitPartitioningConfig& config, const std::vector<TString>& partitionBy) {
- TSet<TString> vars;
- std::regex word_regex("\\$\\{(.*?)\\}");
- auto wordBegin = std::sregex_iterator(config.LocationTemplate.begin(), config.LocationTemplate.end(), word_regex);
- auto wordEnd = std::sregex_iterator();
- for (auto word = wordBegin; word != wordEnd; ++word) {
- vars.insert((word->begin() + 1)->str());
+ const TRules& GetRules() const override {
+ return Rules;
+ }
+
+ const TExplicitPartitioningConfig& GetConfig() const override {
+ return Config;
}
- TSet<TString> columns;
- for (const auto& columnName: partitionBy) {
- columns.insert(columnName);
- if (!vars.contains(columnName)) {
- ythrow yexception() << "Template " << config.LocationTemplate << " must include ${" << columnName << "}";
+private:
+ // Parse
+ void ParsePartitioningRules(const TString& config, const std::vector<TString>& partitionedBy) {
+ if (partitionedBy.empty()) {
+ ythrow yexception() << "Partition by must always be specified";
+ }
+
+ if (!config) {
+ for (const auto& columnName: partitionedBy) {
+ Config.LocationTemplate += "/" + columnName + "=${" + columnName + "}";
+ }
+ return;
+ }
+
+ NSc::TValue json = NSc::TValue::FromJsonThrow(config, NSc::TJsonOpts::JO_PARSER_DISALLOW_DUPLICATE_KEYS | NSc::TJsonOpts::JO_SORT_KEYS);
+ if (!json.IsDict()) {
+ ythrow yexception() << "Invalid projection scheme: top-level element must be a dictionary";
+ }
+
+ TMap<TString, TMap<TString, NSc::TValue>> projection;
+ for (const auto& p: json.GetDict()) {
+ const auto path = GetPath(p.first);
+ if (path.empty()) {
+ ythrow yexception() << "Invalid key: key should start with storage or projection, but got an empty value";
+ }
+ const TString& kind = path.front();
+ if (kind == "projection") {
+ AddProjection(p.first, p.second, path, projection);
+ } else if (kind == "storage") {
+ AddStorage(p.first, p.second, path);
+ } else {
+ ythrow yexception() << "Invalid key: key should start with storage or projection, but got " << p.first;
+ }
+ }
+ DoParseProjection(projection);
+ DoValidateTemplate(partitionedBy);
+ for (const auto& config: Config.Rules) {
+ ColumnConfig[config.Name] = config;
}
}
- for (const auto& rule: config.Rules) {
- columns.insert(rule.Name);
- if (!vars.contains(rule.Name)) {
- ythrow yexception() << "Template " << config.LocationTemplate << " must include ${" << rule.Name << "}";
+ void AddProjection(const TStringBuf& key, const NSc::TValue& json, const TVector<TString>& path, TMap<TString, TMap<TString, NSc::TValue>>& projection) {
+ if (path.size() != 3 && path.size() != 2) {
+ ythrow yexception() << "The key must be three-component or two-component, but received " << key;
+ }
+
+ if (path.size() == 2 && path[1] != "enabled") {
+ ythrow yexception() << "Unknown key " << key;
+ }
+
+ if (path.size() == 2) {
+ Config.Enabled = GetBoolOrThrow(json, "The projection.enabled must be a bool");
+ return;
}
+
+ projection[path[1]][path[2]] = json;
}
- for (const auto& var: vars) {
- if (!columns.contains(var)) {
- ythrow yexception() << "Colum named " << var << " does not exist for template " << config.LocationTemplate;
+ void AddStorage(const TStringBuf& key, const NSc::TValue& json, const TVector<TString>& path) {
+ if (path.size() != 3) {
+ ythrow yexception() << "The key must be three-component, but received " << key;
+ }
+
+ if (path[1] != "location" || path[2] != "template") {
+ ythrow yexception() << "The key must be storage.location.template, but received " << key;
}
+
+ Config.LocationTemplate = GetStringOrThrow(json, "The storage.location.template must be a string");
}
-}
-void AddProjection(const TStringBuf& key, const NSc::TValue& json, const TVector<TString>& path, TMap<TString, TMap<TString, NSc::TValue>>& projection, ExplicitPartitioningConfig& result) {
- if (path.size() != 3 && path.size() != 2) {
- ythrow yexception() << "The key must be three-component or two-component, but received " << key;
+
+ void DoParseEnumType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection) {
+ if (!projection.contains("values")) {
+ ythrow yexception() << "Invalid projection scheme: values are required field for " << columnName << " " << type;
+ }
+ if (!projection.contains("type")) {
+ ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type;
+ }
+ for (const auto& p: projection) {
+ if (p.first == "type") {
+ // already processed
+ } else if (p.first == "values") {
+ TString values = GetStringOrThrow(p.second, "The values must be a string");
+ Config.Rules.push_back(IPathGenerator::TColumnPartitioningConfig{.Type=ToType(type), .Name=columnName, .Values=StringSplitter(values).Split(',').ToList<TString>()});
+ } else {
+ ythrow yexception() << "Invalid projection scheme: enum element must include only type or values (as string) but got " << p.first;
+ }
+ }
}
- if (path.size() == 2 && path[1] != "enabled") {
- ythrow yexception() << "Unknown key " << key;
+ void DoParseIntegerType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection) {
+ if (!projection.contains("type")) {
+ ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type;
+ }
+ if (!projection.contains("min")) {
+ ythrow yexception() << "Invalid projection scheme: min are required field for " << columnName << " " << type;
+ }
+ if (!projection.contains("max")) {
+ ythrow yexception() << "Invalid projection scheme: max are required field for " << columnName << " " << type;
+ }
+ IPathGenerator::TColumnPartitioningConfig columnConfig;
+ columnConfig.Name = columnName;
+ columnConfig.Type = ToType(type);
+ for (const auto& p: projection) {
+ if (p.first == "type") {
+ // already processed
+ } else if (p.first == "min") {
+ columnConfig.Min = GetIntOrThrow(p.second, "The min must be a number");
+ } else if (p.first == "max") {
+ columnConfig.Max = GetIntOrThrow(p.second, "The max must be a number");
+ } else if (p.first == "interval") {
+ columnConfig.Interval = GetIntOrThrow(p.second, "The interval must be a number");
+ } else if (p.first == "digits") {
+ columnConfig.Digits = GetIntOrThrow(p.second, "The digits must be a number");
+ } else {
+ ythrow yexception() << "Invalid projection scheme: integer element must include only type, min, max, interval, digits but got " << p.first;
+ }
+ }
+ Config.Rules.push_back(columnConfig);
}
- if (path.size() == 2) {
- result.Enabled = GetBoolOrThrow(json, "The projection.enabled must be a bool");
- return;
+ void DoParseDateType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection) {
+ if (!projection.contains("type")) {
+ ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type;
+ }
+ if (!projection.contains("min")) {
+ ythrow yexception() << "Invalid projection scheme: min are required field for " << columnName << " " << type;
+ }
+ if (!projection.contains("max")) {
+ ythrow yexception() << "Invalid projection scheme: max are required field for " << columnName << " " << type;
+ }
+ if (!projection.contains("format")) {
+ ythrow yexception() << "Invalid projection scheme: format are required field for " << columnName << " " << type;
+ }
+ IPathGenerator::TColumnPartitioningConfig columnConfig;
+ columnConfig.Name = columnName;
+ columnConfig.Type = ToType(type);
+ for (const auto& p: projection) {
+ if (p.first == "type") {
+ // already processed
+ } else if (p.first == "min") {
+ columnConfig.From = GetStringOrThrow(p.second, "The min must be a string");
+ } else if (p.first == "max") {
+ columnConfig.To = GetStringOrThrow(p.second, "The max must be a string");
+ } else if (p.first == "format") {
+ columnConfig.Format = GetStringOrThrow(p.second, "The format must be a string");
+ } else if (p.first == "interval") {
+ columnConfig.Interval = GetIntOrThrow(p.second, "The interval must be a number");
+ } else if (p.first == "unit") {
+ columnConfig.IntervalUnit = ToIntervalUnit(GetStringOrThrow(p.second, "The unit must be a string"));
+ } else {
+ ythrow yexception() << "Invalid projection scheme: date element must include only type, min, max, format, interval, unit but got " << p.first;
+ }
+ }
+ Config.Rules.push_back(columnConfig);
}
- projection[path[1]][path[2]] = json;
-}
+ void DoParseColumn(const TString& columnName, const TMap<TString, NSc::TValue>& projection) {
+ auto it = projection.find("type");
+ if (it == projection.end()) {
+ ythrow yexception() << "Invalid projection scheme: type element must exist for the column" << columnName;
+ }
-void AddStorage(const TStringBuf& key, const NSc::TValue& json, const TVector<TString>& path, ExplicitPartitioningConfig& result) {
- if (path.size() != 3) {
- ythrow yexception() << "The key must be three-component, but received " << key;
+ const TString type = GetStringOrThrow(it->second, "The type must be a string for column " + columnName);
+ if (type == "enum") {
+ DoParseEnumType(columnName, type, projection);
+ } else if (type == "integer") {
+ DoParseIntegerType(columnName, type, projection);
+ } else if (type == "date") {
+ DoParseDateType(columnName, type, projection);
+ } else {
+ ythrow yexception() << "Invalid projection scheme: type element should be one of enum, date, integer but got " << type;
+ }
}
- if (path[1] != "location" || path[2] != "template") {
- ythrow yexception() << "The key must be storage.location.template, but received " << key;
+ void DoParseProjection(TMap<TString, TMap<TString, NSc::TValue>> projection) {
+ for (const auto& p: projection) {
+ DoParseColumn(p.first, p.second);
+ }
}
- result.LocationTemplate = GetStringOrThrow(json, "The storage.location.template must be a string");
-}
+ void DoValidateTemplate(const std::vector<TString>& partitionedBy) {
+ TSet<TString> vars;
+ std::regex word_regex("\\$\\{(.*?)\\}");
+ auto wordBegin = std::sregex_iterator(Config.LocationTemplate.begin(), Config.LocationTemplate.end(), word_regex);
+ auto wordEnd = std::sregex_iterator();
+ for (auto word = wordBegin; word != wordEnd; ++word) {
+ vars.insert((word->begin() + 1)->str());
+ }
-}
+ TSet<TString> columns;
+ for (const auto& columnName: partitionedBy) {
+ columns.insert(columnName);
+ if (!vars.contains(columnName)) {
+ ythrow yexception() << "Template " << Config.LocationTemplate << " must include ${" << columnName << "}";
+ }
+ }
+
+ for (const auto& rule: Config.Rules) {
+ columns.insert(rule.Name);
+ if (!vars.contains(rule.Name)) {
+ ythrow yexception() << "Template " << Config.LocationTemplate << " must include ${" << rule.Name << "}";
+ }
+ }
-ExplicitPartitioningConfig ParsePartitioningRules(const TString& config, const std::vector<TString>& partitionBy) {
- ExplicitPartitioningConfig result;
- if (partitionBy.empty()) {
- ythrow yexception() << "Partition by must always be specified";
+ for (const auto& var: vars) {
+ if (!columns.contains(var)) {
+ ythrow yexception() << "Colum named " << var << " does not exist for template " << Config.LocationTemplate;
+ }
+ }
}
- if (!config) {
- for (const auto& columnName: partitionBy) {
- result.LocationTemplate += "/" + columnName + "=${" + columnName + "}";
+ // Generate
+ void ExpandPartitioningRules(size_t pathsLimit) {
+ if (!Config.Enabled) {
+ Rules = Config.LocationTemplate
+ ? TRules{TExpandedPartitioningRule{.Path=Config.LocationTemplate}}
+ : TRules{};
+ return;
+ }
+ auto now = TInstant::Now();
+ TMap<TString, std::vector<TColumnWithValue>> result;
+ std::vector<TColumnWithValue> columns;
+ DoGenerate(Config.Rules, Config.LocationTemplate, columns, result, pathsLimit, now);
+ for (const auto& p: result) {
+ Rules.push_back(TExpandedPartitioningRule{.Path=p.first, .ColumnValues=p.second});
}
- return result;
}
- NSc::TValue json = NSc::TValue::FromJsonThrow(config, NSc::TJsonOpts::JO_PARSER_DISALLOW_DUPLICATE_KEYS | NSc::TJsonOpts::JO_SORT_KEYS);
- if (!json.IsDict()) {
- ythrow yexception() << "Invalid projection scheme: top-level element must be a dictionary";
+
+ void DoGenerateDate(const std::vector<TColumnPartitioningConfig>& rules,
+ TString locationTemplate,
+ std::vector<TColumnWithValue>& columnsWithValue,
+ TMap<TString, std::vector<TColumnWithValue>>& result,
+ size_t pathsLimit,
+ TInstant now,
+ size_t p = 0) {
+ const auto& rule = rules[p];
+ const TInstant to = ParseDate(rule.To, now);
+ const TDuration interval = FromUnit(rule.Interval, rule.IntervalUnit);
+ for (TInstant current = ParseDate(rule.From, now); current <= to; current += interval) {
+ TString copyLocationTemplate = locationTemplate;
+ const TString time = current.FormatLocalTime(rule.Format.c_str());
+ ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", time);
+ columnsWithValue.push_back(TColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::String, .Value=time});
+ DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1);
+ columnsWithValue.pop_back();
+
+ if (IsOverflow(current.GetValue(), interval.GetValue())) {
+ return; // correct overflow handling
+ }
+ }
}
- TMap<TString, TMap<TString, NSc::TValue>> projection;
+ void DoGenerate(const std::vector<TColumnPartitioningConfig>& rules,
+ TString locationTemplate,
+ std::vector<TColumnWithValue>& columnsWithValue,
+ TMap<TString, std::vector<TColumnWithValue>>& result,
+ size_t pathsLimit,
+ TInstant now,
+ size_t p = 0) {
+ if (rules.size() == p) {
+ if (result.size() == pathsLimit) {
+ ythrow yexception() << "The limit on the number of paths has been reached: " << result.size() << " of " << pathsLimit;
+ }
+ result[locationTemplate] = columnsWithValue;
+ return;
+ }
- for (const auto& p: json.GetDict()) {
- const auto path = GetPath(p.first);
- if (path.empty()) {
- ythrow yexception() << "Invalid key: key should start with storage or projection, but got an empty value";
+ const auto& rule = rules[p];
+ switch (rule.Type) {
+ case IPathGenerator::EType::ENUM:
+ DoGenerateEnum(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p);
+ break;
+ case IPathGenerator::EType::INTEGER:
+ DoGenerateInteger(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p);
+ break;
+ case IPathGenerator::EType::DATE:
+ DoGenerateDate(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p);
+ break;
+ default:
+ ythrow yexception() << "Only the enum, integer, date types are supported but got " << to_lower(ToString(rule.Type));
}
- const TString& kind = path.front();
- if (kind == "projection") {
- AddProjection(p.first, p.second, path, projection, result);
- } else if (kind == "storage") {
- AddStorage(p.first, p.second, path, result);
- } else {
- ythrow yexception() << "Invalid key: key should start with storage or projection, but got " << p.first;
+ }
+
+ void DoGenerateEnum(const std::vector<IPathGenerator::TColumnPartitioningConfig>& rules,
+ TString locationTemplate,
+ std::vector<IPathGenerator::TColumnWithValue>& columnsWithValue,
+ TMap<TString, std::vector<IPathGenerator::TColumnWithValue>>& result,
+ size_t pathsLimit,
+ TInstant now,
+ size_t p = 0) {
+ const auto& rule = rules[p];
+ for (const auto& value: rule.Values) {
+ TString copyLocationTemplate = locationTemplate;
+ ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", value);
+ columnsWithValue.push_back(IPathGenerator::TColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::String, .Value=value});
+ DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1);
+ columnsWithValue.pop_back();
}
}
- DoParseProjection(projection, result);
- DoValidateTemplate(result, partitionBy);
- return result;
-}
-std::vector<ExpandedPartitioningRule> ExpandPartitioningRules(const ExplicitPartitioningConfig& config, int pathsLimit) {
- if (!config.Enabled) {
- return config.LocationTemplate
- ? std::vector{ExpandedPartitioningRule{.Path=config.LocationTemplate}}
- : std::vector<ExpandedPartitioningRule>{};
- }
- auto now = TInstant::Now();
- TMap<TString, std::vector<ColumnWithValue>> result;
- std::vector<ColumnWithValue> columns;
- DoGenerate(config.Rules, config.LocationTemplate, columns, result, pathsLimit, now);
- std::vector<ExpandedPartitioningRule> rules;
- for (const auto& p: result) {
- rules.push_back(ExpandedPartitioningRule{.Path=p.first, .ColumnValues=p.second});
- }
- return rules;
+ void DoGenerateInteger(const std::vector<IPathGenerator::TColumnPartitioningConfig>& rules,
+ TString locationTemplate,
+ std::vector<IPathGenerator::TColumnWithValue>& columnsWithValue,
+ TMap<TString, std::vector<IPathGenerator::TColumnWithValue>>& result,
+ size_t pathsLimit,
+ TInstant now,
+ size_t p = 0) {
+ const auto& rule = rules[p];
+ for (int64_t i = rule.Min; i <= rule.Max; i += rule.Interval) {
+ TString copyLocationTemplate = locationTemplate;
+ ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", fmtInteger(rule.Digits, i));
+ columnsWithValue.push_back(IPathGenerator::TColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::Int64, .Value=ToString(i)});
+ DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1);
+ columnsWithValue.pop_back();
+
+ if (IsOverflow(i, rule.Interval)) {
+ return; // correct overflow handling
+ }
+ }
+ }
+};
+
+TPathGeneratorPtr CreatePathGenerator(const TString& projection, const std::vector<TString>& partitionedBy, size_t pathsLimit) {
+ return std::make_shared<TPathGenerator>(projection, partitionedBy, pathsLimit);
}
}
diff --git a/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.h b/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.h
index 1c0cc3b298b..449d4c7d19e 100644
--- a/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.h
+++ b/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.h
@@ -7,58 +7,68 @@
namespace NYql::NPathGenerator {
-enum class EType {
- UNDEFINED = 0,
- ENUM = 1,
- INTEGER = 2,
- DATE = 3
-};
+struct IPathGenerator {
+ enum class EType {
+ UNDEFINED = 0,
+ ENUM = 1,
+ INTEGER = 2,
+ DATE = 3
+ };
-enum class EIntervalUnit {
- UNDEFINED = 0,
- MILLISECONDS = 1,
- SECONDS = 2,
- MINUTES = 3,
- HOURS = 4,
- DAYS = 5,
- WEEKS = 6,
- MONTHS = 7,
- YEARS = 8
-};
+ enum class EIntervalUnit {
+ UNDEFINED = 0,
+ MILLISECONDS = 1,
+ SECONDS = 2,
+ MINUTES = 3,
+ HOURS = 4,
+ DAYS = 5,
+ WEEKS = 6,
+ MONTHS = 7,
+ YEARS = 8
+ };
-struct ColumnPartitioningConfig {
- EType Type = EType::UNDEFINED;
- TString Name;
- TString Format;
- TString From;
- TString To;
- int64_t Min = 0;
- int64_t Max = 0;
- EIntervalUnit IntervalUnit = EIntervalUnit::DAYS;
- int64_t Interval = 1;
- int32_t Digits = 0;
- std::vector<TString> Values;
-};
+ struct TColumnPartitioningConfig {
+ EType Type = EType::UNDEFINED;
+ TString Name;
+ TString Format;
+ TString From;
+ TString To;
+ int64_t Min = 0;
+ int64_t Max = 0;
+ EIntervalUnit IntervalUnit = EIntervalUnit::DAYS;
+ int64_t Interval = 1;
+ int32_t Digits = 0;
+ std::vector<TString> Values;
+ };
-struct ExplicitPartitioningConfig {
- bool Enabled = false;
- TString LocationTemplate;
- std::vector<ColumnPartitioningConfig> Rules;
-};
+ struct TExplicitPartitioningConfig {
+ bool Enabled = false;
+ TString LocationTemplate;
+ std::vector<TColumnPartitioningConfig> Rules;
+ };
-ExplicitPartitioningConfig ParsePartitioningRules(const TString& config, const std::vector<TString>& partitionBy);
+ struct TColumnWithValue {
+ TString Name;
+ NUdf::EDataSlot Type;
+ TString Value;
+ };
-struct ColumnWithValue {
- TString Name;
- NUdf::EDataSlot Type;
- TString Value;
-};
+ struct TExpandedPartitioningRule {
+ TString Path;
+ std::vector<TColumnWithValue> ColumnValues;
+ };
+
+ using TRules = std::vector<TExpandedPartitioningRule>;
-struct ExpandedPartitioningRule {
- TString Path;
- std::vector<ColumnWithValue> ColumnValues;
+ virtual TString Format(const TStringBuf& columnName, const TStringBuf& dataValue) const = 0;
+ virtual TString Parse(const TStringBuf& columnName, const TStringBuf& pathValue) const = 0;
+ virtual const TRules& GetRules() const = 0;
+ virtual const TExplicitPartitioningConfig& GetConfig() const = 0;
+ virtual ~IPathGenerator() = default;
};
-std::vector<ExpandedPartitioningRule> ExpandPartitioningRules(const ExplicitPartitioningConfig& config, int pathsLimit);
+using TPathGeneratorPtr = std::shared_ptr<const IPathGenerator>;
+
+TPathGeneratorPtr CreatePathGenerator(const TString& projection, const std::vector<TString>& partitionedBy, size_t pathsLimit = 1000);
}