diff options
author | hcpp <[email protected]> | 2022-07-28 23:11:39 +0300 |
---|---|---|
committer | hcpp <[email protected]> | 2022-07-28 23:11:39 +0300 |
commit | 6680613e3d1f09a604866924b2a21855ffbeb249 (patch) | |
tree | 759589179c8e03c932397c8b4b14661405759cdf | |
parent | 75bd1e6d8a8845828be6d151670104c36ba572d1 (diff) |
path generator improvements
4 files changed, 477 insertions, 383 deletions
diff --git a/ydb/library/yql/providers/s3/path_generator/ut/yql_generate_partitioning_rules_ut.cpp b/ydb/library/yql/providers/s3/path_generator/ut/yql_generate_partitioning_rules_ut.cpp index 58cfa728eb8..213b13c86a5 100644 --- a/ydb/library/yql/providers/s3/path_generator/ut/yql_generate_partitioning_rules_ut.cpp +++ b/ydb/library/yql/providers/s3/path_generator/ut/yql_generate_partitioning_rules_ut.cpp @@ -6,7 +6,7 @@ namespace NYql::NPathGenerator { Y_UNIT_TEST_SUITE(TGenerateTests) { Y_UNIT_TEST(SuccessGenerate) { - auto result = ParsePartitioningRules(R"( + auto generator = CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -17,7 +17,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { } )", {"city", "code"}); - auto rules = ExpandPartitioningRules(result, 100); + auto rules = generator->GetRules(); UNIT_ASSERT_VALUES_EQUAL(rules.size(), 4); UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/MSK/0/"); UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 2); @@ -30,7 +30,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { } Y_UNIT_TEST(SuccessGenerateInteger) { - auto result = ParsePartitioningRules(R"( + auto generator = CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -43,7 +43,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { } )", {"city", "code"}); - auto rules = ExpandPartitioningRules(result, 100); + auto rules = generator->GetRules(); UNIT_ASSERT_VALUES_EQUAL(rules.size(), 4); UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/MSK/0/"); UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 2); @@ -56,7 +56,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { } Y_UNIT_TEST(SuccessGenerateIntegerWithDigits) { - auto result = ParsePartitioningRules(R"( + auto generator = CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -70,7 +70,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { } )", {"city", "code"}); - auto rules = ExpandPartitioningRules(result, 100); + auto rules = generator->GetRules(); UNIT_ASSERT_VALUES_EQUAL(rules.size(), 4); UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/MSK/00000/"); UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 2); @@ -83,7 +83,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { } Y_UNIT_TEST(SuccessGenerateIntegerWithDigitsOverflow) { - auto result = ParsePartitioningRules(R"( + UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -95,13 +95,11 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { "projection.code.digits" : 10000, "storage.location.template" : "/${city}/${code}/" } - )", {"city", "code"}); - - UNIT_ASSERT_EXCEPTION_CONTAINS(ExpandPartitioningRules(result, 100), yexception, "Digits cannot exceed 64, but received 10000"); + )", {"city", "code"}), yexception, "Digits cannot exceed 64, but received 10000"); } Y_UNIT_TEST(CheckLimit) { - auto result = ParsePartitioningRules(R"( + UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -110,21 +108,19 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { "projection.code.values" : "0,1", "storage.location.template" : "/${city}/${code}/" } - )", {"city", "code"}); - - UNIT_ASSERT_EXCEPTION_CONTAINS(ExpandPartitioningRules(result, 2), yexception, "The limit on the number of paths has been reached: 2 of 2"); + )", {"city", "code"}, 2), yexception, "The limit on the number of paths has been reached: 2 of 2"); } Y_UNIT_TEST(CheckHiveFormat) { - auto result = ParsePartitioningRules({}, {"city", "code", "device_id"}); - auto rules = ExpandPartitioningRules(result, 1); + auto generator = CreatePathGenerator({}, {"city", "code", "device_id"}, 1); + auto rules = generator->GetRules(); UNIT_ASSERT_VALUES_EQUAL(rules.size(), 1); UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 0); UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/city=${city}/code=${code}/device_id=${device_id}"); } Y_UNIT_TEST(SuccessGenerateDateWith) { - auto result = ParsePartitioningRules(R"( + auto generator = CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -139,7 +135,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { } )", {"city", "code"}); - auto rules = ExpandPartitioningRules(result, 100); + auto rules = generator->GetRules(); UNIT_ASSERT_VALUES_EQUAL(rules.size(), 4); UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/MSK/2010-01-01/"); UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 2); @@ -152,7 +148,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { } Y_UNIT_TEST(SuccessGenerateDateWithUnixtime) { - auto result = ParsePartitioningRules(R"( + auto generator = CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -167,7 +163,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { } )", {"city", "code"}); - auto rules = ExpandPartitioningRules(result, 100); + auto rules = generator->GetRules(); UNIT_ASSERT_VALUES_EQUAL(rules.size(), 2); UNIT_ASSERT_VALUES_EQUAL(rules[0].Path, "/MSK/1970-01-03/"); UNIT_ASSERT_VALUES_EQUAL(rules[0].ColumnValues.size(), 2); @@ -176,7 +172,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { } Y_UNIT_TEST(SuccessGenerateDateWithNow) { - auto result = ParsePartitioningRules(R"( + auto generator = CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -191,7 +187,7 @@ Y_UNIT_TEST_SUITE(TGenerateTests) { } )", {"city", "code"}); auto nowBefore = TInstant::Now(); - auto rules = ExpandPartitioningRules(result, 100); + auto rules = generator->GetRules(); auto nowAfter = TInstant::Now(); UNIT_ASSERT_VALUES_EQUAL(rules.size(), 2); UNIT_ASSERT_GE(rules[0].Path, "/MSK/" + (nowBefore + TDuration::Days(1)).FormatLocalTime("%F") + "/"); diff --git a/ydb/library/yql/providers/s3/path_generator/ut/yql_parse_partitioning_rules_ut.cpp b/ydb/library/yql/providers/s3/path_generator/ut/yql_parse_partitioning_rules_ut.cpp index 2cfbb77e010..bea34672ffa 100644 --- a/ydb/library/yql/providers/s3/path_generator/ut/yql_parse_partitioning_rules_ut.cpp +++ b/ydb/library/yql/providers/s3/path_generator/ut/yql_parse_partitioning_rules_ut.cpp @@ -6,7 +6,7 @@ namespace NYql::NPathGenerator { Y_UNIT_TEST_SUITE(TParseTests) { Y_UNIT_TEST(SuccessParseEnum) { - auto result = ParsePartitioningRules(R"( + auto generator = CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -14,11 +14,12 @@ Y_UNIT_TEST_SUITE(TParseTests) { "storage.location.template" : "/${city}/" } )", {"city"}); + const auto& result = generator->GetConfig(); UNIT_ASSERT_VALUES_EQUAL(result.Enabled, true); UNIT_ASSERT_VALUES_EQUAL(result.LocationTemplate, "/${city}/"); UNIT_ASSERT_VALUES_EQUAL(result.Rules.size(), 1); const auto& rule = result.Rules.front(); - UNIT_ASSERT_VALUES_EQUAL(rule.Type, EType::ENUM); + UNIT_ASSERT_VALUES_EQUAL(rule.Type, IPathGenerator::EType::ENUM); UNIT_ASSERT_VALUES_EQUAL(rule.Name, "city"); UNIT_ASSERT_VALUES_EQUAL(rule.Values.size(), 2); UNIT_ASSERT_VALUES_EQUAL(rule.Values.front(), "MSK"); @@ -26,7 +27,7 @@ Y_UNIT_TEST_SUITE(TParseTests) { } Y_UNIT_TEST(ParseTwoEnabled) { - UNIT_ASSERT_NO_EXCEPTION(ParsePartitioningRules(R"( + UNIT_ASSERT_NO_EXCEPTION(CreatePathGenerator(R"( { "projection.enabled" : true, "projection.enabled.type" : "enum", @@ -37,7 +38,7 @@ Y_UNIT_TEST_SUITE(TParseTests) { } Y_UNIT_TEST(InvalidPartitioningTemplate) { - UNIT_ASSERT_EXCEPTION_CONTAINS(ParsePartitioningRules(R"( + UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -48,7 +49,7 @@ Y_UNIT_TEST_SUITE(TParseTests) { } Y_UNIT_TEST(InvalidProjectionTemplate) { - UNIT_ASSERT_EXCEPTION_CONTAINS(ParsePartitioningRules(R"( + UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -59,7 +60,7 @@ Y_UNIT_TEST_SUITE(TParseTests) { } Y_UNIT_TEST(InvalidTemplate) { - UNIT_ASSERT_EXCEPTION_CONTAINS(ParsePartitioningRules(R"( + UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -70,7 +71,7 @@ Y_UNIT_TEST_SUITE(TParseTests) { } Y_UNIT_TEST(StartSubstition) { - auto result = ParsePartitioningRules(R"( + auto generator = CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -78,11 +79,12 @@ Y_UNIT_TEST_SUITE(TParseTests) { "storage.location.template" : "/${city}/${device_id}/" } )", {"city", "device_id"}); + const auto& result = generator->GetConfig(); UNIT_ASSERT_VALUES_EQUAL(result.Enabled, true); UNIT_ASSERT_VALUES_EQUAL(result.LocationTemplate, "/${city}/${device_id}/"); UNIT_ASSERT_VALUES_EQUAL(result.Rules.size(), 1); const auto& rule = result.Rules.front(); - UNIT_ASSERT_VALUES_EQUAL(rule.Type, EType::ENUM); + UNIT_ASSERT_VALUES_EQUAL(rule.Type, IPathGenerator::EType::ENUM); UNIT_ASSERT_VALUES_EQUAL(rule.Name, "city"); UNIT_ASSERT_VALUES_EQUAL(rule.Values.size(), 2); UNIT_ASSERT_VALUES_EQUAL(rule.Values.front(), "MSK"); @@ -90,7 +92,7 @@ Y_UNIT_TEST_SUITE(TParseTests) { } Y_UNIT_TEST(InvalidValuesType) { - UNIT_ASSERT_EXCEPTION_CONTAINS(ParsePartitioningRules(R"( + UNIT_ASSERT_EXCEPTION_CONTAINS(CreatePathGenerator(R"( { "projection.enabled" : true, "projection.city.type" : "enum", @@ -101,7 +103,7 @@ Y_UNIT_TEST_SUITE(TParseTests) { } Y_UNIT_TEST(SuccessParseInteger) { - auto result = ParsePartitioningRules(R"( + auto generator = CreatePathGenerator(R"( { "projection.enabled" : true, "projection.id.type" : "integer", @@ -110,18 +112,19 @@ Y_UNIT_TEST_SUITE(TParseTests) { "storage.location.template" : "/${id}/" } )", {"id"}); + const auto& result = generator->GetConfig(); UNIT_ASSERT_VALUES_EQUAL(result.Enabled, true); UNIT_ASSERT_VALUES_EQUAL(result.LocationTemplate, "/${id}/"); UNIT_ASSERT_VALUES_EQUAL(result.Rules.size(), 1); const auto& rule = result.Rules.front(); - UNIT_ASSERT_VALUES_EQUAL(rule.Type, EType::INTEGER); + UNIT_ASSERT_VALUES_EQUAL(rule.Type, IPathGenerator::EType::INTEGER); UNIT_ASSERT_VALUES_EQUAL(rule.Name, "id"); UNIT_ASSERT_VALUES_EQUAL(rule.Min, 5); UNIT_ASSERT_VALUES_EQUAL(rule.Max, 6); } Y_UNIT_TEST(SuccessDisableProjection) { - auto result = ParsePartitioningRules(R"( + auto generator = CreatePathGenerator(R"( { "projection.enabled" : false, "projection.id.type" : "integer", @@ -130,11 +133,12 @@ Y_UNIT_TEST_SUITE(TParseTests) { "storage.location.template" : "/${id}/" } )", {"id"}); + const auto& result = generator->GetConfig(); UNIT_ASSERT_VALUES_EQUAL(result.Enabled, false); UNIT_ASSERT_VALUES_EQUAL(result.LocationTemplate, "/${id}/"); UNIT_ASSERT_VALUES_EQUAL(result.Rules.size(), 1); const auto& rule = result.Rules.front(); - UNIT_ASSERT_VALUES_EQUAL(rule.Type, EType::INTEGER); + UNIT_ASSERT_VALUES_EQUAL(rule.Type, IPathGenerator::EType::INTEGER); UNIT_ASSERT_VALUES_EQUAL(rule.Name, "id"); UNIT_ASSERT_VALUES_EQUAL(rule.Min, 5); UNIT_ASSERT_VALUES_EQUAL(rule.Max, 6); diff --git a/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.cpp b/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.cpp index 60877b38fdb..c5bec51f401 100644 --- a/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.cpp +++ b/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.cpp @@ -4,6 +4,7 @@ #include <regex> #include <util/datetime/base.h> #include <util/generic/serialized_enum.h> +#include <util/string/cast.h> #include <util/string/split.h> namespace NYql::NPathGenerator { @@ -65,167 +66,32 @@ i64 GetBoolOrThrow(const NSc::TValue& json, const TString& error) { return json.IsTrue(); } -EIntervalUnit ToIntervalUnit(const TString& unit) { - const auto names = GetEnumNames<EIntervalUnit>(); +IPathGenerator::EIntervalUnit ToIntervalUnit(const TString& unit) { + const auto names = GetEnumNames<IPathGenerator::EIntervalUnit>(); for (const auto& name: names) { if (name.second == unit) { return name.first; } } - ythrow yexception() << "Invalid projection scheme: unit " << unit << " must be one of " << GetEnumAllNames<EIntervalUnit>(); + ythrow yexception() << "Invalid projection scheme: unit " << unit << " must be one of " << GetEnumAllNames<IPathGenerator::EIntervalUnit>(); } -TMap<EType, TString> ToLowerType() { - TMap<EType, TString> result; - for (const auto& p: GetEnumNames<EType>()) { +TMap<IPathGenerator::EType, TString> ToLowerType() { + TMap<IPathGenerator::EType, TString> result; + for (const auto& p: GetEnumNames<IPathGenerator::EType>()) { result[p.first] = to_lower(p.second); } return result; }; -EType ToType(const TString& type) { - static TMap<EType, TString> enumNames{ToLowerType()}; +IPathGenerator::EType ToType(const TString& type) { + static TMap<IPathGenerator::EType, TString> enumNames{ToLowerType()}; for (const auto& name: enumNames) { if (name.second == type) { return name.first; } } - ythrow yexception() << "Invalid projection scheme: type " << type << " must be one of " << to_lower(GetEnumAllNames<EType>()); -} - -void DoParseEnumType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection, ExplicitPartitioningConfig& config) { - if (!projection.contains("values")) { - ythrow yexception() << "Invalid projection scheme: values are required field for " << columnName << " " << type; - } - if (!projection.contains("type")) { - ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type; - } - for (const auto& p: projection) { - if (p.first == "type") { - // already processed - } else if (p.first == "values") { - TString values = GetStringOrThrow(p.second, "The values must be a string"); - config.Rules.push_back(ColumnPartitioningConfig{.Type=ToType(type), .Name=columnName, .Values=StringSplitter(values).Split(',').ToList<TString>()}); - } else { - ythrow yexception() << "Invalid projection scheme: enum element must include only type or values (as string) but got " << p.first; - } - } -} - -void DoParseIntegerType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection, ExplicitPartitioningConfig& config) { - if (!projection.contains("type")) { - ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type; - } - if (!projection.contains("min")) { - ythrow yexception() << "Invalid projection scheme: min are required field for " << columnName << " " << type; - } - if (!projection.contains("max")) { - ythrow yexception() << "Invalid projection scheme: max are required field for " << columnName << " " << type; - } - ColumnPartitioningConfig columnConfig; - columnConfig.Name = columnName; - columnConfig.Type = ToType(type); - for (const auto& p: projection) { - if (p.first == "type") { - // already processed - } else if (p.first == "min") { - columnConfig.Min = GetIntOrThrow(p.second, "The min must be a number"); - } else if (p.first == "max") { - columnConfig.Max = GetIntOrThrow(p.second, "The max must be a number"); - } else if (p.first == "interval") { - columnConfig.Interval = GetIntOrThrow(p.second, "The interval must be a number"); - } else if (p.first == "digits") { - columnConfig.Digits = GetIntOrThrow(p.second, "The digits must be a number"); - } else { - ythrow yexception() << "Invalid projection scheme: integer element must include only type, min, max, interval, digits but got " << p.first; - } - } - config.Rules.push_back(columnConfig); -} - -void DoParseDateType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection, ExplicitPartitioningConfig& config) { - if (!projection.contains("type")) { - ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type; - } - if (!projection.contains("min")) { - ythrow yexception() << "Invalid projection scheme: min are required field for " << columnName << " " << type; - } - if (!projection.contains("max")) { - ythrow yexception() << "Invalid projection scheme: max are required field for " << columnName << " " << type; - } - if (!projection.contains("format")) { - ythrow yexception() << "Invalid projection scheme: format are required field for " << columnName << " " << type; - } - ColumnPartitioningConfig columnConfig; - columnConfig.Name = columnName; - columnConfig.Type = ToType(type); - for (const auto& p: projection) { - if (p.first == "type") { - // already processed - } else if (p.first == "min") { - columnConfig.From = GetStringOrThrow(p.second, "The min must be a string"); - } else if (p.first == "max") { - columnConfig.To = GetStringOrThrow(p.second, "The max must be a string"); - } else if (p.first == "format") { - columnConfig.Format = GetStringOrThrow(p.second, "The format must be a string"); - } else if (p.first == "interval") { - columnConfig.Interval = GetIntOrThrow(p.second, "The interval must be a number"); - } else if (p.first == "unit") { - columnConfig.IntervalUnit = ToIntervalUnit(GetStringOrThrow(p.second, "The unit must be a string")); - } else { - ythrow yexception() << "Invalid projection scheme: date element must include only type, min, max, format, interval, unit but got " << p.first; - } - } - config.Rules.push_back(columnConfig); -} - -void DoParseColumn(const TString& columnName, const TMap<TString, NSc::TValue>& projection, ExplicitPartitioningConfig& config) { - auto it = projection.find("type"); - if (it == projection.end()) { - ythrow yexception() << "Invalid projection scheme: type element must exist for the column" << columnName; - } - - const TString type = GetStringOrThrow(it->second, "The type must be a string for column " + columnName); - if (type == "enum") { - DoParseEnumType(columnName, type, projection, config); - } else if (type == "integer") { - DoParseIntegerType(columnName, type, projection, config); - } else if (type == "date") { - DoParseDateType(columnName, type, projection, config); - } else { - ythrow yexception() << "Invalid projection scheme: type element should be one of enum, date, integer but got " << type; - } -} - -void DoParseProjection(TMap<TString, TMap<TString, NSc::TValue>> projection, ExplicitPartitioningConfig& config) { - for (const auto& p: projection) { - DoParseColumn(p.first, p.second, config); - } -} - -void DoGenerate(const std::vector<ColumnPartitioningConfig>& rules, - TString locationTemplate, - std::vector<ColumnWithValue>& columnsWithValue, - TMap<TString, std::vector<ColumnWithValue>>& result, - int pathsLimit, - TInstant now, - size_t p = 0); - -void DoGenerateEnum(const std::vector<ColumnPartitioningConfig>& rules, - TString locationTemplate, - std::vector<ColumnWithValue>& columnsWithValue, - TMap<TString, std::vector<ColumnWithValue>>& result, - int pathsLimit, - TInstant now, - size_t p = 0) { - const auto& rule = rules[p]; - for (const auto& value: rule.Values) { - TString copyLocationTemplate = locationTemplate; - ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", value); - columnsWithValue.push_back(ColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::String, .Value=value}); - DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1); - columnsWithValue.pop_back(); - } + ythrow yexception() << "Invalid projection scheme: type " << type << " must be one of " << to_lower(GetEnumAllNames<IPathGenerator::EType>()); } std::string fmtInteger(int32_t width, int64_t value) @@ -256,47 +122,26 @@ bool IsOverflow(uint64_t a, uint64_t b) { return b > diff; } -void DoGenerateInteger(const std::vector<ColumnPartitioningConfig>& rules, - TString locationTemplate, - std::vector<ColumnWithValue>& columnsWithValue, - TMap<TString, std::vector<ColumnWithValue>>& result, - int pathsLimit, - TInstant now, - size_t p = 0) { - const auto& rule = rules[p]; - for (int64_t i = rule.Min; i <= rule.Max; i += rule.Interval) { - TString copyLocationTemplate = locationTemplate; - ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", fmtInteger(rule.Digits, i)); - columnsWithValue.push_back(ColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::Int64, .Value=ToString(i)}); - DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1); - columnsWithValue.pop_back(); - - if (IsOverflow(i, rule.Interval)) { - return; // correct overflow handling - } - } -} - -TDuration FromUnit(int64_t interval, EIntervalUnit unit) { +TDuration FromUnit(int64_t interval, IPathGenerator::EIntervalUnit unit) { switch (unit) { - case EIntervalUnit::MILLISECONDS: + case IPathGenerator::EIntervalUnit::MILLISECONDS: return TDuration::MilliSeconds(interval); - case EIntervalUnit::SECONDS: + case IPathGenerator::EIntervalUnit::SECONDS: return TDuration::Seconds(interval); - case EIntervalUnit::MINUTES: + case IPathGenerator::EIntervalUnit::MINUTES: return TDuration::Minutes(interval); - case EIntervalUnit::HOURS: + case IPathGenerator::EIntervalUnit::HOURS: return TDuration::Hours(interval); - case EIntervalUnit::DAYS: + case IPathGenerator::EIntervalUnit::DAYS: return TDuration::Days(interval); - case EIntervalUnit::WEEKS: + case IPathGenerator::EIntervalUnit::WEEKS: return TDuration::Days(interval * 7); - case EIntervalUnit::MONTHS: + case IPathGenerator::EIntervalUnit::MONTHS: return TDuration::Seconds(interval * 2629746LL); /// Exactly 1/12 of a year. - case EIntervalUnit::YEARS: + case IPathGenerator::EIntervalUnit::YEARS: return TDuration::Seconds(interval * 31556952LL); /// The average length of a Gregorian year is equal to 365.2425 days default: - ythrow yexception() << "Only the " << GetEnumAllNames<EIntervalUnit>() << " units are supported but got " << unit; + ythrow yexception() << "Only the " << GetEnumAllNames<IPathGenerator::EIntervalUnit>() << " units are supported but got " << unit; } } @@ -330,176 +175,415 @@ TInstant ParseDate(const TString& dateStr, const TInstant& now) { return TInstant::ParseIso8601(dateStr); } -void DoGenerateDate(const std::vector<ColumnPartitioningConfig>& rules, - TString locationTemplate, - std::vector<ColumnWithValue>& columnsWithValue, - TMap<TString, std::vector<ColumnWithValue>>& result, - int pathsLimit, - TInstant now, - size_t p = 0) { - const auto& rule = rules[p]; - const TInstant to = ParseDate(rule.To, now); - const TDuration interval = FromUnit(rule.Interval, rule.IntervalUnit); - for (TInstant current = ParseDate(rule.From, now); current <= to; current += interval) { - TString copyLocationTemplate = locationTemplate; - const TString time = current.FormatLocalTime(rule.Format.c_str()); - ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", time); - columnsWithValue.push_back(ColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::String, .Value=time}); - DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1); - columnsWithValue.pop_back(); +} - if (IsOverflow(current.GetValue(), interval.GetValue())) { - return; // correct overflow handling - } +struct TPathGenerator: public IPathGenerator { + TExplicitPartitioningConfig Config; + TRules Rules; + TMap<TString, TColumnPartitioningConfig> ColumnConfig; // column name -> config + +public: + TPathGenerator(const TString& projection, const std::vector<TString>& partitionedBy, size_t pathsLimit) + { + ParsePartitioningRules(projection, partitionedBy); + ExpandPartitioningRules(pathsLimit); } -} -void DoGenerate(const std::vector<ColumnPartitioningConfig>& rules, - TString locationTemplate, - std::vector<ColumnWithValue>& columnsWithValue, - TMap<TString, std::vector<ColumnWithValue>>& result, - int pathsLimit, - TInstant now, - size_t p) { - if (rules.size() == p) { - if (result.size() == static_cast<uint>(pathsLimit)) { - ythrow yexception() << "The limit on the number of paths has been reached: " << result.size() << " of " << pathsLimit; - } - result[locationTemplate] = columnsWithValue; - return; - } - - const auto& rule = rules[p]; - switch (rule.Type) { - case EType::ENUM: - DoGenerateEnum(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p); - break; - case EType::INTEGER: - DoGenerateInteger(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p); + TString Format(const TStringBuf& columnName, const TStringBuf& dataValue) const override { + auto it = ColumnConfig.find(columnName); + if (it == ColumnConfig.end()) { + ythrow yexception() << columnName << " column not found in config"; + } + const auto& config = it->second; + switch (config.Type) { + case IPathGenerator::EType::UNDEFINED: { + ythrow yexception() << columnName << " column has undefined type"; + } + case IPathGenerator::EType::ENUM: { + if (Find(config.Values, dataValue) == config.Values.end()) { + ythrow yexception() << dataValue << " data not found as enum item"; + } + return TString{dataValue}; + } + case IPathGenerator::EType::INTEGER: { + int64_t value = 0; + if (!TryFromString(dataValue.Data(), dataValue.size(), value)) { + ythrow yexception() << dataValue << " data is not a int64"; + } + return fmtInteger(config.Digits, value); + } + case IPathGenerator::EType::DATE: { + TInstant time = TInstant::Zero() /* ToInstant(dataValue) */; + return time.FormatLocalTime(config.Format.c_str()); + } break; - case EType::DATE: - DoGenerateDate(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p); + } + } + + TString Parse(const TStringBuf& columnName, const TStringBuf& pathValue) const override { + auto it = ColumnConfig.find(columnName); + if (it == ColumnConfig.end()) { + ythrow yexception() << columnName << " column not found in config"; + } + const auto& config = it->second; + switch (config.Type) { + case IPathGenerator::EType::UNDEFINED: { + ythrow yexception() << columnName << " column has undefined type"; + } + case IPathGenerator::EType::ENUM: { + if (Find(config.Values, pathValue) == config.Values.end()) { + ythrow yexception() << pathValue << " value not found as enum item"; + } + return TString{pathValue}; + } + case IPathGenerator::EType::INTEGER: { + int64_t value = 0; + if (!TryFromString(pathValue.Data(), pathValue.size(), value)) { + ythrow yexception() << pathValue << " value is not a int64"; + } + return std::to_string(value); + } + case IPathGenerator::EType::DATE: { + TInstant time = TInstant::Zero() /* ToInstant(dataValue) */; + return time.FormatLocalTime(config.Format.c_str()); + } break; - default: - ythrow yexception() << "Only the enum, integer, date types are supported but got " << to_lower(ToString(rule.Type)); + } } -} -void DoValidateTemplate(const ExplicitPartitioningConfig& config, const std::vector<TString>& partitionBy) { - TSet<TString> vars; - std::regex word_regex("\\$\\{(.*?)\\}"); - auto wordBegin = std::sregex_iterator(config.LocationTemplate.begin(), config.LocationTemplate.end(), word_regex); - auto wordEnd = std::sregex_iterator(); - for (auto word = wordBegin; word != wordEnd; ++word) { - vars.insert((word->begin() + 1)->str()); + const TRules& GetRules() const override { + return Rules; + } + + const TExplicitPartitioningConfig& GetConfig() const override { + return Config; } - TSet<TString> columns; - for (const auto& columnName: partitionBy) { - columns.insert(columnName); - if (!vars.contains(columnName)) { - ythrow yexception() << "Template " << config.LocationTemplate << " must include ${" << columnName << "}"; +private: + // Parse + void ParsePartitioningRules(const TString& config, const std::vector<TString>& partitionedBy) { + if (partitionedBy.empty()) { + ythrow yexception() << "Partition by must always be specified"; + } + + if (!config) { + for (const auto& columnName: partitionedBy) { + Config.LocationTemplate += "/" + columnName + "=${" + columnName + "}"; + } + return; + } + + NSc::TValue json = NSc::TValue::FromJsonThrow(config, NSc::TJsonOpts::JO_PARSER_DISALLOW_DUPLICATE_KEYS | NSc::TJsonOpts::JO_SORT_KEYS); + if (!json.IsDict()) { + ythrow yexception() << "Invalid projection scheme: top-level element must be a dictionary"; + } + + TMap<TString, TMap<TString, NSc::TValue>> projection; + for (const auto& p: json.GetDict()) { + const auto path = GetPath(p.first); + if (path.empty()) { + ythrow yexception() << "Invalid key: key should start with storage or projection, but got an empty value"; + } + const TString& kind = path.front(); + if (kind == "projection") { + AddProjection(p.first, p.second, path, projection); + } else if (kind == "storage") { + AddStorage(p.first, p.second, path); + } else { + ythrow yexception() << "Invalid key: key should start with storage or projection, but got " << p.first; + } + } + DoParseProjection(projection); + DoValidateTemplate(partitionedBy); + for (const auto& config: Config.Rules) { + ColumnConfig[config.Name] = config; } } - for (const auto& rule: config.Rules) { - columns.insert(rule.Name); - if (!vars.contains(rule.Name)) { - ythrow yexception() << "Template " << config.LocationTemplate << " must include ${" << rule.Name << "}"; + void AddProjection(const TStringBuf& key, const NSc::TValue& json, const TVector<TString>& path, TMap<TString, TMap<TString, NSc::TValue>>& projection) { + if (path.size() != 3 && path.size() != 2) { + ythrow yexception() << "The key must be three-component or two-component, but received " << key; + } + + if (path.size() == 2 && path[1] != "enabled") { + ythrow yexception() << "Unknown key " << key; + } + + if (path.size() == 2) { + Config.Enabled = GetBoolOrThrow(json, "The projection.enabled must be a bool"); + return; } + + projection[path[1]][path[2]] = json; } - for (const auto& var: vars) { - if (!columns.contains(var)) { - ythrow yexception() << "Colum named " << var << " does not exist for template " << config.LocationTemplate; + void AddStorage(const TStringBuf& key, const NSc::TValue& json, const TVector<TString>& path) { + if (path.size() != 3) { + ythrow yexception() << "The key must be three-component, but received " << key; + } + + if (path[1] != "location" || path[2] != "template") { + ythrow yexception() << "The key must be storage.location.template, but received " << key; } + + Config.LocationTemplate = GetStringOrThrow(json, "The storage.location.template must be a string"); } -} -void AddProjection(const TStringBuf& key, const NSc::TValue& json, const TVector<TString>& path, TMap<TString, TMap<TString, NSc::TValue>>& projection, ExplicitPartitioningConfig& result) { - if (path.size() != 3 && path.size() != 2) { - ythrow yexception() << "The key must be three-component or two-component, but received " << key; + + void DoParseEnumType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection) { + if (!projection.contains("values")) { + ythrow yexception() << "Invalid projection scheme: values are required field for " << columnName << " " << type; + } + if (!projection.contains("type")) { + ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type; + } + for (const auto& p: projection) { + if (p.first == "type") { + // already processed + } else if (p.first == "values") { + TString values = GetStringOrThrow(p.second, "The values must be a string"); + Config.Rules.push_back(IPathGenerator::TColumnPartitioningConfig{.Type=ToType(type), .Name=columnName, .Values=StringSplitter(values).Split(',').ToList<TString>()}); + } else { + ythrow yexception() << "Invalid projection scheme: enum element must include only type or values (as string) but got " << p.first; + } + } } - if (path.size() == 2 && path[1] != "enabled") { - ythrow yexception() << "Unknown key " << key; + void DoParseIntegerType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection) { + if (!projection.contains("type")) { + ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type; + } + if (!projection.contains("min")) { + ythrow yexception() << "Invalid projection scheme: min are required field for " << columnName << " " << type; + } + if (!projection.contains("max")) { + ythrow yexception() << "Invalid projection scheme: max are required field for " << columnName << " " << type; + } + IPathGenerator::TColumnPartitioningConfig columnConfig; + columnConfig.Name = columnName; + columnConfig.Type = ToType(type); + for (const auto& p: projection) { + if (p.first == "type") { + // already processed + } else if (p.first == "min") { + columnConfig.Min = GetIntOrThrow(p.second, "The min must be a number"); + } else if (p.first == "max") { + columnConfig.Max = GetIntOrThrow(p.second, "The max must be a number"); + } else if (p.first == "interval") { + columnConfig.Interval = GetIntOrThrow(p.second, "The interval must be a number"); + } else if (p.first == "digits") { + columnConfig.Digits = GetIntOrThrow(p.second, "The digits must be a number"); + } else { + ythrow yexception() << "Invalid projection scheme: integer element must include only type, min, max, interval, digits but got " << p.first; + } + } + Config.Rules.push_back(columnConfig); } - if (path.size() == 2) { - result.Enabled = GetBoolOrThrow(json, "The projection.enabled must be a bool"); - return; + void DoParseDateType(const TString& columnName, const TString& type, const TMap<TString, NSc::TValue>& projection) { + if (!projection.contains("type")) { + ythrow yexception() << "Invalid projection scheme: type are required field for " << columnName << " " << type; + } + if (!projection.contains("min")) { + ythrow yexception() << "Invalid projection scheme: min are required field for " << columnName << " " << type; + } + if (!projection.contains("max")) { + ythrow yexception() << "Invalid projection scheme: max are required field for " << columnName << " " << type; + } + if (!projection.contains("format")) { + ythrow yexception() << "Invalid projection scheme: format are required field for " << columnName << " " << type; + } + IPathGenerator::TColumnPartitioningConfig columnConfig; + columnConfig.Name = columnName; + columnConfig.Type = ToType(type); + for (const auto& p: projection) { + if (p.first == "type") { + // already processed + } else if (p.first == "min") { + columnConfig.From = GetStringOrThrow(p.second, "The min must be a string"); + } else if (p.first == "max") { + columnConfig.To = GetStringOrThrow(p.second, "The max must be a string"); + } else if (p.first == "format") { + columnConfig.Format = GetStringOrThrow(p.second, "The format must be a string"); + } else if (p.first == "interval") { + columnConfig.Interval = GetIntOrThrow(p.second, "The interval must be a number"); + } else if (p.first == "unit") { + columnConfig.IntervalUnit = ToIntervalUnit(GetStringOrThrow(p.second, "The unit must be a string")); + } else { + ythrow yexception() << "Invalid projection scheme: date element must include only type, min, max, format, interval, unit but got " << p.first; + } + } + Config.Rules.push_back(columnConfig); } - projection[path[1]][path[2]] = json; -} + void DoParseColumn(const TString& columnName, const TMap<TString, NSc::TValue>& projection) { + auto it = projection.find("type"); + if (it == projection.end()) { + ythrow yexception() << "Invalid projection scheme: type element must exist for the column" << columnName; + } -void AddStorage(const TStringBuf& key, const NSc::TValue& json, const TVector<TString>& path, ExplicitPartitioningConfig& result) { - if (path.size() != 3) { - ythrow yexception() << "The key must be three-component, but received " << key; + const TString type = GetStringOrThrow(it->second, "The type must be a string for column " + columnName); + if (type == "enum") { + DoParseEnumType(columnName, type, projection); + } else if (type == "integer") { + DoParseIntegerType(columnName, type, projection); + } else if (type == "date") { + DoParseDateType(columnName, type, projection); + } else { + ythrow yexception() << "Invalid projection scheme: type element should be one of enum, date, integer but got " << type; + } } - if (path[1] != "location" || path[2] != "template") { - ythrow yexception() << "The key must be storage.location.template, but received " << key; + void DoParseProjection(TMap<TString, TMap<TString, NSc::TValue>> projection) { + for (const auto& p: projection) { + DoParseColumn(p.first, p.second); + } } - result.LocationTemplate = GetStringOrThrow(json, "The storage.location.template must be a string"); -} + void DoValidateTemplate(const std::vector<TString>& partitionedBy) { + TSet<TString> vars; + std::regex word_regex("\\$\\{(.*?)\\}"); + auto wordBegin = std::sregex_iterator(Config.LocationTemplate.begin(), Config.LocationTemplate.end(), word_regex); + auto wordEnd = std::sregex_iterator(); + for (auto word = wordBegin; word != wordEnd; ++word) { + vars.insert((word->begin() + 1)->str()); + } -} + TSet<TString> columns; + for (const auto& columnName: partitionedBy) { + columns.insert(columnName); + if (!vars.contains(columnName)) { + ythrow yexception() << "Template " << Config.LocationTemplate << " must include ${" << columnName << "}"; + } + } + + for (const auto& rule: Config.Rules) { + columns.insert(rule.Name); + if (!vars.contains(rule.Name)) { + ythrow yexception() << "Template " << Config.LocationTemplate << " must include ${" << rule.Name << "}"; + } + } -ExplicitPartitioningConfig ParsePartitioningRules(const TString& config, const std::vector<TString>& partitionBy) { - ExplicitPartitioningConfig result; - if (partitionBy.empty()) { - ythrow yexception() << "Partition by must always be specified"; + for (const auto& var: vars) { + if (!columns.contains(var)) { + ythrow yexception() << "Colum named " << var << " does not exist for template " << Config.LocationTemplate; + } + } } - if (!config) { - for (const auto& columnName: partitionBy) { - result.LocationTemplate += "/" + columnName + "=${" + columnName + "}"; + // Generate + void ExpandPartitioningRules(size_t pathsLimit) { + if (!Config.Enabled) { + Rules = Config.LocationTemplate + ? TRules{TExpandedPartitioningRule{.Path=Config.LocationTemplate}} + : TRules{}; + return; + } + auto now = TInstant::Now(); + TMap<TString, std::vector<TColumnWithValue>> result; + std::vector<TColumnWithValue> columns; + DoGenerate(Config.Rules, Config.LocationTemplate, columns, result, pathsLimit, now); + for (const auto& p: result) { + Rules.push_back(TExpandedPartitioningRule{.Path=p.first, .ColumnValues=p.second}); } - return result; } - NSc::TValue json = NSc::TValue::FromJsonThrow(config, NSc::TJsonOpts::JO_PARSER_DISALLOW_DUPLICATE_KEYS | NSc::TJsonOpts::JO_SORT_KEYS); - if (!json.IsDict()) { - ythrow yexception() << "Invalid projection scheme: top-level element must be a dictionary"; + + void DoGenerateDate(const std::vector<TColumnPartitioningConfig>& rules, + TString locationTemplate, + std::vector<TColumnWithValue>& columnsWithValue, + TMap<TString, std::vector<TColumnWithValue>>& result, + size_t pathsLimit, + TInstant now, + size_t p = 0) { + const auto& rule = rules[p]; + const TInstant to = ParseDate(rule.To, now); + const TDuration interval = FromUnit(rule.Interval, rule.IntervalUnit); + for (TInstant current = ParseDate(rule.From, now); current <= to; current += interval) { + TString copyLocationTemplate = locationTemplate; + const TString time = current.FormatLocalTime(rule.Format.c_str()); + ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", time); + columnsWithValue.push_back(TColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::String, .Value=time}); + DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1); + columnsWithValue.pop_back(); + + if (IsOverflow(current.GetValue(), interval.GetValue())) { + return; // correct overflow handling + } + } } - TMap<TString, TMap<TString, NSc::TValue>> projection; + void DoGenerate(const std::vector<TColumnPartitioningConfig>& rules, + TString locationTemplate, + std::vector<TColumnWithValue>& columnsWithValue, + TMap<TString, std::vector<TColumnWithValue>>& result, + size_t pathsLimit, + TInstant now, + size_t p = 0) { + if (rules.size() == p) { + if (result.size() == pathsLimit) { + ythrow yexception() << "The limit on the number of paths has been reached: " << result.size() << " of " << pathsLimit; + } + result[locationTemplate] = columnsWithValue; + return; + } - for (const auto& p: json.GetDict()) { - const auto path = GetPath(p.first); - if (path.empty()) { - ythrow yexception() << "Invalid key: key should start with storage or projection, but got an empty value"; + const auto& rule = rules[p]; + switch (rule.Type) { + case IPathGenerator::EType::ENUM: + DoGenerateEnum(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p); + break; + case IPathGenerator::EType::INTEGER: + DoGenerateInteger(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p); + break; + case IPathGenerator::EType::DATE: + DoGenerateDate(rules, locationTemplate, columnsWithValue, result, pathsLimit, now, p); + break; + default: + ythrow yexception() << "Only the enum, integer, date types are supported but got " << to_lower(ToString(rule.Type)); } - const TString& kind = path.front(); - if (kind == "projection") { - AddProjection(p.first, p.second, path, projection, result); - } else if (kind == "storage") { - AddStorage(p.first, p.second, path, result); - } else { - ythrow yexception() << "Invalid key: key should start with storage or projection, but got " << p.first; + } + + void DoGenerateEnum(const std::vector<IPathGenerator::TColumnPartitioningConfig>& rules, + TString locationTemplate, + std::vector<IPathGenerator::TColumnWithValue>& columnsWithValue, + TMap<TString, std::vector<IPathGenerator::TColumnWithValue>>& result, + size_t pathsLimit, + TInstant now, + size_t p = 0) { + const auto& rule = rules[p]; + for (const auto& value: rule.Values) { + TString copyLocationTemplate = locationTemplate; + ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", value); + columnsWithValue.push_back(IPathGenerator::TColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::String, .Value=value}); + DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1); + columnsWithValue.pop_back(); } } - DoParseProjection(projection, result); - DoValidateTemplate(result, partitionBy); - return result; -} -std::vector<ExpandedPartitioningRule> ExpandPartitioningRules(const ExplicitPartitioningConfig& config, int pathsLimit) { - if (!config.Enabled) { - return config.LocationTemplate - ? std::vector{ExpandedPartitioningRule{.Path=config.LocationTemplate}} - : std::vector<ExpandedPartitioningRule>{}; - } - auto now = TInstant::Now(); - TMap<TString, std::vector<ColumnWithValue>> result; - std::vector<ColumnWithValue> columns; - DoGenerate(config.Rules, config.LocationTemplate, columns, result, pathsLimit, now); - std::vector<ExpandedPartitioningRule> rules; - for (const auto& p: result) { - rules.push_back(ExpandedPartitioningRule{.Path=p.first, .ColumnValues=p.second}); - } - return rules; + void DoGenerateInteger(const std::vector<IPathGenerator::TColumnPartitioningConfig>& rules, + TString locationTemplate, + std::vector<IPathGenerator::TColumnWithValue>& columnsWithValue, + TMap<TString, std::vector<IPathGenerator::TColumnWithValue>>& result, + size_t pathsLimit, + TInstant now, + size_t p = 0) { + const auto& rule = rules[p]; + for (int64_t i = rule.Min; i <= rule.Max; i += rule.Interval) { + TString copyLocationTemplate = locationTemplate; + ReplaceAll(copyLocationTemplate, "${" + rule.Name + "}", fmtInteger(rule.Digits, i)); + columnsWithValue.push_back(IPathGenerator::TColumnWithValue{.Name=rule.Name, .Type=NUdf::EDataSlot::Int64, .Value=ToString(i)}); + DoGenerate(rules, copyLocationTemplate, columnsWithValue, result, pathsLimit, now, p + 1); + columnsWithValue.pop_back(); + + if (IsOverflow(i, rule.Interval)) { + return; // correct overflow handling + } + } + } +}; + +TPathGeneratorPtr CreatePathGenerator(const TString& projection, const std::vector<TString>& partitionedBy, size_t pathsLimit) { + return std::make_shared<TPathGenerator>(projection, partitionedBy, pathsLimit); } } diff --git a/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.h b/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.h index 1c0cc3b298b..449d4c7d19e 100644 --- a/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.h +++ b/ydb/library/yql/providers/s3/path_generator/yql_s3_path_generator.h @@ -7,58 +7,68 @@ namespace NYql::NPathGenerator { -enum class EType { - UNDEFINED = 0, - ENUM = 1, - INTEGER = 2, - DATE = 3 -}; +struct IPathGenerator { + enum class EType { + UNDEFINED = 0, + ENUM = 1, + INTEGER = 2, + DATE = 3 + }; -enum class EIntervalUnit { - UNDEFINED = 0, - MILLISECONDS = 1, - SECONDS = 2, - MINUTES = 3, - HOURS = 4, - DAYS = 5, - WEEKS = 6, - MONTHS = 7, - YEARS = 8 -}; + enum class EIntervalUnit { + UNDEFINED = 0, + MILLISECONDS = 1, + SECONDS = 2, + MINUTES = 3, + HOURS = 4, + DAYS = 5, + WEEKS = 6, + MONTHS = 7, + YEARS = 8 + }; -struct ColumnPartitioningConfig { - EType Type = EType::UNDEFINED; - TString Name; - TString Format; - TString From; - TString To; - int64_t Min = 0; - int64_t Max = 0; - EIntervalUnit IntervalUnit = EIntervalUnit::DAYS; - int64_t Interval = 1; - int32_t Digits = 0; - std::vector<TString> Values; -}; + struct TColumnPartitioningConfig { + EType Type = EType::UNDEFINED; + TString Name; + TString Format; + TString From; + TString To; + int64_t Min = 0; + int64_t Max = 0; + EIntervalUnit IntervalUnit = EIntervalUnit::DAYS; + int64_t Interval = 1; + int32_t Digits = 0; + std::vector<TString> Values; + }; -struct ExplicitPartitioningConfig { - bool Enabled = false; - TString LocationTemplate; - std::vector<ColumnPartitioningConfig> Rules; -}; + struct TExplicitPartitioningConfig { + bool Enabled = false; + TString LocationTemplate; + std::vector<TColumnPartitioningConfig> Rules; + }; -ExplicitPartitioningConfig ParsePartitioningRules(const TString& config, const std::vector<TString>& partitionBy); + struct TColumnWithValue { + TString Name; + NUdf::EDataSlot Type; + TString Value; + }; -struct ColumnWithValue { - TString Name; - NUdf::EDataSlot Type; - TString Value; -}; + struct TExpandedPartitioningRule { + TString Path; + std::vector<TColumnWithValue> ColumnValues; + }; + + using TRules = std::vector<TExpandedPartitioningRule>; -struct ExpandedPartitioningRule { - TString Path; - std::vector<ColumnWithValue> ColumnValues; + virtual TString Format(const TStringBuf& columnName, const TStringBuf& dataValue) const = 0; + virtual TString Parse(const TStringBuf& columnName, const TStringBuf& pathValue) const = 0; + virtual const TRules& GetRules() const = 0; + virtual const TExplicitPartitioningConfig& GetConfig() const = 0; + virtual ~IPathGenerator() = default; }; -std::vector<ExpandedPartitioningRule> ExpandPartitioningRules(const ExplicitPartitioningConfig& config, int pathsLimit); +using TPathGeneratorPtr = std::shared_ptr<const IPathGenerator>; + +TPathGeneratorPtr CreatePathGenerator(const TString& projection, const std::vector<TString>& partitionedBy, size_t pathsLimit = 1000); } |