diff options
author | hcpp <[email protected]> | 2022-08-03 23:37:07 +0300 |
---|---|---|
committer | hcpp <[email protected]> | 2022-08-03 23:37:07 +0300 |
commit | dbd229b9d6e3ae64e48726e3a8e790a3bf31f518 (patch) | |
tree | d92c2367c3310f383ae436e7b2b47aa708582fb4 | |
parent | 58a4673f45222b6c60584270ad2fada532dabb88 (diff) |
object storage partitioning has been added for binding (public API)
-rw-r--r-- | ydb/public/api/protos/yq.proto | 42 |
1 files changed, 42 insertions, 0 deletions
diff --git a/ydb/public/api/protos/yq.proto b/ydb/public/api/protos/yq.proto index 846b783eec8..1e3db543745 100644 --- a/ydb/public/api/protos/yq.proto +++ b/ydb/public/api/protos/yq.proto @@ -632,6 +632,48 @@ message ObjectStorageBinding { map<string, string> format_setting = 3 [(Ydb.size).le = 100]; string compression = 4 [(Ydb.length).le = 1024]; Schema schema = 5; + + /* + Partition projection is used to speed up the processing of highly partitioned + storages and automate the management of partitions. In partition projection, partition values and + locations are calculated from configuration rather than read from an object storage. Depending on the + specific characteristics of the query and underlying data, partition projection can significantly + reduce query execution time if it uses partitioning constraints on partition metadata retrieval. Similar + functionality is implemented in Athena: https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html + Only enum, integer and date types are supported for path generation. When using projection, there must + be at least one element in partitioned_by. This behavior is introduced for symmetric query usage and + compatibility with Athena behavior. 
+ + Example: + projection = { + "projection.enabled" : "true", // used to enable and disable partitioning + "projection.city.type" : "enum", // to generate the city column, the enum type will be used (enumeration of objects separated by commas) + "projection.city.values" : "Washington,Roma", // values of the city column: Washington or Roma + "projection.code.type" : "enum", // to generate the code column, the enum type will be used (enumeration of objects separated by commas) + "projection.code.values" : "0,1", // values of the code column: 0 or 1 + "storage.location.template" : "/${city}/${code}/${device_id}" // the template into which the generated values will be substituted + } + partitioned_by = [ "city", "device_id" ] // a subset of columns that are included in partitioning + - If storage.location.template and partitioned_by are specified together, then the rule from storage.location.template will be used. + - If only partitioned_by is specified, then the Hive Metastore format will be used for storage.location.template: "/city=${city}/device_id=${device_id}" + The paths that correspond to the described projection and partitioned_by values are: + "/Washington/0/${device_id}", "/Washington/1/${device_id}", "/Roma/0/${device_id}", "/Roma/1/${device_id}" + */ + map<string, string> projection = 6; + + /* + By separating the data, it is possible to limit the amount of data scanned by each query, thereby improving + performance and reducing costs. Therefore, user data is partitioned by key (in practice, this is usually partitioning by time). + The partitioned_by defines the keys on which to partition data. The columns described in partitioned_by + must be specified in the schema. If projection is not specified, the template will be generated according to + partitioned_by. 
Similar functionality is implemented in Athena: https://docs.aws.amazon.com/athena/latest/ug/partitions.html + + Example: + partitioned_by = [ "city", "code", "device_id" ] + The corresponding storage.location.template will be as follows: + "/city=${city}/code=${code}/device_id=${device_id}" + */ + repeated string partitioned_by = 7; } repeated Subset subset = 1; |