object storage partitioning has been added for binding (public API)

author: hcpp <[email protected]> 2022-08-03 23:37:07 +0300
committer: hcpp <[email protected]> 2022-08-03 23:37:07 +0300
commit: dbd229b9d6e3ae64e48726e3a8e790a3bf31f518 (patch)
tree: d92c2367c3310f383ae436e7b2b47aa708582fb4
parent: 58a4673f45222b6c60584270ad2fada532dabb88 (diff)
1 files changed, 42 insertions, 0 deletions
diff --git a/ydb/public/api/protos/yq.proto b/ydb/public/api/protos/yq.proto
index 846b783eec8..1e3db543745 100644
--- a/ydb/public/api/protos/yq.proto
+++ b/ydb/public/api/protos/yq.proto
@@ -632,6 +632,48 @@ message ObjectStorageBinding {
         map<string, string> format_setting = 3 [(Ydb.size).le = 100];
         string compression = 4 [(Ydb.length).le = 1024];
         Schema schema = 5;
+
+        /*
+        Partition projection is used to speed up the processing of highly partitioned
+        storages and automate the management of partitions. In partition projection, partition values and
+        locations are calculated from configuration rather than read from an object storage. Depending on the
+        specific characteristics of the query and underlying data, partition projection can significantly
+        reduce execution time for queries that are constrained by partition metadata retrieval. Similar
+        functionality is implemented in Athena: https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html
+        Only enum, integer and date types are supported for path generation. When using projection, there must
+        be at least one element in partitioned_by. This behavior is introduced for symmetric query usage and
+        compatibility with Athena behavior.
+
+        Example:
+        projection = {
+            "projection.enabled" : "true", // enables or disables partition projection
+            "projection.city.type" : "enum", // the city column is generated using the enum type (a comma-separated list of values)
+            "projection.city.values" : "Washington,Roma", // the city column takes the values Washington or Roma
+            "projection.code.type" : "enum", // the code column is generated using the enum type (a comma-separated list of values)
+            "projection.code.values" : "0,1", // the code column takes the values 0 or 1
+            "storage.location.template" : "/${city}/${code}/${device_id}" // the template into which the generated values are substituted
+        }
+        partitioned_by = [ "city", "device_id" ] // a subset of columns that are included in partitioning
+        - If storage.location.template and partitioned_by are specified together, then the rule from storage.location.template will be used.
+        - If only partitioned_by is specified, then the Hive Metastore format will be used for storage.location.template: "/city=${city}/device_id=${device_id}"
+        The paths that correspond to the described projection and partitioned_by values are:
+        "/Washington/0/${device_id}", "/Washington/1/${device_id}", "/Roma/0/${device_id}", "/Roma/1/${device_id}"
+        */
+        map<string, string> projection = 6;
+
+        /*
+        By separating the data, it is possible to limit the amount of data scanned by each query, thereby improving
+        performance and reducing costs. Therefore, user data is partitioned by key (in practice, this is partitioning by time).
+        The partitioned_by defines the keys on which to partition data. The columns described in partitioned_by
+        must be specified in the schema. If projection is not specified, the template will be generated according to
+        partitioned_by. Similar functionality is implemented in Athena: https://docs.aws.amazon.com/athena/latest/ug/partitions.html
+
+        Example:
+        partitioned_by = [ "city", "code", "device_id" ]
+        The corresponding storage.location.template will be as follows:
+        "/city=${city}/code=${code}/device_id=${device_id}"
+        */
+        repeated string partitioned_by = 7;
     }
 
     repeated Subset subset = 1;
author	hcpp <[email protected]>	2022-08-03 23:37:07 +0300
committer	hcpp <[email protected]>	2022-08-03 23:37:07 +0300
commit	dbd229b9d6e3ae64e48726e3a8e790a3bf31f518 (patch)
tree	d92c2367c3310f383ae436e7b2b47aa708582fb4
parent	58a4673f45222b6c60584270ad2fada532dabb88 (diff)