summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorhcpp <[email protected]>2022-08-03 23:37:07 +0300
committerhcpp <[email protected]>2022-08-03 23:37:07 +0300
commitdbd229b9d6e3ae64e48726e3a8e790a3bf31f518 (patch)
treed92c2367c3310f383ae436e7b2b47aa708582fb4
parent58a4673f45222b6c60584270ad2fada532dabb88 (diff)
object storage partitioning has been added for binding (public API)
-rw-r--r--ydb/public/api/protos/yq.proto42
1 files changed, 42 insertions, 0 deletions
diff --git a/ydb/public/api/protos/yq.proto b/ydb/public/api/protos/yq.proto
index 846b783eec8..1e3db543745 100644
--- a/ydb/public/api/protos/yq.proto
+++ b/ydb/public/api/protos/yq.proto
@@ -632,6 +632,48 @@ message ObjectStorageBinding {
map<string, string> format_setting = 3 [(Ydb.size).le = 100];
string compression = 4 [(Ydb.length).le = 1024];
Schema schema = 5;
+
+ /*
+ Partition projection is used to speed up the processing of highly partitioned
+ storages and automate the management of partitions. In partition projection, partition values and
+ locations are calculated from configuration rather than read from an object storage. Depending on the
+ specific characteristics of the query and underlying data, partition projection can significantly
+ reduce query execution time if the query is constrained by partition metadata retrieval. Similar
+ functionality is implemented in Athena: https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html
+ Only enum, integer and date types are supported for path generation. When using projection, there must
+ be at least one element in partitioned_by. This behavior is introduced for symmetric query usage and
+ compatibility with Athena behavior.
+
+ Example:
+ projection = {
+ "projection.enabled" : "true", // enables or disables partition projection
+ "projection.city.type" : "enum", // to generate the city column, the enum type will be used (enumeration of objects separated by commas)
+ "projection.city.values" : "Washington,Roma", // the city column takes the values Washington or Roma
+ "projection.code.type" : "enum", // to generate the code column, the enum type will be used (enumeration of objects separated by commas)
+ "projection.code.values" : "0,1", // the code column takes the values 0 or 1
+ "storage.location.template" : "/${city}/${code}/${device_id}" // the template into which the generated values will be substituted
+ }
+ partitioned_by = [ "city", "device_id" ] // a subset of columns that are included in partitioning
+ - If storage.location.template and partitioned_by are specified together, then the rule from storage.location.template will be used.
+ - If only partitioned_by is specified, then the Hive Metastore format will be used for storage.location.template: "/city=${city}/device_id=${device_id}"
+ The list of paths that corresponds to the described projection and partitioned_by values is:
+ "/Washington/0/${device_id}", "/Washington/1/${device_id}", "/Roma/0/${device_id}", "/Roma/1/${device_id}"
+ */
+ map<string, string> projection = 6;
+
+ /*
+ By separating the data, it is possible to limit the amount of data scanned by each query, thereby improving
+ performance and reducing costs. Therefore, user data is partitioned by key (in practice, this is partitioning by time).
+ The partitioned_by defines the keys on which to partition data. The columns described in partitioned_by
+ must be specified in the schema. If projection is not specified, the template will be generated according to
+ partitioned_by. Similar functionality is implemented in Athena: https://docs.aws.amazon.com/athena/latest/ug/partitions.html
+
+ Example:
+ partitioned_by = [ "city", "code", "device_id" ]
+ The corresponding storage.location.template will be as follows:
+ "/city=${city}/code=${code}/device_id=${device_id}"
+ */
+ repeated string partitioned_by = 7;
}
repeated Subset subset = 1;