1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
|
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This schema uses proto2 syntax; the fields in this file are declared with
// explicit optional/repeated labels.
syntax = "proto2";
// Protobuf package that scopes the messages and enums declared in this file.
package orc.proto;
// Java package used for the classes generated from this file.
option java_package = "org.apache.orc";
// Minimum, maximum, and sum statistics for integer data. All three values
// are optional sint64 (ZigZag-encoded) fields.
message IntegerStatistics {
  optional sint64 minimum = 1;
  optional sint64 maximum = 2;
  optional sint64 sum = 3;
}
// Minimum, maximum, and sum statistics carried as optional doubles.
message DoubleStatistics {
  optional double minimum = 1;
  optional double maximum = 2;
  optional double sum = 3;
}
// Statistics for string data.
message StringStatistics {
  // Smallest and largest values. See lowerBound/upperBound below for values
  // that are too long to be stored here.
  optional string minimum = 1;
  optional string maximum = 2;
  // Total length of all strings in a stripe.
  optional sint64 sum = 3;
  // When the minimum or maximum value was longer than 1024 bytes, a lower or
  // upper bound is stored here instead of the minimum or maximum above.
  optional string lowerBound = 4;
  optional string upperBound = 5;
}
// A packed list of counts.
// NOTE(review): what each entry counts is not stated in this file -- confirm
// against the reader/writer code.
message BucketStatistics {
  repeated uint64 count = 1 [packed = true];
}
// Minimum, maximum, and sum statistics for decimal data. All three values
// are carried as strings rather than numeric types.
// NOTE(review): the string format is not specified in this file.
message DecimalStatistics {
  optional string minimum = 1;
  optional string maximum = 2;
  optional string sum = 3;
}
// Minimum and maximum statistics for date data.
message DateStatistics {
  // Both bounds are saved as days since the epoch.
  optional sint32 minimum = 1;
  optional sint32 maximum = 2;
}
// Minimum and maximum statistics for timestamp data.
message TimestampStatistics {
  // Min/max values are saved as milliseconds since the epoch.
  optional sint64 minimum = 1;
  optional sint64 maximum = 2;
  // NOTE(review): presumably the same bounds expressed in UTC -- confirm.
  optional sint64 minimumUtc = 3;
  optional sint64 maximumUtc = 4;
  // The lower 6 timestamp digits of the min/max, stored so that the bounds
  // achieve nanosecond precision.
  optional int32 minimumNanos = 5;
  optional int32 maximumNanos = 6;
}
// Statistics for binary data.
message BinaryStatistics {
  // Total length of the binary blobs in a stripe.
  optional sint64 sum = 1;
}
// Statistics kept for list and map columns: the minimum, maximum, and total
// child counts.
message CollectionStatistics {
  optional uint64 minChildren = 1;
  optional uint64 maxChildren = 2;
  optional uint64 totalChildren = 3;
}
// Statistics for a column: general counters plus optional type-specific
// statistics messages.
// NOTE(review): presumably only the type-specific message that matches the
// column's type is populated; the schema itself does not enforce this (the
// fields are not in a oneof) -- confirm.
message ColumnStatistics {
  optional uint64 numberOfValues = 1;
  optional IntegerStatistics intStatistics = 2;
  optional DoubleStatistics doubleStatistics = 3;
  optional StringStatistics stringStatistics = 4;
  optional BucketStatistics bucketStatistics = 5;
  optional DecimalStatistics decimalStatistics = 6;
  optional DateStatistics dateStatistics = 7;
  optional BinaryStatistics binaryStatistics = 8;
  optional TimestampStatistics timestampStatistics = 9;
  optional bool hasNull = 10;
  optional uint64 bytesOnDisk = 11;
  optional CollectionStatistics collectionStatistics = 12;
}
// One entry of a row index: a packed list of positions together with
// optional column statistics.
message RowIndexEntry {
  repeated uint64 positions = 1 [packed = true];
  optional ColumnStatistics statistics = 2;
}
// A row index, represented as a list of RowIndexEntry records.
message RowIndex {
  repeated RowIndexEntry entry = 1;
}
// A bloom filter: the number of hash functions plus the filter bits, which
// can be carried either as fixed64 words (bitset) or as bytes (utf8bitset).
// NOTE(review): which of the two representations a writer emits is not
// stated in this file -- confirm.
message BloomFilter {
  optional uint32 numHashFunctions = 1;
  // Declared without [packed = true], unlike the other repeated numeric
  // fields in this file.
  repeated fixed64 bitset = 2;
  optional bytes utf8bitset = 3;
}
// A bloom filter index, represented as a list of BloomFilter records.
message BloomFilterIndex {
  repeated BloomFilter bloomFilter = 1;
}
// Describes one stream by its kind, column, and length.
// NOTE(review): length is presumably a byte count -- confirm.
message Stream {
  // When adding a new index stream kind, also update StreamName so that the
  // stream is added to the stripe in the right area.
  enum Kind {
    PRESENT = 0;
    DATA = 1;
    LENGTH = 2;
    DICTIONARY_DATA = 3;
    DICTIONARY_COUNT = 4;
    SECONDARY = 5;
    ROW_INDEX = 6;
    BLOOM_FILTER = 7;
    BLOOM_FILTER_UTF8 = 8;
    // Virtual stream kinds used to allocate space for encrypted index and
    // data.
    ENCRYPTED_INDEX = 9;
    ENCRYPTED_DATA = 10;
    // Stripe statistics streams.
    STRIPE_STATISTICS = 100;
    // A virtual stream kind that is used for setting the encryption IV.
    FILE_STATISTICS = 101;
  }
  optional Kind kind = 1;
  optional uint32 column = 2;
  optional uint64 length = 3;
}
// The encoding used for a column.
message ColumnEncoding {
  enum Kind {
    DIRECT = 0;
    DICTIONARY = 1;
    DIRECT_V2 = 2;
    DICTIONARY_V2 = 3;
  }
  optional Kind kind = 1;
  optional uint32 dictionarySize = 2;
  // The encoding of the bloom filters for this column:
  //   0 or missing = none or original
  //   1            = ORC-135 (utc for timestamps)
  optional uint32 bloomEncoding = 3;
}
// The streams and column encodings recorded for one encryption variant of a
// stripe; StripeFooter.encryption holds one of these per variant.
message StripeEncryptionVariant {
  repeated Stream streams = 1;
  repeated ColumnEncoding encoding = 2;
}
// Each stripe looks like:
//   index streams
//     unencrypted
//     variant 1..N
//   data streams
//     unencrypted
//     variant 1..N
//   footer
message StripeFooter {
  repeated Stream streams = 1;
  repeated ColumnEncoding columns = 2;
  optional string writerTimezone = 3;
  // One entry for each column encryption variant.
  repeated StripeEncryptionVariant encryption = 4;
}
// The file tail looks like:
//   encrypted stripe statistics: ColumnarStripeStatistics (order by variant)
//   stripe statistics: Metadata
//   footer: Footer
//   postscript: PostScript
//   psLen: byte
// A key/value pair in which both parts are optional strings.
message StringPair {
  optional string key = 1;
  optional string value = 2;
}
// Describes one type: its kind plus kind-dependent details.
// NOTE(review): subtypes presumably refers to child types and fieldNames to
// their names, and maximumLength/precision/scale presumably apply only to
// some kinds -- none of this is stated in this file; confirm.
message Type {
  enum Kind {
    BOOLEAN = 0;
    BYTE = 1;
    SHORT = 2;
    INT = 3;
    LONG = 4;
    FLOAT = 5;
    DOUBLE = 6;
    STRING = 7;
    BINARY = 8;
    TIMESTAMP = 9;
    LIST = 10;
    MAP = 11;
    STRUCT = 12;
    UNION = 13;
    DECIMAL = 14;
    DATE = 15;
    VARCHAR = 16;
    CHAR = 17;
    TIMESTAMP_INSTANT = 18;
  }
  optional Kind kind = 1;
  repeated uint32 subtypes = 2 [packed = true];
  repeated string fieldNames = 3;
  optional uint32 maximumLength = 4;
  optional uint32 precision = 5;
  optional uint32 scale = 6;
  // Key/value attributes attached to the type.
  repeated StringPair attributes = 7;
}
// Location, sizes, and encryption details of one stripe.
message StripeInformation {
  // Global file offset at which the stripe starts.
  optional uint64 offset = 1;
  // Number of bytes of index.
  optional uint64 indexLength = 2;
  // Number of bytes of data.
  optional uint64 dataLength = 3;
  // Number of bytes in the stripe footer.
  optional uint64 footerLength = 4;
  // Number of rows in this stripe.
  optional uint64 numberOfRows = 5;
  // When present, the reader should use this value as the encryption stripe
  // id for setting the encryption IV. Otherwise, the reader should use one
  // larger than the previous stripe's encryptStripeId.
  // For unmerged ORC files, the first stripe will use 1 and the rest of the
  // stripes won't have it set. For merged files, the stripe information is
  // copied from the original files, so the first stripe of each input file
  // resets it to 1.
  // Note that 1 was chosen because protobuf v3 doesn't serialize primitive
  // types that hold their default value (e.g. 0).
  optional uint64 encryptStripeId = 6;
  // For each encryption variant, the new encrypted local key to use until a
  // replacement is found.
  repeated bytes encryptedLocalKeys = 7;
}
// One user metadata entry: a string name paired with a bytes value.
message UserMetadataItem {
  optional string name = 1;
  optional bytes value = 2;
}
// StripeStatistics (one per stripe), each of which contains the
// ColumnStatistics for each column.
// This message type is only used in ORC v0 and v1.
message StripeStatistics {
  repeated ColumnStatistics colStats = 1;
}
// Holds the list of StripeStatistics.
// This message type is only used in ORC v0 and v1.
message Metadata {
  repeated StripeStatistics stripeStats = 1;
}
// In ORC v2 (and for encrypted columns in v1), each column has its column
// statistics written separately.
message ColumnarStripeStatistics {
  // One value for each stripe in the file.
  repeated ColumnStatistics colStats = 1;
}
// The encryption algorithms that can be recorded for a key.
enum EncryptionAlgorithm {
  // Used for detecting future algorithms.
  UNKNOWN_ENCRYPTION = 0;
  AES_CTR_128 = 1;
  AES_CTR_256 = 2;
}
// A list of ColumnStatistics. EncryptionVariant.fileStatistics is described
// as carrying the encrypted file statistics in this form.
message FileStatistics {
  repeated ColumnStatistics column = 1;
}
// Records how the data was masked. This is not needed for reading the file;
// it documents how the file was written.
message DataMask {
  // The kind of masking, which may include third-party masks.
  optional string name = 1;
  // Parameters for the mask.
  repeated string maskParameters = 2;
  // The unencrypted column roots this mask was applied to.
  repeated uint32 columns = 3 [packed = true];
}
// Information about an encryption key: its name, version, and algorithm.
message EncryptionKey {
  optional string keyName = 1;
  optional uint32 keyVersion = 2;
  optional EncryptionAlgorithm algorithm = 3;
}
// Describes one encryption variant.
// Each variant is a single subtype that is encrypted with a single key.
message EncryptionVariant {
  // Column id of the root.
  optional uint32 root = 1;
  // The master key that was used to encrypt the local key, given as an index
  // into the Encryption.key list.
  optional uint32 key = 2;
  // The encrypted key for the file footer.
  optional bytes encryptedKey = 3;
  // The stripe statistics for this variant.
  repeated Stream stripeStatistics = 4;
  // Encrypted file statistics, as a FileStatistics.
  optional bytes fileStatistics = 5;
}
// Identifies which KeyProvider encrypted the local keys.
enum KeyProviderKind {
  UNKNOWN = 0;
  HADOOP = 1;
  AWS = 2;
  GCP = 3;
  AZURE = 4;
}
// The masks, keys, and encryption variants used in a file.
message Encryption {
  // All of the masks used in this file.
  repeated DataMask mask = 1;
  // All of the keys used in this file.
  repeated EncryptionKey key = 2;
  // The encrypted variants.
  // Readers should prefer the first variant for which the user has access to
  // the corresponding key. If they don't have access to any of the keys,
  // they should get the unencrypted masked data.
  repeated EncryptionVariant variants = 3;
  // How the local keys are encrypted.
  optional KeyProviderKind keyProvider = 4;
}
// The calendar recorded in the file footer (see Footer.calendar).
enum CalendarKind {
  UNKNOWN_CALENDAR = 0;
  // A hybrid Julian/Gregorian calendar with a cutover point in October 1582.
  JULIAN_GREGORIAN = 1;
  // A calendar that extends the Gregorian calendar back forever.
  PROLEPTIC_GREGORIAN = 2;
}
// The file footer: stripe list, types, user metadata, column statistics,
// and information about the writer and encryption.
message Footer {
  optional uint64 headerLength = 1;
  optional uint64 contentLength = 2;
  repeated StripeInformation stripes = 3;
  repeated Type types = 4;
  repeated UserMetadataItem metadata = 5;
  optional uint64 numberOfRows = 6;
  repeated ColumnStatistics statistics = 7;
  optional uint32 rowIndexStride = 8;
  // Each implementation that writes ORC files should register for a code:
  //   0 = ORC Java
  //   1 = ORC C++
  //   2 = Presto
  //   3 = Scritchley Go from https://github.com/scritchley/orc
  //   4 = Trino
  optional uint32 writer = 9;
  // Information about the encryption in this file.
  optional Encryption encryption = 10;
  optional CalendarKind calendar = 11;
  // Informative description of the version of the software that wrote the
  // file. It is assumed to be within a given writer, so for example
  // ORC 1.7.2 = "1.7.2". It may include suffixes, such as "-SNAPSHOT".
  optional string softwareVersion = 12;
}
// The compression kinds that can be recorded in PostScript.compression.
enum CompressionKind {
  NONE = 0;
  ZLIB = 1;
  SNAPPY = 2;
  LZO = 3;
  LZ4 = 4;
  ZSTD = 5;
}
// The postscript of the file.
// Serialized length must be less than 255 bytes.
message PostScript {
  optional uint64 footerLength = 1;
  optional CompressionKind compression = 2;
  optional uint64 compressionBlockSize = 3;
  // The version of the file format:
  //   [0, 11] = Hive 0.11
  //   [0, 12] = Hive 0.12
  repeated uint32 version = 4 [packed = true];
  optional uint64 metadataLength = 5;
  // The version of the writer that wrote the file. This number is updated
  // when fixes or large changes are made to the writer, so that readers can
  // detect whether a given bug is present in the data.
  //
  // Only the Java ORC writer may use values under 6 (or missing), so that
  // readers that predate ORC-202 treat the new writers correctly. Each
  // writer should assign its own sequence of versions starting from 6.
  //
  // Version of the ORC Java writer:
  //   0 = original
  //   1 = HIVE-8732 fixed (fixed stripe/file maximum statistics &
  //                        string statistics use utf8 for min/max)
  //   2 = HIVE-4243 fixed (use real column names from Hive tables)
  //   3 = HIVE-12055 added (vectorized writer implementation)
  //   4 = HIVE-13083 fixed (decimals write present stream correctly)
  //   5 = ORC-101 fixed (bloom filters use utf8 consistently)
  //   6 = ORC-135 fixed (timestamp statistics use utc)
  //   7 = ORC-517 fixed (decimal64 min/max incorrect)
  //   8 = ORC-203 added (trim very long string statistics)
  //   9 = ORC-14 added (column encryption)
  //
  // Version of the ORC C++ writer:
  //   6 = original
  //
  // Version of the Presto writer:
  //   6 = original
  //
  // Version of the Scritchley Go writer:
  //   6 = original
  //
  // Version of the Trino writer:
  //   6 = original
  //
  optional uint32 writerVersion = 6;
  // The number of bytes in the encrypted stripe statistics.
  optional uint64 stripeStatisticsLength = 7;
  // Leave this last in the record.
  optional string magic = 8000;
}
// The contents of the file tail that must be serialized.
// This is serialized as part of OrcSplit and is also used by the footer
// cache.
message FileTail {
  optional PostScript postscript = 1;
  optional Footer footer = 2;
  optional uint64 fileLength = 3;
  optional uint64 postscriptLength = 4;
}
|