1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
|
#pragma once
#include <map>
#include <optional>
#include <city.h>
#include <base/types.h>
#include <Disks/IDisk.h>
#include <IO/ReadBuffer.h>
#include <IO/WriteBuffer.h>
class SipHash;
namespace DB
{
class IDataPartStorage;
/// Checksum record for a single file.
/// Among the constructors, only the four-argument one fills the uncompressed_* fields
/// and marks the record as compressed.
struct MergeTreeDataPartChecksum
{
    using uint128 = CityHash_v1_0_2::uint128;

    /// Size and hash of the file itself.
    UInt64 file_size{};
    uint128 file_hash{};

    /// Stays false unless the record is built with uncompressed size and hash.
    bool is_compressed = false;
    UInt64 uncompressed_size{};
    uint128 uncompressed_hash{};

    MergeTreeDataPartChecksum() = default;

    /// Record that carries only the file size and hash; is_compressed keeps its default (false).
    MergeTreeDataPartChecksum(UInt64 file_size_, uint128 file_hash_)
        : file_size(file_size_)
        , file_hash(file_hash_)
    {
    }

    /// Record that additionally carries the uncompressed size and hash; sets is_compressed to true.
    MergeTreeDataPartChecksum(UInt64 file_size_, uint128 file_hash_, UInt64 uncompressed_size_, uint128 uncompressed_hash_)
        : file_size(file_size_)
        , file_hash(file_hash_)
        , is_compressed(true)
        , uncompressed_size(uncompressed_size_)
        , uncompressed_hash(uncompressed_hash_)
    {
    }

    /// Both checks are only declared here; their definitions are outside this header.
    void checkEqual(const MergeTreeDataPartChecksum & rhs, bool have_uncompressed, const String & name) const;
    void checkSize(const IDataPartStorage & storage, const String & name) const;
};
/** Checksums of all non-temporary files, keyed by file name.
  * For compressed files the checksum and the size of the decompressed data are stored too,
  * so that the result does not depend on the compression method.
  */
struct MergeTreeDataPartChecksums
{
    using Checksum = MergeTreeDataPartChecksum;

    /// An ordered map is used deliberately: the order of the entries is important.
    using FileChecksums = std::map<String, Checksum>;

    FileChecksums files;

    void addFile(const String & file_name, UInt64 file_size, Checksum::uint128 file_hash);

    void add(MergeTreeDataPartChecksums && rhs_checksums);

    /// True when `files` holds an entry for the given file name.
    bool has(const String & file_name) const { return files.count(file_name) != 0; }

    /// True when `files` holds no entries at all.
    bool empty() const { return files.empty(); }

    /// Verifies that the set of columns and their checksums are the same; throws an exception otherwise.
    /// With have_uncompressed, the checksums of the decompressed data are compared for compressed files.
    /// Without it, only the checksums of the files are compared.
    void checkEqual(const MergeTreeDataPartChecksums & rhs, bool have_uncompressed) const;

    static bool isBadChecksumsErrorCode(int code);

    /// Verifies that the directory contains all the needed files of the correct size.
    /// The checksums themselves are not verified.
    void checkSizes(const IDataPartStorage & storage) const;

    /// Returns false if the checksum is too old.
    bool read(ReadBuffer & in);

    /// Expects that the header with the version (the first line) has already been read.
    bool read(ReadBuffer & in, size_t format_version);

    bool readV2(ReadBuffer & in);
    bool readV3(ReadBuffer & in);
    bool readV4(ReadBuffer & from);

    void write(WriteBuffer & to) const;

    /// Checksum from the set of checksums of .bin files (for deduplication).
    void computeTotalChecksumDataOnly(SipHash & hash) const;

    /// SipHash of all file hashes, represented as a hex string.
    String getTotalChecksumHex() const;

    Checksum::uint128 getTotalChecksumUInt128() const;

    String getSerializedString() const;
    static MergeTreeDataPartChecksums deserializeFrom(const String & s);

    UInt64 getTotalSizeOnDisk() const;
};
/// A variant of MergeTreeDataPartChecksums intended to be stored in ZooKeeper (to save its RAM).
/// For versions below MINIMAL_VERSION_WITH_MINIMALISTIC_CHECKSUMS, this struct and
/// MergeTreeDataPartChecksums share the same serialization format.
struct MinimalisticDataPartChecksums
{
    using uint128 = MergeTreeDataPartChecksum::uint128;

    static constexpr size_t MINIMAL_VERSION_WITH_MINIMALISTIC_CHECKSUMS = 5;

    UInt64 num_compressed_files = 0;
    UInt64 num_uncompressed_files = 0;

    uint128 hash_of_all_files{};
    uint128 hash_of_uncompressed_files{};
    uint128 uncompressed_hash_of_compressed_files{};

    /// Compares the two file counters and the three aggregate hashes.
    /// `full_checksums` does not take part in the comparison.
    bool operator==(const MinimalisticDataPartChecksums & other) const
    {
        if (!(num_compressed_files == other.num_compressed_files))
            return false;
        if (!(num_uncompressed_files == other.num_uncompressed_files))
            return false;
        if (!(hash_of_all_files == other.hash_of_all_files))
            return false;
        if (!(hash_of_uncompressed_files == other.hash_of_uncompressed_files))
            return false;
        return uncompressed_hash_of_compressed_files == other.uncompressed_hash_of_compressed_files;
    }

    /// Populated only for old formats.
    std::optional<MergeTreeDataPartChecksums> full_checksums;

    MinimalisticDataPartChecksums() = default;

    void computeTotalChecksums(const MergeTreeDataPartChecksums & full_checksums);

    /// Reading side. Definitions are outside this header.
    bool deserialize(ReadBuffer & in);
    void deserializeWithoutHeader(ReadBuffer & in);
    static MinimalisticDataPartChecksums deserializeFrom(const String & s);

    /// Writing side. Definitions are outside this header.
    void serialize(WriteBuffer & to) const;
    void serializeWithoutHeader(WriteBuffer & to) const;
    String getSerializedString() const;
    static String getSerializedString(const MergeTreeDataPartChecksums & full_checksums, bool minimalistic);

    /// Equality checks against either another minimalistic set or a full set of checksums.
    void checkEqual(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
    void checkEqual(const MergeTreeDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
    void checkEqualImpl(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
};
}
|