path: root/contrib/clickhouse/src/IO
author     vitalyisaev <vitalyisaev@ydb.tech>  2023-11-14 09:58:56 +0300
committer  vitalyisaev <vitalyisaev@ydb.tech>  2023-11-14 10:20:20 +0300
commit     c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch)
tree       cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/IO
parent     d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff)
download   ydb-c2b2dfd9827a400a8495e172a56343462e3ceb82.tar.gz
YQ Connector: move tests from yql to ydb (OSS)
Move the folder with Connector tests from the yql folder to the ydb folder (kept in sync with github).
Diffstat (limited to 'contrib/clickhouse/src/IO')
-rw-r--r--  contrib/clickhouse/src/IO/AIO.cpp  148
-rw-r--r--  contrib/clickhouse/src/IO/AIO.h  79
-rw-r--r--  contrib/clickhouse/src/IO/Archives/ArchiveUtils.h  14
-rw-r--r--  contrib/clickhouse/src/IO/Archives/IArchiveReader.h  69
-rw-r--r--  contrib/clickhouse/src/IO/Archives/IArchiveWriter.h  39
-rw-r--r--  contrib/clickhouse/src/IO/Archives/LibArchiveReader.cpp  354
-rw-r--r--  contrib/clickhouse/src/IO/Archives/LibArchiveReader.h  78
-rw-r--r--  contrib/clickhouse/src/IO/Archives/ZipArchiveReader.cpp  662
-rw-r--r--  contrib/clickhouse/src/IO/Archives/ZipArchiveReader.h  87
-rw-r--r--  contrib/clickhouse/src/IO/Archives/ZipArchiveWriter.cpp  407
-rw-r--r--  contrib/clickhouse/src/IO/Archives/ZipArchiveWriter.h  92
-rw-r--r--  contrib/clickhouse/src/IO/Archives/createArchiveReader.cpp  70
-rw-r--r--  contrib/clickhouse/src/IO/Archives/createArchiveReader.h  23
-rw-r--r--  contrib/clickhouse/src/IO/Archives/createArchiveWriter.cpp  38
-rw-r--r--  contrib/clickhouse/src/IO/Archives/createArchiveWriter.h  19
-rw-r--r--  contrib/clickhouse/src/IO/Archives/hasRegisteredArchiveFileExtension.cpp  12
-rw-r--r--  contrib/clickhouse/src/IO/Archives/hasRegisteredArchiveFileExtension.h  12
-rw-r--r--  contrib/clickhouse/src/IO/AsyncReadCounters.cpp  37
-rw-r--r--  contrib/clickhouse/src/IO/AsyncReadCounters.h  32
-rw-r--r--  contrib/clickhouse/src/IO/AsynchronousReadBufferFromFile.cpp  109
-rw-r--r--  contrib/clickhouse/src/IO/AsynchronousReadBufferFromFile.h  84
-rw-r--r--  contrib/clickhouse/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp  272
-rw-r--r--  contrib/clickhouse/src/IO/AsynchronousReadBufferFromFileDescriptor.h  77
-rw-r--r--  contrib/clickhouse/src/IO/AsynchronousReader.h  87
-rw-r--r--  contrib/clickhouse/src/IO/BitHelpers.h  236
-rw-r--r--  contrib/clickhouse/src/IO/BoundedReadBuffer.cpp  66
-rw-r--r--  contrib/clickhouse/src/IO/BoundedReadBuffer.h  38
-rw-r--r--  contrib/clickhouse/src/IO/BrotliReadBuffer.cpp  109
-rw-r--r--  contrib/clickhouse/src/IO/BrotliReadBuffer.h  37
-rw-r--r--  contrib/clickhouse/src/IO/BrotliWriteBuffer.cpp  126
-rw-r--r--  contrib/clickhouse/src/IO/BrotliWriteBuffer.h  38
-rw-r--r--  contrib/clickhouse/src/IO/BufferBase.h  132
-rw-r--r--  contrib/clickhouse/src/IO/BufferWithOwnMemory.h  196
-rw-r--r--  contrib/clickhouse/src/IO/Bzip2ReadBuffer.cpp  139
-rw-r--r--  contrib/clickhouse/src/IO/Bzip2ReadBuffer.h  31
-rw-r--r--  contrib/clickhouse/src/IO/Bzip2WriteBuffer.cpp  110
-rw-r--r--  contrib/clickhouse/src/IO/Bzip2WriteBuffer.h  31
-rw-r--r--  contrib/clickhouse/src/IO/CascadeWriteBuffer.cpp  119
-rw-r--r--  contrib/clickhouse/src/IO/CascadeWriteBuffer.h  63
-rw-r--r--  contrib/clickhouse/src/IO/CompressedReadBufferWrapper.h  28
-rw-r--r--  contrib/clickhouse/src/IO/CompressionMethod.cpp  205
-rw-r--r--  contrib/clickhouse/src/IO/CompressionMethod.h  73
-rw-r--r--  contrib/clickhouse/src/IO/ConcatReadBuffer.h  86
-rw-r--r--  contrib/clickhouse/src/IO/ConcatSeekableReadBuffer.cpp  144
-rw-r--r--  contrib/clickhouse/src/IO/ConcatSeekableReadBuffer.h  46
-rw-r--r--  contrib/clickhouse/src/IO/ConnectionTimeouts.cpp  136
-rw-r--r--  contrib/clickhouse/src/IO/ConnectionTimeouts.h  72
-rw-r--r--  contrib/clickhouse/src/IO/DoubleConverter.cpp  16
-rw-r--r--  contrib/clickhouse/src/IO/DoubleConverter.h  46
-rw-r--r--  contrib/clickhouse/src/IO/EmptyReadBuffer.h  18
-rw-r--r--  contrib/clickhouse/src/IO/FileEncryptionCommon.cpp  465
-rw-r--r--  contrib/clickhouse/src/IO/FileEncryptionCommon.h  154
-rw-r--r--  contrib/clickhouse/src/IO/ForkWriteBuffer.cpp  60
-rw-r--r--  contrib/clickhouse/src/IO/ForkWriteBuffer.h  34
-rw-r--r--  contrib/clickhouse/src/IO/HTTPChunkedReadBuffer.cpp  91
-rw-r--r--  contrib/clickhouse/src/IO/HTTPChunkedReadBuffer.h  27
-rw-r--r--  contrib/clickhouse/src/IO/HTTPCommon.cpp  449
-rw-r--r--  contrib/clickhouse/src/IO/HTTPCommon.h  104
-rw-r--r--  contrib/clickhouse/src/IO/HTTPHeaderEntries.h  18
-rw-r--r--  contrib/clickhouse/src/IO/HadoopSnappyReadBuffer.cpp  240
-rw-r--r--  contrib/clickhouse/src/IO/HadoopSnappyReadBuffer.h  117
-rw-r--r--  contrib/clickhouse/src/IO/HashingReadBuffer.h  55
-rw-r--r--  contrib/clickhouse/src/IO/HashingWriteBuffer.cpp  54
-rw-r--r--  contrib/clickhouse/src/IO/HashingWriteBuffer.h  92
-rw-r--r--  contrib/clickhouse/src/IO/IReadableWriteBuffer.h  32
-rw-r--r--  contrib/clickhouse/src/IO/IResourceManager.h  53
-rw-r--r--  contrib/clickhouse/src/IO/ISchedulerConstraint.h  55
-rw-r--r--  contrib/clickhouse/src/IO/ISchedulerNode.h  222
-rw-r--r--  contrib/clickhouse/src/IO/ISchedulerQueue.h  60
-rw-r--r--  contrib/clickhouse/src/IO/LZMADeflatingWriteBuffer.cpp  123
-rw-r--r--  contrib/clickhouse/src/IO/LZMADeflatingWriteBuffer.h  35
-rw-r--r--  contrib/clickhouse/src/IO/LZMAInflatingReadBuffer.cpp  99
-rw-r--r--  contrib/clickhouse/src/IO/LZMAInflatingReadBuffer.h  29
-rw-r--r--  contrib/clickhouse/src/IO/LimitReadBuffer.cpp  99
-rw-r--r--  contrib/clickhouse/src/IO/LimitReadBuffer.h  36
-rw-r--r--  contrib/clickhouse/src/IO/LimitSeekableReadBuffer.cpp  125
-rw-r--r--  contrib/clickhouse/src/IO/LimitSeekableReadBuffer.h  33
-rw-r--r--  contrib/clickhouse/src/IO/Lz4DeflatingWriteBuffer.cpp  161
-rw-r--r--  contrib/clickhouse/src/IO/Lz4DeflatingWriteBuffer.h  43
-rw-r--r--  contrib/clickhouse/src/IO/Lz4InflatingReadBuffer.cpp  89
-rw-r--r--  contrib/clickhouse/src/IO/Lz4InflatingReadBuffer.h  39
-rw-r--r--  contrib/clickhouse/src/IO/MMapReadBufferFromFile.cpp  79
-rw-r--r--  contrib/clickhouse/src/IO/MMapReadBufferFromFile.h  40
-rw-r--r--  contrib/clickhouse/src/IO/MMapReadBufferFromFileDescriptor.cpp  105
-rw-r--r--  contrib/clickhouse/src/IO/MMapReadBufferFromFileDescriptor.h  47
-rw-r--r--  contrib/clickhouse/src/IO/MMapReadBufferFromFileWithCache.cpp  78
-rw-r--r--  contrib/clickhouse/src/IO/MMapReadBufferFromFileWithCache.h  29
-rw-r--r--  contrib/clickhouse/src/IO/MMappedFile.cpp  78
-rw-r--r--  contrib/clickhouse/src/IO/MMappedFile.h  40
-rw-r--r--  contrib/clickhouse/src/IO/MMappedFileCache.h  60
-rw-r--r--  contrib/clickhouse/src/IO/MMappedFileDescriptor.cpp  107
-rw-r--r--  contrib/clickhouse/src/IO/MMappedFileDescriptor.h  60
-rw-r--r--  contrib/clickhouse/src/IO/MemoryReadWriteBuffer.cpp  147
-rw-r--r--  contrib/clickhouse/src/IO/MemoryReadWriteBuffer.h  60
-rw-r--r--  contrib/clickhouse/src/IO/MySQLBinlogEventReadBuffer.cpp  80
-rw-r--r--  contrib/clickhouse/src/IO/MySQLBinlogEventReadBuffer.h  30
-rw-r--r--  contrib/clickhouse/src/IO/MySQLPacketPayloadReadBuffer.cpp  61
-rw-r--r--  contrib/clickhouse/src/IO/MySQLPacketPayloadReadBuffer.h  33
-rw-r--r--  contrib/clickhouse/src/IO/MySQLPacketPayloadWriteBuffer.cpp  61
-rw-r--r--  contrib/clickhouse/src/IO/MySQLPacketPayloadWriteBuffer.h  36
-rw-r--r--  contrib/clickhouse/src/IO/NullWriteBuffer.cpp  16
-rw-r--r--  contrib/clickhouse/src/IO/NullWriteBuffer.h  21
-rw-r--r--  contrib/clickhouse/src/IO/OpenedFile.cpp  77
-rw-r--r--  contrib/clickhouse/src/IO/OpenedFile.h  43
-rw-r--r--  contrib/clickhouse/src/IO/OpenedFileCache.h  116
-rw-r--r--  contrib/clickhouse/src/IO/Operators.h  98
-rw-r--r--  contrib/clickhouse/src/IO/ParallelReadBuffer.cpp  307
-rw-r--r--  contrib/clickhouse/src/IO/ParallelReadBuffer.h  100
-rw-r--r--  contrib/clickhouse/src/IO/PeekableReadBuffer.cpp  378
-rw-r--r--  contrib/clickhouse/src/IO/PeekableReadBuffer.h  143
-rw-r--r--  contrib/clickhouse/src/IO/Progress.cpp  240
-rw-r--r--  contrib/clickhouse/src/IO/Progress.h  151
-rw-r--r--  contrib/clickhouse/src/IO/ReadBuffer.cpp  47
-rw-r--r--  contrib/clickhouse/src/IO/ReadBuffer.h  277
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromEmptyFile.h  25
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromEncryptedFile.cpp  106
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromEncryptedFile.h  50
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromFile.cpp  97
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromFile.h  108
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromFileBase.cpp  49
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromFileBase.h  63
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromFileDecorator.cpp  60
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromFileDecorator.h  37
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromFileDescriptor.cpp  288
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromFileDescriptor.h  103
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromIStream.cpp  39
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromIStream.h  21
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromMemory.cpp  47
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromMemory.h  27
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromPocoSocket.cpp  129
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromPocoSocket.h  37
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromS3.cpp  498
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromS3.h  108
-rw-r--r--  contrib/clickhouse/src/IO/ReadBufferFromString.h  28
-rw-r--r--  contrib/clickhouse/src/IO/ReadHelpers.cpp  1712
-rw-r--r--  contrib/clickhouse/src/IO/ReadHelpers.h  1619
-rw-r--r--  contrib/clickhouse/src/IO/ReadHelpersArena.h  33
-rw-r--r--  contrib/clickhouse/src/IO/ReadSettings.h  136
-rw-r--r--  contrib/clickhouse/src/IO/ReadWriteBufferFromHTTP.cpp  940
-rw-r--r--  contrib/clickhouse/src/IO/ReadWriteBufferFromHTTP.h  291
-rw-r--r--  contrib/clickhouse/src/IO/Resource/ClassifiersConfig.cpp  40
-rw-r--r--  contrib/clickhouse/src/IO/Resource/ClassifiersConfig.h  39
-rw-r--r--  contrib/clickhouse/src/IO/Resource/DynamicResourceManager.cpp  234
-rw-r--r--  contrib/clickhouse/src/IO/Resource/DynamicResourceManager.h  93
-rw-r--r--  contrib/clickhouse/src/IO/Resource/FairPolicy.cpp  13
-rw-r--r--  contrib/clickhouse/src/IO/Resource/FairPolicy.h  232
-rw-r--r--  contrib/clickhouse/src/IO/Resource/FifoQueue.cpp  13
-rw-r--r--  contrib/clickhouse/src/IO/Resource/FifoQueue.h  91
-rw-r--r--  contrib/clickhouse/src/IO/Resource/PriorityPolicy.cpp  13
-rw-r--r--  contrib/clickhouse/src/IO/Resource/PriorityPolicy.h  143
-rw-r--r--  contrib/clickhouse/src/IO/Resource/SemaphoreConstraint.cpp  13
-rw-r--r--  contrib/clickhouse/src/IO/Resource/SemaphoreConstraint.h  138
-rw-r--r--  contrib/clickhouse/src/IO/Resource/StaticResourceManager.cpp  138
-rw-r--r--  contrib/clickhouse/src/IO/Resource/StaticResourceManager.h  49
-rw-r--r--  contrib/clickhouse/src/IO/Resource/registerResourceManagers.cpp  17
-rw-r--r--  contrib/clickhouse/src/IO/Resource/registerResourceManagers.h  8
-rw-r--r--  contrib/clickhouse/src/IO/Resource/registerSchedulerNodes.cpp  30
-rw-r--r--  contrib/clickhouse/src/IO/Resource/registerSchedulerNodes.h  8
-rw-r--r--  contrib/clickhouse/src/IO/ResourceBudget.h  55
-rw-r--r--  contrib/clickhouse/src/IO/ResourceGuard.h  139
-rw-r--r--  contrib/clickhouse/src/IO/ResourceLink.h  39
-rw-r--r--  contrib/clickhouse/src/IO/ResourceManagerFactory.h  55
-rw-r--r--  contrib/clickhouse/src/IO/ResourceRequest.h  89
-rw-r--r--  contrib/clickhouse/src/IO/S3/AWSLogger.cpp  78
-rw-r--r--  contrib/clickhouse/src/IO/S3/AWSLogger.h  39
-rw-r--r--  contrib/clickhouse/src/IO/S3/Client.cpp  903
-rw-r--r--  contrib/clickhouse/src/IO/S3/Client.h  329
-rw-r--r--  contrib/clickhouse/src/IO/S3/Credentials.cpp  574
-rw-r--r--  contrib/clickhouse/src/IO/S3/Credentials.h  146
-rw-r--r--  contrib/clickhouse/src/IO/S3/PocoHTTPClient.cpp  558
-rw-r--r--  contrib/clickhouse/src/IO/S3/PocoHTTPClient.h  198
-rw-r--r--  contrib/clickhouse/src/IO/S3/PocoHTTPClientFactory.cpp  40
-rw-r--r--  contrib/clickhouse/src/IO/S3/PocoHTTPClientFactory.h  28
-rw-r--r--  contrib/clickhouse/src/IO/S3/ProviderType.cpp  40
-rw-r--r--  contrib/clickhouse/src/IO/S3/ProviderType.h  44
-rw-r--r--  contrib/clickhouse/src/IO/S3/Requests.cpp  156
-rw-r--r--  contrib/clickhouse/src/IO/S3/Requests.h  135
-rw-r--r--  contrib/clickhouse/src/IO/S3/SessionAwareIOStream.h  30
-rw-r--r--  contrib/clickhouse/src/IO/S3/URI.cpp  119
-rw-r--r--  contrib/clickhouse/src/IO/S3/URI.h  41
-rw-r--r--  contrib/clickhouse/src/IO/S3/copyS3File.cpp  850
-rw-r--r--  contrib/clickhouse/src/IO/S3/copyS3File.h  68
-rw-r--r--  contrib/clickhouse/src/IO/S3/getObjectInfo.cpp  144
-rw-r--r--  contrib/clickhouse/src/IO/S3/getObjectInfo.h  63
-rw-r--r--  contrib/clickhouse/src/IO/S3Common.cpp  178
-rw-r--r--  contrib/clickhouse/src/IO/S3Common.h  98
-rw-r--r--  contrib/clickhouse/src/IO/SchedulerNodeFactory.h  57
-rw-r--r--  contrib/clickhouse/src/IO/SchedulerRoot.h  250
-rw-r--r--  contrib/clickhouse/src/IO/SeekableReadBuffer.cpp  109
-rw-r--r--  contrib/clickhouse/src/IO/SeekableReadBuffer.h  103
-rw-r--r--  contrib/clickhouse/src/IO/SharedThreadPools.cpp  141
-rw-r--r--  contrib/clickhouse/src/IO/SharedThreadPools.h  64
-rw-r--r--  contrib/clickhouse/src/IO/SnappyReadBuffer.cpp  75
-rw-r--r--  contrib/clickhouse/src/IO/SnappyReadBuffer.h  35
-rw-r--r--  contrib/clickhouse/src/IO/SnappyWriteBuffer.cpp  92
-rw-r--r--  contrib/clickhouse/src/IO/SnappyWriteBuffer.h  41
-rw-r--r--  contrib/clickhouse/src/IO/StdIStreamFromMemory.cpp  62
-rw-r--r--  contrib/clickhouse/src/IO/StdIStreamFromMemory.h  36
-rw-r--r--  contrib/clickhouse/src/IO/StdStreamBufFromReadBuffer.cpp  111
-rw-r--r--  contrib/clickhouse/src/IO/StdStreamBufFromReadBuffer.h  39
-rw-r--r--  contrib/clickhouse/src/IO/StdStreamFromReadBuffer.h  38
-rw-r--r--  contrib/clickhouse/src/IO/SwapHelper.cpp  17
-rw-r--r--  contrib/clickhouse/src/IO/SwapHelper.h  19
-rw-r--r--  contrib/clickhouse/src/IO/SynchronousReader.cpp  89
-rw-r--r--  contrib/clickhouse/src/IO/SynchronousReader.h  20
-rw-r--r--  contrib/clickhouse/src/IO/TimeoutSetter.cpp  56
-rw-r--r--  contrib/clickhouse/src/IO/TimeoutSetter.h  34
-rw-r--r--  contrib/clickhouse/src/IO/UncompressedCache.h  80
-rw-r--r--  contrib/clickhouse/src/IO/UseSSL.cpp  24
-rw-r--r--  contrib/clickhouse/src/IO/UseSSL.h  13
-rw-r--r--  contrib/clickhouse/src/IO/VarInt.cpp  16
-rw-r--r--  contrib/clickhouse/src/IO/VarInt.h  218
-rw-r--r--  contrib/clickhouse/src/IO/WithFileName.cpp  39
-rw-r--r--  contrib/clickhouse/src/IO/WithFileName.h  19
-rw-r--r--  contrib/clickhouse/src/IO/WithFileSize.cpp  86
-rw-r--r--  contrib/clickhouse/src/IO/WithFileSize.h  26
-rw-r--r--  contrib/clickhouse/src/IO/WriteBuffer.cpp  33
-rw-r--r--  contrib/clickhouse/src/IO/WriteBuffer.h  181
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferDecorator.h  55
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromArena.h  73
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromEncryptedFile.cpp  61
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromEncryptedFile.h  46
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromFile.cpp  115
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromFile.h  64
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromFileBase.cpp  11
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromFileBase.h  21
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromFileDecorator.cpp  74
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromFileDecorator.h  35
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromFileDescriptor.cpp  179
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromFileDescriptor.h  62
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromFileDescriptorDiscardOnFailure.cpp  32
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromFileDescriptorDiscardOnFailure.h  23
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromHTTP.cpp  50
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromHTTP.h  40
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromOStream.cpp  42
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromOStream.h  29
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromPocoSocket.cpp  140
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromPocoSocket.h  42
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromS3.cpp  699
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromS3.h  140
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp  112
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromS3TaskTracker.cpp  176
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromS3TaskTracker.h  72
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromString.h  42
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferFromVector.h  103
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferValidUTF8.cpp  157
-rw-r--r--  contrib/clickhouse/src/IO/WriteBufferValidUTF8.h  41
-rw-r--r--  contrib/clickhouse/src/IO/WriteHelpers.cpp  125
-rw-r--r--  contrib/clickhouse/src/IO/WriteHelpers.h  1274
-rw-r--r--  contrib/clickhouse/src/IO/WriteIntText.h  46
-rw-r--r--  contrib/clickhouse/src/IO/WriteSettings.h  32
-rw-r--r--  contrib/clickhouse/src/IO/ZlibDeflatingWriteBuffer.cpp  131
-rw-r--r--  contrib/clickhouse/src/IO/ZlibDeflatingWriteBuffer.h  41
-rw-r--r--  contrib/clickhouse/src/IO/ZlibInflatingReadBuffer.cpp  125
-rw-r--r--  contrib/clickhouse/src/IO/ZlibInflatingReadBuffer.h  44
-rw-r--r--  contrib/clickhouse/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp  223
-rw-r--r--  contrib/clickhouse/src/IO/ZstdDeflatingAppendableWriteBuffer.h  84
-rw-r--r--  contrib/clickhouse/src/IO/ZstdDeflatingWriteBuffer.cpp  104
-rw-r--r--  contrib/clickhouse/src/IO/ZstdDeflatingWriteBuffer.h  47
-rw-r--r--  contrib/clickhouse/src/IO/ZstdInflatingReadBuffer.cpp  95
-rw-r--r--  contrib/clickhouse/src/IO/ZstdInflatingReadBuffer.h  37
-rw-r--r--  contrib/clickhouse/src/IO/copyData.cpp  112
-rw-r--r--  contrib/clickhouse/src/IO/copyData.h  37
-rw-r--r--  contrib/clickhouse/src/IO/parseDateTimeBestEffort.cpp  723
-rw-r--r--  contrib/clickhouse/src/IO/parseDateTimeBestEffort.h  66
-rw-r--r--  contrib/clickhouse/src/IO/readDecimalText.h  227
-rw-r--r--  contrib/clickhouse/src/IO/readFloatText.cpp  70
-rw-r--r--  contrib/clickhouse/src/IO/readFloatText.h  596
268 files changed, 34177 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/IO/AIO.cpp b/contrib/clickhouse/src/IO/AIO.cpp
new file mode 100644
index 0000000000..7088be633e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/AIO.cpp
@@ -0,0 +1,148 @@
+#include <IO/AIO.h>
+
+#if defined(OS_LINUX)
+
+# include <Common/Exception.h>
+
+# include <sys/syscall.h>
+# include <unistd.h>
+# include <utility>
+
+
+/** Small wrappers for asynchronous I/O.
+ */
+
+namespace DB
+{
+ namespace ErrorCodes
+ {
+ extern const int CANNOT_IOSETUP;
+ }
+}
+
+
+int io_setup(unsigned nr, aio_context_t * ctxp)
+{
+ return static_cast<int>(syscall(__NR_io_setup, nr, ctxp));
+}
+
+int io_destroy(aio_context_t ctx)
+{
+ return static_cast<int>(syscall(__NR_io_destroy, ctx));
+}
+
+int io_submit(aio_context_t ctx, long nr, struct iocb * iocbpp[]) // NOLINT
+{
+ return static_cast<int>(syscall(__NR_io_submit, ctx, nr, iocbpp));
+}
+
+int io_getevents(aio_context_t ctx, long min_nr, long max_nr, io_event * events, struct timespec * timeout) // NOLINT
+{
+ return static_cast<int>(syscall(__NR_io_getevents, ctx, min_nr, max_nr, events, timeout));
+}
+
+
+AIOContext::AIOContext(unsigned int nr_events)
+{
+ ctx = 0;
+ if (io_setup(nr_events, &ctx) < 0)
+ DB::throwFromErrno("io_setup failed", DB::ErrorCodes::CANNOT_IOSETUP);
+}
+
+AIOContext::~AIOContext()
+{
+ if (ctx)
+ io_destroy(ctx);
+}
+
+AIOContext::AIOContext(AIOContext && rhs) noexcept
+{
+ *this = std::move(rhs);
+}
+
+AIOContext & AIOContext::operator=(AIOContext && rhs) noexcept
+{
+ std::swap(ctx, rhs.ctx);
+ return *this;
+}
+
+#elif defined(OS_FREEBSD)
+
+# include <Common/Exception.h>
+
+
+/** Small wrappers for asynchronous I/O.
+ */
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int CANNOT_IOSETUP;
+}
+}
+
+
+int io_setup(void)
+{
+ return kqueue();
+}
+
+int io_destroy(int ctx)
+{
+ return close(ctx);
+}
+
+int io_submit(int ctx, long nr, struct iocb * iocbpp[])
+{
+ for (long i = 0; i < nr; ++i)
+ {
+ struct aiocb * iocb = &iocbpp[i]->aio;
+
+ struct sigevent * se = &iocb->aio_sigevent;
+ se->sigev_notify_kqueue = ctx;
+ se->sigev_notify_kevent_flags = 0;
+ se->sigev_notify = SIGEV_KEVENT;
+ se->sigev_value.sival_ptr = iocbpp[i];
+
+ switch (iocb->aio_lio_opcode)
+ {
+ case LIO_READ:
+ {
+ int r = aio_read(iocb);
+ if (r < 0)
+ return r;
+ break;
+ }
+ case LIO_WRITE:
+ {
+ int r = aio_write(iocb);
+ if (r < 0)
+ return r;
+ break;
+ }
+ }
+ }
+
+ return static_cast<int>(nr);
+}
+
+int io_getevents(int ctx, long, long max_nr, struct kevent * events, struct timespec * timeout)
+{
+ return kevent(ctx, nullptr, 0, events, static_cast<int>(max_nr), timeout);
+}
+
+
+AIOContext::AIOContext(unsigned int)
+{
+ ctx = io_setup();
+ if (ctx < 0)
+ DB::throwFromErrno("io_setup failed", DB::ErrorCodes::CANNOT_IOSETUP);
+}
+
+AIOContext::~AIOContext()
+{
+ io_destroy(ctx);
+}
+
+#endif
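
An illustrative usage sketch of these wrappers on Linux (not part of the commit; the file path and buffer handling are assumptions, and error checks are omitted). AIOContext owns the io_setup()/io_destroy() pair, while individual requests go through the raw io_submit()/io_getevents() wrappers:

    #include <IO/AIO.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <cstdint>

    /// Read the first 4 KiB of a file through the kernel AIO syscall wrappers.
    /// With O_DIRECT the buffer, size and offset must be block-aligned.
    void readFirstBlock(const char * path, char * aligned_buf)
    {
        AIOContext ctx(64);                                /// calls io_setup(64, &ctx.ctx)
        int fd = ::open(path, O_RDONLY | O_DIRECT);

        iocb request{};
        request.aio_lio_opcode = IOCB_CMD_PREAD;
        request.aio_fildes = fd;
        request.aio_buf = reinterpret_cast<std::uintptr_t>(aligned_buf);
        request.aio_nbytes = 4096;
        request.aio_offset = 0;

        iocb * requests[] = {&request};
        io_submit(ctx.ctx, 1, requests);                   /// enqueue one request

        io_event events[1];
        io_getevents(ctx.ctx, 1, 1, events, nullptr);      /// block until it completes
        ::close(fd);
    }
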
diff --git a/contrib/clickhouse/src/IO/AIO.h b/contrib/clickhouse/src/IO/AIO.h
new file mode 100644
index 0000000000..202939638b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/AIO.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <boost/noncopyable.hpp>
+
+#if defined(OS_LINUX)
+
+/// https://stackoverflow.com/questions/20759750/resolving-redefinition-of-timespec-in-time-h
+# define timespec linux_timespec
+# define timeval linux_timeval
+# define itimerspec linux_itimerspec
+# define sigset_t linux_sigset_t
+
+# include <linux/aio_abi.h>
+
+# undef timespec
+# undef timeval
+# undef itimerspec
+# undef sigset_t
+
+
+/** Small wrappers for asynchronous I/O.
+ */
+
+int io_setup(unsigned nr, aio_context_t * ctxp);
+
+int io_destroy(aio_context_t ctx);
+
+/// last argument is an array of pointers technically speaking
+int io_submit(aio_context_t ctx, long nr, struct iocb * iocbpp[]); /// NOLINT
+
+int io_getevents(aio_context_t ctx, long min_nr, long max_nr, io_event * events, struct timespec * timeout); /// NOLINT
+
+
+struct AIOContext : private boost::noncopyable
+{
+ aio_context_t ctx = 0;
+
+ AIOContext() = default;
+ explicit AIOContext(unsigned int nr_events);
+ ~AIOContext();
+ AIOContext(AIOContext && rhs) noexcept;
+ AIOContext & operator=(AIOContext && rhs) noexcept;
+};
+
+#elif defined(OS_FREEBSD)
+
+# include <aio.h>
+# include <sys/event.h>
+# include <sys/time.h>
+# include <sys/types.h>
+
+typedef struct kevent io_event;
+typedef int aio_context_t;
+
+struct iocb
+{
+ struct aiocb aio;
+ long aio_data;
+};
+
+int io_setup(void);
+
+int io_destroy(int ctx);
+
+/// last argument is an array of pointers technically speaking
+int io_submit(int ctx, long nr, struct iocb * iocbpp[]);
+
+int io_getevents(int ctx, long min_nr, long max_nr, struct kevent * events, struct timespec * timeout);
+
+
+struct AIOContext : private boost::noncopyable
+{
+ int ctx;
+
+ AIOContext(unsigned int nr_events = 128);
+ ~AIOContext();
+};
+
+#endif
diff --git a/contrib/clickhouse/src/IO/Archives/ArchiveUtils.h b/contrib/clickhouse/src/IO/Archives/ArchiveUtils.h
new file mode 100644
index 0000000000..00bebcc890
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/ArchiveUtils.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_LIBARCHIVE
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+
+#error #include <archive.h>
+#error #include <archive_entry.h>
+#endif
+#endif
diff --git a/contrib/clickhouse/src/IO/Archives/IArchiveReader.h b/contrib/clickhouse/src/IO/Archives/IArchiveReader.h
new file mode 100644
index 0000000000..84a1dc21f5
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/IArchiveReader.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <boost/noncopyable.hpp>
+#include <base/types.h>
+#include <functional>
+#include <memory>
+
+
+namespace DB
+{
+class ReadBuffer;
+class ReadBufferFromFileBase;
+class SeekableReadBuffer;
+
+/// Interface for reading an archive.
+class IArchiveReader : public std::enable_shared_from_this<IArchiveReader>, boost::noncopyable
+{
+public:
+ virtual ~IArchiveReader() = default;
+
+ /// Returns true if there is a specified file in the archive.
+ virtual bool fileExists(const String & filename) = 0;
+
+ struct FileInfo
+ {
+ UInt64 uncompressed_size;
+ UInt64 compressed_size;
+ bool is_encrypted;
+ };
+
+ /// Returns the information about a file stored in the archive.
+ virtual FileInfo getFileInfo(const String & filename) = 0;
+
+ class FileEnumerator
+ {
+ public:
+ virtual ~FileEnumerator() = default;
+ virtual const String & getFileName() const = 0;
+ virtual const FileInfo & getFileInfo() const = 0;
+ virtual bool nextFile() = 0;
+ };
+
+ virtual const std::string & getPath() const = 0;
+
+ /// Starts enumerating files in the archive.
+ virtual std::unique_ptr<FileEnumerator> firstFile() = 0;
+
+ using NameFilter = std::function<bool(const std::string &)>;
+
+ /// Starts reading a file from the archive. The function returns a read buffer,
+ /// you can read that buffer to extract uncompressed data from the archive.
+ /// Several read buffers can be used at the same time in parallel.
+ virtual std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename, bool throw_on_not_found) = 0;
+ virtual std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter, bool throw_on_not_found) = 0;
+
+ /// It's possible to convert a file enumerator to a read buffer and vice versa.
+ virtual std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) = 0;
+ virtual std::unique_ptr<FileEnumerator> nextFile(std::unique_ptr<ReadBuffer> read_buffer) = 0;
+
+ virtual std::vector<std::string> getAllFiles() = 0;
+ virtual std::vector<std::string> getAllFiles(NameFilter filter) = 0;
+
+ /// Sets password used to decrypt files in the archive.
+ virtual void setPassword(const String & /* password */) {}
+
+ using ReadArchiveFunction = std::function<std::unique_ptr<SeekableReadBuffer>()>;
+};
+
+}
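
A short, hypothetical sketch of how this interface is driven (the reader instance would come from createArchiveReader(), added elsewhere in this commit; the iostream output is illustrative only):

    #include <IO/Archives/IArchiveReader.h>
    #include <iostream>
    #include <memory>

    /// Enumerate every entry of an archive through the interface above.
    void listEntries(const std::shared_ptr<DB::IArchiveReader> & reader)
    {
        if (auto enumerator = reader->firstFile())   /// nullptr means the archive is empty
        {
            do
            {
                const auto & info = enumerator->getFileInfo();
                std::cout << enumerator->getFileName() << ": "
                          << info.uncompressed_size << " bytes uncompressed\n";
            } while (enumerator->nextFile());
        }
    }
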
diff --git a/contrib/clickhouse/src/IO/Archives/IArchiveWriter.h b/contrib/clickhouse/src/IO/Archives/IArchiveWriter.h
new file mode 100644
index 0000000000..d7ff038e7b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/IArchiveWriter.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <boost/noncopyable.hpp>
+#include <base/types.h>
+#include <memory>
+
+
+namespace DB
+{
+class WriteBufferFromFileBase;
+
+/// Interface for writing an archive.
+class IArchiveWriter : public std::enable_shared_from_this<IArchiveWriter>, boost::noncopyable
+{
+public:
+    /// Destructor finalizes writing the archive.
+ virtual ~IArchiveWriter() = default;
+
+ /// Starts writing a file to the archive. The function returns a write buffer,
+ /// any data written to that buffer will be compressed and then put to the archive.
+    /// You can keep only one such buffer at a time; the buffer returned by a previous call
+    /// to `writeFile()` must be destroyed before the next call to `writeFile()`.
+ virtual std::unique_ptr<WriteBufferFromFileBase> writeFile(const String & filename) = 0;
+
+ /// Returns true if there is an active instance of WriteBuffer returned by writeFile().
+ /// This function should be used mostly for debugging purposes.
+ virtual bool isWritingFile() const = 0;
+
+ static constexpr const int kDefaultCompressionLevel = -1;
+
+ /// Sets compression method and level.
+ /// Changing them will affect next file in the archive.
+ virtual void setCompression(const String & /* compression_method */, int /* compression_level */ = kDefaultCompressionLevel) {}
+
+ /// Sets password. If the password is not empty it will enable encryption in the archive.
+ virtual void setPassword(const String & /* password */) {}
+};
+
+}
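
A minimal sketch of the one-buffer-at-a-time contract described above (the entry name is hypothetical; it assumes the usual DB::WriteBuffer write()/finalize() API and a writer obtained from createArchiveWriter(), added elsewhere in this commit):

    #include <IO/Archives/IArchiveWriter.h>
    #include <IO/WriteBufferFromFileBase.h>
    #include <cstring>
    #include <memory>

    /// Append a single compressed entry to an archive being written.
    void writeOneEntry(const std::shared_ptr<DB::IArchiveWriter> & writer)
    {
        auto out = writer->writeFile("data.txt");
        const char payload[] = "hello, archive";
        out->write(payload, std::strlen(payload));
        out->finalize();   /// must be finished before the next writeFile() call
    }
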
diff --git a/contrib/clickhouse/src/IO/Archives/LibArchiveReader.cpp b/contrib/clickhouse/src/IO/Archives/LibArchiveReader.cpp
new file mode 100644
index 0000000000..a411b4bb4b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/LibArchiveReader.cpp
@@ -0,0 +1,354 @@
+#include <IO/Archives/LibArchiveReader.h>
+#include <IO/ReadBufferFromFileBase.h>
+#include <Common/quoteString.h>
+#include <Common/scope_guard_safe.h>
+
+#include <IO/Archives/ArchiveUtils.h>
+
+#include <mutex>
+
+namespace DB
+{
+
+#if USE_LIBARCHIVE
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_UNPACK_ARCHIVE;
+ extern const int LOGICAL_ERROR;
+ extern const int CANNOT_READ_ALL_DATA;
+ extern const int UNSUPPORTED_METHOD;
+}
+
+class LibArchiveReader::Handle
+{
+public:
+ explicit Handle(std::string path_to_archive_, bool lock_on_reading_)
+ : path_to_archive(path_to_archive_), lock_on_reading(lock_on_reading_)
+ {
+ current_archive = open(path_to_archive);
+ }
+
+ Handle(const Handle &) = delete;
+ Handle(Handle && other) noexcept
+ : current_archive(other.current_archive)
+ , current_entry(other.current_entry)
+ , lock_on_reading(other.lock_on_reading)
+ {
+ other.current_archive = nullptr;
+ other.current_entry = nullptr;
+ }
+
+ ~Handle()
+ {
+ close(current_archive);
+ }
+
+ bool locateFile(const std::string & filename)
+ {
+ return locateFile([&](const std::string & file) { return file == filename; });
+ }
+
+ bool locateFile(NameFilter filter)
+ {
+ resetFileInfo();
+ int err = ARCHIVE_OK;
+ while (true)
+ {
+ err = readNextHeader(current_archive, &current_entry);
+
+ if (err == ARCHIVE_RETRY)
+ continue;
+
+ if (err != ARCHIVE_OK)
+ break;
+
+ if (filter(archive_entry_pathname(current_entry)))
+ return true;
+ }
+
+ checkError(err);
+ return false;
+ }
+
+ bool nextFile()
+ {
+ resetFileInfo();
+ int err = ARCHIVE_OK;
+ do
+ {
+ err = readNextHeader(current_archive, &current_entry);
+ } while (err == ARCHIVE_RETRY);
+
+ checkError(err);
+ return err == ARCHIVE_OK;
+ }
+
+ std::vector<std::string> getAllFiles(NameFilter filter)
+ {
+ auto * archive = open(path_to_archive);
+ SCOPE_EXIT(
+ close(archive);
+ );
+
+ struct archive_entry * entry = nullptr;
+
+ std::vector<std::string> files;
+ int error = readNextHeader(archive, &entry);
+ while (error == ARCHIVE_OK || error == ARCHIVE_RETRY)
+ {
+ chassert(entry != nullptr);
+ std::string name = archive_entry_pathname(entry);
+ if (!filter || filter(name))
+ files.push_back(std::move(name));
+
+ error = readNextHeader(archive, &entry);
+ }
+
+ checkError(error);
+ return files;
+ }
+
+ const String & getFileName() const
+ {
+ chassert(current_entry);
+ if (!file_name)
+ file_name.emplace(archive_entry_pathname(current_entry));
+
+ return *file_name;
+ }
+
+ const FileInfo & getFileInfo() const
+ {
+ chassert(current_entry);
+ if (!file_info)
+ {
+ file_info.emplace();
+ file_info->uncompressed_size = archive_entry_size(current_entry);
+ file_info->compressed_size = archive_entry_size(current_entry);
+ file_info->is_encrypted = false;
+ }
+
+ return *file_info;
+ }
+
+ struct archive * current_archive;
+ struct archive_entry * current_entry = nullptr;
+private:
+ void checkError(int error) const
+ {
+ if (error == ARCHIVE_FATAL)
+ throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Failed to read archive while fetching all files: {}", archive_error_string(current_archive));
+ }
+
+ void resetFileInfo()
+ {
+ file_name.reset();
+ file_info.reset();
+ }
+
+ static struct archive * open(const String & path_to_archive)
+ {
+ auto * archive = archive_read_new();
+ try
+ {
+ archive_read_support_filter_all(archive);
+ archive_read_support_format_all(archive);
+ if (archive_read_open_filename(archive, path_to_archive.c_str(), 10240) != ARCHIVE_OK)
+ throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't open archive {}: {}", quoteString(path_to_archive), archive_error_string(archive));
+ }
+ catch (...)
+ {
+ close(archive);
+ throw;
+ }
+
+ return archive;
+ }
+
+ static void close(struct archive * archive)
+ {
+ if (archive)
+ {
+ archive_read_close(archive);
+ archive_read_free(archive);
+ }
+ }
+
+ int readNextHeader(struct archive * archive, struct archive_entry ** entry) const
+ {
+ std::unique_lock lock(Handle::read_lock, std::defer_lock);
+ if (lock_on_reading)
+ lock.lock();
+
+ return archive_read_next_header(archive, entry);
+ }
+
+ const String path_to_archive;
+
+    /// For some archive types libarchive uses static variables while reading headers,
+    /// and those are not thread-safe.
+ const bool lock_on_reading;
+ static inline std::mutex read_lock;
+
+ mutable std::optional<String> file_name;
+ mutable std::optional<FileInfo> file_info;
+};
+
+class LibArchiveReader::FileEnumeratorImpl : public FileEnumerator
+{
+public:
+ explicit FileEnumeratorImpl(Handle handle_) : handle(std::move(handle_)) {}
+
+ const String & getFileName() const override { return handle.getFileName(); }
+ const FileInfo & getFileInfo() const override { return handle.getFileInfo(); }
+ bool nextFile() override { return handle.nextFile(); }
+
+ /// Releases owned handle to pass it to a read buffer.
+ Handle releaseHandle() && { return std::move(handle); }
+private:
+ Handle handle;
+};
+
+class LibArchiveReader::ReadBufferFromLibArchive : public ReadBufferFromFileBase
+{
+public:
+ explicit ReadBufferFromLibArchive(Handle handle_, std::string path_to_archive_)
+ : ReadBufferFromFileBase(DBMS_DEFAULT_BUFFER_SIZE, nullptr, 0)
+ , handle(std::move(handle_))
+ , path_to_archive(std::move(path_to_archive_))
+ {}
+
+ off_t seek(off_t /* off */, int /* whence */) override
+ {
+ throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Seek is not supported when reading from archive");
+ }
+
+ off_t getPosition() override
+ {
+ throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "getPosition not supported when reading from archive");
+ }
+
+ String getFileName() const override { return handle.getFileName(); }
+
+ size_t getFileSize() override { return handle.getFileInfo().uncompressed_size; }
+
+ Handle releaseHandle() &&
+ {
+ return std::move(handle);
+ }
+
+private:
+ bool nextImpl() override
+ {
+ auto bytes_read = archive_read_data(handle.current_archive, internal_buffer.begin(), static_cast<int>(internal_buffer.size()));
+
+ if (bytes_read < 0)
+ throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Failed to read file {} from {}: {}", handle.getFileName(), path_to_archive, archive_error_string(handle.current_archive));
+
+ if (!bytes_read)
+ return false;
+
+        total_bytes_read += bytes_read;
+
+ working_buffer = internal_buffer;
+ working_buffer.resize(bytes_read);
+ return true;
+ }
+
+ Handle handle;
+ const String path_to_archive;
+ size_t total_bytes_read = 0;
+};
+
+LibArchiveReader::LibArchiveReader(std::string archive_name_, bool lock_on_reading_, std::string path_to_archive_)
+ : archive_name(std::move(archive_name_)), lock_on_reading(lock_on_reading_), path_to_archive(std::move(path_to_archive_))
+{}
+
+LibArchiveReader::~LibArchiveReader() = default;
+
+const std::string & LibArchiveReader::getPath() const
+{
+ return path_to_archive;
+}
+
+bool LibArchiveReader::fileExists(const String & filename)
+{
+ Handle handle(path_to_archive, lock_on_reading);
+ return handle.locateFile(filename);
+}
+
+LibArchiveReader::FileInfo LibArchiveReader::getFileInfo(const String & filename)
+{
+ Handle handle(path_to_archive, lock_on_reading);
+ if (!handle.locateFile(filename))
+ throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't unpack archive {}: file not found", path_to_archive);
+ return handle.getFileInfo();
+}
+
+std::unique_ptr<LibArchiveReader::FileEnumerator> LibArchiveReader::firstFile()
+{
+ Handle handle(path_to_archive, lock_on_reading);
+ if (!handle.nextFile())
+ return nullptr;
+
+ return std::make_unique<FileEnumeratorImpl>(std::move(handle));
+}
+
+std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader::readFile(const String & filename, bool throw_on_not_found)
+{
+ return readFile([&](const std::string & file) { return file == filename; }, throw_on_not_found);
+}
+
+std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader::readFile(NameFilter filter, bool throw_on_not_found)
+{
+ Handle handle(path_to_archive, lock_on_reading);
+ if (!handle.locateFile(filter))
+ {
+ if (throw_on_not_found)
+ throw Exception(
+ ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't unpack archive {}: no file found satisfying the filter", path_to_archive);
+ return nullptr;
+ }
+ return std::make_unique<ReadBufferFromLibArchive>(std::move(handle), path_to_archive);
+}
+
+std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader::readFile(std::unique_ptr<FileEnumerator> enumerator)
+{
+ if (!dynamic_cast<FileEnumeratorImpl *>(enumerator.get()))
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong enumerator passed to readFile()");
+ auto enumerator_impl = std::unique_ptr<FileEnumeratorImpl>(static_cast<FileEnumeratorImpl *>(enumerator.release()));
+ auto handle = std::move(*enumerator_impl).releaseHandle();
+ return std::make_unique<ReadBufferFromLibArchive>(std::move(handle), path_to_archive);
+}
+
+std::unique_ptr<LibArchiveReader::FileEnumerator> LibArchiveReader::nextFile(std::unique_ptr<ReadBuffer> read_buffer)
+{
+ if (!dynamic_cast<ReadBufferFromLibArchive *>(read_buffer.get()))
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong ReadBuffer passed to nextFile()");
+ auto read_buffer_from_libarchive = std::unique_ptr<ReadBufferFromLibArchive>(static_cast<ReadBufferFromLibArchive *>(read_buffer.release()));
+ auto handle = std::move(*read_buffer_from_libarchive).releaseHandle();
+ if (!handle.nextFile())
+ return nullptr;
+ return std::make_unique<FileEnumeratorImpl>(std::move(handle));
+}
+
+std::vector<std::string> LibArchiveReader::getAllFiles()
+{
+ return getAllFiles({});
+}
+
+std::vector<std::string> LibArchiveReader::getAllFiles(NameFilter filter)
+{
+ Handle handle(path_to_archive, lock_on_reading);
+ return handle.getAllFiles(filter);
+}
+
+void LibArchiveReader::setPassword(const String & /*password_*/)
+{
+    throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot set password for {} archive", archive_name);
+}
+
+#endif
+
+}
diff --git a/contrib/clickhouse/src/IO/Archives/LibArchiveReader.h b/contrib/clickhouse/src/IO/Archives/LibArchiveReader.h
new file mode 100644
index 0000000000..5d5e9a5a25
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/LibArchiveReader.h
@@ -0,0 +1,78 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#include <IO/Archives/IArchiveReader.h>
+
+
+namespace DB
+{
+
+#if USE_LIBARCHIVE
+
+class ReadBuffer;
+class ReadBufferFromFileBase;
+class SeekableReadBuffer;
+
+/// Implementation of IArchiveReader for reading archives using libarchive.
+class LibArchiveReader : public IArchiveReader
+{
+public:
+ ~LibArchiveReader() override;
+
+ const std::string & getPath() const override;
+
+ /// Returns true if there is a specified file in the archive.
+ bool fileExists(const String & filename) override;
+
+ /// Returns the information about a file stored in the archive.
+ FileInfo getFileInfo(const String & filename) override;
+
+ /// Starts enumerating files in the archive.
+ std::unique_ptr<FileEnumerator> firstFile() override;
+
+ /// Starts reading a file from the archive. The function returns a read buffer,
+ /// you can read that buffer to extract uncompressed data from the archive.
+ /// Several read buffers can be used at the same time in parallel.
+ std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename, bool throw_on_not_found) override;
+ std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter, bool throw_on_not_found) override;
+
+ /// It's possible to convert a file enumerator to a read buffer and vice versa.
+ std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) override;
+ std::unique_ptr<FileEnumerator> nextFile(std::unique_ptr<ReadBuffer> read_buffer) override;
+
+ std::vector<std::string> getAllFiles() override;
+ std::vector<std::string> getAllFiles(NameFilter filter) override;
+
+ /// Sets password used to decrypt the contents of the files in the archive.
+ void setPassword(const String & password_) override;
+
+protected:
+ /// Constructs an archive's reader that will read from a file in the local filesystem.
+ LibArchiveReader(std::string archive_name_, bool lock_on_reading_, std::string path_to_archive_);
+
+private:
+ class ReadBufferFromLibArchive;
+ class Handle;
+ class FileEnumeratorImpl;
+
+ const std::string archive_name;
+ const bool lock_on_reading;
+ const String path_to_archive;
+};
+
+class TarArchiveReader : public LibArchiveReader
+{
+public:
+ explicit TarArchiveReader(std::string path_to_archive) : LibArchiveReader("tar", /*lock_on_reading_=*/ true, std::move(path_to_archive)) { }
+};
+
+class SevenZipArchiveReader : public LibArchiveReader
+{
+public:
+ explicit SevenZipArchiveReader(std::string path_to_archive) : LibArchiveReader("7z", /*lock_on_reading_=*/ false, std::move(path_to_archive)) { }
+};
+
+#endif
+
+}
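
For illustration (not part of the diff), reading one member of a local .tar through the TarArchiveReader subclass above; the paths are made up and the readStringUntilEOF() helper from ReadHelpers.h is assumed:

    #include <IO/Archives/LibArchiveReader.h>
    #include <IO/ReadHelpers.h>
    #include <memory>
    #include <string>

    /// Extract a single file from a tar archive into a string.
    std::string readCsvFromTar()
    {
        auto reader = std::make_shared<DB::TarArchiveReader>("/tmp/example.tar");
        auto in = reader->readFile("dir/file.csv", /* throw_on_not_found= */ true);

        std::string contents;
        DB::readStringUntilEOF(contents, *in);
        return contents;
    }
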
diff --git a/contrib/clickhouse/src/IO/Archives/ZipArchiveReader.cpp b/contrib/clickhouse/src/IO/Archives/ZipArchiveReader.cpp
new file mode 100644
index 0000000000..970211f06b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/ZipArchiveReader.cpp
@@ -0,0 +1,662 @@
+#include <IO/Archives/ZipArchiveReader.h>
+
+#if USE_MINIZIP
+#include <IO/Archives/ZipArchiveWriter.h>
+#include <IO/ReadBufferFromFileBase.h>
+#include <Common/quoteString.h>
+#include <base/errnoToString.h>
+#error #include <unzip.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int CANNOT_UNPACK_ARCHIVE;
+ extern const int LOGICAL_ERROR;
+ extern const int SEEK_POSITION_OUT_OF_BOUND;
+}
+
+using RawHandle = unzFile;
+
+
+namespace
+{
+ void checkCompressionMethodIsEnabled(int compression_method_)
+ {
+ ZipArchiveWriter::checkCompressionMethodIsEnabled(compression_method_);
+ }
+
+ void checkEncryptionIsEnabled()
+ {
+ ZipArchiveWriter::checkEncryptionIsEnabled();
+ }
+}
+
+
+/// Holds a raw handle, calls acquireRawHandle() in the constructor and releaseRawHandle() in the destructor.
+class ZipArchiveReader::HandleHolder
+{
+public:
+ HandleHolder() = default;
+
+ explicit HandleHolder(const std::shared_ptr<ZipArchiveReader> & reader_) : reader(reader_), raw_handle(reader->acquireRawHandle()) { }
+
+ ~HandleHolder()
+ {
+ if (raw_handle)
+ {
+ try
+ {
+ closeFile();
+ }
+ catch (...)
+ {
+ tryLogCurrentException("ZipArchiveReader");
+ }
+ reader->releaseRawHandle(raw_handle);
+ }
+ }
+
+ HandleHolder(HandleHolder && src) noexcept
+ {
+ *this = std::move(src);
+ }
+
+ HandleHolder & operator=(HandleHolder && src) noexcept
+ {
+ reader = std::exchange(src.reader, nullptr);
+ raw_handle = std::exchange(src.raw_handle, nullptr);
+ file_name = std::exchange(src.file_name, {});
+ file_info = std::exchange(src.file_info, {});
+ return *this;
+ }
+
+ RawHandle getRawHandle() const { return raw_handle; }
+ std::shared_ptr<ZipArchiveReader> getReader() const { return reader; }
+
+ bool locateFile(const String & file_name_)
+ {
+ resetFileInfo();
+ bool case_sensitive = true;
+ int err = unzLocateFile(raw_handle, file_name_.c_str(), reinterpret_cast<unzFileNameComparer>(static_cast<size_t>(case_sensitive)));
+ if (err == UNZ_END_OF_LIST_OF_FILE)
+ return false;
+ file_name = file_name_;
+ return true;
+ }
+
+ bool locateFile(NameFilter filter)
+ {
+ int err = unzGoToFirstFile(raw_handle);
+ if (err == UNZ_END_OF_LIST_OF_FILE)
+ return false;
+
+ do
+ {
+ checkResult(err);
+ resetFileInfo();
+ retrieveFileInfo();
+ if (filter(getFileName()))
+ return true;
+
+ err = unzGoToNextFile(raw_handle);
+ } while (err != UNZ_END_OF_LIST_OF_FILE);
+
+ return false;
+ }
+
+ bool tryLocateFile(const String & file_name_)
+ {
+ resetFileInfo();
+ bool case_sensitive = true;
+ int err = unzLocateFile(raw_handle, file_name_.c_str(), reinterpret_cast<unzFileNameComparer>(static_cast<size_t>(case_sensitive)));
+ if (err == UNZ_END_OF_LIST_OF_FILE)
+ return false;
+ checkResult(err);
+ file_name = file_name_;
+ return true;
+ }
+
+ bool firstFile()
+ {
+ resetFileInfo();
+ int err = unzGoToFirstFile(raw_handle);
+ if (err == UNZ_END_OF_LIST_OF_FILE)
+ return false;
+ checkResult(err);
+ return true;
+ }
+
+ bool nextFile()
+ {
+ resetFileInfo();
+ int err = unzGoToNextFile(raw_handle);
+ if (err == UNZ_END_OF_LIST_OF_FILE)
+ return false;
+ checkResult(err);
+ return true;
+ }
+
+ const String & getFileName() const
+ {
+ if (!file_name)
+ retrieveFileInfo();
+ return *file_name;
+ }
+
+ const FileInfoImpl & getFileInfo() const
+ {
+ if (!file_info)
+ retrieveFileInfo();
+ return *file_info;
+ }
+
+ std::vector<std::string> getAllFiles(NameFilter filter)
+ {
+ std::vector<std::string> files;
+ resetFileInfo();
+ int err = unzGoToFirstFile(raw_handle);
+ if (err == UNZ_END_OF_LIST_OF_FILE)
+ return files;
+
+ do
+ {
+ checkResult(err);
+ resetFileInfo();
+ retrieveFileInfo();
+ if (!filter || filter(getFileName()))
+ files.push_back(*file_name);
+ err = unzGoToNextFile(raw_handle);
+ } while (err != UNZ_END_OF_LIST_OF_FILE);
+
+ return files;
+ }
+
+ void closeFile()
+ {
+ int err = unzCloseCurrentFile(raw_handle);
+ /// If err == UNZ_PARAMERROR the file is already closed.
+ if (err != UNZ_PARAMERROR)
+ checkResult(err);
+ }
+
+ void checkResult(int code) const { reader->checkResult(code); }
+ [[noreturn]] void showError(const String & message) const { reader->showError(message); }
+
+private:
+ void retrieveFileInfo() const
+ {
+ if (file_name && file_info)
+ return;
+ unz_file_info64 finfo;
+ int err = unzGetCurrentFileInfo64(raw_handle, &finfo, nullptr, 0, nullptr, 0, nullptr, 0);
+ if (err == UNZ_PARAMERROR)
+ showError("No current file");
+ checkResult(err);
+ if (!file_info)
+ {
+ file_info.emplace();
+ file_info->uncompressed_size = finfo.uncompressed_size;
+ file_info->compressed_size = finfo.compressed_size;
+ file_info->compression_method = finfo.compression_method;
+ file_info->is_encrypted = (finfo.flag & MZ_ZIP_FLAG_ENCRYPTED);
+ }
+ if (!file_name)
+ {
+ file_name.emplace();
+ file_name->resize(finfo.size_filename);
+ checkResult(unzGetCurrentFileInfo64(raw_handle, nullptr, file_name->data(), finfo.size_filename, nullptr, 0, nullptr, 0));
+ }
+ }
+
+ void resetFileInfo()
+ {
+ file_info.reset();
+ file_name.reset();
+ }
+
+ std::shared_ptr<ZipArchiveReader> reader;
+ RawHandle raw_handle = nullptr;
+ mutable std::optional<String> file_name;
+ mutable std::optional<FileInfoImpl> file_info;
+};
+
+
+/// This class represents a ReadBuffer actually returned by readFile().
+class ZipArchiveReader::ReadBufferFromZipArchive : public ReadBufferFromFileBase
+{
+public:
+ explicit ReadBufferFromZipArchive(HandleHolder && handle_)
+ : ReadBufferFromFileBase(DBMS_DEFAULT_BUFFER_SIZE, nullptr, 0)
+ , handle(std::move(handle_))
+ {
+ const auto & file_info = handle.getFileInfo();
+ checkCompressionMethodIsEnabled(file_info.compression_method);
+
+ const char * password_cstr = nullptr;
+ if (file_info.is_encrypted)
+ {
+ const auto & password_str = handle.getReader()->password;
+ if (password_str.empty())
+ showError("Password is required");
+ password_cstr = password_str.c_str();
+ checkEncryptionIsEnabled();
+ }
+
+ RawHandle raw_handle = handle.getRawHandle();
+ int err = unzOpenCurrentFilePassword(raw_handle, password_cstr);
+ if (err == MZ_PASSWORD_ERROR)
+ showError("Wrong password");
+ checkResult(err);
+ }
+
+ off_t seek(off_t off, int whence) override
+ {
+ off_t current_pos = getPosition();
+ off_t new_pos;
+ if (whence == SEEK_SET)
+ new_pos = off;
+ else if (whence == SEEK_CUR)
+ new_pos = off + current_pos;
+ else
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Only SEEK_SET and SEEK_CUR seek modes allowed.");
+
+ if (new_pos == current_pos)
+ return current_pos; /// The position is the same.
+
+ if (new_pos < 0)
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bound");
+
+ off_t working_buffer_start_pos = current_pos - offset();
+ off_t working_buffer_end_pos = current_pos + available();
+
+ if ((working_buffer_start_pos <= new_pos) && (new_pos <= working_buffer_end_pos))
+ {
+ /// The new position is still inside the buffer.
+ position() += new_pos - current_pos;
+ return new_pos;
+ }
+
+ RawHandle raw_handle = handle.getRawHandle();
+
+        /// Check that the new position is not beyond the end of the file.
+ const auto & file_info = handle.getFileInfo();
+ if (new_pos > static_cast<off_t>(file_info.uncompressed_size))
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bound");
+
+ if (file_info.compression_method == MZ_COMPRESS_METHOD_STORE)
+ {
+ /// unzSeek64() works only for non-compressed files.
+ checkResult(unzSeek64(raw_handle, off, whence));
+ return unzTell64(raw_handle);
+ }
+
+ /// As a last try we go slow way, we're going to simply ignore all data before the new position.
+ if (new_pos < current_pos)
+ {
+ checkResult(unzCloseCurrentFile(raw_handle));
+ checkResult(unzOpenCurrentFile(raw_handle));
+ current_pos = 0;
+ }
+
+ ignore(new_pos - current_pos);
+ return new_pos;
+ }
+
+ off_t getPosition() override
+ {
+ RawHandle raw_handle = handle.getRawHandle();
+ return unzTell64(raw_handle) - available();
+ }
+
+ String getFileName() const override { return handle.getFileName(); }
+
+ size_t getFileSize() override { return handle.getFileInfo().uncompressed_size; }
+
+ /// Releases owned handle to pass it to an enumerator.
+ HandleHolder releaseHandle() &&
+ {
+ handle.closeFile();
+ return std::move(handle);
+ }
+
+private:
+ bool nextImpl() override
+ {
+ RawHandle raw_handle = handle.getRawHandle();
+ auto bytes_read = unzReadCurrentFile(raw_handle, internal_buffer.begin(), static_cast<int>(internal_buffer.size()));
+
+ if (bytes_read < 0)
+ checkResult(bytes_read);
+
+ if (!bytes_read)
+ return false;
+
+ working_buffer = internal_buffer;
+ working_buffer.resize(bytes_read);
+ return true;
+ }
+
+ void checkResult(int code) const { handle.checkResult(code); }
+ [[noreturn]] void showError(const String & message) const { handle.showError(message); }
+
+ HandleHolder handle;
+};
+
+
+class ZipArchiveReader::FileEnumeratorImpl : public FileEnumerator
+{
+public:
+ explicit FileEnumeratorImpl(HandleHolder && handle_) : handle(std::move(handle_)) {}
+
+ const String & getFileName() const override { return handle.getFileName(); }
+ const FileInfo & getFileInfo() const override { return handle.getFileInfo(); }
+ bool nextFile() override { return handle.nextFile(); }
+
+ /// Releases owned handle to pass it to a read buffer.
+ HandleHolder releaseHandle() && { return std::move(handle); }
+
+private:
+ HandleHolder handle;
+};
+
+
+namespace
+{
+ /// Provides a set of functions allowing the minizip library to read its input
+ /// from a SeekableReadBuffer instead of an ordinary file in the local filesystem.
+ class StreamFromReadBuffer
+ {
+ public:
+ static RawHandle open(std::unique_ptr<SeekableReadBuffer> archive_read_buffer, UInt64 archive_size)
+ {
+ StreamFromReadBuffer::Opaque opaque{std::move(archive_read_buffer), archive_size};
+
+ zlib_filefunc64_def func_def;
+ func_def.zopen64_file = &StreamFromReadBuffer::openFileFunc;
+ func_def.zclose_file = &StreamFromReadBuffer::closeFileFunc;
+ func_def.zread_file = &StreamFromReadBuffer::readFileFunc;
+ func_def.zwrite_file = &StreamFromReadBuffer::writeFileFunc;
+ func_def.zseek64_file = &StreamFromReadBuffer::seekFunc;
+ func_def.ztell64_file = &StreamFromReadBuffer::tellFunc;
+ func_def.zerror_file = &StreamFromReadBuffer::testErrorFunc;
+ func_def.opaque = &opaque;
+
+ return unzOpen2_64(/* path= */ nullptr,
+ &func_def);
+ }
+
+ private:
+ std::unique_ptr<SeekableReadBuffer> read_buffer;
+ UInt64 start_offset = 0;
+ UInt64 total_size = 0;
+ bool at_end = false;
+
+ struct Opaque
+ {
+ std::unique_ptr<SeekableReadBuffer> read_buffer;
+ UInt64 total_size = 0;
+ };
+
+ static void * openFileFunc(void * opaque, const void *, int)
+ {
+ auto & opq = *reinterpret_cast<Opaque *>(opaque);
+ return new StreamFromReadBuffer(std::move(opq.read_buffer), opq.total_size);
+ }
+
+ StreamFromReadBuffer(std::unique_ptr<SeekableReadBuffer> read_buffer_, UInt64 total_size_)
+ : read_buffer(std::move(read_buffer_)), start_offset(read_buffer->getPosition()), total_size(total_size_) {}
+
+ static int closeFileFunc(void *, void * stream)
+ {
+ delete reinterpret_cast<StreamFromReadBuffer *>(stream);
+ return ZIP_OK;
+ }
+
+ static StreamFromReadBuffer & get(void * ptr)
+ {
+ return *reinterpret_cast<StreamFromReadBuffer *>(ptr);
+ }
+
+ static int testErrorFunc(void *, void *)
+ {
+ return ZIP_OK;
+ }
+
+ static unsigned long readFileFunc(void *, void * stream, void * buf, unsigned long size) // NOLINT(google-runtime-int)
+ {
+ auto & strm = get(stream);
+ if (strm.at_end)
+ return 0;
+ auto read_bytes = strm.read_buffer->read(reinterpret_cast<char *>(buf), size);
+ return read_bytes;
+ }
+
+ static ZPOS64_T tellFunc(void *, void * stream)
+ {
+ auto & strm = get(stream);
+ if (strm.at_end)
+ return strm.total_size;
+ auto pos = strm.read_buffer->getPosition() - strm.start_offset;
+ return pos;
+ }
+
+ static long seekFunc(void *, void * stream, ZPOS64_T offset, int origin) // NOLINT(google-runtime-int)
+ {
+ auto & strm = get(stream);
+ if (origin == SEEK_END)
+ {
+ /// Our implementations of SeekableReadBuffer don't support SEEK_END,
+ /// but the minizip library needs it, so we have to simulate it here.
+ strm.at_end = true;
+ return ZIP_OK;
+ }
+ strm.at_end = false;
+ if (origin == SEEK_SET)
+ offset += strm.start_offset;
+ strm.read_buffer->seek(offset, origin);
+ return ZIP_OK;
+ }
+
+ static unsigned long writeFileFunc(void *, void *, const void *, unsigned long) // NOLINT(google-runtime-int)
+ {
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "StreamFromReadBuffer::writeFile must not be called");
+ }
+ };
+}
+
+
+ZipArchiveReader::ZipArchiveReader(const String & path_to_archive_)
+ : path_to_archive(path_to_archive_)
+{
+ init();
+
+}
+
+ZipArchiveReader::ZipArchiveReader(
+ const String & path_to_archive_, const ReadArchiveFunction & archive_read_function_, UInt64 archive_size_)
+ : path_to_archive(path_to_archive_), archive_read_function(archive_read_function_), archive_size(archive_size_)
+{
+ init();
+}
+
+void ZipArchiveReader::init()
+{
+ /// Prepare the first handle in `free_handles` and check that the archive can be read.
+ releaseRawHandle(acquireRawHandle());
+}
+
+ZipArchiveReader::~ZipArchiveReader()
+{
+ /// Close all `free_handles`.
+ for (RawHandle free_handle : free_handles)
+ {
+ try
+ {
+ checkResult(unzClose(free_handle));
+ }
+ catch (...)
+ {
+ tryLogCurrentException("ZipArchiveReader");
+ }
+ }
+}
+
+const std::string & ZipArchiveReader::getPath() const
+{
+ return path_to_archive;
+}
+
+bool ZipArchiveReader::fileExists(const String & filename)
+{
+ return acquireHandle().tryLocateFile(filename);
+}
+
+ZipArchiveReader::FileInfo ZipArchiveReader::getFileInfo(const String & filename)
+{
+ auto handle = acquireHandle();
+ if (!handle.locateFile(filename))
+ showError(fmt::format("File {} was not found in archive", quoteString(filename)));
+
+ return handle.getFileInfo();
+}
+
+std::unique_ptr<ZipArchiveReader::FileEnumerator> ZipArchiveReader::firstFile()
+{
+ auto handle = acquireHandle();
+ if (!handle.firstFile())
+ return nullptr;
+ return std::make_unique<FileEnumeratorImpl>(std::move(handle));
+}
+
+std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(const String & filename, bool throw_on_not_found)
+{
+ auto handle = acquireHandle();
+ if (!handle.locateFile(filename))
+ {
+ if (throw_on_not_found)
+ showError(fmt::format("File {} was not found in archive", quoteString(filename)));
+
+ return nullptr;
+ }
+
+ return std::make_unique<ReadBufferFromZipArchive>(std::move(handle));
+}
+
+std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(NameFilter filter, bool throw_on_not_found)
+{
+ auto handle = acquireHandle();
+ if (!handle.locateFile(filter))
+ {
+ if (throw_on_not_found)
+ showError(fmt::format("No file satisfying filter in archive"));
+
+ return nullptr;
+ }
+
+ return std::make_unique<ReadBufferFromZipArchive>(std::move(handle));
+}
+
+std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(std::unique_ptr<FileEnumerator> enumerator)
+{
+ if (!dynamic_cast<FileEnumeratorImpl *>(enumerator.get()))
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong enumerator passed to readFile()");
+ auto enumerator_impl = std::unique_ptr<FileEnumeratorImpl>(static_cast<FileEnumeratorImpl *>(enumerator.release()));
+ auto handle = std::move(*enumerator_impl).releaseHandle();
+ return std::make_unique<ReadBufferFromZipArchive>(std::move(handle));
+}
+
+std::unique_ptr<ZipArchiveReader::FileEnumerator> ZipArchiveReader::nextFile(std::unique_ptr<ReadBuffer> read_buffer)
+{
+ if (!dynamic_cast<ReadBufferFromZipArchive *>(read_buffer.get()))
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong ReadBuffer passed to nextFile()");
+ auto read_buffer_from_zip = std::unique_ptr<ReadBufferFromZipArchive>(static_cast<ReadBufferFromZipArchive *>(read_buffer.release()));
+ auto handle = std::move(*read_buffer_from_zip).releaseHandle();
+ if (!handle.nextFile())
+ return nullptr;
+ return std::make_unique<FileEnumeratorImpl>(std::move(handle));
+}
+
+std::vector<std::string> ZipArchiveReader::getAllFiles()
+{
+ return getAllFiles({});
+}
+
+std::vector<std::string> ZipArchiveReader::getAllFiles(NameFilter filter)
+{
+ auto handle = acquireHandle();
+ return handle.getAllFiles(filter);
+}
+
+void ZipArchiveReader::setPassword(const String & password_)
+{
+ std::lock_guard lock{mutex};
+ password = password_;
+}
+
+ZipArchiveReader::HandleHolder ZipArchiveReader::acquireHandle()
+{
+ return HandleHolder{std::static_pointer_cast<ZipArchiveReader>(shared_from_this())};
+}
+
+ZipArchiveReader::RawHandle ZipArchiveReader::acquireRawHandle()
+{
+ std::lock_guard lock{mutex};
+
+ if (!free_handles.empty())
+ {
+ RawHandle free_handle = free_handles.back();
+ free_handles.pop_back();
+ return free_handle;
+ }
+
+ RawHandle new_handle = nullptr;
+ if (archive_read_function)
+ new_handle = StreamFromReadBuffer::open(archive_read_function(), archive_size);
+ else
+ new_handle = unzOpen64(path_to_archive.c_str());
+
+ if (!new_handle)
+ throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't open zip archive {}", quoteString(path_to_archive));
+
+ return new_handle;
+}
+
+void ZipArchiveReader::releaseRawHandle(RawHandle handle_)
+{
+ if (!handle_)
+ return;
+
+ std::lock_guard lock{mutex};
+ free_handles.push_back(handle_);
+}
+
+void ZipArchiveReader::checkResult(int code) const
+{
+ if (code >= UNZ_OK)
+ return;
+
+ String message = "Code = ";
+ switch (code)
+ {
+ case UNZ_OK: return;
+ case UNZ_ERRNO: message += "ERRNO, errno = " + errnoToString(); break;
+ case UNZ_PARAMERROR: message += "PARAMERROR"; break;
+ case UNZ_BADZIPFILE: message += "BADZIPFILE"; break;
+ case UNZ_INTERNALERROR: message += "INTERNALERROR"; break;
+ case UNZ_CRCERROR: message += "CRCERROR"; break;
+ case UNZ_BADPASSWORD: message += "BADPASSWORD"; break;
+ default: message += std::to_string(code); break;
+ }
+ showError(message);
+}
+
+void ZipArchiveReader::showError(const String & message) const
+{
+ throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't unpack zip archive {}: {}", quoteString(path_to_archive), message);
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/Archives/ZipArchiveReader.h b/contrib/clickhouse/src/IO/Archives/ZipArchiveReader.h
new file mode 100644
index 0000000000..74fa26b6fe
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/ZipArchiveReader.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_MINIZIP
+#include <IO/Archives/IArchiveReader.h>
+#include <mutex>
+#include <vector>
+
+
+namespace DB
+{
+class ReadBuffer;
+class ReadBufferFromFileBase;
+class SeekableReadBuffer;
+
+/// Implementation of IArchiveReader for reading zip archives.
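+///
+/// Usage sketch (illustrative only; the path and file name below are assumptions,
+/// not taken from these sources). The reader uses shared_from_this(), so it must be
+/// owned by a std::shared_ptr, e.g. one obtained via createArchiveReader():
+///
+///     auto reader = createArchiveReader("/path/to/archive.zip");
+///     if (auto in = reader->readFile("data.csv", /* throw_on_not_found= */ false))
+///     {
+///         String contents;
+///         readStringUntilEOF(contents, *in);
+///     }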
+class ZipArchiveReader : public IArchiveReader
+{
+public:
+    /// Constructs an archive reader that will read from a file in the local filesystem.
+ explicit ZipArchiveReader(const String & path_to_archive_);
+
+    /// Constructs an archive reader that will read via a read buffer created by
+    /// a specified function.
+ ZipArchiveReader(const String & path_to_archive_, const ReadArchiveFunction & archive_read_function_, UInt64 archive_size_);
+
+ ~ZipArchiveReader() override;
+
+ const std::string & getPath() const override;
+
+ /// Returns true if there is a specified file in the archive.
+ bool fileExists(const String & filename) override;
+
+ /// Returns the information about a file stored in the archive.
+ FileInfo getFileInfo(const String & filename) override;
+
+ /// Starts enumerating files in the archive.
+ std::unique_ptr<FileEnumerator> firstFile() override;
+
+    /// Starts reading a file from the archive. The function returns a read buffer;
+    /// you can read from that buffer to extract uncompressed data from the archive.
+ /// Several read buffers can be used at the same time in parallel.
+ std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename, bool throw_on_not_found) override;
+ std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter, bool throw_on_not_found) override;
+
+ /// It's possible to convert a file enumerator to a read buffer and vice versa.
+ std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) override;
+ std::unique_ptr<FileEnumerator> nextFile(std::unique_ptr<ReadBuffer> read_buffer) override;
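+    ///
+    /// E.g. (an illustrative sketch) iterating over the archive while reading every file:
+    ///
+    ///     auto enumerator = reader->firstFile();
+    ///     while (enumerator)
+    ///     {
+    ///         auto in = reader->readFile(std::move(enumerator));
+    ///         /// ... consume *in ...
+    ///         enumerator = reader->nextFile(std::move(in));
+    ///     }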
+
+ std::vector<std::string> getAllFiles() override;
+ std::vector<std::string> getAllFiles(NameFilter filter) override;
+
+ /// Sets password used to decrypt the contents of the files in the archive.
+ void setPassword(const String & password_) override;
+
+private:
+ class ReadBufferFromZipArchive;
+ class FileEnumeratorImpl;
+ class HandleHolder;
+ using RawHandle = void *;
+
+ void init();
+
+ struct FileInfoImpl : public FileInfo
+ {
+ int compression_method;
+ };
+
+ HandleHolder acquireHandle();
+ RawHandle acquireRawHandle();
+ void releaseRawHandle(RawHandle handle_);
+
+ void checkResult(int code) const;
+ [[noreturn]] void showError(const String & message) const;
+
+ const String path_to_archive;
+ const ReadArchiveFunction archive_read_function;
+ const UInt64 archive_size = 0;
+ String password;
+ std::vector<RawHandle> free_handles;
+ mutable std::mutex mutex;
+};
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/Archives/ZipArchiveWriter.cpp b/contrib/clickhouse/src/IO/Archives/ZipArchiveWriter.cpp
new file mode 100644
index 0000000000..4f8aa27df6
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/ZipArchiveWriter.cpp
@@ -0,0 +1,407 @@
+#include <IO/Archives/ZipArchiveWriter.h>
+
+#if USE_MINIZIP
+#include <IO/WriteBufferFromFileBase.h>
+#include <Common/quoteString.h>
+#include <base/errnoToString.h>
+#error #include <zip.h>
+#include <boost/algorithm/string/predicate.hpp>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int CANNOT_PACK_ARCHIVE;
+ extern const int SUPPORT_IS_DISABLED;
+ extern const int LOGICAL_ERROR;
+}
+
+using RawHandle = zipFile;
+
+
+/// Holds a raw handle, calls acquireRawHandle() in the constructor and releaseRawHandle() in the destructor.
+class ZipArchiveWriter::HandleHolder
+{
+public:
+ HandleHolder() = default;
+
+ explicit HandleHolder(const std::shared_ptr<ZipArchiveWriter> & writer_) : writer(writer_), raw_handle(writer->acquireRawHandle()) { }
+
+ ~HandleHolder()
+ {
+ if (raw_handle)
+ {
+ try
+ {
+ int err = zipCloseFileInZip(raw_handle);
+ /// If err == ZIP_PARAMERROR the file is already closed.
+ if (err != ZIP_PARAMERROR)
+ checkResult(err);
+ }
+ catch (...)
+ {
+ tryLogCurrentException("ZipArchiveWriter");
+ }
+ writer->releaseRawHandle(raw_handle);
+ }
+ }
+
+ HandleHolder(HandleHolder && src) noexcept
+ {
+ *this = std::move(src);
+ }
+
+ HandleHolder & operator=(HandleHolder && src) noexcept
+ {
+ writer = std::exchange(src.writer, nullptr);
+ raw_handle = std::exchange(src.raw_handle, nullptr);
+ return *this;
+ }
+
+ RawHandle getRawHandle() const { return raw_handle; }
+ std::shared_ptr<ZipArchiveWriter> getWriter() const { return writer; }
+
+ void checkResult(int code) const { writer->checkResult(code); }
+
+private:
+ std::shared_ptr<ZipArchiveWriter> writer;
+ RawHandle raw_handle = nullptr;
+};
+
+
+/// This class represents a WriteBuffer actually returned by writeFile().
+class ZipArchiveWriter::WriteBufferFromZipArchive : public WriteBufferFromFileBase
+{
+public:
+ WriteBufferFromZipArchive(HandleHolder && handle_, const String & filename_)
+ : WriteBufferFromFileBase(DBMS_DEFAULT_BUFFER_SIZE, nullptr, 0)
+ , handle(std::move(handle_))
+ , filename(filename_)
+ {
+ auto compress_method = handle.getWriter()->compression_method;
+ auto compress_level = handle.getWriter()->compression_level;
+ checkCompressionMethodIsEnabled(compress_method);
+
+ const char * password_cstr = nullptr;
+ const String & password_str = handle.getWriter()->password;
+ if (!password_str.empty())
+ {
+ checkEncryptionIsEnabled();
+ password_cstr = password_str.c_str();
+ }
+
+ RawHandle raw_handle = handle.getRawHandle();
+
+ checkResult(zipOpenNewFileInZip3_64(
+ raw_handle,
+ filename_.c_str(),
+ /* zipfi= */ nullptr,
+ /* extrafield_local= */ nullptr,
+ /* size_extrafield_local= */ 0,
+ /* extrafield_global= */ nullptr,
+ /* size_extrafield_global= */ 0,
+ /* comment= */ nullptr,
+ compress_method,
+ compress_level,
+ /* raw= */ false,
+ /* windowBits= */ 0,
+ /* memLevel= */ 0,
+ /* strategy= */ 0,
+ password_cstr,
+ /* crc_for_crypting= */ 0,
+ /* zip64= */ true));
+ }
+
+ ~WriteBufferFromZipArchive() override
+ {
+ try
+ {
+ finalize();
+ }
+ catch (...)
+ {
+ tryLogCurrentException("ZipArchiveWriter");
+ }
+ }
+
+ void sync() override { next(); }
+ std::string getFileName() const override { return filename; }
+
+private:
+ void nextImpl() override
+ {
+ if (!offset())
+ return;
+ RawHandle raw_handle = handle.getRawHandle();
+ int code = zipWriteInFileInZip(raw_handle, working_buffer.begin(), static_cast<uint32_t>(offset()));
+ checkResult(code);
+ }
+
+ void checkResult(int code) const { handle.checkResult(code); }
+
+ HandleHolder handle;
+ String filename;
+};
+
+
+namespace
+{
+ /// Provides a set of functions allowing the minizip library to write its output
+ /// to a WriteBuffer instead of an ordinary file in the local filesystem.
+ class StreamFromWriteBuffer
+ {
+ public:
+ static RawHandle open(std::unique_ptr<WriteBuffer> archive_write_buffer)
+ {
+ Opaque opaque{std::move(archive_write_buffer)};
+
+ zlib_filefunc64_def func_def;
+ func_def.zopen64_file = &StreamFromWriteBuffer::openFileFunc;
+ func_def.zclose_file = &StreamFromWriteBuffer::closeFileFunc;
+ func_def.zread_file = &StreamFromWriteBuffer::readFileFunc;
+ func_def.zwrite_file = &StreamFromWriteBuffer::writeFileFunc;
+ func_def.zseek64_file = &StreamFromWriteBuffer::seekFunc;
+ func_def.ztell64_file = &StreamFromWriteBuffer::tellFunc;
+ func_def.zerror_file = &StreamFromWriteBuffer::testErrorFunc;
+ func_def.opaque = &opaque;
+
+ return zipOpen2_64(
+ /* path= */ nullptr,
+ /* append= */ false,
+ /* globalcomment= */ nullptr,
+ &func_def);
+ }
+
+ private:
+ std::unique_ptr<WriteBuffer> write_buffer;
+ UInt64 start_offset = 0;
+
+ struct Opaque
+ {
+ std::unique_ptr<WriteBuffer> write_buffer;
+ };
+
+ static void * openFileFunc(void * opaque, const void *, int)
+ {
+ Opaque & opq = *reinterpret_cast<Opaque *>(opaque);
+ return new StreamFromWriteBuffer(std::move(opq.write_buffer));
+ }
+
+ explicit StreamFromWriteBuffer(std::unique_ptr<WriteBuffer> write_buffer_)
+ : write_buffer(std::move(write_buffer_)), start_offset(write_buffer->count()) {}
+
+ ~StreamFromWriteBuffer()
+ {
+ write_buffer->finalize();
+ }
+
+ static int closeFileFunc(void *, void * stream)
+ {
+ delete reinterpret_cast<StreamFromWriteBuffer *>(stream);
+ return ZIP_OK;
+ }
+
+ static StreamFromWriteBuffer & get(void * ptr)
+ {
+ return *reinterpret_cast<StreamFromWriteBuffer *>(ptr);
+ }
+
+ static unsigned long writeFileFunc(void *, void * stream, const void * buf, unsigned long size) // NOLINT(google-runtime-int)
+ {
+ auto & strm = get(stream);
+ strm.write_buffer->write(reinterpret_cast<const char *>(buf), size);
+ return size;
+ }
+
+ static int testErrorFunc(void *, void *)
+ {
+ return ZIP_OK;
+ }
+
+ static ZPOS64_T tellFunc(void *, void * stream)
+ {
+ auto & strm = get(stream);
+ auto pos = strm.write_buffer->count() - strm.start_offset;
+ return pos;
+ }
+
+ static long seekFunc(void *, void *, ZPOS64_T, int) // NOLINT(google-runtime-int)
+ {
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "StreamFromWriteBuffer::seek must not be called");
+ }
+
+ static unsigned long readFileFunc(void *, void *, void *, unsigned long) // NOLINT(google-runtime-int)
+ {
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "StreamFromWriteBuffer::readFile must not be called");
+ }
+ };
+}
+
+
+ZipArchiveWriter::ZipArchiveWriter(const String & path_to_archive_)
+ : ZipArchiveWriter(path_to_archive_, nullptr)
+{
+}
+
+ZipArchiveWriter::ZipArchiveWriter(const String & path_to_archive_, std::unique_ptr<WriteBuffer> archive_write_buffer_)
+ : path_to_archive(path_to_archive_), compression_method(MZ_COMPRESS_METHOD_DEFLATE)
+{
+ if (archive_write_buffer_)
+ handle = StreamFromWriteBuffer::open(std::move(archive_write_buffer_));
+ else
+ handle = zipOpen64(path_to_archive.c_str(), /* append= */ false);
+ if (!handle)
+ throw Exception(ErrorCodes::CANNOT_PACK_ARCHIVE, "Couldn't create zip archive {}", quoteString(path_to_archive));
+
+}
+
+ZipArchiveWriter::~ZipArchiveWriter()
+{
+ if (handle)
+ {
+ try
+ {
+ checkResult(zipClose(handle, /* global_comment= */ nullptr));
+ }
+ catch (...)
+ {
+ tryLogCurrentException("ZipArchiveWriter");
+ }
+ }
+}
+
+std::unique_ptr<WriteBufferFromFileBase> ZipArchiveWriter::writeFile(const String & filename)
+{
+ return std::make_unique<WriteBufferFromZipArchive>(acquireHandle(), filename);
+}
+
+bool ZipArchiveWriter::isWritingFile() const
+{
+ std::lock_guard lock{mutex};
+ return !handle;
+}
+
+void ZipArchiveWriter::setCompression(const String & compression_method_, int compression_level_)
+{
+ std::lock_guard lock{mutex};
+ compression_method = compressionMethodToInt(compression_method_);
+ compression_level = compression_level_;
+}
+
+void ZipArchiveWriter::setPassword(const String & password_)
+{
+ std::lock_guard lock{mutex};
+ password = password_;
+}
+
+int ZipArchiveWriter::compressionMethodToInt(const String & compression_method_)
+{
+ if (compression_method_.empty())
+ return MZ_COMPRESS_METHOD_DEFLATE; /// By default the compression method is "deflate".
+ else if (compression_method_ == kStore)
+ return MZ_COMPRESS_METHOD_STORE;
+ else if (compression_method_ == kDeflate)
+ return MZ_COMPRESS_METHOD_DEFLATE;
+ else if (compression_method_ == kBzip2)
+ return MZ_COMPRESS_METHOD_BZIP2;
+ else if (compression_method_ == kLzma)
+ return MZ_COMPRESS_METHOD_LZMA;
+ else if (compression_method_ == kZstd)
+ return MZ_COMPRESS_METHOD_ZSTD;
+ else if (compression_method_ == kXz)
+ return MZ_COMPRESS_METHOD_XZ;
+ else
+ throw Exception(ErrorCodes::CANNOT_PACK_ARCHIVE, "Unknown compression method specified for a zip archive: {}", compression_method_);
+}
+
+String ZipArchiveWriter::intToCompressionMethod(int compression_method_)
+{
+ switch (compression_method_)
+ {
+ case MZ_COMPRESS_METHOD_STORE: return kStore;
+ case MZ_COMPRESS_METHOD_DEFLATE: return kDeflate;
+ case MZ_COMPRESS_METHOD_BZIP2: return kBzip2;
+ case MZ_COMPRESS_METHOD_LZMA: return kLzma;
+ case MZ_COMPRESS_METHOD_ZSTD: return kZstd;
+ case MZ_COMPRESS_METHOD_XZ: return kXz;
+ }
+ throw Exception(ErrorCodes::CANNOT_PACK_ARCHIVE, "Unknown compression method specified for a zip archive: {}", compression_method_);
+}
+
+/// Checks that a passed compression method can be used.
+void ZipArchiveWriter::checkCompressionMethodIsEnabled(int compression_method_)
+{
+ switch (compression_method_)
+ {
+ case MZ_COMPRESS_METHOD_STORE: [[fallthrough]];
+ case MZ_COMPRESS_METHOD_DEFLATE:
+ case MZ_COMPRESS_METHOD_LZMA:
+ case MZ_COMPRESS_METHOD_ZSTD:
+ case MZ_COMPRESS_METHOD_XZ:
+ return;
+
+ case MZ_COMPRESS_METHOD_BZIP2:
+ {
+#if USE_BZIP2
+ return;
+#else
+ throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "bzip2 compression method is disabled");
+#endif
+ }
+ }
+ throw Exception(ErrorCodes::CANNOT_PACK_ARCHIVE, "Unknown compression method specified for a zip archive: {}", compression_method_);
+}
+
+/// Checks that encryption is enabled.
+void ZipArchiveWriter::checkEncryptionIsEnabled()
+{
+#if !USE_SSL
+ throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Encryption in zip archive is disabled");
+#endif
+}
+
+ZipArchiveWriter::HandleHolder ZipArchiveWriter::acquireHandle()
+{
+ return HandleHolder{std::static_pointer_cast<ZipArchiveWriter>(shared_from_this())};
+}
+
+RawHandle ZipArchiveWriter::acquireRawHandle()
+{
+ std::lock_guard lock{mutex};
+ if (!handle)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot have more than one write buffer while writing a zip archive");
+ return std::exchange(handle, nullptr);
+}
+
+void ZipArchiveWriter::releaseRawHandle(RawHandle raw_handle_)
+{
+ std::lock_guard lock{mutex};
+ handle = raw_handle_;
+}
+
+void ZipArchiveWriter::checkResult(int code) const
+{
+ if (code >= ZIP_OK)
+ return;
+
+ String message = "Code = ";
+ switch (code)
+ {
+ case ZIP_ERRNO: message += "ERRNO, errno = " + errnoToString(); break;
+ case ZIP_PARAMERROR: message += "PARAMERROR"; break;
+ case ZIP_BADZIPFILE: message += "BADZIPFILE"; break;
+ case ZIP_INTERNALERROR: message += "INTERNALERROR"; break;
+ default: message += std::to_string(code); break;
+ }
+ showError(message);
+}
+
+void ZipArchiveWriter::showError(const String & message) const
+{
+ throw Exception(ErrorCodes::CANNOT_PACK_ARCHIVE, "Couldn't pack zip archive {}: {}", quoteString(path_to_archive), message);
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/Archives/ZipArchiveWriter.h b/contrib/clickhouse/src/IO/Archives/ZipArchiveWriter.h
new file mode 100644
index 0000000000..6650705fca
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/ZipArchiveWriter.h
@@ -0,0 +1,92 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_MINIZIP
+#include <IO/Archives/IArchiveWriter.h>
+#include <mutex>
+
+
+namespace DB
+{
+class WriteBuffer;
+class WriteBufferFromFileBase;
+
+/// Implementation of IArchiveWriter for writing zip archives.
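+///
+/// Usage sketch (illustrative only; the path and file name are assumptions). Like the
+/// reader, the writer uses shared_from_this() and must be owned by a std::shared_ptr,
+/// e.g. one obtained via createArchiveWriter():
+///
+///     auto writer = createArchiveWriter("/path/to/archive.zip");
+///     writer->setCompression(ZipArchiveWriter::kDeflate,
+///                            static_cast<int>(ZipArchiveWriter::CompressionLevels::kNormal));
+///     auto out = writer->writeFile("data.csv");
+///     writeString("a,b,c\n", *out);
+///     out->finalize();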
+class ZipArchiveWriter : public IArchiveWriter
+{
+public:
+ /// Constructs an archive that will be written as a file in the local filesystem.
+ explicit ZipArchiveWriter(const String & path_to_archive_);
+
+ /// Constructs an archive that will be written by using a specified `archive_write_buffer_`.
+ ZipArchiveWriter(const String & path_to_archive_, std::unique_ptr<WriteBuffer> archive_write_buffer_);
+
+    /// Destructor finalizes writing the archive.
+ ~ZipArchiveWriter() override;
+
+ /// Starts writing a file to the archive. The function returns a write buffer,
+ /// any data written to that buffer will be compressed and then put to the archive.
+    /// You can keep only one such buffer at a time; the buffer returned by a previous
+    /// call to `writeFile()` must be destroyed before the next call to `writeFile()`.
+ std::unique_ptr<WriteBufferFromFileBase> writeFile(const String & filename) override;
+
+ /// Returns true if there is an active instance of WriteBuffer returned by writeFile().
+ /// This function should be used mostly for debugging purposes.
+ bool isWritingFile() const override;
+
+ /// Supported compression methods.
+ static constexpr const char kStore[] = "store";
+ static constexpr const char kDeflate[] = "deflate";
+ static constexpr const char kBzip2[] = "bzip2";
+ static constexpr const char kLzma[] = "lzma";
+ static constexpr const char kZstd[] = "zstd";
+ static constexpr const char kXz[] = "xz";
+
+ /// Some compression levels.
+ enum class CompressionLevels
+ {
+ kDefault = kDefaultCompressionLevel,
+ kFast = 2,
+ kNormal = 6,
+ kBest = 9,
+ };
+
+ /// Sets compression method and level.
+    /// Changing them will affect the next file in the archive.
+ void setCompression(const String & compression_method_, int compression_level_) override;
+
+    /// Sets password. Only the contents of the files are encrypted;
+    /// the names of the files are not.
+    /// Changing the password will affect the next file in the archive.
+ void setPassword(const String & password_) override;
+
+ /// Utility functions.
+ static int compressionMethodToInt(const String & compression_method_);
+ static String intToCompressionMethod(int compression_method_);
+ static void checkCompressionMethodIsEnabled(int compression_method_);
+ static void checkEncryptionIsEnabled();
+
+private:
+ class WriteBufferFromZipArchive;
+ class HandleHolder;
+ using RawHandle = void *;
+
+ HandleHolder acquireHandle();
+ RawHandle acquireRawHandle();
+ void releaseRawHandle(RawHandle raw_handle_);
+
+ void checkResult(int code) const;
+ [[noreturn]] void showError(const String & message) const;
+
+ const String path_to_archive;
+ int compression_method; /// By default the compression method is "deflate".
+ int compression_level = kDefaultCompressionLevel;
+ String password;
+ RawHandle handle = nullptr;
+ mutable std::mutex mutex;
+};
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/Archives/createArchiveReader.cpp b/contrib/clickhouse/src/IO/Archives/createArchiveReader.cpp
new file mode 100644
index 0000000000..0c998971de
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/createArchiveReader.cpp
@@ -0,0 +1,70 @@
+#include <IO/Archives/createArchiveReader.h>
+#include <IO/Archives/ZipArchiveReader.h>
+#include <IO/Archives/LibArchiveReader.h>
+#include <Common/Exception.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int CANNOT_UNPACK_ARCHIVE;
+ extern const int SUPPORT_IS_DISABLED;
+}
+
+
+std::shared_ptr<IArchiveReader> createArchiveReader(const String & path_to_archive)
+{
+ return createArchiveReader(path_to_archive, {}, 0);
+}
+
+
+std::shared_ptr<IArchiveReader> createArchiveReader(
+ const String & path_to_archive,
+ [[maybe_unused]] const std::function<std::unique_ptr<SeekableReadBuffer>()> & archive_read_function,
+ [[maybe_unused]] size_t archive_size)
+{
+ using namespace std::literals;
+ static constexpr std::array tar_extensions
+ {
+ ".tar"sv,
+ ".tar.gz"sv,
+ ".tgz"sv,
+ ".tar.zst"sv,
+ ".tzst"sv,
+ ".tar.xz"sv,
+ ".tar.bz2"sv
+ };
+
+ if (path_to_archive.ends_with(".zip") || path_to_archive.ends_with(".zipx"))
+ {
+#if USE_MINIZIP
+ return std::make_shared<ZipArchiveReader>(path_to_archive, archive_read_function, archive_size);
+#else
+ throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "minizip library is disabled");
+#endif
+ }
+ else if (std::any_of(
+ tar_extensions.begin(), tar_extensions.end(), [&](const auto extension) { return path_to_archive.ends_with(extension); }))
+ {
+#if USE_LIBARCHIVE
+ return std::make_shared<TarArchiveReader>(path_to_archive);
+#else
+ throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "libarchive library is disabled");
+#endif
+ }
+ else if (path_to_archive.ends_with(".7z"))
+ {
+#if USE_LIBARCHIVE
+ return std::make_shared<SevenZipArchiveReader>(path_to_archive);
+#else
+ throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "libarchive library is disabled");
+#endif
+ }
+ else
+ {
+ throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Cannot determine the type of archive {}", path_to_archive);
+ }
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Archives/createArchiveReader.h b/contrib/clickhouse/src/IO/Archives/createArchiveReader.h
new file mode 100644
index 0000000000..64eb4c8eab
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/createArchiveReader.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <base/types.h>
+#include <functional>
+#include <memory>
+
+
+namespace DB
+{
+class IArchiveReader;
+class SeekableReadBuffer;
+
+/// Starts reading a specified archive in the local filesystem.
+std::shared_ptr<IArchiveReader> createArchiveReader(const String & path_to_archive);
+
+/// Starts reading a specified archive; the archive is read via a specified read buffer,
+/// and `path_to_archive` is used only to determine the archive's type.
+std::shared_ptr<IArchiveReader> createArchiveReader(
+ const String & path_to_archive,
+ const std::function<std::unique_ptr<SeekableReadBuffer>()> & archive_read_function,
+ size_t archive_size);
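+
+/// An illustrative sketch (the names below are assumptions): reading an archive held in
+/// memory rather than on disk; the ".zip" suffix only selects the reader implementation.
+/// Assuming `archive_bytes` is a String with the raw bytes of a zip archive:
+///
+///     auto reader = createArchiveReader(
+///         "in_memory.zip",
+///         [&] { return std::make_unique<ReadBufferFromMemory>(archive_bytes.data(), archive_bytes.size()); },
+///         archive_bytes.size());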
+
+}
diff --git a/contrib/clickhouse/src/IO/Archives/createArchiveWriter.cpp b/contrib/clickhouse/src/IO/Archives/createArchiveWriter.cpp
new file mode 100644
index 0000000000..807fe66e6a
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/createArchiveWriter.cpp
@@ -0,0 +1,38 @@
+#include <IO/Archives/createArchiveWriter.h>
+#include <IO/Archives/ZipArchiveWriter.h>
+#include <IO/WriteBuffer.h>
+#include <Common/Exception.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int CANNOT_PACK_ARCHIVE;
+ extern const int SUPPORT_IS_DISABLED;
+}
+
+
+std::shared_ptr<IArchiveWriter> createArchiveWriter(const String & path_to_archive)
+{
+ return createArchiveWriter(path_to_archive, nullptr);
+}
+
+
+std::shared_ptr<IArchiveWriter> createArchiveWriter(
+ const String & path_to_archive,
+ [[maybe_unused]] std::unique_ptr<WriteBuffer> archive_write_buffer)
+{
+ if (path_to_archive.ends_with(".zip") || path_to_archive.ends_with(".zipx"))
+ {
+#if USE_MINIZIP
+ return std::make_shared<ZipArchiveWriter>(path_to_archive, std::move(archive_write_buffer));
+#else
+ throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "minizip library is disabled");
+#endif
+ }
+ else
+ throw Exception(ErrorCodes::CANNOT_PACK_ARCHIVE, "Cannot determine the type of archive {}", path_to_archive);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Archives/createArchiveWriter.h b/contrib/clickhouse/src/IO/Archives/createArchiveWriter.h
new file mode 100644
index 0000000000..51ffd4d114
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/createArchiveWriter.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <base/types.h>
+#include <memory>
+
+
+namespace DB
+{
+class IArchiveWriter;
+class WriteBuffer;
+
+/// Starts writing a specified archive in the local filesystem.
+std::shared_ptr<IArchiveWriter> createArchiveWriter(const String & path_to_archive);
+
+/// Starts writing a specified archive; the archive is written via a specified write buffer,
+/// and `path_to_archive` is used only to determine the archive's type.
+std::shared_ptr<IArchiveWriter> createArchiveWriter(const String & path_to_archive, std::unique_ptr<WriteBuffer> archive_write_buffer);
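+
+/// An illustrative sketch (names are assumptions): packing into an in-memory string
+/// instead of a file; the ".zip" suffix only selects the writer implementation.
+///
+///     String packed;
+///     auto writer = createArchiveWriter("in_memory.zip", std::make_unique<WriteBufferFromString>(packed));
+///     auto out = writer->writeFile("a.txt");
+///     writeString("hello", *out);
+///     out->finalize();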
+
+}
diff --git a/contrib/clickhouse/src/IO/Archives/hasRegisteredArchiveFileExtension.cpp b/contrib/clickhouse/src/IO/Archives/hasRegisteredArchiveFileExtension.cpp
new file mode 100644
index 0000000000..6b2ef29d05
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/hasRegisteredArchiveFileExtension.cpp
@@ -0,0 +1,12 @@
+#include <IO/Archives/hasRegisteredArchiveFileExtension.h>
+
+
+namespace DB
+{
+
+bool hasRegisteredArchiveFileExtension(const String & path)
+{
+ return path.ends_with(".zip") || path.ends_with(".zipx");
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Archives/hasRegisteredArchiveFileExtension.h b/contrib/clickhouse/src/IO/Archives/hasRegisteredArchiveFileExtension.h
new file mode 100644
index 0000000000..cab938aa0b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Archives/hasRegisteredArchiveFileExtension.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <base/types.h>
+
+
+namespace DB
+{
+
+/// Returns true if a specified path has one of the registered file extensions for an archive.
+bool hasRegisteredArchiveFileExtension(const String & path);
+
+}
diff --git a/contrib/clickhouse/src/IO/AsyncReadCounters.cpp b/contrib/clickhouse/src/IO/AsyncReadCounters.cpp
new file mode 100644
index 0000000000..816da0d331
--- /dev/null
+++ b/contrib/clickhouse/src/IO/AsyncReadCounters.cpp
@@ -0,0 +1,37 @@
+#include <IO/AsyncReadCounters.h>
+
+namespace DB
+{
+
+void AsyncReadCounters::dumpToMapColumn(IColumn * column) const
+{
+ auto * column_map = column ? &typeid_cast<DB::ColumnMap &>(*column) : nullptr;
+ if (!column_map)
+ return;
+
+ auto & offsets = column_map->getNestedColumn().getOffsets();
+ auto & tuple_column = column_map->getNestedData();
+ auto & key_column = tuple_column.getColumn(0);
+ auto & value_column = tuple_column.getColumn(1);
+
+ size_t size = 0;
+ auto load_if_not_empty = [&](const auto & key, const auto & value)
+ {
+ if (value)
+ {
+ key_column.insert(key);
+ value_column.insert(value);
+ ++size;
+ }
+ };
+
+ std::lock_guard lock(mutex);
+
+ load_if_not_empty("max_parallel_read_tasks", max_parallel_read_tasks);
+ load_if_not_empty("max_parallel_prefetch_tasks", max_parallel_prefetch_tasks);
+ load_if_not_empty("total_prefetch_tasks", total_prefetch_tasks);
+
+ offsets.push_back(offsets.back() + size);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/AsyncReadCounters.h b/contrib/clickhouse/src/IO/AsyncReadCounters.h
new file mode 100644
index 0000000000..1f84b2a214
--- /dev/null
+++ b/contrib/clickhouse/src/IO/AsyncReadCounters.h
@@ -0,0 +1,32 @@
+#pragma once
+#include <Core/Types.h>
+#include <Columns/ColumnMap.h>
+
+namespace DB
+{
+
+/// Metrics for asynchronous reading feature.
+struct AsyncReadCounters
+{
+    /// Count current and max number of tasks in an asynchronous read pool.
+ /// The tasks are requests to read the data.
+ size_t max_parallel_read_tasks = 0;
+ size_t current_parallel_read_tasks = 0;
+
+ /// Count current and max number of tasks in a reader prefetch read pool.
+    /// The tasks are calls to IMergeTreeReader::prefetch(), which does not do
+    /// any reading itself but creates a read request. Since we need to wait for
+    /// marks to be loaded during this prefetch, it is done in a thread pool too.
+ size_t max_parallel_prefetch_tasks = 0;
+ size_t current_parallel_prefetch_tasks = 0;
+ size_t total_prefetch_tasks = 0;
+
+ mutable std::mutex mutex;
+
+ AsyncReadCounters() = default;
+
+ void dumpToMapColumn(IColumn * column) const;
+};
+using AsyncReadCountersPtr = std::shared_ptr<AsyncReadCounters>;
+
+}
diff --git a/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFile.cpp b/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFile.cpp
new file mode 100644
index 0000000000..0e6c8090cb
--- /dev/null
+++ b/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFile.cpp
@@ -0,0 +1,109 @@
+#include <fcntl.h>
+
+#include <IO/AsynchronousReadBufferFromFile.h>
+#include <IO/WriteHelpers.h>
+#include <Common/ProfileEvents.h>
+#include <base/defines.h>
+#include <cerrno>
+
+
+namespace ProfileEvents
+{
+ extern const Event FileOpen;
+}
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int FILE_DOESNT_EXIST;
+ extern const int CANNOT_OPEN_FILE;
+ extern const int CANNOT_CLOSE_FILE;
+}
+
+
+AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile(
+ IAsynchronousReader & reader_,
+ Priority priority_,
+ const std::string & file_name_,
+ size_t buf_size,
+ int flags,
+ char * existing_memory,
+ size_t alignment,
+ std::optional<size_t> file_size_)
+ : AsynchronousReadBufferFromFileDescriptor(reader_, priority_, -1, buf_size, existing_memory, alignment, file_size_)
+ , file_name(file_name_)
+{
+ ProfileEvents::increment(ProfileEvents::FileOpen);
+
+#ifdef OS_DARWIN
+ bool o_direct = (flags != -1) && (flags & O_DIRECT);
+ if (o_direct)
+ flags = flags & ~O_DIRECT;
+#endif
+ fd = ::open(file_name.c_str(), flags == -1 ? O_RDONLY | O_CLOEXEC : flags | O_CLOEXEC);
+
+ if (-1 == fd)
+ throwFromErrnoWithPath("Cannot open file " + file_name, file_name,
+ errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
+#ifdef OS_DARWIN
+ if (o_direct)
+ {
+ if (fcntl(fd, F_NOCACHE, 1) == -1)
+ throwFromErrnoWithPath("Cannot set F_NOCACHE on file " + file_name, file_name, ErrorCodes::CANNOT_OPEN_FILE);
+ }
+#endif
+}
+
+
+AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile(
+ IAsynchronousReader & reader_,
+ Priority priority_,
+ int & fd_,
+ const std::string & original_file_name,
+ size_t buf_size,
+ char * existing_memory,
+ size_t alignment,
+ std::optional<size_t> file_size_)
+ : AsynchronousReadBufferFromFileDescriptor(reader_, priority_, fd_, buf_size, existing_memory, alignment, file_size_)
+ , file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name)
+{
+ fd_ = -1;
+}
+
+
+AsynchronousReadBufferFromFile::~AsynchronousReadBufferFromFile()
+{
+ /// Must wait for events in flight before closing the file.
+ finalize();
+
+ if (fd < 0)
+ return;
+
+ int err = ::close(fd);
+ chassert(!err || errno == EINTR);
+}
+
+
+void AsynchronousReadBufferFromFile::close()
+{
+ if (fd < 0)
+ return;
+
+ if (0 != ::close(fd))
+ throw Exception(ErrorCodes::CANNOT_CLOSE_FILE, "Cannot close file");
+
+ fd = -1;
+}
+
+
+AsynchronousReadBufferFromFileWithDescriptorsCache::~AsynchronousReadBufferFromFileWithDescriptorsCache()
+{
+ /// Must wait for events in flight before potentially closing the file by destroying OpenedFilePtr.
+ finalize();
+}
+
+
+}
diff --git a/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFile.h b/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFile.h
new file mode 100644
index 0000000000..d3b7ffbc7d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFile.h
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <Common/Throttler_fwd.h>
+#include <IO/AsynchronousReadBufferFromFileDescriptor.h>
+#include <IO/OpenedFileCache.h>
+
+
+namespace DB
+{
+
+/* NOTE: Unused */
+class AsynchronousReadBufferFromFile : public AsynchronousReadBufferFromFileDescriptor
+{
+protected:
+ std::string file_name;
+
+public:
+ explicit AsynchronousReadBufferFromFile(
+ IAsynchronousReader & reader_,
+ Priority priority_,
+ const std::string & file_name_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ int flags = -1,
+ char * existing_memory = nullptr,
+ size_t alignment = 0,
+ std::optional<size_t> file_size_ = std::nullopt);
+
+ /// Use pre-opened file descriptor.
+ explicit AsynchronousReadBufferFromFile(
+ IAsynchronousReader & reader_,
+ Priority priority_,
+ int & fd, /// Will be set to -1 if constructor didn't throw and ownership of file descriptor is passed to the object.
+ const std::string & original_file_name = {},
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0,
+ std::optional<size_t> file_size_ = std::nullopt);
+
+ ~AsynchronousReadBufferFromFile() override;
+
+ /// Close file before destruction of object.
+ void close();
+
+ std::string getFileName() const override
+ {
+ return file_name;
+ }
+};
+
+/** Similar to AsynchronousReadBufferFromFile but also transparently shares open file descriptors.
+ */
+class AsynchronousReadBufferFromFileWithDescriptorsCache : public AsynchronousReadBufferFromFileDescriptor
+{
+private:
+ std::string file_name;
+ OpenedFileCache::OpenedFilePtr file;
+
+public:
+ AsynchronousReadBufferFromFileWithDescriptorsCache(
+ IAsynchronousReader & reader_,
+ Priority priority_,
+ const std::string & file_name_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ int flags = -1,
+ char * existing_memory = nullptr,
+ size_t alignment = 0,
+ std::optional<size_t> file_size_ = std::nullopt,
+ ThrottlerPtr throttler_ = {})
+ : AsynchronousReadBufferFromFileDescriptor(reader_, priority_, -1, buf_size, existing_memory, alignment, file_size_, throttler_)
+ , file_name(file_name_)
+ {
+ file = OpenedFileCache::instance().get(file_name, flags);
+ fd = file->getFD();
+ }
+
+ ~AsynchronousReadBufferFromFileWithDescriptorsCache() override;
+
+ std::string getFileName() const override
+ {
+ return file_name;
+ }
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp b/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp
new file mode 100644
index 0000000000..d30773f88f
--- /dev/null
+++ b/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp
@@ -0,0 +1,272 @@
+#include <cerrno>
+#include <ctime>
+#include <optional>
+#include <Common/ProfileEvents.h>
+#include <Common/Stopwatch.h>
+#include <Common/Exception.h>
+#include <Common/CurrentMetrics.h>
+#include <Common/Throttler.h>
+#include <Common/filesystemHelpers.h>
+#include <IO/AsynchronousReadBufferFromFileDescriptor.h>
+#include <IO/WriteHelpers.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event AsynchronousReadWaitMicroseconds;
+ extern const Event LocalReadThrottlerBytes;
+ extern const Event LocalReadThrottlerSleepMicroseconds;
+}
+
+namespace CurrentMetrics
+{
+ extern const Metric AsynchronousReadWait;
+}
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ARGUMENT_OUT_OF_BOUND;
+ extern const int LOGICAL_ERROR;
+}
+
+
+std::string AsynchronousReadBufferFromFileDescriptor::getFileName() const
+{
+ return "(fd = " + toString(fd) + ")";
+}
+
+
+std::future<IAsynchronousReader::Result> AsynchronousReadBufferFromFileDescriptor::asyncReadInto(char * data, size_t size, Priority priority)
+{
+ IAsynchronousReader::Request request;
+ request.descriptor = std::make_shared<IAsynchronousReader::LocalFileDescriptor>(fd);
+ request.buf = data;
+ request.size = size;
+ request.offset = file_offset_of_buffer_end;
+ request.priority = Priority{base_priority.value + priority.value};
+ request.ignore = bytes_to_ignore;
+ bytes_to_ignore = 0;
+
+    /// This is a workaround for a read-past-EOF bug of pread() in the Linux kernel.
+ if (file_size.has_value() && file_offset_of_buffer_end >= *file_size)
+ {
+ return std::async(std::launch::deferred, [] { return IAsynchronousReader::Result{.size = 0, .offset = 0}; });
+ }
+
+ return reader.submit(request);
+}
+
+
+void AsynchronousReadBufferFromFileDescriptor::prefetch(Priority priority)
+{
+ if (prefetch_future.valid())
+ return;
+
+ /// Will request the same amount of data that is read in nextImpl.
+ prefetch_buffer.resize(internal_buffer.size());
+ prefetch_future = asyncReadInto(prefetch_buffer.data(), prefetch_buffer.size(), priority);
+}
+
+
+bool AsynchronousReadBufferFromFileDescriptor::nextImpl()
+{
+ if (prefetch_future.valid())
+ {
+ /// Read request already in flight. Wait for its completion.
+
+ size_t size = 0;
+ size_t offset = 0;
+ {
+ Stopwatch watch;
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::AsynchronousReadWait};
+ auto result = prefetch_future.get();
+ ProfileEvents::increment(ProfileEvents::AsynchronousReadWaitMicroseconds, watch.elapsedMicroseconds());
+ size = result.size;
+ offset = result.offset;
+ assert(offset < size || size == 0);
+ }
+
+ prefetch_future = {};
+ file_offset_of_buffer_end += size;
+
+ assert(offset <= size);
+ size_t bytes_read = size - offset;
+ if (throttler)
+ throttler->add(bytes_read, ProfileEvents::LocalReadThrottlerBytes, ProfileEvents::LocalReadThrottlerSleepMicroseconds);
+
+ if (bytes_read)
+ {
+ prefetch_buffer.swap(memory);
+ /// Adjust the working buffer so that it ignores `offset` bytes.
+ internal_buffer = Buffer(memory.data(), memory.data() + memory.size());
+ working_buffer = Buffer(memory.data() + offset, memory.data() + size);
+ pos = working_buffer.begin();
+ return true;
+ }
+
+ return false;
+ }
+ else
+ {
+ /// No pending request. Do synchronous read.
+
+ Stopwatch watch;
+ auto [size, offset, _] = asyncReadInto(memory.data(), memory.size(), DEFAULT_PREFETCH_PRIORITY).get();
+ ProfileEvents::increment(ProfileEvents::AsynchronousReadWaitMicroseconds, watch.elapsedMicroseconds());
+
+ file_offset_of_buffer_end += size;
+
+ assert(offset <= size);
+ size_t bytes_read = size - offset;
+ if (throttler)
+ throttler->add(bytes_read, ProfileEvents::LocalReadThrottlerBytes, ProfileEvents::LocalReadThrottlerSleepMicroseconds);
+
+ if (bytes_read)
+ {
+ /// Adjust the working buffer so that it ignores `offset` bytes.
+ internal_buffer = Buffer(memory.data(), memory.data() + memory.size());
+ working_buffer = Buffer(memory.data() + offset, memory.data() + size);
+ pos = working_buffer.begin();
+ return true;
+ }
+
+ return false;
+ }
+}
+
+
+void AsynchronousReadBufferFromFileDescriptor::finalize()
+{
+ if (prefetch_future.valid())
+ {
+ prefetch_future.wait();
+ prefetch_future = {};
+ }
+}
+
+
+AsynchronousReadBufferFromFileDescriptor::AsynchronousReadBufferFromFileDescriptor(
+ IAsynchronousReader & reader_,
+ Priority priority_,
+ int fd_,
+ size_t buf_size,
+ char * existing_memory,
+ size_t alignment,
+ std::optional<size_t> file_size_,
+ ThrottlerPtr throttler_)
+ : ReadBufferFromFileBase(buf_size, existing_memory, alignment, file_size_)
+ , reader(reader_)
+ , base_priority(priority_)
+ , required_alignment(alignment)
+ , fd(fd_)
+ , throttler(throttler_)
+{
+ if (required_alignment > buf_size)
+ throw Exception(
+ ErrorCodes::LOGICAL_ERROR,
+ "Too large alignment. Cannot have required_alignment greater than buf_size: {} > {}. It is a bug",
+ required_alignment,
+ buf_size);
+
+ prefetch_buffer.alignment = alignment;
+}
+
+AsynchronousReadBufferFromFileDescriptor::~AsynchronousReadBufferFromFileDescriptor()
+{
+ finalize();
+}
+
+
+/// If 'offset' is small enough to stay in the buffer after seek, then a true seek in the file does not happen.
+off_t AsynchronousReadBufferFromFileDescriptor::seek(off_t offset, int whence)
+{
+ size_t new_pos;
+ if (whence == SEEK_SET)
+ {
+ assert(offset >= 0);
+ new_pos = offset;
+ }
+ else if (whence == SEEK_CUR)
+ {
+ new_pos = file_offset_of_buffer_end - (working_buffer.end() - pos) + offset;
+ }
+ else
+ {
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "ReadBufferFromFileDescriptor::seek expects SEEK_SET or SEEK_CUR as whence");
+ }
+
+ /// Position is unchanged.
+ if (new_pos + (working_buffer.end() - pos) == file_offset_of_buffer_end)
+ return new_pos;
+
+ while (true)
+ {
+ if (file_offset_of_buffer_end - working_buffer.size() <= new_pos && new_pos <= file_offset_of_buffer_end)
+ {
+ /// Position is still inside the buffer.
+ /// Probably it is at the end of the buffer - then we will load data on the following 'next' call.
+
+ pos = working_buffer.end() - file_offset_of_buffer_end + new_pos;
+ assert(pos >= working_buffer.begin());
+ assert(pos <= working_buffer.end());
+
+ return new_pos;
+ }
+ else if (prefetch_future.valid())
+ {
+            /// Read from the prefetch buffer and recheck whether the new position is now inside the buffer.
+ if (nextImpl())
+ continue;
+ }
+
+ break;
+ }
+
+ assert(!prefetch_future.valid());
+
+ /// Position is out of the buffer, we need to do real seek.
+ off_t seek_pos = required_alignment > 1
+ ? new_pos / required_alignment * required_alignment
+ : new_pos;
+
+ /// First reset the buffer so the next read will fetch new data to the buffer.
+ resetWorkingBuffer();
+
+ /// Just update the info about the next position in file.
+
+ file_offset_of_buffer_end = seek_pos;
+ bytes_to_ignore = new_pos - seek_pos;
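+    /// E.g. (an illustrative case): with required_alignment = 4096 and new_pos = 10000,
+    /// seek_pos becomes 8192 and bytes_to_ignore becomes 1808, so the next read starts
+    /// at the aligned offset and the first 1808 bytes of the result are skipped.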
+
+ if (bytes_to_ignore >= internal_buffer.size())
+ throw Exception(ErrorCodes::LOGICAL_ERROR,
+ "Logical error in AsynchronousReadBufferFromFileDescriptor, bytes_to_ignore ({}"
+ ") >= internal_buffer.size() ({})", bytes_to_ignore, internal_buffer.size());
+
+ return seek_pos;
+}
+
+
+void AsynchronousReadBufferFromFileDescriptor::rewind()
+{
+ if (prefetch_future.valid())
+ {
+ prefetch_future.wait();
+ prefetch_future = {};
+ }
+
+    /// Discard the existing buffered data. New data will be read on a subsequent call to 'next'.
+ working_buffer.resize(0);
+ pos = working_buffer.begin();
+ file_offset_of_buffer_end = 0;
+}
+
+size_t AsynchronousReadBufferFromFileDescriptor::getFileSize()
+{
+ return getSizeFromFileDescriptor(fd, getFileName());
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFileDescriptor.h b/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFileDescriptor.h
new file mode 100644
index 0000000000..4a4130ebab
--- /dev/null
+++ b/contrib/clickhouse/src/IO/AsynchronousReadBufferFromFileDescriptor.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include <IO/ReadBufferFromFileBase.h>
+#include <IO/AsynchronousReader.h>
+#include <Interpreters/Context.h>
+#include <Common/Throttler_fwd.h>
+#include <Common/Priority.h>
+
+#include <optional>
+#include <unistd.h>
+
+
+namespace DB
+{
+
+/** Use ready file descriptor. Does not open or close a file.
+ */
+class AsynchronousReadBufferFromFileDescriptor : public ReadBufferFromFileBase
+{
+protected:
+ IAsynchronousReader & reader;
+ Priority base_priority;
+
+ Memory<> prefetch_buffer;
+ std::future<IAsynchronousReader::Result> prefetch_future;
+
+ const size_t required_alignment = 0; /// For O_DIRECT both file offsets and memory addresses have to be aligned.
+ size_t file_offset_of_buffer_end = 0; /// What offset in file corresponds to working_buffer.end().
+ size_t bytes_to_ignore = 0; /// How many bytes should we ignore upon a new read request.
+ int fd;
+ ThrottlerPtr throttler;
+
+ bool nextImpl() override;
+
+ /// Name or some description of file.
+ std::string getFileName() const override;
+
+ void finalize();
+
+public:
+ AsynchronousReadBufferFromFileDescriptor(
+ IAsynchronousReader & reader_,
+ Priority priority_,
+ int fd_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0,
+ std::optional<size_t> file_size_ = std::nullopt,
+ ThrottlerPtr throttler_ = {});
+
+ ~AsynchronousReadBufferFromFileDescriptor() override;
+
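+    /// Typical usage (an illustrative sketch): issue a prefetch, do other work, then
+    /// read; if the prefetched request is still in flight, next() waits for it.
+    ///
+    ///     buf.prefetch(Priority{});
+    ///     /// ... other work ...
+    ///     buf.next(); /// the prefetched data becomes the working buffer
+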
+ void prefetch(Priority priority) override;
+
+ int getFD() const
+ {
+ return fd;
+ }
+
+ off_t getPosition() override
+ {
+ return file_offset_of_buffer_end - (working_buffer.end() - pos);
+ }
+
+    /// If 'offset' is small enough to stay in the buffer after seek, then a true seek in the file does not happen.
+ off_t seek(off_t off, int whence) override;
+
+    /// Seek to the beginning, discarding already read data if any. Useful to reread a file that changes on every read.
+ void rewind();
+
+ size_t getFileSize() override;
+
+private:
+ std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size, Priority priority);
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/AsynchronousReader.h b/contrib/clickhouse/src/IO/AsynchronousReader.h
new file mode 100644
index 0000000000..467a3f1d6a
--- /dev/null
+++ b/contrib/clickhouse/src/IO/AsynchronousReader.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <Core/Types.h>
+#include <optional>
+#include <memory>
+#include <future>
+#include <boost/noncopyable.hpp>
+#include <Common/Stopwatch.h>
+#include <Common/Priority.h>
+
+
+namespace DB
+{
+
+/** Interface for asynchronous reads from file descriptors.
+ * It can abstract Linux AIO, io_uring or normal reads from a separate thread pool,
+ * and also reads from non-local filesystems.
+ * The implementation does not necessarily have to be efficient for a large number of small
+ * requests; instead it should be OK for a moderate number of sufficiently large requests
+ * (e.g. reading 1 MB of data 50 000 times per second; this is normal performance for reading from the page cache).
+ * For example, this interface may not suffice if you want to serve 10 000 000 4 KiB requests per second.
+ * This interface is fairly limited.
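+ *
+ * An illustrative usage sketch (assumes some concrete IAsynchronousReader
+ * implementation `reader` and an open file descriptor `fd`; these names are
+ * not taken from these sources):
+ *
+ *     char buffer[4096];
+ *     IAsynchronousReader::Request request;
+ *     request.descriptor = std::make_shared<IAsynchronousReader::LocalFileDescriptor>(fd);
+ *     request.buf = buffer;
+ *     request.size = sizeof(buffer);
+ *     request.offset = 0;
+ *     auto future = reader.submit(request);
+ *     auto result = future.get(); /// blocks; result.size bytes were read into buffer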
+ */
+class IAsynchronousReader : private boost::noncopyable
+{
+public:
+    /// For local filesystems, the file descriptor is simply an integer,
+    /// but it can be an arbitrary opaque object for remote filesystems.
+ struct IFileDescriptor
+ {
+ virtual ~IFileDescriptor() = default;
+ };
+
+ using FileDescriptorPtr = std::shared_ptr<IFileDescriptor>;
+
+ struct LocalFileDescriptor : public IFileDescriptor
+ {
+ explicit LocalFileDescriptor(int fd_) : fd(fd_) {}
+ int fd;
+ };
+
+ /// Read from file descriptor at specified offset up to size bytes into buf.
+    /// Some implementations may require alignment, and it is the responsibility of
+    /// the caller to provide conforming requests.
+ struct Request
+ {
+ FileDescriptorPtr descriptor;
+ size_t offset = 0;
+ size_t size = 0;
+ char * buf = nullptr;
+ Priority priority;
+ size_t ignore = 0;
+ };
+
+ struct Result
+ {
+ /// size
+        /// Less than the requested amount of data can be returned.
+        /// If size is zero, the file has ended.
+        /// (for example, EINTR must be handled by the implementation automatically)
+ size_t size = 0;
+
+ /// offset
+        /// Optional. Useful when the implementation needs to do ignore().
+ size_t offset = 0;
+
+ std::unique_ptr<Stopwatch> execution_watch = {};
+
+ operator std::tuple<size_t &, size_t &>() { return {size, offset}; }
+ };
+
+    /// Submit a request and obtain a future. This method does not perform any waits.
+    /// If this method did not throw, the caller must wait for the result with the 'wait' method
+    /// or destroy the whole reader before destroying the buffer for the request.
+ /// The method can be called concurrently from multiple threads.
+ virtual std::future<Result> submit(Request request) = 0;
+
+ virtual void wait() = 0;
+
+    /// The destructor must wait for all uncompleted requests and ignore the results.
+ /// It may also cancel the requests.
+ virtual ~IAsynchronousReader() = default;
+};
+
+using AsynchronousReaderPtr = std::shared_ptr<IAsynchronousReader>;
+
+}
diff --git a/contrib/clickhouse/src/IO/BitHelpers.h b/contrib/clickhouse/src/IO/BitHelpers.h
new file mode 100644
index 0000000000..a384da0a95
--- /dev/null
+++ b/contrib/clickhouse/src/IO/BitHelpers.h
@@ -0,0 +1,236 @@
+#pragma once
+
+#include <bit>
+#include <base/types.h>
+#include <Common/BitHelpers.h>
+#include <Common/Exception.h>
+
+#include <cstring>
+#include <cassert>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+extern const int CANNOT_WRITE_AFTER_END_OF_BUFFER;
+extern const int ATTEMPT_TO_READ_AFTER_EOF;
+}
+
+/** Reads data from a memory region bit by bit, max 64 bits at once.
+ *
+ * Reads MSB bits first; imagine that you have the data:
+ * 11110000 10101010 00100100 11111110
+ *
+ * Given that r is a BitReader created over the data above:
+ * r.readBits(3) => 0b111
+ * r.readBit() => 0b1
+ * r.readBits(8) => 0b1010 // 4 leading zero-bits are not shown
+ * r.readBit() => 0b1
+ * r.readBit() => 0b0
+ * r.readBits(17) => 0b10001001001111111
+ * r.readBit() => 0b0
+**/
+
+class BitReader
+{
+ const char * const source_begin;
+ const char * const source_end;
+ const char * source_current;
+
+ using BufferType = unsigned __int128;
+ BufferType bits_buffer = 0;
+
+ UInt8 bits_count = 0;
+
+public:
+ BitReader(const char * begin, size_t size)
+ : source_begin(begin)
+ , source_end(begin + size)
+ , source_current(begin)
+ {}
+
+ ~BitReader() = default;
+
+ // reads bits_to_read high-bits from bits_buffer
+ ALWAYS_INLINE UInt64 readBits(UInt8 bits_to_read)
+ {
+ if (bits_to_read > bits_count)
+ fillBitBuffer();
+
+ return getBitsFromBitBuffer<CONSUME>(bits_to_read);
+ }
+
+ UInt8 peekByte()
+ {
+ if (bits_count < 8)
+ fillBitBuffer();
+
+ return getBitsFromBitBuffer<PEEK>(8);
+ }
+
+ ALWAYS_INLINE UInt8 readBit()
+ {
+ return static_cast<UInt8>(readBits(1));
+ }
+
+ // skip bits from bits_buffer
+ void skipBufferedBits(UInt8 bits)
+ {
+ bits_buffer <<= bits;
+ bits_count -= bits;
+ }
+
+
+ bool eof() const
+ {
+ return bits_count == 0 && source_current >= source_end;
+ }
+
+    // number of bits that were already read by clients with readBits()
+ UInt64 count() const
+ {
+ return (source_current - source_begin) * 8 - bits_count;
+ }
+
+ UInt64 remaining() const
+ {
+ return (source_end - source_current) * 8 + bits_count;
+ }
+
+private:
+ enum GetBitsMode {CONSUME, PEEK};
+    // Read data from the internal buffer; if it does not have enough bits, the result is undefined.
+ template <GetBitsMode mode>
+ UInt64 getBitsFromBitBuffer(UInt8 bits_to_read)
+ {
+ assert(bits_to_read > 0);
+
+ // push down the high-bits
+ const UInt64 result = static_cast<UInt64>(bits_buffer >> (sizeof(bits_buffer) * 8 - bits_to_read));
+
+ if constexpr (mode == CONSUME)
+ {
+            // 'erase' the high bits that have been read
+ skipBufferedBits(bits_to_read);
+ }
+
+ return result;
+ }
+
+
+ // Fills internal bits_buffer with data from source, reads at most 64 bits
+ ALWAYS_INLINE size_t fillBitBuffer()
+ {
+ const size_t available = source_end - source_current;
+ const auto bytes_to_read = std::min<size_t>(64 / 8, available);
+ if (available == 0)
+ {
+ if (bytes_to_read == 0)
+ return 0;
+
+ throw Exception(ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF, "Buffer is empty, but requested to read {} more bytes.",
+ bytes_to_read);
+ }
+
+ UInt64 tmp_buffer = 0;
+ memcpy(&tmp_buffer, source_current, bytes_to_read);
+ source_current += bytes_to_read;
+
+ if constexpr (std::endian::native == std::endian::little)
+ tmp_buffer = DB::byteswap(tmp_buffer);
+
+ bits_buffer |= BufferType(tmp_buffer) << ((sizeof(BufferType) - sizeof(tmp_buffer)) * 8 - bits_count);
+ bits_count += static_cast<UInt8>(bytes_to_read) * 8;
+
+ return bytes_to_read;
+ }
+};
+
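+/** Writes data to a memory region bit by bit, MSB first, max 64 bits at once;
+ * on flush() a partial trailing byte is padded with zero bits.
+ *
+ * An illustrative sketch producing the first byte of the BitReader example above:
+ *
+ *   char buf[4] = {};
+ *   BitWriter w(buf, sizeof(buf));
+ *   w.writeBits(3, 0b111);
+ *   w.writeBits(5, 0b10000);
+ *   w.flush(); /// buf[0] is now 0b11110000
+**/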
+class BitWriter
+{
+ char * dest_begin;
+ char * dest_end;
+ char * dest_current;
+
+ using BufferType = unsigned __int128;
+ BufferType bits_buffer = 0;
+
+ UInt8 bits_count = 0;
+
+ static constexpr UInt8 BIT_BUFFER_SIZE = sizeof(bits_buffer) * 8;
+
+public:
+ BitWriter(char * begin, size_t size)
+ : dest_begin(begin)
+ , dest_end(begin + size)
+ , dest_current(begin)
+ {}
+
+ ~BitWriter()
+ {
+ flush();
+ }
+
+ // write `bits_to_write` low-bits of `value` to the buffer
+ void writeBits(UInt8 bits_to_write, UInt64 value)
+ {
+ assert(bits_to_write > 0);
+
+ UInt32 capacity = BIT_BUFFER_SIZE - bits_count;
+ if (capacity < bits_to_write)
+ {
+ doFlush();
+ capacity = BIT_BUFFER_SIZE - bits_count;
+ }
+
+ // write low bits of value as high bits of bits_buffer
+ const UInt64 mask = maskLowBits<UInt64>(bits_to_write);
+ BufferType v = value & mask;
+ v <<= capacity - bits_to_write;
+
+ bits_buffer |= v;
+ bits_count += bits_to_write;
+ }
+
+    // flush contents of bits_buffer to dest_current; partial bytes are completed with zeroes.
+    void flush()
+    {
+        bits_count = (bits_count + 8 - 1) & ~(8 - 1); // align up to a whole byte (8 bits), so doFlush will write all data from bits_buffer
+ while (bits_count != 0)
+ doFlush();
+ }
+
+ UInt64 count() const
+ {
+ return (dest_current - dest_begin) * 8 + bits_count;
+ }
+
+private:
+ void doFlush()
+ {
+ // write whole bytes to the dest_current, leaving partial bits in bits_buffer
+ const size_t available = dest_end - dest_current;
+        const size_t to_write = std::min<size_t>(sizeof(UInt64), bits_count / 8); // write only whole bytes
+
+ if (available < to_write)
+ {
+ throw Exception(ErrorCodes::CANNOT_WRITE_AFTER_END_OF_BUFFER,
+ "Can not write past end of buffer. Space available is {} bytes, required to write {} bytes.",
+ available, to_write);
+ }
+
+ UInt64 tmp_buffer = static_cast<UInt64>(bits_buffer >> (sizeof(bits_buffer) - sizeof(UInt64)) * 8);
+ if constexpr (std::endian::native == std::endian::little)
+ tmp_buffer = DB::byteswap(tmp_buffer);
+
+ memcpy(dest_current, &tmp_buffer, to_write);
+ dest_current += to_write;
+
+ bits_buffer <<= to_write * 8;
+ bits_count -= to_write * 8;
+ }
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/BoundedReadBuffer.cpp b/contrib/clickhouse/src/IO/BoundedReadBuffer.cpp
new file mode 100644
index 0000000000..bda79d82ad
--- /dev/null
+++ b/contrib/clickhouse/src/IO/BoundedReadBuffer.cpp
@@ -0,0 +1,66 @@
+#include "BoundedReadBuffer.h"
+#include <IO/SwapHelper.h>
+
+namespace DB
+{
+
+BoundedReadBuffer::BoundedReadBuffer(std::unique_ptr<SeekableReadBuffer> impl_)
+ : ReadBufferFromFileDecorator(std::move(impl_))
+{
+}
+
+void BoundedReadBuffer::setReadUntilPosition(size_t position)
+{
+ read_until_position = position;
+}
+
+void BoundedReadBuffer::setReadUntilEnd()
+{
+ read_until_position.reset();
+}
+
+off_t BoundedReadBuffer::getPosition()
+{
+ return file_offset_of_buffer_end - (working_buffer.end() - pos);
+}
+
+bool BoundedReadBuffer::nextImpl()
+{
+ if (read_until_position && file_offset_of_buffer_end == *read_until_position)
+ return false;
+
+ bool result;
+ {
+ SwapHelper swap(*this, *impl);
+ result = impl->next();
+ }
+ chassert(file_offset_of_buffer_end + available() == impl->getFileOffsetOfBufferEnd());
+ if (result && read_until_position)
+ {
+ size_t remaining_size_to_read = *read_until_position - file_offset_of_buffer_end;
+ if (working_buffer.size() > remaining_size_to_read)
+ {
+ /// file: [______________________________]
+ /// working buffer: [_______________]
+ /// ^
+ /// read_until_position
+ /// ^
+ /// file_offset_of_buffer_end
+ working_buffer.resize(remaining_size_to_read);
+ }
+ }
+ file_offset_of_buffer_end += available();
+ return result;
+}
+
+off_t BoundedReadBuffer::seek(off_t off, int whence)
+{
+ swap(*impl);
+ auto result = impl->seek(off, whence);
+ swap(*impl);
+
+ file_offset_of_buffer_end = impl->getFileOffsetOfBufferEnd();
+ return result;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/BoundedReadBuffer.h b/contrib/clickhouse/src/IO/BoundedReadBuffer.h
new file mode 100644
index 0000000000..eb65857e83
--- /dev/null
+++ b/contrib/clickhouse/src/IO/BoundedReadBuffer.h
@@ -0,0 +1,38 @@
+#pragma once
+#include <IO/ReadBufferFromFileDecorator.h>
+
+
+namespace DB
+{
+
+/// A buffer which makes an underlying buffer right-bounded,
+/// i.e. the buffer cannot return data beyond the offset specified in `setReadUntilPosition`.
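+///
+/// E.g. (an illustrative sketch): expose only the first 100 bytes of an underlying buffer.
+///
+///     BoundedReadBuffer bounded(std::move(impl));
+///     bounded.setReadUntilPosition(100);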
+class BoundedReadBuffer : public ReadBufferFromFileDecorator
+{
+public:
+ explicit BoundedReadBuffer(std::unique_ptr<SeekableReadBuffer> impl_);
+
+ bool supportsRightBoundedReads() const override { return true; }
+
+ void setReadUntilPosition(size_t position) override;
+
+ void setReadUntilEnd() override;
+
+ bool nextImpl() override;
+
+ off_t seek(off_t off, int whence) override;
+
+ size_t getFileOffsetOfBufferEnd() const override { return file_offset_of_buffer_end; }
+
+ /// file_offset_of_buffer_end can differ from impl's file_offset_of_buffer_end
+ /// because of resizing of the tail. => Need to also override getPosition() as
+ /// it uses file_offset_of_buffer_end.
+ off_t getPosition() override;
+
+private:
+ std::optional<size_t> read_until_position;
+    /// Atomic because it can be used in log or exception messages while being updated.
+ std::atomic<size_t> file_offset_of_buffer_end = 0;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/BrotliReadBuffer.cpp b/contrib/clickhouse/src/IO/BrotliReadBuffer.cpp
new file mode 100644
index 0000000000..effdfb4b8c
--- /dev/null
+++ b/contrib/clickhouse/src/IO/BrotliReadBuffer.cpp
@@ -0,0 +1,109 @@
+#include "clickhouse_config.h"
+
+#if USE_BROTLI
+# error #include <brotli/decode.h>
+# include "BrotliReadBuffer.h"
+# include <IO/WithFileName.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int BROTLI_READ_FAILED;
+}
+
+
+class BrotliReadBuffer::BrotliStateWrapper
+{
+public:
+ BrotliStateWrapper()
+ : state(BrotliDecoderCreateInstance(nullptr, nullptr, nullptr))
+ , result(BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT)
+ {
+ }
+
+ ~BrotliStateWrapper()
+ {
+ BrotliDecoderDestroyInstance(state);
+ }
+
+ BrotliDecoderState * state;
+ BrotliDecoderResult result;
+};
+
+BrotliReadBuffer::BrotliReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t buf_size, char *existing_memory, size_t alignment)
+ : CompressedReadBufferWrapper(std::move(in_), buf_size, existing_memory, alignment)
+ , brotli(std::make_unique<BrotliStateWrapper>())
+ , in_available(0)
+ , in_data(nullptr)
+ , out_capacity(0)
+ , out_data(nullptr)
+ , eof_flag(false)
+{
+}
+
+BrotliReadBuffer::~BrotliReadBuffer() = default;
+
+bool BrotliReadBuffer::nextImpl()
+{
+ if (eof_flag)
+ return false;
+
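+ /// Decompression loop: refill the input window from `in` whenever it is
+ /// drained; stop once the decoder has produced some output or no longer
+ /// asks for more input.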
+ do
+ {
+ if (!in_available)
+ {
+ in->nextIfAtEnd();
+ in_available = in->buffer().end() - in->position();
+ in_data = reinterpret_cast<uint8_t *>(in->position());
+ }
+
+ if (brotli->result == BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT && (!in_available || in->eof()))
+ {
+ throw Exception(
+ ErrorCodes::BROTLI_READ_FAILED,
+ "brotli decode error{}",
+ getExceptionEntryWithFileName(*in));
+ }
+
+ out_capacity = internal_buffer.size();
+ out_data = reinterpret_cast<uint8_t *>(internal_buffer.begin());
+
+ brotli->result = BrotliDecoderDecompressStream(brotli->state, &in_available, &in_data, &out_capacity, &out_data, nullptr);
+
+ in->position() = in->buffer().end() - in_available;
+ }
+ while (brotli->result == BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT && out_capacity == internal_buffer.size());
+
+ working_buffer.resize(internal_buffer.size() - out_capacity);
+
+ if (brotli->result == BROTLI_DECODER_RESULT_SUCCESS)
+ {
+ if (in->eof())
+ {
+ eof_flag = true;
+ return !working_buffer.empty();
+ }
+ else
+ {
+ throw Exception(
+ ErrorCodes::BROTLI_READ_FAILED,
+ "brotli decode error{}",
+ getExceptionEntryWithFileName(*in));
+ }
+ }
+
+ if (brotli->result == BROTLI_DECODER_RESULT_ERROR)
+ {
+ throw Exception(
+ ErrorCodes::BROTLI_READ_FAILED,
+ "brotli decode error{}",
+ getExceptionEntryWithFileName(*in));
+ }
+
+ return true;
+}
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/BrotliReadBuffer.h b/contrib/clickhouse/src/IO/BrotliReadBuffer.h
new file mode 100644
index 0000000000..8583d6892e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/BrotliReadBuffer.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+#include <IO/CompressedReadBufferWrapper.h>
+
+
+namespace DB
+{
+
+class BrotliReadBuffer : public CompressedReadBufferWrapper
+{
+public:
+ explicit BrotliReadBuffer(
+ std::unique_ptr<ReadBuffer> in_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~BrotliReadBuffer() override;
+
+private:
+ bool nextImpl() override;
+
+ class BrotliStateWrapper;
+ std::unique_ptr<BrotliStateWrapper> brotli;
+
+ size_t in_available;
+ const uint8_t * in_data;
+
+ size_t out_capacity;
+ uint8_t * out_data;
+
+ bool eof_flag;
+};
+
+}
+
diff --git a/contrib/clickhouse/src/IO/BrotliWriteBuffer.cpp b/contrib/clickhouse/src/IO/BrotliWriteBuffer.cpp
new file mode 100644
index 0000000000..6ec427049c
--- /dev/null
+++ b/contrib/clickhouse/src/IO/BrotliWriteBuffer.cpp
@@ -0,0 +1,126 @@
+#include "clickhouse_config.h"
+
+#if USE_BROTLI
+# include <IO/BrotliWriteBuffer.h>
+# error #include <brotli/encode.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int BROTLI_WRITE_FAILED;
+}
+
+
+class BrotliWriteBuffer::BrotliStateWrapper
+{
+public:
+ BrotliStateWrapper()
+ : state(BrotliEncoderCreateInstance(nullptr, nullptr, nullptr))
+ {
+ }
+
+ ~BrotliStateWrapper()
+ {
+ BrotliEncoderDestroyInstance(state);
+ }
+
+ BrotliEncoderState * state;
+};
+
+BrotliWriteBuffer::BrotliWriteBuffer(std::unique_ptr<WriteBuffer> out_, int compression_level, size_t buf_size, char * existing_memory, size_t alignment)
+ : WriteBufferWithOwnMemoryDecorator(std::move(out_), buf_size, existing_memory, alignment)
+ , brotli(std::make_unique<BrotliStateWrapper>())
+ , in_available(0)
+ , in_data(nullptr)
+ , out_capacity(0)
+ , out_data(nullptr)
+{
+ BrotliEncoderSetParameter(brotli->state, BROTLI_PARAM_QUALITY, static_cast<uint32_t>(compression_level));
+ // Set the LZ77 window size. According to the brotli sources, the default value is 24 (c/tools/brotli.c:81).
+ BrotliEncoderSetParameter(brotli->state, BROTLI_PARAM_LGWIN, 24);
+}
+
+BrotliWriteBuffer::~BrotliWriteBuffer() = default;
+
+void BrotliWriteBuffer::nextImpl()
+{
+ if (!offset())
+ {
+ return;
+ }
+
+ in_data = reinterpret_cast<unsigned char *>(working_buffer.begin());
+ in_available = offset();
+
+ try
+ {
+ do
+ {
+ out->nextIfAtEnd();
+ out_data = reinterpret_cast<unsigned char *>(out->position());
+ out_capacity = out->buffer().end() - out->position();
+
+ int result = BrotliEncoderCompressStream(
+ brotli->state,
+ in_available ? BROTLI_OPERATION_PROCESS : BROTLI_OPERATION_FINISH,
+ &in_available,
+ &in_data,
+ &out_capacity,
+ &out_data,
+ nullptr);
+
+ out->position() = out->buffer().end() - out_capacity;
+
+ if (result == 0)
+ {
+ throw Exception(ErrorCodes::BROTLI_WRITE_FAILED, "brotli compress failed");
+ }
+ }
+ while (in_available > 0);
+ }
+ catch (...)
+ {
+ /// Do not try to write next time after exception.
+ out->position() = out->buffer().begin();
+ throw;
+ }
+}
+
+void BrotliWriteBuffer::finalizeBefore()
+{
+ next();
+
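+ /// Flush the encoder: keep feeding BROTLI_OPERATION_FINISH with no new input
+ /// until BrotliEncoderIsFinished() confirms that all pending output was emitted.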
+ while (true)
+ {
+ out->nextIfAtEnd();
+ out_data = reinterpret_cast<unsigned char *>(out->position());
+ out_capacity = out->buffer().end() - out->position();
+
+ int result = BrotliEncoderCompressStream(
+ brotli->state,
+ BROTLI_OPERATION_FINISH,
+ &in_available,
+ &in_data,
+ &out_capacity,
+ &out_data,
+ nullptr);
+
+ out->position() = out->buffer().end() - out_capacity;
+
+ if (BrotliEncoderIsFinished(brotli->state))
+ {
+ return;
+ }
+
+ if (result == 0)
+ {
+ throw Exception(ErrorCodes::BROTLI_WRITE_FAILED, "brotli compress failed");
+ }
+ }
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/BrotliWriteBuffer.h b/contrib/clickhouse/src/IO/BrotliWriteBuffer.h
new file mode 100644
index 0000000000..8cbc78bd9e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/BrotliWriteBuffer.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <IO/WriteBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/WriteBufferDecorator.h>
+
+namespace DB
+{
+
+class BrotliWriteBuffer : public WriteBufferWithOwnMemoryDecorator
+{
+public:
+ BrotliWriteBuffer(
+ std::unique_ptr<WriteBuffer> out_,
+ int compression_level,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~BrotliWriteBuffer() override;
+
+private:
+ void nextImpl() override;
+
+ void finalizeBefore() override;
+
+ class BrotliStateWrapper;
+ std::unique_ptr<BrotliStateWrapper> brotli;
+
+
+ size_t in_available;
+ const uint8_t * in_data;
+
+ size_t out_capacity;
+ uint8_t * out_data;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/BufferBase.h b/contrib/clickhouse/src/IO/BufferBase.h
new file mode 100644
index 0000000000..7a59687fa5
--- /dev/null
+++ b/contrib/clickhouse/src/IO/BufferBase.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <Core/Defines.h>
+#include <algorithm>
+
+
+namespace DB
+{
+
+
+/** Base class for ReadBuffer and WriteBuffer.
+ * Contains common types, variables, and functions.
+ *
+ * ReadBuffer and WriteBuffer are similar to istream and ostream, respectively.
+ * They are used instead of iostreams because some operations cannot be implemented efficiently with iostreams.
+ * For example, with an istream you cannot quickly read string values from a tab-separated file
+ * so that the position ends up immediately after the read value.
+ * (The only option is to call std::istream::get() for each byte, which is slow due to several virtual calls per byte.)
+ *
+ * Read/WriteBuffers provide direct access to the internal buffer, so the necessary operations are implemented more efficiently.
+ * Only one virtual function, nextImpl(), is used, and it is called rarely:
+ * - for a ReadBuffer, it fills the buffer with new data from the source;
+ * - for a WriteBuffer, it writes data from the buffer into the receiver.
+ *
+ * A Read/WriteBuffer may or may not own its piece of memory.
+ * In the latter case, it can read from an existing piece of memory / std::string efficiently, without copying it.
+ */
+class BufferBase
+{
+public:
+ /** Cursor in the buffer. The position of write or read. */
+ using Position = char *;
+
+ /** A reference to the range of memory. */
+ struct Buffer
+ {
+ Buffer(Position begin_pos_, Position end_pos_) : begin_pos(begin_pos_), end_pos(end_pos_) {}
+
+ inline Position begin() const { return begin_pos; }
+ inline Position end() const { return end_pos; }
+ inline size_t size() const { return size_t(end_pos - begin_pos); }
+ inline void resize(size_t size) { end_pos = begin_pos + size; }
+ inline bool empty() const { return size() == 0; }
+
+ inline void swap(Buffer & other)
+ {
+ std::swap(begin_pos, other.begin_pos);
+ std::swap(end_pos, other.end_pos);
+ }
+
+ private:
+ Position begin_pos;
+ Position end_pos; /// 1 byte after the end of the buffer
+ };
+
+ /** The constructor takes a range of memory to use for the buffer.
+ * offset - the starting point of the cursor. ReadBuffer must set it to the end of the range, and WriteBuffer - to the beginning.
+ */
+ BufferBase(Position ptr, size_t size, size_t offset)
+ : pos(ptr + offset), working_buffer(ptr, ptr + size), internal_buffer(ptr, ptr + size) {}
+
+ void set(Position ptr, size_t size, size_t offset)
+ {
+ internal_buffer = Buffer(ptr, ptr + size);
+ working_buffer = Buffer(ptr, ptr + size);
+ pos = ptr + offset;
+ }
+
+ /// get buffer
+ inline Buffer & internalBuffer() { return internal_buffer; }
+
+ /// get the part of the buffer from which you can read / write data
+ inline Buffer & buffer() { return working_buffer; }
+
+ /// get (for reading and modifying) the position in the buffer
+ inline Position & position() { return pos; }
+
+ /// offset in bytes of the cursor from the beginning of the buffer
+ inline size_t offset() const { return size_t(pos - working_buffer.begin()); }
+
+ /// How many bytes are available for read/write
+ inline size_t available() const { return size_t(working_buffer.end() - pos); }
+
+ inline void swap(BufferBase & other)
+ {
+ internal_buffer.swap(other.internal_buffer);
+ working_buffer.swap(other.working_buffer);
+ std::swap(pos, other.pos);
+ }
+
+ /** How many bytes have been read/written, counting those that are still in the buffer. */
+ size_t count() const { return bytes + offset(); }
+
+ /** Check whether there are more bytes in the buffer after the cursor. */
+ bool ALWAYS_INLINE hasPendingData() const { return available() > 0; }
+
+ bool isPadded() const { return padded; }
+
+protected:
+ void resetWorkingBuffer()
+ {
+ /// Move position to the end of buffer to trigger call of 'next' on next reading.
+ /// Discard all data in current working buffer to prevent wrong assumptions on content
+ /// of buffer, e.g. for optimizations of seeks in seekable buffers.
+ working_buffer.resize(0);
+ pos = working_buffer.end();
+ }
+
+ /// Read/write position.
+ Position pos;
+
+ /** How many bytes have been read/written, not counting those that are now in the buffer.
+ * (counting those that were already used and "removed" from the buffer)
+ */
+ size_t bytes = 0;
+
+ /** A piece of memory that you can use.
+ * For example, if internal_buffer is 1 MB and only 10 bytes were loaded from a file into the buffer,
+ * then working_buffer will be 10 bytes in size
+ * (working_buffer.end() will point to the position immediately after the 10 bytes that can be read).
+ */
+ Buffer working_buffer;
+
+ /// A reference to a piece of memory for the buffer.
+ Buffer internal_buffer;
+
+ /// Indicates that the memory has pad_right (15 bytes) of padding past the end.
+ bool padded{false};
+};
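+
+/// Illustrative sketch of the typical read loop built on top of these primitives
+/// (assuming a ReadBuffer `buf`; `process` is a hypothetical consumer):
+///
+///     while (!buf.eof())                    /// eof() triggers next() when drained
+///     {
+///         size_t n = buf.available();
+///         process(buf.position(), n);       /// direct access to the buffer, no copying
+///         buf.position() += n;
+///     }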
+
+
+}
diff --git a/contrib/clickhouse/src/IO/BufferWithOwnMemory.h b/contrib/clickhouse/src/IO/BufferWithOwnMemory.h
new file mode 100644
index 0000000000..39c83e9167
--- /dev/null
+++ b/contrib/clickhouse/src/IO/BufferWithOwnMemory.h
@@ -0,0 +1,196 @@
+#pragma once
+
+#include <boost/noncopyable.hpp>
+
+#include <Common/ProfileEvents.h>
+#include <Common/Allocator.h>
+
+#include <Common/Exception.h>
+#include <Core/Defines.h>
+
+#include <base/arithmeticOverflow.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event IOBufferAllocs;
+ extern const Event IOBufferAllocBytes;
+}
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ARGUMENT_OUT_OF_BOUND;
+}
+
+
+/** Replacement for std::vector<char> to use in buffers.
+ * Differs in that it doesn't do unneeded memset. (And also tries to do as little as possible.)
+ * Also allows to allocate aligned piece of memory (to use with O_DIRECT, for example).
+ */
+template <typename Allocator = Allocator<false>>
+struct Memory : boost::noncopyable, Allocator
+{
+ static constexpr size_t pad_right = PADDING_FOR_SIMD - 1;
+
+ size_t m_capacity = 0; /// With padding.
+ size_t m_size = 0;
+ char * m_data = nullptr;
+ size_t alignment = 0;
+
+ Memory() = default;
+
+ /// If alignment != 0, then allocate memory aligned to specified value.
+ explicit Memory(size_t size_, size_t alignment_ = 0) : alignment(alignment_)
+ {
+ alloc(size_);
+ }
+
+ ~Memory()
+ {
+ dealloc();
+ }
+
+ void swap(Memory & rhs) noexcept
+ {
+ std::swap(m_capacity, rhs.m_capacity);
+ std::swap(m_size, rhs.m_size);
+ std::swap(m_data, rhs.m_data);
+ std::swap(alignment, rhs.alignment);
+ }
+
+ Memory(Memory && rhs) noexcept
+ {
+ swap(rhs);
+ }
+
+ Memory & operator=(Memory && rhs) noexcept
+ {
+ swap(rhs);
+ return *this;
+ }
+
+ size_t size() const { return m_size; }
+ const char & operator[](size_t i) const { return m_data[i]; }
+ char & operator[](size_t i) { return m_data[i]; }
+ const char * data() const { return m_data; }
+ char * data() { return m_data; }
+
+ void resize(size_t new_size)
+ {
+ if (!m_data)
+ {
+ alloc(new_size);
+ return;
+ }
+
+ if (new_size <= m_capacity - pad_right)
+ {
+ m_size = new_size;
+ return;
+ }
+
+ size_t new_capacity = withPadding(new_size);
+
+ size_t diff = new_capacity - m_capacity;
+ ProfileEvents::increment(ProfileEvents::IOBufferAllocBytes, diff);
+
+ m_data = static_cast<char *>(Allocator::realloc(m_data, m_capacity, new_capacity, alignment));
+ m_capacity = new_capacity;
+ m_size = new_size;
+ }
+
+private:
+ static size_t withPadding(size_t value)
+ {
+ size_t res = 0;
+
+ if (common::addOverflow<size_t>(value, pad_right, res))
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "value is too big to apply padding");
+
+ return res;
+ }
+
+ void alloc(size_t new_size)
+ {
+ if (!new_size)
+ {
+ m_data = nullptr;
+ return;
+ }
+
+ size_t new_capacity = withPadding(new_size);
+
+ ProfileEvents::increment(ProfileEvents::IOBufferAllocs);
+ ProfileEvents::increment(ProfileEvents::IOBufferAllocBytes, new_capacity);
+
+ m_data = static_cast<char *>(Allocator::alloc(new_capacity, alignment));
+ m_capacity = new_capacity;
+ m_size = new_size;
+ }
+
+ void dealloc()
+ {
+ if (!m_data)
+ return;
+
+ Allocator::free(m_data, m_capacity);
+ m_data = nullptr; /// To avoid double free if next alloc will throw an exception.
+ }
+};
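+
+/// Illustrative sketch (using only what is defined above): the buffer is always
+/// allocated with pad_right extra bytes, so SIMD code may read slightly past the
+/// logical end:
+///
+///     Memory<> mem(1024);   /// size() == 1024, capacity is 1024 + pad_right
+///     mem.resize(4096);     /// reallocates and preserves the contents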
+
+
+/** Buffer that could own its working memory.
+ * Template parameter: ReadBuffer or WriteBuffer
+ */
+template <typename Base>
+class BufferWithOwnMemory : public Base
+{
+protected:
+ Memory<> memory;
+public:
+ /// If a non-null 'existing_memory' is passed, the buffer will not create its own memory and will use existing_memory without taking ownership.
+ explicit BufferWithOwnMemory(size_t size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0)
+ : Base(nullptr, 0), memory(existing_memory ? 0 : size, alignment)
+ {
+ Base::set(existing_memory ? existing_memory : memory.data(), size);
+ Base::padded = !existing_memory;
+ }
+};
+
+
+/** Buffer that could write data to external memory which came from outside
+ * Template parameter: ReadBuffer or WriteBuffer
+ */
+template <typename Base>
+class BufferWithOutsideMemory : public Base
+{
+protected:
+ Memory<> & memory;
+public:
+
+ explicit BufferWithOutsideMemory(Memory<> & memory_)
+ : Base(memory_.data(), memory_.size()), memory(memory_)
+ {
+ Base::set(memory.data(), memory.size(), 0);
+ Base::padded = false;
+ }
+
+ size_t getActualSize()
+ {
+ return Base::count();
+ }
+
+private:
+ void nextImpl() final
+ {
+ const size_t prev_size = Base::position() - memory.data();
+ memory.resize(2 * prev_size + 1);
+ Base::set(memory.data() + prev_size, memory.size() - prev_size, 0);
+ }
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/Bzip2ReadBuffer.cpp b/contrib/clickhouse/src/IO/Bzip2ReadBuffer.cpp
new file mode 100644
index 0000000000..a08367dedc
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Bzip2ReadBuffer.cpp
@@ -0,0 +1,139 @@
+#include "clickhouse_config.h"
+
+#if USE_BZIP2
+# include <IO/Bzip2ReadBuffer.h>
+# error #include <bzlib.h>
+# include <IO/WithFileName.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int BZIP2_STREAM_DECODER_FAILED;
+ extern const int UNEXPECTED_END_OF_FILE;
+}
+
+
+class Bzip2ReadBuffer::Bzip2StateWrapper
+{
+public:
+ Bzip2StateWrapper()
+ {
+ memset(&stream, 0, sizeof(stream));
+
+ int ret = BZ2_bzDecompressInit(&stream, 0, 0);
+
+ if (ret != BZ_OK)
+ throw Exception(
+ ErrorCodes::BZIP2_STREAM_DECODER_FAILED,
+ "bzip2 stream encoder init failed: error code: {}",
+ ret);
+ }
+
+ ~Bzip2StateWrapper()
+ {
+ BZ2_bzDecompressEnd(&stream);
+ }
+
+ void reinitialize()
+ {
+ auto avail_out = stream.avail_out;
+ auto * next_out = stream.next_out;
+
+ int ret = BZ2_bzDecompressEnd(&stream);
+
+ if (ret != BZ_OK)
+ throw Exception(
+ ErrorCodes::BZIP2_STREAM_DECODER_FAILED,
+ "bzip2 stream encoder reinit decompress end failed: error code: {}",
+ ret);
+
+ memset(&stream, 0, sizeof(bz->stream));
+
+ ret = BZ2_bzDecompressInit(&stream, 0, 0);
+
+ if (ret != BZ_OK)
+ throw Exception(
+ ErrorCodes::BZIP2_STREAM_DECODER_FAILED,
+ "bzip2 stream encoder reinit failed: error code: {}",
+ ret);
+
+ stream.avail_out = avail_out;
+ stream.next_out = next_out;
+ }
+
+ bz_stream stream;
+};
+
+Bzip2ReadBuffer::Bzip2ReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t buf_size, char *existing_memory, size_t alignment)
+ : CompressedReadBufferWrapper(std::move(in_), buf_size, existing_memory, alignment)
+ , bz(std::make_unique<Bzip2StateWrapper>())
+ , eof_flag(false)
+{
+}
+
+Bzip2ReadBuffer::~Bzip2ReadBuffer() = default;
+
+bool Bzip2ReadBuffer::nextImpl()
+{
+ if (eof_flag)
+ return false;
+
+ int ret;
+ do
+ {
+ if (!bz->stream.avail_in)
+ {
+ in->nextIfAtEnd();
+ bz->stream.avail_in = static_cast<unsigned>(in->buffer().end() - in->position());
+ bz->stream.next_in = in->position();
+ }
+
+ bz->stream.avail_out = static_cast<unsigned>(internal_buffer.size());
+ bz->stream.next_out = internal_buffer.begin();
+
+ ret = BZ2_bzDecompress(&bz->stream);
+
+ in->position() = in->buffer().end() - bz->stream.avail_in;
+
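+ /// A bzip2 file may contain several concatenated streams (e.g. the output
+ /// of pbzip2); in that case restart the decoder to continue with the next stream.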
+ if (ret == BZ_STREAM_END && !in->eof())
+ {
+ bz->reinitialize();
+ bz->stream.avail_in = static_cast<unsigned>(in->buffer().end() - in->position());
+ bz->stream.next_in = in->position();
+
+ ret = BZ_OK;
+ }
+ }
+ while (bz->stream.avail_out == internal_buffer.size() && ret == BZ_OK && !in->eof());
+
+ working_buffer.resize(internal_buffer.size() - bz->stream.avail_out);
+
+ if (ret == BZ_STREAM_END && in->eof())
+ {
+ eof_flag = true;
+ return !working_buffer.empty();
+ }
+
+ if (ret != BZ_OK)
+ throw Exception(
+ ErrorCodes::BZIP2_STREAM_DECODER_FAILED,
+ "bzip2 stream decoder failed: error code: {}{}",
+ ret,
+ getExceptionEntryWithFileName(*in));
+
+ if (in->eof())
+ {
+ eof_flag = true;
+ throw Exception(
+ ErrorCodes::UNEXPECTED_END_OF_FILE,
+ "Unexpected end of bzip2 archive{}",
+ getExceptionEntryWithFileName(*in));
+ }
+
+ return true;
+}
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/Bzip2ReadBuffer.h b/contrib/clickhouse/src/IO/Bzip2ReadBuffer.h
new file mode 100644
index 0000000000..9131bf780b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Bzip2ReadBuffer.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+#include <IO/CompressedReadBufferWrapper.h>
+
+
+namespace DB
+{
+
+class Bzip2ReadBuffer : public CompressedReadBufferWrapper
+{
+public:
+ explicit Bzip2ReadBuffer(
+ std::unique_ptr<ReadBuffer> in_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~Bzip2ReadBuffer() override;
+
+private:
+ bool nextImpl() override;
+
+ class Bzip2StateWrapper;
+ std::unique_ptr<Bzip2StateWrapper> bz;
+
+ bool eof_flag;
+};
+
+}
+
diff --git a/contrib/clickhouse/src/IO/Bzip2WriteBuffer.cpp b/contrib/clickhouse/src/IO/Bzip2WriteBuffer.cpp
new file mode 100644
index 0000000000..6bcbd872a3
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Bzip2WriteBuffer.cpp
@@ -0,0 +1,110 @@
+#include "clickhouse_config.h"
+
+#if USE_BZIP2
+# include <IO/Bzip2WriteBuffer.h>
+# error #include <bzlib.h>
+
+#include <Common/MemoryTracker.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int BZIP2_STREAM_ENCODER_FAILED;
+}
+
+
+class Bzip2WriteBuffer::Bzip2StateWrapper
+{
+public:
+ explicit Bzip2StateWrapper(int compression_level)
+ {
+ memset(&stream, 0, sizeof(stream));
+
+ int ret = BZ2_bzCompressInit(&stream, compression_level, 0, 0);
+
+ if (ret != BZ_OK)
+ throw Exception(
+ ErrorCodes::BZIP2_STREAM_ENCODER_FAILED,
+ "bzip2 stream encoder init failed: error code: {}",
+ ret);
+ }
+
+ ~Bzip2StateWrapper()
+ {
+ BZ2_bzCompressEnd(&stream);
+ }
+
+ bz_stream stream;
+};
+
+Bzip2WriteBuffer::Bzip2WriteBuffer(std::unique_ptr<WriteBuffer> out_, int compression_level, size_t buf_size, char * existing_memory, size_t alignment)
+ : WriteBufferWithOwnMemoryDecorator(std::move(out_), buf_size, existing_memory, alignment)
+ , bz(std::make_unique<Bzip2StateWrapper>(compression_level))
+{
+}
+
+Bzip2WriteBuffer::~Bzip2WriteBuffer() = default;
+
+void Bzip2WriteBuffer::nextImpl()
+{
+ if (!offset())
+ {
+ return;
+ }
+
+ bz->stream.next_in = working_buffer.begin();
+ bz->stream.avail_in = static_cast<unsigned>(offset());
+
+ try
+ {
+ do
+ {
+ out->nextIfAtEnd();
+ bz->stream.next_out = out->position();
+ bz->stream.avail_out = static_cast<unsigned>(out->buffer().end() - out->position());
+
+ int ret = BZ2_bzCompress(&bz->stream, BZ_RUN);
+
+ out->position() = out->buffer().end() - bz->stream.avail_out;
+
+ if (ret != BZ_RUN_OK)
+ throw Exception(
+ ErrorCodes::BZIP2_STREAM_ENCODER_FAILED,
+ "bzip2 stream encoder failed: error code: {}",
+ ret);
+
+ }
+ while (bz->stream.avail_in > 0);
+ }
+ catch (...)
+ {
+ /// Do not try to write next time after exception.
+ out->position() = out->buffer().begin();
+ throw;
+ }
+}
+
+void Bzip2WriteBuffer::finalizeBefore()
+{
+ next();
+
+ out->nextIfAtEnd();
+ bz->stream.next_out = out->position();
+ bz->stream.avail_out = static_cast<unsigned>(out->buffer().end() - out->position());
+
+ int ret = BZ2_bzCompress(&bz->stream, BZ_FINISH);
+
+ out->position() = out->buffer().end() - bz->stream.avail_out;
+
+ if (ret != BZ_STREAM_END && ret != BZ_FINISH_OK)
+ throw Exception(
+ ErrorCodes::BZIP2_STREAM_ENCODER_FAILED,
+ "bzip2 stream encoder failed: error code: {}",
+ ret);
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/Bzip2WriteBuffer.h b/contrib/clickhouse/src/IO/Bzip2WriteBuffer.h
new file mode 100644
index 0000000000..d037190348
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Bzip2WriteBuffer.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <IO/WriteBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/WriteBufferDecorator.h>
+
+namespace DB
+{
+
+class Bzip2WriteBuffer : public WriteBufferWithOwnMemoryDecorator
+{
+public:
+ Bzip2WriteBuffer(
+ std::unique_ptr<WriteBuffer> out_,
+ int compression_level,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~Bzip2WriteBuffer() override;
+
+private:
+ void nextImpl() override;
+
+ void finalizeBefore() override;
+
+ class Bzip2StateWrapper;
+ std::unique_ptr<Bzip2StateWrapper> bz;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/CascadeWriteBuffer.cpp b/contrib/clickhouse/src/IO/CascadeWriteBuffer.cpp
new file mode 100644
index 0000000000..91a42e77fd
--- /dev/null
+++ b/contrib/clickhouse/src/IO/CascadeWriteBuffer.cpp
@@ -0,0 +1,119 @@
+#include <IO/CascadeWriteBuffer.h>
+#include <IO/MemoryReadWriteBuffer.h>
+#include <Common/Exception.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CURRENT_WRITE_BUFFER_IS_EXHAUSTED;
+ extern const int CANNOT_WRITE_AFTER_END_OF_BUFFER;
+ extern const int CANNOT_CREATE_IO_BUFFER;
+}
+
+CascadeWriteBuffer::CascadeWriteBuffer(WriteBufferPtrs && prepared_sources_, WriteBufferConstructors && lazy_sources_)
+ : WriteBuffer(nullptr, 0), prepared_sources(std::move(prepared_sources_)), lazy_sources(std::move(lazy_sources_))
+{
+ first_lazy_source_num = prepared_sources.size();
+ num_sources = first_lazy_source_num + lazy_sources.size();
+
+ /// fill lazy sources by nullptr
+ prepared_sources.resize(num_sources);
+
+ curr_buffer_num = 0;
+ curr_buffer = setNextBuffer();
+ set(curr_buffer->buffer().begin(), curr_buffer->buffer().size());
+}
+
+
+void CascadeWriteBuffer::nextImpl()
+{
+ if (!curr_buffer)
+ return;
+ try
+ {
+ curr_buffer->position() = position();
+ curr_buffer->next();
+ }
+ catch (const MemoryWriteBuffer::CurrentBufferExhausted &)
+ {
+ if (curr_buffer_num < num_sources)
+ {
+ /// TODO: protocol should require set(position(), 0) before Exception
+
+ /// good situation, fetch next WriteBuffer
+ ++curr_buffer_num;
+ curr_buffer = setNextBuffer();
+ }
+ else
+ throw Exception(ErrorCodes::CURRENT_WRITE_BUFFER_IS_EXHAUSTED, "MemoryWriteBuffer limit is exhausted");
+ }
+
+ set(curr_buffer->position(), curr_buffer->buffer().end() - curr_buffer->position());
+}
+
+
+void CascadeWriteBuffer::getResultBuffers(WriteBufferPtrs & res)
+{
+ finalize();
+
+ /// Sync position with underlying buffer before invalidating
+ curr_buffer->position() = position();
+
+ res = std::move(prepared_sources);
+
+ curr_buffer = nullptr;
+ curr_buffer_num = num_sources = 0;
+ prepared_sources.clear();
+ lazy_sources.clear();
+}
+
+void CascadeWriteBuffer::finalizeImpl()
+{
+ if (curr_buffer)
+ curr_buffer->position() = position();
+
+ for (auto & buf : prepared_sources)
+ {
+ if (buf)
+ {
+ buf->finalize();
+ }
+ }
+}
+
+WriteBuffer * CascadeWriteBuffer::setNextBuffer()
+{
+ if (first_lazy_source_num <= curr_buffer_num && curr_buffer_num < num_sources)
+ {
+ if (!prepared_sources[curr_buffer_num])
+ {
+ WriteBufferPtr prev_buf = (curr_buffer_num > 0) ? prepared_sources[curr_buffer_num - 1] : nullptr;
+ prepared_sources[curr_buffer_num] = lazy_sources[curr_buffer_num - first_lazy_source_num](prev_buf);
+ }
+ }
+ else if (curr_buffer_num >= num_sources)
+ throw Exception(ErrorCodes::CANNOT_WRITE_AFTER_END_OF_BUFFER, "There are no WriteBuffers to write result");
+
+ WriteBuffer * res = prepared_sources[curr_buffer_num].get();
+ if (!res)
+ throw Exception(ErrorCodes::CANNOT_CREATE_IO_BUFFER, "Required WriteBuffer is not created");
+
+ /// Check that returned buffer isn't empty
+ if (!res->hasPendingData())
+ res->next();
+
+ return res;
+}
+
+
+CascadeWriteBuffer::~CascadeWriteBuffer()
+{
+ /// Sync position with underlying buffer before exit
+ if (curr_buffer)
+ curr_buffer->position() = position();
+}
+
+
+}
diff --git a/contrib/clickhouse/src/IO/CascadeWriteBuffer.h b/contrib/clickhouse/src/IO/CascadeWriteBuffer.h
new file mode 100644
index 0000000000..a003d11bd8
--- /dev/null
+++ b/contrib/clickhouse/src/IO/CascadeWriteBuffer.h
@@ -0,0 +1,63 @@
+#pragma once
+#include <functional>
+#include <IO/WriteBuffer.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+}
+
+/* The buffer is similar to ConcatReadBuffer, but writes data.
+ *
+ * It has a sequence of WriteBuffers: [prepared_sources, lazy_sources]
+ * (lazy_sources contains not the pointers themselves, but their delayed constructors).
+ *
+ * First, CascadeWriteBuffer redirects data to the first buffer of the sequence.
+ * If the current WriteBuffer cannot receive any more data, it throws the special exception
+ * MemoryWriteBuffer::CurrentBufferExhausted in the body of nextImpl(); CascadeWriteBuffer then
+ * prepares the next buffer and redirects data to it. If no buffers remain, CascadeWriteBuffer
+ * throws an exception (see the usage sketch after the class).
+ *
+ * NOTE: If you use one of the underlying WriteBuffers outside, you need to sync its position()
+ * with CascadeWriteBuffer's position(). The sync is performed in nextImpl(), getResultBuffers()
+ * and the destructor.
+ */
+class CascadeWriteBuffer : public WriteBuffer
+{
+public:
+
+ using WriteBufferPtrs = std::vector<WriteBufferPtr>;
+ using WriteBufferConstructor = std::function<WriteBufferPtr (const WriteBufferPtr & prev_buf)>;
+ using WriteBufferConstructors = std::vector<WriteBufferConstructor>;
+
+ explicit CascadeWriteBuffer(WriteBufferPtrs && prepared_sources_, WriteBufferConstructors && lazy_sources_ = {});
+
+ void nextImpl() override;
+
+ /// Should be called once
+ void getResultBuffers(WriteBufferPtrs & res);
+
+ const WriteBuffer * getCurrentBuffer() const
+ {
+ return curr_buffer;
+ }
+
+ ~CascadeWriteBuffer() override;
+
+private:
+
+ void finalizeImpl() override;
+
+ WriteBuffer * setNextBuffer();
+
+ WriteBufferPtrs prepared_sources;
+ WriteBufferConstructors lazy_sources;
+ size_t first_lazy_source_num;
+ size_t num_sources;
+
+ WriteBuffer * curr_buffer;
+ size_t curr_buffer_num;
+};
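+
+/// Illustrative usage sketch, assuming MemoryWriteBuffer and WriteBufferFromFile
+/// are available (the 1 MiB limit and the file path are assumptions): write into
+/// memory first, spilling into a lazily created file buffer once memory is full.
+///
+///     CascadeWriteBuffer cascade(
+///         CascadeWriteBuffer::WriteBufferPtrs{std::make_shared<MemoryWriteBuffer>(1 << 20)},
+///         CascadeWriteBuffer::WriteBufferConstructors{
+///             [](const WriteBufferPtr &) -> WriteBufferPtr
+///             { return std::make_shared<WriteBufferFromFile>("/tmp/spill.bin"); }});
+///     writeString("some data", cascade);
+///     cascade.next();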
+
+}
diff --git a/contrib/clickhouse/src/IO/CompressedReadBufferWrapper.h b/contrib/clickhouse/src/IO/CompressedReadBufferWrapper.h
new file mode 100644
index 0000000000..bb58a7bfeb
--- /dev/null
+++ b/contrib/clickhouse/src/IO/CompressedReadBufferWrapper.h
@@ -0,0 +1,28 @@
+#pragma once
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/ReadBuffer.h>
+
+namespace DB
+{
+
+class CompressedReadBufferWrapper : public BufferWithOwnMemory<ReadBuffer>
+{
+public:
+ CompressedReadBufferWrapper(
+ std::unique_ptr<ReadBuffer> in_,
+ size_t buf_size,
+ char * existing_memory,
+ size_t alignment)
+ : BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment)
+ , in(std::move(in_)) {}
+
+ const ReadBuffer & getWrappedReadBuffer() const { return *in; }
+ ReadBuffer & getWrappedReadBuffer() { return *in; }
+
+ void prefetch(Priority priority) override { in->prefetch(priority); }
+
+protected:
+ std::unique_ptr<ReadBuffer> in;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/CompressionMethod.cpp b/contrib/clickhouse/src/IO/CompressionMethod.cpp
new file mode 100644
index 0000000000..e873f5dc8e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/CompressionMethod.cpp
@@ -0,0 +1,205 @@
+#include <IO/CompressionMethod.h>
+
+#include <IO/BrotliReadBuffer.h>
+#include <IO/BrotliWriteBuffer.h>
+#include <IO/LZMADeflatingWriteBuffer.h>
+#include <IO/LZMAInflatingReadBuffer.h>
+#include <IO/ReadBuffer.h>
+#include <IO/WriteBuffer.h>
+#include <IO/ZlibDeflatingWriteBuffer.h>
+#include <IO/ZlibInflatingReadBuffer.h>
+#include <IO/ZstdDeflatingWriteBuffer.h>
+#include <IO/ZstdInflatingReadBuffer.h>
+#include <IO/Lz4DeflatingWriteBuffer.h>
+#include <IO/Lz4InflatingReadBuffer.h>
+#include <IO/Bzip2ReadBuffer.h>
+#include <IO/Bzip2WriteBuffer.h>
+#include <IO/HadoopSnappyReadBuffer.h>
+
+#include "clickhouse_config.h"
+
+#include <boost/algorithm/string/case_conv.hpp>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int NOT_IMPLEMENTED;
+}
+
+
+std::string toContentEncodingName(CompressionMethod method)
+{
+ switch (method)
+ {
+ case CompressionMethod::Gzip:
+ return "gzip";
+ case CompressionMethod::Zlib:
+ return "deflate";
+ case CompressionMethod::Brotli:
+ return "br";
+ case CompressionMethod::Xz:
+ return "xz";
+ case CompressionMethod::Zstd:
+ return "zstd";
+ case CompressionMethod::Lz4:
+ return "lz4";
+ case CompressionMethod::Bzip2:
+ return "bz2";
+ case CompressionMethod::Snappy:
+ return "snappy";
+ case CompressionMethod::None:
+ return "";
+ }
+ UNREACHABLE();
+}
+
+CompressionMethod chooseHTTPCompressionMethod(const std::string & list)
+{
+ /// The compression methods are ordered from most to least preferred.
+
+ if (std::string::npos != list.find("zstd"))
+ return CompressionMethod::Zstd;
+ else if (std::string::npos != list.find("br"))
+ return CompressionMethod::Brotli;
+ else if (std::string::npos != list.find("lz4"))
+ return CompressionMethod::Lz4;
+ else if (std::string::npos != list.find("snappy"))
+ return CompressionMethod::Snappy;
+ else if (std::string::npos != list.find("gzip"))
+ return CompressionMethod::Gzip;
+ else if (std::string::npos != list.find("deflate"))
+ return CompressionMethod::Zlib;
+ else if (std::string::npos != list.find("xz"))
+ return CompressionMethod::Xz;
+ else if (std::string::npos != list.find("bz2"))
+ return CompressionMethod::Bzip2;
+ else
+ return CompressionMethod::None;
+}
+
+CompressionMethod chooseCompressionMethod(const std::string & path, const std::string & hint)
+{
+ std::string file_extension;
+ if (hint.empty() || hint == "auto")
+ {
+ auto pos = path.find_last_of('.');
+ if (pos != std::string::npos)
+ file_extension = path.substr(pos + 1, std::string::npos);
+ }
+
+ std::string method_str;
+
+ if (file_extension.empty())
+ method_str = hint;
+ else
+ method_str = std::move(file_extension);
+
+ boost::algorithm::to_lower(method_str);
+
+ if (method_str == "gzip" || method_str == "gz")
+ return CompressionMethod::Gzip;
+ if (method_str == "deflate")
+ return CompressionMethod::Zlib;
+ if (method_str == "brotli" || method_str == "br")
+ return CompressionMethod::Brotli;
+ if (method_str == "lzma" || method_str == "xz")
+ return CompressionMethod::Xz;
+ if (method_str == "zstd" || method_str == "zst")
+ return CompressionMethod::Zstd;
+ if (method_str == "lz4")
+ return CompressionMethod::Lz4;
+ if (method_str == "bz2")
+ return CompressionMethod::Bzip2;
+ if (method_str == "snappy")
+ return CompressionMethod::Snappy;
+ if (hint.empty() || hint == "auto" || hint == "none")
+ return CompressionMethod::None;
+
+ throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unknown compression method '{}'. "
+ "Only 'auto', 'none', 'gzip', 'deflate', 'br', 'xz', 'zstd', 'lz4', 'bz2', 'snappy' are supported as compression methods", hint);
+}
+
+std::pair<uint64_t, uint64_t> getCompressionLevelRange(const CompressionMethod & method)
+{
+ switch (method)
+ {
+ case CompressionMethod::Zstd:
+ return {1, 22};
+ case CompressionMethod::Lz4:
+ return {1, 12};
+ default:
+ return {1, 9};
+ }
+}
+
+static std::unique_ptr<CompressedReadBufferWrapper> createCompressedWrapper(
+ std::unique_ptr<ReadBuffer> nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment, int zstd_window_log_max)
+{
+ if (method == CompressionMethod::Gzip || method == CompressionMethod::Zlib)
+ return std::make_unique<ZlibInflatingReadBuffer>(std::move(nested), method, buf_size, existing_memory, alignment);
+#if USE_BROTLI
+ if (method == CompressionMethod::Brotli)
+ return std::make_unique<BrotliReadBuffer>(std::move(nested), buf_size, existing_memory, alignment);
+#endif
+ if (method == CompressionMethod::Xz)
+ return std::make_unique<LZMAInflatingReadBuffer>(std::move(nested), buf_size, existing_memory, alignment);
+ if (method == CompressionMethod::Zstd)
+ return std::make_unique<ZstdInflatingReadBuffer>(std::move(nested), buf_size, existing_memory, alignment, zstd_window_log_max);
+ if (method == CompressionMethod::Lz4)
+ return std::make_unique<Lz4InflatingReadBuffer>(std::move(nested), buf_size, existing_memory, alignment);
+#if USE_BZIP2
+ if (method == CompressionMethod::Bzip2)
+ return std::make_unique<Bzip2ReadBuffer>(std::move(nested), buf_size, existing_memory, alignment);
+#endif
+#if USE_SNAPPY
+ if (method == CompressionMethod::Snappy)
+ return std::make_unique<HadoopSnappyReadBuffer>(std::move(nested), buf_size, existing_memory, alignment);
+#endif
+
+ throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method");
+}
+
+std::unique_ptr<ReadBuffer> wrapReadBufferWithCompressionMethod(
+ std::unique_ptr<ReadBuffer> nested, CompressionMethod method, int zstd_window_log_max, size_t buf_size, char * existing_memory, size_t alignment)
+{
+ if (method == CompressionMethod::None)
+ return nested;
+ return createCompressedWrapper(std::move(nested), method, buf_size, existing_memory, alignment, zstd_window_log_max);
+}
+
+std::unique_ptr<WriteBuffer> wrapWriteBufferWithCompressionMethod(
+ std::unique_ptr<WriteBuffer> nested, CompressionMethod method, int level, size_t buf_size, char * existing_memory, size_t alignment)
+{
+ if (method == DB::CompressionMethod::Gzip || method == CompressionMethod::Zlib)
+ return std::make_unique<ZlibDeflatingWriteBuffer>(std::move(nested), method, level, buf_size, existing_memory, alignment);
+
+#if USE_BROTLI
+ if (method == DB::CompressionMethod::Brotli)
+ return std::make_unique<BrotliWriteBuffer>(std::move(nested), level, buf_size, existing_memory, alignment);
+#endif
+ if (method == CompressionMethod::Xz)
+ return std::make_unique<LZMADeflatingWriteBuffer>(std::move(nested), level, buf_size, existing_memory, alignment);
+
+ if (method == CompressionMethod::Zstd)
+ return std::make_unique<ZstdDeflatingWriteBuffer>(std::move(nested), level, buf_size, existing_memory, alignment);
+
+ if (method == CompressionMethod::Lz4)
+ return std::make_unique<Lz4DeflatingWriteBuffer>(std::move(nested), level, buf_size, existing_memory, alignment);
+
+#if USE_BZIP2
+ if (method == CompressionMethod::Bzip2)
+ return std::make_unique<Bzip2WriteBuffer>(std::move(nested), level, buf_size, existing_memory, alignment);
+#endif
+#if USE_SNAPPY
+ if (method == CompressionMethod::Snappy)
+ throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method");
+#endif
+ if (method == CompressionMethod::None)
+ return nested;
+
+ throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method");
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/CompressionMethod.h b/contrib/clickhouse/src/IO/CompressionMethod.h
new file mode 100644
index 0000000000..c142531cd0
--- /dev/null
+++ b/contrib/clickhouse/src/IO/CompressionMethod.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include <Core/Defines.h>
+
+namespace DB
+{
+class ReadBuffer;
+class WriteBuffer;
+
+/** These are "generally recognizable" compression methods for data import/export.
+ * Do not confuse them with the more efficient compression methods used by ClickHouse internally
+ * (those use non-standard framing, indexes, checksums...).
+ */
+
+enum class CompressionMethod
+{
+ None,
+ /// DEFLATE compression with gzip header and CRC32 checksum.
+ /// This option corresponds to files produced by gzip(1) or HTTP Content-Encoding: gzip.
+ Gzip,
+ /// DEFLATE compression with zlib header and Adler32 checksum.
+ /// This option corresponds to HTTP Content-Encoding: deflate.
+ Zlib,
+ /// LZMA2-based content compression
+ /// This option corresponds to HTTP Content-Encoding: xz
+ Xz,
+ /// Zstd compressor
+ /// This option corresponds to HTTP Content-Encoding: zstd
+ Zstd,
+ Brotli,
+ Lz4,
+ Bzip2,
+ Snappy,
+};
+
+/// How the compression method is named in HTTP.
+std::string toContentEncodingName(CompressionMethod method);
+
+/** Choose the compression method from a path and a hint.
+ * If the hint is "auto" or an empty string, the path is analyzed;
+ * otherwise the path parameter is ignored and the hint is used as the compression method name.
+ * The path is an arbitrary string that is analyzed for a file extension (gz, br, ...) that determines the compression.
+ */
+CompressionMethod chooseCompressionMethod(const std::string & path, const std::string & hint);
+
+/** Choose a compression method from HTTP header list of supported compression methods.
+ */
+CompressionMethod chooseHTTPCompressionMethod(const std::string & list);
+
+/// Get a range of the valid compression levels for the compression method.
+std::pair<uint64_t, uint64_t> getCompressionLevelRange(const CompressionMethod & method);
+
+std::unique_ptr<ReadBuffer> wrapReadBufferWithCompressionMethod(
+ std::unique_ptr<ReadBuffer> nested,
+ CompressionMethod method,
+ int zstd_window_log_max = 0,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+
+std::unique_ptr<WriteBuffer> wrapWriteBufferWithCompressionMethod(
+ std::unique_ptr<WriteBuffer> nested,
+ CompressionMethod method,
+ int level,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
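+/// Illustrative usage sketch, assuming ReadBufferFromFile is available:
+///
+///     auto method = chooseCompressionMethod("data.csv.gz", "auto");  /// -> Gzip
+///     auto in = wrapReadBufferWithCompressionMethod(
+///         std::make_unique<ReadBufferFromFile>("data.csv.gz"), method);
+///     /// `in` now yields decompressed data transparently.
+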
+}
diff --git a/contrib/clickhouse/src/IO/ConcatReadBuffer.h b/contrib/clickhouse/src/IO/ConcatReadBuffer.h
new file mode 100644
index 0000000000..3f44181a6e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ConcatReadBuffer.h
@@ -0,0 +1,86 @@
+#pragma once
+
+#include <vector>
+
+#include <IO/ReadBuffer.h>
+
+
+namespace DB
+{
+
+/// Reads from the concatenation of multiple ReadBuffer's
+class ConcatReadBuffer : public ReadBuffer
+{
+public:
+ using Buffers = std::vector<std::unique_ptr<ReadBuffer>>;
+
+ ConcatReadBuffer() : ReadBuffer(nullptr, 0), current(buffers.end())
+ {
+ }
+
+ explicit ConcatReadBuffer(Buffers && buffers_) : ReadBuffer(nullptr, 0), buffers(std::move(buffers_)), current(buffers.begin())
+ {
+ assert(!buffers.empty());
+ }
+
+ ConcatReadBuffer(std::unique_ptr<ReadBuffer> buf1, std::unique_ptr<ReadBuffer> buf2) : ConcatReadBuffer()
+ {
+ appendBuffer(std::move(buf1));
+ appendBuffer(std::move(buf2));
+ }
+
+ ConcatReadBuffer(ReadBuffer & buf1, ReadBuffer & buf2) : ConcatReadBuffer()
+ {
+ appendBuffer(wrapReadBufferReference(buf1));
+ appendBuffer(wrapReadBufferReference(buf2));
+ }
+
+ void appendBuffer(std::unique_ptr<ReadBuffer> buffer)
+ {
+ assert(!count());
+ buffers.push_back(std::move(buffer));
+ current = buffers.begin();
+ }
+
+protected:
+ Buffers buffers;
+ Buffers::iterator current;
+
+ bool nextImpl() override
+ {
+ if (buffers.end() == current)
+ return false;
+
+ /// First reading
+ if (working_buffer.empty())
+ {
+ if ((*current)->hasPendingData())
+ {
+ working_buffer = Buffer((*current)->position(), (*current)->buffer().end());
+ return true;
+ }
+ }
+ else
+ (*current)->position() = position();
+
+ if (!(*current)->next())
+ {
+ ++current;
+ if (buffers.end() == current)
+ return false;
+
+ /// Skip buffers that are already exhausted; eof() itself reads the next piece of data when the buffer is not finished but the cursor is at its end.
+ while ((*current)->eof())
+ {
+ ++current;
+ if (buffers.end() == current)
+ return false;
+ }
+ }
+
+ working_buffer = Buffer((*current)->position(), (*current)->buffer().end());
+ return true;
+ }
+};
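+
+/// Illustrative usage sketch, assuming ReadBufferFromString and
+/// readStringUntilEOF are available:
+///
+///     ReadBufferFromString part1("hello, ");
+///     ReadBufferFromString part2("world");
+///     ConcatReadBuffer concat(part1, part2);
+///     String s;
+///     readStringUntilEOF(s, concat);   /// s == "hello, world"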
+
+}
diff --git a/contrib/clickhouse/src/IO/ConcatSeekableReadBuffer.cpp b/contrib/clickhouse/src/IO/ConcatSeekableReadBuffer.cpp
new file mode 100644
index 0000000000..ec2793898f
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ConcatSeekableReadBuffer.cpp
@@ -0,0 +1,144 @@
+#include <IO/ConcatSeekableReadBuffer.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ARGUMENT_OUT_OF_BOUND;
+}
+
+ConcatSeekableReadBuffer::BufferInfo::BufferInfo(BufferInfo && src) noexcept
+ : in(std::exchange(src.in, nullptr)), own_in(std::exchange(src.own_in, false)), size(std::exchange(src.size, 0))
+{
+}
+
+ConcatSeekableReadBuffer::BufferInfo::~BufferInfo()
+{
+ if (own_in)
+ delete in;
+}
+
+ConcatSeekableReadBuffer::ConcatSeekableReadBuffer(std::unique_ptr<SeekableReadBuffer> buf1, size_t size1, std::unique_ptr<SeekableReadBuffer> buf2, size_t size2) : ConcatSeekableReadBuffer()
+{
+ appendBuffer(std::move(buf1), size1);
+ appendBuffer(std::move(buf2), size2);
+}
+
+ConcatSeekableReadBuffer::ConcatSeekableReadBuffer(SeekableReadBuffer & buf1, size_t size1, SeekableReadBuffer & buf2, size_t size2) : ConcatSeekableReadBuffer()
+{
+ appendBuffer(buf1, size1);
+ appendBuffer(buf2, size2);
+}
+
+void ConcatSeekableReadBuffer::appendBuffer(std::unique_ptr<SeekableReadBuffer> buffer, size_t size)
+{
+ appendBuffer(buffer.release(), true, size);
+}
+
+void ConcatSeekableReadBuffer::appendBuffer(SeekableReadBuffer & buffer, size_t size)
+{
+ appendBuffer(&buffer, false, size);
+}
+
+void ConcatSeekableReadBuffer::appendBuffer(SeekableReadBuffer * buffer, bool own, size_t size)
+{
+ BufferInfo info;
+ info.in = buffer;
+ info.own_in = own;
+ info.size = size;
+
+ if (!size)
+ return;
+
+ buffers.emplace_back(std::move(info));
+ total_size += size;
+
+ if (current == buffers.size() - 1)
+ {
+ working_buffer = buffers[current].in->buffer();
+ pos = buffers[current].in->position();
+ }
+}
+
+bool ConcatSeekableReadBuffer::nextImpl()
+{
+ if (current < buffers.size())
+ {
+ buffers[current].in->position() = pos;
+ while ((current < buffers.size()) && buffers[current].in->eof())
+ {
+ current_start_pos += buffers[current++].size;
+ if (current < buffers.size())
+ buffers[current].in->seek(0, SEEK_SET);
+ }
+ }
+
+ if (current >= buffers.size())
+ {
+ current_start_pos = total_size;
+ set(nullptr, 0);
+ return false;
+ }
+
+ working_buffer = buffers[current].in->buffer();
+ pos = buffers[current].in->position();
+ return true;
+}
+
+off_t ConcatSeekableReadBuffer::getPosition()
+{
+ size_t current_pos = current_start_pos;
+ if (current < buffers.size())
+ current_pos += buffers[current].in->getPosition() + offset();
+ return current_pos;
+}
+
+off_t ConcatSeekableReadBuffer::seek(off_t off, int whence)
+{
+ off_t new_position;
+ off_t current_position = getPosition();
+ if (whence == SEEK_SET)
+ new_position = off;
+ else if (whence == SEEK_CUR)
+ new_position = current_position + off;
+ else
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "ConcatSeekableReadBuffer::seek expects SEEK_SET or SEEK_CUR as whence");
+
+ if (new_position < 0)
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "SEEK_SET underflow: off = {}", off);
+ if (static_cast<UInt64>(new_position) > total_size)
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "SEEK_CUR shift out of bounds");
+
+ if (static_cast<UInt64>(new_position) == total_size)
+ {
+ current = buffers.size();
+ current_start_pos = total_size;
+ set(nullptr, 0);
+ return new_position;
+ }
+
+ off_t change_position = new_position - current_position;
+ if ((working_buffer.begin() <= pos + change_position) && (pos + change_position <= working_buffer.end()))
+ {
+ /// Position is still inside the same working buffer.
+ pos += change_position;
+ assert(pos >= working_buffer.begin());
+ assert(pos <= working_buffer.end());
+ return new_position;
+ }
+
+ while (new_position < static_cast<off_t>(current_start_pos))
+ current_start_pos -= buffers[--current].size;
+
+ while (new_position >= static_cast<off_t>(current_start_pos + buffers[current].size))
+ current_start_pos += buffers[current++].size;
+
+ buffers[current].in->seek(new_position - current_start_pos, SEEK_SET);
+ working_buffer = buffers[current].in->buffer();
+ pos = buffers[current].in->position();
+ return new_position;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ConcatSeekableReadBuffer.h b/contrib/clickhouse/src/IO/ConcatSeekableReadBuffer.h
new file mode 100644
index 0000000000..c8c16c5d88
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ConcatSeekableReadBuffer.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <IO/SeekableReadBuffer.h>
+#include <vector>
+
+
+namespace DB
+{
+
+/// Reads from the concatenation of multiple SeekableReadBuffer's
+class ConcatSeekableReadBuffer : public SeekableReadBuffer, public WithFileSize
+{
+public:
+ ConcatSeekableReadBuffer() : SeekableReadBuffer(nullptr, 0) { }
+ ConcatSeekableReadBuffer(std::unique_ptr<SeekableReadBuffer> buf1, size_t size1, std::unique_ptr<SeekableReadBuffer> buf2, size_t size2);
+ ConcatSeekableReadBuffer(SeekableReadBuffer & buf1, size_t size1, SeekableReadBuffer & buf2, size_t size2);
+
+ void appendBuffer(std::unique_ptr<SeekableReadBuffer> buffer, size_t size);
+ void appendBuffer(SeekableReadBuffer & buffer, size_t size);
+
+ off_t seek(off_t off, int whence) override;
+ off_t getPosition() override;
+
+ size_t getFileSize() override { return total_size; }
+
+private:
+ bool nextImpl() override;
+ void appendBuffer(SeekableReadBuffer * buffer, bool own, size_t size);
+
+ struct BufferInfo
+ {
+ BufferInfo() = default;
+ BufferInfo(BufferInfo && src) noexcept;
+ ~BufferInfo();
+ SeekableReadBuffer * in = nullptr;
+ bool own_in = false;
+ size_t size = 0;
+ };
+
+ std::vector<BufferInfo> buffers;
+ size_t total_size = 0;
+ size_t current = 0;
+ size_t current_start_pos = 0; /// Position of the beginning of the current buffer.
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ConnectionTimeouts.cpp b/contrib/clickhouse/src/IO/ConnectionTimeouts.cpp
new file mode 100644
index 0000000000..01fbaa4f81
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ConnectionTimeouts.cpp
@@ -0,0 +1,136 @@
+#include <IO/ConnectionTimeouts.h>
+#include <Poco/Util/AbstractConfiguration.h>
+#include <Interpreters/Context.h>
+
+namespace DB
+{
+
+ConnectionTimeouts::ConnectionTimeouts(
+ Poco::Timespan connection_timeout_,
+ Poco::Timespan send_timeout_,
+ Poco::Timespan receive_timeout_)
+ : connection_timeout(connection_timeout_)
+ , send_timeout(send_timeout_)
+ , receive_timeout(receive_timeout_)
+ , tcp_keep_alive_timeout(0)
+ , http_keep_alive_timeout(0)
+ , secure_connection_timeout(connection_timeout)
+ , hedged_connection_timeout(receive_timeout_)
+ , receive_data_timeout(receive_timeout_)
+ , handshake_timeout(receive_timeout_)
+{
+}
+
+ConnectionTimeouts::ConnectionTimeouts(
+ Poco::Timespan connection_timeout_,
+ Poco::Timespan send_timeout_,
+ Poco::Timespan receive_timeout_,
+ Poco::Timespan tcp_keep_alive_timeout_,
+ Poco::Timespan handshake_timeout_)
+ : connection_timeout(connection_timeout_)
+ , send_timeout(send_timeout_)
+ , receive_timeout(receive_timeout_)
+ , tcp_keep_alive_timeout(tcp_keep_alive_timeout_)
+ , http_keep_alive_timeout(0)
+ , secure_connection_timeout(connection_timeout)
+ , hedged_connection_timeout(receive_timeout_)
+ , receive_data_timeout(receive_timeout_)
+ , handshake_timeout(handshake_timeout_)
+{
+}
+
+ConnectionTimeouts::ConnectionTimeouts(
+ Poco::Timespan connection_timeout_,
+ Poco::Timespan send_timeout_,
+ Poco::Timespan receive_timeout_,
+ Poco::Timespan tcp_keep_alive_timeout_,
+ Poco::Timespan http_keep_alive_timeout_,
+ Poco::Timespan handshake_timeout_)
+ : connection_timeout(connection_timeout_)
+ , send_timeout(send_timeout_)
+ , receive_timeout(receive_timeout_)
+ , tcp_keep_alive_timeout(tcp_keep_alive_timeout_)
+ , http_keep_alive_timeout(http_keep_alive_timeout_)
+ , secure_connection_timeout(connection_timeout)
+ , hedged_connection_timeout(receive_timeout_)
+ , receive_data_timeout(receive_timeout_)
+ , handshake_timeout(handshake_timeout_)
+{
+}
+
+ConnectionTimeouts::ConnectionTimeouts(
+ Poco::Timespan connection_timeout_,
+ Poco::Timespan send_timeout_,
+ Poco::Timespan receive_timeout_,
+ Poco::Timespan tcp_keep_alive_timeout_,
+ Poco::Timespan http_keep_alive_timeout_,
+ Poco::Timespan secure_connection_timeout_,
+ Poco::Timespan hedged_connection_timeout_,
+ Poco::Timespan receive_data_timeout_,
+ Poco::Timespan handshake_timeout_)
+ : connection_timeout(connection_timeout_)
+ , send_timeout(send_timeout_)
+ , receive_timeout(receive_timeout_)
+ , tcp_keep_alive_timeout(tcp_keep_alive_timeout_)
+ , http_keep_alive_timeout(http_keep_alive_timeout_)
+ , secure_connection_timeout(secure_connection_timeout_)
+ , hedged_connection_timeout(hedged_connection_timeout_)
+ , receive_data_timeout(receive_data_timeout_)
+ , handshake_timeout(handshake_timeout_)
+{
+}
+
+Poco::Timespan ConnectionTimeouts::saturate(Poco::Timespan timespan, Poco::Timespan limit)
+{
+ if (limit.totalMicroseconds() == 0)
+ return timespan;
+ else
+ return (timespan > limit) ? limit : timespan;
+}
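+
+/// For example (with Poco::Timespan values): saturate(10s, 5s) == 5s,
+/// while saturate(10s, 0) == 10s; a zero limit means "no limit".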
+
+ConnectionTimeouts ConnectionTimeouts::getSaturated(Poco::Timespan limit) const
+{
+ return ConnectionTimeouts(saturate(connection_timeout, limit),
+ saturate(send_timeout, limit),
+ saturate(receive_timeout, limit),
+ saturate(tcp_keep_alive_timeout, limit),
+ saturate(http_keep_alive_timeout, limit),
+ saturate(secure_connection_timeout, limit),
+ saturate(hedged_connection_timeout, limit),
+ saturate(receive_data_timeout, limit),
+ saturate(handshake_timeout, limit));
+}
+
+/// Timeouts for the case when we have just single attempt to connect.
+ConnectionTimeouts ConnectionTimeouts::getTCPTimeoutsWithoutFailover(const Settings & settings)
+{
+ return ConnectionTimeouts(settings.connect_timeout, settings.send_timeout, settings.receive_timeout, settings.tcp_keep_alive_timeout, settings.handshake_timeout_ms);
+}
+
+/// Timeouts for the case when we will try many addresses in a loop.
+ConnectionTimeouts ConnectionTimeouts::getTCPTimeoutsWithFailover(const Settings & settings)
+{
+ return ConnectionTimeouts(
+ settings.connect_timeout_with_failover_ms,
+ settings.send_timeout,
+ settings.receive_timeout,
+ settings.tcp_keep_alive_timeout,
+ 0,
+ settings.connect_timeout_with_failover_secure_ms,
+ settings.hedged_connection_timeout_ms,
+ settings.receive_data_timeout_ms,
+ settings.handshake_timeout_ms);
+}
+
+ConnectionTimeouts ConnectionTimeouts::getHTTPTimeouts(const Settings & settings, Poco::Timespan http_keep_alive_timeout)
+{
+ return ConnectionTimeouts(
+ settings.http_connection_timeout,
+ settings.http_send_timeout,
+ settings.http_receive_timeout,
+ settings.tcp_keep_alive_timeout,
+ http_keep_alive_timeout,
+ settings.http_receive_timeout);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ConnectionTimeouts.h b/contrib/clickhouse/src/IO/ConnectionTimeouts.h
new file mode 100644
index 0000000000..684af42827
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ConnectionTimeouts.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <Core/Defines.h>
+#include <Interpreters/Context_fwd.h>
+
+#include <Poco/Timespan.h>
+
+namespace DB
+{
+
+struct Settings;
+
+struct ConnectionTimeouts
+{
+ Poco::Timespan connection_timeout;
+ Poco::Timespan send_timeout;
+ Poco::Timespan receive_timeout;
+ Poco::Timespan tcp_keep_alive_timeout;
+ Poco::Timespan http_keep_alive_timeout;
+ Poco::Timespan secure_connection_timeout;
+
+ /// Timeouts for HedgedConnections
+ Poco::Timespan hedged_connection_timeout;
+ Poco::Timespan receive_data_timeout;
+
+ /// Timeout for receiving HELLO packet
+ Poco::Timespan handshake_timeout;
+
+ /// Timeout for synchronous request-result protocol call (like Ping or TablesStatus)
+ Poco::Timespan sync_request_timeout = Poco::Timespan(DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC, 0);
+
+ ConnectionTimeouts() = default;
+
+ ConnectionTimeouts(Poco::Timespan connection_timeout_,
+ Poco::Timespan send_timeout_,
+ Poco::Timespan receive_timeout_);
+
+ ConnectionTimeouts(Poco::Timespan connection_timeout_,
+ Poco::Timespan send_timeout_,
+ Poco::Timespan receive_timeout_,
+ Poco::Timespan tcp_keep_alive_timeout_,
+ Poco::Timespan handshake_timeout_);
+
+ ConnectionTimeouts(Poco::Timespan connection_timeout_,
+ Poco::Timespan send_timeout_,
+ Poco::Timespan receive_timeout_,
+ Poco::Timespan tcp_keep_alive_timeout_,
+ Poco::Timespan http_keep_alive_timeout_,
+ Poco::Timespan handshake_timeout_);
+
+ ConnectionTimeouts(Poco::Timespan connection_timeout_,
+ Poco::Timespan send_timeout_,
+ Poco::Timespan receive_timeout_,
+ Poco::Timespan tcp_keep_alive_timeout_,
+ Poco::Timespan http_keep_alive_timeout_,
+ Poco::Timespan secure_connection_timeout_,
+ Poco::Timespan hedged_connection_timeout_,
+ Poco::Timespan receive_data_timeout_,
+ Poco::Timespan handshake_timeout_);
+
+ static Poco::Timespan saturate(Poco::Timespan timespan, Poco::Timespan limit);
+ ConnectionTimeouts getSaturated(Poco::Timespan limit) const;
+
+ /// Timeouts for the case when we have just single attempt to connect.
+ static ConnectionTimeouts getTCPTimeoutsWithoutFailover(const Settings & settings);
+
+ /// Timeouts for the case when we will try many addresses in a loop.
+ static ConnectionTimeouts getTCPTimeoutsWithFailover(const Settings & settings);
+ static ConnectionTimeouts getHTTPTimeouts(const Settings & settings, Poco::Timespan http_keep_alive_timeout);
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/DoubleConverter.cpp b/contrib/clickhouse/src/IO/DoubleConverter.cpp
new file mode 100644
index 0000000000..911da5eabc
--- /dev/null
+++ b/contrib/clickhouse/src/IO/DoubleConverter.cpp
@@ -0,0 +1,16 @@
+#include <IO/DoubleConverter.h>
+
+namespace DB
+{
+template <bool emit_decimal_point>
+const double_conversion::DoubleToStringConverter & DoubleConverter<emit_decimal_point>::instance()
+{
+ static const double_conversion::DoubleToStringConverter instance{
+ DoubleToStringConverterFlags<emit_decimal_point>::flags, "inf", "nan", 'e', -6, 21, 6, 1};
+
+ return instance;
+}
+
+template class DoubleConverter<true>;
+template class DoubleConverter<false>;
+}
diff --git a/contrib/clickhouse/src/IO/DoubleConverter.h b/contrib/clickhouse/src/IO/DoubleConverter.h
new file mode 100644
index 0000000000..18cbe4e3a1
--- /dev/null
+++ b/contrib/clickhouse/src/IO/DoubleConverter.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdouble-promotion"
+#endif
+
+#include <base/defines.h>
+#include <double-conversion/double-conversion.h>
+#include <boost/noncopyable.hpp>
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+
+namespace DB
+{
+
+template <bool emit_decimal_point> struct DoubleToStringConverterFlags
+{
+ static constexpr auto flags = double_conversion::DoubleToStringConverter::NO_FLAGS;
+};
+
+template <> struct DoubleToStringConverterFlags<true>
+{
+ static constexpr auto flags = double_conversion::DoubleToStringConverter::EMIT_TRAILING_DECIMAL_POINT;
+};
+
+template <bool emit_decimal_point>
+class DoubleConverter : private boost::noncopyable
+{
+ DoubleConverter() = default;
+
+public:
+ /// Sign (1 byte) + DigitsBeforePoint + point (1 byte) + DigitsAfterPoint + zero byte.
+ /// See comment to DoubleToStringConverter::ToFixed method for explanation.
+ static constexpr auto MAX_REPRESENTATION_LENGTH =
+ 1 + double_conversion::DoubleToStringConverter::kMaxFixedDigitsBeforePoint +
+ 1 + double_conversion::DoubleToStringConverter::kMaxFixedDigitsAfterPoint + 1;
+ using BufferType = char[MAX_REPRESENTATION_LENGTH];
+
+ static const double_conversion::DoubleToStringConverter & instance();
+};
+
+}
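
The converter is driven through the double-conversion StringBuilder API; a short sketch of formatting one value into a stack buffer sized by MAX_REPRESENTATION_LENGTH:

    DB::DoubleConverter<false>::BufferType buf;
    double_conversion::StringBuilder builder{buf, sizeof(buf)};

    // ToShortest() writes the shortest representation that round-trips back to 0.1.
    if (DB::DoubleConverter<false>::instance().ToShortest(0.1, &builder))
    {
        const size_t len = builder.position();  // number of characters produced
        builder.Finalize();                     // appends the terminating zero byte
        // buf now holds "0.1"; `len` is its length.
    }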
diff --git a/contrib/clickhouse/src/IO/EmptyReadBuffer.h b/contrib/clickhouse/src/IO/EmptyReadBuffer.h
new file mode 100644
index 0000000000..e2189b9943
--- /dev/null
+++ b/contrib/clickhouse/src/IO/EmptyReadBuffer.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+
+namespace DB
+{
+
+/// Just a stub - reads nothing from nowhere.
+class EmptyReadBuffer : public ReadBuffer
+{
+public:
+ EmptyReadBuffer() : ReadBuffer(nullptr, 0) {}
+
+private:
+ bool nextImpl() override { return false; }
+};
+
+}
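
Since nextImpl() always returns false, the stub reports end-of-file on the first read attempt; for instance:

    DB::EmptyReadBuffer buf;
    assert(buf.eof());  // true immediately: the buffer never produces data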
diff --git a/contrib/clickhouse/src/IO/FileEncryptionCommon.cpp b/contrib/clickhouse/src/IO/FileEncryptionCommon.cpp
new file mode 100644
index 0000000000..c354a1c8df
--- /dev/null
+++ b/contrib/clickhouse/src/IO/FileEncryptionCommon.cpp
@@ -0,0 +1,465 @@
+#include <IO/FileEncryptionCommon.h>
+
+#if USE_SSL
+#include <IO/ReadBuffer.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteBuffer.h>
+#include <IO/WriteHelpers.h>
+#include <Common/SipHash.h>
+#include <Common/safe_cast.h>
+
+# include <cassert>
+# include <boost/algorithm/string/predicate.hpp>
+
+# include <openssl/err.h>
+# include <openssl/rand.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int BAD_ARGUMENTS;
+ extern const int DATA_ENCRYPTION_ERROR;
+ extern const int OPENSSL_ERROR;
+}
+
+namespace FileEncryption
+{
+
+namespace
+{
+ const EVP_CIPHER * getCipher(Algorithm algorithm)
+ {
+ switch (algorithm)
+ {
+ case Algorithm::AES_128_CTR: return EVP_aes_128_ctr();
+ case Algorithm::AES_192_CTR: return EVP_aes_192_ctr();
+ case Algorithm::AES_256_CTR: return EVP_aes_256_ctr();
+ case Algorithm::MAX: break;
+ }
+ throw Exception(
+ ErrorCodes::BAD_ARGUMENTS,
+ "Encryption algorithm {} is not supported, specify one of the following: aes_128_ctr, aes_192_ctr, aes_256_ctr",
+ static_cast<int>(algorithm));
+ }
+
+ void checkKeySize(const EVP_CIPHER * evp_cipher, size_t key_size)
+ {
+ if (!key_size)
+ throw Exception(ErrorCodes::BAD_ARGUMENTS, "Encryption key must not be empty");
+ size_t expected_key_size = static_cast<size_t>(EVP_CIPHER_key_length(evp_cipher));
+ if (key_size != expected_key_size)
+ throw Exception(
+ ErrorCodes::BAD_ARGUMENTS,
+ "Got an encryption key with unexpected size {}, the size should be {}",
+ key_size, expected_key_size);
+ }
+
+ void checkInitVectorSize(const EVP_CIPHER * evp_cipher)
+ {
+ size_t expected_iv_length = static_cast<size_t>(EVP_CIPHER_iv_length(evp_cipher));
+ if (InitVector::kSize != expected_iv_length)
+ throw Exception(
+ ErrorCodes::DATA_ENCRYPTION_ERROR,
+ "Got an initialization vector with unexpected size {}, the size should be {}",
+ InitVector::kSize,
+ expected_iv_length);
+ }
+
+ constexpr const size_t kBlockSize = 16;
+
+ size_t blockOffset(size_t pos) { return pos % kBlockSize; }
+ size_t blocks(size_t pos) { return pos / kBlockSize; }
+
+ size_t partBlockSize(size_t size, size_t off)
+ {
+ assert(off < kBlockSize);
+        /// if the offset is block-aligned, the whole part is written as usual blocks
+ if (off == 0)
+ return 0;
+ return off + size <= kBlockSize ? size : (kBlockSize - off) % kBlockSize;
+ }
+
+ size_t encryptBlocks(EVP_CIPHER_CTX * evp_ctx, const char * data, size_t size, WriteBuffer & out)
+ {
+ const uint8_t * in = reinterpret_cast<const uint8_t *>(data);
+ size_t in_size = 0;
+ size_t out_size = 0;
+
+ while (in_size < size)
+ {
+ out.nextIfAtEnd();
+
+ size_t part_size = std::min(size - in_size, out.available());
+ part_size = std::min<size_t>(part_size, INT_MAX);
+
+ uint8_t * ciphertext = reinterpret_cast<uint8_t *>(out.position());
+ int ciphertext_size = 0;
+ if (!EVP_EncryptUpdate(evp_ctx, ciphertext, &ciphertext_size, &in[in_size], static_cast<int>(part_size)))
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Failed to encrypt");
+
+ in_size += part_size;
+ if (ciphertext_size)
+ {
+ out.position() += ciphertext_size;
+ out_size += ciphertext_size;
+ }
+ }
+
+ return out_size;
+ }
+
+ size_t encryptBlockWithPadding(EVP_CIPHER_CTX * evp_ctx, const char * data, size_t size, size_t pad_left, WriteBuffer & out)
+ {
+ assert((size <= kBlockSize) && (size + pad_left <= kBlockSize));
+ uint8_t padded_data[kBlockSize] = {};
+ memcpy(&padded_data[pad_left], data, size);
+ size_t padded_data_size = pad_left + size;
+
+ uint8_t ciphertext[kBlockSize];
+ int ciphertext_size = 0;
+ if (!EVP_EncryptUpdate(evp_ctx, ciphertext, &ciphertext_size, padded_data, safe_cast<int>(padded_data_size)))
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Failed to encrypt");
+
+ if (!ciphertext_size)
+ return 0;
+
+ if (static_cast<size_t>(ciphertext_size) < pad_left)
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Unexpected size of encrypted data: {} < {}", ciphertext_size, pad_left);
+
+ uint8_t * ciphertext_begin = &ciphertext[pad_left];
+ ciphertext_size -= pad_left;
+ out.write(reinterpret_cast<const char *>(ciphertext_begin), ciphertext_size);
+ return ciphertext_size;
+ }
+
+ size_t encryptFinal(EVP_CIPHER_CTX * evp_ctx, WriteBuffer & out)
+ {
+ uint8_t ciphertext[kBlockSize];
+ int ciphertext_size = 0;
+ if (!EVP_EncryptFinal_ex(evp_ctx,
+ ciphertext, &ciphertext_size))
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Failed to finalize encrypting");
+ if (ciphertext_size)
+ out.write(reinterpret_cast<const char *>(ciphertext), ciphertext_size);
+ return ciphertext_size;
+ }
+
+ size_t decryptBlocks(EVP_CIPHER_CTX * evp_ctx, const char * data, size_t size, char * out)
+ {
+ const uint8_t * in = reinterpret_cast<const uint8_t *>(data);
+ uint8_t * plaintext = reinterpret_cast<uint8_t *>(out);
+ int plaintext_size = 0;
+ if (!EVP_DecryptUpdate(evp_ctx, plaintext, &plaintext_size, in, safe_cast<int>(size)))
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Failed to decrypt");
+ return plaintext_size;
+ }
+
+ size_t decryptBlockWithPadding(EVP_CIPHER_CTX * evp_ctx, const char * data, size_t size, size_t pad_left, char * out)
+ {
+ assert((size <= kBlockSize) && (size + pad_left <= kBlockSize));
+ uint8_t padded_data[kBlockSize] = {};
+ memcpy(&padded_data[pad_left], data, size);
+ size_t padded_data_size = pad_left + size;
+ uint8_t plaintext[kBlockSize];
+ int plaintext_size = 0;
+ if (!EVP_DecryptUpdate(evp_ctx, plaintext, &plaintext_size, padded_data, safe_cast<int>(padded_data_size)))
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Failed to decrypt");
+
+ if (!plaintext_size)
+ return 0;
+
+ if (static_cast<size_t>(plaintext_size) < pad_left)
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Unexpected size of decrypted data: {} < {}", plaintext_size, pad_left);
+
+ const uint8_t * plaintext_begin = &plaintext[pad_left];
+ plaintext_size -= pad_left;
+ memcpy(out, plaintext_begin, plaintext_size);
+ return plaintext_size;
+ }
+
+ size_t decryptFinal(EVP_CIPHER_CTX * evp_ctx, char * out)
+ {
+ uint8_t plaintext[kBlockSize];
+ int plaintext_size = 0;
+ if (!EVP_DecryptFinal_ex(evp_ctx, plaintext, &plaintext_size))
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Failed to finalize decrypting");
+ if (plaintext_size)
+ memcpy(out, plaintext, plaintext_size);
+ return plaintext_size;
+ }
+
+ constexpr const std::string_view kHeaderSignature = "ENC";
+
+ UInt128 calculateV1KeyFingerprint(UInt8 small_key_hash, UInt64 key_id)
+ {
+        /// In version 1 we stored {key_id, very_small_hash(key)} instead of a fingerprint.
+ return static_cast<UInt128>(key_id) | (static_cast<UInt128>(small_key_hash) << 64);
+ }
+}
+
+String toString(Algorithm algorithm)
+{
+ switch (algorithm)
+ {
+ case Algorithm::AES_128_CTR: return "aes_128_ctr";
+ case Algorithm::AES_192_CTR: return "aes_192_ctr";
+ case Algorithm::AES_256_CTR: return "aes_256_ctr";
+ case Algorithm::MAX: break;
+ }
+ throw Exception(
+ ErrorCodes::BAD_ARGUMENTS,
+ "Encryption algorithm {} is not supported, specify one of the following: aes_128_ctr, aes_192_ctr, aes_256_ctr",
+ static_cast<int>(algorithm));
+}
+
+Algorithm parseAlgorithmFromString(const String & str)
+{
+ if (boost::iequals(str, "aes_128_ctr"))
+ return Algorithm::AES_128_CTR;
+ else if (boost::iequals(str, "aes_192_ctr"))
+ return Algorithm::AES_192_CTR;
+ else if (boost::iequals(str, "aes_256_ctr"))
+ return Algorithm::AES_256_CTR;
+ else
+ throw Exception(
+ ErrorCodes::BAD_ARGUMENTS,
+ "Encryption algorithm '{}' is not supported, specify one of the following: aes_128_ctr, aes_192_ctr, aes_256_ctr",
+ str);
+}
+
+void checkKeySize(size_t key_size, Algorithm algorithm) { checkKeySize(getCipher(algorithm), key_size); }
+
+
+String InitVector::toString() const
+{
+ static_assert(sizeof(counter) == InitVector::kSize);
+ WriteBufferFromOwnString out;
+ writeBinaryBigEndian(counter, out);
+ return std::move(out.str());
+}
+
+InitVector InitVector::fromString(const String & str)
+{
+ if (str.length() != InitVector::kSize)
+ throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected iv with size {}, got iv with size {}", InitVector::kSize, str.length());
+ ReadBufferFromMemory in{str.data(), str.length()};
+ UInt128 counter;
+ readBinaryBigEndian(counter, in);
+ return InitVector{counter};
+}
+
+void InitVector::read(ReadBuffer & in)
+{
+ readBinaryBigEndian(counter, in);
+}
+
+void InitVector::write(WriteBuffer & out) const
+{
+ writeBinaryBigEndian(counter, out);
+}
+
+InitVector InitVector::random()
+{
+ UInt128 counter;
+ auto * buf = reinterpret_cast<unsigned char *>(counter.items);
+ auto ret = RAND_bytes(buf, sizeof(counter.items));
+ if (ret != 1)
+ throw Exception(DB::ErrorCodes::OPENSSL_ERROR, "OpenSSL error code: {}", ERR_get_error());
+ return InitVector{counter};
+}
+
+
+Encryptor::Encryptor(Algorithm algorithm_, const String & key_, const InitVector & iv_)
+ : key(key_)
+ , init_vector(iv_)
+ , evp_cipher(getCipher(algorithm_))
+{
+ checkKeySize(evp_cipher, key.size());
+ checkInitVectorSize(evp_cipher);
+}
+
+void Encryptor::encrypt(const char * data, size_t size, WriteBuffer & out)
+{
+ if (!size)
+ return;
+
+ auto current_iv = (init_vector + blocks(offset)).toString();
+
+ auto evp_ctx_ptr = std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)>(EVP_CIPHER_CTX_new(), &EVP_CIPHER_CTX_free);
+ auto * evp_ctx = evp_ctx_ptr.get();
+
+ if (!EVP_EncryptInit_ex(evp_ctx, evp_cipher, nullptr, nullptr, nullptr))
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Failed to initialize encryption context with cipher");
+
+ if (!EVP_EncryptInit_ex(evp_ctx, nullptr, nullptr,
+ reinterpret_cast<const uint8_t*>(key.c_str()), reinterpret_cast<const uint8_t*>(current_iv.c_str())))
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Failed to set key and IV for encryption");
+
+ size_t in_size = 0;
+ size_t out_size = 0;
+
+ auto off = blockOffset(offset);
+ if (off)
+ {
+ size_t in_part_size = partBlockSize(size, off);
+ size_t out_part_size = encryptBlockWithPadding(evp_ctx, &data[in_size], in_part_size, off, out);
+ in_size += in_part_size;
+ out_size += out_part_size;
+ }
+
+ if (in_size < size)
+ {
+ size_t in_part_size = size - in_size;
+ size_t out_part_size = encryptBlocks(evp_ctx, &data[in_size], in_part_size, out);
+ in_size += in_part_size;
+ out_size += out_part_size;
+ }
+
+ out_size += encryptFinal(evp_ctx, out);
+
+ if (out_size != in_size)
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Only part of the data was encrypted");
+ offset += in_size;
+}
+
+void Encryptor::decrypt(const char * data, size_t size, char * out)
+{
+ if (!size)
+ return;
+
+ auto current_iv = (init_vector + blocks(offset)).toString();
+
+ auto evp_ctx_ptr = std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)>(EVP_CIPHER_CTX_new(), &EVP_CIPHER_CTX_free);
+ auto * evp_ctx = evp_ctx_ptr.get();
+
+ if (!EVP_DecryptInit_ex(evp_ctx, evp_cipher, nullptr, nullptr, nullptr))
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Failed to initialize decryption context with cipher");
+
+ if (!EVP_DecryptInit_ex(evp_ctx, nullptr, nullptr,
+ reinterpret_cast<const uint8_t*>(key.c_str()), reinterpret_cast<const uint8_t*>(current_iv.c_str())))
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Failed to set key and IV for decryption");
+
+ size_t in_size = 0;
+ size_t out_size = 0;
+
+ auto off = blockOffset(offset);
+ if (off)
+ {
+ size_t in_part_size = partBlockSize(size, off);
+ size_t out_part_size = decryptBlockWithPadding(evp_ctx, &data[in_size], in_part_size, off, &out[out_size]);
+ in_size += in_part_size;
+ out_size += out_part_size;
+ }
+
+ if (in_size < size)
+ {
+ size_t in_part_size = size - in_size;
+ size_t out_part_size = decryptBlocks(evp_ctx, &data[in_size], in_part_size, &out[out_size]);
+ in_size += in_part_size;
+ out_size += out_part_size;
+ }
+
+ out_size += decryptFinal(evp_ctx, &out[out_size]);
+
+ if (out_size != in_size)
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Only part of the data was decrypted");
+ offset += in_size;
+}
+
+
+void Header::read(ReadBuffer & in)
+{
+ char signature[kHeaderSignature.length()];
+ in.readStrict(signature, kHeaderSignature.length());
+ if (memcmp(signature, kHeaderSignature.data(), kHeaderSignature.length()) != 0)
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Wrong signature, this is not an encrypted file");
+
+ /// The endianness of how the header is written.
+ /// Starting from version 2 the header is always in little endian.
+ std::endian endian = std::endian::little;
+
+ readBinaryLittleEndian(version, in);
+
+ if (version == 0x0100ULL)
+ {
+ /// Version 1 could write the header of an encrypted file in either little-endian or big-endian.
+        /// So if we read the version as little-endian and get 256, it means two things: the version is actually 1, and the whole header is in big endian.
+ endian = std::endian::big;
+ version = 1;
+ }
+
+ if (version < 1 || version > kCurrentVersion)
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Version {} of the header is not supported", version);
+
+ UInt16 algorithm_u16;
+ readPODBinary(algorithm_u16, in);
+ if (std::endian::native != endian)
+ algorithm_u16 = DB::byteswap(algorithm_u16);
+ if (algorithm_u16 >= static_cast<UInt16>(Algorithm::MAX))
+ throw Exception(ErrorCodes::DATA_ENCRYPTION_ERROR, "Algorithm {} is not supported", algorithm_u16);
+ algorithm = static_cast<Algorithm>(algorithm_u16);
+
+ size_t bytes_to_skip = kSize - kHeaderSignature.length() - sizeof(version) - sizeof(algorithm_u16) - InitVector::kSize;
+
+ if (version < 2)
+ {
+ UInt64 key_id;
+ UInt8 small_key_hash;
+ readPODBinary(key_id, in);
+ readPODBinary(small_key_hash, in);
+ bytes_to_skip -= sizeof(key_id) + sizeof(small_key_hash);
+ if (std::endian::native != endian)
+ key_id = DB::byteswap(key_id);
+ key_fingerprint = calculateV1KeyFingerprint(small_key_hash, key_id);
+ }
+ else
+ {
+ readBinaryLittleEndian(key_fingerprint, in);
+ bytes_to_skip -= sizeof(key_fingerprint);
+ }
+
+ init_vector.read(in);
+
+ chassert(bytes_to_skip < kSize);
+ in.ignore(bytes_to_skip);
+}
+
+void Header::write(WriteBuffer & out) const
+{
+ writeString(kHeaderSignature, out);
+
+ writeBinaryLittleEndian(version, out);
+
+ UInt16 algorithm_u16 = static_cast<UInt16>(algorithm);
+ writeBinaryLittleEndian(algorithm_u16, out);
+
+ writeBinaryLittleEndian(key_fingerprint, out);
+
+ init_vector.write(out);
+
+ constexpr size_t reserved_size = kSize - kHeaderSignature.length() - sizeof(version) - sizeof(algorithm_u16) - sizeof(key_fingerprint) - InitVector::kSize;
+ static_assert(reserved_size < kSize);
+ char zero_bytes[reserved_size] = {};
+ out.write(zero_bytes, reserved_size);
+}
+
+UInt128 calculateKeyFingerprint(const String & key)
+{
+ const UInt64 seed0 = 0x4368456E63727970ULL; // ChEncryp
+ const UInt64 seed1 = 0x7465644469736B46ULL; // tedDiskF
+ return sipHash128Keyed(seed0, seed1, key.data(), key.size());
+}
+
+UInt128 calculateV1KeyFingerprint(const String & key, UInt64 key_id)
+{
+    /// In version 1 we stored {key_id, very_small_hash(key)} instead of a fingerprint.
+ UInt8 small_key_hash = sipHash64(key.data(), key.size()) & 0x0F;
+ return calculateV1KeyFingerprint(small_key_hash, key_id);
+}
+
+}
+}
+
+#endif
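
The IV arithmetic above (`init_vector + blocks(offset)`) is what makes random access work: the IV is a 128-bit big-endian counter advanced by the index of the 16-byte block being processed, and a partial first block is handled by zero-padding on the left. A hedged round-trip sketch (the all-0x01 key is illustrative; real keys come from configuration):

    using namespace DB::FileEncryption;

    const String key(16, '\x01');            // illustrative 16-byte key for AES_128_CTR
    const InitVector iv = InitVector::random();

    Encryptor encryptor(Algorithm::AES_128_CTR, key, iv);
    DB::WriteBufferFromOwnString out;
    encryptor.encrypt("hello, world", 12, out);  // advances the internal offset by 12

    // Decrypt only the tail starting from byte 7: CTR needs no preceding ciphertext.
    Encryptor decryptor(Algorithm::AES_128_CTR, key, iv);
    decryptor.setOffset(7);
    char plain[5];
    decryptor.decrypt(out.str().data() + 7, 5, plain);  // plain holds "world"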
diff --git a/contrib/clickhouse/src/IO/FileEncryptionCommon.h b/contrib/clickhouse/src/IO/FileEncryptionCommon.h
new file mode 100644
index 0000000000..777d171157
--- /dev/null
+++ b/contrib/clickhouse/src/IO/FileEncryptionCommon.h
@@ -0,0 +1,154 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_SSL
+#include <Core/Types.h>
+#include <openssl/evp.h>
+
+namespace DB
+{
+class ReadBuffer;
+class WriteBuffer;
+
+namespace FileEncryption
+{
+
+/// Encryption algorithm.
+/// We chose CTR cipher algorithms because they have the following properties, which are important for us:
+/// - No right padding, so we can append encrypted files without deciphering;
+/// - One byte is always ciphered as one byte, so we get random access to encrypted files easily.
+enum class Algorithm
+{
+ AES_128_CTR, /// Size of key is 16 bytes.
+ AES_192_CTR, /// Size of key is 24 bytes.
+ AES_256_CTR, /// Size of key is 32 bytes.
+ MAX
+};
+
+String toString(Algorithm algorithm);
+Algorithm parseAlgorithmFromString(const String & str);
+
+/// Throws an exception if the specified key size doesn't correspond to the specified encryption algorithm.
+void checkKeySize(size_t key_size, Algorithm algorithm);
+
+
+/// Initialization vector. Its size is always 16 bytes.
+class InitVector
+{
+public:
+ static constexpr const size_t kSize = 16;
+
+ InitVector() = default;
+ explicit InitVector(const UInt128 & counter_) { set(counter_); }
+
+ void set(const UInt128 & counter_) { counter = counter_; }
+ UInt128 get() const { return counter; }
+
+ void read(ReadBuffer & in);
+ void write(WriteBuffer & out) const;
+
+ /// Write 16 bytes of the counter to a string in big endian order.
+    /// We need big endian because the cipher algorithms in use treat an initialization vector as a big endian counter.
+ String toString() const;
+
+    /// Converts a 16-byte string in big endian order to a counter.
+ static InitVector fromString(const String & str_);
+
+ /// Adds a specified offset to the counter.
+ InitVector & operator++() { ++counter; return *this; }
+ InitVector operator++(int) { InitVector res = *this; ++counter; return res; } /// NOLINT
+ InitVector & operator+=(size_t offset) { counter += offset; return *this; }
+ InitVector operator+(size_t offset) const { InitVector res = *this; return res += offset; }
+
+ /// Generates a random initialization vector.
+ static InitVector random();
+
+private:
+ UInt128 counter = 0;
+};
+
+
+/// Encrypts or decrypts data.
+class Encryptor
+{
+public:
+    /// The `key` must be 16, 24 or 32 bytes long, depending on which `algorithm` is specified.
+ Encryptor(Algorithm algorithm_, const String & key_, const InitVector & iv_);
+
+ /// Sets the current position in the data stream from the very beginning of data.
+ /// It affects how the data will be encrypted or decrypted because
+    /// the initialization vector is advanced by the index of the current block,
+    /// and that index is calculated from this offset.
+ void setOffset(size_t offset_) { offset = offset_; }
+ size_t getOffset() const { return offset; }
+
+ /// Encrypts some data.
+ /// Also the function moves `offset` by `size` (for successive encryptions).
+ void encrypt(const char * data, size_t size, WriteBuffer & out);
+
+ /// Decrypts some data.
+ /// The used cipher algorithms generate the same number of bytes in output as they were in input,
+ /// so the function always writes `size` bytes of the plaintext to `out`.
+ /// Also the function moves `offset` by `size` (for successive decryptions).
+ void decrypt(const char * data, size_t size, char * out);
+
+private:
+ const String key;
+ const InitVector init_vector;
+ const EVP_CIPHER * const evp_cipher;
+
+ /// The current position in the data stream from the very beginning of data.
+ size_t offset = 0;
+};
+
+
+/// File header which is stored at the beginning of encrypted files.
+///
+/// The format of that header is following:
+/// +--------+------+--------------------------------------------------------------------------+
+/// | offset | size | description |
+/// +--------+------+--------------------------------------------------------------------------+
+/// | 0 | 3 | 'E', 'N', 'C' (file's signature) |
+/// | 3 | 2 | version of this header (1..2) |
+/// | 5 | 2 | encryption algorithm (0..2, 0=AES_128_CTR, 1=AES_192_CTR, 2=AES_256_CTR) |
+/// | 7 | 16 | fingerprint of encryption key (SipHash) |
+/// | 23 | 16 | initialization vector (randomly generated) |
+/// | 39 | 25 | reserved for future use |
+/// +--------+------+--------------------------------------------------------------------------+
+///
+struct Header
+{
+ /// Versions:
+ /// 1 - Initial version
+ /// 2 - The header of an encrypted file contains the fingerprint of a used encryption key instead of a pair {key_id, very_small_hash(key)}.
+ /// The header is always stored in little endian.
+ static constexpr const UInt16 kCurrentVersion = 2;
+
+ UInt16 version = kCurrentVersion;
+
+ /// Encryption algorithm.
+ Algorithm algorithm = Algorithm::AES_128_CTR;
+
+ /// Fingerprint of a key.
+ UInt128 key_fingerprint = 0;
+
+ InitVector init_vector;
+
+ /// The size of this header in bytes, including reserved bytes.
+ static constexpr const size_t kSize = 64;
+
+ void read(ReadBuffer & in);
+ void write(WriteBuffer & out) const;
+};
+
+/// Calculates the fingerprint of a passed encryption key.
+UInt128 calculateKeyFingerprint(const String & key);
+
+/// Calculates the fingerprint of the passed encryption key & key ID in the form that was used in version 1.
+UInt128 calculateV1KeyFingerprint(const String & key, UInt64 key_id);
+
+}
+}
+
+#endif
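
One plausible way the header and the fingerprint work together, sketched under the assumption that the caller keeps a list of candidate keys (`candidate_keys` and the ReadBuffer `in` are assumptions, not part of this API):

    DB::FileEncryption::Header header;
    header.read(in);  // consumes exactly Header::kSize (64) bytes from `in`

    for (const String & key : candidate_keys)
    {
        if (DB::FileEncryption::calculateKeyFingerprint(key) == header.key_fingerprint)
        {
            DB::FileEncryption::Encryptor decryptor(header.algorithm, key, header.init_vector);
            // ... decrypt the payload that follows the header ...
            break;
        }
    }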
diff --git a/contrib/clickhouse/src/IO/ForkWriteBuffer.cpp b/contrib/clickhouse/src/IO/ForkWriteBuffer.cpp
new file mode 100644
index 0000000000..8e11b9ff59
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ForkWriteBuffer.cpp
@@ -0,0 +1,60 @@
+#include <IO/ForkWriteBuffer.h>
+#include <Common/Exception.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+extern const int CANNOT_CREATE_IO_BUFFER;
+}
+
+ForkWriteBuffer::ForkWriteBuffer(WriteBufferPtrs && sources_)
+ : WriteBuffer(nullptr, 0), sources(std::move(sources_))
+{
+ if (sources.empty())
+ {
+ throw Exception(ErrorCodes::CANNOT_CREATE_IO_BUFFER, "Expected non-zero number of buffers for `ForkWriteBuffer`");
+ }
+ set(sources.front()->buffer().begin(), sources.front()->buffer().size());
+}
+
+
+void ForkWriteBuffer::nextImpl()
+{
+ sources.front()->position() = position();
+
+ try
+ {
+ auto & source_buffer = sources.front();
+ for (auto it = sources.begin() + 1; it != sources.end(); ++it)
+ {
+ auto & buffer = *it;
+ buffer->write(source_buffer->buffer().begin(), source_buffer->offset());
+ buffer->next();
+ }
+ source_buffer->next();
+ }
+ catch (Exception & exception)
+ {
+ exception.addMessage("While writing to ForkWriteBuffer");
+ throw;
+ }
+
+}
+
+void ForkWriteBuffer::finalizeImpl()
+{
+ for (const WriteBufferPtr & buffer : sources)
+ {
+ buffer->finalize();
+ }
+}
+
+ForkWriteBuffer::~ForkWriteBuffer()
+{
+ finalize();
+}
+
+
+}
diff --git a/contrib/clickhouse/src/IO/ForkWriteBuffer.h b/contrib/clickhouse/src/IO/ForkWriteBuffer.h
new file mode 100644
index 0000000000..17fc82028a
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ForkWriteBuffer.h
@@ -0,0 +1,34 @@
+#pragma once
+#include <IO/WriteBuffer.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+}
+
+/** ForkWriteBuffer takes a vector of WriteBuffers and writes the same data to all of them.
+ * If the vector of WriteBufferPtrs is empty, it throws an error.
+ * It uses the buffer of the first element as its own buffer and copies the data
+ * from the first buffer to all the other buffers.
+ **/
+class ForkWriteBuffer : public WriteBuffer
+{
+public:
+
+ using WriteBufferPtrs = std::vector<WriteBufferPtr>;
+
+ explicit ForkWriteBuffer(WriteBufferPtrs && sources_);
+ ~ForkWriteBuffer() override;
+
+protected:
+ void nextImpl() override;
+ void finalizeImpl() override;
+
+private:
+ WriteBufferPtrs sources;
+};
+
+}
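
A small usage sketch: tee the same bytes into two in-memory sinks (WriteBufferFromOwnString from IO/WriteBufferFromString.h is assumed):

    auto s1 = std::make_shared<DB::WriteBufferFromOwnString>();
    auto s2 = std::make_shared<DB::WriteBufferFromOwnString>();

    DB::ForkWriteBuffer fork({s1, s2});
    DB::writeString("duplicated payload", fork);
    fork.finalize();  // finalizes every source buffer

    // Both sinks now hold identical bytes: s1->str() == s2->str().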
diff --git a/contrib/clickhouse/src/IO/HTTPChunkedReadBuffer.cpp b/contrib/clickhouse/src/IO/HTTPChunkedReadBuffer.cpp
new file mode 100644
index 0000000000..29034b35e1
--- /dev/null
+++ b/contrib/clickhouse/src/IO/HTTPChunkedReadBuffer.cpp
@@ -0,0 +1,91 @@
+#include <IO/HTTPChunkedReadBuffer.h>
+
+#include <IO/ReadHelpers.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <base/hex.h>
+#include <base/arithmeticOverflow.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ARGUMENT_OUT_OF_BOUND;
+ extern const int UNEXPECTED_END_OF_FILE;
+ extern const int CORRUPTED_DATA;
+}
+
+size_t HTTPChunkedReadBuffer::readChunkHeader()
+{
+ if (in->eof())
+ throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected end of file while reading chunk header of HTTP chunked data");
+
+ if (!isHexDigit(*in->position()))
+ throw Exception(ErrorCodes::CORRUPTED_DATA, "Unexpected data instead of HTTP chunk header");
+
+ size_t res = 0;
+ do
+ {
+ if (common::mulOverflow(res, 16ul, res) || common::addOverflow<size_t>(res, unhex(*in->position()), res))
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Chunk size is out of bounds");
+ ++in->position();
+ } while (!in->eof() && isHexDigit(*in->position()));
+
+ if (res > max_chunk_size)
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Chunk size exceeded the limit (max size: {})", max_chunk_size);
+
+ /// NOTE: If we want to read any chunk extensions, it should be done here.
+
+ skipToCarriageReturnOrEOF(*in);
+
+ if (in->eof())
+ throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected end of file while reading chunk header of HTTP chunked data");
+
+ assertString("\n", *in);
+ return res;
+}
+
+void HTTPChunkedReadBuffer::readChunkFooter()
+{
+ assertString("\r\n", *in);
+}
+
+bool HTTPChunkedReadBuffer::nextImpl()
+{
+ if (!in)
+ return false;
+
+ /// The footer of previous chunk.
+ if (count())
+ readChunkFooter();
+
+ size_t chunk_size = readChunkHeader();
+ if (0 == chunk_size)
+ {
+ readChunkFooter();
+ in.reset(); // prevent double-eof situation.
+ return false;
+ }
+
+ if (in->available() >= chunk_size)
+ {
+ /// Zero-copy read from input.
+ working_buffer = Buffer(in->position(), in->position() + chunk_size);
+ in->position() += chunk_size;
+ }
+ else
+ {
+ /// Chunk is not completely in buffer, copy it to scratch space.
+ memory.resize(chunk_size);
+ in->readStrict(memory.data(), chunk_size);
+ working_buffer = Buffer(memory.data(), memory.data() + chunk_size);
+ }
+
+ /// NOTE: We postpone reading the footer to the next iteration, because it may not be completely in buffer,
+ /// but we need to keep the current data in buffer available.
+
+ return true;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/HTTPChunkedReadBuffer.h b/contrib/clickhouse/src/IO/HTTPChunkedReadBuffer.h
new file mode 100644
index 0000000000..68d90e470f
--- /dev/null
+++ b/contrib/clickhouse/src/IO/HTTPChunkedReadBuffer.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/ReadBuffer.h>
+
+namespace DB
+{
+
+/// Reads data with HTTP Chunked Transfer Encoding.
+class HTTPChunkedReadBuffer : public BufferWithOwnMemory<ReadBuffer>
+{
+public:
+ explicit HTTPChunkedReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t max_chunk_size_)
+ : max_chunk_size(max_chunk_size_), in(std::move(in_))
+ {}
+
+private:
+ const size_t max_chunk_size;
+ std::unique_ptr<ReadBuffer> in;
+
+ size_t readChunkHeader();
+ void readChunkFooter();
+
+ bool nextImpl() override;
+};
+
+}
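
A worked example of the wire format this buffer consumes: each chunk is a hexadecimal size, CRLF, the payload, CRLF, and a zero-size chunk terminates the stream. Decoding from an assumed in-memory source:

    // "4\r\nWiki\r\n6\r\npedia!\r\n0\r\n\r\n" carries the 10-byte body "Wikipedia!".
    auto raw = std::make_unique<DB::ReadBufferFromString>("4\r\nWiki\r\n6\r\npedia!\r\n0\r\n\r\n");
    DB::HTTPChunkedReadBuffer chunked(std::move(raw), /* max_chunk_size */ 1 << 20);

    String body;
    DB::readStringUntilEOF(body, chunked);  // body == "Wikipedia!"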
diff --git a/contrib/clickhouse/src/IO/HTTPCommon.cpp b/contrib/clickhouse/src/IO/HTTPCommon.cpp
new file mode 100644
index 0000000000..077adfd863
--- /dev/null
+++ b/contrib/clickhouse/src/IO/HTTPCommon.cpp
@@ -0,0 +1,449 @@
+#include <IO/HTTPCommon.h>
+
+#include <Server/HTTP/HTTPServerResponse.h>
+#include <Poco/Any.h>
+#include <Common/Concepts.h>
+#include <Common/DNSResolver.h>
+#include <Common/Exception.h>
+#include <Common/MemoryTrackerSwitcher.h>
+#include <Common/PoolBase.h>
+#include <Common/ProfileEvents.h>
+#include <Common/SipHash.h>
+
+#include "clickhouse_config.h"
+
+#if USE_SSL
+# include <Poco/Net/AcceptCertificateHandler.h>
+# include <Poco/Net/Context.h>
+# include <Poco/Net/HTTPSClientSession.h>
+# include <Poco/Net/InvalidCertificateHandler.h>
+# include <Poco/Net/PrivateKeyPassphraseHandler.h>
+# include <Poco/Net/RejectCertificateHandler.h>
+# include <Poco/Net/SSLManager.h>
+# include <Poco/Net/SecureStreamSocket.h>
+#endif
+
+#include <Poco/Util/Application.h>
+
+#include <sstream>
+#include <tuple>
+#include <unordered_map>
+
+
+namespace ProfileEvents
+{
+ extern const Event CreatedHTTPConnections;
+}
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int RECEIVED_ERROR_FROM_REMOTE_IO_SERVER;
+ extern const int RECEIVED_ERROR_TOO_MANY_REQUESTS;
+ extern const int FEATURE_IS_NOT_ENABLED_AT_BUILD_TIME;
+ extern const int UNSUPPORTED_URI_SCHEME;
+ extern const int LOGICAL_ERROR;
+}
+
+
+namespace
+{
+ void setTimeouts(Poco::Net::HTTPClientSession & session, const ConnectionTimeouts & timeouts)
+ {
+ session.setTimeout(timeouts.connection_timeout, timeouts.send_timeout, timeouts.receive_timeout);
+ session.setKeepAliveTimeout(timeouts.http_keep_alive_timeout);
+ }
+
+ template <typename Session>
+ requires std::derived_from<Session, Poco::Net::HTTPClientSession>
+ class HTTPSessionAdapter : public Session
+ {
+ static_assert(std::has_virtual_destructor_v<Session>, "The base class must have a virtual destructor");
+
+ public:
+ HTTPSessionAdapter(const std::string & host, UInt16 port) : Session(host, port), log{&Poco::Logger::get("HTTPSessionAdapter")} { }
+ ~HTTPSessionAdapter() override = default;
+
+ protected:
+#if 0
+ void reconnect() override
+ {
+ // First of all will try to establish connection with last used addr.
+ if (!Session::getResolvedHost().empty())
+ {
+ try
+ {
+ Session::reconnect();
+ return;
+ }
+ catch (...)
+ {
+ Session::close();
+ LOG_TRACE(
+ log,
+ "Last ip ({}) is unreachable for {}:{}. Will try another resolved address.",
+ Session::getResolvedHost(),
+ Session::getHost(),
+ Session::getPort());
+ }
+ }
+
+            const auto endpoints = DNSResolver::instance().resolveHostAll(Session::getHost());
+
+            for (auto it = endpoints.begin();;)
+ {
+ try
+ {
+ Session::setResolvedHost(it->toString());
+ Session::reconnect();
+
+ LOG_TRACE(
+ log,
+ "Created HTTP(S) session with {}:{} ({}:{})",
+ Session::getHost(),
+ Session::getPort(),
+ it->toString(),
+ Session::getPort());
+
+ break;
+ }
+ catch (...)
+ {
+ Session::close();
+                    if (++it == endpoints.end())
+ {
+ Session::setResolvedHost("");
+ throw;
+ }
+ LOG_TRACE(
+ log,
+ "Failed to create connection with {}:{}, Will try another resolved address. {}",
+ Session::getResolvedHost(),
+ Session::getPort(),
+ getCurrentExceptionMessage(false));
+ }
+ }
+ }
+#endif
+ Poco::Logger * log;
+ };
+
+ bool isHTTPS(const Poco::URI & uri)
+ {
+ if (uri.getScheme() == "https")
+ return true;
+ else if (uri.getScheme() == "http")
+ return false;
+ else
+ throw Exception(ErrorCodes::UNSUPPORTED_URI_SCHEME, "Unsupported scheme in URI '{}'", uri.toString());
+ }
+
+ HTTPSessionPtr makeHTTPSessionImpl(
+ const std::string & host,
+ UInt16 port,
+ bool https,
+ bool keep_alive,
+ Poco::Net::HTTPClientSession::ProxyConfig proxy_config = {})
+ {
+ HTTPSessionPtr session;
+
+ if (https)
+ {
+#if USE_SSL
+ session = std::make_shared<HTTPSessionAdapter<Poco::Net::HTTPSClientSession>>(host, port);
+#else
+ throw Exception(ErrorCodes::FEATURE_IS_NOT_ENABLED_AT_BUILD_TIME, "ClickHouse was built without HTTPS support");
+#endif
+ }
+ else
+ {
+ session = std::make_shared<HTTPSessionAdapter<Poco::Net::HTTPClientSession>>(host, port);
+ }
+
+ ProfileEvents::increment(ProfileEvents::CreatedHTTPConnections);
+
+ /// doesn't work properly without patch
+ session->setKeepAlive(keep_alive);
+
+ session->setProxyConfig(proxy_config);
+
+ return session;
+ }
+
+ class SingleEndpointHTTPSessionPool : public PoolBase<Poco::Net::HTTPClientSession>
+ {
+ private:
+ const std::string host;
+ const UInt16 port;
+ const bool https;
+ const String proxy_host;
+ const UInt16 proxy_port;
+ const bool proxy_https;
+
+ using Base = PoolBase<Poco::Net::HTTPClientSession>;
+
+ ObjectPtr allocObject() override
+ {
+ /// Pool is global, we shouldn't attribute this memory to query/user.
+ MemoryTrackerSwitcher switcher{&total_memory_tracker};
+
+ auto session = makeHTTPSessionImpl(host, port, https, true);
+ if (!proxy_host.empty())
+ {
+ const String proxy_scheme = proxy_https ? "https" : "http";
+ session->setProxyHost(proxy_host);
+ session->setProxyPort(proxy_port);
+
+#if 0
+ session->setProxyProtocol(proxy_scheme);
+
+ /// Turn on tunnel mode if proxy scheme is HTTP while endpoint scheme is HTTPS.
+ session->setProxyTunnel(!proxy_https && https);
+#endif
+ }
+ return session;
+ }
+
+ public:
+ SingleEndpointHTTPSessionPool(
+ const std::string & host_,
+ UInt16 port_,
+ bool https_,
+ const std::string & proxy_host_,
+ UInt16 proxy_port_,
+ bool proxy_https_,
+ size_t max_pool_size_,
+ bool wait_on_pool_size_limit)
+ : Base(
+ static_cast<unsigned>(max_pool_size_),
+ &Poco::Logger::get("HTTPSessionPool"),
+ wait_on_pool_size_limit ? BehaviourOnLimit::Wait : BehaviourOnLimit::AllocateNewBypassingPool)
+ , host(host_)
+ , port(port_)
+ , https(https_)
+ , proxy_host(proxy_host_)
+ , proxy_port(proxy_port_)
+ , proxy_https(proxy_https_)
+ {
+ }
+ };
+
+ class HTTPSessionPool : private boost::noncopyable
+ {
+ public:
+ struct Key
+ {
+ String target_host;
+ UInt16 target_port;
+ bool is_target_https;
+ String proxy_host;
+ UInt16 proxy_port;
+ bool is_proxy_https;
+ bool wait_on_pool_size_limit;
+
+ bool operator ==(const Key & rhs) const
+ {
+ return std::tie(target_host, target_port, is_target_https, proxy_host, proxy_port, is_proxy_https, wait_on_pool_size_limit)
+ == std::tie(rhs.target_host, rhs.target_port, rhs.is_target_https, rhs.proxy_host, rhs.proxy_port, rhs.is_proxy_https, rhs.wait_on_pool_size_limit);
+ }
+ };
+
+ private:
+ using PoolPtr = std::shared_ptr<SingleEndpointHTTPSessionPool>;
+ using Entry = SingleEndpointHTTPSessionPool::Entry;
+
+ struct Hasher
+ {
+ size_t operator()(const Key & k) const
+ {
+ SipHash s;
+ s.update(k.target_host);
+ s.update(k.target_port);
+ s.update(k.is_target_https);
+ s.update(k.proxy_host);
+ s.update(k.proxy_port);
+ s.update(k.is_proxy_https);
+ s.update(k.wait_on_pool_size_limit);
+ return s.get64();
+ }
+ };
+
+ std::mutex mutex;
+ std::unordered_map<Key, PoolPtr, Hasher> endpoints_pool;
+
+ protected:
+ HTTPSessionPool() = default;
+
+ public:
+ static auto & instance()
+ {
+ static HTTPSessionPool instance;
+ return instance;
+ }
+
+ Entry getSession(
+ const Poco::URI & uri,
+ const Poco::URI & proxy_uri,
+ const ConnectionTimeouts & timeouts,
+ size_t max_connections_per_endpoint,
+ bool wait_on_pool_size_limit)
+ {
+ std::unique_lock lock(mutex);
+ const std::string & host = uri.getHost();
+ UInt16 port = uri.getPort();
+ bool https = isHTTPS(uri);
+
+ String proxy_host;
+ UInt16 proxy_port = 0;
+ bool proxy_https = false;
+ if (!proxy_uri.empty())
+ {
+ proxy_host = proxy_uri.getHost();
+ proxy_port = proxy_uri.getPort();
+ proxy_https = isHTTPS(proxy_uri);
+ }
+
+ HTTPSessionPool::Key key{host, port, https, proxy_host, proxy_port, proxy_https, wait_on_pool_size_limit};
+ auto pool_ptr = endpoints_pool.find(key);
+ if (pool_ptr == endpoints_pool.end())
+ std::tie(pool_ptr, std::ignore) = endpoints_pool.emplace(
+ key,
+ std::make_shared<SingleEndpointHTTPSessionPool>(
+ host,
+ port,
+ https,
+ proxy_host,
+ proxy_port,
+ proxy_https,
+ max_connections_per_endpoint,
+ wait_on_pool_size_limit));
+
+        /// Some routines hold session objects until the end of their lifetime, and may create more sessions during that time frame.
+        /// If such a routine holds `lock` while it waits on another lock inside `pool_ptr->second->get`, it is impossible to create any
+        /// new session, so the routine can never finish, return its session to the pool and unlock the thread waiting inside `pool_ptr->second->get`.
+        /// To avoid such a deadlock we release `lock` before entering `pool_ptr->second->get`.
+ lock.unlock();
+
+ auto retry_timeout = timeouts.connection_timeout.totalMicroseconds();
+ auto session = pool_ptr->second->get(retry_timeout);
+
+ setTimeouts(*session, timeouts);
+
+ return session;
+ }
+ };
+}
+
+void setResponseDefaultHeaders(HTTPServerResponse & response, size_t keep_alive_timeout)
+{
+ if (!response.getKeepAlive())
+ return;
+
+ Poco::Timespan timeout(keep_alive_timeout, 0);
+ if (timeout.totalSeconds())
+ response.set("Keep-Alive", "timeout=" + std::to_string(timeout.totalSeconds()));
+}
+
+HTTPSessionPtr makeHTTPSession(
+ const Poco::URI & uri,
+ const ConnectionTimeouts & timeouts,
+ Poco::Net::HTTPClientSession::ProxyConfig proxy_config
+)
+{
+ const std::string & host = uri.getHost();
+ UInt16 port = uri.getPort();
+ bool https = isHTTPS(uri);
+
+ auto session = makeHTTPSessionImpl(host, port, https, false, proxy_config);
+ setTimeouts(*session, timeouts);
+ return session;
+}
+
+
+PooledHTTPSessionPtr makePooledHTTPSession(
+ const Poco::URI & uri,
+ const ConnectionTimeouts & timeouts,
+ size_t per_endpoint_pool_size,
+ bool wait_on_pool_size_limit)
+{
+ return makePooledHTTPSession(uri, {}, timeouts, per_endpoint_pool_size, wait_on_pool_size_limit);
+}
+
+PooledHTTPSessionPtr makePooledHTTPSession(
+ const Poco::URI & uri,
+ const Poco::URI & proxy_uri,
+ const ConnectionTimeouts & timeouts,
+ size_t per_endpoint_pool_size,
+ bool wait_on_pool_size_limit)
+{
+ return HTTPSessionPool::instance().getSession(uri, proxy_uri, timeouts, per_endpoint_pool_size, wait_on_pool_size_limit);
+}
+
+bool isRedirect(const Poco::Net::HTTPResponse::HTTPStatus status)
+{
+    return status == Poco::Net::HTTPResponse::HTTP_MOVED_PERMANENTLY
+        || status == Poco::Net::HTTPResponse::HTTP_FOUND
+        || status == Poco::Net::HTTPResponse::HTTP_SEE_OTHER
+        || status == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT;
+}
+
+std::istream * receiveResponse(
+ Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, const bool allow_redirects)
+{
+ auto & istr = session.receiveResponse(response);
+ assertResponseIsOk(request, response, istr, allow_redirects);
+ return &istr;
+}
+
+void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr, const bool allow_redirects)
+{
+ auto status = response.getStatus();
+
+ if (!(status == Poco::Net::HTTPResponse::HTTP_OK
+ || status == Poco::Net::HTTPResponse::HTTP_CREATED
+ || status == Poco::Net::HTTPResponse::HTTP_ACCEPTED
+ || status == Poco::Net::HTTPResponse::HTTP_PARTIAL_CONTENT /// Reading with Range header was successful.
+ || (isRedirect(status) && allow_redirects)))
+ {
+ int code = status == Poco::Net::HTTPResponse::HTTP_TOO_MANY_REQUESTS
+ ? ErrorCodes::RECEIVED_ERROR_TOO_MANY_REQUESTS
+ : ErrorCodes::RECEIVED_ERROR_FROM_REMOTE_IO_SERVER;
+
+ std::stringstream body; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
+ body.exceptions(std::ios::failbit);
+ body << istr.rdbuf();
+
+ throw HTTPException(code, request.getURI(), status, response.getReason(), body.str());
+ }
+}
+
+Exception HTTPException::makeExceptionMessage(
+ int code,
+ const std::string & uri,
+ Poco::Net::HTTPResponse::HTTPStatus http_status,
+ const std::string & reason,
+ const std::string & body)
+{
+ return Exception(code,
+ "Received error from remote server {}. "
+ "HTTP status code: {} {}, "
+ "body: {}",
+ uri, static_cast<int>(http_status), reason, body);
+}
+
+void markSessionForReuse(Poco::Net::HTTPSession & session)
+{
+ const auto & session_data = session.sessionData();
+ if (!session_data.empty() && !Poco::AnyCast<HTTPSessionReuseTag>(&session_data))
+ throw Exception(
+ ErrorCodes::LOGICAL_ERROR, "Data of an unexpected type ({}) is attached to the session", session_data.type().name());
+
+ session.attachSessionData(HTTPSessionReuseTag{});
+}
+
+void markSessionForReuse(HTTPSessionPtr session)
+{
+ markSessionForReuse(*session);
+}
+
+void markSessionForReuse(PooledHTTPSessionPtr session)
+{
+ markSessionForReuse(static_cast<Poco::Net::HTTPSession &>(*session));
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/HTTPCommon.h b/contrib/clickhouse/src/IO/HTTPCommon.h
new file mode 100644
index 0000000000..caf2fa361d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/HTTPCommon.h
@@ -0,0 +1,104 @@
+#pragma once
+
+#include <memory>
+#include <mutex>
+
+#include <Poco/Net/HTTPClientSession.h>
+#include <Poco/Net/HTTPRequest.h>
+#include <Poco/Net/HTTPResponse.h>
+#include <Poco/URI.h>
+#include <Common/PoolBase.h>
+#include <Poco/URIStreamFactory.h>
+
+#include <IO/ConnectionTimeouts.h>
+
+
+namespace DB
+{
+
+class HTTPServerResponse;
+
+class HTTPException : public Exception
+{
+public:
+ HTTPException(
+ int code,
+ const std::string & uri,
+ Poco::Net::HTTPResponse::HTTPStatus http_status_,
+ const std::string & reason,
+ const std::string & body
+ )
+ : Exception(makeExceptionMessage(code, uri, http_status_, reason, body))
+ , http_status(http_status_)
+ {}
+
+ HTTPException * clone() const override { return new HTTPException(*this); }
+ void rethrow() const override { throw *this; }
+
+ int getHTTPStatus() const { return http_status; }
+
+private:
+ Poco::Net::HTTPResponse::HTTPStatus http_status{};
+
+ static Exception makeExceptionMessage(
+ int code,
+ const std::string & uri,
+ Poco::Net::HTTPResponse::HTTPStatus http_status,
+ const std::string & reason,
+ const std::string & body);
+
+ const char * name() const noexcept override { return "DB::HTTPException"; }
+ const char * className() const noexcept override { return "DB::HTTPException"; }
+};
+
+using PooledHTTPSessionPtr = PoolBase<Poco::Net::HTTPClientSession>::Entry; // SingleEndpointHTTPSessionPool::Entry
+using HTTPSessionPtr = std::shared_ptr<Poco::Net::HTTPClientSession>;
+
+/// If a session has this tag attached, it will be reused without calling `reset()` on it.
+/// No pooled session has this tag attached right after being taken from a pool.
+/// If the request and the response were fully written/read, the client code should add this tag
+/// explicitly by calling `markSessionForReuse()`.
+struct HTTPSessionReuseTag
+{
+};
+
+void markSessionForReuse(HTTPSessionPtr session);
+void markSessionForReuse(PooledHTTPSessionPtr session);
+
+
+void setResponseDefaultHeaders(HTTPServerResponse & response, size_t keep_alive_timeout);
+
+/// Creates a session object to perform requests and sets the required parameters.
+HTTPSessionPtr makeHTTPSession(
+ const Poco::URI & uri,
+ const ConnectionTimeouts & timeouts,
+ Poco::Net::HTTPClientSession::ProxyConfig proxy_config = {}
+);
+
+/// Like the previous function, but takes the session from a pool; overloads without and with a proxy URI.
+PooledHTTPSessionPtr makePooledHTTPSession(
+ const Poco::URI & uri,
+ const ConnectionTimeouts & timeouts,
+ size_t per_endpoint_pool_size,
+ bool wait_on_pool_size_limit = true);
+
+PooledHTTPSessionPtr makePooledHTTPSession(
+ const Poco::URI & uri,
+ const Poco::URI & proxy_uri,
+ const ConnectionTimeouts & timeouts,
+ size_t per_endpoint_pool_size,
+ bool wait_on_pool_size_limit = true);
+
+bool isRedirect(Poco::Net::HTTPResponse::HTTPStatus status);
+
+/** Used to receive response (response headers and possibly body)
+ * after sending data (request headers and possibly body).
+ * Throws an exception in case of a non-HTTP_OK (200) response code.
+ * Returned istream lives in 'session' object.
+ */
+std::istream * receiveResponse(
+ Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, bool allow_redirects);
+
+void assertResponseIsOk(
+ const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr, bool allow_redirects = false);
+}
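
A hedged end-to-end sketch of the intended call sequence (`timeouts` is an assumed ConnectionTimeouts instance): take a pooled session, send a request, validate the response, and mark the session reusable only once the body has been fully read.

    Poco::URI uri("http://example.com/path");
    auto session = DB::makePooledHTTPSession(uri, timeouts, /* per_endpoint_pool_size */ 16);

    Poco::Net::HTTPRequest request(Poco::Net::HTTPRequest::HTTP_GET, uri.getPathAndQuery());
    session->sendRequest(request);

    Poco::Net::HTTPResponse response;
    std::istream * body = DB::receiveResponse(*session, request, response, /* allow_redirects */ false);
    // ... read *body to the end ...

    DB::markSessionForReuse(session);  // safe only after the exchange completed cleanly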
diff --git a/contrib/clickhouse/src/IO/HTTPHeaderEntries.h b/contrib/clickhouse/src/IO/HTTPHeaderEntries.h
new file mode 100644
index 0000000000..5862f1ead1
--- /dev/null
+++ b/contrib/clickhouse/src/IO/HTTPHeaderEntries.h
@@ -0,0 +1,18 @@
+#pragma once
+#include <string>
+
+namespace DB
+{
+
+struct HTTPHeaderEntry
+{
+ std::string name;
+ std::string value;
+
+ HTTPHeaderEntry(const std::string & name_, const std::string & value_) : name(name_), value(value_) {}
+ inline bool operator==(const HTTPHeaderEntry & other) const { return name == other.name && value == other.value; }
+};
+
+using HTTPHeaderEntries = std::vector<HTTPHeaderEntry>;
+
+}
diff --git a/contrib/clickhouse/src/IO/HadoopSnappyReadBuffer.cpp b/contrib/clickhouse/src/IO/HadoopSnappyReadBuffer.cpp
new file mode 100644
index 0000000000..37b709bc89
--- /dev/null
+++ b/contrib/clickhouse/src/IO/HadoopSnappyReadBuffer.cpp
@@ -0,0 +1,240 @@
+#include "clickhouse_config.h"
+
+#if USE_SNAPPY
+#include <fcntl.h>
+#include <sys/types.h>
+#include <memory>
+#include <string>
+#include <cstring>
+
+#include <snappy-c.h>
+
+#include "HadoopSnappyReadBuffer.h"
+
+#include <IO/WithFileName.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int SNAPPY_UNCOMPRESS_FAILED;
+}
+
+
+inline bool HadoopSnappyDecoder::checkBufferLength(int max) const
+{
+ return buffer_length >= 0 && buffer_length < max;
+}
+
+inline bool HadoopSnappyDecoder::checkAvailIn(size_t avail_in, int min)
+{
+ return avail_in >= static_cast<size_t>(min);
+}
+
+inline void HadoopSnappyDecoder::copyToBuffer(size_t * avail_in, const char ** next_in)
+{
+ assert(*avail_in + buffer_length <= sizeof(buffer));
+
+ memcpy(buffer + buffer_length, *next_in, *avail_in);
+
+ buffer_length += *avail_in;
+ *next_in += *avail_in;
+ *avail_in = 0;
+}
+
+
+inline uint32_t HadoopSnappyDecoder::readLength(const char * in)
+{
+ uint32_t b1 = *(reinterpret_cast<const uint8_t *>(in));
+ uint32_t b2 = *(reinterpret_cast<const uint8_t *>(in + 1));
+ uint32_t b3 = *(reinterpret_cast<const uint8_t *>(in + 2));
+ uint32_t b4 = *(reinterpret_cast<const uint8_t *>(in + 3));
+ uint32_t res = ((b1 << 24) + (b2 << 16) + (b3 << 8) + b4);
+ return res;
+}
+
+
+inline HadoopSnappyDecoder::Status HadoopSnappyDecoder::readLength(size_t * avail_in, const char ** next_in, int * length)
+{
+ char tmp[4] = {0};
+
+ if (!checkBufferLength(4))
+ return Status::INVALID_INPUT;
+ memcpy(tmp, buffer, buffer_length);
+
+ if (!checkAvailIn(*avail_in, 4 - buffer_length))
+ {
+ copyToBuffer(avail_in, next_in);
+ return Status::NEEDS_MORE_INPUT;
+ }
+ memcpy(tmp + buffer_length, *next_in, 4 - buffer_length);
+
+ *avail_in -= 4 - buffer_length;
+ *next_in += 4 - buffer_length;
+ buffer_length = 0;
+ *length = readLength(tmp);
+ return Status::OK;
+}
+
+inline HadoopSnappyDecoder::Status HadoopSnappyDecoder::readBlockLength(size_t * avail_in, const char ** next_in)
+{
+ if (block_length < 0)
+ {
+ return readLength(avail_in, next_in, &block_length);
+ }
+ return Status::OK;
+}
+
+inline HadoopSnappyDecoder::Status HadoopSnappyDecoder::readCompressedLength(size_t * avail_in, const char ** next_in)
+{
+ if (compressed_length < 0)
+ {
+ auto status = readLength(avail_in, next_in, &compressed_length);
+ if (unlikely(compressed_length > 0 && static_cast<size_t>(compressed_length) > sizeof(buffer)))
+ return Status::TOO_LARGE_COMPRESSED_BLOCK;
+
+ return status;
+ }
+ return Status::OK;
+}
+
+inline HadoopSnappyDecoder::Status
+HadoopSnappyDecoder::readCompressedData(size_t * avail_in, const char ** next_in, size_t * avail_out, char ** next_out)
+{
+ if (!checkBufferLength(compressed_length))
+ return Status::INVALID_INPUT;
+
+ if (!checkAvailIn(*avail_in, compressed_length - buffer_length))
+ {
+ copyToBuffer(avail_in, next_in);
+ return Status::NEEDS_MORE_INPUT;
+ }
+
+ const char * compressed = nullptr;
+ if (buffer_length > 0)
+ {
+ compressed = buffer;
+ memcpy(buffer + buffer_length, *next_in, compressed_length - buffer_length);
+ }
+ else
+ {
+ compressed = const_cast<char *>(*next_in);
+ }
+ size_t uncompressed_length = *avail_out;
+ auto status = snappy_uncompress(compressed, compressed_length, *next_out, &uncompressed_length);
+ if (status != SNAPPY_OK)
+ {
+ return Status(status);
+ }
+
+ *avail_in -= compressed_length - buffer_length;
+ *next_in += compressed_length - buffer_length;
+ *avail_out -= uncompressed_length;
+ *next_out += uncompressed_length;
+
+ total_uncompressed_length += uncompressed_length;
+ compressed_length = -1;
+ buffer_length = 0;
+ return Status::OK;
+}
+
+HadoopSnappyDecoder::Status HadoopSnappyDecoder::readBlock(size_t * avail_in, const char ** next_in, size_t * avail_out, char ** next_out)
+{
+ if (*avail_in == 0)
+ {
+ if (buffer_length == 0 && block_length < 0 && compressed_length < 0)
+ return Status::OK;
+ return Status::NEEDS_MORE_INPUT;
+ }
+
+ HadoopSnappyDecoder::Status status = readBlockLength(avail_in, next_in);
+ if (status != Status::OK)
+ return status;
+
+ while (total_uncompressed_length < block_length)
+ {
+ status = readCompressedLength(avail_in, next_in);
+ if (status != Status::OK)
+ return status;
+
+ status = readCompressedData(avail_in, next_in, avail_out, next_out);
+ if (status != Status::OK)
+ return status;
+ }
+ if (total_uncompressed_length != block_length)
+ {
+ return Status::INVALID_INPUT;
+ }
+ return Status::OK;
+}
+
+HadoopSnappyReadBuffer::HadoopSnappyReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t buf_size, char * existing_memory, size_t alignment)
+ : CompressedReadBufferWrapper(std::move(in_), buf_size, existing_memory, alignment)
+ , decoder(std::make_unique<HadoopSnappyDecoder>())
+ , in_available(0)
+ , in_data(nullptr)
+ , out_capacity(0)
+ , out_data(nullptr)
+ , eof(false)
+{
+}
+
+HadoopSnappyReadBuffer::~HadoopSnappyReadBuffer() = default;
+
+bool HadoopSnappyReadBuffer::nextImpl()
+{
+ if (eof)
+ return false;
+
+ do
+ {
+ if (!in_available)
+ {
+ in->nextIfAtEnd();
+ in_available = in->buffer().end() - in->position();
+ in_data = in->position();
+ }
+
+ if (decoder->result == Status::NEEDS_MORE_INPUT && (!in_available || in->eof()))
+ {
+ throw Exception(
+ ErrorCodes::SNAPPY_UNCOMPRESS_FAILED,
+ "hadoop snappy decode error: {}{}",
+ statusToString(decoder->result),
+ getExceptionEntryWithFileName(*in));
+ }
+
+ out_capacity = internal_buffer.size();
+ out_data = internal_buffer.begin();
+ decoder->result = decoder->readBlock(&in_available, &in_data, &out_capacity, &out_data);
+
+ in->position() = in->buffer().end() - in_available;
+ }
+ while (decoder->result == Status::NEEDS_MORE_INPUT);
+
+ working_buffer.resize(internal_buffer.size() - out_capacity);
+
+ if (decoder->result == Status::OK)
+ {
+ decoder->reset();
+ if (in->eof())
+ {
+ eof = true;
+ return !working_buffer.empty();
+ }
+ return true;
+ }
+ else if (decoder->result != Status::NEEDS_MORE_INPUT)
+ {
+ throw Exception(
+ ErrorCodes::SNAPPY_UNCOMPRESS_FAILED,
+ "hadoop snappy decode error: {}{}",
+ statusToString(decoder->result),
+ getExceptionEntryWithFileName(*in));
+ }
+ return true;
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/HadoopSnappyReadBuffer.h b/contrib/clickhouse/src/IO/HadoopSnappyReadBuffer.h
new file mode 100644
index 0000000000..bcc438489d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/HadoopSnappyReadBuffer.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_SNAPPY
+
+#include <memory>
+#include <IO/ReadBuffer.h>
+#include <IO/CompressedReadBufferWrapper.h>
+
+namespace DB
+{
+
+
+/*
+ * The hadoop-snappy format is one of the Snappy-based compression formats used in Hadoop. It uses its own framing, as follows:
+ * 1. A compressed file consists of one or more blocks.
+ * 2. A block consists of an uncompressed length (big endian 4-byte integer) and one or more subblocks.
+ * 3. A subblock consists of a compressed length (big endian 4-byte integer) and the raw compressed data.
+ *
+ * HadoopSnappyDecoder implements the decompression of data compressed in the hadoop-snappy format.
+ */
+class HadoopSnappyDecoder
+{
+public:
+ enum class Status : int
+ {
+ OK = 0,
+ INVALID_INPUT = 1,
+ BUFFER_TOO_SMALL = 2,
+ NEEDS_MORE_INPUT = 3,
+ TOO_LARGE_COMPRESSED_BLOCK = 4,
+ };
+
+ HadoopSnappyDecoder() = default;
+ ~HadoopSnappyDecoder() = default;
+
+ Status readBlock(size_t * avail_in, const char ** next_in, size_t * avail_out, char ** next_out);
+
+ inline void reset()
+ {
+ buffer_length = 0;
+ block_length = -1;
+ compressed_length = -1;
+ total_uncompressed_length = 0;
+ }
+
+ Status result = Status::OK;
+
+private:
+ inline bool checkBufferLength(int max) const;
+ inline static bool checkAvailIn(size_t avail_in, int min);
+
+ inline void copyToBuffer(size_t * avail_in, const char ** next_in);
+
+ inline static uint32_t readLength(const char * in);
+ inline Status readLength(size_t * avail_in, const char ** next_in, int * length);
+ inline Status readBlockLength(size_t * avail_in, const char ** next_in);
+ inline Status readCompressedLength(size_t * avail_in, const char ** next_in);
+ inline Status readCompressedData(size_t * avail_in, const char ** next_in, size_t * avail_out, char ** next_out);
+
+ char buffer[DBMS_DEFAULT_BUFFER_SIZE] = {0};
+ int buffer_length = 0;
+
+ int block_length = -1;
+ int compressed_length = -1;
+ int total_uncompressed_length = 0;
+};
+
+/// HadoopSnappyReadBuffer implements a read buffer for data compressed in the hadoop-snappy format.
+class HadoopSnappyReadBuffer : public CompressedReadBufferWrapper
+{
+public:
+ using Status = HadoopSnappyDecoder::Status;
+
+ inline static String statusToString(Status status)
+ {
+ switch (status)
+ {
+ case Status::OK:
+ return "OK";
+ case Status::INVALID_INPUT:
+ return "INVALID_INPUT";
+ case Status::BUFFER_TOO_SMALL:
+ return "BUFFER_TOO_SMALL";
+ case Status::NEEDS_MORE_INPUT:
+ return "NEEDS_MORE_INPUT";
+ case Status::TOO_LARGE_COMPRESSED_BLOCK:
+ return "TOO_LARGE_COMPRESSED_BLOCK";
+ }
+ UNREACHABLE();
+ }
+
+ explicit HadoopSnappyReadBuffer(
+ std::unique_ptr<ReadBuffer> in_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~HadoopSnappyReadBuffer() override;
+
+private:
+ bool nextImpl() override;
+
+ std::unique_ptr<HadoopSnappyDecoder> decoder;
+
+ size_t in_available;
+ const char * in_data;
+
+ size_t out_capacity;
+ char * out_data;
+
+ bool eof;
+};
+
+}
+#endif
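
To make the framing concrete, a sketch that produces a single-block, single-subblock hadoop-snappy frame with the snappy-c API (the big-endian helper is written out by hand; `makeHadoopSnappyFrame` is a hypothetical name):

    // Append a 32-bit value in big-endian order, as the framing requires.
    void appendBE32(std::string & out, uint32_t v)
    {
        out.push_back(static_cast<char>(v >> 24));
        out.push_back(static_cast<char>(v >> 16));
        out.push_back(static_cast<char>(v >> 8));
        out.push_back(static_cast<char>(v));
    }

    std::string makeHadoopSnappyFrame(const std::string & raw)
    {
        size_t compressed_size = snappy_max_compressed_length(raw.size());
        std::string compressed(compressed_size, '\0');
        snappy_compress(raw.data(), raw.size(), compressed.data(), &compressed_size);

        std::string frame;
        appendBE32(frame, static_cast<uint32_t>(raw.size()));        // block: uncompressed length
        appendBE32(frame, static_cast<uint32_t>(compressed_size));   // subblock: compressed length
        frame.append(compressed.data(), compressed_size);            // subblock: raw compressed data
        return frame;
    }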
diff --git a/contrib/clickhouse/src/IO/HashingReadBuffer.h b/contrib/clickhouse/src/IO/HashingReadBuffer.h
new file mode 100644
index 0000000000..a0a029e6f8
--- /dev/null
+++ b/contrib/clickhouse/src/IO/HashingReadBuffer.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <IO/HashingWriteBuffer.h>
+#include <IO/ReadBuffer.h>
+
+namespace DB
+{
+
+/*
+ * Calculates the hash of the data being read, while the data itself is read from the nested ReadBuffer.
+ * Small pieces are copied into the buffer's own memory.
+ */
+class HashingReadBuffer : public IHashingBuffer<ReadBuffer>
+{
+public:
+ explicit HashingReadBuffer(ReadBuffer & in_, size_t block_size_ = DBMS_DEFAULT_HASHING_BLOCK_SIZE)
+ : IHashingBuffer<ReadBuffer>(block_size_), in(in_)
+ {
+ working_buffer = in.buffer();
+ pos = in.position();
+ hashing_begin = pos;
+ }
+
+ uint128 getHash()
+ {
+ if (pos > hashing_begin)
+ {
+ calculateHash(hashing_begin, pos - hashing_begin);
+ hashing_begin = pos;
+ }
+ return IHashingBuffer<ReadBuffer>::getHash();
+ }
+
+private:
+ bool nextImpl() override
+ {
+ if (pos > hashing_begin)
+ calculateHash(hashing_begin, pos - hashing_begin);
+
+ in.position() = pos;
+ bool res = in.next();
+ working_buffer = in.buffer();
+
+ // `pos` may be different from working_buffer.begin() when using sophisticated ReadBuffers.
+ pos = in.position();
+ hashing_begin = pos;
+
+ return res;
+ }
+
+ ReadBuffer & in;
+ BufferBase::Position hashing_begin;
+};
+
+}
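
Usage sketch: wrap an existing ReadBuffer, read it through, and obtain the 128-bit CityHash of everything that passed by (ReadBufferFromString is assumed as the source):

    DB::ReadBufferFromString source("payload to be hashed");
    DB::HashingReadBuffer hashing_in(source);

    String consumed;
    DB::readStringUntilEOF(consumed, hashing_in);  // all reads go through the wrapper

    auto hash = hashing_in.getHash();  // CityHash_v1_0_2::uint128 over every byte read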
diff --git a/contrib/clickhouse/src/IO/HashingWriteBuffer.cpp b/contrib/clickhouse/src/IO/HashingWriteBuffer.cpp
new file mode 100644
index 0000000000..d2461d4f52
--- /dev/null
+++ b/contrib/clickhouse/src/IO/HashingWriteBuffer.cpp
@@ -0,0 +1,54 @@
+#include <IO/HashingWriteBuffer.h>
+#include <iomanip>
+
+
+namespace DB
+{
+
+/// The computed hash depends on how the data is partitioned into blocks,
+/// so we hash n complete blocks and one incomplete tail.
+template <typename Buffer>
+void IHashingBuffer<Buffer>::calculateHash(DB::BufferBase::Position data, size_t len)
+{
+ if (len)
+ {
+ /// if the data is shorter than `block_size`, put it into the buffer and calculate the hash later
+ if (block_pos + len < block_size)
+ {
+ memcpy(&BufferWithOwnMemory<Buffer>::memory[block_pos], data, len);
+ block_pos += len;
+ }
+ else
+ {
+ /// if something is already in the buffer, fill it up to a complete block and hash it
+ if (block_pos)
+ {
+ size_t n = block_size - block_pos;
+ memcpy(&BufferWithOwnMemory<Buffer>::memory[block_pos], data, n);
+ append(&BufferWithOwnMemory<Buffer>::memory[0]);
+ len -= n;
+ data += n;
+ block_pos = 0;
+ }
+
+ while (len >= block_size)
+ {
+ append(data);
+ len -= block_size;
+ data += block_size;
+ }
+
+ /// keep the remainder in the buffer
+ if (len)
+ {
+ memcpy(&BufferWithOwnMemory<Buffer>::memory[0], data, len);
+ block_pos = len;
+ }
+ }
+ }
+}
+
+template class IHashingBuffer<DB::ReadBuffer>;
+template class IHashingBuffer<DB::WriteBuffer>;
+
+}
diff --git a/contrib/clickhouse/src/IO/HashingWriteBuffer.h b/contrib/clickhouse/src/IO/HashingWriteBuffer.h
new file mode 100644
index 0000000000..8edfa45a6b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/HashingWriteBuffer.h
@@ -0,0 +1,92 @@
+#pragma once
+
+#include <IO/WriteBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/ReadHelpers.h>
+#include <city.h>
+
+#define DBMS_DEFAULT_HASHING_BLOCK_SIZE 2048ULL
+
+
+namespace DB
+{
+
+template <typename Buffer>
+class IHashingBuffer : public BufferWithOwnMemory<Buffer>
+{
+public:
+ using uint128 = CityHash_v1_0_2::uint128;
+
+ explicit IHashingBuffer(size_t block_size_ = DBMS_DEFAULT_HASHING_BLOCK_SIZE)
+ : BufferWithOwnMemory<Buffer>(block_size_), block_pos(0), block_size(block_size_), state(0, 0)
+ {
+ }
+
+ uint128 getHash()
+ {
+ if (block_pos)
+ return CityHash_v1_0_2::CityHash128WithSeed(BufferWithOwnMemory<Buffer>::memory.data(), block_pos, state);
+ else
+ return state;
+ }
+
+ void append(DB::BufferBase::Position data)
+ {
+ state = CityHash_v1_0_2::CityHash128WithSeed(data, block_size, state);
+ }
+
+ /// The computed hash depends on how the data is partitioned into blocks,
+ /// so we hash n complete blocks and one incomplete tail.
+ void calculateHash(DB::BufferBase::Position data, size_t len);
+
+protected:
+ size_t block_pos;
+ size_t block_size;
+ uint128 state;
+};
+
+/** Computes the hash of the data being written and passes the data on to the specified WriteBuffer.
+ * The buffer of the nested WriteBuffer is used as the main buffer.
+ */
+class HashingWriteBuffer : public IHashingBuffer<WriteBuffer>
+{
+private:
+ WriteBuffer & out;
+
+ void nextImpl() override
+ {
+ size_t len = offset();
+
+ Position data = working_buffer.begin();
+ calculateHash(data, len);
+
+ out.position() = pos;
+ out.next();
+ working_buffer = out.buffer();
+ }
+
+public:
+ explicit HashingWriteBuffer(
+ WriteBuffer & out_,
+ size_t block_size_ = DBMS_DEFAULT_HASHING_BLOCK_SIZE)
+ : IHashingBuffer<DB::WriteBuffer>(block_size_), out(out_)
+ {
+ out.next(); /// If something has already been written to `out` before us, we will not let the remains of this data affect the hash.
+ working_buffer = out.buffer();
+ pos = working_buffer.begin();
+ state = uint128(0, 0);
+ }
+
+ void sync() override
+ {
+ out.sync();
+ }
+
+ uint128 getHash()
+ {
+ next();
+ return IHashingBuffer<WriteBuffer>::getHash();
+ }
+};
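+
+/// A minimal usage sketch (assuming WriteBufferFromFile and writeString from this
+/// IO library; the file name is hypothetical):
+///
+/// WriteBufferFromFile file("data.bin");
+/// HashingWriteBuffer out(file);
+/// String payload = "hello";
+/// writeString(payload, out);
+/// auto checksum = out.getHash(); /// flushes `out` and hashes everything written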
+
+}
diff --git a/contrib/clickhouse/src/IO/IReadableWriteBuffer.h b/contrib/clickhouse/src/IO/IReadableWriteBuffer.h
new file mode 100644
index 0000000000..539825e3a8
--- /dev/null
+++ b/contrib/clickhouse/src/IO/IReadableWriteBuffer.h
@@ -0,0 +1,32 @@
+#pragma once
+#include <memory>
+#include <IO/ReadBuffer.h>
+
+namespace DB
+{
+
+struct IReadableWriteBuffer
+{
+ /// Returns the result of getReadBufferImpl() on the first call; subsequent calls return nullptr.
+ inline std::shared_ptr<ReadBuffer> tryGetReadBuffer()
+ {
+ if (!can_reread)
+ return nullptr;
+
+ can_reread = false;
+ return getReadBufferImpl();
+ }
+
+ virtual ~IReadableWriteBuffer() = default;
+
+protected:
+
+ /// Creates a read buffer from the current write buffer.
+ /// The returned buffer points to the first byte of the original buffer.
+ /// The original stream becomes invalid.
+ virtual std::shared_ptr<ReadBuffer> getReadBufferImpl() = 0;
+
+ bool can_reread = true;
+};
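+
+/// A usage sketch for a concrete implementation (MemoryWriteBuffer is assumed here
+/// to derive from both WriteBuffer and IReadableWriteBuffer):
+///
+/// MemoryWriteBuffer buf;
+/// writeString(payload, buf);
+/// buf.finalize();
+/// auto read_buf = buf.tryGetReadBuffer(); /// non-null only on the first call
+/// /// `buf` must not be written to anymore; `*read_buf` starts at the first byte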
+
+}
diff --git a/contrib/clickhouse/src/IO/IResourceManager.h b/contrib/clickhouse/src/IO/IResourceManager.h
new file mode 100644
index 0000000000..f084a903cb
--- /dev/null
+++ b/contrib/clickhouse/src/IO/IResourceManager.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <IO/ResourceLink.h>
+
+#include <Poco/Util/AbstractConfiguration.h>
+
+#include <boost/noncopyable.hpp>
+
+#include <memory>
+#include <unordered_map>
+
+namespace DB
+{
+
+/*
+ * An instance of a derived class holds everything required for resource consumption,
+ * including references to the resources currently registered at `SchedulerRoot`. This is
+ * required to avoid problems during configuration updates. Do not hold instances longer
+ * than necessary: create one when a query starts and destroy it when the query is done.
+ */
+class IClassifier : private boost::noncopyable
+{
+public:
+ virtual ~IClassifier() {}
+
+ /// Returns the ResourceLink that should be used to access the resource.
+ /// Returned link is valid until classifier destruction.
+ virtual ResourceLink get(const String & resource_name) = 0;
+};
+
+using ClassifierPtr = std::shared_ptr<IClassifier>;
+
+/*
+ * Represents the control plane of resource scheduling. A derived class is responsible for
+ * reading the configuration, creating all required `ISchedulerNode` objects and managing
+ * their lifespans.
+ */
+class IResourceManager : private boost::noncopyable
+{
+public:
+ virtual ~IResourceManager() {}
+
+ /// Initialize or reconfigure manager.
+ virtual void updateConfiguration(const Poco::Util::AbstractConfiguration & config) = 0;
+
+ /// Obtain a classifier instance required to get access to resources.
+ /// Note that it holds the resource configuration, so it should be destroyed when the query is done.
+ virtual ClassifierPtr acquire(const String & classifier_name) = 0;
+};
+
+using ResourceManagerPtr = std::shared_ptr<IResourceManager>;
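+
+/// A usage sketch (the factory, resource and classifier names below are hypothetical;
+/// in practice they come from a concrete manager and the server configuration):
+///
+/// ResourceManagerPtr manager = createResourceManager(); /// hypothetical factory
+/// manager->updateConfiguration(config);
+/// ClassifierPtr classifier = manager->acquire("admin");
+/// ResourceLink link = classifier->get("network_read");
+/// /// pass `link` to consumers; destroy `classifier` when the query is done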
+
+}
diff --git a/contrib/clickhouse/src/IO/ISchedulerConstraint.h b/contrib/clickhouse/src/IO/ISchedulerConstraint.h
new file mode 100644
index 0000000000..47f6905e26
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ISchedulerConstraint.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <IO/ISchedulerNode.h>
+
+namespace DB
+{
+
+/*
+ * A constraint defined on the set of requests in the consumption state.
+ * It makes it possible to track two events:
+ * - dequeueRequest(): resource consumption begins
+ * - finishRequest(): resource consumption finishes
+ * This makes it possible to keep track of in-flight requests and to implement various constraints (e.g. an in-flight limit).
+ * When the constraint is violated, the node must be deactivated by dequeueRequest() returning `false`.
+ * When the constraint is satisfied again, scheduleActivation() is called from finishRequest().
+ *
+ * Derived class behaviour requirements:
+ * - dequeueRequest() must fill `request->constraint` iff it is nullptr;
+ * - finishRequest() must be recursive: it must call `parent_constraint->finishRequest()`.
+ */
+class ISchedulerConstraint : public ISchedulerNode
+{
+public:
+ ISchedulerConstraint(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
+ : ISchedulerNode(event_queue_, config, config_prefix)
+ {}
+
+ /// Resource consumption by `request` is finished.
+ /// Should be called from outside the scheduling subsystem; the implementation must be thread-safe.
+ virtual void finishRequest(ResourceRequest * request) = 0;
+
+ void setParent(ISchedulerNode * parent_) override
+ {
+ ISchedulerNode::setParent(parent_);
+
+ // Assign `parent_constraint` to the nearest parent derived from ISchedulerConstraint
+ for (ISchedulerNode * node = parent_; node != nullptr; node = node->parent)
+ {
+ if (auto * constraint = dynamic_cast<ISchedulerConstraint *>(node))
+ {
+ parent_constraint = constraint;
+ break;
+ }
+ }
+ }
+
+protected:
+ // Reference to the nearest parent that is also derived from ISchedulerConstraint.
+ // A request can traverse multiple constraints while being dequeued from the hierarchy,
+ // and the finishing request must traverse the same chain in reverse order.
+ // NOTE: it must be immutable after initialization, because it is accessed in a non-thread-safe way from finishRequest()
+ ISchedulerConstraint * parent_constraint = nullptr;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ISchedulerNode.h b/contrib/clickhouse/src/IO/ISchedulerNode.h
new file mode 100644
index 0000000000..1c33c03374
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ISchedulerNode.h
@@ -0,0 +1,222 @@
+#pragma once
+
+#include <Common/ErrorCodes.h>
+#include <Common/Exception.h>
+#include <Common/Priority.h>
+
+#include <IO/ResourceRequest.h>
+#include <Poco/Util/AbstractConfiguration.h>
+#include <Poco/Util/XMLConfiguration.h>
+
+#include <boost/noncopyable.hpp>
+
+#include <deque>
+#include <functional>
+#include <memory>
+#include <mutex>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int INVALID_SCHEDULER_NODE;
+}
+
+class ISchedulerNode;
+
+inline const Poco::Util::AbstractConfiguration & emptyConfig()
+{
+ static Poco::AutoPtr<Poco::Util::XMLConfiguration> config = new Poco::Util::XMLConfiguration();
+ return *config;
+}
+
+/*
+ * Info read and written by the parent for scheduling purposes
+ */
+struct SchedulerNodeInfo
+{
+ double weight = 1.0; /// Weight of this node among its siblings
+ Priority priority; /// Priority of this node among its siblings (lower value means higher priority)
+
+ /// Arbitrary data accessed/stored by parent
+ union {
+ size_t idx;
+ void * ptr;
+ } parent;
+
+ SchedulerNodeInfo() = default;
+
+ explicit SchedulerNodeInfo(const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
+ {
+ setWeight(config.getDouble(config_prefix + ".weight", weight));
+ setPriority(config.getInt64(config_prefix + ".priority", priority.value));
+ }
+
+ void setWeight(double value)
+ {
+ if (value <= 0 || !isfinite(value))
+ throw Exception(
+ ErrorCodes::INVALID_SCHEDULER_NODE,
+ "Negative and non-finite node weights are not allowed: {}",
+ value);
+ weight = value;
+ }
+
+ void setPriority(Int64 value)
+ {
+ priority.value = value;
+ }
+};
+
+/*
+ * Simple waitable thread-safe FIFO task queue.
+ * Intended to hold postponed events for later handling (usually by scheduler thread).
+ */
+class EventQueue
+{
+public:
+ using Event = std::function<void()>;
+
+ void enqueue(Event&& event)
+ {
+ std::unique_lock lock{mutex};
+ bool was_empty = queue.empty();
+ queue.emplace_back(event);
+ if (was_empty)
+ pending.notify_one();
+ }
+
+ /// Process a single event if one exists
+ /// Returns `true` iff an event has been processed
+ bool tryProcess()
+ {
+ std::unique_lock lock{mutex};
+ if (queue.empty())
+ return false;
+ Event event = std::move(queue.front());
+ queue.pop_front();
+ lock.unlock(); // do not hold the queue mutex while processing events
+ event();
+ return true;
+ }
+
+ /// Wait for a single event (if none is available) and process it
+ void process()
+ {
+ std::unique_lock lock{mutex};
+ pending.wait(lock, [&] { return !queue.empty(); });
+ Event event = std::move(queue.front());
+ queue.pop_front();
+ lock.unlock(); // do not hold the queue mutex while processing events
+ event();
+ }
+
+private:
+ std::mutex mutex;
+ std::condition_variable pending;
+ std::deque<Event> queue;
+};
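+
+/// A minimal usage sketch: producers enqueue closures, the scheduler thread drains them.
+///
+/// EventQueue events;
+/// events.enqueue([] { /* postponed work */ });
+/// events.process(); /// blocks until an event is available, then runs it
+/// while (events.tryProcess()); /// drain the remaining events without blocking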
+
+/*
+ * Node of hierarchy for scheduling requests for resource. Base class for all
+ * kinds of scheduling elements (queues, policies, constraints and schedulers).
+ *
+ * The root node is a scheduler, which has its own thread to dequeue requests,
+ * execute them (see ResourceRequest) and process events in a thread-safe manner.
+ * Immediate children of the scheduler represent independent resources.
+ * Each resource has its own hierarchy to achieve the required scheduling policies.
+ * Non-leaf nodes do not hold requests, but keep scheduling state
+ * (e.g. consumption history, number of in-flight requests, etc).
+ * The leaves of the hierarchy are queues capable of holding pending requests.
+ *
+ * scheduler (SchedulerRoot)
+ * / \
+ * constraint constraint (SemaphoreConstraint)
+ * | |
+ * policy policy (PriorityPolicy)
+ * / \ / \
+ * q1 q2 q3 q4 (FifoQueue)
+ *
+ * Dequeueing a request from an inner node dequeues a request from one of the active leaf queues in its subtree.
+ * A node is considered active iff:
+ * - it has at least one pending request in one of the leaves of its subtree;
+ * - and the enforced constraints, if any, are satisfied
+ * (e.g. the number of concurrent requests does not exceed some limit).
+ *
+ * All methods must be called only from scheduler thread for thread-safety.
+ */
+class ISchedulerNode : private boost::noncopyable
+{
+public:
+ ISchedulerNode(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
+ : event_queue(event_queue_)
+ , info(config, config_prefix)
+ {}
+
+ virtual ~ISchedulerNode() {}
+
+ // Checks whether the configuration of two nodes is equal
+ virtual bool equals(ISchedulerNode * other) = 0;
+
+ /// Attach new child
+ virtual void attachChild(const std::shared_ptr<ISchedulerNode> & child) = 0;
+
+ /// Detach and destroy child
+ virtual void removeChild(ISchedulerNode * child) = 0;
+
+ /// Get attached child by name
+ virtual ISchedulerNode * getChild(const String & child_name) = 0;
+
+ /// Activation of child due to the first pending request
+ /// Should be called on a leaf node (i.e. a queue) to propagate the activation signal up the chain to the root
+ virtual void activateChild(ISchedulerNode * child) = 0;
+
+ /// Returns true iff node is active
+ virtual bool isActive() = 0;
+
+ /// Returns the first request to be executed as the first component of the resulting pair.
+ /// The second component is `true` iff the node is still active after dequeueing.
+ virtual std::pair<ResourceRequest *, bool> dequeueRequest() = 0;
+
+ /// Returns full path string using names of every parent
+ String getPath()
+ {
+ String result;
+ ISchedulerNode * ptr = this;
+ while (ptr->parent)
+ {
+ result = "/" + ptr->basename + result;
+ ptr = ptr->parent;
+ }
+ return result.empty() ? "/" : result;
+ }
+
+ /// Attach to a parent (used by attachChild)
+ virtual void setParent(ISchedulerNode * parent_)
+ {
+ parent = parent_;
+ }
+
+protected:
+ /// Notify the parent about the first pending request or about a constraint becoming satisfied.
+ /// Handling is postponed to the scheduler thread, so this is intended to be called from outside.
+ void scheduleActivation()
+ {
+ if (likely(parent))
+ {
+ event_queue->enqueue([this] { parent->activateChild(this); });
+ }
+ }
+
+public:
+ EventQueue * const event_queue;
+ String basename;
+ SchedulerNodeInfo info;
+ ISchedulerNode * parent = nullptr;
+};
+
+using SchedulerNodePtr = std::shared_ptr<ISchedulerNode>;
+
+}
diff --git a/contrib/clickhouse/src/IO/ISchedulerQueue.h b/contrib/clickhouse/src/IO/ISchedulerQueue.h
new file mode 100644
index 0000000000..fc2f3943d2
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ISchedulerQueue.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <IO/ISchedulerNode.h>
+#include <IO/ResourceBudget.h>
+#include <IO/ResourceRequest.h>
+
+#include <memory>
+
+
+namespace DB
+{
+
+/*
+ * A queue of pending requests for a specific resource; a leaf of the hierarchy.
+ * Note that every queue has a budget associated with it.
+ */
+class ISchedulerQueue : public ISchedulerNode
+{
+public:
+ explicit ISchedulerQueue(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
+ : ISchedulerNode(event_queue_, config, config_prefix)
+ {}
+
+ // Wrapper for `enqueueRequest()` that should be used to account for the available resource budget
+ void enqueueRequestUsingBudget(ResourceRequest * request)
+ {
+ request->cost = budget.ask(request->cost);
+ enqueueRequest(request);
+ }
+
+ // Should be called to account for the difference between the real and estimated costs
+ void adjustBudget(ResourceCost estimated_cost, ResourceCost real_cost)
+ {
+ budget.adjust(estimated_cost, real_cost);
+ }
+
+ // Adjust budget to account for extra consumption of `cost` resource units
+ void consumeBudget(ResourceCost cost)
+ {
+ adjustBudget(0, cost);
+ }
+
+ // Adjust budget to account for requested, but not consumed `cost` resource units
+ void accumulateBudget(ResourceCost cost)
+ {
+ adjustBudget(cost, 0);
+ }
+
+ /// Enqueue a new request to be executed using the underlying resource.
+ /// Should be called from outside the scheduling subsystem; the implementation must be thread-safe.
+ virtual void enqueueRequest(ResourceRequest * request) = 0;
+
+private:
+ // Allows multiple consumers to synchronize on a common "debit/credit" balance:
+ // 1) (positive) to avoid wasting allocated but unused resource (e.g. in case of a failure);
+ // 2) (negative) to account for overconsumption (e.g. if the cost is not known in advance and a lower-bound estimate is used).
+ ResourceBudget budget;
+};
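+
+/// A budget-accounting sketch (`queue` is a concrete ISchedulerQueue and `request`
+/// a ResourceRequest whose real cost becomes known only after execution):
+///
+/// ResourceCost estimated_cost = request->cost;
+/// queue.enqueueRequestUsingBudget(request); /// request->cost is adjusted by the budget
+/// /// ... the request executes and `real_cost` is measured ...
+/// queue.adjustBudget(estimated_cost, real_cost); /// reconcile the estimate with reality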
+
+}
diff --git a/contrib/clickhouse/src/IO/LZMADeflatingWriteBuffer.cpp b/contrib/clickhouse/src/IO/LZMADeflatingWriteBuffer.cpp
new file mode 100644
index 0000000000..c70ec1507c
--- /dev/null
+++ b/contrib/clickhouse/src/IO/LZMADeflatingWriteBuffer.cpp
@@ -0,0 +1,123 @@
+#include <IO/LZMADeflatingWriteBuffer.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int LZMA_STREAM_ENCODER_FAILED;
+}
+
+LZMADeflatingWriteBuffer::LZMADeflatingWriteBuffer(
+ std::unique_ptr<WriteBuffer> out_, int compression_level, size_t buf_size, char * existing_memory, size_t alignment)
+ : WriteBufferWithOwnMemoryDecorator(std::move(out_), buf_size, existing_memory, alignment)
+{
+
+ lstr = LZMA_STREAM_INIT;
+ lstr.allocator = nullptr;
+ lstr.next_in = nullptr;
+ lstr.avail_in = 0;
+ lstr.next_out = nullptr;
+ lstr.avail_out = 0;
+
+ // options for further compression
+ lzma_options_lzma opt_lzma2;
+ if (lzma_lzma_preset(&opt_lzma2, compression_level))
+ throw Exception(ErrorCodes::LZMA_STREAM_ENCODER_FAILED, "lzma preset failed: lzma version: {}", LZMA_VERSION_STRING);
+
+
+ // LZMA_FILTER_X86 - BCJ filter that improves compression of x86 machine code
+ // LZMA2 - codec used for *.xz file compression; plain LZMA is not suitable for this purpose
+ // VLI - variable-length integer (most integers in *.xz are encoded as VLIs)
+ // LZMA_VLI_UNKNOWN (UINT64_MAX) - VLI value denoting that the value is unknown
+ lzma_filter filters[] = {
+ {.id = LZMA_FILTER_X86, .options = nullptr},
+ {.id = LZMA_FILTER_LZMA2, .options = &opt_lzma2},
+ {.id = LZMA_VLI_UNKNOWN, .options = nullptr},
+ };
+ lzma_ret ret = lzma_stream_encoder(&lstr, filters, LZMA_CHECK_CRC64);
+
+ if (ret != LZMA_OK)
+ throw Exception(
+ ErrorCodes::LZMA_STREAM_ENCODER_FAILED,
+ "lzma stream encoder init failed: error code: {} lzma version: {}",
+ ret,
+ LZMA_VERSION_STRING);
+}
+
+LZMADeflatingWriteBuffer::~LZMADeflatingWriteBuffer() = default;
+
+void LZMADeflatingWriteBuffer::nextImpl()
+{
+ if (!offset())
+ return;
+
+ lstr.next_in = reinterpret_cast<unsigned char *>(working_buffer.begin());
+ lstr.avail_in = offset();
+
+ try
+ {
+ lzma_action action = LZMA_RUN;
+ do
+ {
+ out->nextIfAtEnd();
+ lstr.next_out = reinterpret_cast<unsigned char *>(out->position());
+ lstr.avail_out = out->buffer().end() - out->position();
+
+ lzma_ret ret = lzma_code(&lstr, action);
+ out->position() = out->buffer().end() - lstr.avail_out;
+
+ if (ret == LZMA_STREAM_END)
+ return;
+
+ if (ret != LZMA_OK)
+ throw Exception(
+ ErrorCodes::LZMA_STREAM_ENCODER_FAILED,
+ "lzma stream encoding failed: error code: {}; lzma_version: {}",
+ ret,
+ LZMA_VERSION_STRING);
+
+ } while (lstr.avail_in > 0 || lstr.avail_out == 0);
+ }
+ catch (...)
+ {
+ /// Do not try to write next time after exception.
+ out->position() = out->buffer().begin();
+ throw;
+ }
+}
+
+void LZMADeflatingWriteBuffer::finalizeBefore()
+{
+ next();
+
+ do
+ {
+ out->nextIfAtEnd();
+ lstr.next_out = reinterpret_cast<unsigned char *>(out->position());
+ lstr.avail_out = out->buffer().end() - out->position();
+
+ lzma_ret ret = lzma_code(&lstr, LZMA_FINISH);
+ out->position() = out->buffer().end() - lstr.avail_out;
+
+ if (ret == LZMA_STREAM_END)
+ {
+ return;
+ }
+
+ if (ret != LZMA_OK)
+ throw Exception(
+ ErrorCodes::LZMA_STREAM_ENCODER_FAILED,
+ "lzma stream encoding failed: error code: {}; lzma version: {}",
+ ret,
+ LZMA_VERSION_STRING);
+
+ } while (lstr.avail_out == 0);
+}
+
+void LZMADeflatingWriteBuffer::finalizeAfter()
+{
+ lzma_end(&lstr);
+}
+
+}
+
diff --git a/contrib/clickhouse/src/IO/LZMADeflatingWriteBuffer.h b/contrib/clickhouse/src/IO/LZMADeflatingWriteBuffer.h
new file mode 100644
index 0000000000..2e135455e0
--- /dev/null
+++ b/contrib/clickhouse/src/IO/LZMADeflatingWriteBuffer.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/WriteBuffer.h>
+#include <IO/WriteBufferDecorator.h>
+
+#include <lzma.h>
+
+
+namespace DB
+{
+
+/// Performs compression using the lzma library and writes the compressed data to the out_ WriteBuffer.
+class LZMADeflatingWriteBuffer : public WriteBufferWithOwnMemoryDecorator
+{
+public:
+ LZMADeflatingWriteBuffer(
+ std::unique_ptr<WriteBuffer> out_,
+ int compression_level,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~LZMADeflatingWriteBuffer() override;
+
+private:
+ void nextImpl() override;
+
+ void finalizeBefore() override;
+ void finalizeAfter() override;
+
+ lzma_stream lstr;
+};
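+
+/// A minimal usage sketch (assuming WriteBufferFromFile and writeString from this
+/// IO library; the file name is hypothetical):
+///
+/// auto file = std::make_unique<WriteBufferFromFile>("data.xz");
+/// LZMADeflatingWriteBuffer out(std::move(file), /* compression_level */ 6);
+/// writeString(payload, out);
+/// out.finalize(); /// flushes and writes the xz stream footer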
+
+}
diff --git a/contrib/clickhouse/src/IO/LZMAInflatingReadBuffer.cpp b/contrib/clickhouse/src/IO/LZMAInflatingReadBuffer.cpp
new file mode 100644
index 0000000000..a6f3c74ae7
--- /dev/null
+++ b/contrib/clickhouse/src/IO/LZMAInflatingReadBuffer.cpp
@@ -0,0 +1,99 @@
+#include <IO/LZMAInflatingReadBuffer.h>
+#include <IO/WithFileName.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int LZMA_STREAM_DECODER_FAILED;
+}
+
+LZMAInflatingReadBuffer::LZMAInflatingReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t buf_size, char * existing_memory, size_t alignment)
+ : CompressedReadBufferWrapper(std::move(in_), buf_size, existing_memory, alignment), eof_flag(false)
+{
+ lstr = LZMA_STREAM_INIT;
+ lstr.allocator = nullptr;
+ lstr.next_in = nullptr;
+ lstr.avail_in = 0;
+ lstr.next_out = nullptr;
+ lstr.avail_out = 0;
+
+ // 500 MiB memory limit for the decoder
+ uint64_t memlimit = 500ULL << 20;
+
+ lzma_ret ret = lzma_stream_decoder(&lstr, memlimit, LZMA_CONCATENATED);
+ // Unlike zlib, lzma does not provide an API for converting an error code to a string
+ if (ret != LZMA_OK)
+ throw Exception(
+ ErrorCodes::LZMA_STREAM_DECODER_FAILED,
+ "lzma_stream_decoder initialization failed: error code: {}; lzma version: {}",
+ ret,
+ LZMA_VERSION_STRING);
+}
+
+LZMAInflatingReadBuffer::~LZMAInflatingReadBuffer()
+{
+ lzma_end(&lstr);
+}
+
+bool LZMAInflatingReadBuffer::nextImpl()
+{
+ if (eof_flag)
+ return false;
+
+ lzma_action action = LZMA_RUN;
+ lzma_ret ret;
+
+ do
+ {
+ if (!lstr.avail_in)
+ {
+ in->nextIfAtEnd();
+ lstr.next_in = reinterpret_cast<unsigned char *>(in->position());
+ lstr.avail_in = in->buffer().end() - in->position();
+ }
+
+ if (in->eof())
+ {
+ action = LZMA_FINISH;
+ }
+
+ lstr.next_out = reinterpret_cast<unsigned char *>(internal_buffer.begin());
+ lstr.avail_out = internal_buffer.size();
+
+ ret = lzma_code(&lstr, action);
+ in->position() = in->buffer().end() - lstr.avail_in;
+ }
+ while (ret == LZMA_OK && lstr.avail_out == internal_buffer.size());
+
+ working_buffer.resize(internal_buffer.size() - lstr.avail_out);
+
+ if (ret == LZMA_STREAM_END)
+ {
+ if (in->eof())
+ {
+ eof_flag = true;
+ return !working_buffer.empty();
+ }
+ else
+ {
+ throw Exception(
+ ErrorCodes::LZMA_STREAM_DECODER_FAILED,
+ "lzma decoder finished, but input stream has not exceeded: error code: {}; lzma version: {}{}",
+ ret,
+ LZMA_VERSION_STRING,
+ getExceptionEntryWithFileName(*in));
+ }
+ }
+
+ if (ret != LZMA_OK)
+ throw Exception(
+ ErrorCodes::LZMA_STREAM_DECODER_FAILED,
+ "lzma_stream_decoder failed: error code: error code {}; lzma version: {}{}",
+ ret,
+ LZMA_VERSION_STRING,
+ getExceptionEntryWithFileName(*in));
+
+ return true;
+}
+}
diff --git a/contrib/clickhouse/src/IO/LZMAInflatingReadBuffer.h b/contrib/clickhouse/src/IO/LZMAInflatingReadBuffer.h
new file mode 100644
index 0000000000..5fd29d6f7d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/LZMAInflatingReadBuffer.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <IO/CompressedReadBufferWrapper.h>
+#include <IO/ReadBuffer.h>
+
+#include <lzma.h>
+
+namespace DB
+{
+
+class LZMAInflatingReadBuffer : public CompressedReadBufferWrapper
+{
+public:
+ explicit LZMAInflatingReadBuffer(
+ std::unique_ptr<ReadBuffer> in_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~LZMAInflatingReadBuffer() override;
+
+private:
+ bool nextImpl() override;
+
+ lzma_stream lstr;
+ bool eof_flag;
+};
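+
+/// A minimal usage sketch (assuming ReadBufferFromFile and readStringUntilEOF
+/// from this IO library; the file name is hypothetical):
+///
+/// auto file = std::make_unique<ReadBufferFromFile>("data.xz");
+/// LZMAInflatingReadBuffer in(std::move(file));
+/// String payload;
+/// readStringUntilEOF(payload, in);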
+
+}
diff --git a/contrib/clickhouse/src/IO/LimitReadBuffer.cpp b/contrib/clickhouse/src/IO/LimitReadBuffer.cpp
new file mode 100644
index 0000000000..e14112f8d1
--- /dev/null
+++ b/contrib/clickhouse/src/IO/LimitReadBuffer.cpp
@@ -0,0 +1,99 @@
+#include <IO/LimitReadBuffer.h>
+
+#include <Common/Exception.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int LIMIT_EXCEEDED;
+ extern const int CANNOT_READ_ALL_DATA;
+}
+
+
+bool LimitReadBuffer::nextImpl()
+{
+ assert(position() >= in->position());
+
+ /// Let the underlying buffer calculate the number of read bytes in its `next()` call.
+ in->position() = position();
+
+ if (bytes >= limit)
+ {
+ if (exact_limit && bytes == *exact_limit)
+ return false;
+
+ if (exact_limit && bytes != *exact_limit)
+ throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected data, got {} bytes, expected {}", bytes, *exact_limit);
+
+ if (throw_exception)
+ throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Limit for LimitReadBuffer exceeded: {}", exception_message);
+
+ return false;
+ }
+
+ if (!in->next())
+ {
+ if (exact_limit && bytes != *exact_limit)
+ throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected EOF, got {} of {} bytes", bytes, *exact_limit);
+ /// Clear the buffer of the existing data.
+ set(in->position(), 0);
+ return false;
+ }
+
+ working_buffer = in->buffer();
+
+ if (limit - bytes < working_buffer.size())
+ working_buffer.resize(limit - bytes);
+
+ return true;
+}
+
+
+LimitReadBuffer::LimitReadBuffer(ReadBuffer * in_, bool owns, UInt64 limit_, bool throw_exception_,
+ std::optional<size_t> exact_limit_, std::string exception_message_)
+ : ReadBuffer(in_ ? in_->position() : nullptr, 0)
+ , in(in_)
+ , owns_in(owns)
+ , limit(limit_)
+ , throw_exception(throw_exception_)
+ , exact_limit(exact_limit_)
+ , exception_message(std::move(exception_message_))
+{
+ assert(in);
+
+ size_t remaining_bytes_in_buffer = in->buffer().end() - in->position();
+ if (remaining_bytes_in_buffer > limit)
+ remaining_bytes_in_buffer = limit;
+
+ working_buffer = Buffer(in->position(), in->position() + remaining_bytes_in_buffer);
+}
+
+
+LimitReadBuffer::LimitReadBuffer(ReadBuffer & in_, UInt64 limit_, bool throw_exception_,
+ std::optional<size_t> exact_limit_, std::string exception_message_)
+ : LimitReadBuffer(&in_, false, limit_, throw_exception_, exact_limit_, exception_message_)
+{
+}
+
+
+LimitReadBuffer::LimitReadBuffer(std::unique_ptr<ReadBuffer> in_, UInt64 limit_, bool throw_exception_,
+ std::optional<size_t> exact_limit_, std::string exception_message_)
+ : LimitReadBuffer(in_.release(), true, limit_, throw_exception_, exact_limit_, exception_message_)
+{
+}
+
+
+LimitReadBuffer::~LimitReadBuffer()
+{
+ /// Update the underlying buffer's position in case the limit wasn't reached.
+ if (!working_buffer.empty())
+ in->position() = position();
+
+ if (owns_in)
+ delete in;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/LimitReadBuffer.h b/contrib/clickhouse/src/IO/LimitReadBuffer.h
new file mode 100644
index 0000000000..15885c1d85
--- /dev/null
+++ b/contrib/clickhouse/src/IO/LimitReadBuffer.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <base/types.h>
+#include <IO/ReadBuffer.h>
+
+
+namespace DB
+{
+
+/** Allows reading no more than the specified number of bytes from another ReadBuffer.
+ * Note that the nested ReadBuffer may read slightly more data internally to fill its buffer.
+ */
+class LimitReadBuffer : public ReadBuffer
+{
+public:
+ LimitReadBuffer(ReadBuffer & in_, UInt64 limit_, bool throw_exception_,
+ std::optional<size_t> exact_limit_, std::string exception_message_ = {});
+ LimitReadBuffer(std::unique_ptr<ReadBuffer> in_, UInt64 limit_, bool throw_exception_, std::optional<size_t> exact_limit_,
+ std::string exception_message_ = {});
+ ~LimitReadBuffer() override;
+
+private:
+ ReadBuffer * in;
+ bool owns_in;
+
+ UInt64 limit;
+ bool throw_exception;
+ std::optional<size_t> exact_limit;
+ std::string exception_message;
+
+ LimitReadBuffer(ReadBuffer * in_, bool owns, UInt64 limit_, bool throw_exception_, std::optional<size_t> exact_limit_, std::string exception_message_);
+
+ bool nextImpl() override;
+};
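+
+/// A minimal usage sketch: read at most 1024 bytes from a nested buffer without
+/// throwing on truncation (the file name is hypothetical):
+///
+/// ReadBufferFromFile in("data.bin");
+/// LimitReadBuffer limited(in, /* limit_ */ 1024, /* throw_exception_ */ false, /* exact_limit_ */ std::nullopt);
+/// String head;
+/// readStringUntilEOF(head, limited); /// stops after 1024 bytes at the latest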
+
+}
diff --git a/contrib/clickhouse/src/IO/LimitSeekableReadBuffer.cpp b/contrib/clickhouse/src/IO/LimitSeekableReadBuffer.cpp
new file mode 100644
index 0000000000..587138cb2c
--- /dev/null
+++ b/contrib/clickhouse/src/IO/LimitSeekableReadBuffer.cpp
@@ -0,0 +1,125 @@
+#include <IO/LimitSeekableReadBuffer.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ARGUMENT_OUT_OF_BOUND;
+}
+
+LimitSeekableReadBuffer::LimitSeekableReadBuffer(SeekableReadBuffer & in_, UInt64 start_offset_, UInt64 limit_size_)
+ : LimitSeekableReadBuffer(wrapSeekableReadBufferReference(in_), start_offset_, limit_size_)
+{
+}
+
+LimitSeekableReadBuffer::LimitSeekableReadBuffer(std::unique_ptr<SeekableReadBuffer> in_, UInt64 start_offset_, UInt64 limit_size_)
+ : SeekableReadBuffer(in_->position(), 0)
+ , in(std::move(in_))
+ , min_offset(start_offset_)
+ , max_offset(start_offset_ + limit_size_)
+ , need_seek(min_offset) /// We always start reading from `min_offset`.
+{
+}
+
+bool LimitSeekableReadBuffer::nextImpl()
+{
+ /// First let the nested buffer know the current position in the buffer (otherwise `in->eof()` or `in->seek()` below can work incorrectly).
+ in->position() = position();
+
+ if (need_seek)
+ {
+ /// Do actual seek.
+ if (in->seek(*need_seek, SEEK_SET) != static_cast<off_t>(*need_seek))
+ {
+ /// Failed to seek, maybe because the new seek position is located after EOF.
+ set(in->position(), 0);
+ return false;
+ }
+ need_seek.reset();
+ }
+
+ off_t seek_pos = in->getPosition();
+ off_t offset_after_min = seek_pos - min_offset;
+ off_t available_before_max = max_offset - seek_pos;
+
+ if (offset_after_min < 0 || available_before_max <= 0)
+ {
+ /// Limit reached.
+ set(in->position(), 0);
+ return false;
+ }
+
+ if (in->eof()) /// `in->eof()` can call `in->next()`
+ {
+ /// EOF reached.
+ set(in->position(), 0);
+ return false;
+ }
+
+ /// in->eof() shouldn't change the seek position.
+ chassert(seek_pos == in->getPosition());
+
+ /// Adjust the beginning and the end of the working buffer.
+ /// Because we don't want to read before `min_offset` or after `max_offset`.
+ auto * ptr = in->position();
+ auto * begin = in->buffer().begin();
+ auto * end = in->buffer().end();
+
+ if (ptr - begin > offset_after_min)
+ begin = ptr - offset_after_min;
+ if (end - ptr > available_before_max)
+ end = ptr + available_before_max;
+
+ BufferBase::set(begin, end - begin, ptr - begin);
+ chassert(position() == ptr && available());
+
+ return true;
+}
+
+off_t LimitSeekableReadBuffer::seek(off_t off, int whence)
+{
+ off_t new_position;
+ off_t current_position = getPosition();
+ if (whence == SEEK_SET)
+ new_position = off;
+ else if (whence == SEEK_CUR)
+ new_position = current_position + off;
+ else
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Seek expects SEEK_SET or SEEK_CUR as whence");
+
+ if (new_position < 0 || new_position + min_offset > max_offset)
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Seek shift out of bounds");
+
+ off_t position_change = new_position - current_position;
+ if ((buffer().begin() <= pos + position_change) && (pos + position_change <= buffer().end()))
+ {
+ /// Position is still inside the buffer.
+ pos += position_change;
+ chassert(pos >= working_buffer.begin());
+ chassert(pos <= working_buffer.end());
+ return new_position;
+ }
+
+ /// Actual seek in the nested buffer will be performed in nextImpl().
+ need_seek = new_position + min_offset;
+
+ /// Set the size of the working buffer to zero so next call next() would call nextImpl().
+ set(in->position(), 0);
+
+ return new_position;
+}
+
+off_t LimitSeekableReadBuffer::getPosition()
+{
+ if (need_seek)
+ return *need_seek - min_offset;
+
+ /// We have to do that because `in->getPosition()` below most likely needs to know the current position in the buffer.
+ in->position() = position();
+
+ return in->getPosition() - min_offset;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/LimitSeekableReadBuffer.h b/contrib/clickhouse/src/IO/LimitSeekableReadBuffer.h
new file mode 100644
index 0000000000..61b307c522
--- /dev/null
+++ b/contrib/clickhouse/src/IO/LimitSeekableReadBuffer.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <base/types.h>
+#include <IO/SeekableReadBuffer.h>
+
+
+namespace DB
+{
+
+/** Allows reading up to `limit_size` bytes from another SeekableReadBuffer, starting at `start_offset`.
+ * Note that the nested buffer may read slightly more data internally to fill its buffer.
+ */
+class LimitSeekableReadBuffer : public SeekableReadBuffer
+{
+public:
+ LimitSeekableReadBuffer(SeekableReadBuffer & in_, UInt64 start_offset_, UInt64 limit_size_);
+ LimitSeekableReadBuffer(std::unique_ptr<SeekableReadBuffer> in_, UInt64 start_offset_, UInt64 limit_size_);
+
+ /// Returns adjusted position, i.e. returns `3` if the position in the nested buffer is `start_offset + 3`.
+ off_t getPosition() override;
+
+ off_t seek(off_t off, int whence) override;
+
+private:
+ std::unique_ptr<SeekableReadBuffer> in;
+ off_t min_offset;
+ off_t max_offset;
+ std::optional<off_t> need_seek;
+
+ bool nextImpl() override;
+};
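+
+/// A minimal usage sketch: expose a 50-byte window of a file starting at offset 100
+/// (the file name is hypothetical):
+///
+/// auto file = std::make_unique<ReadBufferFromFile>("data.bin");
+/// LimitSeekableReadBuffer window(std::move(file), /* start_offset_ */ 100, /* limit_size_ */ 50);
+/// window.seek(10, SEEK_SET); /// position 10 in the window == offset 110 in the file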
+
+}
diff --git a/contrib/clickhouse/src/IO/Lz4DeflatingWriteBuffer.cpp b/contrib/clickhouse/src/IO/Lz4DeflatingWriteBuffer.cpp
new file mode 100644
index 0000000000..27c945f92c
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Lz4DeflatingWriteBuffer.cpp
@@ -0,0 +1,161 @@
+#include <IO/Lz4DeflatingWriteBuffer.h>
+#include <Common/Exception.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int LZ4_ENCODER_FAILED;
+}
+
+Lz4DeflatingWriteBuffer::Lz4DeflatingWriteBuffer(
+ std::unique_ptr<WriteBuffer> out_, int compression_level, size_t buf_size, char * existing_memory, size_t alignment)
+ : WriteBufferWithOwnMemoryDecorator(std::move(out_), buf_size, existing_memory, alignment)
+ , in_data(nullptr)
+ , out_data(nullptr)
+ , in_capacity(0)
+ , out_capacity(0)
+{
+ kPrefs = {
+ {LZ4F_max256KB,
+ LZ4F_blockLinked,
+ LZ4F_noContentChecksum,
+ LZ4F_frame,
+ 0 /* unknown content size */,
+ 0 /* no dictID */,
+ LZ4F_noBlockChecksum},
+ compression_level, /* compression level; 0 == default */
+ 1, /* autoflush */
+ 0, /* favor decompression speed */
+ {0, 0, 0}, /* reserved, must be set to 0 */
+ };
+
+ size_t ret = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION);
+
+ if (LZ4F_isError(ret))
+ throw Exception(
+ ErrorCodes::LZ4_ENCODER_FAILED,
+ "creation of LZ4 compression context failed. LZ4F version: {}",
+ LZ4F_VERSION);
+}
+
+Lz4DeflatingWriteBuffer::~Lz4DeflatingWriteBuffer() = default;
+
+void Lz4DeflatingWriteBuffer::nextImpl()
+{
+ if (!offset())
+ return;
+
+ in_data = reinterpret_cast<void *>(working_buffer.begin());
+ in_capacity = offset();
+
+ out_capacity = out->buffer().end() - out->position();
+ out_data = reinterpret_cast<void *>(out->position());
+
+ try
+ {
+ if (first_time)
+ {
+ if (out_capacity < LZ4F_HEADER_SIZE_MAX)
+ {
+ out->next();
+ out_capacity = out->buffer().end() - out->position();
+ out_data = reinterpret_cast<void *>(out->position());
+ }
+
+ /// write frame header and check for errors
+ size_t header_size = LZ4F_compressBegin(ctx, out_data, out_capacity, &kPrefs);
+
+ if (LZ4F_isError(header_size))
+ throw Exception(
+ ErrorCodes::LZ4_ENCODER_FAILED,
+ "LZ4 failed to start stream encoding. LZ4F version: {}",
+ LZ4F_VERSION);
+
+ out_capacity -= header_size;
+ out->position() = out->buffer().end() - out_capacity;
+ out_data = reinterpret_cast<void *>(out->position());
+
+ first_time = false;
+ }
+
+ do
+ {
+ /// Ensure that there is enough space for compressed block of minimal size
+ size_t min_compressed_block_size = LZ4F_compressBound(1, &kPrefs);
+ if (out_capacity < min_compressed_block_size)
+ {
+ out->next();
+ out_capacity = out->buffer().end() - out->position();
+ out_data = reinterpret_cast<void *>(out->position());
+ }
+
+ /// LZ4F_compressUpdate compresses the whole input buffer at once, so we need to shrink it manually
+ size_t cur_buffer_size = in_capacity;
+ if (out_capacity >= min_compressed_block_size) /// We cannot shrink the input buffer if it's already too small.
+ {
+ while (out_capacity < LZ4F_compressBound(cur_buffer_size, &kPrefs))
+ cur_buffer_size /= 2;
+ }
+
+ size_t compressed_size = LZ4F_compressUpdate(ctx, out_data, out_capacity, in_data, cur_buffer_size, nullptr);
+
+ if (LZ4F_isError(compressed_size))
+ throw Exception(
+ ErrorCodes::LZ4_ENCODER_FAILED,
+ "LZ4 failed to encode stream. LZ4F version: {}",
+ LZ4F_VERSION);
+
+ in_capacity -= cur_buffer_size;
+ in_data = reinterpret_cast<void *>(working_buffer.end() - in_capacity);
+
+ out_capacity -= compressed_size;
+ out->position() = out->buffer().end() - out_capacity;
+ out_data = reinterpret_cast<void *>(out->position());
+ }
+ while (in_capacity > 0);
+ }
+ catch (...)
+ {
+ out->position() = out->buffer().begin();
+ throw;
+ }
+ out->next();
+ out_capacity = out->buffer().end() - out->position();
+}
+
+void Lz4DeflatingWriteBuffer::finalizeBefore()
+{
+ next();
+
+ out_capacity = out->buffer().end() - out->position();
+ out_data = reinterpret_cast<void *>(out->position());
+
+ if (out_capacity < LZ4F_compressBound(0, &kPrefs))
+ {
+ out->next();
+ out_capacity = out->buffer().end() - out->position();
+ out_data = reinterpret_cast<void *>(out->position());
+ }
+
+ /// compression end
+ size_t end_size = LZ4F_compressEnd(ctx, out_data, out_capacity, nullptr);
+
+ if (LZ4F_isError(end_size))
+ throw Exception(
+ ErrorCodes::LZ4_ENCODER_FAILED,
+ "LZ4 failed to end stream encoding. LZ4F version: {}",
+ LZ4F_VERSION);
+
+ out_capacity -= end_size;
+ out->position() = out->buffer().end() - out_capacity;
+ out_data = reinterpret_cast<void *>(out->position());
+}
+
+void Lz4DeflatingWriteBuffer::finalizeAfter()
+{
+ LZ4F_freeCompressionContext(ctx);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Lz4DeflatingWriteBuffer.h b/contrib/clickhouse/src/IO/Lz4DeflatingWriteBuffer.h
new file mode 100644
index 0000000000..68873b5f8e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Lz4DeflatingWriteBuffer.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/CompressionMethod.h>
+#include <IO/WriteBuffer.h>
+#include <IO/WriteBufferDecorator.h>
+
+#include <lz4.h>
+#include <lz4frame.h>
+
+namespace DB
+{
+/// Performs compression using the lz4 library and writes the compressed data to the out_ WriteBuffer.
+class Lz4DeflatingWriteBuffer : public WriteBufferWithOwnMemoryDecorator
+{
+public:
+ Lz4DeflatingWriteBuffer(
+ std::unique_ptr<WriteBuffer> out_,
+ int compression_level,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~Lz4DeflatingWriteBuffer() override;
+
+private:
+ void nextImpl() override;
+
+ void finalizeBefore() override;
+ void finalizeAfter() override;
+
+ LZ4F_preferences_t kPrefs; /// NOLINT
+ LZ4F_compressionContext_t ctx;
+
+ void * in_data;
+ void * out_data;
+
+ size_t in_capacity;
+ size_t out_capacity;
+
+ bool first_time = true;
+};
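+
+/// A minimal usage sketch (assuming WriteBufferFromFile and writeString from this
+/// IO library; the file name is hypothetical):
+///
+/// auto file = std::make_unique<WriteBufferFromFile>("data.lz4");
+/// Lz4DeflatingWriteBuffer out(std::move(file), /* compression_level */ 0);
+/// writeString(payload, out);
+/// out.finalize(); /// writes the LZ4 frame footer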
+}
diff --git a/contrib/clickhouse/src/IO/Lz4InflatingReadBuffer.cpp b/contrib/clickhouse/src/IO/Lz4InflatingReadBuffer.cpp
new file mode 100644
index 0000000000..eaa71048e7
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Lz4InflatingReadBuffer.cpp
@@ -0,0 +1,89 @@
+#include <IO/Lz4InflatingReadBuffer.h>
+#include <IO/WithFileName.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int LZ4_DECODER_FAILED;
+}
+
+Lz4InflatingReadBuffer::Lz4InflatingReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t buf_size, char * existing_memory, size_t alignment)
+ : CompressedReadBufferWrapper(std::move(in_), buf_size, existing_memory, alignment)
+ , in_data(nullptr)
+ , out_data(nullptr)
+ , in_available(0)
+ , out_available(0)
+{
+ size_t ret = LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION);
+
+ if (LZ4F_isError(ret))
+ throw Exception(
+ ErrorCodes::LZ4_DECODER_FAILED,
+ "LZ4 failed create decompression context LZ4F_dctx. LZ4F version: {}. Error: {}",
+ LZ4F_VERSION,
+ LZ4F_getErrorName(ret));
+}
+
+Lz4InflatingReadBuffer::~Lz4InflatingReadBuffer()
+{
+ LZ4F_freeDecompressionContext(dctx);
+}
+
+bool Lz4InflatingReadBuffer::nextImpl()
+{
+ if (eof_flag)
+ return false;
+
+ bool need_more_input = false;
+ size_t ret;
+
+ do
+ {
+ if (!in_available)
+ {
+ in->nextIfAtEnd();
+ in_available = in->buffer().end() - in->position();
+ }
+
+ in_data = reinterpret_cast<void *>(in->position());
+ out_data = reinterpret_cast<void *>(internal_buffer.begin());
+
+ out_available = internal_buffer.size();
+
+ size_t bytes_read = in_available;
+ size_t bytes_written = out_available;
+
+ ret = LZ4F_decompress(dctx, out_data, &bytes_written, in_data, &bytes_read, /* LZ4F_decompressOptions_t */ nullptr);
+
+ in_available -= bytes_read;
+ out_available -= bytes_written;
+
+ /// It may happen that we didn't get new uncompressed data
+ /// (for example if we read the end of frame). Load new data
+ /// in this case.
+ need_more_input = bytes_written == 0;
+
+ in->position() = in->buffer().end() - in_available;
+ }
+ while (need_more_input && !LZ4F_isError(ret) && !in->eof());
+
+ working_buffer.resize(internal_buffer.size() - out_available);
+
+ if (LZ4F_isError(ret))
+ throw Exception(
+ ErrorCodes::LZ4_DECODER_FAILED,
+ "LZ4 decompression failed. LZ4F version: {}. Error: {}{}",
+ LZ4F_VERSION,
+ LZ4F_getErrorName(ret),
+ getExceptionEntryWithFileName(*in));
+
+ if (in->eof())
+ {
+ eof_flag = true;
+ return !working_buffer.empty();
+ }
+
+ return true;
+}
+}
diff --git a/contrib/clickhouse/src/IO/Lz4InflatingReadBuffer.h b/contrib/clickhouse/src/IO/Lz4InflatingReadBuffer.h
new file mode 100644
index 0000000000..46bdc40670
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Lz4InflatingReadBuffer.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <IO/CompressedReadBufferWrapper.h>
+#include <IO/CompressionMethod.h>
+#include <IO/ReadBuffer.h>
+
+#include <lz4.h>
+#include <lz4frame.h>
+
+
+namespace DB
+{
+
+class Lz4InflatingReadBuffer : public CompressedReadBufferWrapper
+{
+public:
+ explicit Lz4InflatingReadBuffer(
+ std::unique_ptr<ReadBuffer> in_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~Lz4InflatingReadBuffer() override;
+
+private:
+ bool nextImpl() override;
+
+ LZ4F_dctx* dctx;
+
+ void * in_data;
+ void * out_data;
+
+ size_t in_available;
+ size_t out_available;
+
+ bool eof_flag = false;
+};
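+
+/// A minimal usage sketch (assuming ReadBufferFromFile and readStringUntilEOF
+/// from this IO library; the file name is hypothetical):
+///
+/// auto file = std::make_unique<ReadBufferFromFile>("data.lz4");
+/// Lz4InflatingReadBuffer in(std::move(file));
+/// String payload;
+/// readStringUntilEOF(payload, in);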
+
+}
diff --git a/contrib/clickhouse/src/IO/MMapReadBufferFromFile.cpp b/contrib/clickhouse/src/IO/MMapReadBufferFromFile.cpp
new file mode 100644
index 0000000000..86e05d7ae4
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MMapReadBufferFromFile.cpp
@@ -0,0 +1,79 @@
+#include <unistd.h>
+#include <fcntl.h>
+
+#include <Common/ProfileEvents.h>
+#include <Common/formatReadable.h>
+#include <IO/MMapReadBufferFromFile.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event FileOpen;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int FILE_DOESNT_EXIST;
+ extern const int CANNOT_OPEN_FILE;
+ extern const int CANNOT_CLOSE_FILE;
+}
+
+
+void MMapReadBufferFromFile::open()
+{
+ ProfileEvents::increment(ProfileEvents::FileOpen);
+
+ fd = ::open(file_name.c_str(), O_RDONLY | O_CLOEXEC);
+
+ if (-1 == fd)
+ throwFromErrnoWithPath("Cannot open file " + file_name, file_name,
+ errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
+}
+
+
+std::string MMapReadBufferFromFile::getFileName() const
+{
+ return file_name;
+}
+
+
+MMapReadBufferFromFile::MMapReadBufferFromFile(const std::string & file_name_, size_t offset, size_t length_)
+ : file_name(file_name_)
+{
+ open();
+ mapped.set(fd, offset, length_);
+ init();
+}
+
+
+MMapReadBufferFromFile::MMapReadBufferFromFile(const std::string & file_name_, size_t offset)
+ : file_name(file_name_)
+{
+ open();
+ mapped.set(fd, offset);
+ init();
+}
+
+
+MMapReadBufferFromFile::~MMapReadBufferFromFile()
+{
+ if (fd != -1)
+ close(); /// Exceptions will lead to std::terminate and that's Ok.
+}
+
+
+void MMapReadBufferFromFile::close()
+{
+ finish();
+
+ if (0 != ::close(fd))
+ throw Exception(ErrorCodes::CANNOT_CLOSE_FILE, "Cannot close file");
+
+ fd = -1;
+ metric_increment.destroy();
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/MMapReadBufferFromFile.h b/contrib/clickhouse/src/IO/MMapReadBufferFromFile.h
new file mode 100644
index 0000000000..bc566a0489
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MMapReadBufferFromFile.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <Common/CurrentMetrics.h>
+#include <IO/MMapReadBufferFromFileDescriptor.h>
+
+
+namespace CurrentMetrics
+{
+ extern const Metric OpenFileForRead;
+}
+
+
+namespace DB
+{
+
+class MMapReadBufferFromFile : public MMapReadBufferFromFileDescriptor
+{
+public:
+ MMapReadBufferFromFile(const std::string & file_name_, size_t offset, size_t length_);
+
+ /// Map till end of file.
+ MMapReadBufferFromFile(const std::string & file_name_, size_t offset);
+
+ ~MMapReadBufferFromFile() override;
+
+ void close();
+
+ std::string getFileName() const override;
+
+private:
+ int fd = -1;
+ std::string file_name;
+
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForRead};
+
+ void open();
+};
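+
+/// A minimal usage sketch (the path is hypothetical; readStringUntilEOF is assumed
+/// from this library's ReadHelpers):
+///
+/// MMapReadBufferFromFile in("/path/to/file", /* offset */ 0); /// maps till end of file
+/// String payload;
+/// readStringUntilEOF(payload, in);
+/// in.close();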
+
+}
+
diff --git a/contrib/clickhouse/src/IO/MMapReadBufferFromFileDescriptor.cpp b/contrib/clickhouse/src/IO/MMapReadBufferFromFileDescriptor.cpp
new file mode 100644
index 0000000000..9b1c132cc0
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MMapReadBufferFromFileDescriptor.cpp
@@ -0,0 +1,105 @@
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <Common/ProfileEvents.h>
+#include <Common/formatReadable.h>
+#include <Common/Exception.h>
+#include <Common/filesystemHelpers.h>
+#include <base/getPageSize.h>
+#include <IO/WriteHelpers.h>
+#include <IO/MMapReadBufferFromFileDescriptor.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ARGUMENT_OUT_OF_BOUND;
+ extern const int CANNOT_SEEK_THROUGH_FILE;
+}
+
+
+void MMapReadBufferFromFileDescriptor::init()
+{
+ size_t length = mapped.getLength();
+ BufferBase::set(mapped.getData(), length, 0);
+
+ size_t page_size = static_cast<size_t>(::getPageSize());
+ ReadBuffer::padded = (length % page_size) > 0 && (length % page_size) <= (page_size - (PADDING_FOR_SIMD - 1));
+}
+
+
+MMapReadBufferFromFileDescriptor::MMapReadBufferFromFileDescriptor(int fd, size_t offset, size_t length)
+ : mapped(fd, offset, length)
+{
+ init();
+}
+
+
+MMapReadBufferFromFileDescriptor::MMapReadBufferFromFileDescriptor(int fd, size_t offset)
+ : mapped(fd, offset)
+{
+ init();
+}
+
+
+void MMapReadBufferFromFileDescriptor::finish()
+{
+ mapped.finish();
+}
+
+
+std::string MMapReadBufferFromFileDescriptor::getFileName() const
+{
+ return "(fd = " + toString(mapped.getFD()) + ")";
+}
+
+int MMapReadBufferFromFileDescriptor::getFD() const
+{
+ return mapped.getFD();
+}
+
+off_t MMapReadBufferFromFileDescriptor::getPosition()
+{
+ return count();
+}
+
+off_t MMapReadBufferFromFileDescriptor::seek(off_t offset, int whence)
+{
+ off_t new_pos;
+ if (whence == SEEK_SET)
+ new_pos = offset;
+ else if (whence == SEEK_CUR)
+ new_pos = count() + offset;
+ else
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "MMapReadBufferFromFileDescriptor::seek expects SEEK_SET or SEEK_CUR as whence");
+
+ working_buffer = internal_buffer;
+ if (new_pos < 0 || new_pos > off_t(working_buffer.size()))
+ throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE,
+ "Cannot seek through file {} because seek position ({}) is out of bounds [0, {}]",
+ getFileName(), new_pos, working_buffer.size());
+
+ position() = working_buffer.begin() + new_pos;
+ return new_pos;
+}
+
+size_t MMapReadBufferFromFileDescriptor::getFileSize()
+{
+ return getSizeFromFileDescriptor(getFD(), getFileName());
+}
+
+size_t MMapReadBufferFromFileDescriptor::readBigAt(char * to, size_t n, size_t offset, const std::function<bool(size_t)> &)
+{
+ if (offset >= mapped.getLength())
+ return 0;
+
+ n = std::min(n, mapped.getLength() - offset);
+ memcpy(to, mapped.getData() + offset, n);
+ return n;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/MMapReadBufferFromFileDescriptor.h b/contrib/clickhouse/src/IO/MMapReadBufferFromFileDescriptor.h
new file mode 100644
index 0000000000..2a039e0497
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MMapReadBufferFromFileDescriptor.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <IO/ReadBufferFromFileBase.h>
+#include <IO/MMappedFileDescriptor.h>
+
+
+namespace DB
+{
+
+/** MMaps a range in a file and represents it as a ReadBuffer.
+ * Please note that mmap is not always the optimal way to read a file.
+ * Also, you cannot control whether and for how long the actual IO takes place,
+ * so this method is hard to manage and not recommended for anything except benchmarks.
+ */
+class MMapReadBufferFromFileDescriptor : public ReadBufferFromFileBase
+{
+public:
+ off_t seek(off_t off, int whence) override;
+
+protected:
+ MMapReadBufferFromFileDescriptor() = default;
+ void init();
+
+ MMappedFileDescriptor mapped;
+
+public:
+ MMapReadBufferFromFileDescriptor(int fd_, size_t offset_, size_t length_);
+
+ /// Map till end of file.
+ MMapReadBufferFromFileDescriptor(int fd_, size_t offset_);
+
+ /// Unmap the memory before the destructor is called
+ void finish();
+
+ off_t getPosition() override;
+
+ std::string getFileName() const override;
+
+ int getFD() const;
+
+ size_t getFileSize() override;
+
+ size_t readBigAt(char * to, size_t n, size_t offset, const std::function<bool(size_t)> &) override;
+ bool supportsReadAt() override { return true; }
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/MMapReadBufferFromFileWithCache.cpp b/contrib/clickhouse/src/IO/MMapReadBufferFromFileWithCache.cpp
new file mode 100644
index 0000000000..d13cf5db2f
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MMapReadBufferFromFileWithCache.cpp
@@ -0,0 +1,78 @@
+#include <IO/MMapReadBufferFromFileWithCache.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ARGUMENT_OUT_OF_BOUND;
+ extern const int CANNOT_SEEK_THROUGH_FILE;
+}
+
+
+void MMapReadBufferFromFileWithCache::init()
+{
+ size_t length = mapped->getLength();
+ BufferBase::set(mapped->getData(), length, 0);
+
+ size_t page_size = static_cast<size_t>(::getPageSize());
+ ReadBuffer::padded = (length % page_size) > 0 && (length % page_size) <= (page_size - (PADDING_FOR_SIMD - 1));
+ ReadBufferFromFileBase::file_size = length;
+}
+
+
+MMapReadBufferFromFileWithCache::MMapReadBufferFromFileWithCache(
+ MMappedFileCache & cache, const std::string & file_name, size_t offset, size_t length)
+{
+ mapped = cache.getOrSet(cache.hash(file_name, offset, length), [&]
+ {
+ return std::make_shared<MMappedFile>(file_name, offset, length);
+ });
+
+ init();
+}
+
+MMapReadBufferFromFileWithCache::MMapReadBufferFromFileWithCache(
+ MMappedFileCache & cache, const std::string & file_name, size_t offset)
+{
+ mapped = cache.getOrSet(cache.hash(file_name, offset, -1), [&]
+ {
+ return std::make_shared<MMappedFile>(file_name, offset);
+ });
+
+ init();
+}
+
+
+std::string MMapReadBufferFromFileWithCache::getFileName() const
+{
+ return mapped->getFileName();
+}
+
+off_t MMapReadBufferFromFileWithCache::getPosition()
+{
+ return count();
+}
+
+off_t MMapReadBufferFromFileWithCache::seek(off_t offset, int whence)
+{
+ off_t new_pos;
+ if (whence == SEEK_SET)
+ new_pos = offset;
+ else if (whence == SEEK_CUR)
+ new_pos = count() + offset;
+ else
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "MMapReadBufferFromFileWithCache::seek expects SEEK_SET or SEEK_CUR as whence");
+
+ working_buffer = internal_buffer;
+ if (new_pos < 0 || new_pos > off_t(working_buffer.size()))
+ throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE,
+ "Cannot seek through file {} because seek position ({}) is out of bounds [0, {}]",
+ getFileName(), new_pos, working_buffer.size());
+
+ position() = working_buffer.begin() + new_pos;
+ return new_pos;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/MMapReadBufferFromFileWithCache.h b/contrib/clickhouse/src/IO/MMapReadBufferFromFileWithCache.h
new file mode 100644
index 0000000000..ff84f81610
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MMapReadBufferFromFileWithCache.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <IO/ReadBufferFromFileBase.h>
+#include <IO/MMappedFileCache.h>
+#include <IO/MMapReadBufferFromFileDescriptor.h>
+
+
+namespace DB
+{
+
+class MMapReadBufferFromFileWithCache : public ReadBufferFromFileBase
+{
+public:
+ MMapReadBufferFromFileWithCache(MMappedFileCache & cache, const std::string & file_name, size_t offset, size_t length);
+
+ /// Map till end of file.
+ MMapReadBufferFromFileWithCache(MMappedFileCache & cache, const std::string & file_name, size_t offset);
+
+ off_t getPosition() override;
+ std::string getFileName() const override;
+ off_t seek(off_t offset, int whence) override;
+
+private:
+ MMappedFileCache::MappedPtr mapped;
+
+ void init();
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/MMappedFile.cpp b/contrib/clickhouse/src/IO/MMappedFile.cpp
new file mode 100644
index 0000000000..9e45140d5f
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MMappedFile.cpp
@@ -0,0 +1,78 @@
+#include <unistd.h>
+#include <fcntl.h>
+
+#include <Common/ProfileEvents.h>
+#include <Common/formatReadable.h>
+#include <Common/Exception.h>
+#include <IO/MMappedFile.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event FileOpen;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int FILE_DOESNT_EXIST;
+ extern const int CANNOT_OPEN_FILE;
+ extern const int CANNOT_CLOSE_FILE;
+}
+
+
+void MMappedFile::open()
+{
+ ProfileEvents::increment(ProfileEvents::FileOpen);
+
+ fd = ::open(file_name.c_str(), O_RDONLY | O_CLOEXEC);
+
+ if (-1 == fd)
+ throwFromErrnoWithPath("Cannot open file " + file_name, file_name,
+ errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
+}
+
+
+std::string MMappedFile::getFileName() const
+{
+ return file_name;
+}
+
+
+MMappedFile::MMappedFile(const std::string & file_name_, size_t offset_, size_t length_)
+ : file_name(file_name_)
+{
+ open();
+ set(fd, offset_, length_);
+}
+
+
+MMappedFile::MMappedFile(const std::string & file_name_, size_t offset_)
+ : file_name(file_name_)
+{
+ open();
+ set(fd, offset_);
+}
+
+
+MMappedFile::~MMappedFile()
+{
+ if (fd != -1)
+ close(); /// Exceptions will lead to std::terminate and that's Ok.
+}
+
+
+void MMappedFile::close()
+{
+ finish();
+
+ if (0 != ::close(fd))
+ throw Exception(ErrorCodes::CANNOT_CLOSE_FILE, "Cannot close file");
+
+ fd = -1;
+ metric_increment.destroy();
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/MMappedFile.h b/contrib/clickhouse/src/IO/MMappedFile.h
new file mode 100644
index 0000000000..6ecf988fa9
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MMappedFile.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <Common/CurrentMetrics.h>
+#include <IO/MMappedFileDescriptor.h>
+#include <cstddef>
+
+
+namespace CurrentMetrics
+{
+ extern const Metric OpenFileForRead;
+}
+
+
+namespace DB
+{
+
+/// Opens a file and mmaps a region in it (or a whole file) into memory. Unmaps and closes in destructor.
+class MMappedFile : public MMappedFileDescriptor
+{
+public:
+ MMappedFile(const std::string & file_name_, size_t offset_, size_t length_);
+
+ /// Map till end of file.
+ MMappedFile(const std::string & file_name_, size_t offset_);
+
+ ~MMappedFile() override;
+
+ void close();
+
+ std::string getFileName() const;
+
+private:
+ std::string file_name;
+
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForRead};
+
+ void open();
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/MMappedFileCache.h b/contrib/clickhouse/src/IO/MMappedFileCache.h
new file mode 100644
index 0000000000..bb30829ed6
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MMappedFileCache.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <Core/Types.h>
+#include <Common/HashTable/Hash.h>
+#include <Common/CacheBase.h>
+#include <Common/SipHash.h>
+#include <Common/ProfileEvents.h>
+#include <IO/MMappedFile.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event MMappedFileCacheHits;
+ extern const Event MMappedFileCacheMisses;
+}
+
+namespace DB
+{
+
+
+/** Cache of opened and mmapped files for reading.
+ * mmap/munmap is a heavy operation, and it is better to keep a mapped file around for subsequent use
+ * than to map and unmap it every time.
+ */
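+/// A minimal lookup sketch (values are illustrative; getOrSet() loads on a miss and counts hits/misses):
+///
+///     MMappedFileCache cache(1000);
+///     auto key = MMappedFileCache::hash("/path/to/data.bin", 0);
+///     auto mapped = cache.getOrSet(key, [&]
+///     {
+///         return std::make_shared<MMappedFile>("/path/to/data.bin", 0);
+///     });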
+class MMappedFileCache : public CacheBase<UInt128, MMappedFile, UInt128TrivialHash>
+{
+private:
+ using Base = CacheBase<UInt128, MMappedFile, UInt128TrivialHash>;
+
+public:
+ explicit MMappedFileCache(size_t max_size_in_bytes)
+ : Base(max_size_in_bytes) {}
+
+ /// Calculate key from path to file and offset.
+ static UInt128 hash(const String & path_to_file, size_t offset, ssize_t length = -1)
+ {
+ SipHash hash;
+ hash.update(path_to_file.data(), path_to_file.size() + 1);
+ hash.update(offset);
+ hash.update(length);
+
+ return hash.get128();
+ }
+
+ template <typename LoadFunc>
+ MappedPtr getOrSet(const Key & key, LoadFunc && load)
+ {
+ auto result = Base::getOrSet(key, load);
+ if (result.second)
+ ProfileEvents::increment(ProfileEvents::MMappedFileCacheMisses);
+ else
+ ProfileEvents::increment(ProfileEvents::MMappedFileCacheHits);
+
+ return result.first;
+ }
+};
+
+using MMappedFileCachePtr = std::shared_ptr<MMappedFileCache>;
+
+}
+
diff --git a/contrib/clickhouse/src/IO/MMappedFileDescriptor.cpp b/contrib/clickhouse/src/IO/MMappedFileDescriptor.cpp
new file mode 100644
index 0000000000..9cc1aaf656
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MMappedFileDescriptor.cpp
@@ -0,0 +1,107 @@
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <fmt/format.h>
+
+#include <Common/formatReadable.h>
+#include <Common/Exception.h>
+#include <base/getPageSize.h>
+#include <IO/MMappedFileDescriptor.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_ALLOCATE_MEMORY;
+ extern const int CANNOT_MUNMAP;
+ extern const int CANNOT_STAT;
+ extern const int BAD_ARGUMENTS;
+ extern const int LOGICAL_ERROR;
+}
+
+
+static size_t getFileSize(int fd)
+{
+ struct stat stat_res {};
+ if (0 != fstat(fd, &stat_res))
+ throwFromErrno("MMappedFileDescriptor: Cannot fstat.", ErrorCodes::CANNOT_STAT);
+
+ off_t file_size = stat_res.st_size;
+
+ if (file_size < 0)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "MMappedFileDescriptor: fstat returned negative file size");
+
+ return file_size;
+}
+
+
+MMappedFileDescriptor::MMappedFileDescriptor(int fd_, size_t offset_, size_t length_)
+{
+ set(fd_, offset_, length_);
+}
+
+MMappedFileDescriptor::MMappedFileDescriptor(int fd_, size_t offset_)
+ : fd(fd_), offset(offset_)
+{
+ set(fd_, offset_);
+}
+
+void MMappedFileDescriptor::set(int fd_, size_t offset_, size_t length_)
+{
+ finish();
+
+ fd = fd_;
+ offset = offset_;
+ length = length_;
+
+ if (!length)
+ return;
+
+ void * buf = mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fd, offset);
+ if (MAP_FAILED == buf)
+ throwFromErrno(fmt::format("MMappedFileDescriptor: Cannot mmap {}.", ReadableSize(length)),
+ ErrorCodes::CANNOT_ALLOCATE_MEMORY);
+
+ data = static_cast<char *>(buf);
+
+ files_metric_increment.changeTo(1);
+ bytes_metric_increment.changeTo(length);
+}
+
+void MMappedFileDescriptor::set(int fd_, size_t offset_)
+{
+ size_t file_size = getFileSize(fd_);
+
+    if (offset_ > static_cast<size_t>(file_size))
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "MMappedFileDescriptor: requested offset is greater than file size");
+
+    set(fd_, offset_, file_size - offset_);
+}
+
+void MMappedFileDescriptor::finish()
+{
+ if (!length)
+ return;
+
+ if (0 != munmap(data, length))
+ throwFromErrno(fmt::format("MMappedFileDescriptor: Cannot munmap {}.", ReadableSize(length)),
+ ErrorCodes::CANNOT_MUNMAP);
+
+ length = 0;
+
+ files_metric_increment.changeTo(0);
+ bytes_metric_increment.changeTo(0);
+}
+
+MMappedFileDescriptor::~MMappedFileDescriptor()
+{
+ finish(); /// Exceptions will lead to std::terminate and that's Ok.
+}
+
+}
+
+
diff --git a/contrib/clickhouse/src/IO/MMappedFileDescriptor.h b/contrib/clickhouse/src/IO/MMappedFileDescriptor.h
new file mode 100644
index 0000000000..2611093643
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MMappedFileDescriptor.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <cstddef>
+#include <Common/CurrentMetrics.h>
+
+namespace CurrentMetrics
+{
+ extern const Metric MMappedFiles;
+ extern const Metric MMappedFileBytes;
+}
+
+
+namespace DB
+{
+
+/// Mmaps a region of a file (or the whole file) into memory. Unmaps it in the destructor.
+/// Does not open or close the file.
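+///
+/// A minimal sketch (fcntl.h and the path are assumptions; the caller keeps ownership of the descriptor):
+///
+///     int fd = ::open("/path/to/data.bin", O_RDONLY);
+///     MMappedFileDescriptor mapped(fd, 0);    /// map the whole file
+///     /// bytes are readable at mapped.getData()[0 .. mapped.getLength() - 1];
+///     /// fd must stay open while the mapping is in use and must be closed by the caller.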
+class MMappedFileDescriptor
+{
+public:
+ MMappedFileDescriptor(int fd_, size_t offset_, size_t length_);
+ MMappedFileDescriptor(int fd_, size_t offset_);
+
+ /// Makes empty object that can be initialized with `set`.
+ MMappedFileDescriptor() = default;
+
+ virtual ~MMappedFileDescriptor();
+
+ char * getData() { return data; }
+ const char * getData() const { return data; }
+
+ int getFD() const { return fd; }
+ size_t getOffset() const { return offset; }
+ size_t getLength() const { return length; }
+
+ /// Unmap memory before call to destructor
+ void finish();
+
+ /// Initialize or reset to another fd.
+ void set(int fd_, size_t offset_, size_t length_);
+ void set(int fd_, size_t offset_);
+
+ MMappedFileDescriptor(const MMappedFileDescriptor &) = delete;
+ MMappedFileDescriptor(MMappedFileDescriptor &&) = delete;
+
+protected:
+
+ void init();
+
+ int fd = -1;
+ size_t offset = 0;
+ size_t length = 0;
+ char * data = nullptr;
+
+ CurrentMetrics::Increment files_metric_increment{CurrentMetrics::MMappedFiles, 0};
+ CurrentMetrics::Increment bytes_metric_increment{CurrentMetrics::MMappedFileBytes, 0};
+};
+
+}
+
diff --git a/contrib/clickhouse/src/IO/MemoryReadWriteBuffer.cpp b/contrib/clickhouse/src/IO/MemoryReadWriteBuffer.cpp
new file mode 100644
index 0000000000..415a6c6fad
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MemoryReadWriteBuffer.cpp
@@ -0,0 +1,147 @@
+#include <IO/MemoryReadWriteBuffer.h>
+#include <boost/noncopyable.hpp>
+
+
+namespace DB
+{
+
+class ReadBufferFromMemoryWriteBuffer : public ReadBuffer, boost::noncopyable, private Allocator<false>
+{
+public:
+ explicit ReadBufferFromMemoryWriteBuffer(MemoryWriteBuffer && origin)
+ : ReadBuffer(nullptr, 0),
+ chunk_list(std::move(origin.chunk_list)),
+ end_pos(origin.position())
+ {
+ chunk_head = chunk_list.begin();
+ setChunk();
+ }
+
+ bool nextImpl() override
+ {
+ if (chunk_head == chunk_list.end())
+ return false;
+
+ ++chunk_head;
+ return setChunk();
+ }
+
+ ~ReadBufferFromMemoryWriteBuffer() override
+ {
+ for (const auto & range : chunk_list)
+ free(range.begin(), range.size());
+ }
+
+private:
+
+ /// update buffers and position according to chunk_head pointer
+ bool setChunk()
+ {
+ if (chunk_head != chunk_list.end())
+ {
+ internalBuffer() = *chunk_head;
+
+            /// Only the last chunk is truncated (to end_pos); all other chunks are used in full.
+            if (std::next(chunk_head) != chunk_list.end())
+                buffer() = internalBuffer();
+            else
+                buffer() = Buffer(internalBuffer().begin(), end_pos);
+
+ position() = buffer().begin();
+ }
+ else
+ {
+ buffer() = internalBuffer() = Buffer(nullptr, nullptr);
+ position() = nullptr;
+ }
+
+ return !buffer().empty();
+ }
+
+ using Container = std::forward_list<BufferBase::Buffer>;
+
+ Container chunk_list;
+ Container::iterator chunk_head;
+ Position end_pos;
+};
+
+
+MemoryWriteBuffer::MemoryWriteBuffer(size_t max_total_size_, size_t initial_chunk_size_, double growth_rate_, size_t max_chunk_size_)
+ : WriteBuffer(nullptr, 0),
+ max_total_size(max_total_size_),
+ initial_chunk_size(initial_chunk_size_),
+ max_chunk_size(max_chunk_size_),
+ growth_rate(growth_rate_)
+{
+ addChunk();
+}
+
+
+void MemoryWriteBuffer::nextImpl()
+{
+ if (unlikely(hasPendingData()))
+ {
+ /// ignore flush
+ buffer() = Buffer(pos, buffer().end());
+ return;
+ }
+
+ addChunk();
+}
+
+
+void MemoryWriteBuffer::addChunk()
+{
+ size_t next_chunk_size;
+ if (chunk_list.empty())
+ {
+ chunk_tail = chunk_list.before_begin();
+ next_chunk_size = initial_chunk_size;
+ }
+ else
+ {
+ next_chunk_size = std::max(1uz, static_cast<size_t>(chunk_tail->size() * growth_rate));
+ next_chunk_size = std::min(next_chunk_size, max_chunk_size);
+ }
+
+ if (max_total_size)
+ {
+ if (total_chunks_size + next_chunk_size > max_total_size)
+ next_chunk_size = max_total_size - total_chunks_size;
+
+ if (0 == next_chunk_size)
+ {
+ set(position(), 0);
+ throw MemoryWriteBuffer::CurrentBufferExhausted();
+ }
+ }
+
+ Position begin = reinterpret_cast<Position>(alloc(next_chunk_size));
+ chunk_tail = chunk_list.emplace_after(chunk_tail, begin, begin + next_chunk_size);
+ total_chunks_size += next_chunk_size;
+
+ set(chunk_tail->begin(), chunk_tail->size());
+}
+
+
+std::shared_ptr<ReadBuffer> MemoryWriteBuffer::getReadBufferImpl()
+{
+ finalize();
+
+ auto res = std::make_shared<ReadBufferFromMemoryWriteBuffer>(std::move(*this));
+
+ /// invalidate members
+ chunk_list.clear();
+ chunk_tail = chunk_list.begin();
+
+ return res;
+}
+
+
+MemoryWriteBuffer::~MemoryWriteBuffer()
+{
+ for (const auto & range : chunk_list)
+ free(range.begin(), range.size());
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/MemoryReadWriteBuffer.h b/contrib/clickhouse/src/IO/MemoryReadWriteBuffer.h
new file mode 100644
index 0000000000..d6bf231c22
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MemoryReadWriteBuffer.h
@@ -0,0 +1,60 @@
+#pragma once
+#include <forward_list>
+
+#include <IO/WriteBuffer.h>
+#include <IO/IReadableWriteBuffer.h>
+#include <Common/Allocator.h>
+#include <Core/Defines.h>
+#include <boost/noncopyable.hpp>
+
+
+namespace DB
+{
+
+/// Stores data in memory chunks whose sizes grow exponentially during the write.
+/// Written data can be re-read after the write is finished.
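+///
+/// A minimal round-trip sketch (tryGetReadBuffer() is inherited from IReadableWriteBuffer; it finalizes
+/// the writer and moves the chunks into the returned reader, so the writer must not be used afterwards):
+///
+///     auto out = std::make_unique<MemoryWriteBuffer>();
+///     writeString("hello", *out);
+///     auto in = out->tryGetReadBuffer();
+///     String s;
+///     readStringUntilEOF(s, *in);    /// s == "hello"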
+class MemoryWriteBuffer : public WriteBuffer, public IReadableWriteBuffer, boost::noncopyable, private Allocator<false>
+{
+public:
+ /// Special exception to throw when the current WriteBuffer cannot receive data
+ class CurrentBufferExhausted : public std::exception
+ {
+ public:
+ const char * what() const noexcept override { return "MemoryWriteBuffer limit is exhausted"; }
+ };
+
+ /// Use max_total_size_ = 0 for unlimited storage
+ explicit MemoryWriteBuffer(
+ size_t max_total_size_ = 0,
+ size_t initial_chunk_size_ = DBMS_DEFAULT_BUFFER_SIZE,
+ double growth_rate_ = 2.0,
+ size_t max_chunk_size_ = 128 * DBMS_DEFAULT_BUFFER_SIZE);
+
+ ~MemoryWriteBuffer() override;
+
+protected:
+
+ void nextImpl() override;
+
+ void finalizeImpl() override { /* no op */ }
+
+ std::shared_ptr<ReadBuffer> getReadBufferImpl() override;
+
+ const size_t max_total_size;
+ const size_t initial_chunk_size;
+ const size_t max_chunk_size;
+ const double growth_rate;
+
+ using Container = std::forward_list<BufferBase::Buffer>;
+
+ Container chunk_list;
+ Container::iterator chunk_tail;
+ size_t total_chunks_size = 0;
+
+ void addChunk();
+
+ friend class ReadBufferFromMemoryWriteBuffer;
+};
+
+
+}
diff --git a/contrib/clickhouse/src/IO/MySQLBinlogEventReadBuffer.cpp b/contrib/clickhouse/src/IO/MySQLBinlogEventReadBuffer.cpp
new file mode 100644
index 0000000000..9f05c5b5e0
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MySQLBinlogEventReadBuffer.cpp
@@ -0,0 +1,80 @@
+#include <IO/MySQLBinlogEventReadBuffer.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int LOGICAL_ERROR;
+}
+
+MySQLBinlogEventReadBuffer::MySQLBinlogEventReadBuffer(ReadBuffer & in_, size_t checksum_signature_length_)
+ : ReadBuffer(nullptr, 0, 0), in(in_), checksum_signature_length(checksum_signature_length_)
+{
+ if (checksum_signature_length > MAX_CHECKSUM_SIGNATURE_LENGTH)
+        throw Exception(ErrorCodes::LOGICAL_ERROR,
+            "checksum_signature_length must not be greater than MAX_CHECKSUM_SIGNATURE_LENGTH. "
+            "It is a bug.");
+
+ nextIfAtEnd();
+}
+
+bool MySQLBinlogEventReadBuffer::nextImpl()
+{
+ if (hasPendingData())
+ return true;
+
+ if (in.eof())
+ return false;
+
+ if (checksum_buff_size == checksum_buff_limit)
+ {
+ if (likely(in.available() > checksum_signature_length))
+ {
+ working_buffer = ReadBuffer::Buffer(in.position(), in.buffer().end() - checksum_signature_length);
+ in.ignore(working_buffer.size());
+ return true;
+ }
+
+ in.readStrict(checksum_buf, checksum_signature_length);
+ checksum_buff_size = checksum_buff_limit = checksum_signature_length;
+ }
+ else
+ {
+ for (size_t index = 0; index < checksum_buff_size - checksum_buff_limit; ++index)
+ checksum_buf[index] = checksum_buf[checksum_buff_limit + index];
+
+ checksum_buff_size -= checksum_buff_limit;
+ size_t read_bytes = checksum_signature_length - checksum_buff_size;
+ in.readStrict(checksum_buf + checksum_buff_size, read_bytes); /// Minimum checksum_signature_length bytes
+ checksum_buff_size = checksum_buff_limit = checksum_signature_length;
+ }
+
+ if (in.eof())
+ return false;
+
+ if (in.available() < checksum_signature_length)
+ {
+ size_t left_move_size = checksum_signature_length - in.available();
+ checksum_buff_limit = checksum_buff_size - left_move_size;
+ }
+
+ working_buffer = ReadBuffer::Buffer(checksum_buf, checksum_buf + checksum_buff_limit);
+ return true;
+}
+
+MySQLBinlogEventReadBuffer::~MySQLBinlogEventReadBuffer()
+{
+ try
+ {
+ /// ignore last checksum_signature_length bytes
+ nextIfAtEnd();
+ }
+ catch (...)
+ {
+ tryLogCurrentException(__PRETTY_FUNCTION__);
+ }
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/MySQLBinlogEventReadBuffer.h b/contrib/clickhouse/src/IO/MySQLBinlogEventReadBuffer.h
new file mode 100644
index 0000000000..7212a54884
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MySQLBinlogEventReadBuffer.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+
+namespace DB
+{
+
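+/// Exposes a binlog event stream without its trailing checksum: the last checksum_signature_length
+/// bytes (e.g. 4 for CRC32) are withheld from the working buffer. A minimal sketch (the input is an assumption):
+///
+///     ReadBufferFromString event(data);                /// event body followed by a 4-byte CRC32
+///     MySQLBinlogEventReadBuffer payload(event, 4);
+///     /// reads from `payload` end before the checksum bytes
+///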
+class MySQLBinlogEventReadBuffer : public ReadBuffer
+{
+protected:
+ static const size_t MAX_CHECKSUM_SIGNATURE_LENGTH = 4;
+
+ ReadBuffer & in;
+ size_t checksum_signature_length = 0;
+
+ size_t checksum_buff_size = 0;
+ size_t checksum_buff_limit = 0;
+ char checksum_buf[MAX_CHECKSUM_SIGNATURE_LENGTH]{};
+
+ bool nextImpl() override;
+
+public:
+ ~MySQLBinlogEventReadBuffer() override;
+
+ MySQLBinlogEventReadBuffer(ReadBuffer & in_, size_t checksum_signature_length_);
+
+};
+
+
+}
diff --git a/contrib/clickhouse/src/IO/MySQLPacketPayloadReadBuffer.cpp b/contrib/clickhouse/src/IO/MySQLPacketPayloadReadBuffer.cpp
new file mode 100644
index 0000000000..2c5167ed03
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MySQLPacketPayloadReadBuffer.cpp
@@ -0,0 +1,61 @@
+#include <IO/MySQLPacketPayloadReadBuffer.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int UNKNOWN_PACKET_FROM_CLIENT;
+}
+
+const size_t MAX_PACKET_LENGTH = (1 << 24) - 1; // 16 MiB - 1
+
+MySQLPacketPayloadReadBuffer::MySQLPacketPayloadReadBuffer(ReadBuffer & in_, uint8_t & sequence_id_)
+ : ReadBuffer(in_.position(), 0), in(in_), sequence_id(sequence_id_) // not in.buffer().begin(), because working buffer may include previous packet
+{
+}
+
+bool MySQLPacketPayloadReadBuffer::nextImpl()
+{
+ if (!has_read_header || (payload_length == MAX_PACKET_LENGTH && offset == payload_length))
+ {
+ has_read_header = true;
+ working_buffer.resize(0);
+ offset = 0;
+ payload_length = 0;
+ in.readStrict(reinterpret_cast<char *>(&payload_length), 3);
+
+ if (payload_length > MAX_PACKET_LENGTH)
+ throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_CLIENT,
+ "Received packet with payload larger than max_packet_size: {}", payload_length);
+
+ size_t packet_sequence_id = 0;
+ in.readStrict(reinterpret_cast<char &>(packet_sequence_id));
+ if (packet_sequence_id != sequence_id)
+ throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_CLIENT,
+ "Received packet with wrong sequence-id: {}. Expected: {}.", packet_sequence_id, static_cast<unsigned int>(sequence_id));
+ sequence_id++;
+
+ if (payload_length == 0)
+ return false;
+ }
+ else if (offset == payload_length)
+ {
+ return false;
+ }
+
+ in.nextIfAtEnd();
+ /// Don't return a buffer when no bytes available
+ if (!in.hasPendingData())
+ return false;
+ working_buffer = ReadBuffer::Buffer(in.position(), in.buffer().end());
+ size_t count = std::min(in.available(), payload_length - offset);
+ working_buffer.resize(count);
+ in.ignore(count);
+
+ offset += count;
+
+ return true;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/MySQLPacketPayloadReadBuffer.h b/contrib/clickhouse/src/IO/MySQLPacketPayloadReadBuffer.h
new file mode 100644
index 0000000000..f90a34ba93
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MySQLPacketPayloadReadBuffer.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+
+namespace DB
+{
+
+/** Reading packets.
+ * Internally, when no more data is available, it calls the next() method of the underlying ReadBuffer (e.g. ReadBufferFromPocoSocket) and sets the working buffer to the remaining part of the current packet payload.
+ */
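+///
+/// A minimal sketch (`in` and the sequence id are assumptions):
+///
+///     uint8_t sequence_id = 0;
+///     MySQLPacketPayloadReadBuffer payload(in, sequence_id);   /// `in` reads from the connection
+///     String body;
+///     readStringUntilEOF(body, payload);   /// reads one logical packet, reassembling 16 MiB splits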
+class MySQLPacketPayloadReadBuffer : public ReadBuffer
+{
+private:
+ ReadBuffer & in;
+ uint8_t & sequence_id;
+
+ bool has_read_header = false;
+
+ // Size of packet which is being read now.
+ size_t payload_length = 0;
+
+ // Offset in packet payload.
+ size_t offset = 0;
+
+protected:
+ bool nextImpl() override;
+
+public:
+ MySQLPacketPayloadReadBuffer(ReadBuffer & in_, uint8_t & sequence_id_);
+};
+
+}
+
diff --git a/contrib/clickhouse/src/IO/MySQLPacketPayloadWriteBuffer.cpp b/contrib/clickhouse/src/IO/MySQLPacketPayloadWriteBuffer.cpp
new file mode 100644
index 0000000000..425e1b8d08
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MySQLPacketPayloadWriteBuffer.cpp
@@ -0,0 +1,61 @@
+#include <IO/MySQLPacketPayloadWriteBuffer.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_WRITE_AFTER_END_OF_BUFFER;
+}
+
+const size_t MAX_PACKET_LENGTH = (1 << 24) - 1; // 16 MiB - 1
+
+MySQLPacketPayloadWriteBuffer::MySQLPacketPayloadWriteBuffer(WriteBuffer & out_, size_t payload_length_, uint8_t & sequence_id_)
+ : WriteBuffer(out_.position(), 0), out(out_), sequence_id(sequence_id_), total_left(payload_length_)
+{
+ startNewPacket();
+ setWorkingBuffer();
+ pos = out.position();
+}
+
+void MySQLPacketPayloadWriteBuffer::startNewPacket()
+{
+ payload_length = std::min(total_left, MAX_PACKET_LENGTH);
+ bytes_written = 0;
+ total_left -= payload_length;
+
+ out.write(reinterpret_cast<char *>(&payload_length), 3);
+ out.write(sequence_id++);
+ bytes += 4;
+}
+
+void MySQLPacketPayloadWriteBuffer::setWorkingBuffer()
+{
+ out.nextIfAtEnd();
+ working_buffer = WriteBuffer::Buffer(out.position(), out.position() + std::min(payload_length - bytes_written, out.available()));
+
+ if (payload_length - bytes_written == 0)
+ {
+        /// Finished writing the packet. Due to the implementation of WriteBuffer, working_buffer cannot be empty, so it is resized to one byte; further write attempts will throw an exception.
+ eof = true;
+ working_buffer.resize(1);
+ }
+}
+
+void MySQLPacketPayloadWriteBuffer::nextImpl()
+{
+ size_t written = pos - working_buffer.begin();
+ if (eof)
+ throw Exception(ErrorCodes::CANNOT_WRITE_AFTER_END_OF_BUFFER, "Cannot write after end of buffer.");
+
+ out.position() += written;
+ bytes_written += written;
+
+    /// Payloads larger than MAX_PACKET_LENGTH are split into several packets of size MAX_PACKET_LENGTH and a last packet of size < MAX_PACKET_LENGTH.
+ if (bytes_written == payload_length && (total_left > 0 || payload_length == MAX_PACKET_LENGTH))
+ startNewPacket();
+
+ setWorkingBuffer();
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/MySQLPacketPayloadWriteBuffer.h b/contrib/clickhouse/src/IO/MySQLPacketPayloadWriteBuffer.h
new file mode 100644
index 0000000000..d4ce8a8955
--- /dev/null
+++ b/contrib/clickhouse/src/IO/MySQLPacketPayloadWriteBuffer.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <IO/WriteBuffer.h>
+
+namespace DB
+{
+
+/** Writing packets.
+ * https://dev.mysql.com/doc/internals/en/mysql-packet.html
+ */
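+///
+/// On the wire each packet is a 3-byte little-endian payload length, a 1-byte sequence id, then the
+/// payload; payloads of 2^24 - 1 bytes or more are split across several packets. A minimal sketch
+/// (`out`, `query` and the sequence id are assumptions):
+///
+///     uint8_t sequence_id = 0;
+///     MySQLPacketPayloadWriteBuffer packet(out, query.size(), sequence_id);
+///     writeString(query, packet);
+///     packet.next();   /// flush the remainder into `out`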
+class MySQLPacketPayloadWriteBuffer : public WriteBuffer
+{
+public:
+ MySQLPacketPayloadWriteBuffer(WriteBuffer & out_, size_t payload_length_, uint8_t & sequence_id_);
+
+    size_t remainingPayloadSize() const { return total_left; }
+
+protected:
+ void nextImpl() override;
+
+private:
+ WriteBuffer & out;
+ uint8_t & sequence_id;
+
+ size_t total_left = 0;
+ size_t payload_length = 0;
+ size_t bytes_written = 0;
+ bool eof = false;
+
+ void startNewPacket();
+
+ /// Sets working buffer to the rest of current packet payload.
+ void setWorkingBuffer();
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/NullWriteBuffer.cpp b/contrib/clickhouse/src/IO/NullWriteBuffer.cpp
new file mode 100644
index 0000000000..295c53ef7c
--- /dev/null
+++ b/contrib/clickhouse/src/IO/NullWriteBuffer.cpp
@@ -0,0 +1,16 @@
+#include <IO/NullWriteBuffer.h>
+
+
+namespace DB
+{
+
+NullWriteBuffer::NullWriteBuffer()
+ : WriteBuffer(data, sizeof(data))
+{
+}
+
+void NullWriteBuffer::nextImpl()
+{
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/NullWriteBuffer.h b/contrib/clickhouse/src/IO/NullWriteBuffer.h
new file mode 100644
index 0000000000..f14c74ff72
--- /dev/null
+++ b/contrib/clickhouse/src/IO/NullWriteBuffer.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <IO/WriteBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+#include <boost/noncopyable.hpp>
+
+namespace DB
+{
+
+/// Simply does nothing; can be used to measure the amount of written bytes.
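+///
+/// A minimal byte-counting sketch (count() is inherited from BufferBase):
+///
+///     NullWriteBuffer out;
+///     writeString("hello", out);
+///     size_t n = out.count();   /// n == 5, while nothing was actually written anywhere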
+class NullWriteBuffer : public WriteBuffer, boost::noncopyable
+{
+public:
+ NullWriteBuffer();
+ void nextImpl() override;
+
+private:
+ char data[128];
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/OpenedFile.cpp b/contrib/clickhouse/src/IO/OpenedFile.cpp
new file mode 100644
index 0000000000..b75e087e5c
--- /dev/null
+++ b/contrib/clickhouse/src/IO/OpenedFile.cpp
@@ -0,0 +1,77 @@
+#include <mutex>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include <Common/ProfileEvents.h>
+#include <Common/Exception.h>
+#include <IO/OpenedFile.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event FileOpen;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int FILE_DOESNT_EXIST;
+ extern const int CANNOT_OPEN_FILE;
+ extern const int CANNOT_CLOSE_FILE;
+}
+
+
+void OpenedFile::open() const
+{
+ ProfileEvents::increment(ProfileEvents::FileOpen);
+
+ fd = ::open(file_name.c_str(), (flags == -1 ? 0 : flags) | O_RDONLY | O_CLOEXEC);
+
+ if (-1 == fd)
+ throwFromErrnoWithPath("Cannot open file " + file_name, file_name,
+ errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
+}
+
+int OpenedFile::getFD() const
+{
+ std::lock_guard l(mutex);
+ if (fd == -1)
+ open();
+ return fd;
+}
+
+std::string OpenedFile::getFileName() const
+{
+ return file_name;
+}
+
+
+OpenedFile::OpenedFile(const std::string & file_name_, int flags_)
+ : file_name(file_name_), flags(flags_)
+{
+}
+
+
+OpenedFile::~OpenedFile()
+{
+ close(); /// Exceptions will lead to std::terminate and that's Ok.
+}
+
+
+void OpenedFile::close()
+{
+ std::lock_guard l(mutex);
+ if (fd == -1)
+ return;
+
+ if (0 != ::close(fd))
+ throw Exception(ErrorCodes::CANNOT_CLOSE_FILE, "Cannot close file");
+
+ fd = -1;
+ metric_increment.destroy();
+}
+
+}
+
diff --git a/contrib/clickhouse/src/IO/OpenedFile.h b/contrib/clickhouse/src/IO/OpenedFile.h
new file mode 100644
index 0000000000..10c36d9e1d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/OpenedFile.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <Common/CurrentMetrics.h>
+#include <memory>
+#include <mutex>
+
+
+namespace CurrentMetrics
+{
+ extern const Metric OpenFileForRead;
+}
+
+
+namespace DB
+{
+
+/// RAII for readonly opened file descriptor.
+class OpenedFile
+{
+public:
+ OpenedFile(const std::string & file_name_, int flags_);
+ ~OpenedFile();
+
+    /// Close prematurely.
+ void close();
+
+ int getFD() const;
+ std::string getFileName() const;
+
+private:
+ std::string file_name;
+ int flags = 0;
+
+ mutable int fd = -1;
+ mutable std::mutex mutex;
+
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForRead};
+
+ void open() const;
+};
+
+}
+
diff --git a/contrib/clickhouse/src/IO/OpenedFileCache.h b/contrib/clickhouse/src/IO/OpenedFileCache.h
new file mode 100644
index 0000000000..2cecc675af
--- /dev/null
+++ b/contrib/clickhouse/src/IO/OpenedFileCache.h
@@ -0,0 +1,116 @@
+#pragma once
+
+#include <map>
+#include <mutex>
+
+#include <Core/Types.h>
+#include <IO/OpenedFile.h>
+#include <Common/ElapsedTimeProfileEventIncrement.h>
+#include <Common/ProfileEvents.h>
+
+#include <city.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event OpenedFileCacheHits;
+ extern const Event OpenedFileCacheMisses;
+ extern const Event OpenedFileCacheMicroseconds;
+}
+
+namespace DB
+{
+
+
+/** Cache of opened files for reading.
+ * It allows sharing file descriptors when reading read-only files with 'pread' syscalls.
+ * Note: opening/closing files is very cheap on Linux, and we should not bother doing it 10 000 times a second.
+ * (This may not be the case on Windows with WSL. It is also not the case if strace is active or some eBPF program is loaded.)
+ * But sometimes we may end up opening the same file multiple times, which increases the chance of exhausting the open files limit.
+ */
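+///
+/// A minimal sketch (the path is an assumption; flags of -1 mean a plain read-only open):
+///
+///     auto file = OpenedFileCache::instance().get("/path/to/data.bin", -1);
+///     int fd = file->getFD();   /// shared with other concurrent readers of the same (path, flags)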
+class OpenedFileCache
+{
+ class OpenedFileMap
+ {
+ using Key = std::pair<std::string /* path */, int /* flags */>;
+
+ using OpenedFileWeakPtr = std::weak_ptr<OpenedFile>;
+ using Files = std::map<Key, OpenedFileWeakPtr>;
+
+ Files files;
+ std::mutex mutex;
+
+ public:
+ using OpenedFilePtr = std::shared_ptr<OpenedFile>;
+
+ OpenedFilePtr get(const std::string & path, int flags)
+ {
+ Key key(path, flags);
+
+ std::lock_guard lock(mutex);
+
+ auto [it, inserted] = files.emplace(key, OpenedFilePtr{});
+ if (!inserted)
+ {
+ if (auto res = it->second.lock())
+ {
+ ProfileEvents::increment(ProfileEvents::OpenedFileCacheHits);
+ return res;
+ }
+ }
+ ProfileEvents::increment(ProfileEvents::OpenedFileCacheMisses);
+
+ OpenedFilePtr res
+ {
+ new OpenedFile(path, flags),
+ [key, this](auto ptr)
+ {
+ {
+ std::lock_guard another_lock(mutex);
+ files.erase(key);
+ }
+ delete ptr;
+ }
+ };
+
+ it->second = res;
+ return res;
+ }
+
+ void remove(const std::string & path, int flags)
+ {
+ Key key(path, flags);
+ std::lock_guard lock(mutex);
+ files.erase(key);
+ }
+ };
+
+ static constexpr size_t buckets = 1024;
+ std::vector<OpenedFileMap> impls{buckets};
+
+public:
+ using OpenedFilePtr = OpenedFileMap::OpenedFilePtr;
+
+ OpenedFilePtr get(const std::string & path, int flags)
+ {
+ ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::OpenedFileCacheMicroseconds);
+ const auto bucket = CityHash_v1_0_2::CityHash64(path.data(), path.length()) % buckets;
+ return impls[bucket].get(path, flags);
+ }
+
+ void remove(const std::string & path, int flags)
+ {
+ ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::OpenedFileCacheMicroseconds);
+ const auto bucket = CityHash_v1_0_2::CityHash64(path.data(), path.length()) % buckets;
+ impls[bucket].remove(path, flags);
+ }
+
+ static OpenedFileCache & instance()
+ {
+ static OpenedFileCache res;
+ return res;
+ }
+};
+
+using OpenedFileCachePtr = std::shared_ptr<OpenedFileCache>;
+}
diff --git a/contrib/clickhouse/src/IO/Operators.h b/contrib/clickhouse/src/IO/Operators.h
new file mode 100644
index 0000000000..185745e841
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Operators.h
@@ -0,0 +1,98 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+#include <IO/WriteBuffer.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+
+#include <functional>
+
+
+namespace DB
+{
+
+/** Implements the ability to write and read data in/from WriteBuffer/ReadBuffer
+ * with the help of << and >> operators and also manipulators,
+ * providing a usage style similar to iostreams.
+ *
+ * It is neither a subset nor an extension of iostreams.
+ *
+ * Example usage:
+ *
+ * DB::WriteBufferFromFileDescriptor buf(STDOUT_FILENO);
+ * buf << DB::double_quote << "Hello, world!" << '\n' << DB::flush;
+ *
+ * Outputs the `char` type (usually it is Int8) as a character, not as a number.
+ */
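+
+/// A minimal read-side sketch, mirroring the write example above (the input literal is an assumption):
+///
+///     ReadBufferFromString in("123\t'abc'");
+///     Int64 x;
+///     String s;
+///     in >> x >> "\t" >> quote >> s;   /// asserts the tab literal, then reads a single-quoted string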
+
+/// Manipulators.
+enum EscapeManip { escape }; /// For strings - escape special characters. In the rest, as usual.
+enum QuoteManip { quote }; /// For strings, dates, datetimes - enclose in single quotes with escaping. In the rest, as usual.
+enum DoubleQuoteManip { double_quote }; /// For strings, dates, datetimes - enclose in double quotes with escaping. In the rest, as usual.
+enum BinaryManip { binary }; /// Output in binary format.
+enum XMLManip { xml }; /// Output strings with XML escaping.
+
+struct EscapeManipWriteBuffer : std::reference_wrapper<WriteBuffer> { using std::reference_wrapper<WriteBuffer>::reference_wrapper; };
+struct QuoteManipWriteBuffer : std::reference_wrapper<WriteBuffer> { using std::reference_wrapper<WriteBuffer>::reference_wrapper; };
+struct DoubleQuoteManipWriteBuffer : std::reference_wrapper<WriteBuffer> { using std::reference_wrapper<WriteBuffer>::reference_wrapper; };
+struct BinaryManipWriteBuffer : std::reference_wrapper<WriteBuffer> { using std::reference_wrapper<WriteBuffer>::reference_wrapper; };
+struct XMLManipWriteBuffer : std::reference_wrapper<WriteBuffer> { using std::reference_wrapper<WriteBuffer>::reference_wrapper; };
+
+struct EscapeManipReadBuffer : std::reference_wrapper<ReadBuffer> { using std::reference_wrapper<ReadBuffer>::reference_wrapper; };
+struct QuoteManipReadBuffer : std::reference_wrapper<ReadBuffer> { using std::reference_wrapper<ReadBuffer>::reference_wrapper; };
+struct DoubleQuoteManipReadBuffer : std::reference_wrapper<ReadBuffer> { using std::reference_wrapper<ReadBuffer>::reference_wrapper; };
+struct BinaryManipReadBuffer : std::reference_wrapper<ReadBuffer> { using std::reference_wrapper<ReadBuffer>::reference_wrapper; };
+
+inline WriteBuffer & operator<<(WriteBuffer & buf, const auto & x) { writeText(x, buf); return buf; }
+inline WriteBuffer & operator<<(WriteBuffer & buf, const pcg32_fast & x) { PcgSerializer::serializePcg32(x, buf); return buf; }
+
+inline EscapeManipWriteBuffer operator<< (WriteBuffer & buf, EscapeManip) { return buf; }
+inline QuoteManipWriteBuffer operator<< (WriteBuffer & buf, QuoteManip) { return buf; }
+inline DoubleQuoteManipWriteBuffer operator<< (WriteBuffer & buf, DoubleQuoteManip) { return buf; }
+inline BinaryManipWriteBuffer operator<< (WriteBuffer & buf, BinaryManip) { return buf; }
+inline XMLManipWriteBuffer operator<< (WriteBuffer & buf, XMLManip) { return buf; }
+
+template <typename T> WriteBuffer & operator<< (EscapeManipWriteBuffer buf, const T & x) { writeText(x, buf.get()); return buf; }
+template <typename T> WriteBuffer & operator<< (QuoteManipWriteBuffer buf, const T & x) { writeQuoted(x, buf.get()); return buf; }
+template <typename T> WriteBuffer & operator<< (DoubleQuoteManipWriteBuffer buf, const T & x) { writeDoubleQuoted(x, buf.get()); return buf; }
+template <typename T> WriteBuffer & operator<< (BinaryManipWriteBuffer buf, const T & x) { writeBinary(x, buf.get()); return buf; }
+template <typename T> WriteBuffer & operator<< (XMLManipWriteBuffer buf, const T & x) { writeText(x, buf.get()); return buf; }
+
+inline WriteBuffer & operator<< (EscapeManipWriteBuffer buf, const String & x) { writeEscapedString(x, buf); return buf; }
+inline WriteBuffer & operator<< (EscapeManipWriteBuffer buf, std::string_view x) { writeEscapedString(x, buf); return buf; }
+inline WriteBuffer & operator<< (EscapeManipWriteBuffer buf, StringRef x) { writeEscapedString(x.toView(), buf); return buf; }
+inline WriteBuffer & operator<< (EscapeManipWriteBuffer buf, const char * x) { writeEscapedString(x, strlen(x), buf); return buf; }
+
+inline WriteBuffer & operator<< (QuoteManipWriteBuffer buf, const char * x) { writeAnyQuotedString<'\''>(x, x + strlen(x), buf.get()); return buf; }
+inline WriteBuffer & operator<< (DoubleQuoteManipWriteBuffer buf, const char * x) { writeAnyQuotedString<'"'>(x, x + strlen(x), buf.get()); return buf; }
+inline WriteBuffer & operator<< (BinaryManipWriteBuffer buf, const char * x) { writeStringBinary(x, buf.get()); return buf; }
+
+inline WriteBuffer & operator<< (XMLManipWriteBuffer buf, std::string_view x) { writeXMLStringForTextElementOrAttributeValue(x, buf); return buf; }
+inline WriteBuffer & operator<< (XMLManipWriteBuffer buf, StringRef x) { writeXMLStringForTextElementOrAttributeValue(x.toView(), buf); return buf; }
+inline WriteBuffer & operator<< (XMLManipWriteBuffer buf, const char * x) { writeXMLStringForTextElementOrAttributeValue(std::string_view(x), buf); return buf; }
+
+/// The manipulator calls the WriteBuffer method `next` - this flushes the buffer. For nested buffers, the flush is not recursive.
+enum FlushManip { flush };
+
+inline WriteBuffer & operator<< (WriteBuffer & buf, FlushManip) { buf.next(); return buf; }
+
+
+template <typename T> ReadBuffer & operator>> (ReadBuffer & buf, T & x) { readText(x, buf); return buf; }
+template <> inline ReadBuffer & operator>> (ReadBuffer & buf, String & x) { readString(x, buf); return buf; }
+template <> inline ReadBuffer & operator>> (ReadBuffer & buf, char & x) { readChar(x, buf); return buf; }
+template <> inline ReadBuffer & operator>> (ReadBuffer & buf, pcg32_fast & x) { PcgDeserializer::deserializePcg32(x, buf); return buf; }
+
+/// If you pass a string literal for reading, it means: assert that this exact sequence of bytes follows in the stream and skip it.
+inline ReadBuffer & operator>> (ReadBuffer & buf, const char * x) { assertString(x, buf); return buf; }
+
+inline EscapeManipReadBuffer operator>> (ReadBuffer & buf, EscapeManip) { return buf; }
+inline QuoteManipReadBuffer operator>> (ReadBuffer & buf, QuoteManip) { return buf; }
+inline DoubleQuoteManipReadBuffer operator>> (ReadBuffer & buf, DoubleQuoteManip) { return buf; }
+inline BinaryManipReadBuffer operator>> (ReadBuffer & buf, BinaryManip) { return buf; }
+
+template <typename T> ReadBuffer & operator>> (EscapeManipReadBuffer buf, T & x) { readText(x, buf.get()); return buf; }
+template <typename T> ReadBuffer & operator>> (QuoteManipReadBuffer buf, T & x) { readQuoted(x, buf.get()); return buf; }
+template <typename T> ReadBuffer & operator>> (DoubleQuoteManipReadBuffer buf, T & x) { readDoubleQuoted(x, buf.get()); return buf; }
+template <typename T> ReadBuffer & operator>> (BinaryManipReadBuffer buf, T & x) { readBinary(x, buf.get()); return buf; }
+
+}
diff --git a/contrib/clickhouse/src/IO/ParallelReadBuffer.cpp b/contrib/clickhouse/src/IO/ParallelReadBuffer.cpp
new file mode 100644
index 0000000000..8d73f22174
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ParallelReadBuffer.cpp
@@ -0,0 +1,307 @@
+#include <IO/ParallelReadBuffer.h>
+#include <IO/SharedThreadPools.h>
+#include <Poco/Logger.h>
+#include <Common/logger_useful.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int UNEXPECTED_END_OF_FILE;
+ extern const int CANNOT_SEEK_THROUGH_FILE;
+ extern const int SEEK_POSITION_OUT_OF_BOUND;
+
+}
+
+// A subrange of the input, read by one thread.
+struct ParallelReadBuffer::ReadWorker
+{
+ ReadWorker(SeekableReadBuffer & input_, size_t offset, size_t size)
+ : input(input_), start_offset(offset), segment(size)
+ {
+ chassert(size);
+ chassert(segment.size() == size);
+ }
+
+ bool hasBytesToConsume() const { return bytes_produced > bytes_consumed; }
+ bool hasBytesToProduce() const { return bytes_produced < segment.size(); }
+
+ SeekableReadBuffer & input;
+ const size_t start_offset; // start of the segment
+
+ Memory<> segment;
+ /// Reader thread produces data, nextImpl() consumes it.
+ /// segment[bytes_consumed..bytes_produced-1] is data waiting to be picked up by nextImpl()
+ /// segment[bytes_produced..] needs to be read from the input ReadBuffer
+ size_t bytes_produced = 0;
+ size_t bytes_consumed = 0;
+
+ std::atomic_bool cancel{false};
+ std::mutex worker_mutex;
+};
+
+ParallelReadBuffer::ParallelReadBuffer(
+ SeekableReadBuffer & input_, ThreadPoolCallbackRunner<void> schedule_, size_t max_working_readers_, size_t range_step_, size_t file_size_)
+ : SeekableReadBuffer(nullptr, 0)
+ , max_working_readers(max_working_readers_)
+ , schedule(std::move(schedule_))
+ , input(input_)
+ , file_size(file_size_)
+ , range_step(std::max(1ul, range_step_))
+{
+ LOG_TRACE(&Poco::Logger::get("ParallelReadBuffer"), "Parallel reading is used");
+
+ try
+ {
+ addReaders();
+ }
+ catch (const Exception &)
+ {
+ finishAndWait();
+ throw;
+ }
+}
+
+bool ParallelReadBuffer::addReaderToPool()
+{
+ if (next_range_start >= file_size)
+ return false;
+ size_t range_start = next_range_start;
+ size_t size = std::min(range_step, file_size - range_start);
+ next_range_start += size;
+
+ auto worker = read_workers.emplace_back(std::make_shared<ReadWorker>(input, range_start, size));
+
+ ++active_working_readers;
+ schedule([this, my_worker = std::move(worker)]() mutable { readerThreadFunction(std::move(my_worker)); }, Priority{});
+
+ return true;
+}
+
+void ParallelReadBuffer::addReaders()
+{
+ while (read_workers.size() < max_working_readers && addReaderToPool())
+ ;
+}
+
+off_t ParallelReadBuffer::seek(off_t offset, int whence)
+{
+ if (whence != SEEK_SET)
+ throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed.");
+
+ if (offset < 0)
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}", offset);
+
+ if (!working_buffer.empty() && static_cast<size_t>(offset) >= current_position - working_buffer.size() && offset < current_position)
+ {
+ pos = working_buffer.end() - (current_position - offset);
+ assert(pos >= working_buffer.begin());
+ assert(pos <= working_buffer.end());
+
+ return offset;
+ }
+
+ const auto offset_is_in_range
+ = [&](const auto & worker) { return static_cast<size_t>(offset) >= worker->start_offset && static_cast<size_t>(offset) < worker->start_offset + worker->segment.size(); };
+
+ while (!read_workers.empty() && !offset_is_in_range(read_workers.front()))
+ {
+ read_workers.front()->cancel = true;
+ read_workers.pop_front();
+ }
+
+ if (!read_workers.empty())
+ {
+ auto & w = read_workers.front();
+ size_t diff = static_cast<size_t>(offset) - w->start_offset;
+ while (true)
+ {
+ std::unique_lock lock{w->worker_mutex};
+
+ if (emergency_stop)
+ handleEmergencyStop();
+
+ if (w->bytes_produced > diff)
+ {
+ working_buffer = internal_buffer = Buffer(
+ w->segment.data(), w->segment.data() + w->bytes_produced);
+ pos = working_buffer.begin() + diff;
+ w->bytes_consumed = w->bytes_produced;
+ current_position = w->start_offset + w->bytes_consumed;
+ addReaders();
+ return offset;
+ }
+
+ next_condvar.wait_for(lock, std::chrono::seconds(10));
+ }
+ }
+
+ finishAndWait();
+
+ read_workers.clear();
+
+ next_range_start = offset;
+ current_position = offset;
+ resetWorkingBuffer();
+
+ emergency_stop = false;
+
+ addReaders();
+ return offset;
+}
+
+size_t ParallelReadBuffer::getFileSize()
+{
+ return file_size;
+}
+
+off_t ParallelReadBuffer::getPosition()
+{
+ return current_position - available();
+}
+
+void ParallelReadBuffer::handleEmergencyStop()
+{
+ // this can only be called from the main thread when there is an exception
+ assert(background_exception);
+ std::rethrow_exception(background_exception);
+}
+
+bool ParallelReadBuffer::nextImpl()
+{
+ while (true)
+ {
+ /// All readers processed, stop
+ if (read_workers.empty())
+ {
+ chassert(next_range_start >= file_size);
+ return false;
+ }
+
+ auto * w = read_workers.front().get();
+
+ std::unique_lock lock{w->worker_mutex};
+
+ if (emergency_stop)
+ handleEmergencyStop(); // throws
+
+ /// Read data from front reader
+ if (w->bytes_produced > w->bytes_consumed)
+ {
+ chassert(w->start_offset + w->bytes_consumed == static_cast<size_t>(current_position));
+
+ working_buffer = internal_buffer = Buffer(
+ w->segment.data() + w->bytes_consumed, w->segment.data() + w->bytes_produced);
+ current_position += working_buffer.size();
+ w->bytes_consumed = w->bytes_produced;
+
+ return true;
+ }
+
+ /// Front reader is done, remove it and add another
+ if (!w->hasBytesToProduce())
+ {
+ lock.unlock();
+ read_workers.pop_front();
+ addReaders();
+
+ continue;
+ }
+
+ /// Nothing to do right now, wait for something to change.
+ ///
+ /// The timeout is a workaround for a race condition.
+ /// emergency_stop is assigned while holding a *different* mutex from the one we're holding
+ /// (exception_mutex vs worker_mutex). So it's possible that our emergency_stop check (above)
+ /// happens before a onBackgroundException() call, but our wait(lock) happens after it.
+ /// Then the wait may get stuck forever.
+ ///
+ /// Note that using wait(lock, [&]{ return emergency_stop || ...; }) wouldn't help because
+ /// it does effectively the same "check, then wait" sequence.
+ ///
+ /// One possible proper fix would be to make onBackgroundException() lock all read_workers
+ /// mutexes too (not necessarily simultaneously - just locking+unlocking them one by one
+ /// between the emergency_stop change and the notify_all() would be enough), but then we
+ /// need another mutex to protect read_workers itself...
+ next_condvar.wait_for(lock, std::chrono::seconds(10));
+ }
+ chassert(false);
+ return false;
+}
+
+void ParallelReadBuffer::readerThreadFunction(ReadWorkerPtr read_worker)
+{
+ SCOPE_EXIT({
+ if (active_working_readers.fetch_sub(1) == 1)
+ active_working_readers.notify_all();
+ });
+
+ try
+ {
+ auto on_progress = [&](size_t bytes_read) -> bool
+ {
+ if (emergency_stop || read_worker->cancel)
+ return true;
+
+ std::lock_guard lock(read_worker->worker_mutex);
+ if (bytes_read <= read_worker->bytes_produced)
+ return false;
+
+ bool need_notify = read_worker->bytes_produced == read_worker->bytes_consumed;
+ read_worker->bytes_produced = bytes_read;
+ if (need_notify)
+ next_condvar.notify_all();
+
+ return false;
+ };
+
+ size_t r = input.readBigAt(read_worker->segment.data(), read_worker->segment.size(), read_worker->start_offset, on_progress);
+
+ if (!on_progress(r) && r < read_worker->segment.size())
+ throw Exception(
+ ErrorCodes::UNEXPECTED_END_OF_FILE,
+ "Failed to read all the data from the reader at offset {}, got {}/{} bytes",
+ read_worker->start_offset, r, read_worker->segment.size());
+ }
+ catch (...)
+ {
+ onBackgroundException();
+ }
+}
+
+void ParallelReadBuffer::onBackgroundException()
+{
+ std::lock_guard lock{exception_mutex};
+ if (!background_exception)
+ background_exception = std::current_exception();
+
+ emergency_stop = true;
+ next_condvar.notify_all();
+}
+
+void ParallelReadBuffer::finishAndWait()
+{
+ emergency_stop = true;
+
+ size_t active_readers = active_working_readers.load();
+ while (active_readers != 0)
+ {
+ active_working_readers.wait(active_readers);
+ active_readers = active_working_readers.load();
+ }
+}
+
+std::unique_ptr<ParallelReadBuffer> wrapInParallelReadBufferIfSupported(
+ ReadBuffer & buf, ThreadPoolCallbackRunner<void> schedule, size_t max_working_readers,
+ size_t range_step, size_t file_size)
+{
+ auto * seekable = dynamic_cast<SeekableReadBuffer*>(&buf);
+ if (!seekable || !seekable->supportsReadAt())
+ return nullptr;
+
+ return std::make_unique<ParallelReadBuffer>(
+ *seekable, schedule, max_working_readers, range_step, file_size);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ParallelReadBuffer.h b/contrib/clickhouse/src/IO/ParallelReadBuffer.h
new file mode 100644
index 0000000000..e76b40f77b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ParallelReadBuffer.h
@@ -0,0 +1,100 @@
+#pragma once
+
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/ReadBuffer.h>
+#include <IO/SeekableReadBuffer.h>
+#include <Interpreters/threadPoolCallbackRunner.h>
+#include <Common/ArenaWithFreeLists.h>
+
+namespace DB
+{
+
+/**
+ * Reads from multiple positions in a ReadBuffer in parallel.
+ * Then reassembles the data into one stream in the original order.
+ *
+ * Each working reader reads its segment of data into a buffer.
+ *
+ * In its nextImpl() method, ParallelReadBuffer takes the first available segment from the first reader in the deque and returns it to the user.
+ * When the first reader finishes, it is removed from the deque and data from the next reader is consumed.
+ *
+ * The number of working readers is limited by max_working_readers.
+ */
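+///
+/// A minimal construction sketch (the pool, source buffer and sizes are assumptions):
+///
+///     auto parallel = wrapInParallelReadBufferIfSupported(
+///         source, threadPoolCallbackRunner<void>(pool, "ParallelRead"),
+///         /* max_working_readers */ 4, /* range_step */ 8 * 1024 * 1024, file_size);
+///     /// `parallel` is nullptr if `source` does not support readBigAt()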
+class ParallelReadBuffer : public SeekableReadBuffer, public WithFileSize
+{
+private:
+    /// Blocks until data appears in the first reader or that reader indicates it has finished.
+    /// Finished readers are removed from the queue and data from the next readers is processed.
+ bool nextImpl() override;
+
+public:
+ ParallelReadBuffer(SeekableReadBuffer & input, ThreadPoolCallbackRunner<void> schedule_, size_t max_working_readers, size_t range_step_, size_t file_size);
+
+ ~ParallelReadBuffer() override { finishAndWait(); }
+
+ off_t seek(off_t off, int whence) override;
+ size_t getFileSize() override;
+ off_t getPosition() override;
+
+ const SeekableReadBuffer & getReadBuffer() const { return input; }
+ SeekableReadBuffer & getReadBuffer() { return input; }
+
+private:
+ /// Reader in progress with a buffer for the segment
+ struct ReadWorker;
+ using ReadWorkerPtr = std::shared_ptr<ReadWorker>;
+
+    /// Whether the first worker in the deque has new data or has processed everything available to it
+    bool currentWorkerReady() const;
+    /// Whether the first worker in the deque has processed and flushed all of its data
+    bool currentWorkerCompleted() const;
+
+ [[noreturn]] void handleEmergencyStop();
+
+ void addReaders();
+ bool addReaderToPool();
+
+ /// Process read_worker, read data and save into the buffer
+ void readerThreadFunction(ReadWorkerPtr read_worker);
+
+ void onBackgroundException();
+ void finishAndWait();
+
+ size_t max_working_readers;
+ std::atomic_size_t active_working_readers{0};
+
+ ThreadPoolCallbackRunner<void> schedule;
+
+ SeekableReadBuffer & input;
+ size_t file_size;
+ size_t range_step;
+ size_t next_range_start{0};
+
+ /**
+ * FIFO queue of readers.
+ * Each worker contains a buffer for the downloaded segment.
+     * After all data for a segment has been read and delivered to the user, the reader is removed
+     * from the deque and data from the next reader is delivered.
+     * After a reader is removed from the deque, addReaders() is called.
+ */
+ std::deque<ReadWorkerPtr> read_workers;
+
+ /// Triggered when new data available
+ std::condition_variable next_condvar;
+
+ std::mutex exception_mutex;
+ std::exception_ptr background_exception = nullptr;
+ std::atomic_bool emergency_stop{false};
+
+ off_t current_position{0}; // end of working_buffer
+
+ bool all_completed{false};
+};
+
+/// If `buf` is a SeekableReadBuffer with supportsReadAt() == true, creates a ParallelReadBuffer
+/// from it. Otherwise returns nullptr.
+std::unique_ptr<ParallelReadBuffer> wrapInParallelReadBufferIfSupported(
+ ReadBuffer & buf, ThreadPoolCallbackRunner<void> schedule, size_t max_working_readers,
+ size_t range_step, size_t file_size);
+
+}
diff --git a/contrib/clickhouse/src/IO/PeekableReadBuffer.cpp b/contrib/clickhouse/src/IO/PeekableReadBuffer.cpp
new file mode 100644
index 0000000000..ce9c20e7a5
--- /dev/null
+++ b/contrib/clickhouse/src/IO/PeekableReadBuffer.cpp
@@ -0,0 +1,378 @@
+#include <IO/PeekableReadBuffer.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int LOGICAL_ERROR;
+}
+
+PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ /*= 0*/)
+ : BufferWithOwnMemory(start_size_), sub_buf(&sub_buf_)
+{
+ padded &= sub_buf->isPadded();
+ /// Read from sub-buffer
+ Buffer & sub_working = sub_buf->buffer();
+ BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf->offset());
+
+ checkStateCorrect();
+}
+
+void PeekableReadBuffer::reset()
+{
+ checkStateCorrect();
+}
+
+void PeekableReadBuffer::setSubBuffer(ReadBuffer & sub_buf_)
+{
+ sub_buf = &sub_buf_;
+ resetImpl();
+}
+
+void PeekableReadBuffer::resetImpl()
+{
+ peeked_size = 0;
+ checkpoint = std::nullopt;
+ checkpoint_in_own_memory = false;
+ use_stack_memory = true;
+
+ if (!currentlyReadFromOwnMemory())
+ sub_buf->position() = pos;
+
+ Buffer & sub_working = sub_buf->buffer();
+ BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf->offset());
+
+ checkStateCorrect();
+}
+
+bool PeekableReadBuffer::peekNext()
+{
+ checkStateCorrect();
+
+ Position copy_from = pos;
+ size_t bytes_to_copy = sub_buf->available();
+ if (useSubbufferOnly())
+ {
+ /// Don't have to copy all data from sub-buffer if there is no data in own memory (checkpoint and pos are in sub-buffer)
+ if (checkpoint)
+ copy_from = *checkpoint;
+ bytes_to_copy = sub_buf->buffer().end() - copy_from;
+ if (!bytes_to_copy)
+ {
+ sub_buf->position() = copy_from;
+
+ /// Both checkpoint and pos are at the end of sub-buffer. Just load next part of data.
+ bool res = sub_buf->next();
+ BufferBase::set(sub_buf->buffer().begin(), sub_buf->buffer().size(), sub_buf->offset());
+ if (checkpoint)
+ checkpoint.emplace(pos);
+
+ checkStateCorrect();
+ return res;
+ }
+ }
+
+ /// May throw an exception
+ resizeOwnMemoryIfNecessary(bytes_to_copy);
+
+ if (useSubbufferOnly())
+ {
+ sub_buf->position() = copy_from;
+ }
+
+ char * memory_data = getMemoryData();
+
+ /// Save unread data from sub-buffer to own memory
+ memcpy(memory_data + peeked_size, sub_buf->position(), bytes_to_copy);
+
+ /// If useSubbufferOnly() is false, then checkpoint is in own memory and it was updated in resizeOwnMemoryIfNecessary
+ /// Otherwise, checkpoint now at the beginning of own memory
+ if (checkpoint && useSubbufferOnly())
+ {
+ checkpoint.emplace(memory_data);
+ checkpoint_in_own_memory = true;
+ }
+
+ if (currentlyReadFromOwnMemory())
+ {
+ /// Update buffer size
+ BufferBase::set(memory_data, peeked_size + bytes_to_copy, offset());
+ }
+ else
+ {
+ /// Switch to reading from own memory
+ size_t pos_offset = peeked_size + this->offset();
+ if (useSubbufferOnly())
+ {
+ if (checkpoint)
+ pos_offset = bytes_to_copy;
+ else
+ pos_offset = 0;
+ }
+ BufferBase::set(memory_data, peeked_size + bytes_to_copy, pos_offset);
+ }
+
+ peeked_size += bytes_to_copy;
+ sub_buf->position() += bytes_to_copy;
+
+ checkStateCorrect();
+ return sub_buf->next();
+}
+
+void PeekableReadBuffer::rollbackToCheckpoint(bool drop)
+{
+ checkStateCorrect();
+
+ assert(checkpoint);
+
+ if (recursive_checkpoints_offsets.empty())
+ {
+ if (checkpointInOwnMemory() == currentlyReadFromOwnMemory())
+ {
+ /// Both checkpoint and position are in the same buffer.
+ pos = *checkpoint;
+ }
+ else
+ {
+ /// Checkpoint is in own memory and position is not.
+ assert(checkpointInOwnMemory());
+
+ char * memory_data = getMemoryData();
+ /// Switch to reading from own memory.
+ BufferBase::set(memory_data, peeked_size, *checkpoint - memory_data);
+ }
+ }
+ else
+ {
+ size_t offset_from_checkpoint = recursive_checkpoints_offsets.top();
+ if (checkpointInOwnMemory() == currentlyReadFromOwnMemory())
+ {
+ /// Both checkpoint and position are in the same buffer.
+ pos = *checkpoint + offset_from_checkpoint;
+ }
+ else
+ {
+ /// Checkpoint is in own memory and position is not.
+ assert(checkpointInOwnMemory());
+
+ size_t offset_from_checkpoint_in_own_memory = offsetFromCheckpointInOwnMemory();
+ if (offset_from_checkpoint >= offset_from_checkpoint_in_own_memory)
+ {
+ /// Recursive checkpoint is in sub buffer with current position.
+ /// Just move position to the recursive checkpoint
+ pos = buffer().begin() + (offset_from_checkpoint - offset_from_checkpoint_in_own_memory);
+ }
+ else
+ {
+ /// Recursive checkpoint is in own memory and position is not.
+ /// Switch to reading from own memory.
+ char * memory_data = getMemoryData();
+ BufferBase::set(memory_data, peeked_size, *checkpoint - memory_data + offset_from_checkpoint);
+ }
+ }
+ }
+
+ if (drop)
+ dropCheckpoint();
+
+ checkStateCorrect();
+}
+
+bool PeekableReadBuffer::nextImpl()
+{
+ /// FIXME: wrong bytes count because it can read the same data again after rollbackToCheckpoint()
+    /// however, changing the bytes count on every call of next() (even after a rollback) allows determining
+    /// whether some pointers were invalidated.
+
+ checkStateCorrect();
+ bool res;
+ bool checkpoint_at_end = checkpoint && *checkpoint == working_buffer.end() && currentlyReadFromOwnMemory();
+
+ if (checkpoint)
+ {
+ if (currentlyReadFromOwnMemory())
+ res = sub_buf->hasPendingData() || sub_buf->next();
+ else
+ res = peekNext();
+ }
+ else
+ {
+ if (useSubbufferOnly())
+ {
+ /// Load next data to sub_buf
+ sub_buf->position() = position();
+ res = sub_buf->next();
+ }
+ else
+ {
+ /// All copied data have been read from own memory, continue reading from sub_buf
+ peeked_size = 0;
+ res = sub_buf->hasPendingData() || sub_buf->next();
+ }
+ }
+
+ /// Switch to reading from sub_buf (or just update it if already switched)
+ Buffer & sub_working = sub_buf->buffer();
+ BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf->offset());
+ nextimpl_working_buffer_offset = sub_buf->offset();
+
+ if (checkpoint_at_end)
+ {
+ checkpoint.emplace(position());
+ peeked_size = 0;
+ checkpoint_in_own_memory = false;
+ }
+
+ checkStateCorrect();
+ return res;
+}
+
+
+void PeekableReadBuffer::checkStateCorrect() const
+{
+ if (checkpoint)
+ {
+ if (checkpointInOwnMemory())
+ {
+ if (!peeked_size)
+ throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Checkpoint in empty own buffer");
+ if (currentlyReadFromOwnMemory() && pos < *checkpoint)
+ throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Current position in own buffer before checkpoint in own buffer");
+ if (!currentlyReadFromOwnMemory() && pos < sub_buf->position())
+ throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Current position in subbuffer less than sub_buf->position()");
+ }
+ else
+ {
+ if (peeked_size)
+ throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Own buffer is not empty");
+ if (currentlyReadFromOwnMemory())
+ throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Current position in own buffer before checkpoint in subbuffer");
+ if (pos < *checkpoint)
+ throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Current position in subbuffer before checkpoint in subbuffer");
+ }
+ }
+ else
+ {
+ if (!currentlyReadFromOwnMemory() && peeked_size)
+ throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Own buffer is not empty");
+ }
+ if (currentlyReadFromOwnMemory() && !peeked_size)
+ throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Pos in empty own buffer");
+}
+
+void PeekableReadBuffer::resizeOwnMemoryIfNecessary(size_t bytes_to_append)
+{
+ checkStateCorrect();
+ bool need_update_checkpoint = checkpointInOwnMemory();
+ bool need_update_pos = currentlyReadFromOwnMemory();
+ size_t offset = 0;
+ if (need_update_checkpoint)
+ {
+ char * memory_data = getMemoryData();
+ offset = *checkpoint - memory_data;
+ }
+ else if (need_update_pos)
+ offset = this->offset();
+
+ size_t new_size = peeked_size + bytes_to_append;
+
+ if (use_stack_memory)
+ {
+ /// If stack memory is still enough, do nothing.
+ if (sizeof(stack_memory) >= new_size)
+ return;
+
+ /// Stack memory is not enough, allocate larger buffer.
+ use_stack_memory = false;
+ memory.resize(std::max(static_cast<size_t>(DBMS_DEFAULT_BUFFER_SIZE), new_size));
+ memcpy(memory.data(), stack_memory, sizeof(stack_memory));
+ if (need_update_checkpoint)
+ checkpoint.emplace(memory.data() + offset);
+ if (need_update_pos)
+ BufferBase::set(memory.data(), peeked_size, pos - stack_memory);
+ }
+ else if (memory.size() < new_size)
+ {
+ if (bytes_to_append < offset && 2 * (peeked_size - offset) <= memory.size())
+ {
+ /// Move unread data to the beginning of own memory instead of resize own memory
+ peeked_size -= offset;
+ memmove(memory.data(), memory.data() + offset, peeked_size);
+
+ if (need_update_checkpoint)
+ *checkpoint -= offset;
+ if (need_update_pos)
+ pos -= offset;
+ }
+ else
+ {
+ size_t pos_offset = pos - memory.data();
+
+ size_t new_size_amortized = memory.size() * 2;
+ if (new_size_amortized < new_size)
+ new_size_amortized = new_size;
+ memory.resize(new_size_amortized);
+
+ if (need_update_checkpoint)
+ checkpoint.emplace(memory.data() + offset);
+ if (need_update_pos)
+ {
+ BufferBase::set(memory.data(), peeked_size, pos_offset);
+ }
+ }
+ }
+ checkStateCorrect();
+}
+
+void PeekableReadBuffer::makeContinuousMemoryFromCheckpointToPos()
+{
+ if (!checkpoint)
+ throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "There is no checkpoint");
+ checkStateCorrect();
+
+ if (!checkpointInOwnMemory() || currentlyReadFromOwnMemory())
+ return; /// it's already continuous
+
+ size_t bytes_to_append = pos - sub_buf->position();
+ resizeOwnMemoryIfNecessary(bytes_to_append);
+ char * memory_data = getMemoryData();
+ memcpy(memory_data + peeked_size, sub_buf->position(), bytes_to_append);
+ sub_buf->position() = pos;
+ peeked_size += bytes_to_append;
+ BufferBase::set(memory_data, peeked_size, peeked_size);
+}
+
+PeekableReadBuffer::~PeekableReadBuffer()
+{
+ if (!currentlyReadFromOwnMemory())
+ sub_buf->position() = pos;
+}
+
+bool PeekableReadBuffer::hasUnreadData() const
+{
+ return peeked_size && pos != getMemoryData() + peeked_size;
+}
+
+size_t PeekableReadBuffer::offsetFromCheckpointInOwnMemory() const
+{
+ return peeked_size - (*checkpoint - getMemoryData());
+}
+
+size_t PeekableReadBuffer::offsetFromCheckpoint() const
+{
+ if (!checkpoint)
+ throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "There is no checkpoint");
+
+ if (checkpointInOwnMemory() == currentlyReadFromOwnMemory())
+ {
+ /// Checkpoint and pos are in the same buffer.
+ return pos - *checkpoint;
+ }
+
+ /// Checkpoint is in own memory, position is in sub buffer.
+ return offset() + offsetFromCheckpointInOwnMemory();
+}
+
+}
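
Editorial note: the checkpoint machinery above is easier to follow on a toy model. Below is a minimal standalone C++ sketch (all names illustrative, not the real class) showing only the core contract of remembering a position and returning to it; the real PeekableReadBuffer additionally copies the bytes between the checkpoint and the current position into its own memory whenever the sub-buffer is refilled.

```cpp
#include <cstddef>
#include <iostream>
#include <optional>
#include <string>

/// Toy model of the checkpoint/rollback contract (illustrative only).
class CheckpointableReader
{
public:
    explicit CheckpointableReader(std::string data_) : data(std::move(data_)) {}

    bool read(char & c)
    {
        if (pos >= data.size())
            return false;
        c = data[pos++];
        return true;
    }

    void setCheckpoint() { checkpoint = pos; }
    void dropCheckpoint() { checkpoint.reset(); }

    void rollbackToCheckpoint()
    {
        if (checkpoint)
            pos = *checkpoint;
    }

private:
    std::string data;
    size_t pos = 0;
    std::optional<size_t> checkpoint;
};

int main()
{
    CheckpointableReader reader("peek");
    char c;
    reader.setCheckpoint();
    reader.read(c);               /// consumes 'p'
    reader.rollbackToCheckpoint();
    reader.read(c);               /// reads 'p' again
    std::cout << c << '\n';       /// prints: p
    reader.dropCheckpoint();
}
```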
diff --git a/contrib/clickhouse/src/IO/PeekableReadBuffer.h b/contrib/clickhouse/src/IO/PeekableReadBuffer.h
new file mode 100644
index 0000000000..78cb319327
--- /dev/null
+++ b/contrib/clickhouse/src/IO/PeekableReadBuffer.h
@@ -0,0 +1,143 @@
+#pragma once
+#include <IO/ReadBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+#include <stack>
+
+namespace DB
+{
+
+/// Allows to set a checkpoint at some position in the stream and come back to it later.
+/// When next() is called, saves the data between the checkpoint and the current position to own memory and loads the next data into the sub-buffer.
+/// The sub-buffer should not be accessed directly during the lifetime of the peekable buffer (unless
+/// you reset() the state of the peekable buffer after each change of the underlying buffer).
+/// If position() of the peekable buffer is explicitly set to some position before the checkpoint
+/// (e.g. by istr.position() = prev_pos), the behavior is undefined.
+class PeekableReadBuffer : public BufferWithOwnMemory<ReadBuffer>
+{
+ friend class PeekableReadBufferCheckpoint;
+public:
+ explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ = 0);
+
+ ~PeekableReadBuffer() override;
+
+ void prefetch(Priority priority) override { sub_buf->prefetch(priority); }
+
+ /// Sets checkpoint at current position
+ ALWAYS_INLINE inline void setCheckpoint()
+ {
+ if (checkpoint)
+ {
+ /// Recursive checkpoints. We just remember offset from the
+ /// first checkpoint to the current position.
+ recursive_checkpoints_offsets.push(offsetFromCheckpoint());
+ return;
+ }
+
+ checkpoint_in_own_memory = currentlyReadFromOwnMemory();
+ if (!checkpoint_in_own_memory)
+ {
+ /// Don't need to store unread data anymore
+ peeked_size = 0;
+ }
+ checkpoint.emplace(pos);
+ }
+
+ /// Forget checkpoint and all data between checkpoint and position
+ ALWAYS_INLINE inline void dropCheckpoint()
+ {
+ assert(checkpoint);
+
+ if (!recursive_checkpoints_offsets.empty())
+ {
+ recursive_checkpoints_offsets.pop();
+ return;
+ }
+
+ if (!currentlyReadFromOwnMemory())
+ {
+ /// Don't need to store unread data anymore
+ peeked_size = 0;
+ }
+ checkpoint = std::nullopt;
+ checkpoint_in_own_memory = false;
+ }
+
+ /// Sets position at checkpoint.
+ /// All pointers (such as this->buffer().end()) may be invalidated
+ void rollbackToCheckpoint(bool drop = false);
+
+ /// If checkpoint and current position are in different buffers, appends data from sub-buffer to own memory,
+ /// so data between checkpoint and position will be in continuous memory.
+ void makeContinuousMemoryFromCheckpointToPos();
+
+    /// Returns true if there is unread data extracted from the sub-buffer into own memory.
+ /// This data will be lost after destruction of peekable buffer.
+ bool hasUnreadData() const;
+
+ // for streaming reading (like in Kafka) we need to restore initial state of the buffer
+ // without recreating the buffer.
+ void reset();
+
+ void setSubBuffer(ReadBuffer & sub_buf_);
+
+ const ReadBuffer & getSubBuffer() const { return *sub_buf; }
+
+private:
+ bool nextImpl() override;
+
+ void resetImpl();
+
+ bool peekNext();
+
+ inline bool useSubbufferOnly() const { return !peeked_size; }
+ inline bool currentlyReadFromOwnMemory() const { return working_buffer.begin() != sub_buf->buffer().begin(); }
+ inline bool checkpointInOwnMemory() const { return checkpoint_in_own_memory; }
+
+ void checkStateCorrect() const;
+
+ /// Makes possible to append `bytes_to_append` bytes to data in own memory.
+ /// Updates all invalidated pointers and sizes.
+ void resizeOwnMemoryIfNecessary(size_t bytes_to_append);
+
+ char * getMemoryData() { return use_stack_memory ? stack_memory : memory.data(); }
+ const char * getMemoryData() const { return use_stack_memory ? stack_memory : memory.data(); }
+
+ size_t offsetFromCheckpointInOwnMemory() const;
+ size_t offsetFromCheckpoint() const;
+
+
+ ReadBuffer * sub_buf;
+ size_t peeked_size = 0;
+ std::optional<Position> checkpoint = std::nullopt;
+ bool checkpoint_in_own_memory = false;
+
+    /// To prevent expensive and in some cases unnecessary memory allocations on PeekableReadBuffer
+    /// creation (for example, if a PeekableReadBuffer is created often or if we need to remember a small amount of
+    /// data after the checkpoint), at the beginning we use a small amount of memory on the stack and allocate
+    /// a larger buffer only if the reserved memory is not enough.
+ char stack_memory[PADDING_FOR_SIMD];
+ bool use_stack_memory = true;
+
+ std::stack<size_t> recursive_checkpoints_offsets;
+};
+
+
+class PeekableReadBufferCheckpoint : boost::noncopyable
+{
+ PeekableReadBuffer & buf;
+ bool auto_rollback;
+public:
+ explicit PeekableReadBufferCheckpoint(PeekableReadBuffer & buf_, bool auto_rollback_ = false)
+ : buf(buf_), auto_rollback(auto_rollback_) { buf.setCheckpoint(); }
+ ~PeekableReadBufferCheckpoint()
+ {
+ if (!buf.checkpoint)
+ return;
+ if (auto_rollback)
+ buf.rollbackToCheckpoint();
+ buf.dropCheckpoint();
+ }
+
+};
+
+}
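
Editorial note: a hypothetical usage sketch of the RAII guard declared above. It assumes the ClickHouse IO headers from this diff and a caller-supplied `DB::ReadBuffer`, so it is not compilable standalone; the keyword check itself is invented for illustration. With `auto_rollback_ = true`, an early return restores the stream position automatically, and explicitly dropping the checkpoint "commits" the consumed bytes, because the guard's destructor does nothing once `buf.checkpoint` is empty.

```cpp
#include <IO/PeekableReadBuffer.h>

/// Tentatively consume a one-byte keyword marker; roll back on mismatch.
/// (Hypothetical helper, for illustration only.)
bool tryParseKeyword(DB::ReadBuffer & in)
{
    DB::PeekableReadBuffer peekable(in);
    DB::PeekableReadBufferCheckpoint guard(peekable, /* auto_rollback_= */ true);

    char c;
    if (!peekable.read(c) || c != 'K')
        return false;            /// guard destructor rolls back, then drops

    peekable.dropCheckpoint();   /// commit: the destructor then does nothing
    return true;
}
```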
diff --git a/contrib/clickhouse/src/IO/Progress.cpp b/contrib/clickhouse/src/IO/Progress.cpp
new file mode 100644
index 0000000000..145281e140
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Progress.cpp
@@ -0,0 +1,240 @@
+#include "Progress.h"
+
+#include <IO/ReadBuffer.h>
+#include <IO/WriteBuffer.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <Core/ProtocolDefines.h>
+
+
+namespace DB
+{
+
+namespace
+{
+ UInt64 getApproxTotalRowsToRead(UInt64 read_rows, UInt64 read_bytes, UInt64 total_bytes_to_read)
+ {
+ if (!read_rows || !read_bytes)
+ return 0;
+
+ auto bytes_per_row = std::ceil(static_cast<double>(read_bytes) / read_rows);
+ return static_cast<UInt64>(std::ceil(static_cast<double>(total_bytes_to_read) / bytes_per_row));
+ }
+}
+
+void ProgressValues::read(ReadBuffer & in, UInt64 server_revision)
+{
+ readVarUInt(read_rows, in);
+ readVarUInt(read_bytes, in);
+ readVarUInt(total_rows_to_read, in);
+ if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_TOTAL_BYTES_IN_PROGRESS)
+ {
+ readVarUInt(total_bytes_to_read, in);
+ }
+ if (server_revision >= DBMS_MIN_REVISION_WITH_CLIENT_WRITE_INFO)
+ {
+ readVarUInt(written_rows, in);
+ readVarUInt(written_bytes, in);
+ }
+ if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_SERVER_QUERY_TIME_IN_PROGRESS)
+ {
+ readVarUInt(elapsed_ns, in);
+ }
+}
+
+
+void ProgressValues::write(WriteBuffer & out, UInt64 client_revision) const
+{
+ writeVarUInt(read_rows, out);
+ writeVarUInt(read_bytes, out);
+    /// In the new TCP protocol we can send total_bytes_to_read without total_rows_to_read.
+    /// If the client doesn't support total_bytes_to_read, send an approximate total_rows_to_read
+    /// to indicate at least approximate progress.
+ if (client_revision < DBMS_MIN_PROTOCOL_VERSION_WITH_TOTAL_BYTES_IN_PROGRESS && total_bytes_to_read && !total_rows_to_read)
+ writeVarUInt(getApproxTotalRowsToRead(read_rows, read_bytes, total_bytes_to_read), out);
+ else
+ writeVarUInt(total_rows_to_read, out);
+ if (client_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_TOTAL_BYTES_IN_PROGRESS)
+ {
+ writeVarUInt(total_bytes_to_read, out);
+ }
+ if (client_revision >= DBMS_MIN_REVISION_WITH_CLIENT_WRITE_INFO)
+ {
+ writeVarUInt(written_rows, out);
+ writeVarUInt(written_bytes, out);
+ }
+ if (client_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_SERVER_QUERY_TIME_IN_PROGRESS)
+ {
+ writeVarUInt(elapsed_ns, out);
+ }
+}
+
+void ProgressValues::writeJSON(WriteBuffer & out) const
+{
+ /// Numbers are written in double quotes (as strings) to avoid loss of precision
+ /// of 64-bit integers after interpretation by JavaScript.
+
+ writeCString("{", out);
+ writeCString("\"read_rows\":\"", out);
+ writeText(read_rows, out);
+ writeCString("\",\"read_bytes\":\"", out);
+ writeText(read_bytes, out);
+ writeCString("\",\"written_rows\":\"", out);
+ writeText(written_rows, out);
+ writeCString("\",\"written_bytes\":\"", out);
+ writeText(written_bytes, out);
+ writeCString("\",\"total_rows_to_read\":\"", out);
+ writeText(total_rows_to_read, out);
+ writeCString("\",\"result_rows\":\"", out);
+ writeText(result_rows, out);
+ writeCString("\",\"result_bytes\":\"", out);
+ writeText(result_bytes, out);
+ writeCString("\"", out);
+ writeCString("}", out);
+}
+
+bool Progress::incrementPiecewiseAtomically(const Progress & rhs)
+{
+ read_rows += rhs.read_rows;
+ read_bytes += rhs.read_bytes;
+
+ total_rows_to_read += rhs.total_rows_to_read;
+ total_bytes_to_read += rhs.total_bytes_to_read;
+
+ written_rows += rhs.written_rows;
+ written_bytes += rhs.written_bytes;
+
+ result_rows += rhs.result_rows;
+ result_bytes += rhs.result_bytes;
+
+ elapsed_ns += rhs.elapsed_ns;
+
+ return rhs.read_rows || rhs.written_rows;
+}
+
+void Progress::reset()
+{
+ read_rows = 0;
+ read_bytes = 0;
+
+ total_rows_to_read = 0;
+ total_bytes_to_read = 0;
+
+ written_rows = 0;
+ written_bytes = 0;
+
+ result_rows = 0;
+ result_bytes = 0;
+
+ elapsed_ns = 0;
+}
+
+ProgressValues Progress::getValues() const
+{
+ ProgressValues res;
+
+ res.read_rows = read_rows.load(std::memory_order_relaxed);
+ res.read_bytes = read_bytes.load(std::memory_order_relaxed);
+
+ res.total_rows_to_read = total_rows_to_read.load(std::memory_order_relaxed);
+ res.total_bytes_to_read = total_bytes_to_read.load(std::memory_order_relaxed);
+
+ res.written_rows = written_rows.load(std::memory_order_relaxed);
+ res.written_bytes = written_bytes.load(std::memory_order_relaxed);
+
+ res.result_rows = result_rows.load(std::memory_order_relaxed);
+ res.result_bytes = result_bytes.load(std::memory_order_relaxed);
+
+ res.elapsed_ns = elapsed_ns.load(std::memory_order_relaxed);
+
+ return res;
+}
+
+ProgressValues Progress::fetchValuesAndResetPiecewiseAtomically()
+{
+ ProgressValues res;
+
+ res.read_rows = read_rows.fetch_and(0);
+ res.read_bytes = read_bytes.fetch_and(0);
+
+ res.total_rows_to_read = total_rows_to_read.fetch_and(0);
+ res.total_bytes_to_read = total_bytes_to_read.fetch_and(0);
+
+ res.written_rows = written_rows.fetch_and(0);
+ res.written_bytes = written_bytes.fetch_and(0);
+
+ res.result_rows = result_rows.fetch_and(0);
+ res.result_bytes = result_bytes.fetch_and(0);
+
+ res.elapsed_ns = elapsed_ns.fetch_and(0);
+
+ return res;
+}
+
+Progress Progress::fetchAndResetPiecewiseAtomically()
+{
+ Progress res;
+
+ res.read_rows = read_rows.fetch_and(0);
+ res.read_bytes = read_bytes.fetch_and(0);
+
+ res.total_rows_to_read = total_rows_to_read.fetch_and(0);
+ res.total_bytes_to_read = total_bytes_to_read.fetch_and(0);
+
+ res.written_rows = written_rows.fetch_and(0);
+ res.written_bytes = written_bytes.fetch_and(0);
+
+ res.result_rows = result_rows.fetch_and(0);
+ res.result_bytes = result_bytes.fetch_and(0);
+
+ res.elapsed_ns = elapsed_ns.fetch_and(0);
+
+ return res;
+}
+
+Progress & Progress::operator=(Progress && other) noexcept
+{
+ read_rows = other.read_rows.load(std::memory_order_relaxed);
+ read_bytes = other.read_bytes.load(std::memory_order_relaxed);
+
+ total_rows_to_read = other.total_rows_to_read.load(std::memory_order_relaxed);
+ total_bytes_to_read = other.total_bytes_to_read.load(std::memory_order_relaxed);
+
+ written_rows = other.written_rows.load(std::memory_order_relaxed);
+ written_bytes = other.written_bytes.load(std::memory_order_relaxed);
+
+ result_rows = other.result_rows.load(std::memory_order_relaxed);
+ result_bytes = other.result_bytes.load(std::memory_order_relaxed);
+
+ elapsed_ns = other.elapsed_ns.load(std::memory_order_relaxed);
+
+ return *this;
+}
+
+void Progress::read(ReadBuffer & in, UInt64 server_revision)
+{
+ ProgressValues values;
+ values.read(in, server_revision);
+
+ read_rows.store(values.read_rows, std::memory_order_relaxed);
+ read_bytes.store(values.read_bytes, std::memory_order_relaxed);
+ total_rows_to_read.store(values.total_rows_to_read, std::memory_order_relaxed);
+ total_bytes_to_read.store(values.total_bytes_to_read, std::memory_order_relaxed);
+
+ written_rows.store(values.written_rows, std::memory_order_relaxed);
+ written_bytes.store(values.written_bytes, std::memory_order_relaxed);
+
+ elapsed_ns.store(values.elapsed_ns, std::memory_order_relaxed);
+}
+
+void Progress::write(WriteBuffer & out, UInt64 client_revision) const
+{
+ getValues().write(out, client_revision);
+}
+
+void Progress::writeJSON(WriteBuffer & out) const
+{
+ getValues().writeJSON(out);
+}
+
+}
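
Editorial note: every counter above goes over the wire through readVarUInt/writeVarUInt. Below is a standalone sketch of that variable-length coding, assuming it is the usual little-endian base-128 scheme (7 payload bits per byte, high bit set while more bytes follow), which keeps small counters at a single byte; the function names mirror the ones above but the implementation is illustrative.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

/// Little-endian base-128 varint encoding (illustrative sketch).
void writeVarUInt(uint64_t x, std::vector<uint8_t> & out)
{
    while (x >= 0x80)
    {
        out.push_back(static_cast<uint8_t>(x) | 0x80);  /// 7 bits + continuation flag
        x >>= 7;
    }
    out.push_back(static_cast<uint8_t>(x));             /// final byte, high bit clear
}

uint64_t readVarUInt(const std::vector<uint8_t> & in, size_t & pos)
{
    uint64_t x = 0;
    for (size_t shift = 0; pos < in.size(); shift += 7)
    {
        uint8_t byte = in[pos++];
        x |= static_cast<uint64_t>(byte & 0x7F) << shift;
        if (!(byte & 0x80))
            break;                                      /// no continuation bit: done
    }
    return x;
}

int main()
{
    std::vector<uint8_t> buf;
    writeVarUInt(300, buf);                             /// encodes to 2 bytes: 0xAC 0x02
    size_t pos = 0;
    std::cout << readVarUInt(buf, pos) << '\n';         /// prints: 300
}
```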
diff --git a/contrib/clickhouse/src/IO/Progress.h b/contrib/clickhouse/src/IO/Progress.h
new file mode 100644
index 0000000000..a68ff9bc5c
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Progress.h
@@ -0,0 +1,151 @@
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <functional>
+#include <base/types.h>
+
+#include <Core/Defines.h>
+
+namespace DB
+{
+
+class ReadBuffer;
+class WriteBuffer;
+
+/// See Progress.
+struct ProgressValues
+{
+ UInt64 read_rows = 0;
+ UInt64 read_bytes = 0;
+
+ UInt64 total_rows_to_read = 0;
+ UInt64 total_bytes_to_read = 0;
+
+ UInt64 written_rows = 0;
+ UInt64 written_bytes = 0;
+
+ UInt64 result_rows = 0;
+ UInt64 result_bytes = 0;
+
+ UInt64 elapsed_ns = 0;
+
+ void read(ReadBuffer & in, UInt64 server_revision);
+ void write(WriteBuffer & out, UInt64 client_revision) const;
+ void writeJSON(WriteBuffer & out) const;
+};
+
+struct ReadProgress
+{
+ UInt64 read_rows = 0;
+ UInt64 read_bytes = 0;
+ UInt64 total_rows_to_read = 0;
+ UInt64 total_bytes_to_read = 0;
+
+ ReadProgress(UInt64 read_rows_, UInt64 read_bytes_, UInt64 total_rows_to_read_ = 0, UInt64 total_bytes_to_read_ = 0)
+ : read_rows(read_rows_), read_bytes(read_bytes_), total_rows_to_read(total_rows_to_read_), total_bytes_to_read(total_bytes_to_read_) {}
+};
+
+struct WriteProgress
+{
+ UInt64 written_rows = 0;
+ UInt64 written_bytes = 0;
+
+ WriteProgress(UInt64 written_rows_, UInt64 written_bytes_)
+ : written_rows(written_rows_), written_bytes(written_bytes_) {}
+};
+
+struct ResultProgress
+{
+ UInt64 result_rows = 0;
+ UInt64 result_bytes = 0;
+
+ ResultProgress(UInt64 result_rows_, UInt64 result_bytes_)
+ : result_rows(result_rows_), result_bytes(result_bytes_) {}
+};
+
+struct FileProgress
+{
+    /// Here read_bytes (raw bytes) does not equal ReadProgress::read_bytes, which is calculated according to column types.
+ UInt64 read_bytes = 0;
+ UInt64 total_bytes_to_read = 0;
+
+ explicit FileProgress(UInt64 read_bytes_, UInt64 total_bytes_to_read_ = 0) : read_bytes(read_bytes_), total_bytes_to_read(total_bytes_to_read_) {}
+};
+
+
+/** Progress of query execution.
+  * Values transferred over the network are deltas - how much was done after the previously sent value.
+  * The same struct is also used for summarized values.
+  */
+struct Progress
+{
+ std::atomic<UInt64> read_rows {0}; /// Rows (source) processed.
+ std::atomic<UInt64> read_bytes {0}; /// Bytes (uncompressed, source) processed.
+
+    /** How many rows/bytes must be processed, in total, approximately. A non-zero value is sent when there is information about
+      * some new part of the job. Received values must be summed to get an estimate of the total rows to process.
+ */
+ std::atomic<UInt64> total_rows_to_read {0};
+ std::atomic<UInt64> total_bytes_to_read {0};
+
+ std::atomic<UInt64> written_rows {0};
+ std::atomic<UInt64> written_bytes {0};
+
+ std::atomic<UInt64> result_rows {0};
+ std::atomic<UInt64> result_bytes {0};
+
+ std::atomic<UInt64> elapsed_ns {0};
+
+ Progress() = default;
+
+ Progress(UInt64 read_rows_, UInt64 read_bytes_, UInt64 total_rows_to_read_ = 0, UInt64 total_bytes_to_read_ = 0)
+ : read_rows(read_rows_), read_bytes(read_bytes_), total_rows_to_read(total_rows_to_read_), total_bytes_to_read(total_bytes_to_read_) {}
+
+ explicit Progress(ReadProgress read_progress)
+ : read_rows(read_progress.read_rows), read_bytes(read_progress.read_bytes), total_rows_to_read(read_progress.total_rows_to_read) {}
+
+ explicit Progress(WriteProgress write_progress)
+ : written_rows(write_progress.written_rows), written_bytes(write_progress.written_bytes) {}
+
+ explicit Progress(ResultProgress result_progress)
+ : result_rows(result_progress.result_rows), result_bytes(result_progress.result_bytes) {}
+
+ explicit Progress(FileProgress file_progress)
+ : read_bytes(file_progress.read_bytes), total_bytes_to_read(file_progress.total_bytes_to_read) {}
+
+ void read(ReadBuffer & in, UInt64 server_revision);
+
+ void write(WriteBuffer & out, UInt64 client_revision) const;
+
+ /// Progress in JSON format (single line, without whitespaces) is used in HTTP headers.
+ void writeJSON(WriteBuffer & out) const;
+
+ /// Each value separately is changed atomically (but not whole object).
+ bool incrementPiecewiseAtomically(const Progress & rhs);
+
+ void reset();
+
+ ProgressValues getValues() const;
+
+ ProgressValues fetchValuesAndResetPiecewiseAtomically();
+
+ Progress fetchAndResetPiecewiseAtomically();
+
+ Progress & operator=(Progress && other) noexcept;
+
+ Progress(Progress && other) noexcept
+ {
+ *this = std::move(other);
+ }
+};
+
+
+/** Callback to track the progress of the query.
+ * Used in QueryPipeline and Context.
+ * The function takes the number of rows in the last block, the number of bytes in the last block.
+ * Note that the callback can be called from different threads.
+ */
+using ProgressCallback = std::function<void(const Progress & progress)>;
+
+}
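
Editorial note: the struct above is only "piecewise" atomic - each counter is an independent std::atomic, the object as a whole is not. Below is a minimal standalone sketch of the drain pattern behind fetchValuesAndResetPiecewiseAtomically(), where fetch_and(0) atomically reads and zeroes an unsigned counter; all names are illustrative.

```cpp
#include <atomic>
#include <cstdint>
#include <iostream>
#include <utility>

/// Piecewise-atomic counters (illustrative sketch, not the real Progress).
struct Counters
{
    std::atomic<uint64_t> read_rows{0};
    std::atomic<uint64_t> read_bytes{0};

    void increment(uint64_t rows, uint64_t bytes)
    {
        read_rows.fetch_add(rows, std::memory_order_relaxed);
        read_bytes.fetch_add(bytes, std::memory_order_relaxed);
    }

    /// Drain: no increment is ever lost, but the two counters are not reset
    /// at the same instant, so a reader may see them slightly out of sync.
    std::pair<uint64_t, uint64_t> fetchAndReset()
    {
        return {read_rows.fetch_and(0), read_bytes.fetch_and(0)};
    }
};

int main()
{
    Counters c;
    c.increment(100, 4096);
    auto [rows, bytes] = c.fetchAndReset();
    std::cout << rows << " rows, " << bytes << " bytes\n"; /// 100 rows, 4096 bytes
    std::cout << c.read_rows.load() << '\n';               /// 0
}
```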
diff --git a/contrib/clickhouse/src/IO/ReadBuffer.cpp b/contrib/clickhouse/src/IO/ReadBuffer.cpp
new file mode 100644
index 0000000000..bf054d0842
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBuffer.cpp
@@ -0,0 +1,47 @@
+#include <IO/ReadBuffer.h>
+
+
+namespace DB
+{
+
+namespace
+{
+ template <typename CustomData>
+ class ReadBufferWrapper : public ReadBuffer
+ {
+ public:
+ ReadBufferWrapper(ReadBuffer & in_, CustomData && custom_data_)
+ : ReadBuffer(in_.buffer().begin(), in_.buffer().size(), in_.offset()), in(in_), custom_data(std::move(custom_data_))
+ {
+ }
+
+ private:
+ ReadBuffer & in;
+ CustomData custom_data;
+
+ bool nextImpl() override
+ {
+ in.position() = position();
+ if (!in.next())
+ {
+ set(in.position(), 0);
+ return false;
+ }
+ BufferBase::set(in.buffer().begin(), in.buffer().size(), in.offset());
+ return true;
+ }
+ };
+}
+
+
+std::unique_ptr<ReadBuffer> wrapReadBufferReference(ReadBuffer & ref)
+{
+    return std::make_unique<ReadBufferWrapper<std::nullptr_t>>(ref, nullptr);
+}
+
+std::unique_ptr<ReadBuffer> wrapReadBufferPointer(ReadBufferPtr ptr)
+{
+ return std::make_unique<ReadBufferWrapper<ReadBufferPtr>>(*ptr, ReadBufferPtr{ptr});
+}
+
+}
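
Editorial note: the CustomData template parameter of the wrapper above is effectively an ownership knob - nullptr keeps a bare reference, while a ReadBufferPtr stored as custom data keeps the wrapped buffer alive. Below is a standalone sketch of the same trick on a plain string; the types are illustrative only.

```cpp
#include <cstddef>
#include <iostream>
#include <memory>
#include <string>

/// The wrapper stores arbitrary "custom data" next to the wrapped reference.
/// Passing nullptr keeps a bare reference; passing a shared_ptr makes the
/// wrapper co-own the target for its whole lifetime.
template <typename CustomData>
struct Wrapper
{
    std::string & target;
    CustomData custom_data;
};

int main()
{
    auto owned = std::make_shared<std::string>("hello");
    Wrapper<std::shared_ptr<std::string>> owning{*owned, owned};
    owned.reset();                         /// wrapper still co-owns the string
    std::cout << owning.target << '\n';    /// prints: hello

    std::string local = "world";
    Wrapper<std::nullptr_t> non_owning{local, nullptr};
    std::cout << non_owning.target << '\n';
}
```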
diff --git a/contrib/clickhouse/src/IO/ReadBuffer.h b/contrib/clickhouse/src/IO/ReadBuffer.h
new file mode 100644
index 0000000000..a4ae12f506
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBuffer.h
@@ -0,0 +1,277 @@
+#pragma once
+
+#include <cassert>
+#include <cstring>
+#include <algorithm>
+#include <memory>
+
+#include <Common/Exception.h>
+#include <Common/Priority.h>
+#include <IO/BufferBase.h>
+#include <IO/AsynchronousReader.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ATTEMPT_TO_READ_AFTER_EOF;
+ extern const int CANNOT_READ_ALL_DATA;
+ extern const int NOT_IMPLEMENTED;
+}
+
+static constexpr auto DEFAULT_PREFETCH_PRIORITY = Priority{0};
+
+/** A simple abstract class for buffered data reading (char sequences) from somewhere.
+  * Unlike std::istream, it provides access to the internal buffer,
+  * and also allows you to manually manage the position inside the buffer.
+  *
+  * Note! `char *`, not `const char *` is used
+  *  (so that you can take out the common code into BufferBase, and also so that you can fill the buffer with new data).
+  * This causes inconveniences - for example, when using ReadBuffer to read from a chunk of `const char *` memory,
+  * you have to use const_cast.
+ *
+ * Derived classes must implement the nextImpl() method.
+ */
+class ReadBuffer : public BufferBase
+{
+public:
+ /** Creates a buffer and sets a piece of available data to read to zero size,
+ * so that the next() function is called to load the new data portion into the buffer at the first try.
+ */
+ ReadBuffer(Position ptr, size_t size) : BufferBase(ptr, size, 0) { working_buffer.resize(0); }
+
+ /** Used when the buffer is already full of data that can be read.
+ * (in this case, pass 0 as an offset)
+ */
+ ReadBuffer(Position ptr, size_t size, size_t offset) : BufferBase(ptr, size, offset) {}
+
+ // Copying the read buffers can be dangerous because they can hold a lot of
+ // memory or open files, so better to disable the copy constructor to prevent
+ // accidental copying.
+ ReadBuffer(const ReadBuffer &) = delete;
+
+    // FIXME: behavior differs greatly from `BufferBase::set()` and it's very confusing.
+ void set(Position ptr, size_t size) { BufferBase::set(ptr, size, 0); working_buffer.resize(0); }
+
+ /** read next data and fill a buffer with it; set position to the beginning of the new data
+ * (but not necessarily to the beginning of working_buffer!);
+ * return `false` in case of end, `true` otherwise; throw an exception, if something is wrong;
+ *
+ * if an exception was thrown, is the ReadBuffer left in a usable state? this varies across implementations;
+ * can the caller retry next() after an exception, or call other methods? not recommended
+ */
+ bool next()
+ {
+ assert(!hasPendingData());
+ assert(position() <= working_buffer.end());
+
+ bytes += offset();
+ bool res = nextImpl();
+ if (!res)
+ working_buffer = Buffer(pos, pos);
+ else
+ {
+ pos = working_buffer.begin() + nextimpl_working_buffer_offset;
+ assert(position() != working_buffer.end());
+ }
+ nextimpl_working_buffer_offset = 0;
+
+ assert(position() <= working_buffer.end());
+
+ return res;
+ }
+
+
+ inline void nextIfAtEnd()
+ {
+ if (!hasPendingData())
+ next();
+ }
+
+ virtual ~ReadBuffer() = default;
+
+
+    /** Unlike std::istream, it returns true if all data was read
+      * (and not in case there was an attempt to read after the end).
+      * If at the moment the position is at the end of the buffer, it calls the next() method.
+      * That is, it has a side effect - if the buffer is over, then it updates it and sets the position to the beginning.
+      *
+      * An attempt to read after the end should throw an exception.
+ */
+ bool ALWAYS_INLINE eof()
+ {
+ return !hasPendingData() && !next();
+ }
+
+ void ignore()
+ {
+ if (!eof())
+ ++pos;
+ else
+ throwReadAfterEOF();
+ }
+
+ void ignore(size_t n)
+ {
+ while (n != 0 && !eof())
+ {
+ size_t bytes_to_ignore = std::min(static_cast<size_t>(working_buffer.end() - pos), n);
+ pos += bytes_to_ignore;
+ n -= bytes_to_ignore;
+ }
+
+ if (n)
+ throwReadAfterEOF();
+ }
+
+    /// You could call this method `ignore`, and have `ignore` call `ignoreStrict`.
+ size_t tryIgnore(size_t n)
+ {
+ size_t bytes_ignored = 0;
+
+ while (bytes_ignored < n && !eof())
+ {
+ size_t bytes_to_ignore = std::min(static_cast<size_t>(working_buffer.end() - pos), n - bytes_ignored);
+ pos += bytes_to_ignore;
+ bytes_ignored += bytes_to_ignore;
+ }
+
+ return bytes_ignored;
+ }
+
+ void ignoreAll()
+ {
+ tryIgnore(std::numeric_limits<size_t>::max());
+ }
+
+ /// Peeks a single byte.
+ bool ALWAYS_INLINE peek(char & c)
+ {
+ if (eof())
+ return false;
+ c = *pos;
+ return true;
+ }
+
+ /// Reads a single byte.
+ [[nodiscard]] bool ALWAYS_INLINE read(char & c)
+ {
+ if (peek(c))
+ {
+ ++pos;
+ return true;
+ }
+
+ return false;
+ }
+
+ void ALWAYS_INLINE readStrict(char & c)
+ {
+ if (read(c))
+ return;
+ throwReadAfterEOF();
+ }
+
+    /** Reads as many bytes as there are, but no more than n. */
+ [[nodiscard]] size_t read(char * to, size_t n)
+ {
+ size_t bytes_copied = 0;
+
+ while (bytes_copied < n && !eof())
+ {
+ size_t bytes_to_copy = std::min(static_cast<size_t>(working_buffer.end() - pos), n - bytes_copied);
+ ::memcpy(to + bytes_copied, pos, bytes_to_copy);
+ pos += bytes_to_copy;
+ bytes_copied += bytes_to_copy;
+ }
+
+ return bytes_copied;
+ }
+
+ /** Reads n bytes, if there are less - throws an exception. */
+ void readStrict(char * to, size_t n)
+ {
+ auto read_bytes = read(to, n);
+ if (n != read_bytes)
+ throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA,
+ "Cannot read all data. Bytes read: {}. Bytes expected: {}.", read_bytes, std::to_string(n));
+ }
+
+ /** A method that can be more efficiently implemented in derived classes, in the case of reading large enough blocks.
+ * The implementation can read data directly into `to`, without superfluous copying, if in `to` there is enough space for work.
+ * For example, a CompressedReadBuffer can decompress the data directly into `to`, if the entire decompressed block fits there.
+ * By default - the same as read.
+ * Don't use for small reads.
+ */
+ [[nodiscard]] virtual size_t readBig(char * to, size_t n) { return read(to, n); }
+
+ /** Do something to allow faster subsequent call to 'nextImpl' if possible.
+ * It's used for asynchronous readers with double-buffering.
+ * `priority` is the `ThreadPool` priority, with which the prefetch task will be scheduled.
+ * Lower value means higher priority.
+ */
+ virtual void prefetch(Priority) {}
+
+ /**
+ * Set upper bound for read range [..., position).
+ * Useful for reading from remote filesystem, when it matters how much we read.
+ * Doesn't affect getFileSize().
+ * See also: SeekableReadBuffer::supportsRightBoundedReads().
+ *
+ * Behavior in weird cases is currently implementation-defined:
+ * - setReadUntilPosition() below current position,
+ * - setReadUntilPosition() above the end of the file,
+ * - seek() to a position above the until position (even if you setReadUntilPosition() to a
+ * higher value right after the seek!),
+ *
+ * Typical implementations discard any current buffers and connections, even if the position is
+ * adjusted only a little.
+ *
+ * Typical usage is to call it right after creating the ReadBuffer, before it started doing any
+ * work.
+ */
+ virtual void setReadUntilPosition(size_t /* position */) {}
+
+ virtual void setReadUntilEnd() {}
+
+ /// Read at most `size` bytes into data at specified offset `offset`. First ignore `ignore` bytes if `ignore` > 0.
+    /// Notice: this function only needs to be implemented in synchronous read buffers that are wrapped into asynchronous ones,
+    /// such as ReadBufferFromRemoteFSGather and AsynchronousReadIndirectBufferFromRemoteFS.
+ virtual IAsynchronousReader::Result readInto(char * /*data*/, size_t /*size*/, size_t /*offset*/, size_t /*ignore*/)
+ {
+ throw Exception(ErrorCodes::NOT_IMPLEMENTED, "readInto not implemented");
+ }
+
+protected:
+ /// The number of bytes to ignore from the initial position of `working_buffer`
+ /// buffer. Apparently this is an additional out-parameter for nextImpl(),
+ /// not a real field.
+ size_t nextimpl_working_buffer_offset = 0;
+
+private:
+ /** Read the next data and fill a buffer with it.
+ * Return `false` in case of the end, `true` otherwise.
+ * Throw an exception if something is wrong.
+ */
+ virtual bool nextImpl() { return false; }
+
+ [[noreturn]] static void throwReadAfterEOF()
+ {
+ throw Exception(ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF, "Attempt to read after eof");
+ }
+};
+
+
+using ReadBufferPtr = std::shared_ptr<ReadBuffer>;
+
+/// Due to inconsistencies in the ReadBuffer family of interfaces:
+///  - some require fully wrapping the underlying buffer and owning it,
+///  - some just wrap the reference without ownership,
+/// we need to be able to wrap reference-only buffers with a movable transparent proxy-buffer.
+/// The uniqueness of such wraps is the responsibility of the code author.
+std::unique_ptr<ReadBuffer> wrapReadBufferReference(ReadBuffer & ref);
+std::unique_ptr<ReadBuffer> wrapReadBufferPointer(ReadBufferPtr ptr);
+
+}
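
Editorial note: the next()/nextImpl() contract above (expose a window of available data, refill it on exhaustion, return false only at EOF) can be shown on a toy buffer. Below is a minimal standalone sketch, not the real class, that hands out a string in 4-byte chunks.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <string>

/// Toy model of the ReadBuffer refill contract (illustrative only).
class ToyReadBuffer
{
public:
    explicit ToyReadBuffer(std::string source_) : source(std::move(source_)) {}

    bool eof()
    {
        return pos == end && !next();   /// refill on exhaustion, like ReadBuffer::eof()
    }

    char * pos = nullptr;               /// current position in the window
    char * end = nullptr;               /// end of available data

private:
    bool next()                         /// analogous to next() calling nextImpl()
    {
        if (offset >= source.size())
            return false;               /// false only at genuine EOF
        size_t n = std::min<size_t>(4, source.size() - offset);
        std::memcpy(chunk, source.data() + offset, n);
        offset += n;
        pos = chunk;
        end = chunk + n;
        return true;
    }

    std::string source;
    size_t offset = 0;
    char chunk[4];
};

int main()
{
    ToyReadBuffer buf("hello, world");
    std::string out;
    while (!buf.eof())
        out += *buf.pos++;
    std::cout << out << '\n';           /// prints: hello, world
}
```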
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromEmptyFile.h b/contrib/clickhouse/src/IO/ReadBufferFromEmptyFile.h
new file mode 100644
index 0000000000..f21f2f507d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromEmptyFile.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+#include <IO/ReadBufferFromFileBase.h>
+
+namespace DB
+{
+
+/// In the case of an empty file it does not make any sense to read it.
+///
+/// Plus, regular file readers have an assert that the buffer is not empty, which would fail:
+/// - ReadBufferFromFileDescriptor
+/// - SynchronousReader
+/// - ThreadPoolReader
+class ReadBufferFromEmptyFile : public ReadBufferFromFileBase
+{
+private:
+ bool nextImpl() override { return false; }
+ std::string getFileName() const override { return "<empty>"; }
+ off_t seek(off_t /*off*/, int /*whence*/) override { return 0; }
+ off_t getPosition() override { return 0; }
+ size_t getFileSize() override { return 0; }
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromEncryptedFile.cpp b/contrib/clickhouse/src/IO/ReadBufferFromEncryptedFile.cpp
new file mode 100644
index 0000000000..f9cf159715
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromEncryptedFile.cpp
@@ -0,0 +1,106 @@
+#include <IO/ReadBufferFromEncryptedFile.h>
+
+#if USE_SSL
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int ARGUMENT_OUT_OF_BOUND;
+}
+
+ReadBufferFromEncryptedFile::ReadBufferFromEncryptedFile(
+ size_t buffer_size_,
+ std::unique_ptr<ReadBufferFromFileBase> in_,
+ const String & key_,
+ const FileEncryption::Header & header_,
+ size_t offset_)
+ : ReadBufferFromFileBase(buffer_size_, nullptr, 0)
+ , in(std::move(in_))
+ , encrypted_buffer(buffer_size_)
+ , encryptor(header_.algorithm, key_, header_.init_vector)
+{
+ offset = offset_;
+ need_seek = true;
+}
+
+off_t ReadBufferFromEncryptedFile::seek(off_t off, int whence)
+{
+ off_t new_pos;
+ if (whence == SEEK_SET)
+ {
+ if (off < 0)
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "SEEK_SET underflow: off = {}", off);
+ new_pos = off;
+ }
+ else if (whence == SEEK_CUR)
+ {
+ if (off < 0 && -off > getPosition())
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "SEEK_CUR shift out of bounds");
+ new_pos = getPosition() + off;
+ }
+ else
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "ReadBufferFromFileEncrypted::seek expects SEEK_SET or SEEK_CUR as whence");
+
+ if ((offset - static_cast<off_t>(working_buffer.size()) <= new_pos) && (new_pos <= offset) && !need_seek)
+ {
+ /// Position is still inside buffer.
+ pos = working_buffer.end() - offset + new_pos;
+ assert(pos >= working_buffer.begin());
+ assert(pos <= working_buffer.end());
+ }
+ else
+ {
+ need_seek = true;
+ offset = new_pos;
+
+ /// No more reading from the current working buffer until next() is called.
+ resetWorkingBuffer();
+ assert(!hasPendingData());
+ }
+
+ return new_pos;
+}
+
+off_t ReadBufferFromEncryptedFile::getPosition()
+{
+ return offset - available();
+}
+
+bool ReadBufferFromEncryptedFile::nextImpl()
+{
+ if (need_seek)
+ {
+ off_t raw_offset = offset + FileEncryption::Header::kSize;
+ if (in->seek(raw_offset, SEEK_SET) != raw_offset)
+ return false;
+ need_seek = false;
+ }
+
+ if (in->eof())
+ return false;
+
+ /// Read up to the size of `encrypted_buffer`.
+ size_t bytes_read = 0;
+ while (bytes_read < encrypted_buffer.size() && !in->eof())
+ {
+ bytes_read += in->read(encrypted_buffer.data() + bytes_read, encrypted_buffer.size() - bytes_read);
+ }
+
+    /// The cipher algorithms used generate the same number of bytes in the output as there were in the input,
+    /// so after deciphering the number of bytes will still be `bytes_read`.
+ working_buffer.resize(bytes_read);
+
+ /// The decryptor needs to know what the current offset is (because it's used in the decryption algorithm).
+ encryptor.setOffset(offset);
+
+ encryptor.decrypt(encrypted_buffer.data(), bytes_read, working_buffer.begin());
+
+ offset += bytes_read;
+ pos = working_buffer.begin();
+ return true;
+}
+
+}
+
+#endif
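
Editorial note: the seek() above does no real I/O when the target position still falls inside the loaded window; otherwise it only records need_seek and drops the buffer, deferring the actual seek to the next refill. Below is a standalone sketch of that lazy-seek pattern with illustrative fields and no encryption.

```cpp
#include <cstddef>
#include <iostream>

/// Lazy buffered seek (illustrative sketch, not the real read buffer).
struct BufferedSeeker
{
    size_t window_begin = 0;   /// file offset of the first buffered byte
    size_t window_size = 0;    /// how many bytes are buffered
    size_t pos = 0;            /// current position, as a file offset
    bool need_refill = true;

    void seek(size_t new_pos)
    {
        if (!need_refill && new_pos >= window_begin && new_pos <= window_begin + window_size)
        {
            pos = new_pos;             /// cheap: stay inside the loaded window
        }
        else
        {
            pos = new_pos;
            window_size = 0;
            need_refill = true;        /// real I/O deferred to the next read
        }
    }
};

int main()
{
    BufferedSeeker s;
    s.window_begin = 0;
    s.window_size = 4096;
    s.need_refill = false;
    s.seek(100);
    std::cout << (s.need_refill ? "refill" : "in-buffer") << '\n';  /// in-buffer
    s.seek(100000);
    std::cout << (s.need_refill ? "refill" : "in-buffer") << '\n';  /// refill
}
```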
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromEncryptedFile.h b/contrib/clickhouse/src/IO/ReadBufferFromEncryptedFile.h
new file mode 100644
index 0000000000..155dc2ccce
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromEncryptedFile.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_SSL
+#include <IO/ReadBufferFromFileBase.h>
+#include <IO/FileEncryptionCommon.h>
+
+
+namespace DB
+{
+
+/// Reads data from the underlying read buffer and decrypts it.
+class ReadBufferFromEncryptedFile : public ReadBufferFromFileBase
+{
+public:
+ ReadBufferFromEncryptedFile(
+ size_t buffer_size_,
+ std::unique_ptr<ReadBufferFromFileBase> in_,
+ const String & key_,
+ const FileEncryption::Header & header_,
+ size_t offset_ = 0);
+
+ off_t seek(off_t off, int whence) override;
+ off_t getPosition() override;
+
+ std::string getFileName() const override { return in->getFileName(); }
+
+ void setReadUntilPosition(size_t position) override { in->setReadUntilPosition(position + FileEncryption::Header::kSize); }
+
+ void setReadUntilEnd() override { in->setReadUntilEnd(); }
+
+ size_t getFileSize() override { return in->getFileSize(); }
+
+private:
+ bool nextImpl() override;
+
+ std::unique_ptr<ReadBufferFromFileBase> in;
+
+ off_t offset = 0;
+
+ bool need_seek = false;
+
+ Memory<> encrypted_buffer;
+ FileEncryption::Encryptor encryptor;
+};
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromFile.cpp b/contrib/clickhouse/src/IO/ReadBufferFromFile.cpp
new file mode 100644
index 0000000000..79ac62c642
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromFile.cpp
@@ -0,0 +1,97 @@
+#include <fcntl.h>
+
+#include <IO/ReadBufferFromFile.h>
+#include <IO/WriteHelpers.h>
+#include <Common/ProfileEvents.h>
+#include <base/defines.h>
+#include <cerrno>
+
+
+namespace ProfileEvents
+{
+ extern const Event FileOpen;
+}
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int FILE_DOESNT_EXIST;
+ extern const int CANNOT_OPEN_FILE;
+ extern const int CANNOT_CLOSE_FILE;
+}
+
+
+ReadBufferFromFile::ReadBufferFromFile(
+ const std::string & file_name_,
+ size_t buf_size,
+ int flags,
+ char * existing_memory,
+ size_t alignment,
+ std::optional<size_t> file_size_,
+ ThrottlerPtr throttler_)
+ : ReadBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment, file_size_, throttler_)
+ , file_name(file_name_)
+{
+ ProfileEvents::increment(ProfileEvents::FileOpen);
+
+#ifdef OS_DARWIN
+ bool o_direct = (flags != -1) && (flags & O_DIRECT);
+ if (o_direct)
+ flags = flags & ~O_DIRECT;
+#endif
+ fd = ::open(file_name.c_str(), flags == -1 ? O_RDONLY | O_CLOEXEC : flags | O_CLOEXEC);
+
+ if (-1 == fd)
+ throwFromErrnoWithPath("Cannot open file " + file_name, file_name,
+ errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
+#ifdef OS_DARWIN
+ if (o_direct)
+ {
+ if (fcntl(fd, F_NOCACHE, 1) == -1)
+ throwFromErrnoWithPath("Cannot set F_NOCACHE on file " + file_name, file_name, ErrorCodes::CANNOT_OPEN_FILE);
+ }
+#endif
+}
+
+
+ReadBufferFromFile::ReadBufferFromFile(
+ int & fd_,
+ const std::string & original_file_name,
+ size_t buf_size,
+ char * existing_memory,
+ size_t alignment,
+ std::optional<size_t> file_size_,
+ ThrottlerPtr throttler_)
+ : ReadBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment, file_size_, throttler_)
+ , file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name)
+{
+ fd_ = -1;
+}
+
+
+ReadBufferFromFile::~ReadBufferFromFile()
+{
+ if (fd < 0)
+ return;
+
+ int err = ::close(fd);
+ chassert(!err || errno == EINTR);
+}
+
+
+void ReadBufferFromFile::close()
+{
+ if (fd < 0)
+ return;
+
+ if (0 != ::close(fd))
+ throw Exception(ErrorCodes::CANNOT_CLOSE_FILE, "Cannot close file");
+
+ fd = -1;
+ metric_increment.destroy();
+}
+
+}
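
Editorial note: a standalone POSIX sketch of the open logic above - O_CLOEXEC is always OR-ed in so descriptors do not leak into child processes, and since macOS has no O_DIRECT, cache bypass is requested there with fcntl(F_NOCACHE) after the open. The helper name and error handling are illustrative, not ClickHouse's.

```cpp
#include <fcntl.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>
#include <cstring>

/// Open a file for reading, optionally bypassing the page cache (sketch).
int openForRead(const char * path, bool bypass_page_cache)
{
    int flags = O_RDONLY | O_CLOEXEC;
#ifdef O_DIRECT
    if (bypass_page_cache)
        flags |= O_DIRECT;             /// Linux: direct I/O via open flag
#endif
    int fd = ::open(path, flags);
    if (fd == -1)
    {
        std::fprintf(stderr, "Cannot open %s: %s\n", path, std::strerror(errno));
        return -1;
    }
#if defined(__APPLE__)
    if (bypass_page_cache && ::fcntl(fd, F_NOCACHE, 1) == -1)
    {
        std::fprintf(stderr, "Cannot set F_NOCACHE: %s\n", std::strerror(errno));
        ::close(fd);
        return -1;
    }
#endif
    return fd;
}

int main()
{
    int fd = openForRead("/etc/hosts", /* bypass_page_cache= */ false);
    if (fd >= 0)
        ::close(fd);
}
```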
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromFile.h b/contrib/clickhouse/src/IO/ReadBufferFromFile.h
new file mode 100644
index 0000000000..462453d974
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromFile.h
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <IO/ReadBufferFromFileDescriptor.h>
+#include <IO/OpenedFileCache.h>
+#include <Common/CurrentMetrics.h>
+
+
+namespace CurrentMetrics
+{
+ extern const Metric OpenFileForRead;
+}
+
+namespace DB
+{
+
+/** Accepts a path to a file and opens it, or uses a pre-opened file descriptor.
+  * Closes the file by itself (thus "owns" the file descriptor).
+  */
+class ReadBufferFromFile : public ReadBufferFromFileDescriptor
+{
+protected:
+ std::string file_name;
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForRead};
+
+public:
+ explicit ReadBufferFromFile(
+ const std::string & file_name_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ int flags = -1,
+ char * existing_memory = nullptr,
+ size_t alignment = 0,
+ std::optional<size_t> file_size_ = std::nullopt,
+ ThrottlerPtr throttler = {});
+
+ /// Use pre-opened file descriptor.
+ explicit ReadBufferFromFile(
+ int & fd, /// Will be set to -1 if constructor didn't throw and ownership of file descriptor is passed to the object.
+ const std::string & original_file_name = {},
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0,
+ std::optional<size_t> file_size_ = std::nullopt,
+ ThrottlerPtr throttler = {});
+
+ ~ReadBufferFromFile() override;
+
+ /// Close file before destruction of object.
+ void close();
+
+ std::string getFileName() const override
+ {
+ return file_name;
+ }
+
+ size_t getFileOffsetOfBufferEnd() const override { return file_offset_of_buffer_end; }
+};
+
+
+/** Similar to ReadBufferFromFile but it uses 'pread' instead of 'read'.
+ */
+class ReadBufferFromFilePRead : public ReadBufferFromFile
+{
+public:
+ explicit ReadBufferFromFilePRead(
+ const std::string & file_name_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ int flags = -1,
+ char * existing_memory = nullptr,
+ size_t alignment = 0,
+ std::optional<size_t> file_size_ = std::nullopt)
+ : ReadBufferFromFile(file_name_, buf_size, flags, existing_memory, alignment, file_size_)
+ {
+ use_pread = true;
+ }
+};
+
+
+/** Similar to ReadBufferFromFilePRead but also transparently shares open file descriptors.
+ */
+class ReadBufferFromFilePReadWithDescriptorsCache : public ReadBufferFromFileDescriptorPRead
+{
+private:
+ std::string file_name;
+ OpenedFileCache::OpenedFilePtr file;
+
+public:
+ explicit ReadBufferFromFilePReadWithDescriptorsCache(
+ const std::string & file_name_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ int flags = -1,
+ char * existing_memory = nullptr,
+ size_t alignment = 0,
+ std::optional<size_t> file_size_ = std::nullopt,
+ ThrottlerPtr throttler_ = {})
+ : ReadBufferFromFileDescriptorPRead(-1, buf_size, existing_memory, alignment, file_size_, throttler_)
+ , file_name(file_name_)
+ {
+ file = OpenedFileCache::instance().get(file_name, flags);
+ fd = file->getFD();
+ }
+
+ std::string getFileName() const override
+ {
+ return file_name;
+ }
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromFileBase.cpp b/contrib/clickhouse/src/IO/ReadBufferFromFileBase.cpp
new file mode 100644
index 0000000000..4ac3f984f7
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromFileBase.cpp
@@ -0,0 +1,49 @@
+#include <IO/ReadBufferFromFileBase.h>
+#include <IO/Progress.h>
+#include <Interpreters/Context.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int UNKNOWN_FILE_SIZE;
+}
+
+ReadBufferFromFileBase::ReadBufferFromFileBase() : BufferWithOwnMemory<SeekableReadBuffer>(0)
+{
+}
+
+ReadBufferFromFileBase::ReadBufferFromFileBase(
+ size_t buf_size,
+ char * existing_memory,
+ size_t alignment,
+ std::optional<size_t> file_size_)
+ : BufferWithOwnMemory<SeekableReadBuffer>(buf_size, existing_memory, alignment)
+ , file_size(file_size_)
+{
+}
+
+ReadBufferFromFileBase::~ReadBufferFromFileBase() = default;
+
+size_t ReadBufferFromFileBase::getFileSize()
+{
+ if (file_size)
+ return *file_size;
+ throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out file size for read buffer");
+}
+
+void ReadBufferFromFileBase::setProgressCallback(ContextPtr context)
+{
+ auto file_progress_callback = context->getFileProgressCallback();
+
+ if (!file_progress_callback)
+ return;
+
+ setProfileCallback([file_progress_callback](const ProfileInfo & progress)
+ {
+ file_progress_callback(FileProgress(progress.bytes_read));
+ });
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromFileBase.h b/contrib/clickhouse/src/IO/ReadBufferFromFileBase.h
new file mode 100644
index 0000000000..b77db29bc2
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromFileBase.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/SeekableReadBuffer.h>
+#include <IO/WithFileName.h>
+#include <Interpreters/Context_fwd.h>
+#include <base/time.h>
+
+#include <functional>
+#include <utility>
+#include <string>
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fcntl.h>
+
+#ifndef O_DIRECT
+#define O_DIRECT 00040000
+#endif
+
+
+namespace DB
+{
+
+class ReadBufferFromFileBase : public BufferWithOwnMemory<SeekableReadBuffer>, public WithFileName, public WithFileSize
+{
+public:
+ ReadBufferFromFileBase();
+ ReadBufferFromFileBase(
+ size_t buf_size,
+ char * existing_memory,
+ size_t alignment,
+ std::optional<size_t> file_size_ = std::nullopt);
+ ~ReadBufferFromFileBase() override;
+
+ /// It is possible to get information about the time of each reading.
+ struct ProfileInfo
+ {
+ size_t bytes_requested;
+ size_t bytes_read;
+ size_t nanoseconds;
+ };
+
+ using ProfileCallback = std::function<void(ProfileInfo)>;
+
+ /// CLOCK_MONOTONIC_COARSE is more than enough to track long reads - for example, hanging for a second.
+ void setProfileCallback(const ProfileCallback & profile_callback_, clockid_t clock_type_ = CLOCK_MONOTONIC_COARSE)
+ {
+ profile_callback = profile_callback_;
+ clock_type = clock_type_;
+ }
+
+ size_t getFileSize() override;
+
+ void setProgressCallback(ContextPtr context);
+
+protected:
+ std::optional<size_t> file_size;
+ ProfileCallback profile_callback;
+ clockid_t clock_type{};
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromFileDecorator.cpp b/contrib/clickhouse/src/IO/ReadBufferFromFileDecorator.cpp
new file mode 100644
index 0000000000..9ac0fb4e47
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromFileDecorator.cpp
@@ -0,0 +1,60 @@
+#include <IO/ReadBufferFromFileDecorator.h>
+
+
+namespace DB
+{
+
+ReadBufferFromFileDecorator::ReadBufferFromFileDecorator(std::unique_ptr<SeekableReadBuffer> impl_)
+ : ReadBufferFromFileDecorator(std::move(impl_), "")
+{
+}
+
+
+ReadBufferFromFileDecorator::ReadBufferFromFileDecorator(std::unique_ptr<SeekableReadBuffer> impl_, const String & file_name_)
+ : impl(std::move(impl_)), file_name(file_name_)
+{
+ swap(*impl);
+}
+
+
+std::string ReadBufferFromFileDecorator::getFileName() const
+{
+ if (!file_name.empty())
+ return file_name;
+
+ return getFileNameFromReadBuffer(*impl);
+}
+
+
+off_t ReadBufferFromFileDecorator::getPosition()
+{
+ swap(*impl);
+ auto position = impl->getPosition();
+ swap(*impl);
+ return position;
+}
+
+
+off_t ReadBufferFromFileDecorator::seek(off_t off, int whence)
+{
+ swap(*impl);
+ auto result = impl->seek(off, whence);
+ swap(*impl);
+ return result;
+}
+
+
+bool ReadBufferFromFileDecorator::nextImpl()
+{
+ swap(*impl);
+ auto result = impl->next();
+ swap(*impl);
+ return result;
+}
+
+size_t ReadBufferFromFileDecorator::getFileSize()
+{
+ return getFileSizeFromReadBuffer(*impl);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromFileDecorator.h b/contrib/clickhouse/src/IO/ReadBufferFromFileDecorator.h
new file mode 100644
index 0000000000..6e62c7f741
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromFileDecorator.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <IO/ReadBufferFromFileBase.h>
+
+
+namespace DB
+{
+
+/// Delegates all reads to underlying buffer. Doesn't have own memory.
+class ReadBufferFromFileDecorator : public ReadBufferFromFileBase
+{
+public:
+ explicit ReadBufferFromFileDecorator(std::unique_ptr<SeekableReadBuffer> impl_);
+ ReadBufferFromFileDecorator(std::unique_ptr<SeekableReadBuffer> impl_, const String & file_name_);
+
+ std::string getFileName() const override;
+
+ off_t getPosition() override;
+
+ off_t seek(off_t off, int whence) override;
+
+ bool nextImpl() override;
+
+ bool isWithFileSize() const { return dynamic_cast<const WithFileSize *>(impl.get()) != nullptr; }
+
+ const ReadBuffer & getWrappedReadBuffer() const { return *impl; }
+
+ ReadBuffer & getWrappedReadBuffer() { return *impl; }
+
+ size_t getFileSize() override;
+
+protected:
+ std::unique_ptr<SeekableReadBuffer> impl;
+ String file_name;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromFileDescriptor.cpp b/contrib/clickhouse/src/IO/ReadBufferFromFileDescriptor.cpp
new file mode 100644
index 0000000000..6c0c1681a4
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromFileDescriptor.cpp
@@ -0,0 +1,288 @@
+#include <cerrno>
+#include <ctime>
+#include <optional>
+#include <Common/ProfileEvents.h>
+#include <Common/Stopwatch.h>
+#include <Common/Exception.h>
+#include <Common/CurrentMetrics.h>
+#include <Common/Throttler.h>
+#include <IO/ReadBufferFromFileDescriptor.h>
+#include <IO/WriteHelpers.h>
+#include <Common/filesystemHelpers.h>
+#include <sys/stat.h>
+#include <Interpreters/Context.h>
+
+
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+
+namespace ProfileEvents
+{
+ extern const Event ReadBufferFromFileDescriptorRead;
+ extern const Event ReadBufferFromFileDescriptorReadFailed;
+ extern const Event ReadBufferFromFileDescriptorReadBytes;
+ extern const Event DiskReadElapsedMicroseconds;
+ extern const Event Seek;
+ extern const Event LocalReadThrottlerBytes;
+ extern const Event LocalReadThrottlerSleepMicroseconds;
+}
+
+namespace CurrentMetrics
+{
+ extern const Metric Read;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_READ_FROM_FILE_DESCRIPTOR;
+ extern const int ARGUMENT_OUT_OF_BOUND;
+ extern const int CANNOT_SEEK_THROUGH_FILE;
+ extern const int CANNOT_SELECT;
+ extern const int CANNOT_ADVISE;
+}
+
+
+std::string ReadBufferFromFileDescriptor::getFileName() const
+{
+ return "(fd = " + toString(fd) + ")";
+}
+
+
+size_t ReadBufferFromFileDescriptor::readImpl(char * to, size_t min_bytes, size_t max_bytes, size_t offset)
+{
+ chassert(min_bytes <= max_bytes);
+
+    /// This is a workaround for a read-past-EOF bug in the Linux kernel with pread()
+ if (file_size.has_value() && offset >= *file_size)
+ return 0;
+
+ size_t bytes_read = 0;
+ while (bytes_read < min_bytes)
+ {
+ ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorRead);
+
+ Stopwatch watch(profile_callback ? clock_type : CLOCK_MONOTONIC);
+
+ ssize_t res = 0;
+ size_t to_read = max_bytes - bytes_read;
+ {
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::Read};
+
+ if (use_pread)
+ res = ::pread(fd, to + bytes_read, to_read, offset + bytes_read);
+ else
+ res = ::read(fd, to + bytes_read, to_read);
+ }
+ if (!res)
+ break;
+
+ if (-1 == res && errno != EINTR)
+ {
+ ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorReadFailed);
+ throwFromErrnoWithPath("Cannot read from file: " + getFileName(), getFileName(), ErrorCodes::CANNOT_READ_FROM_FILE_DESCRIPTOR);
+ }
+
+ if (res > 0)
+ {
+ bytes_read += res;
+ if (throttler)
+ throttler->add(res, ProfileEvents::LocalReadThrottlerBytes, ProfileEvents::LocalReadThrottlerSleepMicroseconds);
+ }
+
+
+        /// It reports real time spent, including the time spent while the thread was preempted doing nothing.
+ /// And it is Ok for the purpose of this watch (it is used to lower the number of threads to read from tables).
+ /// Sometimes it is better to use taskstats::blkio_delay_total, but it is quite expensive to get it
+ /// (NetlinkMetricsProvider has about 500K RPS).
+ watch.stop();
+ ProfileEvents::increment(ProfileEvents::DiskReadElapsedMicroseconds, watch.elapsedMicroseconds());
+
+ if (profile_callback)
+ {
+ ProfileInfo info;
+ info.bytes_requested = to_read;
+ info.bytes_read = res;
+ info.nanoseconds = watch.elapsed();
+ profile_callback(info);
+ }
+ }
+
+ if (bytes_read)
+ ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorReadBytes, bytes_read);
+
+ return bytes_read;
+}
+
+
+bool ReadBufferFromFileDescriptor::nextImpl()
+{
+    /// If internal_buffer is empty, then a zero-byte read() cannot be distinguished from EOF
+ assert(!internal_buffer.empty());
+
+ size_t bytes_read = readImpl(internal_buffer.begin(), 1, internal_buffer.size(), file_offset_of_buffer_end);
+
+ file_offset_of_buffer_end += bytes_read;
+
+ if (bytes_read)
+ {
+ working_buffer = internal_buffer;
+ working_buffer.resize(bytes_read);
+ }
+ else
+ return false;
+
+ return true;
+}
+
+
+void ReadBufferFromFileDescriptor::prefetch(Priority)
+{
+#if defined(POSIX_FADV_WILLNEED)
+ /// For direct IO, loading data into page cache is pointless.
+ if (required_alignment)
+ return;
+
+ /// Ask OS to prefetch data into page cache.
+ if (0 != posix_fadvise(fd, file_offset_of_buffer_end, internal_buffer.size(), POSIX_FADV_WILLNEED))
+ throwFromErrno("Cannot posix_fadvise", ErrorCodes::CANNOT_ADVISE);
+#endif
+}
+
+
+/// If 'offset' is small enough to stay in buffer after seek, then true seek in file does not happen.
+off_t ReadBufferFromFileDescriptor::seek(off_t offset, int whence)
+{
+ size_t new_pos;
+ if (whence == SEEK_SET)
+ {
+ assert(offset >= 0);
+ new_pos = offset;
+ }
+ else if (whence == SEEK_CUR)
+ {
+ new_pos = file_offset_of_buffer_end - (working_buffer.end() - pos) + offset;
+ }
+ else
+ {
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "ReadBufferFromFileDescriptor::seek expects SEEK_SET or SEEK_CUR as whence");
+ }
+
+ /// Position is unchanged.
+ if (new_pos + (working_buffer.end() - pos) == file_offset_of_buffer_end)
+ return new_pos;
+
+ if (file_offset_of_buffer_end - working_buffer.size() <= static_cast<size_t>(new_pos)
+ && new_pos <= file_offset_of_buffer_end)
+ {
+ /// Position is still inside the buffer.
+ /// Probably it is at the end of the buffer - then we will load data on the following 'next' call.
+
+ pos = working_buffer.end() - file_offset_of_buffer_end + new_pos;
+ assert(pos >= working_buffer.begin());
+ assert(pos <= working_buffer.end());
+
+ return new_pos;
+ }
+ else
+ {
+ /// Position is out of the buffer, we need to do real seek.
+ off_t seek_pos = required_alignment > 1
+ ? new_pos / required_alignment * required_alignment
+ : new_pos;
+
+ off_t offset_after_seek_pos = new_pos - seek_pos;
+
+ /// First reset the buffer so the next read will fetch new data to the buffer.
+ resetWorkingBuffer();
+
+ /// In case of using 'pread' we just update the info about the next position in file.
+ /// In case of using 'read' we call 'lseek'.
+
+ /// We account both cases as seek event as it leads to non-contiguous reads from file.
+ ProfileEvents::increment(ProfileEvents::Seek);
+
+ if (!use_pread)
+ {
+ Stopwatch watch(profile_callback ? clock_type : CLOCK_MONOTONIC);
+
+ off_t res = ::lseek(fd, seek_pos, SEEK_SET);
+ if (-1 == res)
+ throwFromErrnoWithPath(fmt::format("Cannot seek through file {} at offset {}", getFileName(), seek_pos), getFileName(),
+ ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
+
+ /// Also note that seeking past the file size is not allowed.
+ if (res != seek_pos)
+ throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE,
+ "The 'lseek' syscall returned value ({}) that is not expected ({})", res, seek_pos);
+
+ watch.stop();
+ ProfileEvents::increment(ProfileEvents::DiskReadElapsedMicroseconds, watch.elapsedMicroseconds());
+ }
+
+ file_offset_of_buffer_end = seek_pos;
+
+ if (offset_after_seek_pos > 0)
+ ignore(offset_after_seek_pos);
+
+ return seek_pos;
+ }
+}
+
+
+void ReadBufferFromFileDescriptor::rewind()
+{
+ if (!use_pread)
+ {
+ ProfileEvents::increment(ProfileEvents::Seek);
+ off_t res = ::lseek(fd, 0, SEEK_SET);
+ if (-1 == res)
+ throwFromErrnoWithPath("Cannot seek through file " + getFileName(), getFileName(),
+ ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
+ }
+ /// In case of pread, the ProfileEvents::Seek is not accounted, but it's Ok.
+
+    /// Clear the buffer of existing data. New data will be read on a subsequent call to 'next'.
+ working_buffer.resize(0);
+ pos = working_buffer.begin();
+ file_offset_of_buffer_end = 0;
+}
+
+
+/// Assuming file descriptor supports 'select', check that we have data to read or wait until timeout.
+bool ReadBufferFromFileDescriptor::poll(size_t timeout_microseconds) const
+{
+ fd_set fds;
+ FD_ZERO(&fds);
+ FD_SET(fd, &fds);
+ timeval timeout = { time_t(timeout_microseconds / 1000000), suseconds_t(timeout_microseconds % 1000000) };
+
+    /// The first argument of select() must be the highest-numbered descriptor plus one.
+    int res = select(fd + 1, &fds, nullptr, nullptr, &timeout);
+
+ if (-1 == res)
+ throwFromErrno("Cannot select", ErrorCodes::CANNOT_SELECT);
+
+ return res > 0;
+}
+
+
+size_t ReadBufferFromFileDescriptor::getFileSize()
+{
+ return getSizeFromFileDescriptor(fd, getFileName());
+}
+
+bool ReadBufferFromFileDescriptor::checkIfActuallySeekable()
+{
+ struct stat stat;
+ auto res = ::fstat(fd, &stat);
+ return res == 0 && S_ISREG(stat.st_mode);
+}
+
+size_t ReadBufferFromFileDescriptor::readBigAt(char * to, size_t n, size_t offset, const std::function<bool(size_t)> &)
+{
+ chassert(use_pread);
+ return readImpl(to, n, n, offset);
+}
+
+}
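
Editorial note: the heart of readImpl() above is a retry loop - read()/pread() may return fewer bytes than requested or fail with EINTR, so the call is repeated until at least min_bytes have arrived or EOF is hit. Below is a standalone sketch without the metrics, throttling and profiling; the helper name is illustrative.

```cpp
#include <fcntl.h>
#include <unistd.h>
#include <cerrno>
#include <cstddef>
#include <cstdio>

/// Read at least min_bytes (up to max_bytes) at the given offset, retrying
/// on EINTR and short reads; returns the byte count, 0 meaning EOF (sketch).
size_t readAtLeast(int fd, char * to, size_t min_bytes, size_t max_bytes, off_t offset)
{
    size_t bytes_read = 0;
    while (bytes_read < min_bytes)
    {
        ssize_t res = ::pread(fd, to + bytes_read, max_bytes - bytes_read, offset + bytes_read);
        if (res == 0)
            break;                     /// EOF
        if (res == -1)
        {
            if (errno == EINTR)
                continue;              /// interrupted by a signal, retry
            std::perror("pread");
            break;
        }
        bytes_read += static_cast<size_t>(res);
    }
    return bytes_read;
}

int main()
{
    int fd = ::open("/etc/hosts", O_RDONLY | O_CLOEXEC);
    if (fd == -1)
        return 1;
    char buf[64];
    size_t n = readAtLeast(fd, buf, 1, sizeof(buf), 0);
    std::printf("read %zu bytes\n", n);
    ::close(fd);
}
```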
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromFileDescriptor.h b/contrib/clickhouse/src/IO/ReadBufferFromFileDescriptor.h
new file mode 100644
index 0000000000..64340770cf
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromFileDescriptor.h
@@ -0,0 +1,103 @@
+#pragma once
+
+#include <IO/ReadBufferFromFileBase.h>
+#include <Interpreters/Context_fwd.h>
+#include <Common/Throttler_fwd.h>
+
+#include <unistd.h>
+
+
+namespace DB
+{
+
+/** Use ready file descriptor. Does not open or close a file.
+ */
+class ReadBufferFromFileDescriptor : public ReadBufferFromFileBase
+{
+protected:
+ const size_t required_alignment = 0; /// For O_DIRECT both file offsets and memory addresses have to be aligned.
+ bool use_pread = false; /// To access one fd from multiple threads, use 'pread' syscall instead of 'read'.
+
+ size_t file_offset_of_buffer_end = 0; /// What offset in file corresponds to working_buffer.end().
+
+ int fd;
+
+ ThrottlerPtr throttler;
+
+ bool nextImpl() override;
+ void prefetch(Priority priority) override;
+
+ /// Name or some description of file.
+ std::string getFileName() const override;
+
+ /// Does the read()/pread(), with all the metric increments, error handling, throttling, etc.
+ /// Doesn't seek (`offset` must match fd's position if !use_pread).
+ /// Stops after min_bytes or eof. Returns 0 if eof.
+ /// Thread safe.
+ size_t readImpl(char * to, size_t min_bytes, size_t max_bytes, size_t offset);
+
+public:
+ explicit ReadBufferFromFileDescriptor(
+ int fd_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0,
+ std::optional<size_t> file_size_ = std::nullopt,
+ ThrottlerPtr throttler_ = {})
+ : ReadBufferFromFileBase(buf_size, existing_memory, alignment, file_size_)
+ , required_alignment(alignment)
+ , fd(fd_)
+ , throttler(throttler_)
+ {
+ }
+
+ int getFD() const
+ {
+ return fd;
+ }
+
+ off_t getPosition() override
+ {
+ return file_offset_of_buffer_end - (working_buffer.end() - pos);
+ }
+
+ size_t getFileOffsetOfBufferEnd() const override { return file_offset_of_buffer_end; }
+
+ /// If 'offset' is small enough to stay in buffer after seek, then true seek in file does not happen.
+ off_t seek(off_t off, int whence) override;
+
+    /// Seek to the beginning, discarding already read data if any. Useful for rereading a file that changes on every read.
+ void rewind();
+
+ size_t getFileSize() override;
+
+ bool checkIfActuallySeekable() override;
+
+ size_t readBigAt(char * to, size_t n, size_t offset, const std::function<bool(size_t)> &) override;
+ bool supportsReadAt() override { return use_pread; }
+
+private:
+ /// Assuming file descriptor supports 'select', check that we have data to read or wait until timeout.
+ bool poll(size_t timeout_microseconds) const;
+};
+
+
+/** Similar to ReadBufferFromFileDescriptor but it uses 'pread', allowing multiple concurrent reads from the same fd.
+ */
+class ReadBufferFromFileDescriptorPRead : public ReadBufferFromFileDescriptor
+{
+public:
+ explicit ReadBufferFromFileDescriptorPRead(
+ int fd_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0,
+ std::optional<size_t> file_size_ = std::nullopt,
+ ThrottlerPtr throttler_ = {})
+ : ReadBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment, file_size_, throttler_)
+ {
+ use_pread = true;
+ }
+};
+
+}
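
A minimal usage sketch (not part of this commit; the path is hypothetical): the pread-based variant is the one to use when several threads share one descriptor, since readBigAt()/supportsReadAt() rely on positioned reads rather than the shared file offset.

    #include <IO/ReadBufferFromFileDescriptor.h>
    #include <fcntl.h>
    #include <unistd.h>

    void readHeader()
    {
        int fd = ::open("/tmp/example.bin", O_RDONLY); /// hypothetical path
        DB::ReadBufferFromFileDescriptorPRead in(fd);

        char header[16];
        in.readStrict(header, sizeof(header)); /// throws if fewer than 16 bytes are available
        /// Another thread could now call in.readBigAt(dst, n, offset) on the same fd.
        ::close(fd);
    }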
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromIStream.cpp b/contrib/clickhouse/src/IO/ReadBufferFromIStream.cpp
new file mode 100644
index 0000000000..e0c966fb70
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromIStream.cpp
@@ -0,0 +1,39 @@
+#include <IO/ReadBufferFromIStream.h>
+#include <Common/Exception.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_READ_FROM_ISTREAM;
+}
+
+bool ReadBufferFromIStream::nextImpl()
+{
+ istr.read(internal_buffer.begin(), internal_buffer.size());
+ size_t gcount = istr.gcount();
+
+ if (!gcount)
+ {
+ if (istr.eof())
+ return false;
+
+ if (istr.fail())
+ throw Exception(ErrorCodes::CANNOT_READ_FROM_ISTREAM, "Cannot read from istream at offset {}", count());
+
+ throw Exception(ErrorCodes::CANNOT_READ_FROM_ISTREAM, "Unexpected state of istream at offset {}", count());
+ }
+ else
+ working_buffer.resize(gcount);
+
+ return true;
+}
+
+ReadBufferFromIStream::ReadBufferFromIStream(std::istream & istr_, size_t size)
+ : BufferWithOwnMemory<ReadBuffer>(size), istr(istr_)
+{
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromIStream.h b/contrib/clickhouse/src/IO/ReadBufferFromIStream.h
new file mode 100644
index 0000000000..8c3f62728b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromIStream.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+
+
+namespace DB
+{
+
+class ReadBufferFromIStream : public BufferWithOwnMemory<ReadBuffer>
+{
+private:
+ std::istream & istr;
+
+ bool nextImpl() override;
+
+public:
+ explicit ReadBufferFromIStream(std::istream & istr_, size_t size = DBMS_DEFAULT_BUFFER_SIZE);
+};
+
+}
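
A minimal usage sketch (assuming the readStringUntilWhitespace helper from ReadHelpers.h, which this commit also adds): the class simply pulls blocks from any std::istream into its own memory.

    #include <IO/ReadBufferFromIStream.h>
    #include <IO/ReadHelpers.h>
    #include <sstream>

    std::istringstream iss("hello world");
    DB::ReadBufferFromIStream in(iss);

    std::string word;
    DB::readStringUntilWhitespace(word, in); /// word == "hello"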
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromMemory.cpp b/contrib/clickhouse/src/IO/ReadBufferFromMemory.cpp
new file mode 100644
index 0000000000..ede2c531e4
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromMemory.cpp
@@ -0,0 +1,47 @@
+#include "ReadBufferFromMemory.h"
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int CANNOT_SEEK_THROUGH_FILE;
+ extern const int SEEK_POSITION_OUT_OF_BOUND;
+}
+
+off_t ReadBufferFromMemory::seek(off_t offset, int whence)
+{
+ if (whence == SEEK_SET)
+ {
+ if (offset >= 0 && internal_buffer.begin() + offset <= internal_buffer.end())
+ {
+ pos = internal_buffer.begin() + offset;
+ working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek().
+ return static_cast<size_t>(pos - internal_buffer.begin());
+ }
+ else
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}, Max: {}",
+ offset, std::to_string(static_cast<size_t>(internal_buffer.end() - internal_buffer.begin())));
+ }
+ else if (whence == SEEK_CUR)
+ {
+ Position new_pos = pos + offset;
+ if (new_pos >= internal_buffer.begin() && new_pos <= internal_buffer.end())
+ {
+ pos = new_pos;
+ working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek().
+ return static_cast<size_t>(pos - internal_buffer.begin());
+ }
+ else
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}, Max: {}",
+ offset, std::to_string(static_cast<size_t>(internal_buffer.end() - internal_buffer.begin())));
+ }
+ else
+ throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET and SEEK_CUR seek modes allowed.");
+}
+
+off_t ReadBufferFromMemory::getPosition()
+{
+ return pos - internal_buffer.begin();
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromMemory.h b/contrib/clickhouse/src/IO/ReadBufferFromMemory.h
new file mode 100644
index 0000000000..ad96e4bfa2
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromMemory.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "SeekableReadBuffer.h"
+
+
+namespace DB
+{
+/** Allows reading from a memory range.
+ * Compared to plain ReadBuffer, it only adds convenience constructors that do a const_cast.
+ * In fact, ReadBuffer will not modify the data in the buffer, but it requires a non-const pointer.
+ */
+class ReadBufferFromMemory : public SeekableReadBuffer
+{
+public:
+ template <typename CharT>
+ requires (sizeof(CharT) == 1)
+ ReadBufferFromMemory(const CharT * buf, size_t size)
+ : SeekableReadBuffer(const_cast<char *>(reinterpret_cast<const char *>(buf)), size, 0) {}
+ explicit ReadBufferFromMemory(const std::string_view&& str)
+ : SeekableReadBuffer(const_cast<char *>(str.data()), str.size(), 0) {}
+
+ off_t seek(off_t off, int whence) override;
+
+ off_t getPosition() override;
+};
+
+}
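
To illustrate the seek contract implemented above, a minimal sketch (readChar is assumed from ReadHelpers.h): SEEK_SET and SEEK_CUR are accepted as long as the target stays within [0, size]; everything else throws.

    #include <IO/ReadBufferFromMemory.h>
    #include <IO/ReadHelpers.h>

    const char data[] = "0123456789";
    DB::ReadBufferFromMemory buf(data, 10);

    buf.seek(4, SEEK_SET);  /// ok: within bounds
    char c;
    DB::readChar(c, buf);   /// c == '4', position is now 5
    buf.seek(-1, SEEK_CUR); /// back to position 4
    /// buf.seek(11, SEEK_SET) would throw SEEK_POSITION_OUT_OF_BOUND;
    /// SEEK_END is rejected with CANNOT_SEEK_THROUGH_FILE.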
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromPocoSocket.cpp b/contrib/clickhouse/src/IO/ReadBufferFromPocoSocket.cpp
new file mode 100644
index 0000000000..ff72dc5386
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromPocoSocket.cpp
@@ -0,0 +1,129 @@
+#include <Poco/Net/NetException.h>
+
+#include <base/scope_guard.h>
+
+#include <IO/ReadBufferFromPocoSocket.h>
+#include <Common/Exception.h>
+#include <Common/NetException.h>
+#include <Common/Stopwatch.h>
+#include <Common/ProfileEvents.h>
+#include <Common/CurrentMetrics.h>
+#include <Common/AsyncTaskExecutor.h>
+#include <Common/checkSSLReturnCode.h>
+
+namespace ProfileEvents
+{
+ extern const Event NetworkReceiveElapsedMicroseconds;
+ extern const Event NetworkReceiveBytes;
+}
+
+namespace CurrentMetrics
+{
+ extern const Metric NetworkReceive;
+}
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int NETWORK_ERROR;
+ extern const int SOCKET_TIMEOUT;
+ extern const int CANNOT_READ_FROM_SOCKET;
+ extern const int LOGICAL_ERROR;
+}
+
+bool ReadBufferFromPocoSocket::nextImpl()
+{
+ ssize_t bytes_read = 0;
+ Stopwatch watch;
+
+ SCOPE_EXIT({
+ /// NOTE: it is quite inaccurate on high loads since the thread could be replaced by another one
+ ProfileEvents::increment(ProfileEvents::NetworkReceiveElapsedMicroseconds, watch.elapsedMicroseconds());
+ ProfileEvents::increment(ProfileEvents::NetworkReceiveBytes, bytes_read);
+ });
+
+ /// Add more details to exceptions.
+ try
+ {
+ CurrentMetrics::Increment metric_increment(CurrentMetrics::NetworkReceive);
+
+ if (internal_buffer.size() > INT_MAX)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Buffer overflow");
+
+ /// If async_callback is specified, set socket to non-blocking mode
+ /// and try to read data from it, if socket is not ready for reading,
+ /// run async_callback and try again later.
+ /// It is expected that file descriptor may be polled externally.
+ /// Note that send timeout is not checked here. External code should check it while polling.
+ if (async_callback)
+ {
+ socket.setBlocking(false);
+ SCOPE_EXIT(socket.setBlocking(true));
+ bool secure = socket.secure();
+ bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), static_cast<int>(internal_buffer.size()));
+
+ /// Check EAGAIN and ERR_SSL_WANT_READ/ERR_SSL_WANT_WRITE for secure socket (reading from secure socket can write too).
+ while (bytes_read < 0 && (errno == EAGAIN || (secure && (checkSSLWantRead(bytes_read) || checkSSLWantWrite(bytes_read)))))
+ {
+ /// In case of ERR_SSL_WANT_WRITE we should wait for socket to be ready for writing, otherwise - for reading.
+ if (secure && checkSSLWantWrite(bytes_read))
+ async_callback(socket.impl()->sockfd(), socket.getSendTimeout(), AsyncEventTimeoutType::SEND, socket_description, AsyncTaskExecutor::Event::WRITE | AsyncTaskExecutor::Event::ERROR);
+ else
+ async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), AsyncEventTimeoutType::RECEIVE, socket_description, AsyncTaskExecutor::Event::READ | AsyncTaskExecutor::Event::ERROR);
+
+ /// Try to read again.
+ bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), static_cast<int>(internal_buffer.size()));
+ }
+ }
+ else
+ {
+ bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), static_cast<int>(internal_buffer.size()));
+ }
+ }
+ catch (const Poco::Net::NetException & e)
+ {
+ throw NetException(ErrorCodes::NETWORK_ERROR, "{}, while reading from socket ({})", e.displayText(), peer_address.toString());
+ }
+ catch (const Poco::TimeoutException &)
+ {
+ throw NetException(ErrorCodes::SOCKET_TIMEOUT, "Timeout exceeded while reading from socket ({}, {} ms)",
+ peer_address.toString(),
+ socket.impl()->getReceiveTimeout().totalMilliseconds());
+ }
+ catch (const Poco::IOException & e)
+ {
+ throw NetException(ErrorCodes::NETWORK_ERROR, "{}, while reading from socket ({})", e.displayText(), peer_address.toString());
+ }
+
+ if (bytes_read < 0)
+ throw NetException(ErrorCodes::CANNOT_READ_FROM_SOCKET, "Cannot read from socket ({})", peer_address.toString());
+
+ if (bytes_read)
+ working_buffer.resize(bytes_read);
+ else
+ return false;
+
+ return true;
+}
+
+ReadBufferFromPocoSocket::ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size)
+ : BufferWithOwnMemory<ReadBuffer>(buf_size)
+ , socket(socket_)
+ , peer_address(socket.peerAddress())
+ , socket_description("socket (" + peer_address.toString() + ")")
+{
+}
+
+bool ReadBufferFromPocoSocket::poll(size_t timeout_microseconds) const
+{
+ if (available())
+ return true;
+
+ Stopwatch watch;
+ bool res = socket.poll(timeout_microseconds, Poco::Net::Socket::SELECT_READ | Poco::Net::Socket::SELECT_ERROR);
+ ProfileEvents::increment(ProfileEvents::NetworkReceiveElapsedMicroseconds, watch.elapsedMicroseconds());
+ return res;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromPocoSocket.h b/contrib/clickhouse/src/IO/ReadBufferFromPocoSocket.h
new file mode 100644
index 0000000000..dab4ac8629
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromPocoSocket.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/ReadBuffer.h>
+#include <Common/AsyncTaskExecutor.h>
+#include <Poco/Net/Socket.h>
+
+namespace DB
+{
+
+/// Works with the ready Poco::Net::Socket. Blocking operations.
+class ReadBufferFromPocoSocket : public BufferWithOwnMemory<ReadBuffer>
+{
+protected:
+ Poco::Net::Socket & socket;
+
+ /** For error messages. It is necessary to obtain this address in advance, because,
+ * for example, if the connection is broken, the address cannot be obtained anymore
+ * (getpeername will return an error).
+ */
+ Poco::Net::SocketAddress peer_address;
+
+ bool nextImpl() override;
+
+public:
+ explicit ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
+
+ bool poll(size_t timeout_microseconds) const;
+
+ void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); }
+
+private:
+ AsyncCallback async_callback;
+ std::string socket_description;
+};
+
+}
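
A usage sketch (assuming a connected Poco::Net::StreamSocket and the readStringUntilNewlineInto helper from ReadHelpers.h): poll() lets the caller bound the wait before nextImpl() would block inside receiveBytes().

    #include <IO/ReadBufferFromPocoSocket.h>
    #include <IO/ReadHelpers.h>
    #include <Poco/Net/StreamSocket.h>

    void readGreeting(Poco::Net::StreamSocket & socket)
    {
        DB::ReadBufferFromPocoSocket in(socket);

        if (!in.poll(1'000'000)) /// wait up to one second for readable data
            return;              /// nothing arrived within the timeout

        std::string line;
        DB::readStringUntilNewlineInto(line, in);
    }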
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromS3.cpp b/contrib/clickhouse/src/IO/ReadBufferFromS3.cpp
new file mode 100644
index 0000000000..1658f03f85
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromS3.cpp
@@ -0,0 +1,498 @@
+#include <IO/HTTPCommon.h>
+#include <IO/S3Common.h>
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include <IO/ReadBufferFromIStream.h>
+#include <IO/ReadBufferFromS3.h>
+#include <IO/ResourceGuard.h>
+#include <IO/S3/getObjectInfo.h>
+#include <IO/S3/Requests.h>
+
+#include <Common/Stopwatch.h>
+#include <Common/Throttler.h>
+#include <Common/logger_useful.h>
+#include <Common/ElapsedTimeProfileEventIncrement.h>
+#include <base/sleep.h>
+
+#include <utility>
+
+
+namespace ProfileEvents
+{
+ extern const Event ReadBufferFromS3Microseconds;
+ extern const Event ReadBufferFromS3InitMicroseconds;
+ extern const Event ReadBufferFromS3Bytes;
+ extern const Event ReadBufferFromS3RequestsErrors;
+ extern const Event ReadBufferFromS3ResetSessions;
+ extern const Event ReadBufferFromS3PreservedSessions;
+ extern const Event ReadBufferSeekCancelConnection;
+ extern const Event S3GetObject;
+ extern const Event DiskS3GetObject;
+ extern const Event RemoteReadThrottlerBytes;
+ extern const Event RemoteReadThrottlerSleepMicroseconds;
+}
+
+namespace
+{
+DB::PooledHTTPSessionPtr getSession(Aws::S3::Model::GetObjectResult & read_result)
+{
+ if (auto * session_aware_stream = dynamic_cast<DB::S3::SessionAwareIOStream<DB::PooledHTTPSessionPtr> *>(&read_result.GetBody()))
+ return static_cast<DB::PooledHTTPSessionPtr &>(session_aware_stream->getSession());
+
+ if (dynamic_cast<DB::S3::SessionAwareIOStream<DB::HTTPSessionPtr> *>(&read_result.GetBody()))
+ return {};
+
+ /// accept result from S3 mock in gtest_writebuffer_s3.cpp
+ if (dynamic_cast<Aws::Utils::Stream::DefaultUnderlyingStream *>(&read_result.GetBody()))
+ return {};
+
+ throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Session of unexpected type encountered");
+}
+
+void resetSession(Aws::S3::Model::GetObjectResult & read_result)
+{
+ if (auto session = getSession(read_result); !session.isNull())
+ {
+ auto & http_session = static_cast<Poco::Net::HTTPClientSession &>(*session);
+ http_session.reset();
+ }
+}
+
+void resetSessionIfNeeded(bool read_all_range_successfully, std::optional<Aws::S3::Model::GetObjectResult> & read_result)
+{
+ if (!read_result)
+ return;
+
+ if (!read_all_range_successfully)
+ {
+ /// When we abandon a session with an ongoing GetObject request and another request is trying to delete the same object,
+ /// the delete operation will hang until the GetObject session's idle timeout expires. So we have to call `reset()` on the GetObject session immediately.
+ resetSession(*read_result);
+ ProfileEvents::increment(ProfileEvents::ReadBufferFromS3ResetSessions);
+ }
+ else if (auto session = getSession(*read_result); !session.isNull())
+ {
+ DB::markSessionForReuse(session);
+ ProfileEvents::increment(ProfileEvents::ReadBufferFromS3PreservedSessions);
+ }
+}
+}
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int S3_ERROR;
+ extern const int CANNOT_SEEK_THROUGH_FILE;
+ extern const int SEEK_POSITION_OUT_OF_BOUND;
+ extern const int LOGICAL_ERROR;
+ extern const int CANNOT_ALLOCATE_MEMORY;
+}
+
+
+ReadBufferFromS3::ReadBufferFromS3(
+ std::shared_ptr<const S3::Client> client_ptr_,
+ const String & bucket_,
+ const String & key_,
+ const String & version_id_,
+ const S3Settings::RequestSettings & request_settings_,
+ const ReadSettings & settings_,
+ bool use_external_buffer_,
+ size_t offset_,
+ size_t read_until_position_,
+ bool restricted_seek_,
+ std::optional<size_t> file_size_)
+ : ReadBufferFromFileBase(use_external_buffer_ ? 0 : settings_.remote_fs_buffer_size, nullptr, 0, file_size_)
+ , client_ptr(std::move(client_ptr_))
+ , bucket(bucket_)
+ , key(key_)
+ , version_id(version_id_)
+ , request_settings(request_settings_)
+ , offset(offset_)
+ , read_until_position(read_until_position_)
+ , read_settings(settings_)
+ , use_external_buffer(use_external_buffer_)
+ , restricted_seek(restricted_seek_)
+{
+}
+
+bool ReadBufferFromS3::nextImpl()
+{
+ if (read_until_position)
+ {
+ if (read_until_position == offset)
+ return false;
+
+ if (read_until_position < offset)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1);
+ }
+
+ bool next_result = false;
+
+ if (impl)
+ {
+ if (use_external_buffer)
+ {
+ /**
+ * use_external_buffer -- means we read into the buffer which
+ * was passed to us from somewhere else. We do not check whether
+ * previously returned buffer was read or not (no hasPendingData() check is needed),
+ * because this branch means we are prefetching data,
+ * and on each nextImpl() call we may fill a different buffer.
+ */
+ impl->set(internal_buffer.begin(), internal_buffer.size());
+ assert(working_buffer.begin() != nullptr);
+ assert(!internal_buffer.empty());
+ }
+ else
+ {
+ /**
+ * impl was initialized before, pass position() to it to make
+ * sure there is no pending data which was not read.
+ */
+ impl->position() = position();
+ assert(!impl->hasPendingData());
+ }
+ }
+
+ size_t sleep_time_with_backoff_milliseconds = 100;
+ for (size_t attempt = 0; !next_result; ++attempt)
+ {
+ bool last_attempt = attempt + 1 >= request_settings.max_single_read_retries;
+
+ ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::ReadBufferFromS3Microseconds);
+
+ try
+ {
+ if (!impl)
+ {
+ impl = initialize();
+
+ if (use_external_buffer)
+ {
+ impl->set(internal_buffer.begin(), internal_buffer.size());
+ assert(working_buffer.begin() != nullptr);
+ assert(!internal_buffer.empty());
+ }
+ else
+ {
+ /// use the buffer returned by `impl`
+ BufferBase::set(impl->buffer().begin(), impl->buffer().size(), impl->offset());
+ }
+ }
+
+ /// Try to read a next portion of data.
+ next_result = impl->next();
+ break;
+ }
+ catch (Exception & e)
+ {
+ if (!processException(e, getPosition(), attempt) || last_attempt)
+ throw;
+
+ /// Pause before next attempt.
+ sleepForMilliseconds(sleep_time_with_backoff_milliseconds);
+ sleep_time_with_backoff_milliseconds *= 2;
+
+ /// Try to reinitialize `impl`.
+ resetWorkingBuffer();
+ impl.reset();
+ }
+ }
+
+ if (!next_result)
+ {
+ read_all_range_successfully = true;
+ return false;
+ }
+
+ BufferBase::set(impl->buffer().begin(), impl->buffer().size(), impl->offset());
+
+ ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Bytes, working_buffer.size());
+ offset += working_buffer.size();
+ if (read_settings.remote_throttler)
+ read_settings.remote_throttler->add(working_buffer.size(), ProfileEvents::RemoteReadThrottlerBytes, ProfileEvents::RemoteReadThrottlerSleepMicroseconds);
+
+ return true;
+}
+
+
+size_t ReadBufferFromS3::readBigAt(char * to, size_t n, size_t range_begin, const std::function<bool(size_t)> & progress_callback)
+{
+ if (n == 0)
+ return 0;
+
+ size_t sleep_time_with_backoff_milliseconds = 100;
+ for (size_t attempt = 0;; ++attempt)
+ {
+ bool last_attempt = attempt + 1 >= request_settings.max_single_read_retries;
+
+ ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::ReadBufferFromS3Microseconds);
+
+ try
+ {
+ auto result = sendRequest(range_begin, range_begin + n - 1);
+ std::istream & istr = result.GetBody();
+
+ size_t bytes = copyFromIStreamWithProgressCallback(istr, to, n, progress_callback);
+
+ ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Bytes, bytes);
+
+ if (read_settings.remote_throttler)
+ read_settings.remote_throttler->add(bytes, ProfileEvents::RemoteReadThrottlerBytes, ProfileEvents::RemoteReadThrottlerSleepMicroseconds);
+
+ return bytes;
+ }
+ catch (Poco::Exception & e)
+ {
+ if (!processException(e, range_begin, attempt) || last_attempt)
+ throw;
+
+ sleepForMilliseconds(sleep_time_with_backoff_milliseconds);
+ sleep_time_with_backoff_milliseconds *= 2;
+ }
+ }
+}
+
+bool ReadBufferFromS3::processException(Poco::Exception & e, size_t read_offset, size_t attempt) const
+{
+ ProfileEvents::increment(ProfileEvents::ReadBufferFromS3RequestsErrors, 1);
+
+ LOG_DEBUG(
+ log,
+ "Caught exception while reading S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}, "
+ "Attempt: {}, Message: {}",
+ bucket, key, version_id.empty() ? "Latest" : version_id, read_offset, attempt, e.message());
+
+
+ if (auto * s3_exception = dynamic_cast<S3Exception *>(&e))
+ {
+ /// It doesn't make sense to retry Access Denied or No Such Key
+ if (!s3_exception->isRetryableError())
+ {
+ s3_exception->addMessage("while reading key: {}, from bucket: {}", key, bucket);
+ return false;
+ }
+ }
+
+ /// It doesn't make sense to retry allocator errors
+ if (e.code() == ErrorCodes::CANNOT_ALLOCATE_MEMORY)
+ {
+ tryLogCurrentException(log);
+ return false;
+ }
+
+ return true;
+}
+
+
+off_t ReadBufferFromS3::seek(off_t offset_, int whence)
+{
+ if (offset_ == getPosition() && whence == SEEK_SET)
+ return offset_;
+
+ read_all_range_successfully = false;
+
+ if (impl && restricted_seek)
+ {
+ throw Exception(
+ ErrorCodes::CANNOT_SEEK_THROUGH_FILE,
+ "Seek is allowed only before first read attempt from the buffer (current offset: "
+ "{}, new offset: {}, reading until position: {}, available: {})",
+ getPosition(), offset_, read_until_position, available());
+ }
+
+ if (whence != SEEK_SET)
+ throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed.");
+
+ if (offset_ < 0)
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}", offset_);
+
+ if (!restricted_seek)
+ {
+ if (!working_buffer.empty()
+ && static_cast<size_t>(offset_) >= offset - working_buffer.size()
+ && offset_ < offset)
+ {
+ pos = working_buffer.end() - (offset - offset_);
+ assert(pos >= working_buffer.begin());
+ assert(pos < working_buffer.end());
+
+ return getPosition();
+ }
+
+ off_t position = getPosition();
+ if (impl && offset_ > position)
+ {
+ size_t diff = offset_ - position;
+ if (diff < read_settings.remote_read_min_bytes_for_seek)
+ {
+ ignore(diff);
+ return offset_;
+ }
+ }
+
+ resetWorkingBuffer();
+ if (impl)
+ {
+ if (!atEndOfRequestedRangeGuess())
+ ProfileEvents::increment(ProfileEvents::ReadBufferSeekCancelConnection);
+ impl.reset();
+ }
+ }
+
+ offset = offset_;
+ return offset;
+}
+
+size_t ReadBufferFromS3::getFileSize()
+{
+ if (file_size)
+ return *file_size;
+
+ auto object_size = S3::getObjectSize(*client_ptr, bucket, key, version_id, request_settings, /* for_disk_s3= */ read_settings.for_object_storage);
+
+ file_size = object_size;
+ return *file_size;
+}
+
+off_t ReadBufferFromS3::getPosition()
+{
+ return offset - available();
+}
+
+void ReadBufferFromS3::setReadUntilPosition(size_t position)
+{
+ if (position != static_cast<size_t>(read_until_position))
+ {
+ read_all_range_successfully = false;
+
+ if (impl)
+ {
+ if (!atEndOfRequestedRangeGuess())
+ ProfileEvents::increment(ProfileEvents::ReadBufferSeekCancelConnection);
+ offset = getPosition();
+ resetWorkingBuffer();
+ impl.reset();
+ }
+ read_until_position = position;
+ }
+}
+
+void ReadBufferFromS3::setReadUntilEnd()
+{
+ if (read_until_position)
+ {
+ read_all_range_successfully = false;
+
+ read_until_position = 0;
+ if (impl)
+ {
+ if (!atEndOfRequestedRangeGuess())
+ ProfileEvents::increment(ProfileEvents::ReadBufferSeekCancelConnection);
+ offset = getPosition();
+ resetWorkingBuffer();
+ impl.reset();
+ }
+ }
+}
+
+bool ReadBufferFromS3::atEndOfRequestedRangeGuess()
+{
+ if (!impl)
+ return true;
+ if (read_until_position)
+ return getPosition() >= read_until_position;
+ if (file_size)
+ return getPosition() >= static_cast<off_t>(*file_size);
+ return false;
+}
+
+ReadBufferFromS3::~ReadBufferFromS3()
+{
+ try
+ {
+ resetSessionIfNeeded(readAllRangeSuccessfully(), read_result);
+ }
+ catch (...)
+ {
+ tryLogCurrentException(log);
+ }
+}
+
+std::unique_ptr<ReadBuffer> ReadBufferFromS3::initialize()
+{
+ resetSessionIfNeeded(readAllRangeSuccessfully(), read_result);
+ read_all_range_successfully = false;
+
+ /**
+ * If remote_filesystem_read_method = 'threadpool', then for MergeTree family tables
+ * exact byte ranges to read are always passed here.
+ */
+ if (read_until_position && offset >= read_until_position)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1);
+
+ read_result = sendRequest(offset, read_until_position ? std::make_optional(read_until_position - 1) : std::nullopt);
+
+ size_t buffer_size = use_external_buffer ? 0 : read_settings.remote_fs_buffer_size;
+ return std::make_unique<ReadBufferFromIStream>(read_result->GetBody(), buffer_size);
+}
+
+Aws::S3::Model::GetObjectResult ReadBufferFromS3::sendRequest(size_t range_begin, std::optional<size_t> range_end_incl) const
+{
+ S3::GetObjectRequest req;
+ req.SetBucket(bucket);
+ req.SetKey(key);
+ if (!version_id.empty())
+ req.SetVersionId(version_id);
+
+ if (range_end_incl)
+ {
+ req.SetRange(fmt::format("bytes={}-{}", range_begin, *range_end_incl));
+ LOG_TEST(
+ log, "Read S3 object. Bucket: {}, Key: {}, Version: {}, Range: {}-{}",
+ bucket, key, version_id.empty() ? "Latest" : version_id, range_begin, *range_end_incl);
+ }
+ else if (range_begin)
+ {
+ req.SetRange(fmt::format("bytes={}-", range_begin));
+ LOG_TEST(
+ log, "Read S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}",
+ bucket, key, version_id.empty() ? "Latest" : version_id, range_begin);
+ }
+
+ ProfileEvents::increment(ProfileEvents::S3GetObject);
+ if (read_settings.for_object_storage)
+ ProfileEvents::increment(ProfileEvents::DiskS3GetObject);
+
+ ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::ReadBufferFromS3InitMicroseconds);
+
+ // We do not know in advance how many bytes we are going to consume; to avoid blocking, estimate the cost from below.
+ constexpr ResourceCost estimated_cost = 1;
+ ResourceGuard rlock(read_settings.resource_link, estimated_cost);
+ Aws::S3::Model::GetObjectOutcome outcome = client_ptr->GetObject(req);
+ rlock.unlock();
+
+ if (outcome.IsSuccess())
+ {
+ ResourceCost bytes_read = outcome.GetResult().GetContentLength();
+ read_settings.resource_link.adjust(estimated_cost, bytes_read);
+ return outcome.GetResultWithOwnership();
+ }
+ else
+ {
+ read_settings.resource_link.accumulate(estimated_cost);
+ const auto & error = outcome.GetError();
+ throw S3Exception(error.GetMessage(), error.GetErrorType());
+ }
+}
+
+bool ReadBufferFromS3::readAllRangeSuccessfully() const
+{
+ return read_until_position ? offset == read_until_position : read_all_range_successfully;
+}
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromS3.h b/contrib/clickhouse/src/IO/ReadBufferFromS3.h
new file mode 100644
index 0000000000..94697df1a0
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromS3.h
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <Storages/StorageS3Settings.h>
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include <memory>
+
+#include <IO/HTTPCommon.h>
+#include <IO/ParallelReadBuffer.h>
+#include <IO/ReadBuffer.h>
+#include <IO/ReadSettings.h>
+#include <IO/ReadBufferFromFileBase.h>
+#include <IO/WithFileName.h>
+
+#include <aws/s3/model/GetObjectResult.h>
+
+namespace DB
+{
+/**
+ * Perform S3 HTTP GET request and provide response to read.
+ */
+class ReadBufferFromS3 : public ReadBufferFromFileBase
+{
+private:
+ std::shared_ptr<const S3::Client> client_ptr;
+ String bucket;
+ String key;
+ String version_id;
+ const S3Settings::RequestSettings request_settings;
+
+ /// These variables are atomic because they can be read (for logging only,
+ /// where a consistent result is not important)
+ /// from a thread other than the one that uses the buffer for S3 reading.
+ std::atomic<off_t> offset = 0;
+ std::atomic<off_t> read_until_position = 0;
+
+ std::optional<Aws::S3::Model::GetObjectResult> read_result;
+ std::unique_ptr<ReadBuffer> impl;
+
+ Poco::Logger * log = &Poco::Logger::get("ReadBufferFromS3");
+
+public:
+ ReadBufferFromS3(
+ std::shared_ptr<const S3::Client> client_ptr_,
+ const String & bucket_,
+ const String & key_,
+ const String & version_id_,
+ const S3Settings::RequestSettings & request_settings_,
+ const ReadSettings & settings_,
+ bool use_external_buffer = false,
+ size_t offset_ = 0,
+ size_t read_until_position_ = 0,
+ bool restricted_seek_ = false,
+ std::optional<size_t> file_size = std::nullopt);
+
+ ~ReadBufferFromS3() override;
+
+ bool nextImpl() override;
+
+ off_t seek(off_t off, int whence) override;
+
+ off_t getPosition() override;
+
+ size_t getFileSize() override;
+
+ void setReadUntilPosition(size_t position) override;
+ void setReadUntilEnd() override;
+
+ size_t getFileOffsetOfBufferEnd() const override { return offset; }
+
+ bool supportsRightBoundedReads() const override { return true; }
+
+ String getFileName() const override { return bucket + "/" + key; }
+
+ size_t readBigAt(char * to, size_t n, size_t range_begin, const std::function<bool(size_t)> & progress_callback) override;
+
+ bool supportsReadAt() override { return true; }
+
+private:
+ std::unique_ptr<ReadBuffer> initialize();
+
+ /// If this returns true, destroying impl now wastes no work. Just for metrics.
+ bool atEndOfRequestedRangeGuess();
+
+ /// Call inside catch() block if GetObject fails. Bumps metrics, logs the error.
+ /// Returns true if the error looks retriable.
+ bool processException(Poco::Exception & e, size_t read_offset, size_t attempt) const;
+
+ Aws::S3::Model::GetObjectResult sendRequest(size_t range_begin, std::optional<size_t> range_end_incl) const;
+
+ bool readAllRangeSuccessfully() const;
+
+ ReadSettings read_settings;
+
+ bool use_external_buffer;
+
+ /// There are different seek policies for disk seek and for non-disk seek
+ /// (non-disk seek applies to seekable input formats: ORC, Arrow, Parquet).
+ bool restricted_seek;
+
+ bool read_all_range_successfully = false;
+};
+
+}
+
+#endif
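
The retry loops in nextImpl() and readBigAt() share one backoff policy: up to request_settings.max_single_read_retries attempts, with the pause doubling after every failed attempt. A standalone sketch of that arithmetic (illustration only, not an API of this class):

    #include <cstddef>

    /// Pause before retrying after the (attempt + 1)-th failure: 100 ms, 200 ms, 400 ms, ...
    size_t backoffMilliseconds(size_t attempt)
    {
        size_t sleep_ms = 100;
        for (size_t i = 0; i < attempt; ++i)
            sleep_ms *= 2;
        return sleep_ms;
    }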
diff --git a/contrib/clickhouse/src/IO/ReadBufferFromString.h b/contrib/clickhouse/src/IO/ReadBufferFromString.h
new file mode 100644
index 0000000000..f20e319b93
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadBufferFromString.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <IO/ReadBufferFromMemory.h>
+
+namespace DB
+{
+
+/// Allows reading from a std::string-like object.
+class ReadBufferFromString : public ReadBufferFromMemory
+{
+public:
+ /// std::string or something similar
+ template <typename S>
+ explicit ReadBufferFromString(const S & s) : ReadBufferFromMemory(s.data(), s.size()) {}
+
+ explicit ReadBufferFromString(std::string_view s) : ReadBufferFromMemory(s.data(), s.size()) {}
+};
+
+class ReadBufferFromOwnString : public String, public ReadBufferFromString
+{
+public:
+ template <typename S>
+ explicit ReadBufferFromOwnString(S && s_) : String(std::forward<S>(s_)), ReadBufferFromString(*this)
+ {
+ }
+};
+
+}
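
ReadBufferFromString only keeps a view of its source, so the string must outlive the buffer; ReadBufferFromOwnString exists precisely to take ownership of a temporary. A minimal sketch (readIntText/assertChar/readString assumed from ReadHelpers.h):

    #include <IO/ReadBufferFromString.h>
    #include <IO/ReadHelpers.h>

    DB::ReadBufferFromOwnString in(std::string("42\tabc")); /// the buffer owns the string

    int x;
    DB::readIntText(x, in);   /// x == 42
    DB::assertChar('\t', in); /// consume the separator
    std::string s;
    DB::readString(s, in);    /// s == "abc"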
diff --git a/contrib/clickhouse/src/IO/ReadHelpers.cpp b/contrib/clickhouse/src/IO/ReadHelpers.cpp
new file mode 100644
index 0000000000..bf3215d582
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadHelpers.cpp
@@ -0,0 +1,1712 @@
+#include <Core/Defines.h>
+#include <base/hex.h>
+#include <Common/PODArray.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <Common/memcpySmall.h>
+#include <Formats/FormatSettings.h>
+#include <IO/WriteBufferFromString.h>
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/readFloatText.h>
+#include <IO/Operators.h>
+#include <base/find_symbols.h>
+#include <cstdlib>
+#include <bit>
+
+#include <base/simd.h>
+
+#ifdef __SSE2__
+ #include <emmintrin.h>
+#endif
+
+#if defined(__aarch64__) && defined(__ARM_NEON)
+# include <arm_neon.h>
+# pragma clang diagnostic ignored "-Wreserved-identifier"
+#endif
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
+ extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
+ extern const int CANNOT_PARSE_QUOTED_STRING;
+ extern const int CANNOT_PARSE_DATETIME;
+ extern const int CANNOT_PARSE_DATE;
+ extern const int CANNOT_PARSE_UUID;
+ extern const int INCORRECT_DATA;
+ extern const int ATTEMPT_TO_READ_AFTER_EOF;
+ extern const int LOGICAL_ERROR;
+ extern const int BAD_ARGUMENTS;
+}
+
+template <size_t num_bytes, typename IteratorSrc, typename IteratorDst>
+inline void parseHex(IteratorSrc src, IteratorDst dst)
+{
+ size_t src_pos = 0;
+ size_t dst_pos = 0;
+ for (; dst_pos < num_bytes; ++dst_pos, src_pos += 2)
+ dst[dst_pos] = unhex2(reinterpret_cast<const char *>(&src[src_pos]));
+}
+
+UUID parseUUID(std::span<const UInt8> src)
+{
+ UUID uuid;
+ const auto * src_ptr = src.data();
+ const auto size = src.size();
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ const std::reverse_iterator dst(reinterpret_cast<UInt8 *>(&uuid) + sizeof(UUID));
+#else
+ auto * dst = reinterpret_cast<UInt8 *>(&uuid);
+#endif
+ if (size == 36)
+ {
+ parseHex<4>(src_ptr, dst + 8);
+ parseHex<2>(src_ptr + 9, dst + 12);
+ parseHex<2>(src_ptr + 14, dst + 14);
+ parseHex<2>(src_ptr + 19, dst);
+ parseHex<6>(src_ptr + 24, dst + 2);
+ }
+ else if (size == 32)
+ {
+ parseHex<8>(src_ptr, dst + 8);
+ parseHex<8>(src_ptr + 16, dst);
+ }
+ else
+ throw Exception(ErrorCodes::CANNOT_PARSE_UUID, "Unexpected length when trying to parse UUID ({})", size);
+
+ return uuid;
+}
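
Both accepted layouts in one sketch (the UUID value is arbitrary):

    /// 36 characters with dashes or 32 characters without; any other length
    /// throws CANNOT_PARSE_UUID.
    std::string_view with_dashes = "123e4567-e89b-12d3-a456-426614174000";
    std::string_view compact = "123e4567e89b12d3a456426614174000";

    DB::UUID a = DB::parseUUID({reinterpret_cast<const UInt8 *>(with_dashes.data()), with_dashes.size()});
    DB::UUID b = DB::parseUUID({reinterpret_cast<const UInt8 *>(compact.data()), compact.size()});
    /// a == b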
+
+void NO_INLINE throwAtAssertionFailed(const char * s, ReadBuffer & buf)
+{
+ WriteBufferFromOwnString out;
+ out << quote << s;
+
+ if (buf.eof())
+ out << " at end of stream.";
+ else
+ out << " before: " << quote << String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position()));
+
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "Cannot parse input: expected {}", out.str());
+}
+
+
+bool checkString(const char * s, ReadBuffer & buf)
+{
+ for (; *s; ++s)
+ {
+ if (buf.eof() || *buf.position() != *s)
+ return false;
+ ++buf.position();
+ }
+ return true;
+}
+
+
+bool checkStringCaseInsensitive(const char * s, ReadBuffer & buf)
+{
+ for (; *s; ++s)
+ {
+ if (buf.eof())
+ return false;
+
+ char c = *buf.position();
+ if (!equalsCaseInsensitive(*s, c))
+ return false;
+
+ ++buf.position();
+ }
+ return true;
+}
+
+
+void assertString(const char * s, ReadBuffer & buf)
+{
+ if (!checkString(s, buf))
+ throwAtAssertionFailed(s, buf);
+}
+
+
+void assertEOF(ReadBuffer & buf)
+{
+ if (!buf.eof())
+ throwAtAssertionFailed("eof", buf);
+}
+
+void assertNotEOF(ReadBuffer & buf)
+{
+ if (buf.eof())
+ throw Exception(ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF, "Attempt to read after EOF");
+}
+
+
+void assertStringCaseInsensitive(const char * s, ReadBuffer & buf)
+{
+ if (!checkStringCaseInsensitive(s, buf))
+ throwAtAssertionFailed(s, buf);
+}
+
+
+bool checkStringByFirstCharacterAndAssertTheRest(const char * s, ReadBuffer & buf)
+{
+ if (buf.eof() || *buf.position() != *s)
+ return false;
+
+ assertString(s, buf);
+ return true;
+}
+
+bool checkStringByFirstCharacterAndAssertTheRestCaseInsensitive(const char * s, ReadBuffer & buf)
+{
+ if (buf.eof())
+ return false;
+
+ char c = *buf.position();
+ if (!equalsCaseInsensitive(*s, c))
+ return false;
+
+ assertStringCaseInsensitive(s, buf);
+ return true;
+}
+
+
+template <typename T>
+static void appendToStringOrVector(T & s, ReadBuffer & rb, const char * end)
+{
+ s.append(rb.position(), end - rb.position());
+}
+
+template <>
+inline void appendToStringOrVector(PaddedPODArray<UInt8> & s, ReadBuffer & rb, const char * end)
+{
+ if (rb.isPadded())
+ s.insertSmallAllowReadWriteOverflow15(rb.position(), end);
+ else
+ s.insert(rb.position(), end);
+}
+
+template <>
+inline void appendToStringOrVector(PODArray<char> & s, ReadBuffer & rb, const char * end)
+{
+ s.insert(rb.position(), end);
+}
+
+template <char... chars, typename Vector>
+void readStringUntilCharsInto(Vector & s, ReadBuffer & buf)
+{
+ while (!buf.eof())
+ {
+ char * next_pos = find_first_symbols<chars...>(buf.position(), buf.buffer().end());
+
+ appendToStringOrVector(s, buf, next_pos);
+ buf.position() = next_pos;
+
+ if (buf.hasPendingData())
+ return;
+ }
+}
+
+template <typename Vector>
+void readStringInto(Vector & s, ReadBuffer & buf)
+{
+ readStringUntilCharsInto<'\t', '\n'>(s, buf);
+}
+
+template <typename Vector>
+void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf)
+{
+ readStringUntilCharsInto<' '>(s, buf);
+}
+
+template <typename Vector>
+void readStringUntilNewlineInto(Vector & s, ReadBuffer & buf)
+{
+ readStringUntilCharsInto<'\n'>(s, buf);
+}
+
+template void readStringUntilNewlineInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
+template void readStringUntilNewlineInto<String>(String & s, ReadBuffer & buf);
+
+template <typename Vector>
+void readNullTerminated(Vector & s, ReadBuffer & buf)
+{
+ readStringUntilCharsInto<'\0'>(s, buf);
+ buf.ignore();
+}
+
+void readStringUntilWhitespace(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readStringUntilWhitespaceInto(s, buf);
+}
+
+template void readNullTerminated<PODArray<char>>(PODArray<char> & s, ReadBuffer & buf);
+template void readNullTerminated<String>(String & s, ReadBuffer & buf);
+
+void readString(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readStringInto(s, buf);
+}
+
+template void readStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
+template void readStringInto<String>(String & s, ReadBuffer & buf);
+template void readStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
+
+template <typename Vector>
+void readStringUntilEOFInto(Vector & s, ReadBuffer & buf)
+{
+ while (!buf.eof())
+ {
+ appendToStringOrVector(s, buf, buf.buffer().end());
+ buf.position() = buf.buffer().end();
+ }
+}
+
+
+void readStringUntilEOF(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readStringUntilEOFInto(s, buf);
+}
+
+template <typename Vector>
+void readEscapedStringUntilEOLInto(Vector & s, ReadBuffer & buf)
+{
+ while (!buf.eof())
+ {
+ char * next_pos = find_first_symbols<'\n', '\\'>(buf.position(), buf.buffer().end());
+
+ appendToStringOrVector(s, buf, next_pos);
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if (*buf.position() == '\n')
+ return;
+
+ if (*buf.position() == '\\')
+ parseComplexEscapeSequence(s, buf);
+ }
+}
+
+
+void readEscapedStringUntilEOL(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readEscapedStringUntilEOLInto(s, buf);
+}
+
+template void readStringUntilEOFInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
+
+
+/** Parse the escape sequence, which can be simple (one character after the backslash) or more complex (multiple characters).
+ * It is assumed that the cursor is located on the `\` symbol.
+ */
+template <typename Vector, typename ReturnType = void>
+static ReturnType parseComplexEscapeSequence(Vector & s, ReadBuffer & buf)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ auto error = [](const char * message [[maybe_unused]], int code [[maybe_unused]])
+ {
+ if constexpr (throw_exception)
+ throw Exception::createDeprecated(message, code);
+ return ReturnType(false);
+ };
+
+ ++buf.position();
+
+ if (buf.eof())
+ {
+ return error("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
+ }
+
+ char char_after_backslash = *buf.position();
+
+ if (char_after_backslash == 'x')
+ {
+ ++buf.position();
+ /// escape sequence of the form \xAA
+ char hex_code[2];
+
+ auto bytes_read = buf.read(hex_code, sizeof(hex_code));
+
+ if (bytes_read != sizeof(hex_code))
+ {
+ return error("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
+ }
+
+ s.push_back(unhex2(hex_code));
+ }
+ else if (char_after_backslash == 'N')
+ {
+ /// Support for NULLs: \N sequence must be parsed as empty string.
+ ++buf.position();
+ }
+ else
+ {
+ /// The usual escape sequence of a single character.
+ char decoded_char = parseEscapeSequence(char_after_backslash);
+
+ /// For convenience using LIKE and regular expressions,
+ /// we leave backslash when user write something like 'Hello 100\%':
+ /// it is parsed like Hello 100\% instead of Hello 100%
+ if (decoded_char != '\\'
+ && decoded_char != '\''
+ && decoded_char != '"'
+ && decoded_char != '`' /// MySQL style identifiers
+ && decoded_char != '/' /// JavaScript in HTML
+ && decoded_char != '=' /// TSKV format invented somewhere
+ && !isControlASCII(decoded_char))
+ {
+ s.push_back('\\');
+ }
+
+ s.push_back(decoded_char);
+ ++buf.position();
+ }
+
+ return ReturnType(true);
+}
+
+bool parseComplexEscapeSequence(String & s, ReadBuffer & buf)
+{
+ return parseComplexEscapeSequence<String, bool>(s, buf);
+}
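
A few worked inputs for the branches above (readEscapedString, defined further below, drives this parser):

    /// Input bytes -> appended bytes
    ///   \x41      -> "A"    (two hex digits decoded)
    ///   \N        -> ""     (NULL marker becomes an empty string)
    ///   \n        -> 0x0A   (ordinary single-character escape)
    ///   \%        -> "\%"   (unknown escape keeps the backslash, convenient for LIKE)
    DB::ReadBufferFromString in("a\\x41b");
    String s;
    DB::readEscapedString(s, in); /// s == "aAb"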
+
+template <typename Vector, typename ReturnType>
+static ReturnType parseJSONEscapeSequence(Vector & s, ReadBuffer & buf)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ auto error = [](const char * message [[maybe_unused]], int code [[maybe_unused]])
+ {
+ if constexpr (throw_exception)
+ throw Exception::createDeprecated(message, code);
+ return ReturnType(false);
+ };
+
+ ++buf.position();
+
+ if (buf.eof())
+ return error("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
+
+ assert(buf.hasPendingData());
+
+ switch (*buf.position())
+ {
+ case '"':
+ s.push_back('"');
+ break;
+ case '\\':
+ s.push_back('\\');
+ break;
+ case '/':
+ s.push_back('/');
+ break;
+ case 'b':
+ s.push_back('\b');
+ break;
+ case 'f':
+ s.push_back('\f');
+ break;
+ case 'n':
+ s.push_back('\n');
+ break;
+ case 'r':
+ s.push_back('\r');
+ break;
+ case 't':
+ s.push_back('\t');
+ break;
+ case 'u':
+ {
+ ++buf.position();
+
+ char hex_code[4];
+ if (4 != buf.read(hex_code, 4))
+ return error("Cannot parse escape sequence: less than four bytes after \\u", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
+
+ /// \u0000 - special case
+ if (0 == memcmp(hex_code, "0000", 4))
+ {
+ s.push_back(0);
+ return ReturnType(true);
+ }
+
+ UInt16 code_point = unhex4(hex_code);
+
+ if (code_point <= 0x7F)
+ {
+ s.push_back(code_point);
+ }
+ else if (code_point <= 0x07FF)
+ {
+ s.push_back(((code_point >> 6) & 0x1F) | 0xC0);
+ s.push_back((code_point & 0x3F) | 0x80);
+ }
+ else
+ {
+ /// Surrogate pair.
+ if (code_point >= 0xD800 && code_point <= 0xDBFF)
+ {
+ if (!checkString("\\u", buf))
+ return error("Cannot parse escape sequence: missing second part of surrogate pair", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
+
+ char second_hex_code[4];
+ if (4 != buf.read(second_hex_code, 4))
+ return error("Cannot parse escape sequence: less than four bytes after \\u of second part of surrogate pair",
+ ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
+
+ UInt16 second_code_point = unhex4(second_hex_code);
+
+ if (second_code_point >= 0xDC00 && second_code_point <= 0xDFFF)
+ {
+ UInt32 full_code_point = 0x10000 + (code_point - 0xD800) * 1024 + (second_code_point - 0xDC00);
+
+ s.push_back(((full_code_point >> 18) & 0x07) | 0xF0);
+ s.push_back(((full_code_point >> 12) & 0x3F) | 0x80);
+ s.push_back(((full_code_point >> 6) & 0x3F) | 0x80);
+ s.push_back((full_code_point & 0x3F) | 0x80);
+ }
+ else
+ return error("Incorrect surrogate pair of unicode escape sequences in JSON", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
+ }
+ else
+ {
+ s.push_back(((code_point >> 12) & 0x0F) | 0xE0);
+ s.push_back(((code_point >> 6) & 0x3F) | 0x80);
+ s.push_back((code_point & 0x3F) | 0x80);
+ }
+ }
+
+ return ReturnType(true);
+ }
+ default:
+ s.push_back(*buf.position());
+ break;
+ }
+
+ ++buf.position();
+ return ReturnType(true);
+}
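
A worked example of the surrogate-pair arithmetic above:

    /// "\uD83D\uDE00" (U+1F600):
    ///   full_code_point = 0x10000 + (0xD83D - 0xD800) * 1024 + (0xDE00 - 0xDC00)
    ///                   = 0x10000 + 0xF400 + 0x200
    ///                   = 0x1F600
    /// which is then emitted as the 4-byte UTF-8 sequence F0 9F 98 80.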
+
+
+template <typename Vector, bool parse_complex_escape_sequence>
+void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf)
+{
+ while (!buf.eof())
+ {
+ char * next_pos = find_first_symbols<'\t', '\n', '\\'>(buf.position(), buf.buffer().end());
+
+ appendToStringOrVector(s, buf, next_pos);
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if (*buf.position() == '\t' || *buf.position() == '\n')
+ return;
+
+ if (*buf.position() == '\\')
+ {
+ if constexpr (parse_complex_escape_sequence)
+ {
+ parseComplexEscapeSequence(s, buf);
+ }
+ else
+ {
+ s.push_back(*buf.position());
+ ++buf.position();
+ if (!buf.eof())
+ {
+ s.push_back(*buf.position());
+ ++buf.position();
+ }
+ }
+ }
+ }
+}
+
+template <typename Vector>
+void readEscapedStringInto(Vector & s, ReadBuffer & buf)
+{
+ readEscapedStringIntoImpl<Vector, true>(s, buf);
+}
+
+
+void readEscapedString(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readEscapedStringInto(s, buf);
+}
+
+template void readEscapedStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
+template void readEscapedStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
+
+
+/** If enable_sql_style_quoting == true,
+ * strings like 'abc''def' will be parsed as abc'def.
+ * Note that even with SQL-style quoting enabled,
+ * backslash escape sequences are also parsed,
+ * which can be slightly confusing.
+ */
+template <char quote, bool enable_sql_style_quoting, typename Vector, typename ReturnType = void>
+static ReturnType readAnyQuotedStringInto(Vector & s, ReadBuffer & buf)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+ if (buf.eof() || *buf.position() != quote)
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_QUOTED_STRING,
+ "Cannot parse quoted string: expected opening quote '{}', got '{}'",
+ std::string{quote}, buf.eof() ? "EOF" : std::string{*buf.position()});
+ else
+ return ReturnType(false);
+ }
+
+ ++buf.position();
+
+ while (!buf.eof())
+ {
+ char * next_pos = find_first_symbols<'\\', quote>(buf.position(), buf.buffer().end());
+
+ appendToStringOrVector(s, buf, next_pos);
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if (*buf.position() == quote)
+ {
+ ++buf.position();
+
+ if (enable_sql_style_quoting && !buf.eof() && *buf.position() == quote)
+ {
+ s.push_back(quote);
+ ++buf.position();
+ continue;
+ }
+
+ return ReturnType(true);
+ }
+
+ if (*buf.position() == '\\')
+ {
+ if constexpr (throw_exception)
+ parseComplexEscapeSequence<Vector, ReturnType>(s, buf);
+ else
+ {
+ if (!parseComplexEscapeSequence<Vector, ReturnType>(s, buf))
+ return ReturnType(false);
+ }
+ }
+ }
+
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_QUOTED_STRING, "Cannot parse quoted string: expected closing quote");
+ else
+ return ReturnType(false);
+}
+
+template <bool enable_sql_style_quoting, typename Vector>
+void readQuotedStringInto(Vector & s, ReadBuffer & buf)
+{
+ readAnyQuotedStringInto<'\'', enable_sql_style_quoting>(s, buf);
+}
+
+template <typename Vector>
+bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf)
+{
+ return readAnyQuotedStringInto<'\'', false, Vector, bool>(s, buf);
+}
+
+template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf);
+
+template <bool enable_sql_style_quoting, typename Vector>
+void readDoubleQuotedStringInto(Vector & s, ReadBuffer & buf)
+{
+ readAnyQuotedStringInto<'"', enable_sql_style_quoting>(s, buf);
+}
+
+template <bool enable_sql_style_quoting, typename Vector>
+void readBackQuotedStringInto(Vector & s, ReadBuffer & buf)
+{
+ readAnyQuotedStringInto<'`', enable_sql_style_quoting>(s, buf);
+}
+
+
+void readQuotedString(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readQuotedStringInto<false>(s, buf);
+}
+
+void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readQuotedStringInto<true>(s, buf);
+}
+
+
+template void readQuotedStringInto<true>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
+template void readQuotedStringInto<true>(String & s, ReadBuffer & buf);
+template void readQuotedStringInto<false>(String & s, ReadBuffer & buf);
+template void readDoubleQuotedStringInto<false>(NullOutput & s, ReadBuffer & buf);
+template void readDoubleQuotedStringInto<false>(String & s, ReadBuffer & buf);
+template void readBackQuotedStringInto<false>(String & s, ReadBuffer & buf);
+
+void readDoubleQuotedString(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readDoubleQuotedStringInto<false>(s, buf);
+}
+
+void readDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readDoubleQuotedStringInto<true>(s, buf);
+}
+
+void readBackQuotedString(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readBackQuotedStringInto<false>(s, buf);
+}
+
+void readBackQuotedStringWithSQLStyle(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readBackQuotedStringInto<true>(s, buf);
+}
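
The effect of the SQL-style doubling rule, as a minimal sketch:

    DB::ReadBufferFromString in("'abc''def'");
    String s;
    DB::readQuotedStringWithSQLStyle(s, in); /// s == "abc'def"
    /// Plain readQuotedString() would stop at the doubled quote instead,
    /// yielding s == "abc" and leaving "'def'" unread.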
+
+template<typename T>
+concept WithResize = requires (T value)
+{
+ { value.resize(1) };
+ { value.size() } -> std::integral<>;
+};
+
+template <typename Vector, bool include_quotes>
+void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings)
+{
+ /// Empty string
+ if (buf.eof())
+ return;
+
+ const char delimiter = settings.delimiter;
+ const char maybe_quote = *buf.position();
+ const String & custom_delimiter = settings.custom_delimiter;
+
+ /// An empty value, not even enclosed in quotes.
+ if (custom_delimiter.empty() && maybe_quote == delimiter)
+ return;
+
+ if ((settings.allow_single_quotes && maybe_quote == '\'') || (settings.allow_double_quotes && maybe_quote == '"'))
+ {
+ if constexpr (include_quotes)
+ s.push_back(maybe_quote);
+
+ ++buf.position();
+
+ /// The quoted case. We are looking for the next quotation mark.
+ while (!buf.eof())
+ {
+ char * next_pos = reinterpret_cast<char *>(memchr(buf.position(), maybe_quote, buf.buffer().end() - buf.position()));
+
+ if (nullptr == next_pos)
+ next_pos = buf.buffer().end();
+
+ appendToStringOrVector(s, buf, next_pos);
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if constexpr (include_quotes)
+ s.push_back(maybe_quote);
+
+ /// Now there is a quotation mark under the cursor. Is another one following?
+ ++buf.position();
+
+ if (buf.eof())
+ return;
+
+ if (*buf.position() == maybe_quote)
+ {
+ s.push_back(maybe_quote);
+ ++buf.position();
+ continue;
+ }
+
+ return;
+ }
+ }
+ else
+ {
+ /// If custom_delimiter is specified, we should read until the first occurrence of
+ /// custom_delimiter in the buffer.
+ if (!custom_delimiter.empty())
+ {
+ PeekableReadBuffer * peekable_buf = dynamic_cast<PeekableReadBuffer *>(&buf);
+ if (!peekable_buf)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Reading CSV string with custom delimiter is allowed only when using PeekableReadBuffer");
+
+ while (true)
+ {
+ if (peekable_buf->eof())
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading CSV string, expected custom delimiter \"{}\"", custom_delimiter);
+
+ char * next_pos = reinterpret_cast<char *>(memchr(peekable_buf->position(), custom_delimiter[0], peekable_buf->available()));
+ if (!next_pos)
+ next_pos = peekable_buf->buffer().end();
+
+ appendToStringOrVector(s, *peekable_buf, next_pos);
+ peekable_buf->position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ {
+ PeekableReadBufferCheckpoint checkpoint{*peekable_buf, true};
+ if (checkString(custom_delimiter, *peekable_buf))
+ return;
+ }
+
+ s.push_back(*peekable_buf->position());
+ ++peekable_buf->position();
+ }
+
+ return;
+ }
+
+ /// Unquoted case. Look for delimiter or \r or \n.
+ while (!buf.eof())
+ {
+ char * next_pos = buf.position();
+
+ [&]()
+ {
+#ifdef __SSE2__
+ auto rc = _mm_set1_epi8('\r');
+ auto nc = _mm_set1_epi8('\n');
+ auto dc = _mm_set1_epi8(delimiter);
+ for (; next_pos + 15 < buf.buffer().end(); next_pos += 16)
+ {
+ __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(next_pos));
+ auto eq = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(bytes, rc), _mm_cmpeq_epi8(bytes, nc)), _mm_cmpeq_epi8(bytes, dc));
+ uint16_t bit_mask = _mm_movemask_epi8(eq);
+ if (bit_mask)
+ {
+ next_pos += std::countr_zero(bit_mask);
+ return;
+ }
+ }
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+ auto rc = vdupq_n_u8('\r');
+ auto nc = vdupq_n_u8('\n');
+ auto dc = vdupq_n_u8(delimiter);
+ for (; next_pos + 15 < buf.buffer().end(); next_pos += 16)
+ {
+ uint8x16_t bytes = vld1q_u8(reinterpret_cast<const uint8_t *>(next_pos));
+ auto eq = vorrq_u8(vorrq_u8(vceqq_u8(bytes, rc), vceqq_u8(bytes, nc)), vceqq_u8(bytes, dc));
+ uint64_t bit_mask = getNibbleMask(eq);
+ if (bit_mask)
+ {
+ next_pos += std::countr_zero(bit_mask) >> 2;
+ return;
+ }
+ }
+#endif
+ while (next_pos < buf.buffer().end()
+ && *next_pos != delimiter && *next_pos != '\r' && *next_pos != '\n')
+ ++next_pos;
+ }();
+
+ appendToStringOrVector(s, buf, next_pos);
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if constexpr (WithResize<Vector>)
+ {
+ if (settings.trim_whitespaces) [[likely]]
+ {
+ /** CSV format can contain insignificant spaces and tabs.
+ * Usually the task of skipping them is for the calling code.
+ * But in this case, it would be difficult to do, so we remove the trailing whitespace ourselves.
+ */
+ size_t size = s.size();
+ while (size > 0 && (s[size - 1] == ' ' || s[size - 1] == '\t'))
+ --size;
+
+ s.resize(size);
+ }
+ }
+ return;
+ }
+ }
+}
+
+void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings)
+{
+ s.clear();
+ readCSVStringInto(s, buf, settings);
+}
+
+void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings)
+{
+ s.clear();
+ readCSVStringInto<String, true>(s, buf, settings);
+}
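
A sketch of the two entry points above on a quoted field with doubled quotes (assuming default FormatSettings::CSV: ',' delimiter, double quotes allowed):

    DB::ReadBufferFromString in("\"say \"\"hi\"\"\",next");
    DB::FormatSettings::CSV settings;

    String s;
    DB::readCSVString(s, in, settings); /// s == say "hi"  -- wrapper removed, inner quotes unescaped
    /// readCSVField() on the same input keeps the wrapping quotes: "say ""hi"""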
+
+void readCSVWithTwoPossibleDelimitersImpl(String & s, PeekableReadBuffer & buf, const String & first_delimiter, const String & second_delimiter)
+{
+ /// Check that delimiters are not empty.
+ if (first_delimiter.empty() || second_delimiter.empty())
+ throw Exception(ErrorCodes::BAD_ARGUMENTS,
+ "Cannot read CSV field with two possible delimiters, one "
+ "of delimiters '{}' and '{}' is empty", first_delimiter, second_delimiter);
+
+ /// Read all data until first_delimiter or second_delimiter
+ while (true)
+ {
+ if (buf.eof())
+ throw Exception(ErrorCodes::INCORRECT_DATA, R"(Unexpected EOF while reading CSV string, expected on "
+ "of delimiters "{}" or "{}")", first_delimiter, second_delimiter);
+
+ char * next_pos = buf.position();
+ while (next_pos != buf.buffer().end() && *next_pos != first_delimiter[0] && *next_pos != second_delimiter[0])
+ ++next_pos;
+
+ appendToStringOrVector(s, buf, next_pos);
+ buf.position() = next_pos;
+ if (!buf.hasPendingData())
+ continue;
+
+ if (*buf.position() == first_delimiter[0])
+ {
+ PeekableReadBufferCheckpoint checkpoint(buf, true);
+ if (checkString(first_delimiter, buf))
+ return;
+ }
+
+ if (*buf.position() == second_delimiter[0])
+ {
+ PeekableReadBufferCheckpoint checkpoint(buf, true);
+ if (checkString(second_delimiter, buf))
+ return;
+ }
+
+ s.push_back(*buf.position());
+ ++buf.position();
+ }
+}
+
+String readCSVStringWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter)
+{
+ String res;
+
+ /// If value is quoted, use regular CSV reading since we need to read only data inside quotes.
+ if (!buf.eof() && ((settings.allow_single_quotes && *buf.position() == '\'') || (settings.allow_double_quotes && *buf.position() == '"')))
+ readCSVStringInto(res, buf, settings);
+ else
+ readCSVWithTwoPossibleDelimitersImpl(res, buf, first_delimiter, second_delimiter);
+
+ return res;
+}
+
+String readCSVFieldWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter)
+{
+ String res;
+
+ /// If value is quoted, use regular CSV reading since we need to read only data inside quotes.
+ if (!buf.eof() && ((settings.allow_single_quotes && *buf.position() == '\'') || (settings.allow_double_quotes && *buf.position() == '"')))
+ readCSVField(res, buf, settings);
+ else
+ readCSVWithTwoPossibleDelimitersImpl(res, buf, first_delimiter, second_delimiter);
+
+ return res;
+}
+
+template void readCSVStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
+template void readCSVStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
+
+
+template <typename Vector, typename ReturnType>
+ReturnType readJSONStringInto(Vector & s, ReadBuffer & buf)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ auto error = [](FormatStringHelper<> message [[maybe_unused]], int code [[maybe_unused]])
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(code, std::move(message));
+ return ReturnType(false);
+ };
+
+ if (buf.eof() || *buf.position() != '"')
+ return error("Cannot parse JSON string: expected opening quote", ErrorCodes::CANNOT_PARSE_QUOTED_STRING);
+ ++buf.position();
+
+ while (!buf.eof())
+ {
+ char * next_pos = find_first_symbols<'\\', '"'>(buf.position(), buf.buffer().end());
+
+ appendToStringOrVector(s, buf, next_pos);
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if (*buf.position() == '"')
+ {
+ ++buf.position();
+ return ReturnType(true);
+ }
+
+ if (*buf.position() == '\\')
+ parseJSONEscapeSequence<Vector, ReturnType>(s, buf);
+ }
+
+ return error("Cannot parse JSON string: expected closing quote", ErrorCodes::CANNOT_PARSE_QUOTED_STRING);
+}
+
+void readJSONString(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readJSONStringInto(s, buf);
+}
+
+template void readJSONStringInto<PaddedPODArray<UInt8>, void>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
+template bool readJSONStringInto<PaddedPODArray<UInt8>, bool>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
+template void readJSONStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
+template void readJSONStringInto<String>(String & s, ReadBuffer & buf);
+template bool readJSONStringInto<String, bool>(String & s, ReadBuffer & buf);
+
+template <typename Vector, typename ReturnType>
+ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ auto error = [](FormatStringHelper<> message [[maybe_unused]], int code [[maybe_unused]])
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(code, std::move(message));
+ return ReturnType(false);
+ };
+
+ if (buf.eof() || *buf.position() != '{')
+ return error("JSON should start from opening curly bracket", ErrorCodes::INCORRECT_DATA);
+
+ s.push_back(*buf.position());
+ ++buf.position();
+
+ Int64 balance = 1;
+ bool quotes = false;
+
+ while (!buf.eof())
+ {
+ char * next_pos = find_first_symbols<'\\', '{', '}', '"'>(buf.position(), buf.buffer().end());
+ appendToStringOrVector(s, buf, next_pos);
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ s.push_back(*buf.position());
+
+ if (*buf.position() == '\\')
+ {
+ ++buf.position();
+ if (!buf.eof())
+ {
+ s.push_back(*buf.position());
+ ++buf.position();
+ }
+
+ continue;
+ }
+
+ if (*buf.position() == '"')
+ quotes = !quotes;
+ else if (!quotes) // can be only '{' or '}'
+ balance += *buf.position() == '{' ? 1 : -1;
+
+ ++buf.position();
+
+ if (balance == 0)
+ return ReturnType(true);
+
+ if (balance < 0)
+ break;
+ }
+
+ return error("JSON should have equal number of opening and closing brackets", ErrorCodes::INCORRECT_DATA);
+}
+
+template void readJSONObjectPossiblyInvalid<String>(String & s, ReadBuffer & buf);
+
+template <typename ReturnType>
+ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ auto error = []
+ {
+ if constexpr (throw_exception)
+ throw Exception(ErrorCodes::CANNOT_PARSE_DATE, "Cannot parse date: value is too short");
+ return ReturnType(false);
+ };
+
+ auto append_digit = [&](auto & x)
+ {
+ if (!buf.eof() && isNumericASCII(*buf.position()))
+ {
+ x = x * 10 + (*buf.position() - '0');
+ ++buf.position();
+ return true;
+ }
+ else
+ return false;
+ };
+
+ UInt16 year = 0;
+ UInt8 month = 0;
+ UInt8 day = 0;
+
+ if (!append_digit(year)
+ || !append_digit(year) // NOLINT
+ || !append_digit(year) // NOLINT
+ || !append_digit(year)) // NOLINT
+ return error();
+
+ if (buf.eof())
+ return error();
+
+ if (isNumericASCII(*buf.position()))
+ {
+ /// YYYYMMDD
+ if (!append_digit(month)
+ || !append_digit(month) // NOLINT
+ || !append_digit(day)
+ || !append_digit(day)) // NOLINT
+ return error();
+ }
+ else
+ {
+ ++buf.position();
+
+ if (!append_digit(month))
+ return error();
+ append_digit(month);
+
+ if (!buf.eof() && !isNumericASCII(*buf.position()))
+ ++buf.position();
+ else
+ return error();
+
+ if (!append_digit(day))
+ return error();
+ append_digit(day);
+ }
+
+ date = LocalDate(year, month, day);
+ return ReturnType(true);
+}
+
+template void readDateTextFallback<void>(LocalDate &, ReadBuffer &);
+template bool readDateTextFallback<bool>(LocalDate &, ReadBuffer &);
+
+
+template <typename ReturnType>
+ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ /// YYYY-MM-DD
+ static constexpr auto date_broken_down_length = 10;
+ /// hh:mm:ss
+ static constexpr auto time_broken_down_length = 8;
+ /// YYYY-MM-DD hh:mm:ss
+ static constexpr auto date_time_broken_down_length = date_broken_down_length + 1 + time_broken_down_length;
+
+ char s[date_time_broken_down_length];
+ char * s_pos = s;
+
+ /** Read characters that could represent a unix timestamp.
+ * Only unix timestamps of at least 5 characters are supported.
+ * Then look at the 5th character. If it is a digit, treat the whole value as a unix timestamp.
+ * If it is not a digit, parse a datetime in YYYY-MM-DD hh:mm:ss or YYYY-MM-DD format.
+ */
+
+ /// A piece similar to unix timestamp, maybe scaled to subsecond precision.
+ while (s_pos < s + date_time_broken_down_length && !buf.eof() && isNumericASCII(*buf.position()))
+ {
+ *s_pos = *buf.position();
+ ++s_pos;
+ ++buf.position();
+ }
+
+ /// 2015-01-01 01:02:03 or 2015-01-01
+ if (s_pos == s + 4 && !buf.eof() && !isNumericASCII(*buf.position()))
+ {
+ const auto already_read_length = s_pos - s;
+ const size_t remaining_date_size = date_broken_down_length - already_read_length;
+
+ size_t size = buf.read(s_pos, remaining_date_size);
+ if (size != remaining_date_size)
+ {
+ s_pos[size] = 0;
+
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime {}", s);
+ else
+ return false;
+ }
+
+ UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0');
+ UInt8 month = (s[5] - '0') * 10 + (s[6] - '0');
+ UInt8 day = (s[8] - '0') * 10 + (s[9] - '0');
+
+ UInt8 hour = 0;
+ UInt8 minute = 0;
+ UInt8 second = 0;
+
+ if (!buf.eof() && (*buf.position() == ' ' || *buf.position() == 'T'))
+ {
+ ++buf.position();
+ size = buf.read(s, time_broken_down_length);
+
+ if (size != time_broken_down_length)
+ {
+ s[size] = 0;
+
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse time component of DateTime {}", s);
+ else
+ return false;
+ }
+
+ hour = (s[0] - '0') * 10 + (s[1] - '0');
+ minute = (s[3] - '0') * 10 + (s[4] - '0');
+ second = (s[6] - '0') * 10 + (s[7] - '0');
+ }
+
+ if (unlikely(year == 0))
+ datetime = 0;
+ else
+ datetime = date_lut.makeDateTime(year, month, day, hour, minute, second);
+ }
+ else
+ {
+ if (s_pos - s >= 5)
+ {
+ /// Not very efficient.
+ datetime = 0;
+ for (const char * digit_pos = s; digit_pos < s_pos; ++digit_pos)
+ datetime = datetime * 10 + *digit_pos - '0';
+ }
+ else
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse datetime");
+ else
+ return false;
+ }
+ }
+
+ return ReturnType(true);
+}
+
+template void readDateTimeTextFallback<void>(time_t &, ReadBuffer &, const DateLUTImpl &);
+template bool readDateTimeTextFallback<bool>(time_t &, ReadBuffer &, const DateLUTImpl &);
+
+
+void skipJSONField(ReadBuffer & buf, StringRef name_of_field)
+{
+ if (buf.eof())
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString());
+ else if (*buf.position() == '"') /// skip double-quoted string
+ {
+ NullOutput sink;
+ readJSONStringInto(sink, buf);
+ }
+ else if (isNumericASCII(*buf.position()) || *buf.position() == '-' || *buf.position() == '+' || *buf.position() == '.') /// skip number
+ {
+ if (*buf.position() == '+')
+ ++buf.position();
+
+ double v;
+ if (!tryReadFloatText(v, buf))
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Expected a number field for key '{}'", name_of_field.toString());
+ }
+ else if (*buf.position() == 'n') /// skip null
+ {
+ assertString("null", buf);
+ }
+ else if (*buf.position() == 't') /// skip true
+ {
+ assertString("true", buf);
+ }
+ else if (*buf.position() == 'f') /// skip false
+ {
+ assertString("false", buf);
+ }
+ else if (*buf.position() == '[')
+ {
+ ++buf.position();
+ skipWhitespaceIfAny(buf);
+
+ if (!buf.eof() && *buf.position() == ']') /// skip empty array
+ {
+ ++buf.position();
+ return;
+ }
+
+ while (true)
+ {
+ skipJSONField(buf, name_of_field);
+ skipWhitespaceIfAny(buf);
+
+ if (!buf.eof() && *buf.position() == ',')
+ {
+ ++buf.position();
+ skipWhitespaceIfAny(buf);
+ }
+ else if (!buf.eof() && *buf.position() == ']')
+ {
+ ++buf.position();
+ break;
+ }
+ else
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString());
+ }
+ }
+ else if (*buf.position() == '{') /// skip whole object
+ {
+ ++buf.position();
+ skipWhitespaceIfAny(buf);
+
+ while (!buf.eof() && *buf.position() != '}')
+ {
+ // field name
+ if (*buf.position() == '"')
+ {
+ NullOutput sink;
+ readJSONStringInto(sink, buf);
+ }
+ else
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString());
+
+ // ':'
+ skipWhitespaceIfAny(buf);
+ if (buf.eof() || !(*buf.position() == ':'))
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString());
+ ++buf.position();
+ skipWhitespaceIfAny(buf);
+
+ skipJSONField(buf, name_of_field);
+ skipWhitespaceIfAny(buf);
+
+ // optional ','
+ if (!buf.eof() && *buf.position() == ',')
+ {
+ ++buf.position();
+ skipWhitespaceIfAny(buf);
+ }
+ }
+
+ if (buf.eof())
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString());
+ ++buf.position();
+ }
+ else
+ {
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol '{}' for key '{}'",
+ std::string(*buf.position(), 1), name_of_field.toString());
+ }
+}
+
+
+Exception readException(ReadBuffer & buf, const String & additional_message, bool remote_exception)
+{
+ int code = 0;
+ String name;
+ String message;
+ String stack_trace;
+ bool has_nested = false; /// Obsolete
+
+ readBinaryLittleEndian(code, buf);
+ readBinary(name, buf);
+ readBinary(message, buf);
+ readBinary(stack_trace, buf);
+ readBinary(has_nested, buf);
+
+ WriteBufferFromOwnString out;
+
+ if (!additional_message.empty())
+ out << additional_message << ". ";
+
+ if (name != "DB::Exception")
+ out << name << ". ";
+
+ out << message << ".";
+
+ if (!stack_trace.empty())
+ out << " Stack trace:\n\n" << stack_trace;
+
+ return Exception::createDeprecated(out.str(), code, remote_exception);
+}
+
+void readAndThrowException(ReadBuffer & buf, const String & additional_message)
+{
+ readException(buf, additional_message).rethrow();
+}
+
+
+void skipToCarriageReturnOrEOF(ReadBuffer & buf)
+{
+ while (!buf.eof())
+ {
+ char * next_pos = find_first_symbols<'\r'>(buf.position(), buf.buffer().end());
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if (*buf.position() == '\r')
+ {
+ ++buf.position();
+ return;
+ }
+ }
+}
+
+
+void skipToNextLineOrEOF(ReadBuffer & buf)
+{
+ while (!buf.eof())
+ {
+ char * next_pos = find_first_symbols<'\n'>(buf.position(), buf.buffer().end());
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if (*buf.position() == '\n')
+ {
+ ++buf.position();
+ return;
+ }
+ }
+}
+
+
+void skipToUnescapedNextLineOrEOF(ReadBuffer & buf)
+{
+ while (!buf.eof())
+ {
+ char * next_pos = find_first_symbols<'\n', '\\'>(buf.position(), buf.buffer().end());
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if (*buf.position() == '\n')
+ {
+ ++buf.position();
+ return;
+ }
+
+ if (*buf.position() == '\\')
+ {
+ ++buf.position();
+ if (buf.eof())
+ return;
+
+ /// Skip escaped character. We do not consider escape sequences with more than one character after backslash (\x01).
+ /// It's ok for the purpose of this function, because we are interested only in \n and \\.
+ ++buf.position();
+ continue;
+ }
+ }
+}
+
+void skipNullTerminated(ReadBuffer & buf)
+{
+ while (!buf.eof())
+ {
+ char * next_pos = find_first_symbols<'\0'>(buf.position(), buf.buffer().end());
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if (*buf.position() == '\0')
+ {
+ ++buf.position();
+ return;
+ }
+ }
+}
+
+
+void saveUpToPosition(ReadBuffer & in, Memory<> & memory, char * current)
+{
+ assert(current >= in.position());
+ assert(current <= in.buffer().end());
+
+ const size_t old_bytes = memory.size();
+ const size_t additional_bytes = current - in.position();
+ const size_t new_bytes = old_bytes + additional_bytes;
+
+ /// There are no new bytes to add to memory.
+ /// No need to do extra stuff.
+ if (new_bytes == 0)
+ return;
+
+ assert(in.position() + additional_bytes <= in.buffer().end());
+ memory.resize(new_bytes);
+ memcpy(memory.data() + old_bytes, in.position(), additional_bytes);
+ in.position() = current;
+}
+
+bool loadAtPosition(ReadBuffer & in, Memory<> & memory, char * & current)
+{
+ assert(current <= in.buffer().end());
+
+ if (current < in.buffer().end())
+ return true;
+
+ saveUpToPosition(in, memory, current);
+
+ bool loaded_more = !in.eof();
+ // A sanity check. Buffer position may be in the beginning of the buffer
+ // (normal case), or have some offset from it (AIO).
+ assert(in.position() >= in.buffer().begin());
+ assert(in.position() <= in.buffer().end());
+ current = in.position();
+
+ return loaded_more;
+}
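+
+/// A minimal usage sketch (assumed caller, e.g. a row segmentation routine):
+/// accumulate the raw bytes of a row into `memory` across buffer refills.
+///
+///     char * pos = in.position();
+///     while (loadAtPosition(in, memory, pos) && *pos != '\n')  /// refills the buffer; already consumed bytes are saved
+///         ++pos;
+///     saveUpToPosition(in, memory, pos);  /// flush the tail up to (not including) '\n'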
+
+/// Searches for a delimiter in the input stream and sets the buffer position right after the delimiter (if found) or at EOF (if not found).
+static void findAndSkipNextDelimiter(PeekableReadBuffer & buf, const String & delimiter)
+{
+ if (delimiter.empty())
+ return;
+
+ while (!buf.eof())
+ {
+ void * pos = memchr(buf.position(), delimiter[0], buf.available());
+ if (!pos)
+ {
+ buf.position() += buf.available();
+ continue;
+ }
+
+ buf.position() = static_cast<ReadBuffer::Position>(pos);
+
+ PeekableReadBufferCheckpoint checkpoint{buf};
+ if (checkString(delimiter, buf))
+ return;
+
+ buf.rollbackToCheckpoint();
+ ++buf.position();
+ }
+}
+
+void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delimiter, const String & row_between_delimiter, bool skip_spaces)
+{
+ if (row_after_delimiter.empty())
+ {
+ findAndSkipNextDelimiter(buf, row_between_delimiter);
+ return;
+ }
+
+ while (true)
+ {
+ findAndSkipNextDelimiter(buf, row_after_delimiter);
+
+ if (skip_spaces)
+ skipWhitespaceIfAny(buf);
+
+ if (checkString(row_between_delimiter, buf))
+ break;
+ }
+}
+
+// Use PeekableReadBuffer to copy field to string after parsing.
+template <typename Vector, typename ParseFunc>
+static void readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_func)
+{
+ PeekableReadBuffer peekable_buf(buf);
+ peekable_buf.setCheckpoint();
+ parse_func(peekable_buf);
+ peekable_buf.makeContinuousMemoryFromCheckpointToPos();
+ auto * end = peekable_buf.position();
+ peekable_buf.rollbackToCheckpoint();
+ s.append(peekable_buf.position(), end);
+ peekable_buf.position() = end;
+}
+
+template <typename Vector>
+static void readQuotedStringFieldInto(Vector & s, ReadBuffer & buf)
+{
+ assertChar('\'', buf);
+ s.push_back('\'');
+ while (!buf.eof())
+ {
+ char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end());
+
+ s.append(buf.position(), next_pos);
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if (*buf.position() == '\'')
+ break;
+
+ s.push_back(*buf.position());
+ if (*buf.position() == '\\')
+ {
+ ++buf.position();
+ if (!buf.eof())
+ {
+ s.push_back(*buf.position());
+ ++buf.position();
+ }
+ }
+ }
+
+ if (buf.eof())
+ return;
+
+ ++buf.position();
+ s.push_back('\'');
+}
+
+template <char opening_bracket, char closing_bracket, typename Vector>
+static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf)
+{
+ assertChar(opening_bracket, buf);
+ s.push_back(opening_bracket);
+
+ size_t balance = 1;
+
+ while (!buf.eof() && balance)
+ {
+ char * next_pos = find_first_symbols<'\'', opening_bracket, closing_bracket>(buf.position(), buf.buffer().end());
+ appendToStringOrVector(s, buf, next_pos);
+ buf.position() = next_pos;
+
+ if (!buf.hasPendingData())
+ continue;
+
+ if (*buf.position() == '\'')
+ {
+ readQuotedStringFieldInto(s, buf);
+ }
+ else if (*buf.position() == opening_bracket)
+ {
+ s.push_back(opening_bracket);
+ ++balance;
+ ++buf.position();
+ }
+ else if (*buf.position() == closing_bracket)
+ {
+ s.push_back(closing_bracket);
+ --balance;
+ ++buf.position();
+ }
+ }
+}
+
+template <typename Vector>
+void readQuotedFieldInto(Vector & s, ReadBuffer & buf)
+{
+ if (buf.eof())
+ return;
+
+ /// Possible values in 'Quoted' field:
+ /// - Strings: '...'
+ /// - Arrays: [...]
+ /// - Tuples: (...)
+ /// - Maps: {...}
+ /// - NULL
+ /// - Bool: true/false
+ /// - Number: integer, float, decimal.
+
+ if (*buf.position() == '\'')
+ readQuotedStringFieldInto(s, buf);
+ else if (*buf.position() == '[')
+ readQuotedFieldInBracketsInto<'[', ']'>(s, buf);
+ else if (*buf.position() == '(')
+ readQuotedFieldInBracketsInto<'(', ')'>(s, buf);
+ else if (*buf.position() == '{')
+ readQuotedFieldInBracketsInto<'{', '}'>(s, buf);
+ else if (checkCharCaseInsensitive('n', buf))
+ {
+ /// NULL or NaN
+ if (checkCharCaseInsensitive('u', buf))
+ {
+ assertStringCaseInsensitive("ll", buf);
+ s.append("NULL");
+ }
+ else
+ {
+ assertStringCaseInsensitive("an", buf);
+ s.append("NaN");
+ }
+ }
+ else if (checkCharCaseInsensitive('t', buf))
+ {
+ assertStringCaseInsensitive("rue", buf);
+ s.append("true");
+ }
+ else if (checkCharCaseInsensitive('f', buf))
+ {
+ assertStringCaseInsensitive("alse", buf);
+ s.append("false");
+ }
+ else
+ {
+ /// It's an integer, float or decimal. They all can be parsed as float.
+ auto parse_func = [](ReadBuffer & in)
+ {
+ Float64 tmp;
+ readFloatText(tmp, in);
+ };
+ readParsedValueInto(s, buf, parse_func);
+ }
+}
+
+template void readQuotedFieldInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
+
+void readQuotedField(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readQuotedFieldInto(s, buf);
+}
+
+void readJSONField(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ auto parse_func = [](ReadBuffer & in) { skipJSONField(in, "json_field"); };
+ readParsedValueInto(s, buf, parse_func);
+}
+
+void readTSVField(String & s, ReadBuffer & buf)
+{
+ s.clear();
+ readEscapedStringIntoImpl<String, false>(s, buf);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadHelpers.h b/contrib/clickhouse/src/IO/ReadHelpers.h
new file mode 100644
index 0000000000..f99c78fdf1
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadHelpers.h
@@ -0,0 +1,1619 @@
+#pragma once
+
+#include <cmath>
+#include <cstring>
+#include <string>
+#include <string_view>
+#include <limits>
+#include <algorithm>
+#include <iterator>
+#include <bit>
+#include <span>
+
+#include <type_traits>
+
+#include <Common/StackTrace.h>
+#include <Common/formatIPv6.h>
+#include <Common/DateLUT.h>
+#include <Common/LocalDate.h>
+#include <Common/LocalDateTime.h>
+#include <Common/TransformEndianness.hpp>
+#include <base/StringRef.h>
+#include <base/arithmeticOverflow.h>
+#include <base/sort.h>
+#include <base/unit.h>
+
+#include <Core/Types.h>
+#include <Core/DecimalFunctions.h>
+#include <Core/UUID.h>
+#include <base/IPv4andIPv6.h>
+
+#include <Common/Allocator.h>
+#include <Common/Exception.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <Common/intExp.h>
+
+#include <Formats/FormatSettings.h>
+
+#include <IO/CompressionMethod.h>
+#include <IO/ReadBuffer.h>
+#include <IO/ReadBufferFromMemory.h>
+#include <IO/PeekableReadBuffer.h>
+#include <IO/VarInt.h>
+
+#include <double-conversion/double-conversion.h>
+
+static constexpr auto DEFAULT_MAX_STRING_SIZE = 1_GiB;
+
+namespace DB
+{
+
+template <typename Allocator>
+struct Memory;
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_PARSE_DATE;
+ extern const int CANNOT_PARSE_BOOL;
+ extern const int CANNOT_PARSE_DATETIME;
+ extern const int CANNOT_PARSE_UUID;
+ extern const int CANNOT_PARSE_IPV4;
+ extern const int CANNOT_PARSE_IPV6;
+ extern const int CANNOT_READ_ARRAY_FROM_TEXT;
+ extern const int CANNOT_PARSE_NUMBER;
+ extern const int INCORRECT_DATA;
+ extern const int TOO_LARGE_STRING_SIZE;
+ extern const int TOO_LARGE_ARRAY_SIZE;
+ extern const int SIZE_OF_FIXED_STRING_DOESNT_MATCH;
+}
+
+/// Helper functions for formatted input.
+
+inline char parseEscapeSequence(char c)
+{
+ switch (c)
+ {
+ case 'a':
+ return '\a';
+ case 'b':
+ return '\b';
+ case 'e':
+ return '\x1B'; /// The \e escape sequence is non-standard in C and C++ but supported by gcc and clang.
+ case 'f':
+ return '\f';
+ case 'n':
+ return '\n';
+ case 'r':
+ return '\r';
+ case 't':
+ return '\t';
+ case 'v':
+ return '\v';
+ case '0':
+ return '\0';
+ default:
+ return c;
+ }
+}
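+
+/// For example, parseEscapeSequence('n') == '\n', while an unknown escape such as
+/// parseEscapeSequence('q') returns the character unchanged ('q').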
+
+
+/// Function throwReadAfterEOF is located in VarInt.h
+
+
+inline void readChar(char & x, ReadBuffer & buf)
+{
+ if (buf.eof()) [[unlikely]]
+ throwReadAfterEOF();
+ x = *buf.position();
+ ++buf.position();
+}
+
+
+/// Read POD-type in native format
+template <typename T>
+inline void readPODBinary(T & x, ReadBuffer & buf)
+{
+ buf.readStrict(reinterpret_cast<char *>(&x), sizeof(x)); /// NOLINT
+}
+
+inline void readUUIDBinary(UUID & x, ReadBuffer & buf)
+{
+ auto & uuid = x.toUnderType();
+ readPODBinary(uuid.items[0], buf);
+ readPODBinary(uuid.items[1], buf);
+}
+
+template <typename T>
+inline void readIntBinary(T & x, ReadBuffer & buf)
+{
+ readPODBinary(x, buf);
+}
+
+template <typename T>
+inline void readFloatBinary(T & x, ReadBuffer & buf)
+{
+ readPODBinary(x, buf);
+}
+
+inline void readStringBinary(std::string & s, ReadBuffer & buf, size_t max_string_size = DEFAULT_MAX_STRING_SIZE)
+{
+ size_t size = 0;
+ readVarUInt(size, buf);
+
+ if (size > max_string_size)
+ throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large string size.");
+
+ s.resize(size);
+ buf.readStrict(s.data(), size);
+}
+
+/// For historical reasons we store IPv6 as a String
+inline void readIPv6Binary(IPv6 & ip, ReadBuffer & buf)
+{
+ size_t size = 0;
+ readVarUInt(size, buf);
+
+ if (size != IPV6_BINARY_LENGTH)
+ throw Exception(ErrorCodes::SIZE_OF_FIXED_STRING_DOESNT_MATCH,
+ "Size of the string {} doesn't match size of binary IPv6 {}", size, IPV6_BINARY_LENGTH);
+
+ buf.readStrict(reinterpret_cast<char*>(&ip.toUnderType()), size);
+}
+
+template <typename T>
+void readVectorBinary(std::vector<T> & v, ReadBuffer & buf)
+{
+ size_t size = 0;
+ readVarUInt(size, buf);
+
+ if (size > DEFAULT_MAX_STRING_SIZE)
+ throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
+ "Too large array size (maximum: {})", DEFAULT_MAX_STRING_SIZE);
+
+ v.resize(size);
+ for (size_t i = 0; i < size; ++i)
+ readBinary(v[i], buf);
+}
+
+
+void assertString(const char * s, ReadBuffer & buf);
+void assertEOF(ReadBuffer & buf);
+void assertNotEOF(ReadBuffer & buf);
+
+[[noreturn]] void throwAtAssertionFailed(const char * s, ReadBuffer & buf);
+
+inline bool checkChar(char c, ReadBuffer & buf)
+{
+ char a;
+ if (!buf.peek(a) || a != c)
+ return false;
+ buf.ignore();
+ return true;
+}
+
+inline void assertChar(char symbol, ReadBuffer & buf)
+{
+ if (!checkChar(symbol, buf))
+ {
+ char err[2] = {symbol, '\0'};
+ throwAtAssertionFailed(err, buf);
+ }
+}
+
+inline bool checkCharCaseInsensitive(char c, ReadBuffer & buf)
+{
+ char a;
+ if (!buf.peek(a) || !equalsCaseInsensitive(a, c))
+ return false;
+ buf.ignore();
+ return true;
+}
+
+inline void assertString(const String & s, ReadBuffer & buf)
+{
+ assertString(s.c_str(), buf);
+}
+
+bool checkString(const char * s, ReadBuffer & buf);
+inline bool checkString(const String & s, ReadBuffer & buf)
+{
+ return checkString(s.c_str(), buf);
+}
+
+bool checkStringCaseInsensitive(const char * s, ReadBuffer & buf);
+inline bool checkStringCaseInsensitive(const String & s, ReadBuffer & buf)
+{
+ return checkStringCaseInsensitive(s.c_str(), buf);
+}
+
+void assertStringCaseInsensitive(const char * s, ReadBuffer & buf);
+inline void assertStringCaseInsensitive(const String & s, ReadBuffer & buf)
+{
+ return assertStringCaseInsensitive(s.c_str(), buf);
+}
+
+/** Check that the next character in buf matches the first character of s.
+ * If it does, check all remaining characters of s and throw an exception if they don't match.
+ * If it doesn't, return false and leave the buffer position unchanged.
+ */
+bool checkStringByFirstCharacterAndAssertTheRest(const char * s, ReadBuffer & buf);
+bool checkStringByFirstCharacterAndAssertTheRestCaseInsensitive(const char * s, ReadBuffer & buf);
+
+inline bool checkStringByFirstCharacterAndAssertTheRest(const String & s, ReadBuffer & buf)
+{
+ return checkStringByFirstCharacterAndAssertTheRest(s.c_str(), buf);
+}
+
+inline bool checkStringByFirstCharacterAndAssertTheRestCaseInsensitive(const String & s, ReadBuffer & buf)
+{
+ return checkStringByFirstCharacterAndAssertTheRestCaseInsensitive(s.c_str(), buf);
+}
+
+
+inline void readBoolText(bool & x, ReadBuffer & buf)
+{
+ char tmp = '0';
+ readChar(tmp, buf);
+ x = tmp != '0';
+}
+
+inline void readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case = false)
+{
+ if (buf.eof()) [[unlikely]]
+ throwReadAfterEOF();
+
+ switch (*buf.position())
+ {
+ case 't':
+ assertString("true", buf);
+ x = true;
+ break;
+ case 'f':
+ assertString("false", buf);
+ x = false;
+ break;
+ case 'T':
+ {
+ if (support_upper_case)
+ {
+ assertString("TRUE", buf);
+ x = true;
+ break;
+ }
+ else
+ [[fallthrough]];
+ }
+ case 'F':
+ {
+ if (support_upper_case)
+ {
+ assertString("FALSE", buf);
+ x = false;
+ break;
+ }
+ else
+ [[fallthrough]];
+ }
+ default:
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_BOOL, "Unexpected Bool value");
+ }
+}
+
+enum class ReadIntTextCheckOverflow
+{
+ DO_NOT_CHECK_OVERFLOW,
+ CHECK_OVERFLOW,
+};
+
+template <typename T, typename ReturnType = void, ReadIntTextCheckOverflow check_overflow = ReadIntTextCheckOverflow::DO_NOT_CHECK_OVERFLOW>
+ReturnType readIntTextImpl(T & x, ReadBuffer & buf)
+{
+ using UnsignedT = make_unsigned_t<T>;
+
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ bool negative = false;
+ UnsignedT res{};
+ if (buf.eof()) [[unlikely]]
+ {
+ if constexpr (throw_exception)
+ throwReadAfterEOF();
+ else
+ return ReturnType(false);
+ }
+
+ const size_t initial_pos = buf.count();
+ bool has_sign = false;
+ bool has_number = false;
+ while (!buf.eof())
+ {
+ switch (*buf.position())
+ {
+ case '+':
+ {
+ /// 123+ or +123+, just stop after 123 or +123.
+ if (has_number)
+ goto end;
+
+ /// No digits read yet, but we already read sign, like ++, -+.
+ if (has_sign)
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER,
+ "Cannot parse number with multiple sign (+/-) characters");
+ else
+ return ReturnType(false);
+ }
+
+ has_sign = true;
+ break;
+ }
+ case '-':
+ {
+ if (has_number)
+ goto end;
+
+ if (has_sign)
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER,
+ "Cannot parse number with multiple sign (+/-) characters");
+ else
+ return ReturnType(false);
+ }
+
+ if constexpr (is_signed_v<T>)
+ negative = true;
+ else
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Unsigned type must not contain '-' symbol");
+ else
+ return ReturnType(false);
+ }
+ has_sign = true;
+ break;
+ }
+ case '0': [[fallthrough]];
+ case '1': [[fallthrough]];
+ case '2': [[fallthrough]];
+ case '3': [[fallthrough]];
+ case '4': [[fallthrough]];
+ case '5': [[fallthrough]];
+ case '6': [[fallthrough]];
+ case '7': [[fallthrough]];
+ case '8': [[fallthrough]];
+ case '9':
+ {
+ has_number = true;
+ if constexpr (check_overflow == ReadIntTextCheckOverflow::CHECK_OVERFLOW && !is_big_int_v<T>)
+ {
+ /// Perform the relatively slow overflow check only when
+ /// the number of decimal digits read so far is close to the maximum for the given type.
+ /// Example: 20 * 10 will overflow Int8.
+
+ if (buf.count() - initial_pos + 1 >= std::numeric_limits<T>::max_digits10)
+ {
+ if (negative)
+ {
+ T signed_res = -res;
+ if (common::mulOverflow<T>(signed_res, 10, signed_res) ||
+ common::subOverflow<T>(signed_res, (*buf.position() - '0'), signed_res))
+ return ReturnType(false);
+
+ res = -static_cast<UnsignedT>(signed_res);
+ }
+ else
+ {
+ T signed_res = res;
+ if (common::mulOverflow<T>(signed_res, 10, signed_res) ||
+ common::addOverflow<T>(signed_res, (*buf.position() - '0'), signed_res))
+ return ReturnType(false);
+
+ res = signed_res;
+ }
+ break;
+ }
+ }
+ res *= 10;
+ res += *buf.position() - '0';
+ break;
+ }
+ default:
+ goto end;
+ }
+ ++buf.position();
+ }
+
+end:
+ if (has_sign && !has_number)
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER,
+ "Cannot parse number with a sign character but without any numeric character");
+ else
+ return ReturnType(false);
+ }
+ x = res;
+ if constexpr (is_signed_v<T>)
+ {
+ if (negative)
+ {
+ if constexpr (check_overflow == ReadIntTextCheckOverflow::CHECK_OVERFLOW)
+ {
+ if (common::mulOverflow<UnsignedT, Int8, T>(res, -1, x))
+ return ReturnType(false);
+ }
+ else
+ x = -res;
+ }
+ }
+
+ return ReturnType(true);
+}
+
+template <ReadIntTextCheckOverflow check_overflow = ReadIntTextCheckOverflow::DO_NOT_CHECK_OVERFLOW, typename T>
+void readIntText(T & x, ReadBuffer & buf)
+{
+ if constexpr (is_decimal<T>)
+ {
+ readIntText<check_overflow>(x.value, buf);
+ }
+ else
+ {
+ readIntTextImpl<T, void, check_overflow>(x, buf);
+ }
+}
+
+template <ReadIntTextCheckOverflow check_overflow = ReadIntTextCheckOverflow::CHECK_OVERFLOW, typename T>
+bool tryReadIntText(T & x, ReadBuffer & buf)
+{
+ return readIntTextImpl<T, bool, check_overflow>(x, buf);
+}
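+
+/// A minimal sketch of the two entry points above (assuming ReadBufferFromString is included):
+///     Int32 x;
+///     ReadBufferFromString in("-123");
+///     readIntText(x, in);                  /// x == -123; throws on malformed input
+///     UInt8 y;
+///     ReadBufferFromString in2("300");
+///     bool ok = tryReadIntText(y, in2);    /// ok == false: 300 overflows UInt8 (overflow is checked)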
+
+
+/** More efficient variant (about 1.5 times faster on a real dataset).
+ * Differs in the following ways:
+ * - for numbers starting with zero, only the zero is parsed;
+ * - the '+' sign before a number is not supported.
+ */
+template <typename T, bool throw_on_error = true>
+void readIntTextUnsafe(T & x, ReadBuffer & buf)
+{
+ bool negative = false;
+ make_unsigned_t<T> res = 0;
+
+ auto on_error = []
+ {
+ if (throw_on_error)
+ throwReadAfterEOF();
+ };
+
+ if (buf.eof()) [[unlikely]]
+ return on_error();
+
+ if (is_signed_v<T> && *buf.position() == '-')
+ {
+ ++buf.position();
+ negative = true;
+ if (buf.eof()) [[unlikely]]
+ return on_error();
+ }
+
+ if (*buf.position() == '0') /// There are many zeros in real datasets.
+ {
+ ++buf.position();
+ x = 0;
+ return;
+ }
+
+ while (!buf.eof())
+ {
+ unsigned char value = *buf.position() - '0';
+
+ if (value < 10)
+ {
+ res *= 10;
+ res += value;
+ ++buf.position();
+ }
+ else
+ break;
+ }
+
+ /// Arithmetic is done in the unsigned type: unsigned negation is well-defined, avoiding signed-overflow UB.
+ x = is_signed_v<T> && negative ? -res : res;
+}
+
+template <typename T>
+void tryReadIntTextUnsafe(T & x, ReadBuffer & buf)
+{
+ return readIntTextUnsafe<T, false>(x, buf);
+}
+
+
+/// Look at readFloatText.h
+template <typename T> void readFloatText(T & x, ReadBuffer & in);
+template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in);
+
+template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in);
+template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in);
+template <typename T> void readFloatTextFast(T & x, ReadBuffer & in);
+template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in);
+
+
+/// Simple: read everything until '\n' or '\t'.
+void readString(String & s, ReadBuffer & buf);
+
+void readEscapedString(String & s, ReadBuffer & buf);
+
+void readQuotedString(String & s, ReadBuffer & buf);
+void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf);
+
+void readDoubleQuotedString(String & s, ReadBuffer & buf);
+void readDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf);
+
+void readJSONString(String & s, ReadBuffer & buf);
+
+void readBackQuotedString(String & s, ReadBuffer & buf);
+void readBackQuotedStringWithSQLStyle(String & s, ReadBuffer & buf);
+
+void readStringUntilEOF(String & s, ReadBuffer & buf);
+
+// Reads the line until EOL, unescaping backslash escape sequences.
+// Buffer pointer is left at EOL, don't forget to advance it.
+void readEscapedStringUntilEOL(String & s, ReadBuffer & buf);
+
+/// Only 0x20 (space) is treated as a whitespace character.
+void readStringUntilWhitespace(String & s, ReadBuffer & buf);
+
+
+/** Read string in CSV format.
+ * Parsing rules:
+ * - string could be placed in quotes; quotes could be single: ' if FormatSettings::CSV::allow_single_quotes is true
+ * or double: " if FormatSettings::CSV::allow_double_quotes is true;
+ * - or string could be unquoted - this is determined by first character;
+ * - if string is unquoted, then:
+ * - if settings.custom_delimiter is not specified, it is read until the next settings.delimiter, end of line (CR or LF), or end of stream;
+ * - if settings.custom_delimiter is specified, it is read until the first occurrence of settings.custom_delimiter in the buffer
+ * (this works only if the provided buffer is a PeekableReadBuffer);
+ * - spaces and tabs at the beginning and end of an unquoted string are consumed but ignored (note that this behaviour differs from the RFC);
+ * - if string is in quotes, then it will be read until closing quote,
+ * but sequences of two consecutive quotes are parsed as single quote inside string;
+ */
+void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
+
+/// Differs from readCSVString in that it doesn't remove quotes around the field, if any.
+void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
+
+/// Read string in CSV format until the first occurrence of first_delimiter or second_delimiter.
+/// As in readCSVString, if the string is quoted, only the data inside the quotes is read.
+String readCSVStringWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter);
+
+/// Same as above but includes quotes in the result if any.
+String readCSVFieldWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const FormatSettings::CSV & settings, const String & first_delimiter, const String & second_delimiter);
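+
+/// A usage sketch for the CSV readers above (illustrative; default settings assumed:
+/// delimiter ',', double quotes enabled):
+///     FormatSettings::CSV csv;
+///     String s;
+///     ReadBufferFromString in("\"a,b\",c");
+///     readCSVString(s, in, csv);   /// s == "a,b" (surrounding quotes removed)
+///     assertChar(',', in);
+///     readCSVField(s, in, csv);    /// s == "c"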
+
+/// Read and append result to array of characters.
+template <typename Vector>
+void readStringInto(Vector & s, ReadBuffer & buf);
+
+template <typename Vector>
+void readNullTerminated(Vector & s, ReadBuffer & buf);
+
+template <typename Vector>
+void readEscapedStringInto(Vector & s, ReadBuffer & buf);
+
+template <bool enable_sql_style_quoting, typename Vector>
+void readQuotedStringInto(Vector & s, ReadBuffer & buf);
+
+template <bool enable_sql_style_quoting, typename Vector>
+void readDoubleQuotedStringInto(Vector & s, ReadBuffer & buf);
+
+template <bool enable_sql_style_quoting, typename Vector>
+void readBackQuotedStringInto(Vector & s, ReadBuffer & buf);
+
+template <typename Vector>
+void readStringUntilEOFInto(Vector & s, ReadBuffer & buf);
+
+template <typename Vector, bool include_quotes = false>
+void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
+
+/// ReturnType is either bool or void. If bool, the function will return false instead of throwing an exception.
+template <typename Vector, typename ReturnType = void>
+ReturnType readJSONStringInto(Vector & s, ReadBuffer & buf);
+
+template <typename Vector>
+bool tryReadJSONStringInto(Vector & s, ReadBuffer & buf)
+{
+ return readJSONStringInto<Vector, bool>(s, buf);
+}
+
+template <typename Vector>
+bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf);
+
+/// Reads a chunk of data between {} such that the sequence of {} brackets is balanced.
+/// So it may form a JSON object, but the object itself can be incorrect.
+template <typename Vector, typename ReturnType = void>
+ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf);
+
+template <typename Vector>
+void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf);
+
+template <typename Vector>
+void readStringUntilNewlineInto(Vector & s, ReadBuffer & buf);
+
+/// This can be used as a template parameter for the functions above, if you just want to skip the data.
+struct NullOutput
+{
+ void append(const char *, size_t) {}
+ void append(const char *) {}
+ void append(const char *, const char *) {}
+ void push_back(char) {} /// NOLINT
+};
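+
+/// Example: skip a double-quoted JSON string without materializing it
+/// (this is how skipJSONField consumes string values):
+///     NullOutput sink;
+///     readJSONStringInto(sink, buf);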
+
+template <typename ReturnType>
+ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf);
+
+/// In YYYY-MM-DD format.
+/// For convenience, the Month and Day parts may consist of a single digit instead of two.
+/// Any non-digit separator is supported, not only '-'.
+template <typename ReturnType = void>
+inline ReturnType readDateTextImpl(LocalDate & date, ReadBuffer & buf)
+{
+ /// Optimistic path, when whole value is in buffer.
+ if (!buf.eof() && buf.position() + 10 <= buf.buffer().end())
+ {
+ char * pos = buf.position();
+
+ /// YYYY-MM-DD
+ /// YYYY-MM-D
+ /// YYYY-M-DD
+ /// YYYY-M-D
+ /// YYYYMMDD
+
+ /// The delimiters can be arbitrary characters, like YYYY/MM!DD, but obviously not digits.
+
+ UInt16 year = (pos[0] - '0') * 1000 + (pos[1] - '0') * 100 + (pos[2] - '0') * 10 + (pos[3] - '0');
+ UInt8 month;
+ UInt8 day;
+ pos += 5;
+
+ if (isNumericASCII(pos[-1]))
+ {
+ /// YYYYMMDD
+ month = (pos[-1] - '0') * 10 + (pos[0] - '0');
+ day = (pos[1] - '0') * 10 + (pos[2] - '0');
+ pos += 3;
+ }
+ else
+ {
+ month = pos[0] - '0';
+ if (isNumericASCII(pos[1]))
+ {
+ month = month * 10 + pos[1] - '0';
+ pos += 3;
+ }
+ else
+ pos += 2;
+
+ if (isNumericASCII(pos[-1]))
+ return ReturnType(false);
+
+ day = pos[0] - '0';
+ if (isNumericASCII(pos[1]))
+ {
+ day = day * 10 + pos[1] - '0';
+ pos += 2;
+ }
+ else
+ pos += 1;
+ }
+
+ buf.position() = pos;
+ date = LocalDate(year, month, day);
+ return ReturnType(true);
+ }
+ else
+ return readDateTextFallback<ReturnType>(date, buf);
+}
+
+inline void convertToDayNum(DayNum & date, ExtendedDayNum & from)
+{
+ if (unlikely(from < 0))
+ date = 0;
+ else if (unlikely(from > 0xFFFF))
+ date = 0xFFFF;
+ else
+ date = from;
+}
+
+template <typename ReturnType = void>
+inline ReturnType readDateTextImpl(DayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ LocalDate local_date;
+
+ if constexpr (throw_exception)
+ readDateTextImpl<ReturnType>(local_date, buf);
+ else if (!readDateTextImpl<ReturnType>(local_date, buf))
+ return false;
+
+ ExtendedDayNum ret = date_lut.makeDayNum(local_date.year(), local_date.month(), local_date.day());
+ convertToDayNum(date, ret);
+ return ReturnType(true);
+}
+
+template <typename ReturnType = void>
+inline ReturnType readDateTextImpl(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ LocalDate local_date;
+
+ if constexpr (throw_exception)
+ readDateTextImpl<ReturnType>(local_date, buf);
+ else if (!readDateTextImpl<ReturnType>(local_date, buf))
+ return false;
+
+ /// When the value is invalid or out of range, Date32 uses 1925-01-01 as the default value (-DateLUT::instance().getDayNumOffsetEpoch(), i.e. -16436) and Date uses 1970-01-01.
+ date = date_lut.makeDayNum(local_date.year(), local_date.month(), local_date.day(), -static_cast<Int32>(date_lut.getDayNumOffsetEpoch()));
+ return ReturnType(true);
+}
+
+
+inline void readDateText(LocalDate & date, ReadBuffer & buf)
+{
+ readDateTextImpl<void>(date, buf);
+}
+
+inline void readDateText(DayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut = DateLUT::instance())
+{
+ readDateTextImpl<void>(date, buf, date_lut);
+}
+
+inline void readDateText(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut = DateLUT::instance())
+{
+ readDateTextImpl<void>(date, buf, date_lut);
+}
+
+inline bool tryReadDateText(LocalDate & date, ReadBuffer & buf)
+{
+ return readDateTextImpl<bool>(date, buf);
+}
+
+inline bool tryReadDateText(DayNum & date, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+{
+ return readDateTextImpl<bool>(date, buf, time_zone);
+}
+
+inline bool tryReadDateText(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+{
+ return readDateTextImpl<bool>(date, buf, time_zone);
+}
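+
+/// A sketch of accepted inputs (illustrative): "2015-01-05", "2015-1-5", "20150105"
+/// and "2015/01!05" all parse to the same date:
+///     LocalDate d;
+///     ReadBufferFromString in("2015-1-5");
+///     readDateText(d, in);   /// d == LocalDate(2015, 1, 5)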
+
+UUID parseUUID(std::span<const UInt8> src);
+
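+/// Reads a UUID in text form. Both the 36-character dashed form
+/// (xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx) and the 32-character form without dashes
+/// are accepted; the dashed form is detected by a '-' at position 8.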
+template <typename ReturnType = void>
+inline ReturnType readUUIDTextImpl(UUID & uuid, ReadBuffer & buf)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ char s[36];
+ size_t size = buf.read(s, 32);
+
+ if (size == 32)
+ {
+ if (s[8] == '-')
+ {
+ size += buf.read(&s[32], 4);
+
+ if (size != 36)
+ {
+ s[size] = 0;
+
+ if constexpr (throw_exception)
+ {
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse uuid {}", s);
+ }
+ else
+ {
+ return ReturnType(false);
+ }
+ }
+ }
+
+ uuid = parseUUID({reinterpret_cast<const UInt8 *>(s), size});
+ return ReturnType(true);
+ }
+ else
+ {
+ s[size] = 0;
+
+ if constexpr (throw_exception)
+ {
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse uuid {}", s);
+ }
+ else
+ {
+ return ReturnType(false);
+ }
+ }
+}
+
+inline void readUUIDText(UUID & uuid, ReadBuffer & buf)
+{
+ return readUUIDTextImpl<void>(uuid, buf);
+}
+
+inline bool tryReadUUIDText(UUID & uuid, ReadBuffer & buf)
+{
+ return readUUIDTextImpl<bool>(uuid, buf);
+}
+
+template <typename ReturnType = void>
+inline ReturnType readIPv4TextImpl(IPv4 & ip, ReadBuffer & buf)
+{
+ if (parseIPv4(buf.position(), [&buf](){ return buf.eof(); }, reinterpret_cast<unsigned char *>(&ip.toUnderType())))
+ return ReturnType(true);
+
+ if constexpr (std::is_same_v<ReturnType, void>)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_IPV4, "Cannot parse IPv4 {}", std::string_view(buf.position(), buf.available()));
+ else
+ return ReturnType(false);
+}
+
+inline void readIPv4Text(IPv4 & ip, ReadBuffer & buf)
+{
+ return readIPv4TextImpl<void>(ip, buf);
+}
+
+inline bool tryReadIPv4Text(IPv4 & ip, ReadBuffer & buf)
+{
+ return readIPv4TextImpl<bool>(ip, buf);
+}
+
+template <typename ReturnType = void>
+inline ReturnType readIPv6TextImpl(IPv6 & ip, ReadBuffer & buf)
+{
+ if (parseIPv6orIPv4(buf.position(), [&buf](){ return buf.eof(); }, reinterpret_cast<unsigned char *>(ip.toUnderType().items)))
+ return ReturnType(true);
+
+ if constexpr (std::is_same_v<ReturnType, void>)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_IPV6, "Cannot parse IPv6 {}", std::string_view(buf.position(), buf.available()));
+ else
+ return ReturnType(false);
+}
+
+inline void readIPv6Text(IPv6 & ip, ReadBuffer & buf)
+{
+ return readIPv6TextImpl<void>(ip, buf);
+}
+
+inline bool tryReadIPv6Text(IPv6 & ip, ReadBuffer & buf)
+{
+ return readIPv6TextImpl<bool>(ip, buf);
+}
+
+template <typename T>
+inline T parse(const char * data, size_t size);
+
+template <typename T>
+inline T parseFromString(std::string_view str)
+{
+ return parse<T>(str.data(), str.size());
+}
+
+
+template <typename ReturnType = void>
+ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut);
+
+/** In YYYY-MM-DD hh:mm:ss or YYYY-MM-DD format, according to the specified time zone.
+ * As an exception, parsing a unix timestamp in the form of a decimal number is also supported.
+ */
+template <typename ReturnType = void>
+inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut)
+{
+ /// Optimistic path, when whole value is in buffer.
+ const char * s = buf.position();
+
+ /// YYYY-MM-DD hh:mm:ss
+ static constexpr auto date_time_broken_down_length = 19;
+ /// YYYY-MM-DD
+ static constexpr auto date_broken_down_length = 10;
+ bool optimistic_path_for_date_time_input = s + date_time_broken_down_length <= buf.buffer().end();
+
+ if (optimistic_path_for_date_time_input)
+ {
+ if (s[4] < '0' || s[4] > '9')
+ {
+ UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0');
+ UInt8 month = (s[5] - '0') * 10 + (s[6] - '0');
+ UInt8 day = (s[8] - '0') * 10 + (s[9] - '0');
+
+ UInt8 hour = 0;
+ UInt8 minute = 0;
+ UInt8 second = 0;
+
+ /// Determine whether it is YYYY-MM-DD hh:mm:ss or YYYY-MM-DD by looking at the tenth character (optimistic path).
+ bool dt_long = (s[10] == ' ' || s[10] == 'T');
+ if (dt_long)
+ {
+ hour = (s[11] - '0') * 10 + (s[12] - '0');
+ minute = (s[14] - '0') * 10 + (s[15] - '0');
+ second = (s[17] - '0') * 10 + (s[18] - '0');
+ }
+
+ if (unlikely(year == 0))
+ datetime = 0;
+ else
+ datetime = date_lut.makeDateTime(year, month, day, hour, minute, second);
+
+ if (dt_long)
+ buf.position() += date_time_broken_down_length;
+ else
+ buf.position() += date_broken_down_length;
+
+ return ReturnType(true);
+ }
+ else
+ /// Why not readIntTextUnsafe? Because, for the needs of AdFox, parsing of unix timestamps with leading zeros is supported: 000...NNNN.
+ return readIntTextImpl<time_t, ReturnType, ReadIntTextCheckOverflow::CHECK_OVERFLOW>(datetime, buf);
+ }
+ else
+ return readDateTimeTextFallback<ReturnType>(datetime, buf, date_lut);
+}
+
+template <typename ReturnType>
+inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut)
+{
+ time_t whole;
+ if (!readDateTimeTextImpl<bool>(whole, buf, date_lut))
+ {
+ return ReturnType(false);
+ }
+
+ int negative_multiplier = 1;
+
+ DB::DecimalUtils::DecimalComponents<DateTime64> components{static_cast<DateTime64::NativeType>(whole), 0};
+
+ if (!buf.eof() && *buf.position() == '.')
+ {
+ ++buf.position();
+
+ /// Read digits, up to 'scale' positions.
+ for (size_t i = 0; i < scale; ++i)
+ {
+ if (!buf.eof() && isNumericASCII(*buf.position()))
+ {
+ components.fractional *= 10;
+ components.fractional += *buf.position() - '0';
+ ++buf.position();
+ }
+ else
+ {
+ /// Adjust to scale.
+ components.fractional *= 10;
+ }
+ }
+
+ /// Ignore digits that are out of precision.
+ while (!buf.eof() && isNumericASCII(*buf.position()))
+ ++buf.position();
+
+ /// The fractional part (subseconds) is treated as positive by users
+ /// (the value reads as positive even though the underlying decimal is negative).
+ /// Making the fractional part negative when the whole part is 0 would produce a wrong value,
+ /// so we multiply the result by -1 instead.
+ if (components.whole < 0 && components.fractional != 0)
+ {
+ const auto scale_multiplier = DecimalUtils::scaleMultiplier<DateTime64::NativeType>(scale);
+ ++components.whole;
+ components.fractional = scale_multiplier - components.fractional;
+ if (!components.whole)
+ {
+ negative_multiplier = -1;
+ }
+ }
+ }
+ /// 10413792000 is time_t value for 2300-01-01 UTC (a bit over the last year supported by DateTime64)
+ else if (whole >= 10413792000LL)
+ {
+ /// Unix timestamp with subsecond precision, already scaled to integer.
+ /// For disambiguation, we support only times since 2001-09-09 01:46:40 UTC and less than 30,000 years in the future.
+ components.fractional = components.whole % common::exp10_i32(scale);
+ components.whole = components.whole / common::exp10_i32(scale);
+ }
+
+ bool is_ok = true;
+ if constexpr (std::is_same_v<ReturnType, void>)
+ {
+ datetime64 = DecimalUtils::decimalFromComponents<DateTime64>(components, scale) * negative_multiplier;
+ }
+ else
+ {
+ is_ok = DecimalUtils::tryGetDecimalFromComponents<DateTime64>(components, scale, datetime64);
+ if (is_ok)
+ datetime64 *= negative_multiplier;
+ }
+
+ return ReturnType(is_ok);
+}
+
+inline void readDateTimeText(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+{
+ readDateTimeTextImpl<void>(datetime, buf, time_zone);
+}
+
+inline void readDateTime64Text(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut = DateLUT::instance())
+{
+ readDateTimeTextImpl<void>(datetime64, scale, buf, date_lut);
+}
+
+inline bool tryReadDateTimeText(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+{
+ return readDateTimeTextImpl<bool>(datetime, buf, time_zone);
+}
+
+inline bool tryReadDateTime64Text(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut = DateLUT::instance())
+{
+ return readDateTimeTextImpl<bool>(datetime64, scale, buf, date_lut);
+}
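+
+/// A usage sketch (illustrative inputs, server time zone assumed):
+///     time_t t;
+///     ReadBufferFromString in1("2015-01-01 01:02:03");
+///     readDateTimeText(t, in1);               /// broken-down form
+///     ReadBufferFromString in2("1420070400");
+///     readDateTimeText(t, in2);               /// unix timestamp form, t == 1420070400
+///     DateTime64 dt64;
+///     ReadBufferFromString in3("2015-01-01 01:02:03.123");
+///     tryReadDateTime64Text(dt64, 3, in3);    /// fractional digits are read up to the given scale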
+
+inline void readDateTimeText(LocalDateTime & datetime, ReadBuffer & buf)
+{
+ char s[10];
+ size_t size = buf.read(s, 10);
+ if (10 != size)
+ {
+ s[size] = 0;
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime {}", s);
+ }
+
+ datetime.year((s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'));
+ datetime.month((s[5] - '0') * 10 + (s[6] - '0'));
+ datetime.day((s[8] - '0') * 10 + (s[9] - '0'));
+
+ /// Allow to read Date as DateTime
+ if (buf.eof() || !(*buf.position() == ' ' || *buf.position() == 'T'))
+ return;
+
+ ++buf.position();
+ size = buf.read(s, 8);
+ if (8 != size)
+ {
+ s[size] = 0;
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse time component of DateTime {}", s);
+ }
+
+ datetime.hour((s[0] - '0') * 10 + (s[1] - '0'));
+ datetime.minute((s[3] - '0') * 10 + (s[4] - '0'));
+ datetime.second((s[6] - '0') * 10 + (s[7] - '0'));
+}
+
+
+/// Generic methods to read value in native binary format.
+template <typename T>
+requires is_arithmetic_v<T>
+inline void readBinary(T & x, ReadBuffer & buf) { readPODBinary(x, buf); }
+
+inline void readBinary(bool & x, ReadBuffer & buf)
+{
+ /// When deserializing a bool it might trigger UBSAN if the input is not 0 or 1, so it's better to treat it as an Int8
+ static_assert(sizeof(bool) == sizeof(Int8));
+ Int8 flag = 0;
+ readBinary(flag, buf);
+ x = (flag != 0);
+}
+
+inline void readBinary(String & x, ReadBuffer & buf) { readStringBinary(x, buf); }
+inline void readBinary(Decimal32 & x, ReadBuffer & buf) { readPODBinary(x, buf); }
+inline void readBinary(Decimal64 & x, ReadBuffer & buf) { readPODBinary(x, buf); }
+inline void readBinary(Decimal128 & x, ReadBuffer & buf) { readPODBinary(x, buf); }
+inline void readBinary(Decimal256 & x, ReadBuffer & buf) { readPODBinary(x.value, buf); }
+inline void readBinary(LocalDate & x, ReadBuffer & buf) { readPODBinary(x, buf); }
+inline void readBinary(IPv4 & x, ReadBuffer & buf) { readPODBinary(x, buf); }
+inline void readBinary(IPv6 & x, ReadBuffer & buf) { readPODBinary(x, buf); }
+
+inline void readBinary(UUID & x, ReadBuffer & buf)
+{
+ readUUIDBinary(x, buf);
+}
+
+inline void readBinary(CityHash_v1_0_2::uint128 & x, ReadBuffer & buf)
+{
+ readPODBinary(x.low64, buf);
+ readPODBinary(x.high64, buf);
+}
+
+inline void readBinary(StackTrace::FramePointers & x, ReadBuffer & buf) { readPODBinary(x, buf); }
+
+template <std::endian endian, typename T>
+inline void readBinaryEndian(T & x, ReadBuffer & buf)
+{
+ readBinary(x, buf);
+ transformEndianness<endian>(x);
+}
+
+template <typename T>
+inline void readBinaryLittleEndian(T & x, ReadBuffer & buf)
+{
+ readBinaryEndian<std::endian::little>(x, buf);
+}
+
+template <typename T>
+inline void readBinaryBigEndian(T & x, ReadBuffer & buf)
+{
+ readBinaryEndian<std::endian::big>(x, buf);
+}
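+
+/// Sketch: reading a 4-byte length prefix written in network byte order:
+///     UInt32 n;
+///     readBinaryBigEndian(n, buf);   /// input bytes 00 00 01 02 yield n == 258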
+
+
+/// Generic methods to read value in text tab-separated format.
+
+inline void readText(is_integer auto & x, ReadBuffer & buf)
+{
+ if constexpr (std::is_same_v<decltype(x), bool &>)
+ readBoolText(x, buf);
+ else
+ readIntText(x, buf);
+}
+
+inline bool tryReadText(is_integer auto & x, ReadBuffer & buf)
+{
+ return tryReadIntText(x, buf);
+}
+
+inline bool tryReadText(UUID & x, ReadBuffer & buf) { return tryReadUUIDText(x, buf); }
+inline bool tryReadText(IPv4 & x, ReadBuffer & buf) { return tryReadIPv4Text(x, buf); }
+inline bool tryReadText(IPv6 & x, ReadBuffer & buf) { return tryReadIPv6Text(x, buf); }
+
+inline void readText(is_floating_point auto & x, ReadBuffer & buf) { readFloatText(x, buf); }
+
+inline void readText(String & x, ReadBuffer & buf) { readEscapedString(x, buf); }
+
+inline void readText(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance()) { readDateText(x, buf, time_zone); }
+
+inline void readText(LocalDate & x, ReadBuffer & buf) { readDateText(x, buf); }
+inline void readText(LocalDateTime & x, ReadBuffer & buf) { readDateTimeText(x, buf); }
+inline void readText(UUID & x, ReadBuffer & buf) { readUUIDText(x, buf); }
+inline void readText(IPv4 & x, ReadBuffer & buf) { readIPv4Text(x, buf); }
+inline void readText(IPv6 & x, ReadBuffer & buf) { readIPv6Text(x, buf); }
+
+/// Generic methods to read value in text format,
+/// possibly in single quotes (only for data types that use quotes in VALUES format of INSERT statement in SQL).
+template <typename T>
+requires is_arithmetic_v<T>
+inline void readQuoted(T & x, ReadBuffer & buf) { readText(x, buf); }
+
+template <typename T>
+requires is_arithmetic_v<T>
+inline void readQuoted(T & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { readText(x, buf, time_zone); }
+
+inline void readQuoted(String & x, ReadBuffer & buf) { readQuotedString(x, buf); }
+
+inline void readQuoted(LocalDate & x, ReadBuffer & buf)
+{
+ assertChar('\'', buf);
+ readDateText(x, buf);
+ assertChar('\'', buf);
+}
+
+inline void readQuoted(LocalDateTime & x, ReadBuffer & buf)
+{
+ assertChar('\'', buf);
+ readDateTimeText(x, buf);
+ assertChar('\'', buf);
+}
+
+inline void readQuoted(UUID & x, ReadBuffer & buf)
+{
+ assertChar('\'', buf);
+ readUUIDText(x, buf);
+ assertChar('\'', buf);
+}
+
+inline void readQuoted(IPv4 & x, ReadBuffer & buf)
+{
+ assertChar('\'', buf);
+ readIPv4Text(x, buf);
+ assertChar('\'', buf);
+}
+
+inline void readQuoted(IPv6 & x, ReadBuffer & buf)
+{
+ assertChar('\'', buf);
+ readIPv6Text(x, buf);
+ assertChar('\'', buf);
+}
+
+/// Same as above, but in double quotes.
+template <typename T>
+requires is_arithmetic_v<T>
+inline void readDoubleQuoted(T & x, ReadBuffer & buf) { readText(x, buf); }
+
+template <typename T>
+requires is_arithmetic_v<T>
+inline void readDoubleQuoted(T & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { readText(x, buf, time_zone); }
+
+inline void readDoubleQuoted(String & x, ReadBuffer & buf) { readDoubleQuotedString(x, buf); }
+
+inline void readDoubleQuoted(LocalDate & x, ReadBuffer & buf)
+{
+ assertChar('"', buf);
+ readDateText(x, buf);
+ assertChar('"', buf);
+}
+
+inline void readDoubleQuoted(LocalDateTime & x, ReadBuffer & buf)
+{
+ assertChar('"', buf);
+ readDateTimeText(x, buf);
+ assertChar('"', buf);
+}
+
+/// CSV for numbers: quotes are optional, no special escaping rules.
+template <typename T>
+inline void readCSVSimple(T & x, ReadBuffer & buf)
+{
+ if (buf.eof()) [[unlikely]]
+ throwReadAfterEOF();
+
+ char maybe_quote = *buf.position();
+
+ if (maybe_quote == '\'' || maybe_quote == '\"')
+ ++buf.position();
+
+ readText(x, buf);
+
+ if (maybe_quote == '\'' || maybe_quote == '\"')
+ assertChar(maybe_quote, buf);
+}
+
+// standalone overload for dates: to avoid instantiating DateLUTs while parsing other types
+template <typename T>
+inline void readCSVSimple(T & x, ReadBuffer & buf, const DateLUTImpl & time_zone)
+{
+ if (buf.eof()) [[unlikely]]
+ throwReadAfterEOF();
+
+ char maybe_quote = *buf.position();
+
+ if (maybe_quote == '\'' || maybe_quote == '\"')
+ ++buf.position();
+
+ readText(x, buf, time_zone);
+
+ if (maybe_quote == '\'' || maybe_quote == '\"')
+ assertChar(maybe_quote, buf);
+}
+
+template <typename T>
+requires is_arithmetic_v<T>
+inline void readCSV(T & x, ReadBuffer & buf)
+{
+ readCSVSimple(x, buf);
+}
+
+inline void readCSV(String & x, ReadBuffer & buf, const FormatSettings::CSV & settings) { readCSVString(x, buf, settings); }
+inline void readCSV(LocalDate & x, ReadBuffer & buf) { readCSVSimple(x, buf); }
+inline void readCSV(DayNum & x, ReadBuffer & buf) { readCSVSimple(x, buf); }
+inline void readCSV(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { readCSVSimple(x, buf, time_zone); }
+inline void readCSV(LocalDateTime & x, ReadBuffer & buf) { readCSVSimple(x, buf); }
+inline void readCSV(UUID & x, ReadBuffer & buf) { readCSVSimple(x, buf); }
+inline void readCSV(IPv4 & x, ReadBuffer & buf) { readCSVSimple(x, buf); }
+inline void readCSV(IPv6 & x, ReadBuffer & buf) { readCSVSimple(x, buf); }
+inline void readCSV(UInt128 & x, ReadBuffer & buf) { readCSVSimple(x, buf); }
+inline void readCSV(Int128 & x, ReadBuffer & buf) { readCSVSimple(x, buf); }
+inline void readCSV(UInt256 & x, ReadBuffer & buf) { readCSVSimple(x, buf); }
+inline void readCSV(Int256 & x, ReadBuffer & buf) { readCSVSimple(x, buf); }
+
+template <typename T>
+void readBinary(std::vector<T> & x, ReadBuffer & buf)
+{
+ size_t size = 0;
+ readVarUInt(size, buf);
+
+ if (size > DEFAULT_MAX_STRING_SIZE)
+ throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large vector size.");
+
+ x.resize(size);
+ for (size_t i = 0; i < size; ++i)
+ readBinary(x[i], buf);
+}
+
+template <typename T>
+void readQuoted(std::vector<T> & x, ReadBuffer & buf)
+{
+ bool first = true;
+ assertChar('[', buf);
+ while (!buf.eof() && *buf.position() != ']')
+ {
+ if (!first)
+ {
+ if (*buf.position() == ',')
+ ++buf.position();
+ else
+ throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Cannot read array from text");
+ }
+
+ first = false;
+
+ x.push_back(T());
+ readQuoted(x.back(), buf);
+ }
+ assertChar(']', buf);
+}
+
+template <typename T>
+void readDoubleQuoted(std::vector<T> & x, ReadBuffer & buf)
+{
+ bool first = true;
+ assertChar('[', buf);
+ while (!buf.eof() && *buf.position() != ']')
+ {
+ if (!first)
+ {
+ if (*buf.position() == ',')
+ ++buf.position();
+ else
+ throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Cannot read array from text");
+ }
+
+ first = false;
+
+ x.push_back(T());
+ readDoubleQuoted(x.back(), buf);
+ }
+ assertChar(']', buf);
+}
+
+template <typename T>
+void readText(std::vector<T> & x, ReadBuffer & buf)
+{
+ readQuoted(x, buf);
+}
+
+
+/// Skip whitespace characters.
+inline void skipWhitespaceIfAny(ReadBuffer & buf, bool one_line = false)
+{
+ if (!one_line)
+ while (!buf.eof() && isWhitespaceASCII(*buf.position()))
+ ++buf.position();
+ else
+ while (!buf.eof() && isWhitespaceASCIIOneLine(*buf.position()))
+ ++buf.position();
+}
+
+/// Skips json value.
+void skipJSONField(ReadBuffer & buf, StringRef name_of_field);
+
+
+/** Read a serialized exception.
+ * During serialization/deserialization some information is lost
+ * (the type is cut to the base class, 'message' is replaced by 'displayText', and the stack trace is appended to 'message').
+ * An additional message can be appended to the exception (for example, information about where it was received from).
+ */
+Exception readException(ReadBuffer & buf, const String & additional_message = "", bool remote_exception = false);
+void readAndThrowException(ReadBuffer & buf, const String & additional_message = "");
+
+
+/** Helper function for implementation.
+ */
+template <ReadIntTextCheckOverflow check_overflow = ReadIntTextCheckOverflow::CHECK_OVERFLOW, typename T>
+static inline const char * tryReadIntText(T & x, const char * pos, const char * end)
+{
+ ReadBufferFromMemory in(pos, end - pos);
+ tryReadIntText<check_overflow>(x, in);
+ return pos + in.count();
+}
+
+
+/// Convenient methods for reading something from string in text format.
+template <typename T>
+inline T parse(const char * data, size_t size)
+{
+ T res;
+ ReadBufferFromMemory buf(data, size);
+ readText(res, buf);
+ return res;
+}
+
+template <typename T>
+inline bool tryParse(T & res, const char * data, size_t size)
+{
+ ReadBufferFromMemory buf(data, size);
+ return tryReadText(res, buf);
+}
+
+template <typename T>
+inline void readTextWithSizeSuffix(T & x, ReadBuffer & buf) { readText(x, buf); }
+
+template <is_integer T>
+inline void readTextWithSizeSuffix(T & x, ReadBuffer & buf)
+{
+ readIntText(x, buf);
+ if (buf.eof())
+ return;
+
+ /// Updates x depending on the suffix
+ auto finish = [&buf, &x] (UInt64 base, int power_of_two) mutable
+ {
+ ++buf.position();
+ if (buf.eof())
+ {
+ x *= base; /// For decimal suffixes, such as k, M, G etc.
+ }
+ else if (*buf.position() == 'i')
+ {
+ x = (x << power_of_two); // NOLINT /// For binary suffixes, such as ki, Mi, Gi, etc.
+ ++buf.position();
+ }
+ return;
+ };
+
+ switch (*buf.position())
+ {
+ case 'k': [[fallthrough]];
+ case 'K':
+ finish(1000, 10);
+ break;
+ case 'M':
+ finish(1000000, 20);
+ break;
+ case 'G':
+ finish(1000000000, 30);
+ break;
+ case 'T':
+ finish(1000000000000ULL, 40);
+ break;
+ default:
+ return;
+ }
+}
+
+/// Read something in text format and try to parse the size suffix.
+/// If the suffix is not valid, an error is thrown.
+/// For example: 723145 -- ok, 213MB -- not ok, but 213Mi -- ok.
+template <typename T>
+inline T parseWithSizeSuffix(const char * data, size_t size)
+{
+ T res;
+ ReadBufferFromMemory buf(data, size);
+ readTextWithSizeSuffix(res, buf);
+ assertEOF(buf);
+ return res;
+}
+
+template <typename T>
+inline T parseWithSizeSuffix(std::string_view s)
+{
+ return parseWithSizeSuffix<T>(s.data(), s.size());
+}
+
+template <typename T>
+inline T parseWithSizeSuffix(const char * data)
+{
+ return parseWithSizeSuffix<T>(data, strlen(data));
+}
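+
+/// Usage sketch (illustrative; the values follow from the suffix rules above):
+///     parseWithSizeSuffix<UInt64>("5k");    /// 5 * 1000 = 5000
+///     parseWithSizeSuffix<UInt64>("213Mi"); /// 213 << 20 = 223346688
+///     parseWithSizeSuffix<UInt64>("213MB"); /// throws: the trailing 'B' fails assertEOF()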
+
+template <typename T>
+inline T parse(const char * data)
+{
+ return parse<T>(data, strlen(data));
+}
+
+template <typename T>
+inline T parse(const String & s)
+{
+ return parse<T>(s.data(), s.size());
+}
+
+template <typename T>
+inline T parse(std::string_view s)
+{
+ return parse<T>(s.data(), s.size());
+}
+
+template <typename T>
+inline bool tryParse(T & res, const char * data)
+{
+ return tryParse(res, data, strlen(data));
+}
+
+template <typename T>
+inline bool tryParse(T & res, const String & s)
+{
+ return tryParse(res, s.data(), s.size());
+}
+
+template <typename T>
+inline bool tryParse(T & res, std::string_view s)
+{
+ return tryParse(res, s.data(), s.size());
+}
+
+
+/** Skip the UTF-8 BOM if it is under the cursor.
+ * As the BOM is usually located at the start of the stream, and the buffer size is usually larger than three bytes,
+ * the function expects all three bytes of the BOM to be fully in the buffer, with at least one byte after them
+ * (otherwise it doesn't skip anything).
+ */
+inline void skipBOMIfExists(ReadBuffer & buf)
+{
+ if (!buf.eof()
+ && buf.position() + 3 < buf.buffer().end()
+ && buf.position()[0] == '\xEF'
+ && buf.position()[1] == '\xBB'
+ && buf.position()[2] == '\xBF')
+ {
+ buf.position() += 3;
+ }
+}
+
+
+/// Skip to next character after next \n. If no \n in stream, skip to end.
+void skipToNextLineOrEOF(ReadBuffer & buf);
+
+/// Skip to next character after next \r. If no \r in stream, skip to end.
+void skipToCarriageReturnOrEOF(ReadBuffer & buf);
+
+/// Skip to next character after next unescaped \n. If no \n in stream, skip to end. Does not throw on invalid escape sequences.
+void skipToUnescapedNextLineOrEOF(ReadBuffer & buf);
+
+/// Skip to next character after next \0. If no \0 in stream, skip to end.
+void skipNullTerminated(ReadBuffer & buf);
+
+/** This function just copies the data from the buffer's position (in.position())
+ * to the current position (from the arguments), appending it into memory.
+ */
+void saveUpToPosition(ReadBuffer & in, Memory<Allocator<false>> & memory, char * current);
+
+/** This function is the negation of eof().
+ * In fact it returns whether the data was loaded into the internal ReadBuffer's buffer or not,
+ * and saves the data from the buffer's position to `current` if there is no pending data in the buffer.
+ * Why do we have to use this strange function? Consider: the buffer's internal position is in the middle
+ * of our buffer and the current cursor is at the end of the buffer. When we call eof() it calls next(),
+ * and next() can fill the buffer with new data, so we would lose the data from the previous buffer state.
+ */
+bool loadAtPosition(ReadBuffer & in, Memory<Allocator<false>> & memory, char * & current);
+
+/// Skip data until the start of the next row or EOF (the end of a row is determined by two delimiters:
+/// row_after_delimiter and row_between_delimiter).
+void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delimiter, const String & row_between_delimiter, bool skip_spaces);
+
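+/** Deserializer for pcg32_fast state. The expected textual format, as implemented
+ * below, is three space-separated decimal fields: "<multiplier> <increment> <state>".
+ * The multiplier and increment must match the engine's compile-time constants,
+ * otherwise INCORRECT_DATA is thrown; only the state is actually restored.
+ */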
+struct PcgDeserializer
+{
+ static void deserializePcg32(pcg32_fast & rng, ReadBuffer & buf)
+ {
+ decltype(rng.state_) multiplier, increment, state;
+ readText(multiplier, buf);
+ assertChar(' ', buf);
+ readText(increment, buf);
+ assertChar(' ', buf);
+ readText(state, buf);
+
+ if (multiplier != rng.multiplier())
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect multiplier in pcg32: expected {}, got {}", rng.multiplier(), multiplier);
+ if (increment != rng.increment())
+ throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect increment in pcg32: expected {}, got {}", rng.increment(), increment);
+
+ rng.state_ = state;
+ }
+};
+
+template <typename Vector>
+void readQuotedFieldInto(Vector & s, ReadBuffer & buf);
+
+void readQuotedField(String & s, ReadBuffer & buf);
+
+void readJSONField(String & s, ReadBuffer & buf);
+
+void readTSVField(String & s, ReadBuffer & buf);
+
+/** Parse an escape sequence, which can be simple (one character after a backslash) or more complex (multiple characters).
+ * It is assumed that the cursor is located on the `\` symbol.
+ */
+bool parseComplexEscapeSequence(String & s, ReadBuffer & buf);
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadHelpersArena.h b/contrib/clickhouse/src/IO/ReadHelpersArena.h
new file mode 100644
index 0000000000..b88d5c037d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadHelpersArena.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+#include <IO/ReadHelpers.h>
+#include <IO/VarInt.h>
+#include <base/StringRef.h>
+#include <Common/Arena.h>
+
+
+namespace DB
+{
+
+
+namespace ErrorCodes
+{
+ extern const int TOO_LARGE_STRING_SIZE;
+}
+
+inline StringRef readStringBinaryInto(Arena & arena, ReadBuffer & buf)
+{
+ size_t size = 0;
+ readVarUInt(size, buf);
+
+ if (unlikely(size > DEFAULT_MAX_STRING_SIZE))
+ throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large string size.");
+
+ char * data = arena.alloc(size);
+ buf.readStrict(data, size);
+
+ return StringRef(data, size);
+}
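+
+/// Usage sketch (illustrative; `buf` is any ReadBuffer positioned at a
+/// varUInt-length-prefixed string). The returned StringRef points into the arena,
+/// so it remains valid for the arena's lifetime:
+///     Arena arena;
+///     StringRef key = readStringBinaryInto(arena, buf);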
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadSettings.h b/contrib/clickhouse/src/IO/ReadSettings.h
new file mode 100644
index 0000000000..87f249823b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadSettings.h
@@ -0,0 +1,136 @@
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include <Core/Defines.h>
+#include <Interpreters/Cache/FileCache_fwd.h>
+#include <Common/Throttler_fwd.h>
+#include <Common/Priority.h>
+#include <IO/ResourceLink.h>
+
+namespace DB
+{
+enum class LocalFSReadMethod
+{
+ /**
+ * Simple synchronous reads with 'read'.
+ * Can use direct IO after specified size.
+ * Can use prefetch by asking OS to perform readahead.
+ */
+ read,
+
+ /**
+ * Simple synchronous reads with 'pread'.
+ * In contrast to 'read', shares single file descriptor from multiple threads.
+ * Can use direct IO after specified size.
+ * Can use prefetch by asking OS to perform readahead.
+ */
+ pread,
+
+ /**
+ * Use mmap after specified size or simple synchronous reads with 'pread'.
+ * Can use prefetch by asking OS to perform readahead.
+ */
+ mmap,
+
+ /**
+ * Use the io_uring Linux subsystem for asynchronous reads.
+ * Can use direct IO after specified size.
+ * Can do prefetch with double buffering.
+ */
+ io_uring,
+
+ /**
+ * Checks if data is in page cache with 'preadv2' on modern Linux kernels.
+ * If data is in page cache, read from the same thread.
+ * If not, offload IO to separate threadpool.
+ * Can do prefetch with double buffering.
+ * Can use specified priorities and limit the number of concurrent reads.
+ */
+ pread_threadpool,
+
+ /// Use an asynchronous reader with a fake backend that is in fact synchronous.
+ /// @attention Use only for testing purposes.
+ pread_fake_async
+};
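+
+/// Which value is used is typically chosen via the `local_filesystem_read_method`
+/// setting (assumption: the standard ClickHouse setting name), e.g.:
+///     SET local_filesystem_read_method = 'pread_threadpool';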
+
+enum class RemoteFSReadMethod
+{
+ read,
+ threadpool,
+};
+
+class MMappedFileCache;
+
+struct ReadSettings
+{
+ /// Method to use reading from local filesystem.
+ LocalFSReadMethod local_fs_method = LocalFSReadMethod::pread;
+ /// Method to use reading from remote filesystem.
+ RemoteFSReadMethod remote_fs_method = RemoteFSReadMethod::threadpool;
+
+ /// https://eklitzke.org/efficient-file-copying-on-linux
+ size_t local_fs_buffer_size = 128 * 1024;
+
+ size_t remote_fs_buffer_size = DBMS_DEFAULT_BUFFER_SIZE;
+ size_t prefetch_buffer_size = DBMS_DEFAULT_BUFFER_SIZE;
+
+ bool local_fs_prefetch = false;
+ bool remote_fs_prefetch = false;
+
+ /// For 'read', 'pread' and 'pread_threadpool' methods.
+ size_t direct_io_threshold = 0;
+
+ /// For 'mmap' method.
+ size_t mmap_threshold = 0;
+ MMappedFileCache * mmap_cache = nullptr;
+
+ /// For 'pread_threadpool'/'io_uring' method. Lower value is higher priority.
+ Priority priority;
+
+ bool load_marks_asynchronously = true;
+
+ size_t remote_fs_read_max_backoff_ms = 10000;
+ size_t remote_fs_read_backoff_max_tries = 4;
+
+ bool enable_filesystem_read_prefetches_log = false;
+
+ bool enable_filesystem_cache = true;
+ bool read_from_filesystem_cache_if_exists_otherwise_bypass_cache = false;
+ bool enable_filesystem_cache_log = false;
+ /// Don't populate cache when the read is not part of query execution (e.g. background thread).
+ bool avoid_readthrough_cache_outside_query_context = true;
+
+ size_t filesystem_cache_max_download_size = (128UL * 1024 * 1024 * 1024);
+ bool skip_download_if_exceeds_query_cache = true;
+
+ size_t remote_read_min_bytes_for_seek = DBMS_DEFAULT_BUFFER_SIZE;
+
+ FileCachePtr remote_fs_cache;
+
+ /// Bandwidth throttler to use during reading
+ ThrottlerPtr remote_throttler;
+ ThrottlerPtr local_throttler;
+
+ // Resource to be used during reading
+ ResourceLink resource_link;
+
+ size_t http_max_tries = 1;
+ size_t http_retry_initial_backoff_ms = 100;
+ size_t http_retry_max_backoff_ms = 1600;
+ bool http_skip_not_found_url_for_globs = true;
+
+ /// Monitoring
+ bool for_object_storage = false; // to choose which profile events should be incremented
+
+ ReadSettings adjustBufferSize(size_t file_size) const
+ {
+ ReadSettings res = *this;
+ res.local_fs_buffer_size = std::min(std::max(1ul, file_size), local_fs_buffer_size);
+ res.remote_fs_buffer_size = std::min(std::max(1ul, file_size), remote_fs_buffer_size);
+ res.prefetch_buffer_size = std::min(std::max(1ul, file_size), prefetch_buffer_size);
+ return res;
+ }
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadWriteBufferFromHTTP.cpp b/contrib/clickhouse/src/IO/ReadWriteBufferFromHTTP.cpp
new file mode 100644
index 0000000000..7e5c0d37c8
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadWriteBufferFromHTTP.cpp
@@ -0,0 +1,940 @@
+#include "ReadWriteBufferFromHTTP.h"
+
+#include <IO/HTTPCommon.h>
+
+namespace ProfileEvents
+{
+extern const Event ReadBufferSeekCancelConnection;
+extern const Event ReadWriteBufferFromHTTPPreservedSessions;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int TOO_MANY_REDIRECTS;
+ extern const int HTTP_RANGE_NOT_SATISFIABLE;
+ extern const int BAD_ARGUMENTS;
+ extern const int CANNOT_SEEK_THROUGH_FILE;
+ extern const int SEEK_POSITION_OUT_OF_BOUND;
+ extern const int UNKNOWN_FILE_SIZE;
+}
+
+template <typename TSessionFactory>
+UpdatableSession<TSessionFactory>::UpdatableSession(const Poco::URI & uri, UInt64 max_redirects_, std::shared_ptr<TSessionFactory> session_factory_)
+ : max_redirects{max_redirects_}
+ , initial_uri(uri)
+ , session_factory(std::move(session_factory_))
+{
+ session = session_factory->buildNewSession(uri);
+}
+
+template <typename TSessionFactory>
+typename UpdatableSession<TSessionFactory>::SessionPtr UpdatableSession<TSessionFactory>::getSession() { return session; }
+
+template <typename TSessionFactory>
+void UpdatableSession<TSessionFactory>::updateSession(const Poco::URI & uri)
+{
+ ++redirects;
+ if (redirects <= max_redirects)
+ session = session_factory->buildNewSession(uri);
+ else
+ throw Exception(ErrorCodes::TOO_MANY_REDIRECTS,
+ "Too many redirects while trying to access {}."
+ " You can {} redirects by changing the setting 'max_http_get_redirects'."
+ " Example: `SET max_http_get_redirects = 10`."
+ " Redirects are restricted to prevent possible attack when a malicious server redirects to an internal resource, bypassing the authentication or firewall.",
+ initial_uri.toString(), max_redirects ? "increase the allowed maximum number of" : "allow");
+}
+
+template <typename TSessionFactory>
+typename UpdatableSession<TSessionFactory>::SessionPtr UpdatableSession<TSessionFactory>::createDetachedSession(const Poco::URI & uri)
+{
+ return session_factory->buildNewSession(uri);
+}
+
+template <typename TSessionFactory>
+std::shared_ptr<UpdatableSession<TSessionFactory>> UpdatableSession<TSessionFactory>::clone(const Poco::URI & uri)
+{
+ return std::make_shared<UpdatableSession<TSessionFactory>>(uri, max_redirects, session_factory);
+}
+
+
+namespace detail
+{
+
+static bool isRetriableError(const Poco::Net::HTTPResponse::HTTPStatus http_status) noexcept
+{
+ static constexpr std::array non_retriable_errors{
+ Poco::Net::HTTPResponse::HTTPStatus::HTTP_BAD_REQUEST,
+ Poco::Net::HTTPResponse::HTTPStatus::HTTP_UNAUTHORIZED,
+ Poco::Net::HTTPResponse::HTTPStatus::HTTP_NOT_FOUND,
+ Poco::Net::HTTPResponse::HTTPStatus::HTTP_FORBIDDEN,
+ Poco::Net::HTTPResponse::HTTPStatus::HTTP_NOT_IMPLEMENTED,
+ Poco::Net::HTTPResponse::HTTPStatus::HTTP_METHOD_NOT_ALLOWED};
+
+ return std::all_of(
+ non_retriable_errors.begin(), non_retriable_errors.end(), [&](const auto status) { return http_status != status; });
+}
+
+static Poco::URI getUriAfterRedirect(const Poco::URI & prev_uri, Poco::Net::HTTPResponse & response)
+{
+ auto location = response.get("Location");
+ auto location_uri = Poco::URI(location);
+ if (!location_uri.isRelative())
+ return location_uri;
+ /// The Location header contains a relative path, so we need to concatenate it
+ /// with the path from the original URI and normalize the result.
+ auto path = std::filesystem::weakly_canonical(std::filesystem::path(prev_uri.getPath()) / location);
+ location_uri = prev_uri;
+ location_uri.setPath(path);
+ return location_uri;
+}
+
+template <typename UpdatableSessionPtr>
+bool ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::withPartialContent(const HTTPRange & range) const
+{
+ /**
+ * Add range header if we have some passed range
+ * or if we want to retry GET request on purpose.
+ */
+ return range.begin || range.end || retry_with_range_header;
+}
+
+template <typename UpdatableSessionPtr>
+size_t ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::getOffset() const { return read_range.begin.value_or(0) + offset_from_begin_pos; }
+
+template <typename UpdatableSessionPtr>
+void ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::prepareRequest(Poco::Net::HTTPRequest & request, Poco::URI uri_, std::optional<HTTPRange> range) const
+{
+ request.setHost(uri_.getHost()); // use original, not resolved host name in header
+
+ if (out_stream_callback)
+ request.setChunkedTransferEncoding(true);
+ else if (method == Poco::Net::HTTPRequest::HTTP_POST)
+ request.setContentLength(0); /// No callback - no body
+
+ for (const auto & [header, value] : http_header_entries)
+ request.set(header, value);
+
+ if (range)
+ {
+ String range_header_value;
+ if (range->end)
+ range_header_value = fmt::format("bytes={}-{}", *range->begin, *range->end);
+ else
+ range_header_value = fmt::format("bytes={}-", *range->begin);
+ LOG_TEST(log, "Adding header: Range: {}", range_header_value);
+ request.set("Range", range_header_value);
+ }
+
+ if (!credentials.getUsername().empty())
+ credentials.authenticate(request);
+}
+
+template <typename UpdatableSessionPtr>
+std::istream * ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::callImpl(
+ UpdatableSessionPtr & current_session, Poco::URI uri_, Poco::Net::HTTPResponse & response, const std::string & method_, bool for_object_info)
+{
+ // With an empty path Poco will send "POST HTTP/1.1"; this is a Poco bug.
+ if (uri_.getPath().empty())
+ uri_.setPath("/");
+
+ std::optional<HTTPRange> range;
+ if (!for_object_info)
+ {
+ if (withPartialContent(read_range))
+ range = HTTPRange{getOffset(), read_range.end};
+ }
+
+ Poco::Net::HTTPRequest request(method_, uri_.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1);
+ prepareRequest(request, uri_, range);
+
+ LOG_TRACE(log, "Sending request to {}", uri_.toString());
+
+ auto sess = current_session->getSession();
+ auto & stream_out = sess->sendRequest(request);
+
+ if (out_stream_callback)
+ out_stream_callback(stream_out);
+
+ auto result_istr = receiveResponse(*sess, request, response, true);
+ response.getCookies(cookies);
+
+ /// we can fetch object info while the request is being processed
+ /// and we don't want to override any context used by it
+ if (!for_object_info)
+ content_encoding = response.get("Content-Encoding", "");
+
+ return result_istr;
+}
+
+template <typename UpdatableSessionPtr>
+size_t ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::getFileSize()
+{
+ if (!file_info)
+ file_info = getFileInfo();
+
+ if (file_info->file_size)
+ return *file_info->file_size;
+
+ throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out file size for: {}", uri.toString());
+}
+
+template <typename UpdatableSessionPtr>
+bool ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::supportsReadAt()
+{
+ if (!file_info)
+ file_info = getFileInfo();
+ return method == Poco::Net::HTTPRequest::HTTP_GET && file_info->seekable;
+}
+
+template <typename UpdatableSessionPtr>
+bool ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::checkIfActuallySeekable()
+{
+ if (!file_info)
+ file_info = getFileInfo();
+ return file_info->seekable;
+}
+
+template <typename UpdatableSessionPtr>
+String ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::getFileName() const { return uri.toString(); }
+
+template <typename UpdatableSessionPtr>
+void ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::getHeadResponse(Poco::Net::HTTPResponse & response)
+{
+ for (size_t i = 0; i < settings.http_max_tries; ++i)
+ {
+ try
+ {
+ callWithRedirects(response, Poco::Net::HTTPRequest::HTTP_HEAD, true, true);
+ break;
+ }
+ catch (const Poco::Exception & e)
+ {
+ if (i == settings.http_max_tries - 1 || !isRetriableError(response.getStatus()))
+ throw;
+
+ LOG_ERROR(log, "Failed to make HTTP_HEAD request to {}. Error: {}", uri.toString(), e.displayText());
+ }
+ }
+}
+
+template <typename UpdatableSessionPtr>
+void ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::setupExternalBuffer()
+{
+ /**
+ * use_external_buffer means we read into a buffer that
+ * was passed to us from somewhere else. We do not check whether the
+ * previously returned buffer was read or not (no hasPendingData() check is needed),
+ * because this branch means we are prefetching data:
+ * on each nextImpl() call we can fill a different buffer.
+ */
+ impl->set(internal_buffer.begin(), internal_buffer.size());
+ assert(working_buffer.begin() != nullptr);
+ assert(!internal_buffer.empty());
+}
+
+template <typename UpdatableSessionPtr>
+ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::ReadWriteBufferFromHTTPBase(
+ UpdatableSessionPtr session_,
+ Poco::URI uri_,
+ const Poco::Net::HTTPBasicCredentials & credentials_,
+ const std::string & method_,
+ OutStreamCallback out_stream_callback_,
+ size_t buffer_size_,
+ const ReadSettings & settings_,
+ HTTPHeaderEntries http_header_entries_,
+ const RemoteHostFilter * remote_host_filter_,
+ bool delay_initialization,
+ bool use_external_buffer_,
+ bool http_skip_not_found_url_,
+ std::optional<HTTPFileInfo> file_info_,
+ Poco::Net::HTTPClientSession::ProxyConfig proxy_config_)
+ : SeekableReadBuffer(nullptr, 0)
+ , uri {uri_}
+ , method {!method_.empty() ? method_ : out_stream_callback_ ? Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET}
+ , session {session_}
+ , out_stream_callback {out_stream_callback_}
+ , credentials {credentials_}
+ , http_header_entries {std::move(http_header_entries_)}
+ , remote_host_filter {remote_host_filter_}
+ , buffer_size {buffer_size_}
+ , use_external_buffer {use_external_buffer_}
+ , file_info(file_info_)
+ , http_skip_not_found_url(http_skip_not_found_url_)
+ , settings {settings_}
+ , log(&Poco::Logger::get("ReadWriteBufferFromHTTP"))
+ , proxy_config(proxy_config_)
+{
+ if (settings.http_max_tries <= 0 || settings.http_retry_initial_backoff_ms <= 0
+ || settings.http_retry_initial_backoff_ms >= settings.http_retry_max_backoff_ms)
+ throw Exception(
+ ErrorCodes::BAD_ARGUMENTS,
+ "Invalid setting for http backoff, "
+ "must be http_max_tries >= 1 (current is {}) and "
+ "0 < http_retry_initial_backoff_ms < settings.http_retry_max_backoff_ms (now 0 < {} < {})",
+ settings.http_max_tries,
+ settings.http_retry_initial_backoff_ms,
+ settings.http_retry_max_backoff_ms);
+
+ // Configure the User-Agent header if it is not already set.
+ const std::string user_agent = "User-Agent";
+ auto iter = std::find_if(
+ http_header_entries.begin(),
+ http_header_entries.end(),
+ [&user_agent](const HTTPHeaderEntry & entry) { return entry.name == user_agent; });
+
+ if (iter == http_header_entries.end())
+ {
+ http_header_entries.emplace_back("User-Agent", fmt::format("ClickHouse/{}", VERSION_STRING));
+ }
+
+ if (!delay_initialization)
+ {
+ initialize();
+ if (exception)
+ std::rethrow_exception(exception);
+ }
+}
+
+template <typename UpdatableSessionPtr>
+void ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::callWithRedirects(Poco::Net::HTTPResponse & response, const String & method_, bool throw_on_all_errors, bool for_object_info)
+{
+ UpdatableSessionPtr current_session = nullptr;
+
+ /// we can fetch object info while the request is being processed
+ /// and we don't want to override any context used by it
+ if (for_object_info)
+ current_session = session->clone(uri);
+ else
+ current_session = session;
+
+ call(current_session, response, method_, throw_on_all_errors, for_object_info);
+ saved_uri_redirect = uri;
+
+ while (isRedirect(response.getStatus()))
+ {
+ Poco::URI uri_redirect = getUriAfterRedirect(*saved_uri_redirect, response);
+ saved_uri_redirect = uri_redirect;
+ if (remote_host_filter)
+ remote_host_filter->checkURL(uri_redirect);
+
+ current_session->updateSession(uri_redirect);
+
+ /// we can fetch object info while the request is being processed
+ /// and we don't want to override any context used by it
+ auto result_istr = callImpl(current_session, uri_redirect, response, method, for_object_info);
+ if (!for_object_info)
+ istr = result_istr;
+ }
+}
+
+template <typename UpdatableSessionPtr>
+void ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::call(UpdatableSessionPtr & current_session, Poco::Net::HTTPResponse & response, const String & method_, bool throw_on_all_errors, bool for_object_info)
+{
+ try
+ {
+ /// we can fetch object info while the request is being processed
+ /// and we don't want to override any context used by it
+ auto result_istr = callImpl(current_session, saved_uri_redirect ? *saved_uri_redirect : uri, response, method_, for_object_info);
+ if (!for_object_info)
+ istr = result_istr;
+ }
+ catch (...)
+ {
+ /// we can fetch object info while the request is being processed
+ /// and we don't want to override any context used by it
+ if (for_object_info)
+ throw;
+
+ if (throw_on_all_errors)
+ throw;
+
+ auto http_status = response.getStatus();
+
+ if (http_status == Poco::Net::HTTPResponse::HTTPStatus::HTTP_NOT_FOUND && http_skip_not_found_url)
+ {
+ initialization_error = InitializeError::SKIP_NOT_FOUND_URL;
+ }
+ else if (!isRetriableError(http_status))
+ {
+ initialization_error = InitializeError::NON_RETRYABLE_ERROR;
+ exception = std::current_exception();
+ }
+ else
+ {
+ throw;
+ }
+ }
+}
+
+template <typename UpdatableSessionPtr>
+void ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::initialize()
+{
+ Poco::Net::HTTPResponse response;
+
+ call(session, response, method);
+ if (initialization_error != InitializeError::NONE)
+ return;
+
+ while (isRedirect(response.getStatus()))
+ {
+ Poco::URI uri_redirect = getUriAfterRedirect(saved_uri_redirect.value_or(uri), response);
+ if (remote_host_filter)
+ remote_host_filter->checkURL(uri_redirect);
+
+ session->updateSession(uri_redirect);
+
+ istr = callImpl(session, uri_redirect, response, method);
+ saved_uri_redirect = uri_redirect;
+ }
+
+ if (response.hasContentLength())
+ LOG_DEBUG(log, "Received response with content length: {}", response.getContentLength());
+
+ if (withPartialContent(read_range) && response.getStatus() != Poco::Net::HTTPResponse::HTTPStatus::HTTP_PARTIAL_CONTENT)
+ {
+ /// Having `200 OK` instead of `206 Partial Content` is acceptable in case we retried with range.begin == 0.
+ if (getOffset() != 0)
+ {
+ if (!exception)
+ {
+ exception = std::make_exception_ptr(Exception(
+ ErrorCodes::HTTP_RANGE_NOT_SATISFIABLE,
+ "Cannot read with range: [{}, {}] (response status: {}, reason: {})",
+ *read_range.begin,
+ read_range.end ? toString(*read_range.end) : "-",
+ toString(response.getStatus()), response.getReason()));
+ }
+
+ /// Retry on "200 OK"
+ if (response.getStatus() == Poco::Net::HTTPResponse::HTTPStatus::HTTP_OK)
+ initialization_error = InitializeError::RETRYABLE_ERROR;
+ else
+ initialization_error = InitializeError::NON_RETRYABLE_ERROR;
+
+ return;
+ }
+ else if (read_range.end)
+ {
+ /// We could have range.begin == 0 and range.end != 0 in case of DiskWeb and failing to read with partial content
+ /// will affect only performance, so a warning is enough.
+ LOG_WARNING(log, "Unable to read with range header: [{}, {}]", read_range.begin.value_or(0), *read_range.end);
+ }
+ }
+
+ // Remember the file size. It'll be used to report EOF on the next nextImpl() call.
+ if (!read_range.end && response.hasContentLength())
+ file_info = parseFileInfo(response, withPartialContent(read_range) ? getOffset() : 0);
+
+ impl = std::make_unique<ReadBufferFromIStream>(*istr, buffer_size);
+
+ if (use_external_buffer)
+ setupExternalBuffer();
+}
+
+template <typename UpdatableSessionPtr>
+bool ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::nextImpl()
+{
+ if (initialization_error == InitializeError::SKIP_NOT_FOUND_URL)
+ return false;
+ assert(initialization_error == InitializeError::NONE);
+
+ if (next_callback)
+ next_callback(count());
+
+ if ((read_range.end && getOffset() > read_range.end.value()) ||
+ (file_info && file_info->file_size && getOffset() >= file_info->file_size.value()))
+ {
+ /// Response was fully read.
+ markSessionForReuse(session->getSession());
+ ProfileEvents::increment(ProfileEvents::ReadWriteBufferFromHTTPPreservedSessions);
+ return false;
+ }
+
+ if (impl)
+ {
+ if (use_external_buffer)
+ {
+ setupExternalBuffer();
+ }
+ else
+ {
+ /**
+ * impl was initialized before, pass position() to it to make
+ * sure there is no pending data which was not read.
+ */
+ if (!working_buffer.empty())
+ impl->position() = position();
+ }
+ }
+
+ bool result = false;
+ size_t milliseconds_to_wait = settings.http_retry_initial_backoff_ms;
+ bool last_attempt = false;
+
+ auto on_retriable_error = [&]()
+ {
+ retry_with_range_header = true;
+ impl.reset();
+ auto http_session = session->getSession();
+ http_session->reset();
+ if (!last_attempt)
+ {
+ sleepForMilliseconds(milliseconds_to_wait);
+ milliseconds_to_wait = std::min(milliseconds_to_wait * 2, settings.http_retry_max_backoff_ms);
+ }
+ };
+
+ for (size_t i = 0;; ++i)
+ {
+ if (last_attempt)
+ break;
+ last_attempt = i + 1 >= settings.http_max_tries;
+
+ exception = nullptr;
+ initialization_error = InitializeError::NONE;
+
+ try
+ {
+ if (!impl)
+ {
+ initialize();
+
+ if (initialization_error == InitializeError::NON_RETRYABLE_ERROR)
+ {
+ assert(exception);
+ break;
+ }
+ else if (initialization_error == InitializeError::SKIP_NOT_FOUND_URL)
+ {
+ return false;
+ }
+ else if (initialization_error == InitializeError::RETRYABLE_ERROR)
+ {
+ LOG_ERROR(
+ log,
+ "HTTP request to `{}` failed at try {}/{} with bytes read: {}/{}. "
+ "(Current backoff wait is {}/{} ms)",
+ uri.toString(), i + 1, settings.http_max_tries, getOffset(),
+ read_range.end ? toString(*read_range.end) : "unknown",
+ milliseconds_to_wait, settings.http_retry_max_backoff_ms);
+
+ assert(exception);
+ on_retriable_error();
+ continue;
+ }
+
+ assert(!exception);
+
+ if (use_external_buffer)
+ {
+ setupExternalBuffer();
+ }
+ }
+
+ result = impl->next();
+ exception = nullptr;
+ break;
+ }
+ catch (const Poco::Exception & e)
+ {
+ /// Too many open files - non-retryable.
+ if (e.code() == POCO_EMFILE)
+ throw;
+
+ /** Retry the request unconditionally if nothing has been read yet.
+ * Otherwise, if it is a GET request, retry with a range header.
+ */
+ bool can_retry_request = !offset_from_begin_pos || method == Poco::Net::HTTPRequest::HTTP_GET;
+ if (!can_retry_request)
+ throw;
+
+ LOG_ERROR(
+ log,
+ "HTTP request to `{}` failed at try {}/{} with bytes read: {}/{}. "
+ "Error: {}. (Current backoff wait is {}/{} ms)",
+ uri.toString(),
+ i + 1,
+ settings.http_max_tries,
+ getOffset(),
+ read_range.end ? toString(*read_range.end) : "unknown",
+ e.displayText(),
+ milliseconds_to_wait,
+ settings.http_retry_max_backoff_ms);
+
+ on_retriable_error();
+ exception = std::current_exception();
+ }
+ }
+
+ if (exception)
+ std::rethrow_exception(exception);
+
+ if (!result)
+ {
+ /// Eof is reached, i.e response was fully read.
+ markSessionForReuse(session->getSession());
+ ProfileEvents::increment(ProfileEvents::ReadWriteBufferFromHTTPPreservedSessions);
+ return false;
+ }
+
+ internal_buffer = impl->buffer();
+ working_buffer = internal_buffer;
+ offset_from_begin_pos += working_buffer.size();
+ return true;
+}
+
+template <typename UpdatableSessionPtr>
+size_t ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::readBigAt(char * to, size_t n, size_t offset, const std::function<bool(size_t)> & progress_callback)
+{
+ /// Caller must have checked supportsReadAt().
+ /// This ensures we've sent at least one HTTP request and populated saved_uri_redirect.
+ chassert(file_info && file_info->seekable);
+
+ if (n == 0)
+ return 0;
+
+ Poco::URI uri_ = saved_uri_redirect.value_or(uri);
+ if (uri_.getPath().empty())
+ uri_.setPath("/");
+
+ size_t milliseconds_to_wait = settings.http_retry_initial_backoff_ms;
+
+ for (size_t attempt = 0;; ++attempt)
+ {
+ bool last_attempt = attempt + 1 >= settings.http_max_tries;
+
+ Poco::Net::HTTPRequest request(method, uri_.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1);
+ prepareRequest(request, uri_, HTTPRange { .begin = offset, .end = offset + n - 1});
+
+ LOG_TRACE(log, "Sending request to {} for range [{}, {})", uri_.toString(), offset, offset + n);
+
+ auto sess = session->createDetachedSession(uri_);
+
+ Poco::Net::HTTPResponse response;
+ std::istream * result_istr;
+
+ try
+ {
+ sess->sendRequest(request);
+ result_istr = receiveResponse(*sess, request, response, /*allow_redirects*/ false);
+
+ if (response.getStatus() != Poco::Net::HTTPResponse::HTTPStatus::HTTP_PARTIAL_CONTENT &&
+ (offset != 0 || offset + n < *file_info->file_size))
+ throw Exception(
+ ErrorCodes::HTTP_RANGE_NOT_SATISFIABLE,
+ "Expected 206 Partial Content, got {} when reading {} range [{}, {})",
+ toString(response.getStatus()), uri_.toString(), offset, offset + n);
+
+ bool cancelled;
+ size_t r = copyFromIStreamWithProgressCallback(*result_istr, to, n, progress_callback, &cancelled);
+
+ if (!cancelled)
+ {
+ /// Response was fully read.
+ markSessionForReuse(sess);
+ ProfileEvents::increment(ProfileEvents::ReadWriteBufferFromHTTPPreservedSessions);
+ }
+
+ return r;
+ }
+ catch (const Poco::Exception & e)
+ {
+ LOG_ERROR(
+ log,
+ "HTTP request (positioned) to `{}` with range [{}, {}) failed at try {}/{}: {}",
+ uri_.toString(), offset, offset + n, attempt + 1, settings.http_max_tries,
+ e.what());
+
+ /// Decide whether to retry.
+
+ if (last_attempt)
+ throw;
+
+ /// Too many open files - non-retryable.
+ if (e.code() == POCO_EMFILE)
+ throw;
+
+ if (const auto * h = dynamic_cast<const HTTPException*>(&e);
+ h && !isRetriableError(static_cast<Poco::Net::HTTPResponse::HTTPStatus>(h->getHTTPStatus())))
+ throw;
+
+ sleepForMilliseconds(milliseconds_to_wait);
+ milliseconds_to_wait = std::min(milliseconds_to_wait * 2, settings.http_retry_max_backoff_ms);
+ continue;
+ }
+ }
+}
+
+template <typename UpdatableSessionPtr>
+off_t ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::getPosition() { return getOffset() - available(); }
+
+template <typename UpdatableSessionPtr>
+off_t ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::seek(off_t offset_, int whence)
+{
+ if (whence != SEEK_SET)
+ throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed.");
+
+ if (offset_ < 0)
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}",
+ offset_);
+
+ off_t current_offset = getOffset();
+ if (!working_buffer.empty() && size_t(offset_) >= current_offset - working_buffer.size() && offset_ < current_offset)
+ {
+ pos = working_buffer.end() - (current_offset - offset_);
+ assert(pos >= working_buffer.begin());
+ assert(pos < working_buffer.end());
+
+ return getPosition();
+ }
+
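+ /// If the target offset is ahead of the current position but close enough
+ /// (within remote_read_min_bytes_for_seek), it is cheaper to read and discard
+ /// the gap than to cancel the connection and issue a new ranged request.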
+ if (impl)
+ {
+ auto position = getPosition();
+ if (offset_ > position)
+ {
+ size_t diff = offset_ - position;
+ if (diff < settings.remote_read_min_bytes_for_seek)
+ {
+ ignore(diff);
+ return offset_;
+ }
+ }
+
+ if (!atEndOfRequestedRangeGuess())
+ ProfileEvents::increment(ProfileEvents::ReadBufferSeekCancelConnection);
+ impl.reset();
+ }
+
+ resetWorkingBuffer();
+ read_range.begin = offset_;
+ offset_from_begin_pos = 0;
+
+ return offset_;
+}
+
+template <typename UpdatableSessionPtr>
+void ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::setReadUntilPosition(size_t until)
+{
+ until = std::max(until, 1ul);
+ if (read_range.end && *read_range.end + 1 == until)
+ return;
+ read_range.end = until - 1;
+ read_range.begin = getPosition();
+ resetWorkingBuffer();
+ if (impl)
+ {
+ if (!atEndOfRequestedRangeGuess())
+ ProfileEvents::increment(ProfileEvents::ReadBufferSeekCancelConnection);
+ impl.reset();
+ }
+}
+
+template <typename UpdatableSessionPtr>
+void ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::setReadUntilEnd()
+{
+ if (!read_range.end)
+ return;
+ read_range.end.reset();
+ read_range.begin = getPosition();
+ resetWorkingBuffer();
+ if (impl)
+ {
+ if (!atEndOfRequestedRangeGuess())
+ ProfileEvents::increment(ProfileEvents::ReadBufferSeekCancelConnection);
+ impl.reset();
+ }
+}
+
+template <typename UpdatableSessionPtr>
+bool ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::supportsRightBoundedReads() const { return true; }
+
+template <typename UpdatableSessionPtr>
+bool ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::atEndOfRequestedRangeGuess()
+{
+ if (!impl)
+ return true;
+ if (read_range.end)
+ return getPosition() > static_cast<off_t>(*read_range.end);
+ if (file_info && file_info->file_size)
+ return getPosition() >= static_cast<off_t>(*file_info->file_size);
+ return false;
+}
+
+template <typename UpdatableSessionPtr>
+std::string ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::getResponseCookie(const std::string & name, const std::string & def) const
+{
+ for (const auto & cookie : cookies)
+ if (cookie.getName() == name)
+ return cookie.getValue();
+ return def;
+}
+
+template <typename UpdatableSessionPtr>
+void ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::setNextCallback(NextCallback next_callback_)
+{
+ next_callback = next_callback_;
+ /// Some data may already have been read.
+ next_callback(count());
+}
+
+template <typename UpdatableSessionPtr>
+const std::string & ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::getCompressionMethod() const { return content_encoding; }
+
+template <typename UpdatableSessionPtr>
+std::optional<time_t> ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::tryGetLastModificationTime()
+{
+ if (!file_info)
+ {
+ try
+ {
+ file_info = getFileInfo();
+ }
+ catch (...)
+ {
+ return std::nullopt;
+ }
+ }
+
+ return file_info->last_modified;
+}
+
+template <typename UpdatableSessionPtr>
+HTTPFileInfo ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::getFileInfo()
+{
+ Poco::Net::HTTPResponse response;
+ try
+ {
+ getHeadResponse(response);
+ }
+ catch (HTTPException & e)
+ {
+ /// Maybe the web server doesn't support HEAD requests.
+ /// E.g. webhdfs reports status 400.
+ /// We should proceed in hopes that the actual GET request will succeed.
+ /// (Unless the error is transient. We don't want to nondeterministically
+ /// fall back to slow whole-file reads when HEAD is actually supported; that
+ /// sounds like a nightmare to debug.)
+ if (e.getHTTPStatus() >= 400 && e.getHTTPStatus() <= 499 &&
+ e.getHTTPStatus() != Poco::Net::HTTPResponse::HTTP_TOO_MANY_REQUESTS)
+ return HTTPFileInfo{};
+
+ throw;
+ }
+ return parseFileInfo(response, 0);
+}
+
+template <typename UpdatableSessionPtr>
+HTTPFileInfo ReadWriteBufferFromHTTPBase<UpdatableSessionPtr>::parseFileInfo(const Poco::Net::HTTPResponse & response, size_t requested_range_begin)
+{
+ HTTPFileInfo res;
+
+ if (response.hasContentLength())
+ {
+ res.file_size = response.getContentLength();
+
+ if (response.getStatus() == Poco::Net::HTTPResponse::HTTPStatus::HTTP_PARTIAL_CONTENT)
+ {
+ *res.file_size += requested_range_begin;
+ res.seekable = true;
+ }
+ else
+ {
+ res.seekable = response.has("Accept-Ranges") && response.get("Accept-Ranges") == "bytes";
+ }
+ }
+
+ if (response.has("Last-Modified"))
+ {
+ String date_str = response.get("Last-Modified");
+ struct tm info;
+ char * end = strptime(date_str.data(), "%a, %d %b %Y %H:%M:%S %Z", &info);
+ if (end == date_str.data() + date_str.size())
+ res.last_modified = timegm(&info);
+ }
+
+ return res;
+}
+
+}
+
+SessionFactory::SessionFactory(const ConnectionTimeouts & timeouts_, Poco::Net::HTTPClientSession::ProxyConfig proxy_config_)
+ : timeouts(timeouts_), proxy_config(proxy_config_) {}
+
+SessionFactory::SessionType SessionFactory::buildNewSession(const Poco::URI & uri)
+{
+ return makeHTTPSession(uri, timeouts, proxy_config);
+}
+
+ReadWriteBufferFromHTTP::ReadWriteBufferFromHTTP(
+ Poco::URI uri_,
+ const std::string & method_,
+ OutStreamCallback out_stream_callback_,
+ const ConnectionTimeouts & timeouts,
+ const Poco::Net::HTTPBasicCredentials & credentials_,
+ const UInt64 max_redirects,
+ size_t buffer_size_,
+ const ReadSettings & settings_,
+ const HTTPHeaderEntries & http_header_entries_,
+ const RemoteHostFilter * remote_host_filter_,
+ bool delay_initialization_,
+ bool use_external_buffer_,
+ bool skip_not_found_url_,
+ std::optional<HTTPFileInfo> file_info_,
+ Poco::Net::HTTPClientSession::ProxyConfig proxy_config_)
+ : Parent(
+ std::make_shared<SessionType>(uri_, max_redirects, std::make_shared<SessionFactory>(timeouts, proxy_config_)),
+ uri_,
+ credentials_,
+ method_,
+ out_stream_callback_,
+ buffer_size_,
+ settings_,
+ http_header_entries_,
+ remote_host_filter_,
+ delay_initialization_,
+ use_external_buffer_,
+ skip_not_found_url_,
+ file_info_,
+ proxy_config_) {}
+
+
+PooledSessionFactory::PooledSessionFactory(
+ const ConnectionTimeouts & timeouts_, size_t per_endpoint_pool_size_)
+ : timeouts(timeouts_)
+ , per_endpoint_pool_size(per_endpoint_pool_size_) {}
+
+PooledSessionFactory::SessionType PooledSessionFactory::buildNewSession(const Poco::URI & uri)
+{
+ return makePooledHTTPSession(uri, timeouts, per_endpoint_pool_size);
+}
+
+
+PooledReadWriteBufferFromHTTP::PooledReadWriteBufferFromHTTP(
+ Poco::URI uri_,
+ const std::string & method_,
+ OutStreamCallback out_stream_callback_,
+ const ConnectionTimeouts & timeouts_,
+ const Poco::Net::HTTPBasicCredentials & credentials_,
+ size_t buffer_size_,
+ const UInt64 max_redirects,
+ size_t max_connections_per_endpoint)
+ : Parent(
+ std::make_shared<SessionType>(uri_, max_redirects, std::make_shared<PooledSessionFactory>(timeouts_, max_connections_per_endpoint)),
+ uri_,
+ credentials_,
+ method_,
+ out_stream_callback_,
+ buffer_size_) {}
+
+
+template class UpdatableSession<SessionFactory>;
+template class UpdatableSession<PooledSessionFactory>;
+template class detail::ReadWriteBufferFromHTTPBase<std::shared_ptr<UpdatableSession<SessionFactory>>>;
+template class detail::ReadWriteBufferFromHTTPBase<std::shared_ptr<UpdatableSession<PooledSessionFactory>>>;
+
+}
diff --git a/contrib/clickhouse/src/IO/ReadWriteBufferFromHTTP.h b/contrib/clickhouse/src/IO/ReadWriteBufferFromHTTP.h
new file mode 100644
index 0000000000..ae02292446
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ReadWriteBufferFromHTTP.h
@@ -0,0 +1,291 @@
+#pragma once
+
+#include <functional>
+#include <IO/ConnectionTimeouts.h>
+#include <IO/HTTPCommon.h>
+#include <IO/ParallelReadBuffer.h>
+#include <IO/ReadBuffer.h>
+#include <IO/ReadBufferFromIStream.h>
+#include <IO/ReadHelpers.h>
+#include <IO/ReadSettings.h>
+#include <IO/WithFileName.h>
+#include <IO/HTTPHeaderEntries.h>
+#include <Common/logger_useful.h>
+#include <base/sleep.h>
+#include <base/types.h>
+#include <Poco/Any.h>
+#include <Poco/Net/HTTPBasicCredentials.h>
+#include <Poco/Net/HTTPClientSession.h>
+#include <Poco/Net/HTTPRequest.h>
+#include <Poco/Net/HTTPResponse.h>
+#include <Poco/URI.h>
+#include <Poco/URIStreamFactory.h>
+#include <Common/DNSResolver.h>
+#include <Common/RemoteHostFilter.h>
+#include "clickhouse_config.h"
+#include "config_version.h"
+
+#include <filesystem>
+
+namespace DB
+{
+
+template <typename TSessionFactory>
+class UpdatableSession
+{
+public:
+ using SessionPtr = typename TSessionFactory::SessionType;
+
+ explicit UpdatableSession(const Poco::URI & uri, UInt64 max_redirects_, std::shared_ptr<TSessionFactory> session_factory_);
+
+ SessionPtr getSession();
+
+ void updateSession(const Poco::URI & uri);
+
+ /// Thread safe.
+ SessionPtr createDetachedSession(const Poco::URI & uri);
+
+ std::shared_ptr<UpdatableSession<TSessionFactory>> clone(const Poco::URI & uri);
+
+private:
+ SessionPtr session;
+ UInt64 redirects{0};
+ UInt64 max_redirects;
+ Poco::URI initial_uri;
+ std::shared_ptr<TSessionFactory> session_factory;
+};
+
+
+/// Information from HTTP response header.
+struct HTTPFileInfo
+{
+ // nullopt if the server doesn't report it.
+ std::optional<size_t> file_size;
+ std::optional<time_t> last_modified;
+ bool seekable = false;
+};
+
+
+namespace detail
+{
+ /// Byte range, including right bound [begin, end].
+ struct HTTPRange
+ {
+ std::optional<size_t> begin;
+ std::optional<size_t> end;
+ };
+
+ template <typename UpdatableSessionPtr>
+ class ReadWriteBufferFromHTTPBase : public SeekableReadBuffer, public WithFileName, public WithFileSize
+ {
+ protected:
+ Poco::URI uri;
+ std::string method;
+ std::string content_encoding;
+
+ UpdatableSessionPtr session;
+ std::istream * istr; /// owned by session
+ std::unique_ptr<ReadBuffer> impl;
+ std::function<void(std::ostream &)> out_stream_callback;
+ const Poco::Net::HTTPBasicCredentials & credentials;
+ std::vector<Poco::Net::HTTPCookie> cookies;
+ HTTPHeaderEntries http_header_entries;
+ const RemoteHostFilter * remote_host_filter = nullptr;
+ std::function<void(size_t)> next_callback;
+
+ size_t buffer_size;
+ bool use_external_buffer;
+
+ size_t offset_from_begin_pos = 0;
+ HTTPRange read_range;
+ std::optional<HTTPFileInfo> file_info;
+
+ /// Delayed exception in case retries with partial content are not satisfiable.
+ std::exception_ptr exception;
+ bool retry_with_range_header = false;
+ /// In case of redirects, save result uri to use it if we retry the request.
+ std::optional<Poco::URI> saved_uri_redirect;
+
+ bool http_skip_not_found_url;
+
+ ReadSettings settings;
+ Poco::Logger * log;
+
+ Poco::Net::HTTPClientSession::ProxyConfig proxy_config;
+
+ bool withPartialContent(const HTTPRange & range) const;
+
+ size_t getOffset() const;
+
+ void prepareRequest(Poco::Net::HTTPRequest & request, Poco::URI uri_, std::optional<HTTPRange> range) const;
+
+ std::istream * callImpl(UpdatableSessionPtr & current_session, Poco::URI uri_, Poco::Net::HTTPResponse & response, const std::string & method_, bool for_object_info = false);
+
+ size_t getFileSize() override;
+
+ bool supportsReadAt() override;
+
+ bool checkIfActuallySeekable() override;
+
+ String getFileName() const override;
+
+ enum class InitializeError
+ {
+ RETRYABLE_ERROR,
+ /// If the error is not retriable, the `exception` variable must be set.
+ NON_RETRYABLE_ERROR,
+ /// Allows skipping not-found URLs when expanding globs.
+ SKIP_NOT_FOUND_URL,
+ NONE,
+ };
+
+ InitializeError initialization_error = InitializeError::NONE;
+
+ private:
+ void getHeadResponse(Poco::Net::HTTPResponse & response);
+
+ void setupExternalBuffer();
+
+ public:
+ using NextCallback = std::function<void(size_t)>;
+ using OutStreamCallback = std::function<void(std::ostream &)>;
+
+ explicit ReadWriteBufferFromHTTPBase(
+ UpdatableSessionPtr session_,
+ Poco::URI uri_,
+ const Poco::Net::HTTPBasicCredentials & credentials_,
+ const std::string & method_ = {},
+ OutStreamCallback out_stream_callback_ = {},
+ size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE,
+ const ReadSettings & settings_ = {},
+ HTTPHeaderEntries http_header_entries_ = {},
+ const RemoteHostFilter * remote_host_filter_ = nullptr,
+ bool delay_initialization = false,
+ bool use_external_buffer_ = false,
+ bool http_skip_not_found_url_ = false,
+ std::optional<HTTPFileInfo> file_info_ = std::nullopt,
+ Poco::Net::HTTPClientSession::ProxyConfig proxy_config_ = {});
+
+ void callWithRedirects(Poco::Net::HTTPResponse & response, const String & method_, bool throw_on_all_errors = false, bool for_object_info = false);
+
+ void call(UpdatableSessionPtr & current_session, Poco::Net::HTTPResponse & response, const String & method_, bool throw_on_all_errors = false, bool for_object_info = false);
+
+ /**
+ * Throws if the error is retryable; otherwise sets initialization_error = NON_RETRYABLE_ERROR and
+ * saves the exception into the `exception` variable. If the URL is not found and skip_not_found_url == true,
+ * sets initialization_error = SKIP_NOT_FOUND_URL instead of throwing.
+ */
+ void initialize();
+
+ bool nextImpl() override;
+
+ size_t readBigAt(char * to, size_t n, size_t offset, const std::function<bool(size_t)> & progress_callback) override;
+
+ off_t getPosition() override;
+
+ off_t seek(off_t offset_, int whence) override;
+
+ void setReadUntilPosition(size_t until) override;
+
+ void setReadUntilEnd() override;
+
+ bool supportsRightBoundedReads() const override;
+
+ // Returns true if destroying impl now would not waste any work. Used only for metrics.
+ bool atEndOfRequestedRangeGuess();
+
+ std::string getResponseCookie(const std::string & name, const std::string & def) const;
+
+ /// Set a function to call on each nextImpl; useful when you need to track
+ /// progress.
+ /// NOTE: the parameter on each call is not incremental -- it is the total
+ /// count of bytes passed through the buffer.
+ void setNextCallback(NextCallback next_callback_);
+
+ const std::string & getCompressionMethod() const;
+
+ std::optional<time_t> tryGetLastModificationTime();
+
+ HTTPFileInfo getFileInfo();
+
+ HTTPFileInfo parseFileInfo(const Poco::Net::HTTPResponse & response, size_t requested_range_begin);
+ };
+}
+
+class SessionFactory
+{
+public:
+ explicit SessionFactory(const ConnectionTimeouts & timeouts_, Poco::Net::HTTPClientSession::ProxyConfig proxy_config_ = {});
+
+ using SessionType = HTTPSessionPtr;
+
+ SessionType buildNewSession(const Poco::URI & uri);
+private:
+ ConnectionTimeouts timeouts;
+ Poco::Net::HTTPClientSession::ProxyConfig proxy_config;
+};
+
+class ReadWriteBufferFromHTTP : public detail::ReadWriteBufferFromHTTPBase<std::shared_ptr<UpdatableSession<SessionFactory>>>
+{
+ using SessionType = UpdatableSession<SessionFactory>;
+ using Parent = detail::ReadWriteBufferFromHTTPBase<std::shared_ptr<SessionType>>;
+
+public:
+ ReadWriteBufferFromHTTP(
+ Poco::URI uri_,
+ const std::string & method_,
+ OutStreamCallback out_stream_callback_,
+ const ConnectionTimeouts & timeouts,
+ const Poco::Net::HTTPBasicCredentials & credentials_,
+ const UInt64 max_redirects = 0,
+ size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE,
+ const ReadSettings & settings_ = {},
+ const HTTPHeaderEntries & http_header_entries_ = {},
+ const RemoteHostFilter * remote_host_filter_ = nullptr,
+ bool delay_initialization_ = true,
+ bool use_external_buffer_ = false,
+ bool skip_not_found_url_ = false,
+ std::optional<HTTPFileInfo> file_info_ = std::nullopt,
+ Poco::Net::HTTPClientSession::ProxyConfig proxy_config_ = {});
+};
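+
+/// Usage sketch (illustrative; the URL is an assumption, and the default-constructed
+/// timeouts mirror the defaults used by PooledReadWriteBufferFromHTTP below):
+///     Poco::Net::HTTPBasicCredentials credentials;
+///     ReadWriteBufferFromHTTP in(
+///         Poco::URI("http://example.com/data.csv"),
+///         Poco::Net::HTTPRequest::HTTP_GET,
+///         /*out_stream_callback=*/ {},
+///         ConnectionTimeouts{},
+///         credentials);
+///     String body;
+///     readStringUntilEOF(body, in);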
+
+class PooledSessionFactory
+{
+public:
+ explicit PooledSessionFactory(
+ const ConnectionTimeouts & timeouts_, size_t per_endpoint_pool_size_);
+
+ using SessionType = PooledHTTPSessionPtr;
+
+ /// Thread safe.
+ SessionType buildNewSession(const Poco::URI & uri);
+
+private:
+ ConnectionTimeouts timeouts;
+ size_t per_endpoint_pool_size;
+};
+
+class PooledReadWriteBufferFromHTTP : public detail::ReadWriteBufferFromHTTPBase<std::shared_ptr<UpdatableSession<PooledSessionFactory>>>
+{
+ using SessionType = UpdatableSession<PooledSessionFactory>;
+ using Parent = detail::ReadWriteBufferFromHTTPBase<std::shared_ptr<SessionType>>;
+
+public:
+ explicit PooledReadWriteBufferFromHTTP(
+ Poco::URI uri_,
+ const std::string & method_ = {},
+ OutStreamCallback out_stream_callback_ = {},
+ const ConnectionTimeouts & timeouts_ = {},
+ const Poco::Net::HTTPBasicCredentials & credentials_ = {},
+ size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE,
+ const UInt64 max_redirects = 0,
+ size_t max_connections_per_endpoint = DEFAULT_COUNT_OF_HTTP_CONNECTIONS_PER_ENDPOINT);
+};
+
+
+extern template class UpdatableSession<SessionFactory>;
+extern template class UpdatableSession<PooledSessionFactory>;
+extern template class detail::ReadWriteBufferFromHTTPBase<std::shared_ptr<UpdatableSession<SessionFactory>>>;
+extern template class detail::ReadWriteBufferFromHTTPBase<std::shared_ptr<UpdatableSession<PooledSessionFactory>>>;
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/ClassifiersConfig.cpp b/contrib/clickhouse/src/IO/Resource/ClassifiersConfig.cpp
new file mode 100644
index 0000000000..fcd4655e2e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/ClassifiersConfig.cpp
@@ -0,0 +1,40 @@
+#include <IO/Resource/ClassifiersConfig.h>
+
+#include <Common/Exception.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int RESOURCE_NOT_FOUND;
+}
+
+ClassifierDescription::ClassifierDescription(const Poco::Util::AbstractConfiguration & config, const String & config_prefix)
+{
+ Poco::Util::AbstractConfiguration::Keys keys;
+ config.keys(config_prefix, keys);
+ for (const auto & key : keys)
+ emplace(key, config.getString(config_prefix + "." + key));
+}
+
+ClassifiersConfig::ClassifiersConfig(const Poco::Util::AbstractConfiguration & config)
+{
+ Poco::Util::AbstractConfiguration::Keys keys;
+ const String config_prefix = "classifiers";
+ config.keys(config_prefix, keys);
+ for (const auto & key : keys)
+ classifiers.emplace(std::piecewise_construct,
+ std::forward_as_tuple(key),
+ std::forward_as_tuple(config, config_prefix + "." + key));
+}
+
+const ClassifierDescription & ClassifiersConfig::get(const String & classifier_name)
+{
+ if (auto it = classifiers.find(classifier_name); it != classifiers.end())
+ return it->second;
+ else
+ throw Exception(ErrorCodes::RESOURCE_NOT_FOUND, "Unknown classifier '{}' to access resources", classifier_name);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/ClassifiersConfig.h b/contrib/clickhouse/src/IO/Resource/ClassifiersConfig.h
new file mode 100644
index 0000000000..96e2bd0f0b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/ClassifiersConfig.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <base/types.h>
+#include <Poco/Util/AbstractConfiguration.h>
+#include <unordered_map>
+
+namespace DB
+{
+
+/// Mapping of resource names to path strings (e.g. "disk1" -> "/path/to/class")
+struct ClassifierDescription : std::unordered_map<String, String>
+{
+ ClassifierDescription(const Poco::Util::AbstractConfiguration & config, const String & config_prefix);
+};
+
+/*
+ * Loads a config with the following format:
+ * <classifiers>
+ * <classifier1>
+ * <resource1>/path/to/queue</resource1>
+ * <resource2>/path/to/another/queue</resource2>
+ * </classifier1>
+ * ...
+ * <classifierN>...</classifierN>
+ * </classifiers>
+ */
+class ClassifiersConfig
+{
+public:
+ ClassifiersConfig() = default;
+ explicit ClassifiersConfig(const Poco::Util::AbstractConfiguration & config);
+
+ const ClassifierDescription & get(const String & classifier_name);
+
+private:
+ std::unordered_map<String, ClassifierDescription> classifiers; // by classifier_name
+};
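+
+/// Usage sketch (illustrative, matching the XML example above):
+///     ClassifiersConfig classifiers(config);
+///     const ClassifierDescription & c = classifiers.get("classifier1");
+///     String queue_path = c.at("resource1"); /// "/path/to/queue"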
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/DynamicResourceManager.cpp b/contrib/clickhouse/src/IO/Resource/DynamicResourceManager.cpp
new file mode 100644
index 0000000000..df0de6575f
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/DynamicResourceManager.cpp
@@ -0,0 +1,234 @@
+#include <IO/Resource/DynamicResourceManager.h>
+
+#include <IO/SchedulerNodeFactory.h>
+#include <IO/ResourceManagerFactory.h>
+#include <IO/ISchedulerQueue.h>
+
+#include <Common/Exception.h>
+#include <Common/StringUtils/StringUtils.h>
+
+#include <map>
+#include <tuple>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int RESOURCE_ACCESS_DENIED;
+ extern const int RESOURCE_NOT_FOUND;
+ extern const int INVALID_SCHEDULER_NODE;
+}
+
+DynamicResourceManager::State::State(EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config)
+ : classifiers(config)
+{
+ Poco::Util::AbstractConfiguration::Keys keys;
+ const String config_prefix = "resources";
+ config.keys(config_prefix, keys);
+
+ // Create resource for every element under <resources> tag
+ for (const auto & key : keys)
+ {
+ resources.emplace(key, std::make_shared<Resource>(key, event_queue, config, config_prefix + "." + key));
+ }
+}
+
+DynamicResourceManager::State::Resource::Resource(
+ const String & name,
+ EventQueue * event_queue,
+ const Poco::Util::AbstractConfiguration & config,
+ const std::string & config_prefix)
+{
+ Poco::Util::AbstractConfiguration::Keys keys;
+ config.keys(config_prefix, keys);
+
+ // Sort nodes by path to create parents before children
+ std::map<String, String> path2key;
+ for (const auto & key : keys)
+ {
+ if (!startsWith(key, "node"))
+ continue;
+ String path = config.getString(config_prefix + "." + key + "[@path]", "");
+ if (path.empty())
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Attribute 'path' must be specified in all nodes for resource '{}'", name);
+ if (path[0] != '/')
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Path must start with '/' for resource '{}'", name);
+ if (auto [_, inserted] = path2key.emplace(path, key); !inserted)
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Duplicate path '{}' for resource '{}'", path, name);
+ }
+
+ // Create nodes
+ bool has_root = false;
+ for (auto [path, key] : path2key)
+ {
+ // Validate path
+ size_t slash = path.rfind('/');
+ if (slash == String::npos)
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Invalid scheduler node path '{}' for resource '{}'", path, name);
+
+ // Create node
+ String basename = path.substr(slash + 1); // root name is empty string
+ auto [iter, _] = nodes.emplace(path, Node(basename, event_queue, config, config_prefix + "." + key));
+ if (path == "/")
+ {
+ has_root = true;
+ continue;
+ }
+
+ // Attach created node to parent (if not root)
+ // NOTE: resource root is attached to the scheduler using event queue for thread-safety
+ String parent_path = path.substr(0, slash);
+ if (parent_path.empty())
+ parent_path = "/";
+ if (auto parent = nodes.find(parent_path); parent != nodes.end())
+ parent->second.ptr->attachChild(iter->second.ptr);
+ else
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Parent node doesn't exist for path '{}' for resource '{}'", path, name);
+ }
+
+ if (!has_root)
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "undefined root node path '/' for resource '{}'", name);
+}
+
+DynamicResourceManager::State::Resource::~Resource()
+{
+ // NOTE: we should rely on `attached_to` and cannot use `parent`,
+ // NOTE: because `parent` can be `nullptr` while the attachment is still pending in the event queue
+ if (attached_to != nullptr)
+ {
+ ISchedulerNode * root = nodes.find("/")->second.ptr.get();
+ attached_to->event_queue->enqueue([my_scheduler = attached_to, root]
+ {
+ my_scheduler->removeChild(root);
+ });
+ }
+}
+
+DynamicResourceManager::State::Node::Node(const String & name, EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
+ : type(config.getString(config_prefix + ".type", "fifo"))
+ , ptr(SchedulerNodeFactory::instance().get(type, event_queue, config, config_prefix))
+{
+ ptr->basename = name;
+}
+
+bool DynamicResourceManager::State::Resource::equals(const DynamicResourceManager::State::Resource & o) const
+{
+ if (nodes.size() != o.nodes.size())
+ return false;
+
+ for (const auto & [path, o_node] : o.nodes)
+ {
+ auto iter = nodes.find(path);
+ if (iter == nodes.end())
+ return false;
+ if (!iter->second.equals(o_node))
+ return false;
+ }
+
+ return true;
+}
+
+bool DynamicResourceManager::State::Node::equals(const DynamicResourceManager::State::Node & o) const
+{
+ if (type != o.type)
+ return false;
+ return ptr->equals(o.ptr.get());
+}
+
+DynamicResourceManager::Classifier::Classifier(const DynamicResourceManager::StatePtr & state_, const String & classifier_name)
+ : state(state_)
+{
+ // State is immutable, but nodes are mutable and thread-safe
+ // So it's safe to obtain node pointers w/o lock
+ for (auto [resource_name, path] : state->classifiers.get(classifier_name))
+ {
+ if (auto resource_iter = state->resources.find(resource_name); resource_iter != state->resources.end())
+ {
+ const auto & resource = resource_iter->second;
+ if (auto node_iter = resource->nodes.find(path); node_iter != resource->nodes.end())
+ {
+ if (auto * queue = dynamic_cast<ISchedulerQueue *>(node_iter->second.ptr.get()))
+ resources.emplace(resource_name, ResourceLink{.queue = queue});
+ else
+ throw Exception(ErrorCodes::RESOURCE_NOT_FOUND, "Unable to access non-queue node at path '{}' for resource '{}'", path, resource_name);
+ }
+ else
+ throw Exception(ErrorCodes::RESOURCE_NOT_FOUND, "Path '{}' for resource '{}' does not exist", path, resource_name);
+ }
+ else
+ resources.emplace(resource_name, ResourceLink{}); // resource not configured yet - use unlimited resource
+ }
+}
+
+ResourceLink DynamicResourceManager::Classifier::get(const String & resource_name)
+{
+ if (auto iter = resources.find(resource_name); iter != resources.end())
+ return iter->second;
+ else
+ throw Exception(ErrorCodes::RESOURCE_ACCESS_DENIED, "Access denied to resource '{}'", resource_name);
+}
+
+DynamicResourceManager::DynamicResourceManager()
+ : state(new State())
+{
+ scheduler.start();
+}
+
+void DynamicResourceManager::updateConfiguration(const Poco::Util::AbstractConfiguration & config)
+{
+ StatePtr new_state = std::make_shared<State>(scheduler.event_queue, config);
+
+ std::lock_guard lock{mutex};
+
+ // A resource update loses the runtime data of nodes and may temporarily violate constraints (e.g. limits).
+ // Try to minimise this by reusing "equal" resources (i.e. those initialized with the same configuration).
+ for (auto & [name, new_resource] : new_state->resources)
+ {
+ if (auto iter = state->resources.find(name); iter != state->resources.end()) // Resource update
+ {
+ State::ResourcePtr old_resource = iter->second;
+ if (old_resource->equals(*new_resource))
+ new_resource = old_resource; // Rewrite with older version to avoid loss of runtime data
+ }
+ }
+
+ // Commit new state
+ // NOTE: dtor will detach from scheduler old resources that are not in use currently
+ state = new_state;
+
+ // Attach new and updated resources to the scheduler
+ for (auto & [name, resource] : new_state->resources)
+ {
+ const SchedulerNodePtr & root = resource->nodes.find("/")->second.ptr;
+ if (root->parent == nullptr)
+ {
+ resource->attached_to = &scheduler;
+ scheduler.event_queue->enqueue([this, root]
+ {
+ scheduler.attachChild(root);
+ });
+ }
+ }
+
+ // NOTE: after the mutex unlock, `state` becomes available to Classifier(s) and must be immutable
+}
+
+ClassifierPtr DynamicResourceManager::acquire(const String & classifier_name)
+{
+ // Acquire a reference to the current state
+ StatePtr state_;
+ {
+ std::lock_guard lock{mutex};
+ state_ = state;
+ }
+
+ return std::make_shared<Classifier>(state_, classifier_name);
+}
+
+void registerDynamicResourceManager(ResourceManagerFactory & factory)
+{
+ factory.registerMethod<DynamicResourceManager>("dynamic");
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/DynamicResourceManager.h b/contrib/clickhouse/src/IO/Resource/DynamicResourceManager.h
new file mode 100644
index 0000000000..aa1147f1fb
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/DynamicResourceManager.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#include <IO/IResourceManager.h>
+#include <IO/SchedulerRoot.h>
+#include <IO/Resource/ClassifiersConfig.h>
+
+#include <mutex>
+
+namespace DB
+{
+
+/*
+ * Implementation of `IResourceManager` supporting an arbitrary dynamic hierarchy of scheduler nodes.
+ * All resources are controlled by a single root `SchedulerRoot`.
+ *
+ * The state of the manager is a set of resources attached to the scheduler. States are referenced by
+ * classifiers. Classifiers are used (1) to access resources and (2) to keep shared ownership of resources
+ * with pending resource requests. This allows `ResourceRequest` and `ResourceLink` to hold raw pointers
+ * as long as a `ClassifierPtr` is acquired and held.
+ *
+ * The manager can update the configuration after initialization. During an update, the new versions of
+ * resources are also attached to the scheduler, so multiple versions can coexist for a short period.
+ * This may temporarily violate constraints (e.g. the in-flight limit), because different versions have
+ * independent nodes imposing the constraints; the same applies to fairness. An old version exists as
+ * long as there is at least one classifier instance referencing it. Classifiers are typically attached
+ * to queries and are destructed together with them.
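+ *
+ * Example of a <resources> config consumed by updateConfiguration() (an illustrative sketch;
+ * resource and node names are hypothetical, the types are the registered scheduler node types):
+ *   <resources>
+ *     <network_read>
+ *       <node path="/"><type>inflight_limit</type><max_requests>100</max_requests></node>
+ *       <node path="/fair"><type>fair</type></node>
+ *       <node path="/fair/A"><type>fifo</type></node>
+ *       <node path="/fair/B"><type>fifo</type></node>
+ *     </network_read>
+ *   </resources>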
+ */
+class DynamicResourceManager : public IResourceManager
+{
+public:
+ DynamicResourceManager();
+ void updateConfiguration(const Poco::Util::AbstractConfiguration & config) override;
+ ClassifierPtr acquire(const String & classifier_name) override;
+
+private:
+ /// Holds everything required to work with one specific configuration
+ struct State
+ {
+ struct Node
+ {
+ String type;
+ SchedulerNodePtr ptr;
+
+ Node(
+ const String & name,
+ EventQueue * event_queue,
+ const Poco::Util::AbstractConfiguration & config,
+ const std::string & config_prefix);
+ bool equals(const Node & o) const;
+ };
+
+ struct Resource
+ {
+ std::unordered_map<String, Node> nodes; // by path
+ SchedulerRoot * attached_to = nullptr;
+
+ Resource(
+ const String & name,
+ EventQueue * event_queue,
+ const Poco::Util::AbstractConfiguration & config,
+ const std::string & config_prefix);
+ ~Resource(); // unregisters resource from scheduler
+ bool equals(const Resource & o) const;
+ };
+
+ using ResourcePtr = std::shared_ptr<Resource>;
+
+ std::unordered_map<String, ResourcePtr> resources; // by name
+ ClassifiersConfig classifiers;
+
+ State() = default;
+ explicit State(EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config);
+ };
+
+ using StatePtr = std::shared_ptr<State>;
+
+ /// Created per query, holds State used by that query
+ class Classifier : public IClassifier
+ {
+ public:
+ Classifier(const StatePtr & state_, const String & classifier_name);
+ ResourceLink get(const String & resource_name) override;
+ private:
+ std::unordered_map<String, ResourceLink> resources; // accessible resources by names
+ StatePtr state; // hold state to avoid ResourceLink invalidation due to resource deregistration from SchedulerRoot
+ };
+
+private:
+ SchedulerRoot scheduler;
+ std::mutex mutex;
+ StatePtr state;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/FairPolicy.cpp b/contrib/clickhouse/src/IO/Resource/FairPolicy.cpp
new file mode 100644
index 0000000000..248ff04cbd
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/FairPolicy.cpp
@@ -0,0 +1,13 @@
+#include <IO/Resource/FairPolicy.h>
+
+#include <IO/SchedulerNodeFactory.h>
+
+namespace DB
+{
+
+void registerFairPolicy(SchedulerNodeFactory & factory)
+{
+ factory.registerMethod<FairPolicy>("fair");
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/FairPolicy.h b/contrib/clickhouse/src/IO/Resource/FairPolicy.h
new file mode 100644
index 0000000000..9c0c78f057
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/FairPolicy.h
@@ -0,0 +1,232 @@
+#pragma once
+
+#include <IO/ISchedulerQueue.h>
+#include <IO/SchedulerRoot.h>
+
+#include <Common/Stopwatch.h>
+
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int INVALID_SCHEDULER_NODE;
+}
+
+/*
+ * Scheduler node that implements weight-based fair scheduling policy.
+ * Based on Start-time Fair Queueing (SFQ) algorithm.
+ *
+ * Algorithm description.
+ * Virtual runtime (total consumed cost divided by child weight) is tracked for every child.
+ * The active child with the minimum vruntime is selected to be dequeued next. On activation, the initial
+ * vruntime of a child is set to the vruntime of the "start" of the last served request. This guarantees
+ * immediate processing of at least a single request of a newly activated child, and thus the best
+ * isolation and scheduling latency.
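+ *
+ * Worked example (illustrative): children A (weight 1) and B (weight 3), every request of cost 12.
+ * Serving one request advances vruntime by cost / weight: by 12 for A and by 4 for B. Starting
+ * from vruntime 0 for both, one possible dequeue order is A(0), B(0), B(4), B(8), A(12), B(12), ...
+ * so B receives roughly 3x the bandwidth of A, proportional to its weight.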
+ */
+class FairPolicy : public ISchedulerNode
+{
+ /// Scheduling state of a child
+ struct Item
+ {
+ ISchedulerNode * child = nullptr;
+ double vruntime = 0; /// total consumed cost divided by child weight
+
+ /// For min-heap by vruntime
+ bool operator<(const Item & rhs) const noexcept
+ {
+ return vruntime > rhs.vruntime;
+ }
+ };
+
+public:
+ explicit FairPolicy(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
+ : ISchedulerNode(event_queue_, config, config_prefix)
+ {}
+
+ bool equals(ISchedulerNode * other) override
+ {
+ if (auto * o = dynamic_cast<FairPolicy *>(other))
+ return true;
+ return false;
+ }
+
+ void attachChild(const SchedulerNodePtr & child) override
+ {
+ // Take ownership
+ if (auto [it, inserted] = children.emplace(child->basename, child); !inserted)
+ throw Exception(
+ ErrorCodes::INVALID_SCHEDULER_NODE,
+ "Can't add another child with the same path: {}",
+ it->second->getPath());
+
+ // Attach
+ child->setParent(this);
+
+ // At first, attach as an inactive child.
+ // An inactive attached child must have `info.parent.idx` equal to its index inside the `items` array.
+ // This is needed to avoid later scanning through inactive `items` in O(N). Important optimization.
+ // NOTE: vruntime must be equal to `system_vruntime` for fairness.
+ child->info.parent.idx = items.size();
+ items.emplace_back(Item{child.get(), system_vruntime});
+
+ // Activate child if it is not empty
+ if (child->isActive())
+ activateChildImpl(items.size() - 1);
+ }
+
+ void removeChild(ISchedulerNode * child) override
+ {
+ if (auto iter = children.find(child->basename); iter != children.end())
+ {
+ SchedulerNodePtr removed = iter->second;
+
+ // Deactivate: detach is not a very common operation, so we can afford O(N) here
+ size_t child_idx = 0;
+ [[ maybe_unused ]] bool found = false;
+ for (; child_idx != items.size(); child_idx++)
+ {
+ if (items[child_idx].child == removed.get())
+ {
+ found = true;
+ break;
+ }
+ }
+ assert(found);
+ if (child_idx < heap_size) // Detach of active child requires deactivation at first
+ {
+ heap_size--;
+ std::swap(items[child_idx], items[heap_size]);
+ // Element was removed from inside of heap -- heap must be rebuilt
+ std::make_heap(items.begin(), items.begin() + heap_size);
+ child_idx = heap_size;
+ }
+
+ // Now detach inactive child
+ if (child_idx != items.size() - 1)
+ {
+ std::swap(items[child_idx], items.back());
+ items[child_idx].child->info.parent.idx = child_idx;
+ }
+ items.pop_back();
+
+ // Detach
+ removed->setParent(nullptr);
+
+ // Get rid of ownership
+ children.erase(iter);
+ }
+ }
+
+ ISchedulerNode * getChild(const String & child_name) override
+ {
+ if (auto iter = children.find(child_name); iter != children.end())
+ return iter->second.get();
+ else
+ return nullptr;
+ }
+
+ std::pair<ResourceRequest *, bool> dequeueRequest() override
+ {
+ if (heap_size == 0)
+ return {nullptr, false};
+
+ // Recursively pull request from child
+ auto [request, child_active] = items.front().child->dequeueRequest();
+ assert(request != nullptr);
+ std::pop_heap(items.begin(), items.begin() + heap_size);
+ Item & current = items[heap_size - 1];
+
+ // SFQ fairness invariant: system vruntime equals last served request start-time
+ assert(current.vruntime >= system_vruntime);
+ system_vruntime = current.vruntime;
+
+ // By definition vruntime is amount of consumed resource (cost) divided by weight
+ current.vruntime += double(request->cost) / current.child->info.weight;
+ max_vruntime = std::max(max_vruntime, current.vruntime);
+
+ if (child_active) // Put active child back in heap after vruntime update
+ {
+ std::push_heap(items.begin(), items.begin() + heap_size);
+ }
+ else // Deactivate child if it is empty, but remember its vruntime for later activations
+ {
+ heap_size--;
+
+ // Store index of this inactive child in `parent.idx`
+ // This enables O(1) search of inactive children instead of O(n)
+ current.child->info.parent.idx = heap_size;
+ }
+
+ // Reset any difference between children on busy period end
+ if (heap_size == 0)
+ {
+ // Reset vtime to zero to avoid floating-point error accumulation,
+ // but do not reset too often, because it's O(N)
+ UInt64 ns = clock_gettime_ns();
+ if (last_reset_ns + 1000000000 < ns)
+ {
+ last_reset_ns = ns;
+ for (Item & item : items)
+ item.vruntime = 0;
+ max_vruntime = 0;
+ }
+ system_vruntime = max_vruntime;
+ }
+
+ return {request, heap_size > 0};
+ }
+
+ bool isActive() override
+ {
+ return heap_size > 0;
+ }
+
+ void activateChild(ISchedulerNode * child) override
+ {
+ // Find this child; this is O(1) thanks to the inactive index we hold in `parent.idx`
+ activateChildImpl(child->info.parent.idx);
+ }
+
+private:
+ void activateChildImpl(size_t inactive_idx)
+ {
+ bool activate_parent = heap_size == 0;
+
+ if (heap_size != inactive_idx)
+ {
+ std::swap(items[heap_size], items[inactive_idx]);
+ items[inactive_idx].child->info.parent.idx = inactive_idx;
+ }
+
+ // Newly activated child should have at least `system_vruntime` to keep fairness
+ items[heap_size].vruntime = std::max(system_vruntime, items[heap_size].vruntime);
+ heap_size++;
+ std::push_heap(items.begin(), items.begin() + heap_size);
+
+ // Recursive activation
+ if (activate_parent && parent)
+ parent->activateChild(this);
+ }
+
+private:
+ /// Beginning of `items` vector is heap of active children: [0; `heap_size`).
+ /// Next go inactive children in unsorted order.
+ /// NOTE: we have to track vruntime of inactive children for max-min fairness.
+ std::vector<Item> items;
+ size_t heap_size = 0;
+
+ /// Last request vruntime
+ double system_vruntime = 0;
+ double max_vruntime = 0;
+ UInt64 last_reset_ns = 0;
+
+ /// All children with ownership
+ std::unordered_map<String, SchedulerNodePtr> children; // basename -> child
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/FifoQueue.cpp b/contrib/clickhouse/src/IO/Resource/FifoQueue.cpp
new file mode 100644
index 0000000000..f4b0e9c332
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/FifoQueue.cpp
@@ -0,0 +1,13 @@
+#include <IO/Resource/FifoQueue.h>
+
+#include <IO/SchedulerNodeFactory.h>
+
+namespace DB
+{
+
+void registerFifoQueue(SchedulerNodeFactory & factory)
+{
+ factory.registerMethod<FifoQueue>("fifo");
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/FifoQueue.h b/contrib/clickhouse/src/IO/Resource/FifoQueue.h
new file mode 100644
index 0000000000..f3ff15ad46
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/FifoQueue.h
@@ -0,0 +1,91 @@
+#pragma once
+
+#include <Common/Stopwatch.h>
+
+#include <IO/ISchedulerQueue.h>
+
+#include <Poco/Util/AbstractConfiguration.h>
+
+#include <deque>
+#include <mutex>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int INVALID_SCHEDULER_NODE;
+}
+
+/*
+ * FIFO queue to hold pending resource requests
+ */
+class FifoQueue : public ISchedulerQueue
+{
+public:
+ FifoQueue(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config, const String & config_prefix)
+ : ISchedulerQueue(event_queue_, config, config_prefix)
+ {}
+
+ bool equals(ISchedulerNode * other) override
+ {
+ if (auto * o = dynamic_cast<FifoQueue *>(other))
+ return true;
+ return false;
+ }
+
+ void enqueueRequest(ResourceRequest * request) override
+ {
+ std::unique_lock lock(mutex);
+ request->enqueue_ns = clock_gettime_ns();
+ bool was_empty = requests.empty();
+ requests.push_back(request);
+ if (was_empty)
+ scheduleActivation();
+ }
+
+ std::pair<ResourceRequest *, bool> dequeueRequest() override
+ {
+ std::unique_lock lock(mutex);
+ if (requests.empty())
+ return {nullptr, false};
+ ResourceRequest * result = requests.front();
+ requests.pop_front();
+ return {result, !requests.empty()};
+ }
+
+ bool isActive() override
+ {
+ std::unique_lock lock(mutex);
+ return !requests.empty();
+ }
+
+ void activateChild(ISchedulerNode *) override
+ {
+ assert(false); // queue cannot have children
+ }
+
+ void attachChild(const SchedulerNodePtr &) override
+ {
+ throw Exception(
+ ErrorCodes::INVALID_SCHEDULER_NODE,
+ "Cannot add child to leaf scheduler queue: {}",
+ getPath());
+ }
+
+ void removeChild(ISchedulerNode *) override
+ {
+ }
+
+ ISchedulerNode * getChild(const String &) override
+ {
+ return nullptr;
+ }
+
+private:
+ std::mutex mutex;
+ std::deque<ResourceRequest *> requests;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/PriorityPolicy.cpp b/contrib/clickhouse/src/IO/Resource/PriorityPolicy.cpp
new file mode 100644
index 0000000000..bee9a6d5dd
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/PriorityPolicy.cpp
@@ -0,0 +1,13 @@
+#include <IO/Resource/PriorityPolicy.h>
+
+#include <IO/SchedulerNodeFactory.h>
+
+namespace DB
+{
+
+void registerPriorityPolicy(SchedulerNodeFactory & factory)
+{
+ factory.registerMethod<PriorityPolicy>("priority");
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/PriorityPolicy.h b/contrib/clickhouse/src/IO/Resource/PriorityPolicy.h
new file mode 100644
index 0000000000..3c091dcc85
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/PriorityPolicy.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include <IO/ISchedulerQueue.h>
+#include <IO/SchedulerRoot.h>
+
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int INVALID_SCHEDULER_NODE;
+}
+
+/*
+ * Scheduler node that implements priority scheduling policy.
+ * Requests are scheduled in order of priority (a lower `Priority` value means a higher priority).
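+ * For example, with children A (priority 0) and B (priority 1), all pending requests of A are
+ * served before any request of B for as long as A stays active (an illustrative consequence
+ * of the max-heap below).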
+ */
+class PriorityPolicy : public ISchedulerNode
+{
+ /// Scheduling state of a child
+ struct Item
+ {
+ ISchedulerNode * child = nullptr;
+ Priority priority; // lower value means higher priority
+
+ /// For max-heap by priority
+ bool operator<(const Item& rhs) const noexcept
+ {
+ return priority > rhs.priority; // Reversed for heap top to yield highest priority (lowest value) child first
+ }
+ };
+
+public:
+ PriorityPolicy(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
+ : ISchedulerNode(event_queue_, config, config_prefix)
+ {}
+
+ bool equals(ISchedulerNode * other) override
+ {
+ if (auto * o = dynamic_cast<PriorityPolicy *>(other))
+ return true;
+ return false;
+ }
+
+ void attachChild(const SchedulerNodePtr & child) override
+ {
+ // Take ownership
+ chassert(child->parent == nullptr);
+ if (auto [it, inserted] = children.emplace(child->basename, child); !inserted)
+ throw Exception(
+ ErrorCodes::INVALID_SCHEDULER_NODE,
+ "Can't add another child with the same path: {}",
+ it->second->getPath());
+
+ // Attach
+ child->setParent(this);
+
+ // Activate child if it is not empty
+ if (child->isActive())
+ activateChild(child.get());
+ }
+
+ void removeChild(ISchedulerNode * child) override
+ {
+ if (auto iter = children.find(child->basename); iter != children.end())
+ {
+ SchedulerNodePtr removed = iter->second;
+
+ // Deactivate: detach is not a very common operation, so we can afford O(N) here
+ for (auto i = items.begin(), e = items.end(); i != e; ++i)
+ {
+ if (i->child == removed.get())
+ {
+ items.erase(i);
+ // Element was removed from inside of heap -- heap must be rebuilt
+ std::make_heap(items.begin(), items.end());
+ break;
+ }
+ }
+
+ // Detach
+ removed->setParent(nullptr);
+
+ // Get rid of ownership
+ children.erase(iter);
+ }
+ }
+
+ ISchedulerNode * getChild(const String & child_name) override
+ {
+ if (auto iter = children.find(child_name); iter != children.end())
+ return iter->second.get();
+ else
+ return nullptr;
+ }
+
+ std::pair<ResourceRequest *, bool> dequeueRequest() override
+ {
+ if (items.empty())
+ return {nullptr, false};
+
+ // Recursively pull request from child
+ auto [request, child_active] = items.front().child->dequeueRequest();
+ assert(request != nullptr);
+
+ // Deactivate child if it is empty
+ if (!child_active)
+ {
+ std::pop_heap(items.begin(), items.end());
+ items.pop_back();
+ }
+
+ return {request, !items.empty()};
+ }
+
+ bool isActive() override
+ {
+ return !items.empty();
+ }
+
+ void activateChild(ISchedulerNode * child) override
+ {
+ bool activate_parent = items.empty();
+ items.emplace_back(Item{child, child->info.priority});
+ std::push_heap(items.begin(), items.end());
+ if (activate_parent && parent)
+ parent->activateChild(this);
+ }
+
+private:
+ /// Heap of active children
+ std::vector<Item> items;
+
+ /// All children with ownership
+ std::unordered_map<String, SchedulerNodePtr> children; // basename -> child
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/SemaphoreConstraint.cpp b/contrib/clickhouse/src/IO/Resource/SemaphoreConstraint.cpp
new file mode 100644
index 0000000000..2135fd65a8
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/SemaphoreConstraint.cpp
@@ -0,0 +1,13 @@
+#include <IO/Resource/SemaphoreConstraint.h>
+
+#include <IO/SchedulerNodeFactory.h>
+
+namespace DB
+{
+
+void registerSemaphoreConstraint(SchedulerNodeFactory & factory)
+{
+ factory.registerMethod<SemaphoreConstraint>("inflight_limit");
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/SemaphoreConstraint.h b/contrib/clickhouse/src/IO/Resource/SemaphoreConstraint.h
new file mode 100644
index 0000000000..237e63eadd
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/SemaphoreConstraint.h
@@ -0,0 +1,138 @@
+#pragma once
+
+#include <IO/ISchedulerConstraint.h>
+#include <IO/SchedulerRoot.h>
+
+#include <mutex>
+#include <limits>
+#include <utility>
+
+namespace DB
+{
+
+/*
+ * Limited concurrency constraint.
+ * Blocks if either the number of concurrent in-flight requests exceeds `max_requests`,
+ * or their total cost exceeds `max_cost`.
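+ *
+ * Example node config (illustrative):
+ *   <node path="/"><type>inflight_limit</type><max_requests>100</max_requests><max_cost>1000000</max_cost></node>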
+ */
+class SemaphoreConstraint : public ISchedulerConstraint
+{
+ static constexpr Int64 default_max_requests = std::numeric_limits<Int64>::max();
+ static constexpr Int64 default_max_cost = std::numeric_limits<Int64>::max();
+public:
+ SemaphoreConstraint(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
+ : ISchedulerConstraint(event_queue_, config, config_prefix)
+ , max_requests(config.getInt64(config_prefix + ".max_requests", default_max_requests))
+ , max_cost(config.getInt64(config_prefix + ".max_cost", config.getInt64(config_prefix + ".max_bytes", default_max_cost)))
+ {}
+
+ bool equals(ISchedulerNode * other) override
+ {
+ if (auto * o = dynamic_cast<SemaphoreConstraint *>(other))
+ return max_requests == o->max_requests && max_cost == o->max_cost;
+ return false;
+ }
+
+ void attachChild(const std::shared_ptr<ISchedulerNode> & child_) override
+ {
+ // Take ownership
+ child = child_;
+ child->setParent(this);
+
+ // Activate if required
+ if (child->isActive())
+ activateChild(child.get());
+ }
+
+ void removeChild(ISchedulerNode * child_) override
+ {
+ if (child.get() == child_)
+ {
+ child_active = false; // deactivate
+ child->setParent(nullptr); // detach
+ child.reset();
+ }
+ }
+
+ ISchedulerNode * getChild(const String & child_name) override
+ {
+ if (child->basename == child_name)
+ return child.get();
+ else
+ return nullptr;
+ }
+
+ std::pair<ResourceRequest *, bool> dequeueRequest() override
+ {
+ // Dequeue request from the child
+ auto [request, child_now_active] = child->dequeueRequest();
+ if (!request)
+ return {nullptr, false};
+
+ // Request has reference to the first (closest to leaf) `constraint`, which can have `parent_constraint`.
+ // The former is initialized here dynamically and the latter is initialized once during hierarchy construction.
+ if (!request->constraint)
+ request->constraint = this;
+
+ // Update state on request arrival
+ std::unique_lock lock(mutex);
+ requests++;
+ cost += request->cost;
+ child_active = child_now_active;
+
+ return {request, active()};
+ }
+
+ void finishRequest(ResourceRequest * request) override
+ {
+ // Recursive traverse of parent flow controls in reverse order
+ if (parent_constraint)
+ parent_constraint->finishRequest(request);
+
+ // Update state on request departure
+ std::unique_lock lock(mutex);
+ bool was_active = active();
+ requests--;
+ cost -= request->cost;
+
+ // Schedule activation on transition from inactive state
+ if (!was_active && active())
+ scheduleActivation();
+ }
+
+ void activateChild(ISchedulerNode * child_) override
+ {
+ std::unique_lock lock(mutex);
+ if (child_ == child.get())
+ if (!std::exchange(child_active, true) && satisfied() && parent)
+ parent->activateChild(this);
+ }
+
+ bool isActive() override
+ {
+ std::unique_lock lock(mutex);
+ return active();
+ }
+
+private:
+ bool satisfied() const
+ {
+ return requests < max_requests && cost < max_cost;
+ }
+
+ bool active() const
+ {
+ return satisfied() && child_active;
+ }
+
+private:
+ std::mutex mutex;
+ Int64 requests = 0;
+ Int64 cost = 0;
+ bool child_active = false;
+
+ SchedulerNodePtr child;
+ Int64 max_requests = default_max_requests;
+ Int64 max_cost = default_max_cost;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/StaticResourceManager.cpp b/contrib/clickhouse/src/IO/Resource/StaticResourceManager.cpp
new file mode 100644
index 0000000000..a79e8148f9
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/StaticResourceManager.cpp
@@ -0,0 +1,138 @@
+#include <IO/Resource/StaticResourceManager.h>
+
+#include <IO/SchedulerNodeFactory.h>
+#include <IO/ResourceManagerFactory.h>
+#include <IO/ISchedulerQueue.h>
+
+#include <Common/Exception.h>
+#include <Common/StringUtils/StringUtils.h>
+
+#include <map>
+#include <tuple>
+#include <algorithm>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int RESOURCE_ACCESS_DENIED;
+ extern const int RESOURCE_NOT_FOUND;
+ extern const int INVALID_SCHEDULER_NODE;
+}
+
+StaticResourceManager::Resource::Resource(
+ const String & name,
+ EventQueue * event_queue,
+ const Poco::Util::AbstractConfiguration & config,
+ const std::string & config_prefix)
+{
+ // Initialize scheduler nodes
+ Poco::Util::AbstractConfiguration::Keys keys;
+ config.keys(config_prefix, keys);
+ std::sort(keys.begin(), keys.end()); // sort keys so that parents appear before children
+ for (const auto & key : keys)
+ {
+ if (!startsWith(key, "node"))
+ continue;
+
+ // Validate path
+ String path = config.getString(config_prefix + "." + key + "[@path]", "");
+ if (path.empty())
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Attribute 'path' must be specified in all nodes for resource '{}'", name);
+ if (path[0] != '/')
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "path must start with '/' for resource '{}'", name);
+
+ // Create node
+ String type = config.getString(config_prefix + "." + key + ".type", "fifo");
+ SchedulerNodePtr node = SchedulerNodeFactory::instance().get(type, event_queue, config, config_prefix + "." + key);
+ node->basename = path.substr(1);
+
+ // Take ownership
+ if (auto [_, inserted] = nodes.emplace(path, node); !inserted)
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Duplicate path '{}' for resource '{}'", path, name);
+
+ // Attach created node to parent (if not root)
+ if (path != "/")
+ {
+ String parent_path = path.substr(0, path.rfind('/'));
+ if (parent_path.empty())
+ parent_path = "/";
+ if (auto parent = nodes.find(parent_path); parent != nodes.end())
+ parent->second->attachChild(node);
+ else
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Parent doesn't exist for path '{}' for resource '{}'", path, name);
+ }
+ }
+
+ if (nodes.find("/") == nodes.end())
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "undefined root node path '/' for resource '{}'", name);
+}
+
+StaticResourceManager::Classifier::Classifier(const StaticResourceManager & manager, const ClassifierDescription & cfg)
+{
+ for (auto [resource_name, path] : cfg)
+ {
+ if (auto resource_iter = manager.resources.find(resource_name); resource_iter != manager.resources.end())
+ {
+ const Resource & resource = resource_iter->second;
+ if (auto node_iter = resource.nodes.find(path); node_iter != resource.nodes.end())
+ {
+ if (auto * queue = dynamic_cast<ISchedulerQueue *>(node_iter->second.get()))
+ resources.emplace(resource_name, ResourceLink{.queue = queue});
+ else
+ throw Exception(ErrorCodes::RESOURCE_NOT_FOUND, "Unable to access non-queue node at path '{}' for resource '{}'", path, resource_name);
+ }
+ else
+ throw Exception(ErrorCodes::RESOURCE_NOT_FOUND, "Path '{}' for resource '{}' does not exist", path, resource_name);
+ }
+ else
+ resources.emplace(resource_name, ResourceLink{}); // resource not configured - unlimited
+ }
+}
+
+ResourceLink StaticResourceManager::Classifier::get(const String & resource_name)
+{
+ if (auto iter = resources.find(resource_name); iter != resources.end())
+ return iter->second;
+ else
+ throw Exception(ErrorCodes::RESOURCE_ACCESS_DENIED, "Access denied to resource '{}'", resource_name);
+}
+
+void StaticResourceManager::updateConfiguration(const Poco::Util::AbstractConfiguration & config)
+{
+ if (!resources.empty())
+ return; // already initialized, configuration update is not supported
+
+ Poco::Util::AbstractConfiguration::Keys keys;
+ const String config_prefix = "resources";
+ config.keys(config_prefix, keys);
+
+ // Create resource for every element under <resources> tag
+ for (const auto & key : keys)
+ {
+ auto [iter, _] = resources.emplace(std::piecewise_construct,
+ std::forward_as_tuple(key),
+ std::forward_as_tuple(key, scheduler.event_queue, config, config_prefix + "." + key));
+ // Attach root of resource to scheduler
+ scheduler.attachChild(iter->second.nodes.find("/")->second);
+ }
+
+ // Initialize classifiers
+ classifiers = std::make_unique<ClassifiersConfig>(config);
+
+ // Run scheduler thread
+ scheduler.start();
+}
+
+ClassifierPtr StaticResourceManager::acquire(const String & classifier_name)
+{
+ return std::make_shared<Classifier>(*this, classifiers->get(classifier_name));
+}
+
+void registerStaticResourceManager(ResourceManagerFactory & factory)
+{
+ factory.registerMethod<StaticResourceManager>("static");
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/StaticResourceManager.h b/contrib/clickhouse/src/IO/Resource/StaticResourceManager.h
new file mode 100644
index 0000000000..066dbf4ebf
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/StaticResourceManager.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <IO/IResourceManager.h>
+#include <IO/SchedulerRoot.h>
+#include <IO/Resource/ClassifiersConfig.h>
+
+#include <mutex>
+
+namespace DB
+{
+
+/*
+ * Reads `<resources>` from the config at startup and registers them in a single `SchedulerRoot`.
+ * Does not support configuration updates; a server restart is required.
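+ *
+ * Expects the same `<resources>` and `<classifiers>` config format as `DynamicResourceManager`
+ * (see the example in DynamicResourceManager.h and `ClassifiersConfig`); the difference is that
+ * only the first loaded configuration takes effect.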
+ */
+class StaticResourceManager : public IResourceManager
+{
+public:
+ // Just initialization, any further updates are ignored for the sake of simplicity
+ // NOTE: manager must be initialized before any acquire() calls to avoid races
+ void updateConfiguration(const Poco::Util::AbstractConfiguration & config) override;
+
+ ClassifierPtr acquire(const String & classifier_name) override;
+
+private:
+ struct Resource
+ {
+ std::unordered_map<String, SchedulerNodePtr> nodes; // by paths
+
+ Resource(
+ const String & name,
+ EventQueue * event_queue,
+ const Poco::Util::AbstractConfiguration & config,
+ const std::string & config_prefix);
+ };
+
+ struct Classifier : public IClassifier
+ {
+ Classifier(const StaticResourceManager & manager, const ClassifierDescription & cfg);
+ ResourceLink get(const String & resource_name) override;
+ std::unordered_map<String, ResourceLink> resources; // accessible resources by names
+ };
+
+ SchedulerRoot scheduler;
+ std::unordered_map<String, Resource> resources; // by name
+ std::unique_ptr<ClassifiersConfig> classifiers;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/registerResourceManagers.cpp b/contrib/clickhouse/src/IO/Resource/registerResourceManagers.cpp
new file mode 100644
index 0000000000..5217bcdfbe
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/registerResourceManagers.cpp
@@ -0,0 +1,17 @@
+#include <IO/Resource/registerResourceManagers.h>
+#include <IO/ResourceManagerFactory.h>
+
+namespace DB
+{
+
+void registerDynamicResourceManager(ResourceManagerFactory &);
+void registerStaticResourceManager(ResourceManagerFactory &);
+
+void registerResourceManagers()
+{
+ auto & factory = ResourceManagerFactory::instance();
+ registerDynamicResourceManager(factory);
+ registerStaticResourceManager(factory);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/registerResourceManagers.h b/contrib/clickhouse/src/IO/Resource/registerResourceManagers.h
new file mode 100644
index 0000000000..243b25a958
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/registerResourceManagers.h
@@ -0,0 +1,8 @@
+#pragma once
+
+namespace DB
+{
+
+void registerResourceManagers();
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/registerSchedulerNodes.cpp b/contrib/clickhouse/src/IO/Resource/registerSchedulerNodes.cpp
new file mode 100644
index 0000000000..896f96d7f5
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/registerSchedulerNodes.cpp
@@ -0,0 +1,30 @@
+#include <IO/Resource/registerSchedulerNodes.h>
+
+#include <IO/ISchedulerNode.h>
+#include <IO/ISchedulerConstraint.h>
+#include <IO/SchedulerNodeFactory.h>
+
+namespace DB
+{
+
+void registerPriorityPolicy(SchedulerNodeFactory &);
+void registerFairPolicy(SchedulerNodeFactory &);
+void registerSemaphoreConstraint(SchedulerNodeFactory &);
+void registerFifoQueue(SchedulerNodeFactory &);
+
+void registerSchedulerNodes()
+{
+ auto & factory = SchedulerNodeFactory::instance();
+
+ // ISchedulerNode
+ registerPriorityPolicy(factory);
+ registerFairPolicy(factory);
+
+ // ISchedulerConstraint
+ registerSemaphoreConstraint(factory);
+
+ // ISchedulerQueue
+ registerFifoQueue(factory);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/Resource/registerSchedulerNodes.h b/contrib/clickhouse/src/IO/Resource/registerSchedulerNodes.h
new file mode 100644
index 0000000000..1e2092aaf4
--- /dev/null
+++ b/contrib/clickhouse/src/IO/Resource/registerSchedulerNodes.h
@@ -0,0 +1,8 @@
+#pragma once
+
+namespace DB
+{
+
+void registerSchedulerNodes();
+
+}
diff --git a/contrib/clickhouse/src/IO/ResourceBudget.h b/contrib/clickhouse/src/IO/ResourceBudget.h
new file mode 100644
index 0000000000..7f67f9cfc1
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ResourceBudget.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <IO/ResourceRequest.h>
+#include <atomic>
+
+namespace DB
+{
+
+/*
+ * Helper class to keep track of requested and consumed amounts of a resource.
+ * Useful if the real amount of consumed resource can differ from the requested amount (e.g. in case of failures).
+ * Can be safely used from multiple threads.
+ * Usage example:
+ * ResourceBudget budget;
+ * while (!stop) {
+ * ResourceCost est_cost = myEstimateOfCostOrJustUseOne();
+ * myAllocateResource(budget.ask(est_cost)); // Ask external system to allocate resource for you
+ * ResourceCost real_cost = mySynchronousConsumptionOfResource(); // Real consumption can differ from est_cost
+ * budget.adjust(est_cost, real_cost); // Adjust balance according to the actual cost, may affect the next iteration
+ * }
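+ *
+ * Worked example (illustrative): with a zero initial balance, ask(10) returns 10;
+ * if only 7 units were really consumed, adjust(10, 7) credits 3 back to the balance,
+ * so the next ask(10) returns only 7.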
+ */
+class ResourceBudget
+{
+public:
+ // Returns amount of resource to be requested according to current balance and estimated cost of new consumption
+ ResourceCost ask(ResourceCost estimated_cost)
+ {
+ ResourceCost budget = available.load();
+ while (true)
+ {
+ // Valid resource request must have positive `cost`. Also takes consumption history into account.
+ ResourceCost cost = std::max<ResourceCost>(1ll, estimated_cost - budget);
+
+ // Assume every request is satisfied (no resource request cancellation is possible now)
+ // So we requested additional `cost` units and are going to consume `estimated_cost`
+ ResourceCost new_budget = budget + cost - estimated_cost;
+
+ // Try to commit this transaction
+ if (new_budget == budget || available.compare_exchange_strong(budget, new_budget))
+ return cost;
+ }
+ }
+
+ // Should be called to account for difference between real and estimated costs
+ // Optional. May be skipped if `real_cost` is known in advance (equals `estimated_cost`).
+ void adjust(ResourceCost estimated_cost, ResourceCost real_cost)
+ {
+ available.fetch_add(estimated_cost - real_cost);
+ }
+
+private:
+ std::atomic<ResourceCost> available = 0; // requested - consumed
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ResourceGuard.h b/contrib/clickhouse/src/IO/ResourceGuard.h
new file mode 100644
index 0000000000..92f25b40f6
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ResourceGuard.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include <base/types.h>
+
+#include <IO/ResourceRequest.h>
+#include <IO/ResourceLink.h>
+#include <IO/ISchedulerConstraint.h>
+
+#include <condition_variable>
+#include <mutex>
+
+
+namespace DB
+{
+
+/*
+ * Scoped resource guard.
+ * Waits for the resource to become available in the constructor and releases it in the destructor.
+ * IMPORTANT: multiple resources should not be locked concurrently by a single thread
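+ *
+ * Usage sketch (illustrative; `classifier` and `doRead()` are hypothetical):
+ *   ResourceLink link = classifier->get("network_read");
+ *   ResourceGuard rlock(link, bytes_to_read); // blocks until the scheduler grants the request
+ *   doRead();       // the actual resource consumption happens here
+ *   rlock.unlock(); // report completion (also done by the destructor)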
+ */
+class ResourceGuard
+{
+public:
+ enum ResourceGuardCtor
+ {
+ LockStraightAway, /// Locks inside constructor (default)
+
+ // WARNING: Only for tests. It is not exception-safe because `lock()` must be called after construction.
+ PostponeLocking /// Don't lock in constructor, but send request
+ };
+
+ enum RequestState
+ {
+ Finished, // Last request has already finished; no concurrent access is possible
+ Enqueued, // Enqueued into the scheduler; thread-safe access is required
+ Dequeued // Dequeued from the scheduler and is in consumption state; no concurrent access is possible
+ };
+
+ class Request : public ResourceRequest
+ {
+ public:
+ void enqueue(ResourceCost cost_, ResourceLink link_)
+ {
+ // lock(mutex) is not required because `Finished` request cannot be used by the scheduler thread
+ chassert(state == Finished);
+ state = Enqueued;
+ ResourceRequest::reset(cost_);
+ link_.queue->enqueueRequestUsingBudget(this);
+ }
+
+ // This function is executed inside scheduler thread and wakes thread issued this `request`.
+ // That thread will continue execution and do real consumption of requested resource synchronously.
+ void execute() override
+ {
+ {
+ std::unique_lock lock(mutex);
+ chassert(state == Enqueued);
+ state = Dequeued;
+ }
+ dequeued_cv.notify_one();
+ }
+
+ void wait()
+ {
+ std::unique_lock lock(mutex);
+ dequeued_cv.wait(lock, [this] { return state == Dequeued; });
+ }
+
+ void finish()
+ {
+ // lock(mutex) is not required because `Dequeued` request cannot be used by the scheduler thread
+ chassert(state == Dequeued);
+ state = Finished;
+ if (constraint)
+ constraint->finishRequest(this);
+ }
+
+ static Request & local()
+ {
+ // Since a single thread cannot use more than one resource request simultaneously,
+ // we can reuse a thread-local request to avoid allocations
+ static thread_local Request instance;
+ return instance;
+ }
+
+ private:
+ std::mutex mutex;
+ std::condition_variable dequeued_cv;
+ RequestState state = Finished;
+ };
+
+ /// Creates pending request for resource; blocks while resource is not available (unless `PostponeLocking`)
+ explicit ResourceGuard(ResourceLink link_, ResourceCost cost = 1, ResourceGuardCtor ctor = LockStraightAway)
+ : link(link_)
+ , request(Request::local())
+ {
+ if (cost == 0)
+ link.queue = nullptr; // Ignore zero-cost requests
+ else if (link.queue)
+ {
+ request.enqueue(cost, link);
+ if (ctor == LockStraightAway)
+ request.wait();
+ }
+ }
+
+ ~ResourceGuard()
+ {
+ unlock();
+ }
+
+ /// Blocks until resource is available
+ void lock()
+ {
+ if (link.queue)
+ request.wait();
+ }
+
+ /// Report resource consumption has finished
+ void unlock()
+ {
+ if (link.queue)
+ {
+ request.finish();
+ link.queue = nullptr;
+ }
+ }
+
+ /// Mark request as unsuccessful; by default request is considered to be successful
+ void setFailure()
+ {
+ request.successful = false;
+ }
+
+ ResourceLink link;
+ Request & request;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ResourceLink.h b/contrib/clickhouse/src/IO/ResourceLink.h
new file mode 100644
index 0000000000..2da5e75fcb
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ResourceLink.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <base/types.h>
+
+#include <IO/ResourceRequest.h>
+#include <IO/ISchedulerQueue.h>
+
+
+namespace DB
+{
+
+/*
+ * Everything required for resource consumption. Connection to a specific resource queue.
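+ *
+ * The helpers below forward to the queue's budget (presumably a `ResourceBudget`, see ResourceBudget.h):
+ * e.g. after consuming `real` bytes for a request estimated at `est`, call link.adjust(est, real).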
+ */
+struct ResourceLink
+{
+ ISchedulerQueue * queue = nullptr;
+ bool operator==(const ResourceLink &) const = default;
+
+ void adjust(ResourceCost estimated_cost, ResourceCost real_cost) const
+ {
+ if (queue)
+ queue->adjustBudget(estimated_cost, real_cost);
+ }
+
+ void consumed(ResourceCost cost) const
+ {
+ if (queue)
+ queue->consumeBudget(cost);
+ }
+
+ void accumulate(ResourceCost cost) const
+ {
+ if (queue)
+ queue->accumulateBudget(cost);
+ }
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ResourceManagerFactory.h b/contrib/clickhouse/src/IO/ResourceManagerFactory.h
new file mode 100644
index 0000000000..8e972f0564
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ResourceManagerFactory.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <Common/ErrorCodes.h>
+#include <Common/Exception.h>
+
+#include <IO/IResourceManager.h>
+
+#include <boost/noncopyable.hpp>
+
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int INVALID_SCHEDULER_NODE;
+}
+
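+/*
+ * Registry of `IResourceManager` implementations by name.
+ * Usage sketch (illustrative): managers register themselves at startup, e.g.
+ *   factory.registerMethod<DynamicResourceManager>("dynamic");
+ * and are later instantiated by name:
+ *   ResourceManagerPtr manager = ResourceManagerFactory::instance().get("dynamic");
+ */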
+class ResourceManagerFactory : private boost::noncopyable
+{
+public:
+ static ResourceManagerFactory & instance()
+ {
+ static ResourceManagerFactory ret;
+ return ret;
+ }
+
+ ResourceManagerPtr get(const String & name)
+ {
+ std::lock_guard lock{mutex};
+ if (auto iter = methods.find(name); iter != methods.end())
+ return iter->second();
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Unknown scheduler node type: {}", name);
+ }
+
+ template <class TDerived>
+ void registerMethod(const String & name)
+ {
+ std::lock_guard lock{mutex};
+ methods[name] = [] ()
+ {
+ return std::make_shared<TDerived>();
+ };
+ }
+
+private:
+ std::mutex mutex;
+ using Method = std::function<ResourceManagerPtr()>;
+ std::unordered_map<String, Method> methods;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ResourceRequest.h b/contrib/clickhouse/src/IO/ResourceRequest.h
new file mode 100644
index 0000000000..989349148c
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ResourceRequest.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include <base/types.h>
+#include <limits>
+
+namespace DB
+{
+
+// Forward declarations
+class ISchedulerQueue;
+class ISchedulerConstraint;
+
+/// Cost in terms of used resource (e.g. bytes for network IO)
+using ResourceCost = Int64;
+constexpr ResourceCost ResourceCostMax = std::numeric_limits<int>::max();
+
+/// Timestamps (nanoseconds since epoch)
+using ResourceNs = UInt64;
+
+/*
+ * Request for a resource consumption. The main moving part of the scheduling subsystem.
+ * Resource requests processing workflow:
+ *
+ * ----1=2222222222222=3=4=555555555555555=6-----> time
+ * ^ ^ ^ ^ ^ ^
+ * | | | | | |
+ * enqueue wait dequeue execute consume finish
+ *
+ * 1) Request is enqueued using ISchedulerQueue::enqueueRequest().
+ * 2) Request competes with others for access to a resource; effectively just waiting in a queue.
+ * 3) Scheduler calls ISchedulerNode::dequeueRequest() that returns the request.
+ * 4) Callback ResourceRequest::execute() is called to provide access to the resource.
+ * 5) The resource consumption is happening outside of the scheduling subsystem.
+ * 6) request->constraint->finishRequest() is called when consumption is finished.
+ *
+ * Steps (5) and (6) can be omitted if constraint is not used by the resource.
+ *
+ * Request can be created on stack or heap.
+ * Request ownership is done outside of the scheduling subsystem.
+ * After (6) request can be destructed safely.
+ *
+ * Request cancelling is not supported yet.
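+ *
+ * For a concrete subclass see `ResourceGuard::Request` (ResourceGuard.h), which implements
+ * execute() by waking up the thread that issued the request.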
+ */
+class ResourceRequest
+{
+public:
+ /// Cost of request execution; should be filled before request enqueueing.
+ /// NOTE: If the cost is not known in advance, a credit model can be used:
+ /// NOTE: use 1 for the first request, and afterwards adjust the balance by the
+ /// NOTE: difference between the estimated and the real cost (see `ResourceBudget`).
+ ResourceCost cost;
+
+ /// Request outcome
+ /// Should be filled during resource consumption
+ bool successful;
+
+ /// Scheduler node to be notified on consumption finish
+ /// Auto-filled during request enqueue/dequeue
+ ISchedulerConstraint * constraint;
+
+ /// Timestamps for introspection
+ ResourceNs enqueue_ns;
+ ResourceNs execute_ns;
+ ResourceNs finish_ns;
+
+ explicit ResourceRequest(ResourceCost cost_ = 1)
+ {
+ reset(cost_);
+ }
+
+ void reset(ResourceCost cost_)
+ {
+ cost = cost_;
+ successful = true;
+ constraint = nullptr;
+ enqueue_ns = 0;
+ execute_ns = 0;
+ finish_ns = 0;
+ }
+
+ virtual ~ResourceRequest() = default;
+
+ /// Callback to trigger resource consumption.
+ /// IMPORTANT: it is called from scheduler thread and must be fast,
+ /// just triggering start of a consumption, not doing the consumption itself
+ /// (e.g. setting an std::promise or creating a job in a thread pool)
+ virtual void execute() = 0;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/S3/AWSLogger.cpp b/contrib/clickhouse/src/IO/S3/AWSLogger.cpp
new file mode 100644
index 0000000000..d6162823ae
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/AWSLogger.cpp
@@ -0,0 +1,78 @@
+#include <IO/S3/AWSLogger.h>
+
+#if USE_AWS_S3
+
+#include <Core/SettingsEnums.h>
+#include <Common/logger_useful.h>
+#include <aws/core/utils/logging/LogLevel.h>
+#include <Poco/Logger.h>
+
+namespace
+{
+
+const char * S3_LOGGER_TAG_NAMES[][2] = {
+ {"AWSClient", "AWSClient"},
+ {"AWSAuthV4Signer", "AWSClient (AWSAuthV4Signer)"},
+};
+
+const std::pair<DB::LogsLevel, Poco::Message::Priority> & convertLogLevel(Aws::Utils::Logging::LogLevel log_level)
+{
+ /// We map levels to our own logger one-to-one, except for WARN+ levels. In most cases we recover
+ /// from such errors with retries and don't want to see them as Errors in our logs.
+ static const std::unordered_map<Aws::Utils::Logging::LogLevel, std::pair<DB::LogsLevel, Poco::Message::Priority>> mapping =
+ {
+ {Aws::Utils::Logging::LogLevel::Off, {DB::LogsLevel::none, Poco::Message::PRIO_INFORMATION}},
+ {Aws::Utils::Logging::LogLevel::Fatal, {DB::LogsLevel::information, Poco::Message::PRIO_INFORMATION}},
+ {Aws::Utils::Logging::LogLevel::Error, {DB::LogsLevel::information, Poco::Message::PRIO_INFORMATION}},
+ {Aws::Utils::Logging::LogLevel::Warn, {DB::LogsLevel::information, Poco::Message::PRIO_INFORMATION}},
+ {Aws::Utils::Logging::LogLevel::Info, {DB::LogsLevel::information, Poco::Message::PRIO_INFORMATION}},
+ {Aws::Utils::Logging::LogLevel::Debug, {DB::LogsLevel::debug, Poco::Message::PRIO_TEST}},
+ {Aws::Utils::Logging::LogLevel::Trace, {DB::LogsLevel::trace, Poco::Message::PRIO_TEST}},
+ };
+ return mapping.at(log_level);
+}
+
+}
+
+namespace DB::S3
+{
+
+AWSLogger::AWSLogger(bool enable_s3_requests_logging_)
+ : enable_s3_requests_logging(enable_s3_requests_logging_)
+{
+ for (auto [tag, name] : S3_LOGGER_TAG_NAMES)
+ tag_loggers[tag] = &Poco::Logger::get(name);
+
+ default_logger = tag_loggers[S3_LOGGER_TAG_NAMES[0][0]];
+}
+
+Aws::Utils::Logging::LogLevel AWSLogger::GetLogLevel() const
+{
+ if (enable_s3_requests_logging)
+ return Aws::Utils::Logging::LogLevel::Trace;
+ else
+ return Aws::Utils::Logging::LogLevel::Info;
+}
+
+void AWSLogger::Log(Aws::Utils::Logging::LogLevel log_level, const char * tag, const char * format_str, ...) // NOLINT
+{
+ callLogImpl(log_level, tag, format_str); /// FIXME. Variadic arguments?
+}
+
+void AWSLogger::LogStream(Aws::Utils::Logging::LogLevel log_level, const char * tag, const Aws::OStringStream & message_stream)
+{
+ callLogImpl(log_level, tag, message_stream.str().c_str());
+}
+
+void AWSLogger::callLogImpl(Aws::Utils::Logging::LogLevel log_level, const char * tag, const char * message)
+{
+ const auto & [level, prio] = convertLogLevel(log_level);
+ if (tag_loggers.contains(tag))
+ LOG_IMPL(tag_loggers[tag], level, prio, fmt::runtime(message));
+ else
+ LOG_IMPL(default_logger, level, prio, "{}: {}", tag, message);
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/AWSLogger.h b/contrib/clickhouse/src/IO/S3/AWSLogger.h
new file mode 100644
index 0000000000..7c31ea469f
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/AWSLogger.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+#include <aws/core/utils/logging/LogSystemInterface.h>
+#include <base/types.h>
+#include <unordered_map>
+
+namespace Poco { class Logger; }
+
+namespace DB::S3
+{
+class AWSLogger final : public Aws::Utils::Logging::LogSystemInterface
+{
+public:
+ explicit AWSLogger(bool enable_s3_requests_logging_);
+
+ ~AWSLogger() final = default;
+
+ Aws::Utils::Logging::LogLevel GetLogLevel() const final;
+
+ void Log(Aws::Utils::Logging::LogLevel log_level, const char * tag, const char * format_str, ...) final; // NOLINT
+
+ void LogStream(Aws::Utils::Logging::LogLevel log_level, const char * tag, const Aws::OStringStream & message_stream) final;
+
+ void callLogImpl(Aws::Utils::Logging::LogLevel log_level, const char * tag, const char * message);
+
+ void Flush() final {}
+
+private:
+ Poco::Logger * default_logger;
+ bool enable_s3_requests_logging;
+ std::unordered_map<String, Poco::Logger *> tag_loggers;
+};
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/Client.cpp b/contrib/clickhouse/src/IO/S3/Client.cpp
new file mode 100644
index 0000000000..104fc2dd5b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/Client.cpp
@@ -0,0 +1,903 @@
+#include <IO/S3/Client.h>
+
+#if USE_AWS_S3
+
+#include <aws/core/client/CoreErrors.h>
+#include <aws/core/client/DefaultRetryStrategy.h>
+#include <aws/s3/model/HeadBucketRequest.h>
+#include <aws/s3/model/GetObjectRequest.h>
+#include <aws/s3/model/HeadObjectRequest.h>
+#include <aws/s3/model/ListObjectsV2Request.h>
+#include <aws/core/client/AWSErrorMarshaller.h>
+// #include <aws/core/endpoint/EndpointParameter.h>
+#include <aws/core/utils/HashingUtils.h>
+// #include <aws/core/utils/logging/ErrorMacros.h>
+
+#include <Poco/Net/NetException.h>
+
+#include <IO/S3Common.h>
+#include <IO/S3/Requests.h>
+#include <IO/S3/PocoHTTPClientFactory.h>
+#include <IO/S3/AWSLogger.h>
+#include <IO/S3/Credentials.h>
+
+#include <Common/assert_cast.h>
+
+#include <Common/logger_useful.h>
+#include <Common/ProxyConfigurationResolverProvider.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event S3WriteRequestsErrors;
+ extern const Event S3ReadRequestsErrors;
+
+ extern const Event DiskS3WriteRequestsErrors;
+ extern const Event DiskS3ReadRequestsErrors;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int LOGICAL_ERROR;
+ extern const int TOO_MANY_REDIRECTS;
+}
+
+namespace S3
+{
+
+Client::RetryStrategy::RetryStrategy(std::shared_ptr<Aws::Client::RetryStrategy> wrapped_strategy_)
+ : wrapped_strategy(std::move(wrapped_strategy_))
+{
+ // if (!wrapped_strategy)
+ // wrapped_strategy = Aws::Client::InitRetryStrategy();
+}
+
+/// NOLINTNEXTLINE(google-runtime-int)
+bool Client::RetryStrategy::ShouldRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>& error, long attemptedRetries) const
+{
+ if (error.GetResponseCode() == Aws::Http::HttpResponseCode::MOVED_PERMANENTLY)
+ return false;
+
+ return wrapped_strategy->ShouldRetry(error, attemptedRetries);
+}
+
+/// NOLINTNEXTLINE(google-runtime-int)
+long Client::RetryStrategy::CalculateDelayBeforeNextRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>& error, long attemptedRetries) const
+{
+ return wrapped_strategy->CalculateDelayBeforeNextRetry(error, attemptedRetries);
+}
+
+/// NOLINTNEXTLINE(google-runtime-int)
+long Client::RetryStrategy::GetMaxAttempts() const
+{
+ return wrapped_strategy->GetMaxAttempts();
+}
+
+void Client::RetryStrategy::GetSendToken()
+{
+ return wrapped_strategy->GetSendToken();
+}
+
+// bool Client::RetryStrategy::HasSendToken()
+// {
+// return wrapped_strategy->HasSendToken();
+// }
+
+void Client::RetryStrategy::RequestBookkeeping(const Aws::Client::HttpResponseOutcome& httpResponseOutcome)
+{
+ return wrapped_strategy->RequestBookkeeping(httpResponseOutcome);
+}
+
+void Client::RetryStrategy::RequestBookkeeping(const Aws::Client::HttpResponseOutcome& httpResponseOutcome, const Aws::Client::AWSError<Aws::Client::CoreErrors>& lastError)
+{
+ return wrapped_strategy->RequestBookkeeping(httpResponseOutcome, lastError);
+}
+
+namespace
+{
+
+void verifyClientConfiguration(const Aws::Client::ClientConfiguration & client_config)
+{
+ if (!client_config.retryStrategy)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "The S3 client can only be used with Client::RetryStrategy, define it in the client configuration");
+
+ assert_cast<const Client::RetryStrategy &>(*client_config.retryStrategy);
+}
+
+}
+
+std::unique_ptr<Client> Client::create(
+ size_t max_redirects_,
+ ServerSideEncryptionKMSConfig sse_kms_config_,
+ const std::shared_ptr<Aws::Auth::AWSCredentialsProvider> & credentials_provider,
+ const PocoHTTPClientConfiguration & client_configuration,
+ Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy sign_payloads,
+ bool use_virtual_addressing)
+{
+ verifyClientConfiguration(client_configuration);
+ return std::unique_ptr<Client>(
+ new Client(max_redirects_, std::move(sse_kms_config_), credentials_provider, client_configuration, sign_payloads, use_virtual_addressing));
+}
+
+std::unique_ptr<Client> Client::clone(
+ std::optional<std::shared_ptr<RetryStrategy>> override_retry_strategy,
+ std::optional<Int64> override_request_timeout_ms) const
+{
+ PocoHTTPClientConfiguration new_configuration = client_configuration;
+ if (override_retry_strategy.has_value())
+ new_configuration.retryStrategy = *override_retry_strategy;
+ if (override_request_timeout_ms.has_value())
+ new_configuration.requestTimeoutMs = *override_request_timeout_ms;
+ return std::unique_ptr<Client>(new Client(*this, new_configuration));
+}
+
+namespace
+{
+
+ProviderType deduceProviderType(const std::string & url)
+{
+ if (url.find(".amazonaws.com") != std::string::npos)
+ return ProviderType::AWS;
+
+ if (url.find("storage.googleapis.com") != std::string::npos)
+ return ProviderType::GCS;
+
+ return ProviderType::UNKNOWN;
+}
+
+}
+
+Client::Client(
+ size_t max_redirects_,
+ ServerSideEncryptionKMSConfig sse_kms_config_,
+ const std::shared_ptr<Aws::Auth::AWSCredentialsProvider> & credentials_provider_,
+ const PocoHTTPClientConfiguration & client_configuration_,
+ Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy sign_payloads_,
+ bool use_virtual_addressing_)
+ : Aws::S3::S3Client(credentials_provider_, client_configuration_, sign_payloads_, use_virtual_addressing_)
+ , credentials_provider(credentials_provider_)
+ , client_configuration(client_configuration_)
+ , sign_payloads(sign_payloads_)
+ , use_virtual_addressing(use_virtual_addressing_)
+ , max_redirects(max_redirects_)
+ , sse_kms_config(std::move(sse_kms_config_))
+ , log(&Poco::Logger::get("S3Client"))
+{
+#if 0
+ auto * endpoint_provider = dynamic_cast<Aws::S3::Endpoint::S3DefaultEpProviderBase *>(accessEndpointProvider().get());
+ endpoint_provider->GetBuiltInParameters().GetParameter("Region").GetString(explicit_region);
+ endpoint_provider->GetBuiltInParameters().GetParameter("Endpoint").GetString(initial_endpoint);
+#endif
+
+ provider_type = deduceProviderType(initial_endpoint);
+ LOG_TRACE(log, "Provider type: {}", toString(provider_type));
+
+ if (provider_type == ProviderType::GCS)
+ {
+ /// GCS can operate in 2 modes for header and query param names:
+ /// - with both x-amz and x-goog prefixes allowed (but different prefixes cannot be mixed in the same request)
+ /// - only with the x-goog prefix
+ /// The first mode is allowed only with HMAC (or unsigned requests), so when we
+ /// find credential keys we can simply behave as if the underlying storage is S3;
+ /// otherwise, we need to be aware that we are making requests to GCS
+ /// and replace all headers with a valid prefix when needed.
+ if (credentials_provider)
+ {
+ auto credentials = credentials_provider->GetAWSCredentials();
+ if (credentials.IsEmpty())
+ api_mode = ApiMode::GCS;
+ }
+ }
+
+ LOG_TRACE(log, "API mode of the S3 client: {}", api_mode);
+
+ detect_region = provider_type == ProviderType::AWS && explicit_region == Aws::Region::AWS_GLOBAL;
+
+ cache = std::make_shared<ClientCache>();
+ ClientCacheRegistry::instance().registerClient(cache);
+}
+
+Client::Client(
+ const Client & other, const PocoHTTPClientConfiguration & client_configuration_)
+ : Aws::S3::S3Client(other.credentials_provider, client_configuration_, other.sign_payloads,
+ other.use_virtual_addressing)
+ , initial_endpoint(other.initial_endpoint)
+ , credentials_provider(other.credentials_provider)
+ , client_configuration(client_configuration_)
+ , sign_payloads(other.sign_payloads)
+ , use_virtual_addressing(other.use_virtual_addressing)
+ , explicit_region(other.explicit_region)
+ , detect_region(other.detect_region)
+ , provider_type(other.provider_type)
+ , max_redirects(other.max_redirects)
+ , sse_kms_config(other.sse_kms_config)
+ , log(&Poco::Logger::get("S3Client"))
+{
+ cache = std::make_shared<ClientCache>(*other.cache);
+ ClientCacheRegistry::instance().registerClient(cache);
+}
+
+Aws::Auth::AWSCredentials Client::getCredentials() const
+{
+ return credentials_provider->GetAWSCredentials();
+}
+
+bool Client::checkIfWrongRegionDefined(const std::string & bucket, const Aws::S3::S3Error & error, std::string & region) const
+{
+ if (detect_region)
+ return false;
+
+ if (error.GetResponseCode() == Aws::Http::HttpResponseCode::BAD_REQUEST && error.GetExceptionName() == "AuthorizationHeaderMalformed")
+ {
+ region = GetErrorMarshaller()->ExtractRegion(error);
+
+ if (region.empty())
+ region = getRegionForBucket(bucket, /*force_detect*/ true);
+
+ assert(!explicit_region.empty());
+ if (region == explicit_region)
+ return false;
+
+ insertRegionOverride(bucket, region);
+ return true;
+ }
+
+ return false;
+}
+
+void Client::insertRegionOverride(const std::string & bucket, const std::string & region) const
+{
+ std::lock_guard lock(cache->region_cache_mutex);
+ auto [it, inserted] = cache->region_for_bucket_cache.emplace(bucket, region);
+ if (inserted)
+ LOG_INFO(log, "Detected different region ('{}') for bucket {} than the one defined ('{}')", region, bucket, explicit_region);
+}
+
+template <typename RequestType>
+void Client::setKMSHeaders(RequestType & request) const
+{
+ // Don't do anything unless a key ID was specified
+ if (sse_kms_config.key_id)
+ {
+ request.SetServerSideEncryption(Model::ServerSideEncryption::aws_kms);
+ // If the key ID was specified but is empty, treat it as using the AWS managed key and omit the header
+ if (!sse_kms_config.key_id->empty())
+ request.SetSSEKMSKeyId(*sse_kms_config.key_id);
+ if (sse_kms_config.encryption_context)
+ request.SetSSEKMSEncryptionContext(*sse_kms_config.encryption_context);
+ if (sse_kms_config.bucket_key_enabled)
+ request.SetBucketKeyEnabled(*sse_kms_config.bucket_key_enabled);
+ }
+}
+
+// Explicitly instantiate this method only for the request types that support KMS headers
+template void Client::setKMSHeaders<CreateMultipartUploadRequest>(CreateMultipartUploadRequest & request) const;
+template void Client::setKMSHeaders<CopyObjectRequest>(CopyObjectRequest & request) const;
+template void Client::setKMSHeaders<PutObjectRequest>(PutObjectRequest & request) const;
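+/// Usage sketch (illustrative; the bucket and key below are hypothetical values):
+/// the KMS headers must be applied before the SDK signs and sends the request.
+///
+///     PutObjectRequest request;
+///     request.SetBucket("my-bucket");
+///     request.SetKey("path/to/object");
+///     client->setKMSHeaders(request);   /// no-op unless a KMS key ID is configured
+///     auto outcome = client->PutObject(request);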
+
+Model::HeadObjectOutcome Client::HeadObject(const HeadObjectRequest & request) const
+{
+ const auto & bucket = request.GetBucket();
+
+ request.setApiMode(api_mode);
+
+ if (auto region = getRegionForBucket(bucket); !region.empty())
+ {
+ if (!detect_region)
+ LOG_INFO(log, "Using region override {} for bucket {}", region, bucket);
+ request.overrideRegion(std::move(region));
+ }
+
+ if (auto uri = getURIForBucket(bucket); uri.has_value())
+ request.overrideURI(std::move(*uri));
+
+ auto result = HeadObject(static_cast<const Model::HeadObjectRequest&>(request));
+ if (result.IsSuccess())
+ return result;
+
+ const auto & error = result.GetError();
+
+ std::string new_region;
+ if (checkIfWrongRegionDefined(bucket, error, new_region))
+ {
+ request.overrideRegion(new_region);
+ return Aws::S3::S3Client::HeadObject(request);
+ }
+
+ if (error.GetResponseCode() != Aws::Http::HttpResponseCode::MOVED_PERMANENTLY)
+ return result;
+
+ // maybe we can detect the correct region
+ if (!detect_region)
+ {
+ if (auto region = GetErrorMarshaller()->ExtractRegion(error); !region.empty() && region != explicit_region)
+ {
+ request.overrideRegion(region);
+ insertRegionOverride(bucket, region);
+ }
+ }
+
+ auto bucket_uri = getURIForBucket(bucket);
+ if (!bucket_uri)
+ {
+ if (auto maybe_error = updateURIForBucketForHead(bucket); maybe_error.has_value())
+ return *maybe_error;
+
+ if (auto region = getRegionForBucket(bucket); !region.empty())
+ {
+ if (!detect_region)
+ LOG_INFO(log, "Using region override {} for bucket {}", region, bucket);
+ request.overrideRegion(std::move(region));
+ }
+
+ bucket_uri = getURIForBucket(bucket);
+ if (!bucket_uri)
+ {
+ LOG_ERROR(log, "Missing resolved URI for bucket {}, maybe the cache was cleaned", bucket);
+ return result;
+ }
+ }
+
+ const auto & current_uri_override = request.getURIOverride();
+ /// we already tried with this URI
+ if (current_uri_override && current_uri_override->uri == bucket_uri->uri)
+ {
+ LOG_INFO(log, "Getting redirected to the same invalid location {}", bucket_uri->uri.toString());
+ return result;
+ }
+
+ request.overrideURI(std::move(*bucket_uri));
+
+ /// The next call is NOT a recursive call:
+ /// it is a virtual call to Aws::S3::S3Client::HeadObject(const Model::HeadObjectRequest &)
+ return HeadObject(static_cast<const Model::HeadObjectRequest&>(request));
+}
+
+/// For each request type, we wrap the corresponding request function from Aws::S3::S3Client with doRequest.
+/// doRequest calls the virtual function from Aws::S3::S3Client, while DB::S3::Client exposes a non-virtual method per request type.
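+/// Call-flow sketch (illustrative) for a read request:
+///     DB::S3::Client::ListObjectsV2(request)              // non-virtual wrapper defined below
+///       -> doRequestWithRetryNetworkErrors(request, fn)   // retries network errors while reading the response
+///         -> doRequest(request, fn)                       // applies region/URI overrides, follows 301 redirects
+///           -> fn(request)                                // virtual Aws::S3::S3Client::ListObjectsV2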
+
+Model::ListObjectsV2Outcome Client::ListObjectsV2(const ListObjectsV2Request & request) const
+{
+ return doRequestWithRetryNetworkErrors</*IsReadMethod*/ true>(
+ request, [this](const Model::ListObjectsV2Request & req) { return ListObjectsV2(req); });
+}
+
+Model::ListObjectsOutcome Client::ListObjects(const ListObjectsRequest & request) const
+{
+ return doRequestWithRetryNetworkErrors</*IsReadMethod*/ true>(
+ request, [this](const Model::ListObjectsRequest & req) { return ListObjects(req); });
+}
+
+Model::GetObjectOutcome Client::GetObject(const GetObjectRequest & request) const
+{
+ return doRequest(request, [this](const Model::GetObjectRequest & req) { return GetObject(req); });
+}
+
+Model::AbortMultipartUploadOutcome Client::AbortMultipartUpload(const AbortMultipartUploadRequest & request) const
+{
+ return doRequestWithRetryNetworkErrors</*IsReadMethod*/ false>(
+ request, [this](const Model::AbortMultipartUploadRequest & req) { return AbortMultipartUpload(req); });
+}
+
+Model::CreateMultipartUploadOutcome Client::CreateMultipartUpload(const CreateMultipartUploadRequest & request) const
+{
+ return doRequestWithRetryNetworkErrors</*IsReadMethod*/ false>(
+ request, [this](const Model::CreateMultipartUploadRequest & req) { return CreateMultipartUpload(req); });
+}
+
+Model::CompleteMultipartUploadOutcome Client::CompleteMultipartUpload(const CompleteMultipartUploadRequest & request) const
+{
+ auto outcome = doRequestWithRetryNetworkErrors</*IsReadMethod*/ false>(
+ request, [this](const Model::CompleteMultipartUploadRequest & req) { return CompleteMultipartUpload(req); });
+
+ if (!outcome.IsSuccess() || provider_type != ProviderType::GCS)
+ return outcome;
+
+ const auto & key = request.GetKey();
+ const auto & bucket = request.GetBucket();
+
+ /// For GCS we try to compose the object at the end; otherwise we cannot do a native copy
+ /// of the object later (e.g. for backups).
+ /// We don't care if the compose fails, because the upload was still successful; only the
+ /// performance of copying the object will be affected.
+ S3::ComposeObjectRequest compose_req;
+ compose_req.SetBucket(bucket);
+ compose_req.SetKey(key);
+ compose_req.SetComponentNames({key});
+ compose_req.SetContentType("binary/octet-stream");
+#if 0
+ auto compose_outcome = ComposeObject(compose_req);
+
+ if (compose_outcome.IsSuccess())
+ LOG_TRACE(log, "Composing object was successful");
+ else
+ LOG_INFO(log, "Failed to compose object. Message: {}, Key: {}, Bucket: {}", compose_outcome.GetError().GetMessage(), key, bucket);
+#endif
+
+ return outcome;
+}
+
+Model::CopyObjectOutcome Client::CopyObject(const CopyObjectRequest & request) const
+{
+ return doRequestWithRetryNetworkErrors</*IsReadMethod*/ false>(
+ request, [this](const Model::CopyObjectRequest & req) { return CopyObject(req); });
+}
+
+Model::PutObjectOutcome Client::PutObject(const PutObjectRequest & request) const
+{
+ return doRequestWithRetryNetworkErrors</*IsReadMethod*/ false>(
+ request, [this](const Model::PutObjectRequest & req) { return PutObject(req); });
+}
+
+Model::UploadPartOutcome Client::UploadPart(const UploadPartRequest & request) const
+{
+ return doRequestWithRetryNetworkErrors</*IsReadMethod*/ false>(
+ request, [this](const Model::UploadPartRequest & req) { return UploadPart(req); });
+}
+
+Model::UploadPartCopyOutcome Client::UploadPartCopy(const UploadPartCopyRequest & request) const
+{
+ return doRequestWithRetryNetworkErrors</*IsReadMethod*/ false>(
+ request, [this](const Model::UploadPartCopyRequest & req) { return UploadPartCopy(req); });
+}
+
+Model::DeleteObjectOutcome Client::DeleteObject(const DeleteObjectRequest & request) const
+{
+ return doRequestWithRetryNetworkErrors</*IsReadMethod*/ false>(
+ request, [this](const Model::DeleteObjectRequest & req) { return DeleteObject(req); });
+}
+
+Model::DeleteObjectsOutcome Client::DeleteObjects(const DeleteObjectsRequest & request) const
+{
+ return doRequestWithRetryNetworkErrors</*IsReadMethod*/ false>(
+ request, [this](const Model::DeleteObjectsRequest & req) { return DeleteObjects(req); });
+}
+
+#if 0
+Client::ComposeObjectOutcome Client::ComposeObject(const ComposeObjectRequest & request) const
+{
+ auto request_fn = [this](const ComposeObjectRequest & req)
+ {
+ auto & endpoint_provider = const_cast<Client &>(*this).accessEndpointProvider();
+ AWS_OPERATION_CHECK_PTR(endpoint_provider, ComposeObject, Aws::Client::CoreErrors, Aws::Client::CoreErrors::ENDPOINT_RESOLUTION_FAILURE);
+
+ if (!req.BucketHasBeenSet())
+ {
+ AWS_LOGSTREAM_ERROR("ComposeObject", "Required field: Bucket, is not set")
+ return ComposeObjectOutcome(Aws::Client::AWSError<Aws::S3::S3Errors>(Aws::S3::S3Errors::MISSING_PARAMETER, "MISSING_PARAMETER", "Missing required field [Bucket]", false));
+ }
+
+ if (!req.KeyHasBeenSet())
+ {
+ AWS_LOGSTREAM_ERROR("ComposeObject", "Required field: Key, is not set")
+ return ComposeObjectOutcome(Aws::Client::AWSError<Aws::S3::S3Errors>(Aws::S3::S3Errors::MISSING_PARAMETER, "MISSING_PARAMETER", "Missing required field [Key]", false));
+ }
+
+ auto endpointResolutionOutcome = endpoint_provider->ResolveEndpoint(req.GetEndpointContextParams());
+ AWS_OPERATION_CHECK_SUCCESS(endpointResolutionOutcome, ComposeObject, Aws::Client::CoreErrors, Aws::Client::CoreErrors::ENDPOINT_RESOLUTION_FAILURE, endpointResolutionOutcome.GetError().GetMessage());
+ endpointResolutionOutcome.GetResult().AddPathSegments(req.GetKey());
+ endpointResolutionOutcome.GetResult().SetQueryString("?compose");
+ return ComposeObjectOutcome(MakeRequest(req, endpointResolutionOutcome.GetResult(), Aws::Http::HttpMethod::HTTP_PUT));
+ };
+
+ return doRequestWithRetryNetworkErrors</*IsReadMethod*/ false>(
+ request, request_fn);
+}
+#endif
+
+template <typename RequestType, typename RequestFn>
+std::invoke_result_t<RequestFn, RequestType>
+Client::doRequest(const RequestType & request, RequestFn request_fn) const
+{
+ const auto & bucket = request.GetBucket();
+ request.setApiMode(api_mode);
+
+ if (auto region = getRegionForBucket(bucket); !region.empty())
+ {
+ if (!detect_region)
+ LOG_INFO(log, "Using region override {} for bucket {}", region, bucket);
+
+ request.overrideRegion(std::move(region));
+ }
+
+ if (auto uri = getURIForBucket(bucket); uri.has_value())
+ request.overrideURI(std::move(*uri));
+
+
+ bool found_new_endpoint = false;
+ // if we found the correct endpoint after 301 responses, update the cache for future requests
+ SCOPE_EXIT(
+ if (found_new_endpoint)
+ {
+ auto uri_override = request.getURIOverride();
+ assert(uri_override.has_value());
+ updateURIForBucket(bucket, std::move(*uri_override));
+ }
+ );
+
+ for (size_t attempt = 0; attempt <= max_redirects; ++attempt)
+ {
+ auto result = request_fn(request);
+ if (result.IsSuccess())
+ return result;
+
+ const auto & error = result.GetError();
+
+ std::string new_region;
+ if (checkIfWrongRegionDefined(bucket, error, new_region))
+ {
+ request.overrideRegion(new_region);
+ continue;
+ }
+
+ if (error.GetResponseCode() != Aws::Http::HttpResponseCode::MOVED_PERMANENTLY)
+ return result;
+
+ // maybe we can detect the correct region
+ if (!detect_region)
+ {
+ if (auto region = GetErrorMarshaller()->ExtractRegion(error); !region.empty() && region != explicit_region)
+ {
+ request.overrideRegion(region);
+ insertRegionOverride(bucket, region);
+ }
+ }
+
+ // we possibly got new location, need to try with that one
+ auto new_uri = getURIFromError(error);
+ if (!new_uri)
+ return result;
+
+ const auto & current_uri_override = request.getURIOverride();
+ /// we already tried with this URI
+ if (current_uri_override && current_uri_override->uri == new_uri->uri)
+ {
+ LOG_INFO(log, "Getting redirected to the same invalid location {}", new_uri->uri.toString());
+ return result;
+ }
+
+ found_new_endpoint = true;
+ request.overrideURI(*new_uri);
+ }
+
+ throw Exception(ErrorCodes::TOO_MANY_REDIRECTS, "Too many redirects");
+}
+
+template <bool IsReadMethod, typename RequestType, typename RequestFn>
+std::invoke_result_t<RequestFn, RequestType>
+Client::doRequestWithRetryNetworkErrors(const RequestType & request, RequestFn request_fn) const
+{
+ auto with_retries = [this, request_fn_ = std::move(request_fn)] (const RequestType & request_)
+ {
+ chassert(client_configuration.retryStrategy);
+ const Int64 max_attempts = client_configuration.retryStrategy->GetMaxAttempts();
+ std::exception_ptr last_exception = nullptr;
+ for (Int64 attempt_no = 0; attempt_no < max_attempts; ++attempt_no)
+ {
+ try
+ {
+ /// The S3 SDK does retry network errors, but it matters where the error occurs.
+ /// This code retries a specific case: a network error that happens
+ /// while the XML document is being read from the response body.
+ /// Since the response body is a stream, network errors are possible during reading,
+ /// and the SDK doesn't retry them.
+
+ /// Not all requests can be retried in this way.
+ /// Requests that read out the response body to build the result can be retried.
+ /// Requests that expose the response stream as the answer are not retried by this code, e.g. GetObject.
+ return request_fn_(request_);
+ }
+ catch (Poco::Net::ConnectionResetException &)
+ {
+
+ if constexpr (IsReadMethod)
+ {
+ if (client_configuration.for_disk_s3)
+ ProfileEvents::increment(ProfileEvents::DiskS3ReadRequestsErrors);
+ else
+ ProfileEvents::increment(ProfileEvents::S3ReadRequestsErrors);
+ }
+ else
+ {
+ if (client_configuration.for_disk_s3)
+ ProfileEvents::increment(ProfileEvents::DiskS3WriteRequestsErrors);
+ else
+ ProfileEvents::increment(ProfileEvents::S3WriteRequestsErrors);
+ }
+
+ tryLogCurrentException(log, "Will retry");
+ last_exception = std::current_exception();
+
+ auto error = Aws::Client::AWSError<Aws::Client::CoreErrors>(Aws::Client::CoreErrors::NETWORK_CONNECTION, /*retry*/ true);
+ client_configuration.retryStrategy->CalculateDelayBeforeNextRetry(error, attempt_no);
+ continue;
+ }
+ }
+
+ chassert(last_exception);
+ std::rethrow_exception(last_exception);
+ };
+
+ return doRequest(request, with_retries);
+}
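+/// Note: the with_retries lambda is passed to doRequest as the request function, so each
+/// redirect attempt made by doRequest gets its own budget of network-error retries
+/// (up to GetMaxAttempts() tries per endpoint tried).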
+
+bool Client::supportsMultiPartCopy() const
+{
+ return provider_type != ProviderType::GCS;
+}
+
+void Client::BuildHttpRequest(const Aws::AmazonWebServiceRequest& request,
+ const std::shared_ptr<Aws::Http::HttpRequest>& httpRequest) const
+{
+ Aws::S3::S3Client::BuildHttpRequest(request, httpRequest);
+
+ if (api_mode == ApiMode::GCS)
+ {
+ /// some GCS requests don't like the S3-specific headers that the client sets
+ httpRequest->DeleteHeader("x-amz-api-version");
+ httpRequest->DeleteHeader("amz-sdk-invocation-id");
+ httpRequest->DeleteHeader("amz-sdk-request");
+ }
+}
+
+std::string Client::getRegionForBucket(const std::string & bucket, bool force_detect) const
+{
+ std::lock_guard lock(cache->region_cache_mutex);
+ if (auto it = cache->region_for_bucket_cache.find(bucket); it != cache->region_for_bucket_cache.end())
+ return it->second;
+
+ if (!force_detect && !detect_region)
+ return "";
+
+ LOG_INFO(log, "Resolving region for bucket {}", bucket);
+ Aws::S3::Model::HeadBucketRequest req;
+ req.SetBucket(bucket);
+
+ std::string region;
+ auto outcome = HeadBucket(req);
+ if (outcome.IsSuccess())
+ {
+ const auto & result = outcome.GetResult();
+ // region = result.GetRegion();
+ }
+ else
+ {
+ static const std::string region_header = "x-amz-bucket-region";
+ const auto & headers = outcome.GetError().GetResponseHeaders();
+ if (auto it = headers.find(region_header); it != headers.end())
+ region = it->second;
+ }
+
+ if (region.empty())
+ {
+ LOG_INFO(log, "Failed resolving region for bucket {}", bucket);
+ return "";
+ }
+
+ LOG_INFO(log, "Found region {} for bucket {}", region, bucket);
+
+ auto [it, _] = cache->region_for_bucket_cache.emplace(bucket, std::move(region));
+
+ return it->second;
+}
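+/// Illustrative: a HeadBucket sent to the wrong region typically fails, but the
+/// error response carries an "x-amz-bucket-region" header (e.g. "eu-west-1"),
+/// which is read above and cached for subsequent requests.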
+
+std::optional<S3::URI> Client::getURIFromError(const Aws::S3::S3Error & error) const
+{
+ return std::nullopt;
+#if 0
+ auto endpoint = GetErrorMarshaller()->ExtractEndpoint(error);
+ if (endpoint.empty())
+ return std::nullopt;
+
+ auto & s3_client = const_cast<Client &>(*this);
+ const auto * endpoint_provider = dynamic_cast<Aws::S3::Endpoint::S3DefaultEpProviderBase *>(s3_client.accessEndpointProvider().get());
+ auto resolved_endpoint = endpoint_provider->ResolveEndpoint({});
+
+ if (!resolved_endpoint.IsSuccess())
+ return std::nullopt;
+
+ auto uri = resolved_endpoint.GetResult().GetURI();
+ uri.SetAuthority(endpoint);
+
+ return S3::URI(uri.GetURIString());
+#endif
+}
+
+// Do a list request because head requests don't have a body in the response
+std::optional<Aws::S3::S3Error> Client::updateURIForBucketForHead(const std::string & bucket) const
+{
+ ListObjectsV2Request req;
+ req.SetBucket(bucket);
+ req.SetMaxKeys(1);
+ auto result = ListObjectsV2(req);
+ if (result.IsSuccess())
+ return std::nullopt;
+ return result.GetError();
+}
+
+std::optional<S3::URI> Client::getURIForBucket(const std::string & bucket) const
+{
+ std::lock_guard lock(cache->uri_cache_mutex);
+ if (auto it = cache->uri_for_bucket_cache.find(bucket); it != cache->uri_for_bucket_cache.end())
+ return it->second;
+
+ return std::nullopt;
+}
+
+void Client::updateURIForBucket(const std::string & bucket, S3::URI new_uri) const
+{
+ std::lock_guard lock(cache->uri_cache_mutex);
+ if (auto it = cache->uri_for_bucket_cache.find(bucket); it != cache->uri_for_bucket_cache.end())
+ {
+ if (it->second.uri == new_uri.uri)
+ return;
+
+ LOG_INFO(log, "Updating URI for bucket {} to {}", bucket, new_uri.uri.toString());
+ it->second = std::move(new_uri);
+
+ return;
+ }
+
+ LOG_INFO(log, "Updating URI for bucket {} to {}", bucket, new_uri.uri.toString());
+ cache->uri_for_bucket_cache.emplace(bucket, std::move(new_uri));
+}
+
+
+void ClientCache::clearCache()
+{
+ {
+ std::lock_guard lock(region_cache_mutex);
+ region_for_bucket_cache.clear();
+ }
+ {
+ std::lock_guard lock(uri_cache_mutex);
+ uri_for_bucket_cache.clear();
+ }
+}
+
+void ClientCacheRegistry::registerClient(const std::shared_ptr<ClientCache> & client_cache)
+{
+ std::lock_guard lock(clients_mutex);
+ auto [it, inserted] = client_caches.emplace(client_cache.get(), client_cache);
+ if (!inserted)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Same S3 client registered twice");
+}
+
+void ClientCacheRegistry::unregisterClient(ClientCache * client)
+{
+ std::lock_guard lock(clients_mutex);
+ auto erased = client_caches.erase(client);
+ if (erased == 0)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't unregister S3 client, either it was already unregistered or not registered at all");
+}
+
+void ClientCacheRegistry::clearCacheForAll()
+{
+ std::lock_guard lock(clients_mutex);
+
+ for (auto it = client_caches.begin(); it != client_caches.end();)
+ {
+ if (auto locked_client = it->second.lock(); locked_client)
+ {
+ locked_client->clearCache();
+ ++it;
+ }
+ else
+ {
+ LOG_INFO(&Poco::Logger::get("ClientCacheRegistry"), "Deleting leftover S3 client cache");
+ it = client_caches.erase(it);
+ }
+ }
+
+}
+
+ClientFactory::ClientFactory()
+{
+ aws_options = Aws::SDKOptions{};
+ Aws::InitAPI(aws_options);
+ // Aws::Utils::Logging::InitializeAWSLogging(std::make_shared<AWSLogger>(false));
+ Aws::Http::SetHttpClientFactory(std::make_shared<PocoHTTPClientFactory>());
+}
+
+ClientFactory::~ClientFactory()
+{
+ // Aws::Utils::Logging::ShutdownAWSLogging();
+ Aws::ShutdownAPI(aws_options);
+}
+
+ClientFactory & ClientFactory::instance()
+{
+ static ClientFactory ret;
+ return ret;
+}
+
+std::unique_ptr<S3::Client> ClientFactory::create( // NOLINT
+ const PocoHTTPClientConfiguration & cfg_,
+ bool is_virtual_hosted_style,
+ const String & access_key_id,
+ const String & secret_access_key,
+ const String & server_side_encryption_customer_key_base64,
+ ServerSideEncryptionKMSConfig sse_kms_config,
+ HTTPHeaderEntries headers,
+ CredentialsConfiguration credentials_configuration,
+ const String & session_token)
+{
+ PocoHTTPClientConfiguration client_configuration = cfg_;
+ client_configuration.updateSchemeAndRegion();
+
+ if (!server_side_encryption_customer_key_base64.empty())
+ {
+ /// See Client::GeneratePresignedUrlWithSSEC().
+
+ headers.push_back({Aws::S3::SSEHeaders::SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM,
+ Aws::S3::Model::ServerSideEncryptionMapper::GetNameForServerSideEncryption(Aws::S3::Model::ServerSideEncryption::AES256)});
+
+ headers.push_back({Aws::S3::SSEHeaders::SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY,
+ server_side_encryption_customer_key_base64});
+
+ Aws::Utils::ByteBuffer buffer = Aws::Utils::HashingUtils::Base64Decode(server_side_encryption_customer_key_base64);
+ String str_buffer(reinterpret_cast<char *>(buffer.GetUnderlyingData()), buffer.GetLength());
+ headers.push_back({Aws::S3::SSEHeaders::SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
+ Aws::Utils::HashingUtils::Base64Encode(Aws::Utils::HashingUtils::CalculateMD5(str_buffer))});
+ }
+
+ // These will be added after request signing
+ client_configuration.extra_headers = std::move(headers);
+
+ Aws::Auth::AWSCredentials credentials(access_key_id, secret_access_key, session_token);
+ auto credentials_provider = std::make_shared<S3CredentialsProviderChain>(
+ client_configuration,
+ std::move(credentials),
+ credentials_configuration);
+
+ client_configuration.retryStrategy = std::make_shared<Client::RetryStrategy>(std::move(client_configuration.retryStrategy));
+ return Client::create(
+ client_configuration.s3_max_redirects,
+ std::move(sse_kms_config),
+ credentials_provider,
+ client_configuration, // Client configuration.
+ Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never,
+ is_virtual_hosted_style || client_configuration.endpointOverride.empty() /// Use virtual addressing if endpoint is not specified.
+ );
+}
+
+PocoHTTPClientConfiguration ClientFactory::createClientConfiguration( // NOLINT
+ const String & force_region,
+ const RemoteHostFilter & remote_host_filter,
+ unsigned int s3_max_redirects,
+ bool enable_s3_requests_logging,
+ bool for_disk_s3,
+ const ThrottlerPtr & get_request_throttler,
+ const ThrottlerPtr & put_request_throttler,
+ const String & protocol)
+{
+ auto proxy_configuration_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::protocolFromString(protocol));
+
+ auto per_request_configuration = [=] () { return proxy_configuration_resolver->resolve(); };
+ auto error_report = [=] (const DB::ProxyConfiguration & req) { proxy_configuration_resolver->errorReport(req); };
+
+ auto config = PocoHTTPClientConfiguration(
+ per_request_configuration,
+ force_region,
+ remote_host_filter,
+ s3_max_redirects,
+ enable_s3_requests_logging,
+ for_disk_s3,
+ get_request_throttler,
+ put_request_throttler,
+ error_report);
+
+ config.scheme = Aws::Http::SchemeMapper::FromString(protocol.c_str());
+
+ return config;
+}
+
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/Client.h b/contrib/clickhouse/src/IO/S3/Client.h
new file mode 100644
index 0000000000..721b8dd944
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/Client.h
@@ -0,0 +1,329 @@
+#pragma once
+
+#include <optional>
+#include <base/types.h>
+
+#include "clickhouse_config.h"
+
+namespace DB::S3
+{
+
+/// See https://docs.aws.amazon.com/AmazonS3/latest/userguide/specifying-kms-encryption.html
+/// Needed by S3Common.h even if USE_AWS_S3 is 0
+struct ServerSideEncryptionKMSConfig
+{
+ // If key_id is non-null, enable SSE-KMS. If key_id is "", use the AWS managed key
+ std::optional<String> key_id = std::nullopt;
+ std::optional<String> encryption_context = std::nullopt;
+ std::optional<bool> bucket_key_enabled = std::nullopt;
+
+ bool operator==(const ServerSideEncryptionKMSConfig & other) const = default;
+};
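+/// Example (illustrative):
+///
+///     ServerSideEncryptionKMSConfig config;
+///     config.key_id = "";   // non-null but empty: use the AWS managed key
+///     config.bucket_key_enabled = true;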
+
+}
+
+#if USE_AWS_S3
+
+#include <Common/assert_cast.h>
+#include <base/scope_guard.h>
+
+#include <IO/S3/URI.h>
+#include <IO/S3/Requests.h>
+#include <IO/S3/PocoHTTPClient.h>
+#include <IO/S3/Credentials.h>
+#include <IO/S3/ProviderType.h>
+
+#include <aws/core/Aws.h>
+#include <aws/core/client/DefaultRetryStrategy.h>
+#include <aws/s3/S3Client.h>
+// #include <aws/s3/S3ServiceClientModel.h>
+#include <aws/core/client/AWSErrorMarshaller.h>
+#include <aws/core/client/RetryStrategy.h>
+
+namespace MockS3
+{
+ struct Client;
+}
+
+namespace DB::S3
+{
+
+namespace Model = Aws::S3::Model;
+
+struct ClientCache
+{
+ ClientCache() = default;
+
+ ClientCache(const ClientCache & other)
+ : region_for_bucket_cache(other.region_for_bucket_cache)
+ , uri_for_bucket_cache(other.uri_for_bucket_cache)
+ {}
+
+ ClientCache(ClientCache && other) = delete;
+
+ ClientCache & operator=(const ClientCache &) = delete;
+ ClientCache & operator=(ClientCache &&) = delete;
+
+ void clearCache();
+
+ std::mutex region_cache_mutex;
+ std::unordered_map<std::string, std::string> region_for_bucket_cache;
+
+ std::mutex uri_cache_mutex;
+ std::unordered_map<std::string, URI> uri_for_bucket_cache;
+};
+
+class ClientCacheRegistry
+{
+public:
+ static ClientCacheRegistry & instance()
+ {
+ static ClientCacheRegistry registry;
+ return registry;
+ }
+
+ void registerClient(const std::shared_ptr<ClientCache> & client_cache);
+ void unregisterClient(ClientCache * client);
+ void clearCacheForAll();
+private:
+ ClientCacheRegistry() = default;
+
+ std::mutex clients_mutex;
+ std::unordered_map<ClientCache *, std::weak_ptr<ClientCache>> client_caches;
+};
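+/// The registry holds weak_ptrs, so each cache's lifetime is still owned by its
+/// client; clearCacheForAll() clears the caches of live clients and erases
+/// entries whose clients are already gone.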
+
+/// Client that improves the client from the AWS SDK
+/// - inject region and URI into requests so they are rerouted to the correct destination if needed
+/// - automatically detect endpoint and regions for each bucket and cache them
+///
+/// For this client to work correctly both Client::RetryStrategy and Requests defined in <IO/S3/Requests.h> should be used.
+///
+/// To add support for new type of request
+/// - ExtendedRequest should be defined inside IO/S3/Requests.h
+/// - new method accepting that request should be defined in this Client (check other requests for reference)
+/// - method handling the request from Aws::S3::S3Client should be left to private so we don't use it by accident
+class Client : private Aws::S3::S3Client
+{
+public:
+ class RetryStrategy;
+
+ /// We use a factory method to verify arguments before creating a client because
+ /// there are certain requirements on the arguments for it to work correctly,
+ /// e.g. Client::RetryStrategy must be used.
+ static std::unique_ptr<Client> create(
+ size_t max_redirects_,
+ ServerSideEncryptionKMSConfig sse_kms_config_,
+ const std::shared_ptr<Aws::Auth::AWSCredentialsProvider> & credentials_provider,
+ const PocoHTTPClientConfiguration & client_configuration,
+ Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy sign_payloads,
+ bool use_virtual_addressing);
+
+ /// Create a client with adjusted settings:
+ /// * override_retry_strategy can be used to disable retries to avoid nested retries when we have
+ /// a retry loop outside of S3 client. Specifically, for read and write buffers. Currently not
+ /// actually used.
+ /// * override_request_timeout_ms is used to increase timeout for CompleteMultipartUploadRequest
+ /// because it often sits idle for 10 seconds: https://github.com/ClickHouse/ClickHouse/pull/42321
+ std::unique_ptr<Client> clone(
+ std::optional<std::shared_ptr<RetryStrategy>> override_retry_strategy = std::nullopt,
+ std::optional<Int64> override_request_timeout_ms = std::nullopt) const;
+
+ Client & operator=(const Client &) = delete;
+
+ Client(Client && other) = delete;
+ Client & operator=(Client &&) = delete;
+
+ ~Client() override
+ {
+ try
+ {
+ ClientCacheRegistry::instance().unregisterClient(cache.get());
+ }
+ catch (...)
+ {
+ tryLogCurrentException(log);
+ throw;
+ }
+ }
+
+ /// Returns the initial endpoint.
+ const String & getInitialEndpoint() const { return initial_endpoint; }
+ const String & getRegion() const { return explicit_region; }
+
+ Aws::Auth::AWSCredentials getCredentials() const;
+
+ /// Decorator for RetryStrategy needed for this client to work correctly.
+ /// We want to manually handle permanent moves (status code 301) because:
+ /// - the redirect location is written in XML format inside the response body, which doesn't exist for HEAD
+ /// requests, so we need to manually find the correct location;
+ /// - we want to cache the new location to decrease the number of roundtrips for future requests.
+ /// This decorator doesn't retry if a 301 is detected and falls back to the wrapped retry strategy otherwise.
+ class RetryStrategy : public Aws::Client::RetryStrategy
+ {
+ public:
+ explicit RetryStrategy(std::shared_ptr<Aws::Client::RetryStrategy> wrapped_strategy_);
+
+ /// NOLINTNEXTLINE(google-runtime-int)
+ bool ShouldRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>& error, long attemptedRetries) const override;
+
+ /// NOLINTNEXTLINE(google-runtime-int)
+ long CalculateDelayBeforeNextRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>& error, long attemptedRetries) const override;
+
+ /// NOLINTNEXTLINE(google-runtime-int)
+ long GetMaxAttempts() const override;
+
+ void GetSendToken() override;
+
+ // bool HasSendToken() override;
+
+ void RequestBookkeeping(const Aws::Client::HttpResponseOutcome& httpResponseOutcome) override;
+ void RequestBookkeeping(const Aws::Client::HttpResponseOutcome& httpResponseOutcome, const Aws::Client::AWSError<Aws::Client::CoreErrors>& lastError) override;
+ private:
+ std::shared_ptr<Aws::Client::RetryStrategy> wrapped_strategy;
+ };
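+ /// Construction sketch (illustrative), wrapping the SDK's default strategy;
+ /// this mirrors what ClientFactory::create() does with the configured strategy:
+ ///
+ ///     auto inner = std::make_shared<Aws::Client::DefaultRetryStrategy>();
+ ///     client_configuration.retryStrategy = std::make_shared<Client::RetryStrategy>(inner);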
+
+ /// SSE-KMS headers MUST be signed, so they need to be added before the SDK signs the message
+ /// (before sending the request with one of the methods below).
+ /// Per the docs (https://docs.aws.amazon.com/AmazonS3/latest/userguide/specifying-kms-encryption.html),
+ /// the headers should only be set for PutObject, CopyObject, POST Object, and CreateMultipartUpload.
+ template <typename RequestType>
+ void setKMSHeaders(RequestType & request) const;
+
+ Model::HeadObjectOutcome HeadObject(const HeadObjectRequest & request) const;
+ Model::ListObjectsV2Outcome ListObjectsV2(const ListObjectsV2Request & request) const;
+ Model::ListObjectsOutcome ListObjects(const ListObjectsRequest & request) const;
+ Model::GetObjectOutcome GetObject(const GetObjectRequest & request) const;
+
+ Model::AbortMultipartUploadOutcome AbortMultipartUpload(const AbortMultipartUploadRequest & request) const;
+ Model::CreateMultipartUploadOutcome CreateMultipartUpload(const CreateMultipartUploadRequest & request) const;
+ Model::CompleteMultipartUploadOutcome CompleteMultipartUpload(const CompleteMultipartUploadRequest & request) const;
+ Model::UploadPartOutcome UploadPart(const UploadPartRequest & request) const;
+ Model::UploadPartCopyOutcome UploadPartCopy(const UploadPartCopyRequest & request) const;
+
+ Model::CopyObjectOutcome CopyObject(const CopyObjectRequest & request) const;
+ Model::PutObjectOutcome PutObject(const PutObjectRequest & request) const;
+ Model::DeleteObjectOutcome DeleteObject(const DeleteObjectRequest & request) const;
+ Model::DeleteObjectsOutcome DeleteObjects(const DeleteObjectsRequest & request) const;
+
+ using ComposeObjectOutcome = Aws::Utils::Outcome<Aws::NoResult, Aws::S3::S3Error>;
+ ComposeObjectOutcome ComposeObject(const ComposeObjectRequest & request) const;
+
+ using Aws::S3::S3Client::EnableRequestProcessing;
+ using Aws::S3::S3Client::DisableRequestProcessing;
+
+ void BuildHttpRequest(const Aws::AmazonWebServiceRequest& request,
+ const std::shared_ptr<Aws::Http::HttpRequest>& httpRequest) const override;
+
+ bool supportsMultiPartCopy() const;
+private:
+ friend struct ::MockS3::Client;
+
+ Client(size_t max_redirects_,
+ ServerSideEncryptionKMSConfig sse_kms_config_,
+ const std::shared_ptr<Aws::Auth::AWSCredentialsProvider> & credentials_provider_,
+ const PocoHTTPClientConfiguration & client_configuration,
+ Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy sign_payloads,
+ bool use_virtual_addressing);
+
+ Client(
+ const Client & other, const PocoHTTPClientConfiguration & client_configuration);
+
+ /// Leave the regular functions private so we don't accidentally use them;
+ /// otherwise region and endpoint redirection won't work.
+ using Aws::S3::S3Client::HeadObject;
+ using Aws::S3::S3Client::ListObjectsV2;
+ using Aws::S3::S3Client::ListObjects;
+ using Aws::S3::S3Client::GetObject;
+
+ using Aws::S3::S3Client::AbortMultipartUpload;
+ using Aws::S3::S3Client::CreateMultipartUpload;
+ using Aws::S3::S3Client::CompleteMultipartUpload;
+ using Aws::S3::S3Client::UploadPart;
+ using Aws::S3::S3Client::UploadPartCopy;
+
+ using Aws::S3::S3Client::CopyObject;
+ using Aws::S3::S3Client::PutObject;
+ using Aws::S3::S3Client::DeleteObject;
+ using Aws::S3::S3Client::DeleteObjects;
+
+ template <typename RequestType, typename RequestFn>
+ std::invoke_result_t<RequestFn, RequestType>
+ doRequest(const RequestType & request, RequestFn request_fn) const;
+
+ template <bool IsReadMethod, typename RequestType, typename RequestFn>
+ std::invoke_result_t<RequestFn, RequestType>
+ doRequestWithRetryNetworkErrors(const RequestType & request, RequestFn request_fn) const;
+
+ void updateURIForBucket(const std::string & bucket, S3::URI new_uri) const;
+ std::optional<S3::URI> getURIFromError(const Aws::S3::S3Error & error) const;
+ std::optional<Aws::S3::S3Error> updateURIForBucketForHead(const std::string & bucket) const;
+
+ std::string getRegionForBucket(const std::string & bucket, bool force_detect = false) const;
+ std::optional<S3::URI> getURIForBucket(const std::string & bucket) const;
+
+ bool checkIfWrongRegionDefined(const std::string & bucket, const Aws::S3::S3Error & error, std::string & region) const;
+ void insertRegionOverride(const std::string & bucket, const std::string & region) const;
+
+ String initial_endpoint;
+ std::shared_ptr<Aws::Auth::AWSCredentialsProvider> credentials_provider;
+ PocoHTTPClientConfiguration client_configuration;
+ Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy sign_payloads;
+ bool use_virtual_addressing;
+
+ std::string explicit_region;
+ mutable bool detect_region = true;
+
+ /// The provider type can determine whether some functionality is supported,
+ /// but for the same provider we may need to generate different headers depending on the
+ /// mode.
+ /// E.g. GCS can work in AWS mode in some cases and accept headers with the x-amz prefix.
+ ProviderType provider_type{ProviderType::UNKNOWN};
+ ApiMode api_mode{ApiMode::AWS};
+
+ mutable std::shared_ptr<ClientCache> cache;
+
+ const size_t max_redirects;
+
+ const ServerSideEncryptionKMSConfig sse_kms_config;
+
+ Poco::Logger * log;
+};
+
+class ClientFactory
+{
+public:
+ ~ClientFactory();
+
+ static ClientFactory & instance();
+
+ std::unique_ptr<S3::Client> create(
+ const PocoHTTPClientConfiguration & cfg,
+ bool is_virtual_hosted_style,
+ const String & access_key_id,
+ const String & secret_access_key,
+ const String & server_side_encryption_customer_key_base64,
+ ServerSideEncryptionKMSConfig sse_kms_config,
+ HTTPHeaderEntries headers,
+ CredentialsConfiguration credentials_configuration,
+ const String & session_token = "");
+
+ PocoHTTPClientConfiguration createClientConfiguration(
+ const String & force_region,
+ const RemoteHostFilter & remote_host_filter,
+ unsigned int s3_max_redirects,
+ bool enable_s3_requests_logging,
+ bool for_disk_s3,
+ const ThrottlerPtr & get_request_throttler,
+ const ThrottlerPtr & put_request_throttler,
+ const String & protocol = "https");
+
+private:
+ ClientFactory();
+
+ Aws::SDKOptions aws_options;
+ std::atomic<bool> s3_requests_logging_enabled;
+};
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/Credentials.cpp b/contrib/clickhouse/src/IO/S3/Credentials.cpp
new file mode 100644
index 0000000000..c8820496bf
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/Credentials.cpp
@@ -0,0 +1,574 @@
+#include <IO/S3/Credentials.h>
+
+#if USE_AWS_S3
+
+# include <aws/core/Version.h>
+# include <aws/core/platform/OSVersionInfo.h>
+# include <aws/core/auth/STSCredentialsProvider.h>
+# include <aws/core/platform/Environment.h>
+# include <aws/core/client/SpecifiedRetryableErrorsRetryStrategy.h>
+# include <aws/core/utils/json/JsonSerializer.h>
+# include <aws/core/utils/UUID.h>
+# include <aws/core/http/HttpClientFactory.h>
+
+# include <Common/logger_useful.h>
+
+# include <IO/S3/PocoHTTPClient.h>
+# include <IO/S3/Client.h>
+
+# include <fstream>
+# include <base/EnumReflection.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int AWS_ERROR;
+}
+
+namespace S3
+{
+
+namespace
+{
+
+bool areCredentialsEmptyOrExpired(const Aws::Auth::AWSCredentials & credentials, uint64_t expiration_window_seconds)
+{
+ if (credentials.IsEmpty())
+ return true;
+
+ const Aws::Utils::DateTime now = Aws::Utils::DateTime::Now();
+ return now >= credentials.GetExpiration() - std::chrono::seconds(expiration_window_seconds);
+}
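+/// Illustrative: with expiration_window_seconds = 120, credentials expiring at
+/// 12:00:00 are treated as expired from 11:58:00 onwards, so they get refreshed
+/// before requests can start failing mid-flight.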
+
+}
+
+AWSEC2MetadataClient::AWSEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration, const char * endpoint_)
+ : Aws::Internal::AWSHttpResourceClient(client_configuration)
+ , endpoint(endpoint_)
+ , logger(&Poco::Logger::get("AWSEC2InstanceProfileConfigLoader"))
+{
+}
+
+Aws::String AWSEC2MetadataClient::GetResource(const char * resource_path) const
+{
+ return GetResource(endpoint.c_str(), resource_path, nullptr/*authToken*/);
+}
+
+Aws::String AWSEC2MetadataClient::getDefaultCredentials() const
+{
+ String credentials_string;
+ {
+ std::lock_guard locker(token_mutex);
+
+ LOG_TRACE(logger, "Getting default credentials for ec2 instance from {}", endpoint);
+ auto result = GetResourceWithAWSWebServiceResult(endpoint.c_str(), EC2_SECURITY_CREDENTIALS_RESOURCE, nullptr);
+ credentials_string = result.GetPayload();
+ if (result.GetResponseCode() == Aws::Http::HttpResponseCode::UNAUTHORIZED)
+ {
+ return {};
+ }
+ }
+
+ String trimmed_credentials_string = Aws::Utils::StringUtils::Trim(credentials_string.c_str());
+ if (trimmed_credentials_string.empty())
+ return {};
+
+ std::vector<String> security_credentials = Aws::Utils::StringUtils::Split(trimmed_credentials_string, '\n');
+
+ LOG_DEBUG(logger, "Calling EC2MetadataService resource, {} returned credential string {}.",
+ EC2_SECURITY_CREDENTIALS_RESOURCE, trimmed_credentials_string);
+
+ if (security_credentials.empty())
+ {
+ LOG_WARNING(logger, "Initial call to EC2MetadataService to get credentials failed.");
+ return {};
+ }
+
+ Aws::StringStream ss;
+ ss << EC2_SECURITY_CREDENTIALS_RESOURCE << "/" << security_credentials[0];
+ LOG_DEBUG(logger, "Calling EC2MetadataService resource {}.", ss.str());
+ return GetResource(ss.str().c_str());
+}
+
+Aws::String AWSEC2MetadataClient::awsComputeUserAgentString()
+{
+ Aws::StringStream ss;
+ ss << "aws-sdk-cpp/" << Aws::Version::GetVersionString() << " " << Aws::OSVersionInfo::ComputeOSVersionString()
+ << " " << Aws::Version::GetCompilerVersionString();
+ return ss.str();
+}
+
+Aws::String AWSEC2MetadataClient::getDefaultCredentialsSecurely() const
+{
+ String user_agent_string = awsComputeUserAgentString();
+ auto [new_token, response_code] = getEC2MetadataToken(user_agent_string);
+ if (response_code == Aws::Http::HttpResponseCode::BAD_REQUEST)
+ return {};
+ else if (response_code != Aws::Http::HttpResponseCode::OK || new_token.empty())
+ {
+ LOG_TRACE(logger, "Calling EC2MetadataService to get token failed, "
+ "falling back to less secure way. HTTP response code: {}", response_code);
+ return getDefaultCredentials();
+ }
+
+ token = std::move(new_token);
+ String url = endpoint + EC2_SECURITY_CREDENTIALS_RESOURCE;
+ std::shared_ptr<Aws::Http::HttpRequest> profile_request(Aws::Http::CreateHttpRequest(url,
+ Aws::Http::HttpMethod::HTTP_GET,
+ Aws::Utils::Stream::DefaultResponseStreamFactoryMethod));
+ profile_request->SetHeaderValue(EC2_IMDS_TOKEN_HEADER, token);
+ profile_request->SetUserAgent(user_agent_string);
+ String profile_string = GetResourceWithAWSWebServiceResult(profile_request).GetPayload();
+
+ String trimmed_profile_string = Aws::Utils::StringUtils::Trim(profile_string.c_str());
+ std::vector<String> security_credentials = Aws::Utils::StringUtils::Split(trimmed_profile_string, '\n');
+
+ LOG_DEBUG(logger, "Calling EC2MetadataService resource, {} with token returned profile string {}.",
+ EC2_SECURITY_CREDENTIALS_RESOURCE, trimmed_profile_string);
+
+ if (security_credentials.empty())
+ {
+ LOG_WARNING(logger, "Calling EC2Metadataservice to get profiles failed.");
+ return {};
+ }
+
+ Aws::StringStream ss;
+ ss << endpoint << EC2_SECURITY_CREDENTIALS_RESOURCE << "/" << security_credentials[0];
+ std::shared_ptr<Aws::Http::HttpRequest> credentials_request(Aws::Http::CreateHttpRequest(ss.str(),
+ Aws::Http::HttpMethod::HTTP_GET,
+ Aws::Utils::Stream::DefaultResponseStreamFactoryMethod));
+ credentials_request->SetHeaderValue(EC2_IMDS_TOKEN_HEADER, token);
+ credentials_request->SetUserAgent(user_agent_string);
+ LOG_DEBUG(logger, "Calling EC2MetadataService resource {} with token.", ss.str());
+ return GetResourceWithAWSWebServiceResult(credentials_request).GetPayload();
+}
+
+Aws::String AWSEC2MetadataClient::getCurrentAvailabilityZone() const
+{
+ String user_agent_string = awsComputeUserAgentString();
+ auto [new_token, response_code] = getEC2MetadataToken(user_agent_string);
+ if (response_code != Aws::Http::HttpResponseCode::OK || new_token.empty())
+ throw DB::Exception(ErrorCodes::AWS_ERROR,
+ "Failed to make token request. HTTP response code: {}", response_code);
+
+ token = std::move(new_token);
+ const String url = endpoint + EC2_AVAILABILITY_ZONE_RESOURCE;
+ std::shared_ptr<Aws::Http::HttpRequest> profile_request(
+ Aws::Http::CreateHttpRequest(url, Aws::Http::HttpMethod::HTTP_GET, Aws::Utils::Stream::DefaultResponseStreamFactoryMethod));
+
+ profile_request->SetHeaderValue(EC2_IMDS_TOKEN_HEADER, token);
+ profile_request->SetUserAgent(user_agent_string);
+
+ const auto result = GetResourceWithAWSWebServiceResult(profile_request);
+ if (result.GetResponseCode() != Aws::Http::HttpResponseCode::OK)
+ throw DB::Exception(ErrorCodes::AWS_ERROR,
+ "Failed to get availability zone. HTTP response code: {}", result.GetResponseCode());
+
+ return Aws::Utils::StringUtils::Trim(result.GetPayload().c_str());
+}
+
+std::pair<Aws::String, Aws::Http::HttpResponseCode> AWSEC2MetadataClient::getEC2MetadataToken(const std::string & user_agent_string) const
+{
+ std::lock_guard locker(token_mutex);
+
+ Aws::StringStream ss;
+ ss << endpoint << EC2_IMDS_TOKEN_RESOURCE;
+ std::shared_ptr<Aws::Http::HttpRequest> token_request(
+ Aws::Http::CreateHttpRequest(
+ ss.str(), Aws::Http::HttpMethod::HTTP_PUT,
+ Aws::Utils::Stream::DefaultResponseStreamFactoryMethod));
+ token_request->SetHeaderValue(EC2_IMDS_TOKEN_TTL_HEADER, EC2_IMDS_TOKEN_TTL_DEFAULT_VALUE);
+ token_request->SetUserAgent(user_agent_string);
+
+ LOG_TRACE(logger, "Calling EC2MetadataService to get token.");
+ const auto result = GetResourceWithAWSWebServiceResult(token_request);
+ const auto & token_string = result.GetPayload();
+ return { Aws::Utils::StringUtils::Trim(token_string.c_str()), result.GetResponseCode() };
+}
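+/// This implements the IMDSv2 flow: a PUT to the token resource with a TTL header
+/// yields a session token, which is then attached via EC2_IMDS_TOKEN_HEADER to the
+/// subsequent metadata GET requests (see getDefaultCredentialsSecurely() above).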
+
+Aws::String AWSEC2MetadataClient::getCurrentRegion() const
+{
+ return Aws::Region::AWS_GLOBAL;
+}
+
+std::shared_ptr<AWSEC2MetadataClient> InitEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration)
+{
+ Aws::String ec2_metadata_service_endpoint = Aws::Environment::GetEnv("AWS_EC2_METADATA_SERVICE_ENDPOINT");
+ auto * logger = &Poco::Logger::get("AWSEC2InstanceProfileConfigLoader");
+ if (ec2_metadata_service_endpoint.empty())
+ {
+ Aws::String ec2_metadata_service_endpoint_mode = Aws::Environment::GetEnv("AWS_EC2_METADATA_SERVICE_ENDPOINT_MODE");
+ if (ec2_metadata_service_endpoint_mode.length() == 0)
+ {
+ ec2_metadata_service_endpoint = "http://169.254.169.254"; //default to IPv4 default endpoint
+ }
+ else
+ {
+ if (ec2_metadata_service_endpoint_mode.length() == 4)
+ {
+ if (Aws::Utils::StringUtils::CaselessCompare(ec2_metadata_service_endpoint_mode.c_str(), "ipv4"))
+ {
+ ec2_metadata_service_endpoint = "http://169.254.169.254"; //default to IPv4 default endpoint
+ }
+ else if (Aws::Utils::StringUtils::CaselessCompare(ec2_metadata_service_endpoint_mode.c_str(), "ipv6"))
+ {
+ ec2_metadata_service_endpoint = "http://[fd00:ec2::254]";
+ }
+ else
+ {
+ LOG_ERROR(logger, "AWS_EC2_METADATA_SERVICE_ENDPOINT_MODE can only be set to ipv4 or ipv6, received: {}", ec2_metadata_service_endpoint_mode);
+ }
+ }
+ else
+ {
+ LOG_ERROR(logger, "AWS_EC2_METADATA_SERVICE_ENDPOINT_MODE can only be set to ipv4 or ipv6, received: {}", ec2_metadata_service_endpoint_mode);
+ }
+ }
+ }
+ LOG_INFO(logger, "Using IMDS endpoint: {}", ec2_metadata_service_endpoint);
+ return std::make_shared<AWSEC2MetadataClient>(client_configuration, ec2_metadata_service_endpoint.c_str());
+}
+
+AWSEC2InstanceProfileConfigLoader::AWSEC2InstanceProfileConfigLoader(const std::shared_ptr<AWSEC2MetadataClient> & client_, bool use_secure_pull_)
+ : client(client_)
+ , use_secure_pull(use_secure_pull_)
+ , logger(&Poco::Logger::get("AWSEC2InstanceProfileConfigLoader"))
+{
+}
+
+bool AWSEC2InstanceProfileConfigLoader::LoadInternal()
+{
+ auto credentials_str = use_secure_pull ? client->getDefaultCredentialsSecurely() : client->getDefaultCredentials();
+
+ /// See EC2InstanceProfileConfigLoader.
+ if (credentials_str.empty())
+ return false;
+
+ Aws::Utils::Json::JsonValue credentials_doc(credentials_str);
+ if (!credentials_doc.WasParseSuccessful())
+ {
+ LOG_ERROR(logger, "Failed to parse output from EC2MetadataService.");
+ return false;
+ }
+ String access_key, secret_key, token;
+
+ auto credentials_view = credentials_doc.View();
+ access_key = credentials_view.GetString("AccessKeyId");
+ LOG_TRACE(logger, "Successfully pulled credentials from EC2MetadataService with access key.");
+
+ secret_key = credentials_view.GetString("SecretAccessKey");
+ token = credentials_view.GetString("Token");
+
+ auto region = client->getCurrentRegion();
+
+ Aws::Config::Profile profile;
+ profile.SetCredentials(Aws::Auth::AWSCredentials(access_key, secret_key, token));
+ profile.SetRegion(region);
+ profile.SetName(Aws::Config::INSTANCE_PROFILE_KEY);
+
+ m_profiles[Aws::Config::INSTANCE_PROFILE_KEY] = profile;
+
+ return true;
+}
+
+AWSInstanceProfileCredentialsProvider::AWSInstanceProfileCredentialsProvider(const std::shared_ptr<AWSEC2InstanceProfileConfigLoader> & config_loader)
+ : ec2_metadata_config_loader(config_loader)
+ , load_frequency_ms(Aws::Auth::REFRESH_THRESHOLD)
+ , logger(&Poco::Logger::get("AWSInstanceProfileCredentialsProvider"))
+{
+ LOG_INFO(logger, "Creating Instance with injected EC2MetadataClient and refresh rate.");
+}
+
+Aws::Auth::AWSCredentials AWSInstanceProfileCredentialsProvider::GetAWSCredentials()
+{
+ refreshIfExpired();
+ Aws::Utils::Threading::ReaderLockGuard guard(m_reloadLock);
+ auto profile_it = ec2_metadata_config_loader->GetProfiles().find(Aws::Config::INSTANCE_PROFILE_KEY);
+
+ if (profile_it != ec2_metadata_config_loader->GetProfiles().end())
+ {
+ return profile_it->second.GetCredentials();
+ }
+
+ return Aws::Auth::AWSCredentials();
+}
+
+void AWSInstanceProfileCredentialsProvider::Reload()
+{
+ LOG_INFO(logger, "Credentials have expired attempting to repull from EC2 Metadata Service.");
+ ec2_metadata_config_loader->Load();
+ AWSCredentialsProvider::Reload();
+}
+
+void AWSInstanceProfileCredentialsProvider::refreshIfExpired()
+{
+ Aws::Utils::Threading::ReaderLockGuard guard(m_reloadLock);
+ if (!IsTimeToRefresh(load_frequency_ms))
+ {
+ return;
+ }
+
+ guard.UpgradeToWriterLock();
+ if (!IsTimeToRefresh(load_frequency_ms)) // double-checked lock to avoid refreshing twice
+ {
+ return;
+ }
+ Reload();
+}
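+/// The pattern above is a double-checked lock: the cheap expiration check runs
+/// under a reader lock, and only if a refresh looks necessary is the lock upgraded
+/// to a writer lock and the condition re-checked, so concurrent callers don't
+/// trigger duplicate reloads.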
+
+AwsAuthSTSAssumeRoleWebIdentityCredentialsProvider::AwsAuthSTSAssumeRoleWebIdentityCredentialsProvider(
+ DB::S3::PocoHTTPClientConfiguration & aws_client_configuration, uint64_t expiration_window_seconds_)
+ : logger(&Poco::Logger::get("AwsAuthSTSAssumeRoleWebIdentityCredentialsProvider"))
+ , expiration_window_seconds(expiration_window_seconds_)
+{
+ // check environment variables
+ String tmp_region = Aws::Environment::GetEnv("AWS_DEFAULT_REGION");
+ role_arn = Aws::Environment::GetEnv("AWS_ROLE_ARN");
+ token_file = Aws::Environment::GetEnv("AWS_WEB_IDENTITY_TOKEN_FILE");
+ session_name = Aws::Environment::GetEnv("AWS_ROLE_SESSION_NAME");
+
+ // Check the profile config if either role_arn or token_file was not loaded from the environment variables.
+ // The region source is not enforced, but we need it to construct the STS endpoint; if we can't find it in the environment, we should check whether it's set in the config file.
+ if (role_arn.empty() || token_file.empty() || tmp_region.empty())
+ {
+ auto profile = Aws::Config::GetCachedConfigProfile(Aws::Auth::GetConfigProfileName());
+ if (tmp_region.empty())
+ {
+ tmp_region = profile.GetRegion();
+ }
+ // If either of these two was not found in the environment, use whatever is found for all three in the config file
+ if (role_arn.empty() || token_file.empty())
+ {
+ role_arn = profile.GetRoleArn();
+ token_file = profile.GetValue("web_identity_token_file");
+ session_name = profile.GetValue("role_session_name");
+ }
+ }
+
+ if (token_file.empty())
+ {
+ LOG_WARNING(logger, "Token file must be specified to use STS AssumeRole web identity creds provider.");
+ return; // No need to do further constructing
+ }
+ else
+ {
+ LOG_DEBUG(logger, "Resolved token_file from profile_config or environment variable to be {}", token_file);
+ }
+
+ if (role_arn.empty())
+ {
+ LOG_WARNING(logger, "RoleArn must be specified to use STS AssumeRole web identity creds provider.");
+ return; // No need to do further constructing
+ }
+ else
+ {
+ LOG_DEBUG(logger, "Resolved role_arn from profile_config or environment variable to be {}", role_arn);
+ }
+
+ if (tmp_region.empty())
+ {
+ tmp_region = Aws::Region::US_EAST_1;
+ }
+ else
+ {
+ LOG_DEBUG(logger, "Resolved region from profile_config or environment variable to be {}", tmp_region);
+ }
+
+ if (session_name.empty())
+ {
+ session_name = Aws::Utils::UUID::RandomUUID();
+ }
+ else
+ {
+ LOG_DEBUG(logger, "Resolved session_name from profile_config or environment variable to be {}", session_name);
+ }
+
+ aws_client_configuration.scheme = Aws::Http::Scheme::HTTPS;
+ aws_client_configuration.region = tmp_region;
+
+ std::vector<String> retryable_errors;
+ retryable_errors.push_back("IDPCommunicationError");
+ retryable_errors.push_back("InvalidIdentityToken");
+
+ aws_client_configuration.retryStrategy = std::make_shared<Aws::Client::SpecifiedRetryableErrorsRetryStrategy>(
+ retryable_errors, /* maxRetries = */3);
+
+ client = std::make_unique<Aws::Internal::STSCredentialsClient>(aws_client_configuration);
+ initialized = true;
+ LOG_INFO(logger, "Creating STS AssumeRole with web identity creds provider.");
+}
+
+Aws::Auth::AWSCredentials AwsAuthSTSAssumeRoleWebIdentityCredentialsProvider::GetAWSCredentials()
+{
+ // A valid client means the required information, like the role ARN and token file, was constructed correctly.
+ // We can use this provider to load creds; otherwise, we just return empty creds.
+ if (!initialized)
+ {
+ return Aws::Auth::AWSCredentials();
+ }
+ refreshIfExpired();
+ Aws::Utils::Threading::ReaderLockGuard guard(m_reloadLock);
+ return credentials;
+}
+
+void AwsAuthSTSAssumeRoleWebIdentityCredentialsProvider::Reload()
+{
+ LOG_INFO(logger, "Credentials have expired, attempting to renew from STS.");
+
+ std::ifstream token_stream(token_file.data());
+ if (token_stream)
+ {
+ String token_string((std::istreambuf_iterator<char>(token_stream)), std::istreambuf_iterator<char>());
+ token = token_string;
+ }
+ else
+ {
+ LOG_INFO(logger, "Can't open token file: {}", token_file);
+ return;
+ }
+ Aws::Internal::STSCredentialsClient::STSAssumeRoleWithWebIdentityRequest request{session_name, role_arn, token};
+
+ auto result = client->GetAssumeRoleWithWebIdentityCredentials(request);
+ LOG_TRACE(logger, "Successfully retrieved credentials.");
+ credentials = result.creds;
+}
+
+void AwsAuthSTSAssumeRoleWebIdentityCredentialsProvider::refreshIfExpired()
+{
+ Aws::Utils::Threading::ReaderLockGuard guard(m_reloadLock);
+ if (!areCredentialsEmptyOrExpired(credentials, expiration_window_seconds))
+ return;
+
+ guard.UpgradeToWriterLock();
+ if (!areCredentialsEmptyOrExpired(credentials, expiration_window_seconds)) // double-checked lock to avoid refreshing twice
+ return;
+
+ Reload();
+}
+
+S3CredentialsProviderChain::S3CredentialsProviderChain(
+ const DB::S3::PocoHTTPClientConfiguration & configuration,
+ const Aws::Auth::AWSCredentials & credentials,
+ CredentialsConfiguration credentials_configuration)
+{
+ auto * logger = &Poco::Logger::get("S3CredentialsProviderChain");
+
+ /// we don't provide any credentials to avoid signing
+ if (credentials_configuration.no_sign_request)
+ return;
+
+ /// add explicit credentials to the front of the chain
+ /// because it's manually defined by the user
+ if (!credentials.IsEmpty())
+ {
+ AddProvider(std::make_shared<Aws::Auth::SimpleAWSCredentialsProvider>(credentials));
+ return;
+ }
+
+ if (credentials_configuration.use_environment_credentials)
+ {
+ static const char AWS_ECS_CONTAINER_CREDENTIALS_RELATIVE_URI[] = "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI";
+ static const char AWS_ECS_CONTAINER_CREDENTIALS_FULL_URI[] = "AWS_CONTAINER_CREDENTIALS_FULL_URI";
+ static const char AWS_ECS_CONTAINER_AUTHORIZATION_TOKEN[] = "AWS_CONTAINER_AUTHORIZATION_TOKEN";
+ static const char AWS_EC2_METADATA_DISABLED[] = "AWS_EC2_METADATA_DISABLED";
+
+ /// The only difference from DefaultAWSCredentialsProviderChain::DefaultAWSCredentialsProviderChain()
+ /// is that this chain uses a custom ClientConfiguration. We also removed the process provider because it's useless in our case.
+ ///
+ /// The AWS API tries credentials providers one by one. Some providers (like ProfileConfigFileAWSCredentialsProvider) can be
+ /// quite verbose even if nobody configured them. So we use our provider first, and only after it the default providers.
+ {
+ DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(
+ configuration.region,
+ configuration.remote_host_filter,
+ configuration.s3_max_redirects,
+ configuration.enable_s3_requests_logging,
+ configuration.for_disk_s3,
+ configuration.get_request_throttler,
+ configuration.put_request_throttler);
+ AddProvider(std::make_shared<AwsAuthSTSAssumeRoleWebIdentityCredentialsProvider>(aws_client_configuration, credentials_configuration.expiration_window_seconds));
+ }
+
+ AddProvider(std::make_shared<Aws::Auth::EnvironmentAWSCredentialsProvider>());
+
+        /// ECS TaskRole credentials are only available when the corresponding environment variable is set.
+ const auto relative_uri = Aws::Environment::GetEnv(AWS_ECS_CONTAINER_CREDENTIALS_RELATIVE_URI);
+ LOG_DEBUG(logger, "The environment variable value {} is {}", AWS_ECS_CONTAINER_CREDENTIALS_RELATIVE_URI,
+ relative_uri);
+
+ const auto absolute_uri = Aws::Environment::GetEnv(AWS_ECS_CONTAINER_CREDENTIALS_FULL_URI);
+ LOG_DEBUG(logger, "The environment variable value {} is {}", AWS_ECS_CONTAINER_CREDENTIALS_FULL_URI,
+ absolute_uri);
+
+ const auto ec2_metadata_disabled = Aws::Environment::GetEnv(AWS_EC2_METADATA_DISABLED);
+ LOG_DEBUG(logger, "The environment variable value {} is {}", AWS_EC2_METADATA_DISABLED,
+ ec2_metadata_disabled);
+
+ if (!relative_uri.empty())
+ {
+ AddProvider(std::make_shared<Aws::Auth::TaskRoleCredentialsProvider>(relative_uri.c_str()));
+ LOG_INFO(logger, "Added ECS metadata service credentials provider with relative path: [{}] to the provider chain.",
+ relative_uri);
+ }
+ else if (!absolute_uri.empty())
+ {
+ const auto token = Aws::Environment::GetEnv(AWS_ECS_CONTAINER_AUTHORIZATION_TOKEN);
+ AddProvider(std::make_shared<Aws::Auth::TaskRoleCredentialsProvider>(absolute_uri.c_str(), token.c_str()));
+
+ /// DO NOT log the value of the authorization token for security purposes.
+ LOG_INFO(logger, "Added ECS credentials provider with URI: [{}] to the provider chain with a{} authorization token.",
+ absolute_uri, token.empty() ? "n empty" : " non-empty");
+ }
+ else if (Aws::Utils::StringUtils::ToLower(ec2_metadata_disabled.c_str()) != "true")
+ {
+ DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(
+ configuration.region,
+ configuration.remote_host_filter,
+ configuration.s3_max_redirects,
+ configuration.enable_s3_requests_logging,
+ configuration.for_disk_s3,
+ configuration.get_request_throttler,
+ configuration.put_request_throttler,
+ Aws::Http::SchemeMapper::ToString(Aws::Http::Scheme::HTTP));
+
+ /// See MakeDefaultHttpResourceClientConfiguration().
+ /// This is part of EC2 metadata client, but unfortunately it can't be accessed from outside
+ /// of contrib/aws/aws-cpp-sdk-core/source/internal/AWSHttpResourceClient.cpp
+ aws_client_configuration.maxConnections = 2;
+
+ /// Explicitly set the proxy settings to empty/zero to avoid relying on defaults that could potentially change
+ /// in the future.
+ aws_client_configuration.proxyHost = "";
+ aws_client_configuration.proxyUserName = "";
+ aws_client_configuration.proxyPassword = "";
+ aws_client_configuration.proxyPort = 0;
+
+            /// EC2MetadataService throttles by delaying the response, so the service client should set a large read timeout.
+            /// EC2MetadataService delay is on the order of seconds, so it only makes sense to retry after a couple of seconds.
+ aws_client_configuration.connectTimeoutMs = 1000;
+ aws_client_configuration.requestTimeoutMs = 1000;
+
+ aws_client_configuration.retryStrategy = std::make_shared<Aws::Client::DefaultRetryStrategy>(1, 1000);
+
+ auto ec2_metadata_client = InitEC2MetadataClient(aws_client_configuration);
+ auto config_loader = std::make_shared<AWSEC2InstanceProfileConfigLoader>(ec2_metadata_client, !credentials_configuration.use_insecure_imds_request);
+
+ AddProvider(std::make_shared<AWSInstanceProfileCredentialsProvider>(config_loader));
+ LOG_INFO(logger, "Added EC2 metadata service credentials provider to the provider chain.");
+ }
+ }
+
+    /// Quite verbose provider (complains if the file with credentials doesn't exist), so it's the last one
+    /// in the chain.
+ AddProvider(std::make_shared<Aws::Auth::ProfileConfigFileAWSCredentialsProvider>());
+}
+
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/Credentials.h b/contrib/clickhouse/src/IO/S3/Credentials.h
new file mode 100644
index 0000000000..429941cd84
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/Credentials.h
@@ -0,0 +1,146 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+# include <aws/core/client/ClientConfiguration.h>
+# include <aws/core/internal/AWSHttpResourceClient.h>
+# include <aws/core/config/AWSProfileConfigLoader.h>
+# include <aws/core/auth/AWSCredentialsProviderChain.h>
+
+# include <IO/S3/PocoHTTPClient.h>
+
+
+namespace DB::S3
+{
+
+inline static constexpr uint64_t DEFAULT_EXPIRATION_WINDOW_SECONDS = 120;
+
+class AWSEC2MetadataClient : public Aws::Internal::AWSHttpResourceClient
+{
+ static constexpr char EC2_SECURITY_CREDENTIALS_RESOURCE[] = "/latest/meta-data/iam/security-credentials";
+ static constexpr char EC2_AVAILABILITY_ZONE_RESOURCE[] = "/latest/meta-data/placement/availability-zone";
+ static constexpr char EC2_IMDS_TOKEN_RESOURCE[] = "/latest/api/token";
+ static constexpr char EC2_IMDS_TOKEN_HEADER[] = "x-aws-ec2-metadata-token";
+ static constexpr char EC2_IMDS_TOKEN_TTL_DEFAULT_VALUE[] = "21600";
+ static constexpr char EC2_IMDS_TOKEN_TTL_HEADER[] = "x-aws-ec2-metadata-token-ttl-seconds";
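+    /// IMDSv2 flow (for reference): a PUT to EC2_IMDS_TOKEN_RESOURCE carrying EC2_IMDS_TOKEN_TTL_HEADER
+    /// returns a session token, which is then sent in EC2_IMDS_TOKEN_HEADER on subsequent metadata GETs.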
+
+public:
+ /// See EC2MetadataClient.
+
+ explicit AWSEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration, const char * endpoint_);
+
+ AWSEC2MetadataClient& operator =(const AWSEC2MetadataClient & rhs) = delete;
+ AWSEC2MetadataClient(const AWSEC2MetadataClient & rhs) = delete;
+ AWSEC2MetadataClient& operator =(const AWSEC2MetadataClient && rhs) = delete;
+ AWSEC2MetadataClient(const AWSEC2MetadataClient && rhs) = delete;
+
+ ~AWSEC2MetadataClient() override = default;
+
+ using Aws::Internal::AWSHttpResourceClient::GetResource;
+
+ virtual Aws::String GetResource(const char * resource_path) const;
+ virtual Aws::String getDefaultCredentials() const;
+
+ static Aws::String awsComputeUserAgentString();
+
+ virtual Aws::String getDefaultCredentialsSecurely() const;
+
+ virtual Aws::String getCurrentRegion() const;
+
+ virtual Aws::String getCurrentAvailabilityZone() const;
+
+private:
+ std::pair<Aws::String, Aws::Http::HttpResponseCode> getEC2MetadataToken(const std::string & user_agent_string) const;
+
+ const Aws::String endpoint;
+ mutable std::recursive_mutex token_mutex;
+ mutable Aws::String token;
+ Poco::Logger * logger;
+};
+
+std::shared_ptr<AWSEC2MetadataClient> InitEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration);
+
+class AWSEC2InstanceProfileConfigLoader : public Aws::Config::AWSProfileConfigLoader
+{
+public:
+ explicit AWSEC2InstanceProfileConfigLoader(const std::shared_ptr<AWSEC2MetadataClient> & client_, bool use_secure_pull_);
+
+ ~AWSEC2InstanceProfileConfigLoader() override = default;
+
+protected:
+ bool LoadInternal() override;
+
+private:
+ std::shared_ptr<AWSEC2MetadataClient> client;
+ bool use_secure_pull;
+ Poco::Logger * logger;
+};
+
+class AWSInstanceProfileCredentialsProvider : public Aws::Auth::AWSCredentialsProvider
+{
+public:
+ /// See InstanceProfileCredentialsProvider.
+
+ explicit AWSInstanceProfileCredentialsProvider(const std::shared_ptr<AWSEC2InstanceProfileConfigLoader> & config_loader);
+
+ Aws::Auth::AWSCredentials GetAWSCredentials() override;
+protected:
+ void Reload() override;
+
+private:
+ void refreshIfExpired();
+
+ std::shared_ptr<AWSEC2InstanceProfileConfigLoader> ec2_metadata_config_loader;
+ Int64 load_frequency_ms;
+ Poco::Logger * logger;
+};
+
+class AwsAuthSTSAssumeRoleWebIdentityCredentialsProvider : public Aws::Auth::AWSCredentialsProvider
+{
+ /// See STSAssumeRoleWebIdentityCredentialsProvider.
+
+public:
+ explicit AwsAuthSTSAssumeRoleWebIdentityCredentialsProvider(
+ DB::S3::PocoHTTPClientConfiguration & aws_client_configuration, uint64_t expiration_window_seconds_);
+
+ Aws::Auth::AWSCredentials GetAWSCredentials() override;
+
+protected:
+ void Reload() override;
+
+private:
+ void refreshIfExpired();
+
+ std::unique_ptr<Aws::Internal::STSCredentialsClient> client;
+ Aws::Auth::AWSCredentials credentials;
+ Aws::String role_arn;
+ Aws::String token_file;
+ Aws::String session_name;
+ Aws::String token;
+ bool initialized = false;
+ Poco::Logger * logger;
+ uint64_t expiration_window_seconds;
+};
+
+struct CredentialsConfiguration
+{
+ bool use_environment_credentials = false;
+ bool use_insecure_imds_request = false;
+ uint64_t expiration_window_seconds = DEFAULT_EXPIRATION_WINDOW_SECONDS;
+ bool no_sign_request = false;
+};
+
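+/// Illustrative usage (names and values are hypothetical):
+///     CredentialsConfiguration credentials_configuration{.use_environment_credentials = true};
+///     S3CredentialsProviderChain chain(poco_configuration, Aws::Auth::AWSCredentials{}, credentials_configuration);
+///     Aws::Auth::AWSCredentials credentials = chain.GetAWSCredentials();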
+class S3CredentialsProviderChain : public Aws::Auth::AWSCredentialsProviderChain
+{
+public:
+ S3CredentialsProviderChain(
+ const DB::S3::PocoHTTPClientConfiguration & configuration,
+ const Aws::Auth::AWSCredentials & credentials,
+ CredentialsConfiguration credentials_configuration);
+};
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/PocoHTTPClient.cpp b/contrib/clickhouse/src/IO/S3/PocoHTTPClient.cpp
new file mode 100644
index 0000000000..a61f88c4af
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/PocoHTTPClient.cpp
@@ -0,0 +1,558 @@
+#include <Poco/Timespan.h>
+#include "Common/DNSResolver.h"
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include "PocoHTTPClient.h"
+
+#include <utility>
+#include <algorithm>
+#include <functional>
+
+#include <Common/logger_useful.h>
+#include <Common/Stopwatch.h>
+#include <Common/Throttler.h>
+#include <IO/HTTPCommon.h>
+#include <IO/WriteBufferFromString.h>
+#include <IO/Operators.h>
+#include <IO/S3/ProviderType.h>
+
+#include <aws/core/http/HttpRequest.h>
+#include <aws/core/http/HttpResponse.h>
+#include <aws/core/utils/xml/XmlSerializer.h>
+#include <aws/core/monitoring/HttpClientMetrics.h>
+#include <aws/core/utils/ratelimiter/RateLimiterInterface.h>
+#include "Poco/StreamCopier.h"
+#include <Poco/Net/HTTPRequest.h>
+#include <Poco/Net/HTTPResponse.h>
+#include <re2/re2.h>
+
+#include <boost/algorithm/string.hpp>
+
+static const int SUCCESS_RESPONSE_MIN = 200;
+static const int SUCCESS_RESPONSE_MAX = 299;
+
+namespace ProfileEvents
+{
+ extern const Event S3ReadMicroseconds;
+ extern const Event S3ReadRequestsCount;
+ extern const Event S3ReadRequestsErrors;
+ extern const Event S3ReadRequestsThrottling;
+ extern const Event S3ReadRequestsRedirects;
+
+ extern const Event S3WriteMicroseconds;
+ extern const Event S3WriteRequestsCount;
+ extern const Event S3WriteRequestsErrors;
+ extern const Event S3WriteRequestsThrottling;
+ extern const Event S3WriteRequestsRedirects;
+
+ extern const Event DiskS3ReadMicroseconds;
+ extern const Event DiskS3ReadRequestsCount;
+ extern const Event DiskS3ReadRequestsErrors;
+ extern const Event DiskS3ReadRequestsThrottling;
+ extern const Event DiskS3ReadRequestsRedirects;
+
+ extern const Event DiskS3WriteMicroseconds;
+ extern const Event DiskS3WriteRequestsCount;
+ extern const Event DiskS3WriteRequestsErrors;
+ extern const Event DiskS3WriteRequestsThrottling;
+ extern const Event DiskS3WriteRequestsRedirects;
+
+ extern const Event S3GetRequestThrottlerCount;
+ extern const Event S3GetRequestThrottlerSleepMicroseconds;
+ extern const Event S3PutRequestThrottlerCount;
+ extern const Event S3PutRequestThrottlerSleepMicroseconds;
+
+ extern const Event DiskS3GetRequestThrottlerCount;
+ extern const Event DiskS3GetRequestThrottlerSleepMicroseconds;
+ extern const Event DiskS3PutRequestThrottlerCount;
+ extern const Event DiskS3PutRequestThrottlerSleepMicroseconds;
+}
+
+namespace CurrentMetrics
+{
+ extern const Metric S3Requests;
+}
+
+namespace DB::ErrorCodes
+{
+ extern const int NOT_IMPLEMENTED;
+ extern const int TOO_MANY_REDIRECTS;
+}
+
+namespace DB::S3
+{
+
+PocoHTTPClientConfiguration::PocoHTTPClientConfiguration(
+ std::function<DB::ProxyConfiguration()> per_request_configuration_,
+ const String & force_region_,
+ const RemoteHostFilter & remote_host_filter_,
+ unsigned int s3_max_redirects_,
+ bool enable_s3_requests_logging_,
+ bool for_disk_s3_,
+ const ThrottlerPtr & get_request_throttler_,
+ const ThrottlerPtr & put_request_throttler_,
+ std::function<void(const DB::ProxyConfiguration &)> error_report_)
+ : per_request_configuration(per_request_configuration_)
+ , force_region(force_region_)
+ , remote_host_filter(remote_host_filter_)
+ , s3_max_redirects(s3_max_redirects_)
+ , enable_s3_requests_logging(enable_s3_requests_logging_)
+ , for_disk_s3(for_disk_s3_)
+ , get_request_throttler(get_request_throttler_)
+ , put_request_throttler(put_request_throttler_)
+ , error_report(error_report_)
+{
+}
+
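+/// E.g. (illustrative) an endpointOverride of "https://s3.us-west-2.amazonaws.com"
+/// yields region "us-west-2" when force_region is empty.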
+void PocoHTTPClientConfiguration::updateSchemeAndRegion()
+{
+ if (!endpointOverride.empty())
+ {
+ static const RE2 region_pattern(R"(^s3[.\-]([a-z0-9\-]+)\.amazonaws\.)");
+ Poco::URI uri(endpointOverride);
+ if (uri.getScheme() == "http")
+ scheme = Aws::Http::Scheme::HTTP;
+
+ if (force_region.empty())
+ {
+ String matched_region;
+ if (re2::RE2::PartialMatch(uri.getHost(), region_pattern, &matched_region))
+ {
+ boost::algorithm::to_lower(matched_region);
+ region = matched_region;
+ }
+ else
+ {
+                /// In global mode the AWS C++ SDK sends `us-east-1` but accepts switching to another region if suggested.
+ region = Aws::Region::AWS_GLOBAL;
+ }
+ }
+ else
+ {
+ region = force_region;
+ }
+ }
+}
+
+
+PocoHTTPClient::PocoHTTPClient(const PocoHTTPClientConfiguration & client_configuration)
+ : per_request_configuration(client_configuration.per_request_configuration)
+ , error_report(client_configuration.error_report)
+ , timeouts(ConnectionTimeouts(
+ Poco::Timespan(client_configuration.connectTimeoutMs * 1000), /// connection timeout.
+ Poco::Timespan(client_configuration.requestTimeoutMs * 1000), /// send timeout.
+ Poco::Timespan(client_configuration.requestTimeoutMs * 1000), /// receive timeout.
+ Poco::Timespan(client_configuration.enableTcpKeepAlive ? client_configuration.tcpKeepAliveIntervalMs * 1000 : 0),
+        Poco::Timespan(client_configuration.http_keep_alive_timeout_ms * 1000))) /// the flag indicating whether keep-alive is enabled is set on each session upon creation
+ , remote_host_filter(client_configuration.remote_host_filter)
+ , s3_max_redirects(client_configuration.s3_max_redirects)
+ , enable_s3_requests_logging(client_configuration.enable_s3_requests_logging)
+ , for_disk_s3(client_configuration.for_disk_s3)
+ , get_request_throttler(client_configuration.get_request_throttler)
+ , put_request_throttler(client_configuration.put_request_throttler)
+ , extra_headers(client_configuration.extra_headers)
+ , http_connection_pool_size(client_configuration.http_connection_pool_size)
+ , wait_on_pool_size_limit(client_configuration.wait_on_pool_size_limit)
+{
+}
+
+std::shared_ptr<Aws::Http::HttpResponse> PocoHTTPClient::MakeRequest(
+ const std::shared_ptr<Aws::Http::HttpRequest> & request,
+ Aws::Utils::RateLimits::RateLimiterInterface * readLimiter,
+ Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const
+{
+ try
+ {
+ auto response = Aws::MakeShared<PocoHTTPResponse>("PocoHTTPClient", request);
+ makeRequestInternal(*request, response, readLimiter, writeLimiter);
+ return response;
+ }
+ catch (const Exception &)
+ {
+ throw;
+ }
+ catch (const Poco::Exception & e)
+ {
+ throw Exception(Exception::CreateFromPocoTag{}, e);
+ }
+ catch (const std::exception & e)
+ {
+ throw Exception(Exception::CreateFromSTDTag{}, e);
+ }
+ catch (...)
+ {
+ tryLogCurrentException(__PRETTY_FUNCTION__);
+ throw;
+ }
+}
+
+namespace
+{
+ /// No comments:
+ /// 1) https://aws.amazon.com/premiumsupport/knowledge-center/s3-resolve-200-internalerror/
+ /// 2) https://github.com/aws/aws-sdk-cpp/issues/658
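+    /// An illustrative body of such a "successful" response that actually carries an error (shape only):
+    ///     <Error><Code>InternalError</Code><Message>We encountered an internal error. Please try again.</Message></Error>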
+ bool checkRequestCanReturn2xxAndErrorInBody(Aws::Http::HttpRequest & request)
+ {
+ auto query_params = request.GetQueryStringParameters();
+ if (request.HasHeader("x-amz-copy-source") || request.HasHeader("x-goog-copy-source"))
+ {
+ /// CopyObject https://docs.aws.amazon.com/AmazonS3/latest/API/API_CopyObject.html
+ if (query_params.empty())
+ return true;
+
+ /// UploadPartCopy https://docs.aws.amazon.com/AmazonS3/latest/API/API_UploadPartCopy.html
+ if (query_params.contains("partNumber") && query_params.contains("uploadId"))
+ return true;
+
+ }
+ else
+ {
+ /// CompleteMultipartUpload https://docs.aws.amazon.com/AmazonS3/latest/API/API_CompleteMultipartUpload.html
+ if (query_params.size() == 1 && query_params.contains("uploadId"))
+ return true;
+ }
+
+ return false;
+ }
+}
+
+PocoHTTPClient::S3MetricKind PocoHTTPClient::getMetricKind(const Aws::Http::HttpRequest & request)
+{
+ switch (request.GetMethod())
+ {
+ case Aws::Http::HttpMethod::HTTP_GET:
+ case Aws::Http::HttpMethod::HTTP_HEAD:
+ return S3MetricKind::Read;
+ case Aws::Http::HttpMethod::HTTP_POST:
+ case Aws::Http::HttpMethod::HTTP_DELETE:
+ case Aws::Http::HttpMethod::HTTP_PUT:
+ case Aws::Http::HttpMethod::HTTP_PATCH:
+ return S3MetricKind::Write;
+ }
+ throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported request method");
+}
+
+void PocoHTTPClient::addMetric(const Aws::Http::HttpRequest & request, S3MetricType type, ProfileEvents::Count amount) const
+{
+ const ProfileEvents::Event events_map[static_cast<size_t>(S3MetricType::EnumSize)][static_cast<size_t>(S3MetricKind::EnumSize)] = {
+ {ProfileEvents::S3ReadMicroseconds, ProfileEvents::S3WriteMicroseconds},
+ {ProfileEvents::S3ReadRequestsCount, ProfileEvents::S3WriteRequestsCount},
+ {ProfileEvents::S3ReadRequestsErrors, ProfileEvents::S3WriteRequestsErrors},
+ {ProfileEvents::S3ReadRequestsThrottling, ProfileEvents::S3WriteRequestsThrottling},
+ {ProfileEvents::S3ReadRequestsRedirects, ProfileEvents::S3WriteRequestsRedirects},
+ };
+
+ const ProfileEvents::Event disk_s3_events_map[static_cast<size_t>(S3MetricType::EnumSize)][static_cast<size_t>(S3MetricKind::EnumSize)] = {
+ {ProfileEvents::DiskS3ReadMicroseconds, ProfileEvents::DiskS3WriteMicroseconds},
+ {ProfileEvents::DiskS3ReadRequestsCount, ProfileEvents::DiskS3WriteRequestsCount},
+ {ProfileEvents::DiskS3ReadRequestsErrors, ProfileEvents::DiskS3WriteRequestsErrors},
+ {ProfileEvents::DiskS3ReadRequestsThrottling, ProfileEvents::DiskS3WriteRequestsThrottling},
+ {ProfileEvents::DiskS3ReadRequestsRedirects, ProfileEvents::DiskS3WriteRequestsRedirects},
+ };
+
+ S3MetricKind kind = getMetricKind(request);
+
+ ProfileEvents::increment(events_map[static_cast<unsigned int>(type)][static_cast<unsigned int>(kind)], amount);
+ if (for_disk_s3)
+ ProfileEvents::increment(disk_s3_events_map[static_cast<unsigned int>(type)][static_cast<unsigned int>(kind)], amount);
+}
+
+void PocoHTTPClient::makeRequestInternal(
+ Aws::Http::HttpRequest & request,
+ std::shared_ptr<PocoHTTPResponse> & response,
+ Aws::Utils::RateLimits::RateLimiterInterface * readLimiter,
+ Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const
+{
+    /// Most sessions in the pool are already connected, and it is not possible to set a proxy host/port on an already connected session.
+ const auto request_configuration = per_request_configuration();
+ if (http_connection_pool_size && request_configuration.host.empty())
+ makeRequestInternalImpl<true>(request, request_configuration, response, readLimiter, writeLimiter);
+ else
+ makeRequestInternalImpl<false>(request, request_configuration, response, readLimiter, writeLimiter);
+}
+
+template <bool pooled>
+void PocoHTTPClient::makeRequestInternalImpl(
+ Aws::Http::HttpRequest & request,
+ const DB::ProxyConfiguration & request_configuration,
+ std::shared_ptr<PocoHTTPResponse> & response,
+ Aws::Utils::RateLimits::RateLimiterInterface *,
+ Aws::Utils::RateLimits::RateLimiterInterface *) const
+{
+ using SessionPtr = std::conditional_t<pooled, PooledHTTPSessionPtr, HTTPSessionPtr>;
+
+ Poco::Logger * log = &Poco::Logger::get("AWSClient");
+
+ auto uri = request.GetUri().GetURIString();
+
+ if (enable_s3_requests_logging)
+ LOG_TEST(log, "Make request to: {}", uri);
+
+ switch (request.GetMethod())
+ {
+ case Aws::Http::HttpMethod::HTTP_GET:
+ case Aws::Http::HttpMethod::HTTP_HEAD:
+ if (get_request_throttler)
+ {
+ UInt64 sleep_us = get_request_throttler->add(1, ProfileEvents::S3GetRequestThrottlerCount, ProfileEvents::S3GetRequestThrottlerSleepMicroseconds);
+ if (for_disk_s3)
+ {
+ ProfileEvents::increment(ProfileEvents::DiskS3GetRequestThrottlerCount);
+ ProfileEvents::increment(ProfileEvents::DiskS3GetRequestThrottlerSleepMicroseconds, sleep_us);
+ }
+ }
+ break;
+ case Aws::Http::HttpMethod::HTTP_PUT:
+ case Aws::Http::HttpMethod::HTTP_POST:
+ case Aws::Http::HttpMethod::HTTP_PATCH:
+ if (put_request_throttler)
+ {
+ UInt64 sleep_us = put_request_throttler->add(1, ProfileEvents::S3PutRequestThrottlerCount, ProfileEvents::S3PutRequestThrottlerSleepMicroseconds);
+ if (for_disk_s3)
+ {
+ ProfileEvents::increment(ProfileEvents::DiskS3PutRequestThrottlerCount);
+ ProfileEvents::increment(ProfileEvents::DiskS3PutRequestThrottlerSleepMicroseconds, sleep_us);
+ }
+ }
+ break;
+ case Aws::Http::HttpMethod::HTTP_DELETE:
+ break; // Not throttled
+ }
+
+ addMetric(request, S3MetricType::Count);
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::S3Requests};
+
+ try
+ {
+ for (unsigned int attempt = 0; attempt <= s3_max_redirects; ++attempt)
+ {
+ Poco::URI target_uri(uri);
+ SessionPtr session;
+
+ if (!request_configuration.host.empty())
+ {
+ if (enable_s3_requests_logging)
+ LOG_TEST(log, "Due to reverse proxy host name ({}) won't be resolved on ClickHouse side", uri);
+
+                /// A reverse proxy can replace the host header with the resolved IP address instead of the host name.
+                /// This can lead to a request signature mismatch on the S3 side.
+ if constexpr (pooled)
+ session = makePooledHTTPSession(
+ target_uri, timeouts, http_connection_pool_size, wait_on_pool_size_limit);
+ else
+ session = makeHTTPSession(target_uri, timeouts);
+                /// use_tunnel would be passed to session->setProxy(); it is commented out together with the disabled proxy setup below.
+                // bool use_tunnel = request_configuration.protocol == DB::ProxyConfiguration::Protocol::HTTP && target_uri.getScheme() == "https";
+
+ // session->setProxy(
+ // request_configuration.proxy_host,
+ // request_configuration.proxy_port
+ // );
+ }
+ else
+ {
+ if constexpr (pooled)
+ session = makePooledHTTPSession(
+ target_uri, timeouts, http_connection_pool_size, wait_on_pool_size_limit);
+ else
+ session = makeHTTPSession(target_uri, timeouts);
+ }
+
+ /// In case of error this address will be written to logs
+ // request.SetResolvedRemoteHost(session->getResolvedAddress());
+
+ Poco::Net::HTTPRequest poco_request(Poco::Net::HTTPRequest::HTTP_1_1);
+
+ /** According to RFC-2616, Request-URI is allowed to be encoded.
+ * However, there is no clear agreement on which exact symbols must be encoded.
+ * Effectively, `Poco::URI` chooses smaller subset of characters to encode,
+ * whereas Amazon S3 and Google Cloud Storage expects another one.
+ * In order to successfully execute a request, a path must be exact representation
+ * of decoded path used by `AWSAuthSigner`.
+ * Therefore we shall encode some symbols "manually" to fit the signatures.
+ */
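+            /// E.g. (illustrative) a decoded path "/bucket/key with spaces" is re-encoded
+            /// to "/bucket/key%20with%20spaces" so it matches the path the signer used.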
+
+ std::string path_and_query;
+ const std::string & query = target_uri.getRawQuery();
+ const std::string reserved = "?#:;+@&=%"; /// Poco::URI::RESERVED_QUERY_PARAM without '/' plus percent sign.
+ Poco::URI::encode(target_uri.getPath(), reserved, path_and_query);
+
+ if (!query.empty())
+ {
+ path_and_query += '?';
+ path_and_query += query;
+ }
+
+ /// `target_uri.getPath()` could return an empty string, but a proper HTTP request must
+ /// always contain a non-empty URI in its first line (e.g. "POST / HTTP/1.1").
+ if (path_and_query.empty())
+ path_and_query = "/";
+
+ poco_request.setURI(path_and_query);
+
+ switch (request.GetMethod())
+ {
+ case Aws::Http::HttpMethod::HTTP_GET:
+ poco_request.setMethod(Poco::Net::HTTPRequest::HTTP_GET);
+ break;
+ case Aws::Http::HttpMethod::HTTP_POST:
+ poco_request.setMethod(Poco::Net::HTTPRequest::HTTP_POST);
+ break;
+ case Aws::Http::HttpMethod::HTTP_DELETE:
+ poco_request.setMethod(Poco::Net::HTTPRequest::HTTP_DELETE);
+ break;
+ case Aws::Http::HttpMethod::HTTP_PUT:
+ poco_request.setMethod(Poco::Net::HTTPRequest::HTTP_PUT);
+ break;
+ case Aws::Http::HttpMethod::HTTP_HEAD:
+ poco_request.setMethod(Poco::Net::HTTPRequest::HTTP_HEAD);
+ break;
+ case Aws::Http::HttpMethod::HTTP_PATCH:
+ poco_request.setMethod(Poco::Net::HTTPRequest::HTTP_PATCH);
+ break;
+ }
+
+ /// Headers coming from SDK are lower-cased.
+ for (const auto & [header_name, header_value] : request.GetHeaders())
+ poco_request.set(header_name, header_value);
+ for (const auto & [header_name, header_value] : extra_headers)
+ poco_request.set(boost::algorithm::to_lower_copy(header_name), header_value);
+
+ Poco::Net::HTTPResponse poco_response;
+
+ Stopwatch watch;
+
+ auto & request_body_stream = session->sendRequest(poco_request);
+
+ if (request.GetContentBody())
+ {
+ if (enable_s3_requests_logging)
+ LOG_TEST(log, "Writing request body.");
+
+ /// Rewind content body buffer.
+ /// NOTE: we should do that always (even if `attempt == 0`) because the same request can be retried also by AWS,
+ /// see retryStrategy in Aws::Client::ClientConfiguration.
+ request.GetContentBody()->clear();
+ request.GetContentBody()->seekg(0);
+
+ auto size = Poco::StreamCopier::copyStream(*request.GetContentBody(), request_body_stream);
+ if (enable_s3_requests_logging)
+ LOG_TEST(log, "Written {} bytes to request body", size);
+ }
+
+ if (enable_s3_requests_logging)
+ LOG_TEST(log, "Receiving response...");
+ auto & response_body_stream = session->receiveResponse(poco_response);
+
+ watch.stop();
+ addMetric(request, S3MetricType::Microseconds, watch.elapsedMicroseconds());
+
+ int status_code = static_cast<int>(poco_response.getStatus());
+
+ if (status_code >= SUCCESS_RESPONSE_MIN && status_code <= SUCCESS_RESPONSE_MAX)
+ {
+ if (enable_s3_requests_logging)
+ LOG_TEST(log, "Response status: {}, {}", status_code, poco_response.getReason());
+ }
+ else
+ {
+ /// Error statuses are more important so we show them even if `enable_s3_requests_logging == false`.
+ LOG_INFO(log, "Response status: {}, {}", status_code, poco_response.getReason());
+ }
+
+ if (poco_response.getStatus() == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT)
+ {
+ auto location = poco_response.get("location");
+ remote_host_filter.checkURL(Poco::URI(location));
+ uri = location;
+ if (enable_s3_requests_logging)
+ LOG_TEST(log, "Redirecting request to new location: {}", location);
+
+ addMetric(request, S3MetricType::Redirects);
+
+ continue;
+ }
+
+ response->SetResponseCode(static_cast<Aws::Http::HttpResponseCode>(status_code));
+ response->SetContentType(poco_response.getContentType());
+
+ if (enable_s3_requests_logging)
+ {
+ WriteBufferFromOwnString headers_ss;
+ for (const auto & [header_name, header_value] : poco_response)
+ {
+ response->AddHeader(header_name, header_value);
+ headers_ss << header_name << ": " << header_value << "; ";
+ }
+ LOG_TEST(log, "Received headers: {}", headers_ss.str());
+ }
+ else
+ {
+ for (const auto & [header_name, header_value] : poco_response)
+ response->AddHeader(header_name, header_value);
+ }
+
+            /// The request is successful, but for some special requests the body can still contain an actual error message
+ if (status_code >= SUCCESS_RESPONSE_MIN && status_code <= SUCCESS_RESPONSE_MAX && checkRequestCanReturn2xxAndErrorInBody(request))
+ {
+ std::string response_string((std::istreambuf_iterator<char>(response_body_stream)),
+ std::istreambuf_iterator<char>());
+
+                /// Trim the string so the log line is not too long
+ LOG_TRACE(log, "Got dangerous response with successful code {}, checking its body: '{}'", status_code, response_string.substr(0, 300));
+ const static std::string_view needle = "<Error>";
+ if (auto it = std::search(response_string.begin(), response_string.end(), std::default_searcher(needle.begin(), needle.end())); it != response_string.end())
+ {
+ LOG_WARNING(log, "Response for request contain <Error> tag in body, settings internal server error (500 code)");
+ response->SetResponseCode(Aws::Http::HttpResponseCode::INTERNAL_SERVER_ERROR);
+
+ addMetric(request, S3MetricType::Errors);
+ if (error_report)
+ error_report(request_configuration);
+
+ }
+
+ /// Set response from string
+ response->SetResponseBody(response_string);
+ }
+ else
+ {
+
+ if (status_code == 429 || status_code == 503)
+ { // API throttling
+ addMetric(request, S3MetricType::Throttling);
+ }
+ else if (status_code >= 300)
+ {
+ addMetric(request, S3MetricType::Errors);
+ if (status_code >= 500 && error_report)
+ error_report(request_configuration);
+ }
+ response->SetResponseBody(response_body_stream, session);
+ }
+
+ return;
+ }
+ throw Exception(ErrorCodes::TOO_MANY_REDIRECTS, "Too many redirects while trying to access {}", request.GetUri().GetURIString());
+ }
+ catch (...)
+ {
+ auto error_message = getCurrentExceptionMessageAndPattern(/* with_stacktrace */ true);
+ error_message.text = fmt::format("Failed to make request to: {}: {}", uri, error_message.text);
+ LOG_INFO(log, error_message);
+
+ response->SetClientErrorType(Aws::Client::CoreErrors::NETWORK_CONNECTION);
+ response->SetClientErrorMessage(getCurrentExceptionMessage(false));
+
+ addMetric(request, S3MetricType::Errors);
+
+        /// Probably this is a socket timeout or something more or less related to DNS.
+        /// Let's just remove this host from the DNS cache to be on the safe side.
+ DNSResolver::instance().removeHostFromCache(Poco::URI(uri).getHost());
+ }
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/PocoHTTPClient.h b/contrib/clickhouse/src/IO/S3/PocoHTTPClient.h
new file mode 100644
index 0000000000..92680072b2
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/PocoHTTPClient.h
@@ -0,0 +1,198 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#include <string>
+#include <vector>
+
+#if USE_AWS_S3
+
+#include <Common/RemoteHostFilter.h>
+#include <Common/Throttler_fwd.h>
+#include <Common/ProxyConfiguration.h>
+#include <IO/ConnectionTimeouts.h>
+#include <IO/HTTPCommon.h>
+#include <IO/HTTPHeaderEntries.h>
+#include <IO/S3/SessionAwareIOStream.h>
+
+#include <aws/core/client/ClientConfiguration.h>
+#include <aws/core/http/HttpClient.h>
+#include <aws/core/http/HttpRequest.h>
+#include <aws/core/http/standard/StandardHttpResponse.h>
+
+namespace Aws::Http::Standard
+{
+class StandardHttpResponse;
+}
+
+namespace DB
+{
+
+class Context;
+}
+
+namespace DB::S3
+{
+class ClientFactory;
+
+struct PocoHTTPClientConfiguration : public Aws::Client::ClientConfiguration
+{
+ std::function<DB::ProxyConfiguration()> per_request_configuration;
+ String force_region;
+ const RemoteHostFilter & remote_host_filter;
+ unsigned int s3_max_redirects;
+ bool enable_s3_requests_logging;
+ bool for_disk_s3;
+ ThrottlerPtr get_request_throttler;
+ ThrottlerPtr put_request_throttler;
+ HTTPHeaderEntries extra_headers;
+
+    /// Not a client parameter in terms of HTTP and we won't send it to the server. Used internally to determine when the connection has to be re-established.
+ uint32_t http_keep_alive_timeout_ms = 0;
+ /// Zero means pooling will not be used.
+ size_t http_connection_pool_size = 0;
+ /// See PoolBase::BehaviourOnLimit
+ bool wait_on_pool_size_limit = true;
+
+ void updateSchemeAndRegion();
+
+ std::function<void(const DB::ProxyConfiguration &)> error_report;
+
+private:
+ PocoHTTPClientConfiguration(
+ std::function<DB::ProxyConfiguration()> per_request_configuration_,
+ const String & force_region_,
+ const RemoteHostFilter & remote_host_filter_,
+ unsigned int s3_max_redirects_,
+ bool enable_s3_requests_logging_,
+ bool for_disk_s3_,
+ const ThrottlerPtr & get_request_throttler_,
+ const ThrottlerPtr & put_request_throttler_,
+ std::function<void(const DB::ProxyConfiguration &)> error_report_
+ );
+
+ /// Constructor of Aws::Client::ClientConfiguration must be called after AWS SDK initialization.
+ friend ClientFactory;
+};
+
+class PocoHTTPResponse : public Aws::Http::Standard::StandardHttpResponse
+{
+public:
+ using SessionPtr = HTTPSessionPtr;
+
+ explicit PocoHTTPResponse(const std::shared_ptr<const Aws::Http::HttpRequest> request)
+ : Aws::Http::Standard::StandardHttpResponse(request)
+ , body_stream(request->GetResponseStreamFactory())
+ {
+ }
+
+ void SetResponseBody(Aws::IStream & incoming_stream, SessionPtr & session_) /// NOLINT
+ {
+ body_stream = Aws::Utils::Stream::ResponseStream(
+ Aws::New<SessionAwareIOStream<SessionPtr>>("http result streambuf", session_, incoming_stream.rdbuf())
+ );
+ }
+
+ void SetResponseBody(Aws::IStream & incoming_stream, PooledHTTPSessionPtr & session_) /// NOLINT
+ {
+ body_stream = Aws::Utils::Stream::ResponseStream(
+ Aws::New<SessionAwareIOStream<PooledHTTPSessionPtr>>("http result streambuf", session_, incoming_stream.rdbuf()));
+ }
+
+ void SetResponseBody(std::string & response_body) /// NOLINT
+ {
+ auto stream = Aws::New<std::stringstream>("http result buf", response_body); // STYLE_CHECK_ALLOW_STD_STRING_STREAM
+ stream->exceptions(std::ios::failbit);
+ body_stream = Aws::Utils::Stream::ResponseStream(std::move(stream));
+ }
+
+ Aws::IOStream & GetResponseBody() const override
+ {
+ return body_stream.GetUnderlyingStream();
+ }
+
+ Aws::Utils::Stream::ResponseStream && SwapResponseStreamOwnership() override
+ {
+ return std::move(body_stream);
+ }
+
+private:
+ Aws::Utils::Stream::ResponseStream body_stream;
+};
+
+class PocoHTTPClient : public Aws::Http::HttpClient
+{
+public:
+ explicit PocoHTTPClient(const PocoHTTPClientConfiguration & client_configuration);
+ ~PocoHTTPClient() override = default;
+
+ std::shared_ptr<Aws::Http::HttpResponse> MakeRequest(
+ const std::shared_ptr<Aws::Http::HttpRequest> & request,
+ Aws::Utils::RateLimits::RateLimiterInterface * readLimiter,
+ Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const override;
+
+private:
+
+ void makeRequestInternal(
+ Aws::Http::HttpRequest & request,
+ std::shared_ptr<PocoHTTPResponse> & response,
+ Aws::Utils::RateLimits::RateLimiterInterface * readLimiter,
+ Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const;
+
+ enum class S3MetricType
+ {
+ Microseconds,
+ Count,
+ Errors,
+ Throttling,
+ Redirects,
+
+ EnumSize,
+ };
+
+ enum class S3MetricKind
+ {
+ Read,
+ Write,
+
+ EnumSize,
+ };
+
+ template <bool pooled>
+ void makeRequestInternalImpl(
+ Aws::Http::HttpRequest & request,
+ const DB::ProxyConfiguration & per_request_configuration,
+ std::shared_ptr<PocoHTTPResponse> & response,
+ Aws::Utils::RateLimits::RateLimiterInterface * readLimiter,
+ Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const;
+
+protected:
+ static S3MetricKind getMetricKind(const Aws::Http::HttpRequest & request);
+ void addMetric(const Aws::Http::HttpRequest & request, S3MetricType type, ProfileEvents::Count amount = 1) const;
+
+ std::function<DB::ProxyConfiguration()> per_request_configuration;
+ std::function<void(const DB::ProxyConfiguration &)> error_report;
+ ConnectionTimeouts timeouts;
+ const RemoteHostFilter & remote_host_filter;
+ unsigned int s3_max_redirects;
+ bool enable_s3_requests_logging;
+ bool for_disk_s3;
+
+    /// Limits the per-second rate of GET, SELECT and all other requests not throttled by the put throttler
+    /// (i.e. throttles GetObject, HeadObject)
+ ThrottlerPtr get_request_throttler;
+
+    /// Limits the per-second rate of PUT, COPY, POST and LIST requests
+    /// (i.e. throttles PutObject, CopyObject, ListObjects, CreateMultipartUpload, UploadPartCopy, UploadPart, CompleteMultipartUpload)
+    /// NOTE: DELETE and CANCEL requests are not throttled by either the put or the get throttler
+ ThrottlerPtr put_request_throttler;
+
+ const HTTPHeaderEntries extra_headers;
+
+ size_t http_connection_pool_size = 0;
+ bool wait_on_pool_size_limit = true;
+};
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/PocoHTTPClientFactory.cpp b/contrib/clickhouse/src/IO/S3/PocoHTTPClientFactory.cpp
new file mode 100644
index 0000000000..87854b8f6e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/PocoHTTPClientFactory.cpp
@@ -0,0 +1,40 @@
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include "PocoHTTPClientFactory.h"
+
+#include <IO/S3/PocoHTTPClient.h>
+#include <aws/core/client/ClientConfiguration.h>
+#include <aws/core/http/HttpRequest.h>
+#include <aws/core/http/HttpResponse.h>
+#include <aws/core/http/standard/StandardHttpRequest.h>
+
+namespace DB::S3
+{
+std::shared_ptr<Aws::Http::HttpClient>
+PocoHTTPClientFactory::CreateHttpClient(const Aws::Client::ClientConfiguration & clientConfiguration) const
+{
+ return std::make_shared<PocoHTTPClient>(static_cast<const PocoHTTPClientConfiguration &>(clientConfiguration));
+}
+
+std::shared_ptr<Aws::Http::HttpRequest> PocoHTTPClientFactory::CreateHttpRequest(
+ const Aws::String & uri, Aws::Http::HttpMethod method, const Aws::IOStreamFactory & streamFactory) const
+{
+ return CreateHttpRequest(Aws::Http::URI(uri), method, streamFactory);
+}
+
+std::shared_ptr<Aws::Http::HttpRequest> PocoHTTPClientFactory::CreateHttpRequest(
+ const Aws::Http::URI & uri, Aws::Http::HttpMethod method, const Aws::IOStreamFactory &) const
+{
+ auto request = Aws::MakeShared<Aws::Http::Standard::StandardHttpRequest>("PocoHTTPClientFactory", uri, method);
+
+    /// Don't create a default response stream. The actual response stream will be set later in PocoHTTPClient.
+ request->SetResponseStreamFactory(null_factory);
+
+ return request;
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/PocoHTTPClientFactory.h b/contrib/clickhouse/src/IO/S3/PocoHTTPClientFactory.h
new file mode 100644
index 0000000000..4e555f0550
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/PocoHTTPClientFactory.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <aws/core/http/HttpClientFactory.h>
+
+namespace Aws::Http
+{
+class HttpClient;
+class HttpRequest;
+}
+
+namespace DB::S3
+{
+class PocoHTTPClientFactory : public Aws::Http::HttpClientFactory
+{
+public:
+ ~PocoHTTPClientFactory() override = default;
+ [[nodiscard]] std::shared_ptr<Aws::Http::HttpClient>
+ CreateHttpClient(const Aws::Client::ClientConfiguration & clientConfiguration) const override;
+ [[nodiscard]] std::shared_ptr<Aws::Http::HttpRequest>
+ CreateHttpRequest(const Aws::String & uri, Aws::Http::HttpMethod method, const Aws::IOStreamFactory & streamFactory) const override;
+ [[nodiscard]] std::shared_ptr<Aws::Http::HttpRequest>
+ CreateHttpRequest(const Aws::Http::URI & uri, Aws::Http::HttpMethod method, const Aws::IOStreamFactory & streamFactory) const override;
+
+private:
+ const Aws::IOStreamFactory null_factory = []() { return nullptr; };
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/S3/ProviderType.cpp b/contrib/clickhouse/src/IO/S3/ProviderType.cpp
new file mode 100644
index 0000000000..5987701db6
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/ProviderType.cpp
@@ -0,0 +1,40 @@
+#include <IO/S3/ProviderType.h>
+
+#if USE_AWS_S3
+
+#include <string>
+
+namespace DB::S3
+{
+
+std::string_view toString(ProviderType provider_type)
+{
+ using enum ProviderType;
+
+ switch (provider_type)
+ {
+ case AWS:
+ return "AWS";
+ case GCS:
+ return "GCS";
+ case UNKNOWN:
+ return "Unknown";
+ }
+}
+
+std::string_view toString(ApiMode api_mode)
+{
+ using enum ApiMode;
+
+ switch (api_mode)
+ {
+ case AWS:
+ return "AWS";
+ case GCS:
+ return "GCS";
+ }
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/ProviderType.h b/contrib/clickhouse/src/IO/S3/ProviderType.h
new file mode 100644
index 0000000000..3e0ff3f36d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/ProviderType.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include <string_view>
+#include <cstdint>
+
+namespace DB::S3
+{
+
+/// Provider type defines the platform containing the object
+/// we are trying to access.
+/// This information is useful for determining general support for
+/// features like multipart copy, which is currently supported by AWS
+/// but not by GCS.
+enum class ProviderType : uint8_t
+{
+ AWS,
+ GCS,
+ UNKNOWN
+};
+
+std::string_view toString(ProviderType provider_type);
+
+/// Mode in which we can use the XML API.
+/// This value can be the same as the provider type, but there can be a difference.
+/// For example, GCS can work in both
+/// AWS-compatible mode (accepting headers starting with x-amz)
+/// and GCS mode (accepting only headers starting with x-goog).
+/// Because GCS mode is enforced when some features are used, we
+/// need to support both.
+enum class ApiMode : uint8_t
+{
+ AWS,
+ GCS
+};
+
+std::string_view toString(ApiMode api_mode);
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/Requests.cpp b/contrib/clickhouse/src/IO/S3/Requests.cpp
new file mode 100644
index 0000000000..2f2f8637ef
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/Requests.cpp
@@ -0,0 +1,156 @@
+#include <IO/S3/Requests.h>
+
+#if USE_AWS_S3
+
+#include <Common/logger_useful.h>
+// #include <aws/core/endpoint/EndpointParameter.h>
+#include <aws/core/utils/xml/XmlSerializer.h>
+
+namespace DB::S3
+{
+
+Aws::Http::HeaderValueCollection CopyObjectRequest::GetRequestSpecificHeaders() const
+{
+ auto headers = Model::CopyObjectRequest::GetRequestSpecificHeaders();
+ if (api_mode != ApiMode::GCS)
+ return headers;
+
+    /// GCS supports the same headers as S3 but with the prefix x-goog instead of x-amz,
+    /// so we have to replace all the prefixes the client set internally
+ const auto replace_with_gcs_header = [&](const std::string & amz_header, const std::string & gcs_header)
+ {
+ if (const auto it = headers.find(amz_header); it != headers.end())
+ {
+ auto header_value = std::move(it->second);
+ headers.erase(it);
+ headers.emplace(gcs_header, std::move(header_value));
+ }
+ };
+
+ replace_with_gcs_header("x-amz-copy-source", "x-goog-copy-source");
+ replace_with_gcs_header("x-amz-metadata-directive", "x-goog-metadata-directive");
+ replace_with_gcs_header("x-amz-storage-class", "x-goog-storage-class");
+
+ /// replace all x-amz-meta- headers
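+    /// e.g. (illustrative) "x-amz-meta-owner" becomes "x-goog-meta-owner"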
+ std::vector<std::pair<std::string, std::string>> new_meta_headers;
+ for (auto it = headers.begin(); it != headers.end();)
+ {
+ if (it->first.starts_with("x-amz-meta-"))
+ {
+ auto value = std::move(it->second);
+ auto header = "x-goog" + it->first.substr(/* x-amz */ 5);
+ new_meta_headers.emplace_back(std::pair{std::move(header), std::move(value)});
+ it = headers.erase(it);
+ }
+ else
+ ++it;
+ }
+
+ for (auto & [header, value] : new_meta_headers)
+ headers.emplace(std::move(header), std::move(value));
+
+ return headers;
+}
+
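+/// For component_names = {"part-0", "part-1"} the serialized payload looks like this (illustrative, whitespace added):
+///     <ComposeRequest>
+///       <Component><Name>part-0</Name></Component>
+///       <Component><Name>part-1</Name></Component>
+///     </ComposeRequest>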
+Aws::String ComposeObjectRequest::SerializePayload() const
+{
+ if (component_names.empty())
+ return {};
+
+ Aws::Utils::Xml::XmlDocument payload_doc = Aws::Utils::Xml::XmlDocument::CreateWithRootNode("ComposeRequest");
+ auto root_node = payload_doc.GetRootElement();
+
+ for (const auto & name : component_names)
+ {
+ auto component_node = root_node.CreateChildElement("Component");
+ auto name_node = component_node.CreateChildElement("Name");
+ name_node.SetText(name);
+ }
+
+ return payload_doc.ConvertToString();
+}
+
+void ComposeObjectRequest::AddQueryStringParameters(Aws::Http::URI & /*uri*/) const
+{
+}
+
+Aws::Http::HeaderValueCollection ComposeObjectRequest::GetRequestSpecificHeaders() const
+{
+ if (content_type.empty())
+ return {};
+
+ return {Aws::Http::HeaderValuePair(Aws::Http::CONTENT_TYPE_HEADER, content_type)};
+}
+
+// Aws::Endpoint::EndpointParameters ComposeObjectRequest::GetEndpointContextParams() const
+// {
+// EndpointParameters parameters;
+// if (BucketHasBeenSet())
+// parameters.emplace_back("Bucket", GetBucket(), Aws::Endpoint::EndpointParameter::ParameterOrigin::OPERATION_CONTEXT);
+
+// return parameters;
+// }
+
+const Aws::String & ComposeObjectRequest::GetBucket() const
+{
+ return bucket;
+}
+
+bool ComposeObjectRequest::BucketHasBeenSet() const
+{
+ return !bucket.empty();
+}
+
+void ComposeObjectRequest::SetBucket(const Aws::String & value)
+{
+ bucket = value;
+}
+
+void ComposeObjectRequest::SetBucket(Aws::String && value)
+{
+ bucket = std::move(value);
+}
+
+void ComposeObjectRequest::SetBucket(const char * value)
+{
+ bucket.assign(value);
+}
+
+const Aws::String & ComposeObjectRequest::GetKey() const
+{
+ return key;
+}
+
+bool ComposeObjectRequest::KeyHasBeenSet() const
+{
+ return !key.empty();
+}
+
+void ComposeObjectRequest::SetKey(const Aws::String & value)
+{
+ key = value;
+}
+
+void ComposeObjectRequest::SetKey(Aws::String && value)
+{
+ key = std::move(value);
+}
+
+void ComposeObjectRequest::SetKey(const char * value)
+{
+ key.assign(value);
+}
+
+void ComposeObjectRequest::SetComponentNames(std::vector<Aws::String> component_names_)
+{
+ component_names = std::move(component_names_);
+}
+
+void ComposeObjectRequest::SetContentType(Aws::String value)
+{
+ content_type = std::move(value);
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/Requests.h b/contrib/clickhouse/src/IO/S3/Requests.h
new file mode 100644
index 0000000000..5d0b930e01
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/Requests.h
@@ -0,0 +1,135 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include <IO/S3/URI.h>
+#include <IO/S3/ProviderType.h>
+
+// #include <aws/core/endpoint/EndpointParameter.h>
+#include <aws/s3/model/HeadObjectRequest.h>
+#include <aws/s3/model/ListObjectsV2Request.h>
+#include <aws/s3/model/ListObjectsRequest.h>
+#include <aws/s3/model/GetObjectRequest.h>
+#include <aws/s3/model/AbortMultipartUploadRequest.h>
+#include <aws/s3/model/CreateMultipartUploadRequest.h>
+#include <aws/s3/model/CompleteMultipartUploadRequest.h>
+#include <aws/s3/model/CopyObjectRequest.h>
+#include <aws/s3/model/PutObjectRequest.h>
+#include <aws/s3/model/UploadPartRequest.h>
+#include <aws/s3/model/UploadPartCopyRequest.h>
+#include <aws/s3/model/DeleteObjectRequest.h>
+#include <aws/s3/model/DeleteObjectsRequest.h>
+
+namespace DB::S3
+{
+
+namespace Model = Aws::S3::Model;
+
+template <typename BaseRequest>
+class ExtendedRequest : public BaseRequest
+{
+public:
+ // Aws::Endpoint::EndpointParameters GetEndpointContextParams() const override
+ // {
+ // auto params = BaseRequest::GetEndpointContextParams();
+ // if (!region_override.empty())
+ // params.emplace_back("Region", region_override);
+
+ // if (uri_override.has_value())
+ // {
+ // static const Aws::String AWS_S3_FORCE_PATH_STYLE = "ForcePathStyle";
+ // params.emplace_back(AWS_S3_FORCE_PATH_STYLE, !uri_override->is_virtual_hosted_style);
+ // params.emplace_back("Endpoint", uri_override->endpoint);
+ // }
+
+ // return params;
+ // }
+
+ void overrideRegion(std::string region) const
+ {
+ region_override = std::move(region);
+ }
+
+ void overrideURI(S3::URI uri) const
+ {
+ uri_override = std::move(uri);
+ }
+
+ const auto & getURIOverride() const
+ {
+ return uri_override;
+ }
+
+ void setApiMode(ApiMode api_mode_) const
+ {
+ api_mode = api_mode_;
+ }
+
+protected:
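+    /// Mutable because the AWS SDK passes requests around as const references,
+    /// while the overrides still need to be attached to a particular request object.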
+ mutable std::string region_override;
+ mutable std::optional<S3::URI> uri_override;
+ mutable ApiMode api_mode{ApiMode::AWS};
+};
+
+class CopyObjectRequest : public ExtendedRequest<Model::CopyObjectRequest>
+{
+public:
+ Aws::Http::HeaderValueCollection GetRequestSpecificHeaders() const override;
+};
+
+using HeadObjectRequest = ExtendedRequest<Model::HeadObjectRequest>;
+using ListObjectsV2Request = ExtendedRequest<Model::ListObjectsV2Request>;
+using ListObjectsRequest = ExtendedRequest<Model::ListObjectsRequest>;
+using GetObjectRequest = ExtendedRequest<Model::GetObjectRequest>;
+
+using CreateMultipartUploadRequest = ExtendedRequest<Model::CreateMultipartUploadRequest>;
+using CompleteMultipartUploadRequest = ExtendedRequest<Model::CompleteMultipartUploadRequest>;
+using AbortMultipartUploadRequest = ExtendedRequest<Model::AbortMultipartUploadRequest>;
+using UploadPartRequest = ExtendedRequest<Model::UploadPartRequest>;
+using UploadPartCopyRequest = ExtendedRequest<Model::UploadPartCopyRequest>;
+
+using PutObjectRequest = ExtendedRequest<Model::PutObjectRequest>;
+using DeleteObjectRequest = ExtendedRequest<Model::DeleteObjectRequest>;
+using DeleteObjectsRequest = ExtendedRequest<Model::DeleteObjectsRequest>;
+
+
+class ComposeObjectRequest : public ExtendedRequest<Aws::S3::S3Request>
+{
+public:
+ inline const char * GetServiceRequestName() const override { return "ComposeObject"; }
+
+ AWS_S3_API Aws::String SerializePayload() const override;
+
+ AWS_S3_API void AddQueryStringParameters(Aws::Http::URI & uri) const override;
+
+ AWS_S3_API Aws::Http::HeaderValueCollection GetRequestSpecificHeaders() const override;
+
+ // AWS_S3_API EndpointParameters GetEndpointContextParams() const override;
+
+ const Aws::String & GetBucket() const;
+ bool BucketHasBeenSet() const;
+ void SetBucket(const Aws::String & value);
+ void SetBucket(Aws::String && value);
+ void SetBucket(const char* value);
+
+ const Aws::String & GetKey() const;
+ bool KeyHasBeenSet() const;
+ void SetKey(const Aws::String & value);
+ void SetKey(Aws::String && value);
+ void SetKey(const char * value);
+
+ void SetComponentNames(std::vector<Aws::String> component_names_);
+
+ void SetContentType(Aws::String value);
+private:
+ Aws::String bucket;
+ Aws::String key;
+ std::vector<Aws::String> component_names;
+ Aws::String content_type;
+};
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/SessionAwareIOStream.h b/contrib/clickhouse/src/IO/S3/SessionAwareIOStream.h
new file mode 100644
index 0000000000..babe52545d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/SessionAwareIOStream.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <iosfwd>
+
+
+namespace DB::S3
+{
+/**
+ * Wrapper of IOStream to store response stream and corresponding HTTP session.
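+ *
+ * Illustrative use (names are hypothetical), e.g. keeping a Poco HTTP session alive
+ * for as long as the response body stream is being read:
+ *     SessionAwareIOStream<HTTPSessionPtr> stream(std::move(session), response_body.rdbuf());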
+ */
+template <typename Session>
+class SessionAwareIOStream : public std::iostream
+{
+public:
+ SessionAwareIOStream(Session session_, std::streambuf * sb)
+ : std::iostream(sb)
+ , session(std::move(session_))
+ {
+ }
+
+ Session & getSession() { return session; }
+
+ const Session & getSession() const { return session; }
+
+private:
+ /// Poco HTTP session is holder of response stream.
+ Session session;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/S3/URI.cpp b/contrib/clickhouse/src/IO/S3/URI.cpp
new file mode 100644
index 0000000000..34590df539
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/URI.cpp
@@ -0,0 +1,119 @@
+#include <IO/S3/URI.h>
+
+#if USE_AWS_S3
+#include <Common/Exception.h>
+#include <Common/quoteString.h>
+
+#include <boost/algorithm/string/case_conv.hpp>
+#include <re2/re2.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int BAD_ARGUMENTS;
+}
+
+namespace S3
+{
+
+URI::URI(const std::string & uri_)
+{
+    /// Case when the bucket name is represented in the domain name of the S3 URL.
+ /// E.g. (https://bucket-name.s3.Region.amazonaws.com/key)
+ /// https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html#virtual-hosted-style-access
+ static const RE2 virtual_hosted_style_pattern(R"((.+)\.(s3|cos|obs|oss)([.\-][a-z0-9\-.:]+))");
+
+    /// Case when the bucket name and key are represented in the path of the S3 URL.
+ /// E.g. (https://s3.Region.amazonaws.com/bucket-name/key)
+ /// https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html#path-style-access
+ static const RE2 path_style_pattern("^/([^/]*)/(.*)");
+
+ static constexpr auto S3 = "S3";
+ static constexpr auto COSN = "COSN";
+ static constexpr auto COS = "COS";
+ static constexpr auto OBS = "OBS";
+ static constexpr auto OSS = "OSS";
+
+ uri = Poco::URI(uri_);
+
+ storage_name = S3;
+
+ if (uri.getHost().empty())
+ throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI.");
+
+ /// Extract object version ID from query string.
+ bool has_version_id = false;
+ for (const auto & [query_key, query_value] : uri.getQueryParameters())
+ if (query_key == "versionId")
+ {
+ version_id = query_value;
+ has_version_id = true;
+ }
+
+    /// Poco::URI will ignore '?' when parsing the path, but if there is a versionId in the query string,
+    /// '?' cannot be used as a wildcard, otherwise it would be ambiguous.
+    /// If there is no "versionId" query parameter, '?' can be used as a wildcard.
+    /// It is necessary to encode '?' to avoid it being dropped during path parsing.
+ if (!has_version_id && uri_.find('?') != String::npos)
+ {
+ String uri_with_question_mark_encode;
+ Poco::URI::encode(uri_, "?", uri_with_question_mark_encode);
+ uri = Poco::URI(uri_with_question_mark_encode);
+ }
+
+ String name;
+ String endpoint_authority_from_uri;
+
+ if (re2::RE2::FullMatch(uri.getAuthority(), virtual_hosted_style_pattern, &bucket, &name, &endpoint_authority_from_uri))
+ {
+ is_virtual_hosted_style = true;
+ endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri;
+ validateBucket(bucket, uri);
+
+ if (!uri.getPath().empty())
+ {
+ /// Remove leading '/' from path to extract key.
+ key = uri.getPath().substr(1);
+ }
+
+ boost::to_upper(name);
+ if (name != S3 && name != COS && name != OBS && name != OSS)
+ throw Exception(ErrorCodes::BAD_ARGUMENTS,
+ "Object storage system name is unrecognized in virtual hosted style S3 URI: {}",
+ quoteString(name));
+
+ if (name == S3)
+ storage_name = name;
+ else if (name == OBS)
+ storage_name = OBS;
+ else if (name == OSS)
+ storage_name = OSS;
+ else
+ storage_name = COSN;
+ }
+ else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key))
+ {
+ is_virtual_hosted_style = false;
+ endpoint = uri.getScheme() + "://" + uri.getAuthority();
+ validateBucket(bucket, uri);
+ }
+ else
+ throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI.");
+}
+
+void URI::validateBucket(const String & bucket, const Poco::URI & uri)
+{
+ /// S3 specification requires at least 3 and at most 63 characters in bucket name.
+ /// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html
+ if (bucket.length() < 3 || bucket.length() > 63)
+ throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {}{}",
+ quoteString(bucket), !uri.empty() ? " (" + uri.toString() + ")" : "");
+}
+
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/URI.h b/contrib/clickhouse/src/IO/S3/URI.h
new file mode 100644
index 0000000000..9df075c732
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/URI.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <string>
+
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include <Poco/URI.h>
+
+namespace DB::S3
+{
+
+/**
+ * Represents S3 URI.
+ *
+ * The following patterns are allowed:
+ * s3://bucket/key
+ * http(s)://endpoint/bucket/key
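+ *
+ * Illustrative parse (values are hypothetical):
+ *     URI uri("https://mybucket.s3.us-east-1.amazonaws.com/some/key");
+ *     // uri.bucket == "mybucket", uri.key == "some/key",
+ *     // uri.endpoint == "https://s3.us-east-1.amazonaws.com", uri.is_virtual_hosted_style == true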
+ */
+struct URI
+{
+ Poco::URI uri;
+ // Custom endpoint if URI scheme is not S3.
+ std::string endpoint;
+ std::string bucket;
+ std::string key;
+ std::string version_id;
+ std::string storage_name;
+
+ bool is_virtual_hosted_style;
+
+ URI() = default;
+ explicit URI(const std::string & uri_);
+
+ static void validateBucket(const std::string & bucket, const Poco::URI & uri);
+};
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/S3/copyS3File.cpp b/contrib/clickhouse/src/IO/S3/copyS3File.cpp
new file mode 100644
index 0000000000..002b8dde56
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/copyS3File.cpp
@@ -0,0 +1,850 @@
+#include <IO/S3/copyS3File.h>
+
+#if USE_AWS_S3
+
+#include <Common/ProfileEvents.h>
+#include <Common/typeid_cast.h>
+#include <Interpreters/Context.h>
+#include <IO/LimitSeekableReadBuffer.h>
+#include <IO/S3/getObjectInfo.h>
+#include <IO/SeekableReadBuffer.h>
+#include <IO/StdStreamFromReadBuffer.h>
+#include <IO/ReadBufferFromS3.h>
+
+#include <IO/S3/Requests.h>
+
+namespace ProfileEvents
+{
+ extern const Event WriteBufferFromS3Bytes;
+ extern const Event WriteBufferFromS3Microseconds;
+ extern const Event WriteBufferFromS3RequestsErrors;
+
+ extern const Event S3CreateMultipartUpload;
+ extern const Event S3CompleteMultipartUpload;
+ extern const Event S3PutObject;
+ extern const Event S3CopyObject;
+ extern const Event S3UploadPart;
+ extern const Event S3UploadPartCopy;
+
+ extern const Event DiskS3CreateMultipartUpload;
+ extern const Event DiskS3CompleteMultipartUpload;
+ extern const Event DiskS3PutObject;
+ extern const Event DiskS3CopyObject;
+ extern const Event DiskS3UploadPart;
+ extern const Event DiskS3UploadPartCopy;
+}
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int S3_ERROR;
+ extern const int INVALID_CONFIG_PARAMETER;
+ extern const int LOGICAL_ERROR;
+}
+
+
+namespace
+{
+ class UploadHelper
+ {
+ public:
+ UploadHelper(
+ const std::shared_ptr<const S3::Client> & client_ptr_,
+ const std::shared_ptr<const S3::Client> & client_with_long_timeout_ptr_,
+ const String & dest_bucket_,
+ const String & dest_key_,
+ const S3Settings::RequestSettings & request_settings_,
+ const std::optional<std::map<String, String>> & object_metadata_,
+ ThreadPoolCallbackRunner<void> schedule_,
+ bool for_disk_s3_,
+ const Poco::Logger * log_)
+ : client_ptr(client_ptr_)
+ , client_with_long_timeout_ptr(client_with_long_timeout_ptr_)
+ , dest_bucket(dest_bucket_)
+ , dest_key(dest_key_)
+ , request_settings(request_settings_)
+ , upload_settings(request_settings.getUploadSettings())
+ , object_metadata(object_metadata_)
+ , schedule(schedule_)
+ , for_disk_s3(for_disk_s3_)
+ , log(log_)
+ {
+ }
+
+ virtual ~UploadHelper() = default;
+
+ protected:
+ std::shared_ptr<const S3::Client> client_ptr;
+ std::shared_ptr<const S3::Client> client_with_long_timeout_ptr;
+ const String & dest_bucket;
+ const String & dest_key;
+ const S3Settings::RequestSettings & request_settings;
+ const S3Settings::RequestSettings::PartUploadSettings & upload_settings;
+ const std::optional<std::map<String, String>> & object_metadata;
+ ThreadPoolCallbackRunner<void> schedule;
+ bool for_disk_s3;
+ const Poco::Logger * log;
+
+ struct UploadPartTask
+ {
+ std::unique_ptr<Aws::AmazonWebServiceRequest> req;
+ bool is_finished = false;
+ String tag;
+ std::exception_ptr exception;
+ };
+
+ size_t normal_part_size;
+ String multipart_upload_id;
+ std::atomic<bool> multipart_upload_aborted = false;
+ Strings part_tags;
+
+ std::list<UploadPartTask> TSA_GUARDED_BY(bg_tasks_mutex) bg_tasks;
+ int num_added_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0;
+ int num_finished_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0;
+ std::mutex bg_tasks_mutex;
+ std::condition_variable bg_tasks_condvar;
+
+ void fillCreateMultipartRequest(S3::CreateMultipartUploadRequest & request)
+ {
+ request.SetBucket(dest_bucket);
+ request.SetKey(dest_key);
+
+ /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840
+ request.SetContentType("binary/octet-stream");
+
+ if (object_metadata.has_value())
+ request.SetMetadata(object_metadata.value());
+
+ const auto & storage_class_name = upload_settings.storage_class_name;
+ if (!storage_class_name.empty())
+ request.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(storage_class_name));
+
+ client_ptr->setKMSHeaders(request);
+ }
+
+ void createMultipartUpload()
+ {
+ S3::CreateMultipartUploadRequest request;
+ fillCreateMultipartRequest(request);
+
+ ProfileEvents::increment(ProfileEvents::S3CreateMultipartUpload);
+ if (for_disk_s3)
+ ProfileEvents::increment(ProfileEvents::DiskS3CreateMultipartUpload);
+
+ auto outcome = client_ptr->CreateMultipartUpload(request);
+
+ if (outcome.IsSuccess())
+ {
+ multipart_upload_id = outcome.GetResult().GetUploadId();
+                LOG_TRACE(log, "Multipart upload has been created. Bucket: {}, Key: {}, Upload id: {}", dest_bucket, dest_key, multipart_upload_id);
+ }
+ else
+ {
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1);
+ throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
+ }
+ }
+
+ void completeMultipartUpload()
+ {
+ if (multipart_upload_aborted)
+ return;
+
+ LOG_TRACE(log, "Completing multipart upload. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", dest_bucket, dest_key, multipart_upload_id, part_tags.size());
+
+ if (part_tags.empty())
+                throw Exception(ErrorCodes::S3_ERROR, "Failed to complete multipart upload. No parts have been uploaded");
+
+ S3::CompleteMultipartUploadRequest request;
+ request.SetBucket(dest_bucket);
+ request.SetKey(dest_key);
+ request.SetUploadId(multipart_upload_id);
+
+ Aws::S3::Model::CompletedMultipartUpload multipart_upload;
+ for (size_t i = 0; i < part_tags.size(); ++i)
+ {
+ Aws::S3::Model::CompletedPart part;
+ multipart_upload.AddParts(part.WithETag(part_tags[i]).WithPartNumber(static_cast<int>(i + 1)));
+ }
+
+ request.SetMultipartUpload(multipart_upload);
+
+ size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries, 1UL);
+ for (size_t retries = 1;; ++retries)
+ {
+ ProfileEvents::increment(ProfileEvents::S3CompleteMultipartUpload);
+ if (for_disk_s3)
+ ProfileEvents::increment(ProfileEvents::DiskS3CompleteMultipartUpload);
+
+ auto outcome = client_with_long_timeout_ptr->CompleteMultipartUpload(request);
+
+ if (outcome.IsSuccess())
+ {
+ LOG_TRACE(log, "Multipart upload has completed. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", dest_bucket, dest_key, multipart_upload_id, part_tags.size());
+ break;
+ }
+
+ if ((outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY) && (retries < max_retries))
+ {
+                    /// For an unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests.
+                    /// By the way, NO_SUCH_UPLOAD is an expected error here and we shouldn't retry it.
+ LOG_INFO(log, "Multipart upload failed with NO_SUCH_KEY error for Bucket: {}, Key: {}, Upload_id: {}, Parts: {}, will retry", dest_bucket, dest_key, multipart_upload_id, part_tags.size());
+ continue; /// will retry
+ }
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1);
+ throw S3Exception(
+ outcome.GetError().GetErrorType(),
+ "Message: {}, Key: {}, Bucket: {}, Tags: {}",
+ outcome.GetError().GetMessage(), dest_key, dest_bucket, fmt::join(part_tags.begin(), part_tags.end(), " "));
+ }
+ }
+
+ void abortMultipartUpload()
+ {
+ LOG_TRACE(log, "Aborting multipart upload. Bucket: {}, Key: {}, Upload_id: {}", dest_bucket, dest_key, multipart_upload_id);
+ S3::AbortMultipartUploadRequest abort_request;
+ abort_request.SetBucket(dest_bucket);
+ abort_request.SetKey(dest_key);
+ abort_request.SetUploadId(multipart_upload_id);
+ client_ptr->AbortMultipartUpload(abort_request);
+ multipart_upload_aborted = true;
+ }
+
+ void checkObjectAfterUpload()
+ {
+ LOG_TRACE(log, "Checking object {} exists after upload", dest_key);
+ S3::checkObjectExists(*client_ptr, dest_bucket, dest_key, {}, request_settings, {}, "Immediately after upload");
+ LOG_TRACE(log, "Object {} exists after upload", dest_key);
+ }
+
+ void performMultipartUpload(size_t start_offset, size_t size)
+ {
+ calculatePartSize(size);
+ createMultipartUpload();
+
+ size_t position = start_offset;
+ size_t end_position = start_offset + size;
+
+ try
+ {
+ for (size_t part_number = 1; position < end_position; ++part_number)
+ {
+ if (multipart_upload_aborted)
+ break; /// No more part uploads.
+
+ size_t next_position = std::min(position + normal_part_size, end_position);
+ size_t part_size = next_position - position; /// `part_size` is either `normal_part_size` or smaller if it's the final part.
+
+ Stopwatch watch;
+ uploadPart(part_number, position, part_size);
+ watch.stop();
+
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, part_size);
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds());
+
+ position = next_position;
+ }
+ }
+ catch (...)
+ {
+ tryLogCurrentException(__PRETTY_FUNCTION__);
+ // Multipart upload failed because it wasn't possible to schedule all the tasks.
+ // To avoid execution of already scheduled tasks we abort MultipartUpload.
+ abortMultipartUpload();
+ waitForAllBackgroundTasks();
+ throw;
+ }
+
+ waitForAllBackgroundTasks();
+ completeMultipartUpload();
+ }
+
+ void calculatePartSize(size_t total_size)
+ {
+ if (!total_size)
+                throw Exception(ErrorCodes::LOGICAL_ERROR, "Multipart upload was chosen for an empty file. This must not happen");
+
+ auto max_part_number = upload_settings.max_part_number;
+ auto min_upload_part_size = upload_settings.min_upload_part_size;
+ auto max_upload_part_size = upload_settings.max_upload_part_size;
+
+ if (!max_part_number)
+ throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_part_number must not be 0");
+ else if (!min_upload_part_size)
+ throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "min_upload_part_size must not be 0");
+ else if (max_upload_part_size < min_upload_part_size)
+ throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_upload_part_size must not be less than min_upload_part_size");
+
+ size_t part_size = min_upload_part_size;
+ size_t num_parts = (total_size + part_size - 1) / part_size;
+
+ if (num_parts > max_part_number)
+ {
+ part_size = (total_size + max_part_number - 1) / max_part_number;
+ num_parts = (total_size + part_size - 1) / part_size;
+ }
+
+ if (part_size > max_upload_part_size)
+ {
+ part_size = max_upload_part_size;
+ num_parts = (total_size + part_size - 1) / part_size;
+ }
+
+ if (num_parts < 1 || num_parts > max_part_number || part_size < min_upload_part_size || part_size > max_upload_part_size)
+ {
+ String msg;
+                if (num_parts < 1)
+                    msg = "Number of parts is zero";
+                else if (num_parts > max_part_number)
+                    msg = fmt::format("Number of parts {} exceeds {}", num_parts, max_part_number);
+                else if (part_size < min_upload_part_size)
+                    msg = fmt::format("Size of a part {} is less than {}", part_size, min_upload_part_size);
+                else
+                    msg = fmt::format("Size of a part {} exceeds {}", part_size, max_upload_part_size);
+
+ throw Exception(
+ ErrorCodes::INVALID_CONFIG_PARAMETER,
+ "{} while writing {} bytes to S3. Check max_part_number = {}, "
+ "min_upload_part_size = {}, max_upload_part_size = {}",
+ msg, total_size, max_part_number, min_upload_part_size, max_upload_part_size);
+ }
+
+ /// We've calculated the size of a normal part (the final part can be smaller).
+ normal_part_size = part_size;
+ }
+
+ void uploadPart(size_t part_number, size_t part_offset, size_t part_size)
+ {
+ LOG_TRACE(log, "Writing part. Bucket: {}, Key: {}, Upload_id: {}, Size: {}", dest_bucket, dest_key, multipart_upload_id, part_size);
+
+ if (!part_size)
+ {
+ LOG_TRACE(log, "Skipping writing an empty part.");
+ return;
+ }
+
+ if (schedule)
+ {
+ UploadPartTask * task = nullptr;
+
+ {
+ std::lock_guard lock(bg_tasks_mutex);
+ task = &bg_tasks.emplace_back();
+ ++num_added_bg_tasks;
+ }
+
+ /// Notify waiting thread when task finished
+ auto task_finish_notify = [this, task]()
+ {
+ std::lock_guard lock(bg_tasks_mutex);
+ task->is_finished = true;
+ ++num_finished_bg_tasks;
+
+ /// Notification under mutex is important here.
+ /// Otherwise, WriteBuffer could be destroyed in between
+ /// Releasing lock and condvar notification.
+ bg_tasks_condvar.notify_one();
+ };
+
+ try
+ {
+ task->req = fillUploadPartRequest(part_number, part_offset, part_size);
+
+ schedule([this, task, task_finish_notify]()
+ {
+ try
+ {
+ processUploadTask(*task);
+ }
+ catch (...)
+ {
+ task->exception = std::current_exception();
+ }
+ task_finish_notify();
+ }, Priority{});
+ }
+ catch (...)
+ {
+ task_finish_notify();
+ throw;
+ }
+ }
+ else
+ {
+ UploadPartTask task;
+ task.req = fillUploadPartRequest(part_number, part_offset, part_size);
+ processUploadTask(task);
+ part_tags.push_back(task.tag);
+ }
+ }
+
+ void processUploadTask(UploadPartTask & task)
+ {
+ if (multipart_upload_aborted)
+ return; /// Already aborted.
+
+ auto tag = processUploadPartRequest(*task.req);
+
+ std::lock_guard lock(bg_tasks_mutex); /// Protect bg_tasks from race
+ task.tag = tag;
+ LOG_TRACE(log, "Writing part finished. Bucket: {}, Key: {}, Upload_id: {}, Etag: {}, Parts: {}", dest_bucket, dest_key, multipart_upload_id, task.tag, bg_tasks.size());
+ }
+
+ virtual std::unique_ptr<Aws::AmazonWebServiceRequest> fillUploadPartRequest(size_t part_number, size_t part_offset, size_t part_size) = 0;
+ virtual String processUploadPartRequest(Aws::AmazonWebServiceRequest & request) = 0;
+
+ void waitForAllBackgroundTasks()
+ {
+ if (!schedule)
+ return;
+
+ std::unique_lock lock(bg_tasks_mutex);
+            /// Suppress warnings because bg_tasks_mutex is actually held, but TSA annotations do not understand std::unique_lock.
+ bg_tasks_condvar.wait(lock, [this]() {return TSA_SUPPRESS_WARNING_FOR_READ(num_added_bg_tasks) == TSA_SUPPRESS_WARNING_FOR_READ(num_finished_bg_tasks); });
+
+ auto & tasks = TSA_SUPPRESS_WARNING_FOR_WRITE(bg_tasks);
+ for (auto & task : tasks)
+ {
+ if (task.exception)
+ {
+ /// abortMultipartUpload() might be called already, see processUploadPartRequest().
+ /// However if there were concurrent uploads at that time, those part uploads might or might not succeed.
+ /// As a result, it might be necessary to abort a given multipart upload multiple times in order to completely free
+ /// all storage consumed by all parts.
+ abortMultipartUpload();
+
+ std::rethrow_exception(task.exception);
+ }
+
+ part_tags.push_back(task.tag);
+ }
+ }
+ };
+
+ /// Helper class to help implementing copyDataToS3File().
+ class CopyDataToFileHelper : public UploadHelper
+ {
+ public:
+ CopyDataToFileHelper(
+ const CreateReadBuffer & create_read_buffer_,
+ size_t offset_,
+ size_t size_,
+ const std::shared_ptr<const S3::Client> & client_ptr_,
+ const std::shared_ptr<const S3::Client> & client_with_long_timeout_ptr_,
+ const String & dest_bucket_,
+ const String & dest_key_,
+ const S3Settings::RequestSettings & request_settings_,
+ const std::optional<std::map<String, String>> & object_metadata_,
+ ThreadPoolCallbackRunner<void> schedule_,
+ bool for_disk_s3_)
+ : UploadHelper(client_ptr_, client_with_long_timeout_ptr_, dest_bucket_, dest_key_, request_settings_, object_metadata_, schedule_, for_disk_s3_, &Poco::Logger::get("copyDataToS3File"))
+ , create_read_buffer(create_read_buffer_)
+ , offset(offset_)
+ , size(size_)
+ {
+ }
+
+ void performCopy()
+ {
+ if (size <= upload_settings.max_single_part_upload_size)
+ performSinglepartUpload();
+ else
+ performMultipartUpload();
+
+ if (request_settings.check_objects_after_upload)
+ checkObjectAfterUpload();
+ }
+
+ private:
+ std::function<std::unique_ptr<SeekableReadBuffer>()> create_read_buffer;
+ size_t offset;
+ size_t size;
+
+ void performSinglepartUpload()
+ {
+ S3::PutObjectRequest request;
+ fillPutRequest(request);
+ processPutRequest(request);
+ }
+
+ void fillPutRequest(S3::PutObjectRequest & request)
+ {
+ auto read_buffer = std::make_unique<LimitSeekableReadBuffer>(create_read_buffer(), offset, size);
+
+ request.SetBucket(dest_bucket);
+ request.SetKey(dest_key);
+ request.SetContentLength(size);
+ request.SetBody(std::make_unique<StdStreamFromReadBuffer>(std::move(read_buffer), size));
+
+ if (object_metadata.has_value())
+ request.SetMetadata(object_metadata.value());
+
+ const auto & storage_class_name = upload_settings.storage_class_name;
+ if (!storage_class_name.empty())
+ request.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(storage_class_name));
+
+ /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840
+ request.SetContentType("binary/octet-stream");
+
+ client_ptr->setKMSHeaders(request);
+ }
+
+ void processPutRequest(const S3::PutObjectRequest & request)
+ {
+ size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries, 1UL);
+ for (size_t retries = 1;; ++retries)
+ {
+ ProfileEvents::increment(ProfileEvents::S3PutObject);
+ if (for_disk_s3)
+ ProfileEvents::increment(ProfileEvents::DiskS3PutObject);
+
+ Stopwatch watch;
+ auto outcome = client_ptr->PutObject(request);
+ watch.stop();
+
+ if (outcome.IsSuccess())
+ {
+ Int64 object_size = request.GetContentLength();
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, object_size);
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds());
+ LOG_TRACE(
+ log,
+ "Single part upload has completed. Bucket: {}, Key: {}, Object size: {}",
+ dest_bucket,
+ dest_key,
+ object_size);
+ break;
+ }
+
+ if (outcome.GetError().GetExceptionName() == "EntityTooLarge" || outcome.GetError().GetExceptionName() == "InvalidRequest")
+ {
+                    // Can't get here with MinIO; MinIO allows single-part uploads for large objects.
+ LOG_INFO(
+ log,
+ "Single part upload failed with error {} for Bucket: {}, Key: {}, Object size: {}, will retry with multipart upload",
+ outcome.GetError().GetExceptionName(),
+ dest_bucket,
+ dest_key,
+ size);
+ performMultipartUpload();
+ break;
+ }
+
+ if ((outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY) && (retries < max_retries))
+ {
+                    /// For an unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests.
+ LOG_INFO(
+ log,
+ "Single part upload failed with NO_SUCH_KEY error for Bucket: {}, Key: {}, Object size: {}, will retry",
+ dest_bucket,
+ dest_key,
+ request.GetContentLength());
+ continue; /// will retry
+ }
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1);
+ throw S3Exception(
+ outcome.GetError().GetErrorType(),
+ "Message: {}, Key: {}, Bucket: {}, Object size: {}",
+ outcome.GetError().GetMessage(),
+ dest_key,
+ dest_bucket,
+ request.GetContentLength());
+ }
+ }
+
+ void performMultipartUpload() { UploadHelper::performMultipartUpload(offset, size); }
+
+ std::unique_ptr<Aws::AmazonWebServiceRequest> fillUploadPartRequest(size_t part_number, size_t part_offset, size_t part_size) override
+ {
+ auto read_buffer = std::make_unique<LimitSeekableReadBuffer>(create_read_buffer(), part_offset, part_size);
+
+ /// Setup request.
+ auto request = std::make_unique<S3::UploadPartRequest>();
+ request->SetBucket(dest_bucket);
+ request->SetKey(dest_key);
+ request->SetPartNumber(static_cast<int>(part_number));
+ request->SetUploadId(multipart_upload_id);
+ request->SetContentLength(part_size);
+ request->SetBody(std::make_unique<StdStreamFromReadBuffer>(std::move(read_buffer), part_size));
+
+ /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840
+ request->SetContentType("binary/octet-stream");
+
+ return request;
+ }
+
+ String processUploadPartRequest(Aws::AmazonWebServiceRequest & request) override
+ {
+ auto & req = typeid_cast<S3::UploadPartRequest &>(request);
+
+ ProfileEvents::increment(ProfileEvents::S3UploadPart);
+ if (for_disk_s3)
+ ProfileEvents::increment(ProfileEvents::DiskS3UploadPart);
+
+ auto outcome = client_ptr->UploadPart(req);
+ if (!outcome.IsSuccess())
+ {
+ abortMultipartUpload();
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1);
+ throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
+ }
+
+ return outcome.GetResult().GetETag();
+ }
+ };
+
+ /// Helper class to help implementing copyS3File().
+ class CopyFileHelper : public UploadHelper
+ {
+ public:
+ CopyFileHelper(
+ const std::shared_ptr<const S3::Client> & client_ptr_,
+ const std::shared_ptr<const S3::Client> & client_with_long_timeout_ptr_,
+ const String & src_bucket_,
+ const String & src_key_,
+ size_t src_offset_,
+ size_t src_size_,
+ const String & dest_bucket_,
+ const String & dest_key_,
+ const S3Settings::RequestSettings & request_settings_,
+ const std::optional<std::map<String, String>> & object_metadata_,
+ ThreadPoolCallbackRunner<void> schedule_,
+ bool for_disk_s3_)
+ : UploadHelper(client_ptr_, client_with_long_timeout_ptr_, dest_bucket_, dest_key_, request_settings_, object_metadata_, schedule_, for_disk_s3_, &Poco::Logger::get("copyS3File"))
+ , src_bucket(src_bucket_)
+ , src_key(src_key_)
+ , offset(src_offset_)
+ , size(src_size_)
+ , supports_multipart_copy(client_ptr_->supportsMultiPartCopy())
+ {
+ }
+
+ void performCopy()
+ {
+ if (!supports_multipart_copy || size <= upload_settings.max_single_operation_copy_size)
+ performSingleOperationCopy();
+ else
+ performMultipartUploadCopy();
+
+ if (request_settings.check_objects_after_upload)
+ checkObjectAfterUpload();
+ }
+
+ private:
+ const String & src_bucket;
+ const String & src_key;
+ size_t offset;
+ size_t size;
+ bool supports_multipart_copy;
+
+ CreateReadBuffer getSourceObjectReadBuffer()
+ {
+ return [&]
+ {
+ return std::make_unique<ReadBufferFromS3>(client_ptr, src_bucket, src_key, "", request_settings, Context::getGlobalContextInstance()->getReadSettings());
+ };
+ }
+
+ void performSingleOperationCopy()
+ {
+ S3::CopyObjectRequest request;
+ fillCopyRequest(request);
+ processCopyRequest(request);
+ }
+
+ void fillCopyRequest(S3::CopyObjectRequest & request)
+ {
+ request.SetCopySource(src_bucket + "/" + src_key);
+ request.SetBucket(dest_bucket);
+ request.SetKey(dest_key);
+
+ if (object_metadata.has_value())
+ {
+ request.SetMetadata(object_metadata.value());
+ request.SetMetadataDirective(Aws::S3::Model::MetadataDirective::REPLACE);
+ }
+
+ const auto & storage_class_name = upload_settings.storage_class_name;
+ if (!storage_class_name.empty())
+ request.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(storage_class_name));
+
+ /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840
+ request.SetContentType("binary/octet-stream");
+
+ client_with_long_timeout_ptr->setKMSHeaders(request);
+ }
+
+ void processCopyRequest(const S3::CopyObjectRequest & request)
+ {
+ size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries, 1UL);
+ for (size_t retries = 1;; ++retries)
+ {
+ ProfileEvents::increment(ProfileEvents::S3CopyObject);
+ if (for_disk_s3)
+ ProfileEvents::increment(ProfileEvents::DiskS3CopyObject);
+
+ auto outcome = client_with_long_timeout_ptr->CopyObject(request);
+ if (outcome.IsSuccess())
+ {
+ LOG_TRACE(
+ log,
+ "Single operation copy has completed. Bucket: {}, Key: {}, Object size: {}",
+ dest_bucket,
+ dest_key,
+ size);
+ break;
+ }
+
+ if (outcome.GetError().GetExceptionName() == "EntityTooLarge" || outcome.GetError().GetExceptionName() == "InvalidRequest" || outcome.GetError().GetExceptionName() == "InvalidArgument")
+ {
+ if (!supports_multipart_copy)
+ {
+ LOG_INFO(log, "Multipart upload using copy is not supported, will try regular upload for Bucket: {}, Key: {}, Object size: {}",
+ dest_bucket,
+ dest_key,
+ size);
+ copyDataToS3File(
+ getSourceObjectReadBuffer(),
+ offset,
+ size,
+ client_ptr,
+ client_with_long_timeout_ptr,
+ dest_bucket,
+ dest_key,
+ request_settings,
+ object_metadata,
+ schedule,
+ for_disk_s3);
+ break;
+ }
+ else
+ {
+                        // Can't get here with MinIO; MinIO allows single-part uploads for large objects.
+ LOG_INFO(
+ log,
+ "Single operation copy failed with error {} for Bucket: {}, Key: {}, Object size: {}, will retry with multipart "
+ "upload copy",
+ outcome.GetError().GetExceptionName(),
+ dest_bucket,
+ dest_key,
+ size);
+
+ performMultipartUploadCopy();
+ break;
+ }
+ }
+
+ if ((outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY) && (retries < max_retries))
+ {
+                    /// TODO: Is this also true for copy requests?
+                    /// For an unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests.
+ LOG_INFO(
+ log,
+ "Single operation copy failed with NO_SUCH_KEY error for Bucket: {}, Key: {}, Object size: {}, will retry",
+ dest_bucket,
+ dest_key,
+ size);
+ continue; /// will retry
+ }
+
+ throw S3Exception(
+ outcome.GetError().GetErrorType(),
+ "Message: {}, Key: {}, Bucket: {}, Object size: {}",
+ outcome.GetError().GetMessage(),
+ dest_key,
+ dest_bucket,
+ size);
+ }
+ }
+
+ void performMultipartUploadCopy() { UploadHelper::performMultipartUpload(offset, size); }
+
+ std::unique_ptr<Aws::AmazonWebServiceRequest> fillUploadPartRequest(size_t part_number, size_t part_offset, size_t part_size) override
+ {
+ auto request = std::make_unique<S3::UploadPartCopyRequest>();
+
+ /// Make a copy request to copy a part.
+ request->SetCopySource(src_bucket + "/" + src_key);
+ request->SetBucket(dest_bucket);
+ request->SetKey(dest_key);
+ request->SetUploadId(multipart_upload_id);
+ request->SetPartNumber(static_cast<int>(part_number));
+ request->SetCopySourceRange(fmt::format("bytes={}-{}", part_offset, part_offset + part_size - 1));
+
+ return request;
+ }
+
+ String processUploadPartRequest(Aws::AmazonWebServiceRequest & request) override
+ {
+ auto & req = typeid_cast<S3::UploadPartCopyRequest &>(request);
+
+ ProfileEvents::increment(ProfileEvents::S3UploadPartCopy);
+ if (for_disk_s3)
+ ProfileEvents::increment(ProfileEvents::DiskS3UploadPartCopy);
+
+ auto outcome = client_with_long_timeout_ptr->UploadPartCopy(req);
+ if (!outcome.IsSuccess())
+ {
+ abortMultipartUpload();
+ throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
+ }
+
+ return outcome.GetResult().GetCopyPartResult().GetETag();
+ }
+ };
+}
+
+
+void copyDataToS3File(
+ const std::function<std::unique_ptr<SeekableReadBuffer>()> & create_read_buffer,
+ size_t offset,
+ size_t size,
+ const std::shared_ptr<const S3::Client> & dest_s3_client,
+ const std::shared_ptr<const S3::Client> & dest_s3_client_with_long_timeout,
+ const String & dest_bucket,
+ const String & dest_key,
+ const S3Settings::RequestSettings & settings,
+ const std::optional<std::map<String, String>> & object_metadata,
+ ThreadPoolCallbackRunner<void> schedule,
+ bool for_disk_s3)
+{
+ CopyDataToFileHelper helper{create_read_buffer, offset, size, dest_s3_client, dest_s3_client_with_long_timeout, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_s3};
+ helper.performCopy();
+}
+
+
+void copyS3File(
+ const std::shared_ptr<const S3::Client> & s3_client,
+ const std::shared_ptr<const S3::Client> & s3_client_with_long_timeout,
+ const String & src_bucket,
+ const String & src_key,
+ size_t src_offset,
+ size_t src_size,
+ const String & dest_bucket,
+ const String & dest_key,
+ const S3Settings::RequestSettings & settings,
+ const std::optional<std::map<String, String>> & object_metadata,
+ ThreadPoolCallbackRunner<void> schedule,
+ bool for_disk_s3)
+{
+ if (settings.allow_native_copy)
+ {
+ CopyFileHelper helper{s3_client, s3_client_with_long_timeout, src_bucket, src_key, src_offset, src_size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_s3};
+ helper.performCopy();
+ }
+ else
+ {
+ auto create_read_buffer = [&]
+ {
+ return std::make_unique<ReadBufferFromS3>(s3_client, src_bucket, src_key, "", settings, Context::getGlobalContextInstance()->getReadSettings());
+ };
+ copyDataToS3File(create_read_buffer, src_offset, src_size, s3_client, s3_client_with_long_timeout, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_s3);
+ }
+}
+
+}
+
+#endif
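
The part-size selection in calculatePartSize() above is ceiling-division arithmetic in three steps; this standalone sketch re-derives it with hypothetical AWS-like limits (the constants are illustrative, not taken from the settings code):

    #include <cstddef>
    #include <cstdio>

    // Start from the minimum part size, grow the part if the part count would
    // exceed the limit, then clamp to the maximum part size; any remaining
    // inconsistency is reported as a configuration error in the original code.
    size_t choosePartSize(size_t total_size, size_t min_part, size_t max_part, size_t max_parts)
    {
        size_t part_size = min_part;
        size_t num_parts = (total_size + part_size - 1) / part_size;
        if (num_parts > max_parts)
        {
            part_size = (total_size + max_parts - 1) / max_parts;
            num_parts = (total_size + part_size - 1) / part_size;
        }
        if (part_size > max_part)
            part_size = max_part;
        return part_size;
    }

    int main()
    {
        // 100 GiB with 16 MiB min part, 5 GiB max part, at most 10000 parts:
        // 6400 parts of 16 MiB fit under every limit, so the minimum is kept.
        printf("%zu\n", choosePartSize(100ULL << 30, 16ULL << 20, 5ULL << 30, 10000));
        return 0;
    }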
diff --git a/contrib/clickhouse/src/IO/S3/copyS3File.h b/contrib/clickhouse/src/IO/S3/copyS3File.h
new file mode 100644
index 0000000000..b39b7469eb
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/copyS3File.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include <Storages/StorageS3Settings.h>
+#include <Interpreters/threadPoolCallbackRunner.h>
+#include <base/types.h>
+#include <functional>
+#include <memory>
+
+
+namespace DB
+{
+class SeekableReadBuffer;
+
+using CreateReadBuffer = std::function<std::unique_ptr<SeekableReadBuffer>()>;
+
+/// Copies a file from S3 to S3.
+/// The same functionality can be achieved by using the function copyData() and the classes ReadBufferFromS3 and WriteBufferFromS3;
+/// however, copyS3File() is faster and uses less network traffic and memory.
+/// The parameters `src_offset` and `src_size` specify a part in the source to copy.
+///
+/// Note that it tries to copy the file using native copy (CopyObject), but if native copy
+/// has been disabled (via settings.allow_native_copy) or the request failed
+/// with a known issue, it falls back to read-write copy
+/// (copyDataToS3File()).
+///
+/// s3_client_with_long_timeout (may be equal to s3_client) is used for native copy and
+/// CompleteMultipartUpload requests. These requests need a longer timeout because S3 servers often
+/// block on them for multiple seconds without sending or receiving data from us (perhaps the servers
+/// are copying data internally, or throttling).
+void copyS3File(
+ const std::shared_ptr<const S3::Client> & s3_client,
+ const std::shared_ptr<const S3::Client> & s3_client_with_long_timeout,
+ const String & src_bucket,
+ const String & src_key,
+ size_t src_offset,
+ size_t src_size,
+ const String & dest_bucket,
+ const String & dest_key,
+ const S3Settings::RequestSettings & settings,
+ const std::optional<std::map<String, String>> & object_metadata = std::nullopt,
+ ThreadPoolCallbackRunner<void> schedule_ = {},
+ bool for_disk_s3 = false);
+
+/// Copies data from any seekable source to S3.
+/// The same functionality can be done by using the function copyData() and the class WriteBufferFromS3
+/// however copyDataToS3File() is faster and uses less memory.
+/// The callback `create_read_buffer` can be called from multiple threads in parallel, so it should be thread-safe.
+/// The parameters `offset` and `size` specify a part in the source to copy.
+void copyDataToS3File(
+ const CreateReadBuffer & create_read_buffer,
+ size_t offset,
+ size_t size,
+ const std::shared_ptr<const S3::Client> & dest_s3_client,
+ const std::shared_ptr<const S3::Client> & dest_s3_client_with_long_timeout,
+ const String & dest_bucket,
+ const String & dest_key,
+ const S3Settings::RequestSettings & settings,
+ const std::optional<std::map<String, String>> & object_metadata = std::nullopt,
+ ThreadPoolCallbackRunner<void> schedule_ = {},
+ bool for_disk_s3 = false);
+
+}
+
+#endif
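
A hedged sketch of a call site for the declarations above; `client`, `settings` and `src_size` are assumed to be provided by the surrounding storage code and are not constructed here:

    #include <IO/S3/copyS3File.h>

    // Copies a whole object, reusing one client for both roles.
    void copyWholeObject(
        const std::shared_ptr<const DB::S3::Client> & client,
        const DB::S3Settings::RequestSettings & settings,
        size_t src_size)
    {
        DB::copyS3File(
            client, client,               // the same client doubles as the long-timeout one
            "src-bucket", "src/key.bin",
            0, src_size,                  // offset 0, whole object
            "dst-bucket", "dst/key.bin",
            settings);                    // metadata, schedule and for_disk_s3 keep their defaults
    }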
diff --git a/contrib/clickhouse/src/IO/S3/getObjectInfo.cpp b/contrib/clickhouse/src/IO/S3/getObjectInfo.cpp
new file mode 100644
index 0000000000..88f79f8d8d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/getObjectInfo.cpp
@@ -0,0 +1,144 @@
+#include <IO/S3/getObjectInfo.h>
+
+#if USE_AWS_S3
+
+namespace ErrorCodes
+{
+ extern const int S3_ERROR;
+}
+
+
+namespace ProfileEvents
+{
+ extern const Event S3GetObject;
+ extern const Event S3GetObjectAttributes;
+ extern const Event S3HeadObject;
+ extern const Event DiskS3GetObject;
+ extern const Event DiskS3GetObjectAttributes;
+ extern const Event DiskS3HeadObject;
+}
+
+
+namespace DB::S3
+{
+
+namespace
+{
+ Aws::S3::Model::HeadObjectOutcome headObject(
+ const S3::Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3)
+ {
+ ProfileEvents::increment(ProfileEvents::S3HeadObject);
+ if (for_disk_s3)
+ ProfileEvents::increment(ProfileEvents::DiskS3HeadObject);
+
+ S3::HeadObjectRequest req;
+ req.SetBucket(bucket);
+ req.SetKey(key);
+
+ if (!version_id.empty())
+ req.SetVersionId(version_id);
+
+ return client.HeadObject(req);
+ }
+
+ /// Performs a request to get the size and last modification time of an object.
+ std::pair<std::optional<ObjectInfo>, Aws::S3::S3Error> tryGetObjectInfo(
+ const S3::Client & client, const String & bucket, const String & key, const String & version_id,
+ const S3Settings::RequestSettings & /*request_settings*/, bool with_metadata, bool for_disk_s3)
+ {
+ auto outcome = headObject(client, bucket, key, version_id, for_disk_s3);
+ if (!outcome.IsSuccess())
+ return {std::nullopt, outcome.GetError()};
+
+ const auto & result = outcome.GetResult();
+ ObjectInfo object_info;
+ object_info.size = static_cast<size_t>(result.GetContentLength());
+ object_info.last_modification_time = result.GetLastModified().Millis() / 1000;
+
+ if (with_metadata)
+ object_info.metadata = result.GetMetadata();
+
+ return {object_info, {}};
+ }
+}
+
+
+bool isNotFoundError(Aws::S3::S3Errors error)
+{
+ return error == Aws::S3::S3Errors::RESOURCE_NOT_FOUND || error == Aws::S3::S3Errors::NO_SUCH_KEY;
+}
+
+ObjectInfo getObjectInfo(
+ const S3::Client & client,
+ const String & bucket,
+ const String & key,
+ const String & version_id,
+ const S3Settings::RequestSettings & request_settings,
+ bool with_metadata,
+ bool for_disk_s3,
+ bool throw_on_error)
+{
+ auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, request_settings, with_metadata, for_disk_s3);
+ if (object_info)
+ {
+ return *object_info;
+ }
+ else if (throw_on_error)
+ {
+ throw S3Exception(error.GetErrorType(),
+ "Failed to get object info: {}. HTTP response code: {}",
+ error.GetMessage(), static_cast<size_t>(error.GetResponseCode()));
+ }
+ return {};
+}
+
+size_t getObjectSize(
+ const S3::Client & client,
+ const String & bucket,
+ const String & key,
+ const String & version_id,
+ const S3Settings::RequestSettings & request_settings,
+ bool for_disk_s3,
+ bool throw_on_error)
+{
+ return getObjectInfo(client, bucket, key, version_id, request_settings, {}, for_disk_s3, throw_on_error).size;
+}
+
+bool objectExists(
+ const S3::Client & client,
+ const String & bucket,
+ const String & key,
+ const String & version_id,
+ const S3Settings::RequestSettings & request_settings,
+ bool for_disk_s3)
+{
+ auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, request_settings, {}, for_disk_s3);
+ if (object_info)
+ return true;
+
+ if (isNotFoundError(error.GetErrorType()))
+ return false;
+
+ throw S3Exception(error.GetErrorType(),
+ "Failed to check existence of key {} in bucket {}: {}",
+ key, bucket, error.GetMessage());
+}
+
+void checkObjectExists(
+ const S3::Client & client,
+ const String & bucket,
+ const String & key,
+ const String & version_id,
+ const S3Settings::RequestSettings & request_settings,
+ bool for_disk_s3,
+ std::string_view description)
+{
+ auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, request_settings, {}, for_disk_s3);
+ if (object_info)
+ return;
+ throw S3Exception(error.GetErrorType(), "{}Object {} in bucket {} suddenly disappeared: {}",
+ (description.empty() ? "" : (String(description) + ": ")), key, bucket, error.GetMessage());
+}
+}
+
+#endif
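
tryGetObjectInfo() above returns a (value, error) pair so each caller can decide whether "not found" is fatal; this dependency-free sketch restates that pattern with placeholder types (it models the control flow only, not the real API):

    #include <cstddef>
    #include <optional>
    #include <stdexcept>
    #include <string>
    #include <utility>

    struct Info { size_t size = 0; };                       // stands in for ObjectInfo
    struct Err { bool not_found = false; std::string message; };

    std::pair<std::optional<Info>, Err> tryGet(bool exists)
    {
        if (exists)
            return {Info{42}, {}};
        return {std::nullopt, Err{true, "NoSuchKey"}};
    }

    bool existsLikeObjectExists(bool simulated)
    {
        auto [info, err] = tryGet(simulated);
        if (info)
            return true;
        if (err.not_found)                      // objectExists(): not-found is an answer
            return false;
        throw std::runtime_error(err.message);  // any other error propagates
    }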
diff --git a/contrib/clickhouse/src/IO/S3/getObjectInfo.h b/contrib/clickhouse/src/IO/S3/getObjectInfo.h
new file mode 100644
index 0000000000..8804a9494e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3/getObjectInfo.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+#include <Storages/StorageS3Settings.h>
+#include <base/types.h>
+#include <IO/S3/Client.h>
+
+
+namespace DB::S3
+{
+
+struct ObjectInfo
+{
+ size_t size = 0;
+ time_t last_modification_time = 0;
+
+ std::map<String, String> metadata = {}; /// Set only if getObjectInfo() is called with `with_metadata = true`.
+};
+
+ObjectInfo getObjectInfo(
+ const S3::Client & client,
+ const String & bucket,
+ const String & key,
+ const String & version_id = {},
+ const S3Settings::RequestSettings & request_settings = {},
+ bool with_metadata = false,
+ bool for_disk_s3 = false,
+ bool throw_on_error = true);
+
+size_t getObjectSize(
+ const S3::Client & client,
+ const String & bucket,
+ const String & key,
+ const String & version_id = {},
+ const S3Settings::RequestSettings & request_settings = {},
+ bool for_disk_s3 = false,
+ bool throw_on_error = true);
+
+bool objectExists(
+ const S3::Client & client,
+ const String & bucket,
+ const String & key,
+ const String & version_id = {},
+ const S3Settings::RequestSettings & request_settings = {},
+ bool for_disk_s3 = false);
+
+/// Throws an exception if a specified object doesn't exist. `description` is used as a part of the error message.
+void checkObjectExists(
+ const S3::Client & client,
+ const String & bucket,
+ const String & key,
+ const String & version_id = {},
+ const S3Settings::RequestSettings & request_settings = {},
+ bool for_disk_s3 = false,
+ std::string_view description = {});
+
+bool isNotFoundError(Aws::S3::S3Errors error);
+
+}
+
+#endif
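
A hedged sketch of a caller relying on the defaulted arguments above; `client` is assumed to be a configured DB::S3::Client obtained elsewhere:

    #include <IO/S3/getObjectInfo.h>

    // throw_on_error = false turns a failed HEAD into a default ObjectInfo (size 0).
    size_t sizeOrZero(const DB::S3::Client & client)
    {
        return DB::S3::getObjectInfo(
            client, "my-bucket", "path/key",
            /*version_id=*/ {}, /*request_settings=*/ {},
            /*with_metadata=*/ false, /*for_disk_s3=*/ false, /*throw_on_error=*/ false).size;
    }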
diff --git a/contrib/clickhouse/src/IO/S3Common.cpp b/contrib/clickhouse/src/IO/S3Common.cpp
new file mode 100644
index 0000000000..115877530f
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3Common.cpp
@@ -0,0 +1,178 @@
+#include <IO/S3Common.h>
+
+#include <Common/Exception.h>
+#include <Poco/Util/AbstractConfiguration.h>
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+# include <Common/quoteString.h>
+
+# include <IO/WriteBufferFromString.h>
+# include <IO/HTTPHeaderEntries.h>
+# include <Storages/StorageS3Settings.h>
+
+# include <IO/S3/PocoHTTPClientFactory.h>
+# include <IO/S3/PocoHTTPClient.h>
+# include <IO/S3/Client.h>
+# include <IO/S3/URI.h>
+# include <IO/S3/Requests.h>
+# include <IO/S3/Credentials.h>
+# include <Common/logger_useful.h>
+
+# include <fstream>
+
+namespace ProfileEvents
+{
+ extern const Event S3GetObjectAttributes;
+ extern const Event S3GetObjectMetadata;
+ extern const Event S3HeadObject;
+ extern const Event DiskS3GetObjectAttributes;
+ extern const Event DiskS3GetObjectMetadata;
+ extern const Event DiskS3HeadObject;
+}
+
+namespace DB
+{
+
+bool S3Exception::isRetryableError() const
+{
+    /// This list looks quite conservative; add more codes if you wish.
+ static const std::unordered_set<Aws::S3::S3Errors> unretryable_errors = {
+ Aws::S3::S3Errors::NO_SUCH_KEY,
+ Aws::S3::S3Errors::ACCESS_DENIED,
+ Aws::S3::S3Errors::INVALID_ACCESS_KEY_ID,
+ Aws::S3::S3Errors::INVALID_SIGNATURE,
+ Aws::S3::S3Errors::NO_SUCH_UPLOAD,
+ Aws::S3::S3Errors::NO_SUCH_BUCKET,
+ };
+
+ return !unretryable_errors.contains(code);
+}
+
+}
+
+namespace DB::ErrorCodes
+{
+ extern const int S3_ERROR;
+}
+
+#endif
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int INVALID_CONFIG_PARAMETER;
+}
+
+namespace S3
+{
+
+HTTPHeaderEntries getHTTPHeaders(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config)
+{
+ HTTPHeaderEntries headers;
+ Poco::Util::AbstractConfiguration::Keys subconfig_keys;
+ config.keys(config_elem, subconfig_keys);
+ for (const std::string & subkey : subconfig_keys)
+ {
+ if (subkey.starts_with("header"))
+ {
+ auto header_str = config.getString(config_elem + "." + subkey);
+ auto delimiter = header_str.find(':');
+ if (delimiter == std::string::npos)
+ throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Malformed s3 header value");
+ headers.emplace_back(header_str.substr(0, delimiter), header_str.substr(delimiter + 1, String::npos));
+ }
+ }
+ return headers;
+}
+
+ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config)
+{
+ ServerSideEncryptionKMSConfig sse_kms_config;
+
+ if (config.has(config_elem + ".server_side_encryption_kms_key_id"))
+ sse_kms_config.key_id = config.getString(config_elem + ".server_side_encryption_kms_key_id");
+
+ if (config.has(config_elem + ".server_side_encryption_kms_encryption_context"))
+ sse_kms_config.encryption_context = config.getString(config_elem + ".server_side_encryption_kms_encryption_context");
+
+ if (config.has(config_elem + ".server_side_encryption_kms_bucket_key_enabled"))
+ sse_kms_config.bucket_key_enabled = config.getBool(config_elem + ".server_side_encryption_kms_bucket_key_enabled");
+
+ return sse_kms_config;
+}
+
+AuthSettings AuthSettings::loadFromConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config)
+{
+ auto access_key_id = config.getString(config_elem + ".access_key_id", "");
+ auto secret_access_key = config.getString(config_elem + ".secret_access_key", "");
+ auto region = config.getString(config_elem + ".region", "");
+ auto server_side_encryption_customer_key_base64 = config.getString(config_elem + ".server_side_encryption_customer_key_base64", "");
+
+ std::optional<bool> use_environment_credentials;
+ if (config.has(config_elem + ".use_environment_credentials"))
+ use_environment_credentials = config.getBool(config_elem + ".use_environment_credentials");
+
+ std::optional<bool> use_insecure_imds_request;
+ if (config.has(config_elem + ".use_insecure_imds_request"))
+ use_insecure_imds_request = config.getBool(config_elem + ".use_insecure_imds_request");
+
+ std::optional<uint64_t> expiration_window_seconds;
+ if (config.has(config_elem + ".expiration_window_seconds"))
+ expiration_window_seconds = config.getUInt64(config_elem + ".expiration_window_seconds");
+
+ std::optional<bool> no_sign_request;
+ if (config.has(config_elem + ".no_sign_request"))
+ no_sign_request = config.getBool(config_elem + ".no_sign_request");
+
+ HTTPHeaderEntries headers = getHTTPHeaders(config_elem, config);
+ ServerSideEncryptionKMSConfig sse_kms_config = getSSEKMSConfig(config_elem, config);
+
+ return AuthSettings
+ {
+ std::move(access_key_id), std::move(secret_access_key),
+ std::move(region),
+ std::move(server_side_encryption_customer_key_base64),
+ std::move(sse_kms_config),
+ std::move(headers),
+ use_environment_credentials,
+ use_insecure_imds_request,
+ expiration_window_seconds,
+ no_sign_request
+ };
+}
+
+
+void AuthSettings::updateFrom(const AuthSettings & from)
+{
+    /// Update, with an emptiness check, only the parameters which can be passed
+    /// not only from the config but also via AST.
+
+ if (!from.access_key_id.empty())
+ access_key_id = from.access_key_id;
+ if (!from.secret_access_key.empty())
+ secret_access_key = from.secret_access_key;
+
+ headers = from.headers;
+ region = from.region;
+ server_side_encryption_customer_key_base64 = from.server_side_encryption_customer_key_base64;
+ server_side_encryption_kms_config = from.server_side_encryption_kms_config;
+
+ if (from.use_environment_credentials.has_value())
+ use_environment_credentials = from.use_environment_credentials;
+
+ if (from.use_insecure_imds_request.has_value())
+ use_insecure_imds_request = from.use_insecure_imds_request;
+
+ if (from.expiration_window_seconds.has_value())
+ expiration_window_seconds = from.expiration_window_seconds;
+
+ if (from.no_sign_request.has_value())
+ no_sign_request = *from.no_sign_request;
+}
+
+}
+}
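
Each <header> entry handled by getHTTPHeaders() above is split at the first colon; this standalone sketch restates that split. Note that the value keeps any leading space, exactly as in the original:

    #include <stdexcept>
    #include <string>
    #include <utility>

    std::pair<std::string, std::string> splitHeaderEntry(const std::string & header_str)
    {
        auto delimiter = header_str.find(':');
        if (delimiter == std::string::npos)
            throw std::invalid_argument("Malformed s3 header value");
        return {header_str.substr(0, delimiter), header_str.substr(delimiter + 1)};
    }
    // splitHeaderEntry("x-amz-acl: public-read") -> {"x-amz-acl", " public-read"}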
diff --git a/contrib/clickhouse/src/IO/S3Common.h b/contrib/clickhouse/src/IO/S3Common.h
new file mode 100644
index 0000000000..881edfcc9b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/S3Common.h
@@ -0,0 +1,98 @@
+#pragma once
+
+#include <IO/S3/Client.h>
+#include <IO/S3/PocoHTTPClient.h>
+#include <IO/HTTPHeaderEntries.h>
+
+#include <string>
+#include <optional>
+
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include <base/types.h>
+#include <Common/Exception.h>
+#include <Common/Throttler_fwd.h>
+
+#include <IO/S3/URI.h>
+
+#include <aws/core/Aws.h>
+#include <aws/s3/S3Errors.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int S3_ERROR;
+}
+
+class RemoteHostFilter;
+
+class S3Exception : public Exception
+{
+public:
+
+ // Format message with fmt::format, like the logging functions.
+ template <typename... Args>
+ S3Exception(Aws::S3::S3Errors code_, fmt::format_string<Args...> fmt, Args &&... args)
+ : Exception(fmt::format(fmt, std::forward<Args>(args)...), ErrorCodes::S3_ERROR)
+ , code(code_)
+ {
+ }
+
+ S3Exception(const std::string & msg, Aws::S3::S3Errors code_)
+ : Exception(msg, ErrorCodes::S3_ERROR)
+ , code(code_)
+ {}
+
+ Aws::S3::S3Errors getS3ErrorCode() const
+ {
+ return code;
+ }
+
+ bool isRetryableError() const;
+
+private:
+ Aws::S3::S3Errors code;
+};
+}
+
+#endif
+
+namespace Poco::Util
+{
+ class AbstractConfiguration;
+}
+
+namespace DB::S3
+{
+
+HTTPHeaderEntries getHTTPHeaders(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config);
+
+ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config);
+
+struct AuthSettings
+{
+ static AuthSettings loadFromConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config);
+
+ std::string access_key_id;
+ std::string secret_access_key;
+ std::string region;
+ std::string server_side_encryption_customer_key_base64;
+ ServerSideEncryptionKMSConfig server_side_encryption_kms_config;
+
+ HTTPHeaderEntries headers;
+
+ std::optional<bool> use_environment_credentials;
+ std::optional<bool> use_insecure_imds_request;
+ std::optional<uint64_t> expiration_window_seconds;
+ std::optional<bool> no_sign_request;
+
+ bool operator==(const AuthSettings & other) const = default;
+
+ void updateFrom(const AuthSettings & from);
+};
+
+}
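
A hedged sketch of how a caller might drive retries off S3Exception::isRetryableError() from the header above; the policy itself (three attempts, no backoff) is illustrative:

    #include <IO/S3Common.h>

    template <typename F>
    auto withS3Retries(F && f, size_t max_attempts = 3)
    {
        for (size_t attempt = 1;; ++attempt)
        {
            try
            {
                return f();
            }
            catch (const DB::S3Exception & e)
            {
                // Unretryable codes (NO_SUCH_KEY, ACCESS_DENIED, ...) propagate at once.
                if (!e.isRetryableError() || attempt == max_attempts)
                    throw;
            }
        }
    }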
diff --git a/contrib/clickhouse/src/IO/SchedulerNodeFactory.h b/contrib/clickhouse/src/IO/SchedulerNodeFactory.h
new file mode 100644
index 0000000000..5c31534a9b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SchedulerNodeFactory.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <Common/ErrorCodes.h>
+#include <Common/Exception.h>
+
+#include <IO/ISchedulerNode.h>
+
+#include <Poco/Util/AbstractConfiguration.h>
+
+#include <boost/noncopyable.hpp>
+
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int INVALID_SCHEDULER_NODE;
+}
+
+class SchedulerNodeFactory : private boost::noncopyable
+{
+public:
+ static SchedulerNodeFactory & instance()
+ {
+ static SchedulerNodeFactory ret;
+ return ret;
+ }
+
+ SchedulerNodePtr get(const String & name, EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config, const String & config_prefix)
+ {
+ std::lock_guard lock{mutex};
+ if (auto iter = methods.find(name); iter != methods.end())
+ return iter->second(event_queue, config, config_prefix);
+ throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Unknown scheduler node type: {}", name);
+ }
+
+ template <class TDerived>
+ void registerMethod(const String & name)
+ {
+ std::lock_guard lock{mutex};
+ methods[name] = [] (EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config, const String & config_prefix)
+ {
+ return std::make_shared<TDerived>(event_queue, config, config_prefix);
+ };
+ }
+
+private:
+ std::mutex mutex;
+ using Method = std::function<SchedulerNodePtr(EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config, const String & config_prefix)>;
+ std::unordered_map<String, Method> methods;
+};
+
+}
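
A hedged usage sketch for the factory above; FairPolicy is an assumed node type exposing the (EventQueue *, config, prefix) constructor that registerMethod() requires:

    #include <IO/SchedulerNodeFactory.h>

    // Registration is typically done once at startup.
    void registerNodes()
    {
        DB::SchedulerNodeFactory::instance().registerMethod<DB::FairPolicy>("fair");
    }

    DB::SchedulerNodePtr makeNode(DB::EventQueue * queue, const Poco::Util::AbstractConfiguration & config)
    {
        // Throws INVALID_SCHEDULER_NODE if "fair" was never registered.
        return DB::SchedulerNodeFactory::instance().get("fair", queue, config, "resources.r1");
    }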
diff --git a/contrib/clickhouse/src/IO/SchedulerRoot.h b/contrib/clickhouse/src/IO/SchedulerRoot.h
new file mode 100644
index 0000000000..f9af2099b8
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SchedulerRoot.h
@@ -0,0 +1,250 @@
+#pragma once
+
+#include <base/defines.h>
+
+#include <Common/Stopwatch.h>
+#include <Common/ThreadPool.h>
+
+#include <IO/ISchedulerNode.h>
+#include <IO/ISchedulerConstraint.h>
+
+#include <Poco/Util/XMLConfiguration.h>
+
+#include <unordered_map>
+#include <map>
+#include <memory>
+#include <atomic>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int INVALID_SCHEDULER_NODE;
+}
+
+/*
+ * Resource scheduler root node with a dedicated thread.
+ * Immediate children correspond to different resources.
+ */
+class SchedulerRoot : public ISchedulerNode
+{
+private:
+ struct TResource
+ {
+ SchedulerNodePtr root;
+
+ // Intrusive cyclic list of active resources
+ TResource * next = nullptr;
+ TResource * prev = nullptr;
+
+ explicit TResource(const SchedulerNodePtr & root_)
+ : root(root_)
+ {
+ root->info.parent.ptr = this;
+ }
+
+ // Get pointer stored by ctor in info
+ static TResource * get(SchedulerNodeInfo & info)
+ {
+ return reinterpret_cast<TResource *>(info.parent.ptr);
+ }
+ };
+
+public:
+ SchedulerRoot()
+ : ISchedulerNode(&events)
+ {}
+
+ ~SchedulerRoot() override
+ {
+ stop();
+ }
+
+    /// Runs a separate scheduler thread
+ void start()
+ {
+ if (!scheduler.joinable())
+ scheduler = ThreadFromGlobalPool([this] { schedulerThread(); });
+ }
+
+    /// Joins the scheduler thread and executes every pending request iff graceful == true
+ void stop(bool graceful = true)
+ {
+ if (scheduler.joinable())
+ {
+ stop_flag.store(true);
+ events.enqueue([]{}); // just to wake up thread
+ scheduler.join();
+ if (graceful)
+ {
+ // Do the same cycle as schedulerThread() but never block, just exit instead
+ bool has_work = true;
+ while (has_work)
+ {
+ auto [request, _] = dequeueRequest();
+ if (request)
+ execute(request);
+ else
+ has_work = false;
+ while (events.tryProcess())
+ has_work = true;
+ }
+ }
+ }
+ }
+
+ bool equals(ISchedulerNode * other) override
+ {
+ if (auto * o = dynamic_cast<SchedulerRoot *>(other))
+ return true;
+ return false;
+ }
+
+ void attachChild(const SchedulerNodePtr & child) override
+ {
+ // Take ownership
+ assert(child->parent == nullptr);
+ if (auto [it, inserted] = children.emplace(child.get(), child); !inserted)
+ throw Exception(
+ ErrorCodes::INVALID_SCHEDULER_NODE,
+ "Can't add the same scheduler node twice");
+
+ // Attach
+ child->setParent(this);
+
+ // Activate child if required
+ if (child->isActive())
+ activateChild(child.get());
+ }
+
+ void removeChild(ISchedulerNode * child) override
+ {
+ if (auto iter = children.find(child); iter != children.end())
+ {
+ SchedulerNodePtr removed = iter->second.root;
+
+ // Deactivate if required
+ deactivate(&iter->second);
+
+ // Detach
+ removed->setParent(nullptr);
+
+ // Remove ownership
+ children.erase(iter);
+ }
+ }
+
+ ISchedulerNode * getChild(const String &) override
+ {
+ abort(); // scheduler is allowed to have multiple children with the same name
+ }
+
+ std::pair<ResourceRequest *, bool> dequeueRequest() override
+ {
+ if (current == nullptr) // No active resources
+ return {nullptr, false};
+
+ // Dequeue request from current resource
+ auto [request, resource_active] = current->root->dequeueRequest();
+ assert(request != nullptr);
+
+ // Deactivate resource if required
+ if (!resource_active)
+ deactivate(current);
+ else
+ current = current->next; // Just move round-robin pointer
+
+ return {request, current != nullptr};
+ }
+
+ bool isActive() override
+ {
+ return current != nullptr;
+ }
+
+ void activateChild(ISchedulerNode * child) override
+ {
+ activate(TResource::get(child->info));
+ }
+
+ void setParent(ISchedulerNode *) override
+ {
+ abort(); // scheduler must be the root and this function should not be called
+ }
+
+private:
+ void activate(TResource * value)
+ {
+ assert(value->next == nullptr && value->prev == nullptr);
+ if (current == nullptr) // No active children
+ {
+ current = value;
+ value->prev = value;
+ value->next = value;
+ }
+ else
+ {
+ current->prev->next = value;
+ value->prev = current->prev;
+ current->prev = value;
+ value->next = current;
+ }
+ }
+
+ void deactivate(TResource * value)
+ {
+ if (value->next == nullptr)
+ return; // Already deactivated
+ assert(current != nullptr);
+ if (current == value)
+ {
+ if (current->next == current) // We are going to remove the last active child
+ {
+ value->next = nullptr;
+ value->prev = nullptr;
+ current = nullptr;
+ return;
+ }
+ else // Just move current to next to avoid invalidation
+ current = current->next;
+ }
+ value->prev->next = value->next;
+ value->next->prev = value->prev;
+ value->prev = nullptr;
+ value->next = nullptr;
+ }
+
+private:
+ void schedulerThread()
+ {
+ while (!stop_flag.load())
+ {
+ // Dequeue and execute single request
+ auto [request, _] = dequeueRequest();
+ if (request)
+ execute(request);
+ else // No more requests -- block until any event happens
+ events.process();
+
+ // Process all events before dequeuing to ensure fair competition
+ while (events.tryProcess()) {}
+ }
+ }
+
+ void execute(ResourceRequest * request)
+ {
+ request->execute_ns = clock_gettime_ns();
+ request->execute();
+ }
+
+private:
+ TResource * current = nullptr; // round-robin pointer
+ std::unordered_map<ISchedulerNode *, TResource> children; // resources by pointer
+ std::atomic<bool> stop_flag = false;
+ EventQueue events;
+ ThreadFromGlobalPool scheduler;
+};
+
+}
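
The round-robin over active resources rests on the intrusive cyclic list spliced in activate()/deactivate(); this dependency-free sketch models just the list so the pointer manipulation is easier to follow:

    #include <cassert>

    struct Node { Node * next = nullptr; Node * prev = nullptr; };

    struct RoundRobin
    {
        Node * current = nullptr; // round-robin pointer, as in SchedulerRoot

        void activate(Node * value) // splice value in just before current
        {
            assert(!value->next && !value->prev);
            if (!current)
            {
                current = value;
                value->prev = value->next = value;
            }
            else
            {
                current->prev->next = value;
                value->prev = current->prev;
                current->prev = value;
                value->next = current;
            }
        }

        Node * pick() // return the current node and advance, as dequeueRequest() does
        {
            if (!current)
                return nullptr;
            Node * result = current;
            current = current->next;
            return result;
        }
    };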
diff --git a/contrib/clickhouse/src/IO/SeekableReadBuffer.cpp b/contrib/clickhouse/src/IO/SeekableReadBuffer.cpp
new file mode 100644
index 0000000000..b83e382db0
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SeekableReadBuffer.cpp
@@ -0,0 +1,109 @@
+#include <IO/SeekableReadBuffer.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int CANNOT_READ_FROM_ISTREAM;
+}
+
+namespace
+{
+ template <typename CustomData>
+ class SeekableReadBufferWrapper : public SeekableReadBuffer
+ {
+ public:
+ SeekableReadBufferWrapper(SeekableReadBuffer & in_, CustomData && custom_data_)
+ : SeekableReadBuffer(in_.buffer().begin(), in_.buffer().size(), in_.offset())
+ , in(in_)
+ , custom_data(std::move(custom_data_))
+ {
+ }
+
+ private:
+ SeekableReadBuffer & in;
+ CustomData custom_data;
+
+ bool nextImpl() override
+ {
+ in.position() = position();
+ if (!in.next())
+ {
+ set(in.position(), 0);
+ return false;
+ }
+ BufferBase::set(in.buffer().begin(), in.buffer().size(), in.offset());
+ return true;
+ }
+
+ off_t seek(off_t off, int whence) override
+ {
+ in.position() = position();
+ off_t new_pos = in.seek(off, whence);
+ BufferBase::set(in.buffer().begin(), in.buffer().size(), in.offset());
+ return new_pos;
+ }
+
+ off_t getPosition() override
+ {
+ in.position() = position();
+ return in.getPosition();
+ }
+ };
+}
+
+
+std::unique_ptr<SeekableReadBuffer> wrapSeekableReadBufferReference(SeekableReadBuffer & ref)
+{
+    return std::make_unique<SeekableReadBufferWrapper<std::nullptr_t>>(ref, nullptr);
+}
+
+std::unique_ptr<SeekableReadBuffer> wrapSeekableReadBufferPointer(SeekableReadBufferPtr ptr)
+{
+ return std::make_unique<SeekableReadBufferWrapper<SeekableReadBufferPtr>>(*ptr, SeekableReadBufferPtr{ptr});
+}
+
+size_t copyFromIStreamWithProgressCallback(std::istream & istr, char * to, size_t n, const std::function<bool(size_t)> & progress_callback, bool * out_cancelled)
+{
+ const size_t chunk = DBMS_DEFAULT_BUFFER_SIZE;
+ if (out_cancelled)
+ *out_cancelled = false;
+
+ size_t copied = 0;
+ while (copied < n)
+ {
+ size_t to_copy = std::min(chunk, n - copied);
+ istr.read(to + copied, to_copy);
+ size_t gcount = istr.gcount();
+
+ copied += gcount;
+
+ bool cancelled = false;
+ if (gcount && progress_callback)
+ cancelled = progress_callback(copied);
+
+ if (gcount != to_copy)
+ {
+ if (!istr.eof())
+ throw Exception(
+ ErrorCodes::CANNOT_READ_FROM_ISTREAM,
+ "{} at offset {}",
+ istr.fail() ? "Cannot read from istream" : "Unexpected state of istream",
+ copied);
+
+ break;
+ }
+
+ if (cancelled)
+ {
+ if (out_cancelled != nullptr)
+ *out_cancelled = true;
+ break;
+ }
+ }
+
+ return copied;
+}
+
+}
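
A hedged sketch of calling copyFromIStreamWithProgressCallback() with an in-memory stream; the 1 MiB cancellation threshold is illustrative:

    #include <IO/SeekableReadBuffer.h>
    #include <sstream>
    #include <string>
    #include <vector>

    size_t readWithCancellation(std::string data, size_t limit)
    {
        std::istringstream istr(std::move(data));
        std::vector<char> out(limit);
        bool cancelled = false;
        size_t copied = DB::copyFromIStreamWithProgressCallback(
            istr, out.data(), limit,
            [](size_t done) { return done > (1 << 20); }, // returning true cancels the read
            &cancelled);
        return copied; // may be < limit on eof or cancellation
    }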
diff --git a/contrib/clickhouse/src/IO/SeekableReadBuffer.h b/contrib/clickhouse/src/IO/SeekableReadBuffer.h
new file mode 100644
index 0000000000..5770948be2
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SeekableReadBuffer.h
@@ -0,0 +1,103 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+#include <IO/WithFileSize.h>
+#include <optional>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int NOT_IMPLEMENTED;
+}
+
+
+class SeekableReadBuffer : public ReadBuffer
+{
+public:
+ SeekableReadBuffer(Position ptr, size_t size)
+ : ReadBuffer(ptr, size) {}
+ SeekableReadBuffer(Position ptr, size_t size, size_t offset)
+ : ReadBuffer(ptr, size, offset) {}
+
+ /**
+ * Shifts buffer current position to given offset.
+ * @param off Offset.
+ * @param whence Seek mode (@see SEEK_SET, @see SEEK_CUR).
+ * @return New position from the beginning of underlying buffer / file.
+ *
+ * What happens if you seek above the end of the file? Implementation-defined.
+ */
+ virtual off_t seek(off_t off, int whence) = 0;
+
+ /**
+     * Keep in mind that a seekable buffer may encounter eof() once and the working buffer
+     * may get into an inconsistent state. Don't forget to reset it on the first nextImpl()
+ * after seek().
+ */
+
+ /**
+     * @return Offset from the beginning of the underlying buffer / file corresponding to the current buffer position.
+ */
+ virtual off_t getPosition() = 0;
+
+ virtual String getInfoForLog() { return ""; }
+
+ virtual size_t getFileOffsetOfBufferEnd() const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFileOffsetOfBufferEnd() not implemented"); }
+
+ /// If true, setReadUntilPosition() guarantees that eof will be reported at the given position.
+ virtual bool supportsRightBoundedReads() const { return false; }
+
+ /// Returns true if seek() actually works, false if seek() will always throw (or make subsequent
+ /// nextImpl() calls throw).
+ ///
+ /// This is needed because:
+ /// * Sometimes there's no cheap way to know in advance whether the buffer is really seekable.
+ /// Specifically, HTTP read buffer needs to send a request to check whether the server
+ /// supports byte ranges.
+ /// * Sometimes when we create such buffer we don't know in advance whether we'll need it to be
+ /// seekable or not. So we don't want to pay the price for this check in advance.
+ virtual bool checkIfActuallySeekable() { return true; }
+
+ /// Unbuffered positional read.
+ /// Doesn't affect the buffer state (position, working_buffer, etc).
+ ///
+ /// `progress_callback` may be called periodically during the read, reporting that to[0..m-1]
+ /// has been filled. If it returns true, reading is stopped, and readBigAt() returns bytes read
+ /// so far. Called only from inside readBigAt(), from the same thread, with increasing m.
+ ///
+ /// Stops either after n bytes, or at end of file, or on exception. Returns number of bytes read.
+ /// If offset is past the end of file, may return 0 or throw exception.
+ ///
+ /// Caller needs to be careful:
+    /// * supportsReadAt() must be checked (called, and it must return true) before calling readBigAt().
+ /// Otherwise readBigAt() may crash.
+ /// * Thread safety: multiple readBigAt() calls may be performed in parallel.
+ /// But readBigAt() may not be called in parallel with any other methods
+ /// (e.g. next() or supportsReadAt()).
+ /// * Performance: there's no buffering. Each readBigAt() call typically translates into actual
+ /// IO operation (e.g. HTTP request). Don't use it for small adjacent reads.
+ virtual size_t readBigAt(char * /*to*/, size_t /*n*/, size_t /*offset*/, const std::function<bool(size_t m)> & /*progress_callback*/ = nullptr)
+ { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method readBigAt() not implemented"); }
+
+ /// Checks if readBigAt() is allowed. May be slow, may throw (e.g. it may do an HTTP request or an fstat).
+ virtual bool supportsReadAt() { return false; }
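+
+    /// A minimal usage sketch of readBigAt() (hypothetical caller code; `buf` stands for any
+    /// implementation whose supportsReadAt() returned true):
+    ///
+    ///     std::vector<char> data(1 << 20);
+    ///     size_t n = 0;
+    ///     if (buf.supportsReadAt())
+    ///         n = buf.readBigAt(data.data(), data.size(), /*offset=*/ 0,
+    ///             [](size_t m) { return m > 64 * 1024; });  /// cancel once 64 KiB is filled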
+
+    /// We do some tricks to avoid seek cost. E.g. we read more data and then ignore it (see remote_read_min_bytes_for_seek).
+ /// Sometimes however seek is basically free because underlying read buffer wasn't yet initialised (or re-initialised after reset).
+ virtual bool seekIsCheap() { return false; }
+};
+
+
+using SeekableReadBufferPtr = std::shared_ptr<SeekableReadBuffer>;
+
+/// Wraps a reference to a SeekableReadBuffer into an unique pointer to SeekableReadBuffer.
+/// This function is like wrapReadBufferReference() but for SeekableReadBuffer.
+std::unique_ptr<SeekableReadBuffer> wrapSeekableReadBufferReference(SeekableReadBuffer & ref);
+std::unique_ptr<SeekableReadBuffer> wrapSeekableReadBufferPointer(SeekableReadBufferPtr ptr);
+
+/// Helper for implementing readBigAt().
+size_t copyFromIStreamWithProgressCallback(std::istream & istr, char * to, size_t n, const std::function<bool(size_t)> & progress_callback, bool * out_cancelled = nullptr);
+
+}
diff --git a/contrib/clickhouse/src/IO/SharedThreadPools.cpp b/contrib/clickhouse/src/IO/SharedThreadPools.cpp
new file mode 100644
index 0000000000..6a0e953f0e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SharedThreadPools.cpp
@@ -0,0 +1,141 @@
+#include <IO/SharedThreadPools.h>
+#include <Common/CurrentMetrics.h>
+#include <Common/ThreadPool.h>
+#include <Core/Field.h>
+
+namespace CurrentMetrics
+{
+ extern const Metric IOThreads;
+ extern const Metric IOThreadsActive;
+ extern const Metric BackupsIOThreads;
+ extern const Metric BackupsIOThreadsActive;
+ extern const Metric MergeTreePartsLoaderThreads;
+ extern const Metric MergeTreePartsLoaderThreadsActive;
+ extern const Metric MergeTreePartsCleanerThreads;
+ extern const Metric MergeTreePartsCleanerThreadsActive;
+ extern const Metric MergeTreeOutdatedPartsLoaderThreads;
+ extern const Metric MergeTreeOutdatedPartsLoaderThreadsActive;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int LOGICAL_ERROR;
+}
+
+
+StaticThreadPool::StaticThreadPool(
+ const String & name_,
+ CurrentMetrics::Metric threads_metric_,
+ CurrentMetrics::Metric threads_active_metric_)
+ : name(name_)
+ , threads_metric(threads_metric_)
+ , threads_active_metric(threads_active_metric_)
+{
+}
+
+void StaticThreadPool::initialize(size_t max_threads, size_t max_free_threads, size_t queue_size)
+{
+ if (instance)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is initialized twice", name);
+
+    /// By default, enabling "turbo mode" won't affect the number of threads in any way
+ max_threads_turbo = max_threads;
+ max_threads_normal = max_threads;
+ instance = std::make_unique<ThreadPool>(
+ threads_metric,
+ threads_active_metric,
+ max_threads,
+ max_free_threads,
+ queue_size,
+ /* shutdown_on_exception= */ false);
+}
+
+void StaticThreadPool::reloadConfiguration(size_t max_threads, size_t max_free_threads, size_t queue_size)
+{
+ if (!instance)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is not initialized", name);
+
+ instance->setMaxThreads(turbo_mode_enabled > 0 ? max_threads_turbo : max_threads);
+ instance->setMaxFreeThreads(max_free_threads);
+ instance->setQueueSize(queue_size);
+}
+
+
+ThreadPool & StaticThreadPool::get()
+{
+ if (!instance)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is not initialized", name);
+
+ return *instance;
+}
+
+void StaticThreadPool::enableTurboMode()
+{
+ if (!instance)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is not initialized", name);
+
+ std::lock_guard lock(mutex);
+
+ ++turbo_mode_enabled;
+ if (turbo_mode_enabled == 1)
+ instance->setMaxThreads(max_threads_turbo);
+}
+
+void StaticThreadPool::disableTurboMode()
+{
+ if (!instance)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is not initialized", name);
+
+ std::lock_guard lock(mutex);
+
+ --turbo_mode_enabled;
+ if (turbo_mode_enabled == 0)
+ instance->setMaxThreads(max_threads_normal);
+}
+
+void StaticThreadPool::setMaxTurboThreads(size_t max_threads_turbo_)
+{
+ if (!instance)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "The {} is not initialized", name);
+
+ std::lock_guard lock(mutex);
+
+ max_threads_turbo = max_threads_turbo_;
+ if (turbo_mode_enabled > 0)
+ instance->setMaxThreads(max_threads_turbo);
+}
+
+StaticThreadPool & getIOThreadPool()
+{
+ static StaticThreadPool instance("IOThreadPool", CurrentMetrics::IOThreads, CurrentMetrics::IOThreadsActive);
+ return instance;
+}
+
+StaticThreadPool & getBackupsIOThreadPool()
+{
+ static StaticThreadPool instance("BackupsIOThreadPool", CurrentMetrics::BackupsIOThreads, CurrentMetrics::BackupsIOThreadsActive);
+ return instance;
+}
+
+StaticThreadPool & getActivePartsLoadingThreadPool()
+{
+ static StaticThreadPool instance("MergeTreePartsLoaderThreadPool", CurrentMetrics::MergeTreePartsLoaderThreads, CurrentMetrics::MergeTreePartsLoaderThreadsActive);
+ return instance;
+}
+
+StaticThreadPool & getPartsCleaningThreadPool()
+{
+ static StaticThreadPool instance("MergeTreePartsCleanerThreadPool", CurrentMetrics::MergeTreePartsCleanerThreads, CurrentMetrics::MergeTreePartsCleanerThreadsActive);
+ return instance;
+}
+
+StaticThreadPool & getOutdatedPartsLoadingThreadPool()
+{
+ static StaticThreadPool instance("MergeTreeOutdatedPartsLoaderThreadPool", CurrentMetrics::MergeTreeOutdatedPartsLoaderThreads, CurrentMetrics::MergeTreeOutdatedPartsLoaderThreadsActive);
+ return instance;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/SharedThreadPools.h b/contrib/clickhouse/src/IO/SharedThreadPools.h
new file mode 100644
index 0000000000..188a2a4f00
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SharedThreadPools.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <base/types.h>
+#include <Common/ThreadPool_fwd.h>
+#include <Common/CurrentMetrics.h>
+
+#include <cstdlib>
+#include <memory>
+#include <mutex>
+
+namespace DB
+{
+
+class StaticThreadPool
+{
+public:
+ StaticThreadPool(
+ const String & name_,
+ CurrentMetrics::Metric threads_metric_,
+ CurrentMetrics::Metric threads_active_metric_);
+
+ ThreadPool & get();
+
+ void initialize(size_t max_threads, size_t max_free_threads, size_t queue_size);
+ void reloadConfiguration(size_t max_threads, size_t max_free_threads, size_t queue_size);
+
+    /// At runtime we can increase the number of threads up to the specified limit.
+    /// This is needed to utilize as many resources as possible to accomplish some task.
+ void setMaxTurboThreads(size_t max_threads_turbo_);
+ void enableTurboMode();
+ void disableTurboMode();
+
+private:
+ const String name;
+ const CurrentMetrics::Metric threads_metric;
+ const CurrentMetrics::Metric threads_active_metric;
+
+ std::unique_ptr<ThreadPool> instance;
+ std::mutex mutex;
+ size_t max_threads_turbo = 0;
+ size_t max_threads_normal = 0;
+    /// If this counter is > 0, turbo mode is enabled
+ size_t turbo_mode_enabled = 0;
+};
+
+/// ThreadPool used for the IO.
+StaticThreadPool & getIOThreadPool();
+
+/// ThreadPool used for the Backup IO.
+StaticThreadPool & getBackupsIOThreadPool();
+
+/// ThreadPool used for the loading of Active data parts for MergeTree tables.
+StaticThreadPool & getActivePartsLoadingThreadPool();
+
+/// ThreadPool used for deleting data parts for MergeTree tables.
+StaticThreadPool & getPartsCleaningThreadPool();
+
+/// This ThreadPool is used for the loading of Outdated data parts for MergeTree tables.
+/// Normally we will just load Outdated data parts concurrently in background, but in
+/// case when we need to synchronously wait for the loading to be finished, we can increase
+/// the number of threads by calling enableTurboMode() :-)
+StaticThreadPool & getOutdatedPartsLoadingThreadPool();
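+
+/// A minimal usage sketch of the turbo-mode API above (hypothetical caller code):
+///
+///     auto & pool = getOutdatedPartsLoadingThreadPool();
+///     pool.enableTurboMode();   /// temporarily raises the limit to max_threads_turbo
+///     /// ... schedule jobs and synchronously wait for the loading to finish ...
+///     pool.disableTurboMode();  /// restores the limit to max_threads_normal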
+
+}
diff --git a/contrib/clickhouse/src/IO/SnappyReadBuffer.cpp b/contrib/clickhouse/src/IO/SnappyReadBuffer.cpp
new file mode 100644
index 0000000000..74a1784d6d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SnappyReadBuffer.cpp
@@ -0,0 +1,75 @@
+#include "clickhouse_config.h"
+
+#if USE_SNAPPY
+#include <memory>
+#include <fcntl.h>
+#include <sys/types.h>
+
+#include <snappy.h>
+
+#include <IO/copyData.h>
+#include <IO/WriteBufferFromString.h>
+#include <IO/WriteHelpers.h>
+
+#include "SnappyReadBuffer.h"
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int SNAPPY_UNCOMPRESS_FAILED;
+ extern const int SEEK_POSITION_OUT_OF_BOUND;
+}
+
+
+SnappyReadBuffer::SnappyReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t buf_size, char * existing_memory, size_t alignment)
+ : BufferWithOwnMemory<SeekableReadBuffer>(buf_size, existing_memory, alignment), in(std::move(in_))
+{
+}
+
+bool SnappyReadBuffer::nextImpl()
+{
+ if (compress_buffer.empty() && uncompress_buffer.empty())
+ {
+ WriteBufferFromString wb(compress_buffer);
+ copyData(*in, wb);
+
+ bool success = snappy::Uncompress(compress_buffer.data(), wb.count(), &uncompress_buffer);
+ if (!success)
+ {
+            throw Exception(ErrorCodes::SNAPPY_UNCOMPRESS_FAILED, "snappy uncompress failed");
+ }
+ BufferBase::set(const_cast<char *>(uncompress_buffer.data()), uncompress_buffer.size(), 0);
+ return true;
+ }
+ return false;
+}
+
+SnappyReadBuffer::~SnappyReadBuffer() = default;
+
+off_t SnappyReadBuffer::seek(off_t off, int whence)
+{
+ off_t new_pos;
+ if (whence == SEEK_SET)
+ new_pos = off;
+ else if (whence == SEEK_CUR)
+ new_pos = count() + off;
+ else
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Only SEEK_SET and SEEK_CUR seek modes allowed.");
+
+ working_buffer = internal_buffer;
+ if (new_pos < 0 || new_pos > off_t(working_buffer.size()))
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND,
+ "Cannot seek through buffer because seek position ({}) is out of bounds [0, {}]",
+ new_pos, working_buffer.size());
+ position() = working_buffer.begin() + new_pos;
+ return new_pos;
+}
+
+off_t SnappyReadBuffer::getPosition()
+{
+ return count();
+}
+
+}
+#endif
diff --git a/contrib/clickhouse/src/IO/SnappyReadBuffer.h b/contrib/clickhouse/src/IO/SnappyReadBuffer.h
new file mode 100644
index 0000000000..532fcb14e8
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SnappyReadBuffer.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_SNAPPY
+
+#include <IO/ReadBuffer.h>
+#include <IO/SeekableReadBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+
+namespace DB
+{
+class SnappyReadBuffer : public BufferWithOwnMemory<SeekableReadBuffer>
+{
+public:
+ explicit SnappyReadBuffer(
+ std::unique_ptr<ReadBuffer> in_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~SnappyReadBuffer() override;
+
+ bool nextImpl() override;
+ off_t seek(off_t off, int whence) override;
+ off_t getPosition() override;
+
+private:
+ std::unique_ptr<ReadBuffer> in;
+ String compress_buffer;
+ String uncompress_buffer;
+};
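+
+/// A minimal usage sketch (hypothetical caller code; readStringUntilEOF is assumed from
+/// ReadHelpers.h, and the input file is assumed to be a single snappy block):
+///
+///     auto file = std::make_unique<ReadBufferFromFile>("data.snappy");
+///     SnappyReadBuffer in(std::move(file));
+///     String decompressed;
+///     readStringUntilEOF(decompressed, in);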
+
+}
+#endif
diff --git a/contrib/clickhouse/src/IO/SnappyWriteBuffer.cpp b/contrib/clickhouse/src/IO/SnappyWriteBuffer.cpp
new file mode 100644
index 0000000000..4a27615f24
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SnappyWriteBuffer.cpp
@@ -0,0 +1,92 @@
+#include "clickhouse_config.h"
+
+#if USE_SNAPPY
+#include <cstring>
+
+#include <snappy.h>
+
+#include <Common/ErrorCodes.h>
+#include "SnappyWriteBuffer.h"
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int SNAPPY_COMPRESS_FAILED;
+}
+
+SnappyWriteBuffer::SnappyWriteBuffer(std::unique_ptr<WriteBuffer> out_, size_t buf_size, char * existing_memory, size_t alignment)
+ : BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment), out(std::move(out_))
+{
+}
+
+SnappyWriteBuffer::~SnappyWriteBuffer()
+{
+ finish();
+}
+
+void SnappyWriteBuffer::nextImpl()
+{
+ if (!offset())
+ {
+ return;
+ }
+
+ const char * in_data = reinterpret_cast<const char *>(working_buffer.begin());
+ size_t in_available = offset();
+ uncompress_buffer.append(in_data, in_available);
+}
+
+void SnappyWriteBuffer::finish()
+{
+ if (finished)
+ return;
+
+ try
+ {
+ finishImpl();
+ out->finalize();
+ finished = true;
+ }
+ catch (...)
+ {
+ /// Do not try to flush next time after exception.
+ out->position() = out->buffer().begin();
+ finished = true;
+ throw;
+ }
+}
+
+void SnappyWriteBuffer::finishImpl()
+{
+ next();
+
+ bool success = snappy::Compress(uncompress_buffer.data(), uncompress_buffer.size(), &compress_buffer);
+ if (!success)
+ {
+        throw Exception(ErrorCodes::SNAPPY_COMPRESS_FAILED, "snappy compress failed");
+ }
+
+ char * in_data = compress_buffer.data();
+ size_t in_available = compress_buffer.size();
+ char * out_data = nullptr;
+ size_t out_capacity = 0;
+ size_t len = 0;
+ while (in_available > 0)
+ {
+ out->nextIfAtEnd();
+ out_data = out->position();
+ out_capacity = out->buffer().end() - out->position();
+ len = in_available > out_capacity ? out_capacity : in_available;
+
+ memcpy(out_data, in_data, len);
+ in_data += len;
+ in_available -= len;
+ out->position() += len;
+ }
+}
+
+}
+
+#endif
+
diff --git a/contrib/clickhouse/src/IO/SnappyWriteBuffer.h b/contrib/clickhouse/src/IO/SnappyWriteBuffer.h
new file mode 100644
index 0000000000..73652f33a5
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SnappyWriteBuffer.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_SNAPPY
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/WriteBuffer.h>
+
+namespace DB
+{
+/// Performs compression using the snappy library and writes the compressed data to the underlying buffer.
+class SnappyWriteBuffer : public BufferWithOwnMemory<WriteBuffer>
+{
+public:
+ explicit SnappyWriteBuffer(
+ std::unique_ptr<WriteBuffer> out_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~SnappyWriteBuffer() override;
+
+ void finalizeImpl() override { finish(); }
+
+private:
+ void nextImpl() override;
+
+ void finishImpl();
+ void finish();
+
+ std::unique_ptr<WriteBuffer> out;
+ bool finished = false;
+
+ String uncompress_buffer;
+ String compress_buffer;
+};
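+
+/// A minimal usage sketch (hypothetical caller code): the payload is accumulated by nextImpl()
+/// and compressed as one snappy block when the buffer is finalized.
+///
+///     auto file = std::make_unique<WriteBufferFromFile>("data.snappy");
+///     SnappyWriteBuffer out(std::move(file));
+///     writeString("hello", out);
+///     out.finalize();  /// compresses and flushes to the underlying buffer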
+
+}
+
+#endif
+
diff --git a/contrib/clickhouse/src/IO/StdIStreamFromMemory.cpp b/contrib/clickhouse/src/IO/StdIStreamFromMemory.cpp
new file mode 100644
index 0000000000..3242a7e638
--- /dev/null
+++ b/contrib/clickhouse/src/IO/StdIStreamFromMemory.cpp
@@ -0,0 +1,62 @@
+#include <IO/StdIStreamFromMemory.h>
+
+namespace DB
+{
+
+StdIStreamFromMemory::MemoryBuf::MemoryBuf(char * begin_, size_t size_)
+ : begin(begin_)
+ , size(size_)
+{
+ this->setg(begin, begin, begin + size);
+}
+
+StdIStreamFromMemory::MemoryBuf::int_type StdIStreamFromMemory::MemoryBuf::underflow()
+{
+ if (gptr() < egptr())
+ return traits_type::to_int_type(*gptr());
+ return traits_type::eof();
+}
+
+StdIStreamFromMemory::MemoryBuf::pos_type
+StdIStreamFromMemory::MemoryBuf::seekoff(off_type off, std::ios_base::seekdir way,
+ std::ios_base::openmode mode)
+{
+ bool out_mode = (std::ios_base::out & mode) != 0;
+ if (out_mode)
+ return off_type(-1);
+
+ off_type ret(-1);
+
+ if (way == std::ios_base::beg)
+ ret = 0;
+ else if (way == std::ios_base::cur)
+ ret = gptr() - begin;
+ else if (way == std::ios_base::end)
+ ret = size;
+
+ if (ret == off_type(-1))
+ return ret;
+
+ ret += off;
+ if (!(ret >= 0 && size_t(ret) <= size))
+ return off_type(-1);
+
+ this->setg(begin, begin + ret, begin + size);
+
+ return pos_type(ret);
+}
+
+StdIStreamFromMemory::MemoryBuf::pos_type StdIStreamFromMemory::MemoryBuf::seekpos(pos_type sp,
+ std::ios_base::openmode mode)
+{
+ return seekoff(off_type(sp), std::ios_base::beg, mode);
+}
+
+StdIStreamFromMemory::StdIStreamFromMemory(char * begin_, size_t size_)
+ : std::iostream(nullptr)
+ , mem_buf(begin_, size_)
+{
+ init(&mem_buf);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/StdIStreamFromMemory.h b/contrib/clickhouse/src/IO/StdIStreamFromMemory.h
new file mode 100644
index 0000000000..64b147fd29
--- /dev/null
+++ b/contrib/clickhouse/src/IO/StdIStreamFromMemory.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <iostream>
+
+namespace DB
+{
+
+/// StdIStreamFromMemory is used in WriteBufferFromS3 as a stream which is passed to the S3::Client.
+/// It provides an istream interface (reading only) over the memory.
+/// However, S3::Client requires an iostream interface even though it only reads from the stream.
+
+class StdIStreamFromMemory : public std::iostream
+{
+ struct MemoryBuf: std::streambuf
+ {
+ MemoryBuf(char * begin_, size_t size_);
+
+ int_type underflow() override;
+
+ pos_type seekoff(off_type off, std::ios_base::seekdir way,
+ std::ios_base::openmode mode) override;
+
+ pos_type seekpos(pos_type sp,
+ std::ios_base::openmode mode) override;
+
+ char * begin = nullptr;
+ size_t size = 0;
+ };
+
+ MemoryBuf mem_buf;
+
+public:
+ StdIStreamFromMemory(char * begin_, size_t size_);
+};
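+
+/// A minimal usage sketch (hypothetical caller code):
+///
+///     String payload = "abc";
+///     StdIStreamFromMemory stream(payload.data(), payload.size());
+///     char c;
+///     stream.get(c);                        /// reads 'a' from the get area
+///     stream.seekg(0, std::ios_base::beg);  /// rewinds via MemoryBuf::seekoff()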
+
+}
diff --git a/contrib/clickhouse/src/IO/StdStreamBufFromReadBuffer.cpp b/contrib/clickhouse/src/IO/StdStreamBufFromReadBuffer.cpp
new file mode 100644
index 0000000000..a814dff040
--- /dev/null
+++ b/contrib/clickhouse/src/IO/StdStreamBufFromReadBuffer.cpp
@@ -0,0 +1,111 @@
+#include <IO/StdStreamBufFromReadBuffer.h>
+#include <IO/SeekableReadBuffer.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int LOGICAL_ERROR;
+ extern const int SEEK_POSITION_OUT_OF_BOUND;
+}
+
+
+StdStreamBufFromReadBuffer::StdStreamBufFromReadBuffer(std::unique_ptr<ReadBuffer> read_buffer_, size_t size_)
+ : read_buffer(std::move(read_buffer_)), seekable_read_buffer(dynamic_cast<SeekableReadBuffer *>(read_buffer.get())), size(size_)
+{
+}
+
+StdStreamBufFromReadBuffer::StdStreamBufFromReadBuffer(ReadBuffer & read_buffer_, size_t size_) : size(size_)
+{
+ if (dynamic_cast<SeekableReadBuffer *>(&read_buffer_))
+ {
+ read_buffer = wrapSeekableReadBufferReference(static_cast<SeekableReadBuffer &>(read_buffer_));
+ seekable_read_buffer = static_cast<SeekableReadBuffer *>(read_buffer.get());
+ }
+ else
+ {
+ read_buffer = wrapReadBufferReference(read_buffer_);
+ }
+}
+
+StdStreamBufFromReadBuffer::~StdStreamBufFromReadBuffer() = default;
+
+int StdStreamBufFromReadBuffer::underflow()
+{
+ char c;
+ if (!read_buffer->peek(c))
+ return std::char_traits<char>::eof();
+    /// Use to_int_type() so that a char like 0xFF is not mistaken for EOF.
+    return std::char_traits<char>::to_int_type(c);
+}
+
+std::streamsize StdStreamBufFromReadBuffer::showmanyc()
+{
+ return read_buffer->available();
+}
+
+std::streamsize StdStreamBufFromReadBuffer::xsgetn(char_type* s, std::streamsize count)
+{
+ return read_buffer->read(s, count);
+}
+
+std::streampos StdStreamBufFromReadBuffer::seekoff(std::streamoff off, std::ios_base::seekdir dir, std::ios_base::openmode which)
+{
+ if (dir == std::ios_base::beg)
+ return seekpos(off, which);
+ else if (dir == std::ios_base::cur)
+ return seekpos(getCurrentPosition() + off, which);
+ else if (dir == std::ios_base::end)
+ return seekpos(size + off, which);
+ else
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong seek's base {}", static_cast<int>(dir));
+}
+
+std::streampos StdStreamBufFromReadBuffer::seekpos(std::streampos pos, std::ios_base::openmode which)
+{
+ if (!(which & std::ios_base::in))
+ throw Exception(
+ ErrorCodes::LOGICAL_ERROR, "Wrong seek mode {}", static_cast<int>(which));
+
+ std::streamoff offset = pos - getCurrentPosition();
+ if (!offset)
+ return pos;
+
+ if ((read_buffer->buffer().begin() <= read_buffer->position() + offset) && (read_buffer->position() + offset <= read_buffer->buffer().end()))
+ {
+ read_buffer->position() += offset;
+ return pos;
+ }
+
+ if (seekable_read_buffer)
+ return seekable_read_buffer->seek(pos, SEEK_SET);
+
+ if (offset > 0)
+ {
+ read_buffer->ignore(offset);
+ return pos;
+ }
+
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek's offset {} is out of bound", pos);
+}
+
+std::streampos StdStreamBufFromReadBuffer::getCurrentPosition() const
+{
+ if (seekable_read_buffer)
+ return seekable_read_buffer->getPosition();
+ else
+ return read_buffer->count();
+}
+
+std::streamsize StdStreamBufFromReadBuffer::xsputn(const char*, std::streamsize)
+{
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "StdStreamBufFromReadBuffer cannot be used for output");
+}
+
+int StdStreamBufFromReadBuffer::overflow(int)
+{
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "StdStreamBufFromReadBuffer cannot be used for output");
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/StdStreamBufFromReadBuffer.h b/contrib/clickhouse/src/IO/StdStreamBufFromReadBuffer.h
new file mode 100644
index 0000000000..ff16b91e98
--- /dev/null
+++ b/contrib/clickhouse/src/IO/StdStreamBufFromReadBuffer.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <memory>
+#include <streambuf>
+
+
+namespace DB
+{
+class ReadBuffer;
+class SeekableReadBuffer;
+
+/// `std::streambuf`-compatible wrapper around a ReadBuffer.
+class StdStreamBufFromReadBuffer : public std::streambuf
+{
+public:
+ using Base = std::streambuf;
+
+ explicit StdStreamBufFromReadBuffer(std::unique_ptr<ReadBuffer> read_buffer_, size_t size_);
+ explicit StdStreamBufFromReadBuffer(ReadBuffer & read_buffer_, size_t size_);
+ ~StdStreamBufFromReadBuffer() override;
+
+private:
+ int underflow() override;
+ std::streamsize showmanyc() override;
+ std::streamsize xsgetn(char* s, std::streamsize count) override;
+ std::streampos seekoff(std::streamoff off, std::ios_base::seekdir dir, std::ios_base::openmode which) override;
+ std::streampos seekpos(std::streampos pos, std::ios_base::openmode which) override;
+
+ std::streamsize xsputn(const char* s, std::streamsize n) override;
+ int overflow(int c) override;
+
+ std::streampos getCurrentPosition() const;
+
+ std::unique_ptr<ReadBuffer> read_buffer;
+ SeekableReadBuffer * seekable_read_buffer = nullptr;
+ size_t size;
+};
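+
+/// A minimal usage sketch (hypothetical caller code): wrapping a ReadBuffer so that it can be
+/// consumed by code which expects a std::istream.
+///
+///     ReadBufferFromString in("payload");
+///     StdStreamBufFromReadBuffer stream_buf(in, /*size_=*/ 7);
+///     std::istream istr(&stream_buf);
+///     std::string word;
+///     istr >> word;  /// reads "payload" through xsgetn() / underflow()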
+
+}
diff --git a/contrib/clickhouse/src/IO/StdStreamFromReadBuffer.h b/contrib/clickhouse/src/IO/StdStreamFromReadBuffer.h
new file mode 100644
index 0000000000..ff327dc342
--- /dev/null
+++ b/contrib/clickhouse/src/IO/StdStreamFromReadBuffer.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <IO/StdStreamBufFromReadBuffer.h>
+#include <memory>
+
+
+namespace DB
+{
+class ReadBuffer;
+
+/// `std::istream`-compatible wrapper around a ReadBuffer.
+class StdIStreamFromReadBuffer : public std::istream
+{
+public:
+ using Base = std::istream;
+ StdIStreamFromReadBuffer(std::unique_ptr<ReadBuffer> buf, size_t size) : Base(&stream_buf), stream_buf(std::move(buf), size) { }
+ StdIStreamFromReadBuffer(ReadBuffer & buf, size_t size) : Base(&stream_buf), stream_buf(buf, size) { }
+ StdStreamBufFromReadBuffer * rdbuf() const { return const_cast<StdStreamBufFromReadBuffer *>(&stream_buf); }
+
+private:
+ StdStreamBufFromReadBuffer stream_buf;
+};
+
+
+/// `std::iostream`-compatible wrapper around a ReadBuffer.
+class StdStreamFromReadBuffer : public std::iostream
+{
+public:
+ using Base = std::iostream;
+ StdStreamFromReadBuffer(std::unique_ptr<ReadBuffer> buf, size_t size) : Base(&stream_buf), stream_buf(std::move(buf), size) { }
+ StdStreamFromReadBuffer(ReadBuffer & buf, size_t size) : Base(&stream_buf), stream_buf(buf, size) { }
+ StdStreamBufFromReadBuffer * rdbuf() const { return const_cast<StdStreamBufFromReadBuffer *>(&stream_buf); }
+
+private:
+ StdStreamBufFromReadBuffer stream_buf;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/SwapHelper.cpp b/contrib/clickhouse/src/IO/SwapHelper.cpp
new file mode 100644
index 0000000000..4a1cc8acf4
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SwapHelper.cpp
@@ -0,0 +1,17 @@
+#include <IO/SwapHelper.h>
+
+namespace DB
+{
+
+SwapHelper::SwapHelper(BufferBase & b1_, BufferBase & b2_)
+ : b1(b1_), b2(b2_)
+{
+ b1.swap(b2);
+}
+
+SwapHelper::~SwapHelper()
+{
+ b1.swap(b2);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/SwapHelper.h b/contrib/clickhouse/src/IO/SwapHelper.h
new file mode 100644
index 0000000000..fcf32927f2
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SwapHelper.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <IO/BufferBase.h>
+
+namespace DB
+{
+
+class SwapHelper
+{
+public:
+ SwapHelper(BufferBase & b1_, BufferBase & b2_);
+ ~SwapHelper();
+
+private:
+ BufferBase & b1;
+ BufferBase & b2;
+};
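+
+/// A minimal usage sketch (hypothetical caller code): an RAII buffer-state swap, typically used
+/// to temporarily redirect the working buffer of a nested buffer inside nextImpl():
+///
+///     {
+///         SwapHelper swap(*this, *nested);  /// swaps the buffer state on construction
+///         nested->next();
+///     }                                     /// and swaps it back on scope exit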
+
+}
diff --git a/contrib/clickhouse/src/IO/SynchronousReader.cpp b/contrib/clickhouse/src/IO/SynchronousReader.cpp
new file mode 100644
index 0000000000..e1c654e48a
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SynchronousReader.cpp
@@ -0,0 +1,89 @@
+#include <IO/SynchronousReader.h>
+#include <Common/assert_cast.h>
+#include <Common/Exception.h>
+#include <Common/CurrentMetrics.h>
+#include <Common/ProfileEvents.h>
+#include <Common/Stopwatch.h>
+#include <base/errnoToString.h>
+#include <unordered_map>
+#include <mutex>
+#include <unistd.h>
+#include <fcntl.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event ReadBufferFromFileDescriptorRead;
+ extern const Event ReadBufferFromFileDescriptorReadFailed;
+ extern const Event ReadBufferFromFileDescriptorReadBytes;
+ extern const Event DiskReadElapsedMicroseconds;
+}
+
+namespace CurrentMetrics
+{
+ extern const Metric Read;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_READ_FROM_FILE_DESCRIPTOR;
+ extern const int CANNOT_ADVISE;
+}
+
+
+std::future<IAsynchronousReader::Result> SynchronousReader::submit(Request request)
+{
+ /// If size is zero, then read() cannot be distinguished from EOF
+ assert(request.size);
+
+ int fd = assert_cast<const LocalFileDescriptor &>(*request.descriptor).fd;
+
+#if defined(POSIX_FADV_WILLNEED)
+ if (0 != posix_fadvise(fd, request.offset, request.size, POSIX_FADV_WILLNEED))
+ throwFromErrno("Cannot posix_fadvise", ErrorCodes::CANNOT_ADVISE);
+#endif
+
+ return std::async(std::launch::deferred, [fd, request]
+ {
+ ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorRead);
+ Stopwatch watch(CLOCK_MONOTONIC);
+
+ size_t bytes_read = 0;
+ while (!bytes_read)
+ {
+ ssize_t res = 0;
+
+ {
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::Read};
+ res = ::pread(fd, request.buf, request.size, request.offset);
+ }
+ if (!res)
+ break;
+
+ if (-1 == res && errno != EINTR)
+ {
+ ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorReadFailed);
+ throwFromErrno(fmt::format("Cannot read from file {}", fd), ErrorCodes::CANNOT_READ_FROM_FILE_DESCRIPTOR);
+ }
+
+ if (res > 0)
+ bytes_read += res;
+ }
+
+ ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorReadBytes, bytes_read);
+
+ /// It reports real time spent including the time spent while thread was preempted doing nothing.
+ /// And it is Ok for the purpose of this watch (it is used to lower the number of threads to read from tables).
+ /// Sometimes it is better to use taskstats::blkio_delay_total, but it is quite expensive to get it
+ /// (NetlinkMetricsProvider has about 500K RPS).
+ watch.stop();
+ ProfileEvents::increment(ProfileEvents::DiskReadElapsedMicroseconds, watch.elapsedMicroseconds());
+
+ return Result{ .size = bytes_read, .offset = request.ignore };
+ });
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/SynchronousReader.h b/contrib/clickhouse/src/IO/SynchronousReader.h
new file mode 100644
index 0000000000..238d6e9371
--- /dev/null
+++ b/contrib/clickhouse/src/IO/SynchronousReader.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <IO/AsynchronousReader.h>
+
+
+namespace DB
+{
+
+/** Implementation of IAsynchronousReader that is in fact synchronous.
+ * The only addition is posix_fadvise.
+ */
+class SynchronousReader final : public IAsynchronousReader
+{
+public:
+ std::future<Result> submit(Request request) override;
+
+ void wait() override {}
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/TimeoutSetter.cpp b/contrib/clickhouse/src/IO/TimeoutSetter.cpp
new file mode 100644
index 0000000000..b8b7a81470
--- /dev/null
+++ b/contrib/clickhouse/src/IO/TimeoutSetter.cpp
@@ -0,0 +1,56 @@
+#include <IO/TimeoutSetter.h>
+
+#include <Common/logger_useful.h>
+
+
+namespace DB
+{
+
+TimeoutSetter::TimeoutSetter(Poco::Net::StreamSocket & socket_,
+ Poco::Timespan send_timeout_,
+ Poco::Timespan receive_timeout_,
+ bool limit_max_timeout)
+ : socket(socket_), send_timeout(send_timeout_), receive_timeout(receive_timeout_)
+{
+ old_send_timeout = socket.getSendTimeout();
+ old_receive_timeout = socket.getReceiveTimeout();
+
+ if (!limit_max_timeout || old_send_timeout > send_timeout)
+ socket.setSendTimeout(send_timeout);
+
+ if (!limit_max_timeout || old_receive_timeout > receive_timeout)
+ socket.setReceiveTimeout(receive_timeout);
+}
+
+TimeoutSetter::TimeoutSetter(Poco::Net::StreamSocket & socket_, Poco::Timespan timeout_, bool limit_max_timeout)
+ : TimeoutSetter(socket_, timeout_, timeout_, limit_max_timeout)
+{
+}
+
+TimeoutSetter::~TimeoutSetter()
+{
+ if (was_reset)
+ return;
+
+ try
+ {
+ reset();
+ }
+ catch (...)
+ {
+ tryLogCurrentException("Client", "TimeoutSetter: Can't reset timeouts");
+ }
+}
+
+void TimeoutSetter::reset()
+{
+ bool connected = socket.impl()->initialized();
+ if (!connected)
+ return;
+
+ socket.setSendTimeout(old_send_timeout);
+ socket.setReceiveTimeout(old_receive_timeout);
+ was_reset = true;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/TimeoutSetter.h b/contrib/clickhouse/src/IO/TimeoutSetter.h
new file mode 100644
index 0000000000..3479986d7f
--- /dev/null
+++ b/contrib/clickhouse/src/IO/TimeoutSetter.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <Poco/Net/StreamSocket.h>
+#include <Poco/Timespan.h>
+
+
+namespace DB
+{
+/// Temporarily overrides socket send/receive timeouts and resets them back in the destructor (or manually, by calling the reset() method).
+/// If "limit_max_timeout" is true, timeouts can only be decreased (capped by the previous value).
+struct TimeoutSetter
+{
+ TimeoutSetter(Poco::Net::StreamSocket & socket_,
+ Poco::Timespan send_timeout_,
+ Poco::Timespan receive_timeout_,
+ bool limit_max_timeout = false);
+
+ TimeoutSetter(Poco::Net::StreamSocket & socket_, Poco::Timespan timeout_, bool limit_max_timeout = false);
+
+ ~TimeoutSetter();
+
+ /// Reset timeouts back.
+ void reset();
+
+ Poco::Net::StreamSocket & socket;
+
+ Poco::Timespan send_timeout;
+ Poco::Timespan receive_timeout;
+
+ Poco::Timespan old_send_timeout;
+ Poco::Timespan old_receive_timeout;
+ bool was_reset = false;
+};
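+
+/// A minimal usage sketch (hypothetical caller code): tighten the timeouts for a single
+/// request/response exchange and restore the old values on scope exit.
+///
+///     {
+///         TimeoutSetter timeout_setter(socket, Poco::Timespan(5, 0));  /// 5 s send + receive
+///         /// ... send the request and read the response ...
+///     }  /// the destructor restores the previous socket timeouts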
+}
diff --git a/contrib/clickhouse/src/IO/UncompressedCache.h b/contrib/clickhouse/src/IO/UncompressedCache.h
new file mode 100644
index 0000000000..702804cdda
--- /dev/null
+++ b/contrib/clickhouse/src/IO/UncompressedCache.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <Common/SipHash.h>
+#include <Common/ProfileEvents.h>
+#include <Common/HashTable/Hash.h>
+#include <IO/BufferWithOwnMemory.h>
+#include <Common/CacheBase.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event UncompressedCacheHits;
+ extern const Event UncompressedCacheMisses;
+ extern const Event UncompressedCacheWeightLost;
+}
+
+namespace DB
+{
+
+
+struct UncompressedCacheCell
+{
+ Memory<> data;
+ size_t compressed_size;
+ UInt32 additional_bytes;
+};
+
+struct UncompressedSizeWeightFunction
+{
+ size_t operator()(const UncompressedCacheCell & x) const
+ {
+ return x.data.size();
+ }
+};
+
+
+/** Thread-safe cache of decompressed blocks, used by CachedCompressedReadBuffer.
+ */
+class UncompressedCache : public CacheBase<UInt128, UncompressedCacheCell, UInt128TrivialHash, UncompressedSizeWeightFunction>
+{
+private:
+ using Base = CacheBase<UInt128, UncompressedCacheCell, UInt128TrivialHash, UncompressedSizeWeightFunction>;
+
+public:
+ UncompressedCache(const String & cache_policy, size_t max_size_in_bytes, double size_ratio)
+ : Base(cache_policy, max_size_in_bytes, 0, size_ratio) {}
+
+ /// Calculate key from path to file and offset.
+ static UInt128 hash(const String & path_to_file, size_t offset)
+ {
+ SipHash hash;
+ hash.update(path_to_file.data(), path_to_file.size() + 1);
+ hash.update(offset);
+
+ return hash.get128();
+ }
+
+ template <typename LoadFunc>
+ MappedPtr getOrSet(const Key & key, LoadFunc && load)
+ {
+ auto result = Base::getOrSet(key, std::forward<LoadFunc>(load));
+
+ if (result.second)
+ ProfileEvents::increment(ProfileEvents::UncompressedCacheMisses);
+ else
+ ProfileEvents::increment(ProfileEvents::UncompressedCacheHits);
+
+ return result.first;
+ }
+
+private:
+ void onRemoveOverflowWeightLoss(size_t weight_loss) override
+ {
+ ProfileEvents::increment(ProfileEvents::UncompressedCacheWeightLost, weight_loss);
+ }
+};
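+
+/// A minimal usage sketch of getOrSet() (hypothetical caller code; the "SLRU" policy name and
+/// sizes are assumptions; the load lambda is executed only on a cache miss):
+///
+///     UncompressedCache cache("SLRU", 1024 * 1024 * 1024, 0.5);
+///     auto key = UncompressedCache::hash("/path/to/file.bin", /*offset=*/ 0);
+///     auto cell = cache.getOrSet(key, []
+///     {
+///         auto value = std::make_shared<UncompressedCacheCell>();
+///         /// ... decompress the block into value->data ...
+///         return value;
+///     });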
+
+using UncompressedCachePtr = std::shared_ptr<UncompressedCache>;
+
+}
diff --git a/contrib/clickhouse/src/IO/UseSSL.cpp b/contrib/clickhouse/src/IO/UseSSL.cpp
new file mode 100644
index 0000000000..7a2ff928e0
--- /dev/null
+++ b/contrib/clickhouse/src/IO/UseSSL.cpp
@@ -0,0 +1,24 @@
+#include "UseSSL.h"
+
+#include "clickhouse_config.h"
+
+#if USE_SSL
+# include <Poco/Net/SSLManager.h>
+#endif
+
+namespace DB
+{
+UseSSL::UseSSL()
+{
+#if USE_SSL
+ Poco::Net::initializeSSL();
+#endif
+}
+
+UseSSL::~UseSSL()
+{
+#if USE_SSL
+ Poco::Net::uninitializeSSL();
+#endif
+}
+}
diff --git a/contrib/clickhouse/src/IO/UseSSL.h b/contrib/clickhouse/src/IO/UseSSL.h
new file mode 100644
index 0000000000..324f318edb
--- /dev/null
+++ b/contrib/clickhouse/src/IO/UseSSL.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <boost/noncopyable.hpp>
+
+namespace DB
+{
+// http://stackoverflow.com/questions/18315472/https-request-in-c-using-poco
+struct UseSSL : private boost::noncopyable
+{
+ UseSSL();
+ ~UseSSL();
+};
+}
diff --git a/contrib/clickhouse/src/IO/VarInt.cpp b/contrib/clickhouse/src/IO/VarInt.cpp
new file mode 100644
index 0000000000..a4b249b01d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/VarInt.cpp
@@ -0,0 +1,16 @@
+#include <IO/VarInt.h>
+#include <Common/Exception.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int ATTEMPT_TO_READ_AFTER_EOF;
+}
+
+void throwReadAfterEOF()
+{
+ throw Exception(ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF, "Attempt to read after eof");
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/VarInt.h b/contrib/clickhouse/src/IO/VarInt.h
new file mode 100644
index 0000000000..8d10055a3d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/VarInt.h
@@ -0,0 +1,218 @@
+#pragma once
+
+#include <base/types.h>
+#include <base/defines.h>
+#include <IO/ReadBuffer.h>
+#include <IO/WriteBuffer.h>
+
+
+namespace DB
+{
+
+/// Variable-Length Quantity (VLQ) Base-128 compression, also known as Variable Byte (VB) or Varint encoding.
+
+[[noreturn]] void throwReadAfterEOF();
+
+
+inline void writeVarUInt(UInt64 x, WriteBuffer & ostr)
+{
+ while (x > 0x7F)
+ {
+ uint8_t byte = 0x80 | (x & 0x7F);
+
+ ostr.nextIfAtEnd();
+ *ostr.position() = byte;
+ ++ostr.position();
+
+ x >>= 7;
+ }
+
+ uint8_t final_byte = static_cast<uint8_t>(x);
+
+ ostr.nextIfAtEnd();
+ *ostr.position() = final_byte;
+ ++ostr.position();
+}
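+
+/// A worked example of the encoding above: x = 300 (0b100101100) is emitted low 7-bit group
+/// first, giving 0xAC (0b1'0101100, continuation bit set) and then 0x02 (0b0'0000010);
+/// a decoder reconstructs 0x2C | (0x02 << 7) = 300.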
+
+inline void writeVarUInt(UInt64 x, std::ostream & ostr)
+{
+ while (x > 0x7F)
+ {
+ uint8_t byte = 0x80 | (x & 0x7F);
+ ostr.put(byte);
+
+ x >>= 7;
+ }
+
+ uint8_t final_byte = static_cast<uint8_t>(x);
+ ostr.put(final_byte);
+}
+
+inline char * writeVarUInt(UInt64 x, char * ostr)
+{
+ while (x > 0x7F)
+ {
+ uint8_t byte = 0x80 | (x & 0x7F);
+
+ *ostr = byte;
+ ++ostr;
+
+ x >>= 7;
+ }
+
+ uint8_t final_byte = static_cast<uint8_t>(x);
+
+ *ostr = final_byte;
+ ++ostr;
+
+ return ostr;
+}
+
+template <typename Out>
+inline void writeVarInt(Int64 x, Out & ostr)
+{
+ writeVarUInt(static_cast<UInt64>((x << 1) ^ (x >> 63)), ostr);
+}
+
+inline char * writeVarInt(Int64 x, char * ostr)
+{
+ return writeVarUInt(static_cast<UInt64>((x << 1) ^ (x >> 63)), ostr);
+}
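+
+/// The ZigZag mapping used above, (x << 1) ^ (x >> 63), interleaves signed values so that small
+/// magnitudes encode into small varints: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
+/// E.g. for x = -2: (x << 1) = -4 and (x >> 63) = -1 (all ones), so (-4) ^ (-1) = 3.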
+
+namespace impl
+{
+
+template <bool check_eof>
+inline void readVarUInt(UInt64 & x, ReadBuffer & istr)
+{
+ x = 0;
+ for (size_t i = 0; i < 10; ++i)
+ {
+ if constexpr (check_eof)
+ if (istr.eof()) [[unlikely]]
+ throwReadAfterEOF();
+
+ UInt64 byte = *istr.position();
+ ++istr.position();
+ x |= (byte & 0x7F) << (7 * i);
+
+ if (!(byte & 0x80))
+ return;
+ }
+}
+
+}
+
+inline void readVarUInt(UInt64 & x, ReadBuffer & istr)
+{
+ if (istr.buffer().end() - istr.position() >= 10)
+ return impl::readVarUInt<false>(x, istr);
+ return impl::readVarUInt<true>(x, istr);
+}
+
+inline void readVarUInt(UInt64 & x, std::istream & istr)
+{
+ x = 0;
+ for (size_t i = 0; i < 10; ++i)
+ {
+ UInt64 byte = istr.get();
+ x |= (byte & 0x7F) << (7 * i);
+
+ if (!(byte & 0x80))
+ return;
+ }
+}
+
+inline const char * readVarUInt(UInt64 & x, const char * istr, size_t size)
+{
+ const char * end = istr + size;
+
+ x = 0;
+ for (size_t i = 0; i < 10; ++i)
+ {
+ if (istr == end) [[unlikely]]
+ throwReadAfterEOF();
+
+ UInt64 byte = *istr;
+ ++istr;
+ x |= (byte & 0x7F) << (7 * i);
+
+ if (!(byte & 0x80))
+ return istr;
+ }
+
+ return istr;
+}
+
+template <typename In>
+inline void readVarInt(Int64 & x, In & istr)
+{
+ readVarUInt(*reinterpret_cast<UInt64*>(&x), istr);
+ x = (static_cast<UInt64>(x) >> 1) ^ -(x & 1);
+}
+
+inline const char * readVarInt(Int64 & x, const char * istr, size_t size)
+{
+ const char * res = readVarUInt(*reinterpret_cast<UInt64*>(&x), istr, size);
+ x = (static_cast<UInt64>(x) >> 1) ^ -(x & 1);
+ return res;
+}
+
+inline void readVarUInt(UInt32 & x, ReadBuffer & istr)
+{
+ UInt64 tmp;
+ readVarUInt(tmp, istr);
+ x = static_cast<UInt32>(tmp);
+}
+
+inline void readVarInt(Int32 & x, ReadBuffer & istr)
+{
+ Int64 tmp;
+ readVarInt(tmp, istr);
+ x = static_cast<Int32>(tmp);
+}
+
+inline void readVarUInt(UInt16 & x, ReadBuffer & istr)
+{
+ UInt64 tmp;
+ readVarUInt(tmp, istr);
+ x = tmp;
+}
+
+inline void readVarInt(Int16 & x, ReadBuffer & istr)
+{
+ Int64 tmp;
+ readVarInt(tmp, istr);
+ x = tmp;
+}
+
+template <typename T>
+requires (!std::is_same_v<T, UInt64>)
+inline void readVarUInt(T & x, ReadBuffer & istr)
+{
+ UInt64 tmp;
+ readVarUInt(tmp, istr);
+ x = tmp;
+}
+
+inline size_t getLengthOfVarUInt(UInt64 x)
+{
+ return x < (1ULL << 7) ? 1
+ : (x < (1ULL << 14) ? 2
+ : (x < (1ULL << 21) ? 3
+ : (x < (1ULL << 28) ? 4
+ : (x < (1ULL << 35) ? 5
+ : (x < (1ULL << 42) ? 6
+ : (x < (1ULL << 49) ? 7
+ : (x < (1ULL << 56) ? 8
+ : (x < (1ULL << 63) ? 9
+ : 10))))))));
+}
+
+
+inline size_t getLengthOfVarInt(Int64 x)
+{
+ return getLengthOfVarUInt(static_cast<UInt64>((x << 1) ^ (x >> 63)));
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WithFileName.cpp b/contrib/clickhouse/src/IO/WithFileName.cpp
new file mode 100644
index 0000000000..2383182f7e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WithFileName.cpp
@@ -0,0 +1,39 @@
+#include <IO/WithFileName.h>
+#include <IO/CompressedReadBufferWrapper.h>
+#include <IO/ParallelReadBuffer.h>
+#include <IO/PeekableReadBuffer.h>
+
+namespace DB
+{
+
+template <typename T>
+static String getFileName(const T & entry)
+{
+ if (const auto * with_file_name = dynamic_cast<const WithFileName *>(&entry))
+ return with_file_name->getFileName();
+ return "";
+}
+
+String getFileNameFromReadBuffer(const ReadBuffer & in)
+{
+ if (const auto * compressed = dynamic_cast<const CompressedReadBufferWrapper *>(&in))
+ return getFileName(compressed->getWrappedReadBuffer());
+ else if (const auto * parallel = dynamic_cast<const ParallelReadBuffer *>(&in))
+ return getFileName(parallel->getReadBuffer());
+ else if (const auto * peekable = dynamic_cast<const PeekableReadBuffer *>(&in))
+ return getFileNameFromReadBuffer(peekable->getSubBuffer());
+ else
+ return getFileName(in);
+}
+
+String getExceptionEntryWithFileName(const ReadBuffer & in)
+{
+ auto filename = getFileNameFromReadBuffer(in);
+
+ if (filename.empty())
+ return "";
+
+ return fmt::format(": While reading from: {}", filename);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WithFileName.h b/contrib/clickhouse/src/IO/WithFileName.h
new file mode 100644
index 0000000000..595f1a768c
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WithFileName.h
@@ -0,0 +1,19 @@
+#pragma once
+#include <base/types.h>
+
+namespace DB
+{
+
+class ReadBuffer;
+
+class WithFileName
+{
+public:
+ virtual String getFileName() const = 0;
+ virtual ~WithFileName() = default;
+};
+
+String getFileNameFromReadBuffer(const ReadBuffer & in);
+String getExceptionEntryWithFileName(const ReadBuffer & in);
+
+}
diff --git a/contrib/clickhouse/src/IO/WithFileSize.cpp b/contrib/clickhouse/src/IO/WithFileSize.cpp
new file mode 100644
index 0000000000..3660d962c0
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WithFileSize.cpp
@@ -0,0 +1,86 @@
+#include "WithFileSize.h"
+#include <IO/ReadBufferFromFile.h>
+#include <IO/CompressedReadBufferWrapper.h>
+#include <IO/ParallelReadBuffer.h>
+#include <IO/ReadBufferFromFileDecorator.h>
+#include <IO/PeekableReadBuffer.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int UNKNOWN_FILE_SIZE;
+}
+
+template <typename T>
+static size_t getFileSize(T & in)
+{
+ if (auto * with_file_size = dynamic_cast<WithFileSize *>(&in))
+ {
+ return with_file_size->getFileSize();
+ }
+
+ throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out file size");
+}
+
+size_t getFileSizeFromReadBuffer(ReadBuffer & in)
+{
+ if (auto * delegate = dynamic_cast<ReadBufferFromFileDecorator *>(&in))
+ {
+ return getFileSize(delegate->getWrappedReadBuffer());
+ }
+ else if (auto * compressed = dynamic_cast<CompressedReadBufferWrapper *>(&in))
+ {
+ return getFileSize(compressed->getWrappedReadBuffer());
+ }
+
+ return getFileSize(in);
+}
+
+std::optional<size_t> tryGetFileSizeFromReadBuffer(ReadBuffer & in)
+{
+ try
+ {
+ return getFileSizeFromReadBuffer(in);
+ }
+ catch (...)
+ {
+ return std::nullopt;
+ }
+}
+
+bool isBufferWithFileSize(const ReadBuffer & in)
+{
+ if (const auto * delegate = dynamic_cast<const ReadBufferFromFileDecorator *>(&in))
+ {
+ return delegate->isWithFileSize();
+ }
+ else if (const auto * compressed = dynamic_cast<const CompressedReadBufferWrapper *>(&in))
+ {
+ return isBufferWithFileSize(compressed->getWrappedReadBuffer());
+ }
+
+ return dynamic_cast<const WithFileSize *>(&in) != nullptr;
+}
+
+size_t getDataOffsetMaybeCompressed(const ReadBuffer & in)
+{
+ if (const auto * delegate = dynamic_cast<const ReadBufferFromFileDecorator *>(&in))
+ {
+ return getDataOffsetMaybeCompressed(delegate->getWrappedReadBuffer());
+ }
+ else if (const auto * compressed = dynamic_cast<const CompressedReadBufferWrapper *>(&in))
+ {
+ return getDataOffsetMaybeCompressed(compressed->getWrappedReadBuffer());
+ }
+ else if (const auto * peekable = dynamic_cast<const PeekableReadBuffer *>(&in))
+ {
+ return getDataOffsetMaybeCompressed(peekable->getSubBuffer());
+ }
+
+ return in.count();
+}
+
+
+}
diff --git a/contrib/clickhouse/src/IO/WithFileSize.h b/contrib/clickhouse/src/IO/WithFileSize.h
new file mode 100644
index 0000000000..0ae3af98ea
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WithFileSize.h
@@ -0,0 +1,26 @@
+#pragma once
+#include <base/types.h>
+#include <optional>
+
+namespace DB
+{
+
+class ReadBuffer;
+
+class WithFileSize
+{
+public:
+ virtual size_t getFileSize() = 0;
+ virtual ~WithFileSize() = default;
+};
+
+bool isBufferWithFileSize(const ReadBuffer & in);
+
+size_t getFileSizeFromReadBuffer(ReadBuffer & in);
+
+/// Returns std::nullopt if the file size couldn't be determined.
+std::optional<size_t> tryGetFileSizeFromReadBuffer(ReadBuffer & in);
+
+size_t getDataOffsetMaybeCompressed(const ReadBuffer & in);
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBuffer.cpp b/contrib/clickhouse/src/IO/WriteBuffer.cpp
new file mode 100644
index 0000000000..61fdd31e16
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBuffer.cpp
@@ -0,0 +1,33 @@
+#include "WriteBuffer.h"
+
+#include <Common/logger_useful.h>
+
+namespace DB
+{
+
+/// Calling finalize() in the destructor of derived classes is a bad practice:
+/// it causes objects to be left on the remote FS when a write operation is rolled back.
+/// Call finalize() explicitly instead; until it has been called there is no guarantee that the file has been written.
+WriteBuffer::~WriteBuffer()
+{
+    // The destructor can be called with finalized = false in case of exceptions
+ if (count() > 0 && !finalized)
+ {
+        /// It is totally OK to destroy an instance without finalization when an exception occurs.
+        /// However, it is suspicious to destroy an instance without finalization on the happy path.
+ if (!std::uncaught_exceptions() && std::current_exception() == nullptr)
+ {
+ Poco::Logger * log = &Poco::Logger::get("WriteBuffer");
+ LOG_ERROR(
+ log,
+ "WriteBuffer is not finalized when destructor is called. "
+ "No exceptions in flight are detected. "
+ "The file might not be written at all or might be truncated. "
+ "Stack trace: {}",
+ StackTrace().toString());
+ chassert(false && "WriteBuffer is not finalized in destructor.");
+ }
+ }
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBuffer.h b/contrib/clickhouse/src/IO/WriteBuffer.h
new file mode 100644
index 0000000000..d29ca6d5c6
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBuffer.h
@@ -0,0 +1,181 @@
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <cassert>
+#include <cstring>
+
+#include <Common/Exception.h>
+#include <Common/LockMemoryExceptionInThread.h>
+#include <IO/BufferBase.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_WRITE_AFTER_END_OF_BUFFER;
+ extern const int LOGICAL_ERROR;
+}
+
+
+/** A simple abstract class for buffered data writing (char sequences) somewhere.
+ * Unlike std::ostream, it provides access to the internal buffer,
+ * and also allows you to manually manage the position inside the buffer.
+ *
+ * Derived classes must implement the nextImpl() method.
+ */
+class WriteBuffer : public BufferBase
+{
+public:
+ using BufferBase::set;
+ using BufferBase::position;
+ void set(Position ptr, size_t size) { BufferBase::set(ptr, size, 0); }
+
+ /** write the data in the buffer (from the beginning of the buffer to the current position);
+ * set the position to the beginning; throw an exception, if something is wrong
+ */
+ inline void next()
+ {
+ if (!offset())
+ return;
+
+ auto bytes_in_buffer = offset();
+
+ try
+ {
+ nextImpl();
+ }
+ catch (...)
+ {
+ /** If the nextImpl() call was unsuccessful, move the cursor to the beginning,
+ * so that later (for example, when the stack was expanded) there was no second attempt to write data.
+ */
+ pos = working_buffer.begin();
+ bytes += bytes_in_buffer;
+ throw;
+ }
+
+ bytes += bytes_in_buffer;
+ pos = working_buffer.begin();
+ }
+
+ /// Calling finalize() in the destructor of derived classes is a bad practice.
+ virtual ~WriteBuffer();
+
+ inline void nextIfAtEnd()
+ {
+ if (!hasPendingData())
+ next();
+ }
+
+
+ void write(const char * from, size_t n)
+ {
+ if (finalized)
+ throw Exception{ErrorCodes::LOGICAL_ERROR, "Cannot write to finalized buffer"};
+
+ size_t bytes_copied = 0;
+
+        /// An empty working buffer would produce an endless loop
+ assert(!working_buffer.empty());
+
+ while (bytes_copied < n)
+ {
+ nextIfAtEnd();
+ size_t bytes_to_copy = std::min(static_cast<size_t>(working_buffer.end() - pos), n - bytes_copied);
+ memcpy(pos, from + bytes_copied, bytes_to_copy);
+ pos += bytes_to_copy;
+ bytes_copied += bytes_to_copy;
+ }
+ }
+
+ inline void write(char x)
+ {
+ if (finalized)
+ throw Exception{ErrorCodes::LOGICAL_ERROR, "Cannot write to finalized buffer"};
+
+ nextIfAtEnd();
+ *pos = x;
+ ++pos;
+ }
+
+    /// This method may be called before finalize() to signal that no more data will be written.
+    /// The user does not have to call it; the implementation should check it itself if needed.
+    ///
+    /// The idea is similar to prefetch. If all data has already been written, we can flush the buffer
+    /// and start sending data asynchronously. It may improve writing performance when you have
+    /// multiple files to finalize. Mainly for blob storage, where finalization has high latency,
+    /// calling preFinalize in a loop may parallelize it.
+ virtual void preFinalize() { next(); }
+
+ /// Write the last data.
+ void finalize()
+ {
+ if (finalized)
+ return;
+
+ LockMemoryExceptionInThread lock(VariableContext::Global);
+ try
+ {
+ finalizeImpl();
+ finalized = true;
+ }
+ catch (...)
+ {
+ pos = working_buffer.begin();
+ finalized = true;
+ throw;
+ }
+ }
+
+ /// Wait for data to be reliably written. Mainly, call fsync for fd.
+ /// May be called after finalize() if needed.
+ virtual void sync()
+ {
+ next();
+ }
+
+protected:
+ WriteBuffer(Position ptr, size_t size) : BufferBase(ptr, size, 0) {}
+
+ virtual void finalizeImpl()
+ {
+ next();
+ }
+
+ bool finalized = false;
+
+private:
+ /** Write the data in the buffer (from the beginning of the buffer to the current position).
+ * Throw an exception if something is wrong.
+ */
+ virtual void nextImpl()
+ {
+ throw Exception(ErrorCodes::CANNOT_WRITE_AFTER_END_OF_BUFFER, "Cannot write after end of buffer.");
+ }
+};
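+
+/// A minimal usage sketch (hypothetical caller code; WriteBufferFromFile is one concrete
+/// implementation): write through the buffer, then finalize explicitly, as the destructor
+/// contract above requires.
+///
+///     WriteBufferFromFile out("/tmp/example.txt");
+///     out.write("hello", 5);
+///     out.finalize();  /// flushes via next() and marks the buffer as finalized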
+
+
+using WriteBufferPtr = std::shared_ptr<WriteBuffer>;
+
+
+class WriteBufferFromPointer : public WriteBuffer
+{
+public:
+ WriteBufferFromPointer(Position ptr, size_t size) : WriteBuffer(ptr, size) {}
+
+private:
+ virtual void finalizeImpl() override
+ {
+ /// no op
+ }
+
+ virtual void sync() override
+ {
+        /// no op
+ }
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferDecorator.h b/contrib/clickhouse/src/IO/WriteBufferDecorator.h
new file mode 100644
index 0000000000..7c984eeea8
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferDecorator.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <IO/WriteBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+#include <utility>
+#include <memory>
+
+namespace DB
+{
+
+class WriteBuffer;
+
+/// WriteBuffer that decorates data and delegates it to the underlying buffer.
+/// It's used for writing compressed and encrypted data.
+template <class Base>
+class WriteBufferDecorator : public Base
+{
+public:
+ template <class ... BaseArgs>
+ explicit WriteBufferDecorator(std::unique_ptr<WriteBuffer> out_, BaseArgs && ... args)
+ : Base(std::forward<BaseArgs>(args)...), out(std::move(out_))
+ {
+ }
+
+ void finalizeImpl() override
+ {
+ try
+ {
+ finalizeBefore();
+ out->finalize();
+ finalizeAfter();
+ }
+ catch (...)
+ {
+ /// Do not try to flush next time after exception.
+ out->position() = out->buffer().begin();
+ throw;
+ }
+ }
+
+ WriteBuffer * getNestedBuffer() { return out.get(); }
+
+protected:
+ /// Do some finalization before finalization of underlying buffer.
+ virtual void finalizeBefore() {}
+
+ /// Do some finalization after finalization of underlying buffer.
+ virtual void finalizeAfter() {}
+
+ std::unique_ptr<WriteBuffer> out;
+};
+
+using WriteBufferWithOwnMemoryDecorator = WriteBufferDecorator<BufferWithOwnMemory<WriteBuffer>>;
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromArena.h b/contrib/clickhouse/src/IO/WriteBufferFromArena.h
new file mode 100644
index 0000000000..8e9276496b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromArena.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <Common/Arena.h>
+#include <base/StringRef.h>
+#include <IO/WriteBuffer.h>
+
+
+namespace DB
+{
+
+/** Writes data contiguously into an Arena.
+  * As the data is located in a contiguous memory segment, it can be read back with ReadBufferFromMemory.
+  *
+  * While this object is in use, no other allocations in the arena are possible.
+ */
+class WriteBufferFromArena final : public WriteBuffer
+{
+public:
+ /// begin_ - start of previously used contiguous memory segment or nullptr (see Arena::allocContinue method).
+ WriteBufferFromArena(Arena & arena_, const char *& begin_)
+ : WriteBuffer(nullptr, 0), arena(arena_), begin(begin_)
+ {
+ nextImpl();
+ pos = working_buffer.begin();
+ }
+
+ StringRef complete()
+ {
+ /// Return over-allocated memory back into arena.
+ arena.rollback(buffer().end() - position());
+ /// Reference to written data.
+ return { position() - count(), count() };
+ }
+
+private:
+ Arena & arena;
+ const char *& begin;
+
+ void nextImpl() override
+ {
+ /// Allocate more memory. At least same size as used before (this gives 2x growth ratio),
+ /// and at most grab all remaining size in current chunk of arena.
+ ///
+ /// FIXME this class just doesn't make sense -- WriteBuffer is not
+ /// a unified interface for everything, it doesn't work well with
+ /// Arena::allocContinue -- we lose the size of data and then use a
+ /// heuristic to guess it back? and make a virtual call while we're at it?
+ /// I don't even..
+ /// Being so ill-defined as it is, no wonder that the following line had
+ /// a bug leading to a very rare infinite loop. Just hack around it in
+ /// the most stupid way possible, because the real fix for this is to
+ /// tear down the entire WriteBuffer thing and implement it again,
+ /// properly.
+ size_t continuation_size = std::max(size_t(1),
+ std::max(count(), arena.remainingSpaceInCurrentMemoryChunk()));
+
+ /// allocContinue method will possibly move memory region to new place and modify "begin" pointer.
+
+ char * continuation = arena.allocContinue(continuation_size, begin);
+ char * end = continuation + continuation_size;
+
+ /// internal buffer points to whole memory segment and working buffer - to free space for writing.
+ internalBuffer() = Buffer(const_cast<char *>(begin), end);
+ buffer() = Buffer(continuation, end);
+ }
+
+    /// It is super strange, but an additional next() call
+    /// changes the data in the serializeValueIntoArena result.
+ virtual void finalizeImpl() override { /* no op */ }
+};
+
+}
+
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromEncryptedFile.cpp b/contrib/clickhouse/src/IO/WriteBufferFromEncryptedFile.cpp
new file mode 100644
index 0000000000..5bca0dc68d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromEncryptedFile.cpp
@@ -0,0 +1,61 @@
+#include <IO/WriteBufferFromEncryptedFile.h>
+
+#if USE_SSL
+
+namespace DB
+{
+
+WriteBufferFromEncryptedFile::WriteBufferFromEncryptedFile(
+ size_t buffer_size_,
+ std::unique_ptr<WriteBufferFromFileBase> out_,
+ const String & key_,
+ const FileEncryption::Header & header_,
+ size_t old_file_size)
+ : WriteBufferDecorator<WriteBufferFromFileBase>(std::move(out_), buffer_size_, nullptr, 0)
+ , header(header_)
+ , flush_header(!old_file_size)
+ , encryptor(header.algorithm, key_, header.init_vector)
+{
+ encryptor.setOffset(old_file_size);
+}
+
+WriteBufferFromEncryptedFile::~WriteBufferFromEncryptedFile()
+{
+ finalize();
+}
+
+void WriteBufferFromEncryptedFile::finalizeBefore()
+{
+ /// If buffer has pending data - write it.
+ next();
+
+ /// Note that if there is no data to write an empty file will be written, even without the initialization vector
+ /// (see nextImpl(): it writes the initialization vector only if there is some data ready to write).
+ /// That's fine because DiskEncrypted allows files without initialization vectors when they're empty.
+}
+
+void WriteBufferFromEncryptedFile::sync()
+{
+ /// If buffer has pending data - write it.
+ next();
+
+ out->sync();
+}
+
+void WriteBufferFromEncryptedFile::nextImpl()
+{
+ if (!offset())
+ return;
+
+ if (flush_header)
+ {
+ header.write(*out);
+ flush_header = false;
+ }
+
+ encryptor.encrypt(working_buffer.begin(), offset(), *out);
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromEncryptedFile.h b/contrib/clickhouse/src/IO/WriteBufferFromEncryptedFile.h
new file mode 100644
index 0000000000..12c1ba5f6f
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromEncryptedFile.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "clickhouse_config.h"
+#include <Common/assert_cast.h>
+
+#if USE_SSL
+#include <IO/WriteBufferFromFileBase.h>
+#include <IO/FileEncryptionCommon.h>
+#include <IO/WriteBufferDecorator.h>
+
+
+namespace DB
+{
+
+/// Encrypts data and writes the encrypted data to the underlying write buffer.
+class WriteBufferFromEncryptedFile : public WriteBufferDecorator<WriteBufferFromFileBase>
+{
+public:
+ /// `old_file_size` should be set to non-zero if we're going to append an existing file.
+ WriteBufferFromEncryptedFile(
+ size_t buffer_size_,
+ std::unique_ptr<WriteBufferFromFileBase> out_,
+ const String & key_,
+ const FileEncryption::Header & header_,
+ size_t old_file_size = 0);
+
+ ~WriteBufferFromEncryptedFile() override;
+
+ void sync() override;
+
+ std::string getFileName() const override { return assert_cast<WriteBufferFromFileBase *>(out.get())->getFileName(); }
+
+private:
+ void nextImpl() override;
+
+ void finalizeBefore() override;
+
+ FileEncryption::Header header;
+ bool flush_header = false;
+
+ FileEncryption::Encryptor encryptor;
+};
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromFile.cpp b/contrib/clickhouse/src/IO/WriteBufferFromFile.cpp
new file mode 100644
index 0000000000..97059ff8f4
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromFile.cpp
@@ -0,0 +1,115 @@
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <cerrno>
+
+#include <Common/ProfileEvents.h>
+#include <base/defines.h>
+
+#include <IO/WriteBufferFromFile.h>
+#include <IO/WriteHelpers.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event FileOpen;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int FILE_DOESNT_EXIST;
+ extern const int CANNOT_OPEN_FILE;
+ extern const int CANNOT_CLOSE_FILE;
+}
+
+
+WriteBufferFromFile::WriteBufferFromFile(
+ const std::string & file_name_,
+ size_t buf_size,
+ int flags,
+ ThrottlerPtr throttler_,
+ mode_t mode,
+ char * existing_memory,
+ size_t alignment)
+ : WriteBufferFromFileDescriptor(-1, buf_size, existing_memory, throttler_, alignment, file_name_)
+{
+ ProfileEvents::increment(ProfileEvents::FileOpen);
+
+#ifdef OS_DARWIN
+ bool o_direct = (flags != -1) && (flags & O_DIRECT);
+ if (o_direct)
+ flags = flags & ~O_DIRECT;
+#endif
+
+ fd = ::open(file_name.c_str(), flags == -1 ? O_WRONLY | O_TRUNC | O_CREAT | O_CLOEXEC : flags | O_CLOEXEC, mode);
+
+ if (-1 == fd)
+ throwFromErrnoWithPath("Cannot open file " + file_name, file_name,
+ errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
+
+#ifdef OS_DARWIN
+ if (o_direct)
+ {
+ if (fcntl(fd, F_NOCACHE, 1) == -1)
+ throwFromErrnoWithPath("Cannot set F_NOCACHE on file " + file_name, file_name, ErrorCodes::CANNOT_OPEN_FILE);
+ }
+#endif
+}
+
+
+/// Use pre-opened file descriptor.
+WriteBufferFromFile::WriteBufferFromFile(
+ int & fd_,
+ const std::string & original_file_name,
+ size_t buf_size,
+ ThrottlerPtr throttler_,
+ char * existing_memory,
+ size_t alignment)
+ : WriteBufferFromFileDescriptor(fd_, buf_size, existing_memory, throttler_, alignment, original_file_name)
+{
+ fd_ = -1;
+}
+
+WriteBufferFromFile::~WriteBufferFromFile()
+{
+ if (fd < 0)
+ return;
+
+ finalize();
+ int err = ::close(fd);
+ /// Everything except EBADF should be ignored in the dtor, since all of the
+ /// others (EINTR/EIO/ENOSPC/EDQUOT) can occur while writing to the fd, in
+ /// which case the write has already failed and the error has been reported
+ /// to the user/caller.
+ ///
+ /// Note that for close() on Linux, EINTR should *not* be retried.
+ chassert(!(err && errno == EBADF));
+}
+
+void WriteBufferFromFile::finalizeImpl()
+{
+ if (fd < 0)
+ return;
+
+ next();
+}
+
+
+/// Close file before destruction of object.
+void WriteBufferFromFile::close()
+{
+ if (fd < 0)
+ return;
+
+ finalize();
+
+ if (0 != ::close(fd))
+ throw Exception(ErrorCodes::CANNOT_CLOSE_FILE, "Cannot close file");
+
+ fd = -1;
+ metric_increment.destroy();
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromFile.h b/contrib/clickhouse/src/IO/WriteBufferFromFile.h
new file mode 100644
index 0000000000..57847d893a
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromFile.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <sys/types.h>
+
+#include <Common/CurrentMetrics.h>
+#include <Common/Throttler_fwd.h>
+#include <IO/WriteBufferFromFileDescriptor.h>
+
+
+namespace CurrentMetrics
+{
+ extern const Metric OpenFileForWrite;
+}
+
+
+#ifndef O_DIRECT
+#define O_DIRECT 00040000
+#endif
+
+namespace DB
+{
+
+/** Accepts a path to a file and opens it, or accepts a pre-opened file descriptor.
+ * Closes the file by itself (thus it "owns" the file descriptor).
+ */
+class WriteBufferFromFile : public WriteBufferFromFileDescriptor
+{
+protected:
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForWrite};
+
+public:
+ explicit WriteBufferFromFile(
+ const std::string & file_name_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ int flags = -1,
+ ThrottlerPtr throttler_ = {},
+ mode_t mode = 0666,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ /// Use pre-opened file descriptor.
+ explicit WriteBufferFromFile(
+ int & fd, /// Will be set to -1 if constructor didn't throw and ownership of file descriptor is passed to the object.
+ const std::string & original_file_name = {},
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ ThrottlerPtr throttler_ = {},
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~WriteBufferFromFile() override;
+
+ /// Close file before destruction of object.
+ void close();
+
+ std::string getFileName() const override
+ {
+ return file_name;
+ }
+
+private:
+ void finalizeImpl() override;
+};
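+
+/// A minimal usage sketch (the path is illustrative):
+///
+///     WriteBufferFromFile out("/tmp/data.bin");
+///     writeString("hello", out);
+///     out.finalize(); /// or rely on the destructor, which also finalizes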
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromFileBase.cpp b/contrib/clickhouse/src/IO/WriteBufferFromFileBase.cpp
new file mode 100644
index 0000000000..2b9cbb88cd
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromFileBase.cpp
@@ -0,0 +1,11 @@
+#include <IO/WriteBufferFromFileBase.h>
+
+namespace DB
+{
+
+WriteBufferFromFileBase::WriteBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment)
+ : BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment)
+{
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromFileBase.h b/contrib/clickhouse/src/IO/WriteBufferFromFileBase.h
new file mode 100644
index 0000000000..d6e2144bcc
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromFileBase.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <string>
+#include <fcntl.h>
+
+#include <IO/WriteBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+
+namespace DB
+{
+
+class WriteBufferFromFileBase : public BufferWithOwnMemory<WriteBuffer>
+{
+public:
+ WriteBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment);
+
+ void sync() override = 0;
+ virtual std::string getFileName() const = 0;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromFileDecorator.cpp b/contrib/clickhouse/src/IO/WriteBufferFromFileDecorator.cpp
new file mode 100644
index 0000000000..0e4e5e13a8
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromFileDecorator.cpp
@@ -0,0 +1,74 @@
+#include "WriteBufferFromFileDecorator.h"
+
+#include <IO/WriteBuffer.h>
+#include <IO/SwapHelper.h>
+
+namespace DB
+{
+
+WriteBufferFromFileDecorator::WriteBufferFromFileDecorator(std::unique_ptr<WriteBuffer> impl_)
+ : WriteBufferFromFileBase(0, nullptr, 0), impl(std::move(impl_))
+{
+ swap(*impl);
+}
+
+void WriteBufferFromFileDecorator::finalizeImpl()
+{
+ /// If an exception is thrown in preFinalize as a part of the finalize call,
+ /// WriteBufferFromFileDecorator::finalized is set to true
+ /// while impl->finalized remains false.
+ /// That leads to a situation where the destructor of impl is called with impl->finalized equal to false.
+ if (!is_prefinalized)
+ WriteBufferFromFileDecorator::preFinalize();
+
+ {
+ SwapHelper swap(*this, *impl);
+ impl->finalize();
+ }
+}
+
+WriteBufferFromFileDecorator::~WriteBufferFromFileDecorator()
+{
+ /// It is not a mistake that swap is called here:
+ /// swap was called in the constructor, so it must also be called in the destructor,
+ /// in order to provide a valid buffer for impl's d-tor call.
+ swap(*impl);
+}
+
+void WriteBufferFromFileDecorator::sync()
+{
+ next();
+
+ {
+ SwapHelper swap(*this, *impl);
+ impl->sync();
+ }
+}
+
+std::string WriteBufferFromFileDecorator::getFileName() const
+{
+ if (WriteBufferFromFileBase * buffer = dynamic_cast<WriteBufferFromFileBase*>(impl.get()))
+ return buffer->getFileName();
+ return std::string();
+}
+
+void WriteBufferFromFileDecorator::preFinalize()
+{
+ next();
+
+ {
+ SwapHelper swap(*this, *impl);
+ impl->preFinalize();
+ }
+
+ is_prefinalized = true;
+}
+
+void WriteBufferFromFileDecorator::nextImpl()
+{
+ SwapHelper swap(*this, *impl);
+ impl->next();
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromFileDecorator.h b/contrib/clickhouse/src/IO/WriteBufferFromFileDecorator.h
new file mode 100644
index 0000000000..5344bb1425
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromFileDecorator.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <IO/WriteBufferFromFileBase.h>
+
+namespace DB
+{
+
+/// Delegates all writes to underlying buffer. Doesn't have own memory.
+class WriteBufferFromFileDecorator : public WriteBufferFromFileBase
+{
+public:
+ explicit WriteBufferFromFileDecorator(std::unique_ptr<WriteBuffer> impl_);
+
+ ~WriteBufferFromFileDecorator() override;
+
+ void sync() override;
+
+ std::string getFileName() const override;
+
+ void preFinalize() override;
+
+ const WriteBuffer & getImpl() const { return *impl; }
+
+protected:
+ void finalizeImpl() override;
+
+ std::unique_ptr<WriteBuffer> impl;
+
+private:
+ void nextImpl() override;
+
+ bool is_prefinalized = false;
+};
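+
+/// A minimal usage sketch (illustrative): the decorator swaps buffers with the
+/// wrapped buffer, so writes land directly in the underlying memory:
+///
+///     auto file = std::make_unique<WriteBufferFromFile>("/tmp/data.bin");
+///     WriteBufferFromFileDecorator decorated(std::move(file));
+///     writeString("payload", decorated);
+///     decorated.finalize();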
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptor.cpp b/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptor.cpp
new file mode 100644
index 0000000000..135ff60896
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptor.cpp
@@ -0,0 +1,179 @@
+#include <unistd.h>
+#include <cerrno>
+#include <cassert>
+#include <sys/stat.h>
+
+#include <Common/Throttler.h>
+#include <Common/Exception.h>
+#include <Common/ProfileEvents.h>
+#include <Common/CurrentMetrics.h>
+#include <Common/Stopwatch.h>
+
+#include <IO/WriteBufferFromFileDescriptor.h>
+#include <IO/WriteHelpers.h>
+
+
+namespace ProfileEvents
+{
+ extern const Event WriteBufferFromFileDescriptorWrite;
+ extern const Event WriteBufferFromFileDescriptorWriteFailed;
+ extern const Event WriteBufferFromFileDescriptorWriteBytes;
+ extern const Event DiskWriteElapsedMicroseconds;
+ extern const Event FileSync;
+ extern const Event FileSyncElapsedMicroseconds;
+ extern const Event LocalWriteThrottlerBytes;
+ extern const Event LocalWriteThrottlerSleepMicroseconds;
+}
+
+namespace CurrentMetrics
+{
+ extern const Metric Write;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_WRITE_TO_FILE_DESCRIPTOR;
+ extern const int CANNOT_FSYNC;
+ extern const int CANNOT_SEEK_THROUGH_FILE;
+ extern const int CANNOT_TRUNCATE_FILE;
+ extern const int CANNOT_FSTAT;
+}
+
+
+void WriteBufferFromFileDescriptor::nextImpl()
+{
+ if (!offset())
+ return;
+
+ Stopwatch watch;
+
+ size_t bytes_written = 0;
+ while (bytes_written != offset())
+ {
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWrite);
+
+ ssize_t res = 0;
+ {
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::Write};
+ res = ::write(fd, working_buffer.begin() + bytes_written, offset() - bytes_written);
+ }
+
+ if ((-1 == res || 0 == res) && errno != EINTR)
+ {
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWriteFailed);
+
+ /// Don't use getFileName() here because this method can be called from the destructor.
+ String error_file_name = file_name;
+ if (error_file_name.empty())
+ error_file_name = "(fd = " + toString(fd) + ")";
+ throwFromErrnoWithPath("Cannot write to file " + error_file_name, error_file_name,
+ ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR);
+ }
+
+ if (res > 0)
+ {
+ bytes_written += res;
+ if (throttler)
+ throttler->add(res, ProfileEvents::LocalWriteThrottlerBytes, ProfileEvents::LocalWriteThrottlerSleepMicroseconds);
+ }
+ }
+
+ ProfileEvents::increment(ProfileEvents::DiskWriteElapsedMicroseconds, watch.elapsedMicroseconds());
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWriteBytes, bytes_written);
+}
+
+/// NOTE: This class can be used as a very low-level building block, for example
+/// in the trace collector. In such places memory allocations can be dangerous,
+/// so don't allocate anything in this constructor.
+WriteBufferFromFileDescriptor::WriteBufferFromFileDescriptor(
+ int fd_,
+ size_t buf_size,
+ char * existing_memory,
+ ThrottlerPtr throttler_,
+ size_t alignment,
+ std::string file_name_)
+ : WriteBufferFromFileBase(buf_size, existing_memory, alignment)
+ , fd(fd_)
+ , throttler(throttler_)
+ , file_name(std::move(file_name_))
+{
+}
+
+
+WriteBufferFromFileDescriptor::~WriteBufferFromFileDescriptor()
+{
+ finalize();
+}
+
+void WriteBufferFromFileDescriptor::finalizeImpl()
+{
+ if (fd < 0)
+ {
+ assert(!offset() && "attempt to write after close");
+ return;
+ }
+
+ next();
+}
+
+void WriteBufferFromFileDescriptor::sync()
+{
+ /// If buffer has pending data - write it.
+ next();
+
+ ProfileEvents::increment(ProfileEvents::FileSync);
+
+ Stopwatch watch;
+
+ /// Request OS to sync data with storage medium.
+#if defined(OS_DARWIN)
+ int res = ::fsync(fd);
+#else
+ int res = ::fdatasync(fd);
+#endif
+ ProfileEvents::increment(ProfileEvents::FileSyncElapsedMicroseconds, watch.elapsedMicroseconds());
+
+ if (-1 == res)
+ throwFromErrnoWithPath("Cannot fsync " + getFileName(), getFileName(), ErrorCodes::CANNOT_FSYNC);
+}
+
+
+off_t WriteBufferFromFileDescriptor::seek(off_t offset, int whence) // NOLINT
+{
+ off_t res = lseek(fd, offset, whence);
+ if (-1 == res)
+ throwFromErrnoWithPath("Cannot seek through file " + getFileName(), getFileName(),
+ ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
+ return res;
+}
+
+void WriteBufferFromFileDescriptor::truncate(off_t length) // NOLINT
+{
+ int res = ftruncate(fd, length);
+ if (-1 == res)
+ throwFromErrnoWithPath("Cannot truncate file " + getFileName(), getFileName(), ErrorCodes::CANNOT_TRUNCATE_FILE);
+}
+
+
+off_t WriteBufferFromFileDescriptor::size() const
+{
+ struct stat buf;
+ int res = fstat(fd, &buf);
+ if (-1 == res)
+ throwFromErrnoWithPath("Cannot execute fstat " + getFileName(), getFileName(), ErrorCodes::CANNOT_FSTAT);
+ return buf.st_size;
+}
+
+std::string WriteBufferFromFileDescriptor::getFileName() const
+{
+ if (file_name.empty())
+ return "(fd = " + toString(fd) + ")";
+
+ return file_name;
+}
+
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptor.h b/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptor.h
new file mode 100644
index 0000000000..cb73b1e1d0
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptor.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <IO/WriteBufferFromFileBase.h>
+#include <Common/Throttler_fwd.h>
+
+
+namespace DB
+{
+
+/** Uses a ready file descriptor. Does not open or close the file.
+ */
+class WriteBufferFromFileDescriptor : public WriteBufferFromFileBase
+{
+public:
+ explicit WriteBufferFromFileDescriptor(
+ int fd_ = -1,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ ThrottlerPtr throttler_ = {},
+ size_t alignment = 0,
+ std::string file_name_ = "");
+
+ /** Can be used before initialization if the needed 'fd' was not passed to the constructor.
+ * It is not possible to change 'fd' while the buffer is in use.
+ */
+ void setFD(int fd_)
+ {
+ fd = fd_;
+ }
+
+ ~WriteBufferFromFileDescriptor() override;
+
+ int getFD() const
+ {
+ return fd;
+ }
+
+ void sync() override;
+
+ /// clang-tidy wants these methods to be const, but
+ /// they are not const semantically
+ off_t seek(off_t offset, int whence); // NOLINT
+ void truncate(off_t length); // NOLINT
+
+ /// Name or some description of file.
+ std::string getFileName() const override;
+
+ off_t size() const;
+
+protected:
+ void nextImpl() override;
+
+ int fd;
+ ThrottlerPtr throttler;
+
+ /// Contains the file name if the file has one; otherwise contains the string "(fd = ...)".
+ std::string file_name;
+
+ void finalizeImpl() override;
+};
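+
+/// A minimal usage sketch (writing to an already opened descriptor):
+///
+///     WriteBufferFromFileDescriptor out(STDOUT_FILENO);
+///     writeString("hello\n", out);
+///     out.finalize(); /// flushes the remaining buffered bytes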
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptorDiscardOnFailure.cpp b/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptorDiscardOnFailure.cpp
new file mode 100644
index 0000000000..69be24f0fa
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptorDiscardOnFailure.cpp
@@ -0,0 +1,32 @@
+#include <IO/WriteBufferFromFileDescriptorDiscardOnFailure.h>
+
+namespace ProfileEvents
+{
+ extern const Event CannotWriteToWriteBufferDiscard;
+}
+
+namespace DB
+{
+
+void WriteBufferFromFileDescriptorDiscardOnFailure::nextImpl()
+{
+ size_t bytes_written = 0;
+ while (bytes_written != offset())
+ {
+ ssize_t res = ::write(fd, working_buffer.begin() + bytes_written, offset() - bytes_written);
+
+ if ((-1 == res || 0 == res) && errno != EINTR)
+ {
+ /// Never send this profile event to trace log because it may cause another
+ /// write into the same fd and likely will trigger the same error
+ /// and will lead to infinite recursion.
+ ProfileEvents::incrementNoTrace(ProfileEvents::CannotWriteToWriteBufferDiscard);
+ break; /// Discard
+ }
+
+ if (res > 0)
+ bytes_written += res;
+ }
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptorDiscardOnFailure.h b/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptorDiscardOnFailure.h
new file mode 100644
index 0000000000..2803dd4e8b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromFileDescriptorDiscardOnFailure.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <IO/WriteBufferFromFileDescriptor.h>
+
+
+namespace DB
+{
+
+/** Writes to a file descriptor but drops the data if the write would block or fail.
+ * Intended for use within a signal handler. Motivating example: a signal handler invoked during execution of malloc
+ * should not block, because some mutex (or, even worse, a spinlock) may be held.
+ */
+class WriteBufferFromFileDescriptorDiscardOnFailure : public WriteBufferFromFileDescriptor
+{
+protected:
+ void nextImpl() override;
+
+public:
+ using WriteBufferFromFileDescriptor::WriteBufferFromFileDescriptor;
+ ~WriteBufferFromFileDescriptorDiscardOnFailure() override = default;
+};
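+
+/// A minimal usage sketch (illustrative): the buffer memory is preallocated so that
+/// nothing is allocated at write time, which matters inside a signal handler:
+///
+///     char mem[1024];
+///     WriteBufferFromFileDescriptorDiscardOnFailure out(STDERR_FILENO, sizeof(mem), mem);
+///     writeString("signal!\n", out);
+///     out.next(); /// best effort: the data is silently dropped if the write fails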
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromHTTP.cpp b/contrib/clickhouse/src/IO/WriteBufferFromHTTP.cpp
new file mode 100644
index 0000000000..056b965266
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromHTTP.cpp
@@ -0,0 +1,50 @@
+#include <IO/WriteBufferFromHTTP.h>
+
+#include <Common/logger_useful.h>
+
+
+namespace DB
+{
+
+WriteBufferFromHTTP::WriteBufferFromHTTP(
+ const Poco::URI & uri,
+ const std::string & method,
+ const std::string & content_type,
+ const std::string & content_encoding,
+ const HTTPHeaderEntries & additional_headers,
+ const ConnectionTimeouts & timeouts,
+ size_t buffer_size_,
+ Poco::Net::HTTPClientSession::ProxyConfig proxy_configuration)
+ : WriteBufferFromOStream(buffer_size_)
+ , session{makeHTTPSession(uri, timeouts, proxy_configuration)}
+ , request{method, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1}
+{
+ request.setHost(uri.getHost());
+ request.setChunkedTransferEncoding(true);
+
+ if (!content_type.empty())
+ {
+ request.set("Content-Type", content_type);
+ }
+
+ if (!content_encoding.empty())
+ request.set("Content-Encoding", content_encoding);
+
+ for (const auto & header: additional_headers)
+ request.add(header.name, header.value);
+
+ LOG_TRACE((&Poco::Logger::get("WriteBufferToHTTP")), "Sending request to {}", uri.toString());
+
+ ostr = &session->sendRequest(request);
+}
+
+void WriteBufferFromHTTP::finalizeImpl()
+{
+ // Make sure the content in the buffer has been flushed
+ this->next();
+
+ receiveResponse(*session, request, response, false);
+ /// TODO: Response body is ignored.
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromHTTP.h b/contrib/clickhouse/src/IO/WriteBufferFromHTTP.h
new file mode 100644
index 0000000000..65dc10213d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromHTTP.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <IO/ConnectionTimeouts.h>
+#include <IO/WriteBuffer.h>
+#include <IO/WriteBufferFromOStream.h>
+#include <IO/HTTPCommon.h>
+#include <IO/HTTPHeaderEntries.h>
+#include <Poco/Net/HTTPClientSession.h>
+#include <Poco/Net/HTTPRequest.h>
+#include <Poco/Net/HTTPResponse.h>
+#include <Poco/URI.h>
+
+
+namespace DB
+{
+
+/* Performs an HTTP POST or PUT request.
+ */
+class WriteBufferFromHTTP : public WriteBufferFromOStream
+{
+public:
+ explicit WriteBufferFromHTTP(const Poco::URI & uri,
+ const std::string & method = Poco::Net::HTTPRequest::HTTP_POST, // POST or PUT only
+ const std::string & content_type = "",
+ const std::string & content_encoding = "",
+ const HTTPHeaderEntries & additional_headers = {},
+ const ConnectionTimeouts & timeouts = {},
+ size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE,
+ Poco::Net::HTTPClientSession::ProxyConfig proxy_configuration = {});
+
+private:
+ /// Receives response from the server after sending all data.
+ void finalizeImpl() override;
+
+ HTTPSessionPtr session;
+ Poco::Net::HTTPRequest request;
+ Poco::Net::HTTPResponse response;
+};
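+
+/// A minimal usage sketch (the URL is illustrative; the request is sent from the
+/// constructor, the response is received on finalize()):
+///
+///     Poco::URI uri("http://example.com/upload");
+///     WriteBufferFromHTTP out(uri, Poco::Net::HTTPRequest::HTTP_PUT);
+///     writeString("payload", out);
+///     out.finalize();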
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromOStream.cpp b/contrib/clickhouse/src/IO/WriteBufferFromOStream.cpp
new file mode 100644
index 0000000000..ffc3e62e9a
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromOStream.cpp
@@ -0,0 +1,42 @@
+#include <IO/WriteBufferFromOStream.h>
+#include <Common/logger_useful.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_WRITE_TO_OSTREAM;
+}
+
+void WriteBufferFromOStream::nextImpl()
+{
+ if (!offset())
+ return;
+
+ ostr->write(working_buffer.begin(), offset());
+ ostr->flush();
+
+ if (!ostr->good())
+ throw Exception(ErrorCodes::CANNOT_WRITE_TO_OSTREAM, "Cannot write to ostream at offset {}", count());
+}
+
+WriteBufferFromOStream::WriteBufferFromOStream(
+ size_t size,
+ char * existing_memory,
+ size_t alignment)
+ : BufferWithOwnMemory<WriteBuffer>(size, existing_memory, alignment)
+{
+}
+
+WriteBufferFromOStream::WriteBufferFromOStream(
+ std::ostream & ostr_,
+ size_t size,
+ char * existing_memory,
+ size_t alignment)
+ : BufferWithOwnMemory<WriteBuffer>(size, existing_memory, alignment), ostr(&ostr_)
+{
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromOStream.h b/contrib/clickhouse/src/IO/WriteBufferFromOStream.h
new file mode 100644
index 0000000000..3f9d3ee3d9
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromOStream.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <iosfwd>
+
+#include <IO/WriteBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+
+
+namespace DB
+{
+
+class WriteBufferFromOStream : public BufferWithOwnMemory<WriteBuffer>
+{
+public:
+ explicit WriteBufferFromOStream(
+ std::ostream & ostr_,
+ size_t size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+protected:
+ explicit WriteBufferFromOStream(size_t size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0);
+
+ void nextImpl() override;
+
+ std::ostream * ostr{};
+};
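+
+/// A minimal usage sketch:
+///
+///     std::ostringstream oss;
+///     WriteBufferFromOStream out(oss);
+///     writeString("abc", out);
+///     out.finalize(); /// after this, oss.str() == "abc"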
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromPocoSocket.cpp b/contrib/clickhouse/src/IO/WriteBufferFromPocoSocket.cpp
new file mode 100644
index 0000000000..171e7f1ce6
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromPocoSocket.cpp
@@ -0,0 +1,140 @@
+#include <Poco/Net/NetException.h>
+
+#include <base/scope_guard.h>
+
+#include <IO/WriteBufferFromPocoSocket.h>
+
+#include <Common/Exception.h>
+#include <Common/NetException.h>
+#include <Common/Stopwatch.h>
+#include <Common/ProfileEvents.h>
+#include <Common/CurrentMetrics.h>
+#include <Common/AsyncTaskExecutor.h>
+#include <Common/checkSSLReturnCode.h>
+
+namespace ProfileEvents
+{
+ extern const Event NetworkSendElapsedMicroseconds;
+ extern const Event NetworkSendBytes;
+}
+
+namespace CurrentMetrics
+{
+ extern const Metric NetworkSend;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int NETWORK_ERROR;
+ extern const int SOCKET_TIMEOUT;
+ extern const int CANNOT_WRITE_TO_SOCKET;
+ extern const int LOGICAL_ERROR;
+}
+
+void WriteBufferFromPocoSocket::nextImpl()
+{
+ if (!offset())
+ return;
+
+ Stopwatch watch;
+ size_t bytes_written = 0;
+
+ SCOPE_EXIT({
+ ProfileEvents::increment(ProfileEvents::NetworkSendElapsedMicroseconds, watch.elapsedMicroseconds());
+ ProfileEvents::increment(ProfileEvents::NetworkSendBytes, bytes_written);
+ });
+
+ while (bytes_written < offset())
+ {
+ ssize_t res = 0;
+
+ /// Add more details to exceptions.
+ try
+ {
+ CurrentMetrics::Increment metric_increment(CurrentMetrics::NetworkSend);
+ char * pos = working_buffer.begin() + bytes_written;
+ size_t size = offset() - bytes_written;
+ if (size > INT_MAX)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Buffer overflow");
+
+ /// If async_callback is specified, set socket to non-blocking mode
+ /// and try to write data to it, if socket is not ready for writing,
+ /// run async_callback and try again later.
+ /// It is expected that file descriptor may be polled externally.
+ /// Note that send timeout is not checked here. External code should check it while polling.
+ if (async_callback)
+ {
+ socket.setBlocking(false);
+ /// Set socket to blocking mode at the end.
+ SCOPE_EXIT(socket.setBlocking(true));
+ bool secure = socket.secure();
+ res = socket.impl()->sendBytes(pos, static_cast<int>(size));
+
+ /// Check EAGAIN and ERR_SSL_WANT_WRITE/ERR_SSL_WANT_READ for secure socket (writing to secure socket can read too).
+ while (res < 0 && (errno == EAGAIN || (secure && (checkSSLWantRead(res) || checkSSLWantWrite(res)))))
+ {
+ /// In case of ERR_SSL_WANT_READ we should wait for socket to be ready for reading, otherwise - for writing.
+ if (secure && checkSSLWantRead(res))
+ async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), AsyncEventTimeoutType::RECEIVE, socket_description, AsyncTaskExecutor::Event::READ | AsyncTaskExecutor::Event::ERROR);
+ else
+ async_callback(socket.impl()->sockfd(), socket.getSendTimeout(), AsyncEventTimeoutType::SEND, socket_description, AsyncTaskExecutor::Event::WRITE | AsyncTaskExecutor::Event::ERROR);
+
+ /// Try to write again.
+ res = socket.impl()->sendBytes(pos, static_cast<int>(size));
+ }
+ }
+ else
+ {
+ res = socket.impl()->sendBytes(pos, static_cast<int>(size));
+ }
+ }
+ catch (const Poco::Net::NetException & e)
+ {
+ throw NetException(ErrorCodes::NETWORK_ERROR, "{}, while writing to socket ({} -> {})", e.displayText(),
+ our_address.toString(), peer_address.toString());
+ }
+ catch (const Poco::TimeoutException &)
+ {
+ throw NetException(ErrorCodes::SOCKET_TIMEOUT, "Timeout exceeded while writing to socket ({}, {} ms)",
+ peer_address.toString(),
+ socket.impl()->getSendTimeout().totalMilliseconds());
+ }
+ catch (const Poco::IOException & e)
+ {
+ throw NetException(ErrorCodes::NETWORK_ERROR, "{}, while writing to socket ({} -> {})", e.displayText(),
+ our_address.toString(), peer_address.toString());
+ }
+
+ if (res < 0)
+ throw NetException(ErrorCodes::CANNOT_WRITE_TO_SOCKET, "Cannot write to socket ({} -> {})",
+ our_address.toString(), peer_address.toString());
+
+ bytes_written += res;
+ }
+}
+
+WriteBufferFromPocoSocket::WriteBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size)
+ : BufferWithOwnMemory<WriteBuffer>(buf_size)
+ , socket(socket_)
+ , peer_address(socket.peerAddress())
+ , our_address(socket.address())
+ , socket_description("socket (" + peer_address.toString() + ")")
+{
+}
+
+WriteBufferFromPocoSocket::~WriteBufferFromPocoSocket()
+{
+ try
+ {
+ finalize();
+ }
+ catch (...)
+ {
+ tryLogCurrentException(__PRETTY_FUNCTION__);
+ }
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromPocoSocket.h b/contrib/clickhouse/src/IO/WriteBufferFromPocoSocket.h
new file mode 100644
index 0000000000..ecb6102035
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromPocoSocket.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <Poco/Net/Socket.h>
+
+#include <IO/WriteBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+#include <Common/AsyncTaskExecutor.h>
+
+namespace DB
+{
+
+using AsyncCallback = std::function<void(int, Poco::Timespan, AsyncEventTimeoutType, const std::string &, uint32_t)>;
+
+/** Works with the ready Poco::Net::Socket. Blocking operations.
+ */
+class WriteBufferFromPocoSocket : public BufferWithOwnMemory<WriteBuffer>
+{
+public:
+ explicit WriteBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
+
+ ~WriteBufferFromPocoSocket() override;
+
+ void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); }
+
+protected:
+ void nextImpl() override;
+
+ Poco::Net::Socket & socket;
+
+ /** For error messages. It is necessary to obtain this address in advance, because,
+ * for example, if the connection is broken, the address can no longer be obtained
+ * (getpeername will return an error).
+ */
+ Poco::Net::SocketAddress peer_address;
+ Poco::Net::SocketAddress our_address;
+
+private:
+ AsyncCallback async_callback;
+ std::string socket_description;
+};
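+
+/// A minimal usage sketch (the address is illustrative; the socket must be connected):
+///
+///     Poco::Net::StreamSocket socket(Poco::Net::SocketAddress("127.0.0.1", 9000));
+///     WriteBufferFromPocoSocket out(socket);
+///     writeString("ping", out);
+///     out.next(); /// actually sends the bytes over the socket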
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromS3.cpp b/contrib/clickhouse/src/IO/WriteBufferFromS3.cpp
new file mode 100644
index 0000000000..824d0ae00a
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromS3.cpp
@@ -0,0 +1,699 @@
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include "StdIStreamFromMemory.h"
+#include "WriteBufferFromS3.h"
+#include "WriteBufferFromS3TaskTracker.h"
+
+#include <Common/logger_useful.h>
+#include <Common/ProfileEvents.h>
+#include <Common/Throttler.h>
+#include <Interpreters/Cache/FileCache.h>
+
+#include <IO/ResourceGuard.h>
+#include <IO/WriteHelpers.h>
+#include <IO/S3Common.h>
+#include <IO/S3/Requests.h>
+#include <IO/S3/getObjectInfo.h>
+#include <Interpreters/Context.h>
+
+#include <aws/s3/model/StorageClass.h>
+
+#include <utility>
+
+
+namespace ProfileEvents
+{
+ extern const Event WriteBufferFromS3Bytes;
+ extern const Event WriteBufferFromS3Microseconds;
+ extern const Event WriteBufferFromS3RequestsErrors;
+ extern const Event S3WriteBytes;
+
+ extern const Event S3CreateMultipartUpload;
+ extern const Event S3CompleteMultipartUpload;
+ extern const Event S3AbortMultipartUpload;
+ extern const Event S3UploadPart;
+ extern const Event S3PutObject;
+
+ extern const Event DiskS3CreateMultipartUpload;
+ extern const Event DiskS3CompleteMultipartUpload;
+ extern const Event DiskS3AbortMultipartUpload;
+ extern const Event DiskS3UploadPart;
+ extern const Event DiskS3PutObject;
+
+ extern const Event RemoteWriteThrottlerBytes;
+ extern const Event RemoteWriteThrottlerSleepMicroseconds;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int S3_ERROR;
+ extern const int INVALID_CONFIG_PARAMETER;
+ extern const int LOGICAL_ERROR;
+}
+
+std::shared_ptr<std::iostream> WriteBufferFromS3::PartData::createAwsBuffer()
+{
+ auto buffer = std::make_shared<StdIStreamFromMemory>(memory.data(), data_size);
+ buffer->exceptions(std::ios::badbit);
+ return buffer;
+}
+
+WriteBufferFromS3::WriteBufferFromS3(
+ std::shared_ptr<const S3::Client> client_ptr_,
+ std::shared_ptr<const S3::Client> client_with_long_timeout_ptr_,
+ const String & bucket_,
+ const String & key_,
+ size_t buf_size_,
+ const S3Settings::RequestSettings & request_settings_,
+ std::optional<std::map<String, String>> object_metadata_,
+ ThreadPoolCallbackRunner<void> schedule_,
+ const WriteSettings & write_settings_)
+ : WriteBufferFromFileBase(buf_size_, nullptr, 0)
+ , bucket(bucket_)
+ , key(key_)
+ , request_settings(request_settings_)
+ , upload_settings(request_settings.getUploadSettings())
+ , write_settings(write_settings_)
+ , client_ptr(std::move(client_ptr_))
+ , client_with_long_timeout_ptr(std::move(client_with_long_timeout_ptr_))
+ , object_metadata(std::move(object_metadata_))
+ , buffer_allocation_policy(ChooseBufferPolicy(upload_settings))
+ , task_tracker(
+ std::make_unique<WriteBufferFromS3::TaskTracker>(
+ std::move(schedule_),
+ upload_settings.max_inflight_parts_for_one_file,
+ limitedLog))
+{
+ LOG_TRACE(limitedLog, "Create WriteBufferFromS3, {}", getShortLogDetails());
+
+ allocateBuffer();
+}
+
+void WriteBufferFromS3::nextImpl()
+{
+ if (is_prefinalized)
+ throw Exception(
+ ErrorCodes::LOGICAL_ERROR,
+ "Cannot write to prefinalized buffer for S3, the file could have been created with PutObjectRequest");
+
+ /// It makes sense to call waitIfAny before adding a new async task, to check whether an exception has occurred.
+ /// The faster the exception is propagated, the less time is spent on cancellation.
+ /// Although `task_tracker->add()` collects task statuses and propagates their exceptions,
+ /// this call is still necessary for the case when there is no in-flight limit and therefore `task_tracker->add()` doesn't wait for anything.
+ task_tracker->waitIfAny();
+
+ hidePartialData();
+
+ reallocateFirstBuffer();
+
+ if (available() > 0)
+ return;
+
+ detachBuffer();
+
+ if (!multipart_upload_id.empty() || detached_part_data.size() > 1)
+ writeMultipartUpload();
+
+ allocateBuffer();
+}
+
+void WriteBufferFromS3::preFinalize()
+{
+ if (is_prefinalized)
+ return;
+
+ LOG_TEST(limitedLog, "preFinalize WriteBufferFromS3. {}", getShortLogDetails());
+
+ /// This function should not be run again if an exception has occurred
+ is_prefinalized = true;
+
+ hidePartialData();
+
+ if (hidden_size > 0)
+ detachBuffer();
+ setFakeBufferWhenPreFinalized();
+
+ bool do_single_part_upload = false;
+
+ if (multipart_upload_id.empty() && detached_part_data.size() <= 1)
+ {
+ if (detached_part_data.empty() || detached_part_data.front().data_size <= upload_settings.max_single_part_upload_size)
+ do_single_part_upload = true;
+ }
+
+ if (do_single_part_upload)
+ {
+ if (detached_part_data.empty())
+ {
+ makeSinglepartUpload({});
+ }
+ else
+ {
+ makeSinglepartUpload(std::move(detached_part_data.front()));
+ detached_part_data.pop_front();
+ }
+ }
+ else
+ {
+ writeMultipartUpload();
+ }
+}
+
+void WriteBufferFromS3::finalizeImpl()
+{
+ LOG_TRACE(limitedLog, "finalizeImpl WriteBufferFromS3. {}.", getShortLogDetails());
+
+ if (!is_prefinalized)
+ preFinalize();
+
+ chassert(offset() == 0);
+ chassert(hidden_size == 0);
+
+ task_tracker->waitAll();
+
+ if (!multipart_upload_id.empty())
+ {
+ completeMultipartUpload();
+ multipart_upload_finished = true;
+ }
+
+ if (request_settings.check_objects_after_upload)
+ {
+ S3::checkObjectExists(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage, "Immediately after upload");
+
+ size_t actual_size = S3::getObjectSize(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage);
+ if (actual_size != total_size)
+ throw Exception(
+ ErrorCodes::S3_ERROR,
+ "Object {} from bucket {} has unexpected size {} after upload, expected size {}, it's a bug in S3 or S3 API.",
+ key, bucket, actual_size, total_size);
+ }
+}
+
+String WriteBufferFromS3::getVerboseLogDetails() const
+{
+ String multipart_upload_details;
+ if (!multipart_upload_id.empty())
+ multipart_upload_details = fmt::format(", upload id {}, upload has finished {}"
+ , multipart_upload_id, multipart_upload_finished);
+
+ return fmt::format("Details: bucket {}, key {}, total size {}, count {}, hidden_size {}, offset {}, with pool: {}, prefinalized {}, finalized {}{}",
+ bucket, key, total_size, count(), hidden_size, offset(), task_tracker->isAsync(), is_prefinalized, finalized, multipart_upload_details);
+}
+
+String WriteBufferFromS3::getShortLogDetails() const
+{
+ String multipart_upload_details;
+ if (!multipart_upload_id.empty())
+ multipart_upload_details = fmt::format(", upload id {}"
+ , multipart_upload_id);
+
+ return fmt::format("Details: bucket {}, key {}{}",
+ bucket, key, multipart_upload_details);
+}
+
+void WriteBufferFromS3::tryToAbortMultipartUpload()
+{
+ try
+ {
+ task_tracker->safeWaitAll();
+ abortMultipartUpload();
+ }
+ catch (...)
+ {
+ LOG_ERROR(log, "Multipart upload hasn't aborted. {}", getVerboseLogDetails());
+ tryLogCurrentException(__PRETTY_FUNCTION__);
+ }
+}
+
+WriteBufferFromS3::~WriteBufferFromS3()
+{
+ LOG_TRACE(limitedLog, "Close WriteBufferFromS3. {}.", getShortLogDetails());
+
+ /// The destructor can be called with finalized == false in case of exceptions.
+ if (!finalized)
+ {
+ LOG_INFO(
+ log,
+ "WriteBufferFromS3 is not finalized in destructor. "
+ "The file might not be written to S3. "
+ "{}.",
+ getVerboseLogDetails());
+ }
+
+ task_tracker->safeWaitAll();
+
+ if (!multipart_upload_id.empty() && !multipart_upload_finished)
+ {
+ LOG_WARNING(log, "WriteBufferFromS3 was neither finished nor aborted, try to abort upload in destructor. {}.", getVerboseLogDetails());
+ tryToAbortMultipartUpload();
+ }
+}
+
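+/// Shrinks the working buffer so that the bytes written so far are "hidden" in front of it:
+/// they stay in `memory` (accounted in `hidden_size`) until detachBuffer() turns them into a part.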
+void WriteBufferFromS3::hidePartialData()
+{
+ if (write_settings.remote_throttler)
+ write_settings.remote_throttler->add(offset(), ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds);
+
+ chassert(memory.size() >= hidden_size + offset());
+
+ hidden_size += offset();
+ chassert(memory.data() + hidden_size == working_buffer.begin() + offset());
+ chassert(memory.data() + hidden_size == position());
+
+ WriteBuffer::set(memory.data() + hidden_size, memory.size() - hidden_size);
+ chassert(offset() == 0);
+}
+
+void WriteBufferFromS3::reallocateFirstBuffer()
+{
+ chassert(offset() == 0);
+
+ if (buffer_allocation_policy->getBufferNumber() > 1 || available() > 0)
+ return;
+
+ const size_t max_first_buffer = buffer_allocation_policy->getBufferSize();
+ if (memory.size() == max_first_buffer)
+ return;
+
+ size_t size = std::min(memory.size() * 2, max_first_buffer);
+ memory.resize(size);
+
+ WriteBuffer::set(memory.data() + hidden_size, memory.size() - hidden_size);
+
+ chassert(offset() == 0);
+}
+
+void WriteBufferFromS3::detachBuffer()
+{
+ size_t data_size = size_t(position() - memory.data());
+ chassert(data_size == hidden_size);
+
+ auto buf = std::move(memory);
+
+ WriteBuffer::set(nullptr, 0);
+ total_size += hidden_size;
+ hidden_size = 0;
+
+ detached_part_data.push_back({std::move(buf), data_size});
+}
+
+void WriteBufferFromS3::allocateFirstBuffer()
+{
+ const auto max_first_buffer = buffer_allocation_policy->getBufferSize();
+ const auto size = std::min(size_t(DBMS_DEFAULT_BUFFER_SIZE), max_first_buffer);
+ memory = Memory(size);
+ WriteBuffer::set(memory.data(), memory.size());
+}
+
+void WriteBufferFromS3::allocateBuffer()
+{
+ buffer_allocation_policy->nextBuffer();
+ chassert(0 == hidden_size);
+
+ if (buffer_allocation_policy->getBufferNumber() == 1)
+ return allocateFirstBuffer();
+
+ memory = Memory(buffer_allocation_policy->getBufferSize());
+ WriteBuffer::set(memory.data(), memory.size());
+}
+
+void WriteBufferFromS3::setFakeBufferWhenPreFinalized()
+{
+ WriteBuffer::set(fake_buffer_when_prefinalized, sizeof(fake_buffer_when_prefinalized));
+}
+
+void WriteBufferFromS3::writeMultipartUpload()
+{
+ if (multipart_upload_id.empty())
+ {
+ createMultipartUpload();
+ }
+
+ while (!detached_part_data.empty())
+ {
+ writePart(std::move(detached_part_data.front()));
+ detached_part_data.pop_front();
+ }
+}
+
+void WriteBufferFromS3::createMultipartUpload()
+{
+ LOG_TEST(limitedLog, "Create multipart upload. {}", getShortLogDetails());
+
+ S3::CreateMultipartUploadRequest req;
+
+ req.SetBucket(bucket);
+ req.SetKey(key);
+
+ /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840
+ req.SetContentType("binary/octet-stream");
+
+ if (object_metadata.has_value())
+ req.SetMetadata(object_metadata.value());
+
+ client_ptr->setKMSHeaders(req);
+
+ ProfileEvents::increment(ProfileEvents::S3CreateMultipartUpload);
+ if (write_settings.for_object_storage)
+ ProfileEvents::increment(ProfileEvents::DiskS3CreateMultipartUpload);
+
+ Stopwatch watch;
+ auto outcome = client_ptr->CreateMultipartUpload(req);
+ watch.stop();
+
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds());
+
+ if (!outcome.IsSuccess())
+ {
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1);
+ throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
+ }
+
+ multipart_upload_id = outcome.GetResult().GetUploadId();
+ LOG_TRACE(limitedLog, "Multipart upload has created. {}", getShortLogDetails());
+}
+
+void WriteBufferFromS3::abortMultipartUpload()
+{
+ if (multipart_upload_id.empty())
+ {
+ LOG_WARNING(log, "Nothing to abort. {}", getVerboseLogDetails());
+ return;
+ }
+
+ LOG_WARNING(log, "Abort multipart upload. {}", getVerboseLogDetails());
+
+ S3::AbortMultipartUploadRequest req;
+ req.SetBucket(bucket);
+ req.SetKey(key);
+ req.SetUploadId(multipart_upload_id);
+
+ ProfileEvents::increment(ProfileEvents::S3AbortMultipartUpload);
+ if (write_settings.for_object_storage)
+ ProfileEvents::increment(ProfileEvents::DiskS3AbortMultipartUpload);
+
+ Stopwatch watch;
+ auto outcome = client_ptr->AbortMultipartUpload(req);
+ watch.stop();
+
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds());
+
+ if (!outcome.IsSuccess())
+ {
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1);
+ throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
+ }
+
+ LOG_WARNING(log, "Multipart upload has aborted successfully. {}", getVerboseLogDetails());
+}
+
+S3::UploadPartRequest WriteBufferFromS3::getUploadRequest(size_t part_number, PartData & data)
+{
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, data.data_size);
+
+ S3::UploadPartRequest req;
+
+ /// Setup request.
+ req.SetBucket(bucket);
+ req.SetKey(key);
+ req.SetPartNumber(static_cast<int>(part_number));
+ req.SetUploadId(multipart_upload_id);
+ req.SetContentLength(data.data_size);
+ req.SetBody(data.createAwsBuffer());
+ /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840
+ req.SetContentType("binary/octet-stream");
+
+ return req;
+}
+
+void WriteBufferFromS3::writePart(WriteBufferFromS3::PartData && data)
+{
+ if (data.data_size == 0)
+ {
+ LOG_TEST(limitedLog, "Skipping writing part as empty {}", getShortLogDetails());
+ return;
+ }
+
+ multipart_tags.push_back({});
+ size_t part_number = multipart_tags.size();
+ LOG_TEST(limitedLog, "writePart {}, part size {}, part number {}", getShortLogDetails(), data.data_size, part_number);
+
+ if (multipart_upload_id.empty())
+ throw Exception(
+ ErrorCodes::LOGICAL_ERROR,
+ "Unable to write a part without multipart_upload_id, details: WriteBufferFromS3 created for bucket {}, key {}",
+ bucket, key);
+
+ if (part_number > upload_settings.max_part_number)
+ {
+ throw Exception(
+ ErrorCodes::INVALID_CONFIG_PARAMETER,
+ "Part number exceeded {} while writing {} bytes to S3. Check min_upload_part_size = {}, max_upload_part_size = {}, "
+ "upload_part_size_multiply_factor = {}, upload_part_size_multiply_parts_count_threshold = {}, max_single_part_upload_size = {}",
+ upload_settings.max_part_number, count(), upload_settings.min_upload_part_size, upload_settings.max_upload_part_size,
+ upload_settings.upload_part_size_multiply_factor, upload_settings.upload_part_size_multiply_parts_count_threshold,
+ upload_settings.max_single_part_upload_size);
+ }
+
+ if (data.data_size > upload_settings.max_upload_part_size)
+ {
+ throw Exception(
+ ErrorCodes::LOGICAL_ERROR,
+ "Part size exceeded max_upload_part_size. {}, part number {}, part size {}, max_upload_part_size {}",
+ getShortLogDetails(),
+ part_number,
+ data.data_size,
+ upload_settings.max_upload_part_size
+ );
+ }
+
+ auto req = getUploadRequest(part_number, data);
+ auto worker_data = std::make_shared<std::tuple<S3::UploadPartRequest, WriteBufferFromS3::PartData>>(std::move(req), std::move(data));
+
+ auto upload_worker = [&, worker_data, part_number] ()
+ {
+ auto & data_size = std::get<1>(*worker_data).data_size;
+
+ LOG_TEST(limitedLog, "Write part started {}, part size {}, part number {}",
+ getShortLogDetails(), data_size, part_number);
+
+ ProfileEvents::increment(ProfileEvents::S3UploadPart);
+ if (write_settings.for_object_storage)
+ ProfileEvents::increment(ProfileEvents::DiskS3UploadPart);
+
+ auto & request = std::get<0>(*worker_data);
+
+ ResourceCost cost = request.GetContentLength();
+ ResourceGuard rlock(write_settings.resource_link, cost);
+ Stopwatch watch;
+ auto outcome = client_ptr->UploadPart(request);
+ watch.stop();
+ rlock.unlock(); // Avoid acquiring other locks under resource lock
+
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds());
+
+ if (!outcome.IsSuccess())
+ {
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1);
+ write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure
+ throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
+ }
+
+ multipart_tags[part_number-1] = outcome.GetResult().GetETag();
+
+ LOG_TEST(limitedLog, "Write part succeeded {}, part size {}, part number {}, etag {}",
+ getShortLogDetails(), data_size, part_number, multipart_tags[part_number-1]);
+ };
+
+ task_tracker->add(std::move(upload_worker));
+}
+
+void WriteBufferFromS3::completeMultipartUpload()
+{
+ LOG_TEST(limitedLog, "Completing multipart upload. {}, Parts: {}", getShortLogDetails(), multipart_tags.size());
+
+ if (multipart_tags.empty())
+ throw Exception(
+ ErrorCodes::LOGICAL_ERROR,
+ "Failed to complete multipart upload. No parts have uploaded");
+
+ for (size_t i = 0; i < multipart_tags.size(); ++i)
+ {
+ const auto tag = multipart_tags.at(i);
+ if (tag.empty())
+ throw Exception(
+ ErrorCodes::LOGICAL_ERROR,
+ "Failed to complete multipart upload. Part {} haven't been uploaded.", i);
+ }
+
+ S3::CompleteMultipartUploadRequest req;
+ req.SetBucket(bucket);
+ req.SetKey(key);
+ req.SetUploadId(multipart_upload_id);
+
+ Aws::S3::Model::CompletedMultipartUpload multipart_upload;
+ for (size_t i = 0; i < multipart_tags.size(); ++i)
+ {
+ Aws::S3::Model::CompletedPart part;
+ multipart_upload.AddParts(part.WithETag(multipart_tags[i]).WithPartNumber(static_cast<int>(i + 1)));
+ }
+
+ req.SetMultipartUpload(multipart_upload);
+
+ size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL);
+ for (size_t i = 0; i < max_retry; ++i)
+ {
+ ProfileEvents::increment(ProfileEvents::S3CompleteMultipartUpload);
+ if (write_settings.for_object_storage)
+ ProfileEvents::increment(ProfileEvents::DiskS3CompleteMultipartUpload);
+
+ Stopwatch watch;
+ auto outcome = client_with_long_timeout_ptr->CompleteMultipartUpload(req);
+ watch.stop();
+
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds());
+
+ if (outcome.IsSuccess())
+ {
+ LOG_TRACE(limitedLog, "Multipart upload has completed. {}, Parts: {}", getShortLogDetails(), multipart_tags.size());
+ return;
+ }
+
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1);
+
+ if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY)
+ {
+ /// For some unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests.
+ /// By the way, NO_SUCH_UPLOAD is an expected error and we shouldn't retry it.
+ LOG_INFO(log, "Multipart upload failed with NO_SUCH_KEY error, will retry. {}, Parts: {}", getVerboseLogDetails(), multipart_tags.size());
+ }
+ else
+ {
+ throw S3Exception(
+ outcome.GetError().GetErrorType(),
+ "Message: {}, Key: {}, Bucket: {}, Tags: {}",
+ outcome.GetError().GetMessage(), key, bucket, fmt::join(multipart_tags.begin(), multipart_tags.end(), " "));
+ }
+ }
+
+ throw S3Exception(
+ Aws::S3::S3Errors::NO_SUCH_KEY,
+ "Message: Multipart upload failed with NO_SUCH_KEY error, retries {}, Key: {}, Bucket: {}",
+ max_retry, key, bucket);
+}
+
+S3::PutObjectRequest WriteBufferFromS3::getPutRequest(PartData & data)
+{
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, data.data_size);
+
+ S3::PutObjectRequest req;
+
+ req.SetBucket(bucket);
+ req.SetKey(key);
+ req.SetContentLength(data.data_size);
+ req.SetBody(data.createAwsBuffer());
+ if (object_metadata.has_value())
+ req.SetMetadata(object_metadata.value());
+ if (!upload_settings.storage_class_name.empty())
+ req.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(upload_settings.storage_class_name));
+
+ /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840
+ req.SetContentType("binary/octet-stream");
+
+ client_ptr->setKMSHeaders(req);
+
+ return req;
+}
+
+void WriteBufferFromS3::makeSinglepartUpload(WriteBufferFromS3::PartData && data)
+{
+ LOG_TEST(limitedLog, "Making single part upload. {}, size {}", getShortLogDetails(), data.data_size);
+
+ auto req = getPutRequest(data);
+ auto worker_data = std::make_shared<std::tuple<S3::PutObjectRequest, WriteBufferFromS3::PartData>>(std::move(req), std::move(data));
+
+ auto upload_worker = [&, worker_data] ()
+ {
+ LOG_TEST(limitedLog, "writing single part upload started. {}", getShortLogDetails());
+
+ auto & request = std::get<0>(*worker_data);
+ size_t content_length = request.GetContentLength();
+
+ size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL);
+ for (size_t i = 0; i < max_retry; ++i)
+ {
+ ProfileEvents::increment(ProfileEvents::S3PutObject);
+ if (write_settings.for_object_storage)
+ ProfileEvents::increment(ProfileEvents::DiskS3PutObject);
+
+ ResourceCost cost = request.GetContentLength();
+ ResourceGuard rlock(write_settings.resource_link, cost);
+ Stopwatch watch;
+ auto outcome = client_ptr->PutObject(request);
+ watch.stop();
+ rlock.unlock();
+
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds());
+
+ if (outcome.IsSuccess())
+ {
+ LOG_TRACE(limitedLog, "Single part upload has completed. {}, size {}", getShortLogDetails(), content_length);
+ return;
+ }
+
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1);
+ write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure
+
+ if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY)
+ {
+ /// For some unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests.
+ LOG_INFO(log, "Single part upload failed with NO_SUCH_KEY error. {}, size {}, will retry", getShortLogDetails(), content_length);
+ }
+ else
+ {
+ LOG_ERROR(log, "S3Exception name {}, Message: {}, bucket {}, key {}, object size {}",
+ outcome.GetError().GetExceptionName(), outcome.GetError().GetMessage(), bucket, key, content_length);
+ throw S3Exception(
+ outcome.GetError().GetErrorType(),
+ "Message: {}, bucket {}, key {}, object size {}",
+ outcome.GetError().GetMessage(), bucket, key, content_length);
+ }
+ }
+
+ throw S3Exception(
+ Aws::S3::S3Errors::NO_SUCH_KEY,
+ "Message: Single part upload failed with NO_SUCH_KEY error, retries {}, Key: {}, Bucket: {}",
+ max_retry, key, bucket);
+ };
+
+ task_tracker->add(std::move(upload_worker));
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromS3.h b/contrib/clickhouse/src/IO/WriteBufferFromS3.h
new file mode 100644
index 0000000000..0fdf771e1f
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromS3.h
@@ -0,0 +1,140 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include <base/types.h>
+#include <Common/logger_useful.h>
+#include <IO/WriteBufferFromFileBase.h>
+#include <IO/WriteBuffer.h>
+#include <IO/WriteSettings.h>
+#include <Storages/StorageS3Settings.h>
+#include <Interpreters/threadPoolCallbackRunner.h>
+
+#include <memory>
+#include <vector>
+#include <list>
+
+namespace DB
+{
+/**
+ * Buffer to write data to an S3 object with the specified bucket and key.
+ * If the data size written to the buffer is less than 'max_single_part_upload_size', the write is performed using a single-part upload.
+ * Otherwise a multipart upload is used:
+ * the data is divided into chunks with a size greater than 'min_upload_part_size' (the last chunk may be smaller than this threshold),
+ * and each chunk is written as a part to S3.
+ */
+class WriteBufferFromS3 final : public WriteBufferFromFileBase
+{
+public:
+ WriteBufferFromS3(
+ std::shared_ptr<const S3::Client> client_ptr_,
+ /// for CompleteMultipartUploadRequest, because it blocks on recv() for a few seconds on big uploads
+ std::shared_ptr<const S3::Client> client_with_long_timeout_ptr_,
+ const String & bucket_,
+ const String & key_,
+ size_t buf_size_,
+ const S3Settings::RequestSettings & request_settings_,
+ std::optional<std::map<String, String>> object_metadata_ = std::nullopt,
+ ThreadPoolCallbackRunner<void> schedule_ = {},
+ const WriteSettings & write_settings_ = {});
+
+ ~WriteBufferFromS3() override;
+ void nextImpl() override;
+ void preFinalize() override;
+ std::string getFileName() const override { return key; }
+ void sync() override { next(); }
+
+ class IBufferAllocationPolicy
+ {
+ public:
+ virtual size_t getBufferNumber() const = 0;
+ virtual size_t getBufferSize() const = 0;
+ virtual void nextBuffer() = 0;
+ virtual ~IBufferAllocationPolicy() = 0;
+ };
+ using IBufferAllocationPolicyPtr = std::unique_ptr<IBufferAllocationPolicy>;
+
+ static IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_);
+
+private:
+ /// Receives response from the server after sending all data.
+ void finalizeImpl() override;
+
+ String getVerboseLogDetails() const;
+ String getShortLogDetails() const;
+
+ struct PartData
+ {
+ Memory<> memory;
+ size_t data_size = 0;
+
+ std::shared_ptr<std::iostream> createAwsBuffer();
+
+ bool isEmpty() const
+ {
+ return data_size == 0;
+ }
+ };
+
+ void hidePartialData();
+ void allocateFirstBuffer();
+ void reallocateFirstBuffer();
+ void detachBuffer();
+ void allocateBuffer();
+ void setFakeBufferWhenPreFinalized();
+
+ S3::UploadPartRequest getUploadRequest(size_t part_number, PartData & data);
+ void writePart(PartData && data);
+ void writeMultipartUpload();
+ void createMultipartUpload();
+ void completeMultipartUpload();
+ void abortMultipartUpload();
+ void tryToAbortMultipartUpload();
+
+ S3::PutObjectRequest getPutRequest(PartData & data);
+ void makeSinglepartUpload(PartData && data);
+
+ const String bucket;
+ const String key;
+ const S3Settings::RequestSettings request_settings;
+ const S3Settings::RequestSettings::PartUploadSettings & upload_settings;
+ const WriteSettings write_settings;
+ const std::shared_ptr<const S3::Client> client_ptr;
+ const std::shared_ptr<const S3::Client> client_with_long_timeout_ptr;
+ const std::optional<std::map<String, String>> object_metadata;
+ Poco::Logger * log = &Poco::Logger::get("WriteBufferFromS3");
+ LogSeriesLimiterPtr limitedLog = std::make_shared<LogSeriesLimiter>(log, 1, 5);
+
+ IBufferAllocationPolicyPtr buffer_allocation_policy;
+
+ /// An upload to S3 is made in parts.
+ /// We initiate the upload, then upload each part and get an ETag as a response, and finally complete the upload in finalizeImpl() by listing all our parts.
+ String multipart_upload_id;
+ std::deque<String> multipart_tags;
+ bool multipart_upload_finished = false;
+
+ /// Tracks that preFinalize() is called only once.
+ bool is_prefinalized = false;
+
+ /// The first fully filled buffer has to be delayed.
+ /// There are two possibilities afterwards:
+ /// either preFinalize/finalize is called, which leads to a single part upload,
+ /// or more data is written, which leads to a multipart upload.
+ char fake_buffer_when_prefinalized[1] = {};
+
+ /// offset() and count() are unstable inside nextImpl:
+ /// for example, nextImpl changes the position, hence offset() and count() change as well.
+ /// These variables store the size information for the moments when offset() and count() are unstable.
+ size_t total_size = 0;
+ size_t hidden_size = 0;
+
+ class TaskTracker;
+ std::unique_ptr<TaskTracker> task_tracker;
+};
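+
+/// A minimal usage sketch (illustrative; `client` and `request_settings` are assumed
+/// to be configured elsewhere, they are not defined in this file):
+///
+///     WriteBufferFromS3 out(client, client, "bucket", "path/to/key",
+///                           DBMS_DEFAULT_BUFFER_SIZE, request_settings);
+///     writeString("payload", out);
+///     out.finalize(); /// completes the single-part or multipart upload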
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp b/contrib/clickhouse/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp
new file mode 100644
index 0000000000..e64ea82c48
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp
@@ -0,0 +1,112 @@
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include <IO/WriteBufferFromS3.h>
+
+#include <memory>
+
+namespace
+{
+
+class FixedSizeBufferAllocationPolicy : public DB::WriteBufferFromS3::IBufferAllocationPolicy
+{
+ const size_t buffer_size = 0;
+ size_t buffer_number = 0;
+
+public:
+ explicit FixedSizeBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_)
+ : buffer_size(settings_.strict_upload_part_size)
+ {
+ chassert(buffer_size > 0);
+ }
+
+ size_t getBufferNumber() const override { return buffer_number; }
+
+ size_t getBufferSize() const override
+ {
+ chassert(buffer_number > 0);
+ return buffer_size;
+ }
+
+ void nextBuffer() override
+ {
+ ++buffer_number;
+ }
+};
+
+
+class ExpBufferAllocationPolicy : public DB::WriteBufferFromS3::IBufferAllocationPolicy
+{
+ const size_t first_size = 0;
+ const size_t second_size = 0;
+
+ const size_t multiply_factor = 0;
+ const size_t multiply_threshold = 0;
+ const size_t max_size = 0;
+
+ size_t current_size = 0;
+ size_t buffer_number = 0;
+
+public:
+ explicit ExpBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_)
+ : first_size(std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size))
+ , second_size(settings_.min_upload_part_size)
+ , multiply_factor(settings_.upload_part_size_multiply_factor)
+ , multiply_threshold(settings_.upload_part_size_multiply_parts_count_threshold)
+ , max_size(settings_.max_upload_part_size)
+ {
+ chassert(first_size > 0);
+ chassert(second_size > 0);
+ chassert(multiply_factor >= 1);
+ chassert(multiply_threshold > 0);
+ chassert(max_size > 0);
+ }
+
+ size_t getBufferNumber() const override { return buffer_number; }
+
+ size_t getBufferSize() const override
+ {
+ chassert(buffer_number > 0);
+ return current_size;
+ }
+
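+ /// Illustrative size progression, assuming min_upload_part_size = 16 MiB,
+ /// multiply_factor = 2, multiply_threshold = 500 and a large max_upload_part_size:
+ /// buffer 1 gets first_size (reserved for a possible single part upload),
+ /// buffers 2..500 get 16 MiB, buffers 501..1000 get 32 MiB, buffers 1001..1500 get 64 MiB, and so on.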
+ void nextBuffer() override
+ {
+ ++buffer_number;
+
+ if (1 == buffer_number)
+ {
+ current_size = first_size;
+ return;
+ }
+
+ if (2 == buffer_number)
+ current_size = second_size;
+
+ if (0 == ((buffer_number - 1) % multiply_threshold))
+ {
+ current_size *= multiply_factor;
+ current_size = std::min(current_size, max_size);
+ }
+ }
+};
+
+}
+
+namespace DB
+{
+
+WriteBufferFromS3::IBufferAllocationPolicy::~IBufferAllocationPolicy() = default;
+
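+/// The policy is chosen from the part upload settings: a non-zero strict_upload_part_size
+/// pins every buffer to exactly that size, while leaving it at 0 selects the exponentially
+/// growing policy driven by min_upload_part_size, upload_part_size_multiply_factor
+/// and upload_part_size_multiply_parts_count_threshold.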
+WriteBufferFromS3::IBufferAllocationPolicyPtr WriteBufferFromS3::ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_)
+{
+ if (settings_.strict_upload_part_size > 0)
+ return std::make_unique<FixedSizeBufferAllocationPolicy>(settings_);
+ else
+ return std::make_unique<ExpBufferAllocationPolicy>(settings_);
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromS3TaskTracker.cpp b/contrib/clickhouse/src/IO/WriteBufferFromS3TaskTracker.cpp
new file mode 100644
index 0000000000..ed63d0c530
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromS3TaskTracker.cpp
@@ -0,0 +1,176 @@
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include <IO/WriteBufferFromS3TaskTracker.h>
+
+namespace ProfileEvents
+{
+ extern const Event WriteBufferFromS3WaitInflightLimitMicroseconds;
+}
+
+namespace DB
+{
+
+WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner<void> scheduler_, size_t max_tasks_inflight_, LogSeriesLimiterPtr limitedLog_)
+ : is_async(bool(scheduler_))
+ , scheduler(scheduler_ ? std::move(scheduler_) : syncRunner())
+ , max_tasks_inflight(max_tasks_inflight_)
+ , limitedLog(limitedLog_)
+{}
+
+WriteBufferFromS3::TaskTracker::~TaskTracker()
+{
+ safeWaitAll();
+}
+
+ThreadPoolCallbackRunner<void> WriteBufferFromS3::TaskTracker::syncRunner()
+{
+ return [](Callback && callback, int64_t) mutable -> std::future<void>
+ {
+ auto package = std::packaged_task<void()>(std::move(callback));
+ /// No exceptions are propagated here; exceptions are packed into the future
+ package();
+ return package.get_future();
+ };
+}
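+
+/// Note on syncRunner(): an exception thrown by the callback is packed into the
+/// returned future and rethrown only when the future is inspected, e.g. (illustrative):
+///
+///     auto future = syncRunner()([] { throw std::runtime_error("boom"); }, 0);
+///     future.get(); /// the exception surfaces here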
+
+void WriteBufferFromS3::TaskTracker::waitAll()
+{
+ /// Exceptions are propagated
+ for (auto & future : futures)
+ {
+ future.get();
+ }
+ futures.clear();
+
+ std::lock_guard lock(mutex);
+ finished_futures.clear();
+}
+
+void WriteBufferFromS3::TaskTracker::safeWaitAll()
+{
+ for (auto & future : futures)
+ {
+ if (future.valid())
+ {
+ try
+ {
+ /// Exceptions are not propagated
+ future.get();
+ } catch (...)
+ {
+ /// But at least they are printed
+ tryLogCurrentException(__PRETTY_FUNCTION__);
+ }
+ }
+ }
+ futures.clear();
+
+ std::lock_guard lock(mutex);
+ finished_futures.clear();
+}
+
+void WriteBufferFromS3::TaskTracker::waitIfAny()
+{
+ if (futures.empty())
+ return;
+
+ Stopwatch watch;
+
+ {
+ std::lock_guard lock(mutex);
+ for (auto & it : finished_futures)
+ {
+ /// Actually this call might block the thread until the future is finally set;
+ /// however it won't block for long: the task is about to finish once its pointer appears in `finished_futures`.
+ it->get();
+
+ /// In case of an exception in `it->get()`
+ /// it is not necessary to remove `it` from the list `futures`:
+ /// `TaskTracker` has to be destroyed after any exception occurs, and for this `safeWaitAll` is called.
+ /// `safeWaitAll` handles invalid futures in the list `futures`.
+ futures.erase(it);
+ }
+ finished_futures.clear();
+ }
+
+ watch.stop();
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());
+}
+
+void WriteBufferFromS3::TaskTracker::add(Callback && func)
+{
+ /// All this fuss is about 2 things. This is the most critical place of TaskTracker.
+ /// The first is not to fail the insertion into the list `futures`.
+ /// To guard against that, the element is allocated at the end of the list `futures` in advance.
+ /// The second is not to fail the completion notification of the task.
+ /// To guard against that, the list element which will be inserted into the list `finished_futures`
+ /// is allocated in advance as another one-element list, `pre_allocated_finished`.
+
+ /// preallocation for the first issue
+ futures.emplace_back();
+ auto future_placeholder = std::prev(futures.end());
+
+ /// preallocation for the second issue
+ FinishedList pre_allocated_finished {future_placeholder};
+
+ Callback func_with_notification = [&, my_func = std::move(func), my_pre_allocated_finished = std::move(pre_allocated_finished)]() mutable
+ {
+ SCOPE_EXIT({
+ DENY_ALLOCATIONS_IN_SCOPE;
+
+ std::lock_guard lock(mutex);
+ finished_futures.splice(finished_futures.end(), my_pre_allocated_finished);
+ has_finished.notify_one();
+ });
+
+ my_func();
+ };
+
+ /// this move is nothrow
+ *future_placeholder = scheduler(std::move(func_with_notification), Priority{});
+
+ waitTilInflightShrink();
+}
+
+void WriteBufferFromS3::TaskTracker::waitTilInflightShrink()
+{
+ if (!max_tasks_inflight)
+ return;
+
+ if (futures.size() >= max_tasks_inflight)
+ LOG_TEST(limitedLog, "have to wait until some tasks finish, in queue {}, limit {}", futures.size(), max_tasks_inflight);
+
+ Stopwatch watch;
+
+ /// An alternative approach is to wait until at least futures.size() - max_tasks_inflight elements are finished.
+ /// However, the sooner a finished task is collected, the sooner CH checks whether there is an exception.
+ /// The sooner an exception is propagated, the less time is spent on cancellation.
+ while (futures.size() >= max_tasks_inflight)
+ {
+ std::unique_lock lock(mutex);
+
+ has_finished.wait(lock, [this] () TSA_REQUIRES(mutex) { return !finished_futures.empty(); });
+
+ for (auto & it : finished_futures)
+ {
+ it->get();
+ futures.erase(it);
+ }
+
+ finished_futures.clear();
+ }
+
+ watch.stop();
+ ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());
+}
+
+bool WriteBufferFromS3::TaskTracker::isAsync() const
+{
+ return is_async;
+}
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromS3TaskTracker.h b/contrib/clickhouse/src/IO/WriteBufferFromS3TaskTracker.h
new file mode 100644
index 0000000000..21daea22c0
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromS3TaskTracker.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include "clickhouse_config.h"
+
+#if USE_AWS_S3
+
+#include "WriteBufferFromS3.h"
+
+#include <Common/logger_useful.h>
+
+#include <list>
+
+namespace DB
+{
+
+/// This class is used only in WriteBufferFromS3 for now,
+/// therefore it is declared as a part of WriteBufferFromS3.
+/// TaskTracker takes a Callback which is run by the scheduler in some external shared ThreadPool.
+/// TaskTracker provides the methods waitIfAny and waitAll/safeWaitAll
+/// to help with coordination of the running tasks.
+
+/// Basic exception safety is provided. If an exception occurs, the object has to be destroyed.
+/// No thread safety is provided. Do not use this object concurrently.
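+///
+/// A minimal usage sketch (names are illustrative):
+///
+///     TaskTracker tracker(scheduler, /* max_tasks_inflight */ 4, limited_log);
+///     tracker.add([&] { uploadPart(); }); /// may block in waitTilInflightShrink() when the limit is reached
+///     tracker.waitAll(); /// rethrows the first task failure, if any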
+
+class WriteBufferFromS3::TaskTracker
+{
+public:
+ using Callback = std::function<void()>;
+
+ TaskTracker(ThreadPoolCallbackRunner<void> scheduler_, size_t max_tasks_inflight_, LogSeriesLimiterPtr limitedLog_);
+ ~TaskTracker();
+
+ static ThreadPoolCallbackRunner<void> syncRunner();
+
+ bool isAsync() const;
+
+ /// waitIfAny collects statuses from already finished tasks.
+ /// There may be no finished tasks yet, in which case waitIfAny does nothing useful.
+ /// The first exception is thrown if any task has failed.
+ void waitIfAny();
+
+ /// waitAll waits for all the tasks to finish and collects their statuses
+ void waitAll();
+
+ /// safeWaitAll does the same as waitAll but mutes the exceptions
+ void safeWaitAll();
+
+ void add(Callback && func);
+
+private:
+ /// waitTilInflightShrink waits until the number of in-flight tasks shrinks below the limit `max_tasks_inflight`.
+ void waitTilInflightShrink() TSA_NO_THREAD_SAFETY_ANALYSIS;
+
+ void collectFinishedFutures(bool propagate_exceptions) TSA_REQUIRES(mutex);
+
+ const bool is_async;
+ ThreadPoolCallbackRunner<void> scheduler;
+ const size_t max_tasks_inflight;
+
+ using FutureList = std::list<std::future<void>>;
+ FutureList futures;
+ LogSeriesLimiterPtr limitedLog;
+
+ std::mutex mutex;
+ std::condition_variable has_finished TSA_GUARDED_BY(mutex);
+ using FinishedList = std::list<FutureList::iterator>;
+ FinishedList finished_futures TSA_GUARDED_BY(mutex);
+};
+
+}
+
+#endif
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromString.h b/contrib/clickhouse/src/IO/WriteBufferFromString.h
new file mode 100644
index 0000000000..1f813b1070
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromString.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <string>
+#include <IO/WriteBufferFromVector.h>
+#include <base/StringRef.h>
+
+
+namespace DB
+{
+
+/** Writes the data to a string.
+ * Note: before using the resulting string, destroy this object.
+ */
+using WriteBufferFromString = WriteBufferFromVector<std::string>;
+
+
+namespace detail
+{
+ /// For correct order of initialization.
+ class StringHolder
+ {
+ protected:
+ std::string value;
+ };
+}
+
+/// Creates the string by itself and allows to get it.
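+/// A minimal usage sketch:
+///
+///     WriteBufferFromOwnString out;
+///     out.write("id = ", 5);
+///     writeIntText(42, out); /// writeIntText is declared in IO/WriteIntText.h
+///     std::string result = out.str(); /// finalizes the buffer and returns "id = 42"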
+class WriteBufferFromOwnString : public detail::StringHolder, public WriteBufferFromString
+{
+public:
+ WriteBufferFromOwnString() : WriteBufferFromString(value) {}
+
+ std::string_view stringView() const { return isFinished() ? std::string_view(value) : std::string_view(value.data(), pos - value.data()); }
+
+ std::string & str()
+ {
+ finalize();
+ return value;
+ }
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferFromVector.h b/contrib/clickhouse/src/IO/WriteBufferFromVector.h
new file mode 100644
index 0000000000..a2ecc34f1a
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferFromVector.h
@@ -0,0 +1,103 @@
+#pragma once
+
+#include <vector>
+
+#include <IO/WriteBuffer.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_WRITE_AFTER_END_OF_BUFFER;
+}
+
+struct AppendModeTag {};
+
+/** Writes data to an existing std::vector or similar type. When there is not enough space, it doubles the vector size.
+ *
+ * In the destructor, the vector is cut to the size of the written data.
+ * You can call 'finalize' to resize it earlier.
+ *
+ * The vector should live until this object is destroyed or until 'finalize()' is called.
+ */
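+/// A minimal usage sketch (illustrative only):
+///
+///     std::vector<char> out;
+///     WriteBufferFromVector<std::vector<char>> buf(out);
+///     buf.write("hello", 5);
+///     buf.finalize(); /// cuts `out` to exactly the 5 written bytes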
+template <typename VectorType>
+class WriteBufferFromVector : public WriteBuffer
+{
+public:
+ using ValueType = typename VectorType::value_type;
+ explicit WriteBufferFromVector(VectorType & vector_)
+ : WriteBuffer(reinterpret_cast<Position>(vector_.data()), vector_.size()), vector(vector_)
+ {
+ if (vector.empty())
+ {
+ vector.resize(initial_size);
+ set(reinterpret_cast<Position>(vector.data()), vector.size());
+ }
+ }
+
+ /// Append to vector instead of rewrite.
+ WriteBufferFromVector(VectorType & vector_, AppendModeTag)
+ : WriteBuffer(nullptr, 0), vector(vector_)
+ {
+ size_t old_size = vector.size();
+ size_t size = (old_size < initial_size) ? initial_size
+ : ((old_size < vector.capacity()) ? vector.capacity()
+ : vector.capacity() * size_multiplier);
+ vector.resize(size);
+ set(reinterpret_cast<Position>(vector.data() + old_size), (size - old_size) * sizeof(typename VectorType::value_type));
+ }
+
+ bool isFinished() const { return finalized; }
+
+ void restart(std::optional<size_t> max_capacity = std::nullopt)
+ {
+ if (max_capacity && vector.capacity() > max_capacity)
+ VectorType(initial_size, ValueType()).swap(vector);
+ else if (vector.empty())
+ vector.resize(initial_size);
+ set(reinterpret_cast<Position>(vector.data()), vector.size());
+ finalized = false;
+ }
+
+ ~WriteBufferFromVector() override
+ {
+ finalize();
+ }
+
+private:
+ void finalizeImpl() override
+ {
+ vector.resize(
+ ((position() - reinterpret_cast<Position>(vector.data())) /// NOLINT
+ + sizeof(ValueType) - 1) /// Align up.
+ / sizeof(ValueType));
+
+ /// Prevent further writes.
+ set(nullptr, 0);
+ }
+
+ void nextImpl() override
+ {
+ if (finalized)
+ throw Exception(ErrorCodes::CANNOT_WRITE_AFTER_END_OF_BUFFER, "WriteBufferFromVector is finalized");
+
+ size_t old_size = vector.size();
+ /// pos may not be equal to vector.data() + old_size, because WriteBuffer::next() can be used to flush data
+ size_t pos_offset = pos - reinterpret_cast<Position>(vector.data());
+ if (pos_offset == old_size)
+ {
+ vector.resize(old_size * size_multiplier);
+ }
+ internal_buffer = Buffer(reinterpret_cast<Position>(vector.data() + pos_offset), reinterpret_cast<Position>(vector.data() + vector.size()));
+ working_buffer = internal_buffer;
+ }
+
+ VectorType & vector;
+
+ static constexpr size_t initial_size = 32;
+ static constexpr size_t size_multiplier = 2;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferValidUTF8.cpp b/contrib/clickhouse/src/IO/WriteBufferValidUTF8.cpp
new file mode 100644
index 0000000000..d611befac3
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferValidUTF8.cpp
@@ -0,0 +1,157 @@
+#include <Poco/UTF8Encoding.h>
+#include <IO/WriteBufferValidUTF8.h>
+#include <base/types.h>
+#include <base/simd.h>
+
+#ifdef __SSE2__
+ #include <emmintrin.h>
+#endif
+
+#if defined(__aarch64__) && defined(__ARM_NEON)
+# include <arm_neon.h>
+# pragma clang diagnostic ignored "-Wreserved-identifier"
+#endif
+
+namespace DB
+{
+
+const size_t WriteBufferValidUTF8::DEFAULT_SIZE = 4096;
+
+/** Index into the table below with the first byte of a UTF-8 sequence to
+  * get the total length in bytes of that sequence (including the first byte).
+  * Note that *legal* UTF-8 sequences are at most 4 bytes long. The 5- and 6-byte
+  * entries are left as-is for anyone who may want to handle such conversions,
+  * which were allowed in earlier algorithms.
+  */
+extern const UInt8 length_of_utf8_sequence[256] =
+{
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
+};
+
+
+WriteBufferValidUTF8::WriteBufferValidUTF8(
+ WriteBuffer & output_buffer_, bool group_replacements_, const char * replacement_, size_t size)
+ : BufferWithOwnMemory<WriteBuffer>(std::max(static_cast<size_t>(32), size)), output_buffer(output_buffer_),
+ group_replacements(group_replacements_), replacement(replacement_)
+{
+}
+
+
+inline void WriteBufferValidUTF8::putReplacement()
+{
+ if (replacement.empty() || (group_replacements && just_put_replacement))
+ return;
+
+ just_put_replacement = true;
+ output_buffer.write(replacement.data(), replacement.size());
+}
+
+
+inline void WriteBufferValidUTF8::putValid(char *data, size_t len)
+{
+ if (len == 0)
+ return;
+
+ just_put_replacement = false;
+ output_buffer.write(data, len);
+}
+
+
+void WriteBufferValidUTF8::nextImpl()
+{
+ char * p = memory.data();
+ char * valid_start = p;
+
+ while (p < pos)
+ {
+#ifdef __SSE2__
+ /// Fast skip of ASCII for x86.
+ static constexpr size_t SIMD_BYTES = 16;
+ const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
+
+ while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))))
+ p += SIMD_BYTES;
+
+ if (!(p < pos))
+ break;
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+ /// Fast skip of ASCII for aarch64.
+ static constexpr size_t SIMD_BYTES = 16;
+ const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
+ /// Other options include
+ /// vmaxvq_u8(input) < 0b10000000;
+ /// Used by SIMDJSON, has latency 3 for M1, 6 for everything else
+ /// SIMDJSON uses it for 64 byte masks, so it's a little different.
+ /// vmaxvq_u32(vandq_u32(input, vdupq_n_u32(0x80808080))) // u32 version has latency 3
+ /// shrn version has universally <=3 cycles, on servers 2 cycles.
+ while (p < simd_end && getNibbleMask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
+ p += SIMD_BYTES;
+
+ if (!(p < pos))
+ break;
+#endif
+
+ UInt8 len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];
+
+ if (len > 4)
+ { // NOLINT
+ /// Invalid start of sequence. Skip one byte.
+ putValid(valid_start, p - valid_start);
+ putReplacement();
+ ++p;
+ valid_start = p;
+ }
+ else if (p + len > pos)
+ {
+ /// Sequence was not fully written to this buffer.
+ break;
+ }
+ else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char *>(p), len))
+ {
+ /// Valid sequence.
+ p += len;
+ }
+ else
+ {
+ /// Invalid sequence. Skip just first byte.
+ putValid(valid_start, p - valid_start);
+ putReplacement();
+ ++p;
+ valid_start = p;
+ }
+ }
+
+ putValid(valid_start, p - valid_start);
+
+ size_t cnt = pos - p;
+
+ /// Shift unfinished sequence to start of buffer.
+ for (size_t i = 0; i < cnt; ++i)
+ memory[i] = p[i];
+
+ working_buffer = Buffer(&memory[cnt], memory.data() + memory.size());
+}
+
+WriteBufferValidUTF8::~WriteBufferValidUTF8()
+{
+ finalize();
+}
+
+void WriteBufferValidUTF8::finalizeImpl()
+{
+ /// Write all complete sequences from buffer.
+ nextImpl();
+
+ /// If unfinished sequence at end, then write replacement.
+ if (working_buffer.begin() != memory.data())
+ putReplacement();
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteBufferValidUTF8.h b/contrib/clickhouse/src/IO/WriteBufferValidUTF8.h
new file mode 100644
index 0000000000..daaf0427f8
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteBufferValidUTF8.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <IO/WriteBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+
+
+namespace DB
+{
+
+/** Writes the data to another buffer, replacing the invalid UTF-8 sequences with the specified sequence.
+ * If the input is already valid UTF-8, it works faster.
+ * Note: before using the resulting string, destroy this object.
+ */
+class WriteBufferValidUTF8 final : public BufferWithOwnMemory<WriteBuffer>
+{
+public:
+ static const size_t DEFAULT_SIZE;
+
+ explicit WriteBufferValidUTF8(
+ WriteBuffer & output_buffer_,
+ bool group_replacements_ = true,
+ const char * replacement_ = "\xEF\xBF\xBD",
+ size_t size = DEFAULT_SIZE);
+
+ ~WriteBufferValidUTF8() override;
+
+private:
+ void putReplacement();
+ void putValid(char * data, size_t len);
+
+ void nextImpl() override;
+ void finalizeImpl() override;
+
+ WriteBuffer & output_buffer;
+ bool group_replacements;
+ /// The last recorded character was `replacement`.
+ bool just_put_replacement = false;
+ std::string replacement;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteHelpers.cpp b/contrib/clickhouse/src/IO/WriteHelpers.cpp
new file mode 100644
index 0000000000..34eabe55d7
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteHelpers.cpp
@@ -0,0 +1,125 @@
+#include <IO/WriteHelpers.h>
+#include <cinttypes>
+#include <utility>
+#include <Common/formatIPv6.h>
+#include <base/hex.h>
+
+
+namespace DB
+{
+
+template <typename IteratorSrc, typename IteratorDst>
+void formatHex(IteratorSrc src, IteratorDst dst, size_t num_bytes)
+{
+ size_t src_pos = 0;
+ size_t dst_pos = 0;
+ for (; src_pos < num_bytes; ++src_pos)
+ {
+ writeHexByteLowercase(src[src_pos], &dst[dst_pos]);
+ dst_pos += 2;
+ }
+}
+
+std::array<char, 36> formatUUID(const UUID & uuid)
+{
+ std::array<char, 36> dst;
+ auto * dst_ptr = dst.data();
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ const auto * src_ptr = reinterpret_cast<const UInt8 *>(&uuid);
+ const std::reverse_iterator src(src_ptr + 16);
+#else
+ const auto * src = reinterpret_cast<const UInt8 *>(&uuid);
+#endif
+ formatHex(src + 8, dst_ptr, 4);
+ dst[8] = '-';
+ formatHex(src + 12, dst_ptr + 9, 2);
+ dst[13] = '-';
+ formatHex(src + 14, dst_ptr + 14, 2);
+ dst[18] = '-';
+ formatHex(src, dst_ptr + 19, 2);
+ dst[23] = '-';
+ formatHex(src + 2, dst_ptr + 24, 6);
+
+ return dst;
+}
+
+void writeIPv4Text(const IPv4 & ip, WriteBuffer & buf)
+{
+ size_t idx = (ip >> 24);
+ buf.write(one_byte_to_string_lookup_table[idx].first, one_byte_to_string_lookup_table[idx].second);
+ buf.write('.');
+ idx = (ip >> 16) & 0xFF;
+ buf.write(one_byte_to_string_lookup_table[idx].first, one_byte_to_string_lookup_table[idx].second);
+ buf.write('.');
+ idx = (ip >> 8) & 0xFF;
+ buf.write(one_byte_to_string_lookup_table[idx].first, one_byte_to_string_lookup_table[idx].second);
+ buf.write('.');
+ idx = ip & 0xFF;
+ buf.write(one_byte_to_string_lookup_table[idx].first, one_byte_to_string_lookup_table[idx].second);
+}
+
+void writeIPv6Text(const IPv6 & ip, WriteBuffer & buf)
+{
+ char addr[IPV6_MAX_TEXT_LENGTH + 1] {};
+ char * paddr = addr;
+
+ formatIPv6(reinterpret_cast<const unsigned char *>(&ip), paddr);
+ buf.write(addr, paddr - addr - 1);
+}
+
+void writeException(const Exception & e, WriteBuffer & buf, bool with_stack_trace)
+{
+ writeBinaryLittleEndian(e.code(), buf);
+ writeBinary(String(e.name()), buf);
+ writeBinary(e.displayText() + getExtraExceptionInfo(e), buf);
+
+ if (with_stack_trace)
+ writeBinary(e.getStackTraceString(), buf);
+ else
+ writeBinary(String(), buf);
+
+ bool has_nested = false;
+ writeBinary(has_nested, buf);
+}
+
+
+/// The same, but quotes are applied only if the string contains characters that make it invalid as an unquoted identifier
+template <typename F>
+static inline void writeProbablyQuotedStringImpl(StringRef s, WriteBuffer & buf, F && write_quoted_string)
+{
+ if (isValidIdentifier(s.toView())
+ /// These are valid identifiers, but they are problematic if present unquoted in a SQL query.
+ && !(s.size == strlen("distinct") && 0 == strncasecmp(s.data, "distinct", strlen("distinct")))
+ && !(s.size == strlen("all") && 0 == strncasecmp(s.data, "all", strlen("all"))))
+ {
+ writeString(s, buf);
+ }
+ else
+ write_quoted_string(s, buf);
+}
+
+void writeProbablyBackQuotedString(StringRef s, WriteBuffer & buf)
+{
+ writeProbablyQuotedStringImpl(s, buf, [](StringRef s_, WriteBuffer & buf_) { return writeBackQuotedString(s_, buf_); });
+}
+
+void writeProbablyDoubleQuotedString(StringRef s, WriteBuffer & buf)
+{
+ writeProbablyQuotedStringImpl(s, buf, [](StringRef s_, WriteBuffer & buf_) { return writeDoubleQuotedString(s_, buf_); });
+}
+
+void writeProbablyBackQuotedStringMySQL(StringRef s, WriteBuffer & buf)
+{
+ writeProbablyQuotedStringImpl(s, buf, [](StringRef s_, WriteBuffer & buf_) { return writeBackQuotedStringMySQL(s_, buf_); });
+}
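+
+/// For example, writeProbablyBackQuotedString leaves hello_world as-is, but produces
+/// `my column` for a name with a space and `distinct` for the special-cased keywords,
+/// even though the latter are valid identifiers.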
+
+void writePointerHex(const void * ptr, WriteBuffer & buf)
+{
+ writeString("0x", buf);
+ char hex_str[2 * sizeof(ptr)];
+ writeHexUIntLowercase(reinterpret_cast<uintptr_t>(ptr), hex_str);
+ buf.write(hex_str, 2 * sizeof(ptr));
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteHelpers.h b/contrib/clickhouse/src/IO/WriteHelpers.h
new file mode 100644
index 0000000000..57337e7bb9
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteHelpers.h
@@ -0,0 +1,1274 @@
+#pragma once
+
+#include <cstring>
+#include <cstdio>
+#include <limits>
+#include <algorithm>
+#include <iterator>
+#include <concepts>
+#include <bit>
+
+#include <pcg-random/pcg_random.hpp>
+
+#include <Common/StackTrace.h>
+#include <Common/formatIPv6.h>
+#include <Common/DateLUT.h>
+#include <Common/LocalDate.h>
+#include <Common/LocalDateTime.h>
+#include <Common/TransformEndianness.hpp>
+#include <base/find_symbols.h>
+#include <base/StringRef.h>
+#include <base/DecomposedFloat.h>
+#include <base/EnumReflection.h>
+
+#include <Core/DecimalFunctions.h>
+#include <Core/Types.h>
+#include <Core/UUID.h>
+#include <base/IPv4andIPv6.h>
+
+#include <Common/Exception.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <Common/NaNUtils.h>
+
+#include <IO/CompressionMethod.h>
+#include <IO/WriteBuffer.h>
+#include <IO/WriteIntText.h>
+#include <IO/VarInt.h>
+#include <IO/DoubleConverter.h>
+#include <IO/WriteBufferFromString.h>
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wsign-compare"
+#endif
+#include <dragonbox/dragonbox_to_chars.h>
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#include <Formats/FormatSettings.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER;
+}
+
+
+/// Helper functions for formatted and binary output.
+
+inline void writeChar(char x, WriteBuffer & buf)
+{
+ buf.nextIfAtEnd();
+ *buf.position() = x;
+ ++buf.position();
+}
+
+/// Write the same character n times.
+inline void writeChar(char c, size_t n, WriteBuffer & buf)
+{
+ while (n)
+ {
+ buf.nextIfAtEnd();
+ size_t count = std::min(n, buf.available());
+ memset(buf.position(), c, count);
+ n -= count;
+ buf.position() += count;
+ }
+}
+
+/// Write POD-type in native format. It's recommended to use only with packed (dense) data types.
+template <typename T>
+inline void writePODBinary(const T & x, WriteBuffer & buf)
+{
+ buf.write(reinterpret_cast<const char *>(&x), sizeof(x)); /// NOLINT
+}
+
+inline void writeUUIDBinary(const UUID & x, WriteBuffer & buf)
+{
+ const auto & uuid = x.toUnderType();
+ writePODBinary(uuid.items[0], buf);
+ writePODBinary(uuid.items[1], buf);
+}
+
+template <typename T>
+inline void writeIntBinary(const T & x, WriteBuffer & buf)
+{
+ writePODBinary(x, buf);
+}
+
+template <typename T>
+inline void writeFloatBinary(const T & x, WriteBuffer & buf)
+{
+ writePODBinary(x, buf);
+}
+
+
+inline void writeStringBinary(const std::string & s, WriteBuffer & buf)
+{
+ writeVarUInt(s.size(), buf);
+ buf.write(s.data(), s.size());
+}
+
+/// For historical reasons we store IPv6 as a String
+inline void writeIPv6Binary(const IPv6 & ip, WriteBuffer & buf)
+{
+ writeVarUInt(IPV6_BINARY_LENGTH, buf);
+ buf.write(reinterpret_cast<const char *>(&ip.toUnderType()), IPV6_BINARY_LENGTH);
+}
+
+inline void writeStringBinary(StringRef s, WriteBuffer & buf)
+{
+ writeVarUInt(s.size, buf);
+ buf.write(s.data, s.size);
+}
+
+inline void writeStringBinary(const char * s, WriteBuffer & buf)
+{
+ writeStringBinary(StringRef{s}, buf);
+}
+
+inline void writeStringBinary(std::string_view s, WriteBuffer & buf)
+{
+ writeStringBinary(StringRef{s}, buf);
+}
+
+
+template <typename T>
+void writeVectorBinary(const std::vector<T> & v, WriteBuffer & buf)
+{
+ writeVarUInt(v.size(), buf);
+
+ for (typename std::vector<T>::const_iterator it = v.begin(); it != v.end(); ++it)
+ writeBinary(*it, buf);
+}
+
+
+inline void writeBoolText(bool x, WriteBuffer & buf)
+{
+ writeChar(x ? '1' : '0', buf);
+}
+
+
+template <typename T>
+inline size_t writeFloatTextFastPath(T x, char * buffer)
+{
+ Int64 result = 0;
+
+ if constexpr (std::is_same_v<T, double>)
+ {
+ /// The library Ryu has low performance on integers.
+ /// This workaround improves performance 6..10 times.
+
+ if (DecomposedFloat64(x).isIntegerInRepresentableRange())
+ result = itoa(Int64(x), buffer) - buffer;
+ else
+ result = jkj::dragonbox::to_chars_n(x, buffer) - buffer;
+ }
+ else
+ {
+ if (DecomposedFloat32(x).isIntegerInRepresentableRange())
+ result = itoa(Int32(x), buffer) - buffer;
+ else
+ result = jkj::dragonbox::to_chars_n(x, buffer) - buffer;
+ }
+
+ if (result <= 0)
+ throw Exception(ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER, "Cannot print floating point number");
+ return result;
+}
+
+template <typename T>
+inline void writeFloatText(T x, WriteBuffer & buf)
+{
+ static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for writeFloatText must be float or double");
+
+ using Converter = DoubleConverter<false>;
+ if (likely(buf.available() >= Converter::MAX_REPRESENTATION_LENGTH))
+ {
+ buf.position() += writeFloatTextFastPath(x, buf.position());
+ return;
+ }
+
+ Converter::BufferType buffer;
+ size_t result = writeFloatTextFastPath(x, buffer);
+ buf.write(buffer, result);
+}
+
+
+inline void writeString(const char * data, size_t size, WriteBuffer & buf)
+{
+ buf.write(data, size);
+}
+
+// Otherwise the StringRef and string_view overloads are ambiguous when passing a string literal. Prefer std::string_view.
+void writeString(std::same_as<StringRef> auto ref, WriteBuffer & buf)
+{
+ writeString(ref.data, ref.size, buf);
+}
+
+inline void writeString(std::string_view ref, WriteBuffer & buf)
+{
+ writeString(ref.data(), ref.size(), buf);
+}
+
+/** Writes a C-string without creating a temporary object. If the string is a literal, then `strlen` is executed at the compilation stage.
+ * Use when the string is a literal.
+ */
+#define writeCString(s, buf) \
+ (buf).write((s), strlen(s))
+
+/** Writes a string for use in the JSON format:
+ * - the string is written in double quotes
+ * - slash character '/' is escaped for compatibility with JavaScript
+ * - bytes from the range 0x00-0x1F except '\b', '\f', '\n', '\r', '\t' are escaped as \u00XX
+ * - code points U+2028 and U+2029 (byte sequences in UTF-8: e2 80 a8, e2 80 a9) are escaped as \u2028 and \u2029
+ * - it is assumed that string is in UTF-8, the invalid UTF-8 is not processed
+ * - all other non-ASCII characters remain as is
+ */
+inline void writeJSONString(const char * begin, const char * end, WriteBuffer & buf, const FormatSettings & settings)
+{
+ writeChar('"', buf);
+ for (const char * it = begin; it != end; ++it)
+ {
+ switch (*it)
+ {
+ case '\b':
+ writeChar('\\', buf);
+ writeChar('b', buf);
+ break;
+ case '\f':
+ writeChar('\\', buf);
+ writeChar('f', buf);
+ break;
+ case '\n':
+ writeChar('\\', buf);
+ writeChar('n', buf);
+ break;
+ case '\r':
+ writeChar('\\', buf);
+ writeChar('r', buf);
+ break;
+ case '\t':
+ writeChar('\\', buf);
+ writeChar('t', buf);
+ break;
+ case '\\':
+ writeChar('\\', buf);
+ writeChar('\\', buf);
+ break;
+ case '/':
+ if (settings.json.escape_forward_slashes)
+ writeChar('\\', buf);
+ writeChar('/', buf);
+ break;
+ case '"':
+ writeChar('\\', buf);
+ writeChar('"', buf);
+ break;
+ default:
+ UInt8 c = *it;
+ if (c <= 0x1F)
+ {
+ /// Escaping of ASCII control characters.
+
+ UInt8 higher_half = c >> 4;
+ UInt8 lower_half = c & 0xF;
+
+ writeCString("\\u00", buf);
+ writeChar('0' + higher_half, buf);
+
+ if (lower_half <= 9)
+ writeChar('0' + lower_half, buf);
+ else
+ writeChar('A' + lower_half - 10, buf);
+ }
+ else if (end - it >= 3 && it[0] == '\xE2' && it[1] == '\x80' && (it[2] == '\xA8' || it[2] == '\xA9'))
+ {
+ /// This is for compatibility with JavaScript, because unescaped line separators are prohibited in string literals,
+ /// and these code points are alternative line separators.
+
+ if (it[2] == '\xA8')
+ writeCString("\\u2028", buf);
+ if (it[2] == '\xA9')
+ writeCString("\\u2029", buf);
+
+ /// Byte sequence is 3 bytes long. We have additional two bytes to skip.
+ it += 2;
+ }
+ else
+ writeChar(*it, buf);
+ }
+ }
+ writeChar('"', buf);
+}
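+
+/// For example, with settings.json.escape_forward_slashes enabled, the input a/b followed
+/// by a newline is written as "a\/b\n", and a 0x01 control byte becomes \u0001.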
+
+
+/** Escapes quote_character and a list of special characters ('\b', '\f', '\n', '\r', '\t', '\0', '\\').
+ * - when escape_quote_with_quote is true, the backslash is used to escape the special characters
+ * and quote_character is used to escape itself, e.g.: 'hello''world';
+ * otherwise the backslash is used to escape both the special characters and quote_character
+ * - when escape_backslash_with_backslash is true, a backslash is escaped with another backslash
+ */
+template <char quote_character, bool escape_quote_with_quote = false, bool escape_backslash_with_backslash = true>
+void writeAnyEscapedString(const char * begin, const char * end, WriteBuffer & buf)
+{
+ const char * pos = begin;
+ while (true)
+ {
+ /// On purpose we will escape more characters than minimally necessary.
+ const char * next_pos = find_first_symbols<'\b', '\f', '\n', '\r', '\t', '\0', '\\', quote_character>(pos, end);
+
+ if (next_pos == end)
+ {
+ buf.write(pos, next_pos - pos);
+ break;
+ }
+ else
+ {
+ buf.write(pos, next_pos - pos);
+ pos = next_pos;
+ switch (*pos)
+ {
+ case quote_character:
+ {
+ if constexpr (escape_quote_with_quote)
+ writeChar(quote_character, buf);
+ else
+ writeChar('\\', buf);
+ writeChar(quote_character, buf);
+ break;
+ }
+ case '\b':
+ writeChar('\\', buf);
+ writeChar('b', buf);
+ break;
+ case '\f':
+ writeChar('\\', buf);
+ writeChar('f', buf);
+ break;
+ case '\n':
+ writeChar('\\', buf);
+ writeChar('n', buf);
+ break;
+ case '\r':
+ writeChar('\\', buf);
+ writeChar('r', buf);
+ break;
+ case '\t':
+ writeChar('\\', buf);
+ writeChar('t', buf);
+ break;
+ case '\0':
+ writeChar('\\', buf);
+ writeChar('0', buf);
+ break;
+ case '\\':
+ if constexpr (escape_backslash_with_backslash)
+ writeChar('\\', buf);
+ writeChar('\\', buf);
+ break;
+ default:
+ writeChar(*pos, buf);
+ }
+ ++pos;
+ }
+ }
+}
+
+
+inline void writeJSONString(std::string_view s, WriteBuffer & buf, const FormatSettings & settings)
+{
+ writeJSONString(s.data(), s.data() + s.size(), buf, settings);
+}
+
+template <typename T>
+void writeJSONNumber(T x, WriteBuffer & ostr, const FormatSettings & settings)
+{
+ bool is_finite = isFinite(x);
+
+ const bool need_quote = (is_integer<T> && (sizeof(T) >= 8) && settings.json.quote_64bit_integers)
+ || (settings.json.quote_denormals && !is_finite) || (is_floating_point<T> && (sizeof(T) >= 8) && settings.json.quote_64bit_floats);
+
+ if (need_quote)
+ writeChar('"', ostr);
+
+ if (is_finite)
+ writeText(x, ostr);
+ else if (!settings.json.quote_denormals)
+ writeCString("null", ostr);
+ else
+ {
+ if constexpr (std::is_floating_point_v<T>)
+ {
+ if (std::signbit(x))
+ {
+ if (isNaN(x))
+ writeCString("-nan", ostr);
+ else
+ writeCString("-inf", ostr);
+ }
+ else
+ {
+ if (isNaN(x))
+ writeCString("nan", ostr);
+ else
+ writeCString("inf", ostr);
+ }
+ }
+ }
+
+ if (need_quote)
+ writeChar('"', ostr);
+}
+
+
+template <char c>
+void writeAnyEscapedString(std::string_view s, WriteBuffer & buf)
+{
+ writeAnyEscapedString<c>(s.data(), s.data() + s.size(), buf);
+}
+
+
+inline void writeEscapedString(const char * str, size_t size, WriteBuffer & buf)
+{
+ writeAnyEscapedString<'\''>(str, str + size, buf);
+}
+
+inline void writeEscapedString(std::string_view ref, WriteBuffer & buf)
+{
+ writeEscapedString(ref.data(), ref.size(), buf);
+}
+
+template <char quote_character>
+void writeAnyQuotedString(const char * begin, const char * end, WriteBuffer & buf)
+{
+ writeChar(quote_character, buf);
+ writeAnyEscapedString<quote_character>(begin, end, buf);
+ writeChar(quote_character, buf);
+}
+
+
+template <char quote_character>
+void writeAnyQuotedString(std::string_view ref, WriteBuffer & buf)
+{
+ writeAnyQuotedString<quote_character>(ref.data(), ref.data() + ref.size(), buf);
+}
+
+
+inline void writeQuotedString(const String & s, WriteBuffer & buf)
+{
+ writeAnyQuotedString<'\''>(s, buf);
+}
+
+inline void writeQuotedString(StringRef ref, WriteBuffer & buf)
+{
+ writeAnyQuotedString<'\''>(ref.toView(), buf);
+}
+
+inline void writeQuotedString(std::string_view ref, WriteBuffer & buf)
+{
+ writeAnyQuotedString<'\''>(ref.data(), ref.data() + ref.size(), buf);
+}
+
+inline void writeQuotedStringPostgreSQL(std::string_view ref, WriteBuffer & buf)
+{
+ writeChar('\'', buf);
+ writeAnyEscapedString<'\'', true, false>(ref.data(), ref.data() + ref.size(), buf);
+ writeChar('\'', buf);
+}
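+
+/// For example, writeQuotedStringPostgreSQL("it's") produces 'it''s':
+/// the quote is doubled while backslashes are written as-is.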
+
+inline void writeDoubleQuotedString(const String & s, WriteBuffer & buf)
+{
+ writeAnyQuotedString<'"'>(s, buf);
+}
+
+inline void writeDoubleQuotedString(StringRef s, WriteBuffer & buf)
+{
+ writeAnyQuotedString<'"'>(s.toView(), buf);
+}
+
+inline void writeDoubleQuotedString(std::string_view s, WriteBuffer & buf)
+{
+ writeAnyQuotedString<'"'>(s.data(), s.data() + s.size(), buf);
+}
+
+/// Outputs a string in backquotes.
+inline void writeBackQuotedString(StringRef s, WriteBuffer & buf)
+{
+ writeAnyQuotedString<'`'>(s.toView(), buf);
+}
+
+/// Outputs a string in backquotes for MySQL.
+inline void writeBackQuotedStringMySQL(StringRef s, WriteBuffer & buf)
+{
+ writeChar('`', buf);
+ writeAnyEscapedString<'`', true>(s.data, s.data + s.size, buf);
+ writeChar('`', buf);
+}
+
+
+/// Write quoted if the string doesn't look like an identifier.
+void writeProbablyBackQuotedString(StringRef s, WriteBuffer & buf);
+void writeProbablyDoubleQuotedString(StringRef s, WriteBuffer & buf);
+void writeProbablyBackQuotedStringMySQL(StringRef s, WriteBuffer & buf);
+
+
+/** Outputs the string for the CSV format.
+ * Rules:
+ * - the string is output in quotation marks;
+ * - a quotation mark inside the string is output as two quotation marks in sequence.
+ */
+template <char quote = '"'>
+void writeCSVString(const char * begin, const char * end, WriteBuffer & buf)
+{
+ writeChar(quote, buf);
+
+ const char * pos = begin;
+ while (true)
+ {
+ const char * next_pos = find_first_symbols<quote>(pos, end);
+
+ if (next_pos == end)
+ {
+ buf.write(pos, end - pos);
+ break;
+ }
+ else /// Quotation.
+ {
+ ++next_pos;
+ buf.write(pos, next_pos - pos);
+ writeChar(quote, buf);
+ }
+
+ pos = next_pos;
+ }
+
+ writeChar(quote, buf);
+}
+
+template <char quote = '"'>
+void writeCSVString(const String & s, WriteBuffer & buf)
+{
+ writeCSVString<quote>(s.data(), s.data() + s.size(), buf);
+}
+
+template <char quote = '"'>
+void writeCSVString(StringRef s, WriteBuffer & buf)
+{
+ writeCSVString<quote>(s.data, s.data + s.size, buf);
+}
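+
+/// For example, writeCSVString on the input a"b produces "a""b":
+/// the value is wrapped in quotes and the embedded quote is doubled.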
+
+inline void writeXMLStringForTextElementOrAttributeValue(const char * begin, const char * end, WriteBuffer & buf)
+{
+ const char * pos = begin;
+ while (true)
+ {
+ const char * next_pos = find_first_symbols<'<', '&', '>', '"', '\''>(pos, end);
+
+ if (next_pos == end)
+ {
+ buf.write(pos, end - pos);
+ break;
+ }
+ else if (*next_pos == '<')
+ {
+ buf.write(pos, next_pos - pos);
+ ++next_pos;
+ writeCString("&lt;", buf);
+ }
+ else if (*next_pos == '&')
+ {
+ buf.write(pos, next_pos - pos);
+ ++next_pos;
+ writeCString("&amp;", buf);
+ }
+ else if (*next_pos == '>')
+ {
+ buf.write(pos, next_pos - pos);
+ ++next_pos;
+ writeCString("&gt;", buf);
+ }
+ else if (*next_pos == '"')
+ {
+ buf.write(pos, next_pos - pos);
+ ++next_pos;
+ writeCString("&quot;", buf);
+ }
+ else if (*next_pos == '\'')
+ {
+ buf.write(pos, next_pos - pos);
+ ++next_pos;
+ writeCString("&apos;", buf);
+ }
+
+ pos = next_pos;
+ }
+}
+
+inline void writeXMLStringForTextElementOrAttributeValue(std::string_view s, WriteBuffer & buf)
+{
+ writeXMLStringForTextElementOrAttributeValue(s.data(), s.data() + s.size(), buf);
+}
+
+/// Writing a string to a text node in XML (not into an attribute - otherwise you need more escaping).
+inline void writeXMLStringForTextElement(const char * begin, const char * end, WriteBuffer & buf)
+{
+ const char * pos = begin;
+ while (true)
+ {
+ /// NOTE Perhaps for some XML parsers, you need to escape the zero byte and some control characters.
+ const char * next_pos = find_first_symbols<'<', '&'>(pos, end);
+
+ if (next_pos == end)
+ {
+ buf.write(pos, end - pos);
+ break;
+ }
+ else if (*next_pos == '<')
+ {
+ buf.write(pos, next_pos - pos);
+ ++next_pos;
+ writeCString("&lt;", buf);
+ }
+ else if (*next_pos == '&')
+ {
+ buf.write(pos, next_pos - pos);
+ ++next_pos;
+ writeCString("&amp;", buf);
+ }
+
+ pos = next_pos;
+ }
+}
+
+inline void writeXMLStringForTextElement(std::string_view s, WriteBuffer & buf)
+{
+ writeXMLStringForTextElement(s.data(), s.data() + s.size(), buf);
+}
+
+/// @brief Serialize `uuid` into an array of characters in big-endian byte order.
+/// @param uuid UUID to serialize.
+/// @return Array of characters in big-endian byte order.
+std::array<char, 36> formatUUID(const UUID & uuid);
+
+inline void writeUUIDText(const UUID & uuid, WriteBuffer & buf)
+{
+ const auto serialized_uuid = formatUUID(uuid);
+ buf.write(serialized_uuid.data(), serialized_uuid.size());
+}
+
+void writeIPv4Text(const IPv4 & ip, WriteBuffer & buf);
+void writeIPv6Text(const IPv6 & ip, WriteBuffer & buf);
+
+template <typename DecimalType>
+inline void writeDateTime64FractionalText(typename DecimalType::NativeType fractional, UInt32 scale, WriteBuffer & buf)
+{
+ static constexpr UInt32 MaxScale = DecimalUtils::max_precision<DecimalType>;
+
+ char data[20] = {'0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'};
+ static_assert(sizeof(data) >= MaxScale);
+
+ for (Int32 pos = scale - 1; pos >= 0 && fractional; --pos, fractional /= DateTime64(10))
+ data[pos] += fractional % DateTime64(10);
+
+ writeString(&data[0], static_cast<size_t>(scale), buf);
+}
+
+static const char digits100[201] =
+ "00010203040506070809"
+ "10111213141516171819"
+ "20212223242526272829"
+ "30313233343536373839"
+ "40414243444546474849"
+ "50515253545556575859"
+ "60616263646566676869"
+ "70717273747576777879"
+ "80818283848586878889"
+ "90919293949596979899";
+
+/// in YYYY-MM-DD format
+template <char delimiter = '-'>
+inline void writeDateText(const LocalDate & date, WriteBuffer & buf)
+{
+ if (reinterpret_cast<intptr_t>(buf.position()) + 10 <= reinterpret_cast<intptr_t>(buf.buffer().end()))
+ {
+ memcpy(buf.position(), &digits100[date.year() / 100 * 2], 2);
+ buf.position() += 2;
+ memcpy(buf.position(), &digits100[date.year() % 100 * 2], 2);
+ buf.position() += 2;
+ *buf.position() = delimiter;
+ ++buf.position();
+ memcpy(buf.position(), &digits100[date.month() * 2], 2);
+ buf.position() += 2;
+ *buf.position() = delimiter;
+ ++buf.position();
+ memcpy(buf.position(), &digits100[date.day() * 2], 2);
+ buf.position() += 2;
+ }
+ else
+ {
+ buf.write(&digits100[date.year() / 100 * 2], 2);
+ buf.write(&digits100[date.year() % 100 * 2], 2);
+ buf.write(delimiter);
+ buf.write(&digits100[date.month() * 2], 2);
+ buf.write(delimiter);
+ buf.write(&digits100[date.day() * 2], 2);
+ }
+}
+
+template <char delimiter = '-'>
+inline void writeDateText(DayNum date, WriteBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+{
+ writeDateText<delimiter>(LocalDate(date, time_zone), buf);
+}
+
+template <char delimiter = '-'>
+inline void writeDateText(ExtendedDayNum date, WriteBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+{
+ writeDateText<delimiter>(LocalDate(date, time_zone), buf);
+}
+
+/// In the format YYYY-MM-DD HH:MM:SS
+template <char date_delimeter = '-', char time_delimeter = ':', char between_date_time_delimiter = ' '>
+inline void writeDateTimeText(const LocalDateTime & datetime, WriteBuffer & buf)
+{
+ if (reinterpret_cast<intptr_t>(buf.position()) + 19 <= reinterpret_cast<intptr_t>(buf.buffer().end()))
+ {
+ memcpy(buf.position(), &digits100[datetime.year() / 100 * 2], 2);
+ buf.position() += 2;
+ memcpy(buf.position(), &digits100[datetime.year() % 100 * 2], 2);
+ buf.position() += 2;
+ *buf.position() = date_delimeter;
+ ++buf.position();
+ memcpy(buf.position(), &digits100[datetime.month() * 2], 2);
+ buf.position() += 2;
+ *buf.position() = date_delimeter;
+ ++buf.position();
+ memcpy(buf.position(), &digits100[datetime.day() * 2], 2);
+ buf.position() += 2;
+ *buf.position() = between_date_time_delimiter;
+ ++buf.position();
+ memcpy(buf.position(), &digits100[datetime.hour() * 2], 2);
+ buf.position() += 2;
+ *buf.position() = time_delimeter;
+ ++buf.position();
+ memcpy(buf.position(), &digits100[datetime.minute() * 2], 2);
+ buf.position() += 2;
+ *buf.position() = time_delimeter;
+ ++buf.position();
+ memcpy(buf.position(), &digits100[datetime.second() * 2], 2);
+ buf.position() += 2;
+ }
+ else
+ {
+ buf.write(&digits100[datetime.year() / 100 * 2], 2);
+ buf.write(&digits100[datetime.year() % 100 * 2], 2);
+ buf.write(date_delimeter);
+ buf.write(&digits100[datetime.month() * 2], 2);
+ buf.write(date_delimeter);
+ buf.write(&digits100[datetime.day() * 2], 2);
+ buf.write(between_date_time_delimiter);
+ buf.write(&digits100[datetime.hour() * 2], 2);
+ buf.write(time_delimeter);
+ buf.write(&digits100[datetime.minute() * 2], 2);
+ buf.write(time_delimeter);
+ buf.write(&digits100[datetime.second() * 2], 2);
+ }
+}
+
+/// In the format YYYY-MM-DD HH:MM:SS, according to the specified time zone.
+template <char date_delimeter = '-', char time_delimeter = ':', char between_date_time_delimiter = ' '>
+inline void writeDateTimeText(time_t datetime, WriteBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+{
+ writeDateTimeText<date_delimeter, time_delimeter, between_date_time_delimiter>(LocalDateTime(datetime, time_zone), buf);
+}
+
+/// In the format YYYY-MM-DD HH:MM:SS.NNNNNNNNN, according to the specified time zone.
+template <char date_delimeter = '-', char time_delimeter = ':', char between_date_time_delimiter = ' ', char fractional_time_delimiter = '.'>
+inline void writeDateTimeText(DateTime64 datetime64, UInt32 scale, WriteBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+{
+ static constexpr UInt32 MaxScale = DecimalUtils::max_precision<DateTime64>;
+ scale = scale > MaxScale ? MaxScale : scale;
+
+ auto components = DecimalUtils::split(datetime64, scale);
+ /// Case1:
+ /// -127914467.877
+ /// => whole = -127914467, fraction = 877(After DecimalUtils::split)
+ /// => new whole = -127914468(1965-12-12 12:12:12), new fraction = 1000 - 877 = 123(.123)
+ /// => 1965-12-12 12:12:12.123
+ ///
+ /// Case2:
+ /// -0.877
+ /// => whole = 0, fractional = -877(After DecimalUtils::split)
+ /// => whole = -1(1969-12-31 23:59:59), fractional = 1000 + (-877) = 123(.123)
+ using T = typename DateTime64::NativeType;
+ if (datetime64.value < 0 && components.fractional)
+ {
+ components.fractional = DecimalUtils::scaleMultiplier<T>(scale) + (components.whole ? T(-1) : T(1)) * components.fractional;
+ --components.whole;
+ }
+
+ writeDateTimeText<date_delimeter, time_delimeter, between_date_time_delimiter>(LocalDateTime(components.whole, time_zone), buf);
+
+ if (scale > 0)
+ {
+ buf.write(fractional_time_delimiter);
+ writeDateTime64FractionalText<DateTime64>(components.fractional, scale, buf);
+ }
+}
+
+/// In the RFC 1123 format: "Tue, 03 Dec 2019 00:11:50 GMT". You must provide GMT DateLUT.
+/// This is needed for HTTP requests.
+inline void writeDateTimeTextRFC1123(time_t datetime, WriteBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+{
+ const auto & values = time_zone.getValues(datetime);
+
+ static const char week_days[3 * 8 + 1] = "XXX" "Mon" "Tue" "Wed" "Thu" "Fri" "Sat" "Sun";
+ static const char months[3 * 13 + 1] = "XXX" "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec";
+
+ buf.write(&week_days[values.day_of_week * 3], 3);
+ buf.write(", ", 2);
+ buf.write(&digits100[values.day_of_month * 2], 2);
+ buf.write(' ');
+ buf.write(&months[values.month * 3], 3);
+ buf.write(' ');
+ buf.write(&digits100[values.year / 100 * 2], 2);
+ buf.write(&digits100[values.year % 100 * 2], 2);
+ buf.write(' ');
+ buf.write(&digits100[time_zone.toHour(datetime) * 2], 2);
+ buf.write(':');
+ buf.write(&digits100[time_zone.toMinute(datetime) * 2], 2);
+ buf.write(':');
+ buf.write(&digits100[time_zone.toSecond(datetime) * 2], 2);
+ buf.write(" GMT", 4);
+}
+
+inline void writeDateTimeTextISO(time_t datetime, WriteBuffer & buf, const DateLUTImpl & utc_time_zone)
+{
+ writeDateTimeText<'-', ':', 'T'>(datetime, buf, utc_time_zone);
+ buf.write('Z');
+}
+
+inline void writeDateTimeTextISO(DateTime64 datetime64, UInt32 scale, WriteBuffer & buf, const DateLUTImpl & utc_time_zone)
+{
+ writeDateTimeText<'-', ':', 'T'>(datetime64, scale, buf, utc_time_zone);
+ buf.write('Z');
+}
+
+inline void writeDateTimeUnixTimestamp(DateTime64 datetime64, UInt32 scale, WriteBuffer & buf)
+{
+ static constexpr UInt32 MaxScale = DecimalUtils::max_precision<DateTime64>;
+ scale = scale > MaxScale ? MaxScale : scale;
+
+ auto components = DecimalUtils::split(datetime64, scale);
+ writeIntText(components.whole, buf);
+
+ if (scale > 0)
+ {
+ buf.write('.');
+ writeDateTime64FractionalText<DateTime64>(components.fractional, scale, buf);
+ }
+}
+
+/// Methods for output in binary format.
+template <typename T>
+requires is_arithmetic_v<T>
+inline void writeBinary(const T & x, WriteBuffer & buf) { writePODBinary(x, buf); }
+
+inline void writeBinary(const String & x, WriteBuffer & buf) { writeStringBinary(x, buf); }
+inline void writeBinary(StringRef x, WriteBuffer & buf) { writeStringBinary(x, buf); }
+inline void writeBinary(std::string_view x, WriteBuffer & buf) { writeStringBinary(x, buf); }
+inline void writeBinary(const Decimal32 & x, WriteBuffer & buf) { writePODBinary(x, buf); }
+inline void writeBinary(const Decimal64 & x, WriteBuffer & buf) { writePODBinary(x, buf); }
+inline void writeBinary(const Decimal128 & x, WriteBuffer & buf) { writePODBinary(x, buf); }
+inline void writeBinary(const Decimal256 & x, WriteBuffer & buf) { writePODBinary(x.value, buf); }
+inline void writeBinary(const LocalDate & x, WriteBuffer & buf) { writePODBinary(x, buf); }
+inline void writeBinary(const LocalDateTime & x, WriteBuffer & buf) { writePODBinary(x, buf); }
+inline void writeBinary(const IPv4 & x, WriteBuffer & buf) { writePODBinary(x, buf); }
+inline void writeBinary(const IPv6 & x, WriteBuffer & buf) { writePODBinary(x, buf); }
+
+inline void writeBinary(const UUID & x, WriteBuffer & buf)
+{
+ writeUUIDBinary(x, buf);
+}
+
+inline void writeBinary(const CityHash_v1_0_2::uint128 & x, WriteBuffer & buf)
+{
+ writePODBinary(x.low64, buf);
+ writePODBinary(x.high64, buf);
+}
+
+inline void writeBinary(const StackTrace::FramePointers & x, WriteBuffer & buf) { writePODBinary(x, buf); }
+
+/// Methods for outputting the value in text form for a tab-separated format.
+
+inline void writeText(is_integer auto x, WriteBuffer & buf)
+{
+ if constexpr (std::is_same_v<decltype(x), bool>)
+ writeBoolText(x, buf);
+ else if constexpr (std::is_same_v<decltype(x), char>)
+ writeChar(x, buf);
+ else
+ writeIntText(x, buf);
+}
+
+inline void writeText(is_floating_point auto x, WriteBuffer & buf) { writeFloatText(x, buf); }
+
+inline void writeText(is_enum auto x, WriteBuffer & buf) { writeText(magic_enum::enum_name(x), buf); }
+
+inline void writeText(std::string_view x, WriteBuffer & buf) { writeString(x.data(), x.size(), buf); }
+
+inline void writeText(const DayNum & x, WriteBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance()) { writeDateText(LocalDate(x, time_zone), buf); }
+inline void writeText(const LocalDate & x, WriteBuffer & buf) { writeDateText(x, buf); }
+inline void writeText(const LocalDateTime & x, WriteBuffer & buf) { writeDateTimeText(x, buf); }
+inline void writeText(const UUID & x, WriteBuffer & buf) { writeUUIDText(x, buf); }
+inline void writeText(const IPv4 & x, WriteBuffer & buf) { writeIPv4Text(x, buf); }
+inline void writeText(const IPv6 & x, WriteBuffer & buf) { writeIPv6Text(x, buf); }
+
+template <typename T>
+void writeDecimalFractional(const T & x, UInt32 scale, WriteBuffer & ostr, bool trailing_zeros,
+ bool fixed_fractional_length, UInt32 fractional_length)
+{
+ /// If it's a big integer, but the number of digits is small,
+ /// use the implementation for smaller integers for more efficient arithmetic.
+ if constexpr (std::is_same_v<T, Int256>)
+ {
+ if (x <= std::numeric_limits<UInt32>::max())
+ {
+ writeDecimalFractional(static_cast<UInt32>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
+ return;
+ }
+ else if (x <= std::numeric_limits<UInt64>::max())
+ {
+ writeDecimalFractional(static_cast<UInt64>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
+ return;
+ }
+ else if (x <= std::numeric_limits<UInt128>::max())
+ {
+ writeDecimalFractional(static_cast<UInt128>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
+ return;
+ }
+ }
+ else if constexpr (std::is_same_v<T, Int128>)
+ {
+ if (x <= std::numeric_limits<UInt32>::max())
+ {
+ writeDecimalFractional(static_cast<UInt32>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
+ return;
+ }
+ else if (x <= std::numeric_limits<UInt64>::max())
+ {
+ writeDecimalFractional(static_cast<UInt64>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
+ return;
+ }
+ }
+
+ constexpr size_t max_digits = std::numeric_limits<UInt256>::digits10;
+ assert(scale <= max_digits);
+ assert(fractional_length <= max_digits);
+
+ char buf[max_digits];
+ memset(buf, '0', std::max(scale, fractional_length));
+
+ T value = x;
+ Int32 last_nonzero_pos = 0;
+
+ if (fixed_fractional_length && fractional_length < scale)
+ {
+ T new_value = value / DecimalUtils::scaleMultiplier<Int256>(scale - fractional_length - 1);
+ auto round_carry = new_value % 10;
+ value = new_value / 10;
+ if (round_carry >= 5)
+ value += 1;
+ }
+
+ for (Int32 pos = fixed_fractional_length ? std::min(scale - 1, fractional_length - 1) : scale - 1; pos >= 0; --pos)
+ {
+ auto remainder = value % 10;
+ value /= 10;
+
+ if (remainder != 0 && last_nonzero_pos == 0)
+ last_nonzero_pos = pos;
+
+ buf[pos] += static_cast<char>(remainder);
+ }
+
+ writeChar('.', ostr);
+ ostr.write(buf, fixed_fractional_length ? fractional_length : (trailing_zeros ? scale : last_nonzero_pos + 1));
+}
+
+template <typename T>
+void writeText(Decimal<T> x, UInt32 scale, WriteBuffer & ostr, bool trailing_zeros,
+ bool fixed_fractional_length = false, UInt32 fractional_length = 0)
+{
+ T part = DecimalUtils::getWholePart(x, scale);
+
+ if (x.value < 0 && part == 0)
+ {
+ writeChar('-', ostr); /// avoid dropping the leading minus when the whole part is zero
+ }
+
+ writeIntText(part, ostr);
+
+ if (scale || (fixed_fractional_length && fractional_length > 0))
+ {
+ part = DecimalUtils::getFractionalPart(x, scale);
+ if (part || trailing_zeros)
+ {
+ if (part < 0)
+ part *= T(-1);
+
+ writeDecimalFractional(part, scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
+ }
+ }
+}
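+
+/// For example, a Decimal64 with raw value 12345 and scale 2 is printed as 123.45;
+/// raw value -5 with scale 2 as -0.05; raw value 12300 with scale 2 prints as 123.00
+/// when trailing_zeros is true and as 123 when it is false.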
+
+/// String, date, datetime are in single quotes with C-style escaping. Numbers - without.
+template <typename T>
+requires is_arithmetic_v<T>
+inline void writeQuoted(const T & x, WriteBuffer & buf) { writeText(x, buf); }
+
+inline void writeQuoted(const String & x, WriteBuffer & buf) { writeQuotedString(x, buf); }
+
+inline void writeQuoted(std::string_view x, WriteBuffer & buf) { writeQuotedString(x, buf); }
+
+inline void writeQuoted(StringRef x, WriteBuffer & buf) { writeQuotedString(x, buf); }
+
+inline void writeQuoted(const LocalDate & x, WriteBuffer & buf)
+{
+ writeChar('\'', buf);
+ writeDateText(x, buf);
+ writeChar('\'', buf);
+}
+
+inline void writeQuoted(const LocalDateTime & x, WriteBuffer & buf)
+{
+ writeChar('\'', buf);
+ writeDateTimeText(x, buf);
+ writeChar('\'', buf);
+}
+
+inline void writeQuoted(const UUID & x, WriteBuffer & buf)
+{
+ writeChar('\'', buf);
+ writeText(x, buf);
+ writeChar('\'', buf);
+}
+
+inline void writeQuoted(const IPv4 & x, WriteBuffer & buf)
+{
+ writeChar('\'', buf);
+ writeText(x, buf);
+ writeChar('\'', buf);
+}
+
+inline void writeQuoted(const IPv6 & x, WriteBuffer & buf)
+{
+ writeChar('\'', buf);
+ writeText(x, buf);
+ writeChar('\'', buf);
+}
+
+/// String, date, datetime are in double quotes with C-style escaping. Numbers - without.
+template <typename T>
+requires is_arithmetic_v<T>
+inline void writeDoubleQuoted(const T & x, WriteBuffer & buf) { writeText(x, buf); }
+
+inline void writeDoubleQuoted(const String & x, WriteBuffer & buf) { writeDoubleQuotedString(x, buf); }
+
+inline void writeDoubleQuoted(std::string_view x, WriteBuffer & buf) { writeDoubleQuotedString(x, buf); }
+
+inline void writeDoubleQuoted(StringRef x, WriteBuffer & buf) { writeDoubleQuotedString(x, buf); }
+
+inline void writeDoubleQuoted(const LocalDate & x, WriteBuffer & buf)
+{
+ writeChar('"', buf);
+ writeDateText(x, buf);
+ writeChar('"', buf);
+}
+
+inline void writeDoubleQuoted(const LocalDateTime & x, WriteBuffer & buf)
+{
+ writeChar('"', buf);
+ writeDateTimeText(x, buf);
+ writeChar('"', buf);
+}
+
+inline void writeDoubleQuoted(const UUID & x, WriteBuffer & buf)
+{
+ writeChar('"', buf);
+ writeText(x, buf);
+ writeChar('"', buf);
+}
+
+inline void writeDoubleQuoted(const IPv4 & x, WriteBuffer & buf)
+{
+ writeChar('"', buf);
+ writeText(x, buf);
+ writeChar('"', buf);
+}
+
+inline void writeDoubleQuoted(const IPv6 & x, WriteBuffer & buf)
+{
+ writeChar('"', buf);
+ writeText(x, buf);
+ writeChar('"', buf);
+}
+
+/// String - in double quotes and with CSV-escaping; date, datetime - in double quotes. Numbers - without.
+template <typename T>
+requires is_arithmetic_v<T>
+inline void writeCSV(const T & x, WriteBuffer & buf) { writeText(x, buf); }
+
+inline void writeCSV(const String & x, WriteBuffer & buf) { writeCSVString<>(x, buf); }
+inline void writeCSV(const LocalDate & x, WriteBuffer & buf) { writeDoubleQuoted(x, buf); }
+inline void writeCSV(const LocalDateTime & x, WriteBuffer & buf) { writeDoubleQuoted(x, buf); }
+inline void writeCSV(const UUID & x, WriteBuffer & buf) { writeDoubleQuoted(x, buf); }
+inline void writeCSV(const IPv4 & x, WriteBuffer & buf) { writeDoubleQuoted(x, buf); }
+inline void writeCSV(const IPv6 & x, WriteBuffer & buf) { writeDoubleQuoted(x, buf); }
+
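+/// A quick sketch of the three quoting conventions (assuming
+/// WriteBufferFromOwnString from <IO/WriteBufferFromString.h> is available):
+///
+///     WriteBufferFromOwnString buf;
+///     writeQuoted(String("it's"), buf);        /// appends 'it\'s'  (single quotes, C-style escaping)
+///     writeDoubleQuoted(String("a\"b"), buf);  /// appends "a\"b"   (double quotes, C-style escaping)
+///     writeCSV(String("a\"b"), buf);           /// appends "a""b"   (CSV doubles the quote character)
+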
+template <typename T>
+void writeBinary(const std::vector<T> & x, WriteBuffer & buf)
+{
+ size_t size = x.size();
+ writeVarUInt(size, buf);
+ for (size_t i = 0; i < size; ++i)
+ writeBinary(x[i], buf);
+}
+
+template <typename T>
+void writeQuoted(const std::vector<T> & x, WriteBuffer & buf)
+{
+ writeChar('[', buf);
+ for (size_t i = 0, size = x.size(); i < size; ++i)
+ {
+ if (i != 0)
+ writeChar(',', buf);
+ writeQuoted(x[i], buf);
+ }
+ writeChar(']', buf);
+}
+
+template <typename T>
+void writeDoubleQuoted(const std::vector<T> & x, WriteBuffer & buf)
+{
+ writeChar('[', buf);
+ for (size_t i = 0, size = x.size(); i < size; ++i)
+ {
+ if (i != 0)
+ writeChar(',', buf);
+ writeDoubleQuoted(x[i], buf);
+ }
+ writeChar(']', buf);
+}
+
+template <typename T>
+void writeText(const std::vector<T> & x, WriteBuffer & buf)
+{
+ writeQuoted(x, buf);
+}
+
+
+/// Serialize exception (so that it can be transferred over the network)
+void writeException(const Exception & e, WriteBuffer & buf, bool with_stack_trace);
+
+
+/// An easy-to-use method for converting something to a string in text form.
+template <typename T>
+inline String toString(const T & x)
+{
+ WriteBufferFromOwnString buf;
+ writeText(x, buf);
+ return buf.str();
+}
+
+inline String toString(const CityHash_v1_0_2::uint128 & hash)
+{
+ WriteBufferFromOwnString buf;
+ writeText(hash.low64, buf);
+ writeChar('_', buf);
+ writeText(hash.high64, buf);
+ return buf.str();
+}
+
+template <typename T>
+inline String toStringWithFinalSeparator(const std::vector<T> & x, const String & final_sep)
+{
+ WriteBufferFromOwnString buf;
+ for (auto it = x.begin(); it != x.end(); ++it)
+ {
+ if (it != x.begin())
+ {
+ if (std::next(it) == x.end())
+ writeString(final_sep, buf);
+ else
+ writeString(", ", buf);
+ }
+ writeQuoted(*it, buf);
+ }
+
+ return buf.str();
+}
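+
+/// For example (sketch): toStringWithFinalSeparator(std::vector<String>{"a", "b", "c"}, " and ")
+/// returns "'a', 'b' and 'c'", since each element is written with writeQuoted().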
+
+inline void writeNullTerminatedString(const String & s, WriteBuffer & buffer)
+{
+    /// c_str() is guaranteed to return a zero-terminated string.
+ buffer.write(s.c_str(), s.size() + 1);
+}
+
+template <std::endian endian, typename T>
+inline void writeBinaryEndian(T x, WriteBuffer & buf)
+{
+ transformEndianness<endian>(x);
+ writeBinary(x, buf);
+}
+
+template <typename T>
+inline void writeBinaryLittleEndian(T x, WriteBuffer & buf)
+{
+ writeBinaryEndian<std::endian::little>(x, buf);
+}
+
+template <typename T>
+inline void writeBinaryBigEndian(T x, WriteBuffer & buf)
+{
+ writeBinaryEndian<std::endian::big>(x, buf);
+}
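+
+/// Byte-order sketch: writeBinaryLittleEndian(UInt32{0x01020304}, buf) appends the
+/// bytes 04 03 02 01, while writeBinaryBigEndian appends 01 02 03 04, regardless
+/// of the host byte order (transformEndianness byte-swaps only when needed).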
+
+
+struct PcgSerializer
+{
+ static void serializePcg32(const pcg32_fast & rng, WriteBuffer & buf)
+ {
+ writeText(rng.multiplier(), buf);
+ writeChar(' ', buf);
+ writeText(rng.increment(), buf);
+ writeChar(' ', buf);
+ writeText(rng.state_, buf);
+ }
+};
+
+void writePointerHex(const void * ptr, WriteBuffer & buf);
+
+}
+
+template<>
+struct fmt::formatter<DB::UUID>
+{
+ template<typename ParseContext>
+ constexpr auto parse(ParseContext & context)
+ {
+ return context.begin();
+ }
+
+ template<typename FormatContext>
+ auto format(const DB::UUID & uuid, FormatContext & context)
+ {
+ return fmt::format_to(context.out(), "{}", toString(uuid));
+ }
+};
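+
+/// This lets UUIDs be used directly as fmt arguments, e.g. (sketch):
+/// fmt::format("query id: {}", uuid) yields the same text as toString(uuid).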
diff --git a/contrib/clickhouse/src/IO/WriteIntText.h b/contrib/clickhouse/src/IO/WriteIntText.h
new file mode 100644
index 0000000000..c9a4cb0241
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteIntText.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <Core/Defines.h>
+#include <IO/WriteBuffer.h>
+#include <base/itoa.h>
+
+
+template <typename T> constexpr size_t max_int_width = 20;
+template <> inline constexpr size_t max_int_width<UInt8> = 3; /// 255
+template <> inline constexpr size_t max_int_width<Int8> = 4; /// -128
+template <> inline constexpr size_t max_int_width<UInt16> = 5; /// 65535
+template <> inline constexpr size_t max_int_width<Int16> = 6; /// -32768
+template <> inline constexpr size_t max_int_width<UInt32> = 10; /// 4294967295
+template <> inline constexpr size_t max_int_width<Int32> = 11; /// -2147483648
+template <> inline constexpr size_t max_int_width<UInt64> = 20; /// 18446744073709551615
+template <> inline constexpr size_t max_int_width<Int64> = 20; /// -9223372036854775808
+template <> inline constexpr size_t max_int_width<UInt128> = 39; /// 340282366920938463463374607431768211455
+template <> inline constexpr size_t max_int_width<Int128> = 40; /// -170141183460469231731687303715884105728
+template <> inline constexpr size_t max_int_width<UInt256> = 78; /// 115792089237316195423570985008687907853269984665640564039457584007913129639935
+template <> inline constexpr size_t max_int_width<Int256> = 78; /// -57896044618658097711785492504343953926634992332820282019728792003956564819968
+
+
+namespace DB
+{
+
+namespace detail
+{
+ template <typename T>
+ void NO_INLINE writeUIntTextFallback(T x, WriteBuffer & buf)
+ {
+ char tmp[max_int_width<T>];
+ char * end = itoa(x, tmp);
+ buf.write(tmp, end - tmp);
+ }
+}
+
+template <typename T>
+void writeIntText(T x, WriteBuffer & buf)
+{
+ if (likely(reinterpret_cast<uintptr_t>(buf.position()) + max_int_width<T> < reinterpret_cast<uintptr_t>(buf.buffer().end())))
+ buf.position() = itoa(x, buf.position());
+ else
+ detail::writeUIntTextFallback(x, buf);
+}
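+
+/// Usage sketch (assuming WriteBufferFromOwnString from <IO/WriteBufferFromString.h>):
+///
+///     WriteBufferFromOwnString out;
+///     writeIntText(Int64{-42}, out);  /// fast path: at least max_int_width<Int64> bytes are free
+///     /// out.str() == "-42"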
+
+}
diff --git a/contrib/clickhouse/src/IO/WriteSettings.h b/contrib/clickhouse/src/IO/WriteSettings.h
new file mode 100644
index 0000000000..8f22e44145
--- /dev/null
+++ b/contrib/clickhouse/src/IO/WriteSettings.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <Common/Throttler_fwd.h>
+#include <IO/ResourceLink.h>
+
+namespace DB
+{
+
+/// Settings to be passed to IDisk::writeFile()
+struct WriteSettings
+{
+ /// Bandwidth throttler to use during writing
+ ThrottlerPtr remote_throttler;
+ ThrottlerPtr local_throttler;
+
+    /// Resource to be used during writing
+    ResourceLink resource_link;
+
+ /// Filesystem cache settings
+ bool enable_filesystem_cache_on_write_operations = false;
+ bool enable_filesystem_cache_log = false;
+ bool throw_on_error_from_cache = false;
+
+ bool s3_allow_parallel_part_upload = true;
+
+    /// Monitoring
+    bool for_object_storage = false; /// Chooses which profile events should be incremented.
+
+ bool operator==(const WriteSettings & other) const = default;
+};
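+
+/// Minimal usage sketch (the writeFile call shape here is an assumption for illustration):
+///
+///     WriteSettings settings;
+///     settings.enable_filesystem_cache_on_write_operations = true;
+///     auto out = disk->writeFile("data.bin", DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, settings);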
+
+}
diff --git a/contrib/clickhouse/src/IO/ZlibDeflatingWriteBuffer.cpp b/contrib/clickhouse/src/IO/ZlibDeflatingWriteBuffer.cpp
new file mode 100644
index 0000000000..5455adcb7c
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ZlibDeflatingWriteBuffer.cpp
@@ -0,0 +1,131 @@
+#include <IO/ZlibDeflatingWriteBuffer.h>
+#include <Common/Exception.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ZLIB_DEFLATE_FAILED;
+}
+
+
+ZlibDeflatingWriteBuffer::ZlibDeflatingWriteBuffer(
+ std::unique_ptr<WriteBuffer> out_,
+ CompressionMethod compression_method,
+ int compression_level,
+ size_t buf_size,
+ char * existing_memory,
+ size_t alignment)
+ : WriteBufferWithOwnMemoryDecorator(std::move(out_), buf_size, existing_memory, alignment)
+{
+ zstr.zalloc = nullptr;
+ zstr.zfree = nullptr;
+ zstr.opaque = nullptr;
+ zstr.next_in = nullptr;
+ zstr.avail_in = 0;
+ zstr.next_out = nullptr;
+ zstr.avail_out = 0;
+
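+    /// windowBits of 15 selects a zlib wrapper with the maximum 32 KiB window;
+    /// per the zlib manual, adding 16 makes deflateInit2 emit a gzip header and
+    /// trailer instead.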
+ int window_bits = 15;
+ if (compression_method == CompressionMethod::Gzip)
+ {
+ window_bits += 16;
+ }
+
+ int rc = deflateInit2(&zstr, compression_level, Z_DEFLATED, window_bits, 8, Z_DEFAULT_STRATEGY);
+
+ if (rc != Z_OK)
+ throw Exception(ErrorCodes::ZLIB_DEFLATE_FAILED, "deflateInit2 failed: {}; zlib version: {}", zError(rc), ZLIB_VERSION);
+}
+
+void ZlibDeflatingWriteBuffer::nextImpl()
+{
+ if (!offset())
+ return;
+
+ zstr.next_in = reinterpret_cast<unsigned char *>(working_buffer.begin());
+ zstr.avail_in = static_cast<unsigned>(offset());
+
+ try
+ {
+ do
+ {
+ out->nextIfAtEnd();
+ zstr.next_out = reinterpret_cast<unsigned char *>(out->position());
+ zstr.avail_out = static_cast<unsigned>(out->buffer().end() - out->position());
+
+ int rc = deflate(&zstr, Z_NO_FLUSH);
+ out->position() = out->buffer().end() - zstr.avail_out;
+
+ if (rc != Z_OK)
+ throw Exception(ErrorCodes::ZLIB_DEFLATE_FAILED, "deflate failed: {}", zError(rc));
+ }
+ while (zstr.avail_in > 0 || zstr.avail_out == 0);
+ }
+ catch (...)
+ {
+ /// Do not try to write next time after exception.
+ out->position() = out->buffer().begin();
+ throw;
+ }
+}
+
+ZlibDeflatingWriteBuffer::~ZlibDeflatingWriteBuffer() = default;
+
+void ZlibDeflatingWriteBuffer::finalizeBefore()
+{
+ next();
+
+ /// https://github.com/zlib-ng/zlib-ng/issues/494
+ do
+ {
+ out->nextIfAtEnd();
+ zstr.next_out = reinterpret_cast<unsigned char *>(out->position());
+ zstr.avail_out = static_cast<unsigned>(out->buffer().end() - out->position());
+
+ int rc = deflate(&zstr, Z_FULL_FLUSH);
+ out->position() = out->buffer().end() - zstr.avail_out;
+
+ if (rc != Z_OK)
+ throw Exception(ErrorCodes::ZLIB_DEFLATE_FAILED, "deflate failed: {}", zError(rc));
+ }
+ while (zstr.avail_out == 0);
+
+ while (true)
+ {
+ out->nextIfAtEnd();
+ zstr.next_out = reinterpret_cast<unsigned char *>(out->position());
+ zstr.avail_out = static_cast<unsigned>(out->buffer().end() - out->position());
+
+ int rc = deflate(&zstr, Z_FINISH);
+ out->position() = out->buffer().end() - zstr.avail_out;
+
+ if (rc == Z_STREAM_END)
+ {
+ return;
+ }
+
+ if (rc != Z_OK)
+ throw Exception(ErrorCodes::ZLIB_DEFLATE_FAILED, "deflate finalizeImpl() failed: {}", zError(rc));
+ }
+}
+
+void ZlibDeflatingWriteBuffer::finalizeAfter()
+{
+ try
+ {
+ int rc = deflateEnd(&zstr);
+ if (rc != Z_OK)
+ throw Exception(ErrorCodes::ZLIB_DEFLATE_FAILED, "deflateEnd failed: {}", zError(rc));
+ }
+ catch (...)
+ {
+        /// It is OK not to terminate on an error from deflateEnd(),
+        /// since all data has already been written to the stream.
+ tryLogCurrentException(__PRETTY_FUNCTION__);
+ }
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ZlibDeflatingWriteBuffer.h b/contrib/clickhouse/src/IO/ZlibDeflatingWriteBuffer.h
new file mode 100644
index 0000000000..58e709b54e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ZlibDeflatingWriteBuffer.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <IO/WriteBuffer.h>
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/CompressionMethod.h>
+#include <IO/WriteBufferDecorator.h>
+
+
+#include <zlib.h>
+
+
+namespace DB
+{
+
+/// Performs compression using zlib library and writes compressed data to out_ WriteBuffer.
+class ZlibDeflatingWriteBuffer : public WriteBufferWithOwnMemoryDecorator
+{
+public:
+ ZlibDeflatingWriteBuffer(
+ std::unique_ptr<WriteBuffer> out_,
+ CompressionMethod compression_method,
+ int compression_level,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~ZlibDeflatingWriteBuffer() override;
+
+private:
+ void nextImpl() override;
+
+    /// Flush all pending data and write the zlib footer to the underlying buffer.
+    /// After the first call to this function, subsequent calls will have no effect and
+    /// an attempt to write to this buffer will result in an exception.
+    void finalizeBefore() override;
+    void finalizeAfter() override;
+
+ z_stream zstr;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ZlibInflatingReadBuffer.cpp b/contrib/clickhouse/src/IO/ZlibInflatingReadBuffer.cpp
new file mode 100644
index 0000000000..b43dda1bfc
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ZlibInflatingReadBuffer.cpp
@@ -0,0 +1,125 @@
+#include <IO/ZlibInflatingReadBuffer.h>
+#include <IO/WithFileName.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int ZLIB_INFLATE_FAILED;
+ extern const int ARGUMENT_OUT_OF_BOUND;
+}
+
+ZlibInflatingReadBuffer::ZlibInflatingReadBuffer(
+ std::unique_ptr<ReadBuffer> in_,
+ CompressionMethod compression_method,
+ size_t buf_size,
+ char * existing_memory,
+ size_t alignment)
+ : CompressedReadBufferWrapper(std::move(in_), buf_size, existing_memory, alignment)
+ , eof_flag(false)
+{
+ if (buf_size > max_buffer_size)
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
+ "Zlib does not support decompression with buffer size greater than {}, got buffer size: {}",
+ max_buffer_size, buf_size);
+
+ zstr.zalloc = nullptr;
+ zstr.zfree = nullptr;
+ zstr.opaque = nullptr;
+ zstr.next_in = nullptr;
+ zstr.avail_in = 0;
+ zstr.next_out = nullptr;
+ zstr.avail_out = 0;
+
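+    /// As on the compression side, adding 16 to windowBits makes inflateInit2
+    /// expect a gzip wrapper instead of a zlib one (see the zlib manual).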
+ int window_bits = 15;
+ if (compression_method == CompressionMethod::Gzip)
+ {
+ window_bits += 16;
+ }
+
+ int rc = inflateInit2(&zstr, window_bits);
+
+ if (rc != Z_OK)
+ throw Exception(ErrorCodes::ZLIB_INFLATE_FAILED, "inflateInit2 failed: {}; zlib version: {}.", zError(rc), ZLIB_VERSION);
+}
+
+ZlibInflatingReadBuffer::~ZlibInflatingReadBuffer()
+{
+ inflateEnd(&zstr);
+}
+
+bool ZlibInflatingReadBuffer::nextImpl()
+{
+    /// A do-while loop is needed to handle the situation where eof has not been reached,
+    /// yet the working buffer became empty because nothing was decompressed in the current iteration
+    /// (this can happen with compression algorithms; the same idea is implemented in ZstdInflatingReadBuffer).
+ do
+ {
+        /// If eof was already found, there is nothing to do.
+ if (eof_flag)
+ return false;
+
+        /// If there are no bytes available in zstr, point it at the next chunk of input.
+ if (!zstr.avail_in)
+ {
+ in->nextIfAtEnd();
+ zstr.next_in = reinterpret_cast<unsigned char *>(in->position());
+ zstr.avail_in = static_cast<BufferSizeType>(std::min(
+ static_cast<UInt64>(in->buffer().end() - in->position()),
+ static_cast<UInt64>(max_buffer_size)));
+ }
+
+        /// Initialize the output buffer (the place where decompressed data will go).
+ zstr.next_out = reinterpret_cast<unsigned char *>(internal_buffer.begin());
+ zstr.avail_out = static_cast<BufferSizeType>(internal_buffer.size());
+
+ size_t old_total_in = zstr.total_in;
+ int rc = inflate(&zstr, Z_NO_FLUSH);
+
+        /// Advance the input stream to the position where reading stopped.
+ size_t bytes_read = zstr.total_in - old_total_in;
+ in->position() += bytes_read;
+
+        /// Resize the working buffer (its size equals the internal_buffer size minus the unused output bytes).
+ working_buffer.resize(internal_buffer.size() - zstr.avail_out);
+
+        /// If the end of a stream was reached, it can be the end of the file or the end of a part (for example, a chunk).
+ if (rc == Z_STREAM_END)
+ {
+            /// If it is the end of the file, remember this and return:
+            /// * true if the working buffer is non-empty (there is still something to consume),
+            /// * false if the working buffer holds no data.
+ if (in->eof())
+ {
+ eof_flag = true;
+ return !working_buffer.empty();
+ }
+            /// If it is not the end of the file, reset zstr and return true: another concatenated stream follows.
+ else
+ {
+ rc = inflateReset(&zstr);
+ if (rc != Z_OK)
+ throw Exception(
+ ErrorCodes::ZLIB_INFLATE_FAILED,
+ "inflateReset failed: {}{}",
+ zError(rc),
+ getExceptionEntryWithFileName(*in));
+ return true;
+ }
+ }
+
+        /// If the result is neither stream end nor OK, something went wrong: throw an exception.
+ if (rc != Z_OK)
+ throw Exception(
+ ErrorCodes::ZLIB_INFLATE_FAILED,
+ "inflate failed: {}{}",
+ zError(rc),
+ getExceptionEntryWithFileName(*in));
+ }
+ while (working_buffer.empty());
+
+    /// If the code reaches this point, the working buffer is not empty, so there is data to process.
+ return true;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ZlibInflatingReadBuffer.h b/contrib/clickhouse/src/IO/ZlibInflatingReadBuffer.h
new file mode 100644
index 0000000000..d9ca4c6126
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ZlibInflatingReadBuffer.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+#include <IO/CompressedReadBufferWrapper.h>
+#include <IO/CompressionMethod.h>
+
+#include <limits>
+#include <zlib.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+}
+
+/// Reads compressed data from ReadBuffer in_ and performs decompression using zlib library.
+/// This buffer is able to seamlessly decompress multiple concatenated zlib streams.
+class ZlibInflatingReadBuffer : public CompressedReadBufferWrapper
+{
+public:
+ ZlibInflatingReadBuffer(
+ std::unique_ptr<ReadBuffer> in_,
+ CompressionMethod compression_method,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~ZlibInflatingReadBuffer() override;
+
+private:
+ bool nextImpl() override;
+
+ z_stream zstr;
+ bool eof_flag;
+
+    /// Limit the buffer size because zlib uses
+    /// 32-bit unsigned integers for the sizes of its internal buffers.
+ using BufferSizeType = decltype(zstr.avail_in);
+ static constexpr auto max_buffer_size = std::numeric_limits<BufferSizeType>::max();
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp b/contrib/clickhouse/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp
new file mode 100644
index 0000000000..81be8d8ce4
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp
@@ -0,0 +1,223 @@
+#include <IO/ZstdDeflatingAppendableWriteBuffer.h>
+#include <Common/Exception.h>
+#include "IO/ReadBufferFromFileBase.h"
+#include <IO/ReadBufferFromFile.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ZSTD_ENCODER_FAILED;
+}
+
+ZstdDeflatingAppendableWriteBuffer::ZstdDeflatingAppendableWriteBuffer(
+ std::unique_ptr<WriteBufferFromFileBase> out_,
+ int compression_level,
+ bool append_to_existing_file_,
+ std::function<std::unique_ptr<ReadBufferFromFileBase>()> read_buffer_creator_,
+ size_t buf_size,
+ char * existing_memory,
+ size_t alignment)
+ : BufferWithOwnMemory(buf_size, existing_memory, alignment)
+ , out(std::move(out_))
+ , read_buffer_creator(std::move(read_buffer_creator_))
+ , append_to_existing_file(append_to_existing_file_)
+{
+ cctx = ZSTD_createCCtx();
+ if (cctx == nullptr)
+ throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED, "ZSTD stream encoder init failed: ZSTD version: {}", ZSTD_VERSION_STRING);
+ size_t ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, compression_level);
+ if (ZSTD_isError(ret))
+ throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED,
+ "ZSTD stream encoder option setting failed: error code: {}; zstd version: {}",
+ ret, ZSTD_VERSION_STRING);
+
+ input = {nullptr, 0, 0};
+ output = {nullptr, 0, 0};
+}
+
+void ZstdDeflatingAppendableWriteBuffer::nextImpl()
+{
+ if (!offset())
+ return;
+
+ input.src = reinterpret_cast<unsigned char *>(working_buffer.begin());
+ input.size = offset();
+ input.pos = 0;
+
+ if (first_write && append_to_existing_file && isNeedToAddEmptyBlock())
+ {
+ addEmptyBlock();
+ first_write = false;
+ }
+
+ try
+ {
+ bool ended = false;
+ do
+ {
+ out->nextIfAtEnd();
+
+ output.dst = reinterpret_cast<unsigned char *>(out->buffer().begin());
+ output.size = out->buffer().size();
+ output.pos = out->offset();
+
+ size_t compression_result = ZSTD_compressStream2(cctx, &output, &input, ZSTD_e_flush);
+ if (ZSTD_isError(compression_result))
+ throw Exception(
+ ErrorCodes::ZSTD_ENCODER_FAILED,
+ "ZSTD stream decoding failed: error code: {}; ZSTD version: {}",
+ ZSTD_getErrorName(compression_result), ZSTD_VERSION_STRING);
+
+ first_write = false;
+ out->position() = out->buffer().begin() + output.pos;
+
+ bool everything_was_compressed = (input.pos == input.size);
+ bool everything_was_flushed = compression_result == 0;
+
+ ended = everything_was_compressed && everything_was_flushed;
+ } while (!ended);
+ }
+ catch (...)
+ {
+ /// Do not try to write next time after exception.
+ out->position() = out->buffer().begin();
+ throw;
+ }
+}
+
+ZstdDeflatingAppendableWriteBuffer::~ZstdDeflatingAppendableWriteBuffer()
+{
+ finalize();
+}
+
+void ZstdDeflatingAppendableWriteBuffer::finalizeImpl()
+{
+ if (first_write)
+ {
+        /// Nothing was written: just free the cctx.
+        finalizeZstd();
+ }
+ else
+ {
+ try
+ {
+ finalizeBefore();
+ out->finalize();
+ finalizeAfter();
+ }
+ catch (...)
+ {
+ /// Do not try to flush next time after exception.
+ out->position() = out->buffer().begin();
+ throw;
+ }
+ }
+}
+
+void ZstdDeflatingAppendableWriteBuffer::finalizeBefore()
+{
+ next();
+
+ out->nextIfAtEnd();
+
+ input.src = reinterpret_cast<unsigned char *>(working_buffer.begin());
+ input.size = offset();
+ input.pos = 0;
+
+ output.dst = reinterpret_cast<unsigned char *>(out->buffer().begin());
+ output.size = out->buffer().size();
+ output.pos = out->offset();
+
+    /// We could actually use ZSTD_e_flush here and unconditionally append the empty termination
+    /// block whenever a new buffer is created for a non-empty file (making isNeedToAddEmptyBlock unnecessary).
+    /// However, while ZSTD_decompressStream can read a non-terminated frame (we rely on that in the reader
+    /// buffer), the console zstd utility cannot.
+ size_t remaining = ZSTD_compressStream2(cctx, &output, &input, ZSTD_e_end);
+ while (remaining != 0)
+ {
+ if (ZSTD_isError(remaining))
+ throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED,
+ "ZSTD stream encoder end failed: error: '{}' ZSTD version: {}",
+ ZSTD_getErrorName(remaining), ZSTD_VERSION_STRING);
+
+ remaining = ZSTD_compressStream2(cctx, &output, &input, ZSTD_e_end);
+
+ out->position() = out->buffer().begin() + output.pos;
+
+ if (!out->hasPendingData())
+ {
+ out->next();
+ output.dst = reinterpret_cast<unsigned char *>(out->buffer().begin());
+ output.size = out->buffer().size();
+ output.pos = out->offset();
+ }
+ }
+}
+
+void ZstdDeflatingAppendableWriteBuffer::finalizeAfter()
+{
+ finalizeZstd();
+}
+
+void ZstdDeflatingAppendableWriteBuffer::finalizeZstd()
+{
+ try
+ {
+ size_t err = ZSTD_freeCCtx(cctx);
+        /// Just in case: it should be impossible to get an error from this call.
+ if (unlikely(err))
+ throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED, "ZSTD_freeCCtx failed: error: '{}'; zstd version: {}",
+ ZSTD_getErrorName(err), ZSTD_VERSION_STRING);
+ }
+ catch (...)
+ {
+        /// It is OK not to terminate on an error from ZSTD_freeCCtx(),
+        /// since all data has already been written to the stream.
+ tryLogCurrentException(__PRETTY_FUNCTION__);
+ }
+}
+
+void ZstdDeflatingAppendableWriteBuffer::addEmptyBlock()
+{
+ /// HACK: https://github.com/facebook/zstd/issues/2090#issuecomment-620158967
+
+ if (out->buffer().size() - out->offset() < ZSTD_CORRECT_TERMINATION_LAST_BLOCK.size())
+ out->next();
+
+ std::memcpy(out->buffer().begin() + out->offset(),
+ ZSTD_CORRECT_TERMINATION_LAST_BLOCK.data(), ZSTD_CORRECT_TERMINATION_LAST_BLOCK.size());
+
+ out->position() = out->buffer().begin() + out->offset() + ZSTD_CORRECT_TERMINATION_LAST_BLOCK.size();
+}
+
+
+bool ZstdDeflatingAppendableWriteBuffer::isNeedToAddEmptyBlock()
+{
+ auto reader = read_buffer_creator();
+ auto fsize = reader->getFileSize();
+ if (fsize > 3)
+ {
+ std::array<char, 3> result;
+ reader->seek(fsize - 3, SEEK_SET);
+ reader->readStrict(result.data(), 3);
+
+        /// If the file does not end with the correct block, we need to add it manually.
+        /// NOTE: the same bytes could appear in case of data corruption or an unfinished write,
+        /// but then the file is corrupted anyway and has to be removed.
+ return result != ZSTD_CORRECT_TERMINATION_LAST_BLOCK;
+ }
+ else if (fsize > 0)
+ {
+ throw Exception(
+ ErrorCodes::ZSTD_ENCODER_FAILED,
+ "Trying to write to non-empty file '{}' with tiny size {}. It can lead to data corruption",
+ out->getFileName(), fsize);
+ }
+ return false;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ZstdDeflatingAppendableWriteBuffer.h b/contrib/clickhouse/src/IO/ZstdDeflatingAppendableWriteBuffer.h
new file mode 100644
index 0000000000..d9c4f32d6d
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ZstdDeflatingAppendableWriteBuffer.h
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/CompressionMethod.h>
+#include <IO/WriteBuffer.h>
+#include <IO/WriteBufferDecorator.h>
+#include <IO/WriteBufferFromFile.h>
+#include <IO/ReadBufferFromFileBase.h>
+
+#include <zstd.h>
+
+
+namespace DB
+{
+
+/// Performs stream compression using zstd library and writes compressed data to out_ WriteBuffer.
+/// Main differences from ZstdDeflatingWriteBuffer:
+/// 1) Allows writing to continue to the same output even if finalize() (or the destructor) was never called,
+///    for example when the server was killed with signal 9. Zstd doesn't support this natively, because
+///    ZSTD_decompressStream expects to see an empty block (3 bytes: 0x01, 0x00, 0x00) at the end of each frame,
+///    and there is no API function to emit one, so we use a HACK and add the empty block manually on the first
+///    write (see addEmptyBlock). The zstd maintainers said there is no risk of compatibility issues:
+///    https://github.com/facebook/zstd/issues/2090#issuecomment-620158967.
+/// 2) Doesn't support internal ZSTD checksumming, because ZSTD checksums are written at the end of the frame (frame epilogue).
+///
+class ZstdDeflatingAppendableWriteBuffer : public BufferWithOwnMemory<WriteBuffer>
+{
+public:
+ using ZSTDLastBlock = const std::array<char, 3>;
+    /// Frame end block. If we read a non-empty file that does not end with this block, we should append it.
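+    /// Per RFC 8878, a zstd block header is 3 little-endian bytes: bit 0 marks the
+    /// last block, bits 1-2 the block type, and the remaining bits the size, so
+    /// {0x01, 0x00, 0x00} encodes a final Raw block of size zero.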
+ static inline constexpr ZSTDLastBlock ZSTD_CORRECT_TERMINATION_LAST_BLOCK = {0x01, 0x00, 0x00};
+
+ ZstdDeflatingAppendableWriteBuffer(
+ std::unique_ptr<WriteBufferFromFileBase> out_,
+ int compression_level,
+ bool append_to_existing_file_,
+ std::function<std::unique_ptr<ReadBufferFromFileBase>()> read_buffer_creator_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~ZstdDeflatingAppendableWriteBuffer() override;
+
+ void sync() override
+ {
+ next();
+ out->sync();
+ }
+
+ WriteBuffer * getNestedBuffer() { return out.get(); }
+
+private:
+    /// NOTE: fills compressed data into out's working buffer, but does not call out.next() until that buffer is full.
+ void nextImpl() override;
+
+    /// Write the terminating ZSTD_e_end: an empty block plus the frame epilogue. It
+    /// should be almost a no-op, because the frame epilogue contains only checksums,
+    /// and they are disabled for this buffer.
+    /// Flush all pending data and write the zstd footer to the underlying buffer.
+    /// After the first call to this function, subsequent calls will have no effect and
+    /// an attempt to write to this buffer will result in an exception.
+ void finalizeImpl() override;
+ void finalizeBefore();
+ void finalizeAfter();
+ void finalizeZstd();
+
+    /// Reads the last three bytes of a non-empty compressed file and compares them with
+    /// ZSTD_CORRECT_TERMINATION_LAST_BLOCK.
+ bool isNeedToAddEmptyBlock();
+
+    /// Appends the zstd empty block (ZSTD_CORRECT_TERMINATION_LAST_BLOCK) to out's working buffer.
+ void addEmptyBlock();
+
+ std::unique_ptr<WriteBufferFromFileBase> out;
+ std::function<std::unique_ptr<ReadBufferFromFileBase>()> read_buffer_creator;
+
+ bool append_to_existing_file = false;
+ ZSTD_CCtx * cctx;
+ ZSTD_inBuffer input;
+ ZSTD_outBuffer output;
+ /// Flipped on the first nextImpl call
+ bool first_write = true;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ZstdDeflatingWriteBuffer.cpp b/contrib/clickhouse/src/IO/ZstdDeflatingWriteBuffer.cpp
new file mode 100644
index 0000000000..83d8487e3e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ZstdDeflatingWriteBuffer.cpp
@@ -0,0 +1,104 @@
+#include <IO/ZstdDeflatingWriteBuffer.h>
+#include <Common/Exception.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int ZSTD_ENCODER_FAILED;
+}
+
+ZstdDeflatingWriteBuffer::ZstdDeflatingWriteBuffer(
+ std::unique_ptr<WriteBuffer> out_, int compression_level, size_t buf_size, char * existing_memory, size_t alignment)
+ : WriteBufferWithOwnMemoryDecorator(std::move(out_), buf_size, existing_memory, alignment)
+{
+ cctx = ZSTD_createCCtx();
+ if (cctx == nullptr)
+ throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED, "zstd stream encoder init failed: zstd version: {}", ZSTD_VERSION_STRING);
+ size_t ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, compression_level);
+ if (ZSTD_isError(ret))
+ throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED,
+ "zstd stream encoder option setting failed: error code: {}; zstd version: {}",
+ ret, ZSTD_VERSION_STRING);
+ ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
+ if (ZSTD_isError(ret))
+ throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED,
+ "zstd stream encoder option setting failed: error code: {}; zstd version: {}",
+ ret, ZSTD_VERSION_STRING);
+
+ input = {nullptr, 0, 0};
+ output = {nullptr, 0, 0};
+}
+
+ZstdDeflatingWriteBuffer::~ZstdDeflatingWriteBuffer() = default;
+
+void ZstdDeflatingWriteBuffer::flush(ZSTD_EndDirective mode)
+{
+ input.src = reinterpret_cast<unsigned char *>(working_buffer.begin());
+ input.size = offset();
+ input.pos = 0;
+
+ try
+ {
+ bool ended = false;
+ do
+ {
+ out->nextIfAtEnd();
+
+ output.dst = reinterpret_cast<unsigned char *>(out->buffer().begin());
+ output.size = out->buffer().size();
+ output.pos = out->offset();
+
+ size_t compression_result = ZSTD_compressStream2(cctx, &output, &input, mode);
+ if (ZSTD_isError(compression_result))
+ throw Exception(
+ ErrorCodes::ZSTD_ENCODER_FAILED,
+ "ZSTD stream encoding failed: error: '{}'; zstd version: {}",
+ ZSTD_getErrorName(compression_result), ZSTD_VERSION_STRING);
+
+ out->position() = out->buffer().begin() + output.pos;
+
+ bool everything_was_compressed = (input.pos == input.size);
+ bool everything_was_flushed = compression_result == 0;
+
+ ended = everything_was_compressed && everything_was_flushed;
+ } while (!ended);
+ }
+ catch (...)
+ {
+ /// Do not try to write next time after exception.
+ out->position() = out->buffer().begin();
+ throw;
+ }
+}
+
+void ZstdDeflatingWriteBuffer::nextImpl()
+{
+ if (offset())
+ flush(ZSTD_e_flush);
+}
+
+void ZstdDeflatingWriteBuffer::finalizeBefore()
+{
+ flush(ZSTD_e_end);
+}
+
+void ZstdDeflatingWriteBuffer::finalizeAfter()
+{
+ try
+ {
+ size_t err = ZSTD_freeCCtx(cctx);
+        /// Just in case: it should be impossible to get an error from this call.
+ if (unlikely(err))
+ throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED, "ZSTD_freeCCtx failed: error: '{}'; zstd version: {}",
+ ZSTD_getErrorName(err), ZSTD_VERSION_STRING);
+ }
+ catch (...)
+ {
+        /// It is OK not to terminate on an error from ZSTD_freeCCtx(),
+        /// since all data has already been written to the stream.
+ tryLogCurrentException(__PRETTY_FUNCTION__);
+ }
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ZstdDeflatingWriteBuffer.h b/contrib/clickhouse/src/IO/ZstdDeflatingWriteBuffer.h
new file mode 100644
index 0000000000..a66d6085a7
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ZstdDeflatingWriteBuffer.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <IO/BufferWithOwnMemory.h>
+#include <IO/CompressionMethod.h>
+#include <IO/WriteBuffer.h>
+#include <IO/WriteBufferDecorator.h>
+
+#include <zstd.h>
+
+namespace DB
+{
+
+/// Performs compression using zstd library and writes compressed data to out_ WriteBuffer.
+class ZstdDeflatingWriteBuffer : public WriteBufferWithOwnMemoryDecorator
+{
+public:
+ ZstdDeflatingWriteBuffer(
+ std::unique_ptr<WriteBuffer> out_,
+ int compression_level,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0);
+
+ ~ZstdDeflatingWriteBuffer() override;
+
+ void sync() override
+ {
+ out->sync();
+ }
+
+private:
+ void nextImpl() override;
+
+    /// Flush all pending data and write the zstd footer to the underlying buffer.
+    /// After the first call to this function, subsequent calls will have no effect and
+    /// an attempt to write to this buffer will result in an exception.
+ void finalizeBefore() override;
+ void finalizeAfter() override;
+
+ void flush(ZSTD_EndDirective mode);
+
+ ZSTD_CCtx * cctx;
+ ZSTD_inBuffer input;
+ ZSTD_outBuffer output;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/ZstdInflatingReadBuffer.cpp b/contrib/clickhouse/src/IO/ZstdInflatingReadBuffer.cpp
new file mode 100644
index 0000000000..2b663ec714
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ZstdInflatingReadBuffer.cpp
@@ -0,0 +1,95 @@
+#include <IO/ZstdInflatingReadBuffer.h>
+#include <IO/WithFileName.h>
+#include <zstd_errors.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int ZSTD_DECODER_FAILED;
+}
+
+ZstdInflatingReadBuffer::ZstdInflatingReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t buf_size, char * existing_memory, size_t alignment, int zstd_window_log_max)
+ : CompressedReadBufferWrapper(std::move(in_), buf_size, existing_memory, alignment)
+{
+ dctx = ZSTD_createDCtx();
+ input = {nullptr, 0, 0};
+ output = {nullptr, 0, 0};
+
+ if (dctx == nullptr)
+ {
+ throw Exception(ErrorCodes::ZSTD_DECODER_FAILED, "zstd_stream_decoder init failed: zstd version: {}", ZSTD_VERSION_STRING);
+ }
+
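+    /// ZSTD_d_windowLogMax caps the back-reference window the decoder will accept
+    /// (2^windowLogMax bytes); frames that need more are rejected. A value of 0
+    /// keeps the library default.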
+ size_t ret = ZSTD_DCtx_setParameter(dctx, ZSTD_d_windowLogMax, zstd_window_log_max);
+ if (ZSTD_isError(ret))
+ {
+ throw Exception(ErrorCodes::ZSTD_DECODER_FAILED, "zstd_stream_decoder init failed: {}", ZSTD_getErrorName(ret));
+ }
+}
+
+ZstdInflatingReadBuffer::~ZstdInflatingReadBuffer()
+{
+ ZSTD_freeDCtx(dctx);
+}
+
+bool ZstdInflatingReadBuffer::nextImpl()
+{
+ do
+ {
+        /// If it is known that the end of file was reached, return false.
+ if (eof_flag)
+ return false;
+
+ /// If end was reached, get next part
+ if (input.pos >= input.size)
+ {
+ in->nextIfAtEnd();
+ input.src = reinterpret_cast<unsigned char *>(in->position());
+ input.pos = 0;
+ input.size = in->buffer().end() - in->position();
+ }
+
+ /// fill output
+ output.dst = reinterpret_cast<unsigned char *>(internal_buffer.begin());
+ output.size = internal_buffer.size();
+ output.pos = 0;
+
+ /// Decompress data and check errors.
+ size_t ret = ZSTD_decompressStream(dctx, &output, &input);
+ if (ZSTD_getErrorCode(ret))
+ {
+ throw Exception(
+ ErrorCodes::ZSTD_DECODER_FAILED,
+ "ZSTD stream decoding failed: error '{}'{}; ZSTD version: {}{}",
+ ZSTD_getErrorName(ret),
+ ZSTD_error_frameParameter_windowTooLarge == ret
+ ? ". You can increase the maximum window size with the 'zstd_window_log_max' setting in ClickHouse. Example: 'SET zstd_window_log_max = 31'"
+ : "",
+ ZSTD_VERSION_STRING,
+ getExceptionEntryWithFileName(*in));
+ }
+
+        /// Check that something has changed after decompression (the input or output position).
+ assert(in->eof() || output.pos > 0 || in->position() < in->buffer().begin() + input.pos);
+
+        /// Move the input position past the consumed data.
+ in->position() = in->buffer().begin() + input.pos;
+ working_buffer.resize(output.pos);
+
+        /// If the end of file is reached, set the eof flag and return true if there is some data in the buffer, otherwise false.
+ if (in->eof())
+ {
+ eof_flag = true;
+ return !working_buffer.empty();
+ }
+        /// It is possible that the input buffer is not at eof yet while nothing was decompressed in the current iteration.
+        /// There are cases where such behaviour is not allowed: if the input buffer is not at eof,
+        /// it must be guaranteed that the working buffer is not empty. So if it is empty, continue.
+ } while (output.pos == 0);
+
+ return true;
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/ZstdInflatingReadBuffer.h b/contrib/clickhouse/src/IO/ZstdInflatingReadBuffer.h
new file mode 100644
index 0000000000..faa6231d4e
--- /dev/null
+++ b/contrib/clickhouse/src/IO/ZstdInflatingReadBuffer.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <IO/CompressedReadBufferWrapper.h>
+#include <IO/CompressionMethod.h>
+#include <IO/ReadBuffer.h>
+
+#include <zstd.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+}
+
+class ZstdInflatingReadBuffer : public CompressedReadBufferWrapper
+{
+public:
+ explicit ZstdInflatingReadBuffer(
+ std::unique_ptr<ReadBuffer> in_,
+ size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
+ char * existing_memory = nullptr,
+ size_t alignment = 0,
+ int zstd_window_log_max = 0);
+
+ ~ZstdInflatingReadBuffer() override;
+
+private:
+ bool nextImpl() override;
+
+ ZSTD_DCtx * dctx;
+ ZSTD_inBuffer input;
+ ZSTD_outBuffer output;
+ bool eof_flag = false;
+};
+
+}
diff --git a/contrib/clickhouse/src/IO/copyData.cpp b/contrib/clickhouse/src/IO/copyData.cpp
new file mode 100644
index 0000000000..07222a930b
--- /dev/null
+++ b/contrib/clickhouse/src/IO/copyData.cpp
@@ -0,0 +1,112 @@
+#include <Common/Exception.h>
+#include <Common/Throttler.h>
+#include <IO/ReadBuffer.h>
+#include <IO/WriteBuffer.h>
+#include <IO/copyData.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+ extern const int ATTEMPT_TO_READ_AFTER_EOF;
+ extern const int CANNOT_READ_ALL_DATA;
+}
+
+namespace
+{
+
+void copyDataImpl(ReadBuffer & from, WriteBuffer & to, bool check_bytes, size_t bytes, const std::atomic<int> * is_cancelled, ThrottlerPtr throttler)
+{
+    /// When reading reaches the end of the buffer, eof() either refills it with new data and moves the cursor to the beginning (returning false), or returns true.
+ while (bytes > 0 && !from.eof())
+ {
+ if (is_cancelled && *is_cancelled)
+ return;
+
+        /// buffer() is the chunk of data available for reading; position() is the cursor up to which data has already been consumed.
+ size_t count = std::min(bytes, static_cast<size_t>(from.buffer().end() - from.position()));
+ to.write(from.position(), count);
+ from.position() += count;
+ bytes -= count;
+
+ if (throttler)
+ throttler->add(count);
+ }
+
+ if (check_bytes && bytes > 0)
+ throw Exception(ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF, "Attempt to read after EOF.");
+}
+
+void copyDataImpl(ReadBuffer & from, WriteBuffer & to, bool check_bytes, size_t bytes, std::function<void()> cancellation_hook, ThrottlerPtr throttler)
+{
+    /// When reading reaches the end of the buffer, eof() either refills it with new data and moves the cursor to the beginning (returning false), or returns true.
+ while (bytes > 0 && !from.eof())
+ {
+ if (cancellation_hook)
+ cancellation_hook();
+
+        /// buffer() is the chunk of data available for reading; position() is the cursor up to which data has already been consumed.
+ size_t count = std::min(bytes, static_cast<size_t>(from.buffer().end() - from.position()));
+ to.write(from.position(), count);
+ from.position() += count;
+ bytes -= count;
+
+ if (throttler)
+ throttler->add(count);
+ }
+
+ if (check_bytes && bytes > 0)
+ throw Exception(ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF, "Attempt to read after EOF.");
+}
+
+}
+
+void copyData(ReadBuffer & from, WriteBuffer & to)
+{
+ copyDataImpl(from, to, false, std::numeric_limits<size_t>::max(), nullptr, nullptr);
+}
+
+void copyData(ReadBuffer & from, WriteBuffer & to, const std::atomic<int> & is_cancelled)
+{
+ copyDataImpl(from, to, false, std::numeric_limits<size_t>::max(), &is_cancelled, nullptr);
+}
+
+void copyData(ReadBuffer & from, WriteBuffer & to, std::function<void()> cancellation_hook)
+{
+ copyDataImpl(from, to, false, std::numeric_limits<size_t>::max(), cancellation_hook, nullptr);
+}
+
+void copyData(ReadBuffer & from, WriteBuffer & to, size_t bytes)
+{
+ copyDataImpl(from, to, true, bytes, nullptr, nullptr);
+}
+
+void copyData(ReadBuffer & from, WriteBuffer & to, size_t bytes, const std::atomic<int> & is_cancelled)
+{
+ copyDataImpl(from, to, true, bytes, &is_cancelled, nullptr);
+}
+
+void copyData(ReadBuffer & from, WriteBuffer & to, size_t bytes, std::function<void()> cancellation_hook)
+{
+ copyDataImpl(from, to, true, bytes, cancellation_hook, nullptr);
+}
+
+void copyDataMaxBytes(ReadBuffer & from, WriteBuffer & to, size_t max_bytes)
+{
+ copyDataImpl(from, to, false, max_bytes, nullptr, nullptr);
+ if (!from.eof())
+ throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all data, max readable size reached.");
+}
+
+void copyDataWithThrottler(ReadBuffer & from, WriteBuffer & to, const std::atomic<int> & is_cancelled, ThrottlerPtr throttler)
+{
+ copyDataImpl(from, to, false, std::numeric_limits<size_t>::max(), &is_cancelled, throttler);
+}
+
+void copyDataWithThrottler(ReadBuffer & from, WriteBuffer & to, size_t bytes, const std::atomic<int> & is_cancelled, ThrottlerPtr throttler)
+{
+ copyDataImpl(from, to, true, bytes, &is_cancelled, throttler);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/copyData.h b/contrib/clickhouse/src/IO/copyData.h
new file mode 100644
index 0000000000..b67088d8e4
--- /dev/null
+++ b/contrib/clickhouse/src/IO/copyData.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <atomic>
+#include <functional>
+
+
+namespace DB
+{
+
+class ReadBuffer;
+class WriteBuffer;
+class Throttler;
+
+using ThrottlerPtr = std::shared_ptr<Throttler>;
+
+
+/// Copies all available data from ReadBuffer to WriteBuffer.
+void copyData(ReadBuffer & from, WriteBuffer & to);
+
+/// Copies `bytes` bytes from ReadBuffer to WriteBuffer. If fewer than `bytes` bytes are available, throws an exception.
+void copyData(ReadBuffer & from, WriteBuffer & to, size_t bytes);
+
+/// The same, but with a cancellation condition.
+void copyData(ReadBuffer & from, WriteBuffer & to, const std::atomic<int> & is_cancelled);
+void copyData(ReadBuffer & from, WriteBuffer & to, size_t bytes, const std::atomic<int> & is_cancelled);
+
+void copyData(ReadBuffer & from, WriteBuffer & to, std::function<void()> cancellation_hook);
+void copyData(ReadBuffer & from, WriteBuffer & to, size_t bytes, std::function<void()> cancellation_hook);
+
+/// Copies at most `max_bytes` bytes from ReadBuffer to WriteBuffer. If there are more bytes, then throws an exception.
+void copyDataMaxBytes(ReadBuffer & from, WriteBuffer & to, size_t max_bytes);
+
+/// Same as above, but also uses a throttler to limit the maximum speed.
+void copyDataWithThrottler(ReadBuffer & from, WriteBuffer & to, const std::atomic<int> & is_cancelled, ThrottlerPtr throttler);
+void copyDataWithThrottler(ReadBuffer & from, WriteBuffer & to, size_t bytes, const std::atomic<int> & is_cancelled, ThrottlerPtr throttler);
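+
+/// Usage sketch (assuming ReadBufferFromFile and WriteBufferFromFile from <IO/...> are available):
+///
+///     ReadBufferFromFile in("/tmp/src");
+///     WriteBufferFromFile out("/tmp/dst");
+///     copyData(in, out);  /// copies until in.eof()
+///     out.finalize();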
+
+}
diff --git a/contrib/clickhouse/src/IO/parseDateTimeBestEffort.cpp b/contrib/clickhouse/src/IO/parseDateTimeBestEffort.cpp
new file mode 100644
index 0000000000..6bdba251c3
--- /dev/null
+++ b/contrib/clickhouse/src/IO/parseDateTimeBestEffort.cpp
@@ -0,0 +1,723 @@
+#include <Common/DateLUTImpl.h>
+#include <Common/StringUtils/StringUtils.h>
+
+#include <IO/ReadBuffer.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <IO/parseDateTimeBestEffort.h>
+
+#include <limits>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int LOGICAL_ERROR;
+ extern const int CANNOT_PARSE_DATETIME;
+}
+
+
+namespace
+{
+
+inline size_t readDigits(char * res, size_t max_chars, ReadBuffer & in)
+{
+ size_t num_chars = 0;
+ while (!in.eof() && isNumericASCII(*in.position()) && num_chars < max_chars)
+ {
+ res[num_chars] = *in.position() - '0';
+ ++num_chars;
+ ++in.position();
+ }
+ return num_chars;
+}
+
+inline size_t readAlpha(char * res, size_t max_chars, ReadBuffer & in)
+{
+ size_t num_chars = 0;
+ while (!in.eof() && isAlphaASCII(*in.position()) && num_chars < max_chars)
+ {
+ res[num_chars] = *in.position();
+ ++num_chars;
+ ++in.position();
+ }
+ return num_chars;
+}
+
+template <size_t digit, size_t power_of_ten, typename T>
+inline void readDecimalNumberImpl(T & res, const char * src)
+{
+ res += src[digit] * power_of_ten;
+ if constexpr (digit > 0)
+ readDecimalNumberImpl<digit - 1, power_of_ten * 10>(res, src);
+}
+
+template <size_t num_digits, typename T>
+inline void readDecimalNumber(T & res, const char * src)
+{
+ readDecimalNumberImpl<num_digits - 1, 1>(res, src);
+}
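+
+/// Unrolling sketch: readDigits() stores digit values (not ASCII characters), so for
+/// src = {2, 0, 2, 3} the call readDecimalNumber<4>(res, src) accumulates
+/// 3*1 + 2*10 + 0*100 + 2*1000 == 2023, with the recursion unrolled at compile time.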
+
+template <typename T>
+inline void readDecimalNumber(T & res, size_t num_digits, const char * src)
+{
+#define READ_DECIMAL_NUMBER(N) do { res *= common::exp10_i32(N); readDecimalNumber<N>(res, src); src += (N); num_digits -= (N); } while (false)
+ while (num_digits)
+ {
+ switch (num_digits)
+ {
+ case 3: READ_DECIMAL_NUMBER(3); break;
+ case 2: READ_DECIMAL_NUMBER(2); break;
+ case 1: READ_DECIMAL_NUMBER(1); break;
+ default: READ_DECIMAL_NUMBER(4); break;
+ }
+ }
+#undef READ_DECIMAL_NUMBER
+}
+
+struct DateTimeSubsecondPart
+{
+ Int64 value;
+ UInt8 digits;
+};
+
+template <typename ReturnType, bool is_us_style>
+ReturnType parseDateTimeBestEffortImpl(
+ time_t & res,
+ ReadBuffer & in,
+ const DateLUTImpl & local_time_zone,
+ const DateLUTImpl & utc_time_zone,
+ DateTimeSubsecondPart * fractional)
+{
+ auto on_error = [&]<typename... FmtArgs>(int error_code [[maybe_unused]],
+ FormatStringHelper<FmtArgs...> fmt_string [[maybe_unused]],
+ FmtArgs && ...fmt_args [[maybe_unused]])
+ {
+ if constexpr (std::is_same_v<ReturnType, void>)
+ throw ParsingException(error_code, std::move(fmt_string), std::forward<FmtArgs>(fmt_args)...);
+ else
+ return false;
+ };
+
+ res = 0;
+ UInt16 year = 0;
+ UInt8 month = 0;
+ UInt8 day_of_month = 0;
+ UInt8 hour = 0;
+ UInt8 minute = 0;
+ UInt8 second = 0;
+
+ bool has_time = false;
+
+ bool has_time_zone_offset = false;
+ bool time_zone_offset_negative = false;
+ UInt8 time_zone_offset_hour = 0;
+ UInt8 time_zone_offset_minute = 0;
+
+ bool is_am = false;
+ bool is_pm = false;
+
+ bool has_comma_between_date_and_time = false;
+
+ auto read_alpha_month = [&month] (const auto & alpha)
+ {
+ if (0 == strncasecmp(alpha, "Jan", 3)) month = 1;
+ else if (0 == strncasecmp(alpha, "Feb", 3)) month = 2;
+ else if (0 == strncasecmp(alpha, "Mar", 3)) month = 3;
+ else if (0 == strncasecmp(alpha, "Apr", 3)) month = 4;
+ else if (0 == strncasecmp(alpha, "May", 3)) month = 5;
+ else if (0 == strncasecmp(alpha, "Jun", 3)) month = 6;
+ else if (0 == strncasecmp(alpha, "Jul", 3)) month = 7;
+ else if (0 == strncasecmp(alpha, "Aug", 3)) month = 8;
+ else if (0 == strncasecmp(alpha, "Sep", 3)) month = 9;
+ else if (0 == strncasecmp(alpha, "Oct", 3)) month = 10;
+ else if (0 == strncasecmp(alpha, "Nov", 3)) month = 11;
+ else if (0 == strncasecmp(alpha, "Dec", 3)) month = 12;
+ else
+ return false;
+ return true;
+ };
+
+ while (!in.eof())
+ {
+ if ((year && !has_time) || (!year && has_time))
+ {
+ if (*in.position() == ',')
+ {
+ has_comma_between_date_and_time = true;
+ ++in.position();
+ }
+ }
+
+ char digits[std::numeric_limits<UInt64>::digits10];
+
+ size_t num_digits = 0;
+
+ if (!year || !has_time)
+ {
+ num_digits = readDigits(digits, sizeof(digits), in);
+
+ if (num_digits == 13 && !year && !has_time)
+ {
+                /// This is a unix timestamp with milliseconds.
+ readDecimalNumber<10>(res, digits);
+ if (fractional)
+ {
+ fractional->digits = 3;
+ readDecimalNumber<3>(fractional->value, digits + 10);
+ }
+ return ReturnType(true);
+ }
+ else if (num_digits == 10 && !year && !has_time)
+ {
+                /// This is a unix timestamp.
+ readDecimalNumber<10>(res, digits);
+ return ReturnType(true);
+ }
+ else if (num_digits == 9 && !year && !has_time)
+ {
+                /// This is a unix timestamp.
+ readDecimalNumber<9>(res, digits);
+ return ReturnType(true);
+ }
+ else if (num_digits == 14 && !year && !has_time)
+ {
+ /// This is YYYYMMDDhhmmss
+ readDecimalNumber<4>(year, digits);
+ readDecimalNumber<2>(month, digits + 4);
+ readDecimalNumber<2>(day_of_month, digits + 6);
+ readDecimalNumber<2>(hour, digits + 8);
+ readDecimalNumber<2>(minute, digits + 10);
+ readDecimalNumber<2>(second, digits + 12);
+ has_time = true;
+ }
+ else if (num_digits == 8 && !year)
+ {
+ /// This is YYYYMMDD
+ readDecimalNumber<4>(year, digits);
+ readDecimalNumber<2>(month, digits + 4);
+ readDecimalNumber<2>(day_of_month, digits + 6);
+ }
+ else if (num_digits == 6)
+ {
+ /// This is YYYYMM or hhmmss
+ if (!year && !month)
+ {
+ readDecimalNumber<4>(year, digits);
+ readDecimalNumber<2>(month, digits + 4);
+ }
+ else if (!has_time)
+ {
+ readDecimalNumber<2>(hour, digits);
+ readDecimalNumber<2>(minute, digits + 2);
+ readDecimalNumber<2>(second, digits + 4);
+ has_time = true;
+ }
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: ambiguous 6 digits, it can be YYYYMM or hhmmss");
+ }
+ else if (num_digits == 4 && !year)
+ {
+ /// YYYY
+ /// YYYY*MM
+ /// YYYY*MM*DD
+ /// YYYY*M
+ /// YYYY*M*DD
+ /// YYYY*M*D
+
+ readDecimalNumber<4>(year, digits);
+
+ if (!in.eof())
+ {
+ char delimiter_after_year = *in.position();
+
+ if (delimiter_after_year < 0x20
+ || delimiter_after_year == ','
+ || delimiter_after_year == ';'
+ || delimiter_after_year == '\''
+ || delimiter_after_year == '"')
+ break;
+
+ if (month)
+ continue;
+
+ ++in.position();
+
+ num_digits = readDigits(digits, sizeof(digits), in);
+
+ if (num_digits == 2)
+ readDecimalNumber<2>(month, digits);
+ else if (num_digits == 1)
+ readDecimalNumber<1>(month, digits);
+ else if (delimiter_after_year == ' ')
+ continue;
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits after year: {}", num_digits);
+
+                    /// Only the same delimiter is accepted between components.
+ if (!day_of_month && checkChar(delimiter_after_year, in))
+ {
+ num_digits = readDigits(digits, sizeof(digits), in);
+
+ if (num_digits == 2)
+ readDecimalNumber<2>(day_of_month, digits);
+ else if (num_digits == 1)
+ readDecimalNumber<1>(day_of_month, digits);
+ else if (delimiter_after_year == ' ')
+ continue;
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits after year and month: {}", num_digits);
+ }
+ }
+ }
+ else if (num_digits == 2 || num_digits == 1)
+ {
+ /// hh:mm:ss
+ /// hh:mm
+ /// hh - only if already have day of month
+ /// DD/MM/YYYY
+ /// DD/MM/YY
+ /// DD.MM.YYYY
+ /// DD.MM.YY
+ /// DD-MM-YYYY
+ /// DD-MM-YY
+ /// DD
+
+ UInt8 hour_or_day_of_month_or_month = 0;
+ if (num_digits == 2)
+ readDecimalNumber<2>(hour_or_day_of_month_or_month, digits);
+ else if (num_digits == 1)
+ readDecimalNumber<1>(hour_or_day_of_month_or_month, digits);
+ else
+ return on_error(ErrorCodes::LOGICAL_ERROR, "Cannot read DateTime: logical error, unexpected branch in code");
+
+ if (checkChar(':', in))
+ {
+ if (has_time)
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: time component is duplicated");
+
+ hour = hour_or_day_of_month_or_month;
+ has_time = true;
+
+ num_digits = readDigits(digits, sizeof(digits), in);
+
+ if (num_digits == 2)
+ readDecimalNumber<2>(minute, digits);
+ else if (num_digits == 1)
+ readDecimalNumber<1>(minute, digits);
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits after hour: {}", num_digits);
+
+ if (checkChar(':', in))
+ {
+ num_digits = readDigits(digits, sizeof(digits), in);
+
+ if (num_digits == 2)
+ readDecimalNumber<2>(second, digits);
+ else if (num_digits == 1)
+ readDecimalNumber<1>(second, digits);
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits after hour and minute: {}", num_digits);
+ }
+ }
+ else if (checkChar(',', in))
+ {
+ if (month && !day_of_month)
+ day_of_month = hour_or_day_of_month_or_month;
+ }
+ else if (checkChar('/', in) || checkChar('.', in) || checkChar('-', in))
+ {
+ if (day_of_month)
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: day of month is duplicated");
+
+ if (month)
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: month is duplicated");
+
+ if constexpr (is_us_style)
+ {
+ month = hour_or_day_of_month_or_month;
+ num_digits = readDigits(digits, sizeof(digits), in);
+ if (num_digits == 2)
+ readDecimalNumber<2>(day_of_month, digits);
+ else if (num_digits == 1)
+ readDecimalNumber<1>(day_of_month, digits);
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits after month: {}", num_digits);
+ }
+ else
+ {
+ day_of_month = hour_or_day_of_month_or_month;
+
+ num_digits = readDigits(digits, sizeof(digits), in);
+
+ if (num_digits == 2)
+ readDecimalNumber<2>(month, digits);
+ else if (num_digits == 1)
+ readDecimalNumber<1>(month, digits);
+ else if (num_digits == 0)
+ {
+ /// Month in alphabetical form
+
+ char alpha[9]; /// The longest month name: September
+ size_t num_alpha = readAlpha(alpha, sizeof(alpha), in);
+
+ if (num_alpha < 3)
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of alphabetical characters after day of month: {}", num_alpha);
+
+ if (!read_alpha_month(alpha))
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: alphabetical characters after day of month don't look like month: {}", std::string(alpha, 3));
+ }
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits after day of month: {}", num_digits);
+ }
+
+ if (month > 12)
+ std::swap(month, day_of_month);
+
+ if (checkChar('/', in) || checkChar('.', in) || checkChar('-', in))
+ {
+ if (year)
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: year component is duplicated");
+
+ num_digits = readDigits(digits, sizeof(digits), in);
+
+ if (num_digits == 4)
+ readDecimalNumber<4>(year, digits);
+ else if (num_digits == 2)
+ {
+ readDecimalNumber<2>(year, digits);
+
+ if (year >= 70)
+ year += 1900;
+ else
+ year += 2000;
+ }
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits after day of month and month: {}", num_digits);
+ }
+ }
+ else
+ {
+ if (day_of_month)
+ hour = hour_or_day_of_month_or_month;
+ else
+ day_of_month = hour_or_day_of_month_or_month;
+ }
+ }
+ else if (num_digits != 0)
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits: {}", num_digits);
+ }
+
+ if (num_digits == 0)
+ {
+ char c = *in.position();
+
+ /// 'T' is a separator between date and time according to ISO 8601.
+ /// But don't skip it if we didn't read the date part yet, because 'T' is also a prefix for 'Tue' and 'Thu'.
+
+ if (c == ' ' || (c == 'T' && year && !has_time))
+ {
+ ++in.position();
+ }
+ else if (c == 'Z')
+ {
+ ++in.position();
+ has_time_zone_offset = true;
+ }
+            else if (c == '.') /// We don't support a comma (ISO 8601:2004) before the fractional part of the second, to avoid clashing with the CSV separator.
+ {
+ if (!has_time)
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected point symbol");
+
+ ++in.position();
+ num_digits = readDigits(digits, sizeof(digits), in);
+ if (fractional)
+ {
+ using FractionalType = typename std::decay<decltype(fractional->value)>::type;
+                /// Reading more decimal digits than fit into FractionalType would cause an
+                /// overflow, so it is better to drop the rightmost digits that do not fit
+                /// into the result type, yielding a less precise value rather than a bogus one.
+ num_digits = std::min(static_cast<size_t>(std::numeric_limits<FractionalType>::digits10), num_digits);
+
+ fractional->digits = num_digits;
+ readDecimalNumber(fractional->value, num_digits, digits);
+ }
+ }
+ else if (c == '+' || c == '-')
+ {
+ ++in.position();
+ num_digits = readDigits(digits, sizeof(digits), in);
+
+ if (num_digits == 6 && !has_time && year && month && day_of_month)
+ {
+ /// It looks like hhmmss
+ readDecimalNumber<2>(hour, digits);
+ readDecimalNumber<2>(minute, digits + 2);
+ readDecimalNumber<2>(second, digits + 4);
+ has_time = true;
+ }
+ else
+ {
+ /// It looks like time zone offset
+ has_time_zone_offset = true;
+ if (c == '-')
+ time_zone_offset_negative = true;
+
+ if (num_digits == 4)
+ {
+ readDecimalNumber<2>(time_zone_offset_hour, digits);
+ readDecimalNumber<2>(time_zone_offset_minute, digits + 2);
+ }
+ else if (num_digits == 3)
+ {
+ readDecimalNumber<1>(time_zone_offset_hour, digits);
+ readDecimalNumber<2>(time_zone_offset_minute, digits + 1);
+ }
+ else if (num_digits == 2)
+ {
+ readDecimalNumber<2>(time_zone_offset_hour, digits);
+ }
+ else if (num_digits == 1)
+ {
+ readDecimalNumber<1>(time_zone_offset_hour, digits);
+ }
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits for time zone offset: {}", num_digits);
+
+ if (num_digits < 3 && checkChar(':', in))
+ {
+ num_digits = readDigits(digits, sizeof(digits), in);
+
+ if (num_digits == 2)
+ {
+ readDecimalNumber<2>(time_zone_offset_minute, digits);
+ }
+ else if (num_digits == 1)
+ {
+ readDecimalNumber<1>(time_zone_offset_minute, digits);
+ }
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits for time zone offset in minutes: {}", num_digits);
+ }
+ }
+ }
+ else
+ {
+ char alpha[3];
+
+ size_t num_alpha = readAlpha(alpha, sizeof(alpha), in);
+
+ if (!num_alpha)
+ {
+ break;
+ }
+ else if (num_alpha == 1)
+ {
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected alphabetical character");
+ }
+ else if (num_alpha == 2)
+ {
+ if (alpha[1] == 'M' || alpha[1] == 'm')
+ {
+ if (alpha[0] == 'A' || alpha[0] == 'a')
+ {
+ is_am = true;
+ }
+ else if (alpha[0] == 'P' || alpha[0] == 'p')
+ {
+ is_pm = true;
+ }
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected word");
+ }
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected word");
+ }
+ else if (num_alpha == 3)
+ {
+ bool has_day_of_week = false;
+
+ if (read_alpha_month(alpha))
+ {
+ }
+ else if (0 == strncasecmp(alpha, "UTC", 3)) has_time_zone_offset = true; // NOLINT
+ else if (0 == strncasecmp(alpha, "GMT", 3)) has_time_zone_offset = true;
+ else if (0 == strncasecmp(alpha, "MSK", 3)) { has_time_zone_offset = true; time_zone_offset_hour = 3; }
+ else if (0 == strncasecmp(alpha, "MSD", 3)) { has_time_zone_offset = true; time_zone_offset_hour = 4; }
+
+ else if (0 == strncasecmp(alpha, "Mon", 3)) has_day_of_week = true; // NOLINT
+ else if (0 == strncasecmp(alpha, "Tue", 3)) has_day_of_week = true;
+ else if (0 == strncasecmp(alpha, "Wed", 3)) has_day_of_week = true;
+ else if (0 == strncasecmp(alpha, "Thu", 3)) has_day_of_week = true;
+ else if (0 == strncasecmp(alpha, "Fri", 3)) has_day_of_week = true;
+ else if (0 == strncasecmp(alpha, "Sat", 3)) has_day_of_week = true;
+ else if (0 == strncasecmp(alpha, "Sun", 3)) has_day_of_week = true;
+
+ else
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected word");
+
+ while (!in.eof() && isAlphaASCII(*in.position()))
+ ++in.position();
+
+ /// For RFC 2822
+ if (has_day_of_week)
+ checkChar(',', in);
+ }
+ else
+ return on_error(ErrorCodes::LOGICAL_ERROR, "Cannot read DateTime: logical error, unexpected branch in code");
+ }
+ }
+ }
+
+    /// A date followed by a comma but no time, like '2022/03/04, ', must fail to parse.
+ if (has_comma_between_date_and_time && (!has_time || !year || !month || !day_of_month))
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected word after Date");
+
+ /// If neither Date nor Time is parsed successfully, it should fail
+ if (!year && !month && !day_of_month && !has_time)
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: neither Date nor Time was parsed successfully");
+
+ if (!day_of_month)
+ day_of_month = 1;
+ if (!month)
+ month = 1;
+ if (!year)
+ {
+ time_t now = time(nullptr);
+ UInt16 curr_year = local_time_zone.toYear(now);
+ year = now < local_time_zone.makeDateTime(curr_year, month, day_of_month, hour, minute, second) ? curr_year - 1 : curr_year;
+ }
+
+ auto is_leap_year = (year % 400 == 0) || (year % 100 != 0 && year % 4 == 0);
+
+ auto check_date = [](const auto & is_leap_year_, const auto & month_, const auto & day_)
+ {
+ if ((month_ == 1 || month_ == 3 || month_ == 5 || month_ == 7 || month_ == 8 || month_ == 10 || month_ == 12) && day_ >= 1 && day_ <= 31)
+ return true;
+ else if (month_ == 2 && ((is_leap_year_ && day_ >= 1 && day_ <= 29) || (!is_leap_year_ && day_ >= 1 && day_ <= 28)))
+ return true;
+ else if ((month_ == 4 || month_ == 6 || month_ == 9 || month_ == 11) && day_ >= 1 && day_ <= 30)
+ return true;
+ return false;
+ };
+
+ if (!check_date(is_leap_year, month, day_of_month))
+ return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected date: {}-{}-{}",
+ year, static_cast<UInt16>(month), static_cast<UInt16>(day_of_month));
+
+ if (is_am && hour == 12)
+ hour = 0;
+
+ if (is_pm && hour < 12)
+ hour += 12;
+
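+    /// The parsed wall-clock components are combined in UTC below and then shifted by
+    /// the offset: e.g. '2000-01-01 12:00:00 +0530' is 12:00 local wall clock, which is
+    /// 06:30 UTC, so a positive offset is subtracted from the epoch value.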
+ auto adjust_time_zone = [&]
+ {
+ if (time_zone_offset_hour)
+ {
+ if (time_zone_offset_negative)
+ res += time_zone_offset_hour * 3600;
+ else
+ res -= time_zone_offset_hour * 3600;
+ }
+
+ if (time_zone_offset_minute)
+ {
+ if (time_zone_offset_negative)
+ res += time_zone_offset_minute * 60;
+ else
+ res -= time_zone_offset_minute * 60;
+ }
+ };
+
+ if (has_time_zone_offset)
+ {
+ res = utc_time_zone.makeDateTime(year, month, day_of_month, hour, minute, second);
+ adjust_time_zone();
+ }
+ else
+ {
+ res = local_time_zone.makeDateTime(year, month, day_of_month, hour, minute, second);
+ }
+
+ return ReturnType(true);
+}
+
+template <typename ReturnType, bool is_us_style>
+ReturnType parseDateTime64BestEffortImpl(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone)
+{
+ time_t whole;
+    DateTimeSubsecondPart subsecond = {0, 0}; // needs to be explicitly initialized, since it may be missing from the input string
+
+ if constexpr (std::is_same_v<ReturnType, bool>)
+ {
+ if (!parseDateTimeBestEffortImpl<bool, is_us_style>(whole, in, local_time_zone, utc_time_zone, &subsecond))
+ return false;
+ }
+ else
+ {
+ parseDateTimeBestEffortImpl<ReturnType, is_us_style>(whole, in, local_time_zone, utc_time_zone, &subsecond);
+ }
+
+
+ DateTime64::NativeType fractional = subsecond.value;
+ if (scale < subsecond.digits)
+ {
+ fractional /= common::exp10_i64(subsecond.digits - scale);
+ }
+ else if (scale > subsecond.digits)
+ {
+ fractional *= common::exp10_i64(scale - subsecond.digits);
+ }
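+    /// E.g. '.123456789' read into a DateTime64(3): subsecond = {123456789, 9},
+    /// so the fractional part is divided by 10^(9 - 3), leaving 123 milliseconds.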
+
+ if constexpr (std::is_same_v<ReturnType, bool>)
+ return DecimalUtils::tryGetDecimalFromComponents<DateTime64>(whole, fractional, scale, res);
+
+ res = DecimalUtils::decimalFromComponents<DateTime64>(whole, fractional, scale);
+ return ReturnType(true);
+}
+
+}
+
+void parseDateTimeBestEffort(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone)
+{
+ parseDateTimeBestEffortImpl<void, false>(res, in, local_time_zone, utc_time_zone, nullptr);
+}
+
+void parseDateTimeBestEffortUS(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone)
+{
+ parseDateTimeBestEffortImpl<void, true>(res, in, local_time_zone, utc_time_zone, nullptr);
+}
+
+bool tryParseDateTimeBestEffort(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone)
+{
+ return parseDateTimeBestEffortImpl<bool, false>(res, in, local_time_zone, utc_time_zone, nullptr);
+}
+
+bool tryParseDateTimeBestEffortUS(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone)
+{
+ return parseDateTimeBestEffortImpl<bool, true>(res, in, local_time_zone, utc_time_zone, nullptr);
+}
+
+void parseDateTime64BestEffort(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone)
+{
+ return parseDateTime64BestEffortImpl<void, false>(res, scale, in, local_time_zone, utc_time_zone);
+}
+
+void parseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone)
+{
+ return parseDateTime64BestEffortImpl<void, true>(res, scale, in, local_time_zone, utc_time_zone);
+}
+
+bool tryParseDateTime64BestEffort(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone)
+{
+ return parseDateTime64BestEffortImpl<bool, false>(res, scale, in, local_time_zone, utc_time_zone);
+}
+
+bool tryParseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone)
+{
+ return parseDateTime64BestEffortImpl<bool, true>(res, scale, in, local_time_zone, utc_time_zone);
+}
+
+}
diff --git a/contrib/clickhouse/src/IO/parseDateTimeBestEffort.h b/contrib/clickhouse/src/IO/parseDateTimeBestEffort.h
new file mode 100644
index 0000000000..22af44f9e7
--- /dev/null
+++ b/contrib/clickhouse/src/IO/parseDateTimeBestEffort.h
@@ -0,0 +1,66 @@
+#pragma once
+#include <stddef.h>
+#include <time.h>
+
+#include <Core/Types.h>
+
+class DateLUTImpl;
+
+namespace DB
+{
+
+class ReadBuffer;
+
+/** https://xkcd.com/1179/
+ *
+ * The existence of this function is an example of bad practice
+ * and contradicts our development principles.
+ *
+ * This function will recognize the following patterns:
+ *
+ * NNNNNNNNNN - 9..10 digits is a unix timestamp
+ *
+ * YYYYMMDDhhmmss - 14 digits in a row are always interpreted this way
+ *
+ * YYYYMMDD - 8 digits in a row
+ * YYYY*MM*DD - or with any delimiter after the first 4-digit year component and after the month.
+ *
+ * DD/MM/YY
+ * DD/MM/YYYY - when '/' separator is used, these are the only possible forms
+ *
+ * hh:mm:ss - when ':' separator is used, it is always time
+ * hh:mm - it can be specified without seconds
+ *
+ * YYYY - 4 digits is always year
+ *
+ * YYYYMM - 6 digits are a year and month, if the year was not already read
+ * hhmmss - 6 digits are a time, if the year was already read
+ *
+ * .nnnnnnn - any number of digits after the point is the fractional part of a second, unless it is YYYY.MM.DD or DD.MM.YYYY
+ *
+ * T - means that time will follow
+ *
+ * Z - means zero UTC offset
+ *
+ * +hhmm
+ * +hh:mm
+ * +hh
+ * -... - time zone offset
+ *
+ * a single whitespace character can be used as a separator
+ *
+ * AM/PM - AM means: subtract 12 hours if the hour is 12; PM means: add 12 hours if the hour is less than 12.
+ *
+ * Jan/Feb/Mar/Apr/May/Jun/Jul/Aug/Sep/Oct/Nov/Dec - allowed to specify month
+ * Mon/Tue/Wed/Thu/Fri/Sat/Sun - simply ignored.
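+ *
+ * Examples of inputs matching the rules above: '1534576936', '2018-08-18 07:22:16',
+ * '2018-08-18T07:22:16Z', 'Sat, 18 Aug 2018 07:22:16 GMT', '20180818072216'.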
+ */
+
+void parseDateTimeBestEffort(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
+bool tryParseDateTimeBestEffort(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
+void parseDateTimeBestEffortUS(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
+bool tryParseDateTimeBestEffortUS(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
+void parseDateTime64BestEffort(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
+bool tryParseDateTime64BestEffort(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
+void parseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
+bool tryParseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
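+
+/** Usage sketch (illustrative only):
+ *
+ *     ReadBufferFromString in("2018-08-18 07:22:16");
+ *     time_t result = 0;
+ *     parseDateTimeBestEffort(result, in, DateLUT::instance(), DateLUT::instance("UTC"));
+ */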
+}
diff --git a/contrib/clickhouse/src/IO/readDecimalText.h b/contrib/clickhouse/src/IO/readDecimalText.h
new file mode 100644
index 0000000000..9fd9c439b8
--- /dev/null
+++ b/contrib/clickhouse/src/IO/readDecimalText.h
@@ -0,0 +1,227 @@
+#pragma once
+
+#include <limits>
+#include <IO/ReadHelpers.h>
+#include <Common/intExp.h>
+#include <base/wide_integer_to_string.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_PARSE_NUMBER;
+ extern const int ARGUMENT_OUT_OF_BOUND;
+}
+
+/// Try to read a Decimal into underlying type T from ReadBuffer. Throws if 'digits_only' is set and there is an unexpected symbol in the input.
+/// Returns the integer 'exponent' factor that x should be multiplied by to get the correct Decimal value: result = x * 10^exponent.
+/// On input, 'digits' is the maximum number of meaningful decimal digits allowed in the result; on output, it holds the actual number of meaningful digits read.
+/// Does not care about decimal scale, only about meaningful digits in the decimal text representation.
+template <bool _throw_on_error, typename T>
+inline bool readDigits(ReadBuffer & buf, T & x, uint32_t & digits, int32_t & exponent, bool digits_only = false)
+{
+ x = T(0);
+ exponent = 0;
+ uint32_t max_digits = digits;
+ digits = 0;
+ uint32_t places = 0;
+ typename T::NativeType sign = 1;
+ bool leading_zeroes = true;
+ bool after_point = false;
+
+ if (buf.eof())
+ {
+ if constexpr (_throw_on_error)
+ throwReadAfterEOF();
+ return false;
+ }
+
+ switch (*buf.position())
+ {
+ case '-':
+ sign = -1;
+ [[fallthrough]];
+ case '+':
+ ++buf.position();
+ break;
+ }
+
+ bool stop = false;
+ while (!buf.eof() && !stop)
+ {
+ const char & byte = *buf.position();
+ switch (byte)
+ {
+ case '.':
+ after_point = true;
+ leading_zeroes = false;
+ break;
+ case '0':
+ {
+ if (leading_zeroes)
+ break;
+
+ if (after_point)
+ {
+ ++places; /// Count trailing zeroes. They would be used only if there's some other digit after them.
+ break;
+ }
+ [[fallthrough]];
+ }
+ case '1': [[fallthrough]];
+ case '2': [[fallthrough]];
+ case '3': [[fallthrough]];
+ case '4': [[fallthrough]];
+ case '5': [[fallthrough]];
+ case '6': [[fallthrough]];
+ case '7': [[fallthrough]];
+ case '8': [[fallthrough]];
+ case '9':
+ {
+ leading_zeroes = false;
+
+                ++places; // Pending trailing zeroes plus the current digit.
+ if (digits + places > max_digits)
+ {
+ if (after_point)
+ {
+ /// Simply cut excessive digits.
+ break;
+ }
+ else
+ {
+ if constexpr (_throw_on_error)
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Too many digits ({} > {}) in decimal value",
+ std::to_string(digits + places), std::to_string(max_digits));
+
+ return false;
+ }
+ }
+ else
+ {
+ digits += places;
+ if (after_point)
+ exponent -= places;
+
+ // TODO: accurate shift10 for big integers
+ x *= intExp10OfSize<typename T::NativeType>(places);
+ places = 0;
+
+ x += (byte - '0');
+ break;
+ }
+ }
+ case 'e': [[fallthrough]];
+ case 'E':
+ {
+ ++buf.position();
+ Int32 addition_exp = 0;
+ if (!tryReadIntText(addition_exp, buf))
+ {
+ if constexpr (_throw_on_error)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot parse exponent while reading decimal");
+ else
+ return false;
+ }
+ exponent += addition_exp;
+ stop = true;
+ continue;
+ }
+
+ default:
+ if (digits_only)
+ {
+ if constexpr (_throw_on_error)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Unexpected symbol while reading decimal");
+ return false;
+ }
+ stop = true;
+ continue;
+ }
+ ++buf.position();
+ }
+
+ x *= sign;
+ return true;
+}
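+
+/// Example for readDigits: '00123.4500' yields x = 12345, digits = 5, exponent = -2
+/// (trailing zeros after the point are committed only if a non-zero digit follows),
+/// i.e. the value is 12345 * 10^-2 = 123.45.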
+
+template <typename T, typename ReturnType=void>
+inline ReturnType readDecimalText(ReadBuffer & buf, T & x, uint32_t precision, uint32_t & scale, bool digits_only = false)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ uint32_t digits = precision;
+ int32_t exponent;
+ auto ok = readDigits<throw_exception>(buf, x, digits, exponent, digits_only);
+
+ if (!throw_exception && !ok)
+ return ReturnType(false);
+
+ if (static_cast<int32_t>(digits) + exponent > static_cast<int32_t>(precision - scale))
+ {
+ if constexpr (throw_exception)
+ {
+ static constexpr auto pattern = "Decimal value is too big: {} digits were read: {}e{}."
+ " Expected to read decimal with scale {} and precision {}";
+
+ if constexpr (is_big_int_v<typename T::NativeType>)
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, pattern, digits, x.value, exponent, scale, precision);
+ else
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, pattern, digits, x, exponent, scale, precision);
+ }
+ else
+ return ReturnType(false);
+ }
+
+ if (static_cast<int32_t>(scale) + exponent < 0)
+ {
+ auto divisor_exp = -exponent - static_cast<int32_t>(scale);
+
+ if (divisor_exp >= std::numeric_limits<typename T::NativeType>::digits10)
+ {
+ /// Too big negative exponent
+ x.value = 0;
+ scale = 0;
+ return ReturnType(true);
+ }
+ else
+ {
+ /// Too many digits after point. Just cut off excessive digits.
+ auto divisor = intExp10OfSize<typename T::NativeType>(divisor_exp);
+ assert(divisor > 0); /// This is for Clang Static Analyzer. It is not smart enough to infer it automatically.
+ x.value /= divisor;
+ scale = 0;
+ return ReturnType(true);
+ }
+ }
+
+ scale += exponent;
+ return ReturnType(true);
+}
+
+template <typename T>
+inline bool tryReadDecimalText(ReadBuffer & buf, T & x, uint32_t precision, uint32_t & scale)
+{
+ return readDecimalText<T, bool>(buf, x, precision, scale, true);
+}
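+
+/// Usage sketch (illustrative; the rescaling step mirrors what callers typically do):
+///
+///     ReadBufferFromString in("123.45");
+///     Decimal64 x(0);
+///     UInt32 unread_scale = 2;                   /// target scale
+///     readDecimalText(in, x, 18, unread_scale);  /// x.value == 12345, unread_scale == 0
+///     x.value *= intExp10OfSize<Decimal64::NativeType>(unread_scale);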
+
+template <typename T>
+inline void readCSVDecimalText(ReadBuffer & buf, T & x, uint32_t precision, uint32_t & scale)
+{
+ if (buf.eof())
+ throwReadAfterEOF();
+
+ char maybe_quote = *buf.position();
+
+ if (maybe_quote == '\'' || maybe_quote == '\"')
+ ++buf.position();
+
+ readDecimalText(buf, x, precision, scale, false);
+
+ if (maybe_quote == '\'' || maybe_quote == '\"')
+ assertChar(maybe_quote, buf);
+}
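+
+/// E.g. the CSV fields 123.45, '123.45' and "123.45" all parse to the same decimal;
+/// a missing closing quote makes assertChar throw.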
+
+}
diff --git a/contrib/clickhouse/src/IO/readFloatText.cpp b/contrib/clickhouse/src/IO/readFloatText.cpp
new file mode 100644
index 0000000000..d1143f7c62
--- /dev/null
+++ b/contrib/clickhouse/src/IO/readFloatText.cpp
@@ -0,0 +1,70 @@
+#include <IO/readFloatText.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
+}
+
+/** Must successfully parse inf, INF and Infinity.
+ * Any other mixed-case variant is also accepted, for simplicity.
+ */
+bool parseInfinity(ReadBuffer & buf)
+{
+ if (!checkStringCaseInsensitive("inf", buf))
+ return false;
+
+ /// Just inf.
+ if (buf.eof() || !isWordCharASCII(*buf.position()))
+ return true;
+
+ /// If word characters after inf, it should be infinity.
+ return checkStringCaseInsensitive("inity", buf);
+}
+
+
+/** Must successfully parse nan, NAN and NaN.
+ * Any other mixed-case variant is also accepted, for simplicity.
+ */
+bool parseNaN(ReadBuffer & buf)
+{
+ return checkStringCaseInsensitive("nan", buf);
+}
+
+
+void assertInfinity(ReadBuffer & buf)
+{
+ if (!parseInfinity(buf))
+ throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "Cannot parse infinity.");
+}
+
+void assertNaN(ReadBuffer & buf)
+{
+ if (!parseNaN(buf))
+ throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "Cannot parse NaN.");
+}
+
+
+template void readFloatTextPrecise<Float32>(Float32 &, ReadBuffer &);
+template void readFloatTextPrecise<Float64>(Float64 &, ReadBuffer &);
+template bool tryReadFloatTextPrecise<Float32>(Float32 &, ReadBuffer &);
+template bool tryReadFloatTextPrecise<Float64>(Float64 &, ReadBuffer &);
+
+template void readFloatTextFast<Float32>(Float32 &, ReadBuffer &);
+template void readFloatTextFast<Float64>(Float64 &, ReadBuffer &);
+template bool tryReadFloatTextFast<Float32>(Float32 &, ReadBuffer &);
+template bool tryReadFloatTextFast<Float64>(Float64 &, ReadBuffer &);
+
+template void readFloatTextSimple<Float32>(Float32 &, ReadBuffer &);
+template void readFloatTextSimple<Float64>(Float64 &, ReadBuffer &);
+template bool tryReadFloatTextSimple<Float32>(Float32 &, ReadBuffer &);
+template bool tryReadFloatTextSimple<Float64>(Float64 &, ReadBuffer &);
+
+template void readFloatText<Float32>(Float32 &, ReadBuffer &);
+template void readFloatText<Float64>(Float64 &, ReadBuffer &);
+template bool tryReadFloatText<Float32>(Float32 &, ReadBuffer &);
+template bool tryReadFloatText<Float64>(Float64 &, ReadBuffer &);
+
+}
diff --git a/contrib/clickhouse/src/IO/readFloatText.h b/contrib/clickhouse/src/IO/readFloatText.h
new file mode 100644
index 0000000000..da4719b8dc
--- /dev/null
+++ b/contrib/clickhouse/src/IO/readFloatText.h
@@ -0,0 +1,596 @@
+#pragma once
+#include <type_traits>
+#include <IO/ReadHelpers.h>
+#include <Core/Defines.h>
+#include <base/shift10.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <double-conversion/double-conversion.h>
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
+#endif
+#include <fast_float/fast_float.h>
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+/** Methods for reading floating point numbers from text with decimal representation.
+ * There are "precise", "fast" and "simple" implementations.
+ *
+ * None of the methods supports hexadecimal numbers (0xABC), binary exponents (1p100) or a leading plus sign.
+ *
+ * The precise method always returns the machine-representable number closest to the input.
+ *
+ * The fast method is faster (up to 3 times) and usually returns the same value,
+ * but in rare cases the result may differ from the precise method by the least significant bit (for Float32)
+ * and by up to two least significant bits (for Float64).
+ * The fast method may also parse some garbage input as other, unspecified garbage.
+ *
+ * The simple method is a little faster when parsing short (few-digit) integers, but is less precise and slower in other cases.
+ * It is not recommended and is kept only for reference.
+ *
+ * For performance test, look at 'read_float_perf' test.
+ *
+ * For precision test.
+ * Parse all existing Float32 numbers:
+
+CREATE TABLE test.floats ENGINE = Log AS SELECT reinterpretAsFloat32(reinterpretAsString(toUInt32(number))) AS x FROM numbers(0x100000000);
+
+WITH
+ toFloat32(toString(x)) AS y,
+ reinterpretAsUInt32(reinterpretAsString(x)) AS bin_x,
+ reinterpretAsUInt32(reinterpretAsString(y)) AS bin_y,
+ abs(bin_x - bin_y) AS diff
+SELECT
+ diff,
+ count()
+FROM test.floats
+WHERE NOT isNaN(x)
+GROUP BY diff
+ORDER BY diff ASC
+LIMIT 100
+
+ * Here are the results:
+ *
+ Precise:
+ ┌─diff─┬────count()─┐
+ │ 0 │ 4278190082 │
+ └──────┴────────────┘
+ (100% roundtrip property)
+
+ Fast:
+ ┌─diff─┬────count()─┐
+ │ 0 │ 3685260580 │
+ │ 1 │ 592929502 │
+ └──────┴────────────┘
+ (The difference is 1 in least significant bit in 13.8% of numbers.)
+
+ Simple:
+ ┌─diff─┬────count()─┐
+ │ 0 │ 2169879994 │
+ │ 1 │ 1807178292 │
+ │ 2 │ 269505944 │
+ │ 3 │ 28826966 │
+ │ 4 │ 2566488 │
+ │ 5 │ 212878 │
+ │ 6 │ 18276 │
+ │ 7 │ 1214 │
+ │ 8 │ 30 │
+ └──────┴────────────┘
+
+ * Parse random Float64 numbers:
+
+WITH
+ rand64() AS bin_x,
+ reinterpretAsFloat64(reinterpretAsString(bin_x)) AS x,
+ toFloat64(toString(x)) AS y,
+ reinterpretAsUInt64(reinterpretAsString(y)) AS bin_y,
+ abs(bin_x - bin_y) AS diff
+SELECT
+ diff,
+ count()
+FROM numbers(100000000)
+WHERE NOT isNaN(x)
+GROUP BY diff
+ORDER BY diff ASC
+LIMIT 100
+
+ */
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_PARSE_NUMBER;
+}
+
+
+/// Returns true, iff parsed.
+bool parseInfinity(ReadBuffer & buf);
+bool parseNaN(ReadBuffer & buf);
+
+void assertInfinity(ReadBuffer & buf);
+void assertNaN(ReadBuffer & buf);
+
+
+template <bool throw_exception>
+bool assertOrParseInfinity(ReadBuffer & buf)
+{
+ if constexpr (throw_exception)
+ {
+ assertInfinity(buf);
+ return true;
+ }
+ else
+ return parseInfinity(buf);
+}
+
+template <bool throw_exception>
+bool assertOrParseNaN(ReadBuffer & buf)
+{
+ if constexpr (throw_exception)
+ {
+ assertNaN(buf);
+ return true;
+ }
+ else
+ return parseNaN(buf);
+}
+
+
+template <typename T, typename ReturnType>
+ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf)
+{
+ static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextPreciseImpl must be float or double");
+ static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII");
+
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+    /// Fast path (avoids copying) if the buffer has at least MAX_LENGTH bytes.
+ static constexpr int MAX_LENGTH = 316;
+
+ if (likely(!buf.eof() && buf.position() + MAX_LENGTH <= buf.buffer().end()))
+ {
+ auto * initial_position = buf.position();
+ auto res = fast_float::from_chars(initial_position, buf.buffer().end(), x);
+
+ if (unlikely(res.ec != std::errc()))
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value");
+ else
+ return ReturnType(false);
+ }
+
+ buf.position() += res.ptr - initial_position;
+
+ return ReturnType(true);
+ }
+ else
+ {
+        /// Slow path. Copy the characters that may be part of a floating point number into a temporary buffer.
+ bool negative = false;
+
+        /// We check for eof here because we may be parsing '+inf' or '+nan'.
+ while (!buf.eof())
+ {
+ switch (*buf.position())
+ {
+ case '+':
+ ++buf.position();
+ continue;
+
+ case '-':
+ {
+ negative = true;
+ ++buf.position();
+ continue;
+ }
+
+ case 'i': [[fallthrough]];
+ case 'I':
+ {
+ if (assertOrParseInfinity<throw_exception>(buf))
+ {
+ x = std::numeric_limits<T>::infinity();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+
+ case 'n': [[fallthrough]];
+ case 'N':
+ {
+ if (assertOrParseNaN<throw_exception>(buf))
+ {
+ x = std::numeric_limits<T>::quiet_NaN();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+
+ default:
+ break;
+ }
+
+ break;
+ }
+
+
+ char tmp_buf[MAX_LENGTH];
+ int num_copied_chars = 0;
+
+ while (!buf.eof() && num_copied_chars < MAX_LENGTH)
+ {
+ char c = *buf.position();
+ if (!(isNumericASCII(c) || c == '-' || c == '+' || c == '.' || c == 'e' || c == 'E'))
+ break;
+
+ tmp_buf[num_copied_chars] = c;
+ ++buf.position();
+ ++num_copied_chars;
+ }
+
+ auto res = fast_float::from_chars(tmp_buf, tmp_buf + num_copied_chars, x);
+
+ if (unlikely(res.ec != std::errc()))
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value");
+ else
+ return ReturnType(false);
+ }
+
+ if (negative)
+ x = -x;
+
+ return ReturnType(true);
+ }
+}
+
+
+// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
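+// Each of the 8 bytes must be an ASCII digit (0x30..0x39): the first mask requires the
+// high nibble of every byte to be 3, and adding 0x06 makes any byte above '9' carry into
+// its high nibble, which the second (shifted) mask then exposes.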
+static inline bool is_made_of_eight_digits_fast(uint64_t val) noexcept
+{
+ return (((val & 0xF0F0F0F0F0F0F0F0) | (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == 0x3333333333333333);
+}
+
+static inline bool is_made_of_eight_digits_fast(const char * chars) noexcept
+{
+ uint64_t val;
+ ::memcpy(&val, chars, 8);
+ return is_made_of_eight_digits_fast(val);
+}
+
+template <size_t N, typename T>
+static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf)
+{
+    /// In the optimistic case we can skip bounds checking for the first loop.
+ if (buf.position() + N <= buf.buffer().end())
+ {
+ for (size_t i = 0; i < N; ++i)
+ {
+ if (isNumericASCII(*buf.position()))
+ {
+ x *= 10;
+ x += *buf.position() & 0x0F;
+ ++buf.position();
+ }
+ else
+ return;
+ }
+ }
+ else
+ {
+ for (size_t i = 0; i < N; ++i)
+ {
+ if (!buf.eof() && isNumericASCII(*buf.position()))
+ {
+ x *= 10;
+ x += *buf.position() & 0x0F;
+ ++buf.position();
+ }
+ else
+ return;
+ }
+ }
+
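+    /// Skip any remaining digits (beyond N significant ones) without accumulating them,
+    /// 8 bytes at a time using the SWAR check above, then byte by byte.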
+ while (!buf.eof() && (buf.position() + 8 <= buf.buffer().end()) &&
+ is_made_of_eight_digits_fast(buf.position()))
+ {
+ buf.position() += 8;
+ }
+
+ while (!buf.eof() && isNumericASCII(*buf.position()))
+ ++buf.position();
+}
+
+
+template <typename T, typename ReturnType>
+ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
+{
+ static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
+ static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII");
+
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ bool negative = false;
+ x = 0;
+ UInt64 before_point = 0;
+ UInt64 after_point = 0;
+ int after_point_exponent = 0;
+ int exponent = 0;
+
+ if (in.eof())
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value");
+ else
+ return false;
+ }
+
+ if (*in.position() == '-')
+ {
+ negative = true;
+ ++in.position();
+ }
+ else if (*in.position() == '+')
+ ++in.position();
+
+ auto count_after_sign = in.count();
+
+ constexpr int significant_digits = std::numeric_limits<UInt64>::digits10;
+ readUIntTextUpToNSignificantDigits<significant_digits>(before_point, in);
+
+ size_t read_digits = in.count() - count_after_sign;
+
+ if (unlikely(read_digits > significant_digits))
+ {
+ int before_point_additional_exponent = static_cast<int>(read_digits) - significant_digits;
+ x = static_cast<T>(shift10(before_point, before_point_additional_exponent));
+ }
+ else
+ {
+ x = before_point;
+
+        /// Shortcut for the common case when there is an integer that fits in Int64.
+ if (read_digits && (in.eof() || *in.position() < '.'))
+ {
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ }
+
+ if (checkChar('.', in))
+ {
+ auto after_point_count = in.count();
+
+ while (!in.eof() && *in.position() == '0')
+ ++in.position();
+
+ auto after_leading_zeros_count = in.count();
+ int after_point_num_leading_zeros = static_cast<int>(after_leading_zeros_count - after_point_count);
+
+ readUIntTextUpToNSignificantDigits<significant_digits>(after_point, in);
+ read_digits = in.count() - after_leading_zeros_count;
+ after_point_exponent = (read_digits > significant_digits ? -significant_digits : static_cast<int>(-read_digits)) - after_point_num_leading_zeros;
+ }
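+    /// E.g. for '0.00123': two leading zeros are skipped, after_point = 123, read_digits = 3,
+    /// so after_point_exponent = -3 - 2 = -5 and the fractional contribution is 123 * 10^-5.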
+
+ if (checkChar('e', in) || checkChar('E', in))
+ {
+ if (in.eof())
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent");
+ else
+ return false;
+ }
+
+ bool exponent_negative = false;
+ if (*in.position() == '-')
+ {
+ exponent_negative = true;
+ ++in.position();
+ }
+ else if (*in.position() == '+')
+ {
+ ++in.position();
+ }
+
+ readUIntTextUpToNSignificantDigits<4>(exponent, in);
+ if (exponent_negative)
+ exponent = -exponent;
+ }
+
+ if (after_point)
+ x += static_cast<T>(shift10(after_point, after_point_exponent));
+
+ if (exponent)
+ x = static_cast<T>(shift10(x, exponent));
+
+ if (negative)
+ x = -x;
+
+ auto num_characters_without_sign = in.count() - count_after_sign;
+
+    /// Special values such as inf/nan. At most one character was consumed before them, and it is '-'.
+ if (num_characters_without_sign == 0)
+ {
+ if (in.eof())
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: no digits read");
+ else
+ return false;
+ }
+
+ if (*in.position() == '+')
+ {
+ ++in.position();
+ if (in.eof())
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after plus sign");
+ else
+ return false;
+ }
+ else if (negative)
+ {
+ if constexpr (throw_exception)
+ throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: plus after minus sign");
+ else
+ return false;
+ }
+ }
+
+ if (*in.position() == 'i' || *in.position() == 'I')
+ {
+ if (assertOrParseInfinity<throw_exception>(in))
+ {
+ x = std::numeric_limits<T>::infinity();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+ else if (*in.position() == 'n' || *in.position() == 'N')
+ {
+ if (assertOrParseNaN<throw_exception>(in))
+ {
+ x = std::numeric_limits<T>::quiet_NaN();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+ }
+
+ return ReturnType(true);
+}
+
+template <typename T, typename ReturnType>
+ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf)
+{
+ static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+ bool negative = false;
+ x = 0;
+ bool after_point = false;
+ T power_of_ten = 1;
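+    /// Note: repeatedly dividing power_of_ten by 10 in binary floating point accumulates
+    /// rounding error; this is the main source of this method's imprecision
+    /// (see the comparison in the header comment of this file).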
+
+ if (buf.eof())
+ throwReadAfterEOF();
+
+ while (!buf.eof())
+ {
+ switch (*buf.position())
+ {
+ case '+':
+ break;
+ case '-':
+ negative = true;
+ break;
+ case '.':
+ after_point = true;
+ break;
+ case '0': [[fallthrough]];
+ case '1': [[fallthrough]];
+ case '2': [[fallthrough]];
+ case '3': [[fallthrough]];
+ case '4': [[fallthrough]];
+ case '5': [[fallthrough]];
+ case '6': [[fallthrough]];
+ case '7': [[fallthrough]];
+ case '8': [[fallthrough]];
+ case '9':
+ if (after_point)
+ {
+ power_of_ten /= 10;
+ x += (*buf.position() - '0') * power_of_ten;
+ }
+ else
+ {
+ x *= 10;
+ x += *buf.position() - '0';
+ }
+ break;
+ case 'e': [[fallthrough]];
+ case 'E':
+ {
+ ++buf.position();
+ Int32 exponent = 0;
+ readIntText(exponent, buf);
+ x = shift10(x, exponent);
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+
+ case 'i': [[fallthrough]];
+ case 'I':
+ {
+ if (assertOrParseInfinity<throw_exception>(buf))
+ {
+ x = std::numeric_limits<T>::infinity();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+
+ case 'n': [[fallthrough]];
+ case 'N':
+ {
+ if (assertOrParseNaN<throw_exception>(buf))
+ {
+ x = std::numeric_limits<T>::quiet_NaN();
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ return ReturnType(false);
+ }
+
+ default:
+ {
+ if (negative)
+ x = -x;
+ return ReturnType(true);
+ }
+ }
+ ++buf.position();
+ }
+
+ if (negative)
+ x = -x;
+
+ return ReturnType(true);
+}
+
+template <typename T> void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl<T, void>(x, in); }
+template <typename T> bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl<T, bool>(x, in); }
+
+template <typename T> void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl<T, void>(x, in); }
+template <typename T> bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool>(x, in); }
+
+template <typename T> void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl<T, void>(x, in); }
+template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl<T, bool>(x, in); }
+
+
+/// Implementation that is selected as default.
+
+template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); }
+template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); }
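+
+/// Usage sketch (illustrative):
+///
+///     ReadBufferFromString in("3.14159e2");
+///     Float64 x = 0;
+///     if (tryReadFloatText(x, in))  /// readFloatText would throw on malformed input
+///     {
+///         /// x is 314.159 (modulo the precision caveats above)
+///     }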
+
+}