summaryrefslogtreecommitdiffstats
path: root/library/cpp/on_disk
diff options
context:
space:
mode:
authorrobot-piglet <[email protected]>2023-12-02 01:45:21 +0300
committerrobot-piglet <[email protected]>2023-12-02 02:42:50 +0300
commit9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c (patch)
tree9f88a486917d371d099cd712efd91b4c122d209d /library/cpp/on_disk
parent32fb6dda1feb24f9ab69ece5df0cb9ec238ca5e6 (diff)
Intermediate changes
Diffstat (limited to 'library/cpp/on_disk')
-rw-r--r--library/cpp/on_disk/multi_blob/multiblob.cpp67
-rw-r--r--library/cpp/on_disk/multi_blob/multiblob.h77
-rw-r--r--library/cpp/on_disk/multi_blob/multiblob_builder.cpp146
-rw-r--r--library/cpp/on_disk/multi_blob/multiblob_builder.h64
-rw-r--r--library/cpp/on_disk/multi_blob/ya.make13
-rw-r--r--library/cpp/on_disk/st_hash/fake.cpp4
-rw-r--r--library/cpp/on_disk/st_hash/save_stl.h84
-rw-r--r--library/cpp/on_disk/st_hash/static_hash.h420
-rw-r--r--library/cpp/on_disk/st_hash/static_hash_map.h59
-rw-r--r--library/cpp/on_disk/st_hash/sthash_iterators.h334
-rw-r--r--library/cpp/on_disk/st_hash/ya.make15
11 files changed, 1283 insertions, 0 deletions
diff --git a/library/cpp/on_disk/multi_blob/multiblob.cpp b/library/cpp/on_disk/multi_blob/multiblob.cpp
new file mode 100644
index 00000000000..d92b31e6135
--- /dev/null
+++ b/library/cpp/on_disk/multi_blob/multiblob.cpp
@@ -0,0 +1,67 @@
+#include <util/generic/yexception.h>
+#include <util/system/align.h>
+
+#include <library/cpp/on_disk/chunks/reader.h>
+
+#include "multiblob.h"
+
+void TSubBlobs::ReadMultiBlob(const TBlob& multi) {
+ if (multi.Size() < sizeof(TMultiBlobHeader)) {
+ ythrow yexception() << "not a blob, too small";
+ }
+
+ Multi = multi;
+ memcpy((void*)&Header, Multi.Data(), sizeof(TMultiBlobHeader));
+
+ if (Header.BlobMetaSig != BLOBMETASIG) {
+ if (Header.BlobRecordSig != TMultiBlobHeader::RecordSig) {
+ if (ReadChunkedData(multi))
+ return;
+ }
+ ythrow yexception() << "is not a blob, MetaSig was read: "
+ << Header.BlobMetaSig
+ << ", must be" << BLOBMETASIG;
+ }
+
+ if (Header.BlobRecordSig != TMultiBlobHeader::RecordSig)
+ ythrow yexception() << "unknown multiblob RecordSig "
+ << Header.BlobRecordSig;
+
+ reserve(size() + Header.Count);
+ if (Header.Flags & EMF_INTERLAY) {
+ size_t pos = Header.HeaderSize();
+ for (size_t i = 0; i < Header.Count; ++i) {
+ pos = AlignUp<ui64>(pos, sizeof(ui64));
+ ui64 size = *((ui64*)((const char*)multi.Data() + pos));
+ pos = AlignUp<ui64>(pos + sizeof(ui64), Header.Align);
+ push_back(multi.SubBlob(pos, pos + size));
+ pos += size;
+ }
+ } else {
+ const ui64* sizes = Header.Sizes(multi.Data());
+ size_t pos = Header.HeaderSize() + Header.Count * sizeof(ui64);
+ for (size_t i = 0; i < Header.Count; ++i) {
+ pos = AlignUp<ui64>(pos, Header.Align);
+ push_back(multi.SubBlob(pos, pos + *sizes));
+ pos += *sizes;
+ sizes++;
+ }
+ }
+}
+
+bool TSubBlobs::ReadChunkedData(const TBlob& multi) noexcept {
+ Multi = multi;
+ memset((void*)&Header, 0, sizeof(Header));
+
+ TChunkedDataReader reader(Multi);
+ Header.Count = reader.GetBlocksCount();
+ resize(GetHeader()->Count);
+ for (size_t i = 0; i < size(); ++i)
+ // We can use TBlob::NoCopy() because of reader.GetBlock(i) returns
+ // address into memory of multi blob.
+ // This knowledge was acquired from implementation of
+ // TChunkedDataReader, so we need care about any changes that.
+ (*this)[i] = TBlob::NoCopy(reader.GetBlock(i), reader.GetBlockLen(i));
+ Header.Flags |= EMF_CHUNKED_DATA_READER;
+ return true;
+}
diff --git a/library/cpp/on_disk/multi_blob/multiblob.h b/library/cpp/on_disk/multi_blob/multiblob.h
new file mode 100644
index 00000000000..b40a5ae6af9
--- /dev/null
+++ b/library/cpp/on_disk/multi_blob/multiblob.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include <util/generic/vector.h>
+#include <util/memory/blob.h>
+
+#define BLOBMETASIG 0x3456789Au
+
+enum E_Multiblob_Flags {
+ // if EMF_INTERLAY is clear
+ // multiblob format
+ // HeaderSize() bytes for TMultiBlobHeader
+ // Count*sizeof(ui64) bytes for blob sizes
+ // blob1
+ // (alignment)
+ // blob2
+ // (alignment)
+ // ...
+ // (alignment)
+ // blobn
+ // if EMF_INTERLAY is set
+ // multiblob format
+ // HeaderSize() bytes for TMultiBlobHeader
+ // size1 ui64, the size of 1st blob
+ // blob1
+ // (alignment)
+ // size2 ui64, the size of 2nd blob
+ // blob2
+ // (alignment)
+ // ...
+ // (alignment)
+ // sizen ui64, the size of n'th blob
+ // blobn
+ EMF_INTERLAY = 1,
+
+ // Means that multiblob contains blocks in TChunkedDataReader format
+ // Legacy, use it only for old files, created for TChunkedDataReader
+ EMF_CHUNKED_DATA_READER = 2,
+
+ // Flags that may be configured for blobbuilder in client code
+ EMF_WRITEABLE = EMF_INTERLAY,
+};
+
+struct TMultiBlobHeader {
+ // data
+ ui32 BlobMetaSig;
+ ui32 BlobRecordSig;
+ ui64 Count; // count of sub blobs
+ ui32 Align; // alignment for every subblob
+ ui32 Flags;
+ static const ui32 RecordSig = 0x23456789;
+ static inline size_t HeaderSize() {
+ return 4 * sizeof(ui64);
+ }
+ inline const ui64* Sizes(const void* Data) const {
+ return (const ui64*)((const char*)Data + HeaderSize());
+ }
+};
+
+class TSubBlobs: public TVector<TBlob> {
+public:
+ TSubBlobs() {
+ }
+ TSubBlobs(const TBlob& multi) {
+ ReadMultiBlob(multi);
+ }
+ void ReadMultiBlob(const TBlob& multi);
+ const TMultiBlobHeader* GetHeader() const {
+ return (const TMultiBlobHeader*)&Header;
+ }
+
+protected:
+ TMultiBlobHeader Header;
+ TBlob Multi;
+
+private:
+ bool ReadChunkedData(const TBlob& multi) noexcept;
+};
diff --git a/library/cpp/on_disk/multi_blob/multiblob_builder.cpp b/library/cpp/on_disk/multi_blob/multiblob_builder.cpp
new file mode 100644
index 00000000000..44aa4a6c2fd
--- /dev/null
+++ b/library/cpp/on_disk/multi_blob/multiblob_builder.cpp
@@ -0,0 +1,146 @@
+#include <util/memory/tempbuf.h>
+#include <util/system/align.h>
+
+#include "multiblob_builder.h"
+
+/*
+ * TBlobSaverMemory
+ */
+TBlobSaverMemory::TBlobSaverMemory(const void* ptr, size_t size)
+ : Blob(TBlob::NoCopy(ptr, size))
+{
+}
+
+TBlobSaverMemory::TBlobSaverMemory(const TBlob& blob)
+ : Blob(blob)
+{
+}
+
+void TBlobSaverMemory::Save(IOutputStream& output, ui32 /*flags*/) {
+ output.Write((void*)Blob.Data(), Blob.Length());
+}
+
+size_t TBlobSaverMemory::GetLength() {
+ return Blob.Length();
+}
+
+/*
+ * TBlobSaverFile
+ */
+
+TBlobSaverFile::TBlobSaverFile(TFile file)
+ : File(file)
+{
+ Y_ASSERT(File.IsOpen());
+}
+
+TBlobSaverFile::TBlobSaverFile(const char* filename, EOpenMode oMode)
+ : File(filename, oMode)
+{
+ Y_ASSERT(File.IsOpen());
+}
+
+void TBlobSaverFile::Save(IOutputStream& output, ui32 /*flags*/) {
+ TTempBuf buffer(1 << 20);
+ while (size_t size = File.Read((void*)buffer.Data(), buffer.Size()))
+ output.Write((void*)buffer.Data(), size);
+}
+
+size_t TBlobSaverFile::GetLength() {
+ return File.GetLength();
+}
+
+/*
+ * TMultiBlobBuilder
+ */
+
+TMultiBlobBuilder::TMultiBlobBuilder(bool isOwn)
+ : IsOwner(isOwn)
+{
+}
+
+TMultiBlobBuilder::~TMultiBlobBuilder() {
+ if (IsOwner)
+ DeleteSubBlobs();
+}
+
+namespace {
+ ui64 PadToAlign(IOutputStream& output, ui64 fromPos, ui32 align) {
+ ui64 toPos = AlignUp<ui64>(fromPos, align);
+ for (; fromPos < toPos; ++fromPos) {
+ output << (char)0;
+ }
+ return toPos;
+ }
+}
+
+void TMultiBlobBuilder::Save(IOutputStream& output, ui32 flags) {
+ TMultiBlobHeader header;
+ memset((void*)&header, 0, sizeof(header));
+ header.BlobMetaSig = BLOBMETASIG;
+ header.BlobRecordSig = TMultiBlobHeader::RecordSig;
+ header.Count = Blobs.size();
+ header.Align = ALIGN;
+ header.Flags = flags & EMF_WRITEABLE;
+ output.Write((void*)&header, sizeof(header));
+ for (size_t i = sizeof(header); i < header.HeaderSize(); ++i)
+ output << (char)0;
+ ui64 pos = header.HeaderSize();
+ if (header.Flags & EMF_INTERLAY) {
+ for (size_t i = 0; i < Blobs.size(); ++i) {
+ ui64 size = Blobs[i]->GetLength();
+ pos = PadToAlign(output, pos, sizeof(ui64)); // Align size record
+ output.Write((void*)&size, sizeof(ui64));
+ pos = PadToAlign(output, pos + sizeof(ui64), header.Align); // Align blob
+ Blobs[i]->Save(output, header.Flags);
+ pos += size;
+ }
+ } else {
+ for (size_t i = 0; i < Blobs.size(); ++i) {
+ ui64 size = Blobs[i]->GetLength();
+ output.Write((void*)&size, sizeof(ui64));
+ }
+ pos += Blobs.size() * sizeof(ui64);
+ for (size_t i = 0; i < Blobs.size(); ++i) {
+ pos = PadToAlign(output, pos, header.Align);
+ Blobs[i]->Save(output, header.Flags);
+ pos += Blobs[i]->GetLength();
+ }
+ }
+ // Compensate for imprecise size
+ for (ui64 len = GetLength(); pos < len; ++pos) {
+ output << (char)0;
+ }
+}
+
+size_t TMultiBlobBuilder::GetLength() {
+ // Sizes may be diferent with and without EMF_INTERLAY, so choose greater of 2
+ size_t resNonInter = TMultiBlobHeader::HeaderSize() + Blobs.size() * sizeof(ui64);
+ size_t resInterlay = TMultiBlobHeader::HeaderSize();
+ for (size_t i = 0; i < Blobs.size(); ++i) {
+ resInterlay = AlignUp<ui64>(resInterlay, sizeof(ui64)) + sizeof(ui64);
+ resInterlay = AlignUp<ui64>(resInterlay, ALIGN) + Blobs[i]->GetLength();
+ resNonInter = AlignUp<ui64>(resNonInter, ALIGN) + Blobs[i]->GetLength();
+ }
+ resInterlay = AlignUp<ui64>(resInterlay, ALIGN);
+ resNonInter = AlignUp<ui64>(resNonInter, ALIGN);
+ return Max(resNonInter, resInterlay);
+}
+
+TMultiBlobBuilder::TSavers& TMultiBlobBuilder::GetBlobs() {
+ return Blobs;
+}
+
+const TMultiBlobBuilder::TSavers& TMultiBlobBuilder::GetBlobs() const {
+ return Blobs;
+}
+
+void TMultiBlobBuilder::AddBlob(IBlobSaverBase* blob) {
+ Blobs.push_back(blob);
+}
+
+void TMultiBlobBuilder::DeleteSubBlobs() {
+ for (size_t i = 0; i < Blobs.size(); ++i)
+ delete Blobs[i];
+ Blobs.clear();
+}
diff --git a/library/cpp/on_disk/multi_blob/multiblob_builder.h b/library/cpp/on_disk/multi_blob/multiblob_builder.h
new file mode 100644
index 00000000000..a8e3c6d35ee
--- /dev/null
+++ b/library/cpp/on_disk/multi_blob/multiblob_builder.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <util/system/align.h>
+#include <util/stream/output.h>
+#include <util/stream/file.h>
+#include <util/draft/holder_vector.h>
+
+#include "multiblob.h"
+
+class IBlobSaverBase {
+public:
+ virtual ~IBlobSaverBase() {
+ }
+ virtual void Save(IOutputStream& output, ui32 flags = 0) = 0;
+ virtual size_t GetLength() = 0;
+};
+
+inline void MultiBlobSave(IOutputStream& output, IBlobSaverBase& saver) {
+ saver.Save(output);
+}
+
+class TBlobSaverMemory: public IBlobSaverBase {
+public:
+ TBlobSaverMemory(const void* ptr, size_t size);
+ TBlobSaverMemory(const TBlob& blob);
+ void Save(IOutputStream& output, ui32 flags = 0) override;
+ size_t GetLength() override;
+
+private:
+ TBlob Blob;
+};
+
+class TBlobSaverFile: public IBlobSaverBase {
+public:
+ TBlobSaverFile(TFile file);
+ TBlobSaverFile(const char* filename, EOpenMode oMode = RdOnly);
+ void Save(IOutputStream& output, ui32 flags = 0) override;
+ size_t GetLength() override;
+
+protected:
+ TFile File;
+};
+
+class TMultiBlobBuilder: public IBlobSaverBase {
+protected:
+ // Data will be stored with default alignment DEVTOOLS-4548
+ static const size_t ALIGN = 16;
+
+public:
+ typedef TVector<IBlobSaverBase*> TSavers;
+
+ TMultiBlobBuilder(bool isOwn = true);
+ ~TMultiBlobBuilder() override;
+ void Save(IOutputStream& output, ui32 flags = 0) override;
+ size_t GetLength() override;
+ TSavers& GetBlobs();
+ const TSavers& GetBlobs() const;
+ void AddBlob(IBlobSaverBase* blob);
+ void DeleteSubBlobs();
+
+protected:
+ TSavers Blobs;
+ bool IsOwner;
+};
diff --git a/library/cpp/on_disk/multi_blob/ya.make b/library/cpp/on_disk/multi_blob/ya.make
new file mode 100644
index 00000000000..50615fc9012
--- /dev/null
+++ b/library/cpp/on_disk/multi_blob/ya.make
@@ -0,0 +1,13 @@
+LIBRARY()
+
+SRCS(
+ multiblob.cpp
+ multiblob_builder.cpp
+)
+
+PEERDIR(
+ library/cpp/on_disk/chunks
+ util/draft
+)
+
+END()
diff --git a/library/cpp/on_disk/st_hash/fake.cpp b/library/cpp/on_disk/st_hash/fake.cpp
new file mode 100644
index 00000000000..ef5af4d432b
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/fake.cpp
@@ -0,0 +1,4 @@
+#include "save_stl.h"
+#include "static_hash.h"
+#include "static_hash_map.h"
+#include "sthash_iterators.h"
diff --git a/library/cpp/on_disk/st_hash/save_stl.h b/library/cpp/on_disk/st_hash/save_stl.h
new file mode 100644
index 00000000000..00f8f0e20db
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/save_stl.h
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <util/generic/hash.h>
+#include <util/system/yassert.h>
+#include <util/stream/output.h>
+
+// this structure might be replaced with sthashtable class
+template <class HF, class Eq, class size_type>
+struct sthashtable_nvm_sv {
+ sthashtable_nvm_sv() {
+ if (sizeof(sthashtable_nvm_sv) != sizeof(HF) + sizeof(Eq) + 3 * sizeof(size_type)) {
+ memset(this, 0, sizeof(sthashtable_nvm_sv));
+ }
+ }
+
+ sthashtable_nvm_sv(const HF& phf, const Eq& peq, const size_type& pnb, const size_type& pne, const size_type& pnd)
+ : sthashtable_nvm_sv()
+ {
+ hf = phf;
+ eq = peq;
+ num_buckets = pnb;
+ num_elements = pne;
+ data_end_off = pnd;
+ }
+
+ HF hf;
+ Eq eq;
+ size_type num_buckets;
+ size_type num_elements;
+ size_type data_end_off;
+};
+
+/**
+ * Some hack to save both THashMap and sthash.
+ * Working with stHash does not depend on the template parameters, because the content of stHash is not used inside this method.
+ */
+template <class V, class K, class HF, class Ex, class Eq, class A>
+template <class KeySaver>
+inline int THashTable<V, K, HF, Ex, Eq, A>::save_for_st(IOutputStream* stream, KeySaver& ks, sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* stHash) const {
+ Y_ASSERT(!stHash || stHash->bucket_count() == bucket_count());
+ typedef sthashtable_nvm_sv<HF, Eq, typename KeySaver::TSizeType> sv_type;
+ sv_type sv = {this->_get_hash_fun(), this->_get_key_eq(), static_cast<typename KeySaver::TSizeType>(buckets.size()), static_cast<typename KeySaver::TSizeType>(num_elements), 0};
+ // to do: m.b. use just the size of corresponding object?
+ typename KeySaver::TSizeType cur_off = sizeof(sv_type) +
+ (sv.num_buckets + 1) * sizeof(typename KeySaver::TSizeType);
+ sv.data_end_off = cur_off;
+ const_iterator n;
+ for (n = begin(); n != end(); ++n) {
+ sv.data_end_off += static_cast<typename KeySaver::TSizeType>(ks.GetRecordSize(*n));
+ }
+ typename KeySaver::TSizeType* sb = stHash ? (typename KeySaver::TSizeType*)(stHash->buckets()) : nullptr;
+ if (stHash)
+ sv.data_end_off += static_cast<typename KeySaver::TSizeType>(sb[buckets.size()] - sb[0]);
+ //saver.Align(sizeof(char*));
+ stream->Write(&sv, sizeof(sv));
+
+ size_type i;
+ //save vector
+ for (i = 0; i < buckets.size(); ++i) {
+ node* cur = buckets[i];
+ stream->Write(&cur_off, sizeof(cur_off));
+ if (cur) {
+ while (!((uintptr_t)cur & 1)) {
+ cur_off += static_cast<typename KeySaver::TSizeType>(ks.GetRecordSize(cur->val));
+ cur = cur->next;
+ }
+ }
+ if (stHash)
+ cur_off += static_cast<typename KeySaver::TSizeType>(sb[i + 1] - sb[i]);
+ }
+ stream->Write(&cur_off, sizeof(cur_off)); // end mark
+ for (i = 0; i < buckets.size(); ++i) {
+ node* cur = buckets[i];
+ if (cur) {
+ while (!((uintptr_t)cur & 1)) {
+ ks.SaveRecord(stream, cur->val);
+ cur = cur->next;
+ }
+ }
+ if (stHash)
+ stream->Write((const char*)stHash + sb[i], sb[i + 1] - sb[i]);
+ }
+ return 0;
+}
diff --git a/library/cpp/on_disk/st_hash/static_hash.h b/library/cpp/on_disk/st_hash/static_hash.h
new file mode 100644
index 00000000000..ca7a6ccd369
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/static_hash.h
@@ -0,0 +1,420 @@
+#pragma once
+
+#include "save_stl.h"
+#include "sthash_iterators.h"
+
+#include <util/generic/hash.h>
+#include <util/generic/vector.h>
+#include <util/generic/buffer.h>
+#include <util/generic/cast.h>
+#include <util/generic/yexception.h> // for save/load only
+#include <util/stream/file.h>
+#include <util/stream/buffer.h>
+#include <utility>
+
+#include <memory>
+#include <algorithm>
+#include <functional>
+
+#include <cstdlib>
+#include <cstddef>
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4624) // 'destructor could not be generated because a base class destructor is inaccessible'
+#endif
+
+template <class HashType, class KeySaver>
+inline void SaveHashToStreamEx(HashType& hash, IOutputStream* stream) {
+ KeySaver ks;
+ if (hash.save_for_st(stream, ks))
+ ythrow yexception() << "Could not save hash to stream";
+}
+
+template <class HashType>
+inline void SaveHashToStream(HashType& hash, IOutputStream* stream) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver;
+ return SaveHashToStreamEx<HashType, KeySaver>(hash, stream);
+}
+
+template <class HashType, class KeySaver>
+inline void SaveHashToFileEx(HashType& hash, const char* fileName) {
+ TFileOutput output(fileName);
+ SaveHashToStreamEx<HashType, KeySaver>(hash, &output);
+}
+
+template <class HashType>
+inline void SaveHashToFile(HashType& hash, const char* fileName) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver;
+ return SaveHashToFileEx<HashType, KeySaver>(hash, fileName);
+}
+
+template <class HashType>
+inline void SaveHashSetToFile(HashType& hash, const char* fileName) {
+ typedef TSthashSetWriter<typename HashType::key_type, ui64> KeySaver;
+ return SaveHashToFileEx<HashType, KeySaver>(hash, fileName);
+}
+
+template <class HashType>
+inline void SaveHashToFile32(HashType& hash, const char* fileName) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui32> KeySaver;
+ return SaveHashToFileEx<HashType, KeySaver>(hash, fileName);
+}
+
+template <class HashType, class KeySaver>
+inline void SaveHashToBufferEx(HashType& hash, TBuffer& buffer, sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* stHash = nullptr) {
+ TBufferOutput stream(buffer);
+ KeySaver ks;
+ if (hash.save_for_st(&stream, ks, stHash))
+ ythrow yexception() << "Could not save hash to memory";
+}
+
+template <class HashType>
+inline void SaveHashToBuffer(HashType& hash, TBuffer& buffer) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver;
+ SaveHashToBufferEx<HashType, KeySaver>(hash, buffer);
+}
+
+/**
+ * Some hack to save both THashMap and sthash.
+ * THashMap and sthash must have same bucket_count().
+ */
+template <class HashType, class StHashType>
+inline void SaveHashToBuffer(HashType& hash, TBuffer& buffer, StHashType* stHash) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver;
+ typedef sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* SH;
+
+ SH sh = reinterpret_cast<SH>(stHash);
+ SaveHashToBufferEx<HashType, KeySaver>(hash, buffer, sh);
+}
+
+template <class HashType>
+inline void SaveHashToBuffer32(HashType& hash, TBuffer& buffer) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui32> KeySaver;
+ SaveHashToBufferEx<HashType, KeySaver>(hash, buffer);
+}
+
+template <class Iter, typename size_type_f = ui64>
+class sthashtable {
+public:
+ typedef typename Iter::TKeyType key_type;
+ typedef typename Iter::TValueType value_type;
+ typedef typename Iter::THasherType hasher;
+ typedef typename Iter::TKeyEqualType key_equal;
+
+ typedef size_type_f size_type;
+ typedef ptrdiff_t difference_type;
+ typedef const value_type* const_pointer;
+ typedef const value_type& const_reference;
+
+ typedef Iter const_iterator;
+
+ const hasher hash_funct() const {
+ return hash;
+ }
+ const key_equal key_eq() const {
+ return equals;
+ }
+
+private:
+ const hasher hash;
+ const key_equal equals;
+
+private:
+ const_iterator iter_at_bucket(size_type bucket) const {
+ return (const_iterator)(((char*)this + buckets()[bucket]));
+ }
+
+ const_iterator iter_at_bucket_or_end(size_type bucket) const {
+ if (bucket < num_buckets)
+ return (const_iterator)(((char*)this + buckets()[bucket]));
+ else
+ return end();
+ }
+
+ const size_type num_buckets;
+ const size_type num_elements;
+ const size_type data_end_off;
+
+protected: //shut up gcc warning
+ // we can't construct/destroy this object at all!
+ sthashtable();
+ sthashtable(const sthashtable& ht);
+ ~sthashtable();
+
+public:
+ // const size_type *buckets;
+ const size_type* buckets() const {
+ return (size_type*)((char*)this + sizeof(*this));
+ }
+ const size_type buckets(size_type n) const {
+ return buckets()[n];
+ }
+
+ size_type size() const {
+ return num_elements;
+ }
+ size_type max_size() const {
+ return size_type(-1);
+ }
+ bool empty() const {
+ return size() == 0;
+ }
+
+ const_iterator begin() const {
+ return num_buckets ? iter_at_bucket(0) : end();
+ }
+
+ const_iterator end() const {
+ return (const_iterator)(((char*)this + data_end_off));
+ }
+
+public:
+ size_type size_in_bytes() const {
+ return data_end_off;
+ }
+
+ size_type bucket_count() const {
+ return num_buckets;
+ }
+
+ size_type elems_in_bucket(size_type bucket) const {
+ size_type result = 0;
+ const_iterator first = iter_at_bucket(bucket);
+ const_iterator last = iter_at_bucket_or_end(bucket + 1);
+
+ for (; first != last; ++first)
+ ++result;
+ return result;
+ }
+
+ template <class TheKey>
+ const_iterator find(const TheKey& key) const {
+ size_type n = bkt_num_key(key);
+ const_iterator first(iter_at_bucket(n)), last(iter_at_bucket_or_end(n + 1));
+ for (;
+ first != last && !first.KeyEquals(equals, key);
+ ++first) {
+ }
+ if (first != last)
+ return first;
+ return end();
+ }
+
+ size_type count(const key_type& key) const {
+ const size_type n = bkt_num_key(key);
+ size_type result = 0;
+ const_iterator first = iter_at_bucket(n);
+ const_iterator last = iter_at_bucket_or_end(n + 1);
+
+ for (; first != last; ++first)
+ if (first.KeyEquals(equals, key))
+ ++result;
+ return result;
+ }
+
+ std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const;
+
+private:
+ template <class TheKey>
+ size_type bkt_num_key(const TheKey& key) const {
+ return hash(key) % num_buckets;
+ }
+};
+
+template <class I, class size_type_f>
+std::pair<I, I> sthashtable<I, size_type_f>::equal_range(const key_type& key) const {
+ typedef std::pair<const_iterator, const_iterator> pii;
+ const size_type n = bkt_num_key(key);
+ const_iterator first = iter_at_bucket(n);
+ const_iterator last = iter_at_bucket_or_end(n + 1);
+
+ for (; first != last; ++first) {
+ if (first.KeyEquals(equals, key)) {
+ const_iterator cur = first;
+ ++cur;
+ for (; cur != last; ++cur)
+ if (!cur.KeyEquals(equals, key))
+ return pii(const_iterator(first),
+ const_iterator(cur));
+ return pii(const_iterator(first),
+ const_iterator(last));
+ }
+ }
+ return pii(end(), end());
+}
+
+/* end __SGI_STL_HASHTABLE_H */
+
+template <class Key, class T, class HashFcn /*= hash<Key>*/,
+ class EqualKey = TEqualTo<Key>, typename size_type_f = ui64>
+class sthash {
+private:
+ typedef sthashtable<TSthashIterator<const Key, const T, HashFcn, EqualKey>, size_type_f> ht;
+ ht rep;
+
+public:
+ typedef typename ht::key_type key_type;
+ typedef typename ht::value_type value_type;
+ typedef typename ht::hasher hasher;
+ typedef typename ht::key_equal key_equal;
+ typedef T mapped_type;
+
+ typedef typename ht::size_type size_type;
+ typedef typename ht::difference_type difference_type;
+ typedef typename ht::const_pointer const_pointer;
+ typedef typename ht::const_reference const_reference;
+
+ typedef typename ht::const_iterator const_iterator;
+
+ const hasher hash_funct() const {
+ return rep.hash_funct();
+ }
+ const key_equal key_eq() const {
+ return rep.key_eq();
+ }
+
+public:
+ size_type size() const {
+ return rep.size();
+ }
+ size_type max_size() const {
+ return rep.max_size();
+ }
+ bool empty() const {
+ return rep.empty();
+ }
+
+ const_iterator begin() const {
+ return rep.begin();
+ }
+ const_iterator end() const {
+ return rep.end();
+ }
+
+public:
+ template <class TheKey>
+ const_iterator find(const TheKey& key) const {
+ return rep.find(key);
+ }
+ template <class TheKey>
+ bool has(const TheKey& key) const {
+ return rep.find(key) != rep.end();
+ }
+
+ size_type count(const key_type& key) const {
+ return rep.count(key);
+ }
+
+ std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const {
+ return rep.equal_range(key);
+ }
+
+ size_type size_in_bytes() const {
+ return rep.size_in_bytes();
+ }
+
+ size_type bucket_count() const {
+ return rep.bucket_count();
+ }
+ size_type max_bucket_count() const {
+ return rep.max_bucket_count();
+ }
+ size_type elems_in_bucket(size_type n) const {
+ return rep.elems_in_bucket(n);
+ }
+
+ const size_type* buckets() const {
+ return rep.buckets();
+ }
+ const size_type buckets(size_type n) const {
+ return rep.buckets()[n];
+ }
+};
+
+template <class Key, class HashFcn,
+ class EqualKey = TEqualTo<Key>, typename size_type_f = ui64>
+class sthash_set: public sthash<Key, TEmptyValue, HashFcn, EqualKey, size_type_f> {
+ typedef sthash<Key, TEmptyValue, HashFcn, EqualKey, size_type_f> Base;
+
+public:
+ using Base::const_iterator;
+ using Base::hasher;
+ using Base::key_equal;
+ using Base::key_type;
+ using Base::size_type;
+ using Base::value_type;
+};
+
+template <class Key, class T, class HashFcn /*= hash<Key>*/,
+ class EqualKey = TEqualTo<Key>, typename size_type_f = ui64>
+class sthash_mm {
+private:
+ typedef sthashtable<TSthashIterator<const Key, T, HashFcn, EqualKey>, size_type_f> ht;
+ ht rep;
+
+public:
+ typedef typename ht::key_type key_type;
+ typedef typename ht::value_type value_type;
+ typedef typename ht::hasher hasher;
+ typedef typename ht::key_equal key_equal;
+ typedef T mapped_type;
+
+ typedef typename ht::size_type size_type;
+ typedef typename ht::difference_type difference_type;
+ typedef typename ht::const_pointer const_pointer;
+ typedef typename ht::const_reference const_reference;
+
+ typedef typename ht::const_iterator const_iterator;
+
+ const hasher hash_funct() const {
+ return rep.hash_funct();
+ }
+ const key_equal key_eq() const {
+ return rep.key_eq();
+ }
+
+public:
+ size_type size() const {
+ return rep.size();
+ }
+ size_type max_size() const {
+ return rep.max_size();
+ }
+ bool empty() const {
+ return rep.empty();
+ }
+
+ const_iterator begin() const {
+ return rep.begin();
+ }
+ const_iterator end() const {
+ return rep.end();
+ }
+
+ const_iterator find(const key_type& key) const {
+ return rep.find(key);
+ }
+
+ size_type count(const key_type& key) const {
+ return rep.count(key);
+ }
+
+ std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const {
+ return rep.equal_range(key);
+ }
+
+ size_type bucket_count() const {
+ return rep.bucket_count();
+ }
+ size_type max_bucket_count() const {
+ return rep.max_bucket_count();
+ }
+ size_type elems_in_bucket(size_type n) const {
+ return rep.elems_in_bucket(n);
+ }
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
diff --git a/library/cpp/on_disk/st_hash/static_hash_map.h b/library/cpp/on_disk/st_hash/static_hash_map.h
new file mode 100644
index 00000000000..5dc50abd392
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/static_hash_map.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include "static_hash.h"
+
+#include <library/cpp/deprecated/mapped_file/mapped_file.h>
+
+#include <util/system/filemap.h>
+
+template <class SH>
+struct sthash_mapped_c {
+ typedef SH H;
+ typedef typename H::const_iterator const_iterator;
+ TMappedFile M;
+ H* hsh;
+ sthash_mapped_c()
+ : M()
+ , hsh(nullptr)
+ {
+ }
+ sthash_mapped_c(const char* fname, bool precharge)
+ : M()
+ , hsh(nullptr)
+ {
+ Open(fname, precharge);
+ }
+ void Open(const char* fname, bool precharge) {
+ M.init(fname);
+ if (precharge)
+ M.precharge();
+ hsh = (H*)M.getData();
+ if (M.getSize() < sizeof(H) || (ssize_t)M.getSize() != hsh->end().Data - (char*)hsh)
+ ythrow yexception() << "Could not map hash: " << fname << " is damaged";
+ }
+ H* operator->() {
+ return hsh;
+ }
+ const H* operator->() const {
+ return hsh;
+ }
+ H* GetSthash() {
+ return hsh;
+ }
+ const H* GetSthash() const {
+ return hsh;
+ }
+};
+
+template <class Key, class T, class Hash>
+struct sthash_mapped: public sthash_mapped_c<sthash<Key, T, Hash>> {
+ typedef sthash<Key, T, Hash> H;
+ sthash_mapped(const char* fname, bool precharge)
+ : sthash_mapped_c<H>(fname, precharge)
+ {
+ }
+ sthash_mapped()
+ : sthash_mapped_c<H>()
+ {
+ }
+};
diff --git a/library/cpp/on_disk/st_hash/sthash_iterators.h b/library/cpp/on_disk/st_hash/sthash_iterators.h
new file mode 100644
index 00000000000..6a9ebdd6c3f
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/sthash_iterators.h
@@ -0,0 +1,334 @@
+#pragma once
+
+#include "save_stl.h"
+
+#include <util/system/align.h>
+
+/**
+ This file provides functionality for saving some relatively simple THashMap object
+ to disk in a form that can be mapped read-only (via mmap) at any address.
+ That saved object is accessed via pointer to sthash object (that must have
+ the same parameters as original THashMap object)
+
+ If either key or value are variable-sized (i.e. contain pointers), user must
+ write his own instantiation of TSthashIterator (read iterator for sthash) and
+ TSthashWriter (write iterator for THashMap).
+ An example for <const char *, B> pair is in here.
+**/
+
+// TEmptyValue and SizeOfEx are helpers for sthash_set
+struct TEmptyValue {
+ TEmptyValue() = default;
+};
+
+template <class T>
+inline size_t SizeOfEx() {
+ return sizeof(T);
+}
+
+template <>
+inline size_t SizeOfEx<TEmptyValue>() {
+ return 0;
+}
+template <>
+inline size_t SizeOfEx<const TEmptyValue>() {
+ return 0;
+}
+
+template <class TKey, class TValue, class HashFcn, class EqualKey>
+struct TSthashIterator {
+ // Implementation for simple types
+ typedef const TKey TKeyType;
+ typedef const TValue TValueType;
+ typedef EqualKey TKeyEqualType;
+ typedef HashFcn THasherType;
+
+ const char* Data;
+ TSthashIterator()
+ : Data(nullptr)
+ {
+ }
+ explicit TSthashIterator(const char* data)
+ : Data(data)
+ {
+ }
+ void operator++() {
+ Data += GetLength();
+ }
+
+ bool operator!=(const TSthashIterator& that) const {
+ return Data != that.Data;
+ }
+ bool operator==(const TSthashIterator& that) const {
+ return Data == that.Data;
+ }
+ TKey& Key() const {
+ return *(TKey*)Data;
+ }
+ TValue& Value() {
+ return *(TValue*)(Data + sizeof(TKey));
+ }
+ const TValue& Value() const {
+ return *(const TValue*)(Data + sizeof(TKey));
+ }
+
+ template <class AnotherKeyType>
+ bool KeyEquals(const EqualKey& eq, const AnotherKeyType& key) const {
+ return eq(*(TKey*)Data, key);
+ }
+
+ size_t GetLength() const {
+ return sizeof(TKey) + SizeOfEx<TValue>();
+ }
+};
+
+template <class Key, class Value, typename size_type_o = ui64>
+struct TSthashWriter {
+ typedef size_type_o TSizeType;
+ size_t GetRecordSize(const std::pair<const Key, const Value>&) const {
+ return sizeof(Key) + SizeOfEx<Value>();
+ }
+ int SaveRecord(IOutputStream* stream, const std::pair<const Key, const Value>& record) const {
+ stream->Write(&record.first, sizeof(Key));
+ stream->Write(&record.second, SizeOfEx<Value>());
+ return 0;
+ }
+};
+
+// Remember that this simplified implementation makes a copy of `key' in std::make_pair.
+// It can also waste some memory on undesired alignment.
+template <class Key, typename size_type_o = ui64>
+struct TSthashSetWriter: public TSthashWriter<Key, TEmptyValue, size_type_o> {
+ typedef TSthashWriter<Key, TEmptyValue, size_type_o> MapWriter;
+ size_t GetRecordSize(const Key& key) const {
+ return MapWriter::GetRecordSize(std::make_pair(key, TEmptyValue()));
+ }
+ int SaveRecord(IOutputStream* stream, const Key& key) const {
+ return MapWriter::SaveRecord(stream, std::make_pair(key, TEmptyValue()));
+ }
+};
+
+// we can't save something with pointers without additional tricks
+
+template <class A, class B, class HashFcn, class EqualKey>
+struct TSthashIterator<A*, B, HashFcn, EqualKey> {};
+
+template <class A, class B, class HashFcn, class EqualKey>
+struct TSthashIterator<A, B*, HashFcn, EqualKey> {};
+
+template <class A, class B, typename size_type_o>
+struct TSthashWriter<A*, B*, size_type_o> {};
+
+template <class A, class B, typename size_type_o>
+struct TSthashWriter<A*, B, size_type_o> {};
+
+template <class A, class B, typename size_type_o>
+struct TSthashWriter<A, B*, size_type_o> {};
+
+template <class T>
+inline size_t AlignForChrKey() {
+ return 4; // TODO: change this (requeres rebuilt of a few existing files)
+}
+
+template <>
+inline size_t AlignForChrKey<TEmptyValue>() {
+ return 1;
+}
+
+template <>
+inline size_t AlignForChrKey<const TEmptyValue>() {
+ return AlignForChrKey<TEmptyValue>();
+}
+
+// !! note that for char*, physical placement of key and value is swapped
+template <class TValue, class HashFcn, class EqualKey>
+struct TSthashIterator<const char* const, TValue, HashFcn, EqualKey> {
+ typedef const TValue TValueType;
+ typedef const char* TKeyType;
+ typedef EqualKey TKeyEqualType;
+ typedef HashFcn THasherType;
+
+ const char* Data;
+ TSthashIterator()
+ : Data(nullptr)
+ {
+ }
+ TSthashIterator(const char* data)
+ : Data(data)
+ {
+ }
+ void operator++() {
+ Data += GetLength();
+ }
+
+ bool operator!=(const TSthashIterator& that) const {
+ return Data != that.Data;
+ }
+ bool operator==(const TSthashIterator& that) const {
+ return Data == that.Data;
+ }
+ const char* Key() const {
+ return Data + SizeOfEx<TValue>();
+ }
+ TValue& Value() {
+ return *(TValue*)Data;
+ }
+ const TValue& Value() const {
+ return *(const TValue*)Data;
+ }
+
+ template <class K>
+ bool KeyEquals(const EqualKey& eq, const K& k) const {
+ return eq(Data + SizeOfEx<TValue>(), k);
+ }
+
+ size_t GetLength() const {
+ size_t length = strlen(Data + SizeOfEx<TValue>()) + 1 + SizeOfEx<TValue>();
+ length = AlignUp(length, AlignForChrKey<TValue>());
+ return length;
+ }
+};
+
+template <class Value, typename size_type_o>
+struct TSthashWriter<const char*, Value, size_type_o> {
+ typedef size_type_o TSizeType;
+ size_t GetRecordSize(const std::pair<const char*, const Value>& record) const {
+ size_t length = strlen(record.first) + 1 + SizeOfEx<Value>();
+ length = AlignUp(length, AlignForChrKey<Value>());
+ return length;
+ }
+ int SaveRecord(IOutputStream* stream, const std::pair<const char*, const Value>& record) const {
+ const char* alignBuffer = "qqqq";
+ stream->Write(&record.second, SizeOfEx<Value>());
+ size_t length = strlen(record.first) + 1;
+ stream->Write(record.first, length);
+ length = AlignUpSpace(length, AlignForChrKey<Value>());
+ if (length)
+ stream->Write(alignBuffer, length);
+ return 0;
+ }
+};
+
+template <class TKey, class HashFcn, class EqualKey>
+struct TSthashIterator<TKey, const char* const, HashFcn, EqualKey> {
+ typedef const TKey TKeyType;
+ typedef const char* TValueType;
+ typedef EqualKey TKeyEqualType;
+ typedef HashFcn THasherType;
+
+ const char* Data;
+ TSthashIterator()
+ : Data(nullptr)
+ {
+ }
+ TSthashIterator(const char* data)
+ : Data(data)
+ {
+ }
+ void operator++() {
+ Data += GetLength();
+ }
+
+ bool operator!=(const TSthashIterator& that) const {
+ return Data != that.Data;
+ }
+ bool operator==(const TSthashIterator& that) const {
+ return Data == that.Data;
+ }
+ TKey& Key() {
+ return *(TKey*)Data;
+ }
+ const char* Value() const {
+ return Data + sizeof(TKey);
+ }
+
+ template <class K>
+ bool KeyEquals(const EqualKey& eq, const K& k) const {
+ return eq(*(TKey*)Data, k);
+ }
+
+ size_t GetLength() const {
+ size_t length = strlen(Data + sizeof(TKey)) + 1 + sizeof(TKey);
+ length = AlignUp(length, (size_t)4);
+ return length;
+ }
+};
+
+template <class Key, typename size_type_o>
+struct TSthashWriter<Key, const char*, size_type_o> {
+ typedef size_type_o TSizeType;
+ size_t GetRecordSize(const std::pair<const Key, const char*>& record) const {
+ size_t length = strlen(record.second) + 1 + sizeof(Key);
+ length = AlignUp(length, (size_t)4);
+ return length;
+ }
+ int SaveRecord(IOutputStream* stream, const std::pair<const Key, const char*>& record) const {
+ const char* alignBuffer = "qqqq";
+ stream->Write(&record.first, sizeof(Key));
+ size_t length = strlen(record.second) + 1;
+ stream->Write(record.second, length);
+ length = AlignUpSpace(length, (size_t)4);
+ if (length)
+ stream->Write(alignBuffer, length);
+ return 0;
+ }
+};
+
+template <class HashFcn, class EqualKey>
+struct TSthashIterator<const char* const, const char* const, HashFcn, EqualKey> {
+ typedef const char* TKeyType;
+ typedef const char* TValueType;
+ typedef EqualKey TKeyEqualType;
+ typedef HashFcn THasherType;
+
+ const char* Data;
+ TSthashIterator()
+ : Data(nullptr)
+ {
+ }
+ TSthashIterator(const char* data)
+ : Data(data)
+ {
+ }
+ void operator++() {
+ Data += GetLength();
+ }
+
+ bool operator!=(const TSthashIterator& that) const {
+ return Data != that.Data;
+ }
+ bool operator==(const TSthashIterator& that) const {
+ return Data == that.Data;
+ }
+ const char* Key() const {
+ return Data;
+ }
+ const char* Value() const {
+ return Data + strlen(Data) + 1;
+ }
+
+ template <class K>
+ bool KeyEquals(const EqualKey& eq, const K& k) const {
+ return eq(Data, k);
+ }
+
+ size_t GetLength() const {
+ size_t length = strlen(Data) + 1;
+ length += strlen(Data + length) + 1;
+ return length;
+ }
+};
+
+template <typename size_type_o>
+struct TSthashWriter<const char*, const char*, size_type_o> {
+ typedef size_type_o TSizeType;
+ size_t GetRecordSize(const std::pair<const char*, const char*>& record) const {
+ size_t size = strlen(record.first) + strlen(record.second) + 2;
+ return size;
+ }
+ int SaveRecord(IOutputStream* stream, const std::pair<const char*, const char*>& record) const {
+ stream->Write(record.first, strlen(record.first) + 1);
+ stream->Write(record.second, strlen(record.second) + 1);
+ return 0;
+ }
+};
diff --git a/library/cpp/on_disk/st_hash/ya.make b/library/cpp/on_disk/st_hash/ya.make
new file mode 100644
index 00000000000..8c6d05711c3
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/ya.make
@@ -0,0 +1,15 @@
+LIBRARY()
+
+SRCS(
+ fake.cpp
+ save_stl.h
+ static_hash.h
+ static_hash_map.h
+ sthash_iterators.h
+)
+
+PEERDIR(
+ library/cpp/deprecated/mapped_file
+)
+
+END()