diff options
| author | robot-piglet <[email protected]> | 2023-12-02 01:45:21 +0300 | 
|---|---|---|
| committer | robot-piglet <[email protected]> | 2023-12-02 02:42:50 +0300 | 
| commit | 9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c (patch) | |
| tree | 9f88a486917d371d099cd712efd91b4c122d209d /library/cpp/on_disk | |
| parent | 32fb6dda1feb24f9ab69ece5df0cb9ec238ca5e6 (diff) | |
Intermediate changes
Diffstat (limited to 'library/cpp/on_disk')
| -rw-r--r-- | library/cpp/on_disk/multi_blob/multiblob.cpp | 67 | ||||
| -rw-r--r-- | library/cpp/on_disk/multi_blob/multiblob.h | 77 | ||||
| -rw-r--r-- | library/cpp/on_disk/multi_blob/multiblob_builder.cpp | 146 | ||||
| -rw-r--r-- | library/cpp/on_disk/multi_blob/multiblob_builder.h | 64 | ||||
| -rw-r--r-- | library/cpp/on_disk/multi_blob/ya.make | 13 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/fake.cpp | 4 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/save_stl.h | 84 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/static_hash.h | 420 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/static_hash_map.h | 59 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/sthash_iterators.h | 334 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/ya.make | 15 | 
11 files changed, 1283 insertions, 0 deletions
| diff --git a/library/cpp/on_disk/multi_blob/multiblob.cpp b/library/cpp/on_disk/multi_blob/multiblob.cpp new file mode 100644 index 00000000000..d92b31e6135 --- /dev/null +++ b/library/cpp/on_disk/multi_blob/multiblob.cpp @@ -0,0 +1,67 @@ +#include <util/generic/yexception.h> +#include <util/system/align.h> + +#include <library/cpp/on_disk/chunks/reader.h> + +#include "multiblob.h" + +void TSubBlobs::ReadMultiBlob(const TBlob& multi) { +    if (multi.Size() < sizeof(TMultiBlobHeader)) { +        ythrow yexception() << "not a blob, too small"; +    } + +    Multi = multi; +    memcpy((void*)&Header, Multi.Data(), sizeof(TMultiBlobHeader)); + +    if (Header.BlobMetaSig != BLOBMETASIG) { +        if (Header.BlobRecordSig != TMultiBlobHeader::RecordSig) { +            if (ReadChunkedData(multi)) +                return; +        } +        ythrow yexception() << "is not a blob, MetaSig was read: " +                            << Header.BlobMetaSig +                            << ", must be" << BLOBMETASIG; +    } + +    if (Header.BlobRecordSig != TMultiBlobHeader::RecordSig) +        ythrow yexception() << "unknown multiblob RecordSig " +                            << Header.BlobRecordSig; + +    reserve(size() + Header.Count); +    if (Header.Flags & EMF_INTERLAY) { +        size_t pos = Header.HeaderSize(); +        for (size_t i = 0; i < Header.Count; ++i) { +            pos = AlignUp<ui64>(pos, sizeof(ui64)); +            ui64 size = *((ui64*)((const char*)multi.Data() + pos)); +            pos = AlignUp<ui64>(pos + sizeof(ui64), Header.Align); +            push_back(multi.SubBlob(pos, pos + size)); +            pos += size; +        } +    } else { +        const ui64* sizes = Header.Sizes(multi.Data()); +        size_t pos = Header.HeaderSize() + Header.Count * sizeof(ui64); +        for (size_t i = 0; i < Header.Count; ++i) { +            pos = AlignUp<ui64>(pos, Header.Align); +            push_back(multi.SubBlob(pos, pos + *sizes)); +            pos += *sizes; +            sizes++; +        } +    } +} + +bool TSubBlobs::ReadChunkedData(const TBlob& multi) noexcept { +    Multi = multi; +    memset((void*)&Header, 0, sizeof(Header)); + +    TChunkedDataReader reader(Multi); +    Header.Count = reader.GetBlocksCount(); +    resize(GetHeader()->Count); +    for (size_t i = 0; i < size(); ++i) +        // We can use TBlob::NoCopy() because of reader.GetBlock(i) returns +        // address into memory of multi blob. +        // This knowledge was acquired from implementation of +        // TChunkedDataReader, so we need care about any changes that. +        (*this)[i] = TBlob::NoCopy(reader.GetBlock(i), reader.GetBlockLen(i)); +    Header.Flags |= EMF_CHUNKED_DATA_READER; +    return true; +} diff --git a/library/cpp/on_disk/multi_blob/multiblob.h b/library/cpp/on_disk/multi_blob/multiblob.h new file mode 100644 index 00000000000..b40a5ae6af9 --- /dev/null +++ b/library/cpp/on_disk/multi_blob/multiblob.h @@ -0,0 +1,77 @@ +#pragma once + +#include <util/generic/vector.h> +#include <util/memory/blob.h> + +#define BLOBMETASIG 0x3456789Au + +enum E_Multiblob_Flags { +    // if EMF_INTERLAY is clear +    //     multiblob format +    //       HeaderSize()       bytes for TMultiBlobHeader +    //       Count*sizeof(ui64) bytes for blob sizes +    //       blob1 +    //       (alignment) +    //       blob2 +    //       (alignment) +    //       ... +    //       (alignment) +    //       blobn +    // if EMF_INTERLAY is set +    //     multiblob format +    //       HeaderSize()       bytes for TMultiBlobHeader +    //       size1              ui64, the size of 1st blob +    //       blob1 +    //       (alignment) +    //       size2              ui64, the size of 2nd blob +    //       blob2 +    //       (alignment) +    //       ... +    //       (alignment) +    //       sizen              ui64, the size of n'th blob +    //       blobn +    EMF_INTERLAY = 1, + +    // Means that multiblob contains blocks in TChunkedDataReader format +    // Legacy, use it only for old files, created for TChunkedDataReader +    EMF_CHUNKED_DATA_READER = 2, + +    // Flags that may be configured for blobbuilder in client code +    EMF_WRITEABLE = EMF_INTERLAY, +}; + +struct TMultiBlobHeader { +    // data +    ui32 BlobMetaSig; +    ui32 BlobRecordSig; +    ui64 Count; // count of sub blobs +    ui32 Align; // alignment for every subblob +    ui32 Flags; +    static const ui32 RecordSig = 0x23456789; +    static inline size_t HeaderSize() { +        return 4 * sizeof(ui64); +    } +    inline const ui64* Sizes(const void* Data) const { +        return (const ui64*)((const char*)Data + HeaderSize()); +    } +}; + +class TSubBlobs: public TVector<TBlob> { +public: +    TSubBlobs() { +    } +    TSubBlobs(const TBlob& multi) { +        ReadMultiBlob(multi); +    } +    void ReadMultiBlob(const TBlob& multi); +    const TMultiBlobHeader* GetHeader() const { +        return (const TMultiBlobHeader*)&Header; +    } + +protected: +    TMultiBlobHeader Header; +    TBlob Multi; + +private: +    bool ReadChunkedData(const TBlob& multi) noexcept; +}; diff --git a/library/cpp/on_disk/multi_blob/multiblob_builder.cpp b/library/cpp/on_disk/multi_blob/multiblob_builder.cpp new file mode 100644 index 00000000000..44aa4a6c2fd --- /dev/null +++ b/library/cpp/on_disk/multi_blob/multiblob_builder.cpp @@ -0,0 +1,146 @@ +#include <util/memory/tempbuf.h> +#include <util/system/align.h> + +#include "multiblob_builder.h" + +/* + * TBlobSaverMemory + */ +TBlobSaverMemory::TBlobSaverMemory(const void* ptr, size_t size) +    : Blob(TBlob::NoCopy(ptr, size)) +{ +} + +TBlobSaverMemory::TBlobSaverMemory(const TBlob& blob) +    : Blob(blob) +{ +} + +void TBlobSaverMemory::Save(IOutputStream& output, ui32 /*flags*/) { +    output.Write((void*)Blob.Data(), Blob.Length()); +} + +size_t TBlobSaverMemory::GetLength() { +    return Blob.Length(); +} + +/* + * TBlobSaverFile + */ + +TBlobSaverFile::TBlobSaverFile(TFile file) +    : File(file) +{ +    Y_ASSERT(File.IsOpen()); +} + +TBlobSaverFile::TBlobSaverFile(const char* filename, EOpenMode oMode) +    : File(filename, oMode) +{ +    Y_ASSERT(File.IsOpen()); +} + +void TBlobSaverFile::Save(IOutputStream& output, ui32 /*flags*/) { +    TTempBuf buffer(1 << 20); +    while (size_t size = File.Read((void*)buffer.Data(), buffer.Size())) +        output.Write((void*)buffer.Data(), size); +} + +size_t TBlobSaverFile::GetLength() { +    return File.GetLength(); +} + +/* + * TMultiBlobBuilder + */ + +TMultiBlobBuilder::TMultiBlobBuilder(bool isOwn) +    : IsOwner(isOwn) +{ +} + +TMultiBlobBuilder::~TMultiBlobBuilder() { +    if (IsOwner) +        DeleteSubBlobs(); +} + +namespace { +    ui64 PadToAlign(IOutputStream& output, ui64 fromPos, ui32 align) { +        ui64 toPos = AlignUp<ui64>(fromPos, align); +        for (; fromPos < toPos; ++fromPos) { +            output << (char)0; +        } +        return toPos; +    } +} + +void TMultiBlobBuilder::Save(IOutputStream& output, ui32 flags) { +    TMultiBlobHeader header; +    memset((void*)&header, 0, sizeof(header)); +    header.BlobMetaSig = BLOBMETASIG; +    header.BlobRecordSig = TMultiBlobHeader::RecordSig; +    header.Count = Blobs.size(); +    header.Align = ALIGN; +    header.Flags = flags & EMF_WRITEABLE; +    output.Write((void*)&header, sizeof(header)); +    for (size_t i = sizeof(header); i < header.HeaderSize(); ++i) +        output << (char)0; +    ui64 pos = header.HeaderSize(); +    if (header.Flags & EMF_INTERLAY) { +        for (size_t i = 0; i < Blobs.size(); ++i) { +            ui64 size = Blobs[i]->GetLength(); +            pos = PadToAlign(output, pos, sizeof(ui64));                // Align size record +            output.Write((void*)&size, sizeof(ui64)); +            pos = PadToAlign(output, pos + sizeof(ui64), header.Align); // Align blob +            Blobs[i]->Save(output, header.Flags); +            pos += size; +        } +    } else { +        for (size_t i = 0; i < Blobs.size(); ++i) { +            ui64 size = Blobs[i]->GetLength(); +            output.Write((void*)&size, sizeof(ui64)); +        } +        pos += Blobs.size() * sizeof(ui64); +        for (size_t i = 0; i < Blobs.size(); ++i) { +            pos = PadToAlign(output, pos, header.Align); +            Blobs[i]->Save(output, header.Flags); +            pos += Blobs[i]->GetLength(); +        } +    } +    // Compensate for imprecise size +    for (ui64 len = GetLength(); pos < len; ++pos) { +        output << (char)0; +    } +} + +size_t TMultiBlobBuilder::GetLength() { +    // Sizes may be diferent with and without EMF_INTERLAY, so choose greater of 2 +    size_t resNonInter = TMultiBlobHeader::HeaderSize() + Blobs.size() * sizeof(ui64); +    size_t resInterlay = TMultiBlobHeader::HeaderSize(); +    for (size_t i = 0; i < Blobs.size(); ++i) { +        resInterlay = AlignUp<ui64>(resInterlay, sizeof(ui64)) + sizeof(ui64); +        resInterlay = AlignUp<ui64>(resInterlay, ALIGN) + Blobs[i]->GetLength(); +        resNonInter = AlignUp<ui64>(resNonInter, ALIGN) + Blobs[i]->GetLength(); +    } +    resInterlay = AlignUp<ui64>(resInterlay, ALIGN); +    resNonInter = AlignUp<ui64>(resNonInter, ALIGN); +    return Max(resNonInter, resInterlay); +} + +TMultiBlobBuilder::TSavers& TMultiBlobBuilder::GetBlobs() { +    return Blobs; +} + +const TMultiBlobBuilder::TSavers& TMultiBlobBuilder::GetBlobs() const { +    return Blobs; +} + +void TMultiBlobBuilder::AddBlob(IBlobSaverBase* blob) { +    Blobs.push_back(blob); +} + +void TMultiBlobBuilder::DeleteSubBlobs() { +    for (size_t i = 0; i < Blobs.size(); ++i) +        delete Blobs[i]; +    Blobs.clear(); +} diff --git a/library/cpp/on_disk/multi_blob/multiblob_builder.h b/library/cpp/on_disk/multi_blob/multiblob_builder.h new file mode 100644 index 00000000000..a8e3c6d35ee --- /dev/null +++ b/library/cpp/on_disk/multi_blob/multiblob_builder.h @@ -0,0 +1,64 @@ +#pragma once + +#include <util/system/align.h> +#include <util/stream/output.h> +#include <util/stream/file.h> +#include <util/draft/holder_vector.h> + +#include "multiblob.h" + +class IBlobSaverBase { +public: +    virtual ~IBlobSaverBase() { +    } +    virtual void Save(IOutputStream& output, ui32 flags = 0) = 0; +    virtual size_t GetLength() = 0; +}; + +inline void MultiBlobSave(IOutputStream& output, IBlobSaverBase& saver) { +    saver.Save(output); +} + +class TBlobSaverMemory: public IBlobSaverBase { +public: +    TBlobSaverMemory(const void* ptr, size_t size); +    TBlobSaverMemory(const TBlob& blob); +    void Save(IOutputStream& output, ui32 flags = 0) override; +    size_t GetLength() override; + +private: +    TBlob Blob; +}; + +class TBlobSaverFile: public IBlobSaverBase { +public: +    TBlobSaverFile(TFile file); +    TBlobSaverFile(const char* filename, EOpenMode oMode = RdOnly); +    void Save(IOutputStream& output, ui32 flags = 0) override; +    size_t GetLength() override; + +protected: +    TFile File; +}; + +class TMultiBlobBuilder: public IBlobSaverBase { +protected: +    // Data will be stored with default alignment DEVTOOLS-4548 +    static const size_t ALIGN = 16; + +public: +    typedef TVector<IBlobSaverBase*> TSavers; + +    TMultiBlobBuilder(bool isOwn = true); +    ~TMultiBlobBuilder() override; +    void Save(IOutputStream& output, ui32 flags = 0) override; +    size_t GetLength() override; +    TSavers& GetBlobs(); +    const TSavers& GetBlobs() const; +    void AddBlob(IBlobSaverBase* blob); +    void DeleteSubBlobs(); + +protected: +    TSavers Blobs; +    bool IsOwner; +}; diff --git a/library/cpp/on_disk/multi_blob/ya.make b/library/cpp/on_disk/multi_blob/ya.make new file mode 100644 index 00000000000..50615fc9012 --- /dev/null +++ b/library/cpp/on_disk/multi_blob/ya.make @@ -0,0 +1,13 @@ +LIBRARY() + +SRCS( +    multiblob.cpp +    multiblob_builder.cpp +) + +PEERDIR( +    library/cpp/on_disk/chunks +    util/draft +) + +END() diff --git a/library/cpp/on_disk/st_hash/fake.cpp b/library/cpp/on_disk/st_hash/fake.cpp new file mode 100644 index 00000000000..ef5af4d432b --- /dev/null +++ b/library/cpp/on_disk/st_hash/fake.cpp @@ -0,0 +1,4 @@ +#include "save_stl.h" +#include "static_hash.h" +#include "static_hash_map.h" +#include "sthash_iterators.h" diff --git a/library/cpp/on_disk/st_hash/save_stl.h b/library/cpp/on_disk/st_hash/save_stl.h new file mode 100644 index 00000000000..00f8f0e20db --- /dev/null +++ b/library/cpp/on_disk/st_hash/save_stl.h @@ -0,0 +1,84 @@ +#pragma once + +#include <util/generic/hash.h> +#include <util/system/yassert.h> +#include <util/stream/output.h> + +// this structure might be replaced with sthashtable class +template <class HF, class Eq, class size_type> +struct sthashtable_nvm_sv { +    sthashtable_nvm_sv() { +        if (sizeof(sthashtable_nvm_sv) != sizeof(HF) + sizeof(Eq) + 3 * sizeof(size_type)) { +            memset(this, 0, sizeof(sthashtable_nvm_sv)); +        } +    } + +    sthashtable_nvm_sv(const HF& phf, const Eq& peq, const size_type& pnb, const size_type& pne, const size_type& pnd) +        : sthashtable_nvm_sv() +    { +        hf = phf; +        eq = peq; +        num_buckets = pnb; +        num_elements = pne; +        data_end_off = pnd; +    } + +    HF hf; +    Eq eq; +    size_type num_buckets; +    size_type num_elements; +    size_type data_end_off; +}; + +/** + * Some hack to save both THashMap and sthash. + * Working with stHash does not depend on the template parameters, because the content of stHash is not used inside this method. + */ +template <class V, class K, class HF, class Ex, class Eq, class A> +template <class KeySaver> +inline int THashTable<V, K, HF, Ex, Eq, A>::save_for_st(IOutputStream* stream, KeySaver& ks, sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* stHash) const { +    Y_ASSERT(!stHash || stHash->bucket_count() == bucket_count()); +    typedef sthashtable_nvm_sv<HF, Eq, typename KeySaver::TSizeType> sv_type; +    sv_type sv = {this->_get_hash_fun(), this->_get_key_eq(), static_cast<typename KeySaver::TSizeType>(buckets.size()), static_cast<typename KeySaver::TSizeType>(num_elements), 0}; +    // to do: m.b. use just the size of corresponding object? +    typename KeySaver::TSizeType cur_off = sizeof(sv_type) + +                                           (sv.num_buckets + 1) * sizeof(typename KeySaver::TSizeType); +    sv.data_end_off = cur_off; +    const_iterator n; +    for (n = begin(); n != end(); ++n) { +        sv.data_end_off += static_cast<typename KeySaver::TSizeType>(ks.GetRecordSize(*n)); +    } +    typename KeySaver::TSizeType* sb = stHash ? (typename KeySaver::TSizeType*)(stHash->buckets()) : nullptr; +    if (stHash) +        sv.data_end_off += static_cast<typename KeySaver::TSizeType>(sb[buckets.size()] - sb[0]); +    //saver.Align(sizeof(char*)); +    stream->Write(&sv, sizeof(sv)); + +    size_type i; +    //save vector +    for (i = 0; i < buckets.size(); ++i) { +        node* cur = buckets[i]; +        stream->Write(&cur_off, sizeof(cur_off)); +        if (cur) { +            while (!((uintptr_t)cur & 1)) { +                cur_off += static_cast<typename KeySaver::TSizeType>(ks.GetRecordSize(cur->val)); +                cur = cur->next; +            } +        } +        if (stHash) +            cur_off += static_cast<typename KeySaver::TSizeType>(sb[i + 1] - sb[i]); +    } +    stream->Write(&cur_off, sizeof(cur_off)); // end mark +    for (i = 0; i < buckets.size(); ++i) { +        node* cur = buckets[i]; +        if (cur) { +            while (!((uintptr_t)cur & 1)) { +                ks.SaveRecord(stream, cur->val); +                cur = cur->next; +            } +        } +        if (stHash) +            stream->Write((const char*)stHash + sb[i], sb[i + 1] - sb[i]); +    } +    return 0; +} diff --git a/library/cpp/on_disk/st_hash/static_hash.h b/library/cpp/on_disk/st_hash/static_hash.h new file mode 100644 index 00000000000..ca7a6ccd369 --- /dev/null +++ b/library/cpp/on_disk/st_hash/static_hash.h @@ -0,0 +1,420 @@ +#pragma once + +#include "save_stl.h" +#include "sthash_iterators.h" + +#include <util/generic/hash.h> +#include <util/generic/vector.h> +#include <util/generic/buffer.h> +#include <util/generic/cast.h> +#include <util/generic/yexception.h> // for save/load only +#include <util/stream/file.h> +#include <util/stream/buffer.h> +#include <utility> + +#include <memory> +#include <algorithm> +#include <functional> + +#include <cstdlib> +#include <cstddef> + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4624) // 'destructor could not be generated because a base class destructor is inaccessible' +#endif + +template <class HashType, class KeySaver> +inline void SaveHashToStreamEx(HashType& hash, IOutputStream* stream) { +    KeySaver ks; +    if (hash.save_for_st(stream, ks)) +        ythrow yexception() << "Could not save hash to stream"; +} + +template <class HashType> +inline void SaveHashToStream(HashType& hash, IOutputStream* stream) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; +    return SaveHashToStreamEx<HashType, KeySaver>(hash, stream); +} + +template <class HashType, class KeySaver> +inline void SaveHashToFileEx(HashType& hash, const char* fileName) { +    TFileOutput output(fileName); +    SaveHashToStreamEx<HashType, KeySaver>(hash, &output); +} + +template <class HashType> +inline void SaveHashToFile(HashType& hash, const char* fileName) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; +    return SaveHashToFileEx<HashType, KeySaver>(hash, fileName); +} + +template <class HashType> +inline void SaveHashSetToFile(HashType& hash, const char* fileName) { +    typedef TSthashSetWriter<typename HashType::key_type, ui64> KeySaver; +    return SaveHashToFileEx<HashType, KeySaver>(hash, fileName); +} + +template <class HashType> +inline void SaveHashToFile32(HashType& hash, const char* fileName) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui32> KeySaver; +    return SaveHashToFileEx<HashType, KeySaver>(hash, fileName); +} + +template <class HashType, class KeySaver> +inline void SaveHashToBufferEx(HashType& hash, TBuffer& buffer, sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* stHash = nullptr) { +    TBufferOutput stream(buffer); +    KeySaver ks; +    if (hash.save_for_st(&stream, ks, stHash)) +        ythrow yexception() << "Could not save hash to memory"; +} + +template <class HashType> +inline void SaveHashToBuffer(HashType& hash, TBuffer& buffer) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; +    SaveHashToBufferEx<HashType, KeySaver>(hash, buffer); +} + +/** + * Some hack to save both THashMap and sthash. + * THashMap and sthash must have same bucket_count(). + */ +template <class HashType, class StHashType> +inline void SaveHashToBuffer(HashType& hash, TBuffer& buffer, StHashType* stHash) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; +    typedef sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* SH; + +    SH sh = reinterpret_cast<SH>(stHash); +    SaveHashToBufferEx<HashType, KeySaver>(hash, buffer, sh); +} + +template <class HashType> +inline void SaveHashToBuffer32(HashType& hash, TBuffer& buffer) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui32> KeySaver; +    SaveHashToBufferEx<HashType, KeySaver>(hash, buffer); +} + +template <class Iter, typename size_type_f = ui64> +class sthashtable { +public: +    typedef typename Iter::TKeyType key_type; +    typedef typename Iter::TValueType value_type; +    typedef typename Iter::THasherType hasher; +    typedef typename Iter::TKeyEqualType key_equal; + +    typedef size_type_f size_type; +    typedef ptrdiff_t difference_type; +    typedef const value_type* const_pointer; +    typedef const value_type& const_reference; + +    typedef Iter const_iterator; + +    const hasher hash_funct() const { +        return hash; +    } +    const key_equal key_eq() const { +        return equals; +    } + +private: +    const hasher hash; +    const key_equal equals; + +private: +    const_iterator iter_at_bucket(size_type bucket) const { +        return (const_iterator)(((char*)this + buckets()[bucket])); +    } + +    const_iterator iter_at_bucket_or_end(size_type bucket) const { +        if (bucket < num_buckets) +            return (const_iterator)(((char*)this + buckets()[bucket])); +        else +            return end(); +    } + +    const size_type num_buckets; +    const size_type num_elements; +    const size_type data_end_off; + +protected: //shut up gcc warning +    // we can't construct/destroy this object at all! +    sthashtable(); +    sthashtable(const sthashtable& ht); +    ~sthashtable(); + +public: +    //  const size_type *buckets; +    const size_type* buckets() const { +        return (size_type*)((char*)this + sizeof(*this)); +    } +    const size_type buckets(size_type n) const { +        return buckets()[n]; +    } + +    size_type size() const { +        return num_elements; +    } +    size_type max_size() const { +        return size_type(-1); +    } +    bool empty() const { +        return size() == 0; +    } + +    const_iterator begin() const { +        return num_buckets ? iter_at_bucket(0) : end(); +    } + +    const_iterator end() const { +        return (const_iterator)(((char*)this + data_end_off)); +    } + +public: +    size_type size_in_bytes() const { +        return data_end_off; +    } + +    size_type bucket_count() const { +        return num_buckets; +    } + +    size_type elems_in_bucket(size_type bucket) const { +        size_type result = 0; +        const_iterator first = iter_at_bucket(bucket); +        const_iterator last = iter_at_bucket_or_end(bucket + 1); + +        for (; first != last; ++first) +            ++result; +        return result; +    } + +    template <class TheKey> +    const_iterator find(const TheKey& key) const { +        size_type n = bkt_num_key(key); +        const_iterator first(iter_at_bucket(n)), last(iter_at_bucket_or_end(n + 1)); +        for (; +             first != last && !first.KeyEquals(equals, key); +             ++first) { +        } +        if (first != last) +            return first; +        return end(); +    } + +    size_type count(const key_type& key) const { +        const size_type n = bkt_num_key(key); +        size_type result = 0; +        const_iterator first = iter_at_bucket(n); +        const_iterator last = iter_at_bucket_or_end(n + 1); + +        for (; first != last; ++first) +            if (first.KeyEquals(equals, key)) +                ++result; +        return result; +    } + +    std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const; + +private: +    template <class TheKey> +    size_type bkt_num_key(const TheKey& key) const { +        return hash(key) % num_buckets; +    } +}; + +template <class I, class size_type_f> +std::pair<I, I> sthashtable<I, size_type_f>::equal_range(const key_type& key) const { +    typedef std::pair<const_iterator, const_iterator> pii; +    const size_type n = bkt_num_key(key); +    const_iterator first = iter_at_bucket(n); +    const_iterator last = iter_at_bucket_or_end(n + 1); + +    for (; first != last; ++first) { +        if (first.KeyEquals(equals, key)) { +            const_iterator cur = first; +            ++cur; +            for (; cur != last; ++cur) +                if (!cur.KeyEquals(equals, key)) +                    return pii(const_iterator(first), +                               const_iterator(cur)); +            return pii(const_iterator(first), +                       const_iterator(last)); +        } +    } +    return pii(end(), end()); +} + +/* end __SGI_STL_HASHTABLE_H */ + +template <class Key, class T, class HashFcn /*= hash<Key>*/, +          class EqualKey = TEqualTo<Key>, typename size_type_f = ui64> +class sthash { +private: +    typedef sthashtable<TSthashIterator<const Key, const T, HashFcn, EqualKey>, size_type_f> ht; +    ht rep; + +public: +    typedef typename ht::key_type key_type; +    typedef typename ht::value_type value_type; +    typedef typename ht::hasher hasher; +    typedef typename ht::key_equal key_equal; +    typedef T mapped_type; + +    typedef typename ht::size_type size_type; +    typedef typename ht::difference_type difference_type; +    typedef typename ht::const_pointer const_pointer; +    typedef typename ht::const_reference const_reference; + +    typedef typename ht::const_iterator const_iterator; + +    const hasher hash_funct() const { +        return rep.hash_funct(); +    } +    const key_equal key_eq() const { +        return rep.key_eq(); +    } + +public: +    size_type size() const { +        return rep.size(); +    } +    size_type max_size() const { +        return rep.max_size(); +    } +    bool empty() const { +        return rep.empty(); +    } + +    const_iterator begin() const { +        return rep.begin(); +    } +    const_iterator end() const { +        return rep.end(); +    } + +public: +    template <class TheKey> +    const_iterator find(const TheKey& key) const { +        return rep.find(key); +    } +    template <class TheKey> +    bool has(const TheKey& key) const { +        return rep.find(key) != rep.end(); +    } + +    size_type count(const key_type& key) const { +        return rep.count(key); +    } + +    std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const { +        return rep.equal_range(key); +    } + +    size_type size_in_bytes() const { +        return rep.size_in_bytes(); +    } + +    size_type bucket_count() const { +        return rep.bucket_count(); +    } +    size_type max_bucket_count() const { +        return rep.max_bucket_count(); +    } +    size_type elems_in_bucket(size_type n) const { +        return rep.elems_in_bucket(n); +    } + +    const size_type* buckets() const { +        return rep.buckets(); +    } +    const size_type buckets(size_type n) const { +        return rep.buckets()[n]; +    } +}; + +template <class Key, class HashFcn, +          class EqualKey = TEqualTo<Key>, typename size_type_f = ui64> +class sthash_set: public sthash<Key, TEmptyValue, HashFcn, EqualKey, size_type_f> { +    typedef sthash<Key, TEmptyValue, HashFcn, EqualKey, size_type_f> Base; + +public: +    using Base::const_iterator; +    using Base::hasher; +    using Base::key_equal; +    using Base::key_type; +    using Base::size_type; +    using Base::value_type; +}; + +template <class Key, class T, class HashFcn /*= hash<Key>*/, +          class EqualKey = TEqualTo<Key>, typename size_type_f = ui64> +class sthash_mm { +private: +    typedef sthashtable<TSthashIterator<const Key, T, HashFcn, EqualKey>, size_type_f> ht; +    ht rep; + +public: +    typedef typename ht::key_type key_type; +    typedef typename ht::value_type value_type; +    typedef typename ht::hasher hasher; +    typedef typename ht::key_equal key_equal; +    typedef T mapped_type; + +    typedef typename ht::size_type size_type; +    typedef typename ht::difference_type difference_type; +    typedef typename ht::const_pointer const_pointer; +    typedef typename ht::const_reference const_reference; + +    typedef typename ht::const_iterator const_iterator; + +    const hasher hash_funct() const { +        return rep.hash_funct(); +    } +    const key_equal key_eq() const { +        return rep.key_eq(); +    } + +public: +    size_type size() const { +        return rep.size(); +    } +    size_type max_size() const { +        return rep.max_size(); +    } +    bool empty() const { +        return rep.empty(); +    } + +    const_iterator begin() const { +        return rep.begin(); +    } +    const_iterator end() const { +        return rep.end(); +    } + +    const_iterator find(const key_type& key) const { +        return rep.find(key); +    } + +    size_type count(const key_type& key) const { +        return rep.count(key); +    } + +    std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const { +        return rep.equal_range(key); +    } + +    size_type bucket_count() const { +        return rep.bucket_count(); +    } +    size_type max_bucket_count() const { +        return rep.max_bucket_count(); +    } +    size_type elems_in_bucket(size_type n) const { +        return rep.elems_in_bucket(n); +    } +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif diff --git a/library/cpp/on_disk/st_hash/static_hash_map.h b/library/cpp/on_disk/st_hash/static_hash_map.h new file mode 100644 index 00000000000..5dc50abd392 --- /dev/null +++ b/library/cpp/on_disk/st_hash/static_hash_map.h @@ -0,0 +1,59 @@ +#pragma once + +#include "static_hash.h" + +#include <library/cpp/deprecated/mapped_file/mapped_file.h> + +#include <util/system/filemap.h> + +template <class SH> +struct sthash_mapped_c { +    typedef SH H; +    typedef typename H::const_iterator const_iterator; +    TMappedFile M; +    H* hsh; +    sthash_mapped_c() +        : M() +        , hsh(nullptr) +    { +    } +    sthash_mapped_c(const char* fname, bool precharge) +        : M() +        , hsh(nullptr) +    { +        Open(fname, precharge); +    } +    void Open(const char* fname, bool precharge) { +        M.init(fname); +        if (precharge) +            M.precharge(); +        hsh = (H*)M.getData(); +        if (M.getSize() < sizeof(H) || (ssize_t)M.getSize() != hsh->end().Data - (char*)hsh) +            ythrow yexception() << "Could not map hash: " << fname << " is damaged"; +    } +    H* operator->() { +        return hsh; +    } +    const H* operator->() const { +        return hsh; +    } +    H* GetSthash() { +        return hsh; +    } +    const H* GetSthash() const { +        return hsh; +    } +}; + +template <class Key, class T, class Hash> +struct sthash_mapped: public sthash_mapped_c<sthash<Key, T, Hash>> { +    typedef sthash<Key, T, Hash> H; +    sthash_mapped(const char* fname, bool precharge) +        : sthash_mapped_c<H>(fname, precharge) +    { +    } +    sthash_mapped() +        : sthash_mapped_c<H>() +    { +    } +}; diff --git a/library/cpp/on_disk/st_hash/sthash_iterators.h b/library/cpp/on_disk/st_hash/sthash_iterators.h new file mode 100644 index 00000000000..6a9ebdd6c3f --- /dev/null +++ b/library/cpp/on_disk/st_hash/sthash_iterators.h @@ -0,0 +1,334 @@ +#pragma once + +#include "save_stl.h" + +#include <util/system/align.h> + +/** +    This file provides functionality for saving some relatively simple THashMap object +    to disk in a form that can be mapped read-only (via mmap) at any address. +    That saved object is accessed via pointer to sthash object (that must have +    the same parameters as original THashMap object) + +    If either key or value are variable-sized (i.e. contain pointers), user must +    write his own instantiation of TSthashIterator (read iterator for sthash) and +    TSthashWriter (write iterator for THashMap). +    An example for <const char *, B> pair is in here. +**/ + +// TEmptyValue and SizeOfEx are helpers for sthash_set +struct TEmptyValue { +    TEmptyValue() = default; +}; + +template <class T> +inline size_t SizeOfEx() { +    return sizeof(T); +} + +template <> +inline size_t SizeOfEx<TEmptyValue>() { +    return 0; +} +template <> +inline size_t SizeOfEx<const TEmptyValue>() { +    return 0; +} + +template <class TKey, class TValue, class HashFcn, class EqualKey> +struct TSthashIterator { +    // Implementation for simple types +    typedef const TKey TKeyType; +    typedef const TValue TValueType; +    typedef EqualKey TKeyEqualType; +    typedef HashFcn THasherType; + +    const char* Data; +    TSthashIterator() +        : Data(nullptr) +    { +    } +    explicit TSthashIterator(const char* data) +        : Data(data) +    { +    } +    void operator++() { +        Data += GetLength(); +    } + +    bool operator!=(const TSthashIterator& that) const { +        return Data != that.Data; +    } +    bool operator==(const TSthashIterator& that) const { +        return Data == that.Data; +    } +    TKey& Key() const { +        return *(TKey*)Data; +    } +    TValue& Value() { +        return *(TValue*)(Data + sizeof(TKey)); +    } +    const TValue& Value() const { +        return *(const TValue*)(Data + sizeof(TKey)); +    } + +    template <class AnotherKeyType> +    bool KeyEquals(const EqualKey& eq, const AnotherKeyType& key) const { +        return eq(*(TKey*)Data, key); +    } + +    size_t GetLength() const { +        return sizeof(TKey) + SizeOfEx<TValue>(); +    } +}; + +template <class Key, class Value, typename size_type_o = ui64> +struct TSthashWriter { +    typedef size_type_o TSizeType; +    size_t GetRecordSize(const std::pair<const Key, const Value>&) const { +        return sizeof(Key) + SizeOfEx<Value>(); +    } +    int SaveRecord(IOutputStream* stream, const std::pair<const Key, const Value>& record) const { +        stream->Write(&record.first, sizeof(Key)); +        stream->Write(&record.second, SizeOfEx<Value>()); +        return 0; +    } +}; + +// Remember that this simplified implementation makes a copy of `key' in std::make_pair. +// It can also waste some memory on undesired alignment. +template <class Key, typename size_type_o = ui64> +struct TSthashSetWriter: public TSthashWriter<Key, TEmptyValue, size_type_o> { +    typedef TSthashWriter<Key, TEmptyValue, size_type_o> MapWriter; +    size_t GetRecordSize(const Key& key) const { +        return MapWriter::GetRecordSize(std::make_pair(key, TEmptyValue())); +    } +    int SaveRecord(IOutputStream* stream, const Key& key) const { +        return MapWriter::SaveRecord(stream, std::make_pair(key, TEmptyValue())); +    } +}; + +// we can't save something with pointers without additional tricks + +template <class A, class B, class HashFcn, class EqualKey> +struct TSthashIterator<A*, B, HashFcn, EqualKey> {}; + +template <class A, class B, class HashFcn, class EqualKey> +struct TSthashIterator<A, B*, HashFcn, EqualKey> {}; + +template <class A, class B, typename size_type_o> +struct TSthashWriter<A*, B*, size_type_o> {}; + +template <class A, class B, typename size_type_o> +struct TSthashWriter<A*, B, size_type_o> {}; + +template <class A, class B, typename size_type_o> +struct TSthashWriter<A, B*, size_type_o> {}; + +template <class T> +inline size_t AlignForChrKey() { +    return 4; // TODO: change this (requeres rebuilt of a few existing files) +} + +template <> +inline size_t AlignForChrKey<TEmptyValue>() { +    return 1; +} + +template <> +inline size_t AlignForChrKey<const TEmptyValue>() { +    return AlignForChrKey<TEmptyValue>(); +} + +// !! note that for char*, physical placement of key and value is swapped +template <class TValue, class HashFcn, class EqualKey> +struct TSthashIterator<const char* const, TValue, HashFcn, EqualKey> { +    typedef const TValue TValueType; +    typedef const char* TKeyType; +    typedef EqualKey TKeyEqualType; +    typedef HashFcn THasherType; + +    const char* Data; +    TSthashIterator() +        : Data(nullptr) +    { +    } +    TSthashIterator(const char* data) +        : Data(data) +    { +    } +    void operator++() { +        Data += GetLength(); +    } + +    bool operator!=(const TSthashIterator& that) const { +        return Data != that.Data; +    } +    bool operator==(const TSthashIterator& that) const { +        return Data == that.Data; +    } +    const char* Key() const { +        return Data + SizeOfEx<TValue>(); +    } +    TValue& Value() { +        return *(TValue*)Data; +    } +    const TValue& Value() const { +        return *(const TValue*)Data; +    } + +    template <class K> +    bool KeyEquals(const EqualKey& eq, const K& k) const { +        return eq(Data + SizeOfEx<TValue>(), k); +    } + +    size_t GetLength() const { +        size_t length = strlen(Data + SizeOfEx<TValue>()) + 1 + SizeOfEx<TValue>(); +        length = AlignUp(length, AlignForChrKey<TValue>()); +        return length; +    } +}; + +template <class Value, typename size_type_o> +struct TSthashWriter<const char*, Value, size_type_o> { +    typedef size_type_o TSizeType; +    size_t GetRecordSize(const std::pair<const char*, const Value>& record) const { +        size_t length = strlen(record.first) + 1 + SizeOfEx<Value>(); +        length = AlignUp(length, AlignForChrKey<Value>()); +        return length; +    } +    int SaveRecord(IOutputStream* stream, const std::pair<const char*, const Value>& record) const { +        const char* alignBuffer = "qqqq"; +        stream->Write(&record.second, SizeOfEx<Value>()); +        size_t length = strlen(record.first) + 1; +        stream->Write(record.first, length); +        length = AlignUpSpace(length, AlignForChrKey<Value>()); +        if (length) +            stream->Write(alignBuffer, length); +        return 0; +    } +}; + +template <class TKey, class HashFcn, class EqualKey> +struct TSthashIterator<TKey, const char* const, HashFcn, EqualKey> { +    typedef const TKey TKeyType; +    typedef const char* TValueType; +    typedef EqualKey TKeyEqualType; +    typedef HashFcn THasherType; + +    const char* Data; +    TSthashIterator() +        : Data(nullptr) +    { +    } +    TSthashIterator(const char* data) +        : Data(data) +    { +    } +    void operator++() { +        Data += GetLength(); +    } + +    bool operator!=(const TSthashIterator& that) const { +        return Data != that.Data; +    } +    bool operator==(const TSthashIterator& that) const { +        return Data == that.Data; +    } +    TKey& Key() { +        return *(TKey*)Data; +    } +    const char* Value() const { +        return Data + sizeof(TKey); +    } + +    template <class K> +    bool KeyEquals(const EqualKey& eq, const K& k) const { +        return eq(*(TKey*)Data, k); +    } + +    size_t GetLength() const { +        size_t length = strlen(Data + sizeof(TKey)) + 1 + sizeof(TKey); +        length = AlignUp(length, (size_t)4); +        return length; +    } +}; + +template <class Key, typename size_type_o> +struct TSthashWriter<Key, const char*, size_type_o> { +    typedef size_type_o TSizeType; +    size_t GetRecordSize(const std::pair<const Key, const char*>& record) const { +        size_t length = strlen(record.second) + 1 + sizeof(Key); +        length = AlignUp(length, (size_t)4); +        return length; +    } +    int SaveRecord(IOutputStream* stream, const std::pair<const Key, const char*>& record) const { +        const char* alignBuffer = "qqqq"; +        stream->Write(&record.first, sizeof(Key)); +        size_t length = strlen(record.second) + 1; +        stream->Write(record.second, length); +        length = AlignUpSpace(length, (size_t)4); +        if (length) +            stream->Write(alignBuffer, length); +        return 0; +    } +}; + +template <class HashFcn, class EqualKey> +struct TSthashIterator<const char* const, const char* const, HashFcn, EqualKey> { +    typedef const char* TKeyType; +    typedef const char* TValueType; +    typedef EqualKey TKeyEqualType; +    typedef HashFcn THasherType; + +    const char* Data; +    TSthashIterator() +        : Data(nullptr) +    { +    } +    TSthashIterator(const char* data) +        : Data(data) +    { +    } +    void operator++() { +        Data += GetLength(); +    } + +    bool operator!=(const TSthashIterator& that) const { +        return Data != that.Data; +    } +    bool operator==(const TSthashIterator& that) const { +        return Data == that.Data; +    } +    const char* Key() const { +        return Data; +    } +    const char* Value() const { +        return Data + strlen(Data) + 1; +    } + +    template <class K> +    bool KeyEquals(const EqualKey& eq, const K& k) const { +        return eq(Data, k); +    } + +    size_t GetLength() const { +        size_t length = strlen(Data) + 1; +        length += strlen(Data + length) + 1; +        return length; +    } +}; + +template <typename size_type_o> +struct TSthashWriter<const char*, const char*, size_type_o> { +    typedef size_type_o TSizeType; +    size_t GetRecordSize(const std::pair<const char*, const char*>& record) const { +        size_t size = strlen(record.first) + strlen(record.second) + 2; +        return size; +    } +    int SaveRecord(IOutputStream* stream, const std::pair<const char*, const char*>& record) const { +        stream->Write(record.first, strlen(record.first) + 1); +        stream->Write(record.second, strlen(record.second) + 1); +        return 0; +    } +}; diff --git a/library/cpp/on_disk/st_hash/ya.make b/library/cpp/on_disk/st_hash/ya.make new file mode 100644 index 00000000000..8c6d05711c3 --- /dev/null +++ b/library/cpp/on_disk/st_hash/ya.make @@ -0,0 +1,15 @@ +LIBRARY() + +SRCS( +    fake.cpp +    save_stl.h +    static_hash.h +    static_hash_map.h +    sthash_iterators.h +) + +PEERDIR( +    library/cpp/deprecated/mapped_file +) + +END() | 
