diff options
| author | vvvv <[email protected]> | 2023-07-31 18:21:04 +0300 | 
|---|---|---|
| committer | vvvv <[email protected]> | 2023-07-31 18:21:04 +0300 | 
| commit | dec41c40e51aa407edef81a3c566a5a15780fc49 (patch) | |
| tree | 4f197b596b32f35eca368121f0dff913419da9af /library/cpp/on_disk | |
| parent | 3ca8b54c96e09eb2b65be7f09675623438d559c7 (diff) | |
YQL-16239 Move purecalc to public
Diffstat (limited to 'library/cpp/on_disk')
| -rw-r--r-- | library/cpp/on_disk/CMakeLists.txt | 1 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/CMakeLists.darwin-x86_64.txt | 18 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/CMakeLists.linux-aarch64.txt | 19 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/CMakeLists.linux-x86_64.txt | 19 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/CMakeLists.txt | 17 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/CMakeLists.windows-x86_64.txt | 18 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/fake.cpp | 4 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/save_stl.h | 84 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/static_hash.h | 420 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/static_hash_map.h | 59 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/sthash_iterators.h | 334 | ||||
| -rw-r--r-- | library/cpp/on_disk/st_hash/ya.make | 15 | 
12 files changed, 1008 insertions, 0 deletions
| diff --git a/library/cpp/on_disk/CMakeLists.txt b/library/cpp/on_disk/CMakeLists.txt index 42029471693..ade3b33c9ab 100644 --- a/library/cpp/on_disk/CMakeLists.txt +++ b/library/cpp/on_disk/CMakeLists.txt @@ -7,3 +7,4 @@  add_subdirectory(chunks) +add_subdirectory(st_hash) diff --git a/library/cpp/on_disk/st_hash/CMakeLists.darwin-x86_64.txt b/library/cpp/on_disk/st_hash/CMakeLists.darwin-x86_64.txt new file mode 100644 index 00000000000..ad332fef627 --- /dev/null +++ b/library/cpp/on_disk/st_hash/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-on_disk-st_hash) +target_link_libraries(cpp-on_disk-st_hash PUBLIC +  contrib-libs-cxxsupp +  yutil +  cpp-deprecated-mapped_file +) +target_sources(cpp-on_disk-st_hash PRIVATE +  ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp +) diff --git a/library/cpp/on_disk/st_hash/CMakeLists.linux-aarch64.txt b/library/cpp/on_disk/st_hash/CMakeLists.linux-aarch64.txt new file mode 100644 index 00000000000..737875ca6c5 --- /dev/null +++ b/library/cpp/on_disk/st_hash/CMakeLists.linux-aarch64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-on_disk-st_hash) +target_link_libraries(cpp-on_disk-st_hash PUBLIC +  contrib-libs-linux-headers +  contrib-libs-cxxsupp +  yutil +  cpp-deprecated-mapped_file +) +target_sources(cpp-on_disk-st_hash PRIVATE +  ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp +) diff --git a/library/cpp/on_disk/st_hash/CMakeLists.linux-x86_64.txt b/library/cpp/on_disk/st_hash/CMakeLists.linux-x86_64.txt new file mode 100644 index 00000000000..737875ca6c5 --- /dev/null +++ b/library/cpp/on_disk/st_hash/CMakeLists.linux-x86_64.txt @@ -0,0 +1,19 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-on_disk-st_hash) +target_link_libraries(cpp-on_disk-st_hash PUBLIC +  contrib-libs-linux-headers +  contrib-libs-cxxsupp +  yutil +  cpp-deprecated-mapped_file +) +target_sources(cpp-on_disk-st_hash PRIVATE +  ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp +) diff --git a/library/cpp/on_disk/st_hash/CMakeLists.txt b/library/cpp/on_disk/st_hash/CMakeLists.txt new file mode 100644 index 00000000000..f8b31df0c11 --- /dev/null +++ b/library/cpp/on_disk/st_hash/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) +  include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") +  include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) +  include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) +  include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/on_disk/st_hash/CMakeLists.windows-x86_64.txt b/library/cpp/on_disk/st_hash/CMakeLists.windows-x86_64.txt new file mode 100644 index 00000000000..ad332fef627 --- /dev/null +++ b/library/cpp/on_disk/st_hash/CMakeLists.windows-x86_64.txt @@ -0,0 +1,18 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-on_disk-st_hash) +target_link_libraries(cpp-on_disk-st_hash PUBLIC +  contrib-libs-cxxsupp +  yutil +  cpp-deprecated-mapped_file +) +target_sources(cpp-on_disk-st_hash PRIVATE +  ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp +) diff --git a/library/cpp/on_disk/st_hash/fake.cpp b/library/cpp/on_disk/st_hash/fake.cpp new file mode 100644 index 00000000000..ef5af4d432b --- /dev/null +++ b/library/cpp/on_disk/st_hash/fake.cpp @@ -0,0 +1,4 @@ +#include "save_stl.h" +#include "static_hash.h" +#include "static_hash_map.h" +#include "sthash_iterators.h" diff --git a/library/cpp/on_disk/st_hash/save_stl.h b/library/cpp/on_disk/st_hash/save_stl.h new file mode 100644 index 00000000000..00f8f0e20db --- /dev/null +++ b/library/cpp/on_disk/st_hash/save_stl.h @@ -0,0 +1,84 @@ +#pragma once + +#include <util/generic/hash.h> +#include <util/system/yassert.h> +#include <util/stream/output.h> + +// this structure might be replaced with sthashtable class +template <class HF, class Eq, class size_type> +struct sthashtable_nvm_sv { +    sthashtable_nvm_sv() { +        if (sizeof(sthashtable_nvm_sv) != sizeof(HF) + sizeof(Eq) + 3 * sizeof(size_type)) { +            memset(this, 0, sizeof(sthashtable_nvm_sv)); +        } +    } + +    sthashtable_nvm_sv(const HF& phf, const Eq& peq, const size_type& pnb, const size_type& pne, const size_type& pnd) +        : sthashtable_nvm_sv() +    { +        hf = phf; +        eq = peq; +        num_buckets = pnb; +        num_elements = pne; +        data_end_off = pnd; +    } + +    HF hf; +    Eq eq; +    size_type num_buckets; +    size_type num_elements; +    size_type data_end_off; +}; + +/** + * Some hack to save both THashMap and sthash. + * Working with stHash does not depend on the template parameters, because the content of stHash is not used inside this method. + */ +template <class V, class K, class HF, class Ex, class Eq, class A> +template <class KeySaver> +inline int THashTable<V, K, HF, Ex, Eq, A>::save_for_st(IOutputStream* stream, KeySaver& ks, sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* stHash) const { +    Y_ASSERT(!stHash || stHash->bucket_count() == bucket_count()); +    typedef sthashtable_nvm_sv<HF, Eq, typename KeySaver::TSizeType> sv_type; +    sv_type sv = {this->_get_hash_fun(), this->_get_key_eq(), static_cast<typename KeySaver::TSizeType>(buckets.size()), static_cast<typename KeySaver::TSizeType>(num_elements), 0}; +    // to do: m.b. use just the size of corresponding object? +    typename KeySaver::TSizeType cur_off = sizeof(sv_type) + +                                           (sv.num_buckets + 1) * sizeof(typename KeySaver::TSizeType); +    sv.data_end_off = cur_off; +    const_iterator n; +    for (n = begin(); n != end(); ++n) { +        sv.data_end_off += static_cast<typename KeySaver::TSizeType>(ks.GetRecordSize(*n)); +    } +    typename KeySaver::TSizeType* sb = stHash ? (typename KeySaver::TSizeType*)(stHash->buckets()) : nullptr; +    if (stHash) +        sv.data_end_off += static_cast<typename KeySaver::TSizeType>(sb[buckets.size()] - sb[0]); +    //saver.Align(sizeof(char*)); +    stream->Write(&sv, sizeof(sv)); + +    size_type i; +    //save vector +    for (i = 0; i < buckets.size(); ++i) { +        node* cur = buckets[i]; +        stream->Write(&cur_off, sizeof(cur_off)); +        if (cur) { +            while (!((uintptr_t)cur & 1)) { +                cur_off += static_cast<typename KeySaver::TSizeType>(ks.GetRecordSize(cur->val)); +                cur = cur->next; +            } +        } +        if (stHash) +            cur_off += static_cast<typename KeySaver::TSizeType>(sb[i + 1] - sb[i]); +    } +    stream->Write(&cur_off, sizeof(cur_off)); // end mark +    for (i = 0; i < buckets.size(); ++i) { +        node* cur = buckets[i]; +        if (cur) { +            while (!((uintptr_t)cur & 1)) { +                ks.SaveRecord(stream, cur->val); +                cur = cur->next; +            } +        } +        if (stHash) +            stream->Write((const char*)stHash + sb[i], sb[i + 1] - sb[i]); +    } +    return 0; +} diff --git a/library/cpp/on_disk/st_hash/static_hash.h b/library/cpp/on_disk/st_hash/static_hash.h new file mode 100644 index 00000000000..ca7a6ccd369 --- /dev/null +++ b/library/cpp/on_disk/st_hash/static_hash.h @@ -0,0 +1,420 @@ +#pragma once + +#include "save_stl.h" +#include "sthash_iterators.h" + +#include <util/generic/hash.h> +#include <util/generic/vector.h> +#include <util/generic/buffer.h> +#include <util/generic/cast.h> +#include <util/generic/yexception.h> // for save/load only +#include <util/stream/file.h> +#include <util/stream/buffer.h> +#include <utility> + +#include <memory> +#include <algorithm> +#include <functional> + +#include <cstdlib> +#include <cstddef> + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4624) // 'destructor could not be generated because a base class destructor is inaccessible' +#endif + +template <class HashType, class KeySaver> +inline void SaveHashToStreamEx(HashType& hash, IOutputStream* stream) { +    KeySaver ks; +    if (hash.save_for_st(stream, ks)) +        ythrow yexception() << "Could not save hash to stream"; +} + +template <class HashType> +inline void SaveHashToStream(HashType& hash, IOutputStream* stream) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; +    return SaveHashToStreamEx<HashType, KeySaver>(hash, stream); +} + +template <class HashType, class KeySaver> +inline void SaveHashToFileEx(HashType& hash, const char* fileName) { +    TFileOutput output(fileName); +    SaveHashToStreamEx<HashType, KeySaver>(hash, &output); +} + +template <class HashType> +inline void SaveHashToFile(HashType& hash, const char* fileName) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; +    return SaveHashToFileEx<HashType, KeySaver>(hash, fileName); +} + +template <class HashType> +inline void SaveHashSetToFile(HashType& hash, const char* fileName) { +    typedef TSthashSetWriter<typename HashType::key_type, ui64> KeySaver; +    return SaveHashToFileEx<HashType, KeySaver>(hash, fileName); +} + +template <class HashType> +inline void SaveHashToFile32(HashType& hash, const char* fileName) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui32> KeySaver; +    return SaveHashToFileEx<HashType, KeySaver>(hash, fileName); +} + +template <class HashType, class KeySaver> +inline void SaveHashToBufferEx(HashType& hash, TBuffer& buffer, sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* stHash = nullptr) { +    TBufferOutput stream(buffer); +    KeySaver ks; +    if (hash.save_for_st(&stream, ks, stHash)) +        ythrow yexception() << "Could not save hash to memory"; +} + +template <class HashType> +inline void SaveHashToBuffer(HashType& hash, TBuffer& buffer) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; +    SaveHashToBufferEx<HashType, KeySaver>(hash, buffer); +} + +/** + * Some hack to save both THashMap and sthash. + * THashMap and sthash must have same bucket_count(). + */ +template <class HashType, class StHashType> +inline void SaveHashToBuffer(HashType& hash, TBuffer& buffer, StHashType* stHash) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver; +    typedef sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* SH; + +    SH sh = reinterpret_cast<SH>(stHash); +    SaveHashToBufferEx<HashType, KeySaver>(hash, buffer, sh); +} + +template <class HashType> +inline void SaveHashToBuffer32(HashType& hash, TBuffer& buffer) { +    typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui32> KeySaver; +    SaveHashToBufferEx<HashType, KeySaver>(hash, buffer); +} + +template <class Iter, typename size_type_f = ui64> +class sthashtable { +public: +    typedef typename Iter::TKeyType key_type; +    typedef typename Iter::TValueType value_type; +    typedef typename Iter::THasherType hasher; +    typedef typename Iter::TKeyEqualType key_equal; + +    typedef size_type_f size_type; +    typedef ptrdiff_t difference_type; +    typedef const value_type* const_pointer; +    typedef const value_type& const_reference; + +    typedef Iter const_iterator; + +    const hasher hash_funct() const { +        return hash; +    } +    const key_equal key_eq() const { +        return equals; +    } + +private: +    const hasher hash; +    const key_equal equals; + +private: +    const_iterator iter_at_bucket(size_type bucket) const { +        return (const_iterator)(((char*)this + buckets()[bucket])); +    } + +    const_iterator iter_at_bucket_or_end(size_type bucket) const { +        if (bucket < num_buckets) +            return (const_iterator)(((char*)this + buckets()[bucket])); +        else +            return end(); +    } + +    const size_type num_buckets; +    const size_type num_elements; +    const size_type data_end_off; + +protected: //shut up gcc warning +    // we can't construct/destroy this object at all! +    sthashtable(); +    sthashtable(const sthashtable& ht); +    ~sthashtable(); + +public: +    //  const size_type *buckets; +    const size_type* buckets() const { +        return (size_type*)((char*)this + sizeof(*this)); +    } +    const size_type buckets(size_type n) const { +        return buckets()[n]; +    } + +    size_type size() const { +        return num_elements; +    } +    size_type max_size() const { +        return size_type(-1); +    } +    bool empty() const { +        return size() == 0; +    } + +    const_iterator begin() const { +        return num_buckets ? iter_at_bucket(0) : end(); +    } + +    const_iterator end() const { +        return (const_iterator)(((char*)this + data_end_off)); +    } + +public: +    size_type size_in_bytes() const { +        return data_end_off; +    } + +    size_type bucket_count() const { +        return num_buckets; +    } + +    size_type elems_in_bucket(size_type bucket) const { +        size_type result = 0; +        const_iterator first = iter_at_bucket(bucket); +        const_iterator last = iter_at_bucket_or_end(bucket + 1); + +        for (; first != last; ++first) +            ++result; +        return result; +    } + +    template <class TheKey> +    const_iterator find(const TheKey& key) const { +        size_type n = bkt_num_key(key); +        const_iterator first(iter_at_bucket(n)), last(iter_at_bucket_or_end(n + 1)); +        for (; +             first != last && !first.KeyEquals(equals, key); +             ++first) { +        } +        if (first != last) +            return first; +        return end(); +    } + +    size_type count(const key_type& key) const { +        const size_type n = bkt_num_key(key); +        size_type result = 0; +        const_iterator first = iter_at_bucket(n); +        const_iterator last = iter_at_bucket_or_end(n + 1); + +        for (; first != last; ++first) +            if (first.KeyEquals(equals, key)) +                ++result; +        return result; +    } + +    std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const; + +private: +    template <class TheKey> +    size_type bkt_num_key(const TheKey& key) const { +        return hash(key) % num_buckets; +    } +}; + +template <class I, class size_type_f> +std::pair<I, I> sthashtable<I, size_type_f>::equal_range(const key_type& key) const { +    typedef std::pair<const_iterator, const_iterator> pii; +    const size_type n = bkt_num_key(key); +    const_iterator first = iter_at_bucket(n); +    const_iterator last = iter_at_bucket_or_end(n + 1); + +    for (; first != last; ++first) { +        if (first.KeyEquals(equals, key)) { +            const_iterator cur = first; +            ++cur; +            for (; cur != last; ++cur) +                if (!cur.KeyEquals(equals, key)) +                    return pii(const_iterator(first), +                               const_iterator(cur)); +            return pii(const_iterator(first), +                       const_iterator(last)); +        } +    } +    return pii(end(), end()); +} + +/* end __SGI_STL_HASHTABLE_H */ + +template <class Key, class T, class HashFcn /*= hash<Key>*/, +          class EqualKey = TEqualTo<Key>, typename size_type_f = ui64> +class sthash { +private: +    typedef sthashtable<TSthashIterator<const Key, const T, HashFcn, EqualKey>, size_type_f> ht; +    ht rep; + +public: +    typedef typename ht::key_type key_type; +    typedef typename ht::value_type value_type; +    typedef typename ht::hasher hasher; +    typedef typename ht::key_equal key_equal; +    typedef T mapped_type; + +    typedef typename ht::size_type size_type; +    typedef typename ht::difference_type difference_type; +    typedef typename ht::const_pointer const_pointer; +    typedef typename ht::const_reference const_reference; + +    typedef typename ht::const_iterator const_iterator; + +    const hasher hash_funct() const { +        return rep.hash_funct(); +    } +    const key_equal key_eq() const { +        return rep.key_eq(); +    } + +public: +    size_type size() const { +        return rep.size(); +    } +    size_type max_size() const { +        return rep.max_size(); +    } +    bool empty() const { +        return rep.empty(); +    } + +    const_iterator begin() const { +        return rep.begin(); +    } +    const_iterator end() const { +        return rep.end(); +    } + +public: +    template <class TheKey> +    const_iterator find(const TheKey& key) const { +        return rep.find(key); +    } +    template <class TheKey> +    bool has(const TheKey& key) const { +        return rep.find(key) != rep.end(); +    } + +    size_type count(const key_type& key) const { +        return rep.count(key); +    } + +    std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const { +        return rep.equal_range(key); +    } + +    size_type size_in_bytes() const { +        return rep.size_in_bytes(); +    } + +    size_type bucket_count() const { +        return rep.bucket_count(); +    } +    size_type max_bucket_count() const { +        return rep.max_bucket_count(); +    } +    size_type elems_in_bucket(size_type n) const { +        return rep.elems_in_bucket(n); +    } + +    const size_type* buckets() const { +        return rep.buckets(); +    } +    const size_type buckets(size_type n) const { +        return rep.buckets()[n]; +    } +}; + +template <class Key, class HashFcn, +          class EqualKey = TEqualTo<Key>, typename size_type_f = ui64> +class sthash_set: public sthash<Key, TEmptyValue, HashFcn, EqualKey, size_type_f> { +    typedef sthash<Key, TEmptyValue, HashFcn, EqualKey, size_type_f> Base; + +public: +    using Base::const_iterator; +    using Base::hasher; +    using Base::key_equal; +    using Base::key_type; +    using Base::size_type; +    using Base::value_type; +}; + +template <class Key, class T, class HashFcn /*= hash<Key>*/, +          class EqualKey = TEqualTo<Key>, typename size_type_f = ui64> +class sthash_mm { +private: +    typedef sthashtable<TSthashIterator<const Key, T, HashFcn, EqualKey>, size_type_f> ht; +    ht rep; + +public: +    typedef typename ht::key_type key_type; +    typedef typename ht::value_type value_type; +    typedef typename ht::hasher hasher; +    typedef typename ht::key_equal key_equal; +    typedef T mapped_type; + +    typedef typename ht::size_type size_type; +    typedef typename ht::difference_type difference_type; +    typedef typename ht::const_pointer const_pointer; +    typedef typename ht::const_reference const_reference; + +    typedef typename ht::const_iterator const_iterator; + +    const hasher hash_funct() const { +        return rep.hash_funct(); +    } +    const key_equal key_eq() const { +        return rep.key_eq(); +    } + +public: +    size_type size() const { +        return rep.size(); +    } +    size_type max_size() const { +        return rep.max_size(); +    } +    bool empty() const { +        return rep.empty(); +    } + +    const_iterator begin() const { +        return rep.begin(); +    } +    const_iterator end() const { +        return rep.end(); +    } + +    const_iterator find(const key_type& key) const { +        return rep.find(key); +    } + +    size_type count(const key_type& key) const { +        return rep.count(key); +    } + +    std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const { +        return rep.equal_range(key); +    } + +    size_type bucket_count() const { +        return rep.bucket_count(); +    } +    size_type max_bucket_count() const { +        return rep.max_bucket_count(); +    } +    size_type elems_in_bucket(size_type n) const { +        return rep.elems_in_bucket(n); +    } +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif diff --git a/library/cpp/on_disk/st_hash/static_hash_map.h b/library/cpp/on_disk/st_hash/static_hash_map.h new file mode 100644 index 00000000000..5dc50abd392 --- /dev/null +++ b/library/cpp/on_disk/st_hash/static_hash_map.h @@ -0,0 +1,59 @@ +#pragma once + +#include "static_hash.h" + +#include <library/cpp/deprecated/mapped_file/mapped_file.h> + +#include <util/system/filemap.h> + +template <class SH> +struct sthash_mapped_c { +    typedef SH H; +    typedef typename H::const_iterator const_iterator; +    TMappedFile M; +    H* hsh; +    sthash_mapped_c() +        : M() +        , hsh(nullptr) +    { +    } +    sthash_mapped_c(const char* fname, bool precharge) +        : M() +        , hsh(nullptr) +    { +        Open(fname, precharge); +    } +    void Open(const char* fname, bool precharge) { +        M.init(fname); +        if (precharge) +            M.precharge(); +        hsh = (H*)M.getData(); +        if (M.getSize() < sizeof(H) || (ssize_t)M.getSize() != hsh->end().Data - (char*)hsh) +            ythrow yexception() << "Could not map hash: " << fname << " is damaged"; +    } +    H* operator->() { +        return hsh; +    } +    const H* operator->() const { +        return hsh; +    } +    H* GetSthash() { +        return hsh; +    } +    const H* GetSthash() const { +        return hsh; +    } +}; + +template <class Key, class T, class Hash> +struct sthash_mapped: public sthash_mapped_c<sthash<Key, T, Hash>> { +    typedef sthash<Key, T, Hash> H; +    sthash_mapped(const char* fname, bool precharge) +        : sthash_mapped_c<H>(fname, precharge) +    { +    } +    sthash_mapped() +        : sthash_mapped_c<H>() +    { +    } +}; diff --git a/library/cpp/on_disk/st_hash/sthash_iterators.h b/library/cpp/on_disk/st_hash/sthash_iterators.h new file mode 100644 index 00000000000..6a9ebdd6c3f --- /dev/null +++ b/library/cpp/on_disk/st_hash/sthash_iterators.h @@ -0,0 +1,334 @@ +#pragma once + +#include "save_stl.h" + +#include <util/system/align.h> + +/** +    This file provides functionality for saving some relatively simple THashMap object +    to disk in a form that can be mapped read-only (via mmap) at any address. +    That saved object is accessed via pointer to sthash object (that must have +    the same parameters as original THashMap object) + +    If either key or value are variable-sized (i.e. contain pointers), user must +    write his own instantiation of TSthashIterator (read iterator for sthash) and +    TSthashWriter (write iterator for THashMap). +    An example for <const char *, B> pair is in here. +**/ + +// TEmptyValue and SizeOfEx are helpers for sthash_set +struct TEmptyValue { +    TEmptyValue() = default; +}; + +template <class T> +inline size_t SizeOfEx() { +    return sizeof(T); +} + +template <> +inline size_t SizeOfEx<TEmptyValue>() { +    return 0; +} +template <> +inline size_t SizeOfEx<const TEmptyValue>() { +    return 0; +} + +template <class TKey, class TValue, class HashFcn, class EqualKey> +struct TSthashIterator { +    // Implementation for simple types +    typedef const TKey TKeyType; +    typedef const TValue TValueType; +    typedef EqualKey TKeyEqualType; +    typedef HashFcn THasherType; + +    const char* Data; +    TSthashIterator() +        : Data(nullptr) +    { +    } +    explicit TSthashIterator(const char* data) +        : Data(data) +    { +    } +    void operator++() { +        Data += GetLength(); +    } + +    bool operator!=(const TSthashIterator& that) const { +        return Data != that.Data; +    } +    bool operator==(const TSthashIterator& that) const { +        return Data == that.Data; +    } +    TKey& Key() const { +        return *(TKey*)Data; +    } +    TValue& Value() { +        return *(TValue*)(Data + sizeof(TKey)); +    } +    const TValue& Value() const { +        return *(const TValue*)(Data + sizeof(TKey)); +    } + +    template <class AnotherKeyType> +    bool KeyEquals(const EqualKey& eq, const AnotherKeyType& key) const { +        return eq(*(TKey*)Data, key); +    } + +    size_t GetLength() const { +        return sizeof(TKey) + SizeOfEx<TValue>(); +    } +}; + +template <class Key, class Value, typename size_type_o = ui64> +struct TSthashWriter { +    typedef size_type_o TSizeType; +    size_t GetRecordSize(const std::pair<const Key, const Value>&) const { +        return sizeof(Key) + SizeOfEx<Value>(); +    } +    int SaveRecord(IOutputStream* stream, const std::pair<const Key, const Value>& record) const { +        stream->Write(&record.first, sizeof(Key)); +        stream->Write(&record.second, SizeOfEx<Value>()); +        return 0; +    } +}; + +// Remember that this simplified implementation makes a copy of `key' in std::make_pair. +// It can also waste some memory on undesired alignment. +template <class Key, typename size_type_o = ui64> +struct TSthashSetWriter: public TSthashWriter<Key, TEmptyValue, size_type_o> { +    typedef TSthashWriter<Key, TEmptyValue, size_type_o> MapWriter; +    size_t GetRecordSize(const Key& key) const { +        return MapWriter::GetRecordSize(std::make_pair(key, TEmptyValue())); +    } +    int SaveRecord(IOutputStream* stream, const Key& key) const { +        return MapWriter::SaveRecord(stream, std::make_pair(key, TEmptyValue())); +    } +}; + +// we can't save something with pointers without additional tricks + +template <class A, class B, class HashFcn, class EqualKey> +struct TSthashIterator<A*, B, HashFcn, EqualKey> {}; + +template <class A, class B, class HashFcn, class EqualKey> +struct TSthashIterator<A, B*, HashFcn, EqualKey> {}; + +template <class A, class B, typename size_type_o> +struct TSthashWriter<A*, B*, size_type_o> {}; + +template <class A, class B, typename size_type_o> +struct TSthashWriter<A*, B, size_type_o> {}; + +template <class A, class B, typename size_type_o> +struct TSthashWriter<A, B*, size_type_o> {}; + +template <class T> +inline size_t AlignForChrKey() { +    return 4; // TODO: change this (requeres rebuilt of a few existing files) +} + +template <> +inline size_t AlignForChrKey<TEmptyValue>() { +    return 1; +} + +template <> +inline size_t AlignForChrKey<const TEmptyValue>() { +    return AlignForChrKey<TEmptyValue>(); +} + +// !! note that for char*, physical placement of key and value is swapped +template <class TValue, class HashFcn, class EqualKey> +struct TSthashIterator<const char* const, TValue, HashFcn, EqualKey> { +    typedef const TValue TValueType; +    typedef const char* TKeyType; +    typedef EqualKey TKeyEqualType; +    typedef HashFcn THasherType; + +    const char* Data; +    TSthashIterator() +        : Data(nullptr) +    { +    } +    TSthashIterator(const char* data) +        : Data(data) +    { +    } +    void operator++() { +        Data += GetLength(); +    } + +    bool operator!=(const TSthashIterator& that) const { +        return Data != that.Data; +    } +    bool operator==(const TSthashIterator& that) const { +        return Data == that.Data; +    } +    const char* Key() const { +        return Data + SizeOfEx<TValue>(); +    } +    TValue& Value() { +        return *(TValue*)Data; +    } +    const TValue& Value() const { +        return *(const TValue*)Data; +    } + +    template <class K> +    bool KeyEquals(const EqualKey& eq, const K& k) const { +        return eq(Data + SizeOfEx<TValue>(), k); +    } + +    size_t GetLength() const { +        size_t length = strlen(Data + SizeOfEx<TValue>()) + 1 + SizeOfEx<TValue>(); +        length = AlignUp(length, AlignForChrKey<TValue>()); +        return length; +    } +}; + +template <class Value, typename size_type_o> +struct TSthashWriter<const char*, Value, size_type_o> { +    typedef size_type_o TSizeType; +    size_t GetRecordSize(const std::pair<const char*, const Value>& record) const { +        size_t length = strlen(record.first) + 1 + SizeOfEx<Value>(); +        length = AlignUp(length, AlignForChrKey<Value>()); +        return length; +    } +    int SaveRecord(IOutputStream* stream, const std::pair<const char*, const Value>& record) const { +        const char* alignBuffer = "qqqq"; +        stream->Write(&record.second, SizeOfEx<Value>()); +        size_t length = strlen(record.first) + 1; +        stream->Write(record.first, length); +        length = AlignUpSpace(length, AlignForChrKey<Value>()); +        if (length) +            stream->Write(alignBuffer, length); +        return 0; +    } +}; + +template <class TKey, class HashFcn, class EqualKey> +struct TSthashIterator<TKey, const char* const, HashFcn, EqualKey> { +    typedef const TKey TKeyType; +    typedef const char* TValueType; +    typedef EqualKey TKeyEqualType; +    typedef HashFcn THasherType; + +    const char* Data; +    TSthashIterator() +        : Data(nullptr) +    { +    } +    TSthashIterator(const char* data) +        : Data(data) +    { +    } +    void operator++() { +        Data += GetLength(); +    } + +    bool operator!=(const TSthashIterator& that) const { +        return Data != that.Data; +    } +    bool operator==(const TSthashIterator& that) const { +        return Data == that.Data; +    } +    TKey& Key() { +        return *(TKey*)Data; +    } +    const char* Value() const { +        return Data + sizeof(TKey); +    } + +    template <class K> +    bool KeyEquals(const EqualKey& eq, const K& k) const { +        return eq(*(TKey*)Data, k); +    } + +    size_t GetLength() const { +        size_t length = strlen(Data + sizeof(TKey)) + 1 + sizeof(TKey); +        length = AlignUp(length, (size_t)4); +        return length; +    } +}; + +template <class Key, typename size_type_o> +struct TSthashWriter<Key, const char*, size_type_o> { +    typedef size_type_o TSizeType; +    size_t GetRecordSize(const std::pair<const Key, const char*>& record) const { +        size_t length = strlen(record.second) + 1 + sizeof(Key); +        length = AlignUp(length, (size_t)4); +        return length; +    } +    int SaveRecord(IOutputStream* stream, const std::pair<const Key, const char*>& record) const { +        const char* alignBuffer = "qqqq"; +        stream->Write(&record.first, sizeof(Key)); +        size_t length = strlen(record.second) + 1; +        stream->Write(record.second, length); +        length = AlignUpSpace(length, (size_t)4); +        if (length) +            stream->Write(alignBuffer, length); +        return 0; +    } +}; + +template <class HashFcn, class EqualKey> +struct TSthashIterator<const char* const, const char* const, HashFcn, EqualKey> { +    typedef const char* TKeyType; +    typedef const char* TValueType; +    typedef EqualKey TKeyEqualType; +    typedef HashFcn THasherType; + +    const char* Data; +    TSthashIterator() +        : Data(nullptr) +    { +    } +    TSthashIterator(const char* data) +        : Data(data) +    { +    } +    void operator++() { +        Data += GetLength(); +    } + +    bool operator!=(const TSthashIterator& that) const { +        return Data != that.Data; +    } +    bool operator==(const TSthashIterator& that) const { +        return Data == that.Data; +    } +    const char* Key() const { +        return Data; +    } +    const char* Value() const { +        return Data + strlen(Data) + 1; +    } + +    template <class K> +    bool KeyEquals(const EqualKey& eq, const K& k) const { +        return eq(Data, k); +    } + +    size_t GetLength() const { +        size_t length = strlen(Data) + 1; +        length += strlen(Data + length) + 1; +        return length; +    } +}; + +template <typename size_type_o> +struct TSthashWriter<const char*, const char*, size_type_o> { +    typedef size_type_o TSizeType; +    size_t GetRecordSize(const std::pair<const char*, const char*>& record) const { +        size_t size = strlen(record.first) + strlen(record.second) + 2; +        return size; +    } +    int SaveRecord(IOutputStream* stream, const std::pair<const char*, const char*>& record) const { +        stream->Write(record.first, strlen(record.first) + 1); +        stream->Write(record.second, strlen(record.second) + 1); +        return 0; +    } +}; diff --git a/library/cpp/on_disk/st_hash/ya.make b/library/cpp/on_disk/st_hash/ya.make new file mode 100644 index 00000000000..8c6d05711c3 --- /dev/null +++ b/library/cpp/on_disk/st_hash/ya.make @@ -0,0 +1,15 @@ +LIBRARY() + +SRCS( +    fake.cpp +    save_stl.h +    static_hash.h +    static_hash_map.h +    sthash_iterators.h +) + +PEERDIR( +    library/cpp/deprecated/mapped_file +) + +END() | 
