aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/containers/comptrie/comptrie_builder.h
blob: f8a4926ef0366ed53f62428c4e1fe5aa85b77ce9 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#pragma once

#include "comptrie_packer.h"
#include "minimize.h"
#include "key_selector.h"

#include <util/stream/file.h>

// --------------------------------------------------------------------------------------
// Data Builder
// To build the data buffer, we first create an automaton in memory. The automaton
// is created incrementally. It actually helps a lot to have the input data prefix-grouped
// by key; otherwise, memory consumption becomes a tough issue.
// NOTE: building and serializing the automaton may be lengthy, and takes lots of memory.

// PREFIX_GROUPED means that if we, while constructing a trie, add to the builder two keys with the same prefix,
// then all the keys that we add between these two also have the same prefix.
// Actually in this mode the builder can accept even more freely ordered input,
// but for input as above it is guaranteed to work.
enum ECompactTrieBuilderFlags {
    CTBF_NONE = 0,
    CTBF_PREFIX_GROUPED = 1 << 0,
    CTBF_VERBOSE = 1 << 1,
    CTBF_UNIQUE = 1 << 2,
};

using TCompactTrieBuilderFlags = ECompactTrieBuilderFlags;

inline TCompactTrieBuilderFlags operator|(TCompactTrieBuilderFlags first, TCompactTrieBuilderFlags second) {
    return static_cast<TCompactTrieBuilderFlags>(static_cast<int>(first) | second);
}

inline TCompactTrieBuilderFlags& operator|=(TCompactTrieBuilderFlags& first, TCompactTrieBuilderFlags second) {
    return first = first | second;
}

template <typename T>
class TArrayWithSizeHolder;

template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
class TCompactTrieBuilder {
public:
    typedef T TSymbol;
    typedef D TData;
    typedef S TPacker;
    typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
    typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;

    explicit TCompactTrieBuilder(TCompactTrieBuilderFlags flags = CTBF_NONE, TPacker packer = TPacker(), IAllocator* alloc = TDefaultAllocator::Instance()); 

    // All Add.. methods return true if it was a new key, false if the key already existed.

    bool Add(const TSymbol* key, size_t keylen, const TData& value);
    bool Add(const TKeyBuf& key, const TData& value) {
        return Add(key.data(), key.size(), value);
    }

    // add already serialized data
    bool AddPtr(const TSymbol* key, size_t keylen, const char* data);
    bool AddPtr(const TKeyBuf& key, const char* data) {
        return AddPtr(key.data(), key.size(), data);
    }

    bool AddSubtreeInFile(const TSymbol* key, size_t keylen, const TString& filename);
    bool AddSubtreeInFile(const TKeyBuf& key, const TString& filename) {
        return AddSubtreeInFile(key.data(), key.size(), filename);
    }

    bool AddSubtreeInBuffer(const TSymbol* key, size_t keylen, TArrayWithSizeHolder<char>&& buffer);
    bool AddSubtreeInBuffer(const TKeyBuf& key, TArrayWithSizeHolder<char>&& buffer) {
        return AddSubtreeInBuffer(key.data(), key.size(), std::move(buffer));
    }

    bool Find(const TSymbol* key, size_t keylen, TData* value) const;
    bool Find(const TKeyBuf& key, TData* value = nullptr) const { 
        return Find(key.data(), key.size(), value);
    }

    bool FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixLen, TData* value = nullptr) const; 
    bool FindLongestPrefix(const TKeyBuf& key, size_t* prefixLen, TData* value = nullptr) const { 
        return FindLongestPrefix(key.data(), key.size(), prefixLen, value);
    } 

    size_t Save(IOutputStream& os) const;
    size_t SaveAndDestroy(IOutputStream& os);
    size_t SaveToFile(const TString& fileName) const {
        TFixedBufferFileOutput out(fileName);
        return Save(out);
    }

    void Clear(); // Returns all memory to the system and resets the builder state.

    size_t GetEntryCount() const;
    size_t GetNodeCount() const;

    // Exact output file size in bytes.
    size_t MeasureByteSize() const {
        return Impl->MeasureByteSize();
    }

protected:
    class TCompactTrieBuilderImpl;
    THolder<TCompactTrieBuilderImpl> Impl;
};

//----------------------------------------------------------------------------------------------------------------------
// Minimize the trie. The result is equivalent to the original
// trie, except that it takes less space (and has marginally lower
// performance, because of eventual epsilon links).
// The algorithm is as follows: starting from the largest pieces, we find
// nodes that have identical continuations  (Daciuk's right language),
// and repack the trie. Repacking is done in-place, so memory is less
// of an issue; however, it may take considerable time.

// IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
// Because of non-local structure and epsilon links, it won't work
// as you expect it to, and can destroy the trie in the making.
// If you want both minimization and fast layout, do the minimization first.

template <class TPacker>
size_t CompactTrieMinimize(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker(), NCompactTrie::EMinimizeMode mode = NCompactTrie::MM_DEFAULT);

template <class TTrieBuilder>
size_t CompactTrieMinimize(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);

//----------------------------------------------------------------------------------------------------------------
// Lay the trie in memory in such a way that there are less cache misses when jumping from root to leaf.
// The trie becomes about 2% larger, but the access became about 25% faster in our experiments.
// Can be called on minimized and non-minimized tries, in the first case in requires half a trie more memory.
// Calling it the second time on the same trie does nothing.
//
// The algorithm is based on van Emde Boas layout as described in the yandex data school lectures on external memory algoritms
// by Maxim Babenko and Ivan Puzyrevsky. The difference is that when we cut the tree into levels
// two nodes connected by a forward link are put into the same level (because they usually lie near each other in the original tree).
// The original paper (describing the layout in Section 2.1) is:
// Michael A. Bender, Erik D. Demaine, Martin Farach-Colton. Cache-Oblivious B-Trees
//      SIAM Journal on Computing, volume 35, number 2, 2005, pages 341-358.
// Available on the web: http://erikdemaine.org/papers/CacheObliviousBTrees_SICOMP/
// Or: Michael A. Bender, Erik D. Demaine, and Martin Farach-Colton. Cache-Oblivious B-Trees
//      Proceedings of the 41st Annual Symposium
//      on Foundations of Computer Science (FOCS 2000), Redondo Beach, California, November 12-14, 2000, pages 399-409.
// Available on the web: http://erikdemaine.org/papers/FOCS2000b/
// (there is not much difference between these papers, actually).
//
template <class TPacker>
size_t CompactTrieMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker());

template <class TTrieBuilder>
size_t CompactTrieMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);

// Composition of minimization and fast layout
template <class TPacker>
size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose = false, const TPacker& packer = TPacker());

template <class TTrieBuilder>
size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose = false);

// Implementation details moved here.
#include "comptrie_builder.inl"