#pragma once
#include "comptrie_impl.h"
#include "comptrie_trie.h"
#include "make_fast_layout.h"
#include "array_with_size.h"
#include <library/cpp/containers/compact_vector/compact_vector.h>
#include <util/memory/alloc.h>
#include <util/memory/blob.h>
#include <util/memory/pool.h>
#include <util/memory/tempbuf.h>
#include <util/memory/smallobj.h>
#include <util/generic/algorithm.h>
#include <util/generic/buffer.h>
#include <util/generic/strbuf.h>
#include <util/system/align.h>
#include <util/stream/buffer.h>
#define CONSTEXPR_MAX2(a, b) (a) > (b) ? (a) : (b)
#define CONSTEXPR_MAX3(a, b, c) CONSTEXPR_MAX2(CONSTEXPR_MAX2(a, b), c)
// TCompactTrieBuilder::TCompactTrieBuilderImpl
template <class T, class D, class S>
class TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl {
protected:
TMemoryPool Pool;
size_t PayloadSize;
THolder<TFixedSizeAllocator> NodeAllocator;
class TNode;
class TArc;
TNode* Root;
TCompactTrieBuilderFlags Flags;
size_t EntryCount;
size_t NodeCount;
TPacker Packer;
enum EPayload {
DATA_ABSENT,
DATA_INSIDE,
DATA_MALLOCED,
DATA_IN_MEMPOOL,
};
protected:
void ConvertSymbolArrayToChar(const TSymbol* key, size_t keylen, TTempBuf& buf, size_t ckeylen) const;
void NodeLinkTo(TNode* thiz, const TBlob& label, TNode* node);
TNode* NodeForwardAdd(TNode* thiz, const char* label, size_t len, size_t& passed, size_t* nodeCount);
bool FindEntryImpl(const char* key, size_t keylen, TData* value) const;
bool FindLongestPrefixImpl(const char* keyptr, size_t keylen, size_t* prefixLen, TData* value) const;
size_t NodeMeasureSubtree(TNode* thiz) const;
ui64 NodeSaveSubtree(TNode* thiz, IOutputStream& os) const;
ui64 NodeSaveSubtreeAndDestroy(TNode* thiz, IOutputStream& osy);
void NodeBufferSubtree(TNode* thiz);
size_t NodeMeasureLeafValue(TNode* thiz) const;
ui64 NodeSaveLeafValue(TNode* thiz, IOutputStream& os) const;
virtual ui64 ArcMeasure(const TArc* thiz, size_t leftsize, size_t rightsize) const;
virtual ui64 ArcSaveSelf(const TArc* thiz, IOutputStream& os) const;
ui64 ArcSave(const TArc* thiz, IOutputStream& os) const;
ui64 ArcSaveAndDestroy(const TArc* thiz, IOutputStream& os);
public:
TCompactTrieBuilderImpl(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc);
virtual ~TCompactTrieBuilderImpl();
void DestroyNode(TNode* node);
void NodeReleasePayload(TNode* thiz);
char* AddEntryForData(const TSymbol* key, size_t keylen, size_t dataLen, bool& isNewAddition);
TNode* AddEntryForSomething(const TSymbol* key, size_t keylen, bool& isNewAddition);
bool AddEntry(const TSymbol* key, size_t keylen, const TData& value);
bool AddEntryPtr(const TSymbol* key, size_t keylen, const char* value);
bool AddSubtreeInFile(const TSymbol* key, size_t keylen, const TString& fileName);
bool AddSubtreeInBuffer(const TSymbol* key, size_t keylen, TArrayWithSizeHolder<char>&& buffer);
bool FindEntry(const TSymbol* key, size_t keylen, TData* value) const;
bool FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixlen, TData* value) const;
size_t Save(IOutputStream& os) const;
size_t SaveAndDestroy(IOutputStream& os);
void Clear();
// lies if some key was added at least twice
size_t GetEntryCount() const;
size_t GetNodeCount() const;
size_t MeasureByteSize() const {
return NodeMeasureSubtree(Root);
}
};
template <class T, class D, class S>
class TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TArc {
public:
TBlob Label;
TNode* Node;
mutable size_t LeftOffset;
mutable size_t RightOffset;
TArc(const TBlob& lbl, TNode* nd);
};
template <class T, class D, class S>
class TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode {
public:
typedef typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl TBuilderImpl;
typedef typename TBuilderImpl::TArc TArc;
struct ISubtree {
virtual ~ISubtree() = default;
virtual bool IsLast() const = 0;
virtual ui64 Measure(const TBuilderImpl* builder) const = 0;
virtual ui64 Save(const TBuilderImpl* builder, IOutputStream& os) const = 0;
virtual ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) = 0;
virtual void Destroy(TBuilderImpl*) { }
// Tries to find key in subtree.
// Returns next node to find the key in (to avoid recursive calls)
// If it has end result, writes it to @value and @result arguments and returns nullptr
virtual const TNode* Find(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const = 0;
virtual const TNode* FindLongestPrefix(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const = 0;
};
class TArcSet: public ISubtree, public TCompactVector<TArc> {
public:
typedef typename TCompactVector<TArc>::iterator iterator;
typedef typename TCompactVector<TArc>::const_iterator const_iterator;
TArcSet() {
Y_ASSERT(reinterpret_cast<ISubtree*>(this) == static_cast<void*>(this)); // This assumption is used in TNode::Subtree()
}
iterator Find(char ch);
const_iterator Find(char ch) const;
void Add(const TBlob& s, TNode* node);
bool IsLast() const override {
return this->Empty();
}
const TNode* Find(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override;
const TNode* FindLongestPrefix(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override {
return Find(key, value, result, packer);
}
ui64 Measure(const TBuilderImpl* builder) const override {
return MeasureRange(builder, 0, this->size());
}
ui64 MeasureRange(const TBuilderImpl* builder, size_t from, size_t to) const {
if (from >= to)
return 0;
size_t median = (from + to) / 2;
size_t leftsize = (size_t)MeasureRange(builder, from, median);
size_t rightsize = (size_t)MeasureRange(builder, median + 1, to);
return builder->ArcMeasure(&(*this)[median], leftsize, rightsize);
}
ui64 Save(const TBuilderImpl* builder, IOutputStream& os) const override {
return SaveRange(builder, 0, this->size(), os);
}
ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) override {
ui64 result = SaveRangeAndDestroy(builder, 0, this->size(), os);
Destroy(builder);
return result;
}
ui64 SaveRange(const TBuilderImpl* builder, size_t from, size_t to, IOutputStream& os) const {
if (from >= to)
return 0;
size_t median = (from + to) / 2;
ui64 written = builder->ArcSave(&(*this)[median], os);
written += SaveRange(builder, from, median, os);
written += SaveRange(builder, median + 1, to, os);
return written;
}
ui64 SaveRangeAndDestroy(TBuilderImpl* builder, size_t from, size_t to, IOutputStream& os) {
if (from >= to)
return 0;
size_t median = (from + to) / 2;
ui64 written = builder->ArcSaveAndDestroy(&(*this)[median], os);
written += SaveRangeAndDestroy(builder, from, median, os);
written += SaveRangeAndDestroy(builder, median + 1, to, os);
return written;
}
void Destroy(TBuilderImpl* builder) override {
// Delete all nodes down the stream.
for (iterator it = this->begin(); it != this->end(); ++it) {
builder->DestroyNode(it->Node);
}
this->clear();
}
~TArcSet() override {
Y_ASSERT(this->empty());
}
};
struct TBufferedSubtree: public ISubtree {
TArrayWithSizeHolder<char> Buffer;
TBufferedSubtree() {
Y_ASSERT(reinterpret_cast<ISubtree*>(this) == static_cast<void*>(this)); // This assumption is used in TNode::Subtree()
}
bool IsLast() const override {
return Buffer.Empty();
}
const TNode* Find(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override {
if (Buffer.Empty()) {
result = false;
return nullptr;
}
TCompactTrie<char, D, S> trie(Buffer.Get(), Buffer.Size(), packer);
result = trie.Find(key.data(), key.size(), value);
return nullptr;
}
const TNode* FindLongestPrefix(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override {
if (Buffer.Empty()) {
result = false;
return nullptr;
}
TCompactTrie<char, D, S> trie(Buffer.Get(), Buffer.Size(), packer);
size_t prefixLen = 0;
result = trie.FindLongestPrefix(key.data(), key.size(), &prefixLen, value);
key = key.SubStr(prefixLen);
return nullptr;
}
ui64 Measure(const TBuilderImpl*) const override {
return Buffer.Size();
}
ui64 Save(const TBuilderImpl*, IOutputStream& os) const override {
os.Write(Buffer.Get(), Buffer.Size());
return Buffer.Size();
}
ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) override {
ui64 result = Save(builder, os);
TArrayWithSizeHolder<char>().Swap(Buffer);
return result;
}
};
struct TSubtreeInFile: public ISubtree {
struct TData {
TString FileName;
ui64 Size;
};
THolder<TData> Data;
TSubtreeInFile(const TString& fileName) {
// stupid API
TFile file(fileName, RdOnly);
i64 size = file.GetLength();
if (size < 0)
ythrow yexception() << "unable to get file " << fileName.Quote() << " size for unknown reason";
Data.Reset(new TData);
Data->FileName = fileName;
Data->Size = size;
Y_ASSERT(reinterpret_cast<ISubtree*>(this) == static_cast<void*>(this)); // This assumption is used in TNode::Subtree()
}
bool IsLast() const override {
return Data->Size == 0;
}
const TNode* Find(TStringBuf& key, typename TCompactTrieBuilder::TData* value, bool& result, const TPacker& packer) const override {
if (!Data) {
result = false;
return nullptr;
}
TCompactTrie<char, D, S> trie(TBlob::FromFile(Data->FileName), packer);
result = trie.Find(key.data(), key.size(), value);
return nullptr;
}
const TNode* FindLongestPrefix(TStringBuf& key, typename TCompactTrieBuilder::TData* value, bool& result, const TPacker& packer) const override {
if (!Data) {
result = false;
return nullptr;
}
TCompactTrie<char, D, S> trie(TBlob::FromFile(Data->FileName), packer);
size_t prefixLen = 0;
result = trie.FindLongestPrefix(key.data(), key.size(), &prefixLen, value);
key = key.SubStr(prefixLen);
return nullptr;
}
ui64 Measure(const TBuilderImpl*) const override {
return Data->Size;
}
ui64 Save(const TBuilderImpl*, IOutputStream& os) const override {
TUnbufferedFileInput is(Data->FileName);
ui64 written = TransferData(&is, &os);
if (written != Data->Size)
ythrow yexception() << "file " << Data->FileName.Quote() << " size changed";
return written;
}
ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) override {
return Save(builder, os);
}
};
union {
char ArcsData[CONSTEXPR_MAX3(sizeof(TArcSet), sizeof(TBufferedSubtree), sizeof(TSubtreeInFile))];
union {
void* Data1;
long long int Data2;
} Aligner;
};
inline ISubtree* Subtree() {
return reinterpret_cast<ISubtree*>(ArcsData);
}
inline const ISubtree* Subtree() const {
return reinterpret_cast<const ISubtree*>(ArcsData);
}
EPayload PayloadType;
inline const char* PayloadPtr() const {
return ((const char*) this) + sizeof(TNode);
}
inline char* PayloadPtr() {
return ((char*) this) + sizeof(TNode);
}
// *Payload()
inline const char*& PayloadAsPtr() const {
const char** payload = (const char**) PayloadPtr();
return *payload;
}
inline char*& PayloadAsPtr() {
char** payload = (char**) PayloadPtr();
return *payload;
}
inline const char* GetPayload() const {
switch (PayloadType) {
case DATA_INSIDE:
return PayloadPtr();
case DATA_MALLOCED:
case DATA_IN_MEMPOOL:
return PayloadAsPtr();
case DATA_ABSENT:
default:
abort();
}
}
inline char* GetPayload() {
const TNode* thiz = this;
return const_cast<char*>(thiz->GetPayload()); // const_cast is to avoid copy-paste style
}
bool IsFinal() const {
return PayloadType != DATA_ABSENT;
}
bool IsLast() const {
return Subtree()->IsLast();
}
inline void* operator new(size_t, TFixedSizeAllocator& pool) {
return pool.Allocate();
}
inline void operator delete(void* ptr, TFixedSizeAllocator& pool) noexcept {
pool.Release(ptr);
}
TNode()
: PayloadType(DATA_ABSENT)
{
new (Subtree()) TArcSet;
}
~TNode() {
Subtree()->~ISubtree();
Y_ASSERT(PayloadType == DATA_ABSENT);
}
};
// TCompactTrieBuilder
template <class T, class D, class S>
TCompactTrieBuilder<T, D, S>::TCompactTrieBuilder(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc)
: Impl(new TCompactTrieBuilderImpl(flags, packer, alloc))
{
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::Add(const TSymbol* key, size_t keylen, const TData& value) {
return Impl->AddEntry(key, keylen, value);
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::AddPtr(const TSymbol* key, size_t keylen, const char* value) {
return Impl->AddEntryPtr(key, keylen, value);
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::AddSubtreeInFile(const TSymbol* key, size_t keylen, const TString& fileName) {
return Impl->AddSubtreeInFile(key, keylen, fileName);
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::AddSubtreeInBuffer(const TSymbol* key, size_t keylen, TArrayWithSizeHolder<char>&& buffer) {
return Impl->AddSubtreeInBuffer(key, keylen, std::move(buffer));
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::Find(const TSymbol* key, size_t keylen, TData* value) const {
return Impl->FindEntry(key, keylen, value);
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::FindLongestPrefix(
const TSymbol* key, size_t keylen, size_t* prefixlen, TData* value) const {
return Impl->FindLongestPrefix(key, keylen, prefixlen, value);
}
template <class T, class D, class S>
size_t TCompactTrieBuilder<T, D, S>::Save(IOutputStream& os) const {
return Impl->Save(os);
}
template <class T, class D, class S>
size_t TCompactTrieBuilder<T, D, S>::SaveAndDestroy(IOutputStream& os) {
return Impl->SaveAndDestroy(os);
}
template <class T, class D, class S>
void TCompactTrieBuilder<T, D, S>::Clear() {
Impl->Clear();
}
template <class T, class D, class S>
size_t TCompactTrieBuilder<T, D, S>::GetEntryCount() const {
return Impl->GetEntryCount();
}
template <class T, class D, class S>
size_t TCompactTrieBuilder<T, D, S>::GetNodeCount() const {
return Impl->GetNodeCount();
}
// TCompactTrieBuilder::TCompactTrieBuilderImpl
template <class T, class D, class S>
TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TCompactTrieBuilderImpl(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc)
: Pool(1000000, TMemoryPool::TLinearGrow::Instance(), alloc)
, PayloadSize(sizeof(void*)) // XXX: find better value
, NodeAllocator(new TFixedSizeAllocator(sizeof(TNode) + PayloadSize, alloc))
, Flags(flags)
, EntryCount(0)
, NodeCount(1)
, Packer(packer)
{
Root = new (*NodeAllocator) TNode;
}
template <class T, class D, class S>
TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::~TCompactTrieBuilderImpl() {
DestroyNode(Root);
}
template <class T, class D, class S>
void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ConvertSymbolArrayToChar(
const TSymbol* key, size_t keylen, TTempBuf& buf, size_t buflen) const {
char* ckeyptr = buf.Data();
for (size_t i = 0; i < keylen; ++i) {
TSymbol label = key[i];
for (int j = (int)NCompactTrie::ExtraBits<TSymbol>(); j >= 0; j -= 8) {
Y_ASSERT(ckeyptr < buf.Data() + buflen);
*(ckeyptr++) = (char)(label >> j);
}
}
buf.Proceed(buflen);
Y_ASSERT(ckeyptr == buf.Data() + buf.Filled());
}
template <class T, class D, class S>
void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::DestroyNode(TNode* thiz) {
thiz->Subtree()->Destroy(this);
NodeReleasePayload(thiz);
thiz->~TNode();
NodeAllocator->Release(thiz);
}
template <class T, class D, class S>
void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeReleasePayload(TNode* thiz) {
switch (thiz->PayloadType) {
case DATA_ABSENT:
case DATA_INSIDE:
case DATA_IN_MEMPOOL:
break;
case DATA_MALLOCED:
delete[] thiz->PayloadAsPtr();
break;
default:
abort();
}
thiz->PayloadType = DATA_ABSENT;
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddEntry(
const TSymbol* key, size_t keylen, const TData& value) {
size_t datalen = Packer.MeasureLeaf(value);
bool isNewAddition = false;
char* place = AddEntryForData(key, keylen, datalen, isNewAddition);
Packer.PackLeaf(place, value, datalen);
return isNewAddition;
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddEntryPtr(
const TSymbol* key, size_t keylen, const char* value) {
size_t datalen = Packer.SkipLeaf(value);
bool isNewAddition = false;
char* place = AddEntryForData(key, keylen, datalen, isNewAddition);
memcpy(place, value, datalen);
return isNewAddition;
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddSubtreeInFile(
const TSymbol* key, size_t keylen, const TString& fileName) {
typedef typename TNode::ISubtree ISubtree;
typedef typename TNode::TSubtreeInFile TSubtreeInFile;
bool isNewAddition = false;
TNode* node = AddEntryForSomething(key, keylen, isNewAddition);
node->Subtree()->Destroy(this);
node->Subtree()->~ISubtree();
new (node->Subtree()) TSubtreeInFile(fileName);
return isNewAddition;
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddSubtreeInBuffer(
const TSymbol* key, size_t keylen, TArrayWithSizeHolder<char>&& buffer) {
typedef typename TNode::TBufferedSubtree TBufferedSubtree;
bool isNewAddition = false;
TNode* node = AddEntryForSomething(key, keylen, isNewAddition);
node->Subtree()->Destroy(this);
node->Subtree()->~ISubtree();
auto subtree = new (node->Subtree()) TBufferedSubtree();
subtree->Buffer.Swap(buffer);
return isNewAddition;
}
template <class T, class D, class S>
typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode*
TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddEntryForSomething(
const TSymbol* key, size_t keylen, bool& isNewAddition) {
using namespace NCompactTrie;
EntryCount++;
if (Flags & CTBF_VERBOSE)
ShowProgress(EntryCount);
TNode* current = Root;
size_t passed;
// Special case of empty key: replace it by 1-byte "\0" key.
size_t ckeylen = keylen ? keylen * sizeof(TSymbol) : 1;
TTempBuf ckeybuf(ckeylen);
if (keylen == 0) {
ckeybuf.Append("\0", 1);
} else {
ConvertSymbolArrayToChar(key, keylen, ckeybuf, ckeylen);
}
char* ckey = ckeybuf.Data();
TNode* next;
while ((ckeylen > 0) && (next = NodeForwardAdd(current, ckey, ckeylen, passed, &NodeCount)) != nullptr) {
current = next;
ckeylen -= passed;
ckey += passed;
}
if (ckeylen != 0) {
//new leaf
NodeCount++;
TNode* leaf = new (*NodeAllocator) TNode();
NodeLinkTo(current, TBlob::Copy(ckey, ckeylen), leaf);
current = leaf;
}
isNewAddition = (current->PayloadType == DATA_ABSENT);
if ((Flags & CTBF_UNIQUE) && !isNewAddition)
ythrow yexception() << "Duplicate key";
return current;
}
template <class T, class D, class S>
char* TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::AddEntryForData(const TSymbol* key, size_t keylen,
size_t datalen, bool& isNewAddition) {
TNode* current = AddEntryForSomething(key, keylen, isNewAddition);
NodeReleasePayload(current);
if (datalen <= PayloadSize) {
current->PayloadType = DATA_INSIDE;
} else if (Flags & CTBF_PREFIX_GROUPED) {
current->PayloadType = DATA_MALLOCED;
current->PayloadAsPtr() = new char[datalen];
} else {
current->PayloadType = DATA_IN_MEMPOOL;
current->PayloadAsPtr() = (char*) Pool.Allocate(datalen); // XXX: allocate unaligned
}
return current->GetPayload();
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::FindEntry(const TSymbol* key, size_t keylen, TData* value) const {
using namespace NCompactTrie;
if (!keylen) {
const char zero = '\0';
return FindEntryImpl(&zero, 1, value);
} else {
size_t ckeylen = keylen * sizeof(TSymbol);
TTempBuf ckeybuf(ckeylen);
ConvertSymbolArrayToChar(key, keylen, ckeybuf, ckeylen);
return FindEntryImpl(ckeybuf.Data(), ckeylen, value);
}
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::FindEntryImpl(const char* keyptr, size_t keylen, TData* value) const {
const TNode* node = Root;
bool result = false;
TStringBuf key(keyptr, keylen);
while (key && (node = node->Subtree()->Find(key, value, result, Packer))) {
}
return result;
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::FindLongestPrefix(
const TSymbol* key, size_t keylen, size_t* prefixlen, TData* value) const {
using namespace NCompactTrie;
if (!keylen) {
const char zero = '\0';
const bool ret = FindLongestPrefixImpl(&zero, 1, prefixlen, value);
if (ret && prefixlen)
*prefixlen = 0; // empty key found
return ret;
} else {
size_t ckeylen = keylen * sizeof(TSymbol);
TTempBuf ckeybuf(ckeylen);
ConvertSymbolArrayToChar(key, keylen, ckeybuf, ckeylen);
bool ret = FindLongestPrefixImpl(ckeybuf.Data(), ckeylen, prefixlen, value);
if (ret && prefixlen && *prefixlen == 1 && ckeybuf.Data()[0] == '\0')
*prefixlen = 0; // if we have found empty key, set prefixlen to zero
else if (!ret) // try to find value with empty key, because empty key is prefix of a every key
ret = FindLongestPrefix(nullptr, 0, prefixlen, value);
if (ret && prefixlen)
*prefixlen /= sizeof(TSymbol);
return ret;
}
}
template <class T, class D, class S>
bool TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::FindLongestPrefixImpl(const char* keyptr, size_t keylen, size_t* prefixLen, TData* value) const {
const TNode* node = Root;
const TNode* lastFinalNode = nullptr;
bool endResult = false;
TStringBuf key(keyptr, keylen);
TStringBuf keyTail = key;
TStringBuf lastFinalKeyTail;
while (keyTail && (node = node->Subtree()->FindLongestPrefix(keyTail, value, endResult, Packer))) {
if (endResult) // no more ways to find prefix and prefix has been found
break;
if (node->IsFinal()) {
lastFinalNode = node;
lastFinalKeyTail = keyTail;
}
}
if (!endResult && lastFinalNode) {
if (value)
Packer.UnpackLeaf(lastFinalNode->GetPayload(), *value);
keyTail = lastFinalKeyTail;
endResult = true;
}
if (endResult && prefixLen)
*prefixLen = keyTail ? key.size() - keyTail.size() : key.size();
return endResult;
}
template <class T, class D, class S>
void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::Clear() {
DestroyNode(Root);
Pool.Clear();
NodeAllocator.Reset(new TFixedSizeAllocator(sizeof(TNode) + PayloadSize, TDefaultAllocator::Instance()));
Root = new (*NodeAllocator) TNode;
EntryCount = 0;
NodeCount = 1;
}
template <class T, class D, class S>
size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::Save(IOutputStream& os) const {
const size_t len = NodeMeasureSubtree(Root);
if (len != NodeSaveSubtree(Root, os))
ythrow yexception() << "something wrong";
return len;
}
template <class T, class D, class S>
size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::SaveAndDestroy(IOutputStream& os) {
const size_t len = NodeMeasureSubtree(Root);
if (len != NodeSaveSubtreeAndDestroy(Root, os))
ythrow yexception() << "something wrong";
return len;
}
template <class T, class D, class S>
size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::GetEntryCount() const {
return EntryCount;
}
template <class T, class D, class S>
size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::GetNodeCount() const {
return NodeCount;
}
template <class T, class D, class S>
typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode*
TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeForwardAdd(
TNode* thiz, const char* label, size_t len, size_t& passed, size_t* nodeCount) {
typename TNode::TArcSet* arcSet = dynamic_cast<typename TNode::TArcSet*>(thiz->Subtree());
if (!arcSet)
ythrow yexception() << "Bad input order - expected input strings to be prefix-grouped.";
typename TNode::TArcSet::iterator it = arcSet->Find(*label);
if (it != arcSet->end()) {
const char* arcLabel = it->Label.AsCharPtr();
size_t arcLabelLen = it->Label.Length();
for (passed = 0; (passed < len) && (passed < arcLabelLen) && (label[passed] == arcLabel[passed]); ++passed) {
//just count
}
if (passed < arcLabelLen) {
(*nodeCount)++;
TNode* node = new (*NodeAllocator) TNode();
NodeLinkTo(node, it->Label.SubBlob(passed, arcLabelLen), it->Node);
it->Node = node;
it->Label = it->Label.SubBlob(passed);
}
return it->Node;
}
return nullptr;
}
template <class T, class D, class S>
void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeLinkTo(TNode* thiz, const TBlob& label, TNode* node) {
typename TNode::TArcSet* arcSet = dynamic_cast<typename TNode::TArcSet*>(thiz->Subtree());
if (!arcSet)
ythrow yexception() << "Bad input order - expected input strings to be prefix-grouped.";
// Buffer the node at the last arc
if ((Flags & CTBF_PREFIX_GROUPED) && !arcSet->empty())
NodeBufferSubtree(arcSet->back().Node);
arcSet->Add(label, node);
}
template <class T, class D, class S>
size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeMeasureSubtree(TNode* thiz) const {
return (size_t)thiz->Subtree()->Measure(this);
}
template <class T, class D, class S>
ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeSaveSubtree(TNode* thiz, IOutputStream& os) const {
return thiz->Subtree()->Save(this, os);
}
template <class T, class D, class S>
ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeSaveSubtreeAndDestroy(TNode* thiz, IOutputStream& os) {
return thiz->Subtree()->SaveAndDestroy(this, os);
}
template <class T, class D, class S>
void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeBufferSubtree(TNode* thiz) {
typedef typename TNode::TArcSet TArcSet;
TArcSet* arcSet = dynamic_cast<TArcSet*>(thiz->Subtree());
if (!arcSet)
return;
size_t bufferLength = (size_t)arcSet->Measure(this);
TArrayWithSizeHolder<char> buffer;
buffer.Resize(bufferLength);
TMemoryOutput bufout(buffer.Get(), buffer.Size());
ui64 written = arcSet->Save(this, bufout);
Y_ASSERT(written == bufferLength);
arcSet->Destroy(this);
arcSet->~TArcSet();
typename TNode::TBufferedSubtree* bufferedArcSet = new (thiz->Subtree()) typename TNode::TBufferedSubtree;
bufferedArcSet->Buffer.Swap(buffer);
}
template <class T, class D, class S>
size_t TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeMeasureLeafValue(TNode* thiz) const {
if (!thiz->IsFinal())
return 0;
return Packer.SkipLeaf(thiz->GetPayload());
}
template <class T, class D, class S>
ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::NodeSaveLeafValue(TNode* thiz, IOutputStream& os) const {
if (!thiz->IsFinal())
return 0;
size_t len = Packer.SkipLeaf(thiz->GetPayload());
os.Write(thiz->GetPayload(), len);
return len;
}
// TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArc
template <class T, class D, class S>
TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TArc::TArc(const TBlob& lbl, TNode* nd)
: Label(lbl)
, Node(nd)
, LeftOffset(0)
, RightOffset(0)
{}
template <class T, class D, class S>
ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ArcMeasure(
const TArc* thiz, size_t leftsize, size_t rightsize) const {
using namespace NCompactTrie;
size_t coresize = 2 + NodeMeasureLeafValue(thiz->Node); // 2 == (char + flags)
size_t treesize = NodeMeasureSubtree(thiz->Node);
if (thiz->Label.Length() > 0)
treesize += 2 * (thiz->Label.Length() - 1);
// Triple measurements are needed because the space needed to store the offset
// shall be added to the offset itself. Hence three iterations.
size_t leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize) : 0;
size_t rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize) : 0;
leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize + leftoffsetsize + rightoffsetsize) : 0;
rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize + leftoffsetsize + rightoffsetsize) : 0;
leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize + leftoffsetsize + rightoffsetsize) : 0;
rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize + leftoffsetsize + rightoffsetsize) : 0;
coresize += leftoffsetsize + rightoffsetsize;
thiz->LeftOffset = leftsize ? coresize + treesize : 0;
thiz->RightOffset = rightsize ? coresize + treesize + leftsize : 0;
return coresize + treesize + leftsize + rightsize;
}
template <class T, class D, class S>
ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ArcSaveSelf(const TArc* thiz, IOutputStream& os) const {
using namespace NCompactTrie;
ui64 written = 0;
size_t leftoffsetsize = MeasureOffset(thiz->LeftOffset);
size_t rightoffsetsize = MeasureOffset(thiz->RightOffset);
size_t labelLen = thiz->Label.Length();
for (size_t i = 0; i < labelLen; ++i) {
char flags = 0;
if (i == 0) {
flags |= (leftoffsetsize << MT_LEFTSHIFT);
flags |= (rightoffsetsize << MT_RIGHTSHIFT);
}
if (i == labelLen-1) {
if (thiz->Node->IsFinal())
flags |= MT_FINAL;
if (!thiz->Node->IsLast())
flags |= MT_NEXT;
} else {
flags |= MT_NEXT;
}
os.Write(&flags, 1);
os.Write(&thiz->Label.AsCharPtr()[i], 1);
written += 2;
if (i == 0) {
written += ArcSaveOffset(thiz->LeftOffset, os);
written += ArcSaveOffset(thiz->RightOffset, os);
}
}
written += NodeSaveLeafValue(thiz->Node, os);
return written;
}
template <class T, class D, class S>
ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ArcSave(const TArc* thiz, IOutputStream& os) const {
ui64 written = ArcSaveSelf(thiz, os);
written += NodeSaveSubtree(thiz->Node, os);
return written;
}
template <class T, class D, class S>
ui64 TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::ArcSaveAndDestroy(const TArc* thiz, IOutputStream& os) {
ui64 written = ArcSaveSelf(thiz, os);
written += NodeSaveSubtreeAndDestroy(thiz->Node, os);
return written;
}
// TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArcSet
template <class T, class D, class S>
typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::iterator
TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::Find(char ch) {
using namespace NCompTriePrivate;
iterator it = LowerBound(this->begin(), this->end(), ch, TCmp());
if (it != this->end() && it->Label[0] == (unsigned char)ch) {
return it;
}
return this->end();
}
template <class T, class D, class S>
typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::const_iterator
TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::Find(char ch) const {
using namespace NCompTriePrivate;
const_iterator it = LowerBound(this->begin(), this->end(), ch, TCmp());
if (it != this->end() && it->Label[0] == (unsigned char)ch) {
return it;
}
return this->end();
}
template <class T, class D, class S>
void TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::Add(const TBlob& s, TNode* node) {
using namespace NCompTriePrivate;
this->insert(LowerBound(this->begin(), this->end(), s[0], TCmp()), TArc(s, node));
}
template <class T, class D, class S>
const typename TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode*
TCompactTrieBuilder<T, D, S>::TCompactTrieBuilderImpl::TNode::TArcSet::Find(
TStringBuf& key, TData* value, bool& result, const TPacker& packer) const {
result = false;
if (!key)
return nullptr;
const const_iterator it = Find(key[0]);
if (it != this->end()) {
const char* const arcLabel = it->Label.AsCharPtr();
const size_t arcLabelLen = it->Label.Length();
if (key.size() >= arcLabelLen && memcmp(key.data(), arcLabel, arcLabelLen) == 0) {
const TStringBuf srcKey = key;
key = key.SubStr(arcLabelLen);
const TNode* const node = it->Node;
if (srcKey.size() == arcLabelLen) {
// unpack value of it->Node, if it has value
if (!node->IsFinal())
return nullptr;
if (value)
packer.UnpackLeaf(node->GetPayload(), *value);
result = true;
return nullptr;
}
// find in subtree
return node;
}
}
return nullptr;
}
// Different
//----------------------------------------------------------------------------------------------------------------------
// Minimize the trie. The result is equivalent to the original
// trie, except that it takes less space (and has marginally lower
// performance, because of eventual epsilon links).
// The algorithm is as follows: starting from the largest pieces, we find
// nodes that have identical continuations (Daciuk's right language),
// and repack the trie. Repacking is done in-place, so memory is less
// of an issue; however, it may take considerable time.
// IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout.
// Because of non-local structure and epsilon links, it won't work
// as you expect it to, and can destroy the trie in the making.
template <class TPacker>
size_t CompactTrieMinimize(IOutputStream& os, const char* data, size_t datalength, bool verbose /*= false*/, const TPacker& packer /*= TPacker()*/, NCompactTrie::EMinimizeMode mode) {
using namespace NCompactTrie;
return CompactTrieMinimizeImpl(os, data, datalength, verbose, &packer, mode);
}
template <class TTrieBuilder>
size_t CompactTrieMinimize(IOutputStream& os, const TTrieBuilder& builder, bool verbose /*=false*/) {
TBufferStream buftmp;
size_t len = builder.Save(buftmp);
return CompactTrieMinimize<typename TTrieBuilder::TPacker>(os, buftmp.Buffer().Data(), len, verbose);
}
//----------------------------------------------------------------------------------------------------------------
// Lay the trie in memory in such a way that there are less cache misses when jumping from root to leaf.
// The trie becomes about 2% larger, but the access became about 25% faster in our experiments.
// Can be called on minimized and non-minimized tries, in the first case in requires half a trie more memory.
// Calling it the second time on the same trie does nothing.
//
// The algorithm is based on van Emde Boas layout as described in the yandex data school lectures on external memory algoritms
// by Maxim Babenko and Ivan Puzyrevsky. The difference is that when we cut the tree into levels
// two nodes connected by a forward link are put into the same level (because they usually lie near each other in the original tree).
// The original paper (describing the layout in Section 2.1) is:
// Michael A. Bender, Erik D. Demaine, Martin Farach-Colton. Cache-Oblivious B-Trees
// SIAM Journal on Computing, volume 35, number 2, 2005, pages 341-358.
// Available on the web: http://erikdemaine.org/papers/CacheObliviousBTrees_SICOMP/
// Or: Michael A. Bender, Erik D. Demaine, and Martin Farach-Colton. Cache-Oblivious B-Trees
// Proceedings of the 41st Annual Symposium
// on Foundations of Computer Science (FOCS 2000), Redondo Beach, California, November 12-14, 2000, pages 399-409.
// Available on the web: http://erikdemaine.org/papers/FOCS2000b/
// (there is not much difference between these papers, actually).
//
template <class TPacker>
size_t CompactTrieMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose /*= false*/, const TPacker& packer /*= TPacker()*/) {
using namespace NCompactTrie;
return CompactTrieMakeFastLayoutImpl(os, data, datalength, verbose, &packer);
}
template <class TTrieBuilder>
size_t CompactTrieMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose /*=false*/) {
TBufferStream buftmp;
size_t len = builder.Save(buftmp);
return CompactTrieMakeFastLayout<typename TTrieBuilder::TPacker>(os, buftmp.Buffer().Data(), len, verbose);
}
template <class TPacker>
size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose/*=false*/, const TPacker& packer/*= TPacker()*/) {
TBufferStream buftmp;
size_t len = CompactTrieMinimize(buftmp, data, datalength, verbose, packer);
return CompactTrieMakeFastLayout(os, buftmp.Buffer().Data(), len, verbose, packer);
}
template <class TTrieBuilder>
size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose /*=false*/) {
TBufferStream buftmp;
size_t len = CompactTrieMinimize(buftmp, builder, verbose);
return CompactTrieMakeFastLayout<typename TTrieBuilder::TPacker>(os, buftmp.Buffer().Data(), len, verbose);
}