diff options
author | monster <monster@ydb.tech> | 2022-07-07 14:41:37 +0300 |
---|---|---|
committer | monster <monster@ydb.tech> | 2022-07-07 14:41:37 +0300 |
commit | 06e5c21a835c0e923506c4ff27929f34e00761c2 (patch) | |
tree | 75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /library/cpp/on_disk | |
parent | 03f024c4412e3aa613bb543cf1660176320ba8f4 (diff) | |
download | ydb-06e5c21a835c0e923506c4ff27929f34e00761c2.tar.gz |
fix ya.make
Diffstat (limited to 'library/cpp/on_disk')
-rw-r--r-- | library/cpp/on_disk/aho_corasick/common.h | 14 | ||||
-rw-r--r-- | library/cpp/on_disk/aho_corasick/helpers.h | 17 | ||||
-rw-r--r-- | library/cpp/on_disk/aho_corasick/reader.h | 341 | ||||
-rw-r--r-- | library/cpp/on_disk/aho_corasick/writer.h | 354 |
4 files changed, 726 insertions, 0 deletions
diff --git a/library/cpp/on_disk/aho_corasick/common.h b/library/cpp/on_disk/aho_corasick/common.h new file mode 100644 index 00000000000..0aad93041a7 --- /dev/null +++ b/library/cpp/on_disk/aho_corasick/common.h @@ -0,0 +1,14 @@ +#pragma once + +#include <util/system/defaults.h> + +class TAhoCorasickCommon { +public: + static ui32 GetVersion() { + return 3; + } + + static size_t GetBlockCount() { + return 4; + } +}; diff --git a/library/cpp/on_disk/aho_corasick/helpers.h b/library/cpp/on_disk/aho_corasick/helpers.h new file mode 100644 index 00000000000..1e413c08a6c --- /dev/null +++ b/library/cpp/on_disk/aho_corasick/helpers.h @@ -0,0 +1,17 @@ +#pragma once + +#include "reader.h" +#include "writer.h" + +template <bool> +struct TDefaultAhoCorasickG; + +template <> +struct TDefaultAhoCorasickG<false> { + typedef TDefaultMappedAhoCorasick T; +}; + +template <> +struct TDefaultAhoCorasickG<true> { + typedef TDefaultAhoCorasickBuilder T; +}; diff --git a/library/cpp/on_disk/aho_corasick/reader.h b/library/cpp/on_disk/aho_corasick/reader.h new file mode 100644 index 00000000000..e5db58685bc --- /dev/null +++ b/library/cpp/on_disk/aho_corasick/reader.h @@ -0,0 +1,341 @@ +#pragma once + +#include <util/generic/deque.h> +#include <util/generic/strbuf.h> +#include <util/generic/yexception.h> +#include <util/memory/blob.h> +#include <util/stream/buffer.h> +#include <util/stream/mem.h> +#include <util/system/unaligned_mem.h> +#include <utility> + +#include <library/cpp/on_disk/chunks/chunked_helpers.h> + +#include "common.h" + +template <class O> +class TAhoSearchResult: public TDeque<std::pair<ui32, O>> { +}; + +/* + * Mapped-declaraion + */ + +template <class O> +class TMappedDefaultOutputContainer { +private: + TGeneralVector<O> List_; + +public: + TMappedDefaultOutputContainer(const char* data) + : List_(TBlob::NoCopy(data, (size_t)-1)) + { + } + + bool IsEmpty() const { + return List_.GetSize() == 0; + } + + void FillAnswer(TAhoSearchResult<O>& answer, ui32 pos) const { + for (ui32 i = 0; i < List_.GetSize(); ++i) { + answer.push_back(std::make_pair(pos, O())); + List_.Get(i, answer.back().second); + } + } + + size_t CheckData() const { + return List_.RealSize(); + } +}; + +template <class O> +class TMappedSingleOutputContainer { + const ui32* Data; + + ui32 Size() const { + return ReadUnaligned<ui32>(Data); + } + +public: + TMappedSingleOutputContainer(const char* data) + : Data((const ui32*)data) + { + } + + bool IsEmpty() const { + return Size() == 0; + } + + void FillAnswer(TAhoSearchResult<O>& answer, ui32 pos) const { + if (!IsEmpty()) { + answer.push_back(std::make_pair(pos, O())); + TMemoryInput input(Data + 1, Size()); + TSaveLoadVectorNonPodElement<O>::Load(&input, answer.back().second, Size()); + } + } + + size_t CheckData() const { + return sizeof(ui32) + ReadUnaligned<ui32>(Data); + } +}; + +template <class TStringType, class O, class C> +class TMappedAhoCorasick; + +template <typename TKey, typename TValue> +class TEmptyMapData : TNonCopyable { +private: + TBufferStream Stream; + +public: + const char* P; + + TEmptyMapData() { + TPlainHashWriter<TKey, TValue> hash; + hash.Save(Stream); + P = Stream.Buffer().Begin(); + } +}; + +/* + * каждая вершина имеет свой ui32-номер + * блок данных для вершины: + * ui32, ui32, ui32, ui32, степень*char, данные контейнера + * fail, suff, степень, самый левый сын, лексикографический список меток исходящих рёбер. + * если степень нулевая, то в блоке только 3 инта + */ +template <class TStringType, class O, class C> +class TMappedAhoVertex { +public: + typedef typename TStringType::value_type TCharType; + friend class TMappedAhoCorasick<TStringType, O, C>; + +private: + const char* Data; + typedef TPlainHash<TCharType, ui32> TGotoMap; + TGotoMap GotoMap; + static const TEmptyMapData<TCharType, ui32> EmptyData; + + static const size_t GENERAL_SHIFT = 3 * sizeof(ui32); + +private: + const ui32* DataAsInt() const { + return (const ui32*)Data; + } + + ui32 Power() const { + return ReadUnaligned<ui32>(DataAsInt() + 2); + } + +protected: + const C Output() const { + return C(Power() ? GotoMap.ByteEnd() : Data + GENERAL_SHIFT); + } + + ui32 Fail() const { + return ReadUnaligned<ui32>(DataAsInt()); + } + + ui32 Suffix() const { + return ReadUnaligned<ui32>(DataAsInt() + 1); + } + + bool GotoFunction(const TCharType c, ui32* result) const { + if (0 == Power()) + return false; + return GotoMap.Find(c, result); + } + + bool operator==(const TMappedAhoVertex& rhs) const { + return Data == rhs.Data; + } + + size_t CheckData(ui32 totalVertices) const; /// throws yexception in case of bad data + +public: + TMappedAhoVertex(const char* data) + : Data(data) + , GotoMap(Power() ? (Data + GENERAL_SHIFT) : EmptyData.P) + { + } +}; + +/* + * блок данных для бора: + * количество вершин N, ui32 + * суммарный размер блоков для вершин, ui32 + * блоки данных для каждой вершины + * отображение id->offset для блока вершины id, N*ui32 + */ +template <class TStringType, class O, class C = TMappedDefaultOutputContainer<O>> +class TMappedAhoCorasick : TNonCopyable { +public: + typedef TAhoSearchResult<O> TSearchResult; + typedef TMappedAhoVertex<TStringType, O, C> TAhoVertexType; + typedef typename TStringType::value_type TCharType; + typedef TBasicStringBuf<TCharType> TSample; + +private: + const TBlob Blob; + const char* const AhoVertexes; + const ui32 VertexAmount; + const ui32* const Id2Offset; + const TAhoVertexType Root; + +private: + bool ValidVertex(ui32 id) const { + return id < VertexAmount; + } + + TAhoVertexType GetVertexAt(ui32 id) const { + if (!ValidVertex(id)) + ythrow yexception() << "TMappedAhoCorasick fatal error: invalid id " << id; + return TAhoVertexType(AhoVertexes + Id2Offset[id]); + } + +public: + TMappedAhoCorasick(const TBlob& blob) + : Blob(blob) + , AhoVertexes(GetBlock(blob, 1).AsCharPtr()) + , VertexAmount(TSingleValue<ui32>(GetBlock(blob, 2)).Get()) + , Id2Offset((const ui32*)(GetBlock(Blob, 3).AsCharPtr())) + , Root(GetVertexAt(0)) + { + { + const ui32 version = TSingleValue<ui32>(GetBlock(blob, 0)).Get(); + if (version != TAhoCorasickCommon::GetVersion()) + ythrow yexception() << "Unknown version " << version << " instead of " << TAhoCorasickCommon::GetVersion(); + } + { + TChunkedDataReader reader(blob); + if (reader.GetBlocksCount() != TAhoCorasickCommon::GetBlockCount()) + ythrow yexception() << "wrong block count " << reader.GetBlocksCount(); + } + } + + bool AhoContains(const TSample& str) const; + TSearchResult AhoSearch(const TSample& str) const; + void AhoSearch(const TSample& str, TSearchResult* result) const; + size_t CheckData() const; /// throws yexception in case of bad data +}; + +using TSimpleMappedAhoCorasick = TMappedAhoCorasick<TString, ui32, TMappedSingleOutputContainer<ui32>>; +using TDefaultMappedAhoCorasick = TMappedAhoCorasick<TString, ui32>; + +/* + * Mapped-implementation + */ +template <class TStringType, class O, class C> +bool TMappedAhoCorasick<TStringType, O, C>::AhoContains(const TSample& str) const { + TAhoVertexType current = Root; + const size_t len = str.size(); + for (size_t i = 0; i < len; ++i) { + bool outer = false; + ui32 gotoVertex; + while (!current.GotoFunction(str[i], &gotoVertex)) { + if (current == Root) { /// nowhere to go + outer = true; + break; + } + current = GetVertexAt(current.Fail()); + } + if (outer) + continue; + current = GetVertexAt(gotoVertex); + + TAhoVertexType v = current; + while (true) { + if (!v.Output().IsEmpty()) + return true; + if ((ui32)-1 == v.Suffix()) + break; + v = GetVertexAt(v.Suffix()); + } + } + return false; +} + +template <class TStringType, class O, class C> +void TMappedAhoCorasick<TStringType, O, C>::AhoSearch(const TSample& str, typename TMappedAhoCorasick<TStringType, O, C>::TSearchResult* answer) const { + answer->clear(); + TAhoVertexType current = Root; + const size_t len = str.length(); + for (size_t i = 0; i < len; ++i) { + bool outer = false; + ui32 gotoVertex; + while (!current.GotoFunction(str[i], &gotoVertex)) { + if (current == Root) { /// nowhere to go + outer = true; + break; + } + current = GetVertexAt(current.Fail()); + } + if (outer) + continue; + current = GetVertexAt(gotoVertex); + + TAhoVertexType v = current; + while (true) { + v.Output().FillAnswer(*answer, (ui32)i); + if ((ui32)-1 == v.Suffix()) + break; + v = GetVertexAt(v.Suffix()); + } + } +} + +template <class TStringType, class O, class C> +typename TMappedAhoCorasick<TStringType, O, C>::TSearchResult TMappedAhoCorasick<TStringType, O, C>::AhoSearch(const TSample& str) const { + TAhoSearchResult<O> answer; + AhoSearch(str, &answer); + return answer; +} + +/* + * implementation of CheckData in Mapped-classes + */ + +static inline void CheckRange(ui32 id, ui32 strictUpperBound) { + if (id >= strictUpperBound) { + throw yexception() << id << " of " << strictUpperBound << " - index is invalid"; + } +} + +template <class TStringType, class O, class C> +const TEmptyMapData<typename TStringType::value_type, ui32> TMappedAhoVertex<TStringType, O, C>::EmptyData; + +template <class TStringType, class O, class C> +size_t TMappedAhoVertex<TStringType, O, C>::CheckData(ui32 totalVertices) const { + size_t bytesNeeded = GENERAL_SHIFT; + CheckRange(Fail(), totalVertices); + if (Suffix() != (ui32)(-1)) + CheckRange(Suffix(), totalVertices); + if (Power()) { + for (typename TGotoMap::TConstIterator toItem = GotoMap.Begin(); toItem != GotoMap.End(); ++toItem) + CheckRange(toItem->Second(), totalVertices); + bytesNeeded += GotoMap.ByteSize(); + } + bytesNeeded += Output().CheckData(); + return bytesNeeded; +} + +template <class TStringType, class O, class C> +size_t TMappedAhoCorasick<TStringType, O, C>::CheckData() const { + try { + size_t bytesNeeded = 0; + for (ui32 id = 0; id < VertexAmount; ++id) { + if (Id2Offset[id] != bytesNeeded) { + ythrow yexception() << "wrong offset[" << id << "]: " << Id2Offset[id]; + } + bytesNeeded += GetVertexAt(id).CheckData(VertexAmount); + } + bytesNeeded += VertexAmount * sizeof(ui32); + const size_t realsize = GetBlock(Blob, 1).Size() + GetBlock(Blob, 3).Size(); + if (realsize != bytesNeeded) { + ythrow yexception() << "extra information " << bytesNeeded << " " << realsize; + } + return bytesNeeded; + } catch (const yexception& e) { + ythrow yexception() << "Bad data: " << e.what(); + } +} diff --git a/library/cpp/on_disk/aho_corasick/writer.h b/library/cpp/on_disk/aho_corasick/writer.h new file mode 100644 index 00000000000..e866200e1c9 --- /dev/null +++ b/library/cpp/on_disk/aho_corasick/writer.h @@ -0,0 +1,354 @@ +#pragma once + +#include <util/generic/deque.h> +#include <util/generic/hash.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/generic/yexception.h> +#include <util/stream/buffer.h> +#include <util/memory/blob.h> +#include <utility> + +#include <library/cpp/on_disk/chunks/writer.h> +#include <library/cpp/on_disk/chunks/chunked_helpers.h> + +#include "common.h" + +template <class I> +struct meta_iterator_pair { + typedef typename std::iterator_traits<I>::value_type::first_type first_type; + typedef typename std::iterator_traits<I>::value_type::second_type second_type; +}; + +/* + * Builder implementation + */ + +/* + * Builder declaration + */ + +template <class O> +class TDefaultContainerBuilder { + TGeneralVectorWriter<O> List_; + +public: + void AddOut(const O& o) { + List_.PushBack(o); + } + + bool IsEmpty() const { + return List_.Size() == 0; + } + + void SaveContent(IOutputStream* stream) const { + List_.Save(*stream); + } +}; + +template <class O> +class TSingleContainerBuilder { + bool Empty; + O Out_; + +public: + TSingleContainerBuilder() + : Empty(true) + { + } + + void AddOut(const O& o) { + Empty = false; + Out_ = o; + } + + bool IsEmpty() const { + return Empty; + } + + void SaveContent(IOutputStream* stream) const { + if (IsEmpty()) { + WriteBin<ui32>(stream, 0); + } else { + TBuffer buf; + { + TBufferOutput tempStream(buf); + TSaveLoadVectorNonPodElement<O>::Save(&tempStream, Out_); + } + WriteBin<ui32>(stream, buf.Size()); + stream->Write(buf.Data(), buf.Size()); + } + } +}; + +template <class TStringType, class O, class C> +class TAhoCorasickBuilder; + +template <class TStringType, class O, class C> +class TAhoVertex : TNonCopyable { + typedef TAhoVertex<TStringType, O, C> TMyself; + typedef TAhoCorasickBuilder<TStringType, O, C> TParent; + typedef typename TStringType::value_type TCharType; + + friend class TAhoCorasickBuilder<TStringType, O, C>; + +private: + typedef THashMap<TCharType, TMyself*> TGotoMap; + + TGotoMap GotoMap_; + C Output_; + TMyself* FailVertex_; + TMyself* SuffixVertex_; + +protected: + const C& Output() const { + return Output_; + } + + void AddVertex(TMyself* v, TCharType c) { + GotoMap_.insert(std::make_pair(c, v)); + } + + TMyself* Fail() const { + return FailVertex_; + } + + TMyself* Suffix() const { + return SuffixVertex_; + } + + TMyself* GotoFunction(const TCharType c) { + typename TGotoMap::iterator it; + it = GotoMap_.find(c); + return it != GotoMap_.end() ? it->second : nullptr; + } + + TMyself const* GotoFunction(const TCharType c) const { + typename TGotoMap::const_iterator it = GotoMap_.find(c); + return it != GotoMap_.end() ? it->second : NULL; + } + + TMyself* AddString(TParent* ahoCorasick, const TStringType& s, const ui32 position, const O& o) { + if (position >= s.size()) { + Output_.AddOut(o); + return nullptr; + } else { + TCharType c = s[position]; + TMyself* v = GotoFunction(c); + if (!v) { + v = ahoCorasick->CreateAhoVertex(); + AddVertex(v, c); + } + return v; + } + } + + void SetFail(TMyself* v) { + FailVertex_ = v; + } + + void SetSuffix(TMyself* v) { + SuffixVertex_ = v; + } + + const TGotoMap& GotoMap() const { + return GotoMap_; + } + +public: + TAhoVertex() + : FailVertex_(nullptr) + , SuffixVertex_(nullptr) + { + } +}; + +template <class TStringType, class O, class C = TDefaultContainerBuilder<O>> +class TAhoCorasickBuilder : TNonCopyable { +public: + typedef TAhoVertex<TStringType, O, C> TAhoVertexType; + typedef typename TStringType::value_type TCharType; + + friend class TAhoVertex<TStringType, O, C>; + friend class TTestMappedAhoCorasick; + +private: + TDeque<TAhoVertexType*> AhoVertexes; + +private: + TAhoVertexType* GetRoot() { + return AhoVertexes.front(); + } + + TAhoVertexType const* GetRoot() const { + return AhoVertexes.front(); + } + + TAhoVertexType* CreateAhoVertex() { + AhoVertexes.push_back(new TAhoVertexType()); + return AhoVertexes.back(); + } + + void ConstructFail(); + +public: + TAhoCorasickBuilder() + : AhoVertexes(1, new TAhoVertexType()) + { + } + + ~TAhoCorasickBuilder() { + for (size_t i = 0; i < AhoVertexes.size(); ++i) { + delete AhoVertexes[i]; + } + } + + void AddString(const TStringType& s, const O& value) { + TAhoVertexType* c = GetRoot(); + for (ui32 i = 0; i <= s.size(); ++i) { + c = c->AddString(this, s, i, value); + } + } + + const TBlob Save(); + const TBlob AtomicSave(); + void SaveToStream(IOutputStream* stream); +}; + +using TSimpleAhoCorasickBuilder = TAhoCorasickBuilder<TString, ui32, TSingleContainerBuilder<ui32>>; +using TDefaultAhoCorasickBuilder = TAhoCorasickBuilder<TString, ui32>; + +template <class AhoCorasick, class Iterator> +const TBlob BuildAho(AhoCorasick& ahoCorasick, Iterator begin, Iterator end) { + for (Iterator it = begin; it != end; ++it) + ahoCorasick.AddString(*it, it->size()); + return ahoCorasick.Save(); +} + +template <class TStringType, class Iterator> +const TBlob BuildAhoIndex(TAhoCorasickBuilder<TStringType, ui32>& ahoCorasick, Iterator begin, Iterator end) { + ui32 index = 0; + for (Iterator it = begin; it != end; ++it, ++index) + ahoCorasick.AddString(*it, index); + return ahoCorasick.Save(); +} + +template <class TStringType, class Iterator> +const TBlob BuildAhoObject(TAhoCorasickBuilder<TStringType, typename meta_iterator_pair<Iterator>::second_type>& ahoCorasick, Iterator begin, Iterator end) { + for (Iterator it = begin; it != end; ++it) + ahoCorasick.AddString(it->first, it->second); + return ahoCorasick.Save(); +} + +template <class TStringType, class O, class C> +void TAhoCorasickBuilder<TStringType, O, C>::ConstructFail() { + TAhoVertexType* root = GetRoot(); + root->SetFail(root); + TDeque<TAhoVertexType*> q; + typename TAhoVertexType::TGotoMap::const_iterator it; + for (it = root->GotoMap().begin(); it != root->GotoMap().end(); ++it) { + TAhoVertexType* v = it->second; + v->SetFail(root); + q.push_back(v); + } + while (!q.empty()) { + TAhoVertexType* c = q.front(); + q.pop_front(); + for (it = c->GotoMap().begin(); it != c->GotoMap().end(); ++it) { + TAhoVertexType* v = it->second; + TCharType a = it->first; + q.push_back(v); + TAhoVertexType* h = c->Fail(); + bool outer = false; + while (!h->GotoFunction(a)) { + if (h->Fail() == h) { + v->SetFail(h); + outer = true; + break; + } + h = h->Fail(); + } + if (outer) + continue; + TAhoVertexType* fail = h->GotoFunction(a); + v->SetFail(fail); + if (!fail->Output().IsEmpty()) + v->SetSuffix(fail); + else + v->SetSuffix(fail->Suffix()); + } + } +} + +template <class TStringType, class O, class C> +void TAhoCorasickBuilder<TStringType, O, C>::SaveToStream(IOutputStream* out) { + ConstructFail(); /// the reason of non-const declaration + + Y_ASSERT(AhoVertexes.size() < Max<ui32>()); + const ui32 vertexAmount = (ui32)AhoVertexes.size(); + + TChunkedDataWriter writer(*out); + { + TSingleValueWriter<ui32> versionWriter(TAhoCorasickCommon::GetVersion()); + WriteBlock(writer, versionWriter); + } + writer.NewBlock(); + + TVector<TAhoVertexType const*> q(1, GetRoot()); + THashMap<TAhoVertexType const*, ui32> vertex2id(vertexAmount + 1); + TVector<ui32> id2offset(vertexAmount, 0); + + TAhoVertexType* vt = nullptr; + vertex2id[vt] = (ui32)-1; + q.reserve(vertexAmount); + + for (ui32 curId = 0; curId < vertexAmount; ++curId) { + TAhoVertexType const* c = q[curId]; + vertex2id[c] = curId; + id2offset[curId] = (ui32)writer.GetCurrentBlockOffset(); + + WriteBin<ui32>(&writer, vertex2id[c->Fail()]); + WriteBin<ui32>(&writer, vertex2id[c->Suffix()]); + + typedef TVector<std::pair<TCharType, TAhoVertexType const*>> TChildren; + TChildren children(c->GotoMap().begin(), c->GotoMap().end()); + WriteBin<ui32>(&writer, static_cast<ui32>(children.size())); + + if (!children.empty()) { + TPlainHashWriter<TCharType, ui32> hashWriter; + const ui32 id = static_cast<ui32>(q.size()); + for (size_t i = 0; i < children.size(); ++i) { + hashWriter.Add(children[i].first, ui32(id + i)); + q.push_back(children[i].second); + } + + hashWriter.Save(writer); + } + + c->Output().SaveContent(&writer); + } + + { + Y_ASSERT(id2offset.size() < Max<ui32>()); + TSingleValueWriter<ui32> lenWriter((ui32)id2offset.size()); + WriteBlock(writer, lenWriter); + } + writer.NewBlock(); + writer.Write((const char*)id2offset.data(), id2offset.size() * sizeof(ui32)); + writer.WriteFooter(); + Y_ASSERT(TAhoCorasickCommon::GetBlockCount() == writer.GetBlockCount()); +} + +template <class TStringType, class O, class C> +const TBlob TAhoCorasickBuilder<TStringType, O, C>::Save() { + TBufferStream buffer; + SaveToStream(&buffer); + return TBlob::FromStream(buffer); +} + +template <class TStringType, class O, class C> +const TBlob TAhoCorasickBuilder<TStringType, O, C>::AtomicSave() { + TBufferStream buffer; + SaveToStream(&buffer); + return TBlob::FromStream(buffer); +} |