diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/llvm12/lib/Support/SuffixTree.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/llvm12/lib/Support/SuffixTree.cpp')
-rw-r--r-- | contrib/libs/llvm12/lib/Support/SuffixTree.cpp | 210 |
1 files changed, 210 insertions, 0 deletions
diff --git a/contrib/libs/llvm12/lib/Support/SuffixTree.cpp b/contrib/libs/llvm12/lib/Support/SuffixTree.cpp new file mode 100644 index 0000000000..0d419f12cd --- /dev/null +++ b/contrib/libs/llvm12/lib/Support/SuffixTree.cpp @@ -0,0 +1,210 @@ +//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the Suffix Tree class. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/SuffixTree.h" +#include "llvm/Support/Allocator.h" +#include <vector> + +using namespace llvm; + +SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) { + Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0); + Active.Node = Root; + + // Keep track of the number of suffixes we have to add of the current + // prefix. + unsigned SuffixesToAdd = 0; + + // Construct the suffix tree iteratively on each prefix of the string. + // PfxEndIdx is the end index of the current prefix. + // End is one past the last element in the string. + for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) { + SuffixesToAdd++; + LeafEndIdx = PfxEndIdx; // Extend each of the leaves. + SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd); + } + + // Set the suffix indices of each leaf. + assert(Root && "Root node can't be nullptr!"); + setSuffixIndices(); +} + +SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent, + unsigned StartIdx, unsigned Edge) { + + assert(StartIdx <= LeafEndIdx && "String can't start after it ends!"); + + SuffixTreeNode *N = new (NodeAllocator.Allocate()) + SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr); + Parent.Children[Edge] = N; + + return N; +} + +SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent, + unsigned StartIdx, + unsigned EndIdx, unsigned Edge) { + + assert(StartIdx <= EndIdx && "String can't start after it ends!"); + assert(!(!Parent && StartIdx != EmptyIdx) && + "Non-root internal nodes must have parents!"); + + unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx); + SuffixTreeNode *N = + new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root); + if (Parent) + Parent->Children[Edge] = N; + + return N; +} + +void SuffixTree::setSuffixIndices() { + // List of nodes we need to visit along with the current length of the + // string. + std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit; + + // Current node being visited. + SuffixTreeNode *CurrNode = Root; + + // Sum of the lengths of the nodes down the path to the current one. + unsigned CurrNodeLen = 0; + ToVisit.push_back({CurrNode, CurrNodeLen}); + while (!ToVisit.empty()) { + std::tie(CurrNode, CurrNodeLen) = ToVisit.back(); + ToVisit.pop_back(); + CurrNode->ConcatLen = CurrNodeLen; + for (auto &ChildPair : CurrNode->Children) { + assert(ChildPair.second && "Node had a null child!"); + ToVisit.push_back( + {ChildPair.second, CurrNodeLen + ChildPair.second->size()}); + } + + // No children, so we are at the end of the string. + if (CurrNode->Children.size() == 0 && !CurrNode->isRoot()) + CurrNode->SuffixIdx = Str.size() - CurrNodeLen; + } +} + +unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) { + SuffixTreeNode *NeedsLink = nullptr; + + while (SuffixesToAdd > 0) { + + // Are we waiting to add anything other than just the last character? + if (Active.Len == 0) { + // If not, then say the active index is the end index. + Active.Idx = EndIdx; + } + + assert(Active.Idx <= EndIdx && "Start index can't be after end index!"); + + // The first character in the current substring we're looking at. + unsigned FirstChar = Str[Active.Idx]; + + // Have we inserted anything starting with FirstChar at the current node? + if (Active.Node->Children.count(FirstChar) == 0) { + // If not, then we can just insert a leaf and move to the next step. + insertLeaf(*Active.Node, EndIdx, FirstChar); + + // The active node is an internal node, and we visited it, so it must + // need a link if it doesn't have one. + if (NeedsLink) { + NeedsLink->Link = Active.Node; + NeedsLink = nullptr; + } + } else { + // There's a match with FirstChar, so look for the point in the tree to + // insert a new node. + SuffixTreeNode *NextNode = Active.Node->Children[FirstChar]; + + unsigned SubstringLen = NextNode->size(); + + // Is the current suffix we're trying to insert longer than the size of + // the child we want to move to? + if (Active.Len >= SubstringLen) { + // If yes, then consume the characters we've seen and move to the next + // node. + Active.Idx += SubstringLen; + Active.Len -= SubstringLen; + Active.Node = NextNode; + continue; + } + + // Otherwise, the suffix we're trying to insert must be contained in the + // next node we want to move to. + unsigned LastChar = Str[EndIdx]; + + // Is the string we're trying to insert a substring of the next node? + if (Str[NextNode->StartIdx + Active.Len] == LastChar) { + // If yes, then we're done for this step. Remember our insertion point + // and move to the next end index. At this point, we have an implicit + // suffix tree. + if (NeedsLink && !Active.Node->isRoot()) { + NeedsLink->Link = Active.Node; + NeedsLink = nullptr; + } + + Active.Len++; + break; + } + + // The string we're trying to insert isn't a substring of the next node, + // but matches up to a point. Split the node. + // + // For example, say we ended our search at a node n and we're trying to + // insert ABD. Then we'll create a new node s for AB, reduce n to just + // representing C, and insert a new leaf node l to represent d. This + // allows us to ensure that if n was a leaf, it remains a leaf. + // + // | ABC ---split---> | AB + // n s + // C / \ D + // n l + + // The node s from the diagram + SuffixTreeNode *SplitNode = + insertInternalNode(Active.Node, NextNode->StartIdx, + NextNode->StartIdx + Active.Len - 1, FirstChar); + + // Insert the new node representing the new substring into the tree as + // a child of the split node. This is the node l from the diagram. + insertLeaf(*SplitNode, EndIdx, LastChar); + + // Make the old node a child of the split node and update its start + // index. This is the node n from the diagram. + NextNode->StartIdx += Active.Len; + SplitNode->Children[Str[NextNode->StartIdx]] = NextNode; + + // SplitNode is an internal node, update the suffix link. + if (NeedsLink) + NeedsLink->Link = SplitNode; + + NeedsLink = SplitNode; + } + + // We've added something new to the tree, so there's one less suffix to + // add. + SuffixesToAdd--; + + if (Active.Node->isRoot()) { + if (Active.Len > 0) { + Active.Len--; + Active.Idx = EndIdx - SuffixesToAdd + 1; + } + } else { + // Start the next phase at the next smallest suffix. + Active.Node = Active.Node->Link; + } + } + + return SuffixesToAdd; +} |