diff options
author | bnagaev <bnagaev@yandex-team.ru> | 2022-02-10 16:47:04 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:47:04 +0300 |
commit | c74559fb88da8adac0d9186cfa55a6b13c47695f (patch) | |
tree | b83306b6e37edeea782e9eed673d89286c4fef35 /contrib/libs/hyperscan/src/parser/buildstate.cpp | |
parent | d6449ba66291ff0c0d352c82e6eb3efb4c8a7e8d (diff) | |
download | ydb-c74559fb88da8adac0d9186cfa55a6b13c47695f.tar.gz |
Restoring authorship annotation for <bnagaev@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/parser/buildstate.cpp')
-rw-r--r-- | contrib/libs/hyperscan/src/parser/buildstate.cpp | 1048 |
1 files changed, 524 insertions, 524 deletions
diff --git a/contrib/libs/hyperscan/src/parser/buildstate.cpp b/contrib/libs/hyperscan/src/parser/buildstate.cpp index 3e2cbd9a39..75cfbb7b2d 100644 --- a/contrib/libs/hyperscan/src/parser/buildstate.cpp +++ b/contrib/libs/hyperscan/src/parser/buildstate.cpp @@ -1,529 +1,529 @@ -/* +/* * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Glushkov construction. - */ -#include "buildstate.h" -#include "position.h" -#include "position_dump.h" -#include "position_info.h" -#include "parse_error.h" -#include "hs_internal.h" -#include "ue2common.h" -#include "nfagraph/ng_builder.h" -#include "util/charreach.h" -#include "util/container.h" + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Glushkov construction. + */ +#include "buildstate.h" +#include "position.h" +#include "position_dump.h" +#include "position_info.h" +#include "parse_error.h" +#include "hs_internal.h" +#include "ue2common.h" +#include "nfagraph/ng_builder.h" +#include "util/charreach.h" +#include "util/container.h" #include "util/flat_containers.h" #include "util/hash.h" -#include "util/make_unique.h" +#include "util/make_unique.h" #include "util/unordered.h" - -#include <algorithm> -#include <iterator> -#include <limits> -#include <map> -#include <utility> - -#if defined(DEBUG) || defined(DUMP_SUPPORT) -#include <ostream> -#include <sstream> -#endif - -using namespace std; - -namespace ue2 { - -/** \brief Represents an uninitialized state. */ -const Position GlushkovBuildState::POS_UNINITIALIZED = - numeric_limits<Position>::max(); - -/** \brief Represents an epsilon transition in the firsts of a component. */ -const Position GlushkovBuildState::POS_EPSILON = - numeric_limits<Position>::max() - 1; - -GlushkovBuildState::~GlushkovBuildState() { } - -namespace /* anonymous */ { - -class CheckPositionFlags { -public: - explicit CheckPositionFlags(int fl) : flags(fl) {} - bool operator()(const PositionInfo &p) const { - return (p.flags & flags) == flags; - } -private: - int flags; -}; - -class CheckUnflaggedEpsilon { -public: - bool operator()(const PositionInfo &p) const { - return p.pos == GlushkovBuildState::POS_EPSILON && p.flags == 0; - } -}; - -/** \brief Concrete impl of the GlushkovBuildState interface. */ -class GlushkovBuildStateImpl : public GlushkovBuildState { -public: - GlushkovBuildStateImpl(NFABuilder &b, bool prefilter); - - /** \brief Returns a reference to the NFABuilder being used. */ - NFABuilder &getBuilder() override { return builder; } - - /** \brief Returns a const reference to the NFABuilder being used. */ - const NFABuilder &getBuilder() const override { return builder; } - - /** \brief Wire up the lasts of one component to the firsts of another. */ - void connectRegions(const vector<PositionInfo> &lasts, - const vector<PositionInfo> &firsts) override; - - /** \brief Wire the lasts of the main sequence to accepts. */ - void connectAccepts(const vector<PositionInfo> &lasts) override; - - /** \brief Wire up a single last to a list of firsts. */ - void connectSuccessors(const PositionInfo &last, - vector<PositionInfo> firsts); - - /** Wire up a pair of positions. */ - void addSuccessor(Position from, Position to) override; - - /** \brief Clone the vertex properties and edges of all vertices between - * two positions. */ - void cloneFollowSet(Position from, Position to, unsigned offset) override; - - /** \brief Build the prioritised list of edges out of our successor map. */ - void buildEdges() override; - - /** Construct an edge, called internally by \ref buildEdges. */ - void buildEdge(Position from, const PositionInfo &to); - - Position startState; - Position startDotstarState; - Position acceptState; - Position acceptEodState; - Position acceptNlEodState; - Position acceptNlState; - - NFABuilder &builder; //!< \brief builder for the NFAGraph - - bool doPrefilter; //!< \brief we're building a prefiltering pattern - - /** \brief Map storing successors for each position. */ - map<Position, flat_set<PositionInfo>> successors; -}; - -} // namespace - -GlushkovBuildStateImpl::GlushkovBuildStateImpl(NFABuilder &b, - bool prefilter) : - startState(b.getStart()), - startDotstarState(b.getStartDotStar()), - acceptState(b.getAccept()), - acceptEodState(b.getAcceptEOD()), - acceptNlEodState(POS_UNINITIALIZED), - acceptNlState(POS_UNINITIALIZED), - builder(b), - doPrefilter(prefilter) -{ - // Our special nodes need special relationships. - vector<PositionInfo> lasts, firsts; - - // start->startDs and startDs self-loop. - lasts.push_back(startState); - lasts.push_back(startDotstarState); - firsts.push_back(startDotstarState); - connectRegions(lasts, firsts); - - // accept to acceptEod edges already wired - - // XXX: a small hack to support vacuous NFAs: give start and startDs an - // initial report ID. - builder.setNodeReportID(startState, 0); - builder.setNodeReportID(startDotstarState, 0); -} - -static -void checkEmbeddedEndAnchor(const PositionInfo &from, - const vector<PositionInfo> &firsts) { - if (!(from.flags & POS_FLAG_ONLY_ENDS)) { - return; - } - - for (const auto &first : firsts) { - if (first.pos != GlushkovBuildStateImpl::POS_EPSILON) { - /* can make it through the parse tree */ - throw ParseError("Embedded end anchors not supported."); - } - } -} - -// Wire up the lasts of one component to the firsts of another -void -GlushkovBuildStateImpl::connectRegions(const vector<PositionInfo> &lasts, - const vector<PositionInfo> &firsts) { - for (const auto &last : lasts) { - checkEmbeddedEndAnchor(last, firsts); - connectSuccessors(last, firsts); - } -} - -static -void filterEdges(const GlushkovBuildStateImpl &bs, const PositionInfo &from, - vector<PositionInfo> &tolist) { - if (from.pos == bs.startDotstarState) { - // If we're connecting from start-dotstar, remove all caret flavoured - // positions. - CheckPositionFlags check(POS_FLAG_NOFLOAT); - tolist.erase(remove_if(tolist.begin(), tolist.end(), check), - tolist.end()); - if (from.flags & POS_FLAG_NOFLOAT) { - tolist.clear(); - } - } else if (from.pos == bs.startState) { - // If we're connecting from start, we should remove any epsilons that - // aren't caret flavoured. - CheckUnflaggedEpsilon check; - tolist.erase(remove_if(tolist.begin(), tolist.end(), check), - tolist.end()); - CheckPositionFlags check2(POS_FLAG_MUST_FLOAT | POS_FLAG_NOFLOAT); - tolist.erase(remove_if(tolist.begin(), tolist.end(), check2), - tolist.end()); - } - - if (bs.builder.getAssertFlag(from.pos) & POS_FLAG_MULTILINE_START) { - // If we have a (mildly boneheaded) pattern like /^$/m, we're right up - // against the edge of what we can do without true assertion support. - // Here we have an evil hack to prevent us plugging the \n generated by - // the caret right into acceptEod (which is in the firsts of the - // dollar). - /* This is due to the 'interesting quirk' that multiline ^ does not - * not match a newline at the end of buffer. */ - DEBUG_PRINTF("multiline start - no eod\n"); - tolist.erase(remove(tolist.begin(), tolist.end(), bs.acceptEodState), - tolist.end()); - } -} - -static -Position makeNewlineAssertPos(GlushkovBuildState &bs) { - NFABuilder &builder = bs.getBuilder(); - Position newline = builder.makePositions(1); - builder.addCharReach(newline, CharReach('\n')); - builder.setAssertFlag(newline, POS_FLAG_FIDDLE_ACCEPT); - builder.setNodeReportID(newline, -1); - return newline; -} - -static -void generateAccepts(GlushkovBuildStateImpl &bs, const PositionInfo &from, - vector<PositionInfo> *tolist) { - NFABuilder &builder = bs.getBuilder(); - u32 flags = from.flags; - - bool require_eod = flags & POS_FLAG_WIRE_EOD; - bool require_nl_eod = flags & POS_FLAG_WIRE_NL_EOD - && !(flags & POS_FLAG_NO_NL_EOD); - bool require_nl_accept = (flags & POS_FLAG_WIRE_NL_ACCEPT) - && !(flags & POS_FLAG_NO_NL_ACCEPT); - - bool require_accept = !(flags & POS_FLAG_ONLY_ENDS); - - if (require_eod) { - tolist->push_back(bs.acceptEodState); - } - - if (require_nl_accept) { - if (bs.acceptNlState == GlushkovBuildState::POS_UNINITIALIZED) { - Position newline = makeNewlineAssertPos(bs); - bs.addSuccessor(newline, builder.getAccept()); - bs.acceptNlState = newline; - } - tolist->push_back(bs.acceptNlState); - } - - if (require_nl_eod) { - if (bs.acceptNlEodState == GlushkovBuildState::POS_UNINITIALIZED) { - Position newline = makeNewlineAssertPos(bs); - bs.addSuccessor(newline, builder.getAcceptEOD()); - bs.acceptNlEodState = newline; - } - tolist->push_back(bs.acceptNlEodState); - } - - if (require_accept) { - tolist->push_back(bs.acceptState); - } -} - -void GlushkovBuildStateImpl::connectAccepts(const vector<PositionInfo> &lasts) { - for (const auto &last : lasts) { - vector<PositionInfo> accepts; - generateAccepts(*this, last, &accepts); - connectSuccessors(last, accepts); - } -} - -#if defined(DEBUG) || defined(DUMP_SUPPORT) - -static UNUSED -string dumpCaptures(const PositionInfo &p) { - ostringstream oss; - - if (p.flags & POS_FLAG_NOFLOAT) { - oss << "<nofloat>"; - } - if (p.flags & POS_FLAG_MUST_FLOAT) { - oss << "<must_float>"; - } - if (p.flags & POS_FLAG_FIDDLE_ACCEPT) { - oss << "<fiddle_accept>"; - } - if (p.flags & POS_FLAG_ONLY_ENDS) { - oss << "<only_ends>"; - } - if (p.flags & POS_FLAG_NO_NL_EOD) { - oss << "<no_nl_eod>"; - } - if (p.flags & POS_FLAG_NO_NL_ACCEPT) { - oss << "<no_nl_acc>"; - } - - return oss.str(); -} - -#endif // DEBUG || DUMP_SUPPORT - -void GlushkovBuildStateImpl::connectSuccessors(const PositionInfo &from, - vector<PositionInfo> tolist) { - /* note: tolist maybe modified for our own internal use -> not a reference */ - assert(from.pos != POS_EPSILON); - assert(from.pos != POS_UNINITIALIZED); - assert(find(tolist.begin(), tolist.end(), POS_UNINITIALIZED) - == tolist.end()); - - DEBUG_PRINTF("FROM = %u%s TO = %s\n", from.pos, dumpCaptures(from).c_str(), - dumpPositions(tolist.begin(), tolist.end()).c_str()); - - /* prevent creation of edges with invalid assertions */ - filterEdges(*this, from, tolist); - - if (from.flags & POS_FLAG_FIDDLE_ACCEPT) { - auto accept = find(tolist.begin(), tolist.end(), acceptState); - if (accept != tolist.end()) { - DEBUG_PRINTF("accept through -1 offset-adjusting dot\n"); - Position fakedot = builder.makePositions(1); - builder.addCharReach(fakedot, CharReach(0x00, 0xff)); - builder.setNodeReportID(fakedot, -1); - addSuccessor(fakedot, acceptState); - *accept = fakedot; - } else { - // We might lead to accept via an assertion vertex, so we add the - // offset adj to this vertex itself. Used for cases like /^\B/m, - // which should match only at 0 for '\n'. - builder.setNodeReportID(from.pos, -1); - } - - assert(find(tolist.begin(), tolist.end(), acceptState) == tolist.end()); - } - - auto &succ = successors[from.pos]; - - DEBUG_PRINTF("connect %u -> %s\n", from.pos, - dumpPositions(tolist.begin(), tolist.end()).c_str()); - DEBUG_PRINTF("%u curr succ: %s\n", from.pos, - dumpPositions(begin(succ), end(succ)).c_str()); - - for (const auto &to : tolist) { - if (to.pos != POS_EPSILON) { - succ.insert(to); - } - } - - DEBUG_PRINTF("%u succ: %s\n", from.pos, - dumpPositions(begin(succ), end(succ)).c_str()); -} - -void GlushkovBuildStateImpl::addSuccessor(Position from, Position to) { - DEBUG_PRINTF("connect %u -> %u\n", from, to); - assert(from != POS_EPSILON && from != POS_UNINITIALIZED); - assert(to != POS_EPSILON && to != POS_UNINITIALIZED); - - auto &succ = successors[from]; - succ.insert(to); - - DEBUG_PRINTF("%u succ: %s\n", from, - dumpPositions(begin(succ), end(succ)).c_str()); -} - -void GlushkovBuildStateImpl::cloneFollowSet(Position first, Position last, - unsigned offset) { - assert(first <= last); - - // Clone vertex properties (reachability, etc) - builder.cloneRegion(first, last, offset); - - /* Clone the successors of all the positions between first and last - * inclusive, producing a new set of positions starting at (first + - * offset). */ - for (Position i = first; i <= last; i++) { - // This should be a new position. - assert(successors[i + offset].empty()); - - for (const PositionInfo &to : successors[i]) { - if (to.pos >= first && to.pos <= last) { - PositionInfo clone(to); - clone.pos += offset; - DEBUG_PRINTF("clone: %u -> %u\n", i + offset, clone.pos); - successors[i + offset].insert(clone); - } else { - // There shouldn't be any stray edges leading out of this - // region! - assert(0); - } - } - } -} - -void GlushkovBuildStateImpl::buildEdge(Position from, const PositionInfo &to) { - // Guard against embedded anchors - if (to == startState) { - /* can make it through the parse tree */ - throw ParseError("Embedded start anchors not supported."); - } - - assert(to.pos != POS_UNINITIALIZED); - assert(to.pos != POS_EPSILON); - - if (builder.hasEdge(from, to.pos)) { - return; - } - - builder.addEdge(from, to.pos); -} - -void GlushkovBuildStateImpl::buildEdges() { - // Create all the edges and track which vertices are asserts which need to - // be removed later. - for (const auto &m : successors) { - const Position from = m.first; - for (const auto &to : m.second) { - buildEdge(from, to); - } - } -} - -// Construct a usable GlushkovBuildState for the outside world. -unique_ptr<GlushkovBuildState> makeGlushkovBuildState(NFABuilder &b, - bool prefilter) { - return ue2::make_unique<GlushkovBuildStateImpl>(b, prefilter); -} - -// free functions for utility use - -/** \brief Eliminate lower-priority duplicate PositionInfo entries. - * - * Scans through a list of positions and retains only the highest priority - * version of a given (position, flags) entry. */ -void cleanupPositions(vector<PositionInfo> &a) { + +#include <algorithm> +#include <iterator> +#include <limits> +#include <map> +#include <utility> + +#if defined(DEBUG) || defined(DUMP_SUPPORT) +#include <ostream> +#include <sstream> +#endif + +using namespace std; + +namespace ue2 { + +/** \brief Represents an uninitialized state. */ +const Position GlushkovBuildState::POS_UNINITIALIZED = + numeric_limits<Position>::max(); + +/** \brief Represents an epsilon transition in the firsts of a component. */ +const Position GlushkovBuildState::POS_EPSILON = + numeric_limits<Position>::max() - 1; + +GlushkovBuildState::~GlushkovBuildState() { } + +namespace /* anonymous */ { + +class CheckPositionFlags { +public: + explicit CheckPositionFlags(int fl) : flags(fl) {} + bool operator()(const PositionInfo &p) const { + return (p.flags & flags) == flags; + } +private: + int flags; +}; + +class CheckUnflaggedEpsilon { +public: + bool operator()(const PositionInfo &p) const { + return p.pos == GlushkovBuildState::POS_EPSILON && p.flags == 0; + } +}; + +/** \brief Concrete impl of the GlushkovBuildState interface. */ +class GlushkovBuildStateImpl : public GlushkovBuildState { +public: + GlushkovBuildStateImpl(NFABuilder &b, bool prefilter); + + /** \brief Returns a reference to the NFABuilder being used. */ + NFABuilder &getBuilder() override { return builder; } + + /** \brief Returns a const reference to the NFABuilder being used. */ + const NFABuilder &getBuilder() const override { return builder; } + + /** \brief Wire up the lasts of one component to the firsts of another. */ + void connectRegions(const vector<PositionInfo> &lasts, + const vector<PositionInfo> &firsts) override; + + /** \brief Wire the lasts of the main sequence to accepts. */ + void connectAccepts(const vector<PositionInfo> &lasts) override; + + /** \brief Wire up a single last to a list of firsts. */ + void connectSuccessors(const PositionInfo &last, + vector<PositionInfo> firsts); + + /** Wire up a pair of positions. */ + void addSuccessor(Position from, Position to) override; + + /** \brief Clone the vertex properties and edges of all vertices between + * two positions. */ + void cloneFollowSet(Position from, Position to, unsigned offset) override; + + /** \brief Build the prioritised list of edges out of our successor map. */ + void buildEdges() override; + + /** Construct an edge, called internally by \ref buildEdges. */ + void buildEdge(Position from, const PositionInfo &to); + + Position startState; + Position startDotstarState; + Position acceptState; + Position acceptEodState; + Position acceptNlEodState; + Position acceptNlState; + + NFABuilder &builder; //!< \brief builder for the NFAGraph + + bool doPrefilter; //!< \brief we're building a prefiltering pattern + + /** \brief Map storing successors for each position. */ + map<Position, flat_set<PositionInfo>> successors; +}; + +} // namespace + +GlushkovBuildStateImpl::GlushkovBuildStateImpl(NFABuilder &b, + bool prefilter) : + startState(b.getStart()), + startDotstarState(b.getStartDotStar()), + acceptState(b.getAccept()), + acceptEodState(b.getAcceptEOD()), + acceptNlEodState(POS_UNINITIALIZED), + acceptNlState(POS_UNINITIALIZED), + builder(b), + doPrefilter(prefilter) +{ + // Our special nodes need special relationships. + vector<PositionInfo> lasts, firsts; + + // start->startDs and startDs self-loop. + lasts.push_back(startState); + lasts.push_back(startDotstarState); + firsts.push_back(startDotstarState); + connectRegions(lasts, firsts); + + // accept to acceptEod edges already wired + + // XXX: a small hack to support vacuous NFAs: give start and startDs an + // initial report ID. + builder.setNodeReportID(startState, 0); + builder.setNodeReportID(startDotstarState, 0); +} + +static +void checkEmbeddedEndAnchor(const PositionInfo &from, + const vector<PositionInfo> &firsts) { + if (!(from.flags & POS_FLAG_ONLY_ENDS)) { + return; + } + + for (const auto &first : firsts) { + if (first.pos != GlushkovBuildStateImpl::POS_EPSILON) { + /* can make it through the parse tree */ + throw ParseError("Embedded end anchors not supported."); + } + } +} + +// Wire up the lasts of one component to the firsts of another +void +GlushkovBuildStateImpl::connectRegions(const vector<PositionInfo> &lasts, + const vector<PositionInfo> &firsts) { + for (const auto &last : lasts) { + checkEmbeddedEndAnchor(last, firsts); + connectSuccessors(last, firsts); + } +} + +static +void filterEdges(const GlushkovBuildStateImpl &bs, const PositionInfo &from, + vector<PositionInfo> &tolist) { + if (from.pos == bs.startDotstarState) { + // If we're connecting from start-dotstar, remove all caret flavoured + // positions. + CheckPositionFlags check(POS_FLAG_NOFLOAT); + tolist.erase(remove_if(tolist.begin(), tolist.end(), check), + tolist.end()); + if (from.flags & POS_FLAG_NOFLOAT) { + tolist.clear(); + } + } else if (from.pos == bs.startState) { + // If we're connecting from start, we should remove any epsilons that + // aren't caret flavoured. + CheckUnflaggedEpsilon check; + tolist.erase(remove_if(tolist.begin(), tolist.end(), check), + tolist.end()); + CheckPositionFlags check2(POS_FLAG_MUST_FLOAT | POS_FLAG_NOFLOAT); + tolist.erase(remove_if(tolist.begin(), tolist.end(), check2), + tolist.end()); + } + + if (bs.builder.getAssertFlag(from.pos) & POS_FLAG_MULTILINE_START) { + // If we have a (mildly boneheaded) pattern like /^$/m, we're right up + // against the edge of what we can do without true assertion support. + // Here we have an evil hack to prevent us plugging the \n generated by + // the caret right into acceptEod (which is in the firsts of the + // dollar). + /* This is due to the 'interesting quirk' that multiline ^ does not + * not match a newline at the end of buffer. */ + DEBUG_PRINTF("multiline start - no eod\n"); + tolist.erase(remove(tolist.begin(), tolist.end(), bs.acceptEodState), + tolist.end()); + } +} + +static +Position makeNewlineAssertPos(GlushkovBuildState &bs) { + NFABuilder &builder = bs.getBuilder(); + Position newline = builder.makePositions(1); + builder.addCharReach(newline, CharReach('\n')); + builder.setAssertFlag(newline, POS_FLAG_FIDDLE_ACCEPT); + builder.setNodeReportID(newline, -1); + return newline; +} + +static +void generateAccepts(GlushkovBuildStateImpl &bs, const PositionInfo &from, + vector<PositionInfo> *tolist) { + NFABuilder &builder = bs.getBuilder(); + u32 flags = from.flags; + + bool require_eod = flags & POS_FLAG_WIRE_EOD; + bool require_nl_eod = flags & POS_FLAG_WIRE_NL_EOD + && !(flags & POS_FLAG_NO_NL_EOD); + bool require_nl_accept = (flags & POS_FLAG_WIRE_NL_ACCEPT) + && !(flags & POS_FLAG_NO_NL_ACCEPT); + + bool require_accept = !(flags & POS_FLAG_ONLY_ENDS); + + if (require_eod) { + tolist->push_back(bs.acceptEodState); + } + + if (require_nl_accept) { + if (bs.acceptNlState == GlushkovBuildState::POS_UNINITIALIZED) { + Position newline = makeNewlineAssertPos(bs); + bs.addSuccessor(newline, builder.getAccept()); + bs.acceptNlState = newline; + } + tolist->push_back(bs.acceptNlState); + } + + if (require_nl_eod) { + if (bs.acceptNlEodState == GlushkovBuildState::POS_UNINITIALIZED) { + Position newline = makeNewlineAssertPos(bs); + bs.addSuccessor(newline, builder.getAcceptEOD()); + bs.acceptNlEodState = newline; + } + tolist->push_back(bs.acceptNlEodState); + } + + if (require_accept) { + tolist->push_back(bs.acceptState); + } +} + +void GlushkovBuildStateImpl::connectAccepts(const vector<PositionInfo> &lasts) { + for (const auto &last : lasts) { + vector<PositionInfo> accepts; + generateAccepts(*this, last, &accepts); + connectSuccessors(last, accepts); + } +} + +#if defined(DEBUG) || defined(DUMP_SUPPORT) + +static UNUSED +string dumpCaptures(const PositionInfo &p) { + ostringstream oss; + + if (p.flags & POS_FLAG_NOFLOAT) { + oss << "<nofloat>"; + } + if (p.flags & POS_FLAG_MUST_FLOAT) { + oss << "<must_float>"; + } + if (p.flags & POS_FLAG_FIDDLE_ACCEPT) { + oss << "<fiddle_accept>"; + } + if (p.flags & POS_FLAG_ONLY_ENDS) { + oss << "<only_ends>"; + } + if (p.flags & POS_FLAG_NO_NL_EOD) { + oss << "<no_nl_eod>"; + } + if (p.flags & POS_FLAG_NO_NL_ACCEPT) { + oss << "<no_nl_acc>"; + } + + return oss.str(); +} + +#endif // DEBUG || DUMP_SUPPORT + +void GlushkovBuildStateImpl::connectSuccessors(const PositionInfo &from, + vector<PositionInfo> tolist) { + /* note: tolist maybe modified for our own internal use -> not a reference */ + assert(from.pos != POS_EPSILON); + assert(from.pos != POS_UNINITIALIZED); + assert(find(tolist.begin(), tolist.end(), POS_UNINITIALIZED) + == tolist.end()); + + DEBUG_PRINTF("FROM = %u%s TO = %s\n", from.pos, dumpCaptures(from).c_str(), + dumpPositions(tolist.begin(), tolist.end()).c_str()); + + /* prevent creation of edges with invalid assertions */ + filterEdges(*this, from, tolist); + + if (from.flags & POS_FLAG_FIDDLE_ACCEPT) { + auto accept = find(tolist.begin(), tolist.end(), acceptState); + if (accept != tolist.end()) { + DEBUG_PRINTF("accept through -1 offset-adjusting dot\n"); + Position fakedot = builder.makePositions(1); + builder.addCharReach(fakedot, CharReach(0x00, 0xff)); + builder.setNodeReportID(fakedot, -1); + addSuccessor(fakedot, acceptState); + *accept = fakedot; + } else { + // We might lead to accept via an assertion vertex, so we add the + // offset adj to this vertex itself. Used for cases like /^\B/m, + // which should match only at 0 for '\n'. + builder.setNodeReportID(from.pos, -1); + } + + assert(find(tolist.begin(), tolist.end(), acceptState) == tolist.end()); + } + + auto &succ = successors[from.pos]; + + DEBUG_PRINTF("connect %u -> %s\n", from.pos, + dumpPositions(tolist.begin(), tolist.end()).c_str()); + DEBUG_PRINTF("%u curr succ: %s\n", from.pos, + dumpPositions(begin(succ), end(succ)).c_str()); + + for (const auto &to : tolist) { + if (to.pos != POS_EPSILON) { + succ.insert(to); + } + } + + DEBUG_PRINTF("%u succ: %s\n", from.pos, + dumpPositions(begin(succ), end(succ)).c_str()); +} + +void GlushkovBuildStateImpl::addSuccessor(Position from, Position to) { + DEBUG_PRINTF("connect %u -> %u\n", from, to); + assert(from != POS_EPSILON && from != POS_UNINITIALIZED); + assert(to != POS_EPSILON && to != POS_UNINITIALIZED); + + auto &succ = successors[from]; + succ.insert(to); + + DEBUG_PRINTF("%u succ: %s\n", from, + dumpPositions(begin(succ), end(succ)).c_str()); +} + +void GlushkovBuildStateImpl::cloneFollowSet(Position first, Position last, + unsigned offset) { + assert(first <= last); + + // Clone vertex properties (reachability, etc) + builder.cloneRegion(first, last, offset); + + /* Clone the successors of all the positions between first and last + * inclusive, producing a new set of positions starting at (first + + * offset). */ + for (Position i = first; i <= last; i++) { + // This should be a new position. + assert(successors[i + offset].empty()); + + for (const PositionInfo &to : successors[i]) { + if (to.pos >= first && to.pos <= last) { + PositionInfo clone(to); + clone.pos += offset; + DEBUG_PRINTF("clone: %u -> %u\n", i + offset, clone.pos); + successors[i + offset].insert(clone); + } else { + // There shouldn't be any stray edges leading out of this + // region! + assert(0); + } + } + } +} + +void GlushkovBuildStateImpl::buildEdge(Position from, const PositionInfo &to) { + // Guard against embedded anchors + if (to == startState) { + /* can make it through the parse tree */ + throw ParseError("Embedded start anchors not supported."); + } + + assert(to.pos != POS_UNINITIALIZED); + assert(to.pos != POS_EPSILON); + + if (builder.hasEdge(from, to.pos)) { + return; + } + + builder.addEdge(from, to.pos); +} + +void GlushkovBuildStateImpl::buildEdges() { + // Create all the edges and track which vertices are asserts which need to + // be removed later. + for (const auto &m : successors) { + const Position from = m.first; + for (const auto &to : m.second) { + buildEdge(from, to); + } + } +} + +// Construct a usable GlushkovBuildState for the outside world. +unique_ptr<GlushkovBuildState> makeGlushkovBuildState(NFABuilder &b, + bool prefilter) { + return ue2::make_unique<GlushkovBuildStateImpl>(b, prefilter); +} + +// free functions for utility use + +/** \brief Eliminate lower-priority duplicate PositionInfo entries. + * + * Scans through a list of positions and retains only the highest priority + * version of a given (position, flags) entry. */ +void cleanupPositions(vector<PositionInfo> &a) { ue2_unordered_set<pair<Position, int>> seen; - - vector<PositionInfo> out; - out.reserve(a.size()); // output should be close to input in size. - - for (const auto &p : a) { - if (seen.emplace(p.pos, p.flags).second) { - out.push_back(p); // first encounter - } - } - - DEBUG_PRINTF("in %zu; out %zu\n", a.size(), out.size()); - a.swap(out); -} - -static -vector<PositionInfo>::iterator -replaceElemWithSequence(vector<PositionInfo> &dest, - vector<PositionInfo>::iterator &victim, - const vector<PositionInfo> &replacement) { - auto past = dest.erase(victim); - size_t d = distance(dest.begin(), past) + replacement.size(); - dest.insert(past, replacement.begin(), replacement.end()); - /* recalc past as iterator may have been invalidated */ - return dest.begin() + d; -} - -/** \brief Replace all epsilons with the given positions. - * - * Replace epsilons in a firsts list with another given firsts list. Note: the - * firsts lists must come from disjoint sets of components. If no epsilons are - * in the first firsts list the source is appended to the end. - */ -void replaceEpsilons(vector<PositionInfo> &target, - const vector<PositionInfo> &source) { - auto found = - find(target.begin(), target.end(), GlushkovBuildState::POS_EPSILON); - - if (found == target.end()) { - // no epsilons to replace, push on to the end - target.insert(target.end(), source.begin(), source.end()); - return; - } - - while (found != target.end()) { - checkEmbeddedEndAnchor(*found, source); - - // replace this epsilon with a copy of source with the same flags - vector<PositionInfo> newsource(source); - for (auto &pos : newsource) { - pos.flags |= found->flags; - } - - found = replaceElemWithSequence(target, found, newsource); - // find the next epsilon - found = find(found, target.end(), GlushkovBuildState::POS_EPSILON); - } - - cleanupPositions(target); -} - -#ifdef DUMP_SUPPORT - -void dump(ostream &os, const PositionInfo &p) { - if (p.pos == GlushkovBuildState::POS_EPSILON) { - os << "epsilon"; - } else { - os << p.pos; - } - - os << dumpCaptures(p); -} - -#endif // DUMP_SUPPORT - -} // namespace ue2 + + vector<PositionInfo> out; + out.reserve(a.size()); // output should be close to input in size. + + for (const auto &p : a) { + if (seen.emplace(p.pos, p.flags).second) { + out.push_back(p); // first encounter + } + } + + DEBUG_PRINTF("in %zu; out %zu\n", a.size(), out.size()); + a.swap(out); +} + +static +vector<PositionInfo>::iterator +replaceElemWithSequence(vector<PositionInfo> &dest, + vector<PositionInfo>::iterator &victim, + const vector<PositionInfo> &replacement) { + auto past = dest.erase(victim); + size_t d = distance(dest.begin(), past) + replacement.size(); + dest.insert(past, replacement.begin(), replacement.end()); + /* recalc past as iterator may have been invalidated */ + return dest.begin() + d; +} + +/** \brief Replace all epsilons with the given positions. + * + * Replace epsilons in a firsts list with another given firsts list. Note: the + * firsts lists must come from disjoint sets of components. If no epsilons are + * in the first firsts list the source is appended to the end. + */ +void replaceEpsilons(vector<PositionInfo> &target, + const vector<PositionInfo> &source) { + auto found = + find(target.begin(), target.end(), GlushkovBuildState::POS_EPSILON); + + if (found == target.end()) { + // no epsilons to replace, push on to the end + target.insert(target.end(), source.begin(), source.end()); + return; + } + + while (found != target.end()) { + checkEmbeddedEndAnchor(*found, source); + + // replace this epsilon with a copy of source with the same flags + vector<PositionInfo> newsource(source); + for (auto &pos : newsource) { + pos.flags |= found->flags; + } + + found = replaceElemWithSequence(target, found, newsource); + // find the next epsilon + found = find(found, target.end(), GlushkovBuildState::POS_EPSILON); + } + + cleanupPositions(target); +} + +#ifdef DUMP_SUPPORT + +void dump(ostream &os, const PositionInfo &p) { + if (p.pos == GlushkovBuildState::POS_EPSILON) { + os << "epsilon"; + } else { + os << p.pos; + } + + os << dumpCaptures(p); +} + +#endif // DUMP_SUPPORT + +} // namespace ue2 |