diff options
author | robot-piglet <robot-piglet@yandex-team.com> | 2023-12-02 01:45:21 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2023-12-02 02:42:50 +0300 |
commit | 9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c (patch) | |
tree | 9f88a486917d371d099cd712efd91b4c122d209d /contrib/libs/antlr4_cpp_runtime/src/atn/LexerATNSimulator.cpp | |
parent | 32fb6dda1feb24f9ab69ece5df0cb9ec238ca5e6 (diff) | |
download | ydb-9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c.tar.gz |
Intermediate changes
Diffstat (limited to 'contrib/libs/antlr4_cpp_runtime/src/atn/LexerATNSimulator.cpp')
-rw-r--r-- | contrib/libs/antlr4_cpp_runtime/src/atn/LexerATNSimulator.cpp | 617 |
1 files changed, 617 insertions, 0 deletions
diff --git a/contrib/libs/antlr4_cpp_runtime/src/atn/LexerATNSimulator.cpp b/contrib/libs/antlr4_cpp_runtime/src/atn/LexerATNSimulator.cpp new file mode 100644 index 0000000000..ef1b1cf2f1 --- /dev/null +++ b/contrib/libs/antlr4_cpp_runtime/src/atn/LexerATNSimulator.cpp @@ -0,0 +1,617 @@ +/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. + * Use of this file is governed by the BSD 3-clause license that + * can be found in the LICENSE.txt file in the project root. + */ + +#include "IntStream.h" +#include "atn/OrderedATNConfigSet.h" +#include "Token.h" +#include "LexerNoViableAltException.h" +#include "atn/RuleStopState.h" +#include "atn/RuleTransition.h" +#include "atn/SingletonPredictionContext.h" +#include "atn/PredicateTransition.h" +#include "atn/ActionTransition.h" +#include "atn/TokensStartState.h" +#include "misc/Interval.h" +#include "dfa/DFA.h" +#include "Lexer.h" +#include "internal/Synchronization.h" + +#include "dfa/DFAState.h" +#include "atn/LexerATNConfig.h" +#include "atn/LexerActionExecutor.h" + +#include "atn/LexerATNSimulator.h" + +#define DEBUG_ATN 0 +#define DEBUG_DFA 0 + +using namespace antlr4; +using namespace antlr4::atn; +using namespace antlr4::internal; +using namespace antlrcpp; + +void LexerATNSimulator::SimState::reset() { + *this = SimState(); +} + +LexerATNSimulator::LexerATNSimulator(const ATN &atn, std::vector<dfa::DFA> &decisionToDFA, + PredictionContextCache &sharedContextCache) + : LexerATNSimulator(nullptr, atn, decisionToDFA, sharedContextCache) { +} + +LexerATNSimulator::LexerATNSimulator(Lexer *recog, const ATN &atn, std::vector<dfa::DFA> &decisionToDFA, + PredictionContextCache &sharedContextCache) + : ATNSimulator(atn, sharedContextCache), _recog(recog), _decisionToDFA(decisionToDFA) { + InitializeInstanceFields(); +} + +void LexerATNSimulator::copyState(LexerATNSimulator *simulator) { + _charPositionInLine = simulator->_charPositionInLine; + _line = simulator->_line; + _mode = simulator->_mode; + _startIndex = simulator->_startIndex; +} + +size_t LexerATNSimulator::match(CharStream *input, size_t mode) { + _mode = mode; + ssize_t mark = input->mark(); + + auto onExit = finally([input, mark] { + input->release(mark); + }); + + _startIndex = input->index(); + _prevAccept.reset(); + const dfa::DFA &dfa = _decisionToDFA[mode]; + dfa::DFAState* s0; + { + SharedLock<SharedMutex> stateLock(atn._stateMutex); + s0 = dfa.s0; + } + if (s0 == nullptr) { + return matchATN(input); + } else { + return execATN(input, s0); + } +} + +void LexerATNSimulator::reset() { + _prevAccept.reset(); + _startIndex = 0; + _line = 1; + _charPositionInLine = 0; + _mode = Lexer::DEFAULT_MODE; +} + +void LexerATNSimulator::clearDFA() { + size_t size = _decisionToDFA.size(); + _decisionToDFA.clear(); + for (size_t d = 0; d < size; ++d) { + _decisionToDFA.emplace_back(atn.getDecisionState(d), d); + } +} + +size_t LexerATNSimulator::matchATN(CharStream *input) { + ATNState *startState = atn.modeToStartState[_mode]; + + std::unique_ptr<ATNConfigSet> s0_closure = computeStartState(input, startState); + + bool suppressEdge = s0_closure->hasSemanticContext; + s0_closure->hasSemanticContext = false; + + dfa::DFAState *next = addDFAState(s0_closure.release(), suppressEdge); + + size_t predict = execATN(input, next); + + return predict; +} + +size_t LexerATNSimulator::execATN(CharStream *input, dfa::DFAState *ds0) { + if (ds0->isAcceptState) { + // allow zero-length tokens + // ml: in Java code this method uses 3 params. The first is a member var of the class anyway (_prevAccept), so why pass it here? + captureSimState(input, ds0); + } + + size_t t = input->LA(1); + dfa::DFAState *s = ds0; // s is current/from DFA state + + while (true) { // while more work + // As we move src->trg, src->trg, we keep track of the previous trg to + // avoid looking up the DFA state again, which is expensive. + // If the previous target was already part of the DFA, we might + // be able to avoid doing a reach operation upon t. If s!=null, + // it means that semantic predicates didn't prevent us from + // creating a DFA state. Once we know s!=null, we check to see if + // the DFA state has an edge already for t. If so, we can just reuse + // it's configuration set; there's no point in re-computing it. + // This is kind of like doing DFA simulation within the ATN + // simulation because DFA simulation is really just a way to avoid + // computing reach/closure sets. Technically, once we know that + // we have a previously added DFA state, we could jump over to + // the DFA simulator. But, that would mean popping back and forth + // a lot and making things more complicated algorithmically. + // This optimization makes a lot of sense for loops within DFA. + // A character will take us back to an existing DFA state + // that already has lots of edges out of it. e.g., .* in comments. + dfa::DFAState *target = getExistingTargetState(s, t); + if (target == nullptr) { + target = computeTargetState(input, s, t); + } + + if (target == ERROR.get()) { + break; + } + + // If this is a consumable input element, make sure to consume before + // capturing the accept state so the input index, line, and char + // position accurately reflect the state of the interpreter at the + // end of the token. + if (t != Token::EOF) { + consume(input); + } + + if (target->isAcceptState) { + captureSimState(input, target); + if (t == Token::EOF) { + break; + } + } + + t = input->LA(1); + s = target; // flip; current DFA target becomes new src/from state + } + + return failOrAccept(input, s->configs.get(), t); +} + +dfa::DFAState *LexerATNSimulator::getExistingTargetState(dfa::DFAState *s, size_t t) { + dfa::DFAState* retval = nullptr; + SharedLock<SharedMutex> edgeLock(atn._edgeMutex); + if (t <= MAX_DFA_EDGE) { + auto iterator = s->edges.find(t - MIN_DFA_EDGE); +#if DEBUG_ATN == 1 + if (iterator != s->edges.end()) { + std::cout << std::string("reuse state ") << s->stateNumber << std::string(" edge to ") << iterator->second->stateNumber << std::endl; + } +#endif + + if (iterator != s->edges.end()) + retval = iterator->second; + } + return retval; +} + +dfa::DFAState *LexerATNSimulator::computeTargetState(CharStream *input, dfa::DFAState *s, size_t t) { + OrderedATNConfigSet *reach = new OrderedATNConfigSet(); /* mem-check: deleted on error or managed by new DFA state. */ + + // if we don't find an existing DFA state + // Fill reach starting from closure, following t transitions + getReachableConfigSet(input, s->configs.get(), reach, t); + + if (reach->isEmpty()) { // we got nowhere on t from s + if (!reach->hasSemanticContext) { + // we got nowhere on t, don't throw out this knowledge; it'd + // cause a failover from DFA later. + addDFAEdge(s, t, ERROR.get()); + } + delete reach; + + // stop when we can't match any more char + return ERROR.get(); + } + + // Add an edge from s to target DFA found/created for reach + return addDFAEdge(s, t, reach); +} + +size_t LexerATNSimulator::failOrAccept(CharStream *input, ATNConfigSet *reach, size_t t) { + if (_prevAccept.dfaState != nullptr) { + accept(input, _prevAccept.dfaState->lexerActionExecutor, _startIndex, _prevAccept.index, _prevAccept.line, _prevAccept.charPos); + return _prevAccept.dfaState->prediction; + } else { + // if no accept and EOF is first char, return EOF + if (t == Token::EOF && input->index() == _startIndex) { + return Token::EOF; + } + + throw LexerNoViableAltException(_recog, input, _startIndex, reach); + } +} + +void LexerATNSimulator::getReachableConfigSet(CharStream *input, ATNConfigSet *closure_, ATNConfigSet *reach, size_t t) { + // this is used to skip processing for configs which have a lower priority + // than a config that already reached an accept state for the same rule + size_t skipAlt = ATN::INVALID_ALT_NUMBER; + + for (const auto &c : closure_->configs) { + bool currentAltReachedAcceptState = c->alt == skipAlt; + if (currentAltReachedAcceptState && (std::static_pointer_cast<LexerATNConfig>(c))->hasPassedThroughNonGreedyDecision()) { + continue; + } + +#if DEBUG_ATN == 1 + std::cout << "testing " << getTokenName((int)t) << " at " << c->toString(true) << std::endl; +#endif + + size_t n = c->state->transitions.size(); + for (size_t ti = 0; ti < n; ti++) { // for each transition + const Transition *trans = c->state->transitions[ti].get(); + ATNState *target = getReachableTarget(trans, (int)t); + if (target != nullptr) { + auto lexerActionExecutor = downCast<const LexerATNConfig&>(*c).getLexerActionExecutor(); + if (lexerActionExecutor != nullptr) { + lexerActionExecutor = lexerActionExecutor->fixOffsetBeforeMatch((int)input->index() - (int)_startIndex); + } + + bool treatEofAsEpsilon = t == Token::EOF; + Ref<LexerATNConfig> config = std::make_shared<LexerATNConfig>(downCast<const LexerATNConfig&>(*c), + target, std::move(lexerActionExecutor)); + + if (closure(input, config, reach, currentAltReachedAcceptState, true, treatEofAsEpsilon)) { + // any remaining configs for this alt have a lower priority than + // the one that just reached an accept state. + skipAlt = c->alt; + break; + } + } + } + } +} + +void LexerATNSimulator::accept(CharStream *input, const Ref<const LexerActionExecutor> &lexerActionExecutor, size_t /*startIndex*/, + size_t index, size_t line, size_t charPos) { +#if DEBUG_ATN == 1 + std::cout << "ACTION "; + std::cout << toString(lexerActionExecutor) << std::endl; +#endif + + // seek to after last char in token + input->seek(index); + _line = line; + _charPositionInLine = (int)charPos; + + if (lexerActionExecutor != nullptr && _recog != nullptr) { + lexerActionExecutor->execute(_recog, input, _startIndex); + } +} + +atn::ATNState *LexerATNSimulator::getReachableTarget(const Transition *trans, size_t t) { + if (trans->matches(t, Lexer::MIN_CHAR_VALUE, Lexer::MAX_CHAR_VALUE)) { + return trans->target; + } + + return nullptr; +} + +std::unique_ptr<ATNConfigSet> LexerATNSimulator::computeStartState(CharStream *input, ATNState *p) { + Ref<const PredictionContext> initialContext = PredictionContext::EMPTY; // ml: the purpose of this assignment is unclear + std::unique_ptr<ATNConfigSet> configs(new OrderedATNConfigSet()); + for (size_t i = 0; i < p->transitions.size(); i++) { + ATNState *target = p->transitions[i]->target; + Ref<LexerATNConfig> c = std::make_shared<LexerATNConfig>(target, (int)(i + 1), initialContext); + closure(input, c, configs.get(), false, false, false); + } + + return configs; +} + +bool LexerATNSimulator::closure(CharStream *input, const Ref<LexerATNConfig> &config, ATNConfigSet *configs, + bool currentAltReachedAcceptState, bool speculative, bool treatEofAsEpsilon) { +#if DEBUG_ATN == 1 + std::cout << "closure(" << config->toString(true) << ")" << std::endl; +#endif + + if (config->state != nullptr && config->state->getStateType() == ATNStateType::RULE_STOP) { +#if DEBUG_ATN == 1 + if (_recog != nullptr) { + std::cout << "closure at " << _recog->getRuleNames()[config->state->ruleIndex] << " rule stop " << config << std::endl; + } else { + std::cout << "closure at rule stop " << config << std::endl; + } +#endif + + if (config->context == nullptr || config->context->hasEmptyPath()) { + if (config->context == nullptr || config->context->isEmpty()) { + configs->add(config); + return true; + } else { + configs->add(std::make_shared<LexerATNConfig>(*config, config->state, PredictionContext::EMPTY)); + currentAltReachedAcceptState = true; + } + } + + if (config->context != nullptr && !config->context->isEmpty()) { + for (size_t i = 0; i < config->context->size(); i++) { + if (config->context->getReturnState(i) != PredictionContext::EMPTY_RETURN_STATE) { + Ref<const PredictionContext> newContext = config->context->getParent(i); // "pop" return state + ATNState *returnState = atn.states[config->context->getReturnState(i)]; + Ref<LexerATNConfig> c = std::make_shared<LexerATNConfig>(*config, returnState, newContext); + currentAltReachedAcceptState = closure(input, c, configs, currentAltReachedAcceptState, speculative, treatEofAsEpsilon); + } + } + } + + return currentAltReachedAcceptState; + } + + // optimization + if (!config->state->epsilonOnlyTransitions) { + if (!currentAltReachedAcceptState || !config->hasPassedThroughNonGreedyDecision()) { + configs->add(config); + } + } + + ATNState *p = config->state; + for (size_t i = 0; i < p->transitions.size(); i++) { + const Transition *t = p->transitions[i].get(); + Ref<LexerATNConfig> c = getEpsilonTarget(input, config, t, configs, speculative, treatEofAsEpsilon); + if (c != nullptr) { + currentAltReachedAcceptState = closure(input, c, configs, currentAltReachedAcceptState, speculative, treatEofAsEpsilon); + } + } + + return currentAltReachedAcceptState; +} + +Ref<LexerATNConfig> LexerATNSimulator::getEpsilonTarget(CharStream *input, const Ref<LexerATNConfig> &config, const Transition *t, + ATNConfigSet *configs, bool speculative, bool treatEofAsEpsilon) { + + Ref<LexerATNConfig> c = nullptr; + switch (t->getTransitionType()) { + case TransitionType::RULE: { + const RuleTransition *ruleTransition = static_cast<const RuleTransition*>(t); + Ref<const PredictionContext> newContext = SingletonPredictionContext::create(config->context, ruleTransition->followState->stateNumber); + c = std::make_shared<LexerATNConfig>(*config, t->target, newContext); + break; + } + + case TransitionType::PRECEDENCE: + throw UnsupportedOperationException("Precedence predicates are not supported in lexers."); + + case TransitionType::PREDICATE: { + /* Track traversing semantic predicates. If we traverse, + we cannot add a DFA state for this "reach" computation + because the DFA would not test the predicate again in the + future. Rather than creating collections of semantic predicates + like v3 and testing them on prediction, v4 will test them on the + fly all the time using the ATN not the DFA. This is slower but + semantically it's not used that often. One of the key elements to + this predicate mechanism is not adding DFA states that see + predicates immediately afterwards in the ATN. For example, + + a : ID {p1}? | ID {p2}? ; + + should create the start state for rule 'a' (to save start state + competition), but should not create target of ID state. The + collection of ATN states the following ID references includes + states reached by traversing predicates. Since this is when we + test them, we cannot cash the DFA state target of ID. + */ + const PredicateTransition *pt = static_cast<const PredicateTransition*>(t); + +#if DEBUG_ATN == 1 + std::cout << "EVAL rule " << pt->getRuleIndex() << ":" << pt->getPredIndex() << std::endl; +#endif + + configs->hasSemanticContext = true; + if (evaluatePredicate(input, pt->getRuleIndex(), pt->getPredIndex(), speculative)) { + c = std::make_shared<LexerATNConfig>(*config, t->target); + } + break; + } + + case TransitionType::ACTION: + if (config->context == nullptr|| config->context->hasEmptyPath()) { + // execute actions anywhere in the start rule for a token. + // + // TODO: if the entry rule is invoked recursively, some + // actions may be executed during the recursive call. The + // problem can appear when hasEmptyPath() is true but + // isEmpty() is false. In this case, the config needs to be + // split into two contexts - one with just the empty path + // and another with everything but the empty path. + // Unfortunately, the current algorithm does not allow + // getEpsilonTarget to return two configurations, so + // additional modifications are needed before we can support + // the split operation. + auto lexerActionExecutor = LexerActionExecutor::append(config->getLexerActionExecutor(), + atn.lexerActions[static_cast<const ActionTransition *>(t)->actionIndex]); + c = std::make_shared<LexerATNConfig>(*config, t->target, std::move(lexerActionExecutor)); + break; + } + else { + // ignore actions in referenced rules + c = std::make_shared<LexerATNConfig>(*config, t->target); + break; + } + + case TransitionType::EPSILON: + c = std::make_shared<LexerATNConfig>(*config, t->target); + break; + + case TransitionType::ATOM: + case TransitionType::RANGE: + case TransitionType::SET: + if (treatEofAsEpsilon) { + if (t->matches(Token::EOF, Lexer::MIN_CHAR_VALUE, Lexer::MAX_CHAR_VALUE)) { + c = std::make_shared<LexerATNConfig>(*config, t->target); + break; + } + } + + break; + + default: // To silence the compiler. Other transition types are not used here. + break; + } + + return c; +} + +bool LexerATNSimulator::evaluatePredicate(CharStream *input, size_t ruleIndex, size_t predIndex, bool speculative) { + // assume true if no recognizer was provided + if (_recog == nullptr) { + return true; + } + + if (!speculative) { + return _recog->sempred(nullptr, ruleIndex, predIndex); + } + + size_t savedCharPositionInLine = _charPositionInLine; + size_t savedLine = _line; + size_t index = input->index(); + ssize_t marker = input->mark(); + + auto onExit = finally([this, input, savedCharPositionInLine, savedLine, index, marker] { + _charPositionInLine = savedCharPositionInLine; + _line = savedLine; + input->seek(index); + input->release(marker); + }); + + consume(input); + return _recog->sempred(nullptr, ruleIndex, predIndex); +} + +void LexerATNSimulator::captureSimState(CharStream *input, dfa::DFAState *dfaState) { + _prevAccept.index = input->index(); + _prevAccept.line = _line; + _prevAccept.charPos = _charPositionInLine; + _prevAccept.dfaState = dfaState; +} + +dfa::DFAState *LexerATNSimulator::addDFAEdge(dfa::DFAState *from, size_t t, ATNConfigSet *q) { + /* leading to this call, ATNConfigSet.hasSemanticContext is used as a + * marker indicating dynamic predicate evaluation makes this edge + * dependent on the specific input sequence, so the static edge in the + * DFA should be omitted. The target DFAState is still created since + * execATN has the ability to resynchronize with the DFA state cache + * following the predicate evaluation step. + * + * TJP notes: next time through the DFA, we see a pred again and eval. + * If that gets us to a previously created (but dangling) DFA + * state, we can continue in pure DFA mode from there. + */ + bool suppressEdge = q->hasSemanticContext; + q->hasSemanticContext = false; + + dfa::DFAState *to = addDFAState(q); + + if (suppressEdge) { + return to; + } + + addDFAEdge(from, t, to); + return to; +} + +void LexerATNSimulator::addDFAEdge(dfa::DFAState *p, size_t t, dfa::DFAState *q) { + if (/*t < MIN_DFA_EDGE ||*/ t > MAX_DFA_EDGE) { // MIN_DFA_EDGE is 0 + // Only track edges within the DFA bounds + return; + } + + UniqueLock<SharedMutex> edgeLock(atn._edgeMutex); + p->edges[t - MIN_DFA_EDGE] = q; // connect +} + +dfa::DFAState *LexerATNSimulator::addDFAState(ATNConfigSet *configs) { + return addDFAState(configs, true); +} + +dfa::DFAState *LexerATNSimulator::addDFAState(ATNConfigSet *configs, bool suppressEdge) { + /* the lexer evaluates predicates on-the-fly; by this point configs + * should not contain any configurations with unevaluated predicates. + */ + assert(!configs->hasSemanticContext); + + dfa::DFAState *proposed = new dfa::DFAState(std::unique_ptr<ATNConfigSet>(configs)); /* mem-check: managed by the DFA or deleted below */ + Ref<ATNConfig> firstConfigWithRuleStopState = nullptr; + for (const auto &c : configs->configs) { + if (RuleStopState::is(c->state)) { + firstConfigWithRuleStopState = c; + break; + } + } + + if (firstConfigWithRuleStopState != nullptr) { + proposed->isAcceptState = true; + proposed->lexerActionExecutor = downCast<const LexerATNConfig&>(*firstConfigWithRuleStopState).getLexerActionExecutor(); + proposed->prediction = atn.ruleToTokenType[firstConfigWithRuleStopState->state->ruleIndex]; + } + + dfa::DFA &dfa = _decisionToDFA[_mode]; + + { + UniqueLock<SharedMutex> stateLock(atn._stateMutex); + auto [existing, inserted] = dfa.states.insert(proposed); + if (!inserted) { + delete proposed; + proposed = *existing; + } else { + // Previously we did a lookup, then set fields, then inserted. It was `dfa.states.size()`, + // since we already inserted we need to subtract one. + proposed->stateNumber = static_cast<int>(dfa.states.size() - 1); + proposed->configs->setReadonly(true); + } + if (!suppressEdge) { + dfa.s0 = proposed; + } + } + + return proposed; +} + +dfa::DFA& LexerATNSimulator::getDFA(size_t mode) { + return _decisionToDFA[mode]; +} + +std::string LexerATNSimulator::getText(CharStream *input) { + // index is first lookahead char, don't include. + return input->getText(misc::Interval(_startIndex, input->index() - 1)); +} + +size_t LexerATNSimulator::getLine() const { + return _line; +} + +void LexerATNSimulator::setLine(size_t line) { + _line = line; +} + +size_t LexerATNSimulator::getCharPositionInLine() { + return _charPositionInLine; +} + +void LexerATNSimulator::setCharPositionInLine(size_t charPositionInLine) { + _charPositionInLine = charPositionInLine; +} + +void LexerATNSimulator::consume(CharStream *input) { + size_t curChar = input->LA(1); + if (curChar == '\n') { + _line++; + _charPositionInLine = 0; + } else { + _charPositionInLine++; + } + input->consume(); +} + +std::string LexerATNSimulator::getTokenName(size_t t) { + if (t == Token::EOF) { + return "EOF"; + } + return std::string("'") + static_cast<char>(t) + std::string("'"); +} + +void LexerATNSimulator::InitializeInstanceFields() { + _startIndex = 0; + _line = 1; + _charPositionInLine = 0; + _mode = antlr4::Lexer::DEFAULT_MODE; +} |