/* * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /** \file * \brief Shortcut literal pass: directly add literal components to Rose. */ #include "AsciiComponentClass.h" #include "Utf8ComponentClass.h" #include "ComponentAssertion.h" #include "ComponentAtomicGroup.h" #include "ComponentBackReference.h" #include "ComponentBoundary.h" #include "ComponentClass.h" #include "ComponentCondReference.h" #include "ComponentRepeat.h" #include "ComponentSequence.h" #include "ComponentVisitor.h" #include "ComponentWordBoundary.h" #include "ConstComponentVisitor.h" #include "parse_error.h" #include "shortcut_literal.h" #include "grey.h" #include "nfagraph/ng.h" #include "compiler/compiler.h" #include "util/ue2string.h" #include "ue2common.h" #include <stack> using namespace std; namespace ue2 { /** * \brief Visitor that constructs a ue2_literal from a component tree. * * If a component that can't be part of a literal is encountered, this visitor * will throw ConstructLiteralVisitor::NotLiteral. */ class ConstructLiteralVisitor : public ConstComponentVisitor { public: ~ConstructLiteralVisitor() override; /** \brief Thrown if this component does not represent a literal. */ struct NotLiteral {}; void pre(const AsciiComponentClass &c) override { const CharReach &cr = c.cr; const size_t width = cr.count(); if (width == 1) { lit.push_back(cr.find_first(), false); } else if (width == 2 && cr.isCaselessChar()) { lit.push_back(cr.find_first(), true); } else { throw NotLiteral(); } } void pre(const ComponentRepeat &c) override { if (c.m_min == 0 || c.m_min != c.m_max) { throw NotLiteral(); } if (c.m_max < ComponentRepeat::NoLimit && c.m_max > 32767) { throw ParseError("Bounded repeat is too large."); } // Store the current length of the literal; in this repeat's post() // call we will append N-1 more copies of [index..end]. repeat_stack.push(lit.length()); } void post(const ComponentRepeat &c) override { // Add N-1 copies of the string between the entry to the repeat and the // current end of the literal. assert(!repeat_stack.empty()); const ue2_literal suffix = lit.substr(repeat_stack.top()); repeat_stack.pop(); for (unsigned i = 1; i < c.m_min; i++) { lit += suffix; } } void pre(const ComponentSequence &) override { // Pass through. } void pre(const ComponentAlternation &) override { throw NotLiteral(); } void pre(const ComponentAssertion &) override { throw NotLiteral(); } void pre(const ComponentAtomicGroup &) override { throw NotLiteral(); } void pre(const ComponentBackReference &) override { throw NotLiteral(); } void pre(const ComponentBoundary &) override { throw NotLiteral(); } void pre(const ComponentByte &) override { throw NotLiteral(); } void pre(const ComponentCondReference &) override { throw NotLiteral(); } void pre(const ComponentEmpty &) override { throw NotLiteral(); } void pre(const ComponentEUS &) override { throw NotLiteral(); } void pre(const ComponentWordBoundary &) override { throw NotLiteral(); } void pre(const UTF8ComponentClass &) override { throw NotLiteral(); } void during(const AsciiComponentClass &) override {} void during(const ComponentAlternation &) override {} void during(const ComponentAssertion &) override {} void during(const ComponentAtomicGroup &) override {} void during(const ComponentBackReference &) override {} void during(const ComponentBoundary &) override {} void during(const ComponentByte &) override {} void during(const ComponentCondReference &) override {} void during(const ComponentEmpty &) override {} void during(const ComponentEUS &) override {} void during(const ComponentRepeat &) override {} void during(const ComponentSequence &) override {} void during(const ComponentWordBoundary &) override {} void during(const UTF8ComponentClass &) override {} void post(const AsciiComponentClass &) override {} void post(const ComponentAlternation &) override {} void post(const ComponentAssertion &) override {} void post(const ComponentAtomicGroup &) override {} void post(const ComponentBackReference &) override {} void post(const ComponentBoundary &) override {} void post(const ComponentByte &) override {} void post(const ComponentCondReference &) override {} void post(const ComponentEmpty &) override {} void post(const ComponentEUS &) override {} void post(const ComponentSequence &) override {} void post(const ComponentWordBoundary &) override {} void post(const UTF8ComponentClass &) override {} ue2_literal lit; stack<size_t> repeat_stack; //!< index of entry to repeat. }; ConstructLiteralVisitor::~ConstructLiteralVisitor() {} /** \brief True if the literal expression \a expr could be added to Rose. */ bool shortcutLiteral(NG &ng, const ParsedExpression &pe) { assert(pe.component); if (!ng.cc.grey.allowLiteral) { return false; } const auto &expr = pe.expr; // XXX: don't shortcut literals with extended params (yet) if (expr.min_offset || expr.max_offset != MAX_OFFSET || expr.min_length || expr.edit_distance || expr.hamm_distance) { DEBUG_PRINTF("extended params not allowed\n"); return false; } ConstructLiteralVisitor vis; try { assert(pe.component); pe.component->accept(vis); assert(vis.repeat_stack.empty()); } catch (const ConstructLiteralVisitor::NotLiteral&) { DEBUG_PRINTF("not a literal\n"); return false; } const ue2_literal &lit = vis.lit; if (lit.empty()) { DEBUG_PRINTF("empty literal\n"); return false; } if (expr.highlander && lit.length() <= 1) { DEBUG_PRINTF("not shortcutting SEP literal\n"); return false; } DEBUG_PRINTF("constructed literal %s\n", dumpString(lit).c_str()); return ng.addLiteral(lit, expr.index, expr.report, expr.highlander, expr.som, expr.quiet); } } // namespace ue2