/* * Copyright (c) 2015-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /** \file * \brief Tools for string manipulation, ue2_literal definition. */ #ifndef UE2STRING_H #define UE2STRING_H #include "ue2common.h" #include "util/charreach.h" #include "util/compare.h" #include "util/hash.h" #include "util/operators.h" #include <iterator> #include <string> #include <vector> #include <boost/dynamic_bitset.hpp> #include <boost/iterator/iterator_facade.hpp> namespace ue2 { /// Force the given string to upper-case. void upperString(std::string &s); size_t maxStringOverlap(const std::string &a, const std::string &b, bool nocase); size_t maxStringSelfOverlap(const std::string &a, bool nocase); /// Compares two strings, returns non-zero if they're different. u32 cmp(const char *a, const char *b, size_t len, bool nocase); /** * \brief String type that also records whether the whole string is caseful or * caseless. * * You should use \ref ue2_literal if you need to represent a mixed-case * literal. */ struct ue2_case_string { ue2_case_string(std::string s_in, bool nocase_in) : s(std::move(s_in)), nocase(nocase_in) { if (nocase) { upperString(s); } } bool operator==(const ue2_case_string &other) const { return s == other.s && nocase == other.nocase; } std::string s; bool nocase; }; struct ue2_literal : totally_ordered<ue2_literal> { public: /// Single element proxy, pointed to by our const_iterator. struct elem { elem() : c(0), nocase(false) {} elem(char c_in, bool nc_in) : c(c_in), nocase(nc_in) {} bool operator==(const elem &o) const { return c == o.c && nocase == o.nocase; } bool operator!=(const elem &o) const { return c != o.c || nocase != o.nocase; } operator CharReach() const; char c; bool nocase; }; /// Boost iterator_facade lets us synthesize an iterator simply. class const_iterator : public boost::iterator_facade< const_iterator, elem const, boost::random_access_traversal_tag, elem const> { public: using iterator_category = typename std::random_access_iterator_tag; const_iterator() {} private: friend class boost::iterator_core_access; void increment() { ++idx; } void decrement() { --idx; } void advance(size_t n) { idx += n; } difference_type distance_to(const const_iterator &other) const { return other.idx - idx; } bool equal(const const_iterator &other) const { return idx == other.idx && lit == other.lit; } const elem dereference() const { return elem(lit->s[idx], lit->nocase[idx]); } friend struct ue2_literal; const_iterator(const ue2_literal &lit_in, size_t idx_in) : lit(&lit_in), idx(idx_in) {} const ue2_literal *lit = nullptr; size_t idx = 0; }; using const_reverse_iterator = std::reverse_iterator<const_iterator>; using size_type = std::string::size_type; static const size_type npos; ue2_literal() = default; ue2_literal(const std::string &s_in, bool nc_in); ue2_literal(char c, bool nc_in); ue2_literal(const ue2_literal &) = default; ue2_literal(ue2_literal &&) = default; ue2_literal &operator=(const ue2_literal &) = default; ue2_literal &operator=(ue2_literal &&) = default; template<typename InputIt> ue2_literal(InputIt b, InputIt e) { for (; b != e; ++b) { push_back(*b); } } size_type length() const { return s.length(); } bool empty() const { return s.empty(); } ue2_literal substr(size_type pos, size_type n = npos) const; const char *c_str() const { return s.c_str(); } bool any_nocase() const; const_iterator begin() const { return const_iterator(*this, 0); } const_iterator end() const { return const_iterator(*this, s.size()); } const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } ue2_literal &erase(size_type pos = 0, size_type n = npos); void push_back(const elem &e) { push_back(e.c, e.nocase); } void push_back(char c, bool nc); const elem back() const { return *rbegin(); } friend ue2_literal operator+(ue2_literal a, const ue2_literal &b) { a += b; return a; } /// Reverse this literal in-place. void reverse(); void operator+=(const ue2_literal &b); bool operator==(const ue2_literal &b) const { return s == b.s && nocase == b.nocase; } bool operator<(const ue2_literal &b) const; void clear(void) { s.clear(); nocase.clear(); } const std::string &get_string() const { return s; } void swap(ue2_literal &other) { s.swap(other.s); nocase.swap(other.nocase); } size_t hash() const; private: friend const_iterator; std::string s; boost::dynamic_bitset<> nocase; }; /// Return a reversed copy of this literal. ue2_literal reverse_literal(const ue2_literal &in); // Escape any meta characters in a string std::string escapeStringMeta(const std::string &s); /** Note: may be overly conservative if only partially nocase */ size_t maxStringSelfOverlap(const ue2_literal &a); size_t minStringPeriod(const ue2_literal &a); size_t maxStringOverlap(const ue2_literal &a, const ue2_literal &b); /** * \brief True iff the range of a literal given cannot be considered entirely * case-sensitive nor entirely case-insensitive. */ template<class Iter> bool mixed_sensitivity_in(Iter begin, Iter end) { bool cs = false; bool nc = false; for (auto it = begin; it != end; ++it) { if (!ourisalpha(it->c)) { continue; } if (it->nocase) { nc = true; } else { cs = true; } } return cs && nc; } /** * \brief True iff the literal cannot be considered entirely case-sensitive * nor entirely case-insensitive. */ inline bool mixed_sensitivity(const ue2_literal &s) { return mixed_sensitivity_in(s.begin(), s.end()); } void make_nocase(ue2_literal *lit); struct case_iter { explicit case_iter(const ue2_literal &ss); const std::string &operator*() const { return s; } /* limited lifetime */ case_iter &operator++ (); bool operator!=(const case_iter &b) const { return s != b.s; } private: std::string s; std::string s_orig; std::vector<bool> nocase; }; case_iter caseIterateBegin(const ue2_literal &lit); case_iter caseIterateEnd(); /** \brief True if there is any overlap between the characters in \a s and the * set characters in \a cr. * * Note: this means that if \a s is nocase, then \a cr only needs to have * either the lower-case or upper-case version of a letter set. */ bool contains(const ue2_literal &s, const CharReach &cr); /// Returns true if \a a is a suffix of (or equal to) \a b. bool isSuffix(const ue2_literal &a, const ue2_literal &b); static inline std::vector<CharReach> as_cr_seq(const ue2_literal &s) { std::vector<CharReach> rv; rv.reserve(s.length()); rv.insert(rv.end(), s.begin(), s.end()); return rv; } /** \brief True if the given literal consists entirely of a flood of the same * character. */ bool is_flood(const ue2_literal &s); #if defined(DUMP_SUPPORT) || defined(DEBUG) /* Utility functions for debugging/dumping */ /// Escape a string so it's dot-printable. std::string dotEscapeString(const std::string &s); std::string dumpString(const ue2_literal &lit); /// Escape a string so that it's screen-printable. std::string escapeString(const std::string &s); /// Escape a ue2_literal so that it's screen-printable. std::string escapeString(const ue2_literal &lit); #endif } // namespace ue2 namespace std { template<> struct hash<ue2::ue2_literal::elem> { size_t operator()(const ue2::ue2_literal::elem &elem) const { return ue2::hash_all(elem.c, elem.nocase); } }; template<> struct hash<ue2::ue2_literal> { size_t operator()(const ue2::ue2_literal &lit) const { return lit.hash(); } }; } // namespace std #endif