diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/pcre/pcre.h | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex/pcre/pcre.h')
-rw-r--r-- | library/cpp/regex/pcre/pcre.h | 191 |
1 files changed, 191 insertions, 0 deletions
diff --git a/library/cpp/regex/pcre/pcre.h b/library/cpp/regex/pcre/pcre.h new file mode 100644 index 0000000000..82a9774f00 --- /dev/null +++ b/library/cpp/regex/pcre/pcre.h @@ -0,0 +1,191 @@ +#pragma once + +#include "traits.h" + +#include <library/cpp/containers/stack_array/stack_array.h> + +#include <util/generic/maybe.h> +#include <util/generic/strbuf.h> +#include <util/generic/vector.h> +#include <util/generic/yexception.h> + +namespace NPcre { + //! Start and end offset for match group. + using TPcreMatch = std::pair<int, int>; + + //! Full match result containing all capturing groups. + /*! + * At zero index we have whole matched string start and end offsets. + * All other elements will contain capturing groups positions. + * Non-captured capturing groups will have {-1, -1} offsets. + */ + using TPcreMatches = TVector<TPcreMatch>; + + //! Compiled pattern optimization strategy. + enum class EOptimize { + //! No optimization. + /*! + * Useful for non-reusable patterns where compile time matters. + */ + None, + //! Basic optimization via |pcre_study|. + /*! + * Could give up to 4x match speed boost in exchange of increased + * construction time. Could not. + */ + Study, + //! PCRE JIT optimization. + /*! + * Could give up to 10x match speed bust in exchange of significantly + * increased compile time. Also, for very complex patterns |pcre_exec| + * could return |PCRE_ERROR_JIT_STACKLIMIT|. See + * https://www.pcre.org/original/doc/html/pcrejit.html for details. + */ + JIT + }; + + //! PCRE code container. Controls its life time and provides handy wrapper. + template <class TCharType> + class TPcre { + private: + using TCodeType = typename TPcreTraits<TCharType>::TCodeType; + using TExtraType = typename TPcreTraits<TCharType>::TExtraType; + using TStringType = typename TPcreTraits<TCharType>::TStringType; + using TTraits = TPcreTraits<TCharType>; + static constexpr size_t DefaultWorkspaceSize = 16; + + public: + //! Compiles regexp into internal representation for future use. + /*! + * \param pattern Regular expression to be compiled. + * \param optimize If |EOptimize::JIT|, perform additional + * analysis, which will take extra time, but could + * speed up matching. |None| to omit optimization. + * \param compileFlags See https://www.pcre.org/original/doc/html/pcre_compile2.html + **/ + TPcre(const TCharType* pattern, EOptimize optimize = EOptimize::None, int compileFlags = 0) { + int errcode; + const char* errptr; + int erroffset; + Code.Reset(TTraits::Compile((TStringType) pattern, compileFlags, &errcode, &errptr, &erroffset, nullptr)); + if (!Code) { + ythrow yexception() << "Failed to compile pattern <" << pattern + << ">, because of error at pos " << erroffset + << ", error code " << errcode << ": " << errptr; + } + if (optimize != EOptimize::None) { + errptr = nullptr; + int options; + if (optimize == EOptimize::Study) { + options = 0; + } else { + options = PCRE_STUDY_JIT_COMPILE; + } + Extra.Reset(TTraits::Study(Code.Get(), options, &errptr)); + if (errptr) { + ythrow yexception() << "Failed to study pattern <" << pattern << ">: " << errptr; + } + } + } + + //! Check if compiled pattern matches string. + /*! + * \param string String to search in. + * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html + * \param workspaceSize Amount of space which will be allocated for + * back references. PCRE could allocate more + * heap space is provided workspaceSize won't + * fit all of them. + * \returns |true| if there is a match. + */ + bool Matches(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const { + Y_ASSERT(workspaceSize >= 0); + size_t ovecsize = workspaceSize * 3; + NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); + return ConvertReturnCode(TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize)); + } + + //! Find compiled pattern in string. + /*! + * \param string String to search in. + * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html + * \param workspaceSize Amount of space which will be allocated for + * back references. PCRE could allocate more + * heap space is provided workspaceSize won't + * fit all of them. + * \returns Start and end offsets pair if there is a + * match. |Nothing| otherwise. + */ + Y_NO_SANITIZE("memory") TMaybe<TPcreMatch> Find(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const { + Y_ASSERT(workspaceSize >= 0); + size_t ovecsize = workspaceSize * 3; + NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); + for (size_t i = 0; i < ovecsize; ++i) { + ovector[i] = -4; + } + int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize); + if (ConvertReturnCode(rc)) { + return MakeMaybe<TPcreMatch>(ovector[0], ovector[1]); + } else { + return Nothing(); + } + } + + //! Find and return all capturing groups in string. + /*! + * \param string String to search in. + * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html + * \param initialWorkspaceSize Capturing groups vector initial size. + * Workspace will be grown and search will + * be repeated if there is not enough + * space. + * \returns List of capturing groups start and end + * offsets. First element will contain + * whole matched substring start and end + * offsets. For non-matched capturing + * groups, result will contain {-1, -1} + * pair. + * If pattern not found in string, result + * vector will be empty. + */ + Y_NO_SANITIZE("memory") TPcreMatches Capture(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t initialWorkspaceSize = DefaultWorkspaceSize) const { + Y_ASSERT(initialWorkspaceSize > 0); + size_t ovecsize = (initialWorkspaceSize + 1) * 3; + while (true) { + NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); + int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize); + if (rc > 0) { + TPcreMatches result(Reserve(rc >> 1)); + for (int i = 0, pos = 0; i < rc; ++i) { + int start = ovector[pos++]; + int end = ovector[pos++]; + result.emplace_back(start, end); + } + return result; + } else if (rc == 0) { + ovecsize <<= 1; + } else if (rc == PCRE_ERROR_NOMATCH) { + return TPcreMatches{}; + } else if (rc < 0) { + ythrow yexception() << "Error. RC = " << rc; + } + } + } + + private: + TPcreCode<TCharType> Code; + TPcreExtra<TCharType> Extra; + + private: + static inline bool ConvertReturnCode(int rc) { + if (rc >= 0) { + return true; + } else if (rc == PCRE_ERROR_NOMATCH) { + return false; + } else { + ythrow yexception() << "Error. RC = " << rc; + } + } + }; +} + |