aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/regex/pcre/pcre.h
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/pcre/pcre.h
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex/pcre/pcre.h')
-rw-r--r--library/cpp/regex/pcre/pcre.h191
1 files changed, 191 insertions, 0 deletions
diff --git a/library/cpp/regex/pcre/pcre.h b/library/cpp/regex/pcre/pcre.h
new file mode 100644
index 0000000000..82a9774f00
--- /dev/null
+++ b/library/cpp/regex/pcre/pcre.h
@@ -0,0 +1,191 @@
+#pragma once
+
+#include "traits.h"
+
+#include <library/cpp/containers/stack_array/stack_array.h>
+
+#include <util/generic/maybe.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+
+namespace NPcre {
+ //! Start and end offset for match group.
+ using TPcreMatch = std::pair<int, int>;
+
+ //! Full match result containing all capturing groups.
+ /*!
+ * At zero index we have whole matched string start and end offsets.
+ * All other elements will contain capturing groups positions.
+ * Non-captured capturing groups will have {-1, -1} offsets.
+ */
+ using TPcreMatches = TVector<TPcreMatch>;
+
+ //! Compiled pattern optimization strategy.
+ enum class EOptimize {
+ //! No optimization.
+ /*!
+ * Useful for non-reusable patterns where compile time matters.
+ */
+ None,
+ //! Basic optimization via |pcre_study|.
+ /*!
+ * Could give up to 4x match speed boost in exchange of increased
+ * construction time. Could not.
+ */
+ Study,
+ //! PCRE JIT optimization.
+ /*!
+ * Could give up to 10x match speed bust in exchange of significantly
+ * increased compile time. Also, for very complex patterns |pcre_exec|
+ * could return |PCRE_ERROR_JIT_STACKLIMIT|. See
+ * https://www.pcre.org/original/doc/html/pcrejit.html for details.
+ */
+ JIT
+ };
+
+ //! PCRE code container. Controls its life time and provides handy wrapper.
+ template <class TCharType>
+ class TPcre {
+ private:
+ using TCodeType = typename TPcreTraits<TCharType>::TCodeType;
+ using TExtraType = typename TPcreTraits<TCharType>::TExtraType;
+ using TStringType = typename TPcreTraits<TCharType>::TStringType;
+ using TTraits = TPcreTraits<TCharType>;
+ static constexpr size_t DefaultWorkspaceSize = 16;
+
+ public:
+ //! Compiles regexp into internal representation for future use.
+ /*!
+ * \param pattern Regular expression to be compiled.
+ * \param optimize If |EOptimize::JIT|, perform additional
+ * analysis, which will take extra time, but could
+ * speed up matching. |None| to omit optimization.
+ * \param compileFlags See https://www.pcre.org/original/doc/html/pcre_compile2.html
+ **/
+ TPcre(const TCharType* pattern, EOptimize optimize = EOptimize::None, int compileFlags = 0) {
+ int errcode;
+ const char* errptr;
+ int erroffset;
+ Code.Reset(TTraits::Compile((TStringType) pattern, compileFlags, &errcode, &errptr, &erroffset, nullptr));
+ if (!Code) {
+ ythrow yexception() << "Failed to compile pattern <" << pattern
+ << ">, because of error at pos " << erroffset
+ << ", error code " << errcode << ": " << errptr;
+ }
+ if (optimize != EOptimize::None) {
+ errptr = nullptr;
+ int options;
+ if (optimize == EOptimize::Study) {
+ options = 0;
+ } else {
+ options = PCRE_STUDY_JIT_COMPILE;
+ }
+ Extra.Reset(TTraits::Study(Code.Get(), options, &errptr));
+ if (errptr) {
+ ythrow yexception() << "Failed to study pattern <" << pattern << ">: " << errptr;
+ }
+ }
+ }
+
+ //! Check if compiled pattern matches string.
+ /*!
+ * \param string String to search in.
+ * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
+ * \param workspaceSize Amount of space which will be allocated for
+ * back references. PCRE could allocate more
+ * heap space is provided workspaceSize won't
+ * fit all of them.
+ * \returns |true| if there is a match.
+ */
+ bool Matches(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const {
+ Y_ASSERT(workspaceSize >= 0);
+ size_t ovecsize = workspaceSize * 3;
+ NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
+ return ConvertReturnCode(TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize));
+ }
+
+ //! Find compiled pattern in string.
+ /*!
+ * \param string String to search in.
+ * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
+ * \param workspaceSize Amount of space which will be allocated for
+ * back references. PCRE could allocate more
+ * heap space is provided workspaceSize won't
+ * fit all of them.
+ * \returns Start and end offsets pair if there is a
+ * match. |Nothing| otherwise.
+ */
+ Y_NO_SANITIZE("memory") TMaybe<TPcreMatch> Find(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const {
+ Y_ASSERT(workspaceSize >= 0);
+ size_t ovecsize = workspaceSize * 3;
+ NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
+ for (size_t i = 0; i < ovecsize; ++i) {
+ ovector[i] = -4;
+ }
+ int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize);
+ if (ConvertReturnCode(rc)) {
+ return MakeMaybe<TPcreMatch>(ovector[0], ovector[1]);
+ } else {
+ return Nothing();
+ }
+ }
+
+ //! Find and return all capturing groups in string.
+ /*!
+ * \param string String to search in.
+ * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html
+ * \param initialWorkspaceSize Capturing groups vector initial size.
+ * Workspace will be grown and search will
+ * be repeated if there is not enough
+ * space.
+ * \returns List of capturing groups start and end
+ * offsets. First element will contain
+ * whole matched substring start and end
+ * offsets. For non-matched capturing
+ * groups, result will contain {-1, -1}
+ * pair.
+ * If pattern not found in string, result
+ * vector will be empty.
+ */
+ Y_NO_SANITIZE("memory") TPcreMatches Capture(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t initialWorkspaceSize = DefaultWorkspaceSize) const {
+ Y_ASSERT(initialWorkspaceSize > 0);
+ size_t ovecsize = (initialWorkspaceSize + 1) * 3;
+ while (true) {
+ NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
+ int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize);
+ if (rc > 0) {
+ TPcreMatches result(Reserve(rc >> 1));
+ for (int i = 0, pos = 0; i < rc; ++i) {
+ int start = ovector[pos++];
+ int end = ovector[pos++];
+ result.emplace_back(start, end);
+ }
+ return result;
+ } else if (rc == 0) {
+ ovecsize <<= 1;
+ } else if (rc == PCRE_ERROR_NOMATCH) {
+ return TPcreMatches{};
+ } else if (rc < 0) {
+ ythrow yexception() << "Error. RC = " << rc;
+ }
+ }
+ }
+
+ private:
+ TPcreCode<TCharType> Code;
+ TPcreExtra<TCharType> Extra;
+
+ private:
+ static inline bool ConvertReturnCode(int rc) {
+ if (rc >= 0) {
+ return true;
+ } else if (rc == PCRE_ERROR_NOMATCH) {
+ return false;
+ } else {
+ ythrow yexception() << "Error. RC = " << rc;
+ }
+ }
+ };
+}
+