#pragma once
#include "traits.h"
#include <library/cpp/containers/stack_array/stack_array.h>
#include <util/generic/maybe.h>
#include <util/generic/strbuf.h>
#include <util/generic/vector.h>
#include <util/generic/yexception.h>
namespace NPcre {
//! Start and end offsets of a single match group.
/*!
 * |first| is the start offset and |second| is the end offset, both taken
 * verbatim from the PCRE output vector.
 */
using TPcreMatch = std::pair<int, int>;
//! Full match result containing all capturing groups.
/*!
 * The element at index zero holds the start and end offsets of the whole
 * matched string.
 * All other elements hold the positions of the capturing groups.
 * Capturing groups that did not participate in the match have {-1, -1}
 * offsets.
 */
using TPcreMatches = TVector<TPcreMatch>;
//! Strategy used to optimize a compiled pattern.
enum class EOptimize {
    //! Skip optimization entirely.
    /*!
     * A good fit for one-shot patterns, where compile time matters more
     * than match speed.
     */
    None = 0,
    //! Basic optimization via |pcre_study|.
    /*!
     * May speed matching up by as much as 4x at the price of a longer
     * construction time. May also give no speed-up at all.
     */
    Study = 1,
    //! PCRE JIT optimization.
    /*!
     * May speed matching up by as much as 10x at the price of a
     * significantly longer compile time. Note that for very complex
     * patterns |pcre_exec| could return |PCRE_ERROR_JIT_STACKLIMIT|. See
     * https://www.pcre.org/original/doc/html/pcrejit.html for details.
     */
    JIT = 2,
};
//! PCRE code container. Controls its life time and provides handy wrapper.
/*!
 * Holds the compiled pattern (|Code|) and, when optimization was requested,
 * the study data (|Extra|). All matching methods are |const| and pass both
 * to |TTraits::Exec|.
 */
template <class TCharType>
class TPcre {
private:
    using TCodeType = typename TPcreTraits<TCharType>::TCodeType;
    using TExtraType = typename TPcreTraits<TCharType>::TExtraType;
    using TStringType = typename TPcreTraits<TCharType>::TStringType;
    using TTraits = TPcreTraits<TCharType>;

    //! Default number of capturing groups the output vector is sized for.
    static constexpr size_t DefaultWorkspaceSize = 16;

    //! Number of |int| slots in an output vector sized for |groups| capturing groups.
    /*!
     * One extra pair is always reserved for the whole match, so the vector
     * is never empty and |ovector[0]|, |ovector[1]| are always addressable.
     * The factor of three follows the |pcre_exec| convention: two thirds of
     * the vector hold offset pairs, the last third is scratch space.
     */
    static constexpr size_t OvectorSize(size_t groups) {
        return (groups + 1) * 3;
    }

public:
    //! Compiles regexp into internal representation for future use.
    /*!
     * \param pattern       Regular expression to be compiled.
     * \param optimize      |EOptimize::None| to omit optimization.
     *                      |EOptimize::Study| studies the pattern with no
     *                      extra options; any other value requests
     *                      |PCRE_STUDY_JIT_COMPILE|. Studying takes extra
     *                      time, but could speed up matching.
     * \param compileFlags  See https://www.pcre.org/original/doc/html/pcre_compile2.html
     * \throws yexception   If the pattern fails to compile or if studying
     *                      it reports an error.
     **/
    TPcre(const TCharType* pattern, EOptimize optimize = EOptimize::None, int compileFlags = 0) {
        int errcode = 0;
        const char* errptr = nullptr;
        int erroffset = 0;
        Code.Reset(TTraits::Compile((TStringType) pattern, compileFlags, &errcode, &errptr, &erroffset, nullptr));
        if (!Code) {
            ythrow yexception() << "Failed to compile pattern <" << pattern
                                << ">, because of error at pos " << erroffset
                                << ", error code " << errcode << ": " << errptr;
        }
        if (optimize != EOptimize::None) {
            // Study failure is detected through |errptr|, so reset it first.
            errptr = nullptr;
            const int options = (optimize == EOptimize::Study) ? 0 : PCRE_STUDY_JIT_COMPILE;
            Extra.Reset(TTraits::Study(Code.Get(), options, &errptr));
            if (errptr) {
                ythrow yexception() << "Failed to study pattern <" << pattern << ">: " << errptr;
            }
        }
    }

    //! Check if compiled pattern matches string.
    /*!
     * \param string        String to search in.
     * \param executeFlags  See https://www.pcre.org/original/doc/html/pcre_exec.html
     * \param workspaceSize Amount of space which will be allocated for
     *                      back references. PCRE could allocate more
     *                      heap space if the provided workspaceSize won't
     *                      fit all of them.
     * \returns             |true| if there is a match.
     * \throws yexception   If matching fails with an error other than
     *                      |PCRE_ERROR_NOMATCH|.
     */
    bool Matches(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const {
        const size_t ovecsize = OvectorSize(workspaceSize);
        NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
        return ConvertReturnCode(TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.data(), string.size(), 0, executeFlags, ovector.data(), ovecsize));
    }

    //! Find compiled pattern in string.
    /*!
     * \param string        String to search in.
     * \param executeFlags  See https://www.pcre.org/original/doc/html/pcre_exec.html
     * \param workspaceSize Amount of space which will be allocated for
     *                      back references. PCRE could allocate more
     *                      heap space if the provided workspaceSize won't
     *                      fit all of them.
     * \returns             Start and end offsets pair if there is a
     *                      match. |Nothing| otherwise.
     * \throws yexception   If matching fails with an error other than
     *                      |PCRE_ERROR_NOMATCH|.
     */
    Y_NO_SANITIZE("memory") TMaybe<TPcreMatch> Find(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const {
        // The vector always has room for the whole-match pair, so reading
        // |ovector[0]| and |ovector[1]| below is valid even for
        // |workspaceSize == 0|.
        const size_t ovecsize = OvectorSize(workspaceSize);
        NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
        // Pre-fill every slot so nothing is read uninitialized.
        // NOTE(review): the sentinel value -4 looks arbitrary -- confirm.
        for (size_t i = 0; i < ovecsize; ++i) {
            ovector[i] = -4;
        }
        const int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.data(), string.size(), 0, executeFlags, ovector.data(), ovecsize);
        if (!ConvertReturnCode(rc)) {
            return Nothing();
        }
        return MakeMaybe<TPcreMatch>(ovector[0], ovector[1]);
    }

    //! Find and return all capturing groups in string.
    /*!
     * \param string               String to search in.
     * \param executeFlags         See https://www.pcre.org/original/doc/html/pcre_exec.html
     * \param initialWorkspaceSize Capturing groups vector initial size.
     *                             Workspace will be grown and search will
     *                             be repeated if there is not enough
     *                             space.
     * \returns                    List of capturing groups start and end
     *                             offsets. First element will contain
     *                             whole matched substring start and end
     *                             offsets. For non-matched capturing
     *                             groups, result will contain {-1, -1}
     *                             pair. The list holds as many pairs as
     *                             the positive return code of the match
     *                             call reports.
     *                             If pattern not found in string, result
     *                             vector will be empty.
     * \throws yexception          If matching fails with an error other
     *                             than |PCRE_ERROR_NOMATCH|.
     */
    Y_NO_SANITIZE("memory") TPcreMatches Capture(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t initialWorkspaceSize = DefaultWorkspaceSize) const {
        Y_ASSERT(initialWorkspaceSize > 0);
        size_t ovecsize = OvectorSize(initialWorkspaceSize);
        while (true) {
            NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize));
            const int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.data(), string.size(), 0, executeFlags, ovector.data(), ovecsize);
            if (rc > 0) {
                // |rc| is the number of offset pairs written, so reserve
                // exactly that many elements.
                TPcreMatches result(Reserve(rc));
                for (int i = 0, pos = 0; i < rc; ++i) {
                    const int start = ovector[pos++];
                    const int end = ovector[pos++];
                    result.emplace_back(start, end);
                }
                return result;
            }
            if (rc == PCRE_ERROR_NOMATCH) {
                return TPcreMatches{};
            }
            if (rc < 0) {
                ythrow yexception() << "Error. RC = " << rc;
            }
            // rc == 0: the output vector was too small to hold every
            // group; double it and run the search again.
            ovecsize <<= 1;
        }
    }

private:
    TPcreCode<TCharType> Code;
    TPcreExtra<TCharType> Extra;

private:
    //! Maps a match call return code to a "matched" flag.
    /*!
     * \returns           |true| for any non-negative code, |false| for
     *                    |PCRE_ERROR_NOMATCH|.
     * \throws yexception For every other negative code.
     */
    static inline bool ConvertReturnCode(int rc) {
        if (rc >= 0) {
            return true;
        }
        if (rc == PCRE_ERROR_NOMATCH) {
            return false;
        }
        ythrow yexception() << "Error. RC = " << rc;
    }
};
}