aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/regex/pcre/regexp.cpp
diff options
context:
space:
mode:
author Devtools Arcadia <arcadia-devtools@yandex-team.ru> 2022-02-07 18:08:42 +0300
committer Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> 2022-02-07 18:08:42 +0300
commit 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/pcre/regexp.cpp
download ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex/pcre/regexp.cpp')
-rw-r--r-- library/cpp/regex/pcre/regexp.cpp 317
1 files changed, 317 insertions, 0 deletions
diff --git a/library/cpp/regex/pcre/regexp.cpp b/library/cpp/regex/pcre/regexp.cpp
new file mode 100644
index 0000000000..575c09cee4
--- /dev/null
+++ b/library/cpp/regex/pcre/regexp.cpp
@@ -0,0 +1,317 @@
+#include "regexp.h"
+
+#include <util/generic/string.h>
+#include <util/string/ascii.h>
+#include <util/system/defaults.h>
+
+#include <cstdlib>
+#include <util/generic/noncopyable.h>
+
+// Runs an already compiled PCRE pattern repeatedly over one subject string
+// ("global" matching) and appends the offsets of every match, together with the
+// offsets of its capture groups, to a caller-supplied regmatch_t array.
+//
+// The constructor overwrites NMATCHES entries of that array with 0xFF bytes, so the
+// caller has to provide at least NMATCHES entries. Entries that are never filled
+// keep that pattern.
+class TGlobalImpl : TNonCopyable {
+private:
+    const char* Str;      // subject string, NUL-terminated (its length is taken with strlen)
+    regmatch_t* Pmatch;   // output array owned by the caller
+    int Options;          // PCRE options applied to every pcre_exec call
+    int StrLen;           // length of Str in bytes
+    int StartOffset;      // offset in Str at which the next pcre_exec call starts
+    int NotEmptyOpts;     // extra options for the retry that follows an empty match
+    int MatchPos;         // index of the next free entry in Pmatch
+    int MatchBuf[NMATCHES * 3]; // pcre_exec output vector; holds offset pairs (start, end)
+    pcre* PregComp;       // compiled pattern, not owned
+
+    enum StateCode {
+        TGI_EXIT,
+        TGI_CONTINUE,
+        TGI_WALKTHROUGH
+    };
+
+private:
+    // Appends the first `count` offset pairs of MatchBuf to Pmatch.
+    // Throws yexception as soon as MatchPos reaches NMATCHES, so at least one
+    // trailing entry of the output array is always left untouched.
+    void CopyResults(int count) {
+        for (int i = 0; i < count; i++) {
+            Pmatch[MatchPos].rm_so = MatchBuf[2 * i];
+            Pmatch[MatchPos].rm_eo = MatchBuf[2 * i + 1];
+            MatchPos++;
+            if (MatchPos >= NMATCHES) {
+                ythrow yexception() << "TRegExBase::Exec(): Not enough space in internal buffer.";
+            }
+        }
+    }
+
+    // Executes the pattern once, starting at StartOffset, with the given PCRE options.
+    // Returns the pcre_exec result code unchanged, except that 0 (which this code
+    // treats as "output vector too small") is turned into a yexception.
+    int DoPcreExec(int opts) {
+        // Pass the real size of MatchBuf. The original passed NMATCHES although the
+        // buffer holds NMATCHES * 3 ints, which left two thirds of it unused.
+        const int matchBufSize = static_cast<int>(sizeof(MatchBuf) / sizeof(MatchBuf[0]));
+
+        int rc = pcre_exec(
+            PregComp,      /* the compiled pattern */
+            nullptr,       /* no extra data - we didn't study the pattern */
+            Str,           /* the subject string */
+            StrLen,        /* the length of the subject */
+            StartOffset,   /* offset in the subject at which to start matching */
+            opts,          /* options for this attempt */
+            MatchBuf,      /* output vector for substring information */
+            matchBufSize); /* number of elements in the output vector */
+
+        if (rc == 0) {
+            ythrow yexception() << "TRegExBase::Exec(): Not enough space in internal buffer.";
+        }
+
+        return rc;
+    }
+
+    // Inspects the previous match stored in MatchBuf[0..1].
+    // An empty match at the very end of the subject finishes the search. An empty
+    // match elsewhere requests an anchored, non-empty retry at the same position.
+    StateCode CheckEmptyCase() {
+        if (MatchBuf[0] == MatchBuf[1]) { // the previous match was an empty string
+            if (MatchBuf[0] == StrLen) {  // ... located at the end of the subject
+                return TGI_EXIT;
+            }
+            NotEmptyOpts = PCRE_NOTEMPTY | PCRE_ANCHORED; // try to find a non-empty match here
+        }
+        return TGI_WALKTHROUGH;
+    }
+
+    // Decides what to do when the last pcre_exec call reported "no match".
+    // A failed ordinary attempt ends the search. A failed non-empty retry moves the
+    // "end of previous match" one byte forward so the next iteration starts there.
+    StateCode CheckNoMatch(int rc) {
+        if (rc == PCRE_ERROR_NOMATCH) {
+            if (NotEmptyOpts == 0) {
+                return TGI_EXIT;
+            }
+
+            MatchBuf[1] = StartOffset + 1;
+            return TGI_CONTINUE;
+        }
+        return TGI_WALKTHROUGH;
+    }
+
+public:
+    // st    - subject string, must not be null (strlen is applied to it)
+    // pma   - first element of the output array (at least NMATCHES entries)
+    // opts  - PCRE options passed to every match attempt
+    // pc_re - compiled PCRE pattern
+    TGlobalImpl(const char* st, regmatch_t& pma, int opts, pcre* pc_re)
+        : Str(st)
+        , Pmatch(&pma)
+        , Options(opts)
+        , StartOffset(0)
+        , NotEmptyOpts(0)
+        , MatchPos(0)
+        , PregComp(pc_re)
+    {
+        memset(Pmatch, -1, sizeof(regmatch_t) * NMATCHES);
+        StrLen = strlen(Str);
+    }
+
+    // Collects all matches of the pattern in the subject.
+    // Returns 0 once the search is exhausted after at least one match, or the
+    // negative pcre_exec code of the attempt that failed (this includes
+    // PCRE_ERROR_NOMATCH when the very first attempt finds nothing).
+    int ExecGlobal() {
+        StartOffset = 0;
+        int rc = DoPcreExec(Options);
+
+        if (rc < 0) {
+            return rc;
+        }
+        CopyResults(rc);
+
+        for (;;) {
+            NotEmptyOpts = 0;
+            StartOffset = MatchBuf[1]; // continue right after the previous match
+
+            if (CheckEmptyCase() == TGI_EXIT) {
+                return 0;
+            }
+
+            rc = DoPcreExec(NotEmptyOpts | Options);
+
+            const StateCode state = CheckNoMatch(rc);
+            if (state == TGI_CONTINUE) {
+                continue;
+            }
+            if (state == TGI_EXIT) {
+                return 0;
+            }
+
+            if (rc < 0) {
+                return rc;
+            }
+
+            CopyResults(rc);
+        }
+    }
+};
+
+// Reference-counted holder of one regular expression compiled with regcomp().
+// Keeps the source text and the compile flags next to the compiled form and
+// releases the compiled form with regfree() on destruction.
+class TRegExBaseImpl: public TAtomicRefCount<TRegExBaseImpl> {
+    friend class TRegExBase;
+
+protected:
+    int CompileOptions; // flags that were passed to regcomp()
+    TString RegExpr;    // source text of the expression
+    regex_t Preg;       // compiled expression
+
+public:
+    // Creates an object without a compiled expression (Preg is zero-filled).
+    TRegExBaseImpl()
+        : CompileOptions(0)
+    {
+        memset(&Preg, 0, sizeof(Preg));
+    }
+
+    // Compiles `re` with the given regcomp() flags.
+    // Throws yexception carrying the regerror() text when compilation fails.
+    TRegExBaseImpl(const TString& re, int cflags)
+        : CompileOptions(cflags)
+        , RegExpr(re)
+    {
+        int rc = regcomp(&Preg, re.data(), cflags);
+        if (rc) {
+            const size_t ERRBUF_SIZE = 100;
+            char errbuf[ERRBUF_SIZE];
+            regerror(rc, &Preg, errbuf, ERRBUF_SIZE);
+            Error = "Error: regular expression " + re + " is wrong: " + errbuf;
+            ythrow yexception() << "RegExp " << re << ": " << Error.data();
+        }
+    }
+
+    // Matches `str` against the compiled expression.
+    //
+    // Without REGEXP_GLOBAL in `eflags` this forwards to regexec() and returns its
+    // result. With REGEXP_GLOBAL every match in `str` is collected through
+    // TGlobalImpl. In that mode `nmatches` is not consulted and `pmatch` has to
+    // provide NMATCHES entries, because TGlobalImpl initialises that many.
+    //
+    // Throws yexception when the expression text is empty, when `str` is null, or
+    // when `pmatch` is null in global mode.
+    int Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches) const {
+        if (!RegExpr) {
+            ythrow yexception() << "Regular expression is not compiled";
+        }
+        if (!str) {
+            ythrow yexception() << "Empty string is passed to TRegExBaseImpl::Exec";
+        }
+        if ((eflags & REGEXP_GLOBAL) == 0) {
+            return regexec(&Preg, str, nmatches, pmatch, eflags);
+        }
+
+        // Global mode writes into pmatch unconditionally. Reject a null buffer here
+        // instead of dereferencing it.
+        if (!pmatch) {
+            ythrow yexception() << "Null match buffer is passed to TRegExBaseImpl::Exec in global mode";
+        }
+
+        // Translate the POSIX execution flags into their PCRE counterparts.
+        int options = 0;
+        if ((eflags & REG_NOTBOL) != 0) {
+            options |= PCRE_NOTBOL;
+        }
+        if ((eflags & REG_NOTEOL) != 0) {
+            options |= PCRE_NOTEOL;
+        }
+
+        return TGlobalImpl(str, pmatch[0], options, (pcre*)Preg.re_pcre).ExecGlobal();
+    }
+
+    // True when Preg holds a compiled pattern.
+    bool IsCompiled() const {
+        return Preg.re_pcre != nullptr;
+    }
+
+    ~TRegExBaseImpl() {
+        regfree(&Preg);
+    }
+
+private:
+    TString Error; // text of the last compilation error
+};
+
+// Reports whether an implementation object exists and holds a compiled pattern.
+bool TRegExBase::IsCompiled() const {
+    if (!Impl) {
+        return false;
+    }
+    return Impl->IsCompiled();
+}
+
+// Compiles `re` with `cflags`. A null `re` leaves the object without a compiled expression.
+TRegExBase::TRegExBase(const char* re, int cflags) {
+    if (re == nullptr) {
+        return;
+    }
+    Compile(re, cflags);
+}
+
+// Compiles `re` with `cflags`. Compilation errors propagate out of Compile().
+TRegExBase::TRegExBase(const TString& re, int cflags) {
+    this->Compile(re, cflags);
+}
+
+// Nothing to release explicitly: the members clean up after themselves.
+TRegExBase::~TRegExBase() = default;
+
+// Replaces the current implementation object with one freshly compiled from `re`.
+// If compilation throws, the previous implementation object stays in place.
+void TRegExBase::Compile(const TString& re, int cflags) {
+    TRegExBaseImpl* const compiled = new TRegExBaseImpl(re, cflags);
+    Impl = compiled;
+}
+
+// Forwards to the implementation's Exec(). Throws yexception when nothing has been compiled.
+int TRegExBase::Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches) const {
+    if (Impl) {
+        return Impl->Exec(str, pmatch, eflags, nmatches);
+    }
+    ythrow yexception() << "!Regular expression is not compiled";
+}
+
+// Returns the flags the expression was compiled with. Throws yexception when nothing has been compiled.
+int TRegExBase::GetCompileOptions() const {
+    if (Impl) {
+        return Impl->CompileOptions;
+    }
+    ythrow yexception() << "!Regular expression is not compiled";
+}
+
+// Returns the source text of the expression. Throws yexception when nothing has been compiled.
+TString TRegExBase::GetRegExpr() const {
+    if (Impl) {
+        return Impl->RegExpr;
+    }
+    ythrow yexception() << "!Regular expression is not compiled";
+}
+
+// Delegates to TRegExBase(const char*, int).
+TRegExMatch::TRegExMatch(const char* re, int cflags)
+    : TRegExBase(re, cflags) {
+}
+
+// Delegates to TRegExBase(const TString&, int).
+TRegExMatch::TRegExMatch(const TString& re, int cflags)
+    : TRegExBase(re, cflags) {
+}
+
+// Returns true when Exec() reports a match for `str`. No match offsets are requested.
+bool TRegExMatch::Match(const char* str) const {
+    const int rc = Exec(str, nullptr, 0, 0);
+    return rc == 0;
+}
+
+// Compiles `re` (when it is not null) and starts with no replacement template.
+TRegExSubst::TRegExSubst(const char* re, int cflags)
+    : TRegExBase(re, cflags)
+    , Replacement(nullptr)
+{
+    memset(Brfs, 0, sizeof(TBackReferences) * NMATCHES);
+    // Replace() branches on BrfsCount, so it must have a defined value even when
+    // ParseReplacement() has not been called yet. Assigned in the body to stay
+    // independent of the member declaration order.
+    // NOTE(review): the class declaration is not visible here; if BrfsCount already
+    // has a default initializer this assignment is redundant but harmless.
+    BrfsCount = 0;
+}
+
+// Expands the parsed replacement template against the first match of the
+// expression in `str`.
+//
+// - No segments parsed (BrfsCount == 0): returns the replacement text as is.
+// - Segments parsed, expression matches: returns each segment's literal text
+//   followed by the text of the capture group it refers to, if any.
+// - Segments parsed, no match: returns an empty string.
+//
+// `eflags` is forwarded to Exec(), which throws yexception on a null `str` or when
+// no expression has been compiled.
+TString TRegExSubst::Replace(const char* str, int eflags) {
+    TString s;
+    if (!BrfsCount) {
+        s = Replacement;
+        return s;
+    }
+    if (Exec(str, PMatch, eflags) != 0) {
+        return s;
+    }
+
+    for (int i = 0; i < BrfsCount; i++) {
+        const TBackReferences& segment = Brfs[i];
+        s += TString(Replacement, segment.Beg, segment.End - segment.Beg);
+        if (segment.Refer >= 0 && segment.Refer < NMATCHES) {
+            const regmatch_t& group = PMatch[segment.Refer];
+            // A group that took no part in the match carries negative offsets.
+            // Using them would do pointer arithmetic before the start of `str`, so
+            // such a group contributes nothing.
+            if (group.rm_so >= 0 && group.rm_eo >= group.rm_so) {
+                s += TString(str, group.rm_so, int(group.rm_eo - group.rm_so));
+            }
+        }
+    }
+    // The original also appended Brfs[BrfsCount]. That entry is the empty terminator
+    // ({0, 0}), so it added nothing, and reading it is out of bounds when the
+    // segment table is full.
+    return s;
+}
+
+//***
+// Splits a replacement template into segments stored in Brfs. Each segment is a
+// piece of literal text [Beg, End) of the template, optionally followed by a
+// reference to capture group number Refer (-1 means "no reference").
+// "$<digits>" is a group reference; "$$" produces a literal "$".
+//
+// For the replacement string aaa.$1.$$$$.$2.bbb.$$$ccc Brfs becomes:
+// {beg = 0, end = 4, Refer = 1} => "aaa." + $1_match
+// {beg = 6, end = 8, Refer = -1} => ".$"
+// {beg = 9, end = 10, Refer = -1} => "$"
+// {beg = 11, end = 12, Refer = 2} => "." + $2_match
+// {beg = 14, end = 20, Refer = -1} => ".bbb.$"
+// {beg = 21, end = 22, Refer = -1} => "$"
+// {beg = 22, end = 25, Refer = -1} => "ccc"
+// {beg = 0, end = 0, Refer = -1}  <- terminator
+//
+// The pointer is stored, not copied: `repl` has to outlive later Replace() calls.
+// Returns the number of segments. A null or empty template yields 0.
+// At most NMATCHES - 1 segments are kept; the rest of a longer template is dropped.
+//***
+int TRegExSubst::ParseReplacement(const char* repl) {
+    Replacement = repl;
+    // Reset first, so that an empty template cannot leave the segments of a
+    // previously parsed template active.
+    BrfsCount = 0;
+    if (!Replacement || *Replacement == 0)
+        return 0;
+
+    const char* pos = Replacement;
+    int i = 0;
+    // One slot is reserved for the terminator written after the loop. The original
+    // bound (i < NMATCHES) let that write land one past the end of Brfs.
+    while (pos && *pos && i < NMATCHES - 1) {
+        const char* pos1 = strchr(pos, '$'); // end of the literal part of this segment
+        const char* pos2 = pos1;             // where the next segment starts
+        Brfs[i].Refer = -1;
+        if (pos1) {
+            pos2 = pos1 + 1;
+            if (IsAsciiDigit(*pos2)) {
+                // "$<digits>": parse the group number. Once the value reaches
+                // NMATCHES it stops growing: such a reference is out of range
+                // anyway, and this keeps the arithmetic free of overflow.
+                int refer = 0;
+                while (IsAsciiDigit(*pos2)) {
+                    if (refer < NMATCHES) {
+                        refer = refer * 10 + (*pos2 - '0');
+                    }
+                    pos2++;
+                }
+                Brfs[i].Refer = refer;
+            } else {
+                // A '$' without digits is literal: include it in this segment and
+                // swallow the second '$' of a "$$" pair.
+                pos1++;
+                if (*pos2 == '$')
+                    pos2++;
+            }
+        }
+        Brfs[i].Beg = int(pos - Replacement);
+        Brfs[i].End = (pos1 == nullptr ? (int)strlen(Replacement) : int(pos1 - Replacement));
+        pos = pos2;
+        i++;
+    }
+    // Empty terminator entry.
+    Brfs[i].Beg = Brfs[i].End = 0;
+    Brfs[i].Refer = -1;
+    BrfsCount = i;
+    return BrfsCount;
+}