#include "regexp.h"
#include <util/generic/string.h>
#include <util/string/ascii.h>
#include <util/system/defaults.h>
#include <cstdlib>
#include <util/generic/noncopyable.h>
class TGlobalImpl : TNonCopyable {
private:
const char* Str;
regmatch_t* Pmatch;
int Options;
int StrLen;
int StartOffset, NotEmptyOpts, MatchPos;
int MatchBuf[NMATCHES * 3];
pcre* PregComp;
enum StateCode {
TGI_EXIT,
TGI_CONTINUE,
TGI_WALKTHROUGH
};
private:
void CopyResults(int count) {
for (int i = 0; i < count; i++) {
Pmatch[MatchPos].rm_so = MatchBuf[2 * i];
Pmatch[MatchPos].rm_eo = MatchBuf[2 * i + 1];
MatchPos++;
if (MatchPos >= NMATCHES) {
ythrow yexception() << "TRegExBase::Exec(): Not enough space in internal buffer.";
}
}
}
int DoPcreExec(int opts) {
int rc = pcre_exec(
PregComp, /* the compiled pattern */
nullptr, /* no extra data - we didn't study the pattern */
Str, /* the subject string */
StrLen, /* the length of the subject */
StartOffset, /* start at offset 0 in the subject */
opts, /* default options */
MatchBuf, /* output vector for substring information */
NMATCHES); /* number of elements in the output vector */
if (rc == 0) {
ythrow yexception() << "TRegExBase::Exec(): Not enough space in internal buffer.";
}
return rc;
}
StateCode CheckEmptyCase() {
if (MatchBuf[0] == MatchBuf[1]) { // founded an empty string
if (MatchBuf[0] == StrLen) { // at the end
return TGI_EXIT;
}
NotEmptyOpts = PCRE_NOTEMPTY | PCRE_ANCHORED; // trying to find non empty string
}
return TGI_WALKTHROUGH;
}
StateCode CheckNoMatch(int rc) {
if (rc == PCRE_ERROR_NOMATCH) {
if (NotEmptyOpts == 0) {
return TGI_EXIT;
}
MatchBuf[1] = StartOffset + 1; // we have failed to find non-empty-string. trying to find again shifting "previous match offset"
return TGI_CONTINUE;
}
return TGI_WALKTHROUGH;
}
public:
TGlobalImpl(const char* st, regmatch_t& pma, int opts, pcre* pc_re)
: Str(st)
, Pmatch(&pma)
, Options(opts)
, StartOffset(0)
, NotEmptyOpts(0)
, MatchPos(0)
, PregComp(pc_re)
{
memset(Pmatch, -1, sizeof(regmatch_t) * NMATCHES);
StrLen = strlen(Str);
}
int ExecGlobal() {
StartOffset = 0;
int rc = DoPcreExec(Options);
if (rc < 0) {
return rc;
}
CopyResults(rc);
do {
NotEmptyOpts = 0;
StartOffset = MatchBuf[1];
if (CheckEmptyCase() == TGI_EXIT) {
return 0;
}
rc = DoPcreExec(NotEmptyOpts | Options);
switch (CheckNoMatch(rc)) {
case TGI_CONTINUE:
continue;
case TGI_EXIT:
return 0;
case TGI_WALKTHROUGH:
default:
break;
}
if (rc < 0) {
return rc;
}
CopyResults(rc);
} while (true);
return 0;
}
private:
};
// Reference-counted holder of one compiled regular expression (regex_t)
// together with the pattern text and the flags it was compiled with.
class TRegExBaseImpl: public TAtomicRefCount<TRegExBaseImpl> {
    friend class TRegExBase;

protected:
    int CompileOptions; // cflags given to regcomp()
    TString RegExpr;    // pattern text; empty when nothing has been compiled
    regex_t Preg;       // compiled pattern, released with regfree() in the destructor

public:
    // Creates an empty holder: no pattern text and a zero-filled regex_t.
    TRegExBaseImpl()
        : CompileOptions(0)
    {
        memset(&Preg, 0, sizeof(Preg));
    }

    // Compiles `re` with regcomp() using `cflags`.
    // Throws yexception carrying the regerror() text if compilation fails.
    TRegExBaseImpl(const TString& re, int cflags)
        : CompileOptions(cflags)
        , RegExpr(re)
    {
        int rc = regcomp(&Preg, re.data(), cflags);
        if (rc) {
            const size_t ERRBUF_SIZE = 100;
            char errbuf[ERRBUF_SIZE];
            regerror(rc, &Preg, errbuf, ERRBUF_SIZE);
            Error = "Error: regular expression " + re + " is wrong: " + errbuf;
            ythrow yexception() << "RegExp " << re << ": " << Error.data();
        }
    }

    // Matches `str` against the compiled pattern.
    //
    // Without REGEXP_GLOBAL in `eflags` this is a plain regexec() call and its
    // result is returned as is.
    // With REGEXP_GLOBAL every match in `str` is collected through TGlobalImpl;
    // in that mode `nmatches` is not consulted and `pmatch` is treated as an
    // array of NMATCHES entries. Only REG_NOTBOL / REG_NOTEOL are translated
    // into PCRE options.
    //
    // Throws yexception if no pattern is stored, if `str` is null, or if
    // `pmatch` is null in global mode.
    int Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches) const {
        if (!RegExpr) {
            ythrow yexception() << "Regular expression is not compiled";
        }
        if (!str) {
            ythrow yexception() << "Empty string is passed to TRegExBaseImpl::Exec";
        }

        if ((eflags & REGEXP_GLOBAL) == 0) {
            return regexec(&Preg, str, nmatches, pmatch, eflags);
        }

        // The global path writes through pmatch unconditionally, so a null
        // array (which is fine for regexec() with nmatches == 0) must be
        // rejected here instead of being dereferenced.
        if (!pmatch) {
            ythrow yexception() << "Null match array is passed to TRegExBaseImpl::Exec in global mode";
        }

        int options = 0;
        if ((eflags & REG_NOTBOL) != 0) {
            options |= PCRE_NOTBOL;
        }
        if ((eflags & REG_NOTEOL) != 0) {
            options |= PCRE_NOTEOL;
        }
        return TGlobalImpl(str, pmatch[0], options, static_cast<pcre*>(Preg.re_pcre)).ExecGlobal();
    }

    // True when the regex_t holds a compiled PCRE pattern.
    bool IsCompiled() const {
        return Preg.re_pcre != nullptr;
    }

    ~TRegExBaseImpl() {
        regfree(&Preg);
    }

private:
    TString Error; // text of the last compilation error
};
// Tells whether this object holds an implementation whose pattern is compiled.
bool TRegExBase::IsCompiled() const {
    if (!Impl) {
        return false;
    }
    return Impl->IsCompiled();
}
// Compiles `re` with `cflags`; a null `re` leaves the object without a pattern.
TRegExBase::TRegExBase(const char* re, int cflags) {
    if (re == nullptr) {
        return;
    }
    Compile(re, cflags);
}
// Compiles the pattern `re` with `cflags` right away.
TRegExBase::TRegExBase(const TString& re, int cflags)
{
    this->Compile(re, cflags);
}
// Defined out of line, after TRegExBaseImpl, so that the members are
// destroyed where the implementation class is a complete type.
TRegExBase::~TRegExBase() = default;
// Replaces the current implementation with a freshly compiled one for `re`.
// The TRegExBaseImpl constructor throws yexception when `re` does not
// compile, in which case Impl is not reassigned.
void TRegExBase::Compile(const TString& re, int cflags) {
Impl = new TRegExBaseImpl(re, cflags);
}
// Forwards the match request to the implementation and returns its result.
// Throws yexception when no implementation has been created yet.
int TRegExBase::Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches) const {
    if (Impl) {
        return Impl->Exec(str, pmatch, eflags, nmatches);
    }
    ythrow yexception() << "!Regular expression is not compiled";
}
// Returns the flags the stored pattern was compiled with.
// Throws yexception when no implementation has been created yet.
int TRegExBase::GetCompileOptions() const {
    if (Impl) {
        return Impl->CompileOptions;
    }
    ythrow yexception() << "!Regular expression is not compiled";
}
// Returns the text of the stored pattern.
// Throws yexception when no implementation has been created yet.
TString TRegExBase::GetRegExpr() const {
    if (Impl) {
        return Impl->RegExpr;
    }
    ythrow yexception() << "!Regular expression is not compiled";
}
// Builds a matcher from a C-string pattern; all work is done by TRegExBase.
TRegExMatch::TRegExMatch(const char* re, int cflags)
    : TRegExBase(re, cflags) {}
// Builds a matcher from a TString pattern; all work is done by TRegExBase.
TRegExMatch::TRegExMatch(const TString& re, int cflags)
    : TRegExBase(re, cflags) {}
// Returns true when Exec() reports success (0) for `str`.
// No match offsets are requested: the match array is null, and both the
// flags and the match count are zero.
bool TRegExMatch::Match(const char* str) const {
    const int rc = Exec(str, nullptr, 0, 0);
    return rc == 0;
}
// Compiles `re` through TRegExBase and starts with no replacement template:
// Replacement is null, the Brfs table is zero-filled and BrfsCount is 0.
TRegExSubst::TRegExSubst(const char* re, int cflags)
    : TRegExBase(re, cflags)
    , Replacement(nullptr)
{
    memset(Brfs, 0, sizeof(TBackReferences) * NMATCHES);
    // Replace() branches on BrfsCount, so give it a defined value even when
    // ParseReplacement() is never called. Assigned in the body so that the
    // member declaration order does not matter.
    BrfsCount = 0;
}
// Builds the replacement text for `str` from the template parsed by
// ParseReplacement().
//
// - With no parsed segments (BrfsCount == 0) the raw Replacement string is
//   returned without running the regular expression at all.
// - Otherwise Exec() is run on `str`; on failure an empty string is returned,
//   on success every segment's literal text is emitted, followed by the text
//   of the group it refers to (if any).
//
// Note that only the replacement text is returned, not `str` with the
// replacement substituted into it.
TString TRegExSubst::Replace(const char* str, int eflags) {
    TString s;

    if (BrfsCount == 0) {
        if (Replacement) {
            s = Replacement;
        }
        return s;
    }

    if (Exec(str, PMatch, eflags) != 0) {
        return s;
    }

    // Only the BrfsCount parsed segments are visited. The terminator entry
    // that follows them is always empty, and reading it would step past the
    // table when all NMATCHES slots are in use.
    for (int i = 0; i < BrfsCount; i++) {
        const TBackReferences& segment = Brfs[i];
        s += TString(Replacement, segment.Beg, segment.End - segment.Beg);

        if (segment.Refer >= 0 && segment.Refer < NMATCHES) {
            const regmatch_t& group = PMatch[segment.Refer];
            // Groups that did not take part in the match carry -1 offsets;
            // they contribute nothing and must not be used as string offsets.
            if (group.rm_so >= 0 && group.rm_eo >= group.rm_so) {
                s += TString(str, group.rm_so, int(group.rm_eo - group.rm_so));
            }
        }
    }
    return s;
}
//***
// For the replacement string aaa.$1.$$$$.$2.bbb.$$$ccc Brfs will contain:
// {beg = 0, end = 4, Refer = 1} => "aaa." + $1_match
// {beg = 6, end = 8, Refer = -1} => ".$"
// {beg = 9, end = 10, Refer = -1} => "$"
// {beg = 11, end = 12, Refer = 2} => "." + $2_match
// {beg = 14, end = 20, Refer = -1} => ".bbb.$"
// {beg = 21, end = 22, Refer = -1} => "$"
// {beg = 22, end = 25, Refer = -1} => "ccc"
// {beg = 0, end = 0, Refer = -1} => terminator entry (not counted in BrfsCount)
//***
// Splits the replacement template `repl` into segments stored in Brfs.
//
// Each segment is a run of literal text [Beg, End) inside `repl`, optionally
// followed by a group reference: "$<digits>" sets Refer to that number, while
// "$$" and a lone "$" stand for a literal '$' (Refer stays -1). An empty
// terminator entry is written after the last segment.
//
// The pointer `repl` itself is stored and read again by Replace(), so the
// string must stay alive for as long as Replace() may be called.
//
// At most NMATCHES - 1 segments are recorded so that the terminator always
// fits into the table; the rest of a longer template is ignored.
//
// Returns the number of segments (also stored in BrfsCount); 0 for a null or
// empty template.
int TRegExSubst::ParseReplacement(const char* repl) {
    Replacement = repl;
    // Drop any segments left over from a previous template before bailing out.
    BrfsCount = 0;
    if (!Replacement || *Replacement == 0) {
        return 0;
    }

    const char* pos = Replacement;
    int i = 0;
    while (pos && *pos && i < NMATCHES - 1) {
        const char* dollar = strchr(pos, '$'); // end of the literal run, or null
        const char* next = dollar;             // where the following segment starts
        Brfs[i].Refer = -1;

        if (dollar) {
            next = dollar + 1;
            while (IsAsciiDigit(*next)) {
                next++;
            }
            if (next > dollar + 1) {
                // "$<digits>": parse the group number, saturating once it
                // leaves the valid range. Replace() ignores references
                // >= NMATCHES, and saturation keeps an oversized number from
                // wrapping around into a valid-looking index.
                int refer = 0;
                for (const char* digit = dollar + 1; digit < next; ++digit) {
                    if (refer < NMATCHES) {
                        refer = refer * 10 + (*digit - '0');
                    }
                }
                Brfs[i].Refer = refer;
            } else {
                // "$$" or a lone '$': keep one '$' as part of the literal run
                // and skip the second one if present.
                dollar++;
                if (*next == '$') {
                    next++;
                }
            }
        }

        Brfs[i].Beg = static_cast<int>(pos - Replacement);
        Brfs[i].End = dollar ? static_cast<int>(dollar - Replacement)
                             : static_cast<int>(strlen(Replacement));
        pos = next;
        i++;
    }

    // Terminator entry; i <= NMATCHES - 1 here, so it is always in bounds.
    Brfs[i].Beg = Brfs[i].End = 0;
    Brfs[i].Refer = -1;
    BrfsCount = i;
    return BrfsCount;
}