diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/regex/pcre | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/regex/pcre')
-rw-r--r-- | library/cpp/regex/pcre/README.md | 59 | ||||
-rw-r--r-- | library/cpp/regex/pcre/benchmark/main.cpp | 80 | ||||
-rw-r--r-- | library/cpp/regex/pcre/benchmark/ya.make | 14 | ||||
-rw-r--r-- | library/cpp/regex/pcre/pcre.cpp | 1 | ||||
-rw-r--r-- | library/cpp/regex/pcre/pcre.h | 191 | ||||
-rw-r--r-- | library/cpp/regex/pcre/pcre_ut.cpp | 89 | ||||
-rw-r--r-- | library/cpp/regex/pcre/pcre_ut_base.h | 38 | ||||
-rw-r--r-- | library/cpp/regex/pcre/regexp.cpp | 317 | ||||
-rw-r--r-- | library/cpp/regex/pcre/regexp.h | 63 | ||||
-rw-r--r-- | library/cpp/regex/pcre/regexp_ut.cpp | 103 | ||||
-rw-r--r-- | library/cpp/regex/pcre/traits.h | 99 | ||||
-rw-r--r-- | library/cpp/regex/pcre/ut/ya.make | 10 | ||||
-rw-r--r-- | library/cpp/regex/pcre/ya.make | 23 |
13 files changed, 1087 insertions, 0 deletions
diff --git a/library/cpp/regex/pcre/README.md b/library/cpp/regex/pcre/README.md new file mode 100644 index 0000000000..b5b09a3715 --- /dev/null +++ b/library/cpp/regex/pcre/README.md @@ -0,0 +1,59 @@ +# About +This is a PCRE library wrapper which provides unified interface for UTF-8, UTF-16 and UTF-32 strings matching and optimization control. + +# Rationale +Many Arcadia related libraries (telfinder, lemmer etc.) provides only UTF-16 interfaces, because this is way faster for cyrillic texts. Any algorithm that is working with such libraries and regular expressions must use `WideToUTF8` and `UTF8ToWide` at the borderline between regular expression and UTF-18 interface. This leads us to great performance penalty. +This library allows us to erase these charset conversions. + +# Interface + +Before starting with interface details, let's consider simplest library usage example: +`UNIT_ASSERT(NPcre::TPcre<wchar16>(u"ba+d").Matches(TWtringBuf(u"baaad")));` + +Here we see regular expression construction for UTF-16 charset: + +`NPcre::TPcre<wchar16>(u"ba+d")` + +and matching of the subject string `baaad` against this pattern: + +`.Matches(TWtringBuf(u"baaad"))`; + +Let's consider both of them in details. + +## Construction +`NPcre::TPcre` class accepts single template parameter: `TCharType`. Currently supported char types are `char`, `wchar16` and `wchar32`. Additional char types traits can be defined in `traits.h` + +Constructor accepts three arguments. Two of them are optional: +1. Zero-terminated string on characters with pattern +2. Optimization type. The default value is `NPcre::EOptimize::None` which means no pattern optimization. Another possible value is `NPcre::EOptimize::Study` which will take some time at construction stage but could give up to 4x speed boost. And the last but not the least is `NPcre::EOptimize::JIT` which performs JIT optimization which could take significant time but could give up to 10x speed boost. +3. Regular expressions compile flags. We don't want to reimplement every constant from PCRE library, so they are passed as they are. Full list of compile flags can be found [here](https://www.pcre.org/original/doc/html/pcre_compile2.html), but for most cases `PCRE_UTF8 | PCRE_UCP` will be enough. The default value is `0`. + +## Matching +{% note tip %} +Two words on PCRE workspaces. Workspace is memory area where PCRE stores information about back references and capturing groups. If passed workspace size is not enough, PCRE will allocate bigger workspace in heap. For simple matching and string searching of string without back references, workspace is not required and this library provides separate functions that won't waste space on workspace and this could save ≈0.5% of CPU TIME on simple patterns. +For regular expressions with capturing groups, recommended workspace size is `(capturing groups count + 1)`. +{% endnote %} + +In the example above matching function `Matches` returns boolean indicating that subject string matched pattern and accepts two arguments: +1. `TBasicStringBuf<TCharType>` with subject string +2. Regular expression execute flags. We don't want to reimplement every constant from PCRE library, so they are passed as they are. Full list of compile flags can be found [here](https://www.pcre.org/original/doc/html/pcre_exec.html). For most cases `0` will be just fine and this is the default value. + +## Searching +Function `Find` accepts the same arguments as `Match` and returns `TMaybe<NPcre::TPcreMatch>` which contains pair of ints with start and end offsets of string found. Check result for `Defined` to ensure that pattern was found in subject string. + +## Capturing +The last member function of `NPcre::TPcre` is `Capture` which searches for pattern and returns capturing group. + +### Return value +Return value is `NPcre::TPcreMatches` which is alias for `TVector<NPcre::TPcreMatch>`. +Vector will be empty if pattern wasn't found in subject string. +If pattern was found, first element will contain start and end offsets of string found. +All other elements will contains start and end offsets of capturing groups in order they appeared in regular expression. +{% note tip %} +If some capturing group not matched subject string, but some of consequent capturing groups did, this capturing group will present as `-1, -1` pair. +For example: calling `Capture` on pattern `(a)(?:(b)c|b(d))` against subject string `zabda` will return `[{1,4},{1,2},{-1,-1},{3,4}]` because capturing group `(b)` wasn't matched. +{% endnote %} +### Arguments +1. `TBasicStringBuf<TCharType>` with subject string +2. Regular expression execute flags. +3. Initial workspace size. Default value is `16` but if pattern contains more than 16 capturing groups, this function will reallocate workspace with bigger size. diff --git a/library/cpp/regex/pcre/benchmark/main.cpp b/library/cpp/regex/pcre/benchmark/main.cpp new file mode 100644 index 0000000000..3c11ef4f29 --- /dev/null +++ b/library/cpp/regex/pcre/benchmark/main.cpp @@ -0,0 +1,80 @@ +#include <benchmark/benchmark.h> + +#include <library/cpp/regex/pcre/pcre.h> + +#include <util/charset/wide.h> +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> + +static TStringBuf SimplePattern = "[-.\\w]+@(?:[a-z\\d]{2,}\\.)+[a-z]{2,6}"; +static TStringBuf ComplexPattern = R"((?:(?:\r\n)?[ \t])*(?:(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*)|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*:(?:(?:\r\n)?[ \t])*(?:(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*)(?:,\s*(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*))*)?;\s*))"; + +static constexpr size_t HaystacksCount = 32; +static constexpr size_t MinPrefix = 1024; + +static TVector<TString> GenerateHaystacks() { + // Generate long randomized haystacks to prevent cache hit + TVector<TString> result(Reserve(HaystacksCount)); + for (size_t i = 0; i < HaystacksCount; ++i) { + result.push_back(TString::Join(ComplexPattern.SubString(MinPrefix + i, ComplexPattern.Size() - MinPrefix - i), ComplexPattern.SubString(0, MinPrefix + i))); + } + return result; +} + +static const TVector<TString> Haystacks{GenerateHaystacks()}; + +static const NPcre::TPcre<char> Simple{SimplePattern.Data()}; +static const NPcre::TPcre<char> SimpleStudy{SimplePattern.Data(), NPcre::EOptimize::Study}; +static const NPcre::TPcre<char> SimpleJIT{SimplePattern.Data(), NPcre::EOptimize::JIT}; +static const NPcre::TPcre<char> Complex{ComplexPattern.Data()}; +static const NPcre::TPcre<char> ComplexStudy{ComplexPattern.Data(), NPcre::EOptimize::Study}; +static const NPcre::TPcre<char> ComplexJIT{ComplexPattern.Data(), NPcre::EOptimize::JIT}; + +static void Benchmark(benchmark::State& state, const NPcre::TPcre<char>& pattern) { + for (auto _ : state) { + for (size_t i = 0; i < HaystacksCount; ++i) { + // Force string reallocation, so there will be no chance for cache hit of any type + benchmark::DoNotOptimize(pattern.Matches(TString{i, 'a'} + Haystacks[i])); + } + } +} + +static void BenchmarkSimplePatternJIT(benchmark::State& state) { + Benchmark(state, SimpleJIT); +} + +static void BenchmarkSimplePatternStudy(benchmark::State& state) { + Benchmark(state, SimpleStudy); +} + +static void BenchmarkSimplePattern(benchmark::State& state) { + Benchmark(state, Simple); +} + +BENCHMARK(BenchmarkSimplePatternJIT)->Iterations(1); +BENCHMARK(BenchmarkSimplePatternStudy)->Iterations(1); +BENCHMARK(BenchmarkSimplePattern)->Iterations(1); +BENCHMARK(BenchmarkSimplePatternJIT); +BENCHMARK(BenchmarkSimplePatternStudy); +BENCHMARK(BenchmarkSimplePattern); + +static void BenchmarkComplexPatternJIT(benchmark::State& state) { + Benchmark(state, ComplexJIT); +} + +static void BenchmarkComplexPatternStudy(benchmark::State& state) { + Benchmark(state, ComplexStudy); +} + +static void BenchmarkComplexPattern(benchmark::State& state) { + Benchmark(state, Complex); +} + +BENCHMARK(BenchmarkComplexPatternJIT)->Iterations(1); +BENCHMARK(BenchmarkComplexPatternStudy)->Iterations(1); +BENCHMARK(BenchmarkComplexPattern)->Iterations(1); +BENCHMARK(BenchmarkComplexPatternJIT); +BENCHMARK(BenchmarkComplexPatternStudy); +BENCHMARK(BenchmarkComplexPattern); + diff --git a/library/cpp/regex/pcre/benchmark/ya.make b/library/cpp/regex/pcre/benchmark/ya.make new file mode 100644 index 0000000000..7c30fae0a6 --- /dev/null +++ b/library/cpp/regex/pcre/benchmark/ya.make @@ -0,0 +1,14 @@ +G_BENCHMARK() + +OWNER(g:so) + +PEERDIR( + library/cpp/regex/pcre +) + +SRCS( + main.cpp +) + +END() + diff --git a/library/cpp/regex/pcre/pcre.cpp b/library/cpp/regex/pcre/pcre.cpp new file mode 100644 index 0000000000..9e97d5f8f7 --- /dev/null +++ b/library/cpp/regex/pcre/pcre.cpp @@ -0,0 +1 @@ +#include "pcre.h" diff --git a/library/cpp/regex/pcre/pcre.h b/library/cpp/regex/pcre/pcre.h new file mode 100644 index 0000000000..82a9774f00 --- /dev/null +++ b/library/cpp/regex/pcre/pcre.h @@ -0,0 +1,191 @@ +#pragma once + +#include "traits.h" + +#include <library/cpp/containers/stack_array/stack_array.h> + +#include <util/generic/maybe.h> +#include <util/generic/strbuf.h> +#include <util/generic/vector.h> +#include <util/generic/yexception.h> + +namespace NPcre { + //! Start and end offset for match group. + using TPcreMatch = std::pair<int, int>; + + //! Full match result containing all capturing groups. + /*! + * At zero index we have whole matched string start and end offsets. + * All other elements will contain capturing groups positions. + * Non-captured capturing groups will have {-1, -1} offsets. + */ + using TPcreMatches = TVector<TPcreMatch>; + + //! Compiled pattern optimization strategy. + enum class EOptimize { + //! No optimization. + /*! + * Useful for non-reusable patterns where compile time matters. + */ + None, + //! Basic optimization via |pcre_study|. + /*! + * Could give up to 4x match speed boost in exchange of increased + * construction time. Could not. + */ + Study, + //! PCRE JIT optimization. + /*! + * Could give up to 10x match speed bust in exchange of significantly + * increased compile time. Also, for very complex patterns |pcre_exec| + * could return |PCRE_ERROR_JIT_STACKLIMIT|. See + * https://www.pcre.org/original/doc/html/pcrejit.html for details. + */ + JIT + }; + + //! PCRE code container. Controls its life time and provides handy wrapper. + template <class TCharType> + class TPcre { + private: + using TCodeType = typename TPcreTraits<TCharType>::TCodeType; + using TExtraType = typename TPcreTraits<TCharType>::TExtraType; + using TStringType = typename TPcreTraits<TCharType>::TStringType; + using TTraits = TPcreTraits<TCharType>; + static constexpr size_t DefaultWorkspaceSize = 16; + + public: + //! Compiles regexp into internal representation for future use. + /*! + * \param pattern Regular expression to be compiled. + * \param optimize If |EOptimize::JIT|, perform additional + * analysis, which will take extra time, but could + * speed up matching. |None| to omit optimization. + * \param compileFlags See https://www.pcre.org/original/doc/html/pcre_compile2.html + **/ + TPcre(const TCharType* pattern, EOptimize optimize = EOptimize::None, int compileFlags = 0) { + int errcode; + const char* errptr; + int erroffset; + Code.Reset(TTraits::Compile((TStringType) pattern, compileFlags, &errcode, &errptr, &erroffset, nullptr)); + if (!Code) { + ythrow yexception() << "Failed to compile pattern <" << pattern + << ">, because of error at pos " << erroffset + << ", error code " << errcode << ": " << errptr; + } + if (optimize != EOptimize::None) { + errptr = nullptr; + int options; + if (optimize == EOptimize::Study) { + options = 0; + } else { + options = PCRE_STUDY_JIT_COMPILE; + } + Extra.Reset(TTraits::Study(Code.Get(), options, &errptr)); + if (errptr) { + ythrow yexception() << "Failed to study pattern <" << pattern << ">: " << errptr; + } + } + } + + //! Check if compiled pattern matches string. + /*! + * \param string String to search in. + * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html + * \param workspaceSize Amount of space which will be allocated for + * back references. PCRE could allocate more + * heap space is provided workspaceSize won't + * fit all of them. + * \returns |true| if there is a match. + */ + bool Matches(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const { + Y_ASSERT(workspaceSize >= 0); + size_t ovecsize = workspaceSize * 3; + NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); + return ConvertReturnCode(TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize)); + } + + //! Find compiled pattern in string. + /*! + * \param string String to search in. + * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html + * \param workspaceSize Amount of space which will be allocated for + * back references. PCRE could allocate more + * heap space is provided workspaceSize won't + * fit all of them. + * \returns Start and end offsets pair if there is a + * match. |Nothing| otherwise. + */ + Y_NO_SANITIZE("memory") TMaybe<TPcreMatch> Find(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const { + Y_ASSERT(workspaceSize >= 0); + size_t ovecsize = workspaceSize * 3; + NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); + for (size_t i = 0; i < ovecsize; ++i) { + ovector[i] = -4; + } + int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize); + if (ConvertReturnCode(rc)) { + return MakeMaybe<TPcreMatch>(ovector[0], ovector[1]); + } else { + return Nothing(); + } + } + + //! Find and return all capturing groups in string. + /*! + * \param string String to search in. + * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html + * \param initialWorkspaceSize Capturing groups vector initial size. + * Workspace will be grown and search will + * be repeated if there is not enough + * space. + * \returns List of capturing groups start and end + * offsets. First element will contain + * whole matched substring start and end + * offsets. For non-matched capturing + * groups, result will contain {-1, -1} + * pair. + * If pattern not found in string, result + * vector will be empty. + */ + Y_NO_SANITIZE("memory") TPcreMatches Capture(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t initialWorkspaceSize = DefaultWorkspaceSize) const { + Y_ASSERT(initialWorkspaceSize > 0); + size_t ovecsize = (initialWorkspaceSize + 1) * 3; + while (true) { + NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); + int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize); + if (rc > 0) { + TPcreMatches result(Reserve(rc >> 1)); + for (int i = 0, pos = 0; i < rc; ++i) { + int start = ovector[pos++]; + int end = ovector[pos++]; + result.emplace_back(start, end); + } + return result; + } else if (rc == 0) { + ovecsize <<= 1; + } else if (rc == PCRE_ERROR_NOMATCH) { + return TPcreMatches{}; + } else if (rc < 0) { + ythrow yexception() << "Error. RC = " << rc; + } + } + } + + private: + TPcreCode<TCharType> Code; + TPcreExtra<TCharType> Extra; + + private: + static inline bool ConvertReturnCode(int rc) { + if (rc >= 0) { + return true; + } else if (rc == PCRE_ERROR_NOMATCH) { + return false; + } else { + ythrow yexception() << "Error. RC = " << rc; + } + } + }; +} + diff --git a/library/cpp/regex/pcre/pcre_ut.cpp b/library/cpp/regex/pcre/pcre_ut.cpp new file mode 100644 index 0000000000..84d06499ae --- /dev/null +++ b/library/cpp/regex/pcre/pcre_ut.cpp @@ -0,0 +1,89 @@ +#include <library/cpp/regex/pcre/pcre.h> + +#include <library/cpp/testing/unittest/registar.h> + +template <class T> +inline IOutputStream& operator<<(IOutputStream& out, const TVector<T>& value) { + size_t size = value.size(); + out << "["; + for (size_t i = 0; i < size; ++i) { + if (i) { + out << ","; + } + out << value[i]; + } + out << "]"; + return out; +} + +template <class T, class U> +inline IOutputStream& operator<<(IOutputStream& out, const std::pair<T, U>& value) { + out << "{" << value.first << "," << value.second << "}"; + return out; +} + +// char8_t +#define OPTIMIZE NPcre::EOptimize::None +#define TEST_NAME(S) S +#define STRING(S) S +#define CHAR_TYPE char +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::Study +#undef TEST_NAME +#define TEST_NAME(S) S ## Study +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::JIT +#undef TEST_NAME +#define TEST_NAME(S) S ## JIT +#include "pcre_ut_base.h" + +// char16_t +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::None +#undef TEST_NAME +#define TEST_NAME(S) S ## 16 +#undef STRING +#define STRING(S) u ## S +#undef CHAR_TYPE +#define CHAR_TYPE wchar16 +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::Study +#undef TEST_NAME +#define TEST_NAME(S) S ## Study16 +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::JIT +#undef TEST_NAME +#define TEST_NAME(S) S ## JIT16 +#include "pcre_ut_base.h" + +// char32_t +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::None +#undef TEST_NAME +#define TEST_NAME(S) S ## 32 +#undef STRING +#define STRING(S) U ## S +#undef CHAR_TYPE +#define CHAR_TYPE wchar32 +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::Study +#undef TEST_NAME +#define TEST_NAME(S) S ## Study32 +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::JIT +#undef TEST_NAME +#define TEST_NAME(S) S ## JIT32 +#include "pcre_ut_base.h" + diff --git a/library/cpp/regex/pcre/pcre_ut_base.h b/library/cpp/regex/pcre/pcre_ut_base.h new file mode 100644 index 0000000000..1d61d07b14 --- /dev/null +++ b/library/cpp/regex/pcre/pcre_ut_base.h @@ -0,0 +1,38 @@ +#define CHECK_MATCHES(EXPECTED, PATTERN, STR) \ + UNIT_ASSERT(EXPECTED == NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Matches(STRING(STR))); \ + UNIT_ASSERT(EXPECTED == NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Matches(STRING(STR), 0, 10)); + +#define CHECK(A, B) UNIT_ASSERT_STRINGS_EQUAL(ToString(STRING(A)), ToString(B)) + +#define CHECK_GROUPS(EXPECTED, PATTERN, STR) \ + CHECK(EXPECTED, NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Find(STRING(STR))); \ + CHECK(EXPECTED, NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Find(STRING(STR), 0, 10)); + +Y_UNIT_TEST_SUITE(TEST_NAME(TestRegExp)) { + Y_UNIT_TEST(TestMatches) { + CHECK_MATCHES(true, "ю", "bюd"); + CHECK_MATCHES(false, "c", "bюd"); + CHECK_MATCHES(true, "(ю)(?:(b)c|bd)", "zюbda"); + CHECK_MATCHES(false, "(ю)(?:(b)c|bd)", "bюd"); + CHECK_MATCHES(true, "(abc|def)=\\g1", "abc=abc"); + CHECK_MATCHES(true, "(abc|def)=\\g1", "def=def"); + CHECK_MATCHES(false, "(abc|def)=\\g1", "abc=def"); + } + + Y_UNIT_TEST(TestGroups) { + CHECK_GROUPS("{1,2}", "a", "bad"); + CHECK_GROUPS("(empty maybe)", "c", "bad"); + CHECK_GROUPS("{1,4}", "(a)(?:(b)c|bd)", "zabda"); + CHECK_GROUPS("(empty maybe)", "(a)(?:(b)c|bd)", "bad"); + CHECK_GROUPS("{1,8}", "(abc|def)=\\g1", "aabc=abca"); + CHECK_GROUPS("(empty maybe)", "(abc|def)=\\g1", "abc=def"); + } + + Y_UNIT_TEST(TestCapture) { + CHECK("[{1,2}]",NPcre::TPcre<CHAR_TYPE>(STRING("a"), OPTIMIZE).Capture(STRING("bad"), 0, 1)); + CHECK("[]",NPcre::TPcre<CHAR_TYPE>(STRING("c"), OPTIMIZE).Capture(STRING("bad"), 0, 1)); + CHECK("[{1,4},{1,2},{-1,-1},{3,4}]",NPcre::TPcre<CHAR_TYPE>(STRING("(a)(?:(b)c|b(d))"), OPTIMIZE).Capture(STRING("zabda"), 0, 1)); + CHECK("[]",NPcre::TPcre<CHAR_TYPE>(STRING("(a)(?:(b)c|bd)"), OPTIMIZE).Capture(STRING("bad"), 0, 1)); + } +} + diff --git a/library/cpp/regex/pcre/regexp.cpp b/library/cpp/regex/pcre/regexp.cpp new file mode 100644 index 0000000000..575c09cee4 --- /dev/null +++ b/library/cpp/regex/pcre/regexp.cpp @@ -0,0 +1,317 @@ +#include "regexp.h" + +#include <util/generic/string.h> +#include <util/string/ascii.h> +#include <util/system/defaults.h> + +#include <cstdlib> +#include <util/generic/noncopyable.h> + +class TGlobalImpl : TNonCopyable { +private: + const char* Str; + regmatch_t* Pmatch; + int Options; + int StrLen; + int StartOffset, NotEmptyOpts, MatchPos; + int MatchBuf[NMATCHES * 3]; + pcre* PregComp; + + enum StateCode { + TGI_EXIT, + TGI_CONTINUE, + TGI_WALKTHROUGH + }; + +private: + void CopyResults(int count) { + for (int i = 0; i < count; i++) { + Pmatch[MatchPos].rm_so = MatchBuf[2 * i]; + Pmatch[MatchPos].rm_eo = MatchBuf[2 * i + 1]; + MatchPos++; + if (MatchPos >= NMATCHES) { + ythrow yexception() << "TRegExBase::Exec(): Not enough space in internal buffer."; + } + } + } + + int DoPcreExec(int opts) { + int rc = pcre_exec( + PregComp, /* the compiled pattern */ + nullptr, /* no extra data - we didn't study the pattern */ + Str, /* the subject string */ + StrLen, /* the length of the subject */ + StartOffset, /* start at offset 0 in the subject */ + opts, /* default options */ + MatchBuf, /* output vector for substring information */ + NMATCHES); /* number of elements in the output vector */ + + if (rc == 0) { + ythrow yexception() << "TRegExBase::Exec(): Not enough space in internal buffer."; + } + + return rc; + } + + StateCode CheckEmptyCase() { + if (MatchBuf[0] == MatchBuf[1]) { // founded an empty string + if (MatchBuf[0] == StrLen) { // at the end + return TGI_EXIT; + } + NotEmptyOpts = PCRE_NOTEMPTY | PCRE_ANCHORED; // trying to find non empty string + } + return TGI_WALKTHROUGH; + } + + StateCode CheckNoMatch(int rc) { + if (rc == PCRE_ERROR_NOMATCH) { + if (NotEmptyOpts == 0) { + return TGI_EXIT; + } + + MatchBuf[1] = StartOffset + 1; // we have failed to find non-empty-string. trying to find again shifting "previous match offset" + return TGI_CONTINUE; + } + return TGI_WALKTHROUGH; + } + +public: + TGlobalImpl(const char* st, regmatch_t& pma, int opts, pcre* pc_re) + : Str(st) + , Pmatch(&pma) + , Options(opts) + , StartOffset(0) + , NotEmptyOpts(0) + , MatchPos(0) + , PregComp(pc_re) + { + memset(Pmatch, -1, sizeof(regmatch_t) * NMATCHES); + StrLen = strlen(Str); + } + + int ExecGlobal() { + StartOffset = 0; + int rc = DoPcreExec(Options); + + if (rc < 0) { + return rc; + } + CopyResults(rc); + do { + NotEmptyOpts = 0; + StartOffset = MatchBuf[1]; + + if (CheckEmptyCase() == TGI_EXIT) { + return 0; + } + + rc = DoPcreExec(NotEmptyOpts | Options); + + switch (CheckNoMatch(rc)) { + case TGI_CONTINUE: + continue; + case TGI_EXIT: + return 0; + case TGI_WALKTHROUGH: + default: + break; + } + + if (rc < 0) { + return rc; + } + + CopyResults(rc); + } while (true); + + return 0; + } + +private: +}; + +class TRegExBaseImpl: public TAtomicRefCount<TRegExBaseImpl> { + friend class TRegExBase; + +protected: + int CompileOptions; + TString RegExpr; + regex_t Preg; + +public: + TRegExBaseImpl() + : CompileOptions(0) + { + memset(&Preg, 0, sizeof(Preg)); + } + + TRegExBaseImpl(const TString& re, int cflags) + : CompileOptions(cflags) + , RegExpr(re) + { + int rc = regcomp(&Preg, re.data(), cflags); + if (rc) { + const size_t ERRBUF_SIZE = 100; + char errbuf[ERRBUF_SIZE]; + regerror(rc, &Preg, errbuf, ERRBUF_SIZE); + Error = "Error: regular expression " + re + " is wrong: " + errbuf; + ythrow yexception() << "RegExp " << re << ": " << Error.data(); + } + } + + int Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches) const { + if (!RegExpr) { + ythrow yexception() << "Regular expression is not compiled"; + } + if (!str) { + ythrow yexception() << "Empty string is passed to TRegExBaseImpl::Exec"; + } + if ((eflags & REGEXP_GLOBAL) == 0) { + return regexec(&Preg, str, nmatches, pmatch, eflags); + } else { + int options = 0; + if ((eflags & REG_NOTBOL) != 0) + options |= PCRE_NOTBOL; + if ((eflags & REG_NOTEOL) != 0) + options |= PCRE_NOTEOL; + + return TGlobalImpl(str, pmatch[0], options, (pcre*)Preg.re_pcre).ExecGlobal(); + } + } + + bool IsCompiled() { + return Preg.re_pcre; + } + + ~TRegExBaseImpl() { + regfree(&Preg); + } + +private: + TString Error; +}; + +bool TRegExBase::IsCompiled() const { + return Impl && Impl->IsCompiled(); +} + +TRegExBase::TRegExBase(const char* re, int cflags) { + if (re) { + Compile(re, cflags); + } +} + +TRegExBase::TRegExBase(const TString& re, int cflags) { + Compile(re, cflags); +} + +TRegExBase::~TRegExBase() { +} + +void TRegExBase::Compile(const TString& re, int cflags) { + Impl = new TRegExBaseImpl(re, cflags); +} + +int TRegExBase::Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches) const { + if (!Impl) + ythrow yexception() << "!Regular expression is not compiled"; + return Impl->Exec(str, pmatch, eflags, nmatches); +} + +int TRegExBase::GetCompileOptions() const { + if (!Impl) + ythrow yexception() << "!Regular expression is not compiled"; + return Impl->CompileOptions; +} + +TString TRegExBase::GetRegExpr() const { + if (!Impl) + ythrow yexception() << "!Regular expression is not compiled"; + return Impl->RegExpr; +} + +TRegExMatch::TRegExMatch(const char* re, int cflags) + : TRegExBase(re, cflags) +{ +} + +TRegExMatch::TRegExMatch(const TString& re, int cflags) + : TRegExBase(re, cflags) +{ +} + +bool TRegExMatch::Match(const char* str) const { + return Exec(str, nullptr, 0, 0) == 0; +} + +TRegExSubst::TRegExSubst(const char* re, int cflags) + : TRegExBase(re, cflags) + , Replacement(nullptr) +{ + memset(Brfs, 0, sizeof(TBackReferences) * NMATCHES); +} + +TString TRegExSubst::Replace(const char* str, int eflags) { + TString s; + if (BrfsCount) { + if (Exec(str, PMatch, eflags) == 0) { + int i; + for (i = 0; i < BrfsCount; i++) { + s += TString(Replacement, Brfs[i].Beg, Brfs[i].End - Brfs[i].Beg); + if (Brfs[i].Refer >= 0 && Brfs[i].Refer < NMATCHES) + s += TString(str, PMatch[Brfs[i].Refer].rm_so, int(PMatch[Brfs[i].Refer].rm_eo - PMatch[Brfs[i].Refer].rm_so)); + } + s += TString(Replacement, Brfs[i].Beg, Brfs[i].End - Brfs[i].Beg); + } + } else { + s = Replacement; + } + return s; +} + +//*** +// ��� ������������ ������ aaa.$1.$$$$.$2.bbb.$$$ccc Brfs ����� �����: +// {beg = 0, end = 4, Refer = 1} => "aaa." + $1_match +// {beg = 6, end = 8, Refer = -1} => ".$" +// {beg = 9, end = 10, Refer = -1} => "$" +// {beg = 11, end = 12, Refer = 2} => "." + $2_match +// {beg = 14, end = 20, Refer = -1} => ".bbb.$" +// {beg = 21, end = 22, Refer = -1} => "$" +// {beg = 22, end = 25, Refer = -1} => "ccc" +// {beg = 0, end = 0, Refer = 0} +//*** +int TRegExSubst::ParseReplacement(const char* repl) { + Replacement = repl; + if (!Replacement || *Replacement == 0) + return 0; + char* pos = (char*)Replacement; + char* pos1 = nullptr; + char* pos2 = nullptr; + int i = 0; + while (pos && *pos && i < NMATCHES) { + pos1 = strchr(pos, '$'); + Brfs[i].Refer = -1; + pos2 = pos1; + if (pos1) { + pos2 = pos1 + 1; + while (IsAsciiDigit(*pos2)) + pos2++; + if (pos2 > pos1 + 1) { + Brfs[i].Refer = atol(TString(Replacement, pos1 + 1 - Replacement, pos2 - (pos1 + 1)).data()); + } else { + pos1++; + if (*pos2 == '$') + pos2++; + Brfs[i].Refer = -1; + } + } + Brfs[i].Beg = int(pos - (char*)Replacement); + Brfs[i].End = (pos1 == nullptr ? (int)strlen(Replacement) : int(pos1 - Replacement)); + pos = pos2; + i++; + } + Brfs[i].Beg = Brfs[i].End = 0; + Brfs[i].Refer = -1; + BrfsCount = i; + return BrfsCount; +} diff --git a/library/cpp/regex/pcre/regexp.h b/library/cpp/regex/pcre/regexp.h new file mode 100644 index 0000000000..bc610bd2f3 --- /dev/null +++ b/library/cpp/regex/pcre/regexp.h @@ -0,0 +1,63 @@ +#pragma once + +#include <sys/types.h> + +#include <util/system/defaults.h> +#include <util/generic/string.h> +#include <util/generic/yexception.h> + +#include <contrib/libs/pcre/pcre.h> +#include <contrib/libs/pcre/pcreposix.h> + +//THIS CODE LOOKS LIKE A TRASH, BUT WORKS. + +#define NMATCHES 100 +#define REGEXP_GLOBAL 0x0080 // use this if you want to find all occurences + +class TRegExBaseImpl; + +class TRegExBase { +protected: + TSimpleIntrusivePtr<TRegExBaseImpl> Impl; + +public: + TRegExBase(const char* regExpr = nullptr, int cflags = REG_EXTENDED); + TRegExBase(const TString& regExpr, int cflags = REG_EXTENDED); + + virtual ~TRegExBase(); + + int Exec(const char* str, regmatch_t pmatch[], int eflags, int nmatches = NMATCHES) const; + void Compile(const TString& regExpr, int cflags = REG_EXTENDED); + bool IsCompiled() const; + int GetCompileOptions() const; + TString GetRegExpr() const; +}; + +class TRegExMatch: public TRegExBase { +public: + TRegExMatch(const char* regExpr = nullptr, int cflags = REG_NOSUB | REG_EXTENDED); + TRegExMatch(const TString& regExpr, int cflags = REG_NOSUB | REG_EXTENDED); + + bool Match(const char* str) const; +}; + +struct TBackReferences { + int Beg; + int End; + int Refer; +}; + +class TRegExSubst: public TRegExBase { +private: + const char* Replacement; + regmatch_t PMatch[NMATCHES]; + + TBackReferences Brfs[NMATCHES]; + int BrfsCount; + +public: + TRegExSubst(const char* regExpr = nullptr, int cflags = REG_EXTENDED); + + TString Replace(const char* str, int eflags = 0); + int ParseReplacement(const char* replacement); +}; diff --git a/library/cpp/regex/pcre/regexp_ut.cpp b/library/cpp/regex/pcre/regexp_ut.cpp new file mode 100644 index 0000000000..5184e801cc --- /dev/null +++ b/library/cpp/regex/pcre/regexp_ut.cpp @@ -0,0 +1,103 @@ +#include <library/cpp/testing/unittest/registar.h> + +#include <util/string/strip.h> +#include <library/cpp/regex/pcre/regexp.h> +#include <util/stream/output.h> + +struct TRegTest { + const char* Regexp; + const char* Data; + const char* Result; + int CompileOptions; + int RunOptions; + + TRegTest(const char* re, const char* text, const char* res, int copts = REG_EXTENDED, int ropts = 0) + : Regexp(re) + , Data(text) + , Result(res) + , CompileOptions(copts) + , RunOptions(ropts) + { + } +}; + +struct TSubstTest: public TRegTest { + const char* Replacement; + const char* Replacement2; + + TSubstTest(const char* re, const char* text, const char* res, const char* repl, const char* repl2) + : TRegTest(re, text, res, REG_EXTENDED, REGEXP_GLOBAL) + , Replacement(repl) + , Replacement2(repl2) + { + } +}; + +const TRegTest REGTEST_DATA[] = { + TRegTest("test", "its a test and test string.", "6 10", REG_EXTENDED, 0), + TRegTest("test", "its a test and test string.", "6 10 15 19", REG_EXTENDED, REGEXP_GLOBAL), + TRegTest("test|[an]{0,0}", "test and test an test string tes", "0 4 4 4 5 5 6 6 7 7 8 8 9 13 13 13 14 14 15 15 16 16 17 21 21 21 22 22 23 23 24 24 25 25 26 26 27 27 28 28 29 29 30 30 31 31 32 32", REG_EXTENDED, REGEXP_GLOBAL), + TRegTest("test[an]{1,}", "test and test an test string tes", "NM", REG_EXTENDED, REGEXP_GLOBAL)}; + +const TSubstTest SUBSTTEST_DATA[] = { + TSubstTest("([a-zA-Z]*[0-9]+) (_[a-z]+)", "Xxx123 534 ___124 bsd _A ZXC _L 141 _sd dsfg QWE123 _bbb", "141 XXX/_sd", "$1 XXX/$2", "$2$2$2 YY$1Y/$2")}; + +class TRegexpTest: public TTestBase { +private: + regmatch_t Matches[NMATCHES]; + +private: + UNIT_TEST_SUITE(TRegexpTest); + UNIT_TEST(TestRe) + UNIT_TEST(TestSubst) + UNIT_TEST(TestOffEndOfBuffer); + UNIT_TEST_SUITE_END(); + + inline void TestRe() { + for (const auto& regTest : REGTEST_DATA) { + memset(Matches, 0, sizeof(Matches)); + TString result; + + TRegExBase re(regTest.Regexp, regTest.CompileOptions); + if (re.Exec(regTest.Data, Matches, regTest.RunOptions) == 0) { + for (auto& matche : Matches) { + if (matche.rm_so == -1) { + break; + } + result.append(Sprintf("%i %i ", matche.rm_so, matche.rm_eo)); + } + } else { + result = "NM"; + } + StripInPlace(result); + UNIT_ASSERT_VALUES_EQUAL(result, regTest.Result); + } + } + + inline void TestSubst() { + for (const auto& substTest : SUBSTTEST_DATA) { + TRegExSubst subst(substTest.Regexp, substTest.CompileOptions); + subst.ParseReplacement(substTest.Replacement); + TString result = subst.Replace(substTest.Data, substTest.RunOptions); + UNIT_ASSERT_VALUES_EQUAL(result, substTest.Result); + TRegExSubst substCopy = subst; + subst.ParseReplacement(substTest.Replacement2); + TString newResult = subst.Replace(substTest.Data, substTest.RunOptions); + UNIT_ASSERT_VALUES_UNEQUAL(newResult.c_str(), result.c_str()); + TString copyResult = substCopy.Replace(substTest.Data, substTest.RunOptions); + UNIT_ASSERT_VALUES_EQUAL(copyResult, result); + substCopy = subst; + copyResult = substCopy.Replace(substTest.Data, substTest.RunOptions); + UNIT_ASSERT_VALUES_EQUAL(copyResult, newResult); + } + } + + void TestOffEndOfBuffer() { + const TString needle{".*[^./]gov[.].*"}; + TRegExMatch re{needle, REG_UTF8}; + const TString haystack{"fakty.ictv.ua"}; + UNIT_ASSERT_VALUES_EQUAL(re.Match(haystack.c_str()), false); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TRegexpTest); diff --git a/library/cpp/regex/pcre/traits.h b/library/cpp/regex/pcre/traits.h new file mode 100644 index 0000000000..e926bdd758 --- /dev/null +++ b/library/cpp/regex/pcre/traits.h @@ -0,0 +1,99 @@ +#pragma once + +#include <contrib/libs/pcre/pcre.h> + +#include <util/generic/ptr.h> // THolder +#include <util/system/types.h> // wchar16, wchar32 + +namespace NPcre { + template <class TCharType> + struct TPcreTraits; + + template <> + struct TPcreTraits<char> { + using TCharType = char; + using TStringType = const char*; + using TCodeType = pcre; + using TExtraType = pcre_extra; + static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre_compile2; + static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre_study; + static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre_exec; + }; + + template <> + struct TPcreTraits<wchar16> { + using TCharType = wchar16; + using TStringType = PCRE_SPTR16; + using TCodeType = pcre16; + using TExtraType = pcre16_extra; + static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre16_compile2; + static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre16_study; + static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre16_exec; + }; + + template <> + struct TPcreTraits<wchar32> { + using TCharType = wchar32; + using TStringType = PCRE_SPTR32; + using TCodeType = pcre32; + using TExtraType = pcre32_extra; + static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre32_compile2; + static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre32_study; + static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre32_exec; + }; + + template <class TCharType> + struct TFreePcre; + + template <> + struct TFreePcre<char> { + static inline void Destroy(void* ptr) noexcept { + pcre_free(ptr); + } + }; + + template <> + struct TFreePcre<wchar16> { + static inline void Destroy(void* ptr) noexcept { + pcre16_free(ptr); + } + }; + + template <> + struct TFreePcre<wchar32> { + static inline void Destroy(void* ptr) noexcept { + pcre32_free(ptr); + } + }; + + template <class TCharType> + struct TFreePcreExtra; + + template <> + struct TFreePcreExtra<char> { + static inline void Destroy(pcre_extra* ptr) noexcept { + pcre_free_study(ptr); + } + }; + + template <> + struct TFreePcreExtra<wchar16> { + static inline void Destroy(pcre16_extra* ptr) noexcept { + pcre16_free_study(ptr); + } + }; + + template <> + struct TFreePcreExtra<wchar32> { + static inline void Destroy(pcre32_extra* ptr) noexcept { + pcre32_free_study(ptr); + } + }; + + template <typename TCharType> + using TPcreCode = THolder<typename TPcreTraits<TCharType>::TCodeType, TFreePcre<TCharType>>; + + template <typename TCharType> + using TPcreExtra = THolder<typename TPcreTraits<TCharType>::TExtraType, TFreePcreExtra<TCharType>>; +} + diff --git a/library/cpp/regex/pcre/ut/ya.make b/library/cpp/regex/pcre/ut/ya.make new file mode 100644 index 0000000000..0721ef87c2 --- /dev/null +++ b/library/cpp/regex/pcre/ut/ya.make @@ -0,0 +1,10 @@ +UNITTEST_FOR(library/cpp/regex/pcre) + +OWNER(g:util) + +SRCS( + pcre_ut.cpp + regexp_ut.cpp +) + +END() diff --git a/library/cpp/regex/pcre/ya.make b/library/cpp/regex/pcre/ya.make new file mode 100644 index 0000000000..d34911f103 --- /dev/null +++ b/library/cpp/regex/pcre/ya.make @@ -0,0 +1,23 @@ +LIBRARY() + +OWNER(g:util) + +PEERDIR( + contrib/libs/pcre + contrib/libs/pcre/pcre16 + contrib/libs/pcre/pcre32 + library/cpp/containers/stack_array +) + +SRCS( + pcre.cpp + regexp.cpp +) + +END() + +RECURSE_FOR_TESTS( + benchmark + ut +) + |