diff options
author | Dmitry Potapov <potapov.d@gmail.com> | 2022-02-10 16:46:39 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:39 +0300 |
commit | 536101ea75c9ff5df10d01c2f460b1f6e12311b3 (patch) | |
tree | 115291277ad61b2cdcf5044d210fb103b5e1647e /library/cpp | |
parent | 5036b5f2122001f9aef8a0e4cd85440d73ea6b9f (diff) | |
download | ydb-536101ea75c9ff5df10d01c2f460b1f6e12311b3.tar.gz |
Restoring authorship annotation for Dmitry Potapov <potapov.d@gmail.com>. Commit 1 of 2.
Diffstat (limited to 'library/cpp')
-rw-r--r-- | library/cpp/html/ya.make | 6 | ||||
-rw-r--r-- | library/cpp/http/io/compression.h | 2 | ||||
-rw-r--r-- | library/cpp/http/io/stream.cpp | 16 | ||||
-rw-r--r-- | library/cpp/http/io/stream.h | 2 | ||||
-rw-r--r-- | library/cpp/http/server/http.cpp | 4 | ||||
-rw-r--r-- | library/cpp/on_disk/chunks/reader.cpp | 2 | ||||
-rw-r--r-- | library/cpp/regex/pcre/README.md | 114 | ||||
-rw-r--r-- | library/cpp/regex/pcre/benchmark/main.cpp | 160 | ||||
-rw-r--r-- | library/cpp/regex/pcre/benchmark/ya.make | 28 | ||||
-rw-r--r-- | library/cpp/regex/pcre/pcre.cpp | 2 | ||||
-rw-r--r-- | library/cpp/regex/pcre/pcre.h | 382 | ||||
-rw-r--r-- | library/cpp/regex/pcre/pcre_ut.cpp | 178 | ||||
-rw-r--r-- | library/cpp/regex/pcre/pcre_ut_base.h | 76 | ||||
-rw-r--r-- | library/cpp/regex/pcre/traits.h | 198 | ||||
-rw-r--r-- | library/cpp/regex/pcre/ut/ya.make | 2 | ||||
-rw-r--r-- | library/cpp/regex/pcre/ya.make | 20 | ||||
-rw-r--r-- | library/cpp/regex/pire/regexp.h | 308 | ||||
-rw-r--r-- | library/cpp/regex/pire/ut/regexp_ut.cpp | 56 | ||||
-rw-r--r-- | library/cpp/ya.make | 2 |
19 files changed, 779 insertions, 779 deletions
diff --git a/library/cpp/html/ya.make b/library/cpp/html/ya.make index ab904be3b9..ebcdfb3725 100644 --- a/library/cpp/html/ya.make +++ b/library/cpp/html/ya.make @@ -1,4 +1,4 @@ -RECURSE( +RECURSE( dehtml dehtml/ut detect @@ -20,7 +20,7 @@ RECURSE( print relalternate relalternate/ut - sanitize + sanitize spec storage storage/ut @@ -29,4 +29,4 @@ RECURSE( url zoneconf zoneconf/ut -) +) diff --git a/library/cpp/http/io/compression.h b/library/cpp/http/io/compression.h index f16c4a18eb..6e4d9c78aa 100644 --- a/library/cpp/http/io/compression.h +++ b/library/cpp/http/io/compression.h @@ -35,7 +35,7 @@ public: inline TArrayRef<const TStringBuf> GetBestCodecs() const { return BestCodecs_; } - + private: void Add(TStringBuf name, TDecoderConstructor d, TEncoderConstructor e); diff --git a/library/cpp/http/io/stream.cpp b/library/cpp/http/io/stream.cpp index 6689be684f..5eee4a5ea5 100644 --- a/library/cpp/http/io/stream.cpp +++ b/library/cpp/http/io/stream.cpp @@ -9,7 +9,7 @@ #include <util/stream/null.h> #include <util/stream/tee.h> -#include <util/system/compat.h> +#include <util/system/compat.h> #include <util/system/yassert.h> #include <util/network/socket.h> @@ -25,7 +25,7 @@ #define HEADERCMP(header, str) \ case sizeof(str) - 1: \ if (!stricmp((header).Name().data(), str)) - + namespace { inline size_t SuggestBufferSize() { return 8192; @@ -192,10 +192,10 @@ public: if (HasContentLength_) { value = ContentLength_; return true; - } + } return false; } - + inline bool ContentEncoded() const noexcept { return ContentEncoded_; } @@ -381,7 +381,7 @@ private: bool KeepAlive_; TAcceptCodings Codings_; - + bool HasContentLength_; ui64 ContentLength_; @@ -441,9 +441,9 @@ TString THttpInput::BestCompressionScheme() const { } bool THttpInput::GetContentLength(ui64& value) const noexcept { - return Impl_->GetContentLength(value); -} - + return Impl_->GetContentLength(value); +} + bool THttpInput::ContentEncoded() const noexcept { return Impl_->ContentEncoded(); } diff --git a/library/cpp/http/io/stream.h b/library/cpp/http/io/stream.h index 78ca4fc814..c8198bf664 100644 --- a/library/cpp/http/io/stream.h +++ b/library/cpp/http/io/stream.h @@ -76,7 +76,7 @@ public: /// Если заголовки содержат Content-Length, возвращает true и /// записывает значение из заголовка в value bool GetContentLength(ui64& value) const noexcept; - + /// Признак запакованности данных, - если выставлен, то Content-Length, при наличии в заголовках, /// показывает объём запакованных данных, а из THttpInput мы будем вычитывать уже распакованные. bool ContentEncoded() const noexcept; diff --git a/library/cpp/http/server/http.cpp b/library/cpp/http/server/http.cpp index 128583bdd7..c53a121390 100644 --- a/library/cpp/http/server/http.cpp +++ b/library/cpp/http/server/http.cpp @@ -3,8 +3,8 @@ #include <library/cpp/threading/equeue/equeue.h> -#include <util/generic/buffer.h> -#include <util/generic/cast.h> +#include <util/generic/buffer.h> +#include <util/generic/cast.h> #include <util/generic/intrlist.h> #include <util/generic/yexception.h> #include <util/network/address.h> diff --git a/library/cpp/on_disk/chunks/reader.cpp b/library/cpp/on_disk/chunks/reader.cpp index 6e28cbf367..043324786a 100644 --- a/library/cpp/on_disk/chunks/reader.cpp +++ b/library/cpp/on_disk/chunks/reader.cpp @@ -1,4 +1,4 @@ -#include <util/generic/cast.h> +#include <util/generic/cast.h> #include <util/memory/blob.h> #include <util/system/unaligned_mem.h> diff --git a/library/cpp/regex/pcre/README.md b/library/cpp/regex/pcre/README.md index b5b09a3715..d8ce466456 100644 --- a/library/cpp/regex/pcre/README.md +++ b/library/cpp/regex/pcre/README.md @@ -1,59 +1,59 @@ -# About -This is a PCRE library wrapper which provides unified interface for UTF-8, UTF-16 and UTF-32 strings matching and optimization control. - -# Rationale -Many Arcadia related libraries (telfinder, lemmer etc.) provides only UTF-16 interfaces, because this is way faster for cyrillic texts. Any algorithm that is working with such libraries and regular expressions must use `WideToUTF8` and `UTF8ToWide` at the borderline between regular expression and UTF-18 interface. This leads us to great performance penalty. -This library allows us to erase these charset conversions. - -# Interface - -Before starting with interface details, let's consider simplest library usage example: +# About +This is a PCRE library wrapper which provides unified interface for UTF-8, UTF-16 and UTF-32 strings matching and optimization control. + +# Rationale +Many Arcadia related libraries (telfinder, lemmer etc.) provides only UTF-16 interfaces, because this is way faster for cyrillic texts. Any algorithm that is working with such libraries and regular expressions must use `WideToUTF8` and `UTF8ToWide` at the borderline between regular expression and UTF-18 interface. This leads us to great performance penalty. +This library allows us to erase these charset conversions. + +# Interface + +Before starting with interface details, let's consider simplest library usage example: `UNIT_ASSERT(NPcre::TPcre<wchar16>(u"ba+d").Matches(TWtringBuf(u"baaad")));` - -Here we see regular expression construction for UTF-16 charset: - -`NPcre::TPcre<wchar16>(u"ba+d")` - -and matching of the subject string `baaad` against this pattern: - + +Here we see regular expression construction for UTF-16 charset: + +`NPcre::TPcre<wchar16>(u"ba+d")` + +and matching of the subject string `baaad` against this pattern: + `.Matches(TWtringBuf(u"baaad"))`; - -Let's consider both of them in details. - -## Construction -`NPcre::TPcre` class accepts single template parameter: `TCharType`. Currently supported char types are `char`, `wchar16` and `wchar32`. Additional char types traits can be defined in `traits.h` - -Constructor accepts three arguments. Two of them are optional: -1. Zero-terminated string on characters with pattern -2. Optimization type. The default value is `NPcre::EOptimize::None` which means no pattern optimization. Another possible value is `NPcre::EOptimize::Study` which will take some time at construction stage but could give up to 4x speed boost. And the last but not the least is `NPcre::EOptimize::JIT` which performs JIT optimization which could take significant time but could give up to 10x speed boost. -3. Regular expressions compile flags. We don't want to reimplement every constant from PCRE library, so they are passed as they are. Full list of compile flags can be found [here](https://www.pcre.org/original/doc/html/pcre_compile2.html), but for most cases `PCRE_UTF8 | PCRE_UCP` will be enough. The default value is `0`. - -## Matching -{% note tip %} -Two words on PCRE workspaces. Workspace is memory area where PCRE stores information about back references and capturing groups. If passed workspace size is not enough, PCRE will allocate bigger workspace in heap. For simple matching and string searching of string without back references, workspace is not required and this library provides separate functions that won't waste space on workspace and this could save ≈0.5% of CPU TIME on simple patterns. -For regular expressions with capturing groups, recommended workspace size is `(capturing groups count + 1)`. -{% endnote %} - -In the example above matching function `Matches` returns boolean indicating that subject string matched pattern and accepts two arguments: -1. `TBasicStringBuf<TCharType>` with subject string -2. Regular expression execute flags. We don't want to reimplement every constant from PCRE library, so they are passed as they are. Full list of compile flags can be found [here](https://www.pcre.org/original/doc/html/pcre_exec.html). For most cases `0` will be just fine and this is the default value. - -## Searching -Function `Find` accepts the same arguments as `Match` and returns `TMaybe<NPcre::TPcreMatch>` which contains pair of ints with start and end offsets of string found. Check result for `Defined` to ensure that pattern was found in subject string. - -## Capturing -The last member function of `NPcre::TPcre` is `Capture` which searches for pattern and returns capturing group. - -### Return value -Return value is `NPcre::TPcreMatches` which is alias for `TVector<NPcre::TPcreMatch>`. -Vector will be empty if pattern wasn't found in subject string. -If pattern was found, first element will contain start and end offsets of string found. -All other elements will contains start and end offsets of capturing groups in order they appeared in regular expression. -{% note tip %} -If some capturing group not matched subject string, but some of consequent capturing groups did, this capturing group will present as `-1, -1` pair. -For example: calling `Capture` on pattern `(a)(?:(b)c|b(d))` against subject string `zabda` will return `[{1,4},{1,2},{-1,-1},{3,4}]` because capturing group `(b)` wasn't matched. -{% endnote %} -### Arguments -1. `TBasicStringBuf<TCharType>` with subject string -2. Regular expression execute flags. -3. Initial workspace size. Default value is `16` but if pattern contains more than 16 capturing groups, this function will reallocate workspace with bigger size. + +Let's consider both of them in details. + +## Construction +`NPcre::TPcre` class accepts single template parameter: `TCharType`. Currently supported char types are `char`, `wchar16` and `wchar32`. Additional char types traits can be defined in `traits.h` + +Constructor accepts three arguments. Two of them are optional: +1. Zero-terminated string on characters with pattern +2. Optimization type. The default value is `NPcre::EOptimize::None` which means no pattern optimization. Another possible value is `NPcre::EOptimize::Study` which will take some time at construction stage but could give up to 4x speed boost. And the last but not the least is `NPcre::EOptimize::JIT` which performs JIT optimization which could take significant time but could give up to 10x speed boost. +3. Regular expressions compile flags. We don't want to reimplement every constant from PCRE library, so they are passed as they are. Full list of compile flags can be found [here](https://www.pcre.org/original/doc/html/pcre_compile2.html), but for most cases `PCRE_UTF8 | PCRE_UCP` will be enough. The default value is `0`. + +## Matching +{% note tip %} +Two words on PCRE workspaces. Workspace is memory area where PCRE stores information about back references and capturing groups. If passed workspace size is not enough, PCRE will allocate bigger workspace in heap. For simple matching and string searching of string without back references, workspace is not required and this library provides separate functions that won't waste space on workspace and this could save ≈0.5% of CPU TIME on simple patterns. +For regular expressions with capturing groups, recommended workspace size is `(capturing groups count + 1)`. +{% endnote %} + +In the example above matching function `Matches` returns boolean indicating that subject string matched pattern and accepts two arguments: +1. `TBasicStringBuf<TCharType>` with subject string +2. Regular expression execute flags. We don't want to reimplement every constant from PCRE library, so they are passed as they are. Full list of compile flags can be found [here](https://www.pcre.org/original/doc/html/pcre_exec.html). For most cases `0` will be just fine and this is the default value. + +## Searching +Function `Find` accepts the same arguments as `Match` and returns `TMaybe<NPcre::TPcreMatch>` which contains pair of ints with start and end offsets of string found. Check result for `Defined` to ensure that pattern was found in subject string. + +## Capturing +The last member function of `NPcre::TPcre` is `Capture` which searches for pattern and returns capturing group. + +### Return value +Return value is `NPcre::TPcreMatches` which is alias for `TVector<NPcre::TPcreMatch>`. +Vector will be empty if pattern wasn't found in subject string. +If pattern was found, first element will contain start and end offsets of string found. +All other elements will contains start and end offsets of capturing groups in order they appeared in regular expression. +{% note tip %} +If some capturing group not matched subject string, but some of consequent capturing groups did, this capturing group will present as `-1, -1` pair. +For example: calling `Capture` on pattern `(a)(?:(b)c|b(d))` against subject string `zabda` will return `[{1,4},{1,2},{-1,-1},{3,4}]` because capturing group `(b)` wasn't matched. +{% endnote %} +### Arguments +1. `TBasicStringBuf<TCharType>` with subject string +2. Regular expression execute flags. +3. Initial workspace size. Default value is `16` but if pattern contains more than 16 capturing groups, this function will reallocate workspace with bigger size. diff --git a/library/cpp/regex/pcre/benchmark/main.cpp b/library/cpp/regex/pcre/benchmark/main.cpp index 3c11ef4f29..bf2687228f 100644 --- a/library/cpp/regex/pcre/benchmark/main.cpp +++ b/library/cpp/regex/pcre/benchmark/main.cpp @@ -1,80 +1,80 @@ -#include <benchmark/benchmark.h> - -#include <library/cpp/regex/pcre/pcre.h> - -#include <util/charset/wide.h> -#include <util/generic/strbuf.h> -#include <util/generic/string.h> -#include <util/generic/vector.h> - -static TStringBuf SimplePattern = "[-.\\w]+@(?:[a-z\\d]{2,}\\.)+[a-z]{2,6}"; -static TStringBuf ComplexPattern = R"((?:(?:\r\n)?[ \t])*(?:(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*)|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*:(?:(?:\r\n)?[ \t])*(?:(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*)(?:,\s*(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*))*)?;\s*))"; - -static constexpr size_t HaystacksCount = 32; -static constexpr size_t MinPrefix = 1024; - -static TVector<TString> GenerateHaystacks() { - // Generate long randomized haystacks to prevent cache hit - TVector<TString> result(Reserve(HaystacksCount)); - for (size_t i = 0; i < HaystacksCount; ++i) { - result.push_back(TString::Join(ComplexPattern.SubString(MinPrefix + i, ComplexPattern.Size() - MinPrefix - i), ComplexPattern.SubString(0, MinPrefix + i))); - } - return result; -} - -static const TVector<TString> Haystacks{GenerateHaystacks()}; - -static const NPcre::TPcre<char> Simple{SimplePattern.Data()}; -static const NPcre::TPcre<char> SimpleStudy{SimplePattern.Data(), NPcre::EOptimize::Study}; -static const NPcre::TPcre<char> SimpleJIT{SimplePattern.Data(), NPcre::EOptimize::JIT}; -static const NPcre::TPcre<char> Complex{ComplexPattern.Data()}; -static const NPcre::TPcre<char> ComplexStudy{ComplexPattern.Data(), NPcre::EOptimize::Study}; -static const NPcre::TPcre<char> ComplexJIT{ComplexPattern.Data(), NPcre::EOptimize::JIT}; - -static void Benchmark(benchmark::State& state, const NPcre::TPcre<char>& pattern) { - for (auto _ : state) { - for (size_t i = 0; i < HaystacksCount; ++i) { - // Force string reallocation, so there will be no chance for cache hit of any type - benchmark::DoNotOptimize(pattern.Matches(TString{i, 'a'} + Haystacks[i])); - } - } -} - -static void BenchmarkSimplePatternJIT(benchmark::State& state) { - Benchmark(state, SimpleJIT); -} - -static void BenchmarkSimplePatternStudy(benchmark::State& state) { - Benchmark(state, SimpleStudy); -} - -static void BenchmarkSimplePattern(benchmark::State& state) { - Benchmark(state, Simple); -} - -BENCHMARK(BenchmarkSimplePatternJIT)->Iterations(1); -BENCHMARK(BenchmarkSimplePatternStudy)->Iterations(1); -BENCHMARK(BenchmarkSimplePattern)->Iterations(1); -BENCHMARK(BenchmarkSimplePatternJIT); -BENCHMARK(BenchmarkSimplePatternStudy); -BENCHMARK(BenchmarkSimplePattern); - -static void BenchmarkComplexPatternJIT(benchmark::State& state) { - Benchmark(state, ComplexJIT); -} - -static void BenchmarkComplexPatternStudy(benchmark::State& state) { - Benchmark(state, ComplexStudy); -} - -static void BenchmarkComplexPattern(benchmark::State& state) { - Benchmark(state, Complex); -} - -BENCHMARK(BenchmarkComplexPatternJIT)->Iterations(1); -BENCHMARK(BenchmarkComplexPatternStudy)->Iterations(1); -BENCHMARK(BenchmarkComplexPattern)->Iterations(1); -BENCHMARK(BenchmarkComplexPatternJIT); -BENCHMARK(BenchmarkComplexPatternStudy); -BENCHMARK(BenchmarkComplexPattern); - +#include <benchmark/benchmark.h> + +#include <library/cpp/regex/pcre/pcre.h> + +#include <util/charset/wide.h> +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> + +static TStringBuf SimplePattern = "[-.\\w]+@(?:[a-z\\d]{2,}\\.)+[a-z]{2,6}"; +static TStringBuf ComplexPattern = R"((?:(?:\r\n)?[ \t])*(?:(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*)|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*:(?:(?:\r\n)?[ \t])*(?:(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*)(?:,\s*(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[(?:[^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*))*)?;\s*))"; + +static constexpr size_t HaystacksCount = 32; +static constexpr size_t MinPrefix = 1024; + +static TVector<TString> GenerateHaystacks() { + // Generate long randomized haystacks to prevent cache hit + TVector<TString> result(Reserve(HaystacksCount)); + for (size_t i = 0; i < HaystacksCount; ++i) { + result.push_back(TString::Join(ComplexPattern.SubString(MinPrefix + i, ComplexPattern.Size() - MinPrefix - i), ComplexPattern.SubString(0, MinPrefix + i))); + } + return result; +} + +static const TVector<TString> Haystacks{GenerateHaystacks()}; + +static const NPcre::TPcre<char> Simple{SimplePattern.Data()}; +static const NPcre::TPcre<char> SimpleStudy{SimplePattern.Data(), NPcre::EOptimize::Study}; +static const NPcre::TPcre<char> SimpleJIT{SimplePattern.Data(), NPcre::EOptimize::JIT}; +static const NPcre::TPcre<char> Complex{ComplexPattern.Data()}; +static const NPcre::TPcre<char> ComplexStudy{ComplexPattern.Data(), NPcre::EOptimize::Study}; +static const NPcre::TPcre<char> ComplexJIT{ComplexPattern.Data(), NPcre::EOptimize::JIT}; + +static void Benchmark(benchmark::State& state, const NPcre::TPcre<char>& pattern) { + for (auto _ : state) { + for (size_t i = 0; i < HaystacksCount; ++i) { + // Force string reallocation, so there will be no chance for cache hit of any type + benchmark::DoNotOptimize(pattern.Matches(TString{i, 'a'} + Haystacks[i])); + } + } +} + +static void BenchmarkSimplePatternJIT(benchmark::State& state) { + Benchmark(state, SimpleJIT); +} + +static void BenchmarkSimplePatternStudy(benchmark::State& state) { + Benchmark(state, SimpleStudy); +} + +static void BenchmarkSimplePattern(benchmark::State& state) { + Benchmark(state, Simple); +} + +BENCHMARK(BenchmarkSimplePatternJIT)->Iterations(1); +BENCHMARK(BenchmarkSimplePatternStudy)->Iterations(1); +BENCHMARK(BenchmarkSimplePattern)->Iterations(1); +BENCHMARK(BenchmarkSimplePatternJIT); +BENCHMARK(BenchmarkSimplePatternStudy); +BENCHMARK(BenchmarkSimplePattern); + +static void BenchmarkComplexPatternJIT(benchmark::State& state) { + Benchmark(state, ComplexJIT); +} + +static void BenchmarkComplexPatternStudy(benchmark::State& state) { + Benchmark(state, ComplexStudy); +} + +static void BenchmarkComplexPattern(benchmark::State& state) { + Benchmark(state, Complex); +} + +BENCHMARK(BenchmarkComplexPatternJIT)->Iterations(1); +BENCHMARK(BenchmarkComplexPatternStudy)->Iterations(1); +BENCHMARK(BenchmarkComplexPattern)->Iterations(1); +BENCHMARK(BenchmarkComplexPatternJIT); +BENCHMARK(BenchmarkComplexPatternStudy); +BENCHMARK(BenchmarkComplexPattern); + diff --git a/library/cpp/regex/pcre/benchmark/ya.make b/library/cpp/regex/pcre/benchmark/ya.make index 7c30fae0a6..3cef5e14b9 100644 --- a/library/cpp/regex/pcre/benchmark/ya.make +++ b/library/cpp/regex/pcre/benchmark/ya.make @@ -1,14 +1,14 @@ -G_BENCHMARK() - -OWNER(g:so) - -PEERDIR( - library/cpp/regex/pcre -) - -SRCS( - main.cpp -) - -END() - +G_BENCHMARK() + +OWNER(g:so) + +PEERDIR( + library/cpp/regex/pcre +) + +SRCS( + main.cpp +) + +END() + diff --git a/library/cpp/regex/pcre/pcre.cpp b/library/cpp/regex/pcre/pcre.cpp index 9e97d5f8f7..345667890c 100644 --- a/library/cpp/regex/pcre/pcre.cpp +++ b/library/cpp/regex/pcre/pcre.cpp @@ -1 +1 @@ -#include "pcre.h" +#include "pcre.h" diff --git a/library/cpp/regex/pcre/pcre.h b/library/cpp/regex/pcre/pcre.h index 82a9774f00..b19aa9a5b9 100644 --- a/library/cpp/regex/pcre/pcre.h +++ b/library/cpp/regex/pcre/pcre.h @@ -1,191 +1,191 @@ -#pragma once - -#include "traits.h" - -#include <library/cpp/containers/stack_array/stack_array.h> - -#include <util/generic/maybe.h> -#include <util/generic/strbuf.h> -#include <util/generic/vector.h> -#include <util/generic/yexception.h> - -namespace NPcre { - //! Start and end offset for match group. - using TPcreMatch = std::pair<int, int>; - - //! Full match result containing all capturing groups. - /*! - * At zero index we have whole matched string start and end offsets. - * All other elements will contain capturing groups positions. - * Non-captured capturing groups will have {-1, -1} offsets. - */ - using TPcreMatches = TVector<TPcreMatch>; - - //! Compiled pattern optimization strategy. - enum class EOptimize { - //! No optimization. - /*! - * Useful for non-reusable patterns where compile time matters. - */ - None, - //! Basic optimization via |pcre_study|. - /*! - * Could give up to 4x match speed boost in exchange of increased - * construction time. Could not. - */ - Study, - //! PCRE JIT optimization. - /*! - * Could give up to 10x match speed bust in exchange of significantly - * increased compile time. Also, for very complex patterns |pcre_exec| - * could return |PCRE_ERROR_JIT_STACKLIMIT|. See - * https://www.pcre.org/original/doc/html/pcrejit.html for details. - */ - JIT - }; - - //! PCRE code container. Controls its life time and provides handy wrapper. - template <class TCharType> - class TPcre { - private: - using TCodeType = typename TPcreTraits<TCharType>::TCodeType; - using TExtraType = typename TPcreTraits<TCharType>::TExtraType; - using TStringType = typename TPcreTraits<TCharType>::TStringType; - using TTraits = TPcreTraits<TCharType>; - static constexpr size_t DefaultWorkspaceSize = 16; - - public: - //! Compiles regexp into internal representation for future use. - /*! - * \param pattern Regular expression to be compiled. - * \param optimize If |EOptimize::JIT|, perform additional - * analysis, which will take extra time, but could - * speed up matching. |None| to omit optimization. - * \param compileFlags See https://www.pcre.org/original/doc/html/pcre_compile2.html - **/ - TPcre(const TCharType* pattern, EOptimize optimize = EOptimize::None, int compileFlags = 0) { - int errcode; - const char* errptr; - int erroffset; - Code.Reset(TTraits::Compile((TStringType) pattern, compileFlags, &errcode, &errptr, &erroffset, nullptr)); - if (!Code) { - ythrow yexception() << "Failed to compile pattern <" << pattern - << ">, because of error at pos " << erroffset - << ", error code " << errcode << ": " << errptr; - } - if (optimize != EOptimize::None) { - errptr = nullptr; - int options; - if (optimize == EOptimize::Study) { - options = 0; - } else { - options = PCRE_STUDY_JIT_COMPILE; - } - Extra.Reset(TTraits::Study(Code.Get(), options, &errptr)); - if (errptr) { - ythrow yexception() << "Failed to study pattern <" << pattern << ">: " << errptr; - } - } - } - - //! Check if compiled pattern matches string. - /*! - * \param string String to search in. - * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html - * \param workspaceSize Amount of space which will be allocated for - * back references. PCRE could allocate more - * heap space is provided workspaceSize won't - * fit all of them. - * \returns |true| if there is a match. - */ - bool Matches(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const { - Y_ASSERT(workspaceSize >= 0); - size_t ovecsize = workspaceSize * 3; - NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); - return ConvertReturnCode(TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize)); - } - - //! Find compiled pattern in string. - /*! - * \param string String to search in. - * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html - * \param workspaceSize Amount of space which will be allocated for - * back references. PCRE could allocate more - * heap space is provided workspaceSize won't - * fit all of them. - * \returns Start and end offsets pair if there is a - * match. |Nothing| otherwise. - */ - Y_NO_SANITIZE("memory") TMaybe<TPcreMatch> Find(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const { - Y_ASSERT(workspaceSize >= 0); - size_t ovecsize = workspaceSize * 3; - NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); - for (size_t i = 0; i < ovecsize; ++i) { - ovector[i] = -4; - } - int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize); - if (ConvertReturnCode(rc)) { - return MakeMaybe<TPcreMatch>(ovector[0], ovector[1]); - } else { - return Nothing(); - } - } - - //! Find and return all capturing groups in string. - /*! - * \param string String to search in. - * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html - * \param initialWorkspaceSize Capturing groups vector initial size. - * Workspace will be grown and search will - * be repeated if there is not enough - * space. - * \returns List of capturing groups start and end - * offsets. First element will contain - * whole matched substring start and end - * offsets. For non-matched capturing - * groups, result will contain {-1, -1} - * pair. - * If pattern not found in string, result - * vector will be empty. - */ - Y_NO_SANITIZE("memory") TPcreMatches Capture(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t initialWorkspaceSize = DefaultWorkspaceSize) const { - Y_ASSERT(initialWorkspaceSize > 0); - size_t ovecsize = (initialWorkspaceSize + 1) * 3; - while (true) { - NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); - int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize); - if (rc > 0) { - TPcreMatches result(Reserve(rc >> 1)); - for (int i = 0, pos = 0; i < rc; ++i) { - int start = ovector[pos++]; - int end = ovector[pos++]; - result.emplace_back(start, end); - } - return result; - } else if (rc == 0) { - ovecsize <<= 1; - } else if (rc == PCRE_ERROR_NOMATCH) { - return TPcreMatches{}; - } else if (rc < 0) { - ythrow yexception() << "Error. RC = " << rc; - } - } - } - - private: - TPcreCode<TCharType> Code; - TPcreExtra<TCharType> Extra; - - private: - static inline bool ConvertReturnCode(int rc) { - if (rc >= 0) { - return true; - } else if (rc == PCRE_ERROR_NOMATCH) { - return false; - } else { - ythrow yexception() << "Error. RC = " << rc; - } - } - }; -} - +#pragma once + +#include "traits.h" + +#include <library/cpp/containers/stack_array/stack_array.h> + +#include <util/generic/maybe.h> +#include <util/generic/strbuf.h> +#include <util/generic/vector.h> +#include <util/generic/yexception.h> + +namespace NPcre { + //! Start and end offset for match group. + using TPcreMatch = std::pair<int, int>; + + //! Full match result containing all capturing groups. + /*! + * At zero index we have whole matched string start and end offsets. + * All other elements will contain capturing groups positions. + * Non-captured capturing groups will have {-1, -1} offsets. + */ + using TPcreMatches = TVector<TPcreMatch>; + + //! Compiled pattern optimization strategy. + enum class EOptimize { + //! No optimization. + /*! + * Useful for non-reusable patterns where compile time matters. + */ + None, + //! Basic optimization via |pcre_study|. + /*! + * Could give up to 4x match speed boost in exchange of increased + * construction time. Could not. + */ + Study, + //! PCRE JIT optimization. + /*! + * Could give up to 10x match speed bust in exchange of significantly + * increased compile time. Also, for very complex patterns |pcre_exec| + * could return |PCRE_ERROR_JIT_STACKLIMIT|. See + * https://www.pcre.org/original/doc/html/pcrejit.html for details. + */ + JIT + }; + + //! PCRE code container. Controls its life time and provides handy wrapper. + template <class TCharType> + class TPcre { + private: + using TCodeType = typename TPcreTraits<TCharType>::TCodeType; + using TExtraType = typename TPcreTraits<TCharType>::TExtraType; + using TStringType = typename TPcreTraits<TCharType>::TStringType; + using TTraits = TPcreTraits<TCharType>; + static constexpr size_t DefaultWorkspaceSize = 16; + + public: + //! Compiles regexp into internal representation for future use. + /*! + * \param pattern Regular expression to be compiled. + * \param optimize If |EOptimize::JIT|, perform additional + * analysis, which will take extra time, but could + * speed up matching. |None| to omit optimization. + * \param compileFlags See https://www.pcre.org/original/doc/html/pcre_compile2.html + **/ + TPcre(const TCharType* pattern, EOptimize optimize = EOptimize::None, int compileFlags = 0) { + int errcode; + const char* errptr; + int erroffset; + Code.Reset(TTraits::Compile((TStringType) pattern, compileFlags, &errcode, &errptr, &erroffset, nullptr)); + if (!Code) { + ythrow yexception() << "Failed to compile pattern <" << pattern + << ">, because of error at pos " << erroffset + << ", error code " << errcode << ": " << errptr; + } + if (optimize != EOptimize::None) { + errptr = nullptr; + int options; + if (optimize == EOptimize::Study) { + options = 0; + } else { + options = PCRE_STUDY_JIT_COMPILE; + } + Extra.Reset(TTraits::Study(Code.Get(), options, &errptr)); + if (errptr) { + ythrow yexception() << "Failed to study pattern <" << pattern << ">: " << errptr; + } + } + } + + //! Check if compiled pattern matches string. + /*! + * \param string String to search in. + * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html + * \param workspaceSize Amount of space which will be allocated for + * back references. PCRE could allocate more + * heap space is provided workspaceSize won't + * fit all of them. + * \returns |true| if there is a match. + */ + bool Matches(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const { + Y_ASSERT(workspaceSize >= 0); + size_t ovecsize = workspaceSize * 3; + NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); + return ConvertReturnCode(TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize)); + } + + //! Find compiled pattern in string. + /*! + * \param string String to search in. + * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html + * \param workspaceSize Amount of space which will be allocated for + * back references. PCRE could allocate more + * heap space is provided workspaceSize won't + * fit all of them. + * \returns Start and end offsets pair if there is a + * match. |Nothing| otherwise. + */ + Y_NO_SANITIZE("memory") TMaybe<TPcreMatch> Find(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t workspaceSize = DefaultWorkspaceSize) const { + Y_ASSERT(workspaceSize >= 0); + size_t ovecsize = workspaceSize * 3; + NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); + for (size_t i = 0; i < ovecsize; ++i) { + ovector[i] = -4; + } + int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize); + if (ConvertReturnCode(rc)) { + return MakeMaybe<TPcreMatch>(ovector[0], ovector[1]); + } else { + return Nothing(); + } + } + + //! Find and return all capturing groups in string. + /*! + * \param string String to search in. + * \param executeFlags See https://www.pcre.org/original/doc/html/pcre_exec.html + * \param initialWorkspaceSize Capturing groups vector initial size. + * Workspace will be grown and search will + * be repeated if there is not enough + * space. + * \returns List of capturing groups start and end + * offsets. First element will contain + * whole matched substring start and end + * offsets. For non-matched capturing + * groups, result will contain {-1, -1} + * pair. + * If pattern not found in string, result + * vector will be empty. + */ + Y_NO_SANITIZE("memory") TPcreMatches Capture(TBasicStringBuf<TCharType> string, int executeFlags = 0, size_t initialWorkspaceSize = DefaultWorkspaceSize) const { + Y_ASSERT(initialWorkspaceSize > 0); + size_t ovecsize = (initialWorkspaceSize + 1) * 3; + while (true) { + NStackArray::TStackArray<int> ovector(ALLOC_ON_STACK(int, ovecsize)); + int rc = TTraits::Exec(Code.Get(), Extra.Get(), (TStringType) string.Data(), string.Size(), 0, executeFlags, ovector.data(), ovecsize); + if (rc > 0) { + TPcreMatches result(Reserve(rc >> 1)); + for (int i = 0, pos = 0; i < rc; ++i) { + int start = ovector[pos++]; + int end = ovector[pos++]; + result.emplace_back(start, end); + } + return result; + } else if (rc == 0) { + ovecsize <<= 1; + } else if (rc == PCRE_ERROR_NOMATCH) { + return TPcreMatches{}; + } else if (rc < 0) { + ythrow yexception() << "Error. RC = " << rc; + } + } + } + + private: + TPcreCode<TCharType> Code; + TPcreExtra<TCharType> Extra; + + private: + static inline bool ConvertReturnCode(int rc) { + if (rc >= 0) { + return true; + } else if (rc == PCRE_ERROR_NOMATCH) { + return false; + } else { + ythrow yexception() << "Error. RC = " << rc; + } + } + }; +} + diff --git a/library/cpp/regex/pcre/pcre_ut.cpp b/library/cpp/regex/pcre/pcre_ut.cpp index 84d06499ae..ad5535f96d 100644 --- a/library/cpp/regex/pcre/pcre_ut.cpp +++ b/library/cpp/regex/pcre/pcre_ut.cpp @@ -1,89 +1,89 @@ -#include <library/cpp/regex/pcre/pcre.h> - -#include <library/cpp/testing/unittest/registar.h> - -template <class T> -inline IOutputStream& operator<<(IOutputStream& out, const TVector<T>& value) { - size_t size = value.size(); - out << "["; - for (size_t i = 0; i < size; ++i) { - if (i) { - out << ","; - } - out << value[i]; - } - out << "]"; - return out; -} - -template <class T, class U> -inline IOutputStream& operator<<(IOutputStream& out, const std::pair<T, U>& value) { - out << "{" << value.first << "," << value.second << "}"; - return out; -} - -// char8_t -#define OPTIMIZE NPcre::EOptimize::None -#define TEST_NAME(S) S -#define STRING(S) S -#define CHAR_TYPE char -#include "pcre_ut_base.h" - -#undef OPTIMIZE -#define OPTIMIZE NPcre::EOptimize::Study -#undef TEST_NAME -#define TEST_NAME(S) S ## Study -#include "pcre_ut_base.h" - -#undef OPTIMIZE -#define OPTIMIZE NPcre::EOptimize::JIT -#undef TEST_NAME -#define TEST_NAME(S) S ## JIT -#include "pcre_ut_base.h" - -// char16_t -#undef OPTIMIZE -#define OPTIMIZE NPcre::EOptimize::None -#undef TEST_NAME -#define TEST_NAME(S) S ## 16 -#undef STRING -#define STRING(S) u ## S -#undef CHAR_TYPE -#define CHAR_TYPE wchar16 -#include "pcre_ut_base.h" - -#undef OPTIMIZE -#define OPTIMIZE NPcre::EOptimize::Study -#undef TEST_NAME -#define TEST_NAME(S) S ## Study16 -#include "pcre_ut_base.h" - -#undef OPTIMIZE -#define OPTIMIZE NPcre::EOptimize::JIT -#undef TEST_NAME -#define TEST_NAME(S) S ## JIT16 -#include "pcre_ut_base.h" - -// char32_t -#undef OPTIMIZE -#define OPTIMIZE NPcre::EOptimize::None -#undef TEST_NAME -#define TEST_NAME(S) S ## 32 -#undef STRING -#define STRING(S) U ## S -#undef CHAR_TYPE -#define CHAR_TYPE wchar32 -#include "pcre_ut_base.h" - -#undef OPTIMIZE -#define OPTIMIZE NPcre::EOptimize::Study -#undef TEST_NAME -#define TEST_NAME(S) S ## Study32 -#include "pcre_ut_base.h" - -#undef OPTIMIZE -#define OPTIMIZE NPcre::EOptimize::JIT -#undef TEST_NAME -#define TEST_NAME(S) S ## JIT32 -#include "pcre_ut_base.h" - +#include <library/cpp/regex/pcre/pcre.h> + +#include <library/cpp/testing/unittest/registar.h> + +template <class T> +inline IOutputStream& operator<<(IOutputStream& out, const TVector<T>& value) { + size_t size = value.size(); + out << "["; + for (size_t i = 0; i < size; ++i) { + if (i) { + out << ","; + } + out << value[i]; + } + out << "]"; + return out; +} + +template <class T, class U> +inline IOutputStream& operator<<(IOutputStream& out, const std::pair<T, U>& value) { + out << "{" << value.first << "," << value.second << "}"; + return out; +} + +// char8_t +#define OPTIMIZE NPcre::EOptimize::None +#define TEST_NAME(S) S +#define STRING(S) S +#define CHAR_TYPE char +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::Study +#undef TEST_NAME +#define TEST_NAME(S) S ## Study +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::JIT +#undef TEST_NAME +#define TEST_NAME(S) S ## JIT +#include "pcre_ut_base.h" + +// char16_t +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::None +#undef TEST_NAME +#define TEST_NAME(S) S ## 16 +#undef STRING +#define STRING(S) u ## S +#undef CHAR_TYPE +#define CHAR_TYPE wchar16 +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::Study +#undef TEST_NAME +#define TEST_NAME(S) S ## Study16 +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::JIT +#undef TEST_NAME +#define TEST_NAME(S) S ## JIT16 +#include "pcre_ut_base.h" + +// char32_t +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::None +#undef TEST_NAME +#define TEST_NAME(S) S ## 32 +#undef STRING +#define STRING(S) U ## S +#undef CHAR_TYPE +#define CHAR_TYPE wchar32 +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::Study +#undef TEST_NAME +#define TEST_NAME(S) S ## Study32 +#include "pcre_ut_base.h" + +#undef OPTIMIZE +#define OPTIMIZE NPcre::EOptimize::JIT +#undef TEST_NAME +#define TEST_NAME(S) S ## JIT32 +#include "pcre_ut_base.h" + diff --git a/library/cpp/regex/pcre/pcre_ut_base.h b/library/cpp/regex/pcre/pcre_ut_base.h index 1d61d07b14..1e15319b90 100644 --- a/library/cpp/regex/pcre/pcre_ut_base.h +++ b/library/cpp/regex/pcre/pcre_ut_base.h @@ -1,38 +1,38 @@ -#define CHECK_MATCHES(EXPECTED, PATTERN, STR) \ - UNIT_ASSERT(EXPECTED == NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Matches(STRING(STR))); \ - UNIT_ASSERT(EXPECTED == NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Matches(STRING(STR), 0, 10)); - -#define CHECK(A, B) UNIT_ASSERT_STRINGS_EQUAL(ToString(STRING(A)), ToString(B)) - -#define CHECK_GROUPS(EXPECTED, PATTERN, STR) \ - CHECK(EXPECTED, NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Find(STRING(STR))); \ - CHECK(EXPECTED, NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Find(STRING(STR), 0, 10)); - -Y_UNIT_TEST_SUITE(TEST_NAME(TestRegExp)) { - Y_UNIT_TEST(TestMatches) { - CHECK_MATCHES(true, "ю", "bюd"); - CHECK_MATCHES(false, "c", "bюd"); - CHECK_MATCHES(true, "(ю)(?:(b)c|bd)", "zюbda"); - CHECK_MATCHES(false, "(ю)(?:(b)c|bd)", "bюd"); - CHECK_MATCHES(true, "(abc|def)=\\g1", "abc=abc"); - CHECK_MATCHES(true, "(abc|def)=\\g1", "def=def"); - CHECK_MATCHES(false, "(abc|def)=\\g1", "abc=def"); - } - - Y_UNIT_TEST(TestGroups) { - CHECK_GROUPS("{1,2}", "a", "bad"); - CHECK_GROUPS("(empty maybe)", "c", "bad"); - CHECK_GROUPS("{1,4}", "(a)(?:(b)c|bd)", "zabda"); - CHECK_GROUPS("(empty maybe)", "(a)(?:(b)c|bd)", "bad"); - CHECK_GROUPS("{1,8}", "(abc|def)=\\g1", "aabc=abca"); - CHECK_GROUPS("(empty maybe)", "(abc|def)=\\g1", "abc=def"); - } - - Y_UNIT_TEST(TestCapture) { - CHECK("[{1,2}]",NPcre::TPcre<CHAR_TYPE>(STRING("a"), OPTIMIZE).Capture(STRING("bad"), 0, 1)); - CHECK("[]",NPcre::TPcre<CHAR_TYPE>(STRING("c"), OPTIMIZE).Capture(STRING("bad"), 0, 1)); - CHECK("[{1,4},{1,2},{-1,-1},{3,4}]",NPcre::TPcre<CHAR_TYPE>(STRING("(a)(?:(b)c|b(d))"), OPTIMIZE).Capture(STRING("zabda"), 0, 1)); - CHECK("[]",NPcre::TPcre<CHAR_TYPE>(STRING("(a)(?:(b)c|bd)"), OPTIMIZE).Capture(STRING("bad"), 0, 1)); - } -} - +#define CHECK_MATCHES(EXPECTED, PATTERN, STR) \ + UNIT_ASSERT(EXPECTED == NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Matches(STRING(STR))); \ + UNIT_ASSERT(EXPECTED == NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Matches(STRING(STR), 0, 10)); + +#define CHECK(A, B) UNIT_ASSERT_STRINGS_EQUAL(ToString(STRING(A)), ToString(B)) + +#define CHECK_GROUPS(EXPECTED, PATTERN, STR) \ + CHECK(EXPECTED, NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Find(STRING(STR))); \ + CHECK(EXPECTED, NPcre::TPcre<CHAR_TYPE>(STRING(PATTERN), OPTIMIZE).Find(STRING(STR), 0, 10)); + +Y_UNIT_TEST_SUITE(TEST_NAME(TestRegExp)) { + Y_UNIT_TEST(TestMatches) { + CHECK_MATCHES(true, "ю", "bюd"); + CHECK_MATCHES(false, "c", "bюd"); + CHECK_MATCHES(true, "(ю)(?:(b)c|bd)", "zюbda"); + CHECK_MATCHES(false, "(ю)(?:(b)c|bd)", "bюd"); + CHECK_MATCHES(true, "(abc|def)=\\g1", "abc=abc"); + CHECK_MATCHES(true, "(abc|def)=\\g1", "def=def"); + CHECK_MATCHES(false, "(abc|def)=\\g1", "abc=def"); + } + + Y_UNIT_TEST(TestGroups) { + CHECK_GROUPS("{1,2}", "a", "bad"); + CHECK_GROUPS("(empty maybe)", "c", "bad"); + CHECK_GROUPS("{1,4}", "(a)(?:(b)c|bd)", "zabda"); + CHECK_GROUPS("(empty maybe)", "(a)(?:(b)c|bd)", "bad"); + CHECK_GROUPS("{1,8}", "(abc|def)=\\g1", "aabc=abca"); + CHECK_GROUPS("(empty maybe)", "(abc|def)=\\g1", "abc=def"); + } + + Y_UNIT_TEST(TestCapture) { + CHECK("[{1,2}]",NPcre::TPcre<CHAR_TYPE>(STRING("a"), OPTIMIZE).Capture(STRING("bad"), 0, 1)); + CHECK("[]",NPcre::TPcre<CHAR_TYPE>(STRING("c"), OPTIMIZE).Capture(STRING("bad"), 0, 1)); + CHECK("[{1,4},{1,2},{-1,-1},{3,4}]",NPcre::TPcre<CHAR_TYPE>(STRING("(a)(?:(b)c|b(d))"), OPTIMIZE).Capture(STRING("zabda"), 0, 1)); + CHECK("[]",NPcre::TPcre<CHAR_TYPE>(STRING("(a)(?:(b)c|bd)"), OPTIMIZE).Capture(STRING("bad"), 0, 1)); + } +} + diff --git a/library/cpp/regex/pcre/traits.h b/library/cpp/regex/pcre/traits.h index e926bdd758..c117ffbd50 100644 --- a/library/cpp/regex/pcre/traits.h +++ b/library/cpp/regex/pcre/traits.h @@ -1,99 +1,99 @@ -#pragma once - -#include <contrib/libs/pcre/pcre.h> - -#include <util/generic/ptr.h> // THolder -#include <util/system/types.h> // wchar16, wchar32 - -namespace NPcre { - template <class TCharType> - struct TPcreTraits; - - template <> - struct TPcreTraits<char> { - using TCharType = char; - using TStringType = const char*; - using TCodeType = pcre; - using TExtraType = pcre_extra; - static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre_compile2; - static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre_study; - static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre_exec; - }; - - template <> - struct TPcreTraits<wchar16> { - using TCharType = wchar16; - using TStringType = PCRE_SPTR16; - using TCodeType = pcre16; - using TExtraType = pcre16_extra; - static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre16_compile2; - static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre16_study; - static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre16_exec; - }; - - template <> - struct TPcreTraits<wchar32> { - using TCharType = wchar32; - using TStringType = PCRE_SPTR32; - using TCodeType = pcre32; - using TExtraType = pcre32_extra; - static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre32_compile2; - static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre32_study; - static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre32_exec; - }; - - template <class TCharType> - struct TFreePcre; - - template <> - struct TFreePcre<char> { - static inline void Destroy(void* ptr) noexcept { - pcre_free(ptr); - } - }; - - template <> - struct TFreePcre<wchar16> { - static inline void Destroy(void* ptr) noexcept { - pcre16_free(ptr); - } - }; - - template <> - struct TFreePcre<wchar32> { - static inline void Destroy(void* ptr) noexcept { - pcre32_free(ptr); - } - }; - - template <class TCharType> - struct TFreePcreExtra; - - template <> - struct TFreePcreExtra<char> { - static inline void Destroy(pcre_extra* ptr) noexcept { - pcre_free_study(ptr); - } - }; - - template <> - struct TFreePcreExtra<wchar16> { - static inline void Destroy(pcre16_extra* ptr) noexcept { - pcre16_free_study(ptr); - } - }; - - template <> - struct TFreePcreExtra<wchar32> { - static inline void Destroy(pcre32_extra* ptr) noexcept { - pcre32_free_study(ptr); - } - }; - - template <typename TCharType> - using TPcreCode = THolder<typename TPcreTraits<TCharType>::TCodeType, TFreePcre<TCharType>>; - - template <typename TCharType> - using TPcreExtra = THolder<typename TPcreTraits<TCharType>::TExtraType, TFreePcreExtra<TCharType>>; -} - +#pragma once + +#include <contrib/libs/pcre/pcre.h> + +#include <util/generic/ptr.h> // THolder +#include <util/system/types.h> // wchar16, wchar32 + +namespace NPcre { + template <class TCharType> + struct TPcreTraits; + + template <> + struct TPcreTraits<char> { + using TCharType = char; + using TStringType = const char*; + using TCodeType = pcre; + using TExtraType = pcre_extra; + static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre_compile2; + static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre_study; + static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre_exec; + }; + + template <> + struct TPcreTraits<wchar16> { + using TCharType = wchar16; + using TStringType = PCRE_SPTR16; + using TCodeType = pcre16; + using TExtraType = pcre16_extra; + static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre16_compile2; + static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre16_study; + static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre16_exec; + }; + + template <> + struct TPcreTraits<wchar32> { + using TCharType = wchar32; + using TStringType = PCRE_SPTR32; + using TCodeType = pcre32; + using TExtraType = pcre32_extra; + static constexpr TCodeType* (*Compile)(TStringType pattern, int options, int* errcodeptr, const char** errptr, int* erroffset, const unsigned char* tableptr) = pcre32_compile2; + static constexpr TExtraType* (*Study)(const TCodeType* pattern, int options, const char** errptr) = pcre32_study; + static constexpr int (*Exec)(const TCodeType* code, const TExtraType* extra, TStringType str, int length, int startoffset, int options, int* ovector, int ovecsize) = pcre32_exec; + }; + + template <class TCharType> + struct TFreePcre; + + template <> + struct TFreePcre<char> { + static inline void Destroy(void* ptr) noexcept { + pcre_free(ptr); + } + }; + + template <> + struct TFreePcre<wchar16> { + static inline void Destroy(void* ptr) noexcept { + pcre16_free(ptr); + } + }; + + template <> + struct TFreePcre<wchar32> { + static inline void Destroy(void* ptr) noexcept { + pcre32_free(ptr); + } + }; + + template <class TCharType> + struct TFreePcreExtra; + + template <> + struct TFreePcreExtra<char> { + static inline void Destroy(pcre_extra* ptr) noexcept { + pcre_free_study(ptr); + } + }; + + template <> + struct TFreePcreExtra<wchar16> { + static inline void Destroy(pcre16_extra* ptr) noexcept { + pcre16_free_study(ptr); + } + }; + + template <> + struct TFreePcreExtra<wchar32> { + static inline void Destroy(pcre32_extra* ptr) noexcept { + pcre32_free_study(ptr); + } + }; + + template <typename TCharType> + using TPcreCode = THolder<typename TPcreTraits<TCharType>::TCodeType, TFreePcre<TCharType>>; + + template <typename TCharType> + using TPcreExtra = THolder<typename TPcreTraits<TCharType>::TExtraType, TFreePcreExtra<TCharType>>; +} + diff --git a/library/cpp/regex/pcre/ut/ya.make b/library/cpp/regex/pcre/ut/ya.make index 0721ef87c2..03e44a03ec 100644 --- a/library/cpp/regex/pcre/ut/ya.make +++ b/library/cpp/regex/pcre/ut/ya.make @@ -3,7 +3,7 @@ UNITTEST_FOR(library/cpp/regex/pcre) OWNER(g:util) SRCS( - pcre_ut.cpp + pcre_ut.cpp regexp_ut.cpp ) diff --git a/library/cpp/regex/pcre/ya.make b/library/cpp/regex/pcre/ya.make index d34911f103..2b7aac0e97 100644 --- a/library/cpp/regex/pcre/ya.make +++ b/library/cpp/regex/pcre/ya.make @@ -4,20 +4,20 @@ OWNER(g:util) PEERDIR( contrib/libs/pcre - contrib/libs/pcre/pcre16 - contrib/libs/pcre/pcre32 - library/cpp/containers/stack_array + contrib/libs/pcre/pcre16 + contrib/libs/pcre/pcre32 + library/cpp/containers/stack_array ) SRCS( - pcre.cpp + pcre.cpp regexp.cpp ) END() - -RECURSE_FOR_TESTS( - benchmark - ut -) - + +RECURSE_FOR_TESTS( + benchmark + ut +) + diff --git a/library/cpp/regex/pire/regexp.h b/library/cpp/regex/pire/regexp.h index 94bba4064b..a517b47b71 100644 --- a/library/cpp/regex/pire/regexp.h +++ b/library/cpp/regex/pire/regexp.h @@ -5,36 +5,36 @@ #include <library/cpp/charset/doccodes.h> #include <library/cpp/charset/recyr.hh> #include <util/generic/maybe.h> -#include <util/generic/strbuf.h> +#include <util/generic/strbuf.h> #include <util/generic/string.h> -#include <util/generic/vector.h> -#include <util/generic/yexception.h> - +#include <util/generic/vector.h> +#include <util/generic/yexception.h> + namespace NRegExp { struct TMatcher; - struct TFsmBase { - struct TOptions { + struct TFsmBase { + struct TOptions { inline TOptions& SetCaseInsensitive(bool v) noexcept { - CaseInsensitive = v; - return *this; - } + CaseInsensitive = v; + return *this; + } inline TOptions& SetSurround(bool v) noexcept { - Surround = v; - return *this; - } + Surround = v; + return *this; + } inline TOptions& SetCapture(size_t pos) noexcept { - CapturePos = pos; - return *this; + CapturePos = pos; + return *this; } inline TOptions& SetCharset(ECharset charset) noexcept { - Charset = charset; - return *this; - } - + Charset = charset; + return *this; + } + inline TOptions& SetAndNotSupport(bool andNotSupport) noexcept { AndNotSupport = andNotSupport; return *this; @@ -45,14 +45,14 @@ namespace NRegExp { TMaybe<size_t> CapturePos; ECharset Charset = CODES_UNKNOWN; bool AndNotSupport = false; - }; - + }; + static inline NPire::TFsm Parse(const TStringBuf& regexp, const TOptions& opts, const bool needDetermine = true) { - NPire::TLexer lexer; - if (opts.Charset == CODES_UNKNOWN) { + NPire::TLexer lexer; + if (opts.Charset == CODES_UNKNOWN) { lexer.Assign(regexp.data(), regexp.data() + regexp.size()); - } else { + } else { TVector<wchar32> ucs4(regexp.size() + 1); size_t inRead = 0; size_t outWritten = 0; @@ -61,13 +61,13 @@ namespace NRegExp { Y_ASSERT(recodeRes == RECODE_OK); Y_ASSERT(outWritten < ucs4.size()); ucs4[outWritten] = 0; - - lexer.Assign(ucs4.begin(), + + lexer.Assign(ucs4.begin(), ucs4.begin() + std::char_traits<wchar32>::length(ucs4.data())); - } - - if (opts.CaseInsensitive) { - lexer.AddFeature(NPire::NFeatures::CaseInsensitive()); + } + + if (opts.CaseInsensitive) { + lexer.AddFeature(NPire::NFeatures::CaseInsensitive()); } if (opts.CapturePos) { @@ -78,7 +78,7 @@ namespace NRegExp { lexer.AddFeature(NPire::NFeatures::AndNotSupport()); } - switch (opts.Charset) { + switch (opts.Charset) { case CODES_UNKNOWN: break; case CODES_UTF8: @@ -90,76 +90,76 @@ namespace NRegExp { default: lexer.SetEncoding(NPire::NEncodings::Get(opts.Charset)); break; - } - - NPire::TFsm ret = lexer.Parse(); - - if (opts.Surround) { - ret.Surround(); + } + + NPire::TFsm ret = lexer.Parse(); + + if (opts.Surround) { + ret.Surround(); } if (needDetermine) { ret.Determine(); } - return ret; - } - }; + return ret; + } + }; - template <class TScannerType> - class TFsmParser: public TFsmBase { - public: - typedef TScannerType TScanner; + template <class TScannerType> + class TFsmParser: public TFsmBase { + public: + typedef TScannerType TScanner; - public: + public: inline explicit TFsmParser(const TStringBuf& regexp, const TOptions& opts = TOptions(), bool needDetermine = true) : Scanner(Parse(regexp, opts, needDetermine).template Compile<TScanner>()) - { - } + { + } inline const TScanner& GetScanner() const noexcept { - return Scanner; - } - - static inline TFsmParser False() { - return TFsmParser(NPire::TFsm::MakeFalse().Compile<TScanner>()); - } - - inline explicit TFsmParser(const TScanner& compiled) - : Scanner(compiled) - { - if (Scanner.Empty()) - ythrow yexception() << "Can't create fsm with empty scanner"; - } - - private: - TScanner Scanner; + return Scanner; + } + + static inline TFsmParser False() { + return TFsmParser(NPire::TFsm::MakeFalse().Compile<TScanner>()); + } + + inline explicit TFsmParser(const TScanner& compiled) + : Scanner(compiled) + { + if (Scanner.Empty()) + ythrow yexception() << "Can't create fsm with empty scanner"; + } + + private: + TScanner Scanner; }; - class TFsm: public TFsmParser<NPire::TNonrelocScanner> { - public: + class TFsm: public TFsmParser<NPire::TNonrelocScanner> { + public: inline explicit TFsm(const TStringBuf& regexp, const TOptions& opts = TOptions()) - : TFsmParser<TScanner>(regexp, opts) - { - } - - inline TFsm(const TFsmParser<TScanner>& fsm) - : TFsmParser<TScanner>(fsm) - { - } - + : TFsmParser<TScanner>(regexp, opts) + { + } + + inline TFsm(const TFsmParser<TScanner>& fsm) + : TFsmParser<TScanner>(fsm) + { + } + static inline TFsm Glue(const TFsm& l, const TFsm& r) { - return TFsm(TScanner::Glue(l.GetScanner(), r.GetScanner())); - } - - inline explicit TFsm(const TScanner& compiled) - : TFsmParser<TScanner>(compiled) - { - } - }; - + return TFsm(TScanner::Glue(l.GetScanner(), r.GetScanner())); + } + + inline explicit TFsm(const TScanner& compiled) + : TFsmParser<TScanner>(compiled) + { + } + }; + static inline TFsm operator|(const TFsm& l, const TFsm& r) { return TFsm::Glue(l, r); } @@ -167,15 +167,15 @@ namespace NRegExp { struct TCapturingFsm : TFsmParser<NPire::TCapturingScanner> { inline explicit TCapturingFsm(const TStringBuf& regexp, TOptions opts = TOptions()) - : TFsmParser<TScanner>(regexp, + : TFsmParser<TScanner>(regexp, opts.SetSurround(true).CapturePos ? opts : opts.SetCapture(1)) { - } + } - inline TCapturingFsm(const TFsmParser<TScanner>& fsm) - : TFsmParser<TScanner>(fsm) - { - } - }; + inline TCapturingFsm(const TFsmParser<TScanner>& fsm) + : TFsmParser<TScanner>(fsm) + { + } + }; struct TSlowCapturingFsm : TFsmParser<NPire::TSlowCapturingScanner> { inline explicit TSlowCapturingFsm(const TStringBuf& regexp, @@ -190,101 +190,101 @@ namespace NRegExp { } }; - template <class TFsm> - class TMatcherBase { - public: - typedef typename TFsm::TScanner::State TState; + template <class TFsm> + class TMatcherBase { + public: + typedef typename TFsm::TScanner::State TState; - public: - inline explicit TMatcherBase(const TFsm& fsm) - : Fsm(fsm) - { - Fsm.GetScanner().Initialize(State); - } + public: + inline explicit TMatcherBase(const TFsm& fsm) + : Fsm(fsm) + { + Fsm.GetScanner().Initialize(State); + } inline bool Final() const noexcept { - return GetScanner().Final(GetState()); - } + return GetScanner().Final(GetState()); + } - protected: + protected: inline void Run(const char* data, size_t len, bool addBegin, bool addEnd) noexcept { if (addBegin) { - NPire::Step(GetScanner(), State, NPire::BeginMark); + NPire::Step(GetScanner(), State, NPire::BeginMark); } - NPire::Run(GetScanner(), State, data, data + len); + NPire::Run(GetScanner(), State, data, data + len); if (addEnd) { - NPire::Step(GetScanner(), State, NPire::EndMark); - } - } + NPire::Step(GetScanner(), State, NPire::EndMark); + } + } inline const typename TFsm::TScanner& GetScanner() const noexcept { - return Fsm.GetScanner(); - } + return Fsm.GetScanner(); + } inline const TState& GetState() const noexcept { - return State; - } + return State; + } - private: - const TFsm& Fsm; - TState State; + private: + const TFsm& Fsm; + TState State; }; - + struct TMatcher : TMatcherBase<TFsm> { - inline explicit TMatcher(const TFsm& fsm) - : TMatcherBase<TFsm>(fsm) - { - } - + inline explicit TMatcher(const TFsm& fsm) + : TMatcherBase<TFsm>(fsm) + { + } + inline TMatcher& Match(const char* data, size_t len, bool addBegin = false, bool addEnd = false) noexcept { Run(data, len, addBegin, addEnd); - return *this; - } - + return *this; + } + inline TMatcher& Match(const TStringBuf& s, bool addBegin = false, bool addEnd = false) noexcept { return Match(s.data(), s.size(), addBegin, addEnd); - } - + } + inline const char* Find(const char* b, const char* e) noexcept { - return NPire::ShortestPrefix(GetScanner(), b, e); - } - + return NPire::ShortestPrefix(GetScanner(), b, e); + } + typedef std::pair<const size_t*, const size_t*> TMatchedRegexps; - + inline TMatchedRegexps MatchedRegexps() const noexcept { - return GetScanner().AcceptedRegexps(GetState()); - } - }; - - class TSearcher: public TMatcherBase<TCapturingFsm> { - public: - inline explicit TSearcher(const TCapturingFsm& fsm) - : TMatcherBase<TCapturingFsm>(fsm) - { - } - + return GetScanner().AcceptedRegexps(GetState()); + } + }; + + class TSearcher: public TMatcherBase<TCapturingFsm> { + public: + inline explicit TSearcher(const TCapturingFsm& fsm) + : TMatcherBase<TCapturingFsm>(fsm) + { + } + inline bool Captured() const noexcept { - return GetState().Captured(); - } - + return GetState().Captured(); + } + inline TSearcher& Search(const char* data, size_t len, bool addBegin = true, bool addEnd = true) noexcept { - Data = TStringBuf(data, len); + Data = TStringBuf(data, len); Run(data, len, addBegin, addEnd); - return *this; - } - + return *this; + } + inline TSearcher& Search(const TStringBuf& s) noexcept { return Search(s.data(), s.size()); - } - + } + inline TStringBuf GetCaptured() const noexcept { return TStringBuf(Data.data() + GetState().Begin() - 1, Data.data() + GetState().End() - 1); - } - - private: - TStringBuf Data; - }; + } + + private: + TStringBuf Data; + }; class TSlowSearcher : TMatcherBase<TSlowCapturingFsm>{ public: diff --git a/library/cpp/regex/pire/ut/regexp_ut.cpp b/library/cpp/regex/pire/ut/regexp_ut.cpp index e7206de9ad..c8db34d986 100644 --- a/library/cpp/regex/pire/ut/regexp_ut.cpp +++ b/library/cpp/regex/pire/ut/regexp_ut.cpp @@ -93,44 +93,44 @@ Y_UNIT_TEST_SUITE(TRegExp) { UNIT_ASSERT(TMatcher(glued).Match("abc").Final()); UNIT_ASSERT(!TMatcher(glued).Match("Abc").Final()); } - + Y_UNIT_TEST(Capture1) { - TCapturingFsm fsm("here we have user_id=([a-z0-9]+);"); - - TSearcher searcher(fsm); - searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF"); - UNIT_ASSERT(searcher.Captured()); + TCapturingFsm fsm("here we have user_id=([a-z0-9]+);"); + + TSearcher searcher(fsm); + searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF"); + UNIT_ASSERT(searcher.Captured()); UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a")); - } - + } + Y_UNIT_TEST(Capture2) { - TCapturingFsm fsm("w([abcdez]+)f"); - - TSearcher searcher(fsm); - searcher.Search("wabcdef"); - UNIT_ASSERT(searcher.Captured()); + TCapturingFsm fsm("w([abcdez]+)f"); + + TSearcher searcher(fsm); + searcher.Search("wabcdef"); + UNIT_ASSERT(searcher.Captured()); UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("abcde")); - } - + } + Y_UNIT_TEST(Capture3) { - TCapturingFsm fsm("http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)", + TCapturingFsm fsm("http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)", TFsm::TOptions().SetCapture(2)); - - TSearcher searcher(fsm); - searcher.Search("http://vkontakte.ru/id100500"); - UNIT_ASSERT(searcher.Captured()); + + TSearcher searcher(fsm); + searcher.Search("http://vkontakte.ru/id100500"); + UNIT_ASSERT(searcher.Captured()); UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500")); - } - + } + Y_UNIT_TEST(Capture4) { - TCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!", + TCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!", TFsm::TOptions().SetCharset(CODES_UTF8)); - - TSearcher searcher(fsm); - searcher.Search(" Здравствуйте, Уважаемый (-ая)! "); - UNIT_ASSERT(searcher.Captured()); + + TSearcher searcher(fsm); + searcher.Search(" Здравствуйте, Уважаемый (-ая)! "); + UNIT_ASSERT(searcher.Captured()); UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)")); - } + } Y_UNIT_TEST(Capture5) { TCapturingFsm fsm("away\\.php\\?to=http:([^\"])+\""); diff --git a/library/cpp/ya.make b/library/cpp/ya.make index 8c1193b007..dfedd9750f 100644 --- a/library/cpp/ya.make +++ b/library/cpp/ya.make @@ -153,7 +153,7 @@ RECURSE( grpc histogram hnsw - html + html html/dehtml/ut http hyperloglog |