diff options
| author | karina-usm <[email protected]> | 2022-02-10 16:48:05 +0300 | 
|---|---|---|
| committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:48:05 +0300 | 
| commit | 62517661cde7aa7c93efe0281ec48eeb70ea420c (patch) | |
| tree | 066f34bb401d85fa43842442fb0d888ffb2a305f | |
| parent | 5f8a2ce7b1dc3b3e1fae197610f189e7ed1d5723 (diff) | |
Restoring authorship annotation for <[email protected]>. Commit 1 of 2.
| -rw-r--r-- | contrib/libs/pire/pire/approx_matching.cpp | 188 | ||||
| -rw-r--r-- | contrib/libs/pire/pire/approx_matching.h | 56 | ||||
| -rw-r--r-- | contrib/libs/pire/pire/extra/capture.h | 14 | ||||
| -rw-r--r-- | contrib/libs/pire/pire/fsm.h | 10 | ||||
| -rw-r--r-- | contrib/libs/pire/pire/re_lexer.cpp | 112 | ||||
| -rw-r--r-- | contrib/libs/pire/pire/read_unicode.cpp | 166 | ||||
| -rw-r--r-- | contrib/libs/pire/pire/read_unicode.h | 80 | ||||
| -rw-r--r-- | contrib/libs/pire/pire/scanners/loaded.h | 10 | ||||
| -rw-r--r-- | contrib/libs/pire/pire/scanners/multi.h | 10 | ||||
| -rw-r--r-- | contrib/libs/pire/pire/scanners/simple.h | 12 | ||||
| -rw-r--r-- | contrib/libs/pire/pire/scanners/slow.h | 34 | ||||
| -rw-r--r-- | library/cpp/regex/pire/ut/regexp_ut.cpp | 10 | ||||
| -rw-r--r-- | library/cpp/regex/pire/ut/ya.make | 4 | ||||
| -rw-r--r-- | library/cpp/regex/pire/ya.make | 4 | 
14 files changed, 355 insertions, 355 deletions
| diff --git a/contrib/libs/pire/pire/approx_matching.cpp b/contrib/libs/pire/pire/approx_matching.cpp index 23f74ca01df..8c393b39e05 100644 --- a/contrib/libs/pire/pire/approx_matching.cpp +++ b/contrib/libs/pire/pire/approx_matching.cpp @@ -1,94 +1,94 @@ -/* - * approx_matching.cpp -- implementation of CreateApproxFsm function - * - * Copyright (c) 2019 YANDEX LLC, Karina Usmanova <[email protected]> - * - * This file is part of Pire, the Perl Incompatible - * Regular Expressions library. - * - * Pire is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Pire is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU Lesser Public License for more details. - * You should have received a copy of the GNU Lesser Public License - * along with Pire.  If not, see <http://www.gnu.org/licenses>. - */ - - -#include "approx_matching.h" - -namespace Pire { -	Fsm CreateApproxFsm(const Fsm& regexp, size_t distance) { -		Fsm approxFsm = regexp; - -		TVector<TSet<Char>> outgoingLettersTable(regexp.Size()); -		for (size_t state = 0; state < regexp.Size(); ++state) { -			outgoingLettersTable[state] = regexp.OutgoingLetters(state); -		} - -		TVector<TMap<Char, Fsm::StatesSet>> destinationsTable(regexp.Size()); -		for (size_t state = 0; state < regexp.Size(); ++state) { -			for (Char letter : outgoingLettersTable[state]) { -				destinationsTable[state][letter] = regexp.Destinations(state, letter); -			} -		} - -		for (size_t fsmIdx = 0; fsmIdx < distance; ++fsmIdx) { -			approxFsm.Import(regexp); -			const auto shift = fsmIdx * regexp.Size(); - -			for (size_t state = 0; state < regexp.Size(); ++state) { -				for (Char letter : outgoingLettersTable[state]) { -					for (size_t to : destinationsTable[state][letter]) { -						for (Char ch = 0; ch < MaxChar; ++ch) { -							if (!approxFsm.Connected(state + shift, to + shift, ch)) { -								approxFsm.Connect(state + shift, to + shift + regexp.Size(), ch); -							} -						} - -						approxFsm.Connect(state + shift, to + shift + regexp.Size(), Epsilon); -					} - -					for (Char ch = 0; ch < MaxChar; ++ch) { -						approxFsm.Connect(state + shift, state + shift + regexp.Size(), ch); -					} -				} - -				if (regexp.IsFinal(state)) { -					approxFsm.SetFinal(state + shift + regexp.Size(), true); -				} -			} -		} - -		size_t maxState = (distance > 0) ? approxFsm.Size() - regexp.Size() : 0; -		for (size_t state = 0; state < maxState; ++state) { -			size_t currentDist = state / regexp.Size(); -			size_t intState = state % regexp.Size(); - -			for (Char firstLetter : outgoingLettersTable[intState]) { -				for (size_t firstDest : destinationsTable[intState][firstLetter]) { -					for (Char secondLetter : outgoingLettersTable[firstDest]) { -						for (size_t secondDest : destinationsTable[firstDest][secondLetter]) { -							if (secondDest != intState || firstDest != intState) { -								approxFsm.Resize(approxFsm.Size() + 1); - -								size_t to = secondDest + (currentDist + 1) * regexp.Size(); -								size_t middle = approxFsm.Size() - 1; - -								approxFsm.Connect(state, middle, secondLetter); -								approxFsm.Connect(middle, to, firstLetter); -							} -						} -					} -				} -			} -		} - -		return approxFsm; -	} -} +/*  + * approx_matching.cpp -- implementation of CreateApproxFsm function  + *  + * Copyright (c) 2019 YANDEX LLC, Karina Usmanova <[email protected]>  + *  + * This file is part of Pire, the Perl Incompatible  + * Regular Expressions library.  + *  + * Pire is free software: you can redistribute it and/or modify  + * it under the terms of the GNU Lesser Public License as published by  + * the Free Software Foundation, either version 3 of the License, or  + * (at your option) any later version.  + *  + * Pire is distributed in the hope that it will be useful,  + * but WITHOUT ANY WARRANTY; without even the implied warranty of  + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  + * GNU Lesser Public License for more details.  + * You should have received a copy of the GNU Lesser Public License  + * along with Pire.  If not, see <http://www.gnu.org/licenses>.  + */  +  +  +#include "approx_matching.h"  +  +namespace Pire {  +	Fsm CreateApproxFsm(const Fsm& regexp, size_t distance) {  +		Fsm approxFsm = regexp;  +  +		TVector<TSet<Char>> outgoingLettersTable(regexp.Size());  +		for (size_t state = 0; state < regexp.Size(); ++state) {  +			outgoingLettersTable[state] = regexp.OutgoingLetters(state);  +		}  +  +		TVector<TMap<Char, Fsm::StatesSet>> destinationsTable(regexp.Size());  +		for (size_t state = 0; state < regexp.Size(); ++state) {  +			for (Char letter : outgoingLettersTable[state]) {  +				destinationsTable[state][letter] = regexp.Destinations(state, letter);  +			}  +		}  +  +		for (size_t fsmIdx = 0; fsmIdx < distance; ++fsmIdx) {  +			approxFsm.Import(regexp);  +			const auto shift = fsmIdx * regexp.Size();  +  +			for (size_t state = 0; state < regexp.Size(); ++state) {  +				for (Char letter : outgoingLettersTable[state]) {  +					for (size_t to : destinationsTable[state][letter]) {  +						for (Char ch = 0; ch < MaxChar; ++ch) {  +							if (!approxFsm.Connected(state + shift, to + shift, ch)) {  +								approxFsm.Connect(state + shift, to + shift + regexp.Size(), ch);  +							}  +						}  +  +						approxFsm.Connect(state + shift, to + shift + regexp.Size(), Epsilon);  +					}  +  +					for (Char ch = 0; ch < MaxChar; ++ch) {  +						approxFsm.Connect(state + shift, state + shift + regexp.Size(), ch);  +					}  +				}  +  +				if (regexp.IsFinal(state)) {  +					approxFsm.SetFinal(state + shift + regexp.Size(), true);  +				}  +			}  +		}  +  +		size_t maxState = (distance > 0) ? approxFsm.Size() - regexp.Size() : 0;  +		for (size_t state = 0; state < maxState; ++state) {  +			size_t currentDist = state / regexp.Size();  +			size_t intState = state % regexp.Size();  +  +			for (Char firstLetter : outgoingLettersTable[intState]) {  +				for (size_t firstDest : destinationsTable[intState][firstLetter]) {  +					for (Char secondLetter : outgoingLettersTable[firstDest]) {  +						for (size_t secondDest : destinationsTable[firstDest][secondLetter]) {  +							if (secondDest != intState || firstDest != intState) {  +								approxFsm.Resize(approxFsm.Size() + 1);  +  +								size_t to = secondDest + (currentDist + 1) * regexp.Size();  +								size_t middle = approxFsm.Size() - 1;  +  +								approxFsm.Connect(state, middle, secondLetter);  +								approxFsm.Connect(middle, to, firstLetter);  +							}  +						}  +					}  +				}  +			}  +		}  +  +		return approxFsm;  +	}  +}  diff --git a/contrib/libs/pire/pire/approx_matching.h b/contrib/libs/pire/pire/approx_matching.h index fc2a9fd61c1..2b2568d96ba 100644 --- a/contrib/libs/pire/pire/approx_matching.h +++ b/contrib/libs/pire/pire/approx_matching.h @@ -1,28 +1,28 @@ -/* - * approx_matching.h -- function for creating fsm which matches words - *                      within a levenshtein distance - * - * Copyright (c) 2019 YANDEX LLC, Karina Usmanova <[email protected]> - * - * This file is part of Pire, the Perl Incompatible - * Regular Expressions library. - * - * Pire is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Pire is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU Lesser Public License for more details. - * You should have received a copy of the GNU Lesser Public License - * along with Pire.  If not, see <http://www.gnu.org/licenses>. - */ - - -#include "fsm.h" - -namespace Pire { -	Fsm CreateApproxFsm(const Fsm& regexp, size_t distance); -} +/*  + * approx_matching.h -- function for creating fsm which matches words  + *                      within a levenshtein distance  + *  + * Copyright (c) 2019 YANDEX LLC, Karina Usmanova <[email protected]>  + *  + * This file is part of Pire, the Perl Incompatible  + * Regular Expressions library.  + *  + * Pire is free software: you can redistribute it and/or modify  + * it under the terms of the GNU Lesser Public License as published by  + * the Free Software Foundation, either version 3 of the License, or  + * (at your option) any later version.  + *  + * Pire is distributed in the hope that it will be useful,  + * but WITHOUT ANY WARRANTY; without even the implied warranty of  + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  + * GNU Lesser Public License for more details.  + * You should have received a copy of the GNU Lesser Public License  + * along with Pire.  If not, see <http://www.gnu.org/licenses>.  + */  +  +  +#include "fsm.h"  +  +namespace Pire {  +	Fsm CreateApproxFsm(const Fsm& regexp, size_t distance);  +}  diff --git a/contrib/libs/pire/pire/extra/capture.h b/contrib/libs/pire/pire/extra/capture.h index 8399914a67f..7bca334eacb 100644 --- a/contrib/libs/pire/pire/extra/capture.h +++ b/contrib/libs/pire/pire/extra/capture.h @@ -25,7 +25,7 @@  #define PIRE_EXTRA_CAPTURE_H -#include <contrib/libs/pire/pire/approx_matching.h> +#include <contrib/libs/pire/pire/approx_matching.h>   #include <contrib/libs/pire/pire/scanners/loaded.h>  #include <contrib/libs/pire/pire/scanners/multi.h>  #include <contrib/libs/pire/pire/scanners/slow.h> @@ -139,11 +139,11 @@ public:  	CapturingScanner() {}  	CapturingScanner(const CapturingScanner& s): LoadedScanner(s) {} -	explicit CapturingScanner(Fsm& fsm, size_t distance = 0) +	explicit CapturingScanner(Fsm& fsm, size_t distance = 0)   	{ -		if (distance) { -			fsm = CreateApproxFsm(fsm, distance); -		} +		if (distance) {  +			fsm = CreateApproxFsm(fsm, distance);  +		}   		fsm.Canonize();  		Init(fsm.Size(), fsm.Letters(), fsm.Initial());  		BuildScanner(fsm, *this); @@ -576,8 +576,8 @@ public:  	{  	} -	SlowCapturingScanner(Fsm& fsm, size_t distance = 0) -		: SlowScanner(fsm, true, false, distance) +	SlowCapturingScanner(Fsm& fsm, size_t distance = 0)  +		: SlowScanner(fsm, true, false, distance)   	{  	}  }; diff --git a/contrib/libs/pire/pire/fsm.h b/contrib/libs/pire/pire/fsm.h index 4dad06ca065..348e6b62168 100644 --- a/contrib/libs/pire/pire/fsm.h +++ b/contrib/libs/pire/pire/fsm.h @@ -115,9 +115,9 @@ namespace Pire {  		/// Determines and minimizes the FSM if neccessary. Returns *this.  		Fsm& Canonize(size_t maxSize = 0); - +   		template<class Scanner> -		Scanner Compile(size_t distance = 0); +		Scanner Compile(size_t distance = 0);   		void DumpState(yostream& s, size_t state) const;  		void DumpTo(yostream& s, const ystring& name = "") const; @@ -270,11 +270,11 @@ namespace Pire {  		r.FinishBuild();  	} - +   	template<class Scanner> -	inline Scanner Fsm::Compile(size_t distance) +	inline Scanner Fsm::Compile(size_t distance)   	{ -		return Scanner(*this, distance); +		return Scanner(*this, distance);   	}  	yostream& operator << (yostream&, const Fsm&); diff --git a/contrib/libs/pire/pire/re_lexer.cpp b/contrib/libs/pire/pire/re_lexer.cpp index 132fbeb0399..dbae421f160 100644 --- a/contrib/libs/pire/pire/re_lexer.cpp +++ b/contrib/libs/pire/pire/re_lexer.cpp @@ -11,7 +11,7 @@   * it under the terms of the GNU Lesser Public License as published by   * the Free Software Foundation, either version 3 of the License, or   * (at your option) any later version. - * + *    * Pire is distributed in the hope that it will be useful,   * but WITHOUT ANY WARRANTY; without even the implied warranty of   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the @@ -28,12 +28,12 @@  #include <contrib/libs/pire/pire/stub/utf8.h>  #include <contrib/libs/pire/pire/stub/singleton.h> -#include "fsm.h" +#include "fsm.h"   #include "re_lexer.h"  #include "re_parser.h" -#include "read_unicode.h" - +#include "read_unicode.h"  +   namespace Pire {  namespace Impl { @@ -161,7 +161,7 @@ Term Lexer::Lex()                  if ((j & ControlMask) == Control)                      Error("Control character in tokens sequence");      } - +       int type = t.Type();      if (type == TokenTypes::Letters)          type = YRE_LETTERS; @@ -205,19 +205,19 @@ wchar32 Feature::CorrectChar(wchar32 c, const char* controls)  }  namespace { -    class EnableUnicodeSequencesImpl : public UnicodeReader { -    public: -        bool Accepts(wchar32 c) const { -            return c == (Control | 'x'); -        } - -        Term Lex() { -            return Term::Character(ReadUnicodeCharacter()); -        } -    }; - -    class CharacterRangeReader: public UnicodeReader { +    class EnableUnicodeSequencesImpl : public UnicodeReader {       public: +        bool Accepts(wchar32 c) const {  +            return c == (Control | 'x');  +        }  +  +        Term Lex() {  +            return Term::Character(ReadUnicodeCharacter());  +        }  +    };  +  +    class CharacterRangeReader: public UnicodeReader {  +    public:           bool Accepts(wchar32 c) const { return c == '[' || c == (Control | '[') || c == (Control | ']'); }          Term Lex() @@ -235,49 +235,49 @@ namespace {                  ch = CorrectChar(GetChar(), controls);              } -            bool firstUnicode; -            wchar32 unicodeSymbol = 0; - +            bool firstUnicode;  +            wchar32 unicodeSymbol = 0;  +               for (; ch != End && ch != (Control | ']'); ch = CorrectChar(GetChar(), controls)) { -                if (ch == (Control | 'x')) { -                    UngetChar(ch); -					firstUnicode = true; -					unicodeSymbol = ReadUnicodeCharacter(); -                } else { -                    firstUnicode = false; -                } - -                if (((ch & ControlMask) != Control || firstUnicode) && CorrectChar(PeekChar(), controls) == (Control | '-')) { +                if (ch == (Control | 'x')) {  +                    UngetChar(ch);  +					firstUnicode = true;  +					unicodeSymbol = ReadUnicodeCharacter();  +                } else {  +                    firstUnicode = false;  +                }  +  +                if (((ch & ControlMask) != Control || firstUnicode) && CorrectChar(PeekChar(), controls) == (Control | '-')) {                       GetChar(); -                    wchar32 current = GetChar(); - -                    bool secondUnicode = (current == (Control | 'x')); - -                    wchar32 begin = (firstUnicode) ? unicodeSymbol : ch; -                    wchar32 end; -                    if (secondUnicode) { -                        UngetChar(current); -                        end = ReadUnicodeCharacter(); -                    } else { -                        end = CorrectChar(current, controls); -                        if ((end & ControlMask) == Control) -                            Error("Wrong character range"); -                    } - -                    for (ch = begin; ch <= end; ++ch) { +                    wchar32 current = GetChar();  +  +                    bool secondUnicode = (current == (Control | 'x'));  +  +                    wchar32 begin = (firstUnicode) ? unicodeSymbol : ch;  +                    wchar32 end;  +                    if (secondUnicode) {  +                        UngetChar(current);  +                        end = ReadUnicodeCharacter();  +                    } else {  +                        end = CorrectChar(current, controls);  +                        if ((end & ControlMask) == Control)  +                            Error("Wrong character range");  +                    }  +  +                    for (ch = begin; ch <= end; ++ch) {                           cs.first.insert(Term::String(1, ch)); -                    } -                } else if (ch == (Control | '-')) { +                    }  +                } else if (ch == (Control | '-')) {                       cs.first.insert(Term::String(1, '-')); -                } -                else if ((ch & ControlMask) == Control && (strchr(controls2, ch & ~ControlMask) || strchr(controls, ch & ~ControlMask))) { +                }  +                else if ((ch & ControlMask) == Control && (strchr(controls2, ch & ~ControlMask) || strchr(controls, ch & ~ControlMask))) {                       cs.first.insert(Term::String(1, ch & ~ControlMask)); -                } -                else if ((ch & ControlMask) != Control || !strchr(controls, ch & ~ControlMask)) { -                    cs.first.insert(Term::String(1, (firstUnicode) ? unicodeSymbol : ch)); -                } else { +                }  +                else if ((ch & ControlMask) != Control || !strchr(controls, ch & ~ControlMask)) {  +                    cs.first.insert(Term::String(1, (firstUnicode) ? unicodeSymbol : ch));  +                } else {                       Error("Wrong character in range"); -                } +                }               }              if (ch == End)                  Error("Unexpected end of pattern"); @@ -347,7 +347,7 @@ namespace {          {              return c == '&' || c == '~' || c == (Control | '&') || c == (Control | '~');          } - +           Term Lex()          {              wchar32 ch = GetChar(); @@ -376,7 +376,7 @@ void Lexer::InstallDefaultFeatures()      AddFeature(Feature::Ptr(new CharacterRangeReader));      AddFeature(Feature::Ptr(new RepetitionCountReader));      AddFeature(Features::CharClasses()); -    AddFeature(Feature::Ptr(new EnableUnicodeSequencesImpl)); +    AddFeature(Feature::Ptr(new EnableUnicodeSequencesImpl));   }  Fsm Lexer::Parse() diff --git a/contrib/libs/pire/pire/read_unicode.cpp b/contrib/libs/pire/pire/read_unicode.cpp index 5b21e4eb285..e167cf5ccaa 100644 --- a/contrib/libs/pire/pire/read_unicode.cpp +++ b/contrib/libs/pire/pire/read_unicode.cpp @@ -1,83 +1,83 @@ -/* - * read_unicode.cpp -- implementation of the UnicodeReader. - * - * Copyright (c) 2019 YANDEX LLC - * Author: Karina Usmanova <[email protected]> - * - * This file is part of Pire, the Perl Incompatible - * Regular Expressions library. - * - * Pire is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Pire is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU Lesser Public License for more details. - * You should have received a copy of the GNU Lesser Public License - * along with Pire.  If not, see <http://www.gnu.org/licenses>. - */ - - -#include "read_unicode.h" - -#include <contrib/libs/pire/pire/re_lexer.h> - -namespace Pire { -	wchar32 UnicodeReader::ReadUnicodeCharacter() { -		ystring hexStr; -		GetChar(); -		wchar32 ch = PeekChar(); - -		if (ch == '{') { -			GetChar(); -			hexStr = ReadHexDigit( -					[](wchar32 ch, size_t numAdded) -> bool { return ch == End || (numAdded != 0 && ch == '}'); }); -			ch = GetChar(); -			if (ch != '}') { -				Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x{...\" sequence should be closed by \"}\""); -			} -		} else { -			hexStr = ReadHexDigit([](wchar32, size_t numAdded) -> bool { return numAdded == 2; }); -			if (hexStr.size() != 2) { -				Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x...\" sequence should contain two symbols"); -			} -		} -		return HexToDec(hexStr); -	} - -	bool UnicodeReader::IsHexDigit(wchar32 ch) { -		return ch < 256 && std::isxdigit(ch) != 0; -	} - -	ystring UnicodeReader::ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop) { -		ystring result; -		wchar32 ch = GetChar(); -		while (!shouldStop(ch, result.size())) { -			if (!IsHexDigit(ch)) { -				Error("Pire::UnicodeReader::ReadHexDigit(): \"\\x...\" sequence contains non-valid hex number"); -			} -			result.push_back(ch); -			ch = GetChar(); -		} -		UngetChar(ch); -		return result; -	} - -	wchar32 UnicodeReader::HexToDec(const ystring &hexStr) { -		wchar32 converted; -		try { -			converted = std::stoul(hexStr, 0, 16); -		} catch (std::out_of_range &) { -			converted = MAX_UNICODE + 1; -		} -		if (converted > MAX_UNICODE) { -			Error("Pire::UnicodeReader::HexToDec(): hex number in \"\\x...\" sequence is too large"); -		} -		return converted; -	} -} - - +/*  + * read_unicode.cpp -- implementation of the UnicodeReader.  + *  + * Copyright (c) 2019 YANDEX LLC  + * Author: Karina Usmanova <[email protected]>  + *  + * This file is part of Pire, the Perl Incompatible  + * Regular Expressions library.  + *  + * Pire is free software: you can redistribute it and/or modify  + * it under the terms of the GNU Lesser Public License as published by  + * the Free Software Foundation, either version 3 of the License, or  + * (at your option) any later version.  + *  + * Pire is distributed in the hope that it will be useful,  + * but WITHOUT ANY WARRANTY; without even the implied warranty of  + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  + * GNU Lesser Public License for more details.  + * You should have received a copy of the GNU Lesser Public License  + * along with Pire.  If not, see <http://www.gnu.org/licenses>.  + */  +  +  +#include "read_unicode.h"  +  +#include <contrib/libs/pire/pire/re_lexer.h>  +  +namespace Pire {  +	wchar32 UnicodeReader::ReadUnicodeCharacter() {  +		ystring hexStr;  +		GetChar();  +		wchar32 ch = PeekChar();  +  +		if (ch == '{') {  +			GetChar();  +			hexStr = ReadHexDigit(  +					[](wchar32 ch, size_t numAdded) -> bool { return ch == End || (numAdded != 0 && ch == '}'); });  +			ch = GetChar();  +			if (ch != '}') {  +				Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x{...\" sequence should be closed by \"}\"");  +			}  +		} else {  +			hexStr = ReadHexDigit([](wchar32, size_t numAdded) -> bool { return numAdded == 2; });  +			if (hexStr.size() != 2) {  +				Error("Pire::UnicodeReader::ReadUnicodeCharacter(): \"\\x...\" sequence should contain two symbols");  +			}  +		}  +		return HexToDec(hexStr);  +	}  +  +	bool UnicodeReader::IsHexDigit(wchar32 ch) {  +		return ch < 256 && std::isxdigit(ch) != 0;  +	}  +  +	ystring UnicodeReader::ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop) {  +		ystring result;  +		wchar32 ch = GetChar();  +		while (!shouldStop(ch, result.size())) {  +			if (!IsHexDigit(ch)) {  +				Error("Pire::UnicodeReader::ReadHexDigit(): \"\\x...\" sequence contains non-valid hex number");  +			}  +			result.push_back(ch);  +			ch = GetChar();  +		}  +		UngetChar(ch);  +		return result;  +	}  +  +	wchar32 UnicodeReader::HexToDec(const ystring &hexStr) {  +		wchar32 converted;  +		try {  +			converted = std::stoul(hexStr, 0, 16);  +		} catch (std::out_of_range &) {  +			converted = MAX_UNICODE + 1;  +		}  +		if (converted > MAX_UNICODE) {  +			Error("Pire::UnicodeReader::HexToDec(): hex number in \"\\x...\" sequence is too large");  +		}  +		return converted;  +	}  +}  +  +  diff --git a/contrib/libs/pire/pire/read_unicode.h b/contrib/libs/pire/pire/read_unicode.h index 107545e5a18..f0705c14aab 100644 --- a/contrib/libs/pire/pire/read_unicode.h +++ b/contrib/libs/pire/pire/read_unicode.h @@ -1,40 +1,40 @@ -/* - * read_unicode.h -- declaration of the UnicodeReader class, helper for UnicodeRange and EnableUnicodeSequences. - * - * Copyright (c) 2019 YANDEX LLC - * Author: Karina Usmanova <[email protected]> - * - * This file is part of Pire, the Perl Incompatible - * Regular Expressions library. - * - * Pire is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Pire is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU Lesser Public License for more details. - * You should have received a copy of the GNU Lesser Public License - * along with Pire.  If not, see <http://www.gnu.org/licenses>. - */ - - -#include <contrib/libs/pire/pire/re_lexer.h> - -namespace Pire { -	class UnicodeReader : public Feature { -	public: -		wchar32 ReadUnicodeCharacter(); - -	private: -		static const wchar32 MAX_UNICODE = 0x10FFFF; - -		bool IsHexDigit(wchar32 ch); -		ystring ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop); -		wchar32 HexToDec(const ystring& hexStr); -	}; -} - - +/*  + * read_unicode.h -- declaration of the UnicodeReader class, helper for UnicodeRange and EnableUnicodeSequences.  + *  + * Copyright (c) 2019 YANDEX LLC  + * Author: Karina Usmanova <[email protected]>  + *  + * This file is part of Pire, the Perl Incompatible  + * Regular Expressions library.  + *  + * Pire is free software: you can redistribute it and/or modify  + * it under the terms of the GNU Lesser Public License as published by  + * the Free Software Foundation, either version 3 of the License, or  + * (at your option) any later version.  + *  + * Pire is distributed in the hope that it will be useful,  + * but WITHOUT ANY WARRANTY; without even the implied warranty of  + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  + * GNU Lesser Public License for more details.  + * You should have received a copy of the GNU Lesser Public License  + * along with Pire.  If not, see <http://www.gnu.org/licenses>.  + */  +  +  +#include <contrib/libs/pire/pire/re_lexer.h>  +  +namespace Pire {  +	class UnicodeReader : public Feature {  +	public:  +		wchar32 ReadUnicodeCharacter();  +  +	private:  +		static const wchar32 MAX_UNICODE = 0x10FFFF;  +  +		bool IsHexDigit(wchar32 ch);  +		ystring ReadHexDigit(std::function<bool(wchar32, size_t)> shouldStop);  +		wchar32 HexToDec(const ystring& hexStr);  +	};  +}  +  +  diff --git a/contrib/libs/pire/pire/scanners/loaded.h b/contrib/libs/pire/pire/scanners/loaded.h index 120dc403b75..24ded64a68c 100644 --- a/contrib/libs/pire/pire/scanners/loaded.h +++ b/contrib/libs/pire/pire/scanners/loaded.h @@ -26,7 +26,7 @@  #include <string.h> -#include <contrib/libs/pire/pire/approx_matching.h> +#include <contrib/libs/pire/pire/approx_matching.h>   #include <contrib/libs/pire/pire/fsm.h>  #include <contrib/libs/pire/pire/partition.h> @@ -245,11 +245,11 @@ protected:  	virtual ~LoadedScanner();  private: -	explicit LoadedScanner(Fsm& fsm, size_t distance = 0) +	explicit LoadedScanner(Fsm& fsm, size_t distance = 0)   	{ -		if (distance) { -			fsm = CreateApproxFsm(fsm, distance); -		} +		if (distance) {  +			fsm = CreateApproxFsm(fsm, distance);  +		}   		fsm.Canonize();  		Init(fsm.Size(), fsm.Letters(), fsm.Initial());  		BuildScanner(fsm, *this); diff --git a/contrib/libs/pire/pire/scanners/multi.h b/contrib/libs/pire/pire/scanners/multi.h index 29679e416ed..b6cdceaa327 100644 --- a/contrib/libs/pire/pire/scanners/multi.h +++ b/contrib/libs/pire/pire/scanners/multi.h @@ -26,7 +26,7 @@  #include <cstring>  #include <string.h> -#include <contrib/libs/pire/pire/approx_matching.h> +#include <contrib/libs/pire/pire/approx_matching.h>   #include <contrib/libs/pire/pire/fsm.h>  #include <contrib/libs/pire/pire/partition.h>  #include <contrib/libs/pire/pire/run.h> @@ -121,11 +121,11 @@ public:  	Scanner() { Alias(Null()); } -	explicit Scanner(Fsm& fsm, size_t distance = 0) +	explicit Scanner(Fsm& fsm, size_t distance = 0)   	{ -		if (distance) { -			fsm = CreateApproxFsm(fsm, distance); -		} +		if (distance) {  +			fsm = CreateApproxFsm(fsm, distance);  +		}   		fsm.Canonize();  		Init(fsm.Size(), fsm.Letters(), fsm.Finals().size(), fsm.Initial(), 1);  		BuildScanner(fsm, *this); diff --git a/contrib/libs/pire/pire/scanners/simple.h b/contrib/libs/pire/pire/scanners/simple.h index ef959aeed13..6874e1f2a30 100644 --- a/contrib/libs/pire/pire/scanners/simple.h +++ b/contrib/libs/pire/pire/scanners/simple.h @@ -24,7 +24,7 @@  #ifndef PIRE_SCANNERS_SIMPLE_H  #define PIRE_SCANNERS_SIMPLE_H -#include <contrib/libs/pire/pire/approx_matching.h> +#include <contrib/libs/pire/pire/approx_matching.h>   #include <contrib/libs/pire/pire/stub/stl.h>  #include <contrib/libs/pire/pire/stub/defaults.h>  #include <contrib/libs/pire/pire/stub/saveload.h> @@ -49,7 +49,7 @@ public:  	SimpleScanner()	{ Alias(Null()); } -	explicit SimpleScanner(Fsm& fsm, size_t distance = 0); +	explicit SimpleScanner(Fsm& fsm, size_t distance = 0);   	size_t Size() const { return m.statesCount; }  	bool Empty() const { return m_transitions == Null().m_transitions; } @@ -229,11 +229,11 @@ protected:  	}  }; -inline SimpleScanner::SimpleScanner(Fsm& fsm, size_t distance) +inline SimpleScanner::SimpleScanner(Fsm& fsm, size_t distance)   { -	if (distance) { -		fsm = CreateApproxFsm(fsm, distance); -	} +	if (distance) {  +		fsm = CreateApproxFsm(fsm, distance);  +	}   	fsm.Canonize();  	m.statesCount = fsm.Size(); diff --git a/contrib/libs/pire/pire/scanners/slow.h b/contrib/libs/pire/pire/scanners/slow.h index 6adfcb8c1d0..8f1e4ca4d0c 100644 --- a/contrib/libs/pire/pire/scanners/slow.h +++ b/contrib/libs/pire/pire/scanners/slow.h @@ -24,7 +24,7 @@  #ifndef PIRE_SCANNERS_SLOW_H  #define PIRE_SCANNERS_SLOW_H -#include <contrib/libs/pire/pire/approx_matching.h> +#include <contrib/libs/pire/pire/approx_matching.h>   #include <contrib/libs/pire/pire/partition.h>  #include <contrib/libs/pire/pire/vbitset.h>  #include <contrib/libs/pire/pire/fsm.h> @@ -250,12 +250,12 @@ public:  		}  	} -	explicit SlowScanner(Fsm& fsm, bool needActions = false, bool removeEpsilons = true, size_t distance = 0) +	explicit SlowScanner(Fsm& fsm, bool needActions = false, bool removeEpsilons = true, size_t distance = 0)   		: need_actions(needActions)  	{ -		if (distance) { -			fsm = CreateApproxFsm(fsm, distance); -		} +		if (distance) {  +			fsm = CreateApproxFsm(fsm, distance);  +		}   		if (removeEpsilons)  			fsm.RemoveEpsilons();  		fsm.Sparse(!removeEpsilons); @@ -357,7 +357,7 @@ private:  	bool need_actions;  	TVector<TVector<Action>> m_actionsvec; -	static const SlowScanner& Null(); +	static const SlowScanner& Null();   	template<class T> void alloc(T*& p, size_t size)  	{ @@ -416,17 +416,17 @@ private:  	friend void BuildScanner<SlowScanner>(const Fsm&, SlowScanner&);  }; -template<> -inline SlowScanner Fsm::Compile(size_t distance) { -	return SlowScanner(*this, false, true, distance); -} - -inline const SlowScanner& SlowScanner::Null() -{ -	static const SlowScanner n = Fsm::MakeFalse().Compile<SlowScanner>(); -	return n; -} - +template<>  +inline SlowScanner Fsm::Compile(size_t distance) {  +	return SlowScanner(*this, false, true, distance);  +}  +  +inline const SlowScanner& SlowScanner::Null()  +{  +	static const SlowScanner n = Fsm::MakeFalse().Compile<SlowScanner>();  +	return n;  +}  +   #ifndef PIRE_DEBUG  /// A specialization of Run(), since its state is much heavier than other ones  /// and we thus want to avoid copying states. diff --git a/library/cpp/regex/pire/ut/regexp_ut.cpp b/library/cpp/regex/pire/ut/regexp_ut.cpp index e7206de9ad4..fd7ac68a2cf 100644 --- a/library/cpp/regex/pire/ut/regexp_ut.cpp +++ b/library/cpp/regex/pire/ut/regexp_ut.cpp @@ -37,11 +37,11 @@ Y_UNIT_TEST_SUITE(TRegExp) {          UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false))).Match("Qw").Final());      } -    Y_UNIT_TEST(UnicodeCase) { -        UNIT_ASSERT(TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(true))).Match("Ab").Final()); -        UNIT_ASSERT(!TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(false))).Match("Ab").Final()); -    } - +    Y_UNIT_TEST(UnicodeCase) {  +        UNIT_ASSERT(TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(true))).Match("Ab").Final());  +        UNIT_ASSERT(!TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(false))).Match("Ab").Final());  +    }  +       Y_UNIT_TEST(Utf) {          NRegExp::TFsmBase::TOptions opts;          opts.Charset = CODES_UTF8; diff --git a/library/cpp/regex/pire/ut/ya.make b/library/cpp/regex/pire/ut/ya.make index 8776695f405..bf068415862 100644 --- a/library/cpp/regex/pire/ut/ya.make +++ b/library/cpp/regex/pire/ut/ya.make @@ -30,9 +30,9 @@ SRCS(      count_ut.cpp      glyph_ut.cpp      easy_ut.cpp -    read_unicode_ut.cpp +    read_unicode_ut.cpp       regexp_ut.cpp -    approx_matching_ut.cpp +    approx_matching_ut.cpp   )  SIZE(MEDIUM) diff --git a/library/cpp/regex/pire/ya.make b/library/cpp/regex/pire/ya.make index c857e6d18bc..88afde3c61d 100644 --- a/library/cpp/regex/pire/ya.make +++ b/library/cpp/regex/pire/ya.make @@ -24,9 +24,9 @@ SRCS(      extra/glyphs.cpp      re_lexer.cpp      re_parser.y -    read_unicode.cpp +    read_unicode.cpp       extraencodings.cpp -    approx_matching.cpp +    approx_matching.cpp       half_final_fsm.cpp      minimize.h  ) | 
