diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/antlr3_cpp_runtime/include/antlr3input.hpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/antlr3_cpp_runtime/include/antlr3input.hpp')
-rw-r--r-- | contrib/libs/antlr3_cpp_runtime/include/antlr3input.hpp | 325 |
1 files changed, 325 insertions, 0 deletions
diff --git a/contrib/libs/antlr3_cpp_runtime/include/antlr3input.hpp b/contrib/libs/antlr3_cpp_runtime/include/antlr3input.hpp new file mode 100644 index 0000000000..d167f5b392 --- /dev/null +++ b/contrib/libs/antlr3_cpp_runtime/include/antlr3input.hpp @@ -0,0 +1,325 @@ +/** \file + * Defines the basic structures used to manipulate character + * streams from any input source. Any character size and encoding + * can in theory be used, so long as a set of functions is provided that + * can return a 32 bit Integer representation of their characters and efficiently mark and revert + * to specific offsets into their input streams. + */ +#ifndef _ANTLR_INPUT_HPP +#define _ANTLR_INPUT_HPP + +// [The "BSD licence"] +// Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB + +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. The name of the author may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +namespace antlr3 { + +/// Master context structure for an ANTLR3 C runtime based input stream. +/// \ingroup apistructures. Calling LT on this doesn't seem right. You would +/// call it only with parser / TreeParser, and their respective input streams +/// have that function. Calling it from lexer will throw a compile time error +/// + +template<class ImplTraits> +class InputStream : public ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType > +{ +public: + typedef typename ImplTraits::AllocPolicyType AllocPolicyType; + typedef typename ImplTraits::LexStateType LexStateType; + typedef typename ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType > IntStreamType; + typedef IntStreamType BaseType; + typedef typename ImplTraits::StreamDataType UnitType; + typedef UnitType DataType; + typedef UnitType TokenType; + typedef typename AllocPolicyType::template VectorType<LexStateType> MarkersType; + typedef typename ImplTraits::StringType StringType; + +private: + /** Pointer to the start of the input string, characters may be + * taken as offsets from here and in original input format encoding. + */ + const DataType* m_data; + + /** Pointer to the next character to be consumed from the input data + * This is cast to point at the encoding of the original file that + * was read by the functions installed as pointer in this input stream + * context instance at file/string/whatever load time. 
+ */ + const DataType* m_nextChar; + + /** Number of characters that can be consumed at this point in time. + * Mostly this is just what is left in the pre-read buffer, but if the + * input source is a stream such as a socket or something then we may + * call special read code to wait for more input. + */ + ANTLR_UINT32 m_sizeBuf; + + /** The line number we are traversing in the input file. This gets incremented + * by a newline() call in the lexer grammar actions. + */ + ANTLR_UINT32 m_line; + + /** Pointer into the input buffer where the current line + * started. + */ + const DataType* m_currentLine; + + /** The offset within the current line of the current character + */ + ANTLR_INT32 m_charPositionInLine; + + /** Tracks how deep mark() calls are nested + */ + ANTLR_UINT32 m_markDepth; + + /** List of mark() points in the input stream + */ + MarkersType m_markers; + + /** File name string, set to pointer to memory if + * you set it manually as it will be free()d + */ + StringType m_fileName; + + /** File number, needs to be set manually to some file index of your devising. + */ + ANTLR_UINT32 m_fileNo; + + /// Character that automatically causes an internal line count + /// increment. + /// + ANTLR_UCHAR m_newlineChar; + + /// Indicates the size, in 8 bit units, of a single character. Note that + /// the C runtime does not deal with surrogates as this would be + /// slow and complicated. If this is a UTF-8 stream then this field + /// will be set to 0. Generally you are best working internally with 32 bit characters + /// as this is the most efficient. + /// + ANTLR_UINT8 m_charByteSize; + + /** Indicates if the data pointer was allocated by us, and so should be freed + * when the stream dies. 
+ */ + bool m_isAllocated; + + /// Indicates the encoding scheme used in this input stream + /// + ANTLR_UINT32 m_encoding; + + /* API */ +public: + InputStream(const ANTLR_UINT8* fileName, ANTLR_UINT32 encoding); + InputStream(const ANTLR_UINT8* data, ANTLR_UINT32 encoding, ANTLR_UINT32 size, ANTLR_UINT8* name); + ~InputStream(); + const DataType* get_data() const; + bool get_isAllocated() const; + const DataType* get_nextChar() const; + ANTLR_UINT32 get_sizeBuf() const; + ANTLR_UINT32 get_line() const; + const DataType* get_currentLine() const; + ANTLR_INT32 get_charPositionInLine() const; + ANTLR_UINT32 get_markDepth() const; + MarkersType& get_markers(); + const StringType& get_fileName() const; + ANTLR_UINT32 get_fileNo() const; + ANTLR_UCHAR get_newlineChar() const; + ANTLR_UINT8 get_charByteSize() const; + ANTLR_UINT32 get_encoding() const; + + void set_data( DataType* data ); + void set_isAllocated( bool isAllocated ); + void set_nextChar( const DataType* nextChar ); + void set_sizeBuf( ANTLR_UINT32 sizeBuf ); + void set_line( ANTLR_UINT32 line ); + void set_currentLine( const DataType* currentLine ); + void set_charPositionInLine( ANTLR_INT32 charPositionInLine ); + void set_markDepth( ANTLR_UINT32 markDepth ); + void set_markers( const MarkersType& markers ); + void set_fileName( const StringType& fileName ); + void set_fileNo( ANTLR_UINT32 fileNo ); + void set_newlineChar( ANTLR_UCHAR newlineChar ); + void set_charByteSize( ANTLR_UINT8 charByteSize ); + void set_encoding( ANTLR_UINT32 encoding ); + + void inc_charPositionInLine(); + void inc_line(); + void inc_markDepth(); + + IntStreamType* get_istream(); + + /** Function that resets the input stream + */ + void reset(); + + /** Pointer to a function that reuses and resets an input stream by + * supplying a new 'source' + */ + void reuse(ANTLR_UINT8* inString, ANTLR_UINT32 size, ANTLR_UINT8* name); + + + /** Function to return the total size of the input buffer. 
For streams + * this may be just the total we have available so far. This means of course that + * the input stream must be careful to accumulate enough input so that any backtracking + * can be satisfied. + */ + ANTLR_UINT32 size(); + + /** Function to return a substring of the input stream. String is returned in allocated + * memory and is in same encoding as the input stream itself, NOT internal ANTLR_UCHAR form. + */ + StringType substr(ANTLR_MARKER start, ANTLR_MARKER stop); + + /** Function to return the current line number in the input stream + */ + ANTLR_UINT32 get_line(); + + /** Function to return the current line buffer in the input stream + * The pointer returned is directly into the input stream so you must copy + * it if you wish to manipulate it without damaging the input stream. Encoding + * is obviously in the same form as the input stream. + * \remark + * - Note that this function will be inaccurate if setLine is called as there + * is no way at the moment to position the input stream at a particular line + * number offset. + */ + const DataType* getLineBuf(); + + /** Function to return the current offset in the current input stream line + */ + ANTLR_UINT32 get_charPositionInLine(); + + /** Function to set the current position in the current line. + */ + void set_charPositionInLine(ANTLR_UINT32 position); + + /** Function to override the default newline character that the input stream + * looks for to trigger the line/offset and line buffer recording information. + * \remark + * - By default the character '\n' will be installed as the newline trigger character. When this + * character is seen by the consume() function then the current line number is incremented and the + * current line offset is reset to 0. The Pointer for the line of input we are consuming + * is updated to point to the next character after this one in the input stream (which means it + * may become invalid if the last newline character in the file is seen (so watch out). 
+ * - If for some reason you do not want the counters and pointers to be reset, you can set the + * character to some impossible character such as '\0' or whatever. + * - This is a single character only, so choose the last character in a sequence of two or more. + * - This is only a simple aid to error reporting - if you have a complicated binary input structure + * it may not be adequate, but you can always override every function in the input stream with your + * own of course, and can even write your own complete input stream set if you like. + * - It is your responsibility to set a valid character for the input stream type. There is no point + * setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never + * trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF + */ + void set_newLineChar(ANTLR_UINT32 newlineChar); + + ANTLR_MARKER index_impl(); + +private: + /** \brief Use the contents of an operating system file as the input + * for an input stream. + * + * \param fileName Name of operating system file to read. + * \return + * - Pointer to new input stream context upon success + * - One of the ANTLR3_ERR_ defines on error. + */ + void createFileStream(const ANTLR_UINT8* fileName); + + /** \brief Use the supplied 'string' as input to the stream + * + * \param data Pointer to the input data + * \return + * - Pointer to new input stream context upon success + * - NULL defines on error. + */ + void createStringStream(const ANTLR_UINT8* data); + void genericSetupStream(); + + /// Determine endianness of the input stream and install the + /// API required for the encoding in that format. + /// + void setupInputStream(); + +}; + +/** \brief Structure for tracking lex input states as part of mark() + * and rewind() of lexer. 
+ */ +template<class ImplTraits> +class LexState : public ImplTraits::AllocPolicyType +{ +public: + typedef typename ImplTraits::StreamDataType DataType; + +private: + /** Pointer to the next character to be consumed from the input data + * This is cast to point at the encoding of the original file that + * was read by the functions installed as pointer in this input stream + * context instance at file/string/whatever load time. + */ + const DataType* m_nextChar; + + /** The line number we are traversing in the input file. This gets incremented + * by a newline() call in the lexer grammar actions. + */ + ANTLR_UINT32 m_line; + + /** Pointer into the input buffer where the current line + * started. + */ + const DataType* m_currentLine; + + /** The offset within the current line of the current character + */ + ANTLR_INT32 m_charPositionInLine; + +public: + LexState(); + const DataType* get_nextChar() const; + ANTLR_UINT32 get_line() const; + const DataType* get_currentLine() const; + ANTLR_INT32 get_charPositionInLine() const; + void set_nextChar( const DataType* nextChar ); + void set_line( ANTLR_UINT32 line ); + void set_currentLine( const DataType* currentLine ); + void set_charPositionInLine( ANTLR_INT32 charPositionInLine ); +}; + +class ParseNullStringException : public std::exception +{ + virtual const char* what() const noexcept + { + return "Null String"; + } +}; + +} + +#include "antlr3input.inl" + +#endif /* _ANTLR_INPUT_HPP */ |