aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/antlr3_cpp_runtime/include/antlr3input.hpp
blob: 0db48958974c5a9961c5822a1896f66411905585 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
/** \file
 * Defines the basic structures used to manipulate character
 * streams from any input source. Any character size and encoding
 * can in theory be used, so long as a set of functions is provided that
 * can return a 32 bit Integer representation of their characters and efficiently mark and revert
 * to specific offsets into their input streams.
 */
#ifndef	_ANTLR_INPUT_HPP
#define	_ANTLR_INPUT_HPP

// [The "BSD licence"]
// Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB

//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
//    derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

namespace antlr3 {

/// Master context for an ANTLR3 runtime character input stream.
/// \ingroup apistructures
///
/// LT() is not part of this class: it is meant to be called only on the
/// input streams used by the parser / tree parser, which provide it
/// themselves. Trying to call it from a lexer is a compile-time error.
///
template<class ImplTraits>
class InputStream : public ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType >
{
public:
	using AllocPolicyType = typename ImplTraits::AllocPolicyType;
	using LexStateType    = typename ImplTraits::LexStateType;
	using IntStreamType   = typename ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType >;
	using BaseType        = IntStreamType;
	using UnitType        = typename ImplTraits::StreamDataType;
	using DataType        = UnitType;
	using TokenType       = UnitType;
	using MarkersType     = typename AllocPolicyType::template VectorType<LexStateType>;
	using StringType      = typename ImplTraits::StringType;

private:
	// NOTE: keep the data members in this order; the constructors are defined
	// outside this header and member order fixes initialisation order/layout.

	/// Start of the input data. Characters are addressed as offsets from
	/// here, in the encoding of the original input.
	const DataType* m_data;

	/// Next character to be consumed from the input data, expressed in the
	/// encoding of the source that was loaded (file, string, ...).
	const DataType* m_nextChar;

	/// Number of characters that can be consumed at this point in time.
	/// Usually this is what remains in the pre-read buffer; for a streaming
	/// source (a socket, say) special read code may have to wait for more.
	ANTLR_UINT32 m_sizeBuf;

	/// Line number currently being traversed in the input. Incremented by a
	/// newline() call in the lexer grammar actions.
	ANTLR_UINT32 m_line;

	/// Position in the input buffer at which the current line started.
	const DataType* m_currentLine;

	/// Offset of the current character within the current line.
	ANTLR_INT32 m_charPositionInLine;

	/// Nesting depth of outstanding mark() calls.
	ANTLR_UINT32 m_markDepth;

	/// The mark() points recorded for this input stream.
	MarkersType m_markers;

	/// Name of the file the input is associated with.
	StringType m_fileName;

	/// File number; must be set manually to a file index of your own devising.
	ANTLR_UINT32 m_fileNo;

	/// Character that automatically triggers an internal line count increment.
	ANTLR_UCHAR m_newlineChar;

	/// Size, in 8 bit units, of a single character. Surrogates are not
	/// handled (that would be slow and complicated). For a UTF-8 stream this
	/// field is set to 0. Working internally with 32 bit characters is
	/// generally the most efficient choice.
	ANTLR_UINT8 m_charByteSize;

	/// True when the data buffer was allocated by the stream itself and so
	/// should be freed when the stream dies.
	bool m_isAllocated;

	/// Encoding scheme used by this input stream.
	ANTLR_UINT32 m_encoding;

	/* API */
public:
	/// Construct a stream from a file name and an encoding.
	InputStream(const ANTLR_UINT8* fileName, ANTLR_UINT32 encoding);

	/// Construct a stream from an in-memory buffer of the given size, with an
	/// encoding and a name to associate with the stream.
	InputStream(const ANTLR_UINT8* data, ANTLR_UINT32 encoding, ANTLR_UINT32 size, ANTLR_UINT8* name);

	~InputStream();

	// ---- Plain accessors, one getter/setter pair per data member ----

	const DataType* get_data() const;
	void set_data( DataType* data );

	bool get_isAllocated() const;
	void set_isAllocated( bool isAllocated );

	const DataType* get_nextChar() const;
	void set_nextChar( const DataType* nextChar );

	ANTLR_UINT32 get_sizeBuf() const;
	void set_sizeBuf( ANTLR_UINT32 sizeBuf );

	ANTLR_UINT32 get_line() const;
	void set_line( ANTLR_UINT32 line );

	const DataType* get_currentLine() const;
	void set_currentLine( const DataType* currentLine );

	ANTLR_INT32 get_charPositionInLine() const;
	void set_charPositionInLine( ANTLR_INT32 charPositionInLine );

	ANTLR_UINT32 get_markDepth() const;
	void set_markDepth( ANTLR_UINT32 markDepth );

	/// Mutable access to the mark list (note: not a const member).
	MarkersType& get_markers();
	void set_markers( const MarkersType& markers );

	const StringType& get_fileName() const;
	void set_fileName( const StringType& fileName );

	ANTLR_UINT32 get_fileNo() const;
	void set_fileNo( ANTLR_UINT32 fileNo );

	ANTLR_UCHAR get_newlineChar() const;
	void set_newlineChar( ANTLR_UCHAR newlineChar );

	ANTLR_UINT8 get_charByteSize() const;
	void set_charByteSize( ANTLR_UINT8 charByteSize );

	ANTLR_UINT32 get_encoding() const;
	void set_encoding( ANTLR_UINT32 encoding );

	// ---- Counter bumps ----

	void inc_charPositionInLine();
	void inc_line();
	void inc_markDepth();

	/// Access this object through its IntStreamType base.
	IntStreamType* get_istream();

	/// Reset the input stream.
	void reset();

	/// Reuse and reset this input stream by supplying a new 'source'.
	void reuse(ANTLR_UINT8* inString, ANTLR_UINT32 size, ANTLR_UINT8* name);

	/// Total size of the input buffer. For streaming sources this may only be
	/// the total available so far, so such a stream must take care to
	/// accumulate enough input for any backtracking to be satisfied.
	ANTLR_UINT32 size();

	/// Return a substring of the input stream. The result is in the same
	/// encoding as the input stream itself, NOT the internal ANTLR_UCHAR form.
	StringType substr(ANTLR_MARKER start, ANTLR_MARKER stop);

	/// Current line number in the input stream (non-const overload).
	ANTLR_UINT32 get_line();

	/// Current line buffer in the input stream.
	///
	/// The returned pointer refers directly into the input stream, so copy the
	/// data if you need to manipulate it without damaging the stream. The
	/// encoding is that of the input stream.
	/// \remark
	///   - This function will be inaccurate if setLine is called, because
	///     there is currently no way to position the input stream at a
	///     particular line number offset.
	const DataType* getLineBuf();

	/// Current offset within the current input line (non-const overload,
	/// unsigned result).
	ANTLR_UINT32 get_charPositionInLine();

	/// Set the current position within the current line (unsigned overload).
	void set_charPositionInLine(ANTLR_UINT32 position);

	/// Override the newline character that the input stream looks for in
	/// order to trigger the line/offset and line-buffer bookkeeping.
	/// \remark
	///   - By default the character '\n' is installed as the newline trigger.
	///     When consume() sees it, the current line number is incremented and
	///     the current line offset is reset to 0. The pointer to the line
	///     being consumed is moved to the character after the newline, which
	///     means it may become invalid when the last newline character in the
	///     file is seen (so watch out).
	///   - If you do not want the counters and pointers to be reset, set the
	///     character to some impossible value such as '\0'.
	///   - Only a single character is supported, so choose the last character
	///     of a sequence of two or more.
	///   - This is only a simple aid to error reporting. For a complicated
	///     binary input structure it may not be adequate; in that case you can
	///     override the input stream functions or write your own input stream.
	///   - It is your responsibility to pick a character that is valid for the
	///     input stream type. Setting 0xFFFFFFFF on an 8 bit ASCII stream is
	///     pointless: the value is truncated and never triggers, because the
	///     comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF.
	void set_newLineChar(ANTLR_UINT32 newlineChar);

	ANTLR_MARKER index_impl();

private:
	/// \brief Use the contents of an operating system file as the input
	///        for this input stream.
	///
	/// \param fileName Name of operating system file to read.
	///
	/// Returns nothing; how failures are reported is decided by the
	/// out-of-line definition, which is not part of this header.
	void createFileStream(const ANTLR_UINT8* fileName);

	/// \brief Use the supplied 'string' as input to the stream.
	///
	/// \param data Pointer to the input data.
	///
	/// Returns nothing; how failures are reported is decided by the
	/// out-of-line definition, which is not part of this header.
	void createStringStream(const ANTLR_UINT8* data);

	void genericSetupStream();

	/// Determine the endianness of the input stream and install the API
	/// required for the encoding in that format.
	void setupInputStream();
};

/// \brief Records a lexer input state so that mark() and rewind() of the
///        lexer can save and later restore a position.
template<class ImplTraits>
class LexState : public ImplTraits::AllocPolicyType
{
public:
	using DataType = typename ImplTraits::StreamDataType;

private:
	// NOTE: keep the data members in this order; the constructor is defined
	// outside this header and member order fixes initialisation order/layout.

	/// Next character to be consumed from the input data, expressed in the
	/// encoding of the source that was loaded (file, string, ...).
	const DataType* m_nextChar;

	/// Line number being traversed in the input. Incremented by a newline()
	/// call in the lexer grammar actions.
	ANTLR_UINT32 m_line;

	/// Position in the input buffer at which the current line started.
	const DataType* m_currentLine;

	/// Offset of the current character within the current line.
	ANTLR_INT32 m_charPositionInLine;

public:
	LexState();

	// One getter/setter pair per recorded field.

	const DataType* get_nextChar() const;
	void set_nextChar( const DataType* nextChar );

	ANTLR_UINT32 get_line() const;
	void set_line( ANTLR_UINT32 line );

	const DataType* get_currentLine() const;
	void set_currentLine( const DataType* currentLine );

	ANTLR_INT32 get_charPositionInLine() const;
	void set_charPositionInLine( ANTLR_INT32 charPositionInLine );
};

/// Exception whose message is the fixed text "Null String".
///
/// NOTE(review): presumably thrown when a null string reaches the parser or
/// input stream; the throw site is not in this header, so confirm there.
///
/// what() is public so the message can be read from a
/// ParseNullStringException object directly, not only through a
/// std::exception reference. It is marked `override` so the compiler checks
/// the signature against std::exception::what().
class ParseNullStringException : public std::exception
{
public:
	/// \return The constant, NUL-terminated message "Null String".
	const char* what() const noexcept override
	{
		return "Null String";
	}
};

}

#include "antlr3input.inl"

#endif	/* _ANTLR_INPUT_HPP */