aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/xml/document/xml-textreader.h
blob: ab4c329d26d291e5cabe1f493e115d07636e3438 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
#pragma once

#include "xml-document.h"
#include "xml-options.h"

#include <contrib/libs/libxml/include/libxml/xmlreader.h>

#include <library/cpp/string_utils/ztstrbuf/ztstrbuf.h>

#include <util/generic/noncopyable.h>
#include <util/generic/ptr.h>
#include <util/generic/strbuf.h>
#include <util/generic/string.h>
#include <functional>
#include <util/stream/input.h>
#include <util/stream/str.h>

namespace NXml {
    /**
     * TextReader Parser
     *
     * A streaming XML API modeled on C# interfaces.
     * Provides fast, non-cached, forward-only access to XML data.
     *
     * Like the SAX parser, the TextReader parser is suitable for sequential
     * parsing, but instead of implementing handlers for specific parts of the
     * document, it allows you to detect the current node type, process the node
     * accordingly, and skip forward in the document as much as necessary.
     *
     * Unlike the DOM parser, you may not move backwards in the XML document.
     * And unlike the SAX parser, you do not have to waste time processing nodes that do not
     * interest you.
     *
     * All methods are on the single parser instance, but their result depends on the current context.
     * For instance, use Read() to move to the next node, MoveToFirstAttribute() and MoveToNextAttribute()
     * to step through the attributes of the current node, and MoveToElement() to return to the node that
     * contains the current attribute. These methods return false when there is nothing further to move to.
     * Then use methods such as GetName() and GetValue() to examine the elements and their attributes.
     *
     * This wrapper is inspired by TextReader from libxml++.
     */

    // Streaming XML reader that wraps a libxml xmlTextReader (held in Impl) and takes its
    // input from an IInputStream. Privately inherits TNonCopyable.
    class TTextReader: private TNonCopyable {
    public:
        // Strongly-typed mirror of the XML_READER_TYPE_* constants from xmlreader.h.
        // Returned by GetNodeType().
        enum class ENodeType : int {
            // clang-format off
            Attribute             = XML_READER_TYPE_ATTRIBUTE,
            CDATA                 = XML_READER_TYPE_CDATA,
            Comment               = XML_READER_TYPE_COMMENT,
            Document              = XML_READER_TYPE_DOCUMENT,
            DocumentFragment      = XML_READER_TYPE_DOCUMENT_FRAGMENT,
            DocumentType          = XML_READER_TYPE_DOCUMENT_TYPE,
            Element               = XML_READER_TYPE_ELEMENT,
            EndElement            = XML_READER_TYPE_END_ELEMENT,
            EndEntity             = XML_READER_TYPE_END_ENTITY,
            Entity                = XML_READER_TYPE_ENTITY,
            EntityReference       = XML_READER_TYPE_ENTITY_REFERENCE,
            None                  = XML_READER_TYPE_NONE,
            Notation              = XML_READER_TYPE_NOTATION,
            ProcessingInstruction = XML_READER_TYPE_PROCESSING_INSTRUCTION,
            SignificantWhitespace = XML_READER_TYPE_SIGNIFICANT_WHITESPACE,
            Text                  = XML_READER_TYPE_TEXT,
            Whitespace            = XML_READER_TYPE_WHITESPACE,
            XmlDeclaration        = XML_READER_TYPE_XML_DECLARATION,
            // clang-format on
        };

        // Strongly-typed mirror of the XML_TEXTREADER_MODE_* constants from xmlreader.h.
        // Returned by GetReadState().
        enum class EReadState : int {
            // clang-format off
            Closed      = XML_TEXTREADER_MODE_CLOSED,
            EndOfFile   = XML_TEXTREADER_MODE_EOF,
            Error       = XML_TEXTREADER_MODE_ERROR,
            Initial     = XML_TEXTREADER_MODE_INITIAL,
            Interactive = XML_TEXTREADER_MODE_INTERACTIVE,
            Reading     = XML_TEXTREADER_MODE_READING,
            // clang-format on
        };

    public:
        /**
         * Creates a reader over the given input stream.
         * @param stream the stream to read XML from. The class keeps an IInputStream reference
         *        member, so the stream presumably has to outlive the reader
         *        (NOTE(review): confirm against the implementation).
         * @param options parser options; a default-constructed TOptions is used when omitted
         */
        TTextReader(IInputStream& stream, const TOptions& options = TOptions());
        ~TTextReader();

        /**
         * Moves the position of the current instance to the next node in the stream, exposing its properties.
         * @return true if the node was read successfully, false if there are no more nodes to read
         */
        bool Read();

        /**
         * Reads the contents of the current node, including child nodes and markup.
         * @return A string containing the XML content, or an empty string
         *         if the current node is neither an element nor attribute, or has no child nodes
         */
        TString ReadInnerXml() const;

        /**
         * Reads the current node and its contents, including child nodes and markup.
         * @return A string containing the XML content, or an empty string
         *         if the current node is neither an element nor attribute
         */
        TString ReadOuterXml() const;

        /**
         * Reads the contents of an element or a text node as a string.
         * @return A string containing the contents of the Element or Text node,
         *         or an empty string if the reader is positioned on any other type of node
         */
        TString ReadString() const;

        /**
         * Parses an attribute value into one or more Text and EntityReference nodes.
         * @return A bool where true indicates the attribute value was parsed,
         *         and false indicates the reader was not positioned on an attribute node
         *         or all the attribute values have been read
         */
        bool ReadAttributeValue() const;

        /**
         * Gets the number of attributes on the current node.
         * @return The number of attributes on the current node, or zero if the current node
         *         does not support attributes
         */
        int GetAttributeCount() const;

        /**
         * Gets the base Uniform Resource Identifier (URI) of the current node.
         * @return The base URI of the current node or an empty string if not available
         */
        TStringBuf GetBaseUri() const;

        /**
         * Gets the depth of the current node in the XML document.
         * @return The depth of the current node in the XML document
         */
        int GetDepth() const;

        /**
         * Gets a value indicating whether the current node has any attributes.
         * @return true if the current node has attributes, false otherwise
         */
        bool HasAttributes() const;

        /**
         * Whether the node can have a text value.
         * @return true if the current node can have an associated text value, false otherwise
         */
        bool HasValue() const;

        /**
         * Whether an Attribute node was generated from the default value defined in the DTD or schema.
         * @return true if defaulted, false otherwise
         */
        bool IsDefault() const;

        /**
         * Check if the current node is empty.
         * @return true if empty, false otherwise
         */
        bool IsEmptyElement() const;

        /**
         * The local name of the node.
         * @return the local name or empty string if not available
         */
        TStringBuf GetLocalName() const;

        /**
         * The qualified name of the node, equal to Prefix:LocalName.
         * @return the name or empty string if not available
         */
        TStringBuf GetName() const;

        /**
         * The URI defining the namespace associated with the node.
         * @return the namespace URI or empty string if not available
         */
        TStringBuf GetNamespaceUri() const;

        /**
         * Get the node type of the current node.
         * @return the ENodeType of the current node
         */
        ENodeType GetNodeType() const;

        /**
         * Get the namespace prefix associated with the current node.
         * @return the namespace prefix, or an empty string if not available
         */
        TStringBuf GetPrefix() const;

        /**
         * Get the quotation mark character used to enclose the value of an attribute.
         * @return " or '
         */
        char GetQuoteChar() const;

        /**
         * Provides the text value of the node if present.
         * @return the string or empty if not available
         */
        TStringBuf GetValue() const;

        /**
         * Gets the read state of the reader.
         * @return the state value
         */
        EReadState GetReadState() const;

        /**
         * Releases any resources allocated by the current instance,
         * changes the state to Closed, and closes any underlying input.
         */
        void Close();

        /**
         * Provides the value of the attribute with the specified index relative to the containing element.
         * @param number the zero-based index of the attribute relative to the containing element
         * @return the attribute value
         */
        TString GetAttribute(int number) const;

        /**
         * Provides the value of the attribute with the specified qualified name.
         * @param name the qualified name of the attribute
         * @return the attribute value
         */
        TString GetAttribute(TZtStringBuf name) const;

        /**
         * Provides the value of the specified attribute.
         * @param localName the local name of the attribute
         * @param nsUri the namespace URI of the attribute
         * @return the attribute value
         */
        TString GetAttribute(TZtStringBuf localName, TZtStringBuf nsUri) const;

        /**
         * Resolves a namespace prefix in the scope of the current element.
         * @param prefix the prefix whose namespace URI is to be resolved. To return the default namespace, specify empty string.
         * @return a string containing the namespace URI to which the prefix maps.
         */
        TString LookupNamespace(TZtStringBuf prefix) const;

        /**
         * Moves the position of the current instance to the attribute with the specified index relative to the containing element.
         * @param number the zero-based index of the attribute relative to the containing element
         * @return true in case of success, false if not found
         */
        bool MoveToAttribute(int number);

        /**
         * Moves the position of the current instance to the attribute with the specified qualified name.
         * @param name the qualified name of the attribute
         * @return true in case of success, false if not found
         */
        bool MoveToAttribute(TZtStringBuf name);

        /**
         * Moves the position of the current instance to the attribute with the specified local name and namespace URI.
         * @param localName the local name of the attribute
         * @param nsUri the namespace URI of the attribute
         * @return true in case of success, false if not found
         */
        bool MoveToAttribute(TZtStringBuf localName, TZtStringBuf nsUri);

        /**
         * Moves the position of the current instance to the first attribute associated with the current node.
         * @return true in case of success, false if not found
         */
        bool MoveToFirstAttribute();

        /**
         * Moves the position of the current instance to the next attribute associated with the current node.
         * @return true in case of success, false if not found
         */
        bool MoveToNextAttribute();

        /**
         * Moves the position of the current instance to the node that contains the current Attribute node.
         * @return true in case of success, false if not found
         */
        bool MoveToElement();

        /**
         * Reads the contents of the current node and the full subtree. It then makes the subtree available until the next Read() call.
         * @return a TConstNode for the expanded current node; per the description above it should not be
         *         used after the next Read() call
         */
        TConstNode Expand() const;

        /**
         * Skip to the node following the current one in document order while avoiding the subtree if any.
         * @return true if the node was read successfully, false if there are no more nodes to read
         */
        bool Next();

        /**
         * Retrieve the validity status from the parser context.
         * @return true if the parser context reports the document as valid, false otherwise
         */
        bool IsValid() const;

    private:
        // Static callbacks with C-compatible signatures.
        // NOTE(review): judging by the names and parameters, the first one supplies libxml with up to
        // `len` bytes from the input stream and the second one receives libxml error reports;
        // where and how they are registered is in the implementation file — confirm there.
        static int ReadFromInputStreamCallback(void* context, char* buffer, int len);
        static void OnLibxmlError(void* arg, const char* msg, xmlParserSeverities severity, xmlTextReaderLocatorPtr locator);

        // Error-reporting helpers. All but SetupErrorHandler() are const, which is why the error
        // state below (IsError, ErrorBuffer) is declared mutable.
        void SetupErrorHandler();
        TStringStream& LogError() const;
        void CheckForExceptions() const;
        void ThrowException() const;

        // helpers that check return codes of C functions from libxml
        // and convert the raw result to the corresponding C++ return type
        bool BoolResult(int value) const;
        int IntResult(int value) const;
        char CharResult(int value) const;
        TStringBuf ConstStringResult(const xmlChar* value) const;
        TStringBuf ConstStringOrEmptyResult(const xmlChar* value) const;
        TString TempStringResult(TCharPtr value) const;
        TString TempStringOrEmptyResult(TCharPtr value) const;

    private:
        // Input stream the reader was constructed with; held by reference, not owned.
        IInputStream& Stream;

        // Error state; mutable so that const member functions can update it.
        mutable bool IsError;
        mutable TStringStream ErrorBuffer;

        // Owning holder for the underlying libxml reader; TDeleter is only forward-declared here
        // and defined elsewhere.
        struct TDeleter;
        THolder<xmlTextReader, TDeleter> Impl;
    };

}