aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/xml/document/xml-textreader.h
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/xml/document/xml-textreader.h
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/xml/document/xml-textreader.h')
-rw-r--r--library/cpp/xml/document/xml-textreader.h325
1 files changed, 325 insertions, 0 deletions
diff --git a/library/cpp/xml/document/xml-textreader.h b/library/cpp/xml/document/xml-textreader.h
new file mode 100644
index 0000000000..ab4c329d26
--- /dev/null
+++ b/library/cpp/xml/document/xml-textreader.h
@@ -0,0 +1,325 @@
+#pragma once
+
+#include "xml-document.h"
+#include "xml-options.h"
+
+#include <contrib/libs/libxml/include/libxml/xmlreader.h>
+
+#include <library/cpp/string_utils/ztstrbuf/ztstrbuf.h>
+
+#include <util/generic/noncopyable.h>
+#include <util/generic/ptr.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <functional>
+#include <util/stream/input.h>
+#include <util/stream/str.h>
+
+namespace NXml {
+ /**
+ * TextReader Parser
+ *
+ * A streaming XML API based on C# interfaces.
+ * Provides fast, non-cached, forward-only access to XML data.
+ *
+ * Like the SAX parser, the TextReader parser is suitable for sequential
+ * parsing, but instead of implementing handlers for specific parts of the
+ * document, it allows you to detect the current node type, process the node
+ * accordingly, and skip forward in the document as much as necessary.
+ *
+ * Unlike the DOM parser, you may not move backwards in the XML document.
+ * And unlike the SAX parser, you need not waste time processing nodes that do not
+ * interest you.
+ *
+ * All methods are on the single parser instance, but their result depends on the current context.
+ * For instance, use Read() to move to the next node, and MoveToElement() to return from an attribute to the element that contains it.
+ * These methods will return false when no more nodes are available. Then use
+ * methods such as GetName() and GetValue() to examine the elements and their attributes.
+ *
+ * This wrapper is inspired by TextReader from libxml++.
+ */
+
+ class TTextReader: private TNonCopyable {
+ public:
+ // strongly-typed alias for enum from xmlreader.h
+ enum class ENodeType : int {
+ // clang-format off
+ Attribute = XML_READER_TYPE_ATTRIBUTE,
+ CDATA = XML_READER_TYPE_CDATA,
+ Comment = XML_READER_TYPE_COMMENT,
+ Document = XML_READER_TYPE_DOCUMENT,
+ DocumentFragment = XML_READER_TYPE_DOCUMENT_FRAGMENT,
+ DocumentType = XML_READER_TYPE_DOCUMENT_TYPE,
+ Element = XML_READER_TYPE_ELEMENT,
+ EndElement = XML_READER_TYPE_END_ELEMENT,
+ EndEntity = XML_READER_TYPE_END_ENTITY,
+ Entity = XML_READER_TYPE_ENTITY,
+ EntityReference = XML_READER_TYPE_ENTITY_REFERENCE,
+ None = XML_READER_TYPE_NONE,
+ Notation = XML_READER_TYPE_NOTATION,
+ ProcessingInstruction = XML_READER_TYPE_PROCESSING_INSTRUCTION,
+ SignificantWhitespace = XML_READER_TYPE_SIGNIFICANT_WHITESPACE,
+ Text = XML_READER_TYPE_TEXT,
+ Whitespace = XML_READER_TYPE_WHITESPACE,
+ XmlDeclaration = XML_READER_TYPE_XML_DECLARATION,
+ // clang-format on
+ };
+
+ enum class EReadState : int { // strongly-typed alias for the XML_TEXTREADER_MODE_* constants
+ // clang-format off
+ Closed = XML_TEXTREADER_MODE_CLOSED,
+ EndOfFile = XML_TEXTREADER_MODE_EOF,
+ Error = XML_TEXTREADER_MODE_ERROR,
+ Initial = XML_TEXTREADER_MODE_INITIAL,
+ Interactive = XML_TEXTREADER_MODE_INTERACTIVE,
+ Reading = XML_TEXTREADER_MODE_READING,
+ // clang-format on
+ };
+
+ public:
+ TTextReader(IInputStream& stream, const TOptions& options = TOptions());
+ ~TTextReader();
+
+ /**
+ * Moves the position of the current instance to the next node in the stream, exposing its properties.
+ * @return true if the node was read successfully, false if there are no more nodes to read
+ */
+ bool Read();
+
+ /**
+ * Reads the contents of the current node, including child nodes and markup.
+ * @return A string containing the XML content, or an empty string
+ * if the current node is neither an element nor attribute, or has no child nodes
+ */
+ TString ReadInnerXml() const;
+
+ /**
+ * Reads the current node and its contents, including child nodes and markup.
+ * @return A string containing the XML content, or an empty string
+ * if the current node is neither an element nor attribute
+ */
+ TString ReadOuterXml() const;
+
+ /**
+ * Reads the contents of an element or a text node as a string.
+ * @return A string containing the contents of the Element or Text node,
+ * or an empty string if the reader is positioned on any other type of node
+ */
+ TString ReadString() const;
+
+ /**
+ * Parses an attribute value into one or more Text and EntityReference nodes.
+ * @return A bool where true indicates the attribute value was parsed,
+ * and false indicates the reader was not positioned on an attribute node
+ * or all the attribute values have been read
+ */
+ bool ReadAttributeValue() const;
+
+ /**
+ * Gets the number of attributes on the current node.
+ * @return The number of attributes on the current node, or zero if the current node
+ * does not support attributes
+ */
+ int GetAttributeCount() const;
+
+ /**
+ * Gets the base Uniform Resource Identifier (URI) of the current node.
+ * @return The base URI of the current node or an empty string if not available
+ */
+ TStringBuf GetBaseUri() const;
+
+ /**
+ * Gets the depth of the current node in the XML document.
+ * @return The depth of the current node in the XML document
+ */
+ int GetDepth() const;
+
+ /**
+ * Gets a value indicating whether the current node has any attributes.
+ * @return true if the current node has attributes, false otherwise
+ */
+ bool HasAttributes() const;
+
+ /**
+ * Whether the node can have a text value.
+ * @return true if the current node can have an associated text value, false otherwise
+ */
+ bool HasValue() const;
+
+ /**
+ * Whether an Attribute node was generated from the default value defined in the DTD or schema.
+ * @return true if defaulted, false otherwise
+ */
+ bool IsDefault() const;
+
+ /**
+ * Check if the current node is empty.
+ * @return true if empty, false otherwise
+ */
+ bool IsEmptyElement() const;
+
+ /**
+ * The local name of the node.
+ * @return the local name or empty string if not available
+ */
+ TStringBuf GetLocalName() const;
+
+ /**
+ * The qualified name of the node, equal to Prefix:LocalName.
+ * @return the name or empty string if not available
+ */
+ TStringBuf GetName() const;
+
+ /**
+ * The URI defining the namespace associated with the node.
+ * @return the namespace URI or empty string if not available
+ */
+ TStringBuf GetNamespaceUri() const;
+
+ /**
+ * Get the node type of the current node.
+ * @return the ENodeType of the current node
+ */
+ ENodeType GetNodeType() const;
+
+ /**
+ * Get the namespace prefix associated with the current node.
+ * @return the namespace prefix, or an empty string if not available
+ */
+ TStringBuf GetPrefix() const;
+
+ /**
+ * Get the quotation mark character used to enclose the value of an attribute.
+ * @return " or '
+ */
+ char GetQuoteChar() const;
+
+ /**
+ * Provides the text value of the node if present.
+ * @return the string or empty if not available
+ */
+ TStringBuf GetValue() const;
+
+ /**
+ * Gets the read state of the reader.
+ * @return the state value
+ */
+ EReadState GetReadState() const;
+
+ /**
+ * Releases any resources allocated by the current instance,
+ * changes the state to Closed and closes any underlying input.
+ */
+ void Close();
+
+ /**
+ * Provides the value of the attribute with the specified index relative to the containing element.
+ * @param number the zero-based index of the attribute relative to the containing element
+ */
+ TString GetAttribute(int number) const;
+
+ /**
+ * Provides the value of the attribute with the specified qualified name.
+ * @param name the qualified name of the attribute
+ */
+ TString GetAttribute(TZtStringBuf name) const;
+
+ /**
+ * Provides the value of the specified attribute.
+ * @param localName the local name of the attribute
+ * @param nsUri the namespace URI of the attribute
+ */
+ TString GetAttribute(TZtStringBuf localName, TZtStringBuf nsUri) const;
+
+ /**
+ * Resolves a namespace prefix in the scope of the current element.
+ * @param prefix the prefix whose namespace URI is to be resolved. To return the default namespace, specify an empty string.
+ * @return a string containing the namespace URI to which the prefix maps.
+ */
+ TString LookupNamespace(TZtStringBuf prefix) const;
+
+ /**
+ * Moves the position of the current instance to the attribute with the specified index relative to the containing element.
+ * @param number the zero-based index of the attribute relative to the containing element
+ * @return true in case of success, false if not found
+ */
+ bool MoveToAttribute(int number);
+
+ /**
+ * Moves the position of the current instance to the attribute with the specified qualified name.
+ * @param name the qualified name of the attribute
+ * @return true in case of success, false if not found
+ */
+ bool MoveToAttribute(TZtStringBuf name);
+
+ /**
+ * Moves the position of the current instance to the attribute with the specified local name and namespace URI.
+ * @param localName the local name of the attribute
+ * @param nsUri the namespace URI of the attribute
+ * @return true in case of success, false if not found
+ */
+ bool MoveToAttribute(TZtStringBuf localName, TZtStringBuf nsUri);
+
+ /**
+ * Moves the position of the current instance to the first attribute associated with the current node.
+ * @return true in case of success, false if not found
+ */
+ bool MoveToFirstAttribute();
+
+ /**
+ * Moves the position of the current instance to the next attribute associated with the current node.
+ * @return true in case of success, false if not found
+ */
+ bool MoveToNextAttribute();
+
+ /**
+ * Moves the position of the current instance to the node that contains the current Attribute node.
+ * @return true in case of success, false if not found
+ */
+ bool MoveToElement();
+
+ /**
+ * Reads the contents of the current node and the full subtree. It then makes the subtree available until the next Read() call.
+ */
+ TConstNode Expand() const;
+
+ /**
+ * Skip to the node following the current one in document order while avoiding the subtree if any.
+ * @return true if the node was read successfully, false if there are no more nodes to read
+ */
+ bool Next();
+
+ /**
+ * Retrieve the validity status from the parser context.
+ */
+ bool IsValid() const;
+
+ private:
+ static int ReadFromInputStreamCallback(void* context, char* buffer, int len);
+ static void OnLibxmlError(void* arg, const char* msg, xmlParserSeverities severity, xmlTextReaderLocatorPtr locator);
+
+ void SetupErrorHandler();
+ TStringStream& LogError() const;
+ void CheckForExceptions() const;
+ void ThrowException() const;
+
+ // helpers that check return codes of C functions from libxml
+ bool BoolResult(int value) const;
+ int IntResult(int value) const;
+ char CharResult(int value) const;
+ TStringBuf ConstStringResult(const xmlChar* value) const;
+ TStringBuf ConstStringOrEmptyResult(const xmlChar* value) const;
+ TString TempStringResult(TCharPtr value) const;
+ TString TempStringOrEmptyResult(TCharPtr value) const;
+
+ private:
+ IInputStream& Stream;
+
+ mutable bool IsError;
+ mutable TStringStream ErrorBuffer;
+
+ struct TDeleter;
+ THolder<xmlTextReader, TDeleter> Impl;
+ };
+
+}