path: root/library/cpp/xml/document
diff options
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/xml/document
intermediate changes
Diffstat (limited to 'library/cpp/xml/document')
15 files changed, 2790 insertions, 0 deletions
diff --git a/library/cpp/xml/document/README b/library/cpp/xml/document/README
new file mode 100644
index 0000000000..b2649523d8
--- /dev/null
+++ b/library/cpp/xml/document/README
@@ -0,0 +1,42 @@
+A wrapper around the DOM interface of libxml2.
+The standard way to use it is as follows:
+ #include <library/cpp/xml/document/xml-document.h>
+ ...
+ // open a document
+ NXml::TDocument xml("filename.xml");
+ // get a nodeset from an XPath query
+ NXml::TConstNodes nodes = xml.Root().Nodes("xpath/expression/here");
+ // iterate over the nodeset
+ for (size_t i = 0; i < nodes.size(); ++i) {
+ using namespace NXml;
+ TConstNode node = nodes[i];
+ // query node
+ TString name = node.Name();
+ TString lang = node.Attr<TString>("lang");
+ TString text = node.Value<TString>();
+ TConstNode child = node.FirstChild("");
+ // edit node
+ TNode editable = child.ConstCast();
+ editable.DelAttr("id");
+ editable.SetAttr("x", 2);
+ editable.SetValue(5);
+ editable.AddText(" apples");
+ }
+ // edit documents with copy-paste
+ NXml::TDocument xml2("<xpath><node/></xpath>", NXml::TDocument::String);
+ NXml::TNode place = xml2.Root().Node("xpath/node");
+ // copy node's subtree from one document to another
+ place.AddChild(xml.Root());
+ // save (render) single element
+ TString modifiedNode = place.ToString();
+ // save whole document with optional encoding
+ TString modifiedDoc = xml2.ToString("ISO-8859-1");
+See xml-document_ut.cpp for more examples.
diff --git a/library/cpp/xml/document/libxml-guards.h b/library/cpp/xml/document/libxml-guards.h
new file mode 100644
index 0000000000..4188cecff1
--- /dev/null
+++ b/library/cpp/xml/document/libxml-guards.h
@@ -0,0 +1,50 @@
+#pragma once
+#include <library/cpp/xml/init/ptr.h>
+#include <util/generic/ptr.h>
+#include <libxml/xmlstring.h>
+#include <libxml/tree.h>
+#include <libxml/xpath.h>
+#include <libxml/uri.h>
+#include <libxml/xmlsave.h>
+namespace NXml {
+ namespace NDetail { // Destroy() policies, each releasing one kind of handle through the matching libxml2 call
+ struct TSignedCharPtrTraits { // policy for plain char* buffers
+ static void Destroy(char* handle) {
+ xmlFree(handle);
+ }
+ };
+ struct TCharPtrTraits { // policy for xmlChar* buffers
+ static void Destroy(xmlChar* handle) {
+ xmlFree(handle);
+ }
+ };
+ struct TOutputBufferPtrTraits { // policy for output buffers: closes the buffer
+ static void Destroy(xmlOutputBufferPtr handle) {
+ xmlOutputBufferClose(handle);
+ }
+ };
+ struct TSaveCtxtPtrTraits { // policy for save contexts: closes the context
+ static void Destroy(xmlSaveCtxtPtr handle) {
+ xmlSaveClose(handle);
+ }
+ };
+ }
+ typedef TxmlXPathContextPtr TXPathContextPtr; // NOTE(review): the Txml* names are not declared in this header; presumably they come from library/cpp/xml/init/ptr.h - confirm
+ typedef TxmlXPathObjectPtr TXPathObjectPtr;
+ typedef TAutoPtr<char, NDetail::TSignedCharPtrTraits> TSignedCharPtr; // char* paired with the xmlFree-based policy
+ typedef TAutoPtr<xmlChar, NDetail::TCharPtrTraits> TCharPtr; // xmlChar* paired with the xmlFree-based policy
+ typedef TxmlDocHolder TDocHolder;
+ typedef TxmlURIPtr TURIPtr;
+ typedef TxmlNodePtr TNodePtr;
+ typedef TAutoPtr<xmlOutputBuffer, NDetail::TOutputBufferPtrTraits> TOutputBufferPtr; // output buffer paired with the xmlOutputBufferClose policy
+ typedef TxmlParserCtxtPtr TParserCtxtPtr;
+ typedef TAutoPtr<xmlSaveCtxt, NDetail::TSaveCtxtPtrTraits> TSaveCtxtPtr; // save context paired with the xmlSaveClose policy
diff --git a/library/cpp/xml/document/node-attr.h b/library/cpp/xml/document/node-attr.h
new file mode 100644
index 0000000000..6e74403943
--- /dev/null
+++ b/library/cpp/xml/document/node-attr.h
@@ -0,0 +1,209 @@
+#pragma once
+#include "xml-document-decl.h"
+#include "libxml-guards.h"
+#include <util/stream/str.h>
+#include <util/string/cast.h>
+namespace NXml {
+#define THROW(x, y) ythrow yexception() << #x << ": " << y // throws a yexception whose message is "<x>: <y>"; x is only stringized (#x), it is not an exception type
+ // libxml2 defines xmlChar as unsigned char and uses xmlChar in all of its functions,
+ // so this helper reinterprets an xmlChar string as a char string; nothing is copied.
+ inline static const char* CAST2CHAR(const xmlChar* x) {
+ return reinterpret_cast<const char*>(x);
+ }
+ inline static const xmlChar* XMLCHAR(const char* x) { // inverse of CAST2CHAR: view a char string as an xmlChar string; nothing is copied
+ return reinterpret_cast<const xmlChar*>(x);
+ }
+ /**
+  * Convert a string obtained from libxml2 into T using FromString<T>.
+  * @param value: guard holding the source string; a NULL pointer is treated as an empty string
+  * @param res: receives the converted value
+  * @param errContext: attribute or node name, used only in the error message
+  * throws yexception ("XmlException: ...") if the string cannot be converted
+  */
+ template <class T>
+ void TNode::AttrInternal(TCharPtr& value, T& res, TStringBuf errContext) const {
+     // Value() passes the result of xmlNodeGetContent() here without a NULL check,
+     // so guard against NULL instead of handing a null pointer to FromString.
+     const char* raw = value.Get() != nullptr ? CAST2CHAR(value.Get()) : "";
+     try {
+         res = FromString<T>(raw);
+     } catch (const TFromStringException&) {
+         // Only the first 50 characters are quoted to keep the message bounded.
+         THROW(XmlException, "Failed to convert string " << TString{TStringBuf(raw).substr(0, 50)}.Quote() << " from '" << errContext << "' to requested type");
+     }
+ }
+ /**
+  * TString specialization: the text is copied as is, no parsing is involved,
+  * so errContext is unused.
+  */
+ template <>
+ inline void TNode::AttrInternal(TCharPtr& value, TString& res, TStringBuf /*errContext*/) const {
+     // Build the new string first, then exchange it with the output.
+     TString(CAST2CHAR(value.Get())).swap(res);
+ }
+ /**
+  * Read attribute `name` and convert it to T.
+  * throws yexception ("AttributeNotFound: <path>@<name>") if the attribute is absent
+  */
+ template <class T>
+ T TNode::Attr(TZtStringBuf name) const {
+     TCharPtr raw(xmlGetProp(NodePointer, XMLCHAR(name.c_str())));
+     if (!raw) {
+         THROW(AttributeNotFound, Path() << "@" << name);
+     }
+     T parsed;
+     AttrInternal(raw, parsed, name);
+     return parsed;
+ }
+ /**
+  * Read attribute `name` and convert it to T;
+  * returns defvalue when the attribute is absent.
+  */
+ template <class T>
+ T TNode::Attr(TZtStringBuf name, const T& defvalue) const {
+     TCharPtr raw(xmlGetProp(NodePointer, XMLCHAR(name.c_str())));
+     if (!raw) {
+         return defvalue;
+     }
+     T parsed;
+     AttrInternal(raw, parsed, name);
+     return parsed;
+ }
+ /**
+  * Read attribute `name` into `value`.
+  * @param name: attribute name
+  * @param value: receives the converted attribute value
+  * throws yexception ("AttributeNotFound: <path>@<name>") if the attribute is absent
+  */
+ template <class T>
+ void TNode::Attr(TZtStringBuf name, T& value) const {
+     TCharPtr attr(xmlGetProp(NodePointer, XMLCHAR(name.c_str())));
+     if (!attr) {
+         // Same "<path>@<name>" format as the by-value overload; the "@" separator
+         // was missing here, which glued the attribute name onto the node path.
+         THROW(AttributeNotFound, Path() << "@" << name);
+     }
+     AttrInternal(attr, value, name);
+ }
+ /**
+  * Read attribute `name` into `value`;
+  * `value` is set to defvalue when the attribute is absent.
+  */
+ template <class T>
+ void TNode::Attr(TZtStringBuf name, T& value, const T& defvalue) const {
+     TCharPtr raw(xmlGetProp(NodePointer, XMLCHAR(name.c_str())));
+     if (!raw) {
+         value = defvalue;
+         return;
+     }
+     AttrInternal(raw, value, name);
+ }
+ /**
+  * Get the node content converted to T.
+  * throws yexception ("NodeIsBlank: <path>") for a null or blank node
+  */
+ template <class T>
+ T TNode::Value() const {
+     const bool blank = !NodePointer || xmlIsBlankNode(NodePointer);
+     if (blank) {
+         THROW(NodeIsBlank, Path());
+     }
+     TCharPtr content(xmlNodeGetContent(NodePointer));
+     T parsed;
+     // The node name is used as the error context of a failed conversion.
+     AttrInternal(content, parsed, Name());
+     return parsed;
+ }
+ /**
+  * Get the node content converted to T;
+  * returns defvalue for a null or blank node.
+  */
+ template <class T>
+ T TNode::Value(const T& defvalue) const {
+     const bool blank = !NodePointer || xmlIsBlankNode(NodePointer);
+     if (blank) {
+         return defvalue;
+     }
+     TCharPtr content(xmlNodeGetContent(NodePointer));
+     T parsed;
+     // The node name is used as the error context of a failed conversion.
+     AttrInternal(content, parsed, Name());
+     return parsed;
+ }
+ /**
+  * Set the node content from any value that can be written to a stream;
+  * the value is rendered to text and forwarded to SetValue(TStringBuf).
+  */
+ template <class T>
+ typename std::enable_if<!std::is_convertible_v<T, TStringBuf>, void>::type
+ TNode::SetValue(const T& value) {
+     TStringStream rendered;
+     rendered << value;
+     SetValue(rendered.Str());
+ }
+ /**
+  * Replace the node content with `value`.
+  * The content is first reset to an empty string, then the text is appended
+  * with an explicit length, so `value` does not have to be NUL-terminated.
+  */
+ inline void TNode::SetValue(TStringBuf value) {
+     const xmlChar* bytes = XMLCHAR(value.data());
+     xmlNodeSetContent(NodePointer, XMLCHAR(""));
+     xmlNodeAddContentLen(NodePointer, bytes, value.Size());
+ }
+ /**
+  * Set attribute `name` to `value` via xmlSetProp.
+  * throws yexception ("XmlException: ...") if xmlSetProp returns NULL
+  */
+ inline void TNode::SetAttr(TZtStringBuf name, TZtStringBuf value) {
+     if (xmlSetProp(NodePointer, XMLCHAR(name.c_str()), XMLCHAR(value.c_str())) == nullptr) {
+         THROW(XmlException, "Can't set node attribute <" << name << "> to <" << value << ">");
+     }
+ }
+ /**
+  * Set attribute `name` from any value that can be written to a stream;
+  * the value is rendered to text and forwarded to the string overload.
+  */
+ template <class T>
+ typename std::enable_if<!std::is_convertible_v<T, TZtStringBuf>, void>::type
+ TNode::SetAttr(TZtStringBuf name, const T& value) {
+     TStringStream rendered;
+     rendered << value;
+     SetAttr(name, TZtStringBuf(rendered.Str()));
+ }
+ /**
+  * Set attribute `name` without a value (NULL is passed to xmlSetProp).
+  * throws yexception ("XmlException: ...") if xmlSetProp returns NULL
+  */
+ inline void TNode::SetAttr(TZtStringBuf name) {
+     if (xmlSetProp(NodePointer, XMLCHAR(name.c_str()), nullptr) == nullptr) {
+         THROW(XmlException, "Can't set node empty attribute <" << name << ">");
+     }
+ }
+ /**
+  * Delete attribute `name` via xmlUnsetProp.
+  * throws yexception ("XmlException: ...") if xmlUnsetProp reports a negative result
+  */
+ inline void TNode::DelAttr(TZtStringBuf name) {
+     const int rc = xmlUnsetProp(NodePointer, XMLCHAR(name.c_str()));
+     if (rc < 0) {
+         THROW(XmlException, "Can't delete node attribute <" << name << ">");
+     }
+ }
+ /**
+  * Create a child element `name` whose text is `value` rendered through a stream;
+  * forwards to the string overload and returns the new child.
+  */
+ template <class T>
+ typename std::enable_if<!std::is_convertible_v<T, TZtStringBuf>, TNode>::type
+ TNode::AddChild(TZtStringBuf name, const T& value) {
+     TStringStream rendered;
+     rendered << value;
+     return AddChild(name, TZtStringBuf(rendered.Str()));
+ }
+ /**
+  * Create a child element `name` with text `value` via xmlNewTextChild.
+  * An empty `value` is passed to libxml2 as NULL content.
+  * throws yexception ("XmlException: ...") on a null node or if libxml2 returns NULL
+  */
+ inline TNode TNode::AddChild(TZtStringBuf name, TZtStringBuf value) {
+     if (IsNull()) {
+         THROW(XmlException, "addChild [name=" << name << ", value=" << value
+                                 << "]: can't add child to null node");
+     }
+     const xmlChar* content = value.empty() ? nullptr : XMLCHAR(value.c_str());
+     xmlNode* created = xmlNewTextChild(NodePointer, nullptr, XMLCHAR(name.c_str()), content);
+     if (!created) {
+         THROW(XmlException, "addChild [name=" << name << ", value=" << value
+                                 << "]: xmlNewTextChild returned NULL");
+     }
+     return TNode(DocPointer, created);
+ }
+ /**
+  * Append a text child whose content is `value` rendered through a stream;
+  * forwards to AddText(TStringBuf).
+  */
+ template <class T>
+ typename std::enable_if<!std::is_convertible_v<T, TStringBuf>, TNode>::type
+ TNode::AddText(const T& value) {
+     TStringStream rendered;
+     rendered << value;
+     return AddText(rendered.Str());
+ }
+ /**
+  * Append a text child with content `value`.
+  * @param value: text to append; passed with an explicit length, so it does not
+  *               have to be NUL-terminated
+  * @return node wrapping the pointer returned by xmlAddChild
+  *         (NOTE(review): libxml2 may merge adjacent text nodes, in which case
+  *         this is not the freshly created node - confirm against the libxml2 docs)
+  * throws yexception ("XmlException: ...") on a null node or if libxml2 returns NULL
+  */
+ inline TNode TNode::AddText(TStringBuf value) {
+     if (IsNull()) {
+         THROW(XmlException, "addText [value=" << value
+                                 << "]: can't add text to null node");
+     }
+     // XMLCHAR keeps the pointer const; the previous C-style cast silently dropped it.
+     xmlNode* text = xmlNewTextLen(XMLCHAR(value.data()), value.size());
+     if (!text) {
+         THROW(XmlException, "addText [value=" << value
+                                 << "]: xmlNewTextLen returned NULL");
+     }
+     xmlNode* added = xmlAddChild(NodePointer, text);
+     if (!added) {
+         THROW(XmlException, "addText [value=" << value
+                                 << "]: xmlAddChild returned NULL");
+     }
+     return TNode(DocPointer, added);
+ }
diff --git a/library/cpp/xml/document/ut/ya.make b/library/cpp/xml/document/ut/ya.make
new file mode 100644
index 0000000000..e955448c66
--- /dev/null
+++ b/library/cpp/xml/document/ut/ya.make
@@ -0,0 +1,11 @@
+ xml-document_ut.cpp
+ xml-textreader_ut.cpp
+ xml-options_ut.cpp
diff --git a/library/cpp/xml/document/xml-document-decl.h b/library/cpp/xml/document/xml-document-decl.h
new file mode 100644
index 0000000000..bfda1fb7e6
--- /dev/null
+++ b/library/cpp/xml/document/xml-document-decl.h
@@ -0,0 +1,718 @@
+#pragma once
+#include <library/cpp/string_utils/ztstrbuf/ztstrbuf.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/stream/output.h>
+#include <util/stream/str.h>
+#include <algorithm>
+#include "libxml-guards.h"
+namespace NXml {
+ class TNode;
+ class TConstNodes;
+ class TConstNode;
+ using TXPathContext = xmlXPathContext;
+ class TDocument { // move-only owner of a parsed or newly created libxml2 document
+ public:
+ enum Source {
+ File,
+ String,
+ RootName,
+ };
+ public:
+ /**
+ * create TDocument
+ * @param source: filename, XML string, or name for the root element (depends on @type)
+ * @param type: source type: File | String | RootName
+ * throws if file not found or cannot be parsed
+ */
+ TDocument(const TString& source, Source type = File);
+ public:
+ TDocument(const TDocument& that) = delete;
+ TDocument& operator=(const TDocument& that) = delete;
+ TDocument(TDocument&& that);
+ TDocument& operator=(TDocument&& that);
+ /**
+ * get root element
+ */
+ TNode Root();
+ TConstNode Root() const;
+ void Save(IOutputStream& stream, TZtStringBuf enc = "", bool shouldFormat = true) const { // serializes the whole document into stream
+ int bufferSize = 0;
+ xmlChar* xmlBuff = nullptr;
+ const char* encoding = enc.size() ? enc.data() : Doc->encoding ? nullptr : "UTF-8"; // explicit enc wins; otherwise nullptr when the document already records an encoding, else "UTF-8"
+ xmlDocDumpFormatMemoryEnc(Doc.Get(), &xmlBuff, &bufferSize, encoding, shouldFormat);
+ TCharPtr xmlCharBuffPtr(xmlBuff); // guard for the dumped buffer (TCharPtr is paired with an xmlFree-based policy)
+ stream.Write(xmlBuff, bufferSize);
+ }
+ TString ToString(TZtStringBuf enc = "", bool shouldFormat = true) const { // same as Save(), but returns the text as a string
+ TStringStream s;
+ Save(s, enc, shouldFormat);
+ return s.Str();
+ }
+ void Swap(TDocument& that) { // exchanges the owned documents
+ std::swap(this->Doc, that.Doc);
+ }
+ xmlDocPtr GetImpl() { // raw libxml2 document pointer; ownership stays with this object
+ return Doc.Get();
+ }
+ private:
+ void ParseFile(const TString& file);
+ void ParseString(TZtStringBuf xml);
+ TDocument(TDocHolder doc) // adopts an already built document holder
+ : Doc(std::move(doc))
+ {
+ }
+ TDocHolder Doc;
+ };
+ struct TNamespaceForXPath { // one namespace entry for the XPath-related methods that take TNamespacesForXPath
+ TString Prefix; // prefix to use for the namespace in xpath expressions
+ TString Url; // the XML namespace itself
+ };
+ typedef TVector<TNamespaceForXPath> TNamespacesForXPath; // list of namespace entries
+ /**
+  * Set of nodes produced by an XPath query.
+  * Offers indexed access, Size()/size() and begin()/end() for range-based for.
+  */
+ class TConstNodes {
+ private:
+     // NOTE(review): looks like an auto_ptr_ref-style proxy used to hand the
+     // node set over from temporaries - confirm against the out-of-line definitions.
+     struct TConstNodesRef {
+         explicit TConstNodesRef(TConstNodes& n)
+             : r_(n)
+         {
+         }
+         TConstNodes& r_;
+     };
+
+ public:
+     TConstNodes(const TConstNodes& nodes);
+     TConstNodes& operator=(const TConstNodes& nodes);
+     TConstNodes(TConstNodesRef ref);
+     TConstNodes& operator=(TConstNodesRef ref);
+     operator TConstNodesRef();
+     /**
+      * get node by id
+      * @param number: node id
+      */
+     TConstNode operator[](size_t number) const;
+     /**
+      * get number of nodes
+      */
+     size_t Size() const {
+         return SizeValue;
+     }
+     size_t size() const {
+         return SizeValue;
+     }
+     /**
+      * Minimal forward iterator over the node set.
+      * Equality compares positions only, so iterators obtained from different
+      * node sets must not be compared with each other.
+      */
+     struct TNodeIter {
+         const TConstNodes& Nodes;
+         size_t Index;
+         TConstNode operator*() const;
+         bool operator==(const TNodeIter& other) const {
+             return Index == other.Index;
+         }
+         bool operator!=(const TNodeIter& other) const {
+             return !(*this == other);
+         }
+         // Pre-increment returns a reference, so advancing makes no iterator copy.
+         TNodeIter& operator++() {
+             ++Index;
+             return *this;
+         }
+     };
+     TNodeIter begin() const {
+         return TNodeIter{*this, 0};
+     }
+     TNodeIter end() const {
+         return TNodeIter{*this, size()};
+     }
+
+ private:
+     friend class TDocument;
+     friend class TConstNode;
+     friend class TNode;
+     TConstNodes(xmlDoc* doc, TXPathObjectPtr obj);
+     size_t SizeValue;
+     xmlDoc* Doc;
+     TXPathObjectPtr Obj;
+ };
+ class TNode { // handle to one node: a raw node pointer plus a raw document pointer
+ public:
+ friend class TDocument;
+ friend class TConstNode;
+ friend class TTextReader;
+ /**
+ * check if node is null
+ */
+ bool IsNull() const;
+ /**
+ * check if node is element node
+ */
+ bool IsElementNode() const;
+ /**
+ * Create xpath context to be used later for fast xpath evaluation.
+ * @param nss: explicitly specify XML namespaces to use and their prefixes
+ *
+ * For better performance, when you need to evaluate several xpath expressions,
+ * it makes sense to create a context, load namespace prefixes once
+ * and use the context several times in Node(), Nodes(), XPath() function calls for several nodes.
+ * The context may be used with any node of the current document, but
+ * cannot be shared between different XML documents.
+ */
+ TXPathContextPtr CreateXPathContext(const TNamespacesForXPath& nss = TNamespacesForXPath()) const;
+ /**
+ * get all element nodes matching given xpath expression
+ * @param xpath: xpath expression
+ * @param quiet: don't throw exception if zero nodes found
+ * @param ns: explicitly specify XML namespaces to use and their prefixes
+ *
+ * For historical reasons, this only works for *element* nodes.
+ * Use the XPath function if you need other kinds of nodes.
+ */
+ TConstNodes Nodes(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const;
+ /**
+ * get all element nodes matching given xpath expression
+ * @param xpath: xpath expression
+ * @param quiet: don't throw exception if zero nodes found
+ * @param ctxt: reusable xpath context
+ *
+ * For historical reasons, this only works for *element* nodes.
+ * Use the XPath function if you need other kinds of nodes.
+ */
+ TConstNodes Nodes(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const;
+ /**
+ * get all nodes matching given xpath expression
+ * @param xpath: xpath expression
+ * @param quiet: don't throw exception if zero nodes found
+ * @param ns: explicitly specify XML namespaces to use and their prefixes
+ */
+ TConstNodes XPath(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const;
+ /**
+ * get all nodes matching given xpath expression
+ * @param xpath: xpath expression
+ * @param quiet: don't throw exception if zero nodes found
+ * @param ctxt: reusable xpath context
+ */
+ TConstNodes XPath(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const;
+ /**
+ * get the first element node matching given xpath expression
+ * @param xpath: path to node (from current node)
+ * @param quiet: don't throw exception if node not found,
+ * return null node (@see IsNull())
+ * @param ns: explicitly specify XML namespaces to use and their prefixes
+ *
+ * For historical reasons, this only works for *element* nodes.
+ * Use the XPath function if you need other kinds of nodes.
+ */
+ /// @todo: quiet should be default, empty nodeset is not an error
+ TNode Node(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath());
+ TConstNode Node(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const;
+ /**
+ * get the first element node matching given xpath expression
+ * @param xpath: path to node (from current node)
+ * @param quiet: don't throw exception if node not found,
+ * return null node (@see IsNull())
+ * @param ctxt: reusable xpath context
+ *
+ * For historical reasons, this only works for *element* nodes.
+ * Use the XPath function if you need other kinds of nodes.
+ */
+ TNode Node(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt);
+ TConstNode Node(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const;
+ /**
+ * get node first child
+ * @param name: child name
+ * @note if name is empty, returns the first child node of type "element"
+ * @note returns null node if no child found
+ */
+ TNode FirstChild(TZtStringBuf name);
+ TConstNode FirstChild(TZtStringBuf name) const;
+ TNode FirstChild();
+ TConstNode FirstChild() const;
+ /**
+ * get parent node
+ * throws exception if has no parent
+ */
+ TNode Parent();
+ TConstNode Parent() const;
+ /**
+ * get node neighbour
+ * @param name: neighbour name
+ * @note if name is empty, returns the next sibling node of type "element"
+ * @note returns null node if no neighbour found
+ */
+ TNode NextSibling(TZtStringBuf name);
+ TConstNode NextSibling(TZtStringBuf name) const;
+ TNode NextSibling();
+ TConstNode NextSibling() const;
+ /**
+ * create child node
+ * @param name: child name
+ * returns new empty node
+ */
+ TNode AddChild(TZtStringBuf name);
+ /**
+ * create child node with given value
+ * @param name: child name
+ * @param value: node value
+ */
+ template <class T>
+ typename std::enable_if<!std::is_convertible_v<T, TZtStringBuf>, TNode>::type
+ AddChild(TZtStringBuf name, const T& value);
+ TNode AddChild(TZtStringBuf name, TZtStringBuf value);
+ /**
+ * add child node, making recursive copy of original
+ * @param node: node to copy from
+ * returns added node
+ */
+ TNode AddChild(const TConstNode& node);
+ /**
+ * create text child node
+ * @param value: text content of the new node
+ */
+ template <class T>
+ typename std::enable_if<!std::is_convertible_v<T, TStringBuf>, TNode>::type
+ AddText(const T& value);
+ TNode AddText(TStringBuf value);
+ /**
+ * get node attribute
+ * @param name: attribute name
+ * throws exception if attribute not found
+ */
+ template <class T>
+ T Attr(TZtStringBuf name) const;
+ /**
+ * get node attribute
+ * @param name: attribute name
+ * returns default value if attribute not found
+ */
+ template <class T>
+ T Attr(TZtStringBuf name, const T& defvalue) const;
+ /**
+ * get node attribute
+ * @param name: attribute name
+ * @param value: return-value
+ * throws exception if attribute not found
+ */
+ template <class T>
+ void Attr(TZtStringBuf name, T& value) const;
+ /**
+ * get node attribute
+ * @param name: attribute name
+ * @param defvalue: default value
+ * @param value: return-value
+ * returns default value if attribute not found, attr value otherwise
+ */
+ template <class T>
+ void Attr(TZtStringBuf name, T& value, const T& defvalue) const;
+ /**
+ * get node value (text)
+ * @throws exception if node is blank
+ */
+ template <class T>
+ T Value() const;
+ /**
+ * get node value
+ * @param defvalue: default value
+ * returns default value if node is blank
+ */
+ template <class T>
+ T Value(const T& defvalue) const;
+ /**
+ * set node value
+ * @param value: new text value
+ */
+ template <class T>
+ typename std::enable_if<!std::is_convertible_v<T, TStringBuf>, void>::type
+ SetValue(const T& value);
+ void SetValue(TStringBuf value);
+ /**
+ * set/reset node attribute value,
+ * if attribute does not exist, it'll be created
+ * @param name: attribute name
+ * @param value: attribute value
+ */
+ template<class T>
+ typename std::enable_if<!std::is_convertible_v<T, TZtStringBuf>, void>::type
+ SetAttr(TZtStringBuf name, const T& value);
+ void SetAttr(TZtStringBuf name, TZtStringBuf value);
+ void SetAttr(TZtStringBuf name);
+ /**
+ * delete node attribute
+ * @param name: attribute name
+ */
+ void DelAttr(TZtStringBuf name);
+ /**
+ * set node application data
+ * @param priv: new application data pointer
+ */
+ void SetPrivate(void* priv);
+ /**
+ * @return application data pointer, passed by SetPrivate
+ */
+ void* GetPrivate() const;
+ /**
+ * get node name
+ */
+ TString Name() const;
+ /**
+ * get node xpath
+ */
+ TString Path() const;
+ /**
+ * get node xml representation
+ * @param enc: encoding name forwarded to Save()
+ */
+ TString ToString(TZtStringBuf enc = "") const {
+ TStringStream s;
+ Save(s, enc);
+ return s.Str();
+ }
+ void Save(IOutputStream& stream, TZtStringBuf enc = "", bool shouldFormat = false) const;
+ void SaveAsHtml(IOutputStream& stream, TZtStringBuf enc = "", bool shouldFormat = false) const;
+ /**
+ * get pointer to internal node
+ */
+ xmlNode* GetPtr();
+ const xmlNode* GetPtr() const;
+ /**
+ * check if node is text-only node
+ */
+ bool IsText() const;
+ /**
+ * unlink node from parent and free
+ */
+ void Remove();
+ /**
+ * constructs null node
+ */
+ TNode()
+ : NodePointer(nullptr)
+ , DocPointer(nullptr)
+ {
+ }
+ private:
+ friend class TConstNodes;
+ TNode(xmlDoc* doc, xmlNode* node) // wraps an existing node of the given document; the pointers are only stored
+ : NodePointer(node)
+ , DocPointer(doc)
+ {
+ }
+ TNode Find(xmlNode* start, TZtStringBuf name);
+ template <class T>
+ void AttrInternal(TCharPtr& value, T& res, TStringBuf errContext) const;
+ void SaveInternal(IOutputStream& stream, TZtStringBuf enc, int options) const;
+ xmlNode* NodePointer; // wrapped node; nullptr in a default-constructed (null) node
+ xmlDoc* DocPointer; // document pointer stored alongside the node; nullptr in a default-constructed node
+ };
+ class TConstNode { // read-only facade over a TNode: every method forwards to the wrapped ActualNode
+ public:
+ friend class TDocument;
+ friend class TConstNodes;
+ friend class TNode;
+ /**
+ * check if node is null
+ */
+ bool IsNull() const {
+ return ActualNode.IsNull();
+ }
+ bool IsElementNode() const {
+ return ActualNode.IsElementNode();
+ }
+ TConstNode Parent() const {
+ return ActualNode.Parent();
+ }
+ /**
+ * Create xpath context to be used later for fast xpath evaluation.
+ * @param nss: explicitly specify XML namespaces to use and their prefixes
+ */
+ TXPathContextPtr CreateXPathContext(const TNamespacesForXPath& nss = TNamespacesForXPath()) const {
+ return ActualNode.CreateXPathContext(nss);
+ }
+ /**
+ * get all element nodes matching given xpath expression
+ * @param xpath: xpath expression
+ * @param quiet: don't throw exception if zero nodes found
+ * @param ns: explicitly specify XML namespaces to use and their prefixes
+ *
+ * For historical reasons, this only works for *element* nodes.
+ * Use the XPath function if you need other kinds of nodes.
+ */
+ TConstNodes Nodes(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const {
+ return ActualNode.Nodes(xpath, quiet, ns);
+ }
+ /**
+ * get all element nodes matching given xpath expression
+ * @param xpath: xpath expression
+ * @param quiet: don't throw exception if zero nodes found
+ * @param ctxt: reusable xpath context
+ *
+ * For historical reasons, this only works for *element* nodes.
+ * Use the XPath function if you need other kinds of nodes.
+ */
+ TConstNodes Nodes(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const {
+ return ActualNode.Nodes(xpath, quiet, ctxt);
+ }
+ /**
+ * get all nodes matching given xpath expression
+ * @param xpath: xpath expression
+ * @param quiet: don't throw exception if zero nodes found
+ * @param ns: explicitly specify XML namespaces to use and their prefixes
+ */
+ TConstNodes XPath(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const {
+ return ActualNode.XPath(xpath, quiet, ns);
+ }
+ /**
+ * get all nodes matching given xpath expression
+ * @param xpath: xpath expression
+ * @param quiet: don't throw exception if zero nodes found
+ * @param ctxt: reusable xpath context
+ */
+ TConstNodes XPath(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const {
+ return ActualNode.XPath(xpath, quiet, ctxt);
+ }
+ /**
+ * get the first element node matching given xpath expression
+ * @param xpath: path to node (from current node)
+ * @param quiet: don't throw exception if node not found,
+ * return null node (@see IsNull())
+ * @param ns: explicitly specify XML namespaces to use and their prefixes
+ *
+ * For historical reasons, this only works for *element* nodes.
+ * Use the XPath function if you need other kinds of nodes.
+ */
+ TConstNode Node(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const {
+ return ActualNode.Node(xpath, quiet, ns);
+ }
+ /**
+ * get the first element node matching given xpath expression
+ * @param xpath: path to node (from current node)
+ * @param quiet: don't throw exception if node not found,
+ * return null node (@see IsNull())
+ * @param ctxt: reusable xpath context
+ *
+ * For historical reasons, this only works for *element* nodes.
+ * Use the XPath function if you need other kinds of nodes.
+ */
+ TConstNode Node(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const {
+ return ActualNode.Node(xpath, quiet, ctxt);
+ }
+ TConstNode FirstChild(TZtStringBuf name) const {
+ return ActualNode.FirstChild(name);
+ }
+ TConstNode FirstChild() const {
+ return ActualNode.FirstChild();
+ }
+ /**
+ * get node neighbour
+ * @param name: neighbour name
+ * forwards to TNode::NextSibling
+ * NOTE(review): this comment used to say "throws exception if no neighbour found",
+ * while the TNode::NextSibling documentation says a null node is returned - confirm
+ * against the implementation
+ */
+ TConstNode NextSibling(TZtStringBuf name) const {
+ return ActualNode.NextSibling(name);
+ }
+ TConstNode NextSibling() const {
+ return ActualNode.NextSibling();
+ }
+ /**
+ * get node attribute
+ * @param name: attribute name
+ * throws exception if attribute not found
+ */
+ template <class T>
+ T Attr(TZtStringBuf name) const {
+ return ActualNode.Attr<T>(name);
+ }
+ /**
+ * get node attribute
+ * @param name: attribute name
+ * returns default value if attribute not found
+ */
+ template <class T>
+ T Attr(TZtStringBuf name, const T& defvalue) const {
+ return ActualNode.Attr(name, defvalue);
+ }
+ /**
+ * get node attribute
+ * @param name: attribute name
+ * @param value: return-value
+ * throws exception if attribute not found
+ */
+ template <class T>
+ void Attr(TZtStringBuf name, T& value) const {
+ return ActualNode.Attr(name, value);
+ }
+ /**
+ * get node attribute
+ * @param name: attribute name
+ * @param defvalue: default value
+ * @param value: return-value
+ * returns default value if attribute not found, attr value otherwise
+ */
+ template <class T>
+ void Attr(TZtStringBuf name, T& value, const T& defvalue) const {
+ return ActualNode.Attr(name, value, defvalue);
+ }
+ /**
+ * get node value (text)
+ * @throws exception if node is blank
+ */
+ template <class T>
+ T Value() const {
+ return ActualNode.Value<T>();
+ }
+ /**
+ * get node value
+ * @param defvalue: default value
+ * returns default value if node is blank
+ */
+ template <class T>
+ T Value(const T& defvalue) const {
+ return ActualNode.Value(defvalue);
+ }
+ /**
+ * get node name
+ */
+ TString Name() const {
+ return ActualNode.Name();
+ }
+ /**
+ * @return application data pointer, passed by SetPrivate
+ */
+ void* GetPrivate() const {
+ return ActualNode.GetPrivate();
+ }
+ /**
+ * get pointer to internal node
+ */
+ const xmlNode* GetPtr() const {
+ return ActualNode.GetPtr();
+ }
+ /**
+ * check if node is text-only node
+ */
+ bool IsText() const {
+ return ActualNode.IsText();
+ }
+ /**
+ * get node xpath
+ */
+ TString Path() const {
+ return ActualNode.Path();
+ }
+ /**
+ * get node xml representation
+ */
+ TString ToString(TZtStringBuf enc = "") const {
+ return ActualNode.ToString(enc);
+ }
+ TConstNode() = default; // wraps a default-constructed TNode
+ TConstNode(TNode node) // intentionally implicit: the forwarding methods above return TNode results as TConstNode
+ : ActualNode(node)
+ {
+ }
+ TNode ConstCast() const { // returns a copy of the wrapped TNode, which exposes the mutating API
+ return ActualNode;
+ }
+ private:
+ TNode ActualNode; // the wrapped node; all accessors above delegate to it
+ };
diff --git a/library/cpp/xml/document/xml-document.cpp b/library/cpp/xml/document/xml-document.cpp
new file mode 100644
index 0000000000..18a554d732
--- /dev/null
+++ b/library/cpp/xml/document/xml-document.cpp
@@ -0,0 +1,393 @@
+#include "xml-document.h"
+#include <libxml/xinclude.h>
+#include <libxml/xpathInternals.h>
+#include <library/cpp/xml/init/init.h>
+#include <util/generic/yexception.h>
+#include <util/folder/dirut.h>
+namespace {
+ // File-local static whose constructor calls NXml::InitEngine() during
+ // static initialization of this translation unit.
+ struct TInit {
+ inline TInit() {
+ NXml::InitEngine();
+ }
+ } initer;
+}
+namespace NXml {
+ // Build a document from one of three sources selected by `type`:
+ // File - `xml` is passed to ParseFile() as a file path;
+ // String - `xml` is passed to ParseString() as the markup itself;
+ // RootName - a new document is created with a single root element named `xml`.
+ // Throws XmlException on creation failure, InvalidArgument on unknown `type`.
+ TDocument::TDocument(const TString& xml, Source type) {
+ switch (type) {
+ case File:
+ ParseFile(xml);
+ break;
+ case String:
+ ParseString(xml);
+ break;
+ case RootName: {
+ TDocHolder doc(xmlNewDoc(XMLCHAR("1.0")));
+ if (!doc)
+ THROW(XmlException, "Can't create xml document.");
+ doc->encoding = xmlStrdup(XMLCHAR("utf-8"));
+ TNodePtr node(xmlNewNode(nullptr, XMLCHAR(xml.c_str())));
+ if (!node)
+ THROW(XmlException, "Can't create root node.");
+ xmlDocSetRootElement(doc.Get(), node.Get());
+ // The node is attached to the document now, so give up our ownership of it.
+ Y_UNUSED(node.Release());
+ Doc = std::move(doc);
+ } break;
+ default:
+ THROW(InvalidArgument, "Wrong source type");
+ }
+ }
+ // Move constructor: takes over the document holder of `doc`.
+ TDocument::TDocument(TDocument&& doc)
+ : Doc(std::move(doc.Doc))
+ {
+ }
+ // Move assignment implemented as a swap; self-assignment is a no-op.
+ TDocument& TDocument::operator=(TDocument&& doc) {
+ if (this != &doc)
+ doc.Swap(*this);
+ return *this;
+ }
+ // Parse the XML file at path `file`, run XInclude processing on it and
+ // store the result in Doc. Throws XmlException on any failure.
+ void TDocument::ParseFile(const TString& file) {
+ // Report a missing file explicitly instead of as a generic parse error.
+ if (!NFs::Exists(file))
+ THROW(XmlException, "File " << file << " doesn't exist");
+ TParserCtxtPtr parserCtx(xmlNewParserCtxt());
+ if (!parserCtx)
+ THROW(XmlException, "Can't create parser context");
+ TDocHolder parsed(xmlCtxtReadFile(parserCtx.Get(), file.c_str(), nullptr, XML_PARSE_NOCDATA));
+ if (!parsed)
+ THROW(XmlException, "Can't parse file " << file);
+ const int xincludeFlags = XML_PARSE_XINCLUDE | XML_PARSE_NOCDATA | XML_PARSE_NOXINCNODE;
+ if (xmlXIncludeProcessFlags(parsed.Get(), xincludeFlags) == -1)
+ THROW(XmlException, "XIncludes processing failed");
+ // Publish only after every step succeeded.
+ Doc = std::move(parsed);
+ }
+ // Parse XML markup held in memory and store the result in Doc.
+ // Throws XmlException if the parser context cannot be created or parsing fails.
+ void TDocument::ParseString(TZtStringBuf xml) {
+ TParserCtxtPtr parserCtx(xmlNewParserCtxt());
+ if (!parserCtx)
+ THROW(XmlException, "Can't create parser context");
+ const int length = static_cast<int>(xml.size());
+ TDocHolder parsed(xmlCtxtReadMemory(parserCtx.Get(), xml.c_str(), length, nullptr, nullptr, XML_PARSE_NOCDATA));
+ if (!parsed)
+ THROW(XmlException, "Can't parse string");
+ Doc = std::move(parsed);
+ }
+ // Return the root element of the document; throws XmlException if there is none.
+ TNode TDocument::Root() {
+ xmlNode* const rootElement = xmlDocGetRootElement(Doc.Get());
+ if (!rootElement)
+ THROW(XmlException, "TDocument hasn't root element");
+ return TNode(Doc.Get(), rootElement);
+ }
+ // Read-only variant of Root(); same lookup, result wrapped in TConstNode.
+ TConstNode TDocument::Root() const {
+ xmlNode* const rootElement = xmlDocGetRootElement(Doc.Get());
+ if (!rootElement)
+ THROW(XmlException, "TDocument hasn't root element");
+ return TConstNode(TNode(Doc.Get(), rootElement));
+ }
+ // True when this handle does not point at any libxml2 node.
+ bool TNode::IsNull() const {
+ return !NodePointer;
+ }
+ // True only for a non-null node whose libxml2 type is XML_ELEMENT_NODE.
+ bool TNode::IsElementNode() const {
+ if (IsNull())
+ return false;
+ return NodePointer->type == XML_ELEMENT_NODE;
+ }
+ // Create an XPath context for this node's document and register every
+ // prefix/URL pair from `nss` in it.
+ // Throws XmlException if the context cannot be created or a namespace
+ // registration fails.
+ TXPathContextPtr TNode::CreateXPathContext(const TNamespacesForXPath& nss) const {
+ TXPathContextPtr ctx = xmlXPathNewContext(DocPointer);
+ if (!ctx)
+ THROW(XmlException, "Can't create empty xpath context");
+ for (const auto& ns : nss) {
+ const int r = xmlXPathRegisterNs(ctx.Get(), XMLCHAR(ns.Prefix.c_str()), XMLCHAR(ns.Url.c_str()));
+ if (r != 0)
+ THROW(XmlException, "Can't register namespace " << ns.Url << " with prefix " << ns.Prefix);
+ }
+ return ctx;
+ }
+ // Convenience overload: builds a one-off XPath context from `ns` and evaluates with it.
+ TConstNodes TNode::XPath(TZtStringBuf xpath, bool quiet, const TNamespacesForXPath& ns) const {
+ TXPathContextPtr ctxt = CreateXPathContext(ns);
+ return XPath(xpath, quiet, *ctxt);
+ }
+ // Evaluate `xpath` relative to this node using the caller-supplied context.
+ // Throws XmlException if the context node cannot be set or evaluation fails,
+ // and NodeNotFound if the result is empty and `quiet` is false.
+ TConstNodes TNode::XPath(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const {
+ if (xmlXPathSetContextNode(NodePointer, &ctxt) != 0)
+ THROW(XmlException, "Can't set xpath context node, probably the context is associated with another document");
+ TXPathObjectPtr obj = xmlXPathEvalExpression(XMLCHAR(xpath.c_str()), &ctxt);
+ if (!obj)
+ THROW(XmlException, "Can't evaluate xpath expression " << xpath);
+ TConstNodes nodes(DocPointer, obj);
+ if (nodes.Size() == 0 && !quiet)
+ THROW(NodeNotFound, xpath);
+ return nodes;
+ }
+ // Convenience overload: builds a one-off XPath context from `ns`.
+ TConstNodes TNode::Nodes(TZtStringBuf xpath, bool quiet, const TNamespacesForXPath& ns) const {
+ TXPathContextPtr ctxt = CreateXPathContext(ns);
+ return Nodes(xpath, quiet, *ctxt);
+ }
+ // Like XPath(), but rejects results that are not element nodes.
+ // Only the first node of the result is inspected.
+ TConstNodes TNode::Nodes(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const {
+ TConstNodes nodes = XPath(xpath, quiet, ctxt);
+ if (nodes.Size() != 0 && !nodes[0].IsElementNode())
+ THROW(XmlException, "xpath points to non-element nodes: " << xpath);
+ return nodes;
+ }
+ // Convenience overload: builds a one-off XPath context from `ns`.
+ TNode TNode::Node(TZtStringBuf xpath, bool quiet, const TNamespacesForXPath& ns) {
+ TXPathContextPtr localCtxt = CreateXPathContext(ns);
+ return Node(xpath, quiet, *localCtxt);
+ }
+ // Read-only counterpart of the overload above.
+ TConstNode TNode::Node(TZtStringBuf xpath, bool quiet, const TNamespacesForXPath& ns) const {
+ TXPathContextPtr localCtxt = CreateXPathContext(ns);
+ return Node(xpath, quiet, *localCtxt);
+ }
+ // Return the first element matched by `xpath`.
+ // With an empty result: throws NodeNotFound unless `quiet`, otherwise returns a null node.
+ TNode TNode::Node(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) {
+ TConstNodes found = Nodes(xpath, quiet, ctxt);
+ if (found.Size() != 0)
+ return found[0].ConstCast();
+ if (!quiet)
+ THROW(NodeNotFound, xpath);
+ return TNode();
+ }
+ // Read-only counterpart: reuses the non-const implementation.
+ TConstNode TNode::Node(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const {
+ TNode* self = const_cast<TNode*>(this);
+ return self->Node(xpath, quiet, ctxt);
+ }
+ // First child element accepted by Find() for `name` (an empty name matches
+ // any element). Returns a null node if none; throws XmlException if this
+ // node is null.
+ TNode TNode::FirstChild(TZtStringBuf name) {
+ if (IsNull())
+ THROW(XmlException, "Node is null");
+ return Find(NodePointer->children, name);
+ }
+ // Read-only counterpart: reuses the non-const implementation.
+ TConstNode TNode::FirstChild(TZtStringBuf name) const {
+ return const_cast<TNode*>(this)->FirstChild(name);
+ }
+ // First child of any node type (may be a text node; may be a null node).
+ // Throws XmlException if this node is null.
+ TNode TNode::FirstChild() {
+ if (IsNull())
+ THROW(XmlException, "Node is null");
+ return TNode(DocPointer, NodePointer->children);
+ }
+ // Read-only counterpart: reuses the non-const implementation.
+ TConstNode TNode::FirstChild() const {
+ return const_cast<TNode*>(this)->FirstChild();
+ }
+ // Return the parent of this node.
+ // Throws XmlException if this node is null or has no parent.
+ TNode TNode::Parent() {
+ // Guard the dereference below, as FirstChild()/NextSibling() do;
+ // without it a null node would be dereferenced.
+ if (IsNull())
+ THROW(XmlException, "Node is null");
+ if (nullptr == NodePointer->parent)
+ THROW(XmlException, "Parent node not exists");
+ return TNode(DocPointer, NodePointer->parent);
+ }
+ // Read-only counterpart: reuses the non-const implementation.
+ TConstNode TNode::Parent() const {
+ return const_cast<TNode*>(this)->Parent();
+ }
+ // Next sibling element accepted by Find() for `name` (an empty name matches
+ // any element). Returns a null node if none; throws XmlException if this
+ // node is null.
+ TNode TNode::NextSibling(TZtStringBuf name) {
+ if (IsNull())
+ THROW(XmlException, "Node is null");
+ return Find(NodePointer->next, name);
+ }
+ // Read-only counterpart: reuses the non-const implementation.
+ TConstNode TNode::NextSibling(TZtStringBuf name) const {
+ return const_cast<TNode*>(this)->NextSibling(name);
+ }
+ // Next sibling of any node type (may be a text node; may be a null node).
+ // Throws XmlException if this node is null.
+ TNode TNode::NextSibling() {
+ if (IsNull())
+ THROW(XmlException, "Node is null");
+ return TNode(DocPointer, NodePointer->next);
+ }
+ // Read-only counterpart: reuses the non-const implementation.
+ TConstNode TNode::NextSibling() const {
+ return const_cast<TNode*>(this)->NextSibling();
+ }
+ /* NOTE: by default the child will inherit its parent's ns */
+ // Adds a child element called `name`; delegates to the two-argument
+ // AddChild overload with an empty second argument.
+ TNode TNode::AddChild(TZtStringBuf name) {
+ return AddChild(name, "");
+ }
+ /* NOTE: source node will be copied, as otherwise it will be double-freed from this and its own document */
+ // Deep-copies `node` into this node's document and appends the copy as a child.
+ // Returns the node that ended up in the tree (libxml2 may merge adjacent text
+ // nodes, so this is not necessarily the copy itself).
+ // Throws XmlException if this node is null, the copy fails or the add fails.
+ TNode TNode::AddChild(const TConstNode& node) {
+ if (IsNull())
+ THROW(XmlException, "Node is null");
+ xmlNodePtr copy = xmlDocCopyNode(node.ConstCast().NodePointer, DocPointer, 1 /* recursive */);
+ if (copy == nullptr)
+ THROW(XmlException, "Can't copy node");
+ xmlNodePtr added = xmlAddChild(NodePointer, copy);
+ if (added == nullptr) {
+ // The copy was not linked into the tree, so nobody else owns it.
+ xmlFreeNode(copy);
+ THROW(XmlException, "Can't add child node");
+ }
+ return TNode(DocPointer, added);
+ }
+ // Store an application-defined pointer in the libxml2 node's _private field.
+ // NOTE(review): no IsNull() check here, unlike Name()/IsText() - calling this on a null node dereferences nullptr.
+ void TNode::SetPrivate(void* priv) {
+ NodePointer->_private = priv;
+ }
+ // Return the pointer previously stored in the node's _private field.
+ // NOTE(review): same missing null check as SetPrivate().
+ void* TNode::GetPrivate() const {
+ return NodePointer->_private;
+ }
+ // Walk the sibling chain beginning at `start` and return the first element
+ // node whose name equals `name`; an empty `name` matches any element.
+ // Returns a null node when nothing matches.
+ TNode TNode::Find(xmlNode* start, TZtStringBuf name) {
+ const bool anyName = name.empty();
+ for (xmlNode* cur = start; cur != nullptr; cur = cur->next) {
+ if (cur->type != XML_ELEMENT_NODE)
+ continue;
+ if (anyName || xmlStrcmp(cur->name, XMLCHAR(name.c_str())) == 0)
+ return TNode(DocPointer, cur);
+ }
+ return TNode();
+ }
+ // Name of the node; throws XmlException for a null node.
+ TString TNode::Name() const {
+ if (NodePointer == nullptr)
+ THROW(XmlException, "Node is null");
+ return CAST2CHAR(NodePointer->name);
+ }
+ // XPath-style path of the node as produced by xmlGetNodePath(),
+ // or an empty string when libxml2 returns no path.
+ TString TNode::Path() const {
+ TCharPtr nodePath(xmlGetNodePath(NodePointer));
+ if (!nodePath)
+ return "";
+ return CAST2CHAR(nodePath.Get());
+ }
+ // Raw libxml2 node pointer (mutable access); may be nullptr.
+ xmlNode* TNode::GetPtr() {
+ return NodePointer;
+ }
+ // Raw libxml2 node pointer (read-only access); may be nullptr.
+ const xmlNode* TNode::GetPtr() const {
+ return NodePointer;
+ }
+ // True if the node's libxml2 type is XML_TEXT_NODE; throws XmlException for a null node.
+ bool TNode::IsText() const {
+ if (IsNull())
+ THROW(XmlException, "Node is null");
+ return NodePointer->type == XML_TEXT_NODE;
+ }
+ // Unlink the node from its tree and free it.
+ // Afterwards this handle is reset to null so that later calls on it hit the
+ // IsNull() checks instead of touching freed memory. Other TNode copies that
+ // refer to the same node are not updated and must not be used.
+ void TNode::Remove() {
+ xmlNode* nodePtr = GetPtr();
+ xmlUnlinkNode(nodePtr);
+ xmlFreeNode(nodePtr);
+ NodePointer = nullptr;
+ }
+ // Write callback handed to xmlSaveToIO(): `context` is the IOutputStream
+ // passed by SaveInternal(). Always reports the whole buffer as written.
+ static int XmlWriteToOstream(void* context, const char* buffer, int len) {
+ // possibly use to save doc as well
+ auto* const sink = static_cast<IOutputStream*>(context);
+ sink->Write(buffer, len);
+ return len;
+ }
+ // Serialize this node's subtree into `stream` with the given libxml2 save
+ // `options`. An empty `enc` means "utf-8".
+ // Throws XmlException if xmlSaveTree() reports failure.
+ void TNode::SaveInternal(IOutputStream& stream, TZtStringBuf enc, int options) const {
+ const char* encoding = enc.empty() ? "utf-8" : enc.data();
+ TSaveCtxtPtr saveCtx(xmlSaveToIO(XmlWriteToOstream, /* close */ nullptr, &stream,
+ encoding, options));
+ // xmlSaveTree() takes a non-const node pointer.
+ xmlNode* const self = const_cast<xmlNode*>(GetPtr());
+ if (xmlSaveTree(saveCtx.Get(), self) < 0)
+ THROW(XmlException, "Failed saving node to stream");
+ }
+ // Serialize the node as XML; `shouldFormat` turns on XML_SAVE_FORMAT.
+ void TNode::Save(IOutputStream& stream, TZtStringBuf enc, bool shouldFormat) const {
+ const int options = shouldFormat ? XML_SAVE_FORMAT : 0;
+ SaveInternal(stream, enc, options);
+ }
+ // Serialize the node with XML_SAVE_AS_HTML; `shouldFormat` adds XML_SAVE_FORMAT.
+ void TNode::SaveAsHtml(IOutputStream& stream, TZtStringBuf enc, bool shouldFormat) const {
+ const int formatFlag = shouldFormat ? XML_SAVE_FORMAT : 0;
+ SaveInternal(stream, enc, XML_SAVE_AS_HTML | formatFlag);
+ }
+ // Copy constructor: copies the cached size, the document pointer and the XPath result holder.
+ TConstNodes::TConstNodes(const TConstNodes& nodes)
+ : SizeValue(nodes.Size())
+ , Doc(nodes.Doc)
+ , Obj(nodes.Obj)
+ {
+ }
+ // Copy assignment; self-assignment is a no-op.
+ TConstNodes& TConstNodes::operator=(const TConstNodes& nodes) {
+ if (this != &nodes) {
+ SizeValue = nodes.Size();
+ Doc = nodes.Doc;
+ Obj = nodes.Obj;
+ }
+ return *this;
+ }
+ // Construct from a TConstNodesRef proxy (which refers to another TConstNodes via r_).
+ TConstNodes::TConstNodes(TConstNodesRef ref)
+ : SizeValue(ref.r_.Size())
+ , Doc(ref.r_.Doc)
+ , Obj(ref.r_.Obj)
+ {
+ }
+ // Assign from a TConstNodesRef proxy; assigning from a proxy to *this is a no-op.
+ TConstNodes& TConstNodes::operator=(TConstNodesRef ref) {
+ if (this != &ref.r_) {
+ SizeValue = ref.r_.Size();
+ Doc = ref.r_.Doc;
+ Obj = ref.r_.Obj;
+ }
+ return *this;
+ }
+ // Conversion to the proxy type used by the two members above.
+ TConstNodes::operator TConstNodesRef() {
+ return TConstNodesRef(*this);
+ }
+ // Wrap an XPath evaluation result. The size is taken from the node set,
+ // or 0 when the result object or its node set is absent.
+ TConstNodes::TConstNodes(xmlDoc* doc, TXPathObjectPtr obj)
+ : SizeValue(obj && obj->nodesetval ? obj->nodesetval->nodeNr : 0)
+ , Doc(doc)
+ , Obj(obj)
+ {
+ }
+ // Return the node at position `number` of the XPath result set.
+ // Throws XmlException when the index is out of range or the result object is missing.
+ TConstNode TConstNodes::operator[](size_t number) const {
+ // Compare with >= instead of `number + 1 > Size()`: the addition wraps to 0
+ // for the maximum size_t value and would let that index through.
+ if (number >= Size())
+ THROW(XmlException, "index out of range " << number);
+ if (!Obj || !Obj->nodesetval)
+ THROW(XmlException, "Broken TConstNodes object, Obj is null");
+ xmlNode* node = Obj->nodesetval->nodeTab[number];
+ return TNode(Doc, node);
+ }
+ // Dereference: the node at the iterator's current Index in the referenced node set.
+ TConstNode TConstNodes::TNodeIter::operator*() const {
+ return Nodes[Index];
+ }
diff --git a/library/cpp/xml/document/xml-document.h b/library/cpp/xml/document/xml-document.h
new file mode 100644
index 0000000000..829ba09cc4
--- /dev/null
+++ b/library/cpp/xml/document/xml-document.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "xml-document-decl.h"
+#include "node-attr.h"
diff --git a/library/cpp/xml/document/xml-document_ut.cpp b/library/cpp/xml/document/xml-document_ut.cpp
new file mode 100644
index 0000000000..9f537b75c4
--- /dev/null
+++ b/library/cpp/xml/document/xml-document_ut.cpp
@@ -0,0 +1,319 @@
+#include <library/cpp/testing/unittest/registar.h>
+#include <util/generic/map.h>
+#include "xml-document.h"
+Y_UNIT_TEST_SUITE(TestXmlDocument) {
+ // Walks siblings with the no-argument FirstChild()/NextSibling(), which also
+ // step over text nodes ("qq", "ww").
+ Y_UNIT_TEST(Iteration) {
+ NXml::TDocument xml(
+ "<?xml version=\"1.0\"?>\n"
+ "<root>qq<a><b></b></a>ww<c></c></root>",
+ NXml::TDocument::String);
+ NXml::TConstNode root = xml.Root();
+ UNIT_ASSERT_EQUAL(root.Name(), "root");
+ NXml::TConstNode n = root.FirstChild().NextSibling();
+ UNIT_ASSERT_EQUAL(n.Name(), "a");
+ n = n.NextSibling().NextSibling();
+ UNIT_ASSERT_EQUAL(n.Name(), "c");
+ }
+ // Parses a document from a string and reads typed attributes and a non-ASCII text value.
+ Y_UNIT_TEST(ParseString) {
+ NXml::TDocument xml(
+ "<?xml version=\"1.0\"?>\n"
+ "<root>\n"
+ "<a><b len=\"15\" correct=\"1\">hello world</b></a>\n"
+ "<text>Некоторый текст</text>\n"
+ "</root>",
+ NXml::TDocument::String);
+ NXml::TConstNode root = xml.Root();
+ NXml::TConstNode b = root.Node("a/b");
+ UNIT_ASSERT_EQUAL(b.Attr<int>("len"), 15);
+ UNIT_ASSERT_EQUAL(b.Attr<bool>("correct"), true);
+ NXml::TConstNode text = root.Node("text");
+ UNIT_ASSERT_EQUAL(text.Value<TString>(), "Некоторый текст");
+ }
+ // Builds a document from a root name, adds attributes and children, and
+ // checks the exact serialized output.
+ Y_UNIT_TEST(SerializeString) {
+ NXml::TDocument xml("frob", NXml::TDocument::RootName);
+ xml.Root().SetAttr("xyzzy", "Frobozz");
+ xml.Root().SetAttr("kulness", 0.3);
+ xml.Root().SetAttr("timelimit", 3);
+ NXml::TNode authors = xml.Root().AddChild("authors");
+ authors.AddChild("graham").SetAttr("name", "Nelson");
+ authors.AddChild("zarf").SetValue("Andrew Plotkin");
+ authors.AddChild("emshort", "Emily Short");
+ TString data = xml.ToString("utf-8");
+ UNIT_ASSERT_EQUAL(data, "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+ "<frob xyzzy=\"Frobozz\" kulness=\"0.3\" timelimit=\"3\">\n"
+ " <authors>\n"
+ " <graham name=\"Nelson\"/>\n"
+ " <zarf>Andrew Plotkin</zarf>\n"
+ " <emshort>Emily Short</emshort>\n"
+ " </authors>\n"
+ "</frob>\n");
+ // check default utf8 output with ru
+ {
+ NXml::TDocument xml2("frob", NXml::TDocument::RootName);
+ xml2.Root().SetAttr("xyzzy", "привет =)");
+ UNIT_ASSERT_VALUES_EQUAL(xml2.ToString(), "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+ "<frob xyzzy=\"привет =)\"/>\n");
+ }
+ }
+ // XPath lookups in a document with a default namespace: queries need a
+ // registered prefix, supplied either as a namespace list or a prebuilt context.
+ Y_UNIT_TEST(XPathNs) {
+ using namespace NXml;
+ TDocument xml(
+ "<?xml version=\"1.0\"?>\n"
+ "<root xmlns='http://hello.com/hello'>\n"
+ "<a><b len=\"15\" correct=\"1\">hello world</b></a>\n"
+ "<text>Некоторый текст</text>\n"
+ "</root>",
+ TDocument::String);
+ TNamespacesForXPath nss;
+ TNamespaceForXPath ns = {"h", "http://hello.com/hello"};
+ nss.push_back(ns);
+ TConstNode root = xml.Root();
+ TConstNode b = root.Node("h:a/h:b", false, nss);
+ UNIT_ASSERT_EQUAL(b.Attr<int>("len"), 15);
+ UNIT_ASSERT_EQUAL(b.Attr<bool>("correct"), true);
+ TConstNode text = root.Node("h:text", false, nss);
+ UNIT_ASSERT_EQUAL(text.Value<TString>(), "Некоторый текст");
+ // For performance you can create xpath context once using nss and pass it.
+ TXPathContextPtr ctxt = root.CreateXPathContext(nss);
+ UNIT_ASSERT(root.Node("text", true, *ctxt).IsNull());
+ UNIT_ASSERT_EXCEPTION(root.Node("text", false, *ctxt), yexception);
+ UNIT_ASSERT_EQUAL(root.Node("h:text", false, *ctxt).Value<TString>(), "Некоторый текст");
+ }
+ // Exercises child/sibling navigation (with and without a name filter),
+ // range-for iteration over Nodes(), and the behaviour of null nodes.
+ Y_UNIT_TEST(XmlNodes) {
+ using namespace NXml;
+ TDocument xml("<?xml version=\"1.0\"?>\n"
+ "<root>qq<a><b>asdfg</b></a>ww<c></c></root>",
+ NXml::TDocument::String);
+ TNode root = xml.Root();
+ UNIT_ASSERT_EQUAL(root.Value<TString>(), "qqasdfgww");
+ TConstNode node = root.FirstChild();
+ UNIT_ASSERT_EQUAL(node.IsText(), true);
+ UNIT_ASSERT_EQUAL(node.Value<TString>(), "qq");
+ node = node.NextSibling();
+ UNIT_ASSERT_EQUAL(node.IsText(), false);
+ UNIT_ASSERT_EQUAL(node.Name(), "a");
+ UNIT_ASSERT_EQUAL(node.Value<TString>(), "asdfg");
+ node = node.NextSibling();
+ UNIT_ASSERT_EQUAL(node.IsText(), true);
+ UNIT_ASSERT_EQUAL(node.Value<TString>(), "ww");
+ node = node.NextSibling();
+ UNIT_ASSERT_EQUAL(node.IsText(), false);
+ UNIT_ASSERT_EQUAL(node.Name(), "c");
+ UNIT_ASSERT_EQUAL(node.Value<TString>(), "");
+ node = node.NextSibling();
+ UNIT_ASSERT_EQUAL(node.IsNull(), true);
+ TStringStream iterLog;
+ for (const auto& node2 : root.Nodes("/root/*")) {
+ iterLog << node2.Name() << ';';
+ }
+ UNIT_ASSERT_STRINGS_EQUAL(iterLog.Str(), "a;c;");
+ // get only element nodes, ignore text nodes with empty "name" param
+ node = root.FirstChild(TString());
+ UNIT_ASSERT_EQUAL(node.IsText(), false);
+ UNIT_ASSERT_EQUAL(node.Name(), "a");
+ node = node.NextSibling(TString());
+ UNIT_ASSERT_EQUAL(node.IsText(), false);
+ UNIT_ASSERT_EQUAL(node.Name(), "c");
+ // use exact "name" to retrieve children and siblings
+ node = root.FirstChild("a");
+ UNIT_ASSERT_EQUAL(node.IsNull(), false);
+ UNIT_ASSERT_EQUAL(node.Name(), "a");
+ node = node.NextSibling("c");
+ UNIT_ASSERT_EQUAL(node.IsNull(), false);
+ UNIT_ASSERT_EQUAL(node.Name(), "c");
+ node = root.FirstChild("c"); // skip "a"
+ UNIT_ASSERT_EQUAL(node.IsNull(), false);
+ UNIT_ASSERT_EQUAL(node.Name(), "c");
+ // node not found: no exceptions, null nodes are returned
+ node = root.FirstChild("b"); // b is not direct child of root
+ UNIT_ASSERT_EQUAL(node.IsNull(), true);
+ node = root.FirstChild("nosuchnode");
+ UNIT_ASSERT_EQUAL(node.IsNull(), true);
+ // NOTE(review): the next assignment is overwritten immediately; possibly
+ // node.NextSibling("unknownnode") was intended on the following line - confirm.
+ node = root.FirstChild();
+ node = root.NextSibling("unknownnode");
+ UNIT_ASSERT_EQUAL(node.IsNull(), true);
+ UNIT_ASSERT_EXCEPTION(node.Name(), yexception);
+ UNIT_ASSERT_EXCEPTION(node.Value<TString>(), yexception);
+ UNIT_ASSERT_EXCEPTION(node.IsText(), yexception);
+ }
+ // A quiet lookup that finds nothing yields a null node, and Value(default)
+ // on it returns the default.
+ Y_UNIT_TEST(DefVal) {
+ using namespace NXml;
+ TDocument xml("<?xml version=\"1.0\"?>\n"
+ "<root><a></a></root>",
+ NXml::TDocument::String);
+ UNIT_ASSERT_EQUAL(xml.Root().Node("a", true).Node("b", true).Value<int>(3), 3);
+ }
+ // Nodes() rejects an attribute result, XPath() accepts it.
+ Y_UNIT_TEST(NodesVsXPath) {
+ using namespace NXml;
+ TDocument xml("<?xml version=\"1.0\"?>\n"
+ "<root><a x=\"y\"></a></root>",
+ NXml::TDocument::String);
+ UNIT_ASSERT_EXCEPTION(xml.Root().Nodes("/root/a/@x"), yexception);
+ UNIT_ASSERT_VALUES_EQUAL(xml.Root().XPath("/root/a/@x").Size(), 1);
+ }
+ // Node() returns the first of several matches and rejects attribute results.
+ Y_UNIT_TEST(NodeIsFirst) {
+ using namespace NXml;
+ TDocument xml("<?xml version=\"1.0\"?>\n"
+ "<root><a x=\"y\">first</a>"
+ "<a>second</a></root>",
+ NXml::TDocument::String);
+ UNIT_ASSERT_EXCEPTION(xml.Root().Node("/root/a/@x"), yexception);
+ UNIT_ASSERT_STRINGS_EQUAL(xml.Root().Node("/root/a").Value<TString>(), "first");
+ }
+ // TNode is default-constructible and usable as a container value, and
+ // AddChild(node) copies a subtree from one document into another.
+ Y_UNIT_TEST(CopyNode) {
+ using namespace NXml;
+ // default-construct empty node
+ TNode empty;
+ // put to container
+ TMap<int, TNode> nmap;
+ nmap[2];
+ // do copy
+ TDocument xml("<?xml version=\"1.0\"?>\n"
+ "<root><a></a></root>",
+ TDocument::String);
+ TDocument xml2("<?xml version=\"1.0\"?>\n"
+ "<root><node><b>bold</b><i>ita</i></node></root>",
+ TDocument::String);
+ TNode node = xml2.Root().Node("//node");
+ TNode place = xml.Root().Node("//a");
+ place.AddChild(node);
+ TStringStream s;
+ xml.Save(s, "", false);
+ // Compare the serialized target document with the expected markup.
+ UNIT_ASSERT_VALUES_EQUAL(s.Str(),
+ "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+ "<root><a><node><b>bold</b><i>ita</i></node></a></root>\n");
+ }
+ // ToString() on a single element renders just that subtree, both without
+ // and with a default namespace on the document.
+ Y_UNIT_TEST(RenderNode) {
+ using namespace NXml;
+ {
+ // no namespaces
+ TDocument xml(
+ "<?xml version=\"1.0\"?>\n"
+ "<root>\n"
+ "<a><b len=\"15\" correct=\"1\">hello world</b></a>\n"
+ "<text>Некоторый текст</text>\n"
+ "</root>",
+ TDocument::String);
+ TNode n = xml.Root().Node("//a");
+ UNIT_ASSERT_VALUES_EQUAL(n.ToString(), "<a><b len=\"15\" correct=\"1\">hello world</b></a>");
+ }
+ {
+ // namespaces
+ TDocument xml(
+ "<?xml version=\"1.0\"?>\n"
+ "<root xmlns='http://hello.com/hello'>\n"
+ "<a><b len=\"15\" correct=\"1\">hello world</b></a>\n"
+ "<text>Некоторый текст</text>\n"
+ "</root>",
+ TDocument::String);
+ TNamespacesForXPath nss;
+ TNamespaceForXPath ns = {"h", "http://hello.com/hello"};
+ nss.push_back(ns);
+ TNode n = xml.Root().Node("//h:a", false, nss);
+ UNIT_ASSERT_VALUES_EQUAL(n.ToString(), "<a><b len=\"15\" correct=\"1\">hello world</b></a>");
+ }
+ }
+ // One XPath context can serve several queries on nodes of the same document;
+ // a context from another document is rejected.
+ Y_UNIT_TEST(ReuseXPathContext) {
+ using namespace NXml;
+ TDocument xml(
+ "<?xml version=\"1.0\"?>\n"
+ "<root>\n"
+ "<a><b><c>Hello, world!</c></b></a>\n"
+ "<text x=\"10\">First</text>\n"
+ "<text y=\"20\">Second</text>\n"
+ "</root>",
+ TDocument::String);
+ TXPathContextPtr rootCtxt = xml.Root().CreateXPathContext();
+ // Check Node()
+ TConstNode b = xml.Root().Node("a/b", false, *rootCtxt);
+ // We can use root node context for xpath evaluation in any node
+ TConstNode c1 = b.Node("c", false, *rootCtxt);
+ UNIT_ASSERT_EQUAL(c1.Value<TString>(), "Hello, world!");
+ TXPathContextPtr bCtxt = b.CreateXPathContext();
+ TConstNode c2 = b.Node("c", false, *bCtxt);
+ UNIT_ASSERT_EQUAL(c2.Value<TString>(), "Hello, world!");
+ // Mixing contexts from different documents is forbidden
+ TDocument otherXml("<root></root>", TDocument::String);
+ TXPathContextPtr otherCtxt = otherXml.Root().CreateXPathContext();
+ UNIT_ASSERT_EXCEPTION(b.Node("c", false, *otherCtxt), yexception);
+ // Check Nodes()
+ TConstNodes texts = xml.Root().Nodes("text", true, *rootCtxt);
+ UNIT_ASSERT_EQUAL(texts.Size(), 2);
+ // Nodes() doesn't work for non-element nodes
+ UNIT_ASSERT_EXCEPTION(xml.Root().Nodes("text/@x", true, *rootCtxt), yexception);
+ // Check XPath()
+ TConstNodes ys = xml.Root().XPath("text/@y", true, *rootCtxt);
+ UNIT_ASSERT_EQUAL(ys.Size(), 1);
+ UNIT_ASSERT_EQUAL(ys[0].Value<int>(), 20);
+ }
+ // SaveAsHtml() renders a value-less attribute and an explicit closing tag.
+ Y_UNIT_TEST(Html) {
+ using namespace NXml;
+ TDocument htmlChunk("video", TDocument::RootName);
+ TNode videoNode = htmlChunk.Root();
+ videoNode.SetAttr("controls");
+ TStringStream ss;
+ videoNode.SaveAsHtml(ss);
+ UNIT_ASSERT_EQUAL(ss.Str(), "<video controls></video>");
+ }
+ // After a move the source document has no root (Root() throws) and the
+ // target holds the content. Using xml1 after the move is deliberate here.
+ Y_UNIT_TEST(Move) {
+ using namespace NXml;
+ TDocument xml1("foo", TDocument::RootName);
+ xml1.Root().AddChild("bar");
+ UNIT_ASSERT_VALUES_EQUAL(xml1.Root().ToString(), "<foo><bar/></foo>");
+ TDocument xml2 = std::move(xml1);
+ UNIT_ASSERT_EXCEPTION(xml1.Root(), yexception);
+ UNIT_ASSERT_VALUES_EQUAL(xml2.Root().ToString(), "<foo><bar/></foo>");
+ }
+ // No assertions: this only checks that SetAttr accepts TStringBuf, TString,
+ // string literal and integer values.
+ Y_UNIT_TEST(StringConversion) {
+ using namespace NXml;
+ TDocument xml("foo", TDocument::RootName);
+ auto root = xml.Root();
+ const TStringBuf stringBuf = "bar";
+ root.SetAttr("bar", stringBuf);
+ const TString tString = "baz";
+ root.SetAttr("baz", tString);
+ root.SetAttr("quux", "literal");
+ root.SetAttr("frob", 500);
+ }
diff --git a/library/cpp/xml/document/xml-options.cpp b/library/cpp/xml/document/xml-options.cpp
new file mode 100644
index 0000000000..74e7545de3
--- /dev/null
+++ b/library/cpp/xml/document/xml-options.cpp
@@ -0,0 +1 @@
+#include "xml-options.h"
diff --git a/library/cpp/xml/document/xml-options.h b/library/cpp/xml/document/xml-options.h
new file mode 100644
index 0000000000..bb07da0cfb
--- /dev/null
+++ b/library/cpp/xml/document/xml-options.h
@@ -0,0 +1,67 @@
+#pragma once
+#include <contrib/libs/libxml/include/libxml/parser.h>
+namespace NXml {
+ // Strongly-typed parser option flags; each enumerator takes the value of the
+ // libxml2 XML_PARSE_* constant it is initialized from.
+ // NOTE(review): only Old10 is visible here, while xml-options_ut.cpp uses
+ // Huge, Compact and SAX1 - confirm the enumerator list is complete.
+ enum class EOption : int {
+ // clang-format off
+ Old10 = XML_PARSE_OLD10,
+ // clang-format on
+ };
+ /**
+ * Accumulates EOption flags into one integer bitmask (see GetMask()).
+ */
+ class TOptions {
+ public:
+ // No options set.
+ TOptions() {
+ }
+ // Set every option passed as an argument.
+ template <typename... TArgs>
+ TOptions(TArgs... args) {
+ Set(args...);
+ }
+ // Turn on a single option; returns *this to allow chaining.
+ TOptions& Set(EOption option) {
+ Mask = Mask | static_cast<int>(option);
+ return *this;
+ }
+ // Turn on several options: the first one, then the rest.
+ template <typename... TArgs>
+ TOptions& Set(EOption arg, TArgs... args) {
+ return Set(arg).Set(args...);
+ }
+ // The accumulated bitmask.
+ int GetMask() const {
+ return Mask;
+ }
+ private:
+ int Mask = 0;
+ };
diff --git a/library/cpp/xml/document/xml-options_ut.cpp b/library/cpp/xml/document/xml-options_ut.cpp
new file mode 100644
index 0000000000..9be16baf3d
--- /dev/null
+++ b/library/cpp/xml/document/xml-options_ut.cpp
@@ -0,0 +1,26 @@
+#include "xml-options.h"
+#include <library/cpp/testing/unittest/registar.h>
+Y_UNIT_TEST_SUITE(TestXmlOptions) {
+ // These tests contain no assertions; they only check that each way of
+ // building a TOptions compiles and runs.
+ Y_UNIT_TEST(SetHuge) {
+ NXml::TOptions opts;
+ opts.Set(NXml::EOption::Huge);
+ }
+ // Several options passed straight to the constructor.
+ Y_UNIT_TEST(VariadicContructor) {
+ NXml::TOptions opts(NXml::EOption::Huge, NXml::EOption::Compact, NXml::EOption::SAX1);
+ }
+ // Set() returns the object, so calls can be chained.
+ Y_UNIT_TEST(Chaining) {
+ NXml::TOptions opts;
+ opts
+ .Set(NXml::EOption::Huge)
+ .Set(NXml::EOption::Compact);
+ }
diff --git a/library/cpp/xml/document/xml-textreader.cpp b/library/cpp/xml/document/xml-textreader.cpp
new file mode 100644
index 0000000000..b946f1fbf2
--- /dev/null
+++ b/library/cpp/xml/document/xml-textreader.cpp
@@ -0,0 +1,318 @@
+#include "xml-textreader.h"
+#include <contrib/libs/libxml/include/libxml/xmlreader.h>
+#include <util/generic/yexception.h>
+#include <util/string/strip.h>
+#include <util/system/compiler.h>
+namespace NXml {
+ // Create a reader that pulls its input from `stream` through
+ // ReadFromInputStreamCallback, with parser options taken from `options`.
+ // Throws yexception if libxml2 cannot create the reader, or if an error
+ // was already logged by the time setup finishes.
+ TTextReader::TTextReader(IInputStream& stream, const TOptions& options)
+ : Stream(stream)
+ , IsError(false)
+ {
+ Impl.Reset(xmlReaderForIO(ReadFromInputStreamCallback, nullptr, this, nullptr, nullptr, options.GetMask()));
+ if (!Impl) {
+ ythrow yexception() << "cannot instantiate underlying xmlTextReader structure";
+ }
+ SetupErrorHandler();
+ CheckForExceptions();
+ }
+ // Nothing to do explicitly here.
+ TTextReader::~TTextReader() {
+ }
+ // The methods below are thin wrappers over the libxml2 xmlTextReader* call
+ // of the matching name. The *Result helper each one uses decides how the
+ // raw return value is checked and converted (a -1 for Bool/Int/Char results
+ // raises an exception; the *OrEmpty string helpers map nullptr to an empty
+ // string after checking for logged errors).
+ bool TTextReader::Read() {
+ return BoolResult(xmlTextReaderRead(Impl.Get()));
+ }
+ TString TTextReader::ReadInnerXml() const {
+ return TempStringOrEmptyResult(xmlTextReaderReadInnerXml(Impl.Get()));
+ }
+ TString TTextReader::ReadOuterXml() const {
+ return TempStringOrEmptyResult(xmlTextReaderReadOuterXml(Impl.Get()));
+ }
+ TString TTextReader::ReadString() const {
+ return TempStringOrEmptyResult(xmlTextReaderReadString(Impl.Get()));
+ }
+ bool TTextReader::ReadAttributeValue() const {
+ return BoolResult(xmlTextReaderReadAttributeValue(Impl.Get()));
+ }
+ int TTextReader::GetAttributeCount() const {
+ return IntResult(xmlTextReaderAttributeCount(Impl.Get()));
+ }
+ TStringBuf TTextReader::GetBaseUri() const {
+ return ConstStringOrEmptyResult(xmlTextReaderConstBaseUri(Impl.Get()));
+ }
+ int TTextReader::GetDepth() const {
+ return IntResult(xmlTextReaderDepth(Impl.Get()));
+ }
+ bool TTextReader::HasAttributes() const {
+ return BoolResult(xmlTextReaderHasAttributes(Impl.Get()));
+ }
+ bool TTextReader::HasValue() const {
+ return BoolResult(xmlTextReaderHasValue(Impl.Get()));
+ }
+ bool TTextReader::IsDefault() const {
+ return BoolResult(xmlTextReaderIsDefault(Impl.Get()));
+ }
+ bool TTextReader::IsEmptyElement() const {
+ return BoolResult(xmlTextReaderIsEmptyElement(Impl.Get()));
+ }
+ TStringBuf TTextReader::GetLocalName() const {
+ return ConstStringOrEmptyResult(xmlTextReaderConstLocalName(Impl.Get()));
+ }
+ TStringBuf TTextReader::GetName() const {
+ return ConstStringOrEmptyResult(xmlTextReaderConstName(Impl.Get()));
+ }
+ TStringBuf TTextReader::GetNamespaceUri() const {
+ return ConstStringOrEmptyResult(xmlTextReaderConstNamespaceUri(Impl.Get()));
+ }
+ // The integer node type reported by libxml2 is cast to ENodeType as is.
+ TTextReader::ENodeType TTextReader::GetNodeType() const {
+ return static_cast<ENodeType>(IntResult(xmlTextReaderNodeType(Impl.Get())));
+ }
+ TStringBuf TTextReader::GetPrefix() const {
+ return ConstStringOrEmptyResult(xmlTextReaderConstPrefix(Impl.Get()));
+ }
+ char TTextReader::GetQuoteChar() const {
+ return CharResult(xmlTextReaderQuoteChar(Impl.Get()));
+ }
+ TStringBuf TTextReader::GetValue() const {
+ return ConstStringOrEmptyResult(xmlTextReaderConstValue(Impl.Get()));
+ }
+ // The integer read state reported by libxml2 is cast to EReadState as is.
+ TTextReader::EReadState TTextReader::GetReadState() const {
+ return static_cast<EReadState>(IntResult(xmlTextReaderReadState(Impl.Get())));
+ }
+ // Close the underlying reader; a -1 result from libxml2 is turned into an exception.
+ void TTextReader::Close() {
+ if (xmlTextReaderClose(Impl.Get()) == -1) {
+ ThrowException();
+ }
+ }
+ // Attribute lookups by index, by name, and by local name plus namespace URI.
+ // All go through TempStringResult, so a nullptr from libxml2 raises an exception.
+ TString TTextReader::GetAttribute(int number) const {
+ return TempStringResult(xmlTextReaderGetAttributeNo(Impl.Get(), number));
+ }
+ TString TTextReader::GetAttribute(TZtStringBuf name) const {
+ return TempStringResult(xmlTextReaderGetAttribute(Impl.Get(), XMLCHAR(name.data())));
+ }
+ TString TTextReader::GetAttribute(TZtStringBuf localName, TZtStringBuf nsUri) const {
+ return TempStringResult(xmlTextReaderGetAttributeNs(Impl.Get(), XMLCHAR(localName.data()), XMLCHAR(nsUri.data())));
+ }
+ // Namespace lookup for `prefix`; a nullptr from libxml2 raises an exception as well.
+ TString TTextReader::LookupNamespace(TZtStringBuf prefix) const {
+ return TempStringResult(xmlTextReaderLookupNamespace(Impl.Get(), XMLCHAR(prefix.data())));
+ }
+ // Cursor movement wrappers. Each forwards to the libxml2 call of the
+ // matching name and converts the result with BoolResult (-1 raises an
+ // exception, 0 becomes false, anything else true).
+ bool TTextReader::MoveToAttribute(int number) {
+ return BoolResult(xmlTextReaderMoveToAttributeNo(Impl.Get(), number));
+ }
+ bool TTextReader::MoveToAttribute(TZtStringBuf name) {
+ return BoolResult(xmlTextReaderMoveToAttribute(Impl.Get(), XMLCHAR(name.data())));
+ }
+ bool TTextReader::MoveToAttribute(TZtStringBuf localName, TZtStringBuf nsUri) {
+ return BoolResult(xmlTextReaderMoveToAttributeNs(Impl.Get(), XMLCHAR(localName.data()), XMLCHAR(nsUri.data())));
+ }
+ bool TTextReader::MoveToFirstAttribute() {
+ return BoolResult(xmlTextReaderMoveToFirstAttribute(Impl.Get()));
+ }
+ bool TTextReader::MoveToNextAttribute() {
+ return BoolResult(xmlTextReaderMoveToNextAttribute(Impl.Get()));
+ }
+ bool TTextReader::MoveToElement() {
+ return BoolResult(xmlTextReaderMoveToElement(Impl.Get()));
+ }
+ // Expand the current node via xmlTextReaderExpand() and wrap it as a
+ // read-only node bound to the node's own document.
+ TConstNode TTextReader::Expand() const {
+ const xmlNodePtr node = xmlTextReaderExpand(Impl.Get());
+ if (node == nullptr) {
+ // ThrowException() always throws, so `node` is non-null below.
+ ThrowException();
+ }
+ return TConstNode(TNode(node->doc, node));
+ }
+ // Wrapper over xmlTextReaderNext(); result converted with BoolResult.
+ bool TTextReader::Next() {
+ return BoolResult(xmlTextReaderNext(Impl.Get()));
+ }
+ // Wrapper over xmlTextReaderIsValid(); result converted with BoolResult.
+ bool TTextReader::IsValid() const {
+ return BoolResult(xmlTextReaderIsValid(Impl.Get()));
+ }
+ // Callback for xmlReaderForIO() to read more data.
+ // It is almost "noexcept" (std::bad_alloc may happen when saving exception message to new TString).
+ // Waiting for std::exception_ptr and std::rethrow_exception from C++11 in Arcadia to make it really "noexcept".
+ // `context` is the TTextReader passed to xmlReaderForIO() by the constructor.
+ // Returns the value of Stream.Read(), or -1 if reading threw.
+ int TTextReader::ReadFromInputStreamCallback(void* context, char* buffer, int len) {
+ Y_ASSERT(len >= 0);
+ TTextReader* reader = static_cast<TTextReader*>(context);
+ int result = -1;
+ // Exception may be thrown by IInputStream::Read().
+ // It is caught unconditionally because exceptions cannot safely pass through libxml2 plain C code
+ // (no destructors, no RAII, raw pointers, so in case of stack unwinding some memory gets leaked).
+ try {
+ result = reader->Stream.Read(buffer, len);
+ } catch (const yexception& ex) {
+ // Keep the message; it is rethrown later by CheckForExceptions().
+ reader->LogError() << "read from input stream failed: " << ex;
+ } catch (...) {
+ reader->LogError() << "read from input stream failed";
+ }
+ return result;
+ }
+ // Error handler installed by SetupErrorHandler(). `arg` is the owning
+ // TTextReader. Appends one entry to the reader's error buffer in the form
+ // "<kind> (at line N, base URI U): <message>", where the location part is
+ // present only when a locator is supplied.
+ void TTextReader::OnLibxmlError(void* arg, const char* msg, xmlParserSeverities severity, xmlTextReaderLocatorPtr locator) {
+ TTextReader* reader = static_cast<TTextReader*>(arg);
+ Y_ASSERT(reader != nullptr);
+ TStringStream& out = reader->LogError();
+ if (severity == XML_PARSER_SEVERITY_ERROR) {
+ out << "libxml parse error";
+ } else if (severity == XML_PARSER_SEVERITY_VALIDITY_ERROR) {
+ out << "libxml validity error";
+ } else {
+ out << "libxml error";
+ }
+ if (locator != nullptr) {
+ const int line = xmlTextReaderLocatorLineNumber(locator);
+ const TCharPtr baseUri = xmlTextReaderLocatorBaseURI(locator);
+ out << " (";
+ if (line != -1) {
+ out << "at line " << line;
+ if (baseUri) {
+ out << ", ";
+ }
+ }
+ if (baseUri) {
+ out << "base URI " << CAST2CHAR(baseUri.Get());
+ }
+ out << ")";
+ }
+ TStringBuf message = (msg != nullptr) ? msg : "unknown";
+ message = StripStringRight(message); // remove trailing \n that is added by libxml
+ if (!message.empty()) {
+ out << ": " << message;
+ }
+ }
+ // Install OnLibxmlError as the reader's error handler, with `this` as its
+ // argument, unless a handler is already registered.
+ void TTextReader::SetupErrorHandler() {
+ xmlTextReaderErrorFunc func = nullptr;
+ void* arg = nullptr;
+ // We respect any other error handlers already set up:
+ xmlTextReaderGetErrorHandler(Impl.Get(), &func, &arg);
+ if (!func) {
+ func = TTextReader::OnLibxmlError;
+ xmlTextReaderSetErrorHandler(Impl.Get(), func, this);
+ }
+ }
+ // Mark the reader as failed and return the buffer to append a message to.
+ // Successive messages are separated by a newline.
+ TStringStream& TTextReader::LogError() const {
+ if (IsError) { // maybe there are previous errors
+ ErrorBuffer << Endl;
+ }
+ IsError = true;
+ return ErrorBuffer;
+ }
+ // If any error was logged, clear the error state and throw a yexception
+ // carrying the accumulated messages; otherwise do nothing.
+ void TTextReader::CheckForExceptions() const {
+ if (Y_LIKELY(!IsError)) {
+ return;
+ }
+ const TString message = ErrorBuffer.Str();
+ ErrorBuffer.clear();
+ IsError = false;
+ ythrow yexception() << message;
+ }
+ // Always throws: the logged errors if there are any, a generic message otherwise.
+ void TTextReader::ThrowException() const {
+ CheckForExceptions();
+ // Probably CheckForExceptions() would throw an exception with more verbose message. As the last resort
+ // (we do not even know the name of the failed libxml function, but it's possible to deduce it from stacktrace):
+ ythrow yexception() << "libxml function returned error exit code";
+ }
+ // Result adapters for libxml2 return values.
+ // Bool/Int/Char: -1 raises an exception, any other value is converted.
+ bool TTextReader::BoolResult(int value) const {
+ if (Y_UNLIKELY(value == -1)) {
+ ThrowException();
+ }
+ return (value != 0);
+ }
+ int TTextReader::IntResult(int value) const {
+ if (Y_UNLIKELY(value == -1)) {
+ ThrowException();
+ }
+ return value;
+ }
+ char TTextReader::CharResult(int value) const {
+ if (Y_UNLIKELY(value == -1)) {
+ ThrowException();
+ }
+ return static_cast<char>(value);
+ }
+ // Strict string adapter: nullptr raises an exception. The returned
+ // TStringBuf refers to the buffer `value` points at.
+ TStringBuf TTextReader::ConstStringResult(const xmlChar* value) const {
+ if (Y_UNLIKELY(value == nullptr)) {
+ ThrowException();
+ }
+ return CAST2CHAR(value);
+ }
+ // Lenient string adapter: throws only if an error was logged; nullptr
+ // becomes an empty TStringBuf.
+ TStringBuf TTextReader::ConstStringOrEmptyResult(const xmlChar* value) const {
+ CheckForExceptions();
+ return (value != nullptr) ? TStringBuf(CAST2CHAR(value)) : TStringBuf();
+ }
+ // Strict adapter for strings held in a TCharPtr: nullptr raises an
+ // exception, otherwise the contents are copied into a TString.
+ TString TTextReader::TempStringResult(TCharPtr value) const {
+ if (Y_UNLIKELY(value == nullptr)) {
+ ThrowException();
+ }
+ return TString(CAST2CHAR(value.Get()));
+ }
+ // Lenient variant of TempStringResult: throws only if an error was logged;
+ // nullptr becomes an empty TString.
+ TString TTextReader::TempStringOrEmptyResult(TCharPtr value) const {
+ CheckForExceptions();
+ return (value != nullptr) ? TString(CAST2CHAR(value.Get())) : TString();
+ }
+ // Deleter policy for the reader handle: releases it with xmlFreeTextReader().
+ struct TTextReader::TDeleter {
+ static inline void Destroy(xmlTextReaderPtr handle) {
+ xmlFreeTextReader(handle);
+ }
+ };
diff --git a/library/cpp/xml/document/xml-textreader.h b/library/cpp/xml/document/xml-textreader.h
new file mode 100644
index 0000000000..ab4c329d26
--- /dev/null
+++ b/library/cpp/xml/document/xml-textreader.h
@@ -0,0 +1,325 @@
+#pragma once
+#include "xml-document.h"
+#include "xml-options.h"
+#include <contrib/libs/libxml/include/libxml/xmlreader.h>
+#include <library/cpp/string_utils/ztstrbuf/ztstrbuf.h>
+#include <util/generic/noncopyable.h>
+#include <util/generic/ptr.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <functional>
+#include <util/stream/input.h>
+#include <util/stream/str.h>
+namespace NXml {
+ /**
+ * TextReader Parser
+ *
+ * Streaming XML API modeled on C# interfaces.
+ * Provides fast, non-cached, forward-only access to XML data.
+ *
+ * Like the SAX parser, the TextReader parser is suitable for sequential
+ * parsing, but instead of implementing handlers for specific parts of the
+ * document, it allows you to detect the current node type, process the node
+ * accordingly, and skip forward in the document as much as necessary.
+ *
+ * Unlike the DOM parser, you may not move backwards in the XML document.
+ * And unlike the SAX parser, you need not waste time processing nodes that do not
+ * interest you.
+ *
+ * All methods are on the single parser instance, but their result depends on the current context.
+ * For instance, use Read() to move to the next node, and MoveToFirstAttribute(), MoveToNextAttribute()
+ * and MoveToElement() to navigate between an element and its attributes.
+ * These methods will return false when no more nodes are available. Then use
+ * methods such as GetName() and GetValue() to examine the elements and their attributes.
+ *
+ * This wrapper is inspired by TextReader from libxml++.
+ */
+ class TTextReader: private TNonCopyable {
+ public:
+ // strongly-typed alias for enum from xmlreader.h
+ enum class ENodeType : int {
+ // clang-format off
+ // clang-format on
+ };
+ enum class EReadState : int {
+ // clang-format off
+ // clang-format on
+ };
+ public:
+ TTextReader(IInputStream& stream, const TOptions& options = TOptions());
+ ~TTextReader();
+ /**
+ * Moves the position of the current instance to the next node in the stream, exposing its properties.
+ * @return true if the node was read successfully, false if there are no more nodes to read
+ */
+ bool Read();
+ /**
+ * Reads the contents of the current node, including child nodes and markup.
+ * @return A string containing the XML content, or an empty string
+ * if the current node is neither an element nor attribute, or has no child nodes
+ */
+ TString ReadInnerXml() const;
+ /**
+ * Reads the current node and its contents, including child nodes and markup.
+ * @return A string containing the XML content, or an empty string
+ * if the current node is neither an element nor attribute
+ */
+ TString ReadOuterXml() const;
+ /**
+ * Reads the contents of an element or a text node as a string.
+ * @return A string containing the contents of the Element or Text node,
+ * or an empty string if the reader is positioned on any other type of node
+ */
+ TString ReadString() const;
+ /**
+ * Parses an attribute value into one or more Text and EntityReference nodes.
+ * @return A bool where true indicates the attribute value was parsed,
+ * and false indicates the reader was not positioned on an attribute node
+ * or all the attribute values have been read
+ */
+ bool ReadAttributeValue() const;
+ /**
+ * Gets the number of attributes on the current node.
+ * @return The number of attributes on the current node, or zero if the current node
+ * does not support attributes
+ */
+ int GetAttributeCount() const;
+ /**
+ * Gets the base Uniform Resource Identifier (URI) of the current node.
+ * @return The base URI of the current node or an empty string if not available
+ */
+ TStringBuf GetBaseUri() const;
+ /**
+ * Gets the depth of the current node in the XML document.
+ * @return The depth of the current node in the XML document
+ */
+ int GetDepth() const;
+ /**
+ * Gets a value indicating whether the current node has any attributes.
+ * @return true if the current node has attributes, false otherwise
+ */
+ bool HasAttributes() const;
+ /**
+ * Whether the node can have a text value.
+ * @return true if the current node can have an associated text value, false otherwise
+ */
+ bool HasValue() const;
+ /**
+ * Whether an Attribute node was generated from the default value defined in the DTD or schema.
+ * @return true if defaulted, false otherwise
+ */
+ bool IsDefault() const;
+ /**
+ * Check if the current node is empty.
+ * @return true if empty, false otherwise
+ */
+ bool IsEmptyElement() const;
+ /**
+ * The local name of the node.
+ * @return the local name or empty string if not available
+ */
+ TStringBuf GetLocalName() const;
+ /**
+ * The qualified name of the node, equal to Prefix:LocalName.
+ * @return the name or empty string if not available
+ */
+ TStringBuf GetName() const;
+ /**
+ * The URI defining the namespace associated with the node.
+ * @return the namespace URI or empty string if not available
+ */
+ TStringBuf GetNamespaceUri() const;
+ /**
+ * Get the node type of the current node.
+ * @return the ENodeType of the current node
+ */
+ ENodeType GetNodeType() const;
+ /**
+ * Get the namespace prefix associated with the current node.
+ * @return the namespace prefix, or an empty string if not available
+ */
+ TStringBuf GetPrefix() const;
+ /**
+ * Get the quotation mark character used to enclose the value of an attribute.
+ * @return " or '
+ */
+ char GetQuoteChar() const;
+ /**
+ * Provides the text value of the node if present.
+ * @return the string or empty if not available
+ */
+ TStringBuf GetValue() const;
+ /**
+ * Gets the read state of the reader.
+ * @return the state value
+ */
+ EReadState GetReadState() const;
+ /**
+ * Releases any resources allocated by the current instance,
+ * changes the state to Closed and closes any underlying input.
+ */
+ void Close();
+ /**
+ * Provides the value of the attribute with the specified index relative to the containing element.
+ * @param number the zero-based index of the attribute relative to the containing element
+ */
+ TString GetAttribute(int number) const;
+ /**
+ * Provides the value of the attribute with the specified qualified name.
+ * @param name the qualified name of the attribute
+ */
+ TString GetAttribute(TZtStringBuf name) const;
+ /**
+ * Provides the value of the specified attribute.
+ * @param localName the local name of the attribute
+ * @param nsUri the namespace URI of the attribute
+ */
+ TString GetAttribute(TZtStringBuf localName, TZtStringBuf nsUri) const;
+ /**
+ * Resolves a namespace prefix in the scope of the current element.
+ * @param prefix the prefix whose namespace URI is to be resolved. To return the default namespace, specify empty string.
+ * @return a string containing the namespace URI to which the prefix maps.
+ */
+ TString LookupNamespace(TZtStringBuf prefix) const;
+ /**
+ * Moves the position of the current instance to the attribute with the specified index relative to the containing element.
+ * @param number the zero-based index of the attribute relative to the containing element
+ * @return true in case of success, false if not found
+ */
+ bool MoveToAttribute(int number);
+ /**
+ * Moves the position of the current instance to the attribute with the specified qualified name.
+ * @param name the qualified name of the attribute
+ * @return true in case of success, false if not found
+ */
+ bool MoveToAttribute(TZtStringBuf name);
+ /**
+ * Moves the position of the current instance to the attribute with the specified local name and namespace URI.
+ * @param localName the local name of the attribute
+ * @param nsUri the namespace URI of the attribute
+ * @return true in case of success, false if not found
+ */
+ bool MoveToAttribute(TZtStringBuf localName, TZtStringBuf nsUri);
+ /**
+ * Moves the position of the current instance to the first attribute associated with the current node.
+ * @return true in case of success, false if not found
+ */
+ bool MoveToFirstAttribute();
+ /**
+ * Moves the position of the current instance to the next attribute associated with the current node.
+ * @return true in case of success, false if not found
+ */
+ bool MoveToNextAttribute();
+ /**
+ * Moves the position of the current instance to the node that contains the current Attribute node.
+ * @return true in case of success, false if not found
+ */
+ bool MoveToElement();
+ /**
+ * Reads the contents of the current node and the full subtree. It then makes the subtree available until the next Read() call.
+ * @return the current node; per the note above it should not be used after the next Read() call
+ */
+ TConstNode Expand() const;
+ /**
+ * Skip to the node following the current one in document order while avoiding the subtree if any.
+ * @return true if the node was read successfully, false if there are no more nodes to read
+ */
+ bool Next();
+ /**
+ * Retrieve the validity status from the parser context.
+ */
+ bool IsValid() const;
+ private:
+ static int ReadFromInputStreamCallback(void* context, char* buffer, int len); // NOTE(review): presumably the libxml input callback that pulls bytes from Stream; confirm in xml-textreader.cpp
+ static void OnLibxmlError(void* arg, const char* msg, xmlParserSeverities severity, xmlTextReaderLocatorPtr locator); // NOTE(review): presumably the libxml error callback that fills ErrorBuffer; confirm in xml-textreader.cpp
+ void SetupErrorHandler();
+ TStringStream& LogError() const;
+ void CheckForExceptions() const; // throws yexception carrying the ErrorBuffer contents if IsError is set (the error state is reset first)
+ void ThrowException() const; // always throws: the buffered error if there is one, a generic message otherwise
+ // helpers that check return codes of C functions from libxml
+ bool BoolResult(int value) const; // throws on -1, otherwise returns value != 0
+ int IntResult(int value) const; // throws on -1, otherwise returns value unchanged
+ char CharResult(int value) const; // throws on -1, otherwise returns value cast to char
+ TStringBuf ConstStringResult(const xmlChar* value) const; // throws on nullptr
+ TStringBuf ConstStringOrEmptyResult(const xmlChar* value) const; // nullptr yields an empty TStringBuf; throws only if an error is pending
+ TString TempStringResult(TCharPtr value) const; // throws on nullptr, otherwise copies the characters into a TString
+ TString TempStringOrEmptyResult(TCharPtr value) const; // nullptr yields an empty TString; throws only if an error is pending
+ private:
+ IInputStream& Stream;
+ mutable bool IsError; // mutable: the const CheckForExceptions() resets it
+ mutable TStringStream ErrorBuffer; // mutable: the const CheckForExceptions() reads and clears it
+ struct TDeleter;
+ THolder<xmlTextReader, TDeleter> Impl;
+ };
diff --git a/library/cpp/xml/document/xml-textreader_ut.cpp b/library/cpp/xml/document/xml-textreader_ut.cpp
new file mode 100644
index 0000000000..6232dfe47e
--- /dev/null
+++ b/library/cpp/xml/document/xml-textreader_ut.cpp
@@ -0,0 +1,290 @@
+#include "xml-textreader.h"
+#include <library/cpp/testing/unittest/registar.h>
+#include <util/generic/hash.h>
+#include <util/generic/vector.h>
+#include <util/string/join.h>
+namespace {
+ /**
+ * Test helper: streams xmlData through NXml::TTextReader and passes the expanded subtree of
+ * every element whose local name and namespace URI match to nodeHandlerFunc.
+ */
+ void ParseXml(const TString& xmlData,
+               std::function<void(NXml::TConstNode)> nodeHandlerFunc,
+               const TString& localName,
+               const TString& namespaceUri = TString()) {
+     using ENodeType = NXml::TTextReader::ENodeType;
+     TStringInput source(xmlData);
+     NXml::TTextReader reader(source);
+     while (reader.Read()) {
+         // only opening element nodes are of interest
+         if (reader.GetNodeType() != ENodeType::Element) {
+             continue;
+         }
+         // the name is checked before the namespace, as a mismatch there is the common case
+         if (reader.GetLocalName() != localName) {
+             continue;
+         }
+         if (reader.GetNamespaceUri() != namespaceUri) {
+             continue;
+         }
+         const NXml::TConstNode subtree = reader.Expand();
+         nodeHandlerFunc(subtree);
+     }
+ }
+Y_UNIT_TEST_SUITE(TestXmlTextReader) {
+ Y_UNIT_TEST(BasicExample) {
+     // Walks a small document node by node and compares depth, type, name, attributes and
+     // value of every visited node against a hand-written expectation table.
+     const TString xml = "<?xml version=\"1.0\"?>\n"
+                         "<example toto=\"1\">\n"
+                         " <examplechild id=\"1\">\n"
+                         " <child_of_child/>\n"
+                         " </examplechild>\n"
+                         " <examplechild id=\"2\" toto=\"3\">\n"
+                         " <child_of_child>Some content : -)</child_of_child>\n"
+                         " </examplechild>\n"
+                         "</example>\n";
+     TStringInput input(xml);
+     NXml::TTextReader reader(input);
+     using ENT = NXml::TTextReader::ENodeType;
+     // one record per node reported by Read()
+     struct TItem {
+         int Depth;
+         ENT Type;
+         TString Name;
+         TString Attrs;
+         TString Value;
+     };
+     TVector<TItem> found;
+     while (reader.Read()) {
+         // dump attributes as "k1: v1, k2: v2, ..."
+         TVector<TString> kv;
+         if (reader.HasAttributes()) {
+             reader.MoveToFirstAttribute();
+             do {
+                 kv.push_back(TString::Join(reader.GetName(), ": ", reader.GetValue()));
+             } while (reader.MoveToNextAttribute());
+             // step back from the last attribute to the element that owns it
+             reader.MoveToElement();
+         }
+         found.push_back(TItem{
+             reader.GetDepth(),
+             reader.GetNodeType(),
+             TString(reader.GetName()),
+             JoinSeq(", ", kv),
+             reader.HasValue() ? TString(reader.GetValue()) : TString(),
+         });
+     }
+     const TVector<TItem> expected = {
+         TItem{0, ENT::Element, "example", "toto: 1", ""},
+         TItem{1, ENT::SignificantWhitespace, "#text", "", "\n "},
+         TItem{1, ENT::Element, "examplechild", "id: 1", ""},
+         TItem{2, ENT::SignificantWhitespace, "#text", "", "\n "},
+         TItem{2, ENT::Element, "child_of_child", "", ""},
+         TItem{2, ENT::SignificantWhitespace, "#text", "", "\n "},
+         TItem{1, ENT::EndElement, "examplechild", "id: 1", ""},
+         TItem{1, ENT::SignificantWhitespace, "#text", "", "\n "},
+         TItem{1, ENT::Element, "examplechild", "id: 2, toto: 3", ""},
+         TItem{2, ENT::SignificantWhitespace, "#text", "", "\n "},
+         TItem{2, ENT::Element, "child_of_child", "", ""},
+         TItem{3, ENT::Text, "#text", "", "Some content : -)"},
+         TItem{2, ENT::EndElement, "child_of_child", "", ""},
+         TItem{2, ENT::SignificantWhitespace, "#text", "", "\n "},
+         TItem{1, ENT::EndElement, "examplechild", "id: 2, toto: 3", ""},
+         TItem{1, ENT::SignificantWhitespace, "#text", "", "\n"},
+         TItem{0, ENT::EndElement, "example", "toto: 1", ""}};
+     // compare sizes first so that the element-wise loop below never indexes out of range
+     UNIT_ASSERT_VALUES_EQUAL(found.size(), expected.size());
+     for (size_t i = 0; i < expected.size(); ++i) {
+         UNIT_ASSERT_VALUES_EQUAL_C(found[i].Depth, expected[i].Depth, "line " << i);
+         UNIT_ASSERT_EQUAL_C(found[i].Type, expected[i].Type, "line " << i);
+         UNIT_ASSERT_VALUES_EQUAL_C(found[i].Name, expected[i].Name, "line " << i);
+         UNIT_ASSERT_VALUES_EQUAL_C(found[i].Attrs, expected[i].Attrs, "line " << i);
+         UNIT_ASSERT_VALUES_EQUAL_C(found[i].Value, expected[i].Value, "line " << i);
+     }
+ }
+ const TString GEODATA = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" // fixture for the ParseXml* tests: three <country> elements (ids 225, 149, 187), each with a <name> and <cities>/<city> children
+ "<root>"
+ ""
+ " <country id=\"225\">"
+ " <name>Россия</name>"
+ " <cities>"
+ " <city>Москва</city>"
+ " <city>Санкт-Петербург</city>"
+ " </cities>"
+ " </country>"
+ ""
+ " <country id=\"149\">"
+ " <name>Беларусь</name>"
+ " <cities>"
+ " <city>Минск</city>"
+ " </cities>"
+ " </country>"
+ ""
+ " <country id=\"187\">"
+ " <name>Украина</name>"
+ " <cities>"
+ " <city>Киев</city>"
+ " </cities>"
+ " </country>"
+ ""
+ "</root>";
+ Y_UNIT_TEST(ParseXmlSimple) {
+     // Builds an id -> country map from the expanded <country> subtrees and verifies every entry.
+     struct TCountry {
+         TString Name;
+         TVector<TString> Cities;
+     };
+     THashMap<int, TCountry> countries;
+     auto collectCountry = [&countries](NXml::TConstNode countryNode) {
+         const int countryId = countryNode.Attr<int>("id");
+         TCountry& entry = countries[countryId];
+         entry.Name = countryNode.FirstChild("name").Value<TString>();
+         // cities are addressed with an XPath relative to the <country> element
+         const NXml::TConstNodes cities = countryNode.Nodes("cities/city");
+         for (auto city : cities) {
+             entry.Cities.push_back(city.Value<TString>());
+         }
+     };
+     ParseXml(GEODATA, collectCountry, "country");
+     UNIT_ASSERT_EQUAL(countries.size(), 3);
+
+     UNIT_ASSERT(countries.contains(225));
+     const TCountry& ru = countries.at(225);
+     UNIT_ASSERT_EQUAL(ru.Name, "Россия");
+     UNIT_ASSERT_EQUAL(ru.Cities.size(), 2);
+     UNIT_ASSERT_EQUAL(ru.Cities[0], "Москва");
+     UNIT_ASSERT_EQUAL(ru.Cities[1], "Санкт-Петербург");
+
+     UNIT_ASSERT(countries.contains(149));
+     const TCountry& by = countries.at(149);
+     UNIT_ASSERT_EQUAL(by.Name, "Беларусь");
+     UNIT_ASSERT_EQUAL(by.Cities.size(), 1);
+     UNIT_ASSERT_EQUAL(by.Cities[0], "Минск");
+
+     UNIT_ASSERT(countries.contains(187));
+     const TCountry& ua = countries.at(187);
+     UNIT_ASSERT_EQUAL(ua.Name, "Украина");
+     UNIT_ASSERT_EQUAL(ua.Cities.size(), 1);
+     UNIT_ASSERT_EQUAL(ua.Cities[0], "Киев");
+ }
+ Y_UNIT_TEST(ParseXmlDeepLevel) {
+     // <city> elements sit several levels below the root; all of them must be reported, in document order.
+     TVector<TString> collected;
+     auto collectCity = [&collected](NXml::TConstNode cityNode) {
+         collected.push_back(cityNode.Value<TString>());
+     };
+     ParseXml(GEODATA, collectCity, "city");
+     UNIT_ASSERT_EQUAL(collected.size(), 4);
+     UNIT_ASSERT_EQUAL(collected[0], "Москва");
+     UNIT_ASSERT_EQUAL(collected[1], "Санкт-Петербург");
+     UNIT_ASSERT_EQUAL(collected[2], "Минск");
+     UNIT_ASSERT_EQUAL(collected[3], "Киев");
+ }
+ Y_UNIT_TEST(ParseXmlException) {
+     // Exceptions thrown by the handler, as well as errors in malformed input, must reach the caller
+     // as yexception even though plain C code of libxml sits in between; valgrind must report no leaks.
+     auto rejectUnknownId = [](NXml::TConstNode node) {
+         const int id = node.Attr<int>("id");
+         if (id == 225) {
+             return;
+         }
+         ythrow yexception() << "unsupported id: " << id;
+     };
+     // the handler itself throws on the second country
+     UNIT_ASSERT_EXCEPTION(ParseXml(GEODATA, rejectUnknownId, "country"), yexception);
+     // malformed documents
+     UNIT_ASSERT_EXCEPTION(ParseXml("<a></b>", rejectUnknownId, "a"), yexception);
+     UNIT_ASSERT_EXCEPTION(ParseXml("<root><a id=\"1\"></a><a id=\"2\"></b></root>", rejectUnknownId, "a"), yexception);
+     UNIT_ASSERT_EXCEPTION(ParseXml("<root><a id=\"1\"></a><a id=\"2></a></root>", rejectUnknownId, "a"), yexception);
+ }
+ const TString BACKA = // UTF-8 encoding is used implicitly (there is no XML declaration); fixture for NamespaceHell: two <Company> elements that mix default and prefixed namespace declarations
+ "<Companies"
+ " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\""
+ " xmlns=\"http://maps.yandex.ru/backa/1.x\""
+ " xmlns:atom=\"http://www.w3.org/2005/Atom\""
+ " xmlns:biz=\"http://maps.yandex.ru/business/1.x\""
+ " xmlns:xal=\"urn:oasis:names:tc:ciq:xsdschema:xAL:2.0\""
+ " xmlns:gml=\"http://www.opengis.net/gml\""
+ ">"
+ ""
+ " <Company id=\"0001\">"
+ " <Geo>"
+ " <Location>"
+ " <gml:pos>37.62669 55.664827</gml:pos>"
+ " <kind>house</kind>"
+ " </Location>"
+ " <AddressDetails xmlns=\"urn:oasis:names:tc:ciq:xsdschema:xAL:2.0\">"
+ " <Country>"
+ " <AddressLine xml:lang=\"ru\">Москва, Каширское ш., 14</AddressLine>"
+ " </Country>"
+ " </AddressDetails>"
+ " </Geo>"
+ " </Company>"
+ ""
+ " <Company id=\"0002\">"
+ " <Geo>"
+ " <Location>"
+ " <pos xmlns=\"http://www.opengis.net/gml\">150.819797 59.56092</pos>"
+ " <kind>locality</kind>"
+ " </Location>"
+ " <xal:AddressDetails>"
+ " <xal:Country>"
+ " <xal:AddressLine xml:lang=\"ru\">Магадан, ул. Пролетарская, 43</xal:AddressLine>"
+ " </xal:Country>"
+ " </xal:AddressDetails>"
+ " </Geo>"
+ " </Company>"
+ ""
+ "</Companies>";
+ Y_UNIT_TEST(NamespaceHell) {
+     // The same logical elements are written in BACKA with different namespace spellings
+     // (inherited default namespace, local default namespace, explicit prefix); XPath queries
+     // with registered prefixes must resolve all of them.
+     using TNS = NXml::TNamespaceForXPath;
+     const NXml::TNamespacesForXPath ns = {
+         TNS{"b", "http://maps.yandex.ru/backa/1.x"},
+         TNS{"gml", "http://www.opengis.net/gml"},
+         TNS{"xal", "urn:oasis:names:tc:ciq:xsdschema:xAL:2.0"}};
+     int count = 0;
+     THashMap<TString, TString> positions;
+     THashMap<TString, TString> addresses;
+     auto handler = [&](NXml::TConstNode node) {
+         count++;
+         const auto id = node.Attr<TString>("id");
+         NXml::TXPathContextPtr ctxt = node.CreateXPathContext(ns);
+         const NXml::TConstNode location = node.Node("b:Geo/b:Location", false, *ctxt);
+         positions[id] = location.Node("gml:pos", false, *ctxt).Value<TString>();
+         addresses[id] = node.Node("b:Geo/xal:AddressDetails/xal:Country/xal:AddressLine", false, *ctxt).Value<TString>();
+     };
+     // nothing found because namespace was not specified
+     ParseXml(BACKA, handler, "Company");
+     UNIT_ASSERT_EQUAL(count, 0);
+     // with the namespace given, both <Company> elements must be handled exactly once
+     ParseXml(BACKA, handler, "Company", "http://maps.yandex.ru/backa/1.x");
+     UNIT_ASSERT_VALUES_EQUAL(count, 2);
+     UNIT_ASSERT_VALUES_EQUAL(positions["0001"], "37.62669 55.664827");
+     UNIT_ASSERT_VALUES_EQUAL(positions["0002"], "150.819797 59.56092");
+     UNIT_ASSERT_VALUES_EQUAL(addresses["0001"], "Москва, Каширское ш., 14");
+     UNIT_ASSERT_VALUES_EQUAL(addresses["0002"], "Магадан, ул. Пролетарская, 43");
+ }
diff --git a/library/cpp/xml/document/ya.make b/library/cpp/xml/document/ya.make
new file mode 100644
index 0000000000..86bbd639cf
--- /dev/null
+++ b/library/cpp/xml/document/ya.make
@@ -0,0 +1,17 @@
+ xml-document.cpp
+ xml-textreader.cpp
+ xml-options.cpp
+ library/cpp/xml/init
+ contrib/libs/libxml
+ library/cpp/string_utils/ztstrbuf