diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/xml/document | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/xml/document')
-rw-r--r-- | library/cpp/xml/document/README | 42 | ||||
-rw-r--r-- | library/cpp/xml/document/libxml-guards.h | 50 | ||||
-rw-r--r-- | library/cpp/xml/document/node-attr.h | 209 | ||||
-rw-r--r-- | library/cpp/xml/document/ut/ya.make | 11 | ||||
-rw-r--r-- | library/cpp/xml/document/xml-document-decl.h | 718 | ||||
-rw-r--r-- | library/cpp/xml/document/xml-document.cpp | 393 | ||||
-rw-r--r-- | library/cpp/xml/document/xml-document.h | 4 | ||||
-rw-r--r-- | library/cpp/xml/document/xml-document_ut.cpp | 319 | ||||
-rw-r--r-- | library/cpp/xml/document/xml-options.cpp | 1 | ||||
-rw-r--r-- | library/cpp/xml/document/xml-options.h | 67 | ||||
-rw-r--r-- | library/cpp/xml/document/xml-options_ut.cpp | 26 | ||||
-rw-r--r-- | library/cpp/xml/document/xml-textreader.cpp | 318 | ||||
-rw-r--r-- | library/cpp/xml/document/xml-textreader.h | 325 | ||||
-rw-r--r-- | library/cpp/xml/document/xml-textreader_ut.cpp | 290 | ||||
-rw-r--r-- | library/cpp/xml/document/ya.make | 17 |
15 files changed, 2790 insertions, 0 deletions
diff --git a/library/cpp/xml/document/README b/library/cpp/xml/document/README new file mode 100644 index 0000000000..b2649523d8 --- /dev/null +++ b/library/cpp/xml/document/README @@ -0,0 +1,42 @@ +A wrapper around the DOM interface of libxml2. + +The standard way to use it is as follows: + + #include <library/cpp/xml/document/xml-document.h> + ... + + // open a document + NXml::TDocument xml("filename.xml"); + + // get a nodeset from an XPath query + NXml::TConstNodes nodes = xml.Root().Nodes("xpath/expression/here"); + + // iterate over the nodeset + for (size_t i = 0; i < nodes.size(); ++i) { + using namespace NXml; + TConstNode node = nodes[i]; + // query node + TString name = node.Name(); + TString lang = node.Attr<TString>("lang"); + TString text = node.Value<TString>(); + TConstNode child = node.FirstChild(""); + // edit node + TNode editable = child.ConstCast(); + editable.DelAttr("id"); + editable.SetAttr("x", 2); + editable.SetValue(5); + editable.AddText(" apples"); + } + + // edit documents with copy-paste + NXml::TDocument xml2("<xpath><node/></xpath>", NXml::TDocument::String); + NXml::TNode place = xml2.Root().Node("xpath/node"); + // copy node's subtree from one document to another + place.AddChild(xml.Root()); + // save (render) single element + TString modifiedNode = place.ToString(); + // save whole document with optional encoding + TString modifiedDoc = xml2.ToString("ISO-8859-1"); + + +See xml-document_ut.cpp for more examples. 
diff --git a/library/cpp/xml/document/libxml-guards.h b/library/cpp/xml/document/libxml-guards.h new file mode 100644 index 0000000000..4188cecff1 --- /dev/null +++ b/library/cpp/xml/document/libxml-guards.h @@ -0,0 +1,50 @@ +#pragma once + +#include <library/cpp/xml/init/ptr.h> +#include <util/generic/ptr.h> +#include <libxml/xmlstring.h> +#include <libxml/tree.h> +#include <libxml/xpath.h> +#include <libxml/uri.h> +#include <libxml/xmlsave.h> + +namespace NXml { + namespace NDetail { /* deleter traits: every Destroy() passes the raw handle to one libxml2 release call */ + struct TSignedCharPtrTraits { + static void Destroy(char* buffer) { /* char* buffer released with xmlFree */ + xmlFree(buffer); + } + }; + + struct TCharPtrTraits { + static void Destroy(xmlChar* buffer) { /* xmlChar* buffer released with xmlFree */ + xmlFree(buffer); + } + }; + + struct TOutputBufferPtrTraits { + static void Destroy(xmlOutputBufferPtr out) { /* output buffer released with xmlOutputBufferClose */ + xmlOutputBufferClose(out); + } + }; + + struct TSaveCtxtPtrTraits { + static void Destroy(xmlSaveCtxtPtr ctxt) { /* save context released with xmlSaveClose */ + xmlSaveClose(ctxt); + } + }; + + } + + using TXPathContextPtr = TxmlXPathContextPtr; + using TXPathObjectPtr = TxmlXPathObjectPtr; + using TSignedCharPtr = TAutoPtr<char, NDetail::TSignedCharPtrTraits>; + using TCharPtr = TAutoPtr<xmlChar, NDetail::TCharPtrTraits>; + using TDocHolder = TxmlDocHolder; + using TURIPtr = TxmlURIPtr; + using TNodePtr = TxmlNodePtr; + using TOutputBufferPtr = TAutoPtr<xmlOutputBuffer, NDetail::TOutputBufferPtrTraits>; + using TParserCtxtPtr = TxmlParserCtxtPtr; + using TSaveCtxtPtr = TAutoPtr<xmlSaveCtxt, NDetail::TSaveCtxtPtrTraits>; + +} diff --git a/library/cpp/xml/document/node-attr.h b/library/cpp/xml/document/node-attr.h new file mode 100644 index 0000000000..6e74403943 --- /dev/null +++ b/library/cpp/xml/document/node-attr.h @@ -0,0 +1,209 @@ +#pragma once + +#include "xml-document-decl.h" +#include "libxml-guards.h" +#include <util/stream/str.h> +#include <util/string/cast.h> + +namespace NXml { +#define THROW(x, y) ythrow yexception() << #x << ": " << y + + // libxml defines unsigned char -> xmlChar, + // and all functions use xmlChar. 
+ inline static const char* CAST2CHAR(const xmlChar* x) { + return reinterpret_cast<const char*>(x); + } + inline static const xmlChar* XMLCHAR(const char* x) { + return reinterpret_cast<const xmlChar*>(x); + } + + template <class T> + void TNode::AttrInternal(TCharPtr& value, T& res, TStringBuf errContext) const { + try { + res = FromString<T>(CAST2CHAR(value.Get())); + } catch (TFromStringException&) { + THROW(XmlException, "Failed to convert string " << TString{TStringBuf(CAST2CHAR(value.Get())).substr(0, 50)}.Quote() << " from '" << errContext << "' to requested type"); + } + } + + template <> + inline void TNode::AttrInternal(TCharPtr& value, TString& res, TStringBuf /*errContext*/) const { + TString tmp(CAST2CHAR(value.Get())); + res.swap(tmp); + } + + template <class T> + T TNode::Attr(TZtStringBuf name) const { + TCharPtr value(xmlGetProp(NodePointer, XMLCHAR(name.c_str()))); + if (!value) { + THROW(AttributeNotFound, Path() << "@" << name); + } + + T t; + AttrInternal(value, t, name); + return t; + } + + template <class T> + T TNode::Attr(TZtStringBuf name, const T& defvalue) const { + TCharPtr attr(xmlGetProp(NodePointer, XMLCHAR(name.c_str()))); + if (!attr) { + return defvalue; + } + + T t; + AttrInternal(attr, t, name); + return t; + } + + template <class T> + void TNode::Attr(TZtStringBuf name, T& value) const { + TCharPtr attr(xmlGetProp(NodePointer, XMLCHAR(name.c_str()))); + if (!attr) { + THROW(AttributeNotFound, Path() << name); + } + + AttrInternal(attr, value, name); + } + + template <class T> + void TNode::Attr(TZtStringBuf name, T& value, const T& defvalue) const { + TCharPtr attr(xmlGetProp(NodePointer, XMLCHAR(name.c_str()))); + + if (!attr) { + value = defvalue; + } else { + AttrInternal(attr, value, name); + } + } + + template <class T> + T TNode::Value() const { + if (!NodePointer || xmlIsBlankNode(NodePointer)) { + THROW(NodeIsBlank, Path()); + } + + TCharPtr val(xmlNodeGetContent(NodePointer)); + T t; + AttrInternal(val, t, 
this->Name()); + return t; + } + + /** + * Node text converted to T; defvalue is returned when the node pointer is null or the node is blank. + */ + template <class T> + T TNode::Value(const T& defvalue) const { + const bool blank = !NodePointer || xmlIsBlankNode(NodePointer); + if (blank) { + return defvalue; + } + + TCharPtr content(xmlNodeGetContent(NodePointer)); + T parsed; + AttrInternal(content, parsed, this->Name()); + return parsed; + } + + /** + * Non-string overload: renders value with operator<< and forwards the text to SetValue(TStringBuf). + */ + template <class T> + typename std::enable_if<!std::is_convertible_v<T, TStringBuf>, void>::type + TNode::SetValue(const T& value) { + TStringStream rendered; + rendered << value; + SetValue(rendered.Str()); + } + + /** + * Replaces the node content: clears it first, then appends value by explicit length. + */ + inline void TNode::SetValue(TStringBuf value) { + const xmlChar* text = XMLCHAR(value.data()); + xmlNodeSetContent(NodePointer, XMLCHAR("")); + xmlNodeAddContentLen(NodePointer, text, value.Size()); + } + + /** + * Sets (creating if needed) attribute `name` to `value`; throws when xmlSetProp returns null. + */ + inline void TNode::SetAttr(TZtStringBuf name, TZtStringBuf value) { + const xmlChar* attrName = XMLCHAR(name.c_str()); + const xmlChar* attrValue = XMLCHAR(value.c_str()); + + if (xmlSetProp(NodePointer, attrName, attrValue) == nullptr) { + THROW(XmlException, "Can't set node attribute <" + << name + << "> to <" + << value + << ">"); + } + } + + /** + * Non-string overload: renders value with operator<< and forwards to the string SetAttr. + */ + template <class T> + typename std::enable_if<!std::is_convertible_v<T, TZtStringBuf>, void>::type + TNode::SetAttr(TZtStringBuf name, const T& value) { + TStringStream rendered; + rendered << value; + SetAttr(name, TZtStringBuf(rendered.Str())); + } + + /** + * Sets attribute `name` with a null value pointer; throws when xmlSetProp returns null. + */ + inline void TNode::SetAttr(TZtStringBuf name) { + const xmlChar* attrName = XMLCHAR(name.c_str()); + + if (xmlSetProp(NodePointer, attrName, nullptr) == nullptr) { + THROW(XmlException, "Can't set node empty attribute <" + << name + << ">"); + } + } + + /** + * Removes attribute `name`; throws when xmlUnsetProp returns a negative code. + */ + inline void TNode::DelAttr(TZtStringBuf name) { + const int rc = xmlUnsetProp(NodePointer, XMLCHAR(name.c_str())); + if (rc < 0) { + THROW(XmlException, "Can't delete node attribute <" + << name + << ">"); + } + } + + /** + * Non-string overload: renders value with operator<< and forwards to the string AddChild. + */ + template <class T> + typename std::enable_if<!std::is_convertible_v<T, TZtStringBuf>, TNode>::type + TNode::AddChild(TZtStringBuf name, const T& value) { + TStringStream rendered; + rendered << value; + return AddChild(name, TZtStringBuf(rendered.Str())); + } + + inline TNode TNode::AddChild(TZtStringBuf name, TZtStringBuf value) { + if (IsNull()) { + THROW(XmlException, "addChild [name=" << name << ", value=" 
<< value + << "]: can't add child to null node"); + } + + xmlNode* child = nullptr; + + if (value.empty()) { + child = xmlNewTextChild(NodePointer, nullptr, XMLCHAR(name.c_str()), nullptr); + } else { + child = xmlNewTextChild( + NodePointer, nullptr, XMLCHAR(name.c_str()), XMLCHAR(value.c_str())); + } + + if (!child) { + THROW(XmlException, "addChild [name=" << name << ", value=" << value + << "]: xmlNewTextChild returned NULL"); + } + + return TNode(DocPointer, child); + } + + /** + * Non-string overload: renders value with operator<< and forwards the text to AddText(TStringBuf). + */ + template <class T> + typename std::enable_if<!std::is_convertible_v<T, TStringBuf>, TNode>::type + TNode::AddText(const T& value) { + TStringStream ss; + ss << value; + return AddText(ss.Str()); + } + + /** + * Appends a text node holding `value` to this node. + * @param value: text to append (passed by explicit length, so it need not be zero-terminated) + * @return the node returned by xmlAddChild + * @throws yexception ("XmlException: addText ...") when this node is null, when the text node + * cannot be created, or when it cannot be attached + * @note NOTE(review): libxml2 documents that xmlAddChild may merge a text node into an adjacent + * text node and free the new one; the returned node may then be the pre-existing one - confirm + */ + inline TNode TNode::AddText(TStringBuf value) { + if (IsNull()) { + THROW(XmlException, "addText [value=" << value + << "]: can't add text to null node"); + } + + xmlNode* text = xmlNewTextLen(XMLCHAR(value.data()), value.size()); + if (!text) { + THROW(XmlException, "addText [value=" << value + << "]: xmlNewTextLen returned NULL"); + } + + xmlNode* child = xmlAddChild(NodePointer, text); + if (!child) { + /* the node was never attached, so it is still ours to release */ + xmlFreeNode(text); + THROW(XmlException, "addText [value=" << value + << "]: xmlAddChild returned NULL"); + } + + return TNode(DocPointer, child); + } +} diff --git a/library/cpp/xml/document/ut/ya.make b/library/cpp/xml/document/ut/ya.make new file mode 100644 index 0000000000..e955448c66 --- /dev/null +++ b/library/cpp/xml/document/ut/ya.make @@ -0,0 +1,11 @@ +UNITTEST_FOR(library/cpp/xml/document) + +OWNER(finder) + +SRCS( + xml-document_ut.cpp + xml-textreader_ut.cpp + xml-options_ut.cpp +) + +END() diff --git a/library/cpp/xml/document/xml-document-decl.h b/library/cpp/xml/document/xml-document-decl.h new file mode 100644 index 0000000000..bfda1fb7e6 --- /dev/null +++ b/library/cpp/xml/document/xml-document-decl.h @@ -0,0 +1,718 @@ +#pragma once + +#include <library/cpp/string_utils/ztstrbuf/ztstrbuf.h> + +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/stream/output.h> +#include <util/stream/str.h> +#include <algorithm> +#include "libxml-guards.h" + +namespace NXml { + class 
TNode; + + class TConstNodes; + class TConstNode; + + using TXPathContext = xmlXPathContext; + + class TDocument { + public: + enum Source { + File, + String, + RootName, + }; + + public: + /** + * create TDocument + * @param source: filename, XML string, or name for the root element (depends on @type) + * @param type: source type: File | String | RootName + * throws if file not found or cannot be parsed + */ + TDocument(const TString& source, Source type = File); + + public: + TDocument(const TDocument& that) = delete; + TDocument& operator=(const TDocument& that) = delete; + + TDocument(TDocument&& that); + TDocument& operator=(TDocument&& that); + + /** + * get root element + */ + TNode Root(); + TConstNode Root() const; + + /** + * serialize the whole document into stream using xmlDocDumpFormatMemoryEnc + * @param stream: receives the serialized bytes + * @param enc: output encoding name; when empty, a null encoding is passed to libxml2 if the + * document already carries an encoding, otherwise "UTF-8" is requested + * @param shouldFormat: forwarded to libxml2 as the format flag + * the temporary libxml2 buffer is handed to a TCharPtr guard for release + */ void Save(IOutputStream& stream, TZtStringBuf enc = "", bool shouldFormat = true) const { + int bufferSize = 0; + xmlChar* xmlBuff = nullptr; + const char* encoding = enc.size() ? enc.data() : Doc->encoding ? nullptr : "UTF-8"; + xmlDocDumpFormatMemoryEnc(Doc.Get(), &xmlBuff, &bufferSize, encoding, shouldFormat); + TCharPtr xmlCharBuffPtr(xmlBuff); + stream.Write(xmlBuff, bufferSize); + } + + /** + * serialize the whole document into a string; same parameters as Save() + */ TString ToString(TZtStringBuf enc = "", bool shouldFormat = true) const { + TStringStream s; + Save(s, enc, shouldFormat); + return s.Str(); + } + + /** + * exchange the owned documents of two TDocument objects + */ void Swap(TDocument& that) { + std::swap(this->Doc, that.Doc); + } + + /** + * get the raw libxml2 document pointer held by this object + */ xmlDocPtr GetImpl() { + return Doc.Get(); + } + + private: + void ParseFile(const TString& file); + void ParseString(TZtStringBuf xml); + + TDocument(TDocHolder doc) + : Doc(std::move(doc)) + { + } + + TDocHolder Doc; + }; + + /** + * namespace prefix / URL pair used when evaluating xpath expressions + */ struct TNamespaceForXPath { + TString Prefix; + TString Url; + }; + typedef TVector<TNamespaceForXPath> TNamespacesForXPath; + + class TConstNodes { + private: + struct TConstNodesRef { + explicit TConstNodesRef(TConstNodes& n) + : r_(n) + { + } + TConstNodes& r_; + }; + + public: + TConstNodes(const TConstNodes& nodes); + TConstNodes& operator=(const TConstNodes& nodes); + + TConstNodes(TConstNodesRef ref); + 
TConstNodes& operator=(TConstNodesRef ref); + + operator TConstNodesRef(); + + /** + * get node by index + * @param number: position of the node in the set (presumably zero-based, matching begin()/end() - confirm against the implementation) + */ + TConstNode operator[](size_t number) const; + + /** + * get number of nodes + */ + size_t Size() const { + return SizeValue; + } + /** + * same value as Size(), lower-case spelling + */ + size_t size() const { + return SizeValue; + } + + /** + * minimal iterator over the node set, enough for range-based for; + * equality compares positions only, so compare iterators of the same set + */ + struct TNodeIter { + const TConstNodes& Nodes; + size_t Index; + TConstNode operator*() const; + bool operator==(const TNodeIter& other) const { + return Index == other.Index; + } + bool operator!=(const TNodeIter& other) const { + return !(*this == other); + } + /** + * pre-increment: advances this iterator and returns it by reference, + * so chained use such as ++(++it) moves the iterator itself and not a temporary copy + */ + TNodeIter& operator++() { + ++Index; + return *this; + } + }; + TNodeIter begin() const { + return TNodeIter{*this, 0}; + } + TNodeIter end() const { + return TNodeIter{*this, size()}; + } + + private: + friend class TDocument; + friend class TConstNode; + friend class TNode; + + TConstNodes(xmlDoc* doc, TXPathObjectPtr obj); + + size_t SizeValue; + xmlDoc* Doc; + TXPathObjectPtr Obj; + }; + + class TNode { + public: + friend class TDocument; + friend class TConstNode; + friend class TTextReader; + + /** + * check if node is null + */ + bool IsNull() const; + + /** + * check if node is element node + */ + bool IsElementNode() const; + + /** + * Create xpath context to be used later for fast xpath evaluation. + * @param nss: explicitly specify XML namespaces to use and their prefixes + * + * For better performance, when you need to evaluate several xpath expressions, + * it makes sense to create a context, load namespace prefixes once + * and use the context several times in Node(), Nodes(), XPath() function calls for several nodes. + * The context may be used with any node of the current document, but + * cannot be shared between different XML documents. 
+ */ + TXPathContextPtr CreateXPathContext(const TNamespacesForXPath& nss = TNamespacesForXPath()) const; + + /** + * get all element nodes matching given xpath expression + * @param xpath: xpath expression + * @param quiet: don't throw exception if zero nodes found + * @param ns: explicitly specify XML namespaces to use and their prefixes + * + * For historical reasons, this only works for *element* nodes. + * Use the XPath function if you need other kinds of nodes. + */ + TConstNodes Nodes(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const; + + /** + * get all element nodes matching given xpath expression + * @param xpath: xpath expression + * @param quiet: don't throw exception if zero nodes found + * @param ctxt: reusable xpath context (see CreateXPathContext) + * + * For historical reasons, this only works for *element* nodes. + * Use the XPath function if you need other kinds of nodes. + */ + TConstNodes Nodes(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const; + + /** + * get all nodes matching given xpath expression + * @param xpath: xpath expression + * @param quiet: don't throw exception if zero nodes found + * @param ns: explicitly specify XML namespaces to use and their prefixes + */ + TConstNodes XPath(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const; + + /** + * get all nodes matching given xpath expression + * @param xpath: xpath expression + * @param quiet: don't throw exception if zero nodes found + * @param ctxt: reusable xpath context (see CreateXPathContext) + */ + TConstNodes XPath(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const; + + /** + * get the first element node matching given xpath expression + * @param xpath: path to node (from current node) + * @param quiet: don't throw exception if node not found, + * return null node (@see IsNull()) + * @param ns: explicitly specify XML namespaces to use and their prefixes + * + * For historical reasons, this only works for 
*element* nodes. + * Use the XPath function if you need other kinds of nodes. + */ + /// @todo: quiet should be default, empty nodeset is not an error + TNode Node(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()); + TConstNode Node(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const; + + /** + * get the first element node matching given xpath expression + * @param xpath: path to node (from current node) + * @param quiet: don't throw exception if node not found, + * return null node (@see IsNull()) + * @param ctxt: reusable xpath context + * + * For historical reasons, this only works for *element* nodes. + * Use the XPath function if you need other kinds of nodes. + */ + TNode Node(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt); + TConstNode Node(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const; + + /** + * get node first child + * @param name: child name + * @note if name is empty, returns the first child node of type "element" + * @note returns null node if no child found + */ + TNode FirstChild(TZtStringBuf name); + TConstNode FirstChild(TZtStringBuf name) const; + + TNode FirstChild(); + TConstNode FirstChild() const; + + /** + * get parent node + * @throws exception if the node has no parent + */ + TNode Parent(); + TConstNode Parent() const; + + /** + * get node neighbour + * @param name: neighbour name + * @note if name is empty, returns the next sibling node of type "element" + * @note returns null node if no neighbour found + */ + TNode NextSibling(TZtStringBuf name); + TConstNode NextSibling(TZtStringBuf name) const; + + TNode NextSibling(); + TConstNode NextSibling() const; + + /** + * create child node + * @param name: child name + * @return the new empty node + */ + TNode AddChild(TZtStringBuf name); + + /** + * create child node with given value + * @param name: child name + * @param value: node value + */ + template <class T> + typename 
std::enable_if<!std::is_convertible_v<T, TZtStringBuf>, TNode>::type + AddChild(TZtStringBuf name, const T& value); + + TNode AddChild(TZtStringBuf name, TZtStringBuf value); + + /** + * add child node, making recursive copy of original + * @param node: node to copy from + * @return the added node + */ + TNode AddChild(const TConstNode& node); + + /** + * create text child node + * @param value: text of the new node; the template overload accepts any type + * that is not convertible to TStringBuf + * @return the node added for the text + */ + template <class T> + typename std::enable_if<!std::is_convertible_v<T, TStringBuf>, TNode>::type + AddText(const T& value); + + TNode AddText(TStringBuf value); + + /** + * get node attribute + * @param name: attribute name + * @return attribute value converted to T + * @throws exception if attribute not found + */ + template <class T> + T Attr(TZtStringBuf name) const; + + /** + * get node attribute + * @param name: attribute name + * @param defvalue: default value + * @return default value if attribute not found, attr value otherwise + */ + template <class T> + T Attr(TZtStringBuf name, const T& defvalue) const; + + /** + * get node attribute + * @param name: attribute name + * @param value: return-value + * @throws exception if attribute not found + */ + template <class T> + void Attr(TZtStringBuf name, T& value) const; + + /** + * get node attribute + * @param name: attribute name + * @param value: return-value; receives default value if attribute not found, attr value otherwise + * @param defvalue: default value + */ + template <class T> + void Attr(TZtStringBuf name, T& value, const T& defvalue) const; + + /** + * get node value (text) + * @throws exception if node is blank + */ + template <class T> + T Value() const; + + /** + * get node value + * @param defvalue: default value + * @return default value if node is blank, node value otherwise + */ + template <class T> + T Value(const T& defvalue) const; + + /** + * set node value + * @param value: new text value; the template overload accepts any type + * that is not convertible to TStringBuf + */ + template <class T> + typename std::enable_if<!std::is_convertible_v<T, TStringBuf>, void>::type + SetValue(const T& value); + + void SetValue(TStringBuf value); + + /** + * 
set/reset node attribute value, + * if attribute does not exist, it'll be created + * @param name: attribute name + * @param value: attribute value; the template overload accepts any type + * that is not convertible to TZtStringBuf + * @note the overload taking only a name is declared without a value argument + */ + template<class T> + typename std::enable_if<!std::is_convertible_v<T, TZtStringBuf>, void>::type + SetAttr(TZtStringBuf name, const T& value); + + void SetAttr(TZtStringBuf name, TZtStringBuf value); + + void SetAttr(TZtStringBuf name); + + /** + * delete node attribute + * @param name: attribute name + */ + void DelAttr(TZtStringBuf name); + + /** + * set node application data + * @param priv: new application data pointer + */ + void SetPrivate(void* priv); + + /** + * @return application data pointer, passed by SetPrivate + */ + void* GetPrivate() const; + + /** + * get node name + */ + TString Name() const; + + /** + * get node xpath + */ + TString Path() const; + + /** + * get node xml representation + * @param enc: encoding name forwarded to Save(); Save()'s default for shouldFormat (false) applies + */ + TString ToString(TZtStringBuf enc = "") const { + TStringStream s; + Save(s, enc); + return s.Str(); + } + void Save(IOutputStream& stream, TZtStringBuf enc = "", bool shouldFormat = false) const; + void SaveAsHtml(IOutputStream& stream, TZtStringBuf enc = "", bool shouldFormat = false) const; + + /** + * get pointer to internal node + */ + xmlNode* GetPtr(); + const xmlNode* GetPtr() const; + + /** + * check if node is text-only node + */ + bool IsText() const; + + /** + * unlink node from parent and free + */ + void Remove(); + + /** + * constructs null node (both the node and the document pointers are null) + */ + TNode() + : NodePointer(nullptr) + , DocPointer(nullptr) + { + } + + private: + friend class TConstNodes; + + TNode(xmlDoc* doc, xmlNode* node) + : NodePointer(node) + , DocPointer(doc) + { + } + + TNode Find(xmlNode* start, TZtStringBuf name); + + template <class T> + void AttrInternal(TCharPtr& value, T& res, TStringBuf errContext) const; + + void SaveInternal(IOutputStream& stream, TZtStringBuf enc, int options) const; + + xmlNode* NodePointer; + xmlDoc* DocPointer; + }; + + class TConstNode { + public: + friend class TDocument; + friend 
class TConstNodes; + friend class TNode; + /** + * check if node is null + */ + bool IsNull() const { + return ActualNode.IsNull(); + } + + bool IsElementNode() const { + return ActualNode.IsElementNode(); + } + + TConstNode Parent() const { + return ActualNode.Parent(); + } + + /** + * Create xpath context to be used later for fast xpath evaluation. + * @param nss: explicitly specify XML namespaces to use and their prefixes + */ + TXPathContextPtr CreateXPathContext(const TNamespacesForXPath& nss = TNamespacesForXPath()) const { + return ActualNode.CreateXPathContext(nss); + } + + /** + * get all element nodes matching given xpath expression + * @param xpath: xpath expression + * @param quiet: don't throw exception if zero nodes found + * @param ns: explicitly specify XML namespaces to use and their prefixes + * + * For historical reasons, this only works for *element* nodes. + * Use the XPath function if you need other kinds of nodes. + */ + TConstNodes Nodes(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const { + return ActualNode.Nodes(xpath, quiet, ns); + } + + /** + * get all element nodes matching given xpath expression + * @param xpath: xpath expression + * @param quiet: don't throw exception if zero nodes found + * @param ctxt: reusable xpath context + * + * For historical reasons, this only works for *element* nodes. + * Use the XPath function if you need other kinds of nodes. 
+ */ + TConstNodes Nodes(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const { + return ActualNode.Nodes(xpath, quiet, ctxt); + } + + /** + * get all nodes matching given xpath expression + * @param xpath: xpath expression + * @param quiet: don't throw exception if zero nodes found + * @param ns: explicitly specify XML namespaces to use and their prefixes + */ + TConstNodes XPath(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const { + return ActualNode.XPath(xpath, quiet, ns); + } + + /** + * get all nodes matching given xpath expression + * @param xpath: xpath expression + * @param quiet: don't throw exception if zero nodes found + * @param ctxt: reusable xpath context + */ + TConstNodes XPath(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const { + return ActualNode.XPath(xpath, quiet, ctxt); + } + + /** + * get the first element node matching given xpath expression + * @param xpath: path to node (from current node) + * @param quiet: don't throw exception if node not found, + * return null node (@see IsNull()) + * @param ns: explicitly specify XML namespaces to use and their prefixes + * + * For historical reasons, this only works for *element* nodes. + * Use the XPath function if you need other kinds of nodes. + */ + TConstNode Node(TZtStringBuf xpath, bool quiet = false, const TNamespacesForXPath& ns = TNamespacesForXPath()) const { + return ActualNode.Node(xpath, quiet, ns); + } + + /** + * get the first element node matching given xpath expression + * @param xpath: path to node (from current node) + * @param quiet: don't throw exception if node not found, + * return null node (@see IsNull()) + * @param ctxt: reusable xpath context + * + * For historical reasons, this only works for *element* nodes. + * Use the XPath function if you need other kinds of nodes. 
+ */ + TConstNode Node(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const { + return ActualNode.Node(xpath, quiet, ctxt); + } + + TConstNode FirstChild(TZtStringBuf name) const { + return ActualNode.FirstChild(name); + } + + TConstNode FirstChild() const { + return ActualNode.FirstChild(); + } + + /** + * get node neighbour + * @param name: neighbour name + * throws exception if no neighbour found + */ + TConstNode NextSibling(TZtStringBuf name) const { + return ActualNode.NextSibling(name); + } + + TConstNode NextSibling() const { + return ActualNode.NextSibling(); + } + + /** + * get node attribute + * @param name: attribute name + * throws exception if attribute not found + */ + template <class T> + T Attr(TZtStringBuf name) const { + return ActualNode.Attr<T>(name); + } + + /** + * get node attribute + * @param name: attribute name + * returns default value if attribute not found + */ + template <class T> + T Attr(TZtStringBuf name, const T& defvalue) const { + return ActualNode.Attr(name, defvalue); + } + + /** + * get node attribute + * @param name: attribute name + * @param value: return-value + * throws exception if attribute not found + */ + template <class T> + void Attr(TZtStringBuf name, T& value) const { + return ActualNode.Attr(name, value); + } + + /** + * get node attribute + * @param name: attribute name + * @param defvalue: default value + * @param value: return-value + * returns default value if attribute not found, attr value otherwise + */ + template <class T> + void Attr(TZtStringBuf name, T& value, const T& defvalue) const { + return ActualNode.Attr(name, value, defvalue); + } + + /** + * get node value (text) + * @throws exception if node is blank + */ + template <class T> + T Value() const { + return ActualNode.Value<T>(); + } + + /** + * get node value + * @param defvalue: default value + * returns default value if node is blank + */ + template <class T> + T Value(const T& defvalue) const { + return ActualNode.Value(defvalue); + } 
+ + /** + * get node name + */ + TString Name() const { + return ActualNode.Name(); + } + + /** + * @return application data pointer, passed by SetPrivate + */ + void* GetPrivate() const { + return ActualNode.GetPrivate(); + } + + /** + * get pointer to internal node + */ + const xmlNode* GetPtr() const { + return ActualNode.GetPtr(); + } + + /** + * check if node is text-only node + */ + bool IsText() const { + return ActualNode.IsText(); + } + + /** + * get node xpath + */ + TString Path() const { + return ActualNode.Path(); + } + + /** + * get node xml representation + */ + TString ToString(TZtStringBuf enc = "") const { + return ActualNode.ToString(enc); + } + + TConstNode() = default; + TConstNode(TNode node) + : ActualNode(node) + { + } + + TNode ConstCast() const { + return ActualNode; + } + + private: + TNode ActualNode; + }; + +} diff --git a/library/cpp/xml/document/xml-document.cpp b/library/cpp/xml/document/xml-document.cpp new file mode 100644 index 0000000000..18a554d732 --- /dev/null +++ b/library/cpp/xml/document/xml-document.cpp @@ -0,0 +1,393 @@ +#include "xml-document.h" + +#include <libxml/xinclude.h> +#include <libxml/xpathInternals.h> + +#include <library/cpp/xml/init/init.h> + +#include <util/generic/yexception.h> +#include <util/folder/dirut.h> + +namespace { + struct TInit { + inline TInit() { + NXml::InitEngine(); + } + } initer; +} + +namespace NXml { + TDocument::TDocument(const TString& xml, Source type) { + switch (type) { + case File: + ParseFile(xml); + break; + case String: + ParseString(xml); + break; + case RootName: { + TDocHolder doc(xmlNewDoc(XMLCHAR("1.0"))); + if (!doc) + THROW(XmlException, "Can't create xml document."); + doc->encoding = xmlStrdup(XMLCHAR("utf-8")); + + TNodePtr node(xmlNewNode(nullptr, XMLCHAR(xml.c_str()))); + if (!node) + THROW(XmlException, "Can't create root node."); + xmlDocSetRootElement(doc.Get(), node.Get()); + Y_UNUSED(node.Release()); + Doc = std::move(doc); + } break; + default: + 
THROW(InvalidArgument, "Wrong source type"); + } + } + + TDocument::TDocument(TDocument&& doc) + : Doc(std::move(doc.Doc)) + { + } + + TDocument& TDocument::operator=(TDocument&& doc) { + if (this != &doc) + doc.Swap(*this); + + return *this; + } + + void TDocument::ParseFile(const TString& file) { + if (!NFs::Exists(file)) + THROW(XmlException, "File " << file << " doesn't exist"); + + TParserCtxtPtr pctx(xmlNewParserCtxt()); + if (!pctx) + THROW(XmlException, "Can't create parser context"); + + TDocHolder doc(xmlCtxtReadFile(pctx.Get(), file.c_str(), nullptr, XML_PARSE_NOCDATA)); + if (!doc) + THROW(XmlException, "Can't parse file " << file); + + int res = xmlXIncludeProcessFlags(doc.Get(), XML_PARSE_XINCLUDE | XML_PARSE_NOCDATA | XML_PARSE_NOXINCNODE); + + if (res == -1) + THROW(XmlException, "XIncludes processing failed"); + + Doc = std::move(doc); + } + + void TDocument::ParseString(TZtStringBuf xml) { + TParserCtxtPtr pctx(xmlNewParserCtxt()); + if (pctx.Get() == nullptr) + THROW(XmlException, "Can't create parser context"); + + TDocHolder doc(xmlCtxtReadMemory(pctx.Get(), xml.c_str(), (int)xml.size(), nullptr, nullptr, XML_PARSE_NOCDATA)); + + if (!doc) + THROW(XmlException, "Can't parse string"); + + Doc = std::move(doc); + } + + TNode TDocument::Root() { + xmlNode* r = xmlDocGetRootElement(Doc.Get()); + if (r == nullptr) + THROW(XmlException, "TDocument hasn't root element"); + + return TNode(Doc.Get(), r); + } + + TConstNode TDocument::Root() const { + xmlNode* r = xmlDocGetRootElement(Doc.Get()); + if (r == nullptr) + THROW(XmlException, "TDocument hasn't root element"); + + return TConstNode(TNode(Doc.Get(), r)); + } + + bool TNode::IsNull() const { + return NodePointer == nullptr; + } + + bool TNode::IsElementNode() const { + return !IsNull() && (NodePointer->type == XML_ELEMENT_NODE); + } + + TXPathContextPtr TNode::CreateXPathContext(const TNamespacesForXPath& nss) const { + TXPathContextPtr ctx = xmlXPathNewContext(DocPointer); + if (!ctx) + 
THROW(XmlException, "Can't create empty xpath context"); + + for (const auto& ns : nss) { + const int r = xmlXPathRegisterNs(ctx.Get(), XMLCHAR(ns.Prefix.c_str()), XMLCHAR(ns.Url.c_str())); + if (r != 0) + THROW(XmlException, "Can't register namespace " << ns.Url << " with prefix " << ns.Prefix); + } + + return ctx; + } + + TConstNodes TNode::XPath(TZtStringBuf xpath, bool quiet, const TNamespacesForXPath& ns) const { + TXPathContextPtr ctxt = CreateXPathContext(ns); + return XPath(xpath, quiet, *ctxt); + } + + TConstNodes TNode::XPath(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const { + if (xmlXPathSetContextNode(NodePointer, &ctxt) != 0) + THROW(XmlException, "Can't set xpath context node, probably the context is associated with another document"); + + TXPathObjectPtr obj = xmlXPathEvalExpression(XMLCHAR(xpath.c_str()), &ctxt); + if (!obj) + THROW(XmlException, "Can't evaluate xpath expression " << xpath); + + TConstNodes nodes(DocPointer, obj); + + if (nodes.Size() == 0 && !quiet) + THROW(NodeNotFound, xpath); + + return nodes; + } + + TConstNodes TNode::Nodes(TZtStringBuf xpath, bool quiet, const TNamespacesForXPath& ns) const { + TXPathContextPtr ctxt = CreateXPathContext(ns); + return Nodes(xpath, quiet, *ctxt); + } + + TConstNodes TNode::Nodes(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const { + TConstNodes nodes = XPath(xpath, quiet, ctxt); + if (nodes.Size() != 0 && !nodes[0].IsElementNode()) + THROW(XmlException, "xpath points to non-element nodes: " << xpath); + return nodes; + } + + TNode TNode::Node(TZtStringBuf xpath, bool quiet, const TNamespacesForXPath& ns) { + TXPathContextPtr ctxt = CreateXPathContext(ns); + return Node(xpath, quiet, *ctxt); + } + + TConstNode TNode::Node(TZtStringBuf xpath, bool quiet, const TNamespacesForXPath& ns) const { + TXPathContextPtr ctxt = CreateXPathContext(ns); + return Node(xpath, quiet, *ctxt); + } + + TNode TNode::Node(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) { + TConstNodes n = 
Nodes(xpath, quiet, ctxt); + + if (n.Size() == 0 && !quiet) + THROW(NodeNotFound, xpath); + + if (n.Size() == 0) + return TNode(); + else + return n[0].ConstCast(); + } + + TConstNode TNode::Node(TZtStringBuf xpath, bool quiet, TXPathContext& ctxt) const { + return const_cast<TNode*>(this)->Node(xpath, quiet, ctxt); + } + + TNode TNode::FirstChild(TZtStringBuf name) { + if (IsNull()) + THROW(XmlException, "Node is null"); + + return Find(NodePointer->children, name); + } + + TConstNode TNode::FirstChild(TZtStringBuf name) const { + return const_cast<TNode*>(this)->FirstChild(name); + } + + TNode TNode::FirstChild() { + if (IsNull()) + THROW(XmlException, "Node is null"); + + return TNode(DocPointer, NodePointer->children); + } + + TConstNode TNode::FirstChild() const { + return const_cast<TNode*>(this)->FirstChild(); + } + + TNode TNode::Parent() { + /* a null node has no parent link to read: fail the same way FirstChild()/NextSibling() do */ + if (IsNull()) + THROW(XmlException, "Node is null"); + + if (nullptr == NodePointer->parent) + THROW(XmlException, "Parent node not exists"); + + return TNode(DocPointer, NodePointer->parent); + } + + TConstNode TNode::Parent() const { + return const_cast<TNode*>(this)->Parent(); + } + + TNode TNode::NextSibling(TZtStringBuf name) { + if (IsNull()) + THROW(XmlException, "Node is null"); + + return Find(NodePointer->next, name); + } + + TConstNode TNode::NextSibling(TZtStringBuf name) const { + return const_cast<TNode*>(this)->NextSibling(name); + } + + TNode TNode::NextSibling() { + if (IsNull()) + THROW(XmlException, "Node is null"); + + return TNode(DocPointer, NodePointer->next); + } + + TConstNode TNode::NextSibling() const { + return const_cast<TNode*>(this)->NextSibling(); + } + + /* NOTE: by default child will inherit its parent ns */ + + TNode TNode::AddChild(TZtStringBuf name) { + return AddChild(name, ""); + } + + /* NOTE: source node will be copied, as otherwise it will be double-freed from this and its own document */ + + TNode TNode::AddChild(const TConstNode& node) { + xmlNodePtr copy = xmlDocCopyNode(node.ConstCast().NodePointer, DocPointer, 1 /* recursive 
*/); + copy = xmlAddChild(NodePointer, copy); + return TNode(DocPointer, copy); + } + + void TNode::SetPrivate(void* priv) { + NodePointer->_private = priv; + } + + void* TNode::GetPrivate() const { + return NodePointer->_private; + } + + TNode TNode::Find(xmlNode* start, TZtStringBuf name) { + for (; start; start = start->next) + if (start->type == XML_ELEMENT_NODE && (name.empty() || !xmlStrcmp(start->name, XMLCHAR(name.c_str())))) + return TNode(DocPointer, start); + + return TNode(); + } + + TString TNode::Name() const { + if (IsNull()) + THROW(XmlException, "Node is null"); + + return CAST2CHAR(NodePointer->name); + } + + TString TNode::Path() const { + TCharPtr path(xmlGetNodePath(NodePointer)); + if (!!path) + return CAST2CHAR(path.Get()); + else + return ""; + } + + xmlNode* TNode::GetPtr() { + return NodePointer; + } + + const xmlNode* TNode::GetPtr() const { + return NodePointer; + } + + bool TNode::IsText() const { + if (IsNull()) + THROW(XmlException, "Node is null"); + + return NodePointer->type == XML_TEXT_NODE; + } + + void TNode::Remove() { + xmlNode* nodePtr = GetPtr(); + xmlUnlinkNode(nodePtr); + xmlFreeNode(nodePtr); + /* the node is freed: turn this handle into a null node so IsNull() reports it and the handle no longer points at freed memory */ + NodePointer = nullptr; + } + + static int XmlWriteToOstream(void* context, const char* buffer, int len) { + // possibly use to save doc as well + IOutputStream* out = (IOutputStream*)context; + out->Write(buffer, len); + return len; + } + + void TNode::SaveInternal(IOutputStream& stream, TZtStringBuf enc, int options) const { + const char* encoding = enc.size() ? enc.data() : "utf-8"; + TSaveCtxtPtr ctx(xmlSaveToIO(XmlWriteToOstream, /* close */ nullptr, &stream, + encoding, options)); + if (xmlSaveTree(ctx.Get(), (xmlNode*)GetPtr()) < 0) + THROW(XmlException, "Failed saving node to stream"); + } + + void TNode::Save(IOutputStream& stream, TZtStringBuf enc, bool shouldFormat) const { + SaveInternal(stream, enc, shouldFormat ? 
XML_SAVE_FORMAT : 0); + } + + void TNode::SaveAsHtml(IOutputStream& stream, TZtStringBuf enc, bool shouldFormat) const { + int options = XML_SAVE_AS_HTML; + options |= shouldFormat ? XML_SAVE_FORMAT : 0; + SaveInternal(stream, enc, options); + } + + TConstNodes::TConstNodes(const TConstNodes& nodes) + : SizeValue(nodes.Size()) + , Doc(nodes.Doc) + , Obj(nodes.Obj) + { + } + + TConstNodes& TConstNodes::operator=(const TConstNodes& nodes) { + if (this != &nodes) { + SizeValue = nodes.Size(); + Doc = nodes.Doc; + Obj = nodes.Obj; + } + + return *this; + } + + TConstNodes::TConstNodes(TConstNodesRef ref) + : SizeValue(ref.r_.Size()) + , Doc(ref.r_.Doc) + , Obj(ref.r_.Obj) + { + } + + TConstNodes& TConstNodes::operator=(TConstNodesRef ref) { + if (this != &ref.r_) { + SizeValue = ref.r_.Size(); + Doc = ref.r_.Doc; + Obj = ref.r_.Obj; + } + return *this; + } + + TConstNodes::operator TConstNodesRef() { + return TConstNodesRef(*this); + } + + TConstNodes::TConstNodes(xmlDoc* doc, TXPathObjectPtr obj) + : SizeValue(obj && obj->nodesetval ? 
obj->nodesetval->nodeNr : 0) + , Doc(doc) + , Obj(obj) + { + } + + TConstNode TConstNodes::operator[](size_t number) const { + /* compare directly: 'number + 1' wraps to 0 for the largest size_t and would let that index through */ + if (number >= Size()) + THROW(XmlException, "index out of range " << number); + + if (!Obj || !Obj->nodesetval) + THROW(XmlException, "Broken TConstNodes object, Obj is null"); + + xmlNode* node = Obj->nodesetval->nodeTab[number]; + return TNode(Doc, node); + } + + TConstNode TConstNodes::TNodeIter::operator*() const { + return Nodes[Index]; + } + +} diff --git a/library/cpp/xml/document/xml-document.h b/library/cpp/xml/document/xml-document.h new file mode 100644 index 0000000000..829ba09cc4 --- /dev/null +++ b/library/cpp/xml/document/xml-document.h @@ -0,0 +1,4 @@ +#pragma once + +#include "xml-document-decl.h" +#include "node-attr.h" diff --git a/library/cpp/xml/document/xml-document_ut.cpp b/library/cpp/xml/document/xml-document_ut.cpp new file mode 100644 index 0000000000..9f537b75c4 --- /dev/null +++ b/library/cpp/xml/document/xml-document_ut.cpp @@ -0,0 +1,319 @@ +#include <library/cpp/testing/unittest/registar.h> +#include <util/generic/map.h> + +#include "xml-document.h" + +Y_UNIT_TEST_SUITE(TestXmlDocument) { + Y_UNIT_TEST(Iteration) { + NXml::TDocument xml( + "<?xml version=\"1.0\"?>\n" + "<root>qq<a><b></b></a>ww<c></c></root>", + NXml::TDocument::String); + + NXml::TConstNode root = xml.Root(); + UNIT_ASSERT_EQUAL(root.Name(), "root"); + NXml::TConstNode n = root.FirstChild().NextSibling(); + UNIT_ASSERT_EQUAL(n.Name(), "a"); + n = n.NextSibling().NextSibling(); + UNIT_ASSERT_EQUAL(n.Name(), "c"); + } + + Y_UNIT_TEST(ParseString) { + NXml::TDocument xml( + "<?xml version=\"1.0\"?>\n" + "<root>\n" + "<a><b len=\"15\" correct=\"1\">hello world</b></a>\n" + "<text>Некоторый текст</text>\n" + "</root>", + NXml::TDocument::String); + + NXml::TConstNode root = xml.Root(); + NXml::TConstNode b = root.Node("a/b"); + UNIT_ASSERT_EQUAL(b.Attr<int>("len"), 15); + UNIT_ASSERT_EQUAL(b.Attr<bool>("correct"), true); + + NXml::TConstNode 
text = root.Node("text"); + UNIT_ASSERT_EQUAL(text.Value<TString>(), "Некоторый текст"); + } + Y_UNIT_TEST(SerializeString) { + NXml::TDocument xml("frob", NXml::TDocument::RootName); + xml.Root().SetAttr("xyzzy", "Frobozz"); + xml.Root().SetAttr("kulness", 0.3); + xml.Root().SetAttr("timelimit", 3); + + NXml::TNode authors = xml.Root().AddChild("authors"); + authors.AddChild("graham").SetAttr("name", "Nelson"); + authors.AddChild("zarf").SetValue("Andrew Plotkin"); + authors.AddChild("emshort", "Emily Short"); + + TString data = xml.ToString("utf-8"); + UNIT_ASSERT_EQUAL(data, "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + "<frob xyzzy=\"Frobozz\" kulness=\"0.3\" timelimit=\"3\">\n" + " <authors>\n" + " <graham name=\"Nelson\"/>\n" + " <zarf>Andrew Plotkin</zarf>\n" + " <emshort>Emily Short</emshort>\n" + " </authors>\n" + "</frob>\n"); + // check default utf8 output with ru + { + NXml::TDocument xml2("frob", NXml::TDocument::RootName); + xml2.Root().SetAttr("xyzzy", "привет =)"); + UNIT_ASSERT_VALUES_EQUAL(xml2.ToString(), "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + "<frob xyzzy=\"привет =)\"/>\n"); + } + } + Y_UNIT_TEST(XPathNs) { + using namespace NXml; + TDocument xml( + "<?xml version=\"1.0\"?>\n" + "<root xmlns='http://hello.com/hello'>\n" + "<a><b len=\"15\" correct=\"1\">hello world</b></a>\n" + "<text>Некоторый текст</text>\n" + "</root>", + TDocument::String); + + TNamespacesForXPath nss; + TNamespaceForXPath ns = {"h", "http://hello.com/hello"}; + nss.push_back(ns); + + TConstNode root = xml.Root(); + TConstNode b = root.Node("h:a/h:b", false, nss); + UNIT_ASSERT_EQUAL(b.Attr<int>("len"), 15); + UNIT_ASSERT_EQUAL(b.Attr<bool>("correct"), true); + + TConstNode text = root.Node("h:text", false, nss); + UNIT_ASSERT_EQUAL(text.Value<TString>(), "Некоторый текст"); + + // For performance you can create xpath context once using nss and pass it. 
+ TXPathContextPtr ctxt = root.CreateXPathContext(nss); + UNIT_ASSERT(root.Node("text", true, *ctxt).IsNull()); + UNIT_ASSERT_EXCEPTION(root.Node("text", false, *ctxt), yexception); + UNIT_ASSERT_EQUAL(root.Node("h:text", false, *ctxt).Value<TString>(), "Некоторый текст"); + } + Y_UNIT_TEST(XmlNodes) { + using namespace NXml; + TDocument xml("<?xml version=\"1.0\"?>\n" + "<root>qq<a><b>asdfg</b></a>ww<c></c></root>", + NXml::TDocument::String); + TNode root = xml.Root(); + UNIT_ASSERT_EQUAL(root.Value<TString>(), "qqasdfgww"); + TConstNode node = root.FirstChild(); + UNIT_ASSERT_EQUAL(node.IsText(), true); + UNIT_ASSERT_EQUAL(node.Value<TString>(), "qq"); + node = node.NextSibling(); + UNIT_ASSERT_EQUAL(node.IsText(), false); + UNIT_ASSERT_EQUAL(node.Name(), "a"); + UNIT_ASSERT_EQUAL(node.Value<TString>(), "asdfg"); + node = node.NextSibling(); + UNIT_ASSERT_EQUAL(node.IsText(), true); + UNIT_ASSERT_EQUAL(node.Value<TString>(), "ww"); + node = node.NextSibling(); + UNIT_ASSERT_EQUAL(node.IsText(), false); + UNIT_ASSERT_EQUAL(node.Name(), "c"); + UNIT_ASSERT_EQUAL(node.Value<TString>(), ""); + node = node.NextSibling(); + UNIT_ASSERT_EQUAL(node.IsNull(), true); + TStringStream iterLog; + for (const auto& node2 : root.Nodes("/root/*")) { + iterLog << node2.Name() << ';'; + } + UNIT_ASSERT_STRINGS_EQUAL(iterLog.Str(), "a;c;"); + + // get only element nodes, ignore text nodes with empty "name" param + node = root.FirstChild(TString()); + UNIT_ASSERT_EQUAL(node.IsText(), false); + UNIT_ASSERT_EQUAL(node.Name(), "a"); + node = node.NextSibling(TString()); + UNIT_ASSERT_EQUAL(node.IsText(), false); + UNIT_ASSERT_EQUAL(node.Name(), "c"); + + // use exact "name" to retrieve children and siblings + node = root.FirstChild("a"); + UNIT_ASSERT_EQUAL(node.IsNull(), false); + UNIT_ASSERT_EQUAL(node.Name(), "a"); + node = node.NextSibling("c"); + UNIT_ASSERT_EQUAL(node.IsNull(), false); + UNIT_ASSERT_EQUAL(node.Name(), "c"); + node = root.FirstChild("c"); // skip "a" + 
UNIT_ASSERT_EQUAL(node.IsNull(), false); + UNIT_ASSERT_EQUAL(node.Name(), "c"); + + // node not found: no exceptions, null nodes are returned + node = root.FirstChild("b"); // b is not direct child of root + UNIT_ASSERT_EQUAL(node.IsNull(), true); + node = root.FirstChild("nosuchnode"); + UNIT_ASSERT_EQUAL(node.IsNull(), true); + node = root.FirstChild(); + node = root.NextSibling("unknownnode"); + UNIT_ASSERT_EQUAL(node.IsNull(), true); + UNIT_ASSERT_EXCEPTION(node.Name(), yexception); + UNIT_ASSERT_EXCEPTION(node.Value<TString>(), yexception); + UNIT_ASSERT_EXCEPTION(node.IsText(), yexception); + } + Y_UNIT_TEST(DefVal) { + using namespace NXml; + TDocument xml("<?xml version=\"1.0\"?>\n" + "<root><a></a></root>", + NXml::TDocument::String); + UNIT_ASSERT_EQUAL(xml.Root().Node("a", true).Node("b", true).Value<int>(3), 3); + } + Y_UNIT_TEST(NodesVsXPath) { + using namespace NXml; + TDocument xml("<?xml version=\"1.0\"?>\n" + "<root><a x=\"y\"></a></root>", + NXml::TDocument::String); + UNIT_ASSERT_EXCEPTION(xml.Root().Nodes("/root/a/@x"), yexception); + UNIT_ASSERT_VALUES_EQUAL(xml.Root().XPath("/root/a/@x").Size(), 1); + } + Y_UNIT_TEST(NodeIsFirst) { + using namespace NXml; + TDocument xml("<?xml version=\"1.0\"?>\n" + "<root><a x=\"y\">first</a>" + "<a>second</a></root>", + NXml::TDocument::String); + UNIT_ASSERT_EXCEPTION(xml.Root().Node("/root/a/@x"), yexception); + UNIT_ASSERT_STRINGS_EQUAL(xml.Root().Node("/root/a").Value<TString>(), "first"); + } + Y_UNIT_TEST(CopyNode) { + using namespace NXml; + // default-construct empty node + TNode empty; + // put to container + TMap<int, TNode> nmap; + nmap[2]; + + // do copy + TDocument xml("<?xml version=\"1.0\"?>\n" + "<root><a></a></root>", + TDocument::String); + + TDocument xml2("<?xml version=\"1.0\"?>\n" + "<root><node><b>bold</b><i>ita</i></node></root>", + TDocument::String); + + TNode node = xml2.Root().Node("//node"); + TNode place = xml.Root().Node("//a"); + + place.AddChild(node); + + TStringStream s; + 
xml.Save(s, "", false); + UNIT_ASSERT_VALUES_EQUAL(s.Str(), + "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<root><a><node><b>bold</b><i>ita</i></node></a></root>\n"); + } + + Y_UNIT_TEST(RenderNode) { + using namespace NXml; + { + // no namespaces + TDocument xml( + "<?xml version=\"1.0\"?>\n" + "<root>\n" + "<a><b len=\"15\" correct=\"1\">hello world</b></a>\n" + "<text>Некоторый текст</text>\n" + "</root>", + TDocument::String); + TNode n = xml.Root().Node("//a"); + UNIT_ASSERT_VALUES_EQUAL(n.ToString(), "<a><b len=\"15\" correct=\"1\">hello world</b></a>"); + } + { + // namespaces + TDocument xml( + "<?xml version=\"1.0\"?>\n" + "<root xmlns='http://hello.com/hello'>\n" + "<a><b len=\"15\" correct=\"1\">hello world</b></a>\n" + "<text>Некоторый текст</text>\n" + "</root>", + TDocument::String); + TNamespacesForXPath nss; + TNamespaceForXPath ns = {"h", "http://hello.com/hello"}; + nss.push_back(ns); + + TNode n = xml.Root().Node("//h:a", false, nss); + UNIT_ASSERT_VALUES_EQUAL(n.ToString(), "<a><b len=\"15\" correct=\"1\">hello world</b></a>"); + } + } + + Y_UNIT_TEST(ReuseXPathContext) { + using namespace NXml; + + TDocument xml( + "<?xml version=\"1.0\"?>\n" + "<root>\n" + "<a><b><c>Hello, world!</c></b></a>\n" + "<text x=\"10\">First</text>\n" + "<text y=\"20\">Second</text>\n" + "</root>", + TDocument::String); + + TXPathContextPtr rootCtxt = xml.Root().CreateXPathContext(); + + // Check Node() + TConstNode b = xml.Root().Node("a/b", false, *rootCtxt); + + // We can use root node context for xpath evaluation in any node + TConstNode c1 = b.Node("c", false, *rootCtxt); + UNIT_ASSERT_EQUAL(c1.Value<TString>(), "Hello, world!"); + + TXPathContextPtr bCtxt = b.CreateXPathContext(); + TConstNode c2 = b.Node("c", false, *bCtxt); + UNIT_ASSERT_EQUAL(c2.Value<TString>(), "Hello, world!"); + + // Mixing contexts from different documents is forbidden + TDocument otherXml("<root></root>", TDocument::String); + TXPathContextPtr otherCtxt = 
otherXml.Root().CreateXPathContext(); + UNIT_ASSERT_EXCEPTION(b.Node("c", false, *otherCtxt), yexception); + + // Check Nodes() + TConstNodes texts = xml.Root().Nodes("text", true, *rootCtxt); + UNIT_ASSERT_EQUAL(texts.Size(), 2); + + // Nodes() does't work for non-element nodes + UNIT_ASSERT_EXCEPTION(xml.Root().Nodes("text/@x", true, *rootCtxt), yexception); + + // Check XPath() + TConstNodes ys = xml.Root().XPath("text/@y", true, *rootCtxt); + UNIT_ASSERT_EQUAL(ys.Size(), 1); + UNIT_ASSERT_EQUAL(ys[0].Value<int>(), 20); + } + + Y_UNIT_TEST(Html) { + using namespace NXml; + + TDocument htmlChunk("video", TDocument::RootName); + TNode videoNode = htmlChunk.Root(); + + videoNode.SetAttr("controls"); + + TStringStream ss; + videoNode.SaveAsHtml(ss); + UNIT_ASSERT_EQUAL(ss.Str(), "<video controls></video>"); + } + + Y_UNIT_TEST(Move) { + using namespace NXml; + + TDocument xml1("foo", TDocument::RootName); + xml1.Root().AddChild("bar"); + + UNIT_ASSERT_VALUES_EQUAL(xml1.Root().ToString(), "<foo><bar/></foo>"); + + TDocument xml2 = std::move(xml1); + UNIT_ASSERT_EXCEPTION(xml1.Root(), yexception); + UNIT_ASSERT_VALUES_EQUAL(xml2.Root().ToString(), "<foo><bar/></foo>"); + } + + Y_UNIT_TEST(StringConversion) { + using namespace NXml; + TDocument xml("foo", TDocument::RootName); + auto root = xml.Root(); + const TStringBuf stringBuf = "bar"; + root.SetAttr("bar", stringBuf); + const TString tString = "baz"; + root.SetAttr("baz", tString); + root.SetAttr("quux", "literal"); + root.SetAttr("frob", 500); + } +} diff --git a/library/cpp/xml/document/xml-options.cpp b/library/cpp/xml/document/xml-options.cpp new file mode 100644 index 0000000000..74e7545de3 --- /dev/null +++ b/library/cpp/xml/document/xml-options.cpp @@ -0,0 +1 @@ +#include "xml-options.h" diff --git a/library/cpp/xml/document/xml-options.h b/library/cpp/xml/document/xml-options.h new file mode 100644 index 0000000000..bb07da0cfb --- /dev/null +++ b/library/cpp/xml/document/xml-options.h @@ -0,0 +1,67 @@ 
+#pragma once + +#include <contrib/libs/libxml/include/libxml/parser.h> + +namespace NXml { + enum class EOption : int { + // clang-format off + Recover = XML_PARSE_RECOVER, + NoEnt = XML_PARSE_NOENT, + DTDLoad = XML_PARSE_DTDLOAD, + DTDAttr = XML_PARSE_DTDATTR, + DTDValid = XML_PARSE_DTDVALID, + NoError = XML_PARSE_NOERROR, + NoWarning = XML_PARSE_NOWARNING, + Pedantic = XML_PARSE_PEDANTIC, + NoBlanks = XML_PARSE_NOBLANKS, + SAX1 = XML_PARSE_SAX1, + XInclude = XML_PARSE_XINCLUDE, + NoNet = XML_PARSE_NONET, + NoDict = XML_PARSE_NODICT, + NSClean = XML_PARSE_NSCLEAN, + NoCData = XML_PARSE_NOCDATA, + NoXInclude = XML_PARSE_NOXINCNODE, + Compact = XML_PARSE_COMPACT, + Old10 = XML_PARSE_OLD10, + NoBaseFix = XML_PARSE_NOBASEFIX, + Huge = XML_PARSE_HUGE, + OldSAX = XML_PARSE_OLDSAX, + IgnoreEnc = XML_PARSE_IGNORE_ENC, + BigLines = XML_PARSE_BIG_LINES, + // clang-format on + }; + + class TOptions { + public: + TOptions() + : Mask(0) + { + } + + template <typename... TArgs> + TOptions(TArgs... args) + : Mask(0) + { + Set(args...); + } + + TOptions& Set(EOption option) { + Mask |= static_cast<int>(option); + return *this; + } + + template <typename... TArgs> + TOptions& Set(EOption arg, TArgs... 
args) { + Set(arg); + return Set(args...); + } + + int GetMask() const { + return Mask; + } + + private: + int Mask; + }; + +} diff --git a/library/cpp/xml/document/xml-options_ut.cpp b/library/cpp/xml/document/xml-options_ut.cpp new file mode 100644 index 0000000000..9be16baf3d --- /dev/null +++ b/library/cpp/xml/document/xml-options_ut.cpp @@ -0,0 +1,26 @@ +#include "xml-options.h" + +#include <library/cpp/testing/unittest/registar.h> + +Y_UNIT_TEST_SUITE(TestXmlOptions) { + Y_UNIT_TEST(SetHuge) { + NXml::TOptions opts; + opts.Set(NXml::EOption::Huge); + UNIT_ASSERT_EQUAL(XML_PARSE_HUGE, opts.GetMask()); + } + + Y_UNIT_TEST(VariadicContructor) { + NXml::TOptions opts(NXml::EOption::Huge, NXml::EOption::Compact, NXml::EOption::SAX1); + UNIT_ASSERT_EQUAL(XML_PARSE_HUGE | XML_PARSE_COMPACT | XML_PARSE_SAX1, opts.GetMask()); + } + + Y_UNIT_TEST(Chaining) { + NXml::TOptions opts; + + opts + .Set(NXml::EOption::Huge) + .Set(NXml::EOption::Compact); + + UNIT_ASSERT_EQUAL(XML_PARSE_HUGE | XML_PARSE_COMPACT, opts.GetMask()); + } +} diff --git a/library/cpp/xml/document/xml-textreader.cpp b/library/cpp/xml/document/xml-textreader.cpp new file mode 100644 index 0000000000..b946f1fbf2 --- /dev/null +++ b/library/cpp/xml/document/xml-textreader.cpp @@ -0,0 +1,318 @@ +#include "xml-textreader.h" + +#include <contrib/libs/libxml/include/libxml/xmlreader.h> + +#include <util/generic/yexception.h> +#include <util/string/strip.h> +#include <util/system/compiler.h> + +namespace NXml { + TTextReader::TTextReader(IInputStream& stream, const TOptions& options) + : Stream(stream) + , IsError(false) + { + Impl.Reset(xmlReaderForIO(ReadFromInputStreamCallback, nullptr, this, nullptr, nullptr, options.GetMask())); + + if (!Impl) { + ythrow yexception() << "cannot instantiate underlying xmlTextReader structure"; + } + SetupErrorHandler(); + CheckForExceptions(); + } + + TTextReader::~TTextReader() { + } + + bool TTextReader::Read() { + return BoolResult(xmlTextReaderRead(Impl.Get())); + } 
+ + TString TTextReader::ReadInnerXml() const { + return TempStringOrEmptyResult(xmlTextReaderReadInnerXml(Impl.Get())); + } + + TString TTextReader::ReadOuterXml() const { + return TempStringOrEmptyResult(xmlTextReaderReadOuterXml(Impl.Get())); + } + + TString TTextReader::ReadString() const { + return TempStringOrEmptyResult(xmlTextReaderReadString(Impl.Get())); + } + + bool TTextReader::ReadAttributeValue() const { + return BoolResult(xmlTextReaderReadAttributeValue(Impl.Get())); + } + + int TTextReader::GetAttributeCount() const { + return IntResult(xmlTextReaderAttributeCount(Impl.Get())); + } + + TStringBuf TTextReader::GetBaseUri() const { + return ConstStringOrEmptyResult(xmlTextReaderConstBaseUri(Impl.Get())); + } + + int TTextReader::GetDepth() const { + return IntResult(xmlTextReaderDepth(Impl.Get())); + } + + bool TTextReader::HasAttributes() const { + return BoolResult(xmlTextReaderHasAttributes(Impl.Get())); + } + + bool TTextReader::HasValue() const { + return BoolResult(xmlTextReaderHasValue(Impl.Get())); + } + + bool TTextReader::IsDefault() const { + return BoolResult(xmlTextReaderIsDefault(Impl.Get())); + } + + bool TTextReader::IsEmptyElement() const { + return BoolResult(xmlTextReaderIsEmptyElement(Impl.Get())); + } + + TStringBuf TTextReader::GetLocalName() const { + return ConstStringOrEmptyResult(xmlTextReaderConstLocalName(Impl.Get())); + } + + TStringBuf TTextReader::GetName() const { + return ConstStringOrEmptyResult(xmlTextReaderConstName(Impl.Get())); + } + + TStringBuf TTextReader::GetNamespaceUri() const { + return ConstStringOrEmptyResult(xmlTextReaderConstNamespaceUri(Impl.Get())); + } + + TTextReader::ENodeType TTextReader::GetNodeType() const { + return static_cast<ENodeType>(IntResult(xmlTextReaderNodeType(Impl.Get()))); + } + + TStringBuf TTextReader::GetPrefix() const { + return ConstStringOrEmptyResult(xmlTextReaderConstPrefix(Impl.Get())); + } + + char TTextReader::GetQuoteChar() const { + return 
CharResult(xmlTextReaderQuoteChar(Impl.Get())); + } + + TStringBuf TTextReader::GetValue() const { + return ConstStringOrEmptyResult(xmlTextReaderConstValue(Impl.Get())); + } + + TTextReader::EReadState TTextReader::GetReadState() const { + return static_cast<EReadState>(IntResult(xmlTextReaderReadState(Impl.Get()))); + } + + void TTextReader::Close() { + if (xmlTextReaderClose(Impl.Get()) == -1) { + ThrowException(); + } + } + + TString TTextReader::GetAttribute(int number) const { + return TempStringResult(xmlTextReaderGetAttributeNo(Impl.Get(), number)); + } + + TString TTextReader::GetAttribute(TZtStringBuf name) const { + return TempStringResult(xmlTextReaderGetAttribute(Impl.Get(), XMLCHAR(name.data()))); + } + + TString TTextReader::GetAttribute(TZtStringBuf localName, TZtStringBuf nsUri) const { + return TempStringResult(xmlTextReaderGetAttributeNs(Impl.Get(), XMLCHAR(localName.data()), XMLCHAR(nsUri.data()))); + } + + TString TTextReader::LookupNamespace(TZtStringBuf prefix) const { + return TempStringResult(xmlTextReaderLookupNamespace(Impl.Get(), XMLCHAR(prefix.data()))); + } + + bool TTextReader::MoveToAttribute(int number) { + return BoolResult(xmlTextReaderMoveToAttributeNo(Impl.Get(), number)); + } + + bool TTextReader::MoveToAttribute(TZtStringBuf name) { + return BoolResult(xmlTextReaderMoveToAttribute(Impl.Get(), XMLCHAR(name.data()))); + } + + bool TTextReader::MoveToAttribute(TZtStringBuf localName, TZtStringBuf nsUri) { + return BoolResult(xmlTextReaderMoveToAttributeNs(Impl.Get(), XMLCHAR(localName.data()), XMLCHAR(nsUri.data()))); + } + + bool TTextReader::MoveToFirstAttribute() { + return BoolResult(xmlTextReaderMoveToFirstAttribute(Impl.Get())); + } + + bool TTextReader::MoveToNextAttribute() { + return BoolResult(xmlTextReaderMoveToNextAttribute(Impl.Get())); + } + + bool TTextReader::MoveToElement() { + return BoolResult(xmlTextReaderMoveToElement(Impl.Get())); + } + + TConstNode TTextReader::Expand() const { + const xmlNodePtr node = 
xmlTextReaderExpand(Impl.Get()); + if (node == nullptr) { + ThrowException(); + } + return TConstNode(TNode(node->doc, node)); + } + + bool TTextReader::Next() { + return BoolResult(xmlTextReaderNext(Impl.Get())); + } + + bool TTextReader::IsValid() const { + return BoolResult(xmlTextReaderIsValid(Impl.Get())); + } + + // Callback for xmlReaderForIO() to read more data. + // It is almost "noexcept" (std::bad_alloc may happen when saving exception message to new TString). + // Waiting for std::exception_ptr and std::rethrow_exception from C++11 in Arcadia to make it really "noexcept". + int TTextReader::ReadFromInputStreamCallback(void* context, char* buffer, int len) { + Y_ASSERT(len >= 0); + TTextReader* reader = static_cast<TTextReader*>(context); + + int result = -1; + + // Exception may be thrown by IInputStream::Read(). + // It is caught unconditionally because exceptions cannot safely pass through libxml2 plain C code + // (no destructors, no RAII, raw pointers, so in case of stack unwinding some memory gets leaked). + + try { + result = reader->Stream.Read(buffer, len); + } catch (const yexception& ex) { + reader->LogError() << "read from input stream failed: " << ex; + } catch (...) 
{ + reader->LogError() << "read from input stream failed"; + } + + return result; + } + + void TTextReader::OnLibxmlError(void* arg, const char* msg, xmlParserSeverities severity, xmlTextReaderLocatorPtr locator) { + TTextReader* reader = static_cast<TTextReader*>(arg); + Y_ASSERT(reader != nullptr); + + TStringStream& out = reader->LogError(); + + if (severity == XML_PARSER_SEVERITY_ERROR) { + out << "libxml parse error"; + } else if (severity == XML_PARSER_SEVERITY_VALIDITY_ERROR) { + out << "libxml validity error"; + } else { + out << "libxml error"; + } + + if (locator != nullptr) { + const int line = xmlTextReaderLocatorLineNumber(locator); + const TCharPtr baseUri = xmlTextReaderLocatorBaseURI(locator); + out << " ("; + if (line != -1) { + out << "at line " << line; + if (baseUri) { + out << ", "; + } + } + if (baseUri) { + out << "base URI " << CAST2CHAR(baseUri.Get()); + } + out << ")"; + } + + TStringBuf message = (msg != nullptr) ? msg : "unknown"; + message = StripStringRight(message); // remove trailing \n that is added by libxml + if (!message.empty()) { + out << ": " << message; + } + } + + void TTextReader::SetupErrorHandler() { + xmlTextReaderErrorFunc func = nullptr; + void* arg = nullptr; + + // We respect any other error handlers already set up: + xmlTextReaderGetErrorHandler(Impl.Get(), &func, &arg); + if (!func) { + func = TTextReader::OnLibxmlError; + xmlTextReaderSetErrorHandler(Impl.Get(), func, this); + } + } + + TStringStream& TTextReader::LogError() const { + if (IsError) { // maybe there are previous errors + ErrorBuffer << Endl; + } + IsError = true; + return ErrorBuffer; + } + + void TTextReader::CheckForExceptions() const { + if (Y_LIKELY(!IsError)) { + return; + } + + const TString message = ErrorBuffer.Str(); + ErrorBuffer.clear(); + IsError = false; + + ythrow yexception() << message; + } + + void TTextReader::ThrowException() const { + CheckForExceptions(); + // Probably CheckForExceptions() would throw an exception with more 
verbose message. As the last resort + // (we do not even know the name of the failed libxml function, but it's possible to deduce it from stacktrace): + ythrow yexception() << "libxml function returned error exit code"; + } + + bool TTextReader::BoolResult(int value) const { + if (Y_UNLIKELY(value == -1)) { + ThrowException(); + } + return (value != 0); + } + + int TTextReader::IntResult(int value) const { + if (Y_UNLIKELY(value == -1)) { + ThrowException(); + } + return value; + } + + char TTextReader::CharResult(int value) const { + if (Y_UNLIKELY(value == -1)) { + ThrowException(); + } + return static_cast<char>(value); + } + + TStringBuf TTextReader::ConstStringResult(const xmlChar* value) const { + if (Y_UNLIKELY(value == nullptr)) { + ThrowException(); + } + return CAST2CHAR(value); + } + + TStringBuf TTextReader::ConstStringOrEmptyResult(const xmlChar* value) const { + CheckForExceptions(); + return (value != nullptr) ? TStringBuf(CAST2CHAR(value)) : TStringBuf(); + } + + TString TTextReader::TempStringResult(TCharPtr value) const { + if (Y_UNLIKELY(value == nullptr)) { + ThrowException(); + } + return TString(CAST2CHAR(value.Get())); + } + + TString TTextReader::TempStringOrEmptyResult(TCharPtr value) const { + CheckForExceptions(); + return (value != nullptr) ? 
TString(CAST2CHAR(value.Get())) : TString(); + } + + struct TTextReader::TDeleter { + static inline void Destroy(xmlTextReaderPtr handle) { + xmlFreeTextReader(handle); + } + }; +} diff --git a/library/cpp/xml/document/xml-textreader.h b/library/cpp/xml/document/xml-textreader.h new file mode 100644 index 0000000000..ab4c329d26 --- /dev/null +++ b/library/cpp/xml/document/xml-textreader.h @@ -0,0 +1,325 @@ +#pragma once + +#include "xml-document.h" +#include "xml-options.h" + +#include <contrib/libs/libxml/include/libxml/xmlreader.h> + +#include <library/cpp/string_utils/ztstrbuf/ztstrbuf.h> + +#include <util/generic/noncopyable.h> +#include <util/generic/ptr.h> +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <functional> +#include <util/stream/input.h> +#include <util/stream/str.h> + +namespace NXml { + /** + * TextReader Parser + * + * API of the XML streaming API based on C# interfaces. + * Provides fast, non-cached, forward-only access to XML data. + * + * Like the SAX parser, the TextReader parser is suitable for sequential + * parsing, but instead of implementing handlers for specific parts of the + * document, it allows you to detect the current node type, process the node + * accordingly, and skip forward in the document as much as necessary. + * + * Unlike the DOM parser, you may not move backwards in the XML document. + * And unlike the SAX parser, you need not waste time processing nodes that do not + * interest you. + * + * All methods are on the single parser instance, but their result depends on the current context. + * For instance, use Read() to move to the next node, and MoveToElement() to return from an attribute to the element that contains it. + * These methods will return false when no more nodes are available. Then use + * methods such as GetName() and GetValue() to examine the elements and their attributes. + * + * This wrapper is inspired by TextReader from libxml++. 
+ */ + + class TTextReader: private TNonCopyable { + public: + // strongly-typed alias for enum from xmlreader.h + enum class ENodeType : int { + // clang-format off + Attribute = XML_READER_TYPE_ATTRIBUTE, + CDATA = XML_READER_TYPE_CDATA, + Comment = XML_READER_TYPE_COMMENT, + Document = XML_READER_TYPE_DOCUMENT, + DocumentFragment = XML_READER_TYPE_DOCUMENT_FRAGMENT, + DocumentType = XML_READER_TYPE_DOCUMENT_TYPE, + Element = XML_READER_TYPE_ELEMENT, + EndElement = XML_READER_TYPE_END_ELEMENT, + EndEntity = XML_READER_TYPE_END_ENTITY, + Entity = XML_READER_TYPE_ENTITY, + EntityReference = XML_READER_TYPE_ENTITY_REFERENCE, + None = XML_READER_TYPE_NONE, + Notation = XML_READER_TYPE_NOTATION, + ProcessingInstruction = XML_READER_TYPE_PROCESSING_INSTRUCTION, + SignificantWhitespace = XML_READER_TYPE_SIGNIFICANT_WHITESPACE, + Text = XML_READER_TYPE_TEXT, + Whitespace = XML_READER_TYPE_WHITESPACE, + XmlDeclaration = XML_READER_TYPE_XML_DECLARATION, + // clang-format on + }; + + enum class EReadState : int { + // clang-format off + Closed = XML_TEXTREADER_MODE_CLOSED, + EndOfFile = XML_TEXTREADER_MODE_EOF, + Error = XML_TEXTREADER_MODE_ERROR, + Initial = XML_TEXTREADER_MODE_INITIAL, + Interactive = XML_TEXTREADER_MODE_INTERACTIVE, + Reading = XML_TEXTREADER_MODE_READING, + // clang-format on + }; + + public: + TTextReader(IInputStream& stream, const TOptions& options = TOptions()); + ~TTextReader(); + + /** + * Moves the position of the current instance to the next node in the stream, exposing its properties. + * @return true if the node was read successfully, false if there are no more nodes to read + */ + bool Read(); + + /** + * Reads the contents of the current node, including child nodes and markup. 
+ * @return A string containing the XML content, or an empty string + * if the current node is neither an element nor an attribute, or has no child nodes + */ + TString ReadInnerXml() const; + + /** + * Reads the current node and its contents, including child nodes and markup. + * @return A string containing the XML content, or an empty string + * if the current node is neither an element nor an attribute + */ + TString ReadOuterXml() const; + + /** + * Reads the contents of an element or a text node as a string. + * @return A string containing the contents of the Element or Text node, + * or an empty string if the reader is positioned on any other type of node + */ + TString ReadString() const; + + /** + * Parses an attribute value into one or more Text and EntityReference nodes. + * @return A bool where true indicates the attribute value was parsed, + * and false indicates the reader was not positioned on an attribute node + * or all the attribute values have been read + */ + bool ReadAttributeValue() const; + + /** + * Gets the number of attributes on the current node. + * @return The number of attributes on the current node, or zero if the current node + * does not support attributes + */ + int GetAttributeCount() const; + + /** + * Gets the base Uniform Resource Identifier (URI) of the current node. + * @return The base URI of the current node or an empty string if not available + */ + TStringBuf GetBaseUri() const; + + /** + * Gets the depth of the current node in the XML document. + * @return The depth of the current node in the XML document + */ + int GetDepth() const; + + /** + * Gets a value indicating whether the current node has any attributes. + * @return true if the current node has attributes, false otherwise + */ + bool HasAttributes() const; + + /** + * Whether the node can have a text value. 
+ * @return true if the current node can have an associated text value, false otherwise + */ + bool HasValue() const; + + /** + * Whether an Attribute node was generated from the default value defined in the DTD or schema. + * @return true if defaulted, false otherwise + */ + bool IsDefault() const; + + /** + * Check if the current node is empty. + * @return true if empty, false otherwise + */ + bool IsEmptyElement() const; + + /** + * The local name of the node. + * @return the local name or empty string if not available + */ + TStringBuf GetLocalName() const; + + /** + * The qualified name of the node, equal to Prefix:LocalName. + * @return the name or empty string if not available + */ + TStringBuf GetName() const; + + /** + * The URI defining the namespace associated with the node. + * @return the namespace URI or empty string if not available + */ + TStringBuf GetNamespaceUri() const; + + /** + * Get the node type of the current node. + * @return the ENodeType of the current node + */ + ENodeType GetNodeType() const; + + /** + * Get the namespace prefix associated with the current node. + * @return the namespace prefix, or an empty string if not available + */ + TStringBuf GetPrefix() const; + + /** + * Get the quotation mark character used to enclose the value of an attribute. + * @return " or ' + */ + char GetQuoteChar() const; + + /** + * Provides the text value of the node if present. + * @return the string or empty if not available + */ + TStringBuf GetValue() const; + + /** + * Gets the read state of the reader. + * @return the state value + */ + EReadState GetReadState() const; + + /** + * This method releases any resources allocated by the current instance, + * changes the state to Closed and closes any underlying input. + */ + void Close(); + + /** + * Provides the value of the attribute with the specified index relative to the containing element. 
+ * @param number the zero-based index of the attribute relative to the containing element + */ + TString GetAttribute(int number) const; + + /** + * Provides the value of the attribute with the specified qualified name. + * @param name the qualified name of the attribute + */ + TString GetAttribute(TZtStringBuf name) const; + + /** + * Provides the value of the specified attribute. + * @param localName the local name of the attribute + * @param nsUri the namespace URI of the attribute + */ + TString GetAttribute(TZtStringBuf localName, TZtStringBuf nsUri) const; + + /** + * Resolves a namespace prefix in the scope of the current element. + * @param prefix the prefix whose namespace URI is to be resolved. To return the default namespace, specify empty string. + * @return a string containing the namespace URI to which the prefix maps. + */ + TString LookupNamespace(TZtStringBuf prefix) const; + + /** + * Moves the position of the current instance to the attribute with the specified index relative to the containing element. + * @param number the zero-based index of the attribute relative to the containing element + * @return true in case of success, false if not found + */ + bool MoveToAttribute(int number); + + /** + * Moves the position of the current instance to the attribute with the specified qualified name. + * @param name the qualified name of the attribute + * @return true in case of success, false if not found + */ + bool MoveToAttribute(TZtStringBuf name); + + /** + * Moves the position of the current instance to the attribute with the specified local name and namespace URI. + * @param localName the local name of the attribute + * @param nsUri the namespace URI of the attribute + * @return true in case of success, false if not found + */ + bool MoveToAttribute(TZtStringBuf localName, TZtStringBuf nsUri); + + /** + * Moves the position of the current instance to the first attribute associated with the current node. 
+ * @return true in case of success, false if not found + */ + bool MoveToFirstAttribute(); + + /** + * Moves the position of the current instance to the next attribute associated with the current node. + * @return true in case of success, false if not found + */ + bool MoveToNextAttribute(); + + /** + * Moves the position of the current instance to the node that contains the current Attribute node. + * @return true in case of success, false if not found + */ + bool MoveToElement(); + + /** + * Reads the contents of the current node and the full subtree. It then makes the subtree available until the next Read() call. + */ + TConstNode Expand() const; + + /** + * Skip to the node following the current one in document order while avoiding the subtree if any. + * @return true if the node was read successfully, false if there are no more nodes to read + */ + bool Next(); + + /** + * Retrieve the validity status from the parser context. + */ + bool IsValid() const; + + private: + static int ReadFromInputStreamCallback(void* context, char* buffer, int len); + static void OnLibxmlError(void* arg, const char* msg, xmlParserSeverities severity, xmlTextReaderLocatorPtr locator); + + void SetupErrorHandler(); + TStringStream& LogError() const; + void CheckForExceptions() const; + void ThrowException() const; + + // helpers that check return codes of C functions from libxml + bool BoolResult(int value) const; + int IntResult(int value) const; + char CharResult(int value) const; + TStringBuf ConstStringResult(const xmlChar* value) const; + TStringBuf ConstStringOrEmptyResult(const xmlChar* value) const; + TString TempStringResult(TCharPtr value) const; + TString TempStringOrEmptyResult(TCharPtr value) const; + + private: + IInputStream& Stream; + + mutable bool IsError; + mutable TStringStream ErrorBuffer; + + struct TDeleter; + THolder<xmlTextReader, TDeleter> Impl; + }; + +} diff --git a/library/cpp/xml/document/xml-textreader_ut.cpp 
b/library/cpp/xml/document/xml-textreader_ut.cpp new file mode 100644 index 0000000000..6232dfe47e --- /dev/null +++ b/library/cpp/xml/document/xml-textreader_ut.cpp @@ -0,0 +1,290 @@ +#include "xml-textreader.h" + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/hash.h> +#include <util/generic/vector.h> +#include <util/string/join.h> + +namespace { + /** + * Simple wrapper around the xmlTextReader wrapper + */ + void ParseXml(const TString& xmlData, + std::function<void(NXml::TConstNode)> nodeHandlerFunc, + const TString& localName, + const TString& namespaceUri = TString()) { + TStringInput in(xmlData); + NXml::TTextReader reader(in); + + while (reader.Read()) { + if (reader.GetNodeType() == NXml::TTextReader::ENodeType::Element && + reader.GetLocalName() == localName && + reader.GetNamespaceUri() == namespaceUri) + { + const NXml::TConstNode node = reader.Expand(); + nodeHandlerFunc(node); + } + } + } +} + +Y_UNIT_TEST_SUITE(TestXmlTextReader) { + Y_UNIT_TEST(BasicExample) { + const TString xml = "<?xml version=\"1.0\"?>\n" + "<example toto=\"1\">\n" + " <examplechild id=\"1\">\n" + " <child_of_child/>\n" + " </examplechild>\n" + " <examplechild id=\"2\" toto=\"3\">\n" + " <child_of_child>Some content : -)</child_of_child>\n" + " </examplechild>\n" + "</example>\n"; + + TStringInput input(xml); + NXml::TTextReader reader(input); + + using ENT = NXml::TTextReader::ENodeType; + + struct TItem { + int Depth; + ENT Type; + TString Name; + TString Attrs; + TString Value; + }; + + TVector<TItem> found; + TVector<TString> msgs; + + while (reader.Read()) { + // dump attributes as "k1: v1, k2: v2, ..." 
+ TVector<TString> kv; + if (reader.HasAttributes()) { + reader.MoveToFirstAttribute(); + do { + kv.push_back(TString::Join(reader.GetName(), ": ", reader.GetValue())); + } while (reader.MoveToNextAttribute()); + reader.MoveToElement(); + } + + found.push_back(TItem{ + reader.GetDepth(), + reader.GetNodeType(), + TString(reader.GetName()), + JoinSeq(", ", kv), + reader.HasValue() ? TString(reader.GetValue()) : TString(), + }); + } + + const TVector<TItem> expected = { + TItem{0, ENT::Element, "example", "toto: 1", ""}, + TItem{1, ENT::SignificantWhitespace, "#text", "", "\n "}, + TItem{1, ENT::Element, "examplechild", "id: 1", ""}, + TItem{2, ENT::SignificantWhitespace, "#text", "", "\n "}, + TItem{2, ENT::Element, "child_of_child", "", ""}, + TItem{2, ENT::SignificantWhitespace, "#text", "", "\n "}, + TItem{1, ENT::EndElement, "examplechild", "id: 1", ""}, + TItem{1, ENT::SignificantWhitespace, "#text", "", "\n "}, + TItem{1, ENT::Element, "examplechild", "id: 2, toto: 3", ""}, + TItem{2, ENT::SignificantWhitespace, "#text", "", "\n "}, + TItem{2, ENT::Element, "child_of_child", "", ""}, + TItem{3, ENT::Text, "#text", "", "Some content : -)"}, + TItem{2, ENT::EndElement, "child_of_child", "", ""}, + TItem{2, ENT::SignificantWhitespace, "#text", "", "\n "}, + TItem{1, ENT::EndElement, "examplechild", "id: 2, toto: 3", ""}, + TItem{1, ENT::SignificantWhitespace, "#text", "", "\n"}, + TItem{0, ENT::EndElement, "example", "toto: 1", ""}}; + + UNIT_ASSERT_VALUES_EQUAL(found.size(), expected.size()); + + for (size_t i = 0; i < expected.size(); ++i) { + UNIT_ASSERT_VALUES_EQUAL_C(found[i].Depth, expected[i].Depth, "line " << i); + UNIT_ASSERT_EQUAL_C(found[i].Type, expected[i].Type, "line " << i); + UNIT_ASSERT_VALUES_EQUAL_C(found[i].Name, expected[i].Name, "line " << i); + UNIT_ASSERT_VALUES_EQUAL_C(found[i].Attrs, expected[i].Attrs, "line " << i); + UNIT_ASSERT_VALUES_EQUAL_C(found[i].Value, expected[i].Value, "line " << i); + } + } + + const TString GEODATA = "<?xml 
version=\"1.0\" encoding=\"utf-8\"?>" + "<root>" + "" + " <country id=\"225\">" + " <name>Россия</name>" + " <cities>" + " <city>Москва</city>" + " <city>Санкт-Петербург</city>" + " </cities>" + " </country>" + "" + " <country id=\"149\">" + " <name>Беларусь</name>" + " <cities>" + " <city>Минск</city>" + " </cities>" + " </country>" + "" + " <country id=\"187\">" + " <name>Украина</name>" + " <cities>" + " <city>Киев</city>" + " </cities>" + " </country>" + "" + "</root>"; + + Y_UNIT_TEST(ParseXmlSimple) { + struct TCountry { + TString Name; + TVector<TString> Cities; + }; + + THashMap<int, TCountry> data; + + auto handler = [&data](NXml::TConstNode node) { + const int id = node.Attr<int>("id"); + + TCountry& c = data[id]; + + c.Name = node.FirstChild("name").Value<TString>(); + + const NXml::TConstNodes cityNodes = node.Nodes("cities/city"); + for (auto cityNode : cityNodes) { + c.Cities.push_back(cityNode.Value<TString>()); + } + }; + + ParseXml(GEODATA, handler, "country"); + + UNIT_ASSERT_EQUAL(data.size(), 3); + + UNIT_ASSERT(data.contains(225)); + const TCountry& russia = data.at(225); + UNIT_ASSERT_EQUAL(russia.Name, "Россия"); + UNIT_ASSERT_EQUAL(russia.Cities.size(), 2); + UNIT_ASSERT_EQUAL(russia.Cities[0], "Москва"); + UNIT_ASSERT_EQUAL(russia.Cities[1], "Санкт-Петербург"); + + UNIT_ASSERT(data.contains(149)); + const TCountry& belarus = data.at(149); + UNIT_ASSERT_EQUAL(belarus.Name, "Беларусь"); + UNIT_ASSERT_EQUAL(belarus.Cities.size(), 1); + UNIT_ASSERT_EQUAL(belarus.Cities[0], "Минск"); + + UNIT_ASSERT(data.contains(187)); + const TCountry& ukraine = data.at(187); + UNIT_ASSERT_EQUAL(ukraine.Name, "Украина"); + UNIT_ASSERT_EQUAL(ukraine.Cities.size(), 1); + UNIT_ASSERT_EQUAL(ukraine.Cities[0], "Киев"); + } + + Y_UNIT_TEST(ParseXmlDeepLevel) { + TVector<TString> cities; + + auto handler = [&cities](NXml::TConstNode node) { + cities.push_back(node.Value<TString>()); + }; + + ParseXml(GEODATA, handler, "city"); + + UNIT_ASSERT_EQUAL(cities.size(), 4); 
+ UNIT_ASSERT_EQUAL(cities[0], "Москва"); + UNIT_ASSERT_EQUAL(cities[1], "Санкт-Петербург"); + UNIT_ASSERT_EQUAL(cities[2], "Минск"); + UNIT_ASSERT_EQUAL(cities[3], "Киев"); + } + + Y_UNIT_TEST(ParseXmlException) { + // Check that exception properly passes through plain C code of libxml, + // no leaks are detected by valgrind. + auto handler = [](NXml::TConstNode node) { + const int id = node.Attr<int>("id"); + if (id != 225) { + ythrow yexception() << "unsupported id: " << id; + } + }; + + UNIT_ASSERT_EXCEPTION(ParseXml(GEODATA, handler, "country"), yexception); + UNIT_ASSERT_EXCEPTION(ParseXml("<a></b>", handler, "a"), yexception); + UNIT_ASSERT_EXCEPTION(ParseXml("<root><a id=\"1\"></a><a id=\"2\"></b></root>", handler, "a"), yexception); + UNIT_ASSERT_EXCEPTION(ParseXml("<root><a id=\"1\"></a><a id=\"2></a></root>", handler, "a"), yexception); + } + + const TString BACKA = // UTF-8 encoding is used implicitly + "<Companies" + " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"" + " xmlns=\"http://maps.yandex.ru/backa/1.x\"" + " xmlns:atom=\"http://www.w3.org/2005/Atom\"" + " xmlns:biz=\"http://maps.yandex.ru/business/1.x\"" + " xmlns:xal=\"urn:oasis:names:tc:ciq:xsdschema:xAL:2.0\"" + " xmlns:gml=\"http://www.opengis.net/gml\"" + ">" + "" + " <Company id=\"0001\">" + " <Geo>" + " <Location>" + " <gml:pos>37.62669 55.664827</gml:pos>" + " <kind>house</kind>" + " </Location>" + " <AddressDetails xmlns=\"urn:oasis:names:tc:ciq:xsdschema:xAL:2.0\">" + " <Country>" + " <AddressLine xml:lang=\"ru\">Москва, Каширское ш., 14</AddressLine>" + " </Country>" + " </AddressDetails>" + " </Geo>" + " </Company>" + "" + " <Company id=\"0002\">" + " <Geo>" + " <Location>" + " <pos xmlns=\"http://www.opengis.net/gml\">150.819797 59.56092</pos>" + " <kind>locality</kind>" + " </Location>" + " <xal:AddressDetails>" + " <xal:Country>" + " <xal:AddressLine xml:lang=\"ru\">Магадан, ул. 
Пролетарская, 43</xal:AddressLine>" + " </xal:Country>" + " </xal:AddressDetails>" + " </Geo>" + " </Company>" + "" + "</Companies>"; + + Y_UNIT_TEST(NamespaceHell) { + using TNS = NXml::TNamespaceForXPath; + const NXml::TNamespacesForXPath ns = { + TNS{"b", "http://maps.yandex.ru/backa/1.x"}, + TNS{"gml", "http://www.opengis.net/gml"}, + TNS{"xal", "urn:oasis:names:tc:ciq:xsdschema:xAL:2.0"}}; + + int count = 0; + THashMap<TString, TString> positions; + THashMap<TString, TString> addresses; + + auto handler = [&](NXml::TConstNode node) { + count++; + const auto id = node.Attr<TString>("id"); + + NXml::TXPathContextPtr ctxt = node.CreateXPathContext(ns); + + const NXml::TConstNode location = node.Node("b:Geo/b:Location", false, *ctxt); + positions[id] = location.Node("gml:pos", false, *ctxt).Value<TString>(); + addresses[id] = node.Node("b:Geo/xal:AddressDetails/xal:Country/xal:AddressLine", false, *ctxt).Value<TString>(); + }; + + ParseXml(BACKA, handler, "Company"); + UNIT_ASSERT_EQUAL(count, 0); + // nothing found because namespace was not specified + + ParseXml(BACKA, handler, "Company", "http://maps.yandex.ru/backa/1.x"); + + UNIT_ASSERT_VALUES_EQUAL(count, 2); + + UNIT_ASSERT_VALUES_EQUAL(positions["0001"], "37.62669 55.664827"); + UNIT_ASSERT_VALUES_EQUAL(positions["0002"], "150.819797 59.56092"); + + UNIT_ASSERT_VALUES_EQUAL(addresses["0001"], "Москва, Каширское ш., 14"); + UNIT_ASSERT_VALUES_EQUAL(addresses["0002"], "Магадан, ул. Пролетарская, 43"); + } +} diff --git a/library/cpp/xml/document/ya.make b/library/cpp/xml/document/ya.make new file mode 100644 index 0000000000..86bbd639cf --- /dev/null +++ b/library/cpp/xml/document/ya.make @@ -0,0 +1,17 @@ +LIBRARY() + +OWNER(finder) + +SRCS( + xml-document.cpp + xml-textreader.cpp + xml-options.cpp +) + +PEERDIR( + library/cpp/xml/init + contrib/libs/libxml + library/cpp/string_utils/ztstrbuf +) + +END() |