aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMaxim Akhmedov <max@tracto.ai>2024-11-28 15:52:58 +0300
committerrobot-piglet <robot-piglet@yandex-team.com>2024-11-28 16:04:24 +0300
commitd914da33a8058c17411fe1c33b6deed930f29450 (patch)
tree8243136dda861b60d20f5844fa5d8e3f6eb7006e
parent667d7a8073070f148e1adc7650b55e7a7ef33439 (diff)
downloadydb-d914da33a8058c17411fe1c33b6deed930f29450.tar.gz
Introduce YAML format support.
* Changelog entry Type: feature Component: proxy Support YAML format for structured data. See more details in RFC: https://github.com/ytsaurus/ytsaurus/wiki/%5BRFC%5D-YAML-format-support --- Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/938 commit_hash:2c6c1fbd1e3d1b83182a430b537c802eb8c6b79d
-rw-r--r--yt/yt/client/formats/config.cpp8
-rw-r--r--yt/yt/client/formats/config.h20
-rw-r--r--yt/yt/client/formats/public.h2
-rw-r--r--yt/yt/library/formats/format.cpp37
-rw-r--r--yt/yt/library/formats/ya.make4
-rw-r--r--yt/yt/library/formats/yaml_helpers.cpp209
-rw-r--r--yt/yt/library/formats/yaml_helpers.h114
-rw-r--r--yt/yt/library/formats/yaml_parser.cpp562
-rw-r--r--yt/yt/library/formats/yaml_parser.h20
-rw-r--r--yt/yt/library/formats/yaml_writer.cpp388
-rw-r--r--yt/yt/library/formats/yaml_writer.h18
11 files changed, 1382 insertions, 0 deletions
diff --git a/yt/yt/client/formats/config.cpp b/yt/yt/client/formats/config.cpp
index 71c1152061..ed4ee65a73 100644
--- a/yt/yt/client/formats/config.cpp
+++ b/yt/yt/client/formats/config.cpp
@@ -352,4 +352,12 @@ void TSkiffFormatConfig::Register(TRegistrar registrar)
////////////////////////////////////////////////////////////////////////////////
+// Registers the parameters of the YAML format config.
+// "write_uint_tag" is bound to the WriteUintTag field and defaults to false.
+void TYamlFormatConfig::Register(TRegistrar registrar)
+{
+ registrar.Parameter("write_uint_tag", &TThis::WriteUintTag)
+ .Default(false);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
} // namespace NYT::NFormats
diff --git a/yt/yt/client/formats/config.h b/yt/yt/client/formats/config.h
index aec3f30db4..84b4164b12 100644
--- a/yt/yt/client/formats/config.h
+++ b/yt/yt/client/formats/config.h
@@ -415,4 +415,24 @@ DEFINE_REFCOUNTED_TYPE(TSkiffFormatConfig)
////////////////////////////////////////////////////////////////////////////////
+//! Options of the YAML format. The set of recognized parameters and
+//! their defaults is defined in Register().
+class TYamlFormatConfig
+ : public NYTree::TYsonStruct
+{
+public:
+ //! Write explicit tag "!yt/uint64" for uint64 data type.
+ //! Use this option if you want to preserve information about
+ //! the original YT type (without it, numbers in range [0, 2^63-1]
+ //! will always be written as integers).
+ //! Option has no effect for parsing.
+ bool WriteUintTag;
+
+ REGISTER_YSON_STRUCT(TYamlFormatConfig);
+
+ static void Register(TRegistrar registrar);
+};
+
+DEFINE_REFCOUNTED_TYPE(TYamlFormatConfig)
+
+////////////////////////////////////////////////////////////////////////////////
+
} // namespace NYT::NFormats
diff --git a/yt/yt/client/formats/public.h b/yt/yt/client/formats/public.h
index 0ac2a23471..753456699b 100644
--- a/yt/yt/client/formats/public.h
+++ b/yt/yt/client/formats/public.h
@@ -58,6 +58,7 @@ DEFINE_ENUM(EFormatType,
(WebJson)
(Skiff)
(Arrow)
+ (Yaml)
);
////////////////////////////////////////////////////////////////////////////////
@@ -76,6 +77,7 @@ DECLARE_REFCOUNTED_CLASS(TProtobufTableConfig)
DECLARE_REFCOUNTED_CLASS(TProtobufFormatConfig)
DECLARE_REFCOUNTED_CLASS(TWebJsonFormatConfig)
DECLARE_REFCOUNTED_CLASS(TSkiffFormatConfig)
+DECLARE_REFCOUNTED_CLASS(TYamlFormatConfig)
DECLARE_REFCOUNTED_STRUCT(IYamrConsumer)
diff --git a/yt/yt/library/formats/format.cpp b/yt/yt/library/formats/format.cpp
index 6da0985abf..f551b9136e 100644
--- a/yt/yt/library/formats/format.cpp
+++ b/yt/yt/library/formats/format.cpp
@@ -12,6 +12,8 @@
#include "schemaless_writer_adapter.h"
#include "skiff_parser.h"
#include "skiff_writer.h"
+#include "yaml_parser.h"
+#include "yaml_writer.h"
#include "yamred_dsv_parser.h"
#include "yamred_dsv_writer.h"
#include "yamr_parser.h"
@@ -108,6 +110,18 @@ std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForDsv(
};
}
+//! Creates a YAML writer over #output configured from the format #attributes.
+//! YAML is only available for structured data; any other data type is rejected.
+std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForYaml(
+    EDataType dataType,
+    const IAttributeDictionary& attributes,
+    IZeroCopyOutput* output)
+{
+    const bool isStructured = (dataType == EDataType::Structured);
+    if (!isStructured) {
+        THROW_ERROR_EXCEPTION("YAML is supported only for structured data");
+    }
+
+    auto yamlConfig = ConvertTo<TYamlFormatConfigPtr>(&attributes);
+    auto ysonType = DataTypeToYsonType(dataType);
+    return CreateYamlWriter(output, ysonType, yamlConfig);
+}
+
class TTableParserAdapter
: public IParser
{
@@ -161,6 +175,8 @@ std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForFormat(
return CreateConsumerForJson(dataType, format.Attributes(), output);
case EFormatType::Dsv:
return CreateConsumerForDsv(dataType, format.Attributes(), output);
+ case EFormatType::Yaml:
+ return CreateConsumerForYaml(dataType, format.Attributes(), output);
default:
THROW_ERROR_EXCEPTION("Unsupported output format %Qlv",
format.GetType());
@@ -408,6 +424,21 @@ TYsonProducer CreateProducerForJson(
});
}
+//! Returns a producer that, once invoked with a consumer, parses YAML from #input
+//! into that consumer. YAML is only available for structured data.
+TYsonProducer CreateProducerForYaml(
+    EDataType dataType,
+    const IAttributeDictionary& attributes,
+    IInputStream* input)
+{
+    const bool isStructured = (dataType == EDataType::Structured);
+    if (!isStructured) {
+        THROW_ERROR_EXCEPTION("YAML is supported only for structured data");
+    }
+
+    auto ysonType = DataTypeToYsonType(dataType);
+    auto yamlConfig = ConvertTo<TYamlFormatConfigPtr>(&attributes);
+    // Everything the deferred parse needs is captured by value.
+    return BIND([input, yamlConfig, ysonType] (IYsonConsumer* consumer) {
+        ParseYaml(input, consumer, yamlConfig, ysonType);
+    });
+}
+
TYsonProducer CreateProducerForYson(EDataType dataType, IInputStream* input)
{
auto ysonType = DataTypeToYsonType(dataType);
@@ -429,6 +460,8 @@ TYsonProducer CreateProducerForFormat(const TFormat& format, EDataType dataType,
return CreateProducerForYamredDsv(dataType, format.Attributes(), input);
case EFormatType::SchemafulDsv:
return CreateProducerForSchemafulDsv(dataType, format.Attributes(), input);
+ case EFormatType::Yaml:
+ return CreateProducerForYaml(dataType, format.Attributes(), input);
default:
THROW_ERROR_EXCEPTION("Unsupported input format %Qlv",
format.GetType());
@@ -489,6 +522,10 @@ std::unique_ptr<IParser> CreateParserForFormat(const TFormat& format, EDataType
auto config = ConvertTo<TSchemafulDsvFormatConfigPtr>(&format.Attributes());
return CreateParserForSchemafulDsv(consumer, config);
}
+ case EFormatType::Yaml:
+ // We can only get here with EDataType::Tabular, so throw specific error about supporting
+ // only structured data in YAML.
+ THROW_ERROR_EXCEPTION("YAML is supported only for structured data");
default:
THROW_ERROR_EXCEPTION("Unsupported input format %Qlv",
format.GetType());
diff --git a/yt/yt/library/formats/ya.make b/yt/yt/library/formats/ya.make
index 58c8f28f9d..b15a6a04e5 100644
--- a/yt/yt/library/formats/ya.make
+++ b/yt/yt/library/formats/ya.make
@@ -22,6 +22,9 @@ SRCS(
skiff_yson_converter.cpp
unversioned_value_yson_writer.cpp
web_json_writer.cpp
+ yaml_helpers.cpp
+ yaml_parser.cpp
+ yaml_writer.cpp
yamred_dsv_parser.cpp
yamred_dsv_writer.cpp
yamr_parser_base.cpp
@@ -40,6 +43,7 @@ PEERDIR(
yt/yt/library/column_converters
contrib/libs/apache/arrow
+ contrib/libs/yaml
)
END()
diff --git a/yt/yt/library/formats/yaml_helpers.cpp b/yt/yt/library/formats/yaml_helpers.cpp
new file mode 100644
index 0000000000..1797514767
--- /dev/null
+++ b/yt/yt/library/formats/yaml_helpers.cpp
@@ -0,0 +1,209 @@
+#include "yaml_helpers.h"
+
+#include <yt/yt/core/ytree/fluent.h>
+
+#include <contrib/libs/re2/re2/re2.h>
+
+namespace NYT::NFormats {
+
+using namespace NYTree;
+
+////////////////////////////////////////////////////////////////////////////////
+
+//! Zero-initializes the wrapped libyaml struct.
+template <class TLibYamlType, void(*Deleter)(TLibYamlType*)>
+TLibYamlTypeWrapper<TLibYamlType, Deleter>::TLibYamlTypeWrapper()
+{
+    // Just in case if we are allocated on stack and the destructor is called before
+    // the object is initialized.
+    // NB: only the libyaml base subobject is wiped. The wrapper itself has a user-provided
+    // constructor and destructor, so it is not a type memset may legally be applied to,
+    // while the underlying C struct is.
+    memset(static_cast<TLibYamlType*>(this), 0, sizeof(TLibYamlType));
+}
+
+//! Releases the resources held by the wrapped struct via #Deleter and brings it back
+//! to the zeroed state, so that Reset() may be called again (including from the destructor).
+template <class TLibYamlType, void(*Deleter)(TLibYamlType*)>
+void TLibYamlTypeWrapper<TLibYamlType, Deleter>::Reset()
+{
+    Deleter(this);
+    memset(static_cast<TLibYamlType*>(this), 0, sizeof(TLibYamlType));
+}
+
+//! Releases the wrapped struct.
+template <class TLibYamlType, void(*Deleter)(TLibYamlType*)>
+TLibYamlTypeWrapper<TLibYamlType, Deleter>::~TLibYamlTypeWrapper()
+{
+    Reset();
+}
+
+// Explicitly instantiate the wrappers for the types we use.
+template struct TLibYamlTypeWrapper<yaml_parser_t, yaml_parser_delete>;
+template struct TLibYamlTypeWrapper<yaml_emitter_t, yaml_emitter_delete>;
+template struct TLibYamlTypeWrapper<yaml_event_t, yaml_event_delete>;
+
+////////////////////////////////////////////////////////////////////////////////
+
+//! Maps the YAML tags we recognize to scalar types.
+//! The table is read-only: it is declared const so that it cannot be modified
+//! by accident (e.g. by an inserting operator[] lookup).
+static const THashMap<std::string_view, EYamlScalarType> YTTypeMap = {
+    {"!", EYamlScalarType::String},
+    {YAML_INT_TAG, EYamlScalarType::Int},
+    {YAML_FLOAT_TAG, EYamlScalarType::Float},
+    {YAML_BOOL_TAG, EYamlScalarType::Bool},
+    {YAML_NULL_TAG, EYamlScalarType::Null},
+    {YAML_STR_TAG, EYamlScalarType::String},
+    {YTUintTag, EYamlScalarType::Uint},
+};
+
+//! Looks #tag up in the table of recognized tags; a tag that is not in the table
+//! is treated as a string.
+EYamlScalarType DeduceScalarTypeFromTag(const std::string_view& tag)
+{
+    auto it = YTTypeMap.find(tag);
+    return it == YTTypeMap.end()
+        ? EYamlScalarType::String
+        : it->second;
+}
+
+//! Guesses the scalar type of an untagged plain scalar by matching it against
+//! the regular expressions below; a value matching none of them is a string.
+EYamlScalarType DeduceScalarTypeFromValue(const std::string_view& value)
+{
+    // We conform to YAML 1.2 Core Schema:
+    // https://yaml.org/spec/1.2.2/#103-core-schema
+    static const re2::RE2 NullRE = "null|Null|NULL|~|";
+    static const re2::RE2 BoolRE = "true|True|TRUE|false|False|FALSE";
+    static const re2::RE2 IntRE = "[+-]?[0-9]+";
+    // In YAML 1.2 there are also octal and hexadecimal integers, but they are always positive.
+    // Therefore, we treat them separately and represent as a uint scalar type.
+    static const re2::RE2 UintRE = "0o[0-7]+|0x[0-9a-fA-F]+";
+    static const re2::RE2 FloatRE =
+        "[-+]?(\\.[0-9]+|[0-9]+(\\.[0-9]*)?)([eE][-+]?[0-9]+)?|"
+        "[-+]?(\\.inf|\\.Inf|\\.INF)|"
+        "\\.nan|\\.NaN|\\.NAN";
+
+    // The rules are tried in this order; the first full match wins.
+    const std::pair<const re2::RE2*, EYamlScalarType> rules[] = {
+        {&NullRE, EYamlScalarType::Null},
+        {&BoolRE, EYamlScalarType::Bool},
+        {&IntRE, EYamlScalarType::Int},
+        {&UintRE, EYamlScalarType::Uint},
+        {&FloatRE, EYamlScalarType::Float},
+    };
+    for (const auto& [pattern, scalarType] : rules) {
+        if (re2::RE2::FullMatch(value, *pattern)) {
+            return scalarType;
+        }
+    }
+    return EYamlScalarType::String;
+}
+
+//! Converts one of the boolean spellings (true/True/TRUE, false/False/FALSE)
+//! into a bool; throws for any other value.
+bool ParseAndValidateYamlBool(const std::string_view& value)
+{
+    const bool isTrue = (value == "true" || value == "True" || value == "TRUE");
+    if (isTrue) {
+        return true;
+    }
+
+    const bool isFalse = (value == "false" || value == "False" || value == "FALSE");
+    if (!isFalse) {
+        THROW_ERROR_EXCEPTION("Value %Qv is not a boolean", value);
+    }
+    return false;
+}
+
+//! Parses an integer scalar written in decimal, hexadecimal ("0x" prefix) or
+//! octal ("0o" prefix) form.
+//! For EYamlScalarType::Int the value is first tried as int64 and then as uint64;
+//! for any other #yamlType only uint64 is tried.
+//! Throws if the value does not fit into the allowed domain.
+std::pair<ENodeType, TNonStringScalar> ParseAndValidateYamlInteger(const std::string_view& value, EYamlScalarType yamlType)
+{
+    // First, detect the base and prepare a string for calling TryIntFromString
+    // by optionally removing the 0x/0o prefix.
+    int base;
+    std::string_view adjustedValue;
+    if (value.starts_with("0x")) {
+        base = 16;
+        adjustedValue = value.substr(2);
+    } else if (value.starts_with("0o")) {
+        base = 8;
+        adjustedValue = value.substr(2);
+    } else {
+        base = 10;
+        adjustedValue = value;
+    }
+    i64 i64Value = 0;
+    ui64 ui64Value = 0;
+
+    // The base is a template argument of TryIntFromString, hence the runtime dispatch.
+    auto tryFromString = [&] (auto& result) -> bool {
+        if (base == 10) {
+            return TryIntFromString<10>(adjustedValue, result);
+        } else if (base == 16) {
+            return TryIntFromString<16>(adjustedValue, result);
+        } else if (base == 8) {
+            // NB: this must be a comparison; an assignment here would silently
+            // overwrite the base and make the abort below unreachable.
+            return TryIntFromString<8>(adjustedValue, result);
+        } else {
+            YT_ABORT();
+        }
+    };
+
+    // For untagged or int-tagged values (EYamlScalarType::Int) we first try to fit the value into int64, then into uint64.
+    // For uint-tagged values (EYamlScalarType::Uint) we try to fit the value only into uint64.
+    if (yamlType == EYamlScalarType::Int && tryFromString(i64Value)) {
+        return {ENodeType::Int64, {.Int64 = i64Value}};
+    } else if (tryFromString(ui64Value)) {
+        return {ENodeType::Uint64, {.Uint64 = ui64Value}};
+    } else {
+        std::string requiredDomain = (yamlType == EYamlScalarType::Int) ? "either int64 or uint64" : "uint64";
+        THROW_ERROR_EXCEPTION("Value %Qv is not an integer or does not fit into %v", value, requiredDomain);
+    }
+}
+
+//! Parses a floating point scalar. The special spellings of infinity
+//! ([+-].inf/.Inf/.INF) and not-a-number (.nan/.NaN/.NAN) are handled explicitly;
+//! everything else is passed to TryFromString<double>.
+//! Throws if the value cannot be parsed.
+double ParseAndValidateYamlDouble(const std::string_view& value)
+{
+    double doubleValue = 0;
+    if (value == ".inf" || value == ".Inf" || value == ".INF" ||
+        value == "+.inf" || value == "+.Inf" || value == "+.INF")
+    {
+        doubleValue = std::numeric_limits<double>::infinity();
+    } else if (value == "-.inf" || value == "-.Inf" || value == "-.INF") {
+        doubleValue = -std::numeric_limits<double>::infinity();
+    } else if (value == ".nan" || value == ".NaN" || value == ".NAN") {
+        doubleValue = std::numeric_limits<double>::quiet_NaN();
+    } else if (!TryFromString<double>(value, doubleValue)) {
+        THROW_ERROR_EXCEPTION("Value %Qv is not a floating point number or does not fit into double", value);
+    }
+    return doubleValue;
+}
+
+//! Maps a recognized YAML scalar type onto a YT node type and, for non-string
+//! types, parses #value into the matching member of TNonStringScalar.
+std::pair<ENodeType, TNonStringScalar> ParseScalarValue(const std::string_view& value, EYamlScalarType yamlType)
+{
+    switch (yamlType) {
+        case EYamlScalarType::String:
+            // Nothing to parse: the scalar part of the result is left empty.
+            return {ENodeType::String, {}};
+        case EYamlScalarType::Null:
+            return {ENodeType::Entity, {}};
+        case EYamlScalarType::Bool:
+            return {ENodeType::Boolean, {.Boolean = ParseAndValidateYamlBool(value)}};
+        case EYamlScalarType::Int:
+        case EYamlScalarType::Uint:
+            // Signedness of the result is decided by the integer parser.
+            return ParseAndValidateYamlInteger(value, yamlType);
+        case EYamlScalarType::Float:
+            return {ENodeType::Double, {.Double = ParseAndValidateYamlDouble(value)}};
+    }
+    YT_ABORT();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+//! Views #length bytes starting at #literal as a string; a null pointer
+//! yields an empty view.
+std::string_view YamlLiteralToStringView(const yaml_char_t* literal, size_t length)
+{
+    if (!literal) {
+        return std::string_view();
+    }
+    return std::string_view(reinterpret_cast<const char*>(literal), length);
+}
+
+//! Same as above for a null-terminated literal.
+std::string_view YamlLiteralToStringView(const yaml_char_t* literal)
+{
+    if (!literal) {
+        return std::string_view();
+    }
+    return std::string_view(reinterpret_cast<const char*>(literal));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT::NFormats
+
+// Serializes a libyaml position mark as a YSON map with two items:
+// "position" holds "<line>:<column>" and "index" holds mark.index cast to i64.
+void Serialize(const yaml_mark_t& mark, NYT::NYson::IYsonConsumer* consumer)
+{
+ NYT::NYTree::BuildYsonFluently(consumer)
+ .BeginMap()
+ .Item("position").Value(NYT::Format("%v:%v", mark.line, mark.column))
+ .Item("index").Value(static_cast<i64>(mark.index))
+ .EndMap();
+}
diff --git a/yt/yt/library/formats/yaml_helpers.h b/yt/yt/library/formats/yaml_helpers.h
new file mode 100644
index 0000000000..f9b3300ffc
--- /dev/null
+++ b/yt/yt/library/formats/yaml_helpers.h
@@ -0,0 +1,114 @@
+#pragma once
+
+#include "private.h"
+
+#include <yt/yt/core/yson/public.h>
+
+#include <yt/yt/core/ytree/public.h>
+
+#include <contrib/libs/yaml/include/yaml.h>
+
+namespace NYT::NFormats {
+
+////////////////////////////////////////////////////////////////////////////////
+
+//! A non-copyable wrapper around a libyaml C struct.
+//! The wrapper derives from the struct itself, so a pointer to the wrapper can be
+//! passed wherever libyaml expects a pointer to the struct. #Deleter is the libyaml
+//! function that releases the struct; see the member definitions for when it is invoked.
+template <class TLibYamlType, void(*Deleter)(TLibYamlType*)>
+struct TLibYamlTypeWrapper
+ : public TLibYamlType
+ , public TNonCopyable
+{
+ TLibYamlTypeWrapper();
+ void Reset();
+ ~TLibYamlTypeWrapper();
+};
+
+using TLibYamlParser = TLibYamlTypeWrapper<yaml_parser_t, yaml_parser_delete>;
+using TLibYamlEmitter = TLibYamlTypeWrapper<yaml_emitter_t, yaml_emitter_delete>;
+using TLibYamlEvent = TLibYamlTypeWrapper<yaml_event_t, yaml_event_delete>;
+
+////////////////////////////////////////////////////////////////////////////////
+
+// These enums are counterparts of the enums in the Yaml library.
+// Keep them in sync with the library.
+
+DEFINE_ENUM(EYamlErrorType,
+ ((NoError) (YAML_NO_ERROR))
+ ((Memory) (YAML_MEMORY_ERROR))
+ ((Reader) (YAML_READER_ERROR))
+ ((Scanner) (YAML_SCANNER_ERROR))
+ ((Parser) (YAML_PARSER_ERROR))
+ ((Composer) (YAML_COMPOSER_ERROR))
+ ((Writer) (YAML_WRITER_ERROR))
+ ((Emitter) (YAML_EMITTER_ERROR))
+);
+
+DEFINE_ENUM(EYamlEventType,
+ ((NoEvent) (YAML_NO_EVENT))
+ ((StreamStart) (YAML_STREAM_START_EVENT))
+ ((StreamEnd) (YAML_STREAM_END_EVENT))
+ ((DocumentStart) (YAML_DOCUMENT_START_EVENT))
+ ((DocumentEnd) (YAML_DOCUMENT_END_EVENT))
+ ((Alias) (YAML_ALIAS_EVENT))
+ ((Scalar) (YAML_SCALAR_EVENT))
+ ((SequenceStart) (YAML_SEQUENCE_START_EVENT))
+ ((SequenceEnd) (YAML_SEQUENCE_END_EVENT))
+ ((MappingStart) (YAML_MAPPING_START_EVENT))
+ ((MappingEnd) (YAML_MAPPING_END_EVENT))
+);
+
+//! This tag is used for denoting 2-element sequences that represent a YSON node with attributes.
+static constexpr std::string_view YTAttrNodeTag = "!yt/attrnode";
+
+//! This tag is used upon parsing to denote an integer scalar which should be
+//! represented by YT uint64 type. Writer by default omits this tag, but may be
+//! configured to force this tag on all uint64 values.
+static constexpr std::string_view YTUintTag = "!yt/uint64";
+
+////////////////////////////////////////////////////////////////////////////////
+
+//! We support:
+//! - YAML 1.2 Core schema types
+//! - YT-specific uint type, for which we introduce a special tag "!yt/uint64".
+DEFINE_ENUM(EYamlScalarType,
+ (String)
+ (Int)
+ (Float)
+ (Bool)
+ (Null)
+ (Uint)
+);
+
+//! Storage for a parsed scalar of a non-string type. The union carries no
+//! discriminator of its own: ParseScalarValue returns it paired with the node type
+//! that tells which member is meaningful.
+union TNonStringScalar
+{
+ i64 Int64;
+ ui64 Uint64;
+ double Double;
+ bool Boolean;
+};
+
+//! Extracts a recognized YAML scalar type from a tag.
+EYamlScalarType DeduceScalarTypeFromTag(const std::string_view& tag);
+//! Guesses a recognized YAML scalar type from a value.
+EYamlScalarType DeduceScalarTypeFromValue(const std::string_view& value);
+//! Given a recognized YAML type, transforms it into a YT type and,
+//! in case of a non-string result, parses a scalar value.
+std::pair<NYTree::ENodeType, TNonStringScalar> ParseScalarValue(
+ const std::string_view& value,
+ EYamlScalarType yamlType);
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Convenience helpers for transforming a weirdly represented (yaml_char_t* ~ unsigned char*)
+// YAML string into string_view, also handling the case of a null pointer.
+
+std::string_view YamlLiteralToStringView(const yaml_char_t* literal, size_t length);
+std::string_view YamlLiteralToStringView(const yaml_char_t* literal);
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT::NFormats
+
+// Note that ADL requires to put this function in the global namespace since
+// yaml_mark_t is defined in the global namespace from C++ POV
+
+void Serialize(const yaml_mark_t& mark, NYT::NYson::IYsonConsumer* consumer);
diff --git a/yt/yt/library/formats/yaml_parser.cpp b/yt/yt/library/formats/yaml_parser.cpp
new file mode 100644
index 0000000000..5e930f250e
--- /dev/null
+++ b/yt/yt/library/formats/yaml_parser.cpp
@@ -0,0 +1,562 @@
+#include "yaml_parser.h"
+
+#include "yaml_helpers.h"
+
+#include <yt/yt/client/formats/parser.h>
+
+#include <yt/yt/core/yson/consumer.h>
+
+#include <yt/yt/core/ytree/convert.h>
+
+#include <yt/yt/core/misc/coro_pipe.h>
+
+#include <contrib/libs/yaml/include/yaml.h>
+
+namespace NYT::NFormats {
+
+using namespace NYson;
+using namespace NYTree;
+
+////////////////////////////////////////////////////////////////////////////////
+
+//! A helper class that takes care of the repeated parts of a YAML document that
+//! are expressed as anchors and aliases. Under the hood, materializes a YSON
+//! string for each anchor and emits it to the underlying consumer via
+//! OnRaw when needed.
+/*!
+ * Implementation notes:
+ * - Conforming to YAML 1.2, alias may refer only to a previously defined anchor.
+ * - Aliasing an anchor to an ancestor node is not supported as the resulting document
+ * cannot be represented as a finite YSON (even though some implementations with tree
+ * representations support that, e.g. PyYAML).
+ * - According to the YAML spec, alias may be "overridden" by a later definition.
+ * This feature is considered error-prone, will probably be removed in next
+ * versions of YAML spec (https://github.com/yaml/yaml-spec/pull/65) and is not
+ * supported by us.
+ * - Using an alias to a scalar anchor as a map key or anchoring a map key are not
+ * supported for the sake of simpler implementation (and are considered a weird thing
+ * to do by an author of this code).
+ */
+class TAnchorRecordingConsumer
+ : public IYsonConsumer
+{
+public:
+ explicit TAnchorRecordingConsumer(IYsonConsumer* underlyingConsumer)
+ : UnderlyingConsumer_(underlyingConsumer)
+ , RunListWriter_(&RunListStream_, EYsonType::ListFragment)
+ { }
+
+ void OnStringScalar(TStringBuf value) override
+ {
+ ForAllConsumers([=] (auto* consumer) { consumer->OnStringScalar(value); });
+ MaybeFinishAnchor();
+ }
+
+ void OnInt64Scalar(i64 value) override
+ {
+ ForAllConsumers([=] (auto* consumer) { consumer->OnInt64Scalar(value); });
+ MaybeFinishAnchor();
+ }
+
+ void OnUint64Scalar(ui64 value) override
+ {
+ ForAllConsumers([=] (auto* consumer) { consumer->OnUint64Scalar(value); });
+ MaybeFinishAnchor();
+ }
+
+ void OnDoubleScalar(double value) override
+ {
+ ForAllConsumers([=] (auto* consumer) { consumer->OnDoubleScalar(value); });
+ MaybeFinishAnchor();
+ }
+
+ void OnBooleanScalar(bool value) override
+ {
+ ForAllConsumers([=] (auto* consumer) { consumer->OnBooleanScalar(value); });
+ MaybeFinishAnchor();
+ }
+
+ void OnEntity() override
+ {
+ ForAllConsumers([=] (auto* consumer) { consumer->OnEntity(); });
+ MaybeFinishAnchor();
+ }
+
+ void OnBeginList() override
+ {
+ ++CurrentDepth_;
+ ForAllConsumers([=] (auto* consumer) { consumer->OnBeginList(); });
+ }
+
+ void OnListItem() override
+ {
+ ForAllConsumers([=] (auto* consumer) { consumer->OnListItem(); });
+ }
+
+ void OnEndList() override
+ {
+ ForAllConsumers([=] (auto* consumer) { consumer->OnEndList(); });
+ --CurrentDepth_;
+ MaybeFinishAnchor();
+ }
+
+ void OnBeginMap() override
+ {
+ ++CurrentDepth_;
+ ForAllConsumers([] (auto* consumer) { consumer->OnBeginMap(); });
+ }
+
+ void OnKeyedItem(TStringBuf key) override
+ {
+ ForAllConsumers([=] (auto* consumer) { consumer->OnKeyedItem(key); });
+ }
+
+ void OnEndMap() override
+ {
+ ForAllConsumers([] (auto* consumer) { consumer->OnEndMap(); });
+ --CurrentDepth_;
+ MaybeFinishAnchor();
+ }
+
+ void OnBeginAttributes() override
+ {
+ ++CurrentDepth_;
+ ForAllConsumers([] (auto* consumer) { consumer->OnBeginAttributes(); });
+ }
+
+ void OnEndAttributes() override
+ {
+ ForAllConsumers([] (auto* consumer) { consumer->OnEndAttributes(); });
+ --CurrentDepth_;
+ // NB: do not call MaybeFinishAnchor here, as we do not want to record only
+ // the attribute map part of the node; the anchor is finished by the node that follows.
+ }
+
+ void OnRaw(TStringBuf yson, EYsonType type) override
+ {
+ // The only caller for this OnRaw is ourselves in case of aliases, and aliases
+ // always point to YSON node.
+ YT_VERIFY(type == EYsonType::Node);
+ ForAllConsumers([=] (auto* consumer) { consumer->OnRaw(yson, type); });
+ MaybeFinishAnchor();
+ }
+
+ //! Starts recording the node that follows under the name #anchor.
+ //! Throws if an anchor with this name has already been seen.
+ void OnAnchor(const std::string& anchor)
+ {
+ // Opens a new run unless some outer anchor is already being recorded.
+ StartRun();
+ auto inserted = KnownAnchorNames_.insert(anchor).second;
+ if (!inserted) {
+ THROW_ERROR_EXCEPTION("Anchor %Qv is already defined", anchor);
+ }
+ // NB: the stack entry is created before it is filled in on purpose: GetCurrentRunOffset
+ // verifies IsInRun(), which holds only when ConstructingAnchors_ is non-empty.
+ auto& currentAnchor = ConstructingAnchors_.emplace_back();
+ currentAnchor = {
+ anchor,
+ CurrentDepth_,
+ GetCurrentRunOffset(),
+ };
+ }
+
+ void OnAlias(const std::string& alias)
+ {
+ auto it = FinishedAnchors_.find(alias);
+ if (it == FinishedAnchors_.end()) {
+ THROW_ERROR_EXCEPTION("Alias %Qv refers to an undefined or unfinished anchor", alias);
+ }
+ auto& anchor = it->second;
+
+ RunListWriter_.Flush();
+ std::string_view yson = RunListStream_.Str();
+ yson = yson.substr(anchor.StartOffset, anchor.EndOffset - anchor.StartOffset);
+ // NB: TBufferedBinaryYsonWriter writes ';' in a bit different way that you would expect from
+ // IYsonConsumer interface -- not as a reaction to OnListItem or OnKeyedItem, but rather when a node
+ // is finished. This leads to yson string above always containing extra trailing ';'.
+ // We strip it off, as our YSON consumers expect a YSON node to be serialized during OnAlias call.
+ YT_VERIFY(yson.ends_with(NYson::NDetail::ItemSeparatorSymbol));
+ yson.remove_suffix(1);
+ OnRaw(yson, EYsonType::Node);
+ }
+
+private:
+ IYsonConsumer* UnderlyingConsumer_;
+ //! Whenever there is at least one anchor being recorded, the stream is used to
+ //! record the YSON representation of the outermost anchor. We call the representation
+ //! of such an outermost anchor a run. Conveniently, we represent runs as elements of
+ //! a fictional YSON list, making each anchor a substring of that YSON list.
+ TStringStream RunListStream_;
+ TBufferedBinaryYsonWriter RunListWriter_;
+
+ struct TAnchor
+ {
+ std::string Name;
+ int Depth;
+ ssize_t StartOffset;
+ ssize_t EndOffset = -1;
+ };
+ //! A stack of all anchors currently being constructed.
+ std::vector<TAnchor> ConstructingAnchors_;
+ //! A set of all anchors that are currently constructed.
+ THashSet<std::string> KnownAnchorNames_;
+
+ //! A map containing YSON representations of anchors that have been finished.
+ THashMap<std::string, TAnchor> FinishedAnchors_;
+
+ int CurrentDepth_ = 0;
+
+ void ForAllConsumers(auto&& action)
+ {
+ action(UnderlyingConsumer_);
+ if (IsInRun()) {
+ action(&RunListWriter_);
+ }
+ }
+
+ void StartRun()
+ {
+ if (!IsInRun()) {
+ RunListWriter_.OnListItem();
+ }
+ }
+
+ bool IsInRun() const
+ {
+ return !ConstructingAnchors_.empty();
+ }
+
+ ssize_t GetCurrentRunOffset() const
+ {
+ YT_VERIFY(IsInRun());
+ return RunListWriter_.GetTotalWrittenSize();
+ }
+
+ //! Called after a node has been completed. If the current depth equals the depth
+ //! at which the innermost anchor was opened, that anchor is finalized: its end offset
+ //! is recorded and it is moved from the stack to FinishedAnchors_. Once the stack
+ //! becomes empty, IsInRun() turns false and events are no longer copied to the run writer.
+ //! NOTE(review): at most one anchor is finalized per call; if two anchors can be opened
+ //! at the same depth, the outer one stays on the stack after this call -- confirm
+ //! whether that case is reachable.
+ void MaybeFinishAnchor()
+ {
+ if (!ConstructingAnchors_.empty() && CurrentDepth_ == ConstructingAnchors_.back().Depth) {
+ // Finalize the innermost (stack topmost) anchor.
+ YT_VERIFY(IsInRun());
+ auto& anchor = ConstructingAnchors_.back();
+
+ anchor.EndOffset = GetCurrentRunOffset();
+ auto inserted = FinishedAnchors_.emplace(anchor.Name, std::move(anchor)).second;
+ // Insertion is ensured by the checks in OnAnchor.
+ YT_VERIFY(inserted);
+ ConstructingAnchors_.pop_back();
+ }
+ }
+};
+
+class TYamlParser
+{
+public:
+    //! Prepares a libyaml parser that pulls its input from #input via ReadHandler
+    //! and reports parsed nodes to #consumer.
+    //! Throws if the libyaml parser cannot be initialized.
+    TYamlParser(IInputStream* input, IYsonConsumer* consumer, TYamlFormatConfigPtr config, EYsonType ysonType)
+        : Input_(input)
+        , Consumer_(consumer)
+        , Config_(std::move(config))
+        , YsonType_(ysonType)
+    {
+        // yaml_parser_initialize reports failure by returning 0; going on with
+        // a parser that failed to initialize is not safe, so bail out right away.
+        if (yaml_parser_initialize(&Parser_) == 0) {
+            THROW_ERROR_EXCEPTION("Failed to initialize YAML parser");
+        }
+        yaml_parser_set_input(&Parser_, &ReadHandler, this);
+    }
+
+ void Parse()
+ {
+ VisitStream();
+ }
+
+private:
+ IInputStream* Input_;
+ TAnchorRecordingConsumer Consumer_;
+ TYamlFormatConfigPtr Config_;
+ EYsonType YsonType_;
+
+ TLibYamlParser Parser_;
+
+ TError ReadError_;
+
+ TLibYamlEvent Event_;
+
+ //! Convenience helper to get rid of the ugly casts.
+ EYamlEventType GetEventType() const
+ {
+ return static_cast<EYamlEventType>(Event_.type);
+ }
+
+    //! Read callback handed to libyaml; #data is the TYamlParser registered in the constructor.
+    //! Reads up to #size bytes from the input stream into #buffer and stores the number
+    //! of bytes read in #sizeRead. Returns 1 on success (including EOF) and 0 if the read threw;
+    //! in the latter case the exception is kept in ReadError_ for later reporting.
+    static int ReadHandler(void* data, unsigned char* buffer, size_t size, size_t* sizeRead)
+    {
+        // The pointer was obtained from a TYamlParser*, so a static_cast is sufficient.
+        auto* yamlParser = static_cast<TYamlParser*>(data);
+        auto* input = yamlParser->Input_;
+
+        try {
+            // IInputStream is similar to yaml_read_handler_t interface
+            // in EOF case: former returns 0 from Read(), and latter
+            // expects handler to set size_read to 0 and return 1
+            *sizeRead = input->Read(buffer, size);
+            return 1;
+        } catch (const std::exception& ex) {
+            // We do not expect the read handler to be called after an error.
+            YT_ASSERT(yamlParser->ReadError_.IsOK());
+            yamlParser->ReadError_ = TError(ex);
+            // Not really used by libyaml, but let's set it to 0 just in case.
+            *sizeRead = 0;
+            return 0;
+        }
+    }
+
+ //! A wrapper around C-style libyaml API calls that return 0 on error which
+ //! throws an exception in case of an error.
+ int SafeInvoke(auto* method, auto... args)
+ {
+ int result = method(args...);
+ if (result == 0) {
+ ThrowError();
+ }
+ return result;
+ }
+
+ //! Throw an exception formed from the parser state (problem description, offset, value
+ //! and marks) and possibly the exception caught in the last read handler call.
+ [[noreturn]] void ThrowError()
+ {
+ // Unfortunately, libyaml may sometimes set error = YAML_NO_ERROR. This may lead
+ // to unclear exceptions during parsing.
+ auto yamlErrorType = static_cast<EYamlErrorType>(Parser_.error);
+ auto error = TError("YAML parser error: %v", Parser_.problem)
+ << TErrorAttribute("yaml_error_type", yamlErrorType)
+ << TErrorAttribute("problem_offset", Parser_.problem_offset)
+ << TErrorAttribute("problem_value", Parser_.problem_value)
+ << TErrorAttribute("problem_mark", Parser_.problem_mark);
+ // Context is attached only when libyaml provides one.
+ if (Parser_.context) {
+ error <<= TErrorAttribute("context", Parser_.context);
+ error <<= TErrorAttribute("context_mark", Parser_.context_mark);
+ }
+ // Attach the error remembered by ReadHandler, if any.
+ if (!ReadError_.IsOK()) {
+ error <<= ReadError_;
+ }
+
+ THROW_ERROR error;
+ }
+
+ //! Pull the next event from the parser into Event_ and check that it is one of the expected types.
+ void PullEvent(std::initializer_list<EYamlEventType> expectedTypes)
+ {
+ Event_.Reset();
+ SafeInvoke(yaml_parser_parse, &Parser_, &Event_);
+ for (const auto expectedType : expectedTypes) {
+ if (GetEventType() == expectedType) {
+ return;
+ }
+ }
+ // TODO(max42): stack and position!
+ THROW_ERROR_EXCEPTION(
+ "Unexpected event type %Qlv, expected one of %Qlv",
+ GetEventType(),
+ std::vector(expectedTypes));
+ }
+
+ void VisitStream()
+ {
+ PullEvent({EYamlEventType::StreamStart});
+ while (true) {
+ PullEvent({EYamlEventType::DocumentStart, EYamlEventType::StreamEnd});
+ if (GetEventType() == EYamlEventType::StreamEnd) {
+ break;
+ }
+ if (YsonType_ == EYsonType::ListFragment) {
+ Consumer_.OnListItem();
+ }
+ VisitDocument();
+ }
+ }
+
+ void VisitDocument()
+ {
+ PullEvent({
+ EYamlEventType::Scalar,
+ EYamlEventType::SequenceStart,
+ EYamlEventType::MappingStart,
+ EYamlEventType::Alias,
+ });
+ VisitNode();
+ PullEvent({EYamlEventType::DocumentEnd});
+ }
+
+    //! Dispatches on the type of the event currently held in Event_ (scalar, sequence start,
+    //! mapping start or alias), registering the node's anchor first if it has one.
+    void VisitNode()
+    {
+        auto maybeOnAnchor = [&] (yaml_char_t* anchor) {
+            if (anchor) {
+                Consumer_.OnAnchor(std::string(YamlLiteralToStringView(anchor)));
+            }
+        };
+        // NB: Event_.data is a union, so the anchor must be read through the member
+        // that corresponds to the actual event type.
+        switch (GetEventType()) {
+            case EYamlEventType::Scalar:
+                maybeOnAnchor(Event_.data.scalar.anchor);
+                VisitScalar();
+                break;
+            case EYamlEventType::SequenceStart:
+                maybeOnAnchor(Event_.data.sequence_start.anchor);
+                VisitSequence();
+                break;
+            case EYamlEventType::MappingStart:
+                maybeOnAnchor(Event_.data.mapping_start.anchor);
+                VisitMapping(/*isAttributes*/ false);
+                break;
+            case EYamlEventType::Alias:
+                Consumer_.OnAlias(std::string(YamlLiteralToStringView(Event_.data.alias.anchor)));
+                break;
+            default:
+                YT_ABORT();
+        }
+    }
+
+ void VisitScalar()
+ {
+ auto scalar = Event_.data.scalar;
+ auto yamlValue = YamlLiteralToStringView(scalar.value, scalar.length);
+
+ // According to YAML spec, there are two non-specific tags "!" and "?", and all other
+ // tags are specific.
+ //
+ // If the tag is missing, parser should assign tag "!" to non-plain (quoted) scalars,
+ // and "?" to plain scalars and collection nodes. For some reason, libyaml does not
+ // do that for us.
+ //
+ // Then, "!"-tagged scalars should always be treated as strings, i.e. "!" -> YT string.
+ //
+ // Specific tags are either recognized by us, in which case we deduce a corresponding YT type,
+ // or we assign a string type otherwise.
+ //
+ // For the "?"-tagged scalars we perform the type deduction based on the scalar value
+ // (which is the most often case, as almost nobody uses type tags in YAML).
+ //
+ // Cf. https://yaml.org/spec/1.2.2/#332-resolved-tags
+ std::string_view tag;
+ if (scalar.tag) {
+ tag = YamlLiteralToStringView(scalar.tag);
+ } else if (scalar.style != YAML_PLAIN_SCALAR_STYLE) {
+ tag = "!";
+ } else {
+ tag = "?";
+ }
+
+ EYamlScalarType yamlType;
+ if (tag != "?") {
+ yamlType = DeduceScalarTypeFromTag(tag);
+ } else {
+ yamlType = DeduceScalarTypeFromValue(yamlValue);
+ }
+ auto [ytType, nonStringScalar] = ParseScalarValue(yamlValue, yamlType);
+ switch (ytType) {
+ case ENodeType::String:
+ Consumer_.OnStringScalar(yamlValue);
+ break;
+ case ENodeType::Int64:
+ Consumer_.OnInt64Scalar(nonStringScalar.Int64);
+ break;
+ case ENodeType::Uint64:
+ Consumer_.OnUint64Scalar(nonStringScalar.Uint64);
+ break;
+ case ENodeType::Double:
+ Consumer_.OnDoubleScalar(nonStringScalar.Double);
+ break;
+ case ENodeType::Boolean:
+ Consumer_.OnBooleanScalar(nonStringScalar.Boolean);
+ break;
+ case ENodeType::Entity:
+ Consumer_.OnEntity();
+ break;
+ default:
+ YT_ABORT();
+ }
+ }
+
+    //! Handles a sequence whose start event is currently held in Event_: either
+    //! a plain YSON list or, for a yt/attrnode-tagged sequence, a node with attributes.
+    void VisitSequence()
+    {
+        // NB: YSON node with attributes is represented as a yt/attrnode-tagged YAML sequence,
+        // so handle it as a special case.
+        // The current event is a sequence start, so the tag is read via sequence_start.
+        if (YamlLiteralToStringView(Event_.data.sequence_start.tag) == YTAttrNodeTag) {
+            VisitNodeWithAttributes();
+            return;
+        }
+
+        Consumer_.OnBeginList();
+        while (true) {
+            PullEvent({
+                EYamlEventType::SequenceEnd,
+                EYamlEventType::SequenceStart,
+                EYamlEventType::MappingStart,
+                EYamlEventType::Scalar,
+                EYamlEventType::Alias
+            });
+            if (GetEventType() == EYamlEventType::SequenceEnd) {
+                break;
+            }
+            Consumer_.OnListItem();
+            VisitNode();
+        }
+        Consumer_.OnEndList();
+    }
+
+ void VisitNodeWithAttributes()
+ {
+ PullEvent({EYamlEventType::MappingStart});
+ VisitMapping(/*isAttributes*/ true);
+
+ PullEvent({
+ EYamlEventType::Scalar,
+ EYamlEventType::SequenceStart,
+ EYamlEventType::MappingStart,
+ EYamlEventType::Alias,
+ });
+ VisitNode();
+
+ PullEvent({EYamlEventType::SequenceEnd});
+ }
+
+ void VisitMapping(bool isAttributes)
+ {
+ isAttributes ? Consumer_.OnBeginAttributes() : Consumer_.OnBeginMap();
+ while (true) {
+ PullEvent({
+ EYamlEventType::MappingEnd,
+ EYamlEventType::Scalar,
+ // Yes, YAML is weird enough to support aliases as keys!
+ EYamlEventType::Alias,
+ });
+ if (GetEventType() == EYamlEventType::MappingEnd) {
+ break;
+ } else if (GetEventType() == EYamlEventType::Alias) {
+ THROW_ERROR_EXCEPTION("Using alias as a map key is not supported");
+ } else {
+ if (Event_.data.scalar.anchor) {
+ THROW_ERROR_EXCEPTION("Putting anchors on map keys is not supported");
+ }
+ auto key = YamlLiteralToStringView(Event_.data.scalar.value, Event_.data.scalar.length);
+ Consumer_.OnKeyedItem(key);
+ }
+
+ PullEvent({
+ EYamlEventType::Scalar,
+ EYamlEventType::SequenceStart,
+ EYamlEventType::MappingStart,
+ EYamlEventType::Alias,
+ });
+ VisitNode();
+ }
+ isAttributes ? Consumer_.OnEndAttributes() : Consumer_.OnEndMap();
+ }
+};
+
+//! Parses the whole YAML stream from #input and feeds the result to #consumer.
+//! Any parser error propagates to the caller as an exception.
+void ParseYaml(
+    IInputStream* input,
+    IYsonConsumer* consumer,
+    TYamlFormatConfigPtr config,
+    EYsonType ysonType)
+{
+    // #config is taken by value, so hand it over instead of copying the ref-counted pointer again.
+    TYamlParser parser(input, consumer, std::move(config), ysonType);
+    parser.Parse();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT::NFormats
diff --git a/yt/yt/library/formats/yaml_parser.h b/yt/yt/library/formats/yaml_parser.h
new file mode 100644
index 0000000000..6cdfc4dab6
--- /dev/null
+++ b/yt/yt/library/formats/yaml_parser.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <yt/yt/client/formats/public.h>
+
+#include <yt/yt/core/yson/public.h>
+
+namespace NYT::NFormats {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ //! Parses a YAML stream read from #input in pull mode and reports the parsed structure to #consumer (may be used for structured driver commands).
+ void ParseYaml(
+ IInputStream* input,
+ NYson::IYsonConsumer* consumer,
+ TYamlFormatConfigPtr config,
+ NYson::EYsonType ysonType);
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT::NFormats
diff --git a/yt/yt/library/formats/yaml_writer.cpp b/yt/yt/library/formats/yaml_writer.cpp
new file mode 100644
index 0000000000..3d2961d46a
--- /dev/null
+++ b/yt/yt/library/formats/yaml_writer.cpp
@@ -0,0 +1,388 @@
+#include "yaml_writer.h"
+
+#include "helpers.h"
+#include "yaml_helpers.h"
+
+#include <yt/yt/client/formats/config.h>
+
+#include <contrib/libs/yaml/include/yaml.h>
+
+namespace NYT::NFormats {
+
+using namespace NYson;
+
+////////////////////////////////////////////////////////////////////////////
+
+class TYamlWriter
+ : public TFormatsConsumerBase
+{
+public:
+ //! Prepares the libyaml emitter and immediately opens the YAML stream.
+ //! The YSON type argument is accepted but not used.
+ TYamlWriter(
+     IOutputStream* output,
+     NYson::EYsonType /*type*/,
+     TYamlFormatConfigPtr config)
+     : Output_(output)
+     , Config_(config)
+ {
+     SafeInvoke(yaml_emitter_initialize, &Emitter_);
+     // Everything libyaml produces is routed through WriteHandler, with this object as its context.
+     yaml_emitter_set_output(&Emitter_, &WriteHandler, this);
+     EmitEvent(yaml_stream_start_event_initialize, YAML_ANY_ENCODING);
+ }
+
+ //! Makes libyaml pass the characters it has accumulated so far to the write handler.
+ //! Throws if libyaml reports a failure.
+ void Flush() override
+ {
+     SafeInvoke(yaml_emitter_flush, &Emitter_);
+ }
+
+ //! Emits a string value, forcing quotes when a plain rendering would be read back as another type.
+ void OnStringScalar(TStringBuf value) override
+ {
+     OnNodeEnter();
+
+     // Normally the choice of style is left to libyaml: it emits a plain (unquoted) scalar
+     // and switches to a quoted style by itself whenever YAML syntax requires that.
+     //
+     // This is not enough when the plain text matches one of the Core YAML schema regexps,
+     // since reasonable parsers would then interpret it as int/float/bool/null instead of
+     // a string. PyYAML and Go YAML solve this by deducing the type of the unquoted
+     // representation and quoting the scalar whenever that type is not a string.
+     // The same approach is used here.
+     const auto scalarStyle = DeduceScalarTypeFromValue(value) == EYamlScalarType::String
+         ? YAML_ANY_SCALAR_STYLE
+         : YAML_DOUBLE_QUOTED_SCALAR_STYLE;
+
+     EmitEvent(
+         yaml_scalar_event_initialize,
+         /*anchor*/ nullptr,
+         /*tag*/ nullptr,
+         reinterpret_cast<const yaml_char_t*>(value.Data()),
+         value.Size(),
+         /*plain_implicit*/ 1,
+         /*quoted_implicit*/ 1,
+         scalarStyle);
+     OnNodeLeave();
+ }
+
+ //! Emits a signed integer as a plain decimal scalar.
+ void OnInt64Scalar(i64 value) override
+ {
+     OnNodeEnter();
+
+     // No quoting and no tag are needed: the Core YAML schema regexps make all reasonable
+     // YAML parsers interpret such a plain scalar as an integer.
+     // Cf. https://yaml.org/spec/1.2.2/#1032-tag-resolution
+     char digits[64];
+     auto digitCount = IntToString<10>(value, digits, sizeof(digits));
+
+     EmitEvent(
+         yaml_scalar_event_initialize,
+         /*anchor*/ nullptr,
+         /*tag*/ nullptr,
+         reinterpret_cast<yaml_char_t*>(digits),
+         digitCount,
+         /*plain_implicit*/ 1,
+         /*quoted_implicit*/ 0,
+         YAML_PLAIN_SCALAR_STYLE);
+     OnNodeLeave();
+ }
+
+ //! Emits an unsigned integer as a plain decimal scalar, optionally marked with the YT uint tag.
+ void OnUint64Scalar(ui64 value) override
+ {
+     OnNodeEnter();
+
+     // By default the representation is the same as for Int64: a plain (unquoted) scalar.
+     // The custom "!yt/uint64" tag may be requested via the config to preserve the fact that
+     // the value is unsigned; this helps to control signedness upon writing and keeps
+     // the YT -> YAML -> YT roundtrip consistent.
+     char digits[64];
+     auto digitCount = IntToString<10>(value, digits, sizeof(digits));
+
+     EmitEvent(
+         yaml_scalar_event_initialize,
+         /*anchor*/ nullptr,
+         /*tag*/ reinterpret_cast<const yaml_char_t*>(YTUintTag.data()),
+         reinterpret_cast<yaml_char_t*>(digits),
+         digitCount,
+         // The tag is always supplied; in libyaml API the plain_implicit flag tells the writer
+         // whether to omit it.
+         /*plain_implicit*/ !Config_->WriteUintTag,
+         /*quoted_implicit*/ 0,
+         YAML_PLAIN_SCALAR_STYLE);
+     OnNodeLeave();
+ }
+
+ //! Emits a floating-point value as a plain scalar; the text is produced by DoubleToYamlString.
+ void OnDoubleScalar(double value) override
+ {
+     OnNodeEnter();
+
+     // Just like Int64 scalars, doubles are written unquoted.
+     char text[512];
+     auto textLength = DoubleToYamlString(value, text, sizeof(text));
+
+     EmitEvent(
+         yaml_scalar_event_initialize,
+         /*anchor*/ nullptr,
+         /*tag*/ reinterpret_cast<const yaml_char_t*>(YAML_FLOAT_TAG),
+         reinterpret_cast<yaml_char_t*>(text),
+         textLength,
+         /*plain_implicit*/ 1,
+         /*quoted_implicit*/ 0,
+         YAML_PLAIN_SCALAR_STYLE);
+     OnNodeLeave();
+ }
+
+ //! Emits a boolean as a plain `true` / `false` scalar.
+ void OnBooleanScalar(bool value) override
+ {
+     OnNodeEnter();
+
+     // String literals have static storage duration, so a plain view over them is enough.
+     const std::string_view literal = value ? "true" : "false";
+
+     EmitEvent(
+         yaml_scalar_event_initialize,
+         /*anchor*/ nullptr,
+         /*tag*/ reinterpret_cast<const yaml_char_t*>(YAML_BOOL_TAG),
+         // libyaml takes the scalar value as a pointer to const (OnStringScalar relies on that
+         // as well), hence constness does not have to be cast away.
+         reinterpret_cast<const yaml_char_t*>(literal.data()),
+         literal.size(),
+         /*plain_implicit*/ 1,
+         /*quoted_implicit*/ 0,
+         YAML_PLAIN_SCALAR_STYLE);
+     OnNodeLeave();
+ }
+
+ //! Emits a YSON entity as a plain `null` scalar.
+ void OnEntity() override
+ {
+     OnNodeEnter();
+
+     // String literals have static storage duration, so a plain view over one is enough.
+     const std::string_view literal = "null";
+
+     EmitEvent(
+         yaml_scalar_event_initialize,
+         /*anchor*/ nullptr,
+         /*tag*/ reinterpret_cast<const yaml_char_t*>(YAML_NULL_TAG),
+         // libyaml takes the scalar value as a pointer to const (OnStringScalar relies on that
+         // as well), hence constness does not have to be cast away.
+         reinterpret_cast<const yaml_char_t*>(literal.data()),
+         literal.size(),
+         /*plain_implicit*/ 1,
+         /*quoted_implicit*/ 0,
+         YAML_PLAIN_SCALAR_STYLE);
+     OnNodeLeave();
+ }
+
+ //! Opens an untagged YAML sequence; the sequence style is left for libyaml to choose.
+ void OnBeginList() override
+ {
+     OnNodeEnter();
+     EmitEvent(
+         yaml_sequence_start_event_initialize,
+         /*anchor*/ nullptr,
+         /*tag*/ nullptr,
+         /*implicit*/ 1,
+         YAML_ANY_SEQUENCE_STYLE);
+ }
+
+ //! Intentionally does nothing: no event is emitted for a list item boundary,
+ //! the items themselves emit their own events.
+ void OnListItem() override
+ { }
+
+ //! Closes the sequence opened by OnBeginList and leaves the list node.
+ void OnEndList() override
+ {
+     EmitEvent(yaml_sequence_end_event_initialize);
+     OnNodeLeave();
+ }
+
+ //! Opens an untagged YAML mapping; the mapping style is left for libyaml to choose.
+ void OnBeginMap() override
+ {
+     OnNodeEnter();
+     EmitEvent(
+         yaml_mapping_start_event_initialize,
+         /*anchor*/ nullptr,
+         /*tag*/ nullptr,
+         /*implicit*/ 1,
+         YAML_ANY_MAPPING_STYLE);
+ }
+
+ //! Emits a map (or attribute) key exactly the way a string value is emitted,
+ //! including the enter/leave bookkeeping done by OnStringScalar.
+ void OnKeyedItem(TStringBuf key) override
+ {
+     OnStringScalar(key);
+ }
+
+ //! Closes the mapping opened by OnBeginMap and leaves the map node.
+ void OnEndMap() override
+ {
+     EmitEvent(yaml_mapping_end_event_initialize);
+     OnNodeLeave();
+ }
+
+ //! Starts a node with attributes.
+ void OnBeginAttributes() override
+ {
+     // NB: node with attributes in YAML is represented as a yt/attrnode-tagged 2-item sequence.
+     // The sequence is opened here together with its first item, the attribute mapping;
+     // the mapping is closed by OnEndAttributes, while the sequence itself is closed
+     // by OnNodeLeave once the value node is complete.
+     OnNodeEnter();
+     EmitEvent(
+         yaml_sequence_start_event_initialize,
+         /*anchor*/ nullptr,
+         /*tag*/ reinterpret_cast<const yaml_char_t*>(YTAttrNodeTag.data()),
+         /*implicit*/ 0,
+         YAML_ANY_SEQUENCE_STYLE);
+     EmitEvent(
+         yaml_mapping_start_event_initialize,
+         /*anchor*/ nullptr,
+         /*tag*/ nullptr,
+         /*implicit*/ 1,
+         YAML_ANY_MAPPING_STYLE);
+ }
+
+ //! Finishes the attribute mapping and arranges for the enclosing 2-item sequence
+ //! to be closed once the value node is complete.
+ void OnEndAttributes() override
+ {
+     EmitEvent(yaml_mapping_end_event_initialize);
+     // Leaving in "attributes" mode raises ImmediatelyAfterAttributes_ and suppresses
+     // the document end event, since the value node is yet to come.
+     OnNodeLeave(/*isAttributes*/ true);
+     // The value node is entered from the current depth; when it is left and the depth
+     // returns to this value, OnNodeLeave emits the pending sequence end.
+     DepthsWithPendingValueClosure_.push_back(CurrentDepth_);
+ }
+
+private:
+ using TEmitterPtr = std::unique_ptr<yaml_emitter_t, decltype(&yaml_emitter_delete)>;
+ using TEventPtr = std::unique_ptr<yaml_event_t, decltype(&yaml_event_delete)>;
+
+ IOutputStream* Output_;
+ TYamlFormatConfigPtr Config_;
+ TLibYamlEmitter Emitter_;
+ TError WriteError_;
+
+ // Utilities for tracking the current depth and the stack of depths at which
+ // we must perform extra sequence closure due to yt/attrnode-tagged 2-item sequence convention.
+
+ //! The depth of the current node in the YSON tree.
+ int CurrentDepth_ = 0;
+ //! A stack of depths at which attributes are present.
+ std::vector<int> DepthsWithPendingValueClosure_ = {-1};
+ //! A flag indicating that the we are immediately after the OnEndAttributes() event.
+ bool ImmediatelyAfterAttributes_ = false;
+
+ //! Output callback registered with libyaml: forwards #size bytes of #buffer to the output stream.
+ //! Returns 1 on success and 0 on failure; in the latter case the caught exception is stored
+ //! in WriteError_ of the writer passed via #data.
+ static int WriteHandler(void* data, unsigned char* buffer, size_t size)
+ {
+     auto* self = static_cast<TYamlWriter*>(data);
+     auto* stream = self->Output_;
+
+     try {
+         stream->Write(buffer, size);
+         return 1;
+     } catch (const std::exception& ex) {
+         // The handler is not expected to be invoked again once it has failed.
+         YT_ASSERT(self->WriteError_.IsOK());
+         self->WriteError_ = TError(ex);
+         return 0;
+     }
+ }
+
+ //! Calls a C-style libyaml function that signals failure by returning 0
+ //! and converts such a failure into an exception (see ThrowError).
+ //! A non-zero result is passed through to the caller.
+ int SafeInvoke(auto* method, auto... args)
+ {
+     const int status = method(args...);
+     if (status == 0) {
+         ThrowError();
+     }
+     return status;
+ }
+
+ //! Throws an exception built from the emitter state; the exception caught
+ //! in the write handler, if there is one, is attached to it.
+ void ThrowError()
+ {
+     // Unfortunately, libyaml may sometimes report YAML_NO_ERROR here, in which case
+     // the resulting exception is not very informative.
+     auto error = TError("YAML emitter error: %v", Emitter_.problem)
+         << TErrorAttribute("yaml_error_type", static_cast<EYamlErrorType>(Emitter_.error));
+
+     if (!WriteError_.IsOK()) {
+         error <<= WriteError_;
+     }
+
+     THROW_ERROR error;
+ }
+
+ //! Builds an event with the given libyaml initializer (called with #args) and passes it
+ //! to the emitter. Throws if either step fails.
+ void EmitEvent(auto* eventInitializer, auto... args)
+ {
+     yaml_event_t yamlEvent;
+     // A failed initializer is guaranteed to release everything it has acquired,
+     // so no cleanup is needed before the exception propagates.
+     SafeInvoke(eventInitializer, &yamlEvent, args...);
+     // NOTE(review): per the libyaml documentation the emitter takes over the event
+     // and destroys it even when emitting fails, so no cleanup is needed here either.
+     SafeInvoke(yaml_emitter_emit, &Emitter_, &yamlEvent);
+ }
+
+ //! Bookkeeping performed at the start of every node (map keys included).
+ void OnNodeEnter()
+ {
+     // A node at depth 0 starts a new document, unless it is the value that follows
+     // the attributes of the root node: in that case the document is already open.
+     const bool startsDocument = CurrentDepth_ == 0 && !ImmediatelyAfterAttributes_;
+     if (startsDocument) {
+         EmitEvent(
+             yaml_document_start_event_initialize,
+             /*version_directive*/ nullptr,
+             /*tag_directives_start*/ nullptr,
+             /*tag_directives_end*/ nullptr,
+             /*implicit*/ 1);
+     }
+
+     ++CurrentDepth_;
+     ImmediatelyAfterAttributes_ = false;
+ }
+
+ //! Bookkeeping performed at the end of every node; #isAttributes is set when the node
+ //! being left is an attribute mapping rather than a complete node.
+ void OnNodeLeave(bool isAttributes = false)
+ {
+     --CurrentDepth_;
+
+     // The value of a node with attributes has just been completed at this depth:
+     // close the 2-item sequence wrapping the attributes and the value.
+     if (DepthsWithPendingValueClosure_.back() == CurrentDepth_) {
+         EmitEvent(yaml_sequence_end_event_initialize);
+         DepthsWithPendingValueClosure_.pop_back();
+     }
+
+     if (isAttributes) {
+         // The value node is still to come, so the document (if we are at the root) stays open.
+         ImmediatelyAfterAttributes_ = true;
+         return;
+     }
+
+     // Leaving the root node ends the document.
+     if (CurrentDepth_ == 0) {
+         EmitEvent(yaml_document_end_event_initialize, /*implicit*/ 1);
+     }
+ }
+
+ //! Writes the YAML text of #value into #buf (of capacity #size) and returns the text length.
+ size_t DoubleToYamlString(double value, char* buf, size_t size)
+ {
+     // Two things need extra care: non-finite values (NaN, Inf, -Inf) and values whose
+     // text could be parsed back as an integer. Both are handled similarly to the YSON writer.
+     // Cf. NYson::TUncheckedYsonTokenWriter::WriteTextDouble.
+
+     if (!std::isfinite(value)) {
+         // A non-finite value that is not NaN is an infinity, so its sign settles the literal.
+         std::string_view literal = "-.inf";
+         if (std::isnan(value)) {
+             literal = ".nan";
+         } else if (value > 0) {
+             literal = ".inf";
+         }
+         // The views are backed by string literals, so the terminating zero is copied along.
+         YT_VERIFY(literal.size() + 1 <= size);
+         ::memcpy(buf, literal.data(), literal.size() + 1);
+         return literal.size();
+     }
+
+     auto length = FloatToString(value, buf, size);
+     const std::string_view text(buf, length);
+     const bool looksLikeInteger =
+         text.find('.') == std::string_view::npos &&
+         text.find('e') == std::string_view::npos;
+     if (looksLikeInteger) {
+         // Append a dot so that the text is not mistaken for an integer.
+         YT_VERIFY(length + 1 <= size);
+         buf[length++] = '.';
+     }
+     return length;
+ }
+};
+
+ std::unique_ptr<IFlushableYsonConsumer> CreateYamlWriter(
+     IZeroCopyOutput* output,
+     NYson::EYsonType type,
+     TYamlFormatConfigPtr config)
+ {
+     // The writer takes the output as a plain IOutputStream*: the interface of the yaml library
+     // in use is not zero-copy by its nature, so the narrowing loses nothing.
+     std::unique_ptr<IFlushableYsonConsumer> writer = std::make_unique<TYamlWriter>(output, type, config);
+     return writer;
+ }
+
+////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT::NFormats
diff --git a/yt/yt/library/formats/yaml_writer.h b/yt/yt/library/formats/yaml_writer.h
new file mode 100644
index 0000000000..c3e7fb76fd
--- /dev/null
+++ b/yt/yt/library/formats/yaml_writer.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <yt/yt/core/yson/consumer.h>
+
+#include <yt/yt/client/formats/public.h>
+
+namespace NYT::NFormats {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ std::unique_ptr<NYson::IFlushableYsonConsumer> CreateYamlWriter( // Creates a consumer that renders the YSON events it receives as YAML.
+ IZeroCopyOutput* output, // Stream that receives the emitted YAML text.
+ NYson::EYsonType type, // NOTE(review): the writer implementation in yaml_writer.cpp leaves this argument unused.
+ TYamlFormatConfigPtr config); // Format options; the writer consults WriteUintTag when emitting uint64 values.
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT::NFormats