diff options
author | Maxim Akhmedov <max@tracto.ai> | 2024-11-28 15:52:58 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2024-11-28 16:04:24 +0300 |
commit | d914da33a8058c17411fe1c33b6deed930f29450 (patch) | |
tree | 8243136dda861b60d20f5844fa5d8e3f6eb7006e | |
parent | 667d7a8073070f148e1adc7650b55e7a7ef33439 (diff) | |
download | ydb-d914da33a8058c17411fe1c33b6deed930f29450.tar.gz |
Introduce YAML format support.
* Changelog entry
Type: feature
Component: proxy
Support YAML format for structured data.
See more details in RFC:
https://github.com/ytsaurus/ytsaurus/wiki/%5BRFC%5D-YAML-format-support
---
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/938
commit_hash:2c6c1fbd1e3d1b83182a430b537c802eb8c6b79d
-rw-r--r-- | yt/yt/client/formats/config.cpp | 8 | ||||
-rw-r--r-- | yt/yt/client/formats/config.h | 20 | ||||
-rw-r--r-- | yt/yt/client/formats/public.h | 2 | ||||
-rw-r--r-- | yt/yt/library/formats/format.cpp | 37 | ||||
-rw-r--r-- | yt/yt/library/formats/ya.make | 4 | ||||
-rw-r--r-- | yt/yt/library/formats/yaml_helpers.cpp | 209 | ||||
-rw-r--r-- | yt/yt/library/formats/yaml_helpers.h | 114 | ||||
-rw-r--r-- | yt/yt/library/formats/yaml_parser.cpp | 562 | ||||
-rw-r--r-- | yt/yt/library/formats/yaml_parser.h | 20 | ||||
-rw-r--r-- | yt/yt/library/formats/yaml_writer.cpp | 388 | ||||
-rw-r--r-- | yt/yt/library/formats/yaml_writer.h | 18 |
11 files changed, 1382 insertions, 0 deletions
diff --git a/yt/yt/client/formats/config.cpp b/yt/yt/client/formats/config.cpp index 71c1152061..ed4ee65a73 100644 --- a/yt/yt/client/formats/config.cpp +++ b/yt/yt/client/formats/config.cpp @@ -352,4 +352,12 @@ void TSkiffFormatConfig::Register(TRegistrar registrar) //////////////////////////////////////////////////////////////////////////////// +void TYamlFormatConfig::Register(TRegistrar registrar) +{ + registrar.Parameter("write_uint_tag", &TThis::WriteUintTag) + .Default(false); +} + +//////////////////////////////////////////////////////////////////////////////// + } // namespace NYT::NFormats diff --git a/yt/yt/client/formats/config.h b/yt/yt/client/formats/config.h index aec3f30db4..84b4164b12 100644 --- a/yt/yt/client/formats/config.h +++ b/yt/yt/client/formats/config.h @@ -415,4 +415,24 @@ DEFINE_REFCOUNTED_TYPE(TSkiffFormatConfig) //////////////////////////////////////////////////////////////////////////////// +class TYamlFormatConfig + : public NYTree::TYsonStruct +{ +public: + //! Write explicit tag "!yt/uint64" for uint64 data type. + //! Use this option if you want to preserve information about + //! the original YT type (without it, numbers in range [0, 2^63-1] + //! will always be written as integers). + //! Option has no effect for parsing. 
+ bool WriteUintTag; + + REGISTER_YSON_STRUCT(TYamlFormatConfig); + + static void Register(TRegistrar registrar); +}; + +DEFINE_REFCOUNTED_TYPE(TYamlFormatConfig) + +//////////////////////////////////////////////////////////////////////////////// + } // namespace NYT::NFormats diff --git a/yt/yt/client/formats/public.h b/yt/yt/client/formats/public.h index 0ac2a23471..753456699b 100644 --- a/yt/yt/client/formats/public.h +++ b/yt/yt/client/formats/public.h @@ -58,6 +58,7 @@ DEFINE_ENUM(EFormatType, (WebJson) (Skiff) (Arrow) + (Yaml) ); //////////////////////////////////////////////////////////////////////////////// @@ -76,6 +77,7 @@ DECLARE_REFCOUNTED_CLASS(TProtobufTableConfig) DECLARE_REFCOUNTED_CLASS(TProtobufFormatConfig) DECLARE_REFCOUNTED_CLASS(TWebJsonFormatConfig) DECLARE_REFCOUNTED_CLASS(TSkiffFormatConfig) +DECLARE_REFCOUNTED_CLASS(TYamlFormatConfig) DECLARE_REFCOUNTED_STRUCT(IYamrConsumer) diff --git a/yt/yt/library/formats/format.cpp b/yt/yt/library/formats/format.cpp index 6da0985abf..f551b9136e 100644 --- a/yt/yt/library/formats/format.cpp +++ b/yt/yt/library/formats/format.cpp @@ -12,6 +12,8 @@ #include "schemaless_writer_adapter.h" #include "skiff_parser.h" #include "skiff_writer.h" +#include "yaml_parser.h" +#include "yaml_writer.h" #include "yamred_dsv_parser.h" #include "yamred_dsv_writer.h" #include "yamr_parser.h" @@ -108,6 +110,18 @@ std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForDsv( }; } +std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForYaml( + EDataType dataType, + const IAttributeDictionary& attributes, + IZeroCopyOutput* output) +{ + if (dataType != EDataType::Structured) { + THROW_ERROR_EXCEPTION("YAML is supported only for structured data"); + } + auto config = ConvertTo<TYamlFormatConfigPtr>(&attributes); + return CreateYamlWriter(output, DataTypeToYsonType(dataType), config); +} + class TTableParserAdapter : public IParser { @@ -161,6 +175,8 @@ std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForFormat( return 
CreateConsumerForJson(dataType, format.Attributes(), output); case EFormatType::Dsv: return CreateConsumerForDsv(dataType, format.Attributes(), output); + case EFormatType::Yaml: + return CreateConsumerForYaml(dataType, format.Attributes(), output); default: THROW_ERROR_EXCEPTION("Unsupported output format %Qlv", format.GetType()); @@ -408,6 +424,21 @@ TYsonProducer CreateProducerForJson( }); } +TYsonProducer CreateProducerForYaml( + EDataType dataType, + const IAttributeDictionary& attributes, + IInputStream* input) +{ + if (dataType != EDataType::Structured) { + THROW_ERROR_EXCEPTION("YAML is supported only for structured data"); + } + auto ysonType = DataTypeToYsonType(dataType); + auto config = ConvertTo<TYamlFormatConfigPtr>(&attributes); + return BIND([=] (IYsonConsumer* consumer) { + ParseYaml(input, consumer, config, ysonType); + }); +} + TYsonProducer CreateProducerForYson(EDataType dataType, IInputStream* input) { auto ysonType = DataTypeToYsonType(dataType); @@ -429,6 +460,8 @@ TYsonProducer CreateProducerForFormat(const TFormat& format, EDataType dataType, return CreateProducerForYamredDsv(dataType, format.Attributes(), input); case EFormatType::SchemafulDsv: return CreateProducerForSchemafulDsv(dataType, format.Attributes(), input); + case EFormatType::Yaml: + return CreateProducerForYaml(dataType, format.Attributes(), input); default: THROW_ERROR_EXCEPTION("Unsupported input format %Qlv", format.GetType()); @@ -489,6 +522,10 @@ std::unique_ptr<IParser> CreateParserForFormat(const TFormat& format, EDataType auto config = ConvertTo<TSchemafulDsvFormatConfigPtr>(&format.Attributes()); return CreateParserForSchemafulDsv(consumer, config); } + case EFormatType::Yaml: + // We can only get here with EDataType::Tabular, so throw specific error about supporting + // only structured data in YAML. 
+ THROW_ERROR_EXCEPTION("YAML is supported only for structured data"); default: THROW_ERROR_EXCEPTION("Unsupported input format %Qlv", format.GetType()); diff --git a/yt/yt/library/formats/ya.make b/yt/yt/library/formats/ya.make index 58c8f28f9d..b15a6a04e5 100644 --- a/yt/yt/library/formats/ya.make +++ b/yt/yt/library/formats/ya.make @@ -22,6 +22,9 @@ SRCS( skiff_yson_converter.cpp unversioned_value_yson_writer.cpp web_json_writer.cpp + yaml_helpers.cpp + yaml_parser.cpp + yaml_writer.cpp yamred_dsv_parser.cpp yamred_dsv_writer.cpp yamr_parser_base.cpp @@ -40,6 +43,7 @@ PEERDIR( yt/yt/library/column_converters contrib/libs/apache/arrow + contrib/libs/yaml ) END() diff --git a/yt/yt/library/formats/yaml_helpers.cpp b/yt/yt/library/formats/yaml_helpers.cpp new file mode 100644 index 0000000000..1797514767 --- /dev/null +++ b/yt/yt/library/formats/yaml_helpers.cpp @@ -0,0 +1,209 @@ +#include "yaml_helpers.h" + +#include <yt/yt/core/ytree/fluent.h> + +#include <contrib/libs/re2/re2/re2.h> + +namespace NYT::NFormats { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +template <class TLibYamlType, void(*Deleter)(TLibYamlType*)> +TLibYamlTypeWrapper<TLibYamlType, Deleter>::TLibYamlTypeWrapper() +{ + // Just in case if we are allocated on stack and the destructor is called before + // the object is initialized. + memset(this, 0, sizeof(*this)); +} + +template <class TLibYamlType, void(*Deleter)(TLibYamlType*)> +void TLibYamlTypeWrapper<TLibYamlType, Deleter>::Reset() +{ + Deleter(this); + memset(this, 0, sizeof(*this)); +} + +template <class TLibYamlType, void(*Deleter)(TLibYamlType*)> +TLibYamlTypeWrapper<TLibYamlType, Deleter>::~TLibYamlTypeWrapper() +{ + Reset(); +} + +// Explicitly instantiate the wrappers for the types we use. 
+template struct TLibYamlTypeWrapper<yaml_parser_t, yaml_parser_delete>; +template struct TLibYamlTypeWrapper<yaml_emitter_t, yaml_emitter_delete>; +template struct TLibYamlTypeWrapper<yaml_event_t, yaml_event_delete>; + +//////////////////////////////////////////////////////////////////////////////// + +static THashMap<std::string_view, EYamlScalarType> YTTypeMap = { + {"!", EYamlScalarType::String}, + {YAML_INT_TAG, EYamlScalarType::Int}, + {YAML_FLOAT_TAG, EYamlScalarType::Float}, + {YAML_BOOL_TAG, EYamlScalarType::Bool}, + {YAML_NULL_TAG, EYamlScalarType::Null}, + {YAML_STR_TAG, EYamlScalarType::String}, + {YTUintTag, EYamlScalarType::Uint}, +}; + +EYamlScalarType DeduceScalarTypeFromTag(const std::string_view& tag) +{ + auto it = YTTypeMap.find(tag); + if (it != YTTypeMap.end()) { + return it->second; + } + return EYamlScalarType::String; +} + +EYamlScalarType DeduceScalarTypeFromValue(const std::string_view& value) +{ + // We conform to YAML 1.2 Core Schema: + // https://yaml.org/spec/1.2.2/#103-core-schema + static const re2::RE2 NullRE = "null|Null|NULL|~|"; + static const re2::RE2 BoolRE = "true|True|TRUE|false|False|FALSE"; + static const re2::RE2 IntRE = "[+-]?[0-9]+"; + // In YAML 1.2 there are also octal and hexadecimal integers, but they are always positive. + // Therefore, we treat them separately and represent as a uint scalar type. 
+ static const re2::RE2 UintRE = "0o[0-7]+|0x[0-9a-fA-F]+"; + static const re2::RE2 FloatRE = + "[-+]?(\\.[0-9]+|[0-9]+(\\.[0-9]*)?)([eE][-+]?[0-9]+)?|" + "[-+]?(\\.inf|\\.Inf|\\.INF)|" + "\\.nan|\\.NaN|\\.NAN"; + if (re2::RE2::FullMatch(value, NullRE)) { + return EYamlScalarType::Null; + } else if (re2::RE2::FullMatch(value, BoolRE)) { + return EYamlScalarType::Bool; + } else if (re2::RE2::FullMatch(value, IntRE)) { + return EYamlScalarType::Int; + } else if (re2::RE2::FullMatch(value, UintRE)) { + return EYamlScalarType::Uint; + } else if (re2::RE2::FullMatch(value, FloatRE)) { + return EYamlScalarType::Float; + } + return EYamlScalarType::String; +} + +bool ParseAndValidateYamlBool(const std::string_view& value) +{ + if (value == "true" || value == "True" || value == "TRUE") { + return true; + } else if (value == "false" || value == "False" || value == "FALSE") { + return false; + } else { + THROW_ERROR_EXCEPTION("Value %Qv is not a boolean", value); + } +} + +std::pair<ENodeType, TNonStringScalar> ParseAndValidateYamlInteger(const std::string_view& value, EYamlScalarType yamlType) +{ + // First, detect the base and prepare a string to calling TryIntFromString function by + // optionally removing the 0x/0o prefix, + int base; + std::string_view adjustedValue; + if (value.starts_with("0x")) { + base = 16; + adjustedValue = value.substr(2); + } else if (value.starts_with("0o")) { + base = 8; + adjustedValue = value.substr(2); + } else { + base = 10; + adjustedValue = value; + } + i64 i64Value; + ui64 ui64Value; + + auto tryFromString = [&] (auto& result) -> bool { + if (base == 10) { + return TryIntFromString<10>(adjustedValue, result); + } else if (base == 16) { + return TryIntFromString<16>(adjustedValue, result); + } else if (base = 8) { + return TryIntFromString<8>(adjustedValue, result); + } else { + YT_ABORT(); + } + }; + + // For untagged or int-tagged values (EYamlScalarType::Int) we first try to fit the value into int64, then into uint64. 
+ // For uint-tagged values (EYamlScalarType::Uint) we try to fit the value only into uint64. + if (yamlType == EYamlScalarType::Int && tryFromString(i64Value)) { + return {ENodeType::Int64, {.Int64 = i64Value}}; + } else if (tryFromString(ui64Value)) { + return {ENodeType::Uint64, {.Uint64 = ui64Value}}; + } else { + std::string requiredDomain = (yamlType == EYamlScalarType::Int) ? "either int64 or uint64" : "uint64"; + THROW_ERROR_EXCEPTION("Value %Qv is not an integer or does not fit into %v", value, requiredDomain); + } +} + +double ParseAndValidateYamlDouble(const std::string_view& value) +{ + double doubleValue; + if (value == ".inf" || value == ".Inf" || value == ".INF" || + value == "+.inf" || value == "+.Inf" || value == "+.INF") + { + doubleValue = std::numeric_limits<double>::infinity(); + } else if (value == "-.inf" || value == "-.Inf" || value == "-.INF") { + doubleValue = -std::numeric_limits<double>::infinity(); + } else if (value == ".nan" || value == ".NaN" || value == ".NAN") { + doubleValue = std::numeric_limits<double>::quiet_NaN(); + } else if (!TryFromString<double>(value, doubleValue)) { + THROW_ERROR_EXCEPTION("Value %Qv is not a floating point integer or does not fit into double", value); + } + return doubleValue; +} + +std::pair<ENodeType, TNonStringScalar> ParseScalarValue(const std::string_view& value, EYamlScalarType yamlType) +{ + switch (yamlType) { + case EYamlScalarType::String: + return {ENodeType::String, {}}; + case EYamlScalarType::Null: + return {ENodeType::Entity, {}}; + case EYamlScalarType::Bool: { + bool boolValue = ParseAndValidateYamlBool(value); + return {ENodeType::Boolean, {.Boolean = boolValue}}; + } + case EYamlScalarType::Int: + case EYamlScalarType::Uint: { + return ParseAndValidateYamlInteger(value, yamlType); + } + case EYamlScalarType::Float: { + auto doubleValue = ParseAndValidateYamlDouble(value); + return {ENodeType::Double, {.Double = doubleValue}}; + } + } + YT_ABORT(); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +std::string_view YamlLiteralToStringView(const yaml_char_t* literal, size_t length) +{ + return literal + ? std::string_view(reinterpret_cast<const char*>(literal), length) + : std::string_view(); +} + +std::string_view YamlLiteralToStringView(const yaml_char_t* literal) +{ + return literal + ? std::string_view(reinterpret_cast<const char*>(literal)) + : std::string_view(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NFormats + +void Serialize(const yaml_mark_t& mark, NYT::NYson::IYsonConsumer* consumer) +{ + NYT::NYTree::BuildYsonFluently(consumer) + .BeginMap() + .Item("position").Value(NYT::Format("%v:%v", mark.line, mark.column)) + .Item("index").Value(static_cast<i64>(mark.index)) + .EndMap(); +} diff --git a/yt/yt/library/formats/yaml_helpers.h b/yt/yt/library/formats/yaml_helpers.h new file mode 100644 index 0000000000..f9b3300ffc --- /dev/null +++ b/yt/yt/library/formats/yaml_helpers.h @@ -0,0 +1,114 @@ +#pragma once + +#include "private.h" + +#include <yt/yt/core/yson/public.h> + +#include <yt/yt/core/ytree/public.h> + +#include <contrib/libs/yaml/include/yaml.h> + +namespace NYT::NFormats { + +//////////////////////////////////////////////////////////////////////////////// + +template <class TLibYamlType, void(*Deleter)(TLibYamlType*)> +struct TLibYamlTypeWrapper + : public TLibYamlType + , public TNonCopyable +{ + TLibYamlTypeWrapper(); + void Reset(); + ~TLibYamlTypeWrapper(); +}; + +using TLibYamlParser = TLibYamlTypeWrapper<yaml_parser_t, yaml_parser_delete>; +using TLibYamlEmitter = TLibYamlTypeWrapper<yaml_emitter_t, yaml_emitter_delete>; +using TLibYamlEvent = TLibYamlTypeWrapper<yaml_event_t, yaml_event_delete>; + +//////////////////////////////////////////////////////////////////////////////// + +// These enums are counterparts of the enums in the Yaml library. 
+// Keep them in sync with the library. + +DEFINE_ENUM(EYamlErrorType, + ((NoError) (YAML_NO_ERROR)) + ((Memory) (YAML_MEMORY_ERROR)) + ((Reader) (YAML_READER_ERROR)) + ((Scanner) (YAML_SCANNER_ERROR)) + ((Parser) (YAML_PARSER_ERROR)) + ((Composer) (YAML_COMPOSER_ERROR)) + ((Writer) (YAML_WRITER_ERROR)) + ((Emitter) (YAML_EMITTER_ERROR)) +); + +DEFINE_ENUM(EYamlEventType, + ((NoEvent) (YAML_NO_EVENT)) + ((StreamStart) (YAML_STREAM_START_EVENT)) + ((StreamEnd) (YAML_STREAM_END_EVENT)) + ((DocumentStart) (YAML_DOCUMENT_START_EVENT)) + ((DocumentEnd) (YAML_DOCUMENT_END_EVENT)) + ((Alias) (YAML_ALIAS_EVENT)) + ((Scalar) (YAML_SCALAR_EVENT)) + ((SequenceStart) (YAML_SEQUENCE_START_EVENT)) + ((SequenceEnd) (YAML_SEQUENCE_END_EVENT)) + ((MappingStart) (YAML_MAPPING_START_EVENT)) + ((MappingEnd) (YAML_MAPPING_END_EVENT)) +); + +//! This tag is used for denoting 2-element sequences that represent a YSON node with attributes. +static constexpr std::string_view YTAttrNodeTag = "!yt/attrnode"; + +//! This tag is used upon parsing to denote an integer scalar which should be +//! represented by YT uint64 type. Writer by default omits this tag, but may be +//! configured to force this tag on all uint64 values. +static constexpr std::string_view YTUintTag = "!yt/uint64"; + +//////////////////////////////////////////////////////////////////////////////// + +//! We support: +//! - YAML 1.2 Core schema types +//! - YT-specific uint type, for which we introduce a special tag "!yt/uint64". +DEFINE_ENUM(EYamlScalarType, + (String) + (Int) + (Float) + (Bool) + (Null) + (Uint) +); + +union TNonStringScalar +{ + i64 Int64; + ui64 Uint64; + double Double; + bool Boolean; +}; + +//! Extracts a recognized YAML scalar type from a tag. +EYamlScalarType DeduceScalarTypeFromTag(const std::string_view& tag); +//! Guesses a recognized YAML scalar type from a value. +EYamlScalarType DeduceScalarTypeFromValue(const std::string_view& value); +//! 
Given a recognized YAML type, transforms it into a YT type and, +//! in case of a non-string result, parses a scalar value. +std::pair<NYTree::ENodeType, TNonStringScalar> ParseScalarValue( + const std::string_view& value, + EYamlScalarType yamlType); + +//////////////////////////////////////////////////////////////////////////////// + +// Convenience helpers for transforming a weirdly represented (yaml_char_t* ~ unsigned char*) +// YAML string into string_view, also handling the case of a null pointer. + +std::string_view YamlLiteralToStringView(const yaml_char_t* literal, size_t length); +std::string_view YamlLiteralToStringView(const yaml_char_t* literal); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NFormats + +// Note that ADL requires to put this function in the global namespace since +// yaml_mark_t is defined in the global namespace from C++ POV + +void Serialize(const yaml_mark_t& mark, NYT::NYson::IYsonConsumer* consumer); diff --git a/yt/yt/library/formats/yaml_parser.cpp b/yt/yt/library/formats/yaml_parser.cpp new file mode 100644 index 0000000000..5e930f250e --- /dev/null +++ b/yt/yt/library/formats/yaml_parser.cpp @@ -0,0 +1,562 @@ +#include "yaml_parser.h" + +#include "yaml_helpers.h" + +#include <yt/yt/client/formats/parser.h> + +#include <yt/yt/core/yson/consumer.h> + +#include <yt/yt/core/ytree/convert.h> + +#include <yt/yt/core/misc/coro_pipe.h> + +#include <contrib/libs/yaml/include/yaml.h> + +namespace NYT::NFormats { + +using namespace NYson; +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +//! A helper class that takes care of the repeated parts of a YAML document that +//! are expressed as anchors and aliases. Under the hood, materializes a YSON +//! string for each anchor and emits it to the underlying consumer via +//! OnRaw when needed. +/*! 
+ * Implementation notes: + * - Conforming to YAML 1.2, alias may refer only to a previously defined anchor. + * - Aliasing an anchor to an ancestor node is not supported as the resulting document + * cannot be represented as a finite YSON (even though some implementations with tree + * representations support that, e.g. PyYAML). + * - According to the YAML spec, alias may be "overridden" by a later definition. + * This feature is considered error-prone, will probably be removed in next + * versions of YAML spec (https://github.com/yaml/yaml-spec/pull/65) and is not + * supported by us. + * - Using an alias to a scalar anchor as a map key or anchoring a map key are not + * supported for the sake of simpler implementation (and are considered a weird thing + * to do by an author of this code). + */ +class TAnchorRecordingConsumer + : public IYsonConsumer +{ +public: + explicit TAnchorRecordingConsumer(IYsonConsumer* underlyingConsumer) + : UnderlyingConsumer_(underlyingConsumer) + , RunListWriter_(&RunListStream_, EYsonType::ListFragment) + { } + + void OnStringScalar(TStringBuf value) override + { + ForAllConsumers([=] (auto* consumer) { consumer->OnStringScalar(value); }); + MaybeFinishAnchor(); + } + + void OnInt64Scalar(i64 value) override + { + ForAllConsumers([=] (auto* consumer) { consumer->OnInt64Scalar(value); }); + MaybeFinishAnchor(); + } + + void OnUint64Scalar(ui64 value) override + { + ForAllConsumers([=] (auto* consumer) { consumer->OnUint64Scalar(value); }); + MaybeFinishAnchor(); + } + + void OnDoubleScalar(double value) override + { + ForAllConsumers([=] (auto* consumer) { consumer->OnDoubleScalar(value); }); + MaybeFinishAnchor(); + } + + void OnBooleanScalar(bool value) override + { + ForAllConsumers([=] (auto* consumer) { consumer->OnBooleanScalar(value); }); + MaybeFinishAnchor(); + } + + void OnEntity() override + { + ForAllConsumers([=] (auto* consumer) { consumer->OnEntity(); }); + MaybeFinishAnchor(); + } + + void OnBeginList() override + { + 
++CurrentDepth_; + ForAllConsumers([=] (auto* consumer) { consumer->OnBeginList(); }); + } + + void OnListItem() override + { + ForAllConsumers([=] (auto* consumer) { consumer->OnListItem(); }); + } + + void OnEndList() override + { + ForAllConsumers([=] (auto* consumer) { consumer->OnEndList(); }); + --CurrentDepth_; + MaybeFinishAnchor(); + } + + void OnBeginMap() override + { + ++CurrentDepth_; + ForAllConsumers([] (auto* consumer) { consumer->OnBeginMap(); }); + } + + void OnKeyedItem(TStringBuf key) override + { + ForAllConsumers([=] (auto* consumer) { consumer->OnKeyedItem(key); }); + } + + void OnEndMap() override + { + ForAllConsumers([] (auto* consumer) { consumer->OnEndMap(); }); + --CurrentDepth_; + MaybeFinishAnchor(); + } + + void OnBeginAttributes() override + { + ++CurrentDepth_; + ForAllConsumers([] (auto* consumer) { consumer->OnBeginAttributes(); }); + } + + void OnEndAttributes() override + { + ForAllConsumers([] (auto* consumer) { consumer->OnEndAttributes(); }); + --CurrentDepth_; + // NB: do not call MaybeFinishAnchor here, as we do not want to record only + // attribute map part of the node. + } + + void OnRaw(TStringBuf yson, EYsonType type) override + { + // The only caller for this OnRaw is ourselves in case of aliases, and aliases + // always point to YSON node. 
+ YT_VERIFY(type == EYsonType::Node); + ForAllConsumers([=] (auto* consumer) { consumer->OnRaw(yson, type); }); + MaybeFinishAnchor(); + } + + void OnAnchor(const std::string& anchor) + { + StartRun(); + auto inserted = KnownAnchorNames_.insert(anchor).second; + if (!inserted) { + THROW_ERROR_EXCEPTION("Anchor %Qv is already defined", anchor); + } + auto& currentAnchor = ConstructingAnchors_.emplace_back(); + currentAnchor = { + anchor, + CurrentDepth_, + GetCurrentRunOffset(), + }; + } + + void OnAlias(const std::string& alias) + { + auto it = FinishedAnchors_.find(alias); + if (it == FinishedAnchors_.end()) { + THROW_ERROR_EXCEPTION("Alias %Qv refers to an undefined or unfinished anchor", alias); + } + auto& anchor = it->second; + + RunListWriter_.Flush(); + std::string_view yson = RunListStream_.Str(); + yson = yson.substr(anchor.StartOffset, anchor.EndOffset - anchor.StartOffset); + // NB: TBufferedBinaryYsonWriter writes ';' in a bit different way that you would expect from + // IYsonConsumer interface -- not as a reaction to OnListItem or OnKeyedItem, but rather when a node + // is finished. This leads to yson string above always containing extra trailing ';'. + // We strip it off, as our YSON consumers expect a YSON node to be serialized during OnAlias call. + YT_VERIFY(yson.ends_with(NYson::NDetail::ItemSeparatorSymbol)); + yson.remove_suffix(1); + OnRaw(yson, EYsonType::Node); + } + +private: + IYsonConsumer* UnderlyingConsumer_; + //! Whenever there is at least one anchor being recorded, the stream is used to + //! record the YSON representation of the outermost anchor. We call the representation + //! of such an outermost anchor a run. Conveniently, we represent runs as elements of + //! a fictional YSON list, making each anchor a substring of that YSON list. + TStringStream RunListStream_; + TBufferedBinaryYsonWriter RunListWriter_; + + struct TAnchor + { + std::string Name; + int Depth; + ssize_t StartOffset; + ssize_t EndOffset = -1; + }; + //! 
A stack of all anchors currently being constructed. + std::vector<TAnchor> ConstructingAnchors_; + //! A set of all anchors that are currently constructed. + THashSet<std::string> KnownAnchorNames_; + + //! A map containing YSON representations of anchors that have been finished. + THashMap<std::string, TAnchor> FinishedAnchors_; + + int CurrentDepth_ = 0; + + void ForAllConsumers(auto&& action) + { + action(UnderlyingConsumer_); + if (IsInRun()) { + action(&RunListWriter_); + } + } + + void StartRun() + { + if (!IsInRun()) { + RunListWriter_.OnListItem(); + } + } + + bool IsInRun() const + { + return !ConstructingAnchors_.empty(); + } + + ssize_t GetCurrentRunOffset() const + { + YT_VERIFY(IsInRun()); + return RunListWriter_.GetTotalWrittenSize(); + } + + //! Checks current depth, maybe finalizes the innermost anchor, and if + //! the anchor stack vanishes, finalizes the outermost anchor. + void MaybeFinishAnchor() + { + if (!ConstructingAnchors_.empty() && CurrentDepth_ == ConstructingAnchors_.back().Depth) { + // Finalize the innermost (stack topmost) anchor. + YT_VERIFY(IsInRun()); + auto& anchor = ConstructingAnchors_.back(); + + anchor.EndOffset = GetCurrentRunOffset(); + auto inserted = FinishedAnchors_.emplace(anchor.Name, std::move(anchor)).second; + // Insertion is ensured by the checks in OnAnchor. + YT_VERIFY(inserted); + ConstructingAnchors_.pop_back(); + } + } +}; + +class TYamlParser +{ +public: + TYamlParser(IInputStream* input, IYsonConsumer* consumer, TYamlFormatConfigPtr config, EYsonType ysonType) + : Input_(input) + , Consumer_(consumer) + , Config_(std::move(config)) + , YsonType_(ysonType) + { + yaml_parser_initialize(&Parser_); + yaml_parser_set_input(&Parser_, &ReadHandler, this); + } + + void Parse() + { + VisitStream(); + } + +private: + IInputStream* Input_; + TAnchorRecordingConsumer Consumer_; + TYamlFormatConfigPtr Config_; + EYsonType YsonType_; + + TLibYamlParser Parser_; + + TError ReadError_; + + TLibYamlEvent Event_; + + //! 
Convenience helper to get rid of the ugly casts. + EYamlEventType GetEventType() const + { + return static_cast<EYamlEventType>(Event_.type); + } + + static int ReadHandler(void* data, unsigned char* buffer, size_t size, size_t* sizeRead) + { + auto* yamlParser = reinterpret_cast<TYamlParser*>(data); + auto* input = yamlParser->Input_; + + try { + // IInputStream is similar to yaml_read_handler_t interface + // in EOF case: former returns 0 from Read(), and latter + // expects handler to set size_read to 0 and return 1 + *sizeRead = input->Read(buffer, size); + return 1; + } catch (const std::exception& ex) { + // We do not expect the read handler to be called after an error. + YT_ASSERT(yamlParser->ReadError_.IsOK()); + yamlParser->ReadError_ = TError(ex); + // Not really used by libyaml, but let's set it to 0 just in case. + *sizeRead = 0; + return 0; + } + } + + //! A wrapper around C-style libyaml API calls that return 0 on error which + //! throws an exception in case of an error. + int SafeInvoke(auto* method, auto... args) + { + int result = method(args...); + if (result == 0) { + ThrowError(); + } + return result; + } + + //! Throw an exception formed from the parser state and possibly the exception + //! caught in the last read handler call. + [[noreturn]] void ThrowError() + { + // Unfortunately, libyaml may sometimes set error = YAML_NO_ERROR. This may lead + // to unclear exceptions during parsing. 
+ auto yamlErrorType = static_cast<EYamlErrorType>(Parser_.error); + auto error = TError("YAML parser error: %v", Parser_.problem) + << TErrorAttribute("yaml_error_type", yamlErrorType) + << TErrorAttribute("problem_offset", Parser_.problem_offset) + << TErrorAttribute("problem_value", Parser_.problem_value) + << TErrorAttribute("problem_mark", Parser_.problem_mark); + if (Parser_.context) { + error <<= TErrorAttribute("context", Parser_.context); + error <<= TErrorAttribute("context_mark", Parser_.context_mark); + } + if (!ReadError_.IsOK()) { + error <<= ReadError_; + } + + THROW_ERROR error; + } + + //! Pull the next event from the parser into Event_ and check that it is one of the expected types. + void PullEvent(std::initializer_list<EYamlEventType> expectedTypes) + { + Event_.Reset(); + SafeInvoke(yaml_parser_parse, &Parser_, &Event_); + for (const auto expectedType : expectedTypes) { + if (GetEventType() == expectedType) { + return; + } + } + // TODO(max42): stack and position! + THROW_ERROR_EXCEPTION( + "Unexpected event type %Qlv, expected one of %Qlv", + GetEventType(), + std::vector(expectedTypes)); + } + + void VisitStream() + { + PullEvent({EYamlEventType::StreamStart}); + while (true) { + PullEvent({EYamlEventType::DocumentStart, EYamlEventType::StreamEnd}); + if (GetEventType() == EYamlEventType::StreamEnd) { + break; + } + if (YsonType_ == EYsonType::ListFragment) { + Consumer_.OnListItem(); + } + VisitDocument(); + } + } + + void VisitDocument() + { + PullEvent({ + EYamlEventType::Scalar, + EYamlEventType::SequenceStart, + EYamlEventType::MappingStart, + EYamlEventType::Alias, + }); + VisitNode(); + PullEvent({EYamlEventType::DocumentEnd}); + } + + void VisitNode() + { + auto maybeOnAnchor = [&] (yaml_char_t* anchor) { + if (anchor) { + Consumer_.OnAnchor(std::string(YamlLiteralToStringView(anchor))); + } + }; + switch (GetEventType()) { + case EYamlEventType::Scalar: + maybeOnAnchor(Event_.data.scalar.anchor); + VisitScalar(); + break; + case 
EYamlEventType::SequenceStart: + maybeOnAnchor(Event_.data.scalar.anchor); + VisitSequence(); + break; + case EYamlEventType::MappingStart: + maybeOnAnchor(Event_.data.scalar.anchor); + VisitMapping(/*isAttributes*/ false); + break; + case EYamlEventType::Alias: + Consumer_.OnAlias(std::string(YamlLiteralToStringView(Event_.data.alias.anchor))); + break; + default: + YT_ABORT(); + } + } + + void VisitScalar() + { + auto scalar = Event_.data.scalar; + auto yamlValue = YamlLiteralToStringView(scalar.value, scalar.length); + + // According to YAML spec, there are two non-specific tags "!" and "?", and all other + // tags are specific. + // + // If the tag is missing, parser should assign tag "!" to non-plain (quoted) scalars, + // and "?" to plain scalars and collection nodes. For some reason, libyaml does not + // do that for us. + // + // Then, "!"-tagged scalars should always be treated as strings, i.e. "!" -> YT string. + // + // Specific tags are either recognized by us, in which case we deduce a corresponding YT type, + // or we assign a string type otherwise. + // + // For the "?"-tagged scalars we perform the type deduction based on the scalar value + // (which is the most often case, as almost nobody uses type tags in YAML). + // + // Cf. 
https://yaml.org/spec/1.2.2/#332-resolved-tags + std::string_view tag; + if (scalar.tag) { + tag = YamlLiteralToStringView(scalar.tag); + } else if (scalar.style != YAML_PLAIN_SCALAR_STYLE) { + tag = "!"; + } else { + tag = "?"; + } + + EYamlScalarType yamlType; + if (tag != "?") { + yamlType = DeduceScalarTypeFromTag(tag); + } else { + yamlType = DeduceScalarTypeFromValue(yamlValue); + } + auto [ytType, nonStringScalar] = ParseScalarValue(yamlValue, yamlType); + switch (ytType) { + case ENodeType::String: + Consumer_.OnStringScalar(yamlValue); + break; + case ENodeType::Int64: + Consumer_.OnInt64Scalar(nonStringScalar.Int64); + break; + case ENodeType::Uint64: + Consumer_.OnUint64Scalar(nonStringScalar.Uint64); + break; + case ENodeType::Double: + Consumer_.OnDoubleScalar(nonStringScalar.Double); + break; + case ENodeType::Boolean: + Consumer_.OnBooleanScalar(nonStringScalar.Boolean); + break; + case ENodeType::Entity: + Consumer_.OnEntity(); + break; + default: + YT_ABORT(); + } + } + + void VisitSequence() + { + // NB: YSON node with attributes is represented as a yt/attrnode-tagged YAML sequence, + // so handle it as a special case. 
+ if (YamlLiteralToStringView(Event_.data.mapping_start.tag) == YTAttrNodeTag) { + VisitNodeWithAttributes(); + return; + } + + Consumer_.OnBeginList(); + while (true) { + PullEvent({ + EYamlEventType::SequenceEnd, + EYamlEventType::SequenceStart, + EYamlEventType::MappingStart, + EYamlEventType::Scalar, + EYamlEventType::Alias + }); + if (GetEventType() == EYamlEventType::SequenceEnd) { + break; + } + Consumer_.OnListItem(); + VisitNode(); + } + Consumer_.OnEndList(); + } + + void VisitNodeWithAttributes() + { + PullEvent({EYamlEventType::MappingStart}); + VisitMapping(/*isAttributes*/ true); + + PullEvent({ + EYamlEventType::Scalar, + EYamlEventType::SequenceStart, + EYamlEventType::MappingStart, + EYamlEventType::Alias, + }); + VisitNode(); + + PullEvent({EYamlEventType::SequenceEnd}); + } + + void VisitMapping(bool isAttributes) + { + isAttributes ? Consumer_.OnBeginAttributes() : Consumer_.OnBeginMap(); + while (true) { + PullEvent({ + EYamlEventType::MappingEnd, + EYamlEventType::Scalar, + // Yes, YAML is weird enough to support aliases as keys! + EYamlEventType::Alias, + }); + if (GetEventType() == EYamlEventType::MappingEnd) { + break; + } else if (GetEventType() == EYamlEventType::Alias) { + THROW_ERROR_EXCEPTION("Using alias as a map key is not supported"); + } else { + if (Event_.data.scalar.anchor) { + THROW_ERROR_EXCEPTION("Putting anchors on map keys is not supported"); + } + auto key = YamlLiteralToStringView(Event_.data.scalar.value, Event_.data.scalar.length); + Consumer_.OnKeyedItem(key); + } + + PullEvent({ + EYamlEventType::Scalar, + EYamlEventType::SequenceStart, + EYamlEventType::MappingStart, + EYamlEventType::Alias, + }); + VisitNode(); + } + isAttributes ? 
Consumer_.OnEndAttributes() : Consumer_.OnEndMap(); + } +}; + +void ParseYaml( + IInputStream* input, + IYsonConsumer* consumer, + TYamlFormatConfigPtr config, + EYsonType ysonType) +{ + TYamlParser parser(input, consumer, config, ysonType); + parser.Parse(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/yaml_parser.h b/yt/yt/library/formats/yaml_parser.h new file mode 100644 index 0000000000..6cdfc4dab6 --- /dev/null +++ b/yt/yt/library/formats/yaml_parser.h @@ -0,0 +1,20 @@ +#pragma once + +#include <yt/yt/client/formats/public.h> + +#include <yt/yt/core/yson/public.h> + +namespace NYT::NFormats { + +//////////////////////////////////////////////////////////////////////////////// + +//! Parses a YAML stream in pull mode (may be used for structured driver commands). +void ParseYaml( + IInputStream* input, + NYson::IYsonConsumer* consumer, + TYamlFormatConfigPtr config, + NYson::EYsonType ysonType); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/yaml_writer.cpp b/yt/yt/library/formats/yaml_writer.cpp new file mode 100644 index 0000000000..3d2961d46a --- /dev/null +++ b/yt/yt/library/formats/yaml_writer.cpp @@ -0,0 +1,388 @@ +#include "yaml_writer.h" + +#include "helpers.h" +#include "yaml_helpers.h" + +#include <yt/yt/client/formats/config.h> + +#include <contrib/libs/yaml/include/yaml.h> + +namespace NYT::NFormats { + +using namespace NYson; + +//////////////////////////////////////////////////////////////////////////// + +class TYamlWriter + : public TFormatsConsumerBase +{ +public: + TYamlWriter( + IOutputStream* output, + NYson::EYsonType /*type*/, + TYamlFormatConfigPtr config) + : Output_(output) + , Config_(config) + { + SafeInvoke(yaml_emitter_initialize, &Emitter_); + yaml_emitter_set_output(&Emitter_, &WriteHandler, this); + 
EmitEvent(yaml_stream_start_event_initialize, YAML_ANY_ENCODING); + } + + void Flush() override + { + SafeInvoke(yaml_emitter_flush, &Emitter_); + } + + void OnStringScalar(TStringBuf value) override + { + OnNodeEnter(); + // We try to emit a plain (unquoted) scalar if possible. It may not be possible + // either because of YAML syntax restrictions (which will be handled by libyaml switching + // to a quoted style automatically), or because the plain style would produce a scalar + // which matches one of the Core YAML schema regexps, making reasonable parsers interpret it as + // int/float/bool/null instead of string. + // + // PyYAML and Go YAML parsers handle this issue by checking the type that would be deduced from + // an unquoted representation and quoting the scalar if it would be interpreted as a non-string type. + // We utilize the same approach here. + + auto plainYamlType = DeduceScalarTypeFromValue(value); + auto desiredScalarStyle = YAML_ANY_SCALAR_STYLE; + if (plainYamlType != EYamlScalarType::String) { + desiredScalarStyle = YAML_DOUBLE_QUOTED_SCALAR_STYLE; + } + + EmitEvent( + yaml_scalar_event_initialize, + /*anchor*/ nullptr, + /*tag*/ nullptr, + reinterpret_cast<const yaml_char_t*>(value.Data()), + value.Size(), + /*plain_implicit*/ 1, + /*quoted_implicit*/ 1, + desiredScalarStyle); + OnNodeLeave(); + } + + void OnInt64Scalar(i64 value) override + { + OnNodeEnter(); + // Int64 scalars are always represented as plain (unquoted) YAML scalars. + // Core YAML schema regexps ensure that they will be interpreted as integers + // by all reasonable YAML parsers. + // Cf. 
https://yaml.org/spec/1.2.2/#1032-tag-resolution + char buf[64]; + auto length = IntToString<10>(value, buf, sizeof(buf)); + EmitEvent( + yaml_scalar_event_initialize, + /*anchor*/ nullptr, + /*tag*/ nullptr, + reinterpret_cast<yaml_char_t*>(buf), + length, + /*plain_implicit*/ 1, + /*quoted_implicit*/ 0, + YAML_PLAIN_SCALAR_STYLE); + OnNodeLeave(); + } + + void OnUint64Scalar(ui64 value) override + { + OnNodeEnter(); + // Uint64 scalars are by default represented as plain (unquoted) YAML scalars, + // similar to Int64 scalars (see the comment in OnInt64Scalar). + // However, we optionally support a custom "!yt/uint64" tag to preserve the + // information that the value is unsigned, which may be useful to control + // signedness upon writing and for YT -> YAML -> YT roundtrip consistency. + char buf[64]; + auto length = IntToString<10>(value, buf, sizeof(buf)); + + // In libyaml API plainImplicit defines whether the writer omits the tag. + bool plainImplicit = !Config_->WriteUintTag; + + EmitEvent( + yaml_scalar_event_initialize, + /*anchor*/ nullptr, + /*tag*/ reinterpret_cast<const yaml_char_t*>(YTUintTag.data()), + reinterpret_cast<yaml_char_t*>(buf), + length, + /*plain_implicit*/ plainImplicit, + /*quoted_implicit*/ 0, + YAML_PLAIN_SCALAR_STYLE); + OnNodeLeave(); + } + + void OnDoubleScalar(double value) override + { + OnNodeEnter(); + // Double scalars are by default represented as plain (unquoted) YAML scalars, + // similar to Int64 scalars (see the comment in OnInt64Scalar). 
+ + char buf[512]; + auto length = DoubleToYamlString(value, buf, sizeof(buf)); + EmitEvent( + yaml_scalar_event_initialize, + /*anchor*/ nullptr, + /*tag*/ reinterpret_cast<const yaml_char_t*>(YAML_FLOAT_TAG), + reinterpret_cast<yaml_char_t*>(buf), + length, + /*plain_implicit*/ 1, + /*quoted_implicit*/ 0, + YAML_PLAIN_SCALAR_STYLE); + OnNodeLeave(); + } + + void OnBooleanScalar(bool value) override + { + OnNodeEnter(); + static const std::string_view trueLiteral = "true"; + static const std::string_view falseLiteral = "false"; + const std::string_view& literal = value ? trueLiteral : falseLiteral; + EmitEvent( + yaml_scalar_event_initialize, + /*anchor*/ nullptr, + /*tag*/ reinterpret_cast<const yaml_char_t*>(YAML_BOOL_TAG), + reinterpret_cast<yaml_char_t*>(const_cast<char*>(literal.data())), + literal.size(), + /*plain_implicit*/ 1, + /*quoted_implicit*/ 0, + YAML_PLAIN_SCALAR_STYLE); + OnNodeLeave(); + } + + virtual void OnEntity() override + { + OnNodeEnter(); + static const std::string_view nullLiteral = "null"; + EmitEvent( + yaml_scalar_event_initialize, + /*anchor*/ nullptr, + /*tag*/ reinterpret_cast<const yaml_char_t*>(YAML_NULL_TAG), + reinterpret_cast<yaml_char_t*>(const_cast<char*>(nullLiteral.data())), + nullLiteral.size(), + /*plain_implicit*/ 1, + /*quoted_implicit*/ 0, + YAML_PLAIN_SCALAR_STYLE); + OnNodeLeave(); + } + + virtual void OnBeginList() override + { + OnNodeEnter(); + EmitEvent( + yaml_sequence_start_event_initialize, + /*anchor*/ nullptr, + /*tag*/ nullptr, + /*implicit*/ 1, + YAML_ANY_SEQUENCE_STYLE); + } + + virtual void OnListItem() override + { } + + virtual void OnEndList() override + { + EmitEvent(yaml_sequence_end_event_initialize); + OnNodeLeave(); + } + + virtual void OnBeginMap() override + { + OnNodeEnter(); + EmitEvent( + yaml_mapping_start_event_initialize, + /*anchor*/ nullptr, + /*tag*/ nullptr, + /*implicit*/ 1, + YAML_ANY_MAPPING_STYLE); + } + + virtual void OnKeyedItem(TStringBuf key) override + { + 
OnStringScalar(key); + } + + virtual void OnEndMap() override + { + EmitEvent(yaml_mapping_end_event_initialize); + OnNodeLeave(); + } + + virtual void OnBeginAttributes() override + { + // NB: node with attributes in YAML is represented as a yt/attrnode-tagged 2-item sequence. + OnNodeEnter(); + EmitEvent( + yaml_sequence_start_event_initialize, + /*anchor*/ nullptr, + /*tag*/ reinterpret_cast<const yaml_char_t*>(YTAttrNodeTag.data()), + /*implicit*/ 0, + YAML_ANY_SEQUENCE_STYLE); + EmitEvent( + yaml_mapping_start_event_initialize, + /*anchor*/ nullptr, + /*tag*/ nullptr, + /*implicit*/ 1, + YAML_ANY_MAPPING_STYLE); + } + + virtual void OnEndAttributes() override + { + EmitEvent(yaml_mapping_end_event_initialize); + ImmediatelyAfterAttributes_ = true; + OnNodeLeave(/*isAttributes*/ true); + DepthsWithPendingValueClosure_.push_back(CurrentDepth_); + } + +private: + using TEmitterPtr = std::unique_ptr<yaml_emitter_t, decltype(&yaml_emitter_delete)>; + using TEventPtr = std::unique_ptr<yaml_event_t, decltype(&yaml_event_delete)>; + + IOutputStream* Output_; + TYamlFormatConfigPtr Config_; + TLibYamlEmitter Emitter_; + TError WriteError_; + + // Utilities for tracking the current depth and the stack of depths at which + // we must perform extra sequence closure due to yt/attrnode-tagged 2-item sequence convention. + + //! The depth of the current node in the YSON tree. + int CurrentDepth_ = 0; + //! A stack of depths at which attributes are present. + std::vector<int> DepthsWithPendingValueClosure_ = {-1}; + //! A flag indicating that we are immediately after the OnEndAttributes() event. + bool ImmediatelyAfterAttributes_ = false; + + static int WriteHandler(void* data, unsigned char* buffer, size_t size) + { + auto* yamlWriter = reinterpret_cast<TYamlWriter*>(data); + auto* output = yamlWriter->Output_; + + try { + output->Write(buffer, size); + } catch (const std::exception& ex) { + // We do not expect the write handler to be called after an error. 
+ YT_ASSERT(yamlWriter->WriteError_.IsOK()); + yamlWriter->WriteError_ = TError(ex); + return 0; + } + return 1; + } + + //! A wrapper around C-style libyaml API calls (which return 0 on error) that + //! throws an exception in case of an error. + int SafeInvoke(auto* method, auto... args) + { + int result = method(args...); + if (result == 0) { + ThrowError(); + } + return result; + } + + //! Throw an exception formed from the emitter state and possibly the exception + //! caught in the last write handler call. + void ThrowError() + { + // Unfortunately, libyaml may sometimes report YAML_NO_ERROR even though the call failed. This may lead + // to unclear exceptions during writing. + auto yamlErrorType = static_cast<EYamlErrorType>(Emitter_.error); + auto error = TError("YAML emitter error: %v", Emitter_.problem) + << TErrorAttribute("yaml_error_type", yamlErrorType); + + if (!WriteError_.IsOK()) { + error <<= WriteError_; + } + + THROW_ERROR error; + } + + void EmitEvent(auto* eventInitializer, auto... args) + { + yaml_event_t event; + // Event initializer is guaranteed to release all resources in case of an error. + SafeInvoke(eventInitializer, &event, args...); + SafeInvoke(yaml_emitter_emit, &Emitter_, &event); + } + + void OnNodeEnter() + { + // If we are at the depth 0 and it is not a break between the root node attributes and the root node, + // emit the document start event. 
+ if (CurrentDepth_ == 0 && !ImmediatelyAfterAttributes_) { + EmitEvent( + yaml_document_start_event_initialize, + /*version_directive*/ nullptr, + /*tag_directives_start*/ nullptr, + /*tag_directives_end*/ nullptr, + /*implicit*/ 1); + } + ++CurrentDepth_; + ImmediatelyAfterAttributes_ = false; + } + + void OnNodeLeave(bool isAttributes = false) + { + --CurrentDepth_; + if (CurrentDepth_ == DepthsWithPendingValueClosure_.back()) { + EmitEvent(yaml_sequence_end_event_initialize); + DepthsWithPendingValueClosure_.pop_back(); + } + if (isAttributes) { + ImmediatelyAfterAttributes_ = true; + } + // If we are leaving the root node and it is not a break between the root node attributes and the root node, + // emit the document end event. + if (CurrentDepth_ == 0 && !isAttributes) { + EmitEvent(yaml_document_end_event_initialize, /*implicit*/ 1); + } + } + + size_t DoubleToYamlString(double value, char* buf, size_t size) + { + // Extra care must be taken to handle non-finite values (NaN, Inf, -Inf), + // and also to ensure that the resulting value cannot be parsed as an integer. + // Both things are done similarly to the corresponding logic in the YSON writer. + // Cf. NYson::TUncheckedYsonTokenWriter::WriteTextDouble. 
+ + if (std::isfinite(value)) { + auto length = FloatToString(value, buf, size); + std::string_view str(buf, length); + if (str.find('.') == std::string::npos && str.find('e') == std::string::npos) { + YT_VERIFY(length + 1 <= size); + buf[length++] = '.'; + } + return length; + } else { + static const std::string_view nanLiteral = ".nan"; + static const std::string_view infLiteral = ".inf"; + static const std::string_view negativeInfLiteral = "-.inf"; + + std::string_view str; + if (std::isnan(value)) { + str = nanLiteral; + } else if (std::isinf(value) && value > 0) { + str = infLiteral; + } else { + str = negativeInfLiteral; + } + YT_VERIFY(str.size() + 1 <= size); + ::memcpy(buf, str.data(), str.size() + 1); + return str.size(); + } + } +}; + +std::unique_ptr<IFlushableYsonConsumer> CreateYamlWriter( + IZeroCopyOutput* output, + NYson::EYsonType type, + TYamlFormatConfigPtr config) +{ + // Note that output gets narrowed to IOutputStream* as the currently used yaml library + // interface is not zero-copy by its nature. + return std::make_unique<TYamlWriter>(output, type, config); +} + +//////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/yaml_writer.h b/yt/yt/library/formats/yaml_writer.h new file mode 100644 index 0000000000..c3e7fb76fd --- /dev/null +++ b/yt/yt/library/formats/yaml_writer.h @@ -0,0 +1,18 @@ +#pragma once + +#include <yt/yt/core/yson/consumer.h> + +#include <yt/yt/client/formats/public.h> + +namespace NYT::NFormats { + +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr<NYson::IFlushableYsonConsumer> CreateYamlWriter( + IZeroCopyOutput* output, + NYson::EYsonType type, + TYamlFormatConfigPtr config); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NFormats |