diff options
author | nadya02 <nadya02@yandex-team.com> | 2023-09-07 11:57:32 +0300 |
---|---|---|
committer | nadya02 <nadya02@yandex-team.com> | 2023-09-07 13:04:37 +0300 |
commit | de934b312a042991ebf8875dd27c27bfe9dfd872 (patch) | |
tree | 860e6e2d982d770640ff0e3e5c421184a5949249 | |
parent | 3c6960025c3079d4e9a4150c0b962d694571f9ae (diff) | |
download | ydb-de934b312a042991ebf8875dd27c27bfe9dfd872.tar.gz |
YT-19430: Move formats from client to library
113 files changed, 3108 insertions, 14546 deletions
diff --git a/yt/yt/client/driver/driver.cpp b/yt/yt/client/driver/driver.cpp index be39258e1a..e4e7abfb61 100644 --- a/yt/yt/client/driver/driver.cpp +++ b/yt/yt/client/driver/driver.cpp @@ -1,11 +1,11 @@ #include "driver.h" #include "authentication_commands.h" +#include "admin_commands.h" #include "chaos_commands.h" #include "command.h" #include "config.h" #include "cypress_commands.h" -#include "admin_commands.h" #include "etc_commands.h" #include "file_commands.h" #include "journal_commands.h" @@ -17,10 +17,10 @@ #include "proxy_discovery_cache.h" #include "query_commands.h" -#include <yt/yt/client/api/transaction.h> +#include <yt/yt/client/api/client_cache.h> #include <yt/yt/client/api/connection.h> #include <yt/yt/client/api/sticky_transaction_pool.h> -#include <yt/yt/client/api/client_cache.h> +#include <yt/yt/client/api/transaction.h> #include <yt/yt/client/api/rpc_proxy/connection_impl.h> @@ -30,8 +30,11 @@ #include <yt/yt/core/tracing/trace_context.h> +#include <yt/yt/library/formats/format.h> + #include <yt/yt/library/tvm/tvm_base.h> + namespace NYT::NDriver { using namespace NYTree; diff --git a/yt/yt/client/driver/journal_commands.cpp b/yt/yt/client/driver/journal_commands.cpp index a3a4406e8d..ad6944fa3d 100644 --- a/yt/yt/client/driver/journal_commands.cpp +++ b/yt/yt/client/driver/journal_commands.cpp @@ -10,6 +10,8 @@ #include <yt/yt/client/formats/format.h> #include <yt/yt/client/formats/parser.h> +#include <yt/yt/library/formats/format.h> + #include <yt/yt/core/concurrency/scheduler.h> #include <yt/yt/core/misc/blob_output.h> diff --git a/yt/yt/client/driver/query_commands.cpp b/yt/yt/client/driver/query_commands.cpp index 756c273175..14efed7825 100644 --- a/yt/yt/client/driver/query_commands.cpp +++ b/yt/yt/client/driver/query_commands.cpp @@ -4,6 +4,8 @@ #include <yt/yt/client/formats/config.h> +#include <yt/yt/library/formats/format.h> + #include <yt/yt/core/ytree/fluent.h> #include <yt/yt/core/ytree/convert.h> diff --git 
a/yt/yt/client/driver/queue_commands.cpp b/yt/yt/client/driver/queue_commands.cpp index 771fce09e1..89fbba7389 100644 --- a/yt/yt/client/driver/queue_commands.cpp +++ b/yt/yt/client/driver/queue_commands.cpp @@ -3,6 +3,8 @@ #include <yt/yt/client/api/config.h> +#include <yt/yt/library/formats/format.h> + namespace NYT::NDriver { using namespace NConcurrency; diff --git a/yt/yt/client/driver/ya.make b/yt/yt/client/driver/ya.make index d08c35855e..6e411686d0 100644 --- a/yt/yt/client/driver/ya.make +++ b/yt/yt/client/driver/ya.make @@ -25,7 +25,7 @@ SRCS( PEERDIR( yt/yt/client - yt/yt/client/formats + yt/yt/library/formats ) END() diff --git a/yt/yt/client/formats/format.cpp b/yt/yt/client/formats/format.cpp index 72db758f73..f6090edb62 100644 --- a/yt/yt/client/formats/format.cpp +++ b/yt/yt/client/formats/format.cpp @@ -1,22 +1,4 @@ #include "format.h" -#include "parser.h" -#include "dsv_parser.h" -#include "dsv_writer.h" -#include "protobuf_parser.h" -#include "protobuf_writer.h" -#include "schemaful_dsv_parser.h" -#include "schemaful_dsv_writer.h" -#include "schemaful_writer.h" -#include "web_json_writer.h" -#include "schemaless_writer_adapter.h" -#include "skiff_parser.h" -#include "skiff_writer.h" -#include "versioned_writer.h" -#include "yamred_dsv_parser.h" -#include "yamred_dsv_writer.h" -#include "yamr_parser.h" -#include "yamr_writer.h" -#include "yson_parser.h" #include <yt/yt/client/table_client/name_table.h> #include <yt/yt/client/table_client/table_consumer.h> @@ -36,13 +18,8 @@ namespace NYT::NFormats { -using namespace NConcurrency; using namespace NYTree; using namespace NYson; -using namespace NJson; -using namespace NTableClient; -using namespace NSkiffExt; -using namespace NComplexTypes; //////////////////////////////////////////////////////////////////////////////// @@ -109,551 +86,4 @@ void Deserialize(TFormat& value, NYson::TYsonPullParserCursor* cursor) //////////////////////////////////////////////////////////////////////////////// 
-namespace { - -EYsonType DataTypeToYsonType(EDataType dataType) -{ - switch (dataType) { - case EDataType::Structured: - return EYsonType::Node; - case EDataType::Tabular: - return EYsonType::ListFragment; - default: - THROW_ERROR_EXCEPTION("Data type %Qlv is not supported by YSON", - dataType); - } -} - -std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForYson( - EDataType dataType, - const IAttributeDictionary& attributes, - IZeroCopyOutput* output) -{ - auto config = ConvertTo<TYsonFormatConfigPtr>(&attributes); - return CreateYsonWriter( - output, - config->Format, - DataTypeToYsonType(dataType), - config->Format == EYsonFormat::Binary); -} - -std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForJson( - EDataType dataType, - const IAttributeDictionary& attributes, - IOutputStream* output) -{ - auto config = ConvertTo<TJsonFormatConfigPtr>(&attributes); - return CreateJsonConsumer(output, DataTypeToYsonType(dataType), config); -} - -std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForDsv( - EDataType dataType, - const IAttributeDictionary& attributes, - IOutputStream* output) -{ - auto config = ConvertTo<TDsvFormatConfigPtr>(&attributes); - switch (dataType) { - case EDataType::Structured: - return std::unique_ptr<IFlushableYsonConsumer>(new TDsvNodeConsumer(output, config)); - - case EDataType::Tabular: - case EDataType::Binary: - case EDataType::Null: - THROW_ERROR_EXCEPTION("Data type %Qlv is not supported by DSV", - dataType); - - default: - YT_ABORT(); - }; -} - -class TTableParserAdapter - : public IParser -{ -public: - TTableParserAdapter( - const TFormat& format, - std::vector<IValueConsumer*> valueConsumers, - int tableIndex) - : TableConsumer_(new TTableConsumer( - TYsonConverterConfig{ - .ComplexTypeMode = format.Attributes().Get("complex_type_mode", EComplexTypeMode::Named), - .StringKeyedDictMode = format.Attributes().Get("string_keyed_dict_mode", EDictMode::Positional), - .DecimalMode = format.Attributes().Get("decimal_mode", 
EDecimalMode::Binary), - .TimeMode = format.Attributes().Get("time_mode", ETimeMode::Binary), - .UuidMode = format.Attributes().Get("uuid_mode", EUuidMode::Binary), - }, - valueConsumers, - tableIndex)) - , Parser_(CreateParserForFormat( - format, - EDataType::Tabular, - TableConsumer_.get())) - { } - - void Read(TStringBuf data) override - { - Parser_->Read(data); - } - - void Finish() override - { - Parser_->Finish(); - } - -private: - const std::unique_ptr<IYsonConsumer> TableConsumer_; - const std::unique_ptr<IParser> Parser_; -}; - -} // namespace - -std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForFormat( - const TFormat& format, - EDataType dataType, - IZeroCopyOutput* output) -{ - switch (format.GetType()) { - case EFormatType::Yson: - return CreateConsumerForYson(dataType, format.Attributes(), output); - case EFormatType::Json: - return CreateConsumerForJson(dataType, format.Attributes(), output); - case EFormatType::Dsv: - return CreateConsumerForDsv(dataType, format.Attributes(), output); - default: - THROW_ERROR_EXCEPTION("Unsupported output format %Qlv", - format.GetType()); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -template <class TWriter, class TConsumerAdapter> -TIntrusivePtr<TWriter> CreateAdaptedWriterForYson( - const IAttributeDictionary& attributes, - TTableSchemaPtr schema, - IAsyncOutputStreamPtr output) -{ - auto config = ConvertTo<TYsonFormatConfigPtr>(&attributes); - return New<TConsumerAdapter>(std::move(output), std::move(schema), [=] (IZeroCopyOutput* buffer) { - if (config->Format == EYsonFormat::Binary) { - return std::unique_ptr<IFlushableYsonConsumer>(new TBufferedBinaryYsonWriter( - buffer, - EYsonType::ListFragment, - true)); - } else { - return std::unique_ptr<IFlushableYsonConsumer>(new TYsonWriter( - buffer, - config->Format, - EYsonType::ListFragment)); - } - }); -} - -template <class TWriter, class TConsumerAdapter> -TIntrusivePtr<TWriter> CreateAdaptedWriterForJson( 
- const IAttributeDictionary& attributes, - TTableSchemaPtr schema, - IAsyncOutputStreamPtr output) -{ - auto config = ConvertTo<TJsonFormatConfigPtr>(&attributes); - return New<TConsumerAdapter>(std::move(output), std::move(schema), [&] (IOutputStream* buffer) { - return CreateJsonConsumer(buffer, EYsonType::ListFragment, config); - }); -} - -IUnversionedRowsetWriterPtr CreateSchemafulWriterForFormat( - const TFormat& format, - TTableSchemaPtr schema, - IAsyncOutputStreamPtr output) -{ - switch (format.GetType()) { - case EFormatType::Yson: - return CreateAdaptedWriterForYson<IUnversionedRowsetWriter, TSchemafulWriter>(format.Attributes(), std::move(schema), std::move(output)); - case EFormatType::Json: - return CreateAdaptedWriterForJson<IUnversionedRowsetWriter, TSchemafulWriter>(format.Attributes(), std::move(schema), std::move(output)); - case EFormatType::SchemafulDsv: - return CreateSchemafulWriterForSchemafulDsv(format.Attributes(), std::move(schema), std::move(output)); - case EFormatType::WebJson: { - auto webJsonFormatConfig = ConvertTo<TWebJsonFormatConfigPtr>(&format.Attributes()); - webJsonFormatConfig->SkipSystemColumns = false; - - return CreateWriterForWebJson( - std::move(webJsonFormatConfig), - TNameTable::FromSchema(*schema), - {schema}, - std::move(output)); - } - default: - THROW_ERROR_EXCEPTION("Unsupported output format %Qlv", - format.GetType()); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -IVersionedWriterPtr CreateVersionedWriterForFormat( - const TFormat& format, - NTableClient::TTableSchemaPtr schema, - NConcurrency::IAsyncOutputStreamPtr output) -{ - switch (format.GetType()) { - case EFormatType::Yson: - return CreateAdaptedWriterForYson<IVersionedWriter, TVersionedWriter>(format.Attributes(), std::move(schema), std::move(output)); - case EFormatType::Json: - return CreateAdaptedWriterForJson<IVersionedWriter, TVersionedWriter>(format.Attributes(), std::move(schema), std::move(output)); 
- default: - THROW_ERROR_EXCEPTION("Unsupported output format %Qlv", format.GetType()); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -ISchemalessFormatWriterPtr CreateStaticTableWriterForFormat( - const TFormat& format, - TNameTablePtr nameTable, - const std::vector<TTableSchemaPtr>& tableSchemas, - NConcurrency::IAsyncOutputStreamPtr output, - bool enableContextSaving, - TControlAttributesConfigPtr controlAttributesConfig, - int keyColumnCount) -{ - switch (format.GetType()) { - case EFormatType::Dsv: - return CreateSchemalessWriterForDsv( - format.Attributes(), - nameTable, - std::move(output), - enableContextSaving, - controlAttributesConfig, - keyColumnCount); - case EFormatType::Yamr: - return CreateSchemalessWriterForYamr( - format.Attributes(), - nameTable, - std::move(output), - enableContextSaving, - controlAttributesConfig, - keyColumnCount); - case EFormatType::YamredDsv: - return CreateSchemalessWriterForYamredDsv( - format.Attributes(), - nameTable, - std::move(output), - enableContextSaving, - controlAttributesConfig, - keyColumnCount); - case EFormatType::SchemafulDsv: - return CreateSchemalessWriterForSchemafulDsv( - format.Attributes(), - nameTable, - std::move(output), - enableContextSaving, - controlAttributesConfig, - keyColumnCount); - case EFormatType::Protobuf: - return CreateWriterForProtobuf( - format.Attributes(), - tableSchemas, - nameTable, - std::move(output), - enableContextSaving, - controlAttributesConfig, - keyColumnCount); - case EFormatType::WebJson: - return CreateWriterForWebJson( - format.Attributes(), - nameTable, - tableSchemas, - std::move(output)); - case EFormatType::Skiff: - return CreateWriterForSkiff( - format.Attributes(), - nameTable, - tableSchemas, - std::move(output), - enableContextSaving, - controlAttributesConfig, - keyColumnCount); - default: - auto adapter = New<TSchemalessWriterAdapter>( - nameTable, - std::move(output), - enableContextSaving, - 
controlAttributesConfig, - keyColumnCount); - adapter->Init(tableSchemas, format); - return adapter; - } -} - -//////////////////////////////////////////////////////////////////////////////// - -TYsonProducer CreateProducerForDsv( - EDataType dataType, - const IAttributeDictionary& attributes, - IInputStream* input) -{ - if (dataType != EDataType::Tabular) { - THROW_ERROR_EXCEPTION("DSV is supported only for tabular data"); - } - auto config = ConvertTo<TDsvFormatConfigPtr>(&attributes); - return BIND([=] (IYsonConsumer* consumer) { - ParseDsv(input, consumer, config); - }); -} - -TYsonProducer CreateProducerForYamr( - EDataType dataType, - const IAttributeDictionary& attributes, - IInputStream* input) -{ - if (dataType != EDataType::Tabular) { - THROW_ERROR_EXCEPTION("YAMR is supported only for tabular data"); - } - auto config = ConvertTo<TYamrFormatConfigPtr>(&attributes); - return BIND([=] (IYsonConsumer* consumer) { - ParseYamr(input, consumer, config); - }); -} - -TYsonProducer CreateProducerForYamredDsv( - EDataType dataType, - const IAttributeDictionary& attributes, - IInputStream* input) -{ - if (dataType != EDataType::Tabular) { - THROW_ERROR_EXCEPTION("Yamred DSV is supported only for tabular data"); - } - auto config = ConvertTo<TYamredDsvFormatConfigPtr>(&attributes); - return BIND([=] (IYsonConsumer* consumer) { - ParseYamredDsv(input, consumer, config); - }); -} - -TYsonProducer CreateProducerForSchemafulDsv( - EDataType dataType, - const IAttributeDictionary& attributes, - IInputStream* input) -{ - if (dataType != EDataType::Tabular) { - THROW_ERROR_EXCEPTION("Schemaful DSV is supported only for tabular data"); - } - auto config = ConvertTo<TSchemafulDsvFormatConfigPtr>(&attributes); - return BIND([=] (IYsonConsumer* consumer) { - ParseSchemafulDsv(input, consumer, config); - }); -} - -TYsonProducer CreateProducerForJson( - EDataType dataType, - const IAttributeDictionary& attributes, - IInputStream* input) -{ - auto ysonType = 
DataTypeToYsonType(dataType); - auto config = ConvertTo<TJsonFormatConfigPtr>(&attributes); - return BIND([=] (IYsonConsumer* consumer) { - ParseJson(input, consumer, config, ysonType); - }); -} - -TYsonProducer CreateProducerForYson(EDataType dataType, IInputStream* input) -{ - auto ysonType = DataTypeToYsonType(dataType); - return ConvertToProducer(TYsonInput(input, ysonType)); -} - -TYsonProducer CreateProducerForFormat(const TFormat& format, EDataType dataType, IInputStream* input) -{ - switch (format.GetType()) { - case EFormatType::Yson: - return CreateProducerForYson(dataType, input); - case EFormatType::Json: - return CreateProducerForJson(dataType, format.Attributes(), input); - case EFormatType::Dsv: - return CreateProducerForDsv(dataType, format.Attributes(), input); - case EFormatType::Yamr: - return CreateProducerForYamr(dataType, format.Attributes(), input); - case EFormatType::YamredDsv: - return CreateProducerForYamredDsv(dataType, format.Attributes(), input); - case EFormatType::SchemafulDsv: - return CreateProducerForSchemafulDsv(dataType, format.Attributes(), input); - default: - THROW_ERROR_EXCEPTION("Unsupported input format %Qlv", - format.GetType()); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -template<class TBase> -struct TParserAdapter - : public TBase - , public IParser -{ -public: - template<class... TArgs> - TParserAdapter(TArgs&&... args) - : TBase(std::forward<TArgs>(args)...) 
- { } - - void Read(TStringBuf data) override - { - TBase::Read(data); - } - - void Finish() override - { - TBase::Finish(); - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -std::unique_ptr<IParser> CreateParserForFormat(const TFormat& format, EDataType dataType, IYsonConsumer* consumer) -{ - switch (format.GetType()) { - case EFormatType::Yson: - return CreateParserForYson(consumer, DataTypeToYsonType(dataType)); - case EFormatType::Json: { - auto config = ConvertTo<TJsonFormatConfigPtr>(&format.Attributes()); - return std::unique_ptr<IParser>(new TParserAdapter<TJsonParser>(consumer, config, DataTypeToYsonType(dataType))); - } - case EFormatType::Dsv: { - auto config = ConvertTo<TDsvFormatConfigPtr>(&format.Attributes()); - return CreateParserForDsv(consumer, config); - } - case EFormatType::Yamr: { - auto config = ConvertTo<TYamrFormatConfigPtr>(&format.Attributes()); - return CreateParserForYamr(consumer, config); - } - case EFormatType::YamredDsv: { - auto config = ConvertTo<TYamredDsvFormatConfigPtr>(&format.Attributes()); - return CreateParserForYamredDsv(consumer, config); - } - case EFormatType::SchemafulDsv: { - auto config = ConvertTo<TSchemafulDsvFormatConfigPtr>(&format.Attributes()); - return CreateParserForSchemafulDsv(consumer, config); - } - default: - THROW_ERROR_EXCEPTION("Unsupported input format %Qlv", - format.GetType()); - } -} - -std::vector<std::unique_ptr<IParser>> CreateParsersForFormat( - const TFormat& format, - const std::vector<IValueConsumer*>& valueConsumers) -{ - std::vector<std::unique_ptr<IParser>> parsers; - - auto parserCount = std::ssize(valueConsumers); - parsers.reserve(parserCount); - - switch (format.GetType()) { - case EFormatType::Protobuf: { - auto config = ConvertTo<TProtobufFormatConfigPtr>(&format.Attributes()); - // TODO(max42): implementation of CreateParserForProtobuf clones config - // on each call, so this loop works in quadratic time. Fix that. 
- for (int tableIndex = 0; tableIndex < parserCount; ++tableIndex) { - parsers.emplace_back(CreateParserForProtobuf(valueConsumers[tableIndex], config, tableIndex)); - } - break; - } - case EFormatType::Skiff: { - auto config = ConvertTo<TSkiffFormatConfigPtr>(&format.Attributes()); - auto skiffSchemas = ParseSkiffSchemas(config->SkiffSchemaRegistry, config->TableSkiffSchemas); - for (int tableIndex = 0; tableIndex < parserCount; ++tableIndex) { - parsers.emplace_back(CreateParserForSkiff(valueConsumers[tableIndex], skiffSchemas, config, tableIndex)); - } - break; - } - default: - for (int tableIndex = 0; tableIndex < parserCount; ++tableIndex) { - parsers.emplace_back(std::make_unique<TTableParserAdapter>(format, valueConsumers, tableIndex)); - } - break; - } - - return parsers; -} - -std::unique_ptr<IParser> CreateParserForFormat( - const TFormat& format, - IValueConsumer* valueConsumer) -{ - auto parsers = CreateParsersForFormat(format, {valueConsumer}); - return std::move(parsers.front()); -} - -//////////////////////////////////////////////////////////////////////////////// - -void ConfigureEscapeTable(const TSchemafulDsvFormatConfigPtr& config, TEscapeTable* escapeTable) -{ - std::vector<char> stopSymbols = {config->RecordSeparator, config->FieldSeparator}; - if (config->EnableEscaping) { - stopSymbols.push_back(config->EscapingSymbol); - escapeTable->EscapingSymbol = config->EscapingSymbol; - } - escapeTable->FillStops(stopSymbols); -} - -void ConfigureEscapeTables( - const TDsvFormatConfigBasePtr& config, - bool addCarriageReturn, - TEscapeTable* keyEscapeTable, - TEscapeTable* valueEscapeTable) -{ - std::vector<char> stopSymbols = {config->RecordSeparator, config->FieldSeparator, '\0'}; - - if (config->EnableEscaping) { - stopSymbols.push_back(config->EscapingSymbol); - keyEscapeTable->EscapingSymbol = valueEscapeTable->EscapingSymbol = config->EscapingSymbol; - } - - if (addCarriageReturn) { - stopSymbols.push_back('\r'); - } - - 
valueEscapeTable->FillStops(stopSymbols); - - stopSymbols.push_back(config->KeyValueSeparator); - keyEscapeTable->FillStops(stopSymbols); -} - -void ConfigureEscapeTables( - const TYamrFormatConfigBasePtr& config, - bool enableKeyEscaping, - bool enableValueEscaping, - bool escapingForWriter, - TEscapeTable* keyEscapeTable, - TEscapeTable* valueEscapeTable) -{ - std::vector<char> valueStopSymbols = {config->RecordSeparator}; - std::vector<char> keyStopSymbols = {config->RecordSeparator, config->FieldSeparator}; - - if (enableKeyEscaping) { - if (escapingForWriter) { - keyStopSymbols.push_back('\0'); - keyStopSymbols.push_back('\r'); - } - keyStopSymbols.push_back(config->EscapingSymbol); - keyEscapeTable->EscapingSymbol = config->EscapingSymbol; - } - - if (enableValueEscaping) { - if (escapingForWriter) { - valueStopSymbols.push_back('\0'); - valueStopSymbols.push_back('\r'); - } - valueStopSymbols.push_back(config->EscapingSymbol); - valueEscapeTable->EscapingSymbol = config->EscapingSymbol; - } - - keyEscapeTable->FillStops(keyStopSymbols); - valueEscapeTable->FillStops(valueStopSymbols); -} - -//////////////////////////////////////////////////////////////////////////////// - } // namespace NYT::NFormats diff --git a/yt/yt/client/formats/format.h b/yt/yt/client/formats/format.h index 752939aebe..749f428471 100644 --- a/yt/yt/client/formats/format.h +++ b/yt/yt/client/formats/format.h @@ -41,92 +41,4 @@ void Deserialize(TFormat& value, NYson::TYsonPullParserCursor* cursor); //////////////////////////////////////////////////////////////////////////////// -struct ISchemalessFormatWriter - : public NTableClient::IUnversionedRowsetWriter -{ - virtual TBlob GetContext() const = 0; - - virtual i64 GetWrittenSize() const = 0; - - [[nodiscard]] virtual TFuture<void> Flush() = 0; - - virtual bool WriteBatch(NTableClient::IUnversionedRowBatchPtr rowBatch) = 0; -}; - -DEFINE_REFCOUNTED_TYPE(ISchemalessFormatWriter) - 
-//////////////////////////////////////////////////////////////////////////////// - -// This function historically creates format for reading dynamic tables. -// It slightly differs from format for static tables. :( -NTableClient::IUnversionedRowsetWriterPtr CreateSchemafulWriterForFormat( - const TFormat& Format, - NTableClient::TTableSchemaPtr schema, - NConcurrency::IAsyncOutputStreamPtr output); - -//////////////////////////////////////////////////////////////////////////////// - -NTableClient::IVersionedWriterPtr CreateVersionedWriterForFormat( - const TFormat& Format, - NTableClient::TTableSchemaPtr schema, - NConcurrency::IAsyncOutputStreamPtr output); - -//////////////////////////////////////////////////////////////////////////////// - -ISchemalessFormatWriterPtr CreateStaticTableWriterForFormat( - const TFormat& format, - NTableClient::TNameTablePtr nameTable, - const std::vector<NTableClient::TTableSchemaPtr>& tableSchemas, - NConcurrency::IAsyncOutputStreamPtr output, - bool enableContextSaving, - TControlAttributesConfigPtr controlAttributesConfig, - int keyColumnCount); - -//////////////////////////////////////////////////////////////////////////////// - -std::unique_ptr<NYson::IFlushableYsonConsumer> CreateConsumerForFormat( - const TFormat& format, - EDataType dataType, - IZeroCopyOutput* output); - -NYson::TYsonProducer CreateProducerForFormat( - const TFormat& format, - EDataType dataType, - IInputStream* input); - -std::unique_ptr<IParser> CreateParserForFormat( - const TFormat& format, - EDataType dataType, - NYson::IYsonConsumer* consumer); - -//! Create own parser for each value consumer. -std::vector<std::unique_ptr<IParser>> CreateParsersForFormat( - const TFormat& format, - const std::vector<NTableClient::IValueConsumer*>& valueConsumers); - -//! Create parser for value consumer. Helper for previous method in singular case. 
-std::unique_ptr<IParser> CreateParserForFormat( - const TFormat& format, - NTableClient::IValueConsumer* valueConsumer); - -//////////////////////////////////////////////////////////////////////////////// - -void ConfigureEscapeTable(const TSchemafulDsvFormatConfigPtr& config, TEscapeTable* escapeTable); - -void ConfigureEscapeTables( - const TDsvFormatConfigBasePtr& config, - bool addCarriageReturn, - TEscapeTable* keyEscapeTable, - TEscapeTable* valueEscapeTable); - -void ConfigureEscapeTables( - const TYamrFormatConfigBasePtr& config, - bool enableKeyEscaping, - bool enableValueEscaping, - bool escapingForWriter, - TEscapeTable* keyEscapeTable, - TEscapeTable* valueEscapeTable); - -//////////////////////////////////////////////////////////////////////////////// - } // namespace NYT::NFormats diff --git a/yt/yt/client/formats/ya.make b/yt/yt/client/formats/ya.make index 18eb0e8384..14efe5d5c4 100644 --- a/yt/yt/client/formats/ya.make +++ b/yt/yt/client/formats/ya.make @@ -4,35 +4,8 @@ INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) SRCS( config.cpp - dsv_parser.cpp - dsv_writer.cpp - escape.cpp format.cpp - helpers.cpp parser.cpp - protobuf.cpp - protobuf_options.cpp - protobuf_parser.cpp - protobuf_writer.cpp - schemaful_dsv_parser.cpp - schemaful_dsv_writer.cpp - schemaful_writer.cpp - schemaless_writer_adapter.cpp - web_json_writer.cpp - skiff_parser.cpp - skiff_yson_converter.cpp - skiff_writer.cpp - unversioned_value_yson_writer.cpp - versioned_writer.cpp - yamr_parser.cpp - yamr_parser_base.cpp - yamr_writer.cpp - yamr_writer_base.cpp - yamred_dsv_parser.cpp - yamred_dsv_writer.cpp - yson_parser.cpp - yson_map_to_unversioned_value.cpp - yql_yson_converter.cpp ) PEERDIR( diff --git a/yt/yt/client/table_client/adapters.h b/yt/yt/client/table_client/adapters.h index b2baeb3486..9cafefb49e 100644 --- a/yt/yt/client/table_client/adapters.h +++ b/yt/yt/client/table_client/adapters.h @@ -5,7 +5,7 @@ #include <yt/yt/client/api/table_reader.h> -#include 
<yt/yt/client/formats/format.h> +#include <yt/yt/library/formats/format.h> #include <yt/yt/core/concurrency/async_stream.h> diff --git a/yt/yt/client/table_client/unittests/serialization_ut.cpp b/yt/yt/client/table_client/unittests/serialization_ut.cpp index ed1c64a113..261b24cb1b 100644 --- a/yt/yt/client/table_client/unittests/serialization_ut.cpp +++ b/yt/yt/client/table_client/unittests/serialization_ut.cpp @@ -1,6 +1,7 @@ -#include <yt/yt/client/formats/format.h> #include <yt/yt/client/table_client/schema.h> +#include <yt/yt/library/formats/format.h> + #include <yt/yt/core/misc/blob_output.h> #include <yt/yt/core/ytree/convert.h> #include <yt/yt/core/test_framework/framework.h> diff --git a/yt/yt/client/table_client/unittests/ya.make b/yt/yt/client/table_client/unittests/ya.make index 87dff43ad2..d0a82ab469 100644 --- a/yt/yt/client/table_client/unittests/ya.make +++ b/yt/yt/client/table_client/unittests/ya.make @@ -14,7 +14,7 @@ INCLUDE(${ARCADIA_ROOT}/yt/opensource_tests.inc) PEERDIR( yt/yt/client - yt/yt/client/formats + yt/yt/library/formats yt/yt/client/table_client/unittests/helpers yt/yt/client/unittests/mock yt/yt/core/test_framework diff --git a/yt/yt/client/unittests/check_type_compatibility_ut.cpp b/yt/yt/client/unittests/check_type_compatibility_ut.cpp index 488ee42800..57b91cb3ea 100644 --- a/yt/yt/client/unittests/check_type_compatibility_ut.cpp +++ b/yt/yt/client/unittests/check_type_compatibility_ut.cpp @@ -1,4 +1,4 @@ -#include "logical_type_shortcuts.h" +#include <yt/yt/library/logical_type_shortcuts/logical_type_shortcuts.h> #include <yt/yt/core/test_framework/framework.h> diff --git a/yt/yt/client/unittests/dsv_parser_ut.cpp b/yt/yt/client/unittests/dsv_parser_ut.cpp deleted file mode 100644 index 0a0c724f9e..0000000000 --- a/yt/yt/client/unittests/dsv_parser_ut.cpp +++ /dev/null @@ -1,365 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/core/test_framework/yson_consumer_mock.h> - -#include 
<yt/yt/client/formats/dsv_parser.h> - -namespace NYT::NFormats { -namespace { - -using namespace NYson; - -using ::testing::InSequence; -using ::testing::StrictMock; -using ::testing::NiceMock; - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TDsvParserTest, Simple) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("integer")); - EXPECT_CALL(Mock, OnStringScalar("42")); - EXPECT_CALL(Mock, OnKeyedItem("string")); - EXPECT_CALL(Mock, OnStringScalar("some")); - EXPECT_CALL(Mock, OnKeyedItem("double")); - EXPECT_CALL(Mock, OnStringScalar("10")); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("foo")); - EXPECT_CALL(Mock, OnStringScalar("bar")); - EXPECT_CALL(Mock, OnKeyedItem("one")); - EXPECT_CALL(Mock, OnStringScalar("1")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = - "integer=42\tstring=some\tdouble=10\n" - "foo=bar\tone=1\n"; - ParseDsv(input, &Mock); -} - -TEST(TDsvParserTest, EmptyInput) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - TString input = ""; - ParseDsv(input, &Mock); -} - -TEST(TDsvParserTest, BinaryData) -{ - StrictMock<TMockYsonConsumer> Mock; - - auto a = TString("\0\0\0\0", 4); - auto b = TString("\x80\0\x16\xC8", 4); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("ntr")); - EXPECT_CALL(Mock, OnStringScalar(a)); - EXPECT_CALL(Mock, OnKeyedItem("xrp")); - EXPECT_CALL(Mock, OnStringScalar(b)); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = "ntr=\\0\\0\\0\\0\txrp=\x80\\0\x16\xC8\n"; - ParseDsv(input, &Mock); -} - -TEST(TDsvParserTest, EmptyRecord) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnEndMap()); - - 
TString input = "\n"; - ParseDsv(input, &Mock); -} - -TEST(TDsvParserTest, EmptyRecords) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = "\n\n"; - ParseDsv(input, &Mock); -} - -TEST(TDsvParserTest, EmptyKeysAndValues) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("")); - EXPECT_CALL(Mock, OnStringScalar("")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = "=\n"; - ParseDsv(input, &Mock); -} - -TEST(TDsvParserTest, UnescapedZeroInInput) -{ - StrictMock<TMockYsonConsumer> Mock; - - TString input = TString("a\0b=v", 5); - EXPECT_ANY_THROW( - ParseDsv(input, &Mock); - ); -} - -TEST(TDsvParserTest, ZerosAreNotTerminals) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - TString key = TString("a\0b", 3); - TString value = TString("c\0d", 3); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem(key)); - EXPECT_CALL(Mock, OnStringScalar(value)); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = "a\\0b=c\\0d\n"; - ParseDsv(input, &Mock); -} - -TEST(TDsvParserTest, UnterminatedRecord) -{ - NiceMock<TMockYsonConsumer> Mock; - - TString input = "a=b"; - EXPECT_ANY_THROW( - ParseDsv(input, &Mock); - ); -} - -//////////////////////////////////////////////////////////////////////////////// - -class TTskvParserTest: public ::testing::Test -{ -public: - StrictMock<TMockYsonConsumer> Mock; - NiceMock<TMockYsonConsumer> ErrorMock; - - TDsvFormatConfigPtr Config; - - void SetUp() override { - Config = New<TDsvFormatConfig>(); - Config->LinePrefix = "tskv"; - } -}; - -TEST_F(TTskvParserTest, Simple) -{ - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); 
- EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("id")); - EXPECT_CALL(Mock, OnStringScalar("1")); - EXPECT_CALL(Mock, OnKeyedItem("guid")); - EXPECT_CALL(Mock, OnStringScalar("100500")); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("id")); - EXPECT_CALL(Mock, OnStringScalar("2")); - EXPECT_CALL(Mock, OnKeyedItem("guid")); - EXPECT_CALL(Mock, OnStringScalar("20025")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = - "tskv\n" - "tskv\tid=1\tguid=100500\t\n" - "tskv\tid=2\tguid=20025\n"; - ParseDsv(input, &Mock, Config); -} - -TEST_F(TTskvParserTest, SimpleWithNewLine) -{ - InSequence dummy; - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("foo")); - EXPECT_CALL(Mock, OnStringScalar("bar")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = "tskv\tfoo=bar\n"; - ParseDsv(input, &Mock, Config); -} - -TEST_F(TTskvParserTest, Escaping) -{ - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a=b")); - EXPECT_CALL(Mock, OnStringScalar("c=d or e=f")); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key_with_\t,\r_and_\n")); - EXPECT_CALL(Mock, OnStringScalar("value_with_\t,\\_and_\r\n")); - EXPECT_CALL(Mock, OnKeyedItem("another_key")); - EXPECT_CALL(Mock, OnStringScalar("another_value")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = - "t\\s\\kv\n" - "tskv" "\t" "a\\=b" "=" "c\\=d or e=f" "\n" // Note: unescaping is less strict - "tskv" "\t" - "key_with_\\t,\r_and_\\n" - "=" - "value_with_\\t,\\\\_and_\\r\\n" - "\t" - 
"an\\other_\\key=anoth\\er_v\\alue" - "\n"; - - ParseDsv(input, &Mock, Config); -} - -TEST_F(TTskvParserTest, DisabledEscaping) -{ - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a\\")); - EXPECT_CALL(Mock, OnStringScalar("b\\t=c\\=d or e=f\\0")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = - "tskv\t\\x\\y\n" - "tskv" "\t" "a\\=b\\t" "=" "c\\=d or e=f\\0" "\n"; - - Config->EnableEscaping = false; - - ParseDsv(input, &Mock, Config); -} - -TEST_F(TTskvParserTest, AllowedUnescapedSymbols) -{ - Config->LinePrefix = "prefix_with_="; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("just_key")); - EXPECT_CALL(Mock, OnStringScalar("value_with_=")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = "prefix_with_=" "\t" "just_key" "=" "value_with_=" "\n"; - ParseDsv(input, &Mock, Config); -} - -TEST_F(TTskvParserTest, UndefinedValues) -{ - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("b")); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = - "tskv" "\t" "tskv" "\t" "tskv" "\n" - "tskv\t" "some_key" "\t\t\t" "a=b" "\t" "another_key" "\n" // Note: consequent \t - "tskv\n"; - ParseDsv(input, &Mock, Config); -} - - -TEST_F(TTskvParserTest, OnlyLinePrefix) -{ - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = "tskv\n"; - ParseDsv(input, &Mock, Config); -} - -TEST_F(TTskvParserTest, OnlyLinePrefixAndTab) -{ - 
InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = "tskv\t\n"; - ParseDsv(input, &Mock, Config); -} - -TEST_F(TTskvParserTest, NotFinishedLinePrefix) -{ - TString input = "tsk"; - - EXPECT_ANY_THROW( - ParseDsv(input, &ErrorMock, Config) - ); -} - -TEST_F(TTskvParserTest, WrongLinePrefix) -{ - TString input = - "tskv\ta=b\n" - "tZkv\tc=d\te=f\n" - "tskv\ta=b\n"; - - EXPECT_ANY_THROW( - ParseDsv(input, &ErrorMock, Config); - ); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NDriver diff --git a/yt/yt/client/unittests/dsv_writer_ut.cpp b/yt/yt/client/unittests/dsv_writer_ut.cpp deleted file mode 100644 index b5f96caacd..0000000000 --- a/yt/yt/client/unittests/dsv_writer_ut.cpp +++ /dev/null @@ -1,316 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/client/formats/dsv_parser.h> -#include <yt/yt/client/formats/dsv_writer.h> - -#include <yt/yt/client/table_client/name_table.h> -#include <yt/yt/client/table_client/unversioned_row.h> - -#include <yt/yt/core/concurrency/async_stream.h> - -namespace NYT::NFormats { -namespace { - -using namespace NYTree; -using namespace NYson; -using namespace NConcurrency; -using namespace NTableClient; - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TDsvWriterTest, StringScalar) -{ - TStringStream outputStream; - TDsvNodeConsumer consumer(&outputStream); - - consumer.OnStringScalar("0-2-xb-1234"); - EXPECT_EQ("0-2-xb-1234", outputStream.Str()); -} - -TEST(TDsvWriterTest, ListContainingDifferentTypes) -{ - TStringStream outputStream; - TDsvNodeConsumer consumer(&outputStream); - - consumer.OnBeginList(); - consumer.OnListItem(); - consumer.OnInt64Scalar(100); - consumer.OnListItem(); - consumer.OnStringScalar("foo"); - consumer.OnListItem(); - consumer.OnListItem(); - consumer.OnBeginMap(); - 
consumer.OnKeyedItem("a"); - consumer.OnStringScalar("10"); - consumer.OnKeyedItem("b"); - consumer.OnStringScalar("c"); - consumer.OnEndMap(); - consumer.OnEndList(); - - TString output = - "100\n" - "foo\n" - "\n" - "a=10\tb=c\n"; - - EXPECT_EQ(output, outputStream.Str()); -} - -TEST(TDsvWriterTest, ListInsideList) -{ - TStringStream outputStream; - TDsvNodeConsumer consumer(&outputStream); - - consumer.OnBeginList(); - consumer.OnListItem(); - EXPECT_ANY_THROW(consumer.OnBeginList()); -} - -TEST(TDsvWriterTest, ListInsideMap) -{ - TStringStream outputStream; - TDsvNodeConsumer consumer(&outputStream); - - consumer.OnBeginMap(); - consumer.OnKeyedItem("foo"); - EXPECT_ANY_THROW(consumer.OnBeginList()); -} - -TEST(TDsvWriterTest, MapInsideMap) -{ - TStringStream outputStream; - TDsvNodeConsumer consumer(&outputStream); - - consumer.OnBeginMap(); - consumer.OnKeyedItem("foo"); - EXPECT_ANY_THROW(consumer.OnBeginMap()); -} - -TEST(TDsvWriterTest, WithoutEsacping) -{ - auto config = New<TDsvFormatConfig>(); - config->EnableEscaping = false; - - TStringStream outputStream; - TDsvNodeConsumer consumer(&outputStream, config); - - consumer.OnStringScalar("string_with_\t_\\_=_and_\n"); - - TString output = "string_with_\t_\\_=_and_\n"; - - EXPECT_EQ(output, outputStream.Str()); -} - -TEST(TDsvWriterTest, ListUsingOnRaw) -{ - TStringStream outputStream; - TDsvNodeConsumer consumer(&outputStream); - - consumer.OnRaw("[10; 20; 30]", EYsonType::Node); - TString output = - "10\n" - "20\n" - "30\n"; - - EXPECT_EQ(output, outputStream.Str()); -} - -TEST(TDsvWriterTest, MapUsingOnRaw) -{ - TStringStream outputStream; - TDsvNodeConsumer consumer(&outputStream); - - consumer.OnRaw("{a=b; c=d}", EYsonType::Node); - TString output = "a=b\tc=d"; - - EXPECT_EQ(output, outputStream.Str()); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TDsvWriterTest, SimpleTabular) -{ - auto nameTable = New<TNameTable>(); - auto integerId = 
nameTable->RegisterName("integer"); - auto stringId = nameTable->RegisterName("string"); - auto doubleId = nameTable->RegisterName("double"); - auto fooId = nameTable->RegisterName("foo"); - auto oneId = nameTable->RegisterName("one"); - auto tableIndexId = nameTable->RegisterName(TableIndexColumnName); - auto rowIndexId = nameTable->RegisterName(RowIndexColumnName); - auto rangeIndexId = nameTable->RegisterName(RangeIndexColumnName); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedInt64Value(42, integerId)); - row1.AddValue(MakeUnversionedStringValue("some", stringId)); - row1.AddValue(MakeUnversionedDoubleValue(10., doubleId)); - row1.AddValue(MakeUnversionedInt64Value(2, tableIndexId)); - row1.AddValue(MakeUnversionedInt64Value(42, rowIndexId)); - row1.AddValue(MakeUnversionedInt64Value(1, rangeIndexId)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("bar", fooId)); - row2.AddValue(MakeUnversionedSentinelValue(EValueType::Null, integerId)); - row2.AddValue(MakeUnversionedInt64Value(1, oneId)); - row2.AddValue(MakeUnversionedInt64Value(2, tableIndexId)); - row2.AddValue(MakeUnversionedInt64Value(43, rowIndexId)); - - std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow()}; - - TStringStream outputStream; - auto config = New<TDsvFormatConfig>(); - config->EnableTableIndex = true; - - auto controlAttributes = New<TControlAttributesConfig>(); - controlAttributes->EnableTableIndex = true; - auto writer = CreateSchemalessWriterForDsv( - config, - nameTable, - CreateAsyncAdapter(static_cast<IOutputStream*>(&outputStream)), - false, - controlAttributes, - 0); - - EXPECT_EQ(true, writer->Write(rows)); - writer->Close() - .Get() - .ThrowOnError(); - - TString output = - "integer=42\tstring=some\tdouble=10.\t@table_index=2\n" - "foo=bar\tone=1\t@table_index=2\n"; - EXPECT_EQ(output, outputStream.Str()); -} - -TEST(TDsvWriterTest, AnyTabular) -{ - auto nameTable = New<TNameTable>(); - auto anyId = 
nameTable->RegisterName("any"); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedAnyValue("[]", anyId)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - TStringStream outputStream; - auto controlAttributes = New<TControlAttributesConfig>(); - auto writer = CreateSchemalessWriterForDsv( - New<TDsvFormatConfig>(), - nameTable, - CreateAsyncAdapter(static_cast<IOutputStream*>(&outputStream)), - false, - controlAttributes, - 0); - - EXPECT_FALSE(writer->Write(rows)); - EXPECT_ANY_THROW(writer->GetReadyEvent().Get().ThrowOnError()); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TTskvWriterTest, SimpleTabular) -{ - auto nameTable = New<TNameTable>(); - auto id1 = nameTable->RegisterName("id"); - auto id2 = nameTable->RegisterName("guid"); - auto tableIndexId = nameTable->RegisterName(TableIndexColumnName); - auto rowIndexId = nameTable->RegisterName(RowIndexColumnName); - auto rangeIndexId = nameTable->RegisterName(RangeIndexColumnName); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedInt64Value(2, tableIndexId)); - row1.AddValue(MakeUnversionedInt64Value(42, rowIndexId)); - row1.AddValue(MakeUnversionedInt64Value(1, rangeIndexId)); - - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("1", id1)); - row2.AddValue(MakeUnversionedInt64Value(100500, id2)); - - TUnversionedRowBuilder row3; - row3.AddValue(MakeUnversionedStringValue("2", id1)); - row3.AddValue(MakeUnversionedInt64Value(20025, id2)); - - std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow(), row3.GetRow() }; - - TStringStream outputStream; - auto config = New<TDsvFormatConfig>(); - config->LinePrefix = "tskv"; - - auto controlAttributes = New<TControlAttributesConfig>(); - auto writer = CreateSchemalessWriterForDsv( - config, - nameTable, - CreateAsyncAdapter(static_cast<IOutputStream*>(&outputStream)), - false, - controlAttributes, - 0); - - EXPECT_EQ(true, writer->Write(rows)); - 
writer->Close() - .Get() - .ThrowOnError(); - - TString output = - "tskv\n" - "tskv\tid=1\tguid=100500\n" - "tskv\tid=2\tguid=20025\n"; - - EXPECT_EQ(output, outputStream.Str()); -} - -TEST(TTskvWriterTest, Escaping) -{ - auto key1 = TString("\0 is escaped", 12); - - auto nameTable = New<TNameTable>(); - auto id1 = nameTable->RegisterName(key1); - auto id2 = nameTable->RegisterName("Escaping in in key: \r \t \n \\ ="); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedStringValue(key1, id1)); - row.AddValue(MakeUnversionedStringValue("Escaping in value: \r \t \n \\ =", id2)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - TStringStream outputStream; - auto config = New<TDsvFormatConfig>(); - config->LinePrefix = "tskv"; - - auto controlAttributes = New<TControlAttributesConfig>(); - auto writer = CreateSchemalessWriterForDsv( - config, - nameTable, - CreateAsyncAdapter(static_cast<IOutputStream*>(&outputStream)), - false, - controlAttributes, - 0); - - EXPECT_EQ(true, writer->Write(rows)); - writer->Close() - .Get() - .ThrowOnError(); - - TString output = - "tskv" - "\t" - - "\\0 is escaped" - "=" - "\\0 is escaped" - - "\t" - - "Escaping in in key: \\r \\t \\n \\\\ \\=" - "=" - "Escaping in value: \\r \\t \\n \\\\ =" // Note: = is not escaped - - "\n"; - - EXPECT_EQ(output, outputStream.Str()); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NFormats diff --git a/yt/yt/client/unittests/format_writer_ut.h b/yt/yt/client/unittests/format_writer_ut.h deleted file mode 100644 index 4680090755..0000000000 --- a/yt/yt/client/unittests/format_writer_ut.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include <yt/yt/client/formats/format.h> - -#include <yt/yt/client/table_client/name_table.h> -#include <yt/yt/client/table_client/unversioned_row.h> - -namespace NYT::NFormats { -namespace { - 
-//////////////////////////////////////////////////////////////////////////////// - -void TestNameTableExpansion(ISchemalessFormatWriterPtr writer, NTableClient::TNameTablePtr nameTable) -{ - // We write five rows, on each iteration we double number of - // columns in the NameTable. - for (int iteration = 0; iteration < 5; ++iteration) { - NTableClient::TUnversionedOwningRowBuilder row; - for (int index = 0; index < (1 << iteration); ++index) { - auto key = "Column" + ToString(index); - auto value = "Value" + ToString(index); - int columnId = nameTable->GetIdOrRegisterName(key); - row.AddValue(NTableClient::MakeUnversionedStringValue(value, columnId)); - } - auto completeRow = row.FinishRow(); - EXPECT_EQ(true, writer->Write({completeRow.Get()})); - } - writer->Close() - .Get() - .ThrowOnError(); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NFormats diff --git a/yt/yt/client/unittests/logical_type_ut.cpp b/yt/yt/client/unittests/logical_type_ut.cpp index 0cf744313e..f6b98ae638 100644 --- a/yt/yt/client/unittests/logical_type_ut.cpp +++ b/yt/yt/client/unittests/logical_type_ut.cpp @@ -1,4 +1,4 @@ -#include "logical_type_shortcuts.h" +#include <yt/yt/library/logical_type_shortcuts/logical_type_shortcuts.h> #include <yt/yt/core/test_framework/framework.h> diff --git a/yt/yt/client/unittests/protobuf_format_ut.cpp b/yt/yt/client/unittests/protobuf_format_ut.cpp deleted file mode 100644 index af9d2a0155..0000000000 --- a/yt/yt/client/unittests/protobuf_format_ut.cpp +++ /dev/null @@ -1,4657 +0,0 @@ -#include "row_helpers.h" -#include "yson_helpers.h" -#include "yt/yt/client/table_client/public.h" - -#include <yt/yt/client/unittests/protobuf_format_ut.pb.h> - -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/core/concurrency/async_stream.h> -#include <yt/yt/core/json/json_parser.h> -#include <yt/yt/core/yson/string.h> -#include <yt/yt/core/ytree/fluent.h> - -#include 
<yt/yt/client/formats/config.h> -#include <yt/yt/client/formats/parser.h> -#include <yt/yt/client/formats/lenval_control_constants.h> -#include <yt/yt/client/formats/protobuf_writer.h> -#include <yt/yt/client/formats/protobuf_parser.h> -#include <yt/yt/client/formats/protobuf.h> -#include <yt/yt/client/formats/format.h> -#include <yt/yt/client/table_client/logical_type.h> -#include <yt/yt/client/table_client/name_table.h> -#include <yt/yt/client/table_client/value_consumer.h> -#include <yt/yt/client/table_client/unversioned_row.h> - -#include <yt/yt/library/named_value/named_value.h> - -#include <util/random/fast.h> - -#include <google/protobuf/text_format.h> -#include <google/protobuf/descriptor.h> -#include <google/protobuf/descriptor.pb.h> - -using namespace std::string_view_literals; - - -namespace NYT { -namespace { - -using namespace NYson; -using namespace NYTree; -using namespace NFormats; -using namespace NTableClient; -using namespace NConcurrency; -using namespace NProtobufFormatTest; - -using ::google::protobuf::FileDescriptor; -using NNamedValue::MakeRow; - -//////////////////////////////////////////////////////////////////////////////// - -DEFINE_ENUM(EProtoFormatType, - (FileDescriptorLegacy) - (FileDescriptor) - (Structured) -); - -//////////////////////////////////////////////////////////////////////////////// - -#define EXPECT_NODES_EQUAL(a, b) \ - EXPECT_TRUE(AreNodesEqual((a), (b))) \ - << #a ": " << ConvertToYsonString((a), EYsonFormat::Text).ToString() \ - << "\n\n" #b ": " << ConvertToYsonString((b), EYsonFormat::Text).ToString(); - -//////////////////////////////////////////////////////////////////////////////// - -TString ConvertToTextYson(const INodePtr& node) -{ - return ConvertToYsonString(node, EYsonFormat::Text).ToString(); -} - -// Hardcoded serialization of file descriptor used in old format description. 
-TString FileDescriptorLegacy = "\x0a\xb6\x03\x0a\x29\x6a\x75\x6e\x6b\x2f\x65\x72\x6d\x6f\x6c\x6f\x76\x64\x2f\x74\x65\x73\x74\x2d\x70\x72\x6f\x74\x6f\x62" - "\x75\x66\x2f\x6d\x65\x73\x73\x61\x67\x65\x2e\x70\x72\x6f\x74\x6f\x22\x2d\x0a\x0f\x54\x45\x6d\x62\x65\x64\x65\x64\x4d\x65\x73\x73\x61\x67\x65\x12" - "\x0b\x0a\x03\x4b\x65\x79\x18\x01\x20\x01\x28\x09\x12\x0d\x0a\x05\x56\x61\x6c\x75\x65\x18\x02\x20\x01\x28\x09\x22\xb3\x02\x0a\x08\x54\x4d\x65\x73" - "\x73\x61\x67\x65\x12\x0e\x0a\x06\x44\x6f\x75\x62\x6c\x65\x18\x01\x20\x01\x28\x01\x12\x0d\x0a\x05\x46\x6c\x6f\x61\x74\x18\x02\x20\x01\x28\x02\x12" - "\x0d\x0a\x05\x49\x6e\x74\x36\x34\x18\x03\x20\x01\x28\x03\x12\x0e\x0a\x06\x55\x49\x6e\x74\x36\x34\x18\x04\x20\x01\x28\x04\x12\x0e\x0a\x06\x53\x49" - "\x6e\x74\x36\x34\x18\x05\x20\x01\x28\x12\x12\x0f\x0a\x07\x46\x69\x78\x65\x64\x36\x34\x18\x06\x20\x01\x28\x06\x12\x10\x0a\x08\x53\x46\x69\x78\x65" - "\x64\x36\x34\x18\x07\x20\x01\x28\x10\x12\x0d\x0a\x05\x49\x6e\x74\x33\x32\x18\x08\x20\x01\x28\x05\x12\x0e\x0a\x06\x55\x49\x6e\x74\x33\x32\x18\x09" - "\x20\x01\x28\x0d\x12\x0e\x0a\x06\x53\x49\x6e\x74\x33\x32\x18\x0a\x20\x01\x28\x11\x12\x0f\x0a\x07\x46\x69\x78\x65\x64\x33\x32\x18\x0b\x20\x01\x28" - "\x07\x12\x10\x0a\x08\x53\x46\x69\x78\x65\x64\x33\x32\x18\x0c\x20\x01\x28\x0f\x12\x0c\x0a\x04\x42\x6f\x6f\x6c\x18\x0d\x20\x01\x28\x08\x12\x0e\x0a" - "\x06\x53\x74\x72\x69\x6e\x67\x18\x0e\x20\x01\x28\x09\x12\x0d\x0a\x05\x42\x79\x74\x65\x73\x18\x0f\x20\x01\x28\x0c\x12\x14\x0a\x04\x45\x6e\x75\x6d" - "\x18\x10\x20\x01\x28\x0e\x32\x06\x2e\x45\x45\x6e\x75\x6d\x12\x21\x0a\x07\x4d\x65\x73\x73\x61\x67\x65\x18\x11\x20\x01\x28\x0b\x32\x10\x2e\x54\x45" - "\x6d\x62\x65\x64\x65\x64\x4d\x65\x73\x73\x61\x67\x65\x2a\x24\x0a\x05\x45\x45\x6e\x75\x6d\x12\x07\x0a\x03\x4f\x6e\x65\x10\x01\x12\x07\x0a\x03\x54" - "\x77\x6f\x10\x02\x12\x09\x0a\x05\x54\x68\x72\x65\x65\x10\x03"; - -TString GenerateRandomLenvalString(TFastRng64& rng, ui32 size) -{ - TString result; - result.append(reinterpret_cast<const 
char*>(&size), sizeof(size)); - - size += sizeof(ui32); - - while (result.size() < size) { - ui64 num = rng.GenRand(); - result.append(reinterpret_cast<const char*>(&num), sizeof(num)); - } - if (result.size() > size) { - result.resize(size); - } - return result; -} - -static TProtobufFormatConfigPtr MakeProtobufFormatConfig(const std::vector<const ::google::protobuf::Descriptor*>& descriptorList) -{ - ::google::protobuf::FileDescriptorSet fileDescriptorSet; - THashSet<const ::google::protobuf::FileDescriptor*> files; - - std::function<void(const ::google::protobuf::FileDescriptor*)> addFile; - addFile = [&] (const ::google::protobuf::FileDescriptor* fileDescriptor) { - if (!files.insert(fileDescriptor).second) { - return; - } - - // N.B. We want to write dependencies in fileDescriptorSet in topological order - // so we traverse dependencies first and the add current fileDescriptor. - for (int i = 0; i < fileDescriptor->dependency_count(); ++i) { - addFile(fileDescriptor->dependency(i)); - } - fileDescriptor->CopyTo(fileDescriptorSet.add_file()); - }; - std::vector<TString> typeNames; - - for (const auto* descriptor : descriptorList) { - addFile(descriptor->file()); - typeNames.push_back(descriptor->full_name()); - } - - auto formatConfigYsonString = BuildYsonStringFluently() - .BeginMap() - .Item("file_descriptor_set_text").Value(fileDescriptorSet.ShortDebugString()) - .Item("type_names").Value(typeNames) - .EndMap(); - - return ConvertTo<TProtobufFormatConfigPtr>(formatConfigYsonString); -} - -INodePtr ParseYson(TStringBuf data) -{ - return ConvertToNode(NYson::TYsonString(TString{data})); -} - -TString LenvalBytes(const ::google::protobuf::Message& message) -{ - TStringStream out; - ui32 messageSize = static_cast<ui32>(message.ByteSizeLong()); - out.Write(&messageSize, sizeof(messageSize)); - if (!message.SerializeToArcadiaStream(&out)) { - THROW_ERROR_EXCEPTION("Can not serialize message"); - } - return out.Str(); -} - -void EnsureTypesMatch(EValueType 
expected, EValueType actual) -{ - if (expected != actual) { - THROW_ERROR_EXCEPTION("Mismatching type: expected %Qlv, actual %Qlv", - expected, - actual); - } -} - -double GetDouble(const TUnversionedValue& row) -{ - EnsureTypesMatch(EValueType::Double, row.Type); - return row.Data.Double; -} - -template <typename TMessage> -TCollectingValueConsumer ParseRows( - const TMessage& message, - const TProtobufFormatConfigPtr& config, - const TTableSchemaPtr& schema = New<TTableSchema>(), - int count = 1) -{ - TString lenvalBytes; - TStringOutput out(lenvalBytes); - auto messageSize = static_cast<ui32>(message.ByteSize()); - for (int i = 0; i < count; ++i) { - out.Write(&messageSize, sizeof(messageSize)); - if (!message.SerializeToArcadiaStream(&out)) { - THROW_ERROR_EXCEPTION("Failed to serialize message"); - } - } - - TCollectingValueConsumer rowCollector(schema); - auto parser = CreateParserForProtobuf(&rowCollector, config, 0); - parser->Read(lenvalBytes); - parser->Finish(); - if (static_cast<ssize_t>(rowCollector.Size()) != count) { - THROW_ERROR_EXCEPTION("rowCollector has wrong size: expected %v, actual %v", - count, - rowCollector.Size()); - } - return rowCollector; -} - -template <typename TMessage> -TCollectingValueConsumer ParseRows( - const TMessage& message, - const INodePtr& config, - const TTableSchemaPtr& schema = New<TTableSchema>(), - int count = 1) -{ - return ParseRows(message, ConvertTo<TProtobufFormatConfigPtr>(config->Attributes().ToMap()), schema, count); -} - - -void AddDependencies( - const FileDescriptor* fileDescriptor, - std::vector<const FileDescriptor*>& fileDescriptors, - THashSet<const FileDescriptor*>& fileDescriptorSet) -{ - if (fileDescriptorSet.contains(fileDescriptor)) { - return; - } - fileDescriptorSet.insert(fileDescriptor); - for (int i = 0; i < fileDescriptor->dependency_count(); ++i) { - AddDependencies(fileDescriptor->dependency(i), fileDescriptors, fileDescriptorSet); - } - fileDescriptors.push_back(fileDescriptor); -} - 
-template <typename ... Ts> -INodePtr CreateFileDescriptorConfig(std::optional<EComplexTypeMode> complexTypeMode = {}) -{ - std::vector<const FileDescriptor*> fileDescriptors; - THashSet<const FileDescriptor*> fileDescriptorSet; - std::vector<const FileDescriptor*> originalFileDescriptors = {Ts::descriptor()->file()...}; - - for (auto d : originalFileDescriptors) { - AddDependencies(d, fileDescriptors, fileDescriptorSet); - } - - ::google::protobuf::FileDescriptorSet fileDescriptorSetProto; - for (auto fileDescriptor : fileDescriptors) { - fileDescriptor->CopyTo(fileDescriptorSetProto.add_file()); - } - TString fileDescriptorSetText; - ::google::protobuf::TextFormat::Printer().PrintToString(fileDescriptorSetProto, &fileDescriptorSetText); - std::vector<TString> typeNames = {Ts::descriptor()->full_name()...}; - return BuildYsonNodeFluently() - .BeginAttributes() - .Item("file_descriptor_set_text").Value(fileDescriptorSetText) - .Item("type_names").Value(typeNames) - .OptionalItem("complex_type_mode", complexTypeMode) - .EndAttributes() - .Value("protobuf"); -} - -static const auto EnumerationsConfig = BuildYsonNodeFluently() - .BeginMap() - .Item("EEnum") - .BeginMap() - .Item("One").Value(1) - .Item("Two").Value(2) - .Item("Three").Value(3) - .Item("MinusFortyTwo").Value(-42) - .Item("MaxInt32").Value(std::numeric_limits<int>::max()) - .Item("MinInt32").Value(std::numeric_limits<int>::min()) - .EndMap() - .EndMap(); - -INodePtr CreateAllFieldsConfig(EProtoFormatType protoFormatType) -{ - switch (protoFormatType) { - case EProtoFormatType::FileDescriptor: - return CreateFileDescriptorConfig<TMessage>(); - case EProtoFormatType::FileDescriptorLegacy: - return BuildYsonNodeFluently() - .BeginAttributes() - .Item("file_descriptor_set") - .Value(FileDescriptorLegacy) - .Item("file_indices") - .BeginList() - .Item().Value(0) - .EndList() - .Item("message_indices") - .BeginList() - .Item().Value(1) - .EndList() - .EndAttributes() - .Value("protobuf"); - case 
EProtoFormatType::Structured: - return BuildYsonNodeFluently() - .BeginAttributes() - .Item("enumerations").Value(EnumerationsConfig) - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("Double") - .Item("field_number").Value(1) - .Item("proto_type").Value("double") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("Float") - .Item("field_number").Value(2) - .Item("proto_type").Value("float") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("Int64") - .Item("field_number").Value(3) - .Item("proto_type").Value("int64") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("UInt64") - .Item("field_number").Value(4) - .Item("proto_type").Value("uint64") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("SInt64") - .Item("field_number").Value(5) - .Item("proto_type").Value("sint64") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("Fixed64") - .Item("field_number").Value(6) - .Item("proto_type").Value("fixed64") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("SFixed64") - .Item("field_number").Value(7) - .Item("proto_type").Value("sfixed64") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("Int32") - .Item("field_number").Value(8) - .Item("proto_type").Value("int32") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("UInt32") - .Item("field_number").Value(9) - .Item("proto_type").Value("uint32") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("SInt32") - .Item("field_number").Value(10) - .Item("proto_type").Value("sint32") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("Fixed32") - .Item("field_number").Value(11) - .Item("proto_type").Value("fixed32") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("SFixed32") - .Item("field_number").Value(12) - .Item("proto_type").Value("sfixed32") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("Bool") - 
.Item("field_number").Value(13) - .Item("proto_type").Value("bool") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("String") - .Item("field_number").Value(14) - .Item("proto_type").Value("string") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("Bytes") - .Item("field_number").Value(15) - .Item("proto_type").Value("bytes") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("Enum") - .Item("field_number").Value(16) - .Item("proto_type").Value("enum_string") - .Item("enumeration_name").Value("EEnum") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("Message") - .Item("field_number").Value(17) - .Item("proto_type").Value("message") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("AnyWithMap") - .Item("field_number").Value(18) - .Item("proto_type").Value("any") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("AnyWithInt64") - .Item("field_number").Value(19) - .Item("proto_type").Value("any") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("AnyWithString") - .Item("field_number").Value(20) - .Item("proto_type").Value("any") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("OtherColumns") - .Item("field_number").Value(21) - .Item("proto_type").Value("other_columns") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("MissingInt64") - .Item("field_number").Value(22) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndAttributes() - .Value("protobuf"); - } - Y_FAIL(); -} - -//////////////////////////////////////////////////////////////////////////////// - -struct TLenvalEntry -{ - TString RowData; - ui32 TableIndex; - ui64 TabletIndex; -}; - -//////////////////////////////////////////////////////////////////////////////// - -class TLenvalParser -{ -public: - explicit TLenvalParser(IInputStream* input) - : Input_(input) - { } - - explicit TLenvalParser(TStringBuf input) - : StreamHolder_(std::make_unique<TMemoryInput>(input)) - , 
Input_(StreamHolder_.get()) - { } - - std::optional<TLenvalEntry> Next() - { - ui32 rowSize; - size_t read = Input_->Load(&rowSize, sizeof(rowSize)); - if (read == 0) { - return std::nullopt; - } else if (read < sizeof(rowSize)) { - THROW_ERROR_EXCEPTION("corrupted lenval: can't read row length"); - } - switch (rowSize) { - case LenvalTableIndexMarker: { - ui32 tableIndex; - read = Input_->Load(&tableIndex, sizeof(tableIndex)); - if (read != sizeof(tableIndex)) { - THROW_ERROR_EXCEPTION("corrupted lenval: can't read table index"); - } - CurrentTableIndex_ = tableIndex; - return Next(); - } - case LenvalTabletIndexMarker: { - ui64 tabletIndex; - read = Input_->Load(&tabletIndex, sizeof(tabletIndex)); - if (read != sizeof(tabletIndex)) { - THROW_ERROR_EXCEPTION("corrupted lenval: can't read tablet index"); - } - CurrentTabletIndex_ = tabletIndex; - return Next(); - } - case LenvalEndOfStream: - EndOfStream_ = true; - return std::nullopt; - case LenvalKeySwitch: - case LenvalRangeIndexMarker: - case LenvalRowIndexMarker: - THROW_ERROR_EXCEPTION("marker is unsupported"); - default: { - TLenvalEntry result; - result.RowData.resize(rowSize); - result.TableIndex = CurrentTableIndex_; - result.TabletIndex = CurrentTabletIndex_; - Input_->Load(result.RowData.Detach(), rowSize); - - return result; - } - } - } - - bool IsEndOfStream() const - { - return EndOfStream_; - } - -private: - std::unique_ptr<IInputStream> StreamHolder_; - IInputStream* Input_; - ui32 CurrentTableIndex_ = 0; - ui64 CurrentTabletIndex_ = 0; - bool EndOfStream_ = false; -}; - -//////////////////////////////////////////////////////////////////////////////// - -namespace { - -TProtobufFormatConfigPtr ParseAndValidateConfig(const INodePtr& node, std::vector<TTableSchemaPtr> schemas = {}) -{ - auto config = ConvertTo<TProtobufFormatConfigPtr>(node); - if (schemas.empty()) { - schemas.assign(config->Tables.size(), New<TTableSchema>()); - } - New<TProtobufParserFormatDescription>()->Init(config, schemas); - 
New<TProtobufWriterFormatDescription>()->Init(config, schemas); - return config; -} - -} // namespace - -INodePtr BuildEmbeddedConfig(EComplexTypeMode complexTypeMode, EProtoFormatType formatType) { - if (formatType == EProtoFormatType::FileDescriptor) { - return CreateFileDescriptorConfig<NYT::TEmbeddingMessage>(complexTypeMode); - } - - auto config = BuildYsonNodeFluently() - .BeginAttributes() - .Item("tables").BeginList() - .Item().BeginMap() - .Item("columns").BeginList() - .Item().BeginMap() - .Item("name").Value("*") - .Item("field_number").Value(2) - .Item("proto_type").Value("embedded_message") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("other_columns_field") - .Item("field_number").Value(15) - .Item("proto_type").Value("other_columns") - .EndMap() - .Item().BeginMap() - .Item("name").Value("embedded_num") - .Item("field_number").Value(10) - .Item("proto_type").Value("uint64") - .EndMap() - .Item().BeginMap() - .Item("name").Value("embedded_extra_field") - .Item("field_number").Value(11) - .Item("proto_type").Value("string") - .EndMap() - .Item().BeginMap() - .Item("name").Value("variant") - .Item("proto_type").Value("oneof") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("str_variant") - .Item("field_number").Value(101) - .Item("proto_type").Value("string") - .EndMap() - .Item().BeginMap() - .Item("name").Value("uint_variant") - .Item("field_number").Value(102) - .Item("proto_type").Value("uint64") - .EndMap() - .EndList() - .EndMap() - .Item().BeginMap() - .Item("name").Value("*") - .Item("field_number").Value(1) - .Item("proto_type").Value("embedded_message") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("embedded2_num") - .Item("field_number").Value(10) - .Item("proto_type").Value("uint64") - .EndMap() - .Item().BeginMap() - .Item("name").Value("embedded2_struct") - .Item("field_number").Value(17) - .Item("proto_type").Value("structured_message") - 
.Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("float1") - .Item("field_number").Value(1) - .Item("proto_type").Value("float") - .EndMap() - .Item().BeginMap() - .Item("name").Value("string1") - .Item("field_number").Value(2) - .Item("proto_type").Value("string") - .EndMap() - .EndList() - .EndMap() - .Item().BeginMap() - .Item("name").Value("embedded2_repeated") - .Item("field_number").Value(42) - .Item("proto_type").Value("string") - .Item("repeated").Value(true) - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .Item().BeginMap() - .Item("name").Value("num") - .Item("field_number").Value(12) - .Item("proto_type").Value("uint64") - .EndMap() - .Item().BeginMap() - .Item("name").Value("extra_field") - .Item("field_number").Value(13) - .Item("proto_type").Value("string") - .EndMap() - .EndList() - .EndMap() - .EndList() - .Item("complex_type_mode").Value(complexTypeMode) - .EndAttributes() - .Value("protobuf"); - return config; -} - -TTableSchemaPtr BuildEmbeddedSchema() { - auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ - {"num", SimpleLogicalType(ESimpleLogicalValueType::Uint64)}, - {"embedded_num", SimpleLogicalType(ESimpleLogicalValueType::Uint64)}, - {"variant", VariantStructLogicalType({ - {"str_variant", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"uint_variant", SimpleLogicalType(ESimpleLogicalValueType::Uint64)}, - })}, - {"extra_column", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Uint64))}, - {"embedded2_num", SimpleLogicalType(ESimpleLogicalValueType::Uint64)}, - {"embedded2_struct", StructLogicalType({ - {"float1", SimpleLogicalType(ESimpleLogicalValueType::Float)}, - {"string1", SimpleLogicalType(ESimpleLogicalValueType::String)}, - })}, - {"embedded2_repeated", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - {"other_complex_field", StructLogicalType({ - {"one", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"two", 
OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"three", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - })}, - {"extra_int", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - - }); - return schema; -} - -TEST(TProtobufFormat, TestConfigParsingEmbedded) { - auto config = BuildEmbeddedConfig(EComplexTypeMode::Positional, EProtoFormatType::Structured); - auto schema = BuildEmbeddedSchema(); - - EXPECT_NO_THROW( - ParseAndValidateConfig(config->Attributes().ToMap(), {schema}) - ); -} - -TEST(TProtobufFormat, TestConfigParsing) -{ - // Empty config. - EXPECT_THROW_WITH_SUBSTRING( - ParseAndValidateConfig(ParseYson("{}")), - "one of \"tables\", \"file_descriptor_set\" and \"file_descriptor_set_text\" must be specified"); - - // Broken protobuf. - EXPECT_THROW_WITH_SUBSTRING( - ParseAndValidateConfig(ParseYson(R"({file_descriptor_set="dfgxx"; file_indices=[0]; message_indices=[0]})")), - "Error parsing \"file_descriptor_set\" in protobuf config"); - - EXPECT_NO_THROW(ParseAndValidateConfig( - CreateAllFieldsConfig(EProtoFormatType::Structured)->Attributes().ToMap() - )); - - EXPECT_NO_THROW(ParseAndValidateConfig( - CreateAllFieldsConfig(EProtoFormatType::FileDescriptorLegacy)->Attributes().ToMap() - )); - - EXPECT_NO_THROW(ParseAndValidateConfig( - CreateAllFieldsConfig(EProtoFormatType::FileDescriptor)->Attributes().ToMap() - )); - - auto embeddedInsideNonembeddedConfig = BuildYsonNodeFluently() - .BeginMap() - .Item("tables").BeginList() - .Item().BeginMap() - .Item("columns").BeginList() - .Item().BeginMap() - .Item("name").Value("embedded_message1") - .Item("field_number").Value(1) - .Item("proto_type").Value("embedded_message") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("field1") - .Item("field_number").Value(2) - .Item("proto_type").Value("structured_message") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("embedded_message2") - 
.Item("field_number").Value(3) - .Item("proto_type").Value("embedded_message") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("field2") - .Item("field_number").Value(4) - .Item("proto_type").Value("string") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - auto schemaForEmbedded = New<TTableSchema>(std::vector{ - TColumnSchema("field1", StructLogicalType({ - {"embedded_message2", StructLogicalType({ - {"field2", SimpleLogicalType(ESimpleLogicalValueType::String)}, - })}, - })) - }); - - EXPECT_THROW_WITH_SUBSTRING( - ParseAndValidateConfig(embeddedInsideNonembeddedConfig, {schemaForEmbedded}), - "embedded_message inside of structured_message is not allowed"); - - auto repeatedEmbeddedConfig = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("*") - .Item("field_number").Value(1) - .Item("proto_type").Value("embedded_message") - .Item("repeated").Value(true) - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("field1") - .Item("field_number").Value(1) - .Item("proto_type").Value("uint64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - EXPECT_THROW_WITH_SUBSTRING( - ParseAndValidateConfig(repeatedEmbeddedConfig), - R"(type "embedded_message" can not be repeated)"); - - auto multipleOtherColumnsConfig = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("Other1") - .Item("field_number").Value(1) - .Item("proto_type").Value("other_columns") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("Other2") - .Item("field_number").Value(2) - .Item("proto_type").Value("other_columns") - .EndMap() - .EndList() - .EndMap() - .EndList() - 
.EndMap(); - - EXPECT_THROW_WITH_SUBSTRING( - ParseAndValidateConfig(multipleOtherColumnsConfig), - "Multiple \"other_columns\" in protobuf config are not allowed"); - - auto duplicateColumnNamesConfig = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("SomeColumn") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("SomeColumn") - .Item("field_number").Value(2) - .Item("proto_type").Value("string") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - EXPECT_THROW_WITH_SUBSTRING( - ParseAndValidateConfig(duplicateColumnNamesConfig), - "Multiple fields with same column name \"SomeColumn\" are forbidden in protobuf format"); - - auto anyCorrespondsToStruct = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("SomeColumn") - .Item("field_number").Value(1) - .Item("proto_type").Value("any") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - auto schema = New<TTableSchema>(std::vector{ - TColumnSchema("SomeColumn", StructLogicalType({})), - }); - - EXPECT_THROW_WITH_SUBSTRING( - ParseAndValidateConfig(anyCorrespondsToStruct, {schema}), - "Table schema and protobuf format config mismatch"); - - auto configWithBytes = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("SomeColumn") - .Item("field_number").Value(1) - .Item("proto_type").Value("bytes") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - auto schemaWithUtf8 = New<TTableSchema>(std::vector{ - TColumnSchema("SomeColumn", SimpleLogicalType(ESimpleLogicalValueType::Utf8)), - }); - - EXPECT_THROW_WITH_SUBSTRING( - 
ParseAndValidateConfig(configWithBytes, {schemaWithUtf8}), - "mismatch: expected logical type to be one of"); - - auto configWithPackedNonRepeated = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("SomeColumn") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .Item("packed").Value(true) - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - auto schemaWithInt64List = New<TTableSchema>(std::vector<TColumnSchema>{ - {"SomeColumn", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - }); - EXPECT_THROW_WITH_SUBSTRING( - ParseAndValidateConfig(configWithPackedNonRepeated, {schemaWithInt64List}), - "Field \"SomeColumn\" is marked \"packed\" but is not marked \"repeated\""); - - auto configWithPackedRepeatedString = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("SomeColumn") - .Item("field_number").Value(1) - .Item("proto_type").Value("string") - .Item("packed").Value(true) - .Item("repeated").Value(true) - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - auto schemaWithStringList = New<TTableSchema>(std::vector{ - TColumnSchema("SomeColumn", ListLogicalType( - SimpleLogicalType(ESimpleLogicalValueType::String))) - }); - - EXPECT_THROW_WITH_SUBSTRING( - ParseAndValidateConfig(configWithPackedRepeatedString, {schemaWithStringList}), - "packed protobuf field must have primitive numeric type, got \"string\""); - - auto configWithMissingFieldNumber = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("SomeColumn") - .Item("proto_type").Value("string") - .Item("repeated").Value(true) - .EndMap() - .EndList() - .EndMap() - .EndList() - 
.EndMap(); - - EXPECT_THROW_WITH_SUBSTRING( - ParseAndValidateConfig(configWithMissingFieldNumber, {schemaWithStringList}), - "\"field_number\" is required"); -} - -TEST(TProtobufFormat, TestParseBigZigZag) -{ - constexpr i32 value = Min<i32>(); - TMessage message; - message.set_int32_field(value); - auto config = ConvertTo<TProtobufFormatConfigPtr>(CreateAllFieldsConfig(EProtoFormatType::Structured)->Attributes().ToMap()); - auto rowCollector = ParseRows(message, config); - EXPECT_EQ(GetInt64(rowCollector.GetRowValue(0, "Int32")), value); -} - -TEST(TProtobufFormat, TestParseEnumerationString) -{ - auto config = ConvertTo<TProtobufFormatConfigPtr>(CreateAllFieldsConfig(EProtoFormatType::Structured)->Attributes().ToMap()); - { - TMessage message; - message.set_enum_field(EEnum::One); - auto rowCollector = ParseRows(message, config); - EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "Enum")), "One"); - } - { - TMessage message; - message.set_enum_field(EEnum::Two); - auto rowCollector = ParseRows(message, config); - EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "Enum")), "Two"); - } - { - TMessage message; - message.set_enum_field(EEnum::Three); - auto rowCollector = ParseRows(message, config); - EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "Enum")), "Three"); - } - { - TMessage message; - message.set_enum_field(EEnum::MinusFortyTwo); - auto rowCollector = ParseRows(message, config); - EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "Enum")), "MinusFortyTwo"); - } -} - -TEST(TProtobufFormat, TestParseWrongEnumeration) -{ - auto config = ConvertTo<TProtobufFormatConfigPtr>(CreateAllFieldsConfig(EProtoFormatType::Structured)->Attributes().ToMap()); - TMessage message; - auto enumTag = TMessage::descriptor()->FindFieldByName("enum_field")->number(); - message.mutable_unknown_fields()->AddVarint(enumTag, 30); - EXPECT_ANY_THROW(ParseRows(message, config)); -} - -TEST(TProtobufFormat, TestParseEnumerationInt) -{ - TCollectingValueConsumer rowCollector; - - auto 
config = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("Enum") - .Item("field_number").Value(16) - .Item("proto_type").Value("enum_int") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - auto parser = CreateParserForProtobuf(&rowCollector, ConvertTo<TProtobufFormatConfigPtr>(config), 0); - - { - TMessage message; - message.set_enum_field(EEnum::One); - parser->Read(LenvalBytes(message)); - } - { - TMessage message; - message.set_enum_field(EEnum::Two); - parser->Read(LenvalBytes(message)); - } - { - TMessage message; - message.set_enum_field(EEnum::Three); - parser->Read(LenvalBytes(message)); - } - { - TMessage message; - message.set_enum_field(EEnum::MinusFortyTwo); - parser->Read(LenvalBytes(message)); - } - { - TMessage message; - auto enumTag = TMessage::descriptor()->FindFieldByName("enum_field")->number(); - message.mutable_unknown_fields()->AddVarint(enumTag, 100500); - parser->Read(LenvalBytes(message)); - } - - parser->Finish(); - - EXPECT_EQ(GetInt64(rowCollector.GetRowValue(0, "Enum")), 1); - EXPECT_EQ(GetInt64(rowCollector.GetRowValue(1, "Enum")), 2); - EXPECT_EQ(GetInt64(rowCollector.GetRowValue(2, "Enum")), 3); - EXPECT_EQ(GetInt64(rowCollector.GetRowValue(3, "Enum")), -42); - EXPECT_EQ(GetInt64(rowCollector.GetRowValue(4, "Enum")), 100500); -} - -TEST(TProtobufFormat, TestParseRandomGarbage) -{ - // Check that we never crash. - - TFastRng64 rng(42); - for (int i = 0; i != 1000; ++i) { - auto bytes = GenerateRandomLenvalString(rng, 8); - - TCollectingValueConsumer rowCollector; - auto parser = CreateParserForProtobuf( - &rowCollector, - ConvertTo<TProtobufFormatConfigPtr>(CreateAllFieldsConfig(EProtoFormatType::Structured)->Attributes().ToMap()), - 0); - try { - parser->Read(bytes); - parser->Finish(); - } catch (...) 
{ - } - } -} - -TEST(TProtobufFormat, TestParseZeroColumns) -{ - auto config = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - TCollectingValueConsumer rowCollector; - auto parser = CreateParserForProtobuf( - &rowCollector, - ConvertTo<TProtobufFormatConfigPtr>(config), - 0); - - // Empty lenval values. - parser->Read("\0\0\0\0"sv); - parser->Read("\0\0\0\0"sv); - - parser->Finish(); - - ASSERT_EQ(static_cast<ssize_t>(rowCollector.Size()), 2); - EXPECT_EQ(static_cast<int>(rowCollector.GetRow(0).GetCount()), 0); - EXPECT_EQ(static_cast<int>(rowCollector.GetRow(1).GetCount()), 0); -} - -TEST(TProtobufFormat, TestWriteEnumerationString) -{ - auto config = CreateAllFieldsConfig(EProtoFormatType::Structured); - - auto nameTable = New<TNameTable>(); - - TString result; - TStringOutput resultStream(result); - auto writer = CreateWriterForProtobuf( - config->Attributes(), - {New<TTableSchema>()}, - nameTable, - CreateAsyncAdapter(&resultStream), - true, - New<TControlAttributesConfig>(), - 0); - - writer->Write({ - MakeRow(nameTable, { - {"Enum", "MinusFortyTwo"} - }).Get() - }); - writer->Write({ - MakeRow(nameTable, { - {"Enum", "Three"}, - }).Get() - }); - - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput si(result); - TLenvalParser parser(&si); - { - auto row = parser.Next(); - ASSERT_TRUE(row); - NYT::TMessage message; - ASSERT_TRUE(message.ParseFromString(row->RowData)); - ASSERT_EQ(message.enum_field(), NYT::EEnum::MinusFortyTwo); - } - { - auto row = parser.Next(); - ASSERT_TRUE(row); - NYT::TMessage message; - ASSERT_TRUE(message.ParseFromString(row->RowData)); - ASSERT_EQ(message.enum_field(), NYT::EEnum::Three); - } - { - auto row = parser.Next(); - ASSERT_FALSE(row); - } -} - -TEST(TProtobufFormat, TestWriteEnumerationInt) -{ - auto config = BuildYsonNodeFluently() - .BeginAttributes() - .Item("tables") - 
.BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("Enum") - .Item("field_number").Value(16) - .Item("proto_type").Value("enum_int") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndAttributes() - .Value("protobuf"); - - auto nameTable = New<TNameTable>(); - - auto writeAndParseRow = [&] (TUnversionedRow row, TMessage* message) { - TString result; - TStringOutput resultStream(result); - auto writer = CreateWriterForProtobuf( - config->Attributes(), - {New<TTableSchema>()}, - nameTable, - CreateAsyncAdapter(&resultStream), - true, - New<TControlAttributesConfig>(), - 0); - writer->Write({row}); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput si(result); - TLenvalParser parser(&si); - auto protoRow = parser.Next(); - ASSERT_TRUE(protoRow); - - ASSERT_TRUE(message->ParseFromString(protoRow->RowData)); - - auto nextProtoRow = parser.Next(); - ASSERT_FALSE(nextProtoRow); - }; - - { - TMessage message; - writeAndParseRow( - MakeRow(nameTable, { - {"Enum", -42}, - }).Get(), - &message); - ASSERT_EQ(message.enum_field(), EEnum::MinusFortyTwo); - } - { - TMessage message; - writeAndParseRow( - MakeRow(nameTable, { - {"Enum", static_cast<ui64>(std::numeric_limits<i32>::max())}, - }).Get(), - &message); - ASSERT_EQ(message.enum_field(), EEnum::MaxInt32); - } - { - TMessage message; - writeAndParseRow( - MakeRow(nameTable, { - {"Enum", std::numeric_limits<i32>::max()}, - }).Get(), - &message); - ASSERT_EQ(message.enum_field(), EEnum::MaxInt32); - } - { - TMessage message; - writeAndParseRow( - MakeRow(nameTable, { - {"Enum", std::numeric_limits<i32>::min()}, - }).Get(), - &message); - ASSERT_EQ(message.enum_field(), EEnum::MinInt32); - } - - TMessage message; - ASSERT_THROW( - writeAndParseRow( - MakeRow(nameTable, { - {"Enum", static_cast<i64>(std::numeric_limits<i32>::max()) + 1}, - }).Get(), - &message), - TErrorException); - - ASSERT_THROW( - writeAndParseRow( - MakeRow(nameTable, { - 
{"Enum", static_cast<i64>(std::numeric_limits<i32>::min()) - 1}, - }).Get(), - &message), - TErrorException); - - ASSERT_THROW( - writeAndParseRow( - MakeRow(nameTable, { - {"Enum", static_cast<ui64>(std::numeric_limits<i32>::max()) + 1}, - }).Get(), - &message), - TErrorException); -} - - -TEST(TProtobufFormat, TestWriteZeroColumns) -{ - auto config = BuildYsonNodeFluently() - .BeginAttributes() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .EndList() - .EndMap() - .EndList() - .EndAttributes() - .Value("protobuf"); - - auto nameTable = New<TNameTable>(); - - TString result; - TStringOutput resultStream(result); - auto writer = CreateWriterForProtobuf( - config->Attributes(), - {New<TTableSchema>()}, - nameTable, - CreateAsyncAdapter(&resultStream), - true, - New<TControlAttributesConfig>(), - 0); - - writer->Write({ - MakeRow(nameTable, { - {"Int64", -1}, - {"String", "this_is_string"}, - }).Get() - }); - writer->Write({MakeRow(nameTable, { }).Get()}); - - writer->Close() - .Get() - .ThrowOnError(); - - ASSERT_EQ(result, "\0\0\0\0\0\0\0\0"sv); -} - -TEST(TProtobufFormat, TestTabletIndex) -{ - auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("int64_field") - .Item("field_number").Value(3) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap()); - - auto nameTable = New<TNameTable>(); - - TString result; - TStringOutput resultStream(result); - auto controlAttributesConfig = New<TControlAttributesConfig>(); - controlAttributesConfig->EnableTabletIndex = true; - - auto writer = CreateWriterForProtobuf( - config, - {New<TTableSchema>()}, - nameTable, - CreateAsyncAdapter(&resultStream), - true, - controlAttributesConfig, - 0); - - writer->Write({ - MakeRow(nameTable, { - {TabletIndexColumnName, 1LL << 50}, 
- {"int64_field", -2345}, - }).Get(), - MakeRow(nameTable, { - {TabletIndexColumnName, 12}, - {"int64_field", 2345}, - }).Get(), - }); - - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput si(result); - TLenvalParser parser(&si); - { - auto row = parser.Next(); - ASSERT_TRUE(row); - ASSERT_EQ(row->TabletIndex, 1ULL << 50); - NYT::TMessage message; - ASSERT_TRUE(message.ParseFromString(row->RowData)); - ASSERT_EQ(message.int64_field(), -2345); - } - { - auto row = parser.Next(); - ASSERT_TRUE(row); - ASSERT_EQ(static_cast<int>(row->TabletIndex), 12); - NYT::TMessage message; - ASSERT_TRUE(message.ParseFromString(row->RowData)); - ASSERT_EQ(message.int64_field(), 2345); - } - { - auto row = parser.Next(); - ASSERT_FALSE(row); - } -} - -TEST(TProtobufFormat, TestContext) -{ - auto config = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - TCollectingValueConsumer rowCollector; - auto parser = CreateParserForProtobuf( - &rowCollector, - ConvertTo<TProtobufFormatConfigPtr>(config), - 0); - - TString context; - try { - TMessage message; - message.set_string_field("PYSHCH-PYSHCH"); - parser->Read(LenvalBytes(message)); - parser->Finish(); - GTEST_FATAL_FAILURE_("expected to throw"); - } catch (const NYT::TErrorException& e) { - context = *e.Error().Attributes().Find<TString>("context"); - } - ASSERT_NE(context.find("PYSHCH-PYSHCH"), TString::npos); -} - -//////////////////////////////////////////////////////////////////////////////// - -TTableSchemaPtr CreateSchemaWithStructuredMessage() -{ - auto keyValueStruct = StructLogicalType({ - {"key", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - {"value", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - }); - - return New<TTableSchema>(std::vector<TColumnSchema>{ - {"first", StructLogicalType({ - {"field_missing_from_proto1", 
OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int32))}, - {"enum_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"int64_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"repeated_int64_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"another_repeated_int64_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"message_field", keyValueStruct}, - {"repeated_message_field", ListLogicalType(keyValueStruct)}, - {"any_int64_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"any_map_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Any))}, - {"optional_int64_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"repeated_optional_any_field", ListLogicalType(OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Any)))}, - {"packed_repeated_enum_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - {"optional_repeated_bool_field", OptionalLogicalType(ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Boolean)))}, - {"oneof_field", VariantStructLogicalType({ - {"oneof_string_field_1", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"oneof_string_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"oneof_message_field", keyValueStruct}, - })}, - {"optional_oneof_field", OptionalLogicalType(VariantStructLogicalType({ - {"oneof_string_field_1", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"oneof_string_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"oneof_message_field", keyValueStruct}, - }))}, - {"map_field", DictLogicalType( - SimpleLogicalType(ESimpleLogicalValueType::Int64), - OptionalLogicalType(keyValueStruct)) - }, - {"field_missing_from_proto2", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int32))}, - })}, - {"repeated_int64_field", 
ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"another_repeated_int64_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"repeated_message_field", ListLogicalType(keyValueStruct)}, - {"second", StructLogicalType({ - {"one", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"two", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"three", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - })}, - {"any_field", SimpleLogicalType(ESimpleLogicalValueType::Any)}, - - {"int64_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"uint64_field", SimpleLogicalType(ESimpleLogicalValueType::Uint64)}, - {"int32_field", SimpleLogicalType(ESimpleLogicalValueType::Int32)}, - {"uint32_field", SimpleLogicalType(ESimpleLogicalValueType::Uint32)}, - - {"enum_int_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"enum_string_string_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"enum_string_int64_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - - {"repeated_optional_any_field", ListLogicalType(OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Any)))}, - - {"other_complex_field", StructLogicalType({ - {"one", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"two", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"three", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - })}, - - {"utf8_field", SimpleLogicalType(ESimpleLogicalValueType::Utf8)}, - - {"packed_repeated_int64_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - - {"optional_repeated_int64_field", OptionalLogicalType(ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64)))}, - - {"oneof_field", VariantStructLogicalType({ - {"oneof_string_field_1", 
SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"oneof_string_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"oneof_message_field", keyValueStruct}, - })}, - - {"optional_oneof_field", OptionalLogicalType(VariantStructLogicalType({ - {"oneof_string_field_1", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"oneof_string_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"oneof_message_field", keyValueStruct}, - }))}, - - {"map_field", DictLogicalType( - SimpleLogicalType(ESimpleLogicalValueType::Int64), - OptionalLogicalType(keyValueStruct)) - }, - }); -} - -INodePtr CreateConfigWithStructuredMessage(EComplexTypeMode complexTypeMode, EProtoFormatType formatType) -{ - if (formatType == EProtoFormatType::FileDescriptor) { - return CreateFileDescriptorConfig<TMessageWithStructuredEmbedded>(complexTypeMode); - } - YT_VERIFY(formatType == EProtoFormatType::Structured); - - auto buildOneofConfig = [] (TString prefix, int fieldNumberOffset) { - return BuildYsonNodeFluently() - .BeginMap() - .Item("name").Value(prefix + "oneof_field") - .Item("proto_type").Value("oneof") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value(prefix + "oneof_string_field_1") - .Item("field_number").Value(101 + fieldNumberOffset) - .Item("proto_type").Value("string") - .EndMap() - .Item().BeginMap() - .Item("name").Value(prefix + "oneof_string_field") - .Item("field_number").Value(102 + fieldNumberOffset) - .Item("proto_type").Value("string") - .EndMap() - .Item().BeginMap() - .Item("name").Value(prefix + "oneof_message_field") - .Item("field_number").Value(1000 + fieldNumberOffset) - .Item("proto_type").Value("structured_message") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("key") - .Item("field_number").Value(1) - .Item("proto_type").Value("string") - .EndMap() - .Item().BeginMap() - .Item("name").Value("value") - .Item("field_number").Value(2) - .Item("proto_type").Value("string") - .EndMap() - 
.EndList() - .EndMap() - .EndList() - .EndMap(); - }; - auto oneofConfig = buildOneofConfig("", 0); - auto optionalOneofConfig = buildOneofConfig("optional_", 1000); - - auto keyValueFields = BuildYsonStringFluently() - .BeginList() - .Item().BeginMap() - .Item("name").Value("key") - .Item("field_number").Value(1) - .Item("proto_type").Value("string") - .EndMap() - .Item().BeginMap() - .Item("name").Value("value") - .Item("field_number").Value(2) - .Item("proto_type").Value("string") - .EndMap() - .EndList(); - - return BuildYsonNodeFluently() - .BeginAttributes() - .Item("enumerations").Value(EnumerationsConfig) - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("first") - .Item("field_number").Value(1) - .Item("proto_type").Value("structured_message") - .Item("fields") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("int64_field") - .Item("field_number").Value(2) - .Item("proto_type").Value("int64") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("enum_field") - .Item("field_number").Value(1) - .Item("proto_type").Value("enum_string") - .Item("enumeration_name").Value("EEnum") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("packed_repeated_enum_field") - .Item("field_number").Value(11) - .Item("proto_type").Value("enum_string") - .Item("enumeration_name").Value("EEnum") - .Item("repeated").Value(true) - .Item("packed").Value(true) - .EndMap() - .Item().BeginMap() - .Item("name").Value("message_field") - .Item("field_number").Value(4) - .Item("proto_type").Value("structured_message") - .Item("fields").Value(keyValueFields) - .EndMap() - .Item().BeginMap() - .Item("name").Value("repeated_int64_field") - .Item("field_number").Value(3) - .Item("proto_type").Value("int64") - .Item("repeated").Value(true) - .EndMap() - .Item().BeginMap() - .Item("name").Value("another_repeated_int64_field") - .Item("field_number").Value(9) - 
.Item("proto_type").Value("int64") - .Item("repeated").Value(true) - .EndMap() - .Item().BeginMap() - .Item("name").Value("repeated_message_field") - .Item("field_number").Value(5) - .Item("proto_type").Value("structured_message") - .Item("repeated").Value(true) - .Item("fields").Value(keyValueFields) - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("any_int64_field") - .Item("field_number").Value(6) - .Item("proto_type").Value("any") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("any_map_field") - .Item("field_number").Value(7) - .Item("proto_type").Value("any") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("optional_int64_field") - .Item("field_number").Value(8) - .Item("proto_type").Value("int64") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("repeated_optional_any_field") - .Item("field_number").Value(10) - .Item("proto_type").Value("any") - .Item("repeated").Value(true) - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("optional_repeated_bool_field") - .Item("field_number").Value(12) - .Item("proto_type").Value("bool") - .Item("repeated").Value(true) - .EndMap() - .Item().Value(oneofConfig) - .Item().Value(optionalOneofConfig) - .Item() - .BeginMap() - .Item("name").Value("map_field") - .Item("field_number").Value(13) - .Item("proto_type").Value("structured_message") - .Item("repeated").Value(true) - .Item("fields") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("key") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("value") - .Item("field_number").Value(2) - .Item("proto_type").Value("structured_message") - .Item("fields").Value(keyValueFields) - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("second") - .Item("field_number").Value(2) - .Item("proto_type").Value("structured_message") - .Item("fields") - .BeginList() - .Item().BeginMap() - 
.Item("name").Value("one") - .Item("field_number").Value(2) - .Item("proto_type").Value("int64") - .EndMap() - .Item().BeginMap() - .Item("name").Value("two") - .Item("field_number").Value(500000000) - .Item("proto_type").Value("int64") - .EndMap() - .Item().BeginMap() - .Item("name").Value("three") - .Item("field_number").Value(100500) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("repeated_message_field") - .Item("field_number").Value(3) - .Item("proto_type").Value("structured_message") - .Item("repeated").Value(true) - .Item("fields") - .BeginList() - .Item().BeginMap() - .Item("name").Value("key") - .Item("field_number").Value(1) - .Item("proto_type").Value("string") - .EndMap() - .Item().BeginMap() - .Item("name").Value("value") - .Item("field_number").Value(2) - .Item("proto_type").Value("string") - .EndMap() - .EndList() - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("repeated_int64_field") - .Item("field_number").Value(4) - .Item("proto_type").Value("int64") - .Item("repeated").Value(true) - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("another_repeated_int64_field") - .Item("field_number").Value(13) - .Item("proto_type").Value("int64") - .Item("repeated").Value(true) - .EndMap() - .Item() - .BeginMap() - // In schema it is of type "any". - .Item("name").Value("any_field") - .Item("field_number").Value(5) - .Item("proto_type").Value("int64") - .EndMap() - // The next fields are for type casting testing - .Item() - .BeginMap() - // In schema it is of type "int64". - .Item("name").Value("int64_field") - .Item("field_number").Value(6) - .Item("proto_type").Value("int32") - .EndMap() - .Item() - .BeginMap() - // In schema it is of type "uint64". - .Item("name").Value("uint64_field") - .Item("field_number").Value(7) - .Item("proto_type").Value("uint32") - .EndMap() - .Item() - .BeginMap() - // In schema it is of type "int32". 
- .Item("name").Value("int32_field") - .Item("field_number").Value(8) - .Item("proto_type").Value("int64") - .EndMap() - .Item() - .BeginMap() - // In schema it is of type "uint32". - .Item("name").Value("uint32_field") - .Item("field_number").Value(9) - .Item("proto_type").Value("uint64") - .EndMap() - - // Enums. - .Item() - .BeginMap() - .Item("name").Value("enum_int_field") - .Item("field_number").Value(10) - .Item("proto_type").Value("enum_int") - .Item("enumeration_name").Value("EEnum") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("enum_string_string_field") - .Item("field_number").Value(11) - .Item("proto_type").Value("enum_string") - .Item("enumeration_name").Value("EEnum") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("enum_string_int64_field") - .Item("field_number").Value(12) - .Item("proto_type").Value("enum_string") - .Item("enumeration_name").Value("EEnum") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("utf8_field") - .Item("field_number").Value(16) - .Item("proto_type").Value("string") - .EndMap() - - // list<optional<any>>. - .Item() - .BeginMap() - .Item("name").Value("repeated_optional_any_field") - .Item("field_number").Value(14) - .Item("proto_type").Value("any") - .Item("repeated").Value(true) - .EndMap() - - // Other columns. 
- .Item() - .BeginMap() - .Item("name").Value("other_columns_field") - .Item("field_number").Value(15) - .Item("proto_type").Value("other_columns") - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("packed_repeated_int64_field") - .Item("field_number").Value(17) - .Item("proto_type").Value("int64") - .Item("repeated").Value(true) - .Item("packed").Value(true) - .EndMap() - - .Item() - .BeginMap() - .Item("name").Value("optional_repeated_int64_field") - .Item("field_number").Value(18) - .Item("proto_type").Value("int64") - .Item("repeated").Value(true) - .EndMap() - - .Item().Value(oneofConfig) - .Item().Value(optionalOneofConfig) - - .Item() - .BeginMap() - .Item("name").Value("map_field") - .Item("field_number").Value(19) - .Item("proto_type").Value("structured_message") - .Item("repeated").Value(true) - .Item("fields") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("key") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("value") - .Item("field_number").Value(2) - .Item("proto_type").Value("structured_message") - .Item("fields").Value(keyValueFields) - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .EndList() - .Item("complex_type_mode").Value(complexTypeMode) - .EndAttributes() - .Value("protobuf"); -} - -using TProtobufFormatStructuredMessageParameter = std::tuple<EComplexTypeMode, int, EProtoFormatType>; - -class TProtobufFormatStructuredMessage - : public ::testing::TestWithParam<TProtobufFormatStructuredMessageParameter> -{ }; - -INSTANTIATE_TEST_SUITE_P( - FileDescriptor, - TProtobufFormatStructuredMessage, - ::testing::Values(TProtobufFormatStructuredMessageParameter{ - EComplexTypeMode::Positional, - 1, - EProtoFormatType::FileDescriptor})); - -INSTANTIATE_TEST_SUITE_P( - Positional, - TProtobufFormatStructuredMessage, - ::testing::Values(TProtobufFormatStructuredMessageParameter{ - EComplexTypeMode::Positional, - 1, - 
EProtoFormatType::Structured})); - -INSTANTIATE_TEST_SUITE_P( - Named, - TProtobufFormatStructuredMessage, - ::testing::Values(TProtobufFormatStructuredMessageParameter{ - EComplexTypeMode::Named, - 1, - EProtoFormatType::Structured})); - -INSTANTIATE_TEST_SUITE_P( - ManyRows, - TProtobufFormatStructuredMessage, - ::testing::Values(TProtobufFormatStructuredMessageParameter{ - EComplexTypeMode::Named, - 30000, - EProtoFormatType::Structured})); - -TEST_P(TProtobufFormatStructuredMessage, EmbeddedWrite) -{ - auto [complexTypeMode, rowCount, protoFormatType] = GetParam(); - - auto nameTable = New<TNameTable>(); - auto numId = nameTable->RegisterName("num"); - auto embeddedNumId = nameTable->RegisterName("embedded_num"); - auto variantId = nameTable->RegisterName("variant"); - auto embedded2NumId = nameTable->RegisterName("embedded2_num"); - auto embedded2StructId = nameTable->RegisterName("embedded2_struct"); - auto embedded2RepeatedId = nameTable->RegisterName("embedded2_repeated"); - auto extraIntId = nameTable->RegisterName("extra_int"); - auto otherComplexFieldId = nameTable->RegisterName("other_complex_field"); - - //message T2 { - // optional ui64 embedded2_num; - //}; - //message T1 { - // required T2 t2 [embedded]; - // optional ui64 embedded_num; - //}; - // - //message T { - // required T1 t1 [embedded]; - // optional ui64 num; - //}; - - auto schema = BuildEmbeddedSchema(); - auto config = BuildEmbeddedConfig(complexTypeMode, protoFormatType); - - TString result; - TStringOutput resultStream(result); - auto writer = CreateWriterForProtobuf( - ConvertTo<TProtobufFormatConfigPtr>(config->Attributes()), - {schema}, - nameTable, - CreateAsyncAdapter(&resultStream), - true, - New<TControlAttributesConfig>(), - 0); - - TUnversionedRowBuilder builder; - builder.AddValue(MakeUnversionedUint64Value(789, numId)); - builder.AddValue(MakeUnversionedUint64Value(123, embeddedNumId)); - builder.AddValue(MakeUnversionedUint64Value(456, embedded2NumId)); - 
builder.AddValue(MakeUnversionedCompositeValue("[1; 555u]", variantId)); - auto embeddedYson = BuildYsonStringFluently() - .BeginList() - // float1 - .Item().Value(1.5f) - // string1 - .Item().Value("abc") - .EndList(); - auto embeddedYsonStr = embeddedYson.ToString(); - builder.AddValue(MakeUnversionedCompositeValue(embeddedYsonStr, embedded2StructId)); - auto repeatedYsonStr = BuildYsonStringFluently() - .BeginList() - .Item().Value("a") - .Item().Value("b") - .EndList() - .ToString(); - builder.AddValue(MakeUnversionedCompositeValue(repeatedYsonStr, embedded2RepeatedId)); - builder.AddValue(MakeUnversionedInt64Value(111, extraIntId)); - auto otherComplexFieldYson = BuildYsonStringFluently() - .BeginList() - .Item().Value(22) - .Item().Value(23) - .Item().Value(24) - .EndList(); - auto otherComplexFieldYsonStr = otherComplexFieldYson.ToString(); - builder.AddValue(MakeUnversionedCompositeValue(otherComplexFieldYsonStr, otherComplexFieldId)); - - - auto rows = std::vector<TUnversionedRow>(rowCount, builder.GetRow()); - writer->Write(rows); - - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput input(result); - TLenvalParser lenvalParser(&input); - - for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { - auto entry = lenvalParser.Next(); - ASSERT_TRUE(entry); - - NYT::TEmbeddingMessage message; - ASSERT_TRUE(message.ParseFromString(entry->RowData)); - - EXPECT_EQ(message.num(), 789UL); - EXPECT_EQ(message.t1().embedded_num(), 123UL); - EXPECT_EQ(message.t1().t2().embedded2_num(), 456UL); - - EXPECT_FALSE(message.t1().has_str_variant()); - EXPECT_TRUE(message.t1().has_uint_variant()); - EXPECT_EQ(message.t1().uint_variant(), 555UL); - - EXPECT_EQ(message.t1().t2().embedded2_struct().float1(), 1.5f); - EXPECT_EQ(message.t1().t2().embedded2_struct().string1(), "abc"); - - ASSERT_EQ(message.t1().t2().embedded2_repeated_size(), 2); - EXPECT_EQ(message.t1().t2().embedded2_repeated(0), "a"); - EXPECT_EQ(message.t1().t2().embedded2_repeated(1), "b"); - - { - 
auto otherColumns = ConvertToNode(TYsonString(message.other_columns_field()))->AsMap(); - auto mode = complexTypeMode; - auto expected = ([&] { - switch (mode) { - case EComplexTypeMode::Named: - return BuildYsonNodeFluently() - .BeginMap() - .Item("one").Value(22) - .Item("two").Value(23) - .Item("three").Value(24) - .EndMap(); - case EComplexTypeMode::Positional: - return ConvertToNode(otherComplexFieldYson); - } - YT_ABORT(); - })(); - - EXPECT_NODES_EQUAL(expected, otherColumns->GetChildOrThrow("other_complex_field")); - EXPECT_EQ(ConvertTo<i64>(otherColumns->GetChildOrThrow("extra_int")), 111); - } - - ASSERT_FALSE(message.has_extra_field()); - ASSERT_FALSE(message.t1().has_embedded_extra_field()); - } - - ASSERT_FALSE(lenvalParser.Next()); -} - -TEST_P(TProtobufFormatStructuredMessage, Write) -{ - auto [complexTypeMode, rowCount, protoFormatType] = GetParam(); - - auto nameTable = New<TNameTable>(); - auto firstId = nameTable->RegisterName("first"); - auto secondId = nameTable->RegisterName("second"); - auto repeatedMessageId = nameTable->RegisterName("repeated_message_field"); - auto repeatedInt64Id = nameTable->RegisterName("repeated_int64_field"); - auto anotherRepeatedInt64Id = nameTable->RegisterName("another_repeated_int64_field"); - auto anyFieldId = nameTable->RegisterName("any_field"); - auto int64FieldId = nameTable->RegisterName("int64_field"); - auto uint64FieldId = nameTable->RegisterName("uint64_field"); - auto int32FieldId = nameTable->RegisterName("int32_field"); - auto uint32FieldId = nameTable->RegisterName("uint32_field"); - auto enumIntFieldId = nameTable->RegisterName("enum_int_field"); - auto enumStringStringFieldId = nameTable->RegisterName("enum_string_string_field"); - auto enumStringInt64FieldId = nameTable->RegisterName("enum_string_int64_field"); - auto utf8FieldId = nameTable->RegisterName("utf8_field"); - auto repeatedOptionalAnyFieldId = nameTable->RegisterName("repeated_optional_any_field"); - auto otherComplexFieldId = 
nameTable->RegisterName("other_complex_field"); - auto packedRepeatedInt64FieldId = nameTable->RegisterName("packed_repeated_int64_field"); - auto optionalRepeatedInt64FieldId = nameTable->RegisterName("optional_repeated_int64_field"); - auto oneofFieldId = nameTable->RegisterName("oneof_field"); - auto optionalOneofFieldId = nameTable->RegisterName("optional_oneof_field"); - auto mapFieldId = nameTable->RegisterName("map_field"); - - auto schema = CreateSchemaWithStructuredMessage(); - auto config = CreateConfigWithStructuredMessage(complexTypeMode, protoFormatType); - - TString result; - TStringOutput resultStream(result); - auto writer = CreateWriterForProtobuf( - ConvertTo<TProtobufFormatConfigPtr>(config->Attributes()), - {schema}, - nameTable, - CreateAsyncAdapter(&resultStream), - true, - New<TControlAttributesConfig>(), - 0); - - auto firstYsonStr = BuildYsonStringFluently() - .BeginList() - // field_missing_from_proto1 - .Item().Value(11111) - // enum_field - .Item().Value("Two") - // int64_field - .Item().Value(44) - // repeated_int64_field - .Item() - .BeginList() - .Item().Value(55) - .Item().Value(56) - .Item().Value(57) - .EndList() - // another_repeated_int64_field - .Item() - .BeginList() - .EndList() - // message_field - .Item() - .BeginList() - .Item().Value("key") - .Item().Value("value") - .EndList() - // repeated_message_field - .Item() - .BeginList() - .Item() - .BeginList() - .Item().Value("key1") - .Item().Value("value1") - .EndList() - .Item() - .BeginList() - .Item().Value("key2") - .Item().Value("value2") - .EndList() - .EndList() - // any_int64_field - .Item().Value(45) - // any_map_field - .Item() - .BeginMap() - .Item("key").Value("value") - .EndMap() - // optional_int64_field - .Item().Entity() - // repeated_optional_any_field - .Item() - .BeginList() - .Item().Value(2) - .Item().Entity() - .Item().Value("foo") - .EndList() - // packed_repeated_enum_field - .Item() - .BeginList() - .Item().Value("MinusFortyTwo") - .Item().Value("Two") 
- .EndList() - // optional_repeated_bool_field - .Item() - .BeginList() - .Item().Value(false) - .Item().Value(true) - .Item().Value(false) - .EndList() - // oneof_field - .Item() - .BeginList() - // message_field - .Item().Value(2) - .Item().BeginList() - .Item().Value("foo") - .Item().Entity() - .EndList() - .EndList() - // optional_oneof_field - .Item() - .Entity() - // map_field - .Item() - .BeginList() - .Item().BeginList() - .Item().Value(13) - .Item().BeginList() - .Item().Value("bac") - .Item().Value("cab") - .EndList() - .EndList() - .Item().BeginList() - .Item().Value(15) - .Item().BeginList() - .Item().Value("ya") - .Item().Value("make") - .EndList() - .EndList() - .EndList() - .EndList() - .ToString(); - - auto secondYsonStr = BuildYsonStringFluently() - .BeginList() - .Item().Value(101) - .Item().Value(102) - .Item().Value(103) - .EndList() - .ToString(); - - auto repeatedMessageYsonStr = BuildYsonStringFluently() - .BeginList() - .Item() - .BeginList() - .Item().Value("key11") - .Item().Value("value11") - .EndList() - .Item() - .BeginList() - .Item().Value("key21") - .Item().Value("value21") - .EndList() - .EndList() - .ToString(); - - auto repeatedInt64Yson = BuildYsonStringFluently() - .BeginList() - .Item().Value(31) - .Item().Value(32) - .Item().Value(33) - .EndList(); - auto repeatedInt64YsonStr = repeatedInt64Yson.ToString(); - - auto anotherRepeatedInt64YsonStr = BuildYsonStringFluently() - .BeginList() - .EndList() - .ToString(); - - auto repeatedOptionalAnyYson = BuildYsonStringFluently() - .BeginList() - .Item().Value(1) - .Item().Value("abc") - .Item().Entity() - .Item().Value(true) - .EndList(); - auto repeatedOptionalAnyYsonStr = repeatedOptionalAnyYson.ToString(); - - auto otherComplexFieldYson = BuildYsonStringFluently() - .BeginList() - .Item().Value(22) - .Item().Value(23) - .Item().Value(24) - .EndList(); - auto otherComplexFieldYsonStr = otherComplexFieldYson.ToString(); - - TUnversionedRowBuilder builder; - 
builder.AddValue(MakeUnversionedCompositeValue(firstYsonStr, firstId)); - builder.AddValue(MakeUnversionedCompositeValue(secondYsonStr, secondId)); - builder.AddValue(MakeUnversionedCompositeValue(repeatedMessageYsonStr, repeatedMessageId)); - builder.AddValue(MakeUnversionedCompositeValue(repeatedInt64YsonStr, repeatedInt64Id)); - builder.AddValue(MakeUnversionedCompositeValue(anotherRepeatedInt64YsonStr, anotherRepeatedInt64Id)); - builder.AddValue(MakeUnversionedInt64Value(4321, anyFieldId)); - - builder.AddValue(MakeUnversionedInt64Value(-64, int64FieldId)); - builder.AddValue(MakeUnversionedUint64Value(64, uint64FieldId)); - builder.AddValue(MakeUnversionedInt64Value(-32, int32FieldId)); - builder.AddValue(MakeUnversionedUint64Value(32, uint32FieldId)); - - builder.AddValue(MakeUnversionedInt64Value(-42, enumIntFieldId)); - builder.AddValue(MakeUnversionedStringValue("Three", enumStringStringFieldId)); - builder.AddValue(MakeUnversionedInt64Value(1, enumStringInt64FieldId)); - - const auto HelloWorldInRussian = "\xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82, \xd0\xbc\xd0\xb8\xd1\x80!"; - builder.AddValue(MakeUnversionedStringValue(HelloWorldInRussian, utf8FieldId)); - - builder.AddValue(MakeUnversionedCompositeValue(repeatedOptionalAnyYsonStr, repeatedOptionalAnyFieldId)); - - builder.AddValue(MakeUnversionedCompositeValue(otherComplexFieldYsonStr, otherComplexFieldId)); - - builder.AddValue(MakeUnversionedCompositeValue("[12;-10;123456789000;]", packedRepeatedInt64FieldId)); - - builder.AddValue(MakeUnversionedCompositeValue("[1;2;3]", optionalRepeatedInt64FieldId)); - - builder.AddValue(MakeUnversionedCompositeValue("[0; foobaz]", oneofFieldId)); - builder.AddValue(MakeUnversionedNullValue(optionalOneofFieldId)); - - builder.AddValue(MakeUnversionedCompositeValue("[[2; [x; y]]; [5; [z; w]]]", mapFieldId)); - - auto rows = std::vector<TUnversionedRow>(rowCount, builder.GetRow()); - writer->Write(rows); - - writer->Close() - .Get() - .ThrowOnError(); - - 
TStringInput input(result); - TLenvalParser lenvalParser(&input); - - for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { - auto entry = lenvalParser.Next(); - ASSERT_TRUE(entry); - - NYT::TMessageWithStructuredEmbedded message; - ASSERT_TRUE(message.ParseFromString(entry->RowData)); - - const auto& first = message.first(); - EXPECT_EQ(first.enum_field(), EEnum::Two); - EXPECT_EQ(first.int64_field(), 44); - std::vector<i64> firstRepeatedInt64Field( - first.repeated_int64_field().begin(), - first.repeated_int64_field().end()); - EXPECT_EQ(firstRepeatedInt64Field, (std::vector<i64>{55, 56, 57})); - std::vector<i64> firstAnotherRepeatedInt64Field( - first.another_repeated_int64_field().begin(), - first.another_repeated_int64_field().end()); - EXPECT_EQ(firstAnotherRepeatedInt64Field, (std::vector<i64>{})); - EXPECT_EQ(first.message_field().key(), "key"); - EXPECT_EQ(first.message_field().value(), "value"); - ASSERT_EQ(first.repeated_message_field_size(), 2); - EXPECT_EQ(first.repeated_message_field(0).key(), "key1"); - EXPECT_EQ(first.repeated_message_field(0).value(), "value1"); - EXPECT_EQ(first.repeated_message_field(1).key(), "key2"); - EXPECT_EQ(first.repeated_message_field(1).value(), "value2"); - - EXPECT_NODES_EQUAL( - ConvertToNode(TYsonString(first.any_int64_field())), - BuildYsonNodeFluently().Value(45)); - - EXPECT_NODES_EQUAL( - ConvertToNode(TYsonString(first.any_map_field())), - BuildYsonNodeFluently().BeginMap() - .Item("key").Value("value") - .EndMap()); - - std::vector<TYsonString> firstRepeatedOptionalAnyField( - first.repeated_optional_any_field().begin(), - first.repeated_optional_any_field().end()); - - EXPECT_NODES_EQUAL( - ConvertToNode(firstRepeatedOptionalAnyField), - BuildYsonNodeFluently() - .BeginList() - .Item().Value(2) - .Item().Entity() - .Item().Value("foo") - .EndList()); - - EXPECT_FALSE(first.has_optional_int64_field()); - - std::vector<EEnum> actualFirstPackedRepeatedEnumField; - for (auto x : 
first.packed_repeated_enum_field()) { - actualFirstPackedRepeatedEnumField.push_back(static_cast<EEnum>(x)); - } - auto expectedFirstPackedRepeatedEnumField = std::vector<EEnum>{EEnum::MinusFortyTwo, EEnum::Two}; - EXPECT_EQ(expectedFirstPackedRepeatedEnumField, actualFirstPackedRepeatedEnumField); - - std::vector<bool> firstOptionalRepeatedBoolField( - first.optional_repeated_bool_field().begin(), - first.optional_repeated_bool_field().end()); - auto expectedFirstOptionalRepeatedBoolField = std::vector<bool>{false, true, false}; - EXPECT_EQ(expectedFirstOptionalRepeatedBoolField, firstOptionalRepeatedBoolField); - - EXPECT_FALSE(first.has_oneof_string_field_1()); - EXPECT_FALSE(first.has_oneof_string_field()); - EXPECT_TRUE(first.has_oneof_message_field()); - EXPECT_EQ(first.oneof_message_field().key(), "foo"); - EXPECT_FALSE(first.oneof_message_field().has_value()); - - EXPECT_FALSE(first.has_optional_oneof_string_field_1()); - EXPECT_FALSE(first.has_optional_oneof_string_field()); - EXPECT_FALSE(first.has_optional_oneof_message_field()); - - EXPECT_EQ(std::ssize(first.map_field()), 2); - ASSERT_EQ(static_cast<int>(first.map_field().count(13)), 1); - EXPECT_EQ(first.map_field().at(13).key(), "bac"); - EXPECT_EQ(first.map_field().at(13).value(), "cab"); - ASSERT_EQ(static_cast<int>(first.map_field().count(15)), 1); - EXPECT_EQ(first.map_field().at(15).key(), "ya"); - EXPECT_EQ(first.map_field().at(15).value(), "make"); - - const auto& second = message.second(); - EXPECT_EQ(second.one(), 101); - EXPECT_EQ(second.two(), 102); - EXPECT_EQ(second.three(), 103); - - ASSERT_EQ(message.repeated_message_field_size(), 2); - EXPECT_EQ(message.repeated_message_field(0).key(), "key11"); - EXPECT_EQ(message.repeated_message_field(0).value(), "value11"); - EXPECT_EQ(message.repeated_message_field(1).key(), "key21"); - EXPECT_EQ(message.repeated_message_field(1).value(), "value21"); - - std::vector<i64> repeatedInt64Field( - message.repeated_int64_field().begin(), - 
message.repeated_int64_field().end()); - EXPECT_EQ(repeatedInt64Field, (std::vector<i64>{31, 32, 33})); - - std::vector<i64> anotherRepeatedInt64Field( - message.another_repeated_int64_field().begin(), - message.another_repeated_int64_field().end()); - EXPECT_EQ(anotherRepeatedInt64Field, (std::vector<i64>{})); - - EXPECT_EQ(message.int64_any_field(), 4321); - - // Note the reversal of 32 <-> 64. - EXPECT_EQ(message.int32_field(), -64); - EXPECT_EQ(message.uint32_field(), 64u); - EXPECT_EQ(message.int64_field(), -32); - EXPECT_EQ(message.uint64_field(), 32u); - - EXPECT_EQ(message.enum_int_field(), EEnum::MinusFortyTwo); - EXPECT_EQ(message.enum_string_string_field(), EEnum::Three); - EXPECT_EQ(message.enum_string_int64_field(), EEnum::One); - - EXPECT_EQ(message.utf8_field(), HelloWorldInRussian); - - std::vector<TYsonString> repeatedOptionalAnyField( - message.repeated_optional_any_field().begin(), - message.repeated_optional_any_field().end()); - EXPECT_NODES_EQUAL(ConvertToNode(repeatedOptionalAnyField), ConvertToNode(repeatedOptionalAnyYson)); - - { - auto otherColumns = ConvertToNode(TYsonString(message.other_columns_field()))->AsMap(); - auto mode = complexTypeMode; - auto expected = ([&] { - switch (mode) { - case EComplexTypeMode::Named: - return BuildYsonNodeFluently() - .BeginMap() - .Item("one").Value(22) - .Item("two").Value(23) - .Item("three").Value(24) - .EndMap(); - case EComplexTypeMode::Positional: - return ConvertToNode(otherComplexFieldYson); - } - YT_ABORT(); - })(); - - EXPECT_NODES_EQUAL(expected, otherColumns->GetChildOrThrow("other_complex_field")); - } - - std::vector<i64> actualPackedRepeatedInt64Field( - message.packed_repeated_int64_field().begin(), - message.packed_repeated_int64_field().end()); - auto expectedPackedRepeatedInt64Field = std::vector<i64>{12, -10, 123456789000LL}; - EXPECT_EQ(expectedPackedRepeatedInt64Field, actualPackedRepeatedInt64Field); - - std::vector<i64> actualOptionalRepeatedInt64Field( - 
message.optional_repeated_int64_field().begin(), - message.optional_repeated_int64_field().end()); - auto expectedOptionalRepeatedInt64Field = std::vector<i64>{1, 2, 3}; - EXPECT_EQ(expectedOptionalRepeatedInt64Field, actualOptionalRepeatedInt64Field); - - EXPECT_TRUE(message.has_oneof_string_field_1()); - EXPECT_EQ(message.oneof_string_field_1(), "foobaz"); - EXPECT_FALSE(message.has_oneof_string_field()); - EXPECT_FALSE(message.has_oneof_message_field()); - - EXPECT_FALSE(message.has_optional_oneof_string_field_1()); - EXPECT_FALSE(message.has_optional_oneof_string_field()); - EXPECT_FALSE(message.has_optional_oneof_message_field()); - - EXPECT_EQ(std::ssize(message.map_field()), 2); - ASSERT_EQ(static_cast<int>(message.map_field().count(2)), 1); - EXPECT_EQ(message.map_field().at(2).key(), "x"); - EXPECT_EQ(message.map_field().at(2).value(), "y"); - ASSERT_EQ(static_cast<int>(message.map_field().count(5)), 1); - EXPECT_EQ(message.map_field().at(5).key(), "z"); - EXPECT_EQ(message.map_field().at(5).value(), "w"); - } - - ASSERT_FALSE(lenvalParser.Next()); -} - -INodePtr SortMapByKey(const INodePtr& node) -{ - auto keyValuePairs = ConvertTo<std::vector<std::pair<i64, INodePtr>>>(node); - std::sort(std::begin(keyValuePairs), std::end(keyValuePairs)); - return ConvertTo<INodePtr>(keyValuePairs); -} - -TEST_P(TProtobufFormatStructuredMessage, EmbeddedParse) -{ - auto [complexTypeMode, rowCount, protoFormatType] = GetParam(); - - auto schema = BuildEmbeddedSchema(); - auto config = BuildEmbeddedConfig(complexTypeMode, protoFormatType); - - NYT::TEmbeddingMessage message; - - message.set_num(789); - auto* t1 = message.mutable_t1(); - t1->set_embedded_num(123); - auto* t2 = t1->mutable_t2(); - t2->set_embedded2_num(456); - t1->set_uint_variant(555); - t2->add_embedded2_repeated("a"); - t2->add_embedded2_repeated("b"); - t2->add_embedded2_repeated("c"); - auto* embedded2_struct = t2->mutable_embedded2_struct(); - embedded2_struct->set_float1(1.5f); - 
embedded2_struct->set_string1("abc"); - - //message.set_extra_field("*"); - //t1->set_embedded_extra_field("*"); - - auto rowCollector = ParseRows(message, config, schema, rowCount); - for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { - EXPECT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "num")), 789u); - EXPECT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "embedded_num")), 123u); - EXPECT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "embedded2_num")), 456u); - EXPECT_NODES_EQUAL( - GetComposite(rowCollector.GetRowValue(rowIndex, "variant")), - ConvertToNode(TYsonString(TStringBuf("[1; 555u]")))); - - auto embedded2_repeatedNode = GetComposite(rowCollector.GetRowValue(rowIndex, "embedded2_repeated")); - ASSERT_EQ(embedded2_repeatedNode->GetType(), ENodeType::List); - const auto& embedded2_repeatedList = embedded2_repeatedNode->AsList(); - ASSERT_EQ(embedded2_repeatedList->GetChildCount(), 3); - EXPECT_EQ(embedded2_repeatedList->GetChildValueOrThrow<TString>(0), "a"); - EXPECT_EQ(embedded2_repeatedList->GetChildValueOrThrow<TString>(1), "b"); - EXPECT_EQ(embedded2_repeatedList->GetChildValueOrThrow<TString>(2), "c"); - - auto embedded2_structNode = GetComposite(rowCollector.GetRowValue(rowIndex, "embedded2_struct")); - ASSERT_EQ(embedded2_structNode->GetType(), ENodeType::List); - const auto& embedded2_structList = embedded2_structNode->AsList(); - ASSERT_EQ(embedded2_structList->GetChildCount(), 2); - EXPECT_EQ(embedded2_structList->GetChildValueOrThrow<double>(0), 1.5f); - EXPECT_EQ(embedded2_structList->GetChildValueOrThrow<TString>(1), "abc"); - } -} - -TEST_P(TProtobufFormatStructuredMessage, Parse) -{ - auto [complexTypeMode, rowCount, protoFormatType] = GetParam(); - - auto schema = CreateSchemaWithStructuredMessage(); - auto config = CreateConfigWithStructuredMessage(complexTypeMode, protoFormatType); - - NYT::TMessageWithStructuredEmbedded message; - - auto* first = message.mutable_first(); - first->set_enum_field(EEnum::Two); - 
first->set_int64_field(44); - - first->add_repeated_int64_field(55); - first->add_repeated_int64_field(56); - first->add_repeated_int64_field(57); - - // another_repeated_int64_field is intentionally empty. - - first->mutable_message_field()->set_key("key"); - first->mutable_message_field()->set_value("value"); - auto* firstSubfield1 = first->add_repeated_message_field(); - firstSubfield1->set_key("key1"); - firstSubfield1->set_value("value1"); - auto* firstSubfield2 = first->add_repeated_message_field(); - firstSubfield2->set_key("key2"); - firstSubfield2->set_value("value2"); - - first->set_any_int64_field(BuildYsonStringFluently().Value(4422).ToString()); - first->set_any_map_field( - BuildYsonStringFluently() - .BeginMap() - .Item("key").Value("value") - .EndMap() - .ToString()); - - first->add_repeated_optional_any_field("%false"); - first->add_repeated_optional_any_field("42"); - first->add_repeated_optional_any_field("#"); - - first->add_packed_repeated_enum_field(EEnum::MaxInt32); - first->add_packed_repeated_enum_field(EEnum::MinusFortyTwo); - - // optional_repeated_bool_field is intentionally empty. - - first->mutable_oneof_message_field()->set_key("KEY"); - - // optional_oneof_field is intentionally empty. - - (*first->mutable_map_field())[111].set_key("key111"); - (*first->mutable_map_field())[111].set_value("value111"); - (*first->mutable_map_field())[222].set_key("key222"); - (*first->mutable_map_field())[222].set_value("value222"); - - auto* second = message.mutable_second(); - second->set_one(101); - second->set_two(102); - second->set_three(103); - - message.add_repeated_int64_field(31); - message.add_repeated_int64_field(32); - message.add_repeated_int64_field(33); - - // another_repeated_int64_field is intentionally empty. 
- - auto* subfield1 = message.add_repeated_message_field(); - subfield1->set_key("key11"); - subfield1->set_value("value11"); - auto* subfield2 = message.add_repeated_message_field(); - subfield2->set_key("key21"); - subfield2->set_value("value21"); - - message.set_int64_any_field(4321); - - // Note the reversal of 32 <-> 64. - message.set_int64_field(-32); - message.set_uint64_field(32); - message.set_int32_field(-64); - message.set_uint32_field(64); - - // Note that we don't set the "enum_string_int64_field" as it would fail during parsing. - message.set_enum_int_field(EEnum::MinusFortyTwo); - message.set_enum_string_string_field(EEnum::Three); - - const auto HelloWorldInChinese = "\xe4\xbd\xa0\xe5\xa5\xbd\xef\xbc\x8c\xe4\xb8\x96\xe7\x95\x8c"; - message.set_utf8_field(HelloWorldInChinese); - - message.add_repeated_optional_any_field("#"); - message.add_repeated_optional_any_field("1"); - message.add_repeated_optional_any_field("\"qwe\""); - message.add_repeated_optional_any_field("%true"); - - auto otherComplexFieldPositional = BuildYsonNodeFluently() - .BeginList() - .Item().Value(301) - .Item().Value(302) - .Item().Value(303) - .EndList(); - - auto mode = complexTypeMode; - auto otherComplexField = ([&] { - switch (mode) { - case EComplexTypeMode::Named: - return BuildYsonNodeFluently() - .BeginMap() - .Item("one").Value(301) - .Item("two").Value(302) - .Item("three").Value(303) - .EndMap(); - case EComplexTypeMode::Positional: - return otherComplexFieldPositional; - } - YT_ABORT(); - })(); - auto otherColumnsYson = BuildYsonStringFluently() - .BeginMap() - .Item("other_complex_field").Value(otherComplexField) - .EndMap(); - message.set_other_columns_field(otherColumnsYson.ToString()); - - message.add_packed_repeated_int64_field(-123456789000LL); - message.add_packed_repeated_int64_field(0); - - message.add_optional_repeated_int64_field(-4242); - - // optional_oneof_field is intentionally empty. 
- - message.set_oneof_string_field("spam"); - - (*message.mutable_map_field())[777].set_key("key777"); - (*message.mutable_map_field())[777].set_value("value777"); - (*message.mutable_map_field())[888].set_key("key888"); - (*message.mutable_map_field())[888].set_value("value888"); - - auto rowCollector = ParseRows(message, config, schema, rowCount); - for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { - auto firstNode = GetComposite(rowCollector.GetRowValue(rowIndex, "first")); - ASSERT_EQ(firstNode->GetType(), ENodeType::List); - const auto& firstList = firstNode->AsList(); - ASSERT_EQ(firstList->GetChildCount(), 17); - - EXPECT_EQ(firstList->GetChildOrThrow(0)->GetType(), ENodeType::Entity); - EXPECT_EQ(firstList->GetChildValueOrThrow<TString>(1), "Two"); - EXPECT_EQ(firstList->GetChildValueOrThrow<i64>(2), 44); - - ASSERT_EQ(firstList->GetChildOrThrow(3)->GetType(), ENodeType::List); - EXPECT_EQ(ConvertTo<std::vector<i64>>(firstList->GetChildOrThrow(3)), (std::vector<i64>{55, 56, 57})); - - ASSERT_EQ(firstList->GetChildOrThrow(4)->GetType(), ENodeType::List); - EXPECT_EQ(ConvertTo<std::vector<i64>>(firstList->GetChildOrThrow(4)), (std::vector<i64>{})); - - ASSERT_EQ(firstList->GetChildOrThrow(5)->GetType(), ENodeType::List); - EXPECT_EQ(firstList->GetChildOrThrow(5)->AsList()->GetChildValueOrThrow<TString>(0), "key"); - EXPECT_EQ(firstList->GetChildOrThrow(5)->AsList()->GetChildValueOrThrow<TString>(1), "value"); - - ASSERT_EQ(firstList->GetChildOrThrow(6)->GetType(), ENodeType::List); - ASSERT_EQ(firstList->GetChildOrThrow(6)->AsList()->GetChildCount(), 2); - - const auto& firstSubNode1 = firstList->GetChildOrThrow(6)->AsList()->GetChildOrThrow(0); - ASSERT_EQ(firstSubNode1->GetType(), ENodeType::List); - ASSERT_EQ(firstSubNode1->AsList()->GetChildCount(), 2); - EXPECT_EQ(firstSubNode1->AsList()->GetChildValueOrThrow<TString>(0), "key1"); - EXPECT_EQ(firstSubNode1->AsList()->GetChildValueOrThrow<TString>(1), "value1"); - - const auto& firstSubNode2 = 
firstList->GetChildOrThrow(6)->AsList()->GetChildOrThrow(1); - ASSERT_EQ(firstSubNode2->GetType(), ENodeType::List); - ASSERT_EQ(firstSubNode2->AsList()->GetChildCount(), 2); - EXPECT_EQ(firstSubNode2->AsList()->GetChildValueOrThrow<TString>(0), "key2"); - EXPECT_EQ(firstSubNode2->AsList()->GetChildValueOrThrow<TString>(1), "value2"); - - ASSERT_EQ(firstList->GetChildOrThrow(7)->GetType(), ENodeType::Int64); - EXPECT_EQ(firstList->GetChildValueOrThrow<i64>(7), 4422); - - ASSERT_EQ(firstList->GetChildOrThrow(8)->GetType(), ENodeType::Map); - EXPECT_NODES_EQUAL( - firstList->GetChildOrThrow(8), - BuildYsonNodeFluently() - .BeginMap() - .Item("key").Value("value") - .EndMap()); - - ASSERT_EQ(firstList->GetChildOrThrow(9)->GetType(), ENodeType::Entity); - - EXPECT_NODES_EQUAL( - firstList->GetChildOrThrow(10), - BuildYsonNodeFluently() - .BeginList() - .Item().Value(false) - .Item().Value(42) - .Item().Entity() - .EndList()); - - EXPECT_NODES_EQUAL( - firstList->GetChildOrThrow(11), - BuildYsonNodeFluently() - .BeginList() - .Item().Value("MaxInt32") - .Item().Value("MinusFortyTwo") - .EndList()); - - // optional_repeated_bool_field. - ASSERT_EQ(firstList->GetChildOrThrow(12)->GetType(), ENodeType::Entity); - - // oneof_field. - EXPECT_NODES_EQUAL( - firstList->GetChildOrThrow(13), - BuildYsonNodeFluently() - .BeginList() - .Item().Value(2) - .Item().BeginList() - .Item().Value("KEY") - .Item().Entity() - .EndList() - .EndList()); - - // optional_oneof_field. - ASSERT_EQ(firstList->GetChildOrThrow(14)->GetType(), ENodeType::Entity); - - // map_field. 
- EXPECT_NODES_EQUAL( - SortMapByKey(firstList->GetChildOrThrow(15)), - BuildYsonNodeFluently() - .BeginList() - .Item().BeginList() - .Item().Value(111) - .Item().BeginList() - .Item().Value("key111") - .Item().Value("value111") - .EndList() - .EndList() - .Item().BeginList() - .Item().Value(222) - .Item().BeginList() - .Item().Value("key222") - .Item().Value("value222") - .EndList() - .EndList() - .EndList()); - - // field_missing_from_proto2. - ASSERT_EQ(firstList->GetChildOrThrow(16)->GetType(), ENodeType::Entity); - - auto secondNode = GetComposite(rowCollector.GetRowValue(rowIndex, "second")); - ASSERT_EQ(secondNode->GetType(), ENodeType::List); - EXPECT_EQ(ConvertTo<std::vector<i64>>(secondNode), (std::vector<i64>{101, 102, 103})); - - auto repeatedMessageNode = GetComposite(rowCollector.GetRowValue(rowIndex, "repeated_message_field")); - ASSERT_EQ(repeatedMessageNode->GetType(), ENodeType::List); - ASSERT_EQ(repeatedMessageNode->AsList()->GetChildCount(), 2); - - const auto& subNode1 = repeatedMessageNode->AsList()->GetChildOrThrow(0); - ASSERT_EQ(subNode1->GetType(), ENodeType::List); - ASSERT_EQ(subNode1->AsList()->GetChildCount(), 2); - EXPECT_EQ(subNode1->AsList()->GetChildValueOrThrow<TString>(0), "key11"); - EXPECT_EQ(subNode1->AsList()->GetChildValueOrThrow<TString>(1), "value11"); - - const auto& subNode2 = repeatedMessageNode->AsList()->GetChildOrThrow(1); - ASSERT_EQ(subNode2->GetType(), ENodeType::List); - ASSERT_EQ(subNode2->AsList()->GetChildCount(), 2); - EXPECT_EQ(subNode2->AsList()->GetChildValueOrThrow<TString>(0), "key21"); - EXPECT_EQ(subNode2->AsList()->GetChildValueOrThrow<TString>(1), "value21"); - - auto repeatedInt64Node = GetComposite(rowCollector.GetRowValue(rowIndex, "repeated_int64_field")); - EXPECT_EQ(ConvertTo<std::vector<i64>>(repeatedInt64Node), (std::vector<i64>{31, 32, 33})); - - auto anotherRepeatedInt64Node = GetComposite(rowCollector.GetRowValue(rowIndex, "another_repeated_int64_field")); - 
EXPECT_EQ(ConvertTo<std::vector<i64>>(anotherRepeatedInt64Node), (std::vector<i64>{})); - - auto anyValue = rowCollector.GetRowValue(rowIndex, "any_field"); - ASSERT_EQ(anyValue.Type, EValueType::Int64); - EXPECT_EQ(anyValue.Data.Int64, 4321); - - EXPECT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "int64_field")), -64); - EXPECT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "uint64_field")), 64u); - EXPECT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "int32_field")), -32); - EXPECT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "uint32_field")), 32u); - - EXPECT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "enum_int_field")), -42); - EXPECT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "enum_string_string_field")), "Three"); - - EXPECT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "utf8_field")), HelloWorldInChinese); - - auto repeatedRepeatedOptionalAnyNode = GetComposite(rowCollector.GetRowValue(rowIndex, "repeated_optional_any_field")); - auto expectedRepeatedOptionalAnyNode = BuildYsonNodeFluently() - .BeginList() - .Item().Entity() - .Item().Value(1) - .Item().Value("qwe") - .Item().Value(true) - .EndList(); - EXPECT_NODES_EQUAL(repeatedRepeatedOptionalAnyNode, expectedRepeatedOptionalAnyNode); - - auto actualOtherComplexField = GetComposite(rowCollector.GetRowValue(rowIndex, "other_complex_field")); - EXPECT_NODES_EQUAL(actualOtherComplexField, otherComplexFieldPositional); - - EXPECT_NODES_EQUAL( - GetComposite(rowCollector.GetRowValue(rowIndex, "packed_repeated_int64_field")), - ConvertToNode(TYsonString(TStringBuf("[-123456789000;0]")))); - - EXPECT_NODES_EQUAL( - GetComposite(rowCollector.GetRowValue(rowIndex, "optional_repeated_int64_field")), - ConvertToNode(TYsonString(TStringBuf("[-4242]")))); - - EXPECT_NODES_EQUAL( - GetComposite(rowCollector.GetRowValue(rowIndex, "oneof_field")), - ConvertToNode(TYsonString(TStringBuf("[1; \"spam\"]")))); - - EXPECT_FALSE(rowCollector.FindRowValue(rowIndex, "optional_oneof_field")); - - // map_field. 
- EXPECT_NODES_EQUAL( - SortMapByKey(GetComposite(rowCollector.GetRowValue(rowIndex, "map_field"))), - ConvertToNode(TYsonString(TStringBuf("[[777; [key777; value777]]; [888; [key888; value888]]]")))); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -std::vector<TTableSchemaPtr> CreateSeveralTablesSchemas() -{ - return { - New<TTableSchema>(std::vector<TColumnSchema>{ - {"embedded", StructLogicalType({ - {"enum_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"int64_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - })}, - {"repeated_int64_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - {"any_field", SimpleLogicalType(ESimpleLogicalValueType::Any)}, - }), - New<TTableSchema>(std::vector<TColumnSchema>{ - {"enum_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"int64_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - }), - // Empty schema. - New<TTableSchema>(), - }; -} - -INodePtr CreateSeveralTablesConfig(EProtoFormatType protoFormatType) -{ - if (protoFormatType == EProtoFormatType::FileDescriptor) { - return CreateFileDescriptorConfig<TSeveralTablesMessageFirst, TSeveralTablesMessageSecond, TSeveralTablesMessageThird>(); - } - YT_VERIFY(protoFormatType == EProtoFormatType::Structured); - - return BuildYsonNodeFluently() - .BeginAttributes() - .Item("enumerations").Value(EnumerationsConfig) - .Item("tables") - .BeginList() - // Table #1. 
- .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("embedded") - .Item("field_number").Value(1) - .Item("proto_type").Value("structured_message") - .Item("fields") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("int64_field") - .Item("field_number").Value(2) - .Item("proto_type").Value("int64") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("enum_field") - .Item("field_number").Value(1) - .Item("proto_type").Value("enum_string") - .Item("enumeration_name").Value("EEnum") - .EndMap() - .EndList() - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("repeated_int64_field") - .Item("field_number").Value(2) - .Item("proto_type").Value("int64") - .Item("repeated").Value(true) - .EndMap() - .Item() - .BeginMap() - // In schema it is of type "any". - .Item("name").Value("any_field") - .Item("field_number").Value(3) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - - // Table #2. - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("int64_field") - .Item("field_number").Value(2) - .Item("proto_type").Value("int64") - .EndMap() - .Item() - .BeginMap() - .Item("name").Value("enum_field") - .Item("field_number").Value(1) - .Item("proto_type").Value("enum_string") - .Item("enumeration_name").Value("EEnum") - .EndMap() - .EndList() - .EndMap() - - // Table #3. 
- .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("string_field") - .Item("field_number").Value(1) - .Item("proto_type").Value("string") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndAttributes() - .Value("protobuf"); -} - -using TProtobufFormatSeveralTablesParam = std::tuple<EProtoFormatType>; - -class TProtobufFormatSeveralTables - : public ::testing::TestWithParam<TProtobufFormatSeveralTablesParam> -{ }; - -INSTANTIATE_TEST_SUITE_P( - FileDescriptor, - TProtobufFormatSeveralTables, - ::testing::Values(TProtobufFormatSeveralTablesParam{ - EProtoFormatType::FileDescriptor})); - -INSTANTIATE_TEST_SUITE_P( - Structured, - TProtobufFormatSeveralTables, - ::testing::Values(TProtobufFormatSeveralTablesParam{ - EProtoFormatType::Structured})); - -TEST_P(TProtobufFormatSeveralTables, Write) -{ - auto [protoFormatType] = GetParam(); - - auto schemas = CreateSeveralTablesSchemas(); - auto configNode = CreateSeveralTablesConfig(protoFormatType); - - auto config = ConvertTo<TProtobufFormatConfigPtr>(configNode->Attributes().ToMap()); - - auto nameTable = New<TNameTable>(); - auto embeddedId = nameTable->RegisterName("embedded"); - auto anyFieldId = nameTable->RegisterName("any_field"); - auto int64FieldId = nameTable->RegisterName("int64_field"); - auto repeatedInt64Id = nameTable->RegisterName("repeated_int64_field"); - auto enumFieldId = nameTable->RegisterName("enum_field"); - auto stringFieldId = nameTable->RegisterName("string_field"); - auto tableIndexId = nameTable->RegisterName(TableIndexColumnName); - - TString result; - TStringOutput resultStream(result); - auto controlAttributesConfig = New<TControlAttributesConfig>(); - controlAttributesConfig->EnableTableIndex = true; - controlAttributesConfig->EnableEndOfStream = true; - auto writer = CreateWriterForProtobuf( - std::move(config), - schemas, - nameTable, - CreateAsyncAdapter(&resultStream), - true, - std::move(controlAttributesConfig), - 0); - - 
auto embeddedYson = BuildYsonStringFluently() - .BeginList() - .Item().Value("Two") - .Item().Value(44) - .EndList() - .ToString(); - - auto repeatedInt64Yson = ConvertToYsonString(std::vector<i64>{31, 32, 33}).ToString(); - - { - TUnversionedRowBuilder builder; - builder.AddValue(MakeUnversionedCompositeValue(embeddedYson, embeddedId)); - builder.AddValue(MakeUnversionedCompositeValue(repeatedInt64Yson, repeatedInt64Id)); - builder.AddValue(MakeUnversionedInt64Value(4321, anyFieldId)); - writer->Write({builder.GetRow()}); - } - { - TUnversionedRowBuilder builder; - builder.AddValue(MakeUnversionedStringValue("Two", enumFieldId)); - builder.AddValue(MakeUnversionedInt64Value(999, int64FieldId)); - builder.AddValue(MakeUnversionedInt64Value(1, tableIndexId)); - writer->Write({builder.GetRow()}); - } - { - TUnversionedRowBuilder builder; - builder.AddValue(MakeUnversionedStringValue("blah", stringFieldId)); - builder.AddValue(MakeUnversionedInt64Value(2, tableIndexId)); - writer->Write({builder.GetRow()}); - } - - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput input(result); - TLenvalParser lenvalParser(&input); - - { - auto entry = lenvalParser.Next(); - ASSERT_TRUE(entry); - - NYT::TSeveralTablesMessageFirst message; - ASSERT_TRUE(message.ParseFromString(entry->RowData)); - - const auto& embedded = message.embedded(); - EXPECT_EQ(embedded.enum_field(), EEnum::Two); - EXPECT_EQ(embedded.int64_field(), 44); - - std::vector<i64> repeatedInt64Field( - message.repeated_int64_field().begin(), - message.repeated_int64_field().end()); - EXPECT_EQ(repeatedInt64Field, (std::vector<i64>{31, 32, 33})); - EXPECT_EQ(message.int64_field(), 4321); - } - { - auto entry = lenvalParser.Next(); - ASSERT_TRUE(entry); - - NYT::TSeveralTablesMessageSecond message; - ASSERT_TRUE(message.ParseFromString(entry->RowData)); - - EXPECT_EQ(message.enum_field(), EEnum::Two); - EXPECT_EQ(message.int64_field(), 999); - } - { - auto entry = lenvalParser.Next(); - ASSERT_TRUE(entry); - 
- NYT::TSeveralTablesMessageThird message; - ASSERT_TRUE(message.ParseFromString(entry->RowData)); - - EXPECT_EQ(message.string_field(), "blah"); - } - ASSERT_FALSE(lenvalParser.IsEndOfStream()); - ASSERT_FALSE(lenvalParser.Next()); - ASSERT_TRUE(lenvalParser.IsEndOfStream()); - ASSERT_FALSE(lenvalParser.Next()); -} - -TEST_P(TProtobufFormatSeveralTables, Parse) -{ - auto [protoFormatType] = GetParam(); - - auto schemas = CreateSeveralTablesSchemas(); - auto configNode = CreateSeveralTablesConfig(protoFormatType); - auto config = ConvertTo<TProtobufFormatConfigPtr>(configNode->Attributes().ToMap()); - - std::vector<TCollectingValueConsumer> rowCollectors; - std::vector<std::unique_ptr<IParser>> parsers; - for (const auto& schema : schemas) { - rowCollectors.emplace_back(schema); - } - for (int tableIndex = 0; tableIndex < static_cast<int>(schemas.size()); ++tableIndex) { - parsers.push_back(CreateParserForProtobuf( - &rowCollectors[tableIndex], - config, - tableIndex)); - } - - NYT::TSeveralTablesMessageFirst firstMessage; - auto* embedded = firstMessage.mutable_embedded(); - embedded->set_enum_field(EEnum::Two); - embedded->set_int64_field(44); - - firstMessage.add_repeated_int64_field(55); - firstMessage.add_repeated_int64_field(56); - firstMessage.add_repeated_int64_field(57); - - firstMessage.set_int64_field(4444); - - NYT::TSeveralTablesMessageSecond secondMessage; - secondMessage.set_enum_field(EEnum::Two); - secondMessage.set_int64_field(44); - - NYT::TSeveralTablesMessageThird thirdMessage; - thirdMessage.set_string_field("blah"); - - auto parse = [] (auto& parser, const auto& message) { - TString lenvalBytes; - { - TStringOutput out(lenvalBytes); - auto messageSize = static_cast<ui32>(message.ByteSizeLong()); - out.Write(&messageSize, sizeof(messageSize)); - ASSERT_TRUE(message.SerializeToArcadiaStream(&out)); - } - parser->Read(lenvalBytes); - parser->Finish(); - }; - - parse(parsers[0], firstMessage); - parse(parsers[1], secondMessage); - 
parse(parsers[2], thirdMessage); - - { - const auto& rowCollector = rowCollectors[0]; - ASSERT_EQ(static_cast<int>(rowCollector.Size()), 1); - - auto embeddedNode = GetComposite(rowCollector.GetRowValue(0, "embedded")); - ASSERT_EQ(ConvertToTextYson(embeddedNode), "[\"Two\";44;]"); - - auto repeatedInt64Node = GetComposite(rowCollector.GetRowValue(0, "repeated_int64_field")); - ASSERT_EQ(ConvertToTextYson(repeatedInt64Node), "[55;56;57;]"); - - auto int64Field = GetInt64(rowCollector.GetRowValue(0, "any_field")); - EXPECT_EQ(int64Field, 4444); - } - - { - const auto& rowCollector = rowCollectors[1]; - ASSERT_EQ(static_cast<int>(rowCollector.Size()), 1); - - EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "enum_field")), "Two"); - EXPECT_EQ(GetInt64(rowCollector.GetRowValue(0, "int64_field")), 44); - } - - { - const auto& rowCollector = rowCollectors[2]; - ASSERT_EQ(static_cast<int>(rowCollector.Size()), 1); - - EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "string_field")), "blah"); - } -} - -TEST(TProtobufFormat, SchemaConfigMismatch) -{ - auto createParser = [] (const TTableSchemaPtr& schema, const INodePtr& configNode) { - TCollectingValueConsumer rowCollector(schema); - return CreateParserForProtobuf( - &rowCollector, - ConvertTo<TProtobufFormatConfigPtr>(configNode), - 0); - }; - auto createSeveralTableWriter = [] (const std::vector<TTableSchemaPtr>& schemas, const INodePtr& configNode) { - TString result; - TStringOutput resultStream(result); - return CreateWriterForProtobuf( - ConvertTo<TProtobufFormatConfigPtr>(configNode), - schemas, - New<TNameTable>(), - CreateAsyncAdapter(&resultStream), - true, - New<TControlAttributesConfig>(), - 0); - }; - auto createWriter = [&] (const TTableSchemaPtr& schema, const INodePtr& configNode) { - createSeveralTableWriter({schema}, configNode); - }; - - auto schema_struct_with_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ - {"struct", StructLogicalType({ - {"int64_field", 
OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - })}, - }); - - auto schema_struct_with_uint64 = New<TTableSchema>(std::vector<TColumnSchema>{ - {"struct", StructLogicalType({ - {"int64_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Uint64))}, - })}, - }); - - auto config_struct_with_int64 = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("struct") - .Item("field_number").Value(1) - .Item("proto_type").Value("structured_message") - .Item("fields") - .BeginList() - .Item().BeginMap() - .Item("name").Value("int64_field") - .Item("field_number").Value(2) - // Wrong type. - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - // OK. - EXPECT_NO_THROW(createParser(schema_struct_with_int64, config_struct_with_int64)); - EXPECT_NO_THROW(createWriter(schema_struct_with_int64, config_struct_with_int64)); - - // Types mismatch. - EXPECT_THROW_WITH_SUBSTRING( - createParser(schema_struct_with_uint64, config_struct_with_int64), - "signedness of both types must be the same"); - EXPECT_THROW_WITH_SUBSTRING( - createWriter(schema_struct_with_uint64, config_struct_with_int64), - "signedness of both types must be the same"); - - // No schema for structured field is Ok. 
- EXPECT_NO_THROW(createParser(New<TTableSchema>(), config_struct_with_int64)); - EXPECT_NO_THROW(createWriter(New<TTableSchema>(), config_struct_with_int64)); - - auto schema_list_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ - {"repeated", ListLogicalType( - SimpleLogicalType(ESimpleLogicalValueType::Int64) - )}, - }); - - auto schema_list_optional_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ - {"repeated", ListLogicalType( - OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64)) - )}, - }); - - auto config_repeated_int64 = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("repeated") - .Item("field_number").Value(1) - .Item("repeated").Value(true) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - // OK. - EXPECT_NO_THROW(createParser(schema_list_int64, config_repeated_int64)); - EXPECT_NO_THROW(createWriter(schema_list_int64, config_repeated_int64)); - - // No schema for repeated field is Ok. - EXPECT_NO_THROW(createParser(New<TTableSchema>(), config_repeated_int64)); - EXPECT_NO_THROW(createWriter(New<TTableSchema>(), config_repeated_int64)); - - // List of optional is not allowed. - EXPECT_THROW_WITH_SUBSTRING( - createParser(schema_list_optional_int64, config_repeated_int64), - "unexpected logical metatype \"optional\""); - EXPECT_THROW_WITH_SUBSTRING( - createWriter(schema_list_optional_int64, config_repeated_int64), - "unexpected logical metatype \"optional\""); - - auto schema_optional_list_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ - {"repeated", OptionalLogicalType( - ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64)) - )}, - }); - - // Optional list is OK. 
- EXPECT_NO_THROW(createParser(schema_optional_list_int64, config_repeated_int64)); - EXPECT_NO_THROW(createWriter(schema_optional_list_int64, config_repeated_int64)); - - auto schema_optional_optional_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ - {"field", OptionalLogicalType( - OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64)) - )}, - }); - - auto config_int64 = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("field") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - // Optional of optional is not allowed. - EXPECT_THROW_WITH_SUBSTRING( - createParser(schema_optional_optional_int64, config_int64), - "unexpected logical metatype \"optional\""); - EXPECT_THROW_WITH_SUBSTRING( - createWriter(schema_optional_optional_int64, config_int64), - "unexpected logical metatype \"optional\""); - - auto schema_struct_with_both = New<TTableSchema>(std::vector<TColumnSchema>{ - {"struct", StructLogicalType({ - {"required_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"optional_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - })}, - }); - - auto config_struct_with_required = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("struct") - .Item("field_number").Value(1) - .Item("proto_type").Value("structured_message") - .Item("fields") - .BeginList() - .Item().BeginMap() - .Item("name").Value("required_field") - .Item("field_number").Value(2) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - auto config_struct_with_optional = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - 
.BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("struct") - .Item("field_number").Value(1) - .Item("proto_type").Value("structured_message") - .Item("fields") - .BeginList() - .Item().BeginMap() - .Item("name").Value("optional_field") - .Item("field_number").Value(2) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - auto config_struct_with_unknown = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("struct") - .Item("field_number").Value(1) - .Item("proto_type").Value("structured_message") - .Item("fields") - .BeginList() - .Item().BeginMap() - .Item("name").Value("required_field") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .Item().BeginMap() - .Item("name").Value("optional_field") - .Item("field_number").Value(2) - .Item("proto_type").Value("int64") - .EndMap() - .Item().BeginMap() - .Item("name").Value("unknown_field") - .Item("field_number").Value(3) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - // Schema has more fields, non-optional field is missing in protobuf config. - // Parser should fail. - EXPECT_THROW_WITH_SUBSTRING( - createParser(schema_struct_with_both, config_struct_with_optional), - "non-optional field \"required_field\" in schema is missing from protobuf config"); - // Writer feels OK. - EXPECT_NO_THROW(createWriter(schema_struct_with_both, config_struct_with_optional)); - - // Schema has more fields, optional field is missing in protobuf config. - // It's OK for both the writer and the parser. 
- EXPECT_NO_THROW(createParser(schema_struct_with_both, config_struct_with_required)); - EXPECT_NO_THROW(createWriter(schema_struct_with_both, config_struct_with_required)); - - // Protobuf config has more fields, it is always OK. - EXPECT_NO_THROW(createParser(schema_struct_with_both, config_struct_with_unknown)); - EXPECT_NO_THROW(createWriter(schema_struct_with_both, config_struct_with_unknown)); - - auto schema_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ - {"int64_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - }); - - auto config_two_tables = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("int64_field") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - .Item() - .BeginMap() - .Item("name").Value("int64_field") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - EXPECT_NO_THROW(createWriter(schema_int64, config_two_tables)); - EXPECT_THROW_WITH_SUBSTRING( - createSeveralTableWriter({schema_int64, schema_int64, schema_int64}, config_two_tables), - "Number of schemas is greater than number of tables in protobuf config: 3 > 2"); - - auto schema_variant_with_int = New<TTableSchema>(std::vector<TColumnSchema>{ - {"variant", VariantStructLogicalType({ - {"a", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - })}, - }); - auto schema_variant_with_optional_int = New<TTableSchema>(std::vector<TColumnSchema>{ - {"variant", VariantStructLogicalType({ - {"a", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - })}, - }); - - auto config_with_oneof = BuildYsonNodeFluently() - .BeginMap() - .Item("tables") - .BeginList() - .Item() - .BeginMap() - .Item("columns") - .BeginList() - 
.Item() - .BeginMap() - .Item("name").Value("variant") - .Item("proto_type").Value("oneof") - .Item("fields").BeginList() - .Item() - .BeginMap() - .Item("name").Value("a") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap(); - - // Oneof fields require schematized columns. - EXPECT_THROW_WITH_SUBSTRING( - createParser(New<TTableSchema>(), config_with_oneof), - "requires a corresponding schematized column"); - EXPECT_THROW_WITH_SUBSTRING( - createWriter(New<TTableSchema>(), config_with_oneof), - "requires a corresponding schematized column"); - - EXPECT_THROW_WITH_SUBSTRING( - createParser(schema_variant_with_optional_int, config_with_oneof), - "Optional variant field \"variant.a\""); - EXPECT_THROW_WITH_SUBSTRING( - createWriter(schema_variant_with_optional_int, config_with_oneof), - "Optional variant field \"variant.a\""); - EXPECT_NO_THROW(createParser(schema_variant_with_int, config_with_oneof)); - EXPECT_NO_THROW(createWriter(schema_variant_with_int, config_with_oneof)); -} - -TEST(TProtobufFormat, MultipleOtherColumns) -{ - auto nameTable = New<TNameTable>(); - - TString data; - TStringOutput resultStream(data); - - auto controlAttributesConfig = New<TControlAttributesConfig>(); - controlAttributesConfig->EnableTableIndex = true; - controlAttributesConfig->EnableEndOfStream = true; - - auto protoWriter = CreateWriterForProtobuf( - MakeProtobufFormatConfig({TOtherColumnsMessage::descriptor(), TOtherColumnsMessage::descriptor()}), - std::vector<TTableSchemaPtr>(2, New<TTableSchema>()), - nameTable, - CreateAsyncAdapter(&resultStream), - true, - controlAttributesConfig, - 0); - - protoWriter->Write( - std::vector<TUnversionedRow>{ - NNamedValue::MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"field1", "foo"}, - }), - NNamedValue::MakeRow(nameTable, { - {TableIndexColumnName, 1}, - {"field2", "bar"}, - }), - } - ); - WaitFor(protoWriter->Close()) - 
.ThrowOnError(); - - std::vector<TString> otherColumnsValue; - auto parser = TLenvalParser(data); - while (auto item = parser.Next()) { - TOtherColumnsMessage message; - bool parsed = message.ParseFromString(item->RowData); - EXPECT_TRUE(parsed); - otherColumnsValue.push_back(CanonizeYson(message.other_columns_field())); - } - - EXPECT_EQ( - otherColumnsValue, - std::vector<TString>({ - CanonizeYson("{field1=foo}"), - CanonizeYson("{field2=bar}"), - })); -} - -//////////////////////////////////////////////////////////////////////////////// - -using TProtobufFormatAllFieldsParameter = std::tuple<int, EProtoFormatType>; -class TProtobufFormatAllFields - : public ::testing::TestWithParam<TProtobufFormatAllFieldsParameter> -{ -public: - bool IsLegacyFormat() const - { - auto [rowCount, protoFormatType] = GetParam(); - return protoFormatType == EProtoFormatType::FileDescriptorLegacy; - } -}; - -INSTANTIATE_TEST_SUITE_P( - Specification, - TProtobufFormatAllFields, - ::testing::Values(TProtobufFormatAllFieldsParameter{1, EProtoFormatType::Structured})); - -INSTANTIATE_TEST_SUITE_P( - FileDescriptorLegacy, - TProtobufFormatAllFields, - ::testing::Values(TProtobufFormatAllFieldsParameter{1, EProtoFormatType::FileDescriptorLegacy})); - -INSTANTIATE_TEST_SUITE_P( - FileDescriptor, - TProtobufFormatAllFields, - ::testing::Values(TProtobufFormatAllFieldsParameter{1, EProtoFormatType::FileDescriptor})); - -INSTANTIATE_TEST_SUITE_P( - ManyRows, - TProtobufFormatAllFields, - ::testing::Values(TProtobufFormatAllFieldsParameter{50000, EProtoFormatType::Structured})); - -TEST_P(TProtobufFormatAllFields, Writer) -{ - auto [rowCount, protoFormatType] = GetParam(); - auto config = CreateAllFieldsConfig(protoFormatType); - - auto nameTable = New<TNameTable>(); - - auto doubleId = nameTable->RegisterName("Double"); - auto floatId = nameTable->RegisterName("Float"); - - auto int64Id = nameTable->RegisterName("Int64"); - auto uint64Id = nameTable->RegisterName("UInt64"); - auto sint64Id = 
nameTable->RegisterName("SInt64"); - auto fixed64Id = nameTable->RegisterName("Fixed64"); - auto sfixed64Id = nameTable->RegisterName("SFixed64"); - - auto int32Id = nameTable->RegisterName("Int32"); - auto uint32Id = nameTable->RegisterName("UInt32"); - auto sint32Id = nameTable->RegisterName("SInt32"); - auto fixed32Id = nameTable->RegisterName("Fixed32"); - auto sfixed32Id = nameTable->RegisterName("SFixed32"); - - auto boolId = nameTable->RegisterName("Bool"); - auto stringId = nameTable->RegisterName("String"); - auto bytesId = nameTable->RegisterName("Bytes"); - - auto enumId = nameTable->RegisterName("Enum"); - - auto messageId = nameTable->RegisterName("Message"); - - auto anyWithMapId = nameTable->RegisterName("AnyWithMap"); - auto anyWithInt64Id = nameTable->RegisterName("AnyWithInt64"); - auto anyWithStringId = nameTable->RegisterName("AnyWithString"); - - auto otherInt64ColumnId = nameTable->RegisterName("OtherInt64Column"); - auto otherDoubleColumnId = nameTable->RegisterName("OtherDoubleColumn"); - auto otherStringColumnId = nameTable->RegisterName("OtherStringColumn"); - auto otherNullColumnId = nameTable->RegisterName("OtherNullColumn"); - auto otherBooleanColumnId = nameTable->RegisterName("OtherBooleanColumn"); - auto otherAnyColumnId = nameTable->RegisterName("OtherAnyColumn"); - - auto tableIndexColumnId = nameTable->RegisterName(TableIndexColumnName); - auto rowIndexColumnId = nameTable->RegisterName(RowIndexColumnName); - auto rangeIndexColumnId = nameTable->RegisterName(RangeIndexColumnName); - - auto missintInt64Id = nameTable->RegisterName("MissingInt64"); - - TString result; - TStringOutput resultStream(result); - auto writer = CreateWriterForProtobuf( - config->Attributes(), - {New<TTableSchema>()}, - nameTable, - CreateAsyncAdapter(&resultStream), - true, - New<TControlAttributesConfig>(), - 0); - - TEmbeddedMessage embeddedMessage; - embeddedMessage.set_key("embedded_key"); - embeddedMessage.set_value("embedded_value"); - TString 
embeddedMessageBytes; - ASSERT_TRUE(embeddedMessage.SerializeToString(&embeddedMessageBytes)); - - auto mapNode = BuildYsonNodeFluently() - .BeginMap() - .Item("Key").Value("Value") - .Item("Another") - .BeginList() - .Item().Value(1) - .Item().Value("two") - .EndList() - .EndMap(); - auto ysonString = ConvertToYsonString(mapNode).ToString(); - - TUnversionedRowBuilder builder; - for (const auto& value : { - MakeUnversionedDoubleValue(3.14159, doubleId), - MakeUnversionedDoubleValue(2.71828, floatId), - - MakeUnversionedInt64Value(-1, int64Id), - MakeUnversionedUint64Value(2, uint64Id), - MakeUnversionedInt64Value(-3, sint64Id), - MakeUnversionedUint64Value(4, fixed64Id), - MakeUnversionedInt64Value(-5, sfixed64Id), - - MakeUnversionedInt64Value(-6, int32Id), - MakeUnversionedUint64Value(7, uint32Id), - MakeUnversionedInt64Value(-8, sint32Id), - MakeUnversionedUint64Value(9, fixed32Id), - MakeUnversionedInt64Value(-10, sfixed32Id), - - MakeUnversionedBooleanValue(true, boolId), - MakeUnversionedStringValue("this_is_string", stringId), - MakeUnversionedStringValue("this_is_bytes", bytesId), - - MakeUnversionedStringValue("Two", enumId), - - MakeUnversionedStringValue(embeddedMessageBytes, messageId), - - MakeUnversionedNullValue(missintInt64Id), - - MakeUnversionedInt64Value(12, tableIndexColumnId), - MakeUnversionedInt64Value(42, rowIndexColumnId), - MakeUnversionedInt64Value(333, rangeIndexColumnId), - }) { - builder.AddValue(value); - } - - if (!IsLegacyFormat()) { - builder.AddValue(MakeUnversionedAnyValue(ysonString, anyWithMapId)); - builder.AddValue(MakeUnversionedInt64Value(22, anyWithInt64Id)); - builder.AddValue(MakeUnversionedStringValue("some_string", anyWithStringId)); - - builder.AddValue(MakeUnversionedInt64Value(-123, otherInt64ColumnId)); - builder.AddValue(MakeUnversionedDoubleValue(-123.456, otherDoubleColumnId)); - builder.AddValue(MakeUnversionedStringValue("some_string", otherStringColumnId)); - 
builder.AddValue(MakeUnversionedBooleanValue(true, otherBooleanColumnId)); - builder.AddValue(MakeUnversionedAnyValue(ysonString, otherAnyColumnId)); - builder.AddValue(MakeUnversionedNullValue(otherNullColumnId)); - } - - auto row = builder.GetRow(); - std::vector<TUnversionedRow> rows(rowCount, row); - writer->Write(rows); - - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput input(result); - TLenvalParser lenvalParser(&input); - - for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { - auto entry = lenvalParser.Next(); - ASSERT_TRUE(entry); - - NYT::TMessage message; - ASSERT_TRUE(message.ParseFromString(entry->RowData)); - - EXPECT_DOUBLE_EQ(message.double_field(), 3.14159); - EXPECT_FLOAT_EQ(message.float_field(), 2.71828); - EXPECT_EQ(message.int64_field(), -1); - EXPECT_EQ(message.uint64_field(), 2u); - EXPECT_EQ(message.sint64_field(), -3); - EXPECT_EQ(message.fixed64_field(), 4u); - EXPECT_EQ(message.sfixed64_field(), -5); - - EXPECT_EQ(message.int32_field(), -6); - EXPECT_EQ(message.uint32_field(), 7u); - EXPECT_EQ(message.sint32_field(), -8); - EXPECT_EQ(message.fixed32_field(), 9u); - EXPECT_EQ(message.sfixed32_field(), -10); - - EXPECT_EQ(message.bool_field(), true); - EXPECT_EQ(message.string_field(), "this_is_string"); - EXPECT_EQ(message.bytes_field(), "this_is_bytes"); - - EXPECT_EQ(message.enum_field(), EEnum::Two); - - EXPECT_EQ(message.message_field().key(), "embedded_key"); - EXPECT_EQ(message.message_field().value(), "embedded_value"); - - if (!IsLegacyFormat()) { - EXPECT_TRUE(AreNodesEqual(ConvertToNode(TYsonString(message.any_field_with_map())), mapNode)); - EXPECT_TRUE(AreNodesEqual( - ConvertToNode(TYsonString(message.any_field_with_int64())), - BuildYsonNodeFluently().Value(22))); - EXPECT_TRUE(AreNodesEqual( - ConvertToNode(TYsonString(message.any_field_with_string())), - BuildYsonNodeFluently().Value("some_string"))); - - auto otherColumnsMap = ConvertToNode(TYsonString(message.other_columns_field()))->AsMap(); - 
EXPECT_EQ(otherColumnsMap->GetChildValueOrThrow<i64>("OtherInt64Column"), -123); - EXPECT_DOUBLE_EQ(otherColumnsMap->GetChildValueOrThrow<double>("OtherDoubleColumn"), -123.456); - EXPECT_EQ(otherColumnsMap->GetChildValueOrThrow<TString>("OtherStringColumn"), "some_string"); - EXPECT_EQ(otherColumnsMap->GetChildValueOrThrow<bool>("OtherBooleanColumn"), true); - EXPECT_TRUE(AreNodesEqual(otherColumnsMap->GetChildOrThrow("OtherAnyColumn"), mapNode)); - EXPECT_EQ(otherColumnsMap->GetChildOrThrow("OtherNullColumn")->GetType(), ENodeType::Entity); - - auto keys = otherColumnsMap->GetKeys(); - std::sort(keys.begin(), keys.end()); - std::vector<TString> expectedKeys = { - "OtherInt64Column", - "OtherDoubleColumn", - "OtherStringColumn", - "OtherBooleanColumn", - "OtherAnyColumn", - "OtherNullColumn"}; - std::sort(expectedKeys.begin(), expectedKeys.end()); - EXPECT_EQ(expectedKeys, keys); - } - } - - ASSERT_FALSE(lenvalParser.Next()); -} - -TEST_P(TProtobufFormatAllFields, Parser) -{ - auto [rowCount, protoFormatType] = GetParam(); - - auto config = CreateAllFieldsConfig(protoFormatType); - - TMessage message; - message.set_double_field(3.14159); - message.set_float_field(2.71828); - - message.set_int64_field(-1); - message.set_uint64_field(2); - message.set_sint64_field(-3); - message.set_fixed64_field(4); - message.set_sfixed64_field(-5); - - message.set_int32_field(-6); - message.set_uint32_field(7); - message.set_sint32_field(-8); - message.set_fixed32_field(9); - message.set_sfixed32_field(-10); - - message.set_bool_field(true); - message.set_string_field("this_is_string"); - message.set_bytes_field("this_is_bytes"); - message.set_enum_field(EEnum::Three); - - message.mutable_message_field()->set_key("embedded_key"); - message.mutable_message_field()->set_value("embedded_value"); - - auto mapNode = BuildYsonNodeFluently() - .BeginMap() - .Item("Key").Value("Value") - .Item("Another") - .BeginList() - .Item().Value(1) - .Item().Value("two") - .EndList() - .EndMap(); - 
- auto otherColumnsNode = BuildYsonNodeFluently() - .BeginMap() - .Item("OtherInt64Column").Value(-123) - .Item("OtherDoubleColumn").Value(-123.456) - .Item("OtherStringColumn").Value("some_string") - .Item("OtherBooleanColumn").Value(true) - .Item("OtherAnyColumn").Value(mapNode) - .Item("OtherNullColumn").Entity() - .EndMap(); - - if (!IsLegacyFormat()) { - message.set_any_field_with_map(ConvertToYsonString(mapNode).ToString()); - message.set_any_field_with_int64(BuildYsonStringFluently().Value(22).ToString()); - message.set_any_field_with_string(BuildYsonStringFluently().Value("some_string").ToString()); - message.set_other_columns_field(ConvertToYsonString(otherColumnsNode).ToString()); - } - - auto rowCollector = ParseRows( - message, - ConvertTo<TProtobufFormatConfigPtr>(config->Attributes().ToMap()), - New<TTableSchema>(), - rowCount); - - for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { - int expectedSize = IsLegacyFormat() ? 17 : 26; - ASSERT_EQ(static_cast<int>(rowCollector.GetRow(rowIndex).GetCount()), expectedSize); - - ASSERT_DOUBLE_EQ(GetDouble(rowCollector.GetRowValue(rowIndex, "Double")), 3.14159); - ASSERT_NEAR(GetDouble(rowCollector.GetRowValue(rowIndex, "Float")), 2.71828, 1e-5); - - ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "Int64")), -1); - ASSERT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "UInt64")), 2u); - ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "SInt64")), -3); - ASSERT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "Fixed64")), 4u); - ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "SFixed64")), -5); - - ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "Int32")), -6); - ASSERT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "UInt32")), 7u); - ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "SInt32")), -8); - ASSERT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "Fixed32")), 9u); - ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "SFixed32")), -10); - - 
ASSERT_EQ(GetBoolean(rowCollector.GetRowValue(rowIndex, "Bool")), true); - ASSERT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "String")), "this_is_string"); - ASSERT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "Bytes")), "this_is_bytes"); - - if (IsLegacyFormat()) { - ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "Enum")), 3); - } else { - ASSERT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "Enum")), "Three"); - } - - TEmbeddedMessage embeddedMessage; - ASSERT_TRUE(embeddedMessage.ParseFromString(GetString(rowCollector.GetRowValue(rowIndex, "Message")))); - ASSERT_EQ(embeddedMessage.key(), "embedded_key"); - ASSERT_EQ(embeddedMessage.value(), "embedded_value"); - - if (!IsLegacyFormat()) { - ASSERT_TRUE(AreNodesEqual(GetAny(rowCollector.GetRowValue(rowIndex, "AnyWithMap")), mapNode)); - ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "AnyWithInt64")), 22); - ASSERT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "AnyWithString")), "some_string"); - - ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "OtherInt64Column")), -123); - ASSERT_DOUBLE_EQ(GetDouble(rowCollector.GetRowValue(rowIndex, "OtherDoubleColumn")), -123.456); - ASSERT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "OtherStringColumn")), "some_string"); - ASSERT_EQ(GetBoolean(rowCollector.GetRowValue(rowIndex, "OtherBooleanColumn")), true); - ASSERT_TRUE(AreNodesEqual(GetAny(rowCollector.GetRowValue(rowIndex, "OtherAnyColumn")), mapNode)); - ASSERT_EQ(rowCollector.GetRowValue(rowIndex, "OtherNullColumn").Type, EValueType::Null); - } - } -} - -//////////////////////////////////////////////////////////////////////////////// - -class TProtobufFormatCompat - : public ::testing::Test -{ -public: - static TTableSchemaPtr GetEarlySchema() - { - static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ - {"a", OptionalLogicalType(VariantStructLogicalType({ - {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - }))}, - }); - return schema; - } - - static 
TTableSchemaPtr GetFirstMiddleSchema() - { - static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ - {"a", OptionalLogicalType(VariantStructLogicalType({ - {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, - }))}, - {"b", OptionalLogicalType(StructLogicalType({ - {"x", SimpleLogicalType(ESimpleLogicalValueType::String)}, - }))}, - }); - return schema; - } - - static TTableSchemaPtr GetSecondMiddleSchema() - { - static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ - {"a", OptionalLogicalType(VariantStructLogicalType({ - {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, - }))}, - {"b", OptionalLogicalType(StructLogicalType({ - {"x", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"y", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - }))}, - }); - return schema; - } - - static TTableSchemaPtr GetThirdMiddleSchema() - { - static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ - {"a", OptionalLogicalType(VariantStructLogicalType({ - {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, - }))}, - {"b", OptionalLogicalType(StructLogicalType({ - {"x", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"y", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - {"z", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - }))}, - }); - return schema; - } - - static TTableSchemaPtr GetLateSchema() - { - static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ - {"a", OptionalLogicalType(VariantStructLogicalType({ - {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"f3", SimpleLogicalType(ESimpleLogicalValueType::Boolean)}, - }))}, - {"c", 
OptionalLogicalType(ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Boolean)))}, - {"b", OptionalLogicalType(StructLogicalType({ - {"x", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"y", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - {"z", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - }))}, - }); - return schema; - } - - static TProtobufFormatConfigPtr GetFirstMiddleConfig() - { - static const auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() - .BeginMap().Item("tables").BeginList().Item().BeginMap().Item("columns").BeginList() - .Item().BeginMap() - .Item("name").Value("a") - .Item("field_number").Value(0) - .Item("proto_type").Value("oneof") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("f1") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .EndList() - .EndMap() - .Item().BeginMap() - .Item("name").Value("b") - .Item("field_number").Value(2) - .Item("proto_type").Value("structured_message") - .Item("fields") - .BeginList() - .Item().BeginMap() - .Item("name").Value("x") - .Item("field_number").Value(1) - .Item("proto_type").Value("string") - .EndMap() - .EndList() - .EndMap() - .EndList().EndMap().EndList().EndMap()); - return config; - } - - static TProtobufFormatConfigPtr GetSecondMiddleConfig() - { - static const auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() - .BeginMap().Item("tables").BeginList().Item().BeginMap().Item("columns").BeginList() - .Item().BeginMap() - .Item("name").Value("a") - .Item("field_number").Value(0) - .Item("proto_type").Value("oneof") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("f1") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .Item().BeginMap() - .Item("name").Value("f2") - .Item("field_number").Value(101) - .Item("proto_type").Value("string") - .EndMap() - .EndList() - 
.EndMap() - .Item().BeginMap() - .Item("name").Value("b") - .Item("field_number").Value(2) - .Item("proto_type").Value("structured_message") - .Item("fields") - .BeginList() - .Item().BeginMap() - .Item("name").Value("x") - .Item("field_number").Value(1) - .Item("proto_type").Value("string") - .EndMap() - .Item().BeginMap() - .Item("name").Value("y") - .Item("field_number").Value(2) - .Item("proto_type").Value("string") - .EndMap() - .EndList() - .EndMap() - .EndList().EndMap().EndList().EndMap()); - return config; - } -}; - -template <typename TMessage> -TMessage WriteRow( - TUnversionedRow row, - const TProtobufFormatConfigPtr& config, - const TTableSchemaPtr& schema, - const TNameTablePtr& nameTable) -{ - TString result; - TStringOutput resultStream(result); - - auto writer = CreateWriterForProtobuf( - config, - {schema}, - nameTable, - CreateAsyncAdapter(&resultStream), - true, - New<TControlAttributesConfig>(), - 0); - writer->Write(std::vector<TUnversionedRow>{row}); - writer->Close().Get().ThrowOnError(); - - TStringInput input(result); - TLenvalParser lenvalParser(&input); - auto entry = lenvalParser.Next(); - if (!entry) { - THROW_ERROR_EXCEPTION("Unexpected end of stream in lenval parser"); - } - TMessage message; - if (!message.ParseFromString(entry->RowData)) { - THROW_ERROR_EXCEPTION("Failed to parse message"); - } - if (lenvalParser.Next()) { - THROW_ERROR_EXCEPTION("Unexpected entry in lenval parser"); - } - return message; -} - -TEST_F(TProtobufFormatCompat, Write) -{ - auto nameTable = TNameTable::FromSchema(*GetLateSchema()); - auto config = GetSecondMiddleConfig(); - - auto writeRow = [&] (TUnversionedRow row, const TTableSchemaPtr& schema) { - return WriteRow<NYT::TCompatMessage>(row, config, schema, nameTable); - }; - - { - auto earlyRow = MakeRow(nameTable, { - {"a", EValueType::Composite, "[0; -24]"} - }); - - SCOPED_TRACE("early"); - auto message = writeRow(earlyRow, GetEarlySchema()); - EXPECT_EQ(message.f1(), -24); - 
EXPECT_FALSE(message.has_f2()); - EXPECT_EQ(message.has_b(), false); - } - { - auto firstMiddleRow = MakeRow(nameTable, { - {"a", EValueType::Composite, "[1; foobar]"}, - {"b", EValueType::Composite, "[foo]"}, - }); - - SCOPED_TRACE("firstMiddle"); - auto message = writeRow(firstMiddleRow, GetFirstMiddleSchema()); - EXPECT_FALSE(message.has_f1()); - EXPECT_EQ(message.f2(), "foobar"); - EXPECT_EQ(message.b().x(), "foo"); - EXPECT_EQ(message.b().has_y(), false); - } - { - auto secondMiddleRow = MakeRow(nameTable, { - {"a", EValueType::Composite, "[1; foobar]"}, - {"b", EValueType::Composite, "[foo; bar]"}, - }); - - SCOPED_TRACE("secondMiddle"); - auto message = writeRow(secondMiddleRow, GetSecondMiddleSchema()); - EXPECT_FALSE(message.has_f1()); - EXPECT_EQ(message.f2(), "foobar"); - EXPECT_EQ(message.b().x(), "foo"); - EXPECT_EQ(message.b().y(), "bar"); - } - { - auto thirdMiddleRow = MakeRow(nameTable, { - {"a", EValueType::Composite, "[1; foobar]"}, - {"b", EValueType::Composite, "[foo; bar; spam]"}, - }); - - SCOPED_TRACE("thirdMiddle"); - auto message = writeRow(thirdMiddleRow, GetThirdMiddleSchema()); - EXPECT_FALSE(message.has_f1()); - EXPECT_EQ(message.f2(), "foobar"); - EXPECT_EQ(message.b().x(), "foo"); - EXPECT_EQ(message.b().y(), "bar"); - } - { - auto lateRow = MakeRow(nameTable, { - {"a", EValueType::Composite, "[2; %true]"}, - {"c", EValueType::Composite, "[%false; %true; %false]"}, - {"b", EValueType::Composite, "[foo; bar; spam]"}, - }); - - SCOPED_TRACE("late"); - auto message = writeRow(lateRow, GetLateSchema()); - EXPECT_FALSE(message.has_f1()); - EXPECT_FALSE(message.has_f2()); - EXPECT_EQ(message.b().x(), "foo"); - EXPECT_EQ(message.b().y(), "bar"); - } -} - -TEST_F(TProtobufFormatCompat, Parse) -{ - auto config = GetSecondMiddleConfig(); - - NYT::TCompatMessage message; - message.set_f2("Sandiego"); - message.mutable_b()->set_x("foo"); - message.mutable_b()->set_y("bar"); - - { - SCOPED_TRACE("early"); - auto collector = ParseRows(message, 
config, GetEarlySchema()); - EXPECT_FALSE(collector.FindRowValue(0, "a")); - EXPECT_FALSE(collector.GetNameTable()->FindId("b")); - EXPECT_FALSE(collector.GetNameTable()->FindId("c")); - } - { - SCOPED_TRACE("firstMiddle"); - auto collector = ParseRows(message, config, GetFirstMiddleSchema()); - EXPECT_NODES_EQUAL( - GetComposite(collector.GetRowValue(0, "a")), - ConvertToNode(TYsonString(TStringBuf("[1;Sandiego]")))); - EXPECT_NODES_EQUAL(GetComposite(collector.GetRowValue(0, "b")), ConvertToNode(TYsonString(TStringBuf("[foo]")))); - EXPECT_FALSE(collector.GetNameTable()->FindId("c")); - } - { - SCOPED_TRACE("secondMiddle"); - auto collector = ParseRows(message, config, GetSecondMiddleSchema()); - EXPECT_NODES_EQUAL( - GetComposite(collector.GetRowValue(0, "a")), - ConvertToNode(TYsonString(TStringBuf("[1;Sandiego]")))); - EXPECT_NODES_EQUAL(GetComposite(collector.GetRowValue(0, "b")), ConvertToNode(TYsonString(TStringBuf("[foo;bar]")))); - EXPECT_FALSE(collector.GetNameTable()->FindId("c")); - } - { - SCOPED_TRACE("thirdMiddle"); - auto collector = ParseRows(message, config, GetThirdMiddleSchema()); - EXPECT_NODES_EQUAL( - GetComposite(collector.GetRowValue(0, "a")), - ConvertToNode(TYsonString(TStringBuf("[1;Sandiego]")))); - EXPECT_NODES_EQUAL(GetComposite(collector.GetRowValue(0, "b")), ConvertToNode(TYsonString(TStringBuf("[foo;bar;#]")))); - EXPECT_FALSE(collector.GetNameTable()->FindId("c")); - } - { - SCOPED_TRACE("late"); - auto collector = ParseRows(message, config, GetLateSchema()); - EXPECT_NODES_EQUAL( - GetComposite(collector.GetRowValue(0, "a")), - ConvertToNode(TYsonString(TStringBuf("[1;Sandiego]")))); - EXPECT_NODES_EQUAL(GetComposite(collector.GetRowValue(0, "b")), ConvertToNode(TYsonString(TStringBuf("[foo;bar;#]")))); - EXPECT_TRUE(collector.GetNameTable()->FindId("c")); - } -} - -TEST_F(TProtobufFormatCompat, ParseWrong) -{ - NYT::TCompatMessage message; - message.set_f1(42); - message.mutable_b()->set_x("foo"); - 
message.mutable_b()->set_y("bar"); - - EXPECT_THROW_WITH_SUBSTRING( - ParseRows(message, GetFirstMiddleConfig(), GetFirstMiddleSchema()), - "Unexpected field number 2"); -} - -//////////////////////////////////////////////////////////////////////////////// - -class TProtobufFormatEnumCompat - : public ::testing::Test -{ -public: - static TTableSchemaPtr CreateTableSchema() - { - static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ - {"optional_enum", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - {"required_enum", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"repeated_enum", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - {"packed_repeated_enum", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - {"inner", OptionalLogicalType(StructLogicalType({ - {"optional_enum", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - {"required_enum", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"repeated_enum", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - {"packed_repeated_enum", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, - }))}, - }); - return schema; - } - static TProtobufFormatConfigPtr CreateProtobufFormatConfig() - { - static const auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() - .BeginMap() - .Item("enumerations").BeginMap() - .Item("ECompatEnum") - .BeginMap() - .Item("One").Value(1) - .Item("Two").Value(2) - .Item("Three").Value(3) - .EndMap() - .EndMap() - .Item("tables").BeginList().Item().BeginMap().Item("columns").BeginList() - .Item().BeginMap() - .Item("name").Value("optional_enum") - .Item("field_number").Value(1) - .Item("proto_type").Value("enum_string") - .Item("enum_writing_mode").Value("skip_unknown_values") - .Item("enumeration_name").Value("ECompatEnum") - .EndMap() - .Item().BeginMap() - .Item("name").Value("required_enum") - 
.Item("field_number").Value(2) - .Item("proto_type").Value("enum_string") - .Item("enum_writing_mode").Value("skip_unknown_values") - .Item("enumeration_name").Value("ECompatEnum") - .EndMap() - .Item().BeginMap() - .Item("name").Value("repeated_enum") - .Item("field_number").Value(3) - .Item("proto_type").Value("enum_string") - .Item("repeated").Value(true) - .Item("enum_writing_mode").Value("skip_unknown_values") - .Item("enumeration_name").Value("ECompatEnum") - .EndMap() - .Item().BeginMap() - .Item("name").Value("packed_repeated_enum") - .Item("field_number").Value(4) - .Item("proto_type").Value("enum_string") - .Item("repeated").Value(true) - .Item("packed").Value(true) - .Item("enum_writing_mode").Value("skip_unknown_values") - .Item("enumeration_name").Value("ECompatEnum") - .EndMap() - .Item().BeginMap() - .Item("name").Value("inner") - .Item("field_number").Value(100) - .Item("proto_type").Value("structured_message") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("optional_enum") - .Item("field_number").Value(1) - .Item("proto_type").Value("enum_string") - .Item("enum_writing_mode").Value("skip_unknown_values") - .Item("enumeration_name").Value("ECompatEnum") - .EndMap() - .Item().BeginMap() - .Item("name").Value("required_enum") - .Item("field_number").Value(2) - .Item("proto_type").Value("enum_string") - .Item("enum_writing_mode").Value("skip_unknown_values") - .Item("enumeration_name").Value("ECompatEnum") - .EndMap() - .Item().BeginMap() - .Item("name").Value("repeated_enum") - .Item("field_number").Value(3) - .Item("proto_type").Value("enum_string") - .Item("repeated").Value(true) - .Item("enum_writing_mode").Value("skip_unknown_values") - .Item("enumeration_name").Value("ECompatEnum") - .EndMap() - .Item().BeginMap() - .Item("name").Value("packed_repeated_enum") - .Item("field_number").Value(4) - .Item("proto_type").Value("enum_string") - .Item("repeated").Value(true) - .Item("packed").Value(true) - 
.Item("enum_writing_mode").Value("skip_unknown_values") - .Item("enumeration_name").Value("ECompatEnum") - .EndMap() - .EndList() - .EndMap() - .EndList().EndMap().EndList().EndMap()); - return config; - } - -}; - -TEST_F(TProtobufFormatEnumCompat, WriteCanSkipUnknownEnumValues) -{ - auto schema = CreateTableSchema(); - auto config = CreateProtobufFormatConfig(); - - auto nameTable = TNameTable::FromSchema(*schema); - - auto row = MakeRow(nameTable, { - {"optional_enum", "MinusFortyTwo"}, - {"required_enum", "One"}, - {"repeated_enum", EValueType::Composite, "[MinusFortyTwo;One;MinusFortyTwo]"}, - {"packed_repeated_enum", EValueType::Composite, "[MinusFortyTwo;Two;MinusFortyTwo]"}, - {"inner", EValueType::Composite, "[MinusFortyTwo;Two;[MinusFortyTwo;Two];[One;MinusFortyTwo]]"}, - }); - - auto collectRepeated = [](const auto& repeated) { - std::vector<TEnumCompat::ECompatEnum> values; - for (auto value : repeated) { - values.push_back(static_cast<TEnumCompat::ECompatEnum>(value)); - } - return values; - }; - - auto message = WriteRow<TEnumCompat>(row, config, schema, nameTable); - - EXPECT_FALSE(message.has_optional_enum()); - EXPECT_EQ(message.required_enum(), TEnumCompat::One); - EXPECT_EQ(collectRepeated(message.repeated_enum()), std::vector{TEnumCompat::One}); - EXPECT_EQ(collectRepeated(message.packed_repeated_enum()), std::vector{TEnumCompat::Two}); - - ASSERT_TRUE(message.has_inner()); - EXPECT_FALSE(message.inner().has_optional_enum()); - EXPECT_EQ(message.inner().required_enum(), TEnumCompat::Two); - EXPECT_EQ(collectRepeated(message.inner().repeated_enum()), std::vector{TEnumCompat::Two}); - EXPECT_EQ(collectRepeated(message.inner().packed_repeated_enum()), std::vector{TEnumCompat::One}); -} - -TEST_F(TProtobufFormatEnumCompat, WriteDoesntSkipRequiredFields) -{ - auto schema = CreateTableSchema(); - auto config = CreateProtobufFormatConfig(); - - auto nameTable = TNameTable::FromSchema(*schema); - - { - auto row = MakeRow(nameTable, {{"required_enum", 
"MinusFortyTwo"}}); - EXPECT_THROW_WITH_SUBSTRING(WriteRow<TEnumCompat>(row, config, schema, nameTable), "Invalid value for enum"); - } - { - auto row = MakeRow(nameTable, {{"inner", EValueType::Composite, "[#;MinusFortyTwo;#;#]"},}); - EXPECT_THROW_WITH_SUBSTRING(WriteRow<TEnumCompat>(row, config, schema, nameTable), "Invalid value for enum"); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -class TProtobufFormatRuntimeErrors - : public ::testing::Test -{ -public: - static TTableSchemaPtr GetSchemaWithVariant(bool optional = false) - { - auto variantType = VariantStructLogicalType({ - {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, - }); - return New<TTableSchema>(std::vector<TColumnSchema>{ - {"a", optional ? OptionalLogicalType(variantType) : variantType}, - }); - } - - static TTableSchemaPtr GetSchemaWithStruct(bool optional = false) - { - auto structType = StructLogicalType({ - {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, - }); - return New<TTableSchema>(std::vector<TColumnSchema>{ - {"a", optional ? 
OptionalLogicalType(structType) : structType}, - }); - } - - static TProtobufFormatConfigPtr GetConfigWithVariant() - { - static const auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() - .BeginMap().Item("tables").BeginList().Item().BeginMap().Item("columns").BeginList() - .Item().BeginMap() - .Item("name").Value("a") - .Item("proto_type").Value("oneof") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("f1") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .Item().BeginMap() - .Item("name").Value("f2") - .Item("field_number").Value(2) - .Item("proto_type").Value("string") - .EndMap() - .EndList() - .EndMap() - .EndList().EndMap().EndList().EndMap()); - return config; - } - - static TProtobufFormatConfigPtr GetConfigWithStruct() - { - static const auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() - .BeginMap().Item("tables").BeginList().Item().BeginMap().Item("columns").BeginList() - .Item().BeginMap() - .Item("name").Value("a") - .Item("field_number").Value(1) - .Item("proto_type").Value("structured_message") - .Item("fields").BeginList() - .Item().BeginMap() - .Item("name").Value("f1") - .Item("field_number").Value(1) - .Item("proto_type").Value("int64") - .EndMap() - .Item().BeginMap() - .Item("name").Value("f2") - .Item("field_number").Value(2) - .Item("proto_type").Value("string") - .EndMap() - .EndList() - .EndMap() - .EndList().EndMap().EndList().EndMap()); - return config; - } -}; - -TEST_F(TProtobufFormatRuntimeErrors, ParseVariant) -{ - { - SCOPED_TRACE("Optional variant, all missing"); - TMessageWithOneof message; - auto collector = ParseRows(message, GetConfigWithVariant(), GetSchemaWithVariant(/* optional */ true)); - EXPECT_FALSE(collector.FindRowValue(0, "a")); - } - { - SCOPED_TRACE("All missing"); - TMessageWithOneof message; - EXPECT_THROW_WITH_SUBSTRING( - ParseRows(message, GetConfigWithVariant(), GetSchemaWithVariant()), - "required field 
\"<root>.a\" is missing"); - } - { - SCOPED_TRACE("two alternatives"); - TMessageWithStruct::TStruct message; - message.set_f1(5); - message.set_f2("boo"); - EXPECT_THROW_WITH_SUBSTRING( - ParseRows(message, GetConfigWithVariant(), GetSchemaWithVariant()), - "multiple entries for oneof field \"<root>.a\""); - } -} - -TEST_F(TProtobufFormatRuntimeErrors, ParseStruct) -{ - { - SCOPED_TRACE("Optional submessage missing"); - TMessageWithStruct message; - auto collector = ParseRows(message, GetConfigWithStruct(), GetSchemaWithStruct(/* optional */ true)); - EXPECT_FALSE(collector.FindRowValue(0, "a")); - } - { - SCOPED_TRACE("Required submessage missing"); - TMessageWithStruct message; - EXPECT_THROW_WITH_SUBSTRING( - ParseRows(message, GetConfigWithStruct(), GetSchemaWithStruct()), - "required field \"<root>.a\" is missing"); - } - { - SCOPED_TRACE("All fields missing"); - TMessageWithStruct message; - message.mutable_a(); - EXPECT_THROW_WITH_SUBSTRING( - ParseRows(message, GetConfigWithStruct(), GetSchemaWithStruct()), - "required field \"<root>.a.f1\" is missing"); - } - { - SCOPED_TRACE("Second field missing"); - TMessageWithStruct message; - message.mutable_a()->set_f1(17); - EXPECT_THROW_WITH_SUBSTRING( - ParseRows(message, GetConfigWithStruct(), GetSchemaWithStruct()), - "required field \"<root>.a.f2\" is missing"); - } - { - SCOPED_TRACE("All present"); - TMessageWithStruct message; - message.mutable_a()->set_f1(17); - message.mutable_a()->set_f2("foobar"); - auto collector = ParseRows(message, GetConfigWithStruct(), GetSchemaWithStruct()); - EXPECT_NODES_EQUAL( - GetComposite(collector.GetRowValue(0, "a")), - ConvertToNode(TYsonString(TStringBuf("[17;foobar]")))); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT diff --git a/yt/yt/client/unittests/protobuf_format_ut.proto b/yt/yt/client/unittests/protobuf_format_ut.proto deleted file mode 100644 index 06258de619..0000000000 --- 
a/yt/yt/client/unittests/protobuf_format_ut.proto +++ /dev/null @@ -1,255 +0,0 @@ -import "yt/yt_proto/yt/formats/extension.proto"; - -package NYT.NProtobufFormatTest; - -enum EEnum -{ - One = 1; - Two = 2; - Three = 3; - - MinusFortyTwo = -42; - - MinInt32 = -2147483648; - MaxInt32 = 2147483647; -} - -message TEmbeddedStruct { - optional float float1 = 1; - optional string string1 = 2; -}; - -message TEmbedded2Message { - option (NYT.default_field_flags) = SERIALIZATION_YT; - optional uint64 embedded2_num = 10; - optional TEmbeddedStruct embedded2_struct = 17; - repeated string embedded2_repeated = 42; -}; - -message TEmbedded1Message { - option (NYT.default_field_flags) = SERIALIZATION_YT; - optional TEmbedded2Message t2 = 1 [(NYT.flags) = EMBEDDED]; - oneof variant { - string str_variant = 101; - uint64 uint_variant = 102; - } - optional uint64 embedded_num = 10; // make intentional field_num collision! - optional string embedded_extra_field = 11; -}; -message TEmbeddingMessage { - optional bytes other_columns_field = 15 [(NYT.flags) = OTHER_COLUMNS]; - optional TEmbedded1Message t1 = 2 [(NYT.flags) = EMBEDDED]; - optional uint64 num = 12; - optional string extra_field = 13; -}; - -message TEmbeddedMessage -{ - optional string key = 1; - optional string value = 2; -} - -message TMessageWithStructuredEmbedded -{ - option (NYT.default_field_flags) = SERIALIZATION_YT; - - message TFirstMessage - { - option (NYT.default_field_flags) = SERIALIZATION_YT; - - optional EEnum enum_field = 1 [(NYT.flags) = ENUM_STRING]; - optional int64 int64_field = 2; - repeated int64 repeated_int64_field = 3; - optional TEmbeddedMessage message_field = 4; - repeated TEmbeddedMessage repeated_message_field = 5; - optional bytes any_int64_field = 6 [(NYT.flags) = ANY]; - optional bytes any_map_field = 7 [(NYT.flags) = ANY]; - optional int64 optional_int64_field = 8; - repeated int64 another_repeated_int64_field = 9; - repeated bytes repeated_optional_any_field = 10 [(NYT.flags) = ANY]; - 
repeated EEnum packed_repeated_enum_field = 11 [packed=true, (NYT.flags) = ENUM_STRING]; - repeated bool optional_repeated_bool_field = 12; - oneof oneof_field { - string oneof_string_field_1 = 101; - string oneof_string_field = 102; - TEmbeddedMessage oneof_message_field = 1000; - } - oneof optional_oneof_field { - string optional_oneof_string_field_1 = 201; - string optional_oneof_string_field = 202; - TEmbeddedMessage optional_oneof_message_field = 2000; - } - map<int64, TEmbeddedMessage> map_field = 13 [(NYT.flags) = MAP_AS_DICT]; - } - - message TSecondMessage - { - optional int64 one = 2; - optional int64 two = 500000000; - optional int64 three = 100500; - } - - optional TFirstMessage first = 1; - optional TSecondMessage second = 2; - repeated TEmbeddedMessage repeated_message_field = 3; - repeated int64 repeated_int64_field = 4; - optional int64 int64_any_field = 5 [(NYT.column_name) = "any_field"]; - - optional int32 int32_field = 6 [(NYT.column_name) = "int64_field"]; - optional uint32 uint32_field = 7 [(NYT.column_name) = "uint64_field"]; - optional int64 int64_field = 8 [(NYT.column_name) = "int32_field"]; - optional uint64 uint64_field = 9 [(NYT.column_name) = "uint32_field"]; - - optional EEnum enum_int_field = 10 [(NYT.flags) = ENUM_INT]; - optional EEnum enum_string_string_field = 11 [(NYT.flags) = ENUM_STRING]; - optional EEnum enum_string_int64_field = 12 [(NYT.flags) = ENUM_STRING]; - - - repeated int64 another_repeated_int64_field = 13; - - repeated bytes repeated_optional_any_field = 14 [(NYT.flags) = ANY]; - - optional bytes other_columns_field = 15 [(NYT.flags) = OTHER_COLUMNS]; - - optional string utf8_field = 16; - - repeated int64 packed_repeated_int64_field = 17 [packed=true]; - - repeated int64 optional_repeated_int64_field = 18; - - oneof oneof_field { - string oneof_string_field_1 = 101; - string oneof_string_field = 102; - TEmbeddedMessage oneof_message_field = 1000; - } - - oneof optional_oneof_field { - string 
optional_oneof_string_field_1 = 201; - string optional_oneof_string_field = 202; - TEmbeddedMessage optional_oneof_message_field = 2000; - } - - map<int64, TEmbeddedMessage> map_field = 19 [(NYT.flags) = MAP_AS_DICT]; -} - -message TSeveralTablesMessageFirst -{ - option (NYT.default_field_flags) = SERIALIZATION_YT; - - message TEmbedded - { - optional EEnum enum_field = 1 [(NYT.flags) = ENUM_STRING]; - optional int64 int64_field = 2; - } - optional TEmbedded embedded = 1; - repeated int64 repeated_int64_field = 2; - optional int64 int64_field = 3 [(NYT.column_name) = "any_field"]; -} - -message TSeveralTablesMessageSecond -{ - optional EEnum enum_field = 1 [(NYT.flags) = ENUM_STRING]; - optional int64 int64_field = 2; -} - -message TSeveralTablesMessageThird -{ - optional string string_field = 1; -} - -message TMessage -{ - optional double double_field = 1 [(NYT.column_name) = "Double"]; - optional float float_field = 2 [(NYT.column_name) = "Float"]; - - optional int64 int64_field = 3 [(NYT.column_name) = "Int64"]; - optional uint64 uint64_field = 4 [(NYT.column_name) = "UInt64"]; - optional sint64 sint64_field = 5 [(NYT.column_name) = "SInt64"]; - optional fixed64 fixed64_field = 6 [(NYT.column_name) = "Fixed64"]; - optional sfixed64 sfixed64_field = 7 [(NYT.column_name) = "SFixed64"]; - - optional int32 int32_field = 8 [(NYT.column_name) = "Int32"]; - optional uint32 uint32_field = 9 [(NYT.column_name) = "UInt32"]; - optional sint32 sint32_field = 10 [(NYT.column_name) = "SInt32"]; - optional fixed32 fixed32_field = 11 [(NYT.column_name) = "Fixed32"]; - optional sfixed32 sfixed32_field = 12 [(NYT.column_name) = "SFixed32"]; - - optional bool bool_field = 13 [(NYT.column_name) = "Bool"]; - optional string string_field = 14 [(NYT.column_name) = "String"]; - optional bytes bytes_field = 15 [(NYT.column_name) = "Bytes"]; - - optional EEnum enum_field = 16 [(NYT.column_name) = "Enum", (NYT.flags) = ENUM_STRING]; - optional TEmbeddedMessage message_field = 17 
[(NYT.column_name) = "Message"]; - - optional bytes any_field_with_map = 18 [(NYT.column_name) = "AnyWithMap", (NYT.flags) = ANY]; - optional bytes any_field_with_int64 = 19 [(NYT.column_name) = "AnyWithInt64", (NYT.flags) = ANY]; - optional bytes any_field_with_string = 20 [(NYT.column_name) = "AnyWithString", (NYT.flags) = ANY]; - optional bytes other_columns_field = 21 [(NYT.flags) = OTHER_COLUMNS]; - - optional int64 missing_int64_field = 22 [(NYT.column_name) = "MissingInt64"]; -} - -message TCompatMessage -{ - message TEmbedded - { - optional string x = 1; - optional string y = 2; - } - - oneof a { - int64 f1 = 1; - string f2 = 101; - } - optional TEmbedded b = 2; -} - -message TMessageWithOneof -{ - oneof variant { - int64 f1 = 1; - string f2 = 2; - } -} - -message TMessageWithStruct -{ - message TStruct - { - optional int64 f1 = 1; - optional string f2 = 2; - } - optional TStruct a = 1; -} - -message TOtherColumnsMessage -{ - optional bytes other_columns_field = 1 [(NYT.flags) = OTHER_COLUMNS]; -} - -message TEnumCompat { - option (NYT.default_field_flags) = SERIALIZATION_YT; - option (NYT.default_field_flags) = ENUM_SKIP_UNKNOWN_VALUES; - - enum ECompatEnum { - One = 1; - Two = 2; - Three = 3; - } - - - message TStruct - { - optional ECompatEnum optional_enum = 1; - required ECompatEnum required_enum = 2; - repeated ECompatEnum repeated_enum = 3; - repeated ECompatEnum packed_repeated_enum = 4 [packed=true, (NYT.flags) = ENUM_STRING]; - } - - optional ECompatEnum optional_enum = 1; - required ECompatEnum required_enum = 2; - repeated ECompatEnum repeated_enum = 3; - repeated ECompatEnum packed_repeated_enum = 4 [packed=true, (NYT.flags) = ENUM_STRING]; - - optional TStruct inner = 100; -} diff --git a/yt/yt/client/unittests/row_helpers.cpp b/yt/yt/client/unittests/row_helpers.cpp deleted file mode 100644 index d28628c5ab..0000000000 --- a/yt/yt/client/unittests/row_helpers.cpp +++ /dev/null @@ -1,70 +0,0 @@ -#include "row_helpers.h" - -#include 
<yt/yt/core/yson/string.h> -#include <yt/yt/core/ytree/convert.h> - -namespace NYT { - -using namespace NTableClient; - -//////////////////////////////////////////////////////////////////////////////// - -static void EnsureTypesMatch(EValueType expected, EValueType actual) -{ - if (expected != actual) { - THROW_ERROR_EXCEPTION("Unexpected type of TUnversionedValue: expected %Qlv, actual %Qlv", - expected, - actual); - } -} - -i64 GetInt64(const TUnversionedValue& row) -{ - EnsureTypesMatch(EValueType::Int64, row.Type); - return row.Data.Int64; -} - -ui64 GetUint64(const TUnversionedValue& row) -{ - EnsureTypesMatch(EValueType::Uint64, row.Type); - return row.Data.Uint64; -} - -double GetDouble(const NTableClient::TUnversionedValue& row) -{ - EnsureTypesMatch(EValueType::Double, row.Type); - return row.Data.Double; -} - -bool GetBoolean(const TUnversionedValue& row) -{ - EnsureTypesMatch(EValueType::Boolean, row.Type); - return row.Data.Boolean; -} - -TString GetString(const TUnversionedValue& row) -{ - EnsureTypesMatch(EValueType::String, row.Type); - return row.AsString(); -} - -NYTree::INodePtr GetAny(const NTableClient::TUnversionedValue& row) -{ - EnsureTypesMatch(EValueType::Any, row.Type); - return NYTree::ConvertToNode(NYson::TYsonString(row.AsString())); -} - -NYTree::INodePtr GetComposite(const NTableClient::TUnversionedValue& row) -{ - EnsureTypesMatch(EValueType::Composite, row.Type); - return NYTree::ConvertToNode(NYson::TYsonString(row.AsString())); -} - -bool IsNull(const NTableClient::TUnversionedValue& row) -{ - return row.Type == EValueType::Null; -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT diff --git a/yt/yt/client/unittests/row_helpers.h b/yt/yt/client/unittests/row_helpers.h deleted file mode 100644 index 4a3fbd854f..0000000000 --- a/yt/yt/client/unittests/row_helpers.h +++ /dev/null @@ -1,111 +0,0 @@ -#pragma once - -#include <yt/yt/client/table_client/unversioned_row.h> -#include 
<yt/yt/client/table_client/name_table.h> -#include <yt/yt/client/table_client/schema.h> -#include <yt/yt/client/table_client/value_consumer.h> - -#include <vector> - -namespace NYT { - -//////////////////////////////////////////////////////////////////////////////// - -class TCollectingValueConsumer - : public NTableClient::IValueConsumer -{ -public: - explicit TCollectingValueConsumer(NTableClient::TTableSchemaPtr schema = New<NTableClient::TTableSchema>()) - : Schema_(std::move(schema)) - { } - - explicit TCollectingValueConsumer(NTableClient::TNameTablePtr nameTable, NTableClient::TTableSchemaPtr schema = New<NTableClient::TTableSchema>()) - : Schema_(std::move(schema)) - , NameTable_(std::move(nameTable)) - { } - - const NTableClient::TNameTablePtr& GetNameTable() const override - { - return NameTable_; - } - - const NTableClient::TTableSchemaPtr& GetSchema() const override - { - return Schema_; - } - - bool GetAllowUnknownColumns() const override - { - return true; - } - - void OnBeginRow() override - { } - - void OnValue(const NTableClient::TUnversionedValue& value) override - { - Builder_.AddValue(value); - } - - void OnEndRow() override - { - RowList_.emplace_back(Builder_.FinishRow()); - } - - NTableClient::TUnversionedRow GetRow(size_t rowIndex) - { - return RowList_.at(rowIndex); - } - - std::optional<NTableClient::TUnversionedValue> FindRowValue(size_t rowIndex, TStringBuf columnName) const - { - NTableClient::TUnversionedRow row = RowList_.at(rowIndex); - auto id = GetNameTable()->GetIdOrThrow(columnName); - - for (const auto& value : row) { - if (value.Id == id) { - return value; - } - } - return std::nullopt; - } - - NTableClient::TUnversionedValue GetRowValue(size_t rowIndex, TStringBuf columnName) const - { - auto row = FindRowValue(rowIndex, columnName); - if (!row) { - THROW_ERROR_EXCEPTION("Cannot find column %Qv", columnName); - } - return *row; - } - - size_t Size() const - { - return RowList_.size(); - } - - const 
std::vector<NTableClient::TUnversionedOwningRow>& GetRowList() const { - return RowList_; - } - -private: - const NTableClient::TTableSchemaPtr Schema_; - const NTableClient::TNameTablePtr NameTable_ = New<NTableClient::TNameTable>(); - NTableClient::TUnversionedOwningRowBuilder Builder_; - std::vector<NTableClient::TUnversionedOwningRow> RowList_; -}; - -//////////////////////////////////////////////////////////////////////////////// - -i64 GetInt64(const NTableClient::TUnversionedValue& row); -ui64 GetUint64(const NTableClient::TUnversionedValue& row); -double GetDouble(const NTableClient::TUnversionedValue& row); -bool GetBoolean(const NTableClient::TUnversionedValue& row); -TString GetString(const NTableClient::TUnversionedValue& row); -NYTree::INodePtr GetAny(const NTableClient::TUnversionedValue& row); -NYTree::INodePtr GetComposite(const NTableClient::TUnversionedValue& row); -bool IsNull(const NTableClient::TUnversionedValue& row); - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT diff --git a/yt/yt/client/unittests/schema_ut.cpp b/yt/yt/client/unittests/schema_ut.cpp index 3482b866b5..2b5f2d5e8a 100644 --- a/yt/yt/client/unittests/schema_ut.cpp +++ b/yt/yt/client/unittests/schema_ut.cpp @@ -1,4 +1,4 @@ -#include "logical_type_shortcuts.h" +#include <yt/yt/library/logical_type_shortcuts/logical_type_shortcuts.h> #include "yt/yt/client/table_client/logical_type.h" #include <yt/yt/core/test_framework/framework.h> diff --git a/yt/yt/client/unittests/schemaful_dsv_parser_ut.cpp b/yt/yt/client/unittests/schemaful_dsv_parser_ut.cpp deleted file mode 100644 index 000ae5f635..0000000000 --- a/yt/yt/client/unittests/schemaful_dsv_parser_ut.cpp +++ /dev/null @@ -1,259 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/core/test_framework/yson_consumer_mock.h> - -#include <yt/yt/client/formats/schemaful_dsv_parser.h> - -#include <yt/yt/core/yson/null_consumer.h> - -namespace 
NYT::NFormats { -namespace { - -using namespace NYson; - -using ::testing::InSequence; -using ::testing::StrictMock; -using ::testing::NiceMock; - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TSchemafulDsvParserTest, Simple) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("5")); - EXPECT_CALL(Mock, OnKeyedItem("b")); - EXPECT_CALL(Mock, OnStringScalar("6")); - EXPECT_CALL(Mock, OnEndMap()); - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("100")); - EXPECT_CALL(Mock, OnKeyedItem("b")); - EXPECT_CALL(Mock, OnStringScalar("max\tignat")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = - "5\t6\n" - "100\tmax\\tignat\n"; - - auto config = New<TSchemafulDsvFormatConfig>(); - config->Columns = std::vector<TString>(); - config->Columns->push_back("a"); - config->Columns->push_back("b"); - - ParseSchemafulDsv(input, &Mock, config); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TSchemafulDsvParserTest, TableIndex) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginAttributes()); - EXPECT_CALL(Mock, OnKeyedItem("table_index")); - EXPECT_CALL(Mock, OnInt64Scalar(1)); - EXPECT_CALL(Mock, OnEndAttributes()); - EXPECT_CALL(Mock, OnEntity()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("x")); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginAttributes()); - EXPECT_CALL(Mock, OnKeyedItem("table_index")); - EXPECT_CALL(Mock, OnInt64Scalar(0)); - EXPECT_CALL(Mock, OnEndAttributes()); - EXPECT_CALL(Mock, OnEntity()); 
- - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("y")); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("z")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = - "1\tx\n" - "0\ty\n" - "0\tz\n"; - - auto config = New<TSchemafulDsvFormatConfig>(); - config->Columns = std::vector<TString>(); - config->Columns->push_back("a"); - config->EnableTableIndex = true; - - ParseSchemafulDsv(input, &Mock, config); -} - -TEST(TSchemafulDsvParserTest, TooManyRows) -{ - TString input = "5\t6\n"; - - auto config = New<TSchemafulDsvFormatConfig>(); - config->Columns = {"a"}; - - EXPECT_THROW({ ParseSchemafulDsv(input, GetNullYsonConsumer(), config); }, std::exception); -} - -TEST(TSchemafulDsvParserTest, SpecialSymbols) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - auto value = TString("6\0", 2); - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("5\r")); - EXPECT_CALL(Mock, OnKeyedItem("b")); - EXPECT_CALL(Mock, OnStringScalar(value)); - EXPECT_CALL(Mock, OnEndMap()); - - TString input("5\r\t6\0\n", 6); - - auto config = New<TSchemafulDsvFormatConfig>(); - config->Columns = std::vector<TString>(); - config->Columns->push_back("a"); - config->Columns->push_back("b"); - - ParseSchemafulDsv(input, &Mock, config); -} - -TEST(TSchemafulDsvParserTest, EnabledEscaping) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - auto value = TString("6\0", 2); - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("5\r\r")); - EXPECT_CALL(Mock, OnKeyedItem("b")); - EXPECT_CALL(Mock, OnStringScalar(value)); - EXPECT_CALL(Mock, OnEndMap()); - - TString 
input("5\r\\r\t6\0\n", 8); - - auto config = New<TSchemafulDsvFormatConfig>(); - config->Columns = std::vector<TString>(); - config->Columns->push_back("a"); - config->Columns->push_back("b"); - config->EnableEscaping = true; - - ParseSchemafulDsv(input, &Mock, config); -} - -TEST(TSchemafulDsvParserTest, DisabledEscaping) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - auto value = TString("6\0", 2); - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("5\r\\r")); - EXPECT_CALL(Mock, OnKeyedItem("b")); - EXPECT_CALL(Mock, OnStringScalar(value)); - EXPECT_CALL(Mock, OnEndMap()); - - TString input("5\r\\r\t6\0\n", 8); - - auto config = New<TSchemafulDsvFormatConfig>(); - config->Columns = std::vector<TString>(); - config->Columns->push_back("a"); - config->Columns->push_back("b"); - config->EnableEscaping = false; - - ParseSchemafulDsv(input, &Mock, config); -} - -TEST(TSchemafulDsvParserTest, ColumnsNamesHeader) -{ - TString input("a\tb\n1\t2\n"); - - auto config = New<TSchemafulDsvFormatConfig>(); - config->Columns = std::vector<TString>(); - config->Columns->push_back("a"); - config->Columns->push_back("b"); - config->EnableColumnNamesHeader = true; - - EXPECT_THROW(ParseSchemafulDsv(input, GetNullYsonConsumer(), config), std::exception); -} - -TEST(TSchemafulDsvParserTest, MissingValueModePrintSentinel) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - TString input = "x\t\tz\n"; - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("x")); - EXPECT_CALL(Mock, OnKeyedItem("b")); - EXPECT_CALL(Mock, OnStringScalar("")); - EXPECT_CALL(Mock, OnKeyedItem("c")); - EXPECT_CALL(Mock, OnStringScalar("z")); - EXPECT_CALL(Mock, OnEndMap()); - - auto config = New<TSchemafulDsvFormatConfig>(); - config->Columns = {"a", "b", "c"}; - // By default missing_value_mode = 
fail and no sentinel values are used, - // i. e. there is no way to represent YSON entity with this format. - - ParseSchemafulDsv(input, &Mock, config); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("x")); - EXPECT_CALL(Mock, OnKeyedItem("b")); - EXPECT_CALL(Mock, OnEntity()); - EXPECT_CALL(Mock, OnKeyedItem("c")); - EXPECT_CALL(Mock, OnStringScalar("z")); - EXPECT_CALL(Mock, OnEndMap()); - - config->MissingValueMode = EMissingSchemafulDsvValueMode::PrintSentinel; - // By default missing_value_sentinel = "". - - ParseSchemafulDsv(input, &Mock, config); - - input = "null\tNULL\t\n"; - - config->MissingValueSentinel = "NULL"; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("null")); - EXPECT_CALL(Mock, OnKeyedItem("b")); - EXPECT_CALL(Mock, OnEntity()); - EXPECT_CALL(Mock, OnKeyedItem("c")); - EXPECT_CALL(Mock, OnStringScalar("")); - EXPECT_CALL(Mock, OnEndMap()); - - ParseSchemafulDsv(input, &Mock, config); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NFormats diff --git a/yt/yt/client/unittests/schemaful_dsv_writer_ut.cpp b/yt/yt/client/unittests/schemaful_dsv_writer_ut.cpp deleted file mode 100644 index 90a3af0dcb..0000000000 --- a/yt/yt/client/unittests/schemaful_dsv_writer_ut.cpp +++ /dev/null @@ -1,344 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> -#include "format_writer_ut.h" - -#include <yt/yt/client/formats/schemaful_dsv_writer.h> -#include <yt/yt/client/formats/format.h> - -#include <yt/yt/client/table_client/name_table.h> - -#include <yt/yt/core/concurrency/async_stream.h> - -#include <limits> - -namespace NYT::NFormats { -namespace { - -//////////////////////////////////////////////////////////////////////////////// - -using namespace NYTree; -using namespace NYson; 
-using namespace NConcurrency; -using namespace NTableClient; - -class TSchemalessWriterForSchemafulDsvTest - : public ::testing::Test -{ -protected: - TNameTablePtr NameTable_; - int KeyAId_; - int KeyBId_; - int KeyCId_; - int KeyDId_; - int TableIndexId_; - int RangeIndexId_; - int RowIndexId_; - TSchemafulDsvFormatConfigPtr Config_; - - ISchemalessFormatWriterPtr Writer_; - - TStringStream OutputStream_; - - TSchemalessWriterForSchemafulDsvTest() { - NameTable_ = New<TNameTable>(); - KeyAId_ = NameTable_->RegisterName("column_a"); - KeyBId_ = NameTable_->RegisterName("column_b"); - KeyCId_ = NameTable_->RegisterName("column_c"); - KeyDId_ = NameTable_->RegisterName("column_d"); - TableIndexId_ = NameTable_->RegisterName(TableIndexColumnName); - RowIndexId_ = NameTable_->RegisterName(RowIndexColumnName); - RangeIndexId_ = NameTable_->RegisterName(RangeIndexColumnName); - - Config_ = New<TSchemafulDsvFormatConfig>(); - } - - void CreateStandardWriter() { - auto controlAttributesConfig = New<TControlAttributesConfig>(); - controlAttributesConfig->EnableTableIndex = Config_->EnableTableIndex; - Writer_ = CreateSchemalessWriterForSchemafulDsv( - Config_, - NameTable_, - CreateAsyncAdapter(static_cast<IOutputStream*>(&OutputStream_)), - false, // enableContextSaving - controlAttributesConfig, - 0 /* keyColumnCount */); - } -}; - -TEST_F(TSchemalessWriterForSchemafulDsvTest, Simple) -{ - Config_->Columns = {"column_b", "column_c", "column_a"}; - CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("value_a", KeyAId_)); - row1.AddValue(MakeUnversionedInt64Value(-42, KeyBId_)); - row1.AddValue(MakeUnversionedBooleanValue(true, KeyCId_)); - row1.AddValue(MakeUnversionedStringValue("garbage", KeyDId_)); - - // Ignore system columns. 
- row1.AddValue(MakeUnversionedInt64Value(2, TableIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(42, RowIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(1, RangeIndexId_)); - - TUnversionedRowBuilder row2; - // The order is reversed. - row2.AddValue(MakeUnversionedStringValue("value_c", KeyCId_)); - row2.AddValue(MakeUnversionedBooleanValue(false, KeyBId_)); - row2.AddValue(MakeUnversionedInt64Value(23, KeyAId_)); - - std::vector<TUnversionedRow> rows = {row1.GetRow(), row2.GetRow()}; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString expectedOutput = - "-42\ttrue\tvalue_a\n" - "false\tvalue_c\t23\n"; - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -// This test shows the actual behavior of writer. It is OK to change it in the future. :) -TEST_F(TSchemalessWriterForSchemafulDsvTest, TrickyDoubleRepresentations) -{ - Config_->Columns = {"column_a", "column_b", "column_c", "column_d"}; - CreateStandardWriter(); - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedDoubleValue(1.234567890123456, KeyAId_)); - row1.AddValue(MakeUnversionedDoubleValue(42, KeyBId_)); - row1.AddValue(MakeUnversionedDoubleValue(1e300, KeyCId_)); - row1.AddValue(MakeUnversionedDoubleValue(-1e-300, KeyDId_)); - - std::vector<TUnversionedRow> rows = {row1.GetRow()}; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - TString expectedOutput = "1.234567890123456\t42.\t1e+300\t-1e-300\n"; - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForSchemafulDsvTest, IntegralTypeRepresentations) -{ - Config_->Columns = {"column_a", "column_b", "column_c", "column_d"}; - CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedInt64Value(0LL, KeyAId_)); - row1.AddValue(MakeUnversionedInt64Value(-1LL, KeyBId_)); - row1.AddValue(MakeUnversionedInt64Value(1LL, KeyCId_)); - row1.AddValue(MakeUnversionedInt64Value(99LL, KeyDId_)); - - 
TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedInt64Value(123LL, KeyAId_)); - row2.AddValue(MakeUnversionedInt64Value(-123LL, KeyBId_)); - row2.AddValue(MakeUnversionedInt64Value(1234LL, KeyCId_)); - row2.AddValue(MakeUnversionedInt64Value(-1234LL, KeyDId_)); - - TUnversionedRowBuilder row3; - row3.AddValue(MakeUnversionedUint64Value(0ULL, KeyAId_)); - row3.AddValue(MakeUnversionedUint64Value(98ULL, KeyBId_)); - row3.AddValue(MakeUnversionedUint64Value(987ULL, KeyCId_)); - row3.AddValue(MakeUnversionedUint64Value(9876ULL, KeyDId_)); - - TUnversionedRowBuilder row4; - row4.AddValue(MakeUnversionedInt64Value(std::numeric_limits<i64>::max(), KeyAId_)); - row4.AddValue(MakeUnversionedInt64Value(std::numeric_limits<i64>::min(), KeyBId_)); - row4.AddValue(MakeUnversionedInt64Value(std::numeric_limits<i64>::min() + 1LL, KeyCId_)); - row4.AddValue(MakeUnversionedUint64Value(std::numeric_limits<ui64>::max(), KeyDId_)); - - std::vector<TUnversionedRow> rows = - {row1.GetRow(), row2.GetRow(), row3.GetRow(), row4.GetRow()}; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - TString expectedOutput = - "0\t-1\t1\t99\n" - "123\t-123\t1234\t-1234\n" - "0\t98\t987\t9876\n" - "9223372036854775807\t-9223372036854775808\t-9223372036854775807\t18446744073709551615\n"; - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForSchemafulDsvTest, EmptyColumnList) -{ - Config_->Columns = std::vector<TString>(); - CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedInt64Value(0LL, KeyAId_)); - - - std::vector<TUnversionedRow> rows = { row1.GetRow() }; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - TString expectedOutput = "\n"; - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForSchemafulDsvTest, MissingValueMode) -{ - Config_->Columns = {"column_a", "column_b", "column_c"}; - - TUnversionedRowBuilder 
row1; - row1.AddValue(MakeUnversionedStringValue("Value1A", KeyAId_)); - row1.AddValue(MakeUnversionedStringValue("Value1B", KeyBId_)); - row1.AddValue(MakeUnversionedStringValue("Value1C", KeyCId_)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("Value2A", KeyAId_)); - row2.AddValue(MakeUnversionedStringValue("Value2C", KeyCId_)); - - TUnversionedRowBuilder row3; - row3.AddValue(MakeUnversionedStringValue("Value3A", KeyAId_)); - row3.AddValue(MakeUnversionedStringValue("Value3B", KeyBId_)); - row3.AddValue(MakeUnversionedStringValue("Value3C", KeyCId_)); - - std::vector<TUnversionedRow> rows = - {row1.GetRow(), row2.GetRow(), row3.GetRow()}; - - { - Config_->MissingValueMode = EMissingSchemafulDsvValueMode::SkipRow; - CreateStandardWriter(); - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - TString expectedOutput = - "Value1A\tValue1B\tValue1C\n" - "Value3A\tValue3B\tValue3C\n"; - EXPECT_EQ(expectedOutput, OutputStream_.Str()); - OutputStream_.Clear(); - } - - { - Config_->MissingValueMode = EMissingSchemafulDsvValueMode::Fail; - CreateStandardWriter(); - EXPECT_EQ(false, Writer_->Write(rows)); - EXPECT_THROW(Writer_->Close() - .Get() - .ThrowOnError(), std::exception); - OutputStream_.Clear(); - } - - { - Config_->MissingValueMode = EMissingSchemafulDsvValueMode::PrintSentinel; - Config_->MissingValueSentinel = "~"; - CreateStandardWriter(); - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - TString expectedOutput = - "Value1A\tValue1B\tValue1C\n" - "Value2A\t~\tValue2C\n" - "Value3A\tValue3B\tValue3C\n"; - EXPECT_EQ(expectedOutput, OutputStream_.Str()); - OutputStream_.Clear(); - } -} - -TEST_F(TSchemalessWriterForSchemafulDsvTest, NameTableExpansion) -{ - Config_->Columns = {"Column1"}; - Config_->MissingValueMode = {EMissingSchemafulDsvValueMode::PrintSentinel}; - CreateStandardWriter(); - TestNameTableExpansion(Writer_, NameTable_); -} - 
-TEST_F(TSchemalessWriterForSchemafulDsvTest, TableIndex) -{ - Config_->Columns = {"column_a", "column_b", "column_c", "column_d"}; - Config_->EnableTableIndex = true; - CreateStandardWriter(); - - TUnversionedRowBuilder row0; - row0.AddValue(MakeUnversionedInt64Value(0LL, KeyAId_)); - row0.AddValue(MakeUnversionedInt64Value(1LL, KeyBId_)); - row0.AddValue(MakeUnversionedInt64Value(2LL, KeyCId_)); - row0.AddValue(MakeUnversionedInt64Value(3LL, KeyDId_)); - - // It's necessary to specify a column corresponding to the table index - // when enable_table_index = true. - EXPECT_EQ(false, Writer_->Write(std::vector<TUnversionedRow>{row0.GetRow()})); - - CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedInt64Value(42LL, TableIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(0LL, KeyAId_)); - row1.AddValue(MakeUnversionedInt64Value(1LL, KeyBId_)); - row1.AddValue(MakeUnversionedInt64Value(2LL, KeyCId_)); - row1.AddValue(MakeUnversionedInt64Value(3LL, KeyDId_)); - - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedInt64Value(42LL, TableIndexId_)); - row2.AddValue(MakeUnversionedInt64Value(4LL, KeyAId_)); - row2.AddValue(MakeUnversionedInt64Value(5LL, KeyBId_)); - row2.AddValue(MakeUnversionedInt64Value(6LL, KeyCId_)); - row2.AddValue(MakeUnversionedInt64Value(7LL, KeyDId_)); - - EXPECT_EQ(true, Writer_->Write(std::vector<TUnversionedRow>{row1.GetRow(), row2.GetRow()})); - - TUnversionedRowBuilder row3; - row3.AddValue(MakeUnversionedInt64Value(23LL, TableIndexId_)); - row3.AddValue(MakeUnversionedUint64Value(8LL, KeyAId_)); - row3.AddValue(MakeUnversionedUint64Value(9LL, KeyBId_)); - row3.AddValue(MakeUnversionedUint64Value(10LL, KeyCId_)); - row3.AddValue(MakeUnversionedUint64Value(11ULL, KeyDId_)); - - EXPECT_EQ(true, Writer_->Write(std::vector<TUnversionedRow>{row3.GetRow()})); - - Writer_->Close() - .Get() - .ThrowOnError(); - TString expectedOutput = - "42\t0\t1\t2\t3\n" - "42\t4\t5\t6\t7\n" - "23\t8\t9\t10\t11\n"; 
- EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - - -TEST_F(TSchemalessWriterForSchemafulDsvTest, ValidateDuplicateNames) -{ - Config_->Columns = {"column_a", "column_b", "column_a"}; - Config_->EnableTableIndex = true; - EXPECT_THROW(CreateStandardWriter(), TErrorException); -} - -TEST_F(TSchemalessWriterForSchemafulDsvTest, ColumnsHeader) -{ - Config_->Columns = {"column_b", "column_c", "column_a"}; - Config_->EnableColumnNamesHeader = true; - CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("value_a", KeyAId_)); - row1.AddValue(MakeUnversionedInt64Value(-42, KeyBId_)); - row1.AddValue(MakeUnversionedBooleanValue(true, KeyCId_)); - std::vector<TUnversionedRow> rows = {row1.GetRow()}; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString expectedOutput = - "column_b\tcolumn_c\tcolumn_a\n" - "-42\ttrue\tvalue_a\n"; - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NFormats diff --git a/yt/yt/client/unittests/skiff_format_ut.cpp b/yt/yt/client/unittests/skiff_format_ut.cpp deleted file mode 100644 index 4878b7f673..0000000000 --- a/yt/yt/client/unittests/skiff_format_ut.cpp +++ /dev/null @@ -1,3006 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include "logical_type_shortcuts.h" -#include "value_examples.h" -#include "row_helpers.h" -#include "yson_helpers.h" - -#include <yt/yt/client/formats/config.h> -#include <yt/yt/client/formats/parser.h> -#include <yt/yt/client/formats/skiff_parser.h> -#include <yt/yt/client/formats/skiff_writer.h> -#include <yt/yt/client/formats/format.h> -#include <yt/yt/client/table_client/name_table.h> -#include <yt/yt/client/table_client/validate_logical_type.h> - -#include <yt/yt/library/named_value/named_value.h> -#include <yt/yt/library/skiff_ext/schema_match.h> - -#include 
<yt/yt/core/yson/string.h> -#include <yt/yt/core/ytree/convert.h> -#include <yt/yt/core/ytree/fluent.h> -#include <yt/yt/core/ytree/tree_visitor.h> - -#include <library/cpp/skiff/skiff.h> -#include <library/cpp/skiff/skiff_schema.h> - -#include <util/stream/null.h> -#include <util/string/hex.h> - -namespace NYT { - -namespace { - -using namespace NFormats; -using namespace NNamedValue; -using namespace NSkiff; -using namespace NSkiffExt; -using namespace NTableClient; -using namespace NYTree; -using namespace NYson; - -//////////////////////////////////////////////////////////////////////////////// - -TString ConvertToSkiffSchemaShortDebugString(INodePtr node) -{ - auto skiffFormatConfig = ConvertTo<TSkiffFormatConfigPtr>(std::move(node)); - auto skiffSchemas = ParseSkiffSchemas(skiffFormatConfig->SkiffSchemaRegistry, skiffFormatConfig->TableSkiffSchemas); - TStringStream result; - result << '{'; - for (const auto& schema : skiffSchemas) { - result << GetShortDebugString(schema); - result << ','; - } - result << '}'; - return result.Str(); -} - -//////////////////////////////////////////////////////////////////////////////// - -TString ConvertToYsonTextStringStable(const INodePtr& node) -{ - TStringStream out; - TYsonWriter writer(&out, EYsonFormat::Text); - VisitTree(node, &writer, true, TAttributeFilter()); - writer.Flush(); - return out.Str(); -} - -TTableSchemaPtr CreateSingleValueTableSchema(const TLogicalTypePtr& logicalType) -{ - std::vector<TColumnSchema> columns; - if (logicalType) { - columns.emplace_back("value", logicalType); - - } - auto strict = static_cast<bool>(logicalType); - return New<TTableSchema>(columns, strict); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TSkiffSchemaParse, TestAllowedTypes) -{ - EXPECT_EQ( - "{uint64,}", - - ConvertToSkiffSchemaShortDebugString( - BuildYsonNodeFluently() - .BeginMap() - .Item("table_skiff_schemas") - .BeginList() - .Item() - .BeginMap() - .Item("wire_type") 
- .Value("uint64") - .EndMap() - .EndList() - .EndMap())); - - EXPECT_EQ( - "{string32,}", - - ConvertToSkiffSchemaShortDebugString( - BuildYsonNodeFluently() - .BeginMap() - .Item("table_skiff_schemas") - .BeginList() - .Item() - .BeginMap() - .Item("wire_type") - .Value("string32") - .EndMap() - .EndList() - .EndMap())); - - EXPECT_EQ( - "{variant8<string32;int64;>,}", - - ConvertToSkiffSchemaShortDebugString( - BuildYsonNodeFluently() - .BeginMap() - .Item("table_skiff_schemas") - .BeginList() - .Item() - .BeginMap() - .Item("wire_type") - .Value("variant8") - .Item("children") - .BeginList() - .Item() - .BeginMap() - .Item("wire_type") - .Value("string32") - .EndMap() - .Item() - .BeginMap() - .Item("wire_type") - .Value("int64") - .EndMap() - .EndList() - .EndMap() - .EndList() - .EndMap())); - - EXPECT_EQ( - "{variant8<int64;string32;>,}", - - ConvertToSkiffSchemaShortDebugString( - BuildYsonNodeFluently() - .BeginMap() - .Item("skiff_schema_registry") - .BeginMap() - .Item("item1") - .BeginMap() - .Item("wire_type") - .Value("int64") - .EndMap() - .Item("item2") - .BeginMap() - .Item("wire_type") - .Value("string32") - .EndMap() - .EndMap() - .Item("table_skiff_schemas") - .BeginList() - .Item() - .BeginMap() - .Item("wire_type") - .Value("variant8") - .Item("children") - .BeginList() - .Item().Value("$item1") - .Item().Value("$item2") - .EndList() - .EndMap() - .EndList() - .EndMap())); -} - -TEST(TSkiffSchemaParse, TestRecursiveTypesAreDisallowed) -{ - try { - ConvertToSkiffSchemaShortDebugString( - BuildYsonNodeFluently() - .BeginMap() - .Item("skiff_schema_registry") - .BeginMap() - .Item("item1") - .BeginMap() - .Item("wire_type") - .Value("variant8") - .Item("children") - .BeginList() - .Item().Value("$item1") - .EndList() - .EndMap() - .EndMap() - .Item("table_skiff_schemas") - .BeginList() - .Item().Value("$item1") - .EndList() - .EndMap()); - ADD_FAILURE(); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("recursive 
types are forbidden")); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TSkiffSchemaDescription, TestDescriptionDerivation) -{ - auto schema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Uint64), - })->SetName("Bar"), - }); - - auto tableDescriptionList = CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); - EXPECT_EQ(std::ssize(tableDescriptionList), 1); - EXPECT_EQ(tableDescriptionList[0].HasOtherColumns, false); - EXPECT_EQ(tableDescriptionList[0].SparseFieldDescriptionList.empty(), true); - - auto denseFieldDescriptionList = tableDescriptionList[0].DenseFieldDescriptionList; - EXPECT_EQ(std::ssize(denseFieldDescriptionList), 2); - - EXPECT_EQ(denseFieldDescriptionList[0].Name(), "Foo"); - EXPECT_EQ(denseFieldDescriptionList[0].ValidatedSimplify(), EWireType::Uint64); -} - -TEST(TSkiffSchemaDescription, TestKeySwitchColumn) -{ - { - auto schema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$key_switch"), - }); - - auto tableDescriptionList = CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); - EXPECT_EQ(std::ssize(tableDescriptionList), 1); - EXPECT_EQ(tableDescriptionList[0].KeySwitchFieldIndex, std::optional<size_t>(1)); - } - { - auto schema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("$key_switch"), - }); - - try { - auto tableDescriptionList = CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); - ADD_FAILURE(); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("Column \"$key_switch\" has unexpected Skiff type")); - } - } -} - -TEST(TSkiffSchemaDescription, TestDisallowEmptyNames) -{ - auto schema = CreateTupleSchema({ - 
CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), - CreateSimpleTypeSchema(EWireType::Int64)->SetName(""), - }); - - try { - CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); - ADD_FAILURE(); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("must have a name")); - } -} - -TEST(TSkiffSchemaDescription, TestWrongRowType) -{ - auto schema = CreateRepeatedVariant16Schema({ - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Bar"), - }); - - try { - CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); - ADD_FAILURE(); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("Invalid wire type for table row")); - } -} - -TEST(TSkiffSchemaDescription, TestOtherColumnsOk) -{ - auto schema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Bar"), - CreateSimpleTypeSchema(EWireType::Yson32)->SetName("$other_columns"), - }); - - auto tableDescriptionList = CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); - ASSERT_EQ(std::ssize(tableDescriptionList), 1); - ASSERT_EQ(tableDescriptionList[0].HasOtherColumns, true); -} - -TEST(TSkiffSchemaDescription, TestOtherColumnsWrongType) -{ - auto schema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Bar"), - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("$other_columns"), - }); - - try { - CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); - ADD_FAILURE(); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("Invalid wire type for column \"$other_columns\"")); - } -} - -TEST(TSkiffSchemaDescription, TestOtherColumnsWrongPlace) -{ - auto schema = CreateTupleSchema({ - 
CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("$other_columns"), - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Bar"), - }); - - try { - CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); - ADD_FAILURE(); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("Invalid placement of special column \"$other_columns\"")); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -ISchemalessFormatWriterPtr CreateSkiffWriter( - std::shared_ptr<TSkiffSchema> skiffSchema, - TNameTablePtr nameTable, - IOutputStream* outputStream, - const std::vector<TTableSchemaPtr>& tableSchemaList, - int keyColumnCount = 0, - bool enableEndOfStream = false) -{ - auto controlAttributesConfig = New<TControlAttributesConfig>(); - controlAttributesConfig->EnableKeySwitch = (keyColumnCount > 0); - controlAttributesConfig->EnableEndOfStream = enableEndOfStream; - return CreateWriterForSkiff( - {std::move(skiffSchema)}, - std::move(nameTable), - tableSchemaList, - NConcurrency::CreateAsyncAdapter(outputStream), - false, - controlAttributesConfig, - keyColumnCount); -} - -TString TableToSkiff( - const TLogicalTypePtr& logicalType, - const std::shared_ptr<TSkiffSchema>& typeSchema, - const TNamedValue::TValue& value) -{ - auto schema = CreateSingleValueTableSchema(logicalType); - auto skiffSchema = CreateTupleSchema({ - typeSchema->SetName("value") - }); - - auto nameTable = New<TNameTable>(); - - TStringStream resultStream; - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {schema}); - - writer->Write({ - MakeRow(nameTable, { - {"value", value} - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - - auto result = resultStream.Str(); - if (!TStringBuf(result).StartsWith(TString(2, '\0'))) { - THROW_ERROR_EXCEPTION("Expected skiff value to start with \\x00\\x00, but prefix is %Qv", - 
EscapeC(result.substr(0, 2))); - } - - return result.substr(2); -} - -TNamedValue::TValue SkiffToTable( - const TLogicalTypePtr& logicalType, - const std::shared_ptr<TSkiffSchema>& typeSchema, - const TString& skiffValue) -{ - auto schema = CreateSingleValueTableSchema(logicalType); - auto skiffSchema = CreateTupleSchema({ - typeSchema->SetName("value") - }); - auto nameTable = New<TNameTable>(); - - TCollectingValueConsumer rowCollector(schema); - auto parser = CreateParserForSkiff(skiffSchema, &rowCollector); - parser->Read(TString(2, 0)); - parser->Read(skiffValue); - parser->Finish(); - - if (rowCollector.Size() != 1) { - THROW_ERROR_EXCEPTION("Expected 1 row collected, actual %v", - rowCollector.Size()); - } - auto value = rowCollector.GetRowValue(0, "value"); - return TNamedValue::ExtractValue(value); -} - -#define CHECK_BIDIRECTIONAL_CONVERSION(logicalTypeArg, skiffSchemaArg, tableValueArg, hexSkiffArg) \ - do { \ - try { \ - TLogicalTypePtr logicalType = (logicalTypeArg); \ - std::shared_ptr<TSkiffSchema> skiffSchema = (skiffSchemaArg); \ - TNamedValue::TValue tableValue = (tableValueArg); \ - TString hexSkiff = (hexSkiffArg); \ - auto nameTable = New<TNameTable>(); \ - auto actualSkiff = TableToSkiff(logicalType, skiffSchema, tableValue); \ - EXPECT_EQ(HexEncode(actualSkiff), hexSkiff); \ - auto actualValue = SkiffToTable(logicalType, skiffSchema, HexDecode(hexSkiff)); \ - EXPECT_EQ(actualValue, tableValue); \ - } catch (const std::exception& ex) { \ - ADD_FAILURE() << "unexpected exception: " << ex.what(); \ - } \ - } while (0) - -//////////////////////////////////////////////////////////////////////////////// - -void TestAllWireTypes(bool useSchema) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("int64"), - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("uint64"), - CreateSimpleTypeSchema(EWireType::Double)->SetName("double_1"), - CreateSimpleTypeSchema(EWireType::Double)->SetName("double_2"), - 
CreateSimpleTypeSchema(EWireType::Boolean)->SetName("boolean"), - CreateSimpleTypeSchema(EWireType::String32)->SetName("string32"), - CreateSimpleTypeSchema(EWireType::Nothing)->SetName("null"), - - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - })->SetName("opt_int64"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Uint64), - })->SetName("opt_uint64"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Double), - })->SetName("opt_double_1"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Double), - })->SetName("opt_double_2"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Boolean), - })->SetName("opt_boolean"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::String32), - })->SetName("opt_string32"), - }); - std::vector<TTableSchemaPtr> tableSchemas; - if (useSchema) { - tableSchemas.push_back(New<TTableSchema>(std::vector{ - TColumnSchema("int64", EValueType::Int64), - TColumnSchema("uint64", EValueType::Uint64), - TColumnSchema("double_1", EValueType::Double), - TColumnSchema("double_2", ESimpleLogicalValueType::Float), - TColumnSchema("boolean", EValueType::Boolean), - TColumnSchema("string32", EValueType::String), - TColumnSchema("null", EValueType::Null), - TColumnSchema("opt_int64", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))), - TColumnSchema("opt_uint64", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Uint64))), - TColumnSchema("opt_double_1", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Double))), - TColumnSchema("opt_double_2", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Float))), - TColumnSchema("opt_boolean", 
OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Boolean))), - TColumnSchema("opt_string32", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))), - })); - } else { - tableSchemas.push_back(New<TTableSchema>()); - } - auto nameTable = New<TNameTable>(); - TString result; - { - TStringOutput resultStream(result); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, tableSchemas); - - writer->Write({ - MakeRow(nameTable, { - {"int64", -1}, - {"uint64", 2u}, - {"double_1", 3.0}, - {"double_2", 3.0}, - {"boolean", true}, - {"string32", "four"}, - {"null", nullptr}, - - {"opt_int64", -5}, - {"opt_uint64", 6u}, - {"opt_double_1", 7.0}, - {"opt_double_2", 7.0}, - {"opt_boolean", false}, - {"opt_string32", "eight"}, - {TableIndexColumnName, 0}, - }).Get(), - }); - writer->Write({ - MakeRow(nameTable, { - {"int64", -9}, - {"uint64", 10u}, - {"double_1", 11.0}, - {"double_2", 11.0}, - {"boolean", false}, - {"string32", "twelve"}, - {"null", nullptr}, - - {"opt_int64", nullptr}, - {"opt_uint64", nullptr}, - {"opt_double_1", nullptr}, - {"opt_double_2", nullptr}, - {"opt_boolean", nullptr}, - {"opt_string32", nullptr}, - {TableIndexColumnName, 0}, - }).Get() - }); - - writer->Close() - .Get() - .ThrowOnError(); - } - - TStringInput resultInput(result); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), -1); - ASSERT_EQ(checkedSkiffParser.ParseUint64(), 2u); - // double_1 - ASSERT_EQ(checkedSkiffParser.ParseDouble(), 3.0); - // double_2 - ASSERT_EQ(checkedSkiffParser.ParseDouble(), 3.0); - ASSERT_EQ(checkedSkiffParser.ParseBoolean(), true); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "four"); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), -5); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - 
ASSERT_EQ(checkedSkiffParser.ParseUint64(), 6u); - - // double_1 - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseDouble(), 7.0); - - // double_2 - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseDouble(), 7.0); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "eight"); - - // row 1 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), -9); - ASSERT_EQ(checkedSkiffParser.ParseUint64(), 10u); - // double_1 - ASSERT_EQ(checkedSkiffParser.ParseDouble(), 11.0); - // double_2 - ASSERT_EQ(checkedSkiffParser.ParseDouble(), 11.0); - ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "twelve"); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - // double_1 - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - // double_2 - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - - // end - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); -} - -TEST(TSkiffWriter, TestAllWireTypesNoSchema) -{ - TestAllWireTypes(false); -} - -TEST(TSkiffWriter, TestAllWireTypesWithSchema) -{ - TestAllWireTypes(true); -} - -class TSkiffYsonWireTypeP - : public ::testing::TestWithParam<std::tuple< - TLogicalTypePtr, - TNamedValue::TValue, - TString - >> -{ -public: - static std::vector<ParamType> GetCases() - { - using namespace NLogicalTypeShortcuts; - std::vector<ParamType> result; - - for (const auto& example : GetPrimitiveValueExamples()) { - result.emplace_back(example.LogicalType, example.Value, example.PrettyYson); - 
result.emplace_back(nullptr, example.Value, example.PrettyYson); - } - - for (const auto type : TEnumTraits<ESimpleLogicalValueType>::GetDomainValues()) { - auto logicalType = OptionalLogicalType(SimpleLogicalType(type)); - if (IsV3Composite(logicalType)) { - // Optional<Null> is not v1 type - continue; - } - result.emplace_back(logicalType, nullptr, "#"); - } - return result; - } - - static const std::vector<ParamType> Cases; -}; - -const std::vector<TSkiffYsonWireTypeP::ParamType> TSkiffYsonWireTypeP::Cases = TSkiffYsonWireTypeP::GetCases(); - -INSTANTIATE_TEST_SUITE_P( - Cases, - TSkiffYsonWireTypeP, - ::testing::ValuesIn(TSkiffYsonWireTypeP::Cases)); - -TEST_P(TSkiffYsonWireTypeP, Test) -{ - const auto& [logicalType, value, expectedYson] = GetParam(); - TTableSchemaPtr tableSchema; - if (logicalType) { - tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ - TColumnSchema("column", logicalType), - }); - } else { - tableSchema = New<TTableSchema>(); - } - auto skiffTableSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Yson32)->SetName("column"), - }); - auto nameTable = New<TNameTable>(); - TStringStream actualSkiffDataStream; - auto writer = CreateSkiffWriter(skiffTableSchema, nameTable, &actualSkiffDataStream, {tableSchema}); - writer->Write({ - MakeRow(nameTable, {{"column", value}}) - }); - writer->Close() - .Get() - .ThrowOnError(); - - auto actualSkiffData = actualSkiffDataStream.Str(); - { - TMemoryInput in(actualSkiffData); - TCheckedSkiffParser parser(CreateVariant16Schema({skiffTableSchema}), &in); - EXPECT_EQ(parser.ParseVariant16Tag(), 0); - auto actualYson = parser.ParseYson32(); - parser.ValidateFinished(); - - EXPECT_EQ(CanonizeYson(actualYson), CanonizeYson(expectedYson)); - } - - TCollectingValueConsumer rowCollector(nameTable); - auto parser = CreateParserForSkiff(skiffTableSchema, tableSchema, &rowCollector); - parser->Read(actualSkiffDataStream.Str()); - parser->Finish(); - auto actualValue = rowCollector.GetRowValue(0, 
"column"); - EXPECT_EQ(actualValue, TNamedValue("column", value).ToUnversionedValue(nameTable)); -} - -TEST(TSkiffWriter, TestYsonWireType) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson32"), - - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Yson32), - })->SetName("opt_yson32"), - }); - auto nameTable = New<TNameTable>(); - TString result; - { - TStringOutput resultStream(result); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); - - // Row 0 (Null) - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - - {"yson32", nullptr}, - {"opt_yson32", nullptr}, - }).Get(), - }); - - // Row 1 (Int64) - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - - {"yson32", -5}, - {"opt_yson32", -6}, - }).Get(), - }); - - // Row 2 (Uint64) - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - - {"yson32", 42u}, - {"opt_yson32", 43u}, - }).Get(), - }); - - // Row 3 ((Double) - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - - {"yson32", 2.7182818}, - {"opt_yson32", 3.1415926}, - }).Get(), - }); - - // Row 4 ((Boolean) - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - - {"yson32", true}, - {"opt_yson32", false}, - }).Get(), - }); - - // Row 5 ((String) - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - - {"yson32", "Yin"}, - {"opt_yson32", "Yang"}, - }).Get(), - }); - - // Row 6 ((Any) - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - - {"yson32", EValueType::Any, "{foo=bar;}"}, - {"opt_yson32", EValueType::Any, "{bar=baz;}"}, - }).Get(), - }); - - // Row 7 ((missing optional values) - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - }).Get(), - }); - - writer->Close() - .Get() - .ThrowOnError(); - } - - TStringInput resultInput(result); - TCheckedSkiffParser 
checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - auto parseYson = [] (TCheckedSkiffParser* parser) { - auto yson = TString{parser->ParseYson32()}; - return ConvertToNode(TYsonString(yson)); - }; - - // Row 0 (Null) - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(parseYson(&checkedSkiffParser)->GetType(), ENodeType::Entity); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - - // Row 1 (Int64) - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(parseYson(&checkedSkiffParser)->AsInt64()->GetValue(), -5); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(parseYson(&checkedSkiffParser)->AsInt64()->GetValue(), -6); - - // Row 2 (Uint64) - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(parseYson(&checkedSkiffParser)->AsUint64()->GetValue(), 42u); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(parseYson(&checkedSkiffParser)->AsUint64()->GetValue(), 43u); - - // Row 3 (Double) - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(parseYson(&checkedSkiffParser)->AsDouble()->GetValue(), 2.7182818); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(parseYson(&checkedSkiffParser)->AsDouble()->GetValue(), 3.1415926); - - // Row 4 (Boolean) - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(parseYson(&checkedSkiffParser)->AsBoolean()->GetValue(), true); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(parseYson(&checkedSkiffParser)->AsBoolean()->GetValue(), false); - - // Row 5 (String) - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(parseYson(&checkedSkiffParser)->AsString()->GetValue(), "Yin"); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(parseYson(&checkedSkiffParser)->AsString()->GetValue(), "Yang"); - - // Row 6 (Any) - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - 
ASSERT_EQ(parseYson(&checkedSkiffParser)->AsMap()->GetChildOrThrow("foo")->AsString()->GetValue(), "bar"); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(parseYson(&checkedSkiffParser)->AsMap()->GetChildOrThrow("bar")->AsString()->GetValue(), "baz"); - - // Row 7 (Null) - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(parseYson(&checkedSkiffParser)->GetType(), ENodeType::Entity); - - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - - // end - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); -} - -class TSkiffFormatSmallIntP -: public ::testing::TestWithParam<std::tuple< - std::shared_ptr<TSkiffSchema>, - TLogicalTypePtr, - TNamedValue::TValue, - TString ->> -{ -public: - static std::vector<ParamType> GetCases() - { - using namespace NLogicalTypeShortcuts; - - std::vector<ParamType> result; - - auto addSimpleCase = [&result] ( - EWireType wireType, - const TLogicalTypePtr& logicalType, - auto value, - TStringBuf skiffValue) - { - auto simpleSkiffSchema = CreateSimpleTypeSchema(wireType); - auto simpleSkiffData = TString(2, 0) + skiffValue; - result.emplace_back(simpleSkiffSchema, logicalType, value, simpleSkiffData); - }; - - auto addListCase = [&result] ( - EWireType wireType, - const TLogicalTypePtr& logicalType, - auto value, - TStringBuf skiffValue) - { - auto listSkiffSchema = CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(wireType)}); - auto listSkiffData = TString(3, 0) + skiffValue + TString(1, '\xff'); - auto listValue = TNamedValue::TValue{ - TNamedValue::TComposite{ - BuildYsonStringFluently() - .BeginList() - .Item().Value(value) - .EndList().ToString() - } - }; - result.emplace_back(listSkiffSchema, List(logicalType), listValue, listSkiffData); - }; - - auto addSimpleAndListCases = [&] ( - EWireType wireType, - const TLogicalTypePtr& logicalType, - auto value, - TStringBuf skiffValue) - { - addSimpleCase(wireType, logicalType, value, skiffValue); - 
addListCase(wireType, logicalType, value, skiffValue); - }; - - auto addMultiCase = [&] (EWireType wireType, auto value, TStringBuf skiffValue) { - auto add = [&] (const TLogicalTypePtr& logicalType) { - addSimpleAndListCases(wireType, logicalType, value, skiffValue); - }; - addSimpleCase(wireType, Yson(), value, skiffValue); - - using T = std::decay_t<decltype(value)>; - static_assert(std::is_integral_v<T>); - if constexpr (std::is_signed_v<T>) { - if (std::numeric_limits<i8>::min() <= value && value <= std::numeric_limits<i8>::max()) { - add(Int8()); - } - if (std::numeric_limits<i16>::min() <= value && value <= std::numeric_limits<i16>::max()) { - add(Int16()); - } - if (std::numeric_limits<i32>::min() <= value && value <= std::numeric_limits<i32>::max()) { - add(Int32()); - } - add(Int64()); - } else { - if (value <= std::numeric_limits<ui8>::max()) { - add(Uint8()); - } - if (value <= std::numeric_limits<ui16>::max()) { - add(Uint16()); - } - if (value <= std::numeric_limits<ui32>::max()) { - add(Uint32()); - } - add(Uint64()); - } - }; - addMultiCase(EWireType::Int8, 0, TStringBuf("\x00"sv)); - addMultiCase(EWireType::Int8, 42, TStringBuf("*")); - addMultiCase(EWireType::Int8, -42, TStringBuf("\xd6"sv)); - addMultiCase(EWireType::Int8, 127, TStringBuf("\x7f"sv)); - addMultiCase(EWireType::Int8, -128, TStringBuf("\x80"sv)); - - addMultiCase(EWireType::Int16, 0, TStringBuf("\x00\x00"sv)); - addMultiCase(EWireType::Int16, 42, TStringBuf("\x2a\x00"sv)); - addMultiCase(EWireType::Int16, -42, TStringBuf("\xd6\xff"sv)); - addMultiCase(EWireType::Int16, 0x7fff, TStringBuf("\xff\x7f"sv)); - addMultiCase(EWireType::Int16, -0x8000, TStringBuf("\x00\x80"sv)); - - addMultiCase(EWireType::Int32, 0, TStringBuf("\x00\x00\x00\x00"sv)); - addMultiCase(EWireType::Int32, 42, TStringBuf("\x2a\x00\x00\x00"sv)); - addMultiCase(EWireType::Int32, -42, TStringBuf("\xd6\xff\xff\xff"sv)); - addMultiCase(EWireType::Int32, 0x7fffffff, TStringBuf("\xff\xff\xff\x7f"sv)); - 
addMultiCase(EWireType::Int32, -0x80000000l, TStringBuf("\x00\x00\x00\x80"sv)); - - addMultiCase(EWireType::Uint8, 0ull, TStringBuf("\x00"sv)); - addMultiCase(EWireType::Uint8, 42ull, TStringBuf("*")); - addMultiCase(EWireType::Uint8, 255ull, TStringBuf("\xff"sv)); - - addMultiCase(EWireType::Uint16, 0ull, TStringBuf("\x00\x00"sv)); - addMultiCase(EWireType::Uint16, 42ull, TStringBuf("\x2a\x00"sv)); - addMultiCase(EWireType::Uint16, 0xFFFFull, TStringBuf("\xff\xff"sv)); - - addMultiCase(EWireType::Uint32, 0ull, TStringBuf("\x00\x00\x00\x00"sv)); - addMultiCase(EWireType::Uint32, 42ull, TStringBuf("\x2a\x00\x00\x00"sv)); - addMultiCase(EWireType::Uint32, 0xFFFFFFFFull, TStringBuf("\xff\xff\xff\xff"sv)); - - addSimpleAndListCases(EWireType::Uint16, Date(), 0ull, TStringBuf("\x00\x00"sv)); - addSimpleAndListCases(EWireType::Uint16, Date(), 42ull, TStringBuf("\x2a\x00"sv)); - addSimpleAndListCases(EWireType::Uint16, Date(), DateUpperBound - 1, TStringBuf("\x08\xc2"sv)); - - addSimpleAndListCases(EWireType::Uint32, Datetime(), 0ull, TStringBuf("\x00\x00\x00\x00"sv)); - addSimpleAndListCases(EWireType::Uint32, Datetime(), 42ull, TStringBuf("\x2a\x00\x00\x00"sv)); - addSimpleAndListCases(EWireType::Uint32, Datetime(), DatetimeUpperBound - 1, TStringBuf("\x7f\xdd\xce\xff"sv)); - - return result; - } - - static const std::vector<ParamType> Cases; -}; - -const std::vector<TSkiffFormatSmallIntP::ParamType> TSkiffFormatSmallIntP::Cases = TSkiffFormatSmallIntP::GetCases(); - -INSTANTIATE_TEST_SUITE_P( - Cases, - TSkiffFormatSmallIntP, - ::testing::ValuesIn(TSkiffFormatSmallIntP::Cases)); - -TEST_P(TSkiffFormatSmallIntP, Test) -{ - const auto& [skiffValueSchema, logicalType, value, expectedSkiffData] = GetParam(); - - const auto nameTable = New<TNameTable>(); - - TStringStream actualSkiffData; - auto skiffTableSchema = CreateTupleSchema({ - skiffValueSchema->SetName("column") - }); - auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ - TColumnSchema("column", 
logicalType), - }); - auto writer = CreateSkiffWriter(skiffTableSchema, nameTable, &actualSkiffData, {tableSchema}); - writer->Write({ - MakeRow(nameTable, {{"column", value}}) - }); - writer->Close() - .Get() - .ThrowOnError(); - EXPECT_EQ(actualSkiffData.Str(), expectedSkiffData); - - TCollectingValueConsumer rowCollector(nameTable); - auto parser = CreateParserForSkiff(skiffTableSchema, tableSchema, &rowCollector); - parser->Read(expectedSkiffData); - parser->Finish(); - auto actualValue = rowCollector.GetRowValue(0, "column"); - - EXPECT_EQ(actualValue, TNamedValue("common", value).ToUnversionedValue(nameTable)); -} - -TEST(TSkiffWriter, TestBadSmallIntegers) -{ - using namespace NLogicalTypeShortcuts; - auto writeSkiffValue = [] ( - std::shared_ptr<TSkiffSchema>&& typeSchema, - TLogicalTypePtr logicalType, - TNamedValue::TValue value) - { - TStringStream result; - auto skiffSchema = CreateTupleSchema({ - typeSchema->SetName("column") - }); - auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ - TColumnSchema("column", std::move(logicalType)), - }); - auto nameTable = New<TNameTable>(); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &result, {tableSchema}); - writer->Write({ - MakeRow(nameTable, {{"column", std::move(value)}}) - }); - writer->Close() - .Get() - .ThrowOnError(); - return result.Str(); - }; - - EXPECT_THROW_WITH_SUBSTRING( - writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int8), Int64(), 128), - "is out of range for possible values"); - EXPECT_THROW_WITH_SUBSTRING( - writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int8), Int64(), -129), - "is out of range for possible values"); - - EXPECT_THROW_WITH_SUBSTRING( - writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int16), Int64(), 0x8000), - "is out of range for possible values"); - EXPECT_THROW_WITH_SUBSTRING( - writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int16), Int64(), -0x8001), - "is out of range for possible values"); - - EXPECT_THROW_WITH_SUBSTRING( - 
writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int32), Int64(), 0x80000000ll), - "is out of range for possible values"); - EXPECT_THROW_WITH_SUBSTRING( - writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int32), Int64(), -0x80000001ll), - "is out of range for possible values"); - - EXPECT_THROW_WITH_SUBSTRING( - writeSkiffValue(CreateSimpleTypeSchema(EWireType::Uint8), Uint64(), 256ull), - "is out of range for possible values"); - - EXPECT_THROW_WITH_SUBSTRING( - writeSkiffValue(CreateSimpleTypeSchema(EWireType::Uint16), Uint64(), 0x1FFFFull), - "is out of range for possible values"); - - EXPECT_THROW_WITH_SUBSTRING( - writeSkiffValue(CreateSimpleTypeSchema(EWireType::Uint32), Uint64(), 0x100000000ull), - "is out of range for possible values"); -} - -class TSkiffFormatUuidTestP : public ::testing::TestWithParam<std::tuple< - TNameTablePtr, - TTableSchemaPtr, - std::shared_ptr<TSkiffSchema>, - std::vector<TUnversionedOwningRow>, - TString ->> -{ -public: - static std::vector<ParamType> GetCases() - { - using namespace NLogicalTypeShortcuts; - - auto nameTable = New<TNameTable>(); - const auto stringUuidValue = TStringBuf("\xee\x1f\x37\x70" "\xb9\x93\x64\xb5" "\xe4\xdf\xe9\x03" "\x67\x5c\x30\x62"); - const auto uint128UuidValue = TStringBuf("\x62\x30\x5c\x67" "\x03\xe9\xdf\xe4" "\xb5\x64\x93\xb9" "\x70\x37\x1f\xee"); - - const auto requiredTableSchema = New<TTableSchema>(std::vector<TColumnSchema>{TColumnSchema("uuid", Uuid())}); - const auto optionalTableSchema = New<TTableSchema>(std::vector<TColumnSchema>{TColumnSchema("uuid", Optional(Uuid()))}); - - const auto optionalUint128SkiffSchema = CreateTupleSchema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Uint128), - })->SetName("uuid"), - }); - - const auto requiredUint128SkiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Uint128)->SetName("uuid"), - }); - - const auto optionalStringSkiffSchema = CreateTupleSchema({ - 
CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::String32), - })->SetName("uuid"), - }); - - const auto requiredStringSkiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("uuid"), - }); - - std::vector<ParamType> result; - - result.emplace_back( - nameTable, - requiredTableSchema, - requiredUint128SkiffSchema, - std::vector<TUnversionedOwningRow>{ - MakeRow(nameTable, {{"uuid", stringUuidValue}}), - }, - TString(2, '\0') + uint128UuidValue); - - result.emplace_back( - nameTable, - optionalTableSchema, - requiredUint128SkiffSchema, - std::vector<TUnversionedOwningRow>{ - MakeRow(nameTable, {{"uuid", stringUuidValue}}), - }, - TString(2, '\0') + uint128UuidValue); - - result.emplace_back( - nameTable, - requiredTableSchema, - optionalUint128SkiffSchema, - std::vector<TUnversionedOwningRow>{ - MakeRow(nameTable, {{"uuid", stringUuidValue}}), - }, - TString(2, '\0') + "\1" + uint128UuidValue); - - result.emplace_back( - nameTable, - optionalTableSchema, - optionalUint128SkiffSchema, - std::vector<TUnversionedOwningRow>{ - MakeRow(nameTable, {{"uuid", stringUuidValue}}), - }, - TString(2, '\0') + "\1" + uint128UuidValue); - - const TString uuidLen = TString(TStringBuf("\x10\x00\x00\x00"sv)); - - result.emplace_back( - nameTable, - requiredTableSchema, - requiredStringSkiffSchema, - std::vector<TUnversionedOwningRow>{ - MakeRow(nameTable, {{"uuid", stringUuidValue}}), - }, - TString(2, '\0') + uuidLen + stringUuidValue); - - result.emplace_back( - nameTable, - optionalTableSchema, - requiredStringSkiffSchema, - std::vector<TUnversionedOwningRow>{ - MakeRow(nameTable, {{"uuid", stringUuidValue}}), - }, - TString(2, '\0') + uuidLen + stringUuidValue); - - result.emplace_back( - nameTable, - requiredTableSchema, - optionalStringSkiffSchema, - std::vector<TUnversionedOwningRow>{ - MakeRow(nameTable, {{"uuid", stringUuidValue}}), - }, - TString(2, '\0') + "\1" + uuidLen + 
stringUuidValue); - - result.emplace_back( - nameTable, - optionalTableSchema, - optionalStringSkiffSchema, - std::vector<TUnversionedOwningRow>{ - MakeRow(nameTable, {{"uuid", stringUuidValue}}), - }, - TString(2, '\0') + "\1" + uuidLen + stringUuidValue); - - return result; - } - - static const std::vector<ParamType> Cases; -}; - -const std::vector<TSkiffFormatUuidTestP::ParamType> TSkiffFormatUuidTestP::Cases = TSkiffFormatUuidTestP::GetCases(); - -INSTANTIATE_TEST_SUITE_P( - Cases, - TSkiffFormatUuidTestP, - ::testing::ValuesIn(TSkiffFormatUuidTestP::Cases)); - -TEST_P(TSkiffFormatUuidTestP, Test) -{ - const auto& [nameTable, tableSchema, skiffSchema, rows, skiffString] = GetParam(); - - TStringStream result; - std::vector<TUnversionedRow> nonOwningRows; - for (const auto& row : rows) { - nonOwningRows.emplace_back(row); - } - auto skiffWriter = CreateSkiffWriter(skiffSchema, nameTable, &result, {tableSchema}); - skiffWriter->Write(MakeRange(nonOwningRows)); - skiffWriter->Close().Get().ThrowOnError(); - ASSERT_EQ(result.Str(), skiffString); - - TCollectingValueConsumer rowCollector(nameTable); - auto requiredParser = CreateParserForSkiff(skiffSchema, tableSchema, &rowCollector); - requiredParser->Read(result.Str()); - requiredParser->Finish(); - ASSERT_EQ(rowCollector.GetRowList(), rows); -} - -TEST(TSkiffFormatUuidTest, TestError) -{ - using namespace NLogicalTypeShortcuts; - - auto nameTable = New<TNameTable>(); - auto tableSchema = New<TTableSchema>( - std::vector<TColumnSchema>{TColumnSchema("uuid", Optional(Uuid()))}); - - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Uint128)->SetName("uuid"), - }); - - TStringStream result; - auto skiffWriter = CreateSkiffWriter(skiffSchema, nameTable, &result, {tableSchema}); - skiffWriter->Write({ - MakeRow(nameTable, {{"uuid", nullptr}}), - }); - EXPECT_THROW_WITH_SUBSTRING(skiffWriter->Close().Get().ThrowOnError(), - "Unexpected type"); - -} - -class TSkiffWriterSingular - : public 
::testing::Test - , public ::testing::WithParamInterface<ESimpleLogicalValueType> -{}; - -INSTANTIATE_TEST_SUITE_P( - Singular, - TSkiffWriterSingular, - ::testing::Values(ESimpleLogicalValueType::Null, ESimpleLogicalValueType::Void)); - -TEST_P(TSkiffWriterSingular, TestOptionalSingular) -{ - const auto singularType = GetParam(); - - auto skiffSchema = CreateTupleSchema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Nothing), - })->SetName("opt_null"), - }); - - auto nameTable = New<TNameTable>(); - const std::vector<TTableSchemaPtr> tableSchemas = { - New<TTableSchema>(std::vector{ - TColumnSchema("opt_null", OptionalLogicalType(SimpleLogicalType(singularType))), - }), - }; - - TString result; - { - TStringOutput resultStream(result); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, tableSchemas); - // Row 0 - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"opt_null", nullptr}, - }).Get(), - }); - // Row 1 - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"opt_null", EValueType::Composite, "[#]"}, - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - } - - TStringInput resultInput(result); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); -} - -TEST(TSkiffWriter, TestRearrange) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("number"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::String32), - })->SetName("eng"), - CreateVariant8Schema({ - 
CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::String32), - })->SetName("rus"), - }); - auto nameTable = New<TNameTable>(); - TString result; - { - TStringOutput resultStream(result); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); - - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"number", 1}, - {"eng", "one"}, - {"rus", nullptr}, - }).Get() - }); - - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"eng", nullptr}, - {"number", 2}, - {"rus", "dva"}, - }).Get() - }); - - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"rus", "tri"}, - {"eng", "three"}, - {"number", 3}, - }).Get() - }); - - writer->Close() - .Get() - .ThrowOnError(); - } - - TStringInput resultInput(result); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - - // row 1 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 2); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "dva"); - - // row 2 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 3); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "three"); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "tri"); - - // end - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); -} - -TEST(TSkiffWriter, TestMissingRequiredField) -{ 
- auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("number"), - CreateSimpleTypeSchema(EWireType::String32)->SetName("eng"), - }); - auto nameTable = New<TNameTable>(); - TString result; - try { - TStringOutput resultStream(result); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); - - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"number", 1}, - }).Get() - }); - writer->Close() - .Get() - .ThrowOnError(); - ADD_FAILURE(); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("Unexpected type of \"eng\" column")); - } -} - -TEST(TSkiffWriter, TestSparse) -{ - auto skiffSchema = CreateTupleSchema({ - CreateRepeatedVariant16Schema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("int64"), - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("uint64"), - CreateSimpleTypeSchema(EWireType::String32)->SetName("string32"), - })->SetName("$sparse_columns"), - }); - - auto nameTable = New<TNameTable>(); - TString result; - TStringOutput resultStream(result); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); - - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"int64", -1}, - {"string32", "minus one"}, - }).Get(), - }); - - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"string32", "minus five"}, - {"int64", -5}, - }).Get(), - }); - - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"uint64", 42u}, - }).Get(), - }); - - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"int64", -8}, - {"uint64", nullptr}, - {"string32", nullptr}, - }).Get(), - }); - - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - }).Get(), - }); - - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(result); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), 
&resultInput); - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), -1); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 2); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "minus one"); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); - - // row 1 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 2); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "minus five"); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), -5); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); - - // row 2 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseUint64(), 42u); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); - - // row 3 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), -8); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); - - // row 4 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); - - // end - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); -} - -TEST(TSkiffWriter, TestMissingFields) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), - }); - - try { - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); - - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"unknown_column", "four"}, - }).Get(), - }); - writer->Close() - 
.Get() - .ThrowOnError(); - ADD_FAILURE(); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("Column \"unknown_column\" is not described by Skiff schema")); - } - - try { - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - auto unknownColumnId = nameTable->RegisterName("unknown_column"); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{New<TTableSchema>()}); - - ASSERT_TRUE(unknownColumnId < nameTable->GetId("value")); - - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"unknown_column", "four"}, - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - ADD_FAILURE(); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("Column \"unknown_column\" is not described by Skiff schema")); - } -} - -TEST(TSkiffWriter, TestOtherColumns) -{ - auto skiffSchema = CreateTupleSchema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64) - })->SetName("int64_column"), - CreateSimpleTypeSchema(EWireType::Yson32)->SetName("$other_columns"), - }); - - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - nameTable->RegisterName("string_column"); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); - - // Row 0. - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"string_column", "foo"}, - }).Get(), - }); - - // Row 1. - writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"int64_column", 42}, - }).Get(), - }); - // Row 2. 
- writer->Write({ - MakeRow(nameTable, { - {TableIndexColumnName, 0}, - {"other_string_column", "bar"}, - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(resultStream.Str()); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - auto parseYson = [] (TCheckedSkiffParser* parser) { - auto yson = TString{parser->ParseYson32()}; - return ConvertToYsonTextStringStable(ConvertToNode(TYsonString(yson))); - }; - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - ASSERT_EQ(parseYson(&checkedSkiffParser), "{\"string_column\"=\"foo\";}"); - - // row 1 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 42); - ASSERT_EQ(parseYson(&checkedSkiffParser), "{}"); - - // row 2 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - ASSERT_EQ(parseYson(&checkedSkiffParser), "{\"other_string_column\"=\"bar\";}"); - - // end - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); -} - -TEST(TSkiffWriter, TestKeySwitch) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$key_switch"), - }); - - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}, 1); - - writer->Write({ - // Row 0. - MakeRow(nameTable, { - {"value", "one"}, - {TableIndexColumnName, 0}, - }).Get(), - }); - // Row 1. - writer->Write({ - MakeRow(nameTable, { - {"value", "one"}, - {TableIndexColumnName, 0}, - }).Get(), - }); - // Row 2. 
- writer->Write({ - MakeRow(nameTable, { - {"value", "two"}, - {TableIndexColumnName, 0}, - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(resultStream.Str()); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - TString buf; - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); - ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); - - // row 1 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); - ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); - - // row 2 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "two"); - ASSERT_EQ(checkedSkiffParser.ParseBoolean(), true); - - // end - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); -} - -TEST(TSkiffWriter, TestEndOfStream) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), - }); - - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}, 1, true); - - // Row 0. - writer->Write({ - MakeRow(nameTable, { - {"value", "zero"}, - {TableIndexColumnName, 0}, - }).Get(), - }); - // Row 1. - writer->Write({ - MakeRow(nameTable, { - {"value", "one"}, - {TableIndexColumnName, 0}, - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(resultStream.Str()); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - TString buf; - - // Row 0. - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "zero"); - - // Row 1. 
- ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); - - // End of stream. - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0xffff); - - // The End. - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); -} - -TEST(TSkiffWriter, TestRowRangeIndex) -{ - const auto rowAndRangeIndex = CreateTupleSchema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - })->SetName("$range_index"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - })->SetName("$row_index"), - }); - - struct TRow { - int TableIndex; - std::optional<int> RangeIndex; - std::optional<int> RowIndex; - }; - auto generateUnversionedRow = [] (const TRow& row, const TNameTablePtr& nameTable) { - std::vector<TNamedValue> values = { - {TableIndexColumnName, row.TableIndex}, - }; - if (row.RangeIndex) { - values.emplace_back(RangeIndexColumnName, *row.RangeIndex); - } - if (row.RowIndex) { - values.push_back({RowIndexColumnName, *row.RowIndex}); - } - return MakeRow(nameTable, values); - }; - - auto skiffWrite = [generateUnversionedRow] (const std::vector<TRow>& rows, const std::shared_ptr<TSkiffSchema>& skiffSchema) { - std::vector<TTableSchemaPtr> tableSchemas; - { - THashSet<int> tableIndices; - for (const auto& row : rows) { - tableIndices.insert(row.TableIndex); - } - tableSchemas.assign(tableIndices.size(), New<TTableSchema>()); - } - - - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - auto writer = CreateSkiffWriter( - skiffSchema, - nameTable, - &resultStream, - tableSchemas); - - for (const auto& row : rows) { - writer->Write({generateUnversionedRow(row, nameTable)}); - } - writer->Close() - .Get() - .ThrowOnError(); - - return HexEncode(resultStream.Str()); - }; - - EXPECT_STREQ( - skiffWrite({ - {0, 0, 0}, - {0, 0, 1}, - {0, 0, 2}, - }, 
rowAndRangeIndex).data(), - - "0000" "01""00000000""00000000" "01""00000000""00000000" - "0000" "00" "00" - "0000" "00" "00" - ); - - EXPECT_STREQ( - skiffWrite({ - {0, 0, 0}, - {0, 0, 1}, - {0, 0, 3}, - }, rowAndRangeIndex).data(), - - "0000" "01""00000000""00000000" "01""00000000""00000000" - "0000" "00" "00" - "0000" "00" "01""03000000""00000000" - ); - - EXPECT_STREQ( - skiffWrite({ - {0, 0, 0}, - {0, 0, 1}, - {0, 1, 2}, - {0, 1, 3}, - }, rowAndRangeIndex).data(), - - "0000" "01""00000000""00000000" "01""00000000""00000000" - "0000" "00" "00" - "0000" "01""01000000""00000000" "01""02000000""00000000" - "0000" "00" "00" - ); - - EXPECT_THROW_WITH_SUBSTRING(skiffWrite({{0, 0, {}}}, rowAndRangeIndex), "index requested but reader did not return it"); - EXPECT_THROW_WITH_SUBSTRING(skiffWrite({{0, {}, 0}}, rowAndRangeIndex), "index requested but reader did not return it"); - - const auto rowAndRangeIndexAllowMissing = CreateTupleSchema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - CreateSimpleTypeSchema(EWireType::Nothing), - })->SetName("$range_index"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - CreateSimpleTypeSchema(EWireType::Nothing), - })->SetName("$row_index"), - }); - - EXPECT_STREQ( - skiffWrite({ - {0, 0, 0}, - {0, 0, 1}, - {0, 0, 2}, - }, rowAndRangeIndexAllowMissing).data(), - - "0000" "01""00000000""00000000" "01""00000000""00000000" - "0000" "00" "00" - "0000" "00" "00" - ); - - EXPECT_STREQ( - skiffWrite({ - {0, 0, 0}, - {0, 0, 1}, - {0, 0, 3}, - }, rowAndRangeIndexAllowMissing).data(), - - "0000" "01""00000000""00000000" "01""00000000""00000000" - "0000" "00" "00" - "0000" "00" "01""03000000""00000000" - ); - - EXPECT_STREQ( - skiffWrite({ - {0, 0, 0}, - {0, 0, 1}, - {0, 1, 2}, - {0, 1, 3}, - }, rowAndRangeIndexAllowMissing).data(), - - "0000" "01""00000000""00000000" "01""00000000""00000000" - "0000" "00" 
"00" - "0000" "01""01000000""00000000" "01""02000000""00000000" - "0000" "00" "00" - ); - - EXPECT_STREQ( - skiffWrite({ - {0, {}, {}}, - {0, {}, {}}, - {0, {}, {}}, - {0, {}, {}}, - }, rowAndRangeIndexAllowMissing).data(), - - "0000" "02" "02" - "0000" "02" "02" - "0000" "02" "02" - "0000" "02" "02" - ); - - EXPECT_STREQ( - skiffWrite({ - {0, {}, 0}, - {0, {}, 1}, - {0, {}, 3}, - {0, {}, 4}, - }, rowAndRangeIndexAllowMissing).data(), - - "0000" "02" "01""00000000""00000000" - "0000" "02" "00" - "0000" "02" "01""03000000""00000000" - "0000" "02" "00" - ); - - EXPECT_STREQ( - skiffWrite({ - {0, 0, {}}, - {0, 0, {}}, - {0, 1, {}}, - {0, 1, {}}, - }, rowAndRangeIndexAllowMissing).data(), - - "0000" "01""00000000""00000000" "02" - "0000" "00" "02" - "0000" "01""01000000""00000000" "02" - "0000" "00" "02" - ); -} - -TEST(TSkiffWriter, TestRowIndexOnlyOrRangeIndexOnly) -{ - TString columnNameList[] = { - RowIndexColumnName, - RangeIndexColumnName, - }; - - for (const auto& columnName : columnNameList) { - auto skiffSchema = CreateTupleSchema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - })->SetName(columnName), - }); - - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}, 1); - - // Row 0. 
- writer->Write({ - MakeRow(nameTable, { - {columnName, 0}, - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(resultStream.Str()); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); - - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); - } -} - -TEST(TSkiffWriter, TestComplexType) -{ - auto skiffSchema = CreateTupleSchema({ - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), - CreateRepeatedVariant8Schema({ - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("x"), - CreateSimpleTypeSchema(EWireType::Int64)->SetName("y"), - }) - })->SetName("points") - })->SetName("value"), - }); - - { - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - auto tableSchema = New<TTableSchema>(std::vector{ - TColumnSchema("value", StructLogicalType({ - {"name", SimpleLogicalType(ESimpleLogicalValueType::String)}, - { - "points", - ListLogicalType( - StructLogicalType({ - {"x", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"y", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - }) - ) - } - })), - }); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{tableSchema}); - - // Row 0. 
- writer->Write({ - MakeRow(nameTable, { - {"value", EValueType::Composite, "[foo;[[0; 1];[2;3]]]"}, - {TableIndexColumnName, 0}, - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(resultStream.Str()); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "foo"); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 2); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 3); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), EndOfSequenceTag<ui8>()); - - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); - } -} - -TEST(TSkiffWriter, TestEmptyComplexType) -{ - auto skiffSchema = CreateTupleSchema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), - CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), - }) - })->SetName("value"), - }); - - { - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - auto tableSchema = New<TTableSchema>(std::vector{ - TColumnSchema("value", OptionalLogicalType( - StructLogicalType({ - {"name", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"value", SimpleLogicalType(ESimpleLogicalValueType::String)}, - })) - ), - }); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{tableSchema}); - - // Row 0. 
- writer->Write({ - MakeRow(nameTable, { - {"value", nullptr}, - {TableIndexColumnName, 0}, - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(resultStream.Str()); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); - - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); - } -} - -TEST(TSkiffWriter, TestSparseComplexType) -{ - auto skiffSchema = CreateTupleSchema({ - CreateRepeatedVariant16Schema({ - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), - CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), - })->SetName("value"), - })->SetName("$sparse_columns"), - }); - - { - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - auto tableSchema = New<TTableSchema>(std::vector{ - TColumnSchema("value", OptionalLogicalType( - StructLogicalType({ - {"name", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"value", SimpleLogicalType(ESimpleLogicalValueType::String)}, - })) - ), - }); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{tableSchema}); - - // Row 0. 
- writer->Write({ - MakeRow(nameTable, { - {"value", EValueType::Composite, "[foo;bar;]"}, - {TableIndexColumnName, 0}, - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(resultStream.Str()); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "foo"); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "bar"); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); - - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); - } -} - -TEST(TSkiffWriter, TestSparseComplexTypeWithExtraOptional) -{ - auto skiffSchema = CreateTupleSchema({ - CreateRepeatedVariant16Schema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), - CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), - }) - })->SetName("value"), - })->SetName("$sparse_columns"), - }); - - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - auto tableSchema = New<TTableSchema>(std::vector{ - TColumnSchema("value", OptionalLogicalType( - StructLogicalType({ - {"name", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"value", SimpleLogicalType(ESimpleLogicalValueType::String)}, - })) - ), - }); - - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{tableSchema}); - - // Row 0. 
- writer->Write({ - MakeRow(nameTable, { - {"value", EValueType::Composite, "[foo;bar;]"}, - {TableIndexColumnName, 0}, - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(resultStream.Str()); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "foo"); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "bar"); - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); - - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); -} - -TEST(TSkiffWriter, TestBadWireTypeForSimpleColumn) -{ - auto skiffSchema = CreateTupleSchema({ - CreateVariant8Schema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Yson32), - }) - })->SetName("opt_yson32"), - }); - auto nameTable = New<TNameTable>(); - TStringStream resultStream; - EXPECT_THROW_WITH_SUBSTRING( - CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{New<TTableSchema>()}), - "cannot be represented with Skiff schema" - ); -} - -TEST(TSkiffWriter, TestMissingComplexColumn) -{ - auto optionalSkiffSchema = CreateTupleSchema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Int64)}), - })->SetName("opt_list"), - }); - auto requiredSkiffSchema = CreateTupleSchema({ - CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Int64)})->SetName("opt_list"), - }); - - { // Non optional Skiff schema - auto nameTable = New<TNameTable>(); - EXPECT_THROW_WITH_SUBSTRING( - CreateSkiffWriter(requiredSkiffSchema, nameTable, &Cnull, std::vector{New<TTableSchema>()}), - "cannot be represented with Skiff schema" - 
); - } - - { - auto nameTable = New<TNameTable>(); - TStringStream resultStream; - auto writer = CreateSkiffWriter(optionalSkiffSchema, nameTable, &resultStream, std::vector{New<TTableSchema>()}); - writer->Write({ - MakeRow(nameTable, { }).Get(), - MakeRow(nameTable, { - {"opt_list", nullptr}, - }).Get(), - MakeRow(nameTable, { }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - - EXPECT_EQ(HexEncode(resultStream.Str()), "0000" "00" "0000" "00" "0000" "00"); - } -} - -TEST(TSkiffWriter, TestSkippedFields) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("number"), - CreateSimpleTypeSchema(EWireType::Nothing)->SetName("string"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - })->SetName(RangeIndexColumnName), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - })->SetName(RowIndexColumnName), - CreateSimpleTypeSchema(EWireType::Double)->SetName("double"), - }); - auto tableSchema = New<TTableSchema>(std::vector{ - TColumnSchema("number", EValueType::Int64), - TColumnSchema("string", EValueType::String), - TColumnSchema("double", EValueType::Double), - }); - - auto nameTable = New<TNameTable>(); - TString result; - { - TStringOutput resultStream(result); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {tableSchema}); - - writer->Write({ - MakeRow(nameTable, { - {"number", 1}, - {"string", "hello"}, - {RangeIndexColumnName, 0}, - {RowIndexColumnName, 0}, - {"double", 1.5}, - }).Get() - }); - writer->Write({ - MakeRow(nameTable, { - {"number", 1}, - {RangeIndexColumnName, 5}, - {RowIndexColumnName, 1}, - {"double", 2.5}, - }).Get() - }); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(result); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - // row 0 - 
ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); - ASSERT_EQ(checkedSkiffParser.ParseDouble(), 1.5); - // row 1 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 5); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); - ASSERT_EQ(checkedSkiffParser.ParseDouble(), 2.5); - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); - } - -} - -TEST(TSkiffWriter, TestSkippedFieldsOutOfRange) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Nothing)->SetName("string"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - })->SetName(RangeIndexColumnName), - }); - auto tableSchema = New<TTableSchema>(std::vector{ - TColumnSchema("string", EValueType::String), - }); - - auto nameTable = New<TNameTable>(); - TString result; - { - TStringOutput resultStream(result); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {tableSchema}); - - writer->Write({ - MakeRow(nameTable, { - {"string", "hello"}, - {RangeIndexColumnName, 0}, - }).Get() - }); - writer->Write({ - MakeRow(nameTable, { - {RangeIndexColumnName, 5}, - }).Get() - }); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(result); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - 
ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); - // row 1 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 5); - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); - } - -} - -TEST(TSkiffWriter, TestSkippedFieldsAndKeySwitch) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), - CreateSimpleTypeSchema(EWireType::Nothing)->SetName("skipped"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$key_switch"), - CreateSimpleTypeSchema(EWireType::Int64)->SetName("value1"), - }); - TStringStream resultStream; - auto nameTable = New<TNameTable>(); - auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}, 1); - - writer->Write({ - // Row 0. - MakeRow(nameTable, { - {"value", "one"}, - {"value1", 0}, - {TableIndexColumnName, 0}, - }).Get(), - }); - // Row 1. - writer->Write({ - MakeRow(nameTable, { - {"value", "one"}, - {"value1", 1}, - {TableIndexColumnName, 0}, - }).Get(), - }); - // Row 2. 
- writer->Write({ - MakeRow(nameTable, { - {"value", "two"}, - {"value1", 2}, - {TableIndexColumnName, 0}, - }).Get(), - }); - writer->Close() - .Get() - .ThrowOnError(); - - TStringInput resultInput(resultStream.Str()); - TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); - - TString buf; - - // row 0 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); - ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); - - // row 1 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); - ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); - - // row 2 - ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); - ASSERT_EQ(checkedSkiffParser.ParseString32(), "two"); - ASSERT_EQ(checkedSkiffParser.ParseBoolean(), true); - ASSERT_EQ(checkedSkiffParser.ParseInt64(), 2); - - // end - ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); - checkedSkiffParser.ValidateFinished(); - -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TSkiffParser, Simple) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("int64"), - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("uint64"), - CreateSimpleTypeSchema(EWireType::Double)->SetName("double"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("boolean"), - CreateSimpleTypeSchema(EWireType::String32)->SetName("string32"), - CreateSimpleTypeSchema(EWireType::Nothing)->SetName("null"), - - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - })->SetName("opt_int64"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Uint64), - })->SetName("opt_uint64"), - CreateVariant8Schema({ - 
CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Double), - })->SetName("opt_double"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Boolean), - })->SetName("opt_boolean"), - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::String32), - })->SetName("opt_string32"), - }); - - TCollectingValueConsumer collectedRows; - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - - TStringStream dataStream; - TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); - - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteInt64(-1); - checkedSkiffWriter.WriteUint64(2); - checkedSkiffWriter.WriteDouble(3.0); - checkedSkiffWriter.WriteBoolean(true); - checkedSkiffWriter.WriteString32("foo"); - - checkedSkiffWriter.WriteVariant8Tag(0); - checkedSkiffWriter.WriteVariant8Tag(0); - checkedSkiffWriter.WriteVariant8Tag(0); - checkedSkiffWriter.WriteVariant8Tag(0); - checkedSkiffWriter.WriteVariant8Tag(0); - - checkedSkiffWriter.Finish(); - - parser->Read(dataStream.Str()); - parser->Finish(); - - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 1); - - ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "int64")), -1); - ASSERT_EQ(GetUint64(collectedRows.GetRowValue(0, "uint64")), 2u); - ASSERT_EQ(GetDouble(collectedRows.GetRowValue(0, "double")), 3.0); - ASSERT_EQ(GetBoolean(collectedRows.GetRowValue(0, "boolean")), true); - ASSERT_EQ(GetString(collectedRows.GetRowValue(0, "string32")), "foo"); - ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "null")), true); - - ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "opt_int64")), true); - ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "opt_uint64")), true); - ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "opt_double")), true); - ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "opt_boolean")), true); - 
ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "opt_string32")), true); -} - -TEST(TSkiffParser, TestOptionalNull) -{ - auto skiffSchema = CreateTupleSchema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Nothing), - })->SetName("opt_null"), - }); - auto nameTable = New<TNameTable>(); - - { - TCollectingValueConsumer collectedRows; - EXPECT_THROW_WITH_SUBSTRING( - CreateParserForSkiff(skiffSchema, &collectedRows), - "cannot be represented with Skiff schema"); - } - - auto tableSchema = New<TTableSchema>(std::vector{ - TColumnSchema("opt_null", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Null))), - }); - - TCollectingValueConsumer collectedRows(tableSchema); - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - - TStringStream dataStream; - TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); - - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteVariant8Tag(0); - - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteVariant8Tag(1); - - checkedSkiffWriter.Finish(); - - parser->Read(dataStream.Str()); - parser->Finish(); - - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); - - ASSERT_EQ(collectedRows.GetRowValue(0, "opt_null").Type, EValueType::Null); -} - -TEST(TSkiffParser, TestSparse) -{ - auto skiffSchema = CreateTupleSchema({ - CreateRepeatedVariant16Schema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("int64"), - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("uint64"), - CreateSimpleTypeSchema(EWireType::String32)->SetName("string32"), - })->SetName("$sparse_columns"), - }); - - TCollectingValueConsumer collectedRows; - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - - TStringStream dataStream; - TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); - - // row 1 - checkedSkiffWriter.WriteVariant16Tag(0); - // sparse fields begin 
- checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteInt64(-42); - checkedSkiffWriter.WriteVariant16Tag(1); - checkedSkiffWriter.WriteUint64(54); - checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); - - // row 2 - checkedSkiffWriter.WriteVariant16Tag(0); - // sparse fields begin - checkedSkiffWriter.WriteVariant16Tag(2); - checkedSkiffWriter.WriteString32("foo"); - checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); - - checkedSkiffWriter.Finish(); - - parser->Read(dataStream.Str()); - parser->Finish(); - - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); - - ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "int64")), -42); - ASSERT_EQ(GetUint64(collectedRows.GetRowValue(0, "uint64")), 54u); - ASSERT_FALSE(collectedRows.FindRowValue(0, "string32")); - - ASSERT_FALSE(collectedRows.FindRowValue(1, "int64")); - ASSERT_FALSE(collectedRows.FindRowValue(1, "uint64")); - ASSERT_EQ(GetString(collectedRows.GetRowValue(1, "string32")), "foo"); -} - -TEST(TSkiffParser, TestYsonWireType) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson"), - }); - - TCollectingValueConsumer collectedRows; - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - - TStringStream dataStream; - TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); - - // Row 0. - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteYson32("-42"); - - // Row 1. - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteYson32("42u"); - - // Row 2. - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteYson32("\"foobar\""); - - // Row 3. - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteYson32("%true"); - - // Row 4. - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteYson32("{foo=bar}"); - - // Row 5. 
- checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteYson32("#"); - - checkedSkiffWriter.Finish(); - - parser->Read(dataStream.Str()); - parser->Finish(); - - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 6); - ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "yson")), -42); - ASSERT_EQ(GetUint64(collectedRows.GetRowValue(1, "yson")), 42u); - ASSERT_EQ(GetString(collectedRows.GetRowValue(2, "yson")), "foobar"); - ASSERT_EQ(GetBoolean(collectedRows.GetRowValue(3, "yson")), true); - ASSERT_EQ(GetAny(collectedRows.GetRowValue(4, "yson"))->AsMap()->GetChildOrThrow("foo")->AsString()->GetValue(), "bar"); - ASSERT_EQ(IsNull(collectedRows.GetRowValue(5, "yson")), true); -} - -TEST(TSkiffParser, TestBadYsonWireType) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson"), - }); - - auto parseYsonUsingSkiff = [&] (TStringBuf ysonValue) { - TCollectingValueConsumer collectedRows; - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - TStringStream dataStream; - ASSERT_NO_THROW({ - TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); - - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteYson32(ysonValue); - - checkedSkiffWriter.Finish(); - }); - - parser->Read(dataStream.Str()); - parser->Finish(); - }; - - try { - parseYsonUsingSkiff("[42"); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("Premature end of stream")); - } - - try { - parseYsonUsingSkiff("<foo=bar>42"); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("Table values cannot have top-level attributes")); - } -} - -TEST(TSkiffParser, TestSpecialColumns) -{ - std::shared_ptr<TSkiffSchema> skiffSchemaList[] = { - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$key_switch"), - }), - CreateTupleSchema({ - 
CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$row_switch"), - }), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$range_switch"), - }), - }; - - for (const auto& skiffSchema : skiffSchemaList) { - try { - TCollectingValueConsumer collectedRows; - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - } catch (std::exception& e) { - EXPECT_THAT(e.what(), testing::HasSubstr("Skiff parser does not support \"$key_switch\"")); - } - } -} - -TEST(TSkiffParser, TestOtherColumns) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), - CreateSimpleTypeSchema(EWireType::Yson32)->SetName("$other_columns"), - }); - - TCollectingValueConsumer collectedRows; - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - - TStringStream dataStream; - TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); - - // Row 0. - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteString32("row_0"); - checkedSkiffWriter.WriteYson32("{foo=-42;}"); - - // Row 1. - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteString32("row_1"); - checkedSkiffWriter.WriteYson32("{bar=qux;baz={boolean=%false;};}"); - - // Row 2. 
- checkedSkiffWriter.Finish(); - - parser->Read(dataStream.Str()); - parser->Finish(); - - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); - ASSERT_EQ(GetString(collectedRows.GetRowValue(0, "name")), "row_0"); - ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "foo")), -42); - - ASSERT_EQ(GetString(collectedRows.GetRowValue(1, "name")), "row_1"); - ASSERT_EQ(GetString(collectedRows.GetRowValue(1, "bar")), "qux"); - ASSERT_EQ(ConvertToYsonTextStringStable(GetAny(collectedRows.GetRowValue(1, "baz"))), "{\"boolean\"=%false;}"); -} - -TEST(TSkiffParser, TestComplexColumn) -{ - auto skiffSchema = CreateTupleSchema({ - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), - CreateSimpleTypeSchema(EWireType::Int64)->SetName("value"), - })->SetName("column") - }); - - TCollectingValueConsumer collectedRows( - New<TTableSchema>(std::vector{ - TColumnSchema("column", NTableClient::StructLogicalType({ - {"key", NTableClient::SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"value", NTableClient::SimpleLogicalType(ESimpleLogicalValueType::Int64)} - })) - })); - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - - TStringStream dataStream; - TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); - - // Row 0. 
- checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteString32("row_0"); - checkedSkiffWriter.WriteInt64(42); - - checkedSkiffWriter.Finish(); - - parser->Read(dataStream.Str()); - parser->Finish(); - - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 1); - ASSERT_EQ(ConvertToYsonTextStringStable(GetComposite(collectedRows.GetRowValue(0, "column"))), "[\"row_0\";42;]"); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TSkiffParser, TestEmptyInput) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("column"), - }); - - TCollectingValueConsumer collectedRows; - - { - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - parser->Finish(); - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 0); - } - { - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - parser->Read(""); - parser->Finish(); - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 0); - } - { - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - parser->Read(""); - parser->Read(""); - parser->Finish(); - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 0); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TSkiffParser, ColumnIds) -{ - auto skiffSchema = CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("field_a"), - CreateSimpleTypeSchema(EWireType::Uint64)->SetName("field_b") - }); - - TCollectingValueConsumer collectedRows; - collectedRows.GetNameTable()->GetIdOrRegisterName("field_b"); - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - - TStringStream dataStream; - TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); - - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteInt64(-1); - checkedSkiffWriter.WriteUint64(2); - - checkedSkiffWriter.Finish(); - - parser->Read(dataStream.Str()); - parser->Finish(); - - 
ASSERT_EQ(static_cast<int>(collectedRows.Size()), 1); - - ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "field_a")), -1); - ASSERT_EQ(GetUint64(collectedRows.GetRowValue(0, "field_b")), 2u); -} - -TEST(TSkiffParser, TestSparseComplexType) -{ - auto skiffSchema = CreateTupleSchema({ - CreateRepeatedVariant16Schema({ - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), - CreateSimpleTypeSchema(EWireType::Int64)->SetName("value"), - })->SetName("value"), - })->SetName("$sparse_columns"), - }); - - TCollectingValueConsumer collectedRows( - New<TTableSchema>(std::vector{ - TColumnSchema("value", OptionalLogicalType( - StructLogicalType({ - {"name", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"value", SimpleLogicalType(ESimpleLogicalValueType::Int64)} - }) - )) - })); - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - - TStringStream dataStream; - TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); - - // Row 0. - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteString32("row_0"); - checkedSkiffWriter.WriteInt64(10); - checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); - - // Row 1. 
- checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); - - checkedSkiffWriter.Finish(); - - parser->Read(dataStream.Str()); - parser->Finish(); - - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); - EXPECT_EQ(ConvertToYsonTextStringStable(GetComposite(collectedRows.GetRowValue(0, "value"))), "[\"row_0\";10;]"); - EXPECT_FALSE(collectedRows.FindRowValue(1, "value")); -} - -TEST(TSkiffParser, TestSparseComplexTypeWithExtraOptional) -{ - auto skiffSchema = CreateTupleSchema({ - CreateRepeatedVariant16Schema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), - CreateSimpleTypeSchema(EWireType::Int64)->SetName("value"), - }) - })->SetName("column"), - })->SetName("$sparse_columns"), - }); - - TCollectingValueConsumer collectedRows( - New<TTableSchema>(std::vector{ - TColumnSchema("column", OptionalLogicalType( - StructLogicalType({ - {"key", NTableClient::SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"value", NTableClient::SimpleLogicalType(ESimpleLogicalValueType::Int64)} - }) - )) - })); - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - - TStringStream dataStream; - TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); - - // Row 0. - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteVariant8Tag(1); - checkedSkiffWriter.WriteString32("row_0"); - checkedSkiffWriter.WriteInt64(42); - checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); - - // Row 1. 
- checkedSkiffWriter.WriteVariant16Tag(0); - checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); - - checkedSkiffWriter.Finish(); - - parser->Read(dataStream.Str()); - parser->Finish(); - - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); - ASSERT_EQ(ConvertToYsonTextStringStable(GetComposite(collectedRows.GetRowValue(0, "column"))), "[\"row_0\";42;]"); - ASSERT_FALSE(collectedRows.FindRowValue(1, "column")); -} - - -TEST(TSkiffParser, TestBadWireTypeForSimpleColumn) -{ - auto skiffSchema = CreateTupleSchema({ - CreateVariant8Schema({ - CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Yson32), - }) - })->SetName("opt_yson32"), - }); - - TCollectingValueConsumer collectedRows; - EXPECT_THROW_WITH_SUBSTRING( - CreateParserForSkiff(skiffSchema, &collectedRows), - "cannot be represented with Skiff schema" - ); -} - -TEST(TSkiffParser, TestEmptyColumns) -{ - auto skiffSchema = CreateTupleSchema({}); - TCollectingValueConsumer collectedRows; - auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); - - parser->Read(TStringBuf("\x00\x00\x00\x00"sv)); - parser->Finish(); - - ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); -} - -TEST(TSkiffFormat, TestTimestamp) -{ - using namespace NLogicalTypeShortcuts; - CHECK_BIDIRECTIONAL_CONVERSION(Timestamp(), CreateSimpleTypeSchema(EWireType::Uint64), 42ull, "2A000000" "00000000"); - CHECK_BIDIRECTIONAL_CONVERSION(Interval(), CreateSimpleTypeSchema(EWireType::Int64), 42, "2A000000" "00000000"); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT diff --git a/yt/yt/client/unittests/skiff_yson_converter_ut.cpp b/yt/yt/client/unittests/skiff_yson_converter_ut.cpp deleted file mode 100644 index 18ecfac352..0000000000 --- a/yt/yt/client/unittests/skiff_yson_converter_ut.cpp +++ /dev/null @@ -1,728 +0,0 @@ -#include "logical_type_shortcuts.h" - -#include 
<yt/yt/core/test_framework/framework.h> - -#include <yt/yt/client/table_client/logical_type.h> -#include <yt/yt/client/formats/skiff_yson_converter.h> - -#include <yt/yt/core/yson/parser.h> -#include <yt/yt/core/yson/pull_parser.h> -#include <yt/yt/core/yson/token_writer.h> -#include <yt/yt/core/yson/writer.h> - -#include <library/cpp/skiff/skiff.h> -#include <library/cpp/skiff/skiff_schema.h> - -#include <util/string/hex.h> - -#include <util/stream/mem.h> - -namespace NYT::NFormats { -namespace { - -using namespace NTableClient; -using namespace NSkiff; -using namespace NYson; -using namespace NTableClient::NLogicalTypeShortcuts; - -//////////////////////////////////////////////////////////////////////////////// - -std::shared_ptr<TSkiffSchema> SkiffOptional(std::shared_ptr<TSkiffSchema> skiffSchema) -{ - return CreateVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Nothing), - std::move(skiffSchema) - }); -} - -TString ConvertYsonHex( - const TLogicalTypePtr& logicalType, - const std::shared_ptr<TSkiffSchema>& skiffSchema, - TStringBuf ysonString, - const TYsonToSkiffConverterConfig& config = {}) -{ - auto converter = CreateYsonToSkiffConverter( - TComplexTypeFieldDescriptor("test-field", logicalType), - skiffSchema, - config); - - // Yson parsers have a bug when they can't parse some values that end unexpectedly. 
- TString spacedYsonInput = TString{ysonString} + " "; - - TStringStream out; - { - TCheckedInDebugSkiffWriter writer(skiffSchema, &out); - - TMemoryInput in(spacedYsonInput); - TYsonPullParser pullParser(&in, EYsonType::Node); - TYsonPullParserCursor cursor(&pullParser); - - converter(&cursor, &writer); - - EXPECT_EQ(cursor.GetCurrent().GetType(), EYsonItemType::EndOfStream); - writer.Finish(); - } - - auto result = HexEncode(out.Str()); - result.to_lower(); - return result; -} - -TString ConvertHexToTextYson( - const TLogicalTypePtr& logicalType, - const std::shared_ptr<TSkiffSchema>& skiffSchema, - TStringBuf hexString, - const TSkiffToYsonConverterConfig& config = {}) -{ - auto converter = CreateSkiffToYsonConverter(TComplexTypeFieldDescriptor("test-field", logicalType), skiffSchema, config); - - - TStringStream binaryOut; - { - TString binaryString = HexDecode(hexString); - TMemoryInput in(binaryString); - TCheckedInDebugSkiffParser parser(skiffSchema, &in); - - auto writer = TCheckedInDebugYsonTokenWriter(&binaryOut); - converter(&parser, &writer); - EXPECT_EQ(parser.GetReadBytesCount(), binaryString.size()); - } - binaryOut.Finish(); - - TStringStream out; - { - auto writer = TYsonWriter(&out, EYsonFormat::Text); - ParseYsonStringBuffer(binaryOut.Str(), EYsonType::Node, &writer); - } - out.Finish(); - - return out.Str(); -} - - -#define CHECK_BIDIRECTIONAL_CONVERSION(logicalType, skiffSchema, ysonString, skiffString, ...) 
\ - do { \ - std::tuple<TYsonToSkiffConverterConfig,TSkiffToYsonConverterConfig> cfg = {__VA_ARGS__}; \ - auto actualSkiffString = ConvertYsonHex(logicalType, skiffSchema, ysonString, std::get<0>(cfg)); \ - EXPECT_EQ(actualSkiffString, skiffString) << "Yson -> Skiff conversion error"; \ - auto actualYsonString = ConvertHexToTextYson(logicalType, skiffSchema, skiffString, std::get<1>(cfg)); \ - EXPECT_EQ(actualYsonString, ysonString) << "Skiff -> Yson conversion error"; \ - } while (0) - - -TEST(TYsonSkiffConverterTest, TestSimpleTypes) -{ - CHECK_BIDIRECTIONAL_CONVERSION( - Int8(), - CreateSimpleTypeSchema(EWireType::Int64), - "-42", - "d6ffffff" "ffffffff"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Uint64(), - CreateSimpleTypeSchema(EWireType::Uint64), - "42u", - "2a000000" "00000000"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Uint64(), - CreateSimpleTypeSchema(EWireType::Uint64), - "8u", - "08000000" "00000000"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Bool(), - CreateSimpleTypeSchema(EWireType::Boolean), - "%true", - "01"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Double(), - CreateSimpleTypeSchema(EWireType::Double), - "0.", - "00000000" "00000000"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Float(), - CreateSimpleTypeSchema(EWireType::Double), - "0.", - "00000000" "00000000"); - - CHECK_BIDIRECTIONAL_CONVERSION( - String(), - CreateSimpleTypeSchema(EWireType::String32), - "\"foo\"", - "03000000" "666f6f"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Null(), - CreateSimpleTypeSchema(EWireType::Nothing), - "#", - ""); - - CHECK_BIDIRECTIONAL_CONVERSION( - Uuid(), - CreateSimpleTypeSchema(EWireType::Uint128), - "\"\\xF0\\xF1\\xF2\\xF3\\xF4\\xF5\\xF6\\xF7\\xF8\\xF9\\xFA\\xFB\\xFC\\xFD\\xFE\\xFF\"", - "fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Uuid(), - CreateSimpleTypeSchema(EWireType::String32), - "\"\\xF0\\xF1\\xF2\\xF3\\xF4\\xF5\\xF6\\xF7\\xF8\\xF9\\xFA\\xFB\\xFC\\xFD\\xFE\\xFF\"", - "10000000f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"); -} - 
-TEST(TYsonSkiffConverterTest, TestYson32) -{ - CHECK_BIDIRECTIONAL_CONVERSION( - Yson(), - CreateSimpleTypeSchema(EWireType::Yson32), - "-42", - "02000000" "0253"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Yson(), - CreateSimpleTypeSchema(EWireType::Yson32), - "#", - "01000000" "23"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Yson(), - CreateSimpleTypeSchema(EWireType::Yson32), - "[1;2;[3;];]", - "0e000000" "5b02023b02043b5b02063b5d3b5d"); -} - -TEST(TYsonSkiffConverterTest, TestOptionalTypes) -{ - CHECK_BIDIRECTIONAL_CONVERSION( - Optional(Int64()), - SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), - "-42", - "01" "d6ffffff" "ffffffff"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Optional(Int64()), - SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), - "#", - "00"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Optional(Optional(Bool())), - SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean))), - "[%true;]", - "01" "01" "01"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Optional(Optional(Bool())), - SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean))), - "[#;]", - "01" "00"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Optional(Optional(Bool())), - SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean))), - "#", - "00"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Optional(List(Bool())), - SkiffOptional(CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Boolean)})), - "#", - "00"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Optional(Optional(List(Bool()))), - SkiffOptional( - SkiffOptional( - CreateRepeatedVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Boolean) - }) - ) - ), - "[[%true;%false;%true;];]", - "01" "01" "0001" "0000" "0001" "ff"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Optional(Optional(List(Bool()))), - SkiffOptional( - SkiffOptional( - CreateRepeatedVariant8Schema({ - CreateSimpleTypeSchema(EWireType::Boolean) - }) - ) - ), - "[#;]", - "0100"); - - EXPECT_THROW_WITH_SUBSTRING( - ConvertYsonHex( - 
Optional(Optional(Bool())), - SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean)), - " [ %true ] "), - "Optional nesting mismatch"); - - EXPECT_THROW_WITH_SUBSTRING( - ConvertHexToTextYson( - Optional(Bool()), - CreateSimpleTypeSchema(EWireType::Boolean), - "00"), - "Optional nesting mismatch"); - - TYsonToSkiffConverterConfig ysonToSkiffConfig; - ysonToSkiffConfig.AllowOmitTopLevelOptional = true; - - TSkiffToYsonConverterConfig skiffToYsonConfig; - skiffToYsonConfig.AllowOmitTopLevelOptional = true; - - CHECK_BIDIRECTIONAL_CONVERSION( - Optional(Optional(Bool())), - SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean)), - "[%true;]", - "01" "01", - ysonToSkiffConfig, - skiffToYsonConfig); - - CHECK_BIDIRECTIONAL_CONVERSION( - Optional(Optional(Bool())), - SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean)), - "[#;]", - "00", - ysonToSkiffConfig, - skiffToYsonConfig); - - EXPECT_THROW_WITH_SUBSTRING( - ConvertYsonHex( - Optional(Optional(Bool())), - SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean)), - " # ", - ysonToSkiffConfig), - "value expected to be nonempty"); -} - -TEST(TYsonSkiffConverterTest, TestListTypes) -{ - CHECK_BIDIRECTIONAL_CONVERSION( - List(Bool()), - CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Boolean)}), - "[]", - "ff"); - - CHECK_BIDIRECTIONAL_CONVERSION( - List(Bool()), - CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Boolean)}), - "[%true;%true;%true;]", - "00" "01" "00" "01" "00" "01" "ff"); - - CHECK_BIDIRECTIONAL_CONVERSION( - List(List(Bool())), - CreateRepeatedVariant8Schema({CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Boolean)})}), - "[[];[%true;];[%true;%true;];]", - "00" "ff" "00" "0001ff" "00" "00010001ff" "ff"); -} - -TEST(TYsonSkiffConverterTest, TestStruct) -{ - CHECK_BIDIRECTIONAL_CONVERSION( - Struct( - "key", String(), - "value", Bool() - ), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), - 
CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), - }), - "[\"true\";%true;]", - "04000000" "74727565" "01"); -} - -TEST(TYsonSkiffConverterTest, TestSkippedFields) -{ - TString skiffString; - skiffString = ConvertYsonHex( - Struct( - "key", String(), - "subkey", Int64(), - "value", Bool() - ), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), - }), - " [ true ; 1; %true ] "); - EXPECT_EQ(skiffString, "04000000" "74727565" "01"sv); - - skiffString = ConvertYsonHex( - Struct( - "key", String(), - "subkey", Int64(), - "value", Bool() - ), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("subkey"), - }), - " [ true ; 1; %true ] "); - EXPECT_EQ(skiffString, "01000000" "00000000"sv); - - try { - ConvertHexToTextYson( - Struct( - "key", String(), - "subkey", Int64(), - "value", Bool() - ), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("subkey"), - }), - "01000000" "00000000"); - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::ContainsRegex("Non optional struct field .* is missing")); - } - - CHECK_BIDIRECTIONAL_CONVERSION( - Struct( - "key", Optional(String()), - "subkey", Int64(), - "value", Optional(Bool()) - ), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Int64)->SetName("subkey"), - }), - "[#;15;#;]", - "0f000000" "00000000"); -} - -TEST(TYsonSkiffConverterTest, TestUnknownSkiffFields) -{ - TString skiffString; - skiffString = ConvertYsonHex( - Struct( - "key", String(), - "subkey", Int64(), - "value", Bool() - ), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), - SkiffOptional(CreateSimpleTypeSchema(EWireType::String32))->SetName("key2"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), - }), - " [ true ; 1; %true ] "); - EXPECT_EQ(skiffString, "04000000" "74727565" "00" "01"sv); - - skiffString = ConvertYsonHex( - 
Struct( - "key", String(), - "subkey", Int64(), - "value", Bool() - ), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), - SkiffOptional(CreateSimpleTypeSchema(EWireType::Yson32))->SetName("value2"), - }), - " [ true ; 1; %true ] "); - EXPECT_EQ(skiffString, "04000000" "74727565" "01" "00"sv); - - - try { - ConvertYsonHex( - Struct( - "key", String(), - "subkey", Int64(), - "value", Bool() - ), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), - CreateSimpleTypeSchema(EWireType::Yson32)->SetName("value2"), - }), - " [ true ; 1; %true ] "); - GTEST_FAIL() << "exception expected"; - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::ContainsRegex("Non optional Skiff field .* is missing corresponding logical struct field")); - } - - try { - ConvertHexToTextYson( - Struct( - "key", String(), - "subkey", Int64(), - "value", Bool() - ), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), - SkiffOptional(CreateSimpleTypeSchema(EWireType::String32))->SetName("key2"), - CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), - }), - "04000000" "74727565" "00" "01"sv); - GTEST_FAIL() << "expected_exception"; - } catch (const std::exception& e) { - EXPECT_THAT(e.what(), testing::ContainsRegex("is not found in logical type")); - } -} - -TEST(TYsonSkiffConverterTest, TestTuple) -{ - CHECK_BIDIRECTIONAL_CONVERSION( - Tuple(String(), Bool()), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32), - CreateSimpleTypeSchema(EWireType::Boolean), - }), - "[\"true\";%true;]", - "04000000" "74727565" "01"); - - CHECK_BIDIRECTIONAL_CONVERSION( - Tuple(Int64(), Optional(Int64())), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Int64), - SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), - }), - "[2;42;]", - 
"02000000" "00000000" "01" "2a000000" "00000000"); -} - -TEST(TYsonSkiffConverterTest, TestTupleSkippedFields) -{ - TString skiffString; - skiffString = ConvertYsonHex( - Tuple(String(), Int64(), Bool()), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32), - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Boolean), - }), - " [ true ; 1; %true ] "); - EXPECT_EQ(skiffString, "04000000" "74727565" "01"sv); - - skiffString = ConvertYsonHex( - Tuple(String(), Int64(), Bool()), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - CreateSimpleTypeSchema(EWireType::Nothing), - }), - " [ true ; 1; %true ] "); - EXPECT_EQ(skiffString, "01000000" "00000000"sv); - - skiffString = ConvertYsonHex( - Tuple(Optional(String()), Int64(), Optional(Bool())), - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::Nothing), - CreateSimpleTypeSchema(EWireType::Int64), - CreateSimpleTypeSchema(EWireType::Nothing) - }), - "[#;15;#;]" - ); - EXPECT_EQ(skiffString, "0f000000" "00000000"sv); -} - -TEST(TYsonSkiffConverterTest, TestDict) -{ - const auto logicalType = Dict(String(), Int64()); - const auto skiffSchema = CreateRepeatedVariant8Schema({ - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32), - CreateSimpleTypeSchema(EWireType::Int64) - }) - }); - - CHECK_BIDIRECTIONAL_CONVERSION( - logicalType, - skiffSchema, - "[[\"one\";1;];[\"two\";2;];]", - "00" "03000000" "6f6e65" "01000000" "00000000" - "00" "03000000" "74776f" "02000000" "00000000" - "ff" - ); - - EXPECT_THROW_WITH_SUBSTRING( - ConvertHexToTextYson(logicalType, skiffSchema, "01" "01000000" "6f" "01000000" "00000000" "ff"), - "Unexpected repeated_variant8 tag" - ); - - EXPECT_THROW_WITH_SUBSTRING( - ConvertHexToTextYson(logicalType, skiffSchema, "00" "01000000" "6f" "01000000" "00000000"), - "Premature end of stream" - ); -} - -TEST(TYsonSkiffConverterTest, TestTagged) -{ - const auto logicalType = 
Tagged( - "tag", - Dict(Tagged("tag", String()), Int64())); - const auto skiffSchema = CreateRepeatedVariant8Schema({ - CreateTupleSchema({ - CreateSimpleTypeSchema(EWireType::String32), - CreateSimpleTypeSchema(EWireType::Int64) - }) - }); - CHECK_BIDIRECTIONAL_CONVERSION( - logicalType, - skiffSchema, - "[[\"one\";1;];[\"two\";2;];]", - "00" "03000000" "6f6e65" "01000000" "00000000" - "00" "03000000" "74776f" "02000000" "00000000" - "ff" - ); -} - -TEST(TYsonSkiffConverterTest, TestOptionalVariantSimilarity) -{ - auto logicalType = Optional( - VariantTuple(Null(), Int64()) - ); - - CHECK_BIDIRECTIONAL_CONVERSION( - logicalType, - SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64))), - "[1;42;]", - "01" "01" "2a000000" "00000000"); - - CHECK_BIDIRECTIONAL_CONVERSION( - logicalType, - SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64))), - "[0;#;]", - "01" "00"); - - CHECK_BIDIRECTIONAL_CONVERSION( - logicalType, - SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64))), - "#", - "00"); - - TYsonToSkiffConverterConfig ysonToSkiffConfig; - ysonToSkiffConfig.AllowOmitTopLevelOptional = true; - - TSkiffToYsonConverterConfig skiffToYsonConfig; - skiffToYsonConfig.AllowOmitTopLevelOptional = true; - - CHECK_BIDIRECTIONAL_CONVERSION( - logicalType, - SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), - "[1;42;]", - "01" "2a000000" "00000000", - ysonToSkiffConfig, - skiffToYsonConfig); - - CHECK_BIDIRECTIONAL_CONVERSION( - logicalType, - SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), - "[0;#;]", - "00", - ysonToSkiffConfig, - skiffToYsonConfig); - - EXPECT_THROW_WITH_SUBSTRING( - ConvertYsonHex( - logicalType, - SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), - "#", - ysonToSkiffConfig), - "value expected to be nonempty" - ); -} - -class TYsonSkiffConverterTestVariant - : public ::testing::TestWithParam<std::tuple<ELogicalMetatype, EWireType>> -{ -public: - TLogicalTypePtr VariantLogicalType(const 
std::vector<TLogicalTypePtr>& elements) - { - auto [metatype, wireType] = GetParam(); - if (metatype == ELogicalMetatype::VariantTuple) { - return VariantTupleLogicalType(elements); - } else { - std::vector<TStructField> fields; - for (size_t i = 0; i < elements.size(); ++i) { - fields.push_back({Format("field%v", i), elements[i]}); - } - return VariantStructLogicalType(fields); - } - } - - std::shared_ptr<TSkiffSchema> VariantSkiffSchema(std::vector<std::shared_ptr<TSkiffSchema>> elements) - { - for (size_t i = 0; i < elements.size(); ++i) { - elements[i]->SetName(Format("field%v", i)); - } - auto [metatype, wireType] = GetParam(); - if (wireType == EWireType::Variant8) { - return CreateVariant8Schema(std::move(elements)); - } else if (wireType == EWireType::Variant16) { - return CreateVariant16Schema(std::move(elements)); - } - Y_UNREACHABLE(); - } - - TString VariantTagInfix() const - { - auto [metatype, wireType] = GetParam(); - if (wireType == EWireType::Variant16) { - return "00"; - } - return {}; - } -}; - -TEST_P(TYsonSkiffConverterTestVariant, TestVariant) -{ - CHECK_BIDIRECTIONAL_CONVERSION( - VariantLogicalType({ - Int64(), - Bool() - }), - VariantSkiffSchema({ - CreateSimpleTypeSchema(EWireType::Int64), - CreateSimpleTypeSchema(EWireType::Boolean), - }), - "[0;42;]", - "00" + VariantTagInfix() + "2a000000" "00000000"); - - CHECK_BIDIRECTIONAL_CONVERSION( - VariantLogicalType({ - Int64(), - Bool() - }), - VariantSkiffSchema({ - CreateSimpleTypeSchema(EWireType::Int64), - CreateSimpleTypeSchema(EWireType::Boolean), - }), - "[1;%true;]", - "01" + VariantTagInfix() + "01"); -} - -TEST_P(TYsonSkiffConverterTestVariant, TestMalformedVariants) -{ - auto logicalType = VariantLogicalType({ - Bool(), - Int64(), - }); - auto skiffSchema = VariantSkiffSchema({ - CreateSimpleTypeSchema(EWireType::Boolean), - CreateSimpleTypeSchema(EWireType::Int64), - }); - - EXPECT_THROW_WITH_SUBSTRING(ConvertYsonHex(logicalType, skiffSchema, "[2; 42]"), "Yson to Skiff conversion 
error"); - EXPECT_THROW_WITH_SUBSTRING(ConvertYsonHex(logicalType, skiffSchema, "[]"), "Yson to Skiff conversion error"); - EXPECT_THROW_WITH_SUBSTRING(ConvertYsonHex(logicalType, skiffSchema, "[0]"), "Yson to Skiff conversion error"); - - EXPECT_THROW_WITH_SUBSTRING(ConvertHexToTextYson(logicalType, skiffSchema, "02" + VariantTagInfix() + "00"), - "Skiff to Yson conversion error"); -} - -INSTANTIATE_TEST_SUITE_P( - Variants, - TYsonSkiffConverterTestVariant, - ::testing::Combine( - ::testing::ValuesIn({ELogicalMetatype::VariantStruct, ELogicalMetatype::VariantTuple}), - ::testing::ValuesIn({EWireType::Variant8, EWireType::Variant16}) - ) -); - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NFormats diff --git a/yt/yt/client/unittests/validate_logical_type_ut.cpp b/yt/yt/client/unittests/validate_logical_type_ut.cpp index 1bebe1c952..dce4792430 100644 --- a/yt/yt/client/unittests/validate_logical_type_ut.cpp +++ b/yt/yt/client/unittests/validate_logical_type_ut.cpp @@ -1,4 +1,4 @@ -#include "logical_type_shortcuts.h" +#include <yt/yt/library/logical_type_shortcuts/logical_type_shortcuts.h> #include <yt/yt/core/test_framework/framework.h> diff --git a/yt/yt/client/unittests/value_examples.cpp b/yt/yt/client/unittests/value_examples.cpp deleted file mode 100644 index c2714fc856..0000000000 --- a/yt/yt/client/unittests/value_examples.cpp +++ /dev/null @@ -1,147 +0,0 @@ -#include "value_examples.h" - -#include "logical_type_shortcuts.h" - -#include <yt/yt/library/decimal/decimal.h> - -#include <cmath> - -namespace NYT::NTableClient { - -//////////////////////////////////////////////////////////////////////////////// - -using namespace NLogicalTypeShortcuts; -using namespace NNamedValue; - -//////////////////////////////////////////////////////////////////////////////// - -TValueExample::TValueExample(TLogicalTypePtr logicalType, TNamedValue::TValue value, TString prettyYson) - : 
LogicalType(std::move(logicalType)) - , Value(std::move(value)) - , PrettyYson(std::move(prettyYson)) -{ } - -//////////////////////////////////////////////////////////////////////////////// - -std::vector<TValueExample> GetPrimitiveValueExamples() -{ - static const std::vector<TValueExample> valueExamples = { - TValueExample{Int8(), 0, "0"}, - TValueExample{Int8(), -5, "-5"}, - TValueExample{Int8(), 42, "42"}, - TValueExample{Int8(), -128, "-128"}, - TValueExample{Int8(), 127, "127"}, - - TValueExample{Int16(), 0, "0"}, - TValueExample{Int16(), -6, "-6"}, - TValueExample{Int16(), 43, "43"}, - TValueExample{Int16(), 0x7FFF, "32767"}, - TValueExample{Int16(), -0x8000, "-32768"}, - - TValueExample{Int32(), 0, "0"}, - TValueExample{Int32(), -7, "-7"}, - TValueExample{Int32(), 44, "44"}, - TValueExample{Int32(), 0x7FFFFFFF, "2147483647"}, - TValueExample{Int32(), -0x80000000ll, "-2147483648"}, - - TValueExample{Int64(), 0, "0"}, - TValueExample{Int64(), -7, "-7"}, - TValueExample{Int64(), 45, "45"}, - TValueExample{Int64(), 0x7FFFFFFFFFFFFFFFll, "9223372036854775807"}, - TValueExample{Int64(), i64(-0x8000000000000000ll), "-9223372036854775808"}, - - TValueExample{Uint8(), 0ull, "0u"}, - TValueExample{Uint8(), 46ull, "46u"}, - TValueExample{Uint8(), 255ull, "255u"}, - - TValueExample{Uint16(), 0ull, "0u"}, - TValueExample{Uint16(), 47ull, "47u"}, - TValueExample{Uint16(), 0xFFFFull, "65535u"}, - - TValueExample{Uint32(), 0ull, "0u"}, - TValueExample{Uint32(), 48ull, "48u"}, - TValueExample{Uint32(), 0xFFFFFFFFull, "4294967295u"}, - - TValueExample{Uint64(), 0ull, "0u"}, - TValueExample{Uint64(), 49ull, "49u"}, - TValueExample{Uint64(), 0xFFFFFFFFFFFFFFFFull, "18446744073709551615u"}, - - TValueExample{String(), "", R"("")"}, - TValueExample{String(), "foo", R"("foo")"}, - TValueExample{String(), TString(TStringBuf("\xf0\x00"sv)), R"("\xf0\x00")"}, - - TValueExample{Utf8(), "", R"("")"}, - TValueExample{Utf8(), "bar", R"("bar")"}, - - TValueExample{Bool(), true, 
"%true"}, - TValueExample{Bool(), false, "%false"}, - - // NB. .125 = 1 / 8 is - TValueExample{Double(), 3.125, "3.125"}, - TValueExample{Double(), 2.775, "2.775"}, - // TPrimitiveTypeExample{Double(), std::nan("1"), "%nan"}, - TValueExample{Double(), INFINITY, "%inf"}, - TValueExample{Double(), -INFINITY, "%-inf"}, - - TValueExample{Float(), 5.125, "5.125"}, - TValueExample{Float(), 6.775, "6.775"}, - - TValueExample{Null(), nullptr, "#"}, - TValueExample{Void(), nullptr, "#"}, - - TValueExample{Json(), "83", R"("83")"}, - TValueExample{Json(), "[]", R"("[]")"}, - - TValueExample{ - Uuid(), - TString(16, 0), - TString(TStringBuf(R"("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00")")) - }, - TValueExample{ - Uuid(), - TString(TStringBuf("\x01\x23\x45\x67\x89\xAB\xCD\xEF\xFE\xDC\xBA\x98\x76\x54\x32\x10"sv)), - TString(TStringBuf(R"("\x01\x23\x45\x67\x89\xAB\xCD\xEF\xFE\xDC\xBA\x98\x76\x54\x32\x10")")) - }, - - TValueExample{Date(), 0ull, "0u"}, - TValueExample{Date(), 18431ull, "18431u"}, - TValueExample{Date(), 49672ull, "49672u"}, - - TValueExample{Datetime(), 0ull, "0u"}, - TValueExample{Datetime(), 668800588ull, "668800588u"}, - TValueExample{Datetime(), 4291747199ull, "4291747199u"}, - - TValueExample{Timestamp(), 0ull, "0u"}, - TValueExample{Timestamp(), 2508452463052426ull, "2508452463052426u"}, - TValueExample{Timestamp(), 4291747199999999ull, "4291747199999999u"}, - - TValueExample{Interval(), 0, "0"}, - TValueExample{Timestamp(), 2208610308646589ll, "2208610308646589"}, - TValueExample{Timestamp(), 1187314596653899ll, "1187314596653899"}, - TValueExample{Timestamp(), 4291747199999999ll, "4291747199999999"}, - TValueExample{Timestamp(), -4291747199999999ll, "-4291747199999999"}, - - TValueExample{Yson(), "qux", R"("qux")"}, - - TValueExample{Decimal(3, 2), NDecimal::TDecimal::TextToBinary("3.14", 3, 2), R"("\x80\x00\x01\x3a")"}, - }; - - THashSet<ESimpleLogicalValueType> allValueTypes; - for (const auto value : 
TEnumTraits<ESimpleLogicalValueType>::GetDomainValues()) { - allValueTypes.insert(value); - } - for (const auto& example : valueExamples) { - if (example.LogicalType->GetMetatype() == ELogicalMetatype::Simple) { - allValueTypes.erase(example.LogicalType->AsSimpleTypeRef().GetElement()); - } - } - if (!allValueTypes.empty()) { - THROW_ERROR_EXCEPTION("PrimitiveTypeExample variable doesn't contain values: %v", - allValueTypes); - } - return valueExamples; -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NTableClient diff --git a/yt/yt/client/unittests/value_examples.h b/yt/yt/client/unittests/value_examples.h deleted file mode 100644 index 06644e2cd6..0000000000 --- a/yt/yt/client/unittests/value_examples.h +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include <yt/yt/library/named_value/named_value.h> - -#include <yt/yt/client/table_client/logical_type.h> - -namespace NYT::NTableClient { - -//////////////////////////////////////////////////////////////////////////////// - -struct TValueExample -{ - TLogicalTypePtr LogicalType; - NNamedValue::TNamedValue::TValue Value; - TString PrettyYson; - - TValueExample(TLogicalTypePtr logicalType, NNamedValue::TNamedValue::TValue value, TString prettyYson); -}; - -std::vector<TValueExample> GetPrimitiveValueExamples(); - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NTableClient diff --git a/yt/yt/client/unittests/web_json_writer_ut.cpp b/yt/yt/client/unittests/web_json_writer_ut.cpp deleted file mode 100644 index a440002a8b..0000000000 --- a/yt/yt/client/unittests/web_json_writer_ut.cpp +++ /dev/null @@ -1,1570 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/client/formats/web_json_writer.h> - -#include <yt/yt/client/table_client/logical_type.h> -#include <yt/yt/client/table_client/name_table.h> -#include <yt/yt/client/table_client/schema.h> - -#include 
<yt/yt/core/concurrency/async_stream.h> - -#include <yt/yt/core/json/json_parser.h> - -#include <yt/yt/core/ytree/fluent.h> - -#include <yt/yt/library/named_value/named_value.h> - -#include <limits> - -namespace NYT::NFormats { -namespace { - -//////////////////////////////////////////////////////////////////////////////// - -using namespace NYTree; -using namespace NYson; -using namespace NConcurrency; -using namespace NTableClient; - -using NNamedValue::MakeRow; - -INodePtr ParseJsonToNode(TStringBuf string) -{ - TBuildingYsonConsumerViaTreeBuilder<INodePtr> builder(EYsonType::Node); - TMemoryInput stream(string); - - // For plain (raw) JSON parsing we need to switch off - // "smart" attribute analysis and UTF-8 decoding. - auto config = New<NJson::TJsonFormatConfig>(); - config->EncodeUtf8 = false; - config->Plain = true; - - NJson::ParseJson(&stream, &builder, std::move(config)); - return builder.Finish(); -} - -class TWriterForWebJson - : public ::testing::Test -{ -protected: - TNameTablePtr NameTable_ = New<TNameTable>(); - TWebJsonFormatConfigPtr Config_ = New<TWebJsonFormatConfig>(); - TStringStream OutputStream_; - ISchemalessFormatWriterPtr Writer_; - - void CreateStandardWriter(const std::vector<TTableSchemaPtr>& schemas = {New<TTableSchema>()}) - { - Writer_ = CreateWriterForWebJson( - Config_, - NameTable_, - schemas, - CreateAsyncAdapter(static_cast<IOutputStream*>(&OutputStream_))); - } -}; - -TEST_F(TWriterForWebJson, Simple) -{ - Config_->MaxAllColumnNamesCount = 2; - - CreateStandardWriter(); - - bool written = Writer_->Write({ - MakeRow(NameTable_, { - {"column_a", 100500u}, - {"column_b", true}, - {"column_c", "row1_c"}, - {RowIndexColumnName, 0}, - }).Get(), - MakeRow(NameTable_, { - {"column_c", "row2_c"}, - {"column_b", "row2_b"}, - {RowIndexColumnName, 1}, - }).Get(), - }); - EXPECT_TRUE(written); - WaitFor(Writer_->Close()) - .ThrowOnError(); - - TString expectedOutput = - "{" - "\"rows\":[" - "{" - "\"column_a\":{" - 
"\"$type\":\"uint64\"," - "\"$value\":\"100500\"" - "}," - "\"column_b\":{" - "\"$type\":\"boolean\"," - "\"$value\":\"true\"" - "}," - "\"column_c\":{" - "\"$type\":\"string\"," - "\"$value\":\"row1_c\"" - "}" - "}," - "{" - "\"column_c\":{" - "\"$type\":\"string\"," - "\"$value\":\"row2_c\"" - "}," - "\"column_b\":{" - "\"$type\":\"string\"," - "\"$value\":\"row2_b\"" - "}" - "}" - "]," - "\"incomplete_columns\":\"false\"," - "\"incomplete_all_column_names\":\"true\"," - "\"all_column_names\":[" - "\"column_a\"," - "\"column_b\"" - "]" - "}"; - - EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -TEST_F(TWriterForWebJson, SliceColumnsByMaxCount) -{ - Config_->MaxSelectedColumnCount = 2; - - CreateStandardWriter(); - bool written = Writer_->Write({ - MakeRow(NameTable_, { - {"column_a", "row1_a"}, - {"column_b", "row1_b"}, - {"column_c", "row1_c"}, - }).Get(), - MakeRow(NameTable_, { - {"column_c", "row2_c"}, - {"column_b", "row2_b"}, - }).Get(), - MakeRow(NameTable_, { - {"column_c", "row3_c"}, - }).Get(), - }); - EXPECT_TRUE(written); - Writer_->Close(); - - TString expectedOutput = - "{" - "\"rows\":[" - "{" - "\"column_a\":{" - "\"$type\":\"string\"," - "\"$value\":\"row1_a\"" - "}," - "\"column_b\":{" - "\"$type\":\"string\"," - "\"$value\":\"row1_b\"" - "}" - "}," - "{" - "\"column_b\":{" - "\"$type\":\"string\"," - "\"$value\":\"row2_b\"" - "}" - "}," - "{" - "}" - "]," - "\"incomplete_columns\":\"true\"," - "\"incomplete_all_column_names\":\"false\"," - "\"all_column_names\":[" - "\"column_a\"," - "\"column_b\"," - "\"column_c\"" - "]" - "}"; - - EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -TEST_F(TWriterForWebJson, SliceStrings) -{ - Config_->FieldWeightLimit = 6; - - CreateStandardWriter(); - - bool written = Writer_->Write({ - MakeRow(NameTable_, { - {"column_b", "row1_b"}, - {"column_c", "rooooow1_c"}, - 
{"column_a", "row1_a"}, - }).Get(), - MakeRow(NameTable_, { - {"column_c", "row2_c"}, - {"column_b", "rooow2_b"}, - }).Get(), - MakeRow(NameTable_, { - {"column_c", "row3_c"}, - }).Get(), - }); - EXPECT_TRUE(written); - Writer_->Close(); - - TString expectedOutput = - "{" - "\"rows\":[" - "{" - "\"column_b\":{" - "\"$type\":\"string\"," - "\"$value\":\"row1_b\"" - "}," - "\"column_c\":{" - "\"$incomplete\":true," - "\"$type\":\"string\"," - "\"$value\":\"rooooo\"" - "}," - "\"column_a\":{" - "\"$type\":\"string\"," - "\"$value\":\"row1_a\"" - "}" - "}," - "{" - "\"column_c\":{" - "\"$type\":\"string\"," - "\"$value\":\"row2_c\"" - "}," - "\"column_b\":{" - "\"$incomplete\":true," - "\"$type\":\"string\"," - "\"$value\":\"rooow2\"" - "}" - "}," - "{" - "\"column_c\":{" - "\"$type\":\"string\"," - "\"$value\":\"row3_c\"" - "}" - "}" - "]," - "\"incomplete_columns\":\"false\"," - "\"incomplete_all_column_names\":\"false\"," - "\"all_column_names\":[" - "\"column_a\"," - "\"column_b\"," - "\"column_c\"" - "]" - "}"; - - EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -TEST_F(TWriterForWebJson, ReplaceAnyWithNull) -{ - Config_->FieldWeightLimit = 8; - - CreateStandardWriter(); - - bool written = Writer_->Write({ - MakeRow(NameTable_, { - {"column_b", EValueType::Any, "{key=a}"}, - {"column_c", "row1_c"}, - {"column_a", "row1_a"}, - }).Get(), - MakeRow(NameTable_, { - {"column_c", EValueType::Any, "{key=aaaaaa}"}, - {"column_b", "row2_b"}, - }).Get(), - MakeRow(NameTable_, { - {"column_c", "row3_c"}, - }).Get(), - }); - EXPECT_TRUE(written); - WaitFor(Writer_->Close()) - .ThrowOnError(); - - TString expectedOutput = - "{" - "\"rows\":[" - "{" - "\"column_b\":{" - "\"key\":{" - "\"$type\":\"string\"," - "\"$value\":\"a\"" - "}" - "}," - "\"column_c\":{" - "\"$type\":\"string\"," - "\"$value\":\"row1_c\"" - "}," - "\"column_a\":{" - "\"$type\":\"string\"," - "\"$value\":\"row1_a\"" - "}" - "}," - "{" 
- "\"column_c\":{" - "\"$incomplete\":true," - "\"$type\":\"any\"," - "\"$value\":\"\"" - "}," - "\"column_b\":{" - "\"$type\":\"string\"," - "\"$value\":\"row2_b\"" - "}" - "}," - "{" - "\"column_c\":{" - "\"$type\":\"string\"," - "\"$value\":\"row3_c\"" - "}" - "}" - "]," - "\"incomplete_columns\":\"false\"," - "\"incomplete_all_column_names\":\"false\"," - "\"all_column_names\":[" - "\"column_a\"," - "\"column_b\"," - "\"column_c\"" - "]" - "}"; - - EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -TEST_F(TWriterForWebJson, SkipSystemColumns) -{ - Config_->SkipSystemColumns = false; - - CreateStandardWriter(); - - bool written = Writer_->Write({ - MakeRow(NameTable_, { - {TableIndexColumnName, 0}, - {RowIndexColumnName, 1}, - {TabletIndexColumnName, 2}, - }).Get(), - }); - EXPECT_TRUE(written); - WaitFor(Writer_->Close()) - .ThrowOnError(); - - TString expectedOutput = - "{" - "\"rows\":[" - "{" - "\"$$table_index\":{" - "\"$type\":\"int64\"," - "\"$value\":\"0\"" - "}," - "\"$$row_index\":{" - "\"$type\":\"int64\"," - "\"$value\":\"1\"" - "}," - "\"$$tablet_index\":{" - "\"$type\":\"int64\"," - "\"$value\":\"2\"" - "}" - "}" - "]," - "\"incomplete_columns\":\"false\"," - "\"incomplete_all_column_names\":\"false\"," - "\"all_column_names\":[" - "\"$row_index\"," - "\"$table_index\"," - "\"$tablet_index\"" - "]" - "}"; - - EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -TEST_F(TWriterForWebJson, SkipUnregisteredColumns) -{ - CreateStandardWriter(); - - TUnversionedRowBuilder row; - int keyDId = -1; - row.AddValue(MakeUnversionedBooleanValue(true, keyDId)); - std::vector<TUnversionedRow> rows = {row.GetRow()}; - - EXPECT_EQ(true, Writer_->Write(rows)); - - keyDId = NameTable_->RegisterName("column_d"); - - rows.clear(); - row.Reset(); - row.AddValue(MakeUnversionedBooleanValue(true, keyDId)); - 
rows.push_back(row.GetRow()); - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close(); - - TString expectedOutput = - "{" - "\"rows\":[" - "{" - "}," - "{" - "\"column_d\":{" - "\"$type\":\"boolean\"," - "\"$value\":\"true\"" - "}" - "}" - "]," - "\"incomplete_columns\":\"false\"," - "\"incomplete_all_column_names\":\"false\"," - "\"all_column_names\":[" - "\"column_d\"" - "]" - "}"; - - EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -TEST_F(TWriterForWebJson, SliceColumnsByName) -{ - Config_->ColumnNames = { - "column_b", - "column_c", - "$tablet_index"}; - Config_->MaxSelectedColumnCount = 2; - Config_->SkipSystemColumns = false; - - CreateStandardWriter(); - - bool written = Writer_->Write({ - MakeRow(NameTable_, { - {"column_a", 100500u}, - {"column_b", 0.42}, - {"column_c", "abracadabra"}, - {TabletIndexColumnName, 10}, - }).Get(), - }); - EXPECT_TRUE(written); - WaitFor(Writer_->Close()) - .ThrowOnError(); - auto result = ParseJsonToNode(OutputStream_.Str()); - - TString expectedOutput = - "{" - "\"rows\":[" - "{" - "\"column_b\":{" - "\"$type\":\"double\"," - "\"$value\":\"0.42\"" - "}," - "\"column_c\":{" - "\"$type\":\"string\"," - "\"$value\":\"abracadabra\"" - "}," - "\"$$tablet_index\":{" - "\"$type\":\"int64\"," - "\"$value\":\"10\"" - "}" - "}" - "]," - "\"incomplete_columns\":\"true\"," - "\"incomplete_all_column_names\":\"false\"," - "\"all_column_names\":[" - "\"$tablet_index\"," - "\"column_a\"," - "\"column_b\"," - "\"column_c\"" - "]" - "}"; - - EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); - EXPECT_EQ(expectedOutput, OutputStream_.Str()); -} - -template <typename TValue> -void CheckYqlValue( - const INodePtr& valueNode, - const TValue& expectedValue) -{ - using TDecayedValue = std::decay_t<TValue>; - if constexpr (std::is_convertible_v<TDecayedValue, TString>) { - ASSERT_EQ(valueNode->GetType(), ENodeType::String); - 
EXPECT_EQ(valueNode->GetValue<TString>(), expectedValue); - } else if constexpr (std::is_same_v<TDecayedValue, double>) { - ASSERT_EQ(valueNode->GetType(), ENodeType::String); - EXPECT_FLOAT_EQ(FromString<double>(valueNode->GetValue<TString>()), expectedValue); - } else if constexpr (std::is_same_v<TDecayedValue, bool>) { - ASSERT_EQ(valueNode->GetType(), ENodeType::Boolean); - EXPECT_EQ(valueNode->GetValue<bool>(), expectedValue); - } else if constexpr (std::is_same_v<TDecayedValue, INodePtr>) { - EXPECT_TRUE(AreNodesEqual(valueNode, expectedValue)) - << "actualValueNode is " << ConvertToYsonString(valueNode, EYsonFormat::Pretty).AsStringBuf() - << "\nexpectedValue is " << ConvertToYsonString(expectedValue, EYsonFormat::Pretty).AsStringBuf(); - } else { - static_assert(TDependentFalse<TDecayedValue>, "Type not allowed"); - } -} - -template <typename TType> -void CheckYqlType( - const INodePtr& typeNode, - const TType& expectedType, - const std::vector<INodePtr>& yqlTypes) -{ - ASSERT_EQ(typeNode->GetType(), ENodeType::String); - auto typeIndexString = typeNode->GetValue<TString>(); - auto typeIndex = FromString<int>(typeIndexString); - ASSERT_LT(typeIndex, static_cast<int>(yqlTypes.size())); - ASSERT_GE(typeIndex, 0); - const auto& yqlType = yqlTypes[typeIndex]; - EXPECT_EQ(yqlType->GetType(), ENodeType::List); - - auto expectedTypeNode = [&] () -> INodePtr { - using TDecayedType = std::decay_t<TType>; - if constexpr (std::is_convertible_v<TDecayedType, TString>) { - return ConvertToNode(TYsonString(TString(expectedType))); - } else if constexpr (std::is_same_v<TDecayedType, INodePtr>) { - return expectedType; - } else { - static_assert(TDependentFalse<TDecayedType>, "Type not allowed"); - } - }(); - EXPECT_TRUE(AreNodesEqual(yqlType, expectedTypeNode)) - << "yqlType is " << ConvertToYsonString(yqlType, EYsonFormat::Pretty).AsStringBuf() - << "\nexpectedTypeNode is " << ConvertToYsonString(expectedTypeNode, EYsonFormat::Pretty).AsStringBuf(); -} - -template 
<typename TValue, typename TType> -void CheckYqlTypeAndValue( - const INodePtr& row, - TStringBuf name, - const TType& expectedType, - const TValue& expectedValue, - const std::vector<INodePtr>& yqlTypes) -{ - ASSERT_EQ(row->GetType(), ENodeType::Map); - auto entry = row->AsMap()->FindChild(TString(name)); - ASSERT_TRUE(entry); - ASSERT_EQ(entry->GetType(), ENodeType::List); - ASSERT_EQ(entry->AsList()->GetChildCount(), 2); - auto valueNode = entry->AsList()->GetChildOrThrow(0); - CheckYqlValue(valueNode, expectedValue); - auto typeNode = entry->AsList()->GetChildOrThrow(1); - CheckYqlType(typeNode, expectedType, yqlTypes); -} - -#define CHECK_YQL_TYPE_AND_VALUE(row, name, expectedType, expectedValue, yqlTypes) \ - do { \ - SCOPED_TRACE(name); \ - CheckYqlTypeAndValue(row, name, expectedType, expectedValue, yqlTypes); \ - } while (0) - -TEST_F(TWriterForWebJson, YqlValueFormat_SimpleTypes) -{ - Config_->MaxAllColumnNamesCount = 2; - Config_->ValueFormat = EWebJsonValueFormat::Yql; - - // We will emulate writing rows from two tables. 
- CreateStandardWriter(std::vector{New<TTableSchema>(), New<TTableSchema>()}); - - { - bool written = Writer_->Write({ - MakeRow(NameTable_, { - {"column_a", 100500u}, - {"column_b", true}, - {"column_c", "row1_c"}, - {RowIndexColumnName, 0}, - {TableIndexColumnName, 0}, - }).Get(), - MakeRow(NameTable_, { - {"column_c", "row2_c"}, - {"column_b", "row2_b"}, - {RowIndexColumnName, 1}, - {TableIndexColumnName, 0}, - }).Get(), - MakeRow(NameTable_, { - {"column_a", -100500}, - {"column_b", EValueType::Any, "{x=2;y=3}"}, - {"column_c", 2.71828}, - {RowIndexColumnName, 1}, - }).Get(), - }); - EXPECT_TRUE(written); - Writer_->Close().Get().ThrowOnError(); - } - - auto result = ParseJsonToNode(OutputStream_.Str()); - ASSERT_EQ(result->GetType(), ENodeType::Map); - - auto rows = result->AsMap()->FindChild("rows"); - ASSERT_TRUE(rows); - auto incompleteColumns = result->AsMap()->FindChild("incomplete_columns"); - ASSERT_TRUE(incompleteColumns); - auto incompleteAllColumnNames = result->AsMap()->FindChild("incomplete_all_column_names"); - ASSERT_TRUE(incompleteAllColumnNames); - auto allColumnNames = result->AsMap()->FindChild("all_column_names"); - ASSERT_TRUE(allColumnNames); - auto yqlTypeRegistry = result->AsMap()->FindChild("yql_type_registry"); - ASSERT_TRUE(yqlTypeRegistry); - - ASSERT_EQ(incompleteColumns->GetType(), ENodeType::String); - EXPECT_EQ(incompleteColumns->GetValue<TString>(), "false"); - - ASSERT_EQ(incompleteAllColumnNames->GetType(), ENodeType::String); - EXPECT_EQ(incompleteAllColumnNames->GetValue<TString>(), "true"); - - ASSERT_EQ(allColumnNames->GetType(), ENodeType::List); - std::vector<TString> allColumnNamesVector; - ASSERT_NO_THROW(allColumnNamesVector = ConvertTo<decltype(allColumnNamesVector)>(allColumnNames)); - EXPECT_EQ(allColumnNamesVector, (std::vector<TString>{"column_a", "column_b"})); - - ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); - auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); - - 
ASSERT_EQ(rows->GetType(), ENodeType::List); - ASSERT_EQ(rows->AsList()->GetChildCount(), 3); - - auto row1 = rows->AsList()->GetChildOrThrow(0); - auto row2 = rows->AsList()->GetChildOrThrow(1); - auto row3 = rows->AsList()->GetChildOrThrow(2); - - ASSERT_EQ(row1->GetType(), ENodeType::Map); - EXPECT_EQ(row1->AsMap()->GetChildCount(), 3); - CHECK_YQL_TYPE_AND_VALUE(row1, "column_a", R"(["DataType"; "Uint64"])", "100500", yqlTypes); - CHECK_YQL_TYPE_AND_VALUE(row1, "column_b", R"(["DataType"; "Boolean"])", true, yqlTypes); - CHECK_YQL_TYPE_AND_VALUE(row1, "column_c", R"(["DataType"; "String"])", "row1_c", yqlTypes); - - ASSERT_EQ(row2->GetType(), ENodeType::Map); - EXPECT_EQ(row2->AsMap()->GetChildCount(), 2); - CHECK_YQL_TYPE_AND_VALUE(row2, "column_b", R"(["DataType"; "String"])", "row2_b", yqlTypes); - CHECK_YQL_TYPE_AND_VALUE(row2, "column_c", R"(["DataType"; "String"])", "row2_c", yqlTypes); - - ASSERT_EQ(row3->GetType(), ENodeType::Map); - EXPECT_EQ(row3->AsMap()->GetChildCount(), 3); - CHECK_YQL_TYPE_AND_VALUE(row3, "column_a", R"(["DataType"; "Int64"])", "-100500", yqlTypes); - auto row3BValue = ConvertToNode(TYsonString(TStringBuf(R"({ - val = { - x = { - "$type" = "int64"; - "$value" = "2"; - }; - y = { - "$type" = "int64"; - "$value" = "3"; - } - } - })"))); - CHECK_YQL_TYPE_AND_VALUE(row3, "column_b", R"(["DataType"; "Yson"])", row3BValue, yqlTypes); - CHECK_YQL_TYPE_AND_VALUE(row3, "column_c", R"(["DataType"; "Double"])", 2.71828, yqlTypes); -} - -TEST_F(TWriterForWebJson, ColumnNameEncoding) -{ - Config_->MaxAllColumnNamesCount = 2; - Config_->ValueFormat = EWebJsonValueFormat::Yql; - - CreateStandardWriter(); - - { - bool written = Writer_->Write({ - MakeRow(NameTable_, { - {"column_a", 100500u}, - {"column_non_ascii_\xd0\x81", -100500}, - }).Get() - }); - EXPECT_TRUE(written); - Writer_->Close().Get().ThrowOnError(); - } - - auto result = ParseJsonToNode(OutputStream_.Str()); - ASSERT_EQ(result->GetType(), ENodeType::Map); - - auto rows = 
result->AsMap()->FindChild("rows"); - ASSERT_TRUE(rows); - auto incompleteColumns = result->AsMap()->FindChild("incomplete_columns"); - ASSERT_TRUE(incompleteColumns); - auto incompleteAllColumnNames = result->AsMap()->FindChild("incomplete_all_column_names"); - ASSERT_TRUE(incompleteAllColumnNames); - auto allColumnNames = result->AsMap()->FindChild("all_column_names"); - ASSERT_TRUE(allColumnNames); - auto yqlTypeRegistry = result->AsMap()->FindChild("yql_type_registry"); - ASSERT_TRUE(yqlTypeRegistry); - - ASSERT_EQ(allColumnNames->GetType(), ENodeType::List); - std::vector<TString> allColumnNamesVector; - ASSERT_NO_THROW(allColumnNamesVector = ConvertTo<decltype(allColumnNamesVector)>(allColumnNames)); - EXPECT_EQ(allColumnNamesVector, (std::vector<TString>{"column_a", "column_non_ascii_\xc3\x90\xc2\x81"})); - - ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); - auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); - - ASSERT_EQ(rows->GetType(), ENodeType::List); - ASSERT_EQ(rows->AsList()->GetChildCount(), 1); - - auto row1 = rows->AsList()->GetChildOrThrow(0); - - ASSERT_EQ(row1->GetType(), ENodeType::Map); - EXPECT_EQ(row1->AsMap()->GetChildCount(), 2); - CHECK_YQL_TYPE_AND_VALUE(row1, "column_a", R"(["DataType"; "Uint64"])", "100500", yqlTypes); - CHECK_YQL_TYPE_AND_VALUE(row1, "column_non_ascii_\xc3\x90\xc2\x81", R"(["DataType"; "Int64"])", "-100500", yqlTypes); -} - -TEST_F(TWriterForWebJson, YqlValueFormat_ComplexTypes) -{ - Config_->ValueFormat = EWebJsonValueFormat::Yql; - - auto firstSchema = New<TTableSchema>(std::vector<TColumnSchema>{ - {"column_a", OptionalLogicalType( - ListLogicalType(MakeLogicalType(ESimpleLogicalValueType::Int64, true)))}, - {"column_b", StructLogicalType({ - {"key", MakeLogicalType(ESimpleLogicalValueType::String, true)}, - {"value", MakeLogicalType(ESimpleLogicalValueType::String, true)}, - {"variant_tuple", VariantTupleLogicalType({ - MakeLogicalType(ESimpleLogicalValueType::Int8, true), - 
MakeLogicalType(ESimpleLogicalValueType::Boolean, false), - })}, - {"variant_struct", VariantStructLogicalType({ - {"a", MakeLogicalType(ESimpleLogicalValueType::Int8, true)}, - {"b", MakeLogicalType(ESimpleLogicalValueType::Boolean, false)}, - })}, - {"dict", DictLogicalType( - SimpleLogicalType(ESimpleLogicalValueType::Int64), - SimpleLogicalType(ESimpleLogicalValueType::String) - )}, - {"tagged", TaggedLogicalType( - "MyTag", - SimpleLogicalType(ESimpleLogicalValueType::Int64) - )}, - {"timestamp", SimpleLogicalType(ESimpleLogicalValueType::Timestamp)}, - {"date", SimpleLogicalType(ESimpleLogicalValueType::Date)}, - {"datetime", SimpleLogicalType(ESimpleLogicalValueType::Datetime)}, - {"interval", SimpleLogicalType(ESimpleLogicalValueType::Interval)}, - {"json", SimpleLogicalType(ESimpleLogicalValueType::Json)}, - {"float", SimpleLogicalType(ESimpleLogicalValueType::Float)}, - })}, - {"column_c", ListLogicalType(StructLogicalType({ - {"very_optional_key", OptionalLogicalType(MakeLogicalType(ESimpleLogicalValueType::String, false))}, - {"optional_value", MakeLogicalType(ESimpleLogicalValueType::String, false)}, - }))}, - }); - - auto secondSchema = New<TTableSchema>(std::vector<TColumnSchema>{ - {"column_a", VariantTupleLogicalType({ - SimpleLogicalType(ESimpleLogicalValueType::Null), - SimpleLogicalType(ESimpleLogicalValueType::Any), - })}, - {"column_b", SimpleLogicalType(ESimpleLogicalValueType::Null)}, - {"column_c", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Null))}, - {"column_d", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, - }); - - auto firstColumnAType = ConvertToNode(TYsonString(TStringBuf(R"([ - "OptionalType"; - [ - "ListType"; - ["DataType"; "Int64"] - ] - ])"))); - auto firstColumnBType = ConvertToNode(TYsonString(TStringBuf(R"([ - "StructType"; - [ - [ - "key"; - ["DataType"; "String"] - ]; - [ - "value"; - ["DataType"; "String"] - ]; - [ - "variant_tuple"; - [ - "VariantType"; - [ - "TupleType"; - [ 
- ["DataType"; "Int8"]; - [ - "OptionalType"; - ["DataType"; "Boolean"] - ] - ] - ] - ] - ]; - [ - "variant_struct"; - [ - "VariantType"; - [ - "StructType"; - [ - [ - "a"; - ["DataType"; "Int8"] - ]; - [ - "b"; - [ - "OptionalType"; - ["DataType"; "Boolean"] - ] - ] - ] - ] - ] - ]; - [ - "dict"; - [ - "DictType"; - ["DataType"; "Int64"]; - ["DataType"; "String"] - ] - ]; - [ - "tagged"; - [ - "TaggedType"; - "MyTag"; - ["DataType"; "Int64"] - ] - ]; - [ - "timestamp"; - ["DataType"; "Timestamp"] - ]; - [ - "date"; - ["DataType"; "Date"] - ]; - [ - "datetime"; - ["DataType"; "Datetime"] - ]; - [ - "interval"; - ["DataType"; "Interval"] - ]; - [ - "json"; - ["DataType"; "Json"] - ]; - [ - "float"; - ["DataType"; "Float"] - ]; - ] - ])"))); - auto firstColumnCType = ConvertToNode(TYsonString(TStringBuf(R"([ - "ListType"; - [ - "StructType"; - [ - [ - "very_optional_key"; - [ - "OptionalType"; - [ - "OptionalType"; - ["DataType"; "String"] - ] - ] - ]; - [ - "optional_value"; - [ - "OptionalType"; - ["DataType"; "String"] - ] - ] - ] - ] - ])"))); - auto secondColumnAType = ConvertToNode(TYsonString(TStringBuf(R"([ - "VariantType"; - [ - "TupleType"; - [ - ["NullType"]; - ["DataType"; "Yson"]; - ] - ] - ])"))); - auto secondColumnBType = ConvertToNode(TYsonString(TStringBuf(R"(["NullType"])"))); - auto secondColumnCType = ConvertToNode(TYsonString(TStringBuf(R"([ - "OptionalType"; - [ - "NullType"; - ] - ])"))); - auto secondColumnDType = ConvertToNode(TYsonString(TStringBuf(R"([ - "OptionalType"; - ["DataType"; "Int64"] - ])"))); - - CreateStandardWriter(std::vector{firstSchema, secondSchema}); - { - bool written = Writer_->Write({ - MakeRow(NameTable_, { - {"column_a", EValueType::Composite, R"([-1; -2; -5])"}, - { - "column_b", - EValueType::Composite, - R"([ - "key"; - "value"; - [0; 7]; - [1; #]; - [[1; "a"]; [2; "b"]]; - 99; - 100u; - 101u; - 102u; - 103; - "[\"a\", {\"b\": 42}]"; - -3.25; - ])", - }, - {"column_c", EValueType::Composite, R"([[[#]; "value"]; 
[["key"]; #]])"}, - {"column_d", -49}, - {TableIndexColumnName, 0}, - {RowIndexColumnName, 0}, - }).Get(), - MakeRow(NameTable_, { - {"column_a", EValueType::Composite, R"([0; -2; -5; 177])"}, - { - "column_b", - EValueType::Composite, - R"([ - "key1"; - "value1"; - [1; %false]; - [1; #]; - []; - 199; - 0u; - 1101u; - 1102u; - 1103; - "null"; - 0.0; - ])", - }, - {"column_c", EValueType::Composite, R"([[#; #]; [["key1"]; #]])"}, - {"column_d", 49u}, - {RowIndexColumnName, 1}, - }).Get(), - MakeRow(NameTable_, { - {"column_a", EValueType::Composite, "[]"}, - { - "column_b", - EValueType::Composite, - R"([ - "key2"; - "value2"; - [0; 127]; - [1; %true]; - [[0; ""]]; - 399; - 30u; - 3101u; - 3202u; - 3103; - "{\"x\": false}"; - 1e10; - ])" - }, - {"column_c", EValueType::Composite, "[[[key]; #]]"}, - {"column_d", "49"}, - {RowIndexColumnName, 2}, - }).Get(), - - MakeRow(NameTable_, { - {"column_a", nullptr}, - { - "column_b", - EValueType::Composite, - // First string is valid UTF-8, the second one should be Base64 encoded. - "[" - "\"\xC3\xBF\";" - "\"\xFA\xFB\xFC\xFD\";" - R"( - [0; 127]; - [1; %true]; - [[-1; "-1"]; [0; ""]]; - 499; - 40u; - 4101u; - 4202u; - 4103; - "{}"; - -2.125; - ])", - }, - {"column_c", EValueType::Composite, "[]"}, - {"column_d", EValueType::Any, "{x=49}"}, - {RowIndexColumnName, 3}, - }).Get(), - - // Here come rows from the second table. 
- MakeRow(NameTable_, { - {"column_a", EValueType::Composite, "[0; #]"}, - {"column_b", nullptr}, - {"column_c", nullptr}, - {"column_d", -49}, - {TableIndexColumnName, 1}, - {RowIndexColumnName, 0}, - }).Get(), - - MakeRow(NameTable_, { - {"column_a", EValueType::Composite, "[1; {z=z}]"}, - {"column_b", nullptr}, - {"column_c", EValueType::Composite, "[#]"}, - {"column_d", nullptr}, - {TableIndexColumnName, 1}, - {RowIndexColumnName, 1}, - }).Get(), - }); - EXPECT_TRUE(written); - Writer_->Close().Get().ThrowOnError(); - } - - auto result = ParseJsonToNode(OutputStream_.Str()); - ASSERT_EQ(result->GetType(), ENodeType::Map); - - auto rows = result->AsMap()->FindChild("rows"); - ASSERT_TRUE(rows); - auto incompleteColumns = result->AsMap()->FindChild("incomplete_columns"); - ASSERT_TRUE(incompleteColumns); - auto incompleteAllColumnNames = result->AsMap()->FindChild("incomplete_all_column_names"); - ASSERT_TRUE(incompleteAllColumnNames); - auto allColumnNames = result->AsMap()->FindChild("all_column_names"); - ASSERT_TRUE(allColumnNames); - auto yqlTypeRegistry = result->AsMap()->FindChild("yql_type_registry"); - ASSERT_TRUE(yqlTypeRegistry); - - ASSERT_EQ(incompleteColumns->GetType(), ENodeType::String); - EXPECT_EQ(incompleteColumns->GetValue<TString>(), "false"); - - ASSERT_EQ(incompleteAllColumnNames->GetType(), ENodeType::String); - EXPECT_EQ(incompleteAllColumnNames->GetValue<TString>(), "false"); - - ASSERT_EQ(allColumnNames->GetType(), ENodeType::List); - std::vector<TString> allColumnNamesVector; - ASSERT_NO_THROW(allColumnNamesVector = ConvertTo<decltype(allColumnNamesVector)>(allColumnNames)); - EXPECT_EQ(allColumnNamesVector, (std::vector<TString>{"column_a", "column_b", "column_c", "column_d"})); - - ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); - auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); - - ASSERT_EQ(rows->GetType(), ENodeType::List); - ASSERT_EQ(rows->AsList()->GetChildCount(), 6); - - auto row1 = 
rows->AsList()->GetChildOrThrow(0); - auto row2 = rows->AsList()->GetChildOrThrow(1); - auto row3 = rows->AsList()->GetChildOrThrow(2); - auto row4 = rows->AsList()->GetChildOrThrow(3); - auto row5 = rows->AsList()->GetChildOrThrow(4); - auto row6 = rows->AsList()->GetChildOrThrow(5); - - ASSERT_EQ(row1->GetType(), ENodeType::Map); - EXPECT_EQ(row1->AsMap()->GetChildCount(), 4); - auto row1AValue = ConvertToNode(TYsonString(TStringBuf(R"([{"val"=["-1"; "-2"; "-5"]}])"))); - CHECK_YQL_TYPE_AND_VALUE(row1, "column_a", firstColumnAType, row1AValue, yqlTypes); - auto row1BValue = ConvertToNode(TYsonString(TStringBuf( - R"([ - "key"; - "value"; - ["0"; "7"]; - ["1"; #]; - {"val"=[["1"; "a"]; ["2"; "b"]]}; - "99"; - "100"; - "101"; - "102"; - "103"; - "[\"a\", {\"b\": 42}]"; - "-3.25"; - ])"))); - CHECK_YQL_TYPE_AND_VALUE(row1, "column_b", firstColumnBType, row1BValue, yqlTypes); - auto row1CValue = ConvertToNode(TYsonString(TStringBuf(R"({ - "val"=[ - [[#]; ["value"]]; - [[["key"]]; #] - ] - })"))); - CHECK_YQL_TYPE_AND_VALUE(row1, "column_c", firstColumnCType, row1CValue, yqlTypes); - CHECK_YQL_TYPE_AND_VALUE(row1, "column_d", R"(["DataType"; "Int64"])", "-49", yqlTypes); - - ASSERT_EQ(row2->GetType(), ENodeType::Map); - EXPECT_EQ(row2->AsMap()->GetChildCount(), 4); - auto row2AValue = ConvertToNode(TYsonString(TStringBuf(R"([{"val"=["0"; "-2"; "-5"; "177"]}])"))); - CHECK_YQL_TYPE_AND_VALUE(row2, "column_a", firstColumnAType, row2AValue, yqlTypes); - auto row2BValue = ConvertToNode(TYsonString(TStringBuf( - R"([ - "key1"; - "value1"; - ["1"; [%false]]; - ["1"; #]; - {"val"=[]}; - "199"; - "0"; - "1101"; - "1102"; - "1103"; - "null"; - "0"; - ])"))); - CHECK_YQL_TYPE_AND_VALUE(row2, "column_b", firstColumnBType, row2BValue, yqlTypes); - auto row2CValue = ConvertToNode(TYsonString(TStringBuf(R"({ - "val"=[ - [#; #]; - [[["key1"]]; #] - ] - })"))); - CHECK_YQL_TYPE_AND_VALUE(row2, "column_c", firstColumnCType, row2CValue, yqlTypes); - CHECK_YQL_TYPE_AND_VALUE(row2, 
"column_d", R"(["DataType"; "Uint64"])", "49", yqlTypes); - - ASSERT_EQ(row3->GetType(), ENodeType::Map); - EXPECT_EQ(row3->AsMap()->GetChildCount(), 4); - auto row3AValue = ConvertToNode(TYsonString(TStringBuf(R"([{"val"=[]}])"))); - CHECK_YQL_TYPE_AND_VALUE(row3, "column_a", firstColumnAType, row3AValue, yqlTypes); - auto row3BValue = ConvertToNode(TYsonString(TStringBuf( - R"([ - "key2"; - "value2"; - ["0"; "127"]; - ["1"; [%true]]; - {"val"=[["0"; ""]]}; - "399"; - "30"; - "3101"; - "3202"; - "3103"; - "{\"x\": false}"; - "10000000000"; - ])"))); - CHECK_YQL_TYPE_AND_VALUE(row3, "column_b", firstColumnBType, row3BValue, yqlTypes); - auto row3CValue = ConvertToNode(TYsonString(TStringBuf(R"({ - "val"=[ - [[["key"]]; #] - ] - })"))); - CHECK_YQL_TYPE_AND_VALUE(row3, "column_c", firstColumnCType, row3CValue, yqlTypes); - CHECK_YQL_TYPE_AND_VALUE(row3, "column_d", R"(["DataType"; "String"])", "49", yqlTypes); - - ASSERT_EQ(row4->GetType(), ENodeType::Map); - EXPECT_EQ(row4->AsMap()->GetChildCount(), 4); - auto row4AValue = ConvertToNode(TYsonString(TStringBuf(R"(#)"))); - CHECK_YQL_TYPE_AND_VALUE(row4, "column_a", firstColumnAType, row4AValue, yqlTypes); - - auto row4BValue = ConvertToNode(TYsonString(TStringBuf( - "[" - "\"\xC3\xBF\";" - R"( - {"b64" = %true; "val" = "+vv8/Q=="}; - ["0"; "127"]; - ["1"; [%true]]; - {"val"=[["-1"; "-1"]; ["0"; ""]]}; - "499"; - "40"; - "4101"; - "4202"; - "4103"; - "{}"; - "-2.125"; - ])"))); - CHECK_YQL_TYPE_AND_VALUE(row4, "column_b", firstColumnBType, row4BValue, yqlTypes); - - auto row4CValue = ConvertToNode(TYsonString(TStringBuf(R"({"val"=[]})"))); - CHECK_YQL_TYPE_AND_VALUE(row4, "column_c", firstColumnCType, row4CValue, yqlTypes); - auto row4DValue = ConvertToNode(TYsonString(TStringBuf(R"({ - val = { - x = { - "$type" = "int64"; - "$value" = "49"; - } - } - })"))); - CHECK_YQL_TYPE_AND_VALUE(row4, "column_d", R"(["DataType"; "Yson"])", row4DValue, yqlTypes); - - // Here must come rows from the second table. 
- - ASSERT_EQ(row5->GetType(), ENodeType::Map); - EXPECT_EQ(row5->AsMap()->GetChildCount(), 4); - auto row5AValue = ConvertToNode(TYsonString(TStringBuf(R"(["0"; #])"))); - CHECK_YQL_TYPE_AND_VALUE(row5, "column_a", secondColumnAType, row5AValue, yqlTypes); - auto row5BValue = ConvertToNode(TYsonString(TStringBuf(R"(#)"))); - CHECK_YQL_TYPE_AND_VALUE(row5, "column_b", secondColumnBType, row5BValue, yqlTypes); - auto row5CValue = ConvertToNode(TYsonString(TStringBuf(R"(#)"))); - CHECK_YQL_TYPE_AND_VALUE(row5, "column_c", secondColumnCType, row5CValue, yqlTypes); - auto row5DValue = ConvertToNode(TYsonString(TStringBuf(R"(["-49"])"))); - CHECK_YQL_TYPE_AND_VALUE(row5, "column_d", secondColumnDType, row5DValue, yqlTypes); - - ASSERT_EQ(row6->GetType(), ENodeType::Map); - EXPECT_EQ(row6->AsMap()->GetChildCount(), 4); - auto row6AValue = ConvertToNode(TYsonString(TStringBuf(R"([ - "1"; - { - val = { - z = { - "$type" = "string"; - "$value" = "z"; - } - } - }; - ])"))); - CHECK_YQL_TYPE_AND_VALUE(row6, "column_a", secondColumnAType, row6AValue, yqlTypes); - auto row6BValue = ConvertToNode(TYsonString(TStringBuf(R"(#)"))); - CHECK_YQL_TYPE_AND_VALUE(row6, "column_b", secondColumnBType, row6BValue, yqlTypes); - auto row6CValue = ConvertToNode(TYsonString(TStringBuf(R"([#])"))); - CHECK_YQL_TYPE_AND_VALUE(row6, "column_c", secondColumnCType, row6CValue, yqlTypes); - auto row6DValue = ConvertToNode(TYsonString(TStringBuf(R"(#)"))); - CHECK_YQL_TYPE_AND_VALUE(row6, "column_d", secondColumnDType, row6DValue, yqlTypes); -} - -TEST_F(TWriterForWebJson, YqlValueFormat_Incomplete) -{ - Config_->ValueFormat = EWebJsonValueFormat::Yql; - Config_->FieldWeightLimit = 215; - Config_->StringWeightLimit = 10; - - auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ - {"column_a", StructLogicalType({ - {"field1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, - {"list", ListLogicalType( - VariantStructLogicalType({ - {"a", DictLogicalType( - 
SimpleLogicalType(ESimpleLogicalValueType::Int64), - SimpleLogicalType(ESimpleLogicalValueType::String) - )}, - {"b", SimpleLogicalType(ESimpleLogicalValueType::Any)}, - }) - )}, - {"field2", SimpleLogicalType(ESimpleLogicalValueType::String)}, - {"field3", MakeLogicalType(ESimpleLogicalValueType::Int64, false)}, - })}, - {"column_b", SimpleLogicalType(ESimpleLogicalValueType::Any)}, - {"column_c", MakeLogicalType(ESimpleLogicalValueType::String, false)}, - }); - - auto yqlTypeA = ConvertToNode(TYsonString(TStringBuf(R"([ - "StructType"; - [ - [ - "field1"; - ["DataType"; "Int64"] - ]; - [ - "list"; - [ - "ListType"; - [ - "VariantType"; - [ - "StructType"; - [ - [ - "a"; - [ - "DictType"; - ["DataType"; "Int64"]; - ["DataType"; "String"] - ] - ]; - [ - "b"; - ["DataType"; "Yson"] - ]; - ] - ] - ] - ] - ]; - [ - "field2"; - ["DataType"; "String"] - ]; - [ - "field3"; - [ - "OptionalType"; - ["DataType"; "Int64"] - ] - ]; - ] - ])"))); - - auto yqlTypeB = ConvertToNode(TYsonString(TStringBuf(R"(["DataType"; "Yson"])"))); - auto yqlTypeC = ConvertToNode(TYsonString(TStringBuf(R"(["OptionalType"; ["DataType"; "String"]])"))); - { - CreateStandardWriter({schema}); - bool written = Writer_->Write({ - MakeRow(NameTable_, { - { - "column_a", - EValueType::Composite, - R"([ - -1; - [ - [ - 0; - [ - [-2; "UTF:)" + TString("\xF0\x90\x8D\x88") + "\xF0\x90\x8D\x88" + R"("]; - [2; "!UTF:)" + TString("\xFA\xFB\xFC\xFD\xFA\xFB\xFC\xFD") + R"("]; - [0; ""]; - ] - ]; - [ - 1; - "{kinda_long_key = kinda_even_longer_value}" - ]; - [ - 0; - [ - [0; "One more quite long string"]; - [1; "One more quite long string"]; - [2; "One more quite long string"]; - [3; "One more quite long string"]; - [4; "One more quite long string"]; - [5; "One more quite long string"]; - ] - ]; - [ - 1; - "{kinda_long_key = kinda_even_longer_value}" - ]; - ]; - "I'm short"; - 424242238133245 - ])" - }, - {"column_b", EValueType::Any, "{kinda_long_key = kinda_even_longer_value}"}, - {"column_c", "One more quite 
long string"}, - }).Get(), - }); - EXPECT_TRUE(written); - Writer_->Close().Get().ThrowOnError(); - } - - auto result = ParseJsonToNode(OutputStream_.Str()); - ASSERT_EQ(result->GetType(), ENodeType::Map); - - auto rows = result->AsMap()->FindChild("rows"); - ASSERT_TRUE(rows); - auto yqlTypeRegistry = result->AsMap()->FindChild("yql_type_registry"); - ASSERT_TRUE(yqlTypeRegistry); - - ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); - auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); - - ASSERT_EQ(rows->GetType(), ENodeType::List); - ASSERT_EQ(rows->AsList()->GetChildCount(), 1); - - auto row = rows->AsList()->GetChildOrThrow(0); - ASSERT_EQ(row->GetType(), ENodeType::Map); - EXPECT_EQ(row->AsMap()->GetChildCount(), 3); - - auto rowAValue = ConvertToNode(TYsonString(R"([ - "-1"; - { - "inc" = %true; - "val" = [ - [ - "0"; - { - "val" = [ - ["-2"; {"inc"=%true; "val"="UTF:)" + TString("\xF0\x90\x8D\x88") + R"("}]; - ["2"; {"inc"=%true; "b64"=%true; "val"="IVVURjr6"}]; - ["0"; ""]; - ] - } - ]; - [ - "1"; - {"val"=""; "inc"=%true} - ]; - [ - "0"; - { - "inc" = %true; - "val" = [ - ["0"; {"val"="One more q"; "inc"=%true}]; - ["1"; {"val"="One more "; "inc"=%true}]; - ]; - } - ]; - ]; - }; - { - "val" = ""; - "inc" = %true; - }; - ["424242238133245"]; - ])")); - CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); - - // Simple values are not truncated to |StringWeightLimit| - auto rowBValue = ConvertToNode(TYsonString(TStringBuf(R"({ - val = { - kinda_long_key = { - "$type" = "string"; - "$value" = kinda_even_longer_value; - } - } - })"))); - CHECK_YQL_TYPE_AND_VALUE(row, "column_b", yqlTypeB, rowBValue, yqlTypes); - auto rowCValue = ConvertToNode(TYsonString(TStringBuf(R"(["One more quite long string"])"))); - CHECK_YQL_TYPE_AND_VALUE(row, "column_c", yqlTypeC, rowCValue, yqlTypes); -} - - -TEST_F(TWriterForWebJson, YqlValueFormat_Any) -{ - Config_->ValueFormat = EWebJsonValueFormat::Yql; - - auto schema = 
New<TTableSchema>(std::vector<TColumnSchema>{ - {"column_a", MakeLogicalType(ESimpleLogicalValueType::Any, false)}, - }); - - auto yqlTypeA = ConvertToNode(TYsonString(TStringBuf(R"([ - "OptionalType"; - ["DataType"; "Yson"] - ])"))); - - CreateStandardWriter({schema}); - { - bool written = Writer_->Write({ - MakeRow(NameTable_, {{"column_a", EValueType::Any, "{x=y;z=2}"}}).Get(), - MakeRow(NameTable_, {{"column_a", true}}).Get(), - MakeRow(NameTable_, {{"column_a", -42}}).Get(), - MakeRow(NameTable_, {{"column_a", 42u}}).Get(), - }); - EXPECT_TRUE(written); - Writer_->Close().Get().ThrowOnError(); - } - - auto result = ParseJsonToNode(OutputStream_.Str()); - ASSERT_EQ(result->GetType(), ENodeType::Map); - - auto rows = result->AsMap()->FindChild("rows"); - ASSERT_TRUE(rows); - auto yqlTypeRegistry = result->AsMap()->FindChild("yql_type_registry"); - ASSERT_TRUE(yqlTypeRegistry); - - ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); - auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); - - ASSERT_EQ(rows->GetType(), ENodeType::List); - ASSERT_EQ(rows->AsList()->GetChildCount(), 4); - - { - auto row = rows->AsList()->GetChildOrThrow(0); - ASSERT_EQ(row->GetType(), ENodeType::Map); - auto rowAValue = ConvertToNode(TYsonString(TStringBuf(R"([ - { - val = { - x = { - "$type" = "string"; - "$value" = "y"; - }; - z = { - "$type" = "int64"; - "$value" = "2"; - } - } - } - ])"))); - CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); - } - { - auto row = rows->AsList()->GetChildOrThrow(1); - ASSERT_EQ(row->GetType(), ENodeType::Map); - auto rowAValue = ConvertToNode(TYsonString(TStringBuf(R"([ - { - val = { - "$type" = "boolean"; - "$value" = "true"; - } - } - ])"))); - CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); - } - { - auto row = rows->AsList()->GetChildOrThrow(2); - ASSERT_EQ(row->GetType(), ENodeType::Map); - auto rowAValue = ConvertToNode(TYsonString(TStringBuf(R"([ - { - val = { - "$type" = 
"int64"; - "$value" = "-42"; - } - } - ])"))); - CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); - } - { - auto row = rows->AsList()->GetChildOrThrow(3); - ASSERT_EQ(row->GetType(), ENodeType::Map); - auto rowAValue = ConvertToNode(TYsonString(TStringBuf(R"([ - { - val = { - "$type" = "uint64"; - "$value" = "42"; - } - } - ])"))); - CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); - } -} - -TEST_F(TWriterForWebJson, YqlValueFormat_CompositeNoSchema) -{ - Config_->ValueFormat = EWebJsonValueFormat::Yql; - - auto schema = New<TTableSchema>(); - - auto yqlTypeA = ConvertToNode(TYsonString(TStringBuf(R"(["DataType"; "Yson"])"))); - - CreateStandardWriter({schema}); - { - bool written = Writer_->Write({ - MakeRow(NameTable_, {{"column_a", EValueType::Composite, "[1;2]"}}).Get(), - }); - EXPECT_TRUE(written); - Writer_->Close().Get().ThrowOnError(); - } - - auto result = ParseJsonToNode(OutputStream_.Str()); - ASSERT_EQ(result->GetType(), ENodeType::Map); - - auto rows = result->AsMap()->FindChild("rows"); - ASSERT_TRUE(rows); - auto yqlTypeRegistry = result->AsMap()->FindChild("yql_type_registry"); - ASSERT_TRUE(yqlTypeRegistry); - - ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); - auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); - - ASSERT_EQ(rows->GetType(), ENodeType::List); - ASSERT_EQ(rows->AsList()->GetChildCount(), 1); - - { - auto row = rows->AsList()->GetChildOrThrow(0); - ASSERT_EQ(row->GetType(), ENodeType::Map); - auto rowAValue = ConvertToNode(TYsonString(TStringBuf(R"({ - "val" = [ - { - "$type" = "int64"; - "$value" = "1"; - }; - { - "$type" = "int64"; - "$value" = "2"; - } - ] - })"))); - CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NFormats diff --git a/yt/yt/client/unittests/ya.make b/yt/yt/client/unittests/ya.make 
index ab9f547e19..747cfe2aa8 100644 --- a/yt/yt/client/unittests/ya.make +++ b/yt/yt/client/unittests/ya.make @@ -7,8 +7,6 @@ ALLOCATOR(YT) PROTO_NAMESPACE(yt) SRCS( - protobuf_format_ut.proto - check_schema_compatibility_ut.cpp check_type_compatibility_ut.cpp chunk_replica_ut.cpp @@ -16,8 +14,6 @@ SRCS( comparator_ut.cpp composite_compare_ut.cpp connection_ut.cpp - dsv_parser_ut.cpp - dsv_writer_ut.cpp farm_fingerprint_stability_ut.cpp key_bound_ut.cpp key_bound_compressor_ut.cpp @@ -28,30 +24,17 @@ SRCS( uuid_text_ut.cpp time_text_ut.cpp node_directory_ut.cpp - protobuf_format_ut.cpp query_builder_ut.cpp read_limit_ut.cpp replication_progress_ut.cpp - row_helpers.cpp row_ut.cpp - schemaful_dsv_parser_ut.cpp - schemaful_dsv_writer_ut.cpp schema_ut.cpp - skiff_format_ut.cpp - skiff_yson_converter_ut.cpp table_consumer_ut.cpp unordered_reader_ut.cpp unversioned_row_ut.cpp validate_logical_type_ut.cpp - value_examples.cpp - web_json_writer_ut.cpp wire_protocol_ut.cpp - yamred_dsv_parser_ut.cpp - yamred_dsv_writer_ut.cpp - yamr_parser_ut.cpp - yamr_writer_ut.cpp ypath_ut.cpp - yson_helpers.cpp zookeeper_bus_ut.cpp zookeeper_protocol_ut.cpp ) diff --git a/yt/yt/client/unittests/yamr_parser_ut.cpp b/yt/yt/client/unittests/yamr_parser_ut.cpp deleted file mode 100644 index 74b8f530a1..0000000000 --- a/yt/yt/client/unittests/yamr_parser_ut.cpp +++ /dev/null @@ -1,606 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/core/test_framework/yson_consumer_mock.h> - -#include <yt/yt/client/formats/yamr_parser.h> - -#include <yt/yt/core/yson/null_consumer.h> - -namespace NYT::NFormats { -namespace { - -using namespace NYson; - -using ::testing::InSequence; -using ::testing::StrictMock; -using ::testing::NiceMock; - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TYamrParserTest, Simple) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, 
OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key1")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value1")); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginAttributes()); - EXPECT_CALL(Mock, OnKeyedItem("table_index")); - EXPECT_CALL(Mock, OnInt64Scalar(2)); - EXPECT_CALL(Mock, OnEndAttributes()); - EXPECT_CALL(Mock, OnEntity()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key2")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value2")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = - "key1\tvalue1\n" - "2\n" - "key2\tvalue2\n"; - - ParseYamr(input, &Mock); -} - -TEST(TYamrParserTest, ValueWithTabs) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar(TStringBuf("key1\0", 5))); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value with \t and some other")); - EXPECT_CALL(Mock, OnEndMap()); - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key2")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar(TStringBuf("another\0 value with \t", 21))); - EXPECT_CALL(Mock, OnEndMap()); - - TString input( - "key1\0\tvalue with \t and some other\n" - "key2\tanother\0 value with \t\n", - 34 + - 27); - - ParseYamr(input, &Mock); -} - -TEST(TYamrParserTest, SimpleWithSubkey) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key1")); - 
EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("subkey1")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value1")); - EXPECT_CALL(Mock, OnEndMap()); - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key2")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("subkey2")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value2")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = - "key1\tsubkey1\tvalue1\n" - "key2\tsubkey2\tvalue2\n"; - - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = true; - - ParseYamr(input, &Mock, config); -} - -TEST(TYamrParserTest, IncompleteRows) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key1")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("subkey1")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value1")); - EXPECT_CALL(Mock, OnEndMap()); - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("subkey")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("")); - EXPECT_CALL(Mock, OnEndMap()); - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key2")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("subkey2")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value2")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input 
= - "key1\tsubkey1\tvalue1\n" - "key\tsubkey\n" - "key2\tsubkey2\tvalue2\n"; - - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = true; - - ParseYamr(input, &Mock, config); -} - -TEST(TYamrParserTest, IncorrectIncompleteRows) -{ - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = false; - - EXPECT_THROW(ParseYamr("\n", GetNullYsonConsumer(), config), std::exception); - EXPECT_THROW(ParseYamr("key\n", GetNullYsonConsumer(), config), std::exception); - EXPECT_THROW(ParseYamr("key\tvalue\nkey\n", GetNullYsonConsumer(), config), std::exception); -} - -TEST(TYamrParserTest, TabsInValue) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("a\tb\\tc\t")); - EXPECT_CALL(Mock, OnEndMap()); - - auto config = New<TYamrFormatConfig>(); - TString input = "key\ta\tb\\tc\t"; - ParseYamr(input, &Mock, config); -} - -TEST(TYamrParserTest, Escaping) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("\tkey\t")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("\n")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("a\tb\t\n")); - EXPECT_CALL(Mock, OnEndMap()); - - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = true; - config->EnableEscaping = true; - - TString input = "\\tkey\\t\t\\n\ta\tb\t\\n\n"; - ParseYamr(input, &Mock, config); -} - -TEST(TYamrParserTest, CustomSeparators) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, 
OnStringScalar("key")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value")); - EXPECT_CALL(Mock, OnEndMap()); - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key2")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value2")); - EXPECT_CALL(Mock, OnEndMap()); - - auto config = New<TYamrFormatConfig>(); - config->RecordSeparator = 'Y'; - config->FieldSeparator = 'X'; - - TString input = "keyXvalueYkey2Xvalue2Y"; - ParseYamr(input, &Mock, config); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TYamrLenvalParserTest, Simple) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key1")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value1")); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginAttributes()); - EXPECT_CALL(Mock, OnKeyedItem("table_index")); - EXPECT_CALL(Mock, OnInt64Scalar(1)); - EXPECT_CALL(Mock, OnEndAttributes()); - EXPECT_CALL(Mock, OnEntity()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key2")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value2")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = TString( - "\x04\x00\x00\x00" "key1" - "\x06\x00\x00\x00" "value1" - - "\xff\xff\xff\xff" "\x01\x00\x00\x00" - - "\x04\x00\x00\x00" "key2" - "\x06\x00\x00\x00" "value2" - , 2 * (2 * 4 + 4 + 6) + 8 // all i32 + lengths of keys - ); - - auto config = New<TYamrFormatConfig>(); - config->Lenval = true; - - ParseYamr(input, &Mock, config); -} - -TEST(TYamrLenvalParserTest, 
SimpleWithSubkey) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key1")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("subkey1")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value1")); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key2")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("subkey2")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value2")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = TString( - "\x04\x00\x00\x00" "key1" - "\x07\x00\x00\x00" "subkey1" - "\x06\x00\x00\x00" "value1" - - "\x04\x00\x00\x00" "key2" - "\x07\x00\x00\x00" "subkey2" - "\x06\x00\x00\x00" "value2" - , 2 * (3 * 4 + 4 + 7 + 6) // all i32 + lengths of keys - ); - - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = true; - config->Lenval = true; - - ParseYamr(input, &Mock, config); -} - -TEST(TYamrLenvalParserTest, EmptyFields) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = TString( - "\x00\x00\x00\x00" - "\x00\x00\x00\x00" - "\x00\x00\x00\x00" - , 3 * 4 - ); - - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = true; - config->Lenval = true; - - ParseYamr(input, &Mock, config); -} - -TEST(TYamrLenvalParserTest, HugeLength) -{ - TString input 
= TString( - "\xFF\xFF\xFF\xFF" - "\x00\x00\x00\x00" - "\x00\x00\x00\x00" - , 3 * 4 - ); - - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = true; - config->Lenval = true; - - EXPECT_THROW(ParseYamr(input, GetNullYsonConsumer(), config), std::exception); -} - -TEST(TYamrLenvalParserTest, SimpleEndOfMessage) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key1")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value1")); - EXPECT_CALL(Mock, OnEndMap()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginAttributes()); - EXPECT_CALL(Mock, OnKeyedItem("table_index")); - EXPECT_CALL(Mock, OnInt64Scalar(1)); - EXPECT_CALL(Mock, OnEndAttributes()); - EXPECT_CALL(Mock, OnEntity()); - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("key2")); - EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("value2")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = TString( - "\x04\x00\x00\x00" "key1" - "\x06\x00\x00\x00" "value1" - - "\xff\xff\xff\xff" "\x01\x00\x00\x00" - - "\x04\x00\x00\x00" "key2" - "\x06\x00\x00\x00" "value2" - - "\xfb\xff\xff\xff" "\x02\x00\x00\x00\x00\x00\x00\x00" - , 2 * (2 * 4 + 4 + 6) + 8 + 12 // all i32 + lengths of keys - ); - - auto config = New<TYamrFormatConfig>(); - config->Lenval = true; - config->EnableEom = true; - - ParseYamr(input, &Mock, config); -} - -TEST(TYamrLenvalParserTest, EmptyFieldsWithEOM) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("")); - 
EXPECT_CALL(Mock, OnKeyedItem("value")); - EXPECT_CALL(Mock, OnStringScalar("")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = TString( - "\x00\x00\x00\x00" - "\x00\x00\x00\x00" - "\x00\x00\x00\x00" - "\xfb\xff\xff\xff" "\x01\x00\x00\x00\x00\x00\x00\x00" - , 3 * 4 + 12 - ); - - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = true; - config->Lenval = true; - config->EnableEom = true; - - ParseYamr(input, &Mock, config); -} - -TEST(TYamrParserTest, IncorrectPlaceOfEOM) -{ - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = false; - config->Lenval = true; - config->EnableEom = true; - - TString input1 = TString( - "\x04\x00\x00\x00" "key1" - "\x06\x00\x00\x00" "value1" - - "\xff\xff\xff\xff" "\x01\x00\x00\x00" - - "\xfb\xff\xff\xff" "\x02\x00\x00\x00\x00\x00\x00\x00" - - "\x04\x00\x00\x00" "key2" - "\x06\x00\x00\x00" "value2" - , 2 * (2 * 4 + 4 + 6) + 8 + 12 // all i32 + lengths of keys - ); - - TString input2 = TString( - "\x04\x00\x00\x00" "key1" - "\x06\x00\x00\x00" "value1" - - "\xff\xff\xff\xff" "\x01\x00\x00\x00" - - "\x04\x00\x00\x00" "key2" - - "\xfb\xff\xff\xff" "\x02\x00\x00\x00\x00\x00\x00\x00" - - "\x06\x00\x00\x00" "value2" - , 2 * (2 * 4 + 4 + 6) + 8 + 12 // all i32 + lengths of keys - ); - - EXPECT_THROW(ParseYamr(input1, GetNullYsonConsumer(), config), std::exception); - EXPECT_THROW(ParseYamr(input2, GetNullYsonConsumer(), config), std::exception); -} - -TEST(TYamrParserTest, IncorrectEOM) -{ - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = false; - config->Lenval = true; - config->EnableEom = true; - - // Garbage after EOM marker - TString input1 = TString( - "\x04\x00\x00\x00" "key1" - "\x06\x00\x00\x00" "value1" - - "\xff\xff\xff\xff" "\x01\x00\x00\x00" - - "\xfb\xff\xff\xff" "\x01\x00\x00\x00\x00\x00\x00\x00" - - "\x04\x00\x00\x00" "key2" - "\x06\x00\x00\x00" "value2" - , 2 * (2 * 4 + 4 + 6) + 8 + 12 // all i32 + lengths of keys - ); - - // Row count mismatch - TString input2 = TString( - 
"\x04\x00\x00\x00" "key1" - "\x06\x00\x00\x00" "value1" - - "\xff\xff\xff\xff" "\x01\x00\x00\x00" - - "\x04\x00\x00\x00" "key2" - "\x06\x00\x00\x00" "value2" - - "\xfb\xff\xff\xff" "\x03\x00\x00\x00\x00\x00\x00\x00" - , 2 * (2 * 4 + 4 + 6) + 8 + 12 // all i32 + lengths of keys - ); - - // Missing EOM marker - TString input3 = TString( - "\x04\x00\x00\x00" "key1" - "\x06\x00\x00\x00" "value1" - - "\xff\xff\xff\xff" "\x01\x00\x00\x00" - - "\x04\x00\x00\x00" "key2" - "\x06\x00\x00\x00" "value2" - - , 2 * (2 * 4 + 4 + 6) + 8 // all i32 + lengths of keys - ); - - // Missing EOM marker with empty fields - TString input4 = TString( - "\x00\x00\x00\x00" - "\x00\x00\x00\x00" - "\x00\x00\x00\x00" - , 3 * 4 - ); - - EXPECT_THROW(ParseYamr(input1, GetNullYsonConsumer(), config), std::exception); - EXPECT_THROW(ParseYamr(input2, GetNullYsonConsumer(), config), std::exception); - EXPECT_THROW(ParseYamr(input3, GetNullYsonConsumer(), config), std::exception); - EXPECT_THROW(ParseYamr(input4, GetNullYsonConsumer(), config), std::exception); -} - -TEST(TYamrParserTest, UnsupportedEOMInTextMode) -{ - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = false; - config->Lenval = false; - config->EnableEom = true; - - TString input = TString( - "\x04\x00\x00\x00" "key1" - "\x06\x00\x00\x00" "value1" - - "\xff\xff\xff\xff" "\x01\x00\x00\x00" - - - "\x04\x00\x00\x00" "key2" - "\x06\x00\x00\x00" "value2" - - "\xfb\xff\xff\xff" "\x02\x00\x00\x00\x00\x00\x00\x00" - , 2 * (2 * 4 + 4 + 6) + 8 + 12 // all i32 + lengths of keys - ); - - EXPECT_THROW(ParseYamr(input, GetNullYsonConsumer(), config), std::exception); -} - -TEST(TYamrParserTest, UnexpectedEOM) -{ - auto config = New<TYamrFormatConfig>(); - config->HasSubkey = false; - config->Lenval = true; - config->EnableEom = false; - - TString input = TString( - "\x04\x00\x00\x00" "key1" - "\x06\x00\x00\x00" "value1" - - "\xff\xff\xff\xff" "\x01\x00\x00\x00" - - "\x04\x00\x00\x00" "key2" - "\x06\x00\x00\x00" "value2" - - 
"\xfb\xff\xff\xff" "\x02\x00\x00\x00\x00\x00\x00\x00" - , 2 * (2 * 4 + 4 + 6) + 8 + 12 // all i32 + lengths of keys - ); - - EXPECT_THROW(ParseYamr(input, GetNullYsonConsumer(), config), std::exception); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NFormats diff --git a/yt/yt/client/unittests/yamr_writer_ut.cpp b/yt/yt/client/unittests/yamr_writer_ut.cpp deleted file mode 100644 index 747f542807..0000000000 --- a/yt/yt/client/unittests/yamr_writer_ut.cpp +++ /dev/null @@ -1,644 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/client/table_client/unversioned_row.h> -#include <yt/yt/client/table_client/name_table.h> - -#include <yt/yt/client/formats/yamr_writer.h> - -#include <yt/yt/core/concurrency/async_stream.h> - -namespace NYT::NFormats { -namespace { - -//////////////////////////////////////////////////////////////////////////////// - -using namespace NYTree; -using namespace NYson; -using namespace NConcurrency; -using namespace NTableClient; - -class TSchemalessWriterForYamrTest - : public ::testing::Test -{ -protected: - TNameTablePtr NameTable_; - int KeyId_; - int SubkeyId_; - int ValueId_; - int TableIndexId_; - int RangeIndexId_; - int RowIndexId_; - - TYamrFormatConfigPtr Config_; - - IUnversionedRowsetWriterPtr Writer_; - - TStringStream OutputStream_; - - TSchemalessWriterForYamrTest() { - NameTable_ = New<TNameTable>(); - KeyId_ = NameTable_->RegisterName("key"); - SubkeyId_ = NameTable_->RegisterName("subkey"); - ValueId_ = NameTable_->RegisterName("value"); - TableIndexId_ = NameTable_->RegisterName(TableIndexColumnName); - RowIndexId_ = NameTable_->RegisterName(RowIndexColumnName); - RangeIndexId_ = NameTable_->RegisterName(RangeIndexColumnName); - - Config_ = New<TYamrFormatConfig>(); - } - - void CreateStandardWriter(TControlAttributesConfigPtr controlAttributes = New<TControlAttributesConfig>()) - { - Writer_ = 
CreateSchemalessWriterForYamr( - Config_, - NameTable_, - CreateAsyncAdapter(static_cast<IOutputStream*>(&OutputStream_)), - false, /* enableContextSaving */ - controlAttributes, - 0 /* keyColumnCount */); - } -}; - -TEST_F(TSchemalessWriterForYamrTest, Simple) -{ - CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); - row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); - - // Ignore system columns. - row1.AddValue(MakeUnversionedInt64Value(2, TableIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(42, RowIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(1, RangeIndexId_)); - - // Note that key and value follow not in order. - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); - row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); - - std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output = - "key1\tvalue1\n" - "key2\tvalue2\n"; - - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, SimpleWithSubkey) -{ - Config_->HasSubkey = true; - CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); - row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); - row1.AddValue(MakeUnversionedStringValue("subkey1", SubkeyId_)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("subkey2", SubkeyId_)); - row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); - row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); - - std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output = - "key1\tsubkey1\tvalue1\n" - "key2\tsubkey2\tvalue2\n"; - - EXPECT_EQ(output, OutputStream_.Str()); 
-} - -TEST_F(TSchemalessWriterForYamrTest, SubkeyCouldBeSkipped) -{ - Config_->HasSubkey = true; - CreateStandardWriter(); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedStringValue("key", KeyId_)); - row.AddValue(MakeUnversionedStringValue("value", ValueId_)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output = "key\t\tvalue\n"; - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, SubkeyCouldBeNull) -{ - Config_->HasSubkey = true; - CreateStandardWriter(); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedStringValue("key", KeyId_)); - row.AddValue(MakeUnversionedSentinelValue(EValueType::Null, SubkeyId_)); - row.AddValue(MakeUnversionedStringValue("value", ValueId_)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output = "key\t\tvalue\n"; - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, NonNullTerminatedStrings) -{ - Config_->HasSubkey = true; - CreateStandardWriter(); - - TUnversionedRowBuilder row; - const char* longString = "trashkeytrashsubkeytrashvalue"; - row.AddValue(MakeUnversionedStringValue(TStringBuf(longString + 5, 3), KeyId_)); - row.AddValue(MakeUnversionedStringValue(TStringBuf(longString + 13, 6), SubkeyId_)); - row.AddValue(MakeUnversionedStringValue(TStringBuf(longString + 24, 5), ValueId_)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output = "key\tsubkey\tvalue\n"; - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, SkippedKey) -{ - CreateStandardWriter(); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedStringValue("value", ValueId_)); - - 
std::vector<TUnversionedRow> rows = { row.GetRow() }; - - EXPECT_FALSE(Writer_->Write(rows)); - - EXPECT_THROW(Writer_->Close() - .Get() - .ThrowOnError(), std::exception); -} - -TEST_F(TSchemalessWriterForYamrTest, SkippedValue) -{ - CreateStandardWriter(); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedStringValue("key", KeyId_)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - EXPECT_FALSE(Writer_->Write(rows)); - - EXPECT_THROW(Writer_->Close() - .Get() - .ThrowOnError(), std::exception); -} - -TEST_F(TSchemalessWriterForYamrTest, NotStringType) { - CreateStandardWriter(); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedStringValue("key", KeyId_)); - row.AddValue(MakeUnversionedInt64Value(42, ValueId_)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - EXPECT_FALSE(Writer_->Write(rows)); - - EXPECT_THROW(Writer_->Close() - .Get() - .ThrowOnError(), std::exception); -} - -TEST_F(TSchemalessWriterForYamrTest, ExtraItem) -{ - int trashId = NameTable_->RegisterName("trash"); - CreateStandardWriter(); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedStringValue("key", KeyId_)); - row.AddValue(MakeUnversionedStringValue("value", ValueId_)); - // This value will be ignored. - row.AddValue(MakeUnversionedStringValue("trash", trashId)); - // This value will also be ignored because Config_->HasSubkey is off, - // despite the fact it has non-string type. 
- row.AddValue(MakeUnversionedInt64Value(42, SubkeyId_)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output = "key\tvalue\n"; - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, Escaping) -{ - Config_->HasSubkey = true; - Config_->EnableEscaping = true; - CreateStandardWriter(); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedStringValue("\n", KeyId_)); - row.AddValue(MakeUnversionedStringValue("\t", SubkeyId_)); - row.AddValue(MakeUnversionedStringValue("\n", ValueId_)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output = "\\n\t\\t\t\\n\n"; - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, SimpleWithTableIndex) -{ - Config_->EnableTableIndex = true; - - auto controlAttributes = New<TControlAttributesConfig>(); - controlAttributes->EnableTableIndex = true; - CreateStandardWriter(controlAttributes); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); - row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); - row1.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); - row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); - row2.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); - - std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; - EXPECT_EQ(true, Writer_->Write(rows)); - - TUnversionedRowBuilder row3; - row3.AddValue(MakeUnversionedStringValue("key3", KeyId_)); - row3.AddValue(MakeUnversionedStringValue("value3", ValueId_)); - row3.AddValue(MakeUnversionedInt64Value(23, TableIndexId_)); - - rows = { row3.GetRow() }; - EXPECT_EQ(true, Writer_->Write(rows)); - - Writer_->Close() 
- .Get() - .ThrowOnError(); - - TString output = - "42\n" - "key1\tvalue1\n" - "key2\tvalue2\n" - "23\n" - "key3\tvalue3\n"; - - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, SimpleWithRowIndexAndTableIndex) -{ - Config_->EnableTableIndex = true; - - auto controlAttributes = New<TControlAttributesConfig>(); - controlAttributes->EnableTableIndex = true; - controlAttributes->EnableRowIndex = true; - CreateStandardWriter(controlAttributes); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); - row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); - row1.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(0, RowIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(0, RangeIndexId_)); - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); - row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); - std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; - EXPECT_EQ(true, Writer_->Write(rows)); - - TUnversionedRowBuilder row3; - row3.AddValue(MakeUnversionedStringValue("key3", KeyId_)); - row3.AddValue(MakeUnversionedStringValue("value3", ValueId_)); - row3.AddValue(MakeUnversionedInt64Value(5, RowIndexId_)); - row3.AddValue(MakeUnversionedInt64Value(1, RangeIndexId_)); - rows = { row3.GetRow() }; - EXPECT_EQ(true, Writer_->Write(rows)); - - TUnversionedRowBuilder row4; - row4.AddValue(MakeUnversionedStringValue("key4", KeyId_)); - row4.AddValue(MakeUnversionedStringValue("value4", ValueId_)); - row4.AddValue(MakeUnversionedInt64Value(23, TableIndexId_)); - row4.AddValue(MakeUnversionedInt64Value(10, RowIndexId_)); - row4.AddValue(MakeUnversionedInt64Value(2, RangeIndexId_)); - rows = { row4.GetRow() }; - EXPECT_EQ(true, Writer_->Write(rows)); - - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output = - "42\n0\n" - "key1\tvalue1\n" - "key2\tvalue2\n" - "42\n5\n" - 
"key3\tvalue3\n" - "23\n10\n" - "key4\tvalue4\n"; - - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, Lenval) -{ - Config_->HasSubkey = true; - Config_->Lenval = true; - CreateStandardWriter(); - - // Note that order in both rows is unusual. - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); - row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); - row1.AddValue(MakeUnversionedStringValue("subkey1", SubkeyId_)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); - row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); - row2.AddValue(MakeUnversionedStringValue("subkey2", SubkeyId_)); - - std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output = TString( - "\x04\x00\x00\x00" "key1" - "\x07\x00\x00\x00" "subkey1" - "\x06\x00\x00\x00" "value1" - - "\x04\x00\x00\x00" "key2" - "\x07\x00\x00\x00" "subkey2" - "\x06\x00\x00\x00" "value2" - , 2 * (3 * 4 + 4 + 6 + 7) // all i32 + lengths of keys - ); - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, LenvalWithEmptyFields) -{ - Config_->HasSubkey = true; - Config_->Lenval = true; - CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("", KeyId_)); - row1.AddValue(MakeUnversionedStringValue("subkey1", SubkeyId_)); - row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); - row2.AddValue(MakeUnversionedStringValue("", SubkeyId_)); - row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); - - TUnversionedRowBuilder row3; - row3.AddValue(MakeUnversionedStringValue("key3", KeyId_)); - row3.AddValue(MakeUnversionedStringValue("subkey3", SubkeyId_)); - 
row3.AddValue(MakeUnversionedStringValue("", ValueId_)); - - std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow(), row3.GetRow() }; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output = TString( - "\x00\x00\x00\x00" "" - "\x07\x00\x00\x00" "subkey1" - "\x06\x00\x00\x00" "value1" - - "\x04\x00\x00\x00" "key2" - "\x00\x00\x00\x00" "" - "\x06\x00\x00\x00" "value2" - - "\x04\x00\x00\x00" "key3" - "\x07\x00\x00\x00" "subkey3" - "\x00\x00\x00\x00" "" - - , 9 * 4 + (7 + 6) + (4 + 6) + (4 + 7) // all i32 + lengths of keys - ); - - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, LenvalWithKeySwitch) -{ - Config_->HasSubkey = true; - Config_->Lenval = true; - - auto controlAttributes = New<TControlAttributesConfig>(); - controlAttributes->EnableKeySwitch = true; - - Writer_ = CreateSchemalessWriterForYamr( - Config_, - NameTable_, - CreateAsyncAdapter(static_cast<IOutputStream*>(&OutputStream_)), - false, /* enableContextSaving */ - controlAttributes, - 1 /* keyColumnCount */); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); - row1.AddValue(MakeUnversionedStringValue("subkey1", SubkeyId_)); - row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); - row2.AddValue(MakeUnversionedStringValue("subkey21", SubkeyId_)); - row2.AddValue(MakeUnversionedStringValue("value21", ValueId_)); - - TUnversionedRowBuilder row3; - row3.AddValue(MakeUnversionedStringValue("key2", KeyId_)); - row3.AddValue(MakeUnversionedStringValue("subkey22", SubkeyId_)); - row3.AddValue(MakeUnversionedStringValue("value22", ValueId_)); - - std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow(), row3.GetRow() }; - EXPECT_EQ(true, Writer_->Write(rows)); - - TUnversionedRowBuilder row4; - row4.AddValue(MakeUnversionedStringValue("key3", KeyId_)); - 
row4.AddValue(MakeUnversionedStringValue("subkey3", SubkeyId_)); - row4.AddValue(MakeUnversionedStringValue("value3", ValueId_)); - - rows = { row4.GetRow() }; - EXPECT_EQ(true, Writer_->Write(rows)); - - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output = TString( - "\x04\x00\x00\x00" "key1" - "\x07\x00\x00\x00" "subkey1" - "\x06\x00\x00\x00" "value1" - - "\xfe\xff\xff\xff" // key switch - - "\x04\x00\x00\x00" "key2" - "\x08\x00\x00\x00" "subkey21" - "\x07\x00\x00\x00" "value21" - - "\x04\x00\x00\x00" "key2" - "\x08\x00\x00\x00" "subkey22" - "\x07\x00\x00\x00" "value22" - - "\xfe\xff\xff\xff" - - "\x04\x00\x00\x00" "key3" - "\x07\x00\x00\x00" "subkey3" - "\x06\x00\x00\x00" "value3" - - , 14 * 4 + (4 + 7 + 6) + (4 + 8 + 7) + (4 + 8 + 7) + (4 + 7 + 6) // all i32 + lengths of keys - ); - - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, LenvalWithTableIndex) -{ - Config_->EnableTableIndex = true; - Config_->Lenval = true; - - auto controlAttributes = New<TControlAttributesConfig>(); - controlAttributes->EnableTableIndex = true; - CreateStandardWriter(controlAttributes); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); - row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); - row1.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); - row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); - row2.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); - - std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; - EXPECT_EQ(true, Writer_->Write(rows)); - - TUnversionedRowBuilder row3; - row3.AddValue(MakeUnversionedStringValue("key3", KeyId_)); - row3.AddValue(MakeUnversionedStringValue("value3", ValueId_)); - row3.AddValue(MakeUnversionedInt64Value(23, TableIndexId_)); - - rows = { row3.GetRow() }; - EXPECT_EQ(true, Writer_->Write(rows)); - - 
Writer_->Close() - .Get() - .ThrowOnError(); - - TString output( - "\xff\xff\xff\xff" "\x2a\x00\x00\x00" // 42 - - "\x04\x00\x00\x00" "key1" - "\x06\x00\x00\x00" "value1" - - "\x04\x00\x00\x00" "key2" - "\x06\x00\x00\x00" "value2" - - "\xff\xff\xff\xff" "\x17\x00\x00\x00" // 23 - - "\x04\x00\x00\x00" "key3" - "\x06\x00\x00\x00" "value3" - , 10 * 4 + 3 * (4 + 6)); - - EXPECT_EQ(output, OutputStream_.Str()); -} - -TEST_F(TSchemalessWriterForYamrTest, LenvalWithRangeAndRowIndex) -{ - Config_->Lenval = true; - - auto controlAttributes = New<TControlAttributesConfig>(); - controlAttributes->EnableRowIndex = true; - controlAttributes->EnableRangeIndex = true; - CreateStandardWriter(controlAttributes); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); - row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); - row1.AddValue(MakeUnversionedInt64Value(42, RangeIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(23, RowIndexId_)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); - row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); - row2.AddValue(MakeUnversionedInt64Value(42, RangeIndexId_)); - row2.AddValue(MakeUnversionedInt64Value(24, RowIndexId_)); - - std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; - EXPECT_EQ(true, Writer_->Write(rows)); - - TUnversionedRowBuilder row3; - row3.AddValue(MakeUnversionedStringValue("key3", KeyId_)); - row3.AddValue(MakeUnversionedStringValue("value3", ValueId_)); - row3.AddValue(MakeUnversionedInt64Value(42, RangeIndexId_)); - row3.AddValue(MakeUnversionedInt64Value(25, RowIndexId_)); - - rows = { row3.GetRow() }; - EXPECT_EQ(true, Writer_->Write(rows)); - - Writer_->Close() - .Get() - .ThrowOnError(); - - TString output( - "\xfd\xff\xff\xff" "\x2a\x00\x00\x00" // 42 - "\xfc\xff\xff\xff" "\x17\x00\x00\x00\x00\x00\x00\x00" // 23 - - "\x04\x00\x00\x00" "key1" - "\x06\x00\x00\x00" "value1" - - "\x04\x00\x00\x00" 
"key2" - "\x06\x00\x00\x00" "value2" - - "\x04\x00\x00\x00" "key3" - "\x06\x00\x00\x00" "value3" - , 11 * 4 + 3 * (4 + 6)); - - EXPECT_EQ(output, OutputStream_.Str()); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NFormats diff --git a/yt/yt/client/unittests/yamred_dsv_parser_ut.cpp b/yt/yt/client/unittests/yamred_dsv_parser_ut.cpp deleted file mode 100644 index d29c9a4df6..0000000000 --- a/yt/yt/client/unittests/yamred_dsv_parser_ut.cpp +++ /dev/null @@ -1,187 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/core/test_framework/yson_consumer_mock.h> - -#include <yt/yt/client/formats/yamred_dsv_parser.h> - -namespace NYT::NFormats { -namespace { - -using namespace NYson; - -using ::testing::InSequence; -using ::testing::StrictMock; -using ::testing::NiceMock; - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TYamredDsvParserTest, Simple) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key_a")); - EXPECT_CALL(Mock, OnStringScalar("1")); - EXPECT_CALL(Mock, OnKeyedItem("key_b")); - EXPECT_CALL(Mock, OnStringScalar("2")); - EXPECT_CALL(Mock, OnKeyedItem("subkey_x")); - EXPECT_CALL(Mock, OnStringScalar("3")); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("5")); - EXPECT_CALL(Mock, OnKeyedItem("b")); - EXPECT_CALL(Mock, OnStringScalar("6")); - EXPECT_CALL(Mock, OnEndMap()); - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key_a")); - EXPECT_CALL(Mock, OnStringScalar("7")); - EXPECT_CALL(Mock, OnKeyedItem("key_b")); - EXPECT_CALL(Mock, OnStringScalar("8")); - EXPECT_CALL(Mock, OnKeyedItem("subkey_x")); - EXPECT_CALL(Mock, OnStringScalar("9")); - EXPECT_CALL(Mock, OnKeyedItem("b")); - EXPECT_CALL(Mock, 
OnStringScalar("max\tignat")); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("100")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = - "1 2\t3\ta=5\tb=6\n" - "7 8\t9\tb=max\\tignat\ta=100\n"; - - auto config = New<TYamredDsvFormatConfig>(); - config->HasSubkey = true; - config->KeyColumnNames.push_back("key_a"); - config->KeyColumnNames.push_back("key_b"); - config->SubkeyColumnNames.push_back("subkey_x"); - - ParseYamredDsv(input, &Mock, config); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TYamredDsvParserTest, EmptyField) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("0 1")); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("b")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = "\t0 1\ta=b\n"; - - auto config = New<TYamredDsvFormatConfig>(); - config->HasSubkey = true; - config->KeyColumnNames.push_back("key"); - config->SubkeyColumnNames.push_back("subkey"); - - ParseYamredDsv(input, &Mock, config); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TYamredDsvParserTest, Escaping) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("\t")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("0\n1")); - EXPECT_CALL(Mock, OnKeyedItem("a")); - EXPECT_CALL(Mock, OnStringScalar("\tb\nc")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = "\\t\t0\\n1\ta=\\tb\\nc\n"; - - auto config = New<TYamredDsvFormatConfig>(); - config->HasSubkey = true; - config->EnableEscaping = true; - 
config->KeyColumnNames.push_back("key"); - config->SubkeyColumnNames.push_back("subkey"); - - ParseYamredDsv(input, &Mock, config); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TYamredDsvParserTest, Lenval) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("a")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("bc")); - EXPECT_CALL(Mock, OnKeyedItem("d")); - EXPECT_CALL(Mock, OnStringScalar("e")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = TString( - "\x01\x00\x00\x00" "a" - "\x02\x00\x00\x00" "bc" - "\x03\x00\x00\x00" "d=e" - , 3 * 4 + 1 + 2 + 3 - ); - - auto config = New<TYamredDsvFormatConfig>(); - config->Lenval = true; - config->HasSubkey = true; - config->KeyColumnNames.push_back("key"); - config->SubkeyColumnNames.push_back("subkey"); - - ParseYamredDsv(input, &Mock, config); -} - -TEST(TYamredDsvParserTest, EOM) -{ - StrictMock<TMockYsonConsumer> Mock; - InSequence dummy; - - EXPECT_CALL(Mock, OnListItem()); - EXPECT_CALL(Mock, OnBeginMap()); - EXPECT_CALL(Mock, OnKeyedItem("key")); - EXPECT_CALL(Mock, OnStringScalar("a")); - EXPECT_CALL(Mock, OnKeyedItem("subkey")); - EXPECT_CALL(Mock, OnStringScalar("bc")); - EXPECT_CALL(Mock, OnKeyedItem("d")); - EXPECT_CALL(Mock, OnStringScalar("e")); - EXPECT_CALL(Mock, OnEndMap()); - - TString input = TString( - "\x01\x00\x00\x00" "a" - "\x02\x00\x00\x00" "bc" - "\x03\x00\x00\x00" "d=e" - "\xfb\xff\xff\xff" "\x01\x00\x00\x00\x00\x00\x00\x00" - , 3 * 4 + 1 + 2 + 3 + 12 - ); - - auto config = New<TYamredDsvFormatConfig>(); - config->Lenval = true; - config->EnableEom = true; - config->HasSubkey = true; - config->KeyColumnNames.push_back("key"); - config->SubkeyColumnNames.push_back("subkey"); - - ParseYamredDsv(input, &Mock, config); -} - 
-//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NFormats diff --git a/yt/yt/client/unittests/yamred_dsv_writer_ut.cpp b/yt/yt/client/unittests/yamred_dsv_writer_ut.cpp deleted file mode 100644 index a45a895892..0000000000 --- a/yt/yt/client/unittests/yamred_dsv_writer_ut.cpp +++ /dev/null @@ -1,425 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/client/table_client/unversioned_row.h> -#include <yt/yt/client/table_client/name_table.h> - -#include <yt/yt/client/formats/yamred_dsv_writer.h> - -#include <yt/yt/core/concurrency/async_stream.h> - -#include <util/string/vector.h> - -#include <cstdio> - - -namespace NYT::NFormats { -namespace { - -using VectorStrok = TVector<TString>; - -//////////////////////////////////////////////////////////////////////////////// - -using namespace NYTree; -using namespace NYson; -using namespace NConcurrency; -using namespace NTableClient; - -class TSchemalessWriterForYamredDsvTest - : public ::testing::Test -{ -protected: - TNameTablePtr NameTable_; - TYamredDsvFormatConfigPtr Config_; - IUnversionedRowsetWriterPtr Writer_; - - TStringStream OutputStream_; - - int KeyAId_; - int KeyBId_; - int KeyCId_; - int ValueXId_; - int ValueYId_; - int TableIndexId_; - int RangeIndexId_; - int RowIndexId_; - - TSchemalessWriterForYamredDsvTest() - { - NameTable_ = New<TNameTable>(); - KeyAId_ = NameTable_->RegisterName("key_a"); - KeyBId_ = NameTable_->RegisterName("key_b"); - KeyCId_ = NameTable_->RegisterName("key_c"); - ValueXId_ = NameTable_->RegisterName("value_x"); - ValueYId_ = NameTable_->RegisterName("value_y"); - TableIndexId_ = NameTable_->RegisterName(TableIndexColumnName); - RowIndexId_ = NameTable_->RegisterName(RowIndexColumnName); - RangeIndexId_ = NameTable_->RegisterName(RangeIndexColumnName); - Config_ = New<TYamredDsvFormatConfig>(); - } - - void CreateStandardWriter(TControlAttributesConfigPtr controlAttributes = 
New<TControlAttributesConfig>()) - { - Writer_ = CreateSchemalessWriterForYamredDsv( - Config_, - NameTable_, - CreateAsyncAdapter(static_cast<IOutputStream*>(&OutputStream_)), - false, /* enableContextSaving */ - controlAttributes, - 0 /* keyColumnCount */); - } - - // Splits output into key and sorted vector of values that are entries of the last YAMR column. - // Returns true if success (there are >= 2 values after splitting by field separator), otherwise false. - bool ExtractKeyValue(TString output, TString& key, VectorStrok& value, char fieldSeparator = '\t') - { - char delimiter[2] = {fieldSeparator, 0}; - // Splitting by field separator. - value = SplitString(output, delimiter, 0 /* maxFields */, KEEP_EMPTY_TOKENS); - // We should at least have key and the rest of values. - if (value.size() < 2) - return false; - key = value[0]; - value.erase(value.begin()); - std::sort(value.begin(), value.end()); - return true; - } - - // The same function as previous, version with subkey. - bool ExtractKeySubkeyValue(TString output, TString& key, TString& subkey, VectorStrok& value, char fieldSeparator = '\t') - { - char delimiter[2] = {fieldSeparator, 0}; - // Splitting by field separator. - value = SplitString(output, delimiter, 0 /* maxFields */, KEEP_EMPTY_TOKENS); - // We should at least have key, subkey and the rest of values. - if (value.size() < 3) - return false; - key = value[0]; - subkey = value[1]; - value.erase(value.begin(), value.end()); - std::sort(value.begin(), value.end()); - return true; - } - - // Compares output and expected output ignoring the order of entries in YAMR value column. 
- void CompareKeyValue(TString output, TString expected, char recordSeparator = '\n', char fieldSeparator = '\t') - { - char delimiter[2] = {recordSeparator, 0}; - VectorStrok outputRows = SplitString(output, delimiter, 0 /* maxFields */ , KEEP_EMPTY_TOKENS); - VectorStrok expectedRows = SplitString(expected, delimiter, 0 /* maxFields */, KEEP_EMPTY_TOKENS); - EXPECT_EQ(outputRows.size(), expectedRows.size()); - // Since there is \n after each row, there will be an extra empty string in both vectors. - EXPECT_EQ(outputRows.back(), ""); - ASSERT_EQ(expectedRows.back(), ""); - outputRows.pop_back(); - expectedRows.pop_back(); - - TString outputKey; - TString expectedKey; - VectorStrok outputValue; - VectorStrok expectedValue; - for (int rowIndex = 0; rowIndex < static_cast<int>(outputRows.size()); rowIndex++) { - EXPECT_TRUE(ExtractKeyValue(outputRows[rowIndex], outputKey, outputValue, fieldSeparator)); - ASSERT_TRUE(ExtractKeyValue(expectedRows[rowIndex], expectedKey, expectedValue, fieldSeparator)); - EXPECT_EQ(outputKey, expectedKey); - EXPECT_EQ(outputValue, expectedValue); - } - } - - // The same function as previous, version with subkey. - void CompareKeySubkeyValue(TString output, TString expected, char recordSeparator = '\n', char fieldSeparator = '\t') - { - char delimiter[2] = {recordSeparator, 0}; - VectorStrok outputRows = SplitString(output, delimiter, 0 /* maxFields */ , KEEP_EMPTY_TOKENS); - VectorStrok expectedRows = SplitString(expected, delimiter, 0 /* maxFields */, KEEP_EMPTY_TOKENS); - EXPECT_EQ(outputRows.size(), expectedRows.size()); - // Since there is \n after each row, there will be an extra empty string in both vectors. 
- EXPECT_EQ(outputRows.back(), ""); - ASSERT_EQ(expectedRows.back(), ""); - outputRows.pop_back(); - expectedRows.pop_back(); - - TString outputKey; - TString expectedKey; - TString outputSubkey; - TString expectedSubkey; - VectorStrok outputValue; - VectorStrok expectedValue; - for (int rowIndex = 0; rowIndex < static_cast<int>(outputRows.size()); rowIndex++) { - EXPECT_TRUE(ExtractKeySubkeyValue(outputRows[rowIndex], outputKey, outputSubkey, outputValue, fieldSeparator)); - ASSERT_TRUE(ExtractKeySubkeyValue(expectedRows[rowIndex], expectedKey, expectedSubkey, expectedValue, fieldSeparator)); - EXPECT_EQ(outputKey, expectedKey); - EXPECT_EQ(outputSubkey, expectedSubkey); - EXPECT_EQ(outputValue, expectedValue); - } - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -TEST_F(TSchemalessWriterForYamredDsvTest, Simple) -{ - Config_->KeyColumnNames.emplace_back("key_a"); - CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("a1", KeyAId_)); - row1.AddValue(MakeUnversionedStringValue("x", ValueXId_)); - row1.AddValue(MakeUnversionedSentinelValue(EValueType::Null, ValueYId_)); - - // Ignore system columns. 
- row1.AddValue(MakeUnversionedInt64Value(2, TableIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(42, RowIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(1, RangeIndexId_)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("a2", KeyAId_)); - row2.AddValue(MakeUnversionedStringValue("y", ValueYId_)); - row2.AddValue(MakeUnversionedStringValue("b", KeyBId_)); - - std::vector<TUnversionedRow> rows = {row1.GetRow(), row2.GetRow()}; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString expectedOutput = - "a1\tvalue_x=x\n" - "a2\tvalue_y=y\tkey_b=b\n"; - - TString output = OutputStream_.Str(); - - CompareKeyValue(expectedOutput, output); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST_F(TSchemalessWriterForYamredDsvTest, SimpleWithSubkey) -{ - Config_->HasSubkey = true; - Config_->KeyColumnNames.emplace_back("key_a"); - Config_->KeyColumnNames.emplace_back("key_b"); - Config_->SubkeyColumnNames.emplace_back("key_c"); - CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("a", KeyAId_)); - row1.AddValue(MakeUnversionedStringValue("b1", KeyBId_)); - row1.AddValue(MakeUnversionedStringValue("c", KeyCId_)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("a", KeyAId_)); - row2.AddValue(MakeUnversionedStringValue("b2", KeyBId_)); - row2.AddValue(MakeUnversionedStringValue("c", KeyCId_)); - - std::vector<TUnversionedRow> rows = {row1.GetRow(), row2.GetRow()}; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString expectedOutput = - "a b1\tc\t\n" - "a b2\tc\t\n"; - - TString output = OutputStream_.Str(); - - CompareKeySubkeyValue(expectedOutput, output); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST_F(TSchemalessWriterForYamredDsvTest, Lenval) -{ - Config_->Lenval = true; - 
Config_->HasSubkey = true; - Config_->EnableTableIndex = true; - Config_->KeyColumnNames.emplace_back("key_a"); - Config_->KeyColumnNames.emplace_back("key_b"); - Config_->SubkeyColumnNames.emplace_back("key_c"); - - auto controlAttributes = New<TControlAttributesConfig>(); - controlAttributes->EnableTableIndex = true; - controlAttributes->EnableRowIndex = true; - controlAttributes->EnableRangeIndex = true; - CreateStandardWriter(controlAttributes); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("a", KeyAId_)); - row1.AddValue(MakeUnversionedStringValue("b1", KeyBId_)); - row1.AddValue(MakeUnversionedStringValue("c", KeyCId_)); - row1.AddValue(MakeUnversionedStringValue("x", ValueXId_)); - - row1.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(23, RangeIndexId_)); - row1.AddValue(MakeUnversionedInt64Value(17, RowIndexId_)); - - TUnversionedRowBuilder row2; - row2.AddValue(MakeUnversionedStringValue("a", KeyAId_)); - row2.AddValue(MakeUnversionedStringValue("b2", KeyBId_)); - row2.AddValue(MakeUnversionedStringValue("c", KeyCId_)); - - row2.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); - row2.AddValue(MakeUnversionedInt64Value(23, RangeIndexId_)); - row2.AddValue(MakeUnversionedInt64Value(18, RowIndexId_)); - - std::vector<TUnversionedRow> rows = {row1.GetRow(), row2.GetRow()}; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString expectedOutput = TString( - "\xff\xff\xff\xff" "\x2a\x00\x00\x00" // Table index. - "\xfd\xff\xff\xff" "\x17\x00\x00\x00" // Range index. - "\xfc\xff\xff\xff" "\x11\x00\x00\x00\x00\x00\x00\x00" // Row index. 
- - "\x04\x00\x00\x00" "a b1" - "\x01\x00\x00\x00" "c" - "\x09\x00\x00\x00" "value_x=x" - - "\x04\x00\x00\x00" "a b2" - "\x01\x00\x00\x00" "c" - "\x00\x00\x00\x00" "", - - 13 * 4 + 4 + 1 + 9 + 4 + 1 + 0 - ); - - TString output = OutputStream_.Str(); - EXPECT_EQ(expectedOutput, output) - << "expected length: " << expectedOutput.length() - << ", " - << "actual length: " << output.length(); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST_F(TSchemalessWriterForYamredDsvTest, Escaping) -{ - Config_->KeyColumnNames.emplace_back("key_a"); - Config_->KeyColumnNames.emplace_back("key_b"); - int columnWithEscapedNameId = NameTable_->GetIdOrRegisterName("value\t_t"); - CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("a\n", KeyAId_)); - row1.AddValue(MakeUnversionedStringValue("\nb\t", KeyBId_)); - row1.AddValue(MakeUnversionedStringValue("\nva\\lue\t", columnWithEscapedNameId)); - - std::vector<TUnversionedRow> rows = {row1.GetRow()}; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString expectedOutput = "a\\n \\nb\\t\tvalue\\t_t=\\nva\\\\lue\\t\n"; - TString output = OutputStream_.Str(); - - EXPECT_EQ(expectedOutput, output); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST_F(TSchemalessWriterForYamredDsvTest, SkippedKey) -{ - Config_->KeyColumnNames.emplace_back("key_a"); - Config_->KeyColumnNames.emplace_back("key_b"); - CreateStandardWriter(); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedStringValue("b", KeyBId_)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - EXPECT_FALSE(Writer_->Write(rows)); - - EXPECT_THROW(Writer_->Close() - .Get() - .ThrowOnError(), std::exception); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST_F(TSchemalessWriterForYamredDsvTest, SkippedSubkey) -{ - Config_->HasSubkey = true; - 
Config_->KeyColumnNames.emplace_back("key_a"); - Config_->SubkeyColumnNames.emplace_back("key_c"); - CreateStandardWriter(); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedStringValue("a", KeyAId_)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - EXPECT_FALSE(Writer_->Write(rows)); - - EXPECT_THROW(Writer_->Close() - .Get() - .ThrowOnError(), std::exception); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST_F(TSchemalessWriterForYamredDsvTest, NonStringValues) -{ - Config_->HasSubkey = true; - Config_->KeyColumnNames.emplace_back("key_a"); - Config_->SubkeyColumnNames.emplace_back("key_c"); - CreateStandardWriter(); - - TUnversionedRowBuilder row; - row.AddValue(MakeUnversionedInt64Value(-42, KeyAId_)); - row.AddValue(MakeUnversionedUint64Value(18, KeyCId_)); - row.AddValue(MakeUnversionedBooleanValue(true, KeyBId_)); - row.AddValue(MakeUnversionedDoubleValue(3.14, ValueXId_)); - row.AddValue(MakeUnversionedStringValue("yt", ValueYId_)); - - std::vector<TUnversionedRow> rows = { row.GetRow() }; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString expectedOutput = "-42\t18\tkey_b=true\tvalue_x=3.14\tvalue_y=yt\n"; - TString output = OutputStream_.Str(); - - EXPECT_EQ(expectedOutput, output); -} - -//////////////////////////////////////////////////////////////////////////////// - -TEST_F(TSchemalessWriterForYamredDsvTest, ErasingSubkeyColumnsWhenHasSubkeyIsFalse) -{ - Config_->KeyColumnNames.emplace_back("key_a"); - Config_->SubkeyColumnNames.emplace_back("key_b"); - // Config->HasSubkey = false by default. 
- CreateStandardWriter(); - - TUnversionedRowBuilder row1; - row1.AddValue(MakeUnversionedStringValue("a", KeyAId_)); - row1.AddValue(MakeUnversionedStringValue("b", KeyBId_)); - row1.AddValue(MakeUnversionedStringValue("c", KeyCId_)); - row1.AddValue(MakeUnversionedStringValue("x", ValueXId_)); - - std::vector<TUnversionedRow> rows = {row1.GetRow()}; - - EXPECT_EQ(true, Writer_->Write(rows)); - Writer_->Close() - .Get() - .ThrowOnError(); - - TString expectedOutput = "a\tkey_c=c\tvalue_x=x\n"; - TString output = OutputStream_.Str(); - - EXPECT_EQ(expectedOutput, output); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NFormats diff --git a/yt/yt/client/unittests/yson_helpers.cpp b/yt/yt/client/unittests/yson_helpers.cpp deleted file mode 100644 index 669585caf7..0000000000 --- a/yt/yt/client/unittests/yson_helpers.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include "yson_helpers.h" - -#include <yt/yt/core/ytree/convert.h> -#include <yt/yt/core/ytree/node.h> -#include <yt/yt/core/yson/string.h> - -namespace NYT { - -using namespace NYson; -using namespace NYTree; - -//////////////////////////////////////////////////////////////////////////////// - -TString CanonizeYson(TStringBuf input) -{ - auto node = ConvertToNode(TYsonString(input)); - auto binaryYson = ConvertToYsonString(node); - - TStringStream out; - { - TYsonWriter writer(&out, NYson::EYsonFormat::Pretty); - ParseYsonStringBuffer(binaryYson.AsStringBuf(), EYsonType::Node, &writer); - } - return out.Str(); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT diff --git a/yt/yt/library/column_converters/boolean_column_converter.cpp b/yt/yt/library/column_converters/boolean_column_converter.cpp new file mode 100644 index 0000000000..37e27bc56c --- /dev/null +++ b/yt/yt/library/column_converters/boolean_column_converter.cpp @@ -0,0 +1,100 @@ +#include "boolean_column_converter.h" + 
+#include "helpers.h" + +#include <yt/yt/client/table_client/schema.h> +#include <yt/yt/client/table_client/unversioned_row.h> + +namespace NYT::NColumnConverters { + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +void FillColumnarBooleanValues( + TBatchColumn* column, + i64 startIndex, + i64 valueCount, + TRef bitmap) +{ + column->StartIndex = startIndex; + column->ValueCount = valueCount; + + auto& values = column->Values.emplace(); + values.BitWidth = 1; + values.Data = bitmap; +} + +//////////////////////////////////////////////////////////////////////////////// + +class TBooleanColumnConverter + : public IColumnConverter +{ +public: + TBooleanColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema) + : ColumnIndex_(columnIndex) + , ColumnSchema_(columnSchema) + { } + + TConvertedColumn Convert(const std::vector<TUnversionedRowValues>& rowsValues) override + { + Reset(); + AddValues(rowsValues); + + auto column = std::make_shared<TBatchColumn>(); + auto nullBitmapRef = NullBitmap_.Flush<TConverterTag>(); + auto valuesRef = Values_.Flush<TConverterTag>(); + + FillColumnarBooleanValues(column.get(), 0, rowsValues.size(), valuesRef); + FillColumnarNullBitmap(column.get(), 0, rowsValues.size(), nullBitmapRef); + + column->Type = ColumnSchema_.LogicalType(); + column->Id = ColumnIndex_; + + TOwningColumn owner = { + .Column = std::move(column), + .NullBitmap = std::move(nullBitmapRef), + .ValueBuffer = std::move(valuesRef), + }; + + return {{owner}, owner.Column.get()}; + } + + +private: + const int ColumnIndex_; + const NTableClient::TColumnSchema ColumnSchema_; + + TBitmapOutput Values_; + TBitmapOutput NullBitmap_; + + void Reset() + { + Values_.Flush<TConverterTag>(); + NullBitmap_.Flush<TConverterTag>(); + } + + void AddValues(const std::vector<TUnversionedRowValues>& rowsValues) + { + for (auto rowValues : rowsValues) { + auto value = rowValues[ColumnIndex_]; + bool isNull = value == 
nullptr || value->Type == NTableClient::EValueType::Null; + bool data = isNull ? false : value->Data.Boolean; + NullBitmap_.Append(isNull); + Values_.Append(data); + } + } +}; + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +IColumnConverterPtr CreateBooleanColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema) +{ + return std::make_unique<TBooleanColumnConverter>(columnIndex, columnSchema); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/boolean_column_converter.h b/yt/yt/library/column_converters/boolean_column_converter.h new file mode 100644 index 0000000000..0495c4a188 --- /dev/null +++ b/yt/yt/library/column_converters/boolean_column_converter.h @@ -0,0 +1,15 @@ +#pragma once + +#include "column_converter.h" + +#include <yt/yt/client/table_client/public.h> + +namespace NYT::NColumnConverters { + +//////////////////////////////////////////////////////////////////////////////// + +IColumnConverterPtr CreateBooleanColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/column_converter.cpp b/yt/yt/library/column_converters/column_converter.cpp new file mode 100644 index 0000000000..21c9982549 --- /dev/null +++ b/yt/yt/library/column_converters/column_converter.cpp @@ -0,0 +1,91 @@ +#include "column_converter.h" + +#include "boolean_column_converter.h" +#include "floating_point_column_converter.h" +#include "integer_column_converter.h" +#include "null_column_converter.h" +#include "string_column_converter.h" + +#include <yt/yt/client/table_client/row_base.h> +#include <yt/yt/client/table_client/schema.h> +#include <yt/yt/client/table_client/unversioned_row.h> + 
+namespace NYT::NColumnConverters { + +using namespace NTableClient; + +//////////////////////////////////////////////////////////////////////////////// + +IColumnConverterPtr CreateColumnConvert( + const NTableClient::TColumnSchema& columnSchema, + int columnIndex) +{ + switch (columnSchema.GetWireType()) { + case EValueType::Int64: + return CreateInt64ColumnConverter(columnIndex, columnSchema); + + case EValueType::Uint64: + return CreateUint64ColumnConverter(columnIndex, columnSchema); + + case EValueType::Double: + switch (columnSchema.CastToV1Type()) { + case NTableClient::ESimpleLogicalValueType::Float: + return CreateFloatingPoint32ColumnConverter(columnIndex, columnSchema); + default: + return CreateFloatingPoint64ColumnConverter(columnIndex, columnSchema); + } + + case EValueType::String: + return CreateStringConverter(columnIndex, columnSchema); + + case EValueType::Boolean: + return CreateBooleanColumnConverter(columnIndex, columnSchema); + + case EValueType::Any: + return CreateAnyConverter(columnIndex, columnSchema); + + case EValueType::Composite: + return CreateCompositeConverter(columnIndex, columnSchema); + + case EValueType::Null: + return CreateNullConverter(columnIndex); + + case EValueType::Min: + case EValueType::TheBottom: + case EValueType::Max: + break; + } + ThrowUnexpectedValueType(columnSchema.GetWireType()); +} + +//////////////////////////////////////////////////////////////////////////////// + + +TConvertedColumnRange ConvertRowsToColumns( + TRange<TUnversionedRow> rows, + const std::vector<TColumnSchema>& columnSchema) +{ + TConvertedColumnRange convertedColumnsRange; + std::vector<TUnversionedRowValues> rowsValues; + rowsValues.reserve(rows.size()); + + for (const auto& row : rows) { + TUnversionedRowValues rowValues; + rowValues.resize(columnSchema.size(), nullptr); + for (const auto* item = row.Begin(); item != row.End(); ++item) { + rowValues[item->Id] = item; + } + rowsValues.push_back(std::move(rowValues)); + } + + for (int 
columnId = 0; columnId < std::ssize(columnSchema); columnId++) { + auto converter = CreateColumnConvert(columnSchema[columnId], columnId); + auto columns = converter->Convert(rowsValues); + convertedColumnsRange.push_back(columns); + } + return convertedColumnsRange; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/column_converter.h b/yt/yt/library/column_converters/column_converter.h new file mode 100644 index 0000000000..64cec2fd44 --- /dev/null +++ b/yt/yt/library/column_converters/column_converter.h @@ -0,0 +1,54 @@ +#pragma once + +#include <yt/yt/client/table_client/row_batch.h> + +#include <yt/yt/core/misc/bitmap.h> + +#include <library/cpp/yt/memory/ref.h> + +namespace NYT::NColumnConverters { + +//////////////////////////////////////////////////////////////////////////////// + +using TBatchColumn = NTableClient::IUnversionedColumnarRowBatch::TColumn; +using TBatchColumnPtr = std::shared_ptr<TBatchColumn>; +using TUnversionedRowValues = std::vector<const NTableClient::TUnversionedValue*>; + +//////////////////////////////////////////////////////////////////////////////// + +struct TOwningColumn +{ + TBatchColumnPtr Column; + TSharedRef NullBitmap; + TSharedRef ValueBuffer; + TSharedRef StringBuffer; +}; + +struct TConvertedColumn +{ + std::vector<TOwningColumn> Columns; + TBatchColumn* RootColumn; +}; + +using TConvertedColumnRange = std::vector<TConvertedColumn>; + +//////////////////////////////////////////////////////////////////////////////// + +struct IColumnConverter + : private TNonCopyable +{ + virtual ~IColumnConverter() = default; + virtual TConvertedColumn Convert(const std::vector<TUnversionedRowValues>& rowsValues) = 0; +}; + +using IColumnConverterPtr = std::unique_ptr<IColumnConverter>; + +//////////////////////////////////////////////////////////////////////////////// + +TConvertedColumnRange ConvertRowsToColumns( + 
TRange<NTableClient::TUnversionedRow> rows, + const std::vector<NTableClient::TColumnSchema>& columnSchema); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/floating_point_column_converter.cpp b/yt/yt/library/column_converters/floating_point_column_converter.cpp new file mode 100644 index 0000000000..bc18a53f14 --- /dev/null +++ b/yt/yt/library/column_converters/floating_point_column_converter.cpp @@ -0,0 +1,135 @@ +#include "floating_point_column_converter.h" + +#include "helpers.h" + +#include <yt/yt/client/table_client/schema.h> +#include <yt/yt/client/table_client/unversioned_row.h> + +namespace NYT::NColumnConverters { + +using namespace NProto; +using namespace NTableClient; + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +template <typename T> +void FillColumnarFloatingPointValues( + NTableClient::IUnversionedColumnarRowBatch::TColumn* column, + i64 startIndex, + i64 valueCount, + TRef data) +{ + column->StartIndex = startIndex; + column->ValueCount = valueCount; + + auto& values = column->Values.emplace(); + values.BitWidth = sizeof(T) * 8; + values.Data = data; +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +TSharedRef SerializeFloatingPointVector(const std::vector<T>& values) +{ + auto data = TSharedMutableRef::Allocate<TConverterTag>(values.size() * sizeof(T) + sizeof(ui64), {.InitializeStorage = false}); + *reinterpret_cast<ui64*>(data.Begin()) = static_cast<ui64>(values.size()); + std::memcpy( + data.Begin() + sizeof(ui64), + values.data(), + values.size() * sizeof(T)); + return data; +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class TValue, NTableClient::EValueType ValueType> +class TFloatingPointColumnConverter + : public IColumnConverter +{ +public: + 
static_assert(std::is_floating_point_v<TValue>); + + TFloatingPointColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema) + : ColumnIndex_(columnIndex) + , ColumnSchema_(columnSchema) + + { } + + TConvertedColumn Convert(const std::vector<TUnversionedRowValues>& rowsValues) + { + Reset(); + AddValues(rowsValues); + auto nullBitmapRef = NullBitmap_.Flush<TConverterTag>(); + auto valuesRef = TSharedRef::MakeCopy<TConverterTag>(TRef(Values_.data(), sizeof(TValue) * Values_.size())); + + auto column = std::make_shared<TBatchColumn>(); + + FillColumnarFloatingPointValues<TValue>( + column.get(), + 0, + rowsValues.size(), + valuesRef); + + FillColumnarNullBitmap( + column.get(), + 0, + rowsValues.size(), + nullBitmapRef); + + column->Type = ColumnSchema_.LogicalType(); + column->Id = ColumnIndex_; + + TOwningColumn owner = { + .Column = std::move(column), + .NullBitmap = std::move(nullBitmapRef), + .ValueBuffer = std::move(valuesRef), + }; + + return {{owner}, owner.Column.get()}; + } + +private: + const int ColumnIndex_; + const TColumnSchema ColumnSchema_; + + std::vector<TValue> Values_; + TBitmapOutput NullBitmap_; + + void Reset() + { + Values_.clear(); + NullBitmap_.Flush<TConverterTag>(); + } + + void AddValues(const std::vector<TUnversionedRowValues>& rowsValues) + { + for (auto rowValues : rowsValues) { + auto value = rowValues[ColumnIndex_]; + bool isNull = value == nullptr || value->Type == NTableClient::EValueType::Null; + TValue data = isNull ? 
0 : value->Data.Double; + NullBitmap_.Append(isNull); + Values_.push_back(data); + } + } +}; + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +IColumnConverterPtr CreateFloatingPoint32ColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema) +{ + return std::make_unique<TFloatingPointColumnConverter<float, NTableClient::EValueType::Double>>(columnIndex, columnSchema); +} + +IColumnConverterPtr CreateFloatingPoint64ColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema) +{ + return std::make_unique<TFloatingPointColumnConverter<double, NTableClient::EValueType::Double>>(columnIndex, columnSchema); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/floating_point_column_converter.h b/yt/yt/library/column_converters/floating_point_column_converter.h new file mode 100644 index 0000000000..3739d4e729 --- /dev/null +++ b/yt/yt/library/column_converters/floating_point_column_converter.h @@ -0,0 +1,15 @@ +#pragma once + +#include "column_converter.h" + +namespace NYT::NColumnConverters { + +//////////////////////////////////////////////////////////////////////////////// + +IColumnConverterPtr CreateFloatingPoint32ColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema); + +IColumnConverterPtr CreateFloatingPoint64ColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/helpers.cpp b/yt/yt/library/column_converters/helpers.cpp new file mode 100644 index 0000000000..cddac06d79 --- /dev/null +++ b/yt/yt/library/column_converters/helpers.cpp @@ -0,0 +1,59 @@ +#include "helpers.h" + +#include <yt/yt/client/table_client/columnar.h> 
+#include <yt/yt/client/table_client/logical_type.h> +#include <yt/yt/client/table_client/schema.h> +#include <yt/yt/client/table_client/unversioned_row.h> + +#include <yt/yt/core/misc/bitmap.h> + +namespace NYT::NColumnConverters { + +using namespace NProto; +using namespace NTableClient; + +//////////////////////////////////////////////////////////////////////////////// + +void FillColumnarNullBitmap( + NTableClient::IUnversionedColumnarRowBatch::TColumn* column, + i64 startIndex, + i64 valueCount, + TRef bitmap) +{ + column->StartIndex = startIndex; + column->ValueCount = valueCount; + + auto& nullBitmap = column->NullBitmap.emplace(); + nullBitmap.Data = bitmap; +} + + +void FillColumnarDictionary( + NTableClient::IUnversionedColumnarRowBatch::TColumn* primaryColumn, + NTableClient::IUnversionedColumnarRowBatch::TColumn* dictionaryColumn, + NTableClient::IUnversionedColumnarRowBatch::TDictionaryId dictionaryId, + NTableClient::TLogicalTypePtr type, + i64 startIndex, + i64 valueCount, + TRef ids) +{ + primaryColumn->StartIndex = startIndex; + primaryColumn->ValueCount = valueCount; + + dictionaryColumn->Type = type && type->GetMetatype() == ELogicalMetatype::Optional + ? 
type->AsOptionalTypeRef().GetElement() + : type; + + auto& primaryValues = primaryColumn->Values.emplace(); + primaryValues.BitWidth = 32; + primaryValues.Data = ids; + + auto& dictionary = primaryColumn->Dictionary.emplace(); + dictionary.DictionaryId = dictionaryId; + dictionary.ZeroMeansNull = true; + dictionary.ValueColumn = dictionaryColumn; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/helpers.h b/yt/yt/library/column_converters/helpers.h new file mode 100644 index 0000000000..6957ff13c1 --- /dev/null +++ b/yt/yt/library/column_converters/helpers.h @@ -0,0 +1,39 @@ +#pragma once + +#include <yt/yt/client/table_client/row_batch.h> +#include <yt/yt/client/table_client/schema.h> + +#include <yt/yt/core/misc/common.h> + +namespace NYT::NColumnConverters { + +//////////////////////////////////////////////////////////////////////////////// + +void FillColumnarNullBitmap( + NTableClient::IUnversionedColumnarRowBatch::TColumn* column, + i64 startIndex, + i64 valueCount, + TRef bitmap); + +void FillColumnarDictionary( + NTableClient::IUnversionedColumnarRowBatch::TColumn* primaryColumn, + NTableClient::IUnversionedColumnarRowBatch::TColumn* dictionaryColumn, + NTableClient::IUnversionedColumnarRowBatch::TDictionaryId dictionaryId, + NTableClient::TLogicalTypePtr type, + i64 startIndex, + i64 valueCount, + TRef ids); + +//////////////////////////////////////////////////////////////////////////////// + +DEFINE_ENUM(EUnversionedStringSegmentType, + ((DictionaryDense) (0)) + ((DirectDense) (1)) +); + +struct TConverterTag +{}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/integer_column_converter.cpp b/yt/yt/library/column_converters/integer_column_converter.cpp new file mode 100644 index 0000000000..862c23e5b7 --- /dev/null 
+++ b/yt/yt/library/column_converters/integer_column_converter.cpp @@ -0,0 +1,175 @@ +#include "integer_column_converter.h" + +#include "helpers.h" + +#include <yt/yt/client/table_client/schema.h> +#include <yt/yt/client/table_client/unversioned_row.h> + +#include <library/cpp/yt/coding/zig_zag.h> + +namespace NYT::NColumnConverters { + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +ui64 EncodeValue(i64 value) +{ + return ZigZagEncode64(value); +} + +ui64 EncodeValue(ui64 value) +{ + return value; +} + +template <class TValue> +typename std::enable_if<std::is_signed<TValue>::value, TValue>::type +GetValue(const NTableClient::TUnversionedValue& value) +{ + return value.Data.Int64; +} + +template <class TValue> +typename std::enable_if<std::is_unsigned<TValue>::value, TValue>::type +GetValue(const NTableClient::TUnversionedValue& value) +{ + return value.Data.Uint64; +} + +//////////////////////////////////////////////////////////////////////////////// + +void FillColumnarIntegerValues( + NTableClient::IUnversionedColumnarRowBatch::TColumn* column, + i64 startIndex, + i64 valueCount, + NTableClient::EValueType valueType, + ui64 baseValue, + TRef data) +{ + column->StartIndex = startIndex; + column->ValueCount = valueCount; + + auto& values = column->Values.emplace(); + values.BaseValue = baseValue; + values.BitWidth = 64; + values.ZigZagEncoded = (valueType == NTableClient::EValueType::Int64); + values.Data = data; +} + +//////////////////////////////////////////////////////////////////////////////// + +// TValue - i64 or ui64. 
+template <class TValue> +class TIntegerColumnConverter + : public IColumnConverter +{ +public: + static_assert(std::is_integral_v<TValue>); + + TIntegerColumnConverter( + int columnIndex, + NTableClient::EValueType ValueType, + NTableClient::TColumnSchema columnSchema) + : ColumnIndex_(columnIndex) + , ColumnSchema_(columnSchema) + , ValueType_(ValueType) + { } + + TConvertedColumn Convert(const std::vector<TUnversionedRowValues>& rowsValues) override + { + Reset(); + AddValues(rowsValues); + for (i64 index = 0; index < std::ssize(Values_); ++index) { + if (!NullBitmap_[index]) { + Values_[index] -= MinValue_; + } + } + + auto nullBitmapRef = NullBitmap_.Flush<TConverterTag>(); + auto valuesRef = TSharedRef::MakeCopy<TConverterTag>(TRef(Values_.data(), sizeof(ui64) * Values_.size())); + auto column = std::make_shared<TBatchColumn>(); + + FillColumnarIntegerValues( + column.get(), + 0, + RowCount_, + ValueType_, + MinValue_, + valuesRef); + + FillColumnarNullBitmap( + column.get(), + 0, + RowCount_, + nullBitmapRef); + + column->Type = ColumnSchema_.LogicalType(); + column->Id = ColumnIndex_; + + TOwningColumn owner = { + .Column = std::move(column), + .NullBitmap = std::move(nullBitmapRef), + .ValueBuffer = std::move(valuesRef), + }; + + return {{owner}, owner.Column.get()}; + } + + +private: + const int ColumnIndex_; + const NTableClient::TColumnSchema ColumnSchema_; + const NTableClient::EValueType ValueType_; + + i64 RowCount_ = 0; + TBitmapOutput NullBitmap_; + std::vector<ui64> Values_; + + ui64 MaxValue_; + ui64 MinValue_; + + void Reset() + { + Values_.clear(); + RowCount_ = 0; + MaxValue_ = 0; + MinValue_ = std::numeric_limits<ui64>::max(); + NullBitmap_.Flush<TConverterTag>(); + } + + void AddValues(const std::vector<TUnversionedRowValues>& rowsValues) + { + for (auto rowValues : rowsValues) { + auto value = rowValues[ColumnIndex_]; + bool isNull = value == nullptr || value->Type == NTableClient::EValueType::Null; + ui64 data = 0; + if (!isNull) { + 
YT_VERIFY(value != nullptr); + data = EncodeValue(GetValue<TValue>(*value)); + } + Values_.push_back(data); + NullBitmap_.Append(isNull); + ++RowCount_; + } + } +}; + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +IColumnConverterPtr CreateInt64ColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema) +{ + return std::make_unique<TIntegerColumnConverter<i64>>(columnIndex, NTableClient::EValueType::Int64, columnSchema); +} + + +IColumnConverterPtr CreateUint64ColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema) +{ + return std::make_unique<TIntegerColumnConverter<ui64>>(columnIndex, NTableClient::EValueType::Uint64, columnSchema); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/integer_column_converter.h b/yt/yt/library/column_converters/integer_column_converter.h new file mode 100644 index 0000000000..99b9d86342 --- /dev/null +++ b/yt/yt/library/column_converters/integer_column_converter.h @@ -0,0 +1,17 @@ +#pragma once + +#include "column_converter.h" + +#include <yt/yt/client/table_client/public.h> + +namespace NYT::NColumnConverters { + +//////////////////////////////////////////////////////////////////////////////// + +IColumnConverterPtr CreateInt64ColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema); + +std::unique_ptr<IColumnConverter> CreateUint64ColumnConverter(int columnIndex, const NTableClient::TColumnSchema& columnSchema); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/null_column_converter.cpp b/yt/yt/library/column_converters/null_column_converter.cpp new file mode 100644 index 0000000000..d07ab24ceb --- /dev/null +++ 
b/yt/yt/library/column_converters/null_column_converter.cpp @@ -0,0 +1,49 @@ +#include "null_column_converter.h" + +#include <yt/yt/client/table_client/logical_type.h> + +namespace NYT::NColumnConverters { + +using namespace NTableClient; + +//////////////////////////////////////////////////////////////////////////////// + +class TNullColumnWriterConverter + : public IColumnConverter +{ +public: + TNullColumnWriterConverter(int columnIndex) + : ColumnIndex_(columnIndex) + { } + + TConvertedColumn Convert(const std::vector<TUnversionedRowValues>& rowsValues) override + { + auto rowCount = rowsValues.size(); + + auto column = std::make_shared<TBatchColumn>(); + + column->Id = ColumnIndex_; + column->Type = SimpleLogicalType(ESimpleLogicalValueType::Null); + column->ValueCount = rowCount; + + TOwningColumn owner = { + .Column = std::move(column), + }; + + return {{owner}, owner.Column.get()}; + } + +private: + const int ColumnIndex_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +IColumnConverterPtr CreateNullConverter(int columnIndex) +{ + return std::make_unique<TNullColumnWriterConverter>(columnIndex); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/client/unittests/yson_helpers.h b/yt/yt/library/column_converters/null_column_converter.h index 3cab460345..a8f97c84a1 100644 --- a/yt/yt/client/unittests/yson_helpers.h +++ b/yt/yt/library/column_converters/null_column_converter.h @@ -1,13 +1,13 @@ #pragma once -#include <util/generic/string.h> +#include "column_converter.h" -namespace NYT { +namespace NYT::NColumnConverters { //////////////////////////////////////////////////////////////////////////////// -TString CanonizeYson(TStringBuf yson); +IColumnConverterPtr CreateNullConverter(int columnIndex); //////////////////////////////////////////////////////////////////////////////// -} // namespace NYT
\ No newline at end of file +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/string_column_converter.cpp b/yt/yt/library/column_converters/string_column_converter.cpp new file mode 100644 index 0000000000..c8a4354c47 --- /dev/null +++ b/yt/yt/library/column_converters/string_column_converter.cpp @@ -0,0 +1,375 @@ +#include "string_column_converter.h" + +#include "helpers.h" + +#include <yt/yt/client/table_client/schema.h> +#include <yt/yt/client/table_client/unversioned_row.h> + +#include <yt/yt/core/misc/bit_packed_unsigned_vector.h> + +#include <library/cpp/yt/string/string_builder.h> + +namespace NYT::NColumnConverters { + +using namespace NTableClient; + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +void FillColumnarStringValues( + NTableClient::IUnversionedColumnarRowBatch::TColumn* column, + i64 startIndex, + i64 valueCount, + ui32 avgLength, + TRef offsets, + TRef stringData) +{ + column->StartIndex = startIndex; + column->ValueCount = valueCount; + + auto& values = column->Values.emplace(); + values.BitWidth = 32; + values.ZigZagEncoded = true; + values.Data = offsets; + + auto& strings = column->Strings.emplace(); + strings.AvgLength = avgLength; + strings.Data = stringData; +} + +bool IsValueNull(TStringBuf lhs) +{ + return !lhs.data(); +} + +//////////////////////////////////////////////////////////////////////////////// + + +template <EValueType ValueType> +class TStringConverter + : public IColumnConverter +{ +public: + TStringConverter( + int columnIndex, + const TColumnSchema& columnSchema) + : ColumnIndex_(columnIndex) + , ColumnSchema_(columnSchema) + { } + + TConvertedColumn Convert(const std::vector<TUnversionedRowValues>& rowsValues) override + { + Reset(); + AddValues(rowsValues); + return GetColumns(); + } + +private: + const int ColumnIndex_; + const TColumnSchema ColumnSchema_; + + ui32 RowCount_ = 0; + ui64 AllStringsSize_ = 0; + ui64 
DictionaryByteSize_ = 0; + + std::vector<TStringBuf> Values_; + THashMap<TStringBuf, ui32> Dictionary_; + TStringBuilder DirectBuffer_; + + void Reset() + { + AllStringsSize_ = 0; + RowCount_ = 0; + DictionaryByteSize_ = 0; + + DirectBuffer_.Reset(); + Values_.clear(); + Dictionary_.clear(); + } + + TSharedRef GetDirectDenseNullBitmap() const + { + TBitmapOutput nullBitmap(Values_.size()); + + for (auto value : Values_) { + nullBitmap.Append(IsValueNull(value)); + } + + return nullBitmap.Flush<TConverterTag>(); + } + + std::vector<ui32> GetDirectDenseOffsets() const + { + std::vector<ui32> offsets; + offsets.reserve(Values_.size()); + + ui32 offset = 0; + for (auto value : Values_) { + offset += value.length(); + offsets.push_back(offset); + } + + return offsets; + } + + TConvertedColumn GetDirectColumn(TSharedRef nullBitmap) + { + auto offsets = GetDirectDenseOffsets(); + + // Save offsets as diff from expected. + ui32 expectedLength; + ui32 maxDiff; + PrepareDiffFromExpected(&offsets, &expectedLength, &maxDiff); + + auto directData = DirectBuffer_.GetBuffer(); + + auto offsetsRef = TSharedRef::MakeCopy<TConverterTag>(TRef(offsets.data(), sizeof(ui32) * offsets.size())); + auto directDataPtr = TSharedRef::MakeCopy<TConverterTag>(TRef(directData.data(), directData.size())); + auto column = std::make_shared<TBatchColumn>(); + + FillColumnarStringValues( + column.get(), + 0, + RowCount_, + expectedLength, + TRef(offsetsRef), + TRef(directDataPtr)); + + FillColumnarNullBitmap( + column.get(), + 0, + RowCount_, + TRef(nullBitmap)); + + column->Type = ColumnSchema_.LogicalType(); + column->Id = ColumnIndex_; + + TOwningColumn owner = { + .Column = std::move(column), + .NullBitmap = std::move(nullBitmap), + .ValueBuffer = std::move(offsetsRef), + .StringBuffer = std::move(directDataPtr), + }; + return {{owner}, owner.Column.get()}; + } + + TConvertedColumn GetDictionaryColumn() + { + auto dictionaryData = TSharedMutableRef::Allocate<TConverterTag>(DictionaryByteSize_, 
{.InitializeStorage = false}); + + std::vector<ui32> dictionaryOffsets; + dictionaryOffsets.reserve(Dictionary_.size()); + + std::vector<ui32> ids; + ids.reserve(Values_.size()); + + ui32 dictionarySize = 0; + ui32 dictionaryOffset = 0; + for (auto value : Values_) { + if (IsValueNull(value)) { + ids.push_back(0); + continue; + } + + ui32 id = GetOrCrash(Dictionary_, value); + ids.push_back(id); + + if (id > dictionarySize) { + std::memcpy( + dictionaryData.Begin() + dictionaryOffset, + value.data(), + value.length()); + dictionaryOffset += value.length(); + dictionaryOffsets.push_back(dictionaryOffset); + ++dictionarySize; + } + } + + YT_VERIFY(dictionaryOffset == DictionaryByteSize_); + + // 1. Value ids. + auto idsRef = TSharedRef::MakeCopy<TConverterTag>(TRef(ids.data(), sizeof(ui32) * ids.size())); + + // 2. Dictionary offsets. + ui32 expectedLength; + ui32 maxDiff; + PrepareDiffFromExpected(&dictionaryOffsets, &expectedLength, &maxDiff); + auto dictionaryOffsetsRef = TSharedRef::MakeCopy<TConverterTag>(TRef(dictionaryOffsets.data(), sizeof(ui32) * dictionaryOffsets.size())); + + auto primaryColumn = std::make_shared<TBatchColumn>(); + auto dictionaryColumn = std::make_shared<TBatchColumn>(); + + FillColumnarStringValues( + dictionaryColumn.get(), + 0, + dictionaryOffsets.size(), + expectedLength, + TRef(dictionaryOffsetsRef), + dictionaryData); + + FillColumnarDictionary( + primaryColumn.get(), + dictionaryColumn.get(), + NTableClient::IUnversionedColumnarRowBatch::GenerateDictionaryId(), + primaryColumn->Type, + 0, + RowCount_, + idsRef); + + dictionaryColumn->Type = ColumnSchema_.LogicalType(); + primaryColumn->Type = ColumnSchema_.LogicalType(); + primaryColumn->Id = ColumnIndex_; + + TOwningColumn dictOwner = { + .Column = std::move(dictionaryColumn), + .ValueBuffer = std::move(dictionaryOffsetsRef), + .StringBuffer = std::move(dictionaryData), + }; + + TOwningColumn primeOwner = { + .Column = std::move(primaryColumn), + .ValueBuffer = std::move(idsRef), 
+ }; + + return {{primeOwner, dictOwner}, primeOwner.Column.get()}; + } + + TConvertedColumn GetColumns() + { + auto costs = GetEncodingMethodsCosts(); + + auto minElement = std::min_element(costs.begin(), costs.end()); + auto type = EUnversionedStringSegmentType(std::distance(costs.begin(), minElement)); + + switch (type) { + + case EUnversionedStringSegmentType::DirectDense: + return GetDirectColumn(GetDirectDenseNullBitmap()); + + case EUnversionedStringSegmentType::DictionaryDense: + return GetDictionaryColumn(); + + default: + YT_ABORT(); + } + } + + TEnumIndexedVector<EUnversionedStringSegmentType, ui64> GetEncodingMethodsCosts() const + { + TEnumIndexedVector<EUnversionedStringSegmentType, ui64> costs; + for (auto type : TEnumTraits<EUnversionedStringSegmentType>::GetDomainValues()) { + costs[type] = GetSpecificEncodingMethodCosts(type); + } + return costs; + } + + ui64 GetSpecificEncodingMethodCosts(EUnversionedStringSegmentType type) const + { + switch (type) { + case EUnversionedStringSegmentType::DictionaryDense: + return GetDictionaryByteSize(); + + case EUnversionedStringSegmentType::DirectDense: + return GetDirectByteSize(); + + default: + YT_ABORT(); + } + } + + void AddValues(const std::vector<TUnversionedRowValues>& rowsValues) + { + for (auto rowValues : rowsValues) { + auto unversionedValue = rowValues[ColumnIndex_]; + YT_VERIFY(unversionedValue != nullptr); + auto value = CaptureValue(*unversionedValue); + Values_.push_back(value); + ++RowCount_; + } + } + + ui64 GetDirectByteSize() const + { + return AllStringsSize_; + } + + ui64 GetDictionaryByteSize() const + { + return DictionaryByteSize_ + Values_.size() * sizeof(ui32); + } + + + TStringBuf CaptureValue(const TUnversionedValue& unversionedValue) + { + if (unversionedValue.Type == EValueType::Null) { + return {}; + } + + auto valueCapacity = IsAnyOrComposite(ValueType) && !IsAnyOrComposite(unversionedValue.Type) + ? 
GetYsonSize(unversionedValue) + : static_cast<i64>(unversionedValue.Length); + + char* buffer = DirectBuffer_.Preallocate(valueCapacity); + if (!buffer) { + // This means, that we reserved nothing, because all strings are either null or empty. + // To distinguish between null and empty, we set preallocated pointer to special value. + static char* const EmptyStringBase = reinterpret_cast<char*>(1); + buffer = EmptyStringBase; + } + + auto start = buffer; + + if (IsAnyOrComposite(ValueType) && !IsAnyOrComposite(unversionedValue.Type)) { + // Any non-any and non-null value convert to YSON. + buffer += WriteYson(buffer, unversionedValue); + } else { + std::memcpy( + buffer, + unversionedValue.Data.String, + unversionedValue.Length); + buffer += unversionedValue.Length; + } + + auto value = TStringBuf(start, buffer); + + YT_VERIFY(value.size() <= valueCapacity); + + DirectBuffer_.Advance(value.size()); + + if (Dictionary_.emplace(value, Dictionary_.size() + 1).second) { + DictionaryByteSize_ += value.size(); + } + AllStringsSize_ += value.size(); + return value; + } +}; + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +IColumnConverterPtr CreateStringConverter( + int columnIndex, + const NTableClient::TColumnSchema& columnSchema) +{ + return std::make_unique<TStringConverter<EValueType::String>>(columnIndex, columnSchema); +} + +IColumnConverterPtr CreateAnyConverter( + int columnIndex, + const NTableClient::TColumnSchema& columnSchema) +{ + return std::make_unique<TStringConverter<EValueType::Any>>(columnIndex, columnSchema); +} + +IColumnConverterPtr CreateCompositeConverter( + int columnIndex, + const NTableClient::TColumnSchema& columnSchema) +{ + return std::make_unique<TStringConverter<EValueType::Composite>>(columnIndex, columnSchema); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git 
a/yt/yt/library/column_converters/string_column_converter.h b/yt/yt/library/column_converters/string_column_converter.h new file mode 100644 index 0000000000..b9c3d2bdf7 --- /dev/null +++ b/yt/yt/library/column_converters/string_column_converter.h @@ -0,0 +1,25 @@ +#pragma once + +#include "column_converter.h" + +#include <yt/yt/client/table_client/public.h> + +namespace NYT::NColumnConverters { + +//////////////////////////////////////////////////////////////////////////////// + +IColumnConverterPtr CreateStringConverter( + int columnIndex, + const NTableClient::TColumnSchema& columnSchema); + +IColumnConverterPtr CreateAnyConverter( + int columnIndex, + const NTableClient::TColumnSchema& columnSchema); + +IColumnConverterPtr CreateCompositeConverter( + int columnIndex, + const NTableClient::TColumnSchema& columnSchema); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NColumnConverters diff --git a/yt/yt/library/column_converters/ya.make b/yt/yt/library/column_converters/ya.make new file mode 100644 index 0000000000..55cd9f86c0 --- /dev/null +++ b/yt/yt/library/column_converters/ya.make @@ -0,0 +1,19 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + boolean_column_converter.cpp + column_converter.cpp + floating_point_column_converter.cpp + helpers.cpp + integer_column_converter.cpp + null_column_converter.cpp + string_column_converter.cpp +) + +PEERDIR( + yt/yt/core +) + +END() diff --git a/yt/yt/library/formats/arrow_writer.cpp b/yt/yt/library/formats/arrow_writer.cpp new file mode 100644 index 0000000000..b93e54cf31 --- /dev/null +++ b/yt/yt/library/formats/arrow_writer.cpp @@ -0,0 +1,1065 @@ +#include "arrow_writer.h" + +#include <yt/yt/client/arrow/fbs/Message.fbs.h> +#include <yt/yt/client/arrow/fbs/Schema.fbs.h> + +#include <yt/yt/client/formats/public.h> +#include <yt/yt/library/formats/schemaless_writer_adapter.h> + +#include <yt/yt/client/table_client/columnar.h> +#include 
<yt/yt/client/table_client/logical_type.h> +#include <yt/yt/client/table_client/name_table.h> +#include <yt/yt/client/table_client/public.h> +#include <yt/yt/client/table_client/row_batch.h> +#include <yt/yt/client/table_client/schema.h> + +#include <yt/yt/library/column_converters/column_converter.h> + +#include <yt/yt/core/concurrency/async_stream.h> +#include <yt/yt/core/concurrency/public.h> + +#include <yt/yt/core/misc/blob_output.h> +#include <yt/yt/core/misc/error.h> +#include <yt/yt/core/misc/range.h> + +#include <vector> + +namespace NYT::NFormats { + +using namespace NTableClient; +using namespace NComplexTypes; + +static const auto& Logger = FormatsLogger; + +using TBodyWriter = std::function<void(TMutableRef)>; +using TBatchColumn = IUnversionedColumnarRowBatch::TColumn; + +//////////////////////////////////////////////////////////////////////////////// + +struct TTypedBatchColumn +{ + const TBatchColumn* Column; + TLogicalTypePtr Type; +}; + +//////////////////////////////////////////////////////////////////////////////// + +constexpr i64 ArrowAlignment = 8; + +flatbuffers::Offset<flatbuffers::String> SerializeString( + flatbuffers::FlatBufferBuilder* flatbufBuilder, + const TString& str) +{ + return flatbufBuilder->CreateString(str.data(), str.length()); +} + +std::tuple<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void>> SerializeColumnType( + flatbuffers::FlatBufferBuilder* flatbufBuilder, + TColumnSchema schema) +{ + auto simpleType = CastToV1Type(schema.LogicalType()).first; + switch (simpleType) { + case ESimpleLogicalValueType::Null: + return std::make_tuple( + org::apache::arrow::flatbuf::Type_Null, + org::apache::arrow::flatbuf::CreateNull(*flatbufBuilder) + .Union()); + + case ESimpleLogicalValueType::Int64: + case ESimpleLogicalValueType::Uint64: + case ESimpleLogicalValueType::Int8: + case ESimpleLogicalValueType::Uint8: + case ESimpleLogicalValueType::Int16: + case ESimpleLogicalValueType::Uint16: + case 
ESimpleLogicalValueType::Int32: + case ESimpleLogicalValueType::Uint32: + return std::make_tuple( + org::apache::arrow::flatbuf::Type_Int, + org::apache::arrow::flatbuf::CreateInt( + *flatbufBuilder, + GetIntegralTypeBitWidth(simpleType), + IsIntegralTypeSigned(simpleType)) + .Union()); + + case ESimpleLogicalValueType::Double: + return std::make_tuple( + org::apache::arrow::flatbuf::Type_FloatingPoint, + org::apache::arrow::flatbuf::CreateFloatingPoint( + *flatbufBuilder, + org::apache::arrow::flatbuf::Precision_DOUBLE) + .Union()); + + case ESimpleLogicalValueType::Boolean: + return std::make_tuple( + org::apache::arrow::flatbuf::Type_Bool, + org::apache::arrow::flatbuf::CreateBool(*flatbufBuilder) + .Union()); + + case ESimpleLogicalValueType::String: + case ESimpleLogicalValueType::Any: + return std::make_tuple( + org::apache::arrow::flatbuf::Type_Binary, + org::apache::arrow::flatbuf::CreateBinary(*flatbufBuilder) + .Union()); + + case ESimpleLogicalValueType::Utf8: + return std::make_tuple( + org::apache::arrow::flatbuf::Type_Utf8, + org::apache::arrow::flatbuf::CreateUtf8(*flatbufBuilder) + .Union()); + + // TODO(babenko): the following types are not supported: + // Date + // Datetime + // Interval + // Timestamp + + default: + THROW_ERROR_EXCEPTION("Column %v has type %Qlv that is not currently supported by Arrow encoder", + schema.GetDiagnosticNameString(), + simpleType); + } +} + +bool IsRleButNotDictionaryEncodedStringLikeColumn(const TBatchColumn& column) +{ + auto simpleType = CastToV1Type(column.Type).first; + return IsStringLikeType(simpleType) && + column.Rle && + !column.Rle->ValueColumn->Dictionary; +} + +bool IsRleAndDictionaryEncodedColumn(const TBatchColumn& column) +{ + return column.Rle && + column.Rle->ValueColumn->Dictionary; +} + +bool IsDictionaryEncodedColumn(const TBatchColumn& column) +{ + return column.Dictionary || + IsRleAndDictionaryEncodedColumn(column) || + IsRleButNotDictionaryEncodedStringLikeColumn(column); +} + + +struct 
TRecordBatchBodyPart +{ + i64 Size; + TBodyWriter Writer; +}; + +struct TRecordBatchSerializationContext final +{ + explicit TRecordBatchSerializationContext(flatbuffers::FlatBufferBuilder* flatbufBuilder) + : FlatbufBuilder(flatbufBuilder) + {} + + void AddFieldNode(i64 length, i64 nullCount) + { + FieldNodes.emplace_back(length, nullCount); + } + + void AddBuffer(i64 size, TBodyWriter writer) + { + YT_LOG_DEBUG("Buffer registered (Offset: %v, Size: %v)", + CurrentBodyOffset, + size); + + Buffers.emplace_back(CurrentBodyOffset, size); + CurrentBodyOffset += AlignUp<i64>(size, ArrowAlignment); + Parts.push_back(TRecordBatchBodyPart{size, std::move(writer)}); + } + + flatbuffers::FlatBufferBuilder* const FlatbufBuilder; + + i64 CurrentBodyOffset = 0; + std::vector<org::apache::arrow::flatbuf::FieldNode> FieldNodes; + std::vector<org::apache::arrow::flatbuf::Buffer> Buffers; + std::vector<TRecordBatchBodyPart> Parts; +}; + +template <class T> +TMutableRange<T> GetTypedValues(TMutableRef ref) +{ + return MakeMutableRange( + reinterpret_cast<T*>(ref.Begin()), + reinterpret_cast<T*>(ref.End())); +} + +void SerializeColumnPrologue( + const TTypedBatchColumn& typedColumn, + TRecordBatchSerializationContext* context) +{ + const auto* column = typedColumn.Column; + if (column->NullBitmap || + column->Rle && column->Rle->ValueColumn->NullBitmap) + { + if (column->Rle) { + const auto* valueColumn = column->Rle->ValueColumn; + auto rleIndexes = column->GetTypedValues<ui64>(); + + context->AddFieldNode( + column->ValueCount, + CountOnesInRleBitmap( + valueColumn->NullBitmap->Data, + rleIndexes, + column->StartIndex, + column->StartIndex + column->ValueCount)); + + context->AddBuffer( + GetBitmapByteSize(column->ValueCount), + [=] (TMutableRef dstRef) { + BuildValidityBitmapFromRleNullBitmap( + valueColumn->NullBitmap->Data, + rleIndexes, + column->StartIndex, + column->StartIndex + column->ValueCount, + dstRef); + }); + } else { + context->AddFieldNode( + column->ValueCount, + 
CountOnesInBitmap( + column->NullBitmap->Data, + column->StartIndex, + column->StartIndex + column->ValueCount)); + + context->AddBuffer( + GetBitmapByteSize(column->ValueCount), + [=] (TMutableRef dstRef) { + CopyBitmapRangeToBitmapNegated( + column->NullBitmap->Data, + column->StartIndex, + column->StartIndex + column->ValueCount, + dstRef); + }); + } + } else { + context->AddFieldNode( + column->ValueCount, + 0); + + context->AddBuffer( + 0, + [=] (TMutableRef /*dstRef*/) { + }); + } +} + +void SerializeRleButNotDictionaryEncodedStringLikeColumn( + const TTypedBatchColumn& typedColumn, + TRecordBatchSerializationContext* context) +{ + const auto* column = typedColumn.Column; + YT_VERIFY(column->Values); + YT_VERIFY(column->Values->BitWidth == 64); + YT_VERIFY(column->Values->BaseValue == 0); + YT_VERIFY(!column->Values->ZigZagEncoded); + + YT_LOG_DEBUG("Adding RLE but not dictionary-encoded string-like column (ColumnId: %v, StartIndex: %v, ValueCount: %v)", + column->Id, + column->StartIndex, + column->ValueCount); + + SerializeColumnPrologue(typedColumn, context); + + auto rleIndexes = column->GetTypedValues<ui64>(); + + context->AddBuffer( + sizeof(ui32) * column->ValueCount, + [=] (TMutableRef dstRef) { + BuildIotaDictionaryIndexesFromRleIndexes( + rleIndexes, + column->StartIndex, + column->StartIndex + column->ValueCount, + GetTypedValues<ui32>(dstRef)); + }); +} + +void SerializeDictionaryColumn( + const TTypedBatchColumn& typedColumn, + TRecordBatchSerializationContext* context) +{ + const auto* column = typedColumn.Column; + YT_VERIFY(column->Values); + YT_VERIFY(column->Dictionary->ZeroMeansNull); + YT_VERIFY(column->Values->BitWidth == 32); + YT_VERIFY(column->Values->BaseValue == 0); + YT_VERIFY(!column->Values->ZigZagEncoded); + + YT_LOG_DEBUG("Adding dictionary column (ColumnId: %v, StartIndex: %v, ValueCount: %v, Rle: %v)", + column->Id, + column->StartIndex, + column->ValueCount, + column->Rle.has_value()); + + auto relevantDictionaryIndexes = 
column->GetRelevantTypedValues<ui32>(); + + context->AddFieldNode( + column->ValueCount, + CountNullsInDictionaryIndexesWithZeroNull(relevantDictionaryIndexes)); + + context->AddBuffer( + GetBitmapByteSize(column->ValueCount), + [=] (TMutableRef dstRef) { + BuildValidityBitmapFromDictionaryIndexesWithZeroNull( + relevantDictionaryIndexes, + dstRef); + }); + + context->AddBuffer( + sizeof(ui32) * column->ValueCount, + [=] (TMutableRef dstRef) { + BuildDictionaryIndexesFromDictionaryIndexesWithZeroNull( + relevantDictionaryIndexes, + GetTypedValues<ui32>(dstRef)); + }); +} + +void SerializeRleDictionaryColumn( + const TTypedBatchColumn& typedColumn, + TRecordBatchSerializationContext* context) +{ + const auto* column = typedColumn.Column; + YT_VERIFY(column->Values); + YT_VERIFY(column->Values->BitWidth == 64); + YT_VERIFY(column->Values->BaseValue == 0); + YT_VERIFY(!column->Values->ZigZagEncoded); + YT_VERIFY(column->Rle->ValueColumn->Dictionary->ZeroMeansNull); + YT_VERIFY(column->Rle->ValueColumn->Values->BitWidth == 32); + YT_VERIFY(column->Rle->ValueColumn->Values->BaseValue == 0); + YT_VERIFY(!column->Rle->ValueColumn->Values->ZigZagEncoded); + + YT_LOG_DEBUG("Adding dictionary column (ColumnId: %v, StartIndex: %v, ValueCount: %v, Rle: %v)", + column->Id, + column->StartIndex, + column->ValueCount, + column->Rle.has_value()); + + auto dictionaryIndexes = column->Rle->ValueColumn->GetTypedValues<ui32>(); + auto rleIndexes = column->GetTypedValues<ui64>(); + + context->AddFieldNode( + column->ValueCount, + CountNullsInRleDictionaryIndexesWithZeroNull( + dictionaryIndexes, + rleIndexes, + column->StartIndex, + column->StartIndex + column->ValueCount)); + + context->AddBuffer( + GetBitmapByteSize(column->ValueCount), + [=] (TMutableRef dstRef) { + BuildValidityBitmapFromRleDictionaryIndexesWithZeroNull( + dictionaryIndexes, + rleIndexes, + column->StartIndex, + column->StartIndex + column->ValueCount, + dstRef); + }); + + context->AddBuffer( + sizeof(ui32) * 
column->ValueCount, + [=] (TMutableRef dstRef) { + BuildDictionaryIndexesFromRleDictionaryIndexesWithZeroNull( + dictionaryIndexes, + rleIndexes, + column->StartIndex, + column->StartIndex + column->ValueCount, + GetTypedValues<ui32>(dstRef)); + }); +} + +void SerializeIntegerColumn( + const TTypedBatchColumn& typedColumn, + ESimpleLogicalValueType simpleType, + TRecordBatchSerializationContext* context) +{ + const auto* column = typedColumn.Column; + YT_VERIFY(column->Values); + + YT_LOG_DEBUG("Adding integer column (ColumnId: %v, StartIndex: %v, ValueCount: %v, Rle: %v)", + column->Id, + column->StartIndex, + column->ValueCount, + column->Rle.has_value()); + + SerializeColumnPrologue(typedColumn, context); + + context->AddBuffer( + column->ValueCount * GetIntegralTypeByteSize(simpleType), + [=] (TMutableRef dstRef) { + const auto* valueColumn = column->Rle + ? column->Rle->ValueColumn + : column; + auto values = valueColumn->GetTypedValues<ui64>(); + + auto rleIndexes = column->Rle + ? 
column->GetTypedValues<ui64>() + : TRange<ui64>(); + + switch (simpleType) { +#define XX(cppType, ytType) \ + case ESimpleLogicalValueType::ytType: { \ + auto dstValues = GetTypedValues<cppType>(dstRef); \ + auto* currentOutput = dstValues.Begin(); \ + DecodeIntegerVector( \ + column->StartIndex, \ + column->StartIndex + column->ValueCount, \ + valueColumn->Values->BaseValue, \ + valueColumn->Values->ZigZagEncoded, \ + TRange<ui32>(), \ + rleIndexes, \ + [&] (auto index) { \ + return values[index]; \ + }, \ + [&] (auto value) { \ + *currentOutput++ = value; \ + }); \ + break; \ + } + + XX(i8, Int8) + XX(i16, Int16) + XX(i32, Int32) + XX(i64, Int64) + XX(ui8, Uint8) + XX(ui16, Uint16) + XX(ui32, Uint32) + XX(ui64, Uint64) + +#undef XX + + default: + THROW_ERROR_EXCEPTION("Integer column %v has unexpected type %Qlv", + typedColumn.Column->Id, + simpleType); + } + }); +} + +void SerializeDoubleColumn( + const TTypedBatchColumn& typedColumn, + TRecordBatchSerializationContext* context) +{ + const auto* column = typedColumn.Column; + YT_VERIFY(column->Values); + YT_VERIFY(column->Values->BitWidth == 64); + YT_VERIFY(column->Values->BaseValue == 0); + YT_VERIFY(!column->Values->ZigZagEncoded); + + YT_LOG_DEBUG("Adding double column (ColumnId: %v, StartIndex: %v, ValueCount: %v)", + column->Id, + column->StartIndex, + column->ValueCount, + column->Rle.has_value()); + + SerializeColumnPrologue(typedColumn, context); + + context->AddBuffer( + column->ValueCount * sizeof(double), + [=] (TMutableRef dstRef) { + auto relevantValues = column->GetRelevantTypedValues<double>(); + ::memcpy( + dstRef.Begin(), + relevantValues.Begin(), + column->ValueCount * sizeof(double)); + }); +} + +void SerializeStringLikeColumn( + const TTypedBatchColumn& typedColumn, + TRecordBatchSerializationContext* context) +{ + const auto* column = typedColumn.Column; + YT_VERIFY(column->Values); + YT_VERIFY(column->Values->BaseValue == 0); + YT_VERIFY(column->Values->BitWidth == 32); + 
YT_VERIFY(column->Values->ZigZagEncoded); + YT_VERIFY(column->Strings); + YT_VERIFY(column->Strings->AvgLength); + YT_VERIFY(!column->Rle); + + auto startIndex = column->StartIndex; + auto endIndex = startIndex + column->ValueCount; + auto stringData = column->Strings->Data; + auto avgLength = *column->Strings->AvgLength; + + auto offsets = column->GetTypedValues<ui32>(); + auto startOffset = DecodeStringOffset(offsets, avgLength, startIndex); + auto endOffset = DecodeStringOffset(offsets, avgLength, endIndex); + auto stringsSize = endOffset - startOffset; + + YT_LOG_DEBUG("Adding string-like column (ColumnId: %v, StartIndex: %v, ValueCount: %v, StartOffset: %v, EndOffset: %v, StringsSize: %v)", + column->Id, + column->StartIndex, + column->ValueCount, + startOffset, + endOffset, + stringsSize); + + SerializeColumnPrologue(typedColumn, context); + + context->AddBuffer( + sizeof(i32) * (column->ValueCount + 1), + [=] (TMutableRef dstRef) { + DecodeStringOffsets( + offsets, + avgLength, + startIndex, + endIndex, + GetTypedValues<ui32>(dstRef)); + }); + + context->AddBuffer( + stringsSize, + [=] (TMutableRef dstRef) { + ::memcpy( + dstRef.Begin(), + stringData.Begin() + startOffset, + stringsSize); + }); +} + +void SerializeBooleanColumn( + const TTypedBatchColumn& typedColumn, + TRecordBatchSerializationContext* context) +{ + const auto* column = typedColumn.Column; + YT_VERIFY(column->Values); + YT_VERIFY(!column->Values->ZigZagEncoded); + YT_VERIFY(column->Values->BaseValue == 0); + YT_VERIFY(column->Values->BitWidth == 1); + + YT_LOG_DEBUG("Adding boolean column (ColumnId: %v, StartIndex: %v, ValueCount: %v)", + column->Id, + column->StartIndex, + column->ValueCount); + + SerializeColumnPrologue(typedColumn, context); + + context->AddBuffer( + GetBitmapByteSize(column->ValueCount), + [=] (TMutableRef dstRef) { + CopyBitmapRangeToBitmap( + column->Values->Data, + column->StartIndex, + column->StartIndex + column->ValueCount, + dstRef); + }); +} + +void 
SerializeColumn( + const TTypedBatchColumn& typedColumn, + TRecordBatchSerializationContext* context) +{ + const auto* column = typedColumn.Column; + + if (IsRleButNotDictionaryEncodedStringLikeColumn(*typedColumn.Column)) { + SerializeRleButNotDictionaryEncodedStringLikeColumn(typedColumn, context); + return; + } + + if (column->Dictionary) { + SerializeDictionaryColumn(typedColumn, context); + return; + } + + if (column->Rle && column->Rle->ValueColumn->Dictionary) { + SerializeRleDictionaryColumn(typedColumn, context); + return; + } + + auto simpleType = CastToV1Type(typedColumn.Type).first; + if (IsIntegralType(simpleType)) { + SerializeIntegerColumn(typedColumn, simpleType, context); + } else if (simpleType == ESimpleLogicalValueType::Double) { + SerializeDoubleColumn(typedColumn, context); + } else if (IsStringLikeType(simpleType)) { + SerializeStringLikeColumn(typedColumn, context); + } else if (simpleType == ESimpleLogicalValueType::Boolean) { + SerializeBooleanColumn(typedColumn, context); + } else if (simpleType == ESimpleLogicalValueType::Null) { + // No buffers are allocated for null columns. 
+ } else { + THROW_ERROR_EXCEPTION("Column %v has unexpected type %Qlv", + typedColumn.Column->Id, + simpleType); + } +} + +auto SerializeRecordBatch( + flatbuffers::FlatBufferBuilder* flatbufBuilder, + int length, + TRange<TTypedBatchColumn> typedColumns) +{ + auto context = New<TRecordBatchSerializationContext>(flatbufBuilder); + + for (const auto& typedColumn : typedColumns) { + SerializeColumn(typedColumn, context.Get()); + } + + auto fieldNodesOffset = flatbufBuilder->CreateVectorOfStructs(context->FieldNodes); + + auto buffersOffset = flatbufBuilder->CreateVectorOfStructs(context->Buffers); + + auto recordBatchOffset = org::apache::arrow::flatbuf::CreateRecordBatch( + *flatbufBuilder, + length, + fieldNodesOffset, + buffersOffset); + + auto totalSize = context->CurrentBodyOffset; + + return std::make_tuple( + recordBatchOffset, + totalSize, + [context = std::move(context)] (TMutableRef dstRef) { + char* current = dstRef.Begin(); + for (const auto& part : context->Parts) { + part.Writer(TMutableRef(current, current + part.Size)); + current += AlignUp<i64>(part.Size, ArrowAlignment); + } + YT_VERIFY(current == dstRef.End()); + }); +} +/////////////////////////////////////////////////////////////////////////////// + +class TArrowWriter + : public TSchemalessFormatWriterBase +{ +public: + TArrowWriter( + TNameTablePtr nameTable, + const std::vector<NTableClient::TTableSchemaPtr>& tableSchemas, + NConcurrency::IAsyncOutputStreamPtr output, + bool enableContextSaving, + TControlAttributesConfigPtr controlAttributesConfig, + int keyColumnCount) + : TSchemalessFormatWriterBase( + std::move(nameTable), + std::move(output), + enableContextSaving, + std::move(controlAttributesConfig), + keyColumnCount) + { + YT_VERIFY(tableSchemas.size() > 0); + + auto tableSchema = tableSchemas[0]; + auto columnCount = NameTable_->GetSize(); + + for (int columnIndex = 0; columnIndex < columnCount; columnIndex++) { + ColumnSchemas_.push_back(GetColumnSchema(tableSchema, columnIndex)); + 
} + } + +private: + void Reset() + { + Messages_.clear(); + TypedColumns_.clear(); + NumberOfRows_ = 0; + } + + void DoWrite(TRange<TUnversionedRow> rows) override + { + Reset(); + + auto convertedColumns = NColumnConverters::ConvertRowsToColumns(rows, ColumnSchemas_); + + std::vector<const TBatchColumn*> rootColumns; + rootColumns.reserve( std::ssize(convertedColumns)); + for (ssize_t columnIndex = 0; columnIndex < std::ssize(convertedColumns); columnIndex++) { + rootColumns.push_back(convertedColumns[columnIndex].RootColumn); + } + NumberOfRows_ = rows.size(); + PrepareColumns(rootColumns); + Encode(); + } + + void DoWriteBatch(NTableClient::IUnversionedRowBatchPtr rowBatch) override + { + auto columnarBatch = rowBatch->TryAsColumnar(); + if (!columnarBatch) { + YT_LOG_DEBUG("Encoding non-columnar batch; running write rows"); + DoWrite(rowBatch->MaterializeRows()); + } else { + YT_LOG_DEBUG("Encoding columnar batch"); + Reset(); + NumberOfRows_ = rowBatch->GetRowCount(); + PrepareColumns(columnarBatch->MaterializeColumns()); + Encode(); + } + } + + void Encode() + { + auto output = GetOutputStream(); + if (IsSchemaMessageNeeded()) { + if (!IsFirstBatch_) { + RegisterEosMarker(); + } + ResetArrowDictionaries(); + PrepareSchema(); + } + IsFirstBatch_ = false; + PrepareDictionaryBatches(); + PrepareRecordBatch(); + + WritePayload(output); + TryFlushBuffer(true); + } + +private: + bool IsFirstBatch_ = true; + size_t NumberOfRows_ = 0; + std::vector<TTypedBatchColumn> TypedColumns_; + std::vector<TColumnSchema> ColumnSchemas_; + std::vector<IUnversionedColumnarRowBatch::TDictionaryId> ArrowDictionaryIds_; + + struct TMessage + { + std::optional<flatbuffers::FlatBufferBuilder> FlatbufBuilder; + i64 BodySize; + TBodyWriter BodyWriter; + }; + + std::vector<TMessage> Messages_; + + bool CheckIfSystemColumnEnable(int columnIndex) + { + return ControlAttributesConfig_->EnableTableIndex && IsTableIndexColumnId(columnIndex) || + ControlAttributesConfig_->EnableRangeIndex && 
IsRangeIndexColumnId(columnIndex) || + ControlAttributesConfig_->EnableRowIndex && IsRowIndexColumnId(columnIndex) || + ControlAttributesConfig_->EnableTabletIndex && IsTabletIndexColumnId(columnIndex); + } + + bool CheckIfTypeIsNotNull(int columnIndex) + { + YT_VERIFY(columnIndex >= 0 && columnIndex < std::ssize(ColumnSchemas_)); + return CastToV1Type(ColumnSchemas_[columnIndex].LogicalType()).first != ESimpleLogicalValueType::Null; + } + + TColumnSchema GetColumnSchema(NTableClient::TTableSchemaPtr& tableSchema, int columnIndex) + { + YT_VERIFY(columnIndex >= 0); + auto name = NameTable_->GetName(columnIndex); + auto columnSchema = tableSchema->FindColumn(name); + if (!columnSchema) { + if (IsSystemColumnId(columnIndex) && CheckIfSystemColumnEnable(columnIndex)) { + return TColumnSchema(TString(name), EValueType::Int64); + } + return TColumnSchema(TString(name), EValueType::Null); + } + return *columnSchema; + } + + void PrepareColumns(const TRange<const TBatchColumn*>& batchColumns) + { + TypedColumns_.reserve(batchColumns.Size()); + for (const auto* column : batchColumns) { + if (CheckIfTypeIsNotNull(column->Id)) { + YT_VERIFY(column->Id >= 0 && column->Id < std::ssize(ColumnSchemas_)); + TypedColumns_.push_back(TTypedBatchColumn{ + column, + ColumnSchemas_[column->Id].LogicalType()}); + } + } + } + + bool IsSchemaMessageNeeded() + { + if (IsFirstBatch_) { + return true; + } + YT_VERIFY(ArrowDictionaryIds_.size() == TypedColumns_.size()); + bool result = false; + for (int index = 0; index < std::ssize(TypedColumns_); ++index) { + bool currentDictionary = IsDictionaryEncodedColumn(*TypedColumns_[index].Column); + bool previousDictionary = ArrowDictionaryIds_[index] != IUnversionedColumnarRowBatch::NullDictionaryId; + if (currentDictionary != previousDictionary) { + result = true; + } + } + return result; + } + + void ResetArrowDictionaries() + { + ArrowDictionaryIds_.assign(TypedColumns_.size(), IUnversionedColumnarRowBatch::NullDictionaryId); + } + + void 
RegisterEosMarker() + { + YT_LOG_DEBUG("EOS marker registered"); + + Messages_.push_back(TMessage{ + std::nullopt, + 0, + TBodyWriter()}); + } + + void RegisterMessage( + [[maybe_unused]] org::apache::arrow::flatbuf::MessageHeader type, + flatbuffers::FlatBufferBuilder&& flatbufBuilder, + i64 bodySize = 0, + std::function<void(TMutableRef)> bodyWriter = nullptr) + { + YT_LOG_DEBUG("Message registered (Type: %v, MessageSize: %v, BodySize: %v)", + org::apache::arrow::flatbuf::EnumNamesMessageHeader()[type], + flatbufBuilder.GetSize(), + bodySize); + + YT_VERIFY((bodySize % ArrowAlignment) == 0); + Messages_.push_back(TMessage{ + std::move(flatbufBuilder), + bodySize, + std::move(bodyWriter)}); + } + + void PrepareSchema() + { + flatbuffers::FlatBufferBuilder flatbufBuilder; + + int arrowDictionaryIdCounter = 0; + std::vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>> fieldOffsets; + for (int columnIndex = 0; columnIndex < std::ssize(TypedColumns_); columnIndex++) { + const auto& typedColumn = TypedColumns_[columnIndex]; + YT_VERIFY(typedColumn.Column->Id >= 0 && typedColumn.Column->Id < std::ssize(ColumnSchemas_)); + auto columnSchema = ColumnSchemas_[typedColumn.Column->Id]; + auto nameOffset = SerializeString(&flatbufBuilder, columnSchema.Name()); + + auto [typeType, typeOffset] = SerializeColumnType(&flatbufBuilder, columnSchema); + + flatbuffers::Offset<org::apache::arrow::flatbuf::DictionaryEncoding> dictionaryEncodingOffset; + auto index_type_offset = org::apache::arrow::flatbuf::CreateInt(flatbufBuilder, 32, false); + + if (IsDictionaryEncodedColumn(*typedColumn.Column)) { + dictionaryEncodingOffset = org::apache::arrow::flatbuf::CreateDictionaryEncoding( + flatbufBuilder, + arrowDictionaryIdCounter++, + index_type_offset); + } + + auto fieldOffset = org::apache::arrow::flatbuf::CreateField( + flatbufBuilder, + nameOffset, + columnSchema.LogicalType()->IsNullable(), + typeType, + typeOffset, + dictionaryEncodingOffset); + + 
fieldOffsets.push_back(fieldOffset); + } + + auto fieldsOffset = flatbufBuilder.CreateVector(fieldOffsets); + + auto schemaOffset = org::apache::arrow::flatbuf::CreateSchema( + flatbufBuilder, + org::apache::arrow::flatbuf::Endianness_Little, + fieldsOffset); + + auto messageOffset = org::apache::arrow::flatbuf::CreateMessage( + flatbufBuilder, + org::apache::arrow::flatbuf::MetadataVersion_V4, + org::apache::arrow::flatbuf::MessageHeader_Schema, + schemaOffset.Union(), + 0); + + flatbufBuilder.Finish(messageOffset); + + RegisterMessage( + org::apache::arrow::flatbuf::MessageHeader_Schema, + std::move(flatbufBuilder)); + } + + void PrepareDictionaryBatches() + { + int arrowDictionaryIdCounter = 0; + auto prepareDictionaryBatch = [&] ( + int columnIndex, + IUnversionedColumnarRowBatch::TDictionaryId ytDictionaryId, + const TBatchColumn* dictionaryColumn) { + int arrowDictionaryId = arrowDictionaryIdCounter++; + const auto& typedColumn = TypedColumns_[columnIndex]; + auto previousYTDictionaryId = ArrowDictionaryIds_[columnIndex]; + if (ytDictionaryId == previousYTDictionaryId) { + YT_LOG_DEBUG("Reusing previous dictionary (ColumnId: %v, YTDictionaryId: %v, ArrowDictionaryId: %v)", + typedColumn.Column->Id, + ytDictionaryId, + arrowDictionaryId); + } else { + YT_LOG_DEBUG("Sending new dictionary (ColumnId: %v, YTDictionaryId: %v, ArrowDictionaryId: %v)", + typedColumn.Column->Id, + ytDictionaryId, + arrowDictionaryId); + PrepareDictionaryBatch( + TTypedBatchColumn{dictionaryColumn, typedColumn.Type}, + arrowDictionaryId); + ArrowDictionaryIds_[columnIndex] = ytDictionaryId; + } + }; + + for (int columnIndex = 0; columnIndex < std::ssize(TypedColumns_); ++columnIndex) { + const auto& typedColumn = TypedColumns_[columnIndex]; + if (typedColumn.Column->Dictionary) { + YT_LOG_DEBUG("Adding dictionary batch for dictionary-encoded column (ColumnId: %v)", + typedColumn.Column->Id); + prepareDictionaryBatch( + columnIndex, + typedColumn.Column->Dictionary->DictionaryId, + 
typedColumn.Column->Dictionary->ValueColumn); + } else if (IsRleButNotDictionaryEncodedStringLikeColumn(*typedColumn.Column)) { + YT_LOG_DEBUG("Adding dictionary batch for RLE but not dictionary-encoded string-like column (ColumnId: %v)", + typedColumn.Column->Id); + prepareDictionaryBatch( + columnIndex, + IUnversionedColumnarRowBatch::GenerateDictionaryId(), // any unique one will do + typedColumn.Column->Rle->ValueColumn); + } else if (IsRleAndDictionaryEncodedColumn(*typedColumn.Column)) { + YT_LOG_DEBUG("Adding dictionary batch for RLE and dictionary-encoded column (ColumnId: %v)", + typedColumn.Column->Id); + prepareDictionaryBatch( + columnIndex, + typedColumn.Column->Rle->ValueColumn->Dictionary->DictionaryId, + typedColumn.Column->Rle->ValueColumn->Dictionary->ValueColumn); + } + } + } + + void PrepareDictionaryBatch( + const TTypedBatchColumn& typedColumn, + int arrowDictionaryId) + { + flatbuffers::FlatBufferBuilder flatbufBuilder; + + auto [recordBatchOffset, bodySize, bodyWriter] = SerializeRecordBatch( + &flatbufBuilder, + typedColumn.Column->ValueCount, + MakeRange({typedColumn})); + + auto dictionaryBatchOffset = org::apache::arrow::flatbuf::CreateDictionaryBatch( + flatbufBuilder, + arrowDictionaryId, + recordBatchOffset); + + auto messageOffset = org::apache::arrow::flatbuf::CreateMessage( + flatbufBuilder, + org::apache::arrow::flatbuf::MetadataVersion_V4, + org::apache::arrow::flatbuf::MessageHeader_DictionaryBatch, + dictionaryBatchOffset.Union(), + bodySize); + + flatbufBuilder.Finish(messageOffset); + + RegisterMessage( + org::apache::arrow::flatbuf::MessageHeader_DictionaryBatch, + std::move(flatbufBuilder), + bodySize, + std::move(bodyWriter)); + } + + void PrepareRecordBatch() + { + flatbuffers::FlatBufferBuilder flatbufBuilder; + + auto [recordBatchOffset, bodySize, bodyWriter] = SerializeRecordBatch( + &flatbufBuilder, + NumberOfRows_, + TypedColumns_); + + auto messageOffset = org::apache::arrow::flatbuf::CreateMessage( + 
flatbufBuilder, + org::apache::arrow::flatbuf::MetadataVersion_V4, + org::apache::arrow::flatbuf::MessageHeader_RecordBatch, + recordBatchOffset.Union(), + bodySize); + + flatbufBuilder.Finish(messageOffset); + + RegisterMessage( + org::apache::arrow::flatbuf::MessageHeader_RecordBatch, + std::move(flatbufBuilder), + bodySize, + std::move(bodyWriter)); + } + + i64 GetPayloadSize() const + { + i64 size = 0; + for (const auto& message : Messages_) { + size += sizeof(ui32); // continuation indicator + size += sizeof(ui32); // metadata size + if (message.FlatbufBuilder) { + size += AlignUp<i64>(message.FlatbufBuilder->GetSize(), ArrowAlignment); // metadata message + size += AlignUp<i64>(message.BodySize, ArrowAlignment); // body + } + } + return size; + } + + void WritePayload(TBlobOutput* output) + { + YT_LOG_DEBUG("Started writing payload"); + for (const auto& message : Messages_) { + // Continuation indicator + ui32 constMax = 0xFFFFFFFF; + output->Write(&constMax, sizeof(ui32)); + + if (message.FlatbufBuilder) { + auto metadataSize = message.FlatbufBuilder->GetSize(); + + auto metadataPtr = message.FlatbufBuilder->GetBufferPointer(); + + + ui32 metadataSz = AlignUp<i64>(metadataSize, ArrowAlignment); + + output->Write(&metadataSz, sizeof(ui32)); + output->Write(metadataPtr, metadataSize); + + // Body + if (message.BodyWriter) { + TString current; + current.resize(message.BodySize); + // Double copying. 
+ message.BodyWriter(TMutableRef::FromString(current)); + output->Write(current.data(), message.BodySize); + } else { + YT_VERIFY(message.BodySize == 0); + } + } else { + // EOS marker + ui32 zero = 0; + output->Write(&zero, sizeof(ui32)); + } + } + + YT_LOG_DEBUG("Finished writing payload"); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +ISchemalessFormatWriterPtr CreateWriterForArrow( + NTableClient::TNameTablePtr nameTable, + const std::vector<NTableClient::TTableSchemaPtr>& schemas, + NConcurrency::IAsyncOutputStreamPtr output, + bool enableContextSaving, + TControlAttributesConfigPtr controlAttributesConfig, + int keyColumnCount) +{ + auto result = New<TArrowWriter>( + std::move(nameTable), + schemas, + std::move(output), + enableContextSaving, + std::move(controlAttributesConfig), + keyColumnCount); + + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/arrow_writer.h b/yt/yt/library/formats/arrow_writer.h new file mode 100644 index 0000000000..60b296f73b --- /dev/null +++ b/yt/yt/library/formats/arrow_writer.h @@ -0,0 +1,26 @@ +#pragma once + +#include <yt/yt/client/formats/public.h> + +#include <yt/yt/client/table_client/public.h> + +#include <yt/yt/core/concurrency/public.h> + +#include <yt/yt/core/ytree/public.h> + + +namespace NYT::NFormats { + +//////////////////////////////////////////////////////////////////////////////// + +ISchemalessFormatWriterPtr CreateWriterForArrow( + NTableClient::TNameTablePtr nameTable, + const std::vector<NTableClient::TTableSchemaPtr>& schemas, + NConcurrency::IAsyncOutputStreamPtr output, + bool enableContextSaving, + TControlAttributesConfigPtr controlAttributesConfig, + int keyColumnCount); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NFormats diff --git a/yt/yt/client/formats/dsv_parser.cpp
b/yt/yt/library/formats/dsv_parser.cpp index c2ccb2babf..bd9035176c 100644 --- a/yt/yt/client/formats/dsv_parser.cpp +++ b/yt/yt/library/formats/dsv_parser.cpp @@ -2,7 +2,8 @@ #include "format.h" #include "escape.h" -#include "parser.h" + +#include <yt/yt/client/formats/parser.h> namespace NYT::NFormats { diff --git a/yt/yt/client/formats/dsv_parser.h b/yt/yt/library/formats/dsv_parser.h index 5a156d5db5..b93fc3a050 100644 --- a/yt/yt/client/formats/dsv_parser.h +++ b/yt/yt/library/formats/dsv_parser.h @@ -1,7 +1,7 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> #include <yt/yt/core/yson/consumer.h> diff --git a/yt/yt/client/formats/dsv_writer.cpp b/yt/yt/library/formats/dsv_writer.cpp index 934b82ed26..934b82ed26 100644 --- a/yt/yt/client/formats/dsv_writer.cpp +++ b/yt/yt/library/formats/dsv_writer.cpp diff --git a/yt/yt/client/formats/dsv_writer.h b/yt/yt/library/formats/dsv_writer.h index 5d1c5de674..c17fcd3a6f 100644 --- a/yt/yt/client/formats/dsv_writer.h +++ b/yt/yt/library/formats/dsv_writer.h @@ -1,11 +1,12 @@ #pragma once -#include "config.h" #include "escape.h" #include "helpers.h" -#include "public.h" #include "schemaless_writer_adapter.h" +#include <yt/yt/client/formats/config.h> +#include <yt/yt/client/formats/public.h> + #include <yt/yt/client/table_client/public.h> #include <library/cpp/yt/misc/enum.h> diff --git a/yt/yt/client/formats/escape.cpp b/yt/yt/library/formats/escape.cpp index 50b1bf85e5..50b1bf85e5 100644 --- a/yt/yt/client/formats/escape.cpp +++ b/yt/yt/library/formats/escape.cpp diff --git a/yt/yt/client/formats/escape.h b/yt/yt/library/formats/escape.h index 979ff2689d..4efc743944 100644 --- a/yt/yt/client/formats/escape.h +++ b/yt/yt/library/formats/escape.h @@ -1,6 +1,6 @@ #pragma once -#include "public.h" +#include <yt/yt/client/formats/public.h> #include <string> #include <vector> diff --git a/yt/yt/library/formats/format.cpp 
b/yt/yt/library/formats/format.cpp new file mode 100644 index 0000000000..1b7e0cf749 --- /dev/null +++ b/yt/yt/library/formats/format.cpp @@ -0,0 +1,598 @@ +#include "format.h" + +#include "dsv_parser.h" +#include "dsv_writer.h" +#include "protobuf_parser.h" +#include "protobuf_writer.h" +#include "schemaful_dsv_parser.h" +#include "schemaful_dsv_writer.h" +#include "schemaful_writer.h" +#include "web_json_writer.h" +#include "schemaless_writer_adapter.h" +#include "skiff_parser.h" +#include "skiff_writer.h" +#include "versioned_writer.h" +#include "yamred_dsv_parser.h" +#include "yamred_dsv_writer.h" +#include "yamr_parser.h" +#include "yamr_writer.h" +#include "yson_parser.h" + +#include <yt/yt/client/formats/parser.h> + +#include <yt/yt/client/table_client/name_table.h> +#include <yt/yt/client/table_client/table_consumer.h> + +#include <yt/yt/library/skiff_ext/schema_match.h> + +#include <yt/yt/core/misc/error.h> + +#include <yt/yt/core/yson/writer.h> + +#include <yt/yt/core/ytree/fluent.h> + +#include <yt/yt/core/yson/forwarding_consumer.h> + +#include <yt/yt/core/json/json_parser.h> +#include <yt/yt/core/json/json_writer.h> + +namespace NYT::NFormats { + +using namespace NConcurrency; +using namespace NYTree; +using namespace NYson; +using namespace NJson; +using namespace NTableClient; +using namespace NSkiffExt; +using namespace NComplexTypes; + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +EYsonType DataTypeToYsonType(EDataType dataType) +{ + switch (dataType) { + case EDataType::Structured: + return EYsonType::Node; + case EDataType::Tabular: + return EYsonType::ListFragment; + default: + THROW_ERROR_EXCEPTION("Data type %Qlv is not supported by YSON", + dataType); + } +} + +std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForYson( + EDataType dataType, + const IAttributeDictionary& attributes, + IZeroCopyOutput* output) +{ + auto config = ConvertTo<TYsonFormatConfigPtr>(&attributes); + return 
CreateYsonWriter( + output, + config->Format, + DataTypeToYsonType(dataType), + config->Format == EYsonFormat::Binary); +} + +std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForJson( + EDataType dataType, + const IAttributeDictionary& attributes, + IOutputStream* output) +{ + auto config = ConvertTo<TJsonFormatConfigPtr>(&attributes); + return CreateJsonConsumer(output, DataTypeToYsonType(dataType), config); +} + +std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForDsv( + EDataType dataType, + const IAttributeDictionary& attributes, + IOutputStream* output) +{ + auto config = ConvertTo<TDsvFormatConfigPtr>(&attributes); + switch (dataType) { + case EDataType::Structured: + return std::unique_ptr<IFlushableYsonConsumer>(new TDsvNodeConsumer(output, config)); + + case EDataType::Tabular: + case EDataType::Binary: + case EDataType::Null: + THROW_ERROR_EXCEPTION("Data type %Qlv is not supported by DSV", + dataType); + + default: + YT_ABORT(); + }; +} + +class TTableParserAdapter + : public IParser +{ +public: + TTableParserAdapter( + const TFormat& format, + std::vector<IValueConsumer*> valueConsumers, + int tableIndex) + : TableConsumer_(new TTableConsumer( + TYsonConverterConfig{ + .ComplexTypeMode = format.Attributes().Get("complex_type_mode", EComplexTypeMode::Named), + .StringKeyedDictMode = format.Attributes().Get("string_keyed_dict_mode", EDictMode::Positional), + .DecimalMode = format.Attributes().Get("decimal_mode", EDecimalMode::Binary), + .TimeMode = format.Attributes().Get("time_mode", ETimeMode::Binary), + .UuidMode = format.Attributes().Get("uuid_mode", EUuidMode::Binary), + }, + valueConsumers, + tableIndex)) + , Parser_(CreateParserForFormat( + format, + EDataType::Tabular, + TableConsumer_.get())) + { } + + void Read(TStringBuf data) override + { + Parser_->Read(data); + } + + void Finish() override + { + Parser_->Finish(); + } + +private: + const std::unique_ptr<IYsonConsumer> TableConsumer_; + const std::unique_ptr<IParser> Parser_; +}; + 
+} // namespace + +std::unique_ptr<IFlushableYsonConsumer> CreateConsumerForFormat( + const TFormat& format, + EDataType dataType, + IZeroCopyOutput* output) +{ + switch (format.GetType()) { + case EFormatType::Yson: + return CreateConsumerForYson(dataType, format.Attributes(), output); + case EFormatType::Json: + return CreateConsumerForJson(dataType, format.Attributes(), output); + case EFormatType::Dsv: + return CreateConsumerForDsv(dataType, format.Attributes(), output); + default: + THROW_ERROR_EXCEPTION("Unsupported output format %Qlv", + format.GetType()); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class TWriter, class TConsumerAdapter> +TIntrusivePtr<TWriter> CreateAdaptedWriterForYson( + const IAttributeDictionary& attributes, + TTableSchemaPtr schema, + IAsyncOutputStreamPtr output) +{ + auto config = ConvertTo<TYsonFormatConfigPtr>(&attributes); + return New<TConsumerAdapter>(std::move(output), std::move(schema), [=] (IZeroCopyOutput* buffer) { + if (config->Format == EYsonFormat::Binary) { + return std::unique_ptr<IFlushableYsonConsumer>(new TBufferedBinaryYsonWriter( + buffer, + EYsonType::ListFragment, + true)); + } else { + return std::unique_ptr<IFlushableYsonConsumer>(new TYsonWriter( + buffer, + config->Format, + EYsonType::ListFragment)); + } + }); +} + +template <class TWriter, class TConsumerAdapter> +TIntrusivePtr<TWriter> CreateAdaptedWriterForJson( + const IAttributeDictionary& attributes, + TTableSchemaPtr schema, + IAsyncOutputStreamPtr output) +{ + auto config = ConvertTo<TJsonFormatConfigPtr>(&attributes); + return New<TConsumerAdapter>(std::move(output), std::move(schema), [&] (IOutputStream* buffer) { + return CreateJsonConsumer(buffer, EYsonType::ListFragment, config); + }); +} + +IUnversionedRowsetWriterPtr CreateSchemafulWriterForFormat( + const TFormat& format, + TTableSchemaPtr schema, + IAsyncOutputStreamPtr output) +{ + switch (format.GetType()) { + case 
EFormatType::Yson: + return CreateAdaptedWriterForYson<IUnversionedRowsetWriter, TSchemafulWriter>(format.Attributes(), std::move(schema), std::move(output)); + case EFormatType::Json: + return CreateAdaptedWriterForJson<IUnversionedRowsetWriter, TSchemafulWriter>(format.Attributes(), std::move(schema), std::move(output)); + case EFormatType::SchemafulDsv: + return CreateSchemafulWriterForSchemafulDsv(format.Attributes(), std::move(schema), std::move(output)); + case EFormatType::WebJson: { + auto webJsonFormatConfig = ConvertTo<TWebJsonFormatConfigPtr>(&format.Attributes()); + webJsonFormatConfig->SkipSystemColumns = false; + + return CreateWriterForWebJson( + std::move(webJsonFormatConfig), + TNameTable::FromSchema(*schema), + {schema}, + std::move(output)); + } + default: + THROW_ERROR_EXCEPTION("Unsupported output format %Qlv", + format.GetType()); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +IVersionedWriterPtr CreateVersionedWriterForFormat( + const TFormat& format, + NTableClient::TTableSchemaPtr schema, + NConcurrency::IAsyncOutputStreamPtr output) +{ + switch (format.GetType()) { + case EFormatType::Yson: + return CreateAdaptedWriterForYson<IVersionedWriter, TVersionedWriter>(format.Attributes(), std::move(schema), std::move(output)); + case EFormatType::Json: + return CreateAdaptedWriterForJson<IVersionedWriter, TVersionedWriter>(format.Attributes(), std::move(schema), std::move(output)); + default: + THROW_ERROR_EXCEPTION("Unsupported output format %Qlv", format.GetType()); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +ISchemalessFormatWriterPtr CreateStaticTableWriterForFormat( + const TFormat& format, + TNameTablePtr nameTable, + const std::vector<TTableSchemaPtr>& tableSchemas, + NConcurrency::IAsyncOutputStreamPtr output, + bool enableContextSaving, + TControlAttributesConfigPtr controlAttributesConfig, + int keyColumnCount) +{ + switch (format.GetType()) 
{ + case EFormatType::Dsv: + return CreateSchemalessWriterForDsv( + format.Attributes(), + nameTable, + std::move(output), + enableContextSaving, + controlAttributesConfig, + keyColumnCount); + case EFormatType::Yamr: + return CreateSchemalessWriterForYamr( + format.Attributes(), + nameTable, + std::move(output), + enableContextSaving, + controlAttributesConfig, + keyColumnCount); + case EFormatType::YamredDsv: + return CreateSchemalessWriterForYamredDsv( + format.Attributes(), + nameTable, + std::move(output), + enableContextSaving, + controlAttributesConfig, + keyColumnCount); + case EFormatType::SchemafulDsv: + return CreateSchemalessWriterForSchemafulDsv( + format.Attributes(), + nameTable, + std::move(output), + enableContextSaving, + controlAttributesConfig, + keyColumnCount); + case EFormatType::Protobuf: + return CreateWriterForProtobuf( + format.Attributes(), + tableSchemas, + nameTable, + std::move(output), + enableContextSaving, + controlAttributesConfig, + keyColumnCount); + case EFormatType::WebJson: + return CreateWriterForWebJson( + format.Attributes(), + nameTable, + tableSchemas, + std::move(output)); + case EFormatType::Skiff: + return CreateWriterForSkiff( + format.Attributes(), + nameTable, + tableSchemas, + std::move(output), + enableContextSaving, + controlAttributesConfig, + keyColumnCount); + default: + auto adapter = New<TSchemalessWriterAdapter>( + nameTable, + std::move(output), + enableContextSaving, + controlAttributesConfig, + keyColumnCount); + adapter->Init(tableSchemas, format); + return adapter; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +TYsonProducer CreateProducerForDsv( + EDataType dataType, + const IAttributeDictionary& attributes, + IInputStream* input) +{ + if (dataType != EDataType::Tabular) { + THROW_ERROR_EXCEPTION("DSV is supported only for tabular data"); + } + auto config = ConvertTo<TDsvFormatConfigPtr>(&attributes); + return BIND([=] (IYsonConsumer* consumer) { + 
ParseDsv(input, consumer, config); + }); +} + +TYsonProducer CreateProducerForYamr( + EDataType dataType, + const IAttributeDictionary& attributes, + IInputStream* input) +{ + if (dataType != EDataType::Tabular) { + THROW_ERROR_EXCEPTION("YAMR is supported only for tabular data"); + } + auto config = ConvertTo<TYamrFormatConfigPtr>(&attributes); + return BIND([=] (IYsonConsumer* consumer) { + ParseYamr(input, consumer, config); + }); +} + +TYsonProducer CreateProducerForYamredDsv( + EDataType dataType, + const IAttributeDictionary& attributes, + IInputStream* input) +{ + if (dataType != EDataType::Tabular) { + THROW_ERROR_EXCEPTION("Yamred DSV is supported only for tabular data"); + } + auto config = ConvertTo<TYamredDsvFormatConfigPtr>(&attributes); + return BIND([=] (IYsonConsumer* consumer) { + ParseYamredDsv(input, consumer, config); + }); +} + +TYsonProducer CreateProducerForSchemafulDsv( + EDataType dataType, + const IAttributeDictionary& attributes, + IInputStream* input) +{ + if (dataType != EDataType::Tabular) { + THROW_ERROR_EXCEPTION("Schemaful DSV is supported only for tabular data"); + } + auto config = ConvertTo<TSchemafulDsvFormatConfigPtr>(&attributes); + return BIND([=] (IYsonConsumer* consumer) { + ParseSchemafulDsv(input, consumer, config); + }); +} + +TYsonProducer CreateProducerForJson( + EDataType dataType, + const IAttributeDictionary& attributes, + IInputStream* input) +{ + auto ysonType = DataTypeToYsonType(dataType); + auto config = ConvertTo<TJsonFormatConfigPtr>(&attributes); + return BIND([=] (IYsonConsumer* consumer) { + ParseJson(input, consumer, config, ysonType); + }); +} + +TYsonProducer CreateProducerForYson(EDataType dataType, IInputStream* input) +{ + auto ysonType = DataTypeToYsonType(dataType); + return ConvertToProducer(TYsonInput(input, ysonType)); +} + +TYsonProducer CreateProducerForFormat(const TFormat& format, EDataType dataType, IInputStream* input) +{ + switch (format.GetType()) { + case EFormatType::Yson: + return 
CreateProducerForYson(dataType, input); + case EFormatType::Json: + return CreateProducerForJson(dataType, format.Attributes(), input); + case EFormatType::Dsv: + return CreateProducerForDsv(dataType, format.Attributes(), input); + case EFormatType::Yamr: + return CreateProducerForYamr(dataType, format.Attributes(), input); + case EFormatType::YamredDsv: + return CreateProducerForYamredDsv(dataType, format.Attributes(), input); + case EFormatType::SchemafulDsv: + return CreateProducerForSchemafulDsv(dataType, format.Attributes(), input); + default: + THROW_ERROR_EXCEPTION("Unsupported input format %Qlv", + format.GetType()); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template<class TBase> +struct TParserAdapter + : public TBase + , public IParser +{ +public: + template<class... TArgs> + TParserAdapter(TArgs&&... args) + : TBase(std::forward<TArgs>(args)...) + { } + + void Read(TStringBuf data) override + { + TBase::Read(data); + } + + void Finish() override + { + TBase::Finish(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr<IParser> CreateParserForFormat(const TFormat& format, EDataType dataType, IYsonConsumer* consumer) +{ + switch (format.GetType()) { + case EFormatType::Yson: + return CreateParserForYson(consumer, DataTypeToYsonType(dataType)); + case EFormatType::Json: { + auto config = ConvertTo<TJsonFormatConfigPtr>(&format.Attributes()); + return std::unique_ptr<IParser>(new TParserAdapter<TJsonParser>(consumer, config, DataTypeToYsonType(dataType))); + } + case EFormatType::Dsv: { + auto config = ConvertTo<TDsvFormatConfigPtr>(&format.Attributes()); + return CreateParserForDsv(consumer, config); + } + case EFormatType::Yamr: { + auto config = ConvertTo<TYamrFormatConfigPtr>(&format.Attributes()); + return CreateParserForYamr(consumer, config); + } + case EFormatType::YamredDsv: { + auto config = 
ConvertTo<TYamredDsvFormatConfigPtr>(&format.Attributes()); + return CreateParserForYamredDsv(consumer, config); + } + case EFormatType::SchemafulDsv: { + auto config = ConvertTo<TSchemafulDsvFormatConfigPtr>(&format.Attributes()); + return CreateParserForSchemafulDsv(consumer, config); + } + default: + THROW_ERROR_EXCEPTION("Unsupported input format %Qlv", + format.GetType()); + } +} + +std::vector<std::unique_ptr<IParser>> CreateParsersForFormat( + const TFormat& format, + const std::vector<IValueConsumer*>& valueConsumers) +{ + std::vector<std::unique_ptr<IParser>> parsers; + + auto parserCount = std::ssize(valueConsumers); + parsers.reserve(parserCount); + + switch (format.GetType()) { + case EFormatType::Protobuf: { + auto config = ConvertTo<TProtobufFormatConfigPtr>(&format.Attributes()); + // TODO(max42): implementation of CreateParserForProtobuf clones config + // on each call, so this loop works in quadratic time. Fix that. + for (int tableIndex = 0; tableIndex < parserCount; ++tableIndex) { + parsers.emplace_back(CreateParserForProtobuf(valueConsumers[tableIndex], config, tableIndex)); + } + break; + } + case EFormatType::Skiff: { + auto config = ConvertTo<TSkiffFormatConfigPtr>(&format.Attributes()); + auto skiffSchemas = ParseSkiffSchemas(config->SkiffSchemaRegistry, config->TableSkiffSchemas); + for (int tableIndex = 0; tableIndex < parserCount; ++tableIndex) { + parsers.emplace_back(CreateParserForSkiff(valueConsumers[tableIndex], skiffSchemas, config, tableIndex)); + } + break; + } + default: + for (int tableIndex = 0; tableIndex < parserCount; ++tableIndex) { + parsers.emplace_back(std::make_unique<TTableParserAdapter>(format, valueConsumers, tableIndex)); + } + break; + } + + return parsers; +} + +std::unique_ptr<IParser> CreateParserForFormat( + const TFormat& format, + IValueConsumer* valueConsumer) +{ + auto parsers = CreateParsersForFormat(format, {valueConsumer}); + return std::move(parsers.front()); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +void ConfigureEscapeTable(const TSchemafulDsvFormatConfigPtr& config, TEscapeTable* escapeTable) +{ + std::vector<char> stopSymbols = {config->RecordSeparator, config->FieldSeparator}; + if (config->EnableEscaping) { + stopSymbols.push_back(config->EscapingSymbol); + escapeTable->EscapingSymbol = config->EscapingSymbol; + } + escapeTable->FillStops(stopSymbols); +} + +void ConfigureEscapeTables( + const TDsvFormatConfigBasePtr& config, + bool addCarriageReturn, + TEscapeTable* keyEscapeTable, + TEscapeTable* valueEscapeTable) +{ + std::vector<char> stopSymbols = {config->RecordSeparator, config->FieldSeparator, '\0'}; + + if (config->EnableEscaping) { + stopSymbols.push_back(config->EscapingSymbol); + keyEscapeTable->EscapingSymbol = valueEscapeTable->EscapingSymbol = config->EscapingSymbol; + } + + if (addCarriageReturn) { + stopSymbols.push_back('\r'); + } + + valueEscapeTable->FillStops(stopSymbols); + + stopSymbols.push_back(config->KeyValueSeparator); + keyEscapeTable->FillStops(stopSymbols); +} + +void ConfigureEscapeTables( + const TYamrFormatConfigBasePtr& config, + bool enableKeyEscaping, + bool enableValueEscaping, + bool escapingForWriter, + TEscapeTable* keyEscapeTable, + TEscapeTable* valueEscapeTable) +{ + std::vector<char> valueStopSymbols = {config->RecordSeparator}; + std::vector<char> keyStopSymbols = {config->RecordSeparator, config->FieldSeparator}; + + if (enableKeyEscaping) { + if (escapingForWriter) { + keyStopSymbols.push_back('\0'); + keyStopSymbols.push_back('\r'); + } + keyStopSymbols.push_back(config->EscapingSymbol); + keyEscapeTable->EscapingSymbol = config->EscapingSymbol; + } + + if (enableValueEscaping) { + if (escapingForWriter) { + valueStopSymbols.push_back('\0'); + valueStopSymbols.push_back('\r'); + } + valueStopSymbols.push_back(config->EscapingSymbol); + valueEscapeTable->EscapingSymbol = config->EscapingSymbol; + } + + 
keyEscapeTable->FillStops(keyStopSymbols); + valueEscapeTable->FillStops(valueStopSymbols); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/format.h b/yt/yt/library/formats/format.h new file mode 100644 index 0000000000..3a85d7f1a4 --- /dev/null +++ b/yt/yt/library/formats/format.h @@ -0,0 +1,109 @@ +#pragma once + +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/format.h> + +#include <yt/yt/client/table_client/public.h> +#include <yt/yt/client/table_client/unversioned_writer.h> + +#include <yt/yt/core/concurrency/public.h> + +#include <yt/yt/core/misc/property.h> + +#include <yt/yt/core/yson/public.h> + +#include <yt/yt/core/ytree/attributes.h> + +namespace NYT::NFormats { + +//////////////////////////////////////////////////////////////////////////////// + +struct ISchemalessFormatWriter + : public NTableClient::IUnversionedRowsetWriter +{ + virtual TBlob GetContext() const = 0; + + virtual i64 GetWrittenSize() const = 0; + + [[nodiscard]] virtual TFuture<void> Flush() = 0; + + virtual bool WriteBatch(NTableClient::IUnversionedRowBatchPtr rowBatch) = 0; +}; + +DEFINE_REFCOUNTED_TYPE(ISchemalessFormatWriter) + +//////////////////////////////////////////////////////////////////////////////// + +// This function historically creates format for reading dynamic tables. +// It slightly differs from format for static tables. 
:( +NTableClient::IUnversionedRowsetWriterPtr CreateSchemafulWriterForFormat( + const TFormat& Format, + NTableClient::TTableSchemaPtr schema, + NConcurrency::IAsyncOutputStreamPtr output); + +//////////////////////////////////////////////////////////////////////////////// + +NTableClient::IVersionedWriterPtr CreateVersionedWriterForFormat( + const TFormat& Format, + NTableClient::TTableSchemaPtr schema, + NConcurrency::IAsyncOutputStreamPtr output); + +//////////////////////////////////////////////////////////////////////////////// + +ISchemalessFormatWriterPtr CreateStaticTableWriterForFormat( + const TFormat& format, + NTableClient::TNameTablePtr nameTable, + const std::vector<NTableClient::TTableSchemaPtr>& tableSchemas, + NConcurrency::IAsyncOutputStreamPtr output, + bool enableContextSaving, + TControlAttributesConfigPtr controlAttributesConfig, + int keyColumnCount); + +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr<NYson::IFlushableYsonConsumer> CreateConsumerForFormat( + const TFormat& format, + EDataType dataType, + IZeroCopyOutput* output); + +NYson::TYsonProducer CreateProducerForFormat( + const TFormat& format, + EDataType dataType, + IInputStream* input); + +std::unique_ptr<IParser> CreateParserForFormat( + const TFormat& format, + EDataType dataType, + NYson::IYsonConsumer* consumer); + +//! Create own parser for each value consumer. +std::vector<std::unique_ptr<IParser>> CreateParsersForFormat( + const TFormat& format, + const std::vector<NTableClient::IValueConsumer*>& valueConsumers); + +//! Create parser for value consumer. Helper for previous method in singular case. 
+std::unique_ptr<IParser> CreateParserForFormat( + const TFormat& format, + NTableClient::IValueConsumer* valueConsumer); + +//////////////////////////////////////////////////////////////////////////////// + +void ConfigureEscapeTable(const TSchemafulDsvFormatConfigPtr& config, TEscapeTable* escapeTable); + +void ConfigureEscapeTables( + const TDsvFormatConfigBasePtr& config, + bool addCarriageReturn, + TEscapeTable* keyEscapeTable, + TEscapeTable* valueEscapeTable); + +void ConfigureEscapeTables( + const TYamrFormatConfigBasePtr& config, + bool enableKeyEscaping, + bool enableValueEscaping, + bool escapingForWriter, + TEscapeTable* keyEscapeTable, + TEscapeTable* valueEscapeTable); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NFormats diff --git a/yt/yt/client/formats/helpers.cpp b/yt/yt/library/formats/helpers.cpp index 9609b447fb..9609b447fb 100644 --- a/yt/yt/client/formats/helpers.cpp +++ b/yt/yt/library/formats/helpers.cpp diff --git a/yt/yt/client/formats/helpers.h b/yt/yt/library/formats/helpers.h index 526a95db0f..1d73d9279f 100644 --- a/yt/yt/client/formats/helpers.h +++ b/yt/yt/library/formats/helpers.h @@ -1,6 +1,6 @@ #pragma once -#include "public.h" +#include <yt/yt/client/formats/public.h> #include <yt/yt/client/table_client/public.h> diff --git a/yt/yt/client/formats/lenval_control_constants.h b/yt/yt/library/formats/lenval_control_constants.h index ccbbc71fe4..ccbbc71fe4 100644 --- a/yt/yt/client/formats/lenval_control_constants.h +++ b/yt/yt/library/formats/lenval_control_constants.h diff --git a/yt/yt/client/formats/private.h b/yt/yt/library/formats/private.h index 805168dd30..805168dd30 100644 --- a/yt/yt/client/formats/private.h +++ b/yt/yt/library/formats/private.h diff --git a/yt/yt/client/formats/protobuf.cpp b/yt/yt/library/formats/protobuf.cpp index e4c29652fa..e4c29652fa 100644 --- a/yt/yt/client/formats/protobuf.cpp +++ b/yt/yt/library/formats/protobuf.cpp diff --git 
a/yt/yt/client/formats/protobuf.h b/yt/yt/library/formats/protobuf.h index accf1e8b71..7f0b3e38f6 100644 --- a/yt/yt/client/formats/protobuf.h +++ b/yt/yt/library/formats/protobuf.h @@ -1,6 +1,7 @@ #pragma once -#include "config.h" +#include <yt/yt/client/formats/config.h> + #include "private.h" #include <google/protobuf/descriptor.h> diff --git a/yt/yt/client/formats/protobuf_options.cpp b/yt/yt/library/formats/protobuf_options.cpp index 5e9543d844..5e9543d844 100644 --- a/yt/yt/client/formats/protobuf_options.cpp +++ b/yt/yt/library/formats/protobuf_options.cpp diff --git a/yt/yt/client/formats/protobuf_options.h b/yt/yt/library/formats/protobuf_options.h index 03caaadaff..7bd51bd54f 100644 --- a/yt/yt/client/formats/protobuf_options.h +++ b/yt/yt/library/formats/protobuf_options.h @@ -1,6 +1,6 @@ #pragma once -#include "config.h" +#include <yt/yt/client/formats/config.h> #include <yt/yt_proto/yt/formats/extension.pb.h> diff --git a/yt/yt/client/formats/protobuf_parser.cpp b/yt/yt/library/formats/protobuf_parser.cpp index abaef0cb22..925dabd9ff 100644 --- a/yt/yt/client/formats/protobuf_parser.cpp +++ b/yt/yt/library/formats/protobuf_parser.cpp @@ -1,7 +1,9 @@ #include "protobuf_parser.h" #include "protobuf.h" -#include "parser.h" + +#include <yt/yt/client/formats/parser.h> + #include "yson_map_to_unversioned_value.h" #include <yt/yt/client/table_client/helpers.h> diff --git a/yt/yt/client/formats/protobuf_parser.h b/yt/yt/library/formats/protobuf_parser.h index 1ac356069f..14f32192b1 100644 --- a/yt/yt/client/formats/protobuf_parser.h +++ b/yt/yt/library/formats/protobuf_parser.h @@ -1,7 +1,7 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> namespace NYT::NFormats { diff --git a/yt/yt/client/formats/protobuf_writer.cpp b/yt/yt/library/formats/protobuf_writer.cpp index f4321cd68a..f4321cd68a 100644 --- a/yt/yt/client/formats/protobuf_writer.cpp +++ 
b/yt/yt/library/formats/protobuf_writer.cpp diff --git a/yt/yt/client/formats/protobuf_writer.h b/yt/yt/library/formats/protobuf_writer.h index a6f7936405..d726f92e4c 100644 --- a/yt/yt/client/formats/protobuf_writer.h +++ b/yt/yt/library/formats/protobuf_writer.h @@ -1,6 +1,6 @@ #pragma once -#include "public.h" +#include <yt/yt/client/formats/public.h> #include <yt/yt/client/table_client/public.h> diff --git a/yt/yt/client/formats/schemaful_dsv_parser.cpp b/yt/yt/library/formats/schemaful_dsv_parser.cpp index 8fb0bda433..3149f28851 100644 --- a/yt/yt/client/formats/schemaful_dsv_parser.cpp +++ b/yt/yt/library/formats/schemaful_dsv_parser.cpp @@ -1,9 +1,10 @@ #include "schemaful_dsv_parser.h" -#include "parser.h" #include "escape.h" #include "format.h" +#include <yt/yt/client/formats/parser.h> + #include <yt/yt/client/table_client/public.h> namespace NYT::NFormats { diff --git a/yt/yt/client/formats/schemaful_dsv_parser.h b/yt/yt/library/formats/schemaful_dsv_parser.h index cc01a9b399..164b51ebb2 100644 --- a/yt/yt/client/formats/schemaful_dsv_parser.h +++ b/yt/yt/library/formats/schemaful_dsv_parser.h @@ -1,7 +1,7 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> #include <yt/yt/core/yson/consumer.h> diff --git a/yt/yt/client/formats/schemaful_dsv_writer.cpp b/yt/yt/library/formats/schemaful_dsv_writer.cpp index 17b9210ff9..17b9210ff9 100644 --- a/yt/yt/client/formats/schemaful_dsv_writer.cpp +++ b/yt/yt/library/formats/schemaful_dsv_writer.cpp diff --git a/yt/yt/client/formats/schemaful_dsv_writer.h b/yt/yt/library/formats/schemaful_dsv_writer.h index c420f9e7ea..a4c990b0a4 100644 --- a/yt/yt/client/formats/schemaful_dsv_writer.h +++ b/yt/yt/library/formats/schemaful_dsv_writer.h @@ -1,7 +1,8 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> + #include "helpers.h" #include 
"schemaless_writer_adapter.h" diff --git a/yt/yt/client/formats/schemaful_writer.cpp b/yt/yt/library/formats/schemaful_writer.cpp index c7f72d5544..c7f72d5544 100644 --- a/yt/yt/client/formats/schemaful_writer.cpp +++ b/yt/yt/library/formats/schemaful_writer.cpp diff --git a/yt/yt/client/formats/schemaful_writer.h b/yt/yt/library/formats/schemaful_writer.h index 2d4848431f..0a627be388 100644 --- a/yt/yt/client/formats/schemaful_writer.h +++ b/yt/yt/library/formats/schemaful_writer.h @@ -1,6 +1,6 @@ #pragma once -#include "public.h" +#include <yt/yt/client/formats/public.h> #include <yt/yt/client/complex_types/yson_format_conversion.h> diff --git a/yt/yt/client/formats/schemaless_writer_adapter.cpp b/yt/yt/library/formats/schemaless_writer_adapter.cpp index 8970177132..68c95b1b2d 100644 --- a/yt/yt/client/formats/schemaless_writer_adapter.cpp +++ b/yt/yt/library/formats/schemaless_writer_adapter.cpp @@ -1,5 +1,6 @@ #include "schemaless_writer_adapter.h" -#include "config.h" + +#include <yt/yt/client/formats/config.h> #include <yt/yt/client/table_client/name_table.h> #include <yt/yt/client/table_client/row_batch.h> diff --git a/yt/yt/client/formats/schemaless_writer_adapter.h b/yt/yt/library/formats/schemaless_writer_adapter.h index 52c85c7ffa..4055d1968f 100644 --- a/yt/yt/client/formats/schemaless_writer_adapter.h +++ b/yt/yt/library/formats/schemaless_writer_adapter.h @@ -1,10 +1,11 @@ #pragma once -#include "public.h" #include "format.h" #include "helpers.h" #include "unversioned_value_yson_writer.h" +#include <yt/yt/client/formats/public.h> + #include <yt/yt/client/table_client/unversioned_writer.h> #include <yt/yt/core/concurrency/public.h> diff --git a/yt/yt/client/formats/skiff_parser.cpp b/yt/yt/library/formats/skiff_parser.cpp index 8b2d71238b..77d887e9ce 100644 --- a/yt/yt/client/formats/skiff_parser.cpp +++ b/yt/yt/library/formats/skiff_parser.cpp @@ -2,7 +2,9 @@ #include "skiff_yson_converter.h" #include "helpers.h" -#include "parser.h" + +#include 
<yt/yt/client/formats/parser.h> + #include "yson_map_to_unversioned_value.h" #include <yt/yt/library/decimal/decimal.h> diff --git a/yt/yt/client/formats/skiff_parser.h b/yt/yt/library/formats/skiff_parser.h index 35cea6666e..7321054511 100644 --- a/yt/yt/client/formats/skiff_parser.h +++ b/yt/yt/library/formats/skiff_parser.h @@ -1,7 +1,7 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> #include <library/cpp/skiff/skiff.h> diff --git a/yt/yt/client/formats/skiff_writer.cpp b/yt/yt/library/formats/skiff_writer.cpp index 1eaf09ce8d..4d84770ccc 100644 --- a/yt/yt/client/formats/skiff_writer.cpp +++ b/yt/yt/library/formats/skiff_writer.cpp @@ -1,9 +1,10 @@ #include "skiff_writer.h" -#include "public.h" #include "schemaless_writer_adapter.h" #include "skiff_yson_converter.h" +#include <yt/yt/client/formats/public.h> + #include <yt/yt/client/table_client/name_table.h> #include <yt/yt/client/table_client/logical_type.h> #include <yt/yt/client/table_client/schema.h> diff --git a/yt/yt/client/formats/skiff_writer.h b/yt/yt/library/formats/skiff_writer.h index 9cd8f66268..0e66a54156 100644 --- a/yt/yt/client/formats/skiff_writer.h +++ b/yt/yt/library/formats/skiff_writer.h @@ -1,6 +1,6 @@ #pragma once -#include "public.h" +#include <yt/yt/client/formats/public.h> #include <yt/yt/client/table_client/public.h> diff --git a/yt/yt/client/formats/skiff_yson_converter-inl.h b/yt/yt/library/formats/skiff_yson_converter-inl.h index 2c667b35d6..2c667b35d6 100644 --- a/yt/yt/client/formats/skiff_yson_converter-inl.h +++ b/yt/yt/library/formats/skiff_yson_converter-inl.h diff --git a/yt/yt/client/formats/skiff_yson_converter.cpp b/yt/yt/library/formats/skiff_yson_converter.cpp index 171bfd9a9a..171bfd9a9a 100644 --- a/yt/yt/client/formats/skiff_yson_converter.cpp +++ b/yt/yt/library/formats/skiff_yson_converter.cpp diff --git a/yt/yt/client/formats/skiff_yson_converter.h 
b/yt/yt/library/formats/skiff_yson_converter.h index 233b106729..233b106729 100644 --- a/yt/yt/client/formats/skiff_yson_converter.h +++ b/yt/yt/library/formats/skiff_yson_converter.h diff --git a/yt/yt/client/formats/unversioned_value_yson_writer.cpp b/yt/yt/library/formats/unversioned_value_yson_writer.cpp index c89dd48373..c89dd48373 100644 --- a/yt/yt/client/formats/unversioned_value_yson_writer.cpp +++ b/yt/yt/library/formats/unversioned_value_yson_writer.cpp diff --git a/yt/yt/client/formats/unversioned_value_yson_writer.h b/yt/yt/library/formats/unversioned_value_yson_writer.h index 1b6f671298..7799f3c3a3 100644 --- a/yt/yt/client/formats/unversioned_value_yson_writer.h +++ b/yt/yt/library/formats/unversioned_value_yson_writer.h @@ -1,7 +1,7 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> #include <yt/yt/client/complex_types/yson_format_conversion.h> @@ -25,4 +25,4 @@ private: //////////////////////////////////////////////////////////////////////////////// -} // namespace NYT::NFormats
\ No newline at end of file +} // namespace NYT::NFormats diff --git a/yt/yt/client/formats/versioned_writer.cpp b/yt/yt/library/formats/versioned_writer.cpp index 056c86d5e1..056c86d5e1 100644 --- a/yt/yt/client/formats/versioned_writer.cpp +++ b/yt/yt/library/formats/versioned_writer.cpp diff --git a/yt/yt/client/formats/versioned_writer.h b/yt/yt/library/formats/versioned_writer.h index 215f4e6537..025ba8e1fe 100644 --- a/yt/yt/client/formats/versioned_writer.h +++ b/yt/yt/library/formats/versioned_writer.h @@ -1,6 +1,6 @@ #pragma once -#include "public.h" +#include <yt/yt/client/formats/public.h> #include <yt/yt/client/table_client/versioned_writer.h> #include <yt/yt/client/table_client/schema.h> diff --git a/yt/yt/client/formats/web_json_writer.cpp b/yt/yt/library/formats/web_json_writer.cpp index d4c7293c85..9233f55c6f 100644 --- a/yt/yt/client/formats/web_json_writer.cpp +++ b/yt/yt/library/formats/web_json_writer.cpp @@ -1,8 +1,9 @@ #include "web_json_writer.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> + #include "format.h" -#include "public.h" #include "schemaless_writer_adapter.h" #include "yql_yson_converter.h" diff --git a/yt/yt/client/formats/web_json_writer.h b/yt/yt/library/formats/web_json_writer.h index 25d476a98f..a2ca099260 100644 --- a/yt/yt/client/formats/web_json_writer.h +++ b/yt/yt/library/formats/web_json_writer.h @@ -1,7 +1,8 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> + #include "helpers.h" #include "schemaless_writer_adapter.h" diff --git a/yt/yt/library/formats/ya.make b/yt/yt/library/formats/ya.make new file mode 100644 index 0000000000..72e875b867 --- /dev/null +++ b/yt/yt/library/formats/ya.make @@ -0,0 +1,48 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + arrow_writer.cpp + dsv_parser.cpp + dsv_writer.cpp + escape.cpp + format.cpp + helpers.cpp + 
protobuf.cpp + protobuf_options.cpp + protobuf_parser.cpp + protobuf_writer.cpp + schemaful_dsv_parser.cpp + schemaful_dsv_writer.cpp + schemaful_writer.cpp + schemaless_writer_adapter.cpp + skiff_parser.cpp + skiff_writer.cpp + skiff_yson_converter.cpp + unversioned_value_yson_writer.cpp + versioned_writer.cpp + web_json_writer.cpp + yamred_dsv_parser.cpp + yamred_dsv_writer.cpp + yamr_parser_base.cpp + yamr_parser.cpp + yamr_writer_base.cpp + yamr_writer.cpp + yql_yson_converter.cpp + yson_map_to_unversioned_value.cpp + yson_parser.cpp +) + +PEERDIR( + yt/yt/client + yt/yt/client/formats + yt/yt/client/arrow/fbs + yt/yt/library/column_converters +) + +END() + +RECURSE_FOR_TESTS( + unittests +) diff --git a/yt/yt/client/formats/yamr_parser.cpp b/yt/yt/library/formats/yamr_parser.cpp index 0465741743..0465741743 100644 --- a/yt/yt/client/formats/yamr_parser.cpp +++ b/yt/yt/library/formats/yamr_parser.cpp diff --git a/yt/yt/client/formats/yamr_parser.h b/yt/yt/library/formats/yamr_parser.h index 10b5185d8e..3e7791a930 100644 --- a/yt/yt/client/formats/yamr_parser.h +++ b/yt/yt/library/formats/yamr_parser.h @@ -1,7 +1,7 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> namespace NYT::NFormats { diff --git a/yt/yt/client/formats/yamr_parser_base.cpp b/yt/yt/library/formats/yamr_parser_base.cpp index 3a7a7d8833..935faaed80 100644 --- a/yt/yt/client/formats/yamr_parser_base.cpp +++ b/yt/yt/library/formats/yamr_parser_base.cpp @@ -1,7 +1,8 @@ #include "yamr_parser_base.h" #include "format.h" -#include "config.h" + +#include <yt/yt/client/formats/config.h> #include <yt/yt/client/table_client/public.h> diff --git a/yt/yt/client/formats/yamr_parser_base.h b/yt/yt/library/formats/yamr_parser_base.h index 240a2855a8..56968d4908 100644 --- a/yt/yt/client/formats/yamr_parser_base.h +++ b/yt/yt/library/formats/yamr_parser_base.h @@ -1,6 +1,6 @@ #pragma once -#include "parser.h" +#include 
<yt/yt/client/formats/parser.h> #include "escape.h" diff --git a/yt/yt/client/formats/yamr_writer.cpp b/yt/yt/library/formats/yamr_writer.cpp index 4408b35568..4408b35568 100644 --- a/yt/yt/client/formats/yamr_writer.cpp +++ b/yt/yt/library/formats/yamr_writer.cpp diff --git a/yt/yt/client/formats/yamr_writer.h b/yt/yt/library/formats/yamr_writer.h index d381b1d4bd..68b89efb26 100644 --- a/yt/yt/client/formats/yamr_writer.h +++ b/yt/yt/library/formats/yamr_writer.h @@ -1,7 +1,8 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> + #include "helpers.h" #include "yamr_writer_base.h" diff --git a/yt/yt/client/formats/yamr_writer_base.cpp b/yt/yt/library/formats/yamr_writer_base.cpp index 1dc73de0d8..1dc73de0d8 100644 --- a/yt/yt/client/formats/yamr_writer_base.cpp +++ b/yt/yt/library/formats/yamr_writer_base.cpp diff --git a/yt/yt/client/formats/yamr_writer_base.h b/yt/yt/library/formats/yamr_writer_base.h index d8483f6636..a8f0583be1 100644 --- a/yt/yt/client/formats/yamr_writer_base.h +++ b/yt/yt/library/formats/yamr_writer_base.h @@ -1,7 +1,8 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> + #include "helpers.h" #include "escape.h" #include "schemaless_writer_adapter.h" diff --git a/yt/yt/client/formats/yamred_dsv_parser.cpp b/yt/yt/library/formats/yamred_dsv_parser.cpp index 476e760ea6..476e760ea6 100644 --- a/yt/yt/client/formats/yamred_dsv_parser.cpp +++ b/yt/yt/library/formats/yamred_dsv_parser.cpp diff --git a/yt/yt/client/formats/yamred_dsv_parser.h b/yt/yt/library/formats/yamred_dsv_parser.h index e260ee6b30..6a214aa674 100644 --- a/yt/yt/client/formats/yamred_dsv_parser.h +++ b/yt/yt/library/formats/yamred_dsv_parser.h @@ -1,7 +1,7 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> 
namespace NYT::NFormats { diff --git a/yt/yt/client/formats/yamred_dsv_writer.cpp b/yt/yt/library/formats/yamred_dsv_writer.cpp index 956e771732..956e771732 100644 --- a/yt/yt/client/formats/yamred_dsv_writer.cpp +++ b/yt/yt/library/formats/yamred_dsv_writer.cpp diff --git a/yt/yt/client/formats/yamred_dsv_writer.h b/yt/yt/library/formats/yamred_dsv_writer.h index 6d9050abe3..6d1f4071be 100644 --- a/yt/yt/client/formats/yamred_dsv_writer.h +++ b/yt/yt/library/formats/yamred_dsv_writer.h @@ -1,7 +1,8 @@ #pragma once -#include "public.h" -#include "config.h" +#include <yt/yt/client/formats/public.h> +#include <yt/yt/client/formats/config.h> + #include "helpers.h" #include "yamr_writer_base.h" diff --git a/yt/yt/client/formats/yql_yson_converter.cpp b/yt/yt/library/formats/yql_yson_converter.cpp index dc57a1a69a..dc57a1a69a 100644 --- a/yt/yt/client/formats/yql_yson_converter.cpp +++ b/yt/yt/library/formats/yql_yson_converter.cpp diff --git a/yt/yt/client/formats/yql_yson_converter.h b/yt/yt/library/formats/yql_yson_converter.h index 5caafea963..1e5fa80e7f 100644 --- a/yt/yt/client/formats/yql_yson_converter.h +++ b/yt/yt/library/formats/yql_yson_converter.h @@ -1,4 +1,4 @@ -#include "public.h" +#include <yt/yt/client/formats/public.h> #include <yt/yt/client/table_client/public.h> diff --git a/yt/yt/client/formats/yson_map_to_unversioned_value.cpp b/yt/yt/library/formats/yson_map_to_unversioned_value.cpp index fced3477f5..fced3477f5 100644 --- a/yt/yt/client/formats/yson_map_to_unversioned_value.cpp +++ b/yt/yt/library/formats/yson_map_to_unversioned_value.cpp diff --git a/yt/yt/client/formats/yson_map_to_unversioned_value.h b/yt/yt/library/formats/yson_map_to_unversioned_value.h index 1e53ad6c88..023a0600a3 100644 --- a/yt/yt/client/formats/yson_map_to_unversioned_value.h +++ b/yt/yt/library/formats/yson_map_to_unversioned_value.h @@ -1,6 +1,6 @@ #pragma once -#include "public.h" +#include <yt/yt/client/formats/public.h> #include <yt/yt/client/table_client/public.h> 
#include <yt/yt/client/table_client/table_consumer.h> diff --git a/yt/yt/client/formats/yson_parser.cpp b/yt/yt/library/formats/yson_parser.cpp index 193b7caf31..d9bb6b303e 100644 --- a/yt/yt/client/formats/yson_parser.cpp +++ b/yt/yt/library/formats/yson_parser.cpp @@ -1,5 +1,6 @@ #include "yson_parser.h" -#include "parser.h" + +#include <yt/yt/client/formats/parser.h> #include <yt/yt/client/table_client/public.h> diff --git a/yt/yt/client/formats/yson_parser.h b/yt/yt/library/formats/yson_parser.h index a6e4880b30..5a5f9752da 100644 --- a/yt/yt/client/formats/yson_parser.h +++ b/yt/yt/library/formats/yson_parser.h @@ -1,6 +1,6 @@ #pragma once -#include "public.h" +#include <yt/yt/client/formats/public.h> #include <yt/yt/core/yson/public.h> diff --git a/yt/yt/client/unittests/logical_type_shortcuts.h b/yt/yt/library/logical_type_shortcuts/logical_type_shortcuts.h index 7b22006a06..7b22006a06 100644 --- a/yt/yt/client/unittests/logical_type_shortcuts.h +++ b/yt/yt/library/logical_type_shortcuts/logical_type_shortcuts.h |