diff options
author | Alexander Smirnov <alex@ydb.tech> | 2024-12-24 15:46:17 +0000 |
---|---|---|
committer | Alexander Smirnov <alex@ydb.tech> | 2024-12-24 15:46:17 +0000 |
commit | c7decaf9230ddcb1ec2c42d1f50fb3998166c4ef (patch) | |
tree | 4efde4e4276bb0f24c314909403a1f6ed94c60d7 /yt | |
parent | cf344b64297e6a79d1e538be9f8f59afb06a2a97 (diff) | |
parent | b821606f7bd364dc755d37b5bcb3559130675364 (diff) | |
download | ydb-c7decaf9230ddcb1ec2c42d1f50fb3998166c4ef.tar.gz |
Merge branch 'rightlib' into merge-libs-241224-1545
Diffstat (limited to 'yt')
166 files changed, 18342 insertions, 4415 deletions
diff --git a/yt/cpp/mapreduce/client/client.cpp b/yt/cpp/mapreduce/client/client.cpp index 9e3976b144..9fcb82f5b7 100644 --- a/yt/cpp/mapreduce/client/client.cpp +++ b/yt/cpp/mapreduce/client/client.cpp @@ -1352,11 +1352,27 @@ TNode::TListType TClient::SkyShareTable( const TSkyShareTableOptions& options) { CheckShutdown(); - return NRawClient::SkyShareTable( - ClientRetryPolicy_->CreatePolicyForGenericRequest(), - Context_, - tablePaths, - options); + + // As documented at https://wiki.yandex-team.ru/yt/userdoc/blob_tables/#shag3.sozdajomrazdachu + // first request returns HTTP status code 202 (Accepted). And we need retrying until we have 200 (OK). + NHttpClient::IHttpResponsePtr response; + do { + response = RequestWithRetry<NHttpClient::IHttpResponsePtr>( + ClientRetryPolicy_->CreatePolicyForGenericRequest(), + [this, &tablePaths, &options] (TMutationId /*mutationId*/) { + return RawClient_->SkyShareTable(tablePaths, options); + }); + TWaitProxy::Get()->Sleep(TDuration::Seconds(5)); + } while (response->GetStatusCode() != 200); + + if (options.KeyColumns_) { + return NodeFromJsonString(response->GetResponse())["torrents"].AsList(); + } else { + TNode torrent; + torrent["key"] = TNode::CreateList(); + torrent["rbtorrent"] = response->GetResponse(); + return TNode::TListType{torrent}; + } } TCheckPermissionResponse TClient::CheckPermission( diff --git a/yt/cpp/mapreduce/client/client_reader.cpp b/yt/cpp/mapreduce/client/client_reader.cpp index b312716877..e7538a22da 100644 --- a/yt/cpp/mapreduce/client/client_reader.cpp +++ b/yt/cpp/mapreduce/client/client_reader.cpp @@ -166,99 +166,28 @@ void TClientReader::CreateRequest(const TMaybe<ui32>& rangeIndex, const TMaybe<u CurrentRequestRetryPolicy_ = ClientRetryPolicy_->CreatePolicyForGenericRequest(); } - bool areRangesUpdated = false; + auto transactionId = (ReadTransaction_ ? 
ReadTransaction_->GetId() : ParentTransactionId_); - while (true) { - CurrentRequestRetryPolicy_->NotifyNewAttempt(); - - THttpHeader header("GET", GetReadTableCommand(Context_.Config->ApiVersion)); - if (Context_.ServiceTicketAuth) { - header.SetServiceTicket(Context_.ServiceTicketAuth->Ptr->IssueServiceTicket()); + if (rowIndex.Defined()) { + auto& ranges = Path_.MutableRanges(); + if (ranges.Empty()) { + ranges.ConstructInPlace(TVector{TReadRange()}); } else { - header.SetToken(Context_.Token); - } - - if (Context_.ImpersonationUser) { - header.SetImpersonationUser(*Context_.ImpersonationUser); - } - - auto transactionId = (ReadTransaction_ ? ReadTransaction_->GetId() : ParentTransactionId_); - header.AddTransactionId(transactionId); - - const auto& controlAttributes = Options_.ControlAttributes_; - header.AddParameter("control_attributes", TNode() - ("enable_row_index", controlAttributes.EnableRowIndex_) - ("enable_range_index", controlAttributes.EnableRangeIndex_)); - header.SetOutputFormat(Format_); - - header.SetResponseCompression(ToString(Context_.Config->AcceptEncoding)); - - if (rowIndex.Defined() && !areRangesUpdated) { - auto& ranges = Path_.MutableRanges(); - if (ranges.Empty()) { - ranges.ConstructInPlace(TVector{TReadRange()}); - } else { - if (rangeIndex.GetOrElse(0) >= ranges->size()) { - ythrow yexception() - << "range index " << rangeIndex.GetOrElse(0) - << " is out of range, input range count is " << ranges->size(); - } - ranges->erase(ranges->begin(), ranges->begin() + rangeIndex.GetOrElse(0)); + if (rangeIndex.GetOrElse(0) >= ranges->size()) { + ythrow yexception() + << "range index " << rangeIndex.GetOrElse(0) + << " is out of range, input range count is " << ranges->size(); } - ranges->begin()->LowerLimit(TReadLimit().RowIndex(*rowIndex)); - areRangesUpdated = true; - } - - header.MergeParameters(FormIORequestParameters(Path_, Options_)); - - auto requestId = CreateGuidAsString(); - - try { - const auto proxyName = 
GetProxyForHeavyRequest(Context_); - UpdateHeaderForProxyIfNeed(proxyName, Context_, header); - Response_ = Context_.HttpClient->Request(GetFullUrlForProxy(proxyName, Context_, header), requestId, header); - - Input_ = Response_->GetResponseStream(); - - YT_LOG_DEBUG( - "RSP %v - table stream (RangeIndex: %v, RowIndex: %v)", - requestId, - rangeIndex, - rowIndex); - - return; - } catch (const TErrorResponse& e) { - LogRequestError( - requestId, - header, - e.what(), - CurrentRequestRetryPolicy_->GetAttemptDescription()); - - if (!IsRetriable(e)) { - throw; - } - auto backoff = CurrentRequestRetryPolicy_->OnRetriableError(e); - if (!backoff) { - throw; - } - NDetail::TWaitProxy::Get()->Sleep(*backoff); - } catch (const std::exception& e) { - LogRequestError( - requestId, - header, - e.what(), - CurrentRequestRetryPolicy_->GetAttemptDescription()); - - Response_.reset(); - Input_ = nullptr; - - auto backoff = CurrentRequestRetryPolicy_->OnGenericError(e); - if (!backoff) { - throw; - } - NDetail::TWaitProxy::Get()->Sleep(*backoff); + ranges->erase(ranges->begin(), ranges->begin() + rangeIndex.GetOrElse(0)); } + ranges->begin()->LowerLimit(TReadLimit().RowIndex(*rowIndex)); } + + Input_ = NDetail::RequestWithRetry<std::unique_ptr<IInputStream>>( + CurrentRequestRetryPolicy_, + [this, &transactionId] (TMutationId /*mutationId*/) { + return RawClient_->ReadTable(transactionId, Path_, Format_, Options_); + }); } //////////////////////////////////////////////////////////////////////////////// diff --git a/yt/cpp/mapreduce/client/client_reader.h b/yt/cpp/mapreduce/client/client_reader.h index 61bc698340..3f73080046 100644 --- a/yt/cpp/mapreduce/client/client_reader.h +++ b/yt/cpp/mapreduce/client/client_reader.h @@ -2,8 +2,6 @@ #include <yt/cpp/mapreduce/common/fwd.h> -#include <yt/cpp/mapreduce/interface/io.h> - #include <yt/cpp/mapreduce/http/context.h> #include <yt/cpp/mapreduce/http/requests.h> #include <yt/cpp/mapreduce/http/http.h> @@ -55,8 +53,7 @@ private: 
THolder<TPingableTransaction> ReadTransaction_; - NHttpClient::IHttpResponsePtr Response_; - IInputStream* Input_; + std::unique_ptr<IInputStream> Input_; IRequestRetryPolicyPtr CurrentRequestRetryPolicy_; diff --git a/yt/cpp/mapreduce/client/file_reader.cpp b/yt/cpp/mapreduce/client/file_reader.cpp index 06463d0af2..f88b40e38b 100644 --- a/yt/cpp/mapreduce/client/file_reader.cpp +++ b/yt/cpp/mapreduce/client/file_reader.cpp @@ -31,7 +31,7 @@ using ::ToString; static TMaybe<ui64> GetEndOffset(const TFileReaderOptions& options) { if (options.Length_) { - return options.Offset_.GetOrElse(0) + *options.Length_; + return options.Offset_ + *options.Length_; } else { return Nothing(); } @@ -46,7 +46,6 @@ TStreamReaderBase::TStreamReaderBase( const TClientContext& context, const TTransactionId& transactionId) : RawClient_(rawClient) - , Context_(context) , ClientRetryPolicy_(std::move(clientRetryPolicy)) , ReadTransaction_(MakeHolder<TPingableTransaction>( RawClient_, @@ -64,59 +63,26 @@ TYPath TStreamReaderBase::Snapshot(const TYPath& path) return NYT::Snapshot(RawClient_, ClientRetryPolicy_, ReadTransaction_->GetId(), path); } -TString TStreamReaderBase::GetActiveRequestId() const -{ - if (Response_) { - return Response_->GetRequestId();; - } else { - return "<no-active-request>"; - } -} - size_t TStreamReaderBase::DoRead(void* buf, size_t len) { - const int retryCount = Context_.Config->ReadRetryCount; - for (int attempt = 1; attempt <= retryCount; ++attempt) { - try { - if (!Input_) { - Response_ = Request(Context_, ReadTransaction_->GetId(), CurrentOffset_); - Input_ = Response_->GetResponseStream(); - } - if (len == 0) { - return 0; - } - const size_t read = Input_->Read(buf, len); - CurrentOffset_ += read; - return read; - } catch (TErrorResponse& e) { - YT_LOG_ERROR("RSP %v - failed: %v (attempt %v of %v)", - GetActiveRequestId(), - e.what(), - attempt, - retryCount); - - if (!IsRetriable(e) || attempt == retryCount) { - throw; - } - 
TWaitProxy::Get()->Sleep(GetBackoffDuration(e, Context_.Config)); - } catch (std::exception& e) { - YT_LOG_ERROR("RSP %v - failed: %v (attempt %v of %v)", - GetActiveRequestId(), - e.what(), - attempt, - retryCount); - - // Invalidate connection. - Response_.reset(); - - if (attempt == retryCount) { + if (len == 0) { + return 0; + } + return RequestWithRetry<size_t>( + ClientRetryPolicy_->CreatePolicyForReaderRequest(), + [this, &buf, len] (TMutationId /*mutationId*/) { + try { + if (!Input_) { + Input_ = Request(ReadTransaction_->GetId(), CurrentOffset_); + } + const size_t read = Input_->Read(buf, len); + CurrentOffset_ += read; + return read; + } catch (...) { + Input_ = nullptr; throw; } - TWaitProxy::Get()->Sleep(GetBackoffDuration(e, Context_.Config)); - } - Input_ = nullptr; - } - Y_UNREACHABLE(); // we should either return or throw from loop above + }); } //////////////////////////////////////////////////////////////////////////////// @@ -130,57 +96,25 @@ TFileReader::TFileReader( const TTransactionId& transactionId, const TFileReaderOptions& options) : TStreamReaderBase(rawClient, std::move(clientRetryPolicy), std::move(transactionPinger), context, transactionId) - , FileReaderOptions_(options) + , StartOffset_(options.Offset_) + , EndOffset_(GetEndOffset(options)) + , Options_(options) , Path_(path) - , StartOffset_(FileReaderOptions_.Offset_.GetOrElse(0)) - , EndOffset_(GetEndOffset(FileReaderOptions_)) { Path_.Path_ = TStreamReaderBase::Snapshot(Path_.Path_); } -NHttpClient::IHttpResponsePtr TFileReader::Request(const TClientContext& context, const TTransactionId& transactionId, ui64 readBytes) +std::unique_ptr<IInputStream> TFileReader::Request(const TTransactionId& transactionId, ui64 readBytes) { const ui64 currentOffset = StartOffset_ + readBytes; - TString hostName = GetProxyForHeavyRequest(context); - - THttpHeader header("GET", GetReadFileCommand(context.Config->ApiVersion)); - if (context.ServiceTicketAuth) { - 
header.SetServiceTicket(context.ServiceTicketAuth->Ptr->IssueServiceTicket()); - } else { - header.SetToken(context.Token); - } - - if (context.ImpersonationUser) { - header.SetImpersonationUser(*context.ImpersonationUser); - } - - UpdateHeaderForProxyIfNeed(hostName, context, header); - - header.AddTransactionId(transactionId); - header.SetOutputFormat(TMaybe<TFormat>()); // Binary format if (EndOffset_) { Y_ABORT_UNLESS(*EndOffset_ >= currentOffset); - FileReaderOptions_.Length(*EndOffset_ - currentOffset); - } - FileReaderOptions_.Offset(currentOffset); - header.MergeParameters(FormIORequestParameters(Path_, FileReaderOptions_)); - - header.SetResponseCompression(ToString(context.Config->AcceptEncoding)); - - auto requestId = CreateGuidAsString(); - NHttpClient::IHttpResponsePtr response; - try { - response = context.HttpClient->Request(GetFullUrl(hostName, context, header), requestId, header); - } catch (const std::exception& ex) { - LogRequestError(requestId, header, ex.what(), ""); - throw; + Options_.Length(*EndOffset_ - currentOffset); } - YT_LOG_DEBUG("RSP %v - file stream", - requestId); - - return response; + Options_.Offset(currentOffset); + return RawClient_->ReadFile(transactionId, Path_, Options_); } //////////////////////////////////////////////////////////////////////////////// @@ -195,66 +129,22 @@ TBlobTableReader::TBlobTableReader( const TTransactionId& transactionId, const TBlobTableReaderOptions& options) : TStreamReaderBase(rawClient, std::move(retryPolicy), std::move(transactionPinger), context, transactionId) + , StartOffset_(options.Offset_) , Key_(key) , Options_(options) { Path_ = TStreamReaderBase::Snapshot(path); } -NHttpClient::IHttpResponsePtr TBlobTableReader::Request(const TClientContext& context, const TTransactionId& transactionId, ui64 readBytes) +std::unique_ptr<IInputStream> TBlobTableReader::Request(const TTransactionId& transactionId, ui64 readBytes) { - TString hostName = GetProxyForHeavyRequest(context); - - THttpHeader 
header("GET", "read_blob_table"); - if (context.ServiceTicketAuth) { - header.SetServiceTicket(context.ServiceTicketAuth->Ptr->IssueServiceTicket()); - } else { - header.SetToken(context.Token); - } - - if (context.ImpersonationUser) { - header.SetImpersonationUser(*context.ImpersonationUser); - } - - UpdateHeaderForProxyIfNeed(hostName, context, header); - - header.AddTransactionId(transactionId); - header.SetOutputFormat(TMaybe<TFormat>()); // Binary format - - const ui64 currentOffset = Options_.Offset_ + readBytes; + const i64 currentOffset = StartOffset_ + readBytes; const i64 startPartIndex = currentOffset / Options_.PartSize_; - const ui64 skipBytes = currentOffset - Options_.PartSize_ * startPartIndex; - auto lowerLimitKey = Key_; - lowerLimitKey.Parts_.push_back(startPartIndex); - auto upperLimitKey = Key_; - upperLimitKey.Parts_.push_back(std::numeric_limits<i64>::max()); - TNode params = PathToParamNode(TRichYPath(Path_).AddRange(TReadRange() - .LowerLimit(TReadLimit().Key(lowerLimitKey)) - .UpperLimit(TReadLimit().Key(upperLimitKey)))); - params["start_part_index"] = TNode(startPartIndex); - params["offset"] = skipBytes; - if (Options_.PartIndexColumnName_) { - params["part_index_column_name"] = *Options_.PartIndexColumnName_; - } - if (Options_.DataColumnName_) { - params["data_column_name"] = *Options_.DataColumnName_; - } - params["part_size"] = Options_.PartSize_; - header.MergeParameters(params); - header.SetResponseCompression(ToString(context.Config->AcceptEncoding)); - - auto requestId = CreateGuidAsString(); - NHttpClient::IHttpResponsePtr response; - try { - response = context.HttpClient->Request(GetFullUrl(hostName, context, header), requestId, header); - } catch (const std::exception& ex) { - LogRequestError(requestId, header, ex.what(), ""); - throw; - } + const i64 skipBytes = currentOffset - Options_.PartSize_ * startPartIndex; - YT_LOG_DEBUG("RSP %v - blob table stream", - requestId); - return response; + Options_.Offset(skipBytes); + 
Options_.StartPartIndex(startPartIndex); + return RawClient_->ReadBlobTable(transactionId, Path_, Key_, Options_); } //////////////////////////////////////////////////////////////////////////////// diff --git a/yt/cpp/mapreduce/client/file_reader.h b/yt/cpp/mapreduce/client/file_reader.h index 48248696d3..8aafdc860d 100644 --- a/yt/cpp/mapreduce/client/file_reader.h +++ b/yt/cpp/mapreduce/client/file_reader.h @@ -11,7 +11,6 @@ class IInputStream; namespace NYT { -class THttpRequest; class TPingableTransaction; namespace NDetail { @@ -35,19 +34,16 @@ protected: protected: const IRawClientPtr RawClient_; - const TClientContext Context_; private: size_t DoRead(void* buf, size_t len) override; - virtual NHttpClient::IHttpResponsePtr Request(const TClientContext& context, const TTransactionId& transactionId, ui64 readBytes) = 0; - TString GetActiveRequestId() const; + virtual std::unique_ptr<IInputStream> Request(const TTransactionId& transactionId, ui64 readBytes) = 0; private: const IClientRetryPolicyPtr ClientRetryPolicy_; TFileReaderOptions FileReaderOptions_; - NHttpClient::IHttpResponsePtr Response_; - IInputStream* Input_ = nullptr; + std::unique_ptr<IInputStream> Input_; THolder<TPingableTransaction> ReadTransaction_; @@ -67,17 +63,17 @@ public: ITransactionPingerPtr transactionPinger, const TClientContext& context, const TTransactionId& transactionId, - const TFileReaderOptions& options = TFileReaderOptions()); + const TFileReaderOptions& options = {}); private: - NHttpClient::IHttpResponsePtr Request(const TClientContext& context, const TTransactionId& transactionId, ui64 readBytes) override; + std::unique_ptr<IInputStream> Request(const TTransactionId& transactionId, ui64 readBytes) override; private: - TFileReaderOptions FileReaderOptions_; - - TRichYPath Path_; const ui64 StartOffset_; const TMaybe<ui64> EndOffset_; + + TFileReaderOptions Options_; + TRichYPath Path_; }; //////////////////////////////////////////////////////////////////////////////// @@ 
-94,14 +90,16 @@ public: ITransactionPingerPtr transactionPinger, const TClientContext& context, const TTransactionId& transactionId, - const TBlobTableReaderOptions& options); + const TBlobTableReaderOptions& options = {}); private: - NHttpClient::IHttpResponsePtr Request(const TClientContext& context, const TTransactionId& transactionId, ui64 readBytes) override; + std::unique_ptr<IInputStream> Request(const TTransactionId& transactionId, ui64 readBytes) override; private: + const ui64 StartOffset_; const TKey Key_; - const TBlobTableReaderOptions Options_; + + TBlobTableReaderOptions Options_; TYPath Path_; }; diff --git a/yt/cpp/mapreduce/common/retry_lib.cpp b/yt/cpp/mapreduce/common/retry_lib.cpp index 772a2ab0cd..8146eb8b46 100644 --- a/yt/cpp/mapreduce/common/retry_lib.cpp +++ b/yt/cpp/mapreduce/common/retry_lib.cpp @@ -118,6 +118,11 @@ public: return Wrap(MakeIntrusive<TAttemptLimitedRetryPolicy>(static_cast<ui32>(Config_->StartOperationRetryCount), Config_)); } + IRequestRetryPolicyPtr CreatePolicyForReaderRequest() override + { + return Wrap(MakeIntrusive<TAttemptLimitedRetryPolicy>(static_cast<ui32>(Config_->ReadRetryCount), Config_)); + } + IRequestRetryPolicyPtr Wrap(IRequestRetryPolicyPtr basePolicy) { auto config = RetryConfigProvider_->CreateRetryConfig(); diff --git a/yt/cpp/mapreduce/common/retry_lib.h b/yt/cpp/mapreduce/common/retry_lib.h index c6c061f614..5b406b075f 100644 --- a/yt/cpp/mapreduce/common/retry_lib.h +++ b/yt/cpp/mapreduce/common/retry_lib.h @@ -48,6 +48,7 @@ class IClientRetryPolicy public: virtual IRequestRetryPolicyPtr CreatePolicyForGenericRequest() = 0; virtual IRequestRetryPolicyPtr CreatePolicyForStartOperationRequest() = 0; + virtual IRequestRetryPolicyPtr CreatePolicyForReaderRequest() = 0; }; diff --git a/yt/cpp/mapreduce/http/http_client.h b/yt/cpp/mapreduce/http/http_client.h index 6087eca098..a01b619fab 100644 --- a/yt/cpp/mapreduce/http/http_client.h +++ b/yt/cpp/mapreduce/http/http_client.h @@ -42,7 +42,6 @@ public: 
virtual IHttpResponsePtr Finish() = 0; }; - class IHttpClient { public: @@ -65,6 +64,34 @@ public: //////////////////////////////////////////////////////////////////////////////// +class THttpResponseStream + : public IInputStream +{ +public: + THttpResponseStream(IHttpResponsePtr response) + : Response_(std::move(response)) + { + Underlying_ = Response_->GetResponseStream(); + } + +private: + size_t DoRead(void *buf, size_t len) override + { + return Underlying_->Read(buf, len); + } + + size_t DoSkip(size_t len) override + { + return Underlying_->Skip(len); + } + +private: + IHttpResponsePtr Response_; + IInputStream* Underlying_; +}; + +//////////////////////////////////////////////////////////////////////////////// + IHttpClientPtr CreateDefaultHttpClient(); IHttpClientPtr CreateCoreHttpClient(bool useTLS, const TConfigPtr& config); diff --git a/yt/cpp/mapreduce/http/retry_request.cpp b/yt/cpp/mapreduce/http/retry_request.cpp index 1d9267009f..a47b2952b1 100644 --- a/yt/cpp/mapreduce/http/retry_request.cpp +++ b/yt/cpp/mapreduce/http/retry_request.cpp @@ -20,7 +20,7 @@ namespace NDetail { //////////////////////////////////////////////////////////////////////////////// -static TResponseInfo Request( +static NHttpClient::IHttpResponsePtr Request( const TClientContext& context, THttpHeader& header, TMaybe<TStringBuf> body, @@ -38,16 +38,10 @@ static TResponseInfo Request( auto url = GetFullUrlForProxy(hostName, context, header); - auto response = context.HttpClient->Request(url, requestId, config.HttpConfig, header, body); - - TResponseInfo result; - result.RequestId = requestId; - result.Response = response->GetResponse(); - result.HttpCode = response->GetStatusCode(); - return result; + return context.HttpClient->Request(url, requestId, config.HttpConfig, header, body); } -TResponseInfo RequestWithoutRetry( +NHttpClient::IHttpResponsePtr RequestWithoutRetry( const TClientContext& context, TMutationId& mutationId, THttpHeader& header, @@ -118,7 +112,12 @@ 
TResponseInfo RetryRequestWithPolicy( } } - return Request(context, header, body, requestId, config); + auto response = Request(context, header, body, requestId, config); + return TResponseInfo{ + .RequestId = response->GetRequestId(), + .Response = response->GetResponse(), + .HttpCode = response->GetStatusCode(), + }; } catch (const TErrorResponse& e) { LogRequestError(requestId, header, e.what(), retryPolicy->GetAttemptDescription()); retryWithSameMutationId = e.IsTransportError(); diff --git a/yt/cpp/mapreduce/http/retry_request.h b/yt/cpp/mapreduce/http/retry_request.h index 9750d0b541..444ecbbafc 100644 --- a/yt/cpp/mapreduce/http/retry_request.h +++ b/yt/cpp/mapreduce/http/retry_request.h @@ -105,7 +105,7 @@ TResponseInfo RetryRequestWithPolicy( TMaybe<TStringBuf> body = {}, const TRequestConfig& config = TRequestConfig()); -TResponseInfo RequestWithoutRetry( +NHttpClient::IHttpResponsePtr RequestWithoutRetry( const TClientContext& context, TMutationId& mutationId, THttpHeader& header, diff --git a/yt/cpp/mapreduce/interface/client_method_options.h b/yt/cpp/mapreduce/interface/client_method_options.h index 9bfb79753d..d457bf5f43 100644 --- a/yt/cpp/mapreduce/interface/client_method_options.h +++ b/yt/cpp/mapreduce/interface/client_method_options.h @@ -287,9 +287,12 @@ struct TBlobTableReaderOptions /// /// All blob parts except the last part of the blob must be of this size /// otherwise blob table reader emits error. - FLUENT_FIELD_DEFAULT(ui64, PartSize, 4 * 1024 * 1024); + FLUENT_FIELD_DEFAULT(i64, PartSize, 4 * 1024 * 1024); - /// @brief Offset from which to start reading + /// @brief Part index from which to start reading. + FLUENT_FIELD_DEFAULT(i64, StartPartIndex, 0); + + /// @brief Offset from which to start reading. FLUENT_FIELD_DEFAULT(i64, Offset, 0); }; @@ -468,7 +471,7 @@ struct TFileReaderOptions /// @brief Offset to start reading from. /// /// By default reading is started from the beginning of the file. 
- FLUENT_FIELD_OPTION(i64, Offset); + FLUENT_FIELD_DEFAULT(i64, Offset, 0); /// /// @brief Maximum length to read. diff --git a/yt/cpp/mapreduce/interface/raw_client.h b/yt/cpp/mapreduce/interface/raw_client.h index 32055e3d00..4994826863 100644 --- a/yt/cpp/mapreduce/interface/raw_client.h +++ b/yt/cpp/mapreduce/interface/raw_client.h @@ -8,6 +8,13 @@ namespace NYT { //////////////////////////////////////////////////////////////////////////////// +namespace NHttpClient { + class IHttpResponse; + using IHttpResponsePtr = std::unique_ptr<IHttpResponse>; +} + +//////////////////////////////////////////////////////////////////////////////// + class IRawClient : public virtual TThrRefBase { @@ -196,6 +203,18 @@ public: const TOperationId& operationId, const TGetJobTraceOptions& options = {}) = 0; + // SkyShare + + virtual NHttpClient::IHttpResponsePtr SkyShareTable( + const std::vector<TYPath>& tablePaths, + const TSkyShareTableOptions& options = {}) = 0; + + // Files + virtual std::unique_ptr<IInputStream> ReadFile( + const TTransactionId& transactionId, + const TRichYPath& path, + const TFileReaderOptions& options = {}) = 0; + // File cache virtual TMaybe<TYPath> GetFileFromCache( @@ -266,6 +285,18 @@ public: const TYPath& path, const TAlterTableOptions& options = {}) = 0; + virtual std::unique_ptr<IInputStream> ReadTable( + const TTransactionId& transactionId, + const TRichYPath& path, + const TMaybe<TFormat>& format, + const TTableReaderOptions& options = {}) = 0; + + virtual std::unique_ptr<IInputStream> ReadBlobTable( + const TTransactionId& transactionId, + const TRichYPath& path, + const TKey& key, + const TBlobTableReaderOptions& options = {}) = 0; + virtual void AlterTableReplica( TMutationId& mutationId, const TReplicaId& replicaId, diff --git a/yt/cpp/mapreduce/io/helpers.h b/yt/cpp/mapreduce/io/helpers.h index 0733ff417c..0d3ec40ab6 100644 --- a/yt/cpp/mapreduce/io/helpers.h +++ b/yt/cpp/mapreduce/io/helpers.h @@ -63,9 +63,7 @@ inline TNode 
FormIORequestParameters( if (options.Config_) { params[TIOOptionsTraits<TTableReaderOptions>::ConfigName] = *options.Config_; } - if (options.Offset_) { - params["offset"] = *options.Offset_; - } + params["offset"] = options.Offset_; if (options.Length_) { params["length"] = *options.Length_; } diff --git a/yt/cpp/mapreduce/raw_client/raw_client.cpp b/yt/cpp/mapreduce/raw_client/raw_client.cpp index 71d8d5fba9..65bfa01cea 100644 --- a/yt/cpp/mapreduce/raw_client/raw_client.cpp +++ b/yt/cpp/mapreduce/raw_client/raw_client.cpp @@ -14,6 +14,8 @@ #include <yt/cpp/mapreduce/interface/operation.h> #include <yt/cpp/mapreduce/interface/tvm.h> +#include <yt/cpp/mapreduce/io/helpers.h> + #include <library/cpp/yson/node/node_io.h> namespace NYT::NDetail { @@ -32,7 +34,7 @@ TNode THttpRawClient::Get( TMutationId mutationId; THttpHeader header("GET", "get"); header.MergeParameters(NRawClient::SerializeParamsForGet(transactionId, Context_.Config->Prefix, path, options)); - return NodeFromYsonString(RequestWithoutRetry(Context_, mutationId, header).Response); + return NodeFromYsonString(RequestWithoutRetry(Context_, mutationId, header)->GetResponse()); } TNode THttpRawClient::TryGet( @@ -61,7 +63,7 @@ void THttpRawClient::Set( header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForSet(transactionId, Context_.Config->Prefix, path, options)); auto body = NodeToYsonString(value); - RequestWithoutRetry(Context_, mutationId, header, body); + RequestWithoutRetry(Context_, mutationId, header, body)->GetResponse(); } bool THttpRawClient::Exists( @@ -72,7 +74,7 @@ bool THttpRawClient::Exists( TMutationId mutationId; THttpHeader header("GET", "exists"); header.MergeParameters(NRawClient::SerializeParamsForExists(transactionId, Context_.Config->Prefix, path, options)); - return ParseBoolFromResponse(RequestWithoutRetry(Context_, mutationId, header).Response); + return ParseBoolFromResponse(RequestWithoutRetry(Context_, mutationId, header)->GetResponse()); } void 
THttpRawClient::MultisetAttributes( @@ -86,7 +88,7 @@ void THttpRawClient::MultisetAttributes( header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForMultisetAttributes(transactionId, Context_.Config->Prefix, path, options)); auto body = NodeToYsonString(value); - RequestWithoutRetry(Context_, mutationId, header, body); + RequestWithoutRetry(Context_, mutationId, header, body)->GetResponse(); } TNodeId THttpRawClient::Create( @@ -99,7 +101,7 @@ TNodeId THttpRawClient::Create( THttpHeader header("POST", "create"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForCreate(transactionId, Context_.Config->Prefix, path, type, options)); - return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header).Response); + return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header)->GetResponse()); } TNodeId THttpRawClient::CopyWithoutRetries( @@ -112,7 +114,7 @@ TNodeId THttpRawClient::CopyWithoutRetries( THttpHeader header("POST", "copy"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForCopy(transactionId, Context_.Config->Prefix, sourcePath, destinationPath, options)); - return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header).Response); + return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header)->GetResponse()); } TNodeId THttpRawClient::CopyInsideMasterCell( @@ -129,7 +131,7 @@ TNodeId THttpRawClient::CopyInsideMasterCell( // Make cross cell copying disable. 
params["enable_cross_cell_copying"] = false; header.MergeParameters(params); - return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header).Response); + return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header)->GetResponse()); } TNodeId THttpRawClient::MoveWithoutRetries( @@ -142,7 +144,7 @@ TNodeId THttpRawClient::MoveWithoutRetries( THttpHeader header("POST", "move"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForMove(transactionId, Context_.Config->Prefix, sourcePath, destinationPath, options)); - return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header).Response); + return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header)->GetResponse()); } TNodeId THttpRawClient::MoveInsideMasterCell( @@ -159,7 +161,7 @@ TNodeId THttpRawClient::MoveInsideMasterCell( // Make cross cell copying disable. params["enable_cross_cell_copying"] = false; header.MergeParameters(params); - return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header).Response); + return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header)->GetResponse()); } void THttpRawClient::Remove( @@ -171,7 +173,7 @@ void THttpRawClient::Remove( THttpHeader header("POST", "remove"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForRemove(transactionId, Context_.Config->Prefix, path, options)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } TNode::TListType THttpRawClient::List( @@ -190,7 +192,7 @@ TNode::TListType THttpRawClient::List( } header.MergeParameters(NRawClient::SerializeParamsForList(transactionId, Context_.Config->Prefix, updatedPath, options)); auto responseInfo = RequestWithoutRetry(Context_, mutationId, header); - return NodeFromYsonString(responseInfo.Response).AsList(); + return NodeFromYsonString(responseInfo->GetResponse()).AsList(); } TNodeId 
THttpRawClient::Link( @@ -203,7 +205,7 @@ TNodeId THttpRawClient::Link( THttpHeader header("POST", "link"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForLink(transactionId, Context_.Config->Prefix, targetPath, linkPath, options)); - return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header).Response); + return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header)->GetResponse()); } TLockId THttpRawClient::Lock( @@ -216,7 +218,7 @@ TLockId THttpRawClient::Lock( THttpHeader header("POST", "lock"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForLock(transactionId, Context_.Config->Prefix, path, mode, options)); - return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header).Response); + return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header)->GetResponse()); } void THttpRawClient::Unlock( @@ -228,7 +230,7 @@ void THttpRawClient::Unlock( THttpHeader header("POST", "unlock"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForUnlock(transactionId, Context_.Config->Prefix, path, options)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::Concatenate( @@ -241,7 +243,7 @@ void THttpRawClient::Concatenate( THttpHeader header("POST", "concatenate"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForConcatenate(transactionId, Context_.Config->Prefix, sourcePaths, destinationPath, options)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } TTransactionId THttpRawClient::StartTransaction( @@ -252,7 +254,7 @@ TTransactionId THttpRawClient::StartTransaction( THttpHeader header("POST", "start_tx"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForStartTransaction(parentTransactionId, 
Context_.Config->TxTimeout, options)); - return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header).Response); + return ParseGuidFromResponse(RequestWithoutRetry(Context_, mutationId, header)->GetResponse()); } void THttpRawClient::PingTransaction(const TTransactionId& transactionId) @@ -264,7 +266,7 @@ void THttpRawClient::PingTransaction(const TTransactionId& transactionId) requestConfig.HttpConfig = NHttpClient::THttpConfig{ .SocketTimeout = Context_.Config->PingTimeout }; - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::AbortTransaction( @@ -274,7 +276,7 @@ void THttpRawClient::AbortTransaction( THttpHeader header("POST", "abort_tx"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForAbortTransaction(transactionId)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::CommitTransaction( @@ -284,7 +286,7 @@ void THttpRawClient::CommitTransaction( THttpHeader header("POST", "commit_tx"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForCommitTransaction(transactionId)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } TOperationAttributes THttpRawClient::GetOperation( @@ -295,7 +297,7 @@ TOperationAttributes THttpRawClient::GetOperation( THttpHeader header("GET", "get_operation"); header.MergeParameters(NRawClient::SerializeParamsForGetOperation(operationId, options)); auto responseInfo = RequestWithoutRetry(Context_, mutationId, header); - return NRawClient::ParseOperationAttributes(NodeFromYsonString(responseInfo.Response)); + return NRawClient::ParseOperationAttributes(NodeFromYsonString(responseInfo->GetResponse())); } TOperationAttributes THttpRawClient::GetOperation( @@ -306,7 +308,7 @@ TOperationAttributes 
THttpRawClient::GetOperation( THttpHeader header("GET", "get_operation"); header.MergeParameters(NRawClient::SerializeParamsForGetOperation(alias, options)); auto responseInfo = RequestWithoutRetry(Context_, mutationId, header); - return NRawClient::ParseOperationAttributes(NodeFromYsonString(responseInfo.Response)); + return NRawClient::ParseOperationAttributes(NodeFromYsonString(responseInfo->GetResponse())); } void THttpRawClient::AbortOperation( @@ -316,7 +318,7 @@ void THttpRawClient::AbortOperation( THttpHeader header("POST", "abort_op"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForAbortOperation(operationId)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::CompleteOperation( @@ -326,7 +328,7 @@ void THttpRawClient::CompleteOperation( THttpHeader header("POST", "complete_op"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForCompleteOperation(operationId)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::SuspendOperation( @@ -337,7 +339,7 @@ void THttpRawClient::SuspendOperation( THttpHeader header("POST", "suspend_op"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForSuspendOperation(operationId, options)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::ResumeOperation( @@ -348,7 +350,7 @@ void THttpRawClient::ResumeOperation( THttpHeader header("POST", "resume_op"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForResumeOperation(operationId, options)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } template <typename TKey> @@ -367,7 +369,7 @@ TListOperationsResult 
THttpRawClient::ListOperations(const TListOperationsOption THttpHeader header("GET", "list_operations"); header.MergeParameters(NRawClient::SerializeParamsForListOperations(options)); auto responseInfo = RequestWithoutRetry(Context_, mutationId, header); - auto resultNode = NodeFromYsonString(responseInfo.Response); + auto resultNode = NodeFromYsonString(responseInfo->GetResponse()); const auto& operationNodesList = resultNode["operations"].AsList(); @@ -417,7 +419,7 @@ NYson::TYsonString THttpRawClient::GetJob( THttpHeader header("GET", "get_job"); header.MergeParameters(NRawClient::SerializeParamsForGetJob(operationId, jobId, options)); auto responseInfo = RequestWithoutRetry(Context_, mutationId, header); - return NYson::TYsonString(responseInfo.Response); + return NYson::TYsonString(responseInfo->GetResponse()); } TListJobsResult THttpRawClient::ListJobs( @@ -428,7 +430,7 @@ TListJobsResult THttpRawClient::ListJobs( THttpHeader header("GET", "list_jobs"); header.MergeParameters(NRawClient::SerializeParamsForListJobs(operationId, options)); auto responseInfo = RequestWithoutRetry(Context_, mutationId, header); - auto resultNode = NodeFromYsonString(responseInfo.Response); + auto resultNode = NodeFromYsonString(responseInfo->GetResponse()); const auto& jobNodesList = resultNode["jobs"].AsList(); @@ -524,7 +526,7 @@ TString THttpRawClient::GetJobStderrWithRetries( TRequestConfig config; config.IsHeavy = true; auto responseInfo = RequestWithoutRetry(Context_, mutationId, header, {}, config); - return responseInfo.Response; + return responseInfo->GetResponse(); } IFileReaderPtr THttpRawClient::GetJobStderr( @@ -573,7 +575,7 @@ std::vector<TJobTraceEvent> THttpRawClient::GetJobTrace( THttpHeader header("GET", "get_job_trace"); header.MergeParameters(NRawClient::SerializeParamsForGetJobTrace(operationId, options)); auto responseInfo = RequestWithoutRetry(Context_, mutationId, header); - auto resultNode = NodeFromYsonString(responseInfo.Response); + auto resultNode = 
NodeFromYsonString(responseInfo->GetResponse()); const auto& traceEventNodesList = resultNode.AsList(); @@ -586,6 +588,50 @@ std::vector<TJobTraceEvent> THttpRawClient::GetJobTrace( return result; } +NHttpClient::IHttpResponsePtr THttpRawClient::SkyShareTable( + const std::vector<TYPath>& tablePaths, + const TSkyShareTableOptions& options) +{ + TMutationId mutationId; + THttpHeader header("POST", "api/v1/share", /*IsApi*/ false); + + auto proxyName = Context_.ServerName.substr(0, Context_.ServerName.find('.')); + + auto host = Context_.Config->SkynetApiHost; + if (host == "") { + host = "skynet." + proxyName + ".yt.yandex.net"; + } + + TSkyShareTableOptions patchedOptions = options; + + if (Context_.Config->Pool && !patchedOptions.Pool_) { + patchedOptions.Pool(Context_.Config->Pool); + } + + header.MergeParameters(NRawClient::SerializeParamsForSkyShareTable(proxyName, Context_.Config->Prefix, tablePaths, patchedOptions)); + TClientContext skyApiHost({.ServerName = host, .HttpClient = NHttpClient::CreateDefaultHttpClient()}); + + return RequestWithoutRetry(skyApiHost, mutationId, header, ""); +} + +std::unique_ptr<IInputStream> THttpRawClient::ReadFile( + const TTransactionId& transactionId, + const TRichYPath& path, + const TFileReaderOptions& options) +{ + TMutationId mutationId; + THttpHeader header("GET", GetReadFileCommand(Context_.Config->ApiVersion)); + header.AddTransactionId(transactionId); + header.SetOutputFormat(TMaybe<TFormat>()); // Binary format + header.MergeParameters(FormIORequestParameters(path, options)); + header.SetResponseCompression(ToString(Context_.Config->AcceptEncoding)); + + TRequestConfig config; + config.IsHeavy = true; + auto responseInfo = RequestWithoutRetry(Context_, mutationId, header, /*body*/ {}, config); + return std::make_unique<NHttpClient::THttpResponseStream>(std::move(responseInfo)); +} + TMaybe<TYPath> THttpRawClient::GetFileFromCache( const TTransactionId& transactionId, const TString& md5Signature, @@ -596,7 +642,7 @@ 
TMaybe<TYPath> THttpRawClient::GetFileFromCache( THttpHeader header("GET", "get_file_from_cache"); header.MergeParameters(NRawClient::SerializeParamsForGetFileFromCache(transactionId, md5Signature, cachePath, options)); auto responseInfo = RequestWithoutRetry(Context_, mutationId, header); - auto resultNode = NodeFromYsonString(responseInfo.Response).AsString(); + auto resultNode = NodeFromYsonString(responseInfo->GetResponse()).AsString(); return resultNode.empty() ? Nothing() : TMaybe<TYPath>(resultNode); } @@ -611,7 +657,7 @@ TYPath THttpRawClient::PutFileToCache( THttpHeader header("POST", "put_file_to_cache"); header.MergeParameters(NRawClient::SerializeParamsForPutFileToCache(transactionId, Context_.Config->Prefix, filePath, md5Signature, cachePath, options)); auto responseInfo = RequestWithoutRetry(Context_, mutationId, header); - return NodeFromYsonString(responseInfo.Response).AsString(); + return NodeFromYsonString(responseInfo->GetResponse()).AsString(); } void THttpRawClient::MountTable( @@ -626,7 +672,7 @@ void THttpRawClient::MountTable( header.AddParameter("cell_id", GetGuidAsString(*options.CellId_)); } header.AddParameter("freeze", options.Freeze_); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::UnmountTable( @@ -638,7 +684,7 @@ void THttpRawClient::UnmountTable( header.AddMutationId(); header.MergeParameters(NRawClient::SerializeTabletParams(Context_.Config->Prefix, path, options)); header.AddParameter("force", options.Force_); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::RemountTable( @@ -649,7 +695,7 @@ void THttpRawClient::RemountTable( THttpHeader header("POST", "remount_table"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeTabletParams(Context_.Config->Prefix, path, options)); - RequestWithoutRetry(Context_, mutationId, 
header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::ReshardTableByPivotKeys( @@ -662,7 +708,7 @@ void THttpRawClient::ReshardTableByPivotKeys( header.AddMutationId(); header.MergeParameters(NRawClient::SerializeTabletParams(Context_.Config->Prefix, path, options)); header.AddParameter("pivot_keys", BuildYsonNodeFluently().List(keys)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::ReshardTableByTabletCount( @@ -675,7 +721,7 @@ void THttpRawClient::ReshardTableByTabletCount( header.AddMutationId(); header.MergeParameters(NRawClient::SerializeTabletParams(Context_.Config->Prefix, path, options)); header.AddParameter("tablet_count", tabletCount); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::InsertRows( @@ -690,7 +736,7 @@ void THttpRawClient::InsertRows( auto body = NodeListToYsonString(rows); TRequestConfig config; config.IsHeavy = true; - RequestWithoutRetry(Context_, mutationId, header, body, config); + RequestWithoutRetry(Context_, mutationId, header, body, config)->GetResponse(); } void THttpRawClient::TrimRows( @@ -706,7 +752,7 @@ void THttpRawClient::TrimRows( header.MergeParameters(NRawClient::SerializeParametersForTrimRows(Context_.Config->Prefix, path, options)); TRequestConfig config; config.IsHeavy = true; - RequestWithoutRetry(Context_, mutationId, header, /*body*/ {}, config); + RequestWithoutRetry(Context_, mutationId, header, /*body*/ {}, config)->GetResponse(); } TNode::TListType THttpRawClient::LookupRows( @@ -737,7 +783,7 @@ TNode::TListType THttpRawClient::LookupRows( TRequestConfig config; config.IsHeavy = true; auto responseInfo = RequestWithoutRetry(Context_, mutationId, header, body, config); - return NodeFromYsonString(responseInfo.Response, ::NYson::EYsonType::ListFragment).AsList(); + return 
NodeFromYsonString(responseInfo->GetResponse(), ::NYson::EYsonType::ListFragment).AsList(); } TNode::TListType THttpRawClient::SelectRows( @@ -769,7 +815,44 @@ TNode::TListType THttpRawClient::SelectRows( TRequestConfig config; config.IsHeavy = true; auto responseInfo = RequestWithoutRetry(Context_, mutationId, header, /*body*/ {}, config); - return NodeFromYsonString(responseInfo.Response, ::NYson::EYsonType::ListFragment).AsList(); + return NodeFromYsonString(responseInfo->GetResponse(), ::NYson::EYsonType::ListFragment).AsList(); +} + +std::unique_ptr<IInputStream> THttpRawClient::ReadTable( + const TTransactionId& transactionId, + const TRichYPath& path, + const TMaybe<TFormat>& format, + const TTableReaderOptions& options) +{ + TMutationId mutationId; + THttpHeader header("GET", GetReadTableCommand(Context_.Config->ApiVersion)); + header.SetOutputFormat(format); + header.SetResponseCompression(ToString(Context_.Config->AcceptEncoding)); + header.MergeParameters(NRawClient::SerializeParamsForReadTable(transactionId, Context_.Config->Prefix, path, options)); + header.MergeParameters(FormIORequestParameters(path, options)); + + TRequestConfig config; + config.IsHeavy = true; + auto responseInfo = RequestWithoutRetry(Context_, mutationId, header, /*body*/ {}, config); + return std::make_unique<NHttpClient::THttpResponseStream>(std::move(responseInfo)); +} + +std::unique_ptr<IInputStream> THttpRawClient::ReadBlobTable( + const TTransactionId& transactionId, + const TRichYPath& path, + const TKey& key, + const TBlobTableReaderOptions& options) +{ + TMutationId mutationId; + THttpHeader header("GET", "read_blob_table"); + header.SetOutputFormat(TMaybe<TFormat>()); // Binary format + header.SetResponseCompression(ToString(Context_.Config->AcceptEncoding)); + header.MergeParameters(NRawClient::SerializeParamsForReadBlobTable(transactionId, path, key, options)); + + TRequestConfig config; + config.IsHeavy = true; + auto responseInfo = RequestWithoutRetry(Context_, 
mutationId, header, /*body*/ {}, config); + return std::make_unique<NHttpClient::THttpResponseStream>(std::move(responseInfo)); } void THttpRawClient::AlterTable( @@ -781,7 +864,7 @@ void THttpRawClient::AlterTable( THttpHeader header("POST", "alter_table"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForAlterTable(transactionId, Context_.Config->Prefix, path, options)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::AlterTableReplica( @@ -792,7 +875,7 @@ void THttpRawClient::AlterTableReplica( THttpHeader header("POST", "alter_table_replica"); header.AddMutationId(); header.MergeParameters(NRawClient::SerializeParamsForAlterTableReplica(replicaId, options)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::DeleteRows( @@ -808,7 +891,7 @@ void THttpRawClient::DeleteRows( auto body = NodeListToYsonString(keys); TRequestConfig config; config.IsHeavy = true; - RequestWithoutRetry(Context_, mutationId, header, body, config); + RequestWithoutRetry(Context_, mutationId, header, body, config)->GetResponse(); } void THttpRawClient::FreezeTable( @@ -818,7 +901,7 @@ void THttpRawClient::FreezeTable( TMutationId mutationId; THttpHeader header("POST", "freeze_table"); header.MergeParameters(NRawClient::SerializeParamsForFreezeTable(Context_.Config->Prefix, path, options)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, header)->GetResponse(); } void THttpRawClient::UnfreezeTable( @@ -828,7 +911,7 @@ void THttpRawClient::UnfreezeTable( TMutationId mutationId; THttpHeader header("POST", "unfreeze_table"); header.MergeParameters(NRawClient::SerializeParamsForUnfreezeTable(Context_.Config->Prefix, path, options)); - RequestWithoutRetry(Context_, mutationId, header); + RequestWithoutRetry(Context_, mutationId, 
header)->GetResponse(); } TCheckPermissionResponse THttpRawClient::CheckPermission( @@ -841,7 +924,7 @@ TCheckPermissionResponse THttpRawClient::CheckPermission( THttpHeader header("GET", "check_permission"); header.MergeParameters(NRawClient::SerializeParamsForCheckPermission(user, permission, Context_.Config->Prefix, path, options)); auto responseInfo = RequestWithoutRetry(Context_, mutationId, header); - return NRawClient::ParseCheckPermissionResponse(NodeFromYsonString(responseInfo.Response)); + return NRawClient::ParseCheckPermissionResponse(NodeFromYsonString(responseInfo->GetResponse())); } TVector<TTabletInfo> THttpRawClient::GetTabletInfos( @@ -854,7 +937,7 @@ TVector<TTabletInfo> THttpRawClient::GetTabletInfos( header.MergeParameters(NRawClient::SerializeParamsForGetTabletInfos(Context_.Config->Prefix, path, tabletIndexes, options)); auto responseInfo = RequestWithoutRetry(Context_, mutationId, header); TVector<TTabletInfo> result; - Deserialize(result, *NodeFromYsonString(responseInfo.Response).AsMap().FindPtr("tablets")); + Deserialize(result, *NodeFromYsonString(responseInfo->GetResponse()).AsMap().FindPtr("tablets")); return result; } @@ -870,7 +953,7 @@ TVector<TTableColumnarStatistics> THttpRawClient::GetTableColumnarStatistics( config.IsHeavy = true; auto responseInfo = RequestWithoutRetry(Context_, mutationId, header, /*body*/ {}, config); TVector<TTableColumnarStatistics> result; - Deserialize(result, NodeFromYsonString(responseInfo.Response)); + Deserialize(result, NodeFromYsonString(responseInfo->GetResponse())); return result; } @@ -886,7 +969,7 @@ TMultiTablePartitions THttpRawClient::GetTablePartitions( config.IsHeavy = true; auto responseInfo = RequestWithoutRetry(Context_, mutationId, header, /*body*/ {}, config); TMultiTablePartitions result; - Deserialize(result, NodeFromYsonString(responseInfo.Response)); + Deserialize(result, NodeFromYsonString(responseInfo->GetResponse())); return result; } @@ -897,7 +980,7 @@ ui64 
THttpRawClient::GenerateTimestamp() TRequestConfig config; config.IsHeavy = true; auto responseInfo = RequestWithoutRetry(Context_, mutationId, header, /*body*/ {}, config); - return NodeFromYsonString(responseInfo.Response).AsUint64(); + return NodeFromYsonString(responseInfo->GetResponse()).AsUint64(); } TAuthorizationInfo THttpRawClient::WhoAmI() @@ -908,7 +991,7 @@ TAuthorizationInfo THttpRawClient::WhoAmI() TAuthorizationInfo result; NJson::TJsonValue jsonValue; - bool ok = NJson::ReadJsonTree(requestResult.Response, &jsonValue, /*throwOnError*/ true); + bool ok = NJson::ReadJsonTree(requestResult->GetResponse(), &jsonValue, /*throwOnError*/ true); Y_ABORT_UNLESS(ok); result.Login = jsonValue["login"].GetString(); result.Realm = jsonValue["realm"].GetString(); diff --git a/yt/cpp/mapreduce/raw_client/raw_client.h b/yt/cpp/mapreduce/raw_client/raw_client.h index 08015f024f..e540d1b331 100644 --- a/yt/cpp/mapreduce/raw_client/raw_client.h +++ b/yt/cpp/mapreduce/raw_client/raw_client.h @@ -202,6 +202,18 @@ public: const TOperationId& operationId, const TGetJobTraceOptions& options = {}) override; + // SkyShare + + NHttpClient::IHttpResponsePtr SkyShareTable( + const std::vector<TYPath>& tablePaths, + const TSkyShareTableOptions& options = {}) override; + + // Files + std::unique_ptr<IInputStream> ReadFile( + const TTransactionId& transactionId, + const TRichYPath& path, + const TFileReaderOptions& options = {}) override; + // File cache TMaybe<TYPath> GetFileFromCache( @@ -266,6 +278,18 @@ public: const TString& query, const TSelectRowsOptions& options = {}) override; + std::unique_ptr<IInputStream> ReadTable( + const TTransactionId& transactionId, + const TRichYPath& path, + const TMaybe<TFormat>& format, + const TTableReaderOptions& options = {}) override; + + std::unique_ptr<IInputStream> ReadBlobTable( + const TTransactionId& transactionId, + const TRichYPath& path, + const TKey& key, + const TBlobTableReaderOptions& options = {}) override; + void AlterTable( 
TMutationId& mutationId, const TTransactionId& transactionId, diff --git a/yt/cpp/mapreduce/raw_client/raw_requests.cpp b/yt/cpp/mapreduce/raw_client/raw_requests.cpp index a3f10e6c41..a3f01da6fc 100644 --- a/yt/cpp/mapreduce/raw_client/raw_requests.cpp +++ b/yt/cpp/mapreduce/raw_client/raw_requests.cpp @@ -301,50 +301,6 @@ TCheckPermissionResponse ParseCheckPermissionResponse(const TNode& node) return result; } -TNode::TListType SkyShareTable( - const IRequestRetryPolicyPtr& retryPolicy, - const TClientContext& context, - const std::vector<TYPath>& tablePaths, - const TSkyShareTableOptions& options) -{ - THttpHeader header("POST", "api/v1/share", /*IsApi*/ false); - - auto proxyName = context.ServerName.substr(0, context.ServerName.find('.')); - - auto host = context.Config->SkynetApiHost; - if (host == "") { - host = "skynet." + proxyName + ".yt.yandex.net"; - } - - TSkyShareTableOptions patchedOptions = options; - - if (context.Config->Pool && !patchedOptions.Pool_) { - patchedOptions.Pool(context.Config->Pool); - } - - header.MergeParameters(NRawClient::SerializeParamsForSkyShareTable(proxyName, context.Config->Prefix, tablePaths, patchedOptions)); - TClientContext skyApiHost({ .ServerName = host, .HttpClient = NHttpClient::CreateDefaultHttpClient() }); - TResponseInfo response = {}; - - // As documented at https://wiki.yandex-team.ru/yt/userdoc/blob_tables/#shag3.sozdajomrazdachu - // first request returns HTTP status code 202 (Accepted). And we need retrying until we have 200 (OK). 
- while (response.HttpCode != 200) { - response = RetryRequestWithPolicy(retryPolicy, skyApiHost, header, ""); - TWaitProxy::Get()->Sleep(TDuration::Seconds(5)); - } - - if (options.KeyColumns_) { - return NodeFromJsonString(response.Response)["torrents"].AsList(); - } else { - TNode torrent; - - torrent["key"] = TNode::CreateList(); - torrent["rbtorrent"] = response.Response; - - return TNode::TListType{ torrent }; - } -} - TRichYPath CanonizeYPath( const IRequestRetryPolicyPtr& retryPolicy, const TClientContext& context, diff --git a/yt/cpp/mapreduce/raw_client/raw_requests.h b/yt/cpp/mapreduce/raw_client/raw_requests.h index c60536c86d..bcc9a4bfd7 100644 --- a/yt/cpp/mapreduce/raw_client/raw_requests.h +++ b/yt/cpp/mapreduce/raw_client/raw_requests.h @@ -29,7 +29,6 @@ TCheckPermissionResponse ParseCheckPermissionResponse(const TNode& node); //////////////////////////////////////////////////////////////////////////////// -// // marks `batchRequest' as executed void ExecuteBatch( IRequestRetryPolicyPtr retryPolicy, @@ -37,14 +36,6 @@ void ExecuteBatch( TRawBatchRequest& batchRequest, const TExecuteBatchOptions& options = {}); -// SkyShare - -TNode::TListType SkyShareTable( - const IRequestRetryPolicyPtr& retryPolicy, - const TClientContext& context, - const std::vector<TYPath>& tablePaths, - const TSkyShareTableOptions& options = {}); - // Misc TRichYPath CanonizeYPath( diff --git a/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.cpp b/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.cpp index 8474bd0edc..2869ddcc0f 100644 --- a/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.cpp +++ b/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.cpp @@ -4,6 +4,7 @@ #include <yt/cpp/mapreduce/interface/config.h> #include <yt/cpp/mapreduce/interface/client_method_options.h> +#include <yt/cpp/mapreduce/interface/fluent.h> #include <yt/cpp/mapreduce/interface/operation.h> #include <yt/cpp/mapreduce/interface/serialize.h> @@ -639,13 +640,60 @@ 
TNode SerializeParametersForDeleteRows( TNode SerializeParametersForTrimRows( const TString& pathPrefix, const TYPath& path, - const TTrimRowsOptions& /* options*/) + const TTrimRowsOptions& /*options*/) { TNode result; SetPathParam(&result, pathPrefix, path); return result; } +TNode SerializeParamsForReadTable( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TRichYPath& path, + const TTableReaderOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + result["control_attributes"] = BuildYsonNodeFluently() + .BeginMap() + .Item("enable_row_index").Value(options.ControlAttributes_.EnableRowIndex_) + .Item("enable_range_index").Value(options.ControlAttributes_.EnableRangeIndex_) + .EndMap(); + return result; +} + +TNode SerializeParamsForReadBlobTable( + const TTransactionId& transactionId, + const TRichYPath& path, + const TKey& key, + const TBlobTableReaderOptions& options) +{ + auto lowerLimitKey = key; + lowerLimitKey.Parts_.push_back(options.StartPartIndex_); + auto upperLimitKey = key; + upperLimitKey.Parts_.push_back(std::numeric_limits<i64>::max()); + + TNode result = PathToParamNode( + TRichYPath(path). 
+ AddRange(TReadRange() + .LowerLimit(TReadLimit().Key(lowerLimitKey)) + .UpperLimit(TReadLimit().Key(upperLimitKey)))); + + SetTransactionIdParam(&result, transactionId); + + result["start_part_index"] = options.StartPartIndex_; + result["offset"] = options.Offset_; + if (options.PartIndexColumnName_) { + result["part_index_column_name"] = *options.PartIndexColumnName_; + } + if (options.DataColumnName_) { + result["data_column_name"] = *options.DataColumnName_; + } + result["part_size"] = options.PartSize_; + return result; +} + TNode SerializeParamsForParseYPath(const TRichYPath& path) { TNode result; diff --git a/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.h b/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.h index 655198248c..acbf003b5c 100644 --- a/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.h +++ b/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.h @@ -146,6 +146,18 @@ TNode SerializeParametersForTrimRows( const TYPath& path, const TTrimRowsOptions& options); +TNode SerializeParamsForReadTable( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TRichYPath& path, + const TTableReaderOptions& options); + +TNode SerializeParamsForReadBlobTable( + const TTransactionId& transactionId, + const TRichYPath& path, + const TKey& key, + const TBlobTableReaderOptions& options); + TNode SerializeParamsForParseYPath( const TRichYPath& path); diff --git a/yt/python/yt/yson/__init__.py b/yt/python/yt/yson/__init__.py index 2d5dad9663..ddaec8dd4c 100644 --- a/yt/python/yt/yson/__init__.py +++ b/yt/python/yt/yson/__init__.py @@ -54,7 +54,7 @@ except ImportError as error: print("Warning! 
Failed to import YSON bindings: " + message, file=_sys.stderr) try: - from yt_yson_bindings import upload_parquet, dump_parquet, dump_orc, upload_orc, async_dump_parquet # noqa + from yt_yson_bindings import upload_parquet, dump_parquet, dump_orc, upload_orc, async_dump_parquet, async_dump_orc # noqa HAS_PARQUET = True except ImportError as error: message = str(error) diff --git a/yt/yql/providers/yt/comp_nodes/dq/dq_yt_block_reader.cpp b/yt/yql/providers/yt/comp_nodes/dq/dq_yt_block_reader.cpp index c8c2b61607..8b3019ffa3 100644 --- a/yt/yql/providers/yt/comp_nodes/dq/dq_yt_block_reader.cpp +++ b/yt/yql/providers/yt/comp_nodes/dq/dq_yt_block_reader.cpp @@ -368,7 +368,7 @@ public: LocalListeners_.reserve(Inputs_.size()); for (size_t i = 0; i < Inputs_.size(); ++i) { auto& decoder = Settings_->Specs->Inputs[Settings_->OriginalIndexes[i]]; - bool native = decoder->NativeYtTypeFlags && !decoder->FieldsVec[i].ExplicitYson; + bool native = decoder->NativeYtTypeFlags; LocalListeners_.emplace_back(std::make_shared<TLocalListener>(Listener_, Settings_->ColumnNameMapping, ptr, types, *Settings_->Pool, Settings_->PgBuilder, native, jobStats)); LocalListeners_.back()->Init(LocalListeners_.back()); } diff --git a/yt/yql/providers/yt/gateway/native/ut/ya.make b/yt/yql/providers/yt/gateway/native/ut/ya.make index 702a53d5dd..23f262c22b 100644 --- a/yt/yql/providers/yt/gateway/native/ut/ya.make +++ b/yt/yql/providers/yt/gateway/native/ut/ya.make @@ -1,5 +1,3 @@ -IF (NOT OPENSOURCE) - UNITTEST() SRCS( @@ -11,7 +9,7 @@ PEERDIR( yt/yql/providers/yt/gateway/file yt/yql/providers/yt/codec/codegen yt/yql/providers/yt/comp_nodes/llvm14 - yql/essentials/core/ut_common + yt/yql/providers/yt/lib/ut_common library/cpp/testing/mock_server library/cpp/testing/common yql/essentials/public/udf/service/terminate_policy @@ -24,5 +22,3 @@ YQL_LAST_ABI_VERSION() END() -ENDIF() - diff --git a/yt/yql/providers/yt/gateway/native/ut/yql_yt_native_folders_ut.cpp 
b/yt/yql/providers/yt/gateway/native/ut/yql_yt_native_folders_ut.cpp new file mode 100644 index 0000000000..b08db52a50 --- /dev/null +++ b/yt/yql/providers/yt/gateway/native/ut/yql_yt_native_folders_ut.cpp @@ -0,0 +1,366 @@ +#include "library/cpp/testing/unittest/registar.h" +#include <library/cpp/yson/node/node_io.h> +#include <yt/yql/providers/yt/lib/ut_common/yql_ut_common.h> +#include <library/cpp/testing/common/network.h> +#include <library/cpp/testing/mock_server/server.h> +#include <yt/yql/providers/yt/gateway/native/yql_yt_native.h> +#include <yql/essentials/core/file_storage/proto/file_storage.pb.h> +#include <yql/essentials/providers/common/proto/gateways_config.pb.h> +#include <yt/yql/providers/yt/provider/yql_yt_provider.h> + +namespace NYql { + +namespace { + +constexpr auto CYPRES_TX_ID = "\"9518f6d4-f0480586-41103e8-ca595920\""; +constexpr auto CYPRES_NODE_A_CONTENT = R"( +[ + { + output = [ + < + "user_attributes" = {}; + "type" = "table"; + > "a"; + < + "user_attributes" = {}; + "type" = "table"; + > "b"; + < + "user_attributes" = {}; + "target_path" = "//link_dest"; + "broken" = %false; + "type" = "link"; + > "link"; + < + "user_attributes" = {}; + "target_path" = "//link_broken_dest"; + "broken" = %true; + "type" = "link"; + > "link_broken"; + < + "user_attributes" = {}; + "target_path" = "//link_access_denied"; + "broken" = %false; + "type" = "link"; + > "link_access_denied"; + ]; + }; +] +)"; + +constexpr auto CYPRES_NODE_W_LINK = R"( +[ + { + output = [ + < + "target_path" = "//link_dest"; + "broken" = %false; + "type" = "link"; + > "link"; + ]; + } +] +)"; + +constexpr auto CYPRES_LINK_DEST = R"( +[ + { + "output" = < + "user_attributes" = {}; + "type" = "table"; + > #; + }; +] +)"; + +constexpr auto CYPRES_ACCESS_ERROR = R"( +[ + { + "error" = { + "code" = 901; + "message" = "Access denied"; + } + } +] +)"; + +constexpr auto CYPRESS_BLACKBOX_ERROR = R"( +[ + { + "error" = { + "code" = 111; + "message" = "Blackbox rejected token"; + } + } +] 
+)"; + +TVector<IYtGateway::TFolderResult::TFolderItem> EXPECTED_ITEMS { + {"test/a/a", "table", R"({"user_attributes"={}})"}, + {"test/a/b", "table", R"({"user_attributes"={}})"}, + {"test/a/link", "table", R"({"user_attributes"={}})"}, + {"test/a/link_access_denied", "unknown", "{}"} +}; + +TGatewaysConfig MakeGatewaysConfig(size_t port) +{ + TGatewaysConfig config {}; + auto* clusters = config.MutableYt()->MutableClusterMapping(); + NYql::TYtClusterConfig cluster; + cluster.SetName("ut_cluster"); + cluster.SetYTName("ut_cluster"); + cluster.SetCluster("localhost:" + ToString(port)); + clusters->Add(std::move(cluster)); + return config; +} + +class TYtReplier : public TRequestReplier { +public: + using THandler = std::function<THttpResponse(TStringBuf path, const NYT::TNode& attributes)>; + + bool DoReply(const TReplyParams& params) override { + const TParsedHttpFull parsed(params.Input.FirstLine()); + Cout << parsed.Path << Endl; + + HttpCodes code = HTTP_NOT_FOUND; + TString content; + if (parsed.Path == "/api/v3/start_tx") { + content = CYPRES_TX_ID; + code = HTTP_OK; + } + else if (parsed.Path == "/api/v3/ping_tx") { + code = HTTP_OK; + } + else if (parsed.Path == "/api/v3/execute_batch") { + auto executeBatchRes = HandleExecuteBatch(params.Input); + executeBatchRes.OutTo(params.Output); + return true; + } + THttpResponse resp(code); + resp.SetContent(content); + resp.OutTo(params.Output); + + return true; + } + explicit TYtReplier(THandler handleListCommand, THandler handleGetCommand, TMaybe<std::function<void(const NYT::TNode& request)>> assertion): + HandleListCommand_(handleListCommand), HandleGetCommand_(handleGetCommand) { + if (assertion) { + Assertion_ = assertion.GetRef(); + } + } + +private: + THttpResponse HandleExecuteBatch(THttpInput& input) { + auto requestBody = input.ReadAll(); + auto requestBodyNode = NYT::NodeFromYsonString(requestBody); + if (!requestBodyNode.HasKey("requests")) { + return THttpResponse{HTTP_INTERNAL_SERVER_ERROR}; + } + 
auto& requests = requestBodyNode["requests"]; + if (!requests.IsList()) { + return THttpResponse{HTTP_INTERNAL_SERVER_ERROR}; + } + for (auto& request : requests.AsList()) { + Assertion_(request); + + const auto& command = request["command"]; + const auto& parameters = request["parameters"]; + const auto& path = parameters["path"].AsString(); + const auto& attributes = parameters.HasKey("attributes") ? parameters["attributes"] : NYT::TNode{}; + if (command == "list") { + return HandleListCommand_(path, attributes); + } + if (command == "get") { + return HandleGetCommand_(path, attributes); + } + } + return THttpResponse{HTTP_NOT_FOUND}; + } + + std::function<void(const NYT::TNode& request)> Assertion_ = [] ([[maybe_unused]] auto _) {}; + THandler HandleListCommand_; + THandler HandleGetCommand_; + +}; + +Y_UNIT_TEST_SUITE(YtNativeGateway) { + +std::pair<TIntrusivePtr<TYtState>, IYtGateway::TPtr> InitTest(const NTesting::TPortHolder& port, TTypeAnnotationContext* types) { + TYtNativeServices nativeServices; + auto gatewaysConfig = MakeGatewaysConfig(port); + nativeServices.Config = std::make_shared<TYtGatewayConfig>(gatewaysConfig.GetYt()); + nativeServices.FileStorage = CreateFileStorage(TFileStorageConfig{}); + + auto ytGateway = CreateYtNativeGateway(nativeServices); + auto ytState = MakeIntrusive<TYtState>(types); + ytState->Gateway = ytGateway; + + InitializeYtGateway(ytGateway, ytState); + return {ytState, ytGateway}; +} + +IYtGateway::TFolderResult GetFolderResult(TYtReplier::THandler handleList, TYtReplier::THandler handleGet, +TMaybe<std::function<void(const NYT::TNode& request)>> gatewayRequestAssertion, std::function<IYtGateway::TFolderOptions(TString)> makeFolderOptions) { + const auto port = NTesting::GetFreePort(); + NMock::TMockServer mockServer{port, + [gatewayRequestAssertion, handleList, handleGet] () {return new TYtReplier(handleList, handleGet, gatewayRequestAssertion);} + }; + + TTypeAnnotationContext types; + auto [ytState, ytGateway] = 
InitTest(port, &types); + + IYtGateway::TFolderOptions folderOptions = makeFolderOptions(ytState->SessionId); + auto folderFuture = ytGateway->GetFolder(std::move(folderOptions)); + + folderFuture.Wait(); + ytState->Gateway->CloseSession({ytState->SessionId}); + auto folderRes = folderFuture.GetValue(); + return folderRes; +} + +Y_UNIT_TEST(GetFolder) { + THashMap<TString, THashSet<TString>> requiredAttributes { + {"//test/a", {"type", "broken", "target_path", "user_attributes"}}, + {"//link_dest", {"type", "user_attributes"}} + }; + const auto checkRequiredAttributes = [&requiredAttributes] (const NYT::TNode& request) { + const auto& parameters = request["parameters"]; + const auto path = parameters["path"].AsString(); + const auto& attributes = parameters.HasKey("attributes") ? parameters["attributes"] : NYT::TNode{}; + + if (!requiredAttributes.contains(path)) { + return; + } + + THashSet<TString> attributesSet; + for (const auto& attribute : attributes.AsList()) { + attributesSet.insert(attribute.AsString()); + } + UNIT_ASSERT_VALUES_EQUAL(requiredAttributes[path], attributesSet); + }; + + const auto handleGet = [] (TStringBuf path, const NYT::TNode& attributes) { + Y_UNUSED(attributes); + THttpResponse resp{HTTP_OK}; + if (path == "//link_dest") { + resp.SetContent(CYPRES_LINK_DEST); + return resp; + } + if (path == "//link_access_denied") { + resp.SetContent(CYPRES_ACCESS_ERROR); + return resp; + } + + return THttpResponse{HTTP_NOT_FOUND}; + }; + + const auto handleList = [] (TStringBuf path, const NYT::TNode& attributes) { + Y_UNUSED(attributes); + THttpResponse resp{HTTP_OK}; + if (path == "//test/a") { + resp.SetContent(CYPRES_NODE_A_CONTENT); + return resp; + } + return THttpResponse{HTTP_NOT_FOUND}; + }; + + const auto makeFolderOptions = [] (const TString& sessionId) { + IYtGateway::TFolderOptions folderOptions{sessionId}; + TYtSettings ytSettings {}; + folderOptions.Cluster("ut_cluster") + .Config(std::make_shared<TYtSettings>(ytSettings)) + 
.Prefix("//test/a") + .Attributes({"user_attributes"}); + return folderOptions; + }; + + auto folderRes + = GetFolderResult(handleList, handleGet, checkRequiredAttributes, makeFolderOptions); + + UNIT_ASSERT_EQUAL_C(folderRes.Success(), true, folderRes.Issues().ToString()); + UNIT_ASSERT_EQUAL( + folderRes.ItemsOrFileLink, + (std::variant<TVector<IYtGateway::TFolderResult::TFolderItem>, TFileLinkPtr>(EXPECTED_ITEMS))); + } + +Y_UNIT_TEST(EmptyResolveIsNotError) { + const auto port = NTesting::GetFreePort(); + + const auto handleList = [] (TStringBuf path, const NYT::TNode& attributes) { + Y_UNUSED(path); + Y_UNUSED(attributes); + + THttpResponse resp{HTTP_OK}; + resp.SetContent(CYPRES_NODE_W_LINK); + return resp; + }; + + const auto handleGet = [] (TStringBuf path, const NYT::TNode& attributes) { + Y_UNUSED(path); + Y_UNUSED(attributes); + + THttpResponse resp{HTTP_OK}; + resp.SetContent(CYPRES_ACCESS_ERROR); + return resp; + }; + + const auto makeFolderOptions = [] (const TString& sessionId) { + IYtGateway::TFolderOptions folderOptions{sessionId}; + TYtSettings ytSettings {}; + folderOptions.Cluster("ut_cluster") + .Config(std::make_shared<TYtSettings>(ytSettings)) + .Prefix("//test/a") + .Attributes({"user_attributes"}); + return folderOptions; + }; + + auto folderRes + = GetFolderResult(handleList, handleGet, Nothing(), makeFolderOptions); + + UNIT_ASSERT_EQUAL_C(folderRes.Success(), true, folderRes.Issues().ToString()); +} + +Y_UNIT_TEST(GetFolderException) { + const auto port = NTesting::GetFreePort(); + + const auto handleList = [] (TStringBuf path, const NYT::TNode& attributes) { + Y_UNUSED(path); + Y_UNUSED(attributes); + + THttpResponse resp{HTTP_UNAUTHORIZED}; + auto header = R"({"code":900,"message":"Authentication failed"})"; + resp.AddHeader(THttpInputHeader("X-YT-Error", header)); + resp.SetContent(CYPRESS_BLACKBOX_ERROR); + return resp; + }; + + const auto handleGet = [] (TStringBuf path, const NYT::TNode& attributes) { + Y_UNUSED(path); + 
Y_UNUSED(attributes); + + THttpResponse resp{HTTP_OK}; + resp.SetContent(""); + return resp; + }; + + const auto makeFolderOptions = [] (const TString& sessionId) { + IYtGateway::TFolderOptions folderOptions{sessionId}; + TYtSettings ytSettings {}; + folderOptions.Cluster("ut_cluster") + .Config(std::make_shared<TYtSettings>(ytSettings)) + .Prefix("//test/a") + .Attributes({"user_attributes"}); + return folderOptions; + }; + + const auto folderRes + = GetFolderResult(handleList, handleGet, Nothing(), makeFolderOptions); + + UNIT_ASSERT(!folderRes.Issues().Empty()); + UNIT_ASSERT_STRING_CONTAINS(folderRes.Issues().ToString(), "Authentication failed"); +} +} + +} // namespace + +} // namespace NYql diff --git a/yt/yql/providers/yt/lib/ut_common/ya.make b/yt/yql/providers/yt/lib/ut_common/ya.make new file mode 100644 index 0000000000..4084a3d770 --- /dev/null +++ b/yt/yql/providers/yt/lib/ut_common/ya.make @@ -0,0 +1,16 @@ +LIBRARY() + +SRCS( + yql_ut_common.cpp + yql_ut_common.h +) + +PEERDIR( + yql/essentials/core + yql/essentials/core/expr_nodes +) + +YQL_LAST_ABI_VERSION() + +END() + diff --git a/yt/yql/providers/yt/lib/ut_common/yql_ut_common.cpp b/yt/yql/providers/yt/lib/ut_common/yql_ut_common.cpp new file mode 100644 index 0000000000..cef3f2723c --- /dev/null +++ b/yt/yql/providers/yt/lib/ut_common/yql_ut_common.cpp @@ -0,0 +1,55 @@ +#include "yql_ut_common.h" + +#include <library/cpp/random_provider/random_provider.h> +#include <library/cpp/time_provider/time_provider.h> + +#include <util/generic/guid.h> +#include <util/system/user.h> +#include <util/stream/file.h> + +namespace NYql { + +TTestTablesMapping::TTestTablesMapping() + : TmpInput() + , TmpInputAttr(TmpInput.Name() + ".attr") + , TmpOutput() + , TmpOutputAttr(TmpOutput.Name() + ".attr") +{ + { + TUnbufferedFileOutput tmpInput(TmpInput); + tmpInput << "{\"key\"=\"\";\"subkey\"=\"\";\"value\"=\"\"}" << Endl; + TUnbufferedFileOutput tmpInputAttr(TmpInputAttr); + tmpInputAttr << "{\"_yql_row_spec\" = 
{\"Type\" = [\"StructType\";[" + << "[\"key\";[\"DataType\";\"String\"]];" + << "[\"subkey\";[\"DataType\";\"String\"]];" + << "[\"value\";[\"DataType\";\"String\"]]" + << "]]}}" << Endl; + } + insert(std::make_pair("yt.plato.Input", TmpInput.Name())); + + { + TUnbufferedFileOutput tmpOutput(TmpOutput); + tmpOutput << "{\"key\"=\"\";\"subkey\"=\"\";\"value\"=\"\"}" << Endl; + TUnbufferedFileOutput tmpOutputAttr(TmpOutputAttr); + tmpOutputAttr << "{\"_yql_row_spec\" = {\"Type\" = [\"StructType\";[" + << "[\"key\";[\"DataType\";\"String\"]];" + << "[\"subkey\";[\"DataType\";\"String\"]];" + << "[\"value\";[\"DataType\";\"String\"]]" + << "]]}}" << Endl; + } + insert(std::make_pair("yt.plato.Output", TmpOutput.Name())); +} + +void InitializeYtGateway(IYtGateway::TPtr gateway, TYtState::TPtr ytState) { + ytState->SessionId = CreateGuidAsString(); + gateway->OpenSession( + IYtGateway::TOpenSessionOptions(ytState->SessionId) + .UserName(GetUsername()) + .ProgressWriter(&NullProgressWriter) + .OperationOptions(TYqlOperationOptions()) + .RandomProvider(CreateDeterministicRandomProvider(1)) + .TimeProvider(CreateDeterministicTimeProvider(10000000)) + ); +} + +} diff --git a/yt/yql/providers/yt/lib/ut_common/yql_ut_common.h b/yt/yql/providers/yt/lib/ut_common/yql_ut_common.h new file mode 100644 index 0000000000..ddee02690a --- /dev/null +++ b/yt/yql/providers/yt/lib/ut_common/yql_ut_common.h @@ -0,0 +1,23 @@ +#pragma once + +#include <yql/essentials/core/yql_expr_type_annotation.h> + +#include <yt/yql/providers/yt/gateway/file/yql_yt_file.h> +#include <yt/yql/providers/yt/provider/yql_yt_provider.h> + +#include <util/system/tempfile.h> + +namespace NYql { + +struct TTestTablesMapping: public THashMap<TString, TString> { + TTempFileHandle TmpInput; + TTempFileHandle TmpInputAttr; + TTempFileHandle TmpOutput; + TTempFileHandle TmpOutputAttr; + + TTestTablesMapping(); +}; + +void InitializeYtGateway(IYtGateway::TPtr gateway, TYtState::TPtr ytState); + +} diff --git 
a/yt/yql/providers/yt/provider/ut/ya.make b/yt/yql/providers/yt/provider/ut/ya.make index 3b29f30999..888bfe2d25 100644 --- a/yt/yql/providers/yt/provider/ut/ya.make +++ b/yt/yql/providers/yt/provider/ut/ya.make @@ -17,7 +17,7 @@ PEERDIR( yt/yql/providers/yt/gateway/file yt/yql/providers/yt/codec/codegen yt/yql/providers/yt/comp_nodes/llvm14 - yql/essentials/core/ut_common + yt/yql/providers/yt/lib/ut_common yql/essentials/ast yql/essentials/public/udf/service/terminate_policy yql/essentials/core/services @@ -38,3 +38,4 @@ YQL_LAST_ABI_VERSION() END() ENDIF() + diff --git a/yt/yt/client/arrow/arrow_row_stream_encoder.cpp b/yt/yt/client/arrow/arrow_row_stream_encoder.cpp index 25a403790c..1d266f3c71 100644 --- a/yt/yt/client/arrow/arrow_row_stream_encoder.cpp +++ b/yt/yt/client/arrow/arrow_row_stream_encoder.cpp @@ -1,8 +1,5 @@ #include "arrow_row_stream_encoder.h" -#include <yt/yt/client/arrow/fbs/Message.fbs.h> -#include <yt/yt/client/arrow/fbs/Schema.fbs.h> - #include <yt/yt/client/api/rpc_proxy/row_stream.h> #include <yt/yt/client/api/rpc_proxy/wire_row_stream.h> @@ -14,6 +11,8 @@ #include <yt/yt/client/table_client/schema.h> #include <yt/yt/client/table_client/columnar.h> +#include <yt/yt/library/formats/format.h> + #include <yt/yt/core/misc/error.h> #include <yt/yt/core/misc/range.h> @@ -32,651 +31,6 @@ static constexpr auto& Logger = ArrowLogger; namespace { -using TBatchColumn = IUnversionedColumnarRowBatch::TColumn; -using TBodyWriter = std::function<void(TMutableRef)>; - -constexpr i64 ArrowAlignment = 8; - -flatbuffers::Offset<flatbuffers::String> SerializeString( - flatbuffers::FlatBufferBuilder* flatbufBuilder, - const std::string& str) -{ - return flatbufBuilder->CreateString(str.data(), str.length()); -} - -std::tuple<org::apache::arrow::flatbuf::Type, flatbuffers::Offset<void>> SerializeColumnType( - flatbuffers::FlatBufferBuilder* flatbufBuilder, - const TColumnSchema& schema) -{ - auto simpleType = CastToV1Type(schema.LogicalType()).first; - switch 
(simpleType) { - case ESimpleLogicalValueType::Null: - case ESimpleLogicalValueType::Void: - return std::tuple( - org::apache::arrow::flatbuf::Type_Null, - org::apache::arrow::flatbuf::CreateNull(*flatbufBuilder) - .Union()); - - case ESimpleLogicalValueType::Int64: - case ESimpleLogicalValueType::Uint64: - case ESimpleLogicalValueType::Int8: - case ESimpleLogicalValueType::Uint8: - case ESimpleLogicalValueType::Int16: - case ESimpleLogicalValueType::Uint16: - case ESimpleLogicalValueType::Int32: - case ESimpleLogicalValueType::Uint32: - return std::tuple( - org::apache::arrow::flatbuf::Type_Int, - org::apache::arrow::flatbuf::CreateInt( - *flatbufBuilder, - GetIntegralTypeBitWidth(simpleType), - IsIntegralTypeSigned(simpleType)).Union()); - - case ESimpleLogicalValueType::Double: - return std::tuple( - org::apache::arrow::flatbuf::Type_FloatingPoint, - org::apache::arrow::flatbuf::CreateFloatingPoint( - *flatbufBuilder, - org::apache::arrow::flatbuf::Precision_DOUBLE) - .Union()); - - case ESimpleLogicalValueType::Float: - return std::tuple( - org::apache::arrow::flatbuf::Type_FloatingPoint, - org::apache::arrow::flatbuf::CreateFloatingPoint( - *flatbufBuilder, - org::apache::arrow::flatbuf::Precision_SINGLE) - .Union()); - - case ESimpleLogicalValueType::Boolean: - return std::tuple( - org::apache::arrow::flatbuf::Type_Bool, - org::apache::arrow::flatbuf::CreateBool(*flatbufBuilder) - .Union()); - - case ESimpleLogicalValueType::String: - case ESimpleLogicalValueType::Any: - return std::tuple( - org::apache::arrow::flatbuf::Type_Binary, - org::apache::arrow::flatbuf::CreateBinary(*flatbufBuilder) - .Union()); - - case ESimpleLogicalValueType::Utf8: - return std::tuple( - org::apache::arrow::flatbuf::Type_Utf8, - org::apache::arrow::flatbuf::CreateUtf8(*flatbufBuilder) - .Union()); - - // TODO(babenko): the following types are not supported: - // Date - // Datetime - // Interval - // Timestamp - - default: - THROW_ERROR_EXCEPTION("Column %v has type %Qlv that is 
not currently supported by Arrow encoder", - schema.GetDiagnosticNameString(), - simpleType); - } -} - -bool IsRleButNotDictionaryEncodedStringLikeColumn(const TBatchColumn& column) -{ - auto simpleType = CastToV1Type(column.Type).first; - return - IsStringLikeType(simpleType) && - column.Rle && - !column.Rle->ValueColumn->Dictionary; -} - -bool IsRleAndDictionaryEncodedColumn(const TBatchColumn& column) -{ - return - column.Rle && - column.Rle->ValueColumn->Dictionary; -} - -bool IsDictionaryEncodedColumn(const TBatchColumn& column) -{ - return - column.Dictionary || - IsRleAndDictionaryEncodedColumn(column) || - IsRleButNotDictionaryEncodedStringLikeColumn(column); -} - -struct TTypedBatchColumn -{ - const TBatchColumn* Column; - TLogicalTypePtr Type; -}; - -struct TRecordBatchBodyPart -{ - i64 Size; - TBodyWriter Writer; -}; - -struct TRecordBatchSerializationContext final -{ - explicit TRecordBatchSerializationContext(flatbuffers::FlatBufferBuilder* flatbufBuilder) - : FlatbufBuilder(flatbufBuilder) - { } - - void AddFieldNode(i64 length, i64 nullCount) - { - FieldNodes.emplace_back(length, nullCount); - } - - void AddBuffer(i64 size, TBodyWriter writer) - { - YT_LOG_DEBUG("Buffer registered (Offset: %v, Size: %v)", - CurrentBodyOffset, - size); - - Buffers.emplace_back(CurrentBodyOffset, size); - CurrentBodyOffset += AlignUp<i64>(size, ArrowAlignment); - Parts.push_back(TRecordBatchBodyPart{size, std::move(writer)}); - } - - flatbuffers::FlatBufferBuilder* const FlatbufBuilder; - - i64 CurrentBodyOffset = 0; - std::vector<org::apache::arrow::flatbuf::FieldNode> FieldNodes; - std::vector<org::apache::arrow::flatbuf::Buffer> Buffers; - std::vector<TRecordBatchBodyPart> Parts; -}; - -template <class T> -TMutableRange<T> GetTypedValues(TMutableRef ref) -{ - return TMutableRange( - reinterpret_cast<T*>(ref.Begin()), - reinterpret_cast<T*>(ref.End())); -} - -void SerializeColumnPrologue( - const TTypedBatchColumn& typedColumn, - TRecordBatchSerializationContext* 
context) -{ - const auto* column = typedColumn.Column; - if (column->NullBitmap || - column->Rle && column->Rle->ValueColumn->NullBitmap) - { - if (column->Rle) { - const auto* valueColumn = column->Rle->ValueColumn; - auto rleIndexes = column->GetTypedValues<ui64>(); - - context->AddFieldNode( - column->ValueCount, - CountOnesInRleBitmap( - valueColumn->NullBitmap->Data, - rleIndexes, - column->StartIndex, - column->StartIndex + column->ValueCount)); - - context->AddBuffer( - GetBitmapByteSize(column->ValueCount), - [=] (TMutableRef dstRef) { - BuildValidityBitmapFromRleNullBitmap( - valueColumn->NullBitmap->Data, - rleIndexes, - column->StartIndex, - column->StartIndex + column->ValueCount, - dstRef); - }); - } else { - context->AddFieldNode( - column->ValueCount, - CountOnesInBitmap( - column->NullBitmap->Data, - column->StartIndex, - column->StartIndex + column->ValueCount)); - - context->AddBuffer( - GetBitmapByteSize(column->ValueCount), - [=] (TMutableRef dstRef) { - CopyBitmapRangeToBitmapNegated( - column->NullBitmap->Data, - column->StartIndex, - column->StartIndex + column->ValueCount, - dstRef); - }); - } - } else { - context->AddFieldNode( - column->ValueCount, - 0); - - context->AddBuffer( - 0, - [=] (TMutableRef /*dstRef*/) { }); - } -} - -void SerializeRleButNotDictionaryEncodedStringLikeColumn( - const TTypedBatchColumn& typedColumn, - TRecordBatchSerializationContext* context) -{ - const auto* column = typedColumn.Column; - YT_VERIFY(column->Values); - YT_VERIFY(column->Values->BitWidth == 64); - YT_VERIFY(column->Values->BaseValue == 0); - YT_VERIFY(!column->Values->ZigZagEncoded); - - YT_LOG_DEBUG("Adding RLE but not dictionary-encoded string-like column (ColumnId: %v, StartIndex: %v, ValueCount: %v)", - column->Id, - column->StartIndex, - column->ValueCount); - - SerializeColumnPrologue(typedColumn, context); - - auto rleIndexes = column->GetTypedValues<ui64>(); - - context->AddBuffer( - sizeof (ui32) * column->ValueCount, - [=] (TMutableRef 
dstRef) { - BuildIotaDictionaryIndexesFromRleIndexes( - rleIndexes, - column->StartIndex, - column->StartIndex + column->ValueCount, - GetTypedValues<ui32>(dstRef)); - }); -} - -void SerializeDictionaryColumn( - const TTypedBatchColumn& typedColumn, - TRecordBatchSerializationContext* context) -{ - const auto* column = typedColumn.Column; - YT_VERIFY(column->Values); - YT_VERIFY(column->Dictionary->ZeroMeansNull); - YT_VERIFY(column->Values->BitWidth == 32); - YT_VERIFY(column->Values->BaseValue == 0); - YT_VERIFY(!column->Values->ZigZagEncoded); - - YT_LOG_DEBUG("Adding dictionary column (ColumnId: %v, StartIndex: %v, ValueCount: %v, Rle: %v)", - column->Id, - column->StartIndex, - column->ValueCount, - column->Rle.has_value()); - - auto relevantDictionaryIndexes = column->GetRelevantTypedValues<ui32>(); - - context->AddFieldNode( - column->ValueCount, - CountNullsInDictionaryIndexesWithZeroNull(relevantDictionaryIndexes)); - - context->AddBuffer( - GetBitmapByteSize(column->ValueCount), - [=] (TMutableRef dstRef) { - BuildValidityBitmapFromDictionaryIndexesWithZeroNull( - relevantDictionaryIndexes, - dstRef); - }); - - context->AddBuffer( - sizeof (ui32) * column->ValueCount, - [=] (TMutableRef dstRef) { - BuildDictionaryIndexesFromDictionaryIndexesWithZeroNull( - relevantDictionaryIndexes, - GetTypedValues<ui32>(dstRef)); - }); -} - -void SerializeRleDictionaryColumn( - const TTypedBatchColumn& typedColumn, - TRecordBatchSerializationContext* context) -{ - const auto* column = typedColumn.Column; - YT_VERIFY(column->Values); - YT_VERIFY(column->Values->BitWidth == 64); - YT_VERIFY(column->Values->BaseValue == 0); - YT_VERIFY(!column->Values->ZigZagEncoded); - YT_VERIFY(column->Rle->ValueColumn->Dictionary->ZeroMeansNull); - YT_VERIFY(column->Rle->ValueColumn->Values->BitWidth == 32); - YT_VERIFY(column->Rle->ValueColumn->Values->BaseValue == 0); - YT_VERIFY(!column->Rle->ValueColumn->Values->ZigZagEncoded); - - YT_LOG_DEBUG("Adding dictionary column (ColumnId: 
%v, StartIndex: %v, ValueCount: %v, Rle: %v)", - column->Id, - column->StartIndex, - column->ValueCount, - column->Rle.has_value()); - - auto dictionaryIndexes = column->Rle->ValueColumn->GetTypedValues<ui32>(); - auto rleIndexes = column->GetTypedValues<ui64>(); - - context->AddFieldNode( - column->ValueCount, - CountNullsInRleDictionaryIndexesWithZeroNull( - dictionaryIndexes, - rleIndexes, - column->StartIndex, - column->StartIndex + column->ValueCount)); - - context->AddBuffer( - GetBitmapByteSize(column->ValueCount), - [=] (TMutableRef dstRef) { - BuildValidityBitmapFromRleDictionaryIndexesWithZeroNull( - dictionaryIndexes, - rleIndexes, - column->StartIndex, - column->StartIndex + column->ValueCount, - dstRef); - }); - - context->AddBuffer( - sizeof (ui32) * column->ValueCount, - [=] (TMutableRef dstRef) { - BuildDictionaryIndexesFromRleDictionaryIndexesWithZeroNull( - dictionaryIndexes, - rleIndexes, - column->StartIndex, - column->StartIndex + column->ValueCount, - GetTypedValues<ui32>(dstRef)); - }); -} - -void SerializeIntegerColumn( - const TTypedBatchColumn& typedColumn, - ESimpleLogicalValueType simpleType, - TRecordBatchSerializationContext* context) -{ - const auto* column = typedColumn.Column; - YT_VERIFY(column->Values); - - YT_LOG_DEBUG("Adding integer column (ColumnId: %v, StartIndex: %v, ValueCount: %v, Rle: %v)", - column->Id, - column->StartIndex, - column->ValueCount, - column->Rle.has_value()); - - SerializeColumnPrologue(typedColumn, context); - - context->AddBuffer( - column->ValueCount * GetIntegralTypeByteSize(simpleType), - [=] (TMutableRef dstRef) { - const auto* valueColumn = column->Rle - ? column->Rle->ValueColumn - : column; - auto values = valueColumn->GetTypedValues<ui64>(); - - auto rleIndexes = column->Rle - ? 
column->GetTypedValues<ui64>() - : TRange<ui64>(); - - auto startIndex = column->StartIndex; - - switch (simpleType) { - #define XX(cppType, ytType) \ - case ESimpleLogicalValueType::ytType: { \ - auto dstValues = GetTypedValues<cppType>(dstRef); \ - auto* currentOutput = dstValues.Begin(); \ - DecodeIntegerVector( \ - startIndex, \ - startIndex + column->ValueCount, \ - valueColumn->Values->BaseValue, \ - valueColumn->Values->ZigZagEncoded, \ - TRange<ui32>(), \ - rleIndexes, \ - [&] (auto index) { \ - return values[index]; \ - }, \ - [&] (auto value) { \ - *currentOutput++ = value; \ - }); \ - break; \ - } - - XX( i8, Int8) - XX( i16, Int16) - XX( i32, Int32) - XX( i64, Int64) - XX( ui8, Uint8) - XX(ui16, Uint16) - XX(ui32, Uint32) - XX(ui64, Uint64) - - #undef XX - - default: - THROW_ERROR_EXCEPTION("Integer column %v has unexpected type %Qlv", - typedColumn.Column->Id, - simpleType); - } - }); -} - -void SerializeDoubleColumn( - const TTypedBatchColumn& typedColumn, - TRecordBatchSerializationContext* context) -{ - const auto* column = typedColumn.Column; - YT_VERIFY(column->Values); - YT_VERIFY(column->Values->BitWidth == 64); - YT_VERIFY(column->Values->BaseValue == 0); - YT_VERIFY(!column->Values->ZigZagEncoded); - - YT_LOG_DEBUG( - "Adding double column (ColumnId: %v, StartIndex: %v, ValueCount: %v, Rle: %v)", - column->Id, - column->StartIndex, - column->ValueCount, - column->Rle.has_value()); - - SerializeColumnPrologue(typedColumn, context); - - context->AddBuffer( - column->ValueCount * sizeof(double), - [=] (TMutableRef dstRef) { - auto relevantValues = column->GetRelevantTypedValues<double>(); - ::memcpy( - dstRef.Begin(), - relevantValues.Begin(), - column->ValueCount * sizeof(double)); - }); -} - -void SerializeFloatColumn( - const TTypedBatchColumn& typedColumn, - TRecordBatchSerializationContext* context) -{ - const auto* column = typedColumn.Column; - YT_VERIFY(column->Values); - YT_VERIFY(column->Values->BitWidth == 32); - 
YT_VERIFY(column->Values->BaseValue == 0); - YT_VERIFY(!column->Values->ZigZagEncoded); - - YT_LOG_DEBUG( - "Adding float column (ColumnId: %v, StartIndex: %v, ValueCount: %v, Rle: %v)", - column->Id, - column->StartIndex, - column->ValueCount, - column->Rle.has_value()); - - SerializeColumnPrologue(typedColumn, context); - - context->AddBuffer( - column->ValueCount * sizeof(float), - [=] (TMutableRef dstRef) { - auto relevantValues = column->GetRelevantTypedValues<float>(); - ::memcpy( - dstRef.Begin(), - relevantValues.Begin(), - column->ValueCount * sizeof(float)); - }); -} - -void SerializeStringLikeColumn( - const TTypedBatchColumn& typedColumn, - TRecordBatchSerializationContext* context) -{ - const auto* column = typedColumn.Column; - YT_VERIFY(column->Values); - YT_VERIFY(column->Values->BaseValue == 0); - YT_VERIFY(column->Values->BitWidth == 32); - YT_VERIFY(column->Values->ZigZagEncoded); - YT_VERIFY(column->Strings); - YT_VERIFY(column->Strings->AvgLength); - YT_VERIFY(!column->Rle); - - auto startIndex = column->StartIndex; - auto endIndex = startIndex + column->ValueCount; - auto stringData = column->Strings->Data; - auto avgLength = *column->Strings->AvgLength; - - auto offsets = column->GetTypedValues<ui32>(); - auto startOffset = DecodeStringOffset(offsets, avgLength, startIndex); - auto endOffset = DecodeStringOffset(offsets, avgLength, endIndex); - auto stringsSize = endOffset - startOffset; - - YT_LOG_DEBUG("Adding string-like column (ColumnId: %v, StartIndex: %v, ValueCount: %v, StartOffset: %v, EndOffset: %v, StringsSize: %v)", - column->Id, - column->StartIndex, - column->ValueCount, - startOffset, - endOffset, - stringsSize); - - SerializeColumnPrologue(typedColumn, context); - - context->AddBuffer( - sizeof(i32) * (column->ValueCount + 1), - [=] (TMutableRef dstRef) { - DecodeStringOffsets( - offsets, - avgLength, - startIndex, - endIndex, - GetTypedValues<ui32>(dstRef)); - }); - - context->AddBuffer( - stringsSize, - [=] (TMutableRef 
dstRef) { - ::memcpy( - dstRef.Begin(), - stringData.Begin() + startOffset, - stringsSize); - }); -} - -void SerializeBooleanColumn( - const TTypedBatchColumn& typedColumn, - TRecordBatchSerializationContext* context) -{ - const auto* column = typedColumn.Column; - YT_VERIFY(column->Values); - YT_VERIFY(!column->Values->ZigZagEncoded); - YT_VERIFY(column->Values->BaseValue == 0); - YT_VERIFY(column->Values->BitWidth == 1); - - YT_LOG_DEBUG("Adding boolean column (ColumnId: %v, StartIndex: %v, ValueCount: %v)", - column->Id, - column->StartIndex, - column->ValueCount); - - SerializeColumnPrologue(typedColumn, context); - - context->AddBuffer( - GetBitmapByteSize(column->ValueCount), - [=] (TMutableRef dstRef) { - CopyBitmapRangeToBitmap( - column->Values->Data, - column->StartIndex, - column->StartIndex + column->ValueCount, - dstRef); - }); -} - -void SerializeNullColumn( - const TTypedBatchColumn& typedColumn, - TRecordBatchSerializationContext* context) -{ - SerializeColumnPrologue(typedColumn, context); -} - -void SerializeColumn( - const TTypedBatchColumn& typedColumn, - TRecordBatchSerializationContext* context) -{ - const auto* column = typedColumn.Column; - - if (IsRleButNotDictionaryEncodedStringLikeColumn(*typedColumn.Column)) { - SerializeRleButNotDictionaryEncodedStringLikeColumn(typedColumn, context); - return; - } - - if (column->Dictionary) { - SerializeDictionaryColumn(typedColumn, context); - return; - } - - if (column->Rle && column->Rle->ValueColumn->Dictionary) { - SerializeRleDictionaryColumn(typedColumn, context); - return; - } - - auto simpleType = CastToV1Type(typedColumn.Type).first; - if (IsIntegralType(simpleType)) { - SerializeIntegerColumn(typedColumn, simpleType, context); - } else if (simpleType == ESimpleLogicalValueType::Double) { - SerializeDoubleColumn(typedColumn, context); - } else if (simpleType == ESimpleLogicalValueType::Float) { - SerializeFloatColumn(typedColumn, context); - } else if (IsStringLikeType(simpleType)) { - 
SerializeStringLikeColumn(typedColumn, context); - } else if (simpleType == ESimpleLogicalValueType::Boolean) { - SerializeBooleanColumn(typedColumn, context); - } else if (simpleType == ESimpleLogicalValueType::Null) { - SerializeNullColumn(typedColumn, context); - } else if (simpleType == ESimpleLogicalValueType::Void) { - SerializeNullColumn(typedColumn, context); - } else { - THROW_ERROR_EXCEPTION("Column %v has unexpected type %Qlv", - typedColumn.Column->Id, - simpleType); - } -} - -auto SerializeRecordBatch( - flatbuffers::FlatBufferBuilder* flatbufBuilder, - int length, - TRange<TTypedBatchColumn> typedColumns) -{ - auto context = New<TRecordBatchSerializationContext>(flatbufBuilder); - - for (const auto& typedColumn : typedColumns) { - SerializeColumn(typedColumn, context.Get()); - } - - auto fieldNodesOffset = flatbufBuilder->CreateVectorOfStructs(context->FieldNodes); - - auto buffersOffset = flatbufBuilder->CreateVectorOfStructs(context->Buffers); - - auto recordBatchOffset = org::apache::arrow::flatbuf::CreateRecordBatch( - *flatbufBuilder, - length, - fieldNodesOffset, - buffersOffset); - - auto totalSize = context->CurrentBodyOffset; - - return std::tuple( - recordBatchOffset, - totalSize, - [context = std::move(context)] (TMutableRef dstRef) { - char* current = dstRef.Begin(); - for (const auto& part : context->Parts) { - part.Writer(TMutableRef(current, current + part.Size)); - current += AlignUp<i64>(part.Size, ArrowAlignment); - } - YT_VERIFY(current == dstRef.End()); - }); -} - -//////////////////////////////////////////////////////////////////////////////// - DECLARE_REFCOUNTED_CLASS(TArrowRowStreamEncoder) class TArrowRowStreamEncoder @@ -685,460 +39,41 @@ class TArrowRowStreamEncoder public: TArrowRowStreamEncoder( TTableSchemaPtr schema, + std::optional<std::vector<std::string>> columns, TNameTablePtr nameTable, IRowStreamEncoderPtr fallbackEncoder, NFormats::TControlAttributesConfigPtr controlAttributesConfig) : Schema_(std::move(schema)) + 
, Columns_(std::move(columns)) , NameTable_(std::move(nameTable)) , FallbackEncoder_(std::move(fallbackEncoder)) , ControlAttributesConfig_(controlAttributesConfig) + , OutputStream_(Data_) + , AsyncOutputStream_(NConcurrency::CreateAsyncAdapter(&OutputStream_)) { - if (ControlAttributesConfig_->EnableRowIndex) { - RowIndexId_ = NameTable_->GetIdOrRegisterName(RowIndexColumnName); - } - - if (ControlAttributesConfig_->EnableRangeIndex) { - RangeIndexId_ = NameTable_->GetIdOrRegisterName(RangeIndexColumnName); - } - - if (ControlAttributesConfig_->EnableTableIndex) { - TableIndexId_ = NameTable_->GetIdOrRegisterName(TableIndexColumnName); - } - - if (ControlAttributesConfig_->EnableTabletIndex) { - TabletIndexId_ = NameTable_->GetIdOrRegisterName(TabletIndexColumnName); - } - YT_LOG_DEBUG("Row stream encoder created (Schema: %v)", *Schema_); } - const TTableSchemaPtr& GetSchema() - { - return Schema_; - } - - const TNameTablePtr& GetNameTable() - { - return NameTable_; - } - - bool IsFirstBatch() - { - return FirstBatch_; - } - - std::vector<IUnversionedColumnarRowBatch::TDictionaryId>& ArrowDictionaryIds() - { - return ArrowDictionaryIds_; - } - - bool IsTableIndexColumnId(int id) const - { - return id == TableIndexId_; - } - - bool IsRowIndexColumnId(int id) const - { - return id == RowIndexId_; - } - - bool IsRangeIndexColumnId(int id) const - { - return id == RangeIndexId_; - } - - bool IsTabletIndexColumnId(int id) const - { - return id == TabletIndexId_; - } - - bool IsSystemColumnId(int id) const - { - return IsTableIndexColumnId(id) || - IsRangeIndexColumnId(id) || - IsRowIndexColumnId(id) || - IsTabletIndexColumnId(id); - } - - bool IsSystemColumnEnable(int columnIndex) - { - return ControlAttributesConfig_->EnableTableIndex && IsTableIndexColumnId(columnIndex) || - ControlAttributesConfig_->EnableRangeIndex && IsRangeIndexColumnId(columnIndex) || - ControlAttributesConfig_->EnableRowIndex && IsRowIndexColumnId(columnIndex) || - 
ControlAttributesConfig_->EnableTabletIndex && IsTabletIndexColumnId(columnIndex); - } - TSharedRef Encode( const IUnversionedRowBatchPtr& batch, const NApi::NRpcProxy::NProto::TRowsetStatistics* statistics) override; private: const TTableSchemaPtr Schema_; + const std::optional<std::vector<std::string>> Columns_; const TNameTablePtr NameTable_; const IRowStreamEncoderPtr FallbackEncoder_; const NFormats::TControlAttributesConfigPtr ControlAttributesConfig_; - int RowIndexId_ = -1; - int RangeIndexId_ = -1; - int TableIndexId_ = -1; - int TabletIndexId_ = -1; - - bool FirstBatch_ = true; - std::vector<IUnversionedColumnarRowBatch::TDictionaryId> ArrowDictionaryIds_; + NFormats::ISchemalessFormatWriterPtr Writer_ = nullptr; + TString Data_; + TStringOutput OutputStream_; + NConcurrency::IFlushableAsyncOutputStreamPtr AsyncOutputStream_; }; DEFINE_REFCOUNTED_TYPE(TArrowRowStreamEncoder) -//////////////////////////////////////////////////////////////////////////////// - -class TArrowRowStreamBlockEncoder -{ -public: - TArrowRowStreamBlockEncoder( - TArrowRowStreamEncoderPtr streamEncoder, - IUnversionedColumnarRowBatchPtr batch) - : StreamEncoder_(std::move(streamEncoder)) - , Batch_(std::move(batch)) - { - PrepareColumns(); - if (IsSchemaMessageNeeded()) { - if (!StreamEncoder_->IsFirstBatch()) { - RegisterEosMarker(); - } - ResetArrowDictionaries(); - PrepareSchema(); - } - PrepareDictionaryBatches(); - PrepareRecordBatch(); - } - - i64 GetPayloadSize() const - { - i64 size = 0; - for (const auto& message : Messages_) { - size += sizeof (ui32); // continuation indicator - size += sizeof (ui32); // metadata size - if (message.FlatbufBuilder) { - size += AlignUp<i64>(message.FlatbufBuilder->GetSize(), ArrowAlignment); // metadata message - size += AlignUp<i64>(message.BodySize, ArrowAlignment); // body - } - } - return size; - } - - void WritePayload(TMutableRef payloadRef) - { - YT_LOG_DEBUG("Started writing payload (Size: %v)", - payloadRef.Size()); - char* current 
= payloadRef.Begin(); - for (const auto& message : Messages_) { - // Continuation indicator - *reinterpret_cast<ui32*>(current) = 0xFFFFFFFF; - current += sizeof(ui32); - - if (message.FlatbufBuilder) { - auto metadataSize = message.FlatbufBuilder->GetSize(); - auto* metadataPtr = message.FlatbufBuilder->GetBufferPointer(); - - // Metadata size - *reinterpret_cast<ui32*>(current) = AlignUp<i64>(metadataSize, ArrowAlignment); - current += sizeof(ui32); - - // Metadata message - ::memcpy(current, metadataPtr, metadataSize); - current += AlignUp<i64>(metadataSize, ArrowAlignment); - - // Body - if (message.BodyWriter) { - message.BodyWriter(TMutableRef(current, current + message.BodySize)); - current += AlignUp<i64>(message.BodySize, ArrowAlignment); - } else { - YT_VERIFY(message.BodySize == 0); - } - } else { - // EOS marker - *reinterpret_cast<ui32*>(current) = 0; - current += sizeof(ui32); - } - } - YT_VERIFY(current == payloadRef.End()); - YT_LOG_DEBUG("Finished writing payload"); - } - -private: - const TArrowRowStreamEncoderPtr StreamEncoder_; - const IUnversionedColumnarRowBatchPtr Batch_; - - std::vector<TTypedBatchColumn> TypedColumns_; - - struct TMessage - { - std::optional<flatbuffers::FlatBufferBuilder> FlatbufBuilder; - i64 BodySize; - TBodyWriter BodyWriter; - }; - - std::vector<TMessage> Messages_; - - void RegisterEosMarker() - { - YT_LOG_DEBUG("EOS marker registered"); - - Messages_.push_back(TMessage{ - std::nullopt, - 0, - TBodyWriter() - }); - } - - void RegisterMessage( - org::apache::arrow::flatbuf::MessageHeader type, - flatbuffers::FlatBufferBuilder&& flatbufBuilder, - i64 bodySize = 0, - std::function<void(TMutableRef)> bodyWriter = nullptr) - { - YT_LOG_DEBUG("Message registered (Type: %v, MessageSize: %v, BodySize: %v)", - org::apache::arrow::flatbuf::EnumNamesMessageHeader()[type], - flatbufBuilder.GetSize(), - bodySize); - - YT_VERIFY((bodySize % ArrowAlignment) == 0); - Messages_.push_back(TMessage{ - std::move(flatbufBuilder), - 
bodySize, - std::move(bodyWriter) - }); - } - - std::optional<TColumnSchema> FindColumnSchema(const TBatchColumn& column) - { - YT_VERIFY(column.Id >= 0); - auto name = StreamEncoder_->GetNameTable()->GetName(column.Id); - auto columnSchemaPtr = StreamEncoder_->GetSchema()->FindColumn(name); - if (!columnSchemaPtr) { - if (StreamEncoder_->IsSystemColumnId(column.Id) && StreamEncoder_->IsSystemColumnEnable(column.Id)) { - return TColumnSchema(TString(name), EValueType::Int64); - } - return std::nullopt; - } - return *columnSchemaPtr; - } - - void PrepareColumns() - { - auto batchColumns = Batch_->MaterializeColumns(); - TypedColumns_.reserve(batchColumns.Size()); - for (const auto* column : batchColumns) { - // Ignoring null schema column and not enabled system columns. - if (auto columnSchema = FindColumnSchema(*column)) { - TypedColumns_.push_back(TTypedBatchColumn{ - column, - columnSchema->LogicalType() - }); - } - } - } - - bool IsSchemaMessageNeeded() - { - if (StreamEncoder_->IsFirstBatch()) { - return true; - } - - YT_VERIFY(StreamEncoder_->ArrowDictionaryIds().size() == TypedColumns_.size()); - - bool result = StreamEncoder_->IsFirstBatch(); - for (int index = 0; index < std::ssize(TypedColumns_); ++index) { - bool currentDictionary = IsDictionaryEncodedColumn(*TypedColumns_[index].Column); - bool previousDictionary = StreamEncoder_->ArrowDictionaryIds()[index] != IUnversionedColumnarRowBatch::NullDictionaryId; - if (currentDictionary != previousDictionary) { - result = true; - } - } - return result; - } - - void ResetArrowDictionaries() - { - StreamEncoder_->ArrowDictionaryIds().assign(TypedColumns_.size(), IUnversionedColumnarRowBatch::NullDictionaryId); - } - - - void PrepareSchema() - { - flatbuffers::FlatBufferBuilder flatbufBuilder; - - int arrowDictionaryIdCounter = 0; - std::vector<flatbuffers::Offset<org::apache::arrow::flatbuf::Field>> fieldOffsets; - fieldOffsets.reserve(TypedColumns_.size()); - - for (const auto& typedColumn : TypedColumns_) { - 
auto optionalColumnSchema = FindColumnSchema(*typedColumn.Column); - YT_VERIFY(optionalColumnSchema != std::nullopt); - auto columnSchema = *optionalColumnSchema; - - auto nameOffset = SerializeString(&flatbufBuilder, columnSchema.Name()); - - auto [typeType, typeOffset] = SerializeColumnType(&flatbufBuilder, columnSchema); - - flatbuffers::Offset<org::apache::arrow::flatbuf::DictionaryEncoding> dictionaryEncodingOffset; - - auto indexTypeOffset = org::apache::arrow::flatbuf::CreateInt(flatbufBuilder, 32, false); - - if (IsDictionaryEncodedColumn(*typedColumn.Column)) { - dictionaryEncodingOffset = org::apache::arrow::flatbuf::CreateDictionaryEncoding( - flatbufBuilder, - arrowDictionaryIdCounter++, - indexTypeOffset); - } - - auto fieldOffset = org::apache::arrow::flatbuf::CreateField( - flatbufBuilder, - nameOffset, - columnSchema.LogicalType()->IsNullable(), - typeType, - typeOffset, - dictionaryEncodingOffset); - - fieldOffsets.push_back(fieldOffset); - } - - auto fieldsOffset = flatbufBuilder.CreateVector(fieldOffsets); - - auto schemaOffset = org::apache::arrow::flatbuf::CreateSchema( - flatbufBuilder, - org::apache::arrow::flatbuf::Endianness_Little, - fieldsOffset); - - auto messageOffset = org::apache::arrow::flatbuf::CreateMessage( - flatbufBuilder, - org::apache::arrow::flatbuf::MetadataVersion_V4, - org::apache::arrow::flatbuf::MessageHeader_Schema, - schemaOffset.Union(), - 0); - - flatbufBuilder.Finish(messageOffset); - - RegisterMessage( - org::apache::arrow::flatbuf::MessageHeader_Schema, - std::move(flatbufBuilder)); - } - - void PrepareDictionaryBatches() - { - int arrowDictionaryIdCounter = 0; - auto prepareDictionaryBatch = [&] ( - int columnIndex, - IUnversionedColumnarRowBatch::TDictionaryId ytDictionaryId, - const TBatchColumn* dictionaryColumn) - { - int arrowDictionaryId = arrowDictionaryIdCounter++; - const auto& typedColumn = TypedColumns_[columnIndex]; - auto previousYTDictionaryId = StreamEncoder_->ArrowDictionaryIds()[columnIndex]; - 
if (ytDictionaryId == previousYTDictionaryId) { - YT_LOG_DEBUG("Reusing previous dictionary (ColumnId: %v, YTDictionaryId: %v, ArrowDictionaryId: %v)", - typedColumn.Column->Id, - ytDictionaryId, - arrowDictionaryId); - } else { - YT_LOG_DEBUG("Sending new dictionary (ColumnId: %v, YTDictionaryId: %v, ArrowDictionaryId: %v)", - typedColumn.Column->Id, - ytDictionaryId, - arrowDictionaryId); - PrepareDictionaryBatch( - TTypedBatchColumn{dictionaryColumn, typedColumn.Type}, - arrowDictionaryId); - StreamEncoder_->ArrowDictionaryIds()[columnIndex] = ytDictionaryId; - } - }; - - for (int columnIndex = 0; columnIndex < std::ssize(TypedColumns_); ++columnIndex) { - const auto& typedColumn = TypedColumns_[columnIndex]; - if (typedColumn.Column->Dictionary) { - YT_LOG_DEBUG("Adding dictionary batch for dictionary-encoded column (ColumnId: %v)", - typedColumn.Column->Id); - prepareDictionaryBatch( - columnIndex, - typedColumn.Column->Dictionary->DictionaryId, - typedColumn.Column->Dictionary->ValueColumn); - } else if (IsRleButNotDictionaryEncodedStringLikeColumn(*typedColumn.Column)) { - YT_LOG_DEBUG("Adding dictionary batch for RLE but not dictionary-encoded string-like column (ColumnId: %v)", - typedColumn.Column->Id); - prepareDictionaryBatch( - columnIndex, - IUnversionedColumnarRowBatch::GenerateDictionaryId(), // any unique one will do - typedColumn.Column->Rle->ValueColumn); - } else if (IsRleAndDictionaryEncodedColumn(*typedColumn.Column)) { - YT_LOG_DEBUG("Adding dictionary batch for RLE and dictionary-encoded column (ColumnId: %v)", - typedColumn.Column->Id); - prepareDictionaryBatch( - columnIndex, - typedColumn.Column->Rle->ValueColumn->Dictionary->DictionaryId, - typedColumn.Column->Rle->ValueColumn->Dictionary->ValueColumn); - } - } - } - - void PrepareDictionaryBatch( - const TTypedBatchColumn& typedColumn, - int arrowDictionaryId) - { - flatbuffers::FlatBufferBuilder flatbufBuilder; - - auto [recordBatchOffset, bodySize, bodyWriter] = SerializeRecordBatch( 
- &flatbufBuilder, - typedColumn.Column->ValueCount, - TRange({typedColumn})); - - auto dictionaryBatchOffset = org::apache::arrow::flatbuf::CreateDictionaryBatch( - flatbufBuilder, - arrowDictionaryId, - recordBatchOffset); - - auto messageOffset = org::apache::arrow::flatbuf::CreateMessage( - flatbufBuilder, - org::apache::arrow::flatbuf::MetadataVersion_V4, - org::apache::arrow::flatbuf::MessageHeader_DictionaryBatch, - dictionaryBatchOffset.Union(), - bodySize); - - flatbufBuilder.Finish(messageOffset); - - RegisterMessage( - org::apache::arrow::flatbuf::MessageHeader_DictionaryBatch, - std::move(flatbufBuilder), - bodySize, - std::move(bodyWriter)); - } - - void PrepareRecordBatch() - { - flatbuffers::FlatBufferBuilder flatbufBuilder; - - auto [recordBatchOffset, bodySize, bodyWriter] = SerializeRecordBatch( - &flatbufBuilder, - Batch_->GetRowCount(), - TypedColumns_); - - auto messageOffset = org::apache::arrow::flatbuf::CreateMessage( - flatbufBuilder, - org::apache::arrow::flatbuf::MetadataVersion_V4, - org::apache::arrow::flatbuf::MessageHeader_RecordBatch, - recordBatchOffset.Union(), - bodySize); - - flatbufBuilder.Finish(messageOffset); - - RegisterMessage( - org::apache::arrow::flatbuf::MessageHeader_RecordBatch, - std::move(flatbufBuilder), - bodySize, - std::move(bodyWriter)); - } -}; - -//////////////////////////////////////////////////////////////////////////////// TSharedRef TArrowRowStreamEncoder::Encode( const IUnversionedRowBatchPtr& batch, @@ -1157,16 +92,31 @@ TSharedRef TArrowRowStreamEncoder::Encode( descriptor.set_rowset_kind(NApi::NRpcProxy::NProto::RK_UNVERSIONED); descriptor.set_rowset_format(NApi::NRpcProxy::NProto::RF_ARROW); - TArrowRowStreamBlockEncoder blockEncoder(this, std::move(columnarBatch)); + if (!Writer_) { + // The writer is created lazily to avoid unnecessary errors in the constructor when using fallbackEncoder + Writer_ = CreateStaticTableWriterForFormat( + NFormats::EFormatType::Arrow, + NameTable_, + {Schema_}, + 
{Columns_}, + AsyncOutputStream_, + /*enableContextSaving*/ false, + ControlAttributesConfig_, + /*keyColumnCount*/ 0); + } + Data_.clear(); + Writer_->WriteBatch(batch); + NConcurrency::WaitFor(Writer_->Flush()) + .ThrowOnError(); + + auto rowRefs = TSharedRef::FromString(Data_); auto [block, payloadRef] = SerializeRowStreamBlockEnvelope( - blockEncoder.GetPayloadSize(), + rowRefs.Size(), descriptor, statistics); - blockEncoder.WritePayload(payloadRef); - - FirstBatch_ = false; + MergeRefsToRef(std::vector<TSharedRef>{rowRefs}, payloadRef); return block; } @@ -1177,12 +127,14 @@ TSharedRef TArrowRowStreamEncoder::Encode( IRowStreamEncoderPtr CreateArrowRowStreamEncoder( TTableSchemaPtr schema, + std::optional<std::vector<std::string>> columns, TNameTablePtr nameTable, IRowStreamEncoderPtr fallbackEncoder, NFormats::TControlAttributesConfigPtr controlAttributesConfig) { return New<TArrowRowStreamEncoder>( std::move(schema), + std::move(columns), std::move(nameTable), std::move(fallbackEncoder), std::move(controlAttributesConfig)); diff --git a/yt/yt/client/arrow/arrow_row_stream_encoder.h b/yt/yt/client/arrow/arrow_row_stream_encoder.h index 792b647d18..be20a949c4 100644 --- a/yt/yt/client/arrow/arrow_row_stream_encoder.h +++ b/yt/yt/client/arrow/arrow_row_stream_encoder.h @@ -14,6 +14,7 @@ namespace NYT::NArrow { NApi::NRpcProxy::IRowStreamEncoderPtr CreateArrowRowStreamEncoder( NTableClient::TTableSchemaPtr schema, + std::optional<std::vector<std::string>> columns, NTableClient::TNameTablePtr nameTable, NApi::NRpcProxy::IRowStreamEncoderPtr fallbackEncoder, NFormats::TControlAttributesConfigPtr controlAttributesConfig); diff --git a/yt/yt/client/arrow/ya.make b/yt/yt/client/arrow/ya.make index 40d27d8e07..097479ffdf 100644 --- a/yt/yt/client/arrow/ya.make +++ b/yt/yt/client/arrow/ya.make @@ -10,7 +10,7 @@ SRCS( PEERDIR( yt/yt/client - yt/yt/client/arrow/fbs + yt/yt/library/formats ) END() diff --git a/yt/yt/client/driver/proxy_discovery_cache.cpp 
b/yt/yt/client/driver/proxy_discovery_cache.cpp index a17894a93e..a9612bb359 100644 --- a/yt/yt/client/driver/proxy_discovery_cache.cpp +++ b/yt/yt/client/driver/proxy_discovery_cache.cpp @@ -70,6 +70,8 @@ public: private: const IClientPtr Client_; + const NLogging::TLogger Logger = DriverLogger(); + TFuture<TProxyDiscoveryResponse> DoGet( const TProxyDiscoveryRequest& request, bool /*isPeriodicUpdate*/) noexcept override @@ -93,7 +95,13 @@ private: options.ReadFrom = EMasterChannelKind::LocalCache; options.Attributes = {BalancersAttributeName}; - auto path = GetProxyRegistryPath(request.Type) + "/@"; + TYPath path; + try { + path = GetProxyRegistryPath(request.Type) + "/@"; + } catch (const std::exception& ex) { + YT_LOG_ERROR(ex, "Failed to get proxy registry path"); + return MakeFuture<std::optional<TProxyDiscoveryResponse>>(ex); + } return Client_->GetNode(path, options).Apply( BIND([=] (const TYsonString& yson) -> std::optional<TProxyDiscoveryResponse> { auto attributes = ConvertTo<IMapNodePtr>(yson); @@ -120,7 +128,13 @@ private: options.SuppressTransactionCoordinatorSync = true; options.Attributes = {BannedAttributeName, RoleAttributeName, AddressesAttributeName}; - auto path = GetProxyRegistryPath(request.Type); + TYPath path; + try { + path = GetProxyRegistryPath(request.Type); + } catch (const std::exception& ex) { + YT_LOG_ERROR(ex, "Failed to get proxy registry path"); + return MakeFuture<TProxyDiscoveryResponse>(ex); + } return Client_->GetNode(path, options).Apply(BIND([=] (const TYsonString& yson) { TProxyDiscoveryResponse response; diff --git a/yt/yt/client/signature/public.h b/yt/yt/client/signature/public.h new file mode 100644 index 0000000000..012d19cfda --- /dev/null +++ b/yt/yt/client/signature/public.h @@ -0,0 +1,11 @@ +#include <library/cpp/yt/memory/ref_counted.h> + +namespace NYT::NSignature { + +/////////////////////////////////////////////////////////////////////////////// + +DECLARE_REFCOUNTED_CLASS(TSignature) + 
+/////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NSignature diff --git a/yt/yt/client/signature/signature.cpp b/yt/yt/client/signature/signature.cpp new file mode 100644 index 0000000000..4c071c1407 --- /dev/null +++ b/yt/yt/client/signature/signature.cpp @@ -0,0 +1,64 @@ +#include "signature.h" + +#include <yt/yt/core/yson/consumer.h> + +#include <yt/yt/core/ytree/fluent.h> +#include <yt/yt/core/ytree/convert.h> + +namespace NYT::NSignature { + +//////////////////////////////////////////////////////////////////////////////// + +using namespace NYson; +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +TSignature::TSignature(NYson::TYsonString payload) + : Payload_(std::move(payload)) +{ } + +//////////////////////////////////////////////////////////////////////////////// + +const TYsonString& TSignature::Payload() const +{ + return Payload_; +} + +//////////////////////////////////////////////////////////////////////////////// + +void Serialize(const TSignature& signature, IYsonConsumer* consumer) +{ + consumer->OnBeginMap(); + BuildYsonMapFragmentFluently(consumer) + .Item("header").Value(signature.Header_.ToString()) + .Item("payload").Value(signature.Payload_.ToString()) + .Item("signature").Value(TString( + reinterpret_cast<const char*>(signature.Signature_.data()), + signature.Signature_.size())); + consumer->OnEndMap(); +} + +//////////////////////////////////////////////////////////////////////////////// + +void Deserialize(TSignature& signature, INodePtr node) +{ + auto mapNode = node->AsMap(); + signature.Header_ = TYsonString(mapNode->GetChildValueOrThrow<TString>("header")); + signature.Payload_ = TYsonString(mapNode->GetChildValueOrThrow<TString>("payload")); + + auto signatureString = mapNode->GetChildValueOrThrow<TString>("signature"); + auto signatureBytes = std::as_bytes(std::span(TStringBuf(signatureString))); + 
signature.Signature_.resize(signatureBytes.size()); + + std::copy(signatureBytes.begin(), signatureBytes.end(), signature.Signature_.begin()); +} + +void Deserialize(TSignature& signature, TYsonPullParserCursor* cursor) +{ + Deserialize(signature, ExtractTo<INodePtr>(cursor)); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NSignature diff --git a/yt/yt/client/signature/signature.h b/yt/yt/client/signature/signature.h new file mode 100644 index 0000000000..f6602994ce --- /dev/null +++ b/yt/yt/client/signature/signature.h @@ -0,0 +1,45 @@ +#pragma once + +#include "public.h" + +#include <yt/yt/core/yson/string.h> + +#include <yt/yt/core/ytree/public.h> + +#include <vector> + +namespace NYT::NSignature { + +//////////////////////////////////////////////////////////////////////////////// + +class TSignature final +{ +public: + // NB(pavook) only needed for Deserialize internals. + + //! Constructs an empty TSignature. + TSignature() = default; + + //! Creates a TSignature containing the given payload without an actual signature. 
+ explicit TSignature(NYson::TYsonString payload); + + [[nodiscard]] const NYson::TYsonString& Payload() const; + +private: + NYson::TYsonString Header_; + NYson::TYsonString Payload_; + std::vector<std::byte> Signature_; + + friend class TSignatureGenerator; + friend class TSignatureValidator; + + friend void Serialize(const TSignature& signature, NYson::IYsonConsumer* consumer); + friend void Deserialize(TSignature& signature, NYTree::INodePtr node); + friend void Deserialize(TSignature& signature, NYson::TYsonPullParserCursor* cursor); +}; + +DEFINE_REFCOUNTED_TYPE(TSignature) + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NSignature diff --git a/yt/yt/client/signature/unittests/signature_ut.cpp b/yt/yt/client/signature/unittests/signature_ut.cpp new file mode 100644 index 0000000000..75ae1733fa --- /dev/null +++ b/yt/yt/client/signature/unittests/signature_ut.cpp @@ -0,0 +1,54 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/client/signature/signature.h> + +#include <yt/yt/core/yson/string.h> + +#include <yt/yt/core/ytree/convert.h> + +namespace NYT::NSignature { +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +using namespace NYson; +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TSignatureTest, PayloadConstruct) +{ + TSignature signature(TYsonString("payload"_sb)); + EXPECT_EQ(signature.Payload().ToString(), "payload"); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TSignatureTest, DeserializeSerialize) +{ + // SignatureSize bytes. 
+ TYsonString ysonOK(R"({"header"="header";"payload"="payload";"signature"="signature";})"_sb); + + TSignaturePtr signature; + EXPECT_NO_THROW(signature = ConvertTo<TSignaturePtr>(ysonOK)); + EXPECT_EQ(signature->Payload().ToString(), "payload"); + + EXPECT_EQ(ConvertToYsonString(signature, EYsonFormat::Text).ToString(), ysonOK.ToString()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TSignatureTest, DeserializeFail) +{ + { + TYsonString ysonFail( + R"({"header"="header";"buddy"="payload";"signature"="abacaba";})"_sb + ); + EXPECT_THROW_WITH_SUBSTRING(ConvertTo<TSignaturePtr>(ysonFail), "no child with key \"payload\""); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NSignature diff --git a/yt/yt/library/oom/unittests/ya.make b/yt/yt/client/signature/unittests/ya.make index 23392352b9..b7f6fd5fe5 100644 --- a/yt/yt/library/oom/unittests/ya.make +++ b/yt/yt/client/signature/unittests/ya.make @@ -1,17 +1,18 @@ -GTEST() +GTEST(unittester-client-signature) INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) -ALLOCATOR(TCMALLOC) - SRCS( - oom_ut.cpp + signature_ut.cpp ) INCLUDE(${ARCADIA_ROOT}/yt/opensource.inc) PEERDIR( - yt/yt/library/oom + yt/yt/core/test_framework + yt/yt/client ) +SIZE(SMALL) + END() diff --git a/yt/yt/client/ya.make b/yt/yt/client/ya.make index 2b384fe818..9d54b9ab4c 100644 --- a/yt/yt/client/ya.make +++ b/yt/yt/client/ya.make @@ -101,6 +101,8 @@ SRCS( security_client/public.cpp security_client/helpers.cpp + signature/signature.cpp + table_client/public.cpp table_client/adapters.cpp table_client/table_output.cpp @@ -237,6 +239,7 @@ RECURSE( RECURSE_FOR_TESTS( api/unittests + signature/unittests table_client/unittests unittests ) diff --git a/yt/yt/core/bus/tcp/config.h b/yt/yt/core/bus/tcp/config.h index 22d1bd6cd7..914035e2d9 100644 --- a/yt/yt/core/bus/tcp/config.h +++ b/yt/yt/core/bus/tcp/config.h @@ -47,11 +47,11 @@ 
public: TEnumIndexedArray<EMultiplexingBand, TMultiplexingBandConfigPtr> MultiplexingBands; - TTcpDispatcherConfigPtr ApplyDynamic(const TTcpDispatcherDynamicConfigPtr& dynamicConfig) const; - //! Used to store TLS/SSL certificate files. std::optional<TString> BusCertsDirectoryPath; + TTcpDispatcherConfigPtr ApplyDynamic(const TTcpDispatcherDynamicConfigPtr& dynamicConfig) const; + REGISTER_YSON_STRUCT(TTcpDispatcherConfig); static void Register(TRegistrar registrar); @@ -78,6 +78,8 @@ public: //! Used to store TLS/SSL certificate files. std::optional<TString> BusCertsDirectoryPath; + static void Setup(auto&& registrar); + REGISTER_YSON_STRUCT(TTcpDispatcherDynamicConfig); static void Register(TRegistrar registrar); diff --git a/yt/yt/core/bus/tcp/configure_dispatcher.cpp b/yt/yt/core/bus/tcp/configure_dispatcher.cpp new file mode 100644 index 0000000000..75fdd662a5 --- /dev/null +++ b/yt/yt/core/bus/tcp/configure_dispatcher.cpp @@ -0,0 +1,41 @@ +#include "dispatcher.h" +#include "config.h" + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT::NBus { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TTcpDispatcherConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void SetupSingletonConfigParameter(TYsonStructParameter<TTcpDispatcherDynamicConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TTcpDispatcherConfigPtr& config) +{ + NBus::TTcpDispatcher::Get()->Configure(config); +} + +void ReconfigureSingleton( + const TTcpDispatcherConfigPtr& config, + const TTcpDispatcherDynamicConfigPtr& dynamicConfig) +{ + TTcpDispatcher::Get()->Configure(config->ApplyDynamic(dynamicConfig)); +} + +YT_DEFINE_RECONFIGURABLE_SINGLETON( + "tcp_dispatcher", + TTcpDispatcherConfig, + TTcpDispatcherDynamicConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // 
namespace NYT::NBus diff --git a/yt/yt/core/bus/tcp/dispatcher.cpp b/yt/yt/core/bus/tcp/dispatcher.cpp index 045f3d231d..4b5c7b64f2 100644 --- a/yt/yt/core/bus/tcp/dispatcher.cpp +++ b/yt/yt/core/bus/tcp/dispatcher.cpp @@ -1,4 +1,5 @@ #include "dispatcher.h" + #include "dispatcher_impl.h" #include <yt/yt/core/bus/private.h> diff --git a/yt/yt/core/bus/tcp/public.h b/yt/yt/core/bus/tcp/public.h index 5e7c54af91..0c86109c1a 100644 --- a/yt/yt/core/bus/tcp/public.h +++ b/yt/yt/core/bus/tcp/public.h @@ -1,5 +1,7 @@ #pragma once +#include <yt/yt/core/misc/configurable_singleton_decl.h> + #include <yt/yt/core/bus/public.h> namespace NYT::NBus { @@ -18,6 +20,8 @@ DECLARE_REFCOUNTED_CLASS(TBusClientConfig) struct IPacketTranscoderFactory; +YT_DECLARE_RECONFIGURABLE_SINGLETON(TTcpDispatcherConfig, TTcpDispatcherDynamicConfig); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NBus diff --git a/yt/yt/core/concurrency/configure_fiber_manager.cpp b/yt/yt/core/concurrency/configure_fiber_manager.cpp new file mode 100644 index 0000000000..36168664fc --- /dev/null +++ b/yt/yt/core/concurrency/configure_fiber_manager.cpp @@ -0,0 +1,41 @@ +#include "fiber_manager.h" +#include "config.h" + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT::NConcurrency { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TFiberManagerConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void SetupSingletonConfigParameter(TYsonStructParameter<TFiberManagerDynamicConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TFiberManagerConfigPtr& config) +{ + TFiberManager::Configure(config); +} + +void ReconfigureSingleton( + const TFiberManagerConfigPtr& config, + const TFiberManagerDynamicConfigPtr& dynamicConfig) +{ + 
TFiberManager::Configure(config->ApplyDynamic(dynamicConfig)); +} + +YT_DEFINE_RECONFIGURABLE_SINGLETON( + "fiber_manager", + TFiberManagerConfig, + TFiberManagerDynamicConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NConcurrency diff --git a/yt/yt/core/concurrency/coroutine.h b/yt/yt/core/concurrency/coroutine.h index 7b395585e9..5a4e849908 100644 --- a/yt/yt/core/concurrency/coroutine.h +++ b/yt/yt/core/concurrency/coroutine.h @@ -5,6 +5,8 @@ #include <yt/yt/core/actions/callback.h> +#include <library/cpp/yt/misc/concepts.h> + #include <util/system/context.h> #include <optional> diff --git a/yt/yt/core/concurrency/execution_stack.h b/yt/yt/core/concurrency/execution_stack.h index 99038d3cd3..fe02810788 100644 --- a/yt/yt/core/concurrency/execution_stack.h +++ b/yt/yt/core/concurrency/execution_stack.h @@ -2,6 +2,10 @@ #include "public.h" +#if defined(_win_) +#include <windows.h> +#endif + namespace NYT::NConcurrency { //////////////////////////////////////////////////////////////////////////////// @@ -66,7 +70,6 @@ private: friend TExecutionContext CreateExecutionContext( TExecutionStack* stack, void (*trampoline)(void*)); - }; #else diff --git a/yt/yt/core/concurrency/new_fair_share_thread_pool.cpp b/yt/yt/core/concurrency/new_fair_share_thread_pool.cpp index a0e9fd4e8e..14541740eb 100644 --- a/yt/yt/core/concurrency/new_fair_share_thread_pool.cpp +++ b/yt/yt/core/concurrency/new_fair_share_thread_pool.cpp @@ -9,6 +9,7 @@ #include <yt/yt/core/actions/current_invoker.h> #include <yt/yt/core/misc/finally.h> +#include <yt/yt/core/misc/hazard_ptr.h> #include <yt/yt/core/misc/heap.h> #include <yt/yt/core/misc/ring_queue.h> #include <yt/yt/core/misc/mpsc_stack.h> @@ -714,7 +715,9 @@ public: while (true) { auto cookie = GetEventCount()->PrepareWait(); - auto hasAction = ThreadStates_[index].Action.BucketHolder; + auto& threadState = ThreadStates_[index]; + + auto hasAction = 
threadState.Action.BucketHolder; int activeThreadDelta = hasAction ? -1 : 0; auto callback = DoOnExecute(index, fetchNext); @@ -738,6 +741,7 @@ public: } YT_VERIFY(fetchNext); + MaybeRunMaintenance(&threadState, GetCpuInstant(), /*flush*/ true); Wait(cookie, isStopping); } } @@ -806,6 +810,7 @@ private: int LastActionsInQueue; TDuration TimeFromStart; TDuration TimeFromEnqueue; + TCpuInstant LastMaintenanceInstant = {}; }; static_assert(sizeof(TThreadState) >= CacheLineSize); @@ -1189,6 +1194,8 @@ private: ReportWaitTime(waitTime); } + MaybeRunMaintenance(&threadState, action.StartedAt, /*flush*/ false); + CumulativeSchedulingTimeCounter_.Add(CpuDurationToDuration(GetCpuInstant() - cpuInstant)); if (!fetchNext) { @@ -1240,6 +1247,17 @@ private: WaitTimeObserver_(waitTime); } } + + static void MaybeRunMaintenance(TThreadState* threadState, TCpuInstant now, bool flush) + { + YT_ASSERT(threadState); + + constexpr i64 MaintenancePeriod = 1'000'000'000; + if (flush || now > threadState->LastMaintenanceInstant + MaintenancePeriod) { + ReclaimHazardPointers(false); + threadState->LastMaintenanceInstant = now; + } + } }; DEFINE_REFCOUNTED_TYPE(TTwoLevelFairShareQueue) diff --git a/yt/yt/core/concurrency/public.h b/yt/yt/core/concurrency/public.h index b7634c0730..e25d455dfa 100644 --- a/yt/yt/core/concurrency/public.h +++ b/yt/yt/core/concurrency/public.h @@ -1,6 +1,9 @@ #pragma once #include <yt/yt/core/misc/public.h> +#include <yt/yt/core/misc/configurable_singleton_decl.h> + +#include <library/cpp/yt/misc/enum.h> namespace NYT::NConcurrency { @@ -127,6 +130,8 @@ DECLARE_REFCOUNTED_STRUCT(ICallbackProvider) class TPropagatingStorage; +YT_DECLARE_RECONFIGURABLE_SINGLETON(TFiberManagerConfig, TFiberManagerDynamicConfig); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NConcurrency diff --git a/yt/yt/core/logging/configure_log_manager.cpp b/yt/yt/core/logging/configure_log_manager.cpp new file mode 100644 index 
0000000000..4ace9c5d5b --- /dev/null +++ b/yt/yt/core/logging/configure_log_manager.cpp @@ -0,0 +1,51 @@ +#include "log_manager.h" +#include "config.h" + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT::NLogging { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TLogManagerConfigPtr>& parameter) +{ + parameter + .DefaultCtor([] { return NLogging::TLogManagerConfig::CreateDefault(); }) + .ResetOnLoad(); +} + +void SetupSingletonConfigParameter(TYsonStructParameter<TLogManagerDynamicConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TLogManagerConfigPtr& config) +{ + if (!NLogging::TLogManager::Get()->IsConfiguredFromEnv()) { + TLogManager::Get()->Configure( + config, + /*sync*/ true); + } +} + +void ReconfigureSingleton( + const TLogManagerConfigPtr& config, + const TLogManagerDynamicConfigPtr& dynamicConfig) +{ + if (!NLogging::TLogManager::Get()->IsConfiguredFromEnv()) { + NLogging::TLogManager::Get()->Configure( + config->ApplyDynamic(dynamicConfig), + /*sync*/ false); + } +} + +YT_DEFINE_RECONFIGURABLE_SINGLETON( + "logging", + TLogManagerConfig, + TLogManagerDynamicConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NLogging diff --git a/yt/yt/core/logging/public.h b/yt/yt/core/logging/public.h index 2ede67840c..cd9b4d1568 100644 --- a/yt/yt/core/logging/public.h +++ b/yt/yt/core/logging/public.h @@ -1,9 +1,12 @@ #pragma once #include <yt/yt/core/misc/public.h> +#include <yt/yt/core/misc/configurable_singleton_decl.h> #include <library/cpp/yt/logging/public.h> +#include <library/cpp/yt/misc/enum.h> + namespace NYT::NLogging { //////////////////////////////////////////////////////////////////////////////// @@ -45,6 +48,8 @@ DECLARE_REFCOUNTED_STRUCT(IFileLogWriter) DECLARE_REFCOUNTED_STRUCT(IStreamLogOutput) 
DECLARE_REFCOUNTED_STRUCT(ILogCompressionCodec) +YT_DECLARE_RECONFIGURABLE_SINGLETON(TLogManagerConfig, TLogManagerDynamicConfig); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NLogging diff --git a/yt/yt/core/misc/configurable_singleton_decl-inl.h b/yt/yt/core/misc/configurable_singleton_decl-inl.h new file mode 100644 index 0000000000..75342d0715 --- /dev/null +++ b/yt/yt/core/misc/configurable_singleton_decl-inl.h @@ -0,0 +1,33 @@ +#ifndef CONFIGURABLE_SINGLETON_DECL_INL_H_ +#error "Direct inclusion of this file is not allowed, include configurable_singleton_decl.h" +// For the sake of sane code completion. +#include "configurable_singleton_decl.h" +#endif + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +template <class TConfig, bool Static> +struct TSingletonConfigTag +{ }; + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +#undef YT_DECLARE_CONFIGURABLE_SINGLETON +#undef YT_DECLARE_RECONFIGURABLE_SINGLETON + +#define YT_DECLARE_CONFIGURABLE_SINGLETON(configType) \ + void CheckSingletonConfigRegistered(::NYT::NDetail::TSingletonConfigTag<configType, true>) \ + +#define YT_DECLARE_RECONFIGURABLE_SINGLETON(configType, dynamicConfigType) \ + void CheckSingletonConfigRegistered(::NYT::NDetail::TSingletonConfigTag<configType, true>); \ + void CheckSingletonConfigRegistered(::NYT::NDetail::TSingletonConfigTag<dynamicConfigType, false>) + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/yt/core/misc/configurable_singleton_decl.h b/yt/yt/core/misc/configurable_singleton_decl.h new file mode 100644 index 0000000000..72d7b157c4 --- /dev/null +++ b/yt/yt/core/misc/configurable_singleton_decl.h @@ -0,0 +1,16 @@ +#pragma once + +namespace NYT { + 
+//////////////////////////////////////////////////////////////////////////////// + +#define YT_DECLARE_CONFIGURABLE_SINGLETON(configType) +#define YT_DECLARE_RECONFIGURABLE_SINGLETON(configType, dynamicConfigType) + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT + +#define CONFIGURABLE_SINGLETON_DECL_INL_H_ +#include "configurable_singleton_decl-inl.h" +#undef CONFIGURABLE_SINGLETON_DECL_INL_H_ diff --git a/yt/yt/core/misc/configurable_singleton_def-inl.h b/yt/yt/core/misc/configurable_singleton_def-inl.h new file mode 100644 index 0000000000..0f46b7c445 --- /dev/null +++ b/yt/yt/core/misc/configurable_singleton_def-inl.h @@ -0,0 +1,150 @@ +#ifndef CONFIGURABLE_SINGLETON_DEF_INL_H_ +#error "Direct inclusion of this file is not allowed, include configurable_singleton_def.h" +// For the sake of sane code completion. +#include "configurable_singleton_def.h" +#endif + +#include <library/cpp/yt/misc/static_initializer.h> + +#include <yt/yt/core/misc/collection_helpers.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +template <bool Static> +template <class TConfig> +TIntrusivePtr<TConfig> TSingletonsConfigBase<Static>::TryGetSingletonConfig() +{ + CheckSingletonConfigRegistered(TSingletonConfigTag<TConfig, true>()); + return std::any_cast<TIntrusivePtr<TConfig>>(*GetOrCrash(TypeToConfig_, typeid(TConfig))); +} + +template <bool Static> +template <class TConfig> +TIntrusivePtr<TConfig> TSingletonsConfigBase<Static>::GetSingletonConfig() +{ + auto config = TryGetSingletonConfig<TConfig>(); + YT_VERIFY(config); + return config; +} + +template <bool Static> +template <class TConfig> +void TSingletonsConfigBase<Static>::SetSingletonConfig(TIntrusivePtr<TConfig> config) +{ + CheckSingletonConfigRegistered(TSingletonConfigTag<TConfig, Static>()); + *GetOrCrash(TypeToConfig_, typeid(TConfig)) = std::move(config); +} + +template <class 
TManagerConfig> +using TRegisterSingletonField = std::function<void(NYTree::TYsonStructRegistrar<TManagerConfig> registrar)>; +using TConfigureSingleton = std::function<void(const std::any& config)>; +using TReconfigureSingleton = std::function<void(const std::any& config, const std::any& dynamicConfig)>; + +struct TSingletonTraits +{ + TRegisterSingletonField<TSingletonsConfig> RegisterField; + TRegisterSingletonField<TSingletonsDynamicConfig> RegisterDynamicField; + TConfigureSingleton Configure; + TReconfigureSingleton Reconfigure; +}; + +struct TSingletonConfigHelpers +{ + static void RegisterSingleton( + const std::string& singletonName, + TSingletonTraits singletonTraits); + + template <class TSingletonConfig, class TManagerConfig> + static TRegisterSingletonField<TManagerConfig> MakeRegisterField(const std::string& singletonName) + { + return [=] (NYTree::TYsonStructRegistrar<TManagerConfig> registrar) { + SetupSingletonConfigParameter( + registrar.template ParameterWithUniversalAccessor<TIntrusivePtr<TSingletonConfig>>( + // TODO(babenko): switch to std::string + TString(singletonName), + [=] (TManagerConfig* config) -> auto& { + auto it = config->NameToConfig_.find(singletonName); + if (it == config->NameToConfig_.end()) { + it = config->NameToConfig_.emplace(singletonName, std::any(TIntrusivePtr<TSingletonConfig>())).first; + EmplaceOrCrash(config->TypeToConfig_, std::type_index(typeid(TSingletonConfig)), &it->second); + } + return *std::any_cast<TIntrusivePtr<TSingletonConfig>>(&it->second); + })); + }; + } + + template <class TSingletonConfig> + static TConfigureSingleton MakeConfigureSingleton() + { + return [] (const std::any& config) { + auto typedConfig = std::any_cast<TIntrusivePtr<TSingletonConfig>>(config); + ConfigureSingleton(typedConfig); + }; + } + + template <class TSingletonConfig, class TDynamicSingletonConfig> + static TReconfigureSingleton MakeReconfigureSingleton() + { + return [] (const std::any& config, const std::any& dynamicConfig) 
{ + auto typedConfig = std::any_cast<TIntrusivePtr<TSingletonConfig>>(config); + auto typedDynamicConfig = std::any_cast<TIntrusivePtr<TDynamicSingletonConfig>>(dynamicConfig); + ReconfigureSingleton(typedConfig, typedDynamicConfig); + }; + } + + template <class TSingletonConfig> + static void RegisterSingleton(const std::string& singletonName) + { + RegisterSingleton( + singletonName, + TSingletonTraits{ + .RegisterField = MakeRegisterField<TSingletonConfig, TSingletonsConfig>(singletonName), + .Configure = MakeConfigureSingleton<TSingletonConfig>(), + }); + } + + template <class TSingletonConfig, class TDynamicSingletonConfig> + static void RegisterReconfigurableSingleton(const std::string& singletonName) + { + RegisterSingleton( + singletonName, + TSingletonTraits{ + .RegisterField = MakeRegisterField<TSingletonConfig, TSingletonsConfig>(singletonName), + .RegisterDynamicField = MakeRegisterField<TDynamicSingletonConfig, TSingletonsDynamicConfig>(singletonName), + .Configure = MakeConfigureSingleton<TSingletonConfig>(), + .Reconfigure = MakeReconfigureSingleton<TSingletonConfig, TDynamicSingletonConfig>(), + }); + } +}; + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +#undef YT_DEFINE_CONFIGURABLE_SINGLETON +#undef YT_DEFINE_RECONFIGURABLE_SINGLETON + +#define YT_DEFINE_CONFIGURABLE_SINGLETON(singletonName, configType) \ + [[maybe_unused]] void CheckSingletonConfigRegistered(::NYT::NDetail::TSingletonConfigTag<configType, true>) \ + { } \ + \ + YT_STATIC_INITIALIZER( \ + ::NYT::NDetail::TSingletonConfigHelpers::RegisterSingleton<configType>(singletonName)) + +#define YT_DEFINE_RECONFIGURABLE_SINGLETON(singletonName, configType, dynamicConfigType) \ + [[maybe_unused]] void CheckSingletonConfigRegistered(::NYT::NDetail::TSingletonConfigTag<configType, true>) \ + { } \ + \ + [[maybe_unused]] void CheckSingletonConfigRegistered(::NYT::NDetail::TSingletonConfigTag<dynamicConfigType, false>) \ + { } \ + \ 
+ YT_STATIC_INITIALIZER( \ + ::NYT::NDetail::TSingletonConfigHelpers::RegisterReconfigurableSingleton<configType, dynamicConfigType>(singletonName)) \ + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/yt/core/misc/configurable_singleton_def.cpp b/yt/yt/core/misc/configurable_singleton_def.cpp new file mode 100644 index 0000000000..acad95481d --- /dev/null +++ b/yt/yt/core/misc/configurable_singleton_def.cpp @@ -0,0 +1,151 @@ +#include "configurable_singleton_def.h" + +#include <library/cpp/yt/memory/leaky_singleton.h> + +#include <library/cpp/yt/threading/spin_lock.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +class TSingletonManagerImpl +{ +public: + static TSingletonManagerImpl* Get() + { + return LeakySingleton<TSingletonManagerImpl>(); + } + + void Register( + const std::string& singletonName, + TSingletonTraits singletonTraits) + { + YT_VERIFY(!AllRegistered_.load()); + EmplaceOrCrash(SingletonMap_, singletonName, std::move(singletonTraits)); + } + + + void Configure(const TSingletonsConfigPtr& config) + { + auto guard = Guard(ConfigureLock_); + + if (std::exchange(Configured_, true)) { + THROW_ERROR_EXCEPTION("Singletons have already been configured"); + } + + Config_ = config; + + for (const auto& [name, traits] : Singletons()) { + const auto& field = GetOrCrash(config->NameToConfig_, name); + traits.Configure(field); + } + } + + void Reconfigure(const TSingletonsDynamicConfigPtr& dynamicConfig) + { + auto guard = Guard(ConfigureLock_); + + if (!Configured_) { + THROW_ERROR_EXCEPTION("Singletons are not configured yet"); + } + + for (const auto& [name, traits] : Singletons()) { + if (const auto& reconfigure = traits.Reconfigure) { + const auto& singletonConfig = GetOrCrash(Config_->NameToConfig_, name); + const auto& 
singletonDynamicConfig = GetOrCrash(dynamicConfig->NameToConfig_, name); + reconfigure(singletonConfig, singletonDynamicConfig); + } + } + } + + using TSingletonMap = THashMap<std::string, TSingletonTraits>; + + const TSingletonMap& Singletons() const + { + AllRegistered_.store(true); + return SingletonMap_; + } + +private: + DECLARE_LEAKY_SINGLETON_FRIEND(); + TSingletonManagerImpl() = default; + + mutable std::atomic<bool> AllRegistered_ = false; + THashMap<std::string, TSingletonTraits> SingletonMap_; + + NThreading::TSpinLock ConfigureLock_; + TSingletonsConfigPtr Config_; + bool Configured_ = false; +}; + +//////////////////////////////////////////////////////////////////////////////// + +void TSingletonConfigHelpers::RegisterSingleton( + const std::string& fieldName, + TSingletonTraits singletonTraits) +{ + TSingletonManagerImpl::Get()->Register( + fieldName, + std::move(singletonTraits)); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <bool Static> +void TSingletonsConfigBase<Static>::RegisterSingletons( + auto&& registrar, + auto&& registerFieldSelector) +{ + for (const auto& [_, traits] : NDetail::TSingletonManagerImpl::Get()->Singletons()) { + if (const auto& register_ = registerFieldSelector(traits)) { + register_(registrar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template class TSingletonsConfigBase<false>; +template class TSingletonsConfigBase<true>; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +void TSingletonManager::Configure(const TSingletonsConfigPtr& config) +{ + NDetail::TSingletonManagerImpl::Get()->Configure(config); +} + +void TSingletonManager::Reconfigure(const TSingletonsDynamicConfigPtr& dynamicConfig) +{ + NDetail::TSingletonManagerImpl::Get()->Reconfigure(dynamicConfig); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +void TSingletonsConfig::Register(TRegistrar registrar) +{ + RegisterSingletons( + registrar, + [] (const NDetail::TSingletonTraits& traits) { return traits.RegisterField; }); +} + +//////////////////////////////////////////////////////////////////////////////// + +void TSingletonsDynamicConfig::Register(TRegistrar registrar) +{ + RegisterSingletons( + registrar, + [] (const NDetail::TSingletonTraits& traits) { return traits.RegisterDynamicField; }); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/yt/core/misc/configurable_singleton_def.h b/yt/yt/core/misc/configurable_singleton_def.h new file mode 100644 index 0000000000..684d50e314 --- /dev/null +++ b/yt/yt/core/misc/configurable_singleton_def.h @@ -0,0 +1,93 @@ +#pragma once + +#include <yt/yt/core/ytree/yson_struct.h> + +#include <any> +#include <typeindex> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +struct TSingletonConfigHelpers; +class TSingletonManagerImpl; + +template <bool Static> +class TSingletonsConfigBase +{ +public: + template <class TConfig> + TIntrusivePtr<TConfig> TryGetSingletonConfig(); + + template <class TConfig> + TIntrusivePtr<TConfig> GetSingletonConfig(); + + template <class TConfig> + void SetSingletonConfig(TIntrusivePtr<TConfig> config); + +protected: + static void RegisterSingletons( + auto&& registrar, + auto&& registerFieldSelector); + +private: + friend struct NYT::NDetail::TSingletonConfigHelpers; + friend class NYT::NDetail::TSingletonManagerImpl; + + THashMap<std::string, std::any> NameToConfig_; + THashMap<std::type_index, std::any*> TypeToConfig_; +}; + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +class TSingletonsConfig + : public NDetail::TSingletonsConfigBase<true> + , 
public virtual NYTree::TYsonStruct +{ +public: + REGISTER_YSON_STRUCT(TSingletonsConfig); + + static void Register(TRegistrar registrar); +}; + +DEFINE_REFCOUNTED_TYPE(TSingletonsConfig); + +//////////////////////////////////////////////////////////////////////////////// + +class TSingletonsDynamicConfig + : public NDetail::TSingletonsConfigBase<false> + , public virtual NYTree::TYsonStruct +{ +public: + REGISTER_YSON_STRUCT(TSingletonsDynamicConfig); + + static void Register(TRegistrar registrar); +}; + +DEFINE_REFCOUNTED_TYPE(TSingletonsDynamicConfig); + +//////////////////////////////////////////////////////////////////////////////// + +#define YT_DEFINE_CONFIGURABLE_SINGLETON(singletonName, configType) +#define YT_DEFINE_RECONFIGURABLE_SINGLETON(singletonName, configType, dynamicConfigType) + +//////////////////////////////////////////////////////////////////////////////// + +class TSingletonManager +{ +public: + static void Configure(const TSingletonsConfigPtr& config); + static void Reconfigure(const TSingletonsDynamicConfigPtr& dynamicConfig); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT + +#define CONFIGURABLE_SINGLETON_DEF_INL_H_ +#include "configurable_singleton_def-inl.h" +#undef CONFIGURABLE_SINGLETON_DEF_INL_H_ diff --git a/yt/yt/core/misc/public.h b/yt/yt/core/misc/public.h index 9cff9ba2cd..2ce8dd5a2d 100644 --- a/yt/yt/core/misc/public.h +++ b/yt/yt/core/misc/public.h @@ -3,8 +3,6 @@ #include "common.h" #include "error_code.h" -#include <library/cpp/yt/misc/concepts.h> - // Google Protobuf forward declarations. 
namespace google::protobuf { @@ -102,6 +100,9 @@ DECLARE_REFCOUNTED_CLASS(TAsyncExpiringCacheConfig) DECLARE_REFCOUNTED_CLASS(TLogDigestConfig) DECLARE_REFCOUNTED_CLASS(THistogramDigestConfig) +DECLARE_REFCOUNTED_CLASS(TSingletonsConfig) +DECLARE_REFCOUNTED_CLASS(TSingletonsDynamicConfig) + class TSignalRegistry; class TBloomFilterBuilder; diff --git a/yt/yt/core/misc/unittests/configurable_singleton_ut.cpp b/yt/yt/core/misc/unittests/configurable_singleton_ut.cpp new file mode 100644 index 0000000000..64fdbfd6f5 --- /dev/null +++ b/yt/yt/core/misc/unittests/configurable_singleton_ut.cpp @@ -0,0 +1,234 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT { +namespace { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +DECLARE_REFCOUNTED_STRUCT(TRequiredSingletonConfig); +DECLARE_REFCOUNTED_STRUCT(TOptionalSingletonConfig); +DECLARE_REFCOUNTED_STRUCT(TDefaultNewSingletonConfig); +DECLARE_REFCOUNTED_STRUCT(TReconfigurableSingletonConfig); +DECLARE_REFCOUNTED_STRUCT(TReconfigurableSingletonDynamicConfig); + +YT_DECLARE_CONFIGURABLE_SINGLETON(TRequiredSingletonConfig); +YT_DECLARE_CONFIGURABLE_SINGLETON(TOptionalSingletonConfig); +YT_DECLARE_CONFIGURABLE_SINGLETON(TDefaultNewSingletonConfig); +YT_DECLARE_RECONFIGURABLE_SINGLETON(TReconfigurableSingletonConfig, TReconfigurableSingletonDynamicConfig); + +//////////////////////////////////////////////////////////////////////////////// + +struct TRequiredSingletonConfig + : public TYsonStruct +{ + int Speed; + + REGISTER_YSON_STRUCT(TRequiredSingletonConfig); + + static void Register(TRegistrar registarar) + { + registarar.Parameter("speed", &TThis::Speed); + } +}; + +DEFINE_REFCOUNTED_TYPE(TRequiredSingletonConfig) + +int ConfiguredSpeed = -1; + +void SetupSingletonConfigParameter(TYsonStructParameter<TRequiredSingletonConfigPtr>& /*parameter*/) +{ } + +void ConfigureSingleton(const 
TRequiredSingletonConfigPtr& config) +{ + ConfiguredSpeed = config->Speed; +} + +YT_DEFINE_CONFIGURABLE_SINGLETON("required", TRequiredSingletonConfig); + +//////////////////////////////////////////////////////////////////////////////// + +struct TOptionalSingletonConfig + : public TYsonStruct +{ + int Depth; + + REGISTER_YSON_STRUCT(TOptionalSingletonConfig); + + static void Register(TRegistrar registarar) + { + registarar.Parameter("depth", &TThis::Depth); + } +}; + +DEFINE_REFCOUNTED_TYPE(TOptionalSingletonConfig) + +int ConfiguredDepth = -1; + +void SetupSingletonConfigParameter(TYsonStructParameter<TOptionalSingletonConfigPtr>& parameter) +{ + parameter.Optional(); +} + +void ConfigureSingleton(const TOptionalSingletonConfigPtr& config) +{ + if (config) { + ConfiguredDepth = config->Depth; + } +} + +YT_DEFINE_CONFIGURABLE_SINGLETON("optional", TOptionalSingletonConfig); + +//////////////////////////////////////////////////////////////////////////////// + +struct TDefaultNewSingletonConfig + : public TYsonStruct +{ + int Width; + + REGISTER_YSON_STRUCT(TDefaultNewSingletonConfig); + + static void Register(TRegistrar registarar) + { + registarar.Parameter("width", &TThis::Width) + .Default(456); + } +}; + +DEFINE_REFCOUNTED_TYPE(TDefaultNewSingletonConfig) + +int ConfiguredWidth = -1; + +void SetupSingletonConfigParameter(TYsonStructParameter<TDefaultNewSingletonConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TDefaultNewSingletonConfigPtr& config) +{ + ConfiguredWidth = config->Width; +} + +YT_DEFINE_CONFIGURABLE_SINGLETON("default_new", TDefaultNewSingletonConfig); + +//////////////////////////////////////////////////////////////////////////////// + +struct TReconfigurableSingletonConfig + : public TYsonStruct +{ + int Cost; + + REGISTER_YSON_STRUCT(TReconfigurableSingletonConfig); + + static void Register(TRegistrar registarar) + { + registarar.Parameter("cost", &TThis::Cost) + .Default(777); + } +}; + 
+DEFINE_REFCOUNTED_TYPE(TReconfigurableSingletonConfig) + +struct TReconfigurableSingletonDynamicConfig + : public TYsonStruct +{ + std::optional<int> Cost; + + REGISTER_YSON_STRUCT(TReconfigurableSingletonDynamicConfig); + + static void Register(TRegistrar registarar) + { + registarar.Parameter("cost", &TThis::Cost) + .Default(); + } +}; + +DEFINE_REFCOUNTED_TYPE(TReconfigurableSingletonDynamicConfig) + +int ConfiguredCost = -1; + +void SetupSingletonConfigParameter(TYsonStructParameter<TReconfigurableSingletonConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void SetupSingletonConfigParameter(TYsonStructParameter<TReconfigurableSingletonDynamicConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TReconfigurableSingletonConfigPtr& config) +{ + ConfiguredCost = config->Cost; +} + +void ReconfigureSingleton( + const TReconfigurableSingletonConfigPtr& config, + const TReconfigurableSingletonDynamicConfigPtr& dynamicConfig) +{ + ConfiguredCost = dynamicConfig->Cost.value_or(config->Cost); +} + +YT_DEFINE_RECONFIGURABLE_SINGLETON( + "reconfigurable", + TReconfigurableSingletonConfig, + TReconfigurableSingletonDynamicConfig); + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TConfigurableSingletonTest, Run) +{ + auto config = ConvertTo<TSingletonsConfigPtr>(NYson::TYsonString(TString(R"""({ + required = { + speed = 123; + }; + })"""))); + auto dynamicConfig1 = ConvertTo<TSingletonsDynamicConfigPtr>(NYson::TYsonString(TString(R"""({ + reconfigurable = { + cost = 888; + }; + })"""))); + auto dynamicConfig2 = ConvertTo<TSingletonsDynamicConfigPtr>(NYson::TYsonString(TString(R"""({ + reconfigurable = { + cost = 999; + }; + })"""))); + + EXPECT_THROW_WITH_SUBSTRING(TSingletonManager::Reconfigure(dynamicConfig1), "Singletons are not configured yet"); + + EXPECT_EQ(ConfiguredSpeed, -1); + EXPECT_EQ(ConfiguredDepth, -1); + EXPECT_EQ(ConfiguredWidth, -1); + EXPECT_EQ(ConfiguredCost, -1); + + 
TSingletonManager::Configure(config); + + EXPECT_EQ(ConfiguredSpeed, 123); + EXPECT_EQ(ConfiguredDepth, -1); + EXPECT_EQ(ConfiguredWidth, 456); + EXPECT_EQ(ConfiguredCost, 777); + + EXPECT_THROW_WITH_SUBSTRING(TSingletonManager::Configure(config), "Singletons have already been configured"); + + TSingletonManager::Reconfigure(dynamicConfig1); + + EXPECT_EQ(ConfiguredSpeed, 123); + EXPECT_EQ(ConfiguredDepth, -1); + EXPECT_EQ(ConfiguredWidth, 456); + EXPECT_EQ(ConfiguredCost, 888); + + TSingletonManager::Reconfigure(dynamicConfig2); + + EXPECT_EQ(ConfiguredSpeed, 123); + EXPECT_EQ(ConfiguredDepth, -1); + EXPECT_EQ(ConfiguredWidth, 456); + EXPECT_EQ(ConfiguredCost, 999); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT diff --git a/yt/yt/core/misc/unittests/ya.make b/yt/yt/core/misc/unittests/ya.make index a422838270..79d62c52c2 100644 --- a/yt/yt/core/misc/unittests/ya.make +++ b/yt/yt/core/misc/unittests/ya.make @@ -72,6 +72,7 @@ SRCS( yverify_ut.cpp zerocopy_output_writer_ut.cpp hedging_manager_ut.cpp + configurable_singleton_ut.cpp proto/ref_counted_tracker_ut.proto ) diff --git a/yt/yt/core/net/address.cpp b/yt/yt/core/net/address.cpp index ba490e7739..d048167af3 100644 --- a/yt/yt/core/net/address.cpp +++ b/yt/yt/core/net/address.cpp @@ -15,6 +15,7 @@ #include <yt/yt/core/misc/async_expiring_cache.h> #include <yt/yt/core/misc/fs.h> +#include <yt/yt/core/misc/configurable_singleton_def.h> #include <yt/yt/core/profiling/timing.h> diff --git a/yt/yt/core/net/configure_address_resolver.cpp b/yt/yt/core/net/configure_address_resolver.cpp new file mode 100644 index 0000000000..16ae5220fd --- /dev/null +++ b/yt/yt/core/net/configure_address_resolver.cpp @@ -0,0 +1,28 @@ +#include "address.h" +#include "config.h" + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT::NNet { + +using namespace NYTree; + 
+//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TAddressResolverConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TAddressResolverConfigPtr& config) +{ + TAddressResolver::Get()->Configure(config); +} + +YT_DEFINE_CONFIGURABLE_SINGLETON( + "address_resolver", + TAddressResolverConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NNet diff --git a/yt/yt/core/net/public.h b/yt/yt/core/net/public.h index fde7eea47a..ea4891db83 100644 --- a/yt/yt/core/net/public.h +++ b/yt/yt/core/net/public.h @@ -1,9 +1,13 @@ #pragma once #include <yt/yt/core/misc/public.h> +#include <yt/yt/core/misc/configurable_singleton_decl.h> +#include <yt/yt/core/misc/error_code.h> #include <library/cpp/yt/memory/intrusive_ptr.h> +#include <library/cpp/yt/misc/guid.h> + namespace NYT::NNet { //////////////////////////////////////////////////////////////////////////////// @@ -32,6 +36,8 @@ YT_DEFINE_ERROR_ENUM( ((ResolveTimedOut) (1501)) ); +YT_DECLARE_CONFIGURABLE_SINGLETON(TAddressResolverConfig); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NNet diff --git a/yt/yt/core/rpc/configure_dispatcher.cpp b/yt/yt/core/rpc/configure_dispatcher.cpp new file mode 100644 index 0000000000..ec95cd855d --- /dev/null +++ b/yt/yt/core/rpc/configure_dispatcher.cpp @@ -0,0 +1,41 @@ +#include "dispatcher.h" +#include "config.h" + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT::NRpc { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TDispatcherConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void SetupSingletonConfigParameter(TYsonStructParameter<TDispatcherDynamicConfigPtr>& parameter) +{ + 
parameter.DefaultNew(); +} + +void ConfigureSingleton(const TDispatcherConfigPtr& config) +{ + TDispatcher::Get()->Configure(config); +} + +void ReconfigureSingleton( + const TDispatcherConfigPtr& config, + const TDispatcherDynamicConfigPtr& dynamicConfig) +{ + TDispatcher::Get()->Configure(config->ApplyDynamic(dynamicConfig)); +} + +YT_DEFINE_RECONFIGURABLE_SINGLETON( + "rpc_dispatcher", + TDispatcherConfig, + TDispatcherDynamicConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NRpc diff --git a/yt/yt/core/rpc/dispatcher.cpp b/yt/yt/core/rpc/dispatcher.cpp index 64edfdc3ae..0ede4496f9 100644 --- a/yt/yt/core/rpc/dispatcher.cpp +++ b/yt/yt/core/rpc/dispatcher.cpp @@ -161,7 +161,6 @@ void TDispatcher::SetServiceDiscovery(IServiceDiscoveryPtr serviceDiscovery) Impl_->SetServiceDiscovery(std::move(serviceDiscovery)); } - //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NRpc diff --git a/yt/yt/core/rpc/grpc/configure_dispatcher.cpp b/yt/yt/core/rpc/grpc/configure_dispatcher.cpp new file mode 100644 index 0000000000..e12e1f1739 --- /dev/null +++ b/yt/yt/core/rpc/grpc/configure_dispatcher.cpp @@ -0,0 +1,28 @@ +#include "dispatcher.h" +#include "config.h" + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT::NRpc::NGrpc { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TDispatcherConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TDispatcherConfigPtr& config) +{ + TDispatcher::Get()->Configure(config); +} + +YT_DEFINE_CONFIGURABLE_SINGLETON( + "grpc_dispatcher", + TDispatcherConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NRpc::NGrpc diff --git a/yt/yt/core/rpc/grpc/public.h b/yt/yt/core/rpc/grpc/public.h index 
7cb846bf33..737faf3f4c 100644 --- a/yt/yt/core/rpc/grpc/public.h +++ b/yt/yt/core/rpc/grpc/public.h @@ -1,5 +1,7 @@ #pragma once +#include <yt/yt/core/misc/configurable_singleton_decl.h> + #include <yt/yt/core/logging/log.h> namespace NYT::NRpc::NGrpc { @@ -44,6 +46,8 @@ const THashSet<TStringBuf>& GetNativeMetadataKeys(); constexpr int GenericErrorStatusCode = 100; +YT_DECLARE_CONFIGURABLE_SINGLETON(TDispatcherConfig); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NRpc::NGrpc diff --git a/yt/yt/core/rpc/grpc/ya.make b/yt/yt/core/rpc/grpc/ya.make index 4622b67b93..5fc1d908f7 100644 --- a/yt/yt/core/rpc/grpc/ya.make +++ b/yt/yt/core/rpc/grpc/ya.make @@ -6,6 +6,7 @@ PROTO_NAMESPACE(yt) SRCS( config.cpp + GLOBAL configure_dispatcher.cpp public.cpp dispatcher.cpp server.cpp diff --git a/yt/yt/core/rpc/http/server.cpp b/yt/yt/core/rpc/http/server.cpp index e0caa56bb7..783b82cd51 100644 --- a/yt/yt/core/rpc/http/server.cpp +++ b/yt/yt/core/rpc/http/server.cpp @@ -371,6 +371,11 @@ private: rpcHeader->set_request_codec(ToProto(NCompression::ECodec::None)); rpcHeader->set_response_codec(ToProto(NCompression::ECodec::None)); + ToProto( + rpcHeader->MutableExtension(NRpc::NProto::TRequestHeader::tracing_ext), + NTracing::TryGetCurrentTraceContext(), + /*sendBaggage*/ false); + return {}; } }; diff --git a/yt/yt/core/rpc/public.h b/yt/yt/core/rpc/public.h index 42933a8774..ea0a147594 100644 --- a/yt/yt/core/rpc/public.h +++ b/yt/yt/core/rpc/public.h @@ -1,5 +1,7 @@ #pragma once +#include <yt/yt/core/misc/configurable_singleton_decl.h> + #include <yt/yt/core/actions/callback.h> #include <yt/yt/core/concurrency/public.h> @@ -197,6 +199,8 @@ DEFINE_ENUM(EMessageFormat, ((Yson) (2)) ); +YT_DECLARE_RECONFIGURABLE_SINGLETON(TDispatcherConfig, TDispatcherDynamicConfig); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NRpc diff --git 
a/yt/yt/core/service_discovery/yp/configure_service_discovery.cpp b/yt/yt/core/service_discovery/yp/configure_service_discovery.cpp new file mode 100644 index 0000000000..3321226a4e --- /dev/null +++ b/yt/yt/core/service_discovery/yp/configure_service_discovery.cpp @@ -0,0 +1,30 @@ +#include "service_discovery.h" +#include "config.h" + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +#include <yt/yt/core/rpc/dispatcher.h> + +namespace NYT::NServiceDiscovery::NYP { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TServiceDiscoveryConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TServiceDiscoveryConfigPtr& config) +{ + NRpc::TDispatcher::Get()->SetServiceDiscovery(CreateServiceDiscovery(config)); +} + +YT_DEFINE_CONFIGURABLE_SINGLETON( + "yp_service_discovery", + TServiceDiscoveryConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NServiceDiscovery::NYP diff --git a/yt/yt/core/service_discovery/yp/public.h b/yt/yt/core/service_discovery/yp/public.h index 53d4de18d8..c90b9dc5a7 100644 --- a/yt/yt/core/service_discovery/yp/public.h +++ b/yt/yt/core/service_discovery/yp/public.h @@ -1,6 +1,7 @@ #pragma once #include <yt/yt/core/misc/public.h> +#include <yt/yt/core/misc/configurable_singleton_decl.h> namespace NYT::NServiceDiscovery::NYP { @@ -8,6 +9,8 @@ namespace NYT::NServiceDiscovery::NYP { DECLARE_REFCOUNTED_CLASS(TServiceDiscoveryConfig) +YT_DECLARE_CONFIGURABLE_SINGLETON(TServiceDiscoveryConfig); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NServiceDiscovery::NYP diff --git a/yt/yt/core/service_discovery/yp/ya.make b/yt/yt/core/service_discovery/yp/ya.make index cc37fa9639..e0efcc5a88 100644 --- a/yt/yt/core/service_discovery/yp/ya.make +++ 
b/yt/yt/core/service_discovery/yp/ya.make @@ -9,6 +9,7 @@ PEERDIR( SRCS( config.cpp + GLOBAL configure_service_discovery.cpp ) IF (NOT OPENSOURCE) diff --git a/yt/yt/core/ya.make b/yt/yt/core/ya.make index 2368a67992..9794fce0a2 100644 --- a/yt/yt/core/ya.make +++ b/yt/yt/core/ya.make @@ -27,6 +27,7 @@ SRCS( bus/tcp/dispatcher.cpp bus/tcp/dispatcher_impl.cpp bus/tcp/config.cpp + GLOBAL bus/tcp/configure_dispatcher.cpp bus/tcp/packet.cpp bus/tcp/client.cpp bus/tcp/server.cpp @@ -53,6 +54,7 @@ SRCS( concurrency/async_stream_pipe.cpp concurrency/async_stream.cpp concurrency/config.cpp + GLOBAL concurrency/configure_fiber_manager.cpp concurrency/coroutine.cpp concurrency/delayed_executor.cpp concurrency/execution_stack.cpp @@ -97,6 +99,7 @@ SRCS( logging/compression.cpp logging/config.cpp + GLOBAL logging/configure_log_manager.cpp logging/formatter.cpp logging/fluent_log.cpp GLOBAL logging/log.cpp @@ -164,10 +167,12 @@ SRCS( misc/cache_config.cpp misc/utf8_decoder.cpp misc/zerocopy_output_writer.cpp + misc/configurable_singleton_def.cpp net/address.cpp net/connection.cpp net/config.cpp + GLOBAL net/configure_address_resolver.cpp net/dialer.cpp net/helpers.cpp net/listener.cpp @@ -195,6 +200,7 @@ SRCS( rpc/channel_detail.cpp rpc/client.cpp rpc/config.cpp + GLOBAL rpc/configure_dispatcher.cpp rpc/dispatcher.cpp rpc/dynamic_channel_pool.cpp rpc/hedging_channel.cpp @@ -244,6 +250,7 @@ SRCS( yson/async_writer.cpp yson/attribute_consumer.cpp yson/config.cpp + GLOBAL yson/configure_protobuf_interop.cpp yson/consumer.cpp yson/forwarding_consumer.cpp yson/lexer.cpp diff --git a/yt/yt/core/yson/configure_protobuf_interop.cpp b/yt/yt/core/yson/configure_protobuf_interop.cpp new file mode 100644 index 0000000000..ca621c0664 --- /dev/null +++ b/yt/yt/core/yson/configure_protobuf_interop.cpp @@ -0,0 +1,41 @@ +#include "protobuf_interop.h" +#include "config.h" + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT::NYson { + +using namespace NYTree; + 
+//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TProtobufInteropConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void SetupSingletonConfigParameter(TYsonStructParameter<TProtobufInteropDynamicConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TProtobufInteropConfigPtr& config) +{ + SetProtobufInteropConfig(config); +} + +void ReconfigureSingleton( + const TProtobufInteropConfigPtr& config, + const TProtobufInteropDynamicConfigPtr& dynamicConfig) +{ + ConfigureSingleton(config->ApplyDynamic(dynamicConfig)); +} + +YT_DEFINE_RECONFIGURABLE_SINGLETON( + "protobuf_interop", + TProtobufInteropConfig, + TProtobufInteropDynamicConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NYson diff --git a/yt/yt/core/yson/public.h b/yt/yt/core/yson/public.h index d0932dea56..dae0c0b298 100644 --- a/yt/yt/core/yson/public.h +++ b/yt/yt/core/yson/public.h @@ -1,6 +1,7 @@ #pragma once #include <yt/yt/core/misc/public.h> +#include <yt/yt/core/misc/configurable_singleton_decl.h> #include <library/cpp/yt/yson/public.h> #include <library/cpp/yt/yson_string/public.h> @@ -98,6 +99,8 @@ class TProtobufMessageType; //! An opaque reflected counterpart of ::google::protobuf::EnumDescriptor. 
class TProtobufEnumType; +YT_DECLARE_RECONFIGURABLE_SINGLETON(TProtobufInteropConfig, TProtobufInteropDynamicConfig); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NYson diff --git a/yt/yt/core/yson/string.h b/yt/yt/core/yson/string.h index 45938e3f7d..bb0992d9c7 100644 --- a/yt/yt/core/yson/string.h +++ b/yt/yt/core/yson/string.h @@ -2,6 +2,8 @@ #include "public.h" +#include <yt/yt/core/misc/serialize.h> + #include <library/cpp/yt/yson_string/string.h> namespace NYT::NYson { diff --git a/yt/yt/core/yson/token.h b/yt/yt/core/yson/token.h index 66c3e91075..c1de561903 100644 --- a/yt/yt/core/yson/token.h +++ b/yt/yt/core/yson/token.h @@ -4,6 +4,8 @@ #include <yt/yt/core/misc/property.h> +#include <library/cpp/yt/string/string_builder.h> + namespace NYT::NYson { //////////////////////////////////////////////////////////////////////////////// diff --git a/yt/yt/core/yson/writer.cpp b/yt/yt/core/yson/writer.cpp index d31abf0b46..32e3f1636c 100644 --- a/yt/yt/core/yson/writer.cpp +++ b/yt/yt/core/yson/writer.cpp @@ -132,30 +132,6 @@ void WriteUtf8String(const char* str, size_t len, IOutputStream& output) } } -size_t FloatToStringWithNanInf(double value, char* buf, size_t size) -{ - if (std::isfinite(value)) { - return FloatToString(value, buf, size); - } - - static const TStringBuf nanLiteral = "%nan"; - static const TStringBuf infLiteral = "%inf"; - static const TStringBuf negativeInfLiteral = "%-inf"; - - TStringBuf str; - if (std::isnan(value)) { - str = nanLiteral; - } else if (std::isinf(value) && value > 0) { - str = infLiteral; - } else { - str = negativeInfLiteral; - } - YT_VERIFY(str.size() + 1 <= size); - ::memcpy(buf, str.data(), str.size() + 1); - return str.size(); -} - - } // namespace //////////////////////////////////////////////////////////////////////////////// @@ -277,7 +253,7 @@ void TYsonWriter::OnDoubleScalar(double value) Stream_->Write(&value, sizeof(double)); } else { char buf[256]; - auto 
str = TStringBuf(buf, FloatToStringWithNanInf(value, buf, sizeof(buf))); + auto str = TStringBuf(buf, NDetail::FloatToStringWithNanInf(value, buf, sizeof(buf))); Stream_->Write(str); if (str.find('.') == TString::npos && str.find('e') == TString::npos && std::isfinite(value)) { Stream_->Write("."); diff --git a/yt/yt/core/ytree/unittests/text_yson_convert_ut.cpp b/yt/yt/core/ytree/unittests/text_yson_convert_ut.cpp new file mode 100644 index 0000000000..75913bed0b --- /dev/null +++ b/yt/yt/core/ytree/unittests/text_yson_convert_ut.cpp @@ -0,0 +1,273 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/core/ytree/convert.h> + +#include <library/cpp/yt/misc/source_location.h> + +#include <library/cpp/yt/yson_string/convert.h> + +namespace NYT::NYTree { +namespace { + +using namespace NYson; + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +void CheckEqualConversionToTextYson(const T& value, const TSourceLocation& loc = YT_CURRENT_SOURCE_LOCATION) +{ + EXPECT_EQ(ConvertToTextYsonString(value).AsStringBuf(), ConvertToYsonString(value, EYsonFormat::Text).AsStringBuf()) + << NYT::Format("At %v", loc); +} + +template <class T, class U> +void CheckEqualConversionToFromTextYson(const U& value, const TSourceLocation& loc = YT_CURRENT_SOURCE_LOCATION) +{ + auto yson = ConvertToTextYsonString(value); + EXPECT_EQ(ConvertFromTextYsonString<T>(yson), ConvertTo<T>(yson)) + << NYT::Format("At %v", loc); +} + +template <class T> +void CheckEqualConversionFromTextYson(TStringBuf value, const TSourceLocation& loc = YT_CURRENT_SOURCE_LOCATION) +{ + NYson::TYsonString yson(value); + EXPECT_EQ(ConvertTo<T>(yson), ConvertTo<T>(yson)) + << NYT::Format("At %v", loc); +} + +TEST(TTextYsonConvertTest, ConvertToTextIntegrals) +{ + CheckEqualConversionToTextYson<i8>(+14); + CheckEqualConversionToTextYson<i8>(0); + CheckEqualConversionToTextYson<i8>(-15); + CheckEqualConversionToTextYson<i32>(+100); + 
CheckEqualConversionToTextYson<i32>(0); + CheckEqualConversionToTextYson<i32>(-123); + CheckEqualConversionToTextYson<i64>(+100); + CheckEqualConversionToTextYson<i64>(0); + CheckEqualConversionToTextYson<i64>(-123); + + CheckEqualConversionToTextYson<ui8>(+100); + CheckEqualConversionToTextYson<ui8>(0); + CheckEqualConversionToTextYson<ui32>(+100); + CheckEqualConversionToTextYson<ui32>(0); + CheckEqualConversionToTextYson<ui64>(+100); + CheckEqualConversionToTextYson<ui64>(0); +} + +TEST(TTextYsonConvertTest, ConvertToTextIntegralsLimits) +{ + CheckEqualConversionToTextYson<i64>(std::numeric_limits<i64>::max()); + CheckEqualConversionToTextYson<i64>(std::numeric_limits<i64>::min()); + + CheckEqualConversionToTextYson<ui64>(std::numeric_limits<ui64>::max()); + CheckEqualConversionToTextYson<ui64>(std::numeric_limits<ui64>::min()); +} + +TEST(TTextYsonConvertTest, ConvertToTextFloats) +{ + CheckEqualConversionToTextYson<float>(0.0); + CheckEqualConversionToTextYson<float>(-0.0); + CheckEqualConversionToTextYson<float>(-7.7777); + CheckEqualConversionToTextYson<float>(+9.243); + + CheckEqualConversionToTextYson<double>(0.0); + CheckEqualConversionToTextYson<double>(-0.0); + CheckEqualConversionToTextYson<double>(-7.7777); + CheckEqualConversionToTextYson<double>(+9.243); +} + +TEST(TTextYsonConvertTest, ConvertToTextFloatsSpecialValues) +{ + CheckEqualConversionToTextYson<double>(std::numeric_limits<double>::min()); + CheckEqualConversionToTextYson<double>(std::numeric_limits<double>::max()); + CheckEqualConversionToTextYson<double>(std::numeric_limits<double>::infinity()); + CheckEqualConversionToTextYson<double>(-std::numeric_limits<double>::infinity()); + CheckEqualConversionToTextYson<double>(std::numeric_limits<double>::quiet_NaN()); +} + +TEST(TTextYsonConvertTest, ConvertToTextOtherPrimitiveTypes) +{ + CheckEqualConversionToTextYson<bool>(true); + CheckEqualConversionToTextYson<bool>(false); + + CheckEqualConversionToTextYson<TInstant>(TInstant::Now()); + 
CheckEqualConversionToTextYson<TInstant>(TInstant::Zero()); + CheckEqualConversionToTextYson<TInstant>(TInstant::FromValue(42)); + + CheckEqualConversionToTextYson<TDuration>(TDuration::Zero()); + CheckEqualConversionToTextYson<TDuration>(TDuration::Seconds(2)); + CheckEqualConversionToTextYson<TDuration>(TDuration::MilliSeconds(123)); + CheckEqualConversionToTextYson<TDuration>(TDuration::MicroSeconds(12)); + + CheckEqualConversionToTextYson<std::string>("Hello, world!"); + CheckEqualConversionToTextYson<std::string>("This is a so-called \"quotation marks\" test"); + CheckEqualConversionToTextYson<std::string>("This tests \r other \b hidden symbols \n"); + CheckEqualConversionToTextYson<std::string>("And this one tests special numbers numbers \x012"); + + CheckEqualConversionToTextYson<TGuid>(TGuid::Create()); +} + +TEST(TTextYsonConvertTest, ConvertFromTextIntegrals) +{ + CheckEqualConversionToFromTextYson<i8>(+15); + CheckEqualConversionToFromTextYson<i8>(0); + CheckEqualConversionToFromTextYson<i8>(-15); + CheckEqualConversionToFromTextYson<i32>(+100); + CheckEqualConversionToFromTextYson<i32>(0); + CheckEqualConversionToFromTextYson<i32>(-123); + CheckEqualConversionToFromTextYson<i64>(+100); + CheckEqualConversionToFromTextYson<i64>(0); + CheckEqualConversionToFromTextYson<i64>(-123); + + CheckEqualConversionToFromTextYson<ui8>(+100); + CheckEqualConversionToFromTextYson<ui8>(0); + CheckEqualConversionToFromTextYson<ui32>(+100); + CheckEqualConversionToFromTextYson<ui32>(0); + CheckEqualConversionToFromTextYson<ui64>(+100); + CheckEqualConversionToFromTextYson<ui64>(0); +} + +TEST(TTextYsonConvertTest, ConvertFromTextIntegralsLimits) +{ + CheckEqualConversionToFromTextYson<i64>(std::numeric_limits<i64>::max()); + CheckEqualConversionToFromTextYson<i64>(std::numeric_limits<i64>::min()); + + CheckEqualConversionToFromTextYson<ui64>(std::numeric_limits<ui64>::max()); + CheckEqualConversionToFromTextYson<ui64>(std::numeric_limits<ui64>::min()); +} + 
+TEST(TTextYsonConvertTest, ConvertFromTextFloats) +{ + CheckEqualConversionToFromTextYson<double>(0.0); + CheckEqualConversionToFromTextYson<double>(-0.0); + CheckEqualConversionToFromTextYson<double>(-7.7777); + CheckEqualConversionToFromTextYson<double>(+9.243); +} + +TEST(TTextYsonConvertTest, ConvertFromTextFloatsSpecialValues) +{ + CheckEqualConversionToFromTextYson<double>(std::numeric_limits<double>::min()); + CheckEqualConversionToFromTextYson<double>(std::numeric_limits<double>::max()); + CheckEqualConversionToFromTextYson<double>(std::numeric_limits<double>::infinity()); + CheckEqualConversionToFromTextYson<double>(-std::numeric_limits<double>::infinity()); + + // nans do not compare. + // CheckEqualConversionFromTextYson<double>(std::numeric_limits<double>::quiet_NaN()); +} + +TEST(TTextYsonConvertTest, ConvertFromTextOtherPrimitiveTypes) +{ + CheckEqualConversionToTextYson<bool>(true); + CheckEqualConversionToTextYson<bool>(false); + CheckEqualConversionToTextYson<bool>("true"); + CheckEqualConversionToTextYson<bool>("false"); + CheckEqualConversionToTextYson<bool>("0"); + CheckEqualConversionToTextYson<bool>("1"); + + CheckEqualConversionToTextYson<TInstant>(TInstant::Now()); + CheckEqualConversionToTextYson<TInstant>(TInstant::Zero()); + CheckEqualConversionToTextYson<TInstant>(TInstant::FromValue(42)); + + CheckEqualConversionToTextYson<TDuration>(TDuration::Zero()); + CheckEqualConversionToTextYson<TDuration>(TDuration::Seconds(2)); + CheckEqualConversionToTextYson<TDuration>(TDuration::MilliSeconds(123)); + CheckEqualConversionToTextYson<TDuration>(TDuration::MicroSeconds(12)); + + CheckEqualConversionToTextYson<std::string>("Hello, world!"); + CheckEqualConversionToTextYson<std::string>("This is a so-called \"quotation marks\" test"); + CheckEqualConversionToTextYson<std::string>("This tests \r other \b hidden symbols \n"); + CheckEqualConversionToTextYson<std::string>("And this one tests special numbers numbers \x012"); + + 
CheckEqualConversionToTextYson<TGuid>(TGuid::Create()); +} + +TEST(TTextYsonConvertTest, ConvertFromTextIntegralsTypeMissmatch) +{ + CheckEqualConversionToFromTextYson<i8>(static_cast<ui64>(+100)); + CheckEqualConversionToFromTextYson<i8>(static_cast<ui64>(0)); + CheckEqualConversionToFromTextYson<i32>(static_cast<ui64>(+100)); + CheckEqualConversionToFromTextYson<i32>(static_cast<ui64>(0)); + CheckEqualConversionToFromTextYson<i64>(static_cast<ui64>(+100)); + CheckEqualConversionToFromTextYson<i64>(static_cast<ui64>(0)); +} + +TEST(TTextYsonConvertTest, ConvertFromTextTypeMissmatch) +{ + CheckEqualConversionFromTextYson<bool>("%true"); + CheckEqualConversionFromTextYson<bool>("%false"); + CheckEqualConversionFromTextYson<bool>("1"); + CheckEqualConversionFromTextYson<bool>("0"); + CheckEqualConversionFromTextYson<bool>("-0"); + CheckEqualConversionFromTextYson<bool>("1u"); + CheckEqualConversionFromTextYson<bool>("0u"); + + CheckEqualConversionFromTextYson<bool>(ConvertToTextYsonString("true").AsStringBuf()); + CheckEqualConversionFromTextYson<bool>(ConvertToTextYsonString("false").AsStringBuf()); + CheckEqualConversionFromTextYson<bool>(ConvertToTextYsonString("1").AsStringBuf()); + CheckEqualConversionFromTextYson<bool>(ConvertToTextYsonString("0").AsStringBuf()); +} + +TEST(TTextYsonConvertTest, ConvertFromTextYsonStringThrowBasicCases) +{ + auto fromPayload = [] (const auto& value) { + return NYson::TYsonString(TString(value)); + }; + + // Overflow. + EXPECT_ANY_THROW(ConvertFromTextYsonString<i8>(fromPayload("123123123213"))); + EXPECT_ANY_THROW(ConvertTo<i8>(fromPayload("123123123213"))); + + // Negative. + EXPECT_ANY_THROW(ConvertFromTextYsonString<ui64>(fromPayload("-123"))); + EXPECT_ANY_THROW(ConvertTo<ui64>(fromPayload("-123"))); + + // Non-numeric. 
+ EXPECT_ANY_THROW(ConvertFromTextYsonString<i64>(fromPayload("haha"))); + EXPECT_ANY_THROW(ConvertFromTextYsonString<i64>(fromPayload("123qq"))); + EXPECT_ANY_THROW(ConvertFromTextYsonString<i64>(fromPayload("-123u"))); + EXPECT_ANY_THROW(ConvertTo<i64>(fromPayload("haha"))); + EXPECT_ANY_THROW(ConvertTo<i64>(fromPayload("123qq"))); + EXPECT_ANY_THROW(ConvertTo<i64>(fromPayload("-123u"))); + + // Big positive to bool + EXPECT_ANY_THROW(ConvertFromTextYsonString<bool>(fromPayload("42"))); + EXPECT_ANY_THROW(ConvertTo<bool>(fromPayload("42"))); + + // Garbage to bool + EXPECT_ANY_THROW(ConvertFromTextYsonString<bool>(fromPayload("%falsse"))); + EXPECT_ANY_THROW(ConvertTo<bool>(fromPayload("%falsse"))); + + // Wrong string to bool + EXPECT_ANY_THROW(ConvertFromTextYsonString<bool>(fromPayload("\"True\""))); + EXPECT_ANY_THROW(ConvertFromTextYsonString<bool>(fromPayload("\"False\""))); + EXPECT_ANY_THROW(ConvertFromTextYsonString<bool>(fromPayload("\"1u\""))); + EXPECT_ANY_THROW(ConvertFromTextYsonString<bool>(fromPayload("\"0u\""))); + EXPECT_ANY_THROW(ConvertTo<bool>(fromPayload("\"True\""))); + EXPECT_ANY_THROW(ConvertTo<bool>(fromPayload("\"False\""))); + EXPECT_ANY_THROW(ConvertTo<bool>(fromPayload("\"1u\""))); + EXPECT_ANY_THROW(ConvertTo<bool>(fromPayload("\"0u\""))); + + // Wrong string to string + EXPECT_ANY_THROW(ConvertFromTextYsonString<std::string>(fromPayload(""))); + EXPECT_ANY_THROW(ConvertFromTextYsonString<std::string>(fromPayload("\""))); + EXPECT_ANY_THROW(ConvertFromTextYsonString<std::string>(fromPayload("haha\""))); + EXPECT_ANY_THROW(ConvertFromTextYsonString<std::string>(fromPayload("\'oops\'"))); + EXPECT_ANY_THROW(ConvertTo<std::string>(fromPayload(""))); + EXPECT_ANY_THROW(ConvertTo<std::string>(fromPayload("\""))); + EXPECT_ANY_THROW(ConvertTo<std::string>(fromPayload("haha\""))); + EXPECT_ANY_THROW(ConvertTo<std::string>(fromPayload("\'oops\'"))); + + // Wrong literal to double + 
EXPECT_ANY_THROW(ConvertFromTextYsonString<double>(fromPayload("%%"))); + EXPECT_ANY_THROW(ConvertFromTextYsonString<std::string>(fromPayload("%42inf"))); + EXPECT_ANY_THROW(ConvertFromTextYsonString<std::string>(fromPayload("%NaaN"))); + EXPECT_ANY_THROW(ConvertTo<double>(fromPayload("%%"))); + EXPECT_ANY_THROW(ConvertTo<std::string>(fromPayload("%42inf"))); + EXPECT_ANY_THROW(ConvertTo<std::string>(fromPayload("%NaaN"))); +} + +//////////////////////////////////////////////////////////////////////////////// +} // namespace +} // namespace NYT::NYTree diff --git a/yt/yt/core/ytree/unittests/ya.make b/yt/yt/core/ytree/unittests/ya.make index 7196cea98c..1bc54b95aa 100644 --- a/yt/yt/core/ytree/unittests/ya.make +++ b/yt/yt/core/ytree/unittests/ya.make @@ -10,6 +10,7 @@ SRCS( resolver_ut.cpp serialize_ut.cpp service_combiner_ut.cpp + text_yson_convert_ut.cpp tree_builder_ut.cpp lazy_ypath_service_ut.cpp yson_schema_ut.cpp diff --git a/yt/yt/core/ytree/ypath_client.h b/yt/yt/core/ytree/ypath_client.h index 08cb92c5a8..b9b2200baa 100644 --- a/yt/yt/core/ytree/ypath_client.h +++ b/yt/yt/core/ytree/ypath_client.h @@ -12,6 +12,8 @@ #include <library/cpp/yt/memory/ref.h> +#include <library/cpp/yt/logging/logger.h> + namespace NYT::NYTree { //////////////////////////////////////////////////////////////////////////////// diff --git a/yt/yt/library/backtrace_introspector/http/handler.cpp b/yt/yt/library/backtrace_introspector/http/handler.cpp deleted file mode 100644 index fe3cb65564..0000000000 --- a/yt/yt/library/backtrace_introspector/http/handler.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include "handler.h" - -#include <yt/yt/core/http/server.h> - -#include <yt/yt/core/concurrency/action_queue.h> - -#include <yt/yt/library/backtrace_introspector/introspect.h> - -namespace NYT::NBacktraceIntrospector { - -using namespace NHttp; -using namespace NConcurrency; - -//////////////////////////////////////////////////////////////////////////////// - -class THandlerBase - : public 
IHttpHandler -{ -public: - void HandleRequest(const IRequestPtr& /*req*/, const IResponseWriterPtr& rsp) override - { - try { - static const auto queue = New<TActionQueue>("BacktraceIntro"); - auto dumpFuture = BIND(&THandlerBase::Dump, MakeStrong(this)) - .AsyncVia(queue->GetInvoker()) - .Run(); - - auto dump = WaitFor(dumpFuture) - .ValueOrThrow(); - - WaitFor(rsp->WriteBody(TSharedRef::FromString(dump))) - .ThrowOnError(); - - WaitFor(rsp->Close()) - .ThrowOnError(); - } catch (const std::exception& ex) { - if (!rsp->AreHeadersFlushed()) { - rsp->SetStatus(EStatusCode::InternalServerError); - WaitFor(rsp->WriteBody(TSharedRef::FromString(ex.what()))) - .ThrowOnError(); - } - throw; - } - } - -protected: - virtual TString Dump() = 0; -}; - -class TThreadsHandler - : public THandlerBase -{ -private: - TString Dump() override - { - return FormatIntrospectionInfos(IntrospectThreads()); - } -}; - -class TFibersHandler - : public THandlerBase -{ -private: - TString Dump() override - { - return FormatIntrospectionInfos(IntrospectFibers()); - } -}; - -void Register( - const IRequestPathMatcherPtr& handlers, - const TString& prefix) -{ - handlers->Add(prefix + "/threads", New<TThreadsHandler>()); - handlers->Add(prefix + "/fibers", New<TFibersHandler>()); -} - -void Register( - const IServerPtr& server, - const TString& prefix) -{ - Register(server->GetPathMatcher(), prefix); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NBacktraceIntrospector diff --git a/yt/yt/library/backtrace_introspector/http/handler.h b/yt/yt/library/backtrace_introspector/http/handler.h deleted file mode 100644 index be795b7e5d..0000000000 --- a/yt/yt/library/backtrace_introspector/http/handler.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once - -#include <yt/yt/core/http/public.h> - -namespace NYT::NBacktraceIntrospector { - -//////////////////////////////////////////////////////////////////////////////// - -//! 
Registers introspector handlers. -void Register( - const NHttp::IRequestPathMatcherPtr& handlers, - const TString& prefix = {}); - -void Register( - const NHttp::IServerPtr& server, - const TString& prefix = {}); - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NBacktraceIntrospector diff --git a/yt/yt/library/backtrace_introspector/http/ya.make b/yt/yt/library/backtrace_introspector/http/ya.make deleted file mode 100644 index 504d20a2e3..0000000000 --- a/yt/yt/library/backtrace_introspector/http/ya.make +++ /dev/null @@ -1,16 +0,0 @@ -LIBRARY() - -INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) - -SRCS( - handler.cpp -) - -PEERDIR( - yt/yt/core - yt/yt/core/http - - yt/yt/library/backtrace_introspector -) - -END() diff --git a/yt/yt/library/backtrace_introspector/introspect.cpp b/yt/yt/library/backtrace_introspector/introspect.cpp deleted file mode 100644 index cfbd24a246..0000000000 --- a/yt/yt/library/backtrace_introspector/introspect.cpp +++ /dev/null @@ -1,224 +0,0 @@ -#include "introspect.h" - -#include "private.h" - -#include <yt/yt/core/misc/collection_helpers.h> -#include <yt/yt/core/misc/finally.h> -#include <yt/yt/core/misc/proc.h> - -#include <yt/yt/core/concurrency/fiber.h> -#include <yt/yt/core/concurrency/scheduler_api.h> - -#include <yt/yt/core/tracing/trace_context.h> - -#include <library/cpp/yt/memory/safe_memory_reader.h> - -#include <library/cpp/yt/backtrace/backtrace.h> - -#include <library/cpp/yt/backtrace/cursors/libunwind/libunwind_cursor.h> - -#include <library/cpp/yt/backtrace/cursors/frame_pointer/frame_pointer_cursor.h> - -#include <library/cpp/yt/backtrace/cursors/interop/interop.h> - -#include <util/system/yield.h> - -namespace NYT::NBacktraceIntrospector { - -using namespace NConcurrency; -using namespace NThreading; -using namespace NTracing; -using namespace NBacktrace; - -//////////////////////////////////////////////////////////////////////////////// - -static constexpr auto& 
Logger = BacktraceIntrospectorLogger; - -//////////////////////////////////////////////////////////////////////////////// - -std::vector<TFiberIntrospectionInfo> IntrospectFibers() -{ - YT_LOG_INFO("Fiber introspection started"); - - YT_LOG_INFO("Collecting waiting fibers backtraces"); - - std::vector<TFiberIntrospectionInfo> infos; - THashSet<TFiberId> waitingFiberIds; - THashMap<TFiberId, EFiberState> fiberStates; - - auto introspectionAction = [&] (NYT::NConcurrency::TFiber::TFiberList& fibers) { - for (auto& fiberRef : fibers) { - auto* fiber = fiberRef.AsFiber(); - - auto fiberId = fiber->GetFiberId(); - if (fiberId == InvalidFiberId) { - continue; - } - - EmplaceOrCrash(fiberStates, fiberId, EFiberState::Introspecting); - - EFiberState state; - - auto onIntrospectionLockAcquired = [&] { - YT_LOG_DEBUG("Waiting fiber is successfully locked for introspection (FiberId: %x)", - fiberId); - - const auto& propagatingStorage = *NConcurrency::TryGetPropagatingStorage(*fiber->GetFls()); - const auto* traceContext = TryGetTraceContextFromPropagatingStorage(propagatingStorage); - - TFiberIntrospectionInfo info{ - .State = EFiberState::Waiting, - .FiberId = fiberId, - .WaitingSince = fiber->GetWaitingSince(), - .TraceId = traceContext ? traceContext->GetTraceId() : TTraceId(), - .TraceLoggingTag = traceContext ? 
traceContext->GetLoggingTag() : TString(), - }; - - auto optionalContext = TrySynthesizeLibunwindContextFromMachineContext(*fiber->GetMachineContext()); - if (!optionalContext) { - YT_LOG_WARNING("Failed to synthesize libunwind context (FiberId: %x)", - fiberId); - return; - } - - TLibunwindCursor cursor(*optionalContext); - while (!cursor.IsFinished()) { - info.Backtrace.push_back(cursor.GetCurrentIP()); - cursor.MoveNext(); - } - - infos.push_back(std::move(info)); - InsertOrCrash(waitingFiberIds, fiberId); - - YT_LOG_DEBUG("Fiber introspection completed (FiberId: %x)", - info.FiberId); - }; - if (!fiber->TryLockForIntrospection(&state, onIntrospectionLockAcquired)) { - YT_LOG_DEBUG("Failed to lock fiber for introspection (FiberId: %x, State: %v)", - fiberId, - state); - fiberStates[fiberId] = state; - } - } - }; - - TFiber::ReadFibers(introspectionAction); - - YT_LOG_INFO("Collecting running fibers backtraces"); - - THashSet<TFiberId> runningFiberIds; - for (auto& info : IntrospectThreads()) { - if (info.FiberId == InvalidFiberId) { - continue; - } - - if (waitingFiberIds.contains(info.FiberId)) { - continue; - } - - if (!runningFiberIds.insert(info.FiberId).second) { - continue; - } - - infos.push_back(TFiberIntrospectionInfo{ - .State = EFiberState::Running, - .FiberId = info.FiberId, - .ThreadId = info.ThreadId, - .ThreadName = std::move(info.ThreadName), - .TraceId = info.TraceId, - .TraceLoggingTag = std::move(info.TraceLoggingTag), - .Backtrace = std::move(info.Backtrace), - }); - } - - for (const auto& [fiberId, fiberState] : fiberStates) { - if (fiberId == InvalidFiberId) { - continue; - } - if (runningFiberIds.contains(fiberId)) { - continue; - } - if (waitingFiberIds.contains(fiberId)) { - continue; - } - - infos.push_back(TFiberIntrospectionInfo{ - .State = fiberState, - .FiberId = fiberId, - }); - } - - YT_LOG_INFO("Fiber introspection completed"); - - return infos; -} - 
-//////////////////////////////////////////////////////////////////////////////// - -namespace { - -void FormatBacktrace(TStringBuilder* builder, const std::vector<const void*>& backtrace) -{ - if (!backtrace.empty()) { - builder->AppendString("Backtrace:\n"); - SymbolizeBacktrace( - TRange(backtrace), - [&] (TStringBuf str) { - builder->AppendFormat(" %v", str); - }); - } -} - -} // namespace - -TString FormatIntrospectionInfos(const std::vector<TThreadIntrospectionInfo>& infos) -{ - TStringBuilder builder; - for (const auto& info : infos) { - builder.AppendFormat("Thread id: %v\n", info.ThreadId); - builder.AppendFormat("Thread name: %v\n", info.ThreadName); - if (info.FiberId != InvalidFiberId) { - builder.AppendFormat("Fiber id: %x\n", info.FiberId); - } - if (info.TraceId) { - builder.AppendFormat("Trace id: %v\n", info.TraceId); - } - if (info.TraceLoggingTag) { - builder.AppendFormat("Trace logging tag: %v\n", info.TraceLoggingTag); - } - FormatBacktrace(&builder, info.Backtrace); - builder.AppendString("\n"); - } - return builder.Flush(); -} - -TString FormatIntrospectionInfos(const std::vector<TFiberIntrospectionInfo>& infos) -{ - TStringBuilder builder; - for (const auto& info : infos) { - builder.AppendFormat("Fiber id: %x\n", info.FiberId); - builder.AppendFormat("State: %v\n", info.State); - if (info.WaitingSince) { - builder.AppendFormat("Waiting since: %v\n", info.WaitingSince); - } - if (info.ThreadId != InvalidThreadId) { - builder.AppendFormat("Thread id: %v\n", info.ThreadId); - } - if (!info.ThreadName.empty()) { - builder.AppendFormat("Thread name: %v\n", info.ThreadName); - } - if (info.TraceId) { - builder.AppendFormat("Trace id: %v\n", info.TraceId); - } - if (info.TraceLoggingTag) { - builder.AppendFormat("Trace logging tag: %v\n", info.TraceLoggingTag); - } - FormatBacktrace(&builder, info.Backtrace); - builder.AppendString("\n"); - } - return builder.Flush(); -} - 
-//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NBacktraceIntrospector diff --git a/yt/yt/library/backtrace_introspector/introspect.h b/yt/yt/library/backtrace_introspector/introspect.h deleted file mode 100644 index 2be09d2ec8..0000000000 --- a/yt/yt/library/backtrace_introspector/introspect.h +++ /dev/null @@ -1,57 +0,0 @@ -#pragma once - -#include "public.h" - -#include <yt/yt/core/concurrency/public.h> - -#include <yt/yt/core/threading/public.h> - -#include <yt/yt/core/tracing/public.h> - -namespace NYT::NBacktraceIntrospector { - -//////////////////////////////////////////////////////////////////////////////// -// Thread introspection API - -struct TThreadIntrospectionInfo -{ - NThreading::TThreadId ThreadId; - NConcurrency::TFiberId FiberId; - TString ThreadName; - NTracing::TTraceId TraceId; - //! Empty if no trace context is known. - TString TraceLoggingTag; - std::vector<const void*> Backtrace; -}; - -std::vector<TThreadIntrospectionInfo> IntrospectThreads(); - -//////////////////////////////////////////////////////////////////////////////// -// Fiber introspection API - -struct TFiberIntrospectionInfo -{ - NConcurrency::EFiberState State; - NConcurrency::TFiberId FiberId; - //! Zero if fiber is not waiting. - TInstant WaitingSince; - //! |InvalidThreadId| is fiber is not running. - NThreading::TThreadId ThreadId; - //! Empty if fiber is not running. - TString ThreadName; - NTracing::TTraceId TraceId; - //! Empty if no trace context is known. 
- TString TraceLoggingTag; - std::vector<const void*> Backtrace; -}; - -std::vector<TFiberIntrospectionInfo> IntrospectFibers(); - -//////////////////////////////////////////////////////////////////////////////// - -TString FormatIntrospectionInfos(const std::vector<TThreadIntrospectionInfo>& infos); -TString FormatIntrospectionInfos(const std::vector<TFiberIntrospectionInfo>& infos); - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NBacktraceIntrospector diff --git a/yt/yt/library/backtrace_introspector/introspect_dummy.cpp b/yt/yt/library/backtrace_introspector/introspect_dummy.cpp deleted file mode 100644 index e29293c7f5..0000000000 --- a/yt/yt/library/backtrace_introspector/introspect_dummy.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include "introspect.h" - -namespace NYT::NBacktraceIntrospector { - -//////////////////////////////////////////////////////////////////////////////// - -std::vector<TThreadIntrospectionInfo> IntrospectThreads() -{ - return {}; -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NBacktraceIntrospector diff --git a/yt/yt/library/backtrace_introspector/introspect_linux.cpp b/yt/yt/library/backtrace_introspector/introspect_linux.cpp deleted file mode 100644 index f2fdf1e8c2..0000000000 --- a/yt/yt/library/backtrace_introspector/introspect_linux.cpp +++ /dev/null @@ -1,217 +0,0 @@ -#include "introspect.h" - -#include "private.h" - -#include <yt/yt/core/misc/finally.h> -#include <yt/yt/core/misc/proc.h> - -#include <yt/yt/core/concurrency/fiber.h> -#include <yt/yt/core/concurrency/scheduler_api.h> - -#include <yt/yt/core/tracing/trace_context.h> - -#include <library/cpp/yt/memory/safe_memory_reader.h> - -#include <library/cpp/yt/backtrace/backtrace.h> - -#include <library/cpp/yt/backtrace/cursors/libunwind/libunwind_cursor.h> - -#include <library/cpp/yt/backtrace/cursors/frame_pointer/frame_pointer_cursor.h> - -#include 
<library/cpp/yt/backtrace/cursors/interop/interop.h> - -#include <library/cpp/yt/misc/thread_name.h> - -#include <util/system/yield.h> - -#include <sys/syscall.h> - -namespace NYT::NBacktraceIntrospector { - -using namespace NConcurrency; -using namespace NTracing; -using namespace NBacktrace; - -//////////////////////////////////////////////////////////////////////////////// - -static constexpr auto& Logger = BacktraceIntrospectorLogger; - -//////////////////////////////////////////////////////////////////////////////// - -namespace { - -struct TStaticString -{ - TStaticString() = default; - - explicit TStaticString(TStringBuf str) - { - Length = std::min(std::ssize(str), std::ssize(Buffer)); - std::copy(str.data(), str.data() + Length, Buffer.data()); - } - - operator TString() const - { - return TString(Buffer.data(), static_cast<size_t>(Length)); - } - - std::array<char, 256> Buffer; - int Length = 0; -}; - -struct TStaticBacktrace -{ - operator std::vector<const void*>() const - { - return std::vector<const void*>(Frames.data(), Frames.data() + FrameCount); - } - - std::array<const void*, 100> Frames; - int FrameCount = 0; -}; - -struct TSignalHandlerContext -{ - TSignalHandlerContext(); - ~TSignalHandlerContext(); - - std::atomic<bool> Finished = false; - - TFiberId FiberId = {}; - TTraceId TraceId = {}; - TStaticString TraceLoggingTag; - TStaticBacktrace Backtrace; - TThreadName ThreadName = {}; - - TSafeMemoryReader* MemoryReader = Singleton<TSafeMemoryReader>(); - - void SetFinished() - { - Finished.store(true); - } - - void WaitUntilFinished() - { - while (!Finished.load()) { - ThreadYield(); - } - } -}; - -static TSignalHandlerContext* SignalHandlerContext; - -TSignalHandlerContext::TSignalHandlerContext() -{ - YT_VERIFY(!SignalHandlerContext); - SignalHandlerContext = this; -} - -TSignalHandlerContext::~TSignalHandlerContext() -{ - YT_VERIFY(SignalHandlerContext == this); - SignalHandlerContext = nullptr; -} - -void SignalHandler(int sig, siginfo_t* 
/*info*/, void* threadContext) -{ - YT_VERIFY(sig == SIGUSR1); - - SignalHandlerContext->FiberId = GetCurrentFiberId(); - SignalHandlerContext->ThreadName = GetCurrentThreadName(); - if (const auto* traceContext = TryGetCurrentTraceContext()) { - SignalHandlerContext->TraceId = traceContext->GetTraceId(); - SignalHandlerContext->TraceLoggingTag = TStaticString(traceContext->GetLoggingTag()); - } - - auto cursorContext = FramePointerCursorContextFromUcontext(*static_cast<const ucontext_t*>(threadContext)); - TFramePointerCursor cursor(SignalHandlerContext->MemoryReader, cursorContext); - while (!cursor.IsFinished() && SignalHandlerContext->Backtrace.FrameCount < std::ssize(SignalHandlerContext->Backtrace.Frames)) { - SignalHandlerContext->Backtrace.Frames[SignalHandlerContext->Backtrace.FrameCount++] = cursor.GetCurrentIP(); - cursor.MoveNext(); - } - - SignalHandlerContext->SetFinished(); -} - -} // namespace - -std::vector<TThreadIntrospectionInfo> IntrospectThreads() -{ - static std::atomic<bool> IntrospectionLock; - - if (IntrospectionLock.exchange(true)) { - THROW_ERROR_EXCEPTION("Thread introspection is already in progress"); - } - - auto introspectionLockGuard = Finally([] { - YT_VERIFY(IntrospectionLock.exchange(false)); - }); - - YT_LOG_INFO("Thread introspection started"); - - { - struct sigaction action; - action.sa_flags = SA_SIGINFO | SA_RESTART; - ::sigemptyset(&action.sa_mask); - action.sa_sigaction = SignalHandler; - - if (::sigaction(SIGUSR1, &action, nullptr) != 0) { - THROW_ERROR_EXCEPTION("Failed to install signal handler") - << TError::FromSystem(); - } - } - - std::vector<TThreadIntrospectionInfo> infos; - for (auto threadId : GetCurrentProcessThreadIds()) { - if (!IsUserspaceThread(threadId)) { - YT_LOG_DEBUG("Skipping a non-userspace thread (ThreadId: %v)", - threadId); - continue; - } - - TSignalHandlerContext signalHandlerContext; - if (::syscall(SYS_tkill, threadId, SIGUSR1) != 0) { - YT_LOG_DEBUG(TError::FromSystem(), "Failed to signal to 
thread (ThreadId: %v)", - threadId); - continue; - } - - YT_LOG_DEBUG("Sent signal to thread (ThreadId: %v)", - threadId); - - signalHandlerContext.WaitUntilFinished(); - - YT_LOG_DEBUG("Signal handler finished (ThreadId: %v, FiberId: %x)", - threadId, - signalHandlerContext.FiberId); - - infos.push_back(TThreadIntrospectionInfo{ - .ThreadId = threadId, - .FiberId = signalHandlerContext.FiberId, - .ThreadName = TString(signalHandlerContext.ThreadName.Buffer.data(), static_cast<size_t>(signalHandlerContext.ThreadName.Length)), - .TraceId = signalHandlerContext.TraceId, - .TraceLoggingTag = signalHandlerContext.TraceLoggingTag, - .Backtrace = signalHandlerContext.Backtrace, - }); - } - - { - struct sigaction action; - action.sa_flags = SA_RESTART; - ::sigemptyset(&action.sa_mask); - action.sa_handler = SIG_IGN; - - if (::sigaction(SIGUSR1, &action, nullptr) != 0) { - THROW_ERROR_EXCEPTION("Failed to de-install signal handler") - << TError::FromSystem(); - } - } - - YT_LOG_INFO("Thread introspection completed"); - - return infos; -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NBacktraceIntrospector diff --git a/yt/yt/library/backtrace_introspector/private.h b/yt/yt/library/backtrace_introspector/private.h deleted file mode 100644 index 3f99c307a5..0000000000 --- a/yt/yt/library/backtrace_introspector/private.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "public.h" - -#include <yt/yt/core/logging/log.h> - -namespace NYT::NBacktraceIntrospector { - -//////////////////////////////////////////////////////////////////////////////// - -YT_DEFINE_GLOBAL(const NLogging::TLogger, BacktraceIntrospectorLogger, "BacktraceIntrospector"); - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NBacktraceIntrospector - diff --git a/yt/yt/library/backtrace_introspector/public.h b/yt/yt/library/backtrace_introspector/public.h deleted file mode 100644 index 
54a8bd06ed..0000000000 --- a/yt/yt/library/backtrace_introspector/public.h +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -namespace NYT::NBacktraceIntrospector { - -//////////////////////////////////////////////////////////////////////////////// - -struct TThreadIntrospectionInfo; -struct TFiberIntrospectionInfo; - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NBacktraceIntrospector diff --git a/yt/yt/library/backtrace_introspector/unittests/introspect_ut.cpp b/yt/yt/library/backtrace_introspector/unittests/introspect_ut.cpp deleted file mode 100644 index a939417958..0000000000 --- a/yt/yt/library/backtrace_introspector/unittests/introspect_ut.cpp +++ /dev/null @@ -1,198 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/library/backtrace_introspector/introspect.h> - -#include <yt/yt/core/concurrency/action_queue.h> -#include <yt/yt/core/concurrency/delayed_executor.h> - -#include <yt/yt/core/actions/bind.h> -#include <yt/yt/core/actions/future.h> - -#include <yt/yt/core/tracing/trace_context.h> - -#include <yt/yt/core/logging/log.h> - -#include <yt/yt/core/misc/collection_helpers.h> - -namespace NYT::NBacktraceIntrospector { -namespace { - -using namespace NConcurrency; -using namespace NTracing; - -//////////////////////////////////////////////////////////////////////////////// - -NLogging::TLogger Logger("Test"); - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TBacktraceIntrospectorTest, Fibers) -{ - constexpr int HeavyQueueCount = 5; - std::vector<TActionQueuePtr> heavyQueues; - const TString HeavyThreadNamePrefix("Heavy:"); - for (int index = 0; index < HeavyQueueCount; ++index) { - heavyQueues.push_back(New<TActionQueue>(HeavyThreadNamePrefix + ToString(index))); - } - - constexpr int LightQueueCount = 3; - std::vector<TActionQueuePtr> lightQueues; - const TString LightThreadNamePrefix("Light:"); - for (int index = 0; index < 
LightQueueCount; ++index) { - lightQueues.push_back(New<TActionQueue>(LightThreadNamePrefix + ToString(index))); - } - - constexpr int HeavyCallbackCount = 3; - std::vector<TTraceContextPtr> heavyTraceContexts; - std::set<TTraceId> expectedHeavyTraceIds; - for (int index = 0; index < HeavyCallbackCount; ++index) { - auto traceContext = TTraceContext::NewRoot("Heavy"); - traceContext->SetLoggingTag(Format("HeavyLoggingTag:%v", index)); - heavyTraceContexts.push_back(traceContext); - InsertOrCrash(expectedHeavyTraceIds, traceContext->GetTraceId()); - } - - std::vector<TFuture<void>> heavyFutures; - for (int index = 0; index < HeavyCallbackCount; ++index) { - heavyFutures.push_back( - BIND([&, index] { - TTraceContextGuard traceContextGuard(heavyTraceContexts[index]); - YT_LOG_INFO("Heavy callback started (Index: %v)", index); - Sleep(TDuration::Seconds(3)); - YT_LOG_INFO("Heavy callback finished (Index: %v)", index); - }) - .AsyncVia(heavyQueues[index % HeavyQueueCount]->GetInvoker()) - .Run()); - } - - constexpr int LightCallbackCount = 10; - std::vector<TTraceContextPtr> lightTraceContexts; - std::set<TTraceId> expectedLightTraceIds; - for (int index = 0; index < LightCallbackCount; ++index) { - auto traceContext = TTraceContext::NewRoot("Light"); - traceContext->SetLoggingTag(Format("LightLoggingTag:%v", index)); - lightTraceContexts.push_back(traceContext); - InsertOrCrash(expectedLightTraceIds, traceContext->GetTraceId()); - } - - std::vector<TFuture<void>> lightFutures; - for (int index = 0; index < LightCallbackCount; ++index) { - lightFutures.push_back( - BIND([&, index] { - TTraceContextGuard traceContextGuard(lightTraceContexts[index]); - YT_LOG_INFO("Light callback started (Index: %v)", index); - TDelayedExecutor::WaitForDuration(TDuration::Seconds(1)); - YT_LOG_INFO("Light callback finished (Index: %v)", index); - }) - .AsyncVia(lightQueues[index % LightQueueCount]->GetInvoker()) - .Run()); - } - - Sleep(TDuration::MilliSeconds(100)); - - auto infos = 
IntrospectFibers(); - Cerr << FormatIntrospectionInfos(infos); - - std::set<TTraceId> actualHeavyTraceIds; - std::set<TTraceId> actualLightTraceIds; - for (const auto& info : infos) { - if (!info.TraceId) { - continue; - } - switch (info.State) { - case EFiberState::Running: - EXPECT_TRUE(actualHeavyTraceIds.insert(info.TraceId).second); - if (expectedHeavyTraceIds.contains(info.TraceId)) { - EXPECT_TRUE(info.ThreadName.StartsWith(HeavyThreadNamePrefix)); - } - break; - - case EFiberState::Waiting: - EXPECT_TRUE(actualLightTraceIds.insert(info.TraceId).second); - break; - - default: - break; - } - } - - EXPECT_EQ(expectedLightTraceIds, actualLightTraceIds); - EXPECT_EQ(expectedHeavyTraceIds, actualHeavyTraceIds); - - for (const auto& future : heavyFutures) { - future.Get().ThrowOnError(); - } - - for (const auto& future : lightFutures) { - future.Get().ThrowOnError(); - } - - for (const auto& queue : heavyQueues) { - queue->Shutdown(/*graceful*/ true); - } - for (const auto& queue : lightQueues) { - queue->Shutdown(/*graceful*/ true); - } -} - -TEST(TBacktraceIntrospectorTest, Threads) -{ - constexpr int QueueCount = 5; - std::vector<TActionQueuePtr> queues; - const TString ThreadNamePrefix("Queue:"); - for (int index = 0; index < QueueCount; ++index) { - queues.push_back(New<TActionQueue>(ThreadNamePrefix + ToString(index))); - } - - constexpr int CallbackCount = 3; - std::vector<TTraceContextPtr> traceContexts; - std::set<TTraceId> expectedTraceIds; - for (int index = 0; index < CallbackCount; ++index) { - auto traceContext = TTraceContext::NewRoot("Heavy"); - traceContexts.push_back(traceContext); - InsertOrCrash(expectedTraceIds, traceContext->GetTraceId()); - } - - std::vector<TFuture<void>> futures; - for (int index = 0; index < CallbackCount; ++index) { - futures.push_back( - BIND([&, index] { - TTraceContextGuard traceContextGuard(traceContexts[index]); - YT_LOG_INFO("Callback started (Index: %v)", index); - Sleep(TDuration::Seconds(3)); - 
YT_LOG_INFO("Callback finished (Index: %v)", index); - }) - .AsyncVia(queues[index % QueueCount]->GetInvoker()) - .Run()); - } - - Sleep(TDuration::MilliSeconds(100)); - - auto infos = IntrospectThreads(); - Cerr << FormatIntrospectionInfos(infos); - - std::set<TTraceId> actualTraceIds; - for (const auto& info : infos) { - if (!info.TraceId) { - continue; - } - EXPECT_TRUE(actualTraceIds.insert(info.TraceId).second); - if (expectedTraceIds.contains(info.TraceId)) { - EXPECT_TRUE(info.ThreadName.StartsWith(ThreadNamePrefix)); - } - } - - EXPECT_EQ(expectedTraceIds, actualTraceIds); - - for (const auto& future : futures) { - future.Get().ThrowOnError(); - } - for (const auto& queue : queues) { - queue->Shutdown(/*graceful*/ true); - } -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT::NBacktraceIntrospector diff --git a/yt/yt/library/backtrace_introspector/unittests/ya.make b/yt/yt/library/backtrace_introspector/unittests/ya.make deleted file mode 100644 index 393215d01e..0000000000 --- a/yt/yt/library/backtrace_introspector/unittests/ya.make +++ /dev/null @@ -1,15 +0,0 @@ -GTEST() - -SRCS( - introspect_ut.cpp -) - -INCLUDE(${ARCADIA_ROOT}/yt/opensource.inc) - -PEERDIR( - yt/yt/library/backtrace_introspector - - yt/yt/core/test_framework -) - -END() diff --git a/yt/yt/library/backtrace_introspector/ya.make b/yt/yt/library/backtrace_introspector/ya.make deleted file mode 100644 index 884b8fb562..0000000000 --- a/yt/yt/library/backtrace_introspector/ya.make +++ /dev/null @@ -1,31 +0,0 @@ -LIBRARY() - -INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) - -SRCS( - introspect.cpp -) -IF (OS_LINUX) - SRCS(introspect_linux.cpp) -ELSE() - SRCS(introspect_dummy.cpp) -ENDIF() - -PEERDIR( - yt/yt/core - - library/cpp/yt/backtrace/cursors/interop - library/cpp/yt/backtrace/cursors/libunwind - library/cpp/yt/backtrace/cursors/frame_pointer - library/cpp/yt/misc -) - -END() - -RECURSE( - http -) - 
-RECURSE_FOR_TESTS( - unittests -) diff --git a/yt/yt/library/formats/arrow_parser.cpp b/yt/yt/library/formats/arrow_parser.cpp index c3c169352b..7c112716e5 100644 --- a/yt/yt/library/formats/arrow_parser.cpp +++ b/yt/yt/library/formats/arrow_parser.cpp @@ -245,6 +245,7 @@ private: void ParseSimpleNumeric(FuncType makeUnversionedValueFunc) { auto array = std::static_pointer_cast<ArrayType>(Array_); + YT_VERIFY(array->length() <= std::ssize(*RowValues_)); for (int rowIndex = 0; rowIndex < array->length(); ++rowIndex) { if (array->IsNull(rowIndex)) { (*RowValues_)[rowIndex] = MakeUnversionedNullValue(ColumnId_); @@ -258,6 +259,7 @@ private: arrow::Status ParseStringLikeArray(auto makeUnversionedValueFunc) { auto array = std::static_pointer_cast<ArrayType>(Array_); + YT_VERIFY(array->length() <= std::ssize(*RowValues_)); for (int rowIndex = 0; rowIndex < array->length(); ++rowIndex) { if (array->IsNull(rowIndex)) { (*RowValues_)[rowIndex] = MakeUnversionedNullValue(ColumnId_); @@ -295,6 +297,7 @@ private: arrow::Status ParseBoolean() { auto array = std::static_pointer_cast<arrow::BooleanArray>(Array_); + YT_VERIFY(array->length() <= std::ssize(*RowValues_)); for (int rowIndex = 0; rowIndex < array->length(); rowIndex++) { if (array->IsNull(rowIndex)) { (*RowValues_)[rowIndex] = MakeUnversionedNullValue(ColumnId_); @@ -308,6 +311,7 @@ private: arrow::Status ParseNull() { auto array = std::static_pointer_cast<arrow::NullArray>(Array_); + YT_VERIFY(array->length() <= std::ssize(*RowValues_)); for (int rowIndex = 0; rowIndex < array->length(); rowIndex++) { (*RowValues_)[rowIndex] = MakeUnversionedNullValue(ColumnId_); } @@ -834,19 +838,21 @@ void PrepareArrayForSimpleLogicalType( { CheckMatchingArrowTypes(columnType, column); if (column->type()->id() == arrow::Type::DICTIONARY) { - auto dictionaryColumn = std::static_pointer_cast<arrow::DictionaryArray>(column); - TUnversionedRowValues dictionaryValues(rowsValues[columnIndex].size()); - auto dictionaryValuesColumn = 
dictionaryColumn->dictionary(); - CheckMatchingArrowTypes(columnType, dictionaryValuesColumn); + auto dictionaryArrayColumn = std::static_pointer_cast<arrow::DictionaryArray>(column); + auto dictionary = dictionaryArrayColumn->dictionary(); + TUnversionedRowValues dictionaryValues(dictionary->length()); + CheckMatchingArrowTypes(columnType, dictionary); - TArraySimpleVisitor visitor(columnType, columnId, dictionaryValuesColumn, bufferForStringLikeValues, &dictionaryValues); - ThrowOnError(dictionaryColumn->dictionary()->type()->Accept(&visitor)); + TArraySimpleVisitor visitor(columnType, columnId, dictionary, bufferForStringLikeValues, &dictionaryValues); + ThrowOnError(dictionaryArrayColumn->dictionary()->type()->Accept(&visitor)); for (int offset = 0; offset < std::ssize(rowsValues[columnIndex]); offset++) { - if (dictionaryColumn->IsNull(offset)) { + if (dictionaryArrayColumn->IsNull(offset)) { rowsValues[columnIndex][offset] = MakeUnversionedNullValue(columnId); } else { - rowsValues[columnIndex][offset] = dictionaryValues[dictionaryColumn->GetValueIndex(offset)]; + auto dictionaryValueIndex = dictionaryArrayColumn->GetValueIndex(offset); + YT_VERIFY(dictionaryValueIndex < std::ssize(dictionaryValues)); + rowsValues[columnIndex][offset] = dictionaryValues[dictionaryValueIndex]; } } } else { diff --git a/yt/yt/library/formats/unittests/arrow_parser_ut.cpp b/yt/yt/library/formats/unittests/arrow_parser_ut.cpp new file mode 100644 index 0000000000..4e960edb08 --- /dev/null +++ b/yt/yt/library/formats/unittests/arrow_parser_ut.cpp @@ -0,0 +1,690 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include "row_helpers.h" + +#include <yt/yt/library/formats/arrow_parser.h> + +#include <yt/yt/client/formats/config.h> +#include <yt/yt/client/formats/parser.h> +#include <yt/yt/client/table_client/name_table.h> +#include <yt/yt/client/table_client/validate_logical_type.h> +#include <yt/yt/library/formats/format.h> + +#include 
<contrib/libs/apache/arrow/cpp/src/arrow/api.h> +#include <contrib/libs/apache/arrow/cpp/src/arrow/io/api.h> +#include <contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h> +#include <contrib/libs/apache/arrow/cpp/src/arrow/ipc/api.h> +#include <contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h> + +namespace NYT { + +namespace { + +using namespace NFormats; +using namespace NTableClient; +using namespace NYTree; +using namespace NYson; + +using namespace std::string_literals; + +//////////////////////////////////////////////////////////////////////////////// + +std::string GetEos() +{ + std::string eos; + eos.assign(4, 0); + return eos; +} + +void Verify(const arrow::Status& status) +{ + YT_VERIFY(status.ok()); +} + +//////////////////////////////////////////////////////////////////////////////// + +std::string MakeOutputFromRecordBatch(const std::shared_ptr<arrow::RecordBatch>& recordBatch) +{ + auto outputStream = arrow::io::BufferOutputStream::Create().ValueOrDie(); + auto arrowWriter = arrow::ipc::MakeStreamWriter(outputStream, recordBatch->schema()).ValueOrDie(); + Verify(arrowWriter->WriteRecordBatch(*recordBatch)); + auto buffer = outputStream->Finish().ValueOrDie(); + return buffer->ToString(); +} + +std::string MakeIntegerArrow(const std::vector<int8_t>& data) +{ + arrow::Int8Builder builder; + + for (const auto& value : data) { + Verify(builder.Append(value)); + } + + auto intArray = builder.Finish(); + + auto arrowSchema = arrow::schema({arrow::field("integer", arrow::int8())}); + std::vector<std::shared_ptr<arrow::Array>> columns = {*intArray}; + auto recordBatch = arrow::RecordBatch::Make(arrowSchema, columns[0]->length(), columns); + return MakeOutputFromRecordBatch(recordBatch); +} + +std::string MakeOptionalIntegerArrow() +{ + arrow::Int8Builder builder; + + Verify(builder.Append(1)); + Verify(builder.AppendNull()); + Verify(builder.AppendNull()); + + auto data = builder.Finish(); + + auto arrowSchema = arrow::schema({arrow::field("opt", 
arrow::int8())}); + std::vector<std::shared_ptr<arrow::Array>> columns = {*data}; + auto recordBatch = arrow::RecordBatch::Make(arrowSchema, columns[0]->length(), columns); + return MakeOutputFromRecordBatch(recordBatch); +} + +std::string MakeBooleanArrow(const std::vector<bool>& data) +{ + arrow::BooleanBuilder builder; + + for (const auto& value : data) { + Verify(builder.Append(value)); + } + + auto boolArray = builder.Finish(); + + auto arrowSchema = arrow::schema({arrow::field("bool", arrow::boolean())}); + std::vector<std::shared_ptr<arrow::Array>> columns = {*boolArray}; + auto recordBatch = arrow::RecordBatch::Make(arrowSchema, columns[0]->length(), columns); + return MakeOutputFromRecordBatch(recordBatch); +} + +std::string MakeIntAndStringArrow(const std::vector<int8_t>& data, const std::vector<std::string>& stringData) +{ + arrow::Int8Builder builder; + + for (const auto& value : data) { + Verify(builder.Append(value)); + } + auto intArray = builder.Finish(); + + arrow::StringBuilder stringBuilder; + + for (const auto& value : stringData) { + Verify(stringBuilder.Append(value)); + } + + auto stringArray = stringBuilder.Finish(); + + auto arrowSchema = arrow::schema({ + arrow::field("integer", arrow::int8()), + arrow::field("string", arrow::binary()), + }); + + std::vector<std::shared_ptr<arrow::Array>> columns = {*intArray, *stringArray}; + auto recordBatch = arrow::RecordBatch::Make(arrowSchema, columns[0]->length(), columns); + + return MakeOutputFromRecordBatch(recordBatch); +} + +std::string MakeIntListArrow(const std::vector<std::optional<std::vector<int32_t>>>& data) +{ + auto* pool = arrow::default_memory_pool(); + auto valueBuilder = std::make_shared<arrow::Int32Builder>(pool); + auto listBuilder = std::make_unique<arrow::ListBuilder>(pool, valueBuilder); + + for (const auto& list : data) { + if (list) { + Verify(listBuilder->Append()); + for (const auto& value : *list) { + Verify(valueBuilder->Append(value)); + } + } else { + 
Verify(listBuilder->AppendNull()); + } + } + + auto arrowSchema = arrow::schema({arrow::field("list", listBuilder->type())}); + + std::shared_ptr<arrow::Array> listArray; + Verify(listBuilder->Finish(&listArray)); + std::vector<std::shared_ptr<arrow::Array>> columns = {listArray}; + + auto recordBatch = arrow::RecordBatch::Make(arrowSchema, columns[0]->length(), columns); + + return MakeOutputFromRecordBatch(recordBatch); +} + +std::string MakeStringListArrow(const std::vector<std::vector<std::string>>& data) +{ + auto* pool = arrow::default_memory_pool(); + + auto valueBuilder = std::make_shared<arrow::StringBuilder>(pool); + auto listBuilder = std::make_unique<arrow::ListBuilder>(pool, valueBuilder); + + for (const auto& list : data) { + Verify(listBuilder->Append()); + for (const auto& value : list) { + Verify(valueBuilder->Append(value)); + } + } + + auto arrowSchema = arrow::schema({arrow::field("list", listBuilder->type())}); + + std::shared_ptr<arrow::Array> listArray; + Verify(listBuilder->Finish(&listArray)); + std::vector<std::shared_ptr<arrow::Array>> columns = {listArray}; + + auto recordBatch = arrow::RecordBatch::Make(arrowSchema, columns[0]->length(), columns); + + return MakeOutputFromRecordBatch(recordBatch); +} + +std::string MakeMapArrow(const std::vector<std::vector<int32_t>>& key, const std::vector<std::vector<int32_t>>& value) +{ + auto* pool = arrow::default_memory_pool(); + + auto keyBuilder = std::make_shared<arrow::Int32Builder>(pool); + auto valueBuilder = std::make_shared<arrow::Int32Builder>(pool); + auto mapBuilder = std::make_unique<arrow::MapBuilder>(pool, keyBuilder, valueBuilder); + + for (ssize_t mapIndex = 0; mapIndex < std::ssize(key); mapIndex++) { + Verify(mapBuilder->Append()); + for (int valueNumber = 0; valueNumber < std::ssize(key[mapIndex]); valueNumber++) { + Verify(keyBuilder->Append(key[mapIndex][valueNumber])); + Verify(valueBuilder->Append(value[mapIndex][valueNumber])); + } + } + + auto arrowSchema = 
arrow::schema({arrow::field("map", mapBuilder->type())}); + + std::shared_ptr<arrow::Array> mapArray; + Verify(mapBuilder->Finish(&mapArray)); + std::vector<std::shared_ptr<arrow::Array>> columns = {mapArray}; + + auto recordBatch = arrow::RecordBatch::Make(arrowSchema, columns[0]->length(), columns); + + return MakeOutputFromRecordBatch(recordBatch); +} + +std::string MakeDictionaryArrow(bool addExtraValues = false) +{ + auto* pool = arrow::default_memory_pool(); + + arrow::DictionaryBuilder<arrow::Int32Type> dictionaryBuilder(pool); + + std::vector<int32_t> values = {1, 2, 1}; + + for (auto value : values) { + Verify(dictionaryBuilder.Append(value)); + } + + if (addExtraValues) { + arrow::Int32Builder builder; + Verify(builder.Append(3)); + Verify(builder.Append(4)); + Verify(builder.Append(5)); + auto intArray = *builder.Finish(); + Verify(dictionaryBuilder.InsertMemoValues(*intArray)); + } + + auto arrowSchema = arrow::schema({arrow::field("integer", dictionaryBuilder.type())}); + + std::shared_ptr<arrow::Array> array; + Verify(dictionaryBuilder.Finish(&array)); + + std::vector<std::shared_ptr<arrow::Array>> columns = {array}; + + auto recordBatch = arrow::RecordBatch::Make(arrowSchema, columns[0]->length(), columns); + + return MakeOutputFromRecordBatch(recordBatch); +} + +std::string MakeStructArrow(const std::vector<std::string>& stringData, const std::vector<int64_t>& intData) +{ + auto* pool = arrow::default_memory_pool(); + + auto stringBuilder = std::make_shared<arrow::StringBuilder>(pool); + auto intBuilder = std::make_shared<arrow::Int64Builder>(pool); + + std::vector<std::shared_ptr<arrow::Field>> fields = { + std::make_shared<arrow::Field>("bar", std::make_shared<arrow::StringType>()), + std::make_shared<arrow::Field>("foo", std::make_shared<arrow::Int64Type>()) + }; + + arrow::StructBuilder structBuilder( + std::make_shared<arrow::StructType>(fields), + pool, + {stringBuilder, intBuilder}); + + for (int index = 0; index < std::ssize(stringData); 
index++) { + Verify(structBuilder.Append()); + Verify(stringBuilder->Append(stringData[index])); + Verify(intBuilder->Append(intData[index])); + } + + std::shared_ptr<arrow::Schema> arrowSchema = arrow::schema({arrow::field("struct", structBuilder.type())}); + + std::shared_ptr<arrow::Array> structArray; + Verify(structBuilder.Finish(&structArray)); + std::vector<std::shared_ptr<arrow::Array>> columns = {structArray}; + + auto recordBatch = arrow::RecordBatch::Make(arrowSchema, columns[0]->length(), columns); + + return MakeOutputFromRecordBatch(recordBatch); +} + +std::string MakeDecimalArrows(std::vector<TString> values, std::vector<std::tuple<int, int, int>> columnParameters) +{ + auto* pool = arrow::default_memory_pool(); + + auto makeColumn = [&]<class TBuilder, class TType, class TValue>(int precision, int scale) { + auto builder = std::make_shared<TBuilder>(std::make_shared<TType>(precision, scale), pool); + for (const auto& value : values) { + Verify(builder->Append(TValue(std::string(value)))); + } + return builder->Finish().ValueOrDie(); + }; + + std::vector<std::shared_ptr<arrow::Array>> columns; + for (const auto& [bitness, precision, scale] : columnParameters) { + if (bitness == 128) { + columns.push_back(makeColumn.template operator()<arrow::Decimal128Builder, arrow::Decimal128Type, arrow::Decimal128>(precision, scale)); + } else if (bitness == 256) { + columns.push_back(makeColumn.template operator()<arrow::Decimal256Builder, arrow::Decimal256Type, arrow::Decimal256>(precision, scale)); + } else { + YT_ABORT(); + } + } + + arrow::FieldVector fields; + for (const auto& [bitness, precision, scale] : columnParameters) { + std::shared_ptr<arrow::DataType> type; + if (bitness == 128) { + type = std::make_shared<arrow::Decimal128Type>(precision, scale); + } else if (bitness == 256) { + type = std::make_shared<arrow::Decimal256Type>(precision, scale); + } else { + YT_ABORT(); + } + fields.push_back(std::make_shared<arrow::Field>(Format("decimal%v_%v_%v", 
bitness, precision, scale), type)); + } + + auto recordBatch = arrow::RecordBatch::Make(arrow::schema(std::move(fields)), columns[0]->length(), columns); + + return MakeOutputFromRecordBatch(recordBatch); +} + +std::string MakeDecimalListArrow(std::vector<TString> values) +{ + // Create a single column with one value, which is a list containing all the #values. + // Type of the list is Decimal128(10, 3). + auto* pool = arrow::default_memory_pool(); + auto decimalBuilder = std::make_shared<arrow::Decimal128Builder>(std::make_shared<arrow::Decimal128Type>(10, 3), pool); + auto listBuilder = std::make_unique<arrow::ListBuilder>(pool, decimalBuilder); + + Verify(listBuilder->Append()); + for (const auto& value : values) { + Verify(decimalBuilder->Append(arrow::Decimal128(std::string(value)))); + } + std::shared_ptr<arrow::Array> listArray; + Verify(listBuilder->Finish(&listArray)); + auto arrowSchema = arrow::schema({arrow::field("list", listArray->type())}); + std::vector<std::shared_ptr<arrow::Array>> columns = {listArray}; + auto recordBatch = arrow::RecordBatch::Make(arrowSchema, columns[0]->length(), columns); + return MakeOutputFromRecordBatch(recordBatch); +} + +void TestArrowParserWithDictionary(bool addExtraValues = false) +{ + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("integer", EValueType::Int64) + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + auto parser = CreateParserForArrow(&collectedRows); + + auto data = MakeDictionaryArrow(addExtraValues); + parser->Read(data); + parser->Finish(); + + ASSERT_EQ(collectedRows.Size(), 3u); + + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "integer")), 1); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(1, "integer")), 2); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(2, "integer")), 1); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TArrowParserTest, Simple) +{ + auto tableSchema = 
New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("integer", EValueType::Int64) + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + auto parser = CreateParserForArrow(&collectedRows); + + auto data = MakeIntegerArrow({1, 2, 3}); + parser->Read(data); + parser->Finish(); + + ASSERT_EQ(collectedRows.Size(), 3u); + + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "integer")), 1); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(1, "integer")), 2); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(2, "integer")), 3); +} + +TEST(TArrowParserTest, Optional) +{ + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("opt", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))) + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + auto parser = CreateParserForArrow(&collectedRows); + + auto data = MakeOptionalIntegerArrow(); + parser->Read(data); + parser->Finish(); + + ASSERT_EQ(collectedRows.Size(), 3u); + + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "opt")), 1); + ASSERT_TRUE(IsNull(collectedRows.GetRowValue(1, "opt"))); + ASSERT_TRUE(IsNull(collectedRows.GetRowValue(2, "opt"))); +} + +TEST(TArrowParserTest, Dictionary) +{ + TestArrowParserWithDictionary(false); + TestArrowParserWithDictionary(true); +} + +TEST(TArrowParserTest, Bool) +{ + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("bool", EValueType::Boolean), + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + auto parser = CreateParserForArrow(&collectedRows); + + auto data = MakeBooleanArrow({true, false, true}); + parser->Read(data); + parser->Finish(); + + ASSERT_EQ(collectedRows.Size(), 3u); + + ASSERT_EQ(GetBoolean(collectedRows.GetRowValue(0, "bool")), true); + ASSERT_EQ(GetBoolean(collectedRows.GetRowValue(1, "bool")), false); + ASSERT_EQ(GetBoolean(collectedRows.GetRowValue(2, "bool")), true); +} + +TEST(TArrowParserTest, String) +{ + auto tableSchema = 
New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("integer", EValueType::Any), + TColumnSchema("string", EValueType::String), + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + auto parser = CreateParserForArrow(&collectedRows); + + auto data = MakeIntAndStringArrow({1, 2, 3}, {"foo", "bar", "yt"}); + parser->Read(data); + parser->Finish(); + + ASSERT_EQ(collectedRows.Size(), 3u); + + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "integer")), 1); + ASSERT_EQ(GetString(collectedRows.GetRowValue(0, "string")), "foo"); + + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(1, "integer")), 2); + ASSERT_EQ(GetString(collectedRows.GetRowValue(1, "string")), "bar"); + + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(2, "integer")), 3); + ASSERT_EQ(GetString(collectedRows.GetRowValue(2, "string")), "yt"); +} + + +TString ConvertToYsonTextStringStable(const INodePtr& node) +{ + TStringStream out; + TYsonWriter writer(&out, EYsonFormat::Text); + VisitTree(node, &writer, true, TAttributeFilter()); + writer.Flush(); + return out.Str(); +} + +TEST(TArrowParserTest, ListOfIntegers) +{ + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("list", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))), + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + auto parser = CreateParserForArrow(&collectedRows); + + auto data = MakeIntListArrow({std::vector{1, 2, 3}, std::nullopt, std::vector{4, 5}}); + parser->Read(data); + parser->Finish(); + + auto firstNode = GetComposite(collectedRows.GetRowValue(0, "list")); + ASSERT_EQ(ConvertToYsonTextStringStable(firstNode), "[1;2;3;]"); + + ASSERT_EQ(EValueType::Null, collectedRows.GetRowValue(1, "list").Type); + + auto thirdNode = GetComposite(collectedRows.GetRowValue(2, "list")); + ASSERT_EQ(ConvertToYsonTextStringStable(thirdNode), "[4;5;]"); +} + +TEST(TArrowParserTest, ListOfStrings) +{ + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + 
TColumnSchema("list", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))), + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + auto parser = CreateParserForArrow(&collectedRows); + + auto data = MakeStringListArrow({{"foo", "bar"}, {"42", "universe"}}); + parser->Read(data); + parser->Finish(); + + auto firstNode = GetComposite(collectedRows.GetRowValue(0, "list")); + ASSERT_EQ(ConvertToYsonTextStringStable(firstNode), "[\"foo\";\"bar\";]"); + + auto secondNode = GetComposite(collectedRows.GetRowValue(1, "list")); + ASSERT_EQ(ConvertToYsonTextStringStable(secondNode), "[\"42\";\"universe\";]"); +} + +TEST(TArrowParserTest, Map) +{ + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema( + "map", + DictLogicalType( + SimpleLogicalType(ESimpleLogicalValueType::Int64), + SimpleLogicalType(ESimpleLogicalValueType::Uint64))), + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + auto parser = CreateParserForArrow(&collectedRows); + + auto data = MakeMapArrow({{1, 3}, {3}}, {{2, 2}, {2}}); + parser->Read(data); + parser->Finish(); + + auto firstNode = GetComposite(collectedRows.GetRowValue(0, "map")); + ASSERT_EQ(ConvertToYsonTextStringStable(firstNode), "[[1;2;];[3;2;];]"); + + auto secondNode = GetComposite(collectedRows.GetRowValue(1, "map")); + ASSERT_EQ(ConvertToYsonTextStringStable(secondNode), "[[3;2;];]"); +} + +TEST(TArrowParserTest, SeveralIntArrays) +{ + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("integer", EValueType::Int64), + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + auto parser = CreateParserForArrow(&collectedRows); + auto data = Format("%v%v%v", MakeIntegerArrow({1, 2, 3}), GetEos(), MakeIntegerArrow({5, 6})); + + parser->Read(data); + parser->Finish(); + + ASSERT_EQ(collectedRows.Size(), 5u); + + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "integer")), 1); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(1, "integer")), 
2); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(2, "integer")), 3); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(3, "integer")), 5); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(4, "integer")), 6); +} + +TEST(TArrowParserTest, Struct) +{ + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("struct", StructLogicalType({ + {"bar", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"foo", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + })), + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + auto parser = CreateParserForArrow(&collectedRows); + + parser->Read(MakeStructArrow({"one", "two"}, {1, 2})); + parser->Finish(); + + auto firstNode = GetComposite(collectedRows.GetRowValue(0, "struct")); + ASSERT_EQ(ConvertToYsonTextStringStable(firstNode), "[\"one\";1;]"); + + auto secondNode = GetComposite(collectedRows.GetRowValue(1, "struct")); + ASSERT_EQ(ConvertToYsonTextStringStable(secondNode), "[\"two\";2;]"); +} + +TEST(TArrowParserTest, DecimalVariousPrecisions) +{ + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("decimal128_10_3", DecimalLogicalType(10, 3)), + TColumnSchema("decimal128_35_3", DecimalLogicalType(35, 3)), + TColumnSchema("decimal128_38_3", DecimalLogicalType(38, 3)), + TColumnSchema("decimal256_10_3", DecimalLogicalType(10, 3)), + TColumnSchema("decimal256_35_3", DecimalLogicalType(35, 3)), + TColumnSchema("decimal256_38_3", DecimalLogicalType(38, 3)), + TColumnSchema("decimal256_76_3", DecimalLogicalType(76, 3)), + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + std::vector<TString> values = {"3.141", "0.000", "-2.718", "9999999.999"}; + + auto parser = CreateParserForArrow(&collectedRows); + + parser->Read(MakeDecimalArrows(values, {{128, 10, 3}, {128, 35, 3}, {128, 38, 3}, {256, 10, 3}, {256, 35, 3}, {256, 38, 3}, {256, 76, 3}})); + parser->Finish(); + + auto collectStrings = [&] (TStringBuf columnName) { + std::vector<TString> result; + 
for (size_t index = 0; index < values.size(); ++index) { + result.push_back(collectedRows.GetRowValue(index, columnName).AsString()); + } + return result; + }; + + std::vector<TString> expectedValues_10_3 = + {"\x80\x00\x00\x00\x00\x00\x0c\x45"s, "\x80\x00\x00\x00\x00\x00\x00\x00"s, "\x7f\xff\xff\xff\xff\xff\xf5\x62"s, "\x80\x00\x00\x02\x54\x0b\xe3\xff"s}; + std::vector<TString> expectedValues_35_3 = + { + "\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x45"s, "\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"s, + "\x7f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xf5\x62"s, "\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x54\x0b\xe3\xff"s, + }; + std::vector<TString> expectedValues_38_3 = + { + "\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x45"s, "\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"s, + "\x7f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xf5\x62"s, "\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x54\x0b\xe3\xff"s + }; + std::vector<TString> expectedValues_76_3 = + { + "\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x45"s, + "\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"s, + "\x7f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xf5\x62"s, + "\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x54\x0b\xe3\xff"s, + }; + ASSERT_EQ(expectedValues_10_3, collectStrings("decimal128_10_3")); + ASSERT_EQ(expectedValues_35_3, collectStrings("decimal128_35_3")); + ASSERT_EQ(expectedValues_38_3, collectStrings("decimal128_38_3")); + ASSERT_EQ(expectedValues_10_3, collectStrings("decimal256_10_3")); + ASSERT_EQ(expectedValues_35_3, collectStrings("decimal256_35_3")); + ASSERT_EQ(expectedValues_38_3, 
collectStrings("decimal256_38_3")); + ASSERT_EQ(expectedValues_76_3, collectStrings("decimal256_76_3")); +} + +TEST(TArrowParserTest, ListOfDecimals) +{ + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("list", ListLogicalType(DecimalLogicalType(10, 3))), + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + std::vector<TString> values = {"3.141", "0.000", "-2.718", "9999999.999"}; + + auto parser = CreateParserForArrow(&collectedRows); + + parser->Read(MakeDecimalListArrow(values)); + parser->Finish(); + + auto firstList = ConvertTo<std::vector<TString>>(GetComposite(collectedRows.GetRowValue(0, "list"))); + std::vector<TString> secondList = { + "\x80\x00\x00\x00\x00\x00\x0c\x45"s, "\x80\x00\x00\x00\x00\x00\x00\x00"s, + "\x7f\xff\xff\xff\xff\xff\xf5\x62"s, "\x80\x00\x00\x02\x54\x0b\xe3\xff"s + }; + ASSERT_EQ(firstList, secondList); +} + +TEST(TArrowParserTest, BlockingInput) +{ + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("integer", EValueType::Int64) + }); + + TCollectingValueConsumer collectedRows(tableSchema); + + auto parser = CreateParserForArrow(&collectedRows); + + auto data = MakeIntegerArrow({1, 2, 3}); + for (auto i : data) { + std::string s(1, i); + parser->Read(s); + } + parser->Finish(); + + ASSERT_EQ(collectedRows.Size(), 3u); + + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "integer")), 1); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(1, "integer")), 2); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(2, "integer")), 3); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT diff --git a/yt/yt/library/formats/unittests/dsv_parser_ut.cpp b/yt/yt/library/formats/unittests/dsv_parser_ut.cpp new file mode 100644 index 0000000000..f939585099 --- /dev/null +++ b/yt/yt/library/formats/unittests/dsv_parser_ut.cpp @@ -0,0 +1,365 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include 
<yt/yt/core/test_framework/yson_consumer_mock.h> + +#include <yt/yt/library/formats/dsv_parser.h> + +namespace NYT::NFormats { +namespace { + +using namespace NYson; + +using ::testing::InSequence; +using ::testing::StrictMock; +using ::testing::NiceMock; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TDsvParserTest, Simple) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("integer")); + EXPECT_CALL(Mock, OnStringScalar("42")); + EXPECT_CALL(Mock, OnKeyedItem("string")); + EXPECT_CALL(Mock, OnStringScalar("some")); + EXPECT_CALL(Mock, OnKeyedItem("double")); + EXPECT_CALL(Mock, OnStringScalar("10")); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("foo")); + EXPECT_CALL(Mock, OnStringScalar("bar")); + EXPECT_CALL(Mock, OnKeyedItem("one")); + EXPECT_CALL(Mock, OnStringScalar("1")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = + "integer=42\tstring=some\tdouble=10\n" + "foo=bar\tone=1\n"; + ParseDsv(input, &Mock); +} + +TEST(TDsvParserTest, EmptyInput) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + TString input = ""; + ParseDsv(input, &Mock); +} + +TEST(TDsvParserTest, BinaryData) +{ + StrictMock<TMockYsonConsumer> Mock; + + auto a = TString("\0\0\0\0", 4); + auto b = TString("\x80\0\x16\xC8", 4); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("ntr")); + EXPECT_CALL(Mock, OnStringScalar(a)); + EXPECT_CALL(Mock, OnKeyedItem("xrp")); + EXPECT_CALL(Mock, OnStringScalar(b)); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = "ntr=\\0\\0\\0\\0\txrp=\x80\\0\x16\xC8\n"; + ParseDsv(input, &Mock); +} + +TEST(TDsvParserTest, EmptyRecord) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + 
EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = "\n"; + ParseDsv(input, &Mock); +} + +TEST(TDsvParserTest, EmptyRecords) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = "\n\n"; + ParseDsv(input, &Mock); +} + +TEST(TDsvParserTest, EmptyKeysAndValues) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("")); + EXPECT_CALL(Mock, OnStringScalar("")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = "=\n"; + ParseDsv(input, &Mock); +} + +TEST(TDsvParserTest, UnescapedZeroInInput) +{ + StrictMock<TMockYsonConsumer> Mock; + + TString input = TString("a\0b=v", 5); + EXPECT_ANY_THROW({ + ParseDsv(input, &Mock); + }); +} + +TEST(TDsvParserTest, ZerosAreNotTerminals) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + TString key = TString("a\0b", 3); + TString value = TString("c\0d", 3); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem(key)); + EXPECT_CALL(Mock, OnStringScalar(value)); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = "a\\0b=c\\0d\n"; + ParseDsv(input, &Mock); +} + +TEST(TDsvParserTest, UnterminatedRecord) +{ + NiceMock<TMockYsonConsumer> Mock; + + TString input = "a=b"; + EXPECT_ANY_THROW({ + ParseDsv(input, &Mock); + }); +} + +//////////////////////////////////////////////////////////////////////////////// + +class TTskvParserTest: public ::testing::Test +{ +public: + StrictMock<TMockYsonConsumer> Mock; + NiceMock<TMockYsonConsumer> ErrorMock; + + TDsvFormatConfigPtr Config; + + void SetUp() override { + Config = New<TDsvFormatConfig>(); + Config->LinePrefix = "tskv"; + } +}; + 
+TEST_F(TTskvParserTest, Simple) +{ + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("id")); + EXPECT_CALL(Mock, OnStringScalar("1")); + EXPECT_CALL(Mock, OnKeyedItem("guid")); + EXPECT_CALL(Mock, OnStringScalar("100500")); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("id")); + EXPECT_CALL(Mock, OnStringScalar("2")); + EXPECT_CALL(Mock, OnKeyedItem("guid")); + EXPECT_CALL(Mock, OnStringScalar("20025")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = + "tskv\n" + "tskv\tid=1\tguid=100500\t\n" + "tskv\tid=2\tguid=20025\n"; + ParseDsv(input, &Mock, Config); +} + +TEST_F(TTskvParserTest, SimpleWithNewLine) +{ + InSequence dummy; + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("foo")); + EXPECT_CALL(Mock, OnStringScalar("bar")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = "tskv\tfoo=bar\n"; + ParseDsv(input, &Mock, Config); +} + +TEST_F(TTskvParserTest, Escaping) +{ + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a=b")); + EXPECT_CALL(Mock, OnStringScalar("c=d or e=f")); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key_with_\t,\r_and_\n")); + EXPECT_CALL(Mock, OnStringScalar("value_with_\t,\\_and_\r\n")); + EXPECT_CALL(Mock, OnKeyedItem("another_key")); + EXPECT_CALL(Mock, OnStringScalar("another_value")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = + "t\\s\\kv\n" + "tskv" "\t" "a\\=b" "=" "c\\=d or e=f" "\n" // Note: unescaping is less strict + "tskv" "\t" 
+ "key_with_\\t,\r_and_\\n" + "=" + "value_with_\\t,\\\\_and_\\r\\n" + "\t" + "an\\other_\\key=anoth\\er_v\\alue" + "\n"; + + ParseDsv(input, &Mock, Config); +} + +TEST_F(TTskvParserTest, DisabledEscaping) +{ + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a\\")); + EXPECT_CALL(Mock, OnStringScalar("b\\t=c\\=d or e=f\\0")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = + "tskv\t\\x\\y\n" + "tskv" "\t" "a\\=b\\t" "=" "c\\=d or e=f\\0" "\n"; + + Config->EnableEscaping = false; + + ParseDsv(input, &Mock, Config); +} + +TEST_F(TTskvParserTest, AllowedUnescapedSymbols) +{ + Config->LinePrefix = "prefix_with_="; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("just_key")); + EXPECT_CALL(Mock, OnStringScalar("value_with_=")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = "prefix_with_=" "\t" "just_key" "=" "value_with_=" "\n"; + ParseDsv(input, &Mock, Config); +} + +TEST_F(TTskvParserTest, UndefinedValues) +{ + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("b")); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = + "tskv" "\t" "tskv" "\t" "tskv" "\n" + "tskv\t" "some_key" "\t\t\t" "a=b" "\t" "another_key" "\n" // Note: consequent \t + "tskv\n"; + ParseDsv(input, &Mock, Config); +} + + +TEST_F(TTskvParserTest, OnlyLinePrefix) +{ + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = "tskv\n"; + ParseDsv(input, 
&Mock, Config); +} + +TEST_F(TTskvParserTest, OnlyLinePrefixAndTab) +{ + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = "tskv\t\n"; + ParseDsv(input, &Mock, Config); +} + +TEST_F(TTskvParserTest, NotFinishedLinePrefix) +{ + TString input = "tsk"; + + EXPECT_ANY_THROW({ + ParseDsv(input, &ErrorMock, Config); + }); +} + +TEST_F(TTskvParserTest, WrongLinePrefix) +{ + TString input = + "tskv\ta=b\n" + "tZkv\tc=d\te=f\n" + "tskv\ta=b\n"; + + EXPECT_ANY_THROW({ + ParseDsv(input, &ErrorMock, Config); + }); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NDriver diff --git a/yt/yt/library/formats/unittests/dsv_writer_ut.cpp b/yt/yt/library/formats/unittests/dsv_writer_ut.cpp new file mode 100644 index 0000000000..842a669557 --- /dev/null +++ b/yt/yt/library/formats/unittests/dsv_writer_ut.cpp @@ -0,0 +1,316 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/library/formats/dsv_parser.h> +#include <yt/yt/library/formats/dsv_writer.h> + +#include <yt/yt/client/table_client/name_table.h> +#include <yt/yt/client/table_client/unversioned_row.h> + +#include <yt/yt/core/concurrency/async_stream.h> + +namespace NYT::NFormats { +namespace { + +using namespace NYTree; +using namespace NYson; +using namespace NConcurrency; +using namespace NTableClient; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TDsvWriterTest, StringScalar) +{ + TStringStream outputStream; + TDsvNodeConsumer consumer(&outputStream); + + consumer.OnStringScalar("0-2-xb-1234"); + EXPECT_EQ("0-2-xb-1234", outputStream.Str()); +} + +TEST(TDsvWriterTest, ListContainingDifferentTypes) +{ + TStringStream outputStream; + TDsvNodeConsumer consumer(&outputStream); + + consumer.OnBeginList(); + consumer.OnListItem(); + consumer.OnInt64Scalar(100); + consumer.OnListItem(); + 
consumer.OnStringScalar("foo"); + consumer.OnListItem(); + consumer.OnListItem(); + consumer.OnBeginMap(); + consumer.OnKeyedItem("a"); + consumer.OnStringScalar("10"); + consumer.OnKeyedItem("b"); + consumer.OnStringScalar("c"); + consumer.OnEndMap(); + consumer.OnEndList(); + + TString output = + "100\n" + "foo\n" + "\n" + "a=10\tb=c\n"; + + EXPECT_EQ(output, outputStream.Str()); +} + +TEST(TDsvWriterTest, ListInsideList) +{ + TStringStream outputStream; + TDsvNodeConsumer consumer(&outputStream); + + consumer.OnBeginList(); + consumer.OnListItem(); + EXPECT_ANY_THROW(consumer.OnBeginList()); +} + +TEST(TDsvWriterTest, ListInsideMap) +{ + TStringStream outputStream; + TDsvNodeConsumer consumer(&outputStream); + + consumer.OnBeginMap(); + consumer.OnKeyedItem("foo"); + EXPECT_ANY_THROW(consumer.OnBeginList()); +} + +TEST(TDsvWriterTest, MapInsideMap) +{ + TStringStream outputStream; + TDsvNodeConsumer consumer(&outputStream); + + consumer.OnBeginMap(); + consumer.OnKeyedItem("foo"); + EXPECT_ANY_THROW(consumer.OnBeginMap()); +} + +TEST(TDsvWriterTest, WithoutEsacping) +{ + auto config = New<TDsvFormatConfig>(); + config->EnableEscaping = false; + + TStringStream outputStream; + TDsvNodeConsumer consumer(&outputStream, config); + + consumer.OnStringScalar("string_with_\t_\\_=_and_\n"); + + TString output = "string_with_\t_\\_=_and_\n"; + + EXPECT_EQ(output, outputStream.Str()); +} + +TEST(TDsvWriterTest, ListUsingOnRaw) +{ + TStringStream outputStream; + TDsvNodeConsumer consumer(&outputStream); + + consumer.OnRaw("[10; 20; 30]", EYsonType::Node); + TString output = + "10\n" + "20\n" + "30\n"; + + EXPECT_EQ(output, outputStream.Str()); +} + +TEST(TDsvWriterTest, MapUsingOnRaw) +{ + TStringStream outputStream; + TDsvNodeConsumer consumer(&outputStream); + + consumer.OnRaw("{a=b; c=d}", EYsonType::Node); + TString output = "a=b\tc=d"; + + EXPECT_EQ(output, outputStream.Str()); +} + +//////////////////////////////////////////////////////////////////////////////// + 
+TEST(TDsvWriterTest, SimpleTabular) +{ + auto nameTable = New<TNameTable>(); + auto integerId = nameTable->RegisterName("integer"); + auto stringId = nameTable->RegisterName("string"); + auto doubleId = nameTable->RegisterName("double"); + auto fooId = nameTable->RegisterName("foo"); + auto oneId = nameTable->RegisterName("one"); + auto tableIndexId = nameTable->RegisterName(TableIndexColumnName); + auto rowIndexId = nameTable->RegisterName(RowIndexColumnName); + auto rangeIndexId = nameTable->RegisterName(RangeIndexColumnName); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedInt64Value(42, integerId)); + row1.AddValue(MakeUnversionedStringValue("some", stringId)); + row1.AddValue(MakeUnversionedDoubleValue(10., doubleId)); + row1.AddValue(MakeUnversionedInt64Value(2, tableIndexId)); + row1.AddValue(MakeUnversionedInt64Value(42, rowIndexId)); + row1.AddValue(MakeUnversionedInt64Value(1, rangeIndexId)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("bar", fooId)); + row2.AddValue(MakeUnversionedSentinelValue(EValueType::Null, integerId)); + row2.AddValue(MakeUnversionedInt64Value(1, oneId)); + row2.AddValue(MakeUnversionedInt64Value(2, tableIndexId)); + row2.AddValue(MakeUnversionedInt64Value(43, rowIndexId)); + + std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow()}; + + TStringStream outputStream; + auto config = New<TDsvFormatConfig>(); + config->EnableTableIndex = true; + + auto controlAttributes = New<TControlAttributesConfig>(); + controlAttributes->EnableTableIndex = true; + auto writer = CreateSchemalessWriterForDsv( + config, + nameTable, + CreateAsyncAdapter(static_cast<IOutputStream*>(&outputStream)), + false, + controlAttributes, + 0); + + EXPECT_EQ(true, writer->Write(rows)); + writer->Close() + .Get() + .ThrowOnError(); + + TString output = + "integer=42\tstring=some\tdouble=10.\t@table_index=2\n" + "foo=bar\tone=1\t@table_index=2\n"; + EXPECT_EQ(output, outputStream.Str()); +} + 
+TEST(TDsvWriterTest, AnyTabular) +{ + auto nameTable = New<TNameTable>(); + auto anyId = nameTable->RegisterName("any"); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedAnyValue("[]", anyId)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + TStringStream outputStream; + auto controlAttributes = New<TControlAttributesConfig>(); + auto writer = CreateSchemalessWriterForDsv( + New<TDsvFormatConfig>(), + nameTable, + CreateAsyncAdapter(static_cast<IOutputStream*>(&outputStream)), + false, + controlAttributes, + 0); + + EXPECT_FALSE(writer->Write(rows)); + EXPECT_ANY_THROW(writer->GetReadyEvent().Get().ThrowOnError()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TTskvWriterTest, SimpleTabular) +{ + auto nameTable = New<TNameTable>(); + auto id1 = nameTable->RegisterName("id"); + auto id2 = nameTable->RegisterName("guid"); + auto tableIndexId = nameTable->RegisterName(TableIndexColumnName); + auto rowIndexId = nameTable->RegisterName(RowIndexColumnName); + auto rangeIndexId = nameTable->RegisterName(RangeIndexColumnName); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedInt64Value(2, tableIndexId)); + row1.AddValue(MakeUnversionedInt64Value(42, rowIndexId)); + row1.AddValue(MakeUnversionedInt64Value(1, rangeIndexId)); + + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("1", id1)); + row2.AddValue(MakeUnversionedInt64Value(100500, id2)); + + TUnversionedRowBuilder row3; + row3.AddValue(MakeUnversionedStringValue("2", id1)); + row3.AddValue(MakeUnversionedInt64Value(20025, id2)); + + std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow(), row3.GetRow() }; + + TStringStream outputStream; + auto config = New<TDsvFormatConfig>(); + config->LinePrefix = "tskv"; + + auto controlAttributes = New<TControlAttributesConfig>(); + auto writer = CreateSchemalessWriterForDsv( + config, + nameTable, + 
CreateAsyncAdapter(static_cast<IOutputStream*>(&outputStream)), + false, + controlAttributes, + 0); + + EXPECT_EQ(true, writer->Write(rows)); + writer->Close() + .Get() + .ThrowOnError(); + + TString output = + "tskv\n" + "tskv\tid=1\tguid=100500\n" + "tskv\tid=2\tguid=20025\n"; + + EXPECT_EQ(output, outputStream.Str()); +} + +TEST(TTskvWriterTest, Escaping) +{ + auto key1 = TString("\0 is escaped", 12); + + auto nameTable = New<TNameTable>(); + auto id1 = nameTable->RegisterName(key1); + auto id2 = nameTable->RegisterName("Escaping in in key: \r \t \n \\ ="); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedStringValue(key1, id1)); + row.AddValue(MakeUnversionedStringValue("Escaping in value: \r \t \n \\ =", id2)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + TStringStream outputStream; + auto config = New<TDsvFormatConfig>(); + config->LinePrefix = "tskv"; + + auto controlAttributes = New<TControlAttributesConfig>(); + auto writer = CreateSchemalessWriterForDsv( + config, + nameTable, + CreateAsyncAdapter(static_cast<IOutputStream*>(&outputStream)), + false, + controlAttributes, + 0); + + EXPECT_EQ(true, writer->Write(rows)); + writer->Close() + .Get() + .ThrowOnError(); + + TString output = + "tskv" + "\t" + + "\\0 is escaped" + "=" + "\\0 is escaped" + + "\t" + + "Escaping in in key: \\r \\t \\n \\\\ \\=" + "=" + "Escaping in value: \\r \\t \\n \\\\ =" // Note: = is not escaped + + "\n"; + + EXPECT_EQ(output, outputStream.Str()); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/unittests/format_writer_ut.h b/yt/yt/library/formats/unittests/format_writer_ut.h new file mode 100644 index 0000000000..9da194d588 --- /dev/null +++ b/yt/yt/library/formats/unittests/format_writer_ut.h @@ -0,0 +1,36 @@ +#pragma once + +#include <yt/yt/library/formats/format.h> + +#include <yt/yt/client/table_client/name_table.h> +#include 
<yt/yt/client/table_client/unversioned_row.h> + +namespace NYT::NFormats { +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +void TestNameTableExpansion(ISchemalessFormatWriterPtr writer, NTableClient::TNameTablePtr nameTable) +{ + // We write five rows, on each iteration we double number of + // columns in the NameTable. + for (int iteration = 0; iteration < 5; ++iteration) { + NTableClient::TUnversionedOwningRowBuilder row; + for (int index = 0; index < (1 << iteration); ++index) { + auto key = "Column" + ToString(index); + auto value = "Value" + ToString(index); + int columnId = nameTable->GetIdOrRegisterName(key); + row.AddValue(NTableClient::MakeUnversionedStringValue(value, columnId)); + } + auto completeRow = row.FinishRow(); + EXPECT_EQ(true, writer->Write({completeRow.Get()})); + } + writer->Close() + .Get() + .ThrowOnError(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/unittests/protobuf_format_ut.cpp b/yt/yt/library/formats/unittests/protobuf_format_ut.cpp new file mode 100644 index 0000000000..f3cb743833 --- /dev/null +++ b/yt/yt/library/formats/unittests/protobuf_format_ut.cpp @@ -0,0 +1,4659 @@ +#include "row_helpers.h" +#include "yson_helpers.h" +#include "yt/yt/client/table_client/public.h" + +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/core/concurrency/async_stream.h> +#include <yt/yt/core/json/json_parser.h> +#include <yt/yt/core/yson/string.h> +#include <yt/yt/core/ytree/fluent.h> + +#include <yt/yt/client/formats/config.h> +#include <yt/yt/client/formats/parser.h> +#include <yt/yt/client/formats/format.h> +#include <yt/yt/client/table_client/logical_type.h> +#include <yt/yt/client/table_client/name_table.h> +#include <yt/yt/client/table_client/value_consumer.h> +#include <yt/yt/client/table_client/unversioned_row.h> + +#include 
<yt/yt/library/formats/format.h> +#include <yt/yt/library/formats/lenval_control_constants.h> +#include <yt/yt/library/formats/protobuf_writer.h> +#include <yt/yt/library/formats/protobuf_parser.h> +#include <yt/yt/library/formats/protobuf.h> + +#include <yt/yt/library/formats/unittests/protobuf_format_ut.pb.h> + +#include <yt/yt/library/named_value/named_value.h> + +#include <util/random/fast.h> + +#include <google/protobuf/text_format.h> +#include <google/protobuf/descriptor.h> +#include <google/protobuf/descriptor.pb.h> + +using namespace std::string_view_literals; + + +namespace NYT { +namespace { + +using namespace NYson; +using namespace NYTree; +using namespace NFormats; +using namespace NTableClient; +using namespace NConcurrency; +using namespace NProtobufFormatTest; + +using ::google::protobuf::FileDescriptor; +using NNamedValue::MakeRow; + +//////////////////////////////////////////////////////////////////////////////// + +DEFINE_ENUM(EProtoFormatType, + (FileDescriptorLegacy) + (FileDescriptor) + (Structured) +); + +//////////////////////////////////////////////////////////////////////////////// + +#define EXPECT_NODES_EQUAL(a, b) \ + EXPECT_TRUE(AreNodesEqual((a), (b))) \ + << #a ": " << ConvertToYsonString((a), EYsonFormat::Text).ToString() \ + << "\n\n" #b ": " << ConvertToYsonString((b), EYsonFormat::Text).ToString(); + +//////////////////////////////////////////////////////////////////////////////// + +TString ConvertToTextYson(const INodePtr& node) +{ + return ConvertToYsonString(node, EYsonFormat::Text).ToString(); +} + +// Hardcoded serialization of file descriptor used in old format description. 
+TString FileDescriptorLegacy = "\x0a\xb6\x03\x0a\x29\x6a\x75\x6e\x6b\x2f\x65\x72\x6d\x6f\x6c\x6f\x76\x64\x2f\x74\x65\x73\x74\x2d\x70\x72\x6f\x74\x6f\x62" + "\x75\x66\x2f\x6d\x65\x73\x73\x61\x67\x65\x2e\x70\x72\x6f\x74\x6f\x22\x2d\x0a\x0f\x54\x45\x6d\x62\x65\x64\x65\x64\x4d\x65\x73\x73\x61\x67\x65\x12" + "\x0b\x0a\x03\x4b\x65\x79\x18\x01\x20\x01\x28\x09\x12\x0d\x0a\x05\x56\x61\x6c\x75\x65\x18\x02\x20\x01\x28\x09\x22\xb3\x02\x0a\x08\x54\x4d\x65\x73" + "\x73\x61\x67\x65\x12\x0e\x0a\x06\x44\x6f\x75\x62\x6c\x65\x18\x01\x20\x01\x28\x01\x12\x0d\x0a\x05\x46\x6c\x6f\x61\x74\x18\x02\x20\x01\x28\x02\x12" + "\x0d\x0a\x05\x49\x6e\x74\x36\x34\x18\x03\x20\x01\x28\x03\x12\x0e\x0a\x06\x55\x49\x6e\x74\x36\x34\x18\x04\x20\x01\x28\x04\x12\x0e\x0a\x06\x53\x49" + "\x6e\x74\x36\x34\x18\x05\x20\x01\x28\x12\x12\x0f\x0a\x07\x46\x69\x78\x65\x64\x36\x34\x18\x06\x20\x01\x28\x06\x12\x10\x0a\x08\x53\x46\x69\x78\x65" + "\x64\x36\x34\x18\x07\x20\x01\x28\x10\x12\x0d\x0a\x05\x49\x6e\x74\x33\x32\x18\x08\x20\x01\x28\x05\x12\x0e\x0a\x06\x55\x49\x6e\x74\x33\x32\x18\x09" + "\x20\x01\x28\x0d\x12\x0e\x0a\x06\x53\x49\x6e\x74\x33\x32\x18\x0a\x20\x01\x28\x11\x12\x0f\x0a\x07\x46\x69\x78\x65\x64\x33\x32\x18\x0b\x20\x01\x28" + "\x07\x12\x10\x0a\x08\x53\x46\x69\x78\x65\x64\x33\x32\x18\x0c\x20\x01\x28\x0f\x12\x0c\x0a\x04\x42\x6f\x6f\x6c\x18\x0d\x20\x01\x28\x08\x12\x0e\x0a" + "\x06\x53\x74\x72\x69\x6e\x67\x18\x0e\x20\x01\x28\x09\x12\x0d\x0a\x05\x42\x79\x74\x65\x73\x18\x0f\x20\x01\x28\x0c\x12\x14\x0a\x04\x45\x6e\x75\x6d" + "\x18\x10\x20\x01\x28\x0e\x32\x06\x2e\x45\x45\x6e\x75\x6d\x12\x21\x0a\x07\x4d\x65\x73\x73\x61\x67\x65\x18\x11\x20\x01\x28\x0b\x32\x10\x2e\x54\x45" + "\x6d\x62\x65\x64\x65\x64\x4d\x65\x73\x73\x61\x67\x65\x2a\x24\x0a\x05\x45\x45\x6e\x75\x6d\x12\x07\x0a\x03\x4f\x6e\x65\x10\x01\x12\x07\x0a\x03\x54" + "\x77\x6f\x10\x02\x12\x09\x0a\x05\x54\x68\x72\x65\x65\x10\x03"; + +TString GenerateRandomLenvalString(TFastRng64& rng, ui32 size) +{ + TString result; + result.append(reinterpret_cast<const 
char*>(&size), sizeof(size)); + + size += sizeof(ui32); + + while (result.size() < size) { + ui64 num = rng.GenRand(); + result.append(reinterpret_cast<const char*>(&num), sizeof(num)); + } + if (result.size() > size) { + result.resize(size); + } + return result; +} + +static TProtobufFormatConfigPtr MakeProtobufFormatConfig(const std::vector<const ::google::protobuf::Descriptor*>& descriptorList) +{ + ::google::protobuf::FileDescriptorSet fileDescriptorSet; + THashSet<const ::google::protobuf::FileDescriptor*> files; + + std::function<void(const ::google::protobuf::FileDescriptor*)> addFile; + addFile = [&] (const ::google::protobuf::FileDescriptor* fileDescriptor) { + if (!files.insert(fileDescriptor).second) { + return; + } + + // N.B. We want to write dependencies in fileDescriptorSet in topological order + // so we traverse dependencies first and the add current fileDescriptor. + for (int i = 0; i < fileDescriptor->dependency_count(); ++i) { + addFile(fileDescriptor->dependency(i)); + } + fileDescriptor->CopyTo(fileDescriptorSet.add_file()); + }; + std::vector<TString> typeNames; + + for (const auto* descriptor : descriptorList) { + addFile(descriptor->file()); + typeNames.push_back(descriptor->full_name()); + } + + auto formatConfigYsonString = BuildYsonStringFluently() + .BeginMap() + .Item("file_descriptor_set_text").Value(fileDescriptorSet.ShortDebugString()) + .Item("type_names").Value(typeNames) + .EndMap(); + + return ConvertTo<TProtobufFormatConfigPtr>(formatConfigYsonString); +} + +INodePtr ParseYson(TStringBuf data) +{ + return ConvertToNode(NYson::TYsonString(TString{data})); +} + +TString LenvalBytes(const ::google::protobuf::Message& message) +{ + TStringStream out; + ui32 messageSize = static_cast<ui32>(message.ByteSizeLong()); + out.Write(&messageSize, sizeof(messageSize)); + if (!message.SerializeToArcadiaStream(&out)) { + THROW_ERROR_EXCEPTION("Can not serialize message"); + } + return out.Str(); +} + +void EnsureTypesMatch(EValueType 
expected, EValueType actual) +{ + if (expected != actual) { + THROW_ERROR_EXCEPTION("Mismatching type: expected %Qlv, actual %Qlv", + expected, + actual); + } +} + +double GetDouble(const TUnversionedValue& row) +{ + EnsureTypesMatch(EValueType::Double, row.Type); + return row.Data.Double; +} + +template <typename TMessage> +TCollectingValueConsumer ParseRows( + const TMessage& message, + const TProtobufFormatConfigPtr& config, + const TTableSchemaPtr& schema = New<TTableSchema>(), + int count = 1) +{ + TString lenvalBytes; + TStringOutput out(lenvalBytes); + auto messageSize = static_cast<ui32>(message.ByteSize()); + for (int i = 0; i < count; ++i) { + out.Write(&messageSize, sizeof(messageSize)); + if (!message.SerializeToArcadiaStream(&out)) { + THROW_ERROR_EXCEPTION("Failed to serialize message"); + } + } + + TCollectingValueConsumer rowCollector(schema); + auto parser = CreateParserForProtobuf(&rowCollector, config, 0); + parser->Read(lenvalBytes); + parser->Finish(); + if (static_cast<ssize_t>(rowCollector.Size()) != count) { + THROW_ERROR_EXCEPTION("rowCollector has wrong size: expected %v, actual %v", + count, + rowCollector.Size()); + } + return rowCollector; +} + +template <typename TMessage> +TCollectingValueConsumer ParseRows( + const TMessage& message, + const INodePtr& config, + const TTableSchemaPtr& schema = New<TTableSchema>(), + int count = 1) +{ + return ParseRows(message, ConvertTo<TProtobufFormatConfigPtr>(config->Attributes().ToMap()), schema, count); +} + + +void AddDependencies( + const FileDescriptor* fileDescriptor, + std::vector<const FileDescriptor*>& fileDescriptors, + THashSet<const FileDescriptor*>& fileDescriptorSet) +{ + if (fileDescriptorSet.contains(fileDescriptor)) { + return; + } + fileDescriptorSet.insert(fileDescriptor); + for (int i = 0; i < fileDescriptor->dependency_count(); ++i) { + AddDependencies(fileDescriptor->dependency(i), fileDescriptors, fileDescriptorSet); + } + fileDescriptors.push_back(fileDescriptor); +} + 
+template <typename ... Ts> +INodePtr CreateFileDescriptorConfig(std::optional<EComplexTypeMode> complexTypeMode = {}) +{ + std::vector<const FileDescriptor*> fileDescriptors; + THashSet<const FileDescriptor*> fileDescriptorSet; + std::vector<const FileDescriptor*> originalFileDescriptors = {Ts::descriptor()->file()...}; + + for (auto d : originalFileDescriptors) { + AddDependencies(d, fileDescriptors, fileDescriptorSet); + } + + ::google::protobuf::FileDescriptorSet fileDescriptorSetProto; + for (auto fileDescriptor : fileDescriptors) { + fileDescriptor->CopyTo(fileDescriptorSetProto.add_file()); + } + TString fileDescriptorSetText; + ::google::protobuf::TextFormat::Printer().PrintToString(fileDescriptorSetProto, &fileDescriptorSetText); + std::vector<TString> typeNames = {Ts::descriptor()->full_name()...}; + return BuildYsonNodeFluently() + .BeginAttributes() + .Item("file_descriptor_set_text").Value(fileDescriptorSetText) + .Item("type_names").Value(typeNames) + .OptionalItem("complex_type_mode", complexTypeMode) + .EndAttributes() + .Value("protobuf"); +} + +static const auto EnumerationsConfig = BuildYsonNodeFluently() + .BeginMap() + .Item("EEnum") + .BeginMap() + .Item("One").Value(1) + .Item("Two").Value(2) + .Item("Three").Value(3) + .Item("MinusFortyTwo").Value(-42) + .Item("MaxInt32").Value(std::numeric_limits<int>::max()) + .Item("MinInt32").Value(std::numeric_limits<int>::min()) + .EndMap() + .EndMap(); + +INodePtr CreateAllFieldsConfig(EProtoFormatType protoFormatType) +{ + switch (protoFormatType) { + case EProtoFormatType::FileDescriptor: + return CreateFileDescriptorConfig<TMessage>(); + case EProtoFormatType::FileDescriptorLegacy: + return BuildYsonNodeFluently() + .BeginAttributes() + .Item("file_descriptor_set") + .Value(FileDescriptorLegacy) + .Item("file_indices") + .BeginList() + .Item().Value(0) + .EndList() + .Item("message_indices") + .BeginList() + .Item().Value(1) + .EndList() + .EndAttributes() + .Value("protobuf"); + case 
EProtoFormatType::Structured: + return BuildYsonNodeFluently() + .BeginAttributes() + .Item("enumerations").Value(EnumerationsConfig) + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("Double") + .Item("field_number").Value(1) + .Item("proto_type").Value("double") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("Float") + .Item("field_number").Value(2) + .Item("proto_type").Value("float") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("Int64") + .Item("field_number").Value(3) + .Item("proto_type").Value("int64") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("UInt64") + .Item("field_number").Value(4) + .Item("proto_type").Value("uint64") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("SInt64") + .Item("field_number").Value(5) + .Item("proto_type").Value("sint64") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("Fixed64") + .Item("field_number").Value(6) + .Item("proto_type").Value("fixed64") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("SFixed64") + .Item("field_number").Value(7) + .Item("proto_type").Value("sfixed64") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("Int32") + .Item("field_number").Value(8) + .Item("proto_type").Value("int32") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("UInt32") + .Item("field_number").Value(9) + .Item("proto_type").Value("uint32") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("SInt32") + .Item("field_number").Value(10) + .Item("proto_type").Value("sint32") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("Fixed32") + .Item("field_number").Value(11) + .Item("proto_type").Value("fixed32") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("SFixed32") + .Item("field_number").Value(12) + .Item("proto_type").Value("sfixed32") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("Bool") + 
.Item("field_number").Value(13) + .Item("proto_type").Value("bool") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("String") + .Item("field_number").Value(14) + .Item("proto_type").Value("string") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("Bytes") + .Item("field_number").Value(15) + .Item("proto_type").Value("bytes") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("Enum") + .Item("field_number").Value(16) + .Item("proto_type").Value("enum_string") + .Item("enumeration_name").Value("EEnum") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("Message") + .Item("field_number").Value(17) + .Item("proto_type").Value("message") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("AnyWithMap") + .Item("field_number").Value(18) + .Item("proto_type").Value("any") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("AnyWithInt64") + .Item("field_number").Value(19) + .Item("proto_type").Value("any") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("AnyWithString") + .Item("field_number").Value(20) + .Item("proto_type").Value("any") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("OtherColumns") + .Item("field_number").Value(21) + .Item("proto_type").Value("other_columns") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("MissingInt64") + .Item("field_number").Value(22) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndAttributes() + .Value("protobuf"); + } + Y_ABORT(); +} + +//////////////////////////////////////////////////////////////////////////////// + +struct TLenvalEntry +{ + TString RowData; + ui32 TableIndex; + ui64 TabletIndex; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TLenvalParser +{ +public: + explicit TLenvalParser(IInputStream* input) + : Input_(input) + { } + + explicit TLenvalParser(TStringBuf input) + : StreamHolder_(std::make_unique<TMemoryInput>(input)) + , 
Input_(StreamHolder_.get()) + { } + + std::optional<TLenvalEntry> Next() + { + ui32 rowSize; + size_t read = Input_->Load(&rowSize, sizeof(rowSize)); + if (read == 0) { + return std::nullopt; + } else if (read < sizeof(rowSize)) { + THROW_ERROR_EXCEPTION("corrupted lenval: can't read row length"); + } + switch (rowSize) { + case LenvalTableIndexMarker: { + ui32 tableIndex; + read = Input_->Load(&tableIndex, sizeof(tableIndex)); + if (read != sizeof(tableIndex)) { + THROW_ERROR_EXCEPTION("corrupted lenval: can't read table index"); + } + CurrentTableIndex_ = tableIndex; + return Next(); + } + case LenvalTabletIndexMarker: { + ui64 tabletIndex; + read = Input_->Load(&tabletIndex, sizeof(tabletIndex)); + if (read != sizeof(tabletIndex)) { + THROW_ERROR_EXCEPTION("corrupted lenval: can't read tablet index"); + } + CurrentTabletIndex_ = tabletIndex; + return Next(); + } + case LenvalEndOfStream: + EndOfStream_ = true; + return std::nullopt; + case LenvalKeySwitch: + case LenvalRangeIndexMarker: + case LenvalRowIndexMarker: + THROW_ERROR_EXCEPTION("marker is unsupported"); + default: { + TLenvalEntry result; + result.RowData.resize(rowSize); + result.TableIndex = CurrentTableIndex_; + result.TabletIndex = CurrentTabletIndex_; + Input_->Load(result.RowData.Detach(), rowSize); + + return result; + } + } + } + + bool IsEndOfStream() const + { + return EndOfStream_; + } + +private: + std::unique_ptr<IInputStream> StreamHolder_; + IInputStream* Input_; + ui32 CurrentTableIndex_ = 0; + ui64 CurrentTabletIndex_ = 0; + bool EndOfStream_ = false; +}; + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +TProtobufFormatConfigPtr ParseAndValidateConfig(const INodePtr& node, std::vector<TTableSchemaPtr> schemas = {}) +{ + auto config = ConvertTo<TProtobufFormatConfigPtr>(node); + if (schemas.empty()) { + schemas.assign(config->Tables.size(), New<TTableSchema>()); + } + New<TProtobufParserFormatDescription>()->Init(config, schemas); + 
New<TProtobufWriterFormatDescription>()->Init(config, schemas); + return config; +} + +} // namespace + +INodePtr BuildEmbeddedConfig(EComplexTypeMode complexTypeMode, EProtoFormatType formatType) { + if (formatType == EProtoFormatType::FileDescriptor) { + return CreateFileDescriptorConfig<NYT::TEmbeddingMessage>(complexTypeMode); + } + + auto config = BuildYsonNodeFluently() + .BeginAttributes() + .Item("tables").BeginList() + .Item().BeginMap() + .Item("columns").BeginList() + .Item().BeginMap() + .Item("name").Value("*") + .Item("field_number").Value(2) + .Item("proto_type").Value("embedded_message") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("other_columns_field") + .Item("field_number").Value(15) + .Item("proto_type").Value("other_columns") + .EndMap() + .Item().BeginMap() + .Item("name").Value("embedded_num") + .Item("field_number").Value(10) + .Item("proto_type").Value("uint64") + .EndMap() + .Item().BeginMap() + .Item("name").Value("embedded_extra_field") + .Item("field_number").Value(11) + .Item("proto_type").Value("string") + .EndMap() + .Item().BeginMap() + .Item("name").Value("variant") + .Item("proto_type").Value("oneof") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("str_variant") + .Item("field_number").Value(101) + .Item("proto_type").Value("string") + .EndMap() + .Item().BeginMap() + .Item("name").Value("uint_variant") + .Item("field_number").Value(102) + .Item("proto_type").Value("uint64") + .EndMap() + .EndList() + .EndMap() + .Item().BeginMap() + .Item("name").Value("*") + .Item("field_number").Value(1) + .Item("proto_type").Value("embedded_message") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("embedded2_num") + .Item("field_number").Value(10) + .Item("proto_type").Value("uint64") + .EndMap() + .Item().BeginMap() + .Item("name").Value("embedded2_struct") + .Item("field_number").Value(17) + .Item("proto_type").Value("structured_message") + 
.Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("float1") + .Item("field_number").Value(1) + .Item("proto_type").Value("float") + .EndMap() + .Item().BeginMap() + .Item("name").Value("string1") + .Item("field_number").Value(2) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .Item().BeginMap() + .Item("name").Value("embedded2_repeated") + .Item("field_number").Value(42) + .Item("proto_type").Value("string") + .Item("repeated").Value(true) + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .Item().BeginMap() + .Item("name").Value("num") + .Item("field_number").Value(12) + .Item("proto_type").Value("uint64") + .EndMap() + .Item().BeginMap() + .Item("name").Value("extra_field") + .Item("field_number").Value(13) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .EndList() + .Item("complex_type_mode").Value(complexTypeMode) + .EndAttributes() + .Value("protobuf"); + return config; +} + +TTableSchemaPtr BuildEmbeddedSchema() +{ + auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ + {"num", SimpleLogicalType(ESimpleLogicalValueType::Uint64)}, + {"embedded_num", SimpleLogicalType(ESimpleLogicalValueType::Uint64)}, + {"variant", VariantStructLogicalType({ + {"str_variant", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"uint_variant", SimpleLogicalType(ESimpleLogicalValueType::Uint64)}, + })}, + {"extra_column", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Uint64))}, + {"embedded2_num", SimpleLogicalType(ESimpleLogicalValueType::Uint64)}, + {"embedded2_struct", StructLogicalType({ + {"float1", SimpleLogicalType(ESimpleLogicalValueType::Float)}, + {"string1", SimpleLogicalType(ESimpleLogicalValueType::String)}, + })}, + {"embedded2_repeated", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + {"other_complex_field", StructLogicalType({ + {"one", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"two", 
OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"three", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + })}, + {"extra_int", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + + }); + return schema; +} + +TEST(TProtobufFormat, TestConfigParsingEmbedded) { + auto config = BuildEmbeddedConfig(EComplexTypeMode::Positional, EProtoFormatType::Structured); + auto schema = BuildEmbeddedSchema(); + + EXPECT_NO_THROW( + ParseAndValidateConfig(config->Attributes().ToMap(), {schema})); +} + +TEST(TProtobufFormat, TestConfigParsing) +{ + // Empty config. + EXPECT_THROW_WITH_SUBSTRING( + ParseAndValidateConfig(ParseYson("{}")), + "one of \"tables\", \"file_descriptor_set\" and \"file_descriptor_set_text\" must be specified"); + + // Broken protobuf. + EXPECT_THROW_WITH_SUBSTRING( + ParseAndValidateConfig(ParseYson(R"({file_descriptor_set="dfgxx"; file_indices=[0]; message_indices=[0]})")), + "Error parsing \"file_descriptor_set\" in protobuf config"); + + EXPECT_NO_THROW(ParseAndValidateConfig( + CreateAllFieldsConfig(EProtoFormatType::Structured)->Attributes().ToMap())); + + EXPECT_NO_THROW(ParseAndValidateConfig( + CreateAllFieldsConfig(EProtoFormatType::FileDescriptorLegacy)->Attributes().ToMap())); + + EXPECT_NO_THROW(ParseAndValidateConfig( + CreateAllFieldsConfig(EProtoFormatType::FileDescriptor)->Attributes().ToMap())); + + auto embeddedInsideNonembeddedConfig = BuildYsonNodeFluently() + .BeginMap() + .Item("tables").BeginList() + .Item().BeginMap() + .Item("columns").BeginList() + .Item().BeginMap() + .Item("name").Value("embedded_message1") + .Item("field_number").Value(1) + .Item("proto_type").Value("embedded_message") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("field1") + .Item("field_number").Value(2) + .Item("proto_type").Value("structured_message") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("embedded_message2") + .Item("field_number").Value(3) + 
.Item("proto_type").Value("embedded_message") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("field2") + .Item("field_number").Value(4) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + auto schemaForEmbedded = New<TTableSchema>(std::vector{ + TColumnSchema("field1", StructLogicalType({ + {"embedded_message2", StructLogicalType({ + {"field2", SimpleLogicalType(ESimpleLogicalValueType::String)}, + })}, + })) + }); + + EXPECT_THROW_WITH_SUBSTRING( + ParseAndValidateConfig(embeddedInsideNonembeddedConfig, {schemaForEmbedded}), + "embedded_message inside of structured_message is not allowed"); + + auto repeatedEmbeddedConfig = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("*") + .Item("field_number").Value(1) + .Item("proto_type").Value("embedded_message") + .Item("repeated").Value(true) + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("field1") + .Item("field_number").Value(1) + .Item("proto_type").Value("uint64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + EXPECT_THROW_WITH_SUBSTRING( + ParseAndValidateConfig(repeatedEmbeddedConfig), + R"(type "embedded_message" can not be repeated)"); + + auto multipleOtherColumnsConfig = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("Other1") + .Item("field_number").Value(1) + .Item("proto_type").Value("other_columns") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("Other2") + .Item("field_number").Value(2) + .Item("proto_type").Value("other_columns") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + 
EXPECT_THROW_WITH_SUBSTRING( + ParseAndValidateConfig(multipleOtherColumnsConfig), + "Multiple \"other_columns\" in protobuf config are not allowed"); + + auto duplicateColumnNamesConfig = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("SomeColumn") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("SomeColumn") + .Item("field_number").Value(2) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + EXPECT_THROW_WITH_SUBSTRING( + ParseAndValidateConfig(duplicateColumnNamesConfig), + "Multiple fields with same column name \"SomeColumn\" are forbidden in protobuf format"); + + auto anyCorrespondsToStruct = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("SomeColumn") + .Item("field_number").Value(1) + .Item("proto_type").Value("any") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + auto schema = New<TTableSchema>(std::vector{ + TColumnSchema("SomeColumn", StructLogicalType({})), + }); + + EXPECT_THROW_WITH_SUBSTRING( + ParseAndValidateConfig(anyCorrespondsToStruct, {schema}), + "Table schema and protobuf format config mismatch"); + + auto configWithBytes = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("SomeColumn") + .Item("field_number").Value(1) + .Item("proto_type").Value("bytes") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + auto schemaWithUtf8 = New<TTableSchema>(std::vector{ + TColumnSchema("SomeColumn", SimpleLogicalType(ESimpleLogicalValueType::Utf8)), + }); + + EXPECT_THROW_WITH_SUBSTRING( + 
ParseAndValidateConfig(configWithBytes, {schemaWithUtf8}), + "mismatch: expected logical type to be one of"); + + auto configWithPackedNonRepeated = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("SomeColumn") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .Item("packed").Value(true) + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + auto schemaWithInt64List = New<TTableSchema>(std::vector<TColumnSchema>{ + {"SomeColumn", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + }); + EXPECT_THROW_WITH_SUBSTRING( + ParseAndValidateConfig(configWithPackedNonRepeated, {schemaWithInt64List}), + "Field \"SomeColumn\" is marked \"packed\" but is not marked \"repeated\""); + + auto configWithPackedRepeatedString = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("SomeColumn") + .Item("field_number").Value(1) + .Item("proto_type").Value("string") + .Item("packed").Value(true) + .Item("repeated").Value(true) + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + auto schemaWithStringList = New<TTableSchema>(std::vector{ + TColumnSchema("SomeColumn", ListLogicalType( + SimpleLogicalType(ESimpleLogicalValueType::String))) + }); + + EXPECT_THROW_WITH_SUBSTRING( + ParseAndValidateConfig(configWithPackedRepeatedString, {schemaWithStringList}), + "packed protobuf field must have primitive numeric type, got \"string\""); + + auto configWithMissingFieldNumber = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("SomeColumn") + .Item("proto_type").Value("string") + .Item("repeated").Value(true) + .EndMap() + .EndList() + .EndMap() + .EndList() + 
.EndMap(); + + EXPECT_THROW_WITH_SUBSTRING( + ParseAndValidateConfig(configWithMissingFieldNumber, {schemaWithStringList}), + "\"field_number\" is required"); +} + +TEST(TProtobufFormat, TestParseBigZigZag) +{ + constexpr i32 value = Min<i32>(); + TMessage message; + message.set_int32_field(value); + auto config = ConvertTo<TProtobufFormatConfigPtr>(CreateAllFieldsConfig(EProtoFormatType::Structured)->Attributes().ToMap()); + auto rowCollector = ParseRows(message, config); + EXPECT_EQ(GetInt64(rowCollector.GetRowValue(0, "Int32")), value); +} + +TEST(TProtobufFormat, TestParseEnumerationString) +{ + auto config = ConvertTo<TProtobufFormatConfigPtr>(CreateAllFieldsConfig(EProtoFormatType::Structured)->Attributes().ToMap()); + { + TMessage message; + message.set_enum_field(EEnum::One); + auto rowCollector = ParseRows(message, config); + EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "Enum")), "One"); + } + { + TMessage message; + message.set_enum_field(EEnum::Two); + auto rowCollector = ParseRows(message, config); + EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "Enum")), "Two"); + } + { + TMessage message; + message.set_enum_field(EEnum::Three); + auto rowCollector = ParseRows(message, config); + EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "Enum")), "Three"); + } + { + TMessage message; + message.set_enum_field(EEnum::MinusFortyTwo); + auto rowCollector = ParseRows(message, config); + EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "Enum")), "MinusFortyTwo"); + } +} + +TEST(TProtobufFormat, TestParseWrongEnumeration) +{ + auto config = ConvertTo<TProtobufFormatConfigPtr>(CreateAllFieldsConfig(EProtoFormatType::Structured)->Attributes().ToMap()); + TMessage message; + auto enumTag = TMessage::descriptor()->FindFieldByName("enum_field")->number(); + message.mutable_unknown_fields()->AddVarint(enumTag, 30); + EXPECT_ANY_THROW(ParseRows(message, config)); +} + +TEST(TProtobufFormat, TestParseEnumerationInt) +{ + TCollectingValueConsumer rowCollector; + + auto 
config = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("Enum") + .Item("field_number").Value(16) + .Item("proto_type").Value("enum_int") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + auto parser = CreateParserForProtobuf(&rowCollector, ConvertTo<TProtobufFormatConfigPtr>(config), 0); + + { + TMessage message; + message.set_enum_field(EEnum::One); + parser->Read(LenvalBytes(message)); + } + { + TMessage message; + message.set_enum_field(EEnum::Two); + parser->Read(LenvalBytes(message)); + } + { + TMessage message; + message.set_enum_field(EEnum::Three); + parser->Read(LenvalBytes(message)); + } + { + TMessage message; + message.set_enum_field(EEnum::MinusFortyTwo); + parser->Read(LenvalBytes(message)); + } + { + TMessage message; + auto enumTag = TMessage::descriptor()->FindFieldByName("enum_field")->number(); + message.mutable_unknown_fields()->AddVarint(enumTag, 100500); + parser->Read(LenvalBytes(message)); + } + + parser->Finish(); + + EXPECT_EQ(GetInt64(rowCollector.GetRowValue(0, "Enum")), 1); + EXPECT_EQ(GetInt64(rowCollector.GetRowValue(1, "Enum")), 2); + EXPECT_EQ(GetInt64(rowCollector.GetRowValue(2, "Enum")), 3); + EXPECT_EQ(GetInt64(rowCollector.GetRowValue(3, "Enum")), -42); + EXPECT_EQ(GetInt64(rowCollector.GetRowValue(4, "Enum")), 100500); +} + +TEST(TProtobufFormat, TestParseRandomGarbage) +{ + // Check that we never crash. + + TFastRng64 rng(42); + for (int i = 0; i != 1000; ++i) { + auto bytes = GenerateRandomLenvalString(rng, 8); + + TCollectingValueConsumer rowCollector; + auto parser = CreateParserForProtobuf( + &rowCollector, + ConvertTo<TProtobufFormatConfigPtr>(CreateAllFieldsConfig(EProtoFormatType::Structured)->Attributes().ToMap()), + 0); + try { + parser->Read(bytes); + parser->Finish(); + } catch (...) 
{ + } + } +} + +TEST(TProtobufFormat, TestParseZeroColumns) +{ + auto config = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + TCollectingValueConsumer rowCollector; + auto parser = CreateParserForProtobuf( + &rowCollector, + ConvertTo<TProtobufFormatConfigPtr>(config), + 0); + + // Empty lenval values. + parser->Read("\0\0\0\0"sv); + parser->Read("\0\0\0\0"sv); + + parser->Finish(); + + ASSERT_EQ(static_cast<ssize_t>(rowCollector.Size()), 2); + EXPECT_EQ(static_cast<int>(rowCollector.GetRow(0).GetCount()), 0); + EXPECT_EQ(static_cast<int>(rowCollector.GetRow(1).GetCount()), 0); +} + +TEST(TProtobufFormat, TestWriteEnumerationString) +{ + auto config = CreateAllFieldsConfig(EProtoFormatType::Structured); + + auto nameTable = New<TNameTable>(); + + TString result; + TStringOutput resultStream(result); + auto writer = CreateWriterForProtobuf( + config->Attributes(), + {New<TTableSchema>()}, + nameTable, + CreateAsyncAdapter(&resultStream), + true, + New<TControlAttributesConfig>(), + 0); + + EXPECT_EQ(true, writer->Write({ + MakeRow(nameTable, { + {"Enum", "MinusFortyTwo"} + }).Get() + })); + EXPECT_EQ(true, writer->Write({ + MakeRow(nameTable, { + {"Enum", "Three"}, + }).Get() + })); + + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput si(result); + TLenvalParser parser(&si); + { + auto row = parser.Next(); + ASSERT_TRUE(row); + NYT::TMessage message; + ASSERT_TRUE(message.ParseFromString(row->RowData)); + ASSERT_EQ(message.enum_field(), NYT::EEnum::MinusFortyTwo); + } + { + auto row = parser.Next(); + ASSERT_TRUE(row); + NYT::TMessage message; + ASSERT_TRUE(message.ParseFromString(row->RowData)); + ASSERT_EQ(message.enum_field(), NYT::EEnum::Three); + } + { + auto row = parser.Next(); + ASSERT_FALSE(row); + } +} + +TEST(TProtobufFormat, TestWriteEnumerationInt) +{ + auto config = BuildYsonNodeFluently() + 
.BeginAttributes() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("Enum") + .Item("field_number").Value(16) + .Item("proto_type").Value("enum_int") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndAttributes() + .Value("protobuf"); + + auto nameTable = New<TNameTable>(); + + auto writeAndParseRow = [&] (TUnversionedRow row, TMessage* message) { + TString result; + TStringOutput resultStream(result); + auto writer = CreateWriterForProtobuf( + config->Attributes(), + {New<TTableSchema>()}, + nameTable, + CreateAsyncAdapter(&resultStream), + true, + New<TControlAttributesConfig>(), + 0); + Y_UNUSED(writer->Write({row})); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput si(result); + TLenvalParser parser(&si); + auto protoRow = parser.Next(); + ASSERT_TRUE(protoRow); + + ASSERT_TRUE(message->ParseFromString(protoRow->RowData)); + + auto nextProtoRow = parser.Next(); + ASSERT_FALSE(nextProtoRow); + }; + + { + TMessage message; + writeAndParseRow( + MakeRow(nameTable, { + {"Enum", -42}, + }).Get(), + &message); + ASSERT_EQ(message.enum_field(), EEnum::MinusFortyTwo); + } + { + TMessage message; + writeAndParseRow( + MakeRow(nameTable, { + {"Enum", static_cast<ui64>(std::numeric_limits<i32>::max())}, + }).Get(), + &message); + ASSERT_EQ(message.enum_field(), EEnum::MaxInt32); + } + { + TMessage message; + writeAndParseRow( + MakeRow(nameTable, { + {"Enum", std::numeric_limits<i32>::max()}, + }).Get(), + &message); + ASSERT_EQ(message.enum_field(), EEnum::MaxInt32); + } + { + TMessage message; + writeAndParseRow( + MakeRow(nameTable, { + {"Enum", std::numeric_limits<i32>::min()}, + }).Get(), + &message); + ASSERT_EQ(message.enum_field(), EEnum::MinInt32); + } + + TMessage message; + ASSERT_THROW( + writeAndParseRow( + MakeRow(nameTable, { + {"Enum", static_cast<i64>(std::numeric_limits<i32>::max()) + 1}, + }).Get(), + &message), + TErrorException); + + 
ASSERT_THROW( + writeAndParseRow( + MakeRow(nameTable, { + {"Enum", static_cast<i64>(std::numeric_limits<i32>::min()) - 1}, + }).Get(), + &message), + TErrorException); + + ASSERT_THROW( + writeAndParseRow( + MakeRow(nameTable, { + {"Enum", static_cast<ui64>(std::numeric_limits<i32>::max()) + 1}, + }).Get(), + &message), + TErrorException); +} + + +TEST(TProtobufFormat, TestWriteZeroColumns) +{ + auto config = BuildYsonNodeFluently() + .BeginAttributes() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .EndList() + .EndMap() + .EndList() + .EndAttributes() + .Value("protobuf"); + + auto nameTable = New<TNameTable>(); + + TString result; + TStringOutput resultStream(result); + auto writer = CreateWriterForProtobuf( + config->Attributes(), + {New<TTableSchema>()}, + nameTable, + CreateAsyncAdapter(&resultStream), + true, + New<TControlAttributesConfig>(), + 0); + + EXPECT_EQ(true, writer->Write({ + MakeRow(nameTable, { + {"Int64", -1}, + {"String", "this_is_string"}, + }).Get() + })); + EXPECT_EQ(true, writer->Write({MakeRow(nameTable, { }).Get()})); + + writer->Close() + .Get() + .ThrowOnError(); + + ASSERT_EQ(result, "\0\0\0\0\0\0\0\0"sv); +} + +TEST(TProtobufFormat, TestTabletIndex) +{ + auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("int64_field") + .Item("field_number").Value(3) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap()); + + auto nameTable = New<TNameTable>(); + + TString result; + TStringOutput resultStream(result); + auto controlAttributesConfig = New<TControlAttributesConfig>(); + controlAttributesConfig->EnableTabletIndex = true; + + auto writer = CreateWriterForProtobuf( + config, + {New<TTableSchema>()}, + nameTable, + CreateAsyncAdapter(&resultStream), + true, + 
controlAttributesConfig, + 0); + + EXPECT_EQ(true, writer->Write({ + MakeRow(nameTable, { + {TString(TabletIndexColumnName), 1LL << 50}, + {"int64_field", -2345}, + }).Get(), + MakeRow(nameTable, { + {TString(TabletIndexColumnName), 12}, + {"int64_field", 2345}, + }).Get(), + })); + + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput si(result); + TLenvalParser parser(&si); + { + auto row = parser.Next(); + ASSERT_TRUE(row); + ASSERT_EQ(row->TabletIndex, 1ULL << 50); + NYT::TMessage message; + ASSERT_TRUE(message.ParseFromString(row->RowData)); + ASSERT_EQ(message.int64_field(), -2345); + } + { + auto row = parser.Next(); + ASSERT_TRUE(row); + ASSERT_EQ(static_cast<int>(row->TabletIndex), 12); + NYT::TMessage message; + ASSERT_TRUE(message.ParseFromString(row->RowData)); + ASSERT_EQ(message.int64_field(), 2345); + } + { + auto row = parser.Next(); + ASSERT_FALSE(row); + } +} + +TEST(TProtobufFormat, TestContext) +{ + auto config = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + TCollectingValueConsumer rowCollector; + auto parser = CreateParserForProtobuf( + &rowCollector, + ConvertTo<TProtobufFormatConfigPtr>(config), + 0); + + TString context; + try { + TMessage message; + message.set_string_field("PYSHCH-PYSHCH"); + parser->Read(LenvalBytes(message)); + parser->Finish(); + GTEST_FATAL_FAILURE_("expected to throw"); + } catch (const NYT::TErrorException& e) { + context = *e.Error().Attributes().Find<TString>("context"); + } + ASSERT_NE(context.find("PYSHCH-PYSHCH"), TString::npos); +} + +//////////////////////////////////////////////////////////////////////////////// + +TTableSchemaPtr CreateSchemaWithStructuredMessage() +{ + auto keyValueStruct = StructLogicalType({ + {"key", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + {"value", 
OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + }); + + return New<TTableSchema>(std::vector<TColumnSchema>{ + {"first", StructLogicalType({ + {"field_missing_from_proto1", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int32))}, + {"enum_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"int64_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"repeated_int64_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"another_repeated_int64_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"message_field", keyValueStruct}, + {"repeated_message_field", ListLogicalType(keyValueStruct)}, + {"any_int64_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"any_map_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Any))}, + {"optional_int64_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"repeated_optional_any_field", ListLogicalType(OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Any)))}, + {"packed_repeated_enum_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + {"optional_repeated_bool_field", OptionalLogicalType(ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Boolean)))}, + {"oneof_field", VariantStructLogicalType({ + {"oneof_string_field_1", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"oneof_string_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"oneof_message_field", keyValueStruct}, + })}, + {"optional_oneof_field", OptionalLogicalType(VariantStructLogicalType({ + {"oneof_string_field_1", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"oneof_string_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"oneof_message_field", keyValueStruct}, + }))}, + {"map_field", DictLogicalType( + SimpleLogicalType(ESimpleLogicalValueType::Int64), + 
OptionalLogicalType(keyValueStruct)) + }, + {"field_missing_from_proto2", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int32))}, + })}, + {"repeated_int64_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"another_repeated_int64_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"repeated_message_field", ListLogicalType(keyValueStruct)}, + {"second", StructLogicalType({ + {"one", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"two", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"three", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + })}, + {"any_field", SimpleLogicalType(ESimpleLogicalValueType::Any)}, + + {"int64_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"uint64_field", SimpleLogicalType(ESimpleLogicalValueType::Uint64)}, + {"int32_field", SimpleLogicalType(ESimpleLogicalValueType::Int32)}, + {"uint32_field", SimpleLogicalType(ESimpleLogicalValueType::Uint32)}, + + {"enum_int_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"enum_string_string_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"enum_string_int64_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + + {"repeated_optional_any_field", ListLogicalType(OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Any)))}, + + {"other_complex_field", StructLogicalType({ + {"one", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"two", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"three", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + })}, + + {"utf8_field", SimpleLogicalType(ESimpleLogicalValueType::Utf8)}, + + {"packed_repeated_int64_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + + {"optional_repeated_int64_field", 
OptionalLogicalType(ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64)))}, + + {"oneof_field", VariantStructLogicalType({ + {"oneof_string_field_1", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"oneof_string_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"oneof_message_field", keyValueStruct}, + })}, + + {"optional_oneof_field", OptionalLogicalType(VariantStructLogicalType({ + {"oneof_string_field_1", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"oneof_string_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"oneof_message_field", keyValueStruct}, + }))}, + + {"map_field", DictLogicalType( + SimpleLogicalType(ESimpleLogicalValueType::Int64), + OptionalLogicalType(keyValueStruct)) + }, + }); +} + +INodePtr CreateConfigWithStructuredMessage(EComplexTypeMode complexTypeMode, EProtoFormatType formatType) +{ + if (formatType == EProtoFormatType::FileDescriptor) { + return CreateFileDescriptorConfig<TMessageWithStructuredEmbedded>(complexTypeMode); + } + YT_VERIFY(formatType == EProtoFormatType::Structured); + + auto buildOneofConfig = [] (TString prefix, int fieldNumberOffset) { + return BuildYsonNodeFluently() + .BeginMap() + .Item("name").Value(prefix + "oneof_field") + .Item("proto_type").Value("oneof") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value(prefix + "oneof_string_field_1") + .Item("field_number").Value(101 + fieldNumberOffset) + .Item("proto_type").Value("string") + .EndMap() + .Item().BeginMap() + .Item("name").Value(prefix + "oneof_string_field") + .Item("field_number").Value(102 + fieldNumberOffset) + .Item("proto_type").Value("string") + .EndMap() + .Item().BeginMap() + .Item("name").Value(prefix + "oneof_message_field") + .Item("field_number").Value(1000 + fieldNumberOffset) + .Item("proto_type").Value("structured_message") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("key") + .Item("field_number").Value(1) + 
.Item("proto_type").Value("string") + .EndMap() + .Item().BeginMap() + .Item("name").Value("value") + .Item("field_number").Value(2) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + }; + auto oneofConfig = buildOneofConfig("", 0); + auto optionalOneofConfig = buildOneofConfig("optional_", 1000); + + auto keyValueFields = BuildYsonStringFluently() + .BeginList() + .Item().BeginMap() + .Item("name").Value("key") + .Item("field_number").Value(1) + .Item("proto_type").Value("string") + .EndMap() + .Item().BeginMap() + .Item("name").Value("value") + .Item("field_number").Value(2) + .Item("proto_type").Value("string") + .EndMap() + .EndList(); + + return BuildYsonNodeFluently() + .BeginAttributes() + .Item("enumerations").Value(EnumerationsConfig) + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("first") + .Item("field_number").Value(1) + .Item("proto_type").Value("structured_message") + .Item("fields") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("int64_field") + .Item("field_number").Value(2) + .Item("proto_type").Value("int64") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("enum_field") + .Item("field_number").Value(1) + .Item("proto_type").Value("enum_string") + .Item("enumeration_name").Value("EEnum") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("packed_repeated_enum_field") + .Item("field_number").Value(11) + .Item("proto_type").Value("enum_string") + .Item("enumeration_name").Value("EEnum") + .Item("repeated").Value(true) + .Item("packed").Value(true) + .EndMap() + .Item().BeginMap() + .Item("name").Value("message_field") + .Item("field_number").Value(4) + .Item("proto_type").Value("structured_message") + .Item("fields").Value(keyValueFields) + .EndMap() + .Item().BeginMap() + .Item("name").Value("repeated_int64_field") + .Item("field_number").Value(3) + 
.Item("proto_type").Value("int64") + .Item("repeated").Value(true) + .EndMap() + .Item().BeginMap() + .Item("name").Value("another_repeated_int64_field") + .Item("field_number").Value(9) + .Item("proto_type").Value("int64") + .Item("repeated").Value(true) + .EndMap() + .Item().BeginMap() + .Item("name").Value("repeated_message_field") + .Item("field_number").Value(5) + .Item("proto_type").Value("structured_message") + .Item("repeated").Value(true) + .Item("fields").Value(keyValueFields) + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("any_int64_field") + .Item("field_number").Value(6) + .Item("proto_type").Value("any") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("any_map_field") + .Item("field_number").Value(7) + .Item("proto_type").Value("any") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("optional_int64_field") + .Item("field_number").Value(8) + .Item("proto_type").Value("int64") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("repeated_optional_any_field") + .Item("field_number").Value(10) + .Item("proto_type").Value("any") + .Item("repeated").Value(true) + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("optional_repeated_bool_field") + .Item("field_number").Value(12) + .Item("proto_type").Value("bool") + .Item("repeated").Value(true) + .EndMap() + .Item().Value(oneofConfig) + .Item().Value(optionalOneofConfig) + .Item() + .BeginMap() + .Item("name").Value("map_field") + .Item("field_number").Value(13) + .Item("proto_type").Value("structured_message") + .Item("repeated").Value(true) + .Item("fields") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("key") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("value") + .Item("field_number").Value(2) + .Item("proto_type").Value("structured_message") + .Item("fields").Value(keyValueFields) + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .Item() + .BeginMap() 
+ .Item("name").Value("second") + .Item("field_number").Value(2) + .Item("proto_type").Value("structured_message") + .Item("fields") + .BeginList() + .Item().BeginMap() + .Item("name").Value("one") + .Item("field_number").Value(2) + .Item("proto_type").Value("int64") + .EndMap() + .Item().BeginMap() + .Item("name").Value("two") + .Item("field_number").Value(500000000) + .Item("proto_type").Value("int64") + .EndMap() + .Item().BeginMap() + .Item("name").Value("three") + .Item("field_number").Value(100500) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("repeated_message_field") + .Item("field_number").Value(3) + .Item("proto_type").Value("structured_message") + .Item("repeated").Value(true) + .Item("fields") + .BeginList() + .Item().BeginMap() + .Item("name").Value("key") + .Item("field_number").Value(1) + .Item("proto_type").Value("string") + .EndMap() + .Item().BeginMap() + .Item("name").Value("value") + .Item("field_number").Value(2) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("repeated_int64_field") + .Item("field_number").Value(4) + .Item("proto_type").Value("int64") + .Item("repeated").Value(true) + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("another_repeated_int64_field") + .Item("field_number").Value(13) + .Item("proto_type").Value("int64") + .Item("repeated").Value(true) + .EndMap() + .Item() + .BeginMap() + // In schema it is of type "any". + .Item("name").Value("any_field") + .Item("field_number").Value(5) + .Item("proto_type").Value("int64") + .EndMap() + // The next fields are for type casting testing + .Item() + .BeginMap() + // In schema it is of type "int64". + .Item("name").Value("int64_field") + .Item("field_number").Value(6) + .Item("proto_type").Value("int32") + .EndMap() + .Item() + .BeginMap() + // In schema it is of type "uint64". 
+ .Item("name").Value("uint64_field") + .Item("field_number").Value(7) + .Item("proto_type").Value("uint32") + .EndMap() + .Item() + .BeginMap() + // In schema it is of type "int32". + .Item("name").Value("int32_field") + .Item("field_number").Value(8) + .Item("proto_type").Value("int64") + .EndMap() + .Item() + .BeginMap() + // In schema it is of type "uint32". + .Item("name").Value("uint32_field") + .Item("field_number").Value(9) + .Item("proto_type").Value("uint64") + .EndMap() + + // Enums. + .Item() + .BeginMap() + .Item("name").Value("enum_int_field") + .Item("field_number").Value(10) + .Item("proto_type").Value("enum_int") + .Item("enumeration_name").Value("EEnum") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("enum_string_string_field") + .Item("field_number").Value(11) + .Item("proto_type").Value("enum_string") + .Item("enumeration_name").Value("EEnum") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("enum_string_int64_field") + .Item("field_number").Value(12) + .Item("proto_type").Value("enum_string") + .Item("enumeration_name").Value("EEnum") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("utf8_field") + .Item("field_number").Value(16) + .Item("proto_type").Value("string") + .EndMap() + + // list<optional<any>>. + .Item() + .BeginMap() + .Item("name").Value("repeated_optional_any_field") + .Item("field_number").Value(14) + .Item("proto_type").Value("any") + .Item("repeated").Value(true) + .EndMap() + + // Other columns. 
+ .Item() + .BeginMap() + .Item("name").Value("other_columns_field") + .Item("field_number").Value(15) + .Item("proto_type").Value("other_columns") + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("packed_repeated_int64_field") + .Item("field_number").Value(17) + .Item("proto_type").Value("int64") + .Item("repeated").Value(true) + .Item("packed").Value(true) + .EndMap() + + .Item() + .BeginMap() + .Item("name").Value("optional_repeated_int64_field") + .Item("field_number").Value(18) + .Item("proto_type").Value("int64") + .Item("repeated").Value(true) + .EndMap() + + .Item().Value(oneofConfig) + .Item().Value(optionalOneofConfig) + + .Item() + .BeginMap() + .Item("name").Value("map_field") + .Item("field_number").Value(19) + .Item("proto_type").Value("structured_message") + .Item("repeated").Value(true) + .Item("fields") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("key") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("value") + .Item("field_number").Value(2) + .Item("proto_type").Value("structured_message") + .Item("fields").Value(keyValueFields) + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .EndList() + .Item("complex_type_mode").Value(complexTypeMode) + .EndAttributes() + .Value("protobuf"); +} + +using TProtobufFormatStructuredMessageParameter = std::tuple<EComplexTypeMode, int, EProtoFormatType>; + +class TProtobufFormatStructuredMessage + : public ::testing::TestWithParam<TProtobufFormatStructuredMessageParameter> +{ }; + +INSTANTIATE_TEST_SUITE_P( + FileDescriptor, + TProtobufFormatStructuredMessage, + ::testing::Values(TProtobufFormatStructuredMessageParameter{ + EComplexTypeMode::Positional, + 1, + EProtoFormatType::FileDescriptor})); + +INSTANTIATE_TEST_SUITE_P( + Positional, + TProtobufFormatStructuredMessage, + ::testing::Values(TProtobufFormatStructuredMessageParameter{ + EComplexTypeMode::Positional, + 1, + 
EProtoFormatType::Structured})); + +INSTANTIATE_TEST_SUITE_P( + Named, + TProtobufFormatStructuredMessage, + ::testing::Values(TProtobufFormatStructuredMessageParameter{ + EComplexTypeMode::Named, + 1, + EProtoFormatType::Structured})); + +INSTANTIATE_TEST_SUITE_P( + ManyRows, + TProtobufFormatStructuredMessage, + ::testing::Values(TProtobufFormatStructuredMessageParameter{ + EComplexTypeMode::Named, + 30000, + EProtoFormatType::Structured})); + +TEST_P(TProtobufFormatStructuredMessage, EmbeddedWrite) +{ + auto [complexTypeMode, rowCount, protoFormatType] = GetParam(); + + auto nameTable = New<TNameTable>(); + auto numId = nameTable->RegisterName("num"); + auto embeddedNumId = nameTable->RegisterName("embedded_num"); + auto variantId = nameTable->RegisterName("variant"); + auto embedded2NumId = nameTable->RegisterName("embedded2_num"); + auto embedded2StructId = nameTable->RegisterName("embedded2_struct"); + auto embedded2RepeatedId = nameTable->RegisterName("embedded2_repeated"); + auto extraIntId = nameTable->RegisterName("extra_int"); + auto otherComplexFieldId = nameTable->RegisterName("other_complex_field"); + + //message T2 { + // optional ui64 embedded2_num; + //}; + //message T1 { + // required T2 t2 [embedded]; + // optional ui64 embedded_num; + //}; + // + //message T { + // required T1 t1 [embedded]; + // optional ui64 num; + //}; + + auto schema = BuildEmbeddedSchema(); + auto config = BuildEmbeddedConfig(complexTypeMode, protoFormatType); + + TString result; + TStringOutput resultStream(result); + auto writer = CreateWriterForProtobuf( + ConvertTo<TProtobufFormatConfigPtr>(config->Attributes()), + {schema}, + nameTable, + CreateAsyncAdapter(&resultStream), + true, + New<TControlAttributesConfig>(), + 0); + + TUnversionedRowBuilder builder; + builder.AddValue(MakeUnversionedUint64Value(789, numId)); + builder.AddValue(MakeUnversionedUint64Value(123, embeddedNumId)); + builder.AddValue(MakeUnversionedUint64Value(456, embedded2NumId)); + 
builder.AddValue(MakeUnversionedCompositeValue("[1; 555u]", variantId)); + auto embeddedYson = BuildYsonStringFluently() + .BeginList() + // float1 + .Item().Value(1.5f) + // string1 + .Item().Value("abc") + .EndList(); + auto embeddedYsonStr = embeddedYson.ToString(); + builder.AddValue(MakeUnversionedCompositeValue(embeddedYsonStr, embedded2StructId)); + auto repeatedYsonStr = BuildYsonStringFluently() + .BeginList() + .Item().Value("a") + .Item().Value("b") + .EndList() + .ToString(); + builder.AddValue(MakeUnversionedCompositeValue(repeatedYsonStr, embedded2RepeatedId)); + builder.AddValue(MakeUnversionedInt64Value(111, extraIntId)); + auto otherComplexFieldYson = BuildYsonStringFluently() + .BeginList() + .Item().Value(22) + .Item().Value(23) + .Item().Value(24) + .EndList(); + auto otherComplexFieldYsonStr = otherComplexFieldYson.ToString(); + builder.AddValue(MakeUnversionedCompositeValue(otherComplexFieldYsonStr, otherComplexFieldId)); + + + auto rows = std::vector<TUnversionedRow>(rowCount, builder.GetRow()); + EXPECT_EQ(true, writer->Write(rows)); + + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput input(result); + TLenvalParser lenvalParser(&input); + + for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { + auto entry = lenvalParser.Next(); + ASSERT_TRUE(entry); + + NYT::TEmbeddingMessage message; + ASSERT_TRUE(message.ParseFromString(entry->RowData)); + + EXPECT_EQ(message.num(), 789UL); + EXPECT_EQ(message.t1().embedded_num(), 123UL); + EXPECT_EQ(message.t1().t2().embedded2_num(), 456UL); + + EXPECT_FALSE(message.t1().has_str_variant()); + EXPECT_TRUE(message.t1().has_uint_variant()); + EXPECT_EQ(message.t1().uint_variant(), 555UL); + + EXPECT_EQ(message.t1().t2().embedded2_struct().float1(), 1.5f); + EXPECT_EQ(message.t1().t2().embedded2_struct().string1(), "abc"); + + ASSERT_EQ(message.t1().t2().embedded2_repeated_size(), 2); + EXPECT_EQ(message.t1().t2().embedded2_repeated(0), "a"); + 
EXPECT_EQ(message.t1().t2().embedded2_repeated(1), "b"); + + { + auto otherColumns = ConvertToNode(TYsonString(message.other_columns_field()))->AsMap(); + auto mode = complexTypeMode; + auto expected = ([&] { + switch (mode) { + case EComplexTypeMode::Named: + return BuildYsonNodeFluently() + .BeginMap() + .Item("one").Value(22) + .Item("two").Value(23) + .Item("three").Value(24) + .EndMap(); + case EComplexTypeMode::Positional: + return ConvertToNode(otherComplexFieldYson); + } + YT_ABORT(); + })(); + + EXPECT_NODES_EQUAL(expected, otherColumns->GetChildOrThrow("other_complex_field")); + EXPECT_EQ(ConvertTo<i64>(otherColumns->GetChildOrThrow("extra_int")), 111); + } + + ASSERT_FALSE(message.has_extra_field()); + ASSERT_FALSE(message.t1().has_embedded_extra_field()); + } + + ASSERT_FALSE(lenvalParser.Next()); +} + +TEST_P(TProtobufFormatStructuredMessage, Write) +{ + auto [complexTypeMode, rowCount, protoFormatType] = GetParam(); + + auto nameTable = New<TNameTable>(); + auto firstId = nameTable->RegisterName("first"); + auto secondId = nameTable->RegisterName("second"); + auto repeatedMessageId = nameTable->RegisterName("repeated_message_field"); + auto repeatedInt64Id = nameTable->RegisterName("repeated_int64_field"); + auto anotherRepeatedInt64Id = nameTable->RegisterName("another_repeated_int64_field"); + auto anyFieldId = nameTable->RegisterName("any_field"); + auto int64FieldId = nameTable->RegisterName("int64_field"); + auto uint64FieldId = nameTable->RegisterName("uint64_field"); + auto int32FieldId = nameTable->RegisterName("int32_field"); + auto uint32FieldId = nameTable->RegisterName("uint32_field"); + auto enumIntFieldId = nameTable->RegisterName("enum_int_field"); + auto enumStringStringFieldId = nameTable->RegisterName("enum_string_string_field"); + auto enumStringInt64FieldId = nameTable->RegisterName("enum_string_int64_field"); + auto utf8FieldId = nameTable->RegisterName("utf8_field"); + auto repeatedOptionalAnyFieldId = 
nameTable->RegisterName("repeated_optional_any_field"); + auto otherComplexFieldId = nameTable->RegisterName("other_complex_field"); + auto packedRepeatedInt64FieldId = nameTable->RegisterName("packed_repeated_int64_field"); + auto optionalRepeatedInt64FieldId = nameTable->RegisterName("optional_repeated_int64_field"); + auto oneofFieldId = nameTable->RegisterName("oneof_field"); + auto optionalOneofFieldId = nameTable->RegisterName("optional_oneof_field"); + auto mapFieldId = nameTable->RegisterName("map_field"); + + auto schema = CreateSchemaWithStructuredMessage(); + auto config = CreateConfigWithStructuredMessage(complexTypeMode, protoFormatType); + + TString result; + TStringOutput resultStream(result); + auto writer = CreateWriterForProtobuf( + ConvertTo<TProtobufFormatConfigPtr>(config->Attributes()), + {schema}, + nameTable, + CreateAsyncAdapter(&resultStream), + true, + New<TControlAttributesConfig>(), + 0); + + auto firstYsonStr = BuildYsonStringFluently() + .BeginList() + // field_missing_from_proto1 + .Item().Value(11111) + // enum_field + .Item().Value("Two") + // int64_field + .Item().Value(44) + // repeated_int64_field + .Item() + .BeginList() + .Item().Value(55) + .Item().Value(56) + .Item().Value(57) + .EndList() + // another_repeated_int64_field + .Item() + .BeginList() + .EndList() + // message_field + .Item() + .BeginList() + .Item().Value("key") + .Item().Value("value") + .EndList() + // repeated_message_field + .Item() + .BeginList() + .Item() + .BeginList() + .Item().Value("key1") + .Item().Value("value1") + .EndList() + .Item() + .BeginList() + .Item().Value("key2") + .Item().Value("value2") + .EndList() + .EndList() + // any_int64_field + .Item().Value(45) + // any_map_field + .Item() + .BeginMap() + .Item("key").Value("value") + .EndMap() + // optional_int64_field + .Item().Entity() + // repeated_optional_any_field + .Item() + .BeginList() + .Item().Value(2) + .Item().Entity() + .Item().Value("foo") + .EndList() + // 
packed_repeated_enum_field + .Item() + .BeginList() + .Item().Value("MinusFortyTwo") + .Item().Value("Two") + .EndList() + // optional_repeated_bool_field + .Item() + .BeginList() + .Item().Value(false) + .Item().Value(true) + .Item().Value(false) + .EndList() + // oneof_field + .Item() + .BeginList() + // message_field + .Item().Value(2) + .Item().BeginList() + .Item().Value("foo") + .Item().Entity() + .EndList() + .EndList() + // optional_oneof_field + .Item() + .Entity() + // map_field + .Item() + .BeginList() + .Item().BeginList() + .Item().Value(13) + .Item().BeginList() + .Item().Value("bac") + .Item().Value("cab") + .EndList() + .EndList() + .Item().BeginList() + .Item().Value(15) + .Item().BeginList() + .Item().Value("ya") + .Item().Value("make") + .EndList() + .EndList() + .EndList() + .EndList() + .ToString(); + + auto secondYsonStr = BuildYsonStringFluently() + .BeginList() + .Item().Value(101) + .Item().Value(102) + .Item().Value(103) + .EndList() + .ToString(); + + auto repeatedMessageYsonStr = BuildYsonStringFluently() + .BeginList() + .Item() + .BeginList() + .Item().Value("key11") + .Item().Value("value11") + .EndList() + .Item() + .BeginList() + .Item().Value("key21") + .Item().Value("value21") + .EndList() + .EndList() + .ToString(); + + auto repeatedInt64Yson = BuildYsonStringFluently() + .BeginList() + .Item().Value(31) + .Item().Value(32) + .Item().Value(33) + .EndList(); + auto repeatedInt64YsonStr = repeatedInt64Yson.ToString(); + + auto anotherRepeatedInt64YsonStr = BuildYsonStringFluently() + .BeginList() + .EndList() + .ToString(); + + auto repeatedOptionalAnyYson = BuildYsonStringFluently() + .BeginList() + .Item().Value(1) + .Item().Value("abc") + .Item().Entity() + .Item().Value(true) + .EndList(); + auto repeatedOptionalAnyYsonStr = repeatedOptionalAnyYson.ToString(); + + auto otherComplexFieldYson = BuildYsonStringFluently() + .BeginList() + .Item().Value(22) + .Item().Value(23) + .Item().Value(24) + .EndList(); + auto 
otherComplexFieldYsonStr = otherComplexFieldYson.ToString(); + + TUnversionedRowBuilder builder; + builder.AddValue(MakeUnversionedCompositeValue(firstYsonStr, firstId)); + builder.AddValue(MakeUnversionedCompositeValue(secondYsonStr, secondId)); + builder.AddValue(MakeUnversionedCompositeValue(repeatedMessageYsonStr, repeatedMessageId)); + builder.AddValue(MakeUnversionedCompositeValue(repeatedInt64YsonStr, repeatedInt64Id)); + builder.AddValue(MakeUnversionedCompositeValue(anotherRepeatedInt64YsonStr, anotherRepeatedInt64Id)); + builder.AddValue(MakeUnversionedInt64Value(4321, anyFieldId)); + + builder.AddValue(MakeUnversionedInt64Value(-64, int64FieldId)); + builder.AddValue(MakeUnversionedUint64Value(64, uint64FieldId)); + builder.AddValue(MakeUnversionedInt64Value(-32, int32FieldId)); + builder.AddValue(MakeUnversionedUint64Value(32, uint32FieldId)); + + builder.AddValue(MakeUnversionedInt64Value(-42, enumIntFieldId)); + builder.AddValue(MakeUnversionedStringValue("Three", enumStringStringFieldId)); + builder.AddValue(MakeUnversionedInt64Value(1, enumStringInt64FieldId)); + + const auto HelloWorldInRussian = "\xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82, \xd0\xbc\xd0\xb8\xd1\x80!"; + builder.AddValue(MakeUnversionedStringValue(HelloWorldInRussian, utf8FieldId)); + + builder.AddValue(MakeUnversionedCompositeValue(repeatedOptionalAnyYsonStr, repeatedOptionalAnyFieldId)); + + builder.AddValue(MakeUnversionedCompositeValue(otherComplexFieldYsonStr, otherComplexFieldId)); + + builder.AddValue(MakeUnversionedCompositeValue("[12;-10;123456789000;]", packedRepeatedInt64FieldId)); + + builder.AddValue(MakeUnversionedCompositeValue("[1;2;3]", optionalRepeatedInt64FieldId)); + + builder.AddValue(MakeUnversionedCompositeValue("[0; foobaz]", oneofFieldId)); + builder.AddValue(MakeUnversionedNullValue(optionalOneofFieldId)); + + builder.AddValue(MakeUnversionedCompositeValue("[[2; [x; y]]; [5; [z; w]]]", mapFieldId)); + + auto rows = 
std::vector<TUnversionedRow>(rowCount, builder.GetRow()); + EXPECT_EQ(true, writer->Write(rows)); + + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput input(result); + TLenvalParser lenvalParser(&input); + + for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { + auto entry = lenvalParser.Next(); + ASSERT_TRUE(entry); + + NYT::TMessageWithStructuredEmbedded message; + ASSERT_TRUE(message.ParseFromString(entry->RowData)); + + const auto& first = message.first(); + EXPECT_EQ(first.enum_field(), EEnum::Two); + EXPECT_EQ(first.int64_field(), 44); + std::vector<i64> firstRepeatedInt64Field( + first.repeated_int64_field().begin(), + first.repeated_int64_field().end()); + EXPECT_EQ(firstRepeatedInt64Field, (std::vector<i64>{55, 56, 57})); + std::vector<i64> firstAnotherRepeatedInt64Field( + first.another_repeated_int64_field().begin(), + first.another_repeated_int64_field().end()); + EXPECT_EQ(firstAnotherRepeatedInt64Field, (std::vector<i64>{})); + EXPECT_EQ(first.message_field().key(), "key"); + EXPECT_EQ(first.message_field().value(), "value"); + ASSERT_EQ(first.repeated_message_field_size(), 2); + EXPECT_EQ(first.repeated_message_field(0).key(), "key1"); + EXPECT_EQ(first.repeated_message_field(0).value(), "value1"); + EXPECT_EQ(first.repeated_message_field(1).key(), "key2"); + EXPECT_EQ(first.repeated_message_field(1).value(), "value2"); + + EXPECT_NODES_EQUAL( + ConvertToNode(TYsonString(first.any_int64_field())), + BuildYsonNodeFluently().Value(45)); + + EXPECT_NODES_EQUAL( + ConvertToNode(TYsonString(first.any_map_field())), + BuildYsonNodeFluently().BeginMap() + .Item("key").Value("value") + .EndMap()); + + std::vector<TYsonString> firstRepeatedOptionalAnyField( + first.repeated_optional_any_field().begin(), + first.repeated_optional_any_field().end()); + + EXPECT_NODES_EQUAL( + ConvertToNode(firstRepeatedOptionalAnyField), + BuildYsonNodeFluently() + .BeginList() + .Item().Value(2) + .Item().Entity() + .Item().Value("foo") + .EndList()); + + 
EXPECT_FALSE(first.has_optional_int64_field()); + + std::vector<EEnum> actualFirstPackedRepeatedEnumField; + for (auto x : first.packed_repeated_enum_field()) { + actualFirstPackedRepeatedEnumField.push_back(static_cast<EEnum>(x)); + } + auto expectedFirstPackedRepeatedEnumField = std::vector<EEnum>{EEnum::MinusFortyTwo, EEnum::Two}; + EXPECT_EQ(expectedFirstPackedRepeatedEnumField, actualFirstPackedRepeatedEnumField); + + std::vector<bool> firstOptionalRepeatedBoolField( + first.optional_repeated_bool_field().begin(), + first.optional_repeated_bool_field().end()); + auto expectedFirstOptionalRepeatedBoolField = std::vector<bool>{false, true, false}; + EXPECT_EQ(expectedFirstOptionalRepeatedBoolField, firstOptionalRepeatedBoolField); + + EXPECT_FALSE(first.has_oneof_string_field_1()); + EXPECT_FALSE(first.has_oneof_string_field()); + EXPECT_TRUE(first.has_oneof_message_field()); + EXPECT_EQ(first.oneof_message_field().key(), "foo"); + EXPECT_FALSE(first.oneof_message_field().has_value()); + + EXPECT_FALSE(first.has_optional_oneof_string_field_1()); + EXPECT_FALSE(first.has_optional_oneof_string_field()); + EXPECT_FALSE(first.has_optional_oneof_message_field()); + + EXPECT_EQ(std::ssize(first.map_field()), 2); + ASSERT_EQ(static_cast<int>(first.map_field().count(13)), 1); + EXPECT_EQ(first.map_field().at(13).key(), "bac"); + EXPECT_EQ(first.map_field().at(13).value(), "cab"); + ASSERT_EQ(static_cast<int>(first.map_field().count(15)), 1); + EXPECT_EQ(first.map_field().at(15).key(), "ya"); + EXPECT_EQ(first.map_field().at(15).value(), "make"); + + const auto& second = message.second(); + EXPECT_EQ(second.one(), 101); + EXPECT_EQ(second.two(), 102); + EXPECT_EQ(second.three(), 103); + + ASSERT_EQ(message.repeated_message_field_size(), 2); + EXPECT_EQ(message.repeated_message_field(0).key(), "key11"); + EXPECT_EQ(message.repeated_message_field(0).value(), "value11"); + EXPECT_EQ(message.repeated_message_field(1).key(), "key21"); + 
EXPECT_EQ(message.repeated_message_field(1).value(), "value21"); + + std::vector<i64> repeatedInt64Field( + message.repeated_int64_field().begin(), + message.repeated_int64_field().end()); + EXPECT_EQ(repeatedInt64Field, (std::vector<i64>{31, 32, 33})); + + std::vector<i64> anotherRepeatedInt64Field( + message.another_repeated_int64_field().begin(), + message.another_repeated_int64_field().end()); + EXPECT_EQ(anotherRepeatedInt64Field, (std::vector<i64>{})); + + EXPECT_EQ(message.int64_any_field(), 4321); + + // Note the reversal of 32 <-> 64. + EXPECT_EQ(message.int32_field(), -64); + EXPECT_EQ(message.uint32_field(), 64u); + EXPECT_EQ(message.int64_field(), -32); + EXPECT_EQ(message.uint64_field(), 32u); + + EXPECT_EQ(message.enum_int_field(), EEnum::MinusFortyTwo); + EXPECT_EQ(message.enum_string_string_field(), EEnum::Three); + EXPECT_EQ(message.enum_string_int64_field(), EEnum::One); + + EXPECT_EQ(message.utf8_field(), HelloWorldInRussian); + + std::vector<TYsonString> repeatedOptionalAnyField( + message.repeated_optional_any_field().begin(), + message.repeated_optional_any_field().end()); + EXPECT_NODES_EQUAL(ConvertToNode(repeatedOptionalAnyField), ConvertToNode(repeatedOptionalAnyYson)); + + { + auto otherColumns = ConvertToNode(TYsonString(message.other_columns_field()))->AsMap(); + auto mode = complexTypeMode; + auto expected = ([&] { + switch (mode) { + case EComplexTypeMode::Named: + return BuildYsonNodeFluently() + .BeginMap() + .Item("one").Value(22) + .Item("two").Value(23) + .Item("three").Value(24) + .EndMap(); + case EComplexTypeMode::Positional: + return ConvertToNode(otherComplexFieldYson); + } + YT_ABORT(); + })(); + + EXPECT_NODES_EQUAL(expected, otherColumns->GetChildOrThrow("other_complex_field")); + } + + std::vector<i64> actualPackedRepeatedInt64Field( + message.packed_repeated_int64_field().begin(), + message.packed_repeated_int64_field().end()); + auto expectedPackedRepeatedInt64Field = std::vector<i64>{12, -10, 123456789000LL}; + 
EXPECT_EQ(expectedPackedRepeatedInt64Field, actualPackedRepeatedInt64Field); + + std::vector<i64> actualOptionalRepeatedInt64Field( + message.optional_repeated_int64_field().begin(), + message.optional_repeated_int64_field().end()); + auto expectedOptionalRepeatedInt64Field = std::vector<i64>{1, 2, 3}; + EXPECT_EQ(expectedOptionalRepeatedInt64Field, actualOptionalRepeatedInt64Field); + + EXPECT_TRUE(message.has_oneof_string_field_1()); + EXPECT_EQ(message.oneof_string_field_1(), "foobaz"); + EXPECT_FALSE(message.has_oneof_string_field()); + EXPECT_FALSE(message.has_oneof_message_field()); + + EXPECT_FALSE(message.has_optional_oneof_string_field_1()); + EXPECT_FALSE(message.has_optional_oneof_string_field()); + EXPECT_FALSE(message.has_optional_oneof_message_field()); + + EXPECT_EQ(std::ssize(message.map_field()), 2); + ASSERT_EQ(static_cast<int>(message.map_field().count(2)), 1); + EXPECT_EQ(message.map_field().at(2).key(), "x"); + EXPECT_EQ(message.map_field().at(2).value(), "y"); + ASSERT_EQ(static_cast<int>(message.map_field().count(5)), 1); + EXPECT_EQ(message.map_field().at(5).key(), "z"); + EXPECT_EQ(message.map_field().at(5).value(), "w"); + } + + ASSERT_FALSE(lenvalParser.Next()); +} + +INodePtr SortMapByKey(const INodePtr& node) +{ + auto keyValuePairs = ConvertTo<std::vector<std::pair<i64, INodePtr>>>(node); + std::sort(std::begin(keyValuePairs), std::end(keyValuePairs)); + return ConvertTo<INodePtr>(keyValuePairs); +} + +TEST_P(TProtobufFormatStructuredMessage, EmbeddedParse) +{ + auto [complexTypeMode, rowCount, protoFormatType] = GetParam(); + + auto schema = BuildEmbeddedSchema(); + auto config = BuildEmbeddedConfig(complexTypeMode, protoFormatType); + + NYT::TEmbeddingMessage message; + + message.set_num(789); + auto* t1 = message.mutable_t1(); + t1->set_embedded_num(123); + auto* t2 = t1->mutable_t2(); + t2->set_embedded2_num(456); + t1->set_uint_variant(555); + t2->add_embedded2_repeated("a"); + t2->add_embedded2_repeated("b"); + 
t2->add_embedded2_repeated("c"); + auto* embedded2_struct = t2->mutable_embedded2_struct(); + embedded2_struct->set_float1(1.5f); + embedded2_struct->set_string1("abc"); + + //message.set_extra_field("*"); + //t1->set_embedded_extra_field("*"); + + auto rowCollector = ParseRows(message, config, schema, rowCount); + for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { + EXPECT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "num")), 789u); + EXPECT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "embedded_num")), 123u); + EXPECT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "embedded2_num")), 456u); + EXPECT_NODES_EQUAL( + GetComposite(rowCollector.GetRowValue(rowIndex, "variant")), + ConvertToNode(TYsonString(TStringBuf("[1; 555u]")))); + + auto embedded2_repeatedNode = GetComposite(rowCollector.GetRowValue(rowIndex, "embedded2_repeated")); + ASSERT_EQ(embedded2_repeatedNode->GetType(), ENodeType::List); + const auto& embedded2_repeatedList = embedded2_repeatedNode->AsList(); + ASSERT_EQ(embedded2_repeatedList->GetChildCount(), 3); + EXPECT_EQ(embedded2_repeatedList->GetChildValueOrThrow<TString>(0), "a"); + EXPECT_EQ(embedded2_repeatedList->GetChildValueOrThrow<TString>(1), "b"); + EXPECT_EQ(embedded2_repeatedList->GetChildValueOrThrow<TString>(2), "c"); + + auto embedded2_structNode = GetComposite(rowCollector.GetRowValue(rowIndex, "embedded2_struct")); + ASSERT_EQ(embedded2_structNode->GetType(), ENodeType::List); + const auto& embedded2_structList = embedded2_structNode->AsList(); + ASSERT_EQ(embedded2_structList->GetChildCount(), 2); + EXPECT_EQ(embedded2_structList->GetChildValueOrThrow<double>(0), 1.5f); + EXPECT_EQ(embedded2_structList->GetChildValueOrThrow<TString>(1), "abc"); + } +} + +TEST_P(TProtobufFormatStructuredMessage, Parse) +{ + auto [complexTypeMode, rowCount, protoFormatType] = GetParam(); + + auto schema = CreateSchemaWithStructuredMessage(); + auto config = CreateConfigWithStructuredMessage(complexTypeMode, protoFormatType); + + 
NYT::TMessageWithStructuredEmbedded message; + + auto* first = message.mutable_first(); + first->set_enum_field(EEnum::Two); + first->set_int64_field(44); + + first->add_repeated_int64_field(55); + first->add_repeated_int64_field(56); + first->add_repeated_int64_field(57); + + // another_repeated_int64_field is intentionally empty. + + first->mutable_message_field()->set_key("key"); + first->mutable_message_field()->set_value("value"); + auto* firstSubfield1 = first->add_repeated_message_field(); + firstSubfield1->set_key("key1"); + firstSubfield1->set_value("value1"); + auto* firstSubfield2 = first->add_repeated_message_field(); + firstSubfield2->set_key("key2"); + firstSubfield2->set_value("value2"); + + first->set_any_int64_field(BuildYsonStringFluently().Value(4422).ToString()); + first->set_any_map_field( + BuildYsonStringFluently() + .BeginMap() + .Item("key").Value("value") + .EndMap() + .ToString()); + + first->add_repeated_optional_any_field("%false"); + first->add_repeated_optional_any_field("42"); + first->add_repeated_optional_any_field("#"); + + first->add_packed_repeated_enum_field(EEnum::MaxInt32); + first->add_packed_repeated_enum_field(EEnum::MinusFortyTwo); + + // optional_repeated_bool_field is intentionally empty. + + first->mutable_oneof_message_field()->set_key("KEY"); + + // optional_oneof_field is intentionally empty. + + (*first->mutable_map_field())[111].set_key("key111"); + (*first->mutable_map_field())[111].set_value("value111"); + (*first->mutable_map_field())[222].set_key("key222"); + (*first->mutable_map_field())[222].set_value("value222"); + + auto* second = message.mutable_second(); + second->set_one(101); + second->set_two(102); + second->set_three(103); + + message.add_repeated_int64_field(31); + message.add_repeated_int64_field(32); + message.add_repeated_int64_field(33); + + // another_repeated_int64_field is intentionally empty. 
+ + auto* subfield1 = message.add_repeated_message_field(); + subfield1->set_key("key11"); + subfield1->set_value("value11"); + auto* subfield2 = message.add_repeated_message_field(); + subfield2->set_key("key21"); + subfield2->set_value("value21"); + + message.set_int64_any_field(4321); + + // Note the reversal of 32 <-> 64. + message.set_int64_field(-32); + message.set_uint64_field(32); + message.set_int32_field(-64); + message.set_uint32_field(64); + + // Note that we don't set the "enum_string_int64_field" as it would fail during parsing. + message.set_enum_int_field(EEnum::MinusFortyTwo); + message.set_enum_string_string_field(EEnum::Three); + + const auto HelloWorldInChinese = "\xe4\xbd\xa0\xe5\xa5\xbd\xef\xbc\x8c\xe4\xb8\x96\xe7\x95\x8c"; + message.set_utf8_field(HelloWorldInChinese); + + message.add_repeated_optional_any_field("#"); + message.add_repeated_optional_any_field("1"); + message.add_repeated_optional_any_field("\"qwe\""); + message.add_repeated_optional_any_field("%true"); + + auto otherComplexFieldPositional = BuildYsonNodeFluently() + .BeginList() + .Item().Value(301) + .Item().Value(302) + .Item().Value(303) + .EndList(); + + auto mode = complexTypeMode; + auto otherComplexField = ([&] { + switch (mode) { + case EComplexTypeMode::Named: + return BuildYsonNodeFluently() + .BeginMap() + .Item("one").Value(301) + .Item("two").Value(302) + .Item("three").Value(303) + .EndMap(); + case EComplexTypeMode::Positional: + return otherComplexFieldPositional; + } + YT_ABORT(); + })(); + auto otherColumnsYson = BuildYsonStringFluently() + .BeginMap() + .Item("other_complex_field").Value(otherComplexField) + .EndMap(); + message.set_other_columns_field(otherColumnsYson.ToString()); + + message.add_packed_repeated_int64_field(-123456789000LL); + message.add_packed_repeated_int64_field(0); + + message.add_optional_repeated_int64_field(-4242); + + // optional_oneof_field is intentionally empty. 
+ + message.set_oneof_string_field("spam"); + + (*message.mutable_map_field())[777].set_key("key777"); + (*message.mutable_map_field())[777].set_value("value777"); + (*message.mutable_map_field())[888].set_key("key888"); + (*message.mutable_map_field())[888].set_value("value888"); + + auto rowCollector = ParseRows(message, config, schema, rowCount); + for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { + auto firstNode = GetComposite(rowCollector.GetRowValue(rowIndex, "first")); + ASSERT_EQ(firstNode->GetType(), ENodeType::List); + const auto& firstList = firstNode->AsList(); + ASSERT_EQ(firstList->GetChildCount(), 17); + + EXPECT_EQ(firstList->GetChildOrThrow(0)->GetType(), ENodeType::Entity); + EXPECT_EQ(firstList->GetChildValueOrThrow<TString>(1), "Two"); + EXPECT_EQ(firstList->GetChildValueOrThrow<i64>(2), 44); + + ASSERT_EQ(firstList->GetChildOrThrow(3)->GetType(), ENodeType::List); + EXPECT_EQ(ConvertTo<std::vector<i64>>(firstList->GetChildOrThrow(3)), (std::vector<i64>{55, 56, 57})); + + ASSERT_EQ(firstList->GetChildOrThrow(4)->GetType(), ENodeType::List); + EXPECT_EQ(ConvertTo<std::vector<i64>>(firstList->GetChildOrThrow(4)), (std::vector<i64>{})); + + ASSERT_EQ(firstList->GetChildOrThrow(5)->GetType(), ENodeType::List); + EXPECT_EQ(firstList->GetChildOrThrow(5)->AsList()->GetChildValueOrThrow<TString>(0), "key"); + EXPECT_EQ(firstList->GetChildOrThrow(5)->AsList()->GetChildValueOrThrow<TString>(1), "value"); + + ASSERT_EQ(firstList->GetChildOrThrow(6)->GetType(), ENodeType::List); + ASSERT_EQ(firstList->GetChildOrThrow(6)->AsList()->GetChildCount(), 2); + + const auto& firstSubNode1 = firstList->GetChildOrThrow(6)->AsList()->GetChildOrThrow(0); + ASSERT_EQ(firstSubNode1->GetType(), ENodeType::List); + ASSERT_EQ(firstSubNode1->AsList()->GetChildCount(), 2); + EXPECT_EQ(firstSubNode1->AsList()->GetChildValueOrThrow<TString>(0), "key1"); + EXPECT_EQ(firstSubNode1->AsList()->GetChildValueOrThrow<TString>(1), "value1"); + + const auto& firstSubNode2 = 
firstList->GetChildOrThrow(6)->AsList()->GetChildOrThrow(1); + ASSERT_EQ(firstSubNode2->GetType(), ENodeType::List); + ASSERT_EQ(firstSubNode2->AsList()->GetChildCount(), 2); + EXPECT_EQ(firstSubNode2->AsList()->GetChildValueOrThrow<TString>(0), "key2"); + EXPECT_EQ(firstSubNode2->AsList()->GetChildValueOrThrow<TString>(1), "value2"); + + ASSERT_EQ(firstList->GetChildOrThrow(7)->GetType(), ENodeType::Int64); + EXPECT_EQ(firstList->GetChildValueOrThrow<i64>(7), 4422); + + ASSERT_EQ(firstList->GetChildOrThrow(8)->GetType(), ENodeType::Map); + EXPECT_NODES_EQUAL( + firstList->GetChildOrThrow(8), + BuildYsonNodeFluently() + .BeginMap() + .Item("key").Value("value") + .EndMap()); + + ASSERT_EQ(firstList->GetChildOrThrow(9)->GetType(), ENodeType::Entity); + + EXPECT_NODES_EQUAL( + firstList->GetChildOrThrow(10), + BuildYsonNodeFluently() + .BeginList() + .Item().Value(false) + .Item().Value(42) + .Item().Entity() + .EndList()); + + EXPECT_NODES_EQUAL( + firstList->GetChildOrThrow(11), + BuildYsonNodeFluently() + .BeginList() + .Item().Value("MaxInt32") + .Item().Value("MinusFortyTwo") + .EndList()); + + // optional_repeated_bool_field. + ASSERT_EQ(firstList->GetChildOrThrow(12)->GetType(), ENodeType::Entity); + + // oneof_field. + EXPECT_NODES_EQUAL( + firstList->GetChildOrThrow(13), + BuildYsonNodeFluently() + .BeginList() + .Item().Value(2) + .Item().BeginList() + .Item().Value("KEY") + .Item().Entity() + .EndList() + .EndList()); + + // optional_oneof_field. + ASSERT_EQ(firstList->GetChildOrThrow(14)->GetType(), ENodeType::Entity); + + // map_field. 
+ EXPECT_NODES_EQUAL( + SortMapByKey(firstList->GetChildOrThrow(15)), + BuildYsonNodeFluently() + .BeginList() + .Item().BeginList() + .Item().Value(111) + .Item().BeginList() + .Item().Value("key111") + .Item().Value("value111") + .EndList() + .EndList() + .Item().BeginList() + .Item().Value(222) + .Item().BeginList() + .Item().Value("key222") + .Item().Value("value222") + .EndList() + .EndList() + .EndList()); + + // field_missing_from_proto2. + ASSERT_EQ(firstList->GetChildOrThrow(16)->GetType(), ENodeType::Entity); + + auto secondNode = GetComposite(rowCollector.GetRowValue(rowIndex, "second")); + ASSERT_EQ(secondNode->GetType(), ENodeType::List); + EXPECT_EQ(ConvertTo<std::vector<i64>>(secondNode), (std::vector<i64>{101, 102, 103})); + + auto repeatedMessageNode = GetComposite(rowCollector.GetRowValue(rowIndex, "repeated_message_field")); + ASSERT_EQ(repeatedMessageNode->GetType(), ENodeType::List); + ASSERT_EQ(repeatedMessageNode->AsList()->GetChildCount(), 2); + + const auto& subNode1 = repeatedMessageNode->AsList()->GetChildOrThrow(0); + ASSERT_EQ(subNode1->GetType(), ENodeType::List); + ASSERT_EQ(subNode1->AsList()->GetChildCount(), 2); + EXPECT_EQ(subNode1->AsList()->GetChildValueOrThrow<TString>(0), "key11"); + EXPECT_EQ(subNode1->AsList()->GetChildValueOrThrow<TString>(1), "value11"); + + const auto& subNode2 = repeatedMessageNode->AsList()->GetChildOrThrow(1); + ASSERT_EQ(subNode2->GetType(), ENodeType::List); + ASSERT_EQ(subNode2->AsList()->GetChildCount(), 2); + EXPECT_EQ(subNode2->AsList()->GetChildValueOrThrow<TString>(0), "key21"); + EXPECT_EQ(subNode2->AsList()->GetChildValueOrThrow<TString>(1), "value21"); + + auto repeatedInt64Node = GetComposite(rowCollector.GetRowValue(rowIndex, "repeated_int64_field")); + EXPECT_EQ(ConvertTo<std::vector<i64>>(repeatedInt64Node), (std::vector<i64>{31, 32, 33})); + + auto anotherRepeatedInt64Node = GetComposite(rowCollector.GetRowValue(rowIndex, "another_repeated_int64_field")); + 
EXPECT_EQ(ConvertTo<std::vector<i64>>(anotherRepeatedInt64Node), (std::vector<i64>{})); + + auto anyValue = rowCollector.GetRowValue(rowIndex, "any_field"); + ASSERT_EQ(anyValue.Type, EValueType::Int64); + EXPECT_EQ(anyValue.Data.Int64, 4321); + + EXPECT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "int64_field")), -64); + EXPECT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "uint64_field")), 64u); + EXPECT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "int32_field")), -32); + EXPECT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "uint32_field")), 32u); + + EXPECT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "enum_int_field")), -42); + EXPECT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "enum_string_string_field")), "Three"); + + EXPECT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "utf8_field")), HelloWorldInChinese); + + auto repeatedRepeatedOptionalAnyNode = GetComposite(rowCollector.GetRowValue(rowIndex, "repeated_optional_any_field")); + auto expectedRepeatedOptionalAnyNode = BuildYsonNodeFluently() + .BeginList() + .Item().Entity() + .Item().Value(1) + .Item().Value("qwe") + .Item().Value(true) + .EndList(); + EXPECT_NODES_EQUAL(repeatedRepeatedOptionalAnyNode, expectedRepeatedOptionalAnyNode); + + auto actualOtherComplexField = GetComposite(rowCollector.GetRowValue(rowIndex, "other_complex_field")); + EXPECT_NODES_EQUAL(actualOtherComplexField, otherComplexFieldPositional); + + EXPECT_NODES_EQUAL( + GetComposite(rowCollector.GetRowValue(rowIndex, "packed_repeated_int64_field")), + ConvertToNode(TYsonString(TStringBuf("[-123456789000;0]")))); + + EXPECT_NODES_EQUAL( + GetComposite(rowCollector.GetRowValue(rowIndex, "optional_repeated_int64_field")), + ConvertToNode(TYsonString(TStringBuf("[-4242]")))); + + EXPECT_NODES_EQUAL( + GetComposite(rowCollector.GetRowValue(rowIndex, "oneof_field")), + ConvertToNode(TYsonString(TStringBuf("[1; \"spam\"]")))); + + EXPECT_FALSE(rowCollector.FindRowValue(rowIndex, "optional_oneof_field")); + + // map_field. 
+ EXPECT_NODES_EQUAL( + SortMapByKey(GetComposite(rowCollector.GetRowValue(rowIndex, "map_field"))), + ConvertToNode(TYsonString(TStringBuf("[[777; [key777; value777]]; [888; [key888; value888]]]")))); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +std::vector<TTableSchemaPtr> CreateSeveralTablesSchemas() +{ + return { + New<TTableSchema>(std::vector<TColumnSchema>{ + {"embedded", StructLogicalType({ + {"enum_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"int64_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + })}, + {"repeated_int64_field", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + {"any_field", SimpleLogicalType(ESimpleLogicalValueType::Any)}, + }), + New<TTableSchema>(std::vector<TColumnSchema>{ + {"enum_field", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"int64_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + }), + // Empty schema. + New<TTableSchema>(), + }; +} + +INodePtr CreateSeveralTablesConfig(EProtoFormatType protoFormatType) +{ + if (protoFormatType == EProtoFormatType::FileDescriptor) { + return CreateFileDescriptorConfig<TSeveralTablesMessageFirst, TSeveralTablesMessageSecond, TSeveralTablesMessageThird>(); + } + YT_VERIFY(protoFormatType == EProtoFormatType::Structured); + + return BuildYsonNodeFluently() + .BeginAttributes() + .Item("enumerations").Value(EnumerationsConfig) + .Item("tables") + .BeginList() + // Table #1. 
+ .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("embedded") + .Item("field_number").Value(1) + .Item("proto_type").Value("structured_message") + .Item("fields") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("int64_field") + .Item("field_number").Value(2) + .Item("proto_type").Value("int64") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("enum_field") + .Item("field_number").Value(1) + .Item("proto_type").Value("enum_string") + .Item("enumeration_name").Value("EEnum") + .EndMap() + .EndList() + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("repeated_int64_field") + .Item("field_number").Value(2) + .Item("proto_type").Value("int64") + .Item("repeated").Value(true) + .EndMap() + .Item() + .BeginMap() + // In schema it is of type "any". + .Item("name").Value("any_field") + .Item("field_number").Value(3) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + + // Table #2. + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("int64_field") + .Item("field_number").Value(2) + .Item("proto_type").Value("int64") + .EndMap() + .Item() + .BeginMap() + .Item("name").Value("enum_field") + .Item("field_number").Value(1) + .Item("proto_type").Value("enum_string") + .Item("enumeration_name").Value("EEnum") + .EndMap() + .EndList() + .EndMap() + + // Table #3. 
+ .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("string_field") + .Item("field_number").Value(1) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndAttributes() + .Value("protobuf"); +} + +using TProtobufFormatSeveralTablesParam = std::tuple<EProtoFormatType>; + +class TProtobufFormatSeveralTables + : public ::testing::TestWithParam<TProtobufFormatSeveralTablesParam> +{ }; + +INSTANTIATE_TEST_SUITE_P( + FileDescriptor, + TProtobufFormatSeveralTables, + ::testing::Values(TProtobufFormatSeveralTablesParam{ + EProtoFormatType::FileDescriptor})); + +INSTANTIATE_TEST_SUITE_P( + Structured, + TProtobufFormatSeveralTables, + ::testing::Values(TProtobufFormatSeveralTablesParam{ + EProtoFormatType::Structured})); + +TEST_P(TProtobufFormatSeveralTables, Write) +{ + auto [protoFormatType] = GetParam(); + + auto schemas = CreateSeveralTablesSchemas(); + auto configNode = CreateSeveralTablesConfig(protoFormatType); + + auto config = ConvertTo<TProtobufFormatConfigPtr>(configNode->Attributes().ToMap()); + + auto nameTable = New<TNameTable>(); + auto embeddedId = nameTable->RegisterName("embedded"); + auto anyFieldId = nameTable->RegisterName("any_field"); + auto int64FieldId = nameTable->RegisterName("int64_field"); + auto repeatedInt64Id = nameTable->RegisterName("repeated_int64_field"); + auto enumFieldId = nameTable->RegisterName("enum_field"); + auto stringFieldId = nameTable->RegisterName("string_field"); + auto tableIndexId = nameTable->RegisterName(TableIndexColumnName); + + TString result; + TStringOutput resultStream(result); + auto controlAttributesConfig = New<TControlAttributesConfig>(); + controlAttributesConfig->EnableTableIndex = true; + controlAttributesConfig->EnableEndOfStream = true; + auto writer = CreateWriterForProtobuf( + std::move(config), + schemas, + nameTable, + CreateAsyncAdapter(&resultStream), + true, + std::move(controlAttributesConfig), + 0); + + 
auto embeddedYson = BuildYsonStringFluently() + .BeginList() + .Item().Value("Two") + .Item().Value(44) + .EndList() + .ToString(); + + auto repeatedInt64Yson = ConvertToYsonString(std::vector<i64>{31, 32, 33}).ToString(); + + { + TUnversionedRowBuilder builder; + builder.AddValue(MakeUnversionedCompositeValue(embeddedYson, embeddedId)); + builder.AddValue(MakeUnversionedCompositeValue(repeatedInt64Yson, repeatedInt64Id)); + builder.AddValue(MakeUnversionedInt64Value(4321, anyFieldId)); + EXPECT_EQ(true, writer->Write({builder.GetRow()})); + } + { + TUnversionedRowBuilder builder; + builder.AddValue(MakeUnversionedStringValue("Two", enumFieldId)); + builder.AddValue(MakeUnversionedInt64Value(999, int64FieldId)); + builder.AddValue(MakeUnversionedInt64Value(1, tableIndexId)); + EXPECT_EQ(true, writer->Write({builder.GetRow()})); + } + { + TUnversionedRowBuilder builder; + builder.AddValue(MakeUnversionedStringValue("blah", stringFieldId)); + builder.AddValue(MakeUnversionedInt64Value(2, tableIndexId)); + EXPECT_EQ(true, writer->Write({builder.GetRow()})); + } + + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput input(result); + TLenvalParser lenvalParser(&input); + + { + auto entry = lenvalParser.Next(); + ASSERT_TRUE(entry); + + NYT::TSeveralTablesMessageFirst message; + ASSERT_TRUE(message.ParseFromString(entry->RowData)); + + const auto& embedded = message.embedded(); + EXPECT_EQ(embedded.enum_field(), EEnum::Two); + EXPECT_EQ(embedded.int64_field(), 44); + + std::vector<i64> repeatedInt64Field( + message.repeated_int64_field().begin(), + message.repeated_int64_field().end()); + EXPECT_EQ(repeatedInt64Field, (std::vector<i64>{31, 32, 33})); + EXPECT_EQ(message.int64_field(), 4321); + } + { + auto entry = lenvalParser.Next(); + ASSERT_TRUE(entry); + + NYT::TSeveralTablesMessageSecond message; + ASSERT_TRUE(message.ParseFromString(entry->RowData)); + + EXPECT_EQ(message.enum_field(), EEnum::Two); + EXPECT_EQ(message.int64_field(), 999); + } + { + auto 
entry = lenvalParser.Next(); + ASSERT_TRUE(entry); + + NYT::TSeveralTablesMessageThird message; + ASSERT_TRUE(message.ParseFromString(entry->RowData)); + + EXPECT_EQ(message.string_field(), "blah"); + } + ASSERT_FALSE(lenvalParser.IsEndOfStream()); + ASSERT_FALSE(lenvalParser.Next()); + ASSERT_TRUE(lenvalParser.IsEndOfStream()); + ASSERT_FALSE(lenvalParser.Next()); +} + +TEST_P(TProtobufFormatSeveralTables, Parse) +{ + auto [protoFormatType] = GetParam(); + + auto schemas = CreateSeveralTablesSchemas(); + auto configNode = CreateSeveralTablesConfig(protoFormatType); + auto config = ConvertTo<TProtobufFormatConfigPtr>(configNode->Attributes().ToMap()); + + std::vector<TCollectingValueConsumer> rowCollectors; + std::vector<std::unique_ptr<IParser>> parsers; + for (const auto& schema : schemas) { + rowCollectors.emplace_back(schema); + } + for (int tableIndex = 0; tableIndex < static_cast<int>(schemas.size()); ++tableIndex) { + parsers.push_back(CreateParserForProtobuf( + &rowCollectors[tableIndex], + config, + tableIndex)); + } + + NYT::TSeveralTablesMessageFirst firstMessage; + auto* embedded = firstMessage.mutable_embedded(); + embedded->set_enum_field(EEnum::Two); + embedded->set_int64_field(44); + + firstMessage.add_repeated_int64_field(55); + firstMessage.add_repeated_int64_field(56); + firstMessage.add_repeated_int64_field(57); + + firstMessage.set_int64_field(4444); + + NYT::TSeveralTablesMessageSecond secondMessage; + secondMessage.set_enum_field(EEnum::Two); + secondMessage.set_int64_field(44); + + NYT::TSeveralTablesMessageThird thirdMessage; + thirdMessage.set_string_field("blah"); + + auto parse = [] (auto& parser, const auto& message) { + TString lenvalBytes; + { + TStringOutput out(lenvalBytes); + auto messageSize = static_cast<ui32>(message.ByteSizeLong()); + out.Write(&messageSize, sizeof(messageSize)); + ASSERT_TRUE(message.SerializeToArcadiaStream(&out)); + } + parser->Read(lenvalBytes); + parser->Finish(); + }; + + parse(parsers[0], firstMessage); 
+ parse(parsers[1], secondMessage); + parse(parsers[2], thirdMessage); + + { + const auto& rowCollector = rowCollectors[0]; + ASSERT_EQ(static_cast<int>(rowCollector.Size()), 1); + + auto embeddedNode = GetComposite(rowCollector.GetRowValue(0, "embedded")); + ASSERT_EQ(ConvertToTextYson(embeddedNode), "[\"Two\";44;]"); + + auto repeatedInt64Node = GetComposite(rowCollector.GetRowValue(0, "repeated_int64_field")); + ASSERT_EQ(ConvertToTextYson(repeatedInt64Node), "[55;56;57;]"); + + auto int64Field = GetInt64(rowCollector.GetRowValue(0, "any_field")); + EXPECT_EQ(int64Field, 4444); + } + + { + const auto& rowCollector = rowCollectors[1]; + ASSERT_EQ(static_cast<int>(rowCollector.Size()), 1); + + EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "enum_field")), "Two"); + EXPECT_EQ(GetInt64(rowCollector.GetRowValue(0, "int64_field")), 44); + } + + { + const auto& rowCollector = rowCollectors[2]; + ASSERT_EQ(static_cast<int>(rowCollector.Size()), 1); + + EXPECT_EQ(GetString(rowCollector.GetRowValue(0, "string_field")), "blah"); + } +} + +TEST(TProtobufFormat, SchemaConfigMismatch) +{ + auto createParser = [] (const TTableSchemaPtr& schema, const INodePtr& configNode) { + TCollectingValueConsumer rowCollector(schema); + return CreateParserForProtobuf( + &rowCollector, + ConvertTo<TProtobufFormatConfigPtr>(configNode), + 0); + }; + auto createSeveralTableWriter = [] (const std::vector<TTableSchemaPtr>& schemas, const INodePtr& configNode) { + TString result; + TStringOutput resultStream(result); + return CreateWriterForProtobuf( + ConvertTo<TProtobufFormatConfigPtr>(configNode), + schemas, + New<TNameTable>(), + CreateAsyncAdapter(&resultStream), + true, + New<TControlAttributesConfig>(), + 0); + }; + auto createWriter = [&] (const TTableSchemaPtr& schema, const INodePtr& configNode) { + createSeveralTableWriter({schema}, configNode); + }; + + auto schema_struct_with_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ + {"struct", StructLogicalType({ + {"int64_field", 
OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + })}, + }); + + auto schema_struct_with_uint64 = New<TTableSchema>(std::vector<TColumnSchema>{ + {"struct", StructLogicalType({ + {"int64_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Uint64))}, + })}, + }); + + auto config_struct_with_int64 = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("struct") + .Item("field_number").Value(1) + .Item("proto_type").Value("structured_message") + .Item("fields") + .BeginList() + .Item().BeginMap() + .Item("name").Value("int64_field") + .Item("field_number").Value(2) + // Wrong type. + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + // OK. + EXPECT_NO_THROW(createParser(schema_struct_with_int64, config_struct_with_int64)); + EXPECT_NO_THROW(createWriter(schema_struct_with_int64, config_struct_with_int64)); + + // Types mismatch. + EXPECT_THROW_WITH_SUBSTRING( + createParser(schema_struct_with_uint64, config_struct_with_int64), + "signedness of both types must be the same"); + EXPECT_THROW_WITH_SUBSTRING( + createWriter(schema_struct_with_uint64, config_struct_with_int64), + "signedness of both types must be the same"); + + // No schema for structured field is Ok. 
+ EXPECT_NO_THROW(createParser(New<TTableSchema>(), config_struct_with_int64)); + EXPECT_NO_THROW(createWriter(New<TTableSchema>(), config_struct_with_int64)); + + auto schema_list_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ + { + "repeated", + ListLogicalType( + SimpleLogicalType(ESimpleLogicalValueType::Int64)), + }, + }); + + auto schema_list_optional_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ + { + "repeated", + ListLogicalType( + OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))), + }, + }); + + auto config_repeated_int64 = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("repeated") + .Item("field_number").Value(1) + .Item("repeated").Value(true) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + // OK. + EXPECT_NO_THROW(createParser(schema_list_int64, config_repeated_int64)); + EXPECT_NO_THROW(createWriter(schema_list_int64, config_repeated_int64)); + + // No schema for repeated field is Ok. + EXPECT_NO_THROW(createParser(New<TTableSchema>(), config_repeated_int64)); + EXPECT_NO_THROW(createWriter(New<TTableSchema>(), config_repeated_int64)); + + // List of optional is not allowed. + EXPECT_THROW_WITH_SUBSTRING( + createParser(schema_list_optional_int64, config_repeated_int64), + "unexpected logical metatype \"optional\""); + EXPECT_THROW_WITH_SUBSTRING( + createWriter(schema_list_optional_int64, config_repeated_int64), + "unexpected logical metatype \"optional\""); + + auto schema_optional_list_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ + {"repeated", OptionalLogicalType( + ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64)))}, + }); + + // Optional list is OK. 
+ EXPECT_NO_THROW(createParser(schema_optional_list_int64, config_repeated_int64)); + EXPECT_NO_THROW(createWriter(schema_optional_list_int64, config_repeated_int64)); + + auto schema_optional_optional_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ + {"field", OptionalLogicalType( + OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64)))}, + }); + + auto config_int64 = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("field") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + // Optional of optional is not allowed. + EXPECT_THROW_WITH_SUBSTRING( + createParser(schema_optional_optional_int64, config_int64), + "unexpected logical metatype \"optional\""); + EXPECT_THROW_WITH_SUBSTRING( + createWriter(schema_optional_optional_int64, config_int64), + "unexpected logical metatype \"optional\""); + + auto schema_struct_with_both = New<TTableSchema>(std::vector<TColumnSchema>{ + {"struct", StructLogicalType({ + {"required_field", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"optional_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + })}, + }); + + auto config_struct_with_required = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("struct") + .Item("field_number").Value(1) + .Item("proto_type").Value("structured_message") + .Item("fields") + .BeginList() + .Item().BeginMap() + .Item("name").Value("required_field") + .Item("field_number").Value(2) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + auto config_struct_with_optional = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() 
+ .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("struct") + .Item("field_number").Value(1) + .Item("proto_type").Value("structured_message") + .Item("fields") + .BeginList() + .Item().BeginMap() + .Item("name").Value("optional_field") + .Item("field_number").Value(2) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + auto config_struct_with_unknown = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("struct") + .Item("field_number").Value(1) + .Item("proto_type").Value("structured_message") + .Item("fields") + .BeginList() + .Item().BeginMap() + .Item("name").Value("required_field") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .Item().BeginMap() + .Item("name").Value("optional_field") + .Item("field_number").Value(2) + .Item("proto_type").Value("int64") + .EndMap() + .Item().BeginMap() + .Item("name").Value("unknown_field") + .Item("field_number").Value(3) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + // Schema has more fields, non-optional field is missing in protobuf config. + // Parser should fail. + EXPECT_THROW_WITH_SUBSTRING( + createParser(schema_struct_with_both, config_struct_with_optional), + "non-optional field \"required_field\" in schema is missing from protobuf config"); + // Writer feels OK. + EXPECT_NO_THROW(createWriter(schema_struct_with_both, config_struct_with_optional)); + + // Schema has more fields, optional field is missing in protobuf config. + // It's OK for both the writer and the parser. 
+ EXPECT_NO_THROW(createParser(schema_struct_with_both, config_struct_with_required)); + EXPECT_NO_THROW(createWriter(schema_struct_with_both, config_struct_with_required)); + + // Protobuf config has more fields, it is always OK. + EXPECT_NO_THROW(createParser(schema_struct_with_both, config_struct_with_unknown)); + EXPECT_NO_THROW(createWriter(schema_struct_with_both, config_struct_with_unknown)); + + auto schema_int64 = New<TTableSchema>(std::vector<TColumnSchema>{ + {"int64_field", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + }); + + auto config_two_tables = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("int64_field") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + .Item() + .BeginMap() + .Item("name").Value("int64_field") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + EXPECT_NO_THROW(createWriter(schema_int64, config_two_tables)); + EXPECT_THROW_WITH_SUBSTRING( + createSeveralTableWriter({schema_int64, schema_int64, schema_int64}, config_two_tables), + "Number of schemas is greater than number of tables in protobuf config: 3 > 2"); + + auto schema_variant_with_int = New<TTableSchema>(std::vector<TColumnSchema>{ + {"variant", VariantStructLogicalType({ + {"a", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + })}, + }); + auto schema_variant_with_optional_int = New<TTableSchema>(std::vector<TColumnSchema>{ + {"variant", VariantStructLogicalType({ + {"a", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + })}, + }); + + auto config_with_oneof = BuildYsonNodeFluently() + .BeginMap() + .Item("tables") + .BeginList() + .Item() + .BeginMap() + .Item("columns") + .BeginList() + 
.Item() + .BeginMap() + .Item("name").Value("variant") + .Item("proto_type").Value("oneof") + .Item("fields").BeginList() + .Item() + .BeginMap() + .Item("name").Value("a") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap(); + + // Oneof fields require schematized columns. + EXPECT_THROW_WITH_SUBSTRING( + createParser(New<TTableSchema>(), config_with_oneof), + "requires a corresponding schematized column"); + EXPECT_THROW_WITH_SUBSTRING( + createWriter(New<TTableSchema>(), config_with_oneof), + "requires a corresponding schematized column"); + + EXPECT_THROW_WITH_SUBSTRING( + createParser(schema_variant_with_optional_int, config_with_oneof), + "Optional variant field \"variant.a\""); + EXPECT_THROW_WITH_SUBSTRING( + createWriter(schema_variant_with_optional_int, config_with_oneof), + "Optional variant field \"variant.a\""); + EXPECT_NO_THROW(createParser(schema_variant_with_int, config_with_oneof)); + EXPECT_NO_THROW(createWriter(schema_variant_with_int, config_with_oneof)); +} + +TEST(TProtobufFormat, MultipleOtherColumns) +{ + auto nameTable = New<TNameTable>(); + + TString data; + TStringOutput resultStream(data); + + auto controlAttributesConfig = New<TControlAttributesConfig>(); + controlAttributesConfig->EnableTableIndex = true; + controlAttributesConfig->EnableEndOfStream = true; + + auto protoWriter = CreateWriterForProtobuf( + MakeProtobufFormatConfig({TOtherColumnsMessage::descriptor(), TOtherColumnsMessage::descriptor()}), + std::vector<TTableSchemaPtr>(2, New<TTableSchema>()), + nameTable, + CreateAsyncAdapter(&resultStream), + true, + controlAttributesConfig, + 0); + + EXPECT_EQ(true, protoWriter->Write( + std::vector<TUnversionedRow>{ + NNamedValue::MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"field1", "foo"}, + }), + NNamedValue::MakeRow(nameTable, { + {TString(TableIndexColumnName), 1}, + {"field2", "bar"}, + }), + })); + + 
WaitFor(protoWriter->Close()) + .ThrowOnError(); + + std::vector<TString> otherColumnsValue; + auto parser = TLenvalParser(data); + while (auto item = parser.Next()) { + TOtherColumnsMessage message; + bool parsed = message.ParseFromString(item->RowData); + EXPECT_TRUE(parsed); + otherColumnsValue.push_back(CanonizeYson(message.other_columns_field())); + } + + EXPECT_EQ( + otherColumnsValue, + std::vector<TString>({ + CanonizeYson("{field1=foo}"), + CanonizeYson("{field2=bar}"), + })); +} + +//////////////////////////////////////////////////////////////////////////////// + +using TProtobufFormatAllFieldsParameter = std::tuple<int, EProtoFormatType>; +class TProtobufFormatAllFields + : public ::testing::TestWithParam<TProtobufFormatAllFieldsParameter> +{ +public: + bool IsLegacyFormat() const + { + auto [rowCount, protoFormatType] = GetParam(); + return protoFormatType == EProtoFormatType::FileDescriptorLegacy; + } +}; + +INSTANTIATE_TEST_SUITE_P( + Specification, + TProtobufFormatAllFields, + ::testing::Values(TProtobufFormatAllFieldsParameter{1, EProtoFormatType::Structured})); + +INSTANTIATE_TEST_SUITE_P( + FileDescriptorLegacy, + TProtobufFormatAllFields, + ::testing::Values(TProtobufFormatAllFieldsParameter{1, EProtoFormatType::FileDescriptorLegacy})); + +INSTANTIATE_TEST_SUITE_P( + FileDescriptor, + TProtobufFormatAllFields, + ::testing::Values(TProtobufFormatAllFieldsParameter{1, EProtoFormatType::FileDescriptor})); + +INSTANTIATE_TEST_SUITE_P( + ManyRows, + TProtobufFormatAllFields, + ::testing::Values(TProtobufFormatAllFieldsParameter{50000, EProtoFormatType::Structured})); + +TEST_P(TProtobufFormatAllFields, Writer) +{ + auto [rowCount, protoFormatType] = GetParam(); + auto config = CreateAllFieldsConfig(protoFormatType); + + auto nameTable = New<TNameTable>(); + + auto doubleId = nameTable->RegisterName("Double"); + auto floatId = nameTable->RegisterName("Float"); + + auto int64Id = nameTable->RegisterName("Int64"); + auto uint64Id = 
nameTable->RegisterName("UInt64"); + auto sint64Id = nameTable->RegisterName("SInt64"); + auto fixed64Id = nameTable->RegisterName("Fixed64"); + auto sfixed64Id = nameTable->RegisterName("SFixed64"); + + auto int32Id = nameTable->RegisterName("Int32"); + auto uint32Id = nameTable->RegisterName("UInt32"); + auto sint32Id = nameTable->RegisterName("SInt32"); + auto fixed32Id = nameTable->RegisterName("Fixed32"); + auto sfixed32Id = nameTable->RegisterName("SFixed32"); + + auto boolId = nameTable->RegisterName("Bool"); + auto stringId = nameTable->RegisterName("String"); + auto bytesId = nameTable->RegisterName("Bytes"); + + auto enumId = nameTable->RegisterName("Enum"); + + auto messageId = nameTable->RegisterName("Message"); + + auto anyWithMapId = nameTable->RegisterName("AnyWithMap"); + auto anyWithInt64Id = nameTable->RegisterName("AnyWithInt64"); + auto anyWithStringId = nameTable->RegisterName("AnyWithString"); + + auto otherInt64ColumnId = nameTable->RegisterName("OtherInt64Column"); + auto otherDoubleColumnId = nameTable->RegisterName("OtherDoubleColumn"); + auto otherStringColumnId = nameTable->RegisterName("OtherStringColumn"); + auto otherNullColumnId = nameTable->RegisterName("OtherNullColumn"); + auto otherBooleanColumnId = nameTable->RegisterName("OtherBooleanColumn"); + auto otherAnyColumnId = nameTable->RegisterName("OtherAnyColumn"); + + auto tableIndexColumnId = nameTable->RegisterName(TableIndexColumnName); + auto rowIndexColumnId = nameTable->RegisterName(RowIndexColumnName); + auto rangeIndexColumnId = nameTable->RegisterName(RangeIndexColumnName); + + auto missintInt64Id = nameTable->RegisterName("MissingInt64"); + + TString result; + TStringOutput resultStream(result); + auto writer = CreateWriterForProtobuf( + config->Attributes(), + {New<TTableSchema>()}, + nameTable, + CreateAsyncAdapter(&resultStream), + true, + New<TControlAttributesConfig>(), + 0); + + TEmbeddedMessage embeddedMessage; + embeddedMessage.set_key("embedded_key"); + 
embeddedMessage.set_value("embedded_value"); + TString embeddedMessageBytes; + ASSERT_TRUE(embeddedMessage.SerializeToString(&embeddedMessageBytes)); + + auto mapNode = BuildYsonNodeFluently() + .BeginMap() + .Item("Key").Value("Value") + .Item("Another") + .BeginList() + .Item().Value(1) + .Item().Value("two") + .EndList() + .EndMap(); + auto ysonString = ConvertToYsonString(mapNode).ToString(); + + TUnversionedRowBuilder builder; + for (const auto& value : { + MakeUnversionedDoubleValue(3.14159, doubleId), + MakeUnversionedDoubleValue(2.71828, floatId), + + MakeUnversionedInt64Value(-1, int64Id), + MakeUnversionedUint64Value(2, uint64Id), + MakeUnversionedInt64Value(-3, sint64Id), + MakeUnversionedUint64Value(4, fixed64Id), + MakeUnversionedInt64Value(-5, sfixed64Id), + + MakeUnversionedInt64Value(-6, int32Id), + MakeUnversionedUint64Value(7, uint32Id), + MakeUnversionedInt64Value(-8, sint32Id), + MakeUnversionedUint64Value(9, fixed32Id), + MakeUnversionedInt64Value(-10, sfixed32Id), + + MakeUnversionedBooleanValue(true, boolId), + MakeUnversionedStringValue("this_is_string", stringId), + MakeUnversionedStringValue("this_is_bytes", bytesId), + + MakeUnversionedStringValue("Two", enumId), + + MakeUnversionedStringValue(embeddedMessageBytes, messageId), + + MakeUnversionedNullValue(missintInt64Id), + + MakeUnversionedInt64Value(12, tableIndexColumnId), + MakeUnversionedInt64Value(42, rowIndexColumnId), + MakeUnversionedInt64Value(333, rangeIndexColumnId), + }) { + builder.AddValue(value); + } + + if (!IsLegacyFormat()) { + builder.AddValue(MakeUnversionedAnyValue(ysonString, anyWithMapId)); + builder.AddValue(MakeUnversionedInt64Value(22, anyWithInt64Id)); + builder.AddValue(MakeUnversionedStringValue("some_string", anyWithStringId)); + + builder.AddValue(MakeUnversionedInt64Value(-123, otherInt64ColumnId)); + builder.AddValue(MakeUnversionedDoubleValue(-123.456, otherDoubleColumnId)); + builder.AddValue(MakeUnversionedStringValue("some_string", 
otherStringColumnId)); + builder.AddValue(MakeUnversionedBooleanValue(true, otherBooleanColumnId)); + builder.AddValue(MakeUnversionedAnyValue(ysonString, otherAnyColumnId)); + builder.AddValue(MakeUnversionedNullValue(otherNullColumnId)); + } + + auto row = builder.GetRow(); + std::vector<TUnversionedRow> rows(rowCount, row); + EXPECT_EQ(true, writer->Write(rows)); + + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput input(result); + TLenvalParser lenvalParser(&input); + + for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { + auto entry = lenvalParser.Next(); + ASSERT_TRUE(entry); + + NYT::TMessage message; + ASSERT_TRUE(message.ParseFromString(entry->RowData)); + + EXPECT_DOUBLE_EQ(message.double_field(), 3.14159); + EXPECT_FLOAT_EQ(message.float_field(), 2.71828); + EXPECT_EQ(message.int64_field(), -1); + EXPECT_EQ(message.uint64_field(), 2u); + EXPECT_EQ(message.sint64_field(), -3); + EXPECT_EQ(message.fixed64_field(), 4u); + EXPECT_EQ(message.sfixed64_field(), -5); + + EXPECT_EQ(message.int32_field(), -6); + EXPECT_EQ(message.uint32_field(), 7u); + EXPECT_EQ(message.sint32_field(), -8); + EXPECT_EQ(message.fixed32_field(), 9u); + EXPECT_EQ(message.sfixed32_field(), -10); + + EXPECT_EQ(message.bool_field(), true); + EXPECT_EQ(message.string_field(), "this_is_string"); + EXPECT_EQ(message.bytes_field(), "this_is_bytes"); + + EXPECT_EQ(message.enum_field(), EEnum::Two); + + EXPECT_EQ(message.message_field().key(), "embedded_key"); + EXPECT_EQ(message.message_field().value(), "embedded_value"); + + if (!IsLegacyFormat()) { + EXPECT_TRUE(AreNodesEqual(ConvertToNode(TYsonString(message.any_field_with_map())), mapNode)); + EXPECT_TRUE(AreNodesEqual( + ConvertToNode(TYsonString(message.any_field_with_int64())), + BuildYsonNodeFluently().Value(22))); + EXPECT_TRUE(AreNodesEqual( + ConvertToNode(TYsonString(message.any_field_with_string())), + BuildYsonNodeFluently().Value("some_string"))); + + auto otherColumnsMap = 
ConvertToNode(TYsonString(message.other_columns_field()))->AsMap(); + EXPECT_EQ(otherColumnsMap->GetChildValueOrThrow<i64>("OtherInt64Column"), -123); + EXPECT_DOUBLE_EQ(otherColumnsMap->GetChildValueOrThrow<double>("OtherDoubleColumn"), -123.456); + EXPECT_EQ(otherColumnsMap->GetChildValueOrThrow<TString>("OtherStringColumn"), "some_string"); + EXPECT_EQ(otherColumnsMap->GetChildValueOrThrow<bool>("OtherBooleanColumn"), true); + EXPECT_TRUE(AreNodesEqual(otherColumnsMap->GetChildOrThrow("OtherAnyColumn"), mapNode)); + EXPECT_EQ(otherColumnsMap->GetChildOrThrow("OtherNullColumn")->GetType(), ENodeType::Entity); + + auto keys = otherColumnsMap->GetKeys(); + std::sort(keys.begin(), keys.end()); + std::vector<std::string> expectedKeys = { + "OtherInt64Column", + "OtherDoubleColumn", + "OtherStringColumn", + "OtherBooleanColumn", + "OtherAnyColumn", + "OtherNullColumn" + }; + std::sort(expectedKeys.begin(), expectedKeys.end()); + EXPECT_EQ(expectedKeys, keys); + } + } + + ASSERT_FALSE(lenvalParser.Next()); +} + +TEST_P(TProtobufFormatAllFields, Parser) +{ + auto [rowCount, protoFormatType] = GetParam(); + + auto config = CreateAllFieldsConfig(protoFormatType); + + TMessage message; + message.set_double_field(3.14159); + message.set_float_field(2.71828); + + message.set_int64_field(-1); + message.set_uint64_field(2); + message.set_sint64_field(-3); + message.set_fixed64_field(4); + message.set_sfixed64_field(-5); + + message.set_int32_field(-6); + message.set_uint32_field(7); + message.set_sint32_field(-8); + message.set_fixed32_field(9); + message.set_sfixed32_field(-10); + + message.set_bool_field(true); + message.set_string_field("this_is_string"); + message.set_bytes_field("this_is_bytes"); + message.set_enum_field(EEnum::Three); + + message.mutable_message_field()->set_key("embedded_key"); + message.mutable_message_field()->set_value("embedded_value"); + + auto mapNode = BuildYsonNodeFluently() + .BeginMap() + .Item("Key").Value("Value") + .Item("Another") + 
.BeginList() + .Item().Value(1) + .Item().Value("two") + .EndList() + .EndMap(); + + auto otherColumnsNode = BuildYsonNodeFluently() + .BeginMap() + .Item("OtherInt64Column").Value(-123) + .Item("OtherDoubleColumn").Value(-123.456) + .Item("OtherStringColumn").Value("some_string") + .Item("OtherBooleanColumn").Value(true) + .Item("OtherAnyColumn").Value(mapNode) + .Item("OtherNullColumn").Entity() + .EndMap(); + + if (!IsLegacyFormat()) { + message.set_any_field_with_map(ConvertToYsonString(mapNode).ToString()); + message.set_any_field_with_int64(BuildYsonStringFluently().Value(22).ToString()); + message.set_any_field_with_string(BuildYsonStringFluently().Value("some_string").ToString()); + message.set_other_columns_field(ConvertToYsonString(otherColumnsNode).ToString()); + } + + auto rowCollector = ParseRows( + message, + ConvertTo<TProtobufFormatConfigPtr>(config->Attributes().ToMap()), + New<TTableSchema>(), + rowCount); + + for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex) { + int expectedSize = IsLegacyFormat() ? 
17 : 26; + ASSERT_EQ(static_cast<int>(rowCollector.GetRow(rowIndex).GetCount()), expectedSize); + + ASSERT_DOUBLE_EQ(GetDouble(rowCollector.GetRowValue(rowIndex, "Double")), 3.14159); + ASSERT_NEAR(GetDouble(rowCollector.GetRowValue(rowIndex, "Float")), 2.71828, 1e-5); + + ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "Int64")), -1); + ASSERT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "UInt64")), 2u); + ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "SInt64")), -3); + ASSERT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "Fixed64")), 4u); + ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "SFixed64")), -5); + + ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "Int32")), -6); + ASSERT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "UInt32")), 7u); + ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "SInt32")), -8); + ASSERT_EQ(GetUint64(rowCollector.GetRowValue(rowIndex, "Fixed32")), 9u); + ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "SFixed32")), -10); + + ASSERT_EQ(GetBoolean(rowCollector.GetRowValue(rowIndex, "Bool")), true); + ASSERT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "String")), "this_is_string"); + ASSERT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "Bytes")), "this_is_bytes"); + + if (IsLegacyFormat()) { + ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "Enum")), 3); + } else { + ASSERT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "Enum")), "Three"); + } + + TEmbeddedMessage embeddedMessage; + ASSERT_TRUE(embeddedMessage.ParseFromString(GetString(rowCollector.GetRowValue(rowIndex, "Message")))); + ASSERT_EQ(embeddedMessage.key(), "embedded_key"); + ASSERT_EQ(embeddedMessage.value(), "embedded_value"); + + if (!IsLegacyFormat()) { + ASSERT_TRUE(AreNodesEqual(GetAny(rowCollector.GetRowValue(rowIndex, "AnyWithMap")), mapNode)); + ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "AnyWithInt64")), 22); + ASSERT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "AnyWithString")), 
"some_string"); + + ASSERT_EQ(GetInt64(rowCollector.GetRowValue(rowIndex, "OtherInt64Column")), -123); + ASSERT_DOUBLE_EQ(GetDouble(rowCollector.GetRowValue(rowIndex, "OtherDoubleColumn")), -123.456); + ASSERT_EQ(GetString(rowCollector.GetRowValue(rowIndex, "OtherStringColumn")), "some_string"); + ASSERT_EQ(GetBoolean(rowCollector.GetRowValue(rowIndex, "OtherBooleanColumn")), true); + ASSERT_TRUE(AreNodesEqual(GetAny(rowCollector.GetRowValue(rowIndex, "OtherAnyColumn")), mapNode)); + ASSERT_EQ(rowCollector.GetRowValue(rowIndex, "OtherNullColumn").Type, EValueType::Null); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +class TProtobufFormatCompat + : public ::testing::Test +{ +public: + static TTableSchemaPtr GetEarlySchema() + { + static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ + {"a", OptionalLogicalType(VariantStructLogicalType({ + {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + }))}, + }); + return schema; + } + + static TTableSchemaPtr GetFirstMiddleSchema() + { + static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ + {"a", OptionalLogicalType(VariantStructLogicalType({ + {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, + }))}, + {"b", OptionalLogicalType(StructLogicalType({ + {"x", SimpleLogicalType(ESimpleLogicalValueType::String)}, + }))}, + }); + return schema; + } + + static TTableSchemaPtr GetSecondMiddleSchema() + { + static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ + {"a", OptionalLogicalType(VariantStructLogicalType({ + {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, + }))}, + {"b", OptionalLogicalType(StructLogicalType({ + {"x", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"y", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + }))}, + }); + 
return schema; + } + + static TTableSchemaPtr GetThirdMiddleSchema() + { + static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ + {"a", OptionalLogicalType(VariantStructLogicalType({ + {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, + }))}, + {"b", OptionalLogicalType(StructLogicalType({ + {"x", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"y", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + {"z", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + }))}, + }); + return schema; + } + + static TTableSchemaPtr GetLateSchema() + { + static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ + {"a", OptionalLogicalType(VariantStructLogicalType({ + {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"f3", SimpleLogicalType(ESimpleLogicalValueType::Boolean)}, + }))}, + {"c", OptionalLogicalType(ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Boolean)))}, + {"b", OptionalLogicalType(StructLogicalType({ + {"x", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"y", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + {"z", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + }))}, + }); + return schema; + } + + static TProtobufFormatConfigPtr GetFirstMiddleConfig() + { + static const auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() + .BeginMap().Item("tables").BeginList().Item().BeginMap().Item("columns").BeginList() + .Item().BeginMap() + .Item("name").Value("a") + .Item("field_number").Value(0) + .Item("proto_type").Value("oneof") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("f1") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .EndList() + .EndMap() + .Item().BeginMap() + 
.Item("name").Value("b") + .Item("field_number").Value(2) + .Item("proto_type").Value("structured_message") + .Item("fields") + .BeginList() + .Item().BeginMap() + .Item("name").Value("x") + .Item("field_number").Value(1) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .EndList().EndMap().EndList().EndMap()); + return config; + } + + static TProtobufFormatConfigPtr GetSecondMiddleConfig() + { + static const auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() + .BeginMap().Item("tables").BeginList().Item().BeginMap().Item("columns").BeginList() + .Item().BeginMap() + .Item("name").Value("a") + .Item("field_number").Value(0) + .Item("proto_type").Value("oneof") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("f1") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .Item().BeginMap() + .Item("name").Value("f2") + .Item("field_number").Value(101) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .Item().BeginMap() + .Item("name").Value("b") + .Item("field_number").Value(2) + .Item("proto_type").Value("structured_message") + .Item("fields") + .BeginList() + .Item().BeginMap() + .Item("name").Value("x") + .Item("field_number").Value(1) + .Item("proto_type").Value("string") + .EndMap() + .Item().BeginMap() + .Item("name").Value("y") + .Item("field_number").Value(2) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .EndList().EndMap().EndList().EndMap()); + return config; + } +}; + +template <typename TMessage> +TMessage WriteRow( + TUnversionedRow row, + const TProtobufFormatConfigPtr& config, + const TTableSchemaPtr& schema, + const TNameTablePtr& nameTable) +{ + TString result; + TStringOutput resultStream(result); + + auto writer = CreateWriterForProtobuf( + config, + {schema}, + nameTable, + CreateAsyncAdapter(&resultStream), + true, + New<TControlAttributesConfig>(), + 0); + 
Y_UNUSED(writer->Write(std::vector<TUnversionedRow>{row})); + writer->Close().Get().ThrowOnError(); + + TStringInput input(result); + TLenvalParser lenvalParser(&input); + auto entry = lenvalParser.Next(); + if (!entry) { + THROW_ERROR_EXCEPTION("Unexpected end of stream in lenval parser"); + } + TMessage message; + if (!message.ParseFromString(entry->RowData)) { + THROW_ERROR_EXCEPTION("Failed to parse message"); + } + if (lenvalParser.Next()) { + THROW_ERROR_EXCEPTION("Unexpected entry in lenval parser"); + } + return message; +} + +TEST_F(TProtobufFormatCompat, Write) +{ + auto nameTable = TNameTable::FromSchema(*GetLateSchema()); + auto config = GetSecondMiddleConfig(); + + auto writeRow = [&] (TUnversionedRow row, const TTableSchemaPtr& schema) { + return WriteRow<NYT::TCompatMessage>(row, config, schema, nameTable); + }; + + { + auto earlyRow = MakeRow(nameTable, { + {"a", EValueType::Composite, "[0; -24]"} + }); + + SCOPED_TRACE("early"); + auto message = writeRow(earlyRow, GetEarlySchema()); + EXPECT_EQ(message.f1(), -24); + EXPECT_FALSE(message.has_f2()); + EXPECT_EQ(message.has_b(), false); + } + { + auto firstMiddleRow = MakeRow(nameTable, { + {"a", EValueType::Composite, "[1; foobar]"}, + {"b", EValueType::Composite, "[foo]"}, + }); + + SCOPED_TRACE("firstMiddle"); + auto message = writeRow(firstMiddleRow, GetFirstMiddleSchema()); + EXPECT_FALSE(message.has_f1()); + EXPECT_EQ(message.f2(), "foobar"); + EXPECT_EQ(message.b().x(), "foo"); + EXPECT_EQ(message.b().has_y(), false); + } + { + auto secondMiddleRow = MakeRow(nameTable, { + {"a", EValueType::Composite, "[1; foobar]"}, + {"b", EValueType::Composite, "[foo; bar]"}, + }); + + SCOPED_TRACE("secondMiddle"); + auto message = writeRow(secondMiddleRow, GetSecondMiddleSchema()); + EXPECT_FALSE(message.has_f1()); + EXPECT_EQ(message.f2(), "foobar"); + EXPECT_EQ(message.b().x(), "foo"); + EXPECT_EQ(message.b().y(), "bar"); + } + { + auto thirdMiddleRow = MakeRow(nameTable, { + {"a", EValueType::Composite, 
"[1; foobar]"}, + {"b", EValueType::Composite, "[foo; bar; spam]"}, + }); + + SCOPED_TRACE("thirdMiddle"); + auto message = writeRow(thirdMiddleRow, GetThirdMiddleSchema()); + EXPECT_FALSE(message.has_f1()); + EXPECT_EQ(message.f2(), "foobar"); + EXPECT_EQ(message.b().x(), "foo"); + EXPECT_EQ(message.b().y(), "bar"); + } + { + auto lateRow = MakeRow(nameTable, { + {"a", EValueType::Composite, "[2; %true]"}, + {"c", EValueType::Composite, "[%false; %true; %false]"}, + {"b", EValueType::Composite, "[foo; bar; spam]"}, + }); + + SCOPED_TRACE("late"); + auto message = writeRow(lateRow, GetLateSchema()); + EXPECT_FALSE(message.has_f1()); + EXPECT_FALSE(message.has_f2()); + EXPECT_EQ(message.b().x(), "foo"); + EXPECT_EQ(message.b().y(), "bar"); + } +} + +TEST_F(TProtobufFormatCompat, Parse) +{ + auto config = GetSecondMiddleConfig(); + + NYT::TCompatMessage message; + message.set_f2("Sandiego"); + message.mutable_b()->set_x("foo"); + message.mutable_b()->set_y("bar"); + + { + SCOPED_TRACE("early"); + auto collector = ParseRows(message, config, GetEarlySchema()); + EXPECT_FALSE(collector.FindRowValue(0, "a")); + EXPECT_FALSE(collector.GetNameTable()->FindId("b")); + EXPECT_FALSE(collector.GetNameTable()->FindId("c")); + } + { + SCOPED_TRACE("firstMiddle"); + auto collector = ParseRows(message, config, GetFirstMiddleSchema()); + EXPECT_NODES_EQUAL( + GetComposite(collector.GetRowValue(0, "a")), + ConvertToNode(TYsonString(TStringBuf("[1;Sandiego]")))); + EXPECT_NODES_EQUAL(GetComposite(collector.GetRowValue(0, "b")), ConvertToNode(TYsonString(TStringBuf("[foo]")))); + EXPECT_FALSE(collector.GetNameTable()->FindId("c")); + } + { + SCOPED_TRACE("secondMiddle"); + auto collector = ParseRows(message, config, GetSecondMiddleSchema()); + EXPECT_NODES_EQUAL( + GetComposite(collector.GetRowValue(0, "a")), + ConvertToNode(TYsonString(TStringBuf("[1;Sandiego]")))); + EXPECT_NODES_EQUAL(GetComposite(collector.GetRowValue(0, "b")), ConvertToNode(TYsonString(TStringBuf("[foo;bar]")))); 
+ EXPECT_FALSE(collector.GetNameTable()->FindId("c")); + } + { + SCOPED_TRACE("thirdMiddle"); + auto collector = ParseRows(message, config, GetThirdMiddleSchema()); + EXPECT_NODES_EQUAL( + GetComposite(collector.GetRowValue(0, "a")), + ConvertToNode(TYsonString(TStringBuf("[1;Sandiego]")))); + EXPECT_NODES_EQUAL(GetComposite(collector.GetRowValue(0, "b")), ConvertToNode(TYsonString(TStringBuf("[foo;bar;#]")))); + EXPECT_FALSE(collector.GetNameTable()->FindId("c")); + } + { + SCOPED_TRACE("late"); + auto collector = ParseRows(message, config, GetLateSchema()); + EXPECT_NODES_EQUAL( + GetComposite(collector.GetRowValue(0, "a")), + ConvertToNode(TYsonString(TStringBuf("[1;Sandiego]")))); + EXPECT_NODES_EQUAL(GetComposite(collector.GetRowValue(0, "b")), ConvertToNode(TYsonString(TStringBuf("[foo;bar;#]")))); + EXPECT_TRUE(collector.GetNameTable()->FindId("c")); + } +} + +TEST_F(TProtobufFormatCompat, ParseWrong) +{ + NYT::TCompatMessage message; + message.set_f1(42); + message.mutable_b()->set_x("foo"); + message.mutable_b()->set_y("bar"); + + EXPECT_THROW_WITH_SUBSTRING( + ParseRows(message, GetFirstMiddleConfig(), GetFirstMiddleSchema()), + "Unexpected field number 2"); +} + +//////////////////////////////////////////////////////////////////////////////// + +class TProtobufFormatEnumCompat + : public ::testing::Test +{ +public: + static TTableSchemaPtr CreateTableSchema() + { + static const auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ + {"optional_enum", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + {"required_enum", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"repeated_enum", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + {"packed_repeated_enum", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + {"inner", OptionalLogicalType(StructLogicalType({ + {"optional_enum", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + {"required_enum", 
SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"repeated_enum", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + {"packed_repeated_enum", ListLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))}, + }))}, + }); + return schema; + } + static TProtobufFormatConfigPtr CreateProtobufFormatConfig() + { + static const auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() + .BeginMap() + .Item("enumerations").BeginMap() + .Item("ECompatEnum") + .BeginMap() + .Item("One").Value(1) + .Item("Two").Value(2) + .Item("Three").Value(3) + .EndMap() + .EndMap() + .Item("tables").BeginList().Item().BeginMap().Item("columns").BeginList() + .Item().BeginMap() + .Item("name").Value("optional_enum") + .Item("field_number").Value(1) + .Item("proto_type").Value("enum_string") + .Item("enum_writing_mode").Value("skip_unknown_values") + .Item("enumeration_name").Value("ECompatEnum") + .EndMap() + .Item().BeginMap() + .Item("name").Value("required_enum") + .Item("field_number").Value(2) + .Item("proto_type").Value("enum_string") + .Item("enum_writing_mode").Value("skip_unknown_values") + .Item("enumeration_name").Value("ECompatEnum") + .EndMap() + .Item().BeginMap() + .Item("name").Value("repeated_enum") + .Item("field_number").Value(3) + .Item("proto_type").Value("enum_string") + .Item("repeated").Value(true) + .Item("enum_writing_mode").Value("skip_unknown_values") + .Item("enumeration_name").Value("ECompatEnum") + .EndMap() + .Item().BeginMap() + .Item("name").Value("packed_repeated_enum") + .Item("field_number").Value(4) + .Item("proto_type").Value("enum_string") + .Item("repeated").Value(true) + .Item("packed").Value(true) + .Item("enum_writing_mode").Value("skip_unknown_values") + .Item("enumeration_name").Value("ECompatEnum") + .EndMap() + .Item().BeginMap() + .Item("name").Value("inner") + .Item("field_number").Value(100) + .Item("proto_type").Value("structured_message") + .Item("fields").BeginList() + .Item().BeginMap() 
+ .Item("name").Value("optional_enum") + .Item("field_number").Value(1) + .Item("proto_type").Value("enum_string") + .Item("enum_writing_mode").Value("skip_unknown_values") + .Item("enumeration_name").Value("ECompatEnum") + .EndMap() + .Item().BeginMap() + .Item("name").Value("required_enum") + .Item("field_number").Value(2) + .Item("proto_type").Value("enum_string") + .Item("enum_writing_mode").Value("skip_unknown_values") + .Item("enumeration_name").Value("ECompatEnum") + .EndMap() + .Item().BeginMap() + .Item("name").Value("repeated_enum") + .Item("field_number").Value(3) + .Item("proto_type").Value("enum_string") + .Item("repeated").Value(true) + .Item("enum_writing_mode").Value("skip_unknown_values") + .Item("enumeration_name").Value("ECompatEnum") + .EndMap() + .Item().BeginMap() + .Item("name").Value("packed_repeated_enum") + .Item("field_number").Value(4) + .Item("proto_type").Value("enum_string") + .Item("repeated").Value(true) + .Item("packed").Value(true) + .Item("enum_writing_mode").Value("skip_unknown_values") + .Item("enumeration_name").Value("ECompatEnum") + .EndMap() + .EndList() + .EndMap() + .EndList().EndMap().EndList().EndMap()); + return config; + } + +}; + +TEST_F(TProtobufFormatEnumCompat, WriteCanSkipUnknownEnumValues) +{ + auto schema = CreateTableSchema(); + auto config = CreateProtobufFormatConfig(); + + auto nameTable = TNameTable::FromSchema(*schema); + + auto row = MakeRow(nameTable, { + {"optional_enum", "MinusFortyTwo"}, + {"required_enum", "One"}, + {"repeated_enum", EValueType::Composite, "[MinusFortyTwo;One;MinusFortyTwo]"}, + {"packed_repeated_enum", EValueType::Composite, "[MinusFortyTwo;Two;MinusFortyTwo]"}, + {"inner", EValueType::Composite, "[MinusFortyTwo;Two;[MinusFortyTwo;Two];[One;MinusFortyTwo]]"}, + }); + + auto collectRepeated = [] (const auto& repeated) { + std::vector<TEnumCompat::ECompatEnum> values; + for (auto value : repeated) { + values.push_back(static_cast<TEnumCompat::ECompatEnum>(value)); + } + return 
values; + }; + + auto message = WriteRow<TEnumCompat>(row, config, schema, nameTable); + + EXPECT_FALSE(message.has_optional_enum()); + EXPECT_EQ(message.required_enum(), TEnumCompat::One); + EXPECT_EQ(collectRepeated(message.repeated_enum()), std::vector{TEnumCompat::One}); + EXPECT_EQ(collectRepeated(message.packed_repeated_enum()), std::vector{TEnumCompat::Two}); + + ASSERT_TRUE(message.has_inner()); + EXPECT_FALSE(message.inner().has_optional_enum()); + EXPECT_EQ(message.inner().required_enum(), TEnumCompat::Two); + EXPECT_EQ(collectRepeated(message.inner().repeated_enum()), std::vector{TEnumCompat::Two}); + EXPECT_EQ(collectRepeated(message.inner().packed_repeated_enum()), std::vector{TEnumCompat::One}); +} + +TEST_F(TProtobufFormatEnumCompat, WriteDoesntSkipRequiredFields) +{ + auto schema = CreateTableSchema(); + auto config = CreateProtobufFormatConfig(); + + auto nameTable = TNameTable::FromSchema(*schema); + + { + auto row = MakeRow(nameTable, {{"required_enum", "MinusFortyTwo"}}); + EXPECT_THROW_WITH_SUBSTRING(WriteRow<TEnumCompat>(row, config, schema, nameTable), "Invalid value for enum"); + } + { + auto row = MakeRow(nameTable, {{"inner", EValueType::Composite, "[#;MinusFortyTwo;#;#]"},}); + EXPECT_THROW_WITH_SUBSTRING(WriteRow<TEnumCompat>(row, config, schema, nameTable), "Invalid value for enum"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +class TProtobufFormatRuntimeErrors + : public ::testing::Test +{ +public: + static TTableSchemaPtr GetSchemaWithVariant(bool optional = false) + { + auto variantType = VariantStructLogicalType({ + {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, + }); + return New<TTableSchema>(std::vector<TColumnSchema>{ + {"a", optional ? 
OptionalLogicalType(variantType) : variantType}, + }); + } + + static TTableSchemaPtr GetSchemaWithStruct(bool optional = false) + { + auto structType = StructLogicalType({ + {"f1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"f2", SimpleLogicalType(ESimpleLogicalValueType::String)}, + }); + return New<TTableSchema>(std::vector<TColumnSchema>{ + {"a", optional ? OptionalLogicalType(structType) : structType}, + }); + } + + static TProtobufFormatConfigPtr GetConfigWithVariant() + { + static const auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() + .BeginMap().Item("tables").BeginList().Item().BeginMap().Item("columns").BeginList() + .Item().BeginMap() + .Item("name").Value("a") + .Item("proto_type").Value("oneof") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("f1") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .Item().BeginMap() + .Item("name").Value("f2") + .Item("field_number").Value(2) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .EndList().EndMap().EndList().EndMap()); + return config; + } + + static TProtobufFormatConfigPtr GetConfigWithStruct() + { + static const auto config = ConvertTo<TProtobufFormatConfigPtr>(BuildYsonNodeFluently() + .BeginMap().Item("tables").BeginList().Item().BeginMap().Item("columns").BeginList() + .Item().BeginMap() + .Item("name").Value("a") + .Item("field_number").Value(1) + .Item("proto_type").Value("structured_message") + .Item("fields").BeginList() + .Item().BeginMap() + .Item("name").Value("f1") + .Item("field_number").Value(1) + .Item("proto_type").Value("int64") + .EndMap() + .Item().BeginMap() + .Item("name").Value("f2") + .Item("field_number").Value(2) + .Item("proto_type").Value("string") + .EndMap() + .EndList() + .EndMap() + .EndList().EndMap().EndList().EndMap()); + return config; + } +}; + +TEST_F(TProtobufFormatRuntimeErrors, ParseVariant) +{ + { + SCOPED_TRACE("Optional variant, all 
missing"); + TMessageWithOneof message; + auto collector = ParseRows(message, GetConfigWithVariant(), GetSchemaWithVariant(/* optional */ true)); + EXPECT_FALSE(collector.FindRowValue(0, "a")); + } + { + SCOPED_TRACE("All missing"); + TMessageWithOneof message; + EXPECT_THROW_WITH_SUBSTRING( + ParseRows(message, GetConfigWithVariant(), GetSchemaWithVariant()), + "required field \"<root>.a\" is missing"); + } + { + SCOPED_TRACE("two alternatives"); + TMessageWithStruct::TStruct message; + message.set_f1(5); + message.set_f2("boo"); + EXPECT_THROW_WITH_SUBSTRING( + ParseRows(message, GetConfigWithVariant(), GetSchemaWithVariant()), + "multiple entries for oneof field \"<root>.a\""); + } +} + +TEST_F(TProtobufFormatRuntimeErrors, ParseStruct) +{ + { + SCOPED_TRACE("Optional submessage missing"); + TMessageWithStruct message; + auto collector = ParseRows(message, GetConfigWithStruct(), GetSchemaWithStruct(/* optional */ true)); + EXPECT_FALSE(collector.FindRowValue(0, "a")); + } + { + SCOPED_TRACE("Required submessage missing"); + TMessageWithStruct message; + EXPECT_THROW_WITH_SUBSTRING( + ParseRows(message, GetConfigWithStruct(), GetSchemaWithStruct()), + "required field \"<root>.a\" is missing"); + } + { + SCOPED_TRACE("All fields missing"); + TMessageWithStruct message; + message.mutable_a(); + EXPECT_THROW_WITH_SUBSTRING( + ParseRows(message, GetConfigWithStruct(), GetSchemaWithStruct()), + "required field \"<root>.a.f1\" is missing"); + } + { + SCOPED_TRACE("Second field missing"); + TMessageWithStruct message; + message.mutable_a()->set_f1(17); + EXPECT_THROW_WITH_SUBSTRING( + ParseRows(message, GetConfigWithStruct(), GetSchemaWithStruct()), + "required field \"<root>.a.f2\" is missing"); + } + { + SCOPED_TRACE("All present"); + TMessageWithStruct message; + message.mutable_a()->set_f1(17); + message.mutable_a()->set_f2("foobar"); + auto collector = ParseRows(message, GetConfigWithStruct(), GetSchemaWithStruct()); + EXPECT_NODES_EQUAL( + 
GetComposite(collector.GetRowValue(0, "a")), + ConvertToNode(TYsonString(TStringBuf("[17;foobar]")))); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT diff --git a/yt/yt/library/formats/unittests/protobuf_format_ut.proto b/yt/yt/library/formats/unittests/protobuf_format_ut.proto new file mode 100644 index 0000000000..06258de619 --- /dev/null +++ b/yt/yt/library/formats/unittests/protobuf_format_ut.proto @@ -0,0 +1,255 @@ +import "yt/yt_proto/yt/formats/extension.proto"; + +package NYT.NProtobufFormatTest; + +enum EEnum +{ + One = 1; + Two = 2; + Three = 3; + + MinusFortyTwo = -42; + + MinInt32 = -2147483648; + MaxInt32 = 2147483647; +} + +message TEmbeddedStruct { + optional float float1 = 1; + optional string string1 = 2; +}; + +message TEmbedded2Message { + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional uint64 embedded2_num = 10; + optional TEmbeddedStruct embedded2_struct = 17; + repeated string embedded2_repeated = 42; +}; + +message TEmbedded1Message { + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TEmbedded2Message t2 = 1 [(NYT.flags) = EMBEDDED]; + oneof variant { + string str_variant = 101; + uint64 uint_variant = 102; + } + optional uint64 embedded_num = 10; // make intentional field_num collision! 
+ optional string embedded_extra_field = 11; +}; +message TEmbeddingMessage { + optional bytes other_columns_field = 15 [(NYT.flags) = OTHER_COLUMNS]; + optional TEmbedded1Message t1 = 2 [(NYT.flags) = EMBEDDED]; + optional uint64 num = 12; + optional string extra_field = 13; +}; + +message TEmbeddedMessage +{ + optional string key = 1; + optional string value = 2; +} + +message TMessageWithStructuredEmbedded +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + + message TFirstMessage + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + + optional EEnum enum_field = 1 [(NYT.flags) = ENUM_STRING]; + optional int64 int64_field = 2; + repeated int64 repeated_int64_field = 3; + optional TEmbeddedMessage message_field = 4; + repeated TEmbeddedMessage repeated_message_field = 5; + optional bytes any_int64_field = 6 [(NYT.flags) = ANY]; + optional bytes any_map_field = 7 [(NYT.flags) = ANY]; + optional int64 optional_int64_field = 8; + repeated int64 another_repeated_int64_field = 9; + repeated bytes repeated_optional_any_field = 10 [(NYT.flags) = ANY]; + repeated EEnum packed_repeated_enum_field = 11 [packed=true, (NYT.flags) = ENUM_STRING]; + repeated bool optional_repeated_bool_field = 12; + oneof oneof_field { + string oneof_string_field_1 = 101; + string oneof_string_field = 102; + TEmbeddedMessage oneof_message_field = 1000; + } + oneof optional_oneof_field { + string optional_oneof_string_field_1 = 201; + string optional_oneof_string_field = 202; + TEmbeddedMessage optional_oneof_message_field = 2000; + } + map<int64, TEmbeddedMessage> map_field = 13 [(NYT.flags) = MAP_AS_DICT]; + } + + message TSecondMessage + { + optional int64 one = 2; + optional int64 two = 500000000; + optional int64 three = 100500; + } + + optional TFirstMessage first = 1; + optional TSecondMessage second = 2; + repeated TEmbeddedMessage repeated_message_field = 3; + repeated int64 repeated_int64_field = 4; + optional int64 int64_any_field = 5 [(NYT.column_name) = "any_field"]; 
+ + optional int32 int32_field = 6 [(NYT.column_name) = "int64_field"]; + optional uint32 uint32_field = 7 [(NYT.column_name) = "uint64_field"]; + optional int64 int64_field = 8 [(NYT.column_name) = "int32_field"]; + optional uint64 uint64_field = 9 [(NYT.column_name) = "uint32_field"]; + + optional EEnum enum_int_field = 10 [(NYT.flags) = ENUM_INT]; + optional EEnum enum_string_string_field = 11 [(NYT.flags) = ENUM_STRING]; + optional EEnum enum_string_int64_field = 12 [(NYT.flags) = ENUM_STRING]; + + + repeated int64 another_repeated_int64_field = 13; + + repeated bytes repeated_optional_any_field = 14 [(NYT.flags) = ANY]; + + optional bytes other_columns_field = 15 [(NYT.flags) = OTHER_COLUMNS]; + + optional string utf8_field = 16; + + repeated int64 packed_repeated_int64_field = 17 [packed=true]; + + repeated int64 optional_repeated_int64_field = 18; + + oneof oneof_field { + string oneof_string_field_1 = 101; + string oneof_string_field = 102; + TEmbeddedMessage oneof_message_field = 1000; + } + + oneof optional_oneof_field { + string optional_oneof_string_field_1 = 201; + string optional_oneof_string_field = 202; + TEmbeddedMessage optional_oneof_message_field = 2000; + } + + map<int64, TEmbeddedMessage> map_field = 19 [(NYT.flags) = MAP_AS_DICT]; +} + +message TSeveralTablesMessageFirst +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + + message TEmbedded + { + optional EEnum enum_field = 1 [(NYT.flags) = ENUM_STRING]; + optional int64 int64_field = 2; + } + optional TEmbedded embedded = 1; + repeated int64 repeated_int64_field = 2; + optional int64 int64_field = 3 [(NYT.column_name) = "any_field"]; +} + +message TSeveralTablesMessageSecond +{ + optional EEnum enum_field = 1 [(NYT.flags) = ENUM_STRING]; + optional int64 int64_field = 2; +} + +message TSeveralTablesMessageThird +{ + optional string string_field = 1; +} + +message TMessage +{ + optional double double_field = 1 [(NYT.column_name) = "Double"]; + optional float float_field = 2 
[(NYT.column_name) = "Float"]; + + optional int64 int64_field = 3 [(NYT.column_name) = "Int64"]; + optional uint64 uint64_field = 4 [(NYT.column_name) = "UInt64"]; + optional sint64 sint64_field = 5 [(NYT.column_name) = "SInt64"]; + optional fixed64 fixed64_field = 6 [(NYT.column_name) = "Fixed64"]; + optional sfixed64 sfixed64_field = 7 [(NYT.column_name) = "SFixed64"]; + + optional int32 int32_field = 8 [(NYT.column_name) = "Int32"]; + optional uint32 uint32_field = 9 [(NYT.column_name) = "UInt32"]; + optional sint32 sint32_field = 10 [(NYT.column_name) = "SInt32"]; + optional fixed32 fixed32_field = 11 [(NYT.column_name) = "Fixed32"]; + optional sfixed32 sfixed32_field = 12 [(NYT.column_name) = "SFixed32"]; + + optional bool bool_field = 13 [(NYT.column_name) = "Bool"]; + optional string string_field = 14 [(NYT.column_name) = "String"]; + optional bytes bytes_field = 15 [(NYT.column_name) = "Bytes"]; + + optional EEnum enum_field = 16 [(NYT.column_name) = "Enum", (NYT.flags) = ENUM_STRING]; + optional TEmbeddedMessage message_field = 17 [(NYT.column_name) = "Message"]; + + optional bytes any_field_with_map = 18 [(NYT.column_name) = "AnyWithMap", (NYT.flags) = ANY]; + optional bytes any_field_with_int64 = 19 [(NYT.column_name) = "AnyWithInt64", (NYT.flags) = ANY]; + optional bytes any_field_with_string = 20 [(NYT.column_name) = "AnyWithString", (NYT.flags) = ANY]; + optional bytes other_columns_field = 21 [(NYT.flags) = OTHER_COLUMNS]; + + optional int64 missing_int64_field = 22 [(NYT.column_name) = "MissingInt64"]; +} + +message TCompatMessage +{ + message TEmbedded + { + optional string x = 1; + optional string y = 2; + } + + oneof a { + int64 f1 = 1; + string f2 = 101; + } + optional TEmbedded b = 2; +} + +message TMessageWithOneof +{ + oneof variant { + int64 f1 = 1; + string f2 = 2; + } +} + +message TMessageWithStruct +{ + message TStruct + { + optional int64 f1 = 1; + optional string f2 = 2; + } + optional TStruct a = 1; +} + +message TOtherColumnsMessage 
+{ + optional bytes other_columns_field = 1 [(NYT.flags) = OTHER_COLUMNS]; +} + +message TEnumCompat { + option (NYT.default_field_flags) = SERIALIZATION_YT; + option (NYT.default_field_flags) = ENUM_SKIP_UNKNOWN_VALUES; + + enum ECompatEnum { + One = 1; + Two = 2; + Three = 3; + } + + + message TStruct + { + optional ECompatEnum optional_enum = 1; + required ECompatEnum required_enum = 2; + repeated ECompatEnum repeated_enum = 3; + repeated ECompatEnum packed_repeated_enum = 4 [packed=true, (NYT.flags) = ENUM_STRING]; + } + + optional ECompatEnum optional_enum = 1; + required ECompatEnum required_enum = 2; + repeated ECompatEnum repeated_enum = 3; + repeated ECompatEnum packed_repeated_enum = 4 [packed=true, (NYT.flags) = ENUM_STRING]; + + optional TStruct inner = 100; +} diff --git a/yt/yt/library/formats/unittests/row_helpers.cpp b/yt/yt/library/formats/unittests/row_helpers.cpp new file mode 100644 index 0000000000..61a89d1669 --- /dev/null +++ b/yt/yt/library/formats/unittests/row_helpers.cpp @@ -0,0 +1,70 @@ +#include "row_helpers.h" + +#include <yt/yt/core/yson/string.h> +#include <yt/yt/core/ytree/convert.h> + +namespace NYT { + +using namespace NTableClient; + +//////////////////////////////////////////////////////////////////////////////// + +static void EnsureTypesMatch(EValueType expected, EValueType actual) +{ + if (expected != actual) { + THROW_ERROR_EXCEPTION("Unexpected type of TUnversionedValue: expected %Qlv, actual %Qlv", + expected, + actual); + } +} + +i64 GetInt64(const TUnversionedValue& value) +{ + EnsureTypesMatch(EValueType::Int64, value.Type); + return value.Data.Int64; +} + +ui64 GetUint64(const TUnversionedValue& value) +{ + EnsureTypesMatch(EValueType::Uint64, value.Type); + return value.Data.Uint64; +} + +double GetDouble(const NTableClient::TUnversionedValue& value) +{ + EnsureTypesMatch(EValueType::Double, value.Type); + return value.Data.Double; +} + +bool GetBoolean(const TUnversionedValue& value) +{ + 
EnsureTypesMatch(EValueType::Boolean, value.Type); + return value.Data.Boolean; +} + +TString GetString(const TUnversionedValue& value) +{ + EnsureTypesMatch(EValueType::String, value.Type); + return value.AsString(); +} + +NYTree::INodePtr GetAny(const NTableClient::TUnversionedValue& value) +{ + EnsureTypesMatch(EValueType::Any, value.Type); + return NYTree::ConvertToNode(NYson::TYsonString(value.AsString())); +} + +NYTree::INodePtr GetComposite(const NTableClient::TUnversionedValue& value) +{ + EnsureTypesMatch(EValueType::Composite, value.Type); + return NYTree::ConvertToNode(NYson::TYsonString(value.AsString())); +} + +bool IsNull(const NTableClient::TUnversionedValue& value) +{ + return value.Type == EValueType::Null; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/yt/library/formats/unittests/row_helpers.h b/yt/yt/library/formats/unittests/row_helpers.h new file mode 100644 index 0000000000..fdc3f8b560 --- /dev/null +++ b/yt/yt/library/formats/unittests/row_helpers.h @@ -0,0 +1,111 @@ +#pragma once + +#include <yt/yt/client/table_client/unversioned_row.h> +#include <yt/yt/client/table_client/name_table.h> +#include <yt/yt/client/table_client/schema.h> +#include <yt/yt/client/table_client/value_consumer.h> + +#include <vector> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class TCollectingValueConsumer + : public NTableClient::IValueConsumer +{ +public: + explicit TCollectingValueConsumer(NTableClient::TTableSchemaPtr schema = New<NTableClient::TTableSchema>()) + : Schema_(std::move(schema)) + { } + + explicit TCollectingValueConsumer(NTableClient::TNameTablePtr nameTable, NTableClient::TTableSchemaPtr schema = New<NTableClient::TTableSchema>()) + : Schema_(std::move(schema)) + , NameTable_(std::move(nameTable)) + { } + + const NTableClient::TNameTablePtr& GetNameTable() const override + { + return NameTable_; + } + + const 
NTableClient::TTableSchemaPtr& GetSchema() const override + { + return Schema_; + } + + bool GetAllowUnknownColumns() const override + { + return true; + } + + void OnBeginRow() override + { } + + void OnValue(const NTableClient::TUnversionedValue& value) override + { + Builder_.AddValue(value); + } + + void OnEndRow() override + { + RowList_.emplace_back(Builder_.FinishRow()); + } + + NTableClient::TUnversionedRow GetRow(size_t rowIndex) + { + return RowList_.at(rowIndex); + } + + std::optional<NTableClient::TUnversionedValue> FindRowValue(size_t rowIndex, TStringBuf columnName) const + { + NTableClient::TUnversionedRow row = RowList_.at(rowIndex); + auto id = GetNameTable()->GetIdOrThrow(columnName); + + for (const auto& value : row) { + if (value.Id == id) { + return value; + } + } + return std::nullopt; + } + + NTableClient::TUnversionedValue GetRowValue(size_t rowIndex, TStringBuf columnName) const + { + auto row = FindRowValue(rowIndex, columnName); + if (!row) { + THROW_ERROR_EXCEPTION("Cannot find column %Qv", columnName); + } + return *row; + } + + size_t Size() const + { + return RowList_.size(); + } + + const std::vector<NTableClient::TUnversionedOwningRow>& GetRowList() const { + return RowList_; + } + +private: + const NTableClient::TTableSchemaPtr Schema_; + const NTableClient::TNameTablePtr NameTable_ = New<NTableClient::TNameTable>(); + NTableClient::TUnversionedOwningRowBuilder Builder_; + std::vector<NTableClient::TUnversionedOwningRow> RowList_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +i64 GetInt64(const NTableClient::TUnversionedValue& value); +ui64 GetUint64(const NTableClient::TUnversionedValue& value); +double GetDouble(const NTableClient::TUnversionedValue& value); +bool GetBoolean(const NTableClient::TUnversionedValue& value); +TString GetString(const NTableClient::TUnversionedValue& value); +NYTree::INodePtr GetAny(const NTableClient::TUnversionedValue& value); +NYTree::INodePtr 
GetComposite(const NTableClient::TUnversionedValue& value); +bool IsNull(const NTableClient::TUnversionedValue& value); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/yt/library/formats/unittests/schemaful_dsv_parser_ut.cpp b/yt/yt/library/formats/unittests/schemaful_dsv_parser_ut.cpp new file mode 100644 index 0000000000..875ad5b9f1 --- /dev/null +++ b/yt/yt/library/formats/unittests/schemaful_dsv_parser_ut.cpp @@ -0,0 +1,248 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/core/test_framework/yson_consumer_mock.h> + +#include <yt/yt/library/formats/schemaful_dsv_parser.h> + +#include <yt/yt/core/yson/null_consumer.h> + +namespace NYT::NFormats { +namespace { + +using namespace NYson; + +using ::testing::InSequence; +using ::testing::StrictMock; +using ::testing::NiceMock; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TSchemafulDsvParserTest, Simple) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("5")); + EXPECT_CALL(Mock, OnKeyedItem("b")); + EXPECT_CALL(Mock, OnStringScalar("6")); + EXPECT_CALL(Mock, OnEndMap()); + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("100")); + EXPECT_CALL(Mock, OnKeyedItem("b")); + EXPECT_CALL(Mock, OnStringScalar("max\tignat")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = + "5\t6\n" + "100\tmax\\tignat\n"; + + auto config = New<TSchemafulDsvFormatConfig>(); + config->Columns = {"a", "b"}; + + ParseSchemafulDsv(input, &Mock, config); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TSchemafulDsvParserTest, TableIndex) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + 
EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginAttributes()); + EXPECT_CALL(Mock, OnKeyedItem("table_index")); + EXPECT_CALL(Mock, OnInt64Scalar(1)); + EXPECT_CALL(Mock, OnEndAttributes()); + EXPECT_CALL(Mock, OnEntity()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("x")); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginAttributes()); + EXPECT_CALL(Mock, OnKeyedItem("table_index")); + EXPECT_CALL(Mock, OnInt64Scalar(0)); + EXPECT_CALL(Mock, OnEndAttributes()); + EXPECT_CALL(Mock, OnEntity()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("y")); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("z")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = + "1\tx\n" + "0\ty\n" + "0\tz\n"; + + auto config = New<TSchemafulDsvFormatConfig>(); + config->Columns = {"a"}; + config->EnableTableIndex = true; + + ParseSchemafulDsv(input, &Mock, config); +} + +TEST(TSchemafulDsvParserTest, TooManyRows) +{ + TString input = "5\t6\n"; + + auto config = New<TSchemafulDsvFormatConfig>(); + config->Columns = {"a"}; + + EXPECT_THROW({ ParseSchemafulDsv(input, GetNullYsonConsumer(), config); }, std::exception); +} + +TEST(TSchemafulDsvParserTest, SpecialSymbols) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + auto value = TString("6\0", 2); + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("5\r")); + EXPECT_CALL(Mock, OnKeyedItem("b")); + EXPECT_CALL(Mock, OnStringScalar(value)); + EXPECT_CALL(Mock, OnEndMap()); + + TString input("5\r\t6\0\n", 6); + + auto config = 
New<TSchemafulDsvFormatConfig>(); + config->Columns = {"a", "b"}; + + ParseSchemafulDsv(input, &Mock, config); +} + +TEST(TSchemafulDsvParserTest, EnabledEscaping) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + auto value = TString("6\0", 2); + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("5\r\r")); + EXPECT_CALL(Mock, OnKeyedItem("b")); + EXPECT_CALL(Mock, OnStringScalar(value)); + EXPECT_CALL(Mock, OnEndMap()); + + TString input("5\r\\r\t6\0\n", 8); + + auto config = New<TSchemafulDsvFormatConfig>(); + config->Columns = {"a", "b"}; + config->EnableEscaping = true; + + ParseSchemafulDsv(input, &Mock, config); +} + +TEST(TSchemafulDsvParserTest, DisabledEscaping) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + auto value = TString("6\0", 2); + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("5\r\\r")); + EXPECT_CALL(Mock, OnKeyedItem("b")); + EXPECT_CALL(Mock, OnStringScalar(value)); + EXPECT_CALL(Mock, OnEndMap()); + + TString input("5\r\\r\t6\0\n", 8); + + auto config = New<TSchemafulDsvFormatConfig>(); + config->Columns = {"a", "b"}; + config->EnableEscaping = false; + + ParseSchemafulDsv(input, &Mock, config); +} + +TEST(TSchemafulDsvParserTest, ColumnsNamesHeader) +{ + TString input("a\tb\n1\t2\n"); + + auto config = New<TSchemafulDsvFormatConfig>(); + config->Columns = {"a", "b"}; + config->EnableColumnNamesHeader = true; + + EXPECT_THROW(ParseSchemafulDsv(input, GetNullYsonConsumer(), config), std::exception); +} + +TEST(TSchemafulDsvParserTest, MissingValueModePrintSentinel) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + TString input = "x\t\tz\n"; + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("x")); + EXPECT_CALL(Mock, 
OnKeyedItem("b")); + EXPECT_CALL(Mock, OnStringScalar("")); + EXPECT_CALL(Mock, OnKeyedItem("c")); + EXPECT_CALL(Mock, OnStringScalar("z")); + EXPECT_CALL(Mock, OnEndMap()); + + auto config = New<TSchemafulDsvFormatConfig>(); + config->Columns = {"a", "b", "c"}; + // By default missing_value_mode = fail and no sentinel values are used, + // i. e. there is no way to represent YSON entity with this format. + + ParseSchemafulDsv(input, &Mock, config); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("x")); + EXPECT_CALL(Mock, OnKeyedItem("b")); + EXPECT_CALL(Mock, OnEntity()); + EXPECT_CALL(Mock, OnKeyedItem("c")); + EXPECT_CALL(Mock, OnStringScalar("z")); + EXPECT_CALL(Mock, OnEndMap()); + + config->MissingValueMode = EMissingSchemafulDsvValueMode::PrintSentinel; + // By default missing_value_sentinel = "". + + ParseSchemafulDsv(input, &Mock, config); + + input = "null\tNULL\t\n"; + + config->MissingValueSentinel = "NULL"; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("null")); + EXPECT_CALL(Mock, OnKeyedItem("b")); + EXPECT_CALL(Mock, OnEntity()); + EXPECT_CALL(Mock, OnKeyedItem("c")); + EXPECT_CALL(Mock, OnStringScalar("")); + EXPECT_CALL(Mock, OnEndMap()); + + ParseSchemafulDsv(input, &Mock, config); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/unittests/schemaful_dsv_writer_ut.cpp b/yt/yt/library/formats/unittests/schemaful_dsv_writer_ut.cpp new file mode 100644 index 0000000000..52cd31a1a8 --- /dev/null +++ b/yt/yt/library/formats/unittests/schemaful_dsv_writer_ut.cpp @@ -0,0 +1,346 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include "format_writer_ut.h" + +#include <yt/yt/library/formats/schemaful_dsv_writer.h> +#include 
<yt/yt/library/formats/format.h> + +#include <yt/yt/client/table_client/name_table.h> + +#include <yt/yt/core/concurrency/async_stream.h> + +#include <limits> + +namespace NYT::NFormats { +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +using namespace NYTree; +using namespace NYson; +using namespace NConcurrency; +using namespace NTableClient; + +class TSchemalessWriterForSchemafulDsvTest + : public ::testing::Test +{ +protected: + TNameTablePtr NameTable_; + int KeyAId_; + int KeyBId_; + int KeyCId_; + int KeyDId_; + int TableIndexId_; + int RangeIndexId_; + int RowIndexId_; + TSchemafulDsvFormatConfigPtr Config_; + + ISchemalessFormatWriterPtr Writer_; + + TStringStream OutputStream_; + + TSchemalessWriterForSchemafulDsvTest() + { + NameTable_ = New<TNameTable>(); + KeyAId_ = NameTable_->RegisterName("column_a"); + KeyBId_ = NameTable_->RegisterName("column_b"); + KeyCId_ = NameTable_->RegisterName("column_c"); + KeyDId_ = NameTable_->RegisterName("column_d"); + TableIndexId_ = NameTable_->RegisterName(TableIndexColumnName); + RowIndexId_ = NameTable_->RegisterName(RowIndexColumnName); + RangeIndexId_ = NameTable_->RegisterName(RangeIndexColumnName); + + Config_ = New<TSchemafulDsvFormatConfig>(); + } + + void CreateStandardWriter() + { + auto controlAttributesConfig = New<TControlAttributesConfig>(); + controlAttributesConfig->EnableTableIndex = Config_->EnableTableIndex; + Writer_ = CreateSchemalessWriterForSchemafulDsv( + Config_, + NameTable_, + CreateAsyncAdapter(static_cast<IOutputStream*>(&OutputStream_)), + false, // enableContextSaving + controlAttributesConfig, + 0 /* keyColumnCount */); + } +}; + +TEST_F(TSchemalessWriterForSchemafulDsvTest, Simple) +{ + Config_->Columns = {"column_b", "column_c", "column_a"}; + CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("value_a", KeyAId_)); + row1.AddValue(MakeUnversionedInt64Value(-42, KeyBId_)); + 
row1.AddValue(MakeUnversionedBooleanValue(true, KeyCId_)); + row1.AddValue(MakeUnversionedStringValue("garbage", KeyDId_)); + + // Ignore system columns. + row1.AddValue(MakeUnversionedInt64Value(2, TableIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(42, RowIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(1, RangeIndexId_)); + + TUnversionedRowBuilder row2; + // The order is reversed. + row2.AddValue(MakeUnversionedStringValue("value_c", KeyCId_)); + row2.AddValue(MakeUnversionedBooleanValue(false, KeyBId_)); + row2.AddValue(MakeUnversionedInt64Value(23, KeyAId_)); + + std::vector<TUnversionedRow> rows = {row1.GetRow(), row2.GetRow()}; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString expectedOutput = + "-42\ttrue\tvalue_a\n" + "false\tvalue_c\t23\n"; + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +// This test shows the actual behavior of writer. It is OK to change it in the future. :) +TEST_F(TSchemalessWriterForSchemafulDsvTest, TrickyDoubleRepresentations) +{ + Config_->Columns = {"column_a", "column_b", "column_c", "column_d"}; + CreateStandardWriter(); + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedDoubleValue(1.234567890123456, KeyAId_)); + row1.AddValue(MakeUnversionedDoubleValue(42, KeyBId_)); + row1.AddValue(MakeUnversionedDoubleValue(1e300, KeyCId_)); + row1.AddValue(MakeUnversionedDoubleValue(-1e-300, KeyDId_)); + + std::vector<TUnversionedRow> rows = {row1.GetRow()}; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + TString expectedOutput = "1.234567890123456\t42.\t1e+300\t-1e-300\n"; + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForSchemafulDsvTest, IntegralTypeRepresentations) +{ + Config_->Columns = {"column_a", "column_b", "column_c", "column_d"}; + CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedInt64Value(0LL, KeyAId_)); + 
row1.AddValue(MakeUnversionedInt64Value(-1LL, KeyBId_)); + row1.AddValue(MakeUnversionedInt64Value(1LL, KeyCId_)); + row1.AddValue(MakeUnversionedInt64Value(99LL, KeyDId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedInt64Value(123LL, KeyAId_)); + row2.AddValue(MakeUnversionedInt64Value(-123LL, KeyBId_)); + row2.AddValue(MakeUnversionedInt64Value(1234LL, KeyCId_)); + row2.AddValue(MakeUnversionedInt64Value(-1234LL, KeyDId_)); + + TUnversionedRowBuilder row3; + row3.AddValue(MakeUnversionedUint64Value(0ULL, KeyAId_)); + row3.AddValue(MakeUnversionedUint64Value(98ULL, KeyBId_)); + row3.AddValue(MakeUnversionedUint64Value(987ULL, KeyCId_)); + row3.AddValue(MakeUnversionedUint64Value(9876ULL, KeyDId_)); + + TUnversionedRowBuilder row4; + row4.AddValue(MakeUnversionedInt64Value(std::numeric_limits<i64>::max(), KeyAId_)); + row4.AddValue(MakeUnversionedInt64Value(std::numeric_limits<i64>::min(), KeyBId_)); + row4.AddValue(MakeUnversionedInt64Value(std::numeric_limits<i64>::min() + 1LL, KeyCId_)); + row4.AddValue(MakeUnversionedUint64Value(std::numeric_limits<ui64>::max(), KeyDId_)); + + std::vector<TUnversionedRow> rows = + {row1.GetRow(), row2.GetRow(), row3.GetRow(), row4.GetRow()}; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + TString expectedOutput = + "0\t-1\t1\t99\n" + "123\t-123\t1234\t-1234\n" + "0\t98\t987\t9876\n" + "9223372036854775807\t-9223372036854775808\t-9223372036854775807\t18446744073709551615\n"; + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForSchemafulDsvTest, EmptyColumnList) +{ + Config_->Columns = std::vector<std::string>(); + CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedInt64Value(0LL, KeyAId_)); + + std::vector<TUnversionedRow> rows = { row1.GetRow() }; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + TString expectedOutput = "\n"; + EXPECT_EQ(expectedOutput, 
OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForSchemafulDsvTest, MissingValueMode) +{ + Config_->Columns = {"column_a", "column_b", "column_c"}; + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("Value1A", KeyAId_)); + row1.AddValue(MakeUnversionedStringValue("Value1B", KeyBId_)); + row1.AddValue(MakeUnversionedStringValue("Value1C", KeyCId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("Value2A", KeyAId_)); + row2.AddValue(MakeUnversionedStringValue("Value2C", KeyCId_)); + + TUnversionedRowBuilder row3; + row3.AddValue(MakeUnversionedStringValue("Value3A", KeyAId_)); + row3.AddValue(MakeUnversionedStringValue("Value3B", KeyBId_)); + row3.AddValue(MakeUnversionedStringValue("Value3C", KeyCId_)); + + std::vector<TUnversionedRow> rows = + {row1.GetRow(), row2.GetRow(), row3.GetRow()}; + + { + Config_->MissingValueMode = EMissingSchemafulDsvValueMode::SkipRow; + CreateStandardWriter(); + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + TString expectedOutput = + "Value1A\tValue1B\tValue1C\n" + "Value3A\tValue3B\tValue3C\n"; + EXPECT_EQ(expectedOutput, OutputStream_.Str()); + OutputStream_.Clear(); + } + + { + Config_->MissingValueMode = EMissingSchemafulDsvValueMode::Fail; + CreateStandardWriter(); + EXPECT_EQ(false, Writer_->Write(rows)); + EXPECT_THROW(Writer_->Close() + .Get() + .ThrowOnError(), std::exception); + OutputStream_.Clear(); + } + + { + Config_->MissingValueMode = EMissingSchemafulDsvValueMode::PrintSentinel; + Config_->MissingValueSentinel = "~"; + CreateStandardWriter(); + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + TString expectedOutput = + "Value1A\tValue1B\tValue1C\n" + "Value2A\t~\tValue2C\n" + "Value3A\tValue3B\tValue3C\n"; + EXPECT_EQ(expectedOutput, OutputStream_.Str()); + OutputStream_.Clear(); + } +} + +TEST_F(TSchemalessWriterForSchemafulDsvTest, NameTableExpansion) +{ + Config_->Columns = 
{"Column1"}; + Config_->MissingValueMode = {EMissingSchemafulDsvValueMode::PrintSentinel}; + CreateStandardWriter(); + TestNameTableExpansion(Writer_, NameTable_); +} + +TEST_F(TSchemalessWriterForSchemafulDsvTest, TableIndex) +{ + Config_->Columns = {"column_a", "column_b", "column_c", "column_d"}; + Config_->EnableTableIndex = true; + CreateStandardWriter(); + + TUnversionedRowBuilder row0; + row0.AddValue(MakeUnversionedInt64Value(0LL, KeyAId_)); + row0.AddValue(MakeUnversionedInt64Value(1LL, KeyBId_)); + row0.AddValue(MakeUnversionedInt64Value(2LL, KeyCId_)); + row0.AddValue(MakeUnversionedInt64Value(3LL, KeyDId_)); + + // It's necessary to specify a column corresponding to the table index + // when enable_table_index = true. + EXPECT_EQ(false, Writer_->Write(std::vector<TUnversionedRow>{row0.GetRow()})); + + CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedInt64Value(42LL, TableIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(0LL, KeyAId_)); + row1.AddValue(MakeUnversionedInt64Value(1LL, KeyBId_)); + row1.AddValue(MakeUnversionedInt64Value(2LL, KeyCId_)); + row1.AddValue(MakeUnversionedInt64Value(3LL, KeyDId_)); + + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedInt64Value(42LL, TableIndexId_)); + row2.AddValue(MakeUnversionedInt64Value(4LL, KeyAId_)); + row2.AddValue(MakeUnversionedInt64Value(5LL, KeyBId_)); + row2.AddValue(MakeUnversionedInt64Value(6LL, KeyCId_)); + row2.AddValue(MakeUnversionedInt64Value(7LL, KeyDId_)); + + EXPECT_EQ(true, Writer_->Write(std::vector<TUnversionedRow>{row1.GetRow(), row2.GetRow()})); + + TUnversionedRowBuilder row3; + row3.AddValue(MakeUnversionedInt64Value(23LL, TableIndexId_)); + row3.AddValue(MakeUnversionedUint64Value(8LL, KeyAId_)); + row3.AddValue(MakeUnversionedUint64Value(9LL, KeyBId_)); + row3.AddValue(MakeUnversionedUint64Value(10LL, KeyCId_)); + row3.AddValue(MakeUnversionedUint64Value(11ULL, KeyDId_)); + + EXPECT_EQ(true, 
Writer_->Write(std::vector<TUnversionedRow>{row3.GetRow()})); + + Writer_->Close() + .Get() + .ThrowOnError(); + TString expectedOutput = + "42\t0\t1\t2\t3\n" + "42\t4\t5\t6\t7\n" + "23\t8\t9\t10\t11\n"; + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + + +TEST_F(TSchemalessWriterForSchemafulDsvTest, ValidateDuplicateNames) +{ + Config_->Columns = {"column_a", "column_b", "column_a"}; + Config_->EnableTableIndex = true; + EXPECT_THROW(CreateStandardWriter(), TErrorException); +} + +TEST_F(TSchemalessWriterForSchemafulDsvTest, ColumnsHeader) +{ + Config_->Columns = {"column_b", "column_c", "column_a"}; + Config_->EnableColumnNamesHeader = true; + CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("value_a", KeyAId_)); + row1.AddValue(MakeUnversionedInt64Value(-42, KeyBId_)); + row1.AddValue(MakeUnversionedBooleanValue(true, KeyCId_)); + std::vector<TUnversionedRow> rows = {row1.GetRow()}; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString expectedOutput = + "column_b\tcolumn_c\tcolumn_a\n" + "-42\ttrue\tvalue_a\n"; + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/unittests/skiff_format_ut.cpp b/yt/yt/library/formats/unittests/skiff_format_ut.cpp new file mode 100644 index 0000000000..0f5d416bd5 --- /dev/null +++ b/yt/yt/library/formats/unittests/skiff_format_ut.cpp @@ -0,0 +1,3028 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/library/logical_type_shortcuts/logical_type_shortcuts.h> +#include "value_examples.h" +#include "row_helpers.h" +#include "yson_helpers.h" + +#include <yt/yt/client/formats/config.h> +#include <yt/yt/client/formats/parser.h> +#include <yt/yt/library/formats/skiff_parser.h> +#include <yt/yt/library/formats/skiff_writer.h> +#include 
<yt/yt/library/formats/format.h> +#include <yt/yt/client/table_client/name_table.h> +#include <yt/yt/client/table_client/validate_logical_type.h> + +#include <yt/yt/library/named_value/named_value.h> +#include <yt/yt/library/skiff_ext/schema_match.h> + +#include <yt/yt/core/yson/string.h> +#include <yt/yt/core/ytree/convert.h> +#include <yt/yt/core/ytree/fluent.h> +#include <yt/yt/core/ytree/tree_visitor.h> + +#include <library/cpp/skiff/skiff.h> +#include <library/cpp/skiff/skiff_schema.h> + +#include <util/stream/null.h> +#include <util/string/hex.h> + +namespace NYT { + +namespace { + +using namespace NFormats; +using namespace NNamedValue; +using namespace NSkiff; +using namespace NSkiffExt; +using namespace NTableClient; +using namespace NYTree; +using namespace NYson; + +//////////////////////////////////////////////////////////////////////////////// + +TString ConvertToSkiffSchemaShortDebugString(INodePtr node) +{ + auto skiffFormatConfig = ConvertTo<TSkiffFormatConfigPtr>(std::move(node)); + auto skiffSchemas = ParseSkiffSchemas(skiffFormatConfig->SkiffSchemaRegistry, skiffFormatConfig->TableSkiffSchemas); + TStringStream result; + result << '{'; + for (const auto& schema : skiffSchemas) { + result << GetShortDebugString(schema); + result << ','; + } + result << '}'; + return result.Str(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TString ConvertToYsonTextStringStable(const INodePtr& node) +{ + TStringStream out; + TYsonWriter writer(&out, EYsonFormat::Text); + VisitTree(node, &writer, true, TAttributeFilter()); + writer.Flush(); + return out.Str(); +} + +TTableSchemaPtr CreateSingleValueTableSchema(const TLogicalTypePtr& logicalType) +{ + std::vector<TColumnSchema> columns; + if (logicalType) { + columns.emplace_back("value", logicalType); + + } + auto strict = static_cast<bool>(logicalType); + return New<TTableSchema>(columns, strict); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +TEST(TSkiffSchemaParse, TestAllowedTypes) +{ + EXPECT_EQ( + "{uint64,}", + + ConvertToSkiffSchemaShortDebugString( + BuildYsonNodeFluently() + .BeginMap() + .Item("table_skiff_schemas") + .BeginList() + .Item() + .BeginMap() + .Item("wire_type") + .Value("uint64") + .EndMap() + .EndList() + .EndMap())); + + EXPECT_EQ( + "{string32,}", + + ConvertToSkiffSchemaShortDebugString( + BuildYsonNodeFluently() + .BeginMap() + .Item("table_skiff_schemas") + .BeginList() + .Item() + .BeginMap() + .Item("wire_type") + .Value("string32") + .EndMap() + .EndList() + .EndMap())); + + EXPECT_EQ( + "{variant8<string32;int64;>,}", + + ConvertToSkiffSchemaShortDebugString( + BuildYsonNodeFluently() + .BeginMap() + .Item("table_skiff_schemas") + .BeginList() + .Item() + .BeginMap() + .Item("wire_type") + .Value("variant8") + .Item("children") + .BeginList() + .Item() + .BeginMap() + .Item("wire_type") + .Value("string32") + .EndMap() + .Item() + .BeginMap() + .Item("wire_type") + .Value("int64") + .EndMap() + .EndList() + .EndMap() + .EndList() + .EndMap())); + + EXPECT_EQ( + "{variant8<int64;string32;>,}", + + ConvertToSkiffSchemaShortDebugString( + BuildYsonNodeFluently() + .BeginMap() + .Item("skiff_schema_registry") + .BeginMap() + .Item("item1") + .BeginMap() + .Item("wire_type") + .Value("int64") + .EndMap() + .Item("item2") + .BeginMap() + .Item("wire_type") + .Value("string32") + .EndMap() + .EndMap() + .Item("table_skiff_schemas") + .BeginList() + .Item() + .BeginMap() + .Item("wire_type") + .Value("variant8") + .Item("children") + .BeginList() + .Item().Value("$item1") + .Item().Value("$item2") + .EndList() + .EndMap() + .EndList() + .EndMap())); +} + +TEST(TSkiffSchemaParse, TestRecursiveTypesAreDisallowed) +{ + try { + ConvertToSkiffSchemaShortDebugString( + BuildYsonNodeFluently() + .BeginMap() + .Item("skiff_schema_registry") + .BeginMap() + .Item("item1") + .BeginMap() + 
.Item("wire_type") + .Value("variant8") + .Item("children") + .BeginList() + .Item().Value("$item1") + .EndList() + .EndMap() + .EndMap() + .Item("table_skiff_schemas") + .BeginList() + .Item().Value("$item1") + .EndList() + .EndMap()); + ADD_FAILURE(); + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("recursive types are forbidden")); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TSkiffSchemaDescription, TestDescriptionDerivation) +{ + auto schema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Uint64), + })->SetName("Bar"), + }); + + auto tableDescriptionList = CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); + EXPECT_EQ(std::ssize(tableDescriptionList), 1); + EXPECT_EQ(tableDescriptionList[0].HasOtherColumns, false); + EXPECT_EQ(tableDescriptionList[0].SparseFieldDescriptionList.empty(), true); + + auto denseFieldDescriptionList = tableDescriptionList[0].DenseFieldDescriptionList; + EXPECT_EQ(std::ssize(denseFieldDescriptionList), 2); + + EXPECT_EQ(denseFieldDescriptionList[0].Name(), "Foo"); + EXPECT_EQ(denseFieldDescriptionList[0].ValidatedSimplify(), EWireType::Uint64); +} + +TEST(TSkiffSchemaDescription, TestKeySwitchColumn) +{ + { + auto schema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$key_switch"), + }); + + auto tableDescriptionList = CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); + EXPECT_EQ(std::ssize(tableDescriptionList), 1); + EXPECT_EQ(tableDescriptionList[0].KeySwitchFieldIndex, std::optional<size_t>(1)); + } + { + auto schema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("$key_switch"), + }); + + try { + auto tableDescriptionList = 
CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); + ADD_FAILURE(); + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Column \"$key_switch\" has unexpected Skiff type")); + } + } +} + +TEST(TSkiffSchemaDescription, TestDisallowEmptyNames) +{ + auto schema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), + CreateSimpleTypeSchema(EWireType::Int64)->SetName(""), + }); + + try { + CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); + ADD_FAILURE(); + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("must have a name")); + } +} + +TEST(TSkiffSchemaDescription, TestWrongRowType) +{ + auto schema = CreateRepeatedVariant16Schema({ + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Bar"), + }); + + try { + CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); + ADD_FAILURE(); + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Invalid wire type for table row")); + } +} + +TEST(TSkiffSchemaDescription, TestOtherColumnsOk) +{ + auto schema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Bar"), + CreateSimpleTypeSchema(EWireType::Yson32)->SetName("$other_columns"), + }); + + auto tableDescriptionList = CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); + ASSERT_EQ(std::ssize(tableDescriptionList), 1); + ASSERT_EQ(tableDescriptionList[0].HasOtherColumns, true); +} + +TEST(TSkiffSchemaDescription, TestOtherColumnsWrongType) +{ + auto schema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Bar"), + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("$other_columns"), + }); + + try { + 
CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); + ADD_FAILURE(); + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Invalid wire type for column \"$other_columns\"")); + } +} + +TEST(TSkiffSchemaDescription, TestOtherColumnsWrongPlace) +{ + auto schema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Foo"), + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("$other_columns"), + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("Bar"), + }); + + try { + CreateTableDescriptionList({schema}, RangeIndexColumnName, RowIndexColumnName); + ADD_FAILURE(); + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Invalid placement of special column \"$other_columns\"")); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +ISchemalessFormatWriterPtr CreateSkiffWriter( + std::shared_ptr<TSkiffSchema> skiffSchema, + TNameTablePtr nameTable, + IOutputStream* outputStream, + const std::vector<TTableSchemaPtr>& tableSchemaList, + int keyColumnCount = 0, + bool enableEndOfStream = false) +{ + auto controlAttributesConfig = New<TControlAttributesConfig>(); + controlAttributesConfig->EnableKeySwitch = (keyColumnCount > 0); + controlAttributesConfig->EnableEndOfStream = enableEndOfStream; + return CreateWriterForSkiff( + {std::move(skiffSchema)}, + std::move(nameTable), + tableSchemaList, + NConcurrency::CreateAsyncAdapter(outputStream), + false, + controlAttributesConfig, + keyColumnCount); +} + +TString TableToSkiff( + const TLogicalTypePtr& logicalType, + const std::shared_ptr<TSkiffSchema>& typeSchema, + const TNamedValue::TValue& value) +{ + auto schema = CreateSingleValueTableSchema(logicalType); + auto skiffSchema = CreateTupleSchema({ + typeSchema->SetName("value") + }); + + auto nameTable = New<TNameTable>(); + + TStringStream resultStream; + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, 
{schema}); + + Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {"value", value} + }).Get(), + })); + writer->Close() + .Get() + .ThrowOnError(); + + auto result = resultStream.Str(); + if (!TStringBuf(result).StartsWith(TString(2, '\0'))) { + THROW_ERROR_EXCEPTION("Expected skiff value to start with \\x00\\x00, but prefix is %Qv", + EscapeC(result.substr(0, 2))); + } + + return result.substr(2); +} + +TNamedValue::TValue SkiffToTable( + const TLogicalTypePtr& logicalType, + const std::shared_ptr<TSkiffSchema>& typeSchema, + const TString& skiffValue) +{ + auto schema = CreateSingleValueTableSchema(logicalType); + auto skiffSchema = CreateTupleSchema({ + typeSchema->SetName("value") + }); + auto nameTable = New<TNameTable>(); + + TCollectingValueConsumer rowCollector(schema); + auto parser = CreateParserForSkiff(skiffSchema, &rowCollector); + parser->Read(TString(2, 0)); + parser->Read(skiffValue); + parser->Finish(); + + if (rowCollector.Size() != 1) { + THROW_ERROR_EXCEPTION("Expected 1 row collected, actual %v", + rowCollector.Size()); + } + auto value = rowCollector.GetRowValue(0, "value"); + return TNamedValue::ExtractValue(value); +} + +#define CHECK_BIDIRECTIONAL_CONVERSION(logicalTypeArg, skiffSchemaArg, tableValueArg, hexSkiffArg) \ + do { \ + try { \ + TLogicalTypePtr logicalType = (logicalTypeArg); \ + std::shared_ptr<TSkiffSchema> skiffSchema = (skiffSchemaArg); \ + TNamedValue::TValue tableValue = (tableValueArg); \ + TString hexSkiff = (hexSkiffArg); \ + auto nameTable = New<TNameTable>(); \ + auto actualSkiff = TableToSkiff(logicalType, skiffSchema, tableValue); \ + EXPECT_EQ(HexEncode(actualSkiff), hexSkiff); \ + auto actualValue = SkiffToTable(logicalType, skiffSchema, HexDecode(hexSkiff)); \ + EXPECT_EQ(actualValue, tableValue); \ + } catch (const std::exception& ex) { \ + ADD_FAILURE() << "unexpected exception: " << ex.what(); \ + } \ + } while (0) + +//////////////////////////////////////////////////////////////////////////////// + +void 
TestAllWireTypes(bool useSchema) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("int64"), + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("uint64"), + CreateSimpleTypeSchema(EWireType::Double)->SetName("double_1"), + CreateSimpleTypeSchema(EWireType::Double)->SetName("double_2"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("boolean"), + CreateSimpleTypeSchema(EWireType::String32)->SetName("string32"), + CreateSimpleTypeSchema(EWireType::Nothing)->SetName("null"), + + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64), + })->SetName("opt_int64"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Uint64), + })->SetName("opt_uint64"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Double), + })->SetName("opt_double_1"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Double), + })->SetName("opt_double_2"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Boolean), + })->SetName("opt_boolean"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::String32), + })->SetName("opt_string32"), + }); + std::vector<TTableSchemaPtr> tableSchemas; + if (useSchema) { + tableSchemas.push_back(New<TTableSchema>(std::vector{ + TColumnSchema("int64", EValueType::Int64), + TColumnSchema("uint64", EValueType::Uint64), + TColumnSchema("double_1", EValueType::Double), + TColumnSchema("double_2", ESimpleLogicalValueType::Float), + TColumnSchema("boolean", EValueType::Boolean), + TColumnSchema("string32", EValueType::String), + TColumnSchema("null", EValueType::Null), + TColumnSchema("opt_int64", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))), + 
TColumnSchema("opt_uint64", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Uint64))), + TColumnSchema("opt_double_1", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Double))), + TColumnSchema("opt_double_2", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Float))), + TColumnSchema("opt_boolean", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Boolean))), + TColumnSchema("opt_string32", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::String))), + })); + } else { + tableSchemas.push_back(New<TTableSchema>()); + } + auto nameTable = New<TNameTable>(); + TString result; + { + TStringOutput resultStream(result); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, tableSchemas); + + auto isWriterReady = writer->Write({ + MakeRow(nameTable, { + {"int64", -1}, + {"uint64", 2u}, + {"double_1", 3.0}, + {"double_2", 3.0}, + {"boolean", true}, + {"string32", "four"}, + {"null", nullptr}, + + {"opt_int64", -5}, + {"opt_uint64", 6u}, + {"opt_double_1", 7.0}, + {"opt_double_2", 7.0}, + {"opt_boolean", false}, + {"opt_string32", "eight"}, + {TString(TableIndexColumnName), 0}, + }).Get(), + }); + if (!isWriterReady) { + writer->GetReadyEvent().Get().ThrowOnError(); + } + + Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {"int64", -9}, + {"uint64", 10u}, + {"double_1", 11.0}, + {"double_2", 11.0}, + {"boolean", false}, + {"string32", "twelve"}, + {"null", nullptr}, + + {"opt_int64", nullptr}, + {"opt_uint64", nullptr}, + {"opt_double_1", nullptr}, + {"opt_double_2", nullptr}, + {"opt_boolean", nullptr}, + {"opt_string32", nullptr}, + {TString(TableIndexColumnName), 0}, + }).Get() + })); + + writer->Close() + .Get() + .ThrowOnError(); + } + + TStringInput resultInput(result); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), -1); 
+ ASSERT_EQ(checkedSkiffParser.ParseUint64(), 2u); + // double_1 + ASSERT_EQ(checkedSkiffParser.ParseDouble(), 3.0); + // double_2 + ASSERT_EQ(checkedSkiffParser.ParseDouble(), 3.0); + ASSERT_EQ(checkedSkiffParser.ParseBoolean(), true); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "four"); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), -5); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseUint64(), 6u); + + // double_1 + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseDouble(), 7.0); + + // double_2 + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseDouble(), 7.0); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "eight"); + + // row 1 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), -9); + ASSERT_EQ(checkedSkiffParser.ParseUint64(), 10u); + // double_1 + ASSERT_EQ(checkedSkiffParser.ParseDouble(), 11.0); + // double_2 + ASSERT_EQ(checkedSkiffParser.ParseDouble(), 11.0); + ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "twelve"); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + // double_1 + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + // double_2 + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + + // end + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); +} + +TEST(TSkiffWriter, TestAllWireTypesNoSchema) +{ + TestAllWireTypes(false); +} + +TEST(TSkiffWriter, 
TestAllWireTypesWithSchema) +{ + TestAllWireTypes(true); +} + +class TSkiffYsonWireTypeP + : public ::testing::TestWithParam<std::tuple< + TLogicalTypePtr, + TNamedValue::TValue, + TString + >> +{ +public: + static std::vector<ParamType> GetCases() + { + using namespace NLogicalTypeShortcuts; + std::vector<ParamType> result; + + for (const auto& example : GetPrimitiveValueExamples()) { + result.emplace_back(example.LogicalType, example.Value, example.PrettyYson); + result.emplace_back(nullptr, example.Value, example.PrettyYson); + } + + for (const auto type : TEnumTraits<ESimpleLogicalValueType>::GetDomainValues()) { + auto logicalType = OptionalLogicalType(SimpleLogicalType(type)); + if (IsV3Composite(logicalType)) { + // Optional<Null> is not v1 type + continue; + } + result.emplace_back(logicalType, nullptr, "#"); + } + return result; + } + + static const std::vector<ParamType> Cases; +}; + +const std::vector<TSkiffYsonWireTypeP::ParamType> TSkiffYsonWireTypeP::Cases = TSkiffYsonWireTypeP::GetCases(); + +INSTANTIATE_TEST_SUITE_P( + Cases, + TSkiffYsonWireTypeP, + ::testing::ValuesIn(TSkiffYsonWireTypeP::Cases)); + +TEST_P(TSkiffYsonWireTypeP, Test) +{ + const auto& [logicalType, value, expectedYson] = GetParam(); + TTableSchemaPtr tableSchema; + if (logicalType) { + tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("column", logicalType), + }); + } else { + tableSchema = New<TTableSchema>(); + } + auto skiffTableSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Yson32)->SetName("column"), + }); + auto nameTable = New<TNameTable>(); + TStringStream actualSkiffDataStream; + auto writer = CreateSkiffWriter(skiffTableSchema, nameTable, &actualSkiffDataStream, {tableSchema}); + Y_UNUSED(writer->Write({ + MakeRow(nameTable, {{"column", value}}) + })); + writer->Close() + .Get() + .ThrowOnError(); + + auto actualSkiffData = actualSkiffDataStream.Str(); + { + TMemoryInput in(actualSkiffData); + TCheckedSkiffParser 
parser(CreateVariant16Schema({skiffTableSchema}), &in); + EXPECT_EQ(parser.ParseVariant16Tag(), 0); + auto actualYson = parser.ParseYson32(); + parser.ValidateFinished(); + + EXPECT_EQ(CanonizeYson(actualYson), CanonizeYson(expectedYson)); + } + + TCollectingValueConsumer rowCollector(nameTable); + auto parser = CreateParserForSkiff(skiffTableSchema, tableSchema, &rowCollector); + parser->Read(actualSkiffDataStream.Str()); + parser->Finish(); + auto actualValue = rowCollector.GetRowValue(0, "column"); + EXPECT_EQ(actualValue, TNamedValue("column", value).ToUnversionedValue(nameTable)); +} + +TEST(TSkiffWriter, TestYsonWireType) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson32"), + + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Yson32), + })->SetName("opt_yson32"), + }); + auto nameTable = New<TNameTable>(); + TString result; + { + TStringOutput resultStream(result); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); + + auto write = [&] (TUnversionedRow row) { + if (!writer->Write({row})) { + writer->GetReadyEvent().Get().ThrowOnError(); + } + }; + + // Row 0 (Null) + write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + + {"yson32", nullptr}, + {"opt_yson32", nullptr}, + }).Get(), + }); + + // Row 1 (Int64) + write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + + {"yson32", -5}, + {"opt_yson32", -6}, + }).Get(), + }); + + // Row 2 (Uint64) + write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + + {"yson32", 42u}, + {"opt_yson32", 43u}, + }).Get(), + }); + + // Row 3 (Double) + write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + + {"yson32", 2.7182818}, + {"opt_yson32", 3.1415926}, + }).Get(), + }); + + // Row 4 (Boolean) + write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + + {"yson32", true}, + {"opt_yson32", 
false}, + }).Get(), + }); + + // Row 5 (String) + write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + + {"yson32", "Yin"}, + {"opt_yson32", "Yang"}, + }).Get(), + }); + + // Row 6 (Any) + write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + + {"yson32", EValueType::Any, "{foo=bar;}"}, + {"opt_yson32", EValueType::Any, "{bar=baz;}"}, + }).Get(), + }); + + // Row 7 (missing optional values) + write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + }).Get(), + }); + + writer->Close() + .Get() + .ThrowOnError(); + } + + TStringInput resultInput(result); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + auto parseYson = [] (TCheckedSkiffParser* parser) { + auto yson = TString{parser->ParseYson32()}; + return ConvertToNode(TYsonString(yson)); + }; + + // Row 0 (Null) + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(parseYson(&checkedSkiffParser)->GetType(), ENodeType::Entity); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + + // Row 1 (Int64) + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(parseYson(&checkedSkiffParser)->AsInt64()->GetValue(), -5); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(parseYson(&checkedSkiffParser)->AsInt64()->GetValue(), -6); + + // Row 2 (Uint64) + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(parseYson(&checkedSkiffParser)->AsUint64()->GetValue(), 42u); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(parseYson(&checkedSkiffParser)->AsUint64()->GetValue(), 43u); + + // Row 3 (Double) + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(parseYson(&checkedSkiffParser)->AsDouble()->GetValue(), 2.7182818); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(parseYson(&checkedSkiffParser)->AsDouble()->GetValue(), 3.1415926); + + // Row 4 (Boolean) + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + 
ASSERT_EQ(parseYson(&checkedSkiffParser)->AsBoolean()->GetValue(), true); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(parseYson(&checkedSkiffParser)->AsBoolean()->GetValue(), false); + + // Row 5 (String) + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(parseYson(&checkedSkiffParser)->AsString()->GetValue(), "Yin"); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(parseYson(&checkedSkiffParser)->AsString()->GetValue(), "Yang"); + + // Row 6 (Any) + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(parseYson(&checkedSkiffParser)->AsMap()->GetChildOrThrow("foo")->AsString()->GetValue(), "bar"); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(parseYson(&checkedSkiffParser)->AsMap()->GetChildOrThrow("bar")->AsString()->GetValue(), "baz"); + + // Row 7 (missing values read back as Null) + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(parseYson(&checkedSkiffParser)->GetType(), ENodeType::Entity); + + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + + // end + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); +} + +class TSkiffFormatSmallIntP +: public ::testing::TestWithParam<std::tuple< + std::shared_ptr<TSkiffSchema>, + TLogicalTypePtr, + TNamedValue::TValue, + TString +>> +{ +public: + static std::vector<ParamType> GetCases() + { + using namespace NLogicalTypeShortcuts; + + std::vector<ParamType> result; + + auto addSimpleCase = [&result] ( + EWireType wireType, + const TLogicalTypePtr& logicalType, + auto value, + TStringBuf skiffValue) + { + auto simpleSkiffSchema = CreateSimpleTypeSchema(wireType); + auto simpleSkiffData = TString(2, 0) + skiffValue; + result.emplace_back(simpleSkiffSchema, logicalType, value, simpleSkiffData); + }; + + auto addListCase = [&result] ( + EWireType wireType, + const TLogicalTypePtr& logicalType, + auto value, + TStringBuf skiffValue) + { + auto listSkiffSchema = 
CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(wireType)}); + auto listSkiffData = TString(3, 0) + skiffValue + TString(1, '\xff'); + auto listValue = TNamedValue::TValue{ + TNamedValue::TComposite{ + BuildYsonStringFluently() + .BeginList() + .Item().Value(value) + .EndList().ToString() + } + }; + result.emplace_back(listSkiffSchema, List(logicalType), listValue, listSkiffData); + }; + + auto addSimpleAndListCases = [&] ( + EWireType wireType, + const TLogicalTypePtr& logicalType, + auto value, + TStringBuf skiffValue) + { + addSimpleCase(wireType, logicalType, value, skiffValue); + addListCase(wireType, logicalType, value, skiffValue); + }; + + auto addMultiCase = [&] (EWireType wireType, auto value, TStringBuf skiffValue) { + auto add = [&] (const TLogicalTypePtr& logicalType) { + addSimpleAndListCases(wireType, logicalType, value, skiffValue); + }; + addSimpleCase(wireType, Yson(), value, skiffValue); + + using T = std::decay_t<decltype(value)>; + static_assert(std::is_integral_v<T>); + if constexpr (std::is_signed_v<T>) { + if (std::numeric_limits<i8>::min() <= value && value <= std::numeric_limits<i8>::max()) { + add(Int8()); + } + if (std::numeric_limits<i16>::min() <= value && value <= std::numeric_limits<i16>::max()) { + add(Int16()); + } + if (std::numeric_limits<i32>::min() <= value && value <= std::numeric_limits<i32>::max()) { + add(Int32()); + } + add(Int64()); + } else { + if (value <= std::numeric_limits<ui8>::max()) { + add(Uint8()); + } + if (value <= std::numeric_limits<ui16>::max()) { + add(Uint16()); + } + if (value <= std::numeric_limits<ui32>::max()) { + add(Uint32()); + } + add(Uint64()); + } + }; + addMultiCase(EWireType::Int8, 0, TStringBuf("\x00"sv)); + addMultiCase(EWireType::Int8, 42, TStringBuf("*")); + addMultiCase(EWireType::Int8, -42, TStringBuf("\xd6"sv)); + addMultiCase(EWireType::Int8, 127, TStringBuf("\x7f"sv)); + addMultiCase(EWireType::Int8, -128, TStringBuf("\x80"sv)); + + addMultiCase(EWireType::Int16, 0, 
TStringBuf("\x00\x00"sv)); + addMultiCase(EWireType::Int16, 42, TStringBuf("\x2a\x00"sv)); + addMultiCase(EWireType::Int16, -42, TStringBuf("\xd6\xff"sv)); + addMultiCase(EWireType::Int16, 0x7fff, TStringBuf("\xff\x7f"sv)); + addMultiCase(EWireType::Int16, -0x8000, TStringBuf("\x00\x80"sv)); + + addMultiCase(EWireType::Int32, 0, TStringBuf("\x00\x00\x00\x00"sv)); + addMultiCase(EWireType::Int32, 42, TStringBuf("\x2a\x00\x00\x00"sv)); + addMultiCase(EWireType::Int32, -42, TStringBuf("\xd6\xff\xff\xff"sv)); + addMultiCase(EWireType::Int32, 0x7fffffff, TStringBuf("\xff\xff\xff\x7f"sv)); + addMultiCase(EWireType::Int32, -0x80000000l, TStringBuf("\x00\x00\x00\x80"sv)); + + addMultiCase(EWireType::Uint8, 0ull, TStringBuf("\x00"sv)); + addMultiCase(EWireType::Uint8, 42ull, TStringBuf("*")); + addMultiCase(EWireType::Uint8, 255ull, TStringBuf("\xff"sv)); + + addMultiCase(EWireType::Uint16, 0ull, TStringBuf("\x00\x00"sv)); + addMultiCase(EWireType::Uint16, 42ull, TStringBuf("\x2a\x00"sv)); + addMultiCase(EWireType::Uint16, 0xFFFFull, TStringBuf("\xff\xff"sv)); + + addMultiCase(EWireType::Uint32, 0ull, TStringBuf("\x00\x00\x00\x00"sv)); + addMultiCase(EWireType::Uint32, 42ull, TStringBuf("\x2a\x00\x00\x00"sv)); + addMultiCase(EWireType::Uint32, 0xFFFFFFFFull, TStringBuf("\xff\xff\xff\xff"sv)); + + addSimpleAndListCases(EWireType::Uint16, Date(), 0ull, TStringBuf("\x00\x00"sv)); + addSimpleAndListCases(EWireType::Uint16, Date(), 42ull, TStringBuf("\x2a\x00"sv)); + addSimpleAndListCases(EWireType::Uint16, Date(), DateUpperBound - 1, TStringBuf("\x08\xc2"sv)); + + addSimpleAndListCases(EWireType::Uint32, Datetime(), 0ull, TStringBuf("\x00\x00\x00\x00"sv)); + addSimpleAndListCases(EWireType::Uint32, Datetime(), 42ull, TStringBuf("\x2a\x00\x00\x00"sv)); + addSimpleAndListCases(EWireType::Uint32, Datetime(), DatetimeUpperBound - 1, TStringBuf("\x7f\xdd\xce\xff"sv)); + + addSimpleAndListCases(EWireType::Int64, Date32(), 0ll, TStringBuf("\x00\x00\x00\x00\x00\x00\x00\x00"sv)); + 
addSimpleAndListCases(EWireType::Int64, Date32(), Date32UpperBound - 1, TStringBuf("\x3f\x73\x2e\x03\x00\x00\x00\x00"sv)); + addSimpleAndListCases(EWireType::Int64, Date32(), Date32LowerBound, TStringBuf("\xbf\x8c\xd1\xfc\xff\xff\xff\xff"sv)); + + addSimpleAndListCases(EWireType::Int32, Date32(), 0ll, TStringBuf("\x00\x00\x00\x00"sv)); + addSimpleAndListCases(EWireType::Int32, Date32(), Date32UpperBound - 1, TStringBuf("\x3f\x73\x2e\x03"sv)); + addSimpleAndListCases(EWireType::Int32, Date32(), Date32LowerBound, TStringBuf("\xbf\x8c\xd1\xfc"sv)); + + addSimpleAndListCases(EWireType::Int64, Datetime64(), 0ll, TStringBuf("\x00\x00\x00\x00\x00\x00\x00\x00"sv)); + addSimpleAndListCases(EWireType::Int64, Datetime64(), Datetime64UpperBound - 1, TStringBuf("\xff\xdf\xf0\xbc\x31\x04\x00\x00"sv)); + addSimpleAndListCases(EWireType::Int64, Datetime64(), Datetime64LowerBound, TStringBuf("\x80\xce\x0d\x43\xce\xfb\xff\xff"sv)); + + addSimpleAndListCases(EWireType::Int64, Timestamp64(), 0ll, TStringBuf("\x00\x00\x00\x00\x00\x00\x00\x00"sv)); + addSimpleAndListCases(EWireType::Int64, Timestamp64(), Timestamp64UpperBound - 1, TStringBuf("\xff\xff\xf7\x75\x42\xf1\xff\x3f"sv)); + addSimpleAndListCases(EWireType::Int64, Timestamp64(), Timestamp64LowerBound, TStringBuf("\x00\xa0\x30\x6c\xa9\x0e\x00\xc0"sv)); + + addSimpleAndListCases(EWireType::Int64, Interval64(), 0ll, TStringBuf("\x00\x00\x00\x00\x00\x00\x00\x00"sv)); + addSimpleAndListCases(EWireType::Int64, Interval64(), Interval64UpperBound - 1, TStringBuf("\x00\x60\xc7\x09\x99\xe2\xff\x7f"sv)); + addSimpleAndListCases(EWireType::Int64, Interval64(), -Interval64UpperBound + 1, TStringBuf("\x00\xa0\x38\xf6\x66\x1d\x00\x80"sv)); + + return result; + } + + static const std::vector<ParamType> Cases; +}; + +const std::vector<TSkiffFormatSmallIntP::ParamType> TSkiffFormatSmallIntP::Cases = TSkiffFormatSmallIntP::GetCases(); + +INSTANTIATE_TEST_SUITE_P( + Cases, + TSkiffFormatSmallIntP, + 
::testing::ValuesIn(TSkiffFormatSmallIntP::Cases)); + +TEST_P(TSkiffFormatSmallIntP, Test) +{ + const auto& [skiffValueSchema, logicalType, value, expectedSkiffData] = GetParam(); + + const auto nameTable = New<TNameTable>(); + + TStringStream actualSkiffData; + auto skiffTableSchema = CreateTupleSchema({ + skiffValueSchema->SetName("column") + }); + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("column", logicalType), + }); + auto writer = CreateSkiffWriter(skiffTableSchema, nameTable, &actualSkiffData, {tableSchema}); + Y_UNUSED(writer->Write({ + MakeRow(nameTable, {{"column", value}}) + })); + writer->Close() + .Get() + .ThrowOnError(); + EXPECT_EQ(actualSkiffData.Str(), expectedSkiffData); + + TCollectingValueConsumer rowCollector(nameTable); + auto parser = CreateParserForSkiff(skiffTableSchema, tableSchema, &rowCollector); + parser->Read(expectedSkiffData); + parser->Finish(); + auto actualValue = rowCollector.GetRowValue(0, "column"); + + EXPECT_EQ(actualValue, TNamedValue("common", value).ToUnversionedValue(nameTable)); +} + +TEST(TSkiffWriter, TestBadSmallIntegers) +{ + using namespace NLogicalTypeShortcuts; + auto writeSkiffValue = [] ( + std::shared_ptr<TSkiffSchema>&& typeSchema, + TLogicalTypePtr logicalType, + TNamedValue::TValue value) + { + TStringStream result; + auto skiffSchema = CreateTupleSchema({ + typeSchema->SetName("column") + }); + auto tableSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + TColumnSchema("column", std::move(logicalType)), + }); + auto nameTable = New<TNameTable>(); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &result, {tableSchema}); + Y_UNUSED(writer->Write({ + MakeRow(nameTable, {{"column", std::move(value)}}) + })); + writer->Close() + .Get() + .ThrowOnError(); + return result.Str(); + }; + + EXPECT_THROW_WITH_SUBSTRING( + writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int8), Int64(), 128), + "is out of range for possible values"); + EXPECT_THROW_WITH_SUBSTRING( 
+ writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int8), Int64(), -129), + "is out of range for possible values"); + + EXPECT_THROW_WITH_SUBSTRING( + writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int16), Int64(), 0x8000), + "is out of range for possible values"); + EXPECT_THROW_WITH_SUBSTRING( + writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int16), Int64(), -0x8001), + "is out of range for possible values"); + + EXPECT_THROW_WITH_SUBSTRING( + writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int32), Int64(), 0x80000000ll), + "is out of range for possible values"); + EXPECT_THROW_WITH_SUBSTRING( + writeSkiffValue(CreateSimpleTypeSchema(EWireType::Int32), Int64(), -0x80000001ll), + "is out of range for possible values"); + + EXPECT_THROW_WITH_SUBSTRING( + writeSkiffValue(CreateSimpleTypeSchema(EWireType::Uint8), Uint64(), 256ull), + "is out of range for possible values"); + + EXPECT_THROW_WITH_SUBSTRING( + writeSkiffValue(CreateSimpleTypeSchema(EWireType::Uint16), Uint64(), 0x1FFFFull), + "is out of range for possible values"); + + EXPECT_THROW_WITH_SUBSTRING( + writeSkiffValue(CreateSimpleTypeSchema(EWireType::Uint32), Uint64(), 0x100000000ull), + "is out of range for possible values"); +} + +class TSkiffFormatUuidTestP : public ::testing::TestWithParam<std::tuple< + TNameTablePtr, + TTableSchemaPtr, + std::shared_ptr<TSkiffSchema>, + std::vector<TUnversionedOwningRow>, + TString +>> +{ +public: + static std::vector<ParamType> GetCases() + { + using namespace NLogicalTypeShortcuts; + + auto nameTable = New<TNameTable>(); + const auto stringUuidValue = TStringBuf("\xee\x1f\x37\x70" "\xb9\x93\x64\xb5" "\xe4\xdf\xe9\x03" "\x67\x5c\x30\x62"); + const auto uint128UuidValue = TStringBuf("\x62\x30\x5c\x67" "\x03\xe9\xdf\xe4" "\xb5\x64\x93\xb9" "\x70\x37\x1f\xee"); + + const auto requiredTableSchema = New<TTableSchema>(std::vector<TColumnSchema>{TColumnSchema("uuid", Uuid())}); + const auto optionalTableSchema = 
New<TTableSchema>(std::vector<TColumnSchema>{TColumnSchema("uuid", Optional(Uuid()))}); + + const auto optionalUint128SkiffSchema = CreateTupleSchema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Uint128), + })->SetName("uuid"), + }); + + const auto requiredUint128SkiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Uint128)->SetName("uuid"), + }); + + const auto optionalStringSkiffSchema = CreateTupleSchema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::String32), + })->SetName("uuid"), + }); + + const auto requiredStringSkiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("uuid"), + }); + + std::vector<ParamType> result; + + result.emplace_back( + nameTable, + requiredTableSchema, + requiredUint128SkiffSchema, + std::vector<TUnversionedOwningRow>{ + MakeRow(nameTable, {{"uuid", stringUuidValue}}), + }, + TString(2, '\0') + uint128UuidValue); + + result.emplace_back( + nameTable, + optionalTableSchema, + requiredUint128SkiffSchema, + std::vector<TUnversionedOwningRow>{ + MakeRow(nameTable, {{"uuid", stringUuidValue}}), + }, + TString(2, '\0') + uint128UuidValue); + + result.emplace_back( + nameTable, + requiredTableSchema, + optionalUint128SkiffSchema, + std::vector<TUnversionedOwningRow>{ + MakeRow(nameTable, {{"uuid", stringUuidValue}}), + }, + TString(2, '\0') + "\1" + uint128UuidValue); + + result.emplace_back( + nameTable, + optionalTableSchema, + optionalUint128SkiffSchema, + std::vector<TUnversionedOwningRow>{ + MakeRow(nameTable, {{"uuid", stringUuidValue}}), + }, + TString(2, '\0') + "\1" + uint128UuidValue); + + const TString uuidLen = TString(TStringBuf("\x10\x00\x00\x00"sv)); + + result.emplace_back( + nameTable, + requiredTableSchema, + requiredStringSkiffSchema, + std::vector<TUnversionedOwningRow>{ + MakeRow(nameTable, {{"uuid", stringUuidValue}}), + }, + TString(2, '\0') + 
uuidLen + stringUuidValue); + + result.emplace_back( + nameTable, + optionalTableSchema, + requiredStringSkiffSchema, + std::vector<TUnversionedOwningRow>{ + MakeRow(nameTable, {{"uuid", stringUuidValue}}), + }, + TString(2, '\0') + uuidLen + stringUuidValue); + + result.emplace_back( + nameTable, + requiredTableSchema, + optionalStringSkiffSchema, + std::vector<TUnversionedOwningRow>{ + MakeRow(nameTable, {{"uuid", stringUuidValue}}), + }, + TString(2, '\0') + "\1" + uuidLen + stringUuidValue); + + result.emplace_back( + nameTable, + optionalTableSchema, + optionalStringSkiffSchema, + std::vector<TUnversionedOwningRow>{ + MakeRow(nameTable, {{"uuid", stringUuidValue}}), + }, + TString(2, '\0') + "\1" + uuidLen + stringUuidValue); + + return result; + } + + static const std::vector<ParamType> Cases; +}; + +const std::vector<TSkiffFormatUuidTestP::ParamType> TSkiffFormatUuidTestP::Cases = TSkiffFormatUuidTestP::GetCases(); + +INSTANTIATE_TEST_SUITE_P( + Cases, + TSkiffFormatUuidTestP, + ::testing::ValuesIn(TSkiffFormatUuidTestP::Cases)); + +TEST_P(TSkiffFormatUuidTestP, Test) +{ + const auto& [nameTable, tableSchema, skiffSchema, rows, skiffString] = GetParam(); + + TStringStream result; + std::vector<TUnversionedRow> nonOwningRows; + for (const auto& row : rows) { + nonOwningRows.emplace_back(row); + } + auto skiffWriter = CreateSkiffWriter(skiffSchema, nameTable, &result, {tableSchema}); + Y_UNUSED(skiffWriter->Write(TRange(nonOwningRows))); + skiffWriter->Close().Get().ThrowOnError(); + ASSERT_EQ(result.Str(), skiffString); + + TCollectingValueConsumer rowCollector(nameTable); + auto requiredParser = CreateParserForSkiff(skiffSchema, tableSchema, &rowCollector); + requiredParser->Read(result.Str()); + requiredParser->Finish(); + ASSERT_EQ(rowCollector.GetRowList(), rows); +} + +TEST(TSkiffFormatUuidTest, TestError) +{ + using namespace NLogicalTypeShortcuts; + + auto nameTable = New<TNameTable>(); + auto tableSchema = New<TTableSchema>( + 
std::vector<TColumnSchema>{TColumnSchema("uuid", Optional(Uuid()))}); + + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Uint128)->SetName("uuid"), + }); + + TStringStream result; + auto skiffWriter = CreateSkiffWriter(skiffSchema, nameTable, &result, {tableSchema}); + Y_UNUSED(skiffWriter->Write({ + MakeRow(nameTable, {{"uuid", nullptr}}), + })); + EXPECT_THROW_WITH_SUBSTRING(skiffWriter->Close().Get().ThrowOnError(), + "Unexpected type"); + +} + +class TSkiffWriterSingular + : public ::testing::Test + , public ::testing::WithParamInterface<ESimpleLogicalValueType> +{}; + +INSTANTIATE_TEST_SUITE_P( + Singular, + TSkiffWriterSingular, + ::testing::Values(ESimpleLogicalValueType::Null, ESimpleLogicalValueType::Void)); + +TEST_P(TSkiffWriterSingular, TestOptionalSingular) +{ + const auto singularType = GetParam(); + + auto skiffSchema = CreateTupleSchema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Nothing), + })->SetName("opt_null"), + }); + + auto nameTable = New<TNameTable>(); + const std::vector<TTableSchemaPtr> tableSchemas = { + New<TTableSchema>(std::vector{ + TColumnSchema("opt_null", OptionalLogicalType(SimpleLogicalType(singularType))), + }), + }; + + TString result; + { + TStringOutput resultStream(result); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, tableSchemas); + // Row 0 + auto isReady = writer->Write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"opt_null", nullptr}, + }).Get(), + }); + if (!isReady) { + writer->GetReadyEvent().Get().ThrowOnError(); + } + // Row 1 + Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"opt_null", EValueType::Composite, "[#]"}, + }).Get(), + })); + writer->Close() + .Get() + .ThrowOnError(); + } + + TStringInput resultInput(result); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + 
ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); +} + +TEST(TSkiffWriter, TestRearrange) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("number"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::String32), + })->SetName("eng"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::String32), + })->SetName("rus"), + }); + auto nameTable = New<TNameTable>(); + TString result; + { + TStringOutput resultStream(result); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); + + auto write = [&] (TUnversionedRow row) { + if (!writer->Write({row})) { + writer->GetReadyEvent().Get().ThrowOnError(); + } + }; + + write(MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"number", 1}, + {"eng", "one"}, + {"rus", nullptr}, + }).Get()); + + write(MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"eng", nullptr}, + {"number", 2}, + {"rus", "dva"}, + }).Get()); + + write(MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"rus", "tri"}, + {"eng", "three"}, + {"number", 3}, + }).Get()); + + writer->Close() + .Get() + .ThrowOnError(); + } + + TStringInput resultInput(result); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + + // row 1 + 
ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 2); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "dva"); + + // row 2 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 3); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "three"); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "tri"); + + // end + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); +} + +TEST(TSkiffWriter, TestMissingRequiredField) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("number"), + CreateSimpleTypeSchema(EWireType::String32)->SetName("eng"), + }); + auto nameTable = New<TNameTable>(); + TString result; + try { + TStringOutput resultStream(result); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); + + Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"number", 1}, + }).Get() + })); + writer->Close() + .Get() + .ThrowOnError(); + ADD_FAILURE(); + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Unexpected type of \"eng\" column")); + } +} + +TEST(TSkiffWriter, TestSparse) +{ + auto skiffSchema = CreateTupleSchema({ + CreateRepeatedVariant16Schema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("int64"), + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("uint64"), + CreateSimpleTypeSchema(EWireType::String32)->SetName("string32"), + })->SetName("$sparse_columns"), + }); + + auto nameTable = New<TNameTable>(); + TString result; + TStringOutput resultStream(result); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); + 
+ auto write = [&] (TUnversionedRow row) { + if (!writer->Write({row})) { + writer->GetReadyEvent().Get().ThrowOnError(); + } + }; + + write(MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"int64", -1}, + {"string32", "minus one"}, + }).Get()); + + write(MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"string32", "minus five"}, + {"int64", -5}, + }).Get()); + + write(MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"uint64", 42u}, + }).Get()); + + write(MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"int64", -8}, + {"uint64", nullptr}, + {"string32", nullptr}, + }).Get()); + + write(MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + }).Get()); + + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput resultInput(result); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), -1); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 2); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "minus one"); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); + + // row 1 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 2); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "minus five"); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), -5); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); + + // row 2 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseUint64(), 42u); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); + + // row 3 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + 
ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), -8); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); + + // row 4 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); + + // end + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); +} + +TEST(TSkiffWriter, TestMissingFields) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), + }); + + try { + TStringStream resultStream; + auto nameTable = New<TNameTable>(); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); + + Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"unknown_column", "four"}, + }).Get(), + })); + writer->Close() + .Get() + .ThrowOnError(); + ADD_FAILURE(); + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Column \"unknown_column\" is not described by Skiff schema")); + } + + try { + TStringStream resultStream; + auto nameTable = New<TNameTable>(); + auto unknownColumnId = nameTable->RegisterName("unknown_column"); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{New<TTableSchema>()}); + + ASSERT_TRUE(unknownColumnId < nameTable->GetId("value")); + + Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"unknown_column", "four"}, + }).Get(), + })); + writer->Close() + .Get() + .ThrowOnError(); + ADD_FAILURE(); + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Column \"unknown_column\" is not described by Skiff schema")); + } +} + +TEST(TSkiffWriter, TestOtherColumns) +{ + auto skiffSchema = CreateTupleSchema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64) + 
})->SetName("int64_column"), + CreateSimpleTypeSchema(EWireType::Yson32)->SetName("$other_columns"), + }); + + TStringStream resultStream; + auto nameTable = New<TNameTable>(); + nameTable->RegisterName("string_column"); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}); + + auto write = [&] (TUnversionedRow row) { + if (!writer->Write({row})) { + writer->GetReadyEvent().Get().ThrowOnError(); + } + }; + + // Row 0. + write(MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"string_column", "foo"}, + }).Get()); + + // Row 1. + write(MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"int64_column", 42}, + }).Get()); + + // Row 2. + write(MakeRow(nameTable, { + {TString(TableIndexColumnName), 0}, + {"other_string_column", "bar"}, + }).Get()); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput resultInput(resultStream.Str()); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + auto parseYson = [] (TCheckedSkiffParser* parser) { + auto yson = TString{parser->ParseYson32()}; + return ConvertToYsonTextStringStable(ConvertToNode(TYsonString(yson))); + }; + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + ASSERT_EQ(parseYson(&checkedSkiffParser), "{\"string_column\"=\"foo\";}"); + + // row 1 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 42); + ASSERT_EQ(parseYson(&checkedSkiffParser), "{}"); + + // row 2 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + ASSERT_EQ(parseYson(&checkedSkiffParser), "{\"other_string_column\"=\"bar\";}"); + + // end + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); +} + +TEST(TSkiffWriter, TestKeySwitch) +{ + auto skiffSchema = 
CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$key_switch"), + }); + + TStringStream resultStream; + auto nameTable = New<TNameTable>(); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}, 1); + + auto write = [&] (TUnversionedRow row) { + if (!writer->Write({row})) { + writer->GetReadyEvent().Get().ThrowOnError(); + } + }; + + // Row 0. + write(MakeRow(nameTable, { + {"value", "one"}, + {TString(TableIndexColumnName), 0}, + }).Get()); + // Row 1. + write(MakeRow(nameTable, { + {"value", "one"}, + {TString(TableIndexColumnName), 0}, + }).Get()); + // Row 2. + write(MakeRow(nameTable, { + {"value", "two"}, + {TString(TableIndexColumnName), 0}, + }).Get()); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput resultInput(resultStream.Str()); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + TString buf; + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); + ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); + + // row 1 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); + ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); + + // row 2 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "two"); + ASSERT_EQ(checkedSkiffParser.ParseBoolean(), true); + + // end + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); +} + +TEST(TSkiffWriter, TestEndOfStream) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), + }); + + TStringStream resultStream; + auto nameTable = New<TNameTable>(); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}, 1, true); + + auto 
write = [&] (TUnversionedRow row) { + if (!writer->Write({row})) { + writer->GetReadyEvent().Get().ThrowOnError(); + } + }; + + // Row 0. + write(MakeRow(nameTable, { + {"value", "zero"}, + {TString(TableIndexColumnName), 0}, + }).Get()); + // Row 1. + write(MakeRow(nameTable, { + {"value", "one"}, + {TString(TableIndexColumnName), 0}, + }).Get()); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput resultInput(resultStream.Str()); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + TString buf; + + // Row 0. + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "zero"); + + // Row 1. + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); + + // End of stream. + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0xffff); + + // The End. + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); +} + +TEST(TSkiffWriter, TestRowRangeIndex) +{ + const auto rowAndRangeIndex = CreateTupleSchema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64), + })->SetName("$range_index"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64), + })->SetName("$row_index"), + }); + + struct TRow { + int TableIndex; + std::optional<int> RangeIndex; + std::optional<int> RowIndex; + }; + auto generateUnversionedRow = [] (const TRow& row, const TNameTablePtr& nameTable) { + std::vector<TNamedValue> values = { + {TString(TableIndexColumnName), row.TableIndex}, + }; + if (row.RangeIndex) { + values.emplace_back(TString(RangeIndexColumnName), *row.RangeIndex); + } + if (row.RowIndex) { + values.emplace_back(TString(RowIndexColumnName), *row.RowIndex); + } + return MakeRow(nameTable, values); + }; + + auto skiffWrite = [generateUnversionedRow] (const std::vector<TRow>& rows, 
const std::shared_ptr<TSkiffSchema>& skiffSchema) { + std::vector<TTableSchemaPtr> tableSchemas; + { + THashSet<int> tableIndices; + for (const auto& row : rows) { + tableIndices.insert(row.TableIndex); + } + tableSchemas.assign(tableIndices.size(), New<TTableSchema>()); + } + + + TStringStream resultStream; + auto nameTable = New<TNameTable>(); + auto writer = CreateSkiffWriter( + skiffSchema, + nameTable, + &resultStream, + tableSchemas); + + for (const auto& row : rows) { + if (!writer->Write({generateUnversionedRow(row, nameTable)})) { + writer->GetReadyEvent().Get().ThrowOnError(); + } + } + writer->Close() + .Get() + .ThrowOnError(); + + return HexEncode(resultStream.Str()); + }; + + EXPECT_STREQ( + skiffWrite({ + {0, 0, 0}, + {0, 0, 1}, + {0, 0, 2}, + }, rowAndRangeIndex).data(), + + "0000" "01""00000000""00000000" "01""00000000""00000000" + "0000" "00" "00" + "0000" "00" "00"); + + EXPECT_STREQ( + skiffWrite({ + {0, 0, 0}, + {0, 0, 1}, + {0, 0, 3}, + }, rowAndRangeIndex).data(), + + "0000" "01""00000000""00000000" "01""00000000""00000000" + "0000" "00" "00" + "0000" "00" "01""03000000""00000000"); + + EXPECT_STREQ( + skiffWrite({ + {0, 0, 0}, + {0, 0, 1}, + {0, 1, 2}, + {0, 1, 3}, + }, rowAndRangeIndex).data(), + + "0000" "01""00000000""00000000" "01""00000000""00000000" + "0000" "00" "00" + "0000" "01""01000000""00000000" "01""02000000""00000000" + "0000" "00" "00"); + + EXPECT_THROW_WITH_SUBSTRING(skiffWrite({{0, 0, {}}}, rowAndRangeIndex), "index requested but reader did not return it"); + EXPECT_THROW_WITH_SUBSTRING(skiffWrite({{0, {}, 0}}, rowAndRangeIndex), "index requested but reader did not return it"); + + const auto rowAndRangeIndexAllowMissing = CreateTupleSchema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64), + CreateSimpleTypeSchema(EWireType::Nothing), + })->SetName("$range_index"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + 
CreateSimpleTypeSchema(EWireType::Int64), + CreateSimpleTypeSchema(EWireType::Nothing), + })->SetName("$row_index"), + }); + + EXPECT_STREQ( + skiffWrite({ + {0, 0, 0}, + {0, 0, 1}, + {0, 0, 2}, + }, rowAndRangeIndexAllowMissing).data(), + + "0000" "01""00000000""00000000" "01""00000000""00000000" + "0000" "00" "00" + "0000" "00" "00"); + + EXPECT_STREQ( + skiffWrite({ + {0, 0, 0}, + {0, 0, 1}, + {0, 0, 3}, + }, rowAndRangeIndexAllowMissing).data(), + + "0000" "01""00000000""00000000" "01""00000000""00000000" + "0000" "00" "00" + "0000" "00" "01""03000000""00000000"); + + EXPECT_STREQ( + skiffWrite({ + {0, 0, 0}, + {0, 0, 1}, + {0, 1, 2}, + {0, 1, 3}, + }, rowAndRangeIndexAllowMissing).data(), + + "0000" "01""00000000""00000000" "01""00000000""00000000" + "0000" "00" "00" + "0000" "01""01000000""00000000" "01""02000000""00000000" + "0000" "00" "00"); + + EXPECT_STREQ( + skiffWrite({ + {0, {}, {}}, + {0, {}, {}}, + {0, {}, {}}, + {0, {}, {}}, + }, rowAndRangeIndexAllowMissing).data(), + + "0000" "02" "02" + "0000" "02" "02" + "0000" "02" "02" + "0000" "02" "02"); + + EXPECT_STREQ( + skiffWrite({ + {0, {}, 0}, + {0, {}, 1}, + {0, {}, 3}, + {0, {}, 4}, + }, rowAndRangeIndexAllowMissing).data(), + + "0000" "02" "01""00000000""00000000" + "0000" "02" "00" + "0000" "02" "01""03000000""00000000" + "0000" "02" "00"); + + EXPECT_STREQ( + skiffWrite({ + {0, 0, {}}, + {0, 0, {}}, + {0, 1, {}}, + {0, 1, {}}, + }, rowAndRangeIndexAllowMissing).data(), + + "0000" "01""00000000""00000000" "02" + "0000" "00" "02" + "0000" "01""01000000""00000000" "02" + "0000" "00" "02"); +} + +TEST(TSkiffWriter, TestRowIndexOnlyOrRangeIndexOnly) +{ + std::string columnNameList[] = { + RowIndexColumnName, + RangeIndexColumnName, + }; + + for (const auto& columnName : columnNameList) { + auto skiffSchema = CreateTupleSchema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64), + })->SetName(TString(columnName)), + }); + + TStringStream 
resultStream; + auto nameTable = New<TNameTable>(); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}, 1); + + // Row 0. + Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {TString(columnName), 0}, + }).Get(), + })); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput resultInput(resultStream.Str()); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); + + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); + } +} + +TEST(TSkiffWriter, TestComplexType) +{ + auto skiffSchema = CreateTupleSchema({ + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), + CreateRepeatedVariant8Schema({ + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("x"), + CreateSimpleTypeSchema(EWireType::Int64)->SetName("y"), + }) + })->SetName("points") + })->SetName("value"), + }); + + { + TStringStream resultStream; + auto nameTable = New<TNameTable>(); + auto tableSchema = New<TTableSchema>(std::vector{ + TColumnSchema("value", StructLogicalType({ + {"name", SimpleLogicalType(ESimpleLogicalValueType::String)}, + { + "points", + ListLogicalType( + StructLogicalType({ + {"x", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"y", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + })) + } + })), + }); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{tableSchema}); + + // Row 0. 
+ Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {"value", EValueType::Composite, "[foo;[[0; 1];[2;3]]]"}, + {TString(TableIndexColumnName), 0}, + }).Get(), + })); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput resultInput(resultStream.Str()); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "foo"); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 2); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 3); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), EndOfSequenceTag<ui8>()); + + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); + } +} + +TEST(TSkiffWriter, TestEmptyComplexType) +{ + auto skiffSchema = CreateTupleSchema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), + CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), + }) + })->SetName("value"), + }); + + { + TStringStream resultStream; + auto nameTable = New<TNameTable>(); + auto tableSchema = New<TTableSchema>(std::vector{ + TColumnSchema("value", OptionalLogicalType( + StructLogicalType({ + {"name", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"value", SimpleLogicalType(ESimpleLogicalValueType::String)}, + }))), + }); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{tableSchema}); + + // Row 0. 
+ Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {"value", nullptr}, + {TString(TableIndexColumnName), 0}, + }).Get(), + })); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput resultInput(resultStream.Str()); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 0); + + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); + } +} + +TEST(TSkiffWriter, TestSparseComplexType) +{ + auto skiffSchema = CreateTupleSchema({ + CreateRepeatedVariant16Schema({ + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), + CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), + })->SetName("value"), + })->SetName("$sparse_columns"), + }); + + { + TStringStream resultStream; + auto nameTable = New<TNameTable>(); + auto tableSchema = New<TTableSchema>(std::vector{ + TColumnSchema("value", OptionalLogicalType( + StructLogicalType({ + {"name", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"value", SimpleLogicalType(ESimpleLogicalValueType::String)}, + }))), + }); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{tableSchema}); + + // Row 0. 
+ Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {"value", EValueType::Composite, "[foo;bar;]"}, + {TString(TableIndexColumnName), 0}, + }).Get(), + })); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput resultInput(resultStream.Str()); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "foo"); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "bar"); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); + + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); + } +} + +TEST(TSkiffWriter, TestSparseComplexTypeWithExtraOptional) +{ + auto skiffSchema = CreateTupleSchema({ + CreateRepeatedVariant16Schema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), + CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), + }) + })->SetName("value"), + })->SetName("$sparse_columns"), + }); + + TStringStream resultStream; + auto nameTable = New<TNameTable>(); + auto tableSchema = New<TTableSchema>(std::vector{ + TColumnSchema("value", OptionalLogicalType( + StructLogicalType({ + {"name", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"value", SimpleLogicalType(ESimpleLogicalValueType::String)}, + }))), + }); + + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{tableSchema}); + + // Row 0. 
+ Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {"value", EValueType::Composite, "[foo;bar;]"}, + {TString(TableIndexColumnName), 0}, + }).Get(), + })); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput resultInput(resultStream.Str()); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "foo"); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "bar"); + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), EndOfSequenceTag<ui16>()); + + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); +} + +TEST(TSkiffWriter, TestBadWireTypeForSimpleColumn) +{ + auto skiffSchema = CreateTupleSchema({ + CreateVariant8Schema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Yson32), + }) + })->SetName("opt_yson32"), + }); + auto nameTable = New<TNameTable>(); + TStringStream resultStream; + EXPECT_THROW_WITH_SUBSTRING( + CreateSkiffWriter(skiffSchema, nameTable, &resultStream, std::vector{New<TTableSchema>()}), + "cannot be represented with Skiff schema"); +} + +TEST(TSkiffWriter, TestMissingComplexColumn) +{ + auto optionalSkiffSchema = CreateTupleSchema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Int64)}), + })->SetName("opt_list"), + }); + auto requiredSkiffSchema = CreateTupleSchema({ + CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Int64)})->SetName("opt_list"), + }); + + { // Non optional Skiff schema + auto nameTable = New<TNameTable>(); + EXPECT_THROW_WITH_SUBSTRING( + CreateSkiffWriter(requiredSkiffSchema, nameTable, &Cnull, std::vector{New<TTableSchema>()}), + "cannot be represented with 
Skiff schema"); + } + + { + auto nameTable = New<TNameTable>(); + TStringStream resultStream; + auto writer = CreateSkiffWriter(optionalSkiffSchema, nameTable, &resultStream, std::vector{New<TTableSchema>()}); + Y_UNUSED(writer->Write({ + MakeRow(nameTable, { }).Get(), + MakeRow(nameTable, { + {"opt_list", nullptr}, + }).Get(), + MakeRow(nameTable, { }).Get(), + })); + writer->Close() + .Get() + .ThrowOnError(); + + EXPECT_EQ(HexEncode(resultStream.Str()), "0000" "00" "0000" "00" "0000" "00"); + } +} + +TEST(TSkiffWriter, TestSkippedFields) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("number"), + CreateSimpleTypeSchema(EWireType::Nothing)->SetName("string"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64), + })->SetName(TString(RangeIndexColumnName)), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64), + })->SetName(TString(RowIndexColumnName)), + CreateSimpleTypeSchema(EWireType::Double)->SetName("double"), + }); + auto tableSchema = New<TTableSchema>(std::vector{ + TColumnSchema("number", EValueType::Int64), + TColumnSchema("string", EValueType::String), + TColumnSchema("double", EValueType::Double), + }); + + auto nameTable = New<TNameTable>(); + TString result; + { + TStringOutput resultStream(result); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {tableSchema}); + + if (!writer->Write({ + MakeRow(nameTable, { + {"number", 1}, + {"string", "hello"}, + {TString(RangeIndexColumnName), 0}, + {TString(RowIndexColumnName), 0}, + {"double", 1.5}, + }).Get() + })) + { + writer->GetReadyEvent().Get().ThrowOnError(); + } + Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {"number", 1}, + {TString(RangeIndexColumnName), 5}, + {TString(RowIndexColumnName), 1}, + {"double", 2.5}, + }).Get() + })); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput 
resultInput(result); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); + ASSERT_EQ(checkedSkiffParser.ParseDouble(), 1.5); + // row 1 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 5); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); + ASSERT_EQ(checkedSkiffParser.ParseDouble(), 2.5); + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); + } + +} + +TEST(TSkiffWriter, TestSkippedFieldsOutOfRange) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Nothing)->SetName("string"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64), + })->SetName(TString(RangeIndexColumnName)), + }); + auto tableSchema = New<TTableSchema>(std::vector{ + TColumnSchema("string", EValueType::String), + }); + + auto nameTable = New<TNameTable>(); + TString result; + { + TStringOutput resultStream(result); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {tableSchema}); + + if (!writer->Write({ + MakeRow(nameTable, { + {"string", "hello"}, + {TString(RangeIndexColumnName), 0}, + }).Get() + })) + { + writer->GetReadyEvent().Get().ThrowOnError(); + } + Y_UNUSED(writer->Write({ + MakeRow(nameTable, { + {TString(RangeIndexColumnName), 5}, + }).Get() + })); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput resultInput(result); + TCheckedSkiffParser 
checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); + // row 1 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseVariant8Tag(), 1); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 5); + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); + } + +} + +TEST(TSkiffWriter, TestSkippedFieldsAndKeySwitch) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("value"), + CreateSimpleTypeSchema(EWireType::Nothing)->SetName("skipped"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$key_switch"), + CreateSimpleTypeSchema(EWireType::Int64)->SetName("value1"), + }); + TStringStream resultStream; + auto nameTable = New<TNameTable>(); + auto writer = CreateSkiffWriter(skiffSchema, nameTable, &resultStream, {New<TTableSchema>()}, 1); + + auto write = [&] (TUnversionedRow row) { + if (!writer->Write({row})) { + writer->GetReadyEvent().Get().ThrowOnError(); + } + }; + + // Row 0. + write(MakeRow(nameTable, { + {"value", "one"}, + {"value1", 0}, + {TString(TableIndexColumnName), 0}, + }).Get()); + // Row 1. + write(MakeRow(nameTable, { + {"value", "one"}, + {"value1", 1}, + {TString(TableIndexColumnName), 0}, + }).Get()); + // Row 2. 
+ write(MakeRow(nameTable, { + {"value", "two"}, + {"value1", 2}, + {TString(TableIndexColumnName), 0}, + }).Get()); + writer->Close() + .Get() + .ThrowOnError(); + + TStringInput resultInput(resultStream.Str()); + TCheckedSkiffParser checkedSkiffParser(CreateVariant16Schema({skiffSchema}), &resultInput); + + TString buf; + + // row 0 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); + ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 0); + + // row 1 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "one"); + ASSERT_EQ(checkedSkiffParser.ParseBoolean(), false); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 1); + + // row 2 + ASSERT_EQ(checkedSkiffParser.ParseVariant16Tag(), 0); + ASSERT_EQ(checkedSkiffParser.ParseString32(), "two"); + ASSERT_EQ(checkedSkiffParser.ParseBoolean(), true); + ASSERT_EQ(checkedSkiffParser.ParseInt64(), 2); + + // end + ASSERT_EQ(checkedSkiffParser.HasMoreData(), false); + checkedSkiffParser.ValidateFinished(); + +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TSkiffParser, Simple) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("int64"), + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("uint64"), + CreateSimpleTypeSchema(EWireType::Double)->SetName("double"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("boolean"), + CreateSimpleTypeSchema(EWireType::String32)->SetName("string32"), + CreateSimpleTypeSchema(EWireType::Nothing)->SetName("null"), + + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64), + })->SetName("opt_int64"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Uint64), + })->SetName("opt_uint64"), + CreateVariant8Schema({ + 
CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Double), + })->SetName("opt_double"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Boolean), + })->SetName("opt_boolean"), + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::String32), + })->SetName("opt_string32"), + }); + + TCollectingValueConsumer collectedRows; + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + + TStringStream dataStream; + TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); + + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteInt64(-1); + checkedSkiffWriter.WriteUint64(2); + checkedSkiffWriter.WriteDouble(3.0); + checkedSkiffWriter.WriteBoolean(true); + checkedSkiffWriter.WriteString32("foo"); + + checkedSkiffWriter.WriteVariant8Tag(0); + checkedSkiffWriter.WriteVariant8Tag(0); + checkedSkiffWriter.WriteVariant8Tag(0); + checkedSkiffWriter.WriteVariant8Tag(0); + checkedSkiffWriter.WriteVariant8Tag(0); + + checkedSkiffWriter.Finish(); + + parser->Read(dataStream.Str()); + parser->Finish(); + + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 1); + + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "int64")), -1); + ASSERT_EQ(GetUint64(collectedRows.GetRowValue(0, "uint64")), 2u); + ASSERT_EQ(GetDouble(collectedRows.GetRowValue(0, "double")), 3.0); + ASSERT_EQ(GetBoolean(collectedRows.GetRowValue(0, "boolean")), true); + ASSERT_EQ(GetString(collectedRows.GetRowValue(0, "string32")), "foo"); + ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "null")), true); + + ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "opt_int64")), true); + ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "opt_uint64")), true); + ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "opt_double")), true); + ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "opt_boolean")), true); + 
ASSERT_EQ(IsNull(collectedRows.GetRowValue(0, "opt_string32")), true); +} + +TEST(TSkiffParser, TestOptionalNull) +{ + auto skiffSchema = CreateTupleSchema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Nothing), + })->SetName("opt_null"), + }); + auto nameTable = New<TNameTable>(); + + { + TCollectingValueConsumer collectedRows; + EXPECT_THROW_WITH_SUBSTRING( + CreateParserForSkiff(skiffSchema, &collectedRows), + "cannot be represented with Skiff schema"); + } + + auto tableSchema = New<TTableSchema>(std::vector{ + TColumnSchema("opt_null", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Null))), + }); + + TCollectingValueConsumer collectedRows(tableSchema); + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + + TStringStream dataStream; + TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); + + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteVariant8Tag(0); + + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteVariant8Tag(1); + + checkedSkiffWriter.Finish(); + + parser->Read(dataStream.Str()); + parser->Finish(); + + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); + + ASSERT_EQ(collectedRows.GetRowValue(0, "opt_null").Type, EValueType::Null); +} + +TEST(TSkiffParser, TestSparse) +{ + auto skiffSchema = CreateTupleSchema({ + CreateRepeatedVariant16Schema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("int64"), + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("uint64"), + CreateSimpleTypeSchema(EWireType::String32)->SetName("string32"), + })->SetName("$sparse_columns"), + }); + + TCollectingValueConsumer collectedRows; + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + + TStringStream dataStream; + TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); + + // row 1 + checkedSkiffWriter.WriteVariant16Tag(0); + // sparse fields begin 
+ checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteInt64(-42); + checkedSkiffWriter.WriteVariant16Tag(1); + checkedSkiffWriter.WriteUint64(54); + checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); + + // row 2 + checkedSkiffWriter.WriteVariant16Tag(0); + // sparse fields begin + checkedSkiffWriter.WriteVariant16Tag(2); + checkedSkiffWriter.WriteString32("foo"); + checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); + + checkedSkiffWriter.Finish(); + + parser->Read(dataStream.Str()); + parser->Finish(); + + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); + + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "int64")), -42); + ASSERT_EQ(GetUint64(collectedRows.GetRowValue(0, "uint64")), 54u); + ASSERT_FALSE(collectedRows.FindRowValue(0, "string32")); + + ASSERT_FALSE(collectedRows.FindRowValue(1, "int64")); + ASSERT_FALSE(collectedRows.FindRowValue(1, "uint64")); + ASSERT_EQ(GetString(collectedRows.GetRowValue(1, "string32")), "foo"); +} + +TEST(TSkiffParser, TestYsonWireType) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson"), + }); + + TCollectingValueConsumer collectedRows; + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + + TStringStream dataStream; + TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); + + // Row 0. + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteYson32("-42"); + + // Row 1. + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteYson32("42u"); + + // Row 2. + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteYson32("\"foobar\""); + + // Row 3. + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteYson32("%true"); + + // Row 4. + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteYson32("{foo=bar}"); + + // Row 5. 
+ checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteYson32("#"); + + checkedSkiffWriter.Finish(); + + parser->Read(dataStream.Str()); + parser->Finish(); + + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 6); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "yson")), -42); + ASSERT_EQ(GetUint64(collectedRows.GetRowValue(1, "yson")), 42u); + ASSERT_EQ(GetString(collectedRows.GetRowValue(2, "yson")), "foobar"); + ASSERT_EQ(GetBoolean(collectedRows.GetRowValue(3, "yson")), true); + ASSERT_EQ(GetAny(collectedRows.GetRowValue(4, "yson"))->AsMap()->GetChildOrThrow("foo")->AsString()->GetValue(), "bar"); + ASSERT_EQ(IsNull(collectedRows.GetRowValue(5, "yson")), true); +} + +TEST(TSkiffParser, TestBadYsonWireType) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson"), + }); + + auto parseYsonUsingSkiff = [&] (TStringBuf ysonValue) { + TCollectingValueConsumer collectedRows; + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + TStringStream dataStream; + ASSERT_NO_THROW({ + TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); + + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteYson32(ysonValue); + + checkedSkiffWriter.Finish(); + }); + + parser->Read(dataStream.Str()); + parser->Finish(); + }; + + try { + parseYsonUsingSkiff("[42"); + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Premature end of stream")); + } + + try { + parseYsonUsingSkiff("<foo=bar>42"); + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Table values cannot have top-level attributes")); + } +} + +TEST(TSkiffParser, TestSpecialColumns) +{ + std::shared_ptr<TSkiffSchema> skiffSchemaList[] = { + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$key_switch"), + }), + CreateTupleSchema({ + 
CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$row_switch"), + }), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Yson32)->SetName("yson"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$range_switch"), + }), + }; + + for (const auto& skiffSchema : skiffSchemaList) { + try { + TCollectingValueConsumer collectedRows; + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + } catch (std::exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Skiff parser does not support \"$key_switch\"")); + } + } +} + +TEST(TSkiffParser, TestOtherColumns) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), + CreateSimpleTypeSchema(EWireType::Yson32)->SetName("$other_columns"), + }); + + TCollectingValueConsumer collectedRows; + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + + TStringStream dataStream; + TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); + + // Row 0. + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteString32("row_0"); + checkedSkiffWriter.WriteYson32("{foo=-42;}"); + + // Row 1. + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteString32("row_1"); + checkedSkiffWriter.WriteYson32("{bar=qux;baz={boolean=%false;};}"); + + // Row 2. 
+ checkedSkiffWriter.Finish(); + + parser->Read(dataStream.Str()); + parser->Finish(); + + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); + ASSERT_EQ(GetString(collectedRows.GetRowValue(0, "name")), "row_0"); + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "foo")), -42); + + ASSERT_EQ(GetString(collectedRows.GetRowValue(1, "name")), "row_1"); + ASSERT_EQ(GetString(collectedRows.GetRowValue(1, "bar")), "qux"); + ASSERT_EQ(ConvertToYsonTextStringStable(GetAny(collectedRows.GetRowValue(1, "baz"))), "{\"boolean\"=%false;}"); +} + +TEST(TSkiffParser, TestComplexColumn) +{ + auto skiffSchema = CreateTupleSchema({ + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), + CreateSimpleTypeSchema(EWireType::Int64)->SetName("value"), + })->SetName("column") + }); + + TCollectingValueConsumer collectedRows( + New<TTableSchema>(std::vector{ + TColumnSchema("column", NTableClient::StructLogicalType({ + {"key", NTableClient::SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"value", NTableClient::SimpleLogicalType(ESimpleLogicalValueType::Int64)} + })) + })); + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + + TStringStream dataStream; + TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); + + // Row 0. 
+ checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteString32("row_0"); + checkedSkiffWriter.WriteInt64(42); + + checkedSkiffWriter.Finish(); + + parser->Read(dataStream.Str()); + parser->Finish(); + + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 1); + ASSERT_EQ(ConvertToYsonTextStringStable(GetComposite(collectedRows.GetRowValue(0, "column"))), "[\"row_0\";42;]"); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TSkiffParser, TestEmptyInput) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("column"), + }); + + TCollectingValueConsumer collectedRows; + + { + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + parser->Finish(); + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 0); + } + { + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + parser->Read(""); + parser->Finish(); + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 0); + } + { + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + parser->Read(""); + parser->Read(""); + parser->Finish(); + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 0); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TSkiffParser, ColumnIds) +{ + auto skiffSchema = CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("field_a"), + CreateSimpleTypeSchema(EWireType::Uint64)->SetName("field_b") + }); + + TCollectingValueConsumer collectedRows; + collectedRows.GetNameTable()->GetIdOrRegisterName("field_b"); + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + + TStringStream dataStream; + TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); + + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteInt64(-1); + checkedSkiffWriter.WriteUint64(2); + + checkedSkiffWriter.Finish(); + + parser->Read(dataStream.Str()); + parser->Finish(); + + 
ASSERT_EQ(static_cast<int>(collectedRows.Size()), 1); + + ASSERT_EQ(GetInt64(collectedRows.GetRowValue(0, "field_a")), -1); + ASSERT_EQ(GetUint64(collectedRows.GetRowValue(0, "field_b")), 2u); +} + +TEST(TSkiffParser, TestSparseComplexType) +{ + auto skiffSchema = CreateTupleSchema({ + CreateRepeatedVariant16Schema({ + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("name"), + CreateSimpleTypeSchema(EWireType::Int64)->SetName("value"), + })->SetName("value"), + })->SetName("$sparse_columns"), + }); + + TCollectingValueConsumer collectedRows( + New<TTableSchema>(std::vector{ + TColumnSchema("value", OptionalLogicalType( + StructLogicalType({ + {"name", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"value", SimpleLogicalType(ESimpleLogicalValueType::Int64)} + }))) + })); + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + + TStringStream dataStream; + TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); + + // Row 0. + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteString32("row_0"); + checkedSkiffWriter.WriteInt64(10); + checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); + + // Row 1. 
+ checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); + + checkedSkiffWriter.Finish(); + + parser->Read(dataStream.Str()); + parser->Finish(); + + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); + EXPECT_EQ(ConvertToYsonTextStringStable(GetComposite(collectedRows.GetRowValue(0, "value"))), "[\"row_0\";10;]"); + EXPECT_FALSE(collectedRows.FindRowValue(1, "value")); +} + +TEST(TSkiffParser, TestSparseComplexTypeWithExtraOptional) +{ + auto skiffSchema = CreateTupleSchema({ + CreateRepeatedVariant16Schema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), + CreateSimpleTypeSchema(EWireType::Int64)->SetName("value"), + }) + })->SetName("column"), + })->SetName("$sparse_columns"), + }); + + TCollectingValueConsumer collectedRows( + New<TTableSchema>(std::vector{ + TColumnSchema("column", OptionalLogicalType( + StructLogicalType({ + {"key", NTableClient::SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"value", NTableClient::SimpleLogicalType(ESimpleLogicalValueType::Int64)} + }))) + })); + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + + TStringStream dataStream; + TCheckedSkiffWriter checkedSkiffWriter(CreateVariant16Schema({skiffSchema}), &dataStream); + + // Row 0. + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteVariant8Tag(1); + checkedSkiffWriter.WriteString32("row_0"); + checkedSkiffWriter.WriteInt64(42); + checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); + + // Row 1. 
+ checkedSkiffWriter.WriteVariant16Tag(0); + checkedSkiffWriter.WriteVariant16Tag(EndOfSequenceTag<ui16>()); + + checkedSkiffWriter.Finish(); + + parser->Read(dataStream.Str()); + parser->Finish(); + + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); + ASSERT_EQ(ConvertToYsonTextStringStable(GetComposite(collectedRows.GetRowValue(0, "column"))), "[\"row_0\";42;]"); + ASSERT_FALSE(collectedRows.FindRowValue(1, "column")); +} + + +TEST(TSkiffParser, TestBadWireTypeForSimpleColumn) +{ + auto skiffSchema = CreateTupleSchema({ + CreateVariant8Schema({ + CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Yson32), + }) + })->SetName("opt_yson32"), + }); + + TCollectingValueConsumer collectedRows; + EXPECT_THROW_WITH_SUBSTRING( + CreateParserForSkiff(skiffSchema, &collectedRows), + "cannot be represented with Skiff schema"); +} + +TEST(TSkiffParser, TestEmptyColumns) +{ + auto skiffSchema = CreateTupleSchema({}); + TCollectingValueConsumer collectedRows; + auto parser = CreateParserForSkiff(skiffSchema, &collectedRows); + + parser->Read(TStringBuf("\x00\x00\x00\x00"sv)); + parser->Finish(); + + ASSERT_EQ(static_cast<int>(collectedRows.Size()), 2); +} + +TEST(TSkiffFormat, TestTimestamp) +{ + using namespace NLogicalTypeShortcuts; + CHECK_BIDIRECTIONAL_CONVERSION(Timestamp(), CreateSimpleTypeSchema(EWireType::Uint64), 42ull, "2A000000" "00000000"); + CHECK_BIDIRECTIONAL_CONVERSION(Interval(), CreateSimpleTypeSchema(EWireType::Int64), 42, "2A000000" "00000000"); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT diff --git a/yt/yt/library/formats/unittests/skiff_yson_converter_ut.cpp b/yt/yt/library/formats/unittests/skiff_yson_converter_ut.cpp new file mode 100644 index 0000000000..67e526a9dc --- /dev/null +++ b/yt/yt/library/formats/unittests/skiff_yson_converter_ut.cpp @@ -0,0 +1,707 @@ +#include 
<yt/yt/library/logical_type_shortcuts/logical_type_shortcuts.h> + +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/client/table_client/logical_type.h> +#include <yt/yt/library/formats/skiff_yson_converter.h> + +#include <yt/yt/core/yson/parser.h> +#include <yt/yt/core/yson/pull_parser.h> +#include <yt/yt/core/yson/token_writer.h> +#include <yt/yt/core/yson/writer.h> + +#include <library/cpp/skiff/skiff.h> +#include <library/cpp/skiff/skiff_schema.h> + +#include <util/string/hex.h> + +#include <util/stream/mem.h> + +namespace NYT::NFormats { +namespace { + +using namespace NTableClient; +using namespace NSkiff; +using namespace NYson; +using namespace NTableClient::NLogicalTypeShortcuts; + +//////////////////////////////////////////////////////////////////////////////// + +std::shared_ptr<TSkiffSchema> SkiffOptional(std::shared_ptr<TSkiffSchema> skiffSchema) +{ + return CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + std::move(skiffSchema) + }); +} + +TString ConvertYsonHex( + const TLogicalTypePtr& logicalType, + const std::shared_ptr<TSkiffSchema>& skiffSchema, + TStringBuf ysonString, + const TYsonToSkiffConverterConfig& config = {}) +{ + auto converter = CreateYsonToSkiffConverter( + TComplexTypeFieldDescriptor("test-field", logicalType), + skiffSchema, + config); + + // Yson parsers have a bug when they can't parse some values that end unexpectedly. 
+ TString spacedYsonInput = TString{ysonString} + " "; + + TStringStream out; + { + TCheckedInDebugSkiffWriter writer(skiffSchema, &out); + + TMemoryInput in(spacedYsonInput); + TYsonPullParser pullParser(&in, EYsonType::Node); + TYsonPullParserCursor cursor(&pullParser); + + converter(&cursor, &writer); + + EXPECT_EQ(cursor.GetCurrent().GetType(), EYsonItemType::EndOfStream); + writer.Finish(); + } + + auto result = HexEncode(out.Str()); + result.to_lower(); + return result; +} + +TString ConvertHexToTextYson( + const TLogicalTypePtr& logicalType, + const std::shared_ptr<TSkiffSchema>& skiffSchema, + TStringBuf hexString, + const TSkiffToYsonConverterConfig& config = {}) +{ + auto converter = CreateSkiffToYsonConverter(TComplexTypeFieldDescriptor("test-field", logicalType), skiffSchema, config); + + + TStringStream binaryOut; + { + TString binaryString = HexDecode(hexString); + TMemoryInput in(binaryString); + TCheckedInDebugSkiffParser parser(skiffSchema, &in); + + auto writer = TCheckedInDebugYsonTokenWriter(&binaryOut); + converter(&parser, &writer); + EXPECT_EQ(parser.GetReadBytesCount(), binaryString.size()); + } + binaryOut.Finish(); + + TStringStream out; + { + auto writer = TYsonWriter(&out, EYsonFormat::Text); + ParseYsonStringBuffer(binaryOut.Str(), EYsonType::Node, &writer); + } + out.Finish(); + + return out.Str(); +} + + +#define CHECK_BIDIRECTIONAL_CONVERSION(logicalType, skiffSchema, ysonString, skiffString, ...) 
\ + do { \ + std::tuple<TYsonToSkiffConverterConfig, TSkiffToYsonConverterConfig> cfg = {__VA_ARGS__}; \ + auto actualSkiffString = ConvertYsonHex(logicalType, skiffSchema, ysonString, std::get<0>(cfg)); \ + EXPECT_EQ(actualSkiffString, skiffString) << "Yson -> Skiff conversion error"; \ + auto actualYsonString = ConvertHexToTextYson(logicalType, skiffSchema, skiffString, std::get<1>(cfg)); \ + EXPECT_EQ(actualYsonString, ysonString) << "Skiff -> Yson conversion error"; \ + } while (0) + + +TEST(TYsonSkiffConverterTest, TestSimpleTypes) +{ + CHECK_BIDIRECTIONAL_CONVERSION( + Int8(), + CreateSimpleTypeSchema(EWireType::Int64), + "-42", + "d6ffffff" "ffffffff"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Uint64(), + CreateSimpleTypeSchema(EWireType::Uint64), + "42u", + "2a000000" "00000000"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Uint64(), + CreateSimpleTypeSchema(EWireType::Uint64), + "8u", + "08000000" "00000000"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Bool(), + CreateSimpleTypeSchema(EWireType::Boolean), + "%true", + "01"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Double(), + CreateSimpleTypeSchema(EWireType::Double), + "0.", + "00000000" "00000000"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Float(), + CreateSimpleTypeSchema(EWireType::Double), + "0.", + "00000000" "00000000"); + + CHECK_BIDIRECTIONAL_CONVERSION( + String(), + CreateSimpleTypeSchema(EWireType::String32), + "\"foo\"", + "03000000" "666f6f"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Null(), + CreateSimpleTypeSchema(EWireType::Nothing), + "#", + ""); + + CHECK_BIDIRECTIONAL_CONVERSION( + Uuid(), + CreateSimpleTypeSchema(EWireType::Uint128), + "\"\\xF0\\xF1\\xF2\\xF3\\xF4\\xF5\\xF6\\xF7\\xF8\\xF9\\xFA\\xFB\\xFC\\xFD\\xFE\\xFF\"", + "fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Uuid(), + CreateSimpleTypeSchema(EWireType::String32), + "\"\\xF0\\xF1\\xF2\\xF3\\xF4\\xF5\\xF6\\xF7\\xF8\\xF9\\xFA\\xFB\\xFC\\xFD\\xFE\\xFF\"", + "10000000f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"); +} + 
+TEST(TYsonSkiffConverterTest, TestYson32) +{ + CHECK_BIDIRECTIONAL_CONVERSION( + Yson(), + CreateSimpleTypeSchema(EWireType::Yson32), + "-42", + "02000000" "0253"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Yson(), + CreateSimpleTypeSchema(EWireType::Yson32), + "#", + "01000000" "23"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Yson(), + CreateSimpleTypeSchema(EWireType::Yson32), + "[1;2;[3;];]", + "0e000000" "5b02023b02043b5b02063b5d3b5d"); +} + +TEST(TYsonSkiffConverterTest, TestOptionalTypes) +{ + CHECK_BIDIRECTIONAL_CONVERSION( + Optional(Int64()), + SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), + "-42", + "01" "d6ffffff" "ffffffff"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Optional(Int64()), + SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), + "#", + "00"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Optional(Optional(Bool())), + SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean))), + "[%true;]", + "01" "01" "01"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Optional(Optional(Bool())), + SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean))), + "[#;]", + "01" "00"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Optional(Optional(Bool())), + SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean))), + "#", + "00"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Optional(List(Bool())), + SkiffOptional(CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Boolean)})), + "#", + "00"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Optional(Optional(List(Bool()))), + SkiffOptional( + SkiffOptional( + CreateRepeatedVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Boolean) + }))), + "[[%true;%false;%true;];]", + "01" "01" "0001" "0000" "0001" "ff"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Optional(Optional(List(Bool()))), + SkiffOptional( + SkiffOptional( + CreateRepeatedVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Boolean) + }))), + "[#;]", + "0100"); + + EXPECT_THROW_WITH_SUBSTRING( + ConvertYsonHex( + 
Optional(Optional(Bool())), + SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean)), + " [ %true ] "), + "Optional nesting mismatch"); + + EXPECT_THROW_WITH_SUBSTRING( + ConvertHexToTextYson( + Optional(Bool()), + CreateSimpleTypeSchema(EWireType::Boolean), + "00"), + "Optional nesting mismatch"); + + TYsonToSkiffConverterConfig ysonToSkiffConfig; + ysonToSkiffConfig.AllowOmitTopLevelOptional = true; + + TSkiffToYsonConverterConfig skiffToYsonConfig; + skiffToYsonConfig.AllowOmitTopLevelOptional = true; + + CHECK_BIDIRECTIONAL_CONVERSION( + Optional(Optional(Bool())), + SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean)), + "[%true;]", + "01" "01", + ysonToSkiffConfig, + skiffToYsonConfig); + + CHECK_BIDIRECTIONAL_CONVERSION( + Optional(Optional(Bool())), + SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean)), + "[#;]", + "00", + ysonToSkiffConfig, + skiffToYsonConfig); + + EXPECT_THROW_WITH_SUBSTRING( + ConvertYsonHex( + Optional(Optional(Bool())), + SkiffOptional(CreateSimpleTypeSchema(EWireType::Boolean)), + " # ", + ysonToSkiffConfig), + "value expected to be nonempty"); +} + +TEST(TYsonSkiffConverterTest, TestListTypes) +{ + CHECK_BIDIRECTIONAL_CONVERSION( + List(Bool()), + CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Boolean)}), + "[]", + "ff"); + + CHECK_BIDIRECTIONAL_CONVERSION( + List(Bool()), + CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Boolean)}), + "[%true;%true;%true;]", + "00" "01" "00" "01" "00" "01" "ff"); + + CHECK_BIDIRECTIONAL_CONVERSION( + List(List(Bool())), + CreateRepeatedVariant8Schema({CreateRepeatedVariant8Schema({CreateSimpleTypeSchema(EWireType::Boolean)})}), + "[[];[%true;];[%true;%true;];]", + "00" "ff" "00" "0001ff" "00" "00010001ff" "ff"); +} + +TEST(TYsonSkiffConverterTest, TestStruct) +{ + CHECK_BIDIRECTIONAL_CONVERSION( + Struct( + "key", String(), + "value", Bool()), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), + 
CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), + }), + "[\"true\";%true;]", + "04000000" "74727565" "01"); +} + +TEST(TYsonSkiffConverterTest, TestSkippedFields) +{ + TString skiffString; + skiffString = ConvertYsonHex( + Struct( + "key", String(), + "subkey", Int64(), + "value", Bool()), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), + }), + " [ true ; 1; %true ] "); + EXPECT_EQ(skiffString, "04000000" "74727565" "01"sv); + + skiffString = ConvertYsonHex( + Struct( + "key", String(), + "subkey", Int64(), + "value", Bool()), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("subkey"), + }), + " [ true ; 1; %true ] "); + EXPECT_EQ(skiffString, "01000000" "00000000"sv); + + // Skiff -> Yson must fail here: the non-optional "key" and "value" struct fields are absent from the Skiff schema. + // Without the explicit GTEST_FAIL the test would pass silently if no exception were thrown. + try { + ConvertHexToTextYson( + Struct( + "key", String(), + "subkey", Int64(), + "value", Bool()), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("subkey"), + }), + "01000000" "00000000"); + GTEST_FAIL() << "exception expected"; + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::ContainsRegex("Non optional struct field .* is missing")); + } + + CHECK_BIDIRECTIONAL_CONVERSION( + Struct( + "key", Optional(String()), + "subkey", Int64(), + "value", Optional(Bool())), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Int64)->SetName("subkey"), + }), + "[#;15;#;]", + "0f000000" "00000000"); +} + +TEST(TYsonSkiffConverterTest, TestUnknownSkiffFields) +{ + TString skiffString; + skiffString = ConvertYsonHex( + Struct( + "key", String(), + "subkey", Int64(), + "value", Bool()), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), + SkiffOptional(CreateSimpleTypeSchema(EWireType::String32))->SetName("key2"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), + }), + " [ true ; 1; %true ] "); + EXPECT_EQ(skiffString, "04000000" "74727565" "00" "01"sv); + + skiffString = ConvertYsonHex( + Struct( + "key", 
String(), + "subkey", Int64(), + "value", Bool()), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), + SkiffOptional(CreateSimpleTypeSchema(EWireType::Yson32))->SetName("value2"), + }), + " [ true ; 1; %true ] "); + EXPECT_EQ(skiffString, "04000000" "74727565" "01" "00"sv); + + + try { + ConvertYsonHex( + Struct( + "key", String(), + "subkey", Int64(), + "value", Bool()), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), + CreateSimpleTypeSchema(EWireType::Yson32)->SetName("value2"), + }), + " [ true ; 1; %true ] "); + GTEST_FAIL() << "exception expected"; + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::ContainsRegex("Non optional Skiff field .* is missing corresponding logical struct field")); + } + + try { + ConvertHexToTextYson( + Struct( + "key", String(), + "subkey", Int64(), + "value", Bool()), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32)->SetName("key"), + SkiffOptional(CreateSimpleTypeSchema(EWireType::String32))->SetName("key2"), + CreateSimpleTypeSchema(EWireType::Boolean)->SetName("value"), + }), + "04000000" "74727565" "00" "01"sv); + GTEST_FAIL() << "expected_exception"; + } catch (const std::exception& e) { + EXPECT_THAT(e.what(), testing::ContainsRegex("is not found in logical type")); + } +} + +TEST(TYsonSkiffConverterTest, TestTuple) +{ + CHECK_BIDIRECTIONAL_CONVERSION( + Tuple(String(), Bool()), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32), + CreateSimpleTypeSchema(EWireType::Boolean), + }), + "[\"true\";%true;]", + "04000000" "74727565" "01"); + + CHECK_BIDIRECTIONAL_CONVERSION( + Tuple(Int64(), Optional(Int64())), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Int64), + SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), + }), + "[2;42;]", + "02000000" "00000000" "01" 
"2a000000" "00000000"); +} + +TEST(TYsonSkiffConverterTest, TestTupleSkippedFields) +{ + TString skiffString; + skiffString = ConvertYsonHex( + Tuple(String(), Int64(), Bool()), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32), + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Boolean), + }), + " [ true ; 1; %true ] "); + EXPECT_EQ(skiffString, "04000000" "74727565" "01"sv); + + skiffString = ConvertYsonHex( + Tuple(String(), Int64(), Bool()), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64), + CreateSimpleTypeSchema(EWireType::Nothing), + }), + " [ true ; 1; %true ] "); + EXPECT_EQ(skiffString, "01000000" "00000000"sv); + + skiffString = ConvertYsonHex( + Tuple(Optional(String()), Int64(), Optional(Bool())), + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(EWireType::Int64), + CreateSimpleTypeSchema(EWireType::Nothing) + }), + "[#;15;#;]"); + EXPECT_EQ(skiffString, "0f000000" "00000000"sv); +} + +TEST(TYsonSkiffConverterTest, TestDict) +{ + const auto logicalType = Dict(String(), Int64()); + const auto skiffSchema = CreateRepeatedVariant8Schema({ + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32), + CreateSimpleTypeSchema(EWireType::Int64) + }) + }); + + CHECK_BIDIRECTIONAL_CONVERSION( + logicalType, + skiffSchema, + "[[\"one\";1;];[\"two\";2;];]", + "00" "03000000" "6f6e65" "01000000" "00000000" + "00" "03000000" "74776f" "02000000" "00000000" + "ff"); + + EXPECT_THROW_WITH_SUBSTRING( + ConvertHexToTextYson(logicalType, skiffSchema, "01" "01000000" "6f" "01000000" "00000000" "ff"), + "Unexpected \"repeated_variant8\" tag"); + + EXPECT_THROW_WITH_SUBSTRING( + ConvertHexToTextYson(logicalType, skiffSchema, "00" "01000000" "6f" "01000000" "00000000"), + "Premature end of stream"); +} + +TEST(TYsonSkiffConverterTest, TestTagged) +{ + const auto logicalType = Tagged( + "tag", + Dict(Tagged("tag", 
String()), Int64())); + const auto skiffSchema = CreateRepeatedVariant8Schema({ + CreateTupleSchema({ + CreateSimpleTypeSchema(EWireType::String32), + CreateSimpleTypeSchema(EWireType::Int64) + }) + }); + CHECK_BIDIRECTIONAL_CONVERSION( + logicalType, + skiffSchema, + "[[\"one\";1;];[\"two\";2;];]", + "00" "03000000" "6f6e65" "01000000" "00000000" + "00" "03000000" "74776f" "02000000" "00000000" + "ff"); +} + +TEST(TYsonSkiffConverterTest, TestOptionalVariantSimilarity) +{ + auto logicalType = Optional( + VariantTuple(Null(), Int64())); + + CHECK_BIDIRECTIONAL_CONVERSION( + logicalType, + SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64))), + "[1;42;]", + "01" "01" "2a000000" "00000000"); + + CHECK_BIDIRECTIONAL_CONVERSION( + logicalType, + SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64))), + "[0;#;]", + "01" "00"); + + CHECK_BIDIRECTIONAL_CONVERSION( + logicalType, + SkiffOptional(SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64))), + "#", + "00"); + + TYsonToSkiffConverterConfig ysonToSkiffConfig; + ysonToSkiffConfig.AllowOmitTopLevelOptional = true; + + TSkiffToYsonConverterConfig skiffToYsonConfig; + skiffToYsonConfig.AllowOmitTopLevelOptional = true; + + CHECK_BIDIRECTIONAL_CONVERSION( + logicalType, + SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), + "[1;42;]", + "01" "2a000000" "00000000", + ysonToSkiffConfig, + skiffToYsonConfig); + + CHECK_BIDIRECTIONAL_CONVERSION( + logicalType, + SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), + "[0;#;]", + "00", + ysonToSkiffConfig, + skiffToYsonConfig); + + EXPECT_THROW_WITH_SUBSTRING( + ConvertYsonHex( + logicalType, + SkiffOptional(CreateSimpleTypeSchema(EWireType::Int64)), + "#", + ysonToSkiffConfig), + "value expected to be nonempty"); +} + +class TYsonSkiffConverterTestVariant + : public ::testing::TestWithParam<std::tuple<ELogicalMetatype, EWireType>> +{ +public: + TLogicalTypePtr VariantLogicalType(const std::vector<TLogicalTypePtr>& elements) + { + 
auto [metatype, wireType] = GetParam(); + if (metatype == ELogicalMetatype::VariantTuple) { + return VariantTupleLogicalType(elements); + } else { + std::vector<TStructField> fields; + for (size_t i = 0; i < elements.size(); ++i) { + fields.push_back({Format("field%v", i), elements[i]}); + } + return VariantStructLogicalType(fields); + } + } + + std::shared_ptr<TSkiffSchema> VariantSkiffSchema(std::vector<std::shared_ptr<TSkiffSchema>> elements) + { + for (size_t i = 0; i < elements.size(); ++i) { + elements[i]->SetName(Format("field%v", i)); + } + auto [metatype, wireType] = GetParam(); + if (wireType == EWireType::Variant8) { + return CreateVariant8Schema(std::move(elements)); + } else if (wireType == EWireType::Variant16) { + return CreateVariant16Schema(std::move(elements)); + } + Y_UNREACHABLE(); + } + + TString VariantTagInfix() const + { + auto [metatype, wireType] = GetParam(); + if (wireType == EWireType::Variant16) { + return "00"; + } + return {}; + } +}; + +TEST_P(TYsonSkiffConverterTestVariant, TestVariant) +{ + CHECK_BIDIRECTIONAL_CONVERSION( + VariantLogicalType({ + Int64(), + Bool() + }), + VariantSkiffSchema({ + CreateSimpleTypeSchema(EWireType::Int64), + CreateSimpleTypeSchema(EWireType::Boolean), + }), + "[0;42;]", + "00" + VariantTagInfix() + "2a000000" "00000000"); + + CHECK_BIDIRECTIONAL_CONVERSION( + VariantLogicalType({ + Int64(), + Bool() + }), + VariantSkiffSchema({ + CreateSimpleTypeSchema(EWireType::Int64), + CreateSimpleTypeSchema(EWireType::Boolean), + }), + "[1;%true;]", + "01" + VariantTagInfix() + "01"); +} + +TEST_P(TYsonSkiffConverterTestVariant, TestMalformedVariants) +{ + auto logicalType = VariantLogicalType({ + Bool(), + Int64(), + }); + auto skiffSchema = VariantSkiffSchema({ + CreateSimpleTypeSchema(EWireType::Boolean), + CreateSimpleTypeSchema(EWireType::Int64), + }); + + EXPECT_THROW_WITH_SUBSTRING(ConvertYsonHex(logicalType, skiffSchema, "[2; 42]"), "Yson to Skiff conversion error"); + 
EXPECT_THROW_WITH_SUBSTRING(ConvertYsonHex(logicalType, skiffSchema, "[]"), "Yson to Skiff conversion error"); + EXPECT_THROW_WITH_SUBSTRING(ConvertYsonHex(logicalType, skiffSchema, "[0]"), "Yson to Skiff conversion error"); + + EXPECT_THROW_WITH_SUBSTRING(ConvertHexToTextYson(logicalType, skiffSchema, "02" + VariantTagInfix() + "00"), + "Skiff to Yson conversion error"); +} + +INSTANTIATE_TEST_SUITE_P( + Variants, + TYsonSkiffConverterTestVariant, + ::testing::Combine( + ::testing::ValuesIn({ELogicalMetatype::VariantStruct, ELogicalMetatype::VariantTuple}), + ::testing::ValuesIn({EWireType::Variant8, EWireType::Variant16})) +); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/unittests/value_examples.cpp b/yt/yt/library/formats/unittests/value_examples.cpp new file mode 100644 index 0000000000..da41a6341e --- /dev/null +++ b/yt/yt/library/formats/unittests/value_examples.cpp @@ -0,0 +1,163 @@ +#include "value_examples.h" + +#include <yt/yt/library/logical_type_shortcuts/logical_type_shortcuts.h> + +#include <yt/yt/library/decimal/decimal.h> + +#include <cmath> + +namespace NYT::NTableClient { + +//////////////////////////////////////////////////////////////////////////////// + +using namespace NLogicalTypeShortcuts; +using namespace NNamedValue; + +//////////////////////////////////////////////////////////////////////////////// + +TValueExample::TValueExample(TLogicalTypePtr logicalType, TNamedValue::TValue value, TString prettyYson) + : LogicalType(std::move(logicalType)) + , Value(std::move(value)) + , PrettyYson(std::move(prettyYson)) +{ } + +//////////////////////////////////////////////////////////////////////////////// + +std::vector<TValueExample> GetPrimitiveValueExamples() +{ + static const std::vector<TValueExample> valueExamples = { + TValueExample{Int8(), 0, "0"}, + TValueExample{Int8(), -5, "-5"}, + TValueExample{Int8(), 42, "42"}, + 
TValueExample{Int8(), -128, "-128"}, + TValueExample{Int8(), 127, "127"}, + + TValueExample{Int16(), 0, "0"}, + TValueExample{Int16(), -6, "-6"}, + TValueExample{Int16(), 43, "43"}, + TValueExample{Int16(), 0x7FFF, "32767"}, + TValueExample{Int16(), -0x8000, "-32768"}, + + TValueExample{Int32(), 0, "0"}, + TValueExample{Int32(), -7, "-7"}, + TValueExample{Int32(), 44, "44"}, + TValueExample{Int32(), 0x7FFFFFFF, "2147483647"}, + TValueExample{Int32(), -0x80000000ll, "-2147483648"}, + + TValueExample{Int64(), 0, "0"}, + TValueExample{Int64(), -7, "-7"}, + TValueExample{Int64(), 45, "45"}, + TValueExample{Int64(), 0x7FFFFFFFFFFFFFFFll, "9223372036854775807"}, + TValueExample{Int64(), i64(-0x8000000000000000ll), "-9223372036854775808"}, + + TValueExample{Uint8(), 0ull, "0u"}, + TValueExample{Uint8(), 46ull, "46u"}, + TValueExample{Uint8(), 255ull, "255u"}, + + TValueExample{Uint16(), 0ull, "0u"}, + TValueExample{Uint16(), 47ull, "47u"}, + TValueExample{Uint16(), 0xFFFFull, "65535u"}, + + TValueExample{Uint32(), 0ull, "0u"}, + TValueExample{Uint32(), 48ull, "48u"}, + TValueExample{Uint32(), 0xFFFFFFFFull, "4294967295u"}, + + TValueExample{Uint64(), 0ull, "0u"}, + TValueExample{Uint64(), 49ull, "49u"}, + TValueExample{Uint64(), 0xFFFFFFFFFFFFFFFFull, "18446744073709551615u"}, + + TValueExample{String(), "", R"("")"}, + TValueExample{String(), "foo", R"("foo")"}, + TValueExample{String(), TString(TStringBuf("\xf0\x00"sv)), R"("\xf0\x00")"}, + + TValueExample{Utf8(), "", R"("")"}, + TValueExample{Utf8(), "bar", R"("bar")"}, + + TValueExample{Bool(), true, "%true"}, + TValueExample{Bool(), false, "%false"}, + + // NB. 
.125 = 1 / 8 is exactly representable in binary floating point. + TValueExample{Double(), 3.125, "3.125"}, + TValueExample{Double(), 2.775, "2.775"}, + // TPrimitiveTypeExample{Double(), std::nan("1"), "%nan"}, + TValueExample{Double(), INFINITY, "%inf"}, + TValueExample{Double(), -INFINITY, "%-inf"}, + + TValueExample{Float(), 5.125, "5.125"}, + TValueExample{Float(), 6.775, "6.775"}, + + TValueExample{Null(), nullptr, "#"}, + TValueExample{Void(), nullptr, "#"}, + + TValueExample{Json(), "83", R"("83")"}, + TValueExample{Json(), "[]", R"("[]")"}, + + TValueExample{ + Uuid(), + TString(16, 0), + TString(TStringBuf(R"("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00")")) + }, + TValueExample{ + Uuid(), + TString(TStringBuf("\x01\x23\x45\x67\x89\xAB\xCD\xEF\xFE\xDC\xBA\x98\x76\x54\x32\x10"sv)), + TString(TStringBuf(R"("\x01\x23\x45\x67\x89\xAB\xCD\xEF\xFE\xDC\xBA\x98\x76\x54\x32\x10")")) + }, + + TValueExample{Date(), 0ull, "0u"}, + TValueExample{Date(), 18431ull, "18431u"}, + TValueExample{Date(), 49672ull, "49672u"}, + + TValueExample{Datetime(), 0ull, "0u"}, + TValueExample{Datetime(), 668800588ull, "668800588u"}, + TValueExample{Datetime(), 4291747199ull, "4291747199u"}, + + TValueExample{Timestamp(), 0ull, "0u"}, + TValueExample{Timestamp(), 2508452463052426ull, "2508452463052426u"}, + TValueExample{Timestamp(), 4291747199999999ull, "4291747199999999u"}, + + // Interval examples are signed: plain int64 literals and YSON without the "u" suffix, including a negative value. + TValueExample{Interval(), 0, "0"}, + TValueExample{Interval(), 2208610308646589ll, "2208610308646589"}, + TValueExample{Interval(), 1187314596653899ll, "1187314596653899"}, + TValueExample{Interval(), 4291747199999999ll, "4291747199999999"}, + TValueExample{Interval(), -4291747199999999ll, "-4291747199999999"}, + + TValueExample{Date32(), -53375809, "-53375809"}, + TValueExample{Date32(), 0, "0"}, + TValueExample{Date32(), 53375807, "53375807"}, + + TValueExample{Datetime64(), -4611669897600ll, "-4611669897600"}, + TValueExample{Datetime64(), 42, "42"}, + TValueExample{Datetime64(), 4611669811199ll, "4611669811199"}, + 
TValueExample{Timestamp64(), -4611669897600000000ll, "-4611669897600000000"}, + TValueExample{Timestamp64(), 42, "42"}, + TValueExample{Timestamp64(), 4611669811199999999l, "4611669811199999999"}, + + TValueExample{Interval64(), -9223339708799999999ll, "-9223339708799999999"}, + TValueExample{Interval64(), 0, "0"}, + TValueExample{Interval64(), 9223339708799999999ll, "9223339708799999999"}, + + TValueExample{Yson(), "qux", R"("qux")"}, + + TValueExample{Decimal(3, 2), NDecimal::TDecimal::TextToBinary("3.14", 3, 2), R"("\x80\x00\x01\x3a")"}, + }; + + THashSet<ESimpleLogicalValueType> allValueTypes; + for (const auto value : TEnumTraits<ESimpleLogicalValueType>::GetDomainValues()) { + allValueTypes.insert(value); + } + for (const auto& example : valueExamples) { + if (example.LogicalType->GetMetatype() == ELogicalMetatype::Simple) { + allValueTypes.erase(example.LogicalType->AsSimpleTypeRef().GetElement()); + } + } + if (!allValueTypes.empty()) { + THROW_ERROR_EXCEPTION("PrimitiveTypeExample variable doesn't contain values: %v", + allValueTypes); + } + return valueExamples; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NTableClient diff --git a/yt/yt/library/formats/unittests/value_examples.h b/yt/yt/library/formats/unittests/value_examples.h new file mode 100644 index 0000000000..06644e2cd6 --- /dev/null +++ b/yt/yt/library/formats/unittests/value_examples.h @@ -0,0 +1,24 @@ +#pragma once + +#include <yt/yt/library/named_value/named_value.h> + +#include <yt/yt/client/table_client/logical_type.h> + +namespace NYT::NTableClient { + +//////////////////////////////////////////////////////////////////////////////// + +struct TValueExample +{ + TLogicalTypePtr LogicalType; + NNamedValue::TNamedValue::TValue Value; + TString PrettyYson; + + TValueExample(TLogicalTypePtr logicalType, NNamedValue::TNamedValue::TValue value, TString prettyYson); +}; + +std::vector<TValueExample> GetPrimitiveValueExamples(); + 
+//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NTableClient diff --git a/yt/yt/library/formats/unittests/web_json_writer_ut.cpp b/yt/yt/library/formats/unittests/web_json_writer_ut.cpp new file mode 100644 index 0000000000..d7f20ec20a --- /dev/null +++ b/yt/yt/library/formats/unittests/web_json_writer_ut.cpp @@ -0,0 +1,1714 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/library/formats/web_json_writer.h> + +#include <yt/yt/client/table_client/logical_type.h> +#include <yt/yt/client/table_client/name_table.h> +#include <yt/yt/client/table_client/schema.h> + +#include <yt/yt/core/concurrency/async_stream.h> + +#include <yt/yt/core/json/json_parser.h> + +#include <yt/yt/core/ytree/fluent.h> + +#include <yt/yt/library/named_value/named_value.h> + +#include <limits> + +namespace NYT::NFormats { +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +using namespace NYTree; +using namespace NYson; +using namespace NConcurrency; +using namespace NTableClient; + +using NNamedValue::MakeRow; + +INodePtr ParseJsonToNode(TStringBuf string) +{ + TBuildingYsonConsumerViaTreeBuilder<INodePtr> builder(EYsonType::Node); + TMemoryInput stream(string); + + // For plain (raw) JSON parsing we need to switch off + // "smart" attribute analysis and UTF-8 decoding. 
+ auto config = New<NJson::TJsonFormatConfig>(); + config->EncodeUtf8 = false; + config->Plain = true; + + NJson::ParseJson(&stream, &builder, std::move(config)); + return builder.Finish(); +} + +class TWriterForWebJson + : public ::testing::Test +{ +protected: + TNameTablePtr NameTable_ = New<TNameTable>(); + TWebJsonFormatConfigPtr Config_ = New<TWebJsonFormatConfig>(); + TStringStream OutputStream_; + ISchemalessFormatWriterPtr Writer_; + + const TString ValueColumnName_ = "value"; + + void CreateStandardWriter(const std::vector<TTableSchemaPtr>& schemas = {New<TTableSchema>()}) + { + Writer_ = CreateWriterForWebJson( + Config_, + NameTable_, + schemas, + CreateAsyncAdapter(static_cast<IOutputStream*>(&OutputStream_))); + } +}; + +TEST_F(TWriterForWebJson, Simple) +{ + Config_->MaxAllColumnNamesCount = 2; + + CreateStandardWriter(); + + bool written = Writer_->Write({ + MakeRow(NameTable_, { + {"column_a", 100500u}, + {"column_b", true}, + {"column_c", "row1_c"}, + {TString(RowIndexColumnName), 0}, + }).Get(), + MakeRow(NameTable_, { + {"column_c", "row2_c"}, + {"column_b", "row2_b"}, + {TString(RowIndexColumnName), 1}, + }).Get(), + }); + EXPECT_TRUE(written); + WaitFor(Writer_->Close()) + .ThrowOnError(); + + TString expectedOutput = + "{" + "\"rows\":[" + "{" + "\"column_a\":{" + "\"$type\":\"uint64\"," + "\"$value\":\"100500\"" + "}," + "\"column_b\":{" + "\"$type\":\"boolean\"," + "\"$value\":\"true\"" + "}," + "\"column_c\":{" + "\"$type\":\"string\"," + "\"$value\":\"row1_c\"" + "}" + "}," + "{" + "\"column_c\":{" + "\"$type\":\"string\"," + "\"$value\":\"row2_c\"" + "}," + "\"column_b\":{" + "\"$type\":\"string\"," + "\"$value\":\"row2_b\"" + "}" + "}" + "]," + "\"incomplete_columns\":\"false\"," + "\"incomplete_all_column_names\":\"true\"," + "\"all_column_names\":[" + "\"column_a\"," + "\"column_b\"" + "]" + "}"; + + EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + 
+TEST_F(TWriterForWebJson, SliceColumnsByMaxCount) +{ + Config_->MaxSelectedColumnCount = 2; + + CreateStandardWriter(); + bool written = Writer_->Write({ + MakeRow(NameTable_, { + {"column_a", "row1_a"}, + {"column_b", "row1_b"}, + {"column_c", "row1_c"}, + }).Get(), + MakeRow(NameTable_, { + {"column_c", "row2_c"}, + {"column_b", "row2_b"}, + }).Get(), + MakeRow(NameTable_, { + {"column_c", "row3_c"}, + }).Get(), + }); + EXPECT_TRUE(written); + YT_UNUSED_FUTURE(Writer_->Close()); + + TString expectedOutput = + "{" + "\"rows\":[" + "{" + "\"column_a\":{" + "\"$type\":\"string\"," + "\"$value\":\"row1_a\"" + "}," + "\"column_b\":{" + "\"$type\":\"string\"," + "\"$value\":\"row1_b\"" + "}" + "}," + "{" + "\"column_b\":{" + "\"$type\":\"string\"," + "\"$value\":\"row2_b\"" + "}" + "}," + "{" + "}" + "]," + "\"incomplete_columns\":\"true\"," + "\"incomplete_all_column_names\":\"false\"," + "\"all_column_names\":[" + "\"column_a\"," + "\"column_b\"," + "\"column_c\"" + "]" + "}"; + + EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +TEST_F(TWriterForWebJson, SliceStrings) +{ + Config_->FieldWeightLimit = 6; + + CreateStandardWriter(); + + bool written = Writer_->Write({ + MakeRow(NameTable_, { + {"column_b", "row1_b"}, + {"column_c", "rooooow1_c"}, + {"column_a", "row1_a"}, + }).Get(), + MakeRow(NameTable_, { + {"column_c", "row2_c"}, + {"column_b", "rooow2_b"}, + }).Get(), + MakeRow(NameTable_, { + {"column_c", "row3_c"}, + }).Get(), + }); + EXPECT_TRUE(written); + YT_UNUSED_FUTURE(Writer_->Close()); + + TString expectedOutput = + "{" + "\"rows\":[" + "{" + "\"column_b\":{" + "\"$type\":\"string\"," + "\"$value\":\"row1_b\"" + "}," + "\"column_c\":{" + "\"$incomplete\":true," + "\"$type\":\"string\"," + "\"$value\":\"rooooo\"" + "}," + "\"column_a\":{" + "\"$type\":\"string\"," + "\"$value\":\"row1_a\"" + "}" + "}," + "{" + "\"column_c\":{" + "\"$type\":\"string\"," + "\"$value\":\"row2_c\"" + 
"}," + "\"column_b\":{" + "\"$incomplete\":true," + "\"$type\":\"string\"," + "\"$value\":\"rooow2\"" + "}" + "}," + "{" + "\"column_c\":{" + "\"$type\":\"string\"," + "\"$value\":\"row3_c\"" + "}" + "}" + "]," + "\"incomplete_columns\":\"false\"," + "\"incomplete_all_column_names\":\"false\"," + "\"all_column_names\":[" + "\"column_a\"," + "\"column_b\"," + "\"column_c\"" + "]" + "}"; + + EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +TEST_F(TWriterForWebJson, ReplaceAnyWithNull) +{ + Config_->FieldWeightLimit = 8; + + CreateStandardWriter(); + + bool written = Writer_->Write({ + MakeRow(NameTable_, { + {"column_b", EValueType::Any, "{key=a}"}, + {"column_c", "row1_c"}, + {"column_a", "row1_a"}, + }).Get(), + MakeRow(NameTable_, { + {"column_c", EValueType::Any, "{key=aaaaaa}"}, + {"column_b", "row2_b"}, + }).Get(), + MakeRow(NameTable_, { + {"column_c", "row3_c"}, + }).Get(), + }); + EXPECT_TRUE(written); + WaitFor(Writer_->Close()) + .ThrowOnError(); + + TString expectedOutput = + "{" + "\"rows\":[" + "{" + "\"column_b\":{" + "\"key\":{" + "\"$type\":\"string\"," + "\"$value\":\"a\"" + "}" + "}," + "\"column_c\":{" + "\"$type\":\"string\"," + "\"$value\":\"row1_c\"" + "}," + "\"column_a\":{" + "\"$type\":\"string\"," + "\"$value\":\"row1_a\"" + "}" + "}," + "{" + "\"column_c\":{" + "\"$incomplete\":true," + "\"$type\":\"any\"," + "\"$value\":\"\"" + "}," + "\"column_b\":{" + "\"$type\":\"string\"," + "\"$value\":\"row2_b\"" + "}" + "}," + "{" + "\"column_c\":{" + "\"$type\":\"string\"," + "\"$value\":\"row3_c\"" + "}" + "}" + "]," + "\"incomplete_columns\":\"false\"," + "\"incomplete_all_column_names\":\"false\"," + "\"all_column_names\":[" + "\"column_a\"," + "\"column_b\"," + "\"column_c\"" + "]" + "}"; + + EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +TEST_F(TWriterForWebJson, NotSkipSystemColumns) +{ + 
Config_->SkipSystemColumns = false; + + CreateStandardWriter(); + + bool written = Writer_->Write({ + MakeRow(NameTable_, { + {TString(TableIndexColumnName), 0}, + {TString(RowIndexColumnName), 1}, + {TString(TabletIndexColumnName), 2}, + {ValueColumnName_, 3} + }).Get(), + }); + EXPECT_TRUE(written); + WaitFor(Writer_->Close()) + .ThrowOnError(); + + TString expectedOutput = + "{" + "\"rows\":[" + "{" + "\"$$table_index\":{" + "\"$type\":\"int64\"," + "\"$value\":\"0\"" + "}," + "\"$$row_index\":{" + "\"$type\":\"int64\"," + "\"$value\":\"1\"" + "}," + "\"$$tablet_index\":{" + "\"$type\":\"int64\"," + "\"$value\":\"2\"" + "}," + "\"value\":{" + "\"$type\":\"int64\"," + "\"$value\":\"3\"" + "}" + "}" + "]," + "\"incomplete_columns\":\"false\"," + "\"incomplete_all_column_names\":\"false\"," + "\"all_column_names\":[" + "\"$row_index\"," + "\"$table_index\"," + "\"$tablet_index\"," + "\"value\"" + "]" + "}"; + + EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +TEST_F(TWriterForWebJson, SkipSystemColumns) +{ + Config_->SkipSystemColumns = true; + + CreateStandardWriter(); + + bool written = Writer_->Write({ + MakeRow(NameTable_, { + {TString(TableIndexColumnName), 0}, + {TString(RowIndexColumnName), 1}, + {TString(TabletIndexColumnName), 2}, + {ValueColumnName_, 3} + }).Get(), + }); + EXPECT_TRUE(written); + WaitFor(Writer_->Close()) + .ThrowOnError(); + + TString expectedOutput = + "{" + "\"rows\":[" + "{" + "\"value\":{" + "\"$type\":\"int64\"," + "\"$value\":\"3\"" + "}" + "}" + "]," + "\"incomplete_columns\":\"false\"," + "\"incomplete_all_column_names\":\"false\"," + "\"all_column_names\":[" + "\"value\"" + "]" + "}"; + + EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +TEST_F(TWriterForWebJson, NotSkipRequestedSystemColumns) +{ + Config_->SkipSystemColumns = true; + Config_->ColumnNames = 
std::vector<std::string>{TabletIndexColumnName, ValueColumnName_}; + + CreateStandardWriter(); + + bool written = Writer_->Write({ + MakeRow(NameTable_, { + {TString(TableIndexColumnName), 0}, + {TString(RowIndexColumnName), 1}, + {TString(TabletIndexColumnName), 2}, + {ValueColumnName_, 3} + }).Get(), + }); + EXPECT_TRUE(written); + WaitFor(Writer_->Close()) + .ThrowOnError(); + + TString expectedOutput = + "{" + "\"rows\":[" + "{" + "\"$$tablet_index\":{" + "\"$type\":\"int64\"," + "\"$value\":\"2\"" + "}," + "\"value\":{" + "\"$type\":\"int64\"," + "\"$value\":\"3\"" + "}" + "}" + "]," + "\"incomplete_columns\":\"false\"," + "\"incomplete_all_column_names\":\"false\"," + "\"all_column_names\":[" + "\"$tablet_index\"," + "\"value\"" + "]" + "}"; + + EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +TEST_F(TWriterForWebJson, SkipUnregisteredColumns) +{ + CreateStandardWriter(); + + TUnversionedRowBuilder row; + int keyDId = -1; + row.AddValue(MakeUnversionedBooleanValue(true, keyDId)); + std::vector<TUnversionedRow> rows = {row.GetRow()}; + + EXPECT_EQ(true, Writer_->Write(rows)); + + keyDId = NameTable_->RegisterName("column_d"); + + rows.clear(); + row.Reset(); + row.AddValue(MakeUnversionedBooleanValue(true, keyDId)); + rows.push_back(row.GetRow()); + + EXPECT_EQ(true, Writer_->Write(rows)); + YT_UNUSED_FUTURE(Writer_->Close()); + + TString expectedOutput = + "{" + "\"rows\":[" + "{" + "}," + "{" + "\"column_d\":{" + "\"$type\":\"boolean\"," + "\"$value\":\"true\"" + "}" + "}" + "]," + "\"incomplete_columns\":\"false\"," + "\"incomplete_all_column_names\":\"false\"," + "\"all_column_names\":[" + "\"column_d\"" + "]" + "}"; + + EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +TEST_F(TWriterForWebJson, SliceColumnsByName) +{ + Config_->ColumnNames = { + "column_b", + "column_c", + "$tablet_index"}; + 
Config_->MaxSelectedColumnCount = 2; + Config_->SkipSystemColumns = false; + + CreateStandardWriter(); + + bool written = Writer_->Write({ + MakeRow(NameTable_, { + {"column_a", 100500u}, + {"column_b", 0.42}, + {"column_c", "abracadabra"}, + {TString(TabletIndexColumnName), 10}, + }).Get(), + }); + EXPECT_TRUE(written); + WaitFor(Writer_->Close()) + .ThrowOnError(); + auto result = ParseJsonToNode(OutputStream_.Str()); + + TString expectedOutput = + "{" + "\"rows\":[" + "{" + "\"column_b\":{" + "\"$type\":\"double\"," + "\"$value\":\"0.42\"" + "}," + "\"column_c\":{" + "\"$type\":\"string\"," + "\"$value\":\"abracadabra\"" + "}," + "\"$$tablet_index\":{" + "\"$type\":\"int64\"," + "\"$value\":\"10\"" + "}" + "}" + "]," + "\"incomplete_columns\":\"true\"," + "\"incomplete_all_column_names\":\"false\"," + "\"all_column_names\":[" + "\"$tablet_index\"," + "\"column_a\"," + "\"column_b\"," + "\"column_c\"" + "]" + "}"; + + EXPECT_EQ(std::ssize(expectedOutput), Writer_->GetWrittenSize()); + EXPECT_EQ(expectedOutput, OutputStream_.Str()); +} + +template <typename TValue> +void CheckYqlValue( + const INodePtr& valueNode, + const TValue& expectedValue) +{ + using TDecayedValue = std::decay_t<TValue>; + if constexpr (std::is_convertible_v<TDecayedValue, TString>) { + ASSERT_EQ(valueNode->GetType(), ENodeType::String); + EXPECT_EQ(valueNode->GetValue<TString>(), expectedValue); + } else if constexpr (std::is_same_v<TDecayedValue, double>) { + ASSERT_EQ(valueNode->GetType(), ENodeType::String); + EXPECT_FLOAT_EQ(FromString<double>(valueNode->GetValue<TString>()), expectedValue); + } else if constexpr (std::is_same_v<TDecayedValue, bool>) { + ASSERT_EQ(valueNode->GetType(), ENodeType::Boolean); + EXPECT_EQ(valueNode->GetValue<bool>(), expectedValue); + } else if constexpr (std::is_same_v<TDecayedValue, INodePtr>) { + EXPECT_TRUE(AreNodesEqual(valueNode, expectedValue)) + << "actualValueNode is " << ConvertToYsonString(valueNode, EYsonFormat::Pretty).AsStringBuf() + << 
"\nexpectedValue is " << ConvertToYsonString(expectedValue, EYsonFormat::Pretty).AsStringBuf(); + } else { + static_assert(TDependentFalse<TDecayedValue>, "Type not allowed"); + } +} + +template <typename TType> +void CheckYqlType( + const INodePtr& typeNode, + const TType& expectedType, + const std::vector<INodePtr>& yqlTypes) +{ + ASSERT_EQ(typeNode->GetType(), ENodeType::String); + auto typeIndexString = typeNode->GetValue<TString>(); + auto typeIndex = FromString<int>(typeIndexString); + ASSERT_LT(typeIndex, static_cast<int>(yqlTypes.size())); + ASSERT_GE(typeIndex, 0); + const auto& yqlType = yqlTypes[typeIndex]; + EXPECT_EQ(yqlType->GetType(), ENodeType::List); + + auto expectedTypeNode = [&] () -> INodePtr { + using TDecayedType = std::decay_t<TType>; + if constexpr (std::is_convertible_v<TDecayedType, TString>) { + return ConvertToNode(TYsonString(TString(expectedType))); + } else if constexpr (std::is_same_v<TDecayedType, INodePtr>) { + return expectedType; + } else { + static_assert(TDependentFalse<TDecayedType>, "Type not allowed"); + } + }(); + EXPECT_TRUE(AreNodesEqual(yqlType, expectedTypeNode)) + << "yqlType is " << ConvertToYsonString(yqlType, EYsonFormat::Pretty).AsStringBuf() + << "\nexpectedTypeNode is " << ConvertToYsonString(expectedTypeNode, EYsonFormat::Pretty).AsStringBuf(); +} + +template <typename TValue, typename TType> +void CheckYqlTypeAndValue( + const INodePtr& row, + TStringBuf name, + const TType& expectedType, + const TValue& expectedValue, + const std::vector<INodePtr>& yqlTypes) +{ + ASSERT_EQ(row->GetType(), ENodeType::Map); + auto entry = row->AsMap()->FindChild(TString(name)); + ASSERT_TRUE(entry); + ASSERT_EQ(entry->GetType(), ENodeType::List); + ASSERT_EQ(entry->AsList()->GetChildCount(), 2); + auto valueNode = entry->AsList()->GetChildOrThrow(0); + CheckYqlValue(valueNode, expectedValue); + auto typeNode = entry->AsList()->GetChildOrThrow(1); + CheckYqlType(typeNode, expectedType, yqlTypes); +} + +#define 
CHECK_YQL_TYPE_AND_VALUE(row, name, expectedType, expectedValue, yqlTypes) \ + do { \ + SCOPED_TRACE(name); \ + CheckYqlTypeAndValue(row, name, expectedType, expectedValue, yqlTypes); \ + } while (0) + +TEST_F(TWriterForWebJson, YqlValueFormat_SimpleTypes) +{ + Config_->MaxAllColumnNamesCount = 2; + Config_->ValueFormat = EWebJsonValueFormat::Yql; + + // We will emulate writing rows from two tables. + CreateStandardWriter(std::vector{New<TTableSchema>(), New<TTableSchema>()}); + + { + bool written = Writer_->Write({ + MakeRow(NameTable_, { + {"column_a", 100500u}, + {"column_b", true}, + {"column_c", "row1_c"}, + {TString(RowIndexColumnName), 0}, + {TString(TableIndexColumnName), 0}, + }).Get(), + MakeRow(NameTable_, { + {"column_c", "row2_c"}, + {"column_b", "row2_b"}, + {TString(RowIndexColumnName), 1}, + {TString(TableIndexColumnName), 0}, + }).Get(), + MakeRow(NameTable_, { + {"column_a", -100500}, + {"column_b", EValueType::Any, "{x=2;y=3}"}, + {"column_c", 2.71828}, + {TString(RowIndexColumnName), 1}, + }).Get(), + }); + EXPECT_TRUE(written); + Writer_->Close().Get().ThrowOnError(); + } + + auto result = ParseJsonToNode(OutputStream_.Str()); + ASSERT_EQ(result->GetType(), ENodeType::Map); + + auto rows = result->AsMap()->FindChild("rows"); + ASSERT_TRUE(rows); + auto incompleteColumns = result->AsMap()->FindChild("incomplete_columns"); + ASSERT_TRUE(incompleteColumns); + auto incompleteAllColumnNames = result->AsMap()->FindChild("incomplete_all_column_names"); + ASSERT_TRUE(incompleteAllColumnNames); + auto allColumnNames = result->AsMap()->FindChild("all_column_names"); + ASSERT_TRUE(allColumnNames); + auto yqlTypeRegistry = result->AsMap()->FindChild("yql_type_registry"); + ASSERT_TRUE(yqlTypeRegistry); + + ASSERT_EQ(incompleteColumns->GetType(), ENodeType::String); + EXPECT_EQ(incompleteColumns->GetValue<TString>(), "false"); + + ASSERT_EQ(incompleteAllColumnNames->GetType(), ENodeType::String); + EXPECT_EQ(incompleteAllColumnNames->GetValue<TString>(), 
"true"); + + ASSERT_EQ(allColumnNames->GetType(), ENodeType::List); + std::vector<TString> allColumnNamesVector; + ASSERT_NO_THROW(allColumnNamesVector = ConvertTo<decltype(allColumnNamesVector)>(allColumnNames)); + EXPECT_EQ(allColumnNamesVector, (std::vector<TString>{"column_a", "column_b"})); + + ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); + auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); + + ASSERT_EQ(rows->GetType(), ENodeType::List); + ASSERT_EQ(rows->AsList()->GetChildCount(), 3); + + auto row1 = rows->AsList()->GetChildOrThrow(0); + auto row2 = rows->AsList()->GetChildOrThrow(1); + auto row3 = rows->AsList()->GetChildOrThrow(2); + + ASSERT_EQ(row1->GetType(), ENodeType::Map); + EXPECT_EQ(row1->AsMap()->GetChildCount(), 3); + CHECK_YQL_TYPE_AND_VALUE(row1, "column_a", R"(["DataType"; "Uint64"])", "100500", yqlTypes); + CHECK_YQL_TYPE_AND_VALUE(row1, "column_b", R"(["DataType"; "Boolean"])", true, yqlTypes); + CHECK_YQL_TYPE_AND_VALUE(row1, "column_c", R"(["DataType"; "String"])", "row1_c", yqlTypes); + + ASSERT_EQ(row2->GetType(), ENodeType::Map); + EXPECT_EQ(row2->AsMap()->GetChildCount(), 2); + CHECK_YQL_TYPE_AND_VALUE(row2, "column_b", R"(["DataType"; "String"])", "row2_b", yqlTypes); + CHECK_YQL_TYPE_AND_VALUE(row2, "column_c", R"(["DataType"; "String"])", "row2_c", yqlTypes); + + ASSERT_EQ(row3->GetType(), ENodeType::Map); + EXPECT_EQ(row3->AsMap()->GetChildCount(), 3); + CHECK_YQL_TYPE_AND_VALUE(row3, "column_a", R"(["DataType"; "Int64"])", "-100500", yqlTypes); + auto row3BValue = ConvertToNode(TYsonString(TStringBuf(R"({ + val = { + x = { + "$type" = "int64"; + "$value" = "2"; + }; + y = { + "$type" = "int64"; + "$value" = "3"; + } + } + })"))); + CHECK_YQL_TYPE_AND_VALUE(row3, "column_b", R"(["DataType"; "Yson"])", row3BValue, yqlTypes); + CHECK_YQL_TYPE_AND_VALUE(row3, "column_c", R"(["DataType"; "Double"])", 2.71828, yqlTypes); +} + +TEST_F(TWriterForWebJson, ColumnNameEncoding) +{ + Config_->MaxAllColumnNamesCount = 
2; + Config_->ValueFormat = EWebJsonValueFormat::Yql; + + CreateStandardWriter(); + + { + bool written = Writer_->Write({ + MakeRow(NameTable_, { + {"column_a", 100500u}, + {"column_non_ascii_\xd0\x81", -100500}, + }).Get() + }); + EXPECT_TRUE(written); + Writer_->Close().Get().ThrowOnError(); + } + + auto result = ParseJsonToNode(OutputStream_.Str()); + ASSERT_EQ(result->GetType(), ENodeType::Map); + + auto rows = result->AsMap()->FindChild("rows"); + ASSERT_TRUE(rows); + auto incompleteColumns = result->AsMap()->FindChild("incomplete_columns"); + ASSERT_TRUE(incompleteColumns); + auto incompleteAllColumnNames = result->AsMap()->FindChild("incomplete_all_column_names"); + ASSERT_TRUE(incompleteAllColumnNames); + auto allColumnNames = result->AsMap()->FindChild("all_column_names"); + ASSERT_TRUE(allColumnNames); + auto yqlTypeRegistry = result->AsMap()->FindChild("yql_type_registry"); + ASSERT_TRUE(yqlTypeRegistry); + + ASSERT_EQ(allColumnNames->GetType(), ENodeType::List); + std::vector<TString> allColumnNamesVector; + ASSERT_NO_THROW(allColumnNamesVector = ConvertTo<decltype(allColumnNamesVector)>(allColumnNames)); + EXPECT_EQ(allColumnNamesVector, (std::vector<TString>{"column_a", "column_non_ascii_\xc3\x90\xc2\x81"})); + + ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); + auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); + + ASSERT_EQ(rows->GetType(), ENodeType::List); + ASSERT_EQ(rows->AsList()->GetChildCount(), 1); + + auto row1 = rows->AsList()->GetChildOrThrow(0); + + ASSERT_EQ(row1->GetType(), ENodeType::Map); + EXPECT_EQ(row1->AsMap()->GetChildCount(), 2); + CHECK_YQL_TYPE_AND_VALUE(row1, "column_a", R"(["DataType"; "Uint64"])", "100500", yqlTypes); + CHECK_YQL_TYPE_AND_VALUE(row1, "column_non_ascii_\xc3\x90\xc2\x81", R"(["DataType"; "Int64"])", "-100500", yqlTypes); +} + +TEST_F(TWriterForWebJson, YqlValueFormat_ComplexTypes) +{ + Config_->ValueFormat = EWebJsonValueFormat::Yql; + + auto firstSchema = 
New<TTableSchema>(std::vector<TColumnSchema>{ + {"column_a", OptionalLogicalType( + ListLogicalType(MakeLogicalType(ESimpleLogicalValueType::Int64, true)))}, + {"column_b", StructLogicalType({ + {"key", MakeLogicalType(ESimpleLogicalValueType::String, true)}, + {"value", MakeLogicalType(ESimpleLogicalValueType::String, true)}, + {"variant_tuple", VariantTupleLogicalType({ + MakeLogicalType(ESimpleLogicalValueType::Int8, true), + MakeLogicalType(ESimpleLogicalValueType::Boolean, false), + })}, + {"variant_struct", VariantStructLogicalType({ + {"a", MakeLogicalType(ESimpleLogicalValueType::Int8, true)}, + {"b", MakeLogicalType(ESimpleLogicalValueType::Boolean, false)}, + })}, + {"dict", DictLogicalType( + SimpleLogicalType(ESimpleLogicalValueType::Int64), + SimpleLogicalType(ESimpleLogicalValueType::String)), + }, + {"tagged", TaggedLogicalType( + "MyTag", + SimpleLogicalType(ESimpleLogicalValueType::Int64)), + }, + {"timestamp", SimpleLogicalType(ESimpleLogicalValueType::Timestamp)}, + {"date", SimpleLogicalType(ESimpleLogicalValueType::Date)}, + {"datetime", SimpleLogicalType(ESimpleLogicalValueType::Datetime)}, + {"interval", SimpleLogicalType(ESimpleLogicalValueType::Interval)}, + {"date32", SimpleLogicalType(ESimpleLogicalValueType::Date32)}, + {"datetime64", SimpleLogicalType(ESimpleLogicalValueType::Datetime64)}, + {"timestamp64", SimpleLogicalType(ESimpleLogicalValueType::Timestamp64)}, + {"interval64", SimpleLogicalType(ESimpleLogicalValueType::Interval64)}, + {"json", SimpleLogicalType(ESimpleLogicalValueType::Json)}, + {"float", SimpleLogicalType(ESimpleLogicalValueType::Float)}, + })}, + {"column_c", ListLogicalType(StructLogicalType({ + {"very_optional_key", OptionalLogicalType(MakeLogicalType(ESimpleLogicalValueType::String, false))}, + {"optional_value", MakeLogicalType(ESimpleLogicalValueType::String, false)}, + }))}, + }); + + auto secondSchema = New<TTableSchema>(std::vector<TColumnSchema>{ + {"column_a", VariantTupleLogicalType({ + 
SimpleLogicalType(ESimpleLogicalValueType::Null), + SimpleLogicalType(ESimpleLogicalValueType::Any), + })}, + {"column_b", SimpleLogicalType(ESimpleLogicalValueType::Null)}, + {"column_c", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Null))}, + {"column_d", OptionalLogicalType(SimpleLogicalType(ESimpleLogicalValueType::Int64))}, + }); + + auto firstColumnAType = ConvertToNode(TYsonString(TStringBuf(R"([ + "OptionalType"; + [ + "ListType"; + ["DataType"; "Int64"] + ] + ])"))); + auto firstColumnBType = ConvertToNode(TYsonString(TStringBuf(R"([ + "StructType"; + [ + [ + "key"; + ["DataType"; "String"] + ]; + [ + "value"; + ["DataType"; "String"] + ]; + [ + "variant_tuple"; + [ + "VariantType"; + [ + "TupleType"; + [ + ["DataType"; "Int8"]; + [ + "OptionalType"; + ["DataType"; "Boolean"] + ] + ] + ] + ] + ]; + [ + "variant_struct"; + [ + "VariantType"; + [ + "StructType"; + [ + [ + "a"; + ["DataType"; "Int8"] + ]; + [ + "b"; + [ + "OptionalType"; + ["DataType"; "Boolean"] + ] + ] + ] + ] + ] + ]; + [ + "dict"; + [ + "DictType"; + ["DataType"; "Int64"]; + ["DataType"; "String"] + ] + ]; + [ + "tagged"; + [ + "TaggedType"; + "MyTag"; + ["DataType"; "Int64"] + ] + ]; + [ + "timestamp"; + ["DataType"; "Timestamp"] + ]; + [ + "date"; + ["DataType"; "Date"] + ]; + [ + "datetime"; + ["DataType"; "Datetime"] + ]; + [ + "interval"; + ["DataType"; "Interval"] + ]; + [ + "date32"; + ["DataType"; "Date32"] + ]; + [ + "datetime64"; + ["DataType"; "Datetime64"] + ]; + [ + "timestamp64"; + ["DataType"; "Timestamp64"] + ]; + [ + "interval64"; + ["DataType"; "Interval64"] + ]; + [ + "json"; + ["DataType"; "Json"] + ]; + [ + "float"; + ["DataType"; "Float"] + ]; + ] + ])"))); + auto firstColumnCType = ConvertToNode(TYsonString(TStringBuf(R"([ + "ListType"; + [ + "StructType"; + [ + [ + "very_optional_key"; + [ + "OptionalType"; + [ + "OptionalType"; + ["DataType"; "String"] + ] + ] + ]; + [ + "optional_value"; + [ + "OptionalType"; + ["DataType"; "String"] + ] + ] + ] 
+ ] + ])"))); + auto secondColumnAType = ConvertToNode(TYsonString(TStringBuf(R"([ + "VariantType"; + [ + "TupleType"; + [ + ["NullType"]; + ["DataType"; "Yson"]; + ] + ] + ])"))); + auto secondColumnBType = ConvertToNode(TYsonString(TStringBuf(R"(["NullType"])"))); + auto secondColumnCType = ConvertToNode(TYsonString(TStringBuf(R"([ + "OptionalType"; + [ + "NullType"; + ] + ])"))); + auto secondColumnDType = ConvertToNode(TYsonString(TStringBuf(R"([ + "OptionalType"; + ["DataType"; "Int64"] + ])"))); + + CreateStandardWriter(std::vector{firstSchema, secondSchema}); + { + bool written = Writer_->Write({ + MakeRow(NameTable_, { + {"column_a", EValueType::Composite, R"([-1; -2; -5])"}, + { + "column_b", + EValueType::Composite, + R"([ + "key"; + "value"; + [0; 7]; + [1; #]; + [[1; "a"]; [2; "b"]]; + 99; + 100u; + 101u; + 102u; + 103; + -42; + 42; + -42; + -1; + "[\"a\", {\"b\": 42}]"; + -3.25; + ])", + }, + {"column_c", EValueType::Composite, R"([[[#]; "value"]; [["key"]; #]])"}, + {"column_d", -49}, + {TString(TableIndexColumnName), 0}, + {TString(RowIndexColumnName), 0}, + }).Get(), + MakeRow(NameTable_, { + {"column_a", EValueType::Composite, R"([0; -2; -5; 177])"}, + { + "column_b", + EValueType::Composite, + R"([ + "key1"; + "value1"; + [1; %false]; + [1; #]; + []; + 199; + 0u; + 1101u; + 1102u; + 1103; + 123; + -123; + 123; + 123; + "null"; + 0.0; + ])", + }, + {"column_c", EValueType::Composite, R"([[#; #]; [["key1"]; #]])"}, + {"column_d", 49u}, + {TString(RowIndexColumnName), 1}, + }).Get(), + MakeRow(NameTable_, { + {"column_a", EValueType::Composite, "[]"}, + { + "column_b", + EValueType::Composite, + R"([ + "key2"; + "value2"; + [0; 127]; + [1; %true]; + [[0; ""]]; + 399; + 30u; + 3101u; + 3202u; + 3103; + -53375809; + -4611669897600; + -4611669897600000000; + -9223339708799999999; + "{\"x\": false}"; + 1e10; + ])" + }, + {"column_c", EValueType::Composite, "[[[key]; #]]"}, + {"column_d", "49"}, + {TString(RowIndexColumnName), 2}, + }).Get(), + + 
MakeRow(NameTable_, { + {"column_a", nullptr}, + { + "column_b", + EValueType::Composite, + // First string is valid UTF-8, the second one should be Base64 encoded. + "[" + "\"\xC3\xBF\";" + "\"\xFA\xFB\xFC\xFD\";" + R"( + [0; 127]; + [1; %true]; + [[-1; "-1"]; [0; ""]]; + 499; + 40u; + 4101u; + 4202u; + 4103; + 53375807; + 4611669811199; + 4611669811199999999; + 9223339708799999999; + "{}"; + -2.125; + ])", + }, + {"column_c", EValueType::Composite, "[]"}, + {"column_d", EValueType::Any, "{x=49}"}, + {TString(RowIndexColumnName), 3}, + }).Get(), + + // Here come rows from the second table. + MakeRow(NameTable_, { + {"column_a", EValueType::Composite, "[0; #]"}, + {"column_b", nullptr}, + {"column_c", nullptr}, + {"column_d", -49}, + {TString(TableIndexColumnName), 1}, + {TString(RowIndexColumnName), 0}, + }).Get(), + + MakeRow(NameTable_, { + {"column_a", EValueType::Composite, "[1; {z=z}]"}, + {"column_b", nullptr}, + {"column_c", EValueType::Composite, "[#]"}, + {"column_d", nullptr}, + {TString(TableIndexColumnName), 1}, + {TString(RowIndexColumnName), 1}, + }).Get(), + }); + EXPECT_TRUE(written); + Writer_->Close().Get().ThrowOnError(); + } + + auto result = ParseJsonToNode(OutputStream_.Str()); + ASSERT_EQ(result->GetType(), ENodeType::Map); + + auto rows = result->AsMap()->FindChild("rows"); + ASSERT_TRUE(rows); + auto incompleteColumns = result->AsMap()->FindChild("incomplete_columns"); + ASSERT_TRUE(incompleteColumns); + auto incompleteAllColumnNames = result->AsMap()->FindChild("incomplete_all_column_names"); + ASSERT_TRUE(incompleteAllColumnNames); + auto allColumnNames = result->AsMap()->FindChild("all_column_names"); + ASSERT_TRUE(allColumnNames); + auto yqlTypeRegistry = result->AsMap()->FindChild("yql_type_registry"); + ASSERT_TRUE(yqlTypeRegistry); + + ASSERT_EQ(incompleteColumns->GetType(), ENodeType::String); + EXPECT_EQ(incompleteColumns->GetValue<TString>(), "false"); + + ASSERT_EQ(incompleteAllColumnNames->GetType(), ENodeType::String); + 
EXPECT_EQ(incompleteAllColumnNames->GetValue<TString>(), "false"); + + ASSERT_EQ(allColumnNames->GetType(), ENodeType::List); + std::vector<TString> allColumnNamesVector; + ASSERT_NO_THROW(allColumnNamesVector = ConvertTo<decltype(allColumnNamesVector)>(allColumnNames)); + EXPECT_EQ(allColumnNamesVector, (std::vector<TString>{"column_a", "column_b", "column_c", "column_d"})); + + ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); + auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); + + ASSERT_EQ(rows->GetType(), ENodeType::List); + ASSERT_EQ(rows->AsList()->GetChildCount(), 6); + + auto row1 = rows->AsList()->GetChildOrThrow(0); + auto row2 = rows->AsList()->GetChildOrThrow(1); + auto row3 = rows->AsList()->GetChildOrThrow(2); + auto row4 = rows->AsList()->GetChildOrThrow(3); + auto row5 = rows->AsList()->GetChildOrThrow(4); + auto row6 = rows->AsList()->GetChildOrThrow(5); + + ASSERT_EQ(row1->GetType(), ENodeType::Map); + EXPECT_EQ(row1->AsMap()->GetChildCount(), 4); + auto row1AValue = ConvertToNode(TYsonString(TStringBuf(R"([{"val"=["-1"; "-2"; "-5"]}])"))); + CHECK_YQL_TYPE_AND_VALUE(row1, "column_a", firstColumnAType, row1AValue, yqlTypes); + auto row1BValue = ConvertToNode(TYsonString(TStringBuf( + R"([ + "key"; + "value"; + ["0"; "7"]; + ["1"; #]; + {"val"=[["1"; "a"]; ["2"; "b"]]}; + "99"; + "100"; + "101"; + "102"; + "103"; + "-42"; + "42"; + "-42"; + "-1"; + "[\"a\", {\"b\": 42}]"; + "-3.25"; + ])"))); + CHECK_YQL_TYPE_AND_VALUE(row1, "column_b", firstColumnBType, row1BValue, yqlTypes); + auto row1CValue = ConvertToNode(TYsonString(TStringBuf(R"({ + "val"=[ + [[#]; ["value"]]; + [[["key"]]; #] + ] + })"))); + CHECK_YQL_TYPE_AND_VALUE(row1, "column_c", firstColumnCType, row1CValue, yqlTypes); + CHECK_YQL_TYPE_AND_VALUE(row1, "column_d", R"(["DataType"; "Int64"])", "-49", yqlTypes); + + ASSERT_EQ(row2->GetType(), ENodeType::Map); + EXPECT_EQ(row2->AsMap()->GetChildCount(), 4); + auto row2AValue = 
ConvertToNode(TYsonString(TStringBuf(R"([{"val"=["0"; "-2"; "-5"; "177"]}])"))); + CHECK_YQL_TYPE_AND_VALUE(row2, "column_a", firstColumnAType, row2AValue, yqlTypes); + auto row2BValue = ConvertToNode(TYsonString(TStringBuf( + R"([ + "key1"; + "value1"; + ["1"; [%false]]; + ["1"; #]; + {"val"=[]}; + "199"; + "0"; + "1101"; + "1102"; + "1103"; + "123"; + "-123"; + "123"; + "123"; + "null"; + "0"; + ])"))); + CHECK_YQL_TYPE_AND_VALUE(row2, "column_b", firstColumnBType, row2BValue, yqlTypes); + auto row2CValue = ConvertToNode(TYsonString(TStringBuf(R"({ + "val"=[ + [#; #]; + [[["key1"]]; #] + ] + })"))); + CHECK_YQL_TYPE_AND_VALUE(row2, "column_c", firstColumnCType, row2CValue, yqlTypes); + CHECK_YQL_TYPE_AND_VALUE(row2, "column_d", R"(["DataType"; "Uint64"])", "49", yqlTypes); + + ASSERT_EQ(row3->GetType(), ENodeType::Map); + EXPECT_EQ(row3->AsMap()->GetChildCount(), 4); + auto row3AValue = ConvertToNode(TYsonString(TStringBuf(R"([{"val"=[]}])"))); + CHECK_YQL_TYPE_AND_VALUE(row3, "column_a", firstColumnAType, row3AValue, yqlTypes); + auto row3BValue = ConvertToNode(TYsonString(TStringBuf( + R"([ + "key2"; + "value2"; + ["0"; "127"]; + ["1"; [%true]]; + {"val"=[["0"; ""]]}; + "399"; + "30"; + "3101"; + "3202"; + "3103"; + "-53375809"; + "-4611669897600"; + "-4611669897600000000"; + "-9223339708799999999"; + "{\"x\": false}"; + "10000000000"; + ])"))); + CHECK_YQL_TYPE_AND_VALUE(row3, "column_b", firstColumnBType, row3BValue, yqlTypes); + auto row3CValue = ConvertToNode(TYsonString(TStringBuf(R"({ + "val"=[ + [[["key"]]; #] + ] + })"))); + CHECK_YQL_TYPE_AND_VALUE(row3, "column_c", firstColumnCType, row3CValue, yqlTypes); + CHECK_YQL_TYPE_AND_VALUE(row3, "column_d", R"(["DataType"; "String"])", "49", yqlTypes); + + ASSERT_EQ(row4->GetType(), ENodeType::Map); + EXPECT_EQ(row4->AsMap()->GetChildCount(), 4); + auto row4AValue = ConvertToNode(TYsonString(TStringBuf(R"(#)"))); + CHECK_YQL_TYPE_AND_VALUE(row4, "column_a", firstColumnAType, row4AValue, yqlTypes); + + auto 
row4BValue = ConvertToNode(TYsonString(TStringBuf( + "[" + "\"\xC3\xBF\";" + R"( + {"b64" = %true; "val" = "+vv8/Q=="}; + ["0"; "127"]; + ["1"; [%true]]; + {"val"=[["-1"; "-1"]; ["0"; ""]]}; + "499"; + "40"; + "4101"; + "4202"; + "4103"; + "53375807"; + "4611669811199"; + "4611669811199999999"; + "9223339708799999999"; + "{}"; + "-2.125"; + ])"))); + CHECK_YQL_TYPE_AND_VALUE(row4, "column_b", firstColumnBType, row4BValue, yqlTypes); + + auto row4CValue = ConvertToNode(TYsonString(TStringBuf(R"({"val"=[]})"))); + CHECK_YQL_TYPE_AND_VALUE(row4, "column_c", firstColumnCType, row4CValue, yqlTypes); + auto row4DValue = ConvertToNode(TYsonString(TStringBuf(R"({ + val = { + x = { + "$type" = "int64"; + "$value" = "49"; + } + } + })"))); + CHECK_YQL_TYPE_AND_VALUE(row4, "column_d", R"(["DataType"; "Yson"])", row4DValue, yqlTypes); + + // Here must come rows from the second table. + + ASSERT_EQ(row5->GetType(), ENodeType::Map); + EXPECT_EQ(row5->AsMap()->GetChildCount(), 4); + auto row5AValue = ConvertToNode(TYsonString(TStringBuf(R"(["0"; #])"))); + CHECK_YQL_TYPE_AND_VALUE(row5, "column_a", secondColumnAType, row5AValue, yqlTypes); + auto row5BValue = ConvertToNode(TYsonString(TStringBuf(R"(#)"))); + CHECK_YQL_TYPE_AND_VALUE(row5, "column_b", secondColumnBType, row5BValue, yqlTypes); + auto row5CValue = ConvertToNode(TYsonString(TStringBuf(R"(#)"))); + CHECK_YQL_TYPE_AND_VALUE(row5, "column_c", secondColumnCType, row5CValue, yqlTypes); + auto row5DValue = ConvertToNode(TYsonString(TStringBuf(R"(["-49"])"))); + CHECK_YQL_TYPE_AND_VALUE(row5, "column_d", secondColumnDType, row5DValue, yqlTypes); + + ASSERT_EQ(row6->GetType(), ENodeType::Map); + EXPECT_EQ(row6->AsMap()->GetChildCount(), 4); + auto row6AValue = ConvertToNode(TYsonString(TStringBuf(R"([ + "1"; + { + val = { + z = { + "$type" = "string"; + "$value" = "z"; + } + } + }; + ])"))); + CHECK_YQL_TYPE_AND_VALUE(row6, "column_a", secondColumnAType, row6AValue, yqlTypes); + auto row6BValue = 
ConvertToNode(TYsonString(TStringBuf(R"(#)"))); + CHECK_YQL_TYPE_AND_VALUE(row6, "column_b", secondColumnBType, row6BValue, yqlTypes); + auto row6CValue = ConvertToNode(TYsonString(TStringBuf(R"([#])"))); + CHECK_YQL_TYPE_AND_VALUE(row6, "column_c", secondColumnCType, row6CValue, yqlTypes); + auto row6DValue = ConvertToNode(TYsonString(TStringBuf(R"(#)"))); + CHECK_YQL_TYPE_AND_VALUE(row6, "column_d", secondColumnDType, row6DValue, yqlTypes); +} + +TEST_F(TWriterForWebJson, YqlValueFormat_Incomplete) +{ + Config_->ValueFormat = EWebJsonValueFormat::Yql; + Config_->FieldWeightLimit = 215; + Config_->StringWeightLimit = 10; + + auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ + {"column_a", StructLogicalType({ + {"field1", SimpleLogicalType(ESimpleLogicalValueType::Int64)}, + {"list", ListLogicalType( + VariantStructLogicalType({ + {"a", DictLogicalType( + SimpleLogicalType(ESimpleLogicalValueType::Int64), + SimpleLogicalType(ESimpleLogicalValueType::String)), + }, + {"b", SimpleLogicalType(ESimpleLogicalValueType::Any)}, + })), + }, + {"field2", SimpleLogicalType(ESimpleLogicalValueType::String)}, + {"field3", MakeLogicalType(ESimpleLogicalValueType::Int64, false)}, + })}, + {"column_b", SimpleLogicalType(ESimpleLogicalValueType::Any)}, + {"column_c", MakeLogicalType(ESimpleLogicalValueType::String, false)}, + }); + + auto yqlTypeA = ConvertToNode(TYsonString(TStringBuf(R"([ + "StructType"; + [ + [ + "field1"; + ["DataType"; "Int64"] + ]; + [ + "list"; + [ + "ListType"; + [ + "VariantType"; + [ + "StructType"; + [ + [ + "a"; + [ + "DictType"; + ["DataType"; "Int64"]; + ["DataType"; "String"] + ] + ]; + [ + "b"; + ["DataType"; "Yson"] + ]; + ] + ] + ] + ] + ]; + [ + "field2"; + ["DataType"; "String"] + ]; + [ + "field3"; + [ + "OptionalType"; + ["DataType"; "Int64"] + ] + ]; + ] + ])"))); + + auto yqlTypeB = ConvertToNode(TYsonString(TStringBuf(R"(["DataType"; "Yson"])"))); + auto yqlTypeC = ConvertToNode(TYsonString(TStringBuf(R"(["OptionalType"; 
["DataType"; "String"]])"))); + { + CreateStandardWriter({schema}); + bool written = Writer_->Write({ + MakeRow(NameTable_, { + { + "column_a", + EValueType::Composite, + R"([ + -1; + [ + [ + 0; + [ + [-2; "UTF:)" + TString("\xF0\x90\x8D\x88") + "\xF0\x90\x8D\x88" + R"("]; + [2; "!UTF:)" + TString("\xFA\xFB\xFC\xFD\xFA\xFB\xFC\xFD") + R"("]; + [0; ""]; + ] + ]; + [ + 1; + "{kinda_long_key = kinda_even_longer_value}" + ]; + [ + 0; + [ + [0; "One more quite long string"]; + [1; "One more quite long string"]; + [2; "One more quite long string"]; + [3; "One more quite long string"]; + [4; "One more quite long string"]; + [5; "One more quite long string"]; + ] + ]; + [ + 1; + "{kinda_long_key = kinda_even_longer_value}" + ]; + ]; + "I'm short"; + 424242238133245 + ])" + }, + {"column_b", EValueType::Any, "{kinda_long_key = kinda_even_longer_value}"}, + {"column_c", "One more quite long string"}, + }).Get(), + }); + EXPECT_TRUE(written); + Writer_->Close().Get().ThrowOnError(); + } + + auto result = ParseJsonToNode(OutputStream_.Str()); + ASSERT_EQ(result->GetType(), ENodeType::Map); + + auto rows = result->AsMap()->FindChild("rows"); + ASSERT_TRUE(rows); + auto yqlTypeRegistry = result->AsMap()->FindChild("yql_type_registry"); + ASSERT_TRUE(yqlTypeRegistry); + + ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); + auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); + + ASSERT_EQ(rows->GetType(), ENodeType::List); + ASSERT_EQ(rows->AsList()->GetChildCount(), 1); + + auto row = rows->AsList()->GetChildOrThrow(0); + ASSERT_EQ(row->GetType(), ENodeType::Map); + EXPECT_EQ(row->AsMap()->GetChildCount(), 3); + + auto rowAValue = ConvertToNode(TYsonString(R"([ + "-1"; + { + "inc" = %true; + "val" = [ + [ + "0"; + { + "val" = [ + ["-2"; {"inc"=%true; "val"="UTF:)" + TString("\xF0\x90\x8D\x88") + R"("}]; + ["2"; {"inc"=%true; "b64"=%true; "val"="IVVURjr6"}]; + ["0"; ""]; + ] + } + ]; + [ + "1"; + {"val"=""; "inc"=%true} + ]; + [ + "0"; + { + "inc" = %true; + 
"val" = [ + ["0"; {"val"="One more q"; "inc"=%true}]; + ["1"; {"val"="One more "; "inc"=%true}]; + ]; + } + ]; + ]; + }; + { + "val" = ""; + "inc" = %true; + }; + ["424242238133245"]; + ])")); + CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); + + // Simple values are not truncated to |StringWeightLimit| + auto rowBValue = ConvertToNode(TYsonString(TStringBuf(R"({ + val = { + kinda_long_key = { + "$type" = "string"; + "$value" = kinda_even_longer_value; + } + } + })"))); + CHECK_YQL_TYPE_AND_VALUE(row, "column_b", yqlTypeB, rowBValue, yqlTypes); + auto rowCValue = ConvertToNode(TYsonString(TStringBuf(R"(["One more quite long string"])"))); + CHECK_YQL_TYPE_AND_VALUE(row, "column_c", yqlTypeC, rowCValue, yqlTypes); +} + + +TEST_F(TWriterForWebJson, YqlValueFormat_Any) +{ + Config_->ValueFormat = EWebJsonValueFormat::Yql; + + auto schema = New<TTableSchema>(std::vector<TColumnSchema>{ + {"column_a", MakeLogicalType(ESimpleLogicalValueType::Any, false)}, + }); + + auto yqlTypeA = ConvertToNode(TYsonString(TStringBuf(R"([ + "OptionalType"; + ["DataType"; "Yson"] + ])"))); + + CreateStandardWriter({schema}); + { + bool written = Writer_->Write({ + MakeRow(NameTable_, {{"column_a", EValueType::Any, "{x=y;z=2}"}}).Get(), + MakeRow(NameTable_, {{"column_a", true}}).Get(), + MakeRow(NameTable_, {{"column_a", -42}}).Get(), + MakeRow(NameTable_, {{"column_a", 42u}}).Get(), + }); + EXPECT_TRUE(written); + Writer_->Close().Get().ThrowOnError(); + } + + auto result = ParseJsonToNode(OutputStream_.Str()); + ASSERT_EQ(result->GetType(), ENodeType::Map); + + auto rows = result->AsMap()->FindChild("rows"); + ASSERT_TRUE(rows); + auto yqlTypeRegistry = result->AsMap()->FindChild("yql_type_registry"); + ASSERT_TRUE(yqlTypeRegistry); + + ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); + auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); + + ASSERT_EQ(rows->GetType(), ENodeType::List); + ASSERT_EQ(rows->AsList()->GetChildCount(), 4); + + 
{ + auto row = rows->AsList()->GetChildOrThrow(0); + ASSERT_EQ(row->GetType(), ENodeType::Map); + auto rowAValue = ConvertToNode(TYsonString(TStringBuf(R"([ + { + val = { + x = { + "$type" = "string"; + "$value" = "y"; + }; + z = { + "$type" = "int64"; + "$value" = "2"; + } + } + } + ])"))); + CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); + } + { + auto row = rows->AsList()->GetChildOrThrow(1); + ASSERT_EQ(row->GetType(), ENodeType::Map); + auto rowAValue = ConvertToNode(TYsonString(TStringBuf(R"([ + { + val = { + "$type" = "boolean"; + "$value" = "true"; + } + } + ])"))); + CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); + } + { + auto row = rows->AsList()->GetChildOrThrow(2); + ASSERT_EQ(row->GetType(), ENodeType::Map); + auto rowAValue = ConvertToNode(TYsonString(TStringBuf(R"([ + { + val = { + "$type" = "int64"; + "$value" = "-42"; + } + } + ])"))); + CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); + } + { + auto row = rows->AsList()->GetChildOrThrow(3); + ASSERT_EQ(row->GetType(), ENodeType::Map); + auto rowAValue = ConvertToNode(TYsonString(TStringBuf(R"([ + { + val = { + "$type" = "uint64"; + "$value" = "42"; + } + } + ])"))); + CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); + } +} + +TEST_F(TWriterForWebJson, YqlValueFormat_CompositeNoSchema) +{ + Config_->ValueFormat = EWebJsonValueFormat::Yql; + + auto schema = New<TTableSchema>(); + + auto yqlTypeA = ConvertToNode(TYsonString(TStringBuf(R"(["DataType"; "Yson"])"))); + + CreateStandardWriter({schema}); + { + bool written = Writer_->Write({ + MakeRow(NameTable_, {{"column_a", EValueType::Composite, "[1;2]"}}).Get(), + }); + EXPECT_TRUE(written); + Writer_->Close().Get().ThrowOnError(); + } + + auto result = ParseJsonToNode(OutputStream_.Str()); + ASSERT_EQ(result->GetType(), ENodeType::Map); + + auto rows = result->AsMap()->FindChild("rows"); + ASSERT_TRUE(rows); + auto yqlTypeRegistry = 
result->AsMap()->FindChild("yql_type_registry"); + ASSERT_TRUE(yqlTypeRegistry); + + ASSERT_EQ(yqlTypeRegistry->GetType(), ENodeType::List); + auto yqlTypes = ConvertTo<std::vector<INodePtr>>(yqlTypeRegistry); + + ASSERT_EQ(rows->GetType(), ENodeType::List); + ASSERT_EQ(rows->AsList()->GetChildCount(), 1); + + { + auto row = rows->AsList()->GetChildOrThrow(0); + ASSERT_EQ(row->GetType(), ENodeType::Map); + auto rowAValue = ConvertToNode(TYsonString(TStringBuf(R"({ + "val" = [ + { + "$type" = "int64"; + "$value" = "1"; + }; + { + "$type" = "int64"; + "$value" = "2"; + } + ] + })"))); + CHECK_YQL_TYPE_AND_VALUE(row, "column_a", yqlTypeA, rowAValue, yqlTypes); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/unittests/ya.make b/yt/yt/library/formats/unittests/ya.make new file mode 100644 index 0000000000..f080e66dc7 --- /dev/null +++ b/yt/yt/library/formats/unittests/ya.make @@ -0,0 +1,53 @@ +GTEST(unittester-formats) + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +PROTO_NAMESPACE(yt) + +SRCS( + protobuf_format_ut.proto + + arrow_parser_ut.cpp + dsv_parser_ut.cpp + dsv_writer_ut.cpp + protobuf_format_ut.cpp + row_helpers.cpp + schemaful_dsv_parser_ut.cpp + schemaful_dsv_writer_ut.cpp + skiff_format_ut.cpp + skiff_yson_converter_ut.cpp + value_examples.cpp + web_json_writer_ut.cpp + yamred_dsv_parser_ut.cpp + yamred_dsv_writer_ut.cpp + yaml_parser_ut.cpp + yaml_writer_ut.cpp + yamr_parser_ut.cpp + yamr_writer_ut.cpp + yson_helpers.cpp +) + +INCLUDE(${ARCADIA_ROOT}/yt/opensource.inc) + +PEERDIR( + yt/yt/build + yt/yt/core/test_framework + yt/yt/core + yt/yt/client + yt/yt/client/formats + yt/yt/library/formats + yt/yt/library/named_value + + contrib/libs/apache/arrow +) + +RESOURCE( + ${ARCADIA_ROOT}/library/cpp/type_info/ut/test-data/good-types.txt /types/good + ${ARCADIA_ROOT}/library/cpp/type_info/ut/test-data/bad-types.txt /types/bad +) + 
+SIZE(MEDIUM) + +REQUIREMENTS(ram:12) + +END() diff --git a/yt/yt/library/formats/unittests/yaml_parser_ut.cpp b/yt/yt/library/formats/unittests/yaml_parser_ut.cpp new file mode 100644 index 0000000000..95b9898360 --- /dev/null +++ b/yt/yt/library/formats/unittests/yaml_parser_ut.cpp @@ -0,0 +1,598 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/client/formats/parser.h> + +#include <yt/yt/client/formats/config.h> + +#include <yt/yt/library/formats/yaml_parser.h> + +#include <yt/yt/core/yson/writer.h> + +namespace NYT::NFormats { +namespace { + +using namespace NYson; + +//////////////////////////////////////////////////////////////////////////// + +TString ParseYaml(const TString& yaml, EYsonType ysonType) +{ + TStringStream inputStream(yaml); + TStringStream outputStream; + TYsonWriter writer(&outputStream, EYsonFormat::Pretty, ysonType); + auto config = New<TYamlFormatConfig>(); + ParseYaml(&inputStream, &writer, config, ysonType); + return outputStream.Str(); +} + +////////////////////////////////////////////////////////////////////////////// + +TEST(TYamlParserTest, Simple) +{ + TString yaml = R"( +hello)"; + // Here and in the rest of the tests we introduce an extra leading \n for better readability, which we later + // strip off in the comparison. + TString expectedYson = R"( +"hello")"; + EXPECT_EQ(ParseYaml(yaml, EYsonType::Node), expectedYson.substr(1)); +} + +TEST(TYamlParserTest, Integers) +{ + TString yaml = R"( +a: 1 +b: -1 +# Hex and oct +c: 0xDeAdBeEf +d: 0o42 +# Various non-normalized forms of numbers +e: -000 +f: +0 +g: +42 +# Would be oct in YAML 1.1, but not in YAML 1.2! 
+h: 0042 +i: -018 +# 2^63, should be unsigned +j: 9223372036854775808 +# 2^64 - 1, should be unsigned +k: 18446744073709551615 +l: -9223372036854775808 +m: !yt/uint64 1234 +n: !!int 23 +o: !!int -15)"; + TString expectedYson = R"( +{ + "a" = 1; + "b" = -1; + "c" = 3735928559u; + "d" = 34u; + "e" = 0; + "f" = 0; + "g" = 42; + "h" = 42; + "i" = -18; + "j" = 9223372036854775808u; + "k" = 18446744073709551615u; + "l" = -9223372036854775808; + "m" = 1234u; + "n" = 23; + "o" = -15; +})"; + EXPECT_EQ(ParseYaml(yaml, EYsonType::Node), expectedYson.substr(1)); + + std::vector<TString> invalidYamls = { + "!!int -0x42", + "!!int -0o23", + "!!int deadbeef", + "!!int 18446744073709551616", + "!!int -9223372036854775809", + "!yt/uint64 -1", + "!yt/uint64 18446744073709551616", + "!!int 0x", + // Examples below were integers in YAML 1.1, but not in YAML 1.2. + "!!int 123_456", + "!!int 190:20:30", + "!!int 0b1001", + "!!int \"\"", + }; + for (const auto& yaml : invalidYamls) { + EXPECT_THROW_MESSAGE_HAS_SUBSTR(ParseYaml(yaml, EYsonType::Node), std::exception, "is not an integer or does not fit") + << "For YAML: " << yaml << std::endl; + } +} + +TEST(TYamlParserTest, Floats) +{ + TString yaml = R"( +a: 1. +b: .2 +c: +3.14 +d: -2.17 +e: .inf +f: -.Inf +g: +.INF +h: .nan +i: .NaN +j: .NAN +k: !!float 42 +l: 1e2 +m: 1e+2 +n: 1e-2 +)"; + TString expectedYson = R"( +{ + "a" = 1.; + "b" = 0.2; + "c" = 3.14; + "d" = -2.17; + "e" = %inf; + "f" = %-inf; + "g" = %inf; + "h" = %nan; + "i" = %nan; + "j" = %nan; + "k" = 42.; + "l" = 100.; + "m" = 100.; + "n" = 0.01; +})"; + EXPECT_EQ(ParseYaml(yaml, EYsonType::Node), expectedYson.substr(1)); + + std::vector<TString> invalidYamls = { + "!!float 0o23", + "!!float 1e", + "!!float 1e+", + "!!float 1e-", + "!!float 1e-2.3", + "!!float 1e2.3", + // Examples below were integers in YAML 1.1, but not in YAML 1.2. 
+ "!!float 123_456", + "!!float 190:20:30.15", + "!!float inf", + "!!float .InF", + "!!float -+42.0", + "!!float .", + // For some reason arcadian FloatToString parses this, but it feels excessive to ban that + // despite not satisfying the regexp from the spec. + // "!!float 0x42", + }; + for (const auto& yaml : invalidYamls) { + EXPECT_THROW_MESSAGE_HAS_SUBSTR(ParseYaml(yaml, EYsonType::Node), std::exception, "is not a floating point") + << "For YAML: " << yaml << std::endl; + } +} + +TEST(TYamlParserTest, Booleans) +{ + TString yaml = R"( +a: true +b: false +c: True +d: False +e: TRUE +f: FALSE +g: !!bool true +)"; + TString expectedYson = R"( +{ + "a" = %true; + "b" = %false; + "c" = %true; + "d" = %false; + "e" = %true; + "f" = %false; + "g" = %true; +})"; + EXPECT_EQ(ParseYaml(yaml, EYsonType::Node), expectedYson.substr(1)); + + std::vector<TString> invalidYamls = { + "!!bool 1", + "!!bool 0", + // Examples below were booleans in YAML 1.1, but not in YAML 1.2. + "!!bool yes", + "!!bool no", + "!!bool on", + "!!bool off", + "!!bool y", + "!!bool n", + "!!bool \"\"", + }; + for (const auto& yaml : invalidYamls) { + EXPECT_THROW_MESSAGE_HAS_SUBSTR(ParseYaml(yaml, EYsonType::Node), std::exception, "is not a boolean") + << "For YAML: " << yaml << std::endl; + } +} + +TEST(TYamlParserTest, Nulls) +{ + TString yaml = R"( +a: null +b: Null +c: NULL +d: ~ +e: +f: !!null null +# This is not allowed by a regexp in a spec, but feels excessive to ban. +g: !!null foo +)"; + TString expectedYson = R"( +{ + "a" = #; + "b" = #; + "c" = #; + "d" = #; + "e" = #; + "f" = #; + "g" = #; +})"; + EXPECT_EQ(ParseYaml(yaml, EYsonType::Node), expectedYson.substr(1)); +} + +TEST(TYamlParserTest, Strings) +{ + TString yaml = R"( +a: "hello" +b: 'world' +c: of +d: !!str warcraft +e: !!str 42 +f: !!str ~ +g: ! 
hello +)"; + TString expectedYson = R"( +{ + "a" = "hello"; + "b" = "world"; + "c" = "of"; + "d" = "warcraft"; + "e" = "42"; + "f" = "~"; + "g" = "hello"; +})"; + EXPECT_EQ(ParseYaml(yaml, EYsonType::Node), expectedYson.substr(1)); +} + +TEST(TYamlParserTest, Mappings) +{ + TString yaml = R"( +a: + x: 1 + y: + foo: bar + bar: foo +42: + z: 3 +c: {} +)"; + TString expectedYson = R"( +{ + "a" = { + "x" = 1; + "y" = { + "foo" = "bar"; + "bar" = "foo"; + }; + }; + "42" = { + "z" = 3; + }; + "c" = {}; +})"; + EXPECT_EQ(ParseYaml(yaml, EYsonType::Node), expectedYson.substr(1)); +} + +TEST(TYamlParserTest, Sequences) +{ + TString yaml = R"( +- foo +- - 1 + - 2 + - 3 +- bar +- [] +- - - - null +)"; + TString expectedYson = R"( +[ + "foo"; + [ + 1; + 2; + 3; + ]; + "bar"; + []; + [ + [ + [ + #; + ]; + ]; + ]; +])"; + EXPECT_EQ(ParseYaml(yaml, EYsonType::Node), expectedYson.substr(1)); +} + +TEST(TYamlParserTest, Attributes) +{ + TString yaml = R"( +!yt/attrnode +- x: 1 + y: 2 +- a: !yt/attrnode + - {} + - 42 + b: !yt/attrnode + - x: null + - - 1 + - 2 + - 3 + c: !yt/attrnode + - foo: 1 + - null +)"; + // <x=1;y=2>{a=<>42; b=<x=#>[1;2;3]; c=<foo=1>#;} + TString expectedYson = R"( +< + "x" = 1; + "y" = 2; +> { + "a" = <> 42; + "b" = < + "x" = #; + > [ + 1; + 2; + 3; + ]; + "c" = < + "foo" = 1; + > #; +})"; + EXPECT_EQ(ParseYaml(yaml, EYsonType::Node), expectedYson.substr(1)); + + std::vector<std::pair<TString, TString>> invalidYamlsAndErrors = { + {R"( +!yt/attrnode +- x: 1 +)", "Unexpected event type \"sequence_end\""}, + {R"( +!yt/attrnode +- foo +- bar +)", "Unexpected event type \"scalar\""}, + {R"( +!yt/attrnode +- x: 1 +- y: 2 +- z: 3 +)", "Unexpected event type \"mapping_start\""}, +}; + for (const auto& [yaml, error] : invalidYamlsAndErrors) { + EXPECT_THROW_MESSAGE_HAS_SUBSTR(ParseYaml(yaml, EYsonType::Node), std::exception, error) + << "For YAML: " << yaml << std::endl; + } +}; + +TEST(TYamlParserTest, MultiDocument) +{ + TString yaml = R"( +a: 1 +--- +foo +--- +~ 
+--- +)"; + TString expectedYson = R"( +{ + "a" = 1; +}; +"foo"; +#; +#; +)"; + EXPECT_EQ(ParseYaml(yaml, EYsonType::ListFragment), expectedYson.substr(1)); +} + +TEST(TYamlParserTest, Anchors) +{ + TString yaml = R"( +a: &foo 1 +b: *foo +c: &bar + x: &baz + - False + - &qux True + y: 2 + z: *baz + t: *foo + w: *qux +d: *bar +e: *baz +f: *foo +g: *qux +)"; + TString expectedYson = R"( +{ + "a" = 1; + "b" = 1; + "c" = { + "x" = [ + %false; + %true; + ]; + "y" = 2; + "z" = [ + %false; + %true; + ]; + "t" = 1; + "w" = %true; + }; + "d" = { + "x" = [ + %false; + %true; + ]; + "y" = 2; + "z" = [ + %false; + %true; + ]; + "t" = 1; + "w" = %true; + }; + "e" = [ + %false; + %true; + ]; + "f" = 1; + "g" = %true; +})"; + EXPECT_EQ(ParseYaml(yaml, EYsonType::Node), expectedYson.substr(1)); + + std::vector<std::pair<TString, TString>> invalidYamlsAndErrors = { + {R"( +a: *foo +)", "undefined or unfinished anchor"}, + {R"( +- &foo a +- &foo b +)", "already defined"}, + {R"( +a: &foo +- b: &foo + - c +)", "already defined"}, + {R"( +a: &foo + bar: *foo +)", "undefined or unfinished anchor"}, + {R"( +a: &foo bar +*foo: baz +)", "alias as a map key is not supported"}, + {R"( +&foo a: b +)", "anchors on map keys is not supported"}, + }; + for (const auto& [yaml, error] : invalidYamlsAndErrors) { + EXPECT_THROW_MESSAGE_HAS_SUBSTR(ParseYaml(yaml, EYsonType::Node), std::exception, error) + << "For YAML: " << yaml << std::endl; + } +} + +TEST(TYamlParserTest, Empty) +{ + TString yaml = ""; + TString expectedYson = ""; + EXPECT_EQ(ParseYaml(yaml, EYsonType::ListFragment), expectedYson); +} + +//! There is a reverse test in yaml_writer_ut.cpp. 
+TEST(TYamlParserTest, RealExample) +{ + TString yaml = R"( +mount_config: {} +schema: !yt/attrnode +- strict: true + unique_keys: false +- - name: lat + required: false + type: double + type_v3: + type_name: optional + item: double + - name: lon + required: false + type: double + type_v3: + type_name: optional + item: double +native_cell_tag: !yt/uint64 9991 +creation_time: 2024-08-15T11:17:59.314773Z +inherit_acl: true +revision: !yt/uint64 8233452423020 +resource_usage: + node_count: 1 + chunk_count: 1 + disk_space_per_medium: + default: 562182 + disk_space: 562182 + chunk_host_cell_master_memory: 0 + master_memory: 0 + tablet_count: 0 + tablet_static_memory: 0 +acl: [] +id: 77d-1c53a-27070191-e4d8f5ac +parent_id: 77d-1c0d3-2707012f-ddf40dd7 +foreign: false +type: table +sequoia: false +ref_counter: 1 +builtin: false +owner: max +compression_ratio: 0.3679379456925491 +)"; + TString expectedYson = R"( +{ + "mount_config" = {}; + "schema" = < + "strict" = %true; + "unique_keys" = %false; + > [ + { + "name" = "lat"; + "required" = %false; + "type" = "double"; + "type_v3" = { + "type_name" = "optional"; + "item" = "double"; + }; + }; + { + "name" = "lon"; + "required" = %false; + "type" = "double"; + "type_v3" = { + "type_name" = "optional"; + "item" = "double"; + }; + }; + ]; + "native_cell_tag" = 9991u; + "creation_time" = "2024-08-15T11:17:59.314773Z"; + "inherit_acl" = %true; + "revision" = 8233452423020u; + "resource_usage" = { + "node_count" = 1; + "chunk_count" = 1; + "disk_space_per_medium" = { + "default" = 562182; + }; + "disk_space" = 562182; + "chunk_host_cell_master_memory" = 0; + "master_memory" = 0; + "tablet_count" = 0; + "tablet_static_memory" = 0; + }; + "acl" = []; + "id" = "77d-1c53a-27070191-e4d8f5ac"; + "parent_id" = "77d-1c0d3-2707012f-ddf40dd7"; + "foreign" = %false; + "type" = "table"; + "sequoia" = %false; + "ref_counter" = 1; + "builtin" = %false; + "owner" = "max"; + "compression_ratio" = 0.3679379456925491; +})"; + 
EXPECT_EQ(ParseYaml(yaml, EYsonType::Node), expectedYson.substr(1)); +} + +//////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/unittests/yaml_writer_ut.cpp b/yt/yt/library/formats/unittests/yaml_writer_ut.cpp new file mode 100644 index 0000000000..96fd4a4003 --- /dev/null +++ b/yt/yt/library/formats/unittests/yaml_writer_ut.cpp @@ -0,0 +1,319 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/library/formats/yaml_writer.h> + +#include <yt/yt/client/formats/config.h> + +#include <yt/yt/core/yson/string.h> + +#include <yt/yt/core/ytree/convert.h> + +namespace NYT::NFormats { +namespace { + +using namespace NYson; +using namespace NYTree; + +////////////////////////////////////////////////////////////////////////////// + +TString YsonToYaml(const TYsonString& yson, const TYsonString& formatAttributes = TYsonString(TStringBuf("{}"))) +{ + TStringStream outputStream; + auto config = ConvertTo<TYamlFormatConfigPtr>(formatAttributes); + auto writer = CreateYamlWriter(&outputStream, yson.GetType(), config); + Serialize(yson, writer.get()); + writer->Flush(); + return outputStream.Str(); +} + +////////////////////////////////////////////////////////////////////////////// + +TEST(TYamlWriterTest, Simple) +{ + TString yson = "hello"; + // Here and in the rest of the tests we introduce an extra leading \n for better readability, which we later + // strip off in the comparison. 
+ TString expectedYaml = R"( +hello +)"; + + EXPECT_EQ(YsonToYaml(TYsonString(yson)), expectedYaml.substr(1)); +} + +TEST(TYamlWriterTest, IntegersWithoutUintTag) +{ + TString yson = "{a=1; b=1u; c=-1; d=9223372036854775808u; e=-9223372036854775808; f=18446744073709551615u}"; + TString expectedYaml = R"( +a: 1 +b: 1 +c: -1 +d: 9223372036854775808 +e: -9223372036854775808 +f: 18446744073709551615 +)"; + EXPECT_EQ(YsonToYaml(TYsonString(yson)), expectedYaml.substr(1)); +} + +TEST(TYamlWriterTest, IntegersWithUintTag) +{ + TString formatAttributes = "{write_uint_tag=%true}"; + TString yson = "{a=1; b=1u; c=-1; d=9223372036854775808u; e=-9223372036854775808; f=18446744073709551615u}"; + TString expectedYaml = R"( +a: 1 +b: !yt/uint64 1 +c: -1 +d: !yt/uint64 9223372036854775808 +e: -9223372036854775808 +f: !yt/uint64 18446744073709551615 +)"; + EXPECT_EQ(YsonToYaml(TYsonString(yson), TYsonString(formatAttributes)), expectedYaml.substr(1)); +} + +TEST(TYamlWriterTest, Doubles) +{ + TString yson = "{a=2.7; b=-3.14; c=0.0; d=4.; e=1e30; f=%nan; g=%inf; h=%-inf}"; + TString expectedYaml = R"( +a: 2.7 +b: -3.14 +c: 0. +d: 4. +e: 1e+30 +f: .nan +g: .inf +h: -.inf +)"; + EXPECT_EQ(YsonToYaml(TYsonString(yson)), expectedYaml.substr(1)); +} + +TEST(TYamlWriterTest, Entity) +{ + TString yson = "{a=#}"; + TString expectedYaml = R"( +a: null +)"; + EXPECT_EQ(YsonToYaml(TYsonString(yson)), expectedYaml.substr(1)); +} + +TEST(TYamlWriterTest, Booleans) +{ + TString yson = "{a=%true; b=%false}"; + TString expectedYaml = R"( +a: true +b: false +)"; + EXPECT_EQ(YsonToYaml(TYsonString(yson)), expectedYaml.substr(1)); +} + +TEST(TYamlWriterTest, Strings) +{ + // a and b may be represented as plain scalars. + // c-e must be quoted on syntactical level, so libyaml chooses a single-quoted style. + // f-i must be quoted because they meet regexps for non-string types, so we force a double-quoted style. 
+ TString yson = R"({a=hello; b="23asd"; c=" "; d="foo\nbar"; e=""; f="42"; g="TRUE"; h="1e4000"; i="~";})"; + TString expectedYaml = R"( +a: hello +b: 23asd +c: ' ' +d: 'foo + + bar' +e: "" +f: "42" +g: "TRUE" +h: "1e4000" +i: "~" +)"; + EXPECT_EQ(YsonToYaml(TYsonString(yson)), expectedYaml.substr(1)); +} + +TEST(TYamlWriterTest, Mappings) +{ + TString yson("{a={x=1;y={foo=bar;bar=foo}};b={z=3};c={};}"); + TString expectedYaml = R"( +a: + x: 1 + y: + foo: bar + bar: foo +b: + z: 3 +c: {} +)"; + EXPECT_EQ(YsonToYaml(TYsonString(yson)), expectedYaml.substr(1)); +} + +TEST(TYamlWriterTest, Sequences) +{ + TString yson = "[foo; [1; 2; 3]; bar; []; [[[#]]]]"; + TString expectedYaml = R"( +- foo +- - 1 + - 2 + - 3 +- bar +- [] +- - - - null +)"; + EXPECT_EQ(YsonToYaml(TYsonString(yson)), expectedYaml.substr(1)); +} + +TEST(TYamlWriterTest, MultiDocument) +{ + TString yson = "foo;{a=1;b=2};[x;y];{};#;bar;[]"; + TString expectedYaml = R"( +foo +--- +a: 1 +b: 2 +--- +- x +- y +--- {} +--- null +--- bar +--- [] +)"; + EXPECT_EQ(YsonToYaml(TYsonString(yson, EYsonType::ListFragment)), expectedYaml.substr(1)); +} + +TEST(TYamlWriterTest, Attributes) +{ + TString yson = "<x=1;y=2>{a=<>42; b=<x=#>[1;2;3]; c=<foo=1>#;}"; + TString expectedYaml = R"( +!yt/attrnode +- x: 1 + y: 2 +- a: !yt/attrnode + - {} + - 42 + b: !yt/attrnode + - x: null + - - 1 + - 2 + - 3 + c: !yt/attrnode + - foo: 1 + - null +)"; + EXPECT_EQ(YsonToYaml(TYsonString(yson)), expectedYaml.substr(1)); +}; + +////////////////////////////////////////////////////////////////////////////// + +TEST(TYamlWriterTest, EmptyStream) +{ + TString yson = ""; + TString expectedYaml = ""; + EXPECT_EQ(YsonToYaml(TYsonString(yson, EYsonType::ListFragment)), expectedYaml); +} + +////////////////////////////////////////////////////////////////////////////// + +//! There is a reverse test in yaml_parser_ut.cpp. 
+TEST(TYamlWriterTest, RealExample) +{ + TString formatAttributes = "{write_uint_tag=%true}"; + TString yson = R"( +{ + "mount_config" = {}; + "schema" = < + "strict" = %true; + "unique_keys" = %false; + > [ + { + "name" = "lat"; + "required" = %false; + "type" = "double"; + "type_v3" = { + "type_name" = "optional"; + "item" = "double"; + }; + }; + { + "name" = "lon"; + "required" = %false; + "type" = "double"; + "type_v3" = { + "type_name" = "optional"; + "item" = "double"; + }; + }; + ]; + "native_cell_tag" = 9991u; + "creation_time" = "2024-08-15T11:17:59.314773Z"; + "inherit_acl" = %true; + "revision" = 8233452423020u; + "resource_usage" = { + "node_count" = 1; + "chunk_count" = 1; + "disk_space_per_medium" = { + "default" = 562182; + }; + "disk_space" = 562182; + "chunk_host_cell_master_memory" = 0; + "master_memory" = 0; + "tablet_count" = 0; + "tablet_static_memory" = 0; + }; + "acl" = []; + "id" = "77d-1c53a-27070191-e4d8f5ac"; + "parent_id" = "77d-1c0d3-2707012f-ddf40dd7"; + "foreign" = %false; + "type" = "table"; + "sequoia" = %false; + "ref_counter" = 1; + "builtin" = %false; + "owner" = "max"; + "compression_ratio" = 0.3679379456925491; +} + )"; + + TString expectedYaml = R"( +mount_config: {} +schema: !yt/attrnode +- strict: true + unique_keys: false +- - name: lat + required: false + type: double + type_v3: + type_name: optional + item: double + - name: lon + required: false + type: double + type_v3: + type_name: optional + item: double +native_cell_tag: !yt/uint64 9991 +creation_time: 2024-08-15T11:17:59.314773Z +inherit_acl: true +revision: !yt/uint64 8233452423020 +resource_usage: + node_count: 1 + chunk_count: 1 + disk_space_per_medium: + default: 562182 + disk_space: 562182 + chunk_host_cell_master_memory: 0 + master_memory: 0 + tablet_count: 0 + tablet_static_memory: 0 +acl: [] +id: 77d-1c53a-27070191-e4d8f5ac +parent_id: 77d-1c0d3-2707012f-ddf40dd7 +foreign: false +type: table +sequoia: false +ref_counter: 1 +builtin: false +owner: max 
+compression_ratio: 0.3679379456925491 +)"; + EXPECT_EQ(YsonToYaml(TYsonString(yson), TYsonString(formatAttributes)), expectedYaml.substr(1)); +} + +////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/unittests/yamr_parser_ut.cpp b/yt/yt/library/formats/unittests/yamr_parser_ut.cpp new file mode 100644 index 0000000000..84c9a28457 --- /dev/null +++ b/yt/yt/library/formats/unittests/yamr_parser_ut.cpp @@ -0,0 +1,601 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/core/test_framework/yson_consumer_mock.h> + +#include <yt/yt/library/formats/yamr_parser.h> + +#include <yt/yt/core/yson/null_consumer.h> + +namespace NYT::NFormats { +namespace { + +using namespace NYson; + +using ::testing::InSequence; +using ::testing::StrictMock; +using ::testing::NiceMock; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TYamrParserTest, Simple) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key1")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value1")); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginAttributes()); + EXPECT_CALL(Mock, OnKeyedItem("table_index")); + EXPECT_CALL(Mock, OnInt64Scalar(2)); + EXPECT_CALL(Mock, OnEndAttributes()); + EXPECT_CALL(Mock, OnEntity()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key2")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value2")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = + "key1\tvalue1\n" + "2\n" + "key2\tvalue2\n"; + + ParseYamr(input, &Mock); +} + 
+TEST(TYamrParserTest, ValueWithTabs) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar(TStringBuf("key1\0", 5))); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value with \t and some other")); + EXPECT_CALL(Mock, OnEndMap()); + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key2")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar(TStringBuf("another\0 value with \t", 21))); + EXPECT_CALL(Mock, OnEndMap()); + + TString input( + "key1\0\tvalue with \t and some other\n" + "key2\tanother\0 value with \t\n", + 34 + + 27); + + ParseYamr(input, &Mock); +} + +TEST(TYamrParserTest, SimpleWithSubkey) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key1")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("subkey1")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value1")); + EXPECT_CALL(Mock, OnEndMap()); + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key2")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("subkey2")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value2")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = + "key1\tsubkey1\tvalue1\n" + "key2\tsubkey2\tvalue2\n"; + + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = true; + + ParseYamr(input, &Mock, config); +} + +TEST(TYamrParserTest, IncompleteRows) +{ + StrictMock<TMockYsonConsumer> Mock; + 
InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key1")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("subkey1")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value1")); + EXPECT_CALL(Mock, OnEndMap()); + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("subkey")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("")); + EXPECT_CALL(Mock, OnEndMap()); + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key2")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("subkey2")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value2")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = + "key1\tsubkey1\tvalue1\n" + "key\tsubkey\n" + "key2\tsubkey2\tvalue2\n"; + + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = true; + + ParseYamr(input, &Mock, config); +} + +TEST(TYamrParserTest, IncorrectIncompleteRows) +{ + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = false; + + EXPECT_THROW(ParseYamr("\n", GetNullYsonConsumer(), config), std::exception); + EXPECT_THROW(ParseYamr("key\n", GetNullYsonConsumer(), config), std::exception); + EXPECT_THROW(ParseYamr("key\tvalue\nkey\n", GetNullYsonConsumer(), config), std::exception); +} + +TEST(TYamrParserTest, TabsInValue) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key")); + EXPECT_CALL(Mock, 
OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("a\tb\\tc\t")); + EXPECT_CALL(Mock, OnEndMap()); + + auto config = New<TYamrFormatConfig>(); + TString input = "key\ta\tb\\tc\t"; + ParseYamr(input, &Mock, config); +} + +TEST(TYamrParserTest, Escaping) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("\tkey\t")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("\n")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("a\tb\t\n")); + EXPECT_CALL(Mock, OnEndMap()); + + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = true; + config->EnableEscaping = true; + + TString input = "\\tkey\\t\t\\n\ta\tb\t\\n\n"; + ParseYamr(input, &Mock, config); +} + +TEST(TYamrParserTest, CustomSeparators) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value")); + EXPECT_CALL(Mock, OnEndMap()); + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key2")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value2")); + EXPECT_CALL(Mock, OnEndMap()); + + auto config = New<TYamrFormatConfig>(); + config->RecordSeparator = 'Y'; + config->FieldSeparator = 'X'; + + TString input = "keyXvalueYkey2Xvalue2Y"; + ParseYamr(input, &Mock, config); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TYamrLenvalParserTest, Simple) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); 
+ EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key1")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value1")); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginAttributes()); + EXPECT_CALL(Mock, OnKeyedItem("table_index")); + EXPECT_CALL(Mock, OnInt64Scalar(1)); + EXPECT_CALL(Mock, OnEndAttributes()); + EXPECT_CALL(Mock, OnEntity()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key2")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value2")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = TString( + "\x04\x00\x00\x00" "key1" + "\x06\x00\x00\x00" "value1" + + "\xff\xff\xff\xff" "\x01\x00\x00\x00" + + "\x04\x00\x00\x00" "key2" + "\x06\x00\x00\x00" "value2", + + 2 * (2 * 4 + 4 + 6) + 8); // all i32 + lengths of keys + + auto config = New<TYamrFormatConfig>(); + config->Lenval = true; + + ParseYamr(input, &Mock, config); +} + +TEST(TYamrLenvalParserTest, SimpleWithSubkey) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key1")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("subkey1")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value1")); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key2")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("subkey2")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value2")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = TString( + "\x04\x00\x00\x00" 
"key1" + "\x07\x00\x00\x00" "subkey1" + "\x06\x00\x00\x00" "value1" + + "\x04\x00\x00\x00" "key2" + "\x07\x00\x00\x00" "subkey2" + "\x06\x00\x00\x00" "value2", + + 2 * (3 * 4 + 4 + 7 + 6)); // all i32 + lengths of keys + + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = true; + config->Lenval = true; + + ParseYamr(input, &Mock, config); +} + +TEST(TYamrLenvalParserTest, EmptyFields) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = TString( + "\x00\x00\x00\x00" + "\x00\x00\x00\x00" + "\x00\x00\x00\x00", + 3 * 4); + + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = true; + config->Lenval = true; + + ParseYamr(input, &Mock, config); +} + +TEST(TYamrLenvalParserTest, HugeLength) +{ + TString input = TString( + "\xFF\xFF\xFF\xFF" + "\x00\x00\x00\x00" + "\x00\x00\x00\x00", + 3 * 4); + + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = true; + config->Lenval = true; + + EXPECT_THROW(ParseYamr(input, GetNullYsonConsumer(), config), std::exception); +} + +TEST(TYamrLenvalParserTest, SimpleEndOfMessage) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key1")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value1")); + EXPECT_CALL(Mock, OnEndMap()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginAttributes()); + EXPECT_CALL(Mock, OnKeyedItem("table_index")); + EXPECT_CALL(Mock, OnInt64Scalar(1)); + EXPECT_CALL(Mock, OnEndAttributes()); + 
EXPECT_CALL(Mock, OnEntity()); + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("key2")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("value2")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = TString( + "\x04\x00\x00\x00" "key1" + "\x06\x00\x00\x00" "value1" + + "\xff\xff\xff\xff" "\x01\x00\x00\x00" + + "\x04\x00\x00\x00" "key2" + "\x06\x00\x00\x00" "value2" + + "\xfb\xff\xff\xff" "\x02\x00\x00\x00\x00\x00\x00\x00", + + 2 * (2 * 4 + 4 + 6) + 8 + 12); // all i32 + lengths of keys + + auto config = New<TYamrFormatConfig>(); + config->Lenval = true; + config->EnableEom = true; + + ParseYamr(input, &Mock, config); +} + +TEST(TYamrLenvalParserTest, EmptyFieldsWithEOM) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("")); + EXPECT_CALL(Mock, OnKeyedItem("value")); + EXPECT_CALL(Mock, OnStringScalar("")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = TString( + "\x00\x00\x00\x00" + "\x00\x00\x00\x00" + "\x00\x00\x00\x00" + "\xfb\xff\xff\xff" "\x01\x00\x00\x00\x00\x00\x00\x00", + 3 * 4 + 12); + + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = true; + config->Lenval = true; + config->EnableEom = true; + + ParseYamr(input, &Mock, config); +} + +TEST(TYamrParserTest, IncorrectPlaceOfEOM) +{ + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = false; + config->Lenval = true; + config->EnableEom = true; + + TString input1 = TString( + "\x04\x00\x00\x00" "key1" + "\x06\x00\x00\x00" "value1" + + "\xff\xff\xff\xff" "\x01\x00\x00\x00" + + "\xfb\xff\xff\xff" "\x02\x00\x00\x00\x00\x00\x00\x00" + + "\x04\x00\x00\x00" "key2" + "\x06\x00\x00\x00" "value2", + + 2 * (2 * 4 + 4 
+ 6) + 8 + 12); // all i32 + lengths of keys + + TString input2 = TString( + "\x04\x00\x00\x00" "key1" + "\x06\x00\x00\x00" "value1" + + "\xff\xff\xff\xff" "\x01\x00\x00\x00" + + "\x04\x00\x00\x00" "key2" + + "\xfb\xff\xff\xff" "\x02\x00\x00\x00\x00\x00\x00\x00" + + "\x06\x00\x00\x00" "value2", + + 2 * (2 * 4 + 4 + 6) + 8 + 12); // all i32 + lengths of keys + + EXPECT_THROW(ParseYamr(input1, GetNullYsonConsumer(), config), std::exception); + EXPECT_THROW(ParseYamr(input2, GetNullYsonConsumer(), config), std::exception); +} + +TEST(TYamrParserTest, IncorrectEOM) +{ + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = false; + config->Lenval = true; + config->EnableEom = true; + + // Garbage after EOM marker + TString input1 = TString( + "\x04\x00\x00\x00" "key1" + "\x06\x00\x00\x00" "value1" + + "\xff\xff\xff\xff" "\x01\x00\x00\x00" + + "\xfb\xff\xff\xff" "\x01\x00\x00\x00\x00\x00\x00\x00" + + "\x04\x00\x00\x00" "key2" + "\x06\x00\x00\x00" "value2", + + 2 * (2 * 4 + 4 + 6) + 8 + 12); // all i32 + lengths of keys + + // Row count mismatch + TString input2 = TString( + "\x04\x00\x00\x00" "key1" + "\x06\x00\x00\x00" "value1" + + "\xff\xff\xff\xff" "\x01\x00\x00\x00" + + "\x04\x00\x00\x00" "key2" + "\x06\x00\x00\x00" "value2" + + "\xfb\xff\xff\xff" "\x03\x00\x00\x00\x00\x00\x00\x00", + + 2 * (2 * 4 + 4 + 6) + 8 + 12); // all i32 + lengths of keys + + // Missing EOM marker + TString input3 = TString( + "\x04\x00\x00\x00" "key1" + "\x06\x00\x00\x00" "value1" + + "\xff\xff\xff\xff" "\x01\x00\x00\x00" + + "\x04\x00\x00\x00" "key2" + "\x06\x00\x00\x00" "value2", + + 2 * (2 * 4 + 4 + 6) + 8); // all i32 + lengths of keys + + // Missing EOM marker with empty fields + TString input4 = TString( + "\x00\x00\x00\x00" + "\x00\x00\x00\x00" + "\x00\x00\x00\x00", + 3 * 4); + + EXPECT_THROW(ParseYamr(input1, GetNullYsonConsumer(), config), std::exception); + EXPECT_THROW(ParseYamr(input2, GetNullYsonConsumer(), config), std::exception); + EXPECT_THROW(ParseYamr(input3, 
GetNullYsonConsumer(), config), std::exception); + EXPECT_THROW(ParseYamr(input4, GetNullYsonConsumer(), config), std::exception); +} + +TEST(TYamrParserTest, UnsupportedEOMInTextMode) +{ + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = false; + config->Lenval = false; + config->EnableEom = true; + + TString input = TString( + "\x04\x00\x00\x00" "key1" + "\x06\x00\x00\x00" "value1" + + "\xff\xff\xff\xff" "\x01\x00\x00\x00" + + + "\x04\x00\x00\x00" "key2" + "\x06\x00\x00\x00" "value2" + + "\xfb\xff\xff\xff" "\x02\x00\x00\x00\x00\x00\x00\x00", + + 2 * (2 * 4 + 4 + 6) + 8 + 12); // all i32 + lengths of keys + + EXPECT_THROW(ParseYamr(input, GetNullYsonConsumer(), config), std::exception); +} + +TEST(TYamrParserTest, UnexpectedEOM) +{ + auto config = New<TYamrFormatConfig>(); + config->HasSubkey = false; + config->Lenval = true; + config->EnableEom = false; + + TString input = TString( + "\x04\x00\x00\x00" "key1" + "\x06\x00\x00\x00" "value1" + + "\xff\xff\xff\xff" "\x01\x00\x00\x00" + + "\x04\x00\x00\x00" "key2" + "\x06\x00\x00\x00" "value2" + + "\xfb\xff\xff\xff" "\x02\x00\x00\x00\x00\x00\x00\x00", + + 2 * (2 * 4 + 4 + 6) + 8 + 12); // all i32 + lengths of keys + + EXPECT_THROW(ParseYamr(input, GetNullYsonConsumer(), config), std::exception); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/unittests/yamr_writer_ut.cpp b/yt/yt/library/formats/unittests/yamr_writer_ut.cpp new file mode 100644 index 0000000000..2cad4bcbc9 --- /dev/null +++ b/yt/yt/library/formats/unittests/yamr_writer_ut.cpp @@ -0,0 +1,645 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/client/table_client/unversioned_row.h> +#include <yt/yt/client/table_client/name_table.h> + +#include <yt/yt/library/formats/yamr_writer.h> + +#include <yt/yt/core/concurrency/async_stream.h> + +namespace NYT::NFormats { +namespace { + 
+//////////////////////////////////////////////////////////////////////////////// + +using namespace NYTree; +using namespace NYson; +using namespace NConcurrency; +using namespace NTableClient; + +class TSchemalessWriterForYamrTest + : public ::testing::Test +{ +protected: + TNameTablePtr NameTable_; + int KeyId_; + int SubkeyId_; + int ValueId_; + int TableIndexId_; + int RangeIndexId_; + int RowIndexId_; + + TYamrFormatConfigPtr Config_; + + IUnversionedRowsetWriterPtr Writer_; + + TStringStream OutputStream_; + + TSchemalessWriterForYamrTest() + { + NameTable_ = New<TNameTable>(); + KeyId_ = NameTable_->RegisterName("key"); + SubkeyId_ = NameTable_->RegisterName("subkey"); + ValueId_ = NameTable_->RegisterName("value"); + TableIndexId_ = NameTable_->RegisterName(TableIndexColumnName); + RowIndexId_ = NameTable_->RegisterName(RowIndexColumnName); + RangeIndexId_ = NameTable_->RegisterName(RangeIndexColumnName); + + Config_ = New<TYamrFormatConfig>(); + } + + void CreateStandardWriter(TControlAttributesConfigPtr controlAttributes = New<TControlAttributesConfig>()) + { + Writer_ = CreateSchemalessWriterForYamr( + Config_, + NameTable_, + CreateAsyncAdapter(static_cast<IOutputStream*>(&OutputStream_)), + false, /* enableContextSaving */ + controlAttributes, + 0 /* keyColumnCount */); + } +}; + +TEST_F(TSchemalessWriterForYamrTest, Simple) +{ + CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); + row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); + + // Ignore system columns. + row1.AddValue(MakeUnversionedInt64Value(2, TableIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(42, RowIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(1, RangeIndexId_)); + + // Note that key and value follow not in order. 
+ TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); + row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); + + std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = + "key1\tvalue1\n" + "key2\tvalue2\n"; + + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, SimpleWithSubkey) +{ + Config_->HasSubkey = true; + CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); + row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); + row1.AddValue(MakeUnversionedStringValue("subkey1", SubkeyId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("subkey2", SubkeyId_)); + row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); + row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); + + std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = + "key1\tsubkey1\tvalue1\n" + "key2\tsubkey2\tvalue2\n"; + + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, SubkeyCouldBeSkipped) +{ + Config_->HasSubkey = true; + CreateStandardWriter(); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedStringValue("key", KeyId_)); + row.AddValue(MakeUnversionedStringValue("value", ValueId_)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = "key\t\tvalue\n"; + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, SubkeyCouldBeNull) +{ + Config_->HasSubkey = true; + CreateStandardWriter(); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedStringValue("key", KeyId_)); 
+ row.AddValue(MakeUnversionedSentinelValue(EValueType::Null, SubkeyId_)); + row.AddValue(MakeUnversionedStringValue("value", ValueId_)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = "key\t\tvalue\n"; + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, NonNullTerminatedStrings) +{ + Config_->HasSubkey = true; + CreateStandardWriter(); + + TUnversionedRowBuilder row; + const char* longString = "trashkeytrashsubkeytrashvalue"; + row.AddValue(MakeUnversionedStringValue(TStringBuf(longString + 5, 3), KeyId_)); + row.AddValue(MakeUnversionedStringValue(TStringBuf(longString + 13, 6), SubkeyId_)); + row.AddValue(MakeUnversionedStringValue(TStringBuf(longString + 24, 5), ValueId_)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = "key\tsubkey\tvalue\n"; + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, SkippedKey) +{ + CreateStandardWriter(); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedStringValue("value", ValueId_)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + EXPECT_FALSE(Writer_->Write(rows)); + + EXPECT_THROW(Writer_->Close() + .Get() + .ThrowOnError(), std::exception); +} + +TEST_F(TSchemalessWriterForYamrTest, SkippedValue) +{ + CreateStandardWriter(); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedStringValue("key", KeyId_)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + EXPECT_FALSE(Writer_->Write(rows)); + + EXPECT_THROW(Writer_->Close() + .Get() + .ThrowOnError(), std::exception); +} + +TEST_F(TSchemalessWriterForYamrTest, NotStringType) { + CreateStandardWriter(); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedStringValue("key", KeyId_)); + 
row.AddValue(MakeUnversionedInt64Value(42, ValueId_)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + EXPECT_FALSE(Writer_->Write(rows)); + + EXPECT_THROW(Writer_->Close() + .Get() + .ThrowOnError(), std::exception); +} + +TEST_F(TSchemalessWriterForYamrTest, ExtraItem) +{ + int trashId = NameTable_->RegisterName("trash"); + CreateStandardWriter(); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedStringValue("key", KeyId_)); + row.AddValue(MakeUnversionedStringValue("value", ValueId_)); + // This value will be ignored. + row.AddValue(MakeUnversionedStringValue("trash", trashId)); + // This value will also be ignored because Config_->HasSubkey is off, + // despite the fact it has non-string type. + row.AddValue(MakeUnversionedInt64Value(42, SubkeyId_)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = "key\tvalue\n"; + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, Escaping) +{ + Config_->HasSubkey = true; + Config_->EnableEscaping = true; + CreateStandardWriter(); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedStringValue("\n", KeyId_)); + row.AddValue(MakeUnversionedStringValue("\t", SubkeyId_)); + row.AddValue(MakeUnversionedStringValue("\n", ValueId_)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = "\\n\t\\t\t\\n\n"; + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, SimpleWithTableIndex) +{ + Config_->EnableTableIndex = true; + + auto controlAttributes = New<TControlAttributesConfig>(); + controlAttributes->EnableTableIndex = true; + CreateStandardWriter(controlAttributes); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); + 
row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); + row1.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); + row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); + row2.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); + + std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; + EXPECT_EQ(true, Writer_->Write(rows)); + + TUnversionedRowBuilder row3; + row3.AddValue(MakeUnversionedStringValue("key3", KeyId_)); + row3.AddValue(MakeUnversionedStringValue("value3", ValueId_)); + row3.AddValue(MakeUnversionedInt64Value(23, TableIndexId_)); + + rows = { row3.GetRow() }; + EXPECT_EQ(true, Writer_->Write(rows)); + + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = + "42\n" + "key1\tvalue1\n" + "key2\tvalue2\n" + "23\n" + "key3\tvalue3\n"; + + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, SimpleWithRowIndexAndTableIndex) +{ + Config_->EnableTableIndex = true; + + auto controlAttributes = New<TControlAttributesConfig>(); + controlAttributes->EnableTableIndex = true; + controlAttributes->EnableRowIndex = true; + CreateStandardWriter(controlAttributes); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); + row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); + row1.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(0, RowIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(0, RangeIndexId_)); + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); + row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); + std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; + EXPECT_EQ(true, Writer_->Write(rows)); + + TUnversionedRowBuilder row3; + row3.AddValue(MakeUnversionedStringValue("key3", KeyId_)); + 
row3.AddValue(MakeUnversionedStringValue("value3", ValueId_)); + row3.AddValue(MakeUnversionedInt64Value(5, RowIndexId_)); + row3.AddValue(MakeUnversionedInt64Value(1, RangeIndexId_)); + rows = { row3.GetRow() }; + EXPECT_EQ(true, Writer_->Write(rows)); + + TUnversionedRowBuilder row4; + row4.AddValue(MakeUnversionedStringValue("key4", KeyId_)); + row4.AddValue(MakeUnversionedStringValue("value4", ValueId_)); + row4.AddValue(MakeUnversionedInt64Value(23, TableIndexId_)); + row4.AddValue(MakeUnversionedInt64Value(10, RowIndexId_)); + row4.AddValue(MakeUnversionedInt64Value(2, RangeIndexId_)); + rows = { row4.GetRow() }; + EXPECT_EQ(true, Writer_->Write(rows)); + + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = + "42\n0\n" + "key1\tvalue1\n" + "key2\tvalue2\n" + "42\n5\n" + "key3\tvalue3\n" + "23\n10\n" + "key4\tvalue4\n"; + + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, Lenval) +{ + Config_->HasSubkey = true; + Config_->Lenval = true; + CreateStandardWriter(); + + // Note that order in both rows is unusual. 
+ TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); + row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); + row1.AddValue(MakeUnversionedStringValue("subkey1", SubkeyId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); + row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); + row2.AddValue(MakeUnversionedStringValue("subkey2", SubkeyId_)); + + std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = TString( + "\x04\x00\x00\x00" "key1" + "\x07\x00\x00\x00" "subkey1" + "\x06\x00\x00\x00" "value1" + + "\x04\x00\x00\x00" "key2" + "\x07\x00\x00\x00" "subkey2" + "\x06\x00\x00\x00" "value2", + + 2 * (3 * 4 + 4 + 6 + 7)); // all i32 + lengths of keys + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, LenvalWithEmptyFields) +{ + Config_->HasSubkey = true; + Config_->Lenval = true; + CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("", KeyId_)); + row1.AddValue(MakeUnversionedStringValue("subkey1", SubkeyId_)); + row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); + row2.AddValue(MakeUnversionedStringValue("", SubkeyId_)); + row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); + + TUnversionedRowBuilder row3; + row3.AddValue(MakeUnversionedStringValue("key3", KeyId_)); + row3.AddValue(MakeUnversionedStringValue("subkey3", SubkeyId_)); + row3.AddValue(MakeUnversionedStringValue("", ValueId_)); + + std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow(), row3.GetRow() }; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = TString( + "\x00\x00\x00\x00" "" + "\x07\x00\x00\x00" 
"subkey1" + "\x06\x00\x00\x00" "value1" + + "\x04\x00\x00\x00" "key2" + "\x00\x00\x00\x00" "" + "\x06\x00\x00\x00" "value2" + + "\x04\x00\x00\x00" "key3" + "\x07\x00\x00\x00" "subkey3" + "\x00\x00\x00\x00" "", + + 9 * 4 + (7 + 6) + (4 + 6) + (4 + 7)); // all i32 + lengths of keys + + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, LenvalWithKeySwitch) +{ + Config_->HasSubkey = true; + Config_->Lenval = true; + + auto controlAttributes = New<TControlAttributesConfig>(); + controlAttributes->EnableKeySwitch = true; + + Writer_ = CreateSchemalessWriterForYamr( + Config_, + NameTable_, + CreateAsyncAdapter(static_cast<IOutputStream*>(&OutputStream_)), + false, /* enableContextSaving */ + controlAttributes, + 1 /* keyColumnCount */); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); + row1.AddValue(MakeUnversionedStringValue("subkey1", SubkeyId_)); + row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); + row2.AddValue(MakeUnversionedStringValue("subkey21", SubkeyId_)); + row2.AddValue(MakeUnversionedStringValue("value21", ValueId_)); + + TUnversionedRowBuilder row3; + row3.AddValue(MakeUnversionedStringValue("key2", KeyId_)); + row3.AddValue(MakeUnversionedStringValue("subkey22", SubkeyId_)); + row3.AddValue(MakeUnversionedStringValue("value22", ValueId_)); + + std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow(), row3.GetRow() }; + EXPECT_EQ(true, Writer_->Write(rows)); + + TUnversionedRowBuilder row4; + row4.AddValue(MakeUnversionedStringValue("key3", KeyId_)); + row4.AddValue(MakeUnversionedStringValue("subkey3", SubkeyId_)); + row4.AddValue(MakeUnversionedStringValue("value3", ValueId_)); + + rows = { row4.GetRow() }; + EXPECT_EQ(true, Writer_->Write(rows)); + + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output = TString( + "\x04\x00\x00\x00" "key1" + 
"\x07\x00\x00\x00" "subkey1" + "\x06\x00\x00\x00" "value1" + + "\xfe\xff\xff\xff" // key switch + + "\x04\x00\x00\x00" "key2" + "\x08\x00\x00\x00" "subkey21" + "\x07\x00\x00\x00" "value21" + + "\x04\x00\x00\x00" "key2" + "\x08\x00\x00\x00" "subkey22" + "\x07\x00\x00\x00" "value22" + + "\xfe\xff\xff\xff" + + "\x04\x00\x00\x00" "key3" + "\x07\x00\x00\x00" "subkey3" + "\x06\x00\x00\x00" "value3", + + 14 * 4 + (4 + 7 + 6) + (4 + 8 + 7) + (4 + 8 + 7) + (4 + 7 + 6)); // all i32 + lengths of keys + + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, LenvalWithTableIndex) +{ + Config_->EnableTableIndex = true; + Config_->Lenval = true; + + auto controlAttributes = New<TControlAttributesConfig>(); + controlAttributes->EnableTableIndex = true; + CreateStandardWriter(controlAttributes); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); + row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); + row1.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); + row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); + row2.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); + + std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; + EXPECT_EQ(true, Writer_->Write(rows)); + + TUnversionedRowBuilder row3; + row3.AddValue(MakeUnversionedStringValue("key3", KeyId_)); + row3.AddValue(MakeUnversionedStringValue("value3", ValueId_)); + row3.AddValue(MakeUnversionedInt64Value(23, TableIndexId_)); + + rows = { row3.GetRow() }; + EXPECT_EQ(true, Writer_->Write(rows)); + + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output( + "\xff\xff\xff\xff" "\x2a\x00\x00\x00" // 42 + + "\x04\x00\x00\x00" "key1" + "\x06\x00\x00\x00" "value1" + + "\x04\x00\x00\x00" "key2" + "\x06\x00\x00\x00" "value2" + + "\xff\xff\xff\xff" "\x17\x00\x00\x00" // 23 + + "\x04\x00\x00\x00" "key3" + 
"\x06\x00\x00\x00" "value3", + + 10 * 4 + 3 * (4 + 6)); + + EXPECT_EQ(output, OutputStream_.Str()); +} + +TEST_F(TSchemalessWriterForYamrTest, LenvalWithRangeAndRowIndex) +{ + Config_->Lenval = true; + + auto controlAttributes = New<TControlAttributesConfig>(); + controlAttributes->EnableRowIndex = true; + controlAttributes->EnableRangeIndex = true; + CreateStandardWriter(controlAttributes); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("key1", KeyId_)); + row1.AddValue(MakeUnversionedStringValue("value1", ValueId_)); + row1.AddValue(MakeUnversionedInt64Value(42, RangeIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(23, RowIndexId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("key2", KeyId_)); + row2.AddValue(MakeUnversionedStringValue("value2", ValueId_)); + row2.AddValue(MakeUnversionedInt64Value(42, RangeIndexId_)); + row2.AddValue(MakeUnversionedInt64Value(24, RowIndexId_)); + + std::vector<TUnversionedRow> rows = { row1.GetRow(), row2.GetRow() }; + EXPECT_EQ(true, Writer_->Write(rows)); + + TUnversionedRowBuilder row3; + row3.AddValue(MakeUnversionedStringValue("key3", KeyId_)); + row3.AddValue(MakeUnversionedStringValue("value3", ValueId_)); + row3.AddValue(MakeUnversionedInt64Value(42, RangeIndexId_)); + row3.AddValue(MakeUnversionedInt64Value(25, RowIndexId_)); + + rows = { row3.GetRow() }; + EXPECT_EQ(true, Writer_->Write(rows)); + + Writer_->Close() + .Get() + .ThrowOnError(); + + TString output( + "\xfd\xff\xff\xff" "\x2a\x00\x00\x00" // 42 + "\xfc\xff\xff\xff" "\x17\x00\x00\x00\x00\x00\x00\x00" // 23 + + "\x04\x00\x00\x00" "key1" + "\x06\x00\x00\x00" "value1" + + "\x04\x00\x00\x00" "key2" + "\x06\x00\x00\x00" "value2" + + "\x04\x00\x00\x00" "key3" + "\x06\x00\x00\x00" "value3", + + 11 * 4 + 3 * (4 + 6)); + + EXPECT_EQ(output, OutputStream_.Str()); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff 
--git a/yt/yt/library/formats/unittests/yamred_dsv_parser_ut.cpp b/yt/yt/library/formats/unittests/yamred_dsv_parser_ut.cpp new file mode 100644 index 0000000000..41183ca5f0 --- /dev/null +++ b/yt/yt/library/formats/unittests/yamred_dsv_parser_ut.cpp @@ -0,0 +1,185 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/core/test_framework/yson_consumer_mock.h> + +#include <yt/yt/library/formats/yamred_dsv_parser.h> + +namespace NYT::NFormats { +namespace { + +using namespace NYson; + +using ::testing::InSequence; +using ::testing::StrictMock; +using ::testing::NiceMock; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TYamredDsvParserTest, Simple) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key_a")); + EXPECT_CALL(Mock, OnStringScalar("1")); + EXPECT_CALL(Mock, OnKeyedItem("key_b")); + EXPECT_CALL(Mock, OnStringScalar("2")); + EXPECT_CALL(Mock, OnKeyedItem("subkey_x")); + EXPECT_CALL(Mock, OnStringScalar("3")); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("5")); + EXPECT_CALL(Mock, OnKeyedItem("b")); + EXPECT_CALL(Mock, OnStringScalar("6")); + EXPECT_CALL(Mock, OnEndMap()); + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key_a")); + EXPECT_CALL(Mock, OnStringScalar("7")); + EXPECT_CALL(Mock, OnKeyedItem("key_b")); + EXPECT_CALL(Mock, OnStringScalar("8")); + EXPECT_CALL(Mock, OnKeyedItem("subkey_x")); + EXPECT_CALL(Mock, OnStringScalar("9")); + EXPECT_CALL(Mock, OnKeyedItem("b")); + EXPECT_CALL(Mock, OnStringScalar("max\tignat")); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("100")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = + "1 2\t3\ta=5\tb=6\n" + "7 8\t9\tb=max\\tignat\ta=100\n"; + + auto config = New<TYamredDsvFormatConfig>(); + config->HasSubkey = true; + 
config->KeyColumnNames.push_back("key_a"); + config->KeyColumnNames.push_back("key_b"); + config->SubkeyColumnNames.push_back("subkey_x"); + + ParseYamredDsv(input, &Mock, config); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TYamredDsvParserTest, EmptyField) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("0 1")); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("b")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = "\t0 1\ta=b\n"; + + auto config = New<TYamredDsvFormatConfig>(); + config->HasSubkey = true; + config->KeyColumnNames.push_back("key"); + config->SubkeyColumnNames.push_back("subkey"); + + ParseYamredDsv(input, &Mock, config); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TYamredDsvParserTest, Escaping) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("\t")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("0\n1")); + EXPECT_CALL(Mock, OnKeyedItem("a")); + EXPECT_CALL(Mock, OnStringScalar("\tb\nc")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = "\\t\t0\\n1\ta=\\tb\\nc\n"; + + auto config = New<TYamredDsvFormatConfig>(); + config->HasSubkey = true; + config->EnableEscaping = true; + config->KeyColumnNames.push_back("key"); + config->SubkeyColumnNames.push_back("subkey"); + + ParseYamredDsv(input, &Mock, config); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(TYamredDsvParserTest, Lenval) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence 
dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("a")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("bc")); + EXPECT_CALL(Mock, OnKeyedItem("d")); + EXPECT_CALL(Mock, OnStringScalar("e")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = TString( + "\x01\x00\x00\x00" "a" + "\x02\x00\x00\x00" "bc" + "\x03\x00\x00\x00" "d=e", + 3 * 4 + 1 + 2 + 3); + + auto config = New<TYamredDsvFormatConfig>(); + config->Lenval = true; + config->HasSubkey = true; + config->KeyColumnNames.push_back("key"); + config->SubkeyColumnNames.push_back("subkey"); + + ParseYamredDsv(input, &Mock, config); +} + +TEST(TYamredDsvParserTest, EOM) +{ + StrictMock<TMockYsonConsumer> Mock; + InSequence dummy; + + EXPECT_CALL(Mock, OnListItem()); + EXPECT_CALL(Mock, OnBeginMap()); + EXPECT_CALL(Mock, OnKeyedItem("key")); + EXPECT_CALL(Mock, OnStringScalar("a")); + EXPECT_CALL(Mock, OnKeyedItem("subkey")); + EXPECT_CALL(Mock, OnStringScalar("bc")); + EXPECT_CALL(Mock, OnKeyedItem("d")); + EXPECT_CALL(Mock, OnStringScalar("e")); + EXPECT_CALL(Mock, OnEndMap()); + + TString input = TString( + "\x01\x00\x00\x00" "a" + "\x02\x00\x00\x00" "bc" + "\x03\x00\x00\x00" "d=e" + "\xfb\xff\xff\xff" "\x01\x00\x00\x00\x00\x00\x00\x00", + 3 * 4 + 1 + 2 + 3 + 12); + + auto config = New<TYamredDsvFormatConfig>(); + config->Lenval = true; + config->EnableEom = true; + config->HasSubkey = true; + config->KeyColumnNames.push_back("key"); + config->SubkeyColumnNames.push_back("subkey"); + + ParseYamredDsv(input, &Mock, config); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/unittests/yamred_dsv_writer_ut.cpp b/yt/yt/library/formats/unittests/yamred_dsv_writer_ut.cpp new file mode 100644 index 0000000000..fc5f28639e --- /dev/null +++ 
b/yt/yt/library/formats/unittests/yamred_dsv_writer_ut.cpp @@ -0,0 +1,424 @@ +#include <yt/yt/core/test_framework/framework.h> + +#include <yt/yt/client/table_client/unversioned_row.h> +#include <yt/yt/client/table_client/name_table.h> + +#include <yt/yt/library/formats/yamred_dsv_writer.h> + +#include <yt/yt/core/concurrency/async_stream.h> + +#include <util/string/vector.h> + +#include <cstdio> + + +namespace NYT::NFormats { +namespace { + +using VectorStrok = TVector<TString>; + +//////////////////////////////////////////////////////////////////////////////// + +using namespace NYTree; +using namespace NYson; +using namespace NConcurrency; +using namespace NTableClient; + +class TSchemalessWriterForYamredDsvTest + : public ::testing::Test +{ +protected: + TNameTablePtr NameTable_; + TYamredDsvFormatConfigPtr Config_; + IUnversionedRowsetWriterPtr Writer_; + + TStringStream OutputStream_; + + int KeyAId_; + int KeyBId_; + int KeyCId_; + int ValueXId_; + int ValueYId_; + int TableIndexId_; + int RangeIndexId_; + int RowIndexId_; + + TSchemalessWriterForYamredDsvTest() + { + NameTable_ = New<TNameTable>(); + KeyAId_ = NameTable_->RegisterName("key_a"); + KeyBId_ = NameTable_->RegisterName("key_b"); + KeyCId_ = NameTable_->RegisterName("key_c"); + ValueXId_ = NameTable_->RegisterName("value_x"); + ValueYId_ = NameTable_->RegisterName("value_y"); + TableIndexId_ = NameTable_->RegisterName(TableIndexColumnName); + RowIndexId_ = NameTable_->RegisterName(RowIndexColumnName); + RangeIndexId_ = NameTable_->RegisterName(RangeIndexColumnName); + Config_ = New<TYamredDsvFormatConfig>(); + } + + void CreateStandardWriter(TControlAttributesConfigPtr controlAttributes = New<TControlAttributesConfig>()) + { + Writer_ = CreateSchemalessWriterForYamredDsv( + Config_, + NameTable_, + CreateAsyncAdapter(static_cast<IOutputStream*>(&OutputStream_)), + false, /* enableContextSaving */ + controlAttributes, + 0 /* keyColumnCount */); + } + + // Splits output into key and sorted vector 
of values that are entries of the last YAMR column. + // Returns true if success (there are >= 2 values after splitting by field separator), otherwise false. + bool ExtractKeyValue(TString output, TString& key, VectorStrok& value, char fieldSeparator = '\t') + { + char delimiter[2] = {fieldSeparator, 0}; + // Splitting by field separator. + value = SplitString(output, delimiter, 0 /* maxFields */, KEEP_EMPTY_TOKENS); + // We should at least have key and the rest of values. + if (value.size() < 2) + return false; + key = value[0]; + value.erase(value.begin()); + std::sort(value.begin(), value.end()); + return true; + } + + // The same function as previous, version with subkey. + // Returns true if success (there are >= 3 values after splitting by field separator), otherwise false. + bool ExtractKeySubkeyValue(TString output, TString& key, TString& subkey, VectorStrok& value, char fieldSeparator = '\t') + { + char delimiter[2] = {fieldSeparator, 0}; + // Splitting by field separator. + value = SplitString(output, delimiter, 0 /* maxFields */, KEEP_EMPTY_TOKENS); + // We should at least have key, subkey and the rest of values. + if (value.size() < 3) + return false; + key = value[0]; + subkey = value[1]; + // Drop only the key and subkey tokens; the remaining value entries must be kept for comparison. + value.erase(value.begin(), value.begin() + 2); + std::sort(value.begin(), value.end()); + return true; + } + + // Compares output and expected output ignoring the order of entries in YAMR value column. + void CompareKeyValue(TString output, TString expected, char recordSeparator = '\n', char fieldSeparator = '\t') + { + char delimiter[2] = {recordSeparator, 0}; + VectorStrok outputRows = SplitString(output, delimiter, 0 /* maxFields */ , KEEP_EMPTY_TOKENS); + VectorStrok expectedRows = SplitString(expected, delimiter, 0 /* maxFields */, KEEP_EMPTY_TOKENS); + // Row counts must match before rows are indexed pairwise, so a mismatch is fatal for this helper. + ASSERT_EQ(outputRows.size(), expectedRows.size()); + // Since there is \n after each row, there will be an extra empty string in both vectors.
+ EXPECT_EQ(outputRows.back(), ""); + ASSERT_EQ(expectedRows.back(), ""); + outputRows.pop_back(); + expectedRows.pop_back(); + + TString outputKey; + TString expectedKey; + VectorStrok outputValue; + VectorStrok expectedValue; + for (int rowIndex = 0; rowIndex < static_cast<int>(outputRows.size()); rowIndex++) { + EXPECT_TRUE(ExtractKeyValue(outputRows[rowIndex], outputKey, outputValue, fieldSeparator)); + ASSERT_TRUE(ExtractKeyValue(expectedRows[rowIndex], expectedKey, expectedValue, fieldSeparator)); + EXPECT_EQ(outputKey, expectedKey); + EXPECT_EQ(outputValue, expectedValue); + } + } + + // The same function as previous, version with subkey. + void CompareKeySubkeyValue(TString output, TString expected, char recordSeparator = '\n', char fieldSeparator = '\t') + { + char delimiter[2] = {recordSeparator, 0}; + VectorStrok outputRows = SplitString(output, delimiter, 0 /* maxFields */ , KEEP_EMPTY_TOKENS); + VectorStrok expectedRows = SplitString(expected, delimiter, 0 /* maxFields */, KEEP_EMPTY_TOKENS); + EXPECT_EQ(outputRows.size(), expectedRows.size()); + // Since there is \n after each row, there will be an extra empty string in both vectors. 
+ EXPECT_EQ(outputRows.back(), ""); + ASSERT_EQ(expectedRows.back(), ""); + outputRows.pop_back(); + expectedRows.pop_back(); + + TString outputKey; + TString expectedKey; + TString outputSubkey; + TString expectedSubkey; + VectorStrok outputValue; + VectorStrok expectedValue; + for (int rowIndex = 0; rowIndex < static_cast<int>(outputRows.size()); rowIndex++) { + EXPECT_TRUE(ExtractKeySubkeyValue(outputRows[rowIndex], outputKey, outputSubkey, outputValue, fieldSeparator)); + ASSERT_TRUE(ExtractKeySubkeyValue(expectedRows[rowIndex], expectedKey, expectedSubkey, expectedValue, fieldSeparator)); + EXPECT_EQ(outputKey, expectedKey); + EXPECT_EQ(outputSubkey, expectedSubkey); + EXPECT_EQ(outputValue, expectedValue); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST_F(TSchemalessWriterForYamredDsvTest, Simple) +{ + Config_->KeyColumnNames.emplace_back("key_a"); + CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("a1", KeyAId_)); + row1.AddValue(MakeUnversionedStringValue("x", ValueXId_)); + row1.AddValue(MakeUnversionedSentinelValue(EValueType::Null, ValueYId_)); + + // Ignore system columns. 
+ row1.AddValue(MakeUnversionedInt64Value(2, TableIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(42, RowIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(1, RangeIndexId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("a2", KeyAId_)); + row2.AddValue(MakeUnversionedStringValue("y", ValueYId_)); + row2.AddValue(MakeUnversionedStringValue("b", KeyBId_)); + + std::vector<TUnversionedRow> rows = {row1.GetRow(), row2.GetRow()}; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString expectedOutput = + "a1\tvalue_x=x\n" + "a2\tvalue_y=y\tkey_b=b\n"; + + TString output = OutputStream_.Str(); + + // The helper takes the actual output first and the expected output second. + CompareKeyValue(output, expectedOutput); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST_F(TSchemalessWriterForYamredDsvTest, SimpleWithSubkey) +{ + Config_->HasSubkey = true; + Config_->KeyColumnNames.emplace_back("key_a"); + Config_->KeyColumnNames.emplace_back("key_b"); + Config_->SubkeyColumnNames.emplace_back("key_c"); + CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("a", KeyAId_)); + row1.AddValue(MakeUnversionedStringValue("b1", KeyBId_)); + row1.AddValue(MakeUnversionedStringValue("c", KeyCId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("a", KeyAId_)); + row2.AddValue(MakeUnversionedStringValue("b2", KeyBId_)); + row2.AddValue(MakeUnversionedStringValue("c", KeyCId_)); + + std::vector<TUnversionedRow> rows = {row1.GetRow(), row2.GetRow()}; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString expectedOutput = + "a b1\tc\t\n" + "a b2\tc\t\n"; + + TString output = OutputStream_.Str(); + + // The helper takes the actual output first and the expected output second. + CompareKeySubkeyValue(output, expectedOutput); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST_F(TSchemalessWriterForYamredDsvTest, Lenval) +{ + Config_->Lenval = true; + 
Config_->HasSubkey = true; + Config_->EnableTableIndex = true; + Config_->KeyColumnNames.emplace_back("key_a"); + Config_->KeyColumnNames.emplace_back("key_b"); + Config_->SubkeyColumnNames.emplace_back("key_c"); + + auto controlAttributes = New<TControlAttributesConfig>(); + controlAttributes->EnableTableIndex = true; + controlAttributes->EnableRowIndex = true; + controlAttributes->EnableRangeIndex = true; + CreateStandardWriter(controlAttributes); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("a", KeyAId_)); + row1.AddValue(MakeUnversionedStringValue("b1", KeyBId_)); + row1.AddValue(MakeUnversionedStringValue("c", KeyCId_)); + row1.AddValue(MakeUnversionedStringValue("x", ValueXId_)); + + row1.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(23, RangeIndexId_)); + row1.AddValue(MakeUnversionedInt64Value(17, RowIndexId_)); + + TUnversionedRowBuilder row2; + row2.AddValue(MakeUnversionedStringValue("a", KeyAId_)); + row2.AddValue(MakeUnversionedStringValue("b2", KeyBId_)); + row2.AddValue(MakeUnversionedStringValue("c", KeyCId_)); + + row2.AddValue(MakeUnversionedInt64Value(42, TableIndexId_)); + row2.AddValue(MakeUnversionedInt64Value(23, RangeIndexId_)); + row2.AddValue(MakeUnversionedInt64Value(18, RowIndexId_)); + + std::vector<TUnversionedRow> rows = {row1.GetRow(), row2.GetRow()}; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString expectedOutput = TString( + "\xff\xff\xff\xff" "\x2a\x00\x00\x00" // Table index. + "\xfd\xff\xff\xff" "\x17\x00\x00\x00" // Range index. + "\xfc\xff\xff\xff" "\x11\x00\x00\x00\x00\x00\x00\x00" // Row index. 
+ + "\x04\x00\x00\x00" "a b1" + "\x01\x00\x00\x00" "c" + "\x09\x00\x00\x00" "value_x=x" + + "\x04\x00\x00\x00" "a b2" + "\x01\x00\x00\x00" "c" + "\x00\x00\x00\x00" "", + + 13 * 4 + 4 + 1 + 9 + 4 + 1 + 0); + + TString output = OutputStream_.Str(); + EXPECT_EQ(expectedOutput, output) + << "expected length: " << expectedOutput.length() + << ", " + << "actual length: " << output.length(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST_F(TSchemalessWriterForYamredDsvTest, Escaping) +{ + Config_->KeyColumnNames.emplace_back("key_a"); + Config_->KeyColumnNames.emplace_back("key_b"); + int columnWithEscapedNameId = NameTable_->GetIdOrRegisterName("value\t_t"); + CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("a\n", KeyAId_)); + row1.AddValue(MakeUnversionedStringValue("\nb\t", KeyBId_)); + row1.AddValue(MakeUnversionedStringValue("\nva\\lue\t", columnWithEscapedNameId)); + + std::vector<TUnversionedRow> rows = {row1.GetRow()}; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString expectedOutput = "a\\n \\nb\\t\tvalue\\t_t=\\nva\\\\lue\\t\n"; + TString output = OutputStream_.Str(); + + EXPECT_EQ(expectedOutput, output); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST_F(TSchemalessWriterForYamredDsvTest, SkippedKey) +{ + Config_->KeyColumnNames.emplace_back("key_a"); + Config_->KeyColumnNames.emplace_back("key_b"); + CreateStandardWriter(); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedStringValue("b", KeyBId_)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + EXPECT_FALSE(Writer_->Write(rows)); + + EXPECT_THROW(Writer_->Close() + .Get() + .ThrowOnError(), std::exception); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST_F(TSchemalessWriterForYamredDsvTest, SkippedSubkey) +{ + Config_->HasSubkey = true; + 
Config_->KeyColumnNames.emplace_back("key_a"); + Config_->SubkeyColumnNames.emplace_back("key_c"); + CreateStandardWriter(); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedStringValue("a", KeyAId_)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + EXPECT_FALSE(Writer_->Write(rows)); + + EXPECT_THROW(Writer_->Close() + .Get() + .ThrowOnError(), std::exception); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST_F(TSchemalessWriterForYamredDsvTest, NonStringValues) +{ + Config_->HasSubkey = true; + Config_->KeyColumnNames.emplace_back("key_a"); + Config_->SubkeyColumnNames.emplace_back("key_c"); + CreateStandardWriter(); + + TUnversionedRowBuilder row; + row.AddValue(MakeUnversionedInt64Value(-42, KeyAId_)); + row.AddValue(MakeUnversionedUint64Value(18, KeyCId_)); + row.AddValue(MakeUnversionedBooleanValue(true, KeyBId_)); + row.AddValue(MakeUnversionedDoubleValue(3.14, ValueXId_)); + row.AddValue(MakeUnversionedStringValue("yt", ValueYId_)); + + std::vector<TUnversionedRow> rows = { row.GetRow() }; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString expectedOutput = "-42\t18\tkey_b=true\tvalue_x=3.14\tvalue_y=yt\n"; + TString output = OutputStream_.Str(); + + EXPECT_EQ(expectedOutput, output); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST_F(TSchemalessWriterForYamredDsvTest, ErasingSubkeyColumnsWhenHasSubkeyIsFalse) +{ + Config_->KeyColumnNames.emplace_back("key_a"); + Config_->SubkeyColumnNames.emplace_back("key_b"); + // Config->HasSubkey = false by default. 
+ CreateStandardWriter(); + + TUnversionedRowBuilder row1; + row1.AddValue(MakeUnversionedStringValue("a", KeyAId_)); + row1.AddValue(MakeUnversionedStringValue("b", KeyBId_)); + row1.AddValue(MakeUnversionedStringValue("c", KeyCId_)); + row1.AddValue(MakeUnversionedStringValue("x", ValueXId_)); + + std::vector<TUnversionedRow> rows = {row1.GetRow()}; + + EXPECT_EQ(true, Writer_->Write(rows)); + Writer_->Close() + .Get() + .ThrowOnError(); + + TString expectedOutput = "a\tkey_c=c\tvalue_x=x\n"; + TString output = OutputStream_.Str(); + + EXPECT_EQ(expectedOutput, output); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace +} // namespace NYT::NFormats diff --git a/yt/yt/library/formats/unittests/yson_helpers.cpp b/yt/yt/library/formats/unittests/yson_helpers.cpp new file mode 100644 index 0000000000..669585caf7 --- /dev/null +++ b/yt/yt/library/formats/unittests/yson_helpers.cpp @@ -0,0 +1,29 @@ +#include "yson_helpers.h" + +#include <yt/yt/core/ytree/convert.h> +#include <yt/yt/core/ytree/node.h> +#include <yt/yt/core/yson/string.h> + +namespace NYT { + +using namespace NYson; +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +TString CanonizeYson(TStringBuf input) +{ + auto node = ConvertToNode(TYsonString(input)); + auto binaryYson = ConvertToYsonString(node); + + TStringStream out; + { + TYsonWriter writer(&out, NYson::EYsonFormat::Pretty); + ParseYsonStringBuffer(binaryYson.AsStringBuf(), EYsonType::Node, &writer); + } + return out.Str(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/yt/library/program/private.h b/yt/yt/library/formats/unittests/yson_helpers.h index e6e06faf63..d123d40447 100644 --- a/yt/yt/library/program/private.h +++ b/yt/yt/library/formats/unittests/yson_helpers.h @@ -1,14 +1,12 @@ #pragma once -#include "public.h" - -#include 
<yt/yt/core/logging/log.h> +#include <util/generic/string.h> namespace NYT { //////////////////////////////////////////////////////////////////////////////// -YT_DEFINE_GLOBAL(const NLogging::TLogger, ProgramLogger, "Program"); +TString CanonizeYson(TStringBuf yson); //////////////////////////////////////////////////////////////////////////////// diff --git a/yt/yt/library/monitoring/http_integration.cpp b/yt/yt/library/monitoring/http_integration.cpp deleted file mode 100644 index 25fe9ad304..0000000000 --- a/yt/yt/library/monitoring/http_integration.cpp +++ /dev/null @@ -1,209 +0,0 @@ -#include "http_integration.h" - -#include "monitoring_manager.h" - -#include <yt/yt/build/build.h> - -#include <yt/yt/core/json/config.h> -#include <yt/yt/core/json/json_writer.h> - -#include <yt/yt/core/ytree/fluent.h> - -#include <yt/yt/core/yson/parser.h> -#include <yt/yt/core/yson/consumer.h> - -#include <yt/yt/core/concurrency/scheduler.h> - -#include <yt/yt/core/ytree/helpers.h> -#include <yt/yt/core/ytree/virtual.h> -#include <yt/yt/core/ytree/ypath_detail.h> -#include <yt/yt/core/ytree/ypath_proxy.h> - -#include <yt/yt/core/http/http.h> -#include <yt/yt/core/http/helpers.h> -#include <yt/yt/core/http/server.h> - -#include <yt/yt/core/bus/tcp/dispatcher.h> - -#include <yt/yt/core/misc/ref_counted_tracker_statistics_producer.h> - -#include <yt/yt/library/profiling/solomon/exporter.h> - -#ifdef _linux_ -#include <yt/yt/library/ytprof/http/handler.h> -#include <yt/yt/library/ytprof/build_info.h> - -#include <yt/yt/library/backtrace_introspector/http/handler.h> -#endif - -#include <library/cpp/cgiparam/cgiparam.h> - -#include <util/string/vector.h> - -namespace NYT::NMonitoring { - -using namespace NYTree; -using namespace NYson; -using namespace NHttp; -using namespace NConcurrency; -using namespace NJson; - -//////////////////////////////////////////////////////////////////////////////// - -DEFINE_ENUM(EVerb, - (Get) - (List) -); - 
-//////////////////////////////////////////////////////////////////////////////// - -void Initialize( - const NHttp::IServerPtr& monitoringServer, - const NProfiling::TSolomonExporterConfigPtr& config, - TMonitoringManagerPtr* monitoringManager, - NYTree::IMapNodePtr* orchidRoot) -{ - *monitoringManager = New<TMonitoringManager>(); - (*monitoringManager)->Register("/ref_counted", CreateRefCountedTrackerStatisticsProducer()); - (*monitoringManager)->Register("/solomon", BIND([] (NYson::IYsonConsumer* consumer) { - auto tags = NProfiling::TSolomonRegistry::Get()->GetDynamicTags(); - - BuildYsonFluently(consumer) - .BeginMap() - .Item("dynamic_tags").Value(THashMap<TString, TString>(tags.begin(), tags.end())) - .EndMap(); - })); - (*monitoringManager)->Start(); - - *orchidRoot = NYTree::GetEphemeralNodeFactory(true)->CreateMap(); - SetNodeByYPath( - *orchidRoot, - "/monitoring", - CreateVirtualNode((*monitoringManager)->GetService())); - SetNodeByYPath( - *orchidRoot, - "/tcp_dispatcher", - CreateVirtualNode(NYT::NBus::TTcpDispatcher::Get()->GetOrchidService())); - -#ifdef _linux_ - auto buildInfo = NYTProf::TBuildInfo::GetDefault(); - buildInfo.BinaryVersion = GetVersion(); - - SetNodeByYPath( - *orchidRoot, - "/build_info", - NYTree::BuildYsonNodeFluently() - .BeginMap() - .Item("arc_revision").Value(buildInfo.ArcRevision) - .Item("binary_version").Value(buildInfo.BinaryVersion) - .Item("build_type").Value(buildInfo.BuildType) - .EndMap()); -#endif - - if (monitoringServer) { - auto exporter = New<NProfiling::TSolomonExporter>(config); - exporter->Register("/solomon", monitoringServer); - exporter->Start(); - - SetNodeByYPath( - *orchidRoot, - "/sensors", - CreateVirtualNode(exporter->GetSensorService())); - -#ifdef _linux_ - NYTProf::Register(monitoringServer, "/ytprof", buildInfo); - NBacktraceIntrospector::Register(monitoringServer, "/backtrace"); -#endif - monitoringServer->AddHandler( - "/orchid/", - GetOrchidYPathHttpHandler(*orchidRoot)); - } -} - 
-//////////////////////////////////////////////////////////////////////////////// - -class TYPathHttpHandler - : public IHttpHandler -{ -public: - explicit TYPathHttpHandler(IYPathServicePtr service) - : Service_(std::move(service)) - { } - - void HandleRequest( - const IRequestPtr& req, - const IResponseWriterPtr& rsp) override - { - const TStringBuf orchidPrefix = "/orchid"; - - TString path{req->GetUrl().Path}; - if (!path.StartsWith(orchidPrefix)) { - THROW_ERROR_EXCEPTION("HTTP request must start with %Qv prefix", - orchidPrefix) - << TErrorAttribute("path", path); - } - - path = path.substr(orchidPrefix.size(), TString::npos); - TCgiParameters params(req->GetUrl().RawQuery); - - auto verb = EVerb::Get; - - auto options = CreateEphemeralAttributes(); - for (const auto& param : params) { - if (param.first == "verb") { - verb = ParseEnum<EVerb>(param.second); - } else { - // Just a check, IAttributeDictionary takes raw YSON anyway. - try { - ValidateYson(TYsonString(param.second), DefaultYsonParserNestingLevelLimit); - } catch (const std::exception& ex) { - THROW_ERROR_EXCEPTION("Error parsing value of query parameter %Qv", - param.first) - << ex; - } - - options->SetYson(param.first, TYsonString(param.second)); - } - } - - TYsonString result; - switch (verb) { - case EVerb::Get: { - auto ypathReq = TYPathProxy::Get(path); - ToProto(ypathReq->mutable_options(), *options); - auto ypathRsp = WaitFor(ExecuteVerb(Service_, ypathReq)) - .ValueOrThrow(); - result = TYsonString(ypathRsp->value()); - break; - } - case EVerb::List: { - auto ypathReq = TYPathProxy::List(path); - auto ypathRsp = WaitFor(ExecuteVerb(Service_, ypathReq)) - .ValueOrThrow(); - result = TYsonString(ypathRsp->value()); - break; - } - default: - YT_ABORT(); - } - - rsp->SetStatus(EStatusCode::OK); - NHttp::ReplyJson(rsp, [&] (NYson::IYsonConsumer* writer) { - Serialize(result, writer); - }); - WaitFor(rsp->Close()) - .ThrowOnError(); - } - -private: - const IYPathServicePtr Service_; -}; - 
-IHttpHandlerPtr GetOrchidYPathHttpHandler(const IYPathServicePtr& service) -{ - return WrapYTException(New<TYPathHttpHandler>(service)); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NMonitoring diff --git a/yt/yt/library/monitoring/http_integration.h b/yt/yt/library/monitoring/http_integration.h deleted file mode 100644 index 48c12ca8a8..0000000000 --- a/yt/yt/library/monitoring/http_integration.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include "public.h" - -#include <yt/yt/core/ytree/ypath_service.h> - -#include <yt/yt/core/http/public.h> - -#include <yt/yt/library/profiling/solomon/public.h> - -namespace NYT::NMonitoring { - -//////////////////////////////////////////////////////////////////////////////// - -void Initialize( - const NHttp::IServerPtr& monitoringServer, - const NProfiling::TSolomonExporterConfigPtr& solomonExporterConfig, - TMonitoringManagerPtr* monitoringManager, - NYTree::IMapNodePtr* orchidRoot); - -NHttp::IHttpHandlerPtr CreateTracingHttpHandler(); - -NHttp::IHttpHandlerPtr GetOrchidYPathHttpHandler( - const NYTree::IYPathServicePtr& service); - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NMonitoring diff --git a/yt/yt/library/monitoring/monitoring_manager.cpp b/yt/yt/library/monitoring/monitoring_manager.cpp deleted file mode 100644 index 263443060b..0000000000 --- a/yt/yt/library/monitoring/monitoring_manager.cpp +++ /dev/null @@ -1,177 +0,0 @@ -#include "monitoring_manager.h" -#include "private.h" - -#include <yt/yt/core/concurrency/action_queue.h> -#include <yt/yt/core/concurrency/periodic_executor.h> - -#include <yt/yt/core/ytree/convert.h> -#include <yt/yt/core/ytree/ephemeral_node_factory.h> -#include <yt/yt/core/ytree/node.h> -#include <yt/yt/core/ytree/tree_visitor.h> -#include <yt/yt/core/ytree/ypath_detail.h> -#include <yt/yt/core/ytree/ypath_client.h> - -#include <yt/yt/library/profiling/sensor.h> 
- -namespace NYT::NMonitoring { - -using namespace NYTree; -using namespace NYPath; -using namespace NYson; -using namespace NConcurrency; - -//////////////////////////////////////////////////////////////////////////////// - -static constexpr auto& Logger = MonitoringLogger; - -static const auto UpdatePeriod = TDuration::Seconds(3); -static const auto EmptyRoot = GetEphemeralNodeFactory()->CreateMap(); - -//////////////////////////////////////////////////////////////////////////////// - -class TMonitoringManager::TImpl - : public TRefCounted -{ -public: - void Register(const TYPath& path, TYsonProducer producer) - { - auto guard = Guard(SpinLock_); - YT_VERIFY(PathToProducer_.emplace(path, producer).second); - } - - void Unregister(const TYPath& path) - { - auto guard = Guard(SpinLock_); - YT_VERIFY(PathToProducer_.erase(path) == 1); - } - - IYPathServicePtr GetService() - { - return New<TYPathService>(this); - } - - void Start() - { - auto guard = Guard(SpinLock_); - - YT_VERIFY(!Started_); - - PeriodicExecutor_ = New<TPeriodicExecutor>( - ActionQueue_->GetInvoker(), - BIND(&TImpl::Update, MakeWeak(this)), - UpdatePeriod); - PeriodicExecutor_->Start(); - - Started_ = true; - } - - void Stop() - { - auto guard = Guard(SpinLock_); - - if (!Started_) - return; - - Started_ = false; - YT_UNUSED_FUTURE(PeriodicExecutor_->Stop()); - Root_.Reset(); - } - -private: - class TYPathService - : public TYPathServiceBase - { - public: - explicit TYPathService(TIntrusivePtr<TImpl> owner) - : Owner_(std::move(owner)) - { } - - TResolveResult Resolve(const TYPath& path, const IYPathServiceContextPtr& /*context*/) override - { - return TResolveResultThere{Owner_->GetRoot(), path}; - } - - private: - const TIntrusivePtr<TImpl> Owner_; - - }; - - bool Started_ = false; - TActionQueuePtr ActionQueue_ = New<TActionQueue>("Monitoring"); - TPeriodicExecutorPtr PeriodicExecutor_; - - YT_DECLARE_SPIN_LOCK(NThreading::TSpinLock, SpinLock_); - THashMap<TString, NYson::TYsonProducer> 
PathToProducer_; - IMapNodePtr Root_; - - void Update() - { - YT_LOG_DEBUG("Started updating monitoring state"); - - YT_PROFILE_TIMING("/monitoring/update_time") { - auto newRoot = GetEphemeralNodeFactory()->CreateMap(); - - THashMap<TString, NYson::TYsonProducer> pathToProducer;; - { - auto guard = Guard(SpinLock_); - pathToProducer = PathToProducer_; - } - - for (const auto& [path, producer] : pathToProducer) { - auto value = ConvertToYsonString(producer); - SyncYPathSet(newRoot, path, value); - } - - if (Started_) { - auto guard = Guard(SpinLock_); - std::swap(Root_, newRoot); - } - } - YT_LOG_DEBUG("Finished updating monitoring state"); - } - - IMapNodePtr GetRoot() - { - auto guard = Guard(SpinLock_); - return Root_ ? Root_ : EmptyRoot; - } -}; - -DEFINE_REFCOUNTED_TYPE(TMonitoringManager) - -//////////////////////////////////////////////////////////////////////////////// - -TMonitoringManager::TMonitoringManager() - : Impl_(New<TImpl>()) -{ } - -TMonitoringManager::~TMonitoringManager() = default; - -void TMonitoringManager::Register(const TYPath& path, TYsonProducer producer) -{ - Impl_->Register(path, producer); -} - -void TMonitoringManager::Unregister(const TYPath& path) -{ - Impl_->Unregister(path); -} - -IYPathServicePtr TMonitoringManager::GetService() -{ - return Impl_->GetService(); -} - -void TMonitoringManager::Start() -{ - Impl_->Start(); -} - -void TMonitoringManager::Stop() -{ - Impl_->Stop(); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NMonitoring diff --git a/yt/yt/library/monitoring/monitoring_manager.h b/yt/yt/library/monitoring/monitoring_manager.h deleted file mode 100644 index b2582bbe70..0000000000 --- a/yt/yt/library/monitoring/monitoring_manager.h +++ /dev/null @@ -1,55 +0,0 @@ -#pragma once - -#include "public.h" - -#include <yt/yt/core/yson/consumer.h> -#include <yt/yt/core/yson/producer.h> - -#include <yt/yt/core/ypath/public.h> - -#include <yt/yt/core/ytree/public.h> 
- -namespace NYT::NMonitoring { - -//////////////////////////////////////////////////////////////////////////////// - -//! Exposes a tree assembled from results returned by a set of -//! registered NYson::TYsonProducer-s. -/*! - * \note - * The results are cached and periodically updated. - */ -class TMonitoringManager - : public TRefCounted -{ -public: - TMonitoringManager(); - ~TMonitoringManager(); - - //! Registers a new #producer for a given #path. - void Register(const NYPath::TYPath& path, NYson::TYsonProducer producer); - - //! Unregisters an existing producer for the specified #path. - void Unregister(const NYPath::TYPath& path); - - //! Returns the service representing the whole tree. - /*! - * \note The service is thread-safe. - */ - NYTree::IYPathServicePtr GetService(); - - //! Starts periodic updates. - void Start(); - - //! Stops periodic updates. - void Stop(); - -private: - class TImpl; - TIntrusivePtr<TImpl> Impl_; - -}; - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NMonitoring diff --git a/yt/yt/library/monitoring/private.h b/yt/yt/library/monitoring/private.h deleted file mode 100644 index 61809bdb68..0000000000 --- a/yt/yt/library/monitoring/private.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include "public.h" - -#include <yt/yt/core/logging/log.h> - -namespace NYT::NMonitoring { - -//////////////////////////////////////////////////////////////////////////////// - -YT_DEFINE_GLOBAL(const NLogging::TLogger, MonitoringLogger, "Monitoring"); - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NJournalClient diff --git a/yt/yt/library/monitoring/public.h b/yt/yt/library/monitoring/public.h deleted file mode 100644 index 3514bdd858..0000000000 --- a/yt/yt/library/monitoring/public.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include <yt/yt/core/misc/public.h> - -namespace NYT::NMonitoring { - 
-//////////////////////////////////////////////////////////////////////////////// - -DECLARE_REFCOUNTED_CLASS(TMonitoringManager) - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NMonitoring diff --git a/yt/yt/library/monitoring/ya.make b/yt/yt/library/monitoring/ya.make deleted file mode 100644 index c2fccd99ac..0000000000 --- a/yt/yt/library/monitoring/ya.make +++ /dev/null @@ -1,27 +0,0 @@ -LIBRARY() - -INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) - -SRCS( - http_integration.cpp - monitoring_manager.cpp -) - -PEERDIR( - yt/yt/core - yt/yt/build - yt/yt/library/profiling - yt/yt/library/profiling/solomon - library/cpp/cgiparam -) - -IF (OS_LINUX) - PEERDIR( - yt/yt/library/ytprof - yt/yt/library/ytprof/http - - yt/yt/library/backtrace_introspector/http - ) -ENDIF() - -END() diff --git a/yt/yt/library/oom/oom.cpp b/yt/yt/library/oom/oom.cpp deleted file mode 100644 index 56714260ce..0000000000 --- a/yt/yt/library/oom/oom.cpp +++ /dev/null @@ -1,144 +0,0 @@ -#include "oom.h" - -#include <thread> -#include <mutex> - -#include <yt/yt/core/misc/proc.h> -#include <yt/yt/core/misc/ref_counted_tracker.h> - -#include <library/cpp/yt/assert/assert.h> -#include <library/cpp/yt/logging/logger.h> - -#include <yt/yt/library/ytprof/heap_profiler.h> -#include <yt/yt/library/ytprof/profile.h> - -#include <util/datetime/base.h> -#include <util/system/file.h> -#include <util/stream/output.h> -#include <util/stream/file.h> -#include <util/string/split.h> -#include <util/system/fs.h> - -namespace NYT { - -//////////////////////////////////////////////////////////////////////////////// - -namespace { - -YT_DEFINE_GLOBAL(const NYT::NLogging::TLogger, Logger, "OOM"); - -const char* TCMallocStats[] = { - "tcmalloc.per_cpu_caches_active", - "generic.virtual_memory_used", - "generic.physical_memory_used", - "generic.bytes_in_use_by_app", - "generic.heap_size", - "tcmalloc.central_cache_free", - "tcmalloc.cpu_free", - 
"tcmalloc.page_heap_free", - "tcmalloc.page_heap_unmapped", - "tcmalloc.page_algorithm", - "tcmalloc.max_total_thread_cache_bytes", - "tcmalloc.thread_cache_free", - "tcmalloc.thread_cache_count", - "tcmalloc.local_bytes", - "tcmalloc.external_fragmentation_bytes", - "tcmalloc.metadata_bytes", - "tcmalloc.transfer_cache_free", - "tcmalloc.hard_usage_limit_bytes", - "tcmalloc.desired_usage_limit_bytes", - "tcmalloc.required_bytes", -}; - -void OomWatchdog(TOomWatchdogOptions options) -{ - while (true) { - auto rss = GetProcessMemoryUsage().Rss; - - if (options.MemoryLimit && static_cast<i64>(rss) > *options.MemoryLimit) { - auto profile = NYTProf::CaptureHeapProfile(tcmalloc::ProfileType::kHeap); - - TFileOutput output(options.HeapDumpPath); - NYTProf::WriteCompressedProfile(&output, profile); - output.Finish(); - - auto rctDump = TRefCountedTracker::Get()->GetDebugInfo(); - for (const auto& line : StringSplitter(rctDump).Split('\n')) { - YT_LOG_DEBUG("RCT %v", line.Token()); - } - - auto parseMemoryAmount = [] (const TStringBuf strValue) { - const TStringBuf kbSuffix = " kB"; - YT_VERIFY(strValue.EndsWith(kbSuffix)); - auto startPos = strValue.find_first_not_of(' '); - auto valueString = strValue.substr( - startPos, - strValue.size() - kbSuffix.size() - startPos); - return FromString<ui64>(valueString) * 1_KB; - }; - - ui64 rssAnon = 0; - ui64 rssFile = 0; - ui64 rssShmem = 0; - - TFileInput statusFile(Format("/proc/self/status")); - TString line; - while (statusFile.ReadLine(line)) { - const TStringBuf rssAnonHeader = "RssAnon:\t"; - if (line.StartsWith(rssAnonHeader)) { - rssAnon = parseMemoryAmount(line.substr(rssAnonHeader.size())); - continue; - } - - const TStringBuf rssFileHeader = "RssFile:\t"; - if (line.StartsWith(rssFileHeader)) { - rssFile = parseMemoryAmount(line.substr(rssFileHeader.size())); - continue; - } - - const TStringBuf rssShmemHeader = "RssShmem:\t"; - if (line.StartsWith(rssShmemHeader)) { - rssShmem = 
parseMemoryAmount(line.substr(rssShmemHeader.size())); - continue; - } - } - - YT_LOG_DEBUG("Memory statistis (RssTotal: %v, RssAnon: %v, RssFile %v, RssShmem: %v, TCMalloc: %v)", - rss, - rssAnon, - rssFile, - rssShmem, - MakeFormattableView( - TRange(TCMallocStats), - [&] (auto* builder, auto metric) { - auto value = tcmalloc::MallocExtension::GetNumericProperty(metric); - builder->AppendFormat("%v: %v", metric, value); - })); - - YT_LOG_FATAL("Early OOM triggered (MemoryUsage: %v, MemoryLimit: %v, HeapDump: %v, CurrentWorkingDirectory: %v)", - rss, - *options.MemoryLimit, - options.HeapDumpPath, - NFs::CurrentWorkingDirectory()); - } - - Sleep(TDuration::MilliSeconds(10)); - } -} - -} // namespace - -//////////////////////////////////////////////////////////////////////////////// - -void EnableEarlyOomWatchdog(TOomWatchdogOptions options) -{ - static std::once_flag onceFlag; - - std::call_once(onceFlag, [options] { - std::thread(OomWatchdog, options).detach(); - }); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT diff --git a/yt/yt/library/oom/oom.h b/yt/yt/library/oom/oom.h deleted file mode 100644 index 7a5892918a..0000000000 --- a/yt/yt/library/oom/oom.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include <optional> - -#include <util/generic/string.h> - -namespace NYT { - -//////////////////////////////////////////////////////////////////////////////// - -struct TOomWatchdogOptions -{ - std::optional<i64> MemoryLimit; - TString HeapDumpPath = "oom.pb.gz"; -}; - -void EnableEarlyOomWatchdog(TOomWatchdogOptions options); - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT diff --git a/yt/yt/library/oom/unittests/oom_ut.cpp b/yt/yt/library/oom/unittests/oom_ut.cpp deleted file mode 100644 index 78f0182973..0000000000 --- a/yt/yt/library/oom/unittests/oom_ut.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include <gtest/gtest.h> - -#include 
<yt/yt/library/oom/oom.h> - -#include <util/datetime/base.h> -#include <util/system/fs.h> -#include <util/generic/size_literals.h> - -namespace NYT { -namespace { - -//////////////////////////////////////////////////////////////////////////////// - -TEST(TEarlyOomTest, Crash) -{ - auto checkOom = [] { - EnableEarlyOomWatchdog(TOomWatchdogOptions{ - .MemoryLimit = 0, - }); - - Sleep(TDuration::Seconds(5)); - }; - - ASSERT_DEATH(checkOom(), ""); - - ASSERT_TRUE(NFs::Exists("oom.pb.gz")); -} - -TEST(TEarlyOomTest, NoCrash) -{ - EnableEarlyOomWatchdog(TOomWatchdogOptions{ - .MemoryLimit = 1_GB, - }); - - Sleep(TDuration::Seconds(5)); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT diff --git a/yt/yt/library/oom/ya.make b/yt/yt/library/oom/ya.make deleted file mode 100644 index f4845495d8..0000000000 --- a/yt/yt/library/oom/ya.make +++ /dev/null @@ -1,20 +0,0 @@ -LIBRARY() - -INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) - -SRCS( - oom.cpp -) - -PEERDIR( - yt/yt/core - yt/yt/library/ytprof -) - -END() - -IF (OS_LINUX AND NOT SANITIZER_TYPE) - RECURSE( - unittests - ) -ENDIF() diff --git a/yt/yt/library/process/config.cpp b/yt/yt/library/process/config.cpp new file mode 100644 index 0000000000..9099aca7f0 --- /dev/null +++ b/yt/yt/library/process/config.cpp @@ -0,0 +1,34 @@ +#include "config.h" + +namespace NYT::NPipes { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +void TIODispatcherConfig::Register(TRegistrar registrar) +{ + registrar.Parameter("thread_pool_polling_period", &TThis::ThreadPoolPollingPeriod) + .Default(TDuration::MilliSeconds(10)); +} + +TIODispatcherConfigPtr TIODispatcherConfig::ApplyDynamic( + const TIODispatcherDynamicConfigPtr& dynamicConfig) const +{ + auto mergedConfig = CloneYsonStruct(MakeStrong(this)); + UpdateYsonStructField(mergedConfig->ThreadPoolPollingPeriod, dynamicConfig->ThreadPoolPollingPeriod); 
+ mergedConfig->Postprocess(); + return mergedConfig; +} + +//////////////////////////////////////////////////////////////////////////////// + +void TIODispatcherDynamicConfig::Register(TRegistrar registrar) +{ + registrar.Parameter("thread_pool_polling_period", &TThis::ThreadPoolPollingPeriod) + .Optional(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NPipes diff --git a/yt/yt/library/process/config.h b/yt/yt/library/process/config.h new file mode 100644 index 0000000000..84be0ef7be --- /dev/null +++ b/yt/yt/library/process/config.h @@ -0,0 +1,43 @@ +#pragma once + +#include "public.h" + +#include <yt/yt/core/ytree/yson_struct.h> + +namespace NYT::NPipes { + +//////////////////////////////////////////////////////////////////////////////// + +class TIODispatcherConfig + : public NYTree::TYsonStruct +{ +public: + TDuration ThreadPoolPollingPeriod; + + TIODispatcherConfigPtr ApplyDynamic(const TIODispatcherDynamicConfigPtr& dynamicConfig) const; + + REGISTER_YSON_STRUCT(TIODispatcherConfig); + + static void Register(TRegistrar registrar); +}; + +DEFINE_REFCOUNTED_TYPE(TIODispatcherConfig) + +//////////////////////////////////////////////////////////////////////////////// + +class TIODispatcherDynamicConfig + : public NYTree::TYsonStruct +{ +public: + std::optional<TDuration> ThreadPoolPollingPeriod; + + REGISTER_YSON_STRUCT(TIODispatcherDynamicConfig); + + static void Register(TRegistrar registrar); +}; + +DEFINE_REFCOUNTED_TYPE(TIODispatcherDynamicConfig) + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NPipes diff --git a/yt/yt/library/process/configure_io_dispatcher.cpp b/yt/yt/library/process/configure_io_dispatcher.cpp new file mode 100644 index 0000000000..d2f834b0f2 --- /dev/null +++ b/yt/yt/library/process/configure_io_dispatcher.cpp @@ -0,0 +1,41 @@ +#include "io_dispatcher.h" +#include "config.h" + +#include 
<yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT::NPipes { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TIODispatcherConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void SetupSingletonConfigParameter(TYsonStructParameter<TIODispatcherDynamicConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TIODispatcherConfigPtr& config) +{ + TIODispatcher::Get()->Configure(config); +} + +void ReconfigureSingleton( + const TIODispatcherConfigPtr& config, + const TIODispatcherDynamicConfigPtr& dynamicConfig) +{ + TIODispatcher::Get()->Configure(config->ApplyDynamic(dynamicConfig)); +} + +YT_DEFINE_RECONFIGURABLE_SINGLETON( + "io_dispatcher", + TIODispatcherConfig, + TIODispatcherDynamicConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NPipes diff --git a/yt/yt/library/process/io_dispatcher.cpp b/yt/yt/library/process/io_dispatcher.cpp index c6e7e2f67f..96e1f88087 100644 --- a/yt/yt/library/process/io_dispatcher.cpp +++ b/yt/yt/library/process/io_dispatcher.cpp @@ -1,5 +1,7 @@ #include "io_dispatcher.h" +#include "config.h" + #include <yt/yt/core/concurrency/thread_pool_poller.h> #include <yt/yt/core/concurrency/poller.h> @@ -9,14 +11,6 @@ using namespace NConcurrency; //////////////////////////////////////////////////////////////////////////////// -void TIODispatcherConfig::Register(TRegistrar registrar) -{ - registrar.Parameter("thread_pool_polling_period", &TThis::ThreadPoolPollingPeriod) - .Default(TDuration::MilliSeconds(10)); -} - -//////////////////////////////////////////////////////////////////////////////// - TIODispatcher::TIODispatcher() : Poller_(BIND([] { return CreateThreadPoolPoller(1, "Pipes"); })) { } diff --git a/yt/yt/library/process/io_dispatcher.h b/yt/yt/library/process/io_dispatcher.h index 
32fd92f0ac..3c47bddf78 100644 --- a/yt/yt/library/process/io_dispatcher.h +++ b/yt/yt/library/process/io_dispatcher.h @@ -12,32 +12,15 @@ namespace NYT::NPipes { //////////////////////////////////////////////////////////////////////////////// -class TIODispatcherConfig - : public NYTree::TYsonStruct -{ -public: - TDuration ThreadPoolPollingPeriod; - - REGISTER_YSON_STRUCT(TIODispatcherConfig); - - static void Register(TRegistrar registrar); -}; - -DEFINE_REFCOUNTED_TYPE(TIODispatcherConfig) - -//////////////////////////////////////////////////////////////////////////////// - class TIODispatcher { public: - ~TIODispatcher(); - static TIODispatcher* Get(); + ~TIODispatcher(); void Configure(const TIODispatcherConfigPtr& config); IInvokerPtr GetInvoker(); - NConcurrency::IPollerPtr GetPoller(); private: diff --git a/yt/yt/library/process/public.h b/yt/yt/library/process/public.h index fd4193f80d..76cfff1340 100644 --- a/yt/yt/library/process/public.h +++ b/yt/yt/library/process/public.h @@ -1,6 +1,8 @@ #pragma once -#include <yt/yt/core/misc/public.h> +#include <yt/yt/core/misc/configurable_singleton_decl.h> + +#include <library/cpp/yt/memory/ref_counted.h> namespace NYT::NPipes { @@ -10,6 +12,10 @@ DECLARE_REFCOUNTED_CLASS(TNamedPipe) DECLARE_REFCOUNTED_CLASS(TNamedPipeConfig) DECLARE_REFCOUNTED_CLASS(TIODispatcherConfig) +DECLARE_REFCOUNTED_CLASS(TIODispatcherDynamicConfig) + + +YT_DECLARE_RECONFIGURABLE_SINGLETON(TIODispatcherConfig, TIODispatcherDynamicConfig); //////////////////////////////////////////////////////////////////////////////// diff --git a/yt/yt/library/process/unittests/pipes_ut.cpp b/yt/yt/library/process/unittests/pipes_ut.cpp deleted file mode 100644 index f0c371dd30..0000000000 --- a/yt/yt/library/process/unittests/pipes_ut.cpp +++ /dev/null @@ -1,432 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/core/concurrency/action_queue.h> -#include <yt/yt/core/concurrency/scheduler.h> - -#include <yt/yt/core/misc/blob.h> 
-#include <yt/yt/core/misc/proc.h> - -#include <yt/yt/core/net/connection.h> - -#include <yt/yt/library/process/pipe.h> - -#include <random> - -namespace NYT::NPipes { - -//////////////////////////////////////////////////////////////////////////////// - -using namespace NConcurrency; -using namespace NNet; - -#ifndef _win_ - -//! NB: You can't set size smaller than that of a page. -constexpr int SmallPipeCapacity = 4096; - -TEST(TPipeIOHolder, CanInstantiate) -{ - auto pipe = TPipeFactory().Create(); - - auto readerHolder = pipe.CreateAsyncReader(); - auto writerHolder = pipe.CreateAsyncWriter(); - - readerHolder->Abort().Get(); - writerHolder->Abort().Get(); -} - -TEST(TPipeTest, PrematureEOF) -{ - auto pipe = TNamedPipe::Create("./namedpipe"); - auto reader = pipe->CreateAsyncReader(); - - auto buffer = TSharedMutableRef::Allocate(1024 * 1024); - EXPECT_THROW(reader->Read(buffer).WithTimeout(TDuration::Seconds(1)).Get().ValueOrThrow(), TErrorException); -} - -//////////////////////////////////////////////////////////////////////////////// - -TBlob ReadAll(IConnectionReaderPtr reader, bool useWaitFor) -{ - auto buffer = TSharedMutableRef::Allocate(1_MB, {.InitializeStorage = false}); - auto whole = TBlob(GetRefCountedTypeCookie<TDefaultBlobTag>()); - - while (true) { - TErrorOr<size_t> result; - auto future = reader->Read(buffer); - if (useWaitFor) { - result = WaitFor(future); - } else { - result = future.Get(); - } - - if (result.ValueOrThrow() == 0) { - break; - } - - whole.Append(buffer.Begin(), result.Value()); - } - return whole; -} - -void WriteAll(IConnectionWriterPtr writer, const char* data, size_t size, size_t blockSize) -{ - while (size > 0) { - const size_t currentBlockSize = std::min(blockSize, size); - auto buffer = TSharedRef(data, currentBlockSize, nullptr); - auto error = WaitFor(writer->Write(buffer)); - THROW_ERROR_EXCEPTION_IF_FAILED(error); - size -= currentBlockSize; - data += currentBlockSize; - } - - { - auto error = 
WaitFor(writer->Close()); - THROW_ERROR_EXCEPTION_IF_FAILED(error); - } -} - -TEST(TAsyncWriterTest, AsyncCloseFail) -{ - auto pipe = TPipeFactory().Create(); - - auto reader = pipe.CreateAsyncReader(); - auto writer = pipe.CreateAsyncWriter(); - - auto queue = New<NConcurrency::TActionQueue>(); - auto readFromPipe = - BIND(&ReadAll, reader, false) - .AsyncVia(queue->GetInvoker()) - .Run(); - - int length = 200*1024; - auto buffer = TSharedMutableRef::Allocate(length); - ::memset(buffer.Begin(), 'a', buffer.Size()); - - auto writeResult = writer->Write(buffer).Get(); - - EXPECT_TRUE(writeResult.IsOK()) - << ToString(writeResult); - - auto error = writer->Close(); - - auto readResult = readFromPipe.Get(); - ASSERT_TRUE(readResult.IsOK()) - << ToString(readResult); - - auto closeStatus = error.Get(); -} - -TEST(TAsyncWriterTest, WriteFailed) -{ - auto pipe = TPipeFactory().Create(); - auto reader = pipe.CreateAsyncReader(); - auto writer = pipe.CreateAsyncWriter(); - - int length = 200*1024; - auto buffer = TSharedMutableRef::Allocate(length); - ::memset(buffer.Begin(), 'a', buffer.Size()); - - auto asyncWriteResult = writer->Write(buffer); - YT_UNUSED_FUTURE(reader->Abort()); - - EXPECT_FALSE(asyncWriteResult.Get().IsOK()) - << ToString(asyncWriteResult.Get()); -} - -//////////////////////////////////////////////////////////////////////////////// - -class TPipeReadWriteTest - : public ::testing::Test -{ -protected: - void SetUp() override - { - auto pipe = TPipeFactory().Create(); - - Reader = pipe.CreateAsyncReader(); - Writer = pipe.CreateAsyncWriter(); - } - - void TearDown() override - { } - - IConnectionReaderPtr Reader; - IConnectionWriterPtr Writer; -}; - -class TNamedPipeReadWriteTest - : public ::testing::Test -{ -protected: - void SetUp() override - { - auto pipe = TNamedPipe::Create("./namedpipe"); - Reader = pipe->CreateAsyncReader(); - Writer = pipe->CreateAsyncWriter(); - } - - void TearDown() override - { } - - void SetUpWithCapacity(int capacity) - { 
- auto pipe = TNamedPipe::Create("./namedpipewcap", 0660, capacity); - Reader = pipe->CreateAsyncReader(); - Writer = pipe->CreateAsyncWriter(); - } - - void SetUpWithDeliveryFence() - { - auto pipe = TNamedPipe::Create("./namedpipewcap", 0660); - Reader = pipe->CreateAsyncReader(); - Writer = pipe->CreateAsyncWriter(/*useDeliveryFence*/ true); - } - - IConnectionReaderPtr Reader; - IConnectionWriterPtr Writer; -}; - -TEST_F(TPipeReadWriteTest, ReadSomethingSpin) -{ - TString message("Hello pipe!\n"); - auto buffer = TSharedRef::FromString(message); - Writer->Write(buffer).Get().ThrowOnError(); - Writer->Close().Get().ThrowOnError(); - - auto data = TSharedMutableRef::Allocate(1); - auto whole = TBlob(GetRefCountedTypeCookie<TDefaultBlobTag>()); - - while (true) { - auto result = Reader->Read(data).Get(); - if (result.ValueOrThrow() == 0) { - break; - } - whole.Append(data.Begin(), result.Value()); - } - - EXPECT_EQ(message, TString(whole.Begin(), whole.End())); -} - -TEST_F(TNamedPipeReadWriteTest, ReadSomethingSpin) -{ - TString message("Hello pipe!\n"); - auto buffer = TSharedRef::FromString(message); - - Writer->Write(buffer).Get().ThrowOnError(); - Writer->Close().Get().ThrowOnError(); - - auto data = TSharedMutableRef::Allocate(1); - auto whole = TBlob(GetRefCountedTypeCookie<TDefaultBlobTag>()); - - while (true) { - auto result = Reader->Read(data).Get(); - if (result.ValueOrThrow() == 0) { - break; - } - whole.Append(data.Begin(), result.Value()); - } - EXPECT_EQ(message, TString(whole.Begin(), whole.End())); -} - - -TEST_F(TPipeReadWriteTest, ReadSomethingWait) -{ - TString message("Hello pipe!\n"); - auto buffer = TSharedRef::FromString(message); - EXPECT_TRUE(Writer->Write(buffer).Get().IsOK()); - WaitFor(Writer->Close()) - .ThrowOnError(); - auto whole = ReadAll(Reader, false); - EXPECT_EQ(message, TString(whole.Begin(), whole.End())); -} - -TEST_F(TNamedPipeReadWriteTest, ReadSomethingWait) -{ - TString message("Hello pipe!\n"); - auto buffer = 
TSharedRef::FromString(message); - EXPECT_TRUE(Writer->Write(buffer).Get().IsOK()); - WaitFor(Writer->Close()) - .ThrowOnError(); - auto whole = ReadAll(Reader, false); - EXPECT_EQ(message, TString(whole.Begin(), whole.End())); -} - -TEST_F(TPipeReadWriteTest, ReadWrite) -{ - TString text("Hello cruel world!\n"); - auto buffer = TSharedRef::FromString(text); - Writer->Write(buffer).Get(); - auto errorsOnClose = Writer->Close(); - - auto textFromPipe = ReadAll(Reader, false); - - auto error = errorsOnClose.Get(); - EXPECT_TRUE(error.IsOK()) << error.GetMessage(); - EXPECT_EQ(text, TString(textFromPipe.Begin(), textFromPipe.End())); -} - -TEST_F(TNamedPipeReadWriteTest, ReadWrite) -{ - TString text("Hello cruel world!\n"); - auto buffer = TSharedRef::FromString(text); - Writer->Write(buffer).Get(); - auto errorsOnClose = Writer->Close(); - - auto textFromPipe = ReadAll(Reader, false); - - auto error = errorsOnClose.Get(); - EXPECT_TRUE(error.IsOK()) << error.GetMessage(); - EXPECT_EQ(text, TString(textFromPipe.Begin(), textFromPipe.End())); -} - -TEST_F(TNamedPipeReadWriteTest, CapacityJustWorks) -{ - SetUpWithCapacity(SmallPipeCapacity); - - TString text(5, 'a'); - text.push_back('\n'); - auto writeBuffer = TSharedRef::FromString(text); - - auto writeFuture = Writer->Write(writeBuffer); - EXPECT_TRUE(writeFuture.Get().IsOK()); - - auto readBuffer = TSharedMutableRef::Allocate(5000, {.InitializeStorage = false}); - auto readResult = Reader->Read(readBuffer).Get(); - - EXPECT_EQ(text, TString(readBuffer.Begin(), readResult.Value())); -} - -TEST_F(TNamedPipeReadWriteTest, CapacityOverflow) -{ - SetUpWithCapacity(SmallPipeCapacity); - auto readerQueue = New<NConcurrency::TActionQueue>("Reader"); - - TString text(5000, 'a'); - text.push_back('\n'); - auto writeBuffer = TSharedRef::FromString(text); - auto writeFuture = Writer->Write(writeBuffer); - - TDelayedExecutor::WaitForDuration(TDuration::Seconds(1)); - EXPECT_FALSE(writeFuture.IsSet()); - - auto readFuture = 
BIND([&] { - auto readBuffer = TSharedMutableRef::Allocate(6000, {.InitializeStorage = false}); - auto readResult = Reader->Read(readBuffer).Get(); - - EXPECT_TRUE(readResult.IsOK()); - EXPECT_EQ(text.substr(0, 4096), TString(readBuffer.Begin(), readResult.Value())); - }) - .AsyncVia(readerQueue->GetInvoker()) - .Run(); - - EXPECT_TRUE(readFuture.Get().IsOK()); - EXPECT_TRUE(writeFuture.Get().IsOK()); -} - -TEST_F(TNamedPipeReadWriteTest, CapacityDontDiscardSurplus) -{ - SetUpWithCapacity(SmallPipeCapacity); - auto readerQueue = New<NConcurrency::TActionQueue>("Reader"); - auto writerQueue = New<NConcurrency::TActionQueue>("Writer"); - - TString text(5000, 'a'); - text.push_back('\n'); - - auto writeFuture = BIND(&WriteAll, Writer, text.data(), text.size(), text.size()) - .AsyncVia(writerQueue->GetInvoker()) - .Run(); - - TDelayedExecutor::WaitForDuration(TDuration::Seconds(1)); - EXPECT_FALSE(writeFuture.IsSet()); - - auto readFuture = BIND(&ReadAll, Reader, false) - .AsyncVia(readerQueue->GetInvoker()) - .Run(); - - auto readResult = readFuture.Get().ValueOrThrow(); - EXPECT_EQ(text, TString(readResult.Begin(), readResult.End())); - - EXPECT_TRUE(writeFuture.Get().IsOK()); -} - -#if defined(_linux_) - -TEST_F(TNamedPipeReadWriteTest, DeliveryFencedWriteJustWorks) -{ - SetUpWithDeliveryFence(); - - TString text("aabbb"); - auto writeBuffer = TSharedRef::FromString(text); - auto writeFuture = Writer->Write(writeBuffer); - - auto readBuffer = TSharedMutableRef::Allocate(2, {.InitializeStorage = false}); - auto readResult = Reader->Read(readBuffer).Get(); - EXPECT_EQ(TString("aa"), TString(readBuffer.Begin(), readResult.Value())); - - EXPECT_FALSE(writeFuture.IsSet()); - - readBuffer = TSharedMutableRef::Allocate(10, {.InitializeStorage = false}); - readResult = Reader->Read(readBuffer).Get(); - EXPECT_EQ(TString("bbb"), TString(readBuffer.Begin(), readResult.Value())); - - // Future is set only after the entire buffer is read. 
- EXPECT_TRUE(writeFuture.Get().IsOK()); -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// - -class TPipeBigReadWriteTest - : public TPipeReadWriteTest - , public ::testing::WithParamInterface<std::pair<size_t, size_t>> -{ }; - -TEST_P(TPipeBigReadWriteTest, RealReadWrite) -{ - size_t dataSize, blockSize; - std::tie(dataSize, blockSize) = GetParam(); - - auto queue = New<NConcurrency::TActionQueue>(); - - std::vector<char> data(dataSize, 'a'); - - YT_UNUSED_FUTURE(BIND([&] { - auto dice = std::bind( - std::uniform_int_distribution<int>(0, 127), - std::default_random_engine()); - for (size_t i = 0; i < data.size(); ++i) { - data[i] = dice(); - } - }) - .AsyncVia(queue->GetInvoker()).Run()); - - auto writeError = BIND(&WriteAll, Writer, data.data(), data.size(), blockSize) - .AsyncVia(queue->GetInvoker()) - .Run(); - auto readFromPipe = BIND(&ReadAll, Reader, true) - .AsyncVia(queue->GetInvoker()) - .Run(); - - auto textFromPipe = readFromPipe.Get().ValueOrThrow(); - EXPECT_EQ(data.size(), textFromPipe.Size()); - auto result = std::mismatch(textFromPipe.Begin(), textFromPipe.End(), data.begin()); - EXPECT_TRUE(std::equal(textFromPipe.Begin(), textFromPipe.End(), data.begin())) << - (result.first - textFromPipe.Begin()) << " " << (int)(*result.first); -} - -INSTANTIATE_TEST_SUITE_P( - ValueParametrized, - TPipeBigReadWriteTest, - ::testing::Values( - std::pair(2000 * 4096, 4096), - std::pair(100 * 4096, 10000), - std::pair(100 * 4096, 100), - std::pair(100, 4096))); - -#endif - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT::NPipes diff --git a/yt/yt/library/process/unittests/process_ut.cpp b/yt/yt/library/process/unittests/process_ut.cpp deleted file mode 100644 index 61508c487f..0000000000 --- a/yt/yt/library/process/unittests/process_ut.cpp +++ /dev/null @@ -1,242 +0,0 @@ -#include <yt/yt/library/process/process.h> - -#include 
<yt/yt/core/test_framework/framework.h> - -#include <yt/yt/core/actions/bind.h> - -#include <yt/yt/core/concurrency/action_queue.h> -#include <yt/yt/core/concurrency/delayed_executor.h> -#include <yt/yt/core/concurrency/scheduler.h> - -#include <yt/yt/core/net/connection.h> - -#include <library/cpp/yt/system/handle_eintr.h> - -namespace NYT { -namespace { - -using namespace NConcurrency; - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(_unix_) and not defined(_asan_enabled_) - -TEST(TProcessTest, Basic) -{ - auto p = New<TSimpleProcess>("/bin/ls"); - TFuture<void> finished; - - ASSERT_NO_THROW(finished = p->Spawn()); - ASSERT_TRUE(p->IsStarted()); - auto error = WaitFor(finished); - EXPECT_TRUE(error.IsOK()) << ToString(error); - EXPECT_TRUE(p->IsFinished()); -} - -// NB: We cannot rely on 'ls' and 'sleep' in arcadia tests. -TEST(TProcessTest, RunFromPathEnv) -{ - auto p = New<TSimpleProcess>("/bin/ls", false); - TFuture<void> finished; - - ASSERT_NO_THROW(finished = p->Spawn()); - ASSERT_TRUE(p->IsStarted()); - auto error = WaitFor(finished); - EXPECT_TRUE(error.IsOK()) << ToString(error); - EXPECT_TRUE(p->IsFinished()); -} - -TEST(TProcessTest, PollDuration) -{ - auto p = New<TSimpleProcess>("/bin/sleep", true, TDuration::MilliSeconds(1)); - p->AddArgument("0.1"); - - auto error = WaitFor(p->Spawn()); - EXPECT_TRUE(error.IsOK()) << ToString(error); - EXPECT_TRUE(p->IsFinished()); -} - -TEST(TProcessTest, InvalidPath) -{ - auto p = New<TSimpleProcess>("/some/bad/path/binary"); - - TFuture<void> finished; - ASSERT_NO_THROW(finished = p->Spawn()); - ASSERT_FALSE(p->IsStarted()); - auto error = WaitFor(finished); - EXPECT_FALSE(p->IsFinished()); - EXPECT_FALSE(error.IsOK()); -} - -TEST(TProcessTest, StdOut) -{ - auto p = New<TSimpleProcess>("/bin/date"); - - auto outStream = p->GetStdOutReader(); - TFuture<void> finished; - ASSERT_NO_THROW(finished = p->Spawn()); - ASSERT_TRUE(p->IsStarted()); - auto error = 
WaitFor(finished); - EXPECT_TRUE(error.IsOK()) << ToString(error); - EXPECT_TRUE(p->IsFinished()); - - auto buffer = TSharedMutableRef::Allocate(4_KB, {.InitializeStorage = false}); - auto future = outStream->Read(buffer); - auto result = WaitFor(future); - size_t sz = result.ValueOrThrow(); - EXPECT_TRUE(sz > 0); -} - -TEST(TSimpleProcess, GetCommandLine1) -{ - auto p = New<TSimpleProcess>("/bin/bash"); - EXPECT_EQ("/bin/bash", p->GetCommandLine()); - p->AddArgument("-c"); - EXPECT_EQ("/bin/bash -c", p->GetCommandLine()); - p->AddArgument("exit 0"); - EXPECT_EQ("/bin/bash -c \"exit 0\"", p->GetCommandLine()); -} - -TEST(TProcessBase, GetCommandLine2) -{ - auto p = New<TSimpleProcess>("/bin/bash"); - EXPECT_EQ("/bin/bash", p->GetCommandLine()); - p->AddArgument("-c"); - EXPECT_EQ("/bin/bash -c", p->GetCommandLine()); - p->AddArgument("\"quoted\""); - EXPECT_EQ("/bin/bash -c \"\\\"quoted\\\"\"", p->GetCommandLine()); -} - -TEST(TProcessTest, ProcessReturnCode0) -{ - auto p = New<TSimpleProcess>("/bin/bash"); - p->AddArgument("-c"); - p->AddArgument("exit 0"); - - TFuture<void> finished; - ASSERT_NO_THROW(finished = p->Spawn()); - ASSERT_TRUE(p->IsStarted()); - auto error = WaitFor(finished); - EXPECT_TRUE(error.IsOK()) << ToString(error); - EXPECT_TRUE(p->IsFinished()); -} - -TEST(TProcessTest, ProcessReturnCode123) -{ - auto p = New<TSimpleProcess>("/bin/bash"); - p->AddArgument("-c"); - p->AddArgument("exit 123"); - - TFuture<void> finished; - ASSERT_NO_THROW(finished = p->Spawn()); - ASSERT_TRUE(p->IsStarted()); - auto error = WaitFor(finished); - EXPECT_EQ(EProcessErrorCode::NonZeroExitCode, error.GetCode()); - EXPECT_EQ(123, error.Attributes().Get<int>("exit_code")); - EXPECT_TRUE(p->IsFinished()); -} - -TEST(TProcessTest, Params1) -{ - auto p = New<TSimpleProcess>("/bin/bash"); - p->AddArgument("-c"); - p->AddArgument("if test 3 -gt 1; then exit 7; fi"); - - auto error = WaitFor(p->Spawn()); - EXPECT_FALSE(error.IsOK()); - EXPECT_TRUE(p->IsFinished()); -} - 
-TEST(TProcessTest, Params2) -{ - auto p = New<TSimpleProcess>("/bin/bash"); - p->AddArgument("-c"); - p->AddArgument("if test 1 -gt 3; then exit 7; fi"); - - auto error = WaitFor(p->Spawn()); - EXPECT_TRUE(error.IsOK()) << ToString(error); - EXPECT_TRUE(p->IsFinished()); -} - -TEST(TProcessTest, InheritEnvironment) -{ - const char* name = "SPAWN_TEST_ENV_VAR"; - const char* value = "42"; - setenv(name, value, 1); - - auto p = New<TSimpleProcess>("/bin/bash"); - p->AddArgument("-c"); - p->AddArgument("if test $SPAWN_TEST_ENV_VAR = 42; then exit 7; fi"); - - auto error = WaitFor(p->Spawn()); - EXPECT_FALSE(error.IsOK()); - EXPECT_TRUE(p->IsFinished()); - - unsetenv(name); -} - -TEST(TProcessTest, Kill) -{ - auto p = New<TSimpleProcess>("/bin/sleep"); - p->AddArgument("5"); - - auto finished = p->Spawn(); - - NConcurrency::TDelayedExecutor::Submit( - BIND([&] { - p->Kill(SIGKILL); - }), - TDuration::MilliSeconds(100)); - - auto error = WaitFor(finished); - EXPECT_FALSE(error.IsOK()); - EXPECT_TRUE(p->IsFinished()); -} - -TEST(TProcessTest, KillFinished) -{ - auto p = New<TSimpleProcess>("/bin/bash"); - p->AddArgument("-c"); - p->AddArgument("true"); - - auto finished = p->Spawn(); - - auto error = WaitFor(finished); - EXPECT_TRUE(error.IsOK()); - - p->Kill(SIGKILL); -} - -TEST(TProcessTest, KillZombie) -{ - auto p = New<TSimpleProcess>("/bin/bash"); - p->AddArgument("-c"); - p->AddArgument("/bin/sleep 1; /bin/true"); - - auto finished = p->Spawn(); - - siginfo_t infop; - auto res = HandleEintr(::waitid, P_PID, p->GetProcessId(), &infop, WEXITED | WNOWAIT); - - if (res == 0) { - EXPECT_EQ(p->GetProcessId(), infop.si_pid); - } else { - // NB(arkady-e1ppa): Sometimes child process will run - // just fine and yet will be invisible to waitid - // on some platforms. - // Cause of this is still unknown. 
- EXPECT_EQ(errno, ECHILD); - } - - p->Kill(SIGKILL); - auto error = WaitFor(finished); - EXPECT_TRUE(error.IsOK()) - << ToString(error); -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT diff --git a/yt/yt/library/process/unittests/subprocess_ut.cpp b/yt/yt/library/process/unittests/subprocess_ut.cpp deleted file mode 100644 index ff7cf7aa08..0000000000 --- a/yt/yt/library/process/unittests/subprocess_ut.cpp +++ /dev/null @@ -1,111 +0,0 @@ -#include <yt/yt/core/test_framework/framework.h> - -#include <yt/yt/core/actions/future.h> - -#include <yt/yt/core/concurrency/action_queue.h> - -#include <yt/yt/library/process/subprocess.h> - -namespace NYT { -namespace { - -using namespace NConcurrency; - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(_unix_) and not defined(_asan_enabled_) - -TEST(TSubprocessTest, Basic) -{ - TSubprocess subprocess("/bin/bash"); - - subprocess.AddArgument("-c"); - subprocess.AddArgument("true"); - - auto result = subprocess.Execute(); - EXPECT_TRUE(result.Status.IsOK()); -} - -TEST(TSubprocessTest, PipeOutput) -{ - TSubprocess subprocess("/bin/echo"); - - subprocess.AddArgument("hello"); - - auto result = subprocess.Execute(); - EXPECT_TRUE(result.Status.IsOK()); - TString output(result.Output.Begin(), result.Output.End()); - EXPECT_TRUE(output == "hello\n") << output; -} - -TEST(TSubprocessTest, PipeStdin) -{ - auto queue = New<TActionQueue>(); - - BIND([] { - TSubprocess subprocess("/bin/cat"); - subprocess.AddArgument("-"); - - auto input = TString("TEST test TEST"); - auto inputRef = TSharedRef::FromString(input); - auto result = subprocess.Execute(inputRef); - EXPECT_TRUE(result.Status.IsOK()); - - TString output(result.Output.Begin(), result.Output.End()); - EXPECT_EQ(input, output); - }).AsyncVia(queue->GetInvoker()).Run().Get().ThrowOnError(); -} - -TEST(TSubprocessTest, PipeBigOutput) -{ - auto 
queue = New<TActionQueue>(); - - auto result = BIND([] { - TSubprocess subprocess("/bin/bash"); - - subprocess.AddArgument("-c"); - subprocess.AddArgument("for i in `/usr/bin/seq 100000`; do echo hello; done; echo world"); - - auto result = subprocess.Execute(); - return result.Status.IsOK(); - }).AsyncVia(queue->GetInvoker()).Run().Get().Value(); - - EXPECT_TRUE(result); -} - -TEST(TSubprocessTest, PipeBigError) -{ - auto queue = New<TActionQueue>(); - - auto result = BIND([] { - TSubprocess subprocess("/bin/bash"); - - subprocess.AddArgument("-c"); - subprocess.AddArgument("for i in `/usr/bin/seq 100000`; do echo hello 1>&2; done; echo world"); - - auto result = subprocess.Execute(); - return result; - }).AsyncVia(queue->GetInvoker()).Run().Get().Value(); - - EXPECT_TRUE(result.Status.IsOK()); - EXPECT_EQ(6*100000, std::ssize(result.Error)); -} - -TEST(TSubprocessTest, BinaryNotFound) -{ - auto queue = New<TActionQueue>(); - - auto result = BIND([] { - TSubprocess subprocess("does-not-exist"); - return subprocess.Execute(); - }).AsyncVia(queue->GetInvoker()).Run().Get().Value(); - - EXPECT_FALSE(result.Status.IsOK()); -} - -#endif - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace -} // namespace NYT diff --git a/yt/yt/library/process/unittests/ya.make b/yt/yt/library/process/unittests/ya.make deleted file mode 100644 index 149d9eee1f..0000000000 --- a/yt/yt/library/process/unittests/ya.make +++ /dev/null @@ -1,22 +0,0 @@ -GTEST(unittester-library-process) - -INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) - -SRCS( - pipes_ut.cpp - process_ut.cpp - subprocess_ut.cpp -) - -INCLUDE(${ARCADIA_ROOT}/yt/opensource.inc) - -PEERDIR( - yt/yt/build - yt/yt/core - yt/yt/core/test_framework - yt/yt/library/process -) - -SIZE(MEDIUM) - -END() diff --git a/yt/yt/library/process/ya.make b/yt/yt/library/process/ya.make index 79763c7267..6b3ea41ca2 100644 --- a/yt/yt/library/process/ya.make +++ b/yt/yt/library/process/ya.make @@ 
-3,6 +3,8 @@ LIBRARY() INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) SRCS( + config.cpp + GLOBAL configure_io_dispatcher.cpp io_dispatcher.cpp pipe.cpp process.cpp diff --git a/yt/yt/library/profiling/resource_tracker/configure_resource_tracker.cpp b/yt/yt/library/profiling/resource_tracker/configure_resource_tracker.cpp new file mode 100644 index 0000000000..4b25b3f6f4 --- /dev/null +++ b/yt/yt/library/profiling/resource_tracker/configure_resource_tracker.cpp @@ -0,0 +1,28 @@ +#include "resource_tracker.h" +#include "config.h" + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT::NProfiling { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TResourceTrackerConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TResourceTrackerConfigPtr& config) +{ + TResourceTracker::Configure(config); +} + +YT_DEFINE_CONFIGURABLE_SINGLETON( + "resource_tracker", + TResourceTrackerConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NProfiling diff --git a/yt/yt/library/profiling/resource_tracker/public.h b/yt/yt/library/profiling/resource_tracker/public.h index 36786770fa..febd29d563 100644 --- a/yt/yt/library/profiling/resource_tracker/public.h +++ b/yt/yt/library/profiling/resource_tracker/public.h @@ -1,6 +1,8 @@ #pragma once -#include <yt/yt/core/misc/public.h> +#include <yt/yt/core/misc/configurable_singleton_decl.h> + +#include <library/cpp/yt/memory/ref_counted.h> namespace NYT::NProfiling { @@ -8,6 +10,8 @@ namespace NYT::NProfiling { DECLARE_REFCOUNTED_STRUCT(TResourceTrackerConfig) +YT_DECLARE_CONFIGURABLE_SINGLETON(TResourceTrackerConfig); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NProfiling diff --git a/yt/yt/library/profiling/resource_tracker/resource_tracker.cpp 
b/yt/yt/library/profiling/resource_tracker/resource_tracker.cpp index 1c45abfbbb..138f89434a 100644 --- a/yt/yt/library/profiling/resource_tracker/resource_tracker.cpp +++ b/yt/yt/library/profiling/resource_tracker/resource_tracker.cpp @@ -33,7 +33,6 @@ namespace NYT::NProfiling { //////////////////////////////////////////////////////////////////////////////// using namespace NYPath; -using namespace NYTree; using namespace NProfiling; using namespace NConcurrency; diff --git a/yt/yt/library/profiling/resource_tracker/ya.make b/yt/yt/library/profiling/resource_tracker/ya.make index 62287a3fb7..009f21eaa4 100644 --- a/yt/yt/library/profiling/resource_tracker/ya.make +++ b/yt/yt/library/profiling/resource_tracker/ya.make @@ -4,6 +4,7 @@ INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) SRCS( config.cpp + GLOBAL configure_resource_tracker.cpp resource_tracker.cpp ) diff --git a/yt/yt/library/profiling/solomon/config.cpp b/yt/yt/library/profiling/solomon/config.cpp index e714548804..7b9666ab2e 100644 --- a/yt/yt/library/profiling/solomon/config.cpp +++ b/yt/yt/library/profiling/solomon/config.cpp @@ -45,6 +45,9 @@ void TSolomonExporterConfig::Register(TRegistrar registrar) registrar.Parameter("convert_counters_to_delta_gauge", &TThis::ConvertCountersToDeltaGauge) .Default(false); + registrar.Parameter("enable_histogram_compat", &TThis::EnableHistogramCompat) + .Default(false); + registrar.Parameter("export_summary", &TThis::ExportSummary) .Default(false); registrar.Parameter("export_summary_as_max", &TThis::ExportSummaryAsMax) diff --git a/yt/yt/library/profiling/solomon/config.h b/yt/yt/library/profiling/solomon/config.h index 42b5b4548e..0ff84c96d2 100644 --- a/yt/yt/library/profiling/solomon/config.h +++ b/yt/yt/library/profiling/solomon/config.h @@ -41,6 +41,7 @@ struct TSolomonExporterConfig bool ConvertCountersToRateForSolomon; bool RenameConvertedCounters; bool ConvertCountersToDeltaGauge; + bool EnableHistogramCompat; bool ExportSummary; bool ExportSummaryAsMax; diff 
--git a/yt/yt/library/profiling/solomon/exporter.cpp b/yt/yt/library/profiling/solomon/exporter.cpp index aee542660a..c06a7e4ac7 100644 --- a/yt/yt/library/profiling/solomon/exporter.cpp +++ b/yt/yt/library/profiling/solomon/exporter.cpp @@ -636,6 +636,9 @@ void TSolomonExporter::DoHandleShard( if (Config_->ConvertCountersToDeltaGauge && outputEncodingContext.IsSolomonPull) { options.ConvertCountersToDeltaGauge = true; } + if (Config_->EnableHistogramCompat && outputEncodingContext.IsSolomonPull) { + options.EnableHistogramCompat = true; + } options.EnableSolomonAggregationWorkaround = outputEncodingContext.IsSolomonPull; options.Times = readWindow; diff --git a/yt/yt/library/profiling/solomon/helpers.cpp b/yt/yt/library/profiling/solomon/helpers.cpp index 056daaa330..7f4be734a5 100644 --- a/yt/yt/library/profiling/solomon/helpers.cpp +++ b/yt/yt/library/profiling/solomon/helpers.cpp @@ -1,8 +1,13 @@ #include "helpers.h" +#include "percpu.h" #include "private.h" +#include "producer.h" +#include "sensor_set.h" #include <yt/yt/core/http/http.h> +#include <yt/yt/core/misc/ref_counted_tracker.h> + #include <library/cpp/monlib/encode/json/json.h> #include <library/cpp/monlib/encode/spack/spack_v1.h> #include <library/cpp/monlib/encode/prometheus/prometheus.h> @@ -73,6 +78,38 @@ TOutputEncodingContext CreateOutputEncodingContextFromHeaders(const THeadersPtr& return context; } +i64 GetCountersBytesAlive() +{ + auto* tracker = TRefCountedTracker::Get(); + i64 usage = 0; + + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TSimpleCounter>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TPerCpuCounter>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TCounterState>()); + + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TSimpleTimeCounter>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TPerCpuTimeCounter>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TTimeCounterState>()); + + usage += 
tracker->GetBytesAlive(GetRefCountedTypeKey<TSimpleGauge>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TPerCpuGauge>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TGaugeState>()); + + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TSimpleSummary<double>>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TPerCpuSummary<double>>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TSimpleSummary<TDuration>>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TPerCpuSummary<TDuration>>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TSummaryState>()); + + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TProducerState>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<THistogram>()); + + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<TTimerSummaryState>()); + usage += tracker->GetBytesAlive(GetRefCountedTypeKey<THistogramState>()); + + return usage; +} + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NProfiling diff --git a/yt/yt/library/profiling/solomon/helpers.h b/yt/yt/library/profiling/solomon/helpers.h index 4713fd20b7..fd2c841b3f 100644 --- a/yt/yt/library/profiling/solomon/helpers.h +++ b/yt/yt/library/profiling/solomon/helpers.h @@ -27,6 +27,8 @@ void FillResponseHeaders(const TOutputEncodingContext& outputEncodingContext, co //! Creates output encoder according to request headers. 
TOutputEncodingContext CreateOutputEncodingContextFromHeaders(const NHttp::THeadersPtr& headers); +i64 GetCountersBytesAlive(); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NProfiling diff --git a/yt/yt/library/program/config.cpp b/yt/yt/library/program/config.cpp index 371c7ced70..762feea0f2 100644 --- a/yt/yt/library/program/config.cpp +++ b/yt/yt/library/program/config.cpp @@ -18,67 +18,6 @@ void THeapProfilerConfig::Register(TRegistrar registrar) //////////////////////////////////////////////////////////////////////////////// -void TSingletonsConfig::Register(TRegistrar registrar) -{ - registrar.Parameter("fiber_manager", &TThis::FiberManager) - .DefaultNew(); - registrar.Parameter("address_resolver", &TThis::AddressResolver) - .DefaultNew(); - registrar.Parameter("tcp_dispatcher", &TThis::TcpDispatcher) - .DefaultNew(); - registrar.Parameter("io_dispatcher", &TThis::IODispatcher) - .DefaultNew(); - registrar.Parameter("rpc_dispatcher", &TThis::RpcDispatcher) - .DefaultNew(); - registrar.Parameter("grpc_dispatcher", &TThis::GrpcDispatcher) - .DefaultNew(); - registrar.Parameter("yp_service_discovery", &TThis::YPServiceDiscovery) - .DefaultNew(); - registrar.Parameter("logging", &TThis::Logging) - .DefaultCtor([] { return NLogging::TLogManagerConfig::CreateDefault(); }) - .ResetOnLoad(); - registrar.Parameter("jaeger", &TThis::Jaeger) - .DefaultNew(); - registrar.Parameter("tcmalloc", &TThis::TCMalloc) - .DefaultNew(); - registrar.Parameter("stockpile", &TThis::Stockpile) - .DefaultNew(); - registrar.Parameter("enable_ref_counted_tracker_profiling", &TThis::EnableRefCountedTrackerProfiling) - .Default(true); - registrar.Parameter("resource_tracker", &TThis::ResourceTracker) - .DefaultNew(); - registrar.Parameter("heap_profiler", &TThis::HeapProfiler) - .DefaultNew(); - registrar.Parameter("protobuf_interop", &TThis::ProtobufInterop) - .DefaultNew(); -} - 
-//////////////////////////////////////////////////////////////////////////////// - -void TSingletonsDynamicConfig::Register(TRegistrar registrar) -{ - registrar.Parameter("fiber_manager", &TThis::FiberManager) - .DefaultNew(); - registrar.Parameter("tcp_dispatcher", &TThis::TcpDispatcher) - .DefaultNew(); - registrar.Parameter("io_dispatcher", &TThis::IODispatcher) - .Optional(); - registrar.Parameter("rpc_dispatcher", &TThis::RpcDispatcher) - .DefaultNew(); - registrar.Parameter("logging", &TThis::Logging) - .DefaultNew(); - registrar.Parameter("jaeger", &TThis::Jaeger) - .DefaultNew(); - registrar.Parameter("tcmalloc", &TThis::TCMalloc) - .Optional(); - registrar.Parameter("stockpile", &TThis::Stockpile) - .DefaultNew(); - registrar.Parameter("protobuf_interop", &TThis::ProtobufInterop) - .DefaultNew(); -} - -//////////////////////////////////////////////////////////////////////////////// - void WarnForUnrecognizedOptionsImpl( const NLogging::TLogger& logger, const IMapNodePtr& unrecognized) diff --git a/yt/yt/library/program/config.h b/yt/yt/library/program/config.h index 1b6e0ffd64..38f6eb65ee 100644 --- a/yt/yt/library/program/config.h +++ b/yt/yt/library/program/config.h @@ -2,32 +2,9 @@ #include "public.h" -#include <yt/yt/core/concurrency/config.h> - #include <yt/yt/core/ytree/yson_struct.h> -#include <yt/yt/core/net/config.h> - -#include <yt/yt/core/rpc/config.h> -#include <yt/yt/core/rpc/grpc/config.h> - -#include <yt/yt/core/bus/tcp/config.h> - -#include <yt/yt/core/logging/config.h> - -#include <yt/yt/core/service_discovery/yp/config.h> - -#include <yt/yt/core/yson/config.h> - -#include <yt/yt/library/process/io_dispatcher.h> - -#include <yt/yt/library/tracing/jaeger/tracer.h> - -#include <yt/yt/library/profiling/resource_tracker/config.h> - -#include <yt/yt/library/tcmalloc/config.h> - -#include <yt/yt/library/stockpile/config.h> +#include <yt/yt/core/misc/configurable_singleton_def.h> namespace NYT { @@ -53,58 +30,6 @@ 
DEFINE_REFCOUNTED_TYPE(THeapProfilerConfig) //////////////////////////////////////////////////////////////////////////////// -class TSingletonsConfig - : public virtual NYTree::TYsonStruct -{ -public: - NConcurrency::TFiberManagerConfigPtr FiberManager; - NNet::TAddressResolverConfigPtr AddressResolver; - NBus::TTcpDispatcherConfigPtr TcpDispatcher; - NPipes::TIODispatcherConfigPtr IODispatcher; - NRpc::TDispatcherConfigPtr RpcDispatcher; - NRpc::NGrpc::TDispatcherConfigPtr GrpcDispatcher; - NServiceDiscovery::NYP::TServiceDiscoveryConfigPtr YPServiceDiscovery; - NLogging::TLogManagerConfigPtr Logging; - NTracing::TJaegerTracerConfigPtr Jaeger; - NTCMalloc::TTCMallocConfigPtr TCMalloc; - TStockpileConfigPtr Stockpile; - bool EnableRefCountedTrackerProfiling; - NProfiling::TResourceTrackerConfigPtr ResourceTracker; - THeapProfilerConfigPtr HeapProfiler; - NYson::TProtobufInteropConfigPtr ProtobufInterop; - - REGISTER_YSON_STRUCT(TSingletonsConfig); - - static void Register(TRegistrar registrar); -}; - -DEFINE_REFCOUNTED_TYPE(TSingletonsConfig) - -//////////////////////////////////////////////////////////////////////////////// - -class TSingletonsDynamicConfig - : public virtual NYTree::TYsonStruct -{ -public: - NConcurrency::TFiberManagerDynamicConfigPtr FiberManager; - NBus::TTcpDispatcherDynamicConfigPtr TcpDispatcher; - NPipes::TIODispatcherConfigPtr IODispatcher; - NRpc::TDispatcherDynamicConfigPtr RpcDispatcher; - NLogging::TLogManagerDynamicConfigPtr Logging; - NTracing::TJaegerTracerDynamicConfigPtr Jaeger; - NTCMalloc::TTCMallocConfigPtr TCMalloc; - TStockpileDynamicConfigPtr Stockpile; - NYson::TProtobufInteropDynamicConfigPtr ProtobufInterop; - - REGISTER_YSON_STRUCT(TSingletonsDynamicConfig); - - static void Register(TRegistrar registrar); -}; - -DEFINE_REFCOUNTED_TYPE(TSingletonsDynamicConfig) - -//////////////////////////////////////////////////////////////////////////////// - // NB: These functions should not be called from bootstrap // config 
validator since logger is not set up yet. void WarnForUnrecognizedOptions( diff --git a/yt/yt/library/program/helpers.cpp b/yt/yt/library/program/helpers.cpp index b46bdd9786..1911b327b5 100644 --- a/yt/yt/library/program/helpers.cpp +++ b/yt/yt/library/program/helpers.cpp @@ -1,124 +1,37 @@ #include "helpers.h" #include "config.h" -#include "private.h" - -#include <yt/yt/core/misc/lazy_ptr.h> -#include <yt/yt/core/misc/ref_counted_tracker.h> -#include <yt/yt/core/misc/ref_counted_tracker_profiler.h> - -#include <yt/yt/core/bus/tcp/dispatcher.h> - -#include <yt/yt/core/concurrency/fiber_manager.h> - -#include <yt/yt/library/tracing/jaeger/tracer.h> #include <yt/yt/library/profiling/perf/event_counter_profiler.h> -#include <yt/yt/library/profiling/resource_tracker/resource_tracker.h> - -#include <yt/yt/library/tcmalloc/tcmalloc_manager.h> +#include <yt/yt/core/misc/ref_counted_tracker_profiler.h> #include <yt/yt/core/logging/log_manager.h> -#include <yt/yt/core/concurrency/execution_stack.h> -#include <yt/yt/core/concurrency/fiber_scheduler_thread.h> -#include <yt/yt/core/concurrency/periodic_executor.h> - #include <yt/yt/core/net/address.h> -#include <yt/yt/core/yson/protobuf_interop.h> - -#include <yt/yt/core/rpc/dispatcher.h> -#include <yt/yt/core/rpc/grpc/dispatcher.h> - -#include <yt/yt/core/service_discovery/yp/service_discovery.h> - -#include <library/cpp/yt/memory/atomic_intrusive_ptr.h> - -#include <util/string/split.h> -#include <util/system/thread.h> - -#include <mutex> -#include <thread> - namespace NYT { -using namespace NConcurrency; -using namespace NThreading; -using namespace NTCMalloc; - //////////////////////////////////////////////////////////////////////////////// void ConfigureSingletons(const TSingletonsConfigPtr& config) { - TFiberManager::Configure(config->FiberManager); + TSingletonManager::Configure(config); + // TODO(babenko): move to server program base NLogging::TLogManager::Get()->EnableReopenOnSighup(); - if 
(!NLogging::TLogManager::Get()->IsConfiguredFromEnv()) { - NLogging::TLogManager::Get()->Configure(config->Logging); - } - NNet::TAddressResolver::Get()->Configure(config->AddressResolver); // By default, server components must have a reasonable FQDN. // Failure to do so may result in issues like YT-4561. + // TODO(babenko): move to server program base NNet::TAddressResolver::Get()->EnsureLocalHostName(); - NBus::TTcpDispatcher::Get()->Configure(config->TcpDispatcher); - - NPipes::TIODispatcher::Get()->Configure(config->IODispatcher); - - NRpc::TDispatcher::Get()->Configure(config->RpcDispatcher); - - NRpc::NGrpc::TDispatcher::Get()->Configure(config->GrpcDispatcher); - - NRpc::TDispatcher::Get()->SetServiceDiscovery( - NServiceDiscovery::NYP::CreateServiceDiscovery(config->YPServiceDiscovery)); - - NTracing::SetGlobalTracer(New<NTracing::TJaegerTracer>(config->Jaeger)); - + // TODO(babenko): move to server program base NProfiling::EnablePerfEventCounterProfiling(); - - NTCMalloc::TTCMallocManager::Configure(config->TCMalloc); - - TStockpileManager::Reconfigure(*config->Stockpile); - - if (config->EnableRefCountedTrackerProfiling) { - EnableRefCountedTrackerProfiling(); - } - - NProfiling::TResourceTracker::Configure(config->ResourceTracker); - - NYson::SetProtobufInteropConfig(config->ProtobufInterop); } -void ReconfigureSingletons(const TSingletonsConfigPtr& config, const TSingletonsDynamicConfigPtr& dynamicConfig) +void ReconfigureSingletons(const TSingletonsDynamicConfigPtr& dynamicConfig) { - TFiberManager::Configure(config->FiberManager->ApplyDynamic(dynamicConfig->FiberManager)); - - if (!NLogging::TLogManager::Get()->IsConfiguredFromEnv()) { - NLogging::TLogManager::Get()->Configure( - config->Logging->ApplyDynamic(dynamicConfig->Logging), - /*sync*/ false); - } - - auto tracer = NTracing::GetGlobalTracer(); - if (auto jaeger = DynamicPointerCast<NTracing::TJaegerTracer>(tracer); jaeger) { - 
jaeger->Configure(config->Jaeger->ApplyDynamic(dynamicConfig->Jaeger)); - } - - NBus::TTcpDispatcher::Get()->Configure(config->TcpDispatcher->ApplyDynamic(dynamicConfig->TcpDispatcher)); - - NPipes::TIODispatcher::Get()->Configure(dynamicConfig->IODispatcher ? dynamicConfig->IODispatcher : config->IODispatcher); - - NRpc::TDispatcher::Get()->Configure(config->RpcDispatcher->ApplyDynamic(dynamicConfig->RpcDispatcher)); - - NTCMalloc::TTCMallocManager::Configure(dynamicConfig->TCMalloc - ? config->TCMalloc->ApplyDynamic(dynamicConfig->TCMalloc) - : config->TCMalloc); - - TStockpileManager::Reconfigure(*config->Stockpile->ApplyDynamic(dynamicConfig->Stockpile)); - - NYson::SetProtobufInteropConfig(config->ProtobufInterop->ApplyDynamic(dynamicConfig->ProtobufInterop)); + TSingletonManager::Reconfigure(dynamicConfig); } //////////////////////////////////////////////////////////////////////////////// diff --git a/yt/yt/library/program/helpers.h b/yt/yt/library/program/helpers.h index 7cbf696109..7d0842179d 100644 --- a/yt/yt/library/program/helpers.h +++ b/yt/yt/library/program/helpers.h @@ -7,9 +7,7 @@ namespace NYT { //////////////////////////////////////////////////////////////////////////////// void ConfigureSingletons(const TSingletonsConfigPtr& config); -void ReconfigureSingletons( - const TSingletonsConfigPtr& config, - const TSingletonsDynamicConfigPtr& dynamicConfig); +void ReconfigureSingletons(const TSingletonsDynamicConfigPtr& dynamicConfig); //////////////////////////////////////////////////////////////////////////////// diff --git a/yt/yt/library/program/program.cpp b/yt/yt/library/program/program.cpp index ff74660266..5c13a6d09c 100644 --- a/yt/yt/library/program/program.cpp +++ b/yt/yt/library/program/program.cpp @@ -18,8 +18,6 @@ #include <yt/yt/library/profiling/tcmalloc/profiler.h> -#include <library/cpp/yt/stockpile/stockpile.h> - #include <library/cpp/yt/system/exit.h> #include <library/cpp/yt/backtrace/absl_unwinder/absl_unwinder.h> diff --git 
a/yt/yt/library/program/program.h b/yt/yt/library/program/program.h index cd8bf61554..1f47ce93b7 100644 --- a/yt/yt/library/program/program.h +++ b/yt/yt/library/program/program.h @@ -2,8 +2,6 @@ #include <yt/yt/core/misc/public.h> -#include <library/cpp/yt/stockpile/stockpile.h> - #include <library/cpp/getopt/last_getopt.h> #include <yt/yt/core/yson/string.h> diff --git a/yt/yt/library/program/public.h b/yt/yt/library/program/public.h index 34231b1373..e45512239b 100644 --- a/yt/yt/library/program/public.h +++ b/yt/yt/library/program/public.h @@ -8,8 +8,6 @@ namespace NYT { DECLARE_REFCOUNTED_CLASS(TBuildInfo) DECLARE_REFCOUNTED_CLASS(TRpcConfig) -DECLARE_REFCOUNTED_CLASS(TSingletonsConfig) -DECLARE_REFCOUNTED_CLASS(TSingletonsDynamicConfig) DECLARE_REFCOUNTED_CLASS(THeapSizeLimitConfig) DECLARE_REFCOUNTED_CLASS(THeapProfilerConfig) diff --git a/yt/yt/library/program/ya.make b/yt/yt/library/program/ya.make index 5e07ac0d66..eac249a168 100644 --- a/yt/yt/library/program/ya.make +++ b/yt/yt/library/program/ya.make @@ -15,19 +15,12 @@ SRCS( PEERDIR( yt/yt/core - yt/yt/core/service_discovery/yp - yt/yt/library/monitoring - yt/yt/library/oom - yt/yt/library/profiling/solomon yt/yt/library/profiling/tcmalloc yt/yt/library/profiling/perf - yt/yt/library/stockpile yt/yt/library/ytprof - yt/yt/library/tcmalloc - yt/yt/library/tracing/jaeger - library/cpp/yt/mlock - library/cpp/yt/stockpile + yt/yt/library/tcmalloc # for tcmalloc singleton library/cpp/yt/string + library/cpp/yt/system library/cpp/yt/backtrace/absl_unwinder library/cpp/getopt/small ) diff --git a/yt/yt/library/stockpile/config.cpp b/yt/yt/library/stockpile/config.cpp deleted file mode 100644 index 4a2fc69971..0000000000 --- a/yt/yt/library/stockpile/config.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include "config.h" - -namespace NYT { - -using namespace NYTree; - -//////////////////////////////////////////////////////////////////////////////// - -void TStockpileConfig::Register(TRegistrar registrar) -{ - 
registrar.BaseClassParameter("buffer_size", &TThis::BufferSize) - .Default(DefaultBufferSize) - .GreaterThan(0); - registrar.BaseClassParameter("thread_count", &TThis::ThreadCount) - .Default(DefaultThreadCount); - registrar.BaseClassParameter("strategy", &TThis::Strategy) - .Default(DefaultStrategy); - registrar.BaseClassParameter("period", &TThis::Period) - .Default(DefaultPeriod); -} - -TStockpileConfigPtr TStockpileConfig::ApplyDynamic(const TStockpileDynamicConfigPtr& dynamicConfig) const -{ - auto mergedConfig = CloneYsonStruct(MakeStrong(this)); - - if (dynamicConfig->BufferSize) { - mergedConfig->BufferSize = *dynamicConfig->BufferSize; - } - if (dynamicConfig->ThreadCount) { - mergedConfig->ThreadCount = *dynamicConfig->ThreadCount; - } - if (dynamicConfig->Strategy) { - mergedConfig->Strategy = *dynamicConfig->Strategy; - } - if (dynamicConfig->Period) { - mergedConfig->Period = *dynamicConfig->Period; - } - - mergedConfig->Postprocess(); - return mergedConfig; -} - -//////////////////////////////////////////////////////////////////////////////// - -void TStockpileDynamicConfig::Register(TRegistrar registrar) -{ - registrar.BaseClassParameter("buffer_size", &TThis::BufferSize) - .Optional() - .GreaterThan(0); - registrar.BaseClassParameter("thread_count", &TThis::ThreadCount) - .Optional() - .GreaterThanOrEqual(0); - registrar.BaseClassParameter("strategy", &TThis::Strategy) - .Optional(); - registrar.BaseClassParameter("period", &TThis::Period) - .Optional(); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT diff --git a/yt/yt/library/stockpile/config.h b/yt/yt/library/stockpile/config.h deleted file mode 100644 index 7d12b5bfa4..0000000000 --- a/yt/yt/library/stockpile/config.h +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include "public.h" - -#include <yt/yt/core/ytree/yson_struct.h> - -#include <library/cpp/yt/stockpile/stockpile.h> - -namespace NYT { - 
-//////////////////////////////////////////////////////////////////////////////// - -struct TStockpileConfig - : public TStockpileOptions - , public NYTree::TYsonStruct -{ - TStockpileConfigPtr ApplyDynamic(const TStockpileDynamicConfigPtr& dynamicConfig) const; - - REGISTER_YSON_STRUCT(TStockpileConfig); - - static void Register(TRegistrar registrar); -}; - -DEFINE_REFCOUNTED_TYPE(TStockpileConfig) - -//////////////////////////////////////////////////////////////////////////////// - -struct TStockpileDynamicConfig - : public NYTree::TYsonStruct -{ - std::optional<i64> BufferSize; - std::optional<int> ThreadCount; - std::optional<EStockpileStrategy> Strategy; - std::optional<TDuration> Period; - - REGISTER_YSON_STRUCT(TStockpileDynamicConfig); - - static void Register(TRegistrar registrar); -}; - -DEFINE_REFCOUNTED_TYPE(TStockpileDynamicConfig) - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT diff --git a/yt/yt/library/stockpile/public.h b/yt/yt/library/stockpile/public.h deleted file mode 100644 index f71c1dc7e4..0000000000 --- a/yt/yt/library/stockpile/public.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include <yt/yt/core/misc/public.h> - -namespace NYT { - -//////////////////////////////////////////////////////////////////////////////// - -DECLARE_REFCOUNTED_STRUCT(TStockpileConfig) -DECLARE_REFCOUNTED_STRUCT(TStockpileDynamicConfig) - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace NYT diff --git a/yt/yt/library/stockpile/ya.make b/yt/yt/library/stockpile/ya.make deleted file mode 100644 index 9529fab0fb..0000000000 --- a/yt/yt/library/stockpile/ya.make +++ /dev/null @@ -1,14 +0,0 @@ -LIBRARY() - -INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) - -SRCS( - config.cpp -) - -PEERDIR( - yt/yt/core - library/cpp/yt/stockpile -) - -END() diff --git a/yt/yt/library/tcmalloc/configure_tcmalloc_manager.cpp 
b/yt/yt/library/tcmalloc/configure_tcmalloc_manager.cpp new file mode 100644 index 0000000000..d5947bf185 --- /dev/null +++ b/yt/yt/library/tcmalloc/configure_tcmalloc_manager.cpp @@ -0,0 +1,36 @@ +#include "tcmalloc_manager.h" +#include "config.h" + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT::NTCMalloc { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TTCMallocConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TTCMallocConfigPtr& config) +{ + TTCMallocManager::Configure(config); +} + +void ReconfigureSingleton( + const TTCMallocConfigPtr& config, + const TTCMallocConfigPtr& dynamicConfig) +{ + TTCMallocManager::Configure(config->ApplyDynamic(dynamicConfig)); +} + +YT_DEFINE_RECONFIGURABLE_SINGLETON( + "tcmalloc", + TTCMallocConfig, + TTCMallocConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NTCMalloc diff --git a/yt/yt/library/tcmalloc/public.h b/yt/yt/library/tcmalloc/public.h index 620be42454..f6703f918c 100644 --- a/yt/yt/library/tcmalloc/public.h +++ b/yt/yt/library/tcmalloc/public.h @@ -1,6 +1,8 @@ #pragma once -#include <yt/yt/core/misc/public.h> +#include <yt/yt/core/misc/configurable_singleton_decl.h> + +#include <library/cpp/yt/memory/ref_counted.h> namespace NYT::NTCMalloc { @@ -9,6 +11,8 @@ namespace NYT::NTCMalloc { DECLARE_REFCOUNTED_STRUCT(TTCMallocConfig) DECLARE_REFCOUNTED_STRUCT(THeapSizeLimitConfig) +YT_DECLARE_RECONFIGURABLE_SINGLETON(TTCMallocConfig, TTCMallocConfig); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NTCMalloc diff --git a/yt/yt/library/tcmalloc/ya.make b/yt/yt/library/tcmalloc/ya.make index 4042a08971..35e68d60f8 100644 --- a/yt/yt/library/tcmalloc/ya.make +++ b/yt/yt/library/tcmalloc/ya.make @@ -5,6 +5,7 @@ 
INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) SRCS( config.cpp tcmalloc_manager.cpp + GLOBAL configure_tcmalloc_manager.cpp ) PEERDIR( diff --git a/yt/yt/library/tracing/jaeger/configure_tracer.cpp b/yt/yt/library/tracing/jaeger/configure_tracer.cpp new file mode 100644 index 0000000000..376e8ea9a4 --- /dev/null +++ b/yt/yt/library/tracing/jaeger/configure_tracer.cpp @@ -0,0 +1,43 @@ +#include "tracer.h" + +#include <yt/yt/core/misc/configurable_singleton_def.h> + +namespace NYT::NTracing { + +using namespace NYTree; + +//////////////////////////////////////////////////////////////////////////////// + +void SetupSingletonConfigParameter(TYsonStructParameter<TJaegerTracerConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void SetupSingletonConfigParameter(TYsonStructParameter<TJaegerTracerDynamicConfigPtr>& parameter) +{ + parameter.DefaultNew(); +} + +void ConfigureSingleton(const TJaegerTracerConfigPtr& config) +{ + SetGlobalTracer(New<TJaegerTracer>(config)); +} + +void ReconfigureSingleton( + const TJaegerTracerConfigPtr& config, + const TJaegerTracerDynamicConfigPtr& dynamicConfig) +{ + auto tracer = NTracing::GetGlobalTracer(); + auto jaegerTracer = DynamicPointerCast<NTracing::TJaegerTracer>(tracer); + YT_VERIFY(jaegerTracer); + jaegerTracer->Configure(config->ApplyDynamic(dynamicConfig)); +} + +YT_DEFINE_RECONFIGURABLE_SINGLETON( + "jaeger", + TJaegerTracerConfig, + TJaegerTracerDynamicConfig); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NTracing diff --git a/yt/yt/library/tracing/jaeger/public.h b/yt/yt/library/tracing/jaeger/public.h index 1d9506cacf..9eeaead1d9 100644 --- a/yt/yt/library/tracing/jaeger/public.h +++ b/yt/yt/library/tracing/jaeger/public.h @@ -1,6 +1,8 @@ #pragma once -#include <yt/yt/core/misc/public.h> +#include <yt/yt/core/misc/configurable_singleton_decl.h> + +#include <library/cpp/yt/memory/ref_counted.h> namespace NYT::NTracing { @@ -12,6 +14,8 @@ 
DECLARE_REFCOUNTED_CLASS(TSamplerConfig) DECLARE_REFCOUNTED_CLASS(TJaegerTracerDynamicConfig) DECLARE_REFCOUNTED_CLASS(TJaegerTracerConfig) +YT_DECLARE_RECONFIGURABLE_SINGLETON(TJaegerTracerConfig, TJaegerTracerDynamicConfig); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NTracing diff --git a/yt/yt/library/tracing/jaeger/tracer.cpp b/yt/yt/library/tracing/jaeger/tracer.cpp index 090f413790..f29a8be167 100644 --- a/yt/yt/library/tracing/jaeger/tracer.cpp +++ b/yt/yt/library/tracing/jaeger/tracer.cpp @@ -1,3 +1,4 @@ + #include "tracer.h" #include "private.h" diff --git a/yt/yt/library/tracing/jaeger/ya.make b/yt/yt/library/tracing/jaeger/ya.make index b85e518305..14f1877294 100644 --- a/yt/yt/library/tracing/jaeger/ya.make +++ b/yt/yt/library/tracing/jaeger/ya.make @@ -10,8 +10,10 @@ PEERDIR( SRCS( model.proto + sampler.cpp - GLOBAL tracer.cpp + tracer.cpp + GLOBAL configure_tracer.cpp ) END() diff --git a/yt/yt/library/tvm/service/unittests/ya.make b/yt/yt/library/tvm/service/unittests/ya.make deleted file mode 100644 index 23ac522bd0..0000000000 --- a/yt/yt/library/tvm/service/unittests/ya.make +++ /dev/null @@ -1,19 +0,0 @@ -GTEST(unittester-library-auth_tvm) - -INCLUDE(${ARCADIA_ROOT}/yt/opensource.inc) - -PEERDIR( - yt/yt/build - - yt/yt/core/test_framework - - yt/yt/library/tvm/service -) - -EXPLICIT_DATA() - -IF(NOT OPENSOURCE) - INCLUDE(ya_non_opensource.inc) -ENDIF() - -END() |