diff options
author | max42 <max42@yandex-team.com> | 2023-06-30 03:37:03 +0300 |
---|---|---|
committer | max42 <max42@yandex-team.com> | 2023-06-30 03:37:03 +0300 |
commit | fac2bd72b4b31ec3238292caf8fb2a8aaa6d6c4a (patch) | |
tree | b8cbc1deb00309c7f1a7ab6df520a76cf0b5c6d7 /yt/cpp/mapreduce | |
parent | 7bf166b1a7ed0af927f230022b245af618e998c1 (diff) | |
download | ydb-fac2bd72b4b31ec3238292caf8fb2a8aaa6d6c4a.tar.gz |
YT-19324: move YT provider to ydb/library/yql
This commit is formed by the following script: https://paste.yandex-team.ru/6f92e4b8-efc5-4d34-948b-15ee2accd7e7/text.
This commit has zero effect on all projects that depend on YQL.
The summary of changes:
- `yql/providers/yt -> ydb/library/yql/providers/yt` - the whole implementation of the YT provider is moved into the YDB code base for further export as a part of the YT YQL plugin shared library;
- `yql/providers/stat/{expr_nodes,uploader} -> ydb/library/yql/providers/stat/{expr_nodes,uploader}` - a small interface without implementation and the description of stat expr nodes;
- `yql/core/extract_predicate/ut -> ydb/library/yql/core/extract_predicate/ut`;
- `yql/core/{ut,ut_common} -> ydb/library/yql/core/{ut,ut_common}`;
- `yql/core` is gone;
- `yql/library/url_preprocessing -> ydb/library/yql/core/url_preprocessing`.
**NB**: all new targets inside `ydb/` are under an `IF (NOT CMAKE_EXPORT)` clause, which excludes them from open-source cmake generation and from the ya make build. They will be enabled in subsequent commits.
Diffstat (limited to 'yt/cpp/mapreduce')
195 files changed, 43102 insertions, 0 deletions
diff --git a/yt/cpp/mapreduce/client/abortable_registry.cpp b/yt/cpp/mapreduce/client/abortable_registry.cpp new file mode 100644 index 0000000000..283d39e049 --- /dev/null +++ b/yt/cpp/mapreduce/client/abortable_registry.cpp @@ -0,0 +1,125 @@ +#include "abortable_registry.h" + +#include <yt/cpp/mapreduce/common/retry_lib.h> + +#include <yt/cpp/mapreduce/interface/common.h> +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <util/generic/singleton.h> + +namespace NYT { +namespace NDetail { + +using namespace NRawClient; + +//////////////////////////////////////////////////////////////////////////////// + +TTransactionAbortable::TTransactionAbortable(const TClientContext& context, const TTransactionId& transactionId) + : Context_(context) + , TransactionId_(transactionId) +{ } + +void TTransactionAbortable::Abort() +{ + AbortTransaction(nullptr, Context_, TransactionId_); +} + +TString TTransactionAbortable::GetType() const +{ + return "transaction"; +} + +//////////////////////////////////////////////////////////////////////////////// + +TOperationAbortable::TOperationAbortable(IClientRetryPolicyPtr clientRetryPolicy, TClientContext context, const TOperationId& operationId) + : ClientRetryPolicy_(std::move(clientRetryPolicy)) + , Context_(std::move(context)) + , OperationId_(operationId) +{ } + + +void TOperationAbortable::Abort() +{ + AbortOperation(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, OperationId_); +} + +TString TOperationAbortable::GetType() const +{ + return "operation"; +} + +//////////////////////////////////////////////////////////////////////////////// + +void TAbortableRegistry::AbortAllAndBlockForever() +{ + auto guard = Guard(Lock_); + + for (const auto& entry : ActiveAbortables_) { + const auto& id = entry.first; + const auto& abortable = entry.second; + try { + abortable->Abort(); + } catch (std::exception& ex) { + YT_LOG_ERROR("Exception while aborting %v %v: %v", + abortable->GetType(), + id, + ex.what()); 
+ } + } + + Running_ = false; +} + +void TAbortableRegistry::Add(const TGUID& id, IAbortablePtr abortable) +{ + auto guard = Guard(Lock_); + + if (!Running_) { + Sleep(TDuration::Max()); + } + + ActiveAbortables_[id] = abortable; +} + +void TAbortableRegistry::Remove(const TGUID& id) +{ + auto guard = Guard(Lock_); + + if (!Running_) { + Sleep(TDuration::Max()); + } + + ActiveAbortables_.erase(id); +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +class TRegistryHolder +{ +public: + TRegistryHolder() + : Registry_(::MakeIntrusive<TAbortableRegistry>()) + { } + + ::TIntrusivePtr<TAbortableRegistry> Get() + { + return Registry_; + } + +private: + ::TIntrusivePtr<TAbortableRegistry> Registry_; +}; + +} // namespace + +::TIntrusivePtr<TAbortableRegistry> TAbortableRegistry::Get() +{ + return Singleton<TRegistryHolder>()->Get(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/abortable_registry.h b/yt/cpp/mapreduce/client/abortable_registry.h new file mode 100644 index 0000000000..119d685cad --- /dev/null +++ b/yt/cpp/mapreduce/client/abortable_registry.h @@ -0,0 +1,81 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/common.h> + +#include <yt/cpp/mapreduce/http/context.h> + +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> + +#include <util/str_stl.h> +#include <util/system/mutex.h> +#include <util/generic/hash.h> + +namespace NYT { +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +class IAbortable + : public TThrRefBase +{ +public: + virtual void Abort() = 0; + virtual TString GetType() const = 0; +}; + +using IAbortablePtr = ::TIntrusivePtr<IAbortable>; + +//////////////////////////////////////////////////////////////////////////////// + +class TTransactionAbortable + : public IAbortable +{ +public: + 
TTransactionAbortable(const TClientContext& context, const TTransactionId& transactionId); + void Abort() override; + TString GetType() const override; + +private: + TClientContext Context_; + TTransactionId TransactionId_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TOperationAbortable + : public IAbortable +{ +public: + TOperationAbortable(IClientRetryPolicyPtr clientRetryPolicy, TClientContext context, const TOperationId& operationId); + void Abort() override; + TString GetType() const override; + +private: + const IClientRetryPolicyPtr ClientRetryPolicy_; + const TClientContext Context_; + const TOperationId OperationId_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TAbortableRegistry + : public TThrRefBase +{ +public: + TAbortableRegistry() = default; + static ::TIntrusivePtr<TAbortableRegistry> Get(); + + void AbortAllAndBlockForever(); + void Add(const TGUID& id, IAbortablePtr abortable); + void Remove(const TGUID& id); + +private: + THashMap<TGUID, IAbortablePtr> ActiveAbortables_; + TMutex Lock_; + bool Running_ = true; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/batch_request_impl.cpp b/yt/cpp/mapreduce/client/batch_request_impl.cpp new file mode 100644 index 0000000000..6afa5665f1 --- /dev/null +++ b/yt/cpp/mapreduce/client/batch_request_impl.cpp @@ -0,0 +1,198 @@ +#include "batch_request_impl.h" + +#include "lock.h" + +#include <yt/cpp/mapreduce/common/helpers.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> + +#include <yt/cpp/mapreduce/http/retry_request.h> + +#include <yt/cpp/mapreduce/interface/config.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <library/cpp/yson/node/node.h> +#include <library/cpp/yson/node/serialize.h> + +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> +#include 
<yt/cpp/mapreduce/raw_client/raw_batch_request.h> +#include <yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.h> + +#include <util/generic/guid.h> +#include <util/string/builder.h> + +#include <exception> + +namespace NYT { +namespace NDetail { + +using namespace NRawClient; + +using ::NThreading::TFuture; +using ::NThreading::TPromise; +using ::NThreading::NewPromise; + +//////////////////////////////////////////////////////////////////// + +TBatchRequest::TBatchRequest(const TTransactionId& defaultTransaction, ::TIntrusivePtr<TClient> client) + : DefaultTransaction_(defaultTransaction) + , Impl_(MakeIntrusive<TRawBatchRequest>(client->GetContext().Config)) + , Client_(client) +{ } + +TBatchRequest::TBatchRequest(TRawBatchRequest* impl, ::TIntrusivePtr<TClient> client) + : Impl_(impl) + , Client_(std::move(client)) +{ } + +TBatchRequest::~TBatchRequest() = default; + +IBatchRequestBase& TBatchRequest::WithTransaction(const TTransactionId& transactionId) +{ + if (!TmpWithTransaction_) { + TmpWithTransaction_.Reset(new TBatchRequest(Impl_.Get(), Client_)); + } + TmpWithTransaction_->DefaultTransaction_ = transactionId; + return *TmpWithTransaction_; +} + +TFuture<TNode> TBatchRequest::Get( + const TYPath& path, + const TGetOptions& options) +{ + return Impl_->Get(DefaultTransaction_, path, options); +} + +TFuture<void> TBatchRequest::Set(const TYPath& path, const TNode& node, const TSetOptions& options) +{ + return Impl_->Set(DefaultTransaction_, path, node, options); +} + +TFuture<TNode::TListType> TBatchRequest::List(const TYPath& path, const TListOptions& options) +{ + return Impl_->List(DefaultTransaction_, path, options); +} + +TFuture<bool> TBatchRequest::Exists(const TYPath& path, const TExistsOptions& options) +{ + return Impl_->Exists(DefaultTransaction_, path, options); +} + +TFuture<ILockPtr> TBatchRequest::Lock( + const TYPath& path, + ELockMode mode, + const TLockOptions& options) +{ + auto convert = [waitable=options.Waitable_, client=Client_] 
(TFuture<TNodeId> nodeIdFuture) -> ILockPtr { + return ::MakeIntrusive<TLock>(nodeIdFuture.GetValue(), client, waitable); + }; + return Impl_->Lock(DefaultTransaction_, path, mode, options).Apply(convert); +} + +::NThreading::TFuture<void> TBatchRequest::Unlock( + const TYPath& path, + const TUnlockOptions& options = TUnlockOptions()) +{ + return Impl_->Unlock(DefaultTransaction_, path, options); +} + +TFuture<TLockId> TBatchRequest::Create( + const TYPath& path, + ENodeType type, + const TCreateOptions& options) +{ + return Impl_->Create(DefaultTransaction_, path, type, options); +} + +TFuture<void> TBatchRequest::Remove( + const TYPath& path, + const TRemoveOptions& options) +{ + return Impl_->Remove(DefaultTransaction_, path, options); +} + +TFuture<TNodeId> TBatchRequest::Move( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options) +{ + return Impl_->Move(DefaultTransaction_, sourcePath, destinationPath, options); +} + +TFuture<TNodeId> TBatchRequest::Copy( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options) +{ + return Impl_->Copy(DefaultTransaction_, sourcePath, destinationPath, options); +} + +TFuture<TNodeId> TBatchRequest::Link( + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options) +{ + return Impl_->Link(DefaultTransaction_, targetPath, linkPath, options); +} + +TFuture<void> TBatchRequest::AbortOperation(const NYT::TOperationId& operationId) +{ + return Impl_->AbortOperation(operationId); +} + +TFuture<void> TBatchRequest::CompleteOperation(const NYT::TOperationId& operationId) +{ + return Impl_->CompleteOperation(operationId); +} + +TFuture<void> TBatchRequest::SuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options) +{ + return Impl_->SuspendOperation(operationId, options); +} + +TFuture<void> TBatchRequest::ResumeOperation( + const TOperationId& operationId, + const TResumeOperationOptions& options) +{ + 
return Impl_->ResumeOperation(operationId, options); +} + +TFuture<void> TBatchRequest::UpdateOperationParameters( + const NYT::TOperationId& operationId, + const NYT::TUpdateOperationParametersOptions& options) +{ + return Impl_->UpdateOperationParameters(operationId, options); +} + +TFuture<TRichYPath> TBatchRequest::CanonizeYPath(const TRichYPath& path) +{ + return Impl_->CanonizeYPath(path); +} + +TFuture<TVector<TTableColumnarStatistics>> TBatchRequest::GetTableColumnarStatistics( + const TVector<TRichYPath>& paths, + const NYT::TGetTableColumnarStatisticsOptions& options) +{ + return Impl_->GetTableColumnarStatistics(DefaultTransaction_, paths, options); +} + +TFuture<TCheckPermissionResponse> TBatchRequest::CheckPermission( + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options) +{ + return Impl_->CheckPermission(user, permission, path, options); +} + +void TBatchRequest::ExecuteBatch(const TExecuteBatchOptions& options) +{ + NYT::NDetail::ExecuteBatch(Client_->GetRetryPolicy()->CreatePolicyForGenericRequest(), Client_->GetContext(), *Impl_, options); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/batch_request_impl.h b/yt/cpp/mapreduce/client/batch_request_impl.h new file mode 100644 index 0000000000..0a176417b3 --- /dev/null +++ b/yt/cpp/mapreduce/client/batch_request_impl.h @@ -0,0 +1,137 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/batch_request.h> +#include <yt/cpp/mapreduce/interface/fwd.h> +#include <yt/cpp/mapreduce/interface/node.h> + +#include <yt/cpp/mapreduce/http/requests.h> + +#include <library/cpp/threading/future/future.h> + +#include <util/generic/ptr.h> +#include <util/generic/deque.h> + +#include <exception> + +namespace NYT { +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +struct 
TResponseInfo; +class TClient; +using TClientPtr = ::TIntrusivePtr<TClient>; + +namespace NRawClient { + class TRawBatchRequest; +} + +//////////////////////////////////////////////////////////////////////////////// + +class TBatchRequest + : public IBatchRequest +{ +public: + TBatchRequest(const TTransactionId& defaultTransaction, ::TIntrusivePtr<TClient> client); + + ~TBatchRequest(); + + virtual IBatchRequestBase& WithTransaction(const TTransactionId& transactionId) override; + + virtual ::NThreading::TFuture<TLockId> Create( + const TYPath& path, + ENodeType type, + const TCreateOptions& options = TCreateOptions()) override; + + virtual ::NThreading::TFuture<void> Remove( + const TYPath& path, + const TRemoveOptions& options = TRemoveOptions()) override; + + virtual ::NThreading::TFuture<bool> Exists( + const TYPath& path, + const TExistsOptions& options = TExistsOptions()) override; + + virtual ::NThreading::TFuture<TNode> Get( + const TYPath& path, + const TGetOptions& options = TGetOptions()) override; + + virtual ::NThreading::TFuture<void> Set( + const TYPath& path, + const TNode& node, + const TSetOptions& options = TSetOptions()) override; + + virtual ::NThreading::TFuture<TNode::TListType> List( + const TYPath& path, + const TListOptions& options = TListOptions()) override; + + virtual ::NThreading::TFuture<TNodeId> Copy( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options = TCopyOptions()) override; + + virtual ::NThreading::TFuture<TNodeId> Move( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options = TMoveOptions()) override; + + virtual ::NThreading::TFuture<TNodeId> Link( + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options = TLinkOptions()) override; + + virtual ::NThreading::TFuture<ILockPtr> Lock( + const TYPath& path, + ELockMode mode, + const TLockOptions& options) override; + + virtual ::NThreading::TFuture<void> Unlock( + const TYPath& 
path, + const TUnlockOptions& options) override; + + virtual ::NThreading::TFuture<void> AbortOperation(const TOperationId& operationId) override; + + virtual ::NThreading::TFuture<void> CompleteOperation(const TOperationId& operationId) override; + + ::NThreading::TFuture<void> SuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options) override; + + ::NThreading::TFuture<void> ResumeOperation( + const TOperationId& operationId, + const TResumeOperationOptions& options) override; + + virtual ::NThreading::TFuture<void> UpdateOperationParameters( + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options) override; + + virtual ::NThreading::TFuture<TRichYPath> CanonizeYPath(const TRichYPath& path) override; + + virtual ::NThreading::TFuture<TVector<TTableColumnarStatistics>> GetTableColumnarStatistics( + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options) override; + + ::NThreading::TFuture<TCheckPermissionResponse> CheckPermission( + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options) override; + + virtual void ExecuteBatch(const TExecuteBatchOptions& executeBatch) override; + +private: + TBatchRequest(NDetail::NRawClient::TRawBatchRequest* impl, ::TIntrusivePtr<TClient> client); + +private: + TTransactionId DefaultTransaction_; + ::TIntrusivePtr<NDetail::NRawClient::TRawBatchRequest> Impl_; + THolder<TBatchRequest> TmpWithTransaction_; + ::TIntrusivePtr<TClient> Client_; + +private: + friend class NYT::NDetail::TClient; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/client.cpp b/yt/cpp/mapreduce/client/client.cpp new file mode 100644 index 0000000000..ca979c5588 --- /dev/null +++ b/yt/cpp/mapreduce/client/client.cpp @@ -0,0 +1,1361 @@ +#include "client.h" + +#include 
"batch_request_impl.h" +#include "client_reader.h" +#include "client_writer.h" +#include "file_reader.h" +#include "file_writer.h" +#include "format_hints.h" +#include "lock.h" +#include "operation.h" +#include "retry_transaction.h" +#include "retryful_writer.h" +#include "transaction.h" +#include "transaction_pinger.h" +#include "yt_poller.h" + +#include <yt/cpp/mapreduce/common/helpers.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> + +#include <yt/cpp/mapreduce/http/helpers.h> +#include <yt/cpp/mapreduce/http/http.h> +#include <yt/cpp/mapreduce/http/http_client.h> +#include <yt/cpp/mapreduce/http/requests.h> +#include <yt/cpp/mapreduce/http/retry_request.h> + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/client.h> +#include <yt/cpp/mapreduce/interface/fluent.h> +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> +#include <yt/cpp/mapreduce/interface/skiff_row.h> + +#include <yt/cpp/mapreduce/io/yamr_table_reader.h> +#include <yt/cpp/mapreduce/io/yamr_table_writer.h> +#include <yt/cpp/mapreduce/io/node_table_reader.h> +#include <yt/cpp/mapreduce/io/node_table_writer.h> +#include <yt/cpp/mapreduce/io/proto_table_reader.h> +#include <yt/cpp/mapreduce/io/proto_table_writer.h> +#include <yt/cpp/mapreduce/io/skiff_row_table_reader.h> +#include <yt/cpp/mapreduce/io/proto_helpers.h> + +#include <yt/cpp/mapreduce/library/table_schema/protobuf.h> + +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> +#include <yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.h> + +#include <library/cpp/json/json_reader.h> + +#include <util/generic/algorithm.h> +#include <util/string/type.h> +#include <util/system/env.h> + +#include <exception> + +using namespace NYT::NDetail::NRawClient; + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +TClientBase::TClientBase( + const 
TClientContext& context, + const TTransactionId& transactionId, + IClientRetryPolicyPtr retryPolicy) + : Context_(context) + , TransactionId_(transactionId) + , ClientRetryPolicy_(std::move(retryPolicy)) +{ } + +ITransactionPtr TClientBase::StartTransaction( + const TStartTransactionOptions& options) +{ + return MakeIntrusive<TTransaction>(GetParentClientImpl(), Context_, TransactionId_, options); +} + +TNodeId TClientBase::Create( + const TYPath& path, + ENodeType type, + const TCreateOptions& options) +{ + return NRawClient::Create(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, path, type, options); +} + +void TClientBase::Remove( + const TYPath& path, + const TRemoveOptions& options) +{ + return NRawClient::Remove(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, path, options); +} + +bool TClientBase::Exists( + const TYPath& path, + const TExistsOptions& options) +{ + return NRawClient::Exists(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, path, options); +} + +TNode TClientBase::Get( + const TYPath& path, + const TGetOptions& options) +{ + return NRawClient::Get(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, path, options); +} + +void TClientBase::Set( + const TYPath& path, + const TNode& value, + const TSetOptions& options) +{ + NRawClient::Set(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, path, value, options); +} + +void TClientBase::MultisetAttributes( + const TYPath& path, const TNode::TMapType& value, const TMultisetAttributesOptions& options) +{ + NRawClient::MultisetAttributes(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, path, value, options); +} + + +TNode::TListType TClientBase::List( + const TYPath& path, + const TListOptions& options) +{ + return NRawClient::List(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, path, options); +} 
+ +TNodeId TClientBase::Copy( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options) +{ + return NRawClient::Copy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, sourcePath, destinationPath, options); +} + +TNodeId TClientBase::Move( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options) +{ + return NRawClient::Move(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, sourcePath, destinationPath, options); +} + +TNodeId TClientBase::Link( + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options) +{ + return NRawClient::Link(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, targetPath, linkPath, options); +} + +void TClientBase::Concatenate( + const TVector<TRichYPath>& sourcePaths, + const TRichYPath& destinationPath, + const TConcatenateOptions& options) +{ + std::function<void(ITransactionPtr)> lambda = [&sourcePaths, &destinationPath, &options, this](ITransactionPtr transaction) { + if (!options.Append_ && !sourcePaths.empty() && !transaction->Exists(destinationPath.Path_)) { + auto typeNode = transaction->Get(CanonizeYPath(sourcePaths.front()).Path_ + "/@type"); + auto type = FromString<ENodeType>(typeNode.AsString()); + transaction->Create(destinationPath.Path_, type, TCreateOptions().IgnoreExisting(true)); + } + NRawClient::Concatenate(this->Context_, transaction->GetId(), sourcePaths, destinationPath, options); + }; + RetryTransactionWithPolicy(this, lambda, ClientRetryPolicy_->CreatePolicyForGenericRequest()); +} + +TRichYPath TClientBase::CanonizeYPath(const TRichYPath& path) +{ + return NRawClient::CanonizeYPath(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, path); +} + +TVector<TTableColumnarStatistics> TClientBase::GetTableColumnarStatistics( + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options) +{ + return 
NRawClient::GetTableColumnarStatistics( + ClientRetryPolicy_->CreatePolicyForGenericRequest(), + Context_, + TransactionId_, + paths, + options); +} + +TMultiTablePartitions TClientBase::GetTablePartitions( + const TVector<TRichYPath>& paths, + const TGetTablePartitionsOptions& options) +{ + return NRawClient::GetTablePartitions( + ClientRetryPolicy_->CreatePolicyForGenericRequest(), + Context_, + TransactionId_, + paths, + options); +} + +TMaybe<TYPath> TClientBase::GetFileFromCache( + const TString& md5Signature, + const TYPath& cachePath, + const TGetFileFromCacheOptions& options) +{ + return NRawClient::GetFileFromCache(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, md5Signature, cachePath, options); +} + +TYPath TClientBase::PutFileToCache( + const TYPath& filePath, + const TString& md5Signature, + const TYPath& cachePath, + const TPutFileToCacheOptions& options) +{ + return NRawClient::PutFileToCache(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, filePath, md5Signature, cachePath, options); +} + +IFileReaderPtr TClientBase::CreateBlobTableReader( + const TYPath& path, + const TKey& key, + const TBlobTableReaderOptions& options) +{ + return new TBlobTableReader( + path, + key, + ClientRetryPolicy_, + GetTransactionPinger(), + Context_, + TransactionId_, + options); +} + +IFileReaderPtr TClientBase::CreateFileReader( + const TRichYPath& path, + const TFileReaderOptions& options) +{ + return new TFileReader( + CanonizeYPath(path), + ClientRetryPolicy_, + GetTransactionPinger(), + Context_, + TransactionId_, + options); +} + +IFileWriterPtr TClientBase::CreateFileWriter( + const TRichYPath& path, + const TFileWriterOptions& options) +{ + auto realPath = CanonizeYPath(path); + if (!NRawClient::Exists(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, realPath.Path_)) { + NRawClient::Create(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, 
realPath.Path_, NT_FILE, + TCreateOptions().IgnoreExisting(true)); + } + return new TFileWriter(realPath, ClientRetryPolicy_, GetTransactionPinger(), Context_, TransactionId_, options); +} + +TTableWriterPtr<::google::protobuf::Message> TClientBase::CreateTableWriter( + const TRichYPath& path, const ::google::protobuf::Descriptor& descriptor, const TTableWriterOptions& options) +{ + const Message* prototype = google::protobuf::MessageFactory::generated_factory()->GetPrototype(&descriptor); + return new TTableWriter<::google::protobuf::Message>(CreateProtoWriter(path, options, prototype)); +} + +TRawTableReaderPtr TClientBase::CreateRawReader( + const TRichYPath& path, + const TFormat& format, + const TTableReaderOptions& options) +{ + return CreateClientReader(path, format, options).Get(); +} + +TRawTableWriterPtr TClientBase::CreateRawWriter( + const TRichYPath& path, + const TFormat& format, + const TTableWriterOptions& options) +{ + return ::MakeIntrusive<TRetryfulWriter>( + ClientRetryPolicy_, + GetTransactionPinger(), + Context_, + TransactionId_, + GetWriteTableCommand(Context_.Config->ApiVersion), + format, + CanonizeYPath(path), + options).Get(); +} + +IOperationPtr TClientBase::DoMap( + const TMapOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> mapper, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_ = ::TIntrusivePtr(this), + operation, + spec, + mapper, + options + ] () { + ExecuteMap( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + mapper, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::RawMap( + const TRawMapOperationSpec& spec, + ::TIntrusivePtr<IRawJob> mapper, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + 
auto prepareOperation = [ + this_=::TIntrusivePtr(this), + operation, + spec, + mapper, + options + ] () { + ExecuteRawMap( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + mapper, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::DoReduce( + const TReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_=::TIntrusivePtr(this), + operation, + spec, + reducer, + options + ] () { + ExecuteReduce( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + reducer, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::RawReduce( + const TRawReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> reducer, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_=::TIntrusivePtr(this), + operation, + spec, + reducer, + options + ] () { + ExecuteRawReduce( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + reducer, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::DoJoinReduce( + const TJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_=::TIntrusivePtr(this), + operation, + spec, + reducer, + options + ] () { + ExecuteJoinReduce( + operation, + 
::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + reducer, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::RawJoinReduce( + const TRawJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> reducer, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_=::TIntrusivePtr(this), + operation, + spec, + reducer, + options + ] () { + ExecuteRawJoinReduce( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + reducer, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::DoMapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> mapper, + ::TIntrusivePtr<IStructuredJob> reduceCombiner, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_=::TIntrusivePtr(this), + operation, + spec, + mapper, + reduceCombiner, + reducer, + options + ] () { + ExecuteMapReduce( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + mapper, + reduceCombiner, + reducer, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::RawMapReduce( + const TRawMapReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> mapper, + ::TIntrusivePtr<IRawJob> reduceCombiner, + ::TIntrusivePtr<IRawJob> reducer, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_=::TIntrusivePtr(this), + 
operation, + spec, + mapper, + reduceCombiner, + reducer, + options + ] () { + ExecuteRawMapReduce( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + mapper, + reduceCombiner, + reducer, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::Sort( + const TSortOperationSpec& spec, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_ = ::TIntrusivePtr(this), + operation, + spec, + options + ] () { + ExecuteSort( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::Merge( + const TMergeOperationSpec& spec, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_ = ::TIntrusivePtr(this), + operation, + spec, + options + ] () { + ExecuteMerge( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::Erase( + const TEraseOperationSpec& spec, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_ = ::TIntrusivePtr(this), + operation, + spec, + options + ] () { + ExecuteErase( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), 
options); +} + +IOperationPtr TClientBase::RemoteCopy( + const TRemoteCopyOperationSpec& spec, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_ = ::TIntrusivePtr(this), + operation, + spec, + options + ] () { + ExecuteRemoteCopy( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::RunVanilla( + const TVanillaOperationSpec& spec, + const TOperationOptions& options) +{ + auto operation = ::MakeIntrusive<TOperation>(GetParentClientImpl()); + auto prepareOperation = [ + this_ = ::TIntrusivePtr(this), + operation, + spec, + options + ] () { + ExecuteVanilla( + operation, + ::MakeIntrusive<TOperationPreparer>(this_->GetParentClientImpl(), this_->TransactionId_), + spec, + options); + }; + return ProcessOperation(GetParentClientImpl(), std::move(prepareOperation), std::move(operation), options); +} + +IOperationPtr TClientBase::AttachOperation(const TOperationId& operationId) +{ + auto operation = ::MakeIntrusive<TOperation>(operationId, GetParentClientImpl()); + operation->GetBriefState(); // check that operation exists + return operation; +} + +EOperationBriefState TClientBase::CheckOperation(const TOperationId& operationId) +{ + return NYT::NDetail::CheckOperation(ClientRetryPolicy_, Context_, operationId); +} + +void TClientBase::AbortOperation(const TOperationId& operationId) +{ + NRawClient::AbortOperation(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, operationId); +} + +void TClientBase::CompleteOperation(const TOperationId& operationId) +{ + NRawClient::CompleteOperation(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, operationId); +} + +void TClientBase::WaitForOperation(const TOperationId& operationId) +{ + 
NYT::NDetail::WaitForOperation(ClientRetryPolicy_, Context_, operationId); +} + +void TClientBase::AlterTable( + const TYPath& path, + const TAlterTableOptions& options) +{ + NRawClient::AlterTable(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, path, options); +} + +::TIntrusivePtr<TClientReader> TClientBase::CreateClientReader( + const TRichYPath& path, + const TFormat& format, + const TTableReaderOptions& options, + bool useFormatFromTableAttributes) +{ + return ::MakeIntrusive<TClientReader>( + CanonizeYPath(path), + ClientRetryPolicy_, + GetTransactionPinger(), + Context_, + TransactionId_, + format, + options, + useFormatFromTableAttributes); +} + +THolder<TClientWriter> TClientBase::CreateClientWriter( + const TRichYPath& path, + const TFormat& format, + const TTableWriterOptions& options) +{ + auto realPath = CanonizeYPath(path); + if (!NRawClient::Exists(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, realPath.Path_)) { + NRawClient::Create(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, realPath.Path_, NT_TABLE, + TCreateOptions().IgnoreExisting(true)); + } + return MakeHolder<TClientWriter>( + realPath, + ClientRetryPolicy_, + GetTransactionPinger(), + Context_, + TransactionId_, + format, + options + ); +} + +::TIntrusivePtr<INodeReaderImpl> TClientBase::CreateNodeReader( + const TRichYPath& path, const TTableReaderOptions& options) +{ + auto format = TFormat::YsonBinary(); + ApplyFormatHints<TNode>(&format, options.FormatHints_); + + // Skiff is disabled here because of large header problem (see https://st.yandex-team.ru/YT-6926). + // Revert this code to r3614168 when it is fixed. 
+ return new TNodeTableReader( + CreateClientReader(path, format, options)); +} + +::TIntrusivePtr<IYaMRReaderImpl> TClientBase::CreateYaMRReader( + const TRichYPath& path, const TTableReaderOptions& options) +{ + return new TYaMRTableReader( + CreateClientReader(path, TFormat::YaMRLenval(), options, /* useFormatFromTableAttributes = */ true)); +} + +::TIntrusivePtr<IProtoReaderImpl> TClientBase::CreateProtoReader( + const TRichYPath& path, + const TTableReaderOptions& options, + const Message* prototype) +{ + TVector<const ::google::protobuf::Descriptor*> descriptors; + descriptors.push_back(prototype->GetDescriptor()); + + if (Context_.Config->UseClientProtobuf) { + return new TProtoTableReader( + CreateClientReader(path, TFormat::YsonBinary(), options), + std::move(descriptors)); + } else { + auto format = TFormat::Protobuf({prototype->GetDescriptor()}, Context_.Config->ProtobufFormatWithDescriptors); + return new TLenvalProtoTableReader( + CreateClientReader(path, format, options), + std::move(descriptors)); + } +} + +::TIntrusivePtr<ISkiffRowReaderImpl> TClientBase::CreateSkiffRowReader( + const TRichYPath& path, + const TTableReaderOptions& options, + const ISkiffRowSkipperPtr& skipper, + const NSkiff::TSkiffSchemaPtr& schema) +{ + auto skiffOptions = TCreateSkiffSchemaOptions().HasRangeIndex(true); + auto resultSchema = NYT::NDetail::CreateSkiffSchema(TVector{schema}, skiffOptions); + return new TSkiffRowTableReader( + CreateClientReader(path, NYT::NDetail::CreateSkiffFormat(resultSchema), options), + resultSchema, + {skipper}, + std::move(skiffOptions)); +} + +::TIntrusivePtr<INodeWriterImpl> TClientBase::CreateNodeWriter( + const TRichYPath& path, const TTableWriterOptions& options) +{ + auto format = TFormat::YsonBinary(); + ApplyFormatHints<TNode>(&format, options.FormatHints_); + + return new TNodeTableWriter( + CreateClientWriter(path, format, options)); +} + +::TIntrusivePtr<IYaMRWriterImpl> TClientBase::CreateYaMRWriter( + const TRichYPath& path, 
const TTableWriterOptions& options) +{ + auto format = TFormat::YaMRLenval(); + ApplyFormatHints<TYaMRRow>(&format, options.FormatHints_); + + return new TYaMRTableWriter( + CreateClientWriter(path, format, options)); +} + +::TIntrusivePtr<IProtoWriterImpl> TClientBase::CreateProtoWriter( + const TRichYPath& path, + const TTableWriterOptions& options, + const Message* prototype) +{ + TVector<const ::google::protobuf::Descriptor*> descriptors; + descriptors.push_back(prototype->GetDescriptor()); + + auto pathWithSchema = path; + if (options.InferSchema_.GetOrElse(Context_.Config->InferTableSchema) && !path.Schema_) { + pathWithSchema.Schema(CreateTableSchema(*prototype->GetDescriptor())); + } + + if (Context_.Config->UseClientProtobuf) { + auto format = TFormat::YsonBinary(); + ApplyFormatHints<TNode>(&format, options.FormatHints_); + return new TProtoTableWriter( + CreateClientWriter(pathWithSchema, format, options), + std::move(descriptors)); + } else { + auto format = TFormat::Protobuf({prototype->GetDescriptor()}, Context_.Config->ProtobufFormatWithDescriptors); + ApplyFormatHints<::google::protobuf::Message>(&format, options.FormatHints_); + return new TLenvalProtoTableWriter( + CreateClientWriter(pathWithSchema, format, options), + std::move(descriptors)); + } +} + +TBatchRequestPtr TClientBase::CreateBatchRequest() +{ + return MakeIntrusive<TBatchRequest>(TransactionId_, GetParentClientImpl()); +} + +IClientPtr TClientBase::GetParentClient() +{ + return GetParentClientImpl(); +} + +const TClientContext& TClientBase::GetContext() const +{ + return Context_; +} + +const IClientRetryPolicyPtr& TClientBase::GetRetryPolicy() const +{ + return ClientRetryPolicy_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TTransaction::TTransaction( + TClientPtr parentClient, + const TClientContext& context, + const TTransactionId& parentTransactionId, + const TStartTransactionOptions& options) + : TClientBase(context, 
parentTransactionId, parentClient->GetRetryPolicy()) + , TransactionPinger_(parentClient->GetTransactionPinger()) + , PingableTx_( + MakeHolder<TPingableTransaction>( + parentClient->GetRetryPolicy(), + context, + parentTransactionId, + TransactionPinger_->GetChildTxPinger(), + options)) + , ParentClient_(parentClient) +{ + TransactionId_ = PingableTx_->GetId(); +} + +TTransaction::TTransaction( + TClientPtr parentClient, + const TClientContext& context, + const TTransactionId& transactionId, + const TAttachTransactionOptions& options) + : TClientBase(context, transactionId, parentClient->GetRetryPolicy()) + , TransactionPinger_(parentClient->GetTransactionPinger()) + , PingableTx_( + new TPingableTransaction( + parentClient->GetRetryPolicy(), + context, + transactionId, + parentClient->GetTransactionPinger()->GetChildTxPinger(), + options)) + , ParentClient_(parentClient) +{ } + +const TTransactionId& TTransaction::GetId() const +{ + return TransactionId_; +} + +ILockPtr TTransaction::Lock( + const TYPath& path, + ELockMode mode, + const TLockOptions& options) +{ + auto lockId = NRawClient::Lock(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, path, mode, options); + return ::MakeIntrusive<TLock>(lockId, GetParentClientImpl(), options.Waitable_); +} + +void TTransaction::Unlock( + const TYPath& path, + const TUnlockOptions& options) +{ + NRawClient::Unlock(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_, path, options); +} + +void TTransaction::Commit() +{ + PingableTx_->Commit(); +} + +void TTransaction::Abort() +{ + PingableTx_->Abort(); +} + +void TTransaction::Ping() +{ + PingTx(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, TransactionId_); +} + +void TTransaction::Detach() +{ + PingableTx_->Detach(); +} + +ITransactionPingerPtr TTransaction::GetTransactionPinger() +{ + return TransactionPinger_; +} + +TClientPtr TTransaction::GetParentClientImpl() +{ + return ParentClient_; +} + 
+//////////////////////////////////////////////////////////////////////////////// + +TClient::TClient( + const TClientContext& context, + const TTransactionId& globalId, + IClientRetryPolicyPtr retryPolicy) + : TClientBase(context, globalId, retryPolicy) + , TransactionPinger_(nullptr) +{ } + +TClient::~TClient() = default; + +ITransactionPtr TClient::AttachTransaction( + const TTransactionId& transactionId, + const TAttachTransactionOptions& options) +{ + CheckShutdown(); + + return MakeIntrusive<TTransaction>(this, Context_, transactionId, options); +} + +void TClient::MountTable( + const TYPath& path, + const TMountTableOptions& options) +{ + CheckShutdown(); + + THttpHeader header("POST", "mount_table"); + SetTabletParams(header, path, options); + if (options.CellId_) { + header.AddParameter("cell_id", GetGuidAsString(*options.CellId_)); + } + header.AddParameter("freeze", options.Freeze_); + RetryRequestWithPolicy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, header); +} + +void TClient::UnmountTable( + const TYPath& path, + const TUnmountTableOptions& options) +{ + CheckShutdown(); + + THttpHeader header("POST", "unmount_table"); + SetTabletParams(header, path, options); + header.AddParameter("force", options.Force_); + RetryRequestWithPolicy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, header); +} + +void TClient::RemountTable( + const TYPath& path, + const TRemountTableOptions& options) +{ + CheckShutdown(); + + THttpHeader header("POST", "remount_table"); + SetTabletParams(header, path, options); + RetryRequestWithPolicy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, header); +} + +void TClient::FreezeTable( + const TYPath& path, + const TFreezeTableOptions& options) +{ + CheckShutdown(); + NRawClient::FreezeTable(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, path, options); +} + +void TClient::UnfreezeTable( + const TYPath& path, + const TUnfreezeTableOptions& options) +{ + 
CheckShutdown(); + NRawClient::UnfreezeTable(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, path, options); +} + +void TClient::ReshardTable( + const TYPath& path, + const TVector<TKey>& keys, + const TReshardTableOptions& options) +{ + CheckShutdown(); + + THttpHeader header("POST", "reshard_table"); + SetTabletParams(header, path, options); + header.AddParameter("pivot_keys", BuildYsonNodeFluently().List(keys)); + RetryRequestWithPolicy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, header); +} + +void TClient::ReshardTable( + const TYPath& path, + i64 tabletCount, + const TReshardTableOptions& options) +{ + CheckShutdown(); + + THttpHeader header("POST", "reshard_table"); + SetTabletParams(header, path, options); + header.AddParameter("tablet_count", tabletCount); + RetryRequestWithPolicy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, header); +} + +void TClient::InsertRows( + const TYPath& path, + const TNode::TListType& rows, + const TInsertRowsOptions& options) +{ + CheckShutdown(); + + THttpHeader header("PUT", "insert_rows"); + header.SetInputFormat(TFormat::YsonBinary()); + // TODO: use corresponding raw request + header.MergeParameters(SerializeParametersForInsertRows(Context_.Config->Prefix, path, options)); + + auto body = NodeListToYsonString(rows); + TRequestConfig config; + config.IsHeavy = true; + RetryRequestWithPolicy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, header, body, config); +} + +void TClient::DeleteRows( + const TYPath& path, + const TNode::TListType& keys, + const TDeleteRowsOptions& options) +{ + CheckShutdown(); + return NRawClient::DeleteRows(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, path, keys, options); +} + +void TClient::TrimRows( + const TYPath& path, + i64 tabletIndex, + i64 rowCount, + const TTrimRowsOptions& options) +{ + CheckShutdown(); + + THttpHeader header("POST", "trim_rows"); + header.AddParameter("trimmed_row_count", rowCount); + 
header.AddParameter("tablet_index", tabletIndex); + // TODO: use corresponding raw request + header.MergeParameters(NRawClient::SerializeParametersForTrimRows(Context_.Config->Prefix, path, options)); + + TRequestConfig config; + config.IsHeavy = true; + RetryRequestWithPolicy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, header, {}, config); +} + +TNode::TListType TClient::LookupRows( + const TYPath& path, + const TNode::TListType& keys, + const TLookupRowsOptions& options) +{ + CheckShutdown(); + + Y_UNUSED(options); + THttpHeader header("PUT", "lookup_rows"); + header.AddPath(AddPathPrefix(path, Context_.Config->ApiVersion)); + header.SetInputFormat(TFormat::YsonBinary()); + header.SetOutputFormat(TFormat::YsonBinary()); + + header.MergeParameters(BuildYsonNodeFluently().BeginMap() + .DoIf(options.Timeout_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("timeout").Value(static_cast<i64>(options.Timeout_->MilliSeconds())); + }) + .Item("keep_missing_rows").Value(options.KeepMissingRows_) + .DoIf(options.Versioned_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("versioned").Value(*options.Versioned_); + }) + .DoIf(options.Columns_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("column_names").Value(*options.Columns_); + }) + .EndMap()); + + auto body = NodeListToYsonString(keys); + TRequestConfig config; + config.IsHeavy = true; + auto result = RetryRequestWithPolicy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, header, body, config); + return NodeFromYsonString(result.Response, ::NYson::EYsonType::ListFragment).AsList(); +} + +TNode::TListType TClient::SelectRows( + const TString& query, + const TSelectRowsOptions& options) +{ + CheckShutdown(); + + THttpHeader header("GET", "select_rows"); + header.SetInputFormat(TFormat::YsonBinary()); + header.SetOutputFormat(TFormat::YsonBinary()); + + header.MergeParameters(BuildYsonNodeFluently().BeginMap() + .Item("query").Value(query) + .DoIf(options.Timeout_.Defined(), [&] 
(TFluentMap fluent) { + fluent.Item("timeout").Value(static_cast<i64>(options.Timeout_->MilliSeconds())); + }) + .DoIf(options.InputRowLimit_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("input_row_limit").Value(*options.InputRowLimit_); + }) + .DoIf(options.OutputRowLimit_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("output_row_limit").Value(*options.OutputRowLimit_); + }) + .Item("range_expansion_limit").Value(options.RangeExpansionLimit_) + .Item("fail_on_incomplete_result").Value(options.FailOnIncompleteResult_) + .Item("verbose_logging").Value(options.VerboseLogging_) + .Item("enable_code_cache").Value(options.EnableCodeCache_) + .EndMap()); + + TRequestConfig config; + config.IsHeavy = true; + auto result = RetryRequestWithPolicy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, header, {}, config); + return NodeFromYsonString(result.Response, ::NYson::EYsonType::ListFragment).AsList(); +} + +void TClient::AlterTableReplica(const TReplicaId& replicaId, const TAlterTableReplicaOptions& options) +{ + CheckShutdown(); + NRawClient::AlterTableReplica(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, replicaId, options); +} + +ui64 TClient::GenerateTimestamp() +{ + CheckShutdown(); + THttpHeader header("GET", "generate_timestamp"); + TRequestConfig config; + config.IsHeavy = true; + auto requestResult = RetryRequestWithPolicy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, header, {}, config); + return NodeFromYsonString(requestResult.Response).AsUint64(); +} + +TAuthorizationInfo TClient::WhoAmI() +{ + CheckShutdown(); + + THttpHeader header("GET", "auth/whoami", /* isApi = */ false); + auto requestResult = RetryRequestWithPolicy(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, header); + TAuthorizationInfo result; + + NJson::TJsonValue jsonValue; + bool ok = NJson::ReadJsonTree(requestResult.Response, &jsonValue, /* throwOnError = */ true); + Y_VERIFY(ok); + result.Login = 
jsonValue["login"].GetString(); + result.Realm = jsonValue["realm"].GetString(); + return result; +} + +TOperationAttributes TClient::GetOperation( + const TOperationId& operationId, + const TGetOperationOptions& options) +{ + CheckShutdown(); + return NRawClient::GetOperation(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, operationId, options); +} + +TListOperationsResult TClient::ListOperations( + const TListOperationsOptions& options) +{ + CheckShutdown(); + return NRawClient::ListOperations(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, options); +} + +void TClient::UpdateOperationParameters( + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options) +{ + CheckShutdown(); + return NRawClient::UpdateOperationParameters(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, operationId, options); +} + +TJobAttributes TClient::GetJob( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobOptions& options) +{ + CheckShutdown(); + return NRawClient::GetJob(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, operationId, jobId, options); +} + +TListJobsResult TClient::ListJobs( + const TOperationId& operationId, + const TListJobsOptions& options) +{ + CheckShutdown(); + return NRawClient::ListJobs(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, operationId, options); +} + +IFileReaderPtr TClient::GetJobInput( + const TJobId& jobId, + const TGetJobInputOptions& options) +{ + CheckShutdown(); + return NRawClient::GetJobInput(Context_, jobId, options); +} + +IFileReaderPtr TClient::GetJobFailContext( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobFailContextOptions& options) +{ + CheckShutdown(); + return NRawClient::GetJobFailContext(Context_, operationId, jobId, options); +} + +IFileReaderPtr TClient::GetJobStderr( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobStderrOptions& options) +{ + CheckShutdown(); 
+ return NRawClient::GetJobStderr(Context_, operationId, jobId, options); +} + +TNode::TListType TClient::SkyShareTable( + const std::vector<TYPath>& tablePaths, + const TSkyShareTableOptions& options) +{ + CheckShutdown(); + return NRawClient::SkyShareTable( + ClientRetryPolicy_->CreatePolicyForGenericRequest(), + Context_, + tablePaths, + options); +} + +TCheckPermissionResponse TClient::CheckPermission( + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options) +{ + CheckShutdown(); + return NRawClient::CheckPermission(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, user, permission, path, options); +} + +TVector<TTabletInfo> TClient::GetTabletInfos( + const TYPath& path, + const TVector<int>& tabletIndexes, + const TGetTabletInfosOptions& options) +{ + CheckShutdown(); + return NRawClient::GetTabletInfos(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, path, tabletIndexes, options); +} + + +void TClient::SuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options) +{ + CheckShutdown(); + NRawClient::SuspendOperation(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, operationId, options); +} + +void TClient::ResumeOperation( + const TOperationId& operationId, + const TResumeOperationOptions& options) +{ + CheckShutdown(); + NRawClient::ResumeOperation(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, operationId, options); +} + +TYtPoller& TClient::GetYtPoller() +{ + auto g = Guard(YtPollerLock_); + if (!YtPoller_) { + CheckShutdown(); + // We don't use current client and create new client because YtPoller_ might use + // this client during current client shutdown. + // That might lead to incrementing of current client refcount and double delete of current client object. 
+ YtPoller_ = MakeHolder<TYtPoller>(Context_, ClientRetryPolicy_); + } + return *YtPoller_; +} + +void TClient::Shutdown() +{ + auto g = Guard(YtPollerLock_); + + if (!Shutdown_.exchange(true) && YtPoller_) { + YtPoller_->Stop(); + } +} + +ITransactionPingerPtr TClient::GetTransactionPinger() +{ + if (!TransactionPinger_) { + TransactionPinger_ = CreateTransactionPinger(Context_.Config); + } + return TransactionPinger_; +} + +TClientPtr TClient::GetParentClientImpl() +{ + return this; +} + +template <class TOptions> +void TClient::SetTabletParams( + THttpHeader& header, + const TYPath& path, + const TOptions& options) +{ + header.AddPath(AddPathPrefix(path, Context_.Config->Prefix)); + if (options.FirstTabletIndex_) { + header.AddParameter("first_tablet_index", *options.FirstTabletIndex_); + } + if (options.LastTabletIndex_) { + header.AddParameter("last_tablet_index", *options.LastTabletIndex_); + } +} + +void TClient::CheckShutdown() const +{ + if (Shutdown_) { + ythrow TApiUsageError() << "Call client's methods after shutdown"; + } +} + +TClientPtr CreateClientImpl( + const TString& serverName, + const TCreateClientOptions& options) +{ + TClientContext context; + context.Config = options.Config_ ? 
options.Config_ : TConfig::Get(); + context.TvmOnly = options.TvmOnly_; + context.UseTLS = options.UseTLS_; + + context.ServerName = serverName; + if (serverName.find('.') == TString::npos && + serverName.find(':') == TString::npos) + { + context.ServerName += ".yt.yandex.net"; + } + + if (serverName.find(':') == TString::npos) { + context.ServerName = CreateHostNameWithPort(context.ServerName, context); + } + if (options.TvmOnly_) { + context.ServerName = Format("tvm.%v", context.ServerName); + } + + if (options.UseTLS_ || options.UseCoreHttpClient_) { + context.HttpClient = NHttpClient::CreateCoreHttpClient(options.UseTLS_, context.Config); + } else { + context.HttpClient = NHttpClient::CreateDefaultHttpClient(); + } + + context.Token = context.Config->Token; + if (options.Token_) { + context.Token = options.Token_; + } else if (options.TokenPath_) { + context.Token = TConfig::LoadTokenFromFile(options.TokenPath_); + } else if (options.ServiceTicketAuth_) { + context.ServiceTicketAuth = options.ServiceTicketAuth_; + } + + context.ImpersonationUser = options.ImpersonationUser_; + + if (context.Token) { + TConfig::ValidateToken(context.Token); + } + + auto globalTxId = GetGuid(context.Config->GlobalTxId); + + auto retryConfigProvider = options.RetryConfigProvider_; + if (!retryConfigProvider) { + retryConfigProvider = CreateDefaultRetryConfigProvider(); + } + return new NDetail::TClient(context, globalTxId, CreateDefaultClientRetryPolicy(retryConfigProvider, context.Config)); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +IClientPtr CreateClient( + const TString& serverName, + const TCreateClientOptions& options) +{ + return NDetail::CreateClientImpl(serverName, options); +} + +IClientPtr CreateClientFromEnv(const TCreateClientOptions& options) +{ + auto serverName = GetEnv("YT_PROXY"); + if (!serverName) { + 
ythrow yexception() << "YT_PROXY is not set"; + } + + return NDetail::CreateClientImpl(serverName, options); +} + + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/client.h b/yt/cpp/mapreduce/client/client.h new file mode 100644 index 0000000000..0f4df09d0b --- /dev/null +++ b/yt/cpp/mapreduce/client/client.h @@ -0,0 +1,506 @@ +#pragma once + +#include "client_reader.h" +#include "client_writer.h" +#include "transaction_pinger.h" + +#include <yt/cpp/mapreduce/interface/client.h> + +#include <yt/cpp/mapreduce/http/context.h> +#include <yt/cpp/mapreduce/http/requests.h> + +namespace NYT { +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +class TYtPoller; + +class TClientBase; +using TClientBasePtr = ::TIntrusivePtr<TClientBase>; + +class TClient; +using TClientPtr = ::TIntrusivePtr<TClient>; + +//////////////////////////////////////////////////////////////////////////////// + +class TClientBase + : virtual public IClientBase +{ +public: + TClientBase( + const TClientContext& context, + const TTransactionId& transactionId, + IClientRetryPolicyPtr retryPolicy); + + ITransactionPtr StartTransaction( + const TStartTransactionOptions& options) override; + + // cypress + + TNodeId Create( + const TYPath& path, + ENodeType type, + const TCreateOptions& options) override; + + void Remove( + const TYPath& path, + const TRemoveOptions& options) override; + + bool Exists( + const TYPath& path, + const TExistsOptions& options) override; + + TNode Get( + const TYPath& path, + const TGetOptions& options) override; + + void Set( + const TYPath& path, + const TNode& value, + const TSetOptions& options) override; + + void MultisetAttributes( + const TYPath& path, + const TNode::TMapType& value, + const TMultisetAttributesOptions& options) override; + + TNode::TListType List( + const TYPath& path, + const TListOptions& options) 
override; + + TNodeId Copy( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options) override; + + TNodeId Move( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options) override; + + TNodeId Link( + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options) override; + + void Concatenate( + const TVector<TRichYPath>& sourcePaths, + const TRichYPath& destinationPath, + const TConcatenateOptions& options) override; + + TRichYPath CanonizeYPath(const TRichYPath& path) override; + + TVector<TTableColumnarStatistics> GetTableColumnarStatistics( + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options) override; + + TMultiTablePartitions GetTablePartitions( + const TVector<TRichYPath>& paths, + const TGetTablePartitionsOptions& options) override; + + TMaybe<TYPath> GetFileFromCache( + const TString& md5Signature, + const TYPath& cachePath, + const TGetFileFromCacheOptions& options = TGetFileFromCacheOptions()) override; + + TYPath PutFileToCache( + const TYPath& filePath, + const TString& md5Signature, + const TYPath& cachePath, + const TPutFileToCacheOptions& options = TPutFileToCacheOptions()) override; + + IFileReaderPtr CreateFileReader( + const TRichYPath& path, + const TFileReaderOptions& options) override; + + IFileWriterPtr CreateFileWriter( + const TRichYPath& path, + const TFileWriterOptions& options) override; + + TTableWriterPtr<::google::protobuf::Message> CreateTableWriter( + const TRichYPath& path, + const ::google::protobuf::Descriptor& descriptor, + const TTableWriterOptions& options) override; + + TRawTableReaderPtr CreateRawReader( + const TRichYPath& path, + const TFormat& format, + const TTableReaderOptions& options) override; + + TRawTableWriterPtr CreateRawWriter( + const TRichYPath& path, + const TFormat& format, + const TTableWriterOptions& options) override; + + IFileReaderPtr CreateBlobTableReader( + const TYPath& path, 
+ const TKey& key, + const TBlobTableReaderOptions& options) override; + + // operations + + IOperationPtr DoMap( + const TMapOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> mapper, + const TOperationOptions& options) override; + + IOperationPtr RawMap( + const TRawMapOperationSpec& spec, + ::TIntrusivePtr<IRawJob> mapper, + const TOperationOptions& options) override; + + IOperationPtr DoReduce( + const TReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) override; + + IOperationPtr RawReduce( + const TRawReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> mapper, + const TOperationOptions& options) override; + + IOperationPtr DoJoinReduce( + const TJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) override; + + IOperationPtr RawJoinReduce( + const TRawJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> mapper, + const TOperationOptions& options) override; + + IOperationPtr DoMapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> mapper, + ::TIntrusivePtr<IStructuredJob> reduceCombiner, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) override; + + IOperationPtr RawMapReduce( + const TRawMapReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> mapper, + ::TIntrusivePtr<IRawJob> reduceCombiner, + ::TIntrusivePtr<IRawJob> reducer, + const TOperationOptions& options) override; + + IOperationPtr Sort( + const TSortOperationSpec& spec, + const TOperationOptions& options) override; + + IOperationPtr Merge( + const TMergeOperationSpec& spec, + const TOperationOptions& options) override; + + IOperationPtr Erase( + const TEraseOperationSpec& spec, + const TOperationOptions& options) override; + + IOperationPtr RemoteCopy( + const TRemoteCopyOperationSpec& spec, + const TOperationOptions& options = TOperationOptions()) override; + + IOperationPtr RunVanilla( + const TVanillaOperationSpec& 
spec, + const TOperationOptions& options = TOperationOptions()) override; + + IOperationPtr AttachOperation(const TOperationId& operationId) override; + + EOperationBriefState CheckOperation(const TOperationId& operationId) override; + + void AbortOperation(const TOperationId& operationId) override; + + void CompleteOperation(const TOperationId& operationId) override; + + void WaitForOperation(const TOperationId& operationId) override; + + void AlterTable( + const TYPath& path, + const TAlterTableOptions& options) override; + + TBatchRequestPtr CreateBatchRequest() override; + + IClientPtr GetParentClient() override; + + const TClientContext& GetContext() const; + + const IClientRetryPolicyPtr& GetRetryPolicy() const; + + virtual ITransactionPingerPtr GetTransactionPinger() = 0; + +protected: + virtual TClientPtr GetParentClientImpl() = 0; + +protected: + const TClientContext Context_; + TTransactionId TransactionId_; + IClientRetryPolicyPtr ClientRetryPolicy_; + +private: + ::TIntrusivePtr<TClientReader> CreateClientReader( + const TRichYPath& path, + const TFormat& format, + const TTableReaderOptions& options, + bool useFormatFromTableAttributes = false); + + THolder<TClientWriter> CreateClientWriter( + const TRichYPath& path, + const TFormat& format, + const TTableWriterOptions& options); + + ::TIntrusivePtr<INodeReaderImpl> CreateNodeReader( + const TRichYPath& path, const TTableReaderOptions& options) override; + + ::TIntrusivePtr<IYaMRReaderImpl> CreateYaMRReader( + const TRichYPath& path, const TTableReaderOptions& options) override; + + ::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader( + const TRichYPath& path, + const TTableReaderOptions& options, + const Message* prototype) override; + + ::TIntrusivePtr<ISkiffRowReaderImpl> CreateSkiffRowReader( + const TRichYPath& path, + const TTableReaderOptions& options, + const ISkiffRowSkipperPtr& skipper, + const NSkiff::TSkiffSchemaPtr& schema) override; + + ::TIntrusivePtr<INodeWriterImpl> CreateNodeWriter( + 
const TRichYPath& path, const TTableWriterOptions& options) override; + + ::TIntrusivePtr<IYaMRWriterImpl> CreateYaMRWriter( + const TRichYPath& path, const TTableWriterOptions& options) override; + + ::TIntrusivePtr<IProtoWriterImpl> CreateProtoWriter( + const TRichYPath& path, + const TTableWriterOptions& options, + const Message* prototype) override; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TTransaction + : public ITransaction + , public TClientBase +{ +public: + // + // Start a new transaction. + TTransaction( + TClientPtr parentClient, + const TClientContext& context, + const TTransactionId& parentTransactionId, + const TStartTransactionOptions& options); + + // + // Attach an existing transaction. + TTransaction( + TClientPtr parentClient, + const TClientContext& context, + const TTransactionId& transactionId, + const TAttachTransactionOptions& options); + + const TTransactionId& GetId() const override; + + ILockPtr Lock( + const TYPath& path, + ELockMode mode, + const TLockOptions& options) override; + + void Unlock( + const TYPath& path, + const TUnlockOptions& options) override; + + void Commit() override; + + void Abort() override; + + void Ping() override; + + void Detach() override; + + ITransactionPingerPtr GetTransactionPinger() override; + +protected: + TClientPtr GetParentClientImpl() override; + +private: + ITransactionPingerPtr TransactionPinger_; + THolder<TPingableTransaction> PingableTx_; + TClientPtr ParentClient_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TClient + : public IClient + , public TClientBase +{ +public: + TClient( + const TClientContext& context, + const TTransactionId& globalId, + IClientRetryPolicyPtr retryPolicy); + + ~TClient(); + + ITransactionPtr AttachTransaction( + const TTransactionId& transactionId, + const TAttachTransactionOptions& options) override; + + void MountTable( + const TYPath& path, + const 
TMountTableOptions& options) override; + + void UnmountTable( + const TYPath& path, + const TUnmountTableOptions& options) override; + + void RemountTable( + const TYPath& path, + const TRemountTableOptions& options) override; + + void FreezeTable( + const TYPath& path, + const TFreezeTableOptions& options) override; + + void UnfreezeTable( + const TYPath& path, + const TUnfreezeTableOptions& options) override; + + void ReshardTable( + const TYPath& path, + const TVector<TKey>& keys, + const TReshardTableOptions& options) override; + + void ReshardTable( + const TYPath& path, + i64 tabletCount, + const TReshardTableOptions& options) override; + + void InsertRows( + const TYPath& path, + const TNode::TListType& rows, + const TInsertRowsOptions& options) override; + + void DeleteRows( + const TYPath& path, + const TNode::TListType& keys, + const TDeleteRowsOptions& options) override; + + void TrimRows( + const TYPath& path, + i64 tabletIndex, + i64 rowCount, + const TTrimRowsOptions& options) override; + + TNode::TListType LookupRows( + const TYPath& path, + const TNode::TListType& keys, + const TLookupRowsOptions& options) override; + + TNode::TListType SelectRows( + const TString& query, + const TSelectRowsOptions& options) override; + + void AlterTableReplica( + const TReplicaId& replicaId, + const TAlterTableReplicaOptions& alterTableReplicaOptions) override; + + ui64 GenerateTimestamp() override; + + TAuthorizationInfo WhoAmI() override; + + TOperationAttributes GetOperation( + const TOperationId& operationId, + const TGetOperationOptions& options) override; + + TListOperationsResult ListOperations( + const TListOperationsOptions& options) override; + + void UpdateOperationParameters( + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options) override; + + TJobAttributes GetJob( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobOptions& options) override; + + TListJobsResult ListJobs( + const TOperationId& 
operationId, + const TListJobsOptions& options = TListJobsOptions()) override; + + IFileReaderPtr GetJobInput( + const TJobId& jobId, + const TGetJobInputOptions& options = TGetJobInputOptions()) override; + + IFileReaderPtr GetJobFailContext( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobFailContextOptions& options = TGetJobFailContextOptions()) override; + + IFileReaderPtr GetJobStderr( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobStderrOptions& options = TGetJobStderrOptions()) override; + + TNode::TListType SkyShareTable( + const std::vector<TYPath>& tablePaths, + const TSkyShareTableOptions& options = TSkyShareTableOptions()) override; + + TCheckPermissionResponse CheckPermission( + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options) override; + + TVector<TTabletInfo> GetTabletInfos( + const TYPath& path, + const TVector<int>& tabletIndexes, + const TGetTabletInfosOptions& options) override; + + void SuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options) override; + + void ResumeOperation( + const TOperationId& operationId, + const TResumeOperationOptions& options) override; + + void Shutdown() override; + + ITransactionPingerPtr GetTransactionPinger() override; + + // Helper methods + TYtPoller& GetYtPoller(); + +protected: + TClientPtr GetParentClientImpl() override; + +private: + template <class TOptions> + void SetTabletParams( + THttpHeader& header, + const TYPath& path, + const TOptions& options); + + void CheckShutdown() const; + + ITransactionPingerPtr TransactionPinger_; + + std::atomic<bool> Shutdown_ = false; + TMutex YtPollerLock_; + THolder<TYtPoller> YtPoller_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +TClientPtr CreateClientImpl( + const TString& serverName, + const TCreateClientOptions& options = TCreateClientOptions()); + 
+//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/client_reader.cpp b/yt/cpp/mapreduce/client/client_reader.cpp new file mode 100644 index 0000000000..80759b12dc --- /dev/null +++ b/yt/cpp/mapreduce/client/client_reader.cpp @@ -0,0 +1,232 @@ +#include "client_reader.h" + +#include "structured_table_formats.h" +#include "transaction.h" +#include "transaction_pinger.h" + +#include <yt/cpp/mapreduce/common/helpers.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> +#include <yt/cpp/mapreduce/common/wait_proxy.h> + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/tvm.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <yt/cpp/mapreduce/io/helpers.h> +#include <yt/cpp/mapreduce/io/yamr_table_reader.h> + +#include <yt/cpp/mapreduce/http/helpers.h> +#include <yt/cpp/mapreduce/http/requests.h> +#include <yt/cpp/mapreduce/http/retry_request.h> + +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> + +#include <library/cpp/yson/node/serialize.h> + +#include <util/random/random.h> +#include <util/stream/file.h> +#include <util/stream/str.h> +#include <util/string/builder.h> +#include <util/string/cast.h> + +namespace NYT { + +using ::ToString; + +//////////////////////////////////////////////////////////////////////////////// + +TClientReader::TClientReader( + const TRichYPath& path, + IClientRetryPolicyPtr clientRetryPolicy, + ITransactionPingerPtr transactionPinger, + const TClientContext& context, + const TTransactionId& transactionId, + const TFormat& format, + const TTableReaderOptions& options, + bool useFormatFromTableAttributes) + : Path_(path) + , ClientRetryPolicy_(std::move(clientRetryPolicy)) + , Context_(context) + , ParentTransactionId_(transactionId) + , Format_(format) + , Options_(options) + , ReadTransaction_(nullptr) +{ + if (options.CreateTransaction_) { + Y_VERIFY(transactionPinger, 
"Internal error: transactionPinger is null"); + ReadTransaction_ = MakeHolder<TPingableTransaction>( + ClientRetryPolicy_, + Context_, + transactionId, + transactionPinger->GetChildTxPinger(), + TStartTransactionOptions()); + Path_.Path(Snapshot( + ClientRetryPolicy_, + Context_, + ReadTransaction_->GetId(), + path.Path_)); + } + + if (useFormatFromTableAttributes) { + auto transactionId2 = ReadTransaction_ ? ReadTransaction_->GetId() : ParentTransactionId_; + auto newFormat = GetTableFormat(ClientRetryPolicy_, Context_, transactionId2, Path_); + if (newFormat) { + Format_->Config = *newFormat; + } + } + + TransformYPath(); + CreateRequest(); +} + +bool TClientReader::Retry( + const TMaybe<ui32>& rangeIndex, + const TMaybe<ui64>& rowIndex) +{ + if (CurrentRequestRetryPolicy_) { + // TODO we should pass actual exception in Retry function + yexception genericError; + auto backoff = CurrentRequestRetryPolicy_->OnGenericError(genericError); + if (!backoff) { + return false; + } + } + + try { + CreateRequest(rangeIndex, rowIndex); + return true; + } catch (const std::exception& ex) { + YT_LOG_ERROR("Client reader retry failed: %v", + ex.what()); + + return false; + } +} + +void TClientReader::ResetRetries() +{ + CurrentRequestRetryPolicy_ = nullptr; +} + +size_t TClientReader::DoRead(void* buf, size_t len) +{ + return Input_->Read(buf, len); +} + +void TClientReader::TransformYPath() +{ + for (auto& range : Path_.MutableRangesView()) { + auto& exact = range.Exact_; + if (IsTrivial(exact)) { + continue; + } + + if (exact.RowIndex_) { + range.LowerLimit(TReadLimit().RowIndex(*exact.RowIndex_)); + range.UpperLimit(TReadLimit().RowIndex(*exact.RowIndex_ + 1)); + exact.RowIndex_.Clear(); + + } else if (exact.Key_) { + range.LowerLimit(TReadLimit().Key(*exact.Key_)); + + auto lastPart = TNode::CreateEntity(); + lastPart.Attributes() = TNode()("type", "max"); + exact.Key_->Parts_.push_back(lastPart); + + range.UpperLimit(TReadLimit().Key(*exact.Key_)); + exact.Key_.Clear(); + 
} + } +} + +void TClientReader::CreateRequest(const TMaybe<ui32>& rangeIndex, const TMaybe<ui64>& rowIndex) +{ + if (!CurrentRequestRetryPolicy_) { + CurrentRequestRetryPolicy_ = ClientRetryPolicy_->CreatePolicyForGenericRequest(); + } + while (true) { + CurrentRequestRetryPolicy_->NotifyNewAttempt(); + + THttpHeader header("GET", GetReadTableCommand(Context_.Config->ApiVersion)); + if (Context_.ServiceTicketAuth) { + header.SetServiceTicket(Context_.ServiceTicketAuth->Ptr->IssueServiceTicket()); + } else { + header.SetToken(Context_.Token); + } + auto transactionId = (ReadTransaction_ ? ReadTransaction_->GetId() : ParentTransactionId_); + header.AddTransactionId(transactionId); + + const auto& controlAttributes = Options_.ControlAttributes_; + header.AddParameter("control_attributes", TNode() + ("enable_row_index", controlAttributes.EnableRowIndex_) + ("enable_range_index", controlAttributes.EnableRangeIndex_)); + header.SetOutputFormat(Format_); + + header.SetResponseCompression(ToString(Context_.Config->AcceptEncoding)); + + if (rowIndex.Defined()) { + auto& ranges = Path_.MutableRanges(); + if (ranges.Empty()) { + ranges.ConstructInPlace(TVector{TReadRange()}); + } else { + if (rangeIndex.GetOrElse(0) >= ranges->size()) { + ythrow yexception() + << "range index " << rangeIndex.GetOrElse(0) + << " is out of range, input range count is " << ranges->size(); + } + ranges->erase(ranges->begin(), ranges->begin() + rangeIndex.GetOrElse(0)); + } + ranges->begin()->LowerLimit(TReadLimit().RowIndex(*rowIndex)); + } + + header.MergeParameters(FormIORequestParameters(Path_, Options_)); + + auto requestId = CreateGuidAsString(); + + try { + const auto proxyName = GetProxyForHeavyRequest(Context_); + Response_ = Context_.HttpClient->Request(GetFullUrl(proxyName, Context_, header), requestId, header); + + Input_ = Response_->GetResponseStream(); + + YT_LOG_DEBUG("RSP %v - table stream", requestId); + + return; + } catch (const TErrorResponse& e) { + LogRequestError( + 
requestId, + header, + e.what(), + CurrentRequestRetryPolicy_->GetAttemptDescription()); + + if (!IsRetriable(e)) { + throw; + } + auto backoff = CurrentRequestRetryPolicy_->OnRetriableError(e); + if (!backoff) { + throw; + } + NDetail::TWaitProxy::Get()->Sleep(*backoff); + } catch (const std::exception& e) { + LogRequestError( + requestId, + header, + e.what(), + CurrentRequestRetryPolicy_->GetAttemptDescription()); + + Response_.reset(); + Input_ = nullptr; + + auto backoff = CurrentRequestRetryPolicy_->OnGenericError(e); + if (!backoff) { + throw; + } + NDetail::TWaitProxy::Get()->Sleep(*backoff); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/client_reader.h b/yt/cpp/mapreduce/client/client_reader.h new file mode 100644 index 0000000000..22f5a0ebb0 --- /dev/null +++ b/yt/cpp/mapreduce/client/client_reader.h @@ -0,0 +1,65 @@ +#pragma once + +#include <yt/cpp/mapreduce/common/fwd.h> + +#include <yt/cpp/mapreduce/interface/io.h> + +#include <yt/cpp/mapreduce/http/context.h> +#include <yt/cpp/mapreduce/http/requests.h> +#include <yt/cpp/mapreduce/http/http.h> +#include <yt/cpp/mapreduce/http/http_client.h> + +namespace NYT { + +class TPingableTransaction; + +//////////////////////////////////////////////////////////////////////////////// + +class TClientReader + : public TRawTableReader +{ +public: + TClientReader( + const TRichYPath& path, + IClientRetryPolicyPtr clientRetryPolicy, + ITransactionPingerPtr transactionPinger, + const TClientContext& context, + const TTransactionId& transactionId, + const TFormat& format, + const TTableReaderOptions& options, + bool useFormatFromTableAttributes); + + bool Retry( + const TMaybe<ui32>& rangeIndex, + const TMaybe<ui64>& rowIndex) override; + + void ResetRetries() override; + + bool HasRangeIndices() const override { return true; } + +protected: + size_t DoRead(void* buf, size_t len) override; + +private: + 
TRichYPath Path_; + const IClientRetryPolicyPtr ClientRetryPolicy_; + const TClientContext Context_; + TTransactionId ParentTransactionId_; + TMaybe<TFormat> Format_; + TTableReaderOptions Options_; + + THolder<TPingableTransaction> ReadTransaction_; + + NHttpClient::IHttpResponsePtr Response_; + IInputStream* Input_; + + IRequestRetryPolicyPtr CurrentRequestRetryPolicy_; + +private: + void TransformYPath(); + void CreateRequest(const TMaybe<ui32>& rangeIndex = Nothing(), const TMaybe<ui64>& rowIndex = Nothing()); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/client_writer.cpp b/yt/cpp/mapreduce/client/client_writer.cpp new file mode 100644 index 0000000000..357abd32eb --- /dev/null +++ b/yt/cpp/mapreduce/client/client_writer.cpp @@ -0,0 +1,69 @@ +#include "client_writer.h" + +#include "retryful_writer.h" +#include "retryless_writer.h" + +#include <yt/cpp/mapreduce/interface/io.h> +#include <yt/cpp/mapreduce/common/fwd.h> +#include <yt/cpp/mapreduce/common/helpers.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +TClientWriter::TClientWriter( + const TRichYPath& path, + IClientRetryPolicyPtr clientRetryPolicy, + ITransactionPingerPtr transactionPinger, + const TClientContext& context, + const TTransactionId& transactionId, + const TMaybe<TFormat>& format, + const TTableWriterOptions& options) + : BUFFER_SIZE(options.BufferSize_) +{ + if (options.SingleHttpRequest_) { + RawWriter_.Reset(new TRetrylessWriter( + context, + transactionId, + GetWriteTableCommand(context.Config->ApiVersion), + format, + path, + BUFFER_SIZE, + options)); + } else { + RawWriter_.Reset(new TRetryfulWriter( + std::move(clientRetryPolicy), + std::move(transactionPinger), + context, + transactionId, + GetWriteTableCommand(context.Config->ApiVersion), + format, + path, + options)); + } +} + +size_t TClientWriter::GetStreamCount() const 
+{ + return 1; +} + +IOutputStream* TClientWriter::GetStream(size_t tableIndex) const +{ + Y_UNUSED(tableIndex); + return RawWriter_.Get(); +} + +void TClientWriter::OnRowFinished(size_t) +{ + RawWriter_->NotifyRowEnd(); +} + +void TClientWriter::Abort() +{ + RawWriter_->Abort(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/client_writer.h b/yt/cpp/mapreduce/client/client_writer.h new file mode 100644 index 0000000000..010a88a8ff --- /dev/null +++ b/yt/cpp/mapreduce/client/client_writer.h @@ -0,0 +1,42 @@ +#pragma once + +#include <yt/cpp/mapreduce/common/fwd.h> + +#include <yt/cpp/mapreduce/http/requests.h> +#include <yt/cpp/mapreduce/interface/io.h> + +namespace NYT { + +struct TTableWriterOptions; +class TRetryfulWriter; + +//////////////////////////////////////////////////////////////////////////////// + +class TClientWriter + : public IProxyOutput +{ +public: + TClientWriter( + const TRichYPath& path, + IClientRetryPolicyPtr clientRetryPolicy, + ITransactionPingerPtr transactionPinger, + const TClientContext& context, + const TTransactionId& transactionId, + const TMaybe<TFormat>& format, + const TTableWriterOptions& options); + + size_t GetStreamCount() const override; + IOutputStream* GetStream(size_t tableIndex) const override; + void OnRowFinished(size_t tableIndex) override; + void Abort() override; + +private: + ::TIntrusivePtr<TRawTableWriter> RawWriter_; + + const size_t BUFFER_SIZE = 64 << 20; +}; + +//////////////////////////////////////////////////////////////////////////////// + + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/dummy_job_profiler.cpp b/yt/cpp/mapreduce/client/dummy_job_profiler.cpp new file mode 100644 index 0000000000..5a2f1e8d46 --- /dev/null +++ b/yt/cpp/mapreduce/client/dummy_job_profiler.cpp @@ -0,0 +1,26 @@ +#include "job_profiler.h" + +namespace NYT { + 
+//////////////////////////////////////////////////////////////////////////////// + +class TDummyJobProfiler + : public IJobProfiler +{ + void Start() override + { } + + void Stop() override + { } +}; + +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr<IJobProfiler> CreateJobProfiler() +{ + return std::make_unique<TDummyJobProfiler>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/file_reader.cpp b/yt/cpp/mapreduce/client/file_reader.cpp new file mode 100644 index 0000000000..fc21e0bc02 --- /dev/null +++ b/yt/cpp/mapreduce/client/file_reader.cpp @@ -0,0 +1,243 @@ +#include "file_reader.h" + +#include "transaction.h" +#include "transaction_pinger.h" + +#include <yt/cpp/mapreduce/common/helpers.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> +#include <yt/cpp/mapreduce/common/wait_proxy.h> + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/tvm.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <yt/cpp/mapreduce/io/helpers.h> + +#include <yt/cpp/mapreduce/http/helpers.h> +#include <yt/cpp/mapreduce/http/http.h> +#include <yt/cpp/mapreduce/http/http_client.h> +#include <yt/cpp/mapreduce/http/retry_request.h> + +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> + +namespace NYT { +namespace NDetail { + +using ::ToString; + +//////////////////////////////////////////////////////////////////////////////// + +static TMaybe<ui64> GetEndOffset(const TFileReaderOptions& options) { + if (options.Length_) { + return options.Offset_.GetOrElse(0) + *options.Length_; + } else { + return Nothing(); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +TStreamReaderBase::TStreamReaderBase( + IClientRetryPolicyPtr clientRetryPolicy, + ITransactionPingerPtr transactionPinger, + const TClientContext& context, + const TTransactionId& 
transactionId) + : Context_(context) + , ClientRetryPolicy_(std::move(clientRetryPolicy)) + , ReadTransaction_(MakeHolder<TPingableTransaction>( + ClientRetryPolicy_, + context, + transactionId, + transactionPinger->GetChildTxPinger(), + TStartTransactionOptions())) +{ } + +TStreamReaderBase::~TStreamReaderBase() = default; + +TYPath TStreamReaderBase::Snapshot(const TYPath& path) +{ + return NYT::Snapshot(ClientRetryPolicy_, Context_, ReadTransaction_->GetId(), path); +} + +TString TStreamReaderBase::GetActiveRequestId() const +{ + if (Response_) { + return Response_->GetRequestId();; + } else { + return "<no-active-request>"; + } +} + +size_t TStreamReaderBase::DoRead(void* buf, size_t len) +{ + const int retryCount = Context_.Config->ReadRetryCount; + for (int attempt = 1; attempt <= retryCount; ++attempt) { + try { + if (!Input_) { + Response_ = Request(Context_, ReadTransaction_->GetId(), CurrentOffset_); + Input_ = Response_->GetResponseStream(); + } + if (len == 0) { + return 0; + } + const size_t read = Input_->Read(buf, len); + CurrentOffset_ += read; + return read; + } catch (TErrorResponse& e) { + YT_LOG_ERROR("RSP %v - failed: %v (attempt %v of %v)", + GetActiveRequestId(), + e.what(), + attempt, + retryCount); + + if (!IsRetriable(e) || attempt == retryCount) { + throw; + } + NDetail::TWaitProxy::Get()->Sleep(GetBackoffDuration(e, Context_.Config)); + } catch (std::exception& e) { + YT_LOG_ERROR("RSP %v - failed: %v (attempt %v of %v)", + GetActiveRequestId(), + e.what(), + attempt, + retryCount); + + // Invalidate connection. 
+ Response_.reset(); + + if (attempt == retryCount) { + throw; + } + NDetail::TWaitProxy::Get()->Sleep(GetBackoffDuration(e, Context_.Config)); + } + Input_ = nullptr; + } + Y_UNREACHABLE(); // we should either return or throw from loop above +} + +//////////////////////////////////////////////////////////////////////////////// + +TFileReader::TFileReader( + const TRichYPath& path, + IClientRetryPolicyPtr clientRetryPolicy, + ITransactionPingerPtr transactionPinger, + const TClientContext& context, + const TTransactionId& transactionId, + const TFileReaderOptions& options) + : TStreamReaderBase(std::move(clientRetryPolicy), std::move(transactionPinger), context, transactionId) + , FileReaderOptions_(options) + , Path_(path) + , StartOffset_(FileReaderOptions_.Offset_.GetOrElse(0)) + , EndOffset_(GetEndOffset(FileReaderOptions_)) +{ + Path_.Path_ = TStreamReaderBase::Snapshot(Path_.Path_); +} + +NHttpClient::IHttpResponsePtr TFileReader::Request(const TClientContext& context, const TTransactionId& transactionId, ui64 readBytes) +{ + const ui64 currentOffset = StartOffset_ + readBytes; + TString hostName = GetProxyForHeavyRequest(context); + + THttpHeader header("GET", GetReadFileCommand(context.Config->ApiVersion)); + if (context.ServiceTicketAuth) { + header.SetServiceTicket(context.ServiceTicketAuth->Ptr->IssueServiceTicket()); + } else { + header.SetToken(context.Token); + } + header.AddTransactionId(transactionId); + header.SetOutputFormat(TMaybe<TFormat>()); // Binary format + + if (EndOffset_) { + Y_VERIFY(*EndOffset_ >= currentOffset); + FileReaderOptions_.Length(*EndOffset_ - currentOffset); + } + FileReaderOptions_.Offset(currentOffset); + header.MergeParameters(FormIORequestParameters(Path_, FileReaderOptions_)); + + header.SetResponseCompression(ToString(context.Config->AcceptEncoding)); + + auto requestId = CreateGuidAsString(); + NHttpClient::IHttpResponsePtr response; + try { + response = context.HttpClient->Request(GetFullUrl(hostName, context, 
header), requestId, header);
    } catch (const std::exception& ex) {
        LogRequestError(requestId, header, ex.what(), "");
        throw;
    }

    YT_LOG_DEBUG("RSP %v - file stream",
        requestId);

    return response;
}

////////////////////////////////////////////////////////////////////////////////

// Builds a blob table reader.
// Retry policy, pinger, context and transaction id are forwarded to TStreamReaderBase;
// the key and options are stored as-is, and Path_ is set to whatever
// TStreamReaderBase::Snapshot(path) returns.
TBlobTableReader::TBlobTableReader(
    const TYPath& path,
    const TKey& key,
    IClientRetryPolicyPtr retryPolicy,
    ITransactionPingerPtr transactionPinger,
    const TClientContext& context,
    const TTransactionId& transactionId,
    const TBlobTableReaderOptions& options)
    : TStreamReaderBase(std::move(retryPolicy), std::move(transactionPinger), context, transactionId)
    , Key_(key)
    , Options_(options)
{
    Path_ = TStreamReaderBase::Snapshot(path);
}

// Sends one "read_blob_table" GET request to a heavy-request proxy and returns the response.
//
// `readBytes` is added to Options_.Offset_ to obtain the absolute offset, which is then
// split into a starting part index and a byte offset inside that part.
// Any std::exception raised while sending is logged via LogRequestError and rethrown.
NHttpClient::IHttpResponsePtr TBlobTableReader::Request(const TClientContext& context, const TTransactionId& transactionId, ui64 readBytes)
{
    TString hostName = GetProxyForHeavyRequest(context);

    THttpHeader header("GET", "read_blob_table");
    // Prefer service-ticket authentication when it is configured, otherwise fall back to the token.
    if (context.ServiceTicketAuth) {
        header.SetServiceTicket(context.ServiceTicketAuth->Ptr->IssueServiceTicket());
    } else {
        header.SetToken(context.Token);
    }
    header.AddTransactionId(transactionId);
    header.SetOutputFormat(TMaybe<TFormat>()); // Binary format

    const ui64 currentOffset = Options_.Offset_ + readBytes;
    // NOTE(review): assumes Options_.PartSize_ is non-zero — confirm it is validated by the caller.
    const i64 startPartIndex = currentOffset / Options_.PartSize_;
    const ui64 skipBytes = currentOffset - Options_.PartSize_ * startPartIndex;
    // Key range: from (Key_, startPartIndex) up to (Key_, max i64).
    auto lowerLimitKey = Key_;
    lowerLimitKey.Parts_.push_back(startPartIndex);
    auto upperLimitKey = Key_;
    upperLimitKey.Parts_.push_back(std::numeric_limits<i64>::max());
    TNode params = PathToParamNode(TRichYPath(Path_).AddRange(TReadRange()
        .LowerLimit(TReadLimit().Key(lowerLimitKey))
        .UpperLimit(TReadLimit().Key(upperLimitKey))));
    params["start_part_index"] = TNode(startPartIndex);
    params["offset"] = skipBytes;
    // Column name overrides are sent only when the caller set them.
    if (Options_.PartIndexColumnName_) {
        params["part_index_column_name"] = *Options_.PartIndexColumnName_;
    }
    if (Options_.DataColumnName_) {
        params["data_column_name"] = *Options_.DataColumnName_;
    }
    params["part_size"] = Options_.PartSize_;
    header.MergeParameters(params);
    header.SetResponseCompression(ToString(context.Config->AcceptEncoding));

    auto requestId = CreateGuidAsString();
    NHttpClient::IHttpResponsePtr response;
    try {
        response = context.HttpClient->Request(GetFullUrl(hostName, context, header), requestId, header);
    } catch (const std::exception& ex) {
        LogRequestError(requestId, header, ex.what(), "");
        throw;
    }

    YT_LOG_DEBUG("RSP %v - blob table stream",
        requestId);
    return response;
}

////////////////////////////////////////////////////////////////////////////////

} // namespace NDetail
} // namespace NYT

// diff --git a/yt/cpp/mapreduce/client/file_reader.h b/yt/cpp/mapreduce/client/file_reader.h
// new file mode 100644
// index 0000000000..d850008a31
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/file_reader.h
// @@ -0,0 +1,105 @@
#pragma once

#include <yt/cpp/mapreduce/common/fwd.h>

#include <yt/cpp/mapreduce/interface/io.h>

#include <yt/cpp/mapreduce/http/context.h>
#include <yt/cpp/mapreduce/http/requests.h>

class IInputStream;

namespace NYT {

class THttpRequest;
class TPingableTransaction;

namespace NDetail {
////////////////////////////////////////////////////////////////////////////////

// Common base of the stream readers declared below.
// It overrides DoRead and leaves issuing the actual HTTP request to the
// pure-virtual Request(), which each derived reader implements.
class TStreamReaderBase
    : public IFileReader
{
public:
    TStreamReaderBase(
        IClientRetryPolicyPtr clientRetryPolicy,
        ITransactionPingerPtr transactionPinger,
        const TClientContext& context,
        const TTransactionId& transactionId);

    ~TStreamReaderBase();

protected:
    TYPath Snapshot(const TYPath& path);

protected:
    const TClientContext Context_;

private:
    size_t DoRead(void* buf, size_t len) override;
    // Issues the request for the data stream; `readBytes` is the number of bytes already consumed
    // (TBlobTableReader::Request adds it to its starting offset).
    virtual NHttpClient::IHttpResponsePtr Request(const TClientContext& context, const TTransactionId& transactionId, ui64 readBytes) = 0;
    TString GetActiveRequestId() const;

private:
    const IClientRetryPolicyPtr ClientRetryPolicy_;
    // NOTE(review): TFileReader below declares its own private member with the same name.
    TFileReaderOptions FileReaderOptions_;

    NHttpClient::IHttpResponsePtr Response_;
    IInputStream* Input_ = nullptr;

    THolder<TPingableTransaction> ReadTransaction_;

    ui64 CurrentOffset_ = 0;
};

////////////////////////////////////////////////////////////////////////////////

// Stream reader for a file addressed by a TRichYPath.
class TFileReader
    : public TStreamReaderBase
{
public:
    TFileReader(
        const TRichYPath& path,
        IClientRetryPolicyPtr clientRetryPolicy,
        ITransactionPingerPtr transactionPinger,
        const TClientContext& context,
        const TTransactionId& transactionId,
        const TFileReaderOptions& options = TFileReaderOptions());

private:
    NHttpClient::IHttpResponsePtr Request(const TClientContext& context, const TTransactionId& transactionId, ui64 readBytes) override;

private:
    TFileReaderOptions FileReaderOptions_;

    TRichYPath Path_;
    const ui64 StartOffset_;
    const TMaybe<ui64> EndOffset_;
};

////////////////////////////////////////////////////////////////////////////////

// Stream reader for a blob table: reads the parts stored under `key`
// using the "read_blob_table" command.
class TBlobTableReader
    : public TStreamReaderBase
{
public:
    TBlobTableReader(
        const TYPath& path,
        const TKey& key,
        IClientRetryPolicyPtr clientRetryPolicy,
        ITransactionPingerPtr transactionPinger,
        const TClientContext& context,
        const TTransactionId& transactionId,
        const TBlobTableReaderOptions& options);

private:
    NHttpClient::IHttpResponsePtr Request(const TClientContext& context, const TTransactionId& transactionId, ui64 readBytes) override;

private:
    const TKey Key_;
    const TBlobTableReaderOptions Options_;
    TYPath Path_;
};

} // namespace NDetail
} // namespace NYT

// diff --git a/yt/cpp/mapreduce/client/file_writer.cpp b/yt/cpp/mapreduce/client/file_writer.cpp
// new file mode 100644
// index 0000000000..daf6461edd
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/file_writer.cpp
// @@ -0,0 +1,60 @@
#include "file_writer.h"

#include <yt/cpp/mapreduce/io/helpers.h>
#include <yt/cpp/mapreduce/interface/finish_or_die.h>

#include <yt/cpp/mapreduce/common/helpers.h>

namespace NYT {

////////////////////////////////////////////////////////////////////////////////

// Sets up the underlying retryful writer.
// The command name comes from GetWriteFileCommand(ApiVersion) and no format is passed
// (an empty TMaybe<TFormat>).
TFileWriter::TFileWriter(
    const TRichYPath& path,
    IClientRetryPolicyPtr clientRetryPolicy,
    ITransactionPingerPtr transactionPinger,
    const TClientContext& context,
    const TTransactionId& transactionId,
    const TFileWriterOptions& options)
    : RetryfulWriter_(
        std::move(clientRetryPolicy),
        std::move(transactionPinger),
        context,
        transactionId,
        GetWriteFileCommand(context.Config->ApiVersion),
        TMaybe<TFormat>(),
        path,
        options)
{ }

// Delegates to NDetail::FinishOrDie, passing this writer and its class name.
TFileWriter::~TFileWriter()
{
    NDetail::FinishOrDie(this, "TFileWriter");
}

// Writes `len` bytes from `buf`, splitting them so that no single write crosses
// the remaining space of the current retry block.
void TFileWriter::DoWrite(const void* buf, size_t len)
{
    // If user tunes RetryBlockSize / DesiredChunkSize he expects
    // us to send data exactly by RetryBlockSize. So behaviour of the writer is predictable.
    //
    // We want to avoid situation when size of sent data slightly exceeded DesiredChunkSize
    // and server produced one chunk of desired size and one small chunk.
    while (len > 0) {
        const auto retryBlockRemainingSize = RetryfulWriter_.GetRetryBlockRemainingSize();
        Y_VERIFY(retryBlockRemainingSize > 0);
        const auto firstWriteLen = Min(len, retryBlockRemainingSize);
        RetryfulWriter_.Write(buf, firstWriteLen);
        // Called after every piece so the writer sees a boundary at this point.
        RetryfulWriter_.NotifyRowEnd();
        len -= firstWriteLen;
        buf = static_cast<const char*>(buf) + firstWriteLen;
    }
}

// Finishes the underlying retryful writer.
void TFileWriter::DoFinish()
{
    RetryfulWriter_.Finish();
}

////////////////////////////////////////////////////////////////////////////////

} // namespace NYT

// diff --git a/yt/cpp/mapreduce/client/file_writer.h b/yt/cpp/mapreduce/client/file_writer.h
// new file mode 100644
// index 0000000000..f3b97b904e
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/file_writer.h
// @@ -0,0 +1,38 @@
#pragma once

#include "retryful_writer.h"

#include <yt/cpp/mapreduce/common/fwd.h>

#include <yt/cpp/mapreduce/interface/io.h>

namespace NYT {

////////////////////////////////////////////////////////////////////////////////

// IFileWriter implementation that forwards all data to a TRetryfulWriter.
class TFileWriter
    : public IFileWriter
{
public:
    TFileWriter(
        const TRichYPath& path,
        IClientRetryPolicyPtr clientRetryPolicy,
        ITransactionPingerPtr transactionPinger,
        const TClientContext& context,
        const TTransactionId& transactionId,
        const TFileWriterOptions& options = TFileWriterOptions());

    ~TFileWriter() override;

protected:
    void DoWrite(const void* buf, size_t len) override;
    void DoFinish() override;

private:
    TRetryfulWriter RetryfulWriter_;
    // 64 << 20 bytes, i.e. 64 MiB.
    // NOTE(review): not referenced anywhere in file_writer.cpp — confirm whether it is still needed.
    static const size_t BUFFER_SIZE = 64 << 20;
};

////////////////////////////////////////////////////////////////////////////////

} // namespace NYT

// diff --git a/yt/cpp/mapreduce/client/format_hints.cpp b/yt/cpp/mapreduce/client/format_hints.cpp
// new file mode 100644
// index 0000000000..1f6eb173ad
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/format_hints.cpp
// @@ -0,0 +1,84 @@
#include "format_hints.h"

#include <yt/cpp/mapreduce/common/helpers.h>
#include <yt/cpp/mapreduce/interface/config.h>
+#include <yt/cpp/mapreduce/interface/operation.h> + +#include <util/string/builder.h> + +namespace NYT::NDetail { + +using ::ToString; + +//////////////////////////////////////////////////////////////////////////////// + +static void ApplyEnableTypeConversion(TFormat* format, const TFormatHints& formatHints) +{ + if (formatHints.EnableAllToStringConversion_) { + format->Config.Attributes()["enable_all_to_string_conversion"] = *formatHints.EnableAllToStringConversion_; + } + if (formatHints.EnableStringToAllConversion_) { + format->Config.Attributes()["enable_string_to_all_conversion"] = *formatHints.EnableStringToAllConversion_; + } + if (formatHints.EnableIntegralTypeConversion_) { + format->Config.Attributes()["enable_integral_type_conversion"] = *formatHints.EnableIntegralTypeConversion_; + } + if (formatHints.EnableIntegralToDoubleConversion_) { + format->Config.Attributes()["enable_integral_to_double_conversion"] = *formatHints.EnableIntegralToDoubleConversion_; + } + if (formatHints.EnableTypeConversion_) { + format->Config.Attributes()["enable_type_conversion"] = *formatHints.EnableTypeConversion_; + } +} + +template <> +void ApplyFormatHints<TNode>(TFormat* format, const TMaybe<TFormatHints>& formatHints) +{ + Y_VERIFY(format); + if (!formatHints) { + return; + } + + ApplyEnableTypeConversion(format, *formatHints); + + if (formatHints->SkipNullValuesForTNode_) { + Y_ENSURE_EX( + format->Config.AsString() == "yson", + TApiUsageError() << "SkipNullForTNode option must be used with yson format, actual format: " << format->Config.AsString()); + format->Config.Attributes()["skip_null_values"] = formatHints->SkipNullValuesForTNode_; + } + + if (formatHints->ComplexTypeMode_) { + Y_ENSURE_EX( + format->Config.AsString() == "yson", + TApiUsageError() << "ComplexTypeMode option must be used with yson format, actual format: " + << format->Config.AsString()); + format->Config.Attributes()["complex_type_mode"] = ToString(*formatHints->ComplexTypeMode_); + } +} + 
// Yamr rows accept no format hints: an empty `formatHints` is a no-op,
// any provided hints result in TApiUsageError. `format` must be non-null.
template <>
void ApplyFormatHints<TYaMRRow>(TFormat* format, const TMaybe<TFormatHints>& formatHints)
{
    Y_VERIFY(format);
    if (!formatHints) {
        return;
    }

    ythrow TApiUsageError() << "Yamr format currently has no supported format hints";
}

// Protobuf rows accept no format hints: an empty `formatHints` is a no-op,
// any provided hints result in TApiUsageError. `format` must be non-null.
template <>
void ApplyFormatHints<::google::protobuf::Message>(TFormat* format, const TMaybe<TFormatHints>& formatHints)
{
    Y_VERIFY(format);
    if (!formatHints) {
        return;
    }

    ythrow TApiUsageError() << "Protobuf format currently has no supported format hints";
}

////////////////////////////////////////////////////////////////////////////////

} // namespace NYT::NDetail

// diff --git a/yt/cpp/mapreduce/client/format_hints.h b/yt/cpp/mapreduce/client/format_hints.h
// new file mode 100644
// index 0000000000..f6576b1045
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/format_hints.h
// @@ -0,0 +1,27 @@
#pragma once

#include <yt/cpp/mapreduce/interface/fwd.h>

#include <util/generic/maybe.h>

namespace NYT {
namespace NDetail {

////////////////////////////////////////////////////////////////////////////////

// Applies the optional `formatHints` to `format` for rows of type TRow.
// Only the explicit specializations declared below exist; their definitions
// live in format_hints.cpp.
template <typename TRow>
void ApplyFormatHints(TFormat* format, const TMaybe<TFormatHints>& formatHints);

template <>
void ApplyFormatHints<TNode>(TFormat* format, const TMaybe<TFormatHints>& formatHints);

template <>
void ApplyFormatHints<TYaMRRow>(TFormat* format, const TMaybe<TFormatHints>& formatHints);

template <>
void ApplyFormatHints<::google::protobuf::Message>(TFormat* format, const TMaybe<TFormatHints>& formatHints);

////////////////////////////////////////////////////////////////////////////////

} // namespace NDetail
} // namespace NYT

// diff --git a/yt/cpp/mapreduce/client/fwd.h b/yt/cpp/mapreduce/client/fwd.h
// new file mode 100644
// index 0000000000..d4449d4ac1
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/fwd.h
// @@ -0,0 +1,16 @@
#pragma once

#include <util/generic/ptr.h>

namespace NYT {

////////////////////////////////////////////////////////////////////////////////

// Forward declarations for the client library.
class TPingableTransaction;

class TClient;
using TClientPtr = ::TIntrusivePtr<TClient>;

////////////////////////////////////////////////////////////////////////////////

} // namespace NYT

// diff --git a/yt/cpp/mapreduce/client/init.cpp b/yt/cpp/mapreduce/client/init.cpp
// new file mode 100644
// index 0000000000..c74598ba14
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/init.cpp
// @@ -0,0 +1,280 @@
#include "init.h"

#include "abortable_registry.h"
#include "job_profiler.h"

#include <yt/cpp/mapreduce/http/requests.h>

#include <yt/cpp/mapreduce/interface/config.h>
#include <yt/cpp/mapreduce/interface/init.h>
#include <yt/cpp/mapreduce/interface/operation.h>

#include <yt/cpp/mapreduce/interface/logging/logger.h>
#include <yt/cpp/mapreduce/interface/logging/yt_log.h>

#include <yt/cpp/mapreduce/io/job_reader.h>

#include <yt/cpp/mapreduce/common/helpers.h>
#include <yt/cpp/mapreduce/common/wait_proxy.h>

#include <library/cpp/sighandler/async_signals_handler.h>

#include <util/folder/dirut.h>

#include <util/generic/singleton.h>

#include <util/string/builder.h>
#include <util/string/cast.h>
#include <util/string/type.h>

#include <util/system/env.h>
#include <util/system/thread.h>

namespace NYT {

////////////////////////////////////////////////////////////////////////////////

namespace {

// Logs the client version stored in the process state.
void WriteVersionToLog()
{
    YT_LOG_INFO("Wrapper version: %v",
        TProcessState::Get()->ClientVersion);
}

static TNode SecureVaultContents; // safe

// Parses the YT_SECURE_VAULT environment variable (an empty map "{}" when unset)
// as YSON and stores the result in SecureVaultContents.
void InitializeSecureVault()
{
    SecureVaultContents = NodeFromYsonString(
        GetEnv("YT_SECURE_VAULT", "{}"));
}

}

////////////////////////////////////////////////////////////////////////////////

// Returns the node filled by InitializeSecureVault().
const TNode& GetJobSecureVault()
{
    return SecureVaultContents;
}

////////////////////////////////////////////////////////////////////////////////

// Installs terminate / SIGINT / SIGTERM handlers that try to abort everything
// registered in TAbortableRegistry before the process goes down.
class TAbnormalTerminator
{
public:
    TAbnormalTerminator() = default;

    // Installs the handlers. Does nothing if a previous terminate handler is already stored.
    // NOTE(review): the "already installed" check relies on std::set_terminate returning
    // a non-null previous handler — confirm for the supported standard libraries.
    static void SetErrorTerminationHandler()
    {
        if (Instance().OldHandler_ != nullptr) {
            return;
        }

        Instance().OldHandler_ = std::set_terminate(&TerminateHandler);

        SetAsyncSignalFunction(SIGINT, SignalHandler);
        SetAsyncSignalFunction(SIGTERM, SignalHandler);
    }

private:
    static TAbnormalTerminator& Instance()
    {
        return *Singleton<TAbnormalTerminator>();
    }

    // Thread entry point: `opaque` points to a std::function<void()> which is invoked.
    static void* Invoke(void* opaque)
    {
        (*reinterpret_cast<std::function<void()>*>(opaque))();
        return nullptr;
    }

    // Starts a detached "aborter" thread that logs `logMessage` and aborts all registered
    // abortables, sleeps for `timeout` in the calling thread and then calls `exitFunction`.
    static void TerminateWithTimeout(
        const TDuration& timeout,
        const std::function<void(void)>& exitFunction,
        const TString& logMessage)
    {
        // `threadFun` lives in this stack frame and is handed to the thread by address;
        // the frame stays alive while this function sleeps below.
        std::function<void()> threadFun = [=] {
            YT_LOG_INFO("%v",
                logMessage);
            NDetail::TAbortableRegistry::Get()->AbortAllAndBlockForever();
        };
        TThread thread(TThread::TParams(Invoke, &threadFun).SetName("aborter"));
        thread.Start();
        thread.Detach();

        Sleep(timeout);
        exitFunction();
    }

    // Signal callback: gives the aborter 5 seconds, then calls _exit(-signalNumber).
    static void SignalHandler(int signalNumber)
    {
        TerminateWithTimeout(
            TDuration::Seconds(5),
            std::bind(_exit, -signalNumber),
            ::TStringBuilder() << "Signal " << signalNumber << " received, aborting transactions. Waiting 5 seconds...");
    }

    // std::terminate callback: gives the aborter 5 seconds, then chains to the previously
    // installed terminate handler, or calls abort() when there is none.
    static void TerminateHandler()
    {
        TerminateWithTimeout(
            TDuration::Seconds(5),
            [&] {
                if (Instance().OldHandler_) {
                    Instance().OldHandler_();
                } else {
                    abort();
                }
            },
            ::TStringBuilder() << "Terminate called, aborting transactions. Waiting 5 seconds...");
    }

private:
    // Terminate handler that was active before SetErrorTerminationHandler() replaced it.
    std::terminate_handler OldHandler_ = nullptr;
};

////////////////////////////////////////////////////////////////////////////////

namespace NDetail {

// Returns a mutable reference to the process-wide initialization status
// (a function-local static, initially NotInitialized).
EInitStatus& GetInitStatus()
{
    static EInitStatus initStatus = EInitStatus::NotInitialized;
    return initStatus;
}

// Raises the stored status to `newStatus` if it is greater; never lowers it.
static void ElevateInitStatus(const EInitStatus newStatus) {
    NDetail::GetInitStatus() = Max(NDetail::GetInitStatus(), newStatus);
}

// Initialization shared by all entry points: sets up the stderr logger using the
// (lower-cased) log level from TConfig and records the command line.
// An unparsable log level is reported on Cerr and terminates the process with exit(1).
void CommonInitialize(int argc, const char** argv)
{
    auto logLevelStr = to_lower(TConfig::Get()->LogLevel);
    ILogger::ELevel logLevel;

    if (!TryFromString(logLevelStr, logLevel)) {
        Cerr << "Invalid log level: " << TConfig::Get()->LogLevel << Endl;
        exit(1);
    }

    SetLogger(CreateStdErrLogger(logLevel));

    TProcessState::Get()->SetCommandLine(argc, argv);
}

// Initialization for processes that are not running as a job:
// optionally installs the termination handlers (YT_CLEANUP_ON_TERMINATION env variable
// or options.CleanupOnTermination_), optionally installs the wait proxy, logs the version.
void NonJobInitialize(const TInitializeOptions& options)
{
    if (FromString<bool>(GetEnv("YT_CLEANUP_ON_TERMINATION", "0")) || options.CleanupOnTermination_) {
        TAbnormalTerminator::SetErrorTerminationHandler();
    }
    if (options.WaitProxy_) {
        NDetail::TWaitProxy::Get()->SetProxy(options.WaitProxy_);
    }
    WriteVersionToLog();
}

// Runs the user job when the process was started as a job.
//
// The job description is taken from the YT_JOB_ARGUMENTS environment variable when it
// holds a value, otherwise from the command line
// (`<binary> --yt-map|--yt-reduce <job name> <output table count> <has state>`).
// When a job is executed this function does not return: the lambda ends with exit().
// It returns normally only when the command line does not describe a job.
void ExecJob(int argc, const char** argv, const TInitializeOptions& options)
{
    // Now we are definitely in job.
    // We take this setting from environment variable to be consistent with client code.
    TConfig::Get()->UseClientProtobuf = IsTrue(GetEnv("YT_USE_CLIENT_PROTOBUF", ""));

    auto execJobImpl = [&options](TString jobName, i64 outputTableCount, bool hasState) {
        auto jobProfiler = CreateJobProfiler();
        jobProfiler->Start();

        InitializeSecureVault();

        NDetail::OutputTableCount = static_cast<i64>(outputTableCount);

        // The job state is read from the file "jobstate" when present, otherwise from an empty buffer.
        THolder<IInputStream> jobStateStream;
        if (hasState) {
            jobStateStream = MakeHolder<TIFStream>("jobstate");
        } else {
            jobStateStream = MakeHolder<TBufferStream>(0);
        }

        int ret = 1;
        try {
            ret = TJobFactory::Get()->GetJobFunction(jobName.data())(outputTableCount, *jobStateStream);
        } catch (const TSystemError& ex) {
            if (ex.Status() == EPIPE) {
                // 32 == EPIPE, write number here so it's easier to grep this exit code in source files
                exit(32);
            }
            throw;
        }

        jobProfiler->Stop();

        if (options.JobOnExitFunction_) {
            (*options.JobOnExitFunction_)();
        }
        exit(ret);
    };

    // "#" is the default when the variable is unset; HasValue() is false for it.
    auto jobArguments = NodeFromYsonString(GetEnv("YT_JOB_ARGUMENTS", "#"));
    if (jobArguments.HasValue()) {
        execJobImpl(
            jobArguments["job_name"].AsString(),
            jobArguments["output_table_count"].AsInt64(),
            jobArguments["has_state"].AsBool());
        Y_UNREACHABLE();
    }

    TString jobType = argc >= 2 ? argv[1] : TString();
    // NOTE(review): `&&` binds tighter than `||`, so this reads as
    // argc != 5 || (jobType is neither "--yt-map" nor "--yt-reduce").
    if (argc != 5 || jobType != "--yt-map" && jobType != "--yt-reduce") {
        // We are inside job but probably using old API
        // (i.e. both NYT::Initialize and NMR::Initialize are called).
        WriteVersionToLog();
        return;
    }

    TString jobName(argv[2]);
    i64 outputTableCount = FromString<i64>(argv[3]);
    int hasState = FromString<int>(argv[4]);
    execJobImpl(jobName, outputTableCount, hasState);
    Y_UNREACHABLE();
}

} // namespace NDetail

////////////////////////////////////////////////////////////////////////////////

// Initializes the library without ever trying to execute a job
// (a fake single-element argv is used) and raises the status to JoblessInitialization.
void JoblessInitialize(const TInitializeOptions& options)
{
    static const char* fakeArgv[] = {"unknown..."};
    NDetail::CommonInitialize(1, fakeArgv);
    NDetail::NonJobInitialize(options);
    NDetail::ElevateInitStatus(NDetail::EInitStatus::JoblessInitialization);
}

// Full initialization. When the YT_JOB_ID environment variable is non-empty the process
// is treated as a job and ExecJob is called, otherwise the non-job initialization runs.
void Initialize(int argc, const char* argv[], const TInitializeOptions& options)
{
    NDetail::CommonInitialize(argc, argv);

    NDetail::ElevateInitStatus(NDetail::EInitStatus::FullInitialization);

    const bool isInsideJob = !GetEnv("YT_JOB_ID").empty();
    if (isInsideJob) {
        NDetail::ExecJob(argc, argv, options);
    } else {
        NDetail::NonJobInitialize(options);
    }
}

// Overload for a non-const argv; forwards to the overload above.
void Initialize(int argc, char* argv[], const TInitializeOptions& options)
{
    return Initialize(argc, const_cast<const char**>(argv), options);
}

// Overload without a command line; forwards a fake single-element argv.
void Initialize(const TInitializeOptions& options)
{
    static const char* fakeArgv[] = {"unknown..."};
    Initialize(1, fakeArgv, options);
}

////////////////////////////////////////////////////////////////////////////////

} // namespace NYT

// diff --git a/yt/cpp/mapreduce/client/init.h b/yt/cpp/mapreduce/client/init.h
// new file mode 100644
// index 0000000000..af2fc80e55
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/init.h
// @@ -0,0 +1,22 @@
#pragma once

#include <yt/cpp/mapreduce/interface/init.h>

namespace NYT {
namespace NDetail {

////////////////////////////////////////////////////////////////////////////////

// Initialization level of the library.
// The enumerator order matters: init.cpp combines statuses with Max().
enum class EInitStatus : int
{
    NotInitialized,
    JoblessInitialization,
    FullInitialization,
};

EInitStatus& GetInitStatus();

////////////////////////////////////////////////////////////////////////////////

} // namespace NDetail
} // namespace NYT

// diff --git a/yt/cpp/mapreduce/client/job_profiler.h b/yt/cpp/mapreduce/client/job_profiler.h
// new file mode 100644
// index 0000000000..6532871380
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/job_profiler.h
// @@ -0,0 +1,27 @@
#pragma once

#include <memory>

namespace NYT {

////////////////////////////////////////////////////////////////////////////////

// Interface of the job profiler.
struct IJobProfiler
{
    virtual ~IJobProfiler() = default;

    //! Starts job profiling if corresponding options are set
    //! in environment.
    virtual void Start() = 0;

    //! Stops profiling and sends profile to job proxy.
    virtual void Stop() = 0;
};

////////////////////////////////////////////////////////////////////////////////

// Factory for the profiler implementation.
std::unique_ptr<IJobProfiler> CreateJobProfiler();

////////////////////////////////////////////////////////////////////////////////

} // namespace NYT

// diff --git a/yt/cpp/mapreduce/client/lock.cpp b/yt/cpp/mapreduce/client/lock.cpp
// new file mode 100644
// index 0000000000..88110f9266
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/lock.cpp
// @@ -0,0 +1,105 @@
#include "lock.h"

#include "yt_poller.h"

#include <yt/cpp/mapreduce/http/retry_request.h>

#include <yt/cpp/mapreduce/common/retry_lib.h>

#include <yt/cpp/mapreduce/raw_client/raw_batch_request.h>

#include <util/string/builder.h>

namespace NYT {
namespace NDetail {

using namespace NRawClient;

////////////////////////////////////////////////////////////////////////////////

// Poller item that watches the "@state" attribute of a lock ("#<lock id>/@state")
// and completes the given promise once the state becomes "acquired".
class TLockPollerItem
    : public IYtPollerItem
{
public:
    TLockPollerItem(const TLockId& lockId, ::NThreading::TPromise<void> acquired)
        : LockStateYPath_("#" + GetGuidAsString(lockId) + "/@state")
        , Acquired_(acquired)
    { }

    // Adds a Get request for the lock state to the batch and keeps the resulting future.
    void PrepareRequest(TRawBatchRequest* batchRequest) override
    {
        LockState_ = batchRequest->Get(TTransactionId(), LockStateYPath_, TGetOptions());
    }

    // Inspects the result of the request prepared above.
    // Returns PollBreak when the lock is acquired (promise fulfilled) or when a
    // non-retriable error occurred (promise failed); returns PollContinue otherwise.
    EStatus OnRequestExecuted() override
    {
        try {
            const auto& state = LockState_.GetValue().AsString();
            if (state == "acquired") {
                Acquired_.SetValue();
                return PollBreak;
            }
        } catch (const TErrorResponse& e) {
            if (!IsRetriable(e)) {
                Acquired_.SetException(std::current_exception());
                return PollBreak;
            }
        } catch (const std::exception& e) {
            if (!IsRetriable(e)) {
                Acquired_.SetException(std::current_exception());
                return PollBreak;
            }
        }
        return PollContinue;
    }

    // The poller dropped this item: fail the promise so that waiters are released.
    void OnItemDiscarded() override
    {
        Acquired_.SetException(std::make_exception_ptr(yexception() << "Operation cancelled"));
    }

private:
    const TString LockStateYPath_;
    ::NThreading::TPromise<void> Acquired_;

    ::NThreading::TFuture<TNode> LockState_;
};

////////////////////////////////////////////////////////////////////////////////

// For a non-waitable lock the "acquired" future is set immediately;
// for a waitable lock it is created lazily in GetAcquiredFuture().
TLock::TLock(const TLockId& lockId, TClientPtr client, bool waitable)
    : LockId_(lockId)
    , Client_(std::move(client))
{
    if (!waitable) {
        Acquired_ = ::NThreading::MakeFuture();
    }
}

const TLockId& TLock::GetId() const
{
    return LockId_;
}

// Reads the "@node_id" attribute of the lock ("#<lock id>/@node_id") and parses it as a guid.
TNodeId TLock::GetLockedNodeId() const
{
    auto nodeIdNode = Client_->Get(
        ::TStringBuilder() << '#' << GetGuidAsString(LockId_) << "/@node_id",
        TGetOptions());
    return GetGuid(nodeIdNode.AsString());
}

// Returns the future that is set when the lock is acquired.
// On the first call for a waitable lock a TLockPollerItem is registered with the client's poller.
// NOTE(review): Acquired_ is initialized lazily from a const method without any visible
// synchronization — confirm that concurrent first calls are not expected.
const ::NThreading::TFuture<void>& TLock::GetAcquiredFuture() const
{
    if (!Acquired_) {
        auto promise = ::NThreading::NewPromise<void>();
        Client_->GetYtPoller().Watch(::MakeIntrusive<TLockPollerItem>(LockId_, promise));
        Acquired_ = promise.GetFuture();
    }
    return *Acquired_;
}

////////////////////////////////////////////////////////////////////////////////

} // namespace NDetail
} // namespace NYT

// diff --git a/yt/cpp/mapreduce/client/lock.h b/yt/cpp/mapreduce/client/lock.h
// new file mode 100644
// index 0000000000..7e2c7a127d
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/lock.h
// @@ -0,0 +1,31 @@
#pragma once

#include "client.h"

#include <yt/cpp/mapreduce/interface/client.h>

namespace NYT {
namespace NDetail {

////////////////////////////////////////////////////////////////////////////////

// ILock implementation backed by a client and a lock id.
class TLock
    : public ILock
{
public:
    TLock(const TLockId& lockId, TClientPtr client, bool waitable);

    virtual const TLockId& GetId() const override;
    virtual TNodeId GetLockedNodeId() const override;
    virtual const ::NThreading::TFuture<void>& GetAcquiredFuture() const override;

private:
    const TLockId LockId_;
    // Empty until the future is created; mutable because GetAcquiredFuture() is const.
    mutable TMaybe<::NThreading::TFuture<void>> Acquired_;
    TClientPtr Client_;
};

////////////////////////////////////////////////////////////////////////////////

} // namespace NDetail
} // namespace NYT

// diff --git a/yt/cpp/mapreduce/client/operation.cpp b/yt/cpp/mapreduce/client/operation.cpp
// new file mode 100644
// index 0000000000..fc1600c240
// --- /dev/null
// +++ b/yt/cpp/mapreduce/client/operation.cpp
// @@ -0,0 +1,2981 @@
#include "operation.h"

#include "abortable_registry.h"
#include "client.h"
#include "operation_helpers.h"
#include "operation_tracker.h"
#include "transaction.h"
#include "prepare_operation.h"
#include "retry_heavy_write_request.h"
#include "skiff.h"
#include "structured_table_formats.h"
#include "yt_poller.h"

#include <yt/cpp/mapreduce/common/helpers.h>
#include <yt/cpp/mapreduce/common/retry_lib.h>
#include <yt/cpp/mapreduce/common/wait_proxy.h>

#include <yt/cpp/mapreduce/interface/config.h>
#include <yt/cpp/mapreduce/interface/errors.h>
#include <yt/cpp/mapreduce/interface/fluent.h>
#include <yt/cpp/mapreduce/interface/format.h>
#include <yt/cpp/mapreduce/interface/job_statistics.h>
#include <yt/cpp/mapreduce/interface/protobuf_format.h>

// NOTE(review): the same header is included twice below.
#include <yt/cpp/mapreduce/interface/logging/yt_log.h>
#include <yt/cpp/mapreduce/interface/logging/yt_log.h>

#include <yt/cpp/mapreduce/http/requests.h>
#include <yt/cpp/mapreduce/http/retry_request.h>

#include <yt/cpp/mapreduce/io/job_reader.h>
#include <yt/cpp/mapreduce/io/job_writer.h>
#include <yt/cpp/mapreduce/io/yamr_table_reader.h>
#include <yt/cpp/mapreduce/io/yamr_table_writer.h>
#include <yt/cpp/mapreduce/io/node_table_reader.h>
#include <yt/cpp/mapreduce/io/node_table_writer.h>
#include <yt/cpp/mapreduce/io/proto_table_reader.h>
#include <yt/cpp/mapreduce/io/proto_table_writer.h>
#include <yt/cpp/mapreduce/io/proto_helpers.h>
#include <yt/cpp/mapreduce/io/skiff_table_reader.h>

#include <yt/cpp/mapreduce/raw_client/raw_batch_request.h>
#include <yt/cpp/mapreduce/raw_client/raw_requests.h>

#include <library/cpp/yson/node/serialize.h>

#include <util/generic/hash_set.h>

#include <util/string/builder.h>
#include <util/string/cast.h>

#include <util/system/thread.h>

namespace NYT {
namespace NDetail {

using namespace NRawClient;

using ::ToString;

////////////////////////////////////////////////////////////////////////////////

// 1024 * 1024 bytes (1 MiB).
// NOTE(review): the identifier is spelled "Exrta"; renaming it would have to cover every use in this file.
static const ui64 DefaultExrtaTmpfsSize = 1024LL * 1024LL;

////////////////////////////////////////////////////////////////////////////////

namespace {

////////////////////////////////////////////////////////////////////////////////

// Input/output paths, formats and small job files of a map-reduce operation.
// Mapper and reduce-combiner formats are optional; reducer formats default to binary YSON.
struct TMapReduceOperationIo
{
    TVector<TRichYPath> Inputs;
    TVector<TRichYPath> MapOutputs;
    TVector<TRichYPath> Outputs;

    TMaybe<TFormat> MapperInputFormat;
    TMaybe<TFormat> MapperOutputFormat;

    TMaybe<TFormat> ReduceCombinerInputFormat;
    TMaybe<TFormat> ReduceCombinerOutputFormat;

    TFormat ReducerInputFormat = TFormat::YsonBinary();
    TFormat ReducerOutputFormat = TFormat::YsonBinary();

    TVector<TSmallJobFile> MapperJobFiles;
    TVector<TSmallJobFile> ReduceCombinerJobFiles;
    TVector<TSmallJobFile> ReducerJobFiles;
};

// Throws TApiUsageError ("no <name> table is specified") when `paths` is empty.
template <typename T>
void VerifyHasElements(const TVector<T>& paths, TStringBuf name)
{
    if (paths.empty()) {
        ythrow TApiUsageError() << "no " << name << " table is specified";
    }
}

+//////////////////////////////////////////////////////////////////////////////// + +TVector<TSmallJobFile> CreateFormatConfig( + TMaybe<TSmallJobFile> inputConfig, + const TMaybe<TSmallJobFile>& outputConfig) +{ + TVector<TSmallJobFile> result; + if (inputConfig) { + result.push_back(std::move(*inputConfig)); + } + if (outputConfig) { + result.push_back(std::move(*outputConfig)); + } + return result; +} + +template <typename T> +ENodeReaderFormat NodeReaderFormatFromHintAndGlobalConfig(const TUserJobFormatHintsBase<T>& formatHints) +{ + auto result = TConfig::Get()->NodeReaderFormat; + if (formatHints.InputFormatHints_ && formatHints.InputFormatHints_->SkipNullValuesForTNode_) { + Y_ENSURE_EX( + result != ENodeReaderFormat::Skiff, + TApiUsageError() << "skiff format doesn't support SkipNullValuesForTNode format hint"); + result = ENodeReaderFormat::Yson; + } + return result; +} + +template <class TSpec> +const TVector<TStructuredTablePath>& GetStructuredInputs(const TSpec& spec) +{ + if constexpr (std::is_same_v<TSpec, TVanillaTask>) { + static const TVector<TStructuredTablePath> empty; + return empty; + } else { + return spec.GetStructuredInputs(); + } +} + +template <class TSpec> +const TVector<TStructuredTablePath>& GetStructuredOutputs(const TSpec& spec) +{ + return spec.GetStructuredOutputs(); +} + +template <class TSpec> +const TMaybe<TFormatHints>& GetInputFormatHints(const TSpec& spec) +{ + if constexpr (std::is_same_v<TSpec, TVanillaTask>) { + static const TMaybe<TFormatHints> empty = Nothing(); + return empty; + } else { + return spec.InputFormatHints_; + } +} + +template <class TSpec> +const TMaybe<TFormatHints>& GetOutputFormatHints(const TSpec& spec) +{ + return spec.OutputFormatHints_; +} + +template <class TSpec> +ENodeReaderFormat GetNodeReaderFormat(const TSpec& spec, bool allowSkiff) +{ + if constexpr (std::is_same<TSpec, TVanillaTask>::value) { + return ENodeReaderFormat::Yson; + } else { + return allowSkiff + ? 
NodeReaderFormatFromHintAndGlobalConfig(spec) + : ENodeReaderFormat::Yson; + } +} + +static void SortColumnsToNames(const TSortColumns& sortColumns, THashSet<TString>* result) +{ + auto names = sortColumns.GetNames(); + result->insert(names.begin(), names.end()); +} + +static THashSet<TString> SortColumnsToNames(const TSortColumns& sortColumns) +{ + THashSet<TString> columnNames; + SortColumnsToNames(sortColumns, &columnNames); + return columnNames; +} + +THashSet<TString> GetColumnsUsedInOperation(const TJoinReduceOperationSpec& spec) +{ + return SortColumnsToNames(spec.JoinBy_); +} + +THashSet<TString> GetColumnsUsedInOperation(const TReduceOperationSpec& spec) { + auto result = SortColumnsToNames(spec.SortBy_); + SortColumnsToNames(spec.ReduceBy_, &result); + if (spec.JoinBy_) { + SortColumnsToNames(*spec.JoinBy_, &result); + } + return result; +} + +THashSet<TString> GetColumnsUsedInOperation(const TMapReduceOperationSpec& spec) +{ + auto result = SortColumnsToNames(spec.SortBy_); + SortColumnsToNames(spec.ReduceBy_, &result); + return result; +} + +THashSet<TString> GetColumnsUsedInOperation(const TMapOperationSpec&) +{ + return THashSet<TString>(); +} + +THashSet<TString> GetColumnsUsedInOperation(const TVanillaTask&) +{ + return THashSet<TString>(); +} + +TStructuredJobTableList ApplyProtobufColumnFilters( + const TStructuredJobTableList& tableList, + const TOperationPreparer& preparer, + const THashSet<TString>& columnsUsedInOperations, + const TOperationOptions& options) +{ + bool hasInputQuery = options.Spec_.Defined() && options.Spec_->IsMap() && options.Spec_->HasKey("input_query"); + if (hasInputQuery) { + return tableList; + } + + auto isDynamic = BatchTransform( + CreateDefaultRequestRetryPolicy(preparer.GetContext().Config), + preparer.GetContext(), + tableList, + [&] (TRawBatchRequest& batch, const auto& table) { + return batch.Get(preparer.GetTransactionId(), table.RichYPath->Path_ + "/@dynamic", TGetOptions()); + }); + + auto newTableList = 
tableList; + for (size_t tableIndex = 0; tableIndex < tableList.size(); ++tableIndex) { + if (isDynamic[tableIndex].AsBool()) { + continue; + } + auto& table = newTableList[tableIndex]; + Y_VERIFY(table.RichYPath); + if (table.RichYPath->Columns_) { + continue; + } + if (!std::holds_alternative<TProtobufTableStructure>(table.Description)) { + continue; + } + const auto& descriptor = std::get<TProtobufTableStructure>(table.Description).Descriptor; + if (!descriptor) { + continue; + } + auto fromDescriptor = NDetail::InferColumnFilter(*descriptor); + if (!fromDescriptor) { + continue; + } + THashSet<TString> columns(fromDescriptor->begin(), fromDescriptor->end()); + columns.insert(columnsUsedInOperations.begin(), columnsUsedInOperations.end()); + table.RichYPath->Columns(TVector<TString>(columns.begin(), columns.end())); + } + return newTableList; +} + +template <class TSpec> +TSimpleOperationIo CreateSimpleOperationIo( + const IStructuredJob& structuredJob, + const TOperationPreparer& preparer, + const TSpec& spec, + const TOperationOptions& options, + bool allowSkiff) +{ + if (!std::holds_alternative<TVoidStructuredRowStream>(structuredJob.GetInputRowStreamDescription())) { + VerifyHasElements(GetStructuredInputs(spec), "input"); + } + + TUserJobFormatHints hints; + hints.InputFormatHints_ = GetInputFormatHints(spec); + hints.OutputFormatHints_ = GetOutputFormatHints(spec); + ENodeReaderFormat nodeReaderFormat = GetNodeReaderFormat(spec, allowSkiff); + + return CreateSimpleOperationIoHelper( + structuredJob, + preparer, + options, + CanonizeStructuredTableList(preparer.GetContext(), GetStructuredInputs(spec)), + CanonizeStructuredTableList(preparer.GetContext(), GetStructuredOutputs(spec)), + hints, + nodeReaderFormat, + GetColumnsUsedInOperation(spec)); +} + +template <class T> +TSimpleOperationIo CreateSimpleOperationIo( + const IJob& job, + const TOperationPreparer& preparer, + const TSimpleRawOperationIoSpec<T>& spec) +{ + auto getFormatOrDefault = [&] (const 
TMaybe<TFormat>& maybeFormat, const char* formatName) { + if (maybeFormat) { + return *maybeFormat; + } else if (spec.Format_) { + return *spec.Format_; + } else { + ythrow TApiUsageError() << "Neither " << formatName << "format nor default format is specified for raw operation"; + } + }; + + auto inputs = CanonizeYPaths(/* retryPolicy */ nullptr, preparer.GetContext(), spec.GetInputs()); + auto outputs = CanonizeYPaths(/* retryPolicy */ nullptr, preparer.GetContext(), spec.GetOutputs()); + + VerifyHasElements(inputs, "input"); + VerifyHasElements(outputs, "output"); + + TUserJobFormatHints hints; + + auto outputSchemas = PrepareOperation( + job, + TOperationPreparationContext( + inputs, + outputs, + preparer.GetContext(), + preparer.GetClientRetryPolicy(), + preparer.GetTransactionId()), + &inputs, + &outputs, + hints); + + Y_VERIFY(outputs.size() == outputSchemas.size()); + for (int i = 0; i < static_cast<int>(outputs.size()); ++i) { + if (!outputs[i].Schema_ && !outputSchemas[i].Columns().empty()) { + outputs[i].Schema_ = outputSchemas[i]; + } + } + + return TSimpleOperationIo { + inputs, + outputs, + + getFormatOrDefault(spec.InputFormat_, "input"), + getFormatOrDefault(spec.OutputFormat_, "output"), + + TVector<TSmallJobFile>{}, + }; +} + +//////////////////////////////////////////////////////////////////// + +TString GetJobStderrWithRetriesAndIgnoreErrors( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TJobId& jobId, + const size_t stderrTailSize, + const TGetJobStderrOptions& options = TGetJobStderrOptions()) +{ + TString jobStderr; + try { + jobStderr = GetJobStderrWithRetries( + retryPolicy, + context, + operationId, + jobId, + options); + } catch (const TErrorResponse& e) { + YT_LOG_ERROR("Cannot get job stderr (OperationId: %v, JobId: %v, Error: %v)", + operationId, + jobId, + e.what()); + } + if (jobStderr.size() > stderrTailSize) { + jobStderr = jobStderr.substr(jobStderr.size() 
- stderrTailSize, stderrTailSize); + } + return jobStderr; +} + +TVector<TFailedJobInfo> GetFailedJobInfo( + const IClientRetryPolicyPtr& clientRetryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TGetFailedJobInfoOptions& options) +{ + const auto listJobsResult = ListJobs( + clientRetryPolicy->CreatePolicyForGenericRequest(), + context, + operationId, + TListJobsOptions() + .State(EJobState::Failed) + .Limit(options.MaxJobCount_)); + + const auto stderrTailSize = options.StderrTailSize_; + + TVector<TFailedJobInfo> result; + for (const auto& job : listJobsResult.Jobs) { + auto& info = result.emplace_back(); + Y_ENSURE(job.Id); + info.JobId = *job.Id; + info.Error = job.Error.GetOrElse(TYtError(TString("unknown error"))); + if (job.StderrSize.GetOrElse(0) != 0) { + // There are cases when due to bad luck we cannot read stderr even if + // list_jobs reports that stderr_size > 0. + // + // Such errors don't have special error code + // so we ignore all errors and try our luck on other jobs. + info.Stderr = GetJobStderrWithRetriesAndIgnoreErrors( + clientRetryPolicy->CreatePolicyForGenericRequest(), + context, + operationId, + *job.Id, + stderrTailSize); + } + } + return result; +} + +struct TGetJobsStderrOptions +{ + using TSelf = TGetJobsStderrOptions; + + // How many jobs to download. Which jobs will be chosen is undefined. + FLUENT_FIELD_DEFAULT(ui64, MaxJobCount, 10); + + // How much of stderr should be downloaded. 
+ FLUENT_FIELD_DEFAULT(ui64, StderrTailSize, 64 * 1024); +}; + +static TVector<TString> GetJobsStderr( + const IClientRetryPolicyPtr& clientRetryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TGetJobsStderrOptions& options = TGetJobsStderrOptions()) +{ + const auto listJobsResult = ListJobs( + clientRetryPolicy->CreatePolicyForGenericRequest(), + context, + operationId, + TListJobsOptions().Limit(options.MaxJobCount_).WithStderr(true)); + const auto stderrTailSize = options.StderrTailSize_; + TVector<TString> result; + for (const auto& job : listJobsResult.Jobs) { + result.push_back( + // There are cases when due to bad luck we cannot read stderr even if + // list_jobs reports that stderr_size > 0. + // + // Such errors don't have special error code + // so we ignore all errors and try our luck on other jobs. + GetJobStderrWithRetriesAndIgnoreErrors( + clientRetryPolicy->CreatePolicyForGenericRequest(), + context, + operationId, + *job.Id, + stderrTailSize) + ); + } + return result; +} + +int CountIntermediateTables(const TStructuredJobTableList& tables) +{ + int result = 0; + for (const auto& table : tables) { + if (table.RichYPath) { + break; + } + ++result; + } + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +TSimpleOperationIo CreateSimpleOperationIoHelper( + const IStructuredJob& structuredJob, + const TOperationPreparer& preparer, + const TOperationOptions& options, + TStructuredJobTableList structuredInputs, + TStructuredJobTableList structuredOutputs, + TUserJobFormatHints hints, + ENodeReaderFormat nodeReaderFormat, + const THashSet<TString>& columnsUsedInOperations) +{ + auto intermediateInputTableCount = CountIntermediateTables(structuredInputs); + auto intermediateOutputTableCount = CountIntermediateTables(structuredOutputs); + + auto jobSchemaInferenceResult 
= PrepareOperation( + structuredJob, + TOperationPreparationContext( + structuredInputs, + structuredOutputs, + preparer.GetContext(), + preparer.GetClientRetryPolicy(), + preparer.GetTransactionId()), + &structuredInputs, + &structuredOutputs, + hints); + + TVector<TSmallJobFile> formatConfigList; + TFormatBuilder formatBuilder(preparer.GetClientRetryPolicy(), preparer.GetContext(), preparer.GetTransactionId(), options); + + auto [inputFormat, inputFormatConfig] = formatBuilder.CreateFormat( + structuredJob, + EIODirection::Input, + structuredInputs, + hints.InputFormatHints_, + nodeReaderFormat, + /* allowFormatFromTableAttribute = */ true); + + auto [outputFormat, outputFormatConfig] = formatBuilder.CreateFormat( + structuredJob, + EIODirection::Output, + structuredOutputs, + hints.OutputFormatHints_, + ENodeReaderFormat::Yson, + /* allowFormatFromTableAttribute = */ false); + + const bool inferOutputSchema = options.InferOutputSchema_.GetOrElse(preparer.GetContext().Config->InferTableSchema); + + auto outputPaths = GetPathList( + TStructuredJobTableList(structuredOutputs.begin() + intermediateOutputTableCount, structuredOutputs.end()), + TVector<TTableSchema>(jobSchemaInferenceResult.begin() + intermediateOutputTableCount, jobSchemaInferenceResult.end()), + inferOutputSchema); + + auto inputPaths = GetPathList( + ApplyProtobufColumnFilters( + TStructuredJobTableList(structuredInputs.begin() + intermediateInputTableCount, structuredInputs.end()), + preparer, + columnsUsedInOperations, + options), + /*schemaInferenceResult*/ Nothing(), + /*inferSchema*/ false); + + return TSimpleOperationIo { + inputPaths, + outputPaths, + + inputFormat, + outputFormat, + + CreateFormatConfig(inputFormatConfig, outputFormatConfig) + }; +} + +EOperationBriefState CheckOperation( + const IClientRetryPolicyPtr& clientRetryPolicy, + const TClientContext& context, + const TOperationId& operationId) +{ + auto attributes = GetOperation( + 
clientRetryPolicy->CreatePolicyForGenericRequest(), + context, + operationId, + TGetOperationOptions().AttributeFilter(TOperationAttributeFilter() + .Add(EOperationAttribute::State) + .Add(EOperationAttribute::Result))); + Y_VERIFY(attributes.BriefState, + "get_operation for operation %s has not returned \"state\" field", + GetGuidAsString(operationId).Data()); + if (*attributes.BriefState == EOperationBriefState::Completed) { + return EOperationBriefState::Completed; + } else if (*attributes.BriefState == EOperationBriefState::Aborted || *attributes.BriefState == EOperationBriefState::Failed) { + YT_LOG_ERROR("Operation %v %v (%v)", + operationId, + ToString(*attributes.BriefState), + ToString(TOperationExecutionTimeTracker::Get()->Finish(operationId))); + + auto failedJobInfoList = GetFailedJobInfo( + clientRetryPolicy, + context, + operationId, + TGetFailedJobInfoOptions()); + + Y_VERIFY(attributes.Result && attributes.Result->Error); + ythrow TOperationFailedError( + *attributes.BriefState == EOperationBriefState::Aborted + ? TOperationFailedError::Aborted + : TOperationFailedError::Failed, + operationId, + *attributes.Result->Error, + failedJobInfoList); + } + return EOperationBriefState::InProgress; +} + +void WaitForOperation( + const IClientRetryPolicyPtr& clientRetryPolicy, + const TClientContext& context, + const TOperationId& operationId) +{ + const TDuration checkOperationStateInterval = + UseLocalModeOptimization(context, clientRetryPolicy) + ? 
Min(TDuration::MilliSeconds(100), context.Config->OperationTrackerPollPeriod) + : context.Config->OperationTrackerPollPeriod; + + while (true) { + auto status = CheckOperation(clientRetryPolicy, context, operationId); + if (status == EOperationBriefState::Completed) { + YT_LOG_INFO("Operation %v completed (%v)", + operationId, + TOperationExecutionTimeTracker::Get()->Finish(operationId)); + break; + } + TWaitProxy::Get()->Sleep(checkOperationStateInterval); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +TNode BuildAutoMergeSpec(const TAutoMergeSpec& options) +{ + TNode result; + if (options.Mode_) { + result["mode"] = ToString(*options.Mode_); + } + if (options.MaxIntermediateChunkCount_) { + result["max_intermediate_chunk_count"] = *options.MaxIntermediateChunkCount_; + } + if (options.ChunkCountPerMergeJob_) { + result["chunk_count_per_merge_job"] = *options.ChunkCountPerMergeJob_; + } + if (options.ChunkSizeThreshold_) { + result["chunk_size_threshold"] = *options.ChunkSizeThreshold_; + } + return result; +} + +TNode BuildJobProfilerSpec(const TJobProfilerSpec& profilerSpec) +{ + TNode result; + if (profilerSpec.ProfilingBinary_) { + result["binary"] = ToString(*profilerSpec.ProfilingBinary_); + } + if (profilerSpec.ProfilerType_) { + result["type"] = ToString(*profilerSpec.ProfilerType_); + } + if (profilerSpec.ProfilingProbability_) { + result["profiling_probability"] = *profilerSpec.ProfilingProbability_; + } + if (profilerSpec.SamplingFrequency_) { + result["sampling_frequency"] = *profilerSpec.SamplingFrequency_; + } + + return result; +} + +// Returns undefined node if resources doesn't contain any meaningful field +TNode BuildSchedulerResourcesSpec(const TSchedulerResources& resources) +{ + TNode result; + if (resources.UserSlots().Defined()) { + result["user_slots"] = *resources.UserSlots(); + } + if (resources.Cpu().Defined()) { + result["cpu"] = *resources.Cpu(); + } + if 
(resources.Memory().Defined()) { + result["memory"] = *resources.Memory(); + } + return result; +} + +void BuildUserJobFluently( + const TJobPreparer& preparer, + const TMaybe<TFormat>& inputFormat, + const TMaybe<TFormat>& outputFormat, + TFluentMap fluent) +{ + const auto& userJobSpec = preparer.GetSpec(); + TMaybe<i64> memoryLimit = userJobSpec.MemoryLimit_; + TMaybe<double> cpuLimit = userJobSpec.CpuLimit_; + TMaybe<ui16> portCount = userJobSpec.PortCount_; + + // Use 1MB extra tmpfs size by default, it helps to detect job sandbox as tmp directory + // for standard python libraries. See YTADMINREQ-14505 for more details. + auto tmpfsSize = preparer.GetSpec().ExtraTmpfsSize_.GetOrElse(DefaultExrtaTmpfsSize); + if (preparer.ShouldMountSandbox()) { + tmpfsSize += preparer.GetTotalFileSize(); + if (tmpfsSize == 0) { + // This can be a case for example when it is local mode and we don't upload binary. + // NOTE: YT doesn't like zero tmpfs size. + tmpfsSize = RoundUpFileSize(1); + } + memoryLimit = memoryLimit.GetOrElse(512ll << 20) + tmpfsSize; + } + + fluent + .Item("file_paths").List(preparer.GetFiles()) + .Item("command").Value(preparer.GetCommand()) + .Item("class_name").Value(preparer.GetClassName()) + .DoIf(!userJobSpec.Environment_.empty(), [&] (TFluentMap fluentMap) { + TNode environment; + for (const auto& item : userJobSpec.Environment_) { + environment[item.first] = item.second; + } + fluentMap.Item("environment").Value(environment); + }) + .DoIf(userJobSpec.DiskSpaceLimit_.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("disk_space_limit").Value(*userJobSpec.DiskSpaceLimit_); + }) + .DoIf(inputFormat.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("input_format").Value(inputFormat->Config); + }) + .DoIf(outputFormat.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("output_format").Value(outputFormat->Config); + }) + .DoIf(memoryLimit.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("memory_limit").Value(*memoryLimit); + 
}) + .DoIf(userJobSpec.MemoryReserveFactor_.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("memory_reserve_factor").Value(*userJobSpec.MemoryReserveFactor_); + }) + .DoIf(cpuLimit.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("cpu_limit").Value(*cpuLimit); + }) + .DoIf(portCount.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("port_count").Value(*portCount); + }) + .DoIf(userJobSpec.JobTimeLimit_.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("job_time_limit").Value(userJobSpec.JobTimeLimit_->MilliSeconds()); + }) + .DoIf(userJobSpec.NetworkProject_.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("network_project").Value(*userJobSpec.NetworkProject_); + }) + .DoIf(preparer.ShouldMountSandbox(), [&] (TFluentMap fluentMap) { + fluentMap.Item("tmpfs_path").Value("."); + fluentMap.Item("tmpfs_size").Value(tmpfsSize); + fluentMap.Item("copy_files").Value(true); + }) + .Item("profilers") + .BeginList() + .DoFor(userJobSpec.JobProfilers_, [&] (TFluentList list, const auto& jobProfiler) { + list.Item().Value(BuildJobProfilerSpec(jobProfiler)); + }) + .EndList(); +} + +template <typename T> +void BuildCommonOperationPart(const TConfigPtr& config, const TOperationSpecBase<T>& baseSpec, const TOperationOptions& options, TFluentMap fluent) +{ + const TProcessState* properties = TProcessState::Get(); + TString pool = config->Pool; + + if (baseSpec.Pool_) { + pool = *baseSpec.Pool_; + } + + fluent + .Item("started_by") + .BeginMap() + .Item("hostname").Value(properties->FqdnHostName) + .Item("pid").Value(properties->Pid) + .Item("user").Value(properties->UserName) + .Item("command").List(properties->CensoredCommandLine) + .Item("wrapper_version").Value(properties->ClientVersion) + .EndMap() + .DoIf(!pool.empty(), [&] (TFluentMap fluentMap) { + fluentMap.Item("pool").Value(pool); + }) + .DoIf(baseSpec.Weight_.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("weight").Value(*baseSpec.Weight_); + }) + 
.DoIf(baseSpec.TimeLimit_.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("time_limit").Value(baseSpec.TimeLimit_->MilliSeconds()); + }) + .DoIf(baseSpec.PoolTrees().Defined(), [&] (TFluentMap fluentMap) { + TNode poolTreesSpec = TNode::CreateList(); + for (const auto& tree : *baseSpec.PoolTrees()) { + poolTreesSpec.Add(tree); + } + fluentMap.Item("pool_trees").Value(poolTreesSpec); + }) + .DoIf(baseSpec.ResourceLimits().Defined(), [&] (TFluentMap fluentMap) { + auto resourceLimitsSpec = BuildSchedulerResourcesSpec(*baseSpec.ResourceLimits()); + if (!resourceLimitsSpec.IsUndefined()) { + fluentMap.Item("resource_limits").Value(std::move(resourceLimitsSpec)); + } + }) + .DoIf(options.SecureVault_.Defined(), [&] (TFluentMap fluentMap) { + Y_ENSURE(options.SecureVault_->IsMap(), + "SecureVault must be a map node, got " << options.SecureVault_->GetType()); + fluentMap.Item("secure_vault").Value(*options.SecureVault_); + }) + .DoIf(baseSpec.Title_.Defined(), [&] (TFluentMap fluentMap) { + fluentMap.Item("title").Value(*baseSpec.Title_); + }); +} + +template <typename TSpec> +void BuildCommonUserOperationPart(const TSpec& baseSpec, TNode* spec) +{ + if (baseSpec.MaxFailedJobCount_.Defined()) { + (*spec)["max_failed_job_count"] = *baseSpec.MaxFailedJobCount_; + } + if (baseSpec.FailOnJobRestart_.Defined()) { + (*spec)["fail_on_job_restart"] = *baseSpec.FailOnJobRestart_; + } + if (baseSpec.StderrTablePath_.Defined()) { + (*spec)["stderr_table_path"] = *baseSpec.StderrTablePath_; + } + if (baseSpec.CoreTablePath_.Defined()) { + (*spec)["core_table_path"] = *baseSpec.CoreTablePath_; + } + if (baseSpec.WaitingJobTimeout_.Defined()) { + (*spec)["waiting_job_timeout"] = baseSpec.WaitingJobTimeout_->MilliSeconds(); + } +} + +template <typename TSpec> +void BuildJobCountOperationPart(const TSpec& spec, TNode* nodeSpec) +{ + if (spec.JobCount_.Defined()) { + (*nodeSpec)["job_count"] = *spec.JobCount_; + } + if (spec.DataSizePerJob_.Defined()) { + 
(*nodeSpec)["data_size_per_job"] = *spec.DataSizePerJob_; + } +} + +template <typename TSpec> +void BuildPartitionCountOperationPart(const TSpec& spec, TNode* nodeSpec) +{ + if (spec.PartitionCount_.Defined()) { + (*nodeSpec)["partition_count"] = *spec.PartitionCount_; + } + if (spec.PartitionDataSize_.Defined()) { + (*nodeSpec)["partition_data_size"] = *spec.PartitionDataSize_; + } +} + +template <typename TSpec> +void BuildDataSizePerSortJobPart(const TSpec& spec, TNode* nodeSpec) +{ + if (spec.DataSizePerSortJob_.Defined()) { + (*nodeSpec)["data_size_per_sort_job"] = *spec.DataSizePerSortJob_; + } +} + +template <typename TSpec> +void BuildPartitionJobCountOperationPart(const TSpec& spec, TNode* nodeSpec) +{ + if (spec.PartitionJobCount_.Defined()) { + (*nodeSpec)["partition_job_count"] = *spec.PartitionJobCount_; + } + if (spec.DataSizePerPartitionJob_.Defined()) { + (*nodeSpec)["data_size_per_partition_job"] = *spec.DataSizePerPartitionJob_; + } +} + +template <typename TSpec> +void BuildMapJobCountOperationPart(const TSpec& spec, TNode* nodeSpec) +{ + if (spec.MapJobCount_.Defined()) { + (*nodeSpec)["map_job_count"] = *spec.MapJobCount_; + } + if (spec.DataSizePerMapJob_.Defined()) { + (*nodeSpec)["data_size_per_map_job"] = *spec.DataSizePerMapJob_; + } +} + +template <typename TSpec> +void BuildIntermediateDataPart(const TSpec& spec, TNode* nodeSpec) +{ + if (spec.IntermediateDataAccount_.Defined()) { + (*nodeSpec)["intermediate_data_account"] = *spec.IntermediateDataAccount_; + } + if (spec.IntermediateDataReplicationFactor_.Defined()) { + (*nodeSpec)["intermediate_data_replication_factor"] = *spec.IntermediateDataReplicationFactor_; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +TNode MergeSpec(TNode dst, TNode spec, const TOperationOptions& options) +{ + MergeNodes(dst["spec"], spec); + if (options.Spec_) { + MergeNodes(dst["spec"], *options.Spec_); + } + return dst; +} + +template <typename TSpec> +void 
CreateDebugOutputTables(const TSpec& spec, const TOperationPreparer& preparer) +{ + if (spec.StderrTablePath_.Defined()) { + NYT::NDetail::Create( + preparer.GetClientRetryPolicy()->CreatePolicyForGenericRequest(), + preparer.GetContext(), + TTransactionId(), + *spec.StderrTablePath_, + NT_TABLE, + TCreateOptions() + .IgnoreExisting(true) + .Recursive(true)); + } + if (spec.CoreTablePath_.Defined()) { + NYT::NDetail::Create( + preparer.GetClientRetryPolicy()->CreatePolicyForGenericRequest(), + preparer.GetContext(), + TTransactionId(), + *spec.CoreTablePath_, + NT_TABLE, + TCreateOptions() + .IgnoreExisting(true) + .Recursive(true)); + } +} + +void CreateOutputTable( + const TOperationPreparer& preparer, + const TRichYPath& path) +{ + Y_ENSURE(path.Path_, "Output table is not set"); + Create( + preparer.GetClientRetryPolicy()->CreatePolicyForGenericRequest(), + preparer.GetContext(), preparer.GetTransactionId(), path.Path_, NT_TABLE, + TCreateOptions() + .IgnoreExisting(true) + .Recursive(true)); +} + +void CreateOutputTables( + const TOperationPreparer& preparer, + const TVector<TRichYPath>& paths) +{ + for (auto& path : paths) { + CreateOutputTable(preparer, path); + } +} + +void CheckInputTablesExist( + const TOperationPreparer& preparer, + const TVector<TRichYPath>& paths) +{ + Y_ENSURE(!paths.empty(), "Input tables are not set"); + for (auto& path : paths) { + auto curTransactionId = path.TransactionId_.GetOrElse(preparer.GetTransactionId()); + Y_ENSURE_EX( + Exists( + preparer.GetClientRetryPolicy()->CreatePolicyForGenericRequest(), + preparer.GetContext(), + curTransactionId, + path.Path_), + TApiUsageError() << "Input table '" << path.Path_ << "' doesn't exist"); + } +} + +void LogJob(const TOperationId& opId, const IJob* job, const char* type) +{ + if (job) { + YT_LOG_INFO("Operation %v; %v = %v", + opId, + type, + TJobFactory::Get()->GetJobName(job)); + } +} + +void LogYPaths(const TOperationId& opId, const TVector<TRichYPath>& paths, const char* type) +{ 
+ for (size_t i = 0; i < paths.size(); ++i) { + YT_LOG_INFO("Operation %v; %v[%v] = %v", + opId, + type, + i, + paths[i].Path_); + } +} + +void LogYPath(const TOperationId& opId, const TRichYPath& path, const char* type) +{ + YT_LOG_INFO("Operation %v; %v = %v", + opId, + type, + path.Path_); +} + +TString AddModeToTitleIfDebug(const TString& title) { +#ifndef NDEBUG + return title + " (debug build)"; +#else + return title; +#endif +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +void DoExecuteMap( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TSimpleOperationIo& operationIo, + TMapOperationSpecBase<T> spec, + const IJobPtr& mapper, + const TOperationOptions& options) +{ + if (options.CreateDebugOutputTables_) { + CreateDebugOutputTables(spec, *preparer); + } + if (options.CreateOutputTables_) { + CheckInputTablesExist(*preparer, operationIo.Inputs); + CreateOutputTables(*preparer, operationIo.Outputs); + } + + TJobPreparer map( + *preparer, + spec.MapperSpec_, + *mapper, + operationIo.Outputs.size(), + operationIo.JobFiles, + options); + + spec.Title_ = spec.Title_.GetOrElse(AddModeToTitleIfDebug(map.GetClassName())); + + TNode specNode = BuildYsonNodeFluently() + .BeginMap().Item("spec").BeginMap() + .Item("mapper").DoMap([&] (TFluentMap fluent) { + BuildUserJobFluently( + map, + operationIo.InputFormat, + operationIo.OutputFormat, + fluent); + }) + .DoIf(spec.AutoMerge_.Defined(), [&] (TFluentMap fluent) { + auto autoMergeSpec = BuildAutoMergeSpec(*spec.AutoMerge_); + if (!autoMergeSpec.IsUndefined()) { + fluent.Item("auto_merge").Value(std::move(autoMergeSpec)); + } + }) + .Item("input_table_paths").List(operationIo.Inputs) + .Item("output_table_paths").List(operationIo.Outputs) + .DoIf(spec.Ordered_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("ordered").Value(spec.Ordered_.GetRef()); + }) + .Do(std::bind(BuildCommonOperationPart<T>, 
preparer->GetContext().Config, spec, options, std::placeholders::_1)) + .EndMap().EndMap(); + + specNode["spec"]["job_io"]["control_attributes"]["enable_row_index"] = TNode(true); + specNode["spec"]["job_io"]["control_attributes"]["enable_range_index"] = TNode(true); + if (!preparer->GetContext().Config->TableWriter.Empty()) { + specNode["spec"]["job_io"]["table_writer"] = preparer->GetContext().Config->TableWriter; + } + + BuildCommonUserOperationPart(spec, &specNode["spec"]); + BuildJobCountOperationPart(spec, &specNode["spec"]); + + auto startOperation = [ + operation=operation.Get(), + spec=MergeSpec(std::move(specNode), preparer->GetContext().Config->Spec, options), + preparer, + operationIo, + mapper + ] () { + auto operationId = preparer->StartOperation(operation, "map", spec); + + LogJob(operationId, mapper.Get(), "mapper"); + LogYPaths(operationId, operationIo.Inputs, "input"); + LogYPaths(operationId, operationIo.Outputs, "output"); + + return operationId; + }; + operation->SetDelayedStartFunction(std::move(startOperation)); +} + +void ExecuteMap( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TMapOperationSpec& spec, + const ::TIntrusivePtr<IStructuredJob>& mapper, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting map operation (PreparationId: %v)", + preparer->GetPreparationId()); + auto operationIo = CreateSimpleOperationIo(*mapper, *preparer, spec, options, /* allowSkiff = */ true); + DoExecuteMap( + operation, + preparer, + operationIo, + spec, + mapper, + options); +} + +void ExecuteRawMap( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TRawMapOperationSpec& spec, + const ::TIntrusivePtr<IRawJob>& mapper, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting raw map operation (PreparationId: %v)", + preparer->GetPreparationId()); + auto operationIo = CreateSimpleOperationIo(*mapper, *preparer, spec); + DoExecuteMap( + operation, + preparer, + operationIo, 
+ spec, + mapper, + options); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +void DoExecuteReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TSimpleOperationIo& operationIo, + TReduceOperationSpecBase<T> spec, + const IJobPtr& reducer, + const TOperationOptions& options) +{ + if (options.CreateDebugOutputTables_) { + CreateDebugOutputTables(spec, *preparer); + } + if (options.CreateOutputTables_) { + CheckInputTablesExist(*preparer, operationIo.Inputs); + CreateOutputTables(*preparer, operationIo.Outputs); + } + + TJobPreparer reduce( + *preparer, + spec.ReducerSpec_, + *reducer, + operationIo.Outputs.size(), + operationIo.JobFiles, + options); + + spec.Title_ = spec.Title_.GetOrElse(AddModeToTitleIfDebug(reduce.GetClassName())); + + TNode specNode = BuildYsonNodeFluently() + .BeginMap().Item("spec").BeginMap() + .Item("reducer").DoMap([&] (TFluentMap fluent) { + BuildUserJobFluently( + reduce, + operationIo.InputFormat, + operationIo.OutputFormat, + fluent); + }) + .Item("sort_by").Value(spec.SortBy_) + .Item("reduce_by").Value(spec.ReduceBy_) + .DoIf(spec.JoinBy_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("join_by").Value(spec.JoinBy_.GetRef()); + }) + .DoIf(spec.EnableKeyGuarantee_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("enable_key_guarantee").Value(spec.EnableKeyGuarantee_.GetRef()); + }) + .Item("input_table_paths").List(operationIo.Inputs) + .Item("output_table_paths").List(operationIo.Outputs) + .Item("job_io").BeginMap() + .Item("control_attributes").BeginMap() + .Item("enable_key_switch").Value(true) + .Item("enable_row_index").Value(true) + .Item("enable_range_index").Value(true) + .EndMap() + .DoIf(!preparer->GetContext().Config->TableWriter.Empty(), [&] (TFluentMap fluent) { + fluent.Item("table_writer").Value(preparer->GetContext().Config->TableWriter); + }) + .EndMap() + .DoIf(spec.AutoMerge_.Defined(), [&] (TFluentMap fluent) 
{ + fluent.Item("auto_merge").Value(BuildAutoMergeSpec(*spec.AutoMerge_)); + }) + .Do(std::bind(BuildCommonOperationPart<T>, preparer->GetContext().Config, spec, options, std::placeholders::_1)) + .EndMap().EndMap(); + + BuildCommonUserOperationPart(spec, &specNode["spec"]); + BuildJobCountOperationPart(spec, &specNode["spec"]); + + auto startOperation = [ + operation=operation.Get(), + spec=MergeSpec(std::move(specNode), preparer->GetContext().Config->Spec, options), + preparer, + operationIo, + reducer + ] () { + auto operationId = preparer->StartOperation(operation, "reduce", spec); + + LogJob(operationId, reducer.Get(), "reducer"); + LogYPaths(operationId, operationIo.Inputs, "input"); + LogYPaths(operationId, operationIo.Outputs, "output"); + + return operationId; + }; + + operation->SetDelayedStartFunction(std::move(startOperation)); +} + +void ExecuteReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TReduceOperationSpec& spec, + const ::TIntrusivePtr<IStructuredJob>& reducer, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting reduce operation (PreparationId: %v)", + preparer->GetPreparationId()); + auto operationIo = CreateSimpleOperationIo(*reducer, *preparer, spec, options, /* allowSkiff = */ false); + DoExecuteReduce( + operation, + preparer, + operationIo, + spec, + reducer, + options); +} + +void ExecuteRawReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TRawReduceOperationSpec& spec, + const ::TIntrusivePtr<IRawJob>& reducer, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting raw reduce operation (PreparationId: %v)", + preparer->GetPreparationId()); + auto operationIo = CreateSimpleOperationIo(*reducer, *preparer, spec); + DoExecuteReduce( + operation, + preparer, + operationIo, + spec, + reducer, + options); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +void 
DoExecuteJoinReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TSimpleOperationIo& operationIo, + TJoinReduceOperationSpecBase<T> spec, + const IJobPtr& reducer, + const TOperationOptions& options) +{ + if (options.CreateDebugOutputTables_) { + CreateDebugOutputTables(spec, *preparer); + } + if (options.CreateOutputTables_) { + CheckInputTablesExist(*preparer, operationIo.Inputs); + CreateOutputTables(*preparer, operationIo.Outputs); + } + + TJobPreparer reduce( + *preparer, + spec.ReducerSpec_, + *reducer, + operationIo.Outputs.size(), + operationIo.JobFiles, + options); + + spec.Title_ = spec.Title_.GetOrElse(AddModeToTitleIfDebug(reduce.GetClassName())); + + TNode specNode = BuildYsonNodeFluently() + .BeginMap().Item("spec").BeginMap() + .Item("reducer").DoMap([&] (TFluentMap fluent) { + BuildUserJobFluently( + reduce, + operationIo.InputFormat, + operationIo.OutputFormat, + fluent); + }) + .Item("join_by").Value(spec.JoinBy_) + .Item("input_table_paths").List(operationIo.Inputs) + .Item("output_table_paths").List(operationIo.Outputs) + .Item("job_io").BeginMap() + .Item("control_attributes").BeginMap() + .Item("enable_key_switch").Value(true) + .Item("enable_row_index").Value(true) + .Item("enable_range_index").Value(true) + .EndMap() + .DoIf(!preparer->GetContext().Config->TableWriter.Empty(), [&] (TFluentMap fluent) { + fluent.Item("table_writer").Value(preparer->GetContext().Config->TableWriter); + }) + .EndMap() + .Do(std::bind(BuildCommonOperationPart<T>, preparer->GetContext().Config, spec, options, std::placeholders::_1)) + .EndMap().EndMap(); + + BuildCommonUserOperationPart(spec, &specNode["spec"]); + BuildJobCountOperationPart(spec, &specNode["spec"]); + + auto startOperation = [ + operation=operation.Get(), + spec=MergeSpec(std::move(specNode), preparer->GetContext().Config->Spec, options), + preparer, + reducer, + operationIo + ] () { + auto operationId = preparer->StartOperation(operation, "join_reduce", 
spec); + + LogJob(operationId, reducer.Get(), "reducer"); + LogYPaths(operationId, operationIo.Inputs, "input"); + LogYPaths(operationId, operationIo.Outputs, "output"); + + return operationId; + }; + + operation->SetDelayedStartFunction(std::move(startOperation)); +} + +void ExecuteJoinReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TJoinReduceOperationSpec& spec, + const ::TIntrusivePtr<IStructuredJob>& reducer, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting join reduce operation (PreparationId: %v)", + preparer->GetPreparationId()); + auto operationIo = CreateSimpleOperationIo(*reducer, *preparer, spec, options, /* allowSkiff = */ false); + return DoExecuteJoinReduce( + operation, + preparer, + operationIo, + spec, + reducer, + options); +} + +void ExecuteRawJoinReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TRawJoinReduceOperationSpec& spec, + const ::TIntrusivePtr<IRawJob>& reducer, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting raw join reduce operation (PreparationId: %v)", + preparer->GetPreparationId()); + auto operationIo = CreateSimpleOperationIo(*reducer, *preparer, spec); + return DoExecuteJoinReduce( + operation, + preparer, + operationIo, + spec, + reducer, + options); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +void DoExecuteMapReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TMapReduceOperationIo& operationIo, + TMapReduceOperationSpecBase<T> spec, + const IJobPtr& mapper, + const IJobPtr& reduceCombiner, + const IJobPtr& reducer, + const TOperationOptions& options) +{ + TVector<TRichYPath> allOutputs; + allOutputs.insert(allOutputs.end(), operationIo.MapOutputs.begin(), operationIo.MapOutputs.end()); + allOutputs.insert(allOutputs.end(), operationIo.Outputs.begin(), operationIo.Outputs.end()); + + if 
(options.CreateDebugOutputTables_) { + CreateDebugOutputTables(spec, *preparer); + } + if (options.CreateOutputTables_) { + CheckInputTablesExist(*preparer, operationIo.Inputs); + CreateOutputTables(*preparer, allOutputs); + } + + TSortColumns sortBy = spec.SortBy_; + TSortColumns reduceBy = spec.ReduceBy_; + + if (sortBy.Parts_.empty()) { + sortBy = reduceBy; + } + + const bool hasMapper = mapper != nullptr; + const bool hasCombiner = reduceCombiner != nullptr; + + TVector<TRichYPath> files; + + TJobPreparer reduce( + *preparer, + spec.ReducerSpec_, + *reducer, + operationIo.Outputs.size(), + operationIo.ReducerJobFiles, + options); + + TString title; + + TNode specNode = BuildYsonNodeFluently() + .BeginMap().Item("spec").BeginMap() + .DoIf(hasMapper, [&] (TFluentMap fluent) { + TJobPreparer map( + *preparer, + spec.MapperSpec_, + *mapper, + 1 + operationIo.MapOutputs.size(), + operationIo.MapperJobFiles, + options); + fluent.Item("mapper").DoMap([&] (TFluentMap fluent) { + BuildUserJobFluently( + std::cref(map), + *operationIo.MapperInputFormat, + *operationIo.MapperOutputFormat, + fluent); + }); + + title = "mapper:" + map.GetClassName() + " "; + }) + .DoIf(hasCombiner, [&] (TFluentMap fluent) { + TJobPreparer combine( + *preparer, + spec.ReduceCombinerSpec_, + *reduceCombiner, + size_t(1), + operationIo.ReduceCombinerJobFiles, + options); + fluent.Item("reduce_combiner").DoMap([&] (TFluentMap fluent) { + BuildUserJobFluently( + combine, + *operationIo.ReduceCombinerInputFormat, + *operationIo.ReduceCombinerOutputFormat, + fluent); + }); + title += "combiner:" + combine.GetClassName() + " "; + }) + .Item("reducer").DoMap([&] (TFluentMap fluent) { + BuildUserJobFluently( + reduce, + operationIo.ReducerInputFormat, + operationIo.ReducerOutputFormat, + fluent); + }) + .Item("sort_by").Value(sortBy) + .Item("reduce_by").Value(reduceBy) + .Item("input_table_paths").List(operationIo.Inputs) + .Item("output_table_paths").List(allOutputs) + 
.Item("mapper_output_table_count").Value(operationIo.MapOutputs.size()) + .DoIf(spec.ForceReduceCombiners_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("force_reduce_combiners").Value(*spec.ForceReduceCombiners_); + }) + .Item("map_job_io").BeginMap() + .Item("control_attributes").BeginMap() + .Item("enable_row_index").Value(true) + .Item("enable_range_index").Value(true) + .EndMap() + .DoIf(!preparer->GetContext().Config->TableWriter.Empty(), [&] (TFluentMap fluent) { + fluent.Item("table_writer").Value(preparer->GetContext().Config->TableWriter); + }) + .EndMap() + .Item("sort_job_io").BeginMap() + .Item("control_attributes").BeginMap() + .Item("enable_key_switch").Value(true) + .EndMap() + .DoIf(!preparer->GetContext().Config->TableWriter.Empty(), [&] (TFluentMap fluent) { + fluent.Item("table_writer").Value(preparer->GetContext().Config->TableWriter); + }) + .EndMap() + .Item("reduce_job_io").BeginMap() + .Item("control_attributes").BeginMap() + .Item("enable_key_switch").Value(true) + .EndMap() + .DoIf(!preparer->GetContext().Config->TableWriter.Empty(), [&] (TFluentMap fluent) { + fluent.Item("table_writer").Value(preparer->GetContext().Config->TableWriter); + }) + .EndMap() + .Do([&] (TFluentMap) { + spec.Title_ = spec.Title_.GetOrElse(AddModeToTitleIfDebug(title + "reducer:" + reduce.GetClassName())); + }) + .Do(std::bind(BuildCommonOperationPart<T>, preparer->GetContext().Config, spec, options, std::placeholders::_1)) + .EndMap().EndMap(); + + if (spec.Ordered_) { + specNode["spec"]["ordered"] = *spec.Ordered_; + } + + BuildCommonUserOperationPart(spec, &specNode["spec"]); + BuildMapJobCountOperationPart(spec, &specNode["spec"]); + BuildPartitionCountOperationPart(spec, &specNode["spec"]); + BuildIntermediateDataPart(spec, &specNode["spec"]); + BuildDataSizePerSortJobPart(spec, &specNode["spec"]); + + auto startOperation = [ + operation=operation.Get(), + spec=MergeSpec(std::move(specNode), preparer->GetContext().Config->Spec, options), + preparer, + 
mapper, + reduceCombiner, + reducer, + inputs=operationIo.Inputs, + allOutputs + ] () { + auto operationId = preparer->StartOperation(operation, "map_reduce", spec); + + LogJob(operationId, mapper.Get(), "mapper"); + LogJob(operationId, reduceCombiner.Get(), "reduce_combiner"); + LogJob(operationId, reducer.Get(), "reducer"); + LogYPaths(operationId, inputs, "input"); + LogYPaths(operationId, allOutputs, "output"); + + return operationId; + }; + + operation->SetDelayedStartFunction(std::move(startOperation)); +} + +void ExecuteMapReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TMapReduceOperationSpec& spec_, + const ::TIntrusivePtr<IStructuredJob>& mapper, + const ::TIntrusivePtr<IStructuredJob>& reduceCombiner, + const ::TIntrusivePtr<IStructuredJob>& reducer, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting map-reduce operation (PreparationId: %v)", + preparer->GetPreparationId()); + TMapReduceOperationSpec spec = spec_; + + TMapReduceOperationIo operationIo; + auto structuredInputs = CanonizeStructuredTableList(preparer->GetContext(), spec.GetStructuredInputs()); + auto structuredMapOutputs = CanonizeStructuredTableList(preparer->GetContext(), spec.GetStructuredMapOutputs()); + auto structuredOutputs = CanonizeStructuredTableList(preparer->GetContext(), spec.GetStructuredOutputs()); + + const bool inferOutputSchema = options.InferOutputSchema_.GetOrElse(preparer->GetContext().Config->InferTableSchema); + + TVector<TTableSchema> currentInferenceResult; + + auto fixSpec = [&] (const TFormat& format) { + if (format.IsYamredDsv()) { + spec.SortBy_.Parts_.clear(); + spec.ReduceBy_.Parts_.clear(); + + const TYamredDsvAttributes attributes = format.GetYamredDsvAttributes(); + for (auto& column : attributes.KeyColumnNames) { + spec.SortBy_.Parts_.push_back(column); + spec.ReduceBy_.Parts_.push_back(column); + } + for (const auto& column : attributes.SubkeyColumnNames) { + spec.SortBy_.Parts_.push_back(column); + 
} + } + }; + + VerifyHasElements(structuredInputs, "inputs"); + + TFormatBuilder formatBuilder( + preparer->GetClientRetryPolicy(), + preparer->GetContext(), + preparer->GetTransactionId(), + options); + + if (mapper) { + auto mapperOutputDescription = + spec.GetIntermediateMapOutputDescription() + .GetOrElse(TUnspecifiedTableStructure()); + TStructuredJobTableList mapperOutput = { + TStructuredJobTable::Intermediate(mapperOutputDescription), + }; + + for (const auto& table : structuredMapOutputs) { + mapperOutput.push_back(TStructuredJobTable{table.Description, table.RichYPath}); + } + + auto hints = spec.MapperFormatHints_; + + auto mapperInferenceResult = PrepareOperation<TStructuredJobTableList>( + *mapper, + TOperationPreparationContext( + structuredInputs, + mapperOutput, + preparer->GetContext(), + preparer->GetClientRetryPolicy(), + preparer->GetTransactionId()), + &structuredInputs, + /* outputs */ nullptr, + hints); + + auto nodeReaderFormat = NodeReaderFormatFromHintAndGlobalConfig(spec.MapperFormatHints_); + + auto [inputFormat, inputFormatConfig] = formatBuilder.CreateFormat( + *mapper, + EIODirection::Input, + structuredInputs, + hints.InputFormatHints_, + nodeReaderFormat, + /* allowFormatFromTableAttribute */ true); + + auto [outputFormat, outputFormatConfig] = formatBuilder.CreateFormat( + *mapper, + EIODirection::Output, + mapperOutput, + hints.OutputFormatHints_, + ENodeReaderFormat::Yson, + /* allowFormatFromTableAttribute */ false); + + operationIo.MapperJobFiles = CreateFormatConfig(inputFormatConfig, outputFormatConfig); + operationIo.MapperInputFormat = inputFormat; + operationIo.MapperOutputFormat = outputFormat; + + Y_VERIFY(mapperInferenceResult.size() >= 1); + currentInferenceResult = TVector<TTableSchema>{mapperInferenceResult[0]}; + // The first output as it corresponds to the intermediate data. 
+ TVector<TTableSchema> additionalOutputsInferenceResult(mapperInferenceResult.begin() + 1, mapperInferenceResult.end()); + + operationIo.MapOutputs = GetPathList( + structuredMapOutputs, + additionalOutputsInferenceResult, + inferOutputSchema); + } + + if (reduceCombiner) { + const bool isFirstStep = !mapper; + TStructuredJobTableList inputs; + if (isFirstStep) { + inputs = structuredInputs; + } else { + auto reduceCombinerIntermediateInput = + spec.GetIntermediateReduceCombinerInputDescription() + .GetOrElse(TUnspecifiedTableStructure()); + inputs = { + TStructuredJobTable::Intermediate(reduceCombinerIntermediateInput), + }; + } + + auto reduceCombinerOutputDescription = spec.GetIntermediateReduceCombinerOutputDescription() + .GetOrElse(TUnspecifiedTableStructure()); + + TStructuredJobTableList outputs = { + TStructuredJobTable::Intermediate(reduceCombinerOutputDescription), + }; + + auto hints = spec.ReduceCombinerFormatHints_; + + if (isFirstStep) { + currentInferenceResult = PrepareOperation<TStructuredJobTableList>( + *reduceCombiner, + TOperationPreparationContext( + inputs, + outputs, + preparer->GetContext(), + preparer->GetClientRetryPolicy(), + preparer->GetTransactionId()), + &inputs, + /* outputs */ nullptr, + hints); + } else { + currentInferenceResult = PrepareOperation<TStructuredJobTableList>( + *reduceCombiner, + TSpeculativeOperationPreparationContext( + currentInferenceResult, + inputs, + outputs), + /* inputs */ nullptr, + /* outputs */ nullptr, + hints); + } + + auto [inputFormat, inputFormatConfig] = formatBuilder.CreateFormat( + *reduceCombiner, + EIODirection::Input, + inputs, + hints.InputFormatHints_, + ENodeReaderFormat::Yson, + /* allowFormatFromTableAttribute = */ isFirstStep); + + auto [outputFormat, outputFormatConfig] = formatBuilder.CreateFormat( + *reduceCombiner, + EIODirection::Output, + outputs, + hints.OutputFormatHints_, + ENodeReaderFormat::Yson, + /* allowFormatFromTableAttribute = */ false); + + 
operationIo.ReduceCombinerJobFiles = CreateFormatConfig(inputFormatConfig, outputFormatConfig); + operationIo.ReduceCombinerInputFormat = inputFormat; + operationIo.ReduceCombinerOutputFormat = outputFormat; + + if (isFirstStep) { + fixSpec(*operationIo.ReduceCombinerInputFormat); + } + } + + const bool isFirstStep = (!mapper && !reduceCombiner); + TStructuredJobTableList reducerInputs; + if (isFirstStep) { + reducerInputs = structuredInputs; + } else { + auto reducerInputDescription = + spec.GetIntermediateReducerInputDescription() + .GetOrElse(TUnspecifiedTableStructure()); + reducerInputs = { + TStructuredJobTable::Intermediate(reducerInputDescription), + }; + } + + auto hints = spec.ReducerFormatHints_; + + TVector<TTableSchema> reducerInferenceResult; + if (isFirstStep) { + reducerInferenceResult = PrepareOperation( + *reducer, + TOperationPreparationContext( + structuredInputs, + structuredOutputs, + preparer->GetContext(), + preparer->GetClientRetryPolicy(), + preparer->GetTransactionId()), + &structuredInputs, + &structuredOutputs, + hints); + } else { + reducerInferenceResult = PrepareOperation<TStructuredJobTableList>( + *reducer, + TSpeculativeOperationPreparationContext( + currentInferenceResult, + reducerInputs, + structuredOutputs), + /* inputs */ nullptr, + &structuredOutputs, + hints); + } + + auto [inputFormat, inputFormatConfig] = formatBuilder.CreateFormat( + *reducer, + EIODirection::Input, + reducerInputs, + hints.InputFormatHints_, + ENodeReaderFormat::Yson, + /* allowFormatFromTableAttribute = */ isFirstStep); + + auto [outputFormat, outputFormatConfig] = formatBuilder.CreateFormat( + *reducer, + EIODirection::Output, + ToStructuredJobTableList(spec.GetStructuredOutputs()), + hints.OutputFormatHints_, + ENodeReaderFormat::Yson, + /* allowFormatFromTableAttribute = */ false); + operationIo.ReducerJobFiles = CreateFormatConfig(inputFormatConfig, outputFormatConfig); + operationIo.ReducerInputFormat = inputFormat; + 
operationIo.ReducerOutputFormat = outputFormat; + + if (isFirstStep) { + fixSpec(operationIo.ReducerInputFormat); + } + + operationIo.Inputs = GetPathList( + ApplyProtobufColumnFilters( + structuredInputs, + *preparer, + GetColumnsUsedInOperation(spec), + options), + /* jobSchemaInferenceResult */ Nothing(), + /* inferSchema */ false); + + operationIo.Outputs = GetPathList( + structuredOutputs, + reducerInferenceResult, + inferOutputSchema); + + VerifyHasElements(operationIo.Outputs, "outputs"); + + return DoExecuteMapReduce( + operation, + preparer, + operationIo, + spec, + mapper, + reduceCombiner, + reducer, + options); +} + +void ExecuteRawMapReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TRawMapReduceOperationSpec& spec, + const ::TIntrusivePtr<IRawJob>& mapper, + const ::TIntrusivePtr<IRawJob>& reduceCombiner, + const ::TIntrusivePtr<IRawJob>& reducer, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting raw map-reduce operation (PreparationId: %v)", + preparer->GetPreparationId()); + TMapReduceOperationIo operationIo; + operationIo.Inputs = CanonizeYPaths(/* retryPolicy */ nullptr, preparer->GetContext(), spec.GetInputs()); + operationIo.MapOutputs = CanonizeYPaths(/* retryPolicy */ nullptr, preparer->GetContext(), spec.GetMapOutputs()); + operationIo.Outputs = CanonizeYPaths(/* retryPolicy */ nullptr, preparer->GetContext(), spec.GetOutputs()); + + VerifyHasElements(operationIo.Inputs, "inputs"); + VerifyHasElements(operationIo.Outputs, "outputs"); + + auto getFormatOrDefault = [&] (const TMaybe<TFormat>& maybeFormat, const TMaybe<TFormat> stageDefaultFormat, const char* formatName) { + if (maybeFormat) { + return *maybeFormat; + } else if (stageDefaultFormat) { + return *stageDefaultFormat; + } else { + ythrow TApiUsageError() << "Cannot derive " << formatName; + } + }; + + if (mapper) { + operationIo.MapperInputFormat = getFormatOrDefault(spec.MapperInputFormat_, spec.MapperFormat_, "mapper input 
format"); + operationIo.MapperOutputFormat = getFormatOrDefault(spec.MapperOutputFormat_, spec.MapperFormat_, "mapper output format"); + } + + if (reduceCombiner) { + operationIo.ReduceCombinerInputFormat = getFormatOrDefault(spec.ReduceCombinerInputFormat_, spec.ReduceCombinerFormat_, "reduce combiner input format"); + operationIo.ReduceCombinerOutputFormat = getFormatOrDefault(spec.ReduceCombinerOutputFormat_, spec.ReduceCombinerFormat_, "reduce combiner output format"); + } + + operationIo.ReducerInputFormat = getFormatOrDefault(spec.ReducerInputFormat_, spec.ReducerFormat_, "reducer input format"); + operationIo.ReducerOutputFormat = getFormatOrDefault(spec.ReducerOutputFormat_, spec.ReducerFormat_, "reducer output format"); + + return DoExecuteMapReduce( + operation, + preparer, + operationIo, + spec, + mapper, + reduceCombiner, + reducer, + options); +} + +void ExecuteSort( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TSortOperationSpec& spec, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting sort operation (PreparationId: %v)", + preparer->GetPreparationId()); + auto inputs = CanonizeYPaths(/* retryPolicy */ nullptr, preparer->GetContext(), spec.Inputs_); + auto output = CanonizeYPath(nullptr, preparer->GetContext(), spec.Output_); + + if (options.CreateOutputTables_) { + CheckInputTablesExist(*preparer, inputs); + CreateOutputTable(*preparer, output); + } + + TNode specNode = BuildYsonNodeFluently() + .BeginMap().Item("spec").BeginMap() + .Item("input_table_paths").List(inputs) + .Item("output_table_path").Value(output) + .Item("sort_by").Value(spec.SortBy_) + .DoIf(spec.SchemaInferenceMode_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("schema_inference_mode").Value(ToString(*spec.SchemaInferenceMode_)); + }) + .Do(std::bind(BuildCommonOperationPart<TSortOperationSpec>, preparer->GetContext().Config, spec, options, std::placeholders::_1)) + .EndMap().EndMap(); + + 
BuildPartitionCountOperationPart(spec, &specNode["spec"]); + BuildPartitionJobCountOperationPart(spec, &specNode["spec"]); + BuildIntermediateDataPart(spec, &specNode["spec"]); + + auto startOperation = [ + operation=operation.Get(), + spec=MergeSpec(std::move(specNode), preparer->GetContext().Config->Spec, options), + preparer, + inputs, + output + ] () { + auto operationId = preparer->StartOperation(operation, "sort", spec); + + LogYPaths(operationId, inputs, "input"); + LogYPath(operationId, output, "output"); + + return operationId; + }; + + operation->SetDelayedStartFunction(std::move(startOperation)); +} + +void ExecuteMerge( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TMergeOperationSpec& spec, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting merge operation (PreparationId: %v)", + preparer->GetPreparationId()); + auto inputs = CanonizeYPaths(/* retryPolicy */ nullptr, preparer->GetContext(), spec.Inputs_); + auto output = CanonizeYPath(nullptr, preparer->GetContext(), spec.Output_); + + if (options.CreateOutputTables_) { + CheckInputTablesExist(*preparer, inputs); + CreateOutputTable(*preparer, output); + } + + TNode specNode = BuildYsonNodeFluently() + .BeginMap().Item("spec").BeginMap() + .Item("input_table_paths").List(inputs) + .Item("output_table_path").Value(output) + .Item("mode").Value(ToString(spec.Mode_)) + .Item("combine_chunks").Value(spec.CombineChunks_) + .Item("force_transform").Value(spec.ForceTransform_) + .Item("merge_by").Value(spec.MergeBy_) + .DoIf(spec.SchemaInferenceMode_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("schema_inference_mode").Value(ToString(*spec.SchemaInferenceMode_)); + }) + .Do(std::bind(BuildCommonOperationPart<TMergeOperationSpec>, preparer->GetContext().Config, spec, options, std::placeholders::_1)) + .EndMap().EndMap(); + + BuildJobCountOperationPart(spec, &specNode["spec"]); + + auto startOperation = [ + operation=operation.Get(), + 
spec=MergeSpec(std::move(specNode), preparer->GetContext().Config->Spec, options), + preparer, + inputs, + output + ] () { + auto operationId = preparer->StartOperation(operation, "merge", spec); + + LogYPaths(operationId, inputs, "input"); + LogYPath(operationId, output, "output"); + + return operationId; + }; + + operation->SetDelayedStartFunction(std::move(startOperation)); +} + +void ExecuteErase( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TEraseOperationSpec& spec, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting erase operation (PreparationId: %v)", + preparer->GetPreparationId()); + auto tablePath = CanonizeYPath(nullptr, preparer->GetContext(), spec.TablePath_); + + TNode specNode = BuildYsonNodeFluently() + .BeginMap().Item("spec").BeginMap() + .Item("table_path").Value(tablePath) + .Item("combine_chunks").Value(spec.CombineChunks_) + .DoIf(spec.SchemaInferenceMode_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("schema_inference_mode").Value(ToString(*spec.SchemaInferenceMode_)); + }) + .Do(std::bind(BuildCommonOperationPart<TEraseOperationSpec>, preparer->GetContext().Config, spec, options, std::placeholders::_1)) + .EndMap().EndMap(); + + auto startOperation = [ + operation=operation.Get(), + spec=MergeSpec(std::move(specNode), preparer->GetContext().Config->Spec, options), + preparer, + tablePath + ] () { + auto operationId = preparer->StartOperation(operation, "erase", spec); + + LogYPath(operationId, tablePath, "table_path"); + + return operationId; + }; + + operation->SetDelayedStartFunction(std::move(startOperation)); +} + +void ExecuteRemoteCopy( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TRemoteCopyOperationSpec& spec, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting remote copy operation (PreparationId: %v)", + preparer->GetPreparationId()); + auto inputs = CanonizeYPaths(/* retryPolicy */ nullptr, preparer->GetContext(), 
spec.Inputs_); + auto output = CanonizeYPath(nullptr, preparer->GetContext(), spec.Output_); + + if (options.CreateOutputTables_) { + CreateOutputTable(*preparer, output); + } + + Y_ENSURE_EX(!spec.ClusterName_.empty(), TApiUsageError() << "ClusterName parameter is required"); + + TNode specNode = BuildYsonNodeFluently() + .BeginMap().Item("spec").BeginMap() + .Item("cluster_name").Value(spec.ClusterName_) + .Item("input_table_paths").List(inputs) + .Item("output_table_path").Value(output) + .DoIf(spec.NetworkName_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("network_name").Value(*spec.NetworkName_); + }) + .DoIf(spec.SchemaInferenceMode_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("schema_inference_mode").Value(ToString(*spec.SchemaInferenceMode_)); + }) + .Item("copy_attributes").Value(spec.CopyAttributes_) + .DoIf(!spec.AttributeKeys_.empty(), [&] (TFluentMap fluent) { + Y_ENSURE_EX(spec.CopyAttributes_, TApiUsageError() << + "Specifying nonempty AttributeKeys in RemoteCopy " + "doesn't make sense without CopyAttributes == true"); + fluent.Item("attribute_keys").List(spec.AttributeKeys_); + }) + .Do(std::bind(BuildCommonOperationPart<TRemoteCopyOperationSpec>, preparer->GetContext().Config, spec, options, std::placeholders::_1)) + .EndMap().EndMap(); + + auto startOperation = [ + operation=operation.Get(), + spec=MergeSpec(specNode, preparer->GetContext().Config->Spec, options), + preparer, + inputs, + output + ] () { + auto operationId = preparer->StartOperation(operation, "remote_copy", spec); + + LogYPaths(operationId, inputs, "input"); + LogYPath(operationId, output, "output"); + + return operationId; + }; + + operation->SetDelayedStartFunction(std::move(startOperation)); +} + +void ExecuteVanilla( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TVanillaOperationSpec& spec, + const TOperationOptions& options) +{ + YT_LOG_DEBUG("Starting vanilla operation (PreparationId: %v)", + preparer->GetPreparationId()); + 
+ auto addTask = [&](TFluentMap fluent, const TVanillaTask& task) { + Y_VERIFY(task.Job_.Get()); + if (std::holds_alternative<TVoidStructuredRowStream>(task.Job_->GetOutputRowStreamDescription())) { + Y_ENSURE_EX(task.Outputs_.empty(), + TApiUsageError() << "Vanilla task with void IVanillaJob doesn't expect output tables"); + TJobPreparer jobPreparer( + *preparer, + task.Spec_, + *task.Job_, + /* outputTableCount */ 0, + /* smallFileList */ {}, + options); + fluent + .Item(task.Name_).BeginMap() + .Item("job_count").Value(task.JobCount_) + .DoIf(task.NetworkProject_.Defined(), [&](TFluentMap fluent) { + fluent.Item("network_project").Value(*task.NetworkProject_); + }) + .Do([&] (TFluentMap fluent) { + BuildUserJobFluently( + std::cref(jobPreparer), + /* inputFormat */ Nothing(), + /* outputFormat */ Nothing(), + fluent); + }) + .EndMap(); + } else { + auto operationIo = CreateSimpleOperationIo( + *task.Job_, + *preparer, + task, + options, + false); + Y_ENSURE_EX(operationIo.Outputs.size() > 0, + TApiUsageError() << "Vanilla task with IVanillaJob that has table writer expects output tables"); + if (options.CreateOutputTables_) { + CreateOutputTables(*preparer, operationIo.Outputs); + } + TJobPreparer jobPreparer( + *preparer, + task.Spec_, + *task.Job_, + operationIo.Outputs.size(), + operationIo.JobFiles, + options); + fluent + .Item(task.Name_).BeginMap() + .Item("job_count").Value(task.JobCount_) + .DoIf(task.NetworkProject_.Defined(), [&](TFluentMap fluent) { + fluent.Item("network_project").Value(*task.NetworkProject_); + }) + .Do([&] (TFluentMap fluent) { + BuildUserJobFluently( + std::cref(jobPreparer), + /* inputFormat */ Nothing(), + operationIo.OutputFormat, + fluent); + }) + .Item("output_table_paths").List(operationIo.Outputs) + .Item("job_io").BeginMap() + .DoIf(!preparer->GetContext().Config->TableWriter.Empty(), [&](TFluentMap fluent) { + fluent.Item("table_writer").Value(preparer->GetContext().Config->TableWriter); + }) + 
.Item("control_attributes").BeginMap() + .Item("enable_row_index").Value(TNode(true)) + .Item("enable_range_index").Value(TNode(true)) + .EndMap() + .EndMap() + .EndMap(); + } + }; + + if (options.CreateDebugOutputTables_) { + CreateDebugOutputTables(spec, *preparer); + } + + TNode specNode = BuildYsonNodeFluently() + .BeginMap().Item("spec").BeginMap() + .Item("tasks").DoMapFor(spec.Tasks_, addTask) + .Do(std::bind(BuildCommonOperationPart<TVanillaOperationSpec>, preparer->GetContext().Config, spec, options, std::placeholders::_1)) + .EndMap().EndMap(); + + BuildCommonUserOperationPart(spec, &specNode["spec"]); + + auto startOperation = [operation=operation.Get(), spec=MergeSpec(std::move(specNode), preparer->GetContext().Config->Spec, options), preparer] () { + auto operationId = preparer->StartOperation(operation, "vanilla", spec, /* useStartOperationRequest */ true); + return operationId; + }; + + operation->SetDelayedStartFunction(std::move(startOperation)); +} + +//////////////////////////////////////////////////////////////////////////////// + +class TOperation::TOperationImpl + : public TThrRefBase +{ +public: + TOperationImpl( + IClientRetryPolicyPtr clientRetryPolicy, + TClientContext context, + const TMaybe<TOperationId>& operationId = {}) + : ClientRetryPolicy_(clientRetryPolicy) + , Context_(std::move(context)) + , Id_(operationId) + , PreparedPromise_(::NThreading::NewPromise<void>()) + , StartedPromise_(::NThreading::NewPromise<void>()) + { + if (Id_) { + PreparedPromise_.SetValue(); + StartedPromise_.SetValue(); + } else { + PreparedPromise_.GetFuture().Subscribe([this_=::TIntrusivePtr(this)] (const ::NThreading::TFuture<void>& preparedResult) { + try { + preparedResult.GetValue(); + } catch (...) 
{ + this_->StartedPromise_.SetException(std::current_exception()); + return; + } + }); + } + } + + const TOperationId& GetId() const; + TString GetWebInterfaceUrl() const; + + void OnPrepared(); + void SetDelayedStartFunction(std::function<TOperationId()> start); + void Start(); + bool IsStarted() const; + void OnPreparationException(std::exception_ptr e); + + TString GetStatus(); + void OnStatusUpdated(const TString& newStatus); + + ::NThreading::TFuture<void> GetPreparedFuture(); + ::NThreading::TFuture<void> GetStartedFuture(); + ::NThreading::TFuture<void> Watch(TClientPtr client); + + EOperationBriefState GetBriefState(); + TMaybe<TYtError> GetError(); + TJobStatistics GetJobStatistics(); + TMaybe<TOperationBriefProgress> GetBriefProgress(); + void AbortOperation(); + void CompleteOperation(); + void SuspendOperation(const TSuspendOperationOptions& options); + void ResumeOperation(const TResumeOperationOptions& options); + TOperationAttributes GetAttributes(const TGetOperationOptions& options); + void UpdateParameters(const TUpdateOperationParametersOptions& options); + TJobAttributes GetJob(const TJobId& jobId, const TGetJobOptions& options); + TListJobsResult ListJobs(const TListJobsOptions& options); + + void AsyncFinishOperation(TOperationAttributes operationAttributes); + void FinishWithException(std::exception_ptr exception); + void UpdateBriefProgress(TMaybe<TOperationBriefProgress> briefProgress); + void AnalyzeUnrecognizedSpec(TNode unrecognizedSpec); + + const TClientContext& GetContext() const; + +private: + void OnStarted(const TOperationId& operationId); + + void UpdateAttributesAndCall(bool needJobStatistics, std::function<void(const TOperationAttributes&)> func); + + void SyncFinishOperationImpl(const TOperationAttributes&); + static void* SyncFinishOperationProc(void* ); + + void ValidateOperationStarted() const; + +private: + IClientRetryPolicyPtr ClientRetryPolicy_; + const TClientContext Context_; + TMaybe<TOperationId> Id_; + TMutex Lock_; 
+ + ::NThreading::TPromise<void> PreparedPromise_; + ::NThreading::TPromise<void> StartedPromise_; + TMaybe<::NThreading::TPromise<void>> CompletePromise_; + + std::function<TOperationId()> DelayedStartFunction_; + TString Status_; + TOperationAttributes Attributes_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TOperationPollerItem + : public IYtPollerItem +{ +public: + TOperationPollerItem(::TIntrusivePtr<TOperation::TOperationImpl> operationImpl) + : OperationImpl_(std::move(operationImpl)) + { } + + void PrepareRequest(TRawBatchRequest* batchRequest) override + { + auto filter = TOperationAttributeFilter() + .Add(EOperationAttribute::State) + .Add(EOperationAttribute::BriefProgress) + .Add(EOperationAttribute::Result); + + if (!UnrecognizedSpecAnalyzed_) { + filter.Add(EOperationAttribute::UnrecognizedSpec); + } + + OperationState_ = batchRequest->GetOperation( + OperationImpl_->GetId(), + TGetOperationOptions().AttributeFilter(filter)); + } + + EStatus OnRequestExecuted() override + { + try { + const auto& attributes = OperationState_.GetValue(); + if (!UnrecognizedSpecAnalyzed_ && !attributes.UnrecognizedSpec.Empty()) { + OperationImpl_->AnalyzeUnrecognizedSpec(*attributes.UnrecognizedSpec); + UnrecognizedSpecAnalyzed_ = true; + } + Y_VERIFY(attributes.BriefState, + "get_operation for operation %s has not returned \"state\" field", + GetGuidAsString(OperationImpl_->GetId()).Data()); + if (*attributes.BriefState != EOperationBriefState::InProgress) { + OperationImpl_->AsyncFinishOperation(attributes); + return PollBreak; + } else { + OperationImpl_->UpdateBriefProgress(attributes.BriefProgress); + } + } catch (const TErrorResponse& e) { + if (!IsRetriable(e)) { + OperationImpl_->FinishWithException(std::current_exception()); + return PollBreak; + } + } catch (const std::exception& e) { + OperationImpl_->FinishWithException(std::current_exception()); + return PollBreak; + } + return PollContinue; + } + + void 
OnItemDiscarded() override { + OperationImpl_->FinishWithException(std::make_exception_ptr(yexception() << "Operation cancelled")); + } + +private: + ::TIntrusivePtr<TOperation::TOperationImpl> OperationImpl_; + ::NThreading::TFuture<TOperationAttributes> OperationState_; + bool UnrecognizedSpecAnalyzed_ = false; +}; + +//////////////////////////////////////////////////////////////////////////////// + +const TOperationId& TOperation::TOperationImpl::GetId() const +{ + ValidateOperationStarted(); + return *Id_; +} + +TString TOperation::TOperationImpl::GetWebInterfaceUrl() const +{ + ValidateOperationStarted(); + return GetOperationWebInterfaceUrl(Context_.ServerName, *Id_); +} + +void TOperation::TOperationImpl::OnPrepared() +{ + Y_VERIFY(!PreparedPromise_.HasException() && !PreparedPromise_.HasValue()); + PreparedPromise_.SetValue(); +} + +void TOperation::TOperationImpl::SetDelayedStartFunction(std::function<TOperationId()> start) +{ + DelayedStartFunction_ = std::move(start); +} + +void TOperation::TOperationImpl::Start() +{ + { + auto guard = Guard(Lock_); + if (Id_) { + ythrow TApiUsageError() << "Start() should not be called on running operations"; + } + } + GetPreparedFuture().GetValueSync(); + + std::function<TOperationId()> startStuff; + { + auto guard = Guard(Lock_); + startStuff.swap(DelayedStartFunction_); + } + if (!startStuff) { + ythrow TApiUsageError() << "Seems that Start() was called multiple times. If not, contact yt@"; + } + + TOperationId operationId; + try { + operationId = startStuff(); + } catch (...) 
{ + auto exception = std::current_exception(); + StartedPromise_.SetException(exception); + std::rethrow_exception(exception); + } + OnStarted(operationId); +} + +bool TOperation::TOperationImpl::IsStarted() const { + auto guard = Guard(Lock_); + return bool(Id_); +} + +void TOperation::TOperationImpl::OnPreparationException(std::exception_ptr e) +{ + Y_VERIFY(!PreparedPromise_.HasValue() && !PreparedPromise_.HasException()); + PreparedPromise_.SetException(e); +} + +TString TOperation::TOperationImpl::GetStatus() +{ + { + auto guard = Guard(Lock_); + if (!Id_) { + return Status_; + } + } + TMaybe<TString> state; + UpdateAttributesAndCall(false, [&] (const TOperationAttributes& attributes) { + state = attributes.State; + }); + + return "On YT cluster: " + state.GetOrElse("undefined state"); +} + +void TOperation::TOperationImpl::OnStatusUpdated(const TString& newStatus) +{ + auto guard = Guard(Lock_); + Status_ = newStatus; +} + +::NThreading::TFuture<void> TOperation::TOperationImpl::GetPreparedFuture() +{ + return PreparedPromise_.GetFuture(); +} + +::NThreading::TFuture<void> TOperation::TOperationImpl::GetStartedFuture() +{ + return StartedPromise_.GetFuture(); +} + +::NThreading::TFuture<void> TOperation::TOperationImpl::Watch(TClientPtr client) +{ + { + auto guard = Guard(Lock_); + if (CompletePromise_) { + return *CompletePromise_; + } + CompletePromise_ = ::NThreading::NewPromise<void>(); + } + GetStartedFuture().Subscribe([ + this_=::TIntrusivePtr(this), + client=std::move(client) + ] (const ::NThreading::TFuture<void>& startedResult) { + try { + startedResult.GetValue(); + } catch (...) 
{ + this_->CompletePromise_->SetException(std::current_exception()); + return; + } + client->GetYtPoller().Watch(::MakeIntrusive<TOperationPollerItem>(this_)); + auto operationId = this_->GetId(); + auto registry = TAbortableRegistry::Get(); + registry->Add( + operationId, + ::MakeIntrusive<TOperationAbortable>(this_->ClientRetryPolicy_, this_->Context_, operationId)); + // We have to own an IntrusivePtr to registry to prevent use-after-free + auto removeOperation = [registry, operationId] (const ::NThreading::TFuture<void>&) { + registry->Remove(operationId); + }; + this_->CompletePromise_->GetFuture().Subscribe(removeOperation); + }); + + return *CompletePromise_; +} + +EOperationBriefState TOperation::TOperationImpl::GetBriefState() +{ + ValidateOperationStarted(); + EOperationBriefState result = EOperationBriefState::InProgress; + UpdateAttributesAndCall(false, [&] (const TOperationAttributes& attributes) { + Y_VERIFY(attributes.BriefState, + "get_operation for operation %s has not returned \"state\" field", + GetGuidAsString(*Id_).Data()); + result = *attributes.BriefState; + }); + return result; +} + +TMaybe<TYtError> TOperation::TOperationImpl::GetError() +{ + ValidateOperationStarted(); + TMaybe<TYtError> result; + UpdateAttributesAndCall(false, [&] (const TOperationAttributes& attributes) { + Y_VERIFY(attributes.Result); + result = attributes.Result->Error; + }); + return result; +} + +TJobStatistics TOperation::TOperationImpl::GetJobStatistics() +{ + ValidateOperationStarted(); + TJobStatistics result; + UpdateAttributesAndCall(true, [&] (const TOperationAttributes& attributes) { + if (attributes.Progress) { + result = attributes.Progress->JobStatistics; + } + }); + return result; +} + +TMaybe<TOperationBriefProgress> TOperation::TOperationImpl::GetBriefProgress() +{ + ValidateOperationStarted(); + { + auto g = Guard(Lock_); + if (CompletePromise_.Defined()) { + // Poller do this job for us + return Attributes_.BriefProgress; + } + } + 
TMaybe<TOperationBriefProgress> result; + UpdateAttributesAndCall(false, [&] (const TOperationAttributes& attributes) { + result = attributes.BriefProgress; + }); + return result; +} + +void TOperation::TOperationImpl::UpdateBriefProgress(TMaybe<TOperationBriefProgress> briefProgress) +{ + auto g = Guard(Lock_); + Attributes_.BriefProgress = std::move(briefProgress); +} + +void TOperation::TOperationImpl::AnalyzeUnrecognizedSpec(TNode unrecognizedSpec) +{ + static const TVector<TVector<TString>> knownUnrecognizedSpecFieldPaths = { + {"mapper", "class_name"}, + {"reducer", "class_name"}, + {"reduce_combiner", "class_name"}, + }; + + auto removeByPath = [] (TNode& node, auto pathBegin, auto pathEnd, auto& removeByPath) { + if (pathBegin == pathEnd) { + return; + } + if (!node.IsMap()) { + return; + } + auto* child = node.AsMap().FindPtr(*pathBegin); + if (!child) { + return; + } + removeByPath(*child, std::next(pathBegin), pathEnd, removeByPath); + if (std::next(pathBegin) == pathEnd || (child->IsMap() && child->Empty())) { + node.AsMap().erase(*pathBegin); + } + }; + + Y_VERIFY(unrecognizedSpec.IsMap()); + for (const auto& knownFieldPath : knownUnrecognizedSpecFieldPaths) { + Y_VERIFY(!knownFieldPath.empty()); + removeByPath(unrecognizedSpec, knownFieldPath.cbegin(), knownFieldPath.cend(), removeByPath); + } + + if (!unrecognizedSpec.Empty()) { + YT_LOG_INFO( + "WARNING! 
Unrecognized spec for operation %s is not empty " + "(fields added by the YT API library are excluded): %s", + GetGuidAsString(*Id_).Data(), + NodeToYsonString(unrecognizedSpec).Data()); + } +} + +void TOperation::TOperationImpl::OnStarted(const TOperationId& operationId) +{ + auto guard = Guard(Lock_); + Y_VERIFY(!Id_, + "OnStarted() called with operationId = %s for operation with id %s", + GetGuidAsString(operationId).Data(), + GetGuidAsString(*Id_).Data()); + Id_ = operationId; + + Y_VERIFY(!StartedPromise_.HasValue() && !StartedPromise_.HasException()); + StartedPromise_.SetValue(); +} + +void TOperation::TOperationImpl::UpdateAttributesAndCall(bool needJobStatistics, std::function<void(const TOperationAttributes&)> func) +{ + { + auto g = Guard(Lock_); + if (Attributes_.BriefState + && *Attributes_.BriefState != EOperationBriefState::InProgress + && (!needJobStatistics || Attributes_.Progress)) + { + func(Attributes_); + return; + } + } + + TOperationAttributes attributes = NDetail::GetOperation( + ClientRetryPolicy_->CreatePolicyForGenericRequest(), + Context_, + *Id_, + TGetOperationOptions().AttributeFilter(TOperationAttributeFilter() + .Add(EOperationAttribute::Result) + .Add(EOperationAttribute::Progress) + .Add(EOperationAttribute::State) + .Add(EOperationAttribute::BriefProgress))); + + func(attributes); + + Y_ENSURE(attributes.BriefState); + if (*attributes.BriefState != EOperationBriefState::InProgress) { + auto g = Guard(Lock_); + Attributes_ = std::move(attributes); + } +} + +void TOperation::TOperationImpl::FinishWithException(std::exception_ptr e) +{ + CompletePromise_->SetException(std::move(e)); +} + +void TOperation::TOperationImpl::AbortOperation() +{ + ValidateOperationStarted(); + NYT::NDetail::AbortOperation(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, *Id_); +} + +void TOperation::TOperationImpl::CompleteOperation() +{ + ValidateOperationStarted(); + 
NYT::NDetail::CompleteOperation(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, *Id_); +} + +void TOperation::TOperationImpl::SuspendOperation(const TSuspendOperationOptions& options) +{ + ValidateOperationStarted(); + NYT::NDetail::SuspendOperation(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, *Id_, options); +} + +void TOperation::TOperationImpl::ResumeOperation(const TResumeOperationOptions& options) +{ + ValidateOperationStarted(); + NYT::NDetail::ResumeOperation(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, *Id_, options); +} + +TOperationAttributes TOperation::TOperationImpl::GetAttributes(const TGetOperationOptions& options) +{ + ValidateOperationStarted(); + return NYT::NDetail::GetOperation(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, *Id_, options); +} + +void TOperation::TOperationImpl::UpdateParameters(const TUpdateOperationParametersOptions& options) +{ + ValidateOperationStarted(); + return NYT::NDetail::UpdateOperationParameters(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, *Id_, options); +} + +TJobAttributes TOperation::TOperationImpl::GetJob(const TJobId& jobId, const TGetJobOptions& options) +{ + ValidateOperationStarted(); + return NYT::NDetail::GetJob(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, *Id_, jobId, options); +} + +TListJobsResult TOperation::TOperationImpl::ListJobs(const TListJobsOptions& options) +{ + ValidateOperationStarted(); + return NYT::NDetail::ListJobs(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, *Id_, options); +} + +struct TAsyncFinishOperationsArgs +{ + ::TIntrusivePtr<TOperation::TOperationImpl> OperationImpl; + TOperationAttributes OperationAttributes; +}; + +void TOperation::TOperationImpl::AsyncFinishOperation(TOperationAttributes operationAttributes) +{ + auto args = new TAsyncFinishOperationsArgs; + args->OperationImpl = this; + args->OperationAttributes = std::move(operationAttributes); + + 
TThread thread(TThread::TParams(&TOperation::TOperationImpl::SyncFinishOperationProc, args).SetName("finish operation")); + thread.Start(); + thread.Detach(); +} + +void* TOperation::TOperationImpl::SyncFinishOperationProc(void* pArgs) +{ + THolder<TAsyncFinishOperationsArgs> args(static_cast<TAsyncFinishOperationsArgs*>(pArgs)); + args->OperationImpl->SyncFinishOperationImpl(args->OperationAttributes); + return nullptr; +} + +void TOperation::TOperationImpl::SyncFinishOperationImpl(const TOperationAttributes& attributes) +{ + { + auto guard = Guard(Lock_); + Y_VERIFY(Id_); + } + Y_VERIFY(attributes.BriefState, + "get_operation for operation %s has not returned \"state\" field", + GetGuidAsString(*Id_).Data()); + Y_VERIFY(*attributes.BriefState != EOperationBriefState::InProgress); + + { + try { + // `attributes' that came from poller don't have JobStatistics + // so we call `GetJobStatistics' in order to get it from server + // and cache inside object. + GetJobStatistics(); + } catch (const TErrorResponse& ) { + // But if for any reason we failed to get attributes + // we complete operation using what we have. 
+ auto g = Guard(Lock_); + Attributes_ = attributes; + } + } + + if (*attributes.BriefState == EOperationBriefState::Completed) { + CompletePromise_->SetValue(); + } else if (*attributes.BriefState == EOperationBriefState::Aborted || *attributes.BriefState == EOperationBriefState::Failed) { + Y_VERIFY(attributes.Result && attributes.Result->Error); + const auto& error = *attributes.Result->Error; + YT_LOG_ERROR("Operation %v is `%v' with error: %v", + *Id_, + ToString(*attributes.BriefState), + error.FullDescription()); + + TString additionalExceptionText; + TVector<TFailedJobInfo> failedJobStderrInfo; + if (*attributes.BriefState == EOperationBriefState::Failed) { + try { + failedJobStderrInfo = NYT::NDetail::GetFailedJobInfo(ClientRetryPolicy_, Context_, *Id_, TGetFailedJobInfoOptions()); + } catch (const std::exception& e) { + additionalExceptionText = "Cannot get job stderrs: "; + additionalExceptionText += e.what(); + } + } + CompletePromise_->SetException( + std::make_exception_ptr( + TOperationFailedError( + *attributes.BriefState == EOperationBriefState::Failed + ? 
TOperationFailedError::Failed + : TOperationFailedError::Aborted, + *Id_, + error, + failedJobStderrInfo) << additionalExceptionText)); + } +} + +void TOperation::TOperationImpl::ValidateOperationStarted() const +{ + auto guard = Guard(Lock_); + if (!Id_) { + ythrow TApiUsageError() << "Operation is not started"; + } +} + +const TClientContext& TOperation::TOperationImpl::GetContext() const +{ + return Context_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TOperation::TOperation(TClientPtr client) + : Client_(std::move(client)) + , Impl_(::MakeIntrusive<TOperationImpl>(Client_->GetRetryPolicy(), Client_->GetContext())) +{ +} + +TOperation::TOperation(TOperationId id, TClientPtr client) + : Client_(std::move(client)) + , Impl_(::MakeIntrusive<TOperationImpl>(Client_->GetRetryPolicy(), Client_->GetContext(), id)) +{ +} + +const TOperationId& TOperation::GetId() const +{ + return Impl_->GetId(); +} + +TString TOperation::GetWebInterfaceUrl() const +{ + return Impl_->GetWebInterfaceUrl(); +} + +void TOperation::OnPrepared() +{ + Impl_->OnPrepared(); +} + +void TOperation::SetDelayedStartFunction(std::function<TOperationId()> start) +{ + Impl_->SetDelayedStartFunction(std::move(start)); +} + +void TOperation::Start() +{ + Impl_->Start(); +} + +bool TOperation::IsStarted() const +{ + return Impl_->IsStarted(); +} + +void TOperation::OnPreparationException(std::exception_ptr e) +{ + Impl_->OnPreparationException(std::move(e)); +} + +TString TOperation::GetStatus() const +{ + return Impl_->GetStatus(); +} + +void TOperation::OnStatusUpdated(const TString& newStatus) +{ + Impl_->OnStatusUpdated(newStatus); +} + +::NThreading::TFuture<void> TOperation::GetPreparedFuture() +{ + return Impl_->GetPreparedFuture(); +} + +::NThreading::TFuture<void> TOperation::GetStartedFuture() +{ + return Impl_->GetStartedFuture(); +} + +::NThreading::TFuture<void> TOperation::Watch() +{ + return Impl_->Watch(Client_); +} + +TVector<TFailedJobInfo> 
TOperation::GetFailedJobInfo(const TGetFailedJobInfoOptions& options) +{ + return NYT::NDetail::GetFailedJobInfo(Client_->GetRetryPolicy(), Client_->GetContext(), GetId(), options); +} + +EOperationBriefState TOperation::GetBriefState() +{ + return Impl_->GetBriefState(); +} + +TMaybe<TYtError> TOperation::GetError() +{ + return Impl_->GetError(); +} + +TJobStatistics TOperation::GetJobStatistics() +{ + return Impl_->GetJobStatistics(); +} + +TMaybe<TOperationBriefProgress> TOperation::GetBriefProgress() +{ + return Impl_->GetBriefProgress(); +} + +void TOperation::AbortOperation() +{ + Impl_->AbortOperation(); +} + +void TOperation::CompleteOperation() +{ + Impl_->CompleteOperation(); +} + +void TOperation::SuspendOperation(const TSuspendOperationOptions& options) +{ + Impl_->SuspendOperation(options); +} + +void TOperation::ResumeOperation(const TResumeOperationOptions& options) +{ + Impl_->ResumeOperation(options); +} + +TOperationAttributes TOperation::GetAttributes(const TGetOperationOptions& options) +{ + return Impl_->GetAttributes(options); +} + +void TOperation::UpdateParameters(const TUpdateOperationParametersOptions& options) +{ + Impl_->UpdateParameters(options); +} + +TJobAttributes TOperation::GetJob(const TJobId& jobId, const TGetJobOptions& options) +{ + return Impl_->GetJob(jobId, options); +} + +TListJobsResult TOperation::ListJobs(const TListJobsOptions& options) +{ + return Impl_->ListJobs(options); +} + +//////////////////////////////////////////////////////////////////////////////// + +struct TAsyncPrepareAndStartOperationArgs +{ + std::function<void()> PrepareAndStart; +}; + +void* SyncPrepareAndStartOperation(void* pArgs) +{ + THolder<TAsyncPrepareAndStartOperationArgs> args(static_cast<TAsyncPrepareAndStartOperationArgs*>(pArgs)); + args->PrepareAndStart(); + return nullptr; +} + +::TIntrusivePtr<TOperation> ProcessOperation( + NYT::NDetail::TClientPtr client, + std::function<void()> prepare, + ::TIntrusivePtr<TOperation> operation, + const 
TOperationOptions& options) +{ + auto prepareAndStart = [prepare = std::move(prepare), operation, mode = options.StartOperationMode_] () { + try { + prepare(); + operation->OnPrepared(); + } catch (...) { + operation->OnPreparationException(std::current_exception()); + } + if (mode >= TOperationOptions::EStartOperationMode::AsyncStart) { + try { + operation->Start(); + } catch (...) { } + } + }; + if (options.StartOperationMode_ >= TOperationOptions::EStartOperationMode::SyncStart) { + prepareAndStart(); + WaitIfRequired(operation, client, options); + } else { + auto args = new TAsyncPrepareAndStartOperationArgs; + args->PrepareAndStart = std::move(prepareAndStart); + + TThread thread(TThread::TParams(SyncPrepareAndStartOperation, args).SetName("prepare and start operation")); + thread.Start(); + thread.Detach(); + } + return operation; +} + +void WaitIfRequired(const TOperationPtr& operation, const TClientPtr& client, const TOperationOptions& options) +{ + auto retryPolicy = client->GetRetryPolicy(); + auto context = client->GetContext(); + if (options.StartOperationMode_ >= TOperationOptions::EStartOperationMode::SyncStart) { + operation->GetStartedFuture().GetValueSync(); + } + if (options.StartOperationMode_ == TOperationOptions::EStartOperationMode::SyncWait) { + auto finishedFuture = operation->Watch(); + TWaitProxy::Get()->WaitFuture(finishedFuture); + finishedFuture.GetValue(); + if (context.Config->WriteStderrSuccessfulJobs) { + auto stderrs = GetJobsStderr(retryPolicy, context, operation->GetId()); + for (const auto& jobStderr : stderrs) { + if (!jobStderr.empty()) { + Cerr << jobStderr << '\n'; + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +void ResetUseClientProtobuf(const char* methodName) +{ + Cerr << "WARNING! OPTION `TConfig::UseClientProtobuf' IS RESET TO `true'; " + << "IT CAN DETERIORATE YOUR CODE PERFORMANCE!!! 
DON'T USE DEPRECATED METHOD `" + << "TOperationIOSpec::" << methodName << "' TO AVOID THIS RESET" << Endl; + // Give users some time to contemplate about usage of deprecated functions. + Cerr << "Sleeping for 5 seconds..." << Endl; + Sleep(TDuration::Seconds(5)); + TConfig::Get()->UseClientProtobuf = true; +} + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +::TIntrusivePtr<INodeReaderImpl> CreateJobNodeReader(TRawTableReaderPtr rawTableReader) +{ + if (auto schema = NDetail::GetJobInputSkiffSchema()) { + return new NDetail::TSkiffTableReader(rawTableReader, schema); + } else { + return new TNodeTableReader(rawTableReader); + } +} + +::TIntrusivePtr<IYaMRReaderImpl> CreateJobYaMRReader(TRawTableReaderPtr rawTableReader) +{ + return new TYaMRTableReader(rawTableReader); +} + +::TIntrusivePtr<IProtoReaderImpl> CreateJobProtoReader(TRawTableReaderPtr rawTableReader) +{ + if (TConfig::Get()->UseClientProtobuf) { + return new TProtoTableReader( + rawTableReader, + GetJobInputDescriptors()); + } else { + return new TLenvalProtoTableReader( + rawTableReader, + GetJobInputDescriptors()); + } +} + +::TIntrusivePtr<INodeWriterImpl> CreateJobNodeWriter(THolder<IProxyOutput> rawJobWriter) +{ + return new TNodeTableWriter(std::move(rawJobWriter)); +} + +::TIntrusivePtr<IYaMRWriterImpl> CreateJobYaMRWriter(THolder<IProxyOutput> rawJobWriter) +{ + return new TYaMRTableWriter(std::move(rawJobWriter)); +} + +::TIntrusivePtr<IProtoWriterImpl> CreateJobProtoWriter(THolder<IProxyOutput> rawJobWriter) +{ + if (TConfig::Get()->UseClientProtobuf) { + return new TProtoTableWriter( + std::move(rawJobWriter), + GetJobOutputDescriptors()); + } else { + return new TLenvalProtoTableWriter( + std::move(rawJobWriter), + GetJobOutputDescriptors()); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/operation.h 
b/yt/cpp/mapreduce/client/operation.h new file mode 100644 index 0000000000..141161b0b7 --- /dev/null +++ b/yt/cpp/mapreduce/client/operation.h @@ -0,0 +1,203 @@ +#pragma once + +#include "fwd.h" +#include "structured_table_formats.h" +#include "operation_preparer.h" + +#include <yt/cpp/mapreduce/http/fwd.h> + +#include <yt/cpp/mapreduce/interface/client.h> +#include <yt/cpp/mapreduce/interface/operation.h> +#include <yt/cpp/mapreduce/interface/retry_policy.h> + +#include <util/generic/ptr.h> +#include <util/generic/vector.h> + +namespace NYT::NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +class TOperation + : public IOperation +{ +public: + class TOperationImpl; + +public: + explicit TOperation(TClientPtr client); + TOperation(TOperationId id, TClientPtr client); + virtual const TOperationId& GetId() const override; + virtual TString GetWebInterfaceUrl() const override; + + void OnPrepared(); + void SetDelayedStartFunction(std::function<TOperationId()> start); + virtual void Start() override; + void OnPreparationException(std::exception_ptr e); + virtual bool IsStarted() const override; + + virtual TString GetStatus() const override; + void OnStatusUpdated(const TString& newStatus); + + virtual ::NThreading::TFuture<void> GetPreparedFuture() override; + virtual ::NThreading::TFuture<void> GetStartedFuture() override; + virtual ::NThreading::TFuture<void> Watch() override; + + virtual TVector<TFailedJobInfo> GetFailedJobInfo(const TGetFailedJobInfoOptions& options = TGetFailedJobInfoOptions()) override; + virtual EOperationBriefState GetBriefState() override; + virtual TMaybe<TYtError> GetError() override; + virtual TJobStatistics GetJobStatistics() override; + virtual TMaybe<TOperationBriefProgress> GetBriefProgress() override; + virtual void AbortOperation() override; + virtual void CompleteOperation() override; + virtual void SuspendOperation(const TSuspendOperationOptions& options) override; + virtual void 
ResumeOperation(const TResumeOperationOptions& options) override; + virtual TOperationAttributes GetAttributes(const TGetOperationOptions& options) override; + virtual void UpdateParameters(const TUpdateOperationParametersOptions& options) override; + virtual TJobAttributes GetJob(const TJobId& jobId, const TGetJobOptions& options) override; + virtual TListJobsResult ListJobs(const TListJobsOptions& options) override; + +private: + TClientPtr Client_; + ::TIntrusivePtr<TOperationImpl> Impl_; +}; + +using TOperationPtr = ::TIntrusivePtr<TOperation>; + +//////////////////////////////////////////////////////////////////////////////// + +struct TSimpleOperationIo +{ + TVector<TRichYPath> Inputs; + TVector<TRichYPath> Outputs; + + TFormat InputFormat; + TFormat OutputFormat; + + TVector<TSmallJobFile> JobFiles; +}; + +TSimpleOperationIo CreateSimpleOperationIoHelper( + const IStructuredJob& structuredJob, + const TOperationPreparer& preparer, + const TOperationOptions& options, + TStructuredJobTableList structuredInputs, + TStructuredJobTableList structuredOutputs, + TUserJobFormatHints hints, + ENodeReaderFormat nodeReaderFormat, + const THashSet<TString>& columnsUsedInOperations); + +//////////////////////////////////////////////////////////////////////////////// + +void ExecuteMap( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TMapOperationSpec& spec, + const ::TIntrusivePtr<IStructuredJob>& mapper, + const TOperationOptions& options); + +void ExecuteRawMap( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TRawMapOperationSpec& spec, + const ::TIntrusivePtr<IRawJob>& mapper, + const TOperationOptions& options); + +void ExecuteReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TReduceOperationSpec& spec, + const ::TIntrusivePtr<IStructuredJob>& reducer, + const TOperationOptions& options); + +void ExecuteRawReduce( + const TOperationPtr& operation, + const 
TOperationPreparerPtr& preparer, + const TRawReduceOperationSpec& spec, + const ::TIntrusivePtr<IRawJob>& reducer, + const TOperationOptions& options); + +void ExecuteJoinReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TJoinReduceOperationSpec& spec, + const ::TIntrusivePtr<IStructuredJob>& reducer, + const TOperationOptions& options); + +void ExecuteRawJoinReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TRawJoinReduceOperationSpec& spec, + const ::TIntrusivePtr<IRawJob>& reducer, + const TOperationOptions& options); + +void ExecuteMapReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TMapReduceOperationSpec& spec, + const ::TIntrusivePtr<IStructuredJob>& mapper, + const ::TIntrusivePtr<IStructuredJob>& reduceCombiner, + const ::TIntrusivePtr<IStructuredJob>& reducer, + const TOperationOptions& options); + +void ExecuteRawMapReduce( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TRawMapReduceOperationSpec& spec, + const ::TIntrusivePtr<IRawJob>& mapper, + const ::TIntrusivePtr<IRawJob>& reduceCombiner, + const ::TIntrusivePtr<IRawJob>& reducer, + const TOperationOptions& options); + +void ExecuteSort( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TSortOperationSpec& spec, + const TOperationOptions& options); + +void ExecuteMerge( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TMergeOperationSpec& spec, + const TOperationOptions& options); + +void ExecuteErase( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TEraseOperationSpec& spec, + const TOperationOptions& options); + +void ExecuteRemoteCopy( + const TOperationPtr& operation, + const TOperationPreparerPtr& preparer, + const TRemoteCopyOperationSpec& spec, + const TOperationOptions& options); + +void ExecuteVanilla( + const TOperationPtr& 
operation, + const TOperationPreparerPtr& preparer, + const TVanillaOperationSpec& spec, + const TOperationOptions& options); + +EOperationBriefState CheckOperation( + const IClientRetryPolicyPtr& clientRetryPolicy, + const TClientContext& context, + const TOperationId& operationId); + +void WaitForOperation( + const IClientRetryPolicyPtr& clientRetryPolicy, + const TClientContext& context, + const TOperationId& operationId); + +//////////////////////////////////////////////////////////////////////////////// + +::TIntrusivePtr<TOperation> ProcessOperation( + NYT::NDetail::TClientPtr client, + std::function<void()> prepare, + ::TIntrusivePtr<TOperation> operation, + const TOperationOptions& options); + +void WaitIfRequired(const TOperationPtr& operation, const TClientPtr& client, const TOperationOptions& options); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail diff --git a/yt/cpp/mapreduce/client/operation_helpers.cpp b/yt/cpp/mapreduce/client/operation_helpers.cpp new file mode 100644 index 0000000000..abb2185662 --- /dev/null +++ b/yt/cpp/mapreduce/client/operation_helpers.cpp @@ -0,0 +1,91 @@ +#include "operation_helpers.h" + +#include <yt/cpp/mapreduce/common/retry_lib.h> + +#include <yt/cpp/mapreduce/interface/config.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> + +#include <yt/cpp/mapreduce/http/context.h> +#include <yt/cpp/mapreduce/http/requests.h> + +#include <util/string/builder.h> + +#include <util/system/mutex.h> +#include <util/system/rwlock.h> + +namespace NYT::NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +ui64 RoundUpFileSize(ui64 size) +{ + constexpr ui64 roundUpTo = 4ull << 10; + return (size + roundUpTo - 1) & ~(roundUpTo - 1); +} + +bool UseLocalModeOptimization(const TClientContext& context, const IClientRetryPolicyPtr& clientRetryPolicy) +{ + if 
(!context.Config->EnableLocalModeOptimization) { + return false; + } + + static THashMap<TString, bool> localModeMap; + static TRWMutex mutex; + + { + TReadGuard guard(mutex); + auto it = localModeMap.find(context.ServerName); + if (it != localModeMap.end()) { + return it->second; + } + } + + bool isLocalMode = false; + TString localModeAttr("//sys/@local_mode_fqdn"); + // We don't want to pollute logs with errors about failed request, + // so we check if path exists before getting it. + if (NRawClient::Exists(clientRetryPolicy->CreatePolicyForGenericRequest(), + context, + TTransactionId(), + localModeAttr, + TExistsOptions().ReadFrom(EMasterReadKind::Cache))) + { + auto fqdnNode = NRawClient::TryGet( + clientRetryPolicy->CreatePolicyForGenericRequest(), + context, + TTransactionId(), + localModeAttr, + TGetOptions().ReadFrom(EMasterReadKind::Cache)); + if (!fqdnNode.IsUndefined()) { + auto fqdn = fqdnNode.AsString(); + isLocalMode = (fqdn == TProcessState::Get()->FqdnHostName); + YT_LOG_DEBUG("Checking local mode; LocalModeFqdn: %v FqdnHostName: %v IsLocalMode: %v", + fqdn, + TProcessState::Get()->FqdnHostName, + isLocalMode ? 
"true" : "false"); + } + } + + { + TWriteGuard guard(mutex); + localModeMap[context.ServerName] = isLocalMode; + } + + return isLocalMode; +} + +TString GetOperationWebInterfaceUrl(TStringBuf serverName, TOperationId operationId) +{ + serverName.ChopSuffix(":80"); + serverName.ChopSuffix(".yt.yandex-team.ru"); + serverName.ChopSuffix(".yt.yandex.net"); + return ::TStringBuilder() << "https://yt.yandex-team.ru/" << serverName << + "/operations/" << GetGuidAsString(operationId); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail diff --git a/yt/cpp/mapreduce/client/operation_helpers.h b/yt/cpp/mapreduce/client/operation_helpers.h new file mode 100644 index 0000000000..7fd2ffb0de --- /dev/null +++ b/yt/cpp/mapreduce/client/operation_helpers.h @@ -0,0 +1,20 @@ +#pragma once + +#include <yt/cpp/mapreduce/common/fwd.h> +#include <yt/cpp/mapreduce/interface/fwd.h> + +#include <yt/cpp/mapreduce/http/fwd.h> + +namespace NYT::NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +ui64 RoundUpFileSize(ui64 size); + +bool UseLocalModeOptimization(const TClientContext& context, const IClientRetryPolicyPtr& clientRetryPolicy); + +TString GetOperationWebInterfaceUrl(TStringBuf serverName, TOperationId operationId); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail diff --git a/yt/cpp/mapreduce/client/operation_preparer.cpp b/yt/cpp/mapreduce/client/operation_preparer.cpp new file mode 100644 index 0000000000..e06fac4061 --- /dev/null +++ b/yt/cpp/mapreduce/client/operation_preparer.cpp @@ -0,0 +1,881 @@ +#include "operation_preparer.h" + +#include "init.h" +#include "file_writer.h" +#include "operation.h" +#include "operation_helpers.h" +#include "operation_tracker.h" +#include "transaction.h" +#include "transaction_pinger.h" +#include "yt_poller.h" + +#include <yt/cpp/mapreduce/common/helpers.h> +#include 
<yt/cpp/mapreduce/common/retry_lib.h> +#include <yt/cpp/mapreduce/common/wait_proxy.h> + +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> +#include <yt/cpp/mapreduce/raw_client/raw_batch_request.h> + +#include <yt/cpp/mapreduce/interface/error_codes.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <library/cpp/digest/md5/md5.h> + +#include <util/folder/path.h> + +#include <util/string/builder.h> + +#include <util/system/execpath.h> + +namespace NYT::NDetail { + +using namespace NRawClient; + +//////////////////////////////////////////////////////////////////////////////// + +class TWaitOperationStartPollerItem + : public IYtPollerItem +{ +public: + TWaitOperationStartPollerItem(TOperationId operationId, THolder<TPingableTransaction> transaction) + : OperationId_(operationId) + , Transaction_(std::move(transaction)) + { } + + void PrepareRequest(TRawBatchRequest* batchRequest) override + { + Future_ = batchRequest->GetOperation( + OperationId_, + TGetOperationOptions().AttributeFilter( + TOperationAttributeFilter().Add(EOperationAttribute::State))); + } + + EStatus OnRequestExecuted() override + { + try { + auto attributes = Future_.GetValue(); + Y_ENSURE(attributes.State.Defined()); + bool operationHasLockedFiles = + *attributes.State != "starting" && + *attributes.State != "pending" && + *attributes.State != "orphaned" && + *attributes.State != "waiting_for_agent" && + *attributes.State != "initializing"; + return operationHasLockedFiles ? EStatus::PollBreak : EStatus::PollContinue; + } catch (const TErrorResponse& e) { + YT_LOG_ERROR("get_operation request failed: %v (RequestId: %v)", + e.GetError().GetMessage(), + e.GetRequestId()); + return IsRetriable(e) ? 
PollContinue : PollBreak; + } catch (const std::exception& e) { + YT_LOG_ERROR("%v", e.what()); + return PollBreak; + } + } + + void OnItemDiscarded() override { + } + +private: + TOperationId OperationId_; + THolder<TPingableTransaction> Transaction_; + ::NThreading::TFuture<TOperationAttributes> Future_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TOperationForwardingRequestRetryPolicy + : public IRequestRetryPolicy +{ +public: + TOperationForwardingRequestRetryPolicy(const IRequestRetryPolicyPtr& underlying, const TOperationPtr& operation) + : Underlying_(underlying) + , Operation_(operation) + { } + + void NotifyNewAttempt() override + { + Underlying_->NotifyNewAttempt(); + } + + TMaybe<TDuration> OnGenericError(const std::exception& e) override + { + UpdateOperationStatus(e.what()); + return Underlying_->OnGenericError(e); + } + + TMaybe<TDuration> OnRetriableError(const TErrorResponse& e) override + { + auto msg = e.GetError().ShortDescription(); + UpdateOperationStatus(msg); + return Underlying_->OnRetriableError(e); + } + + void OnIgnoredError(const TErrorResponse& e) override + { + Underlying_->OnIgnoredError(e); + } + + TString GetAttemptDescription() const override + { + return Underlying_->GetAttemptDescription(); + } + +private: + void UpdateOperationStatus(TStringBuf err) + { + Y_VERIFY(Operation_); + Operation_->OnStatusUpdated( + ::TStringBuilder() << "Retriable error during operation start: " << err); + } + +private: + IRequestRetryPolicyPtr Underlying_; + TOperationPtr Operation_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +TOperationPreparer::TOperationPreparer(TClientPtr client, TTransactionId transactionId) + : Client_(std::move(client)) + , TransactionId_(transactionId) + , FileTransaction_(MakeHolder<TPingableTransaction>( + Client_->GetRetryPolicy(), + Client_->GetContext(), + TransactionId_, + 
Client_->GetTransactionPinger()->GetChildTxPinger(), + TStartTransactionOptions())) + , ClientRetryPolicy_(Client_->GetRetryPolicy()) + , PreparationId_(CreateGuidAsString()) +{ } + +const TClientContext& TOperationPreparer::GetContext() const +{ + return Client_->GetContext(); +} + +TTransactionId TOperationPreparer::GetTransactionId() const +{ + return TransactionId_; +} + +TClientPtr TOperationPreparer::GetClient() const +{ + return Client_; +} + +const TString& TOperationPreparer::GetPreparationId() const +{ + return PreparationId_; +} + +const IClientRetryPolicyPtr& TOperationPreparer::GetClientRetryPolicy() const +{ + return ClientRetryPolicy_; +} + +TOperationId TOperationPreparer::StartOperation( + TOperation* operation, + const TString& operationType, + const TNode& spec, + bool useStartOperationRequest) +{ + CheckValidity(); + + THttpHeader header("POST", (useStartOperationRequest ? "start_op" : operationType)); + if (useStartOperationRequest) { + header.AddParameter("operation_type", operationType); + } + header.AddTransactionId(TransactionId_); + header.AddMutationId(); + + auto ysonSpec = NodeToYsonString(spec); + auto responseInfo = RetryRequestWithPolicy( + ::MakeIntrusive<TOperationForwardingRequestRetryPolicy>( + ClientRetryPolicy_->CreatePolicyForStartOperationRequest(), + TOperationPtr(operation)), + GetContext(), + header, + ysonSpec); + TOperationId operationId = ParseGuidFromResponse(responseInfo.Response); + YT_LOG_DEBUG("Operation started (OperationId: %v; PreparationId: %v)", + operationId, + GetPreparationId()); + + YT_LOG_INFO("Operation %v started (%v): %v", + operationId, + operationType, + GetOperationWebInterfaceUrl(GetContext().ServerName, operationId)); + + TOperationExecutionTimeTracker::Get()->Start(operationId); + + Client_->GetYtPoller().Watch( + new TWaitOperationStartPollerItem(operationId, std::move(FileTransaction_))); + + return operationId; +} + +void TOperationPreparer::LockFiles(TVector<TRichYPath>* paths) +{ + 
CheckValidity(); + + TVector<::NThreading::TFuture<TLockId>> lockIdFutures; + lockIdFutures.reserve(paths->size()); + TRawBatchRequest lockRequest(GetContext().Config); + for (const auto& path : *paths) { + lockIdFutures.push_back(lockRequest.Lock( + FileTransaction_->GetId(), + path.Path_, + ELockMode::LM_SNAPSHOT, + TLockOptions().Waitable(true))); + } + ExecuteBatch(ClientRetryPolicy_->CreatePolicyForGenericRequest(), GetContext(), lockRequest); + + TVector<::NThreading::TFuture<TNode>> nodeIdFutures; + nodeIdFutures.reserve(paths->size()); + TRawBatchRequest getNodeIdRequest(GetContext().Config); + for (const auto& lockIdFuture : lockIdFutures) { + nodeIdFutures.push_back(getNodeIdRequest.Get( + FileTransaction_->GetId(), + ::TStringBuilder() << '#' << GetGuidAsString(lockIdFuture.GetValue()) << "/@node_id", + TGetOptions())); + } + ExecuteBatch(ClientRetryPolicy_->CreatePolicyForGenericRequest(), GetContext(), getNodeIdRequest); + + for (size_t i = 0; i != paths->size(); ++i) { + auto& richPath = (*paths)[i]; + richPath.OriginalPath(richPath.Path_); + richPath.Path("#" + nodeIdFutures[i].GetValue().AsString()); + YT_LOG_DEBUG("Locked file %v, new path is %v", + *richPath.OriginalPath_, + richPath.Path_); + } +} + +void TOperationPreparer::CheckValidity() const +{ + Y_ENSURE( + FileTransaction_, + "File transaction is already moved, are you trying to use preparer for more than one operation?"); +} + +//////////////////////////////////////////////////////////////////////////////// + +class TRetryPolicyIgnoringLockConflicts + : public TAttemptLimitedRetryPolicy +{ +public: + using TAttemptLimitedRetryPolicy::TAttemptLimitedRetryPolicy; + using TAttemptLimitedRetryPolicy::OnGenericError; + + TMaybe<TDuration> OnRetriableError(const TErrorResponse& e) override + { + if (IsAttemptLimitExceeded()) { + return Nothing(); + } + if (e.IsConcurrentTransactionLockConflict()) { + return GetBackoffDuration(Config_); + } + return 
TAttemptLimitedRetryPolicy::OnRetriableError(e); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TFileToUpload + : public IItemToUpload +{ +public: + TFileToUpload(TString fileName, TMaybe<TString> md5) + : FileName_(std::move(fileName)) + , MD5_(std::move(md5)) + { } + + TString CalculateMD5() const override + { + if (MD5_) { + return *MD5_; + } + constexpr size_t md5Size = 32; + TString result; + result.ReserveAndResize(md5Size); + MD5::File(FileName_.data(), result.Detach()); + MD5_ = result; + return result; + } + + THolder<IInputStream> CreateInputStream() const override + { + return MakeHolder<TFileInput>(FileName_); + } + + TString GetDescription() const override + { + return FileName_; + } + + ui64 GetDataSize() const override + { + return GetFileLength(FileName_); + } + +private: + TString FileName_; + mutable TMaybe<TString> MD5_; +}; + +class TDataToUpload + : public IItemToUpload +{ +public: + TDataToUpload(TString data, TString description) + : Data_(std::move(data)) + , Description_(std::move(description)) + { } + + TString CalculateMD5() const override + { + constexpr size_t md5Size = 32; + TString result; + result.ReserveAndResize(md5Size); + MD5::Data(reinterpret_cast<const unsigned char*>(Data_.data()), Data_.size(), result.Detach()); + return result; + } + + THolder<IInputStream> CreateInputStream() const override + { + return MakeHolder<TMemoryInput>(Data_.data(), Data_.size()); + } + + TString GetDescription() const override + { + return Description_; + } + + ui64 GetDataSize() const override + { + return Data_.size(); + } + +private: + TString Data_; + TString Description_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +static const TString& GetPersistentExecPathMd5() +{ + static TString md5 = MD5::File(GetPersistentExecPath()); + return md5; +} + +static TMaybe<TSmallJobFile> GetJobState(const IJob& job) +{ + TString result; + { + TStringOutput 
output(result); + job.Save(output); + output.Finish(); + } + if (result.empty()) { + return Nothing(); + } else { + return TSmallJobFile{"jobstate", result}; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +TJobPreparer::TJobPreparer( + TOperationPreparer& operationPreparer, + const TUserJobSpec& spec, + const IJob& job, + size_t outputTableCount, + const TVector<TSmallJobFile>& smallFileList, + const TOperationOptions& options) + : OperationPreparer_(operationPreparer) + , Spec_(spec) + , Options_(options) +{ + + CreateStorage(); + auto cypressFileList = CanonizeYPaths(/* retryPolicy */ nullptr, OperationPreparer_.GetContext(), spec.Files_); + + for (const auto& file : cypressFileList) { + UseFileInCypress(file); + } + for (const auto& localFile : spec.GetLocalFiles()) { + UploadLocalFile(std::get<0>(localFile), std::get<1>(localFile)); + } + auto jobStateSmallFile = GetJobState(job); + if (jobStateSmallFile) { + UploadSmallFile(*jobStateSmallFile); + } + for (const auto& smallFile : smallFileList) { + UploadSmallFile(smallFile); + } + + if (auto commandJob = dynamic_cast<const ICommandJob*>(&job)) { + ClassName_ = TJobFactory::Get()->GetJobName(&job); + Command_ = commandJob->GetCommand(); + } else { + PrepareJobBinary(job, outputTableCount, jobStateSmallFile.Defined()); + } + + operationPreparer.LockFiles(&CachedFiles_); +} + +TVector<TRichYPath> TJobPreparer::GetFiles() const +{ + TVector<TRichYPath> allFiles = CypressFiles_; + allFiles.insert(allFiles.end(), CachedFiles_.begin(), CachedFiles_.end()); + return allFiles; +} + +const TString& TJobPreparer::GetClassName() const +{ + return ClassName_; +} + +const TString& TJobPreparer::GetCommand() const +{ + return Command_; +} + +const TUserJobSpec& TJobPreparer::GetSpec() const +{ + return Spec_; +} + +bool TJobPreparer::ShouldMountSandbox() const +{ + return OperationPreparer_.GetContext().Config->MountSandboxInTmpfs || Options_.MountSandboxInTmpfs_; +} + +ui64 
TJobPreparer::GetTotalFileSize() const +{ + return TotalFileSize_; +} + +TString TJobPreparer::GetFileStorage() const +{ + return Options_.FileStorage_ ? + *Options_.FileStorage_ : + OperationPreparer_.GetContext().Config->RemoteTempFilesDirectory; +} + +TYPath TJobPreparer::GetCachePath() const +{ + return AddPathPrefix( + ::TStringBuilder() << GetFileStorage() << "/new_cache", + OperationPreparer_.GetContext().Config->Prefix); +} + +void TJobPreparer::CreateStorage() const +{ + Create( + OperationPreparer_.GetClientRetryPolicy()->CreatePolicyForGenericRequest(), + OperationPreparer_.GetContext(), + Options_.FileStorageTransactionId_, + GetCachePath(), + NT_MAP, + TCreateOptions() + .IgnoreExisting(true) + .Recursive(true)); +} + +int TJobPreparer::GetFileCacheReplicationFactor() const +{ + if (IsLocalMode()) { + return 1; + } else { + return OperationPreparer_.GetContext().Config->FileCacheReplicationFactor; + } +} + +void TJobPreparer::CreateFileInCypress(const TString& path) const +{ + auto attributes = TNode()("replication_factor", GetFileCacheReplicationFactor()); + if (Options_.FileExpirationTimeout_) { + attributes["expiration_timeout"] = Options_.FileExpirationTimeout_->MilliSeconds(); + } + + Create( + OperationPreparer_.GetClientRetryPolicy()->CreatePolicyForGenericRequest(), + OperationPreparer_.GetContext(), + Options_.FileStorageTransactionId_, + path, + NT_FILE, + TCreateOptions() + .IgnoreExisting(true) + .Recursive(true) + .Attributes(attributes) + ); +} + +TString TJobPreparer::PutFileToCypressCache( + const TString& path, + const TString& md5Signature, + TTransactionId transactionId) const +{ + constexpr ui32 LockConflictRetryCount = 30; + auto retryPolicy = MakeIntrusive<TRetryPolicyIgnoringLockConflicts>( + LockConflictRetryCount, + OperationPreparer_.GetContext().Config); + + auto putFileToCacheOptions = TPutFileToCacheOptions(); + if (Options_.FileExpirationTimeout_) { + putFileToCacheOptions.PreserveExpirationTimeout(true); + } + + auto 
cachePath = PutFileToCache( + retryPolicy, + OperationPreparer_.GetContext(), + transactionId, + path, + md5Signature, + GetCachePath(), + putFileToCacheOptions); + + Remove( + OperationPreparer_.GetClientRetryPolicy()->CreatePolicyForGenericRequest(), + OperationPreparer_.GetContext(), + transactionId, + path, + TRemoveOptions().Force(true)); + + return cachePath; +} + +TMaybe<TString> TJobPreparer::GetItemFromCypressCache(const TString& md5Signature, const TString& fileName) const +{ + constexpr ui32 LockConflictRetryCount = 30; + auto retryPolicy = MakeIntrusive<TRetryPolicyIgnoringLockConflicts>( + LockConflictRetryCount, + OperationPreparer_.GetContext().Config); + auto maybePath = GetFileFromCache( + retryPolicy, + OperationPreparer_.GetContext(), + TTransactionId(), + md5Signature, + GetCachePath(), + TGetFileFromCacheOptions()); + if (maybePath) { + YT_LOG_DEBUG("File is already in cache (FileName: %v)", + fileName, + *maybePath); + } + return maybePath; +} + +TDuration TJobPreparer::GetWaitForUploadTimeout(const IItemToUpload& itemToUpload) const +{ + const TDuration extraTime = OperationPreparer_.GetContext().Config->WaitLockPollInterval + + TDuration::MilliSeconds(100); + const double dataSizeGb = static_cast<double>(itemToUpload.GetDataSize()) / 1_GB; + return extraTime + dataSizeGb * OperationPreparer_.GetContext().Config->CacheLockTimeoutPerGb; +} + +TString TJobPreparer::UploadToRandomPath(const IItemToUpload& itemToUpload) const +{ + TString uniquePath = AddPathPrefix( + ::TStringBuilder() << GetFileStorage() << "/cpp_" << CreateGuidAsString(), + OperationPreparer_.GetContext().Config->Prefix); + YT_LOG_INFO("Uploading file to random cypress path (FileName: %v; CypressPath: %v; PreparationId: %v)", + itemToUpload.GetDescription(), + uniquePath, + OperationPreparer_.GetPreparationId()); + + CreateFileInCypress(uniquePath); + + { + TFileWriter writer( + uniquePath, + OperationPreparer_.GetClientRetryPolicy(), + 
OperationPreparer_.GetClient()->GetTransactionPinger(), + OperationPreparer_.GetContext(), + Options_.FileStorageTransactionId_, + TFileWriterOptions().ComputeMD5(true)); + itemToUpload.CreateInputStream()->ReadAll(writer); + writer.Finish(); + } + return uniquePath; +} + +TMaybe<TString> TJobPreparer::TryUploadWithDeduplication(const IItemToUpload& itemToUpload) const +{ + const auto md5Signature = itemToUpload.CalculateMD5(); + + auto fileName = ::TStringBuilder() << GetFileStorage() << "/cpp_md5_" << md5Signature; + if (OperationPreparer_.GetContext().Config->CacheUploadDeduplicationMode == EUploadDeduplicationMode::Host) { + fileName << "_" << MD5::Data(TProcessState::Get()->FqdnHostName); + } + TString cypressPath = AddPathPrefix(fileName, OperationPreparer_.GetContext().Config->Prefix); + + CreateFileInCypress(cypressPath); + + auto uploadTx = MakeIntrusive<TTransaction>( + OperationPreparer_.GetClient(), + OperationPreparer_.GetContext(), + TTransactionId(), + TStartTransactionOptions()); + + ILockPtr lock; + try { + lock = uploadTx->Lock(cypressPath, ELockMode::LM_EXCLUSIVE, TLockOptions().Waitable(true)); + } catch (const TErrorResponse& e) { + if (e.IsResolveError()) { + // If the node doesn't exist, it must be removed by concurrent uploading process. + // Let's try to find it in the cache. + return GetItemFromCypressCache(md5Signature, itemToUpload.GetDescription()); + } + throw; + } + + auto waitTimeout = GetWaitForUploadTimeout(itemToUpload); + YT_LOG_DEBUG("Waiting for the lock on file (FileName: %v; CypressPath: %v; LockTimeout: %v)", + itemToUpload.GetDescription(), + cypressPath, + waitTimeout); + + if (!TWaitProxy::Get()->WaitFuture(lock->GetAcquiredFuture(), waitTimeout)) { + YT_LOG_DEBUG("Waiting for the lock timed out. 
Fallback to random path uploading (FileName: %v; CypressPath: %v)", + itemToUpload.GetDescription(), + cypressPath); + return Nothing(); + } + + YT_LOG_DEBUG("Exclusive lock successfully acquired (FileName: %v; CypressPath: %v)", + itemToUpload.GetDescription(), + cypressPath); + + // Ensure that this process is the first to take a lock. + if (auto cachedItemPath = GetItemFromCypressCache(md5Signature, itemToUpload.GetDescription())) { + return *cachedItemPath; + } + + YT_LOG_INFO("Uploading file to cypress (FileName: %v; CypressPath: %v; PreparationId: %v)", + itemToUpload.GetDescription(), + cypressPath, + OperationPreparer_.GetPreparationId()); + + { + auto writer = uploadTx->CreateFileWriter(cypressPath, TFileWriterOptions().ComputeMD5(true)); + YT_VERIFY(writer); + itemToUpload.CreateInputStream()->ReadAll(*writer); + writer->Finish(); + } + + auto path = PutFileToCypressCache(cypressPath, md5Signature, uploadTx->GetId()); + + uploadTx->Commit(); + return path; +} + +TString TJobPreparer::UploadToCacheUsingApi(const IItemToUpload& itemToUpload) const +{ + auto md5Signature = itemToUpload.CalculateMD5(); + Y_VERIFY(md5Signature.size() == 32); + + if (auto cachedItemPath = GetItemFromCypressCache(md5Signature, itemToUpload.GetDescription())) { + return *cachedItemPath; + } + + YT_LOG_INFO("File not found in cache; uploading to cypress (FileName: %v; PreparationId: %v)", + itemToUpload.GetDescription(), + OperationPreparer_.GetPreparationId()); + + if (OperationPreparer_.GetContext().Config->CacheUploadDeduplicationMode != EUploadDeduplicationMode::Disabled) { + if (auto path = TryUploadWithDeduplication(itemToUpload)) { + return *path; + } + } + + auto path = UploadToRandomPath(itemToUpload); + return PutFileToCypressCache(path, md5Signature, Options_.FileStorageTransactionId_); +} + +TString TJobPreparer::UploadToCache(const IItemToUpload& itemToUpload) const +{ + YT_LOG_INFO("Uploading file (FileName: %v; PreparationId: %v)", + itemToUpload.GetDescription(), + 
OperationPreparer_.GetPreparationId()); + + TString result; + switch (Options_.FileCacheMode_) { + case TOperationOptions::EFileCacheMode::ApiCommandBased: + Y_ENSURE_EX(Options_.FileStorageTransactionId_.IsEmpty(), TApiUsageError() << + "Default cache mode (API command-based) doesn't allow non-default 'FileStorageTransactionId_'"); + result = UploadToCacheUsingApi(itemToUpload); + break; + case TOperationOptions::EFileCacheMode::CachelessRandomPathUpload: + result = UploadToRandomPath(itemToUpload); + break; + default: + Y_FAIL("Unknown file cache mode: %d", static_cast<int>(Options_.FileCacheMode_)); + } + + YT_LOG_INFO("Complete uploading file (FileName: %v; PreparationId: %v)", + itemToUpload.GetDescription(), + OperationPreparer_.GetPreparationId()); + + return result; +} + +void TJobPreparer::UseFileInCypress(const TRichYPath& file) +{ + if (!Exists( + OperationPreparer_.GetClientRetryPolicy()->CreatePolicyForGenericRequest(), + OperationPreparer_.GetContext(), + file.TransactionId_.GetOrElse(OperationPreparer_.GetTransactionId()), + file.Path_)) + { + ythrow yexception() << "File " << file.Path_ << " does not exist"; + } + + if (ShouldMountSandbox()) { + auto size = Get( + OperationPreparer_.GetClientRetryPolicy()->CreatePolicyForGenericRequest(), + OperationPreparer_.GetContext(), + file.TransactionId_.GetOrElse(OperationPreparer_.GetTransactionId()), + file.Path_ + "/@uncompressed_data_size") + .AsInt64(); + + TotalFileSize_ += RoundUpFileSize(static_cast<ui64>(size)); + } + CypressFiles_.push_back(file); +} + +void TJobPreparer::UploadLocalFile( + const TLocalFilePath& localPath, + const TAddLocalFileOptions& options, + bool isApiFile) +{ + TFsPath fsPath(localPath); + fsPath.CheckExists(); + + TFileStat stat; + fsPath.Stat(stat); + + bool isExecutable = stat.Mode & (S_IXUSR | S_IXGRP | S_IXOTH); + auto cachePath = UploadToCache(TFileToUpload(localPath, options.MD5CheckSum_)); + + TRichYPath cypressPath; + if (isApiFile) { + cypressPath = 
OperationPreparer_.GetContext().Config->ApiFilePathOptions; + } + cypressPath.Path(cachePath).FileName(options.PathInJob_.GetOrElse(fsPath.Basename())); + if (isExecutable) { + cypressPath.Executable(true); + } + if (options.BypassArtifactCache_) { + cypressPath.BypassArtifactCache(*options.BypassArtifactCache_); + } + + if (ShouldMountSandbox()) { + TotalFileSize_ += RoundUpFileSize(stat.Size); + } + + CachedFiles_.push_back(cypressPath); +} + +void TJobPreparer::UploadBinary(const TJobBinaryConfig& jobBinary) +{ + if (std::holds_alternative<TJobBinaryLocalPath>(jobBinary)) { + auto binaryLocalPath = std::get<TJobBinaryLocalPath>(jobBinary); + auto opts = TAddLocalFileOptions().PathInJob("cppbinary"); + if (binaryLocalPath.MD5CheckSum) { + opts.MD5CheckSum(*binaryLocalPath.MD5CheckSum); + } + UploadLocalFile(binaryLocalPath.Path, opts, /* isApiFile */ true); + } else if (std::holds_alternative<TJobBinaryCypressPath>(jobBinary)) { + auto binaryCypressPath = std::get<TJobBinaryCypressPath>(jobBinary); + TRichYPath ytPath = OperationPreparer_.GetContext().Config->ApiFilePathOptions; + ytPath.Path(binaryCypressPath.Path); + if (binaryCypressPath.TransactionId) { + ytPath.TransactionId(*binaryCypressPath.TransactionId); + } + UseFileInCypress(ytPath.FileName("cppbinary").Executable(true)); + } else { + Y_FAIL("%s", (::TStringBuilder() << "Unexpected jobBinary tag: " << jobBinary.index()).data()); + } +} + +void TJobPreparer::UploadSmallFile(const TSmallJobFile& smallFile) +{ + auto cachePath = UploadToCache(TDataToUpload(smallFile.Data, smallFile.FileName + " [generated-file]")); + auto path = OperationPreparer_.GetContext().Config->ApiFilePathOptions; + CachedFiles_.push_back(path.Path(cachePath).FileName(smallFile.FileName)); + if (ShouldMountSandbox()) { + TotalFileSize_ += RoundUpFileSize(smallFile.Data.size()); + } +} + +bool TJobPreparer::IsLocalMode() const +{ + return UseLocalModeOptimization(OperationPreparer_.GetContext(), 
OperationPreparer_.GetClientRetryPolicy()); +} + +void TJobPreparer::PrepareJobBinary(const IJob& job, int outputTableCount, bool hasState) +{ + auto jobBinary = TJobBinaryConfig(); + if (!std::holds_alternative<TJobBinaryDefault>(Spec_.GetJobBinary())) { + jobBinary = Spec_.GetJobBinary(); + } + TString binaryPathInsideJob; + if (std::holds_alternative<TJobBinaryDefault>(jobBinary)) { + if (GetInitStatus() != EInitStatus::FullInitialization) { + ythrow yexception() << "NYT::Initialize() must be called prior to any operation"; + } + + const bool isLocalMode = IsLocalMode(); + const TMaybe<TString> md5 = !isLocalMode ? MakeMaybe(GetPersistentExecPathMd5()) : Nothing(); + jobBinary = TJobBinaryLocalPath{GetPersistentExecPath(), md5}; + + if (isLocalMode) { + binaryPathInsideJob = GetExecPath(); + } + } else if (std::holds_alternative<TJobBinaryLocalPath>(jobBinary)) { + const bool isLocalMode = IsLocalMode(); + if (isLocalMode) { + binaryPathInsideJob = TFsPath(std::get<TJobBinaryLocalPath>(jobBinary).Path).RealPath(); + } + } + Y_ASSERT(!std::holds_alternative<TJobBinaryDefault>(jobBinary)); + + // binaryPathInsideJob is only set when LocalModeOptimization option is on, so upload is not needed + if (!binaryPathInsideJob) { + binaryPathInsideJob = "./cppbinary"; + UploadBinary(jobBinary); + } + + TString jobCommandPrefix = Options_.JobCommandPrefix_; + if (!Spec_.JobCommandPrefix_.empty()) { + jobCommandPrefix = Spec_.JobCommandPrefix_; + } + + TString jobCommandSuffix = Options_.JobCommandSuffix_; + if (!Spec_.JobCommandSuffix_.empty()) { + jobCommandSuffix = Spec_.JobCommandSuffix_; + } + + ClassName_ = TJobFactory::Get()->GetJobName(&job); + + auto jobArguments = TNode::CreateMap(); + jobArguments["job_name"] = ClassName_; + jobArguments["output_table_count"] = static_cast<i64>(outputTableCount); + jobArguments["has_state"] = hasState; + Spec_.AddEnvironment("YT_JOB_ARGUMENTS", NodeToYsonString(jobArguments)); + + Command_ = ::TStringBuilder() << + 
jobCommandPrefix << + (OperationPreparer_.GetContext().Config->UseClientProtobuf ? "YT_USE_CLIENT_PROTOBUF=1" : "YT_USE_CLIENT_PROTOBUF=0") << " " << + binaryPathInsideJob << + jobCommandSuffix; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail diff --git a/yt/cpp/mapreduce/client/operation_preparer.h b/yt/cpp/mapreduce/client/operation_preparer.h new file mode 100644 index 0000000000..7ced54e3b5 --- /dev/null +++ b/yt/cpp/mapreduce/client/operation_preparer.h @@ -0,0 +1,129 @@ +#pragma once + +#include "client.h" +#include "structured_table_formats.h" + +#include <yt/cpp/mapreduce/interface/operation.h> + +namespace NYT::NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +class TOperation; + +class TOperationPreparer + : public TThrRefBase +{ +public: + TOperationPreparer(TClientPtr client, TTransactionId transactionId); + + const TClientContext& GetContext() const; + TTransactionId GetTransactionId() const; + ITransactionPingerPtr GetTransactionPinger() const; + TClientPtr GetClient() const; + + const TString& GetPreparationId() const; + + void LockFiles(TVector<TRichYPath>* paths); + + TOperationId StartOperation( + TOperation* operation, + const TString& operationType, + const TNode& spec, + bool useStartOperationRequest = false); + + const IClientRetryPolicyPtr& GetClientRetryPolicy() const; + +private: + TClientPtr Client_; + TTransactionId TransactionId_; + THolder<TPingableTransaction> FileTransaction_; + IClientRetryPolicyPtr ClientRetryPolicy_; + const TString PreparationId_; + +private: + void CheckValidity() const; +}; + +using TOperationPreparerPtr = ::TIntrusivePtr<TOperationPreparer>; + +//////////////////////////////////////////////////////////////////////////////// + +struct IItemToUpload +{ + virtual ~IItemToUpload() = default; + + virtual TString CalculateMD5() const = 0; + virtual THolder<IInputStream> CreateInputStream() const = 0; + 
virtual TString GetDescription() const = 0; + virtual ui64 GetDataSize() const = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TJobPreparer + : private TNonCopyable +{ +public: + TJobPreparer( + TOperationPreparer& operationPreparer, + const TUserJobSpec& spec, + const IJob& job, + size_t outputTableCount, + const TVector<TSmallJobFile>& smallFileList, + const TOperationOptions& options); + + TVector<TRichYPath> GetFiles() const; + const TString& GetClassName() const; + const TString& GetCommand() const; + const TUserJobSpec& GetSpec() const; + bool ShouldMountSandbox() const; + ui64 GetTotalFileSize() const; + +private: + TOperationPreparer& OperationPreparer_; + TUserJobSpec Spec_; + TOperationOptions Options_; + + TVector<TRichYPath> CypressFiles_; + TVector<TRichYPath> CachedFiles_; + + TString ClassName_; + TString Command_; + ui64 TotalFileSize_ = 0; + +private: + TString GetFileStorage() const; + TYPath GetCachePath() const; + + bool IsLocalMode() const; + int GetFileCacheReplicationFactor() const; + + void CreateStorage() const; + + void CreateFileInCypress(const TString& path) const; + TString PutFileToCypressCache(const TString& path, const TString& md5Signature, TTransactionId transactionId) const; + TMaybe<TString> GetItemFromCypressCache(const TString& md5Signature, const TString& fileName) const; + + TDuration GetWaitForUploadTimeout(const IItemToUpload& itemToUpload) const; + TString UploadToRandomPath(const IItemToUpload& itemToUpload) const; + TString UploadToCacheUsingApi(const IItemToUpload& itemToUpload) const; + TMaybe<TString> TryUploadWithDeduplication(const IItemToUpload& itemToUpload) const; + TString UploadToCache(const IItemToUpload& itemToUpload) const; + + void UseFileInCypress(const TRichYPath& file); + + void UploadLocalFile( + const TLocalFilePath& localPath, + const TAddLocalFileOptions& options, + bool isApiFile = false); + + void UploadBinary(const TJobBinaryConfig& jobBinary); 
+ void UploadSmallFile(const TSmallJobFile& smallFile); + + void PrepareJobBinary(const IJob& job, int outputTableCount, bool hasState); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail diff --git a/yt/cpp/mapreduce/client/operation_tracker.cpp b/yt/cpp/mapreduce/client/operation_tracker.cpp new file mode 100644 index 0000000000..56623e9927 --- /dev/null +++ b/yt/cpp/mapreduce/client/operation_tracker.cpp @@ -0,0 +1,34 @@ +#include "operation_tracker.h" + +#include <yt/cpp/mapreduce/interface/config.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +void TOperationExecutionTimeTracker::Start(const TOperationId& operationId) { + with_lock(Lock_) { + StartTimes_[operationId] = TInstant::Now(); + } +} + +TMaybe<TDuration> TOperationExecutionTimeTracker::Finish(const TOperationId& operationId) { + TDuration duration; + with_lock(Lock_) { + auto i = StartTimes_.find(operationId); + if (i == StartTimes_.end()) { + return Nothing(); + } + duration = TInstant::Now() - i->second; + StartTimes_.erase(i); + } + return duration; +} + +TOperationExecutionTimeTracker* TOperationExecutionTimeTracker::Get() { + return Singleton<TOperationExecutionTimeTracker>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/operation_tracker.h b/yt/cpp/mapreduce/client/operation_tracker.h new file mode 100644 index 0000000000..9f1504ea91 --- /dev/null +++ b/yt/cpp/mapreduce/client/operation_tracker.h @@ -0,0 +1,27 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/operation.h> + +#include <util/datetime/base.h> +#include <util/generic/hash.h> +#include <util/generic/maybe.h> +#include <util/system/mutex.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class TOperationExecutionTimeTracker { +public: + void 
Start(const TOperationId& operationId); + TMaybe<TDuration> Finish(const TOperationId& operationId); + static TOperationExecutionTimeTracker* Get(); + +private: + THashMap<TOperationId, TInstant> StartTimes_; + TMutex Lock_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/prepare_operation.cpp b/yt/cpp/mapreduce/client/prepare_operation.cpp new file mode 100644 index 0000000000..7f772dc99a --- /dev/null +++ b/yt/cpp/mapreduce/client/prepare_operation.cpp @@ -0,0 +1,286 @@ +#include "prepare_operation.h" + +#include <yt/cpp/mapreduce/common/retry_lib.h> + +#include <yt/cpp/mapreduce/interface/serialize.h> + +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> +#include <yt/cpp/mapreduce/raw_client/raw_batch_request.h> + +#include <library/cpp/iterator/functools.h> + +namespace NYT::NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +TOperationPreparationContext::TOperationPreparationContext( + const TStructuredJobTableList& structuredInputs, + const TStructuredJobTableList& structuredOutputs, + const TClientContext& context, + const IClientRetryPolicyPtr& retryPolicy, + TTransactionId transactionId) + : Context_(context) + , RetryPolicy_(retryPolicy) + , TransactionId_(transactionId) + , InputSchemas_(structuredInputs.size()) + , InputSchemasLoaded_(structuredInputs.size(), false) +{ + Inputs_.reserve(structuredInputs.size()); + for (const auto& input : structuredInputs) { + Inputs_.push_back(input.RichYPath); + } + Outputs_.reserve(structuredOutputs.size()); + for (const auto& output : structuredOutputs) { + Outputs_.push_back(output.RichYPath); + } +} + +TOperationPreparationContext::TOperationPreparationContext( + TVector<TRichYPath> inputs, + TVector<TRichYPath> outputs, + const TClientContext& context, + const IClientRetryPolicyPtr& retryPolicy, + TTransactionId transactionId) + : Context_(context) + , 
RetryPolicy_(retryPolicy)
+    , TransactionId_(transactionId)
+    , InputSchemas_(inputs.size())
+    , InputSchemasLoaded_(inputs.size(), false)
+{
+    // |inputs| and |outputs| are taken by value, so their elements can be moved out.
+    Inputs_.reserve(inputs.size());
+    for (auto& input : inputs) {
+        Inputs_.push_back(std::move(input));
+    }
+    Outputs_.reserve(outputs.size());
+    // Iterate by non-const reference: std::move on a const reference silently degrades to a copy.
+    for (auto& output : outputs) {
+        Outputs_.push_back(std::move(output));
+    }
+}
+
+//! Returns the number of input tables.
+int TOperationPreparationContext::GetInputCount() const
+{
+    return static_cast<int>(Inputs_.size());
+}
+
+//! Returns the number of output tables.
+int TOperationPreparationContext::GetOutputCount() const
+{
+    return static_cast<int>(Outputs_.size());
+}
+
+//! Fetches "<path>/@schema" for every input not marked as loaded, in a single batch request,
+//! and returns the schemas of all inputs.
+//!
+//! Every input that has to be fetched must have a path (enforced with Y_VERIFY).
+// NOTE(review): nothing visible here ever sets InputSchemasLoaded_[i] to true, so the schemas
+// appear to be re-fetched on every call - confirm whether the flag is meant to be set after loading.
+const TVector<TTableSchema>& TOperationPreparationContext::GetInputSchemas() const
+{
+    TVector<::NThreading::TFuture<TNode>> schemaFutures;
+    NRawClient::TRawBatchRequest batch(Context_.Config);
+    for (int tableIndex = 0; tableIndex < static_cast<int>(InputSchemas_.size()); ++tableIndex) {
+        if (InputSchemasLoaded_[tableIndex]) {
+            // Keep the futures aligned with table indexes: an uninitialized future marks "nothing to fetch".
+            schemaFutures.emplace_back();
+            continue;
+        }
+        Y_VERIFY(Inputs_[tableIndex]);
+        schemaFutures.push_back(batch.Get(TransactionId_, Inputs_[tableIndex]->Path_ + "/@schema", TGetOptions{}));
+    }
+
+    NRawClient::ExecuteBatch(
+        RetryPolicy_->CreatePolicyForGenericRequest(),
+        Context_,
+        batch);
+
+    for (int tableIndex = 0; tableIndex < static_cast<int>(InputSchemas_.size()); ++tableIndex) {
+        if (schemaFutures[tableIndex].Initialized()) {
+            Deserialize(InputSchemas_[tableIndex], schemaFutures[tableIndex].ExtractValueSync());
+        }
+    }
+
+    return InputSchemas_;
+}
+
+//! Returns the schema of the input table |index|, fetching "<path>/@schema" with a separate
+//! request when the schema is not marked as loaded.
+//!
+//! |index| is not range-checked here.
+const TTableSchema& TOperationPreparationContext::GetInputSchema(int index) const
+{
+    auto& schema = InputSchemas_[index];
+    if (!InputSchemasLoaded_[index]) {
+        Y_VERIFY(Inputs_[index]);
+        auto schemaNode = NRawClient::Get(
+            RetryPolicy_->CreatePolicyForGenericRequest(),
+            Context_,
+            TransactionId_,
+            Inputs_[index]->Path_ + "/@schema");
+        Deserialize(schema, schemaNode);
+    }
+    return schema;
+}
+
+TMaybe<TYPath> TOperationPreparationContext::GetInputPath(int index) const
+{
+    Y_VERIFY(index
< static_cast<int>(Inputs_.size())); + if (Inputs_[index]) { + return Inputs_[index]->Path_; + } + return Nothing(); +} + +TMaybe<TYPath> TOperationPreparationContext::GetOutputPath(int index) const +{ + Y_VERIFY(index < static_cast<int>(Outputs_.size())); + if (Outputs_[index]) { + return Outputs_[index]->Path_; + } + return Nothing(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TSpeculativeOperationPreparationContext::TSpeculativeOperationPreparationContext( + const TVector<TTableSchema>& previousResult, + TStructuredJobTableList inputs, + TStructuredJobTableList outputs) + : InputSchemas_(previousResult) + , Inputs_(std::move(inputs)) + , Outputs_(std::move(outputs)) +{ + Y_VERIFY(Inputs_.size() == previousResult.size()); +} + +int TSpeculativeOperationPreparationContext::GetInputCount() const +{ + return static_cast<int>(Inputs_.size()); +} + +int TSpeculativeOperationPreparationContext::GetOutputCount() const +{ + return static_cast<int>(Outputs_.size()); +} + +const TVector<TTableSchema>& TSpeculativeOperationPreparationContext::GetInputSchemas() const +{ + return InputSchemas_; +} + +const TTableSchema& TSpeculativeOperationPreparationContext::GetInputSchema(int index) const +{ + Y_VERIFY(index < static_cast<int>(InputSchemas_.size())); + return InputSchemas_[index]; +} + +TMaybe<TYPath> TSpeculativeOperationPreparationContext::GetInputPath(int index) const +{ + Y_VERIFY(index < static_cast<int>(Inputs_.size())); + if (Inputs_[index].RichYPath) { + return Inputs_[index].RichYPath->Path_; + } + return Nothing(); +} + +TMaybe<TYPath> TSpeculativeOperationPreparationContext::GetOutputPath(int index) const +{ + Y_VERIFY(index < static_cast<int>(Outputs_.size())); + if (Outputs_[index].RichYPath) { + return Outputs_[index].RichYPath->Path_; + } + return Nothing(); +} + +//////////////////////////////////////////////////////////////////////////////// + +static void FixInputTable(TRichYPath& table, int index, const 
TJobOperationPreparer& preparer) +{ + const auto& columnRenamings = preparer.GetInputColumnRenamings(); + const auto& columnFilters = preparer.GetInputColumnFilters(); + + if (!columnRenamings[index].empty()) { + table.RenameColumns(columnRenamings[index]); + } + if (columnFilters[index]) { + table.Columns(*columnFilters[index]); + } +} + +static void FixInputTable(TStructuredJobTable& table, int index, const TJobOperationPreparer& preparer) +{ + const auto& inputDescriptions = preparer.GetInputDescriptions(); + + if (inputDescriptions[index] && std::holds_alternative<TUnspecifiedTableStructure>(table.Description)) { + table.Description = *inputDescriptions[index]; + } + if (table.RichYPath) { + FixInputTable(*table.RichYPath, index, preparer); + } +} + +static void FixOutputTable(TRichYPath& /* table */, int /* index */, const TJobOperationPreparer& /* preparer */) +{ } + +static void FixOutputTable(TStructuredJobTable& table, int index, const TJobOperationPreparer& preparer) +{ + const auto& outputDescriptions = preparer.GetOutputDescriptions(); + + if (outputDescriptions[index] && std::holds_alternative<TUnspecifiedTableStructure>(table.Description)) { + table.Description = *outputDescriptions[index]; + } + if (table.RichYPath) { + FixOutputTable(*table.RichYPath, index, preparer); + } +} + +template <typename TTables> +TVector<TTableSchema> PrepareOperation( + const IJob& job, + const IOperationPreparationContext& context, + TTables* inputsPtr, + TTables* outputsPtr, + TUserJobFormatHints& hints) +{ + TJobOperationPreparer preparer(context); + job.PrepareOperation(context, preparer); + preparer.Finish(); + + if (inputsPtr) { + auto& inputs = *inputsPtr; + for (int i = 0; i < static_cast<int>(inputs.size()); ++i) { + FixInputTable(inputs[i], i, preparer); + } + } + + if (outputsPtr) { + auto& outputs = *outputsPtr; + for (int i = 0; i < static_cast<int>(outputs.size()); ++i) { + FixOutputTable(outputs[i], i, preparer); + } + } + + auto applyPatch = 
[](TMaybe<TFormatHints>& origin, const TMaybe<TFormatHints>& patch) { + if (origin) { + if (patch) { + origin->Merge(*patch); + } + } else { + origin = patch; + } + }; + + auto preparerHints = preparer.GetFormatHints(); + applyPatch(preparerHints.InputFormatHints_, hints.InputFormatHints_); + applyPatch(preparerHints.OutputFormatHints_, hints.OutputFormatHints_); + hints = std::move(preparerHints); + + return preparer.GetOutputSchemas(); +} + +template +TVector<TTableSchema> PrepareOperation<TStructuredJobTableList>( + const IJob& job, + const IOperationPreparationContext& context, + TStructuredJobTableList* inputsPtr, + TStructuredJobTableList* outputsPtr, + TUserJobFormatHints& hints); + +template +TVector<TTableSchema> PrepareOperation<TVector<TRichYPath>>( + const IJob& job, + const IOperationPreparationContext& context, + TVector<TRichYPath>* inputsPtr, + TVector<TRichYPath>* outputsPtr, + TUserJobFormatHints& hints); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail diff --git a/yt/cpp/mapreduce/client/prepare_operation.h b/yt/cpp/mapreduce/client/prepare_operation.h new file mode 100644 index 0000000000..3b64aa2856 --- /dev/null +++ b/yt/cpp/mapreduce/client/prepare_operation.h @@ -0,0 +1,93 @@ +#pragma once + +#include "structured_table_formats.h" + +#include <yt/cpp/mapreduce/interface/operation.h> + +namespace NYT::NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +class TOperationPreparationContext + : public IOperationPreparationContext +{ +public: + TOperationPreparationContext( + const TStructuredJobTableList& structuredInputs, + const TStructuredJobTableList& structuredOutputs, + const TClientContext& context, + const IClientRetryPolicyPtr& retryPolicy, + TTransactionId transactionId); + + TOperationPreparationContext( + TVector<TRichYPath> inputs, + TVector<TRichYPath> outputs, + const TClientContext& context, + const IClientRetryPolicyPtr& 
retryPolicy, + TTransactionId transactionId); + + int GetInputCount() const override; + int GetOutputCount() const override; + + const TVector<TTableSchema>& GetInputSchemas() const override; + const TTableSchema& GetInputSchema(int index) const override; + + TMaybe<TYPath> GetInputPath(int index) const override; + TMaybe<TYPath> GetOutputPath(int index) const override; + +private: + TVector<TMaybe<TRichYPath>> Inputs_; + TVector<TMaybe<TRichYPath>> Outputs_; + const TClientContext& Context_; + const IClientRetryPolicyPtr RetryPolicy_; + TTransactionId TransactionId_; + + mutable TVector<TTableSchema> InputSchemas_; + mutable TVector<bool> InputSchemasLoaded_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TSpeculativeOperationPreparationContext + : public IOperationPreparationContext +{ +public: + TSpeculativeOperationPreparationContext( + const TVector<TTableSchema>& previousResult, + TStructuredJobTableList inputs, + TStructuredJobTableList outputs); + + int GetInputCount() const override; + int GetOutputCount() const override; + + const TVector<TTableSchema>& GetInputSchemas() const override; + const TTableSchema& GetInputSchema(int index) const override; + + TMaybe<TYPath> GetInputPath(int index) const override; + TMaybe<TYPath> GetOutputPath(int index) const override; + +private: + TVector<TTableSchema> InputSchemas_; + TStructuredJobTableList Inputs_; + TStructuredJobTableList Outputs_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TTables> +TVector<TTableSchema> PrepareOperation( + const IJob& job, + const IOperationPreparationContext& context, + TTables* inputsPtr, + TTables* outputsPtr, + TUserJobFormatHints& hints); + +//////////////////////////////////////////////////////////////////////////////// + +TJobOperationPreparer GetOperationPreparer( + const IJob& job, + const IOperationPreparationContext& context); + 
+//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail diff --git a/yt/cpp/mapreduce/client/py_helpers.cpp b/yt/cpp/mapreduce/client/py_helpers.cpp new file mode 100644 index 0000000000..3072449866 --- /dev/null +++ b/yt/cpp/mapreduce/client/py_helpers.cpp @@ -0,0 +1,112 @@ +#include "py_helpers.h" + +#include "client.h" +#include "operation.h" +#include "transaction.h" + +#include <yt/cpp/mapreduce/interface/client.h> +#include <yt/cpp/mapreduce/interface/fluent.h> + +#include <yt/cpp/mapreduce/common/retry_lib.h> +#include <yt/cpp/mapreduce/common/helpers.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <util/generic/hash_set.h> + +namespace NYT { + +using namespace NDetail; + +//////////////////////////////////////////////////////////////////////////////// + +IStructuredJobPtr ConstructJob(const TString& jobName, const TString& state) +{ + auto node = TNode(); + if (!state.empty()) { + node = NodeFromYsonString(state); + } + return TJobFactory::Get()->GetConstructingFunction(jobName.data())(node); +} + +TString GetJobStateString(const IStructuredJob& job) +{ + TString result; + { + TStringOutput output(result); + job.Save(output); + output.Finish(); + } + return result; +} + +TStructuredJobTableList NodeToStructuredTablePaths(const TNode& node, const TOperationPreparer& preparer) +{ + int intermediateTableCount = 0; + TVector<TRichYPath> paths; + for (const auto& inputNode : node.AsList()) { + if (inputNode.IsNull()) { + ++intermediateTableCount; + } else { + paths.emplace_back(inputNode.AsString()); + } + } + paths = NRawClient::CanonizeYPaths(/* retryPolicy */ nullptr, preparer.GetContext(), paths); + TStructuredJobTableList result(intermediateTableCount, TStructuredJobTable::Intermediate(TUnspecifiedTableStructure())); + for (const auto& path : paths) { + result.emplace_back(TStructuredJobTable{TUnspecifiedTableStructure(), path}); + } + return result; +} + +TString GetIOInfo( + const 
IStructuredJob& job, + const TCreateClientOptions& options, + const TString& cluster, + const TString& transactionId, + const TString& inputPaths, + const TString& outputPaths, + const TString& neededColumns) +{ + auto client = NDetail::CreateClientImpl(cluster, options); + TOperationPreparer preparer(client, GetGuid(transactionId)); + + auto structuredInputs = NodeToStructuredTablePaths(NodeFromYsonString(inputPaths), preparer); + auto structuredOutputs = NodeToStructuredTablePaths(NodeFromYsonString(outputPaths), preparer); + + auto neededColumnsNode = NodeFromYsonString(neededColumns); + THashSet<TString> columnsUsedInOperations; + for (const auto& columnNode : neededColumnsNode.AsList()) { + columnsUsedInOperations.insert(columnNode.AsString()); + } + + auto operationIo = CreateSimpleOperationIoHelper( + job, + preparer, + TOperationOptions(), + std::move(structuredInputs), + std::move(structuredOutputs), + TUserJobFormatHints(), + ENodeReaderFormat::Yson, + columnsUsedInOperations); + + return BuildYsonStringFluently().BeginMap() + .Item("input_format").Value(operationIo.InputFormat.Config) + .Item("output_format").Value(operationIo.OutputFormat.Config) + .Item("input_table_paths").List(operationIo.Inputs) + .Item("output_table_paths").List(operationIo.Outputs) + .Item("small_files").DoListFor( + operationIo.JobFiles.begin(), + operationIo.JobFiles.end(), + [] (TFluentList fluent, auto fileIt) { + fluent.Item().BeginMap() + .Item("file_name").Value(fileIt->FileName) + .Item("data").Value(fileIt->Data) + .EndMap(); + }) + .EndMap(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/py_helpers.h b/yt/cpp/mapreduce/client/py_helpers.h new file mode 100644 index 0000000000..85aa0a93f3 --- /dev/null +++ b/yt/cpp/mapreduce/client/py_helpers.h @@ -0,0 +1,25 @@ +#include <yt/cpp/mapreduce/interface/client_method_options.h> +#include <yt/cpp/mapreduce/interface/operation.h> 
+ +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +using IStructuredJobPtr = TIntrusiveConstPtr<IStructuredJob>; + +IStructuredJobPtr ConstructJob(const TString& jobName, const TString& state); + +TString GetJobStateString(const IStructuredJob& job); + +TString GetIOInfo( + const IStructuredJob& job, + const TCreateClientOptions& options, + const TString& cluster, + const TString& transactionId, + const TString& inputPaths, + const TString& outputPaths, + const TString& neededColumns); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/retry_heavy_write_request.cpp b/yt/cpp/mapreduce/client/retry_heavy_write_request.cpp new file mode 100644 index 0000000000..b4e4975d7f --- /dev/null +++ b/yt/cpp/mapreduce/client/retry_heavy_write_request.cpp @@ -0,0 +1,87 @@ +#include "retry_heavy_write_request.h" + +#include "transaction.h" +#include "transaction_pinger.h" + +#include <yt/cpp/mapreduce/common/retry_lib.h> +#include <yt/cpp/mapreduce/common/wait_proxy.h> + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/tvm.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <yt/cpp/mapreduce/http/helpers.h> +#include <yt/cpp/mapreduce/http/http_client.h> +#include <yt/cpp/mapreduce/http/requests.h> +#include <yt/cpp/mapreduce/http/retry_request.h> + +namespace NYT { + +using ::ToString; + +//////////////////////////////////////////////////////////////////////////////// + +void RetryHeavyWriteRequest( + const IClientRetryPolicyPtr& clientRetryPolicy, + const ITransactionPingerPtr& transactionPinger, + const TClientContext& context, + const TTransactionId& parentId, + THttpHeader& header, + std::function<THolder<IInputStream>()> streamMaker) +{ + int retryCount = context.Config->RetryCount; + if (context.ServiceTicketAuth) { + 
header.SetServiceTicket(context.ServiceTicketAuth->Ptr->IssueServiceTicket()); + } else { + header.SetToken(context.Token); + } + + for (int attempt = 0; attempt < retryCount; ++attempt) { + TPingableTransaction attemptTx(clientRetryPolicy, context, parentId, transactionPinger->GetChildTxPinger(), TStartTransactionOptions()); + + auto input = streamMaker(); + TString requestId; + + try { + auto hostName = GetProxyForHeavyRequest(context); + requestId = CreateGuidAsString(); + + header.AddTransactionId(attemptTx.GetId(), /* overwrite = */ true); + header.SetRequestCompression(ToString(context.Config->ContentEncoding)); + + auto request = context.HttpClient->StartRequest(GetFullUrl(hostName, context, header), requestId, header); + TransferData(input.Get(), request->GetStream()); + request->Finish()->GetResponse(); + } catch (TErrorResponse& e) { + YT_LOG_ERROR("RSP %v - attempt %v failed", + requestId, + attempt); + + if (!IsRetriable(e) || attempt + 1 == retryCount) { + throw; + } + NDetail::TWaitProxy::Get()->Sleep(GetBackoffDuration(e, context.Config)); + continue; + + } catch (std::exception& e) { + YT_LOG_ERROR("RSP %v - %v - attempt %v failed", + requestId, + e.what(), + attempt); + + if (attempt + 1 == retryCount) { + throw; + } + NDetail::TWaitProxy::Get()->Sleep(GetBackoffDuration(e, context.Config)); + continue; + } + + attemptTx.Commit(); + return; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/retry_heavy_write_request.h b/yt/cpp/mapreduce/client/retry_heavy_write_request.h new file mode 100644 index 0000000000..647cad302c --- /dev/null +++ b/yt/cpp/mapreduce/client/retry_heavy_write_request.h @@ -0,0 +1,21 @@ +#pragma once + +#include <yt/cpp/mapreduce/common/fwd.h> + +#include <yt/cpp/mapreduce/http/requests.h> + +namespace NYT { + +/////////////////////////////////////////////////////////////////////////////// + +void RetryHeavyWriteRequest( + const 
IClientRetryPolicyPtr& clientRetryPolicy, + const ITransactionPingerPtr& transactionPinger, + const TClientContext& context, + const TTransactionId& parentId, + THttpHeader& header, + std::function<THolder<IInputStream>()> streamMaker); + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/retry_transaction.h b/yt/cpp/mapreduce/client/retry_transaction.h new file mode 100644 index 0000000000..5220c222b8 --- /dev/null +++ b/yt/cpp/mapreduce/client/retry_transaction.h @@ -0,0 +1,71 @@ +#pragma once + +#include <yt/cpp/mapreduce/http/retry_request.h> + +#include <yt/cpp/mapreduce/client/client.h> + +#include <yt/cpp/mapreduce/common/wait_proxy.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +namespace NYT::NDetail { + +template <typename TResult> +TResult RetryTransactionWithPolicy( + const TClientBasePtr& client, + std::function<TResult(ITransactionPtr)> func, + IRequestRetryPolicyPtr retryPolicy) +{ + if (!retryPolicy) { + retryPolicy = CreateDefaultRequestRetryPolicy(client->GetContext().Config); + } + + while (true) { + try { + retryPolicy->NotifyNewAttempt(); + auto transaction = client->StartTransaction(TStartTransactionOptions()); + if constexpr (std::is_same<TResult, void>::value) { + func(transaction); + transaction->Commit(); + return; + } else { + auto result = func(transaction); + transaction->Commit(); + return result; + } + } catch (const TErrorResponse& e) { + YT_LOG_ERROR("Retry failed %v - %v", + e.GetError().GetMessage(), + retryPolicy->GetAttemptDescription()); + + if (!IsRetriable(e)) { + throw; + } + + auto maybeRetryTimeout = retryPolicy->OnRetriableError(e); + if (maybeRetryTimeout) { + TWaitProxy::Get()->Sleep(*maybeRetryTimeout); + } else { + throw; + } + } catch (const std::exception& e) { + YT_LOG_ERROR("Retry failed %v - %v", + e.what(), + retryPolicy->GetAttemptDescription()); + + if 
(!IsRetriable(e)) { + throw; + } + + auto maybeRetryTimeout = retryPolicy->OnGenericError(e); + if (maybeRetryTimeout) { + TWaitProxy::Get()->Sleep(*maybeRetryTimeout); + } else { + throw; + } + } + } +} + +} // namespace NYT::NDetail diff --git a/yt/cpp/mapreduce/client/retryful_writer.cpp b/yt/cpp/mapreduce/client/retryful_writer.cpp new file mode 100644 index 0000000000..12b2939ffa --- /dev/null +++ b/yt/cpp/mapreduce/client/retryful_writer.cpp @@ -0,0 +1,163 @@ +#include "retryful_writer.h" + +#include "retry_heavy_write_request.h" + +#include <yt/cpp/mapreduce/http/requests.h> + +#include <yt/cpp/mapreduce/interface/errors.h> +#include <yt/cpp/mapreduce/interface/finish_or_die.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <util/generic/size_literals.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +TRetryfulWriter::~TRetryfulWriter() +{ + NDetail::FinishOrDie(this, "TRetryfulWriter"); +} + +void TRetryfulWriter::CheckWriterState() +{ + switch (WriterState_) { + case Ok: + break; + case Completed: + ythrow TApiUsageError() << "Cannot use table writer that is finished"; + case Error: + ythrow TApiUsageError() << "Cannot use table writer that finished with error"; + } +} + +void TRetryfulWriter::NotifyRowEnd() +{ + CheckWriterState(); + if (Buffer_.Size() >= BufferSize_) { + FlushBuffer(false); + } +} + +void TRetryfulWriter::DoWrite(const void* buf, size_t len) +{ + CheckWriterState(); + while (Buffer_.Size() + len > Buffer_.Capacity()) { + Buffer_.Reserve(Buffer_.Capacity() * 2); + } + Buffer_.Append(static_cast<const char*>(buf), len); +} + +void TRetryfulWriter::DoFinish() +{ + if (WriterState_ != Ok) { + return; + } + FlushBuffer(true); + if (Started_) { + FilledBuffers_.Stop(); + Thread_.Join(); + } + if (Exception_) { + WriterState_ = Error; + std::rethrow_exception(Exception_); + } + if (WriteTransaction_) { + WriteTransaction_->Commit(); + } + WriterState_ = Completed; 
+} + +void TRetryfulWriter::FlushBuffer(bool lastBlock) +{ + if (!Started_) { + if (lastBlock) { + try { + Send(Buffer_); + } catch (...) { + WriterState_ = Error; + throw; + } + return; + } else { + Started_ = true; + Thread_.Start(); + } + } + + auto emptyBuffer = EmptyBuffers_.Pop(); + if (!emptyBuffer) { + WriterState_ = Error; + std::rethrow_exception(Exception_); + } + FilledBuffers_.Push(std::move(Buffer_)); + Buffer_ = std::move(emptyBuffer.GetRef()); +} + +void TRetryfulWriter::Send(const TBuffer& buffer) +{ + THttpHeader header("PUT", Command_); + header.SetInputFormat(Format_); + header.MergeParameters(Parameters_); + + auto streamMaker = [&buffer] () { + return MakeHolder<TBufferInput>(buffer); + }; + + auto transactionId = (WriteTransaction_ ? WriteTransaction_->GetId() : ParentTransactionId_); + RetryHeavyWriteRequest(ClientRetryPolicy_, TransactionPinger_, Context_, transactionId, header, streamMaker); + + Parameters_ = SecondaryParameters_; // all blocks except the first one are appended +} + +void TRetryfulWriter::SendThread() +{ + while (auto maybeBuffer = FilledBuffers_.Pop()) { + auto& buffer = maybeBuffer.GetRef(); + try { + Send(buffer); + } catch (const std::exception&) { + Exception_ = std::current_exception(); + EmptyBuffers_.Stop(); + break; + } + buffer.Clear(); + EmptyBuffers_.Push(std::move(buffer)); + } +} + +void* TRetryfulWriter::SendThread(void* opaque) +{ + static_cast<TRetryfulWriter*>(opaque)->SendThread(); + return nullptr; +} + +void TRetryfulWriter::Abort() +{ + if (Started_) { + FilledBuffers_.Stop(); + Thread_.Join(); + } + if (WriteTransaction_) { + WriteTransaction_->Abort(); + } + WriterState_ = Completed; +} + +size_t TRetryfulWriter::GetBufferSize(const TMaybe<TWriterOptions>& writerOptions) +{ + auto retryBlockSize = TMaybe<size_t>(); + if (writerOptions) { + if (writerOptions->RetryBlockSize_) { + retryBlockSize = *writerOptions->RetryBlockSize_; + } else if (writerOptions->DesiredChunkSize_) { + retryBlockSize = 
*writerOptions->DesiredChunkSize_; + } + } + return retryBlockSize.GetOrElse(64_MB); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/retryful_writer.h b/yt/cpp/mapreduce/client/retryful_writer.h new file mode 100644 index 0000000000..38e351977d --- /dev/null +++ b/yt/cpp/mapreduce/client/retryful_writer.h @@ -0,0 +1,130 @@ +#pragma once + +#include "transaction.h" +#include "transaction_pinger.h" + +#include <yt/cpp/mapreduce/common/retry_lib.h> +#include <yt/cpp/mapreduce/http/http.h> +#include <yt/cpp/mapreduce/interface/common.h> +#include <yt/cpp/mapreduce/interface/io.h> +#include <yt/cpp/mapreduce/io/helpers.h> +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> + +#include <library/cpp/threading/blocking_queue/blocking_queue.h> + +#include <util/stream/output.h> +#include <util/generic/buffer.h> +#include <util/stream/buffer.h> +#include <util/system/thread.h> +#include <util/system/event.h> + +#include <atomic> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class TRetryfulWriter + : public TRawTableWriter +{ +public: + template <class TWriterOptions> + TRetryfulWriter( + IClientRetryPolicyPtr clientRetryPolicy, + ITransactionPingerPtr transactionPinger, + const TClientContext& context, + const TTransactionId& parentId, + const TString& command, + const TMaybe<TFormat>& format, + const TRichYPath& path, + const TWriterOptions& options) + : ClientRetryPolicy_(std::move(clientRetryPolicy)) + , TransactionPinger_(std::move(transactionPinger)) + , Context_(context) + , Command_(command) + , Format_(format) + , BufferSize_(GetBufferSize(options.WriterOptions_)) + , ParentTransactionId_(parentId) + , WriteTransaction_() + , FilledBuffers_(2) + , EmptyBuffers_(2) + , Buffer_(BufferSize_ * 2) + , Thread_(TThread::TParams{SendThread, this}.SetName("retryful_writer")) + { + Parameters_ = 
FormIORequestParameters(path, options); + + auto secondaryPath = path; + secondaryPath.Append_ = true; + secondaryPath.Schema_.Clear(); + secondaryPath.CompressionCodec_.Clear(); + secondaryPath.ErasureCodec_.Clear(); + secondaryPath.OptimizeFor_.Clear(); + SecondaryParameters_ = FormIORequestParameters(secondaryPath, options); + + if (options.CreateTransaction_) { + WriteTransaction_.ConstructInPlace(ClientRetryPolicy_, context, parentId, TransactionPinger_->GetChildTxPinger(), TStartTransactionOptions()); + auto append = path.Append_.GetOrElse(false); + auto lockMode = (append ? LM_SHARED : LM_EXCLUSIVE); + NDetail::NRawClient::Lock(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, WriteTransaction_->GetId(), path.Path_, lockMode); + } + + EmptyBuffers_.Push(TBuffer(BufferSize_ * 2)); + } + + ~TRetryfulWriter() override; + void NotifyRowEnd() override; + void Abort() override; + + size_t GetRetryBlockRemainingSize() const + { + return (BufferSize_ > Buffer_.size()) ? (BufferSize_ - Buffer_.size()) : 0; + } + +protected: + void DoWrite(const void* buf, size_t len) override; + void DoFinish() override; + +private: + static size_t GetBufferSize(const TMaybe<TWriterOptions>& writerOptions); + +private: + const IClientRetryPolicyPtr ClientRetryPolicy_; + const ITransactionPingerPtr TransactionPinger_; + const TClientContext Context_; + TString Command_; + TMaybe<TFormat> Format_; + const size_t BufferSize_; + + TNode Parameters_; + TNode SecondaryParameters_; + + TTransactionId ParentTransactionId_; + TMaybe<TPingableTransaction> WriteTransaction_; + + ::NThreading::TBlockingQueue<TBuffer> FilledBuffers_; + ::NThreading::TBlockingQueue<TBuffer> EmptyBuffers_; + + TBuffer Buffer_; + + TThread Thread_; + bool Started_ = false; + std::exception_ptr Exception_ = nullptr; + + enum EWriterState { + Ok, + Completed, + Error, + } WriterState_ = Ok; + +private: + void FlushBuffer(bool lastBlock); + void Send(const TBuffer& buffer); + void CheckWriterState(); + + 
void SendThread(); + static void* SendThread(void* opaque); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} diff --git a/yt/cpp/mapreduce/client/retryless_writer.cpp b/yt/cpp/mapreduce/client/retryless_writer.cpp new file mode 100644 index 0000000000..4c25c1a1dd --- /dev/null +++ b/yt/cpp/mapreduce/client/retryless_writer.cpp @@ -0,0 +1,45 @@ +#include "retryless_writer.h" + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +TRetrylessWriter::~TRetrylessWriter() +{ + NDetail::FinishOrDie(this, "TRetrylessWriter"); +} + +void TRetrylessWriter::DoFinish() +{ + if (!Running_) { + return; + } + Running_ = false; + + BufferedOutput_->Finish(); + Request_->Finish()->GetResponse(); +} + +void TRetrylessWriter::DoWrite(const void* buf, size_t len) +{ + try { + BufferedOutput_->Write(buf, len); + } catch (...) { + Running_ = false; + throw; + } +} + +void TRetrylessWriter::NotifyRowEnd() +{ } + +void TRetrylessWriter::Abort() +{ + Running_ = false; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/retryless_writer.h b/yt/cpp/mapreduce/client/retryless_writer.h new file mode 100644 index 0000000000..baf49a258f --- /dev/null +++ b/yt/cpp/mapreduce/client/retryless_writer.h @@ -0,0 +1,73 @@ +#pragma once + +#include "transaction.h" + +#include <yt/cpp/mapreduce/http/helpers.h> +#include <yt/cpp/mapreduce/http/http.h> +#include <yt/cpp/mapreduce/http/http_client.h> + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/common.h> +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/io.h> +#include <yt/cpp/mapreduce/interface/tvm.h> + +#include <yt/cpp/mapreduce/io/helpers.h> + +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> + +#include <util/stream/buffered.h> 
+ +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class TRetrylessWriter + : public TRawTableWriter +{ +public: + template <class TWriterOptions> + TRetrylessWriter( + const TClientContext& context, + const TTransactionId& parentId, + const TString& command, + const TMaybe<TFormat>& format, + const TRichYPath& path, + size_t bufferSize, + const TWriterOptions& options) + { + THttpHeader header("PUT", command); + header.SetInputFormat(format); + header.MergeParameters(FormIORequestParameters(path, options)); + header.AddTransactionId(parentId); + header.SetRequestCompression(ToString(context.Config->ContentEncoding)); + if (context.ServiceTicketAuth) { + header.SetServiceTicket(context.ServiceTicketAuth->Ptr->IssueServiceTicket()); + } else { + header.SetToken(context.Token); + } + + TString requestId = CreateGuidAsString(); + + auto hostName = GetProxyForHeavyRequest(context); + Request_ = context.HttpClient->StartRequest(GetFullUrl(hostName, context, header), requestId, header); + BufferedOutput_.Reset(new TBufferedOutput(Request_->GetStream(), bufferSize)); + } + + ~TRetrylessWriter() override; + void NotifyRowEnd() override; + void Abort() override; + +protected: + void DoWrite(const void* buf, size_t len) override; + void DoFinish() override; + +private: + bool Running_ = true; + NHttpClient::IHttpRequestPtr Request_; + THolder<TBufferedOutput> BufferedOutput_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/skiff.cpp b/yt/cpp/mapreduce/client/skiff.cpp new file mode 100644 index 0000000000..67a0f960ae --- /dev/null +++ b/yt/cpp/mapreduce/client/skiff.cpp @@ -0,0 +1,396 @@ +#include "skiff.h" + +#include <yt/cpp/mapreduce/common/retry_lib.h> + +#include <yt/cpp/mapreduce/http/retry_request.h> +#include <yt/cpp/mapreduce/http/requests.h> + +#include <yt/cpp/mapreduce/interface/config.h> +#include 
<yt/cpp/mapreduce/interface/common.h> +#include <yt/cpp/mapreduce/interface/serialize.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <library/cpp/yson/node/node_builder.h> +#include <library/cpp/yson/node/node_io.h> + +#include <yt/cpp/mapreduce/raw_client/raw_batch_request.h> +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> + +#include <yt/cpp/mapreduce/skiff/skiff_schema.h> + +#include <library/cpp/yson/consumer.h> +#include <library/cpp/yson/writer.h> + +#include <util/string/cast.h> +#include <util/stream/str.h> +#include <util/stream/file.h> +#include <util/folder/path.h> + +namespace NYT { +namespace NDetail { + +using namespace NRawClient; + +using ::ToString; + +//////////////////////////////////////////////////////////////////////////////// + +static NSkiff::TSkiffSchemaPtr ReadSkiffSchema(const TString& fileName) +{ + if (!TFsPath(fileName).Exists()) { + return nullptr; + } + TIFStream input(fileName); + NSkiff::TSkiffSchemaPtr schema; + Deserialize(schema, NodeFromYsonStream(&input)); + return schema; +} + +NSkiff::TSkiffSchemaPtr GetJobInputSkiffSchema() +{ + return ReadSkiffSchema("skiff_input"); +} + +NSkiff::EWireType ValueTypeToSkiffType(EValueType valueType) +{ + using NSkiff::EWireType; + switch (valueType) { + case VT_INT64: + case VT_INT32: + case VT_INT16: + case VT_INT8: + return EWireType::Int64; + + case VT_UINT64: + case VT_UINT32: + case VT_UINT16: + case VT_UINT8: + return EWireType::Uint64; + + case VT_DOUBLE: + case VT_FLOAT: + return EWireType::Double; + + case VT_BOOLEAN: + return EWireType::Boolean; + + case VT_STRING: + case VT_UTF8: + case VT_JSON: + return EWireType::String32; + + case VT_ANY: + return EWireType::Yson32; + + case VT_NULL: + case VT_VOID: + return EWireType::Nothing; + + case VT_DATE: + case VT_DATETIME: + case VT_TIMESTAMP: + return EWireType::Uint64; + + case VT_INTERVAL: + return EWireType::Int64; + }; + ythrow yexception() << "Cannot convert EValueType '" << valueType << "' to 
NSkiff::EWireType"; +} + +NSkiff::TSkiffSchemaPtr CreateSkiffSchema( + const TTableSchema& schema, + const TCreateSkiffSchemaOptions& options) +{ + using namespace NSkiff; + + Y_ENSURE(schema.Strict(), "Cannot create Skiff schema for non-strict table schema"); + TVector<TSkiffSchemaPtr> skiffColumns; + for (const auto& column: schema.Columns()) { + TSkiffSchemaPtr skiffColumn; + if (column.Type() == VT_ANY && *column.TypeV3() != *NTi::Optional(NTi::Yson())) { + // We ignore all complex types until YT-12717 is done. + return nullptr; + } + if (column.Required() || NTi::IsSingular(column.TypeV3()->GetTypeName())) { + skiffColumn = CreateSimpleTypeSchema(ValueTypeToSkiffType(column.Type())); + } else { + skiffColumn = CreateVariant8Schema({ + CreateSimpleTypeSchema(EWireType::Nothing), + CreateSimpleTypeSchema(ValueTypeToSkiffType(column.Type()))}); + } + if (options.RenameColumns_) { + auto maybeName = options.RenameColumns_->find(column.Name()); + skiffColumn->SetName(maybeName == options.RenameColumns_->end() ? 
column.Name() : maybeName->second);
+        } else {
+            skiffColumn->SetName(column.Name());
+        }
+        skiffColumns.push_back(skiffColumn);
+    }
+
+    if (options.HasKeySwitch_) {
+        skiffColumns.push_back(
+            CreateSimpleTypeSchema(EWireType::Boolean)->SetName("$key_switch"));
+    }
+    if (options.HasRangeIndex_) {
+        skiffColumns.push_back(
+            CreateVariant8Schema({
+                CreateSimpleTypeSchema(EWireType::Nothing),
+                CreateSimpleTypeSchema(EWireType::Int64)})
+            ->SetName("$range_index"));
+    }
+
+    skiffColumns.push_back(
+        CreateVariant8Schema({
+            CreateSimpleTypeSchema(EWireType::Nothing),
+            CreateSimpleTypeSchema(EWireType::Int64)})
+        ->SetName("$row_index"));
+
+    return CreateTupleSchema(std::move(skiffColumns));
+}
+
+// Convenience overload: deserializes |schemaNode| into a TTableSchema and
+// delegates to the TTableSchema overload with the same |options|.
+NSkiff::TSkiffSchemaPtr CreateSkiffSchema(
+    const TNode& schemaNode,
+    const TCreateSkiffSchemaOptions& options)
+{
+    TTableSchema schema;
+    Deserialize(schema, schemaNode);
+    return CreateSkiffSchema(schema, options);
+}
+
+// Writes |schema| to |consumer| as a map with the keys:
+//   "name"      - emitted only when the schema name is non-empty;
+//   "wire_type" - always emitted, string form of the wire type;
+//   "children"  - emitted only when there is at least one child; each child is
+//                 serialized recursively as a list item.
+void Serialize(const NSkiff::TSkiffSchemaPtr& schema, NYson::IYsonConsumer* consumer)
+{
+    consumer->OnBeginMap();
+    if (schema->GetName().size() > 0) {
+        consumer->OnKeyedItem("name");
+        consumer->OnStringScalar(schema->GetName());
+    }
+    consumer->OnKeyedItem("wire_type");
+    consumer->OnStringScalar(ToString(schema->GetWireType()));
+    if (schema->GetChildren().size() > 0) {
+        consumer->OnKeyedItem("children");
+        consumer->OnBeginList();
+        for (const auto& child : schema->GetChildren()) {
+            consumer->OnListItem();
+            Serialize(child, consumer);
+        }
+        consumer->OnEndList();
+    }
+    consumer->OnEndMap();
+}
+
+// Builds |schema| from a map node:
+//   "wire_type" is required;
+//   "children" is required unless the wire type is a simple one;
+//   "name" is optional and, when present, is set on the resulting schema.
+// Violations are reported through Y_ENSURE.
+void Deserialize(NSkiff::TSkiffSchemaPtr& schema, const TNode& node)
+{
+    using namespace NSkiff;
+
+    // Complex wire types map to the matching Create*Schema factory; every other
+    // wire type is created as a simple type and |children| is not used.
+    static auto createSchema = [](EWireType wireType, TVector<TSkiffSchemaPtr>&& children) -> TSkiffSchemaPtr {
+        switch (wireType) {
+            case EWireType::Tuple:
+                return CreateTupleSchema(std::move(children));
+            case EWireType::Variant8:
+                return CreateVariant8Schema(std::move(children));
+            case EWireType::Variant16:
+                return CreateVariant16Schema(std::move(children));
+            case EWireType::RepeatedVariant8:
+                return CreateRepeatedVariant8Schema(std::move(children));
+            case EWireType::RepeatedVariant16:
+                return CreateRepeatedVariant16Schema(std::move(children));
+            default:
+                return CreateSimpleTypeSchema(wireType);
+        }
+    };
+
+    const auto& map = node.AsMap();
+    const auto* wireTypePtr = map.FindPtr("wire_type");
+    Y_ENSURE(wireTypePtr, "'wire_type' is a required key");
+    auto wireType = FromString<NSkiff::EWireType>(wireTypePtr->AsString());
+
+    const auto* childrenPtr = map.FindPtr("children");
+    Y_ENSURE(NSkiff::IsSimpleType(wireType) || childrenPtr,
+        "'children' key is required for complex node '" << wireType << "'");
+    TVector<TSkiffSchemaPtr> children;
+    if (childrenPtr) {
+        for (const auto& childNode : childrenPtr->AsList()) {
+            TSkiffSchemaPtr childSchema;
+            Deserialize(childSchema, childNode);
+            children.push_back(std::move(childSchema));
+        }
+    }
+
+    schema = createSchema(wireType, std::move(children));
+
+    const auto* namePtr = map.FindPtr("name");
+    if (namePtr) {
+        schema->SetName(namePtr->AsString());
+    }
+}
+
+// Builds a "skiff" format node from a variant16 schema (anything else fails Y_ENSURE).
+// Children that compare equal under TSkiffSchemaPtrHasher / TSkiffSchemaPtrEqual share
+// one index. The "table_skiff_schemas" attribute gets one "$<index>" entry per child,
+// and "skiff_schema_registry" maps each distinct index to its serialized schema.
+TFormat CreateSkiffFormat(const NSkiff::TSkiffSchemaPtr& schema) {
+    Y_ENSURE(schema->GetWireType() == NSkiff::EWireType::Variant16,
+        "Bad wire type for schema; expected 'variant16', got " << schema->GetWireType());
+
+    THashMap<
+        NSkiff::TSkiffSchemaPtr,
+        size_t,
+        NSkiff::TSkiffSchemaPtrHasher,
+        NSkiff::TSkiffSchemaPtrEqual> schemasMap;
+    size_t tableIndex = 0;
+    auto config = TNode("skiff");
+    config.Attributes()["table_skiff_schemas"] = TNode::CreateList();
+
+    for (const auto& schemaChild : schema->GetChildren()) {
+        // emplace() leaves an already registered schema untouched; |tableIndex| only
+        // advances when a new distinct schema has been inserted.
+        auto [iter, inserted] = schemasMap.emplace(schemaChild, tableIndex);
+        size_t currentIndex;
+        if (inserted) {
+            currentIndex = tableIndex;
+            ++tableIndex;
+        } else {
+            currentIndex = iter->second;
+        }
+        config.Attributes()["table_skiff_schemas"].Add("$" + ToString(currentIndex));
+    }
+
+    config.Attributes()["skiff_schema_registry"] = TNode::CreateMap();
+
+    for (const auto& [tableSchema, index] : schemasMap) {
+        TNode node;
+        TNodeBuilder nodeBuilder(&node);
+        Serialize(tableSchema, &nodeBuilder);
+        config.Attributes()["skiff_schema_registry"][ToString(index)] = std::move(node);
+    }
+
+    return TFormat(config);
+}
+
+// Decides whether the given input tables can be read in skiff and, if so, returns a
+// variant16 schema with one child per table.
+//
+// Returns nullptr when:
+//   - |nodeReaderFormat| is Yson;
+//   - |nodeReaderFormat| is Auto and some path has column selectors, or some table
+//     is dynamic or has a non-strict schema;
+//   - a per-table CreateSkiffSchema call returns null.
+// Throws TApiUsageError when |nodeReaderFormat| is Skiff and one of the conditions
+// above holds, and for any format when a path is not of type "table".
+NSkiff::TSkiffSchemaPtr CreateSkiffSchemaIfNecessary(
+    const TClientContext& context,
+    const IClientRetryPolicyPtr& clientRetryPolicy,
+    const TTransactionId& transactionId,
+    ENodeReaderFormat nodeReaderFormat,
+    const TVector<TRichYPath>& tablePaths,
+    const TCreateSkiffSchemaOptions& options)
+{
+    if (nodeReaderFormat == ENodeReaderFormat::Yson) {
+        return nullptr;
+    }
+
+    for (const auto& path : tablePaths) {
+        if (path.Columns_) {
+            switch (nodeReaderFormat) {
+                case ENodeReaderFormat::Skiff:
+                    ythrow TApiUsageError() << "Cannot use Skiff format with column selectors";
+                case ENodeReaderFormat::Auto:
+                    return nullptr;
+                default:
+                    Y_FAIL("Unexpected node reader format: %d", static_cast<int>(nodeReaderFormat));
+            }
+        }
+    }
+
+    // Fetch the "schema", "dynamic" and "type" attributes of every (canonized) path.
+    auto nodes = NRawClient::BatchTransform(
+        clientRetryPolicy->CreatePolicyForGenericRequest(),
+        context,
+        NRawClient::CanonizeYPaths(clientRetryPolicy->CreatePolicyForGenericRequest(), context, tablePaths),
+        [&] (TRawBatchRequest& batch, const TRichYPath& path) {
+            auto getOptions = TGetOptions()
+                .AttributeFilter(
+                    TAttributeFilter()
+                        .AddAttribute("schema")
+                        .AddAttribute("dynamic")
+                        .AddAttribute("type")
+                );
+            return batch.Get(transactionId, path.Path_, getOptions);
+        });
+
+    TVector<NSkiff::TSkiffSchemaPtr> schemas;
+    for (size_t tableIndex = 0; tableIndex < nodes.size(); ++tableIndex) {
+        // The user-supplied path (not the canonized one) is used in messages.
+        const auto& tablePath = tablePaths[tableIndex].Path_;
+        const auto& attributes = nodes[tableIndex].GetAttributes();
+        Y_ENSURE_EX(attributes["type"] == TNode("table"),
+            TApiUsageError() << "Operation input path " << tablePath << " is not a table");
+        bool dynamic = attributes["dynamic"].AsBool();
+        bool strict = attributes["schema"].GetAttributes()["strict"].AsBool();
+        switch (nodeReaderFormat) {
+            case ENodeReaderFormat::Skiff:
+                Y_ENSURE_EX(strict,
+                    TApiUsageError() << "Cannot use skiff format for table with non-strict schema '" << tablePath << "'");
+                Y_ENSURE_EX(!dynamic,
+                    TApiUsageError() << "Cannot use skiff format for dynamic table '" << tablePath << "'");
+                break;
+            case ENodeReaderFormat::Auto:
+                if (dynamic || !strict) {
+                    YT_LOG_DEBUG("Cannot use skiff format for table '%v' as it is dynamic or has non-strict schema",
+                        tablePath);
+                    return nullptr;
+                }
+                break;
+            default:
+                Y_FAIL("Unexpected node reader format: %d", static_cast<int>(nodeReaderFormat));
+        }
+
+        NSkiff::TSkiffSchemaPtr curSkiffSchema;
+        if (tablePaths[tableIndex].RenameColumns_) {
+            // Per-path renames override the rename map of a private copy of |options|.
+            auto customOptions = options;
+            customOptions.RenameColumns(*tablePaths[tableIndex].RenameColumns_);
+            curSkiffSchema = CreateSkiffSchema(attributes["schema"], customOptions);
+        } else {
+            curSkiffSchema = CreateSkiffSchema(attributes["schema"], options);
+        }
+
+        if (!curSkiffSchema) {
+            return nullptr;
+        }
+        schemas.push_back(curSkiffSchema);
+    }
+    return NSkiff::CreateVariant16Schema(std::move(schemas));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Wraps every table schema (each must have the 'tuple' wire type) with system columns
+// and returns a variant16 schema over the results. Column order inside each tuple:
+//   "$key_switch" (only if options.HasKeySwitch_), "$row_index",
+//   "$range_index" (only if options.HasRangeIndex_), then the table's own columns.
+NSkiff::TSkiffSchemaPtr CreateSkiffSchema(
+    const TVector<NSkiff::TSkiffSchemaPtr>& tableSchemas,
+    const TCreateSkiffSchemaOptions& options
+) {
+    constexpr auto KEY_SWITCH_COLUMN = "$key_switch";
+    constexpr auto ROW_INDEX_COLUMN = "$row_index";
+    constexpr auto RANGE_INDEX_COLUMN = "$range_index";
+
+    TVector<NSkiff::TSkiffSchemaPtr> schemas;
+    schemas.reserve(tableSchemas.size());
+
+    for (const auto& tableSchema : tableSchemas) {
+        Y_ENSURE(tableSchema->GetWireType() == NSkiff::EWireType::Tuple,
+            "Expected 'tuple' wire type for table schema, got '" << tableSchema->GetWireType() << "'");
+
+        const auto& children = tableSchema->GetChildren();
+        NSkiff::TSkiffSchemaList columns;
+
+        // Room for the table columns plus up to three system columns.
+        columns.reserve(children.size() + 3);
+        if (options.HasKeySwitch_) {
+            columns.push_back(
+                CreateSimpleTypeSchema(NSkiff::EWireType::Boolean)->SetName(KEY_SWITCH_COLUMN));
+        }
+        columns.push_back(
+            NSkiff::CreateVariant8Schema({
+                CreateSimpleTypeSchema(NSkiff::EWireType::Nothing),
+                CreateSimpleTypeSchema(NSkiff::EWireType::Int64)})
+            ->SetName(ROW_INDEX_COLUMN));
+        if (options.HasRangeIndex_) {
+            columns.push_back(
+                NSkiff::CreateVariant8Schema({
+                    CreateSimpleTypeSchema(NSkiff::EWireType::Nothing),
+                    CreateSimpleTypeSchema(NSkiff::EWireType::Int64)})
+                ->SetName(RANGE_INDEX_COLUMN));
+        }
+        columns.insert(columns.end(), children.begin(), children.end());
+
+        schemas.push_back(NSkiff::CreateTupleSchema(columns));
+    }
+
+    return NSkiff::CreateVariant16Schema(schemas);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NDetail
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/client/skiff.h b/yt/cpp/mapreduce/client/skiff.h
new file mode 100644
index 0000000000..82d80a4967
--- /dev/null
+++ b/yt/cpp/mapreduce/client/skiff.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <yt/cpp/mapreduce/common/fwd.h>
+
+#include <yt/cpp/mapreduce/interface/fwd.h>
+#include <yt/cpp/mapreduce/interface/common.h>
+
+#include <yt/cpp/mapreduce/skiff/wire_type.h>
+#include <yt/cpp/mapreduce/skiff/skiff_schema.h>
+
+#include <util/generic/vector.h>
+
+namespace NYT::NYson {
+struct IYsonConsumer;
+} // namespace NYT::NYson
+
+namespace NYT {
+
+struct TClientContext;
+enum class ENodeReaderFormat : int;
+
+namespace NDetail {
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Options for the CreateSkiffSchema family of functions.
+//   HasKeySwitch  - add the "$key_switch" system column (default: false).
+//   HasRangeIndex - add the "$range_index" system column (default: false).
+//   RenameColumns - optional string-to-string map used when naming skiff columns
+//                   (presumably original column name -> new name; confirm against
+//                   the TTableSchema overload of CreateSkiffSchema).
+struct TCreateSkiffSchemaOptions
+{
+    using TSelf = TCreateSkiffSchemaOptions;
+
+    FLUENT_FIELD_DEFAULT(bool, HasKeySwitch, false);
+    FLUENT_FIELD_DEFAULT(bool, HasRangeIndex, false);
+
+    using TRenameColumnsDescriptor = THashMap<TString, TString>;
+    FLUENT_FIELD_OPTION(TRenameColumnsDescriptor, RenameColumns);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Wraps per-table tuple schemas with system columns and combines them into a
+// variant16 schema.
+NSkiff::TSkiffSchemaPtr CreateSkiffSchema(
+    const TVector<NSkiff::TSkiffSchemaPtr>& tableSchemas,
+    const TCreateSkiffSchemaOptions& options);
+
+NSkiff::TSkiffSchemaPtr GetJobInputSkiffSchema();
+
+NSkiff::EWireType ValueTypeToSkiffType(EValueType valueType);
+
+NSkiff::TSkiffSchemaPtr CreateSkiffSchema(
+    const TTableSchema& schema,
+    const TCreateSkiffSchemaOptions& options = TCreateSkiffSchemaOptions());
+
+NSkiff::TSkiffSchemaPtr CreateSkiffSchema(
+    const TNode& schemaNode,
+    const TCreateSkiffSchemaOptions& options = TCreateSkiffSchemaOptions());
+
+void Serialize(const NSkiff::TSkiffSchemaPtr& schema, NYson::IYsonConsumer* consumer);
+
+void Deserialize(NSkiff::TSkiffSchemaPtr& schema, const TNode& node);
+
+TFormat CreateSkiffFormat(const NSkiff::TSkiffSchemaPtr& schema);
+
+NSkiff::TSkiffSchemaPtr CreateSkiffSchemaIfNecessary(
+    const TClientContext& context,
+    const IClientRetryPolicyPtr& clientRetryPolicy,
+    const TTransactionId& transactionId,
+    ENodeReaderFormat nodeReaderFormat,
+    const TVector<TRichYPath>& tablePaths,
+    const TCreateSkiffSchemaOptions& options = TCreateSkiffSchemaOptions());
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NDetail
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/client/structured_table_formats.cpp b/yt/cpp/mapreduce/client/structured_table_formats.cpp
new file mode 100644
index 0000000000..b6e82c6c15
--- /dev/null
+++ b/yt/cpp/mapreduce/client/structured_table_formats.cpp
@@ -0,0 +1,572 @@
+#include "structured_table_formats.h"
+
+#include "format_hints.h"
+#include "skiff.h"
+
+#include <yt/cpp/mapreduce/common/retry_lib.h>
+
+#include <yt/cpp/mapreduce/io/yamr_table_reader.h>
+
+#include <yt/cpp/mapreduce/library/table_schema/protobuf.h>
+
+#include <yt/cpp/mapreduce/interface/common.h>
+
+#include <yt/cpp/mapreduce/raw_client/raw_requests.h>
+
+#include <library/cpp/type_info/type_info.h>
+#include <library/cpp/yson/writer.h>
+
+#include <memory>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Returns the format shared by all entries of |formats| (the first entry is taken as
+// the reference). Throws yexception when entries disagree: one is defined and another
+// is not, "key_column_names" differ, or "subkey_column_names" presence/value differs.
+// Returns an empty TMaybe for an empty list or when no entry is defined.
+TMaybe<TNode> GetCommonTableFormat(
+    const TVector<TMaybe<TNode>>& formats)
+{
+    TMaybe<TNode> result;
+    bool start = true;
+    for (auto& format : formats) {
+        if (start) {
+            result = format;
+            start = false;
+            continue;
+        }
+
+        if (result.Defined() != format.Defined()) {
+            ythrow yexception() << "Different formats of input tables";
+        }
+
+        if (!result.Defined()) {
+            continue;
+        }
+
+        auto& resultAttrs = result.Get()->GetAttributes();
+        auto& formatAttrs = format.Get()->GetAttributes();
+
+        if (resultAttrs["key_column_names"] != formatAttrs["key_column_names"]) {
+            ythrow yexception() << "Different formats of input tables";
+        }
+
+        bool hasSubkeyColumns = resultAttrs.HasKey("subkey_column_names");
+        if (hasSubkeyColumns != formatAttrs.HasKey("subkey_column_names")) {
+            ythrow yexception() << "Different formats of input tables";
+        }
+
+        if (hasSubkeyColumns &&
+            resultAttrs["subkey_column_names"] != formatAttrs["subkey_column_names"])
+        {
+            ythrow yexception() << "Different formats of input tables";
+        }
+    }
+
+    return result;
+}
+
+// Reads the "_format" attribute of the table at |path|. Returns an empty TMaybe when the
+// attribute does not exist or is not "yamred_dsv". Otherwise requires the
+// "key_column_names" attribute (throws yexception if missing), sets "has_subkey" and
+// "lenval" to the string "true" and returns the resulting node.
+TMaybe<TNode> GetTableFormat(
+    const IClientRetryPolicyPtr& retryPolicy,
+    const TClientContext& context,
+    const TTransactionId& transactionId,
+    const TRichYPath& path)
+{
+    auto formatPath = path.Path_ + "/@_format";
+    if (!NDetail::NRawClient::Exists(retryPolicy->CreatePolicyForGenericRequest(), context, transactionId, formatPath)) {
+        return TMaybe<TNode>();
+    }
+    TMaybe<TNode> format = NDetail::NRawClient::Get(retryPolicy->CreatePolicyForGenericRequest(), context, transactionId, formatPath);
+    if (format.Get()->AsString() != "yamred_dsv") {
+        return TMaybe<TNode>();
+    }
+    auto& formatAttrs = format.Get()->Attributes();
+    if (!formatAttrs.HasKey("key_column_names")) {
+        ythrow yexception() <<
+            "Table '" << path.Path_ << "': attribute 'key_column_names' is missing";
+    }
+    formatAttrs["has_subkey"] = "true";
+    formatAttrs["lenval"] = "true";
+    return format;
+}
+
+// Collects GetTableFormat() for every input and reduces the results with
+// GetCommonTableFormat().
+TMaybe<TNode> GetTableFormats(
+    const IClientRetryPolicyPtr& clientRetryPolicy,
+    const TClientContext& context,
+    const TTransactionId& transactionId,
+    const TVector<TRichYPath>& inputs)
+{
+    TVector<TMaybe<TNode>> formats;
+    for (auto& table : inputs) {
+        formats.push_back(GetTableFormat(clientRetryPolicy, context, transactionId, table));
+    }
+
+    return GetCommonTableFormat(formats);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace NDetail {
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Returns a skiff schema for |tables| or nullptr when skiff cannot be used.
+// An operation spec containing "input_query" rules skiff out: nullptr is returned,
+// or TApiUsageError is thrown if skiff was requested explicitly.
+// Otherwise defers to CreateSkiffSchemaIfNecessary with key switch and range index
+// columns enabled.
+NSkiff::TSkiffSchemaPtr TryCreateSkiffSchema(
+    const TClientContext& context,
+    const IClientRetryPolicyPtr& clientRetryPolicy,
+    const TTransactionId& transactionId,
+    const TVector<TRichYPath>& tables,
+    const TOperationOptions& options,
+    ENodeReaderFormat nodeReaderFormat)
+{
+    bool hasInputQuery = options.Spec_.Defined() && options.Spec_->IsMap() && options.Spec_->HasKey("input_query");
+    if (hasInputQuery) {
+        Y_ENSURE_EX(nodeReaderFormat != ENodeReaderFormat::Skiff,
+            TApiUsageError() << "Cannot use Skiff format for operations with 'input_query' in spec");
+        return nullptr;
+    }
+    return CreateSkiffSchemaIfNecessary(
+        context,
+        clientRetryPolicy,
+        transactionId,
+        nodeReaderFormat,
+        tables,
+        TCreateSkiffSchemaOptions()
+            .HasKeySwitch(true)
+            .HasRangeIndex(true));
+}
+
+// Serializes |schema| with a YSON writer and returns the produced text.
+TString CreateSkiffConfig(const NSkiff::TSkiffSchemaPtr& schema)
+{
+    TString result;
+    TStringOutput stream(result);
+    ::NYson::TYsonWriter writer(&stream);
+    Serialize(schema, &writer);
+    return result;
+}
+
+// Returns the full names of the given message descriptors, one per line.
+TString CreateProtoConfig(const TVector<const ::google::protobuf::Descriptor*>& descriptorList)
+{
+    TString result;
+    TStringOutput messageTypeList(result);
+    for (const auto& descriptor : descriptorList) {
+        messageTypeList << descriptor->full_name() << Endl;
+    }
+    return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Visitor producing a human-readable description of a TTableStructure alternative:
+// "Unspecified" or "<message full name | <unknown>> protobuf message".
+struct TGetTableStructureDescriptionStringImpl {
+    template<typename T>
+    TString operator()(const T& description) {
+        if constexpr (std::is_same_v<T, TUnspecifiedTableStructure>) {
+            return "Unspecified";
+        } else if constexpr (std::is_same_v<T, TProtobufTableStructure>) {
+            TString res;
+            // TStringOutput appends directly to |res|. A TStringStream constructed from
+            // |res| would write into its own copy and leave |res| empty.
+            TStringOutput out(res);
+            if (description.Descriptor) {
+                out << description.Descriptor->full_name();
+            } else {
+                out << "<unknown>";
+            }
+            out << " protobuf message";
+            return res;
+        } else {
+            static_assert(TDependentFalse<T>, "Unknown type");
+        }
+    }
+};
+
+TString GetTableStructureDescriptionString(const TTableStructure& tableStructure)
+{
+    return std::visit(TGetTableStructureDescriptionStringImpl(), tableStructure);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Path of the table for use in messages; intermediate tables have no path and are
+// rendered as "<intermediate-table>".
+TString JobTablePathString(const TStructuredJobTable& jobTable)
+{
+    if (jobTable.RichYPath) {
+        return jobTable.RichYPath->Path_;
+    } else {
+        return "<intermediate-table>";
+    }
+}
+
+// Converts user-facing structured paths to job tables, keeping paths as they are.
+TStructuredJobTableList ToStructuredJobTableList(const TVector<TStructuredTablePath>& tableList)
+{
+    TStructuredJobTableList result;
+    for (const auto& table : tableList) {
+        result.push_back(TStructuredJobTable{table.Description, table.RichYPath});
+    }
+    return result;
+}
+
+// Same as ToStructuredJobTableList, but every path is first canonized through
+// NRawClient::CanonizeYPaths (called without a retry policy).
+TStructuredJobTableList CanonizeStructuredTableList(const TClientContext& context, const TVector<TStructuredTablePath>& tableList)
+{
+    TVector<TRichYPath> toCanonize;
+    toCanonize.reserve(tableList.size());
+    for (const auto& table : tableList) {
+        toCanonize.emplace_back(table.RichYPath);
+    }
+    const auto canonized = NRawClient::CanonizeYPaths(/* retryPolicy */ nullptr, context, toCanonize);
+    Y_VERIFY(canonized.size() == tableList.size());
+
+    TStructuredJobTableList result;
+    result.reserve(tableList.size());
+    for (size_t i = 0; i != canonized.size(); ++i) {
+        result.emplace_back(TStructuredJobTable{tableList[i].Description, canonized[i]});
+    }
+    return result;
+}
+
+// Extracts rich paths from |tableList| (every table must have a path). A path that
+// carries no schema gets one from, in order of preference: the non-empty entry of
+// |jobSchemaInferenceResult| at the same index, or - when |inferSchemaFromDescriptions|
+// is set - the table structure description. If neither yields a schema the path is
+// left without one.
+TVector<TRichYPath> GetPathList(
+    const TStructuredJobTableList& tableList,
+    const TMaybe<TVector<TTableSchema>>& jobSchemaInferenceResult,
+    bool inferSchemaFromDescriptions)
+{
+    Y_VERIFY(!jobSchemaInferenceResult || tableList.size() == jobSchemaInferenceResult->size());
+
+    auto maybeInferSchema = [&] (const TStructuredJobTable& table, ui32 tableIndex) -> TMaybe<TTableSchema> {
+        if (jobSchemaInferenceResult && !jobSchemaInferenceResult->at(tableIndex).Empty()) {
+            return jobSchemaInferenceResult->at(tableIndex);
+        }
+        if (inferSchemaFromDescriptions) {
+            return GetTableSchema(table.Description);
+        }
+        return Nothing();
+    };
+
+    TVector<TRichYPath> result;
+    result.reserve(tableList.size());
+    for (size_t tableIndex = 0; tableIndex != tableList.size(); ++tableIndex) {
+        const auto& table = tableList[tableIndex];
+        Y_VERIFY(table.RichYPath, "Cannot get path for intermediate table");
+        auto richYPath = *table.RichYPath;
+        if (!richYPath.Schema_) {
+            if (auto schema = maybeInferSchema(table, tableIndex)) {
+                richYPath.Schema(std::move(*schema));
+            }
+        }
+
+        result.emplace_back(std::move(richYPath));
+    }
+    return result;
+}
+
+
+// Returns the job's input or output row stream description depending on |direction|.
+TStructuredRowStreamDescription GetJobStreamDescription(
+    const IStructuredJob& job,
+    EIODirection direction)
+{
+    switch (direction) {
+        case EIODirection::Input:
+            return job.GetInputRowStreamDescription();
+        case EIODirection::Output:
+            return job.GetOutputRowStreamDescription();
+        default:
+            Y_FAIL("unreachable");
+    }
+}
+
+// Suffix ("_input" / "_output") appended to the names of small job files.
+TString GetSuffix(EIODirection direction)
+{
+    switch (direction) {
+        case EIODirection::Input:
+            return "_input";
+        case EIODirection::Output:
+            return "_output";
+    }
+    Y_FAIL("unreachable");
+}
+
+// Name of the spec method ("AddInput<>" / "AddOutput<>") mentioned in error messages.
+TString GetAddIOMethodName(EIODirection direction)
+{
+    switch (direction) {
+        case EIODirection::Input:
+            return "AddInput<>";
+        case EIODirection::Output:
+            return "AddOutput<>";
+    }
+    Y_FAIL("unreachable");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Visitor mapping a row stream description alternative to the TFormatBuilder member
+// function that builds the format for it.
+struct TFormatBuilder::TFormatSwitcher
+{
+    template <typename T>
+    auto operator() (const T& /*t*/) {
+        if constexpr (std::is_same_v<T, TTNodeStructuredRowStream>) {
+            return &TFormatBuilder::CreateNodeFormat;
+        } else if constexpr (std::is_same_v<T, TTYaMRRowStructuredRowStream>) {
+            return &TFormatBuilder::CreateYamrFormat;
+        } else if constexpr (std::is_same_v<T, TProtobufStructuredRowStream>) {
+            return &TFormatBuilder::CreateProtobufFormat;
+        } else if constexpr (std::is_same_v<T, TVoidStructuredRowStream>) {
+            return &TFormatBuilder::CreateVoidFormat;
+        } else {
+            static_assert(TDependentFalse<T>, "unknown stream description");
+        }
+    }
+};
+
+TFormatBuilder::TFormatBuilder(
+    IClientRetryPolicyPtr clientRetryPolicy,
+    TClientContext context,
+    TTransactionId transactionId,
+    TOperationOptions operationOptions)
+    : ClientRetryPolicy_(std::move(clientRetryPolicy))
+    , Context_(std::move(context))
+    , TransactionId_(transactionId)
+    , OperationOptions_(std::move(operationOptions))
+{ }
+
+// Builds the format (and an optional small job file) for one direction of |job| by
+// dispatching on the job's row stream description to one of the Create*Format methods.
+std::pair <TFormat, TMaybe<TSmallJobFile>> TFormatBuilder::CreateFormat(
+    const IStructuredJob& job,
+    const EIODirection& direction,
+    const TStructuredJobTableList& structuredTableList,
+    const TMaybe <TFormatHints>& formatHints,
+    ENodeReaderFormat nodeReaderFormat,
+    bool allowFormatFromTableAttribute)
+{
+    auto jobStreamDescription = GetJobStreamDescription(job, direction);
+    auto method = std::visit(TFormatSwitcher(), jobStreamDescription);
+    return (this->*method)(
+        job,
+        direction,
+        structuredTableList,
+        formatHints,
+        nodeReaderFormat,
+        allowFormatFromTableAttribute);
+}
+
+// Void streams need neither a specific format nor a job file.
+std::pair<TFormat, TMaybe<TSmallJobFile>> TFormatBuilder::CreateVoidFormat(
+    const IStructuredJob& /*job*/,
+    const EIODirection& /*direction*/,
+    const TStructuredJobTableList& /*structuredTableList*/,
+    const TMaybe<TFormatHints>& /*formatHints*/,
+    ENodeReaderFormat /*nodeReaderFormat*/,
+    bool /*allowFormatFromTableAttribute*/)
+{
+    return {
+        TFormat(),
+        Nothing()
+    };
+}
+
+// YaMR rows: every table must have an unspecified structure (TApiUsageError otherwise).
+// When allowed by both the caller and the operation options, a common format taken from
+// the tables' "_format" attributes is used; otherwise a default "yamr" format with
+// lenval, has_subkey and enable_table_index set to true. No job file is produced.
+std::pair<TFormat, TMaybe<TSmallJobFile>> TFormatBuilder::CreateYamrFormat(
+    const IStructuredJob& job,
+    const EIODirection& direction,
+    const TStructuredJobTableList& structuredTableList,
+    const TMaybe<TFormatHints>& /*formatHints*/,
+    ENodeReaderFormat /*nodeReaderFormat*/,
+    bool allowFormatFromTableAttribute)
+{
+    for (const auto& table: structuredTableList) {
+        if (!std::holds_alternative<TUnspecifiedTableStructure>(table.Description)) {
+            ythrow TApiUsageError()
+                << "cannot use " << direction << " table '" << JobTablePathString(table)
+                << "' with job " << TJobFactory::Get()->GetJobName(&job) << "; "
+                << "table has unsupported structure description; check " << GetAddIOMethodName(direction) << " for this table";
+        }
+    }
+    TMaybe<TNode> formatFromTableAttributes;
+    if (allowFormatFromTableAttribute && OperationOptions_.UseTableFormats_) {
+        TVector<TRichYPath> tableList;
+        for (const auto& table: structuredTableList) {
+            Y_VERIFY(table.RichYPath, "Cannot use format from table for intermediate table");
+            tableList.push_back(*table.RichYPath);
+        }
+        formatFromTableAttributes = GetTableFormats(ClientRetryPolicy_, Context_, TransactionId_, tableList);
+    }
+    if (formatFromTableAttributes) {
+        return {
+            TFormat(*formatFromTableAttributes),
+            Nothing()
+        };
+    } else {
+        auto formatNode = TNode("yamr");
+        formatNode.Attributes() = TNode()
+            ("lenval", true)
+            ("has_subkey", true)
+            ("enable_table_index", true);
+        return {
+            TFormat(formatNode),
+            Nothing()
+        };
+    }
+}
+
+// TNode rows: every table must have an unspecified structure (TApiUsageError otherwise).
+// Unless YSON is requested, tries to build a skiff schema for the tables; on success
+// returns the skiff format together with a "skiff_input"/"skiff_output" job file holding
+// the serialized schema, otherwise falls back to binary YSON without a job file.
+// |formatHints| are applied to whichever format is returned.
+std::pair<TFormat, TMaybe<TSmallJobFile>> TFormatBuilder::CreateNodeFormat(
+    const IStructuredJob& job,
+    const EIODirection& direction,
+    const TStructuredJobTableList& structuredTableList,
+    const TMaybe<TFormatHints>& formatHints,
+    ENodeReaderFormat nodeReaderFormat,
+    bool /*allowFormatFromTableAttribute*/)
+{
+    for (const auto& table: structuredTableList) {
+        if (!std::holds_alternative<TUnspecifiedTableStructure>(table.Description)) {
+            ythrow TApiUsageError()
+                << "cannot use " << direction << " table '" << JobTablePathString(table)
+                << "' with job " << TJobFactory::Get()->GetJobName(&job) << "; "
+                << "table has unsupported structure description; check AddInput<> / AddOutput<> for this table";
+        }
+    }
+    NSkiff::TSkiffSchemaPtr skiffSchema = nullptr;
+    if (nodeReaderFormat != ENodeReaderFormat::Yson) {
+        TVector<TRichYPath> tableList;
+        for (const auto& table: structuredTableList) {
+            Y_VERIFY(table.RichYPath, "Cannot use skiff with temporary tables");
+            tableList.emplace_back(*table.RichYPath);
+        }
+        skiffSchema = TryCreateSkiffSchema(
+            Context_,
+            ClientRetryPolicy_,
+            TransactionId_,
+            tableList,
+            OperationOptions_,
+            nodeReaderFormat);
+    }
+    if (skiffSchema) {
+        auto format = CreateSkiffFormat(skiffSchema);
+        NYT::NDetail::ApplyFormatHints<TNode>(&format, formatHints);
+        return {
+            // Return the format the hints were applied to, not a freshly built one.
+            format,
+            TSmallJobFile{
+                TString("skiff") + GetSuffix(direction),
+                CreateSkiffConfig(skiffSchema)
+            }
+        };
+    } else {
+        auto format = TFormat::YsonBinary();
+        NYT::NDetail::ApplyFormatHints<TNode>(&format, formatHints);
+        return {
+            format,
+            Nothing()
+        };
+    }
+}
+
+// Throws TApiUsageError: |table| carries a structure description the job cannot use.
+[[noreturn]] static void ThrowUnsupportedStructureDescription(
+    const EIODirection& direction,
+    const TStructuredJobTable& table,
+    const IStructuredJob& job)
+{
+    ythrow TApiUsageError()
+        << "cannot use " << direction << " table '" << JobTablePathString(table)
+        << "' with job " << TJobFactory::Get()->GetJobName(&job) << "; "
+        << "table has unsupported structure description; check " << GetAddIOMethodName(direction) << " for this table";
+}
+
+// Throws TApiUsageError: the row type of an intermediate table cannot be derived.
+[[noreturn]] static void ThrowTypeDeriveFail(
+    const EIODirection& direction,
+    const IStructuredJob& job,
+    const TString& type)
+{
+    ythrow TApiUsageError()
+        << "Cannot derive exact " << type << " type for intermediate " << direction << " table for job "
+        << TJobFactory::Get()->GetJobName(&job)
+        << "; use one of TMapReduceOperationSpec::Hint* methods to specifiy intermediate table structure";
+}
+
+// Throws TApiUsageError: the job and the table are tagged with different descriptors.
+[[noreturn]] static void ThrowUnexpectedDifferentDescriptors(
+    const EIODirection& direction,
+    const TStructuredJobTable& table,
+    const IStructuredJob& job,
+    const TMaybe<TStringBuf> jobDescriptorName,
+    const TMaybe<TStringBuf> descriptorName)
+{
+    ythrow TApiUsageError()
+        << "Job " << TJobFactory::Get()->GetJobName(&job) << " expects "
+        << jobDescriptorName << " as " << direction << ", but table " << JobTablePathString(table)
+        << " is tagged with " << descriptorName;
+}
+
+// Protobuf rows. With Config->UseClientProtobuf the data travels as binary YSON and the
+// job file lists no message types. Otherwise one descriptor per table is collected:
+// taken from the table's protobuf structure, or - for intermediate tables without one -
+// from the job itself. Tables with a path but without protobuf structure, underivable
+// intermediate types and job/table descriptor mismatches raise TApiUsageError.
+// The job file ("proto_input"/"proto_output") lists the message type names.
+std::pair<TFormat, TMaybe<TSmallJobFile>> TFormatBuilder::CreateProtobufFormat(
+    const IStructuredJob& job,
+    const EIODirection& direction,
+    const TStructuredJobTableList& structuredTableList,
+    const TMaybe<TFormatHints>& /*formatHints*/,
+    ENodeReaderFormat /*nodeReaderFormat*/,
+    bool /*allowFormatFromTableAttribute*/)
+{
+    if (Context_.Config->UseClientProtobuf) {
+        return {
+            TFormat::YsonBinary(),
+            TSmallJobFile{
+                TString("proto") + GetSuffix(direction),
+                CreateProtoConfig({}),
+            },
+        };
+    }
+    const ::google::protobuf::Descriptor* const jobDescriptor =
+        std::get<TProtobufStructuredRowStream>(GetJobStreamDescription(job, direction)).Descriptor;
+    Y_ENSURE(!structuredTableList.empty(),
+        "empty " << direction << " tables for job " << TJobFactory::Get()->GetJobName(&job));
+
+    TVector<const ::google::protobuf::Descriptor*> descriptorList;
+    for (const auto& table : structuredTableList) {
+        const ::google::protobuf::Descriptor* descriptor = nullptr;
+        if (std::holds_alternative<TProtobufTableStructure>(table.Description)) {
+            descriptor = std::get<TProtobufTableStructure>(table.Description).Descriptor;
+        } else if (table.RichYPath) {
+            ThrowUnsupportedStructureDescription(direction, table, job);
+        }
+        if (!descriptor) {
+            // It must be intermediate table, because there is no proper way to add such table to spec
+            // (AddInput requires to specify proper message).
+            Y_VERIFY(!table.RichYPath, "Descriptors for all tables except intermediate must be known");
+            if (jobDescriptor) {
+                descriptor = jobDescriptor;
+            } else {
+                ThrowTypeDeriveFail(direction, job, "protobuf");
+            }
+        }
+        if (jobDescriptor && descriptor != jobDescriptor) {
+            ThrowUnexpectedDifferentDescriptors(
+                direction,
+                table,
+                job,
+                jobDescriptor->full_name(),
+                descriptor->full_name());
+        }
+        descriptorList.push_back(descriptor);
+    }
+    Y_VERIFY(!descriptorList.empty(), "Messages for proto format are unknown (empty ProtoDescriptors)");
+    return {
+        TFormat::Protobuf(descriptorList, Context_.Config->ProtobufFormatWithDescriptors),
+        TSmallJobFile{
+            TString("proto") + GetSuffix(direction),
+            CreateProtoConfig(descriptorList)
+        },
+    };
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Visitor deriving a table schema from a TTableStructure alternative. Only a protobuf
+// structure with a non-null descriptor yields a schema.
+struct TGetTableSchemaImpl
+{
+    template <typename T>
+    TMaybe<TTableSchema> operator() (const T& description) {
+        if constexpr (std::is_same_v<T, TUnspecifiedTableStructure>) {
+            return Nothing();
+        } else if constexpr (std::is_same_v<T, TProtobufTableStructure>) {
+            if (!description.Descriptor) {
+                return Nothing();
+            }
+            return CreateTableSchema(*description.Descriptor);
+        } else {
+            static_assert(TDependentFalse<T>, "unknown type");
+        }
+    }
+};
+
+TMaybe<TTableSchema> GetTableSchema(const TTableStructure& tableStructure)
+{
+    return std::visit(TGetTableSchemaImpl(), tableStructure);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NDetail
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/client/structured_table_formats.h b/yt/cpp/mapreduce/client/structured_table_formats.h
new file mode 100644
index 0000000000..27d980c587
--- /dev/null
+++ b/yt/cpp/mapreduce/client/structured_table_formats.h
@@ -0,0 +1,146 @@
+#pragma once
+
+#include <yt/cpp/mapreduce/interface/fwd.h>
+
+#include <yt/cpp/mapreduce/interface/config.h>
+#include <yt/cpp/mapreduce/interface/operation.h>
+
+#include <yt/cpp/mapreduce/common/fwd.h>
+
+#include <yt/cpp/mapreduce/http/context.h>
+#include <yt/cpp/mapreduce/http/requests.h>
+
+#include <utility>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Returns the format common to all given per-table formats; throws yexception when
+// they are not compatible with each other.
+TMaybe<TNode> GetCommonTableFormat(
+    const TVector<TMaybe<TNode>>& formats);
+
+// Returns the "yamred_dsv" format stored in the table's "_format" attribute, if any.
+TMaybe<TNode> GetTableFormat(
+    const IClientRetryPolicyPtr& clientRetryPolicy,
+    const TClientContext& context,
+    const TTransactionId& transactionId,
+    const TRichYPath& path);
+
+// GetTableFormat() for every path, reduced with GetCommonTableFormat().
+TMaybe<TNode> GetTableFormats(
+    const IClientRetryPolicyPtr& clientRetryPolicy,
+    const TClientContext& context,
+    const TTransactionId& transactionId,
+    const TVector<TRichYPath>& paths);
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace NDetail {
+
+////////////////////////////////////////////////////////////////////////////////
+
+enum class EIODirection
+{
+    Input,
+    Output,
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+// A named blob of data produced while building job formats (e.g. a serialized skiff
+// schema or a list of protobuf message types).
+struct TSmallJobFile
+{
+    TString FileName;
+    TString Data;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Table that is used while preparing operation formats. Can be real table or intermediate
+struct TStructuredJobTable
+{
+    TTableStructure Description;
+    // Might be null for intermediate tables in MapReduce operation
+    TMaybe<TRichYPath> RichYPath;
+
+    static TStructuredJobTable Intermediate(TTableStructure description)
+    {
+        return TStructuredJobTable{std::move(description), Nothing()};
+    }
+};
+using TStructuredJobTableList = TVector<TStructuredJobTable>;
+TString JobTablePathString(const TStructuredJobTable& jobTable);
+TStructuredJobTableList ToStructuredJobTableList(const TVector<TStructuredTablePath>& tableList);
+
+TStructuredJobTableList CanonizeStructuredTableList(const TClientContext& context, const TVector<TStructuredTablePath>& tableList);
+TVector<TRichYPath> GetPathList(
+    const TStructuredJobTableList& tableList,
+    const TMaybe<TVector<TTableSchema>>& schemaInferenceResult,
+    bool inferSchema);
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Builds input/output formats for structured jobs. CreateFormat() picks one of the
+// Create*Format methods according to the job's row stream description; all of them
+// share the same signature so that they can be dispatched through a member pointer.
+class TFormatBuilder
+{
+private:
+    struct TFormatSwitcher;
+
+public:
+    TFormatBuilder(
+        IClientRetryPolicyPtr clientRetryPolicy,
+        TClientContext context,
+        TTransactionId transactionId,
+        TOperationOptions operationOptions);
+
+    std::pair<TFormat, TMaybe<TSmallJobFile>> CreateFormat(
+        const IStructuredJob& job,
+        const EIODirection& direction,
+        const TStructuredJobTableList& structuredTableList,
+        const TMaybe<TFormatHints>& formatHints,
+        ENodeReaderFormat nodeReaderFormat,
+        bool allowFormatFromTableAttribute);
+
+    std::pair<TFormat, TMaybe<TSmallJobFile>> CreateVoidFormat(
+        const IStructuredJob& job,
+        const EIODirection& direction,
+        const TStructuredJobTableList& structuredTableList,
+        const TMaybe<TFormatHints>& formatHints,
+        ENodeReaderFormat nodeReaderFormat,
+        bool allowFormatFromTableAttribute);
+
+    std::pair<TFormat, TMaybe<TSmallJobFile>> CreateYamrFormat(
+        const IStructuredJob& job,
+        const EIODirection& direction,
+        const TStructuredJobTableList& structuredTableList,
+        const TMaybe<TFormatHints>& formatHints,
+        ENodeReaderFormat nodeReaderFormat,
+        bool allowFormatFromTableAttribute);
+
+    std::pair<TFormat, TMaybe<TSmallJobFile>> CreateNodeFormat(
+        const IStructuredJob& job,
+        const EIODirection& direction,
+        const TStructuredJobTableList& structuredTableList,
+        const TMaybe<TFormatHints>& formatHints,
+        ENodeReaderFormat nodeReaderFormat,
+        bool allowFormatFromTableAttribute);
+
+    std::pair<TFormat, TMaybe<TSmallJobFile>> CreateProtobufFormat(
+        const IStructuredJob& job,
+        const EIODirection& direction,
+        const TStructuredJobTableList& structuredTableList,
+        const TMaybe<TFormatHints>& formatHints,
+        ENodeReaderFormat nodeReaderFormat,
+        bool allowFormatFromTableAttribute);
+
+private:
+    const IClientRetryPolicyPtr ClientRetryPolicy_;
+    const TClientContext Context_;
+    const TTransactionId TransactionId_;
+    const TOperationOptions OperationOptions_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Schema derived from the table structure description, if it provides one.
+TMaybe<TTableSchema> GetTableSchema(const TTableStructure& tableStructure);
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NDetail
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/client/transaction.cpp b/yt/cpp/mapreduce/client/transaction.cpp
new file mode 100644
index 0000000000..0aa1a7a1c3
--- /dev/null
+++ b/yt/cpp/mapreduce/client/transaction.cpp
@@ -0,0 +1,195 @@
+#include "transaction.h"
+
+#include "transaction_pinger.h"
+
+#include <yt/cpp/mapreduce/interface/config.h>
+#include <yt/cpp/mapreduce/interface/error_codes.h>
+
+#include <yt/cpp/mapreduce/common/wait_proxy.h>
+#include <yt/cpp/mapreduce/common/retry_lib.h>
+
+#include <yt/cpp/mapreduce/http/requests.h>
+#include <yt/cpp/mapreduce/http/retry_request.h>
+
+#include <yt/cpp/mapreduce/raw_client/raw_requests.h>
+
+#include <util/datetime/base.h>
+
+#include <util/generic/scope.h>
+
+#include <util/random/random.h>
+
+#include <util/string/builder.h>
+
+namespace NYT {
+
+//////////////////////////////////////////////////////////////////////////////// + +/* Starts a new transaction under parentId; AbortOnTermination_ is fixed to true for transactions created this way. */ TPingableTransaction::TPingableTransaction( + const IClientRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& parentId, + ITransactionPingerPtr transactionPinger, + const TStartTransactionOptions& options) + : ClientRetryPolicy_(retryPolicy) + , Context_(context) + , AbortableRegistry_(NDetail::TAbortableRegistry::Get()) + , AbortOnTermination_(true) + , AutoPingable_(options.AutoPingable_) + , Pinger_(std::move(transactionPinger)) +{ + auto transactionId = NDetail::NRawClient::StartTransaction( + ClientRetryPolicy_->CreatePolicyForGenericRequest(), + context, + parentId, + options); + + /* An explicit options.Timeout_ wins; otherwise the configured TxTimeout is used. */ auto actualTimeout = options.Timeout_.GetOrElse(Context_.Config->TxTimeout); + Init(context, transactionId, actualTimeout); +} + +/* Attaches to an existing transaction; its timeout is read from the transaction's @timeout attribute. */ TPingableTransaction::TPingableTransaction( + const IClientRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + ITransactionPingerPtr transactionPinger, + const TAttachTransactionOptions& options) + : ClientRetryPolicy_(retryPolicy) + , Context_(context) + , AbortableRegistry_(NDetail::TAbortableRegistry::Get()) + , AbortOnTermination_(options.AbortOnTermination_) + , AutoPingable_(options.AutoPingable_) + , Pinger_(std::move(transactionPinger)) +{ + auto timeoutNode = NDetail::NRawClient::TryGet( + ClientRetryPolicy_->CreatePolicyForGenericRequest(), + context, + TTransactionId(), + "#" + GetGuidAsString(transactionId) + "/@timeout", + TGetOptions()); + /* An undefined node is reported as a missing transaction. */ if (timeoutNode.IsUndefined()) { + throw yexception() << "Transaction " << GetGuidAsString(transactionId) << " does not exist"; + } + /* The attribute value is interpreted as milliseconds. */ auto timeout = TDuration::MilliSeconds(timeoutNode.AsInt64()); + Init(context, transactionId, timeout); +} + +/* Records the id, registers the transaction in the abortable registry when AbortOnTermination_ is set and, when AutoPingable_ is set, with the pinger. */ void TPingableTransaction::Init( + const TClientContext& context, + const TTransactionId& transactionId, + TDuration timeout) +{ + TransactionId_ = transactionId; + + if (AbortOnTermination_) { + 
AbortableRegistry_->Add( + TransactionId_, + ::MakeIntrusive<NDetail::TTransactionAbortable>(context, TransactionId_)); + } + + if (AutoPingable_) { + // Compute 'MaxPingInterval_' and 'MinPingInterval_' such that 'pingInterval == (max + min) / 2'. + auto pingInterval = Context_.Config->PingInterval; + /* Leave a 5 second margin below the transaction timeout. */ auto safeTimeout = timeout - TDuration::Seconds(5); + MaxPingInterval_ = Max(pingInterval, Min(safeTimeout, pingInterval * 1.5)); + MinPingInterval_ = pingInterval - (MaxPingInterval_ - pingInterval); + + Pinger_->RegisterTransaction(*this); + } +} + +/* Best-effort stop: aborts or detaches depending on AbortOnTermination_; any exception is swallowed so the destructor never throws. */ TPingableTransaction::~TPingableTransaction() +{ + try { + Stop(AbortOnTermination_ ? EStopAction::Abort : EStopAction::Detach); + } catch (...) { + } +} + +const TTransactionId TPingableTransaction::GetId() const +{ + return TransactionId_; +} + +/* Returns the pair (MinPingInterval_, MaxPingInterval_). */ const std::pair<TDuration, TDuration> TPingableTransaction::GetPingInterval() const { + return {MinPingInterval_, MaxPingInterval_}; +} + +const TClientContext TPingableTransaction::GetContext() const { + return Context_; +} + +void TPingableTransaction::Commit() +{ + Stop(EStopAction::Commit); +} + +void TPingableTransaction::Abort() +{ + Stop(EStopAction::Abort); +} + +void TPingableTransaction::Detach() +{ + Stop(EStopAction::Detach); +} + +/* Finishes the transaction with the requested action; does nothing once Finalized_ is set. */ void TPingableTransaction::Stop(EStopAction action) +{ + if (Finalized_) { + return; + } + + /* Deferred block: marks the transaction finalized and unregisters it from the pinger if it is still registered there. */ Y_DEFER { + Finalized_ = true; + if (AutoPingable_ && Pinger_->HasTransaction(*this)) { + Pinger_->RemoveTransaction(*this); + } + }; + + switch (action) { + case EStopAction::Commit: + NDetail::NRawClient::CommitTransaction( + ClientRetryPolicy_->CreatePolicyForGenericRequest(), + Context_, + TransactionId_); + break; + case EStopAction::Abort: + NDetail::NRawClient::AbortTransaction( + ClientRetryPolicy_->CreatePolicyForGenericRequest(), + Context_, + TransactionId_); + break; + case EStopAction::Detach: + // Do nothing. 
+ break; + } + + /* Reached only when the commit or abort request above did not throw. */ AbortableRegistry_->Remove(TransactionId_); +} + +//////////////////////////////////////////////////////////////////////////////// + +/* Takes a snapshot lock on path inside transactionId and returns the locked node addressed by id, i.e. "#<node_id>". */ TYPath Snapshot( + const IClientRetryPolicyPtr& clientRetryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path) +{ + auto lockId = NDetail::NRawClient::Lock( + clientRetryPolicy->CreatePolicyForGenericRequest(), + context, + transactionId, + path, + ELockMode::LM_SNAPSHOT); + auto lockedNodeId = NDetail::NRawClient::Get( + clientRetryPolicy->CreatePolicyForGenericRequest(), + context, + transactionId, + ::TStringBuilder() << '#' << GetGuidAsString(lockId) << "/@node_id"); + return "#" + lockedNodeId.AsString(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/transaction.h b/yt/cpp/mapreduce/client/transaction.h new file mode 100644 index 0000000000..559fca619e --- /dev/null +++ b/yt/cpp/mapreduce/client/transaction.h @@ -0,0 +1,95 @@ +#pragma once + +#include "abortable_registry.h" + +#include <yt/cpp/mapreduce/http/requests.h> +#include <yt/cpp/mapreduce/http/retry_request.h> + +#include <util/datetime/base.h> +#include <util/generic/maybe.h> +#include <util/generic/ptr.h> +#include <util/system/thread.h> + +#include <atomic> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/* Owns one transaction: the constructors start or attach to it, and Commit, Abort, Detach or the destructor finish it. */ class TPingableTransaction +{ +public: + // + // Start a new transaction. + TPingableTransaction( + const IClientRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& parentId, + ITransactionPingerPtr transactionPinger, + const TStartTransactionOptions& options); + + // + // Attach to an existing transaction. 
+ TPingableTransaction( + const IClientRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + ITransactionPingerPtr transactionPinger, + const TAttachTransactionOptions& options); + + ~TPingableTransaction(); + + const TTransactionId GetId() const; + + /* Returns (min, max) ping interval as computed in Init. */ const std::pair<TDuration, TDuration> GetPingInterval() const; + const TClientContext GetContext() const; + + void Commit(); + void Abort(); + void Detach(); + + +private: + enum class EStopAction + { + Detach, + Abort, + Commit, + }; + +private: + IClientRetryPolicyPtr ClientRetryPolicy_; + TClientContext Context_; + TTransactionId TransactionId_; + TDuration MinPingInterval_; + TDuration MaxPingInterval_; + + // We have to own an IntrusivePtr to registry to prevent use-after-free. + ::TIntrusivePtr<NDetail::TAbortableRegistry> AbortableRegistry_; + + bool AbortOnTermination_; + + bool AutoPingable_; + /* Set by Stop(); later Stop() calls return immediately. */ bool Finalized_ = false; + ITransactionPingerPtr Pinger_; + +private: + void Init( + const TClientContext& context, + const TTransactionId& transactionId, + TDuration timeout); + + void Stop(EStopAction action); +}; + +//////////////////////////////////////////////////////////////////////////////// + +TYPath Snapshot( + const IClientRetryPolicyPtr& clientRetryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/transaction_pinger.cpp b/yt/cpp/mapreduce/client/transaction_pinger.cpp new file mode 100644 index 0000000000..2b51e47f9f --- /dev/null +++ b/yt/cpp/mapreduce/client/transaction_pinger.cpp @@ -0,0 +1,321 @@ +#include "transaction_pinger.h" + +#include "transaction.h" + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/error_codes.h> +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <yt/cpp/mapreduce/common/wait_proxy.h> 
+#include <yt/cpp/mapreduce/common/retry_lib.h> + +#include <yt/cpp/mapreduce/http/requests.h> +#include <yt/cpp/mapreduce/http/retry_request.h> + +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> + +#if defined(__x86_64__) || defined(__arm64__) + #include <yt/yt/core/concurrency/periodic_executor.h> + #include <yt/yt/core/concurrency/poller.h> + #include <yt/yt/core/concurrency/scheduler_api.h> + #include <yt/yt/core/concurrency/thread_pool_poller.h> + #include <yt/yt/core/concurrency/thread_pool.h> + + #include <yt/yt/core/http/client.h> + #include <yt/yt/core/http/http.h> +#endif // defined(__x86_64__) || defined(__arm64__) + +#include <library/cpp/yson/node/node_io.h> + +#include <library/cpp/yt/threading/spin_lock.h> +#include <library/cpp/yt/assert/assert.h> + +#include <util/datetime/base.h> +#include <util/random/random.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__x86_64__) || defined(__arm64__) + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +void CheckError(const TString& requestId, NHttp::IResponsePtr response) +{ + TErrorResponse errorResponse(static_cast<int>(response->GetStatusCode()), requestId); + + if (const auto* ytError = response->GetHeaders()->Find("X-YT-Error")) { + errorResponse.ParseFromJsonError(*ytError); + } + if (errorResponse.IsOk()) { + return; + } + + YT_LOG_ERROR("RSP %v - HTTP %v - %v", + requestId, + response->GetStatusCode(), + errorResponse.AsStrBuf()); + + ythrow errorResponse; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace + +void PingTx(NHttp::IClientPtr httpClient, const TPingableTransaction& tx) +{ + auto url = TString::Join("http://", tx.GetContext().ServerName, "/api/", tx.GetContext().Config->ApiVersion, "/ping_tx"); + auto headers = New<NHttp::THeaders>(); + auto requestId = CreateGuidAsString(); + + headers->Add("Host", url); + 
headers->Add("User-Agent", TProcessState::Get()->ClientVersion); + + const auto& token = tx.GetContext().Token; + if (!token.empty()) { + headers->Add("Authorization", "OAuth " + token); + } + + headers->Add("Transfer-Encoding", "chunked"); + headers->Add("X-YT-Correlation-Id", requestId); + headers->Add("X-YT-Header-Format", "<format=text>yson"); + headers->Add("Content-Encoding", "identity"); + headers->Add("Accept-Encoding", "identity"); + + /* The request parameters travel in the POST body as a YSON map holding the transaction id. */ TNode node; + node["transaction_id"] = GetGuidAsString(tx.GetId()); + auto strParams = NodeToYsonString(node); + + YT_LOG_DEBUG("REQ %v - sending request (HostName: %v; Method POST %v; X-YT-Parameters (sent in body): %v)", + requestId, + tx.GetContext().ServerName, + url, + strParams + ); + + auto response = NConcurrency::WaitFor(httpClient->Post(url, TSharedRef::FromString(strParams), headers)).ValueOrThrow(); + CheckError(requestId, response); + + YT_LOG_DEBUG("RSP %v - received response %v bytes. (%v)", + requestId, + response->ReadAll().size(), + strParams); +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +/* Pinger shared by many transactions: every registered transaction gets its own periodic executor running on one common thread pool. */ class TSharedTransactionPinger + : public ITransactionPinger +{ +public: + TSharedTransactionPinger(NHttp::IClientPtr httpClient, int poolThreadCount) + : PingerPool_(NConcurrency::CreateThreadPool( + poolThreadCount, "tx_pinger_pool")) + , HttpClient_(std::move(httpClient)) + { } + + ~TSharedTransactionPinger() override + { + PingerPool_->Shutdown(); + } + + /* Child transactions reuse this same pinger instance. */ ITransactionPingerPtr GetChildTxPinger() override + { + return this; + } + + void RegisterTransaction(const TPingableTransaction& pingableTx) override + { + auto [minPingInterval, maxPingInterval] = pingableTx.GetPingInterval(); + /* The period is the midpoint of the interval; jitter is the half-width of the interval relative to that midpoint. */ auto pingInterval = (minPingInterval + maxPingInterval) / 2; + double jitter = (maxPingInterval - pingInterval) / pingInterval; + + auto opts = NConcurrency::TPeriodicExecutorOptions{pingInterval, pingInterval, jitter}; + auto periodic = 
std::make_shared<NConcurrency::TPeriodicExecutorPtr>(nullptr); + // Have to use weak_ptr in order to break reference cycle + // This weak_ptr holds pointer to periodic, which will contain this lambda + // Also we consider that lifetime of this lambda is no longer than lifetime of pingableTx + // because every pingableTx have to call RemoveTransaction before it is destroyed + auto pingRoutine = BIND([this, &pingableTx, periodic = std::weak_ptr{periodic}] { + auto strong_ptr = periodic.lock(); + YT_VERIFY(strong_ptr); + DoPingTransaction(pingableTx, *strong_ptr); + }); + *periodic = New<NConcurrency::TPeriodicExecutor>(PingerPool_->GetInvoker(), pingRoutine, opts); + (*periodic)->Start(); + + /* Registering the same transaction id twice is a programming error. */ auto guard = Guard(SpinLock_); + YT_VERIFY(!Transactions_.contains(pingableTx.GetId())); + Transactions_[pingableTx.GetId()] = std::move(periodic); + } + + bool HasTransaction(const TPingableTransaction& pingableTx) override + { + auto guard = Guard(SpinLock_); + return Transactions_.contains(pingableTx.GetId()); + } + + + void RemoveTransaction(const TPingableTransaction& pingableTx) override + { + /* Take the executor out of the map under the lock, then wait for it to stop after the lock is released. */ std::shared_ptr<NConcurrency::TPeriodicExecutorPtr> periodic; + { + auto guard = Guard(SpinLock_); + + auto it = Transactions_.find(pingableTx.GetId()); + + YT_VERIFY(it != Transactions_.end()); + + periodic = std::move(it->second); + Transactions_.erase(it); + } + NConcurrency::WaitUntilSet((*periodic)->Stop()); + } + +private: + /* One ping attempt. NoSuchTransaction stops the executor, Timeout asks it for an out-of-band run, any other exception is ignored. */ void DoPingTransaction(const TPingableTransaction& pingableTx, + NConcurrency::TPeriodicExecutorPtr periodic) + { + try { + PingTx(HttpClient_, pingableTx); + } catch (const std::exception& e) { + if (auto* errorResponse = dynamic_cast<const TErrorResponse*>(&e)) { + if (errorResponse->GetError().ContainsErrorCode(NYT::NClusterErrorCodes::NTransactionClient::NoSuchTransaction)) { + YT_UNUSED_FUTURE(periodic->Stop()); + } else if (errorResponse->GetError().ContainsErrorCode(NYT::NClusterErrorCodes::Timeout)) { + periodic->ScheduleOutOfBand(); + } + } + } + } 
+ + +private: + YT_DECLARE_SPIN_LOCK(NThreading::TSpinLock, SpinLock_); + /* Executors of the registered transactions keyed by transaction id; accessed under SpinLock_. */ THashMap<TTransactionId, std::shared_ptr<NConcurrency::TPeriodicExecutorPtr>> Transactions_; + + NConcurrency::IThreadPoolPtr PingerPool_; + NHttp::IClientPtr HttpClient_; +}; + +#endif // defined(__x86_64__) || defined(__arm64__) + +//////////////////////////////////////////////////////////////////////////////// + +/* Pinger that serves exactly one transaction with a dedicated thread. */ class TThreadPerTransactionPinger + : public ITransactionPinger +{ +public: + ~TThreadPerTransactionPinger() override + { + if (Running_) { + RemoveTransaction(*PingableTx_); + } + } + + /* Every child transaction gets its own fresh pinger. */ ITransactionPingerPtr GetChildTxPinger() override + { + return MakeIntrusive<TThreadPerTransactionPinger>(); + } + + void RegisterTransaction(const TPingableTransaction& pingableTx) override + { + /* Only one registration is allowed over the lifetime of this object: PingableTx_ is never reset. */ YT_VERIFY(!Running_); + YT_VERIFY(PingableTx_ == nullptr); + + PingableTx_ = &pingableTx; + Running_ = true; + + PingerThread_ = MakeHolder<TThread>( + TThread::TParams{Pinger, this}.SetName("pingable_tx")); + PingerThread_->Start(); + } + + bool HasTransaction(const TPingableTransaction& pingableTx) override + { + return PingableTx_ == &pingableTx && Running_; + } + + void RemoveTransaction(const TPingableTransaction& pingableTx) override + { + YT_VERIFY(HasTransaction(pingableTx)); + + /* Clearing Running_ makes the pinger loop exit; Join waits for that. */ Running_ = false; + if (PingerThread_) { + PingerThread_->Join(); + } + } + +private: + /* TThread entry point; opaque is the pinger instance. */ static void* Pinger(void* opaque) + { + static_cast<TThreadPerTransactionPinger*>(opaque)->Pinger(); + return nullptr; + } + + /* Thread body: ping once without retries, then wait a random time within the ping interval, until Running_ is cleared or the transaction is gone. */ void Pinger() + { + auto [minPingInterval, maxPingInterval] = PingableTx_->GetPingInterval(); + while (Running_) { + TDuration waitTime = minPingInterval + (maxPingInterval - minPingInterval) * RandomNumber<float>(); + try { + auto noRetryPolicy = MakeIntrusive<TAttemptLimitedRetryPolicy>(1u, PingableTx_->GetContext().Config); + NDetail::NRawClient::PingTx(noRetryPolicy, PingableTx_->GetContext(), PingableTx_->GetId()); + } catch (const std::exception& e) { + if (auto* errorResponse = dynamic_cast<const 
TErrorResponse*>(&e)) { + if (errorResponse->GetError().ContainsErrorCode(NYT::NClusterErrorCodes::NTransactionClient::NoSuchTransaction)) { + break; + } else if (errorResponse->GetError().ContainsErrorCode(NYT::NClusterErrorCodes::Timeout)) { + waitTime = TDuration::MilliSeconds(0); + } + } + // Else do nothing, going to retry this error. + } + + /* Sleep in 100 ms slices so that a cleared Running_ flag is noticed without waiting out the whole interval. */ TInstant t = Now(); + while (Running_ && Now() - t < waitTime) { + NDetail::TWaitProxy::Get()->Sleep(TDuration::MilliSeconds(100)); + } + } + } + +private: + const TPingableTransaction* PingableTx_ = nullptr; + + std::atomic<bool> Running_ = false; + THolder<TThread> PingerThread_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/* Picks the pinger implementation: the shared async one when config->UseAsyncTxPinger is set and the platform supports it, otherwise the thread-per-transaction one. */ ITransactionPingerPtr CreateTransactionPinger(const TConfigPtr& config) +{ + if (config->UseAsyncTxPinger) { +// TODO(aleexfi): Remove it after YT-17689 +#if defined(__x86_64__) || defined(__arm64__) + YT_LOG_DEBUG("Using async transaction pinger"); + auto httpClientConfig = NYT::New<NHttp::TClientConfig>(); + httpClientConfig->MaxIdleConnections = 16; + auto httpPoller = NConcurrency::CreateThreadPoolPoller( + config->AsyncHttpClientThreads, + "tx_http_client_poller"); + auto httpClient = NHttp::CreateClient(std::move(httpClientConfig), std::move(httpPoller)); + + return MakeIntrusive<TSharedTransactionPinger>( + std::move(httpClient), + config->AsyncTxPingerPoolThreads); +#else + YT_LOG_WARNING("Async transaction pinger is not supported on your platform. 
Fallback to TThreadPerTransactionPinger..."); +#endif // defined(__x86_64__) || defined(__arm64__) + } + return MakeIntrusive<TThreadPerTransactionPinger>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/transaction_pinger.h b/yt/cpp/mapreduce/client/transaction_pinger.h new file mode 100644 index 0000000000..98e8b5cb2f --- /dev/null +++ b/yt/cpp/mapreduce/client/transaction_pinger.h @@ -0,0 +1,39 @@ +#pragma once + +#include <yt/cpp/mapreduce/common/fwd.h> + +#include <yt/cpp/mapreduce/http/requests.h> + +#include <util/generic/ptr.h> +#include <util/system/thread.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class TPingableTransaction; + +//////////////////////////////////////////////////////////////////////////////// + +// Each registered transaction must be removed from pinger +// (using RemoveTransaction) before it is destroyed +class ITransactionPinger + : public TThrRefBase +{ +public: + virtual ~ITransactionPinger() = default; + + virtual ITransactionPingerPtr GetChildTxPinger() = 0; + + virtual void RegisterTransaction(const TPingableTransaction& pingableTx) = 0; + + virtual bool HasTransaction(const TPingableTransaction& pingableTx) = 0; + + virtual void RemoveTransaction(const TPingableTransaction& pingableTx) = 0; +}; + +ITransactionPingerPtr CreateTransactionPinger(const TConfigPtr& config); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/ya.make b/yt/cpp/mapreduce/client/ya.make new file mode 100644 index 0000000000..a1b3b4da69 --- /dev/null +++ b/yt/cpp/mapreduce/client/ya.make @@ -0,0 +1,75 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + abortable_registry.cpp + batch_request_impl.cpp + client_reader.cpp + client_writer.cpp + client.cpp + file_reader.cpp + file_writer.cpp 
+ format_hints.cpp + init.cpp + lock.cpp + operation_helpers.cpp + operation_preparer.cpp + operation_tracker.cpp + operation.cpp + prepare_operation.cpp + py_helpers.cpp + retry_heavy_write_request.cpp + retryful_writer.cpp + retryless_writer.cpp + skiff.cpp + structured_table_formats.cpp + transaction.cpp + transaction_pinger.cpp + yt_poller.cpp +) + +PEERDIR( + library/cpp/digest/md5 + library/cpp/sighandler + library/cpp/threading/blocking_queue + library/cpp/threading/future + library/cpp/type_info + library/cpp/yson + yt/cpp/mapreduce/common + yt/cpp/mapreduce/http + yt/cpp/mapreduce/interface + yt/cpp/mapreduce/io + yt/cpp/mapreduce/library/table_schema + yt/cpp/mapreduce/raw_client +) + +IF (ARCH_X86_64 OR OS_DARWIN) + PEERDIR( + yt/yt/core + yt/yt/core/http + ) +ELSE() + # Suppress yamaker's WBadIncl error on exotic platforms + PEERDIR( + yt/yt_proto/yt/core + ) +ENDIF() + +IF (BUILD_TYPE == "PROFILE") + PEERDIR( + yt/yt/library/ytprof + ) + + SRCS( + job_profiler.cpp + ) +ELSE() + SRCS( + dummy_job_profiler.cpp + ) +ENDIF() + +GENERATE_ENUM_SERIALIZATION(structured_table_formats.h) + +END() diff --git a/yt/cpp/mapreduce/client/yt_poller.cpp b/yt/cpp/mapreduce/client/yt_poller.cpp new file mode 100644 index 0000000000..e0bea1690e --- /dev/null +++ b/yt/cpp/mapreduce/client/yt_poller.cpp @@ -0,0 +1,132 @@ +#include "yt_poller.h" + +#include <yt/cpp/mapreduce/raw_client/raw_batch_request.h> +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> + +#include <yt/cpp/mapreduce/common/debug_metrics.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> +#include <yt/cpp/mapreduce/common/wait_proxy.h> + +#include <yt/cpp/mapreduce/http/retry_request.h> + +#include <yt/cpp/mapreduce/interface/config.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +namespace NYT { +namespace NDetail { + +using namespace NRawClient; + +//////////////////////////////////////////////////////////////////////////////// + +TYtPoller::TYtPoller( + TClientContext context, + 
const IClientRetryPolicyPtr& retryPolicy) + : Context_(std::move(context)) + , ClientRetryPolicy_(retryPolicy) + , WaiterThread_(&TYtPoller::WatchLoopProc, this) +{ + WaiterThread_.Start(); +} + +TYtPoller::~TYtPoller() +{ + Stop(); +} + +/* Queues an item for polling and wakes the watch loop. */ void TYtPoller::Watch(IYtPollerItemPtr item) +{ + auto g = Guard(Lock_); + Pending_.emplace_back(std::move(item)); + HasData_.Signal(); +} + + +/* Idempotent: the first call clears IsRunning_, wakes the watch loop and joins its thread; later calls return at once. */ void TYtPoller::Stop() +{ + { + auto g = Guard(Lock_); + if (!IsRunning_) { + return; + } + IsRunning_ = false; + HasData_.Signal(); + } + WaiterThread_.Join(); +} + +/* Calls OnItemDiscarded on every pending and every in-progress item. */ void TYtPoller::DiscardQueuedItems() +{ + for (auto& item : Pending_) { + item->OnItemDiscarded(); + } + for (auto& item : InProgress_) { + item->OnItemDiscarded(); + } +} + +/* Poller thread body: while running, issues one batch request on behalf of all in-progress items per WaitLockPollInterval and drops the items that answer PollBreak. */ void TYtPoller::WatchLoop() +{ + TInstant nextRequest = TInstant::Zero(); + while (true) { + { + auto g = Guard(Lock_); + /* Re-test the predicate in a loop: a condition variable may wake spuriously, and proceeding with both queues empty would trip the Y_VERIFY below. */ while (IsRunning_ && Pending_.empty() && InProgress_.empty()) { + TWaitProxy::Get()->WaitCondVar(HasData_, Lock_); + } + + if (!IsRunning_) { + DiscardQueuedItems(); + return; + } + + { + auto ug = Unguard(Lock_); // allow adding new items into Pending_ + TWaitProxy::Get()->SleepUntil(nextRequest); + nextRequest = TInstant::Now() + Context_.Config->WaitLockPollInterval; + } + if (!Pending_.empty()) { + InProgress_.splice(InProgress_.end(), Pending_); + } + Y_VERIFY(!InProgress_.empty()); + } + + TRawBatchRequest rawBatchRequest(Context_.Config); + + for (auto& item : InProgress_) { + item->PrepareRequest(&rawBatchRequest); + } + + /* A failed batch is only logged; the items still get their OnRequestExecuted callback below. */ try { + ExecuteBatch(ClientRetryPolicy_->CreatePolicyForGenericRequest(), Context_, rawBatchRequest); + } catch (const std::exception& ex) { + YT_LOG_ERROR("Exception while executing batch request: %v", ex.what()); + } + + for (auto it = InProgress_.begin(); it != InProgress_.end();) { + auto& item = *it; + + IYtPollerItem::EStatus status = item->OnRequestExecuted(); + + if (status == IYtPollerItem::PollBreak) { + it = InProgress_.erase(it); + } else { + ++it; + } + } + + 
IncDebugMetric(TStringBuf("yt_poller_top_loop_repeat_count")); + } +} + +/* TThread entry point; data is the TYtPoller instance. */ void* TYtPoller::WatchLoopProc(void* data) +{ + static_cast<TYtPoller*>(data)->WatchLoop(); + return nullptr; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/client/yt_poller.h b/yt/cpp/mapreduce/client/yt_poller.h new file mode 100644 index 0000000000..4f4e9eb7ab --- /dev/null +++ b/yt/cpp/mapreduce/client/yt_poller.h @@ -0,0 +1,86 @@ +#pragma once + +#include <yt/cpp/mapreduce/common/fwd.h> + +#include <yt/cpp/mapreduce/http/context.h> +#include <yt/cpp/mapreduce/http/requests.h> + +#include <yt/cpp/mapreduce/interface/client.h> + +#include <util/generic/list.h> +#include <util/system/mutex.h> +#include <util/system/thread.h> +#include <util/system/condvar.h> + +namespace NYT { +namespace NDetail { + +namespace NRawClient { + class TRawBatchRequest; +} + +//////////////////////////////////////////////////////////////////////////////// + +class IYtPollerItem + : public TThrRefBase +{ +public: + enum EStatus { + PollContinue, + PollBreak, + }; + +public: + virtual ~IYtPollerItem() = default; + + virtual void PrepareRequest(NRawClient::TRawBatchRequest* batchRequest) = 0; + + // Should return PollContinue if poller should continue polling this item. + // Should return PollBreak if poller should stop polling this item. 
+ virtual EStatus OnRequestExecuted() = 0; + + virtual void OnItemDiscarded() = 0; + +}; +using IYtPollerItemPtr = ::TIntrusivePtr<IYtPollerItem>; + +//////////////////////////////////////////////////////////////////////////////// + +/* Background poller: a dedicated thread repeatedly issues a batch request on behalf of all watched items. */ class TYtPoller + : public TThrRefBase +{ +public: + TYtPoller(TClientContext context, const IClientRetryPolicyPtr& retryPolicy); + ~TYtPoller(); + + void Watch(IYtPollerItemPtr item); + + void Stop(); + +private: + void DiscardQueuedItems(); + + void WatchLoop(); + static void* WatchLoopProc(void*); + +private: + struct TItem; + + const TClientContext Context_; + const IClientRetryPolicyPtr ClientRetryPolicy_; + + + /* Items currently polled by the watch loop, and items queued by Watch() that the loop has not picked up yet. */ TList<IYtPollerItemPtr> InProgress_; + TList<IYtPollerItemPtr> Pending_; + + TThread WaiterThread_; + TMutex Lock_; + TCondVar HasData_; + + bool IsRunning_ = true; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/common/debug_metrics.cpp b/yt/cpp/mapreduce/common/debug_metrics.cpp new file mode 100644 index 0000000000..6235e55f7e --- /dev/null +++ b/yt/cpp/mapreduce/common/debug_metrics.cpp @@ -0,0 +1,62 @@ +#include "debug_metrics.h" + +#include <util/generic/hash.h> +#include <util/generic/singleton.h> + +#include <util/string/cast.h> +#include <util/system/mutex.h> + +namespace NYT { +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +/* Singleton table of named counters; every access takes Lock_. */ class TDebugMetrics { +public: + static TDebugMetrics& Get() + { + return *Singleton<TDebugMetrics>(); + } + + /* Creates the counter at zero on first use, then increments it. */ void Inc(TStringBuf name) + { + auto g = Guard(Lock_); + auto it = Metrics_.find(name); + if (it == Metrics_.end()) { + it = Metrics_.emplace(ToString(name), 0).first; + } + ++it->second; + } + + /* Counters that were never incremented read as zero. */ ui64 Get(TStringBuf name) const + { + auto g = Guard(Lock_); + auto it = Metrics_.find(name); + if (it == Metrics_.end()) { + return 0; + } else { + return it->second; + } + } + +private: + TMutex Lock_; + 
THashMap<TString, ui64> Metrics_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +void IncDebugMetricImpl(TStringBuf name) +{ + TDebugMetrics::Get().Inc(name); +} + +ui64 GetDebugMetric(TStringBuf name) +{ + return TDebugMetrics::Get().Get(name); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/common/debug_metrics.h b/yt/cpp/mapreduce/common/debug_metrics.h new file mode 100644 index 0000000000..6ebbc89f72 --- /dev/null +++ b/yt/cpp/mapreduce/common/debug_metrics.h @@ -0,0 +1,22 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/config.h> + +#include <util/generic/strbuf.h> + +namespace NYT { +namespace NDetail { + +void IncDebugMetricImpl(TStringBuf name); + +// Helper functions that allow tracking various events inside the YT library; useful for testing. +/* Increments the named counter only when TConfig::Get()->EnableDebugMetrics is set. */ inline void IncDebugMetric(TStringBuf name) +{ + if (TConfig::Get()->EnableDebugMetrics) { + IncDebugMetricImpl(name); + } +} +ui64 GetDebugMetric(TStringBuf name); + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/common/fwd.h b/yt/cpp/mapreduce/common/fwd.h new file mode 100644 index 0000000000..a195e727be --- /dev/null +++ b/yt/cpp/mapreduce/common/fwd.h @@ -0,0 +1,11 @@ +#pragma once + +#include <util/generic/fwd.h> + +namespace NYT { + class IRequestRetryPolicy; + using IRequestRetryPolicyPtr = ::TIntrusivePtr<IRequestRetryPolicy>; + + class IClientRetryPolicy; + using IClientRetryPolicyPtr = ::TIntrusivePtr<IClientRetryPolicy>; +} diff --git a/yt/cpp/mapreduce/common/helpers.cpp b/yt/cpp/mapreduce/common/helpers.cpp new file mode 100644 index 0000000000..95924d812c --- /dev/null +++ b/yt/cpp/mapreduce/common/helpers.cpp @@ -0,0 +1,126 @@ +#include "helpers.h" + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/serialize.h> +#include <yt/cpp/mapreduce/interface/fluent.h> + +#include 
<library/cpp/yson/node/node_builder.h> +#include <library/cpp/yson/node/node_visitor.h> + +#include <library/cpp/yson/parser.h> +#include <library/cpp/yson/writer.h> + +#include <library/cpp/json/json_reader.h> +#include <library/cpp/json/json_value.h> + +#include <util/stream/input.h> +#include <util/stream/output.h> +#include <util/stream/str.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +TString NodeListToYsonString(const TNode::TListType& nodes) +{ + TStringStream stream; + ::NYson::TYsonWriter writer(&stream, NYson::EYsonFormat::Binary, ::NYson::EYsonType::ListFragment); + auto list = BuildYsonListFluently(&writer); + for (const auto& node : nodes) { + list.Item().Value(node); + } + return stream.Str(); +} + +TNode PathToNode(const TRichYPath& path) +{ + TNode result; + TNodeBuilder builder(&result); + Serialize(path, &builder); + return result; +} + +TNode PathToParamNode(const TRichYPath& path) +{ + return TNode()("path", PathToNode(path)); +} + +TString AttributesToYsonString(const TNode& node) +{ + return BuildYsonStringFluently().BeginMap() + .Item("attributes").Value(node) + .EndMap(); +} + +TString AttributeFilterToYsonString(const TAttributeFilter& filter) +{ + return BuildYsonStringFluently().BeginMap() + .Item("attributes").Value(filter) + .EndMap(); +} + +TNode NodeFromTableSchema(const TTableSchema& schema) +{ + TNode result; + TNodeBuilder builder(&result); + Serialize(schema, &builder); + return result; +} + +void MergeNodes(TNode& dst, const TNode& src) +{ + if (dst.IsMap() && src.IsMap()) { + auto& dstMap = dst.AsMap(); + const auto& srcMap = src.AsMap(); + for (const auto& srcItem : srcMap) { + const auto& key = srcItem.first; + auto dstItem = dstMap.find(key); + if (dstItem != dstMap.end()) { + MergeNodes(dstItem->second, srcItem.second); + } else { + dstMap[key] = srcItem.second; + } + } + } else { + if (dst.GetType() == src.GetType() && src.HasAttributes()) { + auto attributes = 
dst.GetAttributes();
+            MergeNodes(attributes, src.GetAttributes());
+            dst = src;
+            dst.Attributes() = attributes;
+        } else {
+            dst = src;
+        }
+    }
+}
+
+// Returns |path| unchanged when it starts with "//" or "#";
+// otherwise returns |pathPrefix| concatenated with |path|.
+TYPath AddPathPrefix(const TYPath& path, const TString& pathPrefix)
+{
+    if (path.StartsWith("//") || path.StartsWith("#")) {
+        return path;
+    }
+    return pathPrefix + path;
+}
+
+// The four helpers below map an API version to an HTTP command name:
+// version "v2" uses the short names, any other version uses the long ones.
+
+// "write" for "v2", "write_table" otherwise.
+TString GetWriteTableCommand(const TString& apiVersion)
+{
+    return apiVersion == "v2" ? "write" : "write_table";
+}
+
+// "read" for "v2", "read_table" otherwise.
+TString GetReadTableCommand(const TString& apiVersion)
+{
+    return apiVersion == "v2" ? "read" : "read_table";
+}
+
+// "upload" for "v2", "write_file" otherwise.
+TString GetWriteFileCommand(const TString& apiVersion)
+{
+    return apiVersion == "v2" ? "upload" : "write_file";
+}
+
+// "download" for "v2", "read_file" otherwise.
+TString GetReadFileCommand(const TString& apiVersion)
+{
+    return apiVersion == "v2" ? "download" : "read_file";
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/common/helpers.h b/yt/cpp/mapreduce/common/helpers.h
new file mode 100644
index 0000000000..2174ba820b
--- /dev/null
+++ b/yt/cpp/mapreduce/common/helpers.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "fwd.h"
+
+#include <library/cpp/yson/node/node_io.h> // backward compatibility
+
+#include <yt/cpp/mapreduce/interface/node.h>
+#include <yt/cpp/mapreduce/interface/common.h>
+#include <library/cpp/yson/public.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+TString NodeListToYsonString(const TNode::TListType& nodes);
+
+TNode PathToNode(const TRichYPath& path);
+TNode PathToParamNode(const TRichYPath& path);
+
+TString AttributesToYsonString(const TNode& attributes);
+
+TString AttributeFilterToYsonString(const TAttributeFilter& filter);
+
+TNode NodeFromTableSchema(const TTableSchema& schema);
+
+void MergeNodes(TNode& dst, const TNode& src);
+
+// Leaves paths starting with "//" or "#" untouched, prepends |pathPrefix| to all others.
+TYPath AddPathPrefix(const TYPath& path, const TString& pathPrefix);
+
+// Command names depend on the API version: "v2" uses write/read/upload/download,
+// other versions use write_table/read_table/write_file/read_file.
+TString GetWriteTableCommand(const TString& apiVersion);
+TString GetReadTableCommand(const TString& apiVersion);
+TString GetWriteFileCommand(const TString& apiVersion);
+TString GetReadFileCommand(const TString& apiVersion);
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/common/node_builder.h b/yt/cpp/mapreduce/common/node_builder.h
new file mode 100644
index 0000000000..c7f731cf09
--- /dev/null
+++ b/yt/cpp/mapreduce/common/node_builder.h
@@ -0,0 +1,4 @@
+#pragma once
+
+// Backward compatibility.
+#include <library/cpp/yson/node/node_builder.h>
diff --git a/yt/cpp/mapreduce/common/node_visitor.h b/yt/cpp/mapreduce/common/node_visitor.h
new file mode 100644
index 0000000000..a8bde52b5a
--- /dev/null
+++ b/yt/cpp/mapreduce/common/node_visitor.h
@@ -0,0 +1,4 @@
+#pragma once
+
+// Backward compatibility.
+#include <library/cpp/yson/node/node_visitor.h>
diff --git a/yt/cpp/mapreduce/common/retry_lib.cpp b/yt/cpp/mapreduce/common/retry_lib.cpp
new file mode 100644
index 0000000000..cf2c021eb4
--- /dev/null
+++ b/yt/cpp/mapreduce/common/retry_lib.cpp
@@ -0,0 +1,267 @@
+#include "retry_lib.h"
+
+#include <yt/cpp/mapreduce/interface/config.h>
+#include <yt/cpp/mapreduce/interface/errors.h>
+#include <yt/cpp/mapreduce/interface/error_codes.h>
+#include <yt/cpp/mapreduce/interface/retry_policy.h>
+
+#include <util/string/builder.h>
+#include <util/generic/set.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Retry policy that allows at most |attemptLimit| attempts; backoff intervals are taken from |config|.
+TAttemptLimitedRetryPolicy::TAttemptLimitedRetryPolicy(ui32 attemptLimit, const TConfigPtr& config)
+    : Config_(config)
+    , AttemptLimit_(attemptLimit)
+{ }
+
+// Counts one more attempt.
+void TAttemptLimitedRetryPolicy::NotifyNewAttempt()
+{
+    ++Attempt_;
+}
+
+// Returns Nothing() once the attempt limit is reached, otherwise the backoff for a generic error.
+TMaybe<TDuration> TAttemptLimitedRetryPolicy::OnGenericError(const std::exception& e)
+{
+    if (IsAttemptLimitExceeded()) {
+        return Nothing();
+    }
+    return GetBackoffDuration(e, Config_);
+}
+
+// Returns Nothing() once the attempt limit is reached, otherwise the backoff chosen for this YT error.
+TMaybe<TDuration> TAttemptLimitedRetryPolicy::OnRetriableError(const TErrorResponse& e)
+{
+    if (IsAttemptLimitExceeded()) {
+        return Nothing();
+    }
+    return GetBackoffDuration(e, Config_);
+}
+
+// An ignored error gives the attempt back: the counter is decremented.
+// NOTE(review): the counter is unsigned; this presumably relies on NotifyNewAttempt()
+// having been called before -- confirm against callers.
+void TAttemptLimitedRetryPolicy::OnIgnoredError(const TErrorResponse& /*e*/)
+{
+    --Attempt_;
+}
+
+// Produces a string of the form "attempt <current> of <limit>".
+TString TAttemptLimitedRetryPolicy::GetAttemptDescription() const
+{
+    return ::TStringBuilder() << "attempt " << Attempt_ << " of " << AttemptLimit_;
+}
+
+// True when the number of counted attempts has reached the limit.
+bool TAttemptLimitedRetryPolicy::IsAttemptLimitExceeded() const
+{
+    return Attempt_ >= AttemptLimit_;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+// Decorator over another retry policy that additionally enforces a deadline.
+// The deadline is fixed at construction time as Now() + timeout.
+class TTimeLimitedRetryPolicy
+    : public IRequestRetryPolicy
+{
+public:
+    TTimeLimitedRetryPolicy(IRequestRetryPolicyPtr retryPolicy, TDuration timeout)
+        : RetryPolicy_(retryPolicy)
+        , Deadline_(TInstant::Now() + timeout)
+        , Timeout_(timeout)
+    { }
+    // Throws TRequestRetriesTimeout if the deadline has passed; otherwise forwards to the wrapped policy.
+    void NotifyNewAttempt() override
+    {
+        if (TInstant::Now() >= Deadline_) {
+            ythrow TRequestRetriesTimeout() << "retry timeout exceeded (timeout: " << Timeout_ << ")";
+        }
+        RetryPolicy_->NotifyNewAttempt();
+    }
+
+    // The remaining methods simply delegate to the wrapped policy.
+    TMaybe<TDuration> OnGenericError(const std::exception& e) override
+    {
+        return RetryPolicy_->OnGenericError(e);
+    }
+
+    TMaybe<TDuration> OnRetriableError(const TErrorResponse& e) override
+    {
+        return RetryPolicy_->OnRetriableError(e);
+    }
+
+    void OnIgnoredError(const TErrorResponse& e) override
+    {
+        return RetryPolicy_->OnIgnoredError(e);
+    }
+
+    TString GetAttemptDescription() const override
+    {
+        return RetryPolicy_->GetAttemptDescription();
+    }
+
+private:
+    const IRequestRetryPolicyPtr RetryPolicy_;
+    const TInstant Deadline_;
+    const TDuration Timeout_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Default IClientRetryPolicy: creates attempt-limited policies and wraps them
+// into a time-limited one when the retry config specifies a finite time limit.
+class TDefaultClientRetryPolicy
+    : public IClientRetryPolicy
+{
+public:
+    explicit TDefaultClientRetryPolicy(IRetryConfigProviderPtr retryConfigProvider, const TConfigPtr& config)
+        : RetryConfigProvider_(std::move(retryConfigProvider))
+        , Config_(config)
+    { }
+
+    // Attempt limit comes from Config_->RetryCount (see CreateDefaultRequestRetryPolicy).
+    IRequestRetryPolicyPtr CreatePolicyForGenericRequest() override
+    {
+        return Wrap(CreateDefaultRequestRetryPolicy(Config_));
+    }
+
+    // Attempt limit comes from Config_->StartOperationRetryCount.
+    IRequestRetryPolicyPtr CreatePolicyForStartOperationRequest() override
+    {
+        return Wrap(MakeIntrusive<TAttemptLimitedRetryPolicy>(static_cast<ui32>(Config_->StartOperationRetryCount), Config_));
+    }
+
+    // Requests a fresh retry config and, if RetriesTimeLimit is less than TDuration::Max(),
+    // wraps |basePolicy| into TTimeLimitedRetryPolicy; otherwise returns |basePolicy| as is.
+    IRequestRetryPolicyPtr Wrap(IRequestRetryPolicyPtr basePolicy)
+    {
+        auto config = RetryConfigProvider_->CreateRetryConfig();
+        if (config.RetriesTimeLimit < TDuration::Max()) {
+            return ::MakeIntrusive<TTimeLimitedRetryPolicy>(std::move(basePolicy), config.RetriesTimeLimit);
+        }
+        return basePolicy;
+    }
+
+private:
+    IRetryConfigProviderPtr RetryConfigProvider_;
+    const TConfigPtr Config_;
+};
+
+// Config provider that always returns a default-constructed TRetryConfig.
+class TDefaultRetryConfigProvider
+    : public IRetryConfigProvider
+{
+public:
+    TRetryConfig CreateRetryConfig() override
+    {
+        return {};
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Attempt-limited policy with the limit taken from config->RetryCount.
+IRequestRetryPolicyPtr CreateDefaultRequestRetryPolicy(const TConfigPtr& config)
+{
+    return MakeIntrusive<TAttemptLimitedRetryPolicy>(static_cast<ui32>(config->RetryCount), config);
+}
+
+IClientRetryPolicyPtr CreateDefaultClientRetryPolicy(IRetryConfigProviderPtr retryConfigProvider, const TConfigPtr& config)
+{
+    return MakeIntrusive<TDefaultClientRetryPolicy>(std::move(retryConfigProvider), config);
+}
+IRetryConfigProviderPtr CreateDefaultRetryConfigProvider()
+{
+    return MakeIntrusive<TDefaultRetryConfigProvider>();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Chunk errors are the codes in the range [700, 799].
+static bool IsChunkError(int code)
+{
+    return code / 100 == 7;
+}
+
+// Check whether:
+// 1) codes contain at least one chunk error AND
+// 2) codes don't contain non-retriable chunk errors.
+static bool IsRetriableChunkError(const TSet<int>& codes) +{ + using namespace NClusterErrorCodes; + auto isChunkError = false; + for (auto code : codes) { + switch (code) { + case NChunkClient::SessionAlreadyExists: + case NChunkClient::ChunkAlreadyExists: + case NChunkClient::WindowError: + case NChunkClient::BlockContentMismatch: + case NChunkClient::InvalidBlockChecksum: + case NChunkClient::BlockOutOfRange: + case NChunkClient::MissingExtension: + case NChunkClient::NoSuchBlock: + case NChunkClient::NoSuchChunk: + case NChunkClient::NoSuchChunkList: + case NChunkClient::NoSuchChunkTree: + case NChunkClient::NoSuchChunkView: + case NChunkClient::NoSuchMedium: + return false; + default: + isChunkError |= IsChunkError(code); + break; + } + } + return isChunkError; +} + +static TMaybe<TDuration> TryGetBackoffDuration(const TErrorResponse& errorResponse, const TConfigPtr& config) +{ + int httpCode = errorResponse.GetHttpCode(); + if (httpCode / 100 != 4 && !errorResponse.IsFromTrailers()) { + return config->RetryInterval; + } + + auto allCodes = errorResponse.GetError().GetAllErrorCodes(); + using namespace NClusterErrorCodes; + if (httpCode == 429 + || allCodes.count(NSecurityClient::RequestQueueSizeLimitExceeded) + || allCodes.count(NRpc::RequestQueueSizeLimitExceeded)) + { + // request rate limit exceeded + return config->RateLimitExceededRetryInterval; + } + if (errorResponse.IsConcurrentOperationsLimitReached()) { + // limit for the number of concurrent operations exceeded + return config->StartOperationRetryInterval; + } + if (IsRetriableChunkError(allCodes)) { + // chunk client errors + return config->ChunkErrorsRetryInterval; + } + for (auto code : TVector<int>{ + NRpc::TransportError, + NRpc::Unavailable, + NApi::RetriableArchiveError, + Canceled, + }) { + if (allCodes.contains(code)) { + return config->RetryInterval; + } + } + return Nothing(); +} + +TDuration GetBackoffDuration(const TErrorResponse& errorResponse, const TConfigPtr& config) +{ + return 
TryGetBackoffDuration(errorResponse, config).GetOrElse(config->RetryInterval); +} + +bool IsRetriable(const TErrorResponse& errorResponse) +{ + // Retriability of an error doesn't depend on config, so just use global one. + return TryGetBackoffDuration(errorResponse, TConfig::Get()).Defined(); +} + +bool IsRetriable(const std::exception& ex) +{ + if (dynamic_cast<const TRequestRetriesTimeout*>(&ex)) { + return false; + } + return true; +} + +TDuration GetBackoffDuration(const std::exception& /*error*/, const TConfigPtr& config) +{ + return GetBackoffDuration(config); +} + +TDuration GetBackoffDuration(const TConfigPtr& config) +{ + return config->RetryInterval; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/common/retry_lib.h b/yt/cpp/mapreduce/common/retry_lib.h new file mode 100644 index 0000000000..c6c061f614 --- /dev/null +++ b/yt/cpp/mapreduce/common/retry_lib.h @@ -0,0 +1,100 @@ +#pragma once + +#include "fwd.h" + +#include <yt/cpp/mapreduce/interface/fwd.h> + +#include <util/datetime/base.h> +#include <util/generic/maybe.h> +#include <util/generic/ptr.h> +#include <util/generic/string.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +// IRequestRetryPolicy class controls retries of single request. +class IRequestRetryPolicy + : public virtual TThrRefBase +{ +public: + // Helper function that returns text description of current attempt, e.g. + // "attempt 3 / 10" + // used in logs. + virtual TString GetAttemptDescription() const = 0; + + // Library code calls this function before any request attempt. + virtual void NotifyNewAttempt() = 0; + + // OnRetriableError is called whenever client gets YT error that can be retried (e.g. operation limit exceeded). + // OnGenericError is called whenever request failed due to generic error like network error. 
+ // + // Both methods must return nothing if policy doesn't want to retry this error. + // Otherwise method should return backoff time. + virtual TMaybe<TDuration> OnRetriableError(const TErrorResponse& e) = 0; + virtual TMaybe<TDuration> OnGenericError(const std::exception& e) = 0; + + // OnIgnoredError is called whenever client gets an error but is going to ignore it. + virtual void OnIgnoredError(const TErrorResponse& /*e*/) = 0; +}; +using IRequestRetryPolicyPtr = ::TIntrusivePtr<IRequestRetryPolicy>; + +//////////////////////////////////////////////////////////////////////////////// + +// IClientRetryPolicy controls creation of policies for individual requests. +class IClientRetryPolicy + : public virtual TThrRefBase +{ +public: + virtual IRequestRetryPolicyPtr CreatePolicyForGenericRequest() = 0; + virtual IRequestRetryPolicyPtr CreatePolicyForStartOperationRequest() = 0; +}; + + +//////////////////////////////////////////////////////////////////////////////// + +class TAttemptLimitedRetryPolicy + : public IRequestRetryPolicy +{ +public: + explicit TAttemptLimitedRetryPolicy(ui32 attemptLimit, const TConfigPtr& config); + + void NotifyNewAttempt() override; + + TMaybe<TDuration> OnGenericError(const std::exception& e) override; + TMaybe<TDuration> OnRetriableError(const TErrorResponse& e) override; + void OnIgnoredError(const TErrorResponse& e) override; + TString GetAttemptDescription() const override; + + bool IsAttemptLimitExceeded() const; + +protected: + const TConfigPtr Config_; + +private: + const ui32 AttemptLimit_; + ui32 Attempt_ = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +IRequestRetryPolicyPtr CreateDefaultRequestRetryPolicy(const TConfigPtr& config); +IClientRetryPolicyPtr CreateDefaultClientRetryPolicy(IRetryConfigProviderPtr retryConfigProvider, const TConfigPtr& config); +IRetryConfigProviderPtr CreateDefaultRetryConfigProvider(); + 
+//////////////////////////////////////////////////////////////////////////////// + +// Check if error returned by YT can be retried +bool IsRetriable(const TErrorResponse& errorResponse); +bool IsRetriable(const std::exception& ex); + +// Get backoff duration for errors returned by YT. +TDuration GetBackoffDuration(const TErrorResponse& errorResponse, const TConfigPtr& config); + +// Get backoff duration for errors that are not TErrorResponse. +TDuration GetBackoffDuration(const std::exception& error, const TConfigPtr& config); +TDuration GetBackoffDuration(const TConfigPtr& config); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/common/wait_proxy.cpp b/yt/cpp/mapreduce/common/wait_proxy.cpp new file mode 100644 index 0000000000..3db034a098 --- /dev/null +++ b/yt/cpp/mapreduce/common/wait_proxy.cpp @@ -0,0 +1,118 @@ +#include "wait_proxy.h" + + +#include <library/cpp/threading/future/future.h> + +#include <util/system/event.h> +#include <util/system/condvar.h> + +namespace NYT { +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +bool TDefaultWaitProxy::WaitFuture(const NThreading::TFuture<void>& future, TDuration timeout) +{ + return future.Wait(timeout); +} + +bool TDefaultWaitProxy::WaitEvent(TSystemEvent& event, TDuration timeout) +{ + return event.WaitT(timeout); +} + +bool TDefaultWaitProxy::WaitCondVar(TCondVar &condVar, TMutex &mutex, TDuration timeout) +{ + return condVar.WaitT(mutex, timeout); +} + +void TDefaultWaitProxy::Sleep(TDuration timeout) +{ + ::Sleep(timeout); +} + +//////////////////////////////////////////////////////////////////////////////// + +TWaitProxy::TWaitProxy() + : Proxy_(::MakeIntrusive<TDefaultWaitProxy>()) +{ } + +TWaitProxy* TWaitProxy::Get() +{ + return Singleton<TWaitProxy>(); +} + +void TWaitProxy::SetProxy(::TIntrusivePtr<IWaitProxy> proxy) +{ + Proxy_ = std::move(proxy); +} + +bool 
TWaitProxy::WaitFuture(const NThreading::TFuture<void>& future) +{ + return Proxy_->WaitFuture(future, TDuration::Max()); +} + +bool TWaitProxy::WaitFuture(const NThreading::TFuture<void>& future, TInstant deadLine) +{ + return Proxy_->WaitFuture(future, deadLine - TInstant::Now()); +} + +bool TWaitProxy::WaitFuture(const NThreading::TFuture<void>& future, TDuration timeout) +{ + return Proxy_->WaitFuture(future, timeout); +} + +bool TWaitProxy::WaitEventD(TSystemEvent& event, TInstant deadLine) +{ + return Proxy_->WaitEvent(event, deadLine - TInstant::Now()); +} + +bool TWaitProxy::WaitEventT(TSystemEvent& event, TDuration timeout) +{ + return Proxy_->WaitEvent(event, timeout); +} + +void TWaitProxy::WaitEventI(TSystemEvent& event) +{ + Proxy_->WaitEvent(event, TDuration::Max()); +} + +bool TWaitProxy::WaitEvent(TSystemEvent& event) +{ + return Proxy_->WaitEvent(event, TDuration::Max()); +} + +bool TWaitProxy::WaitCondVarD(TCondVar& condVar, TMutex& m, TInstant deadLine) +{ + return Proxy_->WaitCondVar(condVar, m, deadLine - TInstant::Now()); +} + +bool TWaitProxy::WaitCondVarT(TCondVar& condVar, TMutex& m, TDuration timeOut) +{ + return Proxy_->WaitCondVar(condVar, m, timeOut); +} + +void TWaitProxy::WaitCondVarI(TCondVar& condVar, TMutex& m) +{ + Proxy_->WaitCondVar(condVar, m, TDuration::Max()); +} + +void TWaitProxy::WaitCondVar(TCondVar& condVar, TMutex& m) +{ + Proxy_->WaitCondVar(condVar, m, TDuration::Max()); +} + +void TWaitProxy::Sleep(TDuration timeout) +{ + Proxy_->Sleep(timeout); +} + +void TWaitProxy::SleepUntil(TInstant instant) +{ + Proxy_->Sleep(instant - TInstant::Now()); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/common/wait_proxy.h b/yt/cpp/mapreduce/common/wait_proxy.h new file mode 100644 index 0000000000..e7c944cf24 --- /dev/null +++ b/yt/cpp/mapreduce/common/wait_proxy.h @@ -0,0 +1,53 @@ +#pragma once + +#include 
<yt/cpp/mapreduce/interface/wait_proxy.h> + +namespace NYT { +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +class TDefaultWaitProxy + : public IWaitProxy +{ +public: + bool WaitFuture(const ::NThreading::TFuture<void>& future, TDuration timeout) override; + bool WaitEvent(TSystemEvent& event, TDuration timeout) override; + bool WaitCondVar(TCondVar& condVar, TMutex& mutex, TDuration timeout) override; + void Sleep(TDuration timeout) override; +}; + +class TWaitProxy { +public: + TWaitProxy(); + + static TWaitProxy* Get(); + + // NB: Non thread-safe, should be called only in initialization code. + void SetProxy(::TIntrusivePtr<IWaitProxy> proxy); + + bool WaitFuture(const ::NThreading::TFuture<void>& future); + bool WaitFuture(const ::NThreading::TFuture<void>& future, TInstant deadLine); + bool WaitFuture(const ::NThreading::TFuture<void>& future, TDuration timeout); + + bool WaitEventD(TSystemEvent& event, TInstant deadLine); + bool WaitEventT(TSystemEvent& event, TDuration timeout); + void WaitEventI(TSystemEvent& event); + bool WaitEvent(TSystemEvent& event); + + bool WaitCondVarD(TCondVar& condVar, TMutex& m, TInstant deadLine); + bool WaitCondVarT(TCondVar& condVar, TMutex& m, TDuration timeOut); + void WaitCondVarI(TCondVar& condVar, TMutex& m); + void WaitCondVar(TCondVar& condVar, TMutex& m); + + void Sleep(TDuration timeout); + void SleepUntil(TInstant instant); + +private: + ::TIntrusivePtr<IWaitProxy> Proxy_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/common/ya.make b/yt/cpp/mapreduce/common/ya.make new file mode 100644 index 0000000000..004708cb44 --- /dev/null +++ b/yt/cpp/mapreduce/common/ya.make @@ -0,0 +1,23 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + debug_metrics.cpp + helpers.cpp + retry_lib.cpp + wait_proxy.cpp +) + +PEERDIR( + 
library/cpp/json + library/cpp/svnversion + library/cpp/threading/future + library/cpp/yson + library/cpp/yson/json + library/cpp/yson/node + yt/cpp/mapreduce/interface + yt/cpp/mapreduce/interface/logging +) + +END() diff --git a/yt/cpp/mapreduce/http/abortable_http_response.cpp b/yt/cpp/mapreduce/http/abortable_http_response.cpp new file mode 100644 index 0000000000..9da9241d33 --- /dev/null +++ b/yt/cpp/mapreduce/http/abortable_http_response.cpp @@ -0,0 +1,223 @@ +#include "abortable_http_response.h" + +#include <util/system/mutex.h> +#include <util/generic/singleton.h> +#include <util/generic/hash_set.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class TAbortableHttpResponseRegistry { +public: + TOutageId StartOutage(TString urlPattern, const TOutageOptions& options) + { + auto g = Guard(Lock_); + auto id = NextId_++; + IdToOutage.emplace(id, TOutageEntry{std::move(urlPattern), options.ResponseCount_, options.LengthLimit_}); + return id; + } + + void StopOutage(TOutageId id) + { + auto g = Guard(Lock_); + IdToOutage.erase(id); + } + + void Add(IAbortableHttpResponse* response) + { + auto g = Guard(Lock_); + for (auto& [id, entry] : IdToOutage) { + if (entry.Counter > 0 && response->GetUrl().find(entry.Pattern) != TString::npos) { + response->SetLengthLimit(entry.LengthLimit); + entry.Counter -= 1; + } + } + ResponseList_.PushBack(response); + } + + void Remove(IAbortableHttpResponse* response) + { + auto g = Guard(Lock_); + response->Unlink(); + } + + static TAbortableHttpResponseRegistry& Get() + { + return *Singleton<TAbortableHttpResponseRegistry>(); + } + + int AbortAll(const TString& urlPattern) + { + int result = 0; + for (auto& response : ResponseList_) { + if (!response.IsAborted() && response.GetUrl().find(urlPattern) != TString::npos) { + response.Abort(); + ++result; + } + } + return result; + } + +private: + struct TOutageEntry + { + TString Pattern; + size_t Counter; + size_t 
LengthLimit; + }; + +private: + TOutageId NextId_ = 0; + TIntrusiveList<IAbortableHttpResponse> ResponseList_; + THashMap<TOutageId, TOutageEntry> IdToOutage; + TMutex Lock_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +TAbortableHttpResponse::TOutage::TOutage( + TString urlPattern, + TAbortableHttpResponseRegistry& registry, + const TOutageOptions& options) + : UrlPattern_(std::move(urlPattern)) + , Registry_(registry) + , Id_(registry.StartOutage(UrlPattern_, options)) +{ } + +TAbortableHttpResponse::TOutage::~TOutage() +{ + Stop(); +} + +void TAbortableHttpResponse::TOutage::Stop() +{ + if (!Stopped_) { + Registry_.StopOutage(Id_); + Stopped_ = true; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +TAbortableHttpResponseBase::TAbortableHttpResponseBase(const TString& url) + : Url_(url) +{ + TAbortableHttpResponseRegistry::Get().Add(this); +} + +TAbortableHttpResponseBase::~TAbortableHttpResponseBase() +{ + TAbortableHttpResponseRegistry::Get().Remove(this); +} + +void TAbortableHttpResponseBase::Abort() +{ + Aborted_ = true; +} + +void TAbortableHttpResponseBase::SetLengthLimit(size_t limit) +{ + LengthLimit_ = limit; + if (LengthLimit_ == 0) { + Abort(); + } +} + +const TString& TAbortableHttpResponseBase::GetUrl() const +{ + return Url_; +} + +bool TAbortableHttpResponseBase::IsAborted() const +{ + return Aborted_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TAbortableHttpResponse::TAbortableHttpResponse( + IInputStream* socketStream, + const TString& requestId, + const TString& hostName, + const TString& url) + : THttpResponse(socketStream, requestId, hostName) + , TAbortableHttpResponseBase(url) +{ +} + +size_t TAbortableHttpResponse::DoRead(void* buf, size_t len) +{ + if (Aborted_) { + ythrow TAbortedForTestPurpose() << "response was aborted"; + } + len = std::min(len, LengthLimit_); + auto read = 
THttpResponse::DoRead(buf, len); + LengthLimit_ -= read; + if (LengthLimit_ == 0) { + Abort(); + } + return read; +} + +size_t TAbortableHttpResponse::DoSkip(size_t len) +{ + if (Aborted_) { + ythrow TAbortedForTestPurpose() << "response was aborted"; + } + return THttpResponse::DoSkip(len); +} + +int TAbortableHttpResponse::AbortAll(const TString& urlPattern) +{ + return TAbortableHttpResponseRegistry::Get().AbortAll(urlPattern); +} + +TAbortableHttpResponse::TOutage TAbortableHttpResponse::StartOutage( + const TString& urlPattern, + const TOutageOptions& options) +{ + return TOutage(urlPattern, TAbortableHttpResponseRegistry::Get(), options); +} + +TAbortableHttpResponse::TOutage TAbortableHttpResponse::StartOutage( + const TString& urlPattern, + size_t responseCount) +{ + return StartOutage(urlPattern, TOutageOptions().ResponseCount(responseCount)); +} + +TAbortableCoreHttpResponse::TAbortableCoreHttpResponse( + std::unique_ptr<IInputStream> stream, + const TString& url) + : TAbortableHttpResponseBase(url) + , Stream_(std::move(stream)) +{ +} + +size_t TAbortableCoreHttpResponse::DoRead(void* buf, size_t len) +{ + if (Aborted_) { + ythrow TAbortedForTestPurpose() << "response was aborted"; + } + len = std::min(len, LengthLimit_); + auto read = Stream_->Read(buf, len); + LengthLimit_ -= read; + if (LengthLimit_ == 0) { + Abort(); + } + + return read; +} + +size_t TAbortableCoreHttpResponse::DoSkip(size_t len) +{ + if (Aborted_) { + ythrow TAbortedForTestPurpose() << "response was aborted"; + } + return Stream_->Skip(len); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/abortable_http_response.h b/yt/cpp/mapreduce/http/abortable_http_response.h new file mode 100644 index 0000000000..d72bcfa0a6 --- /dev/null +++ b/yt/cpp/mapreduce/http/abortable_http_response.h @@ -0,0 +1,142 @@ +#pragma once + +#include "http.h" + +#include <util/generic/intrlist.h> + +namespace NYT { + 
+//////////////////////////////////////////////////////////////////////////////// + +class TAbortableHttpResponseRegistry; + +using TOutageId = size_t; + +//////////////////////////////////////////////////////////////////////////////// + +class TAbortedForTestPurpose + : public yexception +{ }; + +struct TOutageOptions +{ + using TSelf = TOutageOptions; + + /// @brief Number of responses to abort. + FLUENT_FIELD_DEFAULT(size_t, ResponseCount, std::numeric_limits<size_t>::max()); + + /// @brief Number of bytes to read before abortion. If zero, abort immediately. + FLUENT_FIELD_DEFAULT(size_t, LengthLimit, 0); +}; + +//////////////////////////////////////////////////////////////////////////////// + +class IAbortableHttpResponse + : public TIntrusiveListItem<IAbortableHttpResponse> +{ +public: + virtual void Abort() = 0; + virtual const TString& GetUrl() const = 0; + virtual bool IsAborted() const = 0; + virtual void SetLengthLimit(size_t limit) = 0; + + virtual ~IAbortableHttpResponse() = default; +}; + +class TAbortableHttpResponseBase + : public IAbortableHttpResponse +{ +public: + TAbortableHttpResponseBase(const TString& url); + ~TAbortableHttpResponseBase(); + + void Abort() override; + const TString& GetUrl() const override; + bool IsAborted() const override; + void SetLengthLimit(size_t limit) override; + +protected: + TString Url_; + std::atomic<bool> Aborted_ = {false}; + size_t LengthLimit_ = std::numeric_limits<size_t>::max(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Stream wrapper for @ref NYT::NHttpClient::TCoreHttpResponse with possibility to emulate errors. 
+class TAbortableCoreHttpResponse + : public IInputStream + , public TAbortableHttpResponseBase +{ +public: + TAbortableCoreHttpResponse( + std::unique_ptr<IInputStream> stream, + const TString& url); + +private: + size_t DoRead(void* buf, size_t len) override; + size_t DoSkip(size_t len) override; + +private: + std::unique_ptr<IInputStream> Stream_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Class extends @ref NYT::THttpResponse with possibility to emulate errors. +class TAbortableHttpResponse + : public THttpResponse + , public TAbortableHttpResponseBase +{ +public: + class TOutage + { + public: + TOutage(TString urlPattern, TAbortableHttpResponseRegistry& registry, const TOutageOptions& options); + TOutage(TOutage&&) = default; + TOutage(const TOutage&) = delete; + ~TOutage(); + + void Stop(); + + private: + TString UrlPattern_; + TAbortableHttpResponseRegistry& Registry_; + TOutageId Id_; + bool Stopped_ = false; + }; + +public: + TAbortableHttpResponse( + IInputStream* socketStream, + const TString& requestId, + const TString& hostName, + const TString& url); + + /// @brief Abort any responses which match `urlPattern` (i.e. contain it in url). + /// + /// @return number of aborted responses. + static int AbortAll(const TString& urlPattern); + + /// @brief Start outage. Future responses which match `urlPattern` (i.e. contain it in url) will fail. + /// + /// @return outage object controlling the lifetime of outage (outage stops when object is destroyed) + [[nodiscard]] static TOutage StartOutage( + const TString& urlPattern, + const TOutageOptions& options = TOutageOptions()); + + /// @brief Start outage. Future `responseCount` responses which match `urlPattern` (i.e. contain it in url) will fail. 
+ /// + /// @return outage object controlling the lifetime of outage (outage stops when object is destroyed) + [[nodiscard]] static TOutage StartOutage( + const TString& urlPattern, + size_t responseCount); + +private: + size_t DoRead(void* buf, size_t len) override; + size_t DoSkip(size_t len) override; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/context.cpp b/yt/cpp/mapreduce/http/context.cpp new file mode 100644 index 0000000000..1c016263c5 --- /dev/null +++ b/yt/cpp/mapreduce/http/context.cpp @@ -0,0 +1,25 @@ +#include "context.h" + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +bool operator==(const TClientContext& lhs, const TClientContext& rhs) +{ + return lhs.ServerName == rhs.ServerName && + lhs.Token == rhs.Token && + lhs.ImpersonationUser == rhs.ImpersonationUser && + lhs.ServiceTicketAuth == rhs.ServiceTicketAuth && + lhs.HttpClient == rhs.HttpClient && + lhs.UseTLS == rhs.UseTLS && + lhs.TvmOnly == rhs.TvmOnly; +} + +bool operator!=(const TClientContext& lhs, const TClientContext& rhs) +{ + return !(rhs == lhs); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/context.h b/yt/cpp/mapreduce/http/context.h new file mode 100644 index 0000000000..3926373e17 --- /dev/null +++ b/yt/cpp/mapreduce/http/context.h @@ -0,0 +1,31 @@ +#pragma once + +#include "fwd.h" + +#include <yt/cpp/mapreduce/interface/common.h> +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/public.h> + + +namespace NYT { + +/////////////////////////////////////////////////////////////////////////////// + +struct TClientContext +{ + TString ServerName; + TString Token; + TMaybe<TString> ImpersonationUser; + NAuth::IServiceTicketAuthPtrWrapperPtr ServiceTicketAuth; + NHttpClient::IHttpClientPtr HttpClient; + 
bool TvmOnly = false; + bool UseTLS = false; + TConfigPtr Config = TConfig::Get(); +}; + +bool operator==(const TClientContext& lhs, const TClientContext& rhs); +bool operator!=(const TClientContext& lhs, const TClientContext& rhs); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/core.h b/yt/cpp/mapreduce/http/core.h new file mode 100644 index 0000000000..37c74d7551 --- /dev/null +++ b/yt/cpp/mapreduce/http/core.h @@ -0,0 +1,27 @@ +#pragma once + +#include <yt/yt/core/http/public.h> + +#include <memory> + +namespace NYT::NHttp { + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Wrapper for THeaderPtr which allows to hide NYT::IntrusivePtr from interfaces. +struct THeadersPtrWrapper +{ + THeadersPtrWrapper(THeadersPtr ptr) + : Ptr(std::make_shared<THeadersPtr>(std::move(ptr))) + { } + + THeadersPtr Get() { + return *Ptr; + } + + std::shared_ptr<THeadersPtr> Ptr; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NHttp diff --git a/yt/cpp/mapreduce/http/fwd.h b/yt/cpp/mapreduce/http/fwd.h new file mode 100644 index 0000000000..62891731f6 --- /dev/null +++ b/yt/cpp/mapreduce/http/fwd.h @@ -0,0 +1,26 @@ +#pragma once + +#include <memory> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +struct TClientContext; +class THttpHeader; + +namespace NHttpClient { + +class IHttpClient; +class IHttpRequest; +class IHttpResponse; + +using IHttpClientPtr = std::shared_ptr<IHttpClient>; +using IHttpResponsePtr = std::unique_ptr<IHttpResponse>; +using IHttpRequestPtr = std::unique_ptr<IHttpRequest>; + +} // namespace NHttpClient + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/helpers.cpp b/yt/cpp/mapreduce/http/helpers.cpp new file mode 
100644 index 0000000000..233a565f20 --- /dev/null +++ b/yt/cpp/mapreduce/http/helpers.cpp @@ -0,0 +1,88 @@ +#include "helpers.h" + +#include "context.h" +#include "requests.h" + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <library/cpp/yson/node/node_io.h> + +namespace NYT { + +/////////////////////////////////////////////////////////////////////////////// + +TString CreateHostNameWithPort(const TString& hostName, const TClientContext& context) +{ + static constexpr int HttpProxyPort = 80; + static constexpr int HttpsProxyPort = 443; + + static constexpr int TvmOnlyHttpProxyPort = 9026; + static constexpr int TvmOnlyHttpsProxyPort = 9443; + + if (hostName.find(':') == TString::npos) { + int port; + if (context.TvmOnly) { + port = context.UseTLS + ? TvmOnlyHttpsProxyPort + : TvmOnlyHttpProxyPort; + } else { + port = context.UseTLS + ? HttpsProxyPort + : HttpProxyPort; + } + return Format("%v:%v", hostName, port); + } + return hostName; +} + +TString GetFullUrl(const TString& hostName, const TClientContext& context, THttpHeader& header) +{ + Y_UNUSED(context); + return Format("http://%v%v", hostName, header.GetUrl()); +} + +static TString GetParametersDebugString(const THttpHeader& header) +{ + const auto& parameters = header.GetParameters(); + if (parameters.Empty()) { + return "<empty>"; + } else { + return NodeToYsonString(parameters); + } +} + +TString TruncateForLogs(const TString& text, size_t maxSize) +{ + Y_VERIFY(maxSize > 10); + if (text.empty()) { + static TString empty = "empty"; + return empty; + } else if (text.size() > maxSize) { + TStringStream out; + out << text.substr(0, maxSize) + "... 
(" << text.size() << " bytes total)"; + return out.Str(); + } else { + return text; + } +} + +TString GetLoggedAttributes(const THttpHeader& header, const TString& url, bool includeParameters, size_t sizeLimit) +{ + const auto parametersDebugString = GetParametersDebugString(header); + TStringStream out; + out << "Method: " << url << "; " + << "X-YT-Parameters (sent in " << (includeParameters ? "header" : "body") << "): " << TruncateForLogs(parametersDebugString, sizeLimit); + return out.Str(); +} + +void LogRequest(const THttpHeader& header, const TString& url, bool includeParameters, const TString& requestId, const TString& hostName) +{ + YT_LOG_DEBUG("REQ %v - sending request (HostName: %v; %v)", + requestId, + hostName, + GetLoggedAttributes(header, url, includeParameters, Max<size_t>())); +} + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/helpers.h b/yt/cpp/mapreduce/http/helpers.h new file mode 100644 index 0000000000..0c510fa2e8 --- /dev/null +++ b/yt/cpp/mapreduce/http/helpers.h @@ -0,0 +1,25 @@ +#pragma once + +#include "fwd.h" + +#include "http.h" + +#include <util/generic/fwd.h> + +namespace NYT { + +/////////////////////////////////////////////////////////////////////////////// + +TString CreateHostNameWithPort(const TString& name, const TClientContext& context); + +TString GetFullUrl(const TString& hostName, const TClientContext& context, THttpHeader& header); + +TString TruncateForLogs(const TString& text, size_t maxSize); + +TString GetLoggedAttributes(const THttpHeader& header, const TString& url, bool includeParameters, size_t sizeLimit); + +void LogRequest(const THttpHeader& header, const TString& url, bool includeParameters, const TString& requestId, const TString& hostName); + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/host_manager.cpp 
b/yt/cpp/mapreduce/http/host_manager.cpp new file mode 100644 index 0000000000..a239dde769 --- /dev/null +++ b/yt/cpp/mapreduce/http/host_manager.cpp @@ -0,0 +1,140 @@ +#include "host_manager.h" + +#include "context.h" +#include "helpers.h" +#include "http.h" +#include "http_client.h" +#include "requests.h" + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <yt/cpp/mapreduce/interface/config.h> + +#include <library/cpp/json/json_reader.h> + +#include <util/generic/guid.h> +#include <util/generic/vector.h> +#include <util/generic/singleton.h> +#include <util/generic/ymath.h> + +#include <util/random/random.h> + +#include <util/string/vector.h> + +namespace NYT::NPrivate { + +//////////////////////////////////////////////////////////////////////////////// + +static TVector<TString> ParseJsonStringArray(const TString& response) +{ + NJson::TJsonValue value; + TStringInput input(response); + NJson::ReadJsonTree(&input, &value); + + const NJson::TJsonValue::TArray& array = value.GetArray(); + TVector<TString> result; + result.reserve(array.size()); + for (size_t i = 0; i < array.size(); ++i) { + result.push_back(array[i].GetString()); + } + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +class THostManager::TClusterHostList +{ +public: + explicit TClusterHostList(TVector<TString> hosts) + : Hosts_(std::move(hosts)) + , Timestamp_(TInstant::Now()) + { } + + explicit TClusterHostList(std::exception_ptr error) + : Error_(std::move(error)) + , Timestamp_(TInstant::Now()) + { } + + TString ChooseHostOrThrow() const + { + if (Error_) { + std::rethrow_exception(Error_); + } + + if (Hosts_.empty()) { + ythrow yexception() << "fetched list of proxies is empty"; + } + + return Hosts_[RandomNumber<size_t>(Hosts_.size())]; + } + + TDuration GetAge() const + { + return TInstant::Now() - Timestamp_; + } + +private: + TVector<TString> Hosts_; + std::exception_ptr Error_; + TInstant Timestamp_; +}; + 
+//////////////////////////////////////////////////////////////////////////////// + +THostManager& THostManager::Get() +{ + return *Singleton<THostManager>(); +} + +void THostManager::Reset() +{ + auto guard = Guard(Lock_); + ClusterHosts_.clear(); +} + +TString THostManager::GetProxyForHeavyRequest(const TClientContext& context) +{ + auto cluster = context.ServerName; + { + auto guard = Guard(Lock_); + auto it = ClusterHosts_.find(cluster); + if (it != ClusterHosts_.end() && it->second.GetAge() < context.Config->HostListUpdateInterval) { + return it->second.ChooseHostOrThrow(); + } + } + + auto hostList = GetHosts(context); + auto result = hostList.ChooseHostOrThrow(); + { + auto guard = Guard(Lock_); + ClusterHosts_.emplace(cluster, std::move(hostList)); + } + return result; +} + +THostManager::TClusterHostList THostManager::GetHosts(const TClientContext& context) +{ + TString hostsEndpoint = context.Config->Hosts; + while (hostsEndpoint.StartsWith("/")) { + hostsEndpoint = hostsEndpoint.substr(1); + } + THttpHeader header("GET", hostsEndpoint, false); + + try { + auto hostName = context.ServerName; + auto requestId = CreateGuidAsString(); + // TODO: we need to set socket timeout here + auto response = context.HttpClient->Request(GetFullUrl(hostName, context, header), requestId, header); + auto hosts = ParseJsonStringArray(response->GetResponse()); + for (auto& host : hosts) { + host = CreateHostNameWithPort(host, context); + } + return TClusterHostList(std::move(hosts)); + } catch (const std::exception& e) { + return TClusterHostList(std::current_exception()); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NPrivate diff --git a/yt/cpp/mapreduce/http/host_manager.h b/yt/cpp/mapreduce/http/host_manager.h new file mode 100644 index 0000000000..fdbb740566 --- /dev/null +++ b/yt/cpp/mapreduce/http/host_manager.h @@ -0,0 +1,37 @@ +#pragma once + +#include "fwd.h" + +#include <util/generic/string.h> 
+#include <util/generic/hash.h> +#include <util/system/spinlock.h> + + +namespace NYT::NPrivate { + +//////////////////////////////////////////////////////////////////////////////// + +class THostManager +{ +public: + static THostManager& Get(); + + TString GetProxyForHeavyRequest(const TClientContext& context); + + // For testing purposes only. + void Reset(); + +private: + class TClusterHostList; + +private: + TAdaptiveLock Lock_; + THashMap<TString, TClusterHostList> ClusterHosts_; + +private: + static TClusterHostList GetHosts(const TClientContext& context); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NPrivate diff --git a/yt/cpp/mapreduce/http/http.cpp b/yt/cpp/mapreduce/http/http.cpp new file mode 100644 index 0000000000..d44b2638a0 --- /dev/null +++ b/yt/cpp/mapreduce/http/http.cpp @@ -0,0 +1,1014 @@ +#include "http.h" + +#include "abortable_http_response.h" +#include "core.h" +#include "helpers.h" + +#include <yt/cpp/mapreduce/common/helpers.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> +#include <yt/cpp/mapreduce/common/wait_proxy.h> + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/errors.h> +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <yt/yt/core/http/http.h> + +#include <library/cpp/json/json_writer.h> + +#include <library/cpp/string_utils/base64/base64.h> +#include <library/cpp/string_utils/quote/quote.h> + +#include <util/generic/singleton.h> +#include <util/generic/algorithm.h> + +#include <util/stream/mem.h> + +#include <util/string/builder.h> +#include <util/string/cast.h> +#include <util/string/escape.h> +#include <util/string/printf.h> + +#include <util/system/byteorder.h> +#include <util/system/getpid.h> + +#include <exception> + + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class THttpRequest::TRequestStream + : public IOutputStream +{ +public: + 
TRequestStream(THttpRequest* httpRequest, const TSocket& s) + : HttpRequest_(httpRequest) + , SocketOutput_(s) + , HttpOutput_(static_cast<IOutputStream*>(&SocketOutput_)) + { + HttpOutput_.EnableKeepAlive(true); + } + +private: + void DoWrite(const void* buf, size_t len) override + { + WrapWriteFunc([&] { + HttpOutput_.Write(buf, len); + }); + } + + void DoWriteV(const TPart* parts, size_t count) override + { + WrapWriteFunc([&] { + HttpOutput_.Write(parts, count); + }); + } + + void DoWriteC(char ch) override + { + WrapWriteFunc([&] { + HttpOutput_.Write(ch); + }); + } + + void DoFlush() override + { + WrapWriteFunc([&] { + HttpOutput_.Flush(); + }); + } + + void DoFinish() override + { + WrapWriteFunc([&] { + HttpOutput_.Finish(); + }); + } + + void WrapWriteFunc(std::function<void()> func) + { + CheckErrorState(); + try { + func(); + } catch (const std::exception&) { + HandleWriteException(); + } + } + + // In many cases http proxy stops reading request and resets connection + // if error has happend. This function tries to read error response + // in such cases. + void HandleWriteException() { + Y_VERIFY(WriteError_ == nullptr); + WriteError_ = std::current_exception(); + Y_VERIFY(WriteError_ != nullptr); + try { + HttpRequest_->GetResponseStream(); + } catch (const TErrorResponse &) { + throw; + } catch (...) 
{ + } + std::rethrow_exception(WriteError_); + } + + void CheckErrorState() + { + if (WriteError_) { + std::rethrow_exception(WriteError_); + } + } + +private: + THttpRequest* const HttpRequest_; + TSocketOutput SocketOutput_; + THttpOutput HttpOutput_; + std::exception_ptr WriteError_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +THttpHeader::THttpHeader(const TString& method, const TString& command, bool isApi) + : Method(method) + , Command(command) + , IsApi(isApi) +{ } + +void THttpHeader::AddParameter(const TString& key, TNode value, bool overwrite) +{ + auto it = Parameters.find(key); + if (it == Parameters.end()) { + Parameters.emplace(key, std::move(value)); + } else { + if (overwrite) { + it->second = std::move(value); + } else { + ythrow yexception() << "Duplicate key: " << key; + } + } +} + +void THttpHeader::MergeParameters(const TNode& newParameters, bool overwrite) +{ + for (const auto& p : newParameters.AsMap()) { + AddParameter(p.first, p.second, overwrite); + } +} + +void THttpHeader::RemoveParameter(const TString& key) +{ + Parameters.erase(key); +} + +TNode THttpHeader::GetParameters() const +{ + return Parameters; +} + +void THttpHeader::AddTransactionId(const TTransactionId& transactionId, bool overwrite) +{ + if (transactionId) { + AddParameter("transaction_id", GetGuidAsString(transactionId), overwrite); + } else { + RemoveParameter("transaction_id"); + } +} + +void THttpHeader::AddPath(const TString& path, bool overwrite) +{ + AddParameter("path", path, overwrite); +} + +void THttpHeader::AddOperationId(const TOperationId& operationId, bool overwrite) +{ + AddParameter("operation_id", GetGuidAsString(operationId), overwrite); +} + +void THttpHeader::AddMutationId() +{ + TGUID guid; + + // Some users use `fork()' with yt wrapper + // (actually they use python + multiprocessing) + // and CreateGuid is not resistant to `fork()', so spice it a little bit. 
+ // + // Check IGNIETFERRO-610 + CreateGuid(&guid); + guid.dw[2] = GetPID() ^ MicroSeconds(); + + AddParameter("mutation_id", GetGuidAsString(guid), true); +} + +bool THttpHeader::HasMutationId() const +{ + return Parameters.contains("mutation_id"); +} + +void THttpHeader::SetToken(const TString& token) +{ + Token = token; +} + +void THttpHeader::SetImpersonationUser(const TString& impersonationUser) +{ + ImpersonationUser = impersonationUser; +} + +void THttpHeader::SetServiceTicket(const TString& ticket) +{ + ServiceTicket = ticket; +} + +void THttpHeader::SetInputFormat(const TMaybe<TFormat>& format) +{ + InputFormat = format; +} + +void THttpHeader::SetOutputFormat(const TMaybe<TFormat>& format) +{ + OutputFormat = format; +} + +TMaybe<TFormat> THttpHeader::GetOutputFormat() const +{ + return OutputFormat; +} + +void THttpHeader::SetRequestCompression(const TString& compression) +{ + RequestCompression = compression; +} + +void THttpHeader::SetResponseCompression(const TString& compression) +{ + ResponseCompression = compression; +} + +TString THttpHeader::GetCommand() const +{ + return Command; +} + +TString THttpHeader::GetUrl() const +{ + TStringStream url; + + if (IsApi) { + url << "/api/" << TConfig::Get()->ApiVersion << "/" << Command; + } else { + url << "/" << Command; + } + + return url.Str(); +} + +bool THttpHeader::ShouldAcceptFraming() const +{ + return TConfig::Get()->CommandsWithFraming.contains(Command); +} + +TString THttpHeader::GetHeaderAsString(const TString& hostName, const TString& requestId, bool includeParameters) const +{ + TStringStream result; + + result << Method << " " << GetUrl() << " HTTP/1.1\r\n"; + + GetHeader(hostName, requestId, includeParameters).Get()->WriteTo(&result); + + if (ShouldAcceptFraming()) { + result << "X-YT-Accept-Framing: 1\r\n"; + } + + result << "\r\n"; + + return result.Str(); +} + +NHttp::THeadersPtrWrapper THttpHeader::GetHeader(const TString& hostName, const TString& requestId, bool includeParameters) 
const +{ + auto headers = New<NHttp::THeaders>(); + + headers->Add("Host", hostName); + headers->Add("User-Agent", TProcessState::Get()->ClientVersion); + + if (!Token.empty()) { + headers->Add("Authorization", "OAuth " + Token); + } + if (!ServiceTicket.empty()) { + headers->Add("X-Ya-Service-Ticket", ServiceTicket); + } + if (!ImpersonationUser.empty()) { + headers->Add("X-Yt-User-Name", ImpersonationUser); + } + + if (Method == "PUT" || Method == "POST") { + headers->Add("Transfer-Encoding", "chunked"); + } + + headers->Add("X-YT-Correlation-Id", requestId); + headers->Add("X-YT-Header-Format", "<format=text>yson"); + + headers->Add("Content-Encoding", RequestCompression); + headers->Add("Accept-Encoding", ResponseCompression); + + auto printYTHeader = [&headers] (const char* headerName, const TString& value) { + static const size_t maxHttpHeaderSize = 64 << 10; + if (!value) { + return; + } + if (value.size() <= maxHttpHeaderSize) { + headers->Add(headerName, value); + return; + } + + TString encoded; + Base64Encode(value, encoded); + auto ptr = encoded.data(); + auto finish = encoded.data() + encoded.size(); + size_t index = 0; + do { + auto end = Min(ptr + maxHttpHeaderSize, finish); + headers->Add(Format("%v%v", headerName, index++), TString(ptr, end)); + ptr = end; + } while (ptr != finish); + }; + + if (InputFormat) { + printYTHeader("X-YT-Input-Format", NodeToYsonString(InputFormat->Config)); + } + if (OutputFormat) { + printYTHeader("X-YT-Output-Format", NodeToYsonString(OutputFormat->Config)); + } + if (includeParameters) { + printYTHeader("X-YT-Parameters", NodeToYsonString(Parameters)); + } + + return NHttp::THeadersPtrWrapper(std::move(headers)); +} + +const TString& THttpHeader::GetMethod() const +{ + return Method; +} + +//////////////////////////////////////////////////////////////////////////////// + +TAddressCache* TAddressCache::Get() +{ + return Singleton<TAddressCache>(); +} + +bool ContainsAddressOfRequiredVersion(const 
TAddressCache::TAddressPtr& address) +{ + if (!TConfig::Get()->ForceIpV4 && !TConfig::Get()->ForceIpV6) { + return true; + } + + for (auto i = address->Begin(); i != address->End(); ++i) { + const auto& addressInfo = *i; + if (TConfig::Get()->ForceIpV4 && addressInfo.ai_family == AF_INET) { + return true; + } + if (TConfig::Get()->ForceIpV6 && addressInfo.ai_family == AF_INET6) { + return true; + } + } + return false; +} + +TAddressCache::TAddressPtr TAddressCache::Resolve(const TString& hostName) +{ + auto address = FindAddress(hostName); + if (address) { + return address; + } + + TString host(hostName); + ui16 port = 80; + + auto colon = hostName.find(':'); + if (colon != TString::npos) { + port = FromString<ui16>(hostName.substr(colon + 1)); + host = hostName.substr(0, colon); + } + + auto retryPolicy = CreateDefaultRequestRetryPolicy(TConfig::Get()); + auto error = yexception() << "can not resolve address of required version for host " << hostName; + while (true) { + address = new TNetworkAddress(host, port); + if (ContainsAddressOfRequiredVersion(address)) { + break; + } + retryPolicy->NotifyNewAttempt(); + YT_LOG_DEBUG("Failed to resolve address of required version for host %v, retrying: %v", + hostName, + retryPolicy->GetAttemptDescription()); + if (auto backoffDuration = retryPolicy->OnGenericError(error)) { + NDetail::TWaitProxy::Get()->Sleep(*backoffDuration); + } else { + ythrow error; + } + } + + AddAddress(hostName, address); + return address; +} + +TAddressCache::TAddressPtr TAddressCache::FindAddress(const TString& hostName) const +{ + TCacheEntry entry; + { + TReadGuard guard(Lock_); + auto it = Cache_.find(hostName); + if (it == Cache_.end()) { + return nullptr; + } + entry = it->second; + } + + if (TInstant::Now() > entry.ExpirationTime) { + YT_LOG_DEBUG("Address resolution cache entry for host %v is expired, will retry resolution", + hostName); + return nullptr; + } + + if (!ContainsAddressOfRequiredVersion(entry.Address)) { + 
YT_LOG_DEBUG("Address of required version not found for host %v, will retry resolution", + hostName); + return nullptr; + } + + return entry.Address; +} + +void TAddressCache::AddAddress(TString hostName, TAddressPtr address) +{ + auto entry = TCacheEntry{ + .Address = std::move(address), + .ExpirationTime = TInstant::Now() + TConfig::Get()->AddressCacheExpirationTimeout, + }; + + { + TWriteGuard guard(Lock_); + Cache_.emplace(std::move(hostName), std::move(entry)); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +TConnectionPool* TConnectionPool::Get() +{ + return Singleton<TConnectionPool>(); +} + +TConnectionPtr TConnectionPool::Connect( + const TString& hostName, + TDuration socketTimeout) +{ + Refresh(); + + if (socketTimeout == TDuration::Zero()) { + socketTimeout = TConfig::Get()->SocketTimeout; + } + + { + auto guard = Guard(Lock_); + auto now = TInstant::Now(); + auto range = Connections_.equal_range(hostName); + for (auto it = range.first; it != range.second; ++it) { + auto& connection = it->second; + if (connection->DeadLine < now) { + continue; + } + if (!AtomicCas(&connection->Busy, 1, 0)) { + continue; + } + + connection->DeadLine = now + socketTimeout; + connection->Socket->SetSocketTimeout(socketTimeout.Seconds()); + return connection; + } + } + + TConnectionPtr connection(new TConnection); + + auto networkAddress = TAddressCache::Get()->Resolve(hostName); + TSocketHolder socket(DoConnect(networkAddress)); + SetNonBlock(socket, false); + + connection->Socket.Reset(new TSocket(socket.Release())); + + connection->DeadLine = TInstant::Now() + socketTimeout; + connection->Socket->SetSocketTimeout(socketTimeout.Seconds()); + + { + auto guard = Guard(Lock_); + static ui32 connectionId = 0; + connection->Id = ++connectionId; + Connections_.insert({hostName, connection}); + } + + YT_LOG_DEBUG("New connection to %v #%v opened", + hostName, + connection->Id); + + return connection; +} + +void 
TConnectionPool::Release(TConnectionPtr connection) +{ + auto socketTimeout = TConfig::Get()->SocketTimeout; + auto newDeadline = TInstant::Now() + socketTimeout; + + { + auto guard = Guard(Lock_); + connection->DeadLine = newDeadline; + } + + connection->Socket->SetSocketTimeout(socketTimeout.Seconds()); + AtomicSet(connection->Busy, 0); + + Refresh(); +} + +void TConnectionPool::Invalidate( + const TString& hostName, + TConnectionPtr connection) +{ + auto guard = Guard(Lock_); + auto range = Connections_.equal_range(hostName); + for (auto it = range.first; it != range.second; ++it) { + if (it->second == connection) { + YT_LOG_DEBUG("Closing connection #%v", + connection->Id); + Connections_.erase(it); + return; + } + } +} + +void TConnectionPool::Refresh() +{ + auto guard = Guard(Lock_); + + // simple, since we don't expect too many connections + using TItem = std::pair<TInstant, TConnectionMap::iterator>; + std::vector<TItem> sortedConnections; + for (auto it = Connections_.begin(); it != Connections_.end(); ++it) { + sortedConnections.emplace_back(it->second->DeadLine, it); + } + + std::sort( + sortedConnections.begin(), + sortedConnections.end(), + [] (const TItem& a, const TItem& b) -> bool { + return a.first < b.first; + }); + + auto removeCount = static_cast<int>(Connections_.size()) - TConfig::Get()->ConnectionPoolSize; + + const auto now = TInstant::Now(); + for (const auto& item : sortedConnections) { + const auto& mapIterator = item.second; + auto connection = mapIterator->second; + if (AtomicGet(connection->Busy)) { + continue; + } + + if (removeCount > 0) { + Connections_.erase(mapIterator); + YT_LOG_DEBUG("Closing connection #%v (too many opened connections)", + connection->Id); + --removeCount; + continue; + } + + if (connection->DeadLine < now) { + Connections_.erase(mapIterator); + YT_LOG_DEBUG("Closing connection #%v (timeout)", + connection->Id); + } + } +} + +SOCKET TConnectionPool::DoConnect(TAddressCache::TAddressPtr address) +{ + int 
lastError = 0; + + for (auto i = address->Begin(); i != address->End(); ++i) { + struct addrinfo* info = &*i; + + if (TConfig::Get()->ForceIpV4 && info->ai_family != AF_INET) { + continue; + } + + if (TConfig::Get()->ForceIpV6 && info->ai_family != AF_INET6) { + continue; + } + + TSocketHolder socket( + ::socket(info->ai_family, info->ai_socktype, info->ai_protocol)); + + if (socket.Closed()) { + lastError = LastSystemError(); + continue; + } + + SetNonBlock(socket, true); + if (TConfig::Get()->SocketPriority) { + SetSocketPriority(socket, *TConfig::Get()->SocketPriority); + } + + if (connect(socket, info->ai_addr, info->ai_addrlen) == 0) + return socket.Release(); + + int err = LastSystemError(); + if (err == EINPROGRESS || err == EAGAIN || err == EWOULDBLOCK) { + struct pollfd p = { + socket, + POLLOUT, + 0 + }; + const ssize_t n = PollD(&p, 1, TInstant::Now() + TConfig::Get()->ConnectTimeout); + if (n < 0) { + ythrow TSystemError(-(int)n) << "can not connect to " << info; + } + CheckedGetSockOpt(socket, SOL_SOCKET, SO_ERROR, err, "socket error"); + if (!err) + return socket.Release(); + } + + lastError = err; + continue; + } + + ythrow TSystemError(lastError) << "can not connect to " << *address; +} + +//////////////////////////////////////////////////////////////////////////////// + +static TMaybe<TString> GetProxyName(const THttpInput& input) +{ + if (auto proxyHeader = input.Headers().FindHeader("X-YT-Proxy")) { + return proxyHeader->Value(); + } + return Nothing(); +} + +THttpResponse::THttpResponse( + IInputStream* socketStream, + const TString& requestId, + const TString& hostName) + : HttpInput_(socketStream) + , RequestId_(requestId) + , HostName_(GetProxyName(HttpInput_).GetOrElse(hostName)) + , Unframe_(HttpInput_.Headers().HasHeader("X-YT-Framing")) +{ + HttpCode_ = ParseHttpRetCode(HttpInput_.FirstLine()); + if (HttpCode_ == 200 || HttpCode_ == 202) { + return; + } + + ErrorResponse_ = TErrorResponse(HttpCode_, RequestId_); + + auto logAndSetError = 
[&] (const TString& rawError) { + YT_LOG_ERROR("RSP %v - HTTP %v - %v", + RequestId_, + HttpCode_, + rawError.data()); + ErrorResponse_->SetRawError(rawError); + }; + + switch (HttpCode_) { + case 429: + logAndSetError("request rate limit exceeded"); + break; + + case 500: + logAndSetError(::TStringBuilder() << "internal error in proxy " << HostName_); + break; + + default: { + TStringStream httpHeaders; + httpHeaders << "HTTP headers ("; + for (const auto& header : HttpInput_.Headers()) { + httpHeaders << header.Name() << ": " << header.Value() << "; "; + } + httpHeaders << ")"; + + auto errorString = Sprintf("RSP %s - HTTP %d - %s", + RequestId_.data(), + HttpCode_, + httpHeaders.Str().data()); + + YT_LOG_ERROR("%v", + errorString.data()); + + if (auto parsedResponse = ParseError(HttpInput_.Headers())) { + ErrorResponse_ = parsedResponse.GetRef(); + } else { + ErrorResponse_->SetRawError( + errorString + " - X-YT-Error is missing in headers"); + } + break; + } + } +} + +const THttpHeaders& THttpResponse::Headers() const +{ + return HttpInput_.Headers(); +} + +void THttpResponse::CheckErrorResponse() const +{ + if (ErrorResponse_) { + throw *ErrorResponse_; + } +} + +bool THttpResponse::IsExhausted() const +{ + return IsExhausted_; +} + +int THttpResponse::GetHttpCode() const +{ + return HttpCode_; +} + +const TString& THttpResponse::GetHostName() const +{ + return HostName_; +} + +bool THttpResponse::IsKeepAlive() const +{ + return HttpInput_.IsKeepAlive(); +} + +TMaybe<TErrorResponse> THttpResponse::ParseError(const THttpHeaders& headers) +{ + for (const auto& header : headers) { + if (header.Name() == "X-YT-Error") { + TErrorResponse errorResponse(HttpCode_, RequestId_); + errorResponse.ParseFromJsonError(header.Value()); + if (errorResponse.IsOk()) { + return Nothing(); + } + return errorResponse; + } + } + return Nothing(); +} + +size_t THttpResponse::DoRead(void* buf, size_t len) +{ + size_t read; + if (Unframe_) { + read = UnframeRead(buf, len); + } else { 
+ read = HttpInput_.Read(buf, len); + } + if (read == 0 && len != 0) { + // THttpInput MUST return defined (but may be empty) + // trailers when it is exhausted. + Y_VERIFY(HttpInput_.Trailers().Defined(), + "trailers MUST be defined for exhausted stream"); + CheckTrailers(HttpInput_.Trailers().GetRef()); + IsExhausted_ = true; + } + return read; +} + +size_t THttpResponse::DoSkip(size_t len) +{ + size_t skipped; + if (Unframe_) { + skipped = UnframeSkip(len); + } else { + skipped = HttpInput_.Skip(len); + } + if (skipped == 0 && len != 0) { + // THttpInput MUST return defined (but may be empty) + // trailers when it is exhausted. + Y_VERIFY(HttpInput_.Trailers().Defined(), + "trailers MUST be defined for exhausted stream"); + CheckTrailers(HttpInput_.Trailers().GetRef()); + IsExhausted_ = true; + } + return skipped; +} + +void THttpResponse::CheckTrailers(const THttpHeaders& trailers) +{ + if (auto errorResponse = ParseError(trailers)) { + errorResponse->SetIsFromTrailers(true); + YT_LOG_ERROR("RSP %v - %v", + RequestId_, + errorResponse.GetRef().what()); + ythrow errorResponse.GetRef(); + } +} + +static ui32 ReadDataFrameSize(THttpInput* stream) +{ + ui32 littleEndianSize; + auto read = stream->Load(&littleEndianSize, sizeof(littleEndianSize)); + if (read < sizeof(littleEndianSize)) { + ythrow yexception() << "Bad data frame header: " << + "expected " << sizeof(littleEndianSize) << " bytes, got " << read; + } + return LittleToHost(littleEndianSize); +} + +bool THttpResponse::RefreshFrameIfNecessary() +{ + while (RemainingFrameSize_ == 0) { + ui8 frameTypeByte; + auto read = HttpInput_.Read(&frameTypeByte, sizeof(frameTypeByte)); + if (read == 0) { + return false; + } + auto frameType = static_cast<EFrameType>(frameTypeByte); + switch (frameType) { + case EFrameType::KeepAlive: + break; + case EFrameType::Data: + RemainingFrameSize_ = ReadDataFrameSize(&HttpInput_); + break; + default: + ythrow yexception() << "Bad frame type " << static_cast<int>(frameTypeByte); 
+ } + } + return true; +} + +size_t THttpResponse::UnframeRead(void* buf, size_t len) +{ + if (!RefreshFrameIfNecessary()) { + return 0; + } + auto read = HttpInput_.Read(buf, Min(len, RemainingFrameSize_)); + RemainingFrameSize_ -= read; + return read; +} + +size_t THttpResponse::UnframeSkip(size_t len) +{ + if (!RefreshFrameIfNecessary()) { + return 0; + } + auto skipped = HttpInput_.Skip(Min(len, RemainingFrameSize_)); + RemainingFrameSize_ -= skipped; + return skipped; +} + +//////////////////////////////////////////////////////////////////////////////// + +THttpRequest::THttpRequest() +{ + RequestId = CreateGuidAsString(); +} + +THttpRequest::THttpRequest(const TString& requestId) + : RequestId(requestId) +{ } + +THttpRequest::~THttpRequest() +{ + if (!Connection) { + return; + } + + if (Input && Input->IsKeepAlive() && Input->IsExhausted()) { + // We should return to the pool only connections where HTTP response was fully read. + // Otherwise next reader might read our remaining data and misinterpret them (YT-6510). 
+ TConnectionPool::Get()->Release(Connection); + } else { + TConnectionPool::Get()->Invalidate(HostName, Connection); + } +} + +TString THttpRequest::GetRequestId() const +{ + return RequestId; +} + +void THttpRequest::Connect(TString hostName, TDuration socketTimeout) +{ + HostName = std::move(hostName); + YT_LOG_DEBUG("REQ %v - requesting connection to %v from connection pool", + RequestId, + HostName); + + StartTime_ = TInstant::Now(); + Connection = TConnectionPool::Get()->Connect(HostName, socketTimeout); + + YT_LOG_DEBUG("REQ %v - connection #%v", + RequestId, + Connection->Id); +} + +IOutputStream* THttpRequest::StartRequestImpl(const THttpHeader& header, bool includeParameters) +{ + auto strHeader = header.GetHeaderAsString(HostName, RequestId, includeParameters); + Url_ = header.GetUrl(); + + LogRequest(header, Url_, includeParameters, RequestId, HostName); + + LoggedAttributes_ = GetLoggedAttributes(header, Url_, includeParameters, 128); + + auto outputFormat = header.GetOutputFormat(); + if (outputFormat && outputFormat->IsTextYson()) { + LogResponse = true; + } + + RequestStream_ = MakeHolder<TRequestStream>(this, *Connection->Socket.Get()); + + RequestStream_->Write(strHeader.data(), strHeader.size()); + return RequestStream_.Get(); +} + +IOutputStream* THttpRequest::StartRequest(const THttpHeader& header) +{ + return StartRequestImpl(header, true); +} + +void THttpRequest::FinishRequest() +{ + RequestStream_->Flush(); + RequestStream_->Finish(); +} + +void THttpRequest::SmallRequest(const THttpHeader& header, TMaybe<TStringBuf> body) +{ + if (!body && (header.GetMethod() == "PUT" || header.GetMethod() == "POST")) { + const auto& parameters = header.GetParameters(); + auto parametersStr = NodeToYsonString(parameters); + auto* output = StartRequestImpl(header, false); + output->Write(parametersStr); + FinishRequest(); + } else { + auto* output = StartRequest(header); + if (body) { + output->Write(*body); + } + FinishRequest(); + } +} + +THttpResponse* 
THttpRequest::GetResponseStream() +{ + if (!Input) { + SocketInput.Reset(new TSocketInput(*Connection->Socket.Get())); + if (TConfig::Get()->UseAbortableResponse) { + Y_VERIFY(!Url_.empty()); + Input.Reset(new TAbortableHttpResponse(SocketInput.Get(), RequestId, HostName, Url_)); + } else { + Input.Reset(new THttpResponse(SocketInput.Get(), RequestId, HostName)); + } + Input->CheckErrorResponse(); + } + return Input.Get(); +} + +TString THttpRequest::GetResponse() +{ + TString result = GetResponseStream()->ReadAll(); + + TStringStream loggedAttributes; + loggedAttributes + << "Time: " << TInstant::Now() - StartTime_ << "; " + << "HostName: " << GetResponseStream()->GetHostName() << "; " + << LoggedAttributes_; + + if (LogResponse) { + constexpr auto sizeLimit = 1 << 7; + YT_LOG_DEBUG("RSP %v - received response (Response: '%v'; %v)", + RequestId, + TruncateForLogs(result, sizeLimit), + loggedAttributes.Str()); + } else { + YT_LOG_DEBUG("RSP %v - received response of %v bytes (%v)", + RequestId, + result.size(), + loggedAttributes.Str()); + } + return result; +} + +int THttpRequest::GetHttpCode() { + return GetResponseStream()->GetHttpCode(); +} + +void THttpRequest::InvalidateConnection() +{ + TConnectionPool::Get()->Invalidate(HostName, Connection); + Connection.Reset(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/http.h b/yt/cpp/mapreduce/http/http.h new file mode 100644 index 0000000000..ee8783088d --- /dev/null +++ b/yt/cpp/mapreduce/http/http.h @@ -0,0 +1,256 @@ +#pragma once + +#include "fwd.h" + +#include <yt/cpp/mapreduce/interface/common.h> +#include <yt/cpp/mapreduce/interface/errors.h> +#include <yt/cpp/mapreduce/interface/format.h> +#include <yt/cpp/mapreduce/interface/io.h> +#include <yt/cpp/mapreduce/interface/node.h> + +#include <library/cpp/deprecated/atomic/atomic.h> +#include <library/cpp/http/io/stream.h> + +#include <util/generic/hash.h> +#include 
<util/generic/hash_multi_map.h> +#include <util/generic/strbuf.h> +#include <util/generic/guid.h> +#include <util/network/socket.h> +#include <util/stream/input.h> +#include <util/system/mutex.h> +#include <util/system/rwlock.h> +#include <util/generic/ptr.h> + +namespace NYT { + +class TNode; + +namespace NHttp { + +struct THeadersPtrWrapper; + +} // NHttp + +/////////////////////////////////////////////////////////////////////////////// + +enum class EFrameType +{ + Data = 0x01, + KeepAlive = 0x02, +}; + + +class THttpHeader +{ +public: + THttpHeader(const TString& method, const TString& command, bool isApi = true); + + void AddParameter(const TString& key, TNode value, bool overwrite = false); + void RemoveParameter(const TString& key); + void MergeParameters(const TNode& parameters, bool overwrite = false); + TNode GetParameters() const; + + void AddTransactionId(const TTransactionId& transactionId, bool overwrite = false); + void AddPath(const TString& path, bool overwrite = false); + void AddOperationId(const TOperationId& operationId, bool overwrite = false); + void AddMutationId(); + bool HasMutationId() const; + + void SetToken(const TString& token); + void SetImpersonationUser(const TString& impersonationUser); + + void SetServiceTicket(const TString& ticket); + + void SetInputFormat(const TMaybe<TFormat>& format); + + void SetOutputFormat(const TMaybe<TFormat>& format); + TMaybe<TFormat> GetOutputFormat() const; + + void SetRequestCompression(const TString& compression); + void SetResponseCompression(const TString& compression); + + TString GetCommand() const; + TString GetUrl() const; + TString GetHeaderAsString(const TString& hostName, const TString& requestId, bool includeParameters = true) const; + NHttp::THeadersPtrWrapper GetHeader(const TString& hostName, const TString& requestId, bool includeParameters) const; + + const TString& GetMethod() const; + +private: + bool ShouldAcceptFraming() const; + +private: + const TString Method; + const TString 
Command; + const bool IsApi; + + TNode::TMapType Parameters; + TString ImpersonationUser; + TString Token; + TString ServiceTicket; + TNode Attributes; + +private: + TMaybe<TFormat> InputFormat = TFormat::YsonText(); + TMaybe<TFormat> OutputFormat = TFormat::YsonText(); + + TString RequestCompression = "identity"; + TString ResponseCompression = "identity"; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TAddressCache +{ +public: + using TAddressPtr = TAtomicSharedPtr<TNetworkAddress>; + + static TAddressCache* Get(); + + TAddressPtr Resolve(const TString& hostName); + +private: + struct TCacheEntry { + TAddressPtr Address; + TInstant ExpirationTime; + }; + +private: + TAddressPtr FindAddress(const TString& hostName) const; + void AddAddress(TString hostName, TAddressPtr address); + +private: + TRWMutex Lock_; + THashMap<TString, TCacheEntry> Cache_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct TConnection +{ + THolder<TSocket> Socket; + TAtomic Busy = 1; + TInstant DeadLine; + ui32 Id; +}; + +using TConnectionPtr = TAtomicSharedPtr<TConnection>; + +class TConnectionPool +{ +public: + using TConnectionMap = THashMultiMap<TString, TConnectionPtr>; + + static TConnectionPool* Get(); + + TConnectionPtr Connect(const TString& hostName, TDuration socketTimeout); + void Release(TConnectionPtr connection); + void Invalidate(const TString& hostName, TConnectionPtr connection); + +private: + void Refresh(); + static SOCKET DoConnect(TAddressCache::TAddressPtr address); + +private: + TMutex Lock_; + TConnectionMap Connections_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +// +// Input stream that handles YT-specific header/trailer errors +// and throws TErrorResponse if it finds any. 
+class THttpResponse + : public IInputStream +{ +public: + // 'requestId' and 'hostName' are provided for debug reasons + // (they will appear in some error messages). + THttpResponse( + IInputStream* socketStream, + const TString& requestId, + const TString& hostName); + + const THttpHeaders& Headers() const; + + void CheckErrorResponse() const; + bool IsExhausted() const; + int GetHttpCode() const; + const TString& GetHostName() const; + bool IsKeepAlive() const; + +protected: + size_t DoRead(void* buf, size_t len) override; + size_t DoSkip(size_t len) override; + +private: + void CheckTrailers(const THttpHeaders& trailers); + TMaybe<TErrorResponse> ParseError(const THttpHeaders& headers); + size_t UnframeRead(void* buf, size_t len); + size_t UnframeSkip(size_t len); + bool RefreshFrameIfNecessary(); + +private: + THttpInput HttpInput_; + const TString RequestId_; + const TString HostName_; + int HttpCode_ = 0; + TMaybe<TErrorResponse> ErrorResponse_; + bool IsExhausted_ = false; + const bool Unframe_; + size_t RemainingFrameSize_ = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class THttpRequest +{ +public: + THttpRequest(); + THttpRequest(const TString& requestId); + ~THttpRequest(); + + TString GetRequestId() const; + + void Connect(TString hostName, TDuration socketTimeout = TDuration::Zero()); + + IOutputStream* StartRequest(const THttpHeader& header); + void FinishRequest(); + + void SmallRequest(const THttpHeader& header, TMaybe<TStringBuf> body); + + THttpResponse* GetResponseStream(); + + TString GetResponse(); + + void InvalidateConnection(); + + int GetHttpCode(); + +private: + IOutputStream* StartRequestImpl(const THttpHeader& header, bool includeParameters); + +private: + class TRequestStream; + +private: + TString HostName; + TString RequestId; + TString Url_; + TInstant StartTime_; + TString LoggedAttributes_; + + TConnectionPtr Connection; + + THolder<TRequestStream> RequestStream_; + + 
THolder<TSocketInput> SocketInput; + THolder<THttpResponse> Input; + + bool LogResponse = false; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/http_client.cpp b/yt/cpp/mapreduce/http/http_client.cpp new file mode 100644 index 0000000000..a2af1182dc --- /dev/null +++ b/yt/cpp/mapreduce/http/http_client.cpp @@ -0,0 +1,603 @@ +#include "http_client.h" + +#include "abortable_http_response.h" +#include "core.h" +#include "helpers.h" +#include "http.h" + +#include <yt/cpp/mapreduce/interface/config.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <yt/yt/core/concurrency/thread_pool_poller.h> + +#include <yt/yt/core/http/client.h> +#include <yt/yt/core/http/config.h> +#include <yt/yt/core/http/http.h> + +#include <yt/yt/core/https/client.h> +#include <yt/yt/core/https/config.h> + +#include <library/cpp/yson/node/node_io.h> + +namespace NYT::NHttpClient { + +namespace { + +TString CreateHost(TStringBuf host, TStringBuf port) +{ + if (!port.empty()) { + return Format("%v:%v", host, port); + } + + return TString(host); +} + +TMaybe<TErrorResponse> GetErrorResponse(const TString& hostName, const TString& requestId, const NHttp::IResponsePtr& response) +{ + auto httpCode = response->GetStatusCode(); + if (httpCode == NHttp::EStatusCode::OK || httpCode == NHttp::EStatusCode::Accepted) { + return {}; + } + + TErrorResponse errorResponse(static_cast<int>(httpCode), requestId); + + auto logAndSetError = [&] (const TString& rawError) { + YT_LOG_ERROR("RSP %v - HTTP %v - %v", + requestId, + httpCode, + rawError.data()); + errorResponse.SetRawError(rawError); + }; + + switch (httpCode) { + case NHttp::EStatusCode::TooManyRequests: + logAndSetError("request rate limit exceeded"); + break; + + case NHttp::EStatusCode::InternalServerError: + logAndSetError("internal error in proxy " + hostName); + break; + + default: { + TStringStream httpHeaders; + httpHeaders << 
"HTTP headers ("; + for (const auto& [headerName, headerValue] : response->GetHeaders()->Dump()) { + httpHeaders << headerName << ": " << headerValue << "; "; + } + httpHeaders << ")"; + + auto errorString = Sprintf("RSP %s - HTTP %d - %s", + requestId.data(), + static_cast<int>(httpCode), + httpHeaders.Str().data()); + + YT_LOG_ERROR("%v", + errorString.data()); + + if (auto errorHeader = response->GetHeaders()->Find("X-YT-Error")) { + errorResponse.ParseFromJsonError(*errorHeader); + if (errorResponse.IsOk()) { + return Nothing(); + } + return errorResponse; + } + + errorResponse.SetRawError( + errorString + " - X-YT-Error is missing in headers"); + break; + } + } + + return errorResponse; +} + +void CheckErrorResponse(const TString& hostName, const TString& requestId, const NHttp::IResponsePtr& response) +{ + auto errorResponse = GetErrorResponse(hostName, requestId, response); + if (errorResponse) { + throw *errorResponse; + } +} + +} // namespace + +/////////////////////////////////////////////////////////////////////////////// + +class TDefaultHttpResponse + : public IHttpResponse +{ +public: + TDefaultHttpResponse(std::unique_ptr<THttpRequest> request) + : Request_(std::move(request)) + { } + + int GetStatusCode() override + { + return Request_->GetHttpCode(); + } + + IInputStream* GetResponseStream() override + { + return Request_->GetResponseStream(); + } + + TString GetResponse() override + { + return Request_->GetResponse(); + } + + TString GetRequestId() const override + { + return Request_->GetRequestId(); + } + +private: + std::unique_ptr<THttpRequest> Request_; +}; + +class TDefaultHttpRequest + : public IHttpRequest +{ +public: + TDefaultHttpRequest(std::unique_ptr<THttpRequest> request, IOutputStream* stream) + : Request_(std::move(request)) + , Stream_(stream) + { } + + IOutputStream* GetStream() override + { + return Stream_; + } + + IHttpResponsePtr Finish() override + { + Request_->FinishRequest(); + return 
std::make_unique<TDefaultHttpResponse>(std::move(Request_)); + } + +private: + std::unique_ptr<THttpRequest> Request_; + IOutputStream* Stream_; +}; + +class TDefaultHttpClient + : public IHttpClient +{ +public: + IHttpResponsePtr Request(const TString& url, const TString& requestId, const THttpConfig& config, const THttpHeader& header, TMaybe<TStringBuf> body) override + { + auto request = std::make_unique<THttpRequest>(requestId); + + auto urlRef = NHttp::ParseUrl(url); + + request->Connect(CreateHost(urlRef.Host, urlRef.PortStr), config.SocketTimeout); + request->SmallRequest(header, body); + return std::make_unique<TDefaultHttpResponse>(std::move(request)); + } + + IHttpRequestPtr StartRequest(const TString& url, const TString& requestId, const THttpConfig& config, const THttpHeader& header) override + { + auto request = std::make_unique<THttpRequest>(requestId); + + auto urlRef = NHttp::ParseUrl(url); + + request->Connect(CreateHost(urlRef.Host, urlRef.PortStr), config.SocketTimeout); + auto stream = request->StartRequest(header); + return std::make_unique<TDefaultHttpRequest>(std::move(request), stream); + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +struct TCoreRequestContext +{ + TString HostName; + TString Url; + TString RequestId; + bool LogResponse; + TInstant StartTime; + TString LoggedAttributes; +}; + +class TCoreHttpResponse + : public IHttpResponse +{ +public: + TCoreHttpResponse( + TCoreRequestContext context, + NHttp::IResponsePtr response) + : Context_(std::move(context)) + , Response_(std::move(response)) + { } + + int GetStatusCode() override + { + return static_cast<int>(Response_->GetStatusCode()); + } + + IInputStream* GetResponseStream() override + { + if (!Stream_) { + auto stream = std::make_unique<TWrappedStream>( + NConcurrency::CreateSyncAdapter(NConcurrency::CreateCopyingAdapter(Response_), NConcurrency::EWaitForStrategy::WaitFor), + Response_, + Context_.RequestId); + 
CheckErrorResponse(Context_.HostName, Context_.RequestId, Response_); + + if (TConfig::Get()->UseAbortableResponse) { + Y_VERIFY(!Context_.Url.empty()); + Stream_ = std::make_unique<TAbortableCoreHttpResponse>(std::move(stream), Context_.Url); + } else { + Stream_ = std::move(stream); + } + } + + return Stream_.get(); + } + + TString GetResponse() override + { + auto result = GetResponseStream()->ReadAll(); + + TStringStream loggedAttributes; + loggedAttributes + << "Time: " << TInstant::Now() - Context_.StartTime << "; " + << "HostName: " << Context_.HostName << "; " + << Context_.LoggedAttributes; + + if (Context_.LogResponse) { + constexpr auto sizeLimit = 1 << 7; + YT_LOG_DEBUG("RSP %v - received response (Response: '%v'; %v)", + Context_.RequestId, + TruncateForLogs(result, sizeLimit), + loggedAttributes.Str()); + } else { + YT_LOG_DEBUG("RSP %v - received response of %v bytes (%v)", + Context_.RequestId, + result.size(), + loggedAttributes.Str()); + } + return result; + } + + TString GetRequestId() const override + { + return Context_.RequestId; + } + +private: + class TWrappedStream + : public IInputStream + { + public: + TWrappedStream(std::unique_ptr<IInputStream> underlying, NHttp::IResponsePtr response, TString requestId) + : Underlying_(std::move(underlying)) + , Response_(std::move(response)) + , RequestId_(std::move(requestId)) + { } + + protected: + size_t DoRead(void* buf, size_t len) override + { + size_t read = Underlying_->Read(buf, len); + + if (read == 0 && len != 0) { + CheckTrailers(Response_->GetTrailers()); + } + return read; + } + + size_t DoSkip(size_t len) override + { + size_t skipped = Underlying_->Skip(len); + if (skipped == 0 && len != 0) { + CheckTrailers(Response_->GetTrailers()); + } + return skipped; + } + + private: + void CheckTrailers(const NHttp::THeadersPtr& trailers) + { + if (auto errorResponse = ParseError(trailers)) { + errorResponse->SetIsFromTrailers(true); + YT_LOG_ERROR("RSP %v - %v", + RequestId_, + 
errorResponse.GetRef().what()); + ythrow errorResponse.GetRef(); + } + } + + TMaybe<TErrorResponse> ParseError(const NHttp::THeadersPtr& headers) + { + if (auto errorHeader = headers->Find("X-YT-Error")) { + TErrorResponse errorResponse(static_cast<int>(Response_->GetStatusCode()), RequestId_); + errorResponse.ParseFromJsonError(*errorHeader); + if (errorResponse.IsOk()) { + return Nothing(); + } + return errorResponse; + } + return Nothing(); + } + + private: + std::unique_ptr<IInputStream> Underlying_; + NHttp::IResponsePtr Response_; + TString RequestId_; + }; + +private: + TCoreRequestContext Context_; + NHttp::IResponsePtr Response_; + std::unique_ptr<IInputStream> Stream_; +}; + +class TCoreHttpRequest + : public IHttpRequest +{ +public: + TCoreHttpRequest(TCoreRequestContext context, NHttp::IActiveRequestPtr activeRequest) + : Context_(std::move(context)) + , ActiveRequest_(std::move(activeRequest)) + , Stream_(NConcurrency::CreateBufferedSyncAdapter(ActiveRequest_->GetRequestStream())) + , WrappedStream_(this, Stream_.get()) + { } + + IOutputStream* GetStream() override + { + return &WrappedStream_; + } + + IHttpResponsePtr Finish() override + { + WrappedStream_.Flush(); + auto response = ActiveRequest_->Finish().Get().ValueOrThrow(); + return std::make_unique<TCoreHttpResponse>(std::move(Context_), std::move(response)); + } + + IHttpResponsePtr FinishWithError() + { + auto response = ActiveRequest_->GetResponse(); + return std::make_unique<TCoreHttpResponse>(std::move(Context_), std::move(response)); + } + +private: + class TWrappedStream + : public IOutputStream + { + public: + TWrappedStream(TCoreHttpRequest* httpRequest, IOutputStream* underlying) + : HttpRequest_(httpRequest) + , Underlying_(underlying) + { } + + private: + void DoWrite(const void* buf, size_t len) override + { + WrapWriteFunc([&] { + Underlying_->Write(buf, len); + }); + } + + void DoWriteV(const TPart* parts, size_t count) override + { + WrapWriteFunc([&] { + 
Underlying_->Write(parts, count); + }); + } + + void DoWriteC(char ch) override + { + WrapWriteFunc([&] { + Underlying_->Write(ch); + }); + } + + void DoFlush() override + { + WrapWriteFunc([&] { + Underlying_->Flush(); + }); + } + + void DoFinish() override + { + WrapWriteFunc([&] { + Underlying_->Finish(); + }); + } + + void WrapWriteFunc(std::function<void()> func) + { + CheckErrorState(); + try { + func(); + } catch (const std::exception&) { + HandleWriteException(); + } + } + + // In many cases the HTTP proxy stops reading the request and resets the connection + // if an error has happened. This function tries to read the error response + // in such cases. + // The write exception is saved in WriteError_ and rethrown at the end; + // a TErrorResponse thrown while reading the response propagates instead, + // and any other exception from that read is ignored. + void HandleWriteException() { + Y_VERIFY(WriteError_ == nullptr); + WriteError_ = std::current_exception(); + Y_VERIFY(WriteError_ != nullptr); + try { + HttpRequest_->FinishWithError()->GetResponseStream(); + } catch (const TErrorResponse &) { + throw; + } catch (...) { + } + std::rethrow_exception(WriteError_); + } + + void CheckErrorState() + { + if (WriteError_) { + std::rethrow_exception(WriteError_); + } + } + + private: + TCoreHttpRequest* const HttpRequest_; + IOutputStream* Underlying_; + std::exception_ptr WriteError_; + }; + +private: + TCoreRequestContext Context_; + NHttp::IActiveRequestPtr ActiveRequest_; + std::unique_ptr<IOutputStream> Stream_; + TWrappedStream WrappedStream_; +}; + +class TCoreHttpClient + : public IHttpClient +{ +public: + TCoreHttpClient(bool useTLS, const TConfigPtr& config) + : Poller_(NConcurrency::CreateThreadPoolPoller(1, "http_poller")) // TODO(nadya73): YT-18363: move threads count to config + { + if (useTLS) { + auto httpsConfig = NYT::New<NYT::NHttps::TClientConfig>(); + httpsConfig->MaxIdleConnections = config->ConnectionPoolSize; + Client_ = NHttps::CreateClient(httpsConfig, Poller_); + } else { + auto httpConfig = NYT::New<NYT::NHttp::TClientConfig>(); + httpConfig->MaxIdleConnections = config->ConnectionPoolSize; + Client_ = NHttp::CreateClient(httpConfig, Poller_); + } + } + + 
IHttpResponsePtr Request(const TString& url, const TString& requestId, const THttpConfig& /*config*/, const THttpHeader& header, TMaybe<TStringBuf> body) override + { + TCoreRequestContext context = CreateContext(url, requestId, header); + + // TODO(nadya73): YT-18363: pass socket timeouts from THttpConfig + + NHttp::IResponsePtr response; + + auto logRequest = [&](bool includeParameters) { + LogRequest(header, url, includeParameters, requestId, context.HostName); + context.LoggedAttributes = GetLoggedAttributes(header, url, includeParameters, 128); + }; + + if (!body && (header.GetMethod() == "PUT" || header.GetMethod() == "POST")) { + const auto& parameters = header.GetParameters(); + auto parametersStr = NodeToYsonString(parameters); + + bool includeParameters = false; + auto headers = header.GetHeader(context.HostName, requestId, includeParameters).Get(); + + logRequest(includeParameters); + + auto activeRequest = StartRequestImpl(header.GetMethod(), url, headers); + + activeRequest->GetRequestStream()->Write(TSharedRef::FromString(parametersStr)).Get().ThrowOnError(); + response = activeRequest->Finish().Get().ValueOrThrow(); + } else { + auto bodyRef = TSharedRef::FromString(TString(body ? 
*body : "")); + bool includeParameters = true; + auto headers = header.GetHeader(context.HostName, requestId, includeParameters).Get(); + + logRequest(includeParameters); + + if (header.GetMethod() == "GET") { + response = RequestImpl(header.GetMethod(), url, headers, bodyRef); + } else { + auto activeRequest = StartRequestImpl(header.GetMethod(), url, headers); + + auto request = std::make_unique<TCoreHttpRequest>(std::move(context), std::move(activeRequest)); + if (body) { + request->GetStream()->Write(*body); + } + return request->Finish(); + } + } + + return std::make_unique<TCoreHttpResponse>(std::move(context), std::move(response)); + } + + IHttpRequestPtr StartRequest(const TString& url, const TString& requestId, const THttpConfig& /*config*/, const THttpHeader& header) override + { + TCoreRequestContext context = CreateContext(url, requestId, header); + + LogRequest(header, url, true, requestId, context.HostName); + context.LoggedAttributes = GetLoggedAttributes(header, url, true, 128); + + auto headers = header.GetHeader(context.HostName, requestId, true).Get(); + auto activeRequest = StartRequestImpl(header.GetMethod(), url, headers); + + return std::make_unique<TCoreHttpRequest>(std::move(context), std::move(activeRequest)); + } + +private: + TCoreRequestContext CreateContext(const TString& url, const TString& requestId, const THttpHeader& header) + { + TCoreRequestContext context; + context.Url = url; + context.RequestId = requestId; + + auto urlRef = NHttp::ParseUrl(url); + context.HostName = CreateHost(urlRef.Host, urlRef.PortStr); + + context.LogResponse = false; + auto outputFormat = header.GetOutputFormat(); + if (outputFormat && outputFormat->IsTextYson()) { + context.LogResponse = true; + } + context.StartTime = TInstant::Now(); + return context; + } + + NHttp::IResponsePtr RequestImpl(const TString& method, const TString& url, const NHttp::THeadersPtr& headers, const TSharedRef& body) + { + if (method == "GET") { + return Client_->Get(url, 
headers).Get().ValueOrThrow(); + } else if (method == "POST") { + return Client_->Post(url, body, headers).Get().ValueOrThrow(); + } else if (method == "PUT") { + return Client_->Put(url, body, headers).Get().ValueOrThrow(); + } else { + YT_LOG_FATAL("Unsupported http method (Method: %v, Url: %v)", + method, + url); + } + } + + NHttp::IActiveRequestPtr StartRequestImpl(const TString& method, const TString& url, const NHttp::THeadersPtr& headers) + { + if (method == "POST") { + return Client_->StartPost(url, headers).Get().ValueOrThrow(); + } else if (method == "PUT") { + return Client_->StartPut(url, headers).Get().ValueOrThrow(); + } else { + YT_LOG_FATAL("Unsupported http method (Method: %v, Url: %v)", + method, + url); + } + } + + NConcurrency::IThreadPoolPollerPtr Poller_; + NHttp::IClientPtr Client_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +IHttpClientPtr CreateDefaultHttpClient() +{ + return std::make_shared<TDefaultHttpClient>(); +} + +IHttpClientPtr CreateCoreHttpClient(bool useTLS, const TConfigPtr& config) +{ + return std::make_shared<TCoreHttpClient>(useTLS, config); +} + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NHttpClient diff --git a/yt/cpp/mapreduce/http/http_client.h b/yt/cpp/mapreduce/http/http_client.h new file mode 100644 index 0000000000..859f0423cb --- /dev/null +++ b/yt/cpp/mapreduce/http/http_client.h @@ -0,0 +1,76 @@ +#pragma once + +#include "fwd.h" + +#include <yt/cpp/mapreduce/interface/fwd.h> + +#include <util/datetime/base.h> + +#include <util/generic/maybe.h> +#include <util/generic/string.h> + +#include <util/stream/fwd.h> + +#include <memory> + +namespace NYT::NHttpClient { + +/////////////////////////////////////////////////////////////////////////////// + +struct THttpConfig +{ + TDuration SocketTimeout = TDuration::Zero(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class 
IHttpResponse +{ +public: + virtual ~IHttpResponse() = default; + + virtual int GetStatusCode() = 0; + virtual IInputStream* GetResponseStream() = 0; + virtual TString GetResponse() = 0; + virtual TString GetRequestId() const = 0; +}; + +class IHttpRequest +{ +public: + virtual ~IHttpRequest() = default; + + virtual IOutputStream* GetStream() = 0; + virtual IHttpResponsePtr Finish() = 0; +}; + + +class IHttpClient +{ +public: + virtual ~IHttpClient() = default; + + virtual IHttpResponsePtr Request(const TString& url, const TString& requestId, const THttpConfig& config, const THttpHeader& header, TMaybe<TStringBuf> body = {}) = 0; + + virtual IHttpResponsePtr Request(const TString& url, const TString& requestId, const THttpHeader& header, TMaybe<TStringBuf> body = {}) + { + return Request(url, requestId, /*config*/ {}, header, body); + } + + virtual IHttpRequestPtr StartRequest(const TString& url, const TString& requestId, const THttpConfig& config, const THttpHeader& header) = 0; + + virtual IHttpRequestPtr StartRequest(const TString& url, const TString& requestId, const THttpHeader& header) + { + return StartRequest(url, requestId, /*config*/ {}, header); + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +IHttpClientPtr CreateDefaultHttpClient(); + +IHttpClientPtr CreateCoreHttpClient(bool useTLS, const TConfigPtr& config); + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NHttpClient diff --git a/yt/cpp/mapreduce/http/requests.cpp b/yt/cpp/mapreduce/http/requests.cpp new file mode 100644 index 0000000000..7cf0f673bb --- /dev/null +++ b/yt/cpp/mapreduce/http/requests.cpp @@ -0,0 +1,66 @@ +#include "requests.h" + +#include "context.h" +#include "host_manager.h" +#include "retry_request.h" + +#include <yt/cpp/mapreduce/client/transaction.h> + +#include <yt/cpp/mapreduce/common/helpers.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> +#include 
<yt/cpp/mapreduce/common/node_builder.h> +#include <yt/cpp/mapreduce/common/wait_proxy.h> + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/errors.h> +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> +#include <yt/cpp/mapreduce/interface/serialize.h> + +#include <util/stream/file.h> +#include <util/string/builder.h> +#include <util/generic/buffer.h> + + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +bool ParseBoolFromResponse(const TString& response) +{ + return GetBool(NodeFromYsonString(response)); +} + +TGUID ParseGuidFromResponse(const TString& response) +{ + auto node = NodeFromYsonString(response); + return GetGuid(node.AsString()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TString GetProxyForHeavyRequest(const TClientContext& context) +{ + if (!context.Config->UseHosts) { + return context.ServerName; + } + + return NPrivate::THostManager::Get().GetProxyForHeavyRequest(context); +} + +void LogRequestError( + const TString& requestId, + const THttpHeader& header, + const TString& message, + const TString& attemptDescription) +{ + YT_LOG_ERROR("RSP %v - %v - %v - %v - X-YT-Parameters: %v", + requestId, + header.GetUrl(), + message, + attemptDescription, + NodeToYsonString(header.GetParameters())); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/requests.h b/yt/cpp/mapreduce/http/requests.h new file mode 100644 index 0000000000..2c692475d1 --- /dev/null +++ b/yt/cpp/mapreduce/http/requests.h @@ -0,0 +1,29 @@ +#pragma once + +#include "fwd.h" +#include "http.h" + +#include <util/generic/maybe.h> +#include <util/str_stl.h> + +namespace NYT { + +/////////////////////////////////////////////////////////////////////////////// + +bool ParseBoolFromResponse(const TString& response); + +TGUID ParseGuidFromResponse(const TString& 
response); + +//////////////////////////////////////////////////////////////////////////////// + +TString GetProxyForHeavyRequest(const TClientContext& context); + +void LogRequestError( + const TString& requestId, + const THttpHeader& header, + const TString& message, + const TString& attemptDescription); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/retry_request.cpp b/yt/cpp/mapreduce/http/retry_request.cpp new file mode 100644 index 0000000000..ba116edcf7 --- /dev/null +++ b/yt/cpp/mapreduce/http/retry_request.cpp @@ -0,0 +1,149 @@ +#include "retry_request.h" + +#include "context.h" +#include "helpers.h" +#include "http_client.h" +#include "requests.h" + +#include <yt/cpp/mapreduce/common/wait_proxy.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/tvm.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <library/cpp/yson/node/node_io.h> + +namespace NYT { +namespace NDetail { + +/////////////////////////////////////////////////////////////////////////////// + +static TResponseInfo Request( + const TClientContext& context, + THttpHeader& header, + TMaybe<TStringBuf> body, + const TString& requestId, + const TRequestConfig& config) +{ + TString hostName; + if (config.IsHeavy) { + hostName = GetProxyForHeavyRequest(context); + } else { + hostName = context.ServerName; + } + + auto url = GetFullUrl(hostName, context, header); + + auto response = context.HttpClient->Request(url, requestId, config.HttpConfig, header, body); + + TResponseInfo result; + result.RequestId = requestId; + result.Response = response->GetResponse(); + result.HttpCode = response->GetStatusCode(); + return result; +} + +TResponseInfo RequestWithoutRetry( + const TClientContext& context, + THttpHeader& header, + TMaybe<TStringBuf> body, + const TRequestConfig& config) +{ + if 
(context.ServiceTicketAuth) { + header.SetServiceTicket(context.ServiceTicketAuth->Ptr->IssueServiceTicket()); + } else { + header.SetToken(context.Token); + } + + if (context.ImpersonationUser) { + header.SetImpersonationUser(*context.ImpersonationUser); + } + + if (header.HasMutationId()) { + header.RemoveParameter("retry"); + header.AddMutationId(); + } + auto requestId = CreateGuidAsString(); + return Request(context, header, body, requestId, config); +} + + +TResponseInfo RetryRequestWithPolicy( + IRequestRetryPolicyPtr retryPolicy, + const TClientContext& context, + THttpHeader& header, + TMaybe<TStringBuf> body, + const TRequestConfig& config) +{ + if (context.ServiceTicketAuth) { + header.SetServiceTicket(context.ServiceTicketAuth->Ptr->IssueServiceTicket()); + } else { + header.SetToken(context.Token); + } + + if (context.ImpersonationUser) { + header.SetImpersonationUser(*context.ImpersonationUser); + } + + bool useMutationId = header.HasMutationId(); + bool retryWithSameMutationId = false; + + if (!retryPolicy) { + retryPolicy = CreateDefaultRequestRetryPolicy(context.Config); + } + + while (true) { + auto requestId = CreateGuidAsString(); + try { + retryPolicy->NotifyNewAttempt(); + + if (useMutationId) { + if (retryWithSameMutationId) { + header.AddParameter("retry", true, /* overwrite = */ true); + } else { + header.RemoveParameter("retry"); + header.AddMutationId(); + } + } + + return Request(context, header, body, requestId, config); + } catch (const TErrorResponse& e) { + LogRequestError(requestId, header, e.GetError().GetMessage(), retryPolicy->GetAttemptDescription()); + retryWithSameMutationId = e.IsTransportError(); + + if (!IsRetriable(e)) { + throw; + } + + auto maybeRetryTimeout = retryPolicy->OnRetriableError(e); + if (maybeRetryTimeout) { + TWaitProxy::Get()->Sleep(*maybeRetryTimeout); + } else { + throw; + } + } catch (const std::exception& e) { + LogRequestError(requestId, header, e.what(), retryPolicy->GetAttemptDescription()); + 
retryWithSameMutationId = true; + + if (!IsRetriable(e)) { + throw; + } + + auto maybeRetryTimeout = retryPolicy->OnGenericError(e); + if (maybeRetryTimeout) { + TWaitProxy::Get()->Sleep(*maybeRetryTimeout); + } else { + throw; + } + } + } + + Y_FAIL("Retries must have either succeeded or thrown an exception"); +} + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/http/retry_request.h b/yt/cpp/mapreduce/http/retry_request.h new file mode 100644 index 0000000000..2210e318f1 --- /dev/null +++ b/yt/cpp/mapreduce/http/retry_request.h @@ -0,0 +1,52 @@ +#pragma once + +#include "fwd.h" + +#include <yt/cpp/mapreduce/interface/fwd.h> +#include <yt/cpp/mapreduce/common/fwd.h> + +#include <yt/cpp/mapreduce/http/http_client.h> + +#include <util/datetime/base.h> +#include <util/generic/maybe.h> +#include <util/generic/string.h> + +namespace NYT::NDetail { + +//////////////////////////////////////////////////////////////////// + +struct TResponseInfo +{ + TString RequestId; + TString Response; + int HttpCode = 0; +}; + +//////////////////////////////////////////////////////////////////// + +struct TRequestConfig +{ + NHttpClient::THttpConfig HttpConfig; + bool IsHeavy = false; +}; + +//////////////////////////////////////////////////////////////////// + +// Retry request with given `header' and `body' using `retryPolicy'. +// If `retryPolicy == nullptr' use default, currently `TAttemptLimitedRetryPolicy(TConfig::Get()->RetryCount)`. 
+TResponseInfo RetryRequestWithPolicy( + IRequestRetryPolicyPtr retryPolicy, + const TClientContext& context, + THttpHeader& header, + TMaybe<TStringBuf> body = {}, + const TRequestConfig& config = TRequestConfig()); + +TResponseInfo RequestWithoutRetry( + const TClientContext& context, + THttpHeader& header, + TMaybe<TStringBuf> body = {}, + const TRequestConfig& config = TRequestConfig()); + +//////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail diff --git a/yt/cpp/mapreduce/http/ya.make b/yt/cpp/mapreduce/http/ya.make new file mode 100644 index 0000000000..ef81a4b64a --- /dev/null +++ b/yt/cpp/mapreduce/http/ya.make @@ -0,0 +1,29 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + abortable_http_response.cpp + context.cpp + helpers.cpp + host_manager.cpp + http.cpp + http_client.cpp + requests.cpp + retry_request.cpp +) + +PEERDIR( + library/cpp/deprecated/atomic + library/cpp/http/io + library/cpp/string_utils/base64 + library/cpp/string_utils/quote + library/cpp/threading/cron + yt/cpp/mapreduce/common + yt/cpp/mapreduce/interface + yt/cpp/mapreduce/interface/logging + yt/yt/core/http + yt/yt/core/https +) + +END() diff --git a/yt/cpp/mapreduce/interface/batch_request.cpp b/yt/cpp/mapreduce/interface/batch_request.cpp new file mode 100644 index 0000000000..fefdacb61a --- /dev/null +++ b/yt/cpp/mapreduce/interface/batch_request.cpp @@ -0,0 +1,15 @@ +#include "batch_request.h" +#include "client.h" + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +IBatchRequestBase& IBatchRequest::WithTransaction(const ITransactionPtr& transaction) +{ + return WithTransaction(transaction->GetId()); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/batch_request.h b/yt/cpp/mapreduce/interface/batch_request.h new file mode 100644 index 0000000000..3ea28f76fd --- 
/dev/null +++ b/yt/cpp/mapreduce/interface/batch_request.h @@ -0,0 +1,222 @@ +#pragma once + +#include "fwd.h" + +#include "client_method_options.h" + +#include <library/cpp/threading/future/future.h> +#include <util/generic/ptr.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////// + +/// Helper base of @ref NYT::IBatchRequest holding most of useful methods. +class IBatchRequestBase + : public TThrRefBase +{ +public: + virtual ~IBatchRequestBase() = default; + + /// + /// @brief Create cypress node. + /// + /// @see NYT::ICypressClient::Create + virtual ::NThreading::TFuture<TNodeId> Create( + const TYPath& path, + ENodeType type, + const TCreateOptions& options = TCreateOptions()) = 0; + + /// + /// @brief Remove cypress node. + /// + /// @see NYT::ICypressClient::Remove + virtual ::NThreading::TFuture<void> Remove( + const TYPath& path, + const TRemoveOptions& options = TRemoveOptions()) = 0; + + /// + /// @brief Check whether cypress node exists. + /// + /// @see NYT::ICypressClient::Exists + virtual ::NThreading::TFuture<bool> Exists( + const TYPath& path, + const TExistsOptions& options = TExistsOptions()) = 0; + + /// + /// @brief Get cypress node. + /// + /// @see NYT::ICypressClient::Get + virtual ::NThreading::TFuture<TNode> Get( + const TYPath& path, + const TGetOptions& options = TGetOptions()) = 0; + + /// + /// @brief Set cypress node. + /// + /// @see NYT::ICypressClient::Set + virtual ::NThreading::TFuture<void> Set( + const TYPath& path, + const TNode& node, + const TSetOptions& options = TSetOptions()) = 0; + + /// + /// @brief List cypress directory. + /// + /// @see NYT::ICypressClient::List + virtual ::NThreading::TFuture<TNode::TListType> List( + const TYPath& path, + const TListOptions& options = TListOptions()) = 0; + + /// + /// @brief Copy cypress node.
+ /// + /// @see NYT::ICypressClient::Copy + virtual ::NThreading::TFuture<TNodeId> Copy( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options = TCopyOptions()) = 0; + + /// + /// @brief Move cypress node. + /// + /// @see NYT::ICypressClient::Move + virtual ::NThreading::TFuture<TNodeId> Move( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options = TMoveOptions()) = 0; + + /// + /// @brief Create symbolic link. + /// + /// @see NYT::ICypressClient::Link. + virtual ::NThreading::TFuture<TNodeId> Link( + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options = TLinkOptions()) = 0; + + /// + /// @brief Lock cypress node. + /// + /// @see NYT::ICypressClient::Lock + virtual ::NThreading::TFuture<ILockPtr> Lock( + const TYPath& path, + ELockMode mode, + const TLockOptions& options = TLockOptions()) = 0; + + /// + /// @brief Unlock cypress node. + /// + /// @see NYT::ICypressClient::Unlock + virtual ::NThreading::TFuture<void> Unlock( + const TYPath& path, + const TUnlockOptions& options = TUnlockOptions()) = 0; + + /// + /// @brief Abort operation. + /// + /// @see NYT::IClient::AbortOperation + virtual ::NThreading::TFuture<void> AbortOperation(const TOperationId& operationId) = 0; + + /// + /// @brief Force complete operation. + /// + /// @see NYT::IClient::CompleteOperation + virtual ::NThreading::TFuture<void> CompleteOperation(const TOperationId& operationId) = 0; + + /// + /// @brief Suspend operation. + /// + /// @see NYT::IClient::SuspendOperation + virtual ::NThreading::TFuture<void> SuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options = TSuspendOperationOptions()) = 0; + + /// + /// @brief Resume operation. 
+ /// + /// @see NYT::IClient::ResumeOperation + virtual ::NThreading::TFuture<void> ResumeOperation( + const TOperationId& operationId, + const TResumeOperationOptions& options = TResumeOperationOptions()) = 0; + + /// + /// @brief Update parameters of running operation. + /// + /// @see NYT::IClient::UpdateOperationParameters + virtual ::NThreading::TFuture<void> UpdateOperationParameters( + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options = TUpdateOperationParametersOptions()) = 0; + + /// + /// @brief Canonize cypress path + /// + /// @see NYT::ICypressClient::CanonizeYPath + virtual ::NThreading::TFuture<TRichYPath> CanonizeYPath(const TRichYPath& path) = 0; + + /// + /// @brief Get table columnar statistics + /// + /// @see NYT::ICypressClient::GetTableColumnarStatistics + virtual ::NThreading::TFuture<TVector<TTableColumnarStatistics>> GetTableColumnarStatistics( + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options = {}) = 0; + + /// + /// @brief Check permission for given path. + /// + /// @see NYT::IClient::CheckPermission + virtual ::NThreading::TFuture<TCheckPermissionResponse> CheckPermission( + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options = TCheckPermissionOptions()) = 0; +}; + +/// +/// @brief Batch request object. +/// +/// Allows to send multiple lightweight requests at once significantly +/// reducing time of their execution. +/// +/// Methods of this class accept same arguments as @ref NYT::IClient methods but +/// return TFuture that is set after execution of @ref NYT::IBatchRequest::ExecuteBatch +/// +/// @see [Example of usage](https://a.yandex-team.ru/arc/trunk/arcadia/yt/cpp/mapreduce/examples/tutorial/batch_request/main.cpp) +class IBatchRequest + : public IBatchRequestBase +{ +public: + /// + /// @brief Temporarily override current transaction.
+ /// + /// Using WithTransaction user can temporarily override the default transaction. + /// Example of usage: + /// TBatchRequest batchRequest; + /// auto noTxResult = batchRequest.Get("//some/path"); + /// auto txResult = batchRequest.WithTransaction(tx).Get("//some/path"); + virtual IBatchRequestBase& WithTransaction(const TTransactionId& transactionId) = 0; + IBatchRequestBase& WithTransaction(const ITransactionPtr& transaction); + + /// + /// @brief Executes all subrequests of batch request. + /// + /// After execution of this method all TFuture objects returned by subrequests will + /// be filled with either result or error. + /// + /// @note It is undefined in which order these requests are executed. + /// + /// @note This method doesn't throw if subrequest emits error. + /// Instead corresponding future is set with exception. + /// So it is always important to check TFuture status. + /// + /// Single TBatchRequest instance may be executed only once + /// and cannot be modified (filled with additional requests) after execution. + /// Exception is thrown on attempt to modify executed batch request + /// or execute it again.
+ virtual void ExecuteBatch(const TExecuteBatchOptions& options = TExecuteBatchOptions()) = 0; +}; + +//////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/client.cpp b/yt/cpp/mapreduce/interface/client.cpp new file mode 100644 index 0000000000..11d308b809 --- /dev/null +++ b/yt/cpp/mapreduce/interface/client.cpp @@ -0,0 +1,19 @@ +#include "client.h" + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +void ILock::Wait(TDuration timeout) +{ + return GetAcquiredFuture().GetValue(timeout); +} + +void ITransaction::Detach() +{ + Y_FAIL("ITransaction::Detach() is not implemented"); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/client.h b/yt/cpp/mapreduce/interface/client.h new file mode 100644 index 0000000000..54f37c3ae0 --- /dev/null +++ b/yt/cpp/mapreduce/interface/client.h @@ -0,0 +1,568 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/client.h +/// +/// Main header of the C++ YT Wrapper. + +/// +/// @mainpage C++ library for working with YT +/// +/// This library provides possibilities to work with YT as a [MapReduce](https://en.wikipedia.org/wiki/MapReduce) system. It allows: +/// - to read/write tables and files +/// - to run operations +/// - to work with transactions. +/// +/// This library provides only basic functions for working with dynamic tables. +/// To access full powers of YT dynamic tables one should use +/// [yt/client](https://a.yandex-team.ru/arc/trunk/arcadia/yt/19_4/yt/client) library. 
+/// +/// Entry points to this library: +/// - @ref NYT::Initialize() initialization function for this library; +/// - @ref NYT::IClient main interface to work with YT cluster; +/// - @ref NYT::CreateClient() function that creates client for particular cluster; +/// - @ref NYT::IOperationClient ancestor of @ref NYT::IClient containing the set of methods to run operations. +/// +/// Tutorial on how to use this library can be found [here](https://yt.yandex-team.ru/docs/api/c++/examples). + +#include "fwd.h" + +#include "client_method_options.h" +#include "constants.h" +#include "batch_request.h" +#include "cypress.h" +#include "init.h" +#include "io.h" +#include "node.h" +#include "operation.h" + +#include <library/cpp/threading/future/future.h> + +#include <util/datetime/base.h> +#include <util/generic/maybe.h> +#include <util/system/compiler.h> + +/// Main namespace of YT client +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// OAuth info (returned by @ref NYT::IClient::WhoAmI). +struct TAuthorizationInfo +{ + /// User's login. + TString Login; + + /// Realm. + TString Realm; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Part of @ref NYT::TCheckPermissionResponse. +/// +/// In case when 'Action == ESecurityAction::Deny' because of a 'deny' rule, +/// the "denying" object name and id and "denied" subject name and id may be returned. +struct TCheckPermissionResult +{ + /// Was the access granted or not. + ESecurityAction Action; + + /// Id of the object whose ACL's "deny" rule forbids the access. + TMaybe<TGUID> ObjectId; + + /// + /// @brief Name of the object whose ACL's "deny" rule forbids the access. + /// + /// Example is "node //tmp/x/y". + TMaybe<TString> ObjectName; + + /// Id of the subject for whom the access was denied by a "deny" rule. + TMaybe<TGUID> SubjectId; + + /// Name of the subject for whom the access was denied by a "deny" rule.
+ TMaybe<TString> SubjectName; +}; + +/// @brief Result of @ref NYT::IClient::CheckPermission command. +/// +/// The base part of the response corresponds to the check result for the node itself. +/// `Columns` vector contains check results for the columns (in the same order as in the request). +struct TCheckPermissionResponse + : public TCheckPermissionResult +{ + /// @brief Results for the table columns access permissions. + /// + /// @see [Columnar ACL doc](https://yt.yandex-team.ru/docs/description/common/columnar_acl) + TVector<TCheckPermissionResult> Columns; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Interface representing a lock obtained from @ref NYT::ITransaction::Lock. +/// +/// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#start-tx) +class ILock + : public TThrRefBase +{ +public: + virtual ~ILock() = default; + + /// Get cypress node id of lock itself. + virtual const TLockId& GetId() const = 0; + + /// Get cypress node id of locked object. + virtual TNodeId GetLockedNodeId() const = 0; + + /// + /// @brief Get future that will be set once lock is in "acquired" state. + /// + /// Note that future might contain exception if some error occurred + /// e.g. lock transaction was aborted. + virtual const ::NThreading::TFuture<void>& GetAcquiredFuture() const = 0; + + /// + /// @brief Wait until lock is in "acquired" state. + /// + /// Throws exception if timeout exceeded or some error occurred + /// e.g. lock transaction was aborted. + void Wait(TDuration timeout = TDuration::Max()); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Base class for @ref NYT::IClient and @ref NYT::ITransaction. +/// +/// This class contains transactional commands. 
+class IClientBase + : public TThrRefBase + , public ICypressClient + , public IIOClient + , public IOperationClient +{ +public: + /// + /// @brief Start a [transaction](https://yt.yandex-team.ru/docs/description/storage/transactions.html#master_transactions). + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#start-tx) + [[nodiscard]] virtual ITransactionPtr StartTransaction( + const TStartTransactionOptions& options = TStartTransactionOptions()) = 0; + + /// + /// @brief Change properties of table. + /// + /// Allows to: + /// - switch table between dynamic/static mode + /// - or change table schema + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#alter-table) + virtual void AlterTable( + const TYPath& path, + const TAlterTableOptions& options = TAlterTableOptions()) = 0; + + /// + /// @brief Create batch request object that allows to execute several light requests in parallel. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#execute-batch) + virtual TBatchRequestPtr CreateBatchRequest() = 0; + + /// @brief Get root client outside of all transactions. + virtual IClientPtr GetParentClient() = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + + +/// @brief Interface representing a master transaction. +/// +/// @see [YT doc](https://yt.yandex-team.ru/docs/description/storage/transactions.html#master_transactions) +class ITransaction + : virtual public IClientBase +{ +public: + /// Get id of transaction. + virtual const TTransactionId& GetId() const = 0; + + /// + /// @brief Try to lock given path. + /// + /// Lock will be held until transaction is committed/aborted or @ref NYT::ITransaction::Unlock method is called. + /// Lock modes: + /// - `LM_EXCLUSIVE`: if exclusive lock is taken no other transaction can take exclusive or shared lock. + /// - `LM_SHARED`: if shared lock is taken other transactions can take shared lock but not exclusive.
+ /// - `LM_SNAPSHOT`: snapshot lock always succeeds, when snapshot lock is taken current transaction snapshots object. + /// It will not see changes that occurred to it in other transactions. + /// + /// Exclusive/shared lock can be waitable or not. + /// If nonwaitable lock cannot be taken exception is thrown. + /// If waitable lock cannot be taken it is created in pending state and client can wait until it is actually taken. + /// Check @ref NYT::TLockOptions::Waitable and @ref NYT::ILock::GetAcquiredFuture for more details. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#lock) + virtual ILockPtr Lock( + const TYPath& path, + ELockMode mode, + const TLockOptions& options = TLockOptions()) = 0; + + /// + /// @brief Remove all the locks (including pending ones) for this transaction from a Cypress node at `path`. + /// + /// If the locked version of the node differs from the original one, + /// an error will be thrown. + /// + /// Command is successful even if the node has no locks. + /// Only explicit (created by @ref NYT::ITransaction::Lock) locks are removed. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#unlock) + virtual void Unlock( + const TYPath& path, + const TUnlockOptions& options = TUnlockOptions()) = 0; + + /// + /// @brief Commit transaction. + /// + /// All changes that are made by transactions become visible globally or to parent transaction. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#commit) + virtual void Commit() = 0; + + /// + /// @brief Abort transaction. + /// + /// All changes made by current transaction are lost. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#abort) + virtual void Abort() = 0; + + /// @brief Explicitly ping transaction. + /// + /// User usually does not need this method (as transactions are pinged automatically, + /// see @ref NYT::TStartTransactionOptions::AutoPingable).
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#ping) + virtual void Ping() = 0; + + /// + /// @brief Detach transaction. + /// + /// Stop any activities connected with it: pinging, aborting on crashes etc. + /// Forget about the transaction totally. + virtual void Detach(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Interface containing non-transactional commands. +class IClient + : virtual public IClientBase +{ +public: + /// + /// @brief Attach to existing master transaction. + /// + /// Returned object WILL NOT: + /// - ping transaction automatically (unless @ref NYT::TAttachTransactionOptions::AutoPing is set) + /// - abort it on program termination (unless @ref NYT::TAttachTransactionOptions::AbortOnTermination is set). + /// Otherwise returned object is similar to the object returned by @ref NYT::IClientBase::StartTransaction. + /// and it can see all the changes made inside the transaction. + [[nodiscard]] virtual ITransactionPtr AttachTransaction( + const TTransactionId& transactionId, + const TAttachTransactionOptions& options = TAttachTransactionOptions()) = 0; + + /// + /// @brief Mount dynamic table. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#mount-table) + virtual void MountTable( + const TYPath& path, + const TMountTableOptions& options = TMountTableOptions()) = 0; + + /// + /// @brief Unmount dynamic table. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#unmount-table) + virtual void UnmountTable( + const TYPath& path, + const TUnmountTableOptions& options = TUnmountTableOptions()) = 0; + + /// + /// @brief Remount dynamic table. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#remount-table) + virtual void RemountTable( + const TYPath& path, + const TRemountTableOptions& options = TRemountTableOptions()) = 0; + + /// + /// @brief Switch dynamic table from `mounted' into `frozen' state. 
+ /// + /// When table is in frozen state all its data is flushed to disk and writes are disabled. + /// + /// @note this function launches the process of switching, but doesn't wait until switching is accomplished. + /// Waiting has to be performed by user. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#freeze-table) + virtual void FreezeTable( + const TYPath& path, + const TFreezeTableOptions& options = TFreezeTableOptions()) = 0; + + /// + /// @brief Switch dynamic table from `frozen` into `mounted` state. + /// + /// @note this function launches the process of switching, but doesn't wait until switching is accomplished. + /// Waiting has to be performed by user. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#unfreeze-table) + virtual void UnfreezeTable( + const TYPath& path, + const TUnfreezeTableOptions& options = TUnfreezeTableOptions()) = 0; + + /// + /// @brief Reshard dynamic table (break it into tablets) by given pivot keys. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#reshard-table) + virtual void ReshardTable( + const TYPath& path, + const TVector<TKey>& pivotKeys, + const TReshardTableOptions& options = TReshardTableOptions()) = 0; + + /// + /// @brief Reshard dynamic table, breaking it into given number of tablets. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#reshard-table) + virtual void ReshardTable( + const TYPath& path, + i64 tabletCount, + const TReshardTableOptions& options = TReshardTableOptions()) = 0; + + /// + /// @brief Insert rows into dynamic table. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#insert-rows) + virtual void InsertRows( + const TYPath& path, + const TNode::TListType& rows, + const TInsertRowsOptions& options = TInsertRowsOptions()) = 0; + + /// + /// @brief Delete rows from dynamic table. 
+ /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#delete-rows) + virtual void DeleteRows( + const TYPath& path, + const TNode::TListType& keys, + const TDeleteRowsOptions& options = TDeleteRowsOptions()) = 0; + + /// + /// @brief Trim rows from the beginning of ordered dynamic table. + /// + /// Asynchronously removes `rowCount` rows from the beginning of ordered dynamic table. + /// Numeration of remaining rows *does not change*, e.g. after `trim(10)` and `trim(20)` + /// you get in total `20` deleted rows. + /// + /// @param path Path to ordered dynamic table. + /// @param tabletIndex Which tablet to trim. + /// @param rowCount How many trimmed rows will be in the table after command. + /// @param options Optional parameters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#trim-rows) + virtual void TrimRows( + const TYPath& path, + i64 tabletIndex, + i64 rowCount, + const TTrimRowsOptions& options = TTrimRowsOptions()) = 0; + + /// + /// @brief Lookup rows with given keys from dynamic table. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#lookup-rows) + virtual TNode::TListType LookupRows( + const TYPath& path, + const TNode::TListType& keys, + const TLookupRowsOptions& options = TLookupRowsOptions()) = 0; + + /// + /// @brief Select rows from dynamic table, using [SQL dialect](https://yt.yandex-team.ru/docs//description/dynamic_tables/dyn_query_language.html). + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#select-rows) + virtual TNode::TListType SelectRows( + const TString& query, + const TSelectRowsOptions& options = TSelectRowsOptions()) = 0; + + /// + /// @brief Change properties of table replica. + /// + /// Allows to enable/disable replica and/or change its mode. 
+ /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#alter-table-replica) + virtual void AlterTableReplica( + const TReplicaId& replicaId, + const TAlterTableReplicaOptions& alterTableReplicaOptions) = 0; + + /// + /// @brief Generate a monotonically increasing master timestamp. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#generate-timestamp) + virtual ui64 GenerateTimestamp() = 0; + + /// Return YT username of current client. + virtual TAuthorizationInfo WhoAmI() = 0; + + /// + /// @brief Get operation attributes. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-operation) + virtual TOperationAttributes GetOperation( + const TOperationId& operationId, + const TGetOperationOptions& options = TGetOperationOptions()) = 0; + + /// + /// @brief List operations satisfying given filters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#list-operations) + virtual TListOperationsResult ListOperations( + const TListOperationsOptions& options = TListOperationsOptions()) = 0; + + /// + /// @brief Update operation runtime parameters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#update-op-parameters) + virtual void UpdateOperationParameters( + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options) = 0; + + /// + /// @brief Get job attributes. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job) + virtual TJobAttributes GetJob( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobOptions& options = TGetJobOptions()) = 0; + + /// + /// @brief List attributes of jobs satisfying given filters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#list-jobs) + virtual TListJobsResult ListJobs( + const TOperationId& operationId, + const TListJobsOptions& options = TListJobsOptions()) = 0; + + /// + /// @brief Get the input of a running or failed job.
+ /// + /// @ref NYT::TErrorResponse exception is thrown if job is missing. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job-input) + virtual IFileReaderPtr GetJobInput( + const TJobId& jobId, + const TGetJobInputOptions& options = TGetJobInputOptions()) = 0; + + /// + /// @brief Get fail context of a failed job. + /// + /// @ref NYT::TErrorResponse exception is thrown if it is missing. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job-fail-context) + virtual IFileReaderPtr GetJobFailContext( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobFailContextOptions& options = TGetJobFailContextOptions()) = 0; + + /// + /// @brief Get stderr of a running or failed job. + /// + /// @ref NYT::TErrorResponse exception is thrown if it is missing. + /// + /// @note YT doesn't store all job stderrs + /// + /// @note If job stderr exceeds few megabytes YT will store only head and tail of stderr. + /// + /// @see Description of `max_stderr_size` spec option [here](https://yt.yandex-team.ru/docs//description/mr/operations_options.html). + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job-stderr) + virtual IFileReaderPtr GetJobStderr( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobStderrOptions& options = TGetJobStderrOptions()) = 0; + + /// + /// @brief Create one or several rbtorrents for files in a blob table. + /// + /// If specified, one torrent is created for each value of `KeyColumns` option. + /// Otherwise, a single torrent with all files of a table is created. + /// + /// @return list of nodes, each node has two fields + /// * `key`: list of key columns values. Empty if `KeyColumns` is not specified. 
+ /// * `rbtorrent`: rbtorrent string (with `rbtorrent:` prefix) + /// + /// @see [More info.](https://docs.yandex-team.ru/docs/yt/description/storage/blobtables#sky_share) + virtual TNode::TListType SkyShareTable( + const std::vector<TYPath>& tablePaths, + const TSkyShareTableOptions& options) = 0; + + /// + /// @brief Check if `user` has `permission` to access a Cypress node at `path`. + /// + /// For tables access to columns specified in `options.Columns_` can be checked + /// (@see [the doc](https://yt.yandex-team.ru/docs/description/common/columnar_acl)). + /// + /// If access is denied (the returned result has `.Action == ESecurityAction::Deny`) + /// because of a `deny` rule, the "denying" object name and id + /// and "denied" subject name and id may be returned. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#check_permission) + virtual TCheckPermissionResponse CheckPermission( + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options = TCheckPermissionOptions()) = 0; + + /// @brief Get information about tablet + /// @see NYT::TTabletInfo + virtual TVector<TTabletInfo> GetTabletInfos( + const TYPath& path, + const TVector<int>& tabletIndexes, + const TGetTabletInfosOptions& options = TGetTabletInfosOptions()) = 0; + + /// + /// @brief Suspend operation. + /// + /// Jobs will be aborted. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#suspend_op) + virtual void SuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options = TSuspendOperationOptions()) = 0; + + /// @brief Resume previously suspended operation.
+ /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#resume_op) + virtual void ResumeOperation( + const TOperationId& operationId, + const TResumeOperationOptions& options = TResumeOperationOptions()) = 0; + + /// + /// @brief Synchronously terminates all client's background activities + /// + /// e.g. no callbacks will be executed after the function is completed + /// + /// @note It is safe to call Shutdown multiple times + /// + /// @note @ref NYT::TApiUsageError will be thrown if any client's method is called after shutdown + /// + virtual void Shutdown() = 0; +}; + + +/// Create a client for particular MapReduce cluster. +IClientPtr CreateClient( + const TString& serverName, + const TCreateClientOptions& options = TCreateClientOptions()); + + +/// Create a client for mapreduce cluster specified in `YT_PROXY` environment variable. +IClientPtr CreateClientFromEnv( + const TCreateClientOptions& options = TCreateClientOptions()); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/client_method_options.cpp b/yt/cpp/mapreduce/interface/client_method_options.cpp new file mode 100644 index 0000000000..66f72bfe5f --- /dev/null +++ b/yt/cpp/mapreduce/interface/client_method_options.cpp @@ -0,0 +1,34 @@ +#include "client_method_options.h" + +#include "tvm.h" + +namespace NYT { + +template <typename T> +static void MergeMaybe(TMaybe<T>& origin, const TMaybe<T>& patch) +{ + if (patch) { + origin = patch; + } +} + +void TFormatHints::Merge(const TFormatHints& patch) +{ + if (patch.SkipNullValuesForTNode_) { + SkipNullValuesForTNode(true); + } + MergeMaybe(EnableStringToAllConversion_, patch.EnableStringToAllConversion_); + MergeMaybe(EnableAllToStringConversion_, patch.EnableAllToStringConversion_); + MergeMaybe(EnableIntegralTypeConversion_, patch.EnableIntegralTypeConversion_); + MergeMaybe(EnableIntegralToDoubleConversion_, 
patch.EnableIntegralToDoubleConversion_); + MergeMaybe(EnableTypeConversion_, patch.EnableTypeConversion_); + MergeMaybe(ComplexTypeMode_, patch.ComplexTypeMode_); +} + +TCreateClientOptions& TCreateClientOptions::ServiceTicketAuth(const NAuth::IServiceTicketAuthPtrWrapper& wrapper) +{ + ServiceTicketAuth_ = std::make_shared<NAuth::IServiceTicketAuthPtrWrapper>(wrapper); + return *this; +} + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/client_method_options.h b/yt/cpp/mapreduce/interface/client_method_options.h new file mode 100644 index 0000000000..8074632353 --- /dev/null +++ b/yt/cpp/mapreduce/interface/client_method_options.h @@ -0,0 +1,1452 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/client_method_options.h +/// +/// Header containing options for @ref NYT::IClient methods. + +#include "common.h" +#include "config.h" +#include "format.h" +#include "public.h" +#include "retry_policy.h" + +#include <util/datetime/base.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// Type of the cypress node. +enum ENodeType : int +{ + NT_STRING /* "string_node" */, + NT_INT64 /* "int64_node" */, + NT_UINT64 /* "uint64_node" */, + NT_DOUBLE /* "double_node" */, + NT_BOOLEAN /* "boolean_node" */, + NT_MAP /* "map_node" */, + NT_LIST /* "list_node" */, + NT_FILE /* "file" */, + NT_TABLE /* "table" */, + NT_DOCUMENT /* "document" */, + NT_REPLICATED_TABLE /* "replicated_table" */, + NT_TABLE_REPLICA /* "table_replica" */, + NT_USER /* "user" */, + NT_SCHEDULER_POOL /* "scheduler_pool" */, + NT_LINK /* "link" */, +}; + +/// +/// @brief Mode of composite type representation in yson. 
+/// +/// @see https://yt.yandex-team.ru/docs/description/storage/data_types#yson +enum class EComplexTypeMode : int +{ + Named /* "named" */, + Positional /* "positional" */, +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Create +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#create +struct TCreateOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TCreateOptions; + /// @endcond + + /// Create missing parent directories if required. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// + /// @brief Do not raise error if node already exists. + /// + /// Node is not recreated. + /// Force and IgnoreExisting MUST NOT be used simultaneously. + FLUENT_FIELD_DEFAULT(bool, IgnoreExisting, false); + + /// + /// @brief Recreate node if it exists. + /// + /// Force and IgnoreExisting MUST NOT be used simultaneously. + FLUENT_FIELD_DEFAULT(bool, Force, false); + + /// @brief Set node attributes. + FLUENT_FIELD_OPTION(TNode, Attributes); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Remove +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#remove +struct TRemoveOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TRemoveOptions; + /// @endcond + + /// + /// @brief Remove whole tree when removing composite cypress node (e.g. `map_node`). + /// + /// Without this option removing nonempty composite node will fail. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// @brief Do not fail if removing node doesn't exist. + FLUENT_FIELD_DEFAULT(bool, Force, false); +}; + +/// Base class for options for operations that read from master. +template <typename TDerived> +struct TMasterReadOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Where to read from. 
+ FLUENT_FIELD_OPTION(EMasterReadKind, ReadFrom); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Exists +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#exists +struct TExistsOptions + : public TMasterReadOptions<TExistsOptions> +{ +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Get +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#get +struct TGetOptions + : public TMasterReadOptions<TGetOptions> +{ + /// @brief Attributes that should be fetched with each node. + FLUENT_FIELD_OPTION(TAttributeFilter, AttributeFilter); + + /// @brief Limit for the number of children node. + FLUENT_FIELD_OPTION(i64, MaxSize); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Set +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#set +struct TSetOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TSetOptions; + /// @endcond + + /// Create missing parent directories if required. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// Allow setting any nodes, not only attribute and document ones. + FLUENT_FIELD_OPTION(bool, Force); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::MultisetAttributes +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#multiset_attributes +struct TMultisetAttributesOptions +{ }; + +/// +/// @brief Options for @ref NYT::ICypressClient::List +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#list +struct TListOptions + : public TMasterReadOptions<TListOptions> +{ + /// @cond Doxygen_Suppress + using TSelf = TListOptions; + /// @endcond + + /// Attributes that should be fetched for each node. + FLUENT_FIELD_OPTION(TAttributeFilter, AttributeFilter); + + /// Limit for the number of children that will be fetched. 
+ FLUENT_FIELD_OPTION(i64, MaxSize); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Copy +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#copy +struct TCopyOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TCopyOptions; + /// @endcond + + /// Create missing directories in destination path if required. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// Allows to use existing node as destination, it will be overwritten. + FLUENT_FIELD_DEFAULT(bool, Force, false); + + /// Whether to preserve the account of source node. + FLUENT_FIELD_DEFAULT(bool, PreserveAccount, false); + + /// Whether to preserve `expiration_time` attribute of source node. + FLUENT_FIELD_OPTION(bool, PreserveExpirationTime); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Move +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#move +struct TMoveOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TMoveOptions; + /// @endcond + + /// Create missing directories in destination path if required. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// Allows to use existing node as destination, it will be overwritten. + FLUENT_FIELD_DEFAULT(bool, Force, false); + + /// Whether to preserve the account of source node. + FLUENT_FIELD_DEFAULT(bool, PreserveAccount, false); + + /// Whether to preserve `expiration_time` attribute of source node. + FLUENT_FIELD_OPTION(bool, PreserveExpirationTime); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Link +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#link +struct TLinkOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TLinkOptions; + /// @endcond + + /// Create parent directories of destination if they don't exist. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// Do not raise error if link already exists. + FLUENT_FIELD_DEFAULT(bool, IgnoreExisting, false); + + /// Force rewrite target node. 
+ FLUENT_FIELD_DEFAULT(bool, Force, false); + + /// Attributes of created link. + FLUENT_FIELD_OPTION(TNode, Attributes); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Concatenate +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#concatenate +struct TConcatenateOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TConcatenateOptions; + /// @endcond + + /// Whether we should append to destination or rewrite it. + FLUENT_FIELD_OPTION(bool, Append); +}; + +/// +/// @brief Options for @ref NYT::IIOClient::CreateBlobTableReader +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#read_blob_table +struct TBlobTableReaderOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TBlobTableReaderOptions; + /// @endcond + + /// Name of the part index column. By default it is "part_index". + FLUENT_FIELD_OPTION(TString, PartIndexColumnName); + + /// Name of the data column. By default it is "data". + FLUENT_FIELD_OPTION(TString, DataColumnName); + + /// + /// @brief Size of each part. + /// + /// All blob parts except the last part of the blob must be of this size + /// otherwise blob table reader emits error. + FLUENT_FIELD_DEFAULT(ui64, PartSize, 4 * 1024 * 1024); + + /// @brief Offset from which to start reading + FLUENT_FIELD_DEFAULT(i64, Offset, 0); +}; + +/// +/// @brief Resource limits for operation (or pool) +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/scheduler/scheduler_and_pools#resursy +/// @see NYT::TUpdateOperationParametersOptions +struct TResourceLimits +{ + /// @cond Doxygen_Suppress + using TSelf = TResourceLimits; + /// @endcond + + /// Number of slots for user jobs. + FLUENT_FIELD_OPTION(i64, UserSlots); + + /// Number of cpu cores. + FLUENT_FIELD_OPTION(double, Cpu); + + /// Network usage. Doesn't have precise physical unit. + FLUENT_FIELD_OPTION(i64, Network); + + /// Memory in bytes. + FLUENT_FIELD_OPTION(i64, Memory); +}; + +/// +/// @brief Scheduling options for single pool tree. 
+/// +/// @see NYT::TUpdateOperationParametersOptions +struct TSchedulingOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TSchedulingOptions; + /// @endcond + + /// + /// @brief Pool to switch operation to. + /// + /// @note Switching is currently disabled on the server (will induce an exception). + FLUENT_FIELD_OPTION(TString, Pool); + + /// @brief Operation weight. + FLUENT_FIELD_OPTION(double, Weight); + + /// @brief Operation resource limits. + FLUENT_FIELD_OPTION(TResourceLimits, ResourceLimits); +}; + +/// +/// @brief Collection of scheduling options for multiple pool trees. +/// +/// @see NYT::TUpdateOperationParametersOptions +struct TSchedulingOptionsPerPoolTree +{ + /// @cond Doxygen_Suppress + using TSelf = TSchedulingOptionsPerPoolTree; + /// @endcond + + TSchedulingOptionsPerPoolTree(const THashMap<TString, TSchedulingOptions>& options = {}) + : Options_(options) + { } + + /// Add scheduling options for pool tree. + TSelf& Add(TStringBuf poolTreeName, const TSchedulingOptions& schedulingOptions) + { + Y_ENSURE(Options_.emplace(poolTreeName, schedulingOptions).second); + return *this; + } + + THashMap<TString, TSchedulingOptions> Options_; +}; + +/// +/// @brief Options for @ref NYT::IOperation::SuspendOperation +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#suspend_op +struct TSuspendOperationOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TSuspendOperationOptions; + /// @endcond + + /// + /// @brief Whether to abort already running jobs. + /// + /// By default running jobs are not aborted. + FLUENT_FIELD_OPTION(bool, AbortRunningJobs); +}; + +/// +/// @brief Options for @ref NYT::IOperation::ResumeOperation +/// +/// @note They are empty for now but options might appear in the future. 
+/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#resume_op +struct TResumeOperationOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TResumeOperationOptions; + /// @endcond +}; + +/// +/// @brief Options for @ref NYT::IOperation::UpdateParameters +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#update_op_parameters +struct TUpdateOperationParametersOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TUpdateOperationParametersOptions; + /// @endcond + + /// New owners of the operation. + FLUENT_VECTOR_FIELD(TString, Owner); + + /// Pool to switch operation to (for all pool trees it is running in). + FLUENT_FIELD_OPTION(TString, Pool); + + /// New operation weight (for all pool trees it is running in). + FLUENT_FIELD_OPTION(double, Weight); + + /// Scheduling options for each pool tree the operation is running in. + FLUENT_FIELD_OPTION(TSchedulingOptionsPerPoolTree, SchedulingOptionsPerPoolTree); +}; + +/// +/// @brief Base class for many options related to IO. +/// +/// @ref NYT::TFileWriterOptions +/// @ref NYT::TFileReaderOptions +/// @ref NYT::TTableReaderOptions +/// @ref NYT::TTableWriterOptions +template <class TDerived> +struct TIOOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Advanced options for reader/writer. + /// + /// Readers/writers have many options, not all of which are supported by the library. + /// If you need such unsupported option, you might use `Config` option until + /// option is supported. + /// + /// Example: + /// + /// TTableWriterOptions().Config(TNode()("max_row_weight", 64 << 20)) + /// + /// @note We encourage you to ask yt@ to add native C++ support of required options + /// and use `Config` only as temporary solution while native support is not ready. + FLUENT_FIELD_OPTION(TNode, Config); + + /// + /// @brief Whether to create internal client transaction for reading / writing table. + /// + /// This is advanced option. 
+ /// + /// If `CreateTransaction` is set to `false` reader/writer doesn't create internal transaction + /// and doesn't lock table. This option is overridden (effectively `false`) for writers by + /// @ref NYT::TTableWriterOptions::SingleHttpRequest + /// + /// WARNING: if `CreateTransaction` is `false`, read/write might become non-atomic. + /// Change ONLY if you are sure what you are doing! + FLUENT_FIELD_DEFAULT(bool, CreateTransaction, true); +}; + +/// @brief Options for reading file from YT. +struct TFileReaderOptions + : public TIOOptions<TFileReaderOptions> +{ + /// + /// @brief Offset to start reading from. + /// + /// By default reading is started from the beginning of the file. + FLUENT_FIELD_OPTION(i64, Offset); + + /// + /// @brief Maximum length to read. + /// + /// By default file is read until the end. + FLUENT_FIELD_OPTION(i64, Length); +}; + +/// @brief Options that control how server side of YT stores data. +struct TWriterOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TWriterOptions; + /// @endcond + + /// + /// @brief Whether to finish upload early, without waiting for all replicas to be written. + /// + /// When set to true upload will be considered successful as soon as + /// @ref NYT::TWriterOptions::MinUploadReplicationFactor number of replicas are created. + FLUENT_FIELD_OPTION(bool, EnableEarlyFinish); + + /// Number of replicas to be created. + FLUENT_FIELD_OPTION(ui64, UploadReplicationFactor); + + /// + /// Min number of created replicas needed to consider upload successful. + /// + /// @see NYT::TWriterOptions::EnableEarlyFinish + FLUENT_FIELD_OPTION(ui64, MinUploadReplicationFactor); + + /// + /// @brief Desired size of a chunk. + /// + /// @see @ref NYT::TWriterOptions::RetryBlockSize + FLUENT_FIELD_OPTION(ui64, DesiredChunkSize); + + /// + /// @brief Size of data block accumulated in memory to provide retries. + /// + /// Data is accumulated in memory buffer so in case error occurs data could be resent. 
+ /// + /// If `RetryBlockSize` is not set buffer size is set to `DesiredChunkSize`. + /// If neither `RetryBlockSize` nor `DesiredChunkSize` is set size of buffer is 64MB. + /// + /// @note Written chunks cannot be larger than size of this memory buffer. + /// + /// Since DesiredChunkSize is compared against data already compressed with compression codec + /// it makes sense to set `RetryBlockSize = DesiredChunkSize / ExpectedCompressionRatio` + /// + /// @see @ref NYT::TWriterOptions::DesiredChunkSize + /// @see @ref NYT::TTableWriterOptions::SingleHttpRequest + FLUENT_FIELD_OPTION(size_t, RetryBlockSize); +}; + +/// +/// @brief Options for writing file +/// +/// @see NYT::IIOClient::CreateFileWriter +struct TFileWriterOptions + : public TIOOptions<TFileWriterOptions> +{ + /// + /// @brief Whether to compute MD5 sum of written file. + /// + /// If ComputeMD5 is set to `true` and we are appending to an existing file + /// the `md5` attribute must be set (i.e. it was previously written only with `ComputeMD5 == true`). + FLUENT_FIELD_OPTION(bool, ComputeMD5); + + /// + /// @brief Options to control how YT server side writes data. + /// + /// @see NYT::TWriterOptions + FLUENT_FIELD_OPTION(TWriterOptions, WriterOptions); +}; + +class TSkiffRowHints { +public: + /// @cond Doxygen_Suppress + using TSelf = TSkiffRowHints; + /// @endcond + + /// + /// @brief Library doesn't interpret it, only passes it to CreateSkiffParser<...>() and GetSkiffSchema<...>() functions. + /// + /// You can set something in it to pass necessary information to CreateSkiffParser<...>() and GetSkiffSchema<...>() functions. + FLUENT_FIELD_OPTION(TNode, Attributes); +}; + +/// Options that control how C++ objects represent table rows when reading or writing a table. +class TFormatHints +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TFormatHints; + /// @endcond + + /// + /// @brief Whether to skip null values. + /// + /// When set to true TNode doesn't contain null column values + /// (e.g. 
corresponding keys will be missing instead of containing null value). + /// + /// Only meaningful for TNode representation. + /// + /// Useful for sparse tables which have many columns in schema + /// but only few columns are set in any row. + FLUENT_FIELD_DEFAULT(bool, SkipNullValuesForTNode, false); + + /// + /// @brief Whether to convert string to numeric and boolean types (e.g. "42u" -> 42u, "false" -> %false) + /// when writing to schemaful table. + FLUENT_FIELD_OPTION(bool, EnableStringToAllConversion); + + /// + /// @brief Whether to convert numeric and boolean types to string (e.g., 3.14 -> "3.14", %true -> "true") + /// when writing to schemaful table. + FLUENT_FIELD_OPTION(bool, EnableAllToStringConversion); + + /// + /// @brief Whether to convert uint64 <-> int64 when writing to schemaful table. + /// + /// On overflow the corresponding error will be raised. + /// + /// This option is enabled by default. + FLUENT_FIELD_OPTION(bool, EnableIntegralTypeConversion); + + /// Whether to convert uint64 and int64 to double (e.g. 42 -> 42.0) when writing to schemaful table. + FLUENT_FIELD_OPTION(bool, EnableIntegralToDoubleConversion); + + /// Shortcut for enabling all type conversions. + FLUENT_FIELD_OPTION(bool, EnableTypeConversion); + + /// + /// @brief Controls how complex types are represented in TNode or yson-strings. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/data_types#yson + FLUENT_FIELD_OPTION(EComplexTypeMode, ComplexTypeMode); + + /// + /// @brief Allow to use any meta-information for creating skiff schema and parser for reading ISkiffRow. + FLUENT_FIELD_OPTION(TSkiffRowHints, SkiffRowHints); + + /// + /// @brief Apply the patch to the fields. + /// + /// Non-default and non-empty values replace the default and empty ones. + void Merge(const TFormatHints& patch); +}; + +/// Options that control which control attributes (like row_index) are added to rows during read. 
+class TControlAttributes +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TControlAttributes; + /// @endcond + + /// + /// @brief Whether to add "row_index" attribute to rows read. + FLUENT_FIELD_DEFAULT(bool, EnableRowIndex, true); + + /// + /// @brief Whether to add "range_index" attribute to rows read. + FLUENT_FIELD_DEFAULT(bool, EnableRangeIndex, true); +}; + +/// Options for @ref NYT::IClient::CreateTableReader +struct TTableReaderOptions + : public TIOOptions<TTableReaderOptions> +{ + /// @deprecated Size of internal client buffer. + FLUENT_FIELD_DEFAULT(size_t, SizeLimit, 4 << 20); + + /// + /// @brief Allows to fine tune format that is used for reading tables. + /// + /// Has no effect when used with raw-reader. + FLUENT_FIELD_OPTION(TFormatHints, FormatHints); + + /// + /// @brief Allows to tune which attributes are added to rows while reading tables. + /// + FLUENT_FIELD_DEFAULT(TControlAttributes, ControlAttributes, TControlAttributes()); +}; + +/// Options for @ref NYT::IClient::CreateTableWriter +struct TTableWriterOptions + : public TIOOptions<TTableWriterOptions> +{ + /// + /// @brief Enable or disable retryful writing. + /// + /// If set to true no retry is made but we also make less requests to master. + /// If set to false writer can make up to `TConfig::RetryCount` attempts to send each block of data. + /// + /// @note Writers' methods might throw strange exceptions that might look like network error + /// when `SingleHttpRequest == true` and YT node encounters an error + /// (due to limitations of HTTP protocol YT node have no chance to report error + /// before it reads the whole input so it just drops the connection). + FLUENT_FIELD_DEFAULT(bool, SingleHttpRequest, false); + + /// + /// @brief Allows to change the size of locally buffered rows before flushing to yt. 
+ /// + /// Used only with @ref NYT::TTableWriterOptions::SingleHttpRequest + FLUENT_FIELD_DEFAULT(size_t, BufferSize, 64 << 20); + + /// + /// @brief Allows to fine tune format that is used for writing tables. + /// + /// Has no effect when used with raw-writer. + FLUENT_FIELD_OPTION(TFormatHints, FormatHints); + + /// @brief Try to infer schema of inexistent table from the type of written rows. + /// + /// @note Default values for this option may differ depending on the row type. + /// For protobuf it's currently false by default. + FLUENT_FIELD_OPTION(bool, InferSchema); + + /// + /// @brief Options to control how YT server side writes data. + /// + /// @see NYT::TWriterOptions + FLUENT_FIELD_OPTION(TWriterOptions, WriterOptions); +}; + +/// +/// @brief Options for @ref NYT::IClient::StartTransaction +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#start_tx +struct TStartTransactionOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TStartTransactionOptions; + /// @endcond + + FLUENT_FIELD_DEFAULT(bool, PingAncestors, false); + + /// + /// @brief How long transaction lives after last ping. + /// + /// If server doesn't receive any pings for transaction for this time + /// transaction will be aborted. By default timeout is 15 seconds. + FLUENT_FIELD_OPTION(TDuration, Timeout); + + /// + /// @brief Moment in the future when transaction is aborted. + FLUENT_FIELD_OPTION(TInstant, Deadline); + + /// + /// @brief Whether to ping created transaction automatically. + /// + /// When set to true library creates a thread that pings transaction. + /// When set to false library doesn't ping transaction and it's user responsibility to ping it. + FLUENT_FIELD_DEFAULT(bool, AutoPingable, true); + + /// + /// @brief Set the title attribute of transaction. + /// + /// If title was not specified + /// neither using this option nor using @ref NYT::TStartTransactionOptions::Attributes option + /// library will generate default title for transaction. 
+ /// Such default title includes machine name, pid, user name and some other useful info. + FLUENT_FIELD_OPTION(TString, Title); + + /// + /// @brief Set custom transaction attributes + /// + /// @note @ref NYT::TStartTransactionOptions::Title option overrides `"title"` attribute. + FLUENT_FIELD_OPTION(TNode, Attributes); +}; + +/// +/// @brief Options for attaching transaction. +/// +/// @see NYT::IClient::AttachTransaction +struct TAttachTransactionOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TAttachTransactionOptions; + /// @endcond + + /// + /// @brief Ping transaction automatically. + /// + /// When set to |true| library creates a thread that pings transaction. + /// When set to |false| library doesn't ping transaction and + /// it's user responsibility to ping it. + FLUENT_FIELD_DEFAULT(bool, AutoPingable, false); + + /// + /// @brief Abort transaction on program termination. + /// + /// Should the transaction be aborted on program termination + /// (either normal or by a signal or uncaught exception -- two latter + /// only if @ref TInitializeOptions::CleanupOnTermination is set). + FLUENT_FIELD_DEFAULT(bool, AbortOnTermination, false); +}; + +/// +/// @brief Type of the lock. +/// +/// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locking_mode +/// @see NYT::ITransaction::Lock +enum ELockMode : int +{ + /// Exclusive lock. + LM_EXCLUSIVE /* "exclusive" */, + + /// Shared lock. + LM_SHARED /* "shared" */, + + /// Snapshot lock. + LM_SNAPSHOT /* "snapshot" */, +}; + +/// +/// @brief Options for locking cypress node +/// +/// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks +/// @see NYT::ITransaction::Lock +struct TLockOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TLockOptions; + /// @endcond + + /// + /// @brief Whether to wait already locked node to be unlocked. 
+ /// + /// If `Waitable` is set to true Lock method will create + /// waitable lock, that will be taken once other transactions + /// that hold lock to that node are committed / aborted. + /// + /// @note Lock method DOES NOT wait until lock is actually acquired. + /// Waiting should be done using corresponding methods of ILock. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locking_queue + FLUENT_FIELD_DEFAULT(bool, Waitable, false); + + /// + /// @brief Also take attribute_key lock. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks_compatibility + FLUENT_FIELD_OPTION(TString, AttributeKey); + + /// + /// @brief Also take child_key lock. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks_compatibility + FLUENT_FIELD_OPTION(TString, ChildKey); +}; + +/// +/// @brief Options for @ref NYT::ITransaction::Unlock +/// +/// @note They are empty for now but options might appear in the future. +/// +/// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks_compatibility +struct TUnlockOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TUnlockOptions; + /// @endcond +}; + +/// Base class for options that deal with tablets. +template <class TDerived> +struct TTabletOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// Index of a first tablet to deal with. + FLUENT_FIELD_OPTION(i64, FirstTabletIndex); + + /// Index of a last tablet to deal with. + FLUENT_FIELD_OPTION(i64, LastTabletIndex); +}; + +/// +/// @brief Options for @ref NYT::IClient::MountTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#mount_table +struct TMountTableOptions + : public TTabletOptions<TMountTableOptions> +{ + /// @cond Doxygen_Suppress + using TSelf = TMountTableOptions; + /// @endcond + + /// If specified table will be mounted to this cell. 
+ FLUENT_FIELD_OPTION(TTabletCellId, CellId); + + /// If set to true tablets will be mounted in freezed state. + FLUENT_FIELD_DEFAULT(bool, Freeze, false); +}; + +/// +/// @brief Options for @ref NYT::IClient::UnmountTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#unmount_table +struct TUnmountTableOptions + : public TTabletOptions<TUnmountTableOptions> +{ + /// @cond Doxygen_Suppress + using TSelf = TUnmountTableOptions; + /// @endcond + + /// Advanced option, don't use unless yt team told you so. + FLUENT_FIELD_DEFAULT(bool, Force, false); +}; + +/// +/// @brief Options for @ref NYT::IClient::RemountTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#remount_table +struct TRemountTableOptions + : public TTabletOptions<TRemountTableOptions> +{ }; + +/// +/// @brief Options for @ref NYT::IClient::ReshardTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#reshard_table +struct TReshardTableOptions + : public TTabletOptions<TReshardTableOptions> +{ }; + +/// +/// @brief Options for @ref NYT::IClient::FreezeTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#freeze_table +struct TFreezeTableOptions + : public TTabletOptions<TFreezeTableOptions> +{ }; + +/// +/// @brief Options for @ref NYT::IClient::UnfreezeTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#unfreeze_table +struct TUnfreezeTableOptions + : public TTabletOptions<TUnfreezeTableOptions> +{ }; + +/// +/// @brief Options for @ref NYT::IClient::AlterTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#alter_table +struct TAlterTableOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TAlterTableOptions; + /// @endcond + + /// Change table schema. + FLUENT_FIELD_OPTION(TTableSchema, Schema); + + /// Alter table between static and dynamic mode. + FLUENT_FIELD_OPTION(bool, Dynamic); + + /// + /// @brief Changes id of upstream replica on metacluster. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables + FLUENT_FIELD_OPTION(TReplicaId, UpstreamReplicaId); +}; + +/// +/// @brief Options for @ref NYT::IClient::LookupRows +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#lookup_rows +struct TLookupRowsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TLookupRowsOptions; + /// @endcond + + /// Timeout for operation. + FLUENT_FIELD_OPTION(TDuration, Timeout); + + /// Column names to return. + FLUENT_FIELD_OPTION(TColumnNames, Columns); + + /// + /// @brief Whether to return rows that were not found in table. + /// + /// If set to true List returned by LookupRows method will have same + /// length as list of keys. If row is not found in table corresponding item in list + /// will have null value. + FLUENT_FIELD_DEFAULT(bool, KeepMissingRows, false); + + /// If set to true returned values will have "timestamp" attribute. + FLUENT_FIELD_OPTION(bool, Versioned); +}; + +/// +/// @brief Options for @ref NYT::IClient::SelectRows +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#select_rows +struct TSelectRowsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TSelectRowsOptions; + /// @endcond + + /// Timeout for operation. + FLUENT_FIELD_OPTION(TDuration, Timeout); + + /// + /// @brief Limitation for number of rows read by single node. + /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii) + FLUENT_FIELD_OPTION(i64, InputRowLimit); + + /// + /// @brief Limitation for number of output rows on single cluster node. + /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii) + FLUENT_FIELD_OPTION(i64, OutputRowLimit); + + /// + /// @brief Maximum row ranges derived from WHERE clause. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii) + FLUENT_FIELD_DEFAULT(ui64, RangeExpansionLimit, 1000); + + /// + /// @brief Whether to fail if InputRowLimit or OutputRowLimit is exceeded. + /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii) + FLUENT_FIELD_DEFAULT(bool, FailOnIncompleteResult, true); + + /// @brief Enable verbose logging on server side. + FLUENT_FIELD_DEFAULT(bool, VerboseLogging, false); + + FLUENT_FIELD_DEFAULT(bool, EnableCodeCache, true); +}; + +/// Options for NYT::CreateClient. +struct TCreateClientOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TCreateClientOptions; + /// @endcond + + /// @brief Impersonated user name. + /// + /// If authenticated user is allowed to impersonate other YT users (e.g. yql_agent), this field may be used to override user name. + FLUENT_FIELD_OPTION(TString, ImpersonationUser); + + /// @brief User token. + /// + /// @see NYT::TCreateClientOptions::TokenPath + FLUENT_FIELD(TString, Token); + + /// @brief Path to the file where user token is stored. + /// + /// Token is looked up in these places, in the following order: + /// - @ref NYT::TCreateClientOptions::Token + /// - @ref NYT::TCreateClientOptions::TokenPath + /// - `TConfig::Get()->Token` option. + /// - `YT_TOKEN` environment variable + /// - `YT_SECURE_VAULT_YT_TOKEN` environment variable + /// - File specified in `YT_TOKEN_PATH` environment variable + /// - `$HOME/.yt/token` file. + FLUENT_FIELD(TString, TokenPath); + + /// @brief TVM service ticket producer. + /// + /// We store a wrapper of NYT::TIntrusivePtr here (not a NYT::TIntrusivePtr), + /// because otherwise other projects will have build problems + /// because of visibility of two different `TIntrusivePtr`-s (::TIntrusivePtr and NYT::TIntrusivePtr). 
+ /// + /// @see NYT::NAuth::TServiceTicketClientAuth + /// @{ + NAuth::IServiceTicketAuthPtrWrapperPtr ServiceTicketAuth_ = nullptr; + TSelf& ServiceTicketAuth(const NAuth::IServiceTicketAuthPtrWrapper& wrapper); + /// @} + + /// @brief Use tvm-only endpoints in cluster connection. + FLUENT_FIELD_DEFAULT(bool, TvmOnly, false); + + /// @brief Use HTTPs (use HTTP client from yt/yt/core always). + /// + /// @see UseCoreHttpClient + FLUENT_FIELD_DEFAULT(bool, UseTLS, false); + + /// @brief Use HTTP client from yt/yt/core. + FLUENT_FIELD_DEFAULT(bool, UseCoreHttpClient, false); + + /// + /// @brief RetryConfig provider allows to fine tune request retries. + /// + /// E.g. set total timeout for all retries. + FLUENT_FIELD_DEFAULT(IRetryConfigProviderPtr, RetryConfigProvider, nullptr); + + /// @brief Override global config for the client. + /// + /// The config contains implementation parameters such as connection timeouts, + /// access token, api version and more. + /// @see NYT::TConfig + FLUENT_FIELD_DEFAULT(TConfigPtr, Config, nullptr); +}; + +/// +/// @brief Options for @ref NYT::IBatchRequest::ExecuteBatch +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#execute_batch +struct TExecuteBatchOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TExecuteBatchOptions; + /// @endcond + + /// + /// @brief How many requests will be executed in parallel on the cluster. + /// + /// This parameter could be used to avoid RequestLimitExceeded errors. + FLUENT_FIELD_OPTION(ui64, Concurrency); + + /// + /// @brief Maximum size of batch sent in one request to server. + /// + /// Huge batches are executed using multiple requests. + /// BatchPartMaxSize is maximum size of single request that goes to server. + /// If not specified it is set to `Concurrency * 5' + FLUENT_FIELD_OPTION(ui64, BatchPartMaxSize); +}; + +/// +/// @brief Durability mode. 
+/// +/// @see NYT::TTabletTransactionOptions::Durability +/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#sohrannost +enum class EDurability +{ + /// Sync mode (default). + Sync /* "sync" */, + + /// Async mode (might reduce latency of write requests, but less reliable). + Async /* "async" */, +}; + +/// +/// @brief Atomicity mode. +/// +/// @see NYT::TTabletTransactionOptions::Atomicity +/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#sohrannost +enum class EAtomicity +{ + /// Transactions are non atomic (might reduce latency of write requests). + None /* "none" */, + + /// Transactions are atomic (default). + Full /* "full" */, +}; + +/// +/// @brief Table replica mode. +/// +/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables#atributy +enum class ETableReplicaMode +{ + Sync /* "sync" */, + Async /* "async" */, +}; + +/// Base class for options dealing with io to dynamic tables. +template <typename TDerived> +struct TTabletTransactionOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Atomicity mode of operation + /// + /// Setting to NYT::EAtomicity::None allows to improve latency of operations + /// at the cost of weakening contracts. + /// + /// @note Use with care. + /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#oslablenie-garantij + FLUENT_FIELD_OPTION(EAtomicity, Atomicity); + + /// + /// @brief Durability mode of operation + /// + /// Setting to NYT::EDurability::Async allows to improve latency of operations + /// at the cost of weakening contracts. + /// + /// @note Use with care. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#oslablenie-garantij + FLUENT_FIELD_OPTION(EDurability, Durability); +}; + +/// +/// @brief Options for NYT::IClient::InsertRows +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#insert_rows +struct TInsertRowsOptions + : public TTabletTransactionOptions<TInsertRowsOptions> +{ + /// + /// @brief Whether to overwrite missing columns with nulls. + /// + /// By default all columns missing in input data are set to Null and overwrite currently stored value. + /// If `Update' is set to true currently stored value will not be overwritten for columns that are missing in input data. + FLUENT_FIELD_OPTION(bool, Update); + + /// + /// @brief Whether to overwrite or aggregate aggregated columns. + /// + /// Used with aggregating columns. + /// By default value in aggregating column will be overwritten. + /// If `Aggregate' is set to true row will be considered as delta and it will be aggregated with currently stored value. + FLUENT_FIELD_OPTION(bool, Aggregate); + + /// + /// @brief Whether to fail when inserting to table without sync replica. + /// + /// Used for insert operation for tables without sync replica. + /// https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables#write + /// Default value is 'false'. So insertion into table without sync replicas fails. + FLUENT_FIELD_OPTION(bool, RequireSyncReplica); +}; + +/// +/// @brief Options for NYT::IClient::DeleteRows +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#delete_rows +struct TDeleteRowsOptions + : public TTabletTransactionOptions<TDeleteRowsOptions> +{ + /// + /// @brief Whether to fail when deleting from table without sync replica. + /// + /// Used for delete operation for tables without sync replica. + /// https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables#write + /// Default value is 'false'. 
So deletion from table without sync replicas fails. + FLUENT_FIELD_OPTION(bool, RequireSyncReplica); +}; + +/// +/// @brief Options for NYT::IClient::TrimRows +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#trim_rows +struct TTrimRowsOptions + : public TTabletTransactionOptions<TTrimRowsOptions> +{ }; + +/// @brief Options for NYT::IClient::AlterTableReplica +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#alter_table_replica +/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables +struct TAlterTableReplicaOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TAlterTableReplicaOptions; + /// @endcond + + /// + /// @brief Whether to enable or disable replica. + /// + /// Doesn't change state of replica if `Enabled' is not set. + FLUENT_FIELD_OPTION(bool, Enabled); + + /// + /// @brief Change replica mode. + /// + /// Doesn't change replica mode if `Mode` is not set. + FLUENT_FIELD_OPTION(ETableReplicaMode, Mode); +}; + +/// +/// @brief Options for @ref NYT::IClient::GetFileFromCache +/// +/// @note They are empty for now but options might appear in the future. +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#get_file_from_cache +struct TGetFileFromCacheOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetFileFromCacheOptions; + /// @endcond +}; + +/// +/// @brief Options for @ref NYT::IClient::PutFileToCache +/// +/// @note More options might appear in the future. +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#put_file_to_cache +struct TPutFileToCacheOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TPutFileToCacheOptions; + /// @endcond + + /// Whether to preserve `expiration_timeout` attribute of source node. + FLUENT_FIELD_OPTION(bool, PreserveExpirationTimeout); +}; + +/// +/// Type of permission used in ACL. 
+/// +/// @see https://yt.yandex-team.ru/docs/description/common/access_control +enum class EPermission : int +{ + /// Applies to: all objects. + Read /* "read" */, + + /// Applies to: all objects. + Write /* "write" */, + + /// Applies to: accounts / pools. + Use /* "use" */, + + /// Applies to: all objects. + Administer /* "administer" */, + + /// Applies to: schemas. + Create /* "create" */, + + /// Applies to: all objects. + Remove /* "remove" */, + + /// Applies to: tables. + Mount /* "mount" */, + + /// Applies to: operations. + Manage /* "manage" */, +}; + +/// Whether permission is granted or denied. +enum class ESecurityAction : int +{ + /// Permission is granted. + Allow /* "allow" */, + + /// Permission is denied. + Deny /* "deny" */, +}; + +/// +/// @brief Options for @ref NYT::IClient::CheckPermission +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#check_permission +struct TCheckPermissionOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TCheckPermissionOptions; + /// @endcond + + /// Columns to check permission to (for tables only). + FLUENT_VECTOR_FIELD(TString, Column); +}; + +/// +/// @brief Columnar statistics fetching mode. +/// +/// @ref NYT::TGetTableColumnarStatisticsOptions::FetcherMode +enum class EColumnarStatisticsFetcherMode +{ + /// Slow mode for fetching precise columnar statistics. + FromNodes /* "from_nodes" */, + + /// + /// @brief Fast mode for fetching lightweight columnar statistics. + /// + /// Relative precision is 1 / 256. + /// + /// @note Might be unavailable for old tables in that case some upper bound is returned. + FromMaster /* "from_master" */, + + /// Use lightweight columnar statistics (FromMaster) if available otherwise switch to slow but precise mode (FromNodes). 
+ Fallback /* "fallback" */, +}; + +/// +/// @brief Options for @ref NYT::IClient::GetTableColumnarStatistics +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#get_table_columnar_statistics +struct TGetTableColumnarStatisticsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetTableColumnarStatisticsOptions; + /// @endcond + + /// + /// @brief Mode of statistics fetching. + /// + /// @ref NYT::EColumnarStatisticsFetcherMode + FLUENT_FIELD_OPTION(EColumnarStatisticsFetcherMode, FetcherMode); +}; + +/// +/// @brief Table partitioning mode. +/// +/// @ref NYT::TGetTablePartitionsOptions::PartitionMode +enum class ETablePartitionMode +{ + /// + /// @brief Ignores the order of input tables and their chunk and sorting orders. + /// + Unordered /* "unordered" */, + + /// + /// @brief The order of table ranges inside each partition obey the order of input tables and their chunk orders. + /// + Ordered /* "ordered" */, +}; + +/// +/// @brief Options for @ref NYT::IClient::GetTablePartitions +/// +struct TGetTablePartitionsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetTablePartitionsOptions; + /// @endcond + + /// + /// @brief Table partitioning mode. + /// + /// @ref NYT::ETablePartitionMode + FLUENT_FIELD(ETablePartitionMode, PartitionMode); + + /// + /// @brief Approximate data weight of each output partition. + /// + FLUENT_FIELD(i64, DataWeightPerPartition); + + /// + /// @brief Maximum output partition count. + /// + /// Consider the situation when the `MaxPartitionCount` is given + /// and the total data weight exceeds `MaxPartitionCount * DataWeightPerPartition`. + /// If `AdjustDataWeightPerPartition` is |true| + /// `GetTablePartitions` will yield partitions exceeding the `DataWeightPerPartition`. + /// If `AdjustDataWeightPerPartition` is |false| + /// the partitioning will be aborted as soon as the output partition count exceeds this limit. 
+ FLUENT_FIELD_OPTION(int, MaxPartitionCount); + + /// + /// @brief Allow the data weight per partition to exceed `DataWeightPerPartition` when `MaxPartitionCount` is set. + /// + /// |True| by default. + FLUENT_FIELD_DEFAULT(bool, AdjustDataWeightPerPartition, true); +}; + +/// +/// @brief Options for @ref NYT::IClient::GetTabletInfos +/// +/// @note They are empty for now but options might appear in the future. +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#get_tablet_infos +struct TGetTabletInfosOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetTabletInfosOptions; + /// @endcond +}; + +/// Options for @ref NYT::IClient::SkyShareTable +struct TSkyShareTableOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TSkyShareTableOptions; + /// @endcond + + /// + /// @brief Key columns that are used to group files in a table into torrents. + /// + /// One torrent is created for each value of `KeyColumns` columns. + /// If not specified, all files go into single torrent. + FLUENT_FIELD_OPTION(TColumnNames, KeyColumns); + + /// @brief Allow skynet manager to return fastbone links to skynet. 
See YT-11437 + FLUENT_FIELD_OPTION(bool, EnableFastbone); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/common.cpp b/yt/cpp/mapreduce/interface/common.cpp new file mode 100644 index 0000000000..f6d60127ce --- /dev/null +++ b/yt/cpp/mapreduce/interface/common.cpp @@ -0,0 +1,664 @@ +#include "common.h" + +#include "errors.h" +#include "format.h" +#include "serialize.h" +#include "fluent.h" + +#include <yt/yt_proto/yt/formats/extension.pb.h> + +#include <library/cpp/yson/node/node_builder.h> +#include <library/cpp/yson/node/node_io.h> +#include <library/cpp/type_info/type.h> + +#include <util/generic/xrange.h> + +namespace NYT { + +using ::google::protobuf::FieldDescriptor; +using ::google::protobuf::Descriptor; + +//////////////////////////////////////////////////////////////////////////////// + +TSortColumn::TSortColumn(TStringBuf name, ESortOrder sortOrder) + : Name_(name) + , SortOrder_(sortOrder) +{ } + +TSortColumn::TSortColumn(const TString& name, ESortOrder sortOrder) + : TSortColumn(static_cast<TStringBuf>(name), sortOrder) +{ } + +TSortColumn::TSortColumn(const char* name, ESortOrder sortOrder) + : TSortColumn(static_cast<TStringBuf>(name), sortOrder) +{ } + +const TSortColumn& TSortColumn::EnsureAscending() const +{ + Y_ENSURE(SortOrder() == ESortOrder::SO_ASCENDING); + return *this; +} + +TNode TSortColumn::ToNode() const +{ + return BuildYsonNodeFluently().Value(*this); +} + +//////////////////////////////////////////////////////////////////////////////// +// Below lie backward compatibility methods. 
+//////////////////////////////////////////////////////////////////////////////// + +TSortColumn& TSortColumn::operator = (TStringBuf name) +{ + EnsureAscending(); + Name_ = name; + return *this; +} + +TSortColumn& TSortColumn::operator = (const TString& name) +{ + return (*this = static_cast<TStringBuf>(name)); +} + +TSortColumn& TSortColumn::operator = (const char* name) +{ + return (*this = static_cast<TStringBuf>(name)); +} + +bool TSortColumn::operator == (TStringBuf rhsName) const +{ + EnsureAscending(); + return Name_ == rhsName; +} + +bool TSortColumn::operator != (TStringBuf rhsName) const +{ + return !(*this == rhsName); +} + +bool TSortColumn::operator == (const TString& rhsName) const +{ + return *this == static_cast<TStringBuf>(rhsName); +} + +bool TSortColumn::operator != (const TString& rhsName) const +{ + return !(*this == rhsName); +} + +bool TSortColumn::operator == (const char* rhsName) const +{ + return *this == static_cast<TStringBuf>(rhsName); +} + +bool TSortColumn::operator != (const char* rhsName) const +{ + return !(*this == rhsName); +} + +TSortColumn::operator TStringBuf() const +{ + EnsureAscending(); + return Name_; +} + +TSortColumn::operator TString() const +{ + return TString(static_cast<TStringBuf>(*this)); +} + +TSortColumn::operator std::string() const +{ + EnsureAscending(); + return static_cast<std::string>(Name_); +} + +//////////////////////////////////////////////////////////////////////////////// + +TSortColumns::TSortColumns() +{ } + +TSortColumns::TSortColumns(const TVector<TString>& names) +{ + Parts_.assign(names.begin(), names.end()); +} + +TSortColumns::TSortColumns(const TColumnNames& names) + : TSortColumns(names.Parts_) +{ } + +TSortColumns::operator TColumnNames() const +{ + return TColumnNames(EnsureAscending().GetNames()); +} + +const TSortColumns& TSortColumns::EnsureAscending() const +{ + for (const auto& sortColumn : Parts_) { + sortColumn.EnsureAscending(); + } + return *this; +} + +TVector<TString> 
TSortColumns::GetNames() const +{ + TVector<TString> names; + names.reserve(Parts_.size()); + for (const auto& sortColumn : Parts_) { + names.push_back(sortColumn.Name()); + } + return names; +} + +//////////////////////////////////////////////////////////////////////////////// + +static NTi::TTypePtr OldTypeToTypeV3(EValueType type) +{ + switch (type) { + case VT_INT64: + return NTi::Int64(); + case VT_UINT64: + return NTi::Uint64(); + + case VT_DOUBLE: + return NTi::Double(); + + case VT_BOOLEAN: + return NTi::Bool(); + + case VT_STRING: + return NTi::String(); + + case VT_ANY: + return NTi::Yson(); + + case VT_INT8: + return NTi::Int8(); + case VT_INT16: + return NTi::Int16(); + case VT_INT32: + return NTi::Int32(); + + case VT_UINT8: + return NTi::Uint8(); + case VT_UINT16: + return NTi::Uint16(); + case VT_UINT32: + return NTi::Uint32(); + + case VT_UTF8: + return NTi::Utf8(); + + case VT_NULL: + return NTi::Null(); + + case VT_VOID: + return NTi::Void(); + + case VT_DATE: + return NTi::Date(); + case VT_DATETIME: + return NTi::Datetime(); + case VT_TIMESTAMP: + return NTi::Timestamp(); + case VT_INTERVAL: + return NTi::Interval(); + + case VT_FLOAT: + return NTi::Float(); + case VT_JSON: + return NTi::Json(); + } +} + +static std::pair<EValueType, bool> Simplify(const NTi::TTypePtr& type) +{ + using namespace NTi; + const auto typeName = type->GetTypeName(); + switch (typeName) { + case ETypeName::Bool: + return {VT_BOOLEAN, true}; + + case ETypeName::Int8: + return {VT_INT8, true}; + case ETypeName::Int16: + return {VT_INT16, true}; + case ETypeName::Int32: + return {VT_INT32, true}; + case ETypeName::Int64: + return {VT_INT64, true}; + + case ETypeName::Uint8: + return {VT_UINT8, true}; + case ETypeName::Uint16: + return {VT_UINT16, true}; + case ETypeName::Uint32: + return {VT_UINT32, true}; + case ETypeName::Uint64: + return {VT_UINT64, true}; + + case ETypeName::Float: + return {VT_FLOAT, true}; + case ETypeName::Double: + return {VT_DOUBLE, true}; + + 
case ETypeName::String: + return {VT_STRING, true}; + case ETypeName::Utf8: + return {VT_UTF8, true}; + + case ETypeName::Date: + return {VT_DATE, true}; + case ETypeName::Datetime: + return {VT_DATETIME, true}; + case ETypeName::Timestamp: + return {VT_TIMESTAMP, true}; + case ETypeName::Interval: + return {VT_INTERVAL, true}; + + case ETypeName::TzDate: + case ETypeName::TzDatetime: + case ETypeName::TzTimestamp: + break; + + case ETypeName::Json: + return {VT_JSON, true}; + case ETypeName::Decimal: + return {VT_STRING, true}; + case ETypeName::Uuid: + break; + case ETypeName::Yson: + return {VT_ANY, true}; + + case ETypeName::Void: + return {VT_VOID, false}; + case ETypeName::Null: + return {VT_NULL, false}; + + case ETypeName::Optional: + { + auto itemType = type->AsOptional()->GetItemType(); + if (itemType->IsPrimitive()) { + auto simplified = Simplify(itemType->AsPrimitive()); + if (simplified.second) { + simplified.second = false; + return simplified; + } + } + return {VT_ANY, false}; + } + case ETypeName::List: + return {VT_ANY, true}; + case ETypeName::Dict: + return {VT_ANY, true}; + case ETypeName::Struct: + return {VT_ANY, true}; + case ETypeName::Tuple: + return {VT_ANY, true}; + case ETypeName::Variant: + return {VT_ANY, true}; + case ETypeName::Tagged: + return Simplify(type->AsTagged()->GetItemType()); + } + ythrow TApiUsageError() << "Unsupported type: " << typeName; +} + +NTi::TTypePtr ToTypeV3(EValueType type, bool required) +{ + auto typeV3 = OldTypeToTypeV3(type); + if (!Simplify(typeV3).second) { + if (required) { + ythrow TApiUsageError() << "type: " << type << " cannot be required"; + } else { + return typeV3; + } + } + if (required) { + return typeV3; + } else { + return NTi::Optional(typeV3); + } +} + +TColumnSchema::TColumnSchema() + : TypeV3_(NTi::Optional(NTi::Int64())) +{ } + +EValueType TColumnSchema::Type() const +{ + return Simplify(TypeV3_).first; +} + +TColumnSchema& TColumnSchema::Type(EValueType type) & +{ + return 
Type(ToTypeV3(type, false)); +} + +TColumnSchema TColumnSchema::Type(EValueType type) && +{ + return Type(ToTypeV3(type, false)); +} + +TColumnSchema& TColumnSchema::Type(const NTi::TTypePtr& type) & +{ + Y_VERIFY(type.Get(), "Cannot create column schema with nullptr type"); + TypeV3_ = type; + return *this; +} + +TColumnSchema TColumnSchema::Type(const NTi::TTypePtr& type) && +{ + Y_VERIFY(type.Get(), "Cannot create column schema with nullptr type"); + TypeV3_ = type; + return *this; +} + +TColumnSchema& TColumnSchema::TypeV3(const NTi::TTypePtr& type) & +{ + return Type(type); +} + +TColumnSchema TColumnSchema::TypeV3(const NTi::TTypePtr& type) && +{ + return Type(type); +} + +NTi::TTypePtr TColumnSchema::TypeV3() const +{ + return TypeV3_; +} + +bool TColumnSchema::Required() const +{ + return Simplify(TypeV3_).second; +} + +TColumnSchema& TColumnSchema::Type(EValueType type, bool required) & +{ + return Type(ToTypeV3(type, required)); +} + +TColumnSchema TColumnSchema::Type(EValueType type, bool required) && +{ + return Type(ToTypeV3(type, required)); +} + +bool operator==(const TColumnSchema& lhs, const TColumnSchema& rhs) +{ + return + lhs.Name() == rhs.Name() && + NTi::NEq::TStrictlyEqual()(lhs.TypeV3(), rhs.TypeV3()) && + lhs.SortOrder() == rhs.SortOrder() && + lhs.Lock() == rhs.Lock() && + lhs.Expression() == rhs.Expression() && + lhs.Aggregate() == rhs.Aggregate() && + lhs.Group() == rhs.Group(); +} + +//////////////////////////////////////////////////////////////////////////////// + +bool TTableSchema::Empty() const +{ + return Columns_.empty(); +} + +TTableSchema& TTableSchema::AddColumn(const TString& name, EValueType type) & +{ + Columns_.push_back(TColumnSchema().Name(name).Type(type)); + return *this; +} + +TTableSchema TTableSchema::AddColumn(const TString& name, EValueType type) && +{ + return std::move(AddColumn(name, type)); +} + +TTableSchema& TTableSchema::AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) & +{ + 
Columns_.push_back(TColumnSchema().Name(name).Type(type).SortOrder(sortOrder)); + return *this; +} + +TTableSchema TTableSchema::AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) && +{ + return std::move(AddColumn(name, type, sortOrder)); +} + +TTableSchema& TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type) & +{ + Columns_.push_back(TColumnSchema().Name(name).Type(type)); + return *this; +} + +TTableSchema TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type) && +{ + return std::move(AddColumn(name, type)); +} + +TTableSchema& TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) & +{ + Columns_.push_back(TColumnSchema().Name(name).Type(type).SortOrder(sortOrder)); + return *this; +} + +TTableSchema TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) && +{ + return std::move(AddColumn(name, type, sortOrder)); +} + +TTableSchema& TTableSchema::SortBy(const TSortColumns& sortColumns) & +{ + Y_ENSURE(sortColumns.Parts_.size() <= Columns_.size()); + + THashMap<TString, ui64> sortColumnIndex; + for (auto i: xrange(sortColumns.Parts_.size())) { + Y_ENSURE(sortColumnIndex.emplace(sortColumns.Parts_[i].Name(), i).second, + "Key column name '" << sortColumns.Parts_[i].Name() << "' repeats in columns list"); + } + + TVector<TColumnSchema> newColumnsSorted(sortColumns.Parts_.size()); + TVector<TColumnSchema> newColumnsUnsorted; + for (auto& column : Columns_) { + auto it = sortColumnIndex.find(column.Name()); + if (it == sortColumnIndex.end()) { + column.ResetSortOrder(); + newColumnsUnsorted.push_back(std::move(column)); + } else { + auto index = it->second; + const auto& sortColumn = sortColumns.Parts_[index]; + column.SortOrder(sortColumn.SortOrder()); + newColumnsSorted[index] = std::move(column); + sortColumnIndex.erase(it); + } + } + + Y_ENSURE(sortColumnIndex.empty(), "Column name '" << sortColumnIndex.begin()->first + << "' not 
found in table schema"); + + newColumnsSorted.insert(newColumnsSorted.end(), newColumnsUnsorted.begin(), newColumnsUnsorted.end()); + Columns_ = std::move(newColumnsSorted); + + return *this; +} + +TTableSchema TTableSchema::SortBy(const TSortColumns& sortColumns) && +{ + return std::move(SortBy(sortColumns)); +} + +TVector<TColumnSchema>& TTableSchema::MutableColumns() +{ + return Columns_; +} + +TNode TTableSchema::ToNode() const +{ + TNode result; + TNodeBuilder builder(&result); + Serialize(*this, &builder); + return result; +} + +TTableSchema TTableSchema::FromNode(const TNode& node) +{ + TTableSchema schema; + Deserialize(schema, node); + return schema; +} + +bool operator==(const TTableSchema& lhs, const TTableSchema& rhs) +{ + return + lhs.Columns() == rhs.Columns() && + lhs.Strict() == rhs.Strict() && + lhs.UniqueKeys() == rhs.UniqueKeys(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TKeyBound::TKeyBound(ERelation relation, TKey key) + : Relation_(relation) + , Key_(std::move(key)) +{ } + +//////////////////////////////////////////////////////////////////////////////// + +TTableSchema CreateTableSchema( + const Descriptor& messageDescriptor, + const TSortColumns& sortColumns, + bool keepFieldsWithoutExtension) +{ + auto result = CreateTableSchema(messageDescriptor, keepFieldsWithoutExtension); + if (!sortColumns.Parts_.empty()) { + result.SortBy(sortColumns.Parts_); + } + return result; +} + +TTableSchema CreateTableSchema(NTi::TTypePtr type) +{ + Y_VERIFY(type); + TTableSchema schema; + Deserialize(schema, NodeFromYsonString(NTi::NIo::AsYtSchema(type.Get()))); + return schema; +} + +//////////////////////////////////////////////////////////////////////////////// + +bool IsTrivial(const TReadLimit& readLimit) +{ + return !readLimit.Key_ && !readLimit.RowIndex_ && !readLimit.Offset_ && !readLimit.TabletIndex_ && !readLimit.KeyBound_; +} + +EValueType NodeTypeToValueType(TNode::EType nodeType) +{ + switch 
(nodeType) { + case TNode::EType::Int64: return VT_INT64; + case TNode::EType::Uint64: return VT_UINT64; + case TNode::EType::String: return VT_STRING; + case TNode::EType::Double: return VT_DOUBLE; + case TNode::EType::Bool: return VT_BOOLEAN; + default: + ythrow yexception() << "Cannot convert TNode type " << nodeType << " to EValueType"; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +const TVector<TReadRange>& GetRangesCompat(const TRichYPath& path) +{ + static const TVector<TReadRange> empty; + + const auto& maybeRanges = path.GetRanges(); + if (maybeRanges.Empty()) { + return empty; + } else if (maybeRanges->size() > 0) { + return *maybeRanges; + } else { + // If you see this exception, that means that caller of this function doesn't know what to do + // with RichYPath that has set range list, but the range list is empty. + // + // To avoid this exception caller must explicitly handle such case. + // NB. YT-17683 + ythrow TApiUsageError() << "Unsupported RichYPath: explicitly empty range list"; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +TString ToString(EValueType type) +{ + switch (type) { + case VT_INT8: + return "int8"; + case VT_INT16: + return "int16"; + case VT_INT32: + return "int32"; + case VT_INT64: + return "int64"; + + case VT_UINT8: + return "uint8"; + case VT_UINT16: + return "uint16"; + case VT_UINT32: + return "uint32"; + case VT_UINT64: + return "uint64"; + + case VT_DOUBLE: + return "double"; + + case VT_BOOLEAN: + return "boolean"; + + case VT_STRING: + return "string"; + case VT_UTF8: + return "utf8"; + + case VT_ANY: + return "any"; + + case VT_NULL: + return "null"; + case VT_VOID: + return "void"; + + case VT_DATE: + return "date"; + case VT_DATETIME: + return "datetime"; + case VT_TIMESTAMP: + return "timestamp"; + case VT_INTERVAL: + 
return "interval"; + + case VT_FLOAT: + return "float"; + + case VT_JSON: + return "json"; + } + ythrow yexception() << "Invalid value type " << static_cast<int>(type); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT + +template <> +void Out<NYT::TSortColumn>(IOutputStream& os, const NYT::TSortColumn& sortColumn) +{ + if (sortColumn.SortOrder() == NYT::ESortOrder::SO_ASCENDING) { + os << sortColumn.Name(); + } else { + os << NYT::BuildYsonStringFluently(NYson::EYsonFormat::Text).Value(sortColumn); + } +} diff --git a/yt/cpp/mapreduce/interface/common.h b/yt/cpp/mapreduce/interface/common.h new file mode 100644 index 0000000000..b1754ade70 --- /dev/null +++ b/yt/cpp/mapreduce/interface/common.h @@ -0,0 +1,1301 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/common.h +/// +/// Header containing miscellaneous structs and classes used in library. + +#include "fwd.h" + +#include <library/cpp/type_info/type_info.h> +#include <library/cpp/yson/node/node.h> + +#include <util/generic/guid.h> +#include <util/generic/map.h> +#include <util/generic/maybe.h> +#include <util/generic/ptr.h> +#include <util/system/type_name.h> +#include <util/generic/vector.h> + +#include <google/protobuf/message.h> + +#include <initializer_list> +#include <type_traits> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// @cond Doxygen_Suppress +#define FLUENT_FIELD(type, name) \ + type name##_; \ + TSelf& name(const type& value) \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + static_assert(true) + +#define FLUENT_FIELD_ENCAPSULATED(type, name) \ +private: \ + type name##_; \ +public: \ + TSelf& name(const type& value) & \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + TSelf name(const type& value) && \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + const type& 
name() const & \ + { \ + return name##_; \ + } \ + type name() && \ + { \ + return name##_; \ + } \ + static_assert(true) + +#define FLUENT_FIELD_OPTION(type, name) \ + TMaybe<type> name##_; \ + TSelf& name(const type& value) \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + static_assert(true) + +#define FLUENT_FIELD_OPTION_ENCAPSULATED(type, name) \ +private: \ + TMaybe<type> name##_; \ +public: \ + TSelf& name(const type& value) & \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + TSelf name(const type& value) && \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + TSelf& Reset##name() & \ + { \ + name##_ = Nothing(); \ + return static_cast<TSelf&>(*this); \ + } \ + TSelf Reset##name() && \ + { \ + name##_ = Nothing(); \ + return static_cast<TSelf&>(*this); \ + } \ + const TMaybe<type>& name() const& \ + { \ + return name##_; \ + } \ + TMaybe<type> name() && \ + { \ + return name##_; \ + } \ + static_assert(true) + +#define FLUENT_FIELD_DEFAULT(type, name, defaultValue) \ + type name##_ = defaultValue; \ + TSelf& name(const type& value) \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + static_assert(true) + +#define FLUENT_FIELD_DEFAULT_ENCAPSULATED(type, name, defaultValue) \ +private: \ + type name##_ = defaultValue; \ +public: \ + TSelf& name(const type& value) & \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + TSelf name(const type& value) && \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + const type& name() const & \ + { \ + return name##_; \ + } \ + type name() && \ + { \ + return name##_; \ + } \ + static_assert(true) + +#define FLUENT_VECTOR_FIELD(type, name) \ + TVector<type> name##s_; \ + TSelf& Add##name(const type& value) \ + { \ + name##s_.push_back(value); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf& name##s(TVector<type> values) \ + { \ + name##s_ = std::move(values); \ + return 
static_cast<TSelf&>(*this);\ + } \ + static_assert(true) + +#define FLUENT_OPTIONAL_VECTOR_FIELD_ENCAPSULATED(type, name) \ +private: \ + TMaybe<TVector<type>> name##s_; \ +public: \ + const TMaybe<TVector<type>>& name##s() const & { \ + return name##s_; \ + } \ + TMaybe<TVector<type>>& name##s() & { \ + return name##s_; \ + } \ + TMaybe<TVector<type>> name##s() && { \ + return std::move(name##s_); \ + } \ + TSelf& Add##name(const type& value) & \ + { \ + if (name##s_.Empty()) { \ + name##s_.ConstructInPlace(); \ + } \ + name##s_->push_back(value); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf Add##name(const type& value) && \ + { \ + if (name##s_.Empty()) { \ + name##s_.ConstructInPlace(); \ + } \ + name##s_->push_back(value); \ + return static_cast<TSelf&&>(*this);\ + } \ + TSelf& name##s(TVector<type> values) & \ + { \ + name##s_ = std::move(values); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf name##s(TVector<type> values) && \ + { \ + name##s_ = std::move(values); \ + return static_cast<TSelf&&>(*this);\ + } \ + TSelf& name##s(TNothing) & \ + { \ + name##s_ = Nothing(); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf name##s(TNothing) && \ + { \ + name##s_ = Nothing(); \ + return static_cast<TSelf&&>(*this);\ + } \ + TSelf& Reset##name##s() & \ + { \ + name##s_ = Nothing(); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf Reset##name##s() && \ + { \ + name##s_ = Nothing(); \ + return static_cast<TSelf&&>(*this);\ + } \ + static_assert(true) + +#define FLUENT_VECTOR_FIELD_ENCAPSULATED(type, name) \ +private: \ + TVector<type> name##s_; \ +public: \ + TSelf& Add##name(const type& value) & \ + { \ + name##s_.push_back(value); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf Add##name(const type& value) && \ + { \ + name##s_.push_back(value); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf& name##s(TVector<type> value) & \ + { \ + name##s_ = std::move(value); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf 
name##s(TVector<type> value) && \ + { \ + name##s_ = std::move(value); \ + return static_cast<TSelf&>(*this);\ + } \ + const TVector<type>& name##s() const & \ + { \ + return name##s_; \ + } \ + TVector<type> name##s() && \ + { \ + return name##s_; \ + } \ + static_assert(true) + +#define FLUENT_MAP_FIELD(keytype, valuetype, name) \ + TMap<keytype,valuetype> name##_; \ + TSelf& Add##name(const keytype& key, const valuetype& value) \ + { \ + name##_.emplace(key, value); \ + return static_cast<TSelf&>(*this);\ + } \ + static_assert(true) + +/// @endcond + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Convenience class that keeps sequence of items. +/// +/// Designed to be used as function parameter. +/// +/// Users of such function can then pass: +/// - single item, +/// - initializer list of items, +/// - vector of items; +/// as argument to this function. +/// +/// Example: +/// ``` +/// void Foo(const TOneOrMany<int>& arg); +/// ... +/// Foo(1); // ok +/// Foo({1, 2, 3}); // ok +/// ``` +template <class T, class TDerived> +struct TOneOrMany +{ + /// @cond Doxygen_Suppress + using TSelf = std::conditional_t<std::is_void_v<TDerived>, TOneOrMany, TDerived>; + /// @endcond + + /// Initialize with empty sequence. + TOneOrMany() = default; + + // Initialize from initializer list. + template<class U> + TOneOrMany(std::initializer_list<U> il) + { + Parts_.assign(il.begin(), il.end()); + } + + /// Put arguments to sequence + template <class U, class... TArgs> + requires std::is_convertible_v<U, T> + TOneOrMany(U&& arg, TArgs&&... args) + { + Add(arg, std::forward<TArgs>(args)...); + } + + /// Initialize from vector. + TOneOrMany(TVector<T> args) + : Parts_(std::move(args)) + { } + + /// @brief Order is defined the same way as in TVector + bool operator==(const TOneOrMany& rhs) const + { + // N.B. 
We would like to make this method to be `= default`, + // but this breaks MSVC compiler for the cases when T doesn't + // support comparison. + return Parts_ == rhs.Parts_; + } + + /// + /// @{ + /// + /// @brief Add all arguments to sequence + template <class U, class... TArgs> + requires std::is_convertible_v<U, T> + TSelf& Add(U&& part, TArgs&&... args) & + { + Parts_.push_back(std::forward<U>(part)); + if constexpr (sizeof...(args) > 0) { + [[maybe_unused]] int dummy[sizeof...(args)] = {(Parts_.push_back(std::forward<TArgs>(args)), 0) ... }; + } + return static_cast<TSelf&>(*this); + } + + template <class U, class... TArgs> + requires std::is_convertible_v<U, T> + TSelf Add(U&& part, TArgs&&... args) && + { + return std::move(Add(std::forward<U>(part), std::forward<TArgs>(args)...)); + } + /// @} + + /// Content of sequence. + TVector<T> Parts_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Type of the value that can occur in YT table. +/// +/// @ref NYT::TTableSchema +/// https://yt.yandex-team.ru/docs/description/storage/data_types +enum EValueType : int +{ + /// Int64, signed integer of 64 bits. + VT_INT64, + + /// Uint64, unsigned integer of 64 bits. + VT_UINT64, + + /// Double, floating point number of double precision (64 bits). + VT_DOUBLE, + /// Boolean, `true` or `false`. + VT_BOOLEAN, + + /// String, arbitrary byte sequence. + VT_STRING, + + /// Any, arbitrary yson document. + VT_ANY, + + /// Int8, signed integer of 8 bits. + VT_INT8, + /// Int16, signed integer of 16 bits. + VT_INT16, + /// Int32, signed integer of 32 bits. + VT_INT32, + + /// Uint8, unsigned integer of 8 bits. + VT_UINT8, + /// Uint16, unsigned integer of 16 bits. + VT_UINT16, + /// Uint32, unsigned integer of 32 bits. + VT_UINT32, + + /// Utf8, byte sequence that is valid utf8. 
+ VT_UTF8, + + /// Null, absence of value (almost never used in schemas) + VT_NULL, + /// Void, absence of value (almost never used in schemas) the difference between null, and void is yql-specific. + VT_VOID, + + /// Date, number of days since Unix epoch (unsigned) + VT_DATE, + /// Datetime, number of seconds since Unix epoch (unsigned) + VT_DATETIME, + /// Timestamp, number of milliseconds since Unix epoch (unsigned) + VT_TIMESTAMP, + /// Interval, difference between two timestamps (signed) + VT_INTERVAL, + + /// Float, floating point number (32 bits) + VT_FLOAT, + /// Json, sequence of bytes that is valid json. + VT_JSON, +}; + +/// +/// @brief Sort order. +/// +/// @ref NYT::TTableSchema +enum ESortOrder : int +{ + /// Ascending sort order. + SO_ASCENDING /* "ascending" */, + /// Descending sort order. + SO_DESCENDING /* "descending" */, +}; + +/// +/// @brief Value of "optimize_for" attribute. +/// +/// @ref NYT::TRichYPath +enum EOptimizeForAttr : i8 +{ + /// Optimize for scan + OF_SCAN_ATTR /* "scan" */, + + /// Optimize for lookup + OF_LOOKUP_ATTR /* "lookup" */, +}; + +/// +/// @brief Value of "erasure_codec" attribute. +/// +/// @ref NYT::TRichYPath +enum EErasureCodecAttr : i8 +{ + /// @cond Doxygen_Suppress + EC_NONE_ATTR /* "none" */, + EC_REED_SOLOMON_6_3_ATTR /* "reed_solomon_6_3" */, + EC_LRC_12_2_2_ATTR /* "lrc_12_2_2" */, + EC_ISA_LRC_12_2_2_ATTR /* "isa_lrc_12_2_2" */, + /// @endcond +}; + +/// +/// @brief Value of "schema_modification" attribute. +/// +/// @ref NYT::TRichYPath +enum ESchemaModificationAttr : i8 +{ + SM_NONE_ATTR /* "none" */, + SM_UNVERSIONED_UPDATE /* "unversioned_update" */, +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Table key column description. +/// +/// The description includes column name and sort order. 
+/// +/// @anchor TSortOrder_backward_compatibility +/// @note +/// Many functions that use `TSortColumn` as argument used to take `TString` +/// (the only allowed sort order was "ascending" and user didn't have to specify it). +/// @note +/// This class is designed to provide backward compatibility for such code and therefore +/// objects of this class can be constructed and assigned from TString-like objects only. +/// +/// @see NYT::TSortOperationSpec +class TSortColumn +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TSortColumn; + /// @endcond + + /// Column name + FLUENT_FIELD_ENCAPSULATED(TString, Name); + + /// Sort order + FLUENT_FIELD_DEFAULT_ENCAPSULATED(ESortOrder, SortOrder, ESortOrder::SO_ASCENDING); + + /// + /// @{ + /// + /// @brief Construct object from name and sort order + /// + /// Constructors are intentionally implicit so `TSortColumn` can be compatible with old code. + /// @ref TSortOrder_backward_compatibility + TSortColumn(TStringBuf name = {}, ESortOrder sortOrder = ESortOrder::SO_ASCENDING); + TSortColumn(const TString& name, ESortOrder sortOrder = ESortOrder::SO_ASCENDING); + TSortColumn(const char* name, ESortOrder sortOrder = ESortOrder::SO_ASCENDING); + /// @} + + /// Check that sort order is ascending, throw exception otherwise. + const TSortColumn& EnsureAscending() const; + + /// @brief Convert sort column to yson representation as YT API expects it. + TNode ToNode() const; + + /// @brief Comparison is default and checks both name and sort order. + bool operator == (const TSortColumn& rhs) const = default; + + /// + /// @{ + /// + /// @brief Assign object from column name, and set sort order to `ascending`. + /// + /// These are backward compatibility methods.
+ /// + /// @ref TSortOrder_backward_compatibility + TSortColumn& operator = (TStringBuf name); + TSortColumn& operator = (const TString& name); + TSortColumn& operator = (const char* name); + /// @} + + bool operator == (const TStringBuf rhsName) const; + bool operator != (const TStringBuf rhsName) const; + bool operator == (const TString& rhsName) const; + bool operator != (const TString& rhsName) const; + bool operator == (const char* rhsName) const; + bool operator != (const char* rhsName) const; + + // Intentionally implicit conversions. + operator TString() const; + operator TStringBuf() const; + operator std::string() const; + + Y_SAVELOAD_DEFINE(Name_, SortOrder_); +}; + +/// +/// @brief List of @ref TSortColumn +/// +/// Contains a bunch of helper methods such as constructing from single object. +class TSortColumns + : public TOneOrMany<TSortColumn, TSortColumns> +{ +public: + using TOneOrMany<TSortColumn, TSortColumns>::TOneOrMany; + + /// Construct empty list. + TSortColumns(); + + /// + /// @{ + /// + /// @brief Construct list of ascending sort order columns by their names. + /// + /// Required for backward compatibility. + /// + /// @ref TSortOrder_backward_compatibility + TSortColumns(const TVector<TString>& names); + TSortColumns(const TColumnNames& names); + /// @} + + + /// + /// @brief Implicit conversion to column list. + /// + /// If all columns has ascending sort order return list of their names. + /// Throw exception otherwise. + /// + /// Required for backward compatibility. + /// + /// @ref TSortOrder_backward_compatibility + operator TColumnNames() const; + + /// Make sure that all columns are of ascending sort order. + const TSortColumns& EnsureAscending() const; + + /// Get list of column names. + TVector<TString> GetNames() const; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Helper function to create new style type from old style one. 
+NTi::TTypePtr ToTypeV3(EValueType type, bool required); + +/// +/// @brief Single column description +/// +/// Each field describing column has setter and getter. +/// +/// Example reading field: +/// ``` +/// ... columnSchema.Name() ... +/// ``` +/// +/// Example setting field: +/// ``` +/// columnSchema.Name("my-column").Type(VT_INT64); // set name and type +/// ``` +/// +/// @ref https://yt.yandex-team.ru/docs/description/storage/static_schema +class TColumnSchema +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TColumnSchema; + /// @endcond + + /// + /// @brief Construct empty column schema + /// + /// @note + /// Such column schema cannot be used in a schema as it doesn't have a name. + TColumnSchema(); + + /// + /// @{ + /// + /// @brief Copy constructor and copy assignment operator are default. + TColumnSchema(const TColumnSchema&) = default; + TColumnSchema& operator=(const TColumnSchema&) = default; + /// @} + + + FLUENT_FIELD_ENCAPSULATED(TString, Name); + + /// + /// @brief Functions to work with type in old manner. + /// + /// @deprecated New code is recommended to work with types using @ref NTi::TTypePtr from type_info library. + TColumnSchema& Type(EValueType type) &; + TColumnSchema Type(EValueType type) &&; + EValueType Type() const; + + /// @brief Set and get column type. + /// @{ + TColumnSchema& Type(const NTi::TTypePtr& type) &; + TColumnSchema Type(const NTi::TTypePtr& type) &&; + + TColumnSchema& TypeV3(const NTi::TTypePtr& type) &; + TColumnSchema TypeV3(const NTi::TTypePtr& type) &&; + NTi::TTypePtr TypeV3() const; + /// @} + + /// + /// @brief Raw yson representation of column type + /// @deprecated Prefer to use `TypeV3` methods.
+ FLUENT_FIELD_OPTION_ENCAPSULATED(TNode, RawTypeV3); + + /// Column sort order + FLUENT_FIELD_OPTION_ENCAPSULATED(ESortOrder, SortOrder); + + /// + /// @brief Lock group name + /// + /// @ref https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#blokirovka-stroki + FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Lock); + + /// Expression defining column value + FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Expression); + + /// Aggregating function name + FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Aggregate); + + /// + /// @brief Storage group name + /// + /// @ref https://yt.yandex-team.ru/docs/description/storage/static_schema + FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Group); + + /// + /// @brief Column requiredness. + /// + /// Required columns don't accept NULL values. + /// Usually a required column is one whose type is not wrapped in Optional<...> (TODO confirm against NYT::ToTypeV3). + bool Required() const; + + /// + /// @{ + /// + /// @brief Set type in old-style manner + TColumnSchema& Type(EValueType type, bool required) &; + TColumnSchema Type(EValueType type, bool required) &&; + /// @} + +private: + friend void Deserialize(TColumnSchema& columnSchema, const TNode& node); + NTi::TTypePtr TypeV3_; + bool Required_ = false; +}; + +/// Equality check checks all fields of column schema. +bool operator==(const TColumnSchema& lhs, const TColumnSchema& rhs); + +/// +/// @brief Description of table schema +/// +/// @see https://yt.yandex-team.ru/docs/description/storage/static_schema +class TTableSchema +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TTableSchema; + /// @endcond + + /// Column schema + FLUENT_VECTOR_FIELD_ENCAPSULATED(TColumnSchema, Column); + + /// + /// @brief Strictness of the schema + /// + /// Strict schemas are not allowed to have columns not described in schema.
+ /// Nonstrict schemas are allowed to have such columns (the type assumed for them is presumably `any`; TODO confirm). + FLUENT_FIELD_DEFAULT_ENCAPSULATED(bool, Strict, true); + + /// + /// @brief Whether keys are unique + /// + /// This flag can be set only for schemas that have sorted columns. + /// If flag is set table cannot have multiple rows with same key. + FLUENT_FIELD_DEFAULT_ENCAPSULATED(bool, UniqueKeys, false); + + /// Get modifiable column list + TVector<TColumnSchema>& MutableColumns(); + + /// Check if schema has any described column + [[nodiscard]] bool Empty() const; + + /// Add column + TTableSchema& AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) &; + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, const NTi::TTypePtr&, ESortOrder)&; + TTableSchema AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) &&; + + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, const NTi::TTypePtr&, ESortOrder)&; + TTableSchema& AddColumn(const TString& name, const NTi::TTypePtr& type) &; + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, const NTi::TTypePtr&, ESortOrder)&; + TTableSchema AddColumn(const TString& name, const NTi::TTypePtr& type) &&; + + /// Add optional column of specified type + TTableSchema& AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) &; + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, EValueType, ESortOrder)&; + TTableSchema AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) &&; + + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, EValueType, ESortOrder)&; + TTableSchema& AddColumn(const TString& name, EValueType type) &; + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, EValueType, ESortOrder)&; + TTableSchema AddColumn(const TString& name, EValueType type) &&; + + /// + /// @brief Make table schema sorted by specified columns + /// + /// Resets old key columns if any + TTableSchema& SortBy(const
TSortColumns& columns) &; + + /// @copydoc NYT::TTableSchema::SortBy(const TSortColumns&)&; + TTableSchema SortBy(const TSortColumns& columns) &&; + + /// Get yson description of table schema + [[nodiscard]] TNode ToNode() const; + + /// Parse schema from yson node + static NYT::TTableSchema FromNode(const TNode& node); + + friend void Deserialize(TTableSchema& tableSchema, const TNode& node); +}; + +/// Check for equality of all columns and all schema attributes +bool operator==(const TTableSchema& lhs, const TTableSchema& rhs); + +/// Create table schema by protobuf message descriptor +TTableSchema CreateTableSchema( + const ::google::protobuf::Descriptor& messageDescriptor, + const TSortColumns& sortColumns = TSortColumns(), + bool keepFieldsWithoutExtension = true); + +/// Create table schema by protobuf message type +template <class TProtoType, typename = std::enable_if_t<std::is_base_of_v<::google::protobuf::Message, TProtoType>>> +inline TTableSchema CreateTableSchema( + const TSortColumns& sortColumns = TSortColumns(), + bool keepFieldsWithoutExtension = true) +{ + static_assert( + std::is_base_of_v<::google::protobuf::Message, TProtoType>, + "Template argument must be derived from ::google::protobuf::Message"); + + return CreateTableSchema( + *TProtoType::descriptor(), + sortColumns, + keepFieldsWithoutExtension); +} + +/// +/// @brief Create strict table schema from `struct` type. +/// +/// Names and types of columns are taken from struct member names and types. +/// `Strict` flag is set to true, all other attribute of schema and columns +/// are left with default values +TTableSchema CreateTableSchema(NTi::TTypePtr type); + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Enumeration describing comparison operation used in key bound. +/// +/// ERelation is a part of @ref NYT::TKeyBound that can be used as +/// lower or upper key limit in @ref TReadLimit. 
+/// +/// Relations `Less` and `LessOrEqual` are for upper limit and +/// relations `Greater` and `GreaterOrEqual` are for lower limit. +/// +/// It is a error to use relation in the limit of wrong kind. +/// +/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath +enum class ERelation +{ + /// + /// @brief Relation "less" + /// + /// Specifies range of keys that are before specified key. + /// Can only be used in upper limit. + Less /* "<" */, + + /// + /// @brief Relation "less or equal" + /// + /// Specifies range of keys that are before or equal specified key. + /// Can only be used in upper limit. + LessOrEqual /* "<=" */, + + /// + /// @brief Relation "greater" + /// + /// Specifies range of keys that are after specified key. + /// Can only be used in lower limit. + Greater /* ">" */, + + /// + /// @brief Relation "greater or equal" + /// + /// Specifies range of keys that are after or equal than specified key. + /// Can only be used in lower limit. + GreaterOrEqual /* ">=" */, +}; + +/// +/// @brief Key with relation specifying interval of keys in lower or upper limit of @ref NYT::TReadRange +/// +/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath +struct TKeyBound +{ + /// @cond Doxygen_Suppress + using TSelf = TKeyBound; + + explicit TKeyBound(ERelation relation = ERelation::Less, TKey key = TKey{}); + + FLUENT_FIELD_DEFAULT_ENCAPSULATED(ERelation, Relation, ERelation::Less); + FLUENT_FIELD_DEFAULT_ENCAPSULATED(TKey, Key, TKey{}); + /// @endcond +}; + +/// +/// @brief Description of the read limit. +/// +/// It is actually a variant and must store exactly one field. +/// +/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath +struct TReadLimit +{ + /// @cond Doxygen_Suppress + using TSelf = TReadLimit; + /// @endcond + + /// + /// @brief KeyBound specifies table key and whether to include it + /// + /// It can be used in lower or upper limit when reading tables. 
+ FLUENT_FIELD_OPTION(TKeyBound, KeyBound); + + /// + /// @brief Table key + /// + /// It can be used in exact, lower or upper limit when reading tables. + FLUENT_FIELD_OPTION(TKey, Key); + + /// + /// @brief Row index + /// + /// It can be used in exact, lower or upper limit when reading tables. + FLUENT_FIELD_OPTION(i64, RowIndex); + + /// + /// @brief File offset + /// + /// It can be used in lower or upper limit when reading files. + FLUENT_FIELD_OPTION(i64, Offset); + + /// + /// @brief Tablet index + /// + /// It can be used in lower or upper limit in dynamic table operations + FLUENT_FIELD_OPTION(i64, TabletIndex); +}; + +/// +/// @brief Range of a table or a file +/// +/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath +struct TReadRange +{ + using TSelf = TReadRange; + + /// + /// @brief Lower limit of the range + /// + /// It is usually inclusive (except when @ref NYT::TKeyBound with relation @ref NYT::ERelation::Greater is used). + FLUENT_FIELD(TReadLimit, LowerLimit); + + /// + /// @brief Upper limit of the range + /// + /// It is usually exclusive (except when @ref NYT::TKeyBound with relation @ref NYT::ERelation::LessOrEqual is used). + FLUENT_FIELD(TReadLimit, UpperLimit); + + /// Exact key or row index. + FLUENT_FIELD(TReadLimit, Exact); + + /// Create read range from row indexes. + static TReadRange FromRowIndices(i64 lowerLimit, i64 upperLimit) + { + return TReadRange() + .LowerLimit(TReadLimit().RowIndex(lowerLimit)) + .UpperLimit(TReadLimit().RowIndex(upperLimit)); + } + + /// Create read range from keys. + static TReadRange FromKeys(const TKey& lowerKeyInclusive, const TKey& upperKeyExclusive) + { + return TReadRange() + .LowerLimit(TReadLimit().Key(lowerKeyInclusive)) + .UpperLimit(TReadLimit().Key(upperKeyExclusive)); + } +}; + +/// +/// @brief Path with additional attributes. +/// +/// Allows to specify additional attributes for path used in some operations.
+/// +/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath +struct TRichYPath +{ + /// @cond Doxygen_Suppress + using TSelf = TRichYPath; + /// @endcond + + /// Path itself. + FLUENT_FIELD(TYPath, Path); + + /// Specifies that path should be appended not overwritten + FLUENT_FIELD_OPTION(bool, Append); + + /// @deprecated Deprecated attribute. + FLUENT_FIELD_OPTION(bool, PartiallySorted); + + /// Specifies that path is expected to be sorted by these columns. + FLUENT_FIELD(TSortColumns, SortedBy); + + /// Add range to read. + TRichYPath& AddRange(TReadRange range) + { + if (!Ranges_) { + Ranges_.ConstructInPlace(); + } + Ranges_->push_back(std::move(range)); + return *this; + } + + TRichYPath& ResetRanges() + { + Ranges_.Clear(); + return *this; + } + + /// + /// @{ + /// + /// Return ranges to read. + /// + /// NOTE: Nothing (in TMaybe) and empty TVector are different ranges. + /// Nothing represents universal range (reader reads all table rows). + /// Empty TVector represents empty range (reader returns empty set of rows). + const TMaybe<TVector<TReadRange>>& GetRanges() const + { + return Ranges_; + } + + TMaybe<TVector<TReadRange>>& MutableRanges() + { + return Ranges_; + } + + /// + /// @{ + /// + /// Get range view, that is convenient way to iterate through all ranges. + TArrayRef<TReadRange> MutableRangesView() + { + if (Ranges_.Defined()) { + return TArrayRef(Ranges_->data(), Ranges_->size()); + } else { + return {}; + } + } + + TArrayRef<const TReadRange> GetRangesView() const + { + if (Ranges_.Defined()) { + return TArrayRef(Ranges_->data(), Ranges_->size()); + } else { + return {}; + } + } + /// @} + + /// @{ + /// + /// Get range by index. + const TReadRange& GetRange(ssize_t i) const + { + return Ranges_.GetRef()[i]; + } + + TReadRange& MutableRange(ssize_t i) + { + return Ranges_.GetRef()[i]; + } + /// @} + + /// + /// @brief Specifies columns that should be read. + /// + /// If it's set to Nothing then all columns will be read. 
+ /// If empty TColumnNames is specified then each read row will be empty. + FLUENT_FIELD_OPTION(TColumnNames, Columns); + + FLUENT_FIELD_OPTION(bool, Teleport); + FLUENT_FIELD_OPTION(bool, Primary); + FLUENT_FIELD_OPTION(bool, Foreign); + FLUENT_FIELD_OPTION(i64, RowCountLimit); + + FLUENT_FIELD_OPTION(TString, FileName); + + /// Specifies original path to be shown in Web UI + FLUENT_FIELD_OPTION(TYPath, OriginalPath); + + /// + /// @brief Specifies that this path points to executable file + /// + /// Used in operation specs. + FLUENT_FIELD_OPTION(bool, Executable); + + /// + /// @brief Specify format to use when loading table. + /// + /// Used in operation specs. + FLUENT_FIELD_OPTION(TNode, Format); + + /// @brief Specifies table schema that will be set on the path + FLUENT_FIELD_OPTION(TTableSchema, Schema); + + /// Specifies compression codec that will be set on the path + FLUENT_FIELD_OPTION(TString, CompressionCodec); + + /// Specifies erasure codec that will be set on the path + FLUENT_FIELD_OPTION(EErasureCodecAttr, ErasureCodec); + + /// Specifies schema modification that will be set on the path + FLUENT_FIELD_OPTION(ESchemaModificationAttr, SchemaModification); + + /// Specifies optimize_for attribute that will be set on the path + FLUENT_FIELD_OPTION(EOptimizeForAttr, OptimizeFor); + + /// + /// @brief Do not put file used in operation into node cache + /// + /// If BypassArtifactCache == true, file will be loaded into the job's sandbox bypassing the cache on the YT node. + /// It helps jobs that use tmpfs to start faster, + /// because files will be loaded into tmpfs directly bypassing disk cache + FLUENT_FIELD_OPTION(bool, BypassArtifactCache); + + /// + /// @brief Timestamp of dynamic table. + /// + /// NOTE: it is _not_ unix timestamp + /// (instead it's transaction timestamp, that is more complex structure). + FLUENT_FIELD_OPTION(i64, Timestamp); + + /// + /// @brief Specify transaction that should be used to access this path. 
+ /// + /// Allows to start cross-transactional operations. + FLUENT_FIELD_OPTION(TTransactionId, TransactionId); + + using TRenameColumnsDescriptor = THashMap<TString, TString>; + + /// Specifies columnar mapping which will be applied to columns before transfer to job. + FLUENT_FIELD_OPTION(TRenameColumnsDescriptor, RenameColumns); + + /// Create empty path with no attributes + TRichYPath() + { } + + /// + /// @{ + /// + /// @brief Create path from string + TRichYPath(const char* path) + : Path_(path) + { } + + TRichYPath(const TYPath& path) + : Path_(path) + { } + /// @} + +private: + TMaybe<TVector<TReadRange>> Ranges_; +}; + +/// +/// @brief Create copy of @ref NYT::TRichYPath with schema derived from proto message. +/// +/// If the path already has a schema, the copy keeps it unchanged. +template <typename TProtoType> +TRichYPath WithSchema(const TRichYPath& path, const TSortColumns& sortBy = TSortColumns()) +{ + static_assert(std::is_base_of_v<::google::protobuf::Message, TProtoType>, "TProtoType must be Protobuf message"); + + auto schemedPath = path; + if (!schemedPath.Schema_) { + schemedPath.Schema(CreateTableSchema<TProtoType>(sortBy)); + } + return schemedPath; +} + +/// +/// @brief Create copy of @ref NYT::TRichYPath with schema derived from TRowType if possible. +/// +/// If TRowType is protobuf message schema is derived from it and set to returned path. +/// Otherwise schema of original path is left unchanged (and probably unset). +template <typename TRowType> +TRichYPath MaybeWithSchema(const TRichYPath& path, const TSortColumns& sortBy = TSortColumns()) +{ + if constexpr (std::is_base_of_v<::google::protobuf::Message, TRowType>) { + return WithSchema<TRowType>(path, sortBy); + } else { + return path; + } +} + +/// +/// @brief Get the list of ranges related to path in compatibility mode. +/// +/// - If path is missing ranges, empty list is returned. +/// - If path has associated range list and the list is not empty, function returns this list.
+/// - If path has associated range list and this list is empty, exception is thrown. +/// +/// Before YT-17683 RichYPath didn't support empty range list and empty range actually meant universal range. +/// This function emulates this old behavior. +/// +/// @see https://st.yandex-team.ru/YT-17683 +const TVector<TReadRange>& GetRangesCompat(const TRichYPath& path); + +//////////////////////////////////////////////////////////////////////////////// + +/// Statistics about table columns. +struct TTableColumnarStatistics +{ + /// Total data weight for all chunks for each of requested columns. + THashMap<TString, i64> ColumnDataWeight; + + /// Total weight of all old chunks that don't keep columnar statistics. + i64 LegacyChunksDataWeight = 0; + + /// Timestamps total weight (only for dynamic tables). + TMaybe<i64> TimestampTotalWeight; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Description of a partition. +struct TMultiTablePartition +{ + struct TStatistics + { + i64 ChunkCount = 0; + i64 DataWeight = 0; + i64 RowCount = 0; + }; + + /// Ranges of input tables for this partition. + TVector<TRichYPath> TableRanges; + + /// Aggregate statistics of all the table ranges in the partition. + TStatistics AggregateStatistics; +}; + +/// Table partitions from GetTablePartitions command. +struct TMultiTablePartitions +{ + /// Disjoint partitions into which the input tables were divided. + TVector<TMultiTablePartition> Partitions; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Contains information about tablet +/// +/// @see NYT::IClient::GetTabletInfos +struct TTabletInfo +{ + /// + /// @brief Indicates the total number of rows added to the tablet (including trimmed ones). + /// + /// Currently only provided for ordered tablets. + i64 TotalRowCount = 0; + + /// + /// @brief Contains the number of front rows that are trimmed and are not guaranteed to be accessible.
+ /// + /// Only makes sense for ordered tablet. + i64 TrimmedRowCount = 0; + + /// + /// @brief Tablet cell barrier timestamp, which lags behind the current timestamp + /// + /// It is guaranteed that all transactions with commit timestamp not exceeding the barrier are fully committed; + /// i.e. all their added rows are visible (and are included in @ref NYT::TTabletInfo::TotalRowCount). + /// Mostly makes sense for ordered tablets. NOTE: unlike the other fields, this one has no default member initializer. + ui64 BarrierTimestamp; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// List of attributes to retrieve in operations like @ref NYT::ICypressClient::Get +struct TAttributeFilter +{ + /// @cond Doxygen_Suppress + using TSelf = TAttributeFilter; + /// @endcond + + /// List of attributes. + FLUENT_VECTOR_FIELD(TString, Attribute); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Check if none of the fields of @ref NYT::TReadLimit is set. +/// +/// @return true if no field of readLimit is set and false otherwise. +bool IsTrivial(const TReadLimit& readLimit); + +/// Convert yson node type to table schema type +EValueType NodeTypeToValueType(TNode::EType nodeType); + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Enumeration for specifying how reading from master is performed. +/// +/// Used in operations like NYT::ICypressClient::Get +enum class EMasterReadKind : int +{ + /// + /// @brief Reading from leader. + /// + /// Should almost never be used since it's expensive and for regular uses has no difference from + /// "follower" read. + Leader /* "leader" */, + + /// @brief Reading from master follower (default).
+ Follower /* "follower" */, + Cache /* "cache" */, + MasterCache /* "master_cache" */, +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @cond Doxygen_Suppress +namespace NDetail { + +// MUST NOT BE USED BY CLIENTS +// TODO: we should use default GENERATE_ENUM_SERIALIZATION +TString ToString(EValueType type); + +} // namespace NDetail +/// @endcond + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/common_ut.cpp b/yt/cpp/mapreduce/interface/common_ut.cpp new file mode 100644 index 0000000000..3f19433816 --- /dev/null +++ b/yt/cpp/mapreduce/interface/common_ut.cpp @@ -0,0 +1,303 @@ +#include "common_ut.h" + +#include "fluent.h" + +#include <yt/cpp/mapreduce/interface/common.h> + +#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <library/cpp/yson/node/node_io.h> +#include <library/cpp/yson/node/node_builder.h> + +#include <util/generic/xrange.h> + +using namespace NYT; + +template <class T> +TString SaveToString(const T& obj) +{ + TString s; + TStringOutput out(s); + ::Save(&out, obj); + return s; +} + +template <class T> +T LoadFromString(TStringBuf s) +{ + TMemoryInput in(s); + T obj; + ::Load(&in, obj); + return obj; +} + +template <class T> +T SaveLoad(const T& obj) +{ + return LoadFromString<T>(SaveToString(obj)); +} + +Y_UNIT_TEST_SUITE(Common) +{ + Y_UNIT_TEST(SortColumnsLegacy) + { + TSortColumns keys1("a", "b"); + UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b"})); + + keys1.Add("c", "d"); + UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b", "c", "d"})); + + auto keys2 = TSortColumns(keys1).Add("e", "f"); + UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b", "c", "d"})); + UNIT_ASSERT((keys2.Parts_ == TSortColumns{"a", "b", "c", "d", "e", "f"})); + + auto keys3 = TSortColumns(keys1).Add("e").Add("f").Add("g"); + 
UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b", "c", "d"})); + UNIT_ASSERT((keys3.Parts_ == TSortColumns{"a", "b", "c", "d", "e", "f", "g"})); + } + + Y_UNIT_TEST(SortColumn) + { + auto ascending = TSortColumn("a"); + UNIT_ASSERT_VALUES_EQUAL(ascending.Name(), "a"); + UNIT_ASSERT_VALUES_EQUAL(ascending.SortOrder(), ESortOrder::SO_ASCENDING); + UNIT_ASSERT_VALUES_EQUAL(ascending, TSortColumn("a", ESortOrder::SO_ASCENDING)); + UNIT_ASSERT_VALUES_UNEQUAL(ascending, TSortColumn("a", ESortOrder::SO_DESCENDING)); + + UNIT_ASSERT_NO_EXCEPTION(ascending.EnsureAscending()); + UNIT_ASSERT_VALUES_EQUAL(static_cast<TString>(ascending), "a"); + UNIT_ASSERT_VALUES_EQUAL(ascending, "a"); + + auto another = ascending; + UNIT_ASSERT_NO_EXCEPTION(another = "another"); + UNIT_ASSERT_VALUES_EQUAL(another.Name(), "another"); + UNIT_ASSERT_VALUES_EQUAL(another.SortOrder(), ESortOrder::SO_ASCENDING); + UNIT_ASSERT_VALUES_EQUAL(another, TSortColumn("another", ESortOrder::SO_ASCENDING)); + UNIT_ASSERT_VALUES_UNEQUAL(another, TSortColumn("another", ESortOrder::SO_DESCENDING)); + + auto ascendingNode = BuildYsonNodeFluently().Value(ascending); + UNIT_ASSERT_VALUES_EQUAL(ascendingNode, TNode("a")); + + UNIT_ASSERT_VALUES_EQUAL(SaveLoad(ascending), ascending); + UNIT_ASSERT_VALUES_UNEQUAL(SaveToString(ascending), SaveToString(TString("a"))); + + auto descending = TSortColumn("a", ESortOrder::SO_DESCENDING); + UNIT_ASSERT_VALUES_EQUAL(descending.Name(), "a"); + UNIT_ASSERT_VALUES_EQUAL(descending.SortOrder(), ESortOrder::SO_DESCENDING); + UNIT_ASSERT_VALUES_EQUAL(descending, TSortColumn("a", ESortOrder::SO_DESCENDING)); + UNIT_ASSERT_VALUES_UNEQUAL(descending, TSortColumn("a", ESortOrder::SO_ASCENDING)); + + UNIT_ASSERT_EXCEPTION(descending.EnsureAscending(), yexception); + UNIT_ASSERT_EXCEPTION(static_cast<TString>(descending), yexception); + UNIT_ASSERT_EXCEPTION(descending == "a", yexception); + UNIT_ASSERT_EXCEPTION(descending = "a", yexception); + + auto descendingNode = 
BuildYsonNodeFluently().Value(descending); + UNIT_ASSERT_VALUES_EQUAL(descendingNode, TNode()("name", "a")("sort_order", "descending")); + + UNIT_ASSERT_VALUES_EQUAL(SaveLoad(descending), descending); + UNIT_ASSERT_VALUES_UNEQUAL(SaveToString(descending), SaveToString("a")); + + UNIT_ASSERT_VALUES_EQUAL(ToString(TSortColumn("blah")), "blah"); + UNIT_ASSERT_VALUES_EQUAL(ToString(TSortColumn("blah", ESortOrder::SO_DESCENDING)), "{\"name\"=\"blah\";\"sort_order\"=\"descending\"}"); + } + + Y_UNIT_TEST(SortColumns) + { + TSortColumns ascending("a", "b"); + UNIT_ASSERT(ascending.Parts_ == (TSortColumns{"a", "b"})); + UNIT_ASSERT_NO_EXCEPTION(ascending.EnsureAscending()); + UNIT_ASSERT_VALUES_EQUAL(static_cast<TColumnNames>(ascending).Parts_, (TVector<TString>{"a", "b"})); + UNIT_ASSERT_VALUES_EQUAL(ascending.GetNames(), (TVector<TString>{"a", "b"})); + + auto mixed = ascending; + mixed.Add(TSortColumn("c", ESortOrder::SO_DESCENDING), "d"); + UNIT_ASSERT((mixed.Parts_ != TVector<TSortColumn>{"a", "b", "c", "d"})); + UNIT_ASSERT((mixed.Parts_ == TVector<TSortColumn>{"a", "b", TSortColumn("c", ESortOrder::SO_DESCENDING), "d"})); + UNIT_ASSERT_VALUES_EQUAL(mixed.GetNames(), (TVector<TString>{"a", "b", "c", "d"})); + UNIT_ASSERT_EXCEPTION(mixed.EnsureAscending(), yexception); + UNIT_ASSERT_EXCEPTION(static_cast<TColumnNames>(mixed), yexception); + } + + Y_UNIT_TEST(KeyBound) + { + auto keyBound = TKeyBound(ERelation::Greater, TKey(7, "a", TNode()("x", "y"))); + UNIT_ASSERT_VALUES_EQUAL(keyBound.Relation(), ERelation::Greater); + UNIT_ASSERT_EQUAL(keyBound.Key(), TKey(7, "a", TNode()("x", "y"))); + + auto keyBound1 = TKeyBound().Relation(ERelation::Greater).Key(TKey(7, "a", TNode()("x", "y"))); + auto expectedNode = TNode() + .Add(">") + .Add(TNode().Add(7).Add("a").Add(TNode()("x", "y"))); + + UNIT_ASSERT_VALUES_EQUAL(expectedNode, BuildYsonNodeFluently().Value(keyBound)); + UNIT_ASSERT_VALUES_EQUAL(expectedNode, BuildYsonNodeFluently().Value(keyBound1)); + + 
keyBound.Relation(ERelation::LessOrEqual); + keyBound.Key(TKey("A", 7)); + UNIT_ASSERT_VALUES_EQUAL(keyBound.Relation(), ERelation::LessOrEqual); + UNIT_ASSERT_EQUAL(keyBound.Key(), TKey("A", 7)); + + UNIT_ASSERT_VALUES_EQUAL( + BuildYsonNodeFluently().Value(keyBound), + TNode() + .Add("<=") + .Add(TNode().Add("A").Add(7))); + } + + Y_UNIT_TEST(TTableSchema) + { + TTableSchema schema; + schema + .AddColumn(TColumnSchema().Name("a").Type(EValueType::VT_STRING).SortOrder(SO_ASCENDING)) + .AddColumn(TColumnSchema().Name("b").Type(EValueType::VT_UINT64)) + .AddColumn(TColumnSchema().Name("c").Type(EValueType::VT_INT64)); + auto checkSortBy = [](TTableSchema schema, const TVector<TString>& columns) { + auto initialSchema = schema; + schema.SortBy(columns); + for (auto i: xrange(columns.size())) { + UNIT_ASSERT_VALUES_EQUAL(schema.Columns()[i].Name(), columns[i]); + UNIT_ASSERT_VALUES_EQUAL(schema.Columns()[i].SortOrder(), ESortOrder::SO_ASCENDING); + } + for (auto i: xrange(columns.size(), (size_t)initialSchema.Columns().size())) { + UNIT_ASSERT_VALUES_EQUAL(schema.Columns()[i].SortOrder(), Nothing()); + } + UNIT_ASSERT_VALUES_EQUAL(initialSchema.Columns().size(), schema.Columns().size()); + return schema; + }; + auto newSchema = checkSortBy(schema, {"b"}); + UNIT_ASSERT_VALUES_EQUAL(newSchema.Columns()[1].Name(), TString("a")); + UNIT_ASSERT_VALUES_EQUAL(newSchema.Columns()[2].Name(), TString("c")); + checkSortBy(schema, {"b", "c"}); + checkSortBy(schema, {"c", "a"}); + UNIT_ASSERT_EXCEPTION(checkSortBy(schema, {"b", "b"}), yexception); + UNIT_ASSERT_EXCEPTION(checkSortBy(schema, {"a", "junk"}), yexception); + } + + Y_UNIT_TEST(TColumnSchema_TypeV3) + { + { + auto column = TColumnSchema().Type(NTi::Interval()); + UNIT_ASSERT_VALUES_EQUAL(column.Required(), true); + UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_INTERVAL); + } + { + auto column = TColumnSchema().Type(NTi::Optional(NTi::Date())); + UNIT_ASSERT_VALUES_EQUAL(column.Required(), false); + 
UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_DATE); + } + { + auto column = TColumnSchema().Type(NTi::Null()); + UNIT_ASSERT_VALUES_EQUAL(column.Required(), false); + UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_NULL); + } + { + auto column = TColumnSchema().Type(NTi::Optional(NTi::Null())); + UNIT_ASSERT_VALUES_EQUAL(column.Required(), false); + UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_ANY); + } + } + + Y_UNIT_TEST(ToTypeV3) + { + UNIT_ASSERT_VALUES_EQUAL(*ToTypeV3(VT_INT32, true), *NTi::Int32()); + UNIT_ASSERT_VALUES_EQUAL(*ToTypeV3(VT_UTF8, false), *NTi::Optional(NTi::Utf8())); + } + + Y_UNIT_TEST(DeserializeColumn) + { + auto deserialize = [] (TStringBuf yson) { + auto node = NodeFromYsonString(yson); + TColumnSchema column; + Deserialize(column, node); + return column; + }; + + auto column = deserialize("{name=foo; type=int64; required=%false}"); + UNIT_ASSERT_VALUES_EQUAL(column.Name(), "foo"); + UNIT_ASSERT_VALUES_EQUAL(*column.TypeV3(), *NTi::Optional(NTi::Int64())); + + column = deserialize("{name=bar; type=utf8; required=%true; type_v3=utf8}"); + UNIT_ASSERT_VALUES_EQUAL(column.Name(), "bar"); + UNIT_ASSERT_VALUES_EQUAL(*column.TypeV3(), *NTi::Utf8()); + } + + Y_UNIT_TEST(ColumnSchemaEquality) + { + auto base = TColumnSchema() + .Name("col") + .TypeV3(NTi::Optional(NTi::List(NTi::String()))) + .SortOrder(ESortOrder::SO_ASCENDING) + .Lock("lock") + .Expression("x + 12") + .Aggregate("sum") + .Group("group"); + + auto other = base; + ASSERT_SERIALIZABLES_EQUAL(other, base); + other.Name("other"); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.TypeV3(NTi::List(NTi::String())); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.ResetSortOrder(); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.Lock("lock1"); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.Expression("x + 13"); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.ResetAggregate(); + 
ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.Group("group1"); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + } + + Y_UNIT_TEST(TableSchemaEquality) + { + auto col1 = TColumnSchema() + .Name("col1") + .TypeV3(NTi::Optional(NTi::List(NTi::String()))) + .SortOrder(ESortOrder::SO_ASCENDING); + + auto col2 = TColumnSchema() + .Name("col2") + .TypeV3(NTi::Uint32()); + + auto schema = TTableSchema() + .AddColumn(col1) + .AddColumn(col2) + .Strict(true) + .UniqueKeys(true); + + auto other = schema; + ASSERT_SERIALIZABLES_EQUAL(other, schema); + + other.Strict(false); + ASSERT_SERIALIZABLES_UNEQUAL(other, schema); + + other = schema; + other.MutableColumns()[0].TypeV3(NTi::List(NTi::String())); + ASSERT_SERIALIZABLES_UNEQUAL(other, schema); + + other = schema; + other.MutableColumns().push_back(col1); + ASSERT_SERIALIZABLES_UNEQUAL(other, schema); + + other = schema; + other.UniqueKeys(false); + ASSERT_SERIALIZABLES_UNEQUAL(other, schema); + } +} diff --git a/yt/cpp/mapreduce/interface/common_ut.h b/yt/cpp/mapreduce/interface/common_ut.h new file mode 100644 index 0000000000..6f70f09bee --- /dev/null +++ b/yt/cpp/mapreduce/interface/common_ut.h @@ -0,0 +1 @@ +#pragma once diff --git a/yt/cpp/mapreduce/interface/config.cpp b/yt/cpp/mapreduce/interface/config.cpp new file mode 100644 index 0000000000..b474dc0844 --- /dev/null +++ b/yt/cpp/mapreduce/interface/config.cpp @@ -0,0 +1,321 @@ +#include "config.h" + +#include "operation.h" + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <library/cpp/json/json_reader.h> +#include <library/cpp/svnversion/svnversion.h> + +#include <library/cpp/yson/node/node_builder.h> +#include <library/cpp/yson/node/node_io.h> + +#include <library/cpp/yson/json/yson2json_adapter.h> + +#include <util/string/strip.h> +#include <util/folder/dirut.h> +#include <util/folder/path.h> +#include <util/stream/file.h> +#include <util/generic/singleton.h> +#include <util/string/builder.h> +#include 
<util/string/cast.h> +#include <util/string/type.h> +#include <util/system/hostname.h> +#include <util/system/user.h> +#include <util/system/env.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +bool TConfig::GetBool(const char* var, bool defaultValue) +{ + TString val = GetEnv(var, ""); + if (val.empty()) { + return defaultValue; + } + return IsTrue(val); +} + +int TConfig::GetInt(const char* var, int defaultValue) +{ + int result = 0; + TString val = GetEnv(var, ""); + if (val.empty()) { + return defaultValue; + } + try { + result = FromString<int>(val); + } catch (const yexception& e) { + ythrow yexception() << "Cannot parse " << var << '=' << val << " as integer: " << e.what(); + } + return result; +} + +TDuration TConfig::GetDuration(const char* var, TDuration defaultValue) +{ + return TDuration::Seconds(GetInt(var, defaultValue.Seconds())); +} + +EEncoding TConfig::GetEncoding(const char* var) +{ + const TString encodingName = GetEnv(var, "identity"); + EEncoding encoding; + if (TryFromString(encodingName, encoding)) { + return encoding; + } else { + ythrow yexception() << var << ": encoding '" << encodingName << "' is not supported"; + } +} + + EUploadDeduplicationMode TConfig::GetUploadingDeduplicationMode( + const char* var, + EUploadDeduplicationMode defaultValue) +{ + const TString deduplicationMode = GetEnv(var, TEnumTraits<EUploadDeduplicationMode>::ToString(defaultValue)); + return TEnumTraits<EUploadDeduplicationMode>::FromString(deduplicationMode); +} + +void TConfig::ValidateToken(const TString& token) +{ + for (size_t i = 0; i < token.size(); ++i) { + ui8 ch = token[i]; + if (ch < 0x21 || ch > 0x7e) { + ythrow yexception() << "Incorrect token character '" << ch << "' at position " << i; + } + } +} + +TString TConfig::LoadTokenFromFile(const TString& tokenPath) +{ + TFsPath path(tokenPath); + return path.IsFile() ? 
Strip(TIFStream(path).ReadAll()) : TString(); +} + +TNode TConfig::LoadJsonSpec(const TString& strSpec) +{ + TNode spec; + TStringInput input(strSpec); + TNodeBuilder builder(&spec); + TYson2JsonCallbacksAdapter callbacks(&builder); + + Y_ENSURE(NJson::ReadJson(&input, &callbacks), "Cannot parse json spec: " << strSpec); + Y_ENSURE(spec.IsMap(), "Json spec is not a map"); + + return spec; +} + +TRichYPath TConfig::LoadApiFilePathOptions(const TString& ysonMap) +{ + TNode attributes; + try { + attributes = NodeFromYsonString(ysonMap); + } catch (const yexception& exc) { + ythrow yexception() << "Failed to parse YT_API_FILE_PATH_OPTIONS (it must be yson map): " << exc; + } + TNode pathNode = ""; + pathNode.Attributes() = attributes; + TRichYPath path; + Deserialize(path, pathNode); + return path; +} + +void TConfig::LoadToken() +{ + if (auto envToken = GetEnv("YT_TOKEN")) { + Token = envToken; + } else if (auto envToken = GetEnv("YT_SECURE_VAULT_YT_TOKEN")) { + // If this code runs inside a vanilla operation in YT + // it should not use regular environment variable `YT_TOKEN` + // because it would be visible in UI. + // Token should be passed via `secure_vault` parameter in operation spec. 
+ Token = envToken; + } else if (auto tokenPath = GetEnv("YT_TOKEN_PATH")) { + Token = LoadTokenFromFile(tokenPath); + } else { + Token = LoadTokenFromFile(GetHomeDir() + "/.yt/token"); + } + ValidateToken(Token); +} + +void TConfig::LoadSpec() +{ + TString strSpec = GetEnv("YT_SPEC", "{}"); + Spec = LoadJsonSpec(strSpec); + + strSpec = GetEnv("YT_TABLE_WRITER", "{}"); + TableWriter = LoadJsonSpec(strSpec); +} + +void TConfig::LoadTimings() +{ + ConnectTimeout = GetDuration("YT_CONNECT_TIMEOUT", + TDuration::Seconds(10)); + + SocketTimeout = GetDuration("YT_SOCKET_TIMEOUT", + GetDuration("YT_SEND_RECEIVE_TIMEOUT", // common + TDuration::Seconds(60))); + + AddressCacheExpirationTimeout = TDuration::Minutes(15); + + CacheLockTimeoutPerGb = TDuration::MilliSeconds(1000.0 * 1_GB * 8 / 20_MB); // 20 Mbps = 20 MBps / 8. + + TxTimeout = GetDuration("YT_TX_TIMEOUT", + TDuration::Seconds(120)); + + PingTimeout = GetDuration("YT_PING_TIMEOUT", + TDuration::Seconds(5)); + + PingInterval = GetDuration("YT_PING_INTERVAL", + TDuration::Seconds(5)); + + WaitLockPollInterval = TDuration::Seconds(5); + + RetryInterval = GetDuration("YT_RETRY_INTERVAL", + TDuration::Seconds(3)); + + ChunkErrorsRetryInterval = GetDuration("YT_CHUNK_ERRORS_RETRY_INTERVAL", + TDuration::Seconds(60)); + + RateLimitExceededRetryInterval = GetDuration("YT_RATE_LIMIT_EXCEEDED_RETRY_INTERVAL", + TDuration::Seconds(60)); + + StartOperationRetryInterval = GetDuration("YT_START_OPERATION_RETRY_INTERVAL", + TDuration::Seconds(60)); + + HostListUpdateInterval = TDuration::Seconds(60); +} + +void TConfig::Reset() +{ + Hosts = GetEnv("YT_HOSTS", "hosts"); + Pool = GetEnv("YT_POOL"); + Prefix = GetEnv("YT_PREFIX"); + ApiVersion = GetEnv("YT_VERSION", "v3"); + LogLevel = GetEnv("YT_LOG_LEVEL", "error"); + + ContentEncoding = GetEncoding("YT_CONTENT_ENCODING"); + AcceptEncoding = GetEncoding("YT_ACCEPT_ENCODING"); + + GlobalTxId = GetEnv("YT_TRANSACTION", ""); + + UseAsyncTxPinger = false; + AsyncHttpClientThreads = 
1; + AsyncTxPingerPoolThreads = 1; + + ForceIpV4 = GetBool("YT_FORCE_IPV4"); + ForceIpV6 = GetBool("YT_FORCE_IPV6"); + UseHosts = GetBool("YT_USE_HOSTS", true); + + LoadToken(); + LoadSpec(); + LoadTimings(); + + CacheUploadDeduplicationMode = GetUploadingDeduplicationMode("YT_UPLOAD_DEDUPLICATION", EUploadDeduplicationMode::Host); + + RetryCount = Max(GetInt("YT_RETRY_COUNT", 10), 1); + ReadRetryCount = Max(GetInt("YT_READ_RETRY_COUNT", 30), 1); + StartOperationRetryCount = Max(GetInt("YT_START_OPERATION_RETRY_COUNT", 30), 1); + + RemoteTempFilesDirectory = GetEnv("YT_FILE_STORAGE", + "//tmp/yt_wrapper/file_storage"); + RemoteTempTablesDirectory = GetEnv("YT_TEMP_TABLES_STORAGE", + "//tmp/yt_wrapper/table_storage"); + RemoteTempTablesDirectory = GetEnv("YT_TEMP_DIR", + RemoteTempTablesDirectory); + + InferTableSchema = false; + + UseClientProtobuf = GetBool("YT_USE_CLIENT_PROTOBUF", false); + NodeReaderFormat = ENodeReaderFormat::Auto; + ProtobufFormatWithDescriptors = true; + + MountSandboxInTmpfs = GetBool("YT_MOUNT_SANDBOX_IN_TMPFS"); + + ApiFilePathOptions = LoadApiFilePathOptions(GetEnv("YT_API_FILE_PATH_OPTIONS", "{}")); + + ConnectionPoolSize = GetInt("YT_CONNECTION_POOL_SIZE", 16); + + TraceHttpRequestsMode = FromString<ETraceHttpRequestsMode>(to_lower(GetEnv("YT_TRACE_HTTP_REQUESTS", "never"))); + + CommandsWithFraming = { + "read_table", + "get_table_columnar_statistics", + "get_job_input", + "concatenate", + "partition_tables", + }; +} + +TConfig::TConfig() +{ + Reset(); +} + +TConfigPtr TConfig::Get() +{ + struct TConfigHolder + { + TConfigHolder() + : Config(::MakeIntrusive<TConfig>()) + { } + + TConfigPtr Config; + }; + + return Singleton<TConfigHolder>()->Config; +} + +//////////////////////////////////////////////////////////////////////////////// + +TProcessState::TProcessState() +{ + try { + FqdnHostName = ::FQDNHostName(); + } catch (const yexception& e) { + try { + FqdnHostName = ::HostName(); + } catch (const yexception& e) { + ythrow 
yexception() << "Cannot get fqdn and host name: " << e.what(); + } + } + + try { + UserName = ::GetUsername(); + } catch (const yexception& e) { + ythrow yexception() << "Cannot get user name: " << e.what(); + } + + Pid = static_cast<int>(getpid()); + + if (!ClientVersion) { + ClientVersion = ::TStringBuilder() << "YT C++ native " << GetProgramCommitId(); + } +} + +static TString CensorString(TString input) +{ + static const TString prefix = "AQAD-"; + if (input.find(prefix) == TString::npos) { + return input; + } else { + return TString(input.size(), '*'); + } +} + +void TProcessState::SetCommandLine(int argc, const char* argv[]) +{ + for (int i = 0; i < argc; ++i) { + CommandLine.push_back(argv[i]); + CensoredCommandLine.push_back(CensorString(CommandLine.back())); + } +} + +TProcessState* TProcessState::Get() +{ + return Singleton<TProcessState>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/config.h b/yt/cpp/mapreduce/interface/config.h new file mode 100644 index 0000000000..c44ad25f1c --- /dev/null +++ b/yt/cpp/mapreduce/interface/config.h @@ -0,0 +1,228 @@ +#pragma once + +#include "fwd.h" +#include "common.h" +#include "node.h" + +#include <library/cpp/yt/misc/enum.h> + +#include <util/generic/maybe.h> +#include <util/generic/string.h> +#include <util/generic/hash_set.h> + +#include <util/datetime/base.h> + +namespace NYT { + +enum EEncoding : int +{ + E_IDENTITY /* "identity" */, + E_GZIP /* "gzip" */, + E_BROTLI /* "br" */, + E_Z_LZ4 /* "z-lz4" */, +}; + +enum class ENodeReaderFormat : int +{ + Yson, // Always use YSON format, + Skiff, // Always use Skiff format, throw exception if it's not possible (non-strict schema, dynamic table etc.) + Auto, // Use Skiff format if it's possible, YSON otherwise +}; + +enum class ETraceHttpRequestsMode +{ + // Never dump http requests. + Never /* "never" */, + // Dump failed http requests. 
+ Error /* "error" */, + // Dump all http requests. + Always /* "always" */, +}; + +DEFINE_ENUM(EUploadDeduplicationMode, + // For each file only one process' thread from all possible hosts can upload it to the file cache at the same time. + // The others will wait for the uploading to finish and use already cached file. + ((Global) (0)) + + // For each file and each particular host only one process' thread can upload it to the file cache at the same time. + // The others will wait for the uploading to finish and use already cached file. + ((Host) (1)) + + // All processes' threads will upload a file to the cache concurrently. + ((Disabled) (2)) +); + +//////////////////////////////////////////////////////////////////////////////// + +struct TConfig + : public TThrRefBase +{ + TString Hosts; + TString Pool; + TString Token; + TString Prefix; + TString ApiVersion; + TString LogLevel; + + // Compression for data that is sent to YT cluster. + EEncoding ContentEncoding; + + // Compression for data that is read from YT cluster. + EEncoding AcceptEncoding; + + TString GlobalTxId; + + bool ForceIpV4; + bool ForceIpV6; + bool UseHosts; + + TDuration HostListUpdateInterval; + + TNode Spec; + TNode TableWriter; + + TDuration ConnectTimeout; + TDuration SocketTimeout; + TDuration AddressCacheExpirationTimeout; + TDuration TxTimeout; + TDuration PingTimeout; + TDuration PingInterval; + + bool UseAsyncTxPinger; + int AsyncHttpClientThreads; + int AsyncTxPingerPoolThreads; + + // How often should we poll for lock state + TDuration WaitLockPollInterval; + + TDuration RetryInterval; + TDuration ChunkErrorsRetryInterval; + + TDuration RateLimitExceededRetryInterval; + TDuration StartOperationRetryInterval; + + int RetryCount; + int ReadRetryCount; + int StartOperationRetryCount; + + /// @brief Period for checking status of running operation. 
+ TDuration OperationTrackerPollPeriod = TDuration::Seconds(5); + + TString RemoteTempFilesDirectory; + TString RemoteTempTablesDirectory; + + // + // Infer schemas for nonexistent tables from typed rows (e.g. protobuf) + // when writing from operation or client writer. + // This option can be overridden in TOperationOptions and TTableWriterOptions. + bool InferTableSchema; + + bool UseClientProtobuf; + ENodeReaderFormat NodeReaderFormat; + bool ProtobufFormatWithDescriptors; + + int ConnectionPoolSize; + + /// Defines replication factor that is used for files that are uploaded to YT + /// to use them in operations. + int FileCacheReplicationFactor = 10; + + /// @brief Used when waiting for other process which uploads the same file to the file cache. + /// + /// If CacheUploadDeduplicationMode is not Disabled, current process can wait for some other + /// process which is uploading the same file. This value is proportional to the timeout of waiting, + /// the actual timeout is computed as follows: fileSizeGb * CacheLockTimeoutPerGb. + /// Default timeout assumes that host has uploading speed equal to 20 Mb/s. + /// If timeout was reached, the file will be uploaded by current process without any other waits. + TDuration CacheLockTimeoutPerGb; + + /// @brief Used to prevent concurrent uploading of the same file to the file cache. + /// NB: Each mode affects only users with the same mode enabled. + EUploadDeduplicationMode CacheUploadDeduplicationMode; + + bool MountSandboxInTmpfs; + + /// @brief Set upload options (e.g.) for files created by library. + /// + /// Path itself is always ignored but path options (e.g. `BypassArtifactCache`) are used when uploading system files: + /// cppbinary, job state, etc + TRichYPath ApiFilePathOptions; + + // Testing options, should never be used in user programs. 
+ bool UseAbortableResponse = false; + bool EnableDebugMetrics = false; + + // + // There is optimization used with local YT that enables to skip binary upload and use real binary path. + // When EnableLocalModeOptimization is set to false this optimization is completely disabled. + bool EnableLocalModeOptimization = true; + + // + // If you want to see stderr even if your jobs have not failed, set this to true. + bool WriteStderrSuccessfulJobs = false; + + // + // This configuration is useful for debug. + // If set to ETraceHttpRequestsMode::Error library will dump all http error requests. + // If set to ETraceHttpRequestsMode::Always library will dump all http requests. + // All tracing occurs as DEBUG level logging. + ETraceHttpRequestsMode TraceHttpRequestsMode = ETraceHttpRequestsMode::Never; + + TString SkynetApiHost; + + // Sets SO_PRIORITY option on the socket + TMaybe<int> SocketPriority; + + // Framing settings + // (cf. https://yt.yandex-team.ru/docs/description/proxy/http_proxy_reference#framing). 
+ THashSet<TString> CommandsWithFraming; + + static bool GetBool(const char* var, bool defaultValue = false); + static int GetInt(const char* var, int defaultValue); + static TDuration GetDuration(const char* var, TDuration defaultValue); + static EEncoding GetEncoding(const char* var); + static EUploadDeduplicationMode GetUploadingDeduplicationMode( + const char* var, + EUploadDeduplicationMode defaultValue); + + static void ValidateToken(const TString& token); + static TString LoadTokenFromFile(const TString& tokenPath); + + static TNode LoadJsonSpec(const TString& strSpec); + + static TRichYPath LoadApiFilePathOptions(const TString& ysonMap); + + void LoadToken(); + void LoadSpec(); + void LoadTimings(); + + void Reset(); + + TConfig(); + + static TConfigPtr Get(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct TProcessState +{ + TString FqdnHostName; + TString UserName; + TVector<TString> CommandLine; + + // Command line with everything that looks like tokens censored. 
+ TVector<TString> CensoredCommandLine; + int Pid; + TString ClientVersion; + + TProcessState(); + + void SetCommandLine(int argc, const char* argv[]); + + static TProcessState* Get(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/config_ut.cpp b/yt/cpp/mapreduce/interface/config_ut.cpp new file mode 100644 index 0000000000..e49ba02108 --- /dev/null +++ b/yt/cpp/mapreduce/interface/config_ut.cpp @@ -0,0 +1,20 @@ +#include <library/cpp/testing/unittest/registar.h> + +#include <yt/cpp/mapreduce/interface/config.h> + +using namespace NYT; + +Y_UNIT_TEST_SUITE(ConfigSuite) +{ + Y_UNIT_TEST(TestReset) { + // very limited test, checks only one config field + + auto origConfig = *TConfig::Get(); + TConfig::Get()->Reset(); + UNIT_ASSERT_VALUES_EQUAL(origConfig.Hosts, TConfig::Get()->Hosts); + + TConfig::Get()->Hosts = "hosts/fb867"; + TConfig::Get()->Reset(); + UNIT_ASSERT_VALUES_EQUAL(origConfig.Hosts, TConfig::Get()->Hosts); + } +} diff --git a/yt/cpp/mapreduce/interface/constants.h b/yt/cpp/mapreduce/interface/constants.h new file mode 100644 index 0000000000..4f70410814 --- /dev/null +++ b/yt/cpp/mapreduce/interface/constants.h @@ -0,0 +1,19 @@ +#pragma once + + +#include <util/system/defaults.h> + + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + + +// Maximum number of input tables for operation. +// If greater number of input tables are provided behaviour is undefined +// (it might work ok or it might fail or it might work very slowly). 
+constexpr size_t MaxInputTableCount = 1000; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/cypress.cpp b/yt/cpp/mapreduce/interface/cypress.cpp new file mode 100644 index 0000000000..53686effd2 --- /dev/null +++ b/yt/cpp/mapreduce/interface/cypress.cpp @@ -0,0 +1,24 @@ +#include "cypress.h" + +#include "config.h" + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +void ICypressClient::Concatenate( + const TVector<TYPath>& sourcePaths, + const TYPath& destinationPath, + const TConcatenateOptions& options) +{ + TVector<TRichYPath> richSourcePaths; + richSourcePaths.reserve(sourcePaths.size()); + for (const auto& path : sourcePaths) { + richSourcePaths.emplace_back(path); + } + Concatenate(richSourcePaths, destinationPath, options); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/cypress.h b/yt/cpp/mapreduce/interface/cypress.h new file mode 100644 index 0000000000..e05316ebc6 --- /dev/null +++ b/yt/cpp/mapreduce/interface/cypress.h @@ -0,0 +1,252 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/cypress.h +/// +/// Header containing interface to execute [Cypress](https://yt.yandex-team.ru/docs/description/common/cypress.html)-related commands. + +#include "fwd.h" + +#include "client_method_options.h" +#include "common.h" +#include "node.h" + +#include <util/generic/maybe.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// Client interface to execute [Cypress](https://yt.yandex-team.ru/docs/description/common/cypress.html)-related commands. +class ICypressClient +{ +public: + virtual ~ICypressClient() = default; + + /// + /// @brief Create Cypress node of given type. + /// + /// @param path Path in Cypress to the new object. 
+ /// @param type New node type. + /// @param options Optional parameters. + /// + /// @return Id of the created node. + /// + /// @note All but the last components must exist unless @ref NYT::TCreateOptions::Recursive is `true`. + /// + /// @note The node itself must not exist unless @ref NYT::TCreateOptions::IgnoreExisting or @ref NYT::TCreateOptions::Force are `true`. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#create) + virtual TNodeId Create( + const TYPath& path, + ENodeType type, + const TCreateOptions& options = TCreateOptions()) = 0; + + /// + /// @brief Create table with schema inferred from the template argument. + /// + /// @tparam TRowType type of C++ representation of the row to be stored in the table. + /// @param path Path in Cypress to the new table. + /// @param sortColumns List of columns to mark as sorted in schema. + /// @param options Optional parameters. + /// + /// @return Id of the created node. + /// + /// @note If "schema" is passed in `options.Attributes` it has priority over the deduced schema (the latter is ignored). + template <typename TRowType> + TNodeId CreateTable( + const TYPath& path, + const TSortColumns& sortColumns = TSortColumns(), + const TCreateOptions& options = TCreateOptions()); + + /// + /// @brief Remove Cypress node. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#remove) + virtual void Remove( + const TYPath& path, + const TRemoveOptions& options = TRemoveOptions()) = 0; + + /// + /// @brief Check if Cypress node exists. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#exists) + virtual bool Exists( + const TYPath& path, + const TExistsOptions& options = TExistsOptions()) = 0; + + /// + /// @brief Get Cypress node contents. 
+ /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get) + virtual TNode Get( + const TYPath& path, + const TGetOptions& options = TGetOptions()) = 0; + + /// + /// @brief Set Cypress node contents. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#set) + virtual void Set( + const TYPath& path, + const TNode& value, + const TSetOptions& options = TSetOptions()) = 0; + + /// + /// @brief Set multiple attributes for cypress path. + /// + /// @param path Path to root of the attributes to be set e.g. "//path/to/table/@"; + /// it is important to make sure that path ends with "/@". + /// @param attributes Map with attributes + /// @param options Optional parameters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#multiset_attributes) + virtual void MultisetAttributes( + const TYPath& path, + const TNode::TMapType& attributes, + const TMultisetAttributesOptions& options = TMultisetAttributesOptions()) = 0; + + /// + /// @brief List Cypress map or attribute node keys. + /// + /// @param path Path in the tree to the node in question. + /// @param options Optional parameters. + /// + /// @return List of keys with attributes (if they were required in @ref NYT::TListOptions::AttributeFilter). + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#list) + virtual TNode::TListType List( + const TYPath& path, + const TListOptions& options = TListOptions()) = 0; + + /// + /// @brief Copy Cypress node. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#copy) + virtual TNodeId Copy( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options = TCopyOptions()) = 0; + + /// + /// @brief Move Cypress node (equivalent to copy-then-remove). 
+ /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#move) + virtual TNodeId Move( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options = TMoveOptions()) = 0; + + /// + /// @brief Create link to Cypress node. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#link) + virtual TNodeId Link( + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options = TLinkOptions()) = 0; + + /// + /// @brief Concatenate several tables into one. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#concatenate) + virtual void Concatenate( + const TVector<TRichYPath>& sourcePaths, + const TRichYPath& destinationPath, + const TConcatenateOptions& options = TConcatenateOptions()) = 0; + + /// + /// @brief Concatenate several tables into one. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#concatenate) + virtual void Concatenate( + const TVector<TYPath>& sourcePaths, + const TYPath& destinationPath, + const TConcatenateOptions& options = TConcatenateOptions()); + + /// + /// @brief Canonize YPath, moving all the complex YPath features to attributes. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#parse-ypath) + virtual TRichYPath CanonizeYPath(const TRichYPath& path) = 0; + + /// + /// @brief Get statistics for given sets of columns in given table ranges. + /// + /// @note Paths must contain column selectors. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-table-columnar-statistics) + virtual TVector<TTableColumnarStatistics> GetTableColumnarStatistics( + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options = {}) = 0; + + /// + /// @brief Divide input tables into disjoint partitions. + /// + /// Resulted partitions are vectors of rich YPaths. + /// Each partition can be given to a separate worker for further independent processing. 
+ /// + virtual TMultiTablePartitions GetTablePartitions( + const TVector<TRichYPath>& paths, + const TGetTablePartitionsOptions& options) = 0; + + /// + /// @brief Get file from file cache. + /// + /// @param md5Signature MD5 digest of the file. + /// @param cachePath Path to the file cache. + /// @param options Optional parameters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-file-from-cache) + virtual TMaybe<TYPath> GetFileFromCache( + const TString& md5Signature, + const TYPath& cachePath, + const TGetFileFromCacheOptions& options = TGetFileFromCacheOptions()) = 0; + + /// + /// @brief Put file to file cache. + /// + /// @param filePath Path in Cypress to the file to cache. + /// @param md5Signature Expected MD5 digest of the file. + /// @param cachePath Path to the file cache. + /// @param options Optional parameters. + /// + /// @note The file in `filePath` must have been written with @ref NYT::TFileWriterOptions::ComputeMD5 set to `true`. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#put-file-to-cache) + virtual TYPath PutFileToCache( + const TYPath& filePath, + const TString& md5Signature, + const TYPath& cachePath, + const TPutFileToCacheOptions& options = TPutFileToCacheOptions()) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TRowType> +TNodeId ICypressClient::CreateTable( + const TYPath& path, + const TSortColumns& sortColumns, + const TCreateOptions& options) +{ + static_assert( + std::is_base_of_v<::google::protobuf::Message, TRowType>, + "TRowType must be inherited from google::protobuf::Message"); + + TCreateOptions actualOptions = options; + if (!actualOptions.Attributes_) { + actualOptions.Attributes_ = TNode::CreateMap(); + } + + if (!actualOptions.Attributes_->HasKey("schema")) { + actualOptions.Attributes_->AsMap().emplace( + "schema", + CreateTableSchema<TRowType>(sortColumns).ToNode()); + } + + return Create(path, 
ENodeType::NT_TABLE, actualOptions); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/error_codes.h b/yt/cpp/mapreduce/interface/error_codes.h new file mode 100644 index 0000000000..d8d76e04fd --- /dev/null +++ b/yt/cpp/mapreduce/interface/error_codes.h @@ -0,0 +1,468 @@ +#pragma once + +// +// generated by generate-error-codes.py +// + +namespace NYT { +namespace NClusterErrorCodes { + + + +// from ./core/misc/public.h + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int OK = 0; + constexpr int Generic = 1; + constexpr int Canceled = 2; + constexpr int Timeout = 3; + +//////////////////////////////////////////////////////////////////////////////// + + + + +// from ./core/rpc/public.h +namespace NRpc { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int TransportError = 100; + constexpr int ProtocolError = 101; + constexpr int NoSuchService = 102; + constexpr int NoSuchMethod = 103; + constexpr int Unavailable = 105; + constexpr int PoisonPill = 106; + constexpr int RequestQueueSizeLimitExceeded = 108; + constexpr int AuthenticationError = 109; + constexpr int InvalidCsrfToken = 110; + constexpr int InvalidCredentials = 111; + constexpr int StreamingNotSupported = 112; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NRpc + + + +// from ./core/bus/public.h +namespace NBus { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int TransportError = 100; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NBus + + + +// from ./client/scheduler/public.h +namespace NScheduler { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int NoSuchOperation = 200; + constexpr int 
InvalidOperationState = 201; + constexpr int TooManyOperations = 202; + constexpr int NoSuchJob = 203; + constexpr int OperationFailedOnJobRestart = 210; + constexpr int OperationFailedWithInconsistentLocking = 211; + constexpr int OperationControllerCrashed = 212; + constexpr int TestingError = 213; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NScheduler + + + +// from ./client/table_client/public.h +namespace NTableClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int SortOrderViolation = 301; + constexpr int InvalidDoubleValue = 302; + constexpr int IncomparableType = 303; + constexpr int UnhashableType = 304; + // E.g. name table with more than #MaxColumnId columns (may come from legacy chunks). + constexpr int CorruptedNameTable = 305; + constexpr int UniqueKeyViolation = 306; + constexpr int SchemaViolation = 307; + constexpr int RowWeightLimitExceeded = 308; + constexpr int InvalidColumnFilter = 309; + constexpr int InvalidColumnRenaming = 310; + constexpr int IncompatibleKeyColumns = 311; + constexpr int ReaderDeadlineExpired = 312; + constexpr int TimestampOutOfRange = 313; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NTableClient + + + +// from ./client/cypress_client/public.h +namespace NCypressClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int SameTransactionLockConflict = 400; + constexpr int DescendantTransactionLockConflict = 401; + constexpr int ConcurrentTransactionLockConflict = 402; + constexpr int PendingLockConflict = 403; + constexpr int LockDestroyed = 404; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NCypressClient + + + +// from ./core/ytree/public.h +namespace NYTree { + +//////////////////////////////////////////////////////////////////////////////// + + 
constexpr int ResolveError = 500; + constexpr int AlreadyExists = 501; + constexpr int MaxChildCountViolation = 502; + constexpr int MaxStringLengthViolation = 503; + constexpr int MaxAttributeSizeViolation = 504; + constexpr int MaxKeyLengthViolation = 505; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYTree + + + +// from ./client/hydra/public.h +namespace NHydra { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int NoSuchSnapshot = 600; + constexpr int NoSuchChangelog = 601; + constexpr int InvalidEpoch = 602; + constexpr int InvalidVersion = 603; + constexpr int OutOfOrderMutations = 609; + constexpr int InvalidSnapshotVersion = 610; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NHydra + + + +// from ./client/chunk_client/public.h +namespace NChunkClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int AllTargetNodesFailed = 700; + constexpr int SendBlocksFailed = 701; + constexpr int NoSuchSession = 702; + constexpr int SessionAlreadyExists = 703; + constexpr int ChunkAlreadyExists = 704; + constexpr int WindowError = 705; + constexpr int BlockContentMismatch = 706; + constexpr int NoSuchBlock = 707; + constexpr int NoSuchChunk = 708; + constexpr int NoLocationAvailable = 710; + constexpr int IOError = 711; + constexpr int MasterCommunicationFailed = 712; + constexpr int NoSuchChunkTree = 713; + constexpr int MasterNotConnected = 714; + constexpr int ChunkUnavailable = 716; + constexpr int NoSuchChunkList = 717; + constexpr int WriteThrottlingActive = 718; + constexpr int NoSuchMedium = 719; + constexpr int OptimisticLockFailure = 720; + constexpr int InvalidBlockChecksum = 721; + constexpr int BlockOutOfRange = 722; + constexpr int ObjectNotReplicated = 723; + constexpr int MissingExtension = 724; + constexpr int BandwidthThrottlingFailed = 725; 
+ constexpr int ReaderTimeout = 726; + constexpr int NoSuchChunkView = 727; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NChunkClient + + + +// from ./client/election/public.h +namespace NElection { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int InvalidState = 800; + constexpr int InvalidLeader = 801; + constexpr int InvalidEpoch = 802; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NElection + + + +// from ./client/security_client/public.h +namespace NSecurityClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int AuthenticationError = 900; + constexpr int AuthorizationError = 901; + constexpr int AccountLimitExceeded = 902; + constexpr int UserBanned = 903; + constexpr int RequestQueueSizeLimitExceeded = 904; + constexpr int NoSuchAccount = 905; + constexpr int SafeModeEnabled = 906; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NSecurityClient + + + +// from ./client/object_client/public.h +namespace NObjectClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int PrerequisiteCheckFailed = 1000; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NObjectClient + + + +// from ./server/lib/exec_agent/public.h +namespace NExecAgent { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int ConfigCreationFailed = 1100; + constexpr int AbortByScheduler = 1101; + constexpr int ResourceOverdraft = 1102; + constexpr int WaitingJobTimeout = 1103; + constexpr int SlotNotFound = 1104; + constexpr int JobEnvironmentDisabled = 1105; + constexpr int JobProxyConnectionFailed = 1106; + constexpr int ArtifactCopyingFailed = 1107; + constexpr int 
NodeDirectoryPreparationFailed = 1108; + constexpr int SlotLocationDisabled = 1109; + constexpr int QuotaSettingFailed = 1110; + constexpr int RootVolumePreparationFailed = 1111; + constexpr int NotEnoughDiskSpace = 1112; + constexpr int ArtifactDownloadFailed = 1113; + constexpr int JobProxyPreparationTimeout = 1114; + constexpr int JobPreparationTimeout = 1115; + constexpr int JobProxyFailed = 1120; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NExecAgent + + + +// from ./ytlib/job_proxy/public.h +namespace NJobProxy { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int MemoryLimitExceeded = 1200; + constexpr int MemoryCheckFailed = 1201; + constexpr int JobTimeLimitExceeded = 1202; + constexpr int UnsupportedJobType = 1203; + constexpr int JobNotPrepared = 1204; + constexpr int UserJobFailed = 1205; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NJobProxy + + + +// from ./server/node/data_node/public.h +namespace NDataNode { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int LocalChunkReaderFailed = 1300; + constexpr int LayerUnpackingFailed = 1301; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDataNode + + + +// from ./core/net/public.h +namespace NNet { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int Aborted = 1500; + constexpr int ResolveTimedOut = 1501; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NNet + + + +// from ./client/node_tracker_client/public.h +namespace NNodeTrackerClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int NoSuchNode = 1600; + constexpr int InvalidState = 1601; + constexpr int NoSuchNetwork = 1602; + 
constexpr int NoSuchRack = 1603; + constexpr int NoSuchDataCenter = 1604; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NNodeTrackerClient + + + +// from ./client/tablet_client/public.h +namespace NTabletClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int TransactionLockConflict = 1700; + constexpr int NoSuchTablet = 1701; + constexpr int TabletNotMounted = 1702; + constexpr int AllWritesDisabled = 1703; + constexpr int InvalidMountRevision = 1704; + constexpr int TableReplicaAlreadyExists = 1705; + constexpr int InvalidTabletState = 1706; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NTabletClient + + + +// from ./server/lib/shell/public.h +namespace NShell { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int ShellExited = 1800; + constexpr int ShellManagerShutDown = 1801; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NShell + + + +// from ./client/api/public.h +namespace NApi { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int TooManyConcurrentRequests = 1900; + constexpr int JobArchiveUnavailable = 1910; + constexpr int RetriableArchiveError = 1911; + constexpr int NoSuchOperation = 1915; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NApi + + + +// from ./server/controller_agent/chunk_pools/public.h +namespace NChunkPools { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int DataSliceLimitExceeded = 2000; + constexpr int MaxDataWeightPerJobExceeded = 2001; + constexpr int MaxPrimaryDataWeightPerJobExceeded = 2002; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NChunkPools + + + 
+// from ./client/api/rpc_proxy/public.h +namespace NApi { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int ProxyBanned = 2100; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NApi + + + +// from ./ytlib/controller_agent/public.h +namespace NControllerAgent { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int AgentCallFailed = 4400; + constexpr int NoOnlineNodeToScheduleJob = 4410; + constexpr int MaterializationFailed = 4415; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NControllerAgent + + + +// from ./client/transaction_client/public.h +namespace NTransactionClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int NoSuchTransaction = 11000; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NTransactionClient + + + +// from ./server/lib/containers/public.h +namespace NContainers { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int FailedToStartContainer = 13000; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NContainers + + + +// from ./ytlib/job_prober_client/public.h +namespace NJobProberClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int JobIsNotRunning = 17000; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NJobProberClient + +} // namespace NClusterErrorCodes +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/error_ut.cpp b/yt/cpp/mapreduce/interface/error_ut.cpp new file mode 100644 index 0000000000..03f2751b23 --- /dev/null +++ b/yt/cpp/mapreduce/interface/error_ut.cpp @@ -0,0 +1,81 @@ +#include 
<library/cpp/testing/unittest/registar.h> + +#include <library/cpp/json/json_reader.h> + +#include <yt/cpp/mapreduce/interface/errors.h> +#include <yt/cpp/mapreduce/common/helpers.h> + +using namespace NYT; + +template<> +void Out<NYT::TNode>(IOutputStream& s, const NYT::TNode& node) +{ + s << "TNode:" << NodeToYsonString(node); +} + +Y_UNIT_TEST_SUITE(ErrorSuite) +{ + Y_UNIT_TEST(TestParseJson) + { + // Scary real world error! Бу! + const char* jsonText = + R"""({)""" + R"""("code":500,)""" + R"""("message":"Error resolving path //home/user/link",)""" + R"""("attributes":{)""" + R"""("fid":18446484571700269066,)""" + R"""("method":"Create",)""" + R"""("tid":17558639495721339338,)""" + R"""("datetime":"2017-04-07T13:38:56.474819Z",)""" + R"""("pid":414529,)""" + R"""("host":"build01-01g.yt.yandex.net"},)""" + R"""("inner_errors":[{)""" + R"""("code":1,)""" + R"""("message":"Node //tt cannot have children",)""" + R"""("attributes":{)""" + R"""("fid":18446484571700269066,)""" + R"""("tid":17558639495721339338,)""" + R"""("datetime":"2017-04-07T13:38:56.474725Z",)""" + R"""("pid":414529,)""" + R"""("host":"build01-01g.yt.yandex.net"},)""" + R"""("inner_errors":[]}]})"""; + + NJson::TJsonValue jsonValue; + ReadJsonFastTree(jsonText, &jsonValue, /*throwOnError=*/ true); + + TYtError error(jsonValue); + UNIT_ASSERT_VALUES_EQUAL(error.GetCode(), 500); + UNIT_ASSERT_VALUES_EQUAL(error.GetMessage(), R"""(Error resolving path //home/user/link)"""); + UNIT_ASSERT_VALUES_EQUAL(error.InnerErrors().size(), 1); + UNIT_ASSERT_VALUES_EQUAL(error.InnerErrors()[0].GetCode(), 1); + + UNIT_ASSERT_VALUES_EQUAL(error.HasAttributes(), true); + UNIT_ASSERT_VALUES_EQUAL(error.GetAttributes().at("method"), TNode("Create")); + + UNIT_ASSERT_VALUES_EQUAL(error.GetAllErrorCodes(), TSet<int>({500, 1})); + } + + Y_UNIT_TEST(TestGetYsonText) { + const char* jsonText = + R"""({)""" + R"""("code":500,)""" + R"""("message":"outer error",)""" + R"""("attributes":{)""" + R"""("method":"Create",)""" + 
R"""("pid":414529},)""" + R"""("inner_errors":[{)""" + R"""("code":1,)""" + R"""("message":"inner error",)""" + R"""("attributes":{},)""" + R"""("inner_errors":[])""" + R"""(}]})"""; + TYtError error; + error.ParseFrom(jsonText); + TString ysonText = error.GetYsonText(); + TYtError error2(NodeFromYsonString(ysonText)); + UNIT_ASSERT_EQUAL( + ysonText, + R"""({"code"=500;"message"="outer error";"attributes"={"method"="Create";"pid"=414529};"inner_errors"=[{"code"=1;"message"="inner error"}]})"""); + UNIT_ASSERT_EQUAL(error2.GetYsonText(), ysonText); + } +} diff --git a/yt/cpp/mapreduce/interface/errors.cpp b/yt/cpp/mapreduce/interface/errors.cpp new file mode 100644 index 0000000000..49a7c7cfc1 --- /dev/null +++ b/yt/cpp/mapreduce/interface/errors.cpp @@ -0,0 +1,437 @@ +#include "errors.h" + +#include <library/cpp/yson/node/node_io.h> +#include <library/cpp/yson/node/node_visitor.h> + +#include <yt/cpp/mapreduce/interface/error_codes.h> + +#include <library/cpp/json/json_reader.h> +#include <library/cpp/yson/writer.h> + +#include <util/string/builder.h> +#include <util/stream/str.h> +#include <util/generic/set.h> + +namespace NYT { + +using namespace NJson; + +//////////////////////////////////////////////////////////////////// + +static void WriteErrorDescription(const TYtError& error, IOutputStream* out) +{ + (*out) << '\'' << error.GetMessage() << '\''; + const auto& innerErrorList = error.InnerErrors(); + if (!innerErrorList.empty()) { + (*out) << " { "; + bool first = true; + for (const auto& innerError : innerErrorList) { + if (first) { + first = false; + } else { + (*out) << " ; "; + } + WriteErrorDescription(innerError, out); + } + (*out) << " }"; + } +} + +static void SerializeError(const TYtError& error, NYson::IYsonConsumer* consumer) +{ + consumer->OnBeginMap(); + { + consumer->OnKeyedItem("code"); + consumer->OnInt64Scalar(error.GetCode()); + + consumer->OnKeyedItem("message"); + consumer->OnStringScalar(error.GetMessage()); + + if 
(!error.GetAttributes().empty()) { + consumer->OnKeyedItem("attributes"); + consumer->OnBeginMap(); + { + for (const auto& item : error.GetAttributes()) { + consumer->OnKeyedItem(item.first); + TNodeVisitor(consumer).Visit(item.second); + } + } + consumer->OnEndMap(); + } + + if (!error.InnerErrors().empty()) { + consumer->OnKeyedItem("inner_errors"); + { + consumer->OnBeginList(); + for (const auto& innerError : error.InnerErrors()) { + SerializeError(innerError, consumer); + } + consumer->OnEndList(); + } + } + } + consumer->OnEndMap(); +} + +static TString DumpJobInfoForException(const TOperationId& operationId, const TVector<TFailedJobInfo>& failedJobInfoList) +{ + ::TStringBuilder output; + // Exceptions have limit to contain 65508 bytes of text, so we also limit stderr text + constexpr size_t MAX_SIZE = 65508 / 2; + + size_t written = 0; + for (const auto& failedJobInfo : failedJobInfoList) { + if (written >= MAX_SIZE) { + break; + } + TStringStream nextChunk; + nextChunk << '\n'; + nextChunk << "OperationId: " << GetGuidAsString(operationId) << " JobId: " << GetGuidAsString(failedJobInfo.JobId) << '\n'; + nextChunk << "Error: " << failedJobInfo.Error.FullDescription() << '\n'; + if (!failedJobInfo.Stderr.empty()) { + nextChunk << "Stderr: " << Endl; + size_t tmpWritten = written + nextChunk.Str().size(); + if (tmpWritten >= MAX_SIZE) { + break; + } + + if (tmpWritten + failedJobInfo.Stderr.size() > MAX_SIZE) { + nextChunk << failedJobInfo.Stderr.substr(failedJobInfo.Stderr.size() - (MAX_SIZE - tmpWritten)); + } else { + nextChunk << failedJobInfo.Stderr; + } + } + written += nextChunk.Str().size(); + output << nextChunk.Str(); + } + return output; +} + +//////////////////////////////////////////////////////////////////// + +TYtError::TYtError() + : Code_(0) +{ } + +TYtError::TYtError(const TString& message) + : Code_(NYT::NClusterErrorCodes::Generic) + , Message_(message) +{ } + +TYtError::TYtError(int code, const TString& message) + : Code_(code) + , 
Message_(message) +{ } + +TYtError::TYtError(const TJsonValue& value) +{ + const TJsonValue::TMapType& map = value.GetMap(); + TJsonValue::TMapType::const_iterator it = map.find("message"); + if (it != map.end()) { + Message_ = it->second.GetString(); + } + + it = map.find("code"); + if (it != map.end()) { + Code_ = static_cast<int>(it->second.GetInteger()); + } else { + Code_ = NYT::NClusterErrorCodes::Generic; + } + + it = map.find("inner_errors"); + if (it != map.end()) { + const TJsonValue::TArray& innerErrors = it->second.GetArray(); + for (const auto& innerError : innerErrors) { + InnerErrors_.push_back(TYtError(innerError)); + } + } + + it = map.find("attributes"); + if (it != map.end()) { + auto attributes = NYT::NodeFromJsonValue(it->second); + if (attributes.IsMap()) { + Attributes_ = std::move(attributes.AsMap()); + } + } +} + +TYtError::TYtError(const TNode& node) +{ + const auto& map = node.AsMap(); + auto it = map.find("message"); + if (it != map.end()) { + Message_ = it->second.AsString(); + } + + it = map.find("code"); + if (it != map.end()) { + Code_ = static_cast<int>(it->second.AsInt64()); + } else { + Code_ = NYT::NClusterErrorCodes::Generic; + } + + it = map.find("inner_errors"); + if (it != map.end()) { + const auto& innerErrors = it->second.AsList(); + for (const auto& innerError : innerErrors) { + InnerErrors_.push_back(TYtError(innerError)); + } + } + + it = map.find("attributes"); + if (it != map.end()) { + auto& attributes = it->second; + if (attributes.IsMap()) { + Attributes_ = std::move(attributes.AsMap()); + } + } +} + +int TYtError::GetCode() const +{ + return Code_; +} + +const TString& TYtError::GetMessage() const +{ + return Message_; +} + +const TVector<TYtError>& TYtError::InnerErrors() const +{ + return InnerErrors_; +} + +void TYtError::ParseFrom(const TString& jsonError) +{ + TJsonValue value; + TStringInput input(jsonError); + ReadJsonTree(&input, &value); + *this = TYtError(value); +} + +TSet<int> 
TYtError::GetAllErrorCodes() const +{ + TDeque<const TYtError*> queue = {this}; + TSet<int> result; + while (!queue.empty()) { + const auto* current = queue.front(); + queue.pop_front(); + result.insert(current->Code_); + for (const auto& error : current->InnerErrors_) { + queue.push_back(&error); + } + } + return result; +} + +bool TYtError::ContainsErrorCode(int code) const +{ + if (Code_ == code) { + return true; + } + for (const auto& error : InnerErrors_) { + if (error.ContainsErrorCode(code)) { + return true; + } + } + return false; +} + + +bool TYtError::ContainsText(const TStringBuf& text) const +{ + if (Message_.Contains(text)) { + return true; + } + for (const auto& error : InnerErrors_) { + if (error.ContainsText(text)) { + return true; + } + } + return false; +} + +bool TYtError::HasAttributes() const +{ + return !Attributes_.empty(); +} + +const TNode::TMapType& TYtError::GetAttributes() const +{ + return Attributes_; +} + +TString TYtError::GetYsonText() const +{ + TStringStream out; + ::NYson::TYsonWriter writer(&out, NYson::EYsonFormat::Text); + SerializeError(*this, &writer); + return std::move(out.Str()); +} + +TString TYtError::ShortDescription() const +{ + TStringStream out; + WriteErrorDescription(*this, &out); + return std::move(out.Str()); +} + +TString TYtError::FullDescription() const +{ + TStringStream s; + WriteErrorDescription(*this, &s); + s << "; full error: " << GetYsonText(); + return s.Str(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TErrorResponse::TErrorResponse(int httpCode, const TString& requestId) + : HttpCode_(httpCode) + , RequestId_(requestId) +{ } + +bool TErrorResponse::IsOk() const +{ + return Error_.GetCode() == 0; +} + +void TErrorResponse::SetRawError(const TString& message) +{ + Error_ = TYtError(message); + Setup(); +} + +void TErrorResponse::SetError(TYtError error) +{ + Error_ = std::move(error); + Setup(); +} + +void TErrorResponse::ParseFromJsonError(const TString& 
jsonError) +{ + Error_.ParseFrom(jsonError); + Setup(); +} + +void TErrorResponse::SetIsFromTrailers(bool isFromTrailers) +{ + IsFromTrailers_ = isFromTrailers; +} + +int TErrorResponse::GetHttpCode() const +{ + return HttpCode_; +} + +bool TErrorResponse::IsFromTrailers() const +{ + return IsFromTrailers_; +} + +bool TErrorResponse::IsTransportError() const +{ + return HttpCode_ == 503; +} + +TString TErrorResponse::GetRequestId() const +{ + return RequestId_; +} + +const TYtError& TErrorResponse::GetError() const +{ + return Error_; +} + +bool TErrorResponse::IsResolveError() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NYTree::ResolveError); +} + +bool TErrorResponse::IsAccessDenied() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NSecurityClient::AuthorizationError); +} + +bool TErrorResponse::IsConcurrentTransactionLockConflict() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NCypressClient::ConcurrentTransactionLockConflict); +} + +bool TErrorResponse::IsRequestRateLimitExceeded() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NSecurityClient::RequestQueueSizeLimitExceeded); +} + +bool TErrorResponse::IsRequestQueueSizeLimitExceeded() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NRpc::RequestQueueSizeLimitExceeded); +} + +bool TErrorResponse::IsChunkUnavailable() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NChunkClient::ChunkUnavailable); +} + +bool TErrorResponse::IsRequestTimedOut() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::Timeout); +} + +bool TErrorResponse::IsNoSuchTransaction() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NTransactionClient::NoSuchTransaction); +} + +bool TErrorResponse::IsConcurrentOperationsLimitReached() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NScheduler::TooManyOperations); +} + +void TErrorResponse::Setup() +{ + TStringStream s; + *this << Error_.FullDescription(); +} + 
////////////////////////////////////////////////////////////////////

/// Stores the final state, operation id, error and failed-job list, and
/// builds the exception text: the error's full description, followed by the
/// failed-job dump when at least one failed job is present.
TOperationFailedError::TOperationFailedError(
    EState state,
    TOperationId id,
    TYtError ytError,
    TVector<TFailedJobInfo> failedJobInfo)
    : State_(state)
    , OperationId_(id)
    , Error_(std::move(ytError))
    , FailedJobInfo_(std::move(failedJobInfo))
{
    *this << Error_.FullDescription();
    if (!FailedJobInfo_.empty()) {
        *this << DumpJobInfoForException(OperationId_, FailedJobInfo_);
    }
}

/// Returns the state passed to the constructor.
TOperationFailedError::EState TOperationFailedError::GetState() const
{
    return State_;
}

/// Returns the id of the operation this error describes.
TOperationId TOperationFailedError::GetOperationId() const
{
    return OperationId_;
}

/// Returns the error passed to the constructor.
const TYtError& TOperationFailedError::GetError() const
{
    return Error_;
}

/// Returns the failed-job list passed to the constructor (may be empty).
const TVector<TFailedJobInfo>& TOperationFailedError::GetFailedJobInfo() const
{
    return FailedJobInfo_;
}

} // namespace NYT
// diff --git a/yt/cpp/mapreduce/interface/errors.h b/yt/cpp/mapreduce/interface/errors.h
// new file mode 100644
// index 0000000000..afad58ed72
// --- /dev/null
// +++ b/yt/cpp/mapreduce/interface/errors.h
// @@ -0,0 +1,290 @@
#pragma once

///
/// @file yt/cpp/mapreduce/interface/errors.h
///
/// Errors and exceptions emitted by library.

#include "fwd.h"
#include "common.h"

#include <library/cpp/yson/node/node.h>

#include <util/generic/bt_exception.h>
#include <util/generic/yexception.h>
#include <util/generic/string.h>
#include <util/generic/vector.h>

namespace NJson {
    class TJsonValue;
} // namespace NJson

namespace NYT {

////////////////////////////////////////////////////////////////////////////////

///
/// @brief Error that is thrown when library detects invalid usage of API.
///
/// For example trying to start operations on empty table list.
class TApiUsageError
    : public TWithBackTrace<yexception>
{ };

///
/// @brief Error that is thrown when request retries continue for too long.
///
/// @see NYT::TRetryConfig
/// @see NYT::IRetryConfigProvider
class TRequestRetriesTimeout
    : public yexception
{ };

////////////////////////////////////////////////////////////////////////////////

///
/// @brief Error returned by YT cluster.
///
/// An object of this class describes an error that happened on YT server.
/// Internally each error is a tree. Each node of the tree contains:
///   - integer error code;
///   - text description of error;
///   - attributes describing error context.
///
/// To get text description of an error one should use
/// @ref NYT::TYtError::ShortDescription or @ref NYT::TYtError::FullDescription
///
/// To distinguish between error kinds @ref NYT::TYtError::ContainsErrorCode should be used.
///
/// @see NYT::TErrorResponse
/// @see NYT::TOperationFailedError
class TYtError
{
public:
    /// Constructs error with NYT::NClusterErrorCodes::OK code and empty message.
    TYtError();

    /// Constructs error with NYT::NClusterErrorCodes::Generic code and given message.
    explicit TYtError(const TString& message);

    /// Constructs error with given code and given message.
    TYtError(int code, const TString& message);

    /// Construct error from json representation.
    // NOTE(review): not `explicit`, so a TJsonValue converts to TYtError implicitly; confirm this is intended.
    TYtError(const ::NJson::TJsonValue& value);

    /// Construct error from TNode representation.
    // NOTE(review): not `explicit`, so a TNode converts to TYtError implicitly; confirm this is intended.
    TYtError(const TNode& value);

    ///
    /// @brief Check if error or any of inner errors has given error code.
    ///
    /// Use this method to distinguish kind of error.
    bool ContainsErrorCode(int code) const;

    ///
    /// @brief Get short description of error.
    ///
    /// Short description contains text description of error and all inner errors.
    /// It is human readable but misses some important information (error codes, error attributes).
    ///
    /// Usually it's better to use @ref NYT::TYtError::FullDescription to log errors.
    TString ShortDescription() const;

    ///
    /// @brief Get full description of error.
    ///
    /// Full description contains readable short description
    /// followed by text yson representation of error that contains error codes and attributes.
    TString FullDescription() const;

    ///
    /// @brief Get error code of the topmost error.
    ///
    /// @warning Do not use this method to distinguish between error kinds
    /// @ref NYT::TYtError::ContainsErrorCode should be used instead.
    int GetCode() const;

    ///
    /// @brief Get error text of the topmost error.
    ///
    /// @warning This method should not be used to log errors
    /// since text description of inner errors is going to be lost.
    /// @ref NYT::TYtError::FullDescription should be used instead.
    const TString& GetMessage() const;

    ///
    /// @brief Check if error or any of inner errors contains given text chunk.
    ///
    /// @warning @ref NYT::TYtError::ContainsErrorCode must be used instead of
    /// this method when possible. If there is no suitable error code it's
    /// better to ask yt@ to add one. This method should only be used as workaround.
    bool ContainsText(const TStringBuf& text) const;

    /// @brief Get inner errors.
    const TVector<TYtError>& InnerErrors() const;

    /// Parse error from json string.
    void ParseFrom(const TString& jsonError);

    /// Collect error codes from entire error tree.
    // NOTE(review): no TSet header is included directly by this file;
    // presumably it arrives through fwd.h / common.h. Confirm.
    TSet<int> GetAllErrorCodes() const;

    /// Check if error has any attributes.
    bool HasAttributes() const;

    /// Get error attributes.
    const TNode::TMapType& GetAttributes() const;

    /// Get text yson representation of error
    TString GetYsonText() const;

private:
    int Code_;
    TString Message_;
    TVector<TYtError> InnerErrors_;
    TNode::TMapType Attributes_;
};

////////////////////////////////////////////////////////////////////////////////

///
/// @brief Generic error response returned by server.
///
/// TErrorResponse can be thrown from almost any client method when server responds with error.
+/// +class TErrorResponse + : public yexception +{ +public: + TErrorResponse(int httpCode, const TString& requestId); + TErrorResponse(int httpCode, TYtError error); + + /// Get error object returned by server. + const TYtError& GetError() const; + + /// Get id (correlation-id) of request that was responded with error. + TString GetRequestId() const; + + /// Get HTTP code of response. + int GetHttpCode() const; + + /// Is error parsed from response trailers. + bool IsFromTrailers() const; + + /// Check if error was caused by transport problems inside YT cluster. + bool IsTransportError() const; + + /// Check if error was caused by failure to resolve cypress path. + bool IsResolveError() const; + + /// Check if error was caused by lack of permissions to execute request. + bool IsAccessDenied() const; + + /// Check if error was caused by failure to lock object because another transaction is holding the lock. + bool IsConcurrentTransactionLockConflict() const; + + /// Check if error was caused by request quota limit exceeding. + bool IsRequestRateLimitExceeded() const; + + /// Check if error was caused by YT being overloaded and unable to serve the request. + bool IsRequestQueueSizeLimitExceeded() const; + + /// Check if error was caused by failure to get chunk. Such errors are almost always temporary. + bool IsChunkUnavailable() const; + + /// Check if error was caused by internal YT timeout. + bool IsRequestTimedOut() const; + + /// Check if error was caused by trying to work with transaction that was finished or never existed. + bool IsNoSuchTransaction() const; + + /// Check if error was caused by user reaching their limit of concurrently running operations. + bool IsConcurrentOperationsLimitReached() const; + + /// @deprecated This method must not be used. 
+ bool IsOk() const; + + void SetRawError(const TString& message); + void SetError(TYtError error); + void ParseFromJsonError(const TString& jsonError); + void SetIsFromTrailers(bool isFromTrailers); + +private: + void Setup(); + +private: + int HttpCode_; + TString RequestId_; + TYtError Error_; + bool IsFromTrailers_ = false; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Info about failed jobs. +/// +/// @see NYT::TOperationFailedError +struct TFailedJobInfo +{ + /// Id of a job. + TJobId JobId; + + /// Error describing job failure. + TYtError Error; + + /// Stderr of job. + /// + /// @note YT doesn't store all job stderrs, check @ref NYT::IOperationClient::GetJobStderr + /// for list of limitations. + /// + /// @see NYT::IOperationClient::GetJobStderr + TString Stderr; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Error that is thrown when operation watched by library fails. +/// +/// This error is thrown from operation starting methods when they are started in sync mode (@ref NYT::TOperationOptions::Wait == true) +/// or from future returned by NYT::IOperation::Watch. +/// +/// @see NYT::IOperationClient +class TOperationFailedError + : public yexception +{ +public: + /// Final state of operation. + enum EState { + /// Operation failed due to some error. + Failed, + /// Operation didn't experience errors, but was aborted by user request or by YT. + Aborted, + }; + +public: + TOperationFailedError(EState state, TOperationId id, TYtError ytError, TVector<TFailedJobInfo> failedJobInfo); + + /// Get final state of operation. + EState GetState() const; + + /// Get operation id. + TOperationId GetOperationId() const; + + /// Return operation error. + const TYtError& GetError() const; + + /// Return info about failed jobs (if any). 
+ const TVector<TFailedJobInfo>& GetFailedJobInfo() const; + +private: + EState State_; + TOperationId OperationId_; + TYtError Error_; + TVector<TFailedJobInfo> FailedJobInfo_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/finish_or_die.h b/yt/cpp/mapreduce/interface/finish_or_die.h new file mode 100644 index 0000000000..9d7dcece02 --- /dev/null +++ b/yt/cpp/mapreduce/interface/finish_or_die.h @@ -0,0 +1,41 @@ +#pragma once + +#include <util/system/yassert.h> + +#include <exception> + +/// @cond Doxygen_Suppress +namespace NYT::NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +void FinishOrDie(T* pThis, const char* className) noexcept +{ + auto fail = [&] (const char* what) { + Y_FAIL( + "\n\n" + "Destructor of %s caught exception during Finish: %s.\n" + "Some data is probably has not been written.\n" + "In order to handle such exceptions consider explicitly call Finish() method.\n", + className, + what); + }; + + try { + pThis->Finish(); + } catch (const std::exception& ex) { + if (!std::uncaught_exceptions()) { + fail(ex.what()); + } + } catch (...) { + if (!std::uncaught_exceptions()) { + fail("<unknown exception>"); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail +/// @endcond diff --git a/yt/cpp/mapreduce/interface/fluent.h b/yt/cpp/mapreduce/interface/fluent.h new file mode 100644 index 0000000000..8ca6e86336 --- /dev/null +++ b/yt/cpp/mapreduce/interface/fluent.h @@ -0,0 +1,678 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/fluent.h +/// +/// Adapters for working with @ref NYson::IYsonConsumer in a structured way, with compile-time syntax checks. +/// +/// The following documentation is copied verbatim from `yt/core/ytree/fluent.h`. 
+/// +/// WHAT IS THIS +/// +/// Fluent adapters encapsulate invocation of IYsonConsumer methods in a +/// convenient structured manner. Key advantage of fluent-like code is that +/// attempt of building syntactically incorrect YSON structure will result +/// in a compile-time error. +/// +/// Each fluent object is associated with a context that defines possible YSON +/// tokens that may appear next. For example, TFluentMap is a fluent object +/// that corresponds to a location within YSON map right before a key-value +/// pair or the end of the map. +/// +/// More precisely, each object that may be obtained by a sequence of fluent +/// method calls has the full history of its enclosing YSON composite types in +/// its single template argument hereinafter referred to as TParent. This allows +/// us not to forget the original context after opening and closing the embedded +/// composite structure. +/// +/// It is possible to invoke a separate YSON building procedure by calling +/// one of convenience Do* methods. There are two possibilities here: it is +/// possible to delegate invocation context either as a fluent object (like +/// TFluentMap, TFluentList, TFluentAttributes or TFluentAny) or as a raw +/// IYsonConsumer*. The latter is discouraged since it is impossible to check +/// if a given side-built YSON structure fits current fluent context. +/// For example it is possible to call Do() method inside YSON map passing +/// consumer to a procedure that will treat context like it is in a list. +/// Passing typed fluent builder saves you from such a misbehaviour. +/// +/// TFluentXxx corresponds to an internal class of TXxx +/// without any history hidden in template argument. It allows you to +/// write procedures of form: +/// +/// void BuildSomeAttributesInYson(TFluentMap fluent) { ... } +/// +/// without thinking about the exact way how this procedure is nested in other +/// procedures. 
+/// +/// An important notation: we will refer to a function whose first argument +/// is TFluentXxx as TFuncXxx. +/// +/// +/// BRIEF LIST OF AVAILABLE METHODS +/// +/// Only the most popular methods are covered here. Refer to the code for the +/// rest of them. +/// +/// TAny: +/// * Value(T value) -> TParent, serialize `value` using underlying consumer. +/// T should be such that free function Serialize(const T&, NYson::IYsonConsumer*) is +/// defined; +/// * BeginMap() -> TFluentMap, open map; +/// * BeginList() -> TFluentList, open list; +/// * BeginAttributes() -> TFluentAttributes, open attributes; +/// +/// * Do(TFuncAny func) -> TAny, delegate invocation to a separate procedure. +/// * DoIf(bool condition, TFuncAny func) -> TAny, same as Do() but invoke +/// `func` only if `condition` is true; +/// * DoFor(TCollection collection, TFuncAny func) -> TAny, same as Do() +/// but iterate over `collection` and pass each of its elements as a second +/// argument to `func`. Instead of passing a collection it is also possible +/// to pass two iterators as arguments; +/// +/// * DoMap(TFuncMap func) -> TAny, open a map, delegate invocation to a separate +/// procedure and close map; +/// * DoMapFor(TCollection collection, TFuncMap func) -> TAny, open a map, iterate +/// over `collection` and pass each of its elements as a second argument to `func` +/// and close map; +/// * DoList(TFuncList func) -> TAny, same as DoMap(); +/// * DoListFor(TCollection collection, TFuncList func) -> TAny; same as DoMapFor(). +/// +/// +/// TFluentMap: +/// * Item(TStringBuf key) -> TAny, open an element keyed with `key`; +/// * EndMap() -> TParent, close map; +/// * Do(TFuncMap func) -> TFluentMap, same as Do() for TAny; +/// * DoIf(bool condition, TFuncMap func) -> TFluentMap, same as DoIf() for TAny; +/// * DoFor(TCollection collection, TFuncMap func) -> TFluentMap, same as DoFor() for TAny. 
+/// +/// +/// TFluentList: +/// * Item() -> TAny, open a new list element; +/// * EndList() -> TParent, close list; +/// * Do(TFuncList func) -> TFluentList, same as Do() for TAny; +/// * DoIf(bool condition, TFuncList func) -> TFluentList, same as DoIf() for TAny; +/// * DoFor(TCollection collection, TFuncList func) -> TFluentList, same as DoFor() for TAny. +/// +/// +/// TFluentAttributes: +/// * Item(TStringBuf key) -> TAny, open an element keyed with `key`. +/// * EndAttributes() -> TParentWithoutAttributes, close attributes. Note that +/// this method leads to a context that is forced not to have attributes, +/// preventing us from putting attributes twice before an object. +/// * Do(TFuncAttributes func) -> TFluentAttributes, same as Do() for TAny; +/// * DoIf(bool condition, TFuncAttributes func) -> TFluentAttributes, same as DoIf() +/// for TAny; +/// * DoFor(TCollection collection, TFuncAttributes func) -> TFluentAttributes, same as DoFor() +/// for TAny. +/// + + +#include "common.h" +#include "serialize.h" + +#include <library/cpp/yson/node/serialize.h> +#include <library/cpp/yson/node/node_builder.h> + +#include <library/cpp/yson/consumer.h> +#include <library/cpp/yson/writer.h> + +#include <util/generic/noncopyable.h> +#include <util/generic/ptr.h> +#include <util/stream/str.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +struct TFluentYsonUnwrapper +{ + using TUnwrapped = T; + + static TUnwrapped Unwrap(T t) + { + return std::move(t); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct TFluentYsonVoid +{ }; + +template <> +struct TFluentYsonUnwrapper<TFluentYsonVoid> +{ + using TUnwrapped = void; + + static TUnwrapped Unwrap(TFluentYsonVoid) + { } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// This class is actually a namespace for specific fluent adapter classes. 
+class TFluentYsonBuilder + : private TNonCopyable +{ +private: + template <class T> + static void WriteValue(NYT::NYson::IYsonConsumer* consumer, const T& value) + { + Serialize(value, consumer); + } + +public: + class TFluentAny; + template <class TParent> class TAny; + template <class TParent> class TToAttributes; + template <class TParent> class TAttributes; + template <class TParent> class TListType; + template <class TParent> class TMapType; + + /// Base class for all fluent adapters. + template <class TParent> + class TFluentBase + { + public: + /// Implicit conversion to yson consumer + operator NYT::NYson::IYsonConsumer* () const + { + return Consumer; + } + + protected: + /// @cond Doxygen_Suppress + NYT::NYson::IYsonConsumer* Consumer; + TParent Parent; + + TFluentBase(NYT::NYson::IYsonConsumer* consumer, TParent parent) + : Consumer(consumer) + , Parent(std::move(parent)) + { } + + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + TUnwrappedParent GetUnwrappedParent() + { + return TFluentYsonUnwrapper<TParent>::Unwrap(std::move(Parent)); + } + /// @endcond Doxygen_Suppress + }; + + /// Base class for fluent adapters for fragment of list, map or attributes. + template <template <class TParent> class TThis, class TParent> + class TFluentFragmentBase + : public TFluentBase<TParent> + { + public: + using TDeepThis = TThis<TParent>; + using TShallowThis = TThis<TFluentYsonVoid>; + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + explicit TFluentFragmentBase(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent()) + : TFluentBase<TParent>(consumer, std::move(parent)) + { } + + /// Delegate invocation to a separate procedure. + template <class TFunc> + TDeepThis& Do(const TFunc& func) + { + func(TShallowThis(this->Consumer)); + return *static_cast<TDeepThis*>(this); + } + + /// Conditionally delegate invocation to a separate procedure. 
+ template <class TFunc> + TDeepThis& DoIf(bool condition, const TFunc& func) + { + if (condition) { + func(TShallowThis(this->Consumer)); + } + return *static_cast<TDeepThis*>(this); + } + + /// Calls `func(fluent, it)` for each iterator `it` in range `[begin, end)`; the iterator itself (not `*it`) is passed to `func`. + template <class TFunc, class TIterator> + TDeepThis& DoFor(const TIterator& begin, const TIterator& end, const TFunc& func) + { + for (auto current = begin; current != end; ++current) { + func(TShallowThis(this->Consumer), current); + } + return *static_cast<TDeepThis*>(this); + } + + /// Calls `func(fluent, element)` for each `element` in `collection`, where `fluent` is a parentless fragment over the same consumer. + template <class TFunc, class TCollection> + TDeepThis& DoFor(const TCollection& collection, const TFunc& func) + { + for (const auto& item : collection) { + func(TShallowThis(this->Consumer), item); + } + return *static_cast<TDeepThis*>(this); + } + + }; + + /// Fluent adapter of a value without attributes. + template <class TParent> + class TAnyWithoutAttributes + : public TFluentBase<TParent> + { + public: + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + TAnyWithoutAttributes(NYT::NYson::IYsonConsumer* consumer, TParent parent) + : TFluentBase<TParent>(consumer, std::move(parent)) + { } + + /// Pass `value` to underlying consumer. + template <class T> + TUnwrappedParent Value(const T& value) + { + WriteValue(this->Consumer, value); + return this->GetUnwrappedParent(); + } + + /// Call `OnEntity()` of underlying consumer. + TUnwrappedParent Entity() + { + this->Consumer->OnEntity(); + return this->GetUnwrappedParent(); + } + + /// Serialize `collection` to underlying consumer as a list. 
+ template <class TCollection> + TUnwrappedParent List(const TCollection& collection) + { + this->Consumer->OnBeginList(); + for (const auto& item : collection) { + this->Consumer->OnListItem(); + WriteValue(this->Consumer, item); + } + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + + /// Serialize at most `maxSize` elements of `collection` to underlying consumer as a list carrying a `count` attribute equal to `collection.size()`. + template <class TCollection> + TUnwrappedParent ListLimited(const TCollection& collection, size_t maxSize) + { + this->Consumer->OnBeginAttributes(); + this->Consumer->OnKeyedItem("count"); + this->Consumer->OnInt64Scalar(collection.size()); + this->Consumer->OnEndAttributes(); + this->Consumer->OnBeginList(); + size_t printedSize = 0; + for (const auto& item : collection) { + if (printedSize >= maxSize) + break; + this->Consumer->OnListItem(); + WriteValue(this->Consumer, item); + ++printedSize; + } + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + + /// Open a list. + TListType<TParent> BeginList() + { + this->Consumer->OnBeginList(); + return TListType<TParent>(this->Consumer, this->Parent); + } + + /// Open a list, delegate invocation to `func`, then close the list. + template <class TFunc> + TUnwrappedParent DoList(const TFunc& func) + { + this->Consumer->OnBeginList(); + func(TListType<TFluentYsonVoid>(this->Consumer)); + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + + /// Open a list, call `func(fluentList, it)` for each iterator `it` of range `[begin, end)` (the iterator itself is passed), then close the list. 
+ template <class TFunc, class TIterator> + TUnwrappedParent DoListFor(const TIterator& begin, const TIterator& end, const TFunc& func) + { + this->Consumer->OnBeginList(); + for (auto current = begin; current != end; ++current) { + func(TListType<TFluentYsonVoid>(this->Consumer), current); + } + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + + /// Open a list, call `func(fluentList, element)` for each `element` of `collection`, then close the list. + template <class TFunc, class TCollection> + TUnwrappedParent DoListFor(const TCollection& collection, const TFunc& func) + { + this->Consumer->OnBeginList(); + for (const auto& item : collection) { + func(TListType<TFluentYsonVoid>(this->Consumer), item); + } + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + + /// Open a map. + TMapType<TParent> BeginMap() + { + this->Consumer->OnBeginMap(); + return TMapType<TParent>(this->Consumer, this->Parent); + } + + /// Open a map, delegate invocation to `func`, then close the map. + template <class TFunc> + TUnwrappedParent DoMap(const TFunc& func) + { + this->Consumer->OnBeginMap(); + func(TMapType<TFluentYsonVoid>(this->Consumer)); + this->Consumer->OnEndMap(); + return this->GetUnwrappedParent(); + } + + /// Open a map, call `func(fluentMap, it)` for each iterator `it` of range `[begin, end)` (the iterator itself is passed), then close the map. + template <class TFunc, class TIterator> + TUnwrappedParent DoMapFor(const TIterator& begin, const TIterator& end, const TFunc& func) + { + this->Consumer->OnBeginMap(); + for (auto current = begin; current != end; ++current) { + func(TMapType<TFluentYsonVoid>(this->Consumer), current); + } + this->Consumer->OnEndMap(); + return this->GetUnwrappedParent(); + } + + /// Open a map, call `func(fluentMap, element)` for each `element` of `collection`, then close the map. 
+ template <class TFunc, class TCollection> + TUnwrappedParent DoMapFor(const TCollection& collection, const TFunc& func) + { + this->Consumer->OnBeginMap(); + for (const auto& item : collection) { + func(TMapType<TFluentYsonVoid>(this->Consumer), item); + } + this->Consumer->OnEndMap(); + return this->GetUnwrappedParent(); + } + }; + + /// Fluent adapter of any value. + template <class TParent> + class TAny + : public TAnyWithoutAttributes<TParent> + { + public: + using TBase = TAnyWithoutAttributes<TParent>; + + explicit TAny(NYT::NYson::IYsonConsumer* consumer, TParent parent) + : TBase(consumer, std::move(parent)) + { } + + /// Open attributes. + TAttributes<TBase> BeginAttributes() + { + this->Consumer->OnBeginAttributes(); + return TAttributes<TBase>( + this->Consumer, + TBase(this->Consumer, this->Parent)); + } + }; + + /// Fluent adapter of attributes fragment (the inside part of attributes). + template <class TParent = TFluentYsonVoid> + class TAttributes + : public TFluentFragmentBase<TAttributes, TParent> + { + public: + using TThis = TAttributes<TParent>; + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + explicit TAttributes(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent()) + : TFluentFragmentBase<TFluentYsonBuilder::TAttributes, TParent>(consumer, std::move(parent)) + { } + + /// Pass attribute key to underlying consumer. + TAny<TThis> Item(const TStringBuf& key) + { + this->Consumer->OnKeyedItem(key); + return TAny<TThis>(this->Consumer, *this); + } + + /// Pass attribute key to underlying consumer. + template <size_t Size> + TAny<TThis> Item(const char (&key)[Size]) + { + return Item(TStringBuf(key, Size - 1)); + } + + //TODO: from TNode + + /// Close the attributes. + TUnwrappedParent EndAttributes() + { + this->Consumer->OnEndAttributes(); + return this->GetUnwrappedParent(); + } + }; + + /// Fluent adapter of list fragment (the inside part of a list). 
+ template <class TParent = TFluentYsonVoid> + class TListType + : public TFluentFragmentBase<TListType, TParent> + { + public: + using TThis = TListType<TParent>; + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + explicit TListType(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent()) + : TFluentFragmentBase<TFluentYsonBuilder::TListType, TParent>(consumer, std::move(parent)) + { } + + /// Call `OnListItem()` of underlying consumer. + TAny<TThis> Item() + { + this->Consumer->OnListItem(); + return TAny<TThis>(this->Consumer, *this); + } + + // TODO: from TNode + + /// Close the list. + TUnwrappedParent EndList() + { + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + }; + + /// Fluent adapter of map fragment (the inside part of a map). + template <class TParent = TFluentYsonVoid> + class TMapType + : public TFluentFragmentBase<TMapType, TParent> + { + public: + using TThis = TMapType<TParent>; + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + explicit TMapType(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent()) + : TFluentFragmentBase<TFluentYsonBuilder::TMapType, TParent>(consumer, std::move(parent)) + { } + + /// Pass map key to underlying consumer. + template <size_t Size> + TAny<TThis> Item(const char (&key)[Size]) + { + return Item(TStringBuf(key, Size - 1)); + } + + /// Pass map key to underlying consumer. + TAny<TThis> Item(const TStringBuf& key) + { + this->Consumer->OnKeyedItem(key); + return TAny<TThis>(this->Consumer, *this); + } + + // TODO: from TNode + + /// Close the map. + TUnwrappedParent EndMap() + { + this->Consumer->OnEndMap(); + return this->GetUnwrappedParent(); + } + }; + +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Builder representing any value. +using TFluentAny = TFluentYsonBuilder::TAny<TFluentYsonVoid>; + +/// Builder representing the inside of a list (list fragment). 
+using TFluentList = TFluentYsonBuilder::TListType<TFluentYsonVoid>; + +/// Builder representing the inside of a map (map fragment). +using TFluentMap = TFluentYsonBuilder::TMapType<TFluentYsonVoid>; + +/// Builder representing the inside of attributes. +using TFluentAttributes = TFluentYsonBuilder::TAttributes<TFluentYsonVoid>; + +//////////////////////////////////////////////////////////////////////////////// + +/// Create a fluent adapter to invoke methods of `consumer`. +static inline TFluentAny BuildYsonFluently(NYT::NYson::IYsonConsumer* consumer) +{ + return TFluentAny(consumer, TFluentYsonVoid()); +} + +/// Create a fluent adapter to invoke methods of `consumer` describing the contents of a list. +static inline TFluentList BuildYsonListFluently(NYT::NYson::IYsonConsumer* consumer) +{ + return TFluentList(consumer); +} + +/// Create a fluent adapter to invoke methods of `consumer` describing the contents of a map. +static inline TFluentMap BuildYsonMapFluently(NYT::NYson::IYsonConsumer* consumer) +{ + return TFluentMap(consumer); +} + +//////////////////////////////////////////////////////////////////////////////// + +class TFluentYsonWriterState + : public TThrRefBase +{ +public: + using TValue = TString; + + explicit TFluentYsonWriterState(::NYson::EYsonFormat format) + : Writer(&Output, format) + { } + + TString GetValue() + { + return Output.Str(); + } + + NYT::NYson::IYsonConsumer* GetConsumer() + { + return &Writer; + } + +private: + TStringStream Output; + ::NYson::TYsonWriter Writer; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TFluentYsonBuilderState + : public TThrRefBase +{ +public: + using TValue = TNode; + + explicit TFluentYsonBuilderState() + : Builder(&Node) + { } + + TNode GetValue() + { + return std::move(Node); + } + + NYT::NYson::IYsonConsumer* GetConsumer() + { + return &Builder; + } + +private: + TNode Node; + TNodeBuilder Builder; +}; + 
+//////////////////////////////////////////////////////////////////////////////// + +template <class TState> +class TFluentYsonHolder +{ +public: + explicit TFluentYsonHolder(::TIntrusivePtr<TState> state) + : State(state) + { } + + ::TIntrusivePtr<TState> GetState() const + { + return State; + } + +private: + ::TIntrusivePtr<TState> State; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <class TState> +struct TFluentYsonUnwrapper< TFluentYsonHolder<TState> > +{ + using TUnwrapped = typename TState::TValue; + + static TUnwrapped Unwrap(const TFluentYsonHolder<TState>& holder) + { + return std::move(holder.GetState()->GetValue()); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <class TState> +TFluentYsonBuilder::TAny<TFluentYsonHolder<TState>> +BuildYsonFluentlyWithState(::TIntrusivePtr<TState> state) +{ + return TFluentYsonBuilder::TAny<TFluentYsonHolder<TState>>( + state->GetConsumer(), + TFluentYsonHolder<TState>(state)); +} + +/// Create a fluent adapter returning a `TString` with corresponding YSON when construction is finished. +inline TFluentYsonBuilder::TAny<TFluentYsonHolder<TFluentYsonWriterState>> +BuildYsonStringFluently(::NYson::EYsonFormat format = ::NYson::EYsonFormat::Text) +{ + ::TIntrusivePtr<TFluentYsonWriterState> state(new TFluentYsonWriterState(format)); + return BuildYsonFluentlyWithState(state); +} + +/// Create a fluent adapter returning a @ref NYT::TNode when construction is finished. 
+inline TFluentYsonBuilder::TAny<TFluentYsonHolder<TFluentYsonBuilderState>> +BuildYsonNodeFluently() +{ + ::TIntrusivePtr<TFluentYsonBuilderState> state(new TFluentYsonBuilderState); + return BuildYsonFluentlyWithState(state); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/format.cpp b/yt/cpp/mapreduce/interface/format.cpp new file mode 100644 index 0000000000..f8318310a4 --- /dev/null +++ b/yt/cpp/mapreduce/interface/format.cpp @@ -0,0 +1,135 @@ +#include "format.h" +#include "protobuf_format.h" + +#include "errors.h" + +#include <google/protobuf/descriptor.h> +#include <google/protobuf/messagext.h> + +namespace NYT { + +TTableSchema CreateTableSchema( + const ::google::protobuf::Descriptor& messageDescriptor, + bool keepFieldsWithoutExtension) +{ + return NDetail::CreateTableSchemaImpl(messageDescriptor, keepFieldsWithoutExtension); +} + +//////////////////////////////////////////////////////////////////////////////// + +TFormat::TFormat(const TNode& config) + : Config(config) +{ } + + +TFormat TFormat::Protobuf( + const TVector<const ::google::protobuf::Descriptor*>& descriptors, + bool withDescriptors) +{ + if (withDescriptors) { + return TFormat(NDetail::MakeProtoFormatConfigWithDescriptors(descriptors)); + } else { + return TFormat(NDetail::MakeProtoFormatConfigWithTables(descriptors)); + } +} + +TFormat TFormat::YsonText() +{ + TNode config("yson"); + config.Attributes()("format", "text"); + return TFormat(config); +} + +TFormat TFormat::YsonBinary() +{ + TNode config("yson"); + config.Attributes()("format", "binary"); + return TFormat(config); +} + +TFormat TFormat::YaMRLenval() +{ + TNode config("yamr"); + config.Attributes()("lenval", true)("has_subkey", true); + return TFormat(config); +} + +TFormat TFormat::Json() +{ + return TFormat(TNode("json")); +} + +bool TFormat::IsTextYson() const +{ + if (!Config.IsString() || Config.AsString() != "yson") { + 
return false; + } + if (!Config.HasAttributes()) { + return false; + } + const auto& attributes = Config.GetAttributes(); + if (!attributes.HasKey("format") || attributes["format"] != TNode("text")) { + return false; + } + return true; +} + +bool TFormat::IsProtobuf() const +{ + return Config.IsString() && Config.AsString() == "protobuf"; +} + +bool TFormat::IsYamredDsv() const +{ + return Config.IsString() && Config.AsString() == "yamred_dsv"; +} + +static TString FormatName(const TFormat& format) +{ + if (!format.Config.IsString()) { + Y_VERIFY(format.Config.IsUndefined()); + return "<undefined>"; + } + return format.Config.AsString(); +} + +TYamredDsvAttributes TFormat::GetYamredDsvAttributes() const +{ + if (!IsYamredDsv()) { + ythrow TApiUsageError() << "Cannot get yamred_dsv attributes for " << FormatName(*this) << " format"; + } + TYamredDsvAttributes attributes; + + const auto& nodeAttributes = Config.GetAttributes(); + { + const auto& keyColumns = nodeAttributes["key_column_names"]; + if (!keyColumns.IsList()) { + ythrow yexception() << "Ill-formed format: key_column_names is of non-list type: " << keyColumns.GetType(); + } + for (auto& column : keyColumns.AsList()) { + if (!column.IsString()) { + ythrow yexception() << "Ill-formed format: key_column_names: " << column.GetType(); + } + attributes.KeyColumnNames.push_back(column.AsString()); + } + } + + if (nodeAttributes.HasKey("subkey_column_names")) { + const auto& subkeyColumns = nodeAttributes["subkey_column_names"]; + if (!subkeyColumns.IsList()) { + ythrow yexception() << "Ill-formed format: subkey_column_names is not a list: " << subkeyColumns.GetType(); + } + for (const auto& column : subkeyColumns.AsList()) { + if (!column.IsString()) { + ythrow yexception() << "Ill-formed format: non-string inside subkey_key_column_names: " << column.GetType(); + } + attributes.SubkeyColumnNames.push_back(column.AsString()); + } + } + + return attributes; +} + 
+//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/format.h b/yt/cpp/mapreduce/interface/format.h new file mode 100644 index 0000000000..e297576464 --- /dev/null +++ b/yt/cpp/mapreduce/interface/format.h @@ -0,0 +1,122 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/format.h +/// +/// Header containing class to work with raw [YT formats](https://yt.yandex-team.ru/docs/description/storage/formats.html). + +#include "node.h" + +#include <google/protobuf/descriptor.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// @deprecated +struct TYamredDsvAttributes +{ + /// Names of key columns. + TVector<TString> KeyColumnNames; + + /// Names of subkey columns. + TVector<TString> SubkeyColumnNames; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Class representing YT data format. +/// +/// Normally the user does not need to use it. +/// However, the class is handy for "raw" operations and table reading and writing, +/// e.g. @ref NYT::IOperationClient::RawMap and other raw operations, +/// @ref NYT::IIOClient::CreateRawReader and @ref NYT::IIOClient::CreateRawWriter. +/// Anyway, the static factory methods should be preferred to the constructor. +/// +/// @see [YT doc](https://yt.yandex-team.ru/docs/description/storage/formats.html). +struct TFormat +{ +public: + /// Format representation understandable by YT. + TNode Config; + +public: + /// @brief Construct format from given YT format representation. + /// + /// @note Prefer using static factory methods (e.g. @ref NYT::TFormat::YsonBinary, @ref NYT::TFormat::YsonText, @ref NYT::TFormat::Protobuf). + explicit TFormat(const TNode& config = TNode()); + + /// @brief Create text YSON format. 
+ /// + /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#YSON) + static TFormat YsonText(); + + /// @brief Create binary YSON format. + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#YSON) + static TFormat YsonBinary(); + + /// @brief Create YaMR format with `lenval` and `has_subkey` attributes enabled. + /// + /// @deprecated + static TFormat YaMRLenval(); + + /// @brief Create protobuf format from protobuf message descriptors. + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/api/c++/protobuf.html). + static TFormat Protobuf( + const TVector<const ::google::protobuf::Descriptor*>& descriptors, + bool withDescriptors = false); + + /// @brief Create JSON format. + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#JSON) + static TFormat Json(); + + /// @brief Create protobuf format for the message specified in template parameter. + /// + /// `T` must be inherited from `Message`. + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/api/c++/protobuf.html). + template<typename T> + static inline TFormat Protobuf(bool withDescriptors = false); + + /// @brief Is the format text YSON? + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#YSON) + bool IsTextYson() const; + + /// @brief Is the format protobuf? + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/api/c++/protobuf.html) + bool IsProtobuf() const; + + /// @brief Is the format yamred_dsv? + /// + /// @deprecated + bool IsYamredDsv() const; + + /// @brief For yamred_dsv format returns its attributes in a structured way. 
+ /// + /// @deprecated + TYamredDsvAttributes GetYamredDsvAttributes() const; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template<typename T> +TFormat TFormat::Protobuf(bool withDescriptors) { + return TFormat::Protobuf({T::descriptor()}, withDescriptors); +} + +/// @brief Create table schema from protobuf message descriptor. 
+/// +/// @param messageDescriptor Message descriptor +/// @param keepFieldsWithoutExtension Add to schema fields without "column_name" or "key_column_name" extensions. +TTableSchema CreateTableSchema( + const ::google::protobuf::Descriptor& messageDescriptor, + bool keepFieldsWithoutExtension); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/format_ut.cpp b/yt/cpp/mapreduce/interface/format_ut.cpp new file mode 100644 index 0000000000..069c29087d --- /dev/null +++ b/yt/cpp/mapreduce/interface/format_ut.cpp @@ -0,0 +1,235 @@ +#include "common.h" +#include "errors.h" +#include "format.h" +#include "common_ut.h" + +#include <yt/cpp/mapreduce/interface/proto3_ut.pb.h> +#include <yt/cpp/mapreduce/interface/protobuf_table_schema_ut.pb.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NYT; + +static TNode GetColumns(const TFormat& format, int tableIndex = 0) +{ + return format.Config.GetAttributes()["tables"][tableIndex]["columns"]; +} + +Y_UNIT_TEST_SUITE(ProtobufFormat) +{ + Y_UNIT_TEST(TIntegral) + { + const auto format = TFormat::Protobuf<NUnitTesting::TIntegral>(); + auto columns = GetColumns(format); + + struct TColumn + { + TString Name; + TString ProtoType; + int FieldNumber; + }; + + auto expected = TVector<TColumn>{ + {"DoubleField", "double", 1}, + {"FloatField", "float", 2}, + {"Int32Field", "int32", 3}, + {"Int64Field", "int64", 4}, + {"Uint32Field", "uint32", 5}, + {"Uint64Field", "uint64", 6}, + {"Sint32Field", "sint32", 7}, + {"Sint64Field", "sint64", 8}, + {"Fixed32Field", "fixed32", 9}, + {"Fixed64Field", "fixed64", 10}, + {"Sfixed32Field", "sfixed32", 11}, + {"Sfixed64Field", "sfixed64", 12}, + {"BoolField", "bool", 13}, + {"EnumField", "enum_string", 14}, + }; + + UNIT_ASSERT_VALUES_EQUAL(columns.Size(), expected.size()); + for (int i = 0; i < static_cast<int>(columns.Size()); ++i) { + UNIT_ASSERT_VALUES_EQUAL(columns[i]["name"], 
expected[i].Name); + UNIT_ASSERT_VALUES_EQUAL(columns[i]["proto_type"], expected[i].ProtoType); + UNIT_ASSERT_VALUES_EQUAL(columns[i]["field_number"], expected[i].FieldNumber); + } + } + + Y_UNIT_TEST(TRowFieldSerializationOption) + { + const auto format = TFormat::Protobuf<NUnitTesting::TRowFieldSerializationOption>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns[0]["name"], "UrlRow_1"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["field_number"], 1); + const auto& fields = columns[0]["fields"]; + UNIT_ASSERT_VALUES_EQUAL(fields[0]["name"], "Host"); + UNIT_ASSERT_VALUES_EQUAL(fields[0]["proto_type"], "string"); + UNIT_ASSERT_VALUES_EQUAL(fields[0]["field_number"], 1); + + UNIT_ASSERT_VALUES_EQUAL(fields[1]["name"], "Path"); + UNIT_ASSERT_VALUES_EQUAL(fields[1]["proto_type"], "string"); + UNIT_ASSERT_VALUES_EQUAL(fields[1]["field_number"], 2); + + UNIT_ASSERT_VALUES_EQUAL(fields[2]["name"], "HttpCode"); + UNIT_ASSERT_VALUES_EQUAL(fields[2]["proto_type"], "sint32"); + UNIT_ASSERT_VALUES_EQUAL(fields[2]["field_number"], 3); + + UNIT_ASSERT_VALUES_EQUAL(columns[1]["name"], "UrlRow_2"); + UNIT_ASSERT_VALUES_EQUAL(columns[1]["proto_type"], "message"); + UNIT_ASSERT_VALUES_EQUAL(columns[1]["field_number"], 2); + } + + Y_UNIT_TEST(Packed) + { + const auto format = TFormat::Protobuf<NUnitTesting::TPacked>(); + auto column = GetColumns(format)[0]; + + UNIT_ASSERT_VALUES_EQUAL(column["name"], "PackedListInt64"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "int64"); + UNIT_ASSERT_VALUES_EQUAL(column["field_number"], 1); + UNIT_ASSERT_VALUES_EQUAL(column["packed"], true); + UNIT_ASSERT_VALUES_EQUAL(column["repeated"], true); + } + + Y_UNIT_TEST(Cyclic) + { + UNIT_ASSERT_EXCEPTION(TFormat::Protobuf<NUnitTesting::TCyclic>(), TApiUsageError); + UNIT_ASSERT_EXCEPTION(TFormat::Protobuf<NUnitTesting::TCyclic::TA>(), TApiUsageError); + 
Y_UNIT_TEST(Map)
{
    const auto format = TFormat::Protobuf<NUnitTesting::TWithMap>();
    auto columns = GetColumns(format);

    UNIT_ASSERT_VALUES_EQUAL(columns.Size(), 5);

    // Every column of TWithMap is expected to be a structured message with exactly two
    // fields: the first one is int64, the proto type of the second one varies per column.
    auto verifyColumn = [] (const TNode& column, TStringBuf expectedName, TStringBuf expectedSecondFieldType) {
        UNIT_ASSERT_VALUES_EQUAL(column["name"], expectedName);
        UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message");
        UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 2);
        UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["proto_type"], "int64");
        UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["proto_type"], expectedSecondFieldType);
    };

    verifyColumn(columns[0], "MapDefault", "message");
    verifyColumn(columns[1], "MapListOfStructsLegacy", "message");
    verifyColumn(columns[2], "MapListOfStructs", "structured_message");
    verifyColumn(columns[3], "MapOptionalDict", "structured_message");
    verifyColumn(columns[4], "MapDict", "structured_message");
}
"y1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][4]["name"], "z1"); + }; + + check(columns[0], "DefaultSeparateFields", "variant_field_name"); + check(columns[1], "NoDefault", "Oneof2"); + + { + const auto& column = columns[2]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "SerializationProtobuf"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 3); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "x1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["name"], "y1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][2]["name"], "z1"); + } + { + const auto& column = columns[3]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "TopLevelOneof"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "oneof"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 1); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "MemberOfTopLevelOneof"); + } + } +} + +Y_UNIT_TEST_SUITE(Proto3) +{ + Y_UNIT_TEST(TWithOptional) + { + const auto format = TFormat::Protobuf<NTestingProto3::TWithOptional>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns[0]["name"], "x"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["proto_type"], "int64"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["field_number"], 1); + } + + Y_UNIT_TEST(TWithOptionalMessage) + { + const auto format = TFormat::Protobuf<NTestingProto3::TWithOptionalMessage>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns[0]["name"], "x"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["field_number"], 1); + + UNIT_ASSERT_VALUES_EQUAL(columns[0]["fields"].Size(), 1); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["fields"][0]["name"], "x"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["fields"][0]["proto_type"], "int64"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["fields"][0]["field_number"], 1); + } +} diff --git a/yt/cpp/mapreduce/interface/fwd.h b/yt/cpp/mapreduce/interface/fwd.h 
new file mode 100644 index 0000000000..0434c03d8b --- /dev/null +++ b/yt/cpp/mapreduce/interface/fwd.h @@ -0,0 +1,397 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/fwd.h +/// +/// Header containing mostly forward declarations of types. + + +#include <util/generic/fwd.h> +#include <util/system/types.h> + +#include <variant> + +/// @cond Doxygen_Suppress +namespace google::protobuf { + class Message; +} + +namespace NYT { + + //////////////////////////////////////////////////////////////////////////////// + // batch_request.h + //////////////////////////////////////////////////////////////////////////////// + + class IBatchRequest; + using TBatchRequestPtr = ::TIntrusivePtr<IBatchRequest>; + + //////////////////////////////////////////////////////////////////////////////// + // client.h + //////////////////////////////////////////////////////////////////////////////// + + enum ELockMode : int; + + struct TStartTransactionOptions; + + struct TLockOptions; + + template <class TDerived> + struct TTabletOptions; + + struct TMountTableOptions; + + struct TUnmountTableOptions; + + struct TRemountTableOptions; + + struct TReshardTableOptions; + + struct TAlterTableOptions; + + struct TLookupRowsOptions; + + struct TSelectRowsOptions; + + struct TCreateClientOptions; + + struct TAlterTableReplicaOptions; + + struct TGetFileFromCacheOptions; + + struct TPutFileToCacheOptions; + + struct TCheckPermissionResult; + struct TCheckPermissionResponse; + struct TCheckPermissionOptions; + + struct TTabletInfo; + + class ILock; + using ILockPtr = ::TIntrusivePtr<ILock>; + + class ITransaction; + using ITransactionPtr = ::TIntrusivePtr<ITransaction>; + + class ITransactionPinger; + using ITransactionPingerPtr = ::TIntrusivePtr<ITransactionPinger>; + + struct IOperation; + using IOperationPtr = ::TIntrusivePtr<IOperation>; + + class IClientBase; + + class IClient; + + using IClientPtr = ::TIntrusivePtr<IClient>; + using IClientBasePtr = ::TIntrusivePtr<IClientBase>; + + 
//////////////////////////////////////////////////////////////////////////////// + // config.h + //////////////////////////////////////////////////////////////////////////////// + + struct TConfig; + using TConfigPtr = ::TIntrusivePtr<TConfig>; + + //////////////////////////////////////////////////////////////////////////////// + // cypress.h + //////////////////////////////////////////////////////////////////////////////// + + enum ENodeType : int; + + struct TCreateOptions; + + struct TRemoveOptions; + + struct TGetOptions; + + struct TSetOptions; + + struct TMultisetAttributesOptions; + + struct TListOptions; + + struct TCopyOptions; + + struct TMoveOptions; + + struct TLinkOptions; + + struct TConcatenateOptions; + + struct TInsertRowsOptions; + + struct TDeleteRowsOptions; + + struct TTrimRowsOptions; + + class ICypressClient; + + //////////////////////////////////////////////////////////////////////////////// + // errors.h + //////////////////////////////////////////////////////////////////////////////// + + class TApiUsageError; + + class TYtError; + + class TErrorResponse; + + struct TFailedJobInfo; + + class TOperationFailedError; + + //////////////////////////////////////////////////////////////////////////////// + // node.h + //////////////////////////////////////////////////////////////////////////////// + + class TNode; + + //////////////////////////////////////////////////////////////////////////////// + // common.h + //////////////////////////////////////////////////////////////////////////////// + + using TTransactionId = TGUID; + using TNodeId = TGUID; + using TLockId = TGUID; + using TOperationId = TGUID; + using TTabletCellId = TGUID; + using TReplicaId = TGUID; + using TJobId = TGUID; + + using TYPath = TString; + using TLocalFilePath = TString; + + template <class T, class TDerived = void> + struct TOneOrMany; + + // key column values + using TKey = TOneOrMany<TNode>; + + class TSortColumn; + + // column names + using TColumnNames = 
TOneOrMany<TString>; + + // key column descriptors. + class TSortColumns; + + enum EValueType : int; + + enum ESortOrder : int; + + enum EOptimizeForAttr : i8; + + enum EErasureCodecAttr : i8; + + enum ESchemaModificationAttr : i8; + + enum class EMasterReadKind : int; + + class TColumnSchema; + + class TTableSchema; + + enum class ERelation; + + struct TKeyBound; + + struct TReadLimit; + + struct TReadRange; + + struct TRichYPath; + + struct TAttributeFilter; + + //////////////////////////////////////////////////////////////////////////////// + // io.h + //////////////////////////////////////////////////////////////////////////////// + + enum class EFormatType : int; + + struct TFormat; + + class IFileReader; + + using IFileReaderPtr = ::TIntrusivePtr<IFileReader>; + + class IFileWriter; + + using IFileWriterPtr = ::TIntrusivePtr<IFileWriter>; + + class IBlobTableReader; + using IBlobTableReaderPtr = ::TIntrusivePtr<IBlobTableReader>; + + class TRawTableReader; + + using TRawTableReaderPtr = ::TIntrusivePtr<TRawTableReader>; + + class TRawTableWriter; + + using TRawTableWriterPtr = ::TIntrusivePtr<TRawTableWriter>; + + template <class T, class = void> + class TTableReader; + + template <class T, class = void> + class TTableRangesReader; + + template <typename T> + using TTableRangesReaderPtr = ::TIntrusivePtr<TTableRangesReader<T>>; + + template <class T> + using TTableReaderPtr = ::TIntrusivePtr<TTableReader<T>>; + + template <class T, class = void> + class TTableWriter; + + template <class T> + using TTableWriterPtr = ::TIntrusivePtr<TTableWriter<T>>; + + struct TYaMRRow; + + using ::google::protobuf::Message; + + class ISkiffRowParser; + + using ISkiffRowParserPtr = ::TIntrusivePtr<ISkiffRowParser>; + + class ISkiffRowSkipper; + + using ISkiffRowSkipperPtr = ::TIntrusivePtr<ISkiffRowSkipper>; + + namespace NDetail { + + class TYdlGenericRowType; + + } // namespace NDetail + + template<class... TYdlRowTypes> + class TYdlOneOf; + + template<class... 
////////////////////////////////////////////////////////////////////////////////
// operation.h
////////////////////////////////////////////////////////////////////////////////

// Forward declarations only: the types below are merely announced here so that
// pointers and references to them can be used without pulling in their definitions.

class TFormatHints;

struct TUserJobSpec;

struct TMapOperationSpec;

struct TRawMapOperationSpec;

struct TReduceOperationSpec;

struct TMapReduceOperationSpec;

struct TJoinReduceOperationSpec;

struct TSortOperationSpec;

// NOTE(review): the doubled "II" prefix looks like a typo for
// `IOperationPreparationContext` -- confirm against operation.h before renaming,
// since this declaration would otherwise announce a class that is never defined.
class IIOperationPreparationContext;

class IJob;
using IJobPtr = ::TIntrusivePtr<IJob>;

class IRawJob;
using IRawJobPtr = ::TIntrusivePtr<IRawJob>;

enum EMergeMode : int;

struct TMergeOperationSpec;

struct TEraseOperationSpec;

template <class TR, class TW>
class IMapper;

template <class TR, class TW>
class IReducer;

template <class TR, class TW>
class IAggregatorReducer;

struct TSuspendOperationOptions;

struct TResumeOperationOptions;

enum class EOperationBriefState : int;

struct TOperationAttributes;

struct TOperationOptions;

enum class EOperationAttribute : int;

struct TOperationAttributeFilter;

struct TGetOperationOptions;

struct TListOperationsOptions;

struct TGetJobOptions;

struct TListJobsOptions;

struct IOperationClient;

enum class EFinishedJobState : int;

enum class EJobType : int;
enum class EJobState : int;
enum class ETaskName : int;
class TTaskName;

struct TJobBinaryDefault;

struct TJobBinaryLocalPath;

struct TJobBinaryCypressPath;

// A job binary configuration is exactly one of the three alternatives above.
using TJobBinaryConfig = std::variant<
    TJobBinaryDefault,
    TJobBinaryLocalPath,
    TJobBinaryCypressPath>;

struct TRetryConfig;
class IRetryConfigProvider;
using IRetryConfigProviderPtr = ::TIntrusivePtr<IRetryConfigProvider>;
///
/// @brief Performs basic initialization and switches to job mode if required.
///
/// This function performs basic initialization (it sets up logging, reads the config, etc.) and checks whether
/// the binary is launched on a YT machine inside a job. If the latter is true, this function launches the proper job
/// and calls exit() after the job is done.
///
/// This function must be called if the application starts any operation.
/// This function must be called immediately after entering the main() function, before any argument parsing is done.
///
/// @param argc Argument count, as received by main().
/// @param argv Argument vector, as received by main().
/// @param options Initialization options; default-constructed options are used when omitted.
void Initialize(int argc, const char **argv, const TInitializeOptions &options = TInitializeOptions());

/// Similar to @ref NYT::Initialize(int, const char**, const TInitializeOptions&),
/// but accepts a non-const `argv`.
void Initialize(int argc, char **argv, const TInitializeOptions &options = TInitializeOptions());

/// Similar to @ref NYT::Initialize(int, const char**, const TInitializeOptions&),
/// but takes no command line arguments.
void Initialize(const TInitializeOptions &options = TInitializeOptions());
// Specializations of TRowTraits: each one maps a row type to the row type exposed by
// readers (`TRowType`) and to the low-level reader / writer implementation interfaces.

/// Traits of YaMR rows.
template <>
struct TRowTraits<TYaMRRow>
{
    using TRowType = TYaMRRow;
    using IReaderImpl = IYaMRReaderImpl;
    using IWriterImpl = IYaMRWriterImpl;
};

/// Traits of the generic protobuf `Message` row.
template <>
struct TRowTraits<Message>
{
    using TRowType = Message;
    using IReaderImpl = IProtoReaderImpl;
    using IWriterImpl = IProtoWriterImpl;
};

/// Traits of any row type for which `TIsBaseOf<Message, T>` holds;
/// such rows use the same protobuf reader / writer implementations as `Message` itself.
template <class T>
struct TRowTraits<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>>
{
    using TRowType = T;
    using IReaderImpl = IProtoReaderImpl;
    using IWriterImpl = IProtoWriterImpl;
};

/// Traits of skiff rows (types for which `TIsSkiffRow<T>` holds).
/// Only the reader implementation is declared here; there is no `IWriterImpl`.
template <class T>
struct TRowTraits<T, std::enable_if_t<TIsSkiffRow<T>::value>>
{
    using TRowType = T;
    using IReaderImpl = ISkiffRowReaderImpl;
};

/// Traits of "one of several skiff row types".
/// As with single skiff rows, only the reader implementation is declared.
template <class... TSkiffRowTypes>
struct TRowTraits<TSkiffRowOneOf<TSkiffRowTypes...>>
{
    using TRowType = TSkiffRowOneOf<TSkiffRowTypes...>;
    using IReaderImpl = ISkiffRowReaderImpl;
};

/// Traits of "one of several protobuf row types".
template <class... TProtoRowTypes>
struct TRowTraits<TProtoOneOf<TProtoRowTypes...>>
{
    using TRowType = TProtoOneOf<TProtoRowTypes...>;
    using IReaderImpl = IProtoReaderImpl;
    using IWriterImpl = IProtoWriterImpl;
};
+ virtual TMaybe<size_t> GetReadByteCount() const; + virtual i64 GetTabletIndex() const; + virtual bool IsEndOfStream() const; + virtual bool IsRawReaderExhausted() const; +}; + +struct INodeReaderImpl + : public IReaderImplBase +{ + virtual const TNode& GetRow() const = 0; + virtual void MoveRow(TNode* row) = 0; +}; + +struct IYaMRReaderImpl + : public IReaderImplBase +{ + virtual const TYaMRRow& GetRow() const = 0; + virtual void MoveRow(TYaMRRow* row) + { + *row = GetRow(); + } +}; + +struct IProtoReaderImpl + : public IReaderImplBase +{ + virtual void ReadRow(Message* row) = 0; +}; + +struct ISkiffRowReaderImpl + : public IReaderImplBase +{ + virtual void ReadRow(const ISkiffRowParserPtr& parser) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +// We don't include <yt/cpp/mapreduce/interface/logging/yt_log.h> in this file +// to avoid macro name clashes (specifically YT_LOG_DEBUG) +void LogTableReaderStatistics(ui64 rowCount, TMaybe<size_t> byteCount); + +template <class T> +class TTableReaderBase + : public TThrRefBase +{ +public: + using TRowType = typename TRowTraits<T>::TRowType; + using IReaderImpl = typename TRowTraits<T>::IReaderImpl; + + explicit TTableReaderBase(::TIntrusivePtr<IReaderImpl> reader) + : Reader_(reader) + { } + + ~TTableReaderBase() override + { + NDetail::LogTableReaderStatistics(ReadRowCount_, Reader_->GetReadByteCount()); + } + + bool IsValid() const + { + return Reader_->IsValid(); + } + + void Next() + { + Reader_->Next(); + ++ReadRowCount_; + RowState_ = ERowState::None; + } + + bool IsEndOfStream() + { + return Reader_->IsEndOfStream(); + } + + bool IsRawReaderExhausted() + { + return Reader_->IsRawReaderExhausted(); + } + + ui32 GetTableIndex() const + { + return Reader_->GetTableIndex(); + } + + ui32 GetRangeIndex() const + { + return Reader_->GetRangeIndex(); + } + + ui64 
GetRowIndex() const + { + return Reader_->GetRowIndex(); + } + + i64 GetTabletIndex() const + { + return Reader_->GetTabletIndex(); + } + +protected: + template <typename TCacher, typename TCacheGetter> + const auto& DoGetRowCached(TCacher cacher, TCacheGetter cacheGetter) const + { + switch (RowState_) { + case ERowState::None: + cacher(); + RowState_ = ERowState::Cached; + break; + case ERowState::Cached: + break; + case ERowState::MovedOut: + ythrow yexception() << "Row is already moved"; + } + return *cacheGetter(); + } + + template <typename U, typename TMover, typename TCacheMover> + void DoMoveRowCached(U* result, TMover mover, TCacheMover cacheMover) + { + Y_VERIFY(result); + switch (RowState_) { + case ERowState::None: + mover(result); + break; + case ERowState::Cached: + cacheMover(result); + break; + case ERowState::MovedOut: + ythrow yexception() << "Row is already moved"; + } + RowState_ = ERowState::MovedOut; + } + +private: + enum class ERowState + { + None, + Cached, + MovedOut, + }; + +protected: + ::TIntrusivePtr<IReaderImpl> Reader_; + +private: + ui64 ReadRowCount_ = 0; + mutable ERowState RowState_ = ERowState::None; +}; + +template <class T> +class TSimpleTableReader + : public TTableReaderBase<T> +{ +public: + using TBase = TTableReaderBase<T>; + using typename TBase::TRowType; + + using TBase::TBase; + + const TRowType& GetRow() const + { + // Caching is implemented in underlying reader. + return TBase::DoGetRowCached( + /* cacher */ [&] {}, + /* cacheGetter */ [&] { + return &Reader_->GetRow(); + }); + } + + void MoveRow(TRowType* result) + { + // Caching is implemented in underlying reader. 
+ TBase::DoMoveRowCached( + result, + /* mover */ [&] (TRowType* result) { + Reader_->MoveRow(result); + }, + /* cacheMover */ [&] (TRowType* result) { + Reader_->MoveRow(result); + }); + } + + TRowType MoveRow() + { + TRowType result; + MoveRow(&result); + return result; + } + +private: + using TBase::Reader_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail + +template <> +class TTableReader<TNode> + : public NDetail::TSimpleTableReader<TNode> +{ + using TSimpleTableReader<TNode>::TSimpleTableReader; +}; + +template <> +class TTableReader<TYaMRRow> + : public NDetail::TSimpleTableReader<TYaMRRow> +{ + using TSimpleTableReader<TYaMRRow>::TSimpleTableReader; +}; + +template <> +class TTableReader<Message> + : public NDetail::TTableReaderBase<Message> +{ +public: + using TBase = NDetail::TTableReaderBase<Message>; + + using TBase::TBase; + + template <class U> + const U& GetRow() const + { + static_assert(TIsBaseOf<Message, U>::Value); + + return TBase::DoGetRowCached( + /* cacher */ [&] { + CachedRow_.Reset(new U); + Reader_->ReadRow(CachedRow_.Get()); + }, + /* cacheGetter */ [&] { + auto result = dynamic_cast<const U*>(CachedRow_.Get()); + Y_VERIFY(result); + return result; + }); + } + + template <class U> + void MoveRow(U* result) + { + static_assert(TIsBaseOf<Message, U>::Value); + + TBase::DoMoveRowCached( + result, + /* mover */ [&] (U* result) { + Reader_->ReadRow(result); + }, + /* cacheMover */ [&] (U* result) { + auto cast = dynamic_cast<U*>(CachedRow_.Get()); + Y_VERIFY(cast); + result->Swap(cast); + }); + } + + template <class U> + U MoveRow() + { + static_assert(TIsBaseOf<Message, U>::Value); + + U result; + MoveRow(&result); + return result; + } + + ::TIntrusivePtr<IProtoReaderImpl> GetReaderImpl() const + { + return Reader_; + } + +private: + using TBase::Reader_; + mutable THolder<Message> CachedRow_; +}; + +template<class... 
TProtoRowTypes> +class TTableReader<TProtoOneOf<TProtoRowTypes...>> + : public NDetail::TTableReaderBase<TProtoOneOf<TProtoRowTypes...>> +{ +public: + using TBase = NDetail::TTableReaderBase<TProtoOneOf<TProtoRowTypes...>>; + + using TBase::TBase; + + template <class U> + const U& GetRow() const + { + AssertIsOneOf<U>(); + return TBase::DoGetRowCached( + /* cacher */ [&] { + Reader_->ReadRow(&std::get<U>(CachedRows_)); + CachedIndex_ = NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value; + }, + /* cacheGetter */ [&] { + return &std::get<U>(CachedRows_); + }); + } + + template <class U> + void MoveRow(U* result) + { + AssertIsOneOf<U>(); + return TBase::DoMoveRowCached( + result, + /* mover */ [&] (U* result) { + Reader_->ReadRow(result); + }, + /* cacheMover */ [&] (U* result) { + Y_VERIFY((NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value) == CachedIndex_); + *result = std::move(std::get<U>(CachedRows_)); + }); + } + + template <class U> + U MoveRow() + { + U result; + MoveRow(&result); + return result; + } + + ::TIntrusivePtr<IProtoReaderImpl> GetReaderImpl() const + { + return Reader_; + } + +private: + using TBase::Reader_; + // std::variant could also be used here, but std::tuple leads to better performance + // because of deallocations that std::variant has to do + mutable std::tuple<TProtoRowTypes...> CachedRows_; + mutable int CachedIndex_; + + template <class U> + static constexpr void AssertIsOneOf() + { + static_assert( + (std::is_same<U, TProtoRowTypes>::value || ...), + "Template parameter must be one of TProtoOneOf template parameter"); + } +}; + +template <class T> +class TTableReader<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>> + : public TTableReader<TProtoOneOf<T>> +{ +public: + using TRowType = T; + using TBase = TTableReader<TProtoOneOf<T>>; + + using TBase::TBase; + + const T& GetRow() const + { + return TBase::template GetRow<T>(); + } + + void MoveRow(T* result) + { + TBase::template MoveRow<T>(result); + } + + T MoveRow() + { 
+ return TBase::template MoveRow<T>(); + } +}; + +template<class... TSkiffRowTypes> +class TTableReader<TSkiffRowOneOf<TSkiffRowTypes...>> + : public NDetail::TTableReaderBase<TSkiffRowOneOf<TSkiffRowTypes...>> +{ +public: + using TBase = NDetail::TTableReaderBase<TSkiffRowOneOf<TSkiffRowTypes...>>; + + using TBase::TBase; + + explicit TTableReader(::TIntrusivePtr<typename TBase::IReaderImpl> reader, const TMaybe<TSkiffRowHints>& hints) + : TBase(reader) + , Parsers_({(CreateSkiffParser<TSkiffRowTypes>(&std::get<TSkiffRowTypes>(CachedRows_), hints))...}) + { } + + template <class U> + const U& GetRow() const + { + AssertIsOneOf<U>(); + return TBase::DoGetRowCached( + /* cacher */ [&] { + auto index = NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value; + Reader_->ReadRow(Parsers_[index]); + CachedIndex_ = index; + }, + /* cacheGetter */ [&] { + return &std::get<U>(CachedRows_); + }); + } + + template <class U> + void MoveRow(U* result) + { + AssertIsOneOf<U>(); + return TBase::DoMoveRowCached( + result, + /* mover */ [&] (U* result) { + auto index = NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value; + Reader_->ReadRow(Parsers_[index]); + *result = std::move(std::get<U>(CachedRows_)); + }, + /* cacheMover */ [&] (U* result) { + Y_VERIFY((NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value) == CachedIndex_); + *result = std::move(std::get<U>(CachedRows_)); + }); + } + + template <class U> + U MoveRow() + { + U result; + MoveRow(&result); + return result; + } + + ::TIntrusivePtr<ISkiffRowReaderImpl> GetReaderImpl() const + { + return Reader_; + } + +private: + using TBase::Reader_; + // std::variant could also be used here, but std::tuple leads to better performance + // because of deallocations that std::variant has to do + mutable std::tuple<TSkiffRowTypes...> CachedRows_; + mutable std::vector<ISkiffRowParserPtr> Parsers_; + mutable int CachedIndex_; + + template <class U> + static constexpr void AssertIsOneOf() + { + static_assert( + (std::is_same<U, 
TSkiffRowTypes>::value || ...), + "Template parameter must be one of TSkiffRowOneOf template parameter"); + } +}; + +template <class T> +class TTableReader<T, std::enable_if_t<TIsSkiffRow<T>::value>> + : public TTableReader<TSkiffRowOneOf<T>> +{ +public: + using TRowType = T; + using TBase = TTableReader<TSkiffRowOneOf<T>>; + + using TBase::TBase; + + const T& GetRow() + { + return TBase::template GetRow<T>(); + } + + void MoveRow(T* result) + { + TBase::template MoveRow<T>(result); + } + + T MoveRow() + { + return TBase::template MoveRow<T>(); + } +}; + +template <> +inline TTableReaderPtr<TNode> IIOClient::CreateTableReader<TNode>( + const TRichYPath& path, const TTableReaderOptions& options) +{ + return new TTableReader<TNode>(CreateNodeReader(path, options)); +} + +template <> +inline TTableReaderPtr<TYaMRRow> IIOClient::CreateTableReader<TYaMRRow>( + const TRichYPath& path, const TTableReaderOptions& options) +{ + return new TTableReader<TYaMRRow>(CreateYaMRReader(path, options)); +} + +template <class T, class = std::enable_if_t<TIsBaseOf<Message, T>::Value>> +struct TReaderCreator +{ + static TTableReaderPtr<T> Create(::TIntrusivePtr<IProtoReaderImpl> reader) + { + return new TTableReader<T>(reader); + } +}; + +template <class T> +inline TTableReaderPtr<T> IIOClient::CreateTableReader( + const TRichYPath& path, const TTableReaderOptions& options) +{ + if constexpr (TIsBaseOf<Message, T>::Value) { + TAutoPtr<T> prototype(new T); + return new TTableReader<T>(CreateProtoReader(path, options, prototype.Get())); + } else if constexpr (TIsSkiffRow<T>::value) { + const auto& hints = options.FormatHints_ ? 
options.FormatHints_->SkiffRowHints_ : Nothing(); + auto schema = GetSkiffSchema<T>(hints); + auto skipper = CreateSkiffSkipper<T>(hints); + return new TTableReader<T>(CreateSkiffRowReader(path, options, skipper, schema), hints); + } else { + static_assert(TDependentFalse<T>, "Unsupported type for table reader"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +TTableReaderPtr<T> CreateTableReader( + IInputStream* stream, + const TTableReaderOptions& options) +{ + return TReaderCreator<T>::Create(NDetail::CreateProtoReader(stream, options, T::descriptor())); +} + +template <class... Ts> +TTableReaderPtr<typename NDetail::TProtoOneOfUnique<Ts...>::TType> CreateProtoMultiTableReader( + IInputStream* stream, + const TTableReaderOptions& options) +{ + return new TTableReader<typename NDetail::TProtoOneOfUnique<Ts...>::TType>( + NDetail::CreateProtoReader(stream, options, {Ts::descriptor()...})); +} + +template <class T> +TTableReaderPtr<T> CreateProtoMultiTableReader( + IInputStream* stream, + int tableCount, + const TTableReaderOptions& options) +{ + static_assert(TIsBaseOf<::google::protobuf::Message, T>::Value); + TVector<const ::google::protobuf::Descriptor*> descriptors(tableCount, T::descriptor()); + return new TTableReader<T>(NDetail::CreateProtoReader(stream, options, std::move(descriptors))); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +class TTableRangesReader<T> + : public TThrRefBase +{ +public: + using TRowType = T; + +private: + using TReaderImpl = typename TRowTraits<TRowType>::IReaderImpl; + +public: + TTableRangesReader(::TIntrusivePtr<TReaderImpl> readerImpl) + : ReaderImpl_(readerImpl) + , Reader_(MakeIntrusive<TTableReader<TRowType>>(readerImpl)) + , IsValid_(Reader_->IsValid()) + { } + + TTableReader<T>& GetRange() + { + return *Reader_; + } + + bool IsValid() const + { + return IsValid_; + } + + void Next() + { + 
ReaderImpl_->NextKey(); + if ((IsValid_ = Reader_->IsValid())) { + Reader_->Next(); + } + } + +private: + ::TIntrusivePtr<TReaderImpl> ReaderImpl_; + ::TIntrusivePtr<TTableReader<TRowType>> Reader_; + bool IsValid_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +struct IWriterImplBase + : public TThrRefBase +{ + virtual void AddRow(const T& row, size_t tableIndex) = 0; + + virtual void AddRow(const T& row, size_t tableIndex, size_t /*rowWeight*/) + { + AddRow(row, tableIndex); + } + + virtual void AddRow(T&& row, size_t tableIndex) = 0; + + virtual void AddRow(T&& row, size_t tableIndex, size_t /*rowWeight*/) + { + AddRow(std::move(row), tableIndex); + } + + virtual void AddRowBatch(const TVector<T>& rowBatch, size_t tableIndex, size_t rowBatchWeight = 0) + { + for (const auto& row : rowBatch) { + AddRow(row, tableIndex, rowBatchWeight / rowBatch.size()); + } + } + + virtual void AddRowBatch(TVector<T>&& rowBatch, size_t tableIndex, size_t rowBatchWeight = 0) + { + auto rowBatchSize = rowBatch.size(); + for (auto&& row : std::move(rowBatch)) { + AddRow(std::move(row), tableIndex, rowBatchWeight / rowBatchSize); + } + } + + virtual size_t GetTableCount() const = 0; + virtual void FinishTable(size_t tableIndex) = 0; + virtual void Abort() + { } +}; + +struct INodeWriterImpl + : public IWriterImplBase<TNode> +{ +}; + +struct IYaMRWriterImpl + : public IWriterImplBase<TYaMRRow> +{ +}; + +struct IProtoWriterImpl + : public IWriterImplBase<Message> +{ +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +class TTableWriterBase + : public TThrRefBase +{ +public: + using TRowType = T; + using IWriterImpl = typename TRowTraits<T>::IWriterImpl; + + explicit TTableWriterBase(::TIntrusivePtr<IWriterImpl> writer) + : Writer_(writer) + , Locks_(MakeAtomicShared<TVector<TAdaptiveLock>>(writer->GetTableCount())) + { } + + ~TTableWriterBase() override + { + if 
(Locks_.RefCount() == 1) { + NDetail::FinishOrDie(this, "TTableWriterBase"); + } + } + + void Abort() + { + Writer_->Abort(); + } + + void AddRow(const T& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + DoAddRow<T>(row, tableIndex, rowWeight); + } + + void AddRow(T&& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + DoAddRow<T>(std::move(row), tableIndex, rowWeight); + } + + void AddRowBatch(const TVector<T>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + DoAddRowBatch<T>(rowBatch, tableIndex, rowBatchWeight); + } + + void AddRowBatch(TVector<T>&& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + DoAddRowBatch<T>(std::move(rowBatch), tableIndex, rowBatchWeight); + } + + void Finish() + { + for (size_t i = 0; i < Locks_->size(); ++i) { + auto guard = Guard((*Locks_)[i]); + Writer_->FinishTable(i); + } + } + +protected: + template <class U> + void DoAddRow(const U& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + if (tableIndex >= Locks_->size()) { + ythrow TIOException() << + "Table index " << tableIndex << + " is out of range [0, " << Locks_->size() << ")"; + } + + auto guard = Guard((*Locks_)[tableIndex]); + Writer_->AddRow(row, tableIndex, rowWeight); + } + + template <class U> + void DoAddRow(U&& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + if (tableIndex >= Locks_->size()) { + ythrow TIOException() << + "Table index " << tableIndex << + " is out of range [0, " << Locks_->size() << ")"; + } + + auto guard = Guard((*Locks_)[tableIndex]); + Writer_->AddRow(std::move(row), tableIndex, rowWeight); + } + + template <class U> + void DoAddRowBatch(const TVector<U>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + if (tableIndex >= Locks_->size()) { + ythrow TIOException() << + "Table index " << tableIndex << + " is out of range [0, " << Locks_->size() << ")"; + } + + auto guard = Guard((*Locks_)[tableIndex]); + Writer_->AddRowBatch(rowBatch, tableIndex, rowBatchWeight); + } + + template 
<class U> + void DoAddRowBatch(TVector<U>&& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + if (tableIndex >= Locks_->size()) { + ythrow TIOException() << + "Table index " << tableIndex << + " is out of range [0, " << Locks_->size() << ")"; + } + + auto guard = Guard((*Locks_)[tableIndex]); + Writer_->AddRowBatch(std::move(rowBatch), tableIndex, rowBatchWeight); + } + + ::TIntrusivePtr<IWriterImpl> GetWriterImpl() + { + return Writer_; + } + +private: + ::TIntrusivePtr<IWriterImpl> Writer_; + TAtomicSharedPtr<TVector<TAdaptiveLock>> Locks_; +}; + +template <> +class TTableWriter<TNode> + : public TTableWriterBase<TNode> +{ +public: + using TBase = TTableWriterBase<TNode>; + + explicit TTableWriter(::TIntrusivePtr<IWriterImpl> writer) + : TBase(writer) + { } +}; + +template <> +class TTableWriter<TYaMRRow> + : public TTableWriterBase<TYaMRRow> +{ +public: + using TBase = TTableWriterBase<TYaMRRow>; + + explicit TTableWriter(::TIntrusivePtr<IWriterImpl> writer) + : TBase(writer) + { } +}; + +template <> +class TTableWriter<Message> + : public TTableWriterBase<Message> +{ +public: + using TBase = TTableWriterBase<Message>; + + explicit TTableWriter(::TIntrusivePtr<IWriterImpl> writer) + : TBase(writer) + { } + + template <class U, std::enable_if_t<std::is_base_of<Message, U>::value>* = nullptr> + void AddRow(const U& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + TBase::AddRow(row, tableIndex, rowWeight); + } + + template <class U, std::enable_if_t<std::is_base_of<Message, U>::value>* = nullptr> + void AddRowBatch(const TVector<U>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + for (const auto& row : rowBatch) { + AddRow(row, tableIndex, rowBatchWeight / rowBatch.size()); + } + } +}; + +template <class T> +class TTableWriter<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>> + : public TTableWriter<Message> +{ +public: + using TRowType = T; + using TBase = TTableWriter<Message>; + + explicit 
TTableWriter(::TIntrusivePtr<IWriterImpl> writer) + : TBase(writer) + { } + + void AddRow(const T& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + TBase::AddRow<T>(row, tableIndex, rowWeight); + } + + void AddRowBatch(const TVector<T>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + TBase::AddRowBatch<T>(rowBatch, tableIndex, rowBatchWeight); + } +}; + +template <> +inline TTableWriterPtr<TNode> IIOClient::CreateTableWriter<TNode>( + const TRichYPath& path, const TTableWriterOptions& options) +{ + return new TTableWriter<TNode>(CreateNodeWriter(path, options)); +} + +template <> +inline TTableWriterPtr<TYaMRRow> IIOClient::CreateTableWriter<TYaMRRow>( + const TRichYPath& path, const TTableWriterOptions& options) +{ + return new TTableWriter<TYaMRRow>(CreateYaMRWriter(path, options)); +} + +template <class T> +inline TTableWriterPtr<T> IIOClient::CreateTableWriter( + const TRichYPath& path, const TTableWriterOptions& options) +{ + if constexpr (TIsBaseOf<Message, T>::Value) { + TAutoPtr<T> prototype(new T); + return new TTableWriter<T>(CreateProtoWriter(path, options, prototype.Get())); + } else { + static_assert(TDependentFalse<T>, "Unsupported type for table writer"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +TTableReaderPtr<T> CreateConcreteProtobufReader(TTableReader<Message>* reader) +{ + static_assert(std::is_base_of_v<Message, T>, "T must be a protobuf type (either Message or its descendant)"); + Y_ENSURE(reader, "reader must be non-null"); + return ::MakeIntrusive<TTableReader<T>>(reader->GetReaderImpl()); +} + +template <typename T> +TTableReaderPtr<T> CreateConcreteProtobufReader(const TTableReaderPtr<Message>& reader) +{ + Y_ENSURE(reader, "reader must be non-null"); + return CreateConcreteProtobufReader<T>(reader.Get()); +} + +template <typename T> +TTableReaderPtr<Message> CreateGenericProtobufReader(TTableReader<T>* reader) +{ + 
static_assert(std::is_base_of_v<Message, T>, "T must be a protobuf type (either Message or its descendant)"); + Y_ENSURE(reader, "reader must be non-null"); + return ::MakeIntrusive<TTableReader<Message>>(reader->GetReaderImpl()); +} + +template <typename T> +TTableReaderPtr<Message> CreateGenericProtobufReader(const TTableReaderPtr<T>& reader) +{ + Y_ENSURE(reader, "reader must be non-null"); + return CreateGenericProtobufReader(reader.Get()); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/io.cpp b/yt/cpp/mapreduce/interface/io.cpp new file mode 100644 index 0000000000..f97629721a --- /dev/null +++ b/yt/cpp/mapreduce/interface/io.cpp @@ -0,0 +1,47 @@ +#include "io.h" + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <util/string/cast.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +TMaybe<size_t> IReaderImplBase::GetReadByteCount() const +{ + return Nothing(); +} + +i64 IReaderImplBase::GetTabletIndex() const +{ + Y_FAIL("Unimplemented"); +} + +bool IReaderImplBase::IsEndOfStream() const +{ + Y_FAIL("Unimplemented"); +} + +bool IReaderImplBase::IsRawReaderExhausted() const +{ + Y_FAIL("Unimplemented"); +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +void LogTableReaderStatistics(ui64 rowCount, TMaybe<size_t> byteCount) +{ + TString byteCountStr = (byteCount ? 
::ToString(*byteCount) : "<unknown>"); + YT_LOG_DEBUG("Table reader has read %v rows, %v bytes", + rowCount, + byteCountStr); +} + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/io.h b/yt/cpp/mapreduce/interface/io.h new file mode 100644 index 0000000000..e2b20a1802 --- /dev/null +++ b/yt/cpp/mapreduce/interface/io.h @@ -0,0 +1,586 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/io.h +/// +/// Header containing client interface for reading and writing tables and files. + + +#include "fwd.h" + +#include "client_method_options.h" +#include "common.h" +#include "format.h" +#include "node.h" +#include "mpl.h" +#include "skiff_row.h" + +#include <google/protobuf/message.h> + +#include <util/stream/input.h> +#include <util/stream/output.h> +#include <util/generic/yexception.h> +#include <util/generic/maybe.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief "Marker" type to use for several protobuf types in @ref NYT::TTableReader. +/// +/// @tparam Ts Possible types of rows to be read. +template<class... TProtoRowTypes> +class TProtoOneOf +{ +public: + static_assert( + (TIsBaseOf<::google::protobuf::Message, TProtoRowTypes>::Value && ...), + "Template parameters can only be protobuf types"); + + TProtoOneOf() = delete; +}; + +/// +/// @brief "Marker" type to use for several skiff row types in @ref NYT::TTableReader. +/// +/// @tparam Ts Possible types of rows to be read. +template<class... 
TSkiffRowTypes> +class TSkiffRowOneOf +{ +public: + static_assert( + (TIsSkiffRow<TSkiffRowTypes>::value && ...), + "Template parameters can only be SkiffRow types"); + + TSkiffRowOneOf() = delete; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @cond Doxygen_Suppress +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +template <class TTuple> +struct TProtoOneOfFromTuple; + +template <class... Ts> +struct TProtoOneOfFromTuple<std::tuple<Ts...>> +{ + using TType = TProtoOneOf<Ts...>; +}; + +template <class... Ts> +struct TProtoOneOfUnique +{ + using TTuple = typename TUniqueTypes<std::tuple<>, std::tuple<Ts...>>::TType; + using TType = typename TProtoOneOfFromTuple<TTuple>::TType; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +/// @endcond + +//////////////////////////////////////////////////////////////////////////////// + +struct INodeReaderImpl; +struct IYaMRReaderImpl; +struct IProtoReaderImpl; +struct ISkiffRowReaderImpl; +struct INodeWriterImpl; +struct IYaMRWriterImpl; +struct IProtoWriterImpl; + +//////////////////////////////////////////////////////////////////////////////// + +/// Class of exceptions connected to reading or writing tables or files. +class TIOException + : public yexception +{ }; + +/////////////////////////////////////////////////////////////////////////////// + +/// Interface representing YT file reader. +class IFileReader + : public TThrRefBase + , public IInputStream +{ }; + +/// Interface representing YT file writer. +class IFileWriter + : public TThrRefBase + , public IOutputStream +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +/// Low-level interface to read YT table with retries. 
+class TRawTableReader + : public TThrRefBase + , public IInputStream +{ +public: + /// @brief Retry table read starting from the specified `rangeIndex` and `rowIndex`. + /// + /// @param rangeIndex Index of first range to read + /// @param rowIndex Index of first row to read; if `rowIndex == Nothing` entire request will be retried. + /// + /// @return `true` on successful request retry, `false` if no retry attempts are left (then `Retry()` shouldn't be called any more). + /// + /// `rowIndex` must be inside the range with index `rangeIndex` if the latter is specified. + /// + /// After successful retry the user should reset `rangeIndex` / `rowIndex` values and read new ones + /// from the stream. + virtual bool Retry( + const TMaybe<ui32>& rangeIndex, + const TMaybe<ui64>& rowIndex) = 0; + + /// Resets retry attempt count to the initial value (then `Retry()` can be called again). + virtual void ResetRetries() = 0; + + /// @brief May the input stream contain table ranges? + /// + /// In the case when it is `true` the `TRawTableReader` user is responsible + /// to track active range index in order to pass it to Retry(). + virtual bool HasRangeIndices() const = 0; +}; + +/// @brief Low-level interface to write YT table. +/// +/// Retries must be handled by implementation. +class TRawTableWriter + : public TThrRefBase + , public IOutputStream +{ +public: + /// @brief Call this method after complete row representation is written to the stream. + /// + /// When this method is called `TRowTableWriter` can check its buffer + /// and if it is full send data to YT. + /// @note `TRawTableWriter` never sends partial records to YT (due to retries). + virtual void NotifyRowEnd() = 0; + + /// @brief Try to abort writing process as soon as possible (makes sense for multi-threaded writers). + /// + /// By default it does nothing, but implementations are welcome to override this method. 
+ virtual void Abort() + { } +}; + +/// @brief Interface to deal with multiple raw output streams. +class IProxyOutput +{ +public: + virtual ~IProxyOutput() + { } + + /// Get amount of managed streams. + virtual size_t GetStreamCount() const = 0; + + /// Get stream corresponding to the specified table index. + virtual IOutputStream* GetStream(size_t tableIndex) const = 0; + + /// This handler must be called right after the next row has been written. + virtual void OnRowFinished(size_t tableIndex) = 0; + + /// @brief Try to abort writing process as soon as possible (makes sense for multi-threaded writers). + /// + /// By default it does nothing, but implementations are welcome to override this method. + virtual void Abort() + { } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Class template to read typed rows from YT tables. +/// +/// @tparam T Row type. +/// +/// Correct usage of this class usually looks like +/// ``` +/// for (const auto& cursor : *reader) { +/// const auto& row = cursor.GetRow(); +/// ... +/// } +/// ``` +/// or, more verbosely, +/// ``` +/// for (; reader->IsValid(); reader->Next()) { +/// const auto& row = reader->GetRow(); +/// ... +/// } +/// ``` +/// +/// @note Actual (partial) specializations of this template may look a bit different, +/// e.g. @ref NYT::TTableReader::GetRow, @ref NYT::TTableReader::MoveRow may be method templates. +template <class T, class> +class TTableReader + : public TThrRefBase +{ +public: + /// Get current row. + const T& GetRow() const; + + /// Extract current row; further calls to `GetRow` and `MoveRow` will fail. + T MoveRow(); + + /// Extract current row to `result`; further calls to `GetRow` and `MoveRow` will fail. + void MoveRow(T* result); + + /// Check whether all the rows were read. + bool IsValid() const; + + /// Move the cursor to the next row. + void Next(); + + /// Get table index of the current row. 
+ ui32 GetTableIndex() const; + + /// Get range index of the current row (zero if it is unknown or read request contains no ranges) + ui32 GetRangeIndex() const; + + /// Get current row index (zero if it unknown). + ui64 GetRowIndex() const; + + /// Get current tablet index (for ordered dynamic tables). + i64 GetTabletIndex() const; + + /// Returns `true` if job consumed all the input and `false` otherwise. + bool IsEndOfStream() const; + + /// Returns `true` if job raw input stream was closed and `false` otherwise. + bool IsRawReaderExhausted() const; +}; + +/// @brief Iterator for use in range-based-for. +/// +/// @note Idiomatic usage: +/// ``` +/// for (const auto& cursor : *reader) { +/// const auto& row = cursor.GetRow(); +/// ... +/// } +/// ``` +template <class T> +class TTableReaderIterator +{ +public: + /// Construct iterator from table reader (can be `nullptr`). + explicit TTableReaderIterator<T>(TTableReader<T>* reader) + { + if (reader && reader->IsValid()) { + Reader_ = reader; + } else { + Reader_ = nullptr; + } + } + + /// Equality operator. + bool operator==(const TTableReaderIterator& it) const + { + return Reader_ == it.Reader_; + } + + /// Inequality operator. + bool operator!=(const TTableReaderIterator& it) const + { + return Reader_ != it.Reader_; + } + + /// Dereference operator. + TTableReader<T>& operator*() + { + return *Reader_; + } + + /// Const dereference operator. + const TTableReader<T>& operator*() const + { + return *Reader_; + } + + /// Preincrement operator. + TTableReaderIterator& operator++() + { + Reader_->Next(); + if (!Reader_->IsValid()) { + Reader_ = nullptr; + } + return *this; + } + +private: + TTableReader<T>* Reader_; +}; + +/// @brief Function to facilitate range-based-for for @ref NYT::TTableReader. 
+/// +/// @see @ref NYT::TTableReaderIterator +template <class T> +TTableReaderIterator<T> begin(TTableReader<T>& reader) +{ + return TTableReaderIterator<T>(&reader); +} + +/// @brief Function to facilitate range-based-for for @ref NYT::TTableReader. +/// +/// @see @ref NYT::TTableReaderIterator +template <class T> +TTableReaderIterator<T> end(TTableReader<T>&) +{ + return TTableReaderIterator<T>(nullptr); +} + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Class to facilitate reading table rows sorted by key. +/// +/// Each reader returned from @ref NYT::TTableRangesReader::GetRange represents +/// a range of rows with the same key. +/// +/// @note Idiomatic usage: +/// ``` +/// for (; reader->IsValid(); reader->Next()) { +/// auto& rangeReader = reader->GetRange(); +/// ... +/// } +/// ``` +template <class T, class> +class TTableRangesReader + : public TThrRefBase +{ +public: + /// Get reader for rows with the same key. + TTableReader<T>& GetRange(); + + /// Check whether all rows are read. + bool IsValid() const; + + /// Move cursor to the next range. + void Next(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Class template to write typed rows to YT tables. +template <class T, class> +class TTableWriter + : public TThrRefBase +{ +public: + /// @brief Submit a row for writing. + /// + /// The row may (and very probably will) *not* be written immediately. + void AddRow(const T& row); + + /// Stop writing data as soon as possible (without flushing data, e.g. before aborting parent transaction). + void Finish(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Type representing YaMR table row. +/// +/// @deprecated +struct TYaMRRow +{ + /// Key column. + TStringBuf Key; + + /// Subkey column. + TStringBuf SubKey; + + /// Value column. 
+ TStringBuf Value; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Interface for creating table and file readers and writer. +class IIOClient +{ +public: + virtual ~IIOClient() = default; + + /// Create a reader for file at `path`. + virtual IFileReaderPtr CreateFileReader( + const TRichYPath& path, + const TFileReaderOptions& options = TFileReaderOptions()) = 0; + + /// Create a writer for file at `path`. + virtual IFileWriterPtr CreateFileWriter( + const TRichYPath& path, + const TFileWriterOptions& options = TFileWriterOptions()) = 0; + + /// Create a typed reader for table at `path`. + template <class T> + TTableReaderPtr<T> CreateTableReader( + const TRichYPath& path, + const TTableReaderOptions& options = TTableReaderOptions()); + + /// Create a typed writer for table at `path`. + template <class T> + TTableWriterPtr<T> CreateTableWriter( + const TRichYPath& path, + const TTableWriterOptions& options = TTableWriterOptions()); + + /// Create a writer to write protobuf messages with specified descriptor. + virtual TTableWriterPtr<::google::protobuf::Message> CreateTableWriter( + const TRichYPath& path, + const ::google::protobuf::Descriptor& descriptor, + const TTableWriterOptions& options = TTableWriterOptions()) = 0; + + /// Create a reader to read a table using specified format. + virtual TRawTableReaderPtr CreateRawReader( + const TRichYPath& path, + const TFormat& format, + const TTableReaderOptions& options = TTableReaderOptions()) = 0; + + /// Create a reader to write a table using specified format. + virtual TRawTableWriterPtr CreateRawWriter( + const TRichYPath& path, + const TFormat& format, + const TTableWriterOptions& options = TTableWriterOptions()) = 0; + + /// + /// @brief Create a reader for [blob table](https://docs.yandex-team.ru/docs/yt/description/storage/blobtables) at `path`. + /// + /// @param path Blob table path. + /// @param blobId Key identifying the blob. 
+ /// @param options Optional parameters + /// + /// Blob table is a table that stores a number of blobs. + /// Blobs are sliced into parts of the same size (maybe except of last part). + /// Those parts are stored in the separate rows. + /// + /// Blob table have constraints on its schema. + /// - There must be columns that identify blob (blob id columns). That columns might be of any type. + /// - There must be a column of `int64` type that identify part inside the blob (this column is called `part index`). + /// - There must be a column of `string` type that stores actual data (this column is called `data column`). + virtual IFileReaderPtr CreateBlobTableReader( + const TYPath& path, + const TKey& blobId, + const TBlobTableReaderOptions& options = TBlobTableReaderOptions()) = 0; + +private: + virtual ::TIntrusivePtr<INodeReaderImpl> CreateNodeReader( + const TRichYPath& path, const TTableReaderOptions& options) = 0; + + virtual ::TIntrusivePtr<IYaMRReaderImpl> CreateYaMRReader( + const TRichYPath& path, const TTableReaderOptions& options) = 0; + + virtual ::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader( + const TRichYPath& path, + const TTableReaderOptions& options, + const ::google::protobuf::Message* prototype) = 0; + + virtual ::TIntrusivePtr<ISkiffRowReaderImpl> CreateSkiffRowReader( + const TRichYPath& path, + const TTableReaderOptions& options, + const ISkiffRowSkipperPtr& skipper, + const NSkiff::TSkiffSchemaPtr& schema) = 0; + + virtual ::TIntrusivePtr<INodeWriterImpl> CreateNodeWriter( + const TRichYPath& path, const TTableWriterOptions& options) = 0; + + virtual ::TIntrusivePtr<IYaMRWriterImpl> CreateYaMRWriter( + const TRichYPath& path, const TTableWriterOptions& options) = 0; + + virtual ::TIntrusivePtr<IProtoWriterImpl> CreateProtoWriter( + const TRichYPath& path, + const TTableWriterOptions& options, + const ::google::protobuf::Message* prototype) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + 
+/// +/// @brief Create a protobuf table reader from a stream. +/// +/// @tparam T Protobuf message type to read (must be inherited from `Message`). +/// +/// @param stream Input stream in YT protobuf format. +template <typename T> +TTableReaderPtr<T> CreateTableReader( + IInputStream* stream, + const TTableReaderOptions& options = {}); + +/// +/// @brief Create a protobuf multi table reader from a stream. +/// +/// @tparam Ts Protobuf message types to read (must be inherited from `Message`). +/// +/// @param stream Input stream in YT protobuf format. +template <class... Ts> +TTableReaderPtr<typename NDetail::TProtoOneOfUnique<Ts...>::TType> CreateProtoMultiTableReader( + IInputStream* stream, + const TTableReaderOptions& options = {}); + +/// +/// @brief Create a homogenous protobuf multi table reader from a stream. +/// +/// @tparam T Protobuf message type to read (must be inherited from `Message`). +/// +/// @param stream Input stream in YT protobuf format. +/// @param tableCount Number of tables in input stream. +template <class T> +TTableReaderPtr<T> CreateProtoMultiTableReader( + IInputStream* stream, + int tableCount, + const TTableReaderOptions& options = {}); + +/// Create a @ref NYT::TNode table reader from a stream. +template <> +TTableReaderPtr<TNode> CreateTableReader<TNode>( + IInputStream* stream, const TTableReaderOptions& options); + +/// Create a @ref NYT::TYaMRRow table reader from a stream. +template <> +TTableReaderPtr<TYaMRRow> CreateTableReader<TYaMRRow>( + IInputStream* stream, const TTableReaderOptions& options); + +namespace NDetail { + +/// Create a protobuf table reader from a stream. +::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader( + IInputStream* stream, + const TTableReaderOptions& options, + const ::google::protobuf::Descriptor* descriptor); + + +/// Create a protobuf table reader from a stream that can contain table switches. 
+::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader( + IInputStream* stream, + const TTableReaderOptions& options, + TVector<const ::google::protobuf::Descriptor*> descriptors); + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +/// Convert generic protobuf table reader to a concrete one (for certain type `T`). +template <typename T> +TTableReaderPtr<T> CreateConcreteProtobufReader(TTableReader<Message>* reader); + +/// Convert generic protobuf table reader to a concrete one (for certain type `T`). +template <typename T> +TTableReaderPtr<T> CreateConcreteProtobufReader(const TTableReaderPtr<Message>& reader); + +/// Convert a concrete (for certain type `T`) protobuf table reader to a generic one. +template <typename T> +TTableReaderPtr<Message> CreateGenericProtobufReader(TTableReader<T>* reader); + +/// Convert a concrete (for certain type `T`) protobuf table reader to a generic one. +template <typename T> +TTableReaderPtr<Message> CreateGenericProtobufReader(const TTableReaderPtr<T>& reader); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT + +#define IO_INL_H_ +#include "io-inl.h" +#undef IO_INL_H_ diff --git a/yt/cpp/mapreduce/interface/job_counters.cpp b/yt/cpp/mapreduce/interface/job_counters.cpp new file mode 100644 index 0000000000..6d4a2a6fcb --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_counters.cpp @@ -0,0 +1,164 @@ +#include "job_counters.h" + +namespace NYT { + +//////////////////////////////////////////////////////////////////// + +namespace { + ui64 CountTotal(const TNode& data) + { + if (data.IsMap()) { + if (auto totalPtr = data.AsMap().FindPtr("total")) { + return data["total"].IntCast<ui64>(); + } else { + ui64 total = 0; + for (const auto& keyVal: data.AsMap()) { + total += CountTotal(keyVal.second); + } + return total; + } + } else { + return data.IntCast<ui64>(); + } + } + + TNode GetNode(const TNode& data, const 
TStringBuf& key) + { + if (auto resPtr = data.AsMap().FindPtr(key)) { + return *resPtr; + } + return TNode(); + } +} // namespace + +//////////////////////////////////////////////////////////////////// + +TJobCounter::TJobCounter(TNode data) + : Data_(std::move(data)) +{ + if (Data_.HasValue()) { + Total_ = CountTotal(Data_); + } +} + +TJobCounter::TJobCounter(ui64 total) + : Total_(total) +{ } + +ui64 TJobCounter::GetTotal() const +{ + return Total_; +} + +ui64 TJobCounter::GetValue(const TStringBuf key) const +{ + if (Data_.HasValue()) { + return CountTotal(Data_[key]); + } + return 0; +} + +//////////////////////////////////////////////////////////////////// + +TJobCounters::TJobCounters(const NYT::TNode& counters) + : Total_(0) +{ + if (!counters.IsMap()) { + ythrow yexception() << "TJobCounters must be initialized with Map type TNode"; + } + auto abortedNode = GetNode(counters, "aborted"); + if (abortedNode.HasValue()) { + Aborted_ = TJobCounter(GetNode(abortedNode, "total")); + AbortedScheduled_ = TJobCounter(GetNode(abortedNode, "scheduled")); + AbortedNonScheduled_ = TJobCounter(GetNode(abortedNode, "non_scheduled")); + } + auto completedNode = GetNode(counters, "completed"); + if (completedNode.HasValue()) { + Completed_ = TJobCounter(GetNode(completedNode, "total")); + CompletedNonInterrupted_ = TJobCounter(GetNode(completedNode, "non-interrupted")); + CompletedInterrupted_ = TJobCounter(GetNode(completedNode, "interrupted")); + } + Lost_ = TJobCounter(GetNode(counters, "lost")); + Invalidated_ = TJobCounter(GetNode(counters, "invalidated")); + Failed_ = TJobCounter(GetNode(counters, "failed")); + Running_ = TJobCounter(GetNode(counters, "running")); + Suspended_ = TJobCounter(GetNode(counters, "suspended")); + Pending_ = TJobCounter(GetNode(counters, "pending")); + Blocked_ = TJobCounter(GetNode(counters, "blocked")); + Total_ = CountTotal(counters); +} + + +const TJobCounter& TJobCounters::GetAborted() const +{ + return Aborted_; +} + +const 
TJobCounter& TJobCounters::GetAbortedScheduled() const +{ + return AbortedScheduled_; +} + +const TJobCounter& TJobCounters::GetAbortedNonScheduled() const +{ + return AbortedNonScheduled_; +} + +const TJobCounter& TJobCounters::GetCompleted() const +{ + return Completed_; +} + +const TJobCounter& TJobCounters::GetCompletedNonInterrupted() const +{ + return CompletedNonInterrupted_; +} + +const TJobCounter& TJobCounters::GetCompletedInterrupted() const +{ + return CompletedInterrupted_; +} + +const TJobCounter& TJobCounters::GetLost() const +{ + return Lost_; +} + +const TJobCounter& TJobCounters::GetInvalidated() const +{ + return Invalidated_; +} + +const TJobCounter& TJobCounters::GetFailed() const +{ + return Failed_; +} + +const TJobCounter& TJobCounters::GetRunning() const +{ + return Running_; +} + +const TJobCounter& TJobCounters::GetSuspended() const +{ + return Suspended_; +} + +const TJobCounter& TJobCounters::GetPending() const +{ + return Pending_; +} + +const TJobCounter& TJobCounters::GetBlocked() const +{ + return Blocked_; +} + +ui64 TJobCounters::GetTotal() const +{ + return Total_; +} + +//////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/job_counters.h b/yt/cpp/mapreduce/interface/job_counters.h new file mode 100644 index 0000000000..9257cc1ec1 --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_counters.h @@ -0,0 +1,74 @@ +#pragma once + +#include "fwd.h" + +#include <yt/cpp/mapreduce/interface/node.h> + +namespace NYT { + +class TJobCounter +{ +private: + TNode Data_; + ui64 Total_ = 0; + +public: + TJobCounter() = default; + + TJobCounter(TNode data); + TJobCounter(ui64 total); + + ui64 GetTotal() const; + + ui64 GetValue(const TStringBuf key) const; +}; + +/// Class representing a collection of job counters. +class TJobCounters +{ +public: + /// + /// Construct empty counter. + TJobCounters() = default; + + /// + /// Construct counter from counters node. 
+ TJobCounters(const NYT::TNode& counters); + + const TJobCounter& GetAborted() const; + const TJobCounter& GetAbortedScheduled() const; + const TJobCounter& GetAbortedNonScheduled() const; + const TJobCounter& GetCompleted() const; + const TJobCounter& GetCompletedNonInterrupted() const; + const TJobCounter& GetCompletedInterrupted() const; + const TJobCounter& GetLost() const; + const TJobCounter& GetInvalidated() const; + const TJobCounter& GetFailed() const; + const TJobCounter& GetRunning() const; + const TJobCounter& GetSuspended() const; + const TJobCounter& GetPending() const; + const TJobCounter& GetBlocked() const; + + ui64 GetTotal() const; + +private: + ui64 Total_ = 0; + + TJobCounter Aborted_; + TJobCounter AbortedScheduled_; + TJobCounter AbortedNonScheduled_; + TJobCounter Completed_; + TJobCounter CompletedNonInterrupted_; + TJobCounter CompletedInterrupted_; + TJobCounter Lost_; + TJobCounter Invalidated_; + TJobCounter Failed_; + TJobCounter Running_; + TJobCounter Suspended_; + TJobCounter Pending_; + TJobCounter Blocked_; +}; + +//////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/job_counters_ut.cpp b/yt/cpp/mapreduce/interface/job_counters_ut.cpp new file mode 100644 index 0000000000..56d3932b8f --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_counters_ut.cpp @@ -0,0 +1,103 @@ +#include <yt/cpp/mapreduce/interface/job_counters.h> +#include <yt/cpp/mapreduce/interface/operation.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NYT; + +Y_UNIT_TEST_SUITE(JobCounters) +{ + Y_UNIT_TEST(Full) + { + const TString input = R"""( + { + "completed" = { + "total" = 6; + "non-interrupted" = 1; + "interrupted" = { + "whatever_interrupted" = 2; + "whatever_else_interrupted" = 3; + }; + }; + "aborted" = { + "non_scheduled" = { + "whatever_non_scheduled" = 4; + "whatever_else_non_scheduled" = 5; + }; + 
"scheduled" = { + "whatever_scheduled" = 6; + "whatever_else_scheduled" = 7; + }; + "total" = 22; + }; + "lost" = 8; + "invalidated" = 9; + "failed" = 10; + "running" = 11; + "suspended" = 12; + "pending" = 13; + "blocked" = 14; + "total" = 105; + })"""; + + TJobCounters counters(NodeFromYsonString(input)); + + UNIT_ASSERT_VALUES_EQUAL(counters.GetTotal(), 105); + + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompleted().GetTotal(), 6); + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedNonInterrupted().GetTotal(), 1); + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetTotal(), 5); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAborted().GetTotal(), 22); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetTotal(), 9); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetTotal(), 13); + UNIT_ASSERT_VALUES_EQUAL(counters.GetLost().GetTotal(), 8); + UNIT_ASSERT_VALUES_EQUAL(counters.GetInvalidated().GetTotal(), 9); + UNIT_ASSERT_VALUES_EQUAL(counters.GetFailed().GetTotal(), 10); + UNIT_ASSERT_VALUES_EQUAL(counters.GetRunning().GetTotal(), 11); + UNIT_ASSERT_VALUES_EQUAL(counters.GetSuspended().GetTotal(), 12); + UNIT_ASSERT_VALUES_EQUAL(counters.GetPending().GetTotal(), 13); + UNIT_ASSERT_VALUES_EQUAL(counters.GetBlocked().GetTotal(), 14); + + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetValue("whatever_interrupted"), 2); + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetValue("whatever_else_interrupted"), 3); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetValue("whatever_non_scheduled"), 4); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetValue("whatever_else_non_scheduled"), 5); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetValue("whatever_scheduled"), 6); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetValue("whatever_else_scheduled"), 7); + + UNIT_ASSERT_EXCEPTION(counters.GetCompletedInterrupted().GetValue("Nothingness"), yexception); + } + + 
Y_UNIT_TEST(Empty) + { + const TString input = "{}"; + + TJobCounters counters(NodeFromYsonString(input)); + + UNIT_ASSERT_VALUES_EQUAL(counters.GetTotal(), 0); + + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompleted().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedNonInterrupted().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAborted().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetLost().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetInvalidated().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetFailed().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetRunning().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetSuspended().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetPending().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetBlocked().GetTotal(), 0); + } + + Y_UNIT_TEST(Broken) + { + UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode()), yexception, "TJobCounters"); + UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode(1)), yexception, "TJobCounters"); + UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode(1.0)), yexception, "TJobCounters"); + UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode("Whatever")), yexception, "TJobCounters"); + } +} diff --git a/yt/cpp/mapreduce/interface/job_statistics.cpp b/yt/cpp/mapreduce/interface/job_statistics.cpp new file mode 100644 index 0000000000..bd9791672d --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_statistics.cpp @@ -0,0 +1,361 @@ +#include "job_statistics.h" + +#include "operation.h" + +#include <library/cpp/yson/node/node.h> +#include <library/cpp/yson/node/serialize.h> + +#include <library/cpp/yson/writer.h> + +#include <util/datetime/base.h> +#include <util/generic/hash_set.h> +#include <util/generic/ptr.h> +#include 
<util/stream/file.h> +#include <util/string/cast.h> +#include <util/string/subst.h> +#include <util/system/file.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////// + +template <> +i64 ConvertJobStatisticsEntry(i64 value) +{ + return value; +} + +template <> +TDuration ConvertJobStatisticsEntry(i64 value) +{ + return TDuration::MilliSeconds(value); +} + +//////////////////////////////////////////////////////////////////// + +static TTaskName JobTypeToTaskName(EJobType jobType) +{ + switch (jobType) { + case EJobType::PartitionMap: + return ETaskName::PartitionMap0; + case EJobType::Partition: + return ETaskName::Partition0; + default: + return ToString(jobType); + } +} + +static TTaskName FixTaskName(TString taskName) +{ + if (taskName == "partition") { + return ETaskName::Partition0; + } else if (taskName == "partition_map") { + return ETaskName::PartitionMap0; + } + return taskName; +} + +//////////////////////////////////////////////////////////////////// + +class TJobStatistics::TData + : public TThrRefBase +{ +public: + using TTaskName2Data = THashMap<TString, TJobStatistics::TDataEntry>; + using TState2TaskName2Data = THashMap<EJobState, TTaskName2Data>; + using TName2State2TaskName2Data = THashMap<TString, TState2TaskName2Data>; + +public: + TName2State2TaskName2Data Name2State2TaskName2Data; + +public: + TData() = default; + + TData(const TNode& statisticsNode) + { + ParseNode(statisticsNode, TString(), &Name2State2TaskName2Data); + } + + static void Aggregate(TJobStatistics::TDataEntry* result, const TJobStatistics::TDataEntry& other) + { + result->Max = Max(result->Max, other.Max); + result->Min = Min(result->Min, other.Min); + result->Sum += other.Sum; + result->Count += other.Count; + } + + static void ParseNode(const TNode& node, TState2TaskName2Data* output) + { + auto getInt = [] (const TNode& theNode, TStringBuf key) { + const auto& nodeAsMap = theNode.AsMap(); + auto it = nodeAsMap.find(key); + if (it == 
nodeAsMap.end()) { + ythrow yexception() << "Key '" << key << "' is not found"; + } + const auto& valueNode = it->second; + if (!valueNode.IsInt64()) { + ythrow yexception() << "Key '" << key << "' is not of int64 type"; + } + return valueNode.AsInt64(); + }; + + for (const auto& [stateStr, taskName2DataNode] : node.AsMap()) { + EJobState state; + if (!TryFromString(stateStr, state)) { + continue; + } + for (const auto& [taskName, dataNode] : taskName2DataNode.AsMap()) { + auto fixedTaskName = FixTaskName(taskName); + auto& data = (*output)[state][fixedTaskName.Get()]; + data.Max = getInt(dataNode, "max"); + data.Min = getInt(dataNode, "min"); + data.Sum = getInt(dataNode, "sum"); + data.Count = getInt(dataNode, "count"); + } + } + } + + static void ParseNode(const TNode& node, const TString& curPath, TName2State2TaskName2Data* output) + { + Y_VERIFY(node.IsMap()); + + for (const auto& [key, value] : node.AsMap()) { + if (key == "$"sv) { + ParseNode(value, &(*output)[curPath]); + } else { + TString childPath = curPath; + if (!childPath.empty()) { + childPath.push_back('/'); + } + if (key.find_first_of('/') != key.npos) { + TString keyCopy(key); + SubstGlobal(keyCopy, "/", "\\/"); + childPath += keyCopy; + } else { + childPath += key; + } + ParseNode(value, childPath, output); + } + } + } +}; + +//////////////////////////////////////////////////////////////////// + +struct TJobStatistics::TFilter + : public TThrRefBase +{ + TVector<TTaskName> TaskNameFilter; + TVector<EJobState> JobStateFilter = {EJobState::Completed}; +}; + +//////////////////////////////////////////////////////////////////// + +const TString TJobStatistics::CustomStatisticsNamePrefix_ = "custom/"; + +TJobStatistics::TJobStatistics() + : Data_(::MakeIntrusive<TData>()) + , Filter_(::MakeIntrusive<TFilter>()) +{ } + + +TJobStatistics::TJobStatistics(const NYT::TNode& statisticsNode) + : Data_(::MakeIntrusive<TData>(statisticsNode)) + , Filter_(::MakeIntrusive<TFilter>()) +{ } + 
+TJobStatistics::TJobStatistics(::TIntrusivePtr<TData> data, ::TIntrusivePtr<TFilter> filter) + : Data_(data) + , Filter_(::MakeIntrusive<TFilter>(*filter)) +{ } + +TJobStatistics::TJobStatistics(const TJobStatistics& jobStatistics) = default; +TJobStatistics::TJobStatistics(TJobStatistics&&) = default; + +TJobStatistics& TJobStatistics::operator=(const TJobStatistics& jobStatistics) = default; +TJobStatistics& TJobStatistics::operator=(TJobStatistics&& jobStatistics) = default; + +TJobStatistics::~TJobStatistics() = default; + +TJobStatistics TJobStatistics::TaskName(TVector<TTaskName> taskNames) const +{ + auto newFilter = ::MakeIntrusive<TFilter>(*Filter_); + newFilter->TaskNameFilter = std::move(taskNames); + return TJobStatistics(Data_, std::move(newFilter)); +} + +TJobStatistics TJobStatistics::JobState(TVector<EJobState> jobStates) const +{ + auto newFilter = ::MakeIntrusive<TFilter>(*Filter_); + newFilter->JobStateFilter = std::move(jobStates); + return TJobStatistics(Data_, std::move(newFilter)); +} + +TJobStatistics TJobStatistics::JobType(TVector<EJobType> jobTypes) const +{ + TVector<TTaskName> taskNames; + for (auto jobType : jobTypes) { + taskNames.push_back(JobTypeToTaskName(jobType)); + } + return TaskName(std::move(taskNames)); +} + +bool TJobStatistics::HasStatistics(TStringBuf name) const +{ + return Data_->Name2State2TaskName2Data.contains(name); +} + +TJobStatisticsEntry<i64> TJobStatistics::GetStatistics(TStringBuf name) const +{ + return GetStatisticsAs<i64>(name); +} + +TVector<TString> TJobStatistics::GetStatisticsNames() const +{ + TVector<TString> result; + result.reserve(Data_->Name2State2TaskName2Data.size()); + for (const auto& entry : Data_->Name2State2TaskName2Data) { + result.push_back(entry.first); + } + return result; +} + +bool TJobStatistics::HasCustomStatistics(TStringBuf name) const +{ + return HasStatistics(CustomStatisticsNamePrefix_ + name); +} + +TJobStatisticsEntry<i64> TJobStatistics::GetCustomStatistics(TStringBuf name) 
const +{ + return GetCustomStatisticsAs<i64>(name); +} + +TVector<TString> TJobStatistics::GetCustomStatisticsNames() const +{ + TVector<TString> result; + for (const auto& entry : Data_->Name2State2TaskName2Data) { + if (entry.first.StartsWith(CustomStatisticsNamePrefix_)) { + result.push_back(entry.first.substr(CustomStatisticsNamePrefix_.size())); + } + } + return result; +} + +TMaybe<TJobStatistics::TDataEntry> TJobStatistics::GetStatisticsImpl(TStringBuf name) const +{ + auto name2State2TaskName2DataIt = Data_->Name2State2TaskName2Data.find(name); + Y_ENSURE( + name2State2TaskName2DataIt != Data_->Name2State2TaskName2Data.end(), + "Statistics '" << name << "' are missing"); + const auto& state2TaskName2Data = name2State2TaskName2DataIt->second; + + TMaybe<TDataEntry> result; + auto aggregate = [&] (const TDataEntry& data) { + if (result) { + TData::Aggregate(&result.GetRef(), data); + } else { + result = data; + } + }; + + auto aggregateTaskName2Data = [&] (const TData::TTaskName2Data& taskName2Data) { + if (Filter_->TaskNameFilter.empty()) { + for (const auto& [taskName, data] : taskName2Data) { + aggregate(data); + } + } else { + for (const auto& taskName : Filter_->TaskNameFilter) { + auto it = taskName2Data.find(taskName.Get()); + if (it == taskName2Data.end()) { + continue; + } + const auto& data = it->second; + aggregate(data); + } + } + }; + + if (Filter_->JobStateFilter.empty()) { + for (const auto& [state, taskName2Data] : state2TaskName2Data) { + aggregateTaskName2Data(taskName2Data); + } + } else { + for (auto state : Filter_->JobStateFilter) { + auto it = state2TaskName2Data.find(state); + if (it == state2TaskName2Data.end()) { + continue; + } + const auto& taskName2Data = it->second; + aggregateTaskName2Data(taskName2Data); + } + } + + return result; +} + +//////////////////////////////////////////////////////////////////// + +namespace { + +constexpr int USER_STATISTICS_FILE_DESCRIPTOR = 5; +constexpr char PATH_DELIMITER = '/'; +constexpr char 
ESCAPE = '\\'; + +IOutputStream* GetStatisticsStream() +{ + static TFile file = Duplicate(USER_STATISTICS_FILE_DESCRIPTOR); + static TFileOutput stream(file); + return &stream; +} + +template <typename T> +void WriteCustomStatisticsAny(TStringBuf path, const T& value) +{ + ::NYson::TYsonWriter writer(GetStatisticsStream(), NYson::EYsonFormat::Binary, ::NYson::EYsonType::ListFragment); + int depth = 0; + size_t begin = 0; + size_t end = 0; + TVector<TString> items; + while (end <= path.size()) { + if (end + 1 < path.size() && path[end] == ESCAPE && path[end + 1] == PATH_DELIMITER) { + end += 2; + continue; + } + if (end == path.size() || path[end] == PATH_DELIMITER) { + writer.OnBeginMap(); + items.emplace_back(path.data() + begin, end - begin); + SubstGlobal(items.back(), "\\/", "/"); + writer.OnKeyedItem(TStringBuf(items.back())); + ++depth; + begin = end + 1; + } + ++end; + } + Serialize(value, &writer); + while (depth > 0) { + writer.OnEndMap(); + --depth; + } +} + +} + +//////////////////////////////////////////////////////////////////// + +void WriteCustomStatistics(const TNode& statistics) +{ + ::NYson::TYsonWriter writer(GetStatisticsStream(), NYson::EYsonFormat::Binary, ::NYson::EYsonType::ListFragment); + Serialize(statistics, &writer); +} + +void WriteCustomStatistics(TStringBuf path, i64 value) +{ + WriteCustomStatisticsAny(path, value); +} + +void FlushCustomStatisticsStream() { + GetStatisticsStream()->Flush(); +} +//////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/job_statistics.h b/yt/cpp/mapreduce/interface/job_statistics.h new file mode 100644 index 0000000000..8af751604f --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_statistics.h @@ -0,0 +1,268 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/job_statistics.h +/// +/// Header containing classes and utility functions to work with +/// [job 
statistics](https://docs.yandex-team.ru/yt/problems/jobstatistics). + +#include "fwd.h" + +#include <library/cpp/yson/node/node.h> + +#include <util/system/defaults.h> +#include <util/generic/maybe.h> +#include <util/generic/ptr.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////// + +/// +/// @brief Convert i64 representation of statistics to other type. +/// +/// Library defines this template for types TDuration and i64. +/// Users may define it for their types. +/// +/// @see @ref NYT::TJobStatistics::GetStatisticsAs method. +template <typename T> +T ConvertJobStatisticsEntry(i64 value); + +//////////////////////////////////////////////////////////////////// + +/// Class representing a collection of job statistics. +class TJobStatistics +{ +public: + /// + /// Construct empty statistics. + TJobStatistics(); + + /// + /// Construct statistics from statistics node. + TJobStatistics(const NYT::TNode& statistics); + + TJobStatistics(const TJobStatistics& jobStatistics); + TJobStatistics(TJobStatistics&& jobStatistics); + + TJobStatistics& operator=(const TJobStatistics& jobStatistics); + TJobStatistics& operator=(TJobStatistics&& jobStatistics); + + ~TJobStatistics(); + + /// + /// @brief Filter statistics by task name. + /// + /// @param taskNames What task names to include (empty means all). + TJobStatistics TaskName(TVector<TTaskName> taskNames) const; + + /// + /// @brief Filter statistics by job state. + /// + /// @param filter What job states to include (empty means all). + /// + /// @note Default statistics include only (successfully) completed jobs. + TJobStatistics JobState(TVector<EJobState> filter) const; + + /// + /// @brief Filter statistics by job type. + /// + /// @param filter What job types to include (empty means all). + /// + /// @deprecated Use @ref TJobStatistics::TaskName instead. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/mr/jobs#obshaya-shema + TJobStatistics JobType(TVector<EJobType> filter) const; + + /// + /// @brief Check whether the given statistics exist. + /// + /// @param name Slash separated statistics name, e.g. "time/total" (as it appears in the web interface). + bool HasStatistics(TStringBuf name) const; + + /// + /// @brief Get statistics by name. + /// + /// @param name Slash separated statistics name, e.g. "time/total" (as it appears in the web interface). + /// + /// @note If the statistics are missing, an exception is thrown. If no fields remain because of + /// the filters, the returned value is empty (all fields are `Nothing`). + /// + /// @note We don't use `TMaybe<TJobStatisticsEntry>` here; + /// instead, @ref NYT::TJobStatisticsEntry methods return `TMaybe<i64>`, + /// so that the user can easily use `.GetOrElse`: + /// ``` + /// jobStatistics.GetStatistics("some/statistics/name").Max().GetOrElse(0); + /// ``` + TJobStatisticsEntry<i64> GetStatistics(TStringBuf name) const; + + /// + /// @brief Get statistics by name. + /// + /// @param name Slash separated statistics name, e.g. "time/total" (as it appears in the web interface). + /// + /// @note In order to use the `GetStatisticsAs` method, the @ref NYT::ConvertJobStatisticsEntry function must be defined + /// (the library defines it for `i64` and `TDuration`; users may define it for other types). + template <typename T> + TJobStatisticsEntry<T> GetStatisticsAs(TStringBuf name) const; + + /// + /// Get (slash separated) names of statistics. + TVector<TString> GetStatisticsNames() const; + + /// + /// @brief Check whether the given custom statistics exist. + /// + /// @param name Slash separated custom statistics name. + bool HasCustomStatistics(TStringBuf name) const; + + /// + /// @brief Get custom statistics (those the user can write in a job with @ref NYT::WriteCustomStatistics). + /// + /// @param name Slash separated custom statistics name. 
+ TJobStatisticsEntry<i64> GetCustomStatistics(TStringBuf name) const; + + /// + /// @brief Get custom statistics (those the user can write in job with @ref NYT::WriteCustomStatistics). + /// + /// @param name Slash separated custom statistics name. + template <typename T> + TJobStatisticsEntry<T> GetCustomStatisticsAs(TStringBuf name) const; + + /// + /// Get names of all custom statistics. + TVector<TString> GetCustomStatisticsNames() const; + +private: + class TData; + struct TFilter; + + struct TDataEntry { + i64 Max; + i64 Min; + i64 Sum; + i64 Count; + }; + + static const TString CustomStatisticsNamePrefix_; + +private: + TJobStatistics(::TIntrusivePtr<TData> data, ::TIntrusivePtr<TFilter> filter); + + TMaybe<TDataEntry> GetStatisticsImpl(TStringBuf name) const; + +private: + ::TIntrusivePtr<TData> Data_; + ::TIntrusivePtr<TFilter> Filter_; + +private: + template<typename T> + friend class TJobStatisticsEntry; +}; + +//////////////////////////////////////////////////////////////////// + +/// Class representing single statistic. +template <typename T> +class TJobStatisticsEntry +{ +public: + TJobStatisticsEntry(TMaybe<TJobStatistics::TDataEntry> data) + : Data_(std::move(data)) + { } + + /// Sum of the statistic over all jobs. + TMaybe<T> Sum() const + { + if (Data_) { + return ConvertJobStatisticsEntry<T>(Data_->Sum); + } + return Nothing(); + } + + /// @brief Average of the statistic over all jobs. + /// + /// @note Only jobs that emitted statistics are taken into account. + TMaybe<T> Avg() const + { + if (Data_ && Data_->Count) { + return ConvertJobStatisticsEntry<T>(Data_->Sum / Data_->Count); + } + return Nothing(); + } + + /// @brief Number of jobs that emitted this statistic. + TMaybe<T> Count() const + { + if (Data_) { + return ConvertJobStatisticsEntry<T>(Data_->Count); + } + return Nothing(); + } + + /// @brief Maximum value of the statistic over all jobs. 
+ TMaybe<T> Max() const + { + if (Data_) { + return ConvertJobStatisticsEntry<T>(Data_->Max); + } + return Nothing(); + } + + /// @brief Minimum value of the statistic over all jobs. + TMaybe<T> Min() const + { + if (Data_) { + return ConvertJobStatisticsEntry<T>(Data_->Min); + } + return Nothing(); + } + +private: + TMaybe<TJobStatistics::TDataEntry> Data_; + +private: + friend class TJobStatistics; +}; + +//////////////////////////////////////////////////////////////////// + +template <typename T> +TJobStatisticsEntry<T> TJobStatistics::GetStatisticsAs(TStringBuf name) const +{ + return TJobStatisticsEntry<T>(GetStatisticsImpl(name)); +} + +template <typename T> +TJobStatisticsEntry<T> TJobStatistics::GetCustomStatisticsAs(TStringBuf name) const +{ + return TJobStatisticsEntry<T>(GetStatisticsImpl(CustomStatisticsNamePrefix_ + name)); +} + +//////////////////////////////////////////////////////////////////// + +/// +/// @brief Write [custom statistics](https://yt.yandex-team.ru/docs/description/mr/jobs#user_stats). +/// +/// @param path Slash-separated path (length must not exceed 512 bytes). +/// @param value Value of the statistic. +/// +/// @note The function must be called from within a job. +/// The total number of statistics (with different paths) must not exceed 128. +void WriteCustomStatistics(TStringBuf path, i64 value); + +/// +/// @brief Write several [custom statistics](https://yt.yandex-team.ru/docs/description/mr/jobs#user_stats) at once. +/// +/// @param statistics A tree of map nodes with leaves of type `i64`. +/// +/// @note The call is equivalent to calling @ref NYT::WriteCustomStatistics(TStringBuf, i64) for every path in the given map. 
+void WriteCustomStatistics(const TNode& statistics); + +/// +/// @brief Flush [custom statistics stream](https://yt.yandex-team.ru/docs/description/mr/jobs#user_stats) +/// +void FlushCustomStatisticsStream(); +//////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/job_statistics_ut.cpp b/yt/cpp/mapreduce/interface/job_statistics_ut.cpp new file mode 100644 index 0000000000..0cf53d771a --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_statistics_ut.cpp @@ -0,0 +1,257 @@ +#include <yt/cpp/mapreduce/interface/job_statistics.h> +#include <yt/cpp/mapreduce/interface/operation.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NYT; + +Y_UNIT_TEST_SUITE(JobStatistics) +{ + Y_UNIT_TEST(Simple) + { + const TString input = R"""( + { + "data" = { + "output" = { + "0" = { + "uncompressed_data_size" = { + "$" = { + "completed" = { + "simple_sort" = { + "max" = 130; + "count" = 1; + "min" = 130; + "sum" = 130; + }; + "map" = { + "max" = 42; + "count" = 1; + "min" = 42; + "sum" = 42; + }; + }; + "aborted" = { + "simple_sort" = { + "max" = 24; + "count" = 1; + "min" = 24; + "sum" = 24; + }; + }; + }; + }; + }; + }; + }; + })"""; + + TJobStatistics stat(NodeFromYsonString(input)); + + UNIT_ASSERT(stat.HasStatistics("data/output/0/uncompressed_data_size")); + UNIT_ASSERT(!stat.HasStatistics("nonexistent-statistics")); + UNIT_ASSERT_EXCEPTION_CONTAINS(stat.GetStatistics("BLAH-BLAH"), yexception, "Statistics"); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatisticsNames(), TVector<TString>{"data/output/0/uncompressed_data_size"}); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Max(), 130); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Count(), 2); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Min(), 42); + 
UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Sum(), 172); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Avg(), 172 / 2); + + UNIT_ASSERT_VALUES_EQUAL(stat.JobState({EJobState::Aborted}).GetStatistics("data/output/0/uncompressed_data_size").Sum(), 24); + UNIT_ASSERT_VALUES_EQUAL(stat.JobType({EJobType::Map}).JobState({EJobState::Aborted}).GetStatistics("data/output/0/uncompressed_data_size").Sum(), TMaybe<i64>()); + } + + Y_UNIT_TEST(TestOtherTypes) + { + const TString input = R"""( + { + "time" = { + "exec" = { + "$" = { + "completed" = { + "map" = { + "max" = 2482468; + "count" = 38; + "min" = 578976; + "sum" = 47987270; + }; + }; + }; + }; + }; + })"""; + + TJobStatistics stat(NodeFromYsonString(input)); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatisticsAs<TDuration>("time/exec").Max(), TDuration::MilliSeconds(2482468)); + } + + Y_UNIT_TEST(Custom) + { + const TString input = R"""( + { + "custom" = { + "some" = { + "path" = { + "$" = { + "completed" = { + "map" = { + "max" = -1; + "count" = 1; + "min" = -1; + "sum" = -1; + }; + }; + }; + }; + }; + "another" = { + "path" = { + "$" = { + "completed" = { + "map" = { + "max" = 1001; + "count" = 2; + "min" = 1001; + "sum" = 2002; + }; + }; + }; + }; + }; + }; + })"""; + + TJobStatistics stat(NodeFromYsonString(input)); + + UNIT_ASSERT(stat.HasCustomStatistics("some/path")); + UNIT_ASSERT(!stat.HasCustomStatistics("nonexistent-statistics")); + UNIT_ASSERT_EXCEPTION_CONTAINS(stat.GetCustomStatistics("BLAH-BLAH"), yexception, "Statistics"); + + const auto names = stat.GetCustomStatisticsNames(); + const THashSet<TString> expected = {"some/path", "another/path"}; + UNIT_ASSERT_VALUES_EQUAL(THashSet<TString>(names.begin(), names.end()), expected); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetCustomStatistics("some/path").Max(), -1); + UNIT_ASSERT_VALUES_EQUAL(stat.GetCustomStatistics("another/path").Avg(), 1001); + } + + Y_UNIT_TEST(TaskNames) + { + const 
TString input = R"""( + { + "data" = { + "output" = { + "0" = { + "uncompressed_data_size" = { + "$" = { + "completed" = { + "partition_map" = { + "max" = 130; + "count" = 1; + "min" = 130; + "sum" = 130; + }; + "partition(0)" = { + "max" = 42; + "count" = 1; + "min" = 42; + "sum" = 42; + }; + }; + "aborted" = { + "simple_sort" = { + "max" = 24; + "count" = 1; + "min" = 24; + "sum" = 24; + }; + }; + }; + }; + }; + }; + }; + })"""; + + TJobStatistics stat(NodeFromYsonString(input)); + + UNIT_ASSERT(stat.HasStatistics("data/output/0/uncompressed_data_size")); + UNIT_ASSERT(!stat.HasStatistics("nonexistent-statistics")); + UNIT_ASSERT_EXCEPTION_CONTAINS(stat.GetStatistics("BLAH-BLAH"), yexception, "Statistics"); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatisticsNames(), TVector<TString>{"data/output/0/uncompressed_data_size"}); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Max(), 130); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Count(), 2); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Min(), 42); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Sum(), 172); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Avg(), 172 / 2); + + UNIT_ASSERT_VALUES_EQUAL( + stat + .JobState({EJobState::Aborted}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 24); + UNIT_ASSERT_VALUES_EQUAL( + stat + .JobType({EJobType::Partition}) + .JobState({EJobState::Aborted}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + TMaybe<i64>()); + UNIT_ASSERT_VALUES_EQUAL( + stat + .TaskName({"partition(0)"}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 42); + UNIT_ASSERT_VALUES_EQUAL( + stat + .TaskName({"partition"}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + TMaybe<i64>()); + UNIT_ASSERT_VALUES_EQUAL( + stat + 
.TaskName({"partition_map(0)"}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 130); + UNIT_ASSERT_VALUES_EQUAL( + stat + .JobType({EJobType::Partition}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 42); + UNIT_ASSERT_VALUES_EQUAL( + stat + .JobType({EJobType::PartitionMap}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 130); + UNIT_ASSERT_VALUES_EQUAL( + stat + .TaskName({ETaskName::Partition0}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 42); + UNIT_ASSERT_VALUES_EQUAL( + stat + .TaskName({ETaskName::Partition1}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + TMaybe<i64>()); + UNIT_ASSERT_VALUES_EQUAL( + stat + .TaskName({ETaskName::PartitionMap0}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 130); + } +} diff --git a/yt/cpp/mapreduce/interface/logging/logger.cpp b/yt/cpp/mapreduce/interface/logging/logger.cpp new file mode 100644 index 0000000000..bfa56b94f6 --- /dev/null +++ b/yt/cpp/mapreduce/interface/logging/logger.cpp @@ -0,0 +1,188 @@ +#include "logger.h" + +#include <util/datetime/base.h> + +#include <util/stream/file.h> +#include <util/stream/format.h> +#include <util/stream/printf.h> +#include <util/stream/str.h> + +#include <util/system/mutex.h> +#include <util/system/rwlock.h> +#include <util/system/thread.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +static TStringBuf StripFileName(TStringBuf path) { + TStringBuf l, r; + if (path.TryRSplit('/', l, r) || path.TryRSplit('\\', l, r)) { + return r; + } else { + return path; + } +} + +static char GetLogLevelCode(ILogger::ELevel level) { + switch (level) { + case ILogger::FATAL: return 'F'; + case ILogger::ERROR: return 'E'; + case ILogger::INFO: return 'I'; + case ILogger::DEBUG: return 'D'; + } + Y_UNREACHABLE(); +} + +//////////////////////////////////////////////////////////////////////////////// + 
+class TNullLogger + : public ILogger +{ +public: + void Log(ELevel level, const TSourceLocation& sourceLocation, const char* format, va_list args) override + { + Y_UNUSED(level); + Y_UNUSED(sourceLocation); + Y_UNUSED(format); + Y_UNUSED(args); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TLoggerBase + : public ILogger +{ +public: + TLoggerBase(ELevel cutLevel) + : CutLevel_(cutLevel) + { } + + virtual void OutputLine(const TString& line) = 0; + + void Log(ELevel level, const TSourceLocation& sourceLocation, const char* format, va_list args) override + { + if (level > CutLevel_) { + return; + } + + TStringStream stream; + stream << TInstant::Now().ToStringLocal() + << " " << GetLogLevelCode(level) + << " [" << Hex(TThread::CurrentThreadId(), HF_FULL) << "] "; + Printf(stream, format, args); + stream << " - " << StripFileName(sourceLocation.File) << ':' << sourceLocation.Line << Endl; + + TGuard<TMutex> guard(Mutex_); + OutputLine(stream.Str()); + } + +private: + ELevel CutLevel_; + TMutex Mutex_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TStdErrLogger + : public TLoggerBase +{ +public: + TStdErrLogger(ELevel cutLevel) + : TLoggerBase(cutLevel) + { } + + void OutputLine(const TString& line) override + { + Cerr << line; + } +}; + +ILoggerPtr CreateStdErrLogger(ILogger::ELevel cutLevel) +{ + return new TStdErrLogger(cutLevel); +} + +//////////////////////////////////////////////////////////////////////////////// + +class TFileLogger + : public TLoggerBase +{ +public: + TFileLogger(ELevel cutLevel, const TString& path, bool append) + : TLoggerBase(cutLevel) + , Stream_(TFile(path, OpenAlways | WrOnly | Seq | (append ? 
ForAppend : EOpenMode()))) + { } + + void OutputLine(const TString& line) override + { + Stream_ << line; + } + +private: + TUnbufferedFileOutput Stream_; +}; + +ILoggerPtr CreateFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append) +{ + return new TFileLogger(cutLevel, path, append); +} +//////////////////////////////////////////////////////////////////////////////// + +class TBufferedFileLogger + : public TLoggerBase +{ +public: + TBufferedFileLogger(ELevel cutLevel, const TString& path, bool append) + : TLoggerBase(cutLevel) + , Stream_(TFile(path, OpenAlways | WrOnly | Seq | (append ? ForAppend : EOpenMode()))) + { } + + void OutputLine(const TString& line) override + { + Stream_ << line; + } + +private: + TFileOutput Stream_; +}; + +ILoggerPtr CreateBufferedFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append) +{ + return new TBufferedFileLogger(cutLevel, path, append); +} + +//////////////////////////////////////////////////////////////////////////////// + +static TRWMutex LoggerMutex; +static ILoggerPtr Logger; + +struct TLoggerInitializer +{ + TLoggerInitializer() + { + Logger = new TNullLogger; + } +} LoggerInitializer; + +void SetLogger(ILoggerPtr logger) +{ + auto guard = TWriteGuard(LoggerMutex); + if (logger) { + Logger = logger; + } else { + Logger = new TNullLogger; + } +} + +ILoggerPtr GetLogger() +{ + auto guard = TReadGuard(LoggerMutex); + return Logger; +} + +//////////////////////////////////////////////////////////////////////////////// + +} + diff --git a/yt/cpp/mapreduce/interface/logging/logger.h b/yt/cpp/mapreduce/interface/logging/logger.h new file mode 100644 index 0000000000..2b5aae87d1 --- /dev/null +++ b/yt/cpp/mapreduce/interface/logging/logger.h @@ -0,0 +1,43 @@ +#pragma once + +#include <util/generic/ptr.h> +#include <util/generic/string.h> +#include <util/system/compat.h> +#include <util/system/src_location.h> + +namespace NYT { + 
+//////////////////////////////////////////////////////////////////////////////// + +class ILogger + : public TThrRefBase +{ +public: + enum ELevel + { + FATAL /* "fatal", "FATAL" */, + // We don't have such level as `warning', but we support it for compatibility with other APIs. + ERROR /* "error", "warning", "ERROR", "WARNING" */, + INFO /* "info", "INFO" */, + DEBUG /* "debug", "DEBUG" */ + }; + + virtual void Log(ELevel level, const ::TSourceLocation& sourceLocation, const char* format, va_list args) = 0; +}; + +using ILoggerPtr = ::TIntrusivePtr<ILogger>; + +void SetLogger(ILoggerPtr logger); +ILoggerPtr GetLogger(); + +ILoggerPtr CreateStdErrLogger(ILogger::ELevel cutLevel); +ILoggerPtr CreateFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append = false); + +/** + * Create logger that writes to a file in a buffered manner. + * It should result in fewer system calls (useful if you expect a lot of log messages), + * but in case of a crash, you would lose some log messages that haven't been flushed yet. 
+ */ +ILoggerPtr CreateBufferedFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append = false); + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/logging/ya.make b/yt/cpp/mapreduce/interface/logging/ya.make new file mode 100644 index 0000000000..8095bfe4ba --- /dev/null +++ b/yt/cpp/mapreduce/interface/logging/ya.make @@ -0,0 +1,16 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + logger.cpp + yt_log.cpp +) + +PEERDIR( + library/cpp/yt/logging +) + +GENERATE_ENUM_SERIALIZATION(logger.h) + +END() diff --git a/yt/cpp/mapreduce/interface/logging/yt_log.cpp b/yt/cpp/mapreduce/interface/logging/yt_log.cpp new file mode 100644 index 0000000000..9fa7b91580 --- /dev/null +++ b/yt/cpp/mapreduce/interface/logging/yt_log.cpp @@ -0,0 +1,126 @@ +#include "yt_log.h" + +#include "logger.h" + +#include <util/generic/guid.h> + +#include <util/system/mutex.h> + +namespace NYT { + +using namespace NLogging; + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +class TLogManager + : public ILogManager +{ +public: + static constexpr TStringBuf CategoryName = "Wrapper"; + +public: + void RegisterStaticAnchor( + TLoggingAnchor* anchor, + ::TSourceLocation sourceLocation, + TStringBuf anchorMessage) override + { + if (anchor->Registered.exchange(true)) { + return; + } + + anchor->Enabled.store(true); + + auto guard = Guard(Mutex_); + anchor->SourceLocation = sourceLocation; + anchor->AnchorMessage = anchorMessage; + } + + void UpdateAnchor(TLoggingAnchor* /*position*/) override + { } + + void Enqueue(TLogEvent&& event) override + { + auto message = TString(event.MessageRef.ToStringBuf()); + LogMessage( + ToImplLevel(event.Level), + ::TSourceLocation(event.SourceFile, event.SourceLine), + "%.*s", + event.MessageRef.size(), + event.MessageRef.begin()); + } + + const TLoggingCategory* GetCategory(TStringBuf categoryName) override + { + Y_VERIFY(categoryName == CategoryName); + return &Category_; 
+ } + + void UpdateCategory(TLoggingCategory* /*category*/) override + { + Y_FAIL(); + } + + bool GetAbortOnAlert() const override + { + return false; + } + +private: + static ILogger::ELevel ToImplLevel(ELogLevel level) + { + switch (level) { + case ELogLevel::Minimum: + case ELogLevel::Trace: + case ELogLevel::Debug: + return ILogger::ELevel::DEBUG; + case ELogLevel::Info: + return ILogger::ELevel::INFO; + case ELogLevel::Warning: + case ELogLevel::Error: + return ILogger::ELevel::ERROR; + case ELogLevel::Alert: + case ELogLevel::Fatal: + case ELogLevel::Maximum: + return ILogger::ELevel::FATAL; + } + } + + static void LogMessage(ILogger::ELevel level, const ::TSourceLocation& sourceLocation, const char* format, ...) + { + va_list args; + va_start(args, format); + GetLogger()->Log(level, sourceLocation, format, args); + va_end(args); + } + +private: + ::TMutex Mutex_; + std::atomic<int> ActualVersion_{1}; + const TLoggingCategory Category_{ + .Name{CategoryName}, + .MinPlainTextLevel{ELogLevel::Minimum}, + .CurrentVersion{1}, + .ActualVersion = &ActualVersion_, + }; +}; + +TLogManager LogManager; + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +TLogger Logger(&LogManager, TLogManager::CategoryName); + +//////////////////////////////////////////////////////////////////////////////// + +void FormatValue(TStringBuilderBase* builder, const TGUID& value, TStringBuf /*format*/) +{ + builder->AppendString(GetGuidAsString(value)); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/logging/yt_log.h b/yt/cpp/mapreduce/interface/logging/yt_log.h new file mode 100644 index 0000000000..4cf93a6ba1 --- /dev/null +++ b/yt/cpp/mapreduce/interface/logging/yt_log.h @@ -0,0 +1,17 @@ +#pragma once + +#include <library/cpp/yt/logging/logger.h> + +struct TGUID; + +namespace NYT { + 
+//////////////////////////////////////////////////////////////////////////////// + +extern NLogging::TLogger Logger; + +void FormatValue(TStringBuilderBase* builder, const TGUID& value, TStringBuf format); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/mpl.h b/yt/cpp/mapreduce/interface/mpl.h new file mode 100644 index 0000000000..9865e28b6c --- /dev/null +++ b/yt/cpp/mapreduce/interface/mpl.h @@ -0,0 +1,73 @@ +#pragma once + +#include "fwd.h" + +#include <tuple> +#include <type_traits> + +namespace NYT { + +/// @cond Doxygen_Suppress + +//////////////////////////////////////////////////////////////////////////////// + +template <class TBase, class TDerived> +struct TIsBaseOf +{ + static constexpr bool Value = std::is_base_of_v<TBase, TDerived> && !std::is_same_v<TBase, TDerived>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +template <class T, class Tuple> +struct TIndexInTuple; + +template <class T, class... Types> +struct TIndexInTuple<T, std::tuple<T, Types...>> +{ + static constexpr int Value = 0; +}; + +template <class T> +struct TIndexInTuple<T, std::tuple<>> +{ + static constexpr int Value = 0; +}; + +template <class T, class U, class... Types> +struct TIndexInTuple<T, std::tuple<U, Types...>> +{ + static constexpr int Value = 1 + TIndexInTuple<T, std::tuple<Types...>>::Value; +}; + +template <class T, class TTuple> +constexpr bool DoesTupleContainType = (TIndexInTuple<T, TTuple>::Value < std::tuple_size<TTuple>{}); + +template <class TOut, class TIn = std::tuple<>> +struct TUniqueTypes; + +template <class... TOut, class TInCar, class... 
TInCdr> +struct TUniqueTypes<std::tuple<TOut...>, std::tuple<TInCar, TInCdr...>> +{ + using TType = std::conditional_t< + DoesTupleContainType<TInCar, std::tuple<TOut...>>, + typename TUniqueTypes<std::tuple<TOut...>, std::tuple<TInCdr...>>::TType, + typename TUniqueTypes<std::tuple<TOut..., TInCar>, std::tuple<TInCdr...>>::TType + >; +}; + +template <class TOut> +struct TUniqueTypes<TOut, std::tuple<>> +{ + using TType = TOut; +}; + +} // namespace NDetail + +/// @endcond Doxygen_Suppress + +//////////////////////////////////////////////////////////////////////////////// + +} diff --git a/yt/cpp/mapreduce/interface/node.h b/yt/cpp/mapreduce/interface/node.h new file mode 100644 index 0000000000..fece1b36de --- /dev/null +++ b/yt/cpp/mapreduce/interface/node.h @@ -0,0 +1,7 @@ +#pragma once + +// Backward compatibility +#include "fwd.h" +#include <library/cpp/yson/node/node.h> + + diff --git a/yt/cpp/mapreduce/interface/operation-inl.h b/yt/cpp/mapreduce/interface/operation-inl.h new file mode 100644 index 0000000000..8d53cd446f --- /dev/null +++ b/yt/cpp/mapreduce/interface/operation-inl.h @@ -0,0 +1,928 @@ +#pragma once + +#ifndef OPERATION_INL_H_ +#error "Direct inclusion of this file is not allowed, use operation.h" +#include "operation.h" +#endif +#undef OPERATION_INL_H_ + +#include "errors.h" + +#include <util/generic/bt_exception.h> +#include <util/generic/singleton.h> +#include <util/system/type_name.h> + +#include <util/stream/file.h> +#include <util/stream/buffer.h> +#include <util/string/subst.h> + +#include <typeindex> + +namespace NYT { + +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +template<class T> +void Assign(TVector<T>& array, size_t idx, const T& value) { + array.resize(std::max(array.size(), idx + 1)); + array[idx] = value; +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TRow> +TStructuredRowStreamDescription 
GetStructuredRowStreamDescription() +{ + if constexpr (std::is_same_v<TRow, NYT::TNode>) { + return TTNodeStructuredRowStream{}; + } else if constexpr (std::is_same_v<TRow, NYT::TYaMRRow>) { + return TTYaMRRowStructuredRowStream{}; + } else if constexpr (std::is_same_v<::google::protobuf::Message, TRow>) { + return TProtobufStructuredRowStream{nullptr}; + } else if constexpr (TIsBaseOf<::google::protobuf::Message, TRow>::Value) { + return TProtobufStructuredRowStream{TRow::descriptor()}; + } else if constexpr (TIsProtoOneOf<TRow>::value) { + return TProtobufStructuredRowStream{nullptr}; + } else { + static_assert(TDependentFalse<TRow>, "Unknown row type"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TRow> +TStructuredTablePath Structured(TRichYPath richYPath) +{ + return TStructuredTablePath(std::move(richYPath), StructuredTableDescription<TRow>()); +} + +template <typename TRow> +TTableStructure StructuredTableDescription() +{ + if constexpr (std::is_same_v<TRow, NYT::TNode>) { + return TUnspecifiedTableStructure{}; + } else if constexpr (std::is_same_v<TRow, NYT::TYaMRRow>) { + return TUnspecifiedTableStructure{}; + } else if constexpr (std::is_base_of_v<::google::protobuf::Message, TRow>) { + if constexpr (std::is_same_v<::google::protobuf::Message, TRow>) { + static_assert(TDependentFalse<TRow>, "Cannot use ::google::protobuf::Message as table descriptor"); + } else { + return TProtobufTableStructure{TRow::descriptor()}; + } + } else { + static_assert(TDependentFalse<TRow>, "Unknown row type"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TDerived> +TDerived& TRawOperationIoTableSpec<TDerived>::AddInput(const TRichYPath& path) +{ + Inputs_.push_back(path); + return static_cast<TDerived&>(*this); +} + +template 
<typename TDerived> +TDerived& TRawOperationIoTableSpec<TDerived>::SetInput(size_t tableIndex, const TRichYPath& path) +{ + NDetail::Assign(Inputs_, tableIndex, path); +} + +template <typename TDerived> +TDerived& TRawOperationIoTableSpec<TDerived>::AddOutput(const TRichYPath& path) +{ + Outputs_.push_back(path); + return static_cast<TDerived&>(*this); +} + +template <typename TDerived> +TDerived& TRawOperationIoTableSpec<TDerived>::SetOutput(size_t tableIndex, const TRichYPath& path) +{ + NDetail::Assign(Outputs_, tableIndex, path); +} + +template <typename TDerived> +const TVector<TRichYPath>& TRawOperationIoTableSpec<TDerived>::GetInputs() const +{ + return Inputs_; +} + +template <typename TDerived> +const TVector<TRichYPath>& TRawOperationIoTableSpec<TDerived>::GetOutputs() const +{ + return Outputs_; +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TDerived> +TDerived& TRawMapReduceOperationIoSpec<TDerived>::AddMapOutput(const TRichYPath& path) +{ + MapOutputs_.push_back(path); + return static_cast<TDerived&>(*this); +} + +template <typename TDerived> +TDerived& TRawMapReduceOperationIoSpec<TDerived>::SetMapOutput(size_t tableIndex, const TRichYPath& path) +{ + NDetail::Assign(MapOutputs_, tableIndex, path); +} + +template <typename TDerived> +const TVector<TRichYPath>& TRawMapReduceOperationIoSpec<TDerived>::GetMapOutputs() const +{ + return MapOutputs_; +} + +//////////////////////////////////////////////////////////////////////////////// + +::TIntrusivePtr<INodeReaderImpl> CreateJobNodeReader(TRawTableReaderPtr rawTableReader); +::TIntrusivePtr<IYaMRReaderImpl> CreateJobYaMRReader(TRawTableReaderPtr rawTableReader); +::TIntrusivePtr<IProtoReaderImpl> CreateJobProtoReader(TRawTableReaderPtr rawTableReader); + +::TIntrusivePtr<INodeWriterImpl> CreateJobNodeWriter(THolder<IProxyOutput> rawTableWriter); +::TIntrusivePtr<IYaMRWriterImpl> CreateJobYaMRWriter(THolder<IProxyOutput> rawTableWriter); 
+::TIntrusivePtr<IProtoWriterImpl> CreateJobProtoWriter(THolder<IProxyOutput> rawTableWriter); + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +inline ::TIntrusivePtr<typename TRowTraits<T>::IReaderImpl> CreateJobReaderImpl(TRawTableReaderPtr rawTableReader); + +template <> +inline ::TIntrusivePtr<INodeReaderImpl> CreateJobReaderImpl<TNode>(TRawTableReaderPtr rawTableReader) +{ + return CreateJobNodeReader(rawTableReader); +} + +template <> +inline ::TIntrusivePtr<IYaMRReaderImpl> CreateJobReaderImpl<TYaMRRow>(TRawTableReaderPtr rawTableReader) +{ + return CreateJobYaMRReader(rawTableReader); +} + +template <> +inline ::TIntrusivePtr<IProtoReaderImpl> CreateJobReaderImpl<Message>(TRawTableReaderPtr rawTableReader) +{ + return CreateJobProtoReader(rawTableReader); +} + +template <class T> +inline ::TIntrusivePtr<typename TRowTraits<T>::IReaderImpl> CreateJobReaderImpl(TRawTableReaderPtr rawTableReader) +{ + if constexpr (TIsBaseOf<Message, T>::Value || NDetail::TIsProtoOneOf<T>::value) { + return CreateJobProtoReader(rawTableReader); + } else { + static_assert(TDependentFalse<T>, "Unknown row type"); + } +} + +template <class T> +inline TTableReaderPtr<T> CreateJobReader(TRawTableReaderPtr rawTableReader) +{ + return new TTableReader<T>(CreateJobReaderImpl<T>(rawTableReader)); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +TTableWriterPtr<T> CreateJobWriter(THolder<IProxyOutput> rawJobWriter); + +template <> +inline TTableWriterPtr<TNode> CreateJobWriter<TNode>(THolder<IProxyOutput> rawJobWriter) +{ + return new TTableWriter<TNode>(CreateJobNodeWriter(std::move(rawJobWriter))); +} + +template <> +inline TTableWriterPtr<TYaMRRow> CreateJobWriter<TYaMRRow>(THolder<IProxyOutput> rawJobWriter) +{ + return new TTableWriter<TYaMRRow>(CreateJobYaMRWriter(std::move(rawJobWriter))); +} + +template <> +inline TTableWriterPtr<Message> 
CreateJobWriter<Message>(THolder<IProxyOutput> rawJobWriter) +{ + return new TTableWriter<Message>(CreateJobProtoWriter(std::move(rawJobWriter))); +} + +template <class T, class = void> +struct TProtoWriterCreator; + +template <class T> +struct TProtoWriterCreator<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>> +{ + static TTableWriterPtr<T> Create(::TIntrusivePtr<IProtoWriterImpl> writer) + { + return new TTableWriter<T>(writer); + } +}; + +template <class T> +inline TTableWriterPtr<T> CreateJobWriter(THolder<IProxyOutput> rawJobWriter) +{ + if constexpr (TIsBaseOf<Message, T>::Value) { + return TProtoWriterCreator<T>::Create(CreateJobProtoWriter(std::move(rawJobWriter))); + } else { + static_assert(TDependentFalse<T>, "Unknown row type"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +void TOperationInputSpecBase::AddInput(const TRichYPath& path) +{ + Inputs_.push_back(path); + StructuredInputs_.emplace_back(Structured<T>(path)); +} + +template <class T> +void TOperationInputSpecBase::SetInput(size_t tableIndex, const TRichYPath& path) +{ + NDetail::Assign(Inputs_, tableIndex, path); + NDetail::Assign(StructuredInputs_, tableIndex, Structured<T>(path)); +} + + +template <class T> +void TOperationOutputSpecBase::AddOutput(const TRichYPath& path) +{ + Outputs_.push_back(path); + StructuredOutputs_.emplace_back(Structured<T>(path)); +} + +template <class T> +void TOperationOutputSpecBase::SetOutput(size_t tableIndex, const TRichYPath& path) +{ + NDetail::Assign(Outputs_, tableIndex, path); + NDetail::Assign(StructuredOutputs_, tableIndex, Structured<T>(path)); +} + +template <class TDerived> +template <class T> +TDerived& TOperationIOSpec<TDerived>::AddInput(const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "input type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + TOperationInputSpecBase::AddInput<T>(path); + return 
*static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class T> +TDerived& TOperationIOSpec<TDerived>::SetInput(size_t tableIndex, const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "input type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + TOperationInputSpecBase::SetInput<T>(tableIndex, path); + return *static_cast<TDerived*>(this); +} + + +template <class TDerived> +template <class T> +TDerived& TOperationIOSpec<TDerived>::AddOutput(const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + TOperationOutputSpecBase::AddOutput<T>(path); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class T> +TDerived& TOperationIOSpec<TDerived>::SetOutput(size_t tableIndex, const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + TOperationOutputSpecBase::SetOutput<T>(tableIndex, path); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +TDerived& TOperationIOSpec<TDerived>::AddStructuredInput(TStructuredTablePath path) +{ + TOperationInputSpecBase::AddStructuredInput(std::move(path)); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +TDerived& TOperationIOSpec<TDerived>::AddStructuredOutput(TStructuredTablePath path) +{ + TOperationOutputSpecBase::AddStructuredOutput(std::move(path)); + return *static_cast<TDerived*>(this); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +TVanillaTask& TVanillaTask::AddOutput(const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + 
TOperationOutputSpecBase::AddOutput<T>(path); + return *this; +} + +template <class T> +TVanillaTask& TVanillaTask::SetOutput(size_t tableIndex, const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + TOperationOutputSpecBase::SetOutput<T>(tableIndex, path); + return *this; +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +void ResetUseClientProtobuf(const char* methodName); + +} // namespace NDetail + +template <class TDerived> +TDerived& TOperationIOSpec<TDerived>::AddProtobufInput_VerySlow_Deprecated(const TRichYPath& path) +{ + NDetail::ResetUseClientProtobuf("AddProtobufInput_VerySlow_Deprecated"); + Inputs_.push_back(path); + StructuredInputs_.emplace_back(TStructuredTablePath(path, TProtobufTableStructure{nullptr})); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +TDerived& TOperationIOSpec<TDerived>::AddProtobufOutput_VerySlow_Deprecated(const TRichYPath& path) +{ + NDetail::ResetUseClientProtobuf("AddProtobufOutput_VerySlow_Deprecated"); + Outputs_.push_back(path); + StructuredOutputs_.emplace_back(TStructuredTablePath(path, TProtobufTableStructure{nullptr})); + return *static_cast<TDerived*>(this); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TRow> +TJobOperationPreparer::TInputGroup& TJobOperationPreparer::TInputGroup::Description() +{ + for (auto i : Indices_) { + Preparer_.InputDescription<TRow>(i); + } + return *this; +} + +template <typename TRow> +TJobOperationPreparer::TOutputGroup& TJobOperationPreparer::TOutputGroup::Description(bool inferSchema) +{ + for (auto i : Indices_) { + Preparer_.OutputDescription<TRow>(i, inferSchema); + } + return *this; +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TCont> 
+TJobOperationPreparer::TInputGroup TJobOperationPreparer::BeginInputGroup(const TCont& indices) +{ + for (auto i : indices) { + ValidateInputTableIndex(i, TStringBuf("BeginInputGroup()")); + } + return TInputGroup(*this, TVector<int>(std::begin(indices), std::end(indices))); +} + +template <typename TCont> +TJobOperationPreparer::TOutputGroup TJobOperationPreparer::BeginOutputGroup(const TCont& indices) +{ + for (auto i : indices) { + ValidateOutputTableIndex(i, TStringBuf("BeginOutputGroup()")); + } + return TOutputGroup(*this, indices); +} + + +template <typename TRow> +TJobOperationPreparer& TJobOperationPreparer::InputDescription(int tableIndex) +{ + ValidateMissingInputDescription(tableIndex); + InputTableDescriptions_[tableIndex] = StructuredTableDescription<TRow>(); + return *this; +} + +template <typename TRow> +TJobOperationPreparer& TJobOperationPreparer::OutputDescription(int tableIndex, bool inferSchema) +{ + ValidateMissingOutputDescription(tableIndex); + OutputTableDescriptions_[tableIndex] = StructuredTableDescription<TRow>(); + if (inferSchema && !OutputSchemas_[tableIndex]) { + OutputSchemas_[tableIndex] = CreateTableSchema<TRow>(); + } + return *this; +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class TDerived> +template <class TRow> +TDerived& TIntermediateTablesHintSpec<TDerived>::HintMapOutput() +{ + IntermediateMapOutputDescription_ = StructuredTableDescription<TRow>(); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class TRow> +TDerived& TIntermediateTablesHintSpec<TDerived>::AddMapOutput(const TRichYPath& path) +{ + MapOutputs_.push_back(path); + StructuredMapOutputs_.emplace_back(Structured<TRow>(path)); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class TRow> +TDerived& TIntermediateTablesHintSpec<TDerived>::HintReduceCombinerInput() +{ + IntermediateReduceCombinerInputDescription_ = 
StructuredTableDescription<TRow>(); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class TRow> +TDerived& TIntermediateTablesHintSpec<TDerived>::HintReduceCombinerOutput() +{ + IntermediateReduceCombinerOutputDescription_ = StructuredTableDescription<TRow>(); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class TRow> +TDerived& TIntermediateTablesHintSpec<TDerived>::HintReduceInput() +{ + IntermediateReducerInputDescription_ = StructuredTableDescription<TRow>(); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +const TVector<TStructuredTablePath>& TIntermediateTablesHintSpec<TDerived>::GetStructuredMapOutputs() const +{ + return StructuredMapOutputs_; +} + +template <class TDerived> +const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateMapOutputDescription() const +{ + return IntermediateMapOutputDescription_; +} + +template <class TDerived> +const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateReduceCombinerInputDescription() const +{ + return IntermediateReduceCombinerInputDescription_; +} + +template <class TDerived> +const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateReduceCombinerOutputDescription() const +{ + return IntermediateReduceCombinerOutputDescription_; +} + +template <class TDerived> +const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateReducerInputDescription() const +{ + return IntermediateReducerInputDescription_; +} + +//////////////////////////////////////////////////////////////////////////////// + +struct TReducerContext +{ + bool Break = false; + static TReducerContext* Get() { return Singleton<TReducerContext>(); } +}; + +template <class TR, class TW> +inline void IReducer<TR, TW>::Break() +{ + TReducerContext::Get()->Break = true; +} + +template <typename TReader, typename TWriter> +void FeedJobInput( + IMapper<TReader, 
TWriter>* mapper, + typename TRowTraits<typename TReader::TRowType>::IReaderImpl* readerImpl, + TWriter* writer) +{ + using TInputRow = typename TReader::TRowType; + + auto reader = MakeIntrusive<TTableReader<TInputRow>>(readerImpl); + mapper->Do(reader.Get(), writer); +} + +template <typename TReader, typename TWriter> +void FeedJobInput( + IReducer<TReader, TWriter>* reducer, + typename TRowTraits<typename TReader::TRowType>::IReaderImpl* readerImpl, + TWriter* writer) +{ + using TInputRow = typename TReader::TRowType; + + auto rangesReader = MakeIntrusive<TTableRangesReader<TInputRow>>(readerImpl); + for (; rangesReader->IsValid(); rangesReader->Next()) { + reducer->Do(&rangesReader->GetRange(), writer); + if (TReducerContext::Get()->Break) { + break; + } + } +} + +template <typename TReader, typename TWriter> +void FeedJobInput( + IAggregatorReducer<TReader, TWriter>* reducer, + typename TRowTraits<typename TReader::TRowType>::IReaderImpl* readerImpl, + TWriter* writer) +{ + using TInputRow = typename TReader::TRowType; + + auto rangesReader = MakeIntrusive<TTableRangesReader<TInputRow>>(readerImpl); + reducer->Do(rangesReader.Get(), writer); +} + +template <class TRawJob> +int RunRawJob(size_t outputTableCount, IInputStream& jobStateStream) +{ + TRawJobContext context(outputTableCount); + + TRawJob job; + job.Load(jobStateStream); + job.Do(context); + return 0; +} + +template <> +inline int RunRawJob<TCommandRawJob>(size_t /* outputTableCount */, IInputStream& /* jobStateStream */) +{ + Y_FAIL(); +} + +template <class TVanillaJob> +int RunVanillaJob(size_t outputTableCount, IInputStream& jobStateStream) +{ + TVanillaJob job; + job.Load(jobStateStream); + + if constexpr (std::is_base_of<IVanillaJob<>, TVanillaJob>::value) { + Y_VERIFY(outputTableCount == 0, "Void vanilla job expects zero 'outputTableCount'"); + job.Do(); + } else { + Y_VERIFY(outputTableCount, "Vanilla job with table writer expects nonzero 'outputTableCount'"); + using TOutputRow = typename 
TVanillaJob::TWriter::TRowType; + + THolder<IProxyOutput> rawJobWriter; + if (auto customWriter = job.CreateCustomRawJobWriter(outputTableCount)) { + rawJobWriter = std::move(customWriter); + } else { + rawJobWriter = CreateRawJobWriter(outputTableCount); + } + auto writer = CreateJobWriter<TOutputRow>(std::move(rawJobWriter)); + + job.Start(writer.Get()); + job.Do(writer.Get()); + job.Finish(writer.Get()); + + writer->Finish(); + } + return 0; +} + +template <> +inline int RunVanillaJob<TCommandVanillaJob>(size_t /* outputTableCount */, IInputStream& /* jobStateStream */) +{ + Y_FAIL(); +} + +template <class TJob> + requires TIsBaseOf<IStructuredJob, TJob>::Value +int RunJob(size_t outputTableCount, IInputStream& jobStateStream) +{ + using TInputRow = typename TJob::TReader::TRowType; + using TOutputRow = typename TJob::TWriter::TRowType; + + auto job = MakeIntrusive<TJob>(); + job->Load(jobStateStream); + + TRawTableReaderPtr rawJobReader; + if (auto customReader = job->CreateCustomRawJobReader(/*fd*/ 0)) { + rawJobReader = customReader; + } else { + rawJobReader = CreateRawJobReader(/*fd*/ 0); + } + auto readerImpl = CreateJobReaderImpl<TInputRow>(rawJobReader); + + // Many users don't expect to have jobs with empty input so we skip such jobs. + if (!readerImpl->IsValid()) { + return 0; + } + + THolder<IProxyOutput> rawJobWriter; + if (auto customWriter = job->CreateCustomRawJobWriter(outputTableCount)) { + rawJobWriter = std::move(customWriter); + } else { + rawJobWriter = CreateRawJobWriter(outputTableCount); + } + auto writer = CreateJobWriter<TOutputRow>(std::move(rawJobWriter)); + + job->Start(writer.Get()); + FeedJobInput(job.Get(), readerImpl.Get(), writer.Get()); + job->Finish(writer.Get()); + + writer->Finish(); + + return 0; +} + +// +// We leave RunMapJob/RunReduceJob/RunAggregatorReducer for backward compatibility, +// some user use them already. 
:( + +template <class TMapper> +int RunMapJob(size_t outputTableCount, IInputStream& jobStateStream) +{ + return RunJob<TMapper>(outputTableCount, jobStateStream); +} + +template <class TReducer> +int RunReduceJob(size_t outputTableCount, IInputStream& jobStateStream) +{ + return RunJob<TReducer>(outputTableCount, jobStateStream); +} + +template <class TReducer> +int RunAggregatorReducer(size_t outputTableCount, IInputStream& jobStateStream) +{ + return RunJob<TReducer>(outputTableCount, jobStateStream); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T, typename = void> +struct TIsConstructibleFromNode + : std::false_type +{ }; + +template <typename T> +struct TIsConstructibleFromNode<T, std::void_t<decltype(T::FromNode(std::declval<TNode&>()))>> + : std::true_type +{ }; + +template <class TJob> +::TIntrusivePtr<NYT::IStructuredJob> ConstructJobFromNode(const TNode& node) +{ + if constexpr (TIsConstructibleFromNode<TJob>::value) { + Y_ENSURE(node.GetType() != TNode::Undefined, + "job has FromNode method but constructor arguments were not provided"); + return TJob::FromNode(node); + } else { + Y_ENSURE(node.GetType() == TNode::Undefined, + "constructor arguments provided but job does not contain FromNode method"); + return MakeIntrusive<TJob>(); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +using TJobFunction = int (*)(size_t, IInputStream&); +using TConstructJobFunction = ::TIntrusivePtr<NYT::IStructuredJob> (*)(const TNode&); + +class TJobFactory +{ +public: + static TJobFactory* Get() + { + return Singleton<TJobFactory>(); + } + + template <class TJob> + void RegisterJob(const char* name) + { + RegisterJobImpl<TJob>(name, RunJob<TJob>); + JobConstructors[name] = ConstructJobFromNode<TJob>; + } + + template <class TRawJob> + void RegisterRawJob(const char* name) + { + RegisterJobImpl<TRawJob>(name, RunRawJob<TRawJob>); + } + + template <class TVanillaJob> 
+ void RegisterVanillaJob(const char* name) + { + RegisterJobImpl<TVanillaJob>(name, RunVanillaJob<TVanillaJob>); + } + + TString GetJobName(const IJob* job) + { + const auto typeIndex = std::type_index(typeid(*job)); + CheckJobRegistered(typeIndex); + return JobNames[typeIndex]; + } + + TJobFunction GetJobFunction(const char* name) + { + CheckNameRegistered(name); + return JobFunctions[name]; + } + + TConstructJobFunction GetConstructingFunction(const char* name) + { + CheckNameRegistered(name); + return JobConstructors[name]; + } + +private: + TMap<std::type_index, TString> JobNames; + THashMap<TString, TJobFunction> JobFunctions; + THashMap<TString, TConstructJobFunction> JobConstructors; + + template <typename TJob, typename TRunner> + void RegisterJobImpl(const char* name, TRunner runner) { + const auto typeIndex = std::type_index(typeid(TJob)); + CheckNotRegistered(typeIndex, name); + JobNames[typeIndex] = name; + JobFunctions[name] = runner; + } + + void CheckNotRegistered(const std::type_index& typeIndex, const char* name) + { + Y_ENSURE(!JobNames.contains(typeIndex), + "type_info '" << typeIndex.name() << "'" + "is already registered under name '" << JobNames[typeIndex] << "'"); + Y_ENSURE(!JobFunctions.contains(name), + "job with name '" << name << "' is already registered"); + } + + void CheckJobRegistered(const std::type_index& typeIndex) + { + Y_ENSURE(JobNames.contains(typeIndex), + "type_info '" << typeIndex.name() << "' is not registered, use REGISTER_* macros"); + } + + void CheckNameRegistered(const char* name) + { + Y_ENSURE(JobFunctions.contains(name), + "job with name '" << name << "' is not registered, use REGISTER_* macros"); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <class TMapper> +struct TMapperRegistrator +{ + TMapperRegistrator(const char* name) + { + static_assert(TMapper::JobType == IJob::EType::Mapper, + "REGISTER_MAPPER is not compatible with this job class"); + + 
NYT::TJobFactory::Get()->RegisterJob<TMapper>(name); + } +}; + +template <class TReducer> +struct TReducerRegistrator +{ + TReducerRegistrator(const char* name) + { + static_assert(TReducer::JobType == IJob::EType::Reducer || + TReducer::JobType == IJob::EType::ReducerAggregator, + "REGISTER_REDUCER is not compatible with this job class"); + + NYT::TJobFactory::Get()->RegisterJob<TReducer>(name); + } +}; + +template <class TRawJob> +struct TRawJobRegistrator +{ + TRawJobRegistrator(const char* name) + { + static_assert(TRawJob::JobType == IJob::EType::RawJob, + "REGISTER_RAW_JOB is not compatible with this job class"); + NYT::TJobFactory::Get()->RegisterRawJob<TRawJob>(name); + } +}; + +template <class TVanillaJob> +struct TVanillaJobRegistrator +{ + TVanillaJobRegistrator(const char* name) + { + static_assert(TVanillaJob::JobType == IJob::EType::VanillaJob, + "REGISTER_VANILLA_JOB is not compatible with this job class"); + NYT::TJobFactory::Get()->RegisterVanillaJob<TVanillaJob>(name); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +inline TString YtRegistryTypeName(const TString& name) { + TString res = name; +#ifdef _win_ + SubstGlobal(res, "class ", ""); +#endif + return res; +} + +//////////////////////////////////////////////////////////////////////////////// + +#define REGISTER_MAPPER(...) \ +static const NYT::TMapperRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data()); + +#define REGISTER_NAMED_MAPPER(name, ...) \ +static const NYT::TMapperRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name); + +#define REGISTER_REDUCER(...) \ +static const NYT::TReducerRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data()); + +#define REGISTER_NAMED_REDUCER(name, ...) 
\ +static const NYT::TReducerRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name); + +#define REGISTER_NAMED_RAW_JOB(name, ...) \ +static const NYT::TRawJobRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name); + +#define REGISTER_RAW_JOB(...) \ +REGISTER_NAMED_RAW_JOB((NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data()), __VA_ARGS__) + +#define REGISTER_NAMED_VANILLA_JOB(name, ...) \ +static NYT::TVanillaJobRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name); + +#define REGISTER_VANILLA_JOB(...) \ +REGISTER_NAMED_VANILLA_JOB((NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data()), __VA_ARGS__) + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IMapper<TReader, TWriter>::GetInputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TReader::TRowType>(); +} + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IMapper<TReader, TWriter>::GetOutputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IReducer<TReader, TWriter>::GetInputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TReader::TRowType>(); +} + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IReducer<TReader, TWriter>::GetOutputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IAggregatorReducer<TReader, 
TWriter>::GetInputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TReader::TRowType>(); +} + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IAggregatorReducer<TReader, TWriter>::GetOutputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TWriter> +TStructuredRowStreamDescription IVanillaJob<TWriter>::GetInputRowStreamDescription() const +{ + return TVoidStructuredRowStream(); +} + +template <typename TWriter> +TStructuredRowStreamDescription IVanillaJob<TWriter>::GetOutputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/operation.cpp b/yt/cpp/mapreduce/interface/operation.cpp new file mode 100644 index 0000000000..706fc4caa4 --- /dev/null +++ b/yt/cpp/mapreduce/interface/operation.cpp @@ -0,0 +1,663 @@ +#include "operation.h" + +#include <util/generic/iterator_range.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + i64 OutputTableCount = -1; +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +TTaskName::TTaskName(TString taskName) + : TaskName_(std::move(taskName)) +{ } + +TTaskName::TTaskName(const char* taskName) + : TaskName_(taskName) +{ } + +TTaskName::TTaskName(ETaskName taskName) + : TaskName_(ToString(taskName)) +{ } + +const TString& TTaskName::Get() const +{ + return TaskName_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TCommandRawJob::TCommandRawJob(TStringBuf command) + : Command_(command) +{ } + 
+const TString& TCommandRawJob::GetCommand() const +{ + return Command_; +} + +void TCommandRawJob::Do(const TRawJobContext& /* jobContext */) +{ + Y_FAIL("TCommandRawJob::Do must not be called"); +} + +REGISTER_NAMED_RAW_JOB("NYT::TCommandRawJob", TCommandRawJob) + +//////////////////////////////////////////////////////////////////////////////// + +TCommandVanillaJob::TCommandVanillaJob(TStringBuf command) + : Command_(command) +{ } + +const TString& TCommandVanillaJob::GetCommand() const +{ + return Command_; +} + +void TCommandVanillaJob::Do() +{ + Y_FAIL("TCommandVanillaJob::Do must not be called"); +} + +REGISTER_NAMED_VANILLA_JOB("NYT::TCommandVanillaJob", TCommandVanillaJob); + +//////////////////////////////////////////////////////////////////////////////// + +bool operator==(const TUnspecifiedTableStructure&, const TUnspecifiedTableStructure&) +{ + return true; +} + +bool operator==(const TProtobufTableStructure& lhs, const TProtobufTableStructure& rhs) +{ + return lhs.Descriptor == rhs.Descriptor; +} + +//////////////////////////////////////////////////////////////////////////////// + +const TVector<TStructuredTablePath>& TOperationInputSpecBase::GetStructuredInputs() const +{ + return StructuredInputs_; +} + +const TVector<TStructuredTablePath>& TOperationOutputSpecBase::GetStructuredOutputs() const +{ + return StructuredOutputs_; +} + +void TOperationInputSpecBase::AddStructuredInput(TStructuredTablePath path) +{ + Inputs_.push_back(path.RichYPath); + StructuredInputs_.push_back(std::move(path)); +} + +void TOperationOutputSpecBase::AddStructuredOutput(TStructuredTablePath path) +{ + Outputs_.push_back(path.RichYPath); + StructuredOutputs_.push_back(std::move(path)); +} + +//////////////////////////////////////////////////////////////////////////////// + +TVanillaTask& TVanillaTask::AddStructuredOutput(TStructuredTablePath path) +{ + TOperationOutputSpecBase::AddStructuredOutput(std::move(path)); + return *this; +} + 
+//////////////////////////////////////////////////////////////////////////////// + +TStructuredRowStreamDescription IVanillaJob<void>::GetInputRowStreamDescription() const +{ + return TVoidStructuredRowStream(); +} + +TStructuredRowStreamDescription IVanillaJob<void>::GetOutputRowStreamDescription() const +{ + return TVoidStructuredRowStream(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TRawJobContext::TRawJobContext(size_t outputTableCount) + : InputFile_(Duplicate(0)) +{ + for (size_t i = 0; i != outputTableCount; ++i) { + OutputFileList_.emplace_back(Duplicate(3 * i + 1)); + } +} + +const TFile& TRawJobContext::GetInputFile() const +{ + return InputFile_; +} + +const TVector<TFile>& TRawJobContext::GetOutputFileList() const +{ + return OutputFileList_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TUserJobSpec& TUserJobSpec::AddLocalFile( + const TLocalFilePath& path, + const TAddLocalFileOptions& options) +{ + LocalFiles_.emplace_back(path, options); + return *this; +} + +TUserJobSpec& TUserJobSpec::JobBinaryLocalPath(TString path, TMaybe<TString> md5) +{ + JobBinary_ = TJobBinaryLocalPath{path, md5}; + return *this; +} + +TUserJobSpec& TUserJobSpec::JobBinaryCypressPath(TString path, TMaybe<TTransactionId> transactionId) +{ + JobBinary_ = TJobBinaryCypressPath{path, transactionId}; + return *this; +} + +const TJobBinaryConfig& TUserJobSpec::GetJobBinary() const +{ + return JobBinary_; +} + +TVector<std::tuple<TLocalFilePath, TAddLocalFileOptions>> TUserJobSpec::GetLocalFiles() const +{ + return LocalFiles_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TJobOperationPreparer::TInputGroup::TInputGroup(TJobOperationPreparer& preparer, TVector<int> indices) + : Preparer_(preparer) + , Indices_(std::move(indices)) +{ } + +TJobOperationPreparer::TInputGroup& TJobOperationPreparer::TInputGroup::ColumnRenaming(const THashMap<TString, 
TString>& renaming) +{ + for (auto i : Indices_) { + Preparer_.InputColumnRenaming(i, renaming); + } + return *this; +} + +TJobOperationPreparer::TInputGroup& TJobOperationPreparer::TInputGroup::ColumnFilter(const TVector<TString>& columns) +{ + for (auto i : Indices_) { + Preparer_.InputColumnFilter(i, columns); + } + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::TInputGroup::EndInputGroup() +{ + return Preparer_; +} + +TJobOperationPreparer::TOutputGroup::TOutputGroup(TJobOperationPreparer& preparer, TVector<int> indices) + : Preparer_(preparer) + , Indices_(std::move(indices)) +{ } + +TJobOperationPreparer::TOutputGroup& TJobOperationPreparer::TOutputGroup::Schema(const TTableSchema &schema) +{ + for (auto i : Indices_) { + Preparer_.OutputSchema(i, schema); + } + return *this; +} + +TJobOperationPreparer::TOutputGroup& TJobOperationPreparer::TOutputGroup::NoSchema() +{ + for (auto i : Indices_) { + Preparer_.NoOutputSchema(i); + } + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::TOutputGroup::EndOutputGroup() +{ + return Preparer_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TJobOperationPreparer::TJobOperationPreparer(const IOperationPreparationContext& context) + : Context_(context) + , OutputSchemas_(context.GetOutputCount()) + , InputColumnRenamings_(context.GetInputCount()) + , InputColumnFilters_(context.GetInputCount()) + , InputTableDescriptions_(context.GetInputCount()) + , OutputTableDescriptions_(context.GetOutputCount()) +{ } + +TJobOperationPreparer::TInputGroup TJobOperationPreparer::BeginInputGroup(int begin, int end) +{ + Y_ENSURE_EX(begin <= end, TApiUsageError() + << "BeginInputGroup(): begin must not exceed end, got " << begin << ", " << end); + TVector<int> indices; + for (int i = begin; i < end; ++i) { + ValidateInputTableIndex(i, TStringBuf("BeginInputGroup()")); + indices.push_back(i); + } + return TInputGroup(*this, std::move(indices)); +} + + 
+TJobOperationPreparer::TOutputGroup TJobOperationPreparer::BeginOutputGroup(int begin, int end) +{ + Y_ENSURE_EX(begin <= end, TApiUsageError() + << "BeginOutputGroup(): begin must not exceed end, got " << begin << ", " << end); + TVector<int> indices; + for (int i = begin; i < end; ++i) { + ValidateOutputTableIndex(i, TStringBuf("BeginOutputGroup()")); + indices.push_back(i); + } + return TOutputGroup(*this, std::move(indices)); +} + +TJobOperationPreparer& TJobOperationPreparer::NodeOutput(int tableIndex) +{ + ValidateMissingOutputDescription(tableIndex); + OutputTableDescriptions_[tableIndex] = StructuredTableDescription<TNode>(); + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::OutputSchema(int tableIndex, TTableSchema schema) +{ + ValidateMissingOutputSchema(tableIndex); + OutputSchemas_[tableIndex] = std::move(schema); + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::NoOutputSchema(int tableIndex) +{ + ValidateMissingOutputSchema(tableIndex); + OutputSchemas_[tableIndex] = EmptyNonstrictSchema(); + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::InputColumnRenaming( + int tableIndex, + const THashMap<TString,TString>& renaming) +{ + ValidateInputTableIndex(tableIndex, TStringBuf("InputColumnRenaming()")); + InputColumnRenamings_[tableIndex] = renaming; + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::InputColumnFilter(int tableIndex, const TVector<TString>& columns) +{ + ValidateInputTableIndex(tableIndex, TStringBuf("InputColumnFilter()")); + InputColumnFilters_[tableIndex] = columns; + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::FormatHints(TUserJobFormatHints newFormatHints) +{ + FormatHints_ = newFormatHints; + return *this; +} + +void TJobOperationPreparer::Finish() +{ + FinallyValidate(); +} + +TVector<TTableSchema> TJobOperationPreparer::GetOutputSchemas() +{ + TVector<TTableSchema> result; + result.reserve(OutputSchemas_.size()); + for (auto& schema : 
OutputSchemas_) { + Y_VERIFY(schema.Defined()); + result.push_back(std::move(*schema)); + schema.Clear(); + } + return result; +} + +void TJobOperationPreparer::FinallyValidate() const +{ + TVector<int> illegallyMissingSchemaIndices; + for (int i = 0; i < static_cast<int>(OutputSchemas_.size()); ++i) { + if (!OutputSchemas_[i]) { + illegallyMissingSchemaIndices.push_back(i); + } + } + if (illegallyMissingSchemaIndices.empty()) { + return; + } + TApiUsageError error; + error << "Output table schemas are missing: "; + for (auto i : illegallyMissingSchemaIndices) { + error << "no. " << i; + if (auto path = Context_.GetInputPath(i)) { + error << "(" << *path << ")"; + } + error << "; "; + } + ythrow std::move(error); +} + +//////////////////////////////////////////////////////////////////////////////// + +void TJobOperationPreparer::ValidateInputTableIndex(int tableIndex, TStringBuf message) const +{ + Y_ENSURE_EX( + 0 <= tableIndex && tableIndex < static_cast<int>(Context_.GetInputCount()), + TApiUsageError() << + message << ": input table index " << tableIndex << " us out of range [0;" << + OutputSchemas_.size() << ")"); +} + +void TJobOperationPreparer::ValidateOutputTableIndex(int tableIndex, TStringBuf message) const +{ + Y_ENSURE_EX( + 0 <= tableIndex && tableIndex < static_cast<int>(Context_.GetOutputCount()), + TApiUsageError() << + message << ": output table index " << tableIndex << " us out of range [0;" << + OutputSchemas_.size() << ")"); +} + +void TJobOperationPreparer::ValidateMissingOutputSchema(int tableIndex) const +{ + ValidateOutputTableIndex(tableIndex, "ValidateMissingOutputSchema()"); + Y_ENSURE_EX(!OutputSchemas_[tableIndex], + TApiUsageError() << + "Output table schema no. 
" << tableIndex << " " << + "(" << Context_.GetOutputPath(tableIndex).GetOrElse("<unknown path>") << ") " << + "is already set"); +} + +void TJobOperationPreparer::ValidateMissingInputDescription(int tableIndex) const +{ + ValidateInputTableIndex(tableIndex, "ValidateMissingInputDescription()"); + Y_ENSURE_EX(!InputTableDescriptions_[tableIndex], + TApiUsageError() << + "Description for input no. " << tableIndex << " " << + "(" << Context_.GetOutputPath(tableIndex).GetOrElse("<unknown path>") << ") " << + "is already set"); +} + +void TJobOperationPreparer::ValidateMissingOutputDescription(int tableIndex) const +{ + ValidateOutputTableIndex(tableIndex, "ValidateMissingOutputDescription()"); + Y_ENSURE_EX(!OutputTableDescriptions_[tableIndex], + TApiUsageError() << + "Description for output no. " << tableIndex << " " << + "(" << Context_.GetOutputPath(tableIndex).GetOrElse("<unknown path>") << ") " << + "is already set"); +} + +TTableSchema TJobOperationPreparer::EmptyNonstrictSchema() { + return TTableSchema().Strict(false); +} + +//////////////////////////////////////////////////////////////////////////////// + +const TVector<THashMap<TString, TString>>& TJobOperationPreparer::GetInputColumnRenamings() const +{ + return InputColumnRenamings_; +} + +const TVector<TMaybe<TVector<TString>>>& TJobOperationPreparer::GetInputColumnFilters() const +{ + return InputColumnFilters_; +} + +const TVector<TMaybe<TTableStructure>>& TJobOperationPreparer::GetInputDescriptions() const +{ + return InputTableDescriptions_; +} + +const TVector<TMaybe<TTableStructure>>& TJobOperationPreparer::GetOutputDescriptions() const +{ + return OutputTableDescriptions_; +} + +const TUserJobFormatHints& TJobOperationPreparer::GetFormatHints() const +{ + return FormatHints_; +} + +TJobOperationPreparer& TJobOperationPreparer::InputFormatHints(TFormatHints hints) +{ + FormatHints_.InputFormatHints(hints); + return *this; +} + +TJobOperationPreparer& 
TJobOperationPreparer::OutputFormatHints(TFormatHints hints) +{ + FormatHints_.OutputFormatHints(hints); + return *this; +} + +//////////////////////////////////////////////////////////////////////////////// + +void IJob::PrepareOperation(const IOperationPreparationContext& context, TJobOperationPreparer& resultBuilder) const +{ + for (int i = 0; i < context.GetOutputCount(); ++i) { + resultBuilder.NoOutputSchema(i); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +IOperationPtr IOperationClient::Map( + const TMapOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + const TOperationOptions& options) +{ + Y_VERIFY(mapper.Get()); + + return DoMap( + spec, + std::move(mapper), + options); +} + +IOperationPtr IOperationClient::Map( + ::TIntrusivePtr<IMapperBase> mapper, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TMapOperationSpec& spec, + const TOperationOptions& options) +{ + Y_ENSURE_EX(spec.Inputs_.empty(), + TApiUsageError() << "TMapOperationSpec::Inputs MUST be empty"); + Y_ENSURE_EX(spec.Outputs_.empty(), + TApiUsageError() << "TMapOperationSpec::Outputs MUST be empty"); + + auto mapSpec = spec; + for (const auto& inputPath : input.Parts_) { + mapSpec.AddStructuredInput(inputPath); + } + for (const auto& outputPath : output.Parts_) { + mapSpec.AddStructuredOutput(outputPath); + } + return Map(mapSpec, std::move(mapper), options); +} + +IOperationPtr IOperationClient::Reduce( + const TReduceOperationSpec& spec, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options) +{ + Y_VERIFY(reducer.Get()); + + return DoReduce( + spec, + std::move(reducer), + options); +} + +IOperationPtr IOperationClient::Reduce( + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + const TReduceOperationSpec& spec, + const 
TOperationOptions& options) +{ + Y_ENSURE_EX(spec.Inputs_.empty(), + TApiUsageError() << "TReduceOperationSpec::Inputs MUST be empty"); + Y_ENSURE_EX(spec.Outputs_.empty(), + TApiUsageError() << "TReduceOperationSpec::Outputs MUST be empty"); + Y_ENSURE_EX(spec.ReduceBy_.Parts_.empty(), + TApiUsageError() << "TReduceOperationSpec::ReduceBy MUST be empty"); + + auto reduceSpec = spec; + for (const auto& inputPath : input.Parts_) { + reduceSpec.AddStructuredInput(inputPath); + } + for (const auto& outputPath : output.Parts_) { + reduceSpec.AddStructuredOutput(outputPath); + } + reduceSpec.ReduceBy(reduceBy); + return Reduce(reduceSpec, std::move(reducer), options); +} + +IOperationPtr IOperationClient::JoinReduce( + const TJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options) +{ + Y_VERIFY(reducer.Get()); + + return DoJoinReduce( + spec, + std::move(reducer), + options); +} + +IOperationPtr IOperationClient::MapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options) +{ + Y_VERIFY(reducer.Get()); + + return DoMapReduce( + spec, + std::move(mapper), + nullptr, + std::move(reducer), + options); +} + +IOperationPtr IOperationClient::MapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reduceCombiner, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options) +{ + Y_VERIFY(reducer.Get()); + + return DoMapReduce( + spec, + std::move(mapper), + std::move(reduceCombiner), + std::move(reducer), + options); +} + +IOperationPtr IOperationClient::MapReduce( + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + TMapReduceOperationSpec spec, + const TOperationOptions& options) +{ + 
Y_ENSURE_EX(spec.Inputs_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::Inputs MUST be empty"); + Y_ENSURE_EX(spec.Outputs_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::Outputs MUST be empty"); + Y_ENSURE_EX(spec.ReduceBy_.Parts_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::ReduceBy MUST be empty"); + + for (const auto& inputPath : input.Parts_) { + spec.AddStructuredInput(inputPath); + } + for (const auto& outputPath : output.Parts_) { + spec.AddStructuredOutput(outputPath); + } + spec.ReduceBy(reduceBy); + return MapReduce(spec, std::move(mapper), std::move(reducer), options); +} + +IOperationPtr IOperationClient::MapReduce( + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reduceCombiner, + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + TMapReduceOperationSpec spec, + const TOperationOptions& options) +{ + Y_ENSURE_EX(spec.Inputs_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::Inputs MUST be empty"); + Y_ENSURE_EX(spec.Outputs_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::Outputs MUST be empty"); + Y_ENSURE_EX(spec.ReduceBy_.Parts_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::ReduceBy MUST be empty"); + + for (const auto& inputPath : input.Parts_) { + spec.AddStructuredInput(inputPath); + } + for (const auto& outputPath : output.Parts_) { + spec.AddStructuredOutput(outputPath); + } + spec.ReduceBy(reduceBy); + return MapReduce(spec, std::move(mapper), std::move(reduceCombiner), std::move(reducer), options); +} + +IOperationPtr IOperationClient::Sort( + const TOneOrMany<TRichYPath>& input, + const TRichYPath& output, + const TSortColumns& sortBy, + const TSortOperationSpec& spec, + const TOperationOptions& options) +{ + Y_ENSURE_EX(spec.Inputs_.empty(), + TApiUsageError() << "TSortOperationSpec::Inputs MUST be empty"); + 
Y_ENSURE_EX(spec.Output_.Path_.empty(), + TApiUsageError() << "TSortOperationSpec::Output MUST be empty"); + Y_ENSURE_EX(spec.SortBy_.Parts_.empty(), + TApiUsageError() << "TSortOperationSpec::SortBy MUST be empty"); + + auto sortSpec = spec; + for (const auto& inputPath : input.Parts_) { + sortSpec.AddInput(inputPath); + } + sortSpec.Output(output); + sortSpec.SortBy(sortBy); + return Sort(sortSpec, options); +} + +//////////////////////////////////////////////////////////////////////////////// + +TRawTableReaderPtr IStructuredJob::CreateCustomRawJobReader(int) const +{ + return nullptr; +} + +THolder<IProxyOutput> IStructuredJob::CreateCustomRawJobWriter(size_t) const +{ + return nullptr; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/operation.h b/yt/cpp/mapreduce/interface/operation.h new file mode 100644 index 0000000000..171a7e4af7 --- /dev/null +++ b/yt/cpp/mapreduce/interface/operation.h @@ -0,0 +1,3494 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/operation.h +/// +/// Header containing interface to run operations in YT +/// and retrieve information about them. +/// @see [the doc](https://yt.yandex-team.ru/docs/description/mr/map_reduce_overview.html). + +#include "client_method_options.h" +#include "errors.h" +#include "io.h" +#include "job_statistics.h" +#include "job_counters.h" + +#include <library/cpp/threading/future/future.h> +#include <library/cpp/type_info/type_info.h> + +#include <util/datetime/base.h> +#include <util/generic/variant.h> +#include <util/generic/vector.h> +#include <util/generic/maybe.h> +#include <util/system/file.h> +#include <util/system/types.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// Tag class marking that the row type for table is not specified. 
+struct TUnspecifiedTableStructure +{ }; + +/// Tag class marking that table rows have protobuf type. +struct TProtobufTableStructure +{ + /// @brief Descriptor of the protobuf type of table rows. + /// + /// @note If table is tagged with @ref ::google::protobuf::Message instead of real proto class + /// this descriptor might be null. + const ::google::protobuf::Descriptor* Descriptor = nullptr; +}; + + +/// Tag class to specify table row type. +using TTableStructure = std::variant< + TUnspecifiedTableStructure, + TProtobufTableStructure +>; + +bool operator==(const TUnspecifiedTableStructure&, const TUnspecifiedTableStructure&); +bool operator==(const TProtobufTableStructure& lhs, const TProtobufTableStructure& rhs); + +/// Table path marked with @ref NYT::TTableStructure tag. +struct TStructuredTablePath +{ + TStructuredTablePath(TRichYPath richYPath = TRichYPath(), TTableStructure description = TUnspecifiedTableStructure()) + : RichYPath(std::move(richYPath)) + , Description(std::move(description)) + { } + + TStructuredTablePath(TRichYPath richYPath, const ::google::protobuf::Descriptor* descriptor) + : RichYPath(std::move(richYPath)) + , Description(TProtobufTableStructure({descriptor})) + { } + + TStructuredTablePath(TYPath path) + : RichYPath(std::move(path)) + , Description(TUnspecifiedTableStructure()) + { } + + TStructuredTablePath(const char* path) + : RichYPath(path) + , Description(TUnspecifiedTableStructure()) + { } + + TRichYPath RichYPath; + TTableStructure Description; +}; + +/// Create marked table path from row type. +template <typename TRow> +TStructuredTablePath Structured(TRichYPath richYPath); + +/// Create tag class from row type. +template <typename TRow> +TTableStructure StructuredTableDescription(); + +/////////////////////////////////////////////////////////////////////////////// + +/// Tag class marking that row stream is empty. +struct TVoidStructuredRowStream +{ }; + +/// Tag class marking that row stream consists of `NYT::TNode`. 
+struct TTNodeStructuredRowStream +{ }; + +/// Tag class marking that row stream consists of @ref NYT::TYaMRRow. +struct TTYaMRRowStructuredRowStream +{ }; + +/// Tag class marking that row stream consists of protobuf rows of given type. +struct TProtobufStructuredRowStream +{ + /// @brief Descriptor of the protobuf type of table rows. + /// + /// @note If `Descriptor` is nullptr, then row stream consists of multiple message types. + const ::google::protobuf::Descriptor* Descriptor = nullptr; +}; + +/// Tag class to specify type of rows in an operation row stream +using TStructuredRowStreamDescription = std::variant< + TVoidStructuredRowStream, + TTNodeStructuredRowStream, + TTYaMRRowStructuredRowStream, + TProtobufStructuredRowStream +>; + +/////////////////////////////////////////////////////////////////////////////// + +/// Tag class marking that current binary should be used in operation. +struct TJobBinaryDefault +{ }; + +/// Tag class marking that binary from specified local path should be used in operation. +struct TJobBinaryLocalPath +{ + TString Path; + TMaybe<TString> MD5CheckSum; +}; + +/// Tag class marking that binary from specified Cypress path should be used in operation. +struct TJobBinaryCypressPath +{ + TYPath Path; + TMaybe<TTransactionId> TransactionId; +}; + +//////////////////////////////////////////////////////////////////////////////// + + +/// @cond Doxygen_Suppress +namespace NDetail { + extern i64 OutputTableCount; +} // namespace NDetail +/// @endcond + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Auto merge mode. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/automerge +enum class EAutoMergeMode +{ + /// Auto merge is disabled. + Disabled /* "disabled" */, + + /// Mode that tries to achieve good chunk sizes and doesn't limit usage of chunk quota for intermediate chunks. 
+ Relaxed /* "relaxed" */, + + /// Mode that tries to optimize usage of chunk quota for intermediate chunks, operation might run slower. + Economy /* "economy" */, + + /// + /// @brief Manual configuration of automerge parameters. + /// + /// @ref TAutoMergeSpec + Manual /* "manual" */, +}; + +/// +/// @brief Options for auto merge operation stage. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/automerge +class TAutoMergeSpec +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TAutoMergeSpec; + /// @endcond + + /// Mode of the auto merge. + FLUENT_FIELD_OPTION(EAutoMergeMode, Mode); + + /// @brief Upper limit for number of intermediate chunks. + /// + /// Works only for Manual mode. + FLUENT_FIELD_OPTION(i64, MaxIntermediateChunkCount); + + /// @brief Number of chunks limit to merge in one job. + /// + /// Works only for Manual mode. + FLUENT_FIELD_OPTION(i64, ChunkCountPerMergeJob); + + /// @brief Automerge will not merge chunks that are larger than `DesiredChunkSize * (ChunkSizeThreshold / 100.)` + /// + /// Works only for Manual mode. + FLUENT_FIELD_OPTION(i64, ChunkSizeThreshold); +}; + +/// Base for operations with auto merge options. +template <class TDerived> +class TWithAutoMergeSpec +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Options for auto merge operation stage. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/automerge + FLUENT_FIELD_OPTION(TAutoMergeSpec, AutoMerge); +}; + +/// +/// @brief Resources controlled by scheduler and used by running operations. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/scheduler/scheduler_and_pools#resursy +class TSchedulerResources +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TSchedulerResources; + /// @endcond + + /// Each job consumes exactly one user slot. + FLUENT_FIELD_OPTION_ENCAPSULATED(i64, UserSlots); + + /// Number of (virtual) cpu cores consumed by all jobs. 
+ FLUENT_FIELD_OPTION_ENCAPSULATED(i64, Cpu); + + /// Amount of memory in bytes. + FLUENT_FIELD_OPTION_ENCAPSULATED(i64, Memory); +}; + +/// Base for input format hints of a user job. +template <class TDerived> +class TUserJobInputFormatHintsBase +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Fine tune input format of the job. + FLUENT_FIELD_OPTION(TFormatHints, InputFormatHints); +}; + +/// Base for output format hints of a user job. +template <class TDerived> +class TUserJobOutputFormatHintsBase +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Fine tune output format of the job. + FLUENT_FIELD_OPTION(TFormatHints, OutputFormatHints); +}; + +/// Base for format hints of a user job. +template <class TDerived> +class TUserJobFormatHintsBase + : public TUserJobInputFormatHintsBase<TDerived> + , public TUserJobOutputFormatHintsBase<TDerived> +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond +}; + +/// User job format hints. +class TUserJobFormatHints + : public TUserJobFormatHintsBase<TUserJobFormatHints> +{ }; + +/// Spec of input and output tables of a raw operation. +template <class TDerived> +class TRawOperationIoTableSpec +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// Add input table path to input path list. + TDerived& AddInput(const TRichYPath& path); + + /// Set input table path no. `tableIndex`. + TDerived& SetInput(size_t tableIndex, const TRichYPath& path); + + /// Add output table path to output path list. + TDerived& AddOutput(const TRichYPath& path); + + /// Set output table path no. `tableIndex`. + TDerived& SetOutput(size_t tableIndex, const TRichYPath& path); + + /// Get all input table paths. + const TVector<TRichYPath>& GetInputs() const; + + /// Get all output table paths. 
+ const TVector<TRichYPath>& GetOutputs() const; + +private: + TVector<TRichYPath> Inputs_; + TVector<TRichYPath> Outputs_; +}; + +/// Base spec for IO in "simple" raw operations (Map, Reduce etc.). +template <class TDerived> +struct TSimpleRawOperationIoSpec + : public TRawOperationIoTableSpec<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Describes format for both input and output. + /// + /// @note `Format' is overriden by `InputFormat' and `OutputFormat'. + FLUENT_FIELD_OPTION(TFormat, Format); + + /// Describes input format. + FLUENT_FIELD_OPTION(TFormat, InputFormat); + + /// Describes output format. + FLUENT_FIELD_OPTION(TFormat, OutputFormat); +}; + +/// Spec for IO in MapReduce operation. +template <class TDerived> +class TRawMapReduceOperationIoSpec + : public TRawOperationIoTableSpec<TDerived> +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Describes format for both input and output of mapper. + /// + /// @note `MapperFormat' is overriden by `MapperInputFormat' and `MapperOutputFormat'. + FLUENT_FIELD_OPTION(TFormat, MapperFormat); + + /// Describes mapper input format. + FLUENT_FIELD_OPTION(TFormat, MapperInputFormat); + + /// Describes mapper output format. + FLUENT_FIELD_OPTION(TFormat, MapperOutputFormat); + + /// @brief Describes format for both input and output of reduce combiner. + /// + /// @note `ReduceCombinerFormat' is overriden by `ReduceCombinerInputFormat' and `ReduceCombinerOutputFormat'. + FLUENT_FIELD_OPTION(TFormat, ReduceCombinerFormat); + + /// Describes reduce combiner input format. + FLUENT_FIELD_OPTION(TFormat, ReduceCombinerInputFormat); + + /// Describes reduce combiner output format. + FLUENT_FIELD_OPTION(TFormat, ReduceCombinerOutputFormat); + + /// @brief Describes format for both input and output of reducer. + /// + /// @note `ReducerFormat' is overriden by `ReducerInputFormat' and `ReducerOutputFormat'. 
+ FLUENT_FIELD_OPTION(TFormat, ReducerFormat); + + /// Describes reducer input format. + FLUENT_FIELD_OPTION(TFormat, ReducerInputFormat); + + /// Describes reducer output format. + FLUENT_FIELD_OPTION(TFormat, ReducerOutputFormat); + + /// Add direct map output table path. + TDerived& AddMapOutput(const TRichYPath& path); + + /// Set direct map output table path no. `tableIndex`. + TDerived& SetMapOutput(size_t tableIndex, const TRichYPath& path); + + /// Get all direct map output table paths + const TVector<TRichYPath>& GetMapOutputs() const; + +private: + TVector<TRichYPath> MapOutputs_; +}; + +/// +/// @brief Base spec of operations with input tables. +class TOperationInputSpecBase +{ +public: + template <class T, class = void> + struct TFormatAdder; + + /// + /// @brief Add input table path to input path list and specify type of rows. + template <class T> + void AddInput(const TRichYPath& path); + + /// + /// @brief Add input table path as structured paths. + void AddStructuredInput(TStructuredTablePath path); + + /// + /// @brief Set input table path and type. + template <class T> + void SetInput(size_t tableIndex, const TRichYPath& path); + + /// + /// @brief All input paths. + TVector<TRichYPath> Inputs_; + + /// + /// @brief Get all input structured paths. + const TVector<TStructuredTablePath>& GetStructuredInputs() const; + +private: + TVector<TStructuredTablePath> StructuredInputs_; + friend struct TOperationIOSpecBase; + template <class T> + friend struct TOperationIOSpec; +}; + +/// +/// @brief Base spec of operations with output tables. +class TOperationOutputSpecBase +{ +public: + template <class T, class = void> + struct TFormatAdder; + + /// + /// @brief Add output table path to output path list and specify type of rows. + template <class T> + void AddOutput(const TRichYPath& path); + + /// + /// @brief Add output table path as structured paths. 
+ void AddStructuredOutput(TStructuredTablePath path); + + /// + /// @brief Set output table path and type. + template <class T> + void SetOutput(size_t tableIndex, const TRichYPath& path); + + /// + /// @brief All output paths. + TVector<TRichYPath> Outputs_; + + /// + /// @brief Get all output structured paths. + const TVector<TStructuredTablePath>& GetStructuredOutputs() const; + +private: + TVector<TStructuredTablePath> StructuredOutputs_; + friend struct TOperationIOSpecBase; + template <class T> + friend struct TOperationIOSpec; +}; + +/// +/// @brief Base spec for operations with inputs and outputs. +struct TOperationIOSpecBase + : public TOperationInputSpecBase + , public TOperationOutputSpecBase +{ }; + +/// +/// @brief Base spec for operations with inputs and outputs. +template <class TDerived> +struct TOperationIOSpec + : public TOperationIOSpecBase +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + template <class T> + TDerived& AddInput(const TRichYPath& path); + + TDerived& AddStructuredInput(TStructuredTablePath path); + + template <class T> + TDerived& SetInput(size_t tableIndex, const TRichYPath& path); + + template <class T> + TDerived& AddOutput(const TRichYPath& path); + + TDerived& AddStructuredOutput(TStructuredTablePath path); + + template <class T> + TDerived& SetOutput(size_t tableIndex, const TRichYPath& path); + + + // DON'T USE THESE METHODS! They are left solely for backward compatibility. + // These methods are the only way to do equivalent of (Add/Set)(Input/Output)<Message> + // but please consider using (Add/Set)(Input/Output)<TConcreteMessage> + // (where TConcreteMessage is some descendant of Message) + // because they are faster and better (see https://st.yandex-team.ru/YT-6967) + TDerived& AddProtobufInput_VerySlow_Deprecated(const TRichYPath& path); + TDerived& AddProtobufOutput_VerySlow_Deprecated(const TRichYPath& path); +}; + +/// +/// @brief Base spec for all operations. 
+/// +/// @see https://yt.yandex-team.ru/docs/description/mr/operations_options +template <class TDerived> +struct TOperationSpecBase +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Limit on operation execution time. + /// + /// If operation doesn't finish in time it will be aborted. + FLUENT_FIELD_OPTION(TDuration, TimeLimit); + + /// @brief Title to be shown in web interface. + FLUENT_FIELD_OPTION(TString, Title); + + /// @brief Pool to be used for this operation. + FLUENT_FIELD_OPTION(TString, Pool); + + /// @brief Weight of operation. + /// + /// Coefficient defining how much resources operation gets relative to its siblings in the same pool. + FLUENT_FIELD_OPTION(double, Weight); + + /// @brief Pool tree list that operation will use. + FLUENT_OPTIONAL_VECTOR_FIELD_ENCAPSULATED(TString, PoolTree); + + /// How much resources can be consumed by operation. + FLUENT_FIELD_OPTION_ENCAPSULATED(TSchedulerResources, ResourceLimits); +}; + +/// +/// @brief Base spec for all operations with user jobs. +template <class TDerived> +struct TUserOperationSpecBase + : TOperationSpecBase<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// How many jobs can fail before operation is failed. + FLUENT_FIELD_OPTION(ui64, MaxFailedJobCount); + + /// On any unsuccessful job completion (i.e. abortion or failure) force the whole operation to fail. + FLUENT_FIELD_OPTION(bool, FailOnJobRestart); + + /// + /// @brief Table to save whole stderr of operation. + /// + /// @see https://clubs.at.yandex-team.ru/yt/1045 + FLUENT_FIELD_OPTION(TYPath, StderrTablePath); + + /// + /// @brief Table to save coredumps of operation. + /// + /// @see https://clubs.at.yandex-team.ru/yt/1045 + FLUENT_FIELD_OPTION(TYPath, CoreTablePath); + + /// + /// @brief How long should the scheduler wait for the job to be started on a node.
+ /// + /// When you run huge jobs that require preemption of all the other jobs on + /// a node, the default timeout might be insufficient and your job may be + /// aborted with 'waiting_timeout' reason. This is especially problematic + /// when you are setting 'FailOnJobRestart' option. + /// + /// @note The value must be between 10 seconds and 10 minutes. + FLUENT_FIELD_OPTION(TDuration, WaitingJobTimeout); +}; + +/// +/// @brief Class to provide information on intermediate mapreduce stream protobuf types. +/// +/// When using protobuf format it is important to know exact types of proto messages +/// that are used in input/output. +/// +/// Sometimes such messages cannot be derived from job class +/// i.e. when job class uses `NYT::TTableReader<::google::protobuf::Message>` +/// or `NYT::TTableWriter<::google::protobuf::Message>`. +/// +/// When using such jobs user can provide exact message type using this class. +/// +/// @note Only input/output that relate to intermediate tables can be hinted. +/// Input to map and output of reduce is derived from `AddInput`/`AddOutput`. +template <class TDerived> +struct TIntermediateTablesHintSpec +{ + /// Specify intermediate map output type. + template <class T> + TDerived& HintMapOutput(); + + /// Specify reduce combiner input. + template <class T> + TDerived& HintReduceCombinerInput(); + + /// Specify reduce combiner output. + template <class T> + TDerived& HintReduceCombinerOutput(); + + /// Specify reducer input. + template <class T> + TDerived& HintReduceInput(); + + /// + /// @brief Add output of map stage. + /// + /// Mapper output table #0 is always intermediate table that is going to be reduced later. + /// Rows that mapper write to tables #1, #2, ... are saved in MapOutput tables. 
+ template <class T> + TDerived& AddMapOutput(const TRichYPath& path); + + TVector<TRichYPath> MapOutputs_; + + const TVector<TStructuredTablePath>& GetStructuredMapOutputs() const; + const TMaybe<TTableStructure>& GetIntermediateMapOutputDescription() const; + const TMaybe<TTableStructure>& GetIntermediateReduceCombinerInputDescription() const; + const TMaybe<TTableStructure>& GetIntermediateReduceCombinerOutputDescription() const; + const TMaybe<TTableStructure>& GetIntermediateReducerInputDescription() const; + +private: + TVector<TStructuredTablePath> StructuredMapOutputs_; + TMaybe<TTableStructure> IntermediateMapOutputDescription_; + TMaybe<TTableStructure> IntermediateReduceCombinerInputDescription_; + TMaybe<TTableStructure> IntermediateReduceCombinerOutputDescription_; + TMaybe<TTableStructure> IntermediateReducerInputDescription_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct TAddLocalFileOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TAddLocalFileOptions; + /// @endcond + + /// + /// @brief Path by which job will see the uploaded file. + /// + /// Defaults to basename of the local path. + FLUENT_FIELD_OPTION(TString, PathInJob); + + /// + /// @brief MD5 checksum of uploaded file. + /// + /// If not specified it is computed by this library. + /// If this argument is provided, the user can save some cpu and disk IO. + FLUENT_FIELD_OPTION(TString, MD5CheckSum); + + /// + /// @brief Do not put file into node cache + /// + /// @see NYT::TRichYPath::BypassArtifactCache + FLUENT_FIELD_OPTION(bool, BypassArtifactCache); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Binary to run job profiler on. +enum class EProfilingBinary +{ + /// Profile job proxy. + JobProxy /* "job_proxy" */, + + /// Profile user job. + UserJob /* "user_job" */, +}; + +/// @brief Type of job profiler. +enum class EProfilerType +{ + /// Profile CPU usage.
+ Cpu /* "cpu" */, + + /// Profile memory usage. + Memory /* "memory" */, + + /// Profile peak memory usage. + PeakMemory /* "peak_memory" */, +}; + +/// @brief Specifies a job profiler. +struct TJobProfilerSpec +{ + /// @cond Doxygen_Suppress + using TSelf = TJobProfilerSpec; + /// @endcond + + /// @brief Binary to profile. + FLUENT_FIELD_OPTION(EProfilingBinary, ProfilingBinary); + + /// @brief Type of the profiler. + FLUENT_FIELD_OPTION(EProfilerType, ProfilerType); + + /// @brief Probability of the job being selected for profiling. + FLUENT_FIELD_OPTION(double, ProfilingProbability); + + /// @brief For sampling profilers, sets the number of samples per second. + FLUENT_FIELD_OPTION(int, SamplingFrequency); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Spec of user job. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/operations_options#user_script_options +struct TUserJobSpec +{ + /// @cond Doxygen_Suppress + using TSelf = TUserJobSpec; + /// @endcond + + /// + /// @brief Specify a local file to upload to Cypress and prepare for use in job. + TSelf& AddLocalFile(const TLocalFilePath& path, const TAddLocalFileOptions& options = TAddLocalFileOptions()); + + /// + /// @brief Get the list of all added local files. + TVector<std::tuple<TLocalFilePath, TAddLocalFileOptions>> GetLocalFiles() const; + + /// @brief Paths to files in Cypress to use in job. + FLUENT_VECTOR_FIELD(TRichYPath, File); + + /// + /// @brief MemoryLimit specifies how much memory job process can use. + /// + /// @note + /// If job uses tmpfs (check @ref NYT::TOperationOptions::MountSandboxInTmpfs) + /// YT computes its memory usage as total of: + /// - memory usage of job process itself (including mapped files); + /// - total size of tmpfs used by this job.
+ /// + /// @note + /// When @ref NYT::TOperationOptions::MountSandboxInTmpfs is enabled library will compute + /// total size of all files used by this job and add this total size to MemoryLimit. + /// Thus you shouldn't include size of your files (e.g. binary file) into MemoryLimit. + /// + /// @note + /// Final memory_limit passed to YT is calculated as follows: + /// + /// @note + /// ``` + /// memory_limit = MemoryLimit + <total-size-of-used-files> + ExtraTmpfsSize + /// ``` + /// + /// @see NYT::TUserJobSpec::ExtraTmpfsSize + FLUENT_FIELD_OPTION(i64, MemoryLimit); + + /// + /// @brief Size of data that is going to be written to tmpfs. + /// + /// This option should be used if job writes data to tmpfs. + /// + /// ExtraTmpfsSize should not include size of files specified with + /// @ref NYT::TUserJobSpec::AddLocalFile or @ref NYT::TUserJobSpec::AddFile. + /// These files are copied to tmpfs automatically and their total size + /// is computed automatically. + /// + /// @see NYT::TOperationOptions::MountSandboxInTmpfs + /// @see NYT::TUserJobSpec::MemoryLimit + FLUENT_FIELD_OPTION(i64, ExtraTmpfsSize); + + /// + /// @brief Maximum number of CPU cores for a single job to use. + FLUENT_FIELD_OPTION(double, CpuLimit); + + /// + /// @brief Fraction of @ref NYT::TUserJobSpec::MemoryLimit that job gets at start. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/operations_options#memory_reserve_factor + FLUENT_FIELD_OPTION(double, MemoryReserveFactor); + + /// + /// @brief Local path to executable to be used inside jobs. + /// + /// Provided executable must use C++ YT API library (this library) + /// and implement job class that is going to be used. + /// + /// This option might be useful if we want to start operation from nonlinux machines + /// (in that case we use `JobBinary` to provide path to the same program compiled for linux).
+ /// Other example of using this option is uploading executable to cypress in advance + /// and save the time required to upload current executable to cache. + /// `md5` argument can be used to save cpu time and disk IO when binary MD5 checksum is known. + /// When argument is not provided library will compute it itself. + TUserJobSpec& JobBinaryLocalPath(TString path, TMaybe<TString> md5 = Nothing()); + + /// + /// @brief Cypress path to executable to be used inside jobs. + TUserJobSpec& JobBinaryCypressPath(TString path, TMaybe<TTransactionId> transactionId = Nothing()); + + /// + /// @brief String that will be prepended to the command. + /// + /// This option overrides @ref NYT::TOperationOptions::JobCommandPrefix. + FLUENT_FIELD(TString, JobCommandPrefix); + + /// + /// @brief String that will be appended to the command. + /// + /// This option overrides @ref NYT::TOperationOptions::JobCommandSuffix. + FLUENT_FIELD(TString, JobCommandSuffix); + + /// + /// @brief Map of environment variables that will be set for jobs. + FLUENT_MAP_FIELD(TString, TString, Environment); + + /// + /// @brief Limit for all files inside job sandbox (in bytes). + FLUENT_FIELD_OPTION(ui64, DiskSpaceLimit); + + /// + /// @brief Number of ports reserved for the job (passed through environment in YT_PORT_0, YT_PORT_1, ...). + FLUENT_FIELD_OPTION(ui16, PortCount); + + /// + /// @brief Network project used to isolate job network. + FLUENT_FIELD_OPTION(TString, NetworkProject); + + /// + /// @brief Limit on job execution time. + /// + /// Jobs that exceed this limit will be considered failed. + FLUENT_FIELD_OPTION(TDuration, JobTimeLimit); + + /// + /// @brief Get job binary config. + const TJobBinaryConfig& GetJobBinary() const; + + /// + /// @brief List of profilers to run. 
+ FLUENT_VECTOR_FIELD(TJobProfilerSpec, JobProfiler); + +private: + TVector<std::tuple<TLocalFilePath, TAddLocalFileOptions>> LocalFiles_; + TJobBinaryConfig JobBinary_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Spec of Map operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/map +template <typename TDerived> +struct TMapOperationSpecBase + : public TUserOperationSpecBase<TDerived> + , public TWithAutoMergeSpec<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Spec of mapper job. + FLUENT_FIELD(TUserJobSpec, MapperSpec); + + /// + /// @brief Whether to guarantee the order of rows passed to mapper matches the order in the table. + /// + /// When `Ordered' is false (by default), there are no guarantees about the order of reading rows. + /// In this case mapper might work slightly faster because a row delivered from a fast node can be processed while YT waits + /// for response from slow nodes. + /// When `Ordered' is true, rows will come in order in which they are stored in input tables. + FLUENT_FIELD_OPTION(bool, Ordered); + + /// + /// @brief Recommended number of jobs to run. + /// + /// `JobCount' has higher priority than @ref NYT::TMapOperationSpecBase::DataSizePerJob. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui32, JobCount); + + /// + /// @brief Recommended data size for each job. + /// + /// `DataSizePerJob` has lower priority than @ref NYT::TMapOperationSpecBase::JobCount. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerJob); +}; + +/// +/// @brief Spec of Map operation.
+/// +/// @see https://yt.yandex-team.ru/docs/description/mr/map +struct TMapOperationSpec + : public TMapOperationSpecBase<TMapOperationSpec> + , public TOperationIOSpec<TMapOperationSpec> + , public TUserJobFormatHintsBase<TMapOperationSpec> +{ }; + +/// +/// @brief Spec of raw Map operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/map +struct TRawMapOperationSpec + : public TMapOperationSpecBase<TRawMapOperationSpec> + , public TSimpleRawOperationIoSpec<TRawMapOperationSpec> +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Spec of Reduce operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce +template <typename TDerived> +struct TReduceOperationSpecBase + : public TUserOperationSpecBase<TDerived> + , public TWithAutoMergeSpec<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Spec of reduce job. + FLUENT_FIELD(TUserJobSpec, ReducerSpec); + + /// + /// @brief Columns to sort rows by (must include `ReduceBy` as prefix). + FLUENT_FIELD(TSortColumns, SortBy); + + /// + /// @brief Columns to group rows by. + FLUENT_FIELD(TSortColumns, ReduceBy); + + /// + /// @brief Columns to join foreign tables by (must be prefix of `ReduceBy`). + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables + FLUENT_FIELD_OPTION(TSortColumns, JoinBy); + + /// + /// @brief Guarantee to feed all rows with same `ReduceBy` columns to a single job (`true` by default). + FLUENT_FIELD_OPTION(bool, EnableKeyGuarantee); + + /// + /// @brief Recommended number of jobs to run. + /// + /// `JobCount' has higher priority than @ref NYT::TReduceOperationSpecBase::DataSizePerJob. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui32, JobCount); + + /// + /// @brief Recommended data size for each job.
+ /// + /// `DataSizePerJob` has lower priority than @ref NYT::TReduceOperationSpecBase::JobCount. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerJob); +}; + +/// +/// @brief Spec of Reduce operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce +struct TReduceOperationSpec + : public TReduceOperationSpecBase<TReduceOperationSpec> + , public TOperationIOSpec<TReduceOperationSpec> + , public TUserJobFormatHintsBase<TReduceOperationSpec> +{ }; + +/// +/// @brief Spec of raw Reduce operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce +struct TRawReduceOperationSpec + : public TReduceOperationSpecBase<TRawReduceOperationSpec> + , public TSimpleRawOperationIoSpec<TRawReduceOperationSpec> +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Spec of JoinReduce operation. +/// +/// @deprecated Instead the user should run a reduce operation +/// with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables +template <typename TDerived> +struct TJoinReduceOperationSpecBase + : public TUserOperationSpecBase<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Spec of reduce job. + FLUENT_FIELD(TUserJobSpec, ReducerSpec); + + /// + /// @brief Columns to join foreign tables by (must be prefix of `ReduceBy`). + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables + FLUENT_FIELD(TSortColumns, JoinBy); + + /// + /// @brief Recommended number of jobs to run. + /// + /// `JobCount' has higher priority than @ref NYT::TJoinReduceOperationSpecBase::DataSizePerJob. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui32, JobCount); + + /// + /// @brief Recommended data size for each job. + /// + /// `DataSizePerJob` has lower priority than @ref NYT::TJoinReduceOperationSpecBase::JobCount. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerJob); +}; + +/// +/// @brief Spec of JoinReduce operation. +/// +/// @deprecated Instead the user should run a reduce operation +/// with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables +struct TJoinReduceOperationSpec + : public TJoinReduceOperationSpecBase<TJoinReduceOperationSpec> + , public TOperationIOSpec<TJoinReduceOperationSpec> + , public TUserJobFormatHintsBase<TJoinReduceOperationSpec> +{ }; + +/// +/// @brief Spec of raw JoinReduce operation. +/// +/// @deprecated Instead the user should run a reduce operation +/// with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables +struct TRawJoinReduceOperationSpec + : public TJoinReduceOperationSpecBase<TRawJoinReduceOperationSpec> + , public TSimpleRawOperationIoSpec<TRawJoinReduceOperationSpec> +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Spec of MapReduce operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce +template <typename TDerived> +struct TMapReduceOperationSpecBase + : public TUserOperationSpecBase<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Spec of map job. + FLUENT_FIELD(TUserJobSpec, MapperSpec); + + /// + /// @brief Spec of reduce job. + FLUENT_FIELD(TUserJobSpec, ReducerSpec); + + /// + /// @brief Spec of reduce combiner.
+ FLUENT_FIELD(TUserJobSpec, ReduceCombinerSpec); + + /// + /// @brief Columns to sort rows by (must include `ReduceBy` as prefix). + FLUENT_FIELD(TSortColumns, SortBy); + + /// + /// @brief Columns to group rows by. + FLUENT_FIELD(TSortColumns, ReduceBy); + + /// + /// @brief Recommended number of map jobs to run. + /// + /// `MapJobCount` has higher priority than @ref NYT::TMapReduceOperationSpecBase::DataSizePerMapJob. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui32, MapJobCount); + + /// + /// @brief Recommended data size for each map job. + /// + /// `DataSizePerMapJob` has lower priority than @ref NYT::TMapReduceOperationSpecBase::MapJobCount. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerMapJob); + + /// + /// @brief Recommended number of intermediate data partitions. + FLUENT_FIELD_OPTION(ui64, PartitionCount); + + /// + /// @brief Recommended size of intermediate data partitions. + FLUENT_FIELD_OPTION(ui64, PartitionDataSize); + + /// + /// @brief Account to use for intermediate data. + FLUENT_FIELD_OPTION(TString, IntermediateDataAccount); + + /// + /// @brief Replication factor for intermediate data (1 by default). + FLUENT_FIELD_OPTION(ui64, IntermediateDataReplicationFactor); + + /// + /// @brief Recommended size of data to be passed to a single reduce combiner. + FLUENT_FIELD_OPTION(ui64, DataSizePerSortJob); + + /// + /// @brief Whether to guarantee the order of rows passed to mapper matches the order in the table. + /// + /// @see @ref NYT::TMapOperationSpec::Ordered for more info. + FLUENT_FIELD_OPTION(bool, Ordered); + + /// + /// @brief Guarantee to run reduce combiner before reducer. + FLUENT_FIELD_OPTION(bool, ForceReduceCombiners); +}; + +/// +/// @brief Spec of MapReduce operation. 
+/// +/// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce +struct TMapReduceOperationSpec + : public TMapReduceOperationSpecBase<TMapReduceOperationSpec> + , public TOperationIOSpec<TMapReduceOperationSpec> + , public TIntermediateTablesHintSpec<TMapReduceOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TMapReduceOperationSpec; + /// @endcond + + /// + /// @brief Format hints for mapper. + FLUENT_FIELD_DEFAULT(TUserJobFormatHints, MapperFormatHints, TUserJobFormatHints()); + + /// + /// @brief Format hints for reducer. + FLUENT_FIELD_DEFAULT(TUserJobFormatHints, ReducerFormatHints, TUserJobFormatHints()); + + /// + /// @brief Format hints for reduce combiner. + FLUENT_FIELD_DEFAULT(TUserJobFormatHints, ReduceCombinerFormatHints, TUserJobFormatHints()); +}; + +/// +/// @brief Spec of raw MapReduce operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce +struct TRawMapReduceOperationSpec + : public TMapReduceOperationSpecBase<TRawMapReduceOperationSpec> + , public TRawMapReduceOperationIoSpec<TRawMapReduceOperationSpec> +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Schema inference mode. +/// +/// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference +enum class ESchemaInferenceMode : int +{ + FromInput /* "from_input" */, + FromOutput /* "from_output" */, + Auto /* "auto" */, +}; + +/// +/// @brief Spec of Sort operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/sort +struct TSortOperationSpec + : TOperationSpecBase<TSortOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TSortOperationSpec; + /// @endcond + + /// + /// @brief Paths to input tables. + FLUENT_VECTOR_FIELD(TRichYPath, Input); + + /// + /// @brief Path to output table. + FLUENT_FIELD(TRichYPath, Output); + + /// + /// @brief Columns to sort table by. 
+ FLUENT_FIELD(TSortColumns, SortBy); + + /// + /// @brief Recommended number of intermediate data partitions. + FLUENT_FIELD_OPTION(ui64, PartitionCount); + + /// + /// @brief Recommended size of intermediate data partitions. + FLUENT_FIELD_OPTION(ui64, PartitionDataSize); + + /// + /// @brief Recommended number of partition jobs to run. + /// + /// `PartitionJobCount` has higher priority than @ref NYT::TSortOperationSpec::DataSizePerPartitionJob. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, PartitionJobCount); + + /// + /// @brief Recommended data size for each partition job. + /// + /// `DataSizePerPartitionJob` has lower priority than @ref NYT::TSortOperationSpec::PartitionJobCount. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerPartitionJob); + + /// + /// @brief Inference mode for output table schema. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference + FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode); + + /// + /// @brief Account to use for intermediate data. + FLUENT_FIELD_OPTION(TString, IntermediateDataAccount); + + /// + /// @brief Replication factor for intermediate data (1 by default). + FLUENT_FIELD_OPTION(ui64, IntermediateDataReplicationFactor); +}; + + +/// +/// @brief Merge mode. +enum EMergeMode : int +{ + MM_UNORDERED /* "unordered" */, + MM_ORDERED /* "ordered" */, + MM_SORTED /* "sorted" */, +}; + +/// +/// @brief Spec of Merge operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/merge +struct TMergeOperationSpec + : TOperationSpecBase<TMergeOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TMergeOperationSpec; + /// @endcond + + /// + /// @brief Paths to input tables. + FLUENT_VECTOR_FIELD(TRichYPath, Input); + + /// + /// @brief Path to output table. 
+ FLUENT_FIELD(TRichYPath, Output); + + /// + /// @brief Columns by which to merge (for @ref NYT::EMergeMode::MM_SORTED). + FLUENT_FIELD(TSortColumns, MergeBy); + + /// + /// @brief Merge mode. + FLUENT_FIELD_DEFAULT(EMergeMode, Mode, MM_UNORDERED); + + /// + /// @brief Combine output chunks to larger ones. + FLUENT_FIELD_DEFAULT(bool, CombineChunks, false); + + /// + /// @brief Guarantee that all input chunks will be read. + FLUENT_FIELD_DEFAULT(bool, ForceTransform, false); + + /// + /// @brief Recommended number of jobs to run. + /// + /// `JobCount' has higher priority than @ref NYT::TMergeOperationSpec::DataSizePerJob. + /// This option only provide a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui32, JobCount); + + /// + /// @brief Recommended of data size for each job. + /// + /// `DataSizePerJob` has lower priority that @ref NYT::TMergeOperationSpec::JobCount. + /// This option only provide a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerJob); + + /// + /// @brief Inference mode for output table schema. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference + FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode); +}; + +/// +/// @brief Spec of Erase operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/erase +struct TEraseOperationSpec + : TOperationSpecBase<TEraseOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TEraseOperationSpec; + /// @endcond + + /// + /// @brief Which table (or row range) to erase. + FLUENT_FIELD(TRichYPath, TablePath); + + /// + /// Combine output chunks to larger ones. + FLUENT_FIELD_DEFAULT(bool, CombineChunks, false); + + /// + /// @brief Inference mode for output table schema. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference + FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode); +}; + +/// +/// @brief Spec of RemoteCopy operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/remote_copy +struct TRemoteCopyOperationSpec + : TOperationSpecBase<TRemoteCopyOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TRemoteCopyOperationSpec; + /// @endcond + + /// + /// @brief Source cluster name. + FLUENT_FIELD(TString, ClusterName); + + /// + /// @brief Network to use for copy (all remote cluster nodes must have it configured). + FLUENT_FIELD_OPTION(TString, NetworkName); + + /// + /// @brief Paths to input tables. + FLUENT_VECTOR_FIELD(TRichYPath, Input); + + /// + /// @brief Path to output table. + FLUENT_FIELD(TRichYPath, Output); + + /// + /// @brief Inference mode for output table schema. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference + FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode); + + /// + /// @brief Copy user attributes from input to output table (allowed only for single input table). + FLUENT_FIELD_DEFAULT(bool, CopyAttributes, false); + + /// + /// @brief Names of user attributes to copy from input to output table. + /// + /// @note To make this option make sense set @ref NYT::TRemoteCopyOperationSpec::CopyAttributes to `true`. + FLUENT_VECTOR_FIELD(TString, AttributeKey); + +private: + + /// + /// @brief Config for remote cluster connection. + FLUENT_FIELD_OPTION(TNode, ClusterConnection); +}; + +class IVanillaJobBase; + +/// +/// @brief Task of Vanilla operation. 
+/// +/// @see https://yt.yandex-team.ru/docs/description/mr/vanilla +struct TVanillaTask + : public TOperationOutputSpecBase + , public TUserJobOutputFormatHintsBase<TVanillaTask> +{ + /// @cond Doxygen_Suppress + using TSelf = TVanillaTask; + /// @endcond + + /// + /// @brief Add output table path and specify the task output type (i.e. TMyProtoMessage). + template <class T> + TSelf& AddOutput(const TRichYPath& path); + + /// + /// @brief Add output table path as structured path. + TSelf& AddStructuredOutput(TStructuredTablePath path); + + /// + /// @brief Set output table path and specify the task output type (i.e. TMyProtoMessage). + template <class T> + TSelf& SetOutput(size_t tableIndex, const TRichYPath& path); + + /// + /// @brief Task name. + FLUENT_FIELD(TString, Name); + + /// + /// @brief Job to be executed in this task. + FLUENT_FIELD(::TIntrusivePtr<IVanillaJobBase>, Job); + + /// + /// @brief User job spec. + FLUENT_FIELD(TUserJobSpec, Spec); + + /// + /// @brief Number of jobs to run and wait for successful completion. + /// + /// @note If @ref NYT::TUserOperationSpecBase::FailOnJobRestart is `false`, a failed job will be restarted + /// and will not count in this amount. + FLUENT_FIELD(ui64, JobCount); + + /// + /// @brief Network project name. + FLUENT_FIELD(TMaybe<TString>, NetworkProject); + +}; + +/// +/// @brief Spec of Vanilla operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/vanilla +struct TVanillaOperationSpec + : TUserOperationSpecBase<TVanillaOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TVanillaOperationSpec; + /// @endcond + + /// + /// @brief Description of tasks to run in this operation. + FLUENT_VECTOR_FIELD(TVanillaTask, Task); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Options for @ref NYT::IOperationClient::Map and other operation start commands. 
+struct TOperationOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TOperationOptions; + /// @endcond + + /// + /// @brief Additional field to put to operation spec. + FLUENT_FIELD_OPTION(TNode, Spec); + + /// + /// @brief Start operation mode. + enum class EStartOperationMode : int + { + /// + /// @brief Prepare operation asynchronously. Call IOperation::Start() to start operation. + AsyncPrepare, + + /// + /// @brief Prepare and start operation asynchronously. Don't wait for operation completion. + AsyncStart, + + /// + /// @brief Prepare and start operation synchronously. Don't wait for operation completion. + SyncStart, + + /// + /// @brief Prepare, start and wait for operation completion synchronously. + SyncWait, + }; + + /// + /// @brief Start operation mode. + FLUENT_FIELD_DEFAULT(EStartOperationMode, StartOperationMode, EStartOperationMode::SyncWait); + + /// + /// @brief Wait for operation finish synchronously. + /// + /// @deprecated Use StartOperationMode() instead. + TSelf& Wait(bool value) { + StartOperationMode_ = value ? EStartOperationMode::SyncWait : EStartOperationMode::SyncStart; + return static_cast<TSelf&>(*this); + } + + /// + /// + /// @brief Use format from table attribute (for YAMR-like format). + /// + /// @deprecated + FLUENT_FIELD_DEFAULT(bool, UseTableFormats, false); + + /// + /// @brief Prefix for bash command running the jobs. + /// + /// Can be overridden for the specific job type in the @ref NYT::TUserJobSpec. + FLUENT_FIELD(TString, JobCommandPrefix); + + /// + /// @brief Suffix for bash command running the jobs. + /// + /// Can be overridden for the specific job type in the @ref NYT::TUserJobSpec. + FLUENT_FIELD(TString, JobCommandSuffix); + + /// + /// @brief Put all files required by the job into tmpfs. + /// + /// This option can be set globally using @ref NYT::TConfig::MountSandboxInTmpfs. 
+ /// @see https://yt.yandex-team.ru/docs/problems/woodpeckers + FLUENT_FIELD_DEFAULT(bool, MountSandboxInTmpfs, false); + + /// + /// @brief Path to directory to store temporary files. + FLUENT_FIELD_OPTION(TString, FileStorage); + + /// + /// @brief Expiration timeout for uploaded files. + FLUENT_FIELD_OPTION(TDuration, FileExpirationTimeout); + + /// + /// @brief Info to be passed securely to the job. + FLUENT_FIELD_OPTION(TNode, SecureVault); + + /// + /// @brief File cache mode. + enum class EFileCacheMode : int + { + /// + /// @brief Use YT API commands "get_file_from_cache" and "put_file_to_cache". + ApiCommandBased, + + /// + /// @brief Upload files to random paths inside @ref NYT::TOperationOptions::FileStorage without caching. + CachelessRandomPathUpload, + }; + + /// + /// @brief File cache mode. + FLUENT_FIELD_DEFAULT(EFileCacheMode, FileCacheMode, EFileCacheMode::ApiCommandBased); + + /// + /// @brief Id of transaction within which all Cypress file storage entries will be checked/created. + /// + /// By default, the root transaction is used. + /// + /// @note Set a specific transaction only if you + /// 1. specify non-default file storage path in @ref NYT::TOperationOptions::FileStorage or in @ref NYT::TConfig::RemoteTempFilesDirectory. + /// 2. use `CachelessRandomPathUpload` caching mode (@ref NYT::TOperationOptions::FileCacheMode). + FLUENT_FIELD(TTransactionId, FileStorageTransactionId); + + /// + /// @brief Ensure stderr and core tables exist before starting operation. + /// + /// If set to `false`, it is user's responsibility to ensure these tables exist. + FLUENT_FIELD_DEFAULT(bool, CreateDebugOutputTables, true); + + /// + /// @brief Ensure output tables exist before starting operation. + /// + /// If set to `false`, it is user's responsibility to ensure output tables exist. + FLUENT_FIELD_DEFAULT(bool, CreateOutputTables, true); + + /// + /// @brief Try to infer schema of inexistent table from the type of written rows. 
+ /// + /// @note Default values for this option may differ depending on the row type. + /// For protobuf it's currently `false` by default. + FLUENT_FIELD_OPTION(bool, InferOutputSchema); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Get operation secure vault (specified in @ref NYT::TOperationOptions::SecureVault) inside a job. +const TNode& GetJobSecureVault(); + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Context passed to @ref NYT::IRawJob::Do. +class TRawJobContext +{ +public: + explicit TRawJobContext(size_t outputTableCount); + + /// + /// @brief Get file corresponding to input stream. + const TFile& GetInputFile() const; + + /// + /// @brief Get files corresponding to output streams. + const TVector<TFile>& GetOutputFileList() const; + +private: + TFile InputFile_; + TVector<TFile> OutputFileList_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface for classes that can be Saved/Loaded (to be used with @ref Y_SAVELOAD_JOB). +class ISerializableForJob +{ +public: + virtual ~ISerializableForJob() = default; + + /// + /// @brief Dump state to output stream to be restored in job. + virtual void Save(IOutputStream& stream) const = 0; + + /// + /// @brief Load state from a stream. + virtual void Load(IInputStream& stream) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Provider of information about operation inputs/outputs during @ref NYT::IJob::PrepareOperation. +class IOperationPreparationContext +{ +public: + virtual ~IOperationPreparationContext() = default; + + /// @brief Get the number of input tables. + virtual int GetInputCount() const = 0; + + /// @brief Get the number of output tables. + virtual int GetOutputCount() const = 0; + + /// @brief Get the schema of input table no. `index`. 
+ virtual const TTableSchema& GetInputSchema(int index) const = 0; + + /// @brief Get all the input table schemas. + virtual const TVector<TTableSchema>& GetInputSchemas() const = 0; + + /// @brief Path to the input table if available (`Nothing()` for intermediate tables). + virtual TMaybe<TYPath> GetInputPath(int index) const = 0; + + /// @brief Path to the output table if available (`Nothing()` for intermediate tables). + virtual TMaybe<TYPath> GetOutputPath(int index) const = 0; +}; + +/// +/// @brief Fluent builder class for @ref NYT::IJob::PrepareOperation. +/// +/// @note Method calls are supposed to be chained. +class TJobOperationPreparer +{ +public: + + /// + /// @brief Group of input tables that allows to specify properties on all of them at once. + /// + /// The instances are created with @ref NYT::TJobOperationPreparer::BeginInputGroup, not directly. + class TInputGroup + { + public: + TInputGroup(TJobOperationPreparer& preparer, TVector<int> indices); + + /// @brief Specify the type of input rows. + template <typename TRow> + TInputGroup& Description(); + + /// @brief Specify renaming of input columns. + TInputGroup& ColumnRenaming(const THashMap<TString, TString>& renaming); + + /// @brief Specify what input columns to send to job + /// + /// @note Filter is applied before renaming, so it must specify original column names. + TInputGroup& ColumnFilter(const TVector<TString>& columns); + + /// @brief Finish describing the input group. + TJobOperationPreparer& EndInputGroup(); + + private: + TJobOperationPreparer& Preparer_; + TVector<int> Indices_; + }; + + /// + /// @brief Group of output tables that allows to specify properties on all of them at once. + /// + /// The instances are created with @ref NYT::TJobOperationPreparer::BeginOutputGroup, not directly. + class TOutputGroup + { + public: + TOutputGroup(TJobOperationPreparer& preparer, TVector<int> indices); + + /// @brief Specify the type of output rows. 
+ /// + /// @tparam TRow type of output rows from tables of this group. + /// @param inferSchema Infer schema from `TRow` and specify it for these output tables. + template <typename TRow> + TOutputGroup& Description(bool inferSchema = true); + + /// @brief Specify schema for these tables. + TOutputGroup& Schema(const TTableSchema& schema); + + /// @brief Specify that all the tables in this group are unschematized. + /// + /// It is equivalent to `.Schema(TTableSchema().Strict(false))`. + TOutputGroup& NoSchema(); + + /// @brief Finish describing the output group. + TJobOperationPreparer& EndOutputGroup(); + + private: + TJobOperationPreparer& Preparer_; + TVector<int> Indices_; + }; + +public: + explicit TJobOperationPreparer(const IOperationPreparationContext& context); + + /// @brief Begin input group consisting of tables with indices `[begin, end)`. + /// + /// @param begin First index. + /// @param end Index after the last one. + TInputGroup BeginInputGroup(int begin, int end); + + /// @brief Begin input group consisting of tables with indices from `indices`. + /// + /// @tparam TCont Container with integers. Must support `std::begin` and `std::end` functions. + /// @param indices Indices of tables to include in the group. + template <typename TCont> + TInputGroup BeginInputGroup(const TCont& indices); + + /// @brief Begin output group consisting of tables with indices `[begin, end)`. + /// + /// @param begin First index. + /// @param end Index after the last one. + TOutputGroup BeginOutputGroup(int begin, int end); + + /// @brief Begin output group consisting of tables with indices from `indices`. + /// + /// @tparam TCont Container with integers. Must support `std::begin` and `std::end` functions. + /// @param indices Indices of tables to include in the group. + template <typename TCont> + TOutputGroup BeginOutputGroup(const TCont& indices); + + /// @brief Specify the schema for output table no. `tableIndex`. 
+ /// + /// @note All the output schemas must be specified either with this method, `NoOutputSchema` or `OutputDescription` with `inferSchema == true` + TJobOperationPreparer& OutputSchema(int tableIndex, TTableSchema schema); + + /// @brief Mark the output table no. `tableIndex` as unschematized. + TJobOperationPreparer& NoOutputSchema(int tableIndex); + + /// @brief Specify renaming of input columns for table no. `tableIndex`. + TJobOperationPreparer& InputColumnRenaming(int tableIndex, const THashMap<TString, TString>& renaming); + + /// @brief Specify what input columns of table no. `tableIndex` to send to job + /// + /// @note Filter is applied before renaming, so it must specify original column names. + TJobOperationPreparer& InputColumnFilter(int tableIndex, const TVector<TString>& columns); + + /// @brief Specify the type of input rows for table no. `tableIndex`. + /// + /// @tparam TRow type of input rows. + template <typename TRow> + TJobOperationPreparer& InputDescription(int tableIndex); + + /// @brief Specify the type of output rows for table no. `tableIndex`. + /// + /// @tparam TRow type of output rows. + /// @param inferSchema Infer schema from `TRow` and specify it for the output tables. + template <typename TRow> + TJobOperationPreparer& OutputDescription(int tableIndex, bool inferSchema = true); + + /// @brief Set type of output rows for table no. `tableIndex` to TNode + /// + /// @note Set schema via `OutputSchema` if needed + TJobOperationPreparer& NodeOutput(int tableIndex); + + /// @brief Specify input format hints. + /// + /// These hints have lower priority than ones specified in spec. + TJobOperationPreparer& InputFormatHints(TFormatHints hints); + + /// @brief Specify output format hints. + /// + /// These hints have lower priority than ones specified in spec. + TJobOperationPreparer& OutputFormatHints(TFormatHints hints); + + /// @brief Specify format hints. + /// + /// These hints have lower priority than ones specified in spec. 
+ TJobOperationPreparer& FormatHints(TUserJobFormatHints newFormatHints); + + /// @name "Private" members + /// The following methods should not be used by clients in @ref NYT::IJob::PrepareOperation + ///@{ + + /// @brief Finish the building process. + void Finish(); + + /// @brief Get output table schemas as specified by the user. + TVector<TTableSchema> GetOutputSchemas(); + + /// @brief Get input column renamings as specified by the user. + const TVector<THashMap<TString, TString>>& GetInputColumnRenamings() const; + + /// @brief Get input column filters as specified by the user. + const TVector<TMaybe<TVector<TString>>>& GetInputColumnFilters() const; + + /// @brief Get input column descriptions as specified by the user. + const TVector<TMaybe<TTableStructure>>& GetInputDescriptions() const; + + /// @brief Get output column descriptions as specified by the user. + const TVector<TMaybe<TTableStructure>>& GetOutputDescriptions() const; + + /// @brief Get format hints as specified by the user. + const TUserJobFormatHints& GetFormatHints() const; + + ///@} +private: + + /// @brief Validate that schema for output table no. `tableIndex` has not been set yet. + void ValidateMissingOutputSchema(int tableIndex) const; + + /// @brief Validate that description for input table no. `tableIndex` has not been set yet. + void ValidateMissingInputDescription(int tableIndex) const; + + /// @brief Validate that description for output table no. `tableIndex` has not been set yet. + void ValidateMissingOutputDescription(int tableIndex) const; + + /// @brief Validate that `tableIndex` is in correct range for input table indices. + /// + /// @param message Message to add to the exception in case of violation. + void ValidateInputTableIndex(int tableIndex, TStringBuf message) const; + + /// @brief Validate that `tableIndex` is in correct range for output table indices. + /// + /// @param message Message to add to the exception in case of violation. 
+ void ValidateOutputTableIndex(int tableIndex, TStringBuf message) const; + + /// @brief Validate that all the output schemas has been set. + void FinallyValidate() const; + + static TTableSchema EmptyNonstrictSchema(); + +private: + const IOperationPreparationContext& Context_; + + TVector<TMaybe<TTableSchema>> OutputSchemas_; + TVector<THashMap<TString, TString>> InputColumnRenamings_; + TVector<TMaybe<TVector<TString>>> InputColumnFilters_; + TVector<TMaybe<TTableStructure>> InputTableDescriptions_; + TVector<TMaybe<TTableStructure>> OutputTableDescriptions_; + TUserJobFormatHints FormatHints_ = {}; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface for all user jobs. +class IJob + : public TThrRefBase +{ +public: + + /// + /// @brief Type of job. + enum EType + { + Mapper, + Reducer, + ReducerAggregator, + RawJob, + VanillaJob, + }; + + /// + /// @brief Save job state to stream to be restored on cluster nodes. + virtual void Save(IOutputStream& stream) const + { + Y_UNUSED(stream); + } + + /// + /// @brief Restore job state from a stream. + virtual void Load(IInputStream& stream) + { + Y_UNUSED(stream); + } + + /// + /// @brief Get operation secure vault (specified in @ref NYT::TOperationOptions::SecureVault) inside a job. + const TNode& SecureVault() const + { + return GetJobSecureVault(); + } + + /// + /// @brief Get number of output tables. + i64 GetOutputTableCount() const + { + Y_VERIFY(NDetail::OutputTableCount > 0); + + return NDetail::OutputTableCount; + } + + /// + /// @brief Method allowing user to control some properties of input and output tables and formats. + /// + /// User can override this method in their job class to: + /// - specify output table schemas. 
+ /// The most natural way is usually through @ref NYT::TJobOperationPreparer::OutputDescription (especially for protobuf), + /// but you can use @ref NYT::TJobOperationPreparer::OutputSchema directly + /// - specify output row type (@ref NYT::TJobOperationPreparer::OutputDescription) + /// - specify input row type (@ref NYT::TJobOperationPreparer::InputDescription) + /// - specify input column filter and renaming (@ref NYT::TJobOperationPreparer::InputColumnFilter and @ref NYT::TJobOperationPreparer::InputColumnRenaming) + /// - specify format hints (@ref NYT::TJobOperationPreparer::InputFormatHints, + /// NYT::TJobOperationPreparer::OutputFormatHints and @ref NYT::TJobOperationPreparer::FormatHints) + /// - maybe something more, cf. the methods of @ref NYT::TJobOperationPreparer. + /// + /// If one has several similar tables, groups can be used. + /// Groups are delimited by @ref NYT::TJobOperationPreparer::BeginInputGroup / + /// @ref NYT::TJobOperationPreparer::TInputGroup::EndInputGroup and + /// @ref NYT::TJobOperationPreparer::BeginOutputGroup / + /// @ref NYT::TJobOperationPreparer::TOutputGroup::EndOutputGroup. + /// Example: + /// @code{.cpp} + /// preparer + /// .BeginInputGroup({1,2,4,8}) + /// .ColumnRenaming({{"a", "b"}, {"c", "d"}}) + /// .ColumnFilter({"a", "c"}) + /// .EndInputGroup(); + /// @endcode + /// + /// @note All the output table schemas must be set + /// (possibly as empty nonstrict using @ref NYT::TJobOperationPreparer::NoOutputSchema or + /// @ref NYT::TJobOperationPreparer::TOutputGroup::NoSchema). + /// By default all the output table schemas are marked as empty nonstrict. + virtual void PrepareOperation(const IOperationPreparationContext& context, TJobOperationPreparer& preparer) const; +}; + +/// +/// @brief Declare what fields of currently declared job class to save and restore on cluster node. +#define Y_SAVELOAD_JOB(...) 
\ + virtual void Save(IOutputStream& stream) const override { Save(&stream); } \ + virtual void Load(IInputStream& stream) override { Load(&stream); } \ + Y_PASS_VA_ARGS(Y_SAVELOAD_DEFINE(__VA_ARGS__)) + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface for jobs with typed inputs and outputs. +class IStructuredJob + : public IJob +{ +public: + /// + /// @brief These methods are called when creating table reader and writer for the job. + /// + /// Override them if you want to implement custom input logic (e.g. additional buffering). + virtual TRawTableReaderPtr CreateCustomRawJobReader(int fd) const; + virtual THolder<IProxyOutput> CreateCustomRawJobWriter(size_t outputTableCount) const; + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const = 0; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Create default raw job reader. +TRawTableReaderPtr CreateRawJobReader(int fd = 0); + +/// +/// @brief Create default raw job writer. +THolder<IProxyOutput> CreateRawJobWriter(size_t outputTableCount); + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Base interface for structured (typed) map jobs. +class IMapperBase + : public IStructuredJob +{ }; + +/// +/// @brief Base interface for structured (typed) map jobs with given reader and writer. +template <class TR, class TW> +class IMapper + : public IMapperBase +{ +public: + using TReader = TR; + using TWriter = TW; + +public: + /// Type of job implemented by this class. + static constexpr EType JobType = EType::Mapper; + + /// + /// @brief This method is called before feeding input rows to mapper (before `Do` method). 
+ virtual void Start(TWriter* writer) + { + Y_UNUSED(writer); + } + + /// + /// @brief This method is called exactly once for the whole job input. + /// + /// Read input rows from `reader` and write output ones to `writer`. + virtual void Do(TReader* reader, TWriter* writer) = 0; + + /// + /// @brief This method is called after feeding input rows to mapper (after `Do` method). + virtual void Finish(TWriter* writer) + { + Y_UNUSED(writer); + } + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Base interface for structured (typed) reduce jobs. +/// +/// It is common base for @ref NYT::IReducer and @ref NYT::IAggregatorReducer. +class IReducerBase + : public IStructuredJob +{ }; + +/// +/// @brief Base interface for structured (typed) reduce jobs with given reader and writer. +template <class TR, class TW> +class IReducer + : public IReducerBase +{ +public: + using TReader = TR; + using TWriter = TW; + +public: + /// Type of job implemented by this class. + static constexpr EType JobType = EType::Reducer; + +public: + + /// + /// @brief This method is called before feeding input rows to reducer (before `Do` method). + virtual void Start(TWriter* writer) + { + Y_UNUSED(writer); + } + + /// + /// @brief This method is called exactly once for each range with same value of `ReduceBy` (or `JoinBy`) keys. + virtual void Do(TReader* reader, TWriter* writer) = 0; + + /// + /// @brief This method is called after feeding input rows to reducer (after `Do` method). + virtual void Finish(TWriter* writer) + { + Y_UNUSED(writer); + } + + /// + /// @brief Refuse to process the remaining row ranges and finish the job (successfully). 
+ void Break(); + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Base interface of jobs used inside reduce operations. +/// +/// Unlike @ref NYT::IReducer jobs their `Do' method is called only once +/// and takes whole range of records split by key boundaries. +/// +/// Template argument `TR` must be @ref NYT::TTableRangesReader. +template <class TR, class TW> +class IAggregatorReducer + : public IReducerBase +{ +public: + using TReader = TR; + using TWriter = TW; + +public: + /// Type of job implemented by this class. + static constexpr EType JobType = EType::ReducerAggregator; + +public: + /// + /// @brief This method is called before feeding input rows to reducer (before `Do` method). + virtual void Start(TWriter* writer) + { + Y_UNUSED(writer); + } + + /// + /// @brief This method is called exactly once for the whole job input. + virtual void Do(TReader* reader, TWriter* writer) = 0; + + /// + /// @brief This method is called after feeding input rows to reducer (after `Do` method). + virtual void Finish(TWriter* writer) + { + Y_UNUSED(writer); + } + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface for raw jobs (i.e. reading and writing byte streams). +class IRawJob + : public IJob +{ +public: + /// Type of job implemented by this class. + static constexpr EType JobType = EType::RawJob; + + /// + /// @brief This method is called exactly once for the whole job input. 
+ virtual void Do(const TRawJobContext& jobContext) = 0; +}; + +/// +/// @brief Interface of jobs that run the given bash command. +class ICommandJob + : public IJob +{ +public: + /// + /// @brief Get bash command to run. + /// + /// @note This method is called on the client side. + virtual const TString& GetCommand() const = 0; +}; + +/// +/// @brief Raw job executing given bash command. +/// +/// @note The binary will not be uploaded. +class TCommandRawJob + : public IRawJob + , public ICommandJob +{ +public: + /// + /// @brief Create job with specified command. + /// + /// @param command Bash command to run. + explicit TCommandRawJob(TStringBuf command = {}); + + const TString& GetCommand() const override; + void Do(const TRawJobContext& jobContext) override; + +private: + TString Command_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Base interface for vanilla jobs. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/vanilla +class IVanillaJobBase + : public virtual IStructuredJob +{ +public: + /// Type of job implemented by this class. + static constexpr EType JobType = EType::VanillaJob; +}; + +template <class TW = void> +class IVanillaJob; + +/// +/// @brief Interface of vanilla job without outputs. +template <> +class IVanillaJob<void> + : public IVanillaJobBase +{ +public: + /// + /// @brief This method is called exactly once for each vanilla job. + virtual void Do() = 0; + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override; +}; + +/// +/// @brief Vanilla job executing given bash command. +/// +/// @note The binary will not be uploaded. +class TCommandVanillaJob + : public IVanillaJob<> + , public ICommandJob +{ +public: + /// + /// @brief Create job with specified command. + /// + /// @param command Bash command to run. 
+ explicit TCommandVanillaJob(TStringBuf command = {}); + + const TString& GetCommand() const override; + void Do() override; + +private: + TString Command_; +}; + +/// +/// @brief Interface for vanilla jobs with output tables. +template <class TW> +class IVanillaJob + : public IVanillaJobBase +{ +public: + using TWriter = TW; + +public: + /// + /// @brief This method is called before `Do` method. + virtual void Start(TWriter* /* writer */) + { } + + /// + /// @brief This method is called exactly once for each vanilla job. + /// + /// Write output rows to `writer`. + virtual void Do(TWriter* writer) = 0; + + /// + /// @brief This method is called after `Do` method. + virtual void Finish(TWriter* /* writer */) + { } + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Attributes to request for an operation. +enum class EOperationAttribute : int +{ + Id /* "id" */, + Type /* "type" */, + State /* "state" */, + AuthenticatedUser /* "authenticated_user" */, + StartTime /* "start_time" */, + FinishTime /* "finish_time" */, + BriefProgress /* "brief_progress" */, + BriefSpec /* "brief_spec" */, + Suspended /* "suspended" */, + Result /* "result" */, + Progress /* "progress" */, + Events /* "events" */, + Spec /* "spec" */, + FullSpec /* "full_spec" */, + UnrecognizedSpec /* "unrecognized_spec" */, +}; + +/// +/// @brief Class describing which attributes to request in @ref NYT::IClient::GetOperation or @ref NYT::IClient::ListOperations. +struct TOperationAttributeFilter +{ + /// @cond Doxygen_Suppress + using TSelf = TOperationAttributeFilter; + /// @endcond + + TVector<EOperationAttribute> Attributes_; + + /// + /// @brief Add attribute to the filter. Calls are supposed to be chained. 
+ TSelf& Add(EOperationAttribute attribute) + { + Attributes_.push_back(attribute); + return *this; + } +}; + +/// +/// @brief Options for @ref NYT::IClient::GetOperation call. +struct TGetOperationOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetOperationOptions; + /// @endcond + + /// + /// @brief What attributes to request (if omitted, the default set of attributes will be requested). + FLUENT_FIELD_OPTION(TOperationAttributeFilter, AttributeFilter); +}; + +/// +/// @brief "Coarse-grained" state of an operation. +enum class EOperationBriefState : int +{ + InProgress /* "in_progress" */, + Completed /* "completed" */, + Aborted /* "aborted" */, + + /// Failed + Failed /* "failed" */, +}; + +/// +/// @brief Operation type. +enum class EOperationType : int +{ + Map /* "map" */, + Merge /* "merge" */, + Erase /* "erase" */, + Sort /* "sort" */, + Reduce /* "reduce" */, + MapReduce /* "map_reduce" */, + RemoteCopy /* "remote_copy" */, + JoinReduce /* "join_reduce" */, + Vanilla /* "vanilla" */, +}; + +/// +/// @brief Operation progress. +struct TOperationProgress +{ + /// + /// @brief Total job statistics. + TJobStatistics JobStatistics; + + /// + /// @brief Job counter for various job states with hierarchy. + TJobCounters JobCounters; + + /// + /// @brief Time when this progress was built on scheduler or CA. + TMaybe<TInstant> BuildTime; +}; + +/// +/// @brief Brief operation progress (numbers of jobs in these states). +struct TOperationBriefProgress +{ + ui64 Aborted = 0; + ui64 Completed = 0; + ui64 Failed = 0; + ui64 Lost = 0; + ui64 Pending = 0; + ui64 Running = 0; + ui64 Total = 0; +}; + +/// +/// @brief Operation result. +struct TOperationResult +{ + /// + /// @brief For a unsuccessfully finished operation: description of error. + TMaybe<TYtError> Error; +}; + +/// +/// @brief Operation event (change of state). +struct TOperationEvent +{ + /// + /// @brief New state of operation. + TString State; + + /// + /// @brief Time of state change. 
+ TInstant Time; +}; + +/// +/// @brief Operation info. +/// +/// A field may be `Nothing()` either if it was not requested (see @ref NYT::TGetOperationOptions::AttributeFilter) +/// or it is not available (i.e. `FinishTime` for a running operation). +/// @see https://yt.yandex-team.ru/docs/api/commands#get_operation +struct TOperationAttributes +{ + /// + /// @brief Operation id. + TMaybe<TOperationId> Id; + + /// + /// @brief Operation type. + TMaybe<EOperationType> Type; + + /// + /// @brief Operation state. + TMaybe<TString> State; + + /// + /// @brief "Coarse-grained" operation state. + TMaybe<EOperationBriefState> BriefState; + + /// + /// @brief Name of user that started the operation. + TMaybe<TString> AuthenticatedUser; + + /// + /// @brief Operation start time. + TMaybe<TInstant> StartTime; + + /// + /// @brief Operation finish time (if the operation has finished). + TMaybe<TInstant> FinishTime; + + /// + /// @brief Brief progress of the operation. + TMaybe<TOperationBriefProgress> BriefProgress; + + /// + /// @brief Brief spec of operation (light-weight fields only). + TMaybe<TNode> BriefSpec; + + /// + /// @brief Spec of the operation as provided by the user. + TMaybe<TNode> Spec; + + /// + /// @brief Full spec of operation (all fields not specified by user are filled with default values). + TMaybe<TNode> FullSpec; + + /// + /// @brief Fields not recognized by scheduler. + TMaybe<TNode> UnrecognizedSpec; + + /// + /// @brief Is operation suspended. + TMaybe<bool> Suspended; + + /// + /// @brief Operation result. + TMaybe<TOperationResult> Result; + + /// + /// @brief Operation progress. + TMaybe<TOperationProgress> Progress; + + /// + /// @brief List of operation events (changes of state). + TMaybe<TVector<TOperationEvent>> Events; + + /// + /// @brief Map from alert name to its description. + TMaybe<THashMap<TString, TYtError>> Alerts; +}; + +/// +/// @brief Direction of cursor for paging, see @ref NYT::TListOperationsOptions::CursorDirection. 
+enum class ECursorDirection +{ + Past /* "past" */, + Future /* "future" */, +}; + +/// +/// @brief Options of @ref NYT::IClient::ListOperations command. +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#list_operations +struct TListOperationsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TListOperationsOptions; + /// @endcond + + /// + /// @name Time range specification + /// + /// List operations with start time in half-closed interval + /// `[CursorTime, ToTime)` if `CursorDirection == Future` or + /// `[FromTime, CursorTime)` if `CursorDirection == Past`. + ///@{ + + /// + /// @brief Search for operations with start time >= `FromTime`. + FLUENT_FIELD_OPTION(TInstant, FromTime); + + /// + /// @brief Search for operations with start time < `ToTime`. + FLUENT_FIELD_OPTION(TInstant, ToTime); + + /// + /// @brief Additional restriction on operation start time (useful for pagination). + /// + /// Search for operations with start time >= `CursorTime` if `CursorDirection == Future` + /// and with start time < `CursorTime` if `CursorDirection == Past` + FLUENT_FIELD_OPTION(TInstant, CursorTime); + + /// + /// @brief Direction of pagination (see @ref NYT::TListOperationsOptions::CursorTime). + FLUENT_FIELD_OPTION(ECursorDirection, CursorDirection); + + ///@} + + /// + /// @name Filters + /// Choose operations satisfying given filters. + ///@{ + + /// + /// @brief Search for `Filter` as a substring in operation text factors + /// (e.g. title or input/output table paths). + FLUENT_FIELD_OPTION(TString, Filter); + + /// + /// @brief Choose operations whose pools include `Pool`. + FLUENT_FIELD_OPTION(TString, Pool); + + /// + /// @brief Choose operations with given @ref NYT::TOperationAttributes::AuthenticatedUser. + FLUENT_FIELD_OPTION(TString, User); + + /// + /// @brief Choose operations with given @ref NYT::TOperationAttributes::State. 
+ FLUENT_FIELD_OPTION(TString, State); + + /// + /// @brief Choose operations with given @ref NYT::TOperationAttributes::Type. + FLUENT_FIELD_OPTION(EOperationType, Type); + + /// + /// @brief Choose operations having (or not having) any failed jobs. + FLUENT_FIELD_OPTION(bool, WithFailedJobs); + + ///@} + + /// + /// @brief Search for operations in the archive in addition to Cypress. + FLUENT_FIELD_OPTION(bool, IncludeArchive); + + /// + /// @brief Include the counters for different filter parameters in the response. + /// + /// Include number of operations for each pool, user, state, type + /// and the number of operations having failed jobs. + FLUENT_FIELD_OPTION(bool, IncludeCounters); + + /// + /// @brief Return no more than `Limit` operations (current default and maximum value is 1000). + FLUENT_FIELD_OPTION(i64, Limit); +}; + +/// +/// @brief Response for @ref NYT::IClient::ListOperations command. +struct TListOperationsResult +{ + /// + /// @brief Found operations' attributes. + TVector<TOperationAttributes> Operations; + + /// + /// @name Counters for different filter. + /// + /// If counters were requested (@ref NYT::TListOperationsOptions::IncludeCounters is `true`) + /// the maps contain the number of operations found for each pool, user, state and type. + /// NOTE: + /// 1) Counters ignore CursorTime and CursorDirection, + /// they always are collected in the whole [FromTime, ToTime) interval. + /// 2) Each next counter in the sequence [pool, user, state, type, with_failed_jobs] + /// takes into account all the previous filters (i.e. if you set User filter to "some-user" + /// type counts describe only operations with user "some-user"). + /// @{ + + /// + /// @brief Number of operations for each pool. + TMaybe<THashMap<TString, i64>> PoolCounts; + + /// + /// @brief Number of operations for each user (subject to previous filters). 
+ TMaybe<THashMap<TString, i64>> UserCounts; + + /// + /// @brief Number of operations for each state (subject to previous filters). + TMaybe<THashMap<TString, i64>> StateCounts; + + /// + /// @brief Number of operations for each type (subject to previous filters). + TMaybe<THashMap<EOperationType, i64>> TypeCounts; + + /// + /// @brief Number of operations having failed jobs (subject to all previous filters). + TMaybe<i64> WithFailedJobsCount; + + /// @} + + /// + /// @brief Whether some operations were not returned due to @ref NYT::TListOperationsOptions::Limit. + /// + /// `Incomplete == true` means that not all operations satisfying filters + /// were returned (limit exceeded) and you need to repeat the request with new @ref NYT::TListOperationsOptions::CursorTime + /// (e.g. `CursorTime == *Operations.back().StartTime`, but don't forget to + /// remove the duplicates). + bool Incomplete; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Data source for @ref NYT::IClient::ListJobs command. +enum class EListJobsDataSource : int +{ + Runtime /* "runtime" */, + Archive /* "archive" */, + Auto /* "auto" */, + Manual /* "manual" */, +}; + +/// +/// @brief Job type. 
+enum class EJobType : int +{ + SchedulerFirst /* "scheduler_first" */, + Map /* "map" */, + PartitionMap /* "partition_map" */, + SortedMerge /* "sorted_merge" */, + OrderedMerge /* "ordered_merge" */, + UnorderedMerge /* "unordered_merge" */, + Partition /* "partition" */, + SimpleSort /* "simple_sort" */, + FinalSort /* "final_sort" */, + SortedReduce /* "sorted_reduce" */, + PartitionReduce /* "partition_reduce" */, + ReduceCombiner /* "reduce_combiner" */, + RemoteCopy /* "remote_copy" */, + IntermediateSort /* "intermediate_sort" */, + OrderedMap /* "ordered_map" */, + JoinReduce /* "join_reduce" */, + Vanilla /* "vanilla" */, + SchedulerUnknown /* "scheduler_unknown" */, + SchedulerLast /* "scheduler_last" */, + ReplicatorFirst /* "replicator_first" */, + ReplicateChunk /* "replicate_chunk" */, + RemoveChunk /* "remove_chunk" */, + RepairChunk /* "repair_chunk" */, + SealChunk /* "seal_chunk" */, + ReplicatorLast /* "replicator_last" */, +}; + +/// +/// @brief Well-known task names. +enum class ETaskName : int +{ + Map /* "map" */, + PartitionMap0 /* "partition_map(0)" */, + SortedMerge /* "sorted_merge" */, + OrderedMerge /* "ordered_merge" */, + UnorderedMerge /* "unordered_merge" */, + Partition0 /* "partition(0)" */, + Partition1 /* "partition(1)" */, + Partition2 /* "partition(2)" */, + SimpleSort /* "simple_sort" */, + FinalSort /* "final_sort" */, + SortedReduce /* "sorted_reduce" */, + PartitionReduce /* "partition_reduce" */, + ReduceCombiner /* "reduce_combiner" */, + RemoteCopy /* "remote_copy" */, + IntermediateSort /* "intermediate_sort" */, + OrderedMap /* "ordered_map" */, + JoinReduce /* "join_reduce" */, +}; + +/// +/// @brief Task name (can either well-known or just a string). +class TTaskName +{ +public: + + // Constructors are implicit by design. + + /// + /// @brief Construct a custom task name. + TTaskName(TString taskName); + + /// + /// @brief Construct a custom task name. 
+ TTaskName(const char* taskName); + + /// + /// @brief Construct a well-known task name. + TTaskName(ETaskName taskName); + + const TString& Get() const; + +private: + TString TaskName_; +}; + +/// +/// @brief Job state. +enum class EJobState : int +{ + None /* "none" */, + Waiting /* "waiting" */, + Running /* "running" */, + Aborting /* "aborting" */, + Completed /* "completed" */, + Failed /* "failed" */, + Aborted /* "aborted" */, + Lost /* "lost" */, +}; + +/// +/// @brief Job sort field. +/// +/// @see @ref NYT::TListJobsOptions. +enum class EJobSortField : int +{ + Type /* "type" */, + State /* "state" */, + StartTime /* "start_time" */, + FinishTime /* "finish_time" */, + Address /* "address" */, + Duration /* "duration" */, + Progress /* "progress" */, + Id /* "id" */, +}; + +/// +/// @brief Job sort direction. +/// +/// @see @ref NYT::TListJobsOptions. +enum class EJobSortDirection : int +{ + Ascending /* "ascending" */, + Descending /* "descending" */, +}; + +/// +/// @brief Options for @ref NYT::IClient::ListJobs. +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#list_jobs +struct TListJobsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TListJobsOptions; + /// @endcond + + /// + /// @name Filters + /// Return only jobs with given value of parameter (type, state, address and existence of stderr). + /// If a field is `Nothing()`, return jobs with all possible values of the corresponding parameter. + /// @{ + + /// + /// @brief Job type. + FLUENT_FIELD_OPTION(EJobType, Type); + + /// + /// @brief Job state. + FLUENT_FIELD_OPTION(EJobState, State); + + /// + /// @brief Address of the cluster node where job was running. + FLUENT_FIELD_OPTION(TString, Address); + + /// + /// @brief Return only jobs whose stderr has been saved. + FLUENT_FIELD_OPTION(bool, WithStderr); + + /// + /// @brief Return only jobs whose spec has been saved. 
+ FLUENT_FIELD_OPTION(bool, WithSpec); + + /// + /// @brief Return only jobs whose fail context has been saved. + FLUENT_FIELD_OPTION(bool, WithFailContext); + + /// @} + + /// + /// @name Sort options + /// @{ + + /// + /// @brief Sort by this field. + FLUENT_FIELD_OPTION(EJobSortField, SortField); + + /// + /// @brief Sort order. + FLUENT_FIELD_OPTION(ESortOrder, SortOrder); + + /// @} + + /// + /// @brief Data source. + /// + /// Where to search for jobs: in scheduler and Cypress ('Runtime'), in archive ('Archive'), + /// automatically basing on operation presence in Cypress ('Auto') or choose manually (`Manual'). + FLUENT_FIELD_OPTION(EListJobsDataSource, DataSource); + + /// @deprecated + FLUENT_FIELD_OPTION(bool, IncludeCypress); + + /// @deprecated + FLUENT_FIELD_OPTION(bool, IncludeControllerAgent); + + /// @deprecated + FLUENT_FIELD_OPTION(bool, IncludeArchive); + + /// + /// @brief Maximum number of jobs to return. + FLUENT_FIELD_OPTION(i64, Limit); + + /// + /// @brief Number of jobs (in specified sort order) to skip. + /// + /// Together with @ref NYT::TListJobsOptions::Limit may be used for pagination. + FLUENT_FIELD_OPTION(i64, Offset); +}; + +/// +/// @brief Description of a core dump that happened in the job. +struct TCoreInfo +{ + i64 ProcessId; + TString ExecutableName; + TMaybe<ui64> Size; + TMaybe<TYtError> Error; +}; + +/// +/// @brief Job attributes. +/// +/// A field may be `Nothing()` if it is not available (i.e. `FinishTime` for a running job). +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#get_job +struct TJobAttributes +{ + /// + /// @brief Job id. + TMaybe<TJobId> Id; + + /// + /// @brief Job type + TMaybe<EJobType> Type; + + /// + /// @brief Job state. + TMaybe<EJobState> State; + + /// + /// @brief Address of a cluster node where job was running. + TMaybe<TString> Address; + + /// + /// @brief The name of the task that job corresponds to. + TMaybe<TString> TaskName; + + /// + /// @brief Job start time. 
+ TMaybe<TInstant> StartTime; + + /// + /// @brief Job finish time (for a finished job). + TMaybe<TInstant> FinishTime; + + /// + /// @brief Estimated ratio of job's completed work. + TMaybe<double> Progress; + + /// + /// @brief Size of saved job stderr. + TMaybe<i64> StderrSize; + + /// + /// @brief Error for a unsuccessfully finished job. + TMaybe<TYtError> Error; + + /// + /// @brief Job brief statistics. + TMaybe<TNode> BriefStatistics; + + /// + /// @brief Job input paths (with ranges). + TMaybe<TVector<TRichYPath>> InputPaths; + + /// + /// @brief Infos for core dumps produced by job. + TMaybe<TVector<TCoreInfo>> CoreInfos; +}; + +/// +/// @brief Response for @ref NYT::IOperation::ListJobs. +struct TListJobsResult +{ + /// + /// @brief Jobs. + TVector<TJobAttributes> Jobs; + + /// + /// @deprecated + TMaybe<i64> CypressJobCount; + + /// + /// @brief Number of jobs retrieved from controller agent. + TMaybe<i64> ControllerAgentJobCount; + + /// + /// @brief Number of jobs retrieved from archive. + TMaybe<i64> ArchiveJobCount; +}; + +//////////////////////////////////////////////////////////////////// + +/// +/// @brief Options for @ref NYT::IClient::GetJob. +struct TGetJobOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetJobOptions; + /// @endcond +}; + +/// +/// @brief Options for @ref NYT::IClient::GetJobInput. +struct TGetJobInputOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetJobInputOptions; + /// @endcond +}; + +/// +/// @brief Options for @ref NYT::IClient::GetJobFailContext. +struct TGetJobFailContextOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetJobFailContextOptions; + /// @endcond +}; + +/// +/// @brief Options for @ref NYT::IClient::GetJobStderr. +struct TGetJobStderrOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetJobStderrOptions; + /// @endcond +}; + +//////////////////////////////////////////////////////////////////// + +/// +/// @brief Options for @ref NYT::IOperation::GetFailedJobInfo. 
+struct TGetFailedJobInfoOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetFailedJobInfoOptions; + /// @endcond + + /// + /// @brief How many jobs to download. Which jobs will be chosen is undefined. + FLUENT_FIELD_DEFAULT(ui64, MaxJobCount, 10); + + /// + /// @brief How much of stderr tail should be downloaded. + FLUENT_FIELD_DEFAULT(ui64, StderrTailSize, 64 * 1024); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface representing an operation. +struct IOperation + : public TThrRefBase +{ + virtual ~IOperation() = default; + + /// + /// @brief Get operation id. + virtual const TOperationId& GetId() const = 0; + + /// + /// @brief Get URL of the operation in YT Web UI. + virtual TString GetWebInterfaceUrl() const = 0; + + /// + /// @brief Get last error for not started operations. Get state on YT cluster for started operations. + /// + /// For not started operations last error is an error that's being retried during operation + /// preparation/start (e.g. lock files, start operation request). + virtual TString GetStatus() const = 0; + + /// + /// @brief Get preparation future. + /// + /// @return future that is set when operation is prepared. + virtual ::NThreading::TFuture<void> GetPreparedFuture() = 0; + + /// + /// @brief Start operation synchronously. + /// + /// @note: Do NOT call this method twice. + /// + /// If operation is not prepared yet, Start() will block waiting for preparation finish. + /// Be ready to catch exception if operation preparation or start failed. + virtual void Start() = 0; + + /// + /// @brief Is the operation started + /// + /// Returns true if the operation is started on the cluster + virtual bool IsStarted() const = 0; + + /// + /// @brief Get start future. + /// + /// @return future that is set when operation is started. + virtual ::NThreading::TFuture<void> GetStartedFuture() = 0; + + /// + /// @brief Start watching operation. 
+ /// + /// @return future that is set when operation is complete. + /// + /// @note: the user should check value of returned future to ensure that operation completed successfully e.g. + /// @code{.cpp} + /// auto operationComplete = operation->Watch(); + /// operationComplete.Wait(); + /// operationComplete.GetValue(); /// will throw if operation completed with errors + /// @endcode + /// + /// If operation is completed successfully the returned future contains void value. + /// If operation is completed with error future contains @ref NYT::TOperationFailedError. + /// In rare cases when error occurred while waiting (e.g. YT become unavailable) future might contain other exception. + virtual ::NThreading::TFuture<void> Watch() = 0; + + /// + /// @brief Get information about failed jobs. + /// + /// Can be called for operation in any stage. + /// Though user should keep in mind that this method always fetches info from cypress + /// and doesn't work when operation is archived. Successfully completed operations can be archived + /// quite quickly (in about ~30 seconds). + virtual TVector<TFailedJobInfo> GetFailedJobInfo(const TGetFailedJobInfoOptions& options = TGetFailedJobInfoOptions()) = 0; + + /// + /// Get operation brief state. + virtual EOperationBriefState GetBriefState() = 0; + + /// + /// @brief Get error (if operation has failed). + /// + /// @return `Nothing()` if operation is in 'Completed' or 'InProgress' state (or reason for failed / aborted operation). + virtual TMaybe<TYtError> GetError() = 0; + + /// + /// Get job statistics. + virtual TJobStatistics GetJobStatistics() = 0; + + /// + /// Get operation progress. + /// + /// @return `Nothing()` if operation has no running jobs yet, e.g. when it is in "materializing" or "pending" state. + virtual TMaybe<TOperationBriefProgress> GetBriefProgress() = 0; + + /// + /// @brief Abort operation. + /// + /// Operation will be finished immediately. + /// All results of completed/running jobs will be lost. 
+ /// + /// @see https://yt.yandex-team.ru/docs/api/commands#abort_op + virtual void AbortOperation() = 0; + + /// + /// @brief Complete operation. + /// + /// Operation will be finished immediately. + /// All results of completed jobs will appear in output tables. + /// All results of running (not completed) jobs will be lost. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#complete_op + virtual void CompleteOperation() = 0; + + /// + /// @brief Suspend operation. + /// + /// Jobs will not be aborted by default, c.f. @ref NYT::TSuspendOperationOptions. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#suspend_op + virtual void SuspendOperation( + const TSuspendOperationOptions& options = TSuspendOperationOptions()) = 0; + + /// + /// @brief Resume previously suspended operation. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#resume_op + virtual void ResumeOperation( + const TResumeOperationOptions& options = TResumeOperationOptions()) = 0; + + /// + /// @brief Get operation attributes. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#get_operation + virtual TOperationAttributes GetAttributes( + const TGetOperationOptions& options = TGetOperationOptions()) = 0; + + /// + /// @brief Update operation runtime parameters. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#update_op_parameters + virtual void UpdateParameters( + const TUpdateOperationParametersOptions& options = TUpdateOperationParametersOptions()) = 0; + + /// + /// @brief Get job attributes. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#get_job + virtual TJobAttributes GetJob( + const TJobId& jobId, + const TGetJobOptions& options = TGetJobOptions()) = 0; + + /// + /// List jobs satisfying given filters (see @ref NYT::TListJobsOptions). 
+ /// + /// @see https://yt.yandex-team.ru/docs/api/commands#list_jobs + virtual TListJobsResult ListJobs( + const TListJobsOptions& options = TListJobsOptions()) = 0; +}; + +/// +/// @brief Interface of client capable of managing operations. +struct IOperationClient +{ + /// + /// @brief Run Map operation. + /// + /// @param spec Operation spec. + /// @param mapper Instance of a job to run. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/map + IOperationPtr Map( + const TMapOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run Map operation. + /// + /// @param mapper Instance of a job to run. + /// @param input Input table(s) + /// @param output Output table(s) + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/map + IOperationPtr Map( + ::TIntrusivePtr<IMapperBase> mapper, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TMapOperationSpec& spec = TMapOperationSpec(), + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run raw Map operation. + /// + /// @param spec Operation spec. + /// @param rawJob Instance of a raw mapper to run. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/map + virtual IOperationPtr RawMap( + const TRawMapOperationSpec& spec, + ::TIntrusivePtr<IRawJob> rawJob, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run Reduce operation. + /// + /// @param spec Operation spec. + /// @param reducer Instance of a job to run. + /// @param options Optional parameters. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/mr/reduce + IOperationPtr Reduce( + const TReduceOperationSpec& spec, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run Reduce operation. + /// + /// @param reducer Instance of a job to run. + /// @param input Input table(s) + /// @param output Output table(s) + /// @param reduceBy Columns to group rows by. + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/reduce + IOperationPtr Reduce( + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + const TReduceOperationSpec& spec = TReduceOperationSpec(), + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run raw Reduce operation. + /// + /// @param spec Operation spec. + /// @param rawJob Instance of a raw reducer to run. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/reduce + virtual IOperationPtr RawReduce( + const TRawReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> rawJob, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run JoinReduce operation. + /// + /// @param spec Operation spec. + /// @param reducer Instance of a job to run. + /// @param options Optional parameters. + /// + /// @deprecated Use @ref NYT::IOperationClient::Reduce with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false. + IOperationPtr JoinReduce( + const TJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run raw JoinReduce operation. + /// + /// @param spec Operation spec. + /// @param rawJob Instance of a raw reducer to run. + /// @param options Optional parameters. 
+ /// + /// @deprecated Use @ref NYT::IOperationClient::RawReduce with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`. + virtual IOperationPtr RawJoinReduce( + const TRawJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> rawJob, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run MapReduce operation. + /// + /// @param spec Operation spec. + /// @param mapper Instance of a map job to run (identity mapper if `nullptr`). + /// @param reducer Instance of a reduce job to run. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce + IOperationPtr MapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run MapReduce operation. + /// + /// @param spec Operation spec. + /// @param mapper Instance of a map job to run (identity mapper if `nullptr`). + /// @param reduceCombiner Instance of a reduce combiner to run (identity reduce combiner if `nullptr`). + /// @param reducer Instance of a reduce job to run. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce + IOperationPtr MapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reduceCombiner, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run MapReduce operation. + /// + /// @param mapper Instance of mapper to run (identity mapper if `nullptr`). + /// @param reducer Instance of reducer to run. + /// @param input Input table(s) + /// @param output Output table(s) + /// @param reduceBy Columns to group rows by. + /// @param spec Operation spec. + /// @param options Optional parameters. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce + IOperationPtr MapReduce( + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + TMapReduceOperationSpec spec = TMapReduceOperationSpec(), + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run MapReduce operation. + /// + /// @param mapper Instance of mapper to run (identity mapper if `nullptr`). + /// @param reduceCombiner Instance of reduceCombiner to run (identity reduce combiner if `nullptr`). + /// @param reducer Instance of reducer to run. + /// @param input Input table(s) + /// @param output Output table(s) + /// @param reduceBy Columns to group rows by. + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce + IOperationPtr MapReduce( + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reduceCombiner, + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + TMapReduceOperationSpec spec = TMapReduceOperationSpec(), + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run raw MapReduce operation. + /// + /// @param spec Operation spec. + /// @param mapper Instance of a raw mapper to run (identity mapper if `nullptr`). + /// @param reduceCombiner Instance of a raw reduce combiner to run (identity reduce combiner if `nullptr`). + /// @param reducer Instance of a raw reducer to run. + /// @param options Optional parameters. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce + virtual IOperationPtr RawMapReduce( + const TRawMapReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> mapper, + ::TIntrusivePtr<IRawJob> reduceCombiner, + ::TIntrusivePtr<IRawJob> reducer, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run Sort operation. + /// + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/sort + virtual IOperationPtr Sort( + const TSortOperationSpec& spec, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run Sort operation. + /// + /// @param input Input table(s). + /// @param output Output table. + /// @param sortBy Columns to sort input rows by. + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/sort + IOperationPtr Sort( + const TOneOrMany<TRichYPath>& input, + const TRichYPath& output, + const TSortColumns& sortBy, + const TSortOperationSpec& spec = TSortOperationSpec(), + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run Merge operation. + /// + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/merge + virtual IOperationPtr Merge( + const TMergeOperationSpec& spec, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run Erase operation. + /// + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/erase + virtual IOperationPtr Erase( + const TEraseOperationSpec& spec, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run RemoteCopy operation. + /// + /// @param spec Operation spec. + /// @param options Optional parameters. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/mr/remote_copy + virtual IOperationPtr RemoteCopy( + const TRemoteCopyOperationSpec& spec, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run Vanilla operation. + /// + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/vanilla + virtual IOperationPtr RunVanilla( + const TVanillaOperationSpec& spec, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Abort operation. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#abort_op + virtual void AbortOperation( + const TOperationId& operationId) = 0; + + /// + /// @brief Complete operation. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#complete_op + virtual void CompleteOperation( + const TOperationId& operationId) = 0; + + /// + /// @brief Wait for operation to finish. + virtual void WaitForOperation( + const TOperationId& operationId) = 0; + + /// + /// @brief Check and return operation status. + /// + /// @note this function will never return @ref NYT::EOperationBriefState::Failed or @ref NYT::EOperationBriefState::Aborted status, + /// it will throw @ref NYT::TOperationFailedError instead. + virtual EOperationBriefState CheckOperation( + const TOperationId& operationId) = 0; + + /// + /// @brief Create an operation object given operation id. + /// + /// @throw @ref NYT::TErrorResponse if the operation doesn't exist. 
+ virtual IOperationPtr AttachOperation(const TOperationId& operationId) = 0; + +private: + virtual IOperationPtr DoMap( + const TMapOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> mapper, + const TOperationOptions& options) = 0; + + virtual IOperationPtr DoReduce( + const TReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) = 0; + + virtual IOperationPtr DoJoinReduce( + const TJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) = 0; + + virtual IOperationPtr DoMapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> mapper, + ::TIntrusivePtr<IStructuredJob> reduceCombiner, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT + +#define OPERATION_INL_H_ +#include "operation-inl.h" +#undef OPERATION_INL_H_ diff --git a/yt/cpp/mapreduce/interface/operation_ut.cpp b/yt/cpp/mapreduce/interface/operation_ut.cpp new file mode 100644 index 0000000000..0fa62e1568 --- /dev/null +++ b/yt/cpp/mapreduce/interface/operation_ut.cpp @@ -0,0 +1,269 @@ +#include <yt/cpp/mapreduce/interface/common_ut.h> +#include <yt/cpp/mapreduce/interface/job_statistics.h> +#include <yt/cpp/mapreduce/interface/operation.h> +#include <yt/cpp/mapreduce/interface/protobuf_table_schema_ut.pb.h> + +#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NYT; +using namespace NYT::NUnitTesting; + +class TDummyInferenceContext + : public IOperationPreparationContext +{ +public: + TDummyInferenceContext(int inputCount, int outputCount) + : InputCount_(inputCount) + , OutputCount_(outputCount) + , InputSchemas_(inputCount) + { } + + int GetInputCount() const override + { + return InputCount_; + } 
+ + int GetOutputCount() const override + { + return OutputCount_; + } + + const TVector<TTableSchema>& GetInputSchemas() const override + { + return InputSchemas_; + } + + const TTableSchema& GetInputSchema(int index) const override + { + return InputSchemas_[index]; + } + + TMaybe<TYPath> GetInputPath(int) const override + { + return Nothing(); + } + + TMaybe<TYPath> GetOutputPath(int) const override + { + return Nothing(); + } + +private: + int InputCount_; + int OutputCount_; + TVector<TTableSchema> InputSchemas_; +}; + +Y_UNIT_TEST_SUITE(PrepareOperation) +{ + + Y_UNIT_TEST(BasicSchemas) + { + auto firstSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("some_column").Type(EValueType::VT_UINT64)); + auto otherSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("other_column").Type(EValueType::VT_BOOLEAN)); + auto thirdSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("third_column").Type(EValueType::VT_STRING)); + + TDummyInferenceContext context(3,7); + TJobOperationPreparer builder(context); + + builder + .OutputSchema(1, firstSchema) + .BeginOutputGroup(TVector<int>{2, 5}) + .Schema(otherSchema) + .EndOutputGroup() + .BeginOutputGroup(3, 5) + .Schema(thirdSchema) + .EndOutputGroup() + .BeginOutputGroup(TVector<int>{0, 6}) + .Schema(thirdSchema) + .EndOutputGroup(); + + UNIT_ASSERT_EXCEPTION(builder.OutputSchema(1, otherSchema), TApiUsageError); + UNIT_ASSERT_EXCEPTION(builder.BeginOutputGroup(3, 5).Schema(otherSchema), TApiUsageError); + UNIT_ASSERT_EXCEPTION(builder.BeginOutputGroup(TVector<int>{3,6,7}).Schema(otherSchema), TApiUsageError); + + builder.Finish(); + auto result = builder.GetOutputSchemas(); + + ASSERT_SERIALIZABLES_EQUAL(result[0], thirdSchema); + ASSERT_SERIALIZABLES_EQUAL(result[1], firstSchema); + ASSERT_SERIALIZABLES_EQUAL(result[2], otherSchema); + ASSERT_SERIALIZABLES_EQUAL(result[3], thirdSchema); + ASSERT_SERIALIZABLES_EQUAL(result[4], thirdSchema); + ASSERT_SERIALIZABLES_EQUAL(result[5], otherSchema); + 
ASSERT_SERIALIZABLES_EQUAL(result[6], thirdSchema); + } + + Y_UNIT_TEST(NoSchema) + { + auto schema = TTableSchema() + .AddColumn(TColumnSchema().Name("some_column").Type(EValueType::VT_UINT64)); + + TDummyInferenceContext context(3,4); + TJobOperationPreparer builder(context); + + builder + .OutputSchema(1, schema) + .NoOutputSchema(0) + .BeginOutputGroup(2, 4) + .Schema(schema) + .EndOutputGroup(); + + UNIT_ASSERT_EXCEPTION(builder.OutputSchema(0, schema), TApiUsageError); + + builder.Finish(); + auto result = builder.GetOutputSchemas(); + + UNIT_ASSERT(result[0].Empty()); + + ASSERT_SERIALIZABLES_EQUAL(result[1], schema); + ASSERT_SERIALIZABLES_EQUAL(result[2], schema); + ASSERT_SERIALIZABLES_EQUAL(result[3], schema); + } + + Y_UNIT_TEST(Descriptions) + { + auto urlRowSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("Host").Type(NTi::Optional(NTi::String()))) + .AddColumn(TColumnSchema().Name("Path").Type(NTi::Optional(NTi::String()))) + .AddColumn(TColumnSchema().Name("HttpCode").Type(NTi::Optional(NTi::Int32()))); + + auto urlRowStruct = NTi::Struct({ + {"Host", NTi::Optional(NTi::String())}, + {"Path", NTi::Optional(NTi::String())}, + {"HttpCode", NTi::Optional(NTi::Int32())}, + }); + + auto rowFieldSerializationOptionSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(NTi::Optional(urlRowStruct))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(NTi::Optional(NTi::String()))); + + auto rowSerializedRepeatedFieldsSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("Ints").Type(NTi::List(NTi::Int64()))) + .AddColumn(TColumnSchema().Name("UrlRows").Type(NTi::List(urlRowStruct))); + + TDummyInferenceContext context(5,7); + TJobOperationPreparer builder(context); + + builder + .InputDescription<TUrlRow>(0) + .BeginInputGroup(2, 3) + .Description<TUrlRow>() + .EndInputGroup() + .BeginInputGroup(TVector<int>{1, 4}) + .Description<TRowSerializedRepeatedFields>() + .EndInputGroup() + .InputDescription<TUrlRow>(3); + + 
UNIT_ASSERT_EXCEPTION(builder.InputDescription<TUrlRow>(0), TApiUsageError); + + builder + .OutputDescription<TUrlRow>(0, false) + .OutputDescription<TRowFieldSerializationOption>(1) + .BeginOutputGroup(2, 4) + .Description<TUrlRow>() + .EndOutputGroup() + .BeginOutputGroup(TVector<int>{4,6}) + .Description<TRowSerializedRepeatedFields>() + .EndOutputGroup() + .OutputDescription<TUrlRow>(5, false); + + UNIT_ASSERT_EXCEPTION(builder.OutputDescription<TUrlRow>(0), TApiUsageError); + UNIT_ASSERT_NO_EXCEPTION(builder.OutputSchema(0, urlRowSchema)); + UNIT_ASSERT_NO_EXCEPTION(builder.OutputSchema(5, urlRowSchema)); + UNIT_ASSERT_EXCEPTION(builder.OutputSchema(1, urlRowSchema), TApiUsageError); + + builder.Finish(); + auto result = builder.GetOutputSchemas(); + + ASSERT_SERIALIZABLES_EQUAL(result[0], urlRowSchema); + ASSERT_SERIALIZABLES_EQUAL(result[1], rowFieldSerializationOptionSchema); + ASSERT_SERIALIZABLES_EQUAL(result[2], urlRowSchema); + ASSERT_SERIALIZABLES_EQUAL(result[3], urlRowSchema); + ASSERT_SERIALIZABLES_EQUAL(result[4], rowSerializedRepeatedFieldsSchema); + ASSERT_SERIALIZABLES_EQUAL(result[5], urlRowSchema); + ASSERT_SERIALIZABLES_EQUAL(result[6], rowSerializedRepeatedFieldsSchema); + + auto expectedInputDescriptions = TVector<TMaybe<TTableStructure>>{ + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}}, + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}}, + }; + UNIT_ASSERT_EQUAL(expectedInputDescriptions, builder.GetInputDescriptions()); + + auto expectedOutputDescriptions = TVector<TMaybe<TTableStructure>>{ + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TRowFieldSerializationOption::descriptor()}}, + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TUrlRow::descriptor()}}, + 
{TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}}, + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}}, + }; + UNIT_ASSERT_EQUAL(expectedOutputDescriptions, builder.GetOutputDescriptions()); + } + + Y_UNIT_TEST(InputColumns) + { + TDummyInferenceContext context(5, 1); + TJobOperationPreparer builder(context); + builder + .InputColumnFilter(2, {"a", "b"}) + .BeginInputGroup(0, 2) + .ColumnFilter({"b", "c"}) + .ColumnRenaming({{"b", "B"}, {"c", "C"}}) + .EndInputGroup() + .InputColumnRenaming(3, {{"a", "AAA"}}) + .NoOutputSchema(0); + builder.Finish(); + + auto expectedRenamings = TVector<THashMap<TString, TString>>{ + {{"b", "B"}, {"c", "C"}}, + {{"b", "B"}, {"c", "C"}}, + {}, + {{"a", "AAA"}}, + {}, + }; + UNIT_ASSERT_EQUAL(builder.GetInputColumnRenamings(), expectedRenamings); + + auto expectedFilters = TVector<TMaybe<TVector<TString>>>{ + {{"b", "c"}}, + {{"b", "c"}}, + {{"a", "b"}}, + {}, + {}, + }; + UNIT_ASSERT_EQUAL(builder.GetInputColumnFilters(), expectedFilters); + } + + Y_UNIT_TEST(Bug_r7349102) + { + auto firstSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("some_column").Type(EValueType::VT_UINT64)); + auto otherSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("other_column").Type(EValueType::VT_BOOLEAN)); + auto thirdSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("third_column").Type(EValueType::VT_STRING)); + + TDummyInferenceContext context(3,1); + TJobOperationPreparer builder(context); + + builder + .InputDescription<TUrlRow>(0) + .InputDescription<TUrlRow>(1) + .InputDescription<TUrlRow>(2) + .OutputDescription<TUrlRow>(0); + + builder.Finish(); + } + +} // Y_UNIT_TEST_SUITE(SchemaInference) diff --git a/yt/cpp/mapreduce/interface/proto3_ut.proto b/yt/cpp/mapreduce/interface/proto3_ut.proto new file mode 100644 index 0000000000..b24c13085b --- /dev/null +++ b/yt/cpp/mapreduce/interface/proto3_ut.proto @@ -0,0 +1,17 @@ +syntax = 
"proto3"; + +import "yt/yt_proto/yt/formats/extension.proto"; + +package NYT.NTestingProto3; + +option (NYT.file_default_field_flags) = SERIALIZATION_YT; + +message TWithOptional +{ + optional int64 x = 1; +} + +message TWithOptionalMessage +{ + optional TWithOptional x = 1; +} diff --git a/yt/cpp/mapreduce/interface/protobuf_file_options_ut.cpp b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.cpp new file mode 100644 index 0000000000..5ffa9564d7 --- /dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.cpp @@ -0,0 +1,271 @@ +#include "errors.h" +#include "format.h" +#include "common_ut.h" + +#include <yt/cpp/mapreduce/interface/protobuf_file_options_ut.pb.h> + +#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NYT; + +Y_UNIT_TEST_SUITE(ProtobufFileOptions) +{ + NTi::TTypePtr GetUrlRowType(bool required) + { + static const NTi::TTypePtr structType = NTi::Struct({ + {"Host", ToTypeV3(EValueType::VT_STRING, false)}, + {"Path", ToTypeV3(EValueType::VT_STRING, false)}, + {"HttpCode", ToTypeV3(EValueType::VT_INT32, false)}}); + return required ? 
structType : NTi::TTypePtr(NTi::Optional(structType)); + } + + Y_UNIT_TEST(TRowFieldSerializationOption) + { + const auto schema = CreateTableSchema<NTestingFileOptions::TRowFieldSerializationOption>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(GetUrlRowType(false)))); + } + + Y_UNIT_TEST(TRowMixedSerializationOptions) + { + const auto schema = CreateTableSchema<NTestingFileOptions::TRowMixedSerializationOptions>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(GetUrlRowType(false)))); + } + + Y_UNIT_TEST(FieldSortOrder) + { + const auto schema = CreateTableSchema<NTestingFileOptions::TFieldSortOrder>(); + + auto asInProtoFile = NTi::Optional(NTi::Struct({ + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + {"z", NTi::Optional(NTi::Bool())}, + })); + auto byFieldNumber = NTi::Optional(NTi::Struct({ + {"z", NTi::Optional(NTi::Bool())}, + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + })); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("EmbeddedDefault").Type(asInProtoFile)) + .AddColumn(TColumnSchema().Name("EmbeddedAsInProtoFile").Type(asInProtoFile)) + .AddColumn(TColumnSchema().Name("EmbeddedByFieldNumber").Type(byFieldNumber))); + } + + Y_UNIT_TEST(Map) + { + const auto schema = CreateTableSchema<NTestingFileOptions::TWithMap>(); + + auto createKeyValueStruct = [] (NTi::TTypePtr key, NTi::TTypePtr value) { + return NTi::List(NTi::Struct({ + {"key", NTi::Optional(key)}, + {"value", NTi::Optional(value)}, + })); + }; + + auto embedded = NTi::Struct({ + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + }); + + 
ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("MapDefault") + .Type(createKeyValueStruct(NTi::Int64(), embedded))) + .AddColumn(TColumnSchema() + .Name("MapDict") + .Type(NTi::Dict(NTi::Int64(), embedded)))); + } + + Y_UNIT_TEST(Oneof) + { + const auto schema = CreateTableSchema<NTestingFileOptions::TWithOneof>(); + + auto embedded = NTi::Struct({ + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + }); + + auto defaultVariantType = NTi::Optional(NTi::Struct({ + {"field", NTi::Optional(NTi::String())}, + {"Oneof2", NTi::Optional(NTi::Variant(NTi::Struct({ + {"y2", NTi::String()}, + {"z2", embedded}, + {"x2", NTi::Int64()}, + })))}, + {"x1", NTi::Optional(NTi::Int64())}, + {"y1", NTi::Optional(NTi::String())}, + {"z1", NTi::Optional(embedded)}, + })); + + auto noDefaultType = NTi::Optional(NTi::Struct({ + {"field", NTi::Optional(NTi::String())}, + {"y2", NTi::Optional(NTi::String())}, + {"z2", NTi::Optional(embedded)}, + {"x2", NTi::Optional(NTi::Int64())}, + {"x1", NTi::Optional(NTi::Int64())}, + {"y1", NTi::Optional(NTi::String())}, + {"z1", NTi::Optional(embedded)}, + })); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("DefaultVariant") + .Type(defaultVariantType) + ) + .AddColumn(TColumnSchema() + .Name("NoDefault") + .Type(noDefaultType) + ) + .AddColumn(TColumnSchema() + .Name("SerializationProtobuf") + .Type(NTi::Optional(NTi::Struct({ + {"x1", NTi::Optional(NTi::Int64())}, + {"y1", NTi::Optional(NTi::String())}, + {"z1", NTi::Optional(NTi::String())}, + }))) + ) + .AddColumn(TColumnSchema() + .Name("MemberOfTopLevelOneof") + .Type(NTi::Optional(NTi::Int64())) + ) + ); + } +} + +static TNode GetColumns(const TFormat& format, int tableIndex = 0) +{ + return format.Config.GetAttributes()["tables"][tableIndex]["columns"]; +} + +Y_UNIT_TEST_SUITE(ProtobufFormatFileOptions) +{ + Y_UNIT_TEST(TRowFieldSerializationOption) + { + const auto format = 
TFormat::Protobuf<NTestingFileOptions::TRowFieldSerializationOption>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns[0]["name"], "UrlRow_1"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["proto_type"], "message"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["field_number"], 1); + + UNIT_ASSERT_VALUES_EQUAL(columns[1]["name"], "UrlRow_2"); + UNIT_ASSERT_VALUES_EQUAL(columns[1]["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(columns[1]["field_number"], 2); + const auto& fields = columns[1]["fields"]; + UNIT_ASSERT_VALUES_EQUAL(fields[0]["name"], "Host"); + UNIT_ASSERT_VALUES_EQUAL(fields[0]["proto_type"], "string"); + UNIT_ASSERT_VALUES_EQUAL(fields[0]["field_number"], 1); + + UNIT_ASSERT_VALUES_EQUAL(fields[1]["name"], "Path"); + UNIT_ASSERT_VALUES_EQUAL(fields[1]["proto_type"], "string"); + UNIT_ASSERT_VALUES_EQUAL(fields[1]["field_number"], 2); + + UNIT_ASSERT_VALUES_EQUAL(fields[2]["name"], "HttpCode"); + UNIT_ASSERT_VALUES_EQUAL(fields[2]["proto_type"], "sint32"); + UNIT_ASSERT_VALUES_EQUAL(fields[2]["field_number"], 3); + } + + Y_UNIT_TEST(Map) + { + const auto format = TFormat::Protobuf<NTestingFileOptions::TWithMap>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns.Size(), 2); + { + const auto& column = columns[0]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "MapDefault"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 2); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["proto_type"], "int64"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["proto_type"], "structured_message"); + } + { + const auto& column = columns[1]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "MapDict"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 2); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["proto_type"], "int64"); + 
UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["proto_type"], "structured_message"); + } + } + + Y_UNIT_TEST(Oneof) + { + const auto format = TFormat::Protobuf<NTestingFileOptions::TWithOneof>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns.Size(), 4); + + { + const auto& column = columns[0]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "DefaultVariant"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 5); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "field"); + + const auto& oneof2 = column["fields"][1]; + UNIT_ASSERT_VALUES_EQUAL(oneof2["name"], "Oneof2"); + UNIT_ASSERT_VALUES_EQUAL(oneof2["proto_type"], "oneof"); + UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][0]["name"], "y2"); + UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][1]["name"], "z2"); + UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][1]["proto_type"], "structured_message"); + const auto& embeddedFields = oneof2["fields"][1]["fields"]; + UNIT_ASSERT_VALUES_EQUAL(embeddedFields[0]["name"], "x"); + UNIT_ASSERT_VALUES_EQUAL(embeddedFields[1]["name"], "y"); + + UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][2]["name"], "x2"); + + UNIT_ASSERT_VALUES_EQUAL(column["fields"][2]["name"], "x1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][3]["name"], "y1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][4]["name"], "z1"); + }; + + { + const auto& column = columns[1]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "NoDefault"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + const auto& fields = column["fields"]; + UNIT_ASSERT_VALUES_EQUAL(fields.Size(), 7); + + UNIT_ASSERT_VALUES_EQUAL(fields[0]["name"], "field"); + + UNIT_ASSERT_VALUES_EQUAL(fields[1]["name"], "y2"); + + UNIT_ASSERT_VALUES_EQUAL(fields[2]["name"], "z2"); + UNIT_ASSERT_VALUES_EQUAL(fields[2]["proto_type"], "structured_message"); + const auto& embeddedFields = fields[2]["fields"]; + UNIT_ASSERT_VALUES_EQUAL(embeddedFields[0]["name"], 
"x"); + UNIT_ASSERT_VALUES_EQUAL(embeddedFields[1]["name"], "y"); + + UNIT_ASSERT_VALUES_EQUAL(fields[3]["name"], "x2"); + + UNIT_ASSERT_VALUES_EQUAL(fields[4]["name"], "x1"); + UNIT_ASSERT_VALUES_EQUAL(fields[5]["name"], "y1"); + UNIT_ASSERT_VALUES_EQUAL(fields[6]["name"], "z1"); + }; + + { + const auto& column = columns[2]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "SerializationProtobuf"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 3); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "x1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["name"], "y1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][2]["name"], "z1"); + } + { + const auto& column = columns[3]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "MemberOfTopLevelOneof"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "int64"); + } + } +} diff --git a/yt/cpp/mapreduce/interface/protobuf_file_options_ut.proto b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.proto new file mode 100644 index 0000000000..4804b2f60c --- /dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.proto @@ -0,0 +1,142 @@ +import "yt/yt_proto/yt/formats/extension.proto"; + +package NYT.NTestingFileOptions; + +option (NYT.file_default_field_flags) = SERIALIZATION_YT; +option (NYT.file_default_field_flags) = MAP_AS_LIST_OF_STRUCTS; +option (NYT.file_default_message_flags) = DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE; +option (NYT.file_default_oneof_flags) = SEPARATE_FIELDS; + +message TUrlRow +{ + optional string Host = 1 [(NYT.column_name) = "Host"]; + optional string Path = 2 [(NYT.column_name) = "Path"]; + optional sint32 HttpCode = 3 [(NYT.column_name) = "HttpCode"]; +} + +message TRowFieldSerializationOption +{ + optional TUrlRow UrlRow_1 = 1 [(NYT.flags) = SERIALIZATION_PROTOBUF]; + optional TUrlRow UrlRow_2 = 2; +} + +message TRowMixedSerializationOptions +{ + option (NYT.default_field_flags) = SERIALIZATION_PROTOBUF; + optional 
TUrlRow UrlRow_1 = 1; + optional TUrlRow UrlRow_2 = 2 [(NYT.flags) = SERIALIZATION_YT]; +} + +message TRowSerializedRepeatedFields +{ + repeated int64 Ints = 1; + repeated TUrlRow UrlRows = 2; +} + +message TFieldSortOrder +{ + message TEmbeddedDefault { + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + message TEmbeddedAsInProtoFile { + option (NYT.message_flags) = DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE; + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + message TEmbeddedByFieldNumber { + option (NYT.message_flags) = SORT_FIELDS_BY_FIELD_NUMBER; + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + option (NYT.default_field_flags) = SERIALIZATION_YT; + + optional TEmbeddedDefault EmbeddedDefault = 1; + optional TEmbeddedAsInProtoFile EmbeddedAsInProtoFile = 2; + optional TEmbeddedByFieldNumber EmbeddedByFieldNumber = 3; +} + +message TWithMap +{ + message TEmbedded { + optional int64 x = 1; + optional string y = 2; + } + + map<int64, TEmbedded> MapDefault = 1; + map<int64, TEmbedded> MapDict = 5 [(NYT.flags) = MAP_AS_DICT]; +} + +message TWithOneof +{ + message TEmbedded + { + oneof Oneof { + int64 x = 1; + string y = 2; + } + } + + message TDefaultVariant + { + option (NYT.default_oneof_flags) = VARIANT; + optional string field = 1; + + oneof Oneof2 + { + string y2 = 4; + TEmbedded z2 = 6; + int64 x2 = 2; + } + + oneof Oneof1 + { + option (NYT.oneof_flags) = SEPARATE_FIELDS; + int64 x1 = 10; + string y1 = 3; + TEmbedded z1 = 5; + } + } + + message TNoDefault + { + optional string field = 1; + + oneof Oneof2 + { + string y2 = 4; + TEmbedded z2 = 6; + int64 x2 = 2; + } + + oneof Oneof1 + { + int64 x1 = 10; + string y1 = 3; + TEmbedded z1 = 5; + } + } + + message TSerializationProtobuf + { + option (NYT.default_field_flags) = SERIALIZATION_PROTOBUF; + oneof Oneof + { + int64 x1 = 2; + string y1 = 1; + TEmbedded z1 = 3; + } + } + + optional TDefaultVariant DefaultVariant = 1; + 
optional TNoDefault NoDefault = 2; + optional TSerializationProtobuf SerializationProtobuf = 3; + + oneof TopLevelOneof + { + int64 MemberOfTopLevelOneof = 4; + } +} diff --git a/yt/cpp/mapreduce/interface/protobuf_format.cpp b/yt/cpp/mapreduce/interface/protobuf_format.cpp new file mode 100644 index 0000000000..3d57ed2797 --- /dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_format.cpp @@ -0,0 +1,1498 @@ +#include "protobuf_format.h" + +#include "errors.h" + +#include <yt/yt_proto/yt/formats/extension.pb.h> + +#include <google/protobuf/text_format.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <util/generic/hash_set.h> +#include <util/generic/stack.h> +#include <util/generic/overloaded.h> + +#include <util/stream/output.h> +#include <util/stream/file.h> + +namespace NYT::NDetail { + +using ::google::protobuf::Descriptor; +using ::google::protobuf::DescriptorProto; +using ::google::protobuf::EnumDescriptor; +using ::google::protobuf::EnumDescriptorProto; +using ::google::protobuf::FieldDescriptor; +using ::google::protobuf::FieldDescriptorProto; +using ::google::protobuf::OneofDescriptor; +using ::google::protobuf::Message; +using ::google::protobuf::FileDescriptor; +using ::google::protobuf::FileDescriptorProto; +using ::google::protobuf::FileDescriptorSet; +using ::google::protobuf::FieldOptions; +using ::google::protobuf::FileOptions; +using ::google::protobuf::OneofOptions; +using ::google::protobuf::MessageOptions; + +using ::ToString; + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +using TOneofOption = std::variant< + EProtobufOneofMode>; + +using TFieldOption = std::variant< + EProtobufType, + EProtobufSerializationMode, + EProtobufListMode, + EProtobufMapMode, + EProtobufEnumWritingMode>; + +using TMessageOption = std::variant< + EProtobufFieldSortOrder>; + +struct TOtherColumns +{ }; + +using TValueTypeOrOtherColumns = std::variant<EValueType, TOtherColumns>; + 
+//////////////////////////////////////////////////////////////////////////////// + +TFieldOption FieldFlagToOption(EWrapperFieldFlag::Enum flag) +{ + using EFlag = EWrapperFieldFlag; + switch (flag) { + case EFlag::SERIALIZATION_PROTOBUF: + return EProtobufSerializationMode::Protobuf; + case EFlag::SERIALIZATION_YT: + return EProtobufSerializationMode::Yt; + + case EFlag::ANY: + return EProtobufType::Any; + case EFlag::OTHER_COLUMNS: + return EProtobufType::OtherColumns; + case EFlag::ENUM_INT: + return EProtobufType::EnumInt; + case EFlag::ENUM_STRING: + return EProtobufType::EnumString; + + case EFlag::OPTIONAL_LIST: + return EProtobufListMode::Optional; + case EFlag::REQUIRED_LIST: + return EProtobufListMode::Required; + + case EFlag::MAP_AS_LIST_OF_STRUCTS_LEGACY: + return EProtobufMapMode::ListOfStructsLegacy; + case EFlag::MAP_AS_LIST_OF_STRUCTS: + return EProtobufMapMode::ListOfStructs; + case EFlag::MAP_AS_DICT: + return EProtobufMapMode::Dict; + case EFlag::MAP_AS_OPTIONAL_DICT: + return EProtobufMapMode::OptionalDict; + case EFlag::EMBEDDED: + return EProtobufSerializationMode::Embedded; + + case EFlag::ENUM_SKIP_UNKNOWN_VALUES: + return EProtobufEnumWritingMode::SkipUnknownValues; + case EFlag::ENUM_CHECK_VALUES: + return EProtobufEnumWritingMode::CheckValues; + } + Y_FAIL(); +} + +TMessageOption MessageFlagToOption(EWrapperMessageFlag::Enum flag) +{ + using EFlag = EWrapperMessageFlag; + switch (flag) { + case EFlag::DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE: + return EProtobufFieldSortOrder::AsInProtoFile; + case EFlag::SORT_FIELDS_BY_FIELD_NUMBER: + return EProtobufFieldSortOrder::ByFieldNumber; + } + Y_FAIL(); +} + +TOneofOption OneofFlagToOption(EWrapperOneofFlag::Enum flag) +{ + using EFlag = EWrapperOneofFlag; + switch (flag) { + case EFlag::SEPARATE_FIELDS: + return EProtobufOneofMode::SeparateFields; + case EFlag::VARIANT: + return EProtobufOneofMode::Variant; + } + Y_FAIL(); +} + +EWrapperFieldFlag::Enum OptionToFieldFlag(TFieldOption option) +{ 
+ using EFlag = EWrapperFieldFlag; + struct TVisitor + { + EFlag::Enum operator() (EProtobufType type) + { + switch (type) { + case EProtobufType::Any: + return EFlag::ANY; + case EProtobufType::OtherColumns: + return EFlag::OTHER_COLUMNS; + case EProtobufType::EnumInt: + return EFlag::ENUM_INT; + case EProtobufType::EnumString: + return EFlag::ENUM_STRING; + } + Y_FAIL(); + } + EFlag::Enum operator() (EProtobufSerializationMode serializationMode) + { + switch (serializationMode) { + case EProtobufSerializationMode::Yt: + return EFlag::SERIALIZATION_YT; + case EProtobufSerializationMode::Protobuf: + return EFlag::SERIALIZATION_PROTOBUF; + case EProtobufSerializationMode::Embedded: + return EFlag::EMBEDDED; + } + Y_FAIL(); + } + EFlag::Enum operator() (EProtobufListMode listMode) + { + switch (listMode) { + case EProtobufListMode::Optional: + return EFlag::OPTIONAL_LIST; + case EProtobufListMode::Required: + return EFlag::REQUIRED_LIST; + } + Y_FAIL(); + } + EFlag::Enum operator() (EProtobufMapMode mapMode) + { + switch (mapMode) { + case EProtobufMapMode::ListOfStructsLegacy: + return EFlag::MAP_AS_LIST_OF_STRUCTS_LEGACY; + case EProtobufMapMode::ListOfStructs: + return EFlag::MAP_AS_LIST_OF_STRUCTS; + case EProtobufMapMode::Dict: + return EFlag::MAP_AS_DICT; + case EProtobufMapMode::OptionalDict: + return EFlag::MAP_AS_OPTIONAL_DICT; + } + Y_FAIL(); + } + EFlag::Enum operator() (EProtobufEnumWritingMode enumWritingMode) + { + switch (enumWritingMode) { + case EProtobufEnumWritingMode::SkipUnknownValues: + return EFlag::ENUM_SKIP_UNKNOWN_VALUES; + case EProtobufEnumWritingMode::CheckValues: + return EFlag::ENUM_CHECK_VALUES; + } + Y_FAIL(); + } + }; + + return std::visit(TVisitor(), option); +} + +EWrapperMessageFlag::Enum OptionToMessageFlag(TMessageOption option) +{ + using EFlag = EWrapperMessageFlag; + struct TVisitor + { + EFlag::Enum operator() (EProtobufFieldSortOrder sortOrder) + { + switch (sortOrder) { + case EProtobufFieldSortOrder::AsInProtoFile: + 
return EFlag::DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE; + case EProtobufFieldSortOrder::ByFieldNumber: + return EFlag::SORT_FIELDS_BY_FIELD_NUMBER; + } + Y_FAIL(); + } + }; + + return std::visit(TVisitor(), option); +} + +EWrapperOneofFlag::Enum OptionToOneofFlag(TOneofOption option) +{ + using EFlag = EWrapperOneofFlag; + struct TVisitor + { + EFlag::Enum operator() (EProtobufOneofMode mode) + { + switch (mode) { + case EProtobufOneofMode::SeparateFields: + return EFlag::SEPARATE_FIELDS; + case EProtobufOneofMode::Variant: + return EFlag::VARIANT; + } + Y_FAIL(); + } + }; + + return std::visit(TVisitor(), option); +} + + +template <typename T, typename TOptionToFlag> +void SetOption(TMaybe<T>& option, T newOption, TOptionToFlag optionToFlag) +{ + if (option) { + if (*option == newOption) { + ythrow yexception() << "Duplicate protobuf flag " << optionToFlag(newOption); + } else { + ythrow yexception() << "Incompatible protobuf flags " << + optionToFlag(*option) << " and " << optionToFlag(newOption); + } + } + option = newOption; +} + +class TParseProtobufFieldOptionsVisitor +{ +public: + void operator() (EProtobufType type) + { + SetOption(Type, type); + } + + void operator() (EProtobufSerializationMode serializationMode) + { + SetOption(SerializationMode, serializationMode); + } + + void operator() (EProtobufListMode listMode) + { + SetOption(ListMode, listMode); + } + + void operator() (EProtobufMapMode mapMode) + { + SetOption(MapMode, mapMode); + } + + void operator() (EProtobufEnumWritingMode enumWritingMode) + { + SetOption(EnumWritingMode, enumWritingMode); + } + + template <typename T> + void SetOption(TMaybe<T>& option, T newOption) + { + NYT::NDetail::SetOption(option, newOption, OptionToFieldFlag); + } + +public: + TMaybe<EProtobufType> Type; + TMaybe<EProtobufSerializationMode> SerializationMode; + TMaybe<EProtobufListMode> ListMode; + TMaybe<EProtobufMapMode> MapMode; + TMaybe<EProtobufEnumWritingMode> EnumWritingMode; +}; + +class 
TParseProtobufMessageOptionsVisitor +{ +public: + void operator() (EProtobufFieldSortOrder fieldSortOrder) + { + SetOption(FieldSortOrder, fieldSortOrder); + } + + template <typename T> + void SetOption(TMaybe<T>& option, T newOption) + { + NYT::NDetail::SetOption(option, newOption, OptionToMessageFlag); + } + +public: + TMaybe<EProtobufFieldSortOrder> FieldSortOrder; +}; + +class TParseProtobufOneofOptionsVisitor +{ +public: + void operator() (EProtobufOneofMode mode) + { + SetOption(Mode, mode); + } + + template <typename T> + void SetOption(TMaybe<T>& option, T newOption) + { + NYT::NDetail::SetOption(option, newOption, OptionToOneofFlag); + } + +public: + TMaybe<EProtobufOneofMode> Mode; +}; + +void ParseProtobufFieldOptions( + const ::google::protobuf::RepeatedField<EWrapperFieldFlag::Enum>& flags, + TProtobufFieldOptions* fieldOptions) +{ + TParseProtobufFieldOptionsVisitor visitor; + for (auto flag : flags) { + std::visit(visitor, FieldFlagToOption(flag)); + } + if (visitor.Type) { + fieldOptions->Type = *visitor.Type; + } + if (visitor.SerializationMode) { + fieldOptions->SerializationMode = *visitor.SerializationMode; + } + if (visitor.ListMode) { + fieldOptions->ListMode = *visitor.ListMode; + } + if (visitor.MapMode) { + fieldOptions->MapMode = *visitor.MapMode; + } +} + +void ParseProtobufMessageOptions( + const ::google::protobuf::RepeatedField<EWrapperMessageFlag::Enum>& flags, + TProtobufMessageOptions* messageOptions) +{ + TParseProtobufMessageOptionsVisitor visitor; + for (auto flag : flags) { + std::visit(visitor, MessageFlagToOption(flag)); + } + if (visitor.FieldSortOrder) { + messageOptions->FieldSortOrder = *visitor.FieldSortOrder; + } +} + +void ParseProtobufOneofOptions( + const ::google::protobuf::RepeatedField<EWrapperOneofFlag::Enum>& flags, + TProtobufOneofOptions* messageOptions) +{ + TParseProtobufOneofOptionsVisitor visitor; + for (auto flag : flags) { + std::visit(visitor, OneofFlagToOption(flag)); + } + if (visitor.Mode) { + 
messageOptions->Mode = *visitor.Mode; + } +} + +TProtobufFieldOptions GetDefaultFieldOptions( + const Descriptor* descriptor, + TProtobufFieldOptions defaultFieldOptions = {}) +{ + ParseProtobufFieldOptions( + descriptor->file()->options().GetRepeatedExtension(file_default_field_flags), + &defaultFieldOptions); + ParseProtobufFieldOptions( + descriptor->options().GetRepeatedExtension(default_field_flags), + &defaultFieldOptions); + return defaultFieldOptions; +} + +TProtobufOneofOptions GetDefaultOneofOptions(const Descriptor* descriptor) +{ + TProtobufOneofOptions defaultOneofOptions; + ParseProtobufOneofOptions( + descriptor->file()->options().GetRepeatedExtension(file_default_oneof_flags), + &defaultOneofOptions); + ParseProtobufOneofOptions( + descriptor->options().GetRepeatedExtension(default_oneof_flags), + &defaultOneofOptions); + switch (defaultOneofOptions.Mode) { + case EProtobufOneofMode::Variant: { + auto defaultFieldOptions = GetDefaultFieldOptions(descriptor); + switch (defaultFieldOptions.SerializationMode) { + case EProtobufSerializationMode::Protobuf: + // For Protobuf serialization mode default is SeparateFields. 
+ defaultOneofOptions.Mode = EProtobufOneofMode::SeparateFields; + return defaultOneofOptions; + case EProtobufSerializationMode::Yt: + case EProtobufSerializationMode::Embedded: + return defaultOneofOptions; + } + Y_FAIL(); + } + case EProtobufOneofMode::SeparateFields: + return defaultOneofOptions; + } + Y_FAIL(); +} + +//////////////////////////////////////////////////////////////////////////////// + +void ValidateProtobufType(const FieldDescriptor& fieldDescriptor, EProtobufType protobufType) +{ + const auto fieldType = fieldDescriptor.type(); + auto ensureType = [&] (FieldDescriptor::Type expectedType) { + Y_ENSURE(fieldType == expectedType, + "Type of field " << fieldDescriptor.name() << "does not match specified field flag " << + OptionToFieldFlag(protobufType) << ": " + "expected " << FieldDescriptor::TypeName(expectedType) << ", " << + "got " << FieldDescriptor::TypeName(fieldType)); + }; + switch (protobufType) { + case EProtobufType::Any: + ensureType(FieldDescriptor::TYPE_BYTES); + return; + case EProtobufType::OtherColumns: + ensureType(FieldDescriptor::TYPE_BYTES); + return; + case EProtobufType::EnumInt: + ensureType(FieldDescriptor::TYPE_ENUM); + return; + case EProtobufType::EnumString: + ensureType(FieldDescriptor::TYPE_ENUM); + return; + } + Y_FAIL(); +} + +//////////////////////////////////////////////////////////////////////////////// + +class TCycleChecker +{ +private: + class TGuard + { + public: + TGuard(TCycleChecker* checker, const Descriptor* descriptor) + : Checker_(checker) + , Descriptor_(descriptor) + { + Checker_->ActiveVertices_.insert(Descriptor_); + Checker_->Stack_.push(Descriptor_); + } + + ~TGuard() + { + Checker_->ActiveVertices_.erase(Descriptor_); + Checker_->Stack_.pop(); + } + + private: + TCycleChecker* Checker_; + const Descriptor* Descriptor_; + }; + +public: + [[nodiscard]] TGuard Enter(const Descriptor* descriptor) + { + if (ActiveVertices_.contains(descriptor)) { + Y_VERIFY(!Stack_.empty()); + ythrow TApiUsageError() 
<< "Cyclic reference found for protobuf messages. " << + "Consider removing " << EWrapperFieldFlag::SERIALIZATION_YT << " flag " << + "somewhere on the cycle containing " << + Stack_.top()->full_name() << " and " << descriptor->full_name(); + } + return TGuard(this, descriptor); + } + +private: + THashSet<const Descriptor*> ActiveVertices_; + TStack<const Descriptor*> Stack_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +TProtobufFieldOptions GetFieldOptions( + const FieldDescriptor* fieldDescriptor, + const TMaybe<TProtobufFieldOptions>& defaultFieldOptions) +{ + TProtobufFieldOptions options; + if (defaultFieldOptions) { + options = *defaultFieldOptions; + } else { + options = GetDefaultFieldOptions(fieldDescriptor->containing_type()); + } + ParseProtobufFieldOptions(fieldDescriptor->options().GetRepeatedExtension(flags), &options); + return options; +} + +TProtobufOneofOptions GetOneofOptions( + const OneofDescriptor* oneofDescriptor, + const TMaybe<TProtobufOneofOptions>& defaultOneofOptions) +{ + TProtobufOneofOptions options; + if (defaultOneofOptions) { + options = *defaultOneofOptions; + } else { + options = GetDefaultOneofOptions(oneofDescriptor->containing_type()); + } + ParseProtobufOneofOptions(oneofDescriptor->options().GetRepeatedExtension(oneof_flags), &options); + + if (oneofDescriptor->is_synthetic()) { + options.Mode = EProtobufOneofMode::SeparateFields; + } + + auto variantFieldName = oneofDescriptor->options().GetExtension(variant_field_name); + switch (options.Mode) { + case EProtobufOneofMode::SeparateFields: + if (variantFieldName) { + ythrow TApiUsageError() << "\"variant_field_name\" requires (NYT.oneof_flags) = VARIANT"; + } + break; + case EProtobufOneofMode::Variant: + if (variantFieldName) { + options.VariantFieldName = variantFieldName; + } else { + options.VariantFieldName = 
oneofDescriptor->name(); + } + break; + } + return options; +} + + +TProtobufMessageOptions GetMessageOptions(const Descriptor* descriptor) +{ + TProtobufMessageOptions options; + ParseProtobufMessageOptions( + descriptor->file()->options().GetRepeatedExtension(file_default_message_flags), + &options); + ParseProtobufMessageOptions( + descriptor->options().GetRepeatedExtension(message_flags), + &options); + return options; +} + +TNode MakeEnumerationConfig(const ::google::protobuf::EnumDescriptor* enumDescriptor) +{ + auto config = TNode::CreateMap(); + for (int i = 0; i < enumDescriptor->value_count(); ++i) { + config[enumDescriptor->value(i)->name()] = enumDescriptor->value(i)->number(); + } + return config; +} + +TString DeduceProtobufType( + const FieldDescriptor* fieldDescriptor, + const TProtobufFieldOptions& options) +{ + if (options.Type) { + ValidateProtobufType(*fieldDescriptor, *options.Type); + return ToString(*options.Type); + } + switch (fieldDescriptor->type()) { + case FieldDescriptor::TYPE_ENUM: + return ToString(EProtobufType::EnumString); + case FieldDescriptor::TYPE_MESSAGE: + switch (options.SerializationMode) { + case EProtobufSerializationMode::Protobuf: + return "message"; + case EProtobufSerializationMode::Yt: + return "structured_message"; + case EProtobufSerializationMode::Embedded: + return "embedded_message"; + } + Y_FAIL(); + default: + return fieldDescriptor->type_name(); + } + Y_FAIL(); +} + +TString GetColumnName(const ::google::protobuf::FieldDescriptor& field) +{ + const auto& options = field.options(); + const auto columnName = options.GetExtension(column_name); + if (!columnName.empty()) { + return columnName; + } + const auto keyColumnName = options.GetExtension(key_column_name); + if (!keyColumnName.empty()) { + return keyColumnName; + } + return field.name(); +} + +TNode MakeProtoFormatMessageFieldsConfig( + const Descriptor* descriptor, + TNode* enumerations, + TCycleChecker& cycleChecker); + +TNode 
MakeProtoFormatMessageFieldsConfig( + const Descriptor* descriptor, + TNode* enumerations, + const TProtobufFieldOptions& defaultFieldOptions, + const TProtobufOneofOptions& defaultOneofOptions, + TCycleChecker& cycleChecker); + +TNode MakeMapFieldsConfig( + const FieldDescriptor* fieldDescriptor, + TNode* enumerations, + const TProtobufFieldOptions& fieldOptions, + TCycleChecker& cycleChecker) +{ + Y_VERIFY(fieldDescriptor->is_map()); + auto message = fieldDescriptor->message_type(); + switch (fieldOptions.MapMode) { + case EProtobufMapMode::ListOfStructsLegacy: + return MakeProtoFormatMessageFieldsConfig( + message, + enumerations, + cycleChecker); + case EProtobufMapMode::ListOfStructs: + case EProtobufMapMode::Dict: + case EProtobufMapMode::OptionalDict: { + TProtobufFieldOptions defaultFieldOptions; + defaultFieldOptions.SerializationMode = EProtobufSerializationMode::Yt; + return MakeProtoFormatMessageFieldsConfig( + message, + enumerations, + defaultFieldOptions, + TProtobufOneofOptions{}, + cycleChecker); + } + } + Y_FAIL(); +} + +TNode MakeProtoFormatFieldConfig( + const FieldDescriptor* fieldDescriptor, + TNode* enumerations, + const TProtobufFieldOptions& defaultOptions, + TCycleChecker& cycleChecker) +{ + auto fieldConfig = TNode::CreateMap(); + fieldConfig["field_number"] = fieldDescriptor->number(); + fieldConfig["name"] = GetColumnName(*fieldDescriptor); + + auto fieldOptions = GetFieldOptions(fieldDescriptor, defaultOptions); + + Y_ENSURE(fieldOptions.SerializationMode != EProtobufSerializationMode::Embedded, + "EMBEDDED flag is currently supported only with " + "ProtobufFormatWithDescriptors config option set to true"); + + if (fieldDescriptor->is_repeated()) { + Y_ENSURE_EX(fieldOptions.SerializationMode == EProtobufSerializationMode::Yt, + TApiUsageError() << "Repeated field \"" << fieldDescriptor->full_name() << "\" " << + "must have flag \"" << EWrapperFieldFlag::SERIALIZATION_YT << "\""); + } + fieldConfig["repeated"] = 
fieldDescriptor->is_repeated(); + fieldConfig["packed"] = fieldDescriptor->is_packed(); + + fieldConfig["proto_type"] = DeduceProtobufType(fieldDescriptor, fieldOptions); + + if (fieldDescriptor->type() == FieldDescriptor::TYPE_ENUM) { + auto* enumeration = fieldDescriptor->enum_type(); + (*enumerations)[enumeration->full_name()] = MakeEnumerationConfig(enumeration); + fieldConfig["enumeration_name"] = enumeration->full_name(); + } + + if (fieldOptions.SerializationMode != EProtobufSerializationMode::Yt) { + return fieldConfig; + } + + if (fieldDescriptor->is_map()) { + fieldConfig["fields"] = MakeMapFieldsConfig(fieldDescriptor, enumerations, fieldOptions, cycleChecker); + return fieldConfig; + } + + if (fieldDescriptor->type() == FieldDescriptor::TYPE_MESSAGE) { + fieldConfig["fields"] = MakeProtoFormatMessageFieldsConfig( + fieldDescriptor->message_type(), + enumerations, + cycleChecker); + } + + return fieldConfig; +} + +void MakeProtoFormatOneofConfig( + const OneofDescriptor* oneofDescriptor, + TNode* enumerations, + const TProtobufFieldOptions& defaultFieldOptions, + const TProtobufOneofOptions& defaultOneofOptions, + TCycleChecker& cycleChecker, + TNode* fields) +{ + auto addFields = [&] (TNode* fields) { + for (int i = 0; i < oneofDescriptor->field_count(); ++i) { + fields->Add(MakeProtoFormatFieldConfig( + oneofDescriptor->field(i), + enumerations, + defaultFieldOptions, + cycleChecker)); + } + }; + + auto oneofOptions = GetOneofOptions(oneofDescriptor, defaultOneofOptions); + switch (oneofOptions.Mode) { + case EProtobufOneofMode::SeparateFields: + addFields(fields); + return; + case EProtobufOneofMode::Variant: { + auto oneofFields = TNode::CreateList(); + addFields(&oneofFields); + auto oneofField = TNode() + ("proto_type", "oneof") + ("name", oneofOptions.VariantFieldName) + ("fields", std::move(oneofFields)); + fields->Add(std::move(oneofField)); + return; + } + } + Y_FAIL(); +} + +TNode MakeProtoFormatMessageFieldsConfig( + const Descriptor* 
descriptor, + TNode* enumerations, + const TProtobufFieldOptions& defaultFieldOptions, + const TProtobufOneofOptions& defaultOneofOptions, + TCycleChecker& cycleChecker) +{ + auto fields = TNode::CreateList(); + THashSet<const OneofDescriptor*> visitedOneofs; + auto guard = cycleChecker.Enter(descriptor); + for (int fieldIndex = 0; fieldIndex < descriptor->field_count(); ++fieldIndex) { + auto fieldDescriptor = descriptor->field(fieldIndex); + auto oneofDescriptor = fieldDescriptor->containing_oneof(); + if (!oneofDescriptor) { + fields.Add(MakeProtoFormatFieldConfig( + fieldDescriptor, + enumerations, + defaultFieldOptions, + cycleChecker)); + } else if (!visitedOneofs.contains(oneofDescriptor)) { + MakeProtoFormatOneofConfig( + oneofDescriptor, + enumerations, + defaultFieldOptions, + defaultOneofOptions, + cycleChecker, + &fields); + visitedOneofs.insert(oneofDescriptor); + } + } + return fields; +} + +TNode MakeProtoFormatMessageFieldsConfig( + const Descriptor* descriptor, + TNode* enumerations, + TCycleChecker& cycleChecker) +{ + return MakeProtoFormatMessageFieldsConfig( + descriptor, + enumerations, + GetDefaultFieldOptions(descriptor), + GetDefaultOneofOptions(descriptor), + cycleChecker); +} + +TNode MakeProtoFormatConfigWithTables(const TVector<const Descriptor*>& descriptors) +{ + TNode config("protobuf"); + config.Attributes() + ("enumerations", TNode::CreateMap()) + ("tables", TNode::CreateList()); + + auto& enumerations = config.Attributes()["enumerations"]; + + for (auto* descriptor : descriptors) { + TCycleChecker cycleChecker; + auto columns = MakeProtoFormatMessageFieldsConfig(descriptor, &enumerations, cycleChecker); + config.Attributes()["tables"].Add( + TNode()("columns", std::move(columns))); + } + + return config; +} + +//////////////////////////////////////////////////////////////////////////////// + +class TFileDescriptorSetBuilder +{ +public: + TFileDescriptorSetBuilder() + : ExtensionFile_(EWrapperFieldFlag::descriptor()->file()) + { } + 
+ void AddDescriptor(const Descriptor* descriptor) + { + auto [it, inserted] = AllDescriptors_.insert(descriptor); + if (!inserted) { + return; + } + + const auto* containingType = descriptor->containing_type(); + while (containingType) { + AddDescriptor(containingType); + containingType = containingType->containing_type(); + } + for (int i = 0; i < descriptor->field_count(); ++i) { + AddField(descriptor->field(i)); + } + } + + FileDescriptorSet Build() + { + THashSet<const FileDescriptor*> visitedFiles; + TVector<const FileDescriptor*> fileTopoOrder; + for (const auto* descriptor : AllDescriptors_) { + TraverseDependencies(descriptor->file(), visitedFiles, fileTopoOrder); + } + + THashSet<TString> messageTypeNames; + THashSet<TString> enumTypeNames; + for (const auto* descriptor : AllDescriptors_) { + messageTypeNames.insert(descriptor->full_name()); + } + for (const auto* enumDescriptor : EnumDescriptors_) { + enumTypeNames.insert(enumDescriptor->full_name()); + } + FileDescriptorSet fileDescriptorSetProto; + for (const auto* file : fileTopoOrder) { + auto* fileProto = fileDescriptorSetProto.add_file(); + file->CopyTo(fileProto); + Strip(fileProto, messageTypeNames, enumTypeNames); + } + return fileDescriptorSetProto; + } + +private: + void AddField(const FieldDescriptor* fieldDescriptor) + { + if (fieldDescriptor->message_type()) { + AddDescriptor(fieldDescriptor->message_type()); + } + if (fieldDescriptor->enum_type()) { + AddEnumDescriptor(fieldDescriptor->enum_type()); + } + } + + void AddEnumDescriptor(const EnumDescriptor* enumDescriptor) + { + auto [it, inserted] = EnumDescriptors_.insert(enumDescriptor); + if (!inserted) { + return; + } + const auto* containingType = enumDescriptor->containing_type(); + while (containingType) { + AddDescriptor(containingType); + containingType = containingType->containing_type(); + } + } + + void TraverseDependencies( + const FileDescriptor* current, + THashSet<const FileDescriptor*>& visited, + TVector<const 
FileDescriptor*>& topoOrder) + { + auto [it, inserted] = visited.insert(current); + if (!inserted) { + return; + } + for (int i = 0; i < current->dependency_count(); ++i) { + TraverseDependencies(current->dependency(i), visited, topoOrder); + } + topoOrder.push_back(current); + } + + template <typename TOptions> + void StripUnknownOptions(TOptions* options) + { + std::vector<const FieldDescriptor*> fields; + auto reflection = options->GetReflection(); + reflection->ListFields(*options, &fields); + for (auto field : fields) { + if (field->is_extension() && field->file() != ExtensionFile_) { + reflection->ClearField(options, field); + } + } + } + + template <typename TRepeatedField, typename TPredicate> + void RemoveIf(TRepeatedField* repeatedField, TPredicate predicate) + { + repeatedField->erase( + std::remove_if(repeatedField->begin(), repeatedField->end(), predicate), + repeatedField->end()); + } + + void Strip( + const TString& containingTypePrefix, + DescriptorProto* messageProto, + const THashSet<TString>& messageTypeNames, + const THashSet<TString>& enumTypeNames) + { + const auto prefix = containingTypePrefix + messageProto->name() + '.'; + + RemoveIf(messageProto->mutable_nested_type(), [&] (const DescriptorProto& descriptorProto) { + return !messageTypeNames.contains(prefix + descriptorProto.name()); + }); + RemoveIf(messageProto->mutable_enum_type(), [&] (const EnumDescriptorProto& enumDescriptorProto) { + return !enumTypeNames.contains(prefix + enumDescriptorProto.name()); + }); + + messageProto->clear_extension(); + StripUnknownOptions(messageProto->mutable_options()); + for (auto& fieldProto : *messageProto->mutable_field()) { + StripUnknownOptions(fieldProto.mutable_options()); + } + for (auto& oneofProto : *messageProto->mutable_oneof_decl()) { + StripUnknownOptions(oneofProto.mutable_options()); + } + for (auto& nestedTypeProto : *messageProto->mutable_nested_type()) { + Strip(prefix, &nestedTypeProto, messageTypeNames, enumTypeNames); + } + for 
(auto& enumProto : *messageProto->mutable_enum_type()) { + StripUnknownOptions(enumProto.mutable_options()); + for (auto& enumValue : *enumProto.mutable_value()) { + StripUnknownOptions(enumValue.mutable_options()); + } + } + } + + void Strip( + FileDescriptorProto* fileProto, + const THashSet<TString>& messageTypeNames, + const THashSet<TString>& enumTypeNames) + { + const auto prefix = fileProto->package().Empty() + ? "" + : fileProto->package() + '.'; + + RemoveIf(fileProto->mutable_message_type(), [&] (const DescriptorProto& descriptorProto) { + return !messageTypeNames.contains(prefix + descriptorProto.name()); + }); + RemoveIf(fileProto->mutable_enum_type(), [&] (const EnumDescriptorProto& enumDescriptorProto) { + return !enumTypeNames.contains(prefix + enumDescriptorProto.name()); + }); + + fileProto->clear_service(); + fileProto->clear_extension(); + + StripUnknownOptions(fileProto->mutable_options()); + for (auto& messageProto : *fileProto->mutable_message_type()) { + Strip(prefix, &messageProto, messageTypeNames, enumTypeNames); + } + for (auto& enumProto : *fileProto->mutable_enum_type()) { + StripUnknownOptions(enumProto.mutable_options()); + for (auto& enumValue : *enumProto.mutable_value()) { + StripUnknownOptions(enumValue.mutable_options()); + } + } + } + +private: + const FileDescriptor* const ExtensionFile_; + THashSet<const Descriptor*> AllDescriptors_; + THashSet<const EnumDescriptor*> EnumDescriptors_; +}; + +TNode MakeProtoFormatConfigWithDescriptors(const TVector<const Descriptor*>& descriptors) +{ + TFileDescriptorSetBuilder builder; + auto typeNames = TNode::CreateList(); + for (const auto* descriptor : descriptors) { + builder.AddDescriptor(descriptor); + typeNames.Add(descriptor->full_name()); + } + + auto fileDescriptorSetText = builder.Build().ShortDebugString(); + TNode config("protobuf"); + config.Attributes() + ("file_descriptor_set_text", std::move(fileDescriptorSetText)) + ("type_names", std::move(typeNames)); + return config; +} + 
+//////////////////////////////////////////////////////////////////////////////// + +using TTypePtrOrOtherColumns = std::variant<NTi::TTypePtr, TOtherColumns>; + +struct TMember { + TString Name; + TTypePtrOrOtherColumns TypeOrOtherColumns; +}; + +//////////////////////////////////////////////////////////////////////////////// + +TValueTypeOrOtherColumns GetScalarFieldType( + const FieldDescriptor& fieldDescriptor, + const TProtobufFieldOptions& options) +{ + if (options.Type) { + switch (*options.Type) { + case EProtobufType::EnumInt: + return EValueType::VT_INT64; + case EProtobufType::EnumString: + return EValueType::VT_STRING; + case EProtobufType::Any: + return EValueType::VT_ANY; + case EProtobufType::OtherColumns: + return TOtherColumns{}; + } + Y_FAIL(); + } + + switch (fieldDescriptor.cpp_type()) { + case FieldDescriptor::CPPTYPE_INT32: + return EValueType::VT_INT32; + case FieldDescriptor::CPPTYPE_INT64: + return EValueType::VT_INT64; + case FieldDescriptor::CPPTYPE_UINT32: + return EValueType::VT_UINT32; + case FieldDescriptor::CPPTYPE_UINT64: + return EValueType::VT_UINT64; + case FieldDescriptor::CPPTYPE_FLOAT: + case FieldDescriptor::CPPTYPE_DOUBLE: + return EValueType::VT_DOUBLE; + case FieldDescriptor::CPPTYPE_BOOL: + return EValueType::VT_BOOLEAN; + case FieldDescriptor::CPPTYPE_STRING: + case FieldDescriptor::CPPTYPE_MESSAGE: + case FieldDescriptor::CPPTYPE_ENUM: + return EValueType::VT_STRING; + default: + ythrow yexception() << + "Unexpected field type '" << fieldDescriptor.cpp_type_name() << "' " << + "for field " << fieldDescriptor.name(); + } +} + +bool HasNameExtension(const FieldDescriptor& fieldDescriptor) +{ + const auto& options = fieldDescriptor.options(); + return options.HasExtension(column_name) || options.HasExtension(key_column_name); +} + +void SortFields(TVector<const FieldDescriptor*>& fieldDescriptors, EProtobufFieldSortOrder fieldSortOrder) +{ + switch (fieldSortOrder) { + case EProtobufFieldSortOrder::AsInProtoFile: + return; 
+ case EProtobufFieldSortOrder::ByFieldNumber: + SortBy(fieldDescriptors, [] (const FieldDescriptor* fieldDescriptor) { + return fieldDescriptor->number(); + }); + return; + } + Y_FAIL(); +} + +NTi::TTypePtr CreateStruct(TStringBuf fieldName, TVector<TMember> members) +{ + TVector<NTi::TStructType::TOwnedMember> structMembers; + structMembers.reserve(members.size()); + for (auto& member : members) { + std::visit(TOverloaded{ + [&] (TOtherColumns) { + ythrow TApiUsageError() << + "Could not deduce YT type for field " << member.Name << " of " << + "embedded message field " << fieldName << " " << + "(note that " << EWrapperFieldFlag::OTHER_COLUMNS << " fields " << + "are not allowed inside embedded messages)"; + }, + [&] (NTi::TTypePtr& type) { + structMembers.emplace_back(std::move(member.Name), std::move(type)); + }, + }, member.TypeOrOtherColumns); + } + return NTi::Struct(std::move(structMembers)); +} + +TMaybe<TVector<TString>> InferColumnFilter(const ::google::protobuf::Descriptor& descriptor) +{ + auto isOtherColumns = [] (const ::google::protobuf::FieldDescriptor& field) { + return GetFieldOptions(&field).Type == EProtobufType::OtherColumns; + }; + + TVector<TString> result; + result.reserve(descriptor.field_count()); + for (int i = 0; i < descriptor.field_count(); ++i) { + const auto& field = *descriptor.field(i); + if (isOtherColumns(field)) { + return {}; + } + result.push_back(GetColumnName(field)); + } + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +class TTableSchemaInferrer +{ +public: + TTableSchemaInferrer(bool keepFieldsWithoutExtension) + : KeepFieldsWithoutExtension_(keepFieldsWithoutExtension) + { } + + TTableSchema InferSchema(const Descriptor& messageDescriptor); + +private: + TTypePtrOrOtherColumns GetFieldType( + const FieldDescriptor& fieldDescriptor, + const TProtobufFieldOptions& defaultOptions); + + void ProcessOneofField( + TStringBuf containingFieldName, + const OneofDescriptor& 
oneofDescriptor, + const TProtobufFieldOptions& defaultFieldOptions, + const TProtobufOneofOptions& defaultOneofOptions, + EProtobufFieldSortOrder fieldSortOrder, + TVector<TMember>* members); + + TVector<TMember> GetMessageMembers( + TStringBuf containingFieldName, + const Descriptor& fieldDescriptor, + TProtobufFieldOptions defaultFieldOptions, + std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder = std::nullopt); + + NTi::TTypePtr GetMessageType( + const FieldDescriptor& fieldDescriptor, + TProtobufFieldOptions defaultFieldOptions); + + NTi::TTypePtr GetMapType( + const FieldDescriptor& fieldDescriptor, + const TProtobufFieldOptions& fieldOptions); + +private: + void GetMessageMembersImpl( + TStringBuf containingFieldName, + const Descriptor& fieldDescriptor, + TProtobufFieldOptions defaultFieldOptions, + std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder, + TVector<TMember>* members); + +private: + const bool KeepFieldsWithoutExtension_; + TCycleChecker CycleChecker_; +}; + +void TTableSchemaInferrer::ProcessOneofField( + TStringBuf containingFieldName, + const OneofDescriptor& oneofDescriptor, + const TProtobufFieldOptions& defaultFieldOptions, + const TProtobufOneofOptions& defaultOneofOptions, + EProtobufFieldSortOrder fieldSortOrder, + TVector<TMember>* members) +{ + auto oneofOptions = GetOneofOptions(&oneofDescriptor, defaultOneofOptions); + + auto addFields = [&] (TVector<TMember>* members, bool removeOptionality) { + TVector<const FieldDescriptor*> fieldDescriptors; + for (int i = 0; i < oneofDescriptor.field_count(); ++i) { + fieldDescriptors.push_back(oneofDescriptor.field(i)); + } + SortFields(fieldDescriptors, fieldSortOrder); + for (auto innerFieldDescriptor : fieldDescriptors) { + auto typeOrOtherColumns = GetFieldType( + *innerFieldDescriptor, + defaultFieldOptions); + if (auto* maybeType = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns); + maybeType && removeOptionality && (*maybeType)->IsOptional()) + { + typeOrOtherColumns 
= (*maybeType)->AsOptional()->GetItemType(); + } + members->push_back(TMember{ + GetColumnName(*innerFieldDescriptor), + std::move(typeOrOtherColumns), + }); + } + }; + + switch (oneofOptions.Mode) { + case EProtobufOneofMode::SeparateFields: + addFields(members, /* removeOptionality */ false); + return; + case EProtobufOneofMode::Variant: { + TVector<TMember> variantMembers; + addFields(&variantMembers, /* removeOptionality */ true); + members->push_back(TMember{ + oneofOptions.VariantFieldName, + NTi::Optional( + NTi::Variant( + CreateStruct(containingFieldName, std::move(variantMembers)) + ) + ) + }); + return; + } + } + Y_FAIL(); +} + +TVector<TMember> TTableSchemaInferrer::GetMessageMembers( + TStringBuf containingFieldName, + const Descriptor& messageDescriptor, + TProtobufFieldOptions defaultFieldOptions, + std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder) +{ + TVector<TMember> members; + GetMessageMembersImpl( + containingFieldName, + messageDescriptor, + defaultFieldOptions, + overrideFieldSortOrder, + &members + ); + return members; +} + +void TTableSchemaInferrer::GetMessageMembersImpl( + TStringBuf containingFieldName, + const Descriptor& messageDescriptor, + TProtobufFieldOptions defaultFieldOptions, + std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder, + TVector<TMember>* members) +{ + auto guard = CycleChecker_.Enter(&messageDescriptor); + defaultFieldOptions = GetDefaultFieldOptions(&messageDescriptor, defaultFieldOptions); + auto messageOptions = GetMessageOptions(&messageDescriptor); + auto defaultOneofOptions = GetDefaultOneofOptions(&messageDescriptor); + + TVector<const FieldDescriptor*> fieldDescriptors; + fieldDescriptors.reserve(messageDescriptor.field_count()); + for (int i = 0; i < messageDescriptor.field_count(); ++i) { + if (!KeepFieldsWithoutExtension_ && !HasNameExtension(*messageDescriptor.field(i))) { + continue; + } + fieldDescriptors.push_back(messageDescriptor.field(i)); + } + + auto fieldSortOrder = 
overrideFieldSortOrder.value_or(messageOptions.FieldSortOrder); + SortFields(fieldDescriptors, fieldSortOrder); + + THashSet<const OneofDescriptor*> visitedOneofs; + for (const auto innerFieldDescriptor : fieldDescriptors) { + auto oneofDescriptor = innerFieldDescriptor->containing_oneof(); + if (oneofDescriptor) { + if (visitedOneofs.contains(oneofDescriptor)) { + continue; + } + ProcessOneofField( + containingFieldName, + *oneofDescriptor, + defaultFieldOptions, + defaultOneofOptions, + messageOptions.FieldSortOrder, + members); + visitedOneofs.insert(oneofDescriptor); + continue; + } + auto fieldOptions = GetFieldOptions(innerFieldDescriptor, defaultFieldOptions); + if (fieldOptions.SerializationMode == EProtobufSerializationMode::Embedded) { + Y_ENSURE(innerFieldDescriptor->type() == FieldDescriptor::TYPE_MESSAGE, + "EMBEDDED column must have message type"); + Y_ENSURE(innerFieldDescriptor->label() == FieldDescriptor::LABEL_REQUIRED, + "EMBEDDED column must be marked required"); + GetMessageMembersImpl( + innerFieldDescriptor->full_name(), + *innerFieldDescriptor->message_type(), + defaultFieldOptions, + /*overrideFieldSortOrder*/ std::nullopt, + members); + } else { + auto typeOrOtherColumns = GetFieldType( + *innerFieldDescriptor, + defaultFieldOptions); + members->push_back(TMember{ + GetColumnName(*innerFieldDescriptor), + std::move(typeOrOtherColumns), + }); + } + } +} + +NTi::TTypePtr TTableSchemaInferrer::GetMessageType( + const FieldDescriptor& fieldDescriptor, + TProtobufFieldOptions defaultFieldOptions) +{ + Y_VERIFY(fieldDescriptor.message_type()); + const auto& messageDescriptor = *fieldDescriptor.message_type(); + auto members = GetMessageMembers( + fieldDescriptor.full_name(), + messageDescriptor, + defaultFieldOptions); + + return CreateStruct(fieldDescriptor.full_name(), std::move(members)); +} + +NTi::TTypePtr TTableSchemaInferrer::GetMapType( + const FieldDescriptor& fieldDescriptor, + const TProtobufFieldOptions& fieldOptions) +{ + 
Y_VERIFY(fieldDescriptor.is_map()); + switch (fieldOptions.MapMode) { + case EProtobufMapMode::ListOfStructsLegacy: + case EProtobufMapMode::ListOfStructs: { + TProtobufFieldOptions embeddedOptions; + if (fieldOptions.MapMode == EProtobufMapMode::ListOfStructs) { + embeddedOptions.SerializationMode = EProtobufSerializationMode::Yt; + } + auto list = NTi::List(GetMessageType(fieldDescriptor, embeddedOptions)); + switch (fieldOptions.ListMode) { + case EProtobufListMode::Required: + return list; + case EProtobufListMode::Optional: + return NTi::Optional(std::move(list)); + } + Y_FAIL(); + } + case EProtobufMapMode::Dict: + case EProtobufMapMode::OptionalDict: { + auto message = fieldDescriptor.message_type(); + Y_VERIFY(message->field_count() == 2); + auto keyVariant = GetScalarFieldType(*message->field(0), TProtobufFieldOptions{}); + Y_VERIFY(std::holds_alternative<EValueType>(keyVariant)); + auto key = std::get<EValueType>(keyVariant); + TProtobufFieldOptions embeddedOptions; + embeddedOptions.SerializationMode = EProtobufSerializationMode::Yt; + auto valueVariant = GetFieldType(*message->field(1), embeddedOptions); + Y_VERIFY(std::holds_alternative<NTi::TTypePtr>(valueVariant)); + auto value = std::get<NTi::TTypePtr>(valueVariant); + Y_VERIFY(value->IsOptional()); + value = value->AsOptional()->GetItemType(); + auto dict = NTi::Dict(ToTypeV3(key, true), value); + if (fieldOptions.MapMode == EProtobufMapMode::OptionalDict) { + return NTi::Optional(dict); + } else { + return dict; + } + } + } +} + +TTypePtrOrOtherColumns TTableSchemaInferrer::GetFieldType( + const FieldDescriptor& fieldDescriptor, + const TProtobufFieldOptions& defaultOptions) +{ + auto fieldOptions = GetFieldOptions(&fieldDescriptor, defaultOptions); + if (fieldOptions.Type) { + ValidateProtobufType(fieldDescriptor, *fieldOptions.Type); + } + + auto getScalarType = [&] { + auto valueTypeOrOtherColumns = GetScalarFieldType(fieldDescriptor, fieldOptions); + return std::visit(TOverloaded{ + [] 
(TOtherColumns) -> TTypePtrOrOtherColumns { + return TOtherColumns{}; + }, + [] (EValueType valueType) -> TTypePtrOrOtherColumns { + return ToTypeV3(valueType, true); + } + }, valueTypeOrOtherColumns); + }; + + auto withFieldLabel = [&] (const TTypePtrOrOtherColumns& typeOrOtherColumns) -> TTypePtrOrOtherColumns { + switch (fieldDescriptor.label()) { + case FieldDescriptor::Label::LABEL_REPEATED: { + Y_ENSURE(fieldOptions.SerializationMode == EProtobufSerializationMode::Yt, + "Repeated fields are supported only for YT serialization mode, field \"" + fieldDescriptor.full_name() + + "\" has incorrect serialization mode"); + auto* type = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns); + Y_ENSURE(type, "OTHER_COLUMNS field can not be repeated"); + switch (fieldOptions.ListMode) { + case EProtobufListMode::Required: + return NTi::TTypePtr(NTi::List(*type)); + case EProtobufListMode::Optional: + return NTi::TTypePtr(NTi::Optional(NTi::List(*type))); + } + Y_FAIL(); + } + case FieldDescriptor::Label::LABEL_OPTIONAL: + return std::visit(TOverloaded{ + [] (TOtherColumns) -> TTypePtrOrOtherColumns { + return TOtherColumns{}; + }, + [] (NTi::TTypePtr type) -> TTypePtrOrOtherColumns { + return NTi::TTypePtr(NTi::Optional(std::move(type))); + } + }, typeOrOtherColumns); + case FieldDescriptor::LABEL_REQUIRED: { + auto* type = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns); + Y_ENSURE(type, "OTHER_COLUMNS field can not be required"); + return *type; + } + } + Y_FAIL(); + }; + + switch (fieldOptions.SerializationMode) { + case EProtobufSerializationMode::Protobuf: + return withFieldLabel(getScalarType()); + case EProtobufSerializationMode::Yt: + if (fieldDescriptor.type() == FieldDescriptor::TYPE_MESSAGE) { + if (fieldDescriptor.is_map()) { + return GetMapType(fieldDescriptor, fieldOptions); + } else { + return withFieldLabel(GetMessageType(fieldDescriptor, TProtobufFieldOptions{})); + } + } else { + return withFieldLabel(getScalarType()); + } + case 
EProtobufSerializationMode::Embedded: + ythrow yexception() << "EMBEDDED field is not allowed for field " + << fieldDescriptor.full_name(); + } + Y_FAIL(); +} + +TTableSchema TTableSchemaInferrer::InferSchema(const Descriptor& messageDescriptor) +{ + TTableSchema result; + + auto defaultFieldOptions = GetDefaultFieldOptions(&messageDescriptor); + auto members = GetMessageMembers( + messageDescriptor.full_name(), + messageDescriptor, + defaultFieldOptions, + // Use special sort order for top level messages. + /*overrideFieldSortOrder*/ EProtobufFieldSortOrder::AsInProtoFile); + + for (auto& member : members) { + std::visit(TOverloaded{ + [&] (TOtherColumns) { + result.Strict(false); + }, + [&] (NTi::TTypePtr& type) { + result.AddColumn(TColumnSchema() + .Name(std::move(member.Name)) + .Type(std::move(type)) + ); + }, + }, member.TypeOrOtherColumns); + } + + return result; +} + +TTableSchema CreateTableSchemaImpl( + const Descriptor& messageDescriptor, + bool keepFieldsWithoutExtension) +{ + TTableSchemaInferrer inferrer(keepFieldsWithoutExtension); + return inferrer.InferSchema(messageDescriptor); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail + +//////////////////////////////////////////////////////////////////////////////// + +template <> +void Out<NYT::EWrapperFieldFlag::Enum>(IOutputStream& stream, NYT::EWrapperFieldFlag::Enum value) +{ + stream << NYT::EWrapperFieldFlag_Enum_Name(value); +} + +template <> +void Out<NYT::EWrapperMessageFlag::Enum>(IOutputStream& stream, NYT::EWrapperMessageFlag::Enum value) +{ + stream << NYT::EWrapperMessageFlag_Enum_Name(value); +} + +template <> +void Out<NYT::EWrapperOneofFlag::Enum>(IOutputStream& stream, NYT::EWrapperOneofFlag::Enum value) +{ + stream << NYT::EWrapperOneofFlag_Enum_Name(value); +} diff --git a/yt/cpp/mapreduce/interface/protobuf_format.h b/yt/cpp/mapreduce/interface/protobuf_format.h new file mode 100644 index 0000000000..aafbced386 --- 
/dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_format.h @@ -0,0 +1,106 @@ +#pragma once + +#include "common.h" + +#include <yt/yt_proto/yt/formats/extension.pb.h> + +#include <util/generic/maybe.h> + +#include <google/protobuf/message.h> + +/// @cond Doxygen_Suppress +namespace NYT::NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +enum class EProtobufType +{ + EnumInt /* "enum_int" */, + EnumString /* "enum_string" */, + Any /* "any" */, + OtherColumns /* "other_columns" */, +}; + +enum class EProtobufSerializationMode +{ + Protobuf, + Yt, + Embedded, +}; + +enum class EProtobufListMode +{ + Optional, + Required, +}; + +enum class EProtobufMapMode +{ + ListOfStructsLegacy, + ListOfStructs, + Dict, + OptionalDict, +}; + +enum class EProtobufFieldSortOrder +{ + AsInProtoFile, + ByFieldNumber, +}; + +enum class EProtobufOneofMode +{ + SeparateFields, + Variant, +}; + +enum class EProtobufEnumWritingMode +{ + SkipUnknownValues, + CheckValues, +}; + +struct TProtobufOneofOptions +{ + EProtobufOneofMode Mode = EProtobufOneofMode::Variant; + TString VariantFieldName; +}; + +struct TProtobufFieldOptions +{ + TMaybe<EProtobufType> Type; + EProtobufSerializationMode SerializationMode = EProtobufSerializationMode::Protobuf; + EProtobufListMode ListMode = EProtobufListMode::Required; + EProtobufMapMode MapMode = EProtobufMapMode::ListOfStructsLegacy; +}; + +struct TProtobufMessageOptions +{ + EProtobufFieldSortOrder FieldSortOrder = EProtobufFieldSortOrder::ByFieldNumber; +}; + +TString GetColumnName(const ::google::protobuf::FieldDescriptor& field); + +TProtobufFieldOptions GetFieldOptions( + const ::google::protobuf::FieldDescriptor* fieldDescriptor, + const TMaybe<TProtobufFieldOptions>& defaultFieldOptions = {}); + +TProtobufOneofOptions GetOneofOptions( + const ::google::protobuf::OneofDescriptor* oneofDescriptor, + const TMaybe<TProtobufOneofOptions>& defaultOneofOptions = {}); + +TProtobufMessageOptions 
GetMessageOptions(const ::google::protobuf::Descriptor* descriptor); + +TMaybe<TVector<TString>> InferColumnFilter(const ::google::protobuf::Descriptor& descriptor); + +TNode MakeProtoFormatConfigWithTables(const TVector<const ::google::protobuf::Descriptor*>& descriptors); +TNode MakeProtoFormatConfigWithDescriptors(const TVector<const ::google::protobuf::Descriptor*>& descriptors); + +TTableSchema CreateTableSchemaImpl( + const ::google::protobuf::Descriptor& messageDescriptor, + bool keepFieldsWithoutExtension); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail +/// @endcond diff --git a/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.cpp b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.cpp new file mode 100644 index 0000000000..19a3d5163f --- /dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.cpp @@ -0,0 +1,451 @@ +#include "common.h" +#include "errors.h" +#include "common_ut.h" +#include "util/generic/fwd.h" + +#include <yt/cpp/mapreduce/interface/protobuf_table_schema_ut.pb.h> +#include <yt/cpp/mapreduce/interface/proto3_ut.pb.h> + +#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <algorithm> + +using namespace NYT; + +bool IsFieldPresent(const TTableSchema& schema, TStringBuf name) +{ + for (const auto& field : schema.Columns()) { + if (field.Name() == name) { + return true; + } + } + return false; +} + +Y_UNIT_TEST_SUITE(ProtoSchemaTest_Simple) +{ + Y_UNIT_TEST(TIntegral) + { + const auto schema = CreateTableSchema<NUnitTesting::TIntegral>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("DoubleField").Type(ToTypeV3(EValueType::VT_DOUBLE, false))) + .AddColumn(TColumnSchema().Name("FloatField").Type(ToTypeV3(EValueType::VT_DOUBLE, false))) + .AddColumn(TColumnSchema().Name("Int32Field").Type(ToTypeV3(EValueType::VT_INT32, false))) + 
.AddColumn(TColumnSchema().Name("Int64Field").Type(ToTypeV3(EValueType::VT_INT64, false))) + .AddColumn(TColumnSchema().Name("Uint32Field").Type(ToTypeV3(EValueType::VT_UINT32, false))) + .AddColumn(TColumnSchema().Name("Uint64Field").Type(ToTypeV3(EValueType::VT_UINT64, false))) + .AddColumn(TColumnSchema().Name("Sint32Field").Type(ToTypeV3(EValueType::VT_INT32, false))) + .AddColumn(TColumnSchema().Name("Sint64Field").Type(ToTypeV3(EValueType::VT_INT64, false))) + .AddColumn(TColumnSchema().Name("Fixed32Field").Type(ToTypeV3(EValueType::VT_UINT32, false))) + .AddColumn(TColumnSchema().Name("Fixed64Field").Type(ToTypeV3(EValueType::VT_UINT64, false))) + .AddColumn(TColumnSchema().Name("Sfixed32Field").Type(ToTypeV3(EValueType::VT_INT32, false))) + .AddColumn(TColumnSchema().Name("Sfixed64Field").Type(ToTypeV3(EValueType::VT_INT64, false))) + .AddColumn(TColumnSchema().Name("BoolField").Type(ToTypeV3(EValueType::VT_BOOLEAN, false))) + .AddColumn(TColumnSchema().Name("EnumField").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(TOneOf) + { + const auto schema = CreateTableSchema<NUnitTesting::TOneOf>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("DoubleField").Type(ToTypeV3(EValueType::VT_DOUBLE, false))) + .AddColumn(TColumnSchema().Name("Int32Field").Type(ToTypeV3(EValueType::VT_INT32, false))) + .AddColumn(TColumnSchema().Name("BoolField").Type(ToTypeV3(EValueType::VT_BOOLEAN, false)))); + } + + Y_UNIT_TEST(TWithRequired) + { + const auto schema = CreateTableSchema<NUnitTesting::TWithRequired>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("RequiredField").Type(ToTypeV3(EValueType::VT_STRING, true))) + .AddColumn(TColumnSchema().Name("NotRequiredField").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(TAggregated) + { + const auto schema = CreateTableSchema<NUnitTesting::TAggregated>(); + + UNIT_ASSERT_VALUES_EQUAL(6, 
schema.Columns().size()); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("StringField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("BytesField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("NestedField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("NestedRepeatedField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("NestedOneOfField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("NestedRecursiveField").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(TAliased) + { + const auto schema = CreateTableSchema<NUnitTesting::TAliased>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("key").Type(ToTypeV3(EValueType::VT_INT32, false))) + .AddColumn(TColumnSchema().Name("subkey").Type(ToTypeV3(EValueType::VT_DOUBLE, false))) + .AddColumn(TColumnSchema().Name("Data").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(SortColumns) + { + const TSortColumns keys = {"key", "subkey"}; + + const auto schema = CreateTableSchema<NUnitTesting::TAliased>(keys); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("key") + .Type(ToTypeV3(EValueType::VT_INT32, false)) + .SortOrder(ESortOrder::SO_ASCENDING)) + .AddColumn(TColumnSchema() + .Name("subkey") + .Type(ToTypeV3(EValueType::VT_DOUBLE, false)) + .SortOrder(ESortOrder::SO_ASCENDING)) + .AddColumn(TColumnSchema().Name("Data").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(SortColumnsReordered) + { + const TSortColumns keys = {"subkey"}; + + const auto schema = CreateTableSchema<NUnitTesting::TAliased>(keys); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("subkey") + .Type(ToTypeV3(EValueType::VT_DOUBLE, false)) + 
.SortOrder(ESortOrder::SO_ASCENDING)) + .AddColumn(TColumnSchema().Name("key").Type(ToTypeV3(EValueType::VT_INT32, false))) + .AddColumn(TColumnSchema().Name("Data").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(SortColumnsInvalid) + { + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TAliased>({"subkey", "subkey"}), yexception); + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TAliased>({"key", "junk"}), yexception); + } + + Y_UNIT_TEST(KeepFieldsWithoutExtensionTrue) + { + const auto schema = CreateTableSchema<NUnitTesting::TAliased>({}, true); + UNIT_ASSERT(IsFieldPresent(schema, "key")); + UNIT_ASSERT(IsFieldPresent(schema, "subkey")); + UNIT_ASSERT(IsFieldPresent(schema, "Data")); + UNIT_ASSERT(schema.Strict()); + } + + Y_UNIT_TEST(KeepFieldsWithoutExtensionFalse) + { + const auto schema = CreateTableSchema<NUnitTesting::TAliased>({}, false); + UNIT_ASSERT(IsFieldPresent(schema, "key")); + UNIT_ASSERT(IsFieldPresent(schema, "subkey")); + UNIT_ASSERT(!IsFieldPresent(schema, "Data")); + UNIT_ASSERT(schema.Strict()); + } + + Y_UNIT_TEST(ProtobufTypeOption) + { + const auto schema = CreateTableSchema<NUnitTesting::TWithTypeOptions>({}); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .Strict(false) + .AddColumn(TColumnSchema().Name("ColorIntField").Type(ToTypeV3(EValueType::VT_INT64, false))) + .AddColumn(TColumnSchema().Name("ColorStringField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("AnyField").Type(ToTypeV3(EValueType::VT_ANY, false))) + .AddColumn(TColumnSchema().Name("EmbeddedField").Type( + NTi::Optional(NTi::Struct({ + {"ColorIntField", ToTypeV3(EValueType::VT_INT64, false)}, + {"ColorStringField", ToTypeV3(EValueType::VT_STRING, false)}, + {"AnyField", ToTypeV3(EValueType::VT_ANY, false)}})))) + .AddColumn(TColumnSchema().Name("RepeatedEnumIntField").Type(NTi::List(NTi::Int64())))); + } + + Y_UNIT_TEST(ProtobufTypeOption_TypeMismatch) + { + UNIT_ASSERT_EXCEPTION( + 
CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_EnumInt>({}), + yexception); + UNIT_ASSERT_EXCEPTION( + CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_EnumString>({}), + yexception); + UNIT_ASSERT_EXCEPTION( + CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_Any>({}), + yexception); + UNIT_ASSERT_EXCEPTION( + CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_OtherColumns>({}), + yexception); + } +} + +Y_UNIT_TEST_SUITE(ProtoSchemaTest_Complex) +{ + Y_UNIT_TEST(TRepeated) + { + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TRepeated>(), yexception); + + const auto schema = CreateTableSchema<NUnitTesting::TRepeatedYtMode>(); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("Int32Field").Type(NTi::List(ToTypeV3(EValueType::VT_INT32, true))))); + } + + Y_UNIT_TEST(TRepeatedOptionalList) + { + const auto schema = CreateTableSchema<NUnitTesting::TOptionalList>(); + auto type = NTi::Optional(NTi::List(NTi::Int64())); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("OptionalListInt64").TypeV3(type))); + } + + NTi::TTypePtr GetUrlRowType(bool required) + { + static const NTi::TTypePtr structType = NTi::Struct({ + {"Host", ToTypeV3(EValueType::VT_STRING, false)}, + {"Path", ToTypeV3(EValueType::VT_STRING, false)}, + {"HttpCode", ToTypeV3(EValueType::VT_INT32, false)}}); + return required ? 
structType : NTi::TTypePtr(NTi::Optional(structType)); + } + + Y_UNIT_TEST(TRowFieldSerializationOption) + { + const auto schema = CreateTableSchema<NUnitTesting::TRowFieldSerializationOption>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType(false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(TRowMessageSerializationOption) + { + const auto schema = CreateTableSchema<NUnitTesting::TRowMessageSerializationOption>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType(false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(GetUrlRowType(false)))); + } + + Y_UNIT_TEST(TRowMixedSerializationOptions) + { + const auto schema = CreateTableSchema<NUnitTesting::TRowMixedSerializationOptions>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType(false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + NTi::TTypePtr GetUrlRowType_ColumnNames(bool required) + { + static const NTi::TTypePtr type = NTi::Struct({ + {"Host_ColumnName", ToTypeV3(EValueType::VT_STRING, false)}, + {"Path_KeyColumnName", ToTypeV3(EValueType::VT_STRING, false)}, + {"HttpCode", ToTypeV3(EValueType::VT_INT32, false)}, + }); + return required ? 
type : NTi::TTypePtr(NTi::Optional(type)); + } + + Y_UNIT_TEST(TRowMixedSerializationOptions_ColumnNames) + { + const auto schema = CreateTableSchema<NUnitTesting::TRowMixedSerializationOptions_ColumnNames>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType_ColumnNames(false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(NoOptionInheritance) + { + auto deepestEmbedded = NTi::Optional(NTi::Struct({{"x", ToTypeV3(EValueType::VT_INT64, false)}})); + + const auto schema = CreateTableSchema<NUnitTesting::TNoOptionInheritance>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("EmbeddedYt_YtOption") + .Type(NTi::Optional(NTi::Struct({{"embedded", deepestEmbedded}})))) + .AddColumn(TColumnSchema().Name("EmbeddedYt_ProtobufOption").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("EmbeddedYt_NoOption").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema() + .Name("EmbeddedProtobuf_YtOption") + .Type(NTi::Optional(NTi::Struct({{"embedded", ToTypeV3(EValueType::VT_STRING, false)}})))) + .AddColumn(TColumnSchema().Name("EmbeddedProtobuf_ProtobufOption").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("EmbeddedProtobuf_NoOption").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema() + .Name("Embedded_YtOption") + .Type(NTi::Optional(NTi::Struct({{"embedded", ToTypeV3(EValueType::VT_STRING, false)}})))) + .AddColumn(TColumnSchema().Name("Embedded_ProtobufOption").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("Embedded_NoOption").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(Cyclic) + { + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic>(), TApiUsageError); + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TA>(), 
TApiUsageError); + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TB>(), TApiUsageError); + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TC>(), TApiUsageError); + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TD>(), TApiUsageError); + + ASSERT_SERIALIZABLES_EQUAL( + TTableSchema().AddColumn( + TColumnSchema().Name("d").TypeV3(NTi::Optional(NTi::String()))), + CreateTableSchema<NUnitTesting::TCyclic::TE>()); + } + + Y_UNIT_TEST(FieldSortOrder) + { + const auto schema = CreateTableSchema<NUnitTesting::TFieldSortOrder>(); + + auto byFieldNumber = NTi::Optional(NTi::Struct({ + {"z", NTi::Optional(NTi::Bool())}, + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + })); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("EmbeddedDefault").Type(byFieldNumber)) + .AddColumn(TColumnSchema() + .Name("EmbeddedAsInProtoFile") + .Type(NTi::Optional(NTi::Struct({ + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + {"z", NTi::Optional(NTi::Bool())}, + })))) + .AddColumn(TColumnSchema().Name("EmbeddedByFieldNumber").Type(byFieldNumber))); + } + + Y_UNIT_TEST(Map) + { + const auto schema = CreateTableSchema<NUnitTesting::TWithMap>(); + + auto createKeyValueStruct = [] (NTi::TTypePtr key, NTi::TTypePtr value) { + return NTi::List(NTi::Struct({ + {"key", NTi::Optional(key)}, + {"value", NTi::Optional(value)}, + })); + }; + + auto embedded = NTi::Struct({ + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + }); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("MapDefault") + .Type(createKeyValueStruct(NTi::Int64(), NTi::String()))) + .AddColumn(TColumnSchema() + .Name("MapListOfStructsLegacy") + .Type(createKeyValueStruct(NTi::Int64(), NTi::String()))) + .AddColumn(TColumnSchema() + .Name("MapListOfStructs") + .Type(createKeyValueStruct(NTi::Int64(), embedded))) + 
.AddColumn(TColumnSchema() + .Name("MapOptionalDict") + .Type(NTi::Optional(NTi::Dict(NTi::Int64(), embedded)))) + .AddColumn(TColumnSchema() + .Name("MapDict") + .Type(NTi::Dict(NTi::Int64(), embedded)))); + } + + Y_UNIT_TEST(Oneof) + { + const auto schema = CreateTableSchema<NUnitTesting::TWithOneof>(); + + auto embedded = NTi::Struct({ + {"Oneof", NTi::Optional(NTi::Variant(NTi::Struct({ + {"x", NTi::Int64()}, + {"y", NTi::String()}, + })))}, + }); + + auto createType = [&] (TString oneof2Name) { + return NTi::Optional(NTi::Struct({ + {"field", NTi::Optional(NTi::String())}, + {oneof2Name, NTi::Optional(NTi::Variant(NTi::Struct({ + {"x2", NTi::Int64()}, + {"y2", NTi::String()}, + {"z2", embedded}, + })))}, + {"y1", NTi::Optional(NTi::String())}, + {"z1", NTi::Optional(embedded)}, + {"x1", NTi::Optional(NTi::Int64())}, + })); + }; + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("DefaultSeparateFields") + .Type(createType("variant_field_name"))) + .AddColumn(TColumnSchema() + .Name("NoDefault") + .Type(createType("Oneof2"))) + .AddColumn(TColumnSchema() + .Name("SerializationProtobuf") + .Type(NTi::Optional(NTi::Struct({ + {"y1", NTi::Optional(NTi::String())}, + {"x1", NTi::Optional(NTi::Int64())}, + {"z1", NTi::Optional(NTi::String())}, + })))) + .AddColumn(TColumnSchema() + .Name("TopLevelOneof") + .Type( + NTi::Optional( + NTi::Variant(NTi::Struct({ + {"MemberOfTopLevelOneof", NTi::Int64()} + })) + ) + )) + ); + } + + Y_UNIT_TEST(Embedded) + { + const auto schema = CreateTableSchema<NUnitTesting::TEmbeddingMessage>(); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .Strict(false) + .AddColumn(TColumnSchema().Name("embedded2_num").Type(NTi::Optional(NTi::Uint64()))) + .AddColumn(TColumnSchema().Name("embedded2_struct").Type(NTi::Optional(NTi::Struct({ + {"float1", NTi::Optional(NTi::Double())}, + {"string1", NTi::Optional(NTi::String())}, + })))) + 
.AddColumn(TColumnSchema().Name("embedded2_repeated").Type(NTi::List(NTi::String()))) + .AddColumn(TColumnSchema().Name("embedded_num").Type(NTi::Optional(NTi::Uint64()))) + .AddColumn(TColumnSchema().Name("embedded_extra_field").Type(NTi::Optional(NTi::String()))) + .AddColumn(TColumnSchema().Name("variant").Type(NTi::Optional(NTi::Variant(NTi::Struct({ + {"str_variant", NTi::String()}, + {"uint_variant", NTi::Uint64()}, + }))))) + .AddColumn(TColumnSchema().Name("num").Type(NTi::Optional(NTi::Uint64()))) + .AddColumn(TColumnSchema().Name("extra_field").Type(NTi::Optional(NTi::String()))) + ); + } +} + +Y_UNIT_TEST_SUITE(ProtoSchemaTest_Proto3) +{ + Y_UNIT_TEST(TWithOptional) + { + const auto schema = CreateTableSchema<NTestingProto3::TWithOptional>(); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("x").Type(NTi::Optional(NTi::Int64())) + ) + ); + } + + Y_UNIT_TEST(TWithOptionalMessage) + { + const auto schema = CreateTableSchema<NTestingProto3::TWithOptionalMessage>(); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("x").Type( + NTi::Optional( + NTi::Struct({{"x", NTi::Optional(NTi::Int64())}}) + ) + ) + ) + ); + } +} diff --git a/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.proto b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.proto new file mode 100644 index 0000000000..60bad6e650 --- /dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.proto @@ -0,0 +1,402 @@ +import "yt/yt_proto/yt/formats/extension.proto"; + +package NYT.NUnitTesting; + +message TIntegral +{ + optional double DoubleField = 1; + optional float FloatField = 2; + optional int32 Int32Field = 3; + optional int64 Int64Field = 4; + optional uint32 Uint32Field = 5; + optional uint64 Uint64Field = 6; + optional sint32 Sint32Field = 7; + optional sint64 Sint64Field = 8; + optional fixed32 Fixed32Field = 9; + optional fixed64 Fixed64Field = 10; + optional sfixed32 Sfixed32Field = 11; + 
optional sfixed64 Sfixed64Field = 12; + optional bool BoolField = 13; + enum TriBool + { + TRI_FALSE = 0; + TRI_TRUE = 1; + TRI_UNDEF = -1; + } + optional TriBool EnumField = 14; +} + +message TRepeated +{ + repeated int32 Int32Field = 1; +} + +message TRepeatedYtMode +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + repeated int32 Int32Field = 1; +} + +message TWithTypeOptions +{ + enum Color + { + WHITE = 0; + BLUE = 1; + RED = -1; + } + + message TEmbedded + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + + optional Color ColorIntField = 1 [(NYT.flags) = ENUM_INT]; + optional Color ColorStringField = 2 [(NYT.flags) = ENUM_STRING]; + optional bytes AnyField = 3 [(NYT.flags) = ANY]; + } + + optional Color ColorIntField = 1 [(NYT.flags) = ENUM_INT]; + optional Color ColorStringField = 2 [(NYT.flags) = ENUM_STRING]; + optional bytes AnyField = 3 [(NYT.flags) = ANY]; + optional bytes OtherColumnsField = 4 [(NYT.flags) = OTHER_COLUMNS]; + optional TEmbedded EmbeddedField = 5 [(NYT.flags) = SERIALIZATION_YT]; + repeated Color RepeatedEnumIntField = 6 [(NYT.flags) = SERIALIZATION_YT, (NYT.flags) = ENUM_INT]; +} + +message TWithTypeOptions_TypeMismatch_EnumInt +{ + optional int64 EnumField = 1 [(NYT.flags) = ENUM_INT]; +} + +message TWithTypeOptions_TypeMismatch_EnumString +{ + optional string EnumField = 1 [(NYT.flags) = ENUM_STRING]; +} + +message TWithTypeOptions_TypeMismatch_Any +{ + optional string AnyField = 1 [(NYT.flags) = ANY]; +} + +message TWithTypeOptions_TypeMismatch_OtherColumns +{ + optional string OtherColumnsField = 1 [(NYT.flags) = OTHER_COLUMNS]; +} + +message TOneOf +{ + oneof Chooser + { + double DoubleField = 1; + int32 Int32Field = 2; + } + optional bool BoolField = 3; +} + +message TWithRequired +{ + required string RequiredField = 1; + optional string NotRequiredField = 2; +}; + +message TAggregated +{ + optional string StringField = 1; + optional bytes BytesField = 2; + optional TIntegral NestedField = 3; + optional 
TRepeated NestedRepeatedField = 4; + optional TOneOf NestedOneOfField = 5; + optional TAggregated NestedRecursiveField = 6; +} + +message TAliased +{ + optional int32 Key = 1 [(NYT.key_column_name) = "key"]; + optional double Subkey = 2 [(NYT.key_column_name) = "subkey"]; + optional TAggregated Data = 3; +} + +//////////////////////////////////////////////////////////////////////////////// + +message TUrlRow +{ + optional string Host = 1 [(NYT.column_name) = "Host"]; + optional string Path = 2 [(NYT.column_name) = "Path"]; + optional sint32 HttpCode = 3 [(NYT.column_name) = "HttpCode"]; +} + +message TRowFieldSerializationOption +{ + optional TUrlRow UrlRow_1 = 1 [(NYT.flags) = SERIALIZATION_YT]; + optional TUrlRow UrlRow_2 = 2; +} + +message TRowMessageSerializationOption +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TUrlRow UrlRow_1 = 1; + optional TUrlRow UrlRow_2 = 2; +} + +message TRowMixedSerializationOptions +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TUrlRow UrlRow_1 = 1; + optional TUrlRow UrlRow_2 = 2 [(NYT.flags) = SERIALIZATION_PROTOBUF]; +} + +message TRowSerializedRepeatedFields +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + repeated int64 Ints = 1; + repeated TUrlRow UrlRows = 2; +} + +message TUrlRowWithColumnNames +{ + optional string Host = 1 [(NYT.column_name) = "Host_ColumnName", (NYT.key_column_name) = "Host_KeyColumnName"]; + optional string Path = 2 [(NYT.key_column_name) = "Path_KeyColumnName"]; + optional sint32 HttpCode = 3; +} + +message TRowMixedSerializationOptions_ColumnNames +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TUrlRowWithColumnNames UrlRow_1 = 1; + optional TUrlRowWithColumnNames UrlRow_2 = 2 [(NYT.flags) = SERIALIZATION_PROTOBUF]; +} + +message TNoOptionInheritance +{ + message TDeepestEmbedded + { + optional int64 x = 1; + } + + message TEmbedded + { + optional TDeepestEmbedded embedded = 1; + } + + message TEmbeddedYt + { + option 
(NYT.default_field_flags) = SERIALIZATION_YT; + + optional TDeepestEmbedded embedded = 1; + } + + message TEmbeddedProtobuf + { + option (NYT.default_field_flags) = SERIALIZATION_PROTOBUF; + + optional TDeepestEmbedded embedded = 1; + } + + optional TEmbeddedYt EmbeddedYt_YtOption = 1 [(NYT.flags) = SERIALIZATION_YT]; + optional TEmbeddedYt EmbeddedYt_ProtobufOption = 2 [(NYT.flags) = SERIALIZATION_PROTOBUF]; + optional TEmbeddedYt EmbeddedYt_NoOption = 3; + optional TEmbeddedProtobuf EmbeddedProtobuf_YtOption = 4 [(NYT.flags) = SERIALIZATION_YT]; + optional TEmbeddedProtobuf EmbeddedProtobuf_ProtobufOption = 5 [(NYT.flags) = SERIALIZATION_PROTOBUF]; + optional TEmbeddedProtobuf EmbeddedProtobuf_NoOption = 6; + optional TEmbedded Embedded_YtOption = 7 [(NYT.flags) = SERIALIZATION_YT]; + optional TEmbedded Embedded_ProtobufOption = 8 [(NYT.flags) = SERIALIZATION_PROTOBUF]; + optional TEmbedded Embedded_NoOption = 9; +} + +message TOptionalList +{ + repeated int64 OptionalListInt64 = 1 [(NYT.flags) = OPTIONAL_LIST, (NYT.flags) = SERIALIZATION_YT]; +} + +message TPacked +{ + repeated int64 PackedListInt64 = 1 [(NYT.flags) = SERIALIZATION_YT, packed=true]; +} + +message TCyclic +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + + message TA + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + repeated TB b = 1; + optional TC c = 2; + } + + message TB + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TD d = 1; + } + + message TC + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TD d = 1; + } + + message TD + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TA a = 1; + } + + message TE + { + optional TD d = 1 [(NYT.flags) = SERIALIZATION_PROTOBUF]; + } + + optional TA a = 1; +} + +message TFieldSortOrder +{ + message TEmbeddedDefault { + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + message TEmbeddedAsInProtoFile { + option (NYT.message_flags) = 
DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE; + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + message TEmbeddedByFieldNumber { + option (NYT.message_flags) = SORT_FIELDS_BY_FIELD_NUMBER; + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + option (NYT.default_field_flags) = SERIALIZATION_YT; + + optional TEmbeddedDefault EmbeddedDefault = 1; + optional TEmbeddedAsInProtoFile EmbeddedAsInProtoFile = 2; + optional TEmbeddedByFieldNumber EmbeddedByFieldNumber = 3; +} + +message TWithMap +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + + message TEmbedded { + optional int64 x = 1; + optional string y = 2; + } + + map<int64, TEmbedded> MapDefault = 1; + map<int64, TEmbedded> MapListOfStructsLegacy = 2 [(NYT.flags) = MAP_AS_LIST_OF_STRUCTS_LEGACY]; + map<int64, TEmbedded> MapListOfStructs = 3 [(NYT.flags) = MAP_AS_LIST_OF_STRUCTS]; + map<int64, TEmbedded> MapOptionalDict = 4 [(NYT.flags) = MAP_AS_OPTIONAL_DICT]; + map<int64, TEmbedded> MapDict = 5 [(NYT.flags) = MAP_AS_DICT]; +} + +message TWithOneof +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + + message TEmbedded + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + oneof Oneof { + int64 x = 1; + string y = 2; + } + } + + message TDefaultSeparateFields + { + option (NYT.default_oneof_flags) = SEPARATE_FIELDS; + option (NYT.default_field_flags) = SERIALIZATION_YT; + + optional string field = 1; + + oneof Oneof2 + { + option (NYT.variant_field_name) = "variant_field_name"; + option (NYT.oneof_flags) = VARIANT; + string y2 = 4; + TEmbedded z2 = 6; + int64 x2 = 2; + } + + oneof Oneof1 + { + int64 x1 = 10; + string y1 = 3; + TEmbedded z1 = 5; + } + } + + message TNoDefault + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + + optional string field = 1; + + oneof Oneof2 + { + string y2 = 4; + TEmbedded z2 = 6; + int64 x2 = 2; + } + + oneof Oneof1 + { + option (NYT.oneof_flags) = SEPARATE_FIELDS; + int64 x1 = 10; + string y1 = 3; + 
TEmbedded z1 = 5; + } + } + + message TSerializationProtobuf + { + oneof Oneof + { + int64 x1 = 2; + string y1 = 1; + TEmbedded z1 = 3; + } + } + + optional TDefaultSeparateFields DefaultSeparateFields = 1; + optional TNoDefault NoDefault = 2; + optional TSerializationProtobuf SerializationProtobuf = 3; + + oneof TopLevelOneof + { + int64 MemberOfTopLevelOneof = 4; + } +} + +message TEmbeddedStruct { + optional float float1 = 1; + optional string string1 = 2; +} + +message TEmbedded2Message { + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional uint64 embedded2_num = 10; + optional TEmbeddedStruct embedded2_struct = 17; + repeated string embedded2_repeated = 42; +} + +message TEmbedded1Message { + option (NYT.default_field_flags) = SERIALIZATION_YT; + required TEmbedded2Message t2 = 1 [(NYT.flags) = EMBEDDED]; + oneof variant { + string str_variant = 101; + uint64 uint_variant = 102; + } + optional uint64 embedded_num = 10; // make an intentional field_num collision! + optional string embedded_extra_field = 11; +} + +message TEmbeddingMessage { + optional bytes other_columns_field = 15 [(NYT.flags) = OTHER_COLUMNS]; + required TEmbedded1Message t1 = 2 [(NYT.flags) = EMBEDDED]; + optional uint64 num = 12; + optional string extra_field = 13; +} diff --git a/yt/cpp/mapreduce/interface/public.h b/yt/cpp/mapreduce/interface/public.h new file mode 100644 index 0000000000..bdeda78795 --- /dev/null +++ b/yt/cpp/mapreduce/interface/public.h @@ -0,0 +1,10 @@ +#pragma once + +#include <memory> + +namespace NYT::NAuth { + +struct IServiceTicketAuthPtrWrapper; +using IServiceTicketAuthPtrWrapperPtr = std::shared_ptr<IServiceTicketAuthPtrWrapper>; + +} // namespace NYT::NAuth diff --git a/yt/cpp/mapreduce/interface/retry_policy.h b/yt/cpp/mapreduce/interface/retry_policy.h new file mode 100644 index 0000000000..c198839079 --- /dev/null +++ b/yt/cpp/mapreduce/interface/retry_policy.h @@ -0,0 +1,47 @@ +#pragma once + +#include <util/datetime/base.h> +#include 
<util/generic/ptr.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// A configuration that controls retries of a single request. +struct TRetryConfig +{ + /// + /// @brief How long retries of a single YT request can go on. + /// + /// If this limit is reached while the retry count is not yet exceeded, a @ref TRequestRetriesTimeout exception is thrown. + TDuration RetriesTimeLimit = TDuration::Max(); +}; + +/// The library uses this class to understand how to retry individual requests. +class IRetryConfigProvider + : public virtual TThrRefBase +{ +public: + /// + /// @brief Gets the retry config for a single request. + /// + /// CreateRetryConfig is called before ANY request. + /// The returned config controls retries of this request. + /// + /// Must be thread safe since it can be used from different threads + /// to perform internal library requests (e.g. pings). + /// + /// Some methods (e.g. IClient::Map) involve multiple requests to YT and therefore + /// this method will be called several times during the execution of a single method. + /// + /// If the user needs to limit overall retries inside a long operation, they might create + /// a retry config provider that knows about the overall deadline and sets + /// @ref NYT::TRetryConfig::RetriesTimeLimit taking that overall deadline into account. + /// (E.g. when the deadline is reached it returns a zero limit for retries). 
+ virtual TRetryConfig CreateRetryConfig() = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT + diff --git a/yt/cpp/mapreduce/interface/serialize.cpp b/yt/cpp/mapreduce/interface/serialize.cpp new file mode 100644 index 0000000000..ae05d9f50d --- /dev/null +++ b/yt/cpp/mapreduce/interface/serialize.cpp @@ -0,0 +1,553 @@ +#include "serialize.h" + +#include "common.h" +#include "fluent.h" + +#include <library/cpp/yson/parser.h> +#include <library/cpp/yson/node/node_io.h> +#include <library/cpp/yson/node/serialize.h> + +#include <library/cpp/type_info/type_io.h> + +#include <util/generic/string.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +// const auto& nodeMap = node.AsMap(); +#define DESERIALIZE_ITEM(NAME, MEMBER) \ + if (const auto* item = nodeMap.FindPtr(NAME)) { \ + Deserialize(MEMBER, *item); \ + } + +// const auto& attributesMap = node.GetAttributes().AsMap(); +#define DESERIALIZE_ATTR(NAME, MEMBER) \ + if (const auto* attr = attributesMap.FindPtr(NAME)) { \ + Deserialize(MEMBER, *attr); \ + } + +//////////////////////////////////////////////////////////////////////////////// + +void Serialize(const TSortColumn& sortColumn, NYson::IYsonConsumer* consumer) +{ + if (sortColumn.SortOrder() == ESortOrder::SO_ASCENDING) { + Serialize(sortColumn.Name(), consumer); + } else { + BuildYsonFluently(consumer).BeginMap() + .Item("name").Value(sortColumn.Name()) + .Item("sort_order").Value(ToString(sortColumn.SortOrder())) + .EndMap(); + } +} + +void Deserialize(TSortColumn& sortColumn, const TNode& node) +{ + if (node.IsString()) { + sortColumn = TSortColumn(node.AsString()); + } else if (node.IsMap()) { + const auto& name = node["name"].AsString(); + const auto& sortOrderString = node["sort_order"].AsString(); + sortColumn = TSortColumn(name, ::FromString<ESortOrder>(sortOrderString)); + } else { + ythrow yexception() << "Expected sort column to 
be string or map, got " << node.GetType(); + } +} + +template <class T, class TDerived> +void SerializeOneOrMany(const TOneOrMany<T, TDerived>& oneOrMany, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).List(oneOrMany.Parts_); +} + +template <class T, class TDerived> +void DeserializeOneOrMany(TOneOrMany<T, TDerived>& oneOrMany, const TNode& node) +{ + Deserialize(oneOrMany.Parts_, node); +} + +void Serialize(const TKey& key, NYson::IYsonConsumer* consumer) +{ + SerializeOneOrMany(key, consumer); +} + +void Deserialize(TKey& key, const TNode& node) +{ + DeserializeOneOrMany(key, node); +} + +void Serialize(const TSortColumns& sortColumns, NYson::IYsonConsumer* consumer) +{ + SerializeOneOrMany(sortColumns, consumer); +} + +void Deserialize(TSortColumns& sortColumns, const TNode& node) +{ + DeserializeOneOrMany(sortColumns, node); +} + +void Serialize(const TColumnNames& columnNames, NYson::IYsonConsumer* consumer) +{ + SerializeOneOrMany(columnNames, consumer); +} + +void Deserialize(TColumnNames& columnNames, const TNode& node) +{ + DeserializeOneOrMany(columnNames, node); +} + +//////////////////////////////////////////////////////////////////////////////// + +void Deserialize(EValueType& valueType, const TNode& node) +{ + const auto& nodeStr = node.AsString(); + static const THashMap<TString, EValueType> str2ValueType = { + {"int8", VT_INT8}, + {"int16", VT_INT16}, + {"int32", VT_INT32}, + {"int64", VT_INT64}, + + {"uint8", VT_UINT8}, + {"uint16", VT_UINT16}, + {"uint32", VT_UINT32}, + {"uint64", VT_UINT64}, + + {"boolean", VT_BOOLEAN}, + {"double", VT_DOUBLE}, + + {"string", VT_STRING}, + {"utf8", VT_UTF8}, + + {"any", VT_ANY}, + + {"null", VT_NULL}, + {"void", VT_VOID}, + + {"date", VT_DATE}, + {"datetime", VT_DATETIME}, + {"timestamp", VT_TIMESTAMP}, + {"interval", VT_INTERVAL}, + {"float", VT_FLOAT}, + {"json", VT_JSON}, + }; + + auto it = str2ValueType.find(nodeStr); + if (it == str2ValueType.end()) { + ythrow yexception() << "Invalid value 
type '" << nodeStr << "'"; + } + + valueType = it->second; +} + +void Deserialize(ESortOrder& sortOrder, const TNode& node) +{ + sortOrder = FromString<ESortOrder>(node.AsString()); +} + +void Deserialize(EOptimizeForAttr& optimizeFor, const TNode& node) +{ + optimizeFor = FromString<EOptimizeForAttr>(node.AsString()); +} + +void Deserialize(EErasureCodecAttr& erasureCodec, const TNode& node) +{ + erasureCodec = FromString<EErasureCodecAttr>(node.AsString()); +} + +void Deserialize(ESchemaModificationAttr& schemaModification, const TNode& node) +{ + schemaModification = FromString<ESchemaModificationAttr>(node.AsString()); +} + +void Serialize(const TColumnSchema& columnSchema, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginMap() + .Item("name").Value(columnSchema.Name()) + .DoIf(!columnSchema.RawTypeV3().Defined(), + [&] (TFluentMap fluent) { + fluent.Item("type").Value(NDetail::ToString(columnSchema.Type())); + fluent.Item("required").Value(columnSchema.Required()); + if (columnSchema.Type() == VT_ANY + && *columnSchema.TypeV3() != *NTi::Optional(NTi::Yson())) + { + // A lot of users canonize the serialized schema. + // To be backward compatible we only set type_v3 for new types. + fluent.Item("type_v3").Value(columnSchema.TypeV3()); + } + } + ) + .DoIf(columnSchema.RawTypeV3().Defined(), [&] (TFluentMap fluent) { + const auto& rawTypeV3 = *columnSchema.RawTypeV3(); + fluent.Item("type_v3").Value(rawTypeV3); + + // We are going to set the old fields `type` and `required` to be compatible + // with old clusters that don't support type_v3 yet. 
+ + // if type is simple return its name otherwise return empty optional + auto isRequired = [](TStringBuf simpleType) { + return simpleType != "null" && simpleType != "void"; + }; + auto getSimple = [] (const TNode& typeV3) -> TMaybe<TString> { + static const THashMap<TString,TString> typeV3ToOld = { + {"bool", "boolean"}, + {"yson", "any"}, + }; + TMaybe<TString> result; + if (typeV3.IsString()) { + result = typeV3.AsString(); + } else if (typeV3.IsMap() && typeV3.Size() == 1) { + Y_VERIFY(typeV3["type_name"].IsString(), "invalid type is passed"); + result = typeV3["type_name"].AsString(); + } + if (result) { + auto it = typeV3ToOld.find(*result); + if (it != typeV3ToOld.end()) { + result = it->second; + } + } + return result; + }; + auto simplify = [&](const TNode& typeV3) -> TMaybe<std::pair<TString, bool>> { + auto simple = getSimple(typeV3); + if (simple) { + return std::make_pair(*simple, isRequired(*simple)); + } + if (typeV3.IsMap() && typeV3["type_name"] == "optional") { + auto simpleItem = getSimple(typeV3["item"]); + if (simpleItem && isRequired(*simpleItem)) { + return std::make_pair(*simpleItem, false); + } + } + return {}; + }; + + auto simplified = simplify(rawTypeV3); + + if (simplified) { + const auto& [simpleType, required] = *simplified; + fluent + .Item("type").Value(simpleType) + .Item("required").Value(required); + return; + } + }) + .DoIf(columnSchema.SortOrder().Defined(), [&] (TFluentMap fluent) { + fluent.Item("sort_order").Value(ToString(*columnSchema.SortOrder())); + }) + .DoIf(columnSchema.Lock().Defined(), [&] (TFluentMap fluent) { + fluent.Item("lock").Value(*columnSchema.Lock()); + }) + .DoIf(columnSchema.Expression().Defined(), [&] (TFluentMap fluent) { + fluent.Item("expression").Value(*columnSchema.Expression()); + }) + .DoIf(columnSchema.Aggregate().Defined(), [&] (TFluentMap fluent) { + fluent.Item("aggregate").Value(*columnSchema.Aggregate()); + }) + .DoIf(columnSchema.Group().Defined(), [&] (TFluentMap fluent) { + 
fluent.Item("group").Value(*columnSchema.Group()); + }) + .EndMap(); +} + +void Deserialize(TColumnSchema& columnSchema, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("name", columnSchema.Name_); + DESERIALIZE_ITEM("type_v3", columnSchema.RawTypeV3_); + DESERIALIZE_ITEM("sort_order", columnSchema.SortOrder_); + DESERIALIZE_ITEM("lock", columnSchema.Lock_); + DESERIALIZE_ITEM("expression", columnSchema.Expression_); + DESERIALIZE_ITEM("aggregate", columnSchema.Aggregate_); + DESERIALIZE_ITEM("group", columnSchema.Group_); + + if (nodeMap.contains("type_v3")) { + NTi::TTypePtr type; + DESERIALIZE_ITEM("type_v3", type); + columnSchema.Type(type); + } else { + EValueType oldType = VT_INT64; + bool required = false; + DESERIALIZE_ITEM("type", oldType); + DESERIALIZE_ITEM("required", required); + columnSchema.Type(ToTypeV3(oldType, required)); + } +} + +void Serialize(const TTableSchema& tableSchema, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginAttributes() + .Item("strict").Value(tableSchema.Strict()) + .Item("unique_keys").Value(tableSchema.UniqueKeys()) + .EndAttributes() + .List(tableSchema.Columns()); +} + +void Deserialize(TTableSchema& tableSchema, const TNode& node) +{ + const auto& attributesMap = node.GetAttributes().AsMap(); + DESERIALIZE_ATTR("strict", tableSchema.Strict_); + DESERIALIZE_ATTR("unique_keys", tableSchema.UniqueKeys_); + Deserialize(tableSchema.Columns_, node); +} + +//////////////////////////////////////////////////////////////////////////////// + +void Serialize(const TKeyBound& keyBound, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginList() + .Item().Value(ToString(keyBound.Relation())) + .Item().Value(keyBound.Key()) + .EndList(); +} + +void Deserialize(TKeyBound& keyBound, const TNode& node) +{ + const auto& nodeList = node.AsList(); + Y_ENSURE(nodeList.size() == 2); + + const auto& relationNode = nodeList[0]; + 
keyBound.Relation(::FromString<ERelation>(relationNode.AsString())); + + const auto& keyNode = nodeList[1]; + TKey key; + Deserialize(key, keyNode); + keyBound.Key(std::move(key)); +} + +//////////////////////////////////////////////////////////////////////////////// + +void Serialize(const TReadLimit& readLimit, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginMap() + .DoIf(readLimit.KeyBound_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("key_bound").Value(*readLimit.KeyBound_); + }) + .DoIf(readLimit.Key_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("key").Value(*readLimit.Key_); + }) + .DoIf(readLimit.RowIndex_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("row_index").Value(*readLimit.RowIndex_); + }) + .DoIf(readLimit.Offset_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("offset").Value(*readLimit.Offset_); + }) + .DoIf(readLimit.TabletIndex_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("tablet_index").Value(*readLimit.TabletIndex_); + }) + .EndMap(); +} + +void Deserialize(TReadLimit& readLimit, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("key_bound", readLimit.KeyBound_); + DESERIALIZE_ITEM("key", readLimit.Key_); + DESERIALIZE_ITEM("row_index", readLimit.RowIndex_); + DESERIALIZE_ITEM("offset", readLimit.Offset_); + DESERIALIZE_ITEM("tablet_index", readLimit.TabletIndex_); +} + +void Serialize(const TReadRange& readRange, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginMap() + .DoIf(!IsTrivial(readRange.LowerLimit_), [&] (TFluentMap fluent) { + fluent.Item("lower_limit").Value(readRange.LowerLimit_); + }) + .DoIf(!IsTrivial(readRange.UpperLimit_), [&] (TFluentMap fluent) { + fluent.Item("upper_limit").Value(readRange.UpperLimit_); + }) + .DoIf(!IsTrivial(readRange.Exact_), [&] (TFluentMap fluent) { + fluent.Item("exact").Value(readRange.Exact_); + }) + .EndMap(); +} + +void Deserialize(TReadRange& readRange, const TNode& node) +{ + const auto& nodeMap = 
node.AsMap(); + DESERIALIZE_ITEM("lower_limit", readRange.LowerLimit_); + DESERIALIZE_ITEM("upper_limit", readRange.UpperLimit_); + DESERIALIZE_ITEM("exact", readRange.Exact_); +} + +void Serialize(const THashMap<TString, TString>& renameColumns, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer) + .DoMapFor(renameColumns, [] (TFluentMap fluent, const auto& item) { + fluent.Item(item.first).Value(item.second); + }); +} + +void Serialize(const TRichYPath& path, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginAttributes() + .DoIf(path.GetRanges().Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("ranges").List(*path.GetRanges()); + }) + .DoIf(path.Columns_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("columns").Value(*path.Columns_); + }) + .DoIf(path.Append_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("append").Value(*path.Append_); + }) + .DoIf(path.PartiallySorted_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("partially_sorted").Value(*path.PartiallySorted_); + }) + .DoIf(!path.SortedBy_.Parts_.empty(), [&] (TFluentAttributes fluent) { + fluent.Item("sorted_by").Value(path.SortedBy_); + }) + .DoIf(path.Teleport_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("teleport").Value(*path.Teleport_); + }) + .DoIf(path.Primary_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("primary").Value(*path.Primary_); + }) + .DoIf(path.Foreign_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("foreign").Value(*path.Foreign_); + }) + .DoIf(path.RowCountLimit_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("row_count_limit").Value(*path.RowCountLimit_); + }) + .DoIf(path.FileName_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("file_name").Value(*path.FileName_); + }) + .DoIf(path.OriginalPath_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("original_path").Value(*path.OriginalPath_); + }) + .DoIf(path.Executable_.Defined(), [&] 
(TFluentAttributes fluent) { + fluent.Item("executable").Value(*path.Executable_); + }) + .DoIf(path.Format_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("format").Value(*path.Format_); + }) + .DoIf(path.Schema_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("schema").Value(*path.Schema_); + }) + .DoIf(path.Timestamp_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("timestamp").Value(*path.Timestamp_); + }) + .DoIf(path.CompressionCodec_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("compression_codec").Value(*path.CompressionCodec_); + }) + .DoIf(path.ErasureCodec_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("erasure_codec").Value(ToString(*path.ErasureCodec_)); + }) + .DoIf(path.SchemaModification_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("schema_modification").Value(ToString(*path.SchemaModification_)); + }) + .DoIf(path.OptimizeFor_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("optimize_for").Value(ToString(*path.OptimizeFor_)); + }) + .DoIf(path.TransactionId_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("transaction_id").Value(GetGuidAsString(*path.TransactionId_)); + }) + .DoIf(path.RenameColumns_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("rename_columns").Value(*path.RenameColumns_); + }) + .DoIf(path.BypassArtifactCache_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("bypass_artifact_cache").Value(*path.BypassArtifactCache_); + }) + .EndAttributes() + .Value(path.Path_); +} + +void Deserialize(TRichYPath& path, const TNode& node) +{ + path = {}; + + const auto& attributesMap = node.GetAttributes().AsMap(); + DESERIALIZE_ATTR("ranges", path.MutableRanges()); + DESERIALIZE_ATTR("columns", path.Columns_); + DESERIALIZE_ATTR("append", path.Append_); + DESERIALIZE_ATTR("partially_sorted", path.PartiallySorted_); + DESERIALIZE_ATTR("sorted_by", path.SortedBy_); + DESERIALIZE_ATTR("teleport", path.Teleport_); + 
DESERIALIZE_ATTR("primary", path.Primary_); + DESERIALIZE_ATTR("foreign", path.Foreign_); + DESERIALIZE_ATTR("row_count_limit", path.RowCountLimit_); + DESERIALIZE_ATTR("file_name", path.FileName_); + DESERIALIZE_ATTR("original_path", path.OriginalPath_); + DESERIALIZE_ATTR("executable", path.Executable_); + DESERIALIZE_ATTR("format", path.Format_); + DESERIALIZE_ATTR("schema", path.Schema_); + DESERIALIZE_ATTR("timestamp", path.Timestamp_); + DESERIALIZE_ATTR("compression_codec", path.CompressionCodec_); + DESERIALIZE_ATTR("erasure_codec", path.ErasureCodec_); + DESERIALIZE_ATTR("schema_modification", path.SchemaModification_); + DESERIALIZE_ATTR("optimize_for", path.OptimizeFor_); + DESERIALIZE_ATTR("transaction_id", path.TransactionId_); + DESERIALIZE_ATTR("rename_columns", path.RenameColumns_); + DESERIALIZE_ATTR("bypass_artifact_cache", path.BypassArtifactCache_); + Deserialize(path.Path_, node); +} + +void Serialize(const TAttributeFilter& filter, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).List(filter.Attributes_); +} + +void Deserialize(TTableColumnarStatistics& statistics, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("column_data_weights", statistics.ColumnDataWeight); + DESERIALIZE_ITEM("legacy_chunks_data_weight", statistics.LegacyChunksDataWeight); + DESERIALIZE_ITEM("timestamp_total_weight", statistics.TimestampTotalWeight); +} + +void Deserialize(TMultiTablePartition::TStatistics& statistics, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("chunk_count", statistics.ChunkCount); + DESERIALIZE_ITEM("data_weight", statistics.DataWeight); + DESERIALIZE_ITEM("row_count", statistics.RowCount); +} + +void Deserialize(TMultiTablePartition& partition, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("table_ranges", partition.TableRanges); + DESERIALIZE_ITEM("aggregate_statistics", partition.AggregateStatistics); +} + +void 
Deserialize(TMultiTablePartitions& partitions, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("partitions", partitions.Partitions); +} + +void Serialize(const TGUID& value, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).Value(GetGuidAsString(value)); +} + +void Deserialize(TGUID& value, const TNode& node) +{ + value = GetGuid(node.AsString()); +} + +void Deserialize(TTabletInfo& value, const TNode& node) +{ + // Bind by reference like the sibling deserializers; plain `auto` would copy the whole map. + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("total_row_count", value.TotalRowCount); + DESERIALIZE_ITEM("trimmed_row_count", value.TrimmedRowCount); + DESERIALIZE_ITEM("barrier_timestamp", value.BarrierTimestamp); +} + +void Serialize(const NTi::TTypePtr& type, NYson::IYsonConsumer* consumer) +{ + auto yson = NTi::NIo::SerializeYson(type.Get()); + ::NYson::ParseYsonStringBuffer(yson, consumer); +} + +void Deserialize(NTi::TTypePtr& type, const TNode& node) +{ + auto yson = NodeToYsonString(node, NYson::EYsonFormat::Binary); + type = NTi::NIo::DeserializeYson(*NTi::HeapFactory(), yson); +} + +#undef DESERIALIZE_ITEM +#undef DESERIALIZE_ATTR + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/serialize.h b/yt/cpp/mapreduce/interface/serialize.h new file mode 100644 index 0000000000..223dd446ba --- /dev/null +++ b/yt/cpp/mapreduce/interface/serialize.h @@ -0,0 +1,90 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/serialize.h +/// +/// Header containing declaration of functions for serializing to/from YSON. 
+ +#include "common.h" + +#include <library/cpp/type_info/fwd.h> + +namespace NYT::NYson { +struct IYsonConsumer; +} // namespace NYT::NYson + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +void Deserialize(TMaybe<T>& value, const TNode& node) +{ + value.ConstructInPlace(); + Deserialize(value.GetRef(), node); +} + +template <class T> +void Deserialize(TVector<T>& value, const TNode& node) +{ + for (const auto& element : node.AsList()) { + value.emplace_back(); + Deserialize(value.back(), element); + } +} + +template <class T> +void Deserialize(THashMap<TString, T>& value, const TNode& node) +{ + for (const auto& item : node.AsMap()) { + Deserialize(value[item.first], item.second); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +void Serialize(const TKey& key, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TKey& key, const TNode& node); + +void Serialize(const TSortColumns& sortColumns, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TSortColumns& sortColumns, const TNode& node); + +void Serialize(const TColumnNames& columnNames, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TColumnNames& columnNames, const TNode& node); + +void Serialize(const TSortColumn& sortColumn, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TSortColumn& sortColumn, const TNode& node); + +void Serialize(const TKeyBound& keyBound, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TKeyBound& keyBound, const TNode& node); + +void Serialize(const TReadLimit& readLimit, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TReadLimit& readLimit, const TNode& node); + +void Serialize(const TReadRange& readRange, NYT::NYson::IYsonConsumer* consumer); + +void Serialize(const TRichYPath& path, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TRichYPath& path, const TNode& node); + +void Serialize(const TAttributeFilter& filter, 
NYT::NYson::IYsonConsumer* consumer); + +void Serialize(const TColumnSchema& columnSchema, NYT::NYson::IYsonConsumer* consumer); +void Serialize(const TTableSchema& tableSchema, NYT::NYson::IYsonConsumer* consumer); + +void Deserialize(EValueType& valueType, const TNode& node); +void Deserialize(TTableSchema& tableSchema, const TNode& node); +void Deserialize(TColumnSchema& columnSchema, const TNode& node); +void Deserialize(TTableColumnarStatistics& statistics, const TNode& node); +void Deserialize(TMultiTablePartition& partition, const TNode& node); +void Deserialize(TMultiTablePartitions& partitions, const TNode& node); +void Deserialize(TTabletInfo& tabletInfos, const TNode& node); + +void Serialize(const TGUID& path, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TGUID& value, const TNode& node); + +void Serialize(const NTi::TTypePtr& type, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(NTi::TTypePtr& type, const TNode& node); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/serialize_ut.cpp b/yt/cpp/mapreduce/interface/serialize_ut.cpp new file mode 100644 index 0000000000..59d4501ee8 --- /dev/null +++ b/yt/cpp/mapreduce/interface/serialize_ut.cpp @@ -0,0 +1,49 @@ +#include <yt/cpp/mapreduce/interface/serialize.h> +#include <yt/cpp/mapreduce/interface/common.h> + +#include <library/cpp/yson/node/node_builder.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/serialized_enum.h> + +using namespace NYT; + +Y_UNIT_TEST_SUITE(Serialization) +{ + Y_UNIT_TEST(TableSchema) + { + auto schema = TTableSchema() + .AddColumn(TColumnSchema().Name("a").Type(EValueType::VT_STRING).SortOrder(SO_ASCENDING)) + .AddColumn(TColumnSchema().Name("b").Type(EValueType::VT_UINT64)) + .AddColumn(TColumnSchema().Name("c").Type(EValueType::VT_INT64, true)); + + auto schemaNode = schema.ToNode(); + UNIT_ASSERT(schemaNode.IsList()); + 
UNIT_ASSERT_VALUES_EQUAL(schemaNode.Size(), 3); + + + UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["name"], "a"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["type"], "string"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["required"], false); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["sort_order"], "ascending"); + + UNIT_ASSERT_VALUES_EQUAL(schemaNode[1]["name"], "b"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[1]["type"], "uint64"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[1]["required"], false); + + UNIT_ASSERT_VALUES_EQUAL(schemaNode[2]["name"], "c"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[2]["type"], "int64"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[2]["required"], true); + } + + Y_UNIT_TEST(ValueTypeSerialization) + { + for (const auto value : GetEnumAllValues<EValueType>()) { + TNode serialized = NYT::NDetail::ToString(value); + EValueType deserialized; + Deserialize(deserialized, serialized); + UNIT_ASSERT_VALUES_EQUAL(value, deserialized); + } + } +} diff --git a/yt/cpp/mapreduce/interface/skiff_row.cpp b/yt/cpp/mapreduce/interface/skiff_row.cpp new file mode 100644 index 0000000000..7838bdaee9 --- /dev/null +++ b/yt/cpp/mapreduce/interface/skiff_row.cpp @@ -0,0 +1 @@ +#include "skiff_row.h" diff --git a/yt/cpp/mapreduce/interface/skiff_row.h b/yt/cpp/mapreduce/interface/skiff_row.h new file mode 100644 index 0000000000..5dd335cb65 --- /dev/null +++ b/yt/cpp/mapreduce/interface/skiff_row.h @@ -0,0 +1,127 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/skiff_row.h +/// Header containing interfaces that you need to define for using TSkiffRowTableReader +/// What you need to do for your struct type TMyType: +/// 1. Write `true` specialization TIsSkiffRow<TMyType>; +/// 2. Write specialization GetSkiffSchema<TMyType>(); +/// 3. Write your own parser derived from ISkiffRowParser and write specialization GetSkiffParser<TMyType>() which returns this parser. 
+ +#include "fwd.h" + +#include <yt/cpp/mapreduce/skiff/skiff_schema.h> + +#include <yt/cpp/mapreduce/interface/format.h> + +#include <library/cpp/skiff/skiff.h> + +#include <util/generic/maybe.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +//! Need to write `true_type` specialization for your row type `T`. +/// And implement two functions: `GetSkiffSchema` and `CreateSkiffParser`. +/// +/// Example: +/// +/// template <> +/// struct TIsSkiffRow<T> +/// : std::true_type +/// { }; +/// +template<class T> +struct TIsSkiffRow + : std::false_type +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +//! Return skiff schema for row type `T`. +/// Need to write its specialization. +template <typename T> +NSkiff::TSkiffSchemaPtr GetSkiffSchema(const TMaybe<TSkiffRowHints>& /*hints*/) +{ + static_assert(TDependentFalse<T>, "Unimplemented `GetSkiffSchema` method"); +} + +//////////////////////////////////////////////////////////////////////////////// + +//! Allow to parse rows as user's structs from stream (TCheckedInDebugSkiffParser). +/// Need to write derived class for your own row type. +/// +/// Example: +/// +/// class TMySkiffRowParser : public ISkiffRowParser +/// { +/// public: +/// TMySkiffRowParser(TMySkiffRow* row) +/// : Row_(row) +/// {} +/// +/// void Parse(NSkiff::TCheckedInDebugSkiffParser* parser) +/// . { +/// Row_->SomeInt64Field = parser->ParseInt64(); +/// } +/// +/// private: +/// TMySkiffRow* Row_; +/// } +/// +class ISkiffRowParser + : public TThrRefBase +{ +public: + //! Read one row from parser + virtual void Parse(NSkiff::TCheckedInDebugSkiffParser* /*parser*/) = 0; +}; + +//! Creates a parser for row type `T`. 
+template <typename T> +ISkiffRowParserPtr CreateSkiffParser(T* /*row*/, const TMaybe<TSkiffRowHints>& /*hints*/) +{ + static_assert(TDependentFalse<T>, "Unimplemented `CreateSkiffParser` function"); +} + +//////////////////////////////////////////////////////////////////////////////// + +//! Allow to skip row content without getting row. +/// By default row will be parsed using your parser derived from ISkiffRowParser. +/// If you want, you can write more optimal skipper, but it isn't required. +class ISkiffRowSkipper + : public TThrRefBase +{ +public: + virtual void SkipRow(NSkiff::TCheckedInDebugSkiffParser* /*parser*/) = 0; +}; + +//! Default ISkiffRowSkipper implementation. +template <typename T> +class TSkiffRowSkipper : public ISkiffRowSkipper { +public: + explicit TSkiffRowSkipper(const TMaybe<TSkiffRowHints>& hints) + : Parser_(CreateSkiffParser<T>(&Row_, hints)) + { } + + void SkipRow(NSkiff::TCheckedInDebugSkiffParser* parser) { + Parser_->Parse(parser); + } + +private: + T Row_; + ISkiffRowParserPtr Parser_; +}; + +//! Creates a skipper for row type 'T'. +/// You don't need to write its specialization. 
+template <typename T> +ISkiffRowSkipperPtr CreateSkiffSkipper(const TMaybe<TSkiffRowHints>& hints) +{ + return ::MakeIntrusive<TSkiffRowSkipper<T>>(hints); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/tvm.cpp b/yt/cpp/mapreduce/interface/tvm.cpp new file mode 100644 index 0000000000..bfa3f0304e --- /dev/null +++ b/yt/cpp/mapreduce/interface/tvm.cpp @@ -0,0 +1 @@ +#include "tvm.h" diff --git a/yt/cpp/mapreduce/interface/tvm.h b/yt/cpp/mapreduce/interface/tvm.h new file mode 100644 index 0000000000..d8d16d841b --- /dev/null +++ b/yt/cpp/mapreduce/interface/tvm.h @@ -0,0 +1,35 @@ +#pragma once + +#include <yt/yt/library/tvm/tvm_base.h> + +#include <library/cpp/yt/memory/intrusive_ptr.h> + +namespace NYT::NAuth { + +//////////////////////////////////////////////////////////////////////////////// + +/// This wrapper is required because NYT::NAuth::IServiceTicketAuthPtr is NYT::TIntrusivePtr, +/// and, if we used this pointer in interfaces of `mapreduce/yt` client, a lot of users of this library +/// could get unexpected build errors that `TIntrusivePtr` is ambigious +/// (from `::` namespace and from `::NYT::` namespace). +/// So we use this wrapper in our interfaces to avoid such problems for users. +struct IServiceTicketAuthPtrWrapper +{ + // + /// Construct wrapper from NYT::TIntrusivePtr + /// + /// This constructor is implicit so users can transparently pass NYT::TIntrusivePtr to the functions of + /// mapreduce/yt client. 
+ template <class T, class = typename std::enable_if_t<std::is_convertible_v<T*, IServiceTicketAuth*>>> + IServiceTicketAuthPtrWrapper(const TIntrusivePtr<T> ptr) + : Ptr(ptr) + { + } + + /// Wrapped pointer + NYT::TIntrusivePtr<IServiceTicketAuth> Ptr; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NAuth diff --git a/yt/cpp/mapreduce/interface/ut/ya.make b/yt/cpp/mapreduce/interface/ut/ya.make new file mode 100644 index 0000000000..0219e6430c --- /dev/null +++ b/yt/cpp/mapreduce/interface/ut/ya.make @@ -0,0 +1,25 @@ +UNITTEST_FOR(yt/cpp/mapreduce/interface) + +SRCS( + common_ut.cpp + config_ut.cpp + error_ut.cpp + format_ut.cpp + job_counters_ut.cpp + job_statistics_ut.cpp + operation_ut.cpp + proto3_ut.proto + protobuf_table_schema_ut.cpp + protobuf_file_options_ut.cpp + protobuf_table_schema_ut.proto + protobuf_file_options_ut.proto + serialize_ut.cpp +) + +PEERDIR( + contrib/libs/protobuf + library/cpp/testing/unittest + yt/yt_proto/yt/formats +) + +END() diff --git a/yt/cpp/mapreduce/interface/wait_proxy.h b/yt/cpp/mapreduce/interface/wait_proxy.h new file mode 100644 index 0000000000..f7d8e0638e --- /dev/null +++ b/yt/cpp/mapreduce/interface/wait_proxy.h @@ -0,0 +1,54 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/serialize.h +/// +/// Header containing interface to enable customizable waiting. + +#include <yt/cpp/mapreduce/interface/common.h> + +#include <util/datetime/base.h> + +namespace NThreading { +template <typename T> +class TFuture; +} + +class TSystemEvent; +class TCondVar; +class TMutex; + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface to facilitate customizable waiting. +/// +/// All the waiting functions in the library are obliged to use the methods of a wait proxy instead of direct function calls. 
+class IWaitProxy + : public TThrRefBase +{ +public: + virtual ~IWaitProxy() = default; + + /// + /// @brief Wait for the future setting with timeout. + virtual bool WaitFuture(const ::NThreading::TFuture<void>& future, TDuration timeout) = 0; + + /// + /// @brief Wait for a system event with timeout. + virtual bool WaitEvent(TSystemEvent& event, TDuration timeout) = 0; + + /// + /// @brief Wait for the notification on the condition variable with timeout. + virtual bool WaitCondVar(TCondVar& condVar, TMutex& mutex, TDuration timeout) = 0; + + /// + /// @brief Sleep in the current thread for (approximately) specified amount of time. + virtual void Sleep(TDuration timeout) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/ya.make b/yt/cpp/mapreduce/interface/ya.make new file mode 100644 index 0000000000..0e94f14633 --- /dev/null +++ b/yt/cpp/mapreduce/interface/ya.make @@ -0,0 +1,46 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + batch_request.cpp + client.cpp + client_method_options.cpp + common.cpp + config.cpp + cypress.cpp + errors.cpp + format.cpp + job_counters.cpp + job_statistics.cpp + io.cpp + operation.cpp + protobuf_format.cpp + serialize.cpp + skiff_row.cpp + tvm.cpp +) + +PEERDIR( + contrib/libs/protobuf + library/cpp/type_info + library/cpp/threading/future + library/cpp/yson/node + yt/cpp/mapreduce/interface/logging + yt/yt_proto/yt/formats + yt/yt/library/tvm +) + +GENERATE_ENUM_SERIALIZATION(client_method_options.h) +GENERATE_ENUM_SERIALIZATION(client.h) +GENERATE_ENUM_SERIALIZATION(common.h) +GENERATE_ENUM_SERIALIZATION(config.h) +GENERATE_ENUM_SERIALIZATION(cypress.h) +GENERATE_ENUM_SERIALIZATION(job_counters.h) +GENERATE_ENUM_SERIALIZATION(job_statistics.h) +GENERATE_ENUM_SERIALIZATION(operation.h) +GENERATE_ENUM_SERIALIZATION(protobuf_format.h) + +END() + +RECURSE_FOR_TESTS(ut) diff --git 
a/yt/cpp/mapreduce/io/counting_raw_reader.cpp b/yt/cpp/mapreduce/io/counting_raw_reader.cpp new file mode 100644 index 0000000000..6a918bdddb --- /dev/null +++ b/yt/cpp/mapreduce/io/counting_raw_reader.cpp @@ -0,0 +1,38 @@ +#include "counting_raw_reader.h" + +namespace NYT { +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +bool TCountingRawTableReader::Retry(const TMaybe<ui32>& rangeIndex, const TMaybe<ui64>& rowIndex) +{ + return Reader_->Retry(rangeIndex, rowIndex); +} + +void TCountingRawTableReader::ResetRetries() +{ + Reader_->ResetRetries(); +} + +bool TCountingRawTableReader::HasRangeIndices() const +{ + return Reader_->HasRangeIndices(); +} + +size_t TCountingRawTableReader::GetReadByteCount() const +{ + return ReadByteCount_; +} + +size_t TCountingRawTableReader::DoRead(void* buf, size_t len) +{ + auto readLen = Reader_->Read(buf, len); + ReadByteCount_ += readLen; + return readLen; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/counting_raw_reader.h b/yt/cpp/mapreduce/io/counting_raw_reader.h new file mode 100644 index 0000000000..3b6705c5e4 --- /dev/null +++ b/yt/cpp/mapreduce/io/counting_raw_reader.h @@ -0,0 +1,31 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/io.h> + +namespace NYT { +namespace NDetail { + +class TCountingRawTableReader + final : public TRawTableReader +{ +public: + TCountingRawTableReader(::TIntrusivePtr<TRawTableReader> reader) + : Reader_(std::move(reader)) + { } + + bool Retry(const TMaybe<ui32>& rangeIndex, const TMaybe<ui64>& rowIndex) override; + void ResetRetries() override; + bool HasRangeIndices() const override; + + size_t GetReadByteCount() const; + +protected: + size_t DoRead(void* buf, size_t len) override; + +private: + ::TIntrusivePtr<TRawTableReader> Reader_; + size_t ReadByteCount_ = 0; +}; + +} // namespace NDetail +} // namespace 
NYT diff --git a/yt/cpp/mapreduce/io/helpers.h b/yt/cpp/mapreduce/io/helpers.h new file mode 100644 index 0000000000..5dbbf20906 --- /dev/null +++ b/yt/cpp/mapreduce/io/helpers.h @@ -0,0 +1,130 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/io.h> +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/common/helpers.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +template <class TOptions> +struct TIOOptionsTraits; + +template <> +struct TIOOptionsTraits<TFileReaderOptions> +{ + static constexpr const char* const ConfigName = "file_reader"; +}; +template <> +struct TIOOptionsTraits<TFileWriterOptions> +{ + static constexpr const char* const ConfigName = "file_writer"; +}; +template <> +struct TIOOptionsTraits<TTableReaderOptions> +{ + static constexpr const char* const ConfigName = "table_reader"; +}; +template <> +struct TIOOptionsTraits<TTableWriterOptions> +{ + static constexpr const char* const ConfigName = "table_writer"; +}; + +template <class TOptions> +TNode FormIORequestParameters( + const TRichYPath& path, + const TOptions& options) +{ + auto params = PathToParamNode(path); + if (options.Config_) { + params[TIOOptionsTraits<TOptions>::ConfigName] = *options.Config_; + } + return params; +} + +template <> +inline TNode FormIORequestParameters( + const TRichYPath& path, + const TFileReaderOptions& options) +{ + auto params = PathToParamNode(path); + if (options.Config_) { + params[TIOOptionsTraits<TTableReaderOptions>::ConfigName] = *options.Config_; + } + if (options.Offset_) { + params["offset"] = *options.Offset_; + } + if (options.Length_) { + params["length"] = *options.Length_; + } + return params; +} + +static void AddWriterOptionsToNode(const TWriterOptions& options, TNode* node) +{ + if (options.EnableEarlyFinish_) { + (*node)["enable_early_finish"] = *options.EnableEarlyFinish_; + } + if (options.UploadReplicationFactor_) { + 
(*node)["upload_replication_factor"] = *options.UploadReplicationFactor_; + } + if (options.MinUploadReplicationFactor_) { + (*node)["min_upload_replication_factor"] = *options.MinUploadReplicationFactor_; + } + if (options.DesiredChunkSize_) { + (*node)["desired_chunk_size"] = *options.DesiredChunkSize_; + } +} + +template <> +inline TNode FormIORequestParameters( + const TRichYPath& path, + const TFileWriterOptions& options) +{ + auto params = PathToParamNode(path); + TNode fileWriter = TNode::CreateMap(); + if (options.Config_) { + fileWriter = *options.Config_; + } + if (options.WriterOptions_) { + AddWriterOptionsToNode(*options.WriterOptions_, &fileWriter); + } + if (fileWriter.Empty()) { + AddWriterOptionsToNode( + TWriterOptions() + .EnableEarlyFinish(true) + .UploadReplicationFactor(3) + .MinUploadReplicationFactor(2), + &fileWriter); + } + params[TIOOptionsTraits<TFileWriterOptions>::ConfigName] = fileWriter; + if (options.ComputeMD5_) { + params["compute_md5"] = *options.ComputeMD5_; + } + return params; +} + +template <> +inline TNode FormIORequestParameters( + const TRichYPath& path, + const TTableWriterOptions& options) +{ + auto params = PathToParamNode(path); + auto tableWriter = TConfig::Get()->TableWriter; + if (options.Config_) { + MergeNodes(tableWriter, *options.Config_); + } + if (options.WriterOptions_) { + AddWriterOptionsToNode(*options.WriterOptions_, &tableWriter); + } + if (!tableWriter.Empty()) { + params[TIOOptionsTraits<TTableWriterOptions>::ConfigName] = std::move(tableWriter); + } + return params; +} + +//////////////////////////////////////////////////////////////////////////////// + +} diff --git a/yt/cpp/mapreduce/io/job_reader.cpp b/yt/cpp/mapreduce/io/job_reader.cpp new file mode 100644 index 0000000000..39056f00e2 --- /dev/null +++ b/yt/cpp/mapreduce/io/job_reader.cpp @@ -0,0 +1,46 @@ +#include "job_reader.h" + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +namespace NYT { + 
+//////////////////////////////////////////////////////////////////////////////// + +TJobReader::TJobReader(int fd) + : TJobReader(Duplicate(fd)) +{ } + +TJobReader::TJobReader(const TFile& file) + : FdFile_(file) + , FdInput_(FdFile_) + , BufferedInput_(&FdInput_, BUFFER_SIZE) +{ } + +bool TJobReader::Retry(const TMaybe<ui32>& /*rangeIndex*/, const TMaybe<ui64>& /*rowIndex*/) +{ + return false; +} + +void TJobReader::ResetRetries() +{ } + +bool TJobReader::HasRangeIndices() const +{ + return true; +} + +size_t TJobReader::DoRead(void* buf, size_t len) +{ + return BufferedInput_.Read(buf, len); +} + +//////////////////////////////////////////////////////////////////////////////// + +TRawTableReaderPtr CreateRawJobReader(int fd) +{ + return ::MakeIntrusive<TJobReader>(fd); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/job_reader.h b/yt/cpp/mapreduce/io/job_reader.h new file mode 100644 index 0000000000..ce62ec180f --- /dev/null +++ b/yt/cpp/mapreduce/io/job_reader.h @@ -0,0 +1,38 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/io.h> + +#include <util/stream/buffered.h> +#include <util/stream/file.h> +#include <util/system/file.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class TJobReader + : public TRawTableReader +{ +public: + explicit TJobReader(int fd); + explicit TJobReader(const TFile& file); + + virtual bool Retry( const TMaybe<ui32>& /*rangeIndex*/, const TMaybe<ui64>& /*rowIndex*/) override; + virtual void ResetRetries() override; + virtual bool HasRangeIndices() const override; + +protected: + size_t DoRead(void* buf, size_t len) override; + +private: + TFile FdFile_; + TUnbufferedFileInput FdInput_; + TBufferedInput BufferedInput_; + + static const size_t BUFFER_SIZE = 64 << 10; +}; + +//////////////////////////////////////////////////////////////////////////////// + + +} // namespace NYT 
diff --git a/yt/cpp/mapreduce/io/job_writer.cpp b/yt/cpp/mapreduce/io/job_writer.cpp new file mode 100644 index 0000000000..d08bb0a665 --- /dev/null +++ b/yt/cpp/mapreduce/io/job_writer.cpp @@ -0,0 +1,68 @@ +#include "job_writer.h" + +#include <yt/cpp/mapreduce/interface/io.h> + +#include <util/system/file.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +TJobWriter::TStream::TStream(int fd) + : TStream(Duplicate(fd)) +{ } + +TJobWriter::TStream::TStream(const TFile& file) + : FdFile(file) + , FdOutput(FdFile) + , BufferedOutput(&FdOutput, BUFFER_SIZE) +{ } + +TJobWriter::TStream::~TStream() +{ +} + +//////////////////////////////////////////////////////////////////////////////// + +TJobWriter::TJobWriter(size_t outputTableCount) +{ + for (size_t i = 0; i < outputTableCount; ++i) { + Streams_.emplace_back(MakeHolder<TStream>(int(i * 3 + 1))); + } +} + +TJobWriter::TJobWriter(const TVector<TFile>& fileList) +{ + for (const auto& f : fileList) { + Streams_.emplace_back(MakeHolder<TStream>(f)); + } +} + +size_t TJobWriter::GetStreamCount() const +{ + return Streams_.size(); +} + +IOutputStream* TJobWriter::GetStream(size_t tableIndex) const +{ + if (tableIndex >= Streams_.size()) { + ythrow TIOException() << + "Table index " << tableIndex << + " is out of range [0, " << Streams_.size() << ")"; + } + return &Streams_[tableIndex]->BufferedOutput; +} + +void TJobWriter::OnRowFinished(size_t) +{ } + +//////////////////////////////////////////////////////////////////////////////// + +THolder<IProxyOutput> CreateRawJobWriter(size_t outputTableCount) +{ + return ::MakeHolder<TJobWriter>(outputTableCount); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/job_writer.h b/yt/cpp/mapreduce/io/job_writer.h new file mode 100644 index 0000000000..9b24650640 --- /dev/null +++ b/yt/cpp/mapreduce/io/job_writer.h @@ -0,0 +1,43 @@ 
+#pragma once + +#include <yt/cpp/mapreduce/interface/io.h> + +#include <util/generic/vector.h> +#include <util/generic/ptr.h> +#include <util/stream/file.h> +#include <util/stream/buffered.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class TJobWriter + : public IProxyOutput +{ +public: + explicit TJobWriter(size_t outputTableCount); + explicit TJobWriter(const TVector<TFile>& fileList); + + size_t GetStreamCount() const override; + IOutputStream* GetStream(size_t tableIndex) const override; + void OnRowFinished(size_t tableIndex) override; + +private: + struct TStream { + TFile FdFile; + TUnbufferedFileOutput FdOutput; + TBufferedOutput BufferedOutput; + + explicit TStream(int fd); + explicit TStream(const TFile& file); + ~TStream(); + + static const size_t BUFFER_SIZE = 1 << 20; + }; + + TVector<THolder<TStream>> Streams_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/lenval_table_reader.cpp b/yt/cpp/mapreduce/io/lenval_table_reader.cpp new file mode 100644 index 0000000000..98274c7996 --- /dev/null +++ b/yt/cpp/mapreduce/io/lenval_table_reader.cpp @@ -0,0 +1,198 @@ +#include "lenval_table_reader.h" + +#include <yt/cpp/mapreduce/common/helpers.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <util/string/printf.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +const i32 CONTROL_ATTR_TABLE_INDEX = -1; +const i32 CONTROL_ATTR_KEY_SWITCH = -2; +const i32 CONTROL_ATTR_RANGE_INDEX = -3; +const i32 CONTROL_ATTR_ROW_INDEX = -4; +const i32 CONTROL_ATTR_END_OF_STREAM = -5; +const i32 CONTROL_ATTR_TABLET_INDEX = -6; + +//////////////////////////////////////////////////////////////////////////////// + +TLenvalTableReader::TLenvalTableReader(::TIntrusivePtr<TRawTableReader> input) + : Input_(std::move(input)) +{ + 
TLenvalTableReader::Next(); +} + +TLenvalTableReader::~TLenvalTableReader() +{ } + +void TLenvalTableReader::CheckValidity() const +{ + if (!IsValid()) { + ythrow yexception() << "Iterator is not valid"; + } +} + +bool TLenvalTableReader::IsValid() const +{ + return Valid_; +} + +void TLenvalTableReader::Next() +{ + if (!RowTaken_) { + SkipRow(); + } + + CheckValidity(); + + if (RowIndex_) { + ++*RowIndex_; + } + + while (true) { + try { + i32 value = 0; + if (!ReadInteger(&value, true)) { + return; + } + + while (value < 0 && !IsEndOfStream_) { + switch (value) { + case CONTROL_ATTR_KEY_SWITCH: + if (!AtStart_) { + Valid_ = false; + return; + } else { + ReadInteger(&value); + } + break; + + case CONTROL_ATTR_TABLE_INDEX: { + ui32 tmp = 0; + ReadInteger(&tmp); + TableIndex_ = tmp; + ReadInteger(&value); + break; + } + case CONTROL_ATTR_ROW_INDEX: { + ui64 tmp = 0; + ReadInteger(&tmp); + RowIndex_ = tmp; + ReadInteger(&value); + break; + } + case CONTROL_ATTR_RANGE_INDEX: { + ui32 tmp = 0; + ReadInteger(&tmp); + RangeIndex_ = tmp; + ReadInteger(&value); + break; + } + case CONTROL_ATTR_TABLET_INDEX: { + ui64 tmp = 0; + ReadInteger(&tmp); + TabletIndex_ = tmp; + ReadInteger(&value); + break; + } + case CONTROL_ATTR_END_OF_STREAM: { + IsEndOfStream_ = true; + break; + } + default: + ythrow yexception() << + Sprintf("Invalid control integer %d in lenval stream", value); + } + } + + Length_ = static_cast<ui32>(value); + RowTaken_ = false; + AtStart_ = false; + } catch (const std::exception& e) { + if (!PrepareRetry()) { + throw; + } + continue; + } + break; + } +} + +bool TLenvalTableReader::Retry() +{ + if (PrepareRetry()) { + RowTaken_ = true; + Next(); + return true; + } + return false; +} + +void TLenvalTableReader::NextKey() +{ + while (Valid_) { + Next(); + } + + if (Finished_) { + return; + } + + Valid_ = true; + + if (RowIndex_) { + --*RowIndex_; + } + + RowTaken_ = true; +} + +ui32 TLenvalTableReader::GetTableIndex() const +{ + CheckValidity(); + return 
TableIndex_; +} + +ui32 TLenvalTableReader::GetRangeIndex() const +{ + CheckValidity(); + return RangeIndex_.GetOrElse(0); +} + +ui64 TLenvalTableReader::GetRowIndex() const +{ + CheckValidity(); + return RowIndex_.GetOrElse(0UL); +} + +TMaybe<size_t> TLenvalTableReader::GetReadByteCount() const +{ + return Input_.GetReadByteCount(); +} + +bool TLenvalTableReader::IsEndOfStream() const +{ + return IsEndOfStream_; +} + +bool TLenvalTableReader::IsRawReaderExhausted() const +{ + return Finished_; +} + +bool TLenvalTableReader::PrepareRetry() +{ + if (Input_.Retry(RangeIndex_, RowIndex_)) { + RowIndex_.Clear(); + RangeIndex_.Clear(); + return true; + } + return false; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/lenval_table_reader.h b/yt/cpp/mapreduce/io/lenval_table_reader.h new file mode 100644 index 0000000000..990fe0b756 --- /dev/null +++ b/yt/cpp/mapreduce/io/lenval_table_reader.h @@ -0,0 +1,67 @@ +#pragma once + +#include "counting_raw_reader.h" + +#include <yt/cpp/mapreduce/interface/io.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class TLenvalTableReader +{ +public: + explicit TLenvalTableReader(::TIntrusivePtr<TRawTableReader> input); + virtual ~TLenvalTableReader(); + +protected: + bool IsValid() const; + void Next(); + ui32 GetTableIndex() const; + ui32 GetRangeIndex() const; + ui64 GetRowIndex() const; + void NextKey(); + TMaybe<size_t> GetReadByteCount() const; + bool IsEndOfStream() const; + bool IsRawReaderExhausted() const; + + void CheckValidity() const; + + bool Retry(); + + template <class T> + bool ReadInteger(T* result, bool acceptEndOfStream = false) + { + size_t count = Input_.Load(result, sizeof(T)); + if (acceptEndOfStream && count == 0) { + Finished_ = true; + Valid_ = false; + return false; + } + Y_ENSURE(count == sizeof(T), "Premature end of stream"); + return true; + } + + virtual 
void SkipRow() = 0; + +protected: + NDetail::TCountingRawTableReader Input_; + + bool Valid_ = true; + bool Finished_ = false; + ui32 TableIndex_ = 0; + TMaybe<ui64> RowIndex_; + TMaybe<ui32> RangeIndex_; + TMaybe<ui64> TabletIndex_; + bool IsEndOfStream_ = false; + bool AtStart_ = true; + bool RowTaken_ = true; + ui32 Length_ = 0; + +private: + bool PrepareRetry(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/node_table_reader.cpp b/yt/cpp/mapreduce/io/node_table_reader.cpp new file mode 100644 index 0000000000..d39e1398a5 --- /dev/null +++ b/yt/cpp/mapreduce/io/node_table_reader.cpp @@ -0,0 +1,375 @@ +#include "node_table_reader.h" + +#include <yt/cpp/mapreduce/common/node_builder.h> +#include <yt/cpp/mapreduce/common/wait_proxy.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <library/cpp/yson/parser.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class TRowBuilder + : public ::NYson::TYsonConsumerBase +{ +public: + explicit TRowBuilder(TMaybe<TRowElement>* resultRow); + + void OnStringScalar(TStringBuf value) override; + void OnInt64Scalar(i64 value) override; + void OnUint64Scalar(ui64 value) override; + void OnDoubleScalar(double value) override; + void OnBooleanScalar(bool value) override; + void OnBeginList() override; + void OnEntity() override; + void OnListItem() override; + void OnEndList() override; + void OnBeginMap() override; + void OnKeyedItem(TStringBuf key) override; + void OnEndMap() override; + void OnBeginAttributes() override; + void OnEndAttributes() override; + + void Finalize(); + +private: + THolder<TNodeBuilder> Builder_; + TRowElement Row_; + int Depth_ = 0; + bool Started_ = false; + TMaybe<TRowElement>* ResultRow_; + + void SaveResultRow(); +}; + +TRowBuilder::TRowBuilder(TMaybe<TRowElement>* resultRow) + : ResultRow_(resultRow) +{ } + +void 
TRowBuilder::OnStringScalar(TStringBuf value) +{ + Row_.Size += sizeof(TNode) + sizeof(TString) + value.size(); + Builder_->OnStringScalar(value); +} + +void TRowBuilder::OnInt64Scalar(i64 value) +{ + Row_.Size += sizeof(TNode); + Builder_->OnInt64Scalar(value); +} + +void TRowBuilder::OnUint64Scalar(ui64 value) +{ + Row_.Size += sizeof(TNode); + Builder_->OnUint64Scalar(value); +} + +void TRowBuilder::OnDoubleScalar(double value) +{ + Row_.Size += sizeof(TNode); + Builder_->OnDoubleScalar(value); +} + +void TRowBuilder::OnBooleanScalar(bool value) +{ + Row_.Size += sizeof(TNode); + Builder_->OnBooleanScalar(value); +} + +void TRowBuilder::OnBeginList() +{ + ++Depth_; + Builder_->OnBeginList(); +} + +void TRowBuilder::OnEntity() +{ + Row_.Size += sizeof(TNode); + Builder_->OnEntity(); +} + +void TRowBuilder::OnListItem() +{ + if (Depth_ == 0) { + SaveResultRow(); + } else { + Builder_->OnListItem(); + } +} + +void TRowBuilder::OnEndList() +{ + --Depth_; + Builder_->OnEndList(); +} + +void TRowBuilder::OnBeginMap() +{ + ++Depth_; + Builder_->OnBeginMap(); +} + +void TRowBuilder::OnKeyedItem(TStringBuf key) +{ + Row_.Size += sizeof(TString) + key.size(); + Builder_->OnKeyedItem(key); +} + +void TRowBuilder::OnEndMap() +{ + --Depth_; + Builder_->OnEndMap(); +} + +void TRowBuilder::OnBeginAttributes() +{ + ++Depth_; + Builder_->OnBeginAttributes(); +} + +void TRowBuilder::OnEndAttributes() +{ + --Depth_; + Builder_->OnEndAttributes(); +} + +void TRowBuilder::SaveResultRow() +{ + if (!Started_) { + Started_ = true; + } else { + *ResultRow_ = std::move(Row_); + } + Row_.Reset(); + Builder_.Reset(new TNodeBuilder(&Row_.Node)); +} + +void TRowBuilder::Finalize() +{ + if (Started_) { + *ResultRow_ = std::move(Row_); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +TNodeTableReader::TNodeTableReader(::TIntrusivePtr<TRawTableReader> input) + : Input_(std::move(input)) +{ + PrepareParsing(); + Next(); +} + 
+TNodeTableReader::~TNodeTableReader() +{ +} + +void TNodeTableReader::ParseListFragmentItem() { + if (!Parser_->Parse()) { + Builder_->Finalize(); + IsLast_ = true; + } +} + +const TNode& TNodeTableReader::GetRow() const +{ + CheckValidity(); + if (!Row_) { + ythrow yexception() << "Row is moved"; + } + return Row_->Node; +} + +void TNodeTableReader::MoveRow(TNode* result) +{ + CheckValidity(); + if (!Row_) { + ythrow yexception() << "Row is moved"; + } + *result = std::move(Row_->Node); + Row_.Clear(); +} + +bool TNodeTableReader::IsValid() const +{ + return Valid_; +} + +void TNodeTableReader::Next() +{ + try { + NextImpl(); + } catch (const std::exception& ex) { + YT_LOG_ERROR("TNodeTableReader::Next failed: %v", ex.what()); + throw; + } +} + +void TNodeTableReader::NextImpl() +{ + CheckValidity(); + + if (RowIndex_) { + ++*RowIndex_; + } + + // At the begin of stream parser doesn't return a finished row. + ParseFirstListFragmentItem(); + + while (true) { + if (IsLast_) { + Finished_ = true; + Valid_ = false; + break; + } + + try { + ParseListFragmentItem(); + } catch (std::exception& ex) { + NeedParseFirst_ = true; + OnStreamError(std::current_exception(), ex.what()); + ParseFirstListFragmentItem(); + continue; + } + + Row_ = std::move(*NextRow_); + if (!Row_) { + throw yexception() << "No row in NextRow_"; + } + + // We successfully parsed one more row from the stream, + // so reset retry count to their initial value. 
+ Input_.ResetRetries(); + + if (!Row_->Node.IsNull()) { + AtStart_ = false; + break; + } + + for (auto& entry : Row_->Node.GetAttributes().AsMap()) { + if (entry.first == "key_switch") { + if (!AtStart_) { + Valid_ = false; + } + } else if (entry.first == "table_index") { + TableIndex_ = static_cast<ui32>(entry.second.AsInt64()); + } else if (entry.first == "row_index") { + RowIndex_ = static_cast<ui64>(entry.second.AsInt64()); + } else if (entry.first == "range_index") { + RangeIndex_ = static_cast<ui32>(entry.second.AsInt64()); + } else if (entry.first == "tablet_index") { + TabletIndex_ = entry.second.AsInt64(); + } else if (entry.first == "end_of_stream") { + IsEndOfStream_ = true; + } + } + + if (!Valid_) { + break; + } + } +} + +void TNodeTableReader::ParseFirstListFragmentItem() +{ + while (NeedParseFirst_) { + try { + ParseListFragmentItem(); + NeedParseFirst_ = false; + break; + } catch (std::exception& ex) { + OnStreamError(std::current_exception(), ex.what()); + } + } +} + +ui32 TNodeTableReader::GetTableIndex() const +{ + CheckValidity(); + return TableIndex_; +} + +ui32 TNodeTableReader::GetRangeIndex() const +{ + CheckValidity(); + return RangeIndex_.GetOrElse(0); +} + +ui64 TNodeTableReader::GetRowIndex() const +{ + CheckValidity(); + return RowIndex_.GetOrElse(0UL); +} + +i64 TNodeTableReader::GetTabletIndex() const +{ + CheckValidity(); + return TabletIndex_.GetOrElse(0L); +} + +void TNodeTableReader::NextKey() +{ + while (Valid_) { + Next(); + } + + if (Finished_) { + return; + } + + Valid_ = true; + + if (RowIndex_) { + --*RowIndex_; + } +} + +TMaybe<size_t> TNodeTableReader::GetReadByteCount() const +{ + return Input_.GetReadByteCount(); +} + +bool TNodeTableReader::IsEndOfStream() const +{ + return IsEndOfStream_; +} + +bool TNodeTableReader::IsRawReaderExhausted() const +{ + return Finished_; +} + +//////////////////////////////////////////////////////////////////////////////// + +void TNodeTableReader::PrepareParsing() +{ + NextRow_.Clear(); 
+ Builder_.Reset(new TRowBuilder(&NextRow_)); + Parser_.Reset(new ::NYson::TYsonListParser(Builder_.Get(), &Input_)); +} + +void TNodeTableReader::OnStreamError(std::exception_ptr exception, TString error) +{ + YT_LOG_ERROR("Read error: %v", error); + Exception_ = exception; + if (Input_.Retry(RangeIndex_, RowIndex_)) { + RowIndex_.Clear(); + RangeIndex_.Clear(); + PrepareParsing(); + } else { + std::rethrow_exception(Exception_); + } +} + +void TNodeTableReader::CheckValidity() const +{ + if (!Valid_) { + ythrow yexception() << "Iterator is not valid"; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/node_table_reader.h b/yt/cpp/mapreduce/io/node_table_reader.h new file mode 100644 index 0000000000..4fe839eeb6 --- /dev/null +++ b/yt/cpp/mapreduce/io/node_table_reader.h @@ -0,0 +1,91 @@ +#pragma once + +#include "counting_raw_reader.h" + +#include <yt/cpp/mapreduce/interface/io.h> + +#include <library/cpp/yson/public.h> + +#include <util/stream/input.h> +#include <util/generic/buffer.h> +#include <util/system/event.h> +#include <util/system/thread.h> + +#include <atomic> + +namespace NYT { + +class TRawTableReader; +class TRowBuilder; + +//////////////////////////////////////////////////////////////////////////////// + +struct TRowElement +{ + TNode Node; + size_t Size = 0; + + void Reset() + { + Node = TNode(); + Size = 0; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TNodeTableReader + : public INodeReaderImpl +{ +public: + explicit TNodeTableReader(::TIntrusivePtr<TRawTableReader> input); + ~TNodeTableReader() override; + + const TNode& GetRow() const override; + void MoveRow(TNode* result) override; + + bool IsValid() const override; + void Next() override; + ui32 GetTableIndex() const override; + ui32 GetRangeIndex() const override; + ui64 GetRowIndex() const override; + i64 GetTabletIndex() const override; + 
void NextKey() override; + TMaybe<size_t> GetReadByteCount() const override; + bool IsEndOfStream() const override; + bool IsRawReaderExhausted() const override; + +private: + void NextImpl(); + void OnStreamError(std::exception_ptr exception, TString error); + void CheckValidity() const; + void PrepareParsing(); + void ParseListFragmentItem(); + void ParseFirstListFragmentItem(); + +private: + NDetail::TCountingRawTableReader Input_; + + bool Valid_ = true; + bool Finished_ = false; + ui32 TableIndex_ = 0; + TMaybe<ui64> RowIndex_; + TMaybe<ui32> RangeIndex_; + TMaybe<i64> TabletIndex_; + bool IsEndOfStream_ = false; + bool AtStart_ = true; + + TMaybe<TRowElement> Row_; + TMaybe<TRowElement> NextRow_; + + THolder<TRowBuilder> Builder_; + THolder<::NYson::TYsonListParser> Parser_; + + std::exception_ptr Exception_; + bool NeedParseFirst_ = true; + bool IsLast_ = false; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/node_table_writer.cpp b/yt/cpp/mapreduce/io/node_table_writer.cpp new file mode 100644 index 0000000000..dcb5a0f5b5 --- /dev/null +++ b/yt/cpp/mapreduce/io/node_table_writer.cpp @@ -0,0 +1,72 @@ +#include "node_table_writer.h" + +#include <yt/cpp/mapreduce/common/node_visitor.h> + +#include <yt/cpp/mapreduce/interface/io.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <library/cpp/yson/writer.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +TNodeTableWriter::TNodeTableWriter(THolder<IProxyOutput> output, NYson::EYsonFormat format) + : Output_(std::move(output)) +{ + for (size_t i = 0; i < Output_->GetStreamCount(); ++i) { + Writers_.push_back( + MakeHolder<::NYson::TYsonWriter>(Output_->GetStream(i), format, NYT::NYson::EYsonType::ListFragment)); + } +} + +TNodeTableWriter::~TNodeTableWriter() +{ } + +size_t TNodeTableWriter::GetTableCount() const +{ + return 
Output_->GetStreamCount(); +} + +void TNodeTableWriter::FinishTable(size_t tableIndex) { + Output_->GetStream(tableIndex)->Finish(); +} + +void TNodeTableWriter::AddRow(const TNode& row, size_t tableIndex) +{ + if (row.HasAttributes()) { + ythrow TIOException() << "Row cannot have attributes"; + } + + static const TNode emptyMap = TNode::CreateMap(); + const TNode* outRow = &emptyMap; + if (row.GetType() != TNode::Undefined) { + if (!row.IsMap()) { + ythrow TIOException() << "Row should be a map node"; + } else { + outRow = &row; + } + } + + auto* writer = Writers_[tableIndex].Get(); + writer->OnListItem(); + + TNodeVisitor visitor(writer); + visitor.Visit(*outRow); + + Output_->OnRowFinished(tableIndex); +} + +void TNodeTableWriter::AddRow(TNode&& row, size_t tableIndex) { + AddRow(row, tableIndex); +} + +void TNodeTableWriter::Abort() +{ + Output_->Abort(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/node_table_writer.h b/yt/cpp/mapreduce/io/node_table_writer.h new file mode 100644 index 0000000000..4bf8cb2fe7 --- /dev/null +++ b/yt/cpp/mapreduce/io/node_table_writer.h @@ -0,0 +1,33 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/io.h> +#include <library/cpp/yson/public.h> + +namespace NYT { + +class IProxyOutput; + +//////////////////////////////////////////////////////////////////////////////// + +class TNodeTableWriter + : public INodeWriterImpl +{ +public: + explicit TNodeTableWriter(THolder<IProxyOutput> output, ::NYson::EYsonFormat format = ::NYson::EYsonFormat::Binary); + ~TNodeTableWriter() override; + + void AddRow(const TNode& row, size_t tableIndex) override; + void AddRow(TNode&& row, size_t tableIndex) override; + + size_t GetTableCount() const override; + void FinishTable(size_t) override; + void Abort() override; + +private: + THolder<IProxyOutput> Output_; + TVector<THolder<::NYson::TYsonWriter>> Writers_; +}; + 
+//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/proto_helpers.cpp b/yt/cpp/mapreduce/io/proto_helpers.cpp new file mode 100644 index 0000000000..2ffbfd8d89 --- /dev/null +++ b/yt/cpp/mapreduce/io/proto_helpers.cpp @@ -0,0 +1,101 @@ +#include "proto_helpers.h" + +#include <yt/cpp/mapreduce/interface/io.h> +#include <yt/cpp/mapreduce/interface/fluent.h> + +#include <yt/yt_proto/yt/formats/extension.pb.h> + +#include <google/protobuf/descriptor.h> +#include <google/protobuf/descriptor.pb.h> +#include <google/protobuf/messagext.h> +#include <google/protobuf/io/coded_stream.h> + +#include <util/stream/str.h> +#include <util/stream/file.h> +#include <util/folder/path.h> + +namespace NYT { + +using ::google::protobuf::Message; +using ::google::protobuf::Descriptor; +using ::google::protobuf::DescriptorPool; + +using ::google::protobuf::io::CodedInputStream; +using ::google::protobuf::io::TCopyingInputStreamAdaptor; + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +TVector<const Descriptor*> GetJobDescriptors(const TString& fileName) +{ + TVector<const Descriptor*> descriptors; + if (!TFsPath(fileName).Exists()) { + ythrow TIOException() << + "Cannot load '" << fileName << "' file"; + } + + TIFStream input(fileName); + TString line; + while (input.ReadLine(line)) { + const auto* pool = DescriptorPool::generated_pool(); + const auto* descriptor = pool->FindMessageTypeByName(line); + descriptors.push_back(descriptor); + } + + return descriptors; +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +TVector<const Descriptor*> GetJobInputDescriptors() +{ + return GetJobDescriptors("proto_input"); +} + +TVector<const Descriptor*> GetJobOutputDescriptors() +{ + return GetJobDescriptors("proto_output"); +} + +void ValidateProtoDescriptor( + const Message& row, + size_t tableIndex, + const 
TVector<const Descriptor*>& descriptors, + bool isRead) +{ + const char* direction = isRead ? "input" : "output"; + + if (tableIndex >= descriptors.size()) { + ythrow TIOException() << + "Table index " << tableIndex << + " is out of range [0, " << descriptors.size() << + ") in " << direction; + } + + if (row.GetDescriptor() != descriptors[tableIndex]) { + ythrow TIOException() << + "Invalid row of type " << row.GetDescriptor()->full_name() << + " at index " << tableIndex << + ", row of type " << descriptors[tableIndex]->full_name() << + " expected in " << direction; + } +} + +void ParseFromArcadiaStream(IInputStream* stream, Message& row, ui32 length) +{ + TLengthLimitedInput input(stream, length); + TCopyingInputStreamAdaptor adaptor(&input); + CodedInputStream codedStream(&adaptor); + codedStream.SetTotalBytesLimit(length + 1); + bool parsedOk = row.ParseFromCodedStream(&codedStream); + Y_ENSURE(parsedOk, "Failed to parse protobuf message"); + + Y_ENSURE(input.Left() == 0); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/proto_helpers.h b/yt/cpp/mapreduce/io/proto_helpers.h new file mode 100644 index 0000000000..9d1ec0027c --- /dev/null +++ b/yt/cpp/mapreduce/io/proto_helpers.h @@ -0,0 +1,36 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/node.h> + +namespace google { +namespace protobuf { + +class Message; +class Descriptor; + +} +} + +class IInputStream; + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +TVector<const ::google::protobuf::Descriptor*> GetJobInputDescriptors(); +TVector<const ::google::protobuf::Descriptor*> GetJobOutputDescriptors(); + +void ValidateProtoDescriptor( + const ::google::protobuf::Message& row, + size_t tableIndex, + const TVector<const ::google::protobuf::Descriptor*>& descriptors, + bool isRead); + +void ParseFromArcadiaStream( + IInputStream* stream, + 
::google::protobuf::Message& row, + ui32 size); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/proto_table_reader.cpp b/yt/cpp/mapreduce/io/proto_table_reader.cpp new file mode 100644 index 0000000000..28a4bc8719 --- /dev/null +++ b/yt/cpp/mapreduce/io/proto_table_reader.cpp @@ -0,0 +1,305 @@ +#include "proto_table_reader.h" + +#include "node_table_reader.h" + +#include "proto_helpers.h" + +#include <yt/yt_proto/yt/formats/extension.pb.h> + +#include <util/string/escape.h> +#include <util/string/printf.h> + +namespace NYT { + +using ::google::protobuf::Descriptor; +using ::google::protobuf::FieldDescriptor; +using ::google::protobuf::EnumValueDescriptor; + +const TString& GetFieldColumnName(const FieldDescriptor* fieldDesc) { + const auto& columnName = fieldDesc->options().GetExtension(column_name); + if (!columnName.empty()) { + return columnName; + } + const auto& keyColumnName = fieldDesc->options().GetExtension(key_column_name); + if (!keyColumnName.empty()) { + return keyColumnName; + } + return fieldDesc->name(); +} + +void ReadMessageFromNode(const TNode& node, Message* row) +{ + auto* descriptor = row->GetDescriptor(); + auto* reflection = row->GetReflection(); + + int count = descriptor->field_count(); + for (int i = 0; i < count; ++i) { + auto* fieldDesc = descriptor->field(i); + + const auto& columnName = GetFieldColumnName(fieldDesc); + + const auto& nodeMap = node.AsMap(); + auto it = nodeMap.find(columnName); + if (it == nodeMap.end()) { + continue; // no such column + } + auto actualType = it->second.GetType(); + if (actualType == TNode::Null) { + continue; // null field + } + + auto checkType = [&columnName] (TNode::EType expected, TNode::EType actual) { + if (expected != actual) { + ythrow TNode::TTypeError() << "expected node type " << expected + << ", actual " << actual << " for node " << columnName.data(); + } + }; + + switch (fieldDesc->type()) { + case 
FieldDescriptor::TYPE_STRING: + case FieldDescriptor::TYPE_BYTES: + checkType(TNode::String, actualType); + reflection->SetString(row, fieldDesc, it->second.AsString()); + break; + case FieldDescriptor::TYPE_INT64: + case FieldDescriptor::TYPE_SINT64: + case FieldDescriptor::TYPE_SFIXED64: + checkType(TNode::Int64, actualType); + reflection->SetInt64(row, fieldDesc, it->second.AsInt64()); + break; + case FieldDescriptor::TYPE_INT32: + case FieldDescriptor::TYPE_SINT32: + case FieldDescriptor::TYPE_SFIXED32: + checkType(TNode::Int64, actualType); + reflection->SetInt32(row, fieldDesc, it->second.AsInt64()); + break; + case FieldDescriptor::TYPE_UINT64: + case FieldDescriptor::TYPE_FIXED64: + checkType(TNode::Uint64, actualType); + reflection->SetUInt64(row, fieldDesc, it->second.AsUint64()); + break; + case FieldDescriptor::TYPE_UINT32: + case FieldDescriptor::TYPE_FIXED32: + checkType(TNode::Uint64, actualType); + reflection->SetUInt32(row, fieldDesc, it->second.AsUint64()); + break; + case FieldDescriptor::TYPE_DOUBLE: + checkType(TNode::Double, actualType); + reflection->SetDouble(row, fieldDesc, it->second.AsDouble()); + break; + case FieldDescriptor::TYPE_FLOAT: + checkType(TNode::Double, actualType); + reflection->SetFloat(row, fieldDesc, it->second.AsDouble()); + break; + case FieldDescriptor::TYPE_BOOL: + checkType(TNode::Bool, actualType); + reflection->SetBool(row, fieldDesc, it->second.AsBool()); + break; + case FieldDescriptor::TYPE_ENUM: { + TNode::EType columnType = TNode::String; + for (const auto& flag : fieldDesc->options().GetRepeatedExtension(flags)) { + if (flag == EWrapperFieldFlag::ENUM_INT) { + columnType = TNode::Int64; + break; + } + } + checkType(columnType, actualType); + + const EnumValueDescriptor* valueDesc = nullptr; + TString stringValue; + if (columnType == TNode::String) { + const auto& value = it->second.AsString(); + valueDesc = fieldDesc->enum_type()->FindValueByName(value); + stringValue = value; + } else if (columnType == 
TNode::Int64) { + const auto& value = it->second.AsInt64(); + valueDesc = fieldDesc->enum_type()->FindValueByNumber(value); + stringValue = ToString(value); + } else { + Y_FAIL(); + } + + if (valueDesc == nullptr) { + ythrow yexception() << "Failed to parse value '" << EscapeC(stringValue) << "' as " << fieldDesc->enum_type()->full_name(); + } + + reflection->SetEnum(row, fieldDesc, valueDesc); + + break; + } + case FieldDescriptor::TYPE_MESSAGE: { + checkType(TNode::String, actualType); + Message* message = reflection->MutableMessage(row, fieldDesc); + if (!message->ParseFromArray(it->second.AsString().data(), it->second.AsString().size())) { + ythrow yexception() << "Failed to parse protobuf message"; + } + break; + } + default: + ythrow yexception() << "Incorrect protobuf type"; + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +TProtoTableReader::TProtoTableReader( + ::TIntrusivePtr<TRawTableReader> input, + TVector<const Descriptor*>&& descriptors) + : NodeReader_(new TNodeTableReader(std::move(input))) + , Descriptors_(std::move(descriptors)) +{ } + +TProtoTableReader::~TProtoTableReader() +{ } + +void TProtoTableReader::ReadRow(Message* row) +{ + const auto& node = NodeReader_->GetRow(); + ReadMessageFromNode(node, row); +} + +bool TProtoTableReader::IsValid() const +{ + return NodeReader_->IsValid(); +} + +void TProtoTableReader::Next() +{ + NodeReader_->Next(); +} + +ui32 TProtoTableReader::GetTableIndex() const +{ + return NodeReader_->GetTableIndex(); +} + +ui32 TProtoTableReader::GetRangeIndex() const +{ + return NodeReader_->GetRangeIndex(); +} + +ui64 TProtoTableReader::GetRowIndex() const +{ + return NodeReader_->GetRowIndex(); +} + +void TProtoTableReader::NextKey() +{ + NodeReader_->NextKey(); +} + +TMaybe<size_t> TProtoTableReader::GetReadByteCount() const +{ + return NodeReader_->GetReadByteCount(); +} + +bool TProtoTableReader::IsEndOfStream() const +{ + return NodeReader_->IsEndOfStream(); +} + 
+bool TProtoTableReader::IsRawReaderExhausted() const +{ + return NodeReader_->IsRawReaderExhausted(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TLenvalProtoTableReader::TLenvalProtoTableReader( + ::TIntrusivePtr<TRawTableReader> input, + TVector<const Descriptor*>&& descriptors) + : TLenvalTableReader(std::move(input)) + , Descriptors_(std::move(descriptors)) +{ } + +TLenvalProtoTableReader::~TLenvalProtoTableReader() +{ } + +void TLenvalProtoTableReader::ReadRow(Message* row) +{ + ValidateProtoDescriptor(*row, GetTableIndex(), Descriptors_, true); + + while (true) { + try { + ParseFromArcadiaStream(&Input_, *row, Length_); + RowTaken_ = true; + + // We successfully parsed one more row from the stream, + // so reset retry count to their initial value. + Input_.ResetRetries(); + + break; + } catch (const std::exception& ) { + if (!TLenvalTableReader::Retry()) { + throw; + } + } + } +} + +bool TLenvalProtoTableReader::IsValid() const +{ + return TLenvalTableReader::IsValid(); +} + +void TLenvalProtoTableReader::Next() +{ + TLenvalTableReader::Next(); +} + +ui32 TLenvalProtoTableReader::GetTableIndex() const +{ + return TLenvalTableReader::GetTableIndex(); +} + +ui32 TLenvalProtoTableReader::GetRangeIndex() const +{ + return TLenvalTableReader::GetRangeIndex(); +} + +ui64 TLenvalProtoTableReader::GetRowIndex() const +{ + return TLenvalTableReader::GetRowIndex(); +} + +void TLenvalProtoTableReader::NextKey() +{ + TLenvalTableReader::NextKey(); +} + +TMaybe<size_t> TLenvalProtoTableReader::GetReadByteCount() const +{ + return TLenvalTableReader::GetReadByteCount(); +} + +bool TLenvalProtoTableReader::IsEndOfStream() const +{ + return TLenvalTableReader::IsEndOfStream(); +} + +bool TLenvalProtoTableReader::IsRawReaderExhausted() const +{ + return TLenvalTableReader::IsRawReaderExhausted(); +} + +void TLenvalProtoTableReader::SkipRow() +{ + while (true) { + try { + size_t skipped = Input_.Skip(Length_); + if (skipped != 
Length_) { + ythrow yexception() << "Premature end of stream"; + } + break; + } catch (const std::exception& ) { + if (!TLenvalTableReader::Retry()) { + throw; + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/proto_table_reader.h b/yt/cpp/mapreduce/io/proto_table_reader.h new file mode 100644 index 0000000000..05a528b9c6 --- /dev/null +++ b/yt/cpp/mapreduce/io/proto_table_reader.h @@ -0,0 +1,76 @@ +#pragma once + +#include "lenval_table_reader.h" + +#include <yt/cpp/mapreduce/interface/io.h> + +namespace NYT { + +class TRawTableReader; +class TNodeTableReader; + +//////////////////////////////////////////////////////////////////////////////// + +class TProtoTableReader + : public IProtoReaderImpl +{ +public: + explicit TProtoTableReader( + ::TIntrusivePtr<TRawTableReader> input, + TVector<const ::google::protobuf::Descriptor*>&& descriptors); + ~TProtoTableReader() override; + + void ReadRow(Message* row) override; + + bool IsValid() const override; + void Next() override; + ui32 GetTableIndex() const override; + ui32 GetRangeIndex() const override; + ui64 GetRowIndex() const override; + void NextKey() override; + TMaybe<size_t> GetReadByteCount() const override; + bool IsEndOfStream() const override; + bool IsRawReaderExhausted() const override; + +private: + THolder<TNodeTableReader> NodeReader_; + TVector<const ::google::protobuf::Descriptor*> Descriptors_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TLenvalProtoTableReader + : public IProtoReaderImpl + , public TLenvalTableReader +{ +public: + explicit TLenvalProtoTableReader( + ::TIntrusivePtr<TRawTableReader> input, + TVector<const ::google::protobuf::Descriptor*>&& descriptors); + ~TLenvalProtoTableReader() override; + + void ReadRow(Message* row) override; + + bool IsValid() const override; + void Next() override; + ui32 GetTableIndex() const override; 
+ ui32 GetRangeIndex() const override; + ui64 GetRowIndex() const override; + void NextKey() override; + TMaybe<size_t> GetReadByteCount() const override; + bool IsEndOfStream() const override; + bool IsRawReaderExhausted() const override; + +protected: + void SkipRow() override; + +private: + TVector<const ::google::protobuf::Descriptor*> Descriptors_; +}; + +// Sometime useful outside mapreduce/yt +void ReadMessageFromNode(const TNode& node, Message* row); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/proto_table_writer.cpp b/yt/cpp/mapreduce/io/proto_table_writer.cpp new file mode 100644 index 0000000000..1ce7811625 --- /dev/null +++ b/yt/cpp/mapreduce/io/proto_table_writer.cpp @@ -0,0 +1,184 @@ +#include "proto_table_writer.h" + +#include "node_table_writer.h" +#include "proto_helpers.h" + +#include <yt/cpp/mapreduce/common/node_builder.h> + +#include <yt/cpp/mapreduce/interface/io.h> + +#include <yt/yt_proto/yt/formats/extension.pb.h> + +#include <google/protobuf/unknown_field_set.h> + +namespace NYT { + +using ::google::protobuf::Descriptor; +using ::google::protobuf::FieldDescriptor; + +//////////////////////////////////////////////////////////////////////////////// + +TNode MakeNodeFromMessage(const Message& row) +{ + TNode node; + TNodeBuilder builder(&node); + builder.OnBeginMap(); + + auto* descriptor = row.GetDescriptor(); + auto* reflection = row.GetReflection(); + + int count = descriptor->field_count(); + for (int i = 0; i < count; ++i) { + auto* fieldDesc = descriptor->field(i); + if (fieldDesc->is_repeated()) { + Y_ENSURE(reflection->FieldSize(row, fieldDesc) == 0, "Storing repeated protobuf fields is not supported yet"); + continue; + } else if (!reflection->HasField(row, fieldDesc)) { + continue; + } + + TString columnName = fieldDesc->options().GetExtension(column_name); + if (columnName.empty()) { + const auto& keyColumnName = 
fieldDesc->options().GetExtension(key_column_name); + columnName = keyColumnName.empty() ? fieldDesc->name() : keyColumnName; + } + + builder.OnKeyedItem(columnName); + + switch (fieldDesc->type()) { + case FieldDescriptor::TYPE_STRING: + case FieldDescriptor::TYPE_BYTES: + builder.OnStringScalar(reflection->GetString(row, fieldDesc)); + break; + case FieldDescriptor::TYPE_INT64: + case FieldDescriptor::TYPE_SINT64: + case FieldDescriptor::TYPE_SFIXED64: + builder.OnInt64Scalar(reflection->GetInt64(row, fieldDesc)); + break; + case FieldDescriptor::TYPE_INT32: + case FieldDescriptor::TYPE_SINT32: + case FieldDescriptor::TYPE_SFIXED32: + builder.OnInt64Scalar(reflection->GetInt32(row, fieldDesc)); + break; + case FieldDescriptor::TYPE_UINT64: + case FieldDescriptor::TYPE_FIXED64: + builder.OnUint64Scalar(reflection->GetUInt64(row, fieldDesc)); + break; + case FieldDescriptor::TYPE_UINT32: + case FieldDescriptor::TYPE_FIXED32: + builder.OnUint64Scalar(reflection->GetUInt32(row, fieldDesc)); + break; + case FieldDescriptor::TYPE_DOUBLE: + builder.OnDoubleScalar(reflection->GetDouble(row, fieldDesc)); + break; + case FieldDescriptor::TYPE_FLOAT: + builder.OnDoubleScalar(reflection->GetFloat(row, fieldDesc)); + break; + case FieldDescriptor::TYPE_BOOL: + builder.OnBooleanScalar(reflection->GetBool(row, fieldDesc)); + break; + case FieldDescriptor::TYPE_ENUM: + builder.OnStringScalar(reflection->GetEnum(row, fieldDesc)->name()); + break; + case FieldDescriptor::TYPE_MESSAGE: + builder.OnStringScalar(reflection->GetMessage(row, fieldDesc).SerializeAsString()); + break; + default: + ythrow yexception() << "Invalid field type for column: " << columnName; + break; + } + } + + builder.OnEndMap(); + return node; +} + +//////////////////////////////////////////////////////////////////////////////// + +TProtoTableWriter::TProtoTableWriter( + THolder<IProxyOutput> output, + TVector<const Descriptor*>&& descriptors) + : NodeWriter_(new TNodeTableWriter(std::move(output))) + , 
Descriptors_(std::move(descriptors)) +{ } + +TProtoTableWriter::~TProtoTableWriter() +{ } + +size_t TProtoTableWriter::GetTableCount() const +{ + return NodeWriter_->GetTableCount(); +} + +void TProtoTableWriter::FinishTable(size_t tableIndex) +{ + NodeWriter_->FinishTable(tableIndex); +} + +void TProtoTableWriter::AddRow(const Message& row, size_t tableIndex) +{ + NodeWriter_->AddRow(MakeNodeFromMessage(row), tableIndex); +} + +void TProtoTableWriter::AddRow(Message&& row, size_t tableIndex) +{ + TProtoTableWriter::AddRow(row, tableIndex); +} + + +void TProtoTableWriter::Abort() +{ + NodeWriter_->Abort(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TLenvalProtoTableWriter::TLenvalProtoTableWriter( + THolder<IProxyOutput> output, + TVector<const Descriptor*>&& descriptors) + : Output_(std::move(output)) + , Descriptors_(std::move(descriptors)) +{ } + +TLenvalProtoTableWriter::~TLenvalProtoTableWriter() +{ } + +size_t TLenvalProtoTableWriter::GetTableCount() const +{ + return Output_->GetStreamCount(); +} + +void TLenvalProtoTableWriter::FinishTable(size_t tableIndex) +{ + Output_->GetStream(tableIndex)->Finish(); +} + +void TLenvalProtoTableWriter::AddRow(const Message& row, size_t tableIndex) +{ + ValidateProtoDescriptor(row, tableIndex, Descriptors_, false); + + Y_VERIFY(row.GetReflection()->GetUnknownFields(row).empty(), + "Message has unknown fields. 
This probably means bug in client code.\n" + "Message: %s", row.DebugString().data()); + + auto* stream = Output_->GetStream(tableIndex); + i32 size = row.ByteSize(); + stream->Write(&size, sizeof(size)); + bool serializedOk = row.SerializeToArcadiaStream(stream); + Y_ENSURE(serializedOk, "Failed to serialize protobuf message"); + Output_->OnRowFinished(tableIndex); +} + +void TLenvalProtoTableWriter::AddRow(Message&& row, size_t tableIndex) +{ + TLenvalProtoTableWriter::AddRow(row, tableIndex); +} + +void TLenvalProtoTableWriter::Abort() +{ + Output_->Abort(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/proto_table_writer.h b/yt/cpp/mapreduce/io/proto_table_writer.h new file mode 100644 index 0000000000..a6df69e6ae --- /dev/null +++ b/yt/cpp/mapreduce/io/proto_table_writer.h @@ -0,0 +1,61 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/io.h> + +namespace NYT { + +class IProxyOutput; +class TNodeTableWriter; + +//////////////////////////////////////////////////////////////////////////////// + +class TProtoTableWriter + : public IProtoWriterImpl +{ +public: + TProtoTableWriter( + THolder<IProxyOutput> output, + TVector<const ::google::protobuf::Descriptor*>&& descriptors); + ~TProtoTableWriter() override; + + void AddRow(const Message& row, size_t tableIndex) override; + void AddRow(Message&& row, size_t tableIndex) override; + + size_t GetTableCount() const override; + void FinishTable(size_t) override; + void Abort() override; + +private: + THolder<TNodeTableWriter> NodeWriter_; + TVector<const ::google::protobuf::Descriptor*> Descriptors_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TLenvalProtoTableWriter + : public IProtoWriterImpl +{ +public: + TLenvalProtoTableWriter( + THolder<IProxyOutput> output, + TVector<const ::google::protobuf::Descriptor*>&& descriptors); + ~TLenvalProtoTableWriter() override; 
+ + void AddRow(const Message& row, size_t tableIndex) override; + void AddRow(Message&& row, size_t tableIndex) override; + + size_t GetTableCount() const override; + void FinishTable(size_t) override; + void Abort() override; + +private: + THolder<IProxyOutput> Output_; + TVector<const ::google::protobuf::Descriptor*> Descriptors_; +}; + +// Sometime useful outside mapreduce/yt +TNode MakeNodeFromMessage(const ::google::protobuf::Message& row); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/skiff_row_table_reader.cpp b/yt/cpp/mapreduce/io/skiff_row_table_reader.cpp new file mode 100644 index 0000000000..8da3b2da31 --- /dev/null +++ b/yt/cpp/mapreduce/io/skiff_row_table_reader.cpp @@ -0,0 +1,232 @@ +#include "skiff_row_table_reader.h" + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <yt/cpp/mapreduce/interface/skiff_row.h> + +#include <library/cpp/skiff/skiff.h> + +#include <library/cpp/yt/logging/logger.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +TSkiffRowTableReader::TSkiffRowTableReader( + ::TIntrusivePtr<TRawTableReader> input, + const NSkiff::TSkiffSchemaPtr& schema, + TVector<ISkiffRowSkipperPtr>&& skippers, + NDetail::TCreateSkiffSchemaOptions&& options) + : Input_(std::move(input)) + , BufferedInput_(&Input_) + , Parser_({schema, &BufferedInput_}) + , Skippers_(std::move(skippers)) + , Options_(std::move(options)) +{ + Next(); +} + +TSkiffRowTableReader::~TSkiffRowTableReader() +{ } + +bool TSkiffRowTableReader::Retry() +{ + if (PrepareRetry()) { + RowTaken_ = true; + Next(); + return true; + } + return false; +} + +bool TSkiffRowTableReader::PrepareRetry() +{ + if (Input_.Retry(RangeIndex_, RowIndex_)) { + RowIndex_.Clear(); + RangeIndex_.Clear(); + BufferedInput_ = TBufferedInput(&Input_); + Parser_.emplace(&BufferedInput_); + return true; + } + return false; +} + +void 
TSkiffRowTableReader::ReadRow(const ISkiffRowParserPtr& parser) +{ + while (true) { + try { + parser->Parse(&Parser_.value()); + RowTaken_ = true; + + // We successfully parsed one more row from the stream, + // so reset retry count to their initial value. + Input_.ResetRetries(); + + break; + } catch (const std::exception& ex) { + YT_LOG_ERROR("Read error during parsing: %v", ex.what()); + + if (!Retry()) { + throw; + } + } + } +} + +bool TSkiffRowTableReader::IsValid() const +{ + return Valid_; +} + +void TSkiffRowTableReader::SkipRow() +{ + CheckValidity(); + while (true) { + try { + Skippers_[TableIndex_]->SkipRow(&Parser_.value()); + + break; + } catch (const std::exception& ex) { + YT_LOG_ERROR("Read error during skipping row: %v", ex.what()); + + if (!Retry()) { + throw; + } + } + } +} + +void TSkiffRowTableReader::CheckValidity() const { + if (!IsValid()) { + ythrow yexception() << "Iterator is not valid"; + } +} + +void TSkiffRowTableReader::Next() +{ + if (!RowTaken_) { + SkipRow(); + } + + CheckValidity(); + + if (Y_UNLIKELY(Finished_ || !Parser_->HasMoreData())) { + Finished_ = true; + Valid_ = false; + return; + } + + if (AfterKeySwitch_) { + AfterKeySwitch_ = false; + return; + } + + if (RowIndex_) { + ++*RowIndex_; + } + + while (true) { + try { + auto tag = Parser_->ParseVariant16Tag(); + if (tag == NSkiff::EndOfSequenceTag<ui16>()) { + IsEndOfStream_ = true; + break; + } else { + TableIndex_ = tag; + } + + if (TableIndex_ >= Skippers_.size()) { + ythrow TIOException() << + "Table index " << TableIndex_ << + " is out of range [0, " << Skippers_.size() << + ") in read"; + } + + if (Options_.HasKeySwitch_) { + auto keySwitch = Parser_->ParseBoolean(); + if (keySwitch) { + AfterKeySwitch_ = true; + Valid_ = false; + } + } + + auto tagRowIndex = Parser_->ParseVariant8Tag(); + if (tagRowIndex == 1) { + RowIndex_ = Parser_->ParseInt64(); + } else { + Y_ENSURE(tagRowIndex == 0, "Tag for row_index was expected to be 0 or 1, got " << tagRowIndex); + } + + if 
(Options_.HasRangeIndex_) { + auto tagRangeIndex = Parser_->ParseVariant8Tag(); + if (tagRangeIndex == 1) { + RangeIndex_ = Parser_->ParseInt64(); + } else { + Y_ENSURE(tagRangeIndex == 0, "Tag for range_index was expected to be 0 or 1, got " << tagRangeIndex); + } + } + + break; + } catch (const std::exception& ex) { + YT_LOG_ERROR("Read error: %v", ex.what()); + + if (!PrepareRetry()) { + throw; + } + } + } + + RowTaken_ = false; +} + +ui32 TSkiffRowTableReader::GetTableIndex() const +{ + CheckValidity(); + return TableIndex_; +} + +ui32 TSkiffRowTableReader::GetRangeIndex() const +{ + CheckValidity(); + return RangeIndex_.GetOrElse(0); +} + +ui64 TSkiffRowTableReader::GetRowIndex() const +{ + CheckValidity(); + return RowIndex_.GetOrElse(0ULL); +} + +void TSkiffRowTableReader::NextKey() { + while (Valid_) { + Next(); + } + + if (Finished_) { + return; + } + + Valid_ = true; + + if (RowIndex_) { + --*RowIndex_; + } + + RowTaken_ = true; +} + +TMaybe<size_t> TSkiffRowTableReader::GetReadByteCount() const { + return Input_.GetReadByteCount(); +} + +bool TSkiffRowTableReader::IsEndOfStream() const { + return IsEndOfStream_; +} + +bool TSkiffRowTableReader::IsRawReaderExhausted() const { + return Finished_; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/skiff_row_table_reader.h b/yt/cpp/mapreduce/io/skiff_row_table_reader.h new file mode 100644 index 0000000000..368968266c --- /dev/null +++ b/yt/cpp/mapreduce/io/skiff_row_table_reader.h @@ -0,0 +1,67 @@ +#pragma once + +#include "counting_raw_reader.h" + +#include <yt/cpp/mapreduce/client/skiff.h> + +#include <yt/cpp/mapreduce/interface/io.h> + +#include <yt/cpp/mapreduce/skiff/unchecked_parser.h> + +#include <util/stream/buffered.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class TSkiffRowTableReader + : public ISkiffRowReaderImpl +{ +public: + explicit 
TSkiffRowTableReader( + ::TIntrusivePtr<TRawTableReader> input, + const NSkiff::TSkiffSchemaPtr& schema, + TVector<ISkiffRowSkipperPtr>&& skippers, + NDetail::TCreateSkiffSchemaOptions&& options); + + ~TSkiffRowTableReader() override; + + void ReadRow(const ISkiffRowParserPtr& parser) override; + + bool IsValid() const override; + void Next() override; + ui32 GetTableIndex() const override; + ui32 GetRangeIndex() const override; + ui64 GetRowIndex() const override; + void NextKey() override; + TMaybe<size_t> GetReadByteCount() const override; + bool IsEndOfStream() const override; + bool IsRawReaderExhausted() const override; + +private: + bool Retry(); + void SkipRow(); + void CheckValidity() const; + bool PrepareRetry(); + +private: + NDetail::TCountingRawTableReader Input_; + TBufferedInput BufferedInput_; + std::optional<NSkiff::TCheckedInDebugSkiffParser> Parser_; + TVector<ISkiffRowSkipperPtr> Skippers_; + NDetail::TCreateSkiffSchemaOptions Options_; + + bool RowTaken_ = true; + bool Valid_ = true; + bool Finished_ = false; + bool AfterKeySwitch_ = false; + bool IsEndOfStream_ = false; + + TMaybe<ui64> RowIndex_; + TMaybe<ui32> RangeIndex_; + ui32 TableIndex_ = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/skiff_table_reader.cpp b/yt/cpp/mapreduce/io/skiff_table_reader.cpp new file mode 100644 index 0000000000..51c20609f0 --- /dev/null +++ b/yt/cpp/mapreduce/io/skiff_table_reader.cpp @@ -0,0 +1,293 @@ +#include "skiff_table_reader.h" + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <yt/cpp/mapreduce/skiff/wire_type.h> +#include <yt/cpp/mapreduce/skiff/skiff_schema.h> + +#include <util/string/cast.h> + +namespace NYT { +namespace NDetail { +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +enum EColumnType : i8 +{ + Dense, + KeySwitch, + 
RangeIndex, + RowIndex +}; + +struct TSkiffColumnSchema +{ + EColumnType Type; + bool Required; + NSkiff::EWireType WireType; + TString Name; + + TSkiffColumnSchema(EColumnType type, bool required, NSkiff::EWireType wireType, const TString& name) + : Type(type) + , Required(required) + , WireType(wireType) + , Name(name) + { } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace + +struct TSkiffTableReader::TSkiffTableSchema +{ + TVector<TSkiffColumnSchema> Columns; +}; + +TSkiffTableReader::TSkiffTableReader( + ::TIntrusivePtr<TRawTableReader> input, + const NSkiff::TSkiffSchemaPtr& schema) + : Input_(std::move(input)) + , BufferedInput_(&Input_) + , Parser_(&BufferedInput_) + , Schemas_(CreateSkiffTableSchemas(schema)) +{ + Next(); +} + +TSkiffTableReader::~TSkiffTableReader() = default; + +const TNode& TSkiffTableReader::GetRow() const +{ + EnsureValidity(); + Y_ENSURE(!Row_.IsUndefined(), "Row is moved"); + return Row_; +} + +void TSkiffTableReader::MoveRow(TNode* result) +{ + EnsureValidity(); + Y_ENSURE(!Row_.IsUndefined(), "Row is moved"); + *result = std::move(Row_); + Row_ = TNode(); +} + +bool TSkiffTableReader::IsValid() const +{ + return Valid_; +} + +void TSkiffTableReader::Next() +{ + EnsureValidity(); + if (Y_UNLIKELY(Finished_ || !Parser_->HasMoreData())) { + Finished_ = true; + Valid_ = false; + return; + } + + if (AfterKeySwitch_) { + AfterKeySwitch_ = false; + return; + } + + while (true) { + try { + ReadRow(); + break; + } catch (const std::exception& exception) { + YT_LOG_ERROR("Read error: %v", exception.what()); + if (!Input_.Retry(RangeIndex_, RowIndex_)) { + throw; + } + BufferedInput_ = TBufferedInput(&Input_); + Parser_.emplace(NSkiff::TUncheckedSkiffParser(&BufferedInput_)); + RangeIndex_.Clear(); + RowIndex_.Clear(); + } + } +} + +ui32 TSkiffTableReader::GetTableIndex() const +{ + EnsureValidity(); + return TableIndex_; +} + +ui32 TSkiffTableReader::GetRangeIndex() const +{ + 
EnsureValidity(); + return RangeIndex_.GetOrElse(0); +} + +ui64 TSkiffTableReader::GetRowIndex() const +{ + EnsureValidity(); + return RowIndex_.GetOrElse(0ULL); +} + +void TSkiffTableReader::NextKey() +{ + while (Valid_) { + Next(); + } + + if (Finished_) { + return; + } + + Valid_ = true; +} + +TMaybe<size_t> TSkiffTableReader::GetReadByteCount() const +{ + return Input_.GetReadByteCount(); +} + +bool TSkiffTableReader::IsRawReaderExhausted() const +{ + return Finished_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TVector<TSkiffTableReader::TSkiffTableSchema> TSkiffTableReader::CreateSkiffTableSchemas( + const NSkiff::TSkiffSchemaPtr& schema) +{ + using NSkiff::EWireType; + + constexpr auto keySwitchColumnName = "$key_switch"; + constexpr auto rangeIndexColumnName = "$range_index"; + constexpr auto rowIndexColumnName = "$row_index"; + + static const THashMap<TString, TSkiffColumnSchema> specialColumns = { + {keySwitchColumnName, {EColumnType::KeySwitch, true, EWireType::Boolean, keySwitchColumnName}}, + {rangeIndexColumnName, {EColumnType::RangeIndex, false, EWireType::Int64, rangeIndexColumnName}}, + {rowIndexColumnName, {EColumnType::RowIndex, false, EWireType::Int64, rowIndexColumnName}}, + }; + + Y_ENSURE(schema->GetWireType() == EWireType::Variant16, + "Expected 'variant16' wire type for schema, got '" << schema->GetWireType() << "'"); + TVector<TSkiffTableSchema> result; + for (const auto& tableSchema : schema->GetChildren()) { + Y_ENSURE(tableSchema->GetWireType() == EWireType::Tuple, + "Expected 'tuple' wire type for table schema, got '" << tableSchema->GetWireType() << "'"); + TVector<TSkiffColumnSchema> columns; + for (const auto& columnSchema : tableSchema->GetChildren()) { + if (columnSchema->GetName().StartsWith("$")) { + auto iter = specialColumns.find(columnSchema->GetName()); + Y_ENSURE(iter != specialColumns.end(), "Unknown special column: " << columnSchema->GetName()); + 
columns.push_back(iter->second); + } else { + auto wireType = columnSchema->GetWireType(); + bool required = true; + if (wireType == EWireType::Variant8) { + const auto& children = columnSchema->GetChildren(); + Y_ENSURE( + children.size() == 2 && children[0]->GetWireType() == EWireType::Nothing && + NSkiff::IsSimpleType(children[1]->GetWireType()), + "Expected schema of form 'variant8<nothing, simple-type>', got " + << NSkiff::GetShortDebugString(columnSchema)); + wireType = children[1]->GetWireType(); + required = false; + } + Y_ENSURE(NSkiff::IsSimpleType(wireType), + "Expected column schema to be of simple type, got " << NSkiff::GetShortDebugString(columnSchema)); + columns.emplace_back( + EColumnType::Dense, + required, + wireType, + columnSchema->GetName()); + } + } + result.push_back({std::move(columns)}); + } + return result; +} + +void TSkiffTableReader::ReadRow() +{ + if (Row_.IsUndefined()) { + Row_ = TNode::CreateMap(); + } else { + Row_.AsMap().clear(); + } + + if (RowIndex_) { + ++*RowIndex_; + } + + TableIndex_ = Parser_->ParseVariant16Tag(); + Y_ENSURE(TableIndex_ < Schemas_.size(), "Table index out of range: " << TableIndex_ << " >= " << Schemas_.size()); + const auto& tableSchema = Schemas_[TableIndex_]; + + auto parse = [&](NSkiff::EWireType wireType) -> TNode { + switch (wireType) { + case NSkiff::EWireType::Int64: + return Parser_->ParseInt64(); + case NSkiff::EWireType::Uint64: + return Parser_->ParseUint64(); + case NSkiff::EWireType::Boolean: + return Parser_->ParseBoolean(); + case NSkiff::EWireType::Double: + return Parser_->ParseDouble(); + case NSkiff::EWireType::String32: + return Parser_->ParseString32(); + case NSkiff::EWireType::Yson32: + return NodeFromYsonString(Parser_->ParseYson32()); + case NSkiff::EWireType::Nothing: + return TNode::CreateEntity(); + default: + Y_FAIL("Bad column wire type: '%s'", ::ToString(wireType).data()); + } + }; + + for (const auto& columnSchema : tableSchema.Columns) { + if (!columnSchema.Required) { + 
auto tag = Parser_->ParseVariant8Tag(); + if (tag == 0) { + if (columnSchema.Type == EColumnType::Dense) { + Row_[columnSchema.Name] = TNode::CreateEntity(); + } + continue; + } + Y_ENSURE(tag == 1, "Tag for 'variant8<nothing," << columnSchema.WireType + << ">' expected to be 0 or 1, got " << tag); + } + auto value = parse(columnSchema.WireType); + switch (columnSchema.Type) { + case EColumnType::Dense: + Row_[columnSchema.Name] = std::move(value); + break; + case EColumnType::KeySwitch: + if (value.AsBool()) { + AfterKeySwitch_ = true; + Valid_ = false; + } + break; + case EColumnType::RangeIndex: + RangeIndex_ = value.AsInt64(); + break; + case EColumnType::RowIndex: + RowIndex_ = value.AsInt64(); + break; + default: + Y_FAIL("Bad column type: %d", static_cast<int>(columnSchema.Type)); + } + } + + // We successfully parsed one more row from the stream, + // so reset retry count to their initial value. + Input_.ResetRetries(); +} + +void TSkiffTableReader::EnsureValidity() const +{ + Y_ENSURE(Valid_, "Iterator is not valid"); +} + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/skiff_table_reader.h b/yt/cpp/mapreduce/io/skiff_table_reader.h new file mode 100644 index 0000000000..95ece5f9c7 --- /dev/null +++ b/yt/cpp/mapreduce/io/skiff_table_reader.h @@ -0,0 +1,65 @@ +#pragma once + +#include "counting_raw_reader.h" + +#include <yt/cpp/mapreduce/interface/io.h> + +#include <yt/cpp/mapreduce/skiff/wire_type.h> +#include <yt/cpp/mapreduce/skiff/unchecked_parser.h> + +#include <util/stream/buffered.h> + +namespace NYT { +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +class TSkiffTableReader + : public INodeReaderImpl +{ +public: + TSkiffTableReader( + ::TIntrusivePtr<TRawTableReader> input, + const std::shared_ptr<NSkiff::TSkiffSchema>& schema); + ~TSkiffTableReader() override; + + virtual const TNode& GetRow() const override; + virtual void MoveRow(TNode* row) override; + + bool 
IsValid() const override; + void Next() override; + ui32 GetTableIndex() const override; + ui32 GetRangeIndex() const override; + ui64 GetRowIndex() const override; + void NextKey() override; + TMaybe<size_t> GetReadByteCount() const override; + bool IsRawReaderExhausted() const override; + +private: + struct TSkiffTableSchema; + +private: + void EnsureValidity() const; + void ReadRow(); + static TVector<TSkiffTableSchema> CreateSkiffTableSchemas(const std::shared_ptr<NSkiff::TSkiffSchema>& schema); + +private: + NDetail::TCountingRawTableReader Input_; + TBufferedInput BufferedInput_; + std::optional<NSkiff::TUncheckedSkiffParser> Parser_; + TVector<TSkiffTableSchema> Schemas_; + + TNode Row_; + + bool Valid_ = true; + bool AfterKeySwitch_ = false; + bool Finished_ = false; + TMaybe<ui64> RangeIndex_; + TMaybe<ui64> RowIndex_; + ui32 TableIndex_ = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/stream_raw_reader.cpp b/yt/cpp/mapreduce/io/stream_raw_reader.cpp new file mode 100644 index 0000000000..ec19b67d0b --- /dev/null +++ b/yt/cpp/mapreduce/io/stream_raw_reader.cpp @@ -0,0 +1,59 @@ +#include "stream_table_reader.h" + +#include "node_table_reader.h" +#include "proto_table_reader.h" +#include "skiff_table_reader.h" +#include "yamr_table_reader.h" + +#include <util/system/env.h> +#include <util/string/type.h> + +namespace NYT { + +template <> +TTableReaderPtr<TNode> CreateTableReader<TNode>( + IInputStream* stream, const TTableReaderOptions& /*options*/) +{ + auto impl = ::MakeIntrusive<TNodeTableReader>( + ::MakeIntrusive<NDetail::TInputStreamProxy>(stream)); + return new TTableReader<TNode>(impl); +} + +template <> +TTableReaderPtr<TYaMRRow> CreateTableReader<TYaMRRow>( + IInputStream* stream, const TTableReaderOptions& /*options*/) +{ + auto impl = ::MakeIntrusive<TYaMRTableReader>( + ::MakeIntrusive<NDetail::TInputStreamProxy>(stream)); + 
return new TTableReader<TYaMRRow>(impl); +} + + +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader( + IInputStream* stream, + const TTableReaderOptions& /* options */, + const ::google::protobuf::Descriptor* descriptor) +{ + return new TLenvalProtoTableReader( + ::MakeIntrusive<TInputStreamProxy>(stream), + {descriptor}); +} + +::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader( + IInputStream* stream, + const TTableReaderOptions& /* options */, + TVector<const ::google::protobuf::Descriptor*> descriptors) +{ + return new TLenvalProtoTableReader( + ::MakeIntrusive<TInputStreamProxy>(stream), + std::move(descriptors)); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/stream_table_reader.h b/yt/cpp/mapreduce/io/stream_table_reader.h new file mode 100644 index 0000000000..d799c63cf4 --- /dev/null +++ b/yt/cpp/mapreduce/io/stream_table_reader.h @@ -0,0 +1,65 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/io.h> + +namespace NYT { +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +class TInputStreamProxy + : public TRawTableReader +{ +public: + TInputStreamProxy(IInputStream* stream) + : Stream_(stream) + { } + + bool Retry(const TMaybe<ui32>& /* rangeIndex */, const TMaybe<ui64>& /* rowIndex */) override + { + return false; + } + + void ResetRetries() override + { } + + bool HasRangeIndices() const override + { + return false; + } + +protected: + size_t DoRead(void* buf, size_t len) override + { + return Stream_->Read(buf, len); + } + +private: + IInputStream* Stream_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader( + IInputStream* stream, + const TTableReaderOptions& /* options */, + 
const ::google::protobuf::Descriptor* descriptor); + +::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader( + IInputStream* stream, + const TTableReaderOptions& /* options */, + TVector<const ::google::protobuf::Descriptor*> descriptors); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail + +template <> +TTableReaderPtr<TNode> CreateTableReader<TNode>( + IInputStream* stream, const TTableReaderOptions& options); + +template <> +TTableReaderPtr<TYaMRRow> CreateTableReader<TYaMRRow>( + IInputStream* stream, const TTableReaderOptions& /*options*/); + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/ya.make b/yt/cpp/mapreduce/io/ya.make new file mode 100644 index 0000000000..d355e86850 --- /dev/null +++ b/yt/cpp/mapreduce/io/ya.make @@ -0,0 +1,33 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + counting_raw_reader.cpp + job_reader.cpp + job_writer.cpp + lenval_table_reader.cpp + node_table_reader.cpp + node_table_writer.cpp + proto_helpers.cpp + proto_table_reader.cpp + proto_table_writer.cpp + skiff_row_table_reader.cpp + skiff_table_reader.cpp + stream_raw_reader.cpp + yamr_table_reader.cpp + yamr_table_writer.cpp +) + +PEERDIR( + contrib/libs/protobuf + library/cpp/yson + yt/cpp/mapreduce/common + yt/cpp/mapreduce/interface + yt/cpp/mapreduce/interface/logging + yt/yt_proto/yt/formats + library/cpp/yson/node + yt/cpp/mapreduce/skiff +) + +END() diff --git a/yt/cpp/mapreduce/io/yamr_table_reader.cpp b/yt/cpp/mapreduce/io/yamr_table_reader.cpp new file mode 100644 index 0000000000..6204738e10 --- /dev/null +++ b/yt/cpp/mapreduce/io/yamr_table_reader.cpp @@ -0,0 +1,145 @@ +#include "yamr_table_reader.h" + +#include <yt/cpp/mapreduce/common/helpers.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> +#include <yt/cpp/mapreduce/raw_client/raw_requests.h> + +//////////////////////////////////////////////////////////////////// + +static void CheckedSkip(IInputStream* input, size_t 
byteCount) +{ + size_t skipped = input->Skip(byteCount); + Y_ENSURE(skipped == byteCount, "Premature end of YaMR stream"); +} + +//////////////////////////////////////////////////////////////////// + +namespace NYT { + +using namespace NYT::NDetail::NRawClient; + +//////////////////////////////////////////////////////////////////////////////// + +TYaMRTableReader::TYaMRTableReader(::TIntrusivePtr<TRawTableReader> input) + : TLenvalTableReader(std::move(input)) +{ } + +TYaMRTableReader::~TYaMRTableReader() +{ } + +const TYaMRRow& TYaMRTableReader::GetRow() const +{ + CheckValidity(); + if (!RowTaken_) { + const_cast<TYaMRTableReader*>(this)->ReadRow(); + } + return Row_; +} + +bool TYaMRTableReader::IsValid() const +{ + return Valid_; +} + +void TYaMRTableReader::Next() +{ + TLenvalTableReader::Next(); +} + +void TYaMRTableReader::NextKey() +{ + TLenvalTableReader::NextKey(); +} + +ui32 TYaMRTableReader::GetTableIndex() const +{ + return TLenvalTableReader::GetTableIndex(); +} + +ui32 TYaMRTableReader::GetRangeIndex() const +{ + return TLenvalTableReader::GetRangeIndex(); +} + +ui64 TYaMRTableReader::GetRowIndex() const +{ + return TLenvalTableReader::GetRowIndex(); +} + +TMaybe<size_t> TYaMRTableReader::GetReadByteCount() const +{ + return TLenvalTableReader::GetReadByteCount(); +} + +bool TYaMRTableReader::IsEndOfStream() const +{ + return TLenvalTableReader::IsEndOfStream(); +} + +bool TYaMRTableReader::IsRawReaderExhausted() const +{ + return TLenvalTableReader::IsRawReaderExhausted(); +} + +void TYaMRTableReader::ReadField(TString* result, i32 length) +{ + result->resize(length); + size_t count = Input_.Load(result->begin(), length); + Y_ENSURE(count == static_cast<size_t>(length), "Premature end of YaMR stream"); +} + +void TYaMRTableReader::ReadRow() +{ + while (true) { + try { + i32 value = static_cast<i32>(Length_); + ReadField(&Key_, value); + Row_.Key = Key_; + + ReadInteger(&value); + ReadField(&SubKey_, value); + Row_.SubKey = SubKey_; + + 
ReadInteger(&value); + ReadField(&Value_, value); + Row_.Value = Value_; + + RowTaken_ = true; + + // We successfully parsed one more row from the stream, + // so reset retry count to their initial value. + Input_.ResetRetries(); + + break; + } catch (const std::exception& ) { + if (!TLenvalTableReader::Retry()) { + throw; + } + } + } +} + +void TYaMRTableReader::SkipRow() +{ + while (true) { + try { + i32 value = static_cast<i32>(Length_); + CheckedSkip(&Input_, value); + + ReadInteger(&value); + CheckedSkip(&Input_, value); + + ReadInteger(&value); + CheckedSkip(&Input_, value); + break; + } catch (const std::exception& ) { + if (!TLenvalTableReader::Retry()) { + throw; + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/yamr_table_reader.h b/yt/cpp/mapreduce/io/yamr_table_reader.h new file mode 100644 index 0000000000..39fdecfa71 --- /dev/null +++ b/yt/cpp/mapreduce/io/yamr_table_reader.h @@ -0,0 +1,48 @@ +#pragma once + +#include "lenval_table_reader.h" + +#include <yt/cpp/mapreduce/interface/io.h> + +namespace NYT { + +class TRawTableReader; +struct TClientContext; + +//////////////////////////////////////////////////////////////////////////////// + +class TYaMRTableReader + : public IYaMRReaderImpl + , public TLenvalTableReader +{ +public: + explicit TYaMRTableReader(::TIntrusivePtr<TRawTableReader> input); + ~TYaMRTableReader() override; + + const TYaMRRow& GetRow() const override; + + bool IsValid() const override; + void Next() override; + ui32 GetTableIndex() const override; + ui32 GetRangeIndex() const override; + ui64 GetRowIndex() const override; + void NextKey() override; + TMaybe<size_t> GetReadByteCount() const override; + bool IsEndOfStream() const override; + bool IsRawReaderExhausted() const override; + +private: + void ReadField(TString* result, i32 length); + + void ReadRow(); + void SkipRow() override; + + TYaMRRow Row_; + TString Key_; + 
TString SubKey_; + TString Value_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/yamr_table_writer.cpp b/yt/cpp/mapreduce/io/yamr_table_writer.cpp new file mode 100644 index 0000000000..cce7ceb0f0 --- /dev/null +++ b/yt/cpp/mapreduce/io/yamr_table_writer.cpp @@ -0,0 +1,53 @@ +#include "yamr_table_writer.h" + +#include <yt/cpp/mapreduce/interface/io.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +TYaMRTableWriter::TYaMRTableWriter(THolder<IProxyOutput> output) + : Output_(std::move(output)) +{ } + +TYaMRTableWriter::~TYaMRTableWriter() +{ } + +size_t TYaMRTableWriter::GetTableCount() const +{ + return Output_->GetStreamCount(); +} + +void TYaMRTableWriter::FinishTable(size_t tableIndex) { + Output_->GetStream(tableIndex)->Finish(); +} + +void TYaMRTableWriter::AddRow(const TYaMRRow& row, size_t tableIndex) +{ + auto* stream = Output_->GetStream(tableIndex); + + auto writeField = [&stream] (const TStringBuf& field) { + i32 length = static_cast<i32>(field.length()); + stream->Write(&length, sizeof(length)); + stream->Write(field.data(), field.length()); + }; + + writeField(row.Key); + writeField(row.SubKey); + writeField(row.Value); + + Output_->OnRowFinished(tableIndex); +} + +void TYaMRTableWriter::AddRow(TYaMRRow&& row, size_t tableIndex) { + TYaMRTableWriter::AddRow(row, tableIndex); +} + +void TYaMRTableWriter::Abort() +{ + Output_->Abort(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/io/yamr_table_writer.h b/yt/cpp/mapreduce/io/yamr_table_writer.h new file mode 100644 index 0000000000..cf88eaf287 --- /dev/null +++ b/yt/cpp/mapreduce/io/yamr_table_writer.h @@ -0,0 +1,31 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/io.h> + +namespace NYT { + +class IProxyOutput; + 
+//////////////////////////////////////////////////////////////////////////////// + +class TYaMRTableWriter + : public IYaMRWriterImpl +{ +public: + explicit TYaMRTableWriter(THolder<IProxyOutput> output); + ~TYaMRTableWriter() override; + + void AddRow(const TYaMRRow& row, size_t tableIndex) override; + void AddRow(TYaMRRow&& row, size_t tableIndex) override; + + size_t GetTableCount() const override; + void FinishTable(size_t) override; + void Abort() override; + +private: + THolder<IProxyOutput> Output_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/library/table_schema/protobuf.cpp b/yt/cpp/mapreduce/library/table_schema/protobuf.cpp new file mode 100644 index 0000000000..888da828e7 --- /dev/null +++ b/yt/cpp/mapreduce/library/table_schema/protobuf.cpp @@ -0,0 +1 @@ +#include "protobuf.h" diff --git a/yt/cpp/mapreduce/library/table_schema/protobuf.h b/yt/cpp/mapreduce/library/table_schema/protobuf.h new file mode 100644 index 0000000000..e29e096745 --- /dev/null +++ b/yt/cpp/mapreduce/library/table_schema/protobuf.h @@ -0,0 +1,3 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/common.h> diff --git a/yt/cpp/mapreduce/library/table_schema/ya.make b/yt/cpp/mapreduce/library/table_schema/ya.make new file mode 100644 index 0000000000..4aebad72dd --- /dev/null +++ b/yt/cpp/mapreduce/library/table_schema/ya.make @@ -0,0 +1,14 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + protobuf.h + protobuf.cpp +) + +PEERDIR( + yt/cpp/mapreduce/interface +) + +END() diff --git a/yt/cpp/mapreduce/raw_client/raw_batch_request.cpp b/yt/cpp/mapreduce/raw_client/raw_batch_request.cpp new file mode 100644 index 0000000000..be81f5a21a --- /dev/null +++ b/yt/cpp/mapreduce/raw_client/raw_batch_request.cpp @@ -0,0 +1,687 @@ +#include "raw_batch_request.h" + +#include "raw_requests.h" +#include "rpc_parameters_serialization.h" + +#include 
<yt/cpp/mapreduce/common/helpers.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <yt/cpp/mapreduce/interface/client.h> +#include <yt/cpp/mapreduce/interface/errors.h> +#include <yt/cpp/mapreduce/interface/serialize.h> + +#include <library/cpp/yson/node/node.h> + +#include <yt/cpp/mapreduce/http/context.h> +#include <yt/cpp/mapreduce/http/retry_request.h> + +#include <util/generic/guid.h> +#include <util/string/builder.h> + +#include <exception> + +namespace NYT::NDetail::NRawClient { + +using NThreading::TFuture; +using NThreading::TPromise; +using NThreading::NewPromise; + +//////////////////////////////////////////////////////////////////// + +static TString RequestInfo(const TNode& request) +{ + return ::TStringBuilder() + << request["command"].AsString() << ' ' << NodeToYsonString(request["parameters"]); +} + +static void EnsureNothing(const TMaybe<TNode>& node) +{ + Y_ENSURE(!node, "Internal error: expected to have no response, but got response of type " << node->GetType()); +} + +static void EnsureSomething(const TMaybe<TNode>& node) +{ + Y_ENSURE(node, "Internal error: expected to have response of any type, but got no response."); +} + +static void EnsureType(const TNode& node, TNode::EType type) +{ + Y_ENSURE(node.GetType() == type, "Internal error: unexpected response type. 
" + << "Expected: " << type << ", actual: " << node.GetType()); +} + +static void EnsureType(const TMaybe<TNode>& node, TNode::EType type) +{ + Y_ENSURE(node, "Internal error: expected to have response of type " << type << ", but got no response."); + EnsureType(*node, type); +} + +//////////////////////////////////////////////////////////////////// + +template <typename TReturnType> +class TResponseParserBase + : public TRawBatchRequest::IResponseItemParser +{ +public: + using TFutureResult = TFuture<TReturnType>; + +public: + TResponseParserBase() + : Result(NewPromise<TReturnType>()) + { } + + void SetException(std::exception_ptr e) override + { + Result.SetException(std::move(e)); + } + + TFuture<TReturnType> GetFuture() + { + return Result.GetFuture(); + } + +protected: + TPromise<TReturnType> Result; +}; + +//////////////////////////////////////////////////////////////////// + + +class TGetResponseParser + : public TResponseParserBase<TNode> +{ +public: + void SetResponse(TMaybe<TNode> node) override + { + EnsureSomething(node); + Result.SetValue(std::move(*node)); + } +}; + +//////////////////////////////////////////////////////////////////// + +class TVoidResponseParser + : public TResponseParserBase<void> +{ +public: + void SetResponse(TMaybe<TNode> node) override + { + EnsureNothing(node); + Result.SetValue(); + } +}; + +//////////////////////////////////////////////////////////////////// + +class TListResponseParser + : public TResponseParserBase<TNode::TListType> +{ +public: + void SetResponse(TMaybe<TNode> node) override + { + EnsureType(node, TNode::List); + Result.SetValue(std::move(node->AsList())); + } +}; + +//////////////////////////////////////////////////////////////////// + +class TExistsResponseParser + : public TResponseParserBase<bool> +{ +public: + void SetResponse(TMaybe<TNode> node) override + { + EnsureType(node, TNode::Bool); + Result.SetValue(std::move(node->AsBool())); + } +}; + 
+//////////////////////////////////////////////////////////////////// + +class TGuidResponseParser + : public TResponseParserBase<TGUID> +{ +public: + void SetResponse(TMaybe<TNode> node) override + { + EnsureType(node, TNode::String); + Result.SetValue(GetGuid(node->AsString())); + } +}; + +//////////////////////////////////////////////////////////////////// + +class TCanonizeYPathResponseParser + : public TResponseParserBase<TRichYPath> +{ +public: + explicit TCanonizeYPathResponseParser(TString pathPrefix, const TRichYPath& original) + : OriginalNode_(PathToNode(original)) + , PathPrefix_(std::move(pathPrefix)) + { } + + void SetResponse(TMaybe<TNode> node) override + { + EnsureType(node, TNode::String); + + for (const auto& item : OriginalNode_.GetAttributes().AsMap()) { + node->Attributes()[item.first] = item.second; + } + TRichYPath result; + Deserialize(result, *node); + result.Path_ = AddPathPrefix(result.Path_, PathPrefix_); + Result.SetValue(result); + } + +private: + TNode OriginalNode_; + TString PathPrefix_; +}; + +//////////////////////////////////////////////////////////////////// + +class TGetOperationResponseParser + : public TResponseParserBase<TOperationAttributes> +{ +public: + void SetResponse(TMaybe<TNode> node) override + { + EnsureType(node, TNode::Map); + Result.SetValue(ParseOperationAttributes(*node)); + } +}; + +//////////////////////////////////////////////////////////////////// + +class TTableColumnarStatisticsParser + : public TResponseParserBase<TVector<TTableColumnarStatistics>> +{ +public: + void SetResponse(TMaybe<TNode> node) override + { + EnsureType(node, TNode::Map); + TVector<TTableColumnarStatistics> statistics; + Deserialize(statistics, *node); + Result.SetValue(std::move(statistics)); + } +}; + +//////////////////////////////////////////////////////////////////// + +class TTablePartitionsParser + : public TResponseParserBase<TMultiTablePartitions> +{ +public: + void SetResponse(TMaybe<TNode> node) override + { + 
EnsureType(node, TNode::Map); + TMultiTablePartitions partitions; + Deserialize(partitions, *node); + Result.SetValue(std::move(partitions)); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TGetFileFromCacheParser + : public TResponseParserBase<TMaybe<TYPath>> +{ +public: + void SetResponse(TMaybe<TNode> node) override + { + EnsureType(node, TNode::String); + if (node->AsString().empty()) { + Result.SetValue(Nothing()); + } else { + Result.SetValue(node->AsString()); + } + } +}; + +//////////////////////////////////////////////////////////////////// + +class TYPathParser + : public TResponseParserBase<TYPath> +{ +public: + void SetResponse(TMaybe<TNode> node) override + { + EnsureType(node, TNode::String); + Result.SetValue(node->AsString()); + } +}; + +//////////////////////////////////////////////////////////////////// + +class TCheckPermissionParser + : public TResponseParserBase<TCheckPermissionResponse> +{ +public: + void SetResponse(TMaybe<TNode> node) override + { + EnsureType(node, TNode::Map); + Result.SetValue(ParseCheckPermissionResponse(*node)); + } +}; + +//////////////////////////////////////////////////////////////////// + +TRawBatchRequest::TBatchItem::TBatchItem(TNode parameters, ::TIntrusivePtr<IResponseItemParser> responseParser) + : Parameters(std::move(parameters)) + , ResponseParser(std::move(responseParser)) + , NextTry() +{ } + +TRawBatchRequest::TBatchItem::TBatchItem(const TBatchItem& batchItem, TInstant nextTry) + : Parameters(batchItem.Parameters) + , ResponseParser(batchItem.ResponseParser) + , NextTry(nextTry) +{ } + +//////////////////////////////////////////////////////////////////// + +TRawBatchRequest::TRawBatchRequest(const TConfigPtr& config) + : Config_(config) +{ } + +TRawBatchRequest::~TRawBatchRequest() = default; + +bool TRawBatchRequest::IsExecuted() const +{ + return Executed_; +} + +void TRawBatchRequest::MarkExecuted() +{ + Executed_ = true; +} + +template <typename 
TResponseParser> +typename TResponseParser::TFutureResult TRawBatchRequest::AddRequest( + const TString& command, + TNode parameters, + TMaybe<TNode> input) +{ + return AddRequest(command, parameters, input, MakeIntrusive<TResponseParser>()); +} + +template <typename TResponseParser> +typename TResponseParser::TFutureResult TRawBatchRequest::AddRequest( + const TString& command, + TNode parameters, + TMaybe<TNode> input, + ::TIntrusivePtr<TResponseParser> parser) +{ + Y_ENSURE(!Executed_, "Cannot add request: batch request is already executed"); + TNode request; + request["command"] = command; + request["parameters"] = std::move(parameters); + if (input) { + request["input"] = std::move(*input); + } + BatchItemList_.emplace_back(std::move(request), parser); + return parser->GetFuture(); +} + +void TRawBatchRequest::AddRequest(TBatchItem batchItem) +{ + Y_ENSURE(!Executed_, "Cannot add request: batch request is already executed"); + BatchItemList_.push_back(batchItem); +} + +TFuture<TNodeId> TRawBatchRequest::Create( + const TTransactionId& transaction, + const TYPath& path, + ENodeType type, + const TCreateOptions& options) +{ + return AddRequest<TGuidResponseParser>( + "create", + SerializeParamsForCreate(transaction, Config_->Prefix, path, type, options), + Nothing()); +} + +TFuture<void> TRawBatchRequest::Remove( + const TTransactionId& transaction, + const TYPath& path, + const TRemoveOptions& options) +{ + return AddRequest<TVoidResponseParser>( + "remove", + SerializeParamsForRemove(transaction, Config_->Prefix, path, options), + Nothing()); +} + +TFuture<bool> TRawBatchRequest::Exists( + const TTransactionId& transaction, + const TYPath& path, + const TExistsOptions& options) +{ + return AddRequest<TExistsResponseParser>( + "exists", + SerializeParamsForExists(transaction, Config_->Prefix, path, options), + Nothing()); +} + +TFuture<TNode> TRawBatchRequest::Get( + const TTransactionId& transaction, + const TYPath& path, + const TGetOptions& options) +{ + 
return AddRequest<TGetResponseParser>( + "get", + SerializeParamsForGet(transaction, Config_->Prefix, path, options), + Nothing()); +} + +TFuture<void> TRawBatchRequest::Set( + const TTransactionId& transaction, + const TYPath& path, + const TNode& node, + const TSetOptions& options) +{ + return AddRequest<TVoidResponseParser>( + "set", + SerializeParamsForSet(transaction, Config_->Prefix, path, options), + node); +} + +TFuture<TNode::TListType> TRawBatchRequest::List( + const TTransactionId& transaction, + const TYPath& path, + const TListOptions& options) +{ + return AddRequest<TListResponseParser>( + "list", + SerializeParamsForList(transaction, Config_->Prefix, path, options), + Nothing()); +} + +TFuture<TNodeId> TRawBatchRequest::Copy( + const TTransactionId& transaction, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options) +{ + return AddRequest<TGuidResponseParser>( + "copy", + SerializeParamsForCopy(transaction, Config_->Prefix, sourcePath, destinationPath, options), + Nothing()); +} + +TFuture<TNodeId> TRawBatchRequest::Move( + const TTransactionId& transaction, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options) +{ + return AddRequest<TGuidResponseParser>( + "move", + SerializeParamsForMove(transaction, Config_->Prefix, sourcePath, destinationPath, options), + Nothing()); +} + +TFuture<TNodeId> TRawBatchRequest::Link( + const TTransactionId& transaction, + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options) +{ + return AddRequest<TGuidResponseParser>( + "link", + SerializeParamsForLink(transaction, Config_->Prefix, targetPath, linkPath, options), + Nothing()); +} + +TFuture<TLockId> TRawBatchRequest::Lock( + const TTransactionId& transaction, + const TYPath& path, + ELockMode mode, + const TLockOptions& options) +{ + return AddRequest<TGuidResponseParser>( + "lock", + SerializeParamsForLock(transaction, Config_->Prefix, path, mode, options), + 
Nothing()); +} + +TFuture<void> TRawBatchRequest::Unlock( + const TTransactionId& transaction, + const TYPath& path, + const TUnlockOptions& options) +{ + return AddRequest<TVoidResponseParser>( + "unlock", + SerializeParamsForUnlock(transaction, Config_->Prefix, path, options), + Nothing()); +} + +TFuture<TMaybe<TYPath>> TRawBatchRequest::GetFileFromCache( + const TTransactionId& transactionId, + const TString& md5Signature, + const TYPath& cachePath, + const TGetFileFromCacheOptions& options) +{ + return AddRequest<TGetFileFromCacheParser>( + "get_file_from_cache", + SerializeParamsForGetFileFromCache(transactionId, md5Signature, cachePath, options), + Nothing()); +} + +TFuture<TYPath> TRawBatchRequest::PutFileToCache( + const TTransactionId& transactionId, + const TYPath& filePath, + const TString& md5Signature, + const TYPath& cachePath, + const TPutFileToCacheOptions& options) +{ + return AddRequest<TYPathParser>( + "put_file_to_cache", + SerializeParamsForPutFileToCache(transactionId, Config_->Prefix, filePath, md5Signature, cachePath, options), + Nothing()); +} + +TFuture<TCheckPermissionResponse> TRawBatchRequest::CheckPermission( + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options) +{ + return AddRequest<TCheckPermissionParser>( + "check_permission", + SerializeParamsForCheckPermission(user, permission, Config_->Prefix, path, options), + Nothing()); +} + +TFuture<TOperationAttributes> TRawBatchRequest::GetOperation( + const TOperationId& operationId, + const TGetOperationOptions& options) +{ + return AddRequest<TGetOperationResponseParser>( + "get_operation", + SerializeParamsForGetOperation(operationId, options), + Nothing()); +} + +TFuture<void> TRawBatchRequest::AbortOperation(const TOperationId& operationId) +{ + return AddRequest<TVoidResponseParser>( + "abort_op", + SerializeParamsForAbortOperation(operationId), + Nothing()); +} + +TFuture<void> TRawBatchRequest::CompleteOperation(const 
TOperationId& operationId) +{ + return AddRequest<TVoidResponseParser>( + "complete_op", + SerializeParamsForCompleteOperation(operationId), + Nothing()); +} +TFuture<void> TRawBatchRequest::SuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options) +{ + return AddRequest<TVoidResponseParser>( + "suspend_operation", + SerializeParamsForSuspendOperation(operationId, options), + Nothing()); +} +TFuture<void> TRawBatchRequest::ResumeOperation( + const TOperationId& operationId, + const TResumeOperationOptions& options) +{ + return AddRequest<TVoidResponseParser>( + "resume_operation", + SerializeParamsForResumeOperation(operationId, options), + Nothing()); +} + +TFuture<void> TRawBatchRequest::UpdateOperationParameters( + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options) +{ + return AddRequest<TVoidResponseParser>( + "update_op_parameters", + SerializeParamsForUpdateOperationParameters(operationId, options), + Nothing()); +} + +TFuture<TRichYPath> TRawBatchRequest::CanonizeYPath(const TRichYPath& path) +{ + if (path.Path_.find_first_of("<>{}[]") != TString::npos) { + return AddRequest<TCanonizeYPathResponseParser>( + "parse_ypath", + SerializeParamsForParseYPath(path), + Nothing(), + MakeIntrusive<TCanonizeYPathResponseParser>(Config_->Prefix, path)); + } else { + TRichYPath result = path; + result.Path_ = AddPathPrefix(result.Path_, Config_->Prefix); + return NThreading::MakeFuture(result); + } +} + +TFuture<TVector<TTableColumnarStatistics>> TRawBatchRequest::GetTableColumnarStatistics( + const TTransactionId& transaction, + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options) +{ + return AddRequest<TTableColumnarStatisticsParser>( + "get_table_columnar_statistics", + SerializeParamsForGetTableColumnarStatistics(transaction, paths, options), + Nothing()); +} + +TFuture<TMultiTablePartitions> TRawBatchRequest::GetTablePartitions( + const TTransactionId& 
transaction, + const TVector<TRichYPath>& paths, + const TGetTablePartitionsOptions& options) +{ + return AddRequest<TTablePartitionsParser>( + "partition_tables", + SerializeParamsForGetTablePartitions(transaction, paths, options), + Nothing()); +} + +void TRawBatchRequest::FillParameterList(size_t maxSize, TNode* result, TInstant* nextTry) const +{ + Y_VERIFY(result); + Y_VERIFY(nextTry); + + *nextTry = TInstant(); + maxSize = Min(maxSize, BatchItemList_.size()); + *result = TNode::CreateList(); + for (size_t i = 0; i < maxSize; ++i) { + YT_LOG_DEBUG("ExecuteBatch preparing: %v", + RequestInfo(BatchItemList_[i].Parameters)); + + result->Add(BatchItemList_[i].Parameters); + if (BatchItemList_[i].NextTry > *nextTry) { + *nextTry = BatchItemList_[i].NextTry; + } + } +} + +void TRawBatchRequest::ParseResponse( + const TResponseInfo& requestResult, + const IRequestRetryPolicyPtr& retryPolicy, + TRawBatchRequest* retryBatch, + TInstant now) +{ + TNode node = NodeFromYsonString(requestResult.Response); + return ParseResponse(node, requestResult.RequestId, retryPolicy, retryBatch, now); +} + +void TRawBatchRequest::ParseResponse( + TNode node, + const TString& requestId, + const IRequestRetryPolicyPtr& retryPolicy, + TRawBatchRequest* retryBatch, + TInstant now) +{ + Y_VERIFY(retryBatch); + + EnsureType(node, TNode::List); + auto& responseList = node.AsList(); + const auto size = responseList.size(); + Y_ENSURE(size <= BatchItemList_.size(), + "Size of server response exceeds size of batch request;" + " size of batch: " << BatchItemList_.size() << + " size of server response: " << size << '.'); + + for (size_t i = 0; i != size; ++i) { + try { + EnsureType(responseList[i], TNode::Map); + auto& responseNode = responseList[i].AsMap(); + const auto outputIt = responseNode.find("output"); + if (outputIt != responseNode.end()) { + BatchItemList_[i].ResponseParser->SetResponse(std::move(outputIt->second)); + } else { + const auto errorIt = responseNode.find("error"); + if 
(errorIt == responseNode.end()) { + BatchItemList_[i].ResponseParser->SetResponse(Nothing()); + } else { + TErrorResponse error(400, requestId); + error.SetError(TYtError(errorIt->second)); + if (auto curInterval = IsRetriable(error) ? retryPolicy->OnRetriableError(error) : Nothing()) { + YT_LOG_INFO( + "Batch subrequest (%s) failed, will retry, error: %s", + RequestInfo(BatchItemList_[i].Parameters), + error.what()); + retryBatch->AddRequest(TBatchItem(BatchItemList_[i], now + *curInterval)); + } else { + YT_LOG_ERROR( + "Batch subrequest (%s) failed, error: %s", + RequestInfo(BatchItemList_[i].Parameters), + error.what()); + BatchItemList_[i].ResponseParser->SetException(std::make_exception_ptr(error)); + } + } + } + } catch (const std::exception& e) { + // We don't expect other exceptions, so we don't catch (...) + BatchItemList_[i].ResponseParser->SetException(std::current_exception()); + } + } + BatchItemList_.erase(BatchItemList_.begin(), BatchItemList_.begin() + size); +} + +void TRawBatchRequest::SetErrorResult(std::exception_ptr e) const +{ + for (const auto& batchItem : BatchItemList_) { + batchItem.ResponseParser->SetException(e); + } +} + +size_t TRawBatchRequest::BatchSize() const +{ + return BatchItemList_.size(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail::NRawClient diff --git a/yt/cpp/mapreduce/raw_client/raw_batch_request.h b/yt/cpp/mapreduce/raw_client/raw_batch_request.h new file mode 100644 index 0000000000..7ed5bebf5e --- /dev/null +++ b/yt/cpp/mapreduce/raw_client/raw_batch_request.h @@ -0,0 +1,190 @@ +#pragma once + +#include <yt/cpp/mapreduce/common/fwd.h> + +#include <yt/cpp/mapreduce/interface/batch_request.h> +#include <yt/cpp/mapreduce/interface/fwd.h> +#include <yt/cpp/mapreduce/interface/node.h> +#include <yt/cpp/mapreduce/interface/retry_policy.h> + +#include <yt/cpp/mapreduce/http/requests.h> + +#include <library/cpp/threading/future/future.h> + +#include 
<util/generic/ptr.h> +#include <util/generic/deque.h> + +#include <exception> + +namespace NYT::NDetail { + struct TResponseInfo; +} + +namespace NYT::NDetail::NRawClient { + +//////////////////////////////////////////////////////////////////////////////// + +class TRawBatchRequest + : public TThrRefBase +{ +public: + struct IResponseItemParser + : public TThrRefBase + { + ~IResponseItemParser() = default; + + virtual void SetResponse(TMaybe<TNode> node) = 0; + virtual void SetException(std::exception_ptr e) = 0; + }; + +public: + TRawBatchRequest(const TConfigPtr& config); + ~TRawBatchRequest(); + + bool IsExecuted() const; + void MarkExecuted(); + + void FillParameterList(size_t maxSize, TNode* result, TInstant* nextTry) const; + + size_t BatchSize() const; + + void ParseResponse( + const TResponseInfo& requestResult, + const IRequestRetryPolicyPtr& retryPolicy, + TRawBatchRequest* retryBatch, + TInstant now = TInstant::Now()); + void ParseResponse( + TNode response, + const TString& requestId, + const IRequestRetryPolicyPtr& retryPolicy, + TRawBatchRequest* retryBatch, + TInstant now = TInstant::Now()); + void SetErrorResult(std::exception_ptr e) const; + + ::NThreading::TFuture<TNodeId> Create( + const TTransactionId& transaction, + const TYPath& path, + ENodeType type, + const TCreateOptions& options); + ::NThreading::TFuture<void> Remove( + const TTransactionId& transaction, + const TYPath& path, + const TRemoveOptions& options); + ::NThreading::TFuture<bool> Exists( + const TTransactionId& transaction, + const TYPath& path, + const TExistsOptions& options); + ::NThreading::TFuture<TNode> Get( + const TTransactionId& transaction, + const TYPath& path, + const TGetOptions& options); + ::NThreading::TFuture<void> Set( + const TTransactionId& transaction, + const TYPath& path, + const TNode& value, + const TSetOptions& options); + ::NThreading::TFuture<TNode::TListType> List( + const TTransactionId& transaction, + const TYPath& path, + const TListOptions& 
options); + ::NThreading::TFuture<TNodeId> Copy( + const TTransactionId& transaction, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options); + ::NThreading::TFuture<TNodeId> Move( + const TTransactionId& transaction, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options); + ::NThreading::TFuture<TNodeId> Link( + const TTransactionId& transaction, + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options); + ::NThreading::TFuture<TLockId> Lock( + const TTransactionId& transaction, + const TYPath& path, + ELockMode mode, + const TLockOptions& options); + ::NThreading::TFuture<void> Unlock( + const TTransactionId& transaction, + const TYPath& path, + const TUnlockOptions& options); + ::NThreading::TFuture<TMaybe<TYPath>> GetFileFromCache( + const TTransactionId& transactionId, + const TString& md5Signature, + const TYPath& cachePath, + const TGetFileFromCacheOptions& options); + ::NThreading::TFuture<TYPath> PutFileToCache( + const TTransactionId& transactionId, + const TYPath& filePath, + const TString& md5Signature, + const TYPath& cachePath, + const TPutFileToCacheOptions& options); + ::NThreading::TFuture<TCheckPermissionResponse> CheckPermission( + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options); + ::NThreading::TFuture<TOperationAttributes> GetOperation( + const TOperationId& operationId, + const TGetOperationOptions& options); + ::NThreading::TFuture<void> AbortOperation(const TOperationId& operationId); + ::NThreading::TFuture<void> CompleteOperation(const TOperationId& operationId); + ::NThreading::TFuture<void> SuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options); + ::NThreading::TFuture<void> ResumeOperation( + const TOperationId& operationId, + const TResumeOperationOptions& options); + ::NThreading::TFuture<void> UpdateOperationParameters( + const 
TOperationId& operationId, + const TUpdateOperationParametersOptions& options); + ::NThreading::TFuture<TRichYPath> CanonizeYPath(const TRichYPath& path); + ::NThreading::TFuture<TVector<TTableColumnarStatistics>> GetTableColumnarStatistics( + const TTransactionId& transaction, + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options); + ::NThreading::TFuture<TMultiTablePartitions> GetTablePartitions( + const TTransactionId& transaction, + const TVector<TRichYPath>& paths, + const TGetTablePartitionsOptions& options); + +private: + struct TBatchItem { + TNode Parameters; + ::TIntrusivePtr<IResponseItemParser> ResponseParser; + TInstant NextTry; + + TBatchItem(TNode parameters, ::TIntrusivePtr<IResponseItemParser> responseParser); + + TBatchItem(const TBatchItem& batchItem, TInstant nextTry); + }; + +private: + template <typename TResponseParser> + typename TResponseParser::TFutureResult AddRequest( + const TString& command, + TNode parameters, + TMaybe<TNode> input); + + template <typename TResponseParser> + typename TResponseParser::TFutureResult AddRequest( + const TString& command, + TNode parameters, + TMaybe<TNode> input, + ::TIntrusivePtr<TResponseParser> parser); + + void AddRequest(TBatchItem batchItem); + +private: + TConfigPtr Config_; + + TDeque<TBatchItem> BatchItemList_; + bool Executed_ = false; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail::NRawClient diff --git a/yt/cpp/mapreduce/raw_client/raw_requests.cpp b/yt/cpp/mapreduce/raw_client/raw_requests.cpp new file mode 100644 index 0000000000..26120759fd --- /dev/null +++ b/yt/cpp/mapreduce/raw_client/raw_requests.cpp @@ -0,0 +1,1027 @@ +#include "raw_requests.h" + +#include "raw_batch_request.h" +#include "rpc_parameters_serialization.h" + +#include <yt/cpp/mapreduce/common/helpers.h> +#include <yt/cpp/mapreduce/common/retry_lib.h> +#include <yt/cpp/mapreduce/common/wait_proxy.h> + +#include 
<yt/cpp/mapreduce/http/fwd.h> +#include <yt/cpp/mapreduce/http/context.h> +#include <yt/cpp/mapreduce/http/helpers.h> +#include <yt/cpp/mapreduce/http/http_client.h> +#include <yt/cpp/mapreduce/http/retry_request.h> + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/client.h> +#include <yt/cpp/mapreduce/interface/operation.h> +#include <yt/cpp/mapreduce/interface/serialize.h> +#include <yt/cpp/mapreduce/interface/tvm.h> + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <util/generic/guid.h> +#include <util/generic/scope.h> + +namespace NYT::NDetail::NRawClient { + +/////////////////////////////////////////////////////////////////////////////// + +// Sends the sub-requests queued in `batchRequest` to the "execute_batch" command in parts of at most +// `options.BatchPartMaxSize_` items (default: 5 * concurrency) until the batch is empty. +// Throws yexception if the batch was already executed; the batch is marked executed on every exit path. +void ExecuteBatch( + IRequestRetryPolicyPtr retryPolicy, + const TClientContext& context, + TRawBatchRequest& batchRequest, + const TExecuteBatchOptions& options) +{ + if (batchRequest.IsExecuted()) { + ythrow yexception() << "Cannot execute batch request since it is already executed"; + } + Y_DEFER { + batchRequest.MarkExecuted(); + }; + + const auto concurrency = options.Concurrency_.GetOrElse(50); + const auto batchPartMaxSize = options.BatchPartMaxSize_.GetOrElse(concurrency * 5); + + if (!retryPolicy) { + retryPolicy = CreateDefaultRequestRetryPolicy(context.Config); + } + + while (batchRequest.BatchSize()) { + TRawBatchRequest retryBatch(context.Config); + + while (batchRequest.BatchSize()) { + auto parameters = TNode::CreateMap(); + TInstant nextTry; + // Both out-pointers are filled by FillParameterList: the list of sub-requests and the latest NextTry among them. + batchRequest.FillParameterList(batchPartMaxSize, &parameters["requests"], &nextTry); + if (nextTry) { + SleepUntil(nextTry); + } + parameters["concurrency"] = concurrency; + auto body = NodeToYsonString(parameters); + THttpHeader header("POST", "execute_batch"); + header.AddMutationId(); + NDetail::TResponseInfo result; + try { + result = RetryRequestWithPolicy(retryPolicy, context, header, body); + } catch (const std::exception& e) { +
batchRequest.SetErrorResult(std::current_exception()); + retryBatch.SetErrorResult(std::current_exception()); + throw; + } + batchRequest.ParseResponse(std::move(result), retryPolicy.Get(), &retryBatch); + } + + batchRequest = std::move(retryBatch); + } +} + +TNode Get( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TGetOptions& options) +{ + THttpHeader header("GET", "get"); + header.MergeParameters(SerializeParamsForGet(transactionId, context.Config->Prefix, path, options)); + return NodeFromYsonString(RetryRequestWithPolicy(retryPolicy, context, header).Response); +} + +TNode TryGet( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TGetOptions& options) +{ + try { + return Get(retryPolicy, context, transactionId, path, options); + } catch (const TErrorResponse& error) { + if (!error.IsResolveError()) { + throw; + } + return TNode(); + } +} + +void Set( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TNode& value, + const TSetOptions& options) +{ + THttpHeader header("PUT", "set"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForSet(transactionId, context.Config->Prefix, path, options)); + auto body = NodeToYsonString(value); + RetryRequestWithPolicy(retryPolicy, context, header, body); +} + +void MultisetAttributes( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TNode::TMapType& value, + const TMultisetAttributesOptions& options) +{ + THttpHeader header("PUT", "api/v4/multiset_attributes", false); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForMultisetAttributes(transactionId, context.Config->Prefix, path, options)); + + auto body 
= NodeToYsonString(value); + RetryRequestWithPolicy(retryPolicy, context, header, body); +} + +bool Exists( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TExistsOptions& options) +{ + THttpHeader header("GET", "exists"); + header.MergeParameters(SerializeParamsForExists(transactionId, context.Config->Prefix, path, options)); + return ParseBoolFromResponse(RetryRequestWithPolicy(retryPolicy, context, header).Response); +} + +TNodeId Create( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const ENodeType& type, + const TCreateOptions& options) +{ + THttpHeader header("POST", "create"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForCreate(transactionId, context.Config->Prefix, path, type, options)); + return ParseGuidFromResponse(RetryRequestWithPolicy(retryPolicy, context, header).Response); +} + +TNodeId Copy( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options) +{ + THttpHeader header("POST", "copy"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForCopy(transactionId, context.Config->Prefix, sourcePath, destinationPath, options)); + return ParseGuidFromResponse(RetryRequestWithPolicy(retryPolicy, context, header).Response); +} + +TNodeId Move( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options) +{ + THttpHeader header("POST", "move"); + header.AddMutationId(); + header.MergeParameters(NRawClient::SerializeParamsForMove(transactionId, context.Config->Prefix, sourcePath, destinationPath, options)); + return 
ParseGuidFromResponse(RetryRequestWithPolicy(retryPolicy, context, header).Response); +} + +void Remove( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TRemoveOptions& options) +{ + THttpHeader header("POST", "remove"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForRemove(transactionId, context.Config->Prefix, path, options)); + RetryRequestWithPolicy(retryPolicy, context, header); +} + +TNode::TListType List( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TListOptions& options) +{ + THttpHeader header("GET", "list"); + + TYPath updatedPath = AddPathPrefix(path, context.Config->Prefix); + // Translate "//" to "/" + // Translate "//some/custom/prefix/from/config/" to "//some/custom/prefix/from/config" + if (path.empty() && updatedPath.EndsWith('/')) { + updatedPath.pop_back(); + } + header.MergeParameters(SerializeParamsForList(transactionId, context.Config->Prefix, updatedPath, options)); + auto result = RetryRequestWithPolicy(retryPolicy, context, header); + return NodeFromYsonString(result.Response).AsList(); +} + +TNodeId Link( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options) +{ + THttpHeader header("POST", "link"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForLink(transactionId, context.Config->Prefix, targetPath, linkPath, options)); + return ParseGuidFromResponse(RetryRequestWithPolicy(retryPolicy, context, header).Response); +} + +TLockId Lock( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + ELockMode mode, + const TLockOptions& options) +{ + THttpHeader header("POST", 
"lock"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForLock(transactionId, context.Config->Prefix, path, mode, options)); + return ParseGuidFromResponse(RetryRequestWithPolicy(retryPolicy, context, header).Response); +} + +void Unlock( + IRequestRetryPolicyPtr retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TUnlockOptions& options) +{ + THttpHeader header("POST", "unlock"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForUnlock(transactionId, context.Config->Prefix, path, options)); + RetryRequestWithPolicy(retryPolicy, context, header); +} + +void Concatenate( + const TClientContext& context, + const TTransactionId& transactionId, + const TVector<TRichYPath>& sourcePaths, + const TRichYPath& destinationPath, + const TConcatenateOptions& options) +{ + THttpHeader header("POST", "concatenate"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForConcatenate(transactionId, context.Config->Prefix, sourcePaths, destinationPath, options)); + RequestWithoutRetry(context, header); +} + +void PingTx( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId) +{ + THttpHeader header("POST", "ping_tx"); + header.MergeParameters(SerializeParamsForPingTx(transactionId)); + TRequestConfig requestConfig; + requestConfig.HttpConfig = NHttpClient::THttpConfig{ + .SocketTimeout = context.Config->PingTimeout + }; + RetryRequestWithPolicy(retryPolicy, context, header, {}, requestConfig); +} + +TOperationAttributes ParseOperationAttributes(const TNode& node) +{ + const auto& mapNode = node.AsMap(); + TOperationAttributes result; + + if (auto idNode = mapNode.FindPtr("id")) { + result.Id = GetGuid(idNode->AsString()); + } + + if (auto typeNode = mapNode.FindPtr("type")) { + result.Type = FromString<EOperationType>(typeNode->AsString()); + } else if (auto operationTypeNode = 
mapNode.FindPtr("operation_type")) { + // COMPAT(levysotsky): "operation_type" is a deprecated synonym for "type". + // This branch should be removed when all clusters are updated. + result.Type = FromString<EOperationType>(operationTypeNode->AsString()); + } + + if (auto stateNode = mapNode.FindPtr("state")) { + result.State = stateNode->AsString(); + // We don't use FromString here, because EOperationBriefState::InProgress unites many states: "initializing", "running", etc. + if (*result.State == "completed") { + result.BriefState = EOperationBriefState::Completed; + } else if (*result.State == "aborted") { + result.BriefState = EOperationBriefState::Aborted; + } else if (*result.State == "failed") { + result.BriefState = EOperationBriefState::Failed; + } else { + result.BriefState = EOperationBriefState::InProgress; + } + } + if (auto authenticatedUserNode = mapNode.FindPtr("authenticated_user")) { + result.AuthenticatedUser = authenticatedUserNode->AsString(); + } + if (auto startTimeNode = mapNode.FindPtr("start_time")) { + result.StartTime = TInstant::ParseIso8601(startTimeNode->AsString()); + } + if (auto finishTimeNode = mapNode.FindPtr("finish_time")) { + result.FinishTime = TInstant::ParseIso8601(finishTimeNode->AsString()); + } + auto briefProgressNode = mapNode.FindPtr("brief_progress"); + if (briefProgressNode && briefProgressNode->HasKey("jobs")) { + result.BriefProgress.ConstructInPlace(); + static auto load = [] (const TNode& item) { + // Backward compatibility with old YT versions + return item.IsInt64() ? 
item.AsInt64() : item["total"].AsInt64(); + }; + const auto& jobs = (*briefProgressNode)["jobs"]; + result.BriefProgress->Aborted = load(jobs["aborted"]); + result.BriefProgress->Completed = load(jobs["completed"]); + result.BriefProgress->Running = jobs["running"].AsInt64(); + result.BriefProgress->Total = jobs["total"].AsInt64(); + result.BriefProgress->Failed = jobs["failed"].AsInt64(); + result.BriefProgress->Lost = jobs["lost"].AsInt64(); + result.BriefProgress->Pending = jobs["pending"].AsInt64(); + } + if (auto briefSpecNode = mapNode.FindPtr("brief_spec")) { + result.BriefSpec = *briefSpecNode; + } + if (auto specNode = mapNode.FindPtr("spec")) { + result.Spec = *specNode; + } + if (auto fullSpecNode = mapNode.FindPtr("full_spec")) { + result.FullSpec = *fullSpecNode; + } + if (auto unrecognizedSpecNode = mapNode.FindPtr("unrecognized_spec")) { + result.UnrecognizedSpec = *unrecognizedSpecNode; + } + if (auto suspendedNode = mapNode.FindPtr("suspended")) { + result.Suspended = suspendedNode->AsBool(); + } + if (auto resultNode = mapNode.FindPtr("result")) { + result.Result.ConstructInPlace(); + auto error = TYtError((*resultNode)["error"]); + if (error.GetCode() != 0) { + result.Result->Error = std::move(error); + } + } + if (auto progressNode = mapNode.FindPtr("progress")) { + const auto& progressMap = progressNode->AsMap(); + TMaybe<TInstant> buildTime; + if (auto buildTimeNode = progressMap.FindPtr("build_time")) { + buildTime = TInstant::ParseIso8601(buildTimeNode->AsString()); + } + TJobStatistics jobStatistics; + if (auto jobStatisticsNode = progressMap.FindPtr("job_statistics")) { + jobStatistics = TJobStatistics(*jobStatisticsNode); + } + TJobCounters jobCounters; + if (auto jobCountersNode = progressMap.FindPtr("total_job_counter")) { + jobCounters = TJobCounters(*jobCountersNode); + } + result.Progress = TOperationProgress{ + .JobStatistics = std::move(jobStatistics), + .JobCounters = std::move(jobCounters), + .BuildTime = buildTime, + }; + } + if 
(auto eventsNode = mapNode.FindPtr("events")) { + result.Events.ConstructInPlace().reserve(eventsNode->Size()); + for (const auto& eventNode : eventsNode->AsList()) { + result.Events->push_back(TOperationEvent{ + eventNode["state"].AsString(), + TInstant::ParseIso8601(eventNode["time"].AsString()), + }); + } + } + if (auto alertsNode = mapNode.FindPtr("alerts")) { + result.Alerts.ConstructInPlace(); + for (const auto& [alertType, alertError] : alertsNode->AsMap()) { + result.Alerts->emplace(alertType, TYtError(alertError)); + } + } + + return result; +} + +TOperationAttributes GetOperation( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TGetOperationOptions& options) +{ + THttpHeader header("GET", "get_operation"); + header.MergeParameters(SerializeParamsForGetOperation(operationId, options)); + auto result = RetryRequestWithPolicy(retryPolicy, context, header); + return ParseOperationAttributes(NodeFromYsonString(result.Response)); +} + +void AbortOperation( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId) +{ + THttpHeader header("POST", "abort_op"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForAbortOperation(operationId)); + RetryRequestWithPolicy(retryPolicy, context, header); +} + +void CompleteOperation( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId) +{ + THttpHeader header("POST", "complete_op"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForCompleteOperation(operationId)); + RetryRequestWithPolicy(retryPolicy, context, header); +} + +void SuspendOperation( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TSuspendOperationOptions& options) +{ + THttpHeader header("POST", "suspend_op"); + header.AddMutationId(); + 
header.MergeParameters(SerializeParamsForSuspendOperation(operationId, options)); + RetryRequestWithPolicy(retryPolicy, context, header); +} + +void ResumeOperation( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TResumeOperationOptions& options) +{ + THttpHeader header("POST", "resume_op"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForResumeOperation(operationId, options)); + RetryRequestWithPolicy(retryPolicy, context, header); +} + +template <typename TKey> +static THashMap<TKey, i64> GetCounts(const TNode& countsNode) +{ + THashMap<TKey, i64> counts; + for (const auto& entry : countsNode.AsMap()) { + counts.emplace(FromString<TKey>(entry.first), entry.second.AsInt64()); + } + return counts; +} + +TListOperationsResult ListOperations( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TListOperationsOptions& options) +{ + THttpHeader header("GET", "list_operations"); + header.MergeParameters(SerializeParamsForListOperations(options)); + auto responseInfo = RetryRequestWithPolicy(retryPolicy, context, header); + auto resultNode = NodeFromYsonString(responseInfo.Response); + + TListOperationsResult result; + for (const auto& operationNode : resultNode["operations"].AsList()) { + result.Operations.push_back(ParseOperationAttributes(operationNode)); + } + + if (resultNode.HasKey("pool_counts")) { + result.PoolCounts = GetCounts<TString>(resultNode["pool_counts"]); + } + if (resultNode.HasKey("user_counts")) { + result.UserCounts = GetCounts<TString>(resultNode["user_counts"]); + } + if (resultNode.HasKey("type_counts")) { + result.TypeCounts = GetCounts<EOperationType>(resultNode["type_counts"]); + } + if (resultNode.HasKey("state_counts")) { + result.StateCounts = GetCounts<TString>(resultNode["state_counts"]); + } + if (resultNode.HasKey("failed_jobs_count")) { + result.WithFailedJobsCount = resultNode["failed_jobs_count"].AsInt64(); 
+ } + + result.Incomplete = resultNode["incomplete"].AsBool(); + + return result; +} + +void UpdateOperationParameters( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options) +{ + THttpHeader header("POST", "update_op_parameters"); + header.MergeParameters(SerializeParamsForUpdateOperationParameters(operationId, options)); + RetryRequestWithPolicy(retryPolicy, context, header); +} + +TJobAttributes ParseJobAttributes(const TNode& node) +{ + const auto& mapNode = node.AsMap(); + TJobAttributes result; + + // Currently "get_job" returns "job_id" field and "list_jobs" returns "id" field. + auto idNode = mapNode.FindPtr("id"); + if (!idNode) { + idNode = mapNode.FindPtr("job_id"); + } + if (idNode) { + result.Id = GetGuid(idNode->AsString()); + } + + if (auto typeNode = mapNode.FindPtr("type")) { + result.Type = FromString<EJobType>(typeNode->AsString()); + } + if (auto stateNode = mapNode.FindPtr("state")) { + result.State = FromString<EJobState>(stateNode->AsString()); + } + if (auto addressNode = mapNode.FindPtr("address")) { + result.Address = addressNode->AsString(); + } + if (auto taskNameNode = mapNode.FindPtr("task_name")) { + result.TaskName = taskNameNode->AsString(); + } + if (auto startTimeNode = mapNode.FindPtr("start_time")) { + result.StartTime = TInstant::ParseIso8601(startTimeNode->AsString()); + } + if (auto finishTimeNode = mapNode.FindPtr("finish_time")) { + result.FinishTime = TInstant::ParseIso8601(finishTimeNode->AsString()); + } + if (auto progressNode = mapNode.FindPtr("progress")) { + result.Progress = progressNode->AsDouble(); + } + if (auto stderrSizeNode = mapNode.FindPtr("stderr_size")) { + result.StderrSize = stderrSizeNode->AsUint64(); + } + if (auto errorNode = mapNode.FindPtr("error")) { + result.Error.ConstructInPlace(*errorNode); + } + if (auto briefStatisticsNode = mapNode.FindPtr("brief_statistics")) { + 
result.BriefStatistics = *briefStatisticsNode; + } + if (auto inputPathsNode = mapNode.FindPtr("input_paths")) { + const auto& inputPathNodesList = inputPathsNode->AsList(); + result.InputPaths.ConstructInPlace(); + result.InputPaths->reserve(inputPathNodesList.size()); + for (const auto& inputPathNode : inputPathNodesList) { + TRichYPath path; + Deserialize(path, inputPathNode); + result.InputPaths->push_back(std::move(path)); + } + } + if (auto coreInfosNode = mapNode.FindPtr("core_infos")) { + const auto& coreInfoNodesList = coreInfosNode->AsList(); + result.CoreInfos.ConstructInPlace(); + result.CoreInfos->reserve(coreInfoNodesList.size()); + for (const auto& coreInfoNode : coreInfoNodesList) { + TCoreInfo coreInfo; + coreInfo.ProcessId = coreInfoNode["process_id"].AsInt64(); + coreInfo.ExecutableName = coreInfoNode["executable_name"].AsString(); + if (coreInfoNode.HasKey("size")) { + coreInfo.Size = coreInfoNode["size"].AsUint64(); + } + if (coreInfoNode.HasKey("error")) { + coreInfo.Error.ConstructInPlace(coreInfoNode["error"]); + } + result.CoreInfos->push_back(std::move(coreInfo)); + } + } + return result; +} +/* Sends a GET "get_job" request and parses the YSON reply with ParseJobAttributes. */ +TJobAttributes GetJob( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobOptions& options) +{ + THttpHeader header("GET", "get_job"); + header.MergeParameters(SerializeParamsForGetJob(operationId, jobId, options)); + auto responseInfo = RetryRequestWithPolicy(retryPolicy, context, header); + auto resultNode = NodeFromYsonString(responseInfo.Response); + return ParseJobAttributes(resultNode); +} +/* Sends a GET "list_jobs" request. Each element of the reply's "jobs" list is parsed with ParseJobAttributes; the per-source job counters are copied only when present and non-null. */ +TListJobsResult ListJobs( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TListJobsOptions& options) +{ + THttpHeader header("GET", "list_jobs"); + header.MergeParameters(SerializeParamsForListJobs(operationId, options)); + auto responseInfo = RetryRequestWithPolicy(retryPolicy, 
context, header); + auto resultNode = NodeFromYsonString(responseInfo.Response); + + TListJobsResult result; + + const auto& jobNodesList = resultNode["jobs"].AsList(); + result.Jobs.reserve(jobNodesList.size()); + for (const auto& jobNode : jobNodesList) { + result.Jobs.push_back(ParseJobAttributes(jobNode)); + } + + /* Each counter is optional: it is copied only when its key is present and the value is not null. */ + if (resultNode.HasKey("cypress_job_count") && !resultNode["cypress_job_count"].IsNull()) { + result.CypressJobCount = resultNode["cypress_job_count"].AsInt64(); + } + if (resultNode.HasKey("controller_agent_job_count") && !resultNode["controller_agent_job_count"].IsNull()) { + /* Read the same key that was just checked; the previous code read "scheduler_job_count" here. */ + result.ControllerAgentJobCount = resultNode["controller_agent_job_count"].AsInt64(); + } + if (resultNode.HasKey("archive_job_count") && !resultNode["archive_job_count"].IsNull()) { + result.ArchiveJobCount = resultNode["archive_job_count"].AsInt64(); + } + + return result; +} + +/* IFileReader over the response stream of one HTTP request. The constructor sets a service ticket on the header when context.ServiceTicketAuth is set and the token otherwise, picks a host with GetProxyForHeavyRequest, issues the request through context.HttpClient and keeps the response stream; no retry policy is involved. */ +class TResponseReader + : public IFileReader +{ +public: + TResponseReader(const TClientContext& context, THttpHeader header) + { + if (context.ServiceTicketAuth) { + header.SetServiceTicket(context.ServiceTicketAuth->Ptr->IssueServiceTicket()); + } else { + header.SetToken(context.Token); + } + + auto hostName = GetProxyForHeavyRequest(context); + auto requestId = CreateGuidAsString(); + + Response_ = context.HttpClient->Request(GetFullUrl(hostName, context, header), requestId, header); + ResponseStream_ = Response_->GetResponseStream(); + } + +private: + size_t DoRead(void* buf, size_t len) override + { + return ResponseStream_->Read(buf, len); + } + + size_t DoSkip(size_t len) override + { + return ResponseStream_->Skip(len); + } + +private: + /* NOTE(review): Request_ is never referenced inside this class -- confirm whether it is still needed. */ + THttpRequest Request_; + NHttpClient::IHttpResponsePtr Response_; + IInputStream* ResponseStream_; +}; + +/* Returns a TResponseReader over a GET "get_job_input" request for the given job id; the options argument is unused. */ +IFileReaderPtr GetJobInput( + const TClientContext& context, + const TJobId& jobId, + const TGetJobInputOptions& /* options */) +{ + THttpHeader header("GET", "get_job_input"); + header.AddParameter("job_id", GetGuidAsString(jobId)); + return new 
TResponseReader(context, std::move(header)); +} + +/* Returns a TResponseReader over a GET "get_job_fail_context" request carrying the operation id and the job id; the options argument is unused. */ +IFileReaderPtr GetJobFailContext( + const TClientContext& context, + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobFailContextOptions& /* options */) +{ + THttpHeader failContextRequest("GET", "get_job_fail_context"); + failContextRequest.AddOperationId(operationId); + failContextRequest.AddParameter("job_id", GetGuidAsString(jobId)); + return new TResponseReader(context, std::move(failContextRequest)); +} + +/* Fetches the job stderr as one string: a GET "get_job_stderr" request marked as heavy is sent through RetryRequestWithPolicy and the response body is returned. The options argument is unused. */ +TString GetJobStderrWithRetries( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobStderrOptions& /* options */) +{ + THttpHeader stderrRequest("GET", "get_job_stderr"); + stderrRequest.AddOperationId(operationId); + stderrRequest.AddParameter("job_id", GetGuidAsString(jobId)); + TRequestConfig heavyConfig; + heavyConfig.IsHeavy = true; + auto stderrResponse = RetryRequestWithPolicy(retryPolicy, context, stderrRequest, {}, heavyConfig); + return stderrResponse.Response; +} + +/* Returns a TResponseReader over a GET "get_job_stderr" request carrying the operation id and the job id; no retry policy is used and the options argument is unused. */ +IFileReaderPtr GetJobStderr( + const TClientContext& context, + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobStderrOptions& /* options */) +{ + THttpHeader stderrRequest("GET", "get_job_stderr"); + stderrRequest.AddOperationId(operationId); + stderrRequest.AddParameter("job_id", GetGuidAsString(jobId)); + return new TResponseReader(context, std::move(stderrRequest)); +} + +TMaybe<TYPath> GetFileFromCache( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TString& md5Signature, + const TYPath& cachePath, + const TGetFileFromCacheOptions& options) +{ + THttpHeader header("GET", "get_file_from_cache"); + header.MergeParameters(SerializeParamsForGetFileFromCache(transactionId, md5Signature, cachePath, options)); + auto responseInfo = RetryRequestWithPolicy(retryPolicy, context, header); + auto path = NodeFromYsonString(responseInfo.Response).AsString(); + return path.empty() ? 
Nothing() : TMaybe<TYPath>(path); +} +/* Sends a POST "put_file_to_cache" request and returns the string value of the YSON reply. */ +TYPath PutFileToCache( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& filePath, + const TString& md5Signature, + const TYPath& cachePath, + const TPutFileToCacheOptions& options) +{ + THttpHeader header("POST", "put_file_to_cache"); + header.MergeParameters(SerializeParamsForPutFileToCache(transactionId, context.Config->Prefix, filePath, md5Signature, cachePath, options)); + auto result = RetryRequestWithPolicy(retryPolicy, context, header); + return NodeFromYsonString(result.Response).AsString(); +} +/* Sends a POST "api/v1/share" request to the Skynet API host and repeats it until the reply has HTTP code 200. The host is context.Config->SkynetApiHost, or "skynet.<proxy>.yt.yandex.net" when that is empty, where <proxy> is context.ServerName up to its first dot. The request goes through a separate client context built with the default HTTP client. */ +TNode::TListType SkyShareTable( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const std::vector<TYPath>& tablePaths, + const TSkyShareTableOptions& options) +{ + THttpHeader header("POST", "api/v1/share", /*IsApi*/ false); + + auto proxyName = context.ServerName.substr(0, context.ServerName.find('.')); + + auto host = context.Config->SkynetApiHost; + if (host == "") { + host = "skynet." + proxyName + ".yt.yandex.net"; + } + + header.MergeParameters(SerializeParamsForSkyShareTable(proxyName, context.Config->Prefix, tablePaths, options)); + TClientContext skyApiHost({ .ServerName = host, .HttpClient = NHttpClient::CreateDefaultHttpClient() }); + TResponseInfo response = {}; + + // As documented at https://wiki.yandex-team.ru/yt/userdoc/blob_tables/#shag3.sozdajomrazdachu + // first request returns HTTP status code 202 (Accepted). And we need retrying until we have 200 (OK). 
+ while (response.HttpCode != 200) { + response = RetryRequestWithPolicy(retryPolicy, skyApiHost, header, ""); + /* Wait only when another attempt is needed; previously the 5 second sleep also ran after the final 200 reply. Like the original loop, this assumes a value-initialized TResponseInfo does not report code 200. */ + if (response.HttpCode != 200) { + TWaitProxy::Get()->Sleep(TDuration::Seconds(5)); + } + } + + /* With key columns the reply is JSON and its "torrents" list is returned; otherwise the raw reply body is wrapped into a single entry with an empty "key" list. */ + if (options.KeyColumns_) { + return NodeFromJsonString(response.Response)["torrents"].AsList(); + } else { + TNode torrent; + + torrent["key"] = TNode::CreateList(); + torrent["rbtorrent"] = response.Response; + + return TNode::TListType{ torrent }; + } +} + +/* Converts a check_permission reply node into TCheckPermissionResponse. The top-level map and every element of the optional "columns" list are parsed the same way: "action" is mandatory (looked up with at()), the object and subject ids and names are copied only when present. */ +TCheckPermissionResponse ParseCheckPermissionResponse(const TNode& node) +{ + auto parseSingleResult = [] (const TNode::TMapType& node) { + TCheckPermissionResult result; + result.Action = ::FromString<ESecurityAction>(node.at("action").AsString()); + if (auto objectId = node.FindPtr("object_id")) { + result.ObjectId = GetGuid(objectId->AsString()); + } + if (auto objectName = node.FindPtr("object_name")) { + result.ObjectName = objectName->AsString(); + } + if (auto subjectId = node.FindPtr("subject_id")) { + result.SubjectId = GetGuid(subjectId->AsString()); + } + if (auto subjectName = node.FindPtr("subject_name")) { + result.SubjectName = subjectName->AsString(); + } + return result; + }; + + const auto& mapNode = node.AsMap(); + TCheckPermissionResponse result; + /* Assign the parsed top-level result into the TCheckPermissionResult part of the response. */ + static_cast<TCheckPermissionResult&>(result) = parseSingleResult(mapNode); + if (auto columns = mapNode.FindPtr("columns")) { + result.Columns.reserve(columns->AsList().size()); + for (const auto& columnNode : columns->AsList()) { + result.Columns.push_back(parseSingleResult(columnNode.AsMap())); + } + } + return result; +} + +/* Sends a GET "check_permission" request and parses the YSON reply with ParseCheckPermissionResponse. */ +TCheckPermissionResponse CheckPermission( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options) +{ + THttpHeader header("GET", "check_permission"); + header.MergeParameters(SerializeParamsForCheckPermission(user, permission, context.Config->Prefix, path, options)); + auto response = RetryRequestWithPolicy(retryPolicy, 
context, header); + return ParseCheckPermissionResponse(NodeFromYsonString(response.Response)); +} + +/* Sends a POST "api/v4/get_tablet_infos" request and deserializes the "tablets" entry of the YSON reply map. */ +TVector<TTabletInfo> GetTabletInfos( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TYPath& path, + const TVector<int>& tabletIndexes, + const TGetTabletInfosOptions& options) +{ + THttpHeader header("POST", "api/v4/get_tablet_infos", false); + header.MergeParameters(SerializeParamsForGetTabletInfos(context.Config->Prefix, path, tabletIndexes, options)); + auto response = RetryRequestWithPolicy(retryPolicy, context, header); + TVector<TTabletInfo> result; + /* Look the key up with at(), as ParseCheckPermissionResponse does for "action", instead of dereferencing the FindPtr() result, which is null when "tablets" is missing. NOTE(review): at() is expected to throw on a missing key -- confirm against THashMap. */ + const auto responseNode = NodeFromYsonString(response.Response); + Deserialize(result, responseNode.AsMap().at("tablets")); + return result; +} + +/* Sends a GET "get_table_columnar_statistics" request marked as heavy and deserializes the YSON reply. */ +TVector<TTableColumnarStatistics> GetTableColumnarStatistics( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options) +{ + THttpHeader header("GET", "get_table_columnar_statistics"); + header.MergeParameters(SerializeParamsForGetTableColumnarStatistics(transactionId, paths, options)); + TRequestConfig config; + config.IsHeavy = true; + auto requestResult = RetryRequestWithPolicy(retryPolicy, context, header, {}, config); + auto response = NodeFromYsonString(requestResult.Response); + TVector<TTableColumnarStatistics> result; + Deserialize(result, response); + return result; +} + +/* Sends a GET "partition_tables" request marked as heavy and deserializes the YSON reply. */ +TMultiTablePartitions GetTablePartitions( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TVector<TRichYPath>& paths, + const TGetTablePartitionsOptions& options) +{ + THttpHeader header("GET", "partition_tables"); + header.MergeParameters(SerializeParamsForGetTablePartitions(transactionId, paths, options)); + TRequestConfig config; + config.IsHeavy = true; + auto requestResult = RetryRequestWithPolicy(retryPolicy, context, header, {}, config); + auto response = 
NodeFromYsonString(requestResult.Response); + TMultiTablePartitions result; + Deserialize(result, response); + return result; +} + +/* Single-path convenience wrapper: canonizes a one-element list with CanonizeYPaths and returns its first element. */ +TRichYPath CanonizeYPath( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TRichYPath& path) +{ + return CanonizeYPaths(retryPolicy, context, {path}).front(); +} + +/* Canonizes all paths in one batch request: one CanonizeYPath sub-request is added per path, the batch is executed once, and the results are extracted in input order. */ +TVector<TRichYPath> CanonizeYPaths( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TVector<TRichYPath>& paths) +{ + TRawBatchRequest batch(context.Config); + TVector<NThreading::TFuture<TRichYPath>> futures; + futures.reserve(paths.size()); + /* Range-for instead of an int index compared against static_cast<int>(paths.size()). */ + for (const auto& path : paths) { + futures.push_back(batch.CanonizeYPath(path)); + } + ExecuteBatch(retryPolicy, context, batch, TExecuteBatchOptions{}); + TVector<TRichYPath> result; + result.reserve(futures.size()); + for (auto& future : futures) { + result.push_back(future.ExtractValueSync()); + } + return result; +} + +/* Sends a POST "alter_table" request; AddMutationId() is called on the header before the parameters are merged. */ +void AlterTable( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TAlterTableOptions& options) +{ + THttpHeader header("POST", "alter_table"); + header.AddMutationId(); + header.MergeParameters(SerializeParamsForAlterTable(transactionId, context.Config->Prefix, path, options)); + RetryRequestWithPolicy(retryPolicy, context, header); +} + +/* Sends a POST "alter_table_replica" request; AddMutationId() is called on the header before the parameters are merged. */ +void AlterTableReplica( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TReplicaId& replicaId, + const TAlterTableReplicaOptions& options) +{ + THttpHeader header("POST", "alter_table_replica"); + header.AddMutationId(); + header.MergeParameters(NRawClient::SerializeParamsForAlterTableReplica(replicaId, options)); + RetryRequestWithPolicy(retryPolicy, context, header); +} + +void DeleteRows( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TYPath& path, + const TNode::TListType& keys, + const 
TDeleteRowsOptions& options) +{ + /* PUT "delete_rows": the keys are sent as the request body in binary YSON and the request is marked as heavy. */ + THttpHeader deleteRequest("PUT", "delete_rows"); + deleteRequest.SetInputFormat(TFormat::YsonBinary()); + deleteRequest.MergeParameters(NRawClient::SerializeParametersForDeleteRows(context.Config->Prefix, path, options)); + + auto serializedKeys = NodeListToYsonString(keys); + TRequestConfig heavyConfig; + heavyConfig.IsHeavy = true; + RetryRequestWithPolicy(retryPolicy, context, deleteRequest, serializedKeys, heavyConfig); +} + +/* Sends a POST "freeze_table" request with the serialized path and options. */ +void FreezeTable( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TYPath& path, + const TFreezeTableOptions& options) +{ + THttpHeader freezeRequest("POST", "freeze_table"); + freezeRequest.MergeParameters(SerializeParamsForFreezeTable(context.Config->Prefix, path, options)); + RetryRequestWithPolicy(retryPolicy, context, freezeRequest); +} + +/* Sends a POST "unfreeze_table" request with the serialized path and options. */ +void UnfreezeTable( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TYPath& path, + const TUnfreezeTableOptions& options) +{ + THttpHeader unfreezeRequest("POST", "unfreeze_table"); + unfreezeRequest.MergeParameters(SerializeParamsForUnfreezeTable(context.Config->Prefix, path, options)); + RetryRequestWithPolicy(retryPolicy, context, unfreezeRequest); +} + +/* Sends a POST "abort_tx" request for the transaction; AddMutationId() is called on the header first. */ +void AbortTransaction( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId) +{ + THttpHeader abortRequest("POST", "abort_tx"); + abortRequest.AddMutationId(); + abortRequest.MergeParameters(NRawClient::SerializeParamsForAbortTransaction(transactionId)); + RetryRequestWithPolicy(retryPolicy, context, abortRequest); +} + +/* Sends a POST "commit_tx" request for the transaction; AddMutationId() is called on the header first. */ +void CommitTransaction( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId) +{ + THttpHeader commitRequest("POST", "commit_tx"); + commitRequest.AddMutationId(); + commitRequest.MergeParameters(NRawClient::SerializeParamsForCommitTransaction(transactionId)); + RetryRequestWithPolicy(retryPolicy, context, commitRequest); +} + +TTransactionId StartTransaction( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + 
const TTransactionId& parentTransactionId, + const TStartTransactionOptions& options) +{ + THttpHeader header("POST", "start_tx"); + header.AddMutationId(); + header.MergeParameters(NRawClient::SerializeParamsForStartTransaction(parentTransactionId, context.Config->TxTimeout, options)); + return ParseGuidFromResponse(RetryRequestWithPolicy(retryPolicy, context, header).Response); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail::NRawClient diff --git a/yt/cpp/mapreduce/raw_client/raw_requests.h b/yt/cpp/mapreduce/raw_client/raw_requests.h new file mode 100644 index 0000000000..05fcbade76 --- /dev/null +++ b/yt/cpp/mapreduce/raw_client/raw_requests.h @@ -0,0 +1,397 @@ +#pragma once + +#include "raw_batch_request.h" + +#include <yt/cpp/mapreduce/common/fwd.h> +#include <yt/cpp/mapreduce/http/context.h> +#include <yt/cpp/mapreduce/interface/client_method_options.h> +#include <yt/cpp/mapreduce/interface/operation.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +class IRequestRetryPolicy; +struct TClientContext; +struct TExecuteBatchOptions; + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail::NRawClient { + +//////////////////////////////////////////////////////////////////////////////// + +TOperationAttributes ParseOperationAttributes(const TNode& node); + +TCheckPermissionResponse ParseCheckPermissionResponse(const TNode& node); + +//////////////////////////////////////////////////////////////////////////////// + +// +// Executes the batch; marks `batchRequest` as executed. +// Note: `retryPolicy` is taken by value here, while most declarations below take it by const reference. +void ExecuteBatch( + IRequestRetryPolicyPtr retryPolicy, + const TClientContext& context, + TRawBatchRequest& batchRequest, + const TExecuteBatchOptions& options = TExecuteBatchOptions()); + +// +// Cypress +// + +TNode Get( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& 
transactionId, + const TYPath& path, + const TGetOptions& options = TGetOptions()); +/* Same parameter list as Get above, except that `options` has no default argument. */ +TNode TryGet( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TGetOptions& options); + +void Set( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TNode& value, + const TSetOptions& options = TSetOptions()); +/* Counterpart of Set that takes a map (TNode::TMapType) instead of a single TNode value. */ +void MultisetAttributes( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TNode::TMapType& value, + const TMultisetAttributesOptions& options = TMultisetAttributesOptions()); + +bool Exists( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TExistsOptions& options = TExistsOptions()); +/* Note: `type` is an enum passed by const reference. */ +TNodeId Create( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const ENodeType& type, + const TCreateOptions& options = TCreateOptions()); + +TNodeId Copy( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options = TCopyOptions()); + +TNodeId Move( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options = TMoveOptions()); + +void Remove( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TRemoveOptions& options = TRemoveOptions()); + +TNode::TListType List( + const IRequestRetryPolicyPtr& retryPolicy, + const 
TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TListOptions& options = TListOptions()); + +TNodeId Link( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options = TLinkOptions()); + +TLockId Lock( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + ELockMode mode, + const TLockOptions& options = TLockOptions()); +/* Note: `retryPolicy` is taken by value here, while Lock above takes it by const reference. */ +void Unlock( + IRequestRetryPolicyPtr retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TUnlockOptions& options = TUnlockOptions()); +/* Note: this declaration has no retry policy parameter. */ +void Concatenate( + const TClientContext& context, + const TTransactionId& transactionId, + const TVector<TRichYPath>& sourcePaths, + const TRichYPath& destinationPath, + const TConcatenateOptions& options = TConcatenateOptions()); + +// +// Transactions +// + +void PingTx( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId); + +// +// Operations +// + +TOperationAttributes GetOperation( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TGetOperationOptions& options = TGetOperationOptions()); + +void AbortOperation( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId); + +void CompleteOperation( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId); + +void SuspendOperation( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TSuspendOperationOptions& options = TSuspendOperationOptions()); + +void ResumeOperation( + const IRequestRetryPolicyPtr& retryPolicy, + 
const TClientContext& context, + const TOperationId& operationId, + const TResumeOperationOptions& options = TResumeOperationOptions()); + +TListOperationsResult ListOperations( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TListOperationsOptions& options = TListOperationsOptions()); + +void UpdateOperationParameters( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options = TUpdateOperationParametersOptions()); + +// +// Jobs +// + +TJobAttributes GetJob( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobOptions& options = TGetJobOptions()); + +TListJobsResult ListJobs( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TListJobsOptions& options = TListJobsOptions()); +/* The three declarations returning an IFileReader pointer (GetJobInput, GetJobFailContext, GetJobStderr) have no retry policy parameter. */ +::TIntrusivePtr<IFileReader> GetJobInput( + const TClientContext& context, + const TJobId& jobId, + const TGetJobInputOptions& options = TGetJobInputOptions()); + +::TIntrusivePtr<IFileReader> GetJobFailContext( + const TClientContext& context, + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobFailContextOptions& options = TGetJobFailContextOptions()); +/* Variant of GetJobStderr that takes a retry policy and returns the stderr as a TString. */ +TString GetJobStderrWithRetries( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobStderrOptions& /* options */ = TGetJobStderrOptions()); + +::TIntrusivePtr<IFileReader> GetJobStderr( + const TClientContext& context, + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobStderrOptions& options = TGetJobStderrOptions()); + +// +// File cache +// + +TMaybe<TYPath> GetFileFromCache( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& 
transactionId, + const TString& md5Signature, + const TYPath& cachePath, + const TGetFileFromCacheOptions& options = TGetFileFromCacheOptions()); + +TYPath PutFileToCache( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& filePath, + const TString& md5Signature, + const TYPath& cachePath, + const TPutFileToCacheOptions& options = TPutFileToCacheOptions()); + +// +// SkyShare +// +/* Note: tablePaths is a std::vector here, while the other declarations in this header use TVector; `options` has no default argument. */ +TNode::TListType SkyShareTable( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const std::vector<TYPath>& tablePaths, + const TSkyShareTableOptions& options); + +// +// Misc +// + +TCheckPermissionResponse CheckPermission( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options = TCheckPermissionOptions()); +/* From here to the end of the "Misc" section `options` has no default argument. */ +TVector<TTabletInfo> GetTabletInfos( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TYPath& path, + const TVector<int>& tabletIndexes, + const TGetTabletInfosOptions& options); + +TVector<TTableColumnarStatistics> GetTableColumnarStatistics( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options); + +TMultiTablePartitions GetTablePartitions( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TVector<TRichYPath>& paths, + const TGetTablePartitionsOptions& options); +/* Single-path counterpart of CanonizeYPaths declared next. */ +TRichYPath CanonizeYPath( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TRichYPath& path); + +TVector<TRichYPath> CanonizeYPaths( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TVector<TRichYPath>& paths); + +// +// Tables +// + +void 
AlterTable( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId, + const TYPath& path, + const TAlterTableOptions& options); + +void AlterTableReplica( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TReplicaId& replicaId, + const TAlterTableReplicaOptions& options); + +void DeleteRows( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TYPath& path, + const TNode::TListType& keys, + const TDeleteRowsOptions& options); + +void FreezeTable( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TYPath& path, + const TFreezeTableOptions& options); + +void UnfreezeTable( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TYPath& path, + const TUnfreezeTableOptions& options); + + +// Transactions +void AbortTransaction( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId); + +void CommitTransaction( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& transactionId); +/* Note: the parent id parameter is named parentId here and parentTransactionId in the definition in raw_requests.cpp. */ +TTransactionId StartTransaction( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TTransactionId& parentId, + const TStartTransactionOptions& options); + +//////////////////////////////////////////////////////////////////////////////// +/* Adds one sub-request per element of src to a single TRawBatchRequest by calling batchAdder(batch, element), executes the batch once with ExecuteBatch, and returns the values extracted from the resulting futures in src order. */ +template<typename TSrc, typename TBatchAdder> +auto BatchTransform( + const IRequestRetryPolicyPtr& retryPolicy, + const TClientContext& context, + const TSrc& src, + TBatchAdder batchAdder, + const TExecuteBatchOptions& executeBatchOptions = {}) +{ + TRawBatchRequest batch(context.Config); + using TFuture = decltype(batchAdder(batch, *std::begin(src))); + TVector<TFuture> futures; + for (const auto& el : src) { + futures.push_back(batchAdder(batch, el)); + } + ExecuteBatch(retryPolicy, context, batch, 
executeBatchOptions); + using TDst = decltype(futures[0].ExtractValueSync()); + TVector<TDst> result; + result.reserve(std::size(src)); + for (auto& future : futures) { + result.push_back(future.ExtractValueSync()); + } + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail::NRawClient +} // namespace NYT diff --git a/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.cpp b/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.cpp new file mode 100644 index 0000000000..1936266d0d --- /dev/null +++ b/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.cpp @@ -0,0 +1,873 @@ +#include "rpc_parameters_serialization.h" + +#include <yt/cpp/mapreduce/common/helpers.h> + +#include <yt/cpp/mapreduce/interface/config.h> +#include <yt/cpp/mapreduce/interface/client_method_options.h> +#include <yt/cpp/mapreduce/interface/operation.h> +#include <yt/cpp/mapreduce/interface/serialize.h> + +#include <library/cpp/yson/node/node.h> +#include <library/cpp/yson/node/node_io.h> +#include <library/cpp/yson/node/node_builder.h> + +#include <util/generic/guid.h> +#include <util/string/cast.h> + +namespace NYT::NDetail::NRawClient { + +using ::ToString; + +//////////////////////////////////////////////////////////////////// +/* Writes "transaction_id" into *node unless the id equals a default-constructed TTransactionId. */ +static void SetTransactionIdParam(TNode* node, const TTransactionId& transactionId) +{ + if (transactionId != TTransactionId()) { + (*node)["transaction_id"] = GetGuidAsString(transactionId); + } +} +/* Writes "operation_id" into *node unconditionally. */ +static void SetOperationIdParam(TNode* node, const TOperationId& operationId) +{ + (*node)["operation_id"] = GetGuidAsString(operationId); +} +/* Writes "path" into *node as the result of AddPathPrefix(path, pathPrefix). */ +static void SetPathParam(TNode* node, const TString& pathPrefix, const TYPath& path) +{ + (*node)["path"] = AddPathPrefix(path, pathPrefix); +} +/* Returns a list node holding every attribute of the filter, in iteration order. */ +static TNode SerializeAttributeFilter(const TAttributeFilter& attributeFilter) +{ + TNode result = TNode::CreateList(); + for (const auto& attribute : attributeFilter.Attributes_) { + 
result.Add(attribute); + } + return result; +} +/* Overload for operation attribute filters: each attribute is converted with ToString before being added to the list node. */ +static TNode SerializeAttributeFilter(const TOperationAttributeFilter& attributeFilter) +{ + TNode result = TNode::CreateList(); + for (const auto& attribute : attributeFilter.Attributes_) { + result.Add(ToString(attribute)); + } + return result; +} +/* Copies FirstTabletIndex_ and LastTabletIndex_ of the options into *node as "first_tablet_index" and "last_tablet_index", each only when it is set. */ +template <typename TOptions> +static void SetFirstLastTabletIndex(TNode* node, const TOptions& options) +{ + if (options.FirstTabletIndex_) { + (*node)["first_tablet_index"] = *options.FirstTabletIndex_; + } + if (options.LastTabletIndex_) { + (*node)["last_tablet_index"] = *options.LastTabletIndex_; + } +} +/* Builds a human-readable transaction title from TProcessState: user name, host name, client version, pid and the first command line element (or a fixed notice when the command line is empty). In builds without NDEBUG a " build: debug" suffix is appended. */ +static TString GetDefaultTransactionTitle() +{ + const auto processState = TProcessState::Get(); + TStringStream res; + + res << "User transaction. Created by: " << processState->UserName << " on " << processState->FqdnHostName + << " client: " << processState->ClientVersion << " pid: " << processState->Pid; + if (!processState->CommandLine.empty()) { + res << " program: " << processState->CommandLine[0]; + } else { + res << " command line is unknown probably NYT::Initialize was never called"; + } + +#ifndef NDEBUG + res << " build: debug"; +#endif + + return res.Str(); +} +/* Writes "read_from" into *node (converted with ToString) when options.ReadFrom_ is set. NOTE(review): unlike the helpers above this template is not declared static. */ +template <typename T> +void SerializeMasterReadOptions(TNode* node, const TMasterReadOptions<T>& options) +{ + if (options.ReadFrom_) { + (*node)["read_from"] = ToString(*options.ReadFrom_); + } +} + +//////////////////////////////////////////////////////////////////// +/* Builds the parameter node for a create request: optional transaction id, prefixed path, "recursive", "type", "ignore_existing", "force", and "attributes" when options.Attributes_ is set. */ +TNode SerializeParamsForCreate( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + ENodeType type, + const TCreateOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + SetPathParam(&result, pathPrefix, path); + result["recursive"] = options.Recursive_; + result["type"] = ToString(type); + result["ignore_existing"] = options.IgnoreExisting_; + result["force"] = options.Force_; + if (options.Attributes_) { + result["attributes"] = 
*options.Attributes_; + } + return result; +} +/* Parameter node for a remove request: optional transaction id, prefixed path, "recursive" and "force". */ +TNode SerializeParamsForRemove( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TRemoveOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + SetPathParam(&result, pathPrefix, path); + result["recursive"] = options.Recursive_; + result["force"] = options.Force_; + return result; +} +/* Parameter node for an exists request: optional transaction id, prefixed path and the master read options. */ +TNode SerializeParamsForExists( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TExistsOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + SetPathParam(&result, pathPrefix, path); + SerializeMasterReadOptions(&result, options); + return result; +} +/* Parameter node for a get request: optional transaction id, prefixed path, master read options, plus "attributes" and "max_size" when the corresponding options are set. */ +TNode SerializeParamsForGet( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TGetOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + SetPathParam(&result, pathPrefix, path); + SerializeMasterReadOptions(&result, options); + if (options.AttributeFilter_) { + result["attributes"] = SerializeAttributeFilter(*options.AttributeFilter_); + } + if (options.MaxSize_) { + result["max_size"] = *options.MaxSize_; + } + return result; +} +/* Parameter node for a set request: optional transaction id, prefixed path, "recursive", and "force" only when options.Force_ is set (it is dereferenced here, unlike in the remove serializer). */ +TNode SerializeParamsForSet( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TSetOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + SetPathParam(&result, pathPrefix, path); + result["recursive"] = options.Recursive_; + if (options.Force_) { + result["force"] = *options.Force_; + } + return result; +} +/* Parameter node for a multiset_attributes request: only the optional transaction id and the prefixed path; the options argument is currently unused. */ +TNode SerializeParamsForMultisetAttributes( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + [[maybe_unused]] const TMultisetAttributesOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + SetPathParam(&result, pathPrefix, path); + return result; +} + 
+TNode SerializeParamsForList( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TListOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + SetPathParam(&result, pathPrefix, path); + SerializeMasterReadOptions(&result, options); + if (options.MaxSize_) { + result["max_size"] = *options.MaxSize_; + } + if (options.AttributeFilter_) { + result["attributes"] = SerializeAttributeFilter(*options.AttributeFilter_); + } + return result; +} + +TNode SerializeParamsForCopy( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + result["source_path"] = AddPathPrefix(sourcePath, pathPrefix); + result["destination_path"] = AddPathPrefix(destinationPath, pathPrefix); + result["recursive"] = options.Recursive_; + result["force"] = options.Force_; + result["preserve_account"] = options.PreserveAccount_; + if (options.PreserveExpirationTime_) { + result["preserve_expiration_time"] = *options.PreserveExpirationTime_; + } + return result; +} + +TNode SerializeParamsForMove( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + result["source_path"] = AddPathPrefix(sourcePath, pathPrefix); + result["destination_path"] = AddPathPrefix(destinationPath, pathPrefix); + result["recursive"] = options.Recursive_; + result["force"] = options.Force_; + result["preserve_account"] = options.PreserveAccount_; + if (options.PreserveExpirationTime_) { + result["preserve_expiration_time"] = *options.PreserveExpirationTime_; + } + return result; +} + +TNode SerializeParamsForLink( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& targetPath, 
+ const TYPath& linkPath, + const TLinkOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + result["target_path"] = AddPathPrefix(targetPath, pathPrefix); + result["link_path"] = AddPathPrefix(linkPath, pathPrefix); + result["recursive"] = options.Recursive_; + result["ignore_existing"] = options.IgnoreExisting_; + result["force"] = options.Force_; + if (options.Attributes_) { + result["attributes"] = *options.Attributes_; + } + return result; +} + +TNode SerializeParamsForLock( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + ELockMode mode, + const TLockOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + SetPathParam(&result, pathPrefix, path); + result["mode"] = ToString(mode); + result["waitable"] = options.Waitable_; + if (options.AttributeKey_) { + result["attribute_key"] = *options.AttributeKey_; + } + if (options.ChildKey_) { + result["child_key"] = *options.ChildKey_; + } + return result; +} + +TNode SerializeParamsForUnlock( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TUnlockOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + SetPathParam(&result, pathPrefix, path); + Y_UNUSED(options); + return result; +} + +TNode SerializeParamsForConcatenate( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TVector<TRichYPath>& sourcePaths, + const TRichYPath& destinationPath, + const TConcatenateOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + { + auto actualDestination = destinationPath; + actualDestination.Path(AddPathPrefix(actualDestination.Path_, pathPrefix)); + if (options.Append_) { + actualDestination.Append(*options.Append_); + } + result["destination_path"] = PathToNode(actualDestination); + } + auto& sourcePathsNode = result["source_paths"]; + for (const auto& path : sourcePaths) { + auto 
actualSource = path; + actualSource.Path(AddPathPrefix(actualSource.Path_, pathPrefix)); + sourcePathsNode.Add(PathToNode(actualSource)); + } + return result; +} + +TNode SerializeParamsForPingTx( + const TTransactionId& transactionId) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + return result; +} + +TNode SerializeParamsForListOperations( + const TListOperationsOptions& options) +{ + TNode result = TNode::CreateMap(); + if (options.FromTime_) { + result["from_time"] = ToString(*options.FromTime_); + } + if (options.ToTime_) { + result["to_time"] = ToString(*options.ToTime_); + } + if (options.CursorTime_) { + result["cursor_time"] = ToString(*options.CursorTime_); + } + if (options.CursorDirection_) { + result["cursor_direction"] = ToString(*options.CursorDirection_); + } + if (options.Pool_) { + result["pool"] = *options.Pool_; + } + if (options.Filter_) { + result["filter"] = *options.Filter_; + } + if (options.User_) { + result["user"] = *options.User_; + } + if (options.State_) { + result["state"] = *options.State_; + } + if (options.Type_) { + result["type"] = ToString(*options.Type_); + } + if (options.WithFailedJobs_) { + result["with_failed_jobs"] = *options.WithFailedJobs_; + } + if (options.IncludeCounters_) { + result["include_counters"] = *options.IncludeCounters_; + } + if (options.IncludeArchive_) { + result["include_archive"] = *options.IncludeArchive_; + } + if (options.Limit_) { + result["limit"] = *options.Limit_; + } + return result; +} + +TNode SerializeParamsForGetOperation( + const TOperationId& operationId, + const TGetOperationOptions& options) +{ + TNode result; + SetOperationIdParam(&result, operationId); + if (options.AttributeFilter_) { + result["attributes"] = SerializeAttributeFilter(*options.AttributeFilter_); + } + return result; +} + +TNode SerializeParamsForAbortOperation(const TOperationId& operationId) +{ + TNode result; + SetOperationIdParam(&result, operationId); + return result; +} + +TNode 
SerializeParamsForCompleteOperation(const TOperationId& operationId) +{ + TNode result; + SetOperationIdParam(&result, operationId); + return result; +} + +TNode SerializeParamsForSuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options) +{ + TNode result; + SetOperationIdParam(&result, operationId); + if (options.AbortRunningJobs_) { + result["abort_running_jobs"] = *options.AbortRunningJobs_; + } + return result; +} + +TNode SerializeParamsForResumeOperation( + const TOperationId& operationId, + const TResumeOperationOptions& options) +{ + TNode result; + SetOperationIdParam(&result, operationId); + Y_UNUSED(options); + return result; +} + +TNode SerializeParamsForUpdateOperationParameters( + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options) +{ + TNode result; + SetOperationIdParam(&result, operationId); + TNode& parameters = result["parameters"]; + if (options.Pool_) { + parameters["pool"] = *options.Pool_; + } + if (options.Weight_) { + parameters["weight"] = *options.Weight_; + } + if (!options.Owners_.empty()) { + parameters["owners"] = TNode::CreateList(); + for (const auto& owner : options.Owners_) { + parameters["owners"].Add(owner); + } + } + if (options.SchedulingOptionsPerPoolTree_) { + parameters["scheduling_options_per_pool_tree"] = TNode::CreateMap(); + for (const auto& entry : options.SchedulingOptionsPerPoolTree_->Options_) { + auto schedulingOptionsNode = TNode::CreateMap(); + const auto& schedulingOptions = entry.second; + if (schedulingOptions.Pool_) { + schedulingOptionsNode["pool"] = *schedulingOptions.Pool_; + } + if (schedulingOptions.Weight_) { + schedulingOptionsNode["weight"] = *schedulingOptions.Weight_; + } + if (schedulingOptions.ResourceLimits_) { + auto resourceLimitsNode = TNode::CreateMap(); + const auto& resourceLimits = *schedulingOptions.ResourceLimits_; + if (resourceLimits.UserSlots_) { + resourceLimitsNode["user_slots"] = *resourceLimits.UserSlots_; + } 
+ if (resourceLimits.Memory_) { + resourceLimitsNode["memory"] = *resourceLimits.Memory_; + } + if (resourceLimits.Cpu_) { + resourceLimitsNode["cpu"] = *resourceLimits.Cpu_; + } + if (resourceLimits.Network_) { + resourceLimitsNode["network"] = *resourceLimits.Network_; + } + schedulingOptionsNode["resource_limits"] = std::move(resourceLimitsNode); + } + parameters["scheduling_options_per_pool_tree"][entry.first] = std::move(schedulingOptionsNode); + } + } + return result; +} + +TNode SerializeParamsForGetJob( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobOptions& /* options */) +{ + TNode result; + SetOperationIdParam(&result, operationId); + result["job_id"] = GetGuidAsString(jobId); + return result; +} + +TNode SerializeParamsForListJobs( + const TOperationId& operationId, + const TListJobsOptions& options) +{ + TNode result; + SetOperationIdParam(&result, operationId); + + if (options.Type_) { + result["type"] = ToString(*options.Type_); + } + if (options.State_) { + result["state"] = ToString(*options.State_); + } + if (options.Address_) { + result["address"] = *options.Address_; + } + if (options.WithStderr_) { + result["with_stderr"] = *options.WithStderr_; + } + if (options.WithSpec_) { + result["with_spec"] = *options.WithSpec_; + } + if (options.WithFailContext_) { + result["with_fail_context"] = *options.WithFailContext_; + } + + if (options.SortField_) { + result["sort_field"] = ToString(*options.SortField_); + } + if (options.SortOrder_) { + result["sort_order"] = ToString(*options.SortOrder_); + } + + if (options.Offset_) { + result["offset"] = *options.Offset_; + } + if (options.Limit_) { + result["limit"] = *options.Limit_; + } + + if (options.IncludeCypress_) { + result["include_cypress"] = *options.IncludeCypress_; + } + if (options.IncludeArchive_) { + result["include_archive"] = *options.IncludeArchive_; + } + if (options.IncludeControllerAgent_) { + result["include_controller_agent"] = 
*options.IncludeControllerAgent_; + } + return result; +} + +TNode SerializeParametersForInsertRows( + const TString& pathPrefix, + const TYPath& path, + const TInsertRowsOptions& options) +{ + TNode result; + SetPathParam(&result, pathPrefix, path); + if (options.Aggregate_) { + result["aggregate"] = *options.Aggregate_; + } + if (options.Update_) { + result["update"] = *options.Update_; + } + if (options.Atomicity_) { + result["atomicity"] = ToString(*options.Atomicity_); + } + if (options.Durability_) { + result["durability"] = ToString(*options.Durability_); + } + if (options.RequireSyncReplica_) { + result["require_sync_replica"] = *options.RequireSyncReplica_; + } + return result; +} + +TNode SerializeParametersForDeleteRows( + const TString& pathPrefix, + const TYPath& path, + const TDeleteRowsOptions& options) +{ + TNode result; + SetPathParam(&result, pathPrefix, path); + if (options.Atomicity_) { + result["atomicity"] = ToString(*options.Atomicity_); + } + if (options.Durability_) { + result["durability"] = ToString(*options.Durability_); + } + if (options.RequireSyncReplica_) { + result["require_sync_replica"] = *options.RequireSyncReplica_; + } + return result; +} + +TNode SerializeParametersForTrimRows( + const TString& pathPrefix, + const TYPath& path, + const TTrimRowsOptions& /* options*/) +{ + TNode result; + SetPathParam(&result, pathPrefix, path); + return result; +} + +TNode SerializeParamsForParseYPath(const TRichYPath& path) +{ + TNode result; + result["path"] = PathToNode(path); + return result; +} + +TNode SerializeParamsForEnableTableReplica( + const TReplicaId& replicaId) +{ + TNode result; + result["replica_id"] = GetGuidAsString(replicaId); + return result; +} + +TNode SerializeParamsForDisableTableReplica( + const TReplicaId& replicaId) +{ + TNode result; + result["replica_id"] = GetGuidAsString(replicaId); + return result; +} + +TNode SerializeParamsForAlterTableReplica(const TReplicaId& replicaId, const TAlterTableReplicaOptions& 
options) +{ + TNode result; + result["replica_id"] = GetGuidAsString(replicaId); + if (options.Enabled_) { + result["enabled"] = *options.Enabled_; + } + if (options.Mode_) { + result["mode"] = ToString(*options.Mode_); + } + return result; +} + +TNode SerializeParamsForFreezeTable( + const TString& pathPrefix, + const TYPath& path, + const TFreezeTableOptions& options) +{ + TNode result; + SetPathParam(&result, pathPrefix, path); + SetFirstLastTabletIndex(&result, options); + return result; +} + +TNode SerializeParamsForUnfreezeTable( + const TString& pathPrefix, + const TYPath& path, + const TUnfreezeTableOptions& options) +{ + TNode result; + SetPathParam(&result, pathPrefix, path); + SetFirstLastTabletIndex(&result, options); + return result; +} + +TNode SerializeParamsForAlterTable( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TAlterTableOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + SetPathParam(&result, pathPrefix, path); + if (options.Dynamic_) { + result["dynamic"] = *options.Dynamic_; + } + if (options.Schema_) { + TNode schema; + { + TNodeBuilder builder(&schema); + Serialize(*options.Schema_, &builder); + } + result["schema"] = schema; + } + if (options.UpstreamReplicaId_) { + result["upstream_replica_id"] = GetGuidAsString(*options.UpstreamReplicaId_); + } + return result; +} + +TNode SerializeParamsForGetTableColumnarStatistics( + const TTransactionId& transactionId, + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + for (const auto& path : paths) { + result["paths"].Add(PathToNode(path)); + } + if (options.FetcherMode_) { + result["fetcher_mode"] = ToString(*options.FetcherMode_); + } + return result; +} + +TNode SerializeParamsForGetTablePartitions( + const TTransactionId& transactionId, + const TVector<TRichYPath>& paths, + const 
TGetTablePartitionsOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + for (const auto& path : paths) { + result["paths"].Add(PathToNode(path)); + } + result["partition_mode"] = ToString(options.PartitionMode_); + result["data_weight_per_partition"] = options.DataWeightPerPartition_; + if (options.MaxPartitionCount_) { + result["max_partition_count"] = *options.MaxPartitionCount_; + } + result["adjust_data_weight_per_partition"] = options.AdjustDataWeightPerPartition_; + return result; +} + +TNode SerializeParamsForGetFileFromCache( + const TTransactionId& transactionId, + const TString& md5Signature, + const TYPath& cachePath, + const TGetFileFromCacheOptions&) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + result["md5"] = md5Signature; + result["cache_path"] = cachePath; + return result; +} + +TNode SerializeParamsForPutFileToCache( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& filePath, + const TString& md5Signature, + const TYPath& cachePath, + const TPutFileToCacheOptions& options) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + SetPathParam(&result, pathPrefix, filePath); + result["md5"] = md5Signature; + result["cache_path"] = cachePath; + if (options.PreserveExpirationTimeout_) { + result["preserve_expiration_timeout"] = *options.PreserveExpirationTimeout_; + } + return result; +} + +TNode SerializeParamsForSkyShareTable( + const TString& serverName, + const TString& pathPrefix, + const std::vector<TYPath>& tablePaths, + const TSkyShareTableOptions& options) +{ + TNode result; + + if (tablePaths.size() == 1) { + SetPathParam(&result, pathPrefix, tablePaths[0]); + } else { + auto pathList = TNode::CreateList(); + for (const auto& p : tablePaths) { + pathList.Add(AddPathPrefix(p, pathPrefix)); + } + result["paths"] = pathList; + } + result["cluster"] = serverName; + + if (options.KeyColumns_) { + auto keyColumnsList = TNode::CreateList(); + 
for (const auto& s : options.KeyColumns_->Parts_) { + if (s.empty()) { + continue; + } + keyColumnsList.Add(s); + } + result["key_columns"] = keyColumnsList; + } + + if (options.EnableFastbone_) { + result["enable_fastbone"] = *options.EnableFastbone_; + } + + return result; +} + +// Serializes parameters for checking whether `user` has `permission` on `path`; +// `options.Columns_`, when non-empty, is passed as the "columns" list. +TNode SerializeParamsForCheckPermission( + const TString& user, + EPermission permission, + const TString& pathPrefix, + const TYPath& path, + const TCheckPermissionOptions& options) +{ + TNode result; + // "path" is filled in by SetPathParam only, as in the other serializers of this file; + // a second plain `result["path"] = path` here used to discard `pathPrefix`. + // NOTE(review): assumes SetPathParam stores the prefixed path under "path" -- confirm. + SetPathParam(&result, pathPrefix, path); + result["user"] = user; + result["permission"] = ToString(permission); + if (!options.Columns_.empty()) { + result["columns"] = TNode::CreateList(); + result["columns"].AsList().assign(options.Columns_.begin(), options.Columns_.end()); + } + return result; +} + +TNode SerializeParamsForGetTabletInfos( + const TString& pathPrefix, + const TYPath& path, + const TVector<int>& tabletIndexes, + const TGetTabletInfosOptions& options) +{ + Y_UNUSED(options); + TNode result; + SetPathParam(&result, pathPrefix, path); + result["tablet_indexes"] = TNode::CreateList(); + result["tablet_indexes"].AsList().assign(tabletIndexes.begin(), tabletIndexes.end()); + return result; +} + +TNode SerializeParamsForAbortTransaction(const TTransactionId& transactionId) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + return result; +} + +TNode SerializeParamsForCommitTransaction(const TTransactionId& transactionId) +{ + TNode result; + SetTransactionIdParam(&result, transactionId); + return result; +} + +TNode SerializeParamsForStartTransaction( + const TTransactionId& parentTransactionId, + TDuration txTimeout, + const TStartTransactionOptions& options) +{ + TNode result; + + SetTransactionIdParam(&result, parentTransactionId); + result["timeout"] = static_cast<i64>((options.Timeout_.GetOrElse(txTimeout).MilliSeconds())); + if (options.Deadline_) { + result["deadline"] = ToString(options.Deadline_); + } + + if 
(options.PingAncestors_) { + result["ping_ancestor_transactions"] = true; + } + + if (options.Attributes_ && !options.Attributes_->IsMap()) { + ythrow TApiUsageError() << "Attributes must be a Map node"; + } + + auto attributes = options.Attributes_.GetOrElse(TNode::CreateMap()); + if (options.Title_) { + attributes["title"] = *options.Title_; + } else if (!attributes.HasKey("title")) { + attributes["title"] = GetDefaultTransactionTitle(); + } + result["attributes"] = attributes; + + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail::NRawClient diff --git a/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.h b/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.h new file mode 100644 index 0000000000..a60e3ea369 --- /dev/null +++ b/yt/cpp/mapreduce/raw_client/rpc_parameters_serialization.h @@ -0,0 +1,231 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/fwd.h> +#include <yt/cpp/mapreduce/interface/client_method_options.h> + +namespace NYT::NDetail::NRawClient { + +//////////////////////////////////////////////////////////////////// + +TNode SerializeParamsForCreate( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + ENodeType type, + const TCreateOptions& options); + +TNode SerializeParamsForRemove( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TRemoveOptions& options); + +TNode SerializeParamsForExists( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TExistsOptions& options); + +TNode SerializeParamsForGet( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TGetOptions& options); + +TNode SerializeParamsForSet( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TSetOptions& options); + +TNode SerializeParamsForMultisetAttributes( + 
const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TMultisetAttributesOptions& options); + +TNode SerializeParamsForList( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TListOptions& options); + +TNode SerializeParamsForCopy( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options); + +TNode SerializeParamsForMove( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options); + +TNode SerializeParamsForLink( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options); + +TNode SerializeParamsForLock( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + ELockMode mode, + const TLockOptions& options); + +TNode SerializeParamsForUnlock( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TUnlockOptions& options); + +TNode SerializeParamsForConcatenate( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TVector<TRichYPath>& sourcePaths, + const TRichYPath& destinationPath, + const TConcatenateOptions& options); + +TNode SerializeParamsForPingTx( + const TTransactionId& transactionId); + +TNode SerializeParamsForGetOperation( + const TOperationId& operationId, + const TGetOperationOptions& options); + +TNode SerializeParamsForAbortOperation( + const TOperationId& operationId); + +TNode SerializeParamsForCompleteOperation( + const TOperationId& operationId); + +TNode SerializeParamsForSuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options); + +TNode SerializeParamsForResumeOperation( + const TOperationId& operationId, + const 
TResumeOperationOptions& options); + +TNode SerializeParamsForListOperations( + const TListOperationsOptions& options); + +TNode SerializeParamsForUpdateOperationParameters( + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options); + +TNode SerializeParamsForGetJob( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobOptions& options); + +TNode SerializeParamsForListJobs( + const TOperationId& operationId, + const TListJobsOptions& options); + +TNode SerializeParametersForInsertRows( + const TString& pathPrefix, + const TYPath& path, + const TInsertRowsOptions& options); + +TNode SerializeParametersForDeleteRows( + const TString& pathPrefix, + const TYPath& path, + const TDeleteRowsOptions& options); + +TNode SerializeParametersForTrimRows( + const TString& pathPrefix, + const TYPath& path, + const TTrimRowsOptions& options); + +TNode SerializeParamsForParseYPath( + const TRichYPath& path); + +TNode SerializeParamsForEnableTableReplica( + const TReplicaId& replicaId); + +TNode SerializeParamsForDisableTableReplica( + const TReplicaId& replicaId); + +TNode SerializeParamsForAlterTableReplica( + const TReplicaId& replicaId, + const TAlterTableReplicaOptions& options); + +TNode SerializeParamsForFreezeTable( + const TString& pathPrefix, + const TYPath& path, + const TFreezeTableOptions& options); + +TNode SerializeParamsForUnfreezeTable( + const TString& pathPrefix, + const TYPath& path, + const TUnfreezeTableOptions& options); + +TNode SerializeParamsForAlterTable( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& path, + const TAlterTableOptions& options); + +TNode SerializeParamsForGetTableColumnarStatistics( + const TTransactionId& transactionId, + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options); + +TNode SerializeParamsForGetTablePartitions( + const TTransactionId& transactionId, + const TVector<TRichYPath>& paths, + const 
TGetTablePartitionsOptions& options); + +TNode SerializeParamsForGetFileFromCache( + const TTransactionId& transactionId, + const TString& md5Signature, + const TYPath& cachePath, + const TGetFileFromCacheOptions&); + +TNode SerializeParamsForPutFileToCache( + const TTransactionId& transactionId, + const TString& pathPrefix, + const TYPath& filePath, + const TString& md5Signature, + const TYPath& cachePath, + const TPutFileToCacheOptions& options); + +TNode SerializeParamsForSkyShareTable( + const TString& serverName, + const TString& pathPrefix, + const std::vector<TYPath>& tablePaths, + const TSkyShareTableOptions& options); + +TNode SerializeParamsForCheckPermission( + const TString& user, + EPermission permission, + const TString& pathPrefix, + const TYPath& path, + const TCheckPermissionOptions& options); + +TNode SerializeParamsForGetTabletInfos( + const TString& pathPrefix, + const TYPath& path, + const TVector<int>& tabletIndexes, + const TGetTabletInfosOptions& options); + +TNode SerializeParamsForAbortTransaction( + const TTransactionId& transactionId); + +TNode SerializeParamsForCommitTransaction( + const TTransactionId& transactionId); + +TNode SerializeParamsForStartTransaction( + const TTransactionId& parentTransactionId, + TDuration txTimeout, + const TStartTransactionOptions& options); + +//////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail::NRawClient diff --git a/yt/cpp/mapreduce/raw_client/ya.make b/yt/cpp/mapreduce/raw_client/ya.make new file mode 100644 index 0000000000..0d03aae80c --- /dev/null +++ b/yt/cpp/mapreduce/raw_client/ya.make @@ -0,0 +1,19 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + raw_batch_request.cpp + raw_requests.cpp + rpc_parameters_serialization.cpp +) + +PEERDIR( + yt/cpp/mapreduce/common + yt/cpp/mapreduce/http + yt/cpp/mapreduce/interface + yt/cpp/mapreduce/interface/logging + library/cpp/yson/node +) + +END() diff --git 
a/yt/cpp/mapreduce/skiff/skiff_schema.h b/yt/cpp/mapreduce/skiff/skiff_schema.h new file mode 100644 index 0000000000..e8c97de8e8 --- /dev/null +++ b/yt/cpp/mapreduce/skiff/skiff_schema.h @@ -0,0 +1,3 @@ +#pragma once + +#include <library/cpp/skiff/skiff_schema.h> diff --git a/yt/cpp/mapreduce/skiff/unchecked_parser.h b/yt/cpp/mapreduce/skiff/unchecked_parser.h new file mode 100644 index 0000000000..8fd9f90b0b --- /dev/null +++ b/yt/cpp/mapreduce/skiff/unchecked_parser.h @@ -0,0 +1 @@ +#include <library/cpp/skiff/skiff.h> diff --git a/yt/cpp/mapreduce/skiff/wire_type.h b/yt/cpp/mapreduce/skiff/wire_type.h new file mode 100644 index 0000000000..96d19c06d3 --- /dev/null +++ b/yt/cpp/mapreduce/skiff/wire_type.h @@ -0,0 +1 @@ +#include <library/cpp/skiff/public.h> diff --git a/yt/cpp/mapreduce/skiff/ya.make b/yt/cpp/mapreduce/skiff/ya.make new file mode 100644 index 0000000000..95d91ecd47 --- /dev/null +++ b/yt/cpp/mapreduce/skiff/ya.make @@ -0,0 +1,9 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +PEERDIR( + library/cpp/skiff +) + +END() diff --git a/yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h b/yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h new file mode 100644 index 0000000000..37d9d501cd --- /dev/null +++ b/yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h @@ -0,0 +1,194 @@ +#pragma once + +#include <yt/cpp/mapreduce/interface/logging/logger.h> +#include <yt/cpp/mapreduce/interface/client.h> + +#include <yt/cpp/mapreduce/interface/config.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <util/generic/bt_exception.h> + +#include <util/datetime/base.h> + +//////////////////////////////////////////////////////////////////////////////// + +template<> +void Out<NYT::TNode>(IOutputStream& s, const NYT::TNode& node); + +template<> +void Out<TGUID>(IOutputStream& s, const TGUID& guid); + +//////////////////////////////////////////////////////////////////////////////// + +namespace NYT { +namespace NTesting { + 
+//////////////////////////////////////////////////////////////////////////////// + +IClientPtr CreateTestClient(TString proxy = "", const TCreateClientOptions& options = {}); + +// Create map node by unique path in Cypress and return that path. +TYPath CreateTestDirectory(const IClientBasePtr& client); + +TString GenerateRandomData(size_t size, ui64 seed = 42); + +TVector<TNode> ReadTable(const IClientBasePtr& client, const TString& tablePath); + +//////////////////////////////////////////////////////////////////////////////// + +// TODO: should be removed, usages should be replaced with TConfigSaverGuard +class TZeroWaitLockPollIntervalGuard +{ +public: + TZeroWaitLockPollIntervalGuard(); + + ~TZeroWaitLockPollIntervalGuard(); + +private: + TDuration OldWaitLockPollInterval_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TConfigSaverGuard +{ +public: + TConfigSaverGuard(); + ~TConfigSaverGuard(); + +private: + TConfig Config_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TDebugMetricDiff +{ +public: + TDebugMetricDiff(TString name); + ui64 GetTotal() const; + +private: + TString Name_; + ui64 InitialValue_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct TOwningYaMRRow +{ + TString Key; + TString SubKey; + TString Value; + + TOwningYaMRRow(const TYaMRRow& row = {}); + TOwningYaMRRow(TString key, TString subKey, TString value); + + operator TYaMRRow() const; +}; + +bool operator == (const TOwningYaMRRow& row1, const TOwningYaMRRow& row2); + +//////////////////////////////////////////////////////////////////////////////// + +class TTestFixture +{ +public: + explicit TTestFixture(const TCreateClientOptions& options = {}); + ~TTestFixture(); + + // Return precreated client. + IClientPtr GetClient() const; + + // Return newly created client. 
Useful for cases: + // - when we want to have multiple client objects; + // - when we want to control destruction of the client object; + IClientPtr CreateClient(const TCreateClientOptions& options = {}) const; + + IClientPtr CreateClientForUser(const TString& user, TCreateClientOptions options = {}); + + TYPath GetWorkingDir() const; + +private: + TConfigSaverGuard ConfigGuard_; + IClientPtr Client_; + TYPath WorkingDir_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TTabletFixture + : public TTestFixture +{ +public: + TTabletFixture(); + +private: + void WaitForTabletCell(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +// Compares only columns and only "name" and "type" fields of columns. +bool AreSchemasEqual(const TTableSchema& lhs, const TTableSchema& rhs); + +class TWaitFailedException + : public TWithBackTrace<yexception> +{ }; + +void WaitForPredicate(const std::function<bool()>& predicate, TDuration timeout = TDuration::Seconds(60)); + +//////////////////////////////////////////////////////////////////////////////// + +// Redirects all the LOG_* calls with the corresponding level to `stream`. +// Moreover, the LOG_* calls are delegated to `oldLogger`. 
+class TStreamTeeLogger + : public ILogger +{ +public: + TStreamTeeLogger(ELevel cutLevel, IOutputStream* stream, ILoggerPtr oldLogger); + void Log(ELevel level, const ::TSourceLocation& sourceLocation, const char* format, va_list args) override; + +private: + ILoggerPtr OldLogger_; + IOutputStream* Stream_; + ELevel Level_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +TString ToYson(const T& x) +{ + TNode result; + TNodeBuilder builder(&result); + Serialize(x, &builder); + return NodeToYsonString(result); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NTesting +} // namespace NYT + +//////////////////////////////////////////////////////////////////////////////// + +template <> +void Out<NYT::NTesting::TOwningYaMRRow>(IOutputStream& out, const NYT::NTesting::TOwningYaMRRow& row); + +//////////////////////////////////////////////////////////////////////////////// + +// for UNITTEST() +#define ASSERT_SERIALIZABLES_EQUAL(a, b) \ + UNIT_ASSERT_EQUAL_C(a, b, NYT::NTesting::ToYson(a) << " != " << NYT::NTesting::ToYson(b)) + +#define ASSERT_SERIALIZABLES_UNEQUAL(a, b) \ + UNIT_ASSERT_UNEQUAL_C(a, b, NYT::NTesting::ToYson(a) << " == " << NYT::NTesting::ToYson(b)) + +// for GTEST() +#define ASSERT_SERIALIZABLES_EQ(a, b) \ + ASSERT_EQ(a, b) << NYT::NTesting::ToYson(a) << " != " << NYT::NTesting::ToYson(b) + +#define ASSERT_SERIALIZABLES_NE(a, b) \ + ASSERT_NE(a, b) << NYT::NTesting::ToYson(a) << " == " << NYT::NTesting::ToYson(b) |