aboutsummaryrefslogtreecommitdiffstats
path: root/yt/cpp/mapreduce/interface
diff options
context:
space:
mode:
authormax42 <max42@yandex-team.com>2023-06-30 03:37:03 +0300
committermax42 <max42@yandex-team.com>2023-06-30 03:37:03 +0300
commitfac2bd72b4b31ec3238292caf8fb2a8aaa6d6c4a (patch)
treeb8cbc1deb00309c7f1a7ab6df520a76cf0b5c6d7 /yt/cpp/mapreduce/interface
parent7bf166b1a7ed0af927f230022b245af618e998c1 (diff)
downloadydb-fac2bd72b4b31ec3238292caf8fb2a8aaa6d6c4a.tar.gz
YT-19324: move YT provider to ydb/library/yql
This commit is formed by the following script: https://paste.yandex-team.ru/6f92e4b8-efc5-4d34-948b-15ee2accd7e7/text. This commit has zero effect on all projects that depend on YQL. The summary of changes: - `yql/providers/yt -> ydb/library/yql/providers/yt` - the whole implementation of YT provider is moved into YDB code base for further export as a part of YT YQL plugin shared library; - `yql/providers/stat/{expr_nodes,uploader} -> ydb/library/yql/providers/stat/{expr_nodes,uploader}` - a small interface without implementation and the description of stat expr nodes; - `yql/core/extract_predicate/ut -> ydb/library/yql/core/extract_predicate/ut`; - `yql/core/{ut,ut_common} -> ydb/library/yql/core/{ut,ut_common}`; - `yql/core` is gone; - `yql/library/url_preprocessing -> ydb/library/yql/core/url_preprocessing`. **NB**: all new targets inside `ydb/` are under `IF (NOT CMAKE_EXPORT)` clause which disables them from open-source cmake generation and ya make build. They will be enabled in the subsequent commits.
Diffstat (limited to 'yt/cpp/mapreduce/interface')
-rw-r--r--yt/cpp/mapreduce/interface/batch_request.cpp15
-rw-r--r--yt/cpp/mapreduce/interface/batch_request.h222
-rw-r--r--yt/cpp/mapreduce/interface/client.cpp19
-rw-r--r--yt/cpp/mapreduce/interface/client.h568
-rw-r--r--yt/cpp/mapreduce/interface/client_method_options.cpp34
-rw-r--r--yt/cpp/mapreduce/interface/client_method_options.h1452
-rw-r--r--yt/cpp/mapreduce/interface/common.cpp664
-rw-r--r--yt/cpp/mapreduce/interface/common.h1301
-rw-r--r--yt/cpp/mapreduce/interface/common_ut.cpp303
-rw-r--r--yt/cpp/mapreduce/interface/common_ut.h1
-rw-r--r--yt/cpp/mapreduce/interface/config.cpp321
-rw-r--r--yt/cpp/mapreduce/interface/config.h228
-rw-r--r--yt/cpp/mapreduce/interface/config_ut.cpp20
-rw-r--r--yt/cpp/mapreduce/interface/constants.h19
-rw-r--r--yt/cpp/mapreduce/interface/cypress.cpp24
-rw-r--r--yt/cpp/mapreduce/interface/cypress.h252
-rw-r--r--yt/cpp/mapreduce/interface/error_codes.h468
-rw-r--r--yt/cpp/mapreduce/interface/error_ut.cpp81
-rw-r--r--yt/cpp/mapreduce/interface/errors.cpp437
-rw-r--r--yt/cpp/mapreduce/interface/errors.h290
-rw-r--r--yt/cpp/mapreduce/interface/finish_or_die.h41
-rw-r--r--yt/cpp/mapreduce/interface/fluent.h678
-rw-r--r--yt/cpp/mapreduce/interface/format.cpp135
-rw-r--r--yt/cpp/mapreduce/interface/format.h122
-rw-r--r--yt/cpp/mapreduce/interface/format_ut.cpp235
-rw-r--r--yt/cpp/mapreduce/interface/fwd.h397
-rw-r--r--yt/cpp/mapreduce/interface/init.h71
-rw-r--r--yt/cpp/mapreduce/interface/io-inl.h1015
-rw-r--r--yt/cpp/mapreduce/interface/io.cpp47
-rw-r--r--yt/cpp/mapreduce/interface/io.h586
-rw-r--r--yt/cpp/mapreduce/interface/job_counters.cpp164
-rw-r--r--yt/cpp/mapreduce/interface/job_counters.h74
-rw-r--r--yt/cpp/mapreduce/interface/job_counters_ut.cpp103
-rw-r--r--yt/cpp/mapreduce/interface/job_statistics.cpp361
-rw-r--r--yt/cpp/mapreduce/interface/job_statistics.h268
-rw-r--r--yt/cpp/mapreduce/interface/job_statistics_ut.cpp257
-rw-r--r--yt/cpp/mapreduce/interface/logging/logger.cpp188
-rw-r--r--yt/cpp/mapreduce/interface/logging/logger.h43
-rw-r--r--yt/cpp/mapreduce/interface/logging/ya.make16
-rw-r--r--yt/cpp/mapreduce/interface/logging/yt_log.cpp126
-rw-r--r--yt/cpp/mapreduce/interface/logging/yt_log.h17
-rw-r--r--yt/cpp/mapreduce/interface/mpl.h73
-rw-r--r--yt/cpp/mapreduce/interface/node.h7
-rw-r--r--yt/cpp/mapreduce/interface/operation-inl.h928
-rw-r--r--yt/cpp/mapreduce/interface/operation.cpp663
-rw-r--r--yt/cpp/mapreduce/interface/operation.h3494
-rw-r--r--yt/cpp/mapreduce/interface/operation_ut.cpp269
-rw-r--r--yt/cpp/mapreduce/interface/proto3_ut.proto17
-rw-r--r--yt/cpp/mapreduce/interface/protobuf_file_options_ut.cpp271
-rw-r--r--yt/cpp/mapreduce/interface/protobuf_file_options_ut.proto142
-rw-r--r--yt/cpp/mapreduce/interface/protobuf_format.cpp1498
-rw-r--r--yt/cpp/mapreduce/interface/protobuf_format.h106
-rw-r--r--yt/cpp/mapreduce/interface/protobuf_table_schema_ut.cpp451
-rw-r--r--yt/cpp/mapreduce/interface/protobuf_table_schema_ut.proto402
-rw-r--r--yt/cpp/mapreduce/interface/public.h10
-rw-r--r--yt/cpp/mapreduce/interface/retry_policy.h47
-rw-r--r--yt/cpp/mapreduce/interface/serialize.cpp553
-rw-r--r--yt/cpp/mapreduce/interface/serialize.h90
-rw-r--r--yt/cpp/mapreduce/interface/serialize_ut.cpp49
-rw-r--r--yt/cpp/mapreduce/interface/skiff_row.cpp1
-rw-r--r--yt/cpp/mapreduce/interface/skiff_row.h127
-rw-r--r--yt/cpp/mapreduce/interface/tvm.cpp1
-rw-r--r--yt/cpp/mapreduce/interface/tvm.h35
-rw-r--r--yt/cpp/mapreduce/interface/ut/ya.make25
-rw-r--r--yt/cpp/mapreduce/interface/wait_proxy.h54
-rw-r--r--yt/cpp/mapreduce/interface/ya.make46
66 files changed, 21022 insertions, 0 deletions
diff --git a/yt/cpp/mapreduce/interface/batch_request.cpp b/yt/cpp/mapreduce/interface/batch_request.cpp
new file mode 100644
index 0000000000..fefdacb61a
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/batch_request.cpp
@@ -0,0 +1,15 @@
+#include "batch_request.h"
+#include "client.h"
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+IBatchRequestBase& IBatchRequest::WithTransaction(const ITransactionPtr& transaction)
+{
+ return WithTransaction(transaction->GetId());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/batch_request.h b/yt/cpp/mapreduce/interface/batch_request.h
new file mode 100644
index 0000000000..3ea28f76fd
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/batch_request.h
@@ -0,0 +1,222 @@
+#pragma once
+
+#include "fwd.h"
+
+#include "client_method_options.h"
+
+#include <library/cpp/threading/future/future.h>
+#include <util/generic/ptr.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////
+
+/// Helper base of @ref NYT::IBatchRequest holding most of useful methods.
+class IBatchRequestBase
+ : public TThrRefBase
+{
+public:
+ virtual ~IBatchRequestBase() = default;
+
+ ///
+ /// @brief Create cypress node.
+ ///
+ /// @see NYT::ICypressClient::Create
+ virtual ::NThreading::TFuture<TNodeId> Create(
+ const TYPath& path,
+ ENodeType type,
+ const TCreateOptions& options = TCreateOptions()) = 0;
+
+ ///
+ /// @brief Remove cypress node.
+ ///
+ /// @see NYT::ICypressClient::Remove
+ virtual ::NThreading::TFuture<void> Remove(
+ const TYPath& path,
+ const TRemoveOptions& options = TRemoveOptions()) = 0;
+
+ ///
+ /// @brief Check whether cypress node exists.
+ ///
+ /// @see NYT::ICypressClient::Exists
+ virtual ::NThreading::TFuture<bool> Exists(
+ const TYPath& path,
+ const TExistsOptions& options = TExistsOptions()) = 0;
+
+ ///
+ /// @brief Get cypress node.
+ ///
+ /// @see NYT::ICypressClient::Get
+ virtual ::NThreading::TFuture<TNode> Get(
+ const TYPath& path,
+ const TGetOptions& options = TGetOptions()) = 0;
+
+ ///
+ /// @brief Set cypress node.
+ ///
+ /// @see NYT::ICypressClient::Set
+ virtual ::NThreading::TFuture<void> Set(
+ const TYPath& path,
+ const TNode& node,
+ const TSetOptions& options = TSetOptions()) = 0;
+
+ ///
+ /// @brief List cypress directory.
+ ///
+ /// @see NYT::ICypressClient::List
+ virtual ::NThreading::TFuture<TNode::TListType> List(
+ const TYPath& path,
+ const TListOptions& options = TListOptions()) = 0;
+
+ ///
+ /// @brief Copy cypress node.
+ ///
+ /// @see NYT::ICypressClient::Copy
+ virtual ::NThreading::TFuture<TNodeId> Copy(
+ const TYPath& sourcePath,
+ const TYPath& destinationPath,
+ const TCopyOptions& options = TCopyOptions()) = 0;
+
+ ///
+ /// @brief Move cypress node.
+ ///
+ /// @see NYT::ICypressClient::Move
+ virtual ::NThreading::TFuture<TNodeId> Move(
+ const TYPath& sourcePath,
+ const TYPath& destinationPath,
+ const TMoveOptions& options = TMoveOptions()) = 0;
+
+ ///
+ /// @brief Create symbolic link.
+ ///
+ /// @see NYT::ICypressClient::Link.
+ virtual ::NThreading::TFuture<TNodeId> Link(
+ const TYPath& targetPath,
+ const TYPath& linkPath,
+ const TLinkOptions& options = TLinkOptions()) = 0;
+
+ ///
+ /// @brief Lock cypress node.
+ ///
+ /// @see NYT::ICypressClient::Lock
+ virtual ::NThreading::TFuture<ILockPtr> Lock(
+ const TYPath& path,
+ ELockMode mode,
+ const TLockOptions& options = TLockOptions()) = 0;
+
+ ///
+ /// @brief Unlock cypress node.
+ ///
+ /// @see NYT::ICypressClient::Unlock
+ virtual ::NThreading::TFuture<void> Unlock(
+ const TYPath& path,
+ const TUnlockOptions& options = TUnlockOptions()) = 0;
+
+ ///
+ /// @brief Abort operation.
+ ///
+ /// @see NYT::IClient::AbortOperation
+ virtual ::NThreading::TFuture<void> AbortOperation(const TOperationId& operationId) = 0;
+
+ ///
+ /// @brief Force complete operation.
+ ///
+ /// @see NYT::IClient::CompleteOperation
+ virtual ::NThreading::TFuture<void> CompleteOperation(const TOperationId& operationId) = 0;
+
+ ///
+ /// @brief Suspend operation.
+ ///
+ /// @see NYT::IClient::SuspendOperation
+ virtual ::NThreading::TFuture<void> SuspendOperation(
+ const TOperationId& operationId,
+ const TSuspendOperationOptions& options = TSuspendOperationOptions()) = 0;
+
+ ///
+ /// @brief Resume operation.
+ ///
+ /// @see NYT::IClient::ResumeOperation
+ virtual ::NThreading::TFuture<void> ResumeOperation(
+ const TOperationId& operationId,
+ const TResumeOperationOptions& options = TResumeOperationOptions()) = 0;
+
+ ///
+ /// @brief Update parameters of running operation.
+ ///
+ /// @see NYT::IClient::UpdateOperationParameters
+ virtual ::NThreading::TFuture<void> UpdateOperationParameters(
+ const TOperationId& operationId,
+ const TUpdateOperationParametersOptions& options = TUpdateOperationParametersOptions()) = 0;
+
+ ///
+ /// @brief Canonize cypress path
+ ///
+ /// @see NYT::ICypressClient::CanonizeYPath
+ virtual ::NThreading::TFuture<TRichYPath> CanonizeYPath(const TRichYPath& path) = 0;
+
+ ///
+ /// @brief Get table columnar statistics.
+ ///
+ /// @see NYT::ICypressClient::GetTableColumnarStatistics
+ virtual ::NThreading::TFuture<TVector<TTableColumnarStatistics>> GetTableColumnarStatistics(
+ const TVector<TRichYPath>& paths,
+ const TGetTableColumnarStatisticsOptions& options = {}) = 0;
+
+ ///
+ /// @brief Check permission for given path.
+ ///
+ /// @see NYT::IClient::CheckPermission
+ virtual ::NThreading::TFuture<TCheckPermissionResponse> CheckPermission(
+ const TString& user,
+ EPermission permission,
+ const TYPath& path,
+ const TCheckPermissionOptions& options = TCheckPermissionOptions()) = 0;
+};
+
+///
+/// @brief Batch request object.
+///
+/// Allows sending multiple lightweight requests at once, significantly
+/// reducing the time of their execution.
+///
+/// Methods of this class accept same arguments as @ref NYT::IClient methods but
+/// return TFuture that is set after execution of @ref NYT::IBatchRequest::ExecuteBatch
+///
+/// @see [Example of usage](https://a.yandex-team.ru/arc/trunk/arcadia/yt/cpp/mapreduce/examples/tutorial/batch_request/main.cpp)
+class IBatchRequest
+ : public IBatchRequestBase
+{
+public:
+ ///
+ /// @brief Temporarily override current transaction.
+ ///
+ /// Using WithTransaction, the user can temporarily override the default transaction.
+ /// Example of usage:
+ /// TBatchRequest batchRequest;
+ /// auto noTxResult = batchRequest.Get("//some/path");
+ /// auto txResult = batchRequest.WithTransaction(tx).Get("//some/path");
+ virtual IBatchRequestBase& WithTransaction(const TTransactionId& transactionId) = 0;
+ IBatchRequestBase& WithTransaction(const ITransactionPtr& transaction);
+
+ ///
+ /// @brief Executes all subrequests of batch request.
+ ///
+ /// After execution of this method all TFuture objects returned by subrequests will
+ /// be filled with either result or error.
+ ///
+ /// @note It is undefined in which order these requests are executed.
+ ///
+ /// @note This method doesn't throw if subrequest emits error.
+ /// Instead corresponding future is set with exception.
+ /// So it is always important to check TFuture status.
+ ///
+ /// Single TBatchRequest instance may be executed only once
+ /// and cannot be modified (filled with additional requests) after execution.
+ /// Exception is thrown on attempt to modify executed batch request
+ /// or execute it again.
+ virtual void ExecuteBatch(const TExecuteBatchOptions& options = TExecuteBatchOptions()) = 0;
+};
+
+////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/client.cpp b/yt/cpp/mapreduce/interface/client.cpp
new file mode 100644
index 0000000000..11d308b809
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/client.cpp
@@ -0,0 +1,19 @@
+#include "client.h"
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+void ILock::Wait(TDuration timeout)
+{
+ return GetAcquiredFuture().GetValue(timeout);
+}
+
+void ITransaction::Detach()
+{
+ Y_FAIL("ITransaction::Detach() is not implemented");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/client.h b/yt/cpp/mapreduce/interface/client.h
new file mode 100644
index 0000000000..54f37c3ae0
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/client.h
@@ -0,0 +1,568 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/client.h
+///
+/// Main header of the C++ YT Wrapper.
+
+///
+/// @mainpage C++ library for working with YT
+///
+/// This library provides possibilities to work with YT as a [MapReduce](https://en.wikipedia.org/wiki/MapReduce) system. It allows:
+/// - to read/write tables and files
+/// - to run operations
+/// - to work with transactions.
+///
+/// This library provides only basic functions for working with dynamic tables.
+/// To access full powers of YT dynamic tables one should use
+/// [yt/client](https://a.yandex-team.ru/arc/trunk/arcadia/yt/19_4/yt/client) library.
+///
+/// Entry points to this library:
+/// - @ref NYT::Initialize() initialization function for this library;
+/// - @ref NYT::IClient main interface to work with YT cluster;
+/// - @ref NYT::CreateClient() function that creates client for particular cluster;
+/// - @ref NYT::IOperationClient ancestor of @ref NYT::IClient containing the set of methods to run operations.
+///
+/// Tutorial on how to use this library can be found [here](https://yt.yandex-team.ru/docs/api/c++/examples).
+
+#include "fwd.h"
+
+#include "client_method_options.h"
+#include "constants.h"
+#include "batch_request.h"
+#include "cypress.h"
+#include "init.h"
+#include "io.h"
+#include "node.h"
+#include "operation.h"
+
+#include <library/cpp/threading/future/future.h>
+
+#include <util/datetime/base.h>
+#include <util/generic/maybe.h>
+#include <util/system/compiler.h>
+
+/// Main namespace of YT client
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// OAuth info (returned by @ref NYT::IClient::WhoAmI).
+struct TAuthorizationInfo
+{
+ /// User's login.
+ TString Login;
+
+ /// Realm.
+ TString Realm;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Part of @ref NYT::TCheckPermissionResponse.
+///
+/// In case when 'Action == ESecurityAction::Deny' because of a 'deny' rule,
+/// the "denying" object name and id and "denied" subject name and id may be returned.
+struct TCheckPermissionResult
+{
+ /// Was the access granted or not.
+ ESecurityAction Action;
+
+ /// Id of the object whose ACL's "deny" rule forbids the access.
+ TMaybe<TGUID> ObjectId;
+
+ ///
+ /// @brief Name of the object whose ACL's "deny" rule forbids the access.
+ ///
+ /// Example is "node //tmp/x/y".
+ TMaybe<TString> ObjectName;
+
+ /// Id of the subject for whom the access was denied by a "deny" rule.
+ TMaybe<TGUID> SubjectId;
+
+ /// Name of the subject for whom the access was denied by a "deny" rule.
+ TMaybe<TString> SubjectName;
+};
+
+/// @brief Result of @ref NYT::IClient::CheckPermission command.
+///
+/// The base part of the response corresponds to the check result for the node itself.
+/// `Columns` vector contains check results for the columns (in the same order as in the request).
+struct TCheckPermissionResponse
+ : public TCheckPermissionResult
+{
+ /// @brief Results for the table columns access permissions.
+ ///
+ /// @see [Columnar ACL doc](https://yt.yandex-team.ru/docs/description/common/columnar_acl)
+ TVector<TCheckPermissionResult> Columns;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Interface representing a lock obtained from @ref NYT::ITransaction::Lock.
+///
+/// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#start-tx)
+class ILock
+ : public TThrRefBase
+{
+public:
+ virtual ~ILock() = default;
+
+ /// Get cypress node id of lock itself.
+ virtual const TLockId& GetId() const = 0;
+
+ /// Get cypress node id of locked object.
+ virtual TNodeId GetLockedNodeId() const = 0;
+
+ ///
+ /// @brief Get future that will be set once lock is in "acquired" state.
+ ///
+ /// Note that future might contain exception if some error occurred
+ /// e.g. lock transaction was aborted.
+ virtual const ::NThreading::TFuture<void>& GetAcquiredFuture() const = 0;
+
+ ///
+ /// @brief Wait until lock is in "acquired" state.
+ ///
+ /// Throws exception if timeout exceeded or some error occurred
+ /// e.g. lock transaction was aborted.
+ void Wait(TDuration timeout = TDuration::Max());
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Base class for @ref NYT::IClient and @ref NYT::ITransaction.
+///
+/// This class contains transactional commands.
+class IClientBase
+ : public TThrRefBase
+ , public ICypressClient
+ , public IIOClient
+ , public IOperationClient
+{
+public:
+ ///
+ /// @brief Start a [transaction](https://yt.yandex-team.ru/docs/description/storage/transactions.html#master_transactions).
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#start-tx)
+ [[nodiscard]] virtual ITransactionPtr StartTransaction(
+ const TStartTransactionOptions& options = TStartTransactionOptions()) = 0;
+
+ ///
+ /// @brief Change properties of table.
+ ///
+ /// Allows to:
+ /// - switch table between dynamic/static mode
+ /// - or change table schema
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#alter-table)
+ virtual void AlterTable(
+ const TYPath& path,
+ const TAlterTableOptions& options = TAlterTableOptions()) = 0;
+
+ ///
+ /// @brief Create batch request object that allows to execute several light requests in parallel.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#execute-batch)
+ virtual TBatchRequestPtr CreateBatchRequest() = 0;
+
+ /// @brief Get root client outside of all transactions.
+ virtual IClientPtr GetParentClient() = 0;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+/// @brief Interface representing a master transaction.
+///
+/// @see [YT doc](https://yt.yandex-team.ru/docs/description/storage/transactions.html#master_transactions)
+class ITransaction
+ : virtual public IClientBase
+{
+public:
+ /// Get id of transaction.
+ virtual const TTransactionId& GetId() const = 0;
+
+ ///
+ /// @brief Try to lock given path.
+ ///
+ /// Lock will be held until transaction is committed/aborted or @ref NYT::ITransaction::Unlock method is called.
+ /// Lock modes:
+ /// - `LM_EXCLUSIVE`: if exclusive lock is taken no other transaction can take exclusive or shared lock.
+ /// - `LM_SHARED`: if shared lock is taken other transactions can take shared lock but not exclusive.
+ /// - `LM_SNAPSHOT`: snapshot lock always succeeds, when snapshot lock is taken current transaction snapshots object.
+ /// It will not see changes that occurred to it in other transactions.
+ ///
+ /// Exclusive/shared lock can be waitable or not.
+ /// If nonwaitable lock cannot be taken exception is thrown.
+ /// If waitable lock cannot be taken it is created in pending state and client can wait until it is actually taken.
+ /// Check @ref NYT::TLockOptions::Waitable and @ref NYT::ILock::GetAcquiredFuture for more details.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#lock)
+ virtual ILockPtr Lock(
+ const TYPath& path,
+ ELockMode mode,
+ const TLockOptions& options = TLockOptions()) = 0;
+
+ ///
+ /// @brief Remove all the locks (including pending ones) for this transaction from a Cypress node at `path`.
+ ///
+ /// If the locked version of the node differs from the original one,
+ /// an error will be thrown.
+ ///
+ /// Command is successful even if the node has no locks.
+ /// Only explicit (created by @ref NYT::ITransaction::Lock) locks are removed.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#unlock)
+ virtual void Unlock(
+ const TYPath& path,
+ const TUnlockOptions& options = TUnlockOptions()) = 0;
+
+ ///
+ /// @brief Commit transaction.
+ ///
+ /// All changes that are made by transactions become visible globally or to parent transaction.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#commit)
+ virtual void Commit() = 0;
+
+ ///
+ /// @brief Abort transaction.
+ ///
+ /// All changes made by current transaction are lost.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#abort)
+ virtual void Abort() = 0;
+
+ /// @brief Explicitly ping transaction.
+ ///
+ /// User usually does not need this method (as transactions are pinged automatically,
+ /// see @ref NYT::TStartTransactionOptions::AutoPingable).
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#ping)
+ virtual void Ping() = 0;
+
+ ///
+ /// @brief Detach transaction.
+ ///
+ /// Stop any activities connected with it: pinging, aborting on crashes etc.
+ /// Forget about the transaction totally.
+ virtual void Detach();
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Interface containing non-transactional commands.
+class IClient
+ : virtual public IClientBase
+{
+public:
+ ///
+ /// @brief Attach to existing master transaction.
+ ///
+ /// Returned object WILL NOT:
+ /// - ping transaction automatically (unless @ref NYT::TAttachTransactionOptions::AutoPing is set)
+ /// - abort it on program termination (unless @ref NYT::TAttachTransactionOptions::AbortOnTermination is set).
+ /// Otherwise returned object is similar to the object returned by @ref NYT::IClientBase::StartTransaction
+ /// and it can see all the changes made inside the transaction.
+ [[nodiscard]] virtual ITransactionPtr AttachTransaction(
+ const TTransactionId& transactionId,
+ const TAttachTransactionOptions& options = TAttachTransactionOptions()) = 0;
+
+ ///
+ /// @brief Mount dynamic table.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#mount-table)
+ virtual void MountTable(
+ const TYPath& path,
+ const TMountTableOptions& options = TMountTableOptions()) = 0;
+
+ ///
+ /// @brief Unmount dynamic table.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#unmount-table)
+ virtual void UnmountTable(
+ const TYPath& path,
+ const TUnmountTableOptions& options = TUnmountTableOptions()) = 0;
+
+ ///
+ /// @brief Remount dynamic table.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#remount-table)
+ virtual void RemountTable(
+ const TYPath& path,
+ const TRemountTableOptions& options = TRemountTableOptions()) = 0;
+
+ ///
+ /// @brief Switch dynamic table from `mounted` into `frozen` state.
+ ///
+ /// When table is in frozen state all its data is flushed to disk and writes are disabled.
+ ///
+ /// @note this function launches the process of switching, but doesn't wait until switching is accomplished.
+ /// Waiting has to be performed by user.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#freeze-table)
+ virtual void FreezeTable(
+ const TYPath& path,
+ const TFreezeTableOptions& options = TFreezeTableOptions()) = 0;
+
+ ///
+ /// @brief Switch dynamic table from `frozen` into `mounted` state.
+ ///
+ /// @note this function launches the process of switching, but doesn't wait until switching is accomplished.
+ /// Waiting has to be performed by user.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#unfreeze-table)
+ virtual void UnfreezeTable(
+ const TYPath& path,
+ const TUnfreezeTableOptions& options = TUnfreezeTableOptions()) = 0;
+
+ ///
+ /// @brief Reshard dynamic table (break it into tablets) by given pivot keys.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#reshard-table)
+ virtual void ReshardTable(
+ const TYPath& path,
+ const TVector<TKey>& pivotKeys,
+ const TReshardTableOptions& options = TReshardTableOptions()) = 0;
+
+ ///
+ /// @brief Reshard dynamic table, breaking it into given number of tablets.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#reshard-table)
+ virtual void ReshardTable(
+ const TYPath& path,
+ i64 tabletCount,
+ const TReshardTableOptions& options = TReshardTableOptions()) = 0;
+
+ ///
+ /// @brief Insert rows into dynamic table.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#insert-rows)
+ virtual void InsertRows(
+ const TYPath& path,
+ const TNode::TListType& rows,
+ const TInsertRowsOptions& options = TInsertRowsOptions()) = 0;
+
+ ///
+ /// @brief Delete rows from dynamic table.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#delete-rows)
+ virtual void DeleteRows(
+ const TYPath& path,
+ const TNode::TListType& keys,
+ const TDeleteRowsOptions& options = TDeleteRowsOptions()) = 0;
+
+ ///
+ /// @brief Trim rows from the beginning of ordered dynamic table.
+ ///
+ /// Asynchronously removes `rowCount` rows from the beginning of ordered dynamic table.
+ /// Numeration of remaining rows *does not change*, e.g. after `trim(10)` and `trim(20)`
+ /// you get in total `20` deleted rows.
+ ///
+ /// @param path Path to ordered dynamic table.
+ /// @param tabletIndex Which tablet to trim.
+ /// @param rowCount How many trimmed rows will be in the table after command.
+ /// @param options Optional parameters.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#trim-rows)
+ virtual void TrimRows(
+ const TYPath& path,
+ i64 tabletIndex,
+ i64 rowCount,
+ const TTrimRowsOptions& options = TTrimRowsOptions()) = 0;
+
+ ///
+ /// @brief Lookup rows with given keys from dynamic table.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#lookup-rows)
+ virtual TNode::TListType LookupRows(
+ const TYPath& path,
+ const TNode::TListType& keys,
+ const TLookupRowsOptions& options = TLookupRowsOptions()) = 0;
+
+ ///
+ /// @brief Select rows from dynamic table, using [SQL dialect](https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language.html).
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#select-rows)
+ virtual TNode::TListType SelectRows(
+ const TString& query,
+ const TSelectRowsOptions& options = TSelectRowsOptions()) = 0;
+
+ ///
+ /// @brief Change properties of table replica.
+ ///
+ /// Allows to enable/disable replica and/or change its mode.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#alter-table-replica)
+ virtual void AlterTableReplica(
+ const TReplicaId& replicaId,
+ const TAlterTableReplicaOptions& alterTableReplicaOptions) = 0;
+
+ ///
+ /// @brief Generate a monotonically increasing master timestamp.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#generate-timestamp)
+ virtual ui64 GenerateTimestamp() = 0;
+
+ /// Return YT username of current client.
+ virtual TAuthorizationInfo WhoAmI() = 0;
+
+ ///
+ /// @brief Get operation attributes.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-operation)
+ virtual TOperationAttributes GetOperation(
+ const TOperationId& operationId,
+ const TGetOperationOptions& options = TGetOperationOptions()) = 0;
+
+ ///
+ /// @brief List operations satisfying given filters.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#list-operations)
+ virtual TListOperationsResult ListOperations(
+ const TListOperationsOptions& options = TListOperationsOptions()) = 0;
+
+ ///
+ /// @brief Update operation runtime parameters.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#update-op-parameters)
+ virtual void UpdateOperationParameters(
+ const TOperationId& operationId,
+ const TUpdateOperationParametersOptions& options) = 0;
+
+ ///
+ /// @brief Get job attributes.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job)
+ virtual TJobAttributes GetJob(
+ const TOperationId& operationId,
+ const TJobId& jobId,
+ const TGetJobOptions& options = TGetJobOptions()) = 0;
+
+ ///
+ /// @brief List attributes of jobs satisfying given filters.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#list-jobs)
+ virtual TListJobsResult ListJobs(
+ const TOperationId& operationId,
+ const TListJobsOptions& options = TListJobsOptions()) = 0;
+
+ ///
+ /// @brief Get the input of a running or failed job.
+ ///
+ /// @ref NYT::TErrorResponse exception is thrown if job is missing.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job-input)
+ virtual IFileReaderPtr GetJobInput(
+ const TJobId& jobId,
+ const TGetJobInputOptions& options = TGetJobInputOptions()) = 0;
+
+ ///
+ /// @brief Get fail context of a failed job.
+ ///
+ /// @ref NYT::TErrorResponse exception is thrown if it is missing.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job-fail-context)
+ virtual IFileReaderPtr GetJobFailContext(
+ const TOperationId& operationId,
+ const TJobId& jobId,
+ const TGetJobFailContextOptions& options = TGetJobFailContextOptions()) = 0;
+
+ ///
+ /// @brief Get stderr of a running or failed job.
+ ///
+ /// @ref NYT::TErrorResponse exception is thrown if it is missing.
+ ///
+ /// @note YT doesn't store all job stderrs
+ ///
+ /// @note If job stderr exceeds few megabytes YT will store only head and tail of stderr.
+ ///
+ /// @see Description of `max_stderr_size` spec option [here](https://yt.yandex-team.ru/docs/description/mr/operations_options.html).
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job-stderr)
+ virtual IFileReaderPtr GetJobStderr(
+ const TOperationId& operationId,
+ const TJobId& jobId,
+ const TGetJobStderrOptions& options = TGetJobStderrOptions()) = 0;
+
+ ///
+ /// @brief Create one or several rbtorrents for files in a blob table.
+ ///
+ /// If specified, one torrent is created for each value of `KeyColumns` option.
+ /// Otherwise, a single torrent with all files of a table is created.
+ ///
+ /// @return list of nodes, each node has two fields
+ /// * `key`: list of key columns values. Empty if `KeyColumns` is not specified.
+ /// * `rbtorrent`: rbtorrent string (with `rbtorrent:` prefix)
+ ///
+ /// @see [More info.](https://docs.yandex-team.ru/docs/yt/description/storage/blobtables#sky_share)
+ virtual TNode::TListType SkyShareTable(
+ const std::vector<TYPath>& tablePaths,
+ const TSkyShareTableOptions& options) = 0;
+
+ ///
+ /// @brief Check if `user` has `permission` to access a Cypress node at `path`.
+ ///
+ /// For tables access to columns specified in `options.Columns_` can be checked
+ /// (@see [the doc](https://yt.yandex-team.ru/docs/description/common/columnar_acl)).
+ ///
+ /// If access is denied (the returned result has `.Action == ESecurityAction::Deny`)
+ /// because of a `deny` rule, the "denying" object name and id
+ /// and "denied" subject name and id may be returned.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#check_permission)
+ virtual TCheckPermissionResponse CheckPermission(
+ const TString& user,
+ EPermission permission,
+ const TYPath& path,
+ const TCheckPermissionOptions& options = TCheckPermissionOptions()) = 0;
+
+ /// @brief Get information about tablet
+ /// @see NYT::TTabletInfo
+ virtual TVector<TTabletInfo> GetTabletInfos(
+ const TYPath& path,
+ const TVector<int>& tabletIndexes,
+ const TGetTabletInfosOptions& options = TGetTabletInfosOptions()) = 0;
+
+ ///
+ /// @brief Suspend operation.
+ ///
+ /// Jobs will be aborted.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#suspend_op)
+ virtual void SuspendOperation(
+ const TOperationId& operationId,
+ const TSuspendOperationOptions& options = TSuspendOperationOptions()) = 0;
+
+ /// @brief Resume previously suspended operation.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#resume_op)
+ virtual void ResumeOperation(
+ const TOperationId& operationId,
+ const TResumeOperationOptions& options = TResumeOperationOptions()) = 0;
+
+ ///
+ /// @brief Synchronously terminates all client's background activities
+ ///
+ /// e.g. no callbacks will be executed after the function is completed
+ ///
+ /// @note It is safe to call Shutdown multiple times
+ ///
+ /// @note @ref NYT::TApiUsageError will be thrown if any client's method is called after shutdown
+ ///
+ virtual void Shutdown() = 0;
+};
+
+
+/// Create a client for particular MapReduce cluster.
+IClientPtr CreateClient(
+ const TString& serverName,
+ const TCreateClientOptions& options = TCreateClientOptions());
+
+
+/// Create a client for mapreduce cluster specified in `YT_PROXY` environment variable.
+IClientPtr CreateClientFromEnv(
+ const TCreateClientOptions& options = TCreateClientOptions());
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/client_method_options.cpp b/yt/cpp/mapreduce/interface/client_method_options.cpp
new file mode 100644
index 0000000000..66f72bfe5f
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/client_method_options.cpp
@@ -0,0 +1,34 @@
+#include "client_method_options.h"
+
+#include "tvm.h"
+
+namespace NYT {
+
+// Overwrites `origin` with `patch` when `patch` holds a value;
+// an empty `patch` leaves `origin` untouched.
+template <typename T>
+static void MergeMaybe(TMaybe<T>& origin, const TMaybe<T>& patch)
+{
+ if (!patch) {
+ // Nothing to apply: keep whatever `origin` currently holds.
+ return;
+ }
+ origin = patch;
+}
+
+// Applies `patch` on top of this object: every option that is set in `patch`
+// overrides the corresponding option here, options not set in `patch` are kept as is.
+void TFormatHints::Merge(const TFormatHints& patch)
+{
+ // SkipNullValuesForTNode is a plain bool with a `false` default, so only an
+ // explicit `true` in the patch is applied; `false` never resets an enabled flag.
+ if (patch.SkipNullValuesForTNode_) {
+ SkipNullValuesForTNode(true);
+ }
+ MergeMaybe(EnableStringToAllConversion_, patch.EnableStringToAllConversion_);
+ MergeMaybe(EnableAllToStringConversion_, patch.EnableAllToStringConversion_);
+ MergeMaybe(EnableIntegralTypeConversion_, patch.EnableIntegralTypeConversion_);
+ MergeMaybe(EnableIntegralToDoubleConversion_, patch.EnableIntegralToDoubleConversion_);
+ MergeMaybe(EnableTypeConversion_, patch.EnableTypeConversion_);
+ MergeMaybe(ComplexTypeMode_, patch.ComplexTypeMode_);
+ // SkiffRowHints is an optional field as well; without this line hints
+ // supplied through the patch were silently dropped.
+ MergeMaybe(SkiffRowHints_, patch.SkiffRowHints_);
+}
+
+TCreateClientOptions& TCreateClientOptions::ServiceTicketAuth(const NAuth::IServiceTicketAuthPtrWrapper& wrapper)
+{
+ ServiceTicketAuth_ = std::make_shared<NAuth::IServiceTicketAuthPtrWrapper>(wrapper); // keep our own shared copy of the wrapper
+ return *this; // fluent setter: return self so that calls can be chained
+}
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/client_method_options.h b/yt/cpp/mapreduce/interface/client_method_options.h
new file mode 100644
index 0000000000..8074632353
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/client_method_options.h
@@ -0,0 +1,1452 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/client_method_options.h
+///
+/// Header containing options for @ref NYT::IClient methods.
+
+#include "common.h"
+#include "config.h"
+#include "format.h"
+#include "public.h"
+#include "retry_policy.h"
+
+#include <util/datetime/base.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Type of a cypress node. NOTE(review): the quoted names in the trailing comments are presumably the serialized names of the values -- keep them intact.
+enum ENodeType : int
+{
+ NT_STRING /* "string_node" */,
+ NT_INT64 /* "int64_node" */,
+ NT_UINT64 /* "uint64_node" */,
+ NT_DOUBLE /* "double_node" */,
+ NT_BOOLEAN /* "boolean_node" */,
+ NT_MAP /* "map_node" */,
+ NT_LIST /* "list_node" */,
+ NT_FILE /* "file" */,
+ NT_TABLE /* "table" */,
+ NT_DOCUMENT /* "document" */,
+ NT_REPLICATED_TABLE /* "replicated_table" */,
+ NT_TABLE_REPLICA /* "table_replica" */,
+ NT_USER /* "user" */,
+ NT_SCHEDULER_POOL /* "scheduler_pool" */,
+ NT_LINK /* "link" */,
+};
+
+///
+/// @brief Mode of composite (complex) type representation in YSON.
+///
+/// @see https://yt.yandex-team.ru/docs/description/storage/data_types#yson
+enum class EComplexTypeMode : int
+{
+ Named /* "named" */,
+ Positional /* "positional" */,
+};
+
+///
+/// @brief Options for @ref NYT::ICypressClient::Create
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#create
+struct TCreateOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TCreateOptions;
+ /// @endcond
+
+ /// Create missing parent directories if required.
+ FLUENT_FIELD_DEFAULT(bool, Recursive, false);
+
+ ///
+ /// @brief Do not raise an error if the node already exists.
+ ///
+ /// The existing node is not recreated.
+ /// Force and IgnoreExisting MUST NOT be used simultaneously.
+ FLUENT_FIELD_DEFAULT(bool, IgnoreExisting, false);
+
+ ///
+ /// @brief Recreate the node if it already exists.
+ ///
+ /// Force and IgnoreExisting MUST NOT be used simultaneously.
+ FLUENT_FIELD_DEFAULT(bool, Force, false);
+
+ /// @brief Attributes to set on the node.
+ FLUENT_FIELD_OPTION(TNode, Attributes);
+};
+
+///
+/// @brief Options for @ref NYT::ICypressClient::Remove
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#remove
+struct TRemoveOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TRemoveOptions;
+ /// @endcond
+
+ ///
+ /// @brief Remove the whole tree when removing a composite cypress node (e.g. `map_node`).
+ ///
+ /// Without this option removing a nonempty composite node will fail.
+ FLUENT_FIELD_DEFAULT(bool, Recursive, false);
+
+ /// @brief Do not fail if the node being removed doesn't exist.
+ FLUENT_FIELD_DEFAULT(bool, Force, false);
+};
+
+/// Base class for options of operations that read from master; `TDerived` is the concrete options type and is exposed as `TSelf`.
+template <typename TDerived>
+struct TMasterReadOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ /// @brief Where to read from (see EMasterReadKind).
+ FLUENT_FIELD_OPTION(EMasterReadKind, ReadFrom);
+};
+
+///
+/// @brief Options for @ref NYT::ICypressClient::Exists
+/// Declares no fields of its own; only the options inherited from TMasterReadOptions are available.
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#exists
+struct TExistsOptions
+ : public TMasterReadOptions<TExistsOptions>
+{
+};
+
+///
+/// @brief Options for @ref NYT::ICypressClient::Get
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#get
+struct TGetOptions
+ : public TMasterReadOptions<TGetOptions>
+{
+ /// @brief Attributes that should be fetched with each node.
+ FLUENT_FIELD_OPTION(TAttributeFilter, AttributeFilter);
+
+ /// @brief Limit on the number of child nodes.
+ FLUENT_FIELD_OPTION(i64, MaxSize);
+};
+
+///
+/// @brief Options for @ref NYT::ICypressClient::Set
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#set
+struct TSetOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TSetOptions;
+ /// @endcond
+
+ /// Create missing parent directories if required.
+ FLUENT_FIELD_DEFAULT(bool, Recursive, false);
+
+ /// Allow setting nodes of any type, not only attribute and document ones.
+ FLUENT_FIELD_OPTION(bool, Force);
+};
+
+///
+/// @brief Options for @ref NYT::ICypressClient::MultisetAttributes
+/// @note The structure has no fields for now.
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#multiset_attributes
+struct TMultisetAttributesOptions
+{ };
+
+///
+/// @brief Options for @ref NYT::ICypressClient::List
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#list
+struct TListOptions
+ : public TMasterReadOptions<TListOptions>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TListOptions;
+ /// @endcond
+
+ /// @brief Attributes that should be fetched for each node.
+ FLUENT_FIELD_OPTION(TAttributeFilter, AttributeFilter);
+
+ /// @brief Limit on the number of children that will be fetched.
+ FLUENT_FIELD_OPTION(i64, MaxSize);
+};
+
+///
+/// @brief Options for @ref NYT::ICypressClient::Copy
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#copy
+struct TCopyOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TCopyOptions;
+ /// @endcond
+
+ /// Create missing directories in destination path if required.
+ FLUENT_FIELD_DEFAULT(bool, Recursive, false);
+
+ /// Allow using an existing node as destination; it will be overwritten.
+ FLUENT_FIELD_DEFAULT(bool, Force, false);
+
+ /// Whether to preserve the account of the source node.
+ FLUENT_FIELD_DEFAULT(bool, PreserveAccount, false);
+
+ /// Whether to preserve the `expiration_time` attribute of the source node.
+ FLUENT_FIELD_OPTION(bool, PreserveExpirationTime);
+};
+
+///
+/// @brief Options for @ref NYT::ICypressClient::Move
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#move
+struct TMoveOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TMoveOptions;
+ /// @endcond
+
+ /// Create missing directories in destination path if required.
+ FLUENT_FIELD_DEFAULT(bool, Recursive, false);
+
+ /// Allow using an existing node as destination; it will be overwritten.
+ FLUENT_FIELD_DEFAULT(bool, Force, false);
+
+ /// Whether to preserve the account of the source node.
+ FLUENT_FIELD_DEFAULT(bool, PreserveAccount, false);
+
+ /// Whether to preserve the `expiration_time` attribute of the source node.
+ FLUENT_FIELD_OPTION(bool, PreserveExpirationTime);
+};
+
+///
+/// @brief Options for @ref NYT::ICypressClient::Link
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#link
+struct TLinkOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TLinkOptions;
+ /// @endcond
+
+ /// Create parent directories of the destination if they don't exist.
+ FLUENT_FIELD_DEFAULT(bool, Recursive, false);
+
+ /// Do not raise an error if the link already exists.
+ FLUENT_FIELD_DEFAULT(bool, IgnoreExisting, false);
+
+ /// Force rewrite of the target node.
+ FLUENT_FIELD_DEFAULT(bool, Force, false);
+
+ /// Attributes of the created link.
+ FLUENT_FIELD_OPTION(TNode, Attributes);
+};
+
+///
+/// @brief Options for @ref NYT::ICypressClient::Concatenate
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#concatenate
+struct TConcatenateOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TConcatenateOptions;
+ /// @endcond
+
+ /// Whether to append to the destination (true) or rewrite it (false).
+ FLUENT_FIELD_OPTION(bool, Append);
+};
+
+///
+/// @brief Options for @ref NYT::IIOClient::CreateBlobTableReader
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#read_blob_table
+struct TBlobTableReaderOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TBlobTableReaderOptions;
+ /// @endcond
+
+ /// Name of the part index column. By default it is "part_index".
+ FLUENT_FIELD_OPTION(TString, PartIndexColumnName);
+
+ /// Name of the data column. By default it is "data".
+ FLUENT_FIELD_OPTION(TString, DataColumnName);
+
+ ///
+ /// @brief Size of each part (4 MiB by default).
+ ///
+ /// All blob parts except the last part of the blob must be of this size,
+ /// otherwise the blob table reader emits an error.
+ FLUENT_FIELD_DEFAULT(ui64, PartSize, 4 * 1024 * 1024);
+
+ /// @brief Offset from which to start reading (0 by default).
+ FLUENT_FIELD_DEFAULT(i64, Offset, 0);
+};
+
+///
+/// @brief Resource limits for operation (or pool)
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/scheduler/scheduler_and_pools#resursy
+/// @see NYT::TUpdateOperationParametersOptions
+struct TResourceLimits
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TResourceLimits;
+ /// @endcond
+
+ /// Number of slots for user jobs.
+ FLUENT_FIELD_OPTION(i64, UserSlots);
+
+ /// Number of CPU cores.
+ FLUENT_FIELD_OPTION(double, Cpu);
+
+ /// Network usage. It doesn't have a precise physical unit.
+ FLUENT_FIELD_OPTION(i64, Network);
+
+ /// Memory in bytes.
+ FLUENT_FIELD_OPTION(i64, Memory);
+};
+
+///
+/// @brief Scheduling options for single pool tree.
+///
+/// @see NYT::TUpdateOperationParametersOptions
+struct TSchedulingOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TSchedulingOptions;
+ /// @endcond
+
+ ///
+ /// @brief Pool to switch the operation to.
+ ///
+ /// @note Switching is currently disabled on the server (will induce an exception).
+ FLUENT_FIELD_OPTION(TString, Pool);
+
+ /// @brief Weight of the operation.
+ FLUENT_FIELD_OPTION(double, Weight);
+
+ /// @brief Resource limits of the operation.
+ FLUENT_FIELD_OPTION(TResourceLimits, ResourceLimits);
+};
+
+///
+/// @brief Collection of scheduling options for multiple pool trees.
+///
+/// @see NYT::TUpdateOperationParametersOptions
+struct TSchedulingOptionsPerPoolTree
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TSchedulingOptionsPerPoolTree;
+ /// @endcond
+
+ TSchedulingOptionsPerPoolTree(const THashMap<TString, TSchedulingOptions>& options = {}) // starts from a copy of `options` (empty by default)
+ : Options_(options)
+ { }
+
+ /// Add scheduling options for pool tree `poolTreeName`; the Y_ENSURE check fails if options for this tree were already added.
+ TSelf& Add(TStringBuf poolTreeName, const TSchedulingOptions& schedulingOptions)
+ {
+ Y_ENSURE(Options_.emplace(poolTreeName, schedulingOptions).second);
+ return *this;
+ }
+
+ THashMap<TString, TSchedulingOptions> Options_; // pool tree name -> scheduling options for that tree
+};
+
+///
+/// @brief Options for @ref NYT::IOperation::SuspendOperation
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#suspend_op
+struct TSuspendOperationOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TSuspendOperationOptions;
+ /// @endcond
+
+ ///
+ /// @brief Whether to abort jobs that are already running.
+ ///
+ /// By default running jobs are not aborted.
+ FLUENT_FIELD_OPTION(bool, AbortRunningJobs);
+};
+
+///
+/// @brief Options for @ref NYT::IOperation::ResumeOperation
+///
+/// @note The structure is empty for now but options might appear in the future.
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#resume_op
+struct TResumeOperationOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TResumeOperationOptions;
+ /// @endcond
+};
+
+///
+/// @brief Options for @ref NYT::IOperation::UpdateParameters
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#update_op_parameters
+struct TUpdateOperationParametersOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TUpdateOperationParametersOptions;
+ /// @endcond
+
+ /// New owners of the operation.
+ FLUENT_VECTOR_FIELD(TString, Owner);
+
+ /// Pool to switch the operation to (for all pool trees it is running in).
+ FLUENT_FIELD_OPTION(TString, Pool);
+
+ /// New weight of the operation (for all pool trees it is running in).
+ FLUENT_FIELD_OPTION(double, Weight);
+
+ /// Scheduling options for each pool tree the operation is running in.
+ FLUENT_FIELD_OPTION(TSchedulingOptionsPerPoolTree, SchedulingOptionsPerPoolTree);
+};
+
+///
+/// @brief Base class for many options related to IO.
+///
+/// @ref NYT::TFileWriterOptions
+/// @ref NYT::TFileReaderOptions
+/// @ref NYT::TTableReaderOptions
+/// @ref NYT::TTableWriterOptions
+template <class TDerived>
+struct TIOOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ ///
+ /// @brief Advanced options for reader/writer.
+ ///
+ /// Readers/writers have many options and not all of them are supported by the library.
+ /// If you need such an unsupported option, you might use the `Config` option until
+ /// the option is supported.
+ ///
+ /// Example:
+ ///
+ /// TTableWriterOptions().Config(TNode()("max_row_weight", 64 << 20))
+ ///
+ /// @note We encourage you to ask yt@ to add native C++ support of required options
+ /// and use `Config` only as a temporary solution while native support is not ready.
+ FLUENT_FIELD_OPTION(TNode, Config);
+
+ ///
+ /// @brief Whether to create internal client transaction for reading / writing table.
+ ///
+ /// This is an advanced option.
+ ///
+ /// If `CreateTransaction` is set to `false` reader/writer doesn't create internal transaction
+ /// and doesn't lock table. This option is overridden (effectively `false`) for writers by
+ /// @ref NYT::TTableWriterOptions::SingleHttpRequest
+ ///
+ /// WARNING: if `CreateTransaction` is `false`, read/write might become non-atomic.
+ /// Change ONLY if you are sure what you are doing!
+ FLUENT_FIELD_DEFAULT(bool, CreateTransaction, true);
+};
+
+/// @brief Options for reading a file from YT.
+struct TFileReaderOptions
+ : public TIOOptions<TFileReaderOptions>
+{
+ ///
+ /// @brief Offset to start reading from.
+ ///
+ /// By default reading starts from the beginning of the file.
+ FLUENT_FIELD_OPTION(i64, Offset);
+
+ ///
+ /// @brief Maximum length to read.
+ ///
+ /// By default the file is read until the end.
+ FLUENT_FIELD_OPTION(i64, Length);
+};
+
+/// @brief Options that control how the server side of YT stores data.
+struct TWriterOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TWriterOptions;
+ /// @endcond
+
+ ///
+ /// @brief Whether to finish upload without waiting for all replicas to be written.
+ ///
+ /// When set to true upload will be considered successful as soon as
+ /// @ref NYT::TWriterOptions::MinUploadReplicationFactor number of replicas are created.
+ FLUENT_FIELD_OPTION(bool, EnableEarlyFinish);
+
+ /// Number of replicas to be created.
+ FLUENT_FIELD_OPTION(ui64, UploadReplicationFactor);
+
+ ///
+ /// Min number of created replicas needed to consider upload successful.
+ ///
+ /// @see NYT::TWriterOptions::EnableEarlyFinish
+ FLUENT_FIELD_OPTION(ui64, MinUploadReplicationFactor);
+
+ ///
+ /// @brief Desired size of a chunk.
+ ///
+ /// @see @ref NYT::TWriterOptions::RetryBlockSize
+ FLUENT_FIELD_OPTION(ui64, DesiredChunkSize);
+
+ ///
+ /// @brief Size of data block accumulated in memory to provide retries.
+ ///
+ /// Data is accumulated in a memory buffer so that it can be resent in case an error occurs.
+ ///
+ /// If `RetryBlockSize` is not set buffer size is set to `DesiredChunkSize`.
+ /// If neither `RetryBlockSize` nor `DesiredChunkSize` is set the size of the buffer is 64MB.
+ ///
+ /// @note Written chunks cannot be larger than size of this memory buffer.
+ ///
+ /// Since DesiredChunkSize is compared against data already compressed with compression codec
+ /// it makes sense to set `RetryBlockSize = DesiredChunkSize / ExpectedCompressionRatio`
+ ///
+ /// @see @ref NYT::TWriterOptions::DesiredChunkSize
+ /// @see @ref NYT::TTableWriterOptions::SingleHttpRequest
+ FLUENT_FIELD_OPTION(size_t, RetryBlockSize);
+};
+
+///
+/// @brief Options for writing file
+///
+/// @see NYT::IIOClient::CreateFileWriter
+struct TFileWriterOptions
+ : public TIOOptions<TFileWriterOptions>
+{
+ ///
+ /// @brief Whether to compute the MD5 sum of the written file.
+ ///
+ /// If ComputeMD5 is set to `true` and we are appending to an existing file
+ /// the `md5` attribute must be set (i.e. it was previously written only with `ComputeMD5 == true`).
+ FLUENT_FIELD_OPTION(bool, ComputeMD5);
+
+ ///
+ /// @brief Options to control how the YT server side writes data.
+ ///
+ /// @see NYT::TWriterOptions
+ FLUENT_FIELD_OPTION(TWriterOptions, WriterOptions);
+};
+
+class TSkiffRowHints {
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TSkiffRowHints;
+ /// @endcond
+
+ ///
+ /// @brief The library doesn't interpret it, it only passes it to CreateSkiffParser<...>() and GetSkiffSchema<...>() functions.
+ ///
+ /// You can put anything into it to pass necessary information to CreateSkiffParser<...>() and GetSkiffSchema<...>() functions.
+ FLUENT_FIELD_OPTION(TNode, Attributes);
+};
+
+/// Options that control how C++ objects represent table rows when reading or writing a table.
+class TFormatHints
+{
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TFormatHints;
+ /// @endcond
+
+ ///
+ /// @brief Whether to skip null values.
+ ///
+ /// When set to true TNode doesn't contain null column values
+ /// (e.g. corresponding keys will be missing instead of containing null value).
+ ///
+ /// Only meaningful for TNode representation.
+ ///
+ /// Useful for sparse tables which have many columns in schema
+ /// but only few columns are set in any row.
+ FLUENT_FIELD_DEFAULT(bool, SkipNullValuesForTNode, false);
+
+ ///
+ /// @brief Whether to convert string to numeric and boolean types (e.g. "42u" -> 42u, "false" -> %false)
+ /// when writing to schemaful table.
+ FLUENT_FIELD_OPTION(bool, EnableStringToAllConversion);
+
+ ///
+ /// @brief Whether to convert numeric and boolean types to string (e.g., 3.14 -> "3.14", %true -> "true")
+ /// when writing to schemaful table.
+ FLUENT_FIELD_OPTION(bool, EnableAllToStringConversion);
+
+ ///
+ /// @brief Whether to convert uint64 <-> int64 when writing to schemaful table.
+ ///
+ /// On overflow the corresponding error will be raised.
+ ///
+ /// This option is enabled by default.
+ FLUENT_FIELD_OPTION(bool, EnableIntegralTypeConversion);
+
+ /// Whether to convert uint64 and int64 to double (e.g. 42 -> 42.0) when writing to schemaful table.
+ FLUENT_FIELD_OPTION(bool, EnableIntegralToDoubleConversion);
+
+ /// Shortcut for enabling all type conversions.
+ FLUENT_FIELD_OPTION(bool, EnableTypeConversion);
+
+ ///
+ /// @brief Controls how complex types are represented in TNode or yson-strings.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/storage/data_types#yson
+ FLUENT_FIELD_OPTION(EComplexTypeMode, ComplexTypeMode);
+
+ ///
+ /// @brief Allows using any meta-information for creating the skiff schema and parser for reading ISkiffRow.
+ FLUENT_FIELD_OPTION(TSkiffRowHints, SkiffRowHints);
+
+ ///
+ /// @brief Apply the patch to the fields.
+ ///
+ /// Values that are set (non-default / non-empty) in `patch` override the corresponding values of this object.
+ void Merge(const TFormatHints& patch);
+};
+
+/// Options that select which control attributes (like row_index) are added to rows during read.
+class TControlAttributes
+{
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TControlAttributes;
+ /// @endcond
+
+ ///
+ /// @brief Whether to add the "row_index" attribute to rows read (enabled by default).
+ FLUENT_FIELD_DEFAULT(bool, EnableRowIndex, true);
+
+ ///
+ /// @brief Whether to add the "range_index" attribute to rows read (enabled by default).
+ FLUENT_FIELD_DEFAULT(bool, EnableRangeIndex, true);
+};
+
+/// Options for @ref NYT::IClient::CreateTableReader
+struct TTableReaderOptions
+ : public TIOOptions<TTableReaderOptions>
+{
+ /// @deprecated Size of internal client buffer.
+ FLUENT_FIELD_DEFAULT(size_t, SizeLimit, 4 << 20);
+
+ ///
+ /// @brief Allows fine-tuning the format that is used for reading tables.
+ ///
+ /// Has no effect when used with raw-reader.
+ FLUENT_FIELD_OPTION(TFormatHints, FormatHints);
+
+ ///
+ /// @brief Allows tuning which attributes are added to rows while reading tables.
+ ///
+ FLUENT_FIELD_DEFAULT(TControlAttributes, ControlAttributes, TControlAttributes());
+};
+
+/// Options for @ref NYT::IClient::CreateTableWriter
+struct TTableWriterOptions
+ : public TIOOptions<TTableWriterOptions>
+{
+ ///
+ /// @brief Enable or disable retryful writing.
+ ///
+ /// If set to true no retry is made but we also make fewer requests to master.
+ /// If set to false writer can make up to `TConfig::RetryCount` attempts to send each block of data.
+ ///
+ /// @note Writers' methods might throw strange exceptions that might look like network error
+ /// when `SingleHttpRequest == true` and YT node encounters an error
+ /// (due to limitations of HTTP protocol YT node has no chance to report error
+ /// before it reads the whole input so it just drops the connection).
+ FLUENT_FIELD_DEFAULT(bool, SingleHttpRequest, false);
+
+ ///
+ /// @brief Allows changing the size of locally buffered rows before flushing to YT.
+ ///
+ /// Used only with @ref NYT::TTableWriterOptions::SingleHttpRequest
+ FLUENT_FIELD_DEFAULT(size_t, BufferSize, 64 << 20);
+
+ ///
+ /// @brief Allows fine-tuning the format that is used for writing tables.
+ ///
+ /// Has no effect when used with raw-writer.
+ FLUENT_FIELD_OPTION(TFormatHints, FormatHints);
+
+ /// @brief Try to infer schema of a nonexistent table from the type of written rows.
+ ///
+ /// @note Default values for this option may differ depending on the row type.
+ /// For protobuf it's currently false by default.
+ FLUENT_FIELD_OPTION(bool, InferSchema);
+
+ ///
+ /// @brief Options to control how the YT server side writes data.
+ ///
+ /// @see NYT::TWriterOptions
+ FLUENT_FIELD_OPTION(TWriterOptions, WriterOptions);
+};
+
+///
+/// @brief Options for @ref NYT::IClient::StartTransaction
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#start_tx
+struct TStartTransactionOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TStartTransactionOptions;
+ /// @endcond
+
+ FLUENT_FIELD_DEFAULT(bool, PingAncestors, false); // NOTE(review): undocumented; presumably makes pings propagate to ancestor transactions -- confirm
+
+ ///
+ /// @brief How long the transaction lives after the last ping.
+ ///
+ /// If the server doesn't receive any pings for the transaction during this time
+ /// the transaction will be aborted. By default timeout is 15 seconds.
+ FLUENT_FIELD_OPTION(TDuration, Timeout);
+
+ ///
+ /// @brief Moment in the future when the transaction is aborted.
+ FLUENT_FIELD_OPTION(TInstant, Deadline);
+
+ ///
+ /// @brief Whether to ping created transaction automatically.
+ ///
+ /// When set to true the library creates a thread that pings the transaction.
+ /// When set to false the library doesn't ping the transaction and it's the user's responsibility to ping it.
+ FLUENT_FIELD_DEFAULT(bool, AutoPingable, true);
+
+ ///
+ /// @brief Set the title attribute of transaction.
+ ///
+ /// If the title was specified
+ /// neither using this option nor using @ref NYT::TStartTransactionOptions::Attributes option
+ /// the library will generate a default title for the transaction.
+ /// Such default title includes machine name, pid, user name and some other useful info.
+ FLUENT_FIELD_OPTION(TString, Title);
+
+ ///
+ /// @brief Set custom transaction attributes
+ ///
+ /// @note @ref NYT::TStartTransactionOptions::Title option overrides `"title"` attribute.
+ FLUENT_FIELD_OPTION(TNode, Attributes);
+};
+
+///
+/// @brief Options for attaching transaction.
+///
+/// @see NYT::IClient::AttachTransaction
+struct TAttachTransactionOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TAttachTransactionOptions;
+ /// @endcond
+
+ ///
+ /// @brief Ping transaction automatically.
+ ///
+ /// When set to `true` the library creates a thread that pings the transaction.
+ /// When set to `false` the library doesn't ping the transaction and
+ /// it's the user's responsibility to ping it.
+ FLUENT_FIELD_DEFAULT(bool, AutoPingable, false);
+
+ ///
+ /// @brief Abort transaction on program termination.
+ ///
+ /// Should the transaction be aborted on program termination
+ /// (either normal or by a signal or uncaught exception -- the two latter
+ /// only if @ref TInitializeOptions::CleanupOnTermination is set).
+ FLUENT_FIELD_DEFAULT(bool, AbortOnTermination, false);
+};
+
+///
+/// @brief Type (mode) of the lock taken on a cypress node.
+///
+/// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locking_mode
+/// @see NYT::ITransaction::Lock
+enum ELockMode : int
+{
+ /// Exclusive lock.
+ LM_EXCLUSIVE /* "exclusive" */,
+
+ /// Shared lock.
+ LM_SHARED /* "shared" */,
+
+ /// Snapshot lock.
+ LM_SNAPSHOT /* "snapshot" */,
+};
+
+///
+/// @brief Options for locking cypress node
+///
+/// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks
+/// @see NYT::ITransaction::Lock
+struct TLockOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TLockOptions;
+ /// @endcond
+
+ ///
+ /// @brief Whether to wait for an already locked node to be unlocked.
+ ///
+ /// If `Waitable` is set to true Lock method will create
+ /// waitable lock, that will be taken once other transactions
+ /// that hold lock to that node are committed / aborted.
+ ///
+ /// @note Lock method DOES NOT wait until lock is actually acquired.
+ /// Waiting should be done using corresponding methods of ILock.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locking_queue
+ FLUENT_FIELD_DEFAULT(bool, Waitable, false);
+
+ ///
+ /// @brief Also take attribute_key lock.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks_compatibility
+ FLUENT_FIELD_OPTION(TString, AttributeKey);
+
+ ///
+ /// @brief Also take child_key lock.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks_compatibility
+ FLUENT_FIELD_OPTION(TString, ChildKey);
+};
+
+///
+/// @brief Options for @ref NYT::ITransaction::Unlock
+///
+/// @note The structure is empty for now but options might appear in the future.
+///
+/// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks_compatibility
+struct TUnlockOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TUnlockOptions;
+ /// @endcond
+};
+
+/// Base class for options that deal with tablets; `TDerived` is the concrete options type and is exposed as `TSelf`.
+template <class TDerived>
+struct TTabletOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ /// Index of the first tablet to deal with.
+ FLUENT_FIELD_OPTION(i64, FirstTabletIndex);
+
+ /// Index of the last tablet to deal with.
+ FLUENT_FIELD_OPTION(i64, LastTabletIndex);
+};
+
+///
+/// @brief Options for @ref NYT::IClient::MountTable
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands#mount_table
+struct TMountTableOptions
+ : public TTabletOptions<TMountTableOptions>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TMountTableOptions;
+ /// @endcond
+
+ /// If specified, the table will be mounted to this cell.
+ FLUENT_FIELD_OPTION(TTabletCellId, CellId);
+
+ /// If set to true tablets will be mounted in frozen state.
+ FLUENT_FIELD_DEFAULT(bool, Freeze, false);
+};
+
+///
+/// @brief Options for @ref NYT::IClient::UnmountTable
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands#unmount_table
+struct TUnmountTableOptions
+ : public TTabletOptions<TUnmountTableOptions>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TUnmountTableOptions;
+ /// @endcond
+
+ /// Advanced option; don't use it unless the YT team told you to.
+ FLUENT_FIELD_DEFAULT(bool, Force, false);
+};
+
+///
+/// @brief Options for @ref NYT::IClient::RemountTable
+/// Declares no fields of its own; only the tablet range inherited from TTabletOptions can be specified.
+/// @see https://yt.yandex-team.ru/docs/api/commands#remount_table
+struct TRemountTableOptions
+ : public TTabletOptions<TRemountTableOptions>
+{ };
+
+///
+/// @brief Options for @ref NYT::IClient::ReshardTable
+/// Declares no fields of its own; only the tablet range inherited from TTabletOptions can be specified.
+/// @see https://yt.yandex-team.ru/docs/api/commands#reshard_table
+struct TReshardTableOptions
+ : public TTabletOptions<TReshardTableOptions>
+{ };
+
+///
+/// @brief Options for @ref NYT::IClient::FreezeTable
+/// Declares no fields of its own; only the tablet range inherited from TTabletOptions can be specified.
+/// @see https://yt.yandex-team.ru/docs/api/commands#freeze_table
+struct TFreezeTableOptions
+ : public TTabletOptions<TFreezeTableOptions>
+{ };
+
+///
+/// @brief Options for @ref NYT::IClient::UnfreezeTable
+/// Declares no fields of its own; only the tablet range inherited from TTabletOptions can be specified.
+/// @see https://yt.yandex-team.ru/docs/api/commands#unfreeze_table
+struct TUnfreezeTableOptions
+ : public TTabletOptions<TUnfreezeTableOptions>
+{ };
+
+///
+/// @brief Options for @ref NYT::IClient::AlterTable
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands#alter_table
+struct TAlterTableOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TAlterTableOptions;
+ /// @endcond
+
+ /// Change the table schema.
+ FLUENT_FIELD_OPTION(TTableSchema, Schema);
+
+ /// Switch the table between static and dynamic mode.
+ FLUENT_FIELD_OPTION(bool, Dynamic);
+
+ ///
+ /// @brief Changes the id of the upstream replica on metacluster.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables
+ FLUENT_FIELD_OPTION(TReplicaId, UpstreamReplicaId);
+};
+
+///
+/// @brief Options for @ref NYT::IClient::LookupRows
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands#lookup_rows
+struct TLookupRowsOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TLookupRowsOptions;
+ /// @endcond
+
+ /// Timeout for operation.
+ FLUENT_FIELD_OPTION(TDuration, Timeout);
+
+ /// Column names to return.
+ FLUENT_FIELD_OPTION(TColumnNames, Columns);
+
+ ///
+ /// @brief Whether to return rows that were not found in table.
+ ///
+ /// If set to true the list returned by the LookupRows method will have the same
+ /// length as the list of keys. If a row is not found in the table the corresponding item in the list
+ /// will have null value.
+ FLUENT_FIELD_DEFAULT(bool, KeepMissingRows, false);
+
+ /// If set to true returned values will have the "timestamp" attribute.
+ FLUENT_FIELD_OPTION(bool, Versioned);
+};
+
+///
+/// @brief Options for @ref NYT::IClient::SelectRows
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands#select_rows
+struct TSelectRowsOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TSelectRowsOptions;
+ /// @endcond
+
+ /// Timeout for operation.
+ FLUENT_FIELD_OPTION(TDuration, Timeout);
+
+ ///
+ /// @brief Limit on the number of rows read by a single node.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii)
+ FLUENT_FIELD_OPTION(i64, InputRowLimit);
+
+ ///
+ /// @brief Limit on the number of output rows on a single cluster node.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii)
+ FLUENT_FIELD_OPTION(i64, OutputRowLimit);
+
+ ///
+ /// @brief Maximum number of row ranges derived from the WHERE clause.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii)
+ FLUENT_FIELD_DEFAULT(ui64, RangeExpansionLimit, 1000);
+
+ ///
+ /// @brief Whether to fail if InputRowLimit or OutputRowLimit is exceeded.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii)
+ FLUENT_FIELD_DEFAULT(bool, FailOnIncompleteResult, true);
+
+ /// @brief Enable verbose logging on the server side.
+ FLUENT_FIELD_DEFAULT(bool, VerboseLogging, false);
+
+ FLUENT_FIELD_DEFAULT(bool, EnableCodeCache, true); // NOTE(review): undocumented; presumably toggles caching of compiled query code -- confirm
+};
+
+/// Options for @ref NYT::CreateClient.
+struct TCreateClientOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TCreateClientOptions;
+ /// @endcond
+
+ /// @brief Impersonated user name.
+ ///
+ /// If the authenticated user is allowed to impersonate other YT users (e.g. yql_agent), this field may be used to override the user name.
+ FLUENT_FIELD_OPTION(TString, ImpersonationUser);
+
+ /// @brief User token.
+ ///
+ /// @see NYT::TCreateClientOptions::TokenPath
+ FLUENT_FIELD(TString, Token);
+
+ /// @brief Path to the file where user token is stored.
+ ///
+ /// Token is looked up in these places in the following order:
+ /// - @ref NYT::TCreateClientOptions::Token
+ /// - @ref NYT::TCreateClientOptions::TokenPath
+ /// - `TConfig::Get()->Token` option.
+ /// - `YT_TOKEN` environment variable
+ /// - `YT_SECURE_VAULT_YT_TOKEN` environment variable
+ /// - File specified in `YT_TOKEN_PATH` environment variable
+ /// - `$HOME/.yt/token` file.
+ FLUENT_FIELD(TString, TokenPath);
+
+ /// @brief TVM service ticket producer.
+ ///
+ /// We store a wrapper of NYT::TIntrusivePtr here (not a NYT::TIntrusivePtr),
+ /// because otherwise other projects will have build problems
+ /// because of visibility of two different `TIntrusivePtr`-s (::TIntrusivePtr and NYT::TIntrusivePtr).
+ ///
+ /// @see NYT::NAuth::TServiceTicketClientAuth
+ /// @{
+ NAuth::IServiceTicketAuthPtrWrapperPtr ServiceTicketAuth_ = nullptr;
+ TSelf& ServiceTicketAuth(const NAuth::IServiceTicketAuthPtrWrapper& wrapper);
+ /// @}
+
+ /// @brief Use tvm-only endpoints in cluster connection.
+ FLUENT_FIELD_DEFAULT(bool, TvmOnly, false);
+
+ /// @brief Use HTTPS (always uses the HTTP client from yt/yt/core).
+ ///
+ /// @see UseCoreHttpClient
+ FLUENT_FIELD_DEFAULT(bool, UseTLS, false);
+
+ /// @brief Use the HTTP client from yt/yt/core.
+ FLUENT_FIELD_DEFAULT(bool, UseCoreHttpClient, false);
+
+ ///
+ /// @brief RetryConfig provider allows fine-tuning request retries.
+ ///
+ /// E.g. set total timeout for all retries.
+ FLUENT_FIELD_DEFAULT(IRetryConfigProviderPtr, RetryConfigProvider, nullptr);
+
+ /// @brief Override global config for the client.
+ ///
+ /// The config contains implementation parameters such as connection timeouts,
+ /// access token, api version and more.
+ /// @see NYT::TConfig
+ FLUENT_FIELD_DEFAULT(TConfigPtr, Config, nullptr);
+};
+
+///
+/// @brief Options for @ref NYT::IBatchRequest::ExecuteBatch
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands#execute_batch
+struct TExecuteBatchOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TExecuteBatchOptions;
+ /// @endcond
+
+ ///
+ /// @brief How many requests will be executed in parallel on the cluster.
+ ///
+ /// This parameter can be used to avoid RequestLimitExceeded errors.
+ FLUENT_FIELD_OPTION(ui64, Concurrency);
+
+ ///
+ /// @brief Maximum size of batch sent in one request to server.
+ ///
+ /// Huge batches are executed using multiple requests.
+ /// BatchPartMaxSize is the maximum size of a single request that goes to the server.
+ /// If not specified, it is set to `Concurrency * 5`.
+ FLUENT_FIELD_OPTION(ui64, BatchPartMaxSize);
+};
+
+///
+/// @brief Durability mode.
+///
+/// @see NYT::TTabletTransactionOptions::Durability
+/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#sohrannost
+enum class EDurability
+{
+ /// Sync mode (default).
+ Sync /* "sync" */,
+
+ /// Async mode (might reduce latency of write requests, but is less reliable).
+ Async /* "async" */,
+};
+
+///
+/// @brief Atomicity mode.
+///
+/// @see NYT::TTabletTransactionOptions::Atomicity
+/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#sohrannost
+enum class EAtomicity
+{
+ /// Transactions are non-atomic (might reduce latency of write requests).
+ None /* "none" */,
+
+ /// Transactions are atomic (default).
+ Full /* "full" */,
+};
+
+///
+/// @brief Table replica mode.
+/// @see NYT::TAlterTableReplicaOptions::Mode
+/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables#atributy
+enum class ETableReplicaMode
+{
+ Sync /* "sync" */,
+ Async /* "async" */,
+};
+
+/// Base class for options dealing with I/O to dynamic tables.
+template <typename TDerived>
+struct TTabletTransactionOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived; // Fluent setters return the derived options type.
+ /// @endcond
+
+ ///
+ /// @brief Atomicity mode of operation
+ ///
+ /// Setting to NYT::EAtomicity::None may improve latency of operations
+ /// at the cost of weakening contracts.
+ ///
+ /// @note Use with care.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#oslablenie-garantij
+ FLUENT_FIELD_OPTION(EAtomicity, Atomicity);
+
+ ///
+ /// @brief Durability mode of operation
+ ///
+ /// Setting to NYT::EDurability::Async may improve latency of operations
+ /// at the cost of weakening contracts.
+ ///
+ /// @note Use with care.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#oslablenie-garantij
+ FLUENT_FIELD_OPTION(EDurability, Durability);
+};
+
+///
+/// @brief Options for NYT::IClient::InsertRows
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#insert_rows
+struct TInsertRowsOptions
+ : public TTabletTransactionOptions<TInsertRowsOptions>
+{
+ ///
+ /// @brief Whether to overwrite missing columns with nulls.
+ ///
+ /// By default all columns missing in input data are set to Null and overwrite currently stored value.
+ /// If `Update` is set to true, the currently stored value will not be overwritten for columns that are missing in input data.
+ FLUENT_FIELD_OPTION(bool, Update);
+
+ ///
+ /// @brief Whether to overwrite or aggregate aggregated columns.
+ ///
+ /// Used with aggregating columns.
+ /// By default value in aggregating column will be overwritten.
+ /// If `Aggregate` is set to true, the row will be considered a delta and it will be aggregated with the currently stored value.
+ FLUENT_FIELD_OPTION(bool, Aggregate);
+
+ ///
+ /// @brief Whether to fail when inserting to table without sync replica.
+ ///
+ /// Used for insert operation for tables without sync replica.
+ /// https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables#write
+ /// NOTE(review): this text used to say "Default value is 'false'" while also stating that insertion into a table without sync replicas fails by default; the described failure matches a default of 'true' — confirm against the server documentation.
+ FLUENT_FIELD_OPTION(bool, RequireSyncReplica);
+};
+
+///
+/// @brief Options for NYT::IClient::DeleteRows
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#delete_rows
+struct TDeleteRowsOptions
+ : public TTabletTransactionOptions<TDeleteRowsOptions>
+{
+ ///
+ /// @brief Whether to fail when deleting from table without sync replica.
+ ///
+ /// Used for delete operation for tables without sync replica.
+ /// https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables#write
+ /// NOTE(review): this text used to say "Default value is 'false'" while also stating that deletion from a table without sync replicas fails by default; the described failure matches a default of 'true' — confirm against the server documentation.
+ FLUENT_FIELD_OPTION(bool, RequireSyncReplica);
+};
+
+///
+/// @brief Options for NYT::IClient::TrimRows
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#trim_rows
+struct TTrimRowsOptions
+ : public TTabletTransactionOptions<TTrimRowsOptions>
+{ }; // Adds nothing beyond the Atomicity/Durability options inherited from the base.
+
+/// @brief Options for NYT::IClient::AlterTableReplica
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#alter_table_replica
+/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables
+struct TAlterTableReplicaOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TAlterTableReplicaOptions;
+ /// @endcond
+
+ ///
+ /// @brief Whether to enable or disable replica.
+ ///
+ /// Doesn't change state of replica if `Enabled` is not set.
+ FLUENT_FIELD_OPTION(bool, Enabled);
+
+ ///
+ /// @brief Change replica mode.
+ ///
+ /// Doesn't change replica mode if `Mode` is not set.
+ FLUENT_FIELD_OPTION(ETableReplicaMode, Mode);
+};
+
+///
+/// @brief Options for @ref NYT::IClient::GetFileFromCache
+///
+/// @note The option set is empty for now, but options might appear in the future.
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#get_file_from_cache
+struct TGetFileFromCacheOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TGetFileFromCacheOptions;
+ /// @endcond
+};
+
+///
+/// @brief Options for @ref NYT::IClient::PutFileToCache
+///
+/// @note Currently the only option is `PreserveExpirationTimeout`.
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#put_file_to_cache
+struct TPutFileToCacheOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TPutFileToCacheOptions;
+ /// @endcond
+
+ /// Whether to preserve `expiration_timeout` attribute of source node.
+ FLUENT_FIELD_OPTION(bool, PreserveExpirationTimeout);
+};
+
+///
+/// @brief Type of permission used in ACL.
+///
+/// @see https://yt.yandex-team.ru/docs/description/common/access_control
+enum class EPermission : int
+{
+ /// Applies to: all objects.
+ Read /* "read" */,
+
+ /// Applies to: all objects.
+ Write /* "write" */,
+
+ /// Applies to: accounts / pools.
+ Use /* "use" */,
+
+ /// Applies to: all objects.
+ Administer /* "administer" */,
+
+ /// Applies to: schemas.
+ Create /* "create" */,
+
+ /// Applies to: all objects.
+ Remove /* "remove" */,
+
+ /// Applies to: tables.
+ Mount /* "mount" */,
+
+ /// Applies to: operations.
+ Manage /* "manage" */,
+};
+
+/// @brief Whether permission is granted or denied.
+enum class ESecurityAction : int
+{
+ /// Permission is granted.
+ Allow /* "allow" */,
+
+ /// Permission is denied.
+ Deny /* "deny" */,
+};
+
+///
+/// @brief Options for @ref NYT::IClient::CheckPermission
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#check_permission
+struct TCheckPermissionOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TCheckPermissionOptions;
+ /// @endcond
+
+ /// Columns to check the permission for (tables only).
+ FLUENT_VECTOR_FIELD(TString, Column);
+};
+
+///
+/// @brief Columnar statistics fetching mode.
+///
+/// @see NYT::TGetTableColumnarStatisticsOptions::FetcherMode
+enum class EColumnarStatisticsFetcherMode
+{
+ /// Slow mode for fetching precise columnar statistics.
+ FromNodes /* "from_nodes" */,
+
+ ///
+ /// @brief Fast mode for fetching lightweight columnar statistics.
+ ///
+ /// Relative precision is 1 / 256.
+ ///
+ /// @note Might be unavailable for old tables; in that case some upper bound is returned.
+ FromMaster /* "from_master" */,
+
+ /// Use lightweight columnar statistics (FromMaster) if available, otherwise switch to the slow but precise mode (FromNodes).
+ Fallback /* "fallback" */,
+};
+
+///
+/// @brief Options for @ref NYT::IClient::GetTableColumnarStatistics
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#get_table_columnar_statistics
+struct TGetTableColumnarStatisticsOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TGetTableColumnarStatisticsOptions;
+ /// @endcond
+
+ ///
+ /// @brief Mode of statistics fetching.
+ ///
+ /// @see NYT::EColumnarStatisticsFetcherMode
+ FLUENT_FIELD_OPTION(EColumnarStatisticsFetcherMode, FetcherMode);
+};
+
+///
+/// @brief Table partitioning mode.
+///
+/// @see NYT::TGetTablePartitionsOptions::PartitionMode
+enum class ETablePartitionMode
+{
+ ///
+ /// @brief Ignores the order of input tables and their chunk and sorting orders.
+ ///
+ Unordered /* "unordered" */,
+
+ ///
+ /// @brief The order of table ranges inside each partition obeys the order of input tables and their chunk orders.
+ ///
+ Ordered /* "ordered" */,
+};
+
+///
+/// @brief Options for @ref NYT::IClient::GetTablePartitions
+///
+struct TGetTablePartitionsOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TGetTablePartitionsOptions;
+ /// @endcond
+
+ ///
+ /// @brief Table partitioning mode.
+ /// @note Declared with FLUENT_FIELD, which adds no initializer: set it explicitly.
+ /// @see NYT::ETablePartitionMode
+ FLUENT_FIELD(ETablePartitionMode, PartitionMode);
+
+ ///
+ /// @brief Approximate data weight of each output partition.
+ /// @note Declared with FLUENT_FIELD, which adds no initializer: set it explicitly.
+ FLUENT_FIELD(i64, DataWeightPerPartition);
+
+ ///
+ /// @brief Maximum output partition count.
+ ///
+ /// Consider the situation when the `MaxPartitionCount` is given
+ /// and the total data weight exceeds `MaxPartitionCount * DataWeightPerPartition`.
+ /// If `AdjustDataWeightPerPartition` is |true|
+ /// `GetTablePartitions` will yield partitions exceeding the `DataWeightPerPartition`.
+ /// If `AdjustDataWeightPerPartition` is |false|
+ /// the partitioning will be aborted as soon as the output partition count exceeds this limit.
+ FLUENT_FIELD_OPTION(int, MaxPartitionCount);
+
+ ///
+ /// @brief Allow the data weight per partition to exceed `DataWeightPerPartition` when `MaxPartitionCount` is set.
+ ///
+ /// |True| by default.
+ FLUENT_FIELD_DEFAULT(bool, AdjustDataWeightPerPartition, true);
+};
+
+///
+/// @brief Options for @ref NYT::IClient::GetTabletInfos
+///
+/// @note The option set is empty for now, but options might appear in the future.
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#get_tablet_infos
+struct TGetTabletInfosOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TGetTabletInfosOptions;
+ /// @endcond
+};
+
+/// @brief Options for @ref NYT::IClient::SkyShareTable
+struct TSkyShareTableOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TSkyShareTableOptions;
+ /// @endcond
+
+ ///
+ /// @brief Key columns that are used to group files in a table into torrents.
+ ///
+ /// One torrent is created for each value of `KeyColumns` columns.
+ /// If not specified, all files go into a single torrent.
+ FLUENT_FIELD_OPTION(TColumnNames, KeyColumns);
+
+ /// @brief Allow skynet manager to return fastbone links to skynet (see YT-11437).
+ FLUENT_FIELD_OPTION(bool, EnableFastbone);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/common.cpp b/yt/cpp/mapreduce/interface/common.cpp
new file mode 100644
index 0000000000..f6d60127ce
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/common.cpp
@@ -0,0 +1,664 @@
+#include "common.h"
+
+#include "errors.h"
+#include "format.h"
+#include "serialize.h"
+#include "fluent.h"
+
+#include <yt/yt_proto/yt/formats/extension.pb.h>
+
+#include <library/cpp/yson/node/node_builder.h>
+#include <library/cpp/yson/node/node_io.h>
+#include <library/cpp/type_info/type.h>
+
+#include <util/generic/xrange.h>
+
+namespace NYT {
+
+using ::google::protobuf::FieldDescriptor;
+using ::google::protobuf::Descriptor;
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Primary constructor: stores the column name together with its sort order.
+TSortColumn::TSortColumn(TStringBuf name, ESortOrder sortOrder)
+    : Name_(name)
+    , SortOrder_(sortOrder)
+{ }
+
+// Convenience overload: views the TString as a TStringBuf and delegates.
+TSortColumn::TSortColumn(const TString& name, ESortOrder sortOrder)
+    : TSortColumn(TStringBuf(name), sortOrder)
+{ }
+
+// Convenience overload: views the C string as a TStringBuf and delegates.
+TSortColumn::TSortColumn(const char* name, ESortOrder sortOrder)
+    : TSortColumn(TStringBuf(name), sortOrder)
+{ }
+
+const TSortColumn& TSortColumn::EnsureAscending() const
+{
+ Y_ENSURE(SortOrder() == ESortOrder::SO_ASCENDING); // Fails (via Y_ENSURE) for any non-ascending sort order.
+ return *this; // Returned to allow call chaining.
+}
+
+TNode TSortColumn::ToNode() const
+{
+ return BuildYsonNodeFluently().Value(*this); // Built through the fluent YSON node builder.
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Below lie backward compatibility methods.
+////////////////////////////////////////////////////////////////////////////////
+
+// Legacy name-only assignment: allowed only while the column is ascending,
+// because the sort order itself is left untouched.
+TSortColumn& TSortColumn::operator = (TStringBuf name)
+{
+    EnsureAscending();
+    Name_ = name;
+    return *this;
+}
+
+// Forwards to the TStringBuf overload.
+TSortColumn& TSortColumn::operator = (const TString& name)
+{
+    const TStringBuf nameView(name);
+    return operator=(nameView);
+}
+
+// Forwards to the TStringBuf overload.
+TSortColumn& TSortColumn::operator = (const char* name)
+{
+    const TStringBuf nameView(name);
+    return operator=(nameView);
+}
+
+bool TSortColumn::operator == (TStringBuf rhsName) const
+{
+ EnsureAscending(); // Name-only comparison is permitted only for ascending columns; otherwise this fails.
+ return Name_ == rhsName;
+}
+
+bool TSortColumn::operator != (TStringBuf rhsName) const
+{
+ return !(*this == rhsName); // Negation of operator==, including its ascending check.
+}
+
+bool TSortColumn::operator == (const TString& rhsName) const
+{
+ return *this == static_cast<TStringBuf>(rhsName); // Delegates to the TStringBuf overload.
+}
+
+bool TSortColumn::operator != (const TString& rhsName) const
+{
+ return !(*this == rhsName);
+}
+
+bool TSortColumn::operator == (const char* rhsName) const
+{
+ return *this == static_cast<TStringBuf>(rhsName); // Delegates to the TStringBuf overload.
+}
+
+bool TSortColumn::operator != (const char* rhsName) const
+{
+ return !(*this == rhsName);
+}
+
+TSortColumn::operator TStringBuf() const
+{
+ EnsureAscending(); // Converting to a bare name drops the sort order, so only ascending columns are allowed.
+ return Name_;
+}
+
+TSortColumn::operator TString() const
+{
+ return TString(static_cast<TStringBuf>(*this)); // Reuses the TStringBuf conversion, including its ascending check.
+}
+
+TSortColumn::operator std::string() const
+{
+ EnsureAscending(); // Same restriction as the other name conversions.
+ return static_cast<std::string>(Name_);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Creates an empty list of sort columns.
+TSortColumns::TSortColumns()
+{ }
+
+// Creates one sort column per name; each column is built from its name alone.
+TSortColumns::TSortColumns(const TVector<TString>& names)
+{
+    Parts_.clear();
+    Parts_.reserve(names.size());
+    for (const auto& columnName : names) {
+        Parts_.emplace_back(columnName);
+    }
+}
+
+// Builds sort columns from a plain list of column names.
+TSortColumns::TSortColumns(const TColumnNames& names)
+    : TSortColumns(names.Parts_)
+{ }
+
+// Legacy conversion to plain names; fails unless every column is ascending.
+TSortColumns::operator TColumnNames() const
+{
+    EnsureAscending();
+    return TColumnNames(GetNames());
+}
+
+// Checks every column with TSortColumn::EnsureAscending; returns *this for chaining.
+const TSortColumns& TSortColumns::EnsureAscending() const
+{
+    for (const auto& part : Parts_) {
+        part.EnsureAscending();
+    }
+    return *this;
+}
+
+// Returns the column names in their current order, without sort orders.
+TVector<TString> TSortColumns::GetNames() const
+{
+    TVector<TString> result;
+    result.reserve(Parts_.size());
+    for (const auto& part : Parts_) {
+        result.emplace_back(part.Name());
+    }
+    return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Maps a legacy EValueType to the corresponding type_v3 (NTi) type.
+//
+// The result is the bare (non-optional) type; optionality is applied by the caller.
+// Throws yexception if `type` holds a value that is not one of the enumerators
+// (e.g. after a bad cast), instead of falling off the end of the function.
+static NTi::TTypePtr OldTypeToTypeV3(EValueType type)
+{
+    switch (type) {
+        case VT_INT64:
+            return NTi::Int64();
+        case VT_UINT64:
+            return NTi::Uint64();
+
+        case VT_DOUBLE:
+            return NTi::Double();
+
+        case VT_BOOLEAN:
+            return NTi::Bool();
+
+        case VT_STRING:
+            return NTi::String();
+
+        case VT_ANY:
+            return NTi::Yson();
+
+        case VT_INT8:
+            return NTi::Int8();
+        case VT_INT16:
+            return NTi::Int16();
+        case VT_INT32:
+            return NTi::Int32();
+
+        case VT_UINT8:
+            return NTi::Uint8();
+        case VT_UINT16:
+            return NTi::Uint16();
+        case VT_UINT32:
+            return NTi::Uint32();
+
+        case VT_UTF8:
+            return NTi::Utf8();
+
+        case VT_NULL:
+            return NTi::Null();
+
+        case VT_VOID:
+            return NTi::Void();
+
+        case VT_DATE:
+            return NTi::Date();
+        case VT_DATETIME:
+            return NTi::Datetime();
+        case VT_TIMESTAMP:
+            return NTi::Timestamp();
+        case VT_INTERVAL:
+            return NTi::Interval();
+
+        case VT_FLOAT:
+            return NTi::Float();
+        case VT_JSON:
+            return NTi::Json();
+    }
+    // No `default:` above, so the compiler can still warn about unhandled enumerators.
+    // Same message as NDetail::ToString(EValueType) uses for out-of-range values.
+    ythrow yexception() << "Invalid value type " << static_cast<int>(type);
+}
+
+// Reduces a type_v3 type to the legacy (EValueType, required) pair.
+//
+// Throws TApiUsageError for types that have no legacy representation.
+static std::pair<EValueType, bool> Simplify(const NTi::TTypePtr& type)
+{
+    using namespace NTi;
+    const auto typeName = type->GetTypeName();
+    switch (typeName) {
+        // Primitive types with a direct legacy counterpart.
+        case ETypeName::Bool:
+            return {VT_BOOLEAN, true};
+        case ETypeName::Int8:
+            return {VT_INT8, true};
+        case ETypeName::Int16:
+            return {VT_INT16, true};
+        case ETypeName::Int32:
+            return {VT_INT32, true};
+        case ETypeName::Int64:
+            return {VT_INT64, true};
+        case ETypeName::Uint8:
+            return {VT_UINT8, true};
+        case ETypeName::Uint16:
+            return {VT_UINT16, true};
+        case ETypeName::Uint32:
+            return {VT_UINT32, true};
+        case ETypeName::Uint64:
+            return {VT_UINT64, true};
+        case ETypeName::Float:
+            return {VT_FLOAT, true};
+        case ETypeName::Double:
+            return {VT_DOUBLE, true};
+        case ETypeName::String:
+            return {VT_STRING, true};
+        case ETypeName::Utf8:
+            return {VT_UTF8, true};
+        case ETypeName::Date:
+            return {VT_DATE, true};
+        case ETypeName::Datetime:
+            return {VT_DATETIME, true};
+        case ETypeName::Timestamp:
+            return {VT_TIMESTAMP, true};
+        case ETypeName::Interval:
+            return {VT_INTERVAL, true};
+        case ETypeName::Json:
+            return {VT_JSON, true};
+        case ETypeName::Yson:
+            return {VT_ANY, true};
+
+        // Decimal is reported as a string.
+        case ETypeName::Decimal:
+            return {VT_STRING, true};
+
+        // Types reported as non-required.
+        case ETypeName::Void:
+            return {VT_VOID, false};
+        case ETypeName::Null:
+            return {VT_NULL, false};
+
+        // Composite types collapse to `any`.
+        case ETypeName::List:
+        case ETypeName::Dict:
+        case ETypeName::Struct:
+        case ETypeName::Tuple:
+        case ETypeName::Variant:
+            return {VT_ANY, true};
+
+        // A tag is transparent: simplify the tagged type itself.
+        case ETypeName::Tagged:
+            return Simplify(type->AsTagged()->GetItemType());
+
+        case ETypeName::Optional: {
+            auto itemType = type->AsOptional()->GetItemType();
+            if (!itemType->IsPrimitive()) {
+                return {VT_ANY, false};
+            }
+            auto inner = Simplify(itemType->AsPrimitive());
+            if (!inner.second) {
+                // The item is itself non-required, so the optional collapses to `any`.
+                return {VT_ANY, false};
+            }
+            inner.second = false;
+            return inner;
+        }
+
+        // No legacy representation: fall through to the error below.
+        case ETypeName::TzDate:
+        case ETypeName::TzDatetime:
+        case ETypeName::TzTimestamp:
+        case ETypeName::Uuid:
+            break;
+    }
+    ythrow TApiUsageError() << "Unsupported type: " << typeName;
+}
+
+// Converts a legacy (type, required) pair to a type_v3 type.
+//
+// Types that Simplify() reports as non-required (e.g. null, void) are returned as is
+// and cannot be requested as required (TApiUsageError). Any other type is wrapped
+// into Optional unless `required` is set.
+NTi::TTypePtr ToTypeV3(EValueType type, bool required)
+{
+    auto typeV3 = OldTypeToTypeV3(type);
+    const bool mayBeRequired = Simplify(typeV3).second;
+
+    if (required) {
+        if (!mayBeRequired) {
+            ythrow TApiUsageError() << "type: " << type << " cannot be required";
+        }
+        return typeV3;
+    }
+
+    if (mayBeRequired) {
+        return NTi::Optional(typeV3);
+    }
+    return typeV3;
+}
+
+// A default-constructed column has type optional<int64>.
+TColumnSchema::TColumnSchema()
+    : TypeV3_(NTi::Optional(NTi::Int64()))
+{ }
+
+// Legacy view of the column type (first component of Simplify()).
+EValueType TColumnSchema::Type() const
+{
+    return Simplify(TypeV3_).first;
+}
+
+// Sets the type from a legacy value type; the column becomes non-required.
+TColumnSchema& TColumnSchema::Type(EValueType type) &
+{
+    return Type(ToTypeV3(type, false));
+}
+
+// Rvalue overload: reuses the lvalue setter and moves the object out
+// instead of copying it (same pattern as the TTableSchema `&&` overloads).
+TColumnSchema TColumnSchema::Type(EValueType type) &&
+{
+    return std::move(Type(ToTypeV3(type, false)));
+}
+
+// Sets the type_v3 type; a null pointer is a programming error (Y_VERIFY).
+TColumnSchema& TColumnSchema::Type(const NTi::TTypePtr& type) &
+{
+    Y_VERIFY(type.Get(), "Cannot create column schema with nullptr type");
+    TypeV3_ = type;
+    return *this;
+}
+
+// Rvalue overload: the null check lives in the lvalue setter it delegates to.
+TColumnSchema TColumnSchema::Type(const NTi::TTypePtr& type) &&
+{
+    return std::move(Type(type));
+}
+
+// Alias of Type(const NTi::TTypePtr&).
+TColumnSchema& TColumnSchema::TypeV3(const NTi::TTypePtr& type) &
+{
+    return Type(type);
+}
+
+// Alias of Type(const NTi::TTypePtr&) for rvalues.
+TColumnSchema TColumnSchema::TypeV3(const NTi::TTypePtr& type) &&
+{
+    return std::move(Type(type));
+}
+
+// Returns the stored type_v3 type.
+NTi::TTypePtr TColumnSchema::TypeV3() const
+{
+    return TypeV3_;
+}
+
+// Legacy `required` flag (second component of Simplify()).
+bool TColumnSchema::Required() const
+{
+    return Simplify(TypeV3_).second;
+}
+
+// Sets the type from a legacy (type, required) pair; see ToTypeV3 for the failure cases.
+TColumnSchema& TColumnSchema::Type(EValueType type, bool required) &
+{
+    return Type(ToTypeV3(type, required));
+}
+
+// Rvalue overload of the (type, required) setter.
+TColumnSchema TColumnSchema::Type(EValueType type, bool required) &&
+{
+    return std::move(Type(ToTypeV3(type, required)));
+}
+
+// Field-wise equality of two column schemas; types are compared strictly.
+bool operator==(const TColumnSchema& lhs, const TColumnSchema& rhs)
+{
+    if (!(lhs.Name() == rhs.Name())) {
+        return false;
+    }
+    if (!NTi::NEq::TStrictlyEqual()(lhs.TypeV3(), rhs.TypeV3())) {
+        return false;
+    }
+    if (!(lhs.SortOrder() == rhs.SortOrder())) {
+        return false;
+    }
+    if (!(lhs.Lock() == rhs.Lock())) {
+        return false;
+    }
+    if (!(lhs.Expression() == rhs.Expression())) {
+        return false;
+    }
+    if (!(lhs.Aggregate() == rhs.Aggregate())) {
+        return false;
+    }
+    return lhs.Group() == rhs.Group();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+bool TTableSchema::Empty() const
+{
+ return Columns_.empty(); // True when the schema has no columns.
+}
+
+// Appends a column with the given name and legacy type.
+TTableSchema& TTableSchema::AddColumn(const TString& name, EValueType type) &
+{
+    TColumnSchema column;
+    column.Name(name);
+    column.Type(type);
+    Columns_.push_back(std::move(column));
+    return *this;
+}
+
+// Rvalue overload: appends the column and moves the schema out.
+TTableSchema TTableSchema::AddColumn(const TString& name, EValueType type) &&
+{
+    AddColumn(name, type);
+    return std::move(*this);
+}
+
+// Appends a column with the given name, legacy type and sort order.
+TTableSchema& TTableSchema::AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) &
+{
+    TColumnSchema column;
+    column.Name(name);
+    column.Type(type);
+    column.SortOrder(sortOrder);
+    Columns_.push_back(std::move(column));
+    return *this;
+}
+
+// Rvalue overload: appends the column and moves the schema out.
+TTableSchema TTableSchema::AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) &&
+{
+    AddColumn(name, type, sortOrder);
+    return std::move(*this);
+}
+
+// Appends a column with the given name and type_v3 type.
+TTableSchema& TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type) &
+{
+    TColumnSchema column;
+    column.Name(name);
+    column.Type(type);
+    Columns_.push_back(std::move(column));
+    return *this;
+}
+
+// Rvalue overload: appends the column and moves the schema out.
+TTableSchema TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type) &&
+{
+    AddColumn(name, type);
+    return std::move(*this);
+}
+
+// Appends a column with the given name, type_v3 type and sort order.
+TTableSchema& TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) &
+{
+    TColumnSchema column;
+    column.Name(name);
+    column.Type(type);
+    column.SortOrder(sortOrder);
+    Columns_.push_back(std::move(column));
+    return *this;
+}
+
+// Rvalue overload: appends the column and moves the schema out.
+TTableSchema TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) &&
+{
+    AddColumn(name, type, sortOrder);
+    return std::move(*this);
+}
+
+// Reorders the schema so that `sortColumns` come first (in the given order, with the
+// given sort orders), followed by all remaining columns in their previous relative
+// order with their sort order reset.
+//
+// Fails (Y_ENSURE) if there are more sort columns than schema columns, if a sort
+// column name repeats, or if a sort column is missing from the schema.
+// All checks run before any column is modified or moved, so a failed call leaves
+// the schema unchanged.
+TTableSchema& TTableSchema::SortBy(const TSortColumns& sortColumns) &
+{
+    Y_ENSURE(sortColumns.Parts_.size() <= Columns_.size());
+
+    // Sort column name -> its position in the resulting key.
+    THashMap<TString, ui64> sortColumnIndex;
+    for (auto i: xrange(sortColumns.Parts_.size())) {
+        Y_ENSURE(sortColumnIndex.emplace(sortColumns.Parts_[i].Name(), i).second,
+            "Key column name '" << sortColumns.Parts_[i].Name() << "' repeats in columns list");
+    }
+
+    // Pass 1 (read-only): decide the destination of every schema column.
+    // Each sort column is matched by the first schema column with that name.
+    TVector<TMaybe<ui64>> keyPosition(Columns_.size());
+    for (auto columnIndex : xrange(Columns_.size())) {
+        auto it = sortColumnIndex.find(Columns_[columnIndex].Name());
+        if (it != sortColumnIndex.end()) {
+            keyPosition[columnIndex] = it->second;
+            sortColumnIndex.erase(it);
+        }
+    }
+
+    // Whatever is left in the map was requested but does not exist in the schema.
+    Y_ENSURE(sortColumnIndex.empty(), "Column name '" << sortColumnIndex.begin()->first
+        << "' not found in table schema");
+
+    // Pass 2: move columns into place. Key slots are pre-created at the front;
+    // non-key columns are appended after them.
+    TVector<TColumnSchema> newColumns(sortColumns.Parts_.size());
+    newColumns.reserve(Columns_.size());
+    for (auto columnIndex : xrange(Columns_.size())) {
+        auto& column = Columns_[columnIndex];
+        const auto& position = keyPosition[columnIndex];
+        if (position.Defined()) {
+            column.SortOrder(sortColumns.Parts_[*position].SortOrder());
+            newColumns[*position] = std::move(column);
+        } else {
+            column.ResetSortOrder();
+            newColumns.push_back(std::move(column));
+        }
+    }
+    Columns_ = std::move(newColumns);
+
+    return *this;
+}
+
+// Rvalue overload: sorts in place and moves the schema out.
+TTableSchema TTableSchema::SortBy(const TSortColumns& sortColumns) &&
+{
+    return std::move(SortBy(sortColumns));
+}
+
+// Gives direct mutable access to the column list.
+TVector<TColumnSchema>& TTableSchema::MutableColumns()
+{
+    return Columns_;
+}
+
+// Serializes the schema into a TNode.
+TNode TTableSchema::ToNode() const
+{
+    TNode schemaNode;
+    TNodeBuilder nodeBuilder(&schemaNode);
+    Serialize(*this, &nodeBuilder);
+    return schemaNode;
+}
+
+// Builds a schema by deserializing the given TNode.
+TTableSchema TTableSchema::FromNode(const TNode& node)
+{
+    TTableSchema parsed;
+    Deserialize(parsed, node);
+    return parsed;
+}
+
+// Two table schemas are equal when their columns, strictness and unique-keys flag match.
+bool operator==(const TTableSchema& lhs, const TTableSchema& rhs)
+{
+    if (!(lhs.Columns() == rhs.Columns())) {
+        return false;
+    }
+    if (!(lhs.Strict() == rhs.Strict())) {
+        return false;
+    }
+    return lhs.UniqueKeys() == rhs.UniqueKeys();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+TKeyBound::TKeyBound(ERelation relation, TKey key)
+ : Relation_(relation)
+ , Key_(std::move(key)) // `key` is taken by value and moved into the member.
+{ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Builds a table schema from a protobuf message descriptor and, when `sortColumns`
+// is non-empty, sorts the schema by those columns (see TTableSchema::SortBy).
+TTableSchema CreateTableSchema(
+    const Descriptor& messageDescriptor,
+    const TSortColumns& sortColumns,
+    bool keepFieldsWithoutExtension)
+{
+    auto result = CreateTableSchema(messageDescriptor, keepFieldsWithoutExtension);
+    if (!sortColumns.Parts_.empty()) {
+        // Pass the object itself: passing `Parts_` would build a temporary TSortColumns copy.
+        result.SortBy(sortColumns);
+    }
+    return result;
+}
+
+// Builds a table schema from a type_v3 type by rendering it as a YT schema in YSON
+// and deserializing the result. A null type is a programming error (Y_VERIFY).
+TTableSchema CreateTableSchema(NTi::TTypePtr type)
+{
+    Y_VERIFY(type);
+    const auto schemaNode = NodeFromYsonString(NTi::NIo::AsYtSchema(type.Get()));
+    TTableSchema parsed;
+    Deserialize(parsed, schemaNode);
+    return parsed;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// A read limit is trivial when none of its selectors is set.
+bool IsTrivial(const TReadLimit& readLimit)
+{
+    const bool noKey = !readLimit.Key_;
+    const bool noRowIndex = !readLimit.RowIndex_;
+    const bool noOffset = !readLimit.Offset_;
+    const bool noTabletIndex = !readLimit.TabletIndex_;
+    const bool noKeyBound = !readLimit.KeyBound_;
+    return noKey && noRowIndex && noOffset && noTabletIndex && noKeyBound;
+}
+
+// Maps a scalar TNode type to the matching legacy value type.
+// Throws yexception for node types without a counterpart.
+EValueType NodeTypeToValueType(TNode::EType nodeType)
+{
+    switch (nodeType) {
+        case TNode::EType::Int64:
+            return VT_INT64;
+        case TNode::EType::Uint64:
+            return VT_UINT64;
+        case TNode::EType::String:
+            return VT_STRING;
+        case TNode::EType::Double:
+            return VT_DOUBLE;
+        case TNode::EType::Bool:
+            return VT_BOOLEAN;
+        default:
+            break;
+    }
+    ythrow yexception() << "Cannot convert TNode type " << nodeType << " to EValueType";
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Returns the ranges of `path`; a path with no range list at all yields an empty vector.
+// Throws TApiUsageError when the range list is set but empty.
+const TVector<TReadRange>& GetRangesCompat(const TRichYPath& path)
+{
+    static const TVector<TReadRange> noRanges;
+
+    const auto& maybeRanges = path.GetRanges();
+    if (maybeRanges.Empty()) {
+        return noRanges;
+    }
+
+    if (maybeRanges->size() == 0) {
+        // If you see this exception, the caller of this function doesn't know what to do
+        // with a RichYPath whose range list is set but empty.
+        //
+        // To avoid this exception the caller must handle that case explicitly.
+        // NB. YT-17683
+        ythrow TApiUsageError() << "Unsupported RichYPath: explicitly empty range list";
+    }
+
+    return *maybeRanges;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace NDetail {
+
+////////////////////////////////////////////////////////////////////////////////
+
+TString ToString(EValueType type) // Returns the lowercase textual name of a legacy value type.
+{
+ switch (type) {
+ case VT_INT8:
+ return "int8";
+ case VT_INT16:
+ return "int16";
+ case VT_INT32:
+ return "int32";
+ case VT_INT64:
+ return "int64";
+
+ case VT_UINT8:
+ return "uint8";
+ case VT_UINT16:
+ return "uint16";
+ case VT_UINT32:
+ return "uint32";
+ case VT_UINT64:
+ return "uint64";
+
+ case VT_DOUBLE:
+ return "double";
+
+ case VT_BOOLEAN:
+ return "boolean";
+
+ case VT_STRING:
+ return "string";
+ case VT_UTF8:
+ return "utf8";
+
+ case VT_ANY:
+ return "any";
+
+ case VT_NULL:
+ return "null";
+ case VT_VOID:
+ return "void";
+
+ case VT_DATE:
+ return "date";
+ case VT_DATETIME:
+ return "datetime";
+ case VT_TIMESTAMP:
+ return "timestamp";
+ case VT_INTERVAL:
+ return "interval";
+
+ case VT_FLOAT:
+ return "float";
+
+ case VT_JSON:
+ return "json";
+ }
+ ythrow yexception() << "Invalid value type " << static_cast<int>(type); // Reached only when `type` matches none of the cases above.
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NDetail
+} // namespace NYT
+
+// Stream output: an ascending column is printed as its bare name,
+// any other column is printed as its text YSON representation.
+template <>
+void Out<NYT::TSortColumn>(IOutputStream& os, const NYT::TSortColumn& sortColumn)
+{
+    if (sortColumn.SortOrder() != NYT::ESortOrder::SO_ASCENDING) {
+        os << NYT::BuildYsonStringFluently(NYson::EYsonFormat::Text).Value(sortColumn);
+        return;
+    }
+    os << sortColumn.Name();
+}
diff --git a/yt/cpp/mapreduce/interface/common.h b/yt/cpp/mapreduce/interface/common.h
new file mode 100644
index 0000000000..b1754ade70
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/common.h
@@ -0,0 +1,1301 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/common.h
+///
+/// Header containing miscellaneous structs and classes used in library.
+
+#include "fwd.h"
+
+#include <library/cpp/type_info/type_info.h>
+#include <library/cpp/yson/node/node.h>
+
+#include <util/generic/guid.h>
+#include <util/generic/map.h>
+#include <util/generic/maybe.h>
+#include <util/generic/ptr.h>
+#include <util/system/type_name.h>
+#include <util/generic/vector.h>
+
+#include <google/protobuf/message.h>
+
+#include <initializer_list>
+#include <type_traits>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @cond Doxygen_Suppress
+ // Declares member `name##_` of type `type` and a fluent setter `name(value)`
+ // that assigns the member and returns `*this` as `TSelf&`.
+ // The enclosing class must define `TSelf`.
+ // The trailing `static_assert(true)` makes the macro usage require a terminating semicolon.
+ #define FLUENT_FIELD(type, name) \
+ type name##_; \
+ TSelf& name(const type& value) \
+ { \
+ name##_ = value; \
+ return static_cast<TSelf&>(*this); \
+ } \
+ static_assert(true)
+
+ // Declares private member `name##_` of type `type` together with public
+ // ref-qualified fluent setters and getters, all called `name`.
+ //
+ //  - setter on lvalue: assigns the member, returns `TSelf&`;
+ //  - setter on rvalue: assigns the member, returns `TSelf` move-constructed from `*this`;
+ //  - getter on const lvalue: returns const reference to the member;
+ //  - getter on rvalue: returns the member by value, moved out of the expiring object.
+ //
+ // The rvalue overloads cast to `TSelf&&` / use `std::move` so that temporaries are moved
+ // instead of copied (same convention as FLUENT_OPTIONAL_VECTOR_FIELD_ENCAPSULATED).
+ // The enclosing class must define `TSelf`. Note that the macro leaves the class in `public:` section.
+ // The trailing `static_assert(true)` makes the macro usage require a terminating semicolon.
+ #define FLUENT_FIELD_ENCAPSULATED(type, name) \
+ private: \
+     type name##_; \
+ public: \
+     TSelf& name(const type& value) & \
+     { \
+         name##_ = value; \
+         return static_cast<TSelf&>(*this); \
+     } \
+     TSelf name(const type& value) && \
+     { \
+         name##_ = value; \
+         return static_cast<TSelf&&>(*this); \
+     } \
+     const type& name() const & \
+     { \
+         return name##_; \
+     } \
+     type name() && \
+     { \
+         return std::move(name##_); \
+     } \
+     static_assert(true)
+
+ // Declares optional member `TMaybe<type> name##_` (initially default-constructed)
+ // and a fluent setter `name(value)` that stores the value and returns `*this` as `TSelf&`.
+ // The enclosing class must define `TSelf`.
+ // The trailing `static_assert(true)` makes the macro usage require a terminating semicolon.
+ #define FLUENT_FIELD_OPTION(type, name) \
+ TMaybe<type> name##_; \
+ TSelf& name(const type& value) \
+ { \
+ name##_ = value; \
+ return static_cast<TSelf&>(*this); \
+ } \
+ static_assert(true)
+
+ // Declares private optional member `TMaybe<type> name##_` together with public
+ // ref-qualified accessors:
+ //
+ //  - `name(value)`: store the value; returns `TSelf&` on lvalue, `TSelf` (moved from `*this`) on rvalue;
+ //  - `Reset##name()`: reset the member to `Nothing()`; same return convention as the setter;
+ //  - `name()`: returns const reference to the optional on const lvalue,
+ //    or the optional by value (moved out) on rvalue.
+ //
+ // The rvalue overloads cast to `TSelf&&` / use `std::move` so that temporaries are moved
+ // instead of copied (same convention as FLUENT_OPTIONAL_VECTOR_FIELD_ENCAPSULATED).
+ // The enclosing class must define `TSelf`. Note that the macro leaves the class in `public:` section.
+ // The trailing `static_assert(true)` makes the macro usage require a terminating semicolon.
+ #define FLUENT_FIELD_OPTION_ENCAPSULATED(type, name) \
+ private: \
+     TMaybe<type> name##_; \
+ public: \
+     TSelf& name(const type& value) & \
+     { \
+         name##_ = value; \
+         return static_cast<TSelf&>(*this); \
+     } \
+     TSelf name(const type& value) && \
+     { \
+         name##_ = value; \
+         return static_cast<TSelf&&>(*this); \
+     } \
+     TSelf& Reset##name() & \
+     { \
+         name##_ = Nothing(); \
+         return static_cast<TSelf&>(*this); \
+     } \
+     TSelf Reset##name() && \
+     { \
+         name##_ = Nothing(); \
+         return static_cast<TSelf&&>(*this); \
+     } \
+     const TMaybe<type>& name() const& \
+     { \
+         return name##_; \
+     } \
+     TMaybe<type> name() && \
+     { \
+         return std::move(name##_); \
+     } \
+     static_assert(true)
+
+ // Declares member `name##_` of type `type` initialized with `defaultValue`
+ // and a fluent setter `name(value)` that assigns the member and returns `*this` as `TSelf&`.
+ // The enclosing class must define `TSelf`.
+ // The trailing `static_assert(true)` makes the macro usage require a terminating semicolon.
+ #define FLUENT_FIELD_DEFAULT(type, name, defaultValue) \
+ type name##_ = defaultValue; \
+ TSelf& name(const type& value) \
+ { \
+ name##_ = value; \
+ return static_cast<TSelf&>(*this); \
+ } \
+ static_assert(true)
+
+ // Declares private member `name##_` of type `type` initialized with `defaultValue`
+ // together with public ref-qualified fluent setters and getters, all called `name`.
+ //
+ //  - setter on lvalue: assigns the member, returns `TSelf&`;
+ //  - setter on rvalue: assigns the member, returns `TSelf` move-constructed from `*this`;
+ //  - getter on const lvalue: returns const reference to the member;
+ //  - getter on rvalue: returns the member by value, moved out of the expiring object.
+ //
+ // The rvalue overloads cast to `TSelf&&` / use `std::move` so that temporaries are moved
+ // instead of copied (same convention as FLUENT_OPTIONAL_VECTOR_FIELD_ENCAPSULATED).
+ // The enclosing class must define `TSelf`. Note that the macro leaves the class in `public:` section.
+ // The trailing `static_assert(true)` makes the macro usage require a terminating semicolon.
+ #define FLUENT_FIELD_DEFAULT_ENCAPSULATED(type, name, defaultValue) \
+ private: \
+     type name##_ = defaultValue; \
+ public: \
+     TSelf& name(const type& value) & \
+     { \
+         name##_ = value; \
+         return static_cast<TSelf&>(*this); \
+     } \
+     TSelf name(const type& value) && \
+     { \
+         name##_ = value; \
+         return static_cast<TSelf&&>(*this); \
+     } \
+     const type& name() const & \
+     { \
+         return name##_; \
+     } \
+     type name() && \
+     { \
+         return std::move(name##_); \
+     } \
+     static_assert(true)
+
+ // Declares member `TVector<type> name##s_` and two fluent setters returning `TSelf&`:
+ //  - `Add##name(value)` appends a single element;
+ //  - `name##s(values)` replaces the whole vector (argument is taken by value and moved in).
+ // Note the pluralization: the macro appends `s` to `name` for the member and the bulk setter.
+ // The enclosing class must define `TSelf`.
+ // The trailing `static_assert(true)` makes the macro usage require a terminating semicolon.
+ #define FLUENT_VECTOR_FIELD(type, name) \
+ TVector<type> name##s_; \
+ TSelf& Add##name(const type& value) \
+ { \
+ name##s_.push_back(value); \
+ return static_cast<TSelf&>(*this);\
+ } \
+ TSelf& name##s(TVector<type> values) \
+ { \
+ name##s_ = std::move(values); \
+ return static_cast<TSelf&>(*this);\
+ } \
+ static_assert(true)
+
+ // Declares private member `TMaybe<TVector<type>> name##s_` together with public
+ // ref-qualified accessors (each mutator returns `TSelf&` on lvalue and `TSelf` moved from `*this` on rvalue):
+ //
+ //  - `name##s()`: getter; const reference on const lvalue, mutable reference on lvalue,
+ //    value moved out of the member on rvalue;
+ //  - `Add##name(value)`: appends an element, constructing an empty vector first if the optional is empty;
+ //  - `name##s(values)`: replaces the whole vector;
+ //  - `name##s(TNothing)` and `Reset##name##s()`: reset the optional to `Nothing()`.
+ //
+ // An unset optional and a set-but-empty vector are distinct states of the member.
+ // The enclosing class must define `TSelf`. Note that the macro leaves the class in `public:` section.
+ // The trailing `static_assert(true)` makes the macro usage require a terminating semicolon.
+ #define FLUENT_OPTIONAL_VECTOR_FIELD_ENCAPSULATED(type, name) \
+ private: \
+ TMaybe<TVector<type>> name##s_; \
+ public: \
+ const TMaybe<TVector<type>>& name##s() const & { \
+ return name##s_; \
+ } \
+ TMaybe<TVector<type>>& name##s() & { \
+ return name##s_; \
+ } \
+ TMaybe<TVector<type>> name##s() && { \
+ return std::move(name##s_); \
+ } \
+ TSelf& Add##name(const type& value) & \
+ { \
+ if (name##s_.Empty()) { \
+ name##s_.ConstructInPlace(); \
+ } \
+ name##s_->push_back(value); \
+ return static_cast<TSelf&>(*this);\
+ } \
+ TSelf Add##name(const type& value) && \
+ { \
+ if (name##s_.Empty()) { \
+ name##s_.ConstructInPlace(); \
+ } \
+ name##s_->push_back(value); \
+ return static_cast<TSelf&&>(*this);\
+ } \
+ TSelf& name##s(TVector<type> values) & \
+ { \
+ name##s_ = std::move(values); \
+ return static_cast<TSelf&>(*this);\
+ } \
+ TSelf name##s(TVector<type> values) && \
+ { \
+ name##s_ = std::move(values); \
+ return static_cast<TSelf&&>(*this);\
+ } \
+ TSelf& name##s(TNothing) & \
+ { \
+ name##s_ = Nothing(); \
+ return static_cast<TSelf&>(*this);\
+ } \
+ TSelf name##s(TNothing) && \
+ { \
+ name##s_ = Nothing(); \
+ return static_cast<TSelf&&>(*this);\
+ } \
+ TSelf& Reset##name##s() & \
+ { \
+ name##s_ = Nothing(); \
+ return static_cast<TSelf&>(*this);\
+ } \
+ TSelf Reset##name##s() && \
+ { \
+ name##s_ = Nothing(); \
+ return static_cast<TSelf&&>(*this);\
+ } \
+ static_assert(true)
+
+ // Declares private member `TVector<type> name##s_` together with public ref-qualified accessors:
+ //
+ //  - `Add##name(value)`: appends an element;
+ //  - `name##s(value)`: replaces the whole vector (argument is taken by value and moved in);
+ //  - `name##s()`: returns const reference to the vector on const lvalue,
+ //    or the vector by value (moved out) on rvalue.
+ //
+ // Mutators return `TSelf&` on lvalue and `TSelf` move-constructed from `*this` on rvalue.
+ // The rvalue overloads cast to `TSelf&&` / use `std::move` so that temporaries are moved
+ // instead of copied (same convention as FLUENT_OPTIONAL_VECTOR_FIELD_ENCAPSULATED).
+ // The enclosing class must define `TSelf`. Note that the macro leaves the class in `public:` section.
+ // The trailing `static_assert(true)` makes the macro usage require a terminating semicolon.
+ #define FLUENT_VECTOR_FIELD_ENCAPSULATED(type, name) \
+ private: \
+     TVector<type> name##s_; \
+ public: \
+     TSelf& Add##name(const type& value) & \
+     { \
+         name##s_.push_back(value); \
+         return static_cast<TSelf&>(*this); \
+     } \
+     TSelf Add##name(const type& value) && \
+     { \
+         name##s_.push_back(value); \
+         return static_cast<TSelf&&>(*this); \
+     } \
+     TSelf& name##s(TVector<type> value) & \
+     { \
+         name##s_ = std::move(value); \
+         return static_cast<TSelf&>(*this); \
+     } \
+     TSelf name##s(TVector<type> value) && \
+     { \
+         name##s_ = std::move(value); \
+         return static_cast<TSelf&&>(*this); \
+     } \
+     const TVector<type>& name##s() const & \
+     { \
+         return name##s_; \
+     } \
+     TVector<type> name##s() && \
+     { \
+         return std::move(name##s_); \
+     } \
+     static_assert(true)
+
+ // Declares member `TMap<keytype, valuetype> name##_` and a fluent method `Add##name(key, value)`
+ // that calls `emplace(key, value)` on the map and returns `*this` as `TSelf&`.
+ // NOTE(review): presumably TMap::emplace follows std::map semantics, i.e. an already
+ // present key keeps its old value — confirm before relying on overwrite behavior.
+ // The enclosing class must define `TSelf`.
+ // The trailing `static_assert(true)` makes the macro usage require a terminating semicolon.
+ #define FLUENT_MAP_FIELD(keytype, valuetype, name) \
+ TMap<keytype,valuetype> name##_; \
+ TSelf& Add##name(const keytype& key, const valuetype& value) \
+ { \
+ name##_.emplace(key, value); \
+ return static_cast<TSelf&>(*this);\
+ } \
+ static_assert(true)
+
+/// @endcond
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Convenience class that keeps sequence of items.
+///
+/// Designed to be used as function parameter.
+///
+/// Users of such function can then pass:
+/// - single item,
+/// - initializer list of items,
+/// - vector of items;
+/// as argument to this function.
+///
+/// Example:
+/// ```
+/// void Foo(const TOneOrMany<int>& arg);
+/// ...
+/// Foo(1); // ok
+/// Foo({1, 2, 3}); // ok
+/// ```
+ template <class T, class TDerived>
+ struct TOneOrMany
+ {
+     /// @cond Doxygen_Suppress
+     /// Type returned by fluent methods: `TDerived` if it is specified, `TOneOrMany` itself if `TDerived` is void.
+     using TSelf = std::conditional_t<std::is_void_v<TDerived>, TOneOrMany, TDerived>;
+     /// @endcond
+
+     /// Initialize with empty sequence.
+     TOneOrMany() = default;
+
+     /// Initialize from initializer list.
+     template<class U>
+     TOneOrMany(std::initializer_list<U> il)
+     {
+         Parts_.assign(il.begin(), il.end());
+     }
+
+     /// Put arguments to sequence
+     template <class U, class... TArgs>
+         requires std::is_convertible_v<U, T>
+     TOneOrMany(U&& arg, TArgs&&... args)
+     {
+         // Forward the first argument as well, so that rvalues are moved into the sequence, not copied.
+         Add(std::forward<U>(arg), std::forward<TArgs>(args)...);
+     }
+
+     /// Initialize from vector.
+     TOneOrMany(TVector<T> args)
+         : Parts_(std::move(args))
+     { }
+
+     /// @brief Order is defined the same way as in TVector
+     bool operator==(const TOneOrMany& rhs) const
+     {
+         // N.B. We would like to make this method to be `= default`,
+         // but this breaks MSVC compiler for the cases when T doesn't
+         // support comparison.
+         return Parts_ == rhs.Parts_;
+     }
+
+     ///
+     /// @{
+     ///
+     /// @brief Add all arguments to sequence
+     ///
+     /// Arguments are appended in the order they are passed.
+     /// Lvalue overload returns reference to this object, rvalue overload returns object moved from this one.
+     template <class U, class... TArgs>
+         requires std::is_convertible_v<U, T>
+     TSelf& Add(U&& part, TArgs&&... args) &
+     {
+         Parts_.push_back(std::forward<U>(part));
+         // Comma fold appends remaining arguments left to right; it is a no-op for an empty pack.
+         (Parts_.push_back(std::forward<TArgs>(args)), ...);
+         return static_cast<TSelf&>(*this);
+     }
+
+     template <class U, class... TArgs>
+         requires std::is_convertible_v<U, T>
+     TSelf Add(U&& part, TArgs&&... args) &&
+     {
+         // `*this` is an lvalue here, so the call below resolves to the lvalue overload.
+         return std::move(Add(std::forward<U>(part), std::forward<TArgs>(args)...));
+     }
+     /// @}
+
+     /// Content of sequence.
+     TVector<T> Parts_;
+ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Type of the value that can occur in YT table.
+///
+/// @ref NYT::TTableSchema
+/// https://yt.yandex-team.ru/docs/description/storage/data_types
+ enum EValueType : int
+ {
+ // NB: enumerators have no explicit values, so their numeric values are defined
+ // by declaration order. New enumerators are presumably meant to be appended
+ // at the end to keep existing values stable — confirm before reordering.
+
+ /// Int64, signed integer of 64 bits.
+ VT_INT64,
+
+ /// Uint64, unsigned integer of 64 bits.
+ VT_UINT64,
+
+ /// Double, floating point number of double precision (64 bits).
+ VT_DOUBLE,
+ /// Boolean, `true` or `false`.
+ VT_BOOLEAN,
+
+ /// String, arbitrary byte sequence.
+ VT_STRING,
+
+ /// Any, arbitrary yson document.
+ VT_ANY,
+
+ /// Int8, signed integer of 8 bits.
+ VT_INT8,
+ /// Int16, signed integer of 16 bits.
+ VT_INT16,
+ /// Int32, signed integer of 32 bits.
+ VT_INT32,
+
+ /// Uint8, unsigned integer of 8 bits.
+ VT_UINT8,
+ /// Uint16, unsigned integer of 16 bits.
+ VT_UINT16,
+ /// Uint32, unsigned integer of 32 bits.
+ VT_UINT32,
+
+ /// Utf8, byte sequence that is valid utf8.
+ VT_UTF8,
+
+ /// Null, absence of value (almost never used in schemas).
+ VT_NULL,
+ /// Void, absence of value (almost never used in schemas); the difference between null and void is yql-specific.
+ VT_VOID,
+
+ /// Date, number of days since Unix epoch (unsigned)
+ VT_DATE,
+ /// Datetime, number of seconds since Unix epoch (unsigned)
+ VT_DATETIME,
+ /// Timestamp, number of milliseconds since Unix epoch (unsigned)
+ /// NOTE(review): YT data type documentation appears to define `timestamp` in microseconds — confirm the unit.
+ VT_TIMESTAMP,
+ /// Interval, difference between two timestamps (signed)
+ VT_INTERVAL,
+
+ /// Float, floating point number (32 bits)
+ VT_FLOAT,
+ /// Json, sequence of bytes that is valid json.
+ VT_JSON,
+ };
+
+///
+/// @brief Sort order.
+///
+/// @ref NYT::TTableSchema
+ enum ESortOrder : int
+ {
+ // NOTE(review): trailing `/* "..." */` comments presumably provide string names
+ // for generated enum serialization — keep them intact; confirm against build rules.
+
+ /// Ascending sort order.
+ SO_ASCENDING /* "ascending" */,
+ /// Descending sort order.
+ SO_DESCENDING /* "descending" */,
+ };
+
+///
+/// @brief Value of "optimize_for" attribute.
+///
+/// @ref NYT::TRichYPath
+ enum EOptimizeForAttr : i8
+ {
+ // NOTE(review): trailing `/* "..." */` comments presumably provide string names
+ // for generated enum serialization — keep them intact; confirm against build rules.
+
+ /// Optimize for scan
+ OF_SCAN_ATTR /* "scan" */,
+
+ /// Optimize for lookup
+ OF_LOOKUP_ATTR /* "lookup" */,
+ };
+
+///
+/// @brief Value of "erasure_codec" attribute.
+///
+/// @ref NYT::TRichYPath
+ enum EErasureCodecAttr : i8
+ {
+ // NOTE(review): trailing `/* "..." */` comments presumably provide string names
+ // for generated enum serialization — keep them intact; confirm against build rules.
+
+ /// @cond Doxygen_Suppress
+ EC_NONE_ATTR /* "none" */,
+ EC_REED_SOLOMON_6_3_ATTR /* "reed_solomon_6_3" */,
+ EC_LRC_12_2_2_ATTR /* "lrc_12_2_2" */,
+ EC_ISA_LRC_12_2_2_ATTR /* "isa_lrc_12_2_2" */,
+ /// @endcond
+ };
+
+///
+/// @brief Value of "schema_modification" attribute.
+///
+/// @ref NYT::TRichYPath
+ enum ESchemaModificationAttr : i8
+ {
+ // NOTE(review): trailing `/* "..." */` comments presumably provide string names
+ // for generated enum serialization — keep them intact; confirm against build rules.
+
+ /// Attribute value "none".
+ SM_NONE_ATTR /* "none" */,
+ /// Attribute value "unversioned_update".
+ SM_UNVERSIONED_UPDATE /* "unversioned_update" */,
+ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Table key column description.
+///
+/// The description includes column name and sort order.
+///
+/// @anchor TSortOrder_backward_compatibility
+/// @note
+/// Many functions that use `TSortOrder` as argument used to take `TString`
+/// (the only allowed sort order was "ascending" and user didn't have to specify it).
+/// @note
+/// This class is designed to provide backward compatibility for such code and therefore
+/// objects of this class can be constructed and assigned from TString-like objects only.
+///
+/// @see NYT::TSortOperationSpec
+class TSortColumn
+{
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TSortColumn;
+ /// @endcond
+
+ /// Column name
+ FLUENT_FIELD_ENCAPSULATED(TString, Name);
+
+ /// Sort order
+ FLUENT_FIELD_DEFAULT_ENCAPSULATED(ESortOrder, SortOrder, ESortOrder::SO_ASCENDING);
+
+ ///
+ /// @{
+ ///
+ /// @brief Construct object from name and sort order
+ ///
+ /// Constructors are intentionally implicit so `TSortColumn` can be compatible with old code.
+ /// @ref TSortOrder_backward_compatibility
+ TSortColumn(TStringBuf name = {}, ESortOrder sortOrder = ESortOrder::SO_ASCENDING);
+ TSortColumn(const TString& name, ESortOrder sortOrder = ESortOrder::SO_ASCENDING);
+ TSortColumn(const char* name, ESortOrder sortOrder = ESortOrder::SO_ASCENDING);
+ /// @}
+
+ /// Check that sort order is ascending, throw exception otherwise.
+ const TSortColumn& EnsureAscending() const;
+
+ /// @brief Convert sort to yson representation as YT API expects it.
+ TNode ToNode() const;
+
+ /// @brief Comparison is default and checks both name and sort order.
+ bool operator == (const TSortColumn& rhs) const = default;
+
+ ///
+ /// @{
+ ///
+ /// @brief Assign object from column name, and set sort order to `ascending`.
+ ///
+ /// This is backward compatibility methods.
+ ///
+ /// @ref TSortOrder_backward_compatibility
+ TSortColumn& operator = (TStringBuf name);
+ TSortColumn& operator = (const TString& name);
+ TSortColumn& operator = (const char* name);
+ /// @}
+
+ bool operator == (const TStringBuf rhsName) const;
+ bool operator != (const TStringBuf rhsName) const;
+ bool operator == (const TString& rhsName) const;
+ bool operator != (const TString& rhsName) const;
+ bool operator == (const char* rhsName) const;
+ bool operator != (const char* rhsName) const;
+
+ // Intentionally implicit conversions.
+ operator TString() const;
+ operator TStringBuf() const;
+ operator std::string() const;
+
+ Y_SAVELOAD_DEFINE(Name_, SortOrder_);
+};
+
+///
+/// @brief List of @ref TSortColumn
+///
+/// Contains a bunch of helper methods such as constructing from single object.
+ class TSortColumns
+ : public TOneOrMany<TSortColumn, TSortColumns>
+ {
+ public:
+ // Inherit constructors of the base (single item, several items, initializer list, vector).
+ using TOneOrMany<TSortColumn, TSortColumns>::TOneOrMany;
+
+ /// Construct empty list.
+ TSortColumns();
+
+ ///
+ /// @{
+ ///
+ /// @brief Construct list of ascending sort order columns by their names.
+ ///
+ /// Required for backward compatibility.
+ ///
+ /// @ref TSortOrder_backward_compatibility
+ TSortColumns(const TVector<TString>& names);
+ TSortColumns(const TColumnNames& names);
+ /// @}
+
+
+ ///
+ /// @brief Implicit conversion to column list.
+ ///
+ /// If all columns have ascending sort order, return list of their names.
+ /// Throw exception otherwise.
+ ///
+ /// Required for backward compatibility.
+ ///
+ /// @ref TSortOrder_backward_compatibility
+ operator TColumnNames() const;
+
+ /// Make sure that all columns are of ascending sort order.
+ const TSortColumns& EnsureAscending() const;
+
+ /// Get list of column names.
+ TVector<TString> GetNames() const;
+ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+ /// Helper function to create new style type from old style one.
+ ///
+ /// NOTE(review): presumably `required == false` yields an optional of the corresponding
+ /// type_info type and `required == true` yields the bare type — confirm against implementation.
+ NTi::TTypePtr ToTypeV3(EValueType type, bool required);
+
+///
+/// @brief Single column description
+///
+/// Each field describing column has setter and getter.
+///
+/// Example reading field:
+/// ```
+/// ... columnSchema.Name() ...
+/// ```
+///
+/// Example setting field:
+/// ```
+/// columnSchema.Name("my-column").Type(VT_INT64); // set name and type
+/// ```
+///
+/// @ref https://yt.yandex-team.ru/docs/description/storage/static_schema
+ class TColumnSchema
+ {
+ public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TColumnSchema;
+ /// @endcond
+
+ ///
+ /// @brief Construct empty column schema
+ ///
+ /// @note
+ /// Such column schema cannot be used in table schema as it doesn't have name.
+ TColumnSchema();
+
+ ///
+ /// @{
+ ///
+ /// @brief Copy constructor and copy assignment are default.
+ TColumnSchema(const TColumnSchema&) = default;
+ TColumnSchema& operator=(const TColumnSchema&) = default;
+ /// @}
+
+
+ /// Column name
+ FLUENT_FIELD_ENCAPSULATED(TString, Name);
+
+ ///
+ /// @brief Functions to work with type in old manner.
+ ///
+ /// @deprecated New code is recommended to work with types using @ref NTi::TTypePtr from type_info library.
+ TColumnSchema& Type(EValueType type) &;
+ TColumnSchema Type(EValueType type) &&;
+ EValueType Type() const;
+
+ /// @brief Set and get column type.
+ /// @{
+ TColumnSchema& Type(const NTi::TTypePtr& type) &;
+ TColumnSchema Type(const NTi::TTypePtr& type) &&;
+
+ TColumnSchema& TypeV3(const NTi::TTypePtr& type) &;
+ TColumnSchema TypeV3(const NTi::TTypePtr& type) &&;
+ NTi::TTypePtr TypeV3() const;
+ /// @}
+
+ ///
+ /// @brief Raw yson representation of column type
+ /// @deprecated Prefer to use `TypeV3` methods.
+ FLUENT_FIELD_OPTION_ENCAPSULATED(TNode, RawTypeV3);
+
+ /// Column sort order
+ FLUENT_FIELD_OPTION_ENCAPSULATED(ESortOrder, SortOrder);
+
+ ///
+ /// @brief Lock group name
+ ///
+ /// @ref https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#blokirovka-stroki
+ FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Lock);
+
+ /// Expression defining column value
+ FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Expression);
+
+ /// Aggregating function name
+ FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Aggregate);
+
+ ///
+ /// @brief Storage group name
+ ///
+ /// @ref https://yt.yandex-team.ru/docs/description/storage/static_schema
+ FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Group);
+
+ ///
+ /// @brief Column requiredness.
+ ///
+ /// Required columns don't accept NULL values.
+ /// Usually if column is required it means that it has Optional<...> type
+ /// NOTE(review): the sentence above looks inverted — presumably a column that is
+ /// NOT required has Optional<...> type; confirm against implementation.
+ bool Required() const;
+
+ ///
+ /// @{
+ ///
+ /// @brief Set type in old-style manner
+ TColumnSchema& Type(EValueType type, bool required) &;
+ TColumnSchema Type(EValueType type, bool required) &&;
+ /// @}
+
+ private:
+ // Deserialization needs access to private fields below.
+ friend void Deserialize(TColumnSchema& columnSchema, const TNode& node);
+ // Column type in type_info representation; default-constructed (empty pointer) until a type is set.
+ NTi::TTypePtr TypeV3_;
+ // Requiredness flag, false by default.
+ bool Required_ = false;
+ };
+
+ /// Equality check; it is documented to compare all fields of column schema.
+ bool operator==(const TColumnSchema& lhs, const TColumnSchema& rhs);
+
+///
+/// @brief Description of table schema
+///
+/// @see https://yt.yandex-team.ru/docs/description/storage/static_schema
+ class TTableSchema
+ {
+ public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TTableSchema;
+ /// @endcond
+
+ /// Column schema
+ FLUENT_VECTOR_FIELD_ENCAPSULATED(TColumnSchema, Column);
+
+ ///
+ /// @brief Strictness of the schema
+ ///
+ /// Strict schemas are not allowed to have columns not described in schema.
+ /// Nonstrict schemas are allowed to have such columns, all such missing columns are assumed to have
+ /// (NOTE(review): the original sentence is truncated here; presumably it ends with "type `any`" — confirm).
+ ///
+ /// Schema is strict by default.
+ FLUENT_FIELD_DEFAULT_ENCAPSULATED(bool, Strict, true);
+
+ ///
+ /// @brief Whether keys are unique
+ ///
+ /// This flag can be set only for schemas that have sorted columns.
+ /// If flag is set table cannot have multiple rows with same key.
+ /// The flag is not set by default.
+ FLUENT_FIELD_DEFAULT_ENCAPSULATED(bool, UniqueKeys, false);
+
+ /// Get modifiable column list
+ TVector<TColumnSchema>& MutableColumns();
+
+ /// Check if schema has any described column
+ [[nodiscard]] bool Empty() const;
+
+ /// Add column
+ TTableSchema& AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) &;
+ /// @copydoc NYT::TTableSchema::AddColumn(const TString&, const NTi::TTypePtr&, ESortOrder)&;
+ TTableSchema AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) &&;
+
+ /// @copydoc NYT::TTableSchema::AddColumn(const TString&, const NTi::TTypePtr&, ESortOrder)&;
+ TTableSchema& AddColumn(const TString& name, const NTi::TTypePtr& type) &;
+ /// @copydoc NYT::TTableSchema::AddColumn(const TString&, const NTi::TTypePtr&, ESortOrder)&;
+ TTableSchema AddColumn(const TString& name, const NTi::TTypePtr& type) &&;
+
+ /// Add optional column of specified type
+ TTableSchema& AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) &;
+ /// @copydoc NYT::TTableSchema::AddColumn(const TString&, EValueType, ESortOrder)&;
+ TTableSchema AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) &&;
+
+ /// @copydoc NYT::TTableSchema::AddColumn(const TString&, EValueType, ESortOrder)&;
+ TTableSchema& AddColumn(const TString& name, EValueType type) &;
+ /// @copydoc NYT::TTableSchema::AddColumn(const TString&, EValueType, ESortOrder)&;
+ TTableSchema AddColumn(const TString& name, EValueType type) &&;
+
+ ///
+ /// @brief Make table schema sorted by specified columns
+ ///
+ /// Resets old key columns if any
+ TTableSchema& SortBy(const TSortColumns& columns) &;
+
+ /// @copydoc NYT::TTableSchema::SortBy(const TSortColumns&)&;
+ TTableSchema SortBy(const TSortColumns& columns) &&;
+
+ /// Get yson description of table schema
+ [[nodiscard]] TNode ToNode() const;
+
+ /// Parse schema from yson node
+ static NYT::TTableSchema FromNode(const TNode& node);
+
+ // Deserialization needs access to private fields declared by the macros above.
+ friend void Deserialize(TTableSchema& tableSchema, const TNode& node);
+ };
+
+ /// Check for equality of all columns and all schema attributes
+ bool operator==(const TTableSchema& lhs, const TTableSchema& rhs);
+
+ ///
+ /// @brief Create table schema by protobuf message descriptor
+ ///
+ /// @param messageDescriptor Descriptor of the protobuf message describing table row.
+ /// @param sortColumns Sort columns of resulting schema (empty list by default).
+ /// @param keepFieldsWithoutExtension NOTE(review): presumably controls whether fields lacking
+ /// YT-specific protobuf extensions are kept in the schema — confirm against implementation.
+ TTableSchema CreateTableSchema(
+ const ::google::protobuf::Descriptor& messageDescriptor,
+ const TSortColumns& sortColumns = TSortColumns(),
+ bool keepFieldsWithoutExtension = true);
+
+ ///
+ /// @brief Create table schema by protobuf message type
+ ///
+ /// Takes descriptor of `TProtoType` and passes it together with both arguments
+ /// to the descriptor-based overload of `CreateTableSchema`.
+ template <class TProtoType, typename = std::enable_if_t<std::is_base_of_v<::google::protobuf::Message, TProtoType>>>
+ inline TTableSchema CreateTableSchema(
+     const TSortColumns& sortColumns = TSortColumns(),
+     bool keepFieldsWithoutExtension = true)
+ {
+     static_assert(
+         std::is_base_of_v<::google::protobuf::Message, TProtoType>,
+         "Template argument must be derived from ::google::protobuf::Message");
+
+     const ::google::protobuf::Descriptor& messageDescriptor = *TProtoType::descriptor();
+     return CreateTableSchema(messageDescriptor, sortColumns, keepFieldsWithoutExtension);
+ }
+
+///
+/// @brief Create strict table schema from `struct` type.
+///
+/// Names and types of columns are taken from struct member names and types.
+/// `Strict` flag is set to true, all other attribute of schema and columns
+/// are left with default values
+TTableSchema CreateTableSchema(NTi::TTypePtr type);
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Enumeration describing comparison operation used in key bound.
+///
+/// ERelation is a part of @ref NYT::TKeyBound that can be used as
+/// lower or upper key limit in @ref TReadLimit.
+///
+/// Relations `Less` and `LessOrEqual` are for upper limit and
+/// relations `Greater` and `GreaterOrEqual` are for lower limit.
+///
+/// It is a error to use relation in the limit of wrong kind.
+///
+/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath
+enum class ERelation
+{
+ ///
+ /// @brief Relation "less"
+ ///
+ /// Specifies range of keys that are before specified key.
+ /// Can only be used in upper limit.
+ Less /* "<" */,
+
+ ///
+ /// @brief Relation "less or equal"
+ ///
+ /// Specifies range of keys that are before or equal specified key.
+ /// Can only be used in upper limit.
+ LessOrEqual /* "<=" */,
+
+ ///
+ /// @brief Relation "greater"
+ ///
+ /// Specifies range of keys that are after specified key.
+ /// Can only be used in lower limit.
+ Greater /* ">" */,
+
+ ///
+ /// @brief Relation "greater or equal"
+ ///
+ /// Specifies range of keys that are after or equal than specified key.
+ /// Can only be used in lower limit.
+ GreaterOrEqual /* ">=" */,
+};
+
+///
+/// @brief Key with relation specifying interval of keys in lower or upper limit of @ref NYT::TReadRange
+///
+/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath
+ struct TKeyBound
+ {
+ /// @cond Doxygen_Suppress
+ using TSelf = TKeyBound;
+
+ // Explicit constructor; defaults are relation `Less` and a default-constructed key.
+ explicit TKeyBound(ERelation relation = ERelation::Less, TKey key = TKey{});
+
+ // Relation of the bound (private field with fluent setter / getter), `Less` by default.
+ FLUENT_FIELD_DEFAULT_ENCAPSULATED(ERelation, Relation, ERelation::Less);
+ // Key of the bound (private field with fluent setter / getter), default-constructed by default.
+ FLUENT_FIELD_DEFAULT_ENCAPSULATED(TKey, Key, TKey{});
+ /// @endcond
+ };
+
+///
+/// @brief Description of the read limit.
+///
+/// It is actually a variant and must store exactly one field.
+///
+/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath
+ struct TReadLimit
+ {
+ /// @cond Doxygen_Suppress
+ using TSelf = TReadLimit;
+ /// @endcond
+
+ // NB: every field below is an independent optional with a fluent setter;
+ // nothing in this struct enforces that exactly one of them is set.
+
+ ///
+ /// @brief KeyBound specifies table key and whether to include it
+ ///
+ /// It can be used in lower or upper limit when reading tables.
+ FLUENT_FIELD_OPTION(TKeyBound, KeyBound);
+
+ ///
+ /// @brief Table key
+ ///
+ /// It can be used in exact, lower or upper limit when reading tables.
+ FLUENT_FIELD_OPTION(TKey, Key);
+
+ ///
+ /// @brief Row index
+ ///
+ /// It can be used in exact, lower or upper limit when reading tables.
+ FLUENT_FIELD_OPTION(i64, RowIndex);
+
+ ///
+ /// @brief File offset
+ ///
+ /// It can be used in lower or upper limit when reading files.
+ FLUENT_FIELD_OPTION(i64, Offset);
+
+ ///
+ /// @brief Tablet index
+ ///
+ /// It can be used in lower or upper limit in dynamic table operations
+ FLUENT_FIELD_OPTION(i64, TabletIndex);
+ };
+
+///
+/// @brief Range of a table or a file
+///
+/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath
+ struct TReadRange
+ {
+ /// @cond Doxygen_Suppress
+ using TSelf = TReadRange;
+ /// @endcond
+
+ ///
+ /// @brief Lower limit of the range
+ ///
+ /// It is usually inclusive (except when @ref NYT::TKeyBound with relation @ref NYT::ERelation::Greater is used).
+ FLUENT_FIELD(TReadLimit, LowerLimit);
+
+ ///
+ /// @brief Upper limit of the range
+ ///
+ /// It is usually exclusive (except when @ref NYT::TKeyBound with relation @ref NYT::ERelation::LessOrEqual is used).
+ FLUENT_FIELD(TReadLimit, UpperLimit);
+
+ /// Exact key or row index.
+ FLUENT_FIELD(TReadLimit, Exact);
+
+ /// Create read range from row indexes: `lowerLimit` becomes lower limit, `upperLimit` becomes upper limit.
+ static TReadRange FromRowIndices(i64 lowerLimit, i64 upperLimit)
+ {
+ return TReadRange()
+ .LowerLimit(TReadLimit().RowIndex(lowerLimit))
+ .UpperLimit(TReadLimit().RowIndex(upperLimit));
+ }
+
+ /// Create read range from keys: the first key becomes lower limit, the second key becomes upper limit.
+ static TReadRange FromKeys(const TKey& lowerKeyInclusive, const TKey& upperKeyExclusive)
+ {
+ return TReadRange()
+ .LowerLimit(TReadLimit().Key(lowerKeyInclusive))
+ .UpperLimit(TReadLimit().Key(upperKeyExclusive));
+ }
+ };
+
+///
+/// @brief Path with additional attributes.
+///
+/// Allows to specify additional attributes for path used in some operations.
+///
+/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath
+ struct TRichYPath
+ {
+ /// @cond Doxygen_Suppress
+ using TSelf = TRichYPath;
+ /// @endcond
+
+ /// Path itself.
+ FLUENT_FIELD(TYPath, Path);
+
+ /// Specifies that data should be appended to the path instead of overwriting it
+ FLUENT_FIELD_OPTION(bool, Append);
+
+ /// @deprecated Deprecated attribute.
+ FLUENT_FIELD_OPTION(bool, PartiallySorted);
+
+ /// Specifies that path is expected to be sorted by these columns.
+ FLUENT_FIELD(TSortColumns, SortedBy);
+
+ /// Add range to read. Creates the range list if it was not set before.
+ TRichYPath& AddRange(TReadRange range)
+ {
+ if (!Ranges_) {
+ Ranges_.ConstructInPlace();
+ }
+ Ranges_->push_back(std::move(range));
+ return *this;
+ }
+
+ /// Drop the range list entirely (it becomes unset, not empty).
+ TRichYPath& ResetRanges()
+ {
+ Ranges_.Clear();
+ return *this;
+ }
+
+ ///
+ /// @{
+ ///
+ /// Return ranges to read.
+ ///
+ /// NOTE: Nothing (in TMaybe) and empty TVector are different ranges.
+ /// Nothing represents universal range (reader reads all table rows).
+ /// Empty TVector represents empty range (reader returns empty set of rows).
+ const TMaybe<TVector<TReadRange>>& GetRanges() const
+ {
+ return Ranges_;
+ }
+
+ TMaybe<TVector<TReadRange>>& MutableRanges()
+ {
+ return Ranges_;
+ }
+ /// @}
+
+ ///
+ /// @{
+ ///
+ /// Get range view, that is convenient way to iterate through all ranges.
+ ///
+ /// The view is empty both when the range list is unset and when it is set but empty.
+ TArrayRef<TReadRange> MutableRangesView()
+ {
+ if (Ranges_.Defined()) {
+ return TArrayRef(Ranges_->data(), Ranges_->size());
+ } else {
+ return {};
+ }
+ }
+
+ TArrayRef<const TReadRange> GetRangesView() const
+ {
+ if (Ranges_.Defined()) {
+ return TArrayRef(Ranges_->data(), Ranges_->size());
+ } else {
+ return {};
+ }
+ }
+ /// @}
+
+ /// @{
+ ///
+ /// Get range by index.
+ ///
+ /// The range list is expected to be set and `i` to be a valid index: no checks are done here
+ /// beyond whatever `TMaybe::GetRef` and `TVector::operator[]` perform themselves.
+ const TReadRange& GetRange(ssize_t i) const
+ {
+ return Ranges_.GetRef()[i];
+ }
+
+ TReadRange& MutableRange(ssize_t i)
+ {
+ return Ranges_.GetRef()[i];
+ }
+ /// @}
+
+ ///
+ /// @brief Specifies columns that should be read.
+ ///
+ /// If it's set to Nothing then all columns will be read.
+ /// If empty TColumnNames is specified then each read row will be empty.
+ FLUENT_FIELD_OPTION(TColumnNames, Columns);
+
+ // NOTE(review): the five options below are undocumented here; presumably they map
+ // to the rich YPath attributes of the corresponding names — confirm against YT documentation.
+ FLUENT_FIELD_OPTION(bool, Teleport);
+ FLUENT_FIELD_OPTION(bool, Primary);
+ FLUENT_FIELD_OPTION(bool, Foreign);
+ FLUENT_FIELD_OPTION(i64, RowCountLimit);
+
+ FLUENT_FIELD_OPTION(TString, FileName);
+
+ /// Specifies original path to be shown in Web UI
+ FLUENT_FIELD_OPTION(TYPath, OriginalPath);
+
+ ///
+ /// @brief Specifies that this path points to executable file
+ ///
+ /// Used in operation specs.
+ FLUENT_FIELD_OPTION(bool, Executable);
+
+ ///
+ /// @brief Specify format to use when loading table.
+ ///
+ /// Used in operation specs.
+ FLUENT_FIELD_OPTION(TNode, Format);
+
+ /// @brief Specifies table schema that will be set on the path
+ FLUENT_FIELD_OPTION(TTableSchema, Schema);
+
+ /// Specifies compression codec that will be set on the path
+ FLUENT_FIELD_OPTION(TString, CompressionCodec);
+
+ /// Specifies erasure codec that will be set on the path
+ FLUENT_FIELD_OPTION(EErasureCodecAttr, ErasureCodec);
+
+ /// Specifies schema modification that will be set on the path
+ FLUENT_FIELD_OPTION(ESchemaModificationAttr, SchemaModification);
+
+ /// Specifies optimize_for attribute that will be set on the path
+ FLUENT_FIELD_OPTION(EOptimizeForAttr, OptimizeFor);
+
+ ///
+ /// @brief Do not put file used in operation into node cache
+ ///
+ /// If BypassArtifactCache == true, file will be loaded into the job's sandbox bypassing the cache on the YT node.
+ /// It helps jobs that use tmpfs to start faster,
+ /// because files will be loaded into tmpfs directly bypassing disk cache
+ FLUENT_FIELD_OPTION(bool, BypassArtifactCache);
+
+ ///
+ /// @brief Timestamp of dynamic table.
+ ///
+ /// NOTE: it is _not_ unix timestamp
+ /// (instead it's transaction timestamp, that is more complex structure).
+ FLUENT_FIELD_OPTION(i64, Timestamp);
+
+ ///
+ /// @brief Specify transaction that should be used to access this path.
+ ///
+ /// Allows to start cross-transactional operations.
+ FLUENT_FIELD_OPTION(TTransactionId, TransactionId);
+
+ /// Mapping between column names used by @ref RenameColumns.
+ using TRenameColumnsDescriptor = THashMap<TString, TString>;
+
+ /// Specifies columnar mapping which will be applied to columns before transfer to job.
+ FLUENT_FIELD_OPTION(TRenameColumnsDescriptor, RenameColumns);
+
+ /// Create empty path with no attributes
+ TRichYPath()
+ { }
+
+ ///
+ /// @{
+ ///
+ /// @brief Create path from string
+ ///
+ /// Constructors are not explicit, so strings convert to `TRichYPath` implicitly.
+ TRichYPath(const char* path)
+ : Path_(path)
+ { }
+
+ TRichYPath(const TYPath& path)
+ : Path_(path)
+ { }
+ /// @}
+
+ private:
+ // Ranges to read: unset means "whole table", set but empty means "nothing" (see GetRanges).
+ TMaybe<TVector<TReadRange>> Ranges_;
+ };
+
+///
+/// @ref Create copy of @ref NYT::TRichYPath with schema derived from proto message.
+///
+///
+template <typename TProtoType>
+TRichYPath WithSchema(const TRichYPath& path, const TSortColumns& sortBy = TSortColumns())
+{
+ static_assert(std::is_base_of_v<::google::protobuf::Message, TProtoType>, "TProtoType must be Protobuf message");
+
+ auto schemedPath = path;
+ if (!schemedPath.Schema_) {
+ schemedPath.Schema(CreateTableSchema<TProtoType>(sortBy));
+ }
+ return schemedPath;
+}
+
+///
+/// @brief Create copy of @ref NYT::TRichYPath with schema derived from TRowType if possible.
+///
+/// If TRowType is protobuf message schema is derived from it and set to returned path.
+/// Otherwise schema of original path is left unchanged (and probably unset).
+ template <typename TRowType>
+ TRichYPath MaybeWithSchema(const TRichYPath& path, const TSortColumns& sortBy = TSortColumns())
+ {
+     // Both branches stay inside `if constexpr`, so `WithSchema` (and its static_assert)
+     // is instantiated only for protobuf message types.
+     if constexpr (!std::is_base_of_v<::google::protobuf::Message, TRowType>) {
+         return path;
+     } else {
+         return WithSchema<TRowType>(path, sortBy);
+     }
+ }
+
+///
+/// @brief Get the list of ranges related to path in compatibility mode.
+///
+/// - If path is missing ranges, empty list is returned.
+/// - If path has associated range list and the list is not empty, function returns this list.
+/// - If path has associated range list and this list is empty, exception is thrown.
+///
+/// Before YT-17683 RichYPath didn't support empty range list and empty range actualy meant universal range.
+/// This function emulates this old behavior.
+///
+/// @see https://st.yandex-team.ru/YT-17683
+const TVector<TReadRange>& GetRangesCompat(const TRichYPath& path);
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Statistics about table columns.
+struct TTableColumnarStatistics
+{
+ /// Total data weight for all chunks for each of requested columns.
+ THashMap<TString, i64> ColumnDataWeight;
+
+ /// Total weight of all old chunks that don't keep columnar statistics.
+ i64 LegacyChunksDataWeight = 0;
+
+ /// Timestamps total weight (only for dynamic tables).
+ TMaybe<i64> TimestampTotalWeight;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Description of a partition.
+struct TMultiTablePartition
+{
+ struct TStatistics
+ {
+ i64 ChunkCount = 0;
+ i64 DataWeight = 0;
+ i64 RowCount = 0;
+ };
+
+ /// Ranges of input tables for this partition.
+ TVector<TRichYPath> TableRanges;
+
+ /// Aggregate statistics of all the table ranges in the partition.
+ TStatistics AggregateStatistics;
+};
+
+/// Table partitions from GetTablePartitions command.
+struct TMultiTablePartitions
+{
+ /// Disjoint partitions into which the input tables were divided.
+ TVector<TMultiTablePartition> Partitions;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Contains information about tablet
+///
+/// Every field has a default member initializer, so a default-constructed object is fully defined.
+///
+/// @see NYT::IClient::GetTabletInfos
+struct TTabletInfo
+{
+    ///
+    /// @brief Indicates the total number of rows added to the tablet (including trimmed ones).
+    ///
+    /// Currently only provided for ordered tablets.
+    i64 TotalRowCount = 0;
+
+    ///
+    /// @brief Contains the number of front rows that are trimmed and are not guaranteed to be accessible.
+    ///
+    /// Only makes sense for ordered tablet.
+    i64 TrimmedRowCount = 0;
+
+    ///
+    /// @brief Tablet cell barrier timestamp, which lags behind the current timestamp
+    ///
+    /// It is guaranteed that all transactions with commit timestamp not exceeding the barrier are fully committed;
+    /// e.g. all their added rows are visible (and are included in @ref NYT::TTabletInfo::TotalRowCount).
+    /// Mostly makes sense for ordered tablets.
+    ///
+    /// Defaults to 0, consistently with the other counters, so that reading it from a
+    /// default-constructed object never yields an indeterminate value.
+    ui64 BarrierTimestamp = 0;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// List of attributes to retrieve in operations like @ref NYT::ICypressClient::Get
+struct TAttributeFilter
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TAttributeFilter;
+ /// @endcond
+
+ /// List of attributes.
+ FLUENT_VECTOR_FIELD(TString, Attribute);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Check if none of the fields of @ref NYT::TReadLimit is set.
+///
+/// @return true if none of the fields of readLimit is set and false otherwise.
+bool IsTrivial(const TReadLimit& readLimit);
+
+/// Convert yson node type to table schema type
+EValueType NodeTypeToValueType(TNode::EType nodeType);
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Enumeration for specifying how reading from master is performed.
+///
+/// Used in operations like NYT::ICypressClient::Get
+enum class EMasterReadKind : int
+{
+ ///
+ /// @brief Reading from leader.
+ ///
+ /// Should almost never be used since it's expensive and for regular uses has no difference from
+ /// "follower" read.
+ Leader /* "leader" */,
+
+ /// @brief Reading from master follower (default).
+ Follower /* "follower" */,
+ Cache /* "cache" */, ///< String form is "cache". NOTE(review): caching semantics are not described in this header; confirm against server docs.
+ MasterCache /* "master_cache" */, ///< String form is "master_cache". NOTE(review): presumably reads via master cache; confirm.
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @cond Doxygen_Suppress
+namespace NDetail {
+
+// MUST NOT BE USED BY CLIENTS
+// TODO: we should use default GENERATE_ENUM_SERIALIZATION
+TString ToString(EValueType type);
+
+} // namespace NDetail
+/// @endcond
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/common_ut.cpp b/yt/cpp/mapreduce/interface/common_ut.cpp
new file mode 100644
index 0000000000..3f19433816
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/common_ut.cpp
@@ -0,0 +1,303 @@
+#include "common_ut.h"
+
+#include "fluent.h"
+
+#include <yt/cpp/mapreduce/interface/common.h>
+
+#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <library/cpp/yson/node/node_io.h>
+#include <library/cpp/yson/node/node_builder.h>
+
+#include <util/generic/xrange.h>
+
+using namespace NYT;
+
+/// Serializes `obj` with the global `::Save` and returns the produced bytes.
+template <class T>
+TString SaveToString(const T& obj)
+{
+    TString buffer;
+    TStringOutput stream(buffer);
+    ::Save(&stream, obj);
+    return buffer;
+}
+
+/// Restores a default-constructed `T` from the bytes in `s` with the global `::Load`.
+template <class T>
+T LoadFromString(TStringBuf s)
+{
+    T value;
+    TMemoryInput source(s);
+    ::Load(&source, value);
+    return value;
+}
+
+/// Round-trips `obj` through SaveToString / LoadFromString and returns the restored copy.
+template <class T>
+T SaveLoad(const T& obj)
+{
+    const TString serialized = SaveToString(obj);
+    return LoadFromString<T>(serialized);
+}
+
+Y_UNIT_TEST_SUITE(Common)
+{
+ Y_UNIT_TEST(SortColumnsLegacy) // Variadic construction and Add() on TSortColumns using plain column names only.
+ {
+ TSortColumns keys1("a", "b");
+ UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b"}));
+
+ keys1.Add("c", "d"); // Add() on an lvalue appends to keys1 itself.
+ UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b", "c", "d"}));
+
+ auto keys2 = TSortColumns(keys1).Add("e", "f"); // Add() on a temporary copy: keys1 must stay unchanged (checked next).
+ UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b", "c", "d"}));
+ UNIT_ASSERT((keys2.Parts_ == TSortColumns{"a", "b", "c", "d", "e", "f"}));
+
+ auto keys3 = TSortColumns(keys1).Add("e").Add("f").Add("g"); // Chained single-argument Add() calls.
+ UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b", "c", "d"}));
+ UNIT_ASSERT((keys3.Parts_ == TSortColumns{"a", "b", "c", "d", "e", "f", "g"}));
+ }
+
+ Y_UNIT_TEST(SortColumn) // Construction, comparison, string conversions, YSON form and Save/Load of a single TSortColumn.
+ {
+ auto ascending = TSortColumn("a"); // Sort order defaults to ascending (asserted below).
+ UNIT_ASSERT_VALUES_EQUAL(ascending.Name(), "a");
+ UNIT_ASSERT_VALUES_EQUAL(ascending.SortOrder(), ESortOrder::SO_ASCENDING);
+ UNIT_ASSERT_VALUES_EQUAL(ascending, TSortColumn("a", ESortOrder::SO_ASCENDING));
+ UNIT_ASSERT_VALUES_UNEQUAL(ascending, TSortColumn("a", ESortOrder::SO_DESCENDING));
+
+ UNIT_ASSERT_NO_EXCEPTION(ascending.EnsureAscending());
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<TString>(ascending), "a"); // Ascending column converts to its name.
+ UNIT_ASSERT_VALUES_EQUAL(ascending, "a");
+
+ auto another = ascending;
+ UNIT_ASSERT_NO_EXCEPTION(another = "another"); // Assigning a plain name to an ascending column is allowed.
+ UNIT_ASSERT_VALUES_EQUAL(another.Name(), "another");
+ UNIT_ASSERT_VALUES_EQUAL(another.SortOrder(), ESortOrder::SO_ASCENDING);
+ UNIT_ASSERT_VALUES_EQUAL(another, TSortColumn("another", ESortOrder::SO_ASCENDING));
+ UNIT_ASSERT_VALUES_UNEQUAL(another, TSortColumn("another", ESortOrder::SO_DESCENDING));
+
+ auto ascendingNode = BuildYsonNodeFluently().Value(ascending); // Ascending column is written as a bare string node.
+ UNIT_ASSERT_VALUES_EQUAL(ascendingNode, TNode("a"));
+
+ UNIT_ASSERT_VALUES_EQUAL(SaveLoad(ascending), ascending);
+ UNIT_ASSERT_VALUES_UNEQUAL(SaveToString(ascending), SaveToString(TString("a"))); // Binary form differs from that of a bare TString.
+
+ auto descending = TSortColumn("a", ESortOrder::SO_DESCENDING);
+ UNIT_ASSERT_VALUES_EQUAL(descending.Name(), "a");
+ UNIT_ASSERT_VALUES_EQUAL(descending.SortOrder(), ESortOrder::SO_DESCENDING);
+ UNIT_ASSERT_VALUES_EQUAL(descending, TSortColumn("a", ESortOrder::SO_DESCENDING));
+ UNIT_ASSERT_VALUES_UNEQUAL(descending, TSortColumn("a", ESortOrder::SO_ASCENDING));
+
+ UNIT_ASSERT_EXCEPTION(descending.EnsureAscending(), yexception); // Every string-like shortcut must throw for a descending column.
+ UNIT_ASSERT_EXCEPTION(static_cast<TString>(descending), yexception);
+ UNIT_ASSERT_EXCEPTION(descending == "a", yexception);
+ UNIT_ASSERT_EXCEPTION(descending = "a", yexception);
+
+ auto descendingNode = BuildYsonNodeFluently().Value(descending); // Descending column is written as a map with name and sort_order.
+ UNIT_ASSERT_VALUES_EQUAL(descendingNode, TNode()("name", "a")("sort_order", "descending"));
+
+ UNIT_ASSERT_VALUES_EQUAL(SaveLoad(descending), descending);
+ UNIT_ASSERT_VALUES_UNEQUAL(SaveToString(descending), SaveToString("a"));
+
+ UNIT_ASSERT_VALUES_EQUAL(ToString(TSortColumn("blah")), "blah");
+ UNIT_ASSERT_VALUES_EQUAL(ToString(TSortColumn("blah", ESortOrder::SO_DESCENDING)), "{\"name\"=\"blah\";\"sort_order\"=\"descending\"}");
+ }
+
+ Y_UNIT_TEST(SortColumns) // TSortColumns with ascending-only and mixed sort orders.
+ {
+ TSortColumns ascending("a", "b");
+ UNIT_ASSERT(ascending.Parts_ == (TSortColumns{"a", "b"}));
+ UNIT_ASSERT_NO_EXCEPTION(ascending.EnsureAscending());
+ UNIT_ASSERT_VALUES_EQUAL(static_cast<TColumnNames>(ascending).Parts_, (TVector<TString>{"a", "b"})); // Ascending-only list converts to plain column names.
+ UNIT_ASSERT_VALUES_EQUAL(ascending.GetNames(), (TVector<TString>{"a", "b"}));
+
+ auto mixed = ascending;
+ mixed.Add(TSortColumn("c", ESortOrder::SO_DESCENDING), "d"); // Add() accepts a mix of TSortColumn objects and plain names.
+ UNIT_ASSERT((mixed.Parts_ != TVector<TSortColumn>{"a", "b", "c", "d"}));
+ UNIT_ASSERT((mixed.Parts_ == TVector<TSortColumn>{"a", "b", TSortColumn("c", ESortOrder::SO_DESCENDING), "d"}));
+ UNIT_ASSERT_VALUES_EQUAL(mixed.GetNames(), (TVector<TString>{"a", "b", "c", "d"})); // GetNames() works regardless of sort order.
+ UNIT_ASSERT_EXCEPTION(mixed.EnsureAscending(), yexception);
+ UNIT_ASSERT_EXCEPTION(static_cast<TColumnNames>(mixed), yexception); // Conversion to names throws once a descending column is present.
+ }
+
+ Y_UNIT_TEST(KeyBound)
+ {
+ auto keyBound = TKeyBound(ERelation::Greater, TKey(7, "a", TNode()("x", "y")));
+ UNIT_ASSERT_VALUES_EQUAL(keyBound.Relation(), ERelation::Greater);
+ UNIT_ASSERT_EQUAL(keyBound.Key(), TKey(7, "a", TNode()("x", "y")));
+
+ auto keyBound1 = TKeyBound().Relation(ERelation::Greater).Key(TKey(7, "a", TNode()("x", "y")));
+ auto expectedNode = TNode()
+ .Add(">")
+ .Add(TNode().Add(7).Add("a").Add(TNode()("x", "y")));
+
+ UNIT_ASSERT_VALUES_EQUAL(expectedNode, BuildYsonNodeFluently().Value(keyBound));
+ UNIT_ASSERT_VALUES_EQUAL(expectedNode, BuildYsonNodeFluently().Value(keyBound1));
+
+ keyBound.Relation(ERelation::LessOrEqual);
+ keyBound.Key(TKey("A", 7));
+ UNIT_ASSERT_VALUES_EQUAL(keyBound.Relation(), ERelation::LessOrEqual);
+ UNIT_ASSERT_EQUAL(keyBound.Key(), TKey("A", 7));
+
+ UNIT_ASSERT_VALUES_EQUAL(
+ BuildYsonNodeFluently().Value(keyBound),
+ TNode()
+ .Add("<=")
+ .Add(TNode().Add("A").Add(7)));
+ }
+
+ Y_UNIT_TEST(TTableSchema) // TTableSchema::SortBy: key columns move to the front and become ascending.
+ {
+ TTableSchema schema;
+ schema
+ .AddColumn(TColumnSchema().Name("a").Type(EValueType::VT_STRING).SortOrder(SO_ASCENDING))
+ .AddColumn(TColumnSchema().Name("b").Type(EValueType::VT_UINT64))
+ .AddColumn(TColumnSchema().Name("c").Type(EValueType::VT_INT64));
+ auto checkSortBy = [](TTableSchema schema, const TVector<TString>& columns) { // Works on a by-value copy, so the outer schema is never modified.
+ auto initialSchema = schema;
+ schema.SortBy(columns);
+ for (auto i: xrange(columns.size())) { // Leading columns must be exactly the requested ones, in order, all ascending.
+ UNIT_ASSERT_VALUES_EQUAL(schema.Columns()[i].Name(), columns[i]);
+ UNIT_ASSERT_VALUES_EQUAL(schema.Columns()[i].SortOrder(), ESortOrder::SO_ASCENDING);
+ }
+ for (auto i: xrange(columns.size(), (size_t)initialSchema.Columns().size())) { // Remaining columns must carry no sort order.
+ UNIT_ASSERT_VALUES_EQUAL(schema.Columns()[i].SortOrder(), Nothing());
+ }
+ UNIT_ASSERT_VALUES_EQUAL(initialSchema.Columns().size(), schema.Columns().size()); // SortBy neither adds nor drops columns.
+ return schema;
+ };
+ auto newSchema = checkSortBy(schema, {"b"});
+ UNIT_ASSERT_VALUES_EQUAL(newSchema.Columns()[1].Name(), TString("a")); // Non-key columns keep their relative order: "a" before "c".
+ UNIT_ASSERT_VALUES_EQUAL(newSchema.Columns()[2].Name(), TString("c"));
+ checkSortBy(schema, {"b", "c"});
+ checkSortBy(schema, {"c", "a"});
+ UNIT_ASSERT_EXCEPTION(checkSortBy(schema, {"b", "b"}), yexception); // Duplicate column name is rejected.
+ UNIT_ASSERT_EXCEPTION(checkSortBy(schema, {"a", "junk"}), yexception); // Unknown column name is rejected.
+ }
+
+ Y_UNIT_TEST(TColumnSchema_TypeV3) // How a type_v3 passed to Type() is reflected in the legacy Type()/Required() pair.
+ {
+ { // Non-optional simple type: required, same value type.
+ auto column = TColumnSchema().Type(NTi::Interval());
+ UNIT_ASSERT_VALUES_EQUAL(column.Required(), true);
+ UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_INTERVAL);
+ }
+ { // Optional of a simple type: not required, underlying value type.
+ auto column = TColumnSchema().Type(NTi::Optional(NTi::Date()));
+ UNIT_ASSERT_VALUES_EQUAL(column.Required(), false);
+ UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_DATE);
+ }
+ { // Null is reported as a non-required VT_NULL column.
+ auto column = TColumnSchema().Type(NTi::Null());
+ UNIT_ASSERT_VALUES_EQUAL(column.Required(), false);
+ UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_NULL);
+ }
+ { // Optional<Null> is reported as a non-required VT_ANY column.
+ auto column = TColumnSchema().Type(NTi::Optional(NTi::Null()));
+ UNIT_ASSERT_VALUES_EQUAL(column.Required(), false);
+ UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_ANY);
+ }
+ }
+
+ Y_UNIT_TEST(ToTypeV3)
+ {
+ UNIT_ASSERT_VALUES_EQUAL(*ToTypeV3(VT_INT32, true), *NTi::Int32());
+ UNIT_ASSERT_VALUES_EQUAL(*ToTypeV3(VT_UTF8, false), *NTi::Optional(NTi::Utf8()));
+ }
+
+ Y_UNIT_TEST(DeserializeColumn) // Deserialize() of TColumnSchema from YSON, with and without an explicit type_v3.
+ {
+ auto deserialize = [] (TStringBuf yson) { // Parses the YSON text and deserializes it into a fresh TColumnSchema.
+ auto node = NodeFromYsonString(yson);
+ TColumnSchema column;
+ Deserialize(column, node);
+ return column;
+ };
+
+ auto column = deserialize("{name=foo; type=int64; required=%false}"); // Legacy type + required=%false must map to Optional<Int64>.
+ UNIT_ASSERT_VALUES_EQUAL(column.Name(), "foo");
+ UNIT_ASSERT_VALUES_EQUAL(*column.TypeV3(), *NTi::Optional(NTi::Int64()));
+
+ column = deserialize("{name=bar; type=utf8; required=%true; type_v3=utf8}"); // Both legacy fields and type_v3 are present and agree.
+ UNIT_ASSERT_VALUES_EQUAL(column.Name(), "bar");
+ UNIT_ASSERT_VALUES_EQUAL(*column.TypeV3(), *NTi::Utf8());
+ }
+
+ Y_UNIT_TEST(ColumnSchemaEquality)
+ {
+ auto base = TColumnSchema()
+ .Name("col")
+ .TypeV3(NTi::Optional(NTi::List(NTi::String())))
+ .SortOrder(ESortOrder::SO_ASCENDING)
+ .Lock("lock")
+ .Expression("x + 12")
+ .Aggregate("sum")
+ .Group("group");
+
+ auto other = base;
+ ASSERT_SERIALIZABLES_EQUAL(other, base);
+ other.Name("other");
+ ASSERT_SERIALIZABLES_UNEQUAL(other, base);
+
+ other = base;
+ other.TypeV3(NTi::List(NTi::String()));
+ ASSERT_SERIALIZABLES_UNEQUAL(other, base);
+
+ other = base;
+ other.ResetSortOrder();
+ ASSERT_SERIALIZABLES_UNEQUAL(other, base);
+
+ other = base;
+ other.Lock("lock1");
+ ASSERT_SERIALIZABLES_UNEQUAL(other, base);
+
+ other = base;
+ other.Expression("x + 13");
+ ASSERT_SERIALIZABLES_UNEQUAL(other, base);
+
+ other = base;
+ other.ResetAggregate();
+ ASSERT_SERIALIZABLES_UNEQUAL(other, base);
+
+ other = base;
+ other.Group("group1");
+ ASSERT_SERIALIZABLES_UNEQUAL(other, base);
+ }
+
+ Y_UNIT_TEST(TableSchemaEquality)
+ {
+ auto col1 = TColumnSchema()
+ .Name("col1")
+ .TypeV3(NTi::Optional(NTi::List(NTi::String())))
+ .SortOrder(ESortOrder::SO_ASCENDING);
+
+ auto col2 = TColumnSchema()
+ .Name("col2")
+ .TypeV3(NTi::Uint32());
+
+ auto schema = TTableSchema()
+ .AddColumn(col1)
+ .AddColumn(col2)
+ .Strict(true)
+ .UniqueKeys(true);
+
+ auto other = schema;
+ ASSERT_SERIALIZABLES_EQUAL(other, schema);
+
+ other.Strict(false);
+ ASSERT_SERIALIZABLES_UNEQUAL(other, schema);
+
+ other = schema;
+ other.MutableColumns()[0].TypeV3(NTi::List(NTi::String()));
+ ASSERT_SERIALIZABLES_UNEQUAL(other, schema);
+
+ other = schema;
+ other.MutableColumns().push_back(col1);
+ ASSERT_SERIALIZABLES_UNEQUAL(other, schema);
+
+ other = schema;
+ other.UniqueKeys(false);
+ ASSERT_SERIALIZABLES_UNEQUAL(other, schema);
+ }
+}
diff --git a/yt/cpp/mapreduce/interface/common_ut.h b/yt/cpp/mapreduce/interface/common_ut.h
new file mode 100644
index 0000000000..6f70f09bee
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/common_ut.h
@@ -0,0 +1 @@
+#pragma once
diff --git a/yt/cpp/mapreduce/interface/config.cpp b/yt/cpp/mapreduce/interface/config.cpp
new file mode 100644
index 0000000000..b474dc0844
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/config.cpp
@@ -0,0 +1,321 @@
+#include "config.h"
+
+#include "operation.h"
+
+#include <yt/cpp/mapreduce/interface/logging/yt_log.h>
+
+#include <library/cpp/json/json_reader.h>
+#include <library/cpp/svnversion/svnversion.h>
+
+#include <library/cpp/yson/node/node_builder.h>
+#include <library/cpp/yson/node/node_io.h>
+
+#include <library/cpp/yson/json/yson2json_adapter.h>
+
+#include <util/string/strip.h>
+#include <util/folder/dirut.h>
+#include <util/folder/path.h>
+#include <util/stream/file.h>
+#include <util/generic/singleton.h>
+#include <util/string/builder.h>
+#include <util/string/cast.h>
+#include <util/string/type.h>
+#include <util/system/hostname.h>
+#include <util/system/user.h>
+#include <util/system/env.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Reads boolean environment variable `var`.
+///
+/// An unset or empty variable yields `defaultValue`; any other value is interpreted by `IsTrue`.
+bool TConfig::GetBool(const char* var, bool defaultValue)
+{
+    const TString raw = GetEnv(var, "");
+    return raw.empty() ? defaultValue : IsTrue(raw);
+}
+
+/// Reads integer environment variable `var`.
+///
+/// An unset or empty variable yields `defaultValue`.
+///
+/// @throws yexception if the value cannot be parsed as `int`; the message names the variable and its value.
+int TConfig::GetInt(const char* var, int defaultValue)
+{
+    const TString raw = GetEnv(var, "");
+    if (raw.empty()) {
+        return defaultValue;
+    }
+
+    try {
+        return FromString<int>(raw);
+    } catch (const yexception& e) {
+        ythrow yexception() << "Cannot parse " << var << '=' << raw << " as integer: " << e.what();
+    }
+}
+
+/// Reads a duration expressed in whole seconds from environment variable `var`.
+///
+/// The value is parsed by GetInt(); when the variable is unset or empty the result is
+/// `defaultValue` expressed through its `Seconds()` value.
+TDuration TConfig::GetDuration(const char* var, TDuration defaultValue)
+{
+    const int seconds = GetInt(var, defaultValue.Seconds());
+    return TDuration::Seconds(seconds);
+}
+
+/// Reads an encoding name from environment variable `var` ("identity" when the variable is unset).
+///
+/// @throws yexception if the name does not parse as an EEncoding value.
+EEncoding TConfig::GetEncoding(const char* var)
+{
+    const TString encodingName = GetEnv(var, "identity");
+
+    EEncoding encoding;
+    if (!TryFromString(encodingName, encoding)) {
+        ythrow yexception() << var << ": encoding '" << encodingName << "' is not supported";
+    }
+    return encoding;
+}
+
+ EUploadDeduplicationMode TConfig::GetUploadingDeduplicationMode( // Reads an EUploadDeduplicationMode from environment variable `var`.
+ const char* var,
+ EUploadDeduplicationMode defaultValue)
+{
+ const TString deduplicationMode = GetEnv(var, TEnumTraits<EUploadDeduplicationMode>::ToString(defaultValue)); // Unset variable falls back to the string form of defaultValue.
+ return TEnumTraits<EUploadDeduplicationMode>::FromString(deduplicationMode); // NOTE(review): presumably throws on an unknown name — confirm against TEnumTraits.
+}
+
+/// Checks that every character of `token` lies in the range 0x21..0x7e.
+///
+/// An empty token passes the check.
+///
+/// @throws yexception naming the first offending character and its position.
+void TConfig::ValidateToken(const TString& token)
+{
+    size_t position = 0;
+    for (const ui8 symbol : token) {
+        const bool allowed = 0x21 <= symbol && symbol <= 0x7e;
+        if (!allowed) {
+            ythrow yexception() << "Incorrect token character '" << symbol << "' at position " << position;
+        }
+        ++position;
+    }
+}
+
+/// Returns the stripped contents of `tokenPath`, or an empty string when `IsFile()` is false for it.
+TString TConfig::LoadTokenFromFile(const TString& tokenPath)
+{
+    TFsPath path(tokenPath);
+    if (!path.IsFile()) {
+        return TString();
+    }
+    return Strip(TIFStream(path).ReadAll());
+}
+
+/// Parses JSON text `strSpec` into a TNode.
+///
+/// @throws yexception (via Y_ENSURE) if the text is not valid JSON or its top-level value is not a map.
+TNode TConfig::LoadJsonSpec(const TString& strSpec)
+{
+    TNode parsed;
+    TStringInput jsonStream(strSpec);
+    TNodeBuilder nodeBuilder(&parsed);
+    TYson2JsonCallbacksAdapter jsonCallbacks(&nodeBuilder);
+
+    Y_ENSURE(NJson::ReadJson(&jsonStream, &jsonCallbacks), "Cannot parse json spec: " << strSpec);
+    Y_ENSURE(parsed.IsMap(), "Json spec is not a map");
+
+    return parsed;
+}
+
+/// Builds a TRichYPath from the YSON text `ysonMap`, which is used as the attributes of an empty path.
+///
+/// @throws yexception if `ysonMap` cannot be parsed as YSON.
+TRichYPath TConfig::LoadApiFilePathOptions(const TString& ysonMap)
+{
+    TNode pathAttributes;
+    try {
+        pathAttributes = NodeFromYsonString(ysonMap);
+    } catch (const yexception& exc) {
+        ythrow yexception() << "Failed to parse YT_API_FILE_PATH_OPTIONS (it must be yson map): " << exc;
+    }
+
+    // The path string itself is a placeholder; only the attributes carry information.
+    TNode emptyPathNode = "";
+    emptyPathNode.Attributes() = pathAttributes;
+
+    TRichYPath options;
+    Deserialize(options, emptyPathNode);
+    return options;
+}
+
+void TConfig::LoadToken() // Token source priority: YT_TOKEN, YT_SECURE_VAULT_YT_TOKEN, file named by YT_TOKEN_PATH, <home>/.yt/token.
+{
+ if (auto envToken = GetEnv("YT_TOKEN")) {
+ Token = envToken;
+ } else if (auto envToken = GetEnv("YT_SECURE_VAULT_YT_TOKEN")) {
+ // If this code runs inside a vanilla operation in YT
+ // it should not use regular environment variable `YT_TOKEN`
+ // because it would be visible in UI.
+ // Token should be passed via `secure_vault` parameter in operation spec.
+ Token = envToken;
+ } else if (auto tokenPath = GetEnv("YT_TOKEN_PATH")) {
+ Token = LoadTokenFromFile(tokenPath);
+ } else {
+ Token = LoadTokenFromFile(GetHomeDir() + "/.yt/token"); // Yields an empty token when the file does not exist.
+ }
+ ValidateToken(Token); // Throws on characters outside 0x21..0x7e; an empty token passes.
+}
+
+/// Loads `Spec` from YT_SPEC and `TableWriter` from YT_TABLE_WRITER.
+///
+/// Each variable is parsed by LoadJsonSpec(); a missing variable is treated as "{}".
+void TConfig::LoadSpec()
+{
+    Spec = LoadJsonSpec(GetEnv("YT_SPEC", "{}"));
+    TableWriter = LoadJsonSpec(GetEnv("YT_TABLE_WRITER", "{}"));
+}
+
+void TConfig::LoadTimings() // Sets all timeout/interval members; environment overrides are read as whole seconds via GetDuration().
+{
+ ConnectTimeout = GetDuration("YT_CONNECT_TIMEOUT",
+ TDuration::Seconds(10));
+
+ SocketTimeout = GetDuration("YT_SOCKET_TIMEOUT",
+ GetDuration("YT_SEND_RECEIVE_TIMEOUT", // common; serves as the default when YT_SOCKET_TIMEOUT is unset or empty
+ TDuration::Seconds(60)));
+
+ AddressCacheExpirationTimeout = TDuration::Minutes(15); // Fixed value, no environment override.
+
+ CacheLockTimeoutPerGb = TDuration::MilliSeconds(1000.0 * 1_GB * 8 / 20_MB); // Time to transfer 1 GB at 20 Mbit/s (20 Mbps = 20 MBps / 8).
+
+ TxTimeout = GetDuration("YT_TX_TIMEOUT",
+ TDuration::Seconds(120));
+
+ PingTimeout = GetDuration("YT_PING_TIMEOUT",
+ TDuration::Seconds(5));
+
+ PingInterval = GetDuration("YT_PING_INTERVAL",
+ TDuration::Seconds(5));
+
+ WaitLockPollInterval = TDuration::Seconds(5); // Fixed value, no environment override.
+
+ RetryInterval = GetDuration("YT_RETRY_INTERVAL",
+ TDuration::Seconds(3));
+
+ ChunkErrorsRetryInterval = GetDuration("YT_CHUNK_ERRORS_RETRY_INTERVAL",
+ TDuration::Seconds(60));
+
+ RateLimitExceededRetryInterval = GetDuration("YT_RATE_LIMIT_EXCEEDED_RETRY_INTERVAL",
+ TDuration::Seconds(60));
+
+ StartOperationRetryInterval = GetDuration("YT_START_OPERATION_RETRY_INTERVAL",
+ TDuration::Seconds(60));
+
+ HostListUpdateInterval = TDuration::Seconds(60); // Fixed value, no environment override.
+}
+
+void TConfig::Reset() // Re-reads settings from environment variables and restores hard-coded defaults for the members assigned below.
+{ // NOTE(review): members that only have in-class initializers in config.h (e.g. OperationTrackerPollPeriod, FileCacheReplicationFactor) are not assigned here — confirm this is intended.
+ Hosts = GetEnv("YT_HOSTS", "hosts");
+ Pool = GetEnv("YT_POOL");
+ Prefix = GetEnv("YT_PREFIX");
+ ApiVersion = GetEnv("YT_VERSION", "v3");
+ LogLevel = GetEnv("YT_LOG_LEVEL", "error");
+
+ ContentEncoding = GetEncoding("YT_CONTENT_ENCODING");
+ AcceptEncoding = GetEncoding("YT_ACCEPT_ENCODING");
+
+ GlobalTxId = GetEnv("YT_TRANSACTION", "");
+
+ UseAsyncTxPinger = false;
+ AsyncHttpClientThreads = 1;
+ AsyncTxPingerPoolThreads = 1;
+
+ ForceIpV4 = GetBool("YT_FORCE_IPV4");
+ ForceIpV6 = GetBool("YT_FORCE_IPV6");
+ UseHosts = GetBool("YT_USE_HOSTS", true);
+
+ LoadToken(); // May throw if the token contains invalid characters.
+ LoadSpec(); // May throw if YT_SPEC or YT_TABLE_WRITER is not a JSON map.
+ LoadTimings();
+
+ CacheUploadDeduplicationMode = GetUploadingDeduplicationMode("YT_UPLOAD_DEDUPLICATION", EUploadDeduplicationMode::Host);
+
+ RetryCount = Max(GetInt("YT_RETRY_COUNT", 10), 1); // Retry counts are clamped to at least 1.
+ ReadRetryCount = Max(GetInt("YT_READ_RETRY_COUNT", 30), 1);
+ StartOperationRetryCount = Max(GetInt("YT_START_OPERATION_RETRY_COUNT", 30), 1);
+
+ RemoteTempFilesDirectory = GetEnv("YT_FILE_STORAGE",
+ "//tmp/yt_wrapper/file_storage");
+ RemoteTempTablesDirectory = GetEnv("YT_TEMP_TABLES_STORAGE",
+ "//tmp/yt_wrapper/table_storage");
+ RemoteTempTablesDirectory = GetEnv("YT_TEMP_DIR", // YT_TEMP_DIR, when set, takes precedence over YT_TEMP_TABLES_STORAGE.
+ RemoteTempTablesDirectory);
+
+ InferTableSchema = false;
+
+ UseClientProtobuf = GetBool("YT_USE_CLIENT_PROTOBUF", false);
+ NodeReaderFormat = ENodeReaderFormat::Auto;
+ ProtobufFormatWithDescriptors = true;
+
+ MountSandboxInTmpfs = GetBool("YT_MOUNT_SANDBOX_IN_TMPFS");
+
+ ApiFilePathOptions = LoadApiFilePathOptions(GetEnv("YT_API_FILE_PATH_OPTIONS", "{}"));
+
+ ConnectionPoolSize = GetInt("YT_CONNECTION_POOL_SIZE", 16);
+
+ TraceHttpRequestsMode = FromString<ETraceHttpRequestsMode>(to_lower(GetEnv("YT_TRACE_HTTP_REQUESTS", "never"))); // Value is lower-cased before parsing, so the variable is case-insensitive.
+
+ CommandsWithFraming = { // Replaces any previously configured set.
+ "read_table",
+ "get_table_columnar_statistics",
+ "get_job_input",
+ "concatenate",
+ "partition_tables",
+ };
+}
+
+TConfig::TConfig()
+{
+ Reset();
+}
+
+TConfigPtr TConfig::Get() // Returns the shared config object held by a Singleton<> holder.
+{
+ struct TConfigHolder // Wraps the intrusive pointer so that it can be stored via Singleton<>.
+ {
+ TConfigHolder()
+ : Config(::MakeIntrusive<TConfig>()) // TConfig's constructor calls Reset(), so the environment is read here.
+ { }
+
+ TConfigPtr Config;
+ };
+
+ return Singleton<TConfigHolder>()->Config;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Collects information about the current process: host name, user name, pid and client version.
+///
+/// The host name is taken from `::FQDNHostName()`, falling back to `::HostName()` if that throws.
+///
+/// @throws yexception if neither host name lookup succeeds or the user name cannot be obtained.
+TProcessState::TProcessState()
+{
+    try {
+        FqdnHostName = ::FQDNHostName();
+    } catch (const yexception& fqdnError) {
+        try {
+            FqdnHostName = ::HostName();
+        } catch (const yexception& hostError) {
+            // Report both failures: the message claims that both lookups failed,
+            // so the reason of the first one must not be dropped.
+            ythrow yexception() << "Cannot get fqdn and host name: " << hostError.what()
+                << " (fqdn lookup failed with: " << fqdnError.what() << ")";
+        }
+    }
+
+    try {
+        UserName = ::GetUsername();
+    } catch (const yexception& e) {
+        ythrow yexception() << "Cannot get user name: " << e.what();
+    }
+
+    Pid = static_cast<int>(getpid());
+
+    if (!ClientVersion) {
+        ClientVersion = ::TStringBuilder() << "YT C++ native " << GetProgramCommitId();
+    }
+}
+
+/// Hides strings that look like tokens.
+///
+/// If `input` contains "AQAD-" anywhere, a string of '*' characters of the same length is returned;
+/// otherwise `input` is returned unchanged.
+static TString CensorString(TString input)
+{
+    static const TString prefix = "AQAD-";
+    const bool looksLikeToken = input.find(prefix) != TString::npos;
+    if (looksLikeToken) {
+        return TString(input.size(), '*');
+    }
+    return input;
+}
+
+/// Appends the first `argc` entries of `argv` to CommandLine and their censored
+/// counterparts (see CensorString) to CensoredCommandLine.
+///
+/// Nothing is cleared beforehand, so repeated calls accumulate arguments.
+void TProcessState::SetCommandLine(int argc, const char* argv[])
+{
+    for (int index = 0; index < argc; ++index) {
+        const TString argument = argv[index];
+        CommandLine.push_back(argument);
+        CensoredCommandLine.push_back(CensorString(argument));
+    }
+}
+
+TProcessState* TProcessState::Get()
+{
+ return Singleton<TProcessState>();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/config.h b/yt/cpp/mapreduce/interface/config.h
new file mode 100644
index 0000000000..c44ad25f1c
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/config.h
@@ -0,0 +1,228 @@
+#pragma once
+
+#include "fwd.h"
+#include "common.h"
+#include "node.h"
+
+#include <library/cpp/yt/misc/enum.h>
+
+#include <util/generic/maybe.h>
+#include <util/generic/string.h>
+#include <util/generic/hash_set.h>
+
+#include <util/datetime/base.h>
+
+namespace NYT {
+
+enum EEncoding : int
+{
+ E_IDENTITY /* "identity" */,
+ E_GZIP /* "gzip" */,
+ E_BROTLI /* "br" */,
+ E_Z_LZ4 /* "z-lz4" */,
+};
+
+enum class ENodeReaderFormat : int
+{
+ Yson, // Always use YSON format,
+ Skiff, // Always use Skiff format, throw exception if it's not possible (non-strict schema, dynamic table etc.)
+ Auto, // Use Skiff format if it's possible, YSON otherwise
+};
+
+enum class ETraceHttpRequestsMode
+{
+ // Never dump http requests.
+ Never /* "never" */,
+ // Dump failed http requests.
+ Error /* "error" */,
+ // Dump all http requests.
+ Always /* "always" */,
+};
+
+DEFINE_ENUM(EUploadDeduplicationMode,
+ // For each file only one process' thread from all possible hosts can upload it to the file cache at the same time.
+ // The others will wait for the uploading to finish and use already cached file.
+ ((Global) (0))
+
+ // For each file and each particular host only one process' thread can upload it to the file cache at the same time.
+ // The others will wait for the uploading to finish and use already cached file.
+ ((Host) (1))
+
+ // All processes' threads will upload a file to the cache concurrently.
+ ((Disabled) (2))
+);
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct TConfig
+ : public TThrRefBase
+{
+ TString Hosts;
+ TString Pool;
+ TString Token;
+ TString Prefix;
+ TString ApiVersion;
+ TString LogLevel;
+
+ // Compression for data that is sent to YT cluster.
+ EEncoding ContentEncoding;
+
+ // Compression for data that is read from YT cluster.
+ EEncoding AcceptEncoding;
+
+ TString GlobalTxId;
+
+ bool ForceIpV4;
+ bool ForceIpV6;
+ bool UseHosts;
+
+ TDuration HostListUpdateInterval;
+
+ TNode Spec;
+ TNode TableWriter;
+
+ TDuration ConnectTimeout;
+ TDuration SocketTimeout;
+ TDuration AddressCacheExpirationTimeout;
+ TDuration TxTimeout;
+ TDuration PingTimeout;
+ TDuration PingInterval;
+
+ bool UseAsyncTxPinger;
+ int AsyncHttpClientThreads;
+ int AsyncTxPingerPoolThreads;
+
+ // How often should we poll for lock state
+ TDuration WaitLockPollInterval;
+
+ TDuration RetryInterval;
+ TDuration ChunkErrorsRetryInterval;
+
+ TDuration RateLimitExceededRetryInterval;
+ TDuration StartOperationRetryInterval;
+
+ int RetryCount;
+ int ReadRetryCount;
+ int StartOperationRetryCount;
+
+ /// @brief Period for checking status of running operation.
+ TDuration OperationTrackerPollPeriod = TDuration::Seconds(5);
+
+ TString RemoteTempFilesDirectory;
+ TString RemoteTempTablesDirectory;
+
+ //
+ // Infer schemas for nonexistent tables from typed rows (e.g. protobuf)
+ // when writing from operation or client writer.
+ // This option can be overridden in TOperationOptions and TTableWriterOptions.
+ bool InferTableSchema;
+
+ bool UseClientProtobuf;
+ ENodeReaderFormat NodeReaderFormat;
+ bool ProtobufFormatWithDescriptors;
+
+ int ConnectionPoolSize;
+
+ /// Defines replication factor that is used for files that are uploaded to YT
+ /// to use them in operations.
+ int FileCacheReplicationFactor = 10;
+
+ /// @brief Used when waiting for other process which uploads the same file to the file cache.
+ ///
+ /// If CacheUploadDeduplicationMode is not Disabled, current process can wait for some other
+ /// process which is uploading the same file. This value is proportional to the timeout of waiting,
+ /// actual timeout is computed as follows: fileSizeGb * CacheLockTimeoutPerGb.
+ /// Default timeout assumes that host has uploading speed equal to 20 Mb/s.
+ /// If timeout was reached, the file will be uploaded by current process without any other waits.
+ TDuration CacheLockTimeoutPerGb;
+
+ /// @brief Used to prevent concurrent uploading of the same file to the file cache.
+ /// NB: Each mode affects only users with the same mode enabled.
+ EUploadDeduplicationMode CacheUploadDeduplicationMode;
+
+ bool MountSandboxInTmpfs;
+
+ /// @brief Set upload options for files created by library.
+ ///
+ /// Path itself is always ignored but path options (e.g. `BypassArtifactCache`) are used when uploading system files:
+ /// cppbinary, job state, etc
+ TRichYPath ApiFilePathOptions;
+
+ // Testing options, should never be used in user programs.
+ bool UseAbortableResponse = false;
+ bool EnableDebugMetrics = false;
+
+ //
+ // There is optimization used with local YT that enables to skip binary upload and use real binary path.
+ // When EnableLocalModeOptimization is set to false this optimization is completely disabled.
+ bool EnableLocalModeOptimization = true;
+
+ //
+ // If you want to see stderr even if your jobs have not failed, set this to true.
+ bool WriteStderrSuccessfulJobs = false;
+
+ //
+ // This configuration is useful for debug.
+ // If set to ETraceHttpRequestsMode::Error library will dump all failed http requests.
+ // If set to ETraceHttpRequestsMode::Always library will dump all http requests.
+ // All tracing occurs as DEBUG level logging.
+ ETraceHttpRequestsMode TraceHttpRequestsMode = ETraceHttpRequestsMode::Never;
+
+ TString SkynetApiHost;
+
+ // Sets SO_PRIORITY option on the socket
+ TMaybe<int> SocketPriority;
+
+ // Framing settings
+ // (cf. https://yt.yandex-team.ru/docs/description/proxy/http_proxy_reference#framing).
+ THashSet<TString> CommandsWithFraming;
+
+ static bool GetBool(const char* var, bool defaultValue = false);
+ static int GetInt(const char* var, int defaultValue);
+ static TDuration GetDuration(const char* var, TDuration defaultValue);
+ static EEncoding GetEncoding(const char* var);
+ static EUploadDeduplicationMode GetUploadingDeduplicationMode(
+ const char* var,
+ EUploadDeduplicationMode defaultValue);
+
+ static void ValidateToken(const TString& token);
+ static TString LoadTokenFromFile(const TString& tokenPath);
+
+ static TNode LoadJsonSpec(const TString& strSpec);
+
+ static TRichYPath LoadApiFilePathOptions(const TString& ysonMap);
+
+ void LoadToken();
+ void LoadSpec();
+ void LoadTimings();
+
+ void Reset();
+
+ TConfig();
+
+ static TConfigPtr Get();
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct TProcessState // Information about the current process; a shared instance is available through Get().
+{
+ TString FqdnHostName;
+ TString UserName;
+ TVector<TString> CommandLine; // Filled by SetCommandLine(); empty until then.
+
+ // Command line with everything that looks like tokens censored.
+ TVector<TString> CensoredCommandLine;
+ int Pid; // No in-class initializer; the constructor in config.cpp assigns it from getpid().
+ TString ClientVersion;
+
+ TProcessState();
+
+ void SetCommandLine(int argc, const char* argv[]);
+
+ static TProcessState* Get();
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/config_ut.cpp b/yt/cpp/mapreduce/interface/config_ut.cpp
new file mode 100644
index 0000000000..e49ba02108
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/config_ut.cpp
@@ -0,0 +1,20 @@
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <yt/cpp/mapreduce/interface/config.h>
+
+using namespace NYT;
+
+Y_UNIT_TEST_SUITE(ConfigSuite)
+{
+ Y_UNIT_TEST(TestReset) {
+ // very limited test, checks only one config field
+
+ auto origConfig = *TConfig::Get(); // Snapshot (by-value copy) of the global config before any change.
+ TConfig::Get()->Reset(); // Reset() without prior changes must keep Hosts intact.
+ UNIT_ASSERT_VALUES_EQUAL(origConfig.Hosts, TConfig::Get()->Hosts);
+
+ TConfig::Get()->Hosts = "hosts/fb867"; // Modify the global config, then check that Reset() restores Hosts.
+ TConfig::Get()->Reset();
+ UNIT_ASSERT_VALUES_EQUAL(origConfig.Hosts, TConfig::Get()->Hosts);
+ }
+}
diff --git a/yt/cpp/mapreduce/interface/constants.h b/yt/cpp/mapreduce/interface/constants.h
new file mode 100644
index 0000000000..4f70410814
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/constants.h
@@ -0,0 +1,19 @@
+#pragma once
+
+
+#include <util/system/defaults.h>
+
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+// Maximum number of input tables for operation.
+// If greater number of input tables are provided behaviour is undefined
+// (it might work ok or it might fail or it might work very slowly).
+constexpr size_t MaxInputTableCount = 1000;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/cypress.cpp b/yt/cpp/mapreduce/interface/cypress.cpp
new file mode 100644
index 0000000000..53686effd2
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/cypress.cpp
@@ -0,0 +1,24 @@
+#include "cypress.h"
+
+#include "config.h"
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Convenience overload: converts every plain source path into a TRichYPath
+// and forwards the call to the TRichYPath-based Concatenate overload.
+void ICypressClient::Concatenate(
+    const TVector<TYPath>& sourcePaths,
+    const TYPath& destinationPath,
+    const TConcatenateOptions& options)
+{
+    // Each TRichYPath is constructed directly from the corresponding TYPath.
+    const TVector<TRichYPath> promotedPaths(sourcePaths.begin(), sourcePaths.end());
+    Concatenate(promotedPaths, destinationPath, options);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/cypress.h b/yt/cpp/mapreduce/interface/cypress.h
new file mode 100644
index 0000000000..e05316ebc6
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/cypress.h
@@ -0,0 +1,252 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/cypress.h
+///
+/// Header containing interface to execute [Cypress](https://yt.yandex-team.ru/docs/description/common/cypress.html)-related commands.
+
+#include "fwd.h"
+
+#include "client_method_options.h"
+#include "common.h"
+#include "node.h"
+
+#include <util/generic/maybe.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Client interface to execute [Cypress](https://yt.yandex-team.ru/docs/description/common/cypress.html)-related commands.
+class ICypressClient
+{
+public:
+ virtual ~ICypressClient() = default;
+
+ ///
+ /// @brief Create Cypress node of given type.
+ ///
+ /// @param path Path in Cypress to the new object.
+ /// @param type New node type.
+ /// @param options Optional parameters.
+ ///
+ /// @return Id of the created node.
+ ///
+ /// @note All path components but the last one must exist unless @ref NYT::TCreateOptions::Recursive is `true`.
+ ///
+ /// @note The node itself must not exist unless @ref NYT::TCreateOptions::IgnoreExisting or @ref NYT::TCreateOptions::Force is `true`.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#create)
+ virtual TNodeId Create(
+ const TYPath& path,
+ ENodeType type,
+ const TCreateOptions& options = TCreateOptions()) = 0;
+
+ ///
+ /// @brief Create table with schema inferred from the template argument.
+ ///
+ /// @tparam TRowType Type of the C++ representation of the row to be stored in the table.
+ /// @param path Path in Cypress to the new table.
+ /// @param sortColumns List of columns to mark as sorted in schema.
+ /// @param options Optional parameters.
+ ///
+ /// @return Id of the created node.
+ ///
+ /// @note If "schema" is passed in `options.Attributes`, it has priority over the deduced schema (the latter is ignored).
+ template <typename TRowType>
+ TNodeId CreateTable(
+ const TYPath& path,
+ const TSortColumns& sortColumns = TSortColumns(),
+ const TCreateOptions& options = TCreateOptions());
+
+ ///
+ /// @brief Remove Cypress node.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#remove)
+ virtual void Remove(
+ const TYPath& path,
+ const TRemoveOptions& options = TRemoveOptions()) = 0;
+
+ ///
+ /// @brief Check if Cypress node exists.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#exists)
+ virtual bool Exists(
+ const TYPath& path,
+ const TExistsOptions& options = TExistsOptions()) = 0;
+
+ ///
+ /// @brief Get Cypress node contents.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get)
+ virtual TNode Get(
+ const TYPath& path,
+ const TGetOptions& options = TGetOptions()) = 0;
+
+ ///
+ /// @brief Set Cypress node contents.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#set)
+ virtual void Set(
+ const TYPath& path,
+ const TNode& value,
+ const TSetOptions& options = TSetOptions()) = 0;
+
+ ///
+ /// @brief Set multiple attributes for a Cypress path.
+ ///
+ /// @param path Path to the root of the attributes to be set, e.g. "//path/to/table/@";
+ /// it is important to make sure that the path ends with "/@".
+ /// @param attributes Map with the attributes to set.
+ /// @param options Optional parameters.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#multiset_attributes)
+ virtual void MultisetAttributes(
+ const TYPath& path,
+ const TNode::TMapType& attributes,
+ const TMultisetAttributesOptions& options = TMultisetAttributesOptions()) = 0;
+
+ ///
+ /// @brief List Cypress map or attribute node keys.
+ ///
+ /// @param path Path in the tree to the node in question.
+ /// @param options Optional parameters.
+ ///
+ /// @return List of keys with attributes (if they were requested in @ref NYT::TListOptions::AttributeFilter).
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#list)
+ virtual TNode::TListType List(
+ const TYPath& path,
+ const TListOptions& options = TListOptions()) = 0;
+
+ ///
+ /// @brief Copy Cypress node.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#copy)
+ virtual TNodeId Copy(
+ const TYPath& sourcePath,
+ const TYPath& destinationPath,
+ const TCopyOptions& options = TCopyOptions()) = 0;
+
+ ///
+ /// @brief Move Cypress node (equivalent to copy-then-remove).
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#move)
+ virtual TNodeId Move(
+ const TYPath& sourcePath,
+ const TYPath& destinationPath,
+ const TMoveOptions& options = TMoveOptions()) = 0;
+
+ ///
+ /// @brief Create link to Cypress node.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#link)
+ virtual TNodeId Link(
+ const TYPath& targetPath,
+ const TYPath& linkPath,
+ const TLinkOptions& options = TLinkOptions()) = 0;
+
+ ///
+ /// @brief Concatenate several tables into one (paths are given as rich YPaths).
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#concatenate)
+ virtual void Concatenate(
+ const TVector<TRichYPath>& sourcePaths,
+ const TRichYPath& destinationPath,
+ const TConcatenateOptions& options = TConcatenateOptions()) = 0;
+
+ ///
+ /// @brief Concatenate several tables into one (overload taking plain YPaths; not pure virtual).
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#concatenate)
+ virtual void Concatenate(
+ const TVector<TYPath>& sourcePaths,
+ const TYPath& destinationPath,
+ const TConcatenateOptions& options = TConcatenateOptions());
+
+ ///
+ /// @brief Canonize YPath, moving all the complex YPath features to attributes.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#parse-ypath)
+ virtual TRichYPath CanonizeYPath(const TRichYPath& path) = 0;
+
+ ///
+ /// @brief Get statistics for given sets of columns in given table ranges.
+ ///
+ /// @note Paths must contain column selectors.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-table-columnar-statistics)
+ virtual TVector<TTableColumnarStatistics> GetTableColumnarStatistics(
+ const TVector<TRichYPath>& paths,
+ const TGetTableColumnarStatisticsOptions& options = {}) = 0;
+
+ ///
+ /// @brief Divide input tables into disjoint partitions.
+ ///
+ /// The resulting partitions are vectors of rich YPaths.
+ /// Each partition can be given to a separate worker for further independent processing.
+ ///
+ virtual TMultiTablePartitions GetTablePartitions(
+ const TVector<TRichYPath>& paths,
+ const TGetTablePartitionsOptions& options) = 0;
+
+ ///
+ /// @brief Get file from file cache.
+ ///
+ /// @param md5Signature MD5 digest of the file.
+ /// @param cachePath Path to the file cache.
+ /// @param options Optional parameters.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-file-from-cache)
+ virtual TMaybe<TYPath> GetFileFromCache(
+ const TString& md5Signature,
+ const TYPath& cachePath,
+ const TGetFileFromCacheOptions& options = TGetFileFromCacheOptions()) = 0;
+
+ ///
+ /// @brief Put file to file cache.
+ ///
+ /// @param filePath Path in Cypress to the file to cache.
+ /// @param md5Signature Expected MD5 digest of the file.
+ /// @param cachePath Path to the file cache.
+ /// @param options Optional parameters.
+ ///
+ /// @note The file in `filePath` must have been written with @ref NYT::TFileWriterOptions::ComputeMD5 set to `true`.
+ ///
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#put-file-to-cache)
+ virtual TYPath PutFileToCache(
+ const TYPath& filePath,
+ const TString& md5Signature,
+ const TYPath& cachePath,
+ const TPutFileToCacheOptions& options = TPutFileToCacheOptions()) = 0;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Creates a table node whose schema is deduced from the protobuf type TRowType,
+/// unless the caller already supplied a "schema" attribute in `options`.
+template <typename TRowType>
+TNodeId ICypressClient::CreateTable(
+    const TYPath& path,
+    const TSortColumns& sortColumns,
+    const TCreateOptions& options)
+{
+    static_assert(
+        std::is_base_of_v<::google::protobuf::Message, TRowType>,
+        "TRowType must be inherited from google::protobuf::Message");
+
+    // Work on a private copy so that the caller's options stay untouched.
+    TCreateOptions patchedOptions(options);
+    auto& attributes = patchedOptions.Attributes_;
+    if (!attributes) {
+        attributes = TNode::CreateMap();
+    }
+
+    // A user-provided "schema" attribute wins; the schema is deduced (and
+    // CreateTableSchema is invoked) only when no such attribute is present.
+    const bool schemaProvided = attributes->HasKey("schema");
+    if (!schemaProvided) {
+        attributes->AsMap().emplace(
+            "schema",
+            CreateTableSchema<TRowType>(sortColumns).ToNode());
+    }
+
+    return Create(path, ENodeType::NT_TABLE, patchedOptions);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/error_codes.h b/yt/cpp/mapreduce/interface/error_codes.h
new file mode 100644
index 0000000000..d8d76e04fd
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/error_codes.h
@@ -0,0 +1,468 @@
+#pragma once
+
+//
+// generated by generate-error-codes.py
+//
+
+namespace NYT {
+namespace NClusterErrorCodes {
+
+
+
+// from ./core/misc/public.h
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int OK = 0;
+ constexpr int Generic = 1;
+ constexpr int Canceled = 2;
+ constexpr int Timeout = 3;
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+
+// from ./core/rpc/public.h
+namespace NRpc {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int TransportError = 100;
+ constexpr int ProtocolError = 101;
+ constexpr int NoSuchService = 102;
+ constexpr int NoSuchMethod = 103;
+ constexpr int Unavailable = 105;
+ constexpr int PoisonPill = 106;
+ constexpr int RequestQueueSizeLimitExceeded = 108;
+ constexpr int AuthenticationError = 109;
+ constexpr int InvalidCsrfToken = 110;
+ constexpr int InvalidCredentials = 111;
+ constexpr int StreamingNotSupported = 112;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NRpc
+
+
+
+// from ./core/bus/public.h
+namespace NBus {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int TransportError = 100;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NBus
+
+
+
+// from ./client/scheduler/public.h
+namespace NScheduler {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int NoSuchOperation = 200;
+ constexpr int InvalidOperationState = 201;
+ constexpr int TooManyOperations = 202;
+ constexpr int NoSuchJob = 203;
+ constexpr int OperationFailedOnJobRestart = 210;
+ constexpr int OperationFailedWithInconsistentLocking = 211;
+ constexpr int OperationControllerCrashed = 212;
+ constexpr int TestingError = 213;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NScheduler
+
+
+
+// from ./client/table_client/public.h
+namespace NTableClient {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int SortOrderViolation = 301;
+ constexpr int InvalidDoubleValue = 302;
+ constexpr int IncomparableType = 303;
+ constexpr int UnhashableType = 304;
+ // E.g. name table with more than #MaxColumnId columns (may come from legacy chunks).
+ constexpr int CorruptedNameTable = 305;
+ constexpr int UniqueKeyViolation = 306;
+ constexpr int SchemaViolation = 307;
+ constexpr int RowWeightLimitExceeded = 308;
+ constexpr int InvalidColumnFilter = 309;
+ constexpr int InvalidColumnRenaming = 310;
+ constexpr int IncompatibleKeyColumns = 311;
+ constexpr int ReaderDeadlineExpired = 312;
+ constexpr int TimestampOutOfRange = 313;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NTableClient
+
+
+
+// from ./client/cypress_client/public.h
+namespace NCypressClient {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int SameTransactionLockConflict = 400;
+ constexpr int DescendantTransactionLockConflict = 401;
+ constexpr int ConcurrentTransactionLockConflict = 402;
+ constexpr int PendingLockConflict = 403;
+ constexpr int LockDestroyed = 404;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NCypressClient
+
+
+
+// from ./core/ytree/public.h
+namespace NYTree {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int ResolveError = 500;
+ constexpr int AlreadyExists = 501;
+ constexpr int MaxChildCountViolation = 502;
+ constexpr int MaxStringLengthViolation = 503;
+ constexpr int MaxAttributeSizeViolation = 504;
+ constexpr int MaxKeyLengthViolation = 505;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYTree
+
+
+
+// from ./client/hydra/public.h
+namespace NHydra {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int NoSuchSnapshot = 600;
+ constexpr int NoSuchChangelog = 601;
+ constexpr int InvalidEpoch = 602;
+ constexpr int InvalidVersion = 603;
+ constexpr int OutOfOrderMutations = 609;
+ constexpr int InvalidSnapshotVersion = 610;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NHydra
+
+
+
+// from ./client/chunk_client/public.h
+namespace NChunkClient {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int AllTargetNodesFailed = 700;
+ constexpr int SendBlocksFailed = 701;
+ constexpr int NoSuchSession = 702;
+ constexpr int SessionAlreadyExists = 703;
+ constexpr int ChunkAlreadyExists = 704;
+ constexpr int WindowError = 705;
+ constexpr int BlockContentMismatch = 706;
+ constexpr int NoSuchBlock = 707;
+ constexpr int NoSuchChunk = 708;
+ constexpr int NoLocationAvailable = 710;
+ constexpr int IOError = 711;
+ constexpr int MasterCommunicationFailed = 712;
+ constexpr int NoSuchChunkTree = 713;
+ constexpr int MasterNotConnected = 714;
+ constexpr int ChunkUnavailable = 716;
+ constexpr int NoSuchChunkList = 717;
+ constexpr int WriteThrottlingActive = 718;
+ constexpr int NoSuchMedium = 719;
+ constexpr int OptimisticLockFailure = 720;
+ constexpr int InvalidBlockChecksum = 721;
+ constexpr int BlockOutOfRange = 722;
+ constexpr int ObjectNotReplicated = 723;
+ constexpr int MissingExtension = 724;
+ constexpr int BandwidthThrottlingFailed = 725;
+ constexpr int ReaderTimeout = 726;
+ constexpr int NoSuchChunkView = 727;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NChunkClient
+
+
+
+// from ./client/election/public.h
+namespace NElection {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int InvalidState = 800;
+ constexpr int InvalidLeader = 801;
+ constexpr int InvalidEpoch = 802;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NElection
+
+
+
+// from ./client/security_client/public.h
+namespace NSecurityClient {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int AuthenticationError = 900;
+ constexpr int AuthorizationError = 901;
+ constexpr int AccountLimitExceeded = 902;
+ constexpr int UserBanned = 903;
+ constexpr int RequestQueueSizeLimitExceeded = 904;
+ constexpr int NoSuchAccount = 905;
+ constexpr int SafeModeEnabled = 906;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NSecurityClient
+
+
+
+// from ./client/object_client/public.h
+namespace NObjectClient {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int PrerequisiteCheckFailed = 1000;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NObjectClient
+
+
+
+// from ./server/lib/exec_agent/public.h
+namespace NExecAgent {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int ConfigCreationFailed = 1100;
+ constexpr int AbortByScheduler = 1101;
+ constexpr int ResourceOverdraft = 1102;
+ constexpr int WaitingJobTimeout = 1103;
+ constexpr int SlotNotFound = 1104;
+ constexpr int JobEnvironmentDisabled = 1105;
+ constexpr int JobProxyConnectionFailed = 1106;
+ constexpr int ArtifactCopyingFailed = 1107;
+ constexpr int NodeDirectoryPreparationFailed = 1108;
+ constexpr int SlotLocationDisabled = 1109;
+ constexpr int QuotaSettingFailed = 1110;
+ constexpr int RootVolumePreparationFailed = 1111;
+ constexpr int NotEnoughDiskSpace = 1112;
+ constexpr int ArtifactDownloadFailed = 1113;
+ constexpr int JobProxyPreparationTimeout = 1114;
+ constexpr int JobPreparationTimeout = 1115;
+ constexpr int JobProxyFailed = 1120;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NExecAgent
+
+
+
+// from ./ytlib/job_proxy/public.h
+namespace NJobProxy {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int MemoryLimitExceeded = 1200;
+ constexpr int MemoryCheckFailed = 1201;
+ constexpr int JobTimeLimitExceeded = 1202;
+ constexpr int UnsupportedJobType = 1203;
+ constexpr int JobNotPrepared = 1204;
+ constexpr int UserJobFailed = 1205;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NJobProxy
+
+
+
+// from ./server/node/data_node/public.h
+namespace NDataNode {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int LocalChunkReaderFailed = 1300;
+ constexpr int LayerUnpackingFailed = 1301;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NDataNode
+
+
+
+// from ./core/net/public.h
+namespace NNet {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int Aborted = 1500;
+ constexpr int ResolveTimedOut = 1501;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NNet
+
+
+
+// from ./client/node_tracker_client/public.h
+namespace NNodeTrackerClient {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int NoSuchNode = 1600;
+ constexpr int InvalidState = 1601;
+ constexpr int NoSuchNetwork = 1602;
+ constexpr int NoSuchRack = 1603;
+ constexpr int NoSuchDataCenter = 1604;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NNodeTrackerClient
+
+
+
+// from ./client/tablet_client/public.h
+namespace NTabletClient {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int TransactionLockConflict = 1700;
+ constexpr int NoSuchTablet = 1701;
+ constexpr int TabletNotMounted = 1702;
+ constexpr int AllWritesDisabled = 1703;
+ constexpr int InvalidMountRevision = 1704;
+ constexpr int TableReplicaAlreadyExists = 1705;
+ constexpr int InvalidTabletState = 1706;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NTabletClient
+
+
+
+// from ./server/lib/shell/public.h
+namespace NShell {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int ShellExited = 1800;
+ constexpr int ShellManagerShutDown = 1801;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NShell
+
+
+
+// from ./client/api/public.h
+namespace NApi {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int TooManyConcurrentRequests = 1900;
+ constexpr int JobArchiveUnavailable = 1910;
+ constexpr int RetriableArchiveError = 1911;
+ constexpr int NoSuchOperation = 1915;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NApi
+
+
+
+// from ./server/controller_agent/chunk_pools/public.h
+namespace NChunkPools {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int DataSliceLimitExceeded = 2000;
+ constexpr int MaxDataWeightPerJobExceeded = 2001;
+ constexpr int MaxPrimaryDataWeightPerJobExceeded = 2002;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NChunkPools
+
+
+
+// from ./client/api/rpc_proxy/public.h
+namespace NApi {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int ProxyBanned = 2100;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NApi
+
+
+
+// from ./ytlib/controller_agent/public.h
+namespace NControllerAgent {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int AgentCallFailed = 4400;
+ constexpr int NoOnlineNodeToScheduleJob = 4410;
+ constexpr int MaterializationFailed = 4415;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NControllerAgent
+
+
+
+// from ./client/transaction_client/public.h
+namespace NTransactionClient {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int NoSuchTransaction = 11000;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NTransactionClient
+
+
+
+// from ./server/lib/containers/public.h
+namespace NContainers {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int FailedToStartContainer = 13000;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NContainers
+
+
+
+// from ./ytlib/job_prober_client/public.h
+namespace NJobProberClient {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ constexpr int JobIsNotRunning = 17000;
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NJobProberClient
+
+} // namespace NClusterErrorCodes
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/error_ut.cpp b/yt/cpp/mapreduce/interface/error_ut.cpp
new file mode 100644
index 0000000000..03f2751b23
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/error_ut.cpp
@@ -0,0 +1,81 @@
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <library/cpp/json/json_reader.h>
+
+#include <yt/cpp/mapreduce/interface/errors.h>
+#include <yt/cpp/mapreduce/common/helpers.h>
+
+using namespace NYT;
+
+template<>
+void Out<NYT::TNode>(IOutputStream& s, const NYT::TNode& node) // stream-output specialization for TNode used by this test file
+{
+ s << "TNode:" << NodeToYsonString(node); // the node is printed as its YSON string prefixed with "TNode:"
+}
+
+Y_UNIT_TEST_SUITE(ErrorSuite)
+{
+ Y_UNIT_TEST(TestParseJson)
+ {
+ // Scary real-world error! Boo!
+ const char* jsonText =
+ R"""({)"""
+ R"""("code":500,)"""
+ R"""("message":"Error resolving path //home/user/link",)"""
+ R"""("attributes":{)"""
+ R"""("fid":18446484571700269066,)"""
+ R"""("method":"Create",)"""
+ R"""("tid":17558639495721339338,)"""
+ R"""("datetime":"2017-04-07T13:38:56.474819Z",)"""
+ R"""("pid":414529,)"""
+ R"""("host":"build01-01g.yt.yandex.net"},)"""
+ R"""("inner_errors":[{)"""
+ R"""("code":1,)"""
+ R"""("message":"Node //tt cannot have children",)"""
+ R"""("attributes":{)"""
+ R"""("fid":18446484571700269066,)"""
+ R"""("tid":17558639495721339338,)"""
+ R"""("datetime":"2017-04-07T13:38:56.474725Z",)"""
+ R"""("pid":414529,)"""
+ R"""("host":"build01-01g.yt.yandex.net"},)"""
+ R"""("inner_errors":[]}]})""";
+
+ NJson::TJsonValue jsonValue;
+ ReadJsonFastTree(jsonText, &jsonValue, /*throwOnError=*/ true);
+
+ TYtError error(jsonValue); // exercises the TYtError(const TJsonValue&) constructor
+ UNIT_ASSERT_VALUES_EQUAL(error.GetCode(), 500);
+ UNIT_ASSERT_VALUES_EQUAL(error.GetMessage(), R"""(Error resolving path //home/user/link)""");
+ UNIT_ASSERT_VALUES_EQUAL(error.InnerErrors().size(), 1);
+ UNIT_ASSERT_VALUES_EQUAL(error.InnerErrors()[0].GetCode(), 1);
+
+ UNIT_ASSERT_VALUES_EQUAL(error.HasAttributes(), true);
+ UNIT_ASSERT_VALUES_EQUAL(error.GetAttributes().at("method"), TNode("Create"));
+
+ UNIT_ASSERT_VALUES_EQUAL(error.GetAllErrorCodes(), TSet<int>({500, 1})); // codes of the outer and of the inner error
+ }
+
+ Y_UNIT_TEST(TestGetYsonText) {
+ const char* jsonText =
+ R"""({)"""
+ R"""("code":500,)"""
+ R"""("message":"outer error",)"""
+ R"""("attributes":{)"""
+ R"""("method":"Create",)"""
+ R"""("pid":414529},)"""
+ R"""("inner_errors":[{)"""
+ R"""("code":1,)"""
+ R"""("message":"inner error",)"""
+ R"""("attributes":{},)"""
+ R"""("inner_errors":[])"""
+ R"""(}]})""";
+ TYtError error;
+ error.ParseFrom(jsonText);
+ TString ysonText = error.GetYsonText();
+ TYtError error2(NodeFromYsonString(ysonText)); // re-parse the serialized YSON to check the round trip
+ UNIT_ASSERT_EQUAL( // note: the empty "attributes" and "inner_errors" of the inner error are absent from the expected text
+ ysonText,
+ R"""({"code"=500;"message"="outer error";"attributes"={"method"="Create";"pid"=414529};"inner_errors"=[{"code"=1;"message"="inner error"}]})""");
+ UNIT_ASSERT_EQUAL(error2.GetYsonText(), ysonText);
+ }
+}
diff --git a/yt/cpp/mapreduce/interface/errors.cpp b/yt/cpp/mapreduce/interface/errors.cpp
new file mode 100644
index 0000000000..49a7c7cfc1
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/errors.cpp
@@ -0,0 +1,437 @@
+#include "errors.h"
+
+#include <library/cpp/yson/node/node_io.h>
+#include <library/cpp/yson/node/node_visitor.h>
+
+#include <yt/cpp/mapreduce/interface/error_codes.h>
+
+#include <library/cpp/json/json_reader.h>
+#include <library/cpp/yson/writer.h>
+
+#include <util/string/builder.h>
+#include <util/stream/str.h>
+#include <util/generic/set.h>
+
+namespace NYT {
+
+using namespace NJson;
+
+////////////////////////////////////////////////////////////////////
+
+static void WriteErrorDescription(const TYtError& error, IOutputStream* out)
+{
+ (*out) << '\'' << error.GetMessage() << '\'';
+ const auto& innerErrorList = error.InnerErrors();
+ if (!innerErrorList.empty()) {
+ (*out) << " { ";
+ bool first = true;
+ for (const auto& innerError : innerErrorList) {
+ if (first) {
+ first = false;
+ } else {
+ (*out) << " ; ";
+ }
+ WriteErrorDescription(innerError, out);
+ }
+ (*out) << " }";
+ }
+}
+
+static void SerializeError(const TYtError& error, NYson::IYsonConsumer* consumer)
+{
+ consumer->OnBeginMap();
+ {
+ consumer->OnKeyedItem("code");
+ consumer->OnInt64Scalar(error.GetCode());
+
+ consumer->OnKeyedItem("message");
+ consumer->OnStringScalar(error.GetMessage());
+
+ if (!error.GetAttributes().empty()) {
+ consumer->OnKeyedItem("attributes");
+ consumer->OnBeginMap();
+ {
+ for (const auto& item : error.GetAttributes()) {
+ consumer->OnKeyedItem(item.first);
+ TNodeVisitor(consumer).Visit(item.second);
+ }
+ }
+ consumer->OnEndMap();
+ }
+
+ if (!error.InnerErrors().empty()) {
+ consumer->OnKeyedItem("inner_errors");
+ {
+ consumer->OnBeginList();
+ for (const auto& innerError : error.InnerErrors()) {
+ SerializeError(innerError, consumer);
+ }
+ consumer->OnEndList();
+ }
+ }
+ }
+ consumer->OnEndMap();
+}
+
+// Builds a text report about failed jobs for embedding into an exception message.
+//
+// For every job the report contains the operation id, the job id, the full
+// error description and (when present) the job's stderr. The accumulated size
+// is capped by MAX_SIZE: when a job's stderr does not fit, only its tail is
+// kept, and jobs are no longer added once the cap is reached.
+static TString DumpJobInfoForException(const TOperationId& operationId, const TVector<TFailedJobInfo>& failedJobInfoList)
+{
+    // Exceptions have limit to contain 65508 bytes of text, so we also limit stderr text
+    constexpr size_t MAX_SIZE = 65508 / 2;
+
+    ::TStringBuilder report;
+    size_t reportSize = 0;
+    for (const auto& job : failedJobInfoList) {
+        if (reportSize >= MAX_SIZE) {
+            break;
+        }
+
+        TStringStream section;
+        section << '\n';
+        section << "OperationId: " << GetGuidAsString(operationId) << " JobId: " << GetGuidAsString(job.JobId) << '\n';
+        section << "Error: " << job.Error.FullDescription() << '\n';
+
+        const auto& stderrText = job.Stderr;
+        if (!stderrText.empty()) {
+            section << "Stderr: " << Endl;
+            const size_t sizeWithHeader = reportSize + section.Str().size();
+            if (sizeWithHeader >= MAX_SIZE) {
+                // Not even the header fits: drop this job and stop.
+                break;
+            }
+
+            const size_t room = MAX_SIZE - sizeWithHeader;
+            if (stderrText.size() > room) {
+                // Keep only the last `room` bytes of stderr.
+                section << stderrText.substr(stderrText.size() - room);
+            } else {
+                section << stderrText;
+            }
+        }
+
+        reportSize += section.Str().size();
+        report << section.Str();
+    }
+    return report;
+}
+
+////////////////////////////////////////////////////////////////////
+
+TYtError::TYtError()
+ : Code_(0)
+{ }
+
+TYtError::TYtError(const TString& message)
+ : Code_(NYT::NClusterErrorCodes::Generic)
+ , Message_(message)
+{ }
+
+TYtError::TYtError(int code, const TString& message)
+ : Code_(code)
+ , Message_(message)
+{ }
+
+TYtError::TYtError(const TJsonValue& value)
+{
+ const TJsonValue::TMapType& map = value.GetMap();
+ TJsonValue::TMapType::const_iterator it = map.find("message");
+ if (it != map.end()) {
+ Message_ = it->second.GetString();
+ }
+
+ it = map.find("code");
+ if (it != map.end()) {
+ Code_ = static_cast<int>(it->second.GetInteger());
+ } else {
+ Code_ = NYT::NClusterErrorCodes::Generic;
+ }
+
+ it = map.find("inner_errors");
+ if (it != map.end()) {
+ const TJsonValue::TArray& innerErrors = it->second.GetArray();
+ for (const auto& innerError : innerErrors) {
+ InnerErrors_.push_back(TYtError(innerError));
+ }
+ }
+
+ it = map.find("attributes");
+ if (it != map.end()) {
+ auto attributes = NYT::NodeFromJsonValue(it->second);
+ if (attributes.IsMap()) {
+ Attributes_ = std::move(attributes.AsMap());
+ }
+ }
+}
+
+TYtError::TYtError(const TNode& node)
+{
+ const auto& map = node.AsMap();
+ auto it = map.find("message");
+ if (it != map.end()) {
+ Message_ = it->second.AsString();
+ }
+
+ it = map.find("code");
+ if (it != map.end()) {
+ Code_ = static_cast<int>(it->second.AsInt64());
+ } else {
+ Code_ = NYT::NClusterErrorCodes::Generic;
+ }
+
+ it = map.find("inner_errors");
+ if (it != map.end()) {
+ const auto& innerErrors = it->second.AsList();
+ for (const auto& innerError : innerErrors) {
+ InnerErrors_.push_back(TYtError(innerError));
+ }
+ }
+
+ it = map.find("attributes");
+ if (it != map.end()) {
+ auto& attributes = it->second;
+ if (attributes.IsMap()) {
+ Attributes_ = std::move(attributes.AsMap());
+ }
+ }
+}
+
+int TYtError::GetCode() const // returns the code of this error itself (inner errors are not consulted)
+{
+ return Code_;
+}
+
+const TString& TYtError::GetMessage() const // returns the message of this error itself
+{
+ return Message_;
+}
+
+const TVector<TYtError>& TYtError::InnerErrors() const // returns the direct inner (nested) errors
+{
+ return InnerErrors_;
+}
+
+void TYtError::ParseFrom(const TString& jsonError)
+{
+ TJsonValue value;
+ TStringInput input(jsonError);
+ ReadJsonTree(&input, &value);
+ *this = TYtError(value);
+}
+
+TSet<int> TYtError::GetAllErrorCodes() const
+{
+ TDeque<const TYtError*> queue = {this};
+ TSet<int> result;
+ while (!queue.empty()) {
+ const auto* current = queue.front();
+ queue.pop_front();
+ result.insert(current->Code_);
+ for (const auto& error : current->InnerErrors_) {
+ queue.push_back(&error);
+ }
+ }
+ return result;
+}
+
+bool TYtError::ContainsErrorCode(int code) const
+{
+ if (Code_ == code) {
+ return true;
+ }
+ for (const auto& error : InnerErrors_) {
+ if (error.ContainsErrorCode(code)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
+bool TYtError::ContainsText(const TStringBuf& text) const
+{
+ if (Message_.Contains(text)) {
+ return true;
+ }
+ for (const auto& error : InnerErrors_) {
+ if (error.ContainsText(text)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool TYtError::HasAttributes() const // true when at least one attribute is stored for this error
+{
+ return !Attributes_.empty();
+}
+
+const TNode::TMapType& TYtError::GetAttributes() const // returns the attribute map of this error (may be empty)
+{
+ return Attributes_;
+}
+
+TString TYtError::GetYsonText() const
+{
+ TStringStream out;
+ ::NYson::TYsonWriter writer(&out, NYson::EYsonFormat::Text);
+ SerializeError(*this, &writer);
+ return std::move(out.Str());
+}
+
+TString TYtError::ShortDescription() const
+{
+ TStringStream out;
+ WriteErrorDescription(*this, &out);
+ return std::move(out.Str());
+}
+
+TString TYtError::FullDescription() const
+{
+ TStringStream s;
+ WriteErrorDescription(*this, &s);
+ s << "; full error: " << GetYsonText();
+ return s.Str();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+TErrorResponse::TErrorResponse(int httpCode, const TString& requestId)
+ : HttpCode_(httpCode)
+ , RequestId_(requestId)
+{ }
+
+bool TErrorResponse::IsOk() const
+{
+ return Error_.GetCode() == 0;
+}
+
+// Wraps a plain text message into a TYtError and refreshes the exception text.
+void TErrorResponse::SetRawError(const TString& message)
+{
+    Error_ = TYtError(message);
+    Setup();
+}
+
+void TErrorResponse::SetError(TYtError error)
+{
+ Error_ = std::move(error);
+ Setup();
+}
+
+void TErrorResponse::ParseFromJsonError(const TString& jsonError)
+{
+ Error_.ParseFrom(jsonError);
+ Setup();
+}
+
+void TErrorResponse::SetIsFromTrailers(bool isFromTrailers)
+{
+ IsFromTrailers_ = isFromTrailers;
+}
+
+// HTTP code passed to the constructor.
+int TErrorResponse::GetHttpCode() const
+{
+    return HttpCode_;
+}
+
+// Flag last stored by SetIsFromTrailers (false by default).
+bool TErrorResponse::IsFromTrailers() const
+{
+    return IsFromTrailers_;
+}
+
+bool TErrorResponse::IsTransportError() const
+{
+ return HttpCode_ == 503;
+}
+
+// Request id passed to the constructor (returned by value).
+TString TErrorResponse::GetRequestId() const
+{
+    return RequestId_;
+}
+
+const TYtError& TErrorResponse::GetError() const
+{
+ return Error_;
+}
+
+bool TErrorResponse::IsResolveError() const
+{
+ return Error_.ContainsErrorCode(NClusterErrorCodes::NYTree::ResolveError);
+}
+
+bool TErrorResponse::IsAccessDenied() const
+{
+ return Error_.ContainsErrorCode(NClusterErrorCodes::NSecurityClient::AuthorizationError);
+}
+
+bool TErrorResponse::IsConcurrentTransactionLockConflict() const
+{
+ return Error_.ContainsErrorCode(NClusterErrorCodes::NCypressClient::ConcurrentTransactionLockConflict);
+}
+
+bool TErrorResponse::IsRequestRateLimitExceeded() const
+{
+ return Error_.ContainsErrorCode(NClusterErrorCodes::NSecurityClient::RequestQueueSizeLimitExceeded);
+}
+
+bool TErrorResponse::IsRequestQueueSizeLimitExceeded() const
+{
+ return Error_.ContainsErrorCode(NClusterErrorCodes::NRpc::RequestQueueSizeLimitExceeded);
+}
+
+bool TErrorResponse::IsChunkUnavailable() const
+{
+ return Error_.ContainsErrorCode(NClusterErrorCodes::NChunkClient::ChunkUnavailable);
+}
+
+bool TErrorResponse::IsRequestTimedOut() const
+{
+ return Error_.ContainsErrorCode(NClusterErrorCodes::Timeout);
+}
+
+bool TErrorResponse::IsNoSuchTransaction() const
+{
+ return Error_.ContainsErrorCode(NClusterErrorCodes::NTransactionClient::NoSuchTransaction);
+}
+
+bool TErrorResponse::IsConcurrentOperationsLimitReached() const
+{
+ return Error_.ContainsErrorCode(NClusterErrorCodes::NScheduler::TooManyOperations);
+}
+
+void TErrorResponse::Setup()
+{
+ TStringStream s;
+ *this << Error_.FullDescription();
+}
+
+////////////////////////////////////////////////////////////////////
+
+TOperationFailedError::TOperationFailedError(
+ EState state,
+ TOperationId id,
+ TYtError ytError,
+ TVector<TFailedJobInfo> failedJobInfo)
+ : State_(state)
+ , OperationId_(id)
+ , Error_(std::move(ytError))
+ , FailedJobInfo_(std::move(failedJobInfo))
+{
+ *this << Error_.FullDescription();
+ if (!FailedJobInfo_.empty()) {
+ *this << DumpJobInfoForException(OperationId_, FailedJobInfo_);
+ }
+}
+
+// Final state passed to the constructor.
+TOperationFailedError::EState TOperationFailedError::GetState() const
+{
+    return State_;
+}
+
+// Operation id passed to the constructor (returned by value).
+TOperationId TOperationFailedError::GetOperationId() const
+{
+    return OperationId_;
+}
+
+const TYtError& TOperationFailedError::GetError() const
+{
+ return Error_;
+}
+
+// Failed-job details passed to the constructor; may be empty.
+const TVector<TFailedJobInfo>& TOperationFailedError::GetFailedJobInfo() const
+{
+    return FailedJobInfo_;
+}
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/errors.h b/yt/cpp/mapreduce/interface/errors.h
new file mode 100644
index 0000000000..afad58ed72
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/errors.h
@@ -0,0 +1,290 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/errors.h
+///
+/// Errors and exceptions emitted by library.
+
+#include "fwd.h"
+#include "common.h"
+
+#include <library/cpp/yson/node/node.h>
+
+#include <util/generic/bt_exception.h>
+#include <util/generic/yexception.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+
+namespace NJson {
+ class TJsonValue;
+} // namespace NJson
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Error that is thrown when the library detects invalid usage of its API.
+///
+/// For example, an attempt to start an operation on an empty table list. The exception carries a backtrace (it derives from TWithBackTrace).
+class TApiUsageError
+ : public TWithBackTrace<yexception>
+{ };
+
+///
+/// @brief Error that is thrown when request retries continue for too long.
+///
+/// @see NYT::TRetryConfig
+/// @see NYT::IRetryConfigProvider
+class TRequestRetriesTimeout
+ : public yexception
+{ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Error returned by YT cluster.
+///
+/// An object of this class describes an error that happened on a YT server.
+/// Internally each error is a tree. Each node of the tree contains:
+/// - integer error code;
+/// - text description of error;
+/// - attributes describing error context.
+///
+/// To get text description of an error one should use
+/// @ref NYT::TYtError::ShortDescription or @ref NYT::TYtError::FullDescription
+///
+/// To distinguish between error kinds @ref NYT::TYtError::ContainsErrorCode should be used.
+///
+/// @see NYT::TErrorResponse
+/// @see NYT::TOperationFailedError
+class TYtError
+{
+public:
+ /// Constructs error with NYT::NClusterErrorCodes::OK code and empty message.
+ TYtError();
+
+ /// Constructs error with NYT::NClusterErrorCodes::Generic code and given message.
+ explicit TYtError(const TString& message);
+
+ /// Constructs error with given code and given message.
+ TYtError(int code, const TString& message);
+
+ /// Constructs error from json representation (implicit conversion is allowed).
+ TYtError(const ::NJson::TJsonValue& value);
+
+ /// Constructs error from TNode representation (implicit conversion is allowed).
+ TYtError(const TNode& value);
+
+ ///
+ /// @brief Check if error or any of inner errors has given error code.
+ ///
+ /// Use this method to distinguish kind of error.
+ bool ContainsErrorCode(int code) const;
+
+ ///
+ /// @brief Get short description of error.
+ ///
+ /// Short description contains text description of error and all inner errors.
+ /// It is human readable but lacks some important information (error codes, error attributes).
+ ///
+ /// Usually it's better to use @ref NYT::TYtError::FullDescription to log errors.
+ TString ShortDescription() const;
+
+ ///
+ /// @brief Get full description of error.
+ ///
+ /// Full description contains readable short description
+ /// followed by text yson representation of error that contains error codes and attributes.
+ TString FullDescription() const;
+
+ ///
+ /// @brief Get error code of the topmost error.
+ ///
+ /// @warning Do not use this method to distinguish between error kinds
+ /// @ref NYT::TYtError::ContainsErrorCode should be used instead.
+ int GetCode() const;
+
+ ///
+ /// @brief Get error text of the topmost error.
+ ///
+ /// @warning This method should not be used to log errors
+ /// since text description of inner errors is going to be lost.
+ /// @ref NYT::TYtError::FullDescription should be used instead.
+ const TString& GetMessage() const;
+
+ ///
+ /// @brief Check if error or any of inner errors contains given text chunk.
+ ///
+ /// @warning @ref NYT::TYtError::ContainsErrorCode must be used instead of
+ /// this method when possible. If there is no suitable error code it's
+ /// better to ask yt@ to add one. This method should only be used as workaround.
+ bool ContainsText(const TStringBuf& text) const;
+
+ /// @brief Get inner errors (direct children of the topmost error).
+ const TVector<TYtError>& InnerErrors() const;
+
+ /// Parse error from json string, replacing the current contents of this object.
+ void ParseFrom(const TString& jsonError);
+
+ /// Collect error codes from entire error tree (topmost error included).
+ TSet<int> GetAllErrorCodes() const;
+
+ /// Check if the topmost error has any attributes.
+ bool HasAttributes() const;
+
+ /// Get attributes of the topmost error.
+ const TNode::TMapType& GetAttributes() const;
+
+ /// Get text yson representation of error.
+ TString GetYsonText() const;
+
+private:
+ int Code_; // Code of the topmost error.
+ TString Message_; // Message of the topmost error.
+ TVector<TYtError> InnerErrors_; // Child errors; each one is itself a tree.
+ TNode::TMapType Attributes_; // Attributes of the topmost error.
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Generic error response returned by server.
+///
+/// TErrorResponse can be thrown from almost any client method when server responds with error.
+///
+class TErrorResponse
+ : public yexception
+{
+public:
+ TErrorResponse(int httpCode, const TString& requestId);
+ TErrorResponse(int httpCode, TYtError error); // NOTE(review): no definition is visible next to the other TErrorResponse methods in errors.cpp - confirm it is defined.
+
+ /// Get error object returned by server.
+ const TYtError& GetError() const;
+
+ /// Get id (correlation-id) of request that was responded with error.
+ TString GetRequestId() const;
+
+ /// Get HTTP code of response.
+ int GetHttpCode() const;
+
+ /// Is error parsed from response trailers.
+ bool IsFromTrailers() const;
+
+ /// Check if error was caused by transport problems inside YT cluster (implemented as a check for HTTP code 503).
+ bool IsTransportError() const;
+
+ /// Check if error was caused by failure to resolve cypress path.
+ bool IsResolveError() const;
+
+ /// Check if error was caused by lack of permissions to execute request.
+ bool IsAccessDenied() const;
+
+ /// Check if error was caused by failure to lock object because another transaction is holding the lock.
+ bool IsConcurrentTransactionLockConflict() const;
+
+ /// Check if error was caused by request quota limit exceeding.
+ bool IsRequestRateLimitExceeded() const;
+
+ /// Check if YT can't serve request because it is overloaded.
+ bool IsRequestQueueSizeLimitExceeded() const;
+
+ /// Check if error was caused by failure to get chunk. Such errors are almost always temporary.
+ bool IsChunkUnavailable() const;
+
+ /// Check if error was caused by internal YT timeout.
+ bool IsRequestTimedOut() const;
+
+ /// Check if error was caused by trying to work with transaction that was finished or never existed.
+ bool IsNoSuchTransaction() const;
+
+ /// Check if user reached their limit of concurrently running operations.
+ bool IsConcurrentOperationsLimitReached() const;
+
+ /// @deprecated This method must not be used.
+ bool IsOk() const;
+
+ void SetRawError(const TString& message); // Stores TYtError(message), then calls Setup().
+ void SetError(TYtError error); // Stores the given error, then calls Setup().
+ void ParseFromJsonError(const TString& jsonError); // Parses the stored error from json text, then calls Setup().
+ void SetIsFromTrailers(bool isFromTrailers); // Only records the flag returned by IsFromTrailers().
+
+private:
+ void Setup(); // Streams Error_.FullDescription() into the exception message.
+
+private:
+ int HttpCode_;
+ TString RequestId_;
+ TYtError Error_;
+ bool IsFromTrailers_ = false;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Info about a single failed job.
+///
+/// @see NYT::TOperationFailedError
+struct TFailedJobInfo
+{
+ /// Id of the job.
+ TJobId JobId;
+
+ /// Error describing job failure.
+ TYtError Error;
+
+ /// Stderr of the job.
+ ///
+ /// @note YT doesn't store all job stderrs, check @ref NYT::IOperationClient::GetJobStderr
+ /// for list of limitations.
+ ///
+ /// @see NYT::IOperationClient::GetJobStderr
+ TString Stderr;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Error that is thrown when operation watched by library fails.
+///
+/// This error is thrown from operation starting methods when they are started in sync mode (@ref NYT::TOperationOptions::Wait == true)
+/// or from future returned by NYT::IOperation::Watch.
+///
+/// @see NYT::IOperationClient
+class TOperationFailedError
+ : public yexception
+{
+public:
+ /// Final state of operation.
+ enum EState {
+ /// Operation failed due to some error.
+ Failed,
+ /// Operation didn't experience errors, but was aborted by user request or by YT.
+ Aborted,
+ };
+
+public:
+ TOperationFailedError(EState state, TOperationId id, TYtError ytError, TVector<TFailedJobInfo> failedJobInfo);
+
+ /// Get final state of operation.
+ EState GetState() const;
+
+ /// Get operation id.
+ TOperationId GetOperationId() const;
+
+ /// Return operation error.
+ const TYtError& GetError() const;
+
+ /// Return info about failed jobs (if any).
+ const TVector<TFailedJobInfo>& GetFailedJobInfo() const;
+
+private:
+ EState State_;
+ TOperationId OperationId_;
+ TYtError Error_;
+ TVector<TFailedJobInfo> FailedJobInfo_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/finish_or_die.h b/yt/cpp/mapreduce/interface/finish_or_die.h
new file mode 100644
index 0000000000..9d7dcece02
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/finish_or_die.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <util/system/yassert.h>
+
+#include <exception>
+
+/// @cond Doxygen_Suppress
+namespace NYT::NDetail {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Calls `pThis->Finish()` and never lets an exception escape.
+///
+/// If `Finish()` throws while no other exception is in flight, the failure is
+/// reported through Y_FAIL together with `className`; if another exception is
+/// already propagating, the new one is swallowed silently.
+template <typename T>
+void FinishOrDie(T* pThis, const char* className) noexcept
+{
+    const auto reportFailure = [className] (const char* reason) {
+        Y_FAIL(
+            "\n\n"
+            "Destructor of %s caught exception during Finish: %s.\n"
+            "Some data is probably has not been written.\n"
+            "In order to handle such exceptions consider explicitly call Finish() method.\n",
+            className,
+            reason);
+    };
+
+    try {
+        pThis->Finish();
+    } catch (const std::exception& error) {
+        if (std::uncaught_exceptions() == 0) {
+            reportFailure(error.what());
+        }
+    } catch (...) {
+        if (std::uncaught_exceptions() == 0) {
+            reportFailure("<unknown exception>");
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT::NDetail
+/// @endcond
diff --git a/yt/cpp/mapreduce/interface/fluent.h b/yt/cpp/mapreduce/interface/fluent.h
new file mode 100644
index 0000000000..8ca6e86336
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/fluent.h
@@ -0,0 +1,678 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/fluent.h
+///
+/// Adapters for working with @ref NYson::IYsonConsumer in a structured way, with compile-time syntax checks.
+///
+/// The following documentation is copied verbatim from `yt/core/ytree/fluent.h`.
+///
+/// WHAT IS THIS
+///
+/// Fluent adapters encapsulate invocation of IYsonConsumer methods in a
+/// convenient structured manner. Key advantage of fluent-like code is that
+/// attempt of building syntactically incorrect YSON structure will result
+/// in a compile-time error.
+///
+/// Each fluent object is associated with a context that defines possible YSON
+/// tokens that may appear next. For example, TFluentMap is a fluent object
+/// that corresponds to a location within YSON map right before a key-value
+/// pair or the end of the map.
+///
+/// More precisely, each object that may be obtained by a sequence of fluent
+/// method calls has the full history of its enclosing YSON composite types in
+/// its single template argument hereinafter referred to as TParent. This allows
+/// us not to forget the original context after opening and closing the embedded
+/// composite structure.
+///
+/// It is possible to invoke a separate YSON building procedure by calling
+/// one of convenience Do* methods. There are two possibilities here: it is
+/// possible to delegate invocation context either as a fluent object (like
+/// TFluentMap, TFluentList, TFluentAttributes or TFluentAny) or as a raw
+/// IYsonConsumer*. The latter is discouraged since it is impossible to check
+/// if a given side-built YSON structure fits current fluent context.
+/// For example it is possible to call Do() method inside YSON map passing
+/// consumer to a procedure that will treat context like it is in a list.
+/// Passing typed fluent builder saves you from such a misbehaviour.
+///
+/// TFluentXxx corresponds to an internal class of TXxx
+/// without any history hidden in template argument. It allows you to
+/// write procedures of form:
+///
+/// void BuildSomeAttributesInYson(TFluentMap fluent) { ... }
+///
+/// without thinking about the exact way how this procedure is nested in other
+/// procedures.
+///
+/// An important notation: we will refer to a function whose first argument
+/// is TFluentXxx as TFuncXxx.
+///
+///
+/// BRIEF LIST OF AVAILABLE METHODS
+///
+/// Only the most popular methods are covered here. Refer to the code for the
+/// rest of them.
+///
+/// TAny:
+/// * Value(T value) -> TParent, serialize `value` using underlying consumer.
+/// T should be such that free function Serialize(const T&, NYson::IYsonConsumer*) is
+/// defined;
+/// * BeginMap() -> TFluentMap, open map;
+/// * BeginList() -> TFluentList, open list;
+/// * BeginAttributes() -> TFluentAttributes, open attributes;
+///
+/// * Do(TFuncAny func) -> TAny, delegate invocation to a separate procedure.
+/// * DoIf(bool condition, TFuncAny func) -> TAny, same as Do() but invoke
+/// `func` only if `condition` is true;
+/// * DoFor(TCollection collection, TFuncAny func) -> TAny, same as Do()
+/// but iterate over `collection` and pass each of its elements as a second
+/// argument to `func`. Instead of passing a collection it is possible
+/// to pass two iterators as arguments;
+///
+/// * DoMap(TFuncMap func) -> TAny, open a map, delegate invocation to a separate
+/// procedure and close map;
+/// * DoMapFor(TCollection collection, TFuncMap func) -> TAny, open a map, iterate
+/// over `collection` and pass each of its elements as a second argument to `func`
+/// and close map;
+/// * DoList(TFuncList func) -> TAny, same as DoMap();
+/// * DoListFor(TCollection collection, TFuncList func) -> TAny; same as DoMapFor().
+///
+///
+/// TFluentMap:
+/// * Item(TStringBuf key) -> TAny, open an element keyed with `key`;
+/// * EndMap() -> TParent, close map;
+/// * Do(TFuncMap func) -> TFluentMap, same as Do() for TAny;
+/// * DoIf(bool condition, TFuncMap func) -> TFluentMap, same as DoIf() for TAny;
+/// * DoFor(TCollection collection, TFuncMap func) -> TFluentMap, same as DoFor() for TAny.
+///
+///
+/// TFluentList:
+/// * Item() -> TAny, open a new list element;
+/// * EndList() -> TParent, close list;
+/// * Do(TFuncList func) -> TFluentList, same as Do() for TAny;
+/// * DoIf(bool condition, TFuncList func) -> TFluentList, same as DoIf() for TAny;
+/// * DoFor(TCollection collection, TFuncList func) -> TFluentList, same as DoFor() for TAny.
+///
+///
+/// TFluentAttributes:
+/// * Item(TStringBuf key) -> TAny, open an element keyed with `key`.
+/// * EndAttributes() -> TParentWithoutAttributes, close attributes. Note that
+/// this method leads to a context that is forced not to have attributes,
+/// preventing us from putting attributes twice before an object.
+/// * Do(TFuncAttributes func) -> TFluentAttributes, same as Do() for TAny;
+/// * DoIf(bool condition, TFuncAttributes func) -> TFluentAttributes, same as DoIf()
+/// for TAny;
+/// * DoFor(TCollection collection, TFuncAttributes func) -> TFluentAttributes, same as DoFor()
+/// for TAny.
+///
+
+
+#include "common.h"
+#include "serialize.h"
+
+#include <library/cpp/yson/node/serialize.h>
+#include <library/cpp/yson/node/node_builder.h>
+
+#include <library/cpp/yson/consumer.h>
+#include <library/cpp/yson/writer.h>
+
+#include <util/generic/noncopyable.h>
+#include <util/generic/ptr.h>
+#include <util/stream/str.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Maps the parent type of a fluent chain to the value handed back when the chain
+/// is finished. The primary template is the identity: the parent itself is returned.
+template <class T>
+struct TFluentYsonUnwrapper
+{
+    using TUnwrapped = T;
+
+    static TUnwrapped Unwrap(T parent)
+    {
+        return std::move(parent);
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct TFluentYsonVoid // Empty tag type; the TFluentYsonUnwrapper specialization for it unwraps to `void`.
+{ };
+
+/// Unwrapper for the "no parent" tag: finishing such a chain yields nothing (`void`).
+template <>
+struct TFluentYsonUnwrapper<TFluentYsonVoid>
+{
+    using TUnwrapped = void;
+
+    static TUnwrapped Unwrap(TFluentYsonVoid)
+    {
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This class is actually a namespace for specific fluent adapter classes.
+class TFluentYsonBuilder
+ : private TNonCopyable
+{
+private:
+ template <class T>
+ static void WriteValue(NYT::NYson::IYsonConsumer* consumer, const T& value)
+ {
+ Serialize(value, consumer); // Unqualified call: a free function Serialize(const T&, IYsonConsumer*) must be visible for T.
+ }
+
+public:
+ class TFluentAny;
+ template <class TParent> class TAny;
+ template <class TParent> class TToAttributes;
+ template <class TParent> class TAttributes;
+ template <class TParent> class TListType;
+ template <class TParent> class TMapType;
+
+ /// Base class for all fluent adapters: holds the consumer being driven and the enclosing (parent) fluent context.
+ template <class TParent>
+ class TFluentBase
+ {
+ public:
+ /// Implicit conversion to the underlying yson consumer.
+ operator NYT::NYson::IYsonConsumer* () const
+ {
+ return Consumer;
+ }
+
+ protected:
+ /// @cond Doxygen_Suppress
+ NYT::NYson::IYsonConsumer* Consumer;
+ TParent Parent;
+
+ TFluentBase(NYT::NYson::IYsonConsumer* consumer, TParent parent)
+ : Consumer(consumer)
+ , Parent(std::move(parent))
+ { }
+
+ using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped;
+
+ TUnwrappedParent GetUnwrappedParent()
+ {
+ return TFluentYsonUnwrapper<TParent>::Unwrap(std::move(Parent)); // Parent is left moved-from after this call.
+ }
+ /// @endcond Doxygen_Suppress
+ };
+
+ /// Base class for fluent adapters for a fragment of list, map or attributes; `TThis` is the concrete adapter template (CRTP-like).
+ template <template <class TParent> class TThis, class TParent>
+ class TFluentFragmentBase
+ : public TFluentBase<TParent>
+ {
+ public:
+ using TDeepThis = TThis<TParent>; // The concrete adapter type, with the full parent history.
+ using TShallowThis = TThis<TFluentYsonVoid>; // Same adapter without parent history; this is what delegated procedures receive.
+ using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped;
+
+ explicit TFluentFragmentBase(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent())
+ : TFluentBase<TParent>(consumer, std::move(parent))
+ { }
+
+ /// Delegate invocation to a separate procedure: calls `func(TShallowThis(Consumer))` and returns this adapter.
+ template <class TFunc>
+ TDeepThis& Do(const TFunc& func)
+ {
+ func(TShallowThis(this->Consumer));
+ return *static_cast<TDeepThis*>(this);
+ }
+
+ /// Conditionally delegate invocation to a separate procedure; `func` is not called when `condition` is false.
+ template <class TFunc>
+ TDeepThis& DoIf(bool condition, const TFunc& func)
+ {
+ if (condition) {
+ func(TShallowThis(this->Consumer));
+ }
+ return *static_cast<TDeepThis*>(this);
+ }
+
+ /// Calls `func(TShallowThis(Consumer), it)` for each iterator `it` in range `[begin, end)`; note that the iterator itself is passed, not `*it`.
+ template <class TFunc, class TIterator>
+ TDeepThis& DoFor(const TIterator& begin, const TIterator& end, const TFunc& func)
+ {
+ for (auto current = begin; current != end; ++current) {
+ func(TShallowThis(this->Consumer), current);
+ }
+ return *static_cast<TDeepThis*>(this);
+ }
+
+ /// Calls `func(TShallowThis(Consumer), element)` for each `element` in `collection`.
+ template <class TFunc, class TCollection>
+ TDeepThis& DoFor(const TCollection& collection, const TFunc& func)
+ {
+ for (const auto& item : collection) {
+ func(TShallowThis(this->Consumer), item);
+ }
+ return *static_cast<TDeepThis*>(this);
+ }
+
+ };
+
+ /// Fluent adapter of a value without attributes (it offers no BeginAttributes()).
+ template <class TParent>
+ class TAnyWithoutAttributes
+ : public TFluentBase<TParent>
+ {
+ public:
+ using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped;
+
+ TAnyWithoutAttributes(NYT::NYson::IYsonConsumer* consumer, TParent parent)
+ : TFluentBase<TParent>(consumer, std::move(parent))
+ { }
+
+ /// Pass `value` to underlying consumer (via WriteValue) and return the unwrapped parent.
+ template <class T>
+ TUnwrappedParent Value(const T& value)
+ {
+ WriteValue(this->Consumer, value);
+ return this->GetUnwrappedParent();
+ }
+
+ /// Call `OnEntity()` of underlying consumer and return the unwrapped parent.
+ TUnwrappedParent Entity()
+ {
+ this->Consumer->OnEntity();
+ return this->GetUnwrappedParent();
+ }
+
+ /// Serialize `collection` to underlying consumer as a list, one item per element.
+ template <class TCollection>
+ TUnwrappedParent List(const TCollection& collection)
+ {
+ this->Consumer->OnBeginList();
+ for (const auto& item : collection) {
+ this->Consumer->OnListItem();
+ WriteValue(this->Consumer, item);
+ }
+ this->Consumer->OnEndList();
+ return this->GetUnwrappedParent();
+ }
+
+ /// Serialize at most `maxSize` first elements of `collection` as a list, preceded by attributes holding the full `collection.size()` under key "count".
+ template <class TCollection>
+ TUnwrappedParent ListLimited(const TCollection& collection, size_t maxSize)
+ {
+ this->Consumer->OnBeginAttributes();
+ this->Consumer->OnKeyedItem("count");
+ this->Consumer->OnInt64Scalar(collection.size());
+ this->Consumer->OnEndAttributes();
+ this->Consumer->OnBeginList();
+ size_t printedSize = 0;
+ for (const auto& item : collection) {
+ if (printedSize >= maxSize)
+ break;
+ this->Consumer->OnListItem();
+ WriteValue(this->Consumer, item);
+ ++printedSize;
+ }
+ this->Consumer->OnEndList();
+ return this->GetUnwrappedParent();
+ }
+
+ /// Open a list; the returned list fragment receives a copy of this adapter's parent.
+ TListType<TParent> BeginList()
+ {
+ this->Consumer->OnBeginList();
+ return TListType<TParent>(this->Consumer, this->Parent);
+ }
+
+ /// Open a list, delegate invocation to `func`, then close the list.
+ template <class TFunc>
+ TUnwrappedParent DoList(const TFunc& func)
+ {
+ this->Consumer->OnBeginList();
+ func(TListType<TFluentYsonVoid>(this->Consumer));
+ this->Consumer->OnEndList();
+ return this->GetUnwrappedParent();
+ }
+
+ /// Open a list, call `func(listFragment, it)` for each iterator `it` of range `[begin, end)` (the iterator itself is passed, not `*it`), then close the list.
+ template <class TFunc, class TIterator>
+ TUnwrappedParent DoListFor(const TIterator& begin, const TIterator& end, const TFunc& func)
+ {
+ this->Consumer->OnBeginList();
+ for (auto current = begin; current != end; ++current) {
+ func(TListType<TFluentYsonVoid>(this->Consumer), current);
+ }
+ this->Consumer->OnEndList();
+ return this->GetUnwrappedParent();
+ }
+
+ /// Open a list, call `func(listFragment, element)` for each `element` of `collection`, then close the list.
+ template <class TFunc, class TCollection>
+ TUnwrappedParent DoListFor(const TCollection& collection, const TFunc& func)
+ {
+ this->Consumer->OnBeginList();
+ for (const auto& item : collection) {
+ func(TListType<TFluentYsonVoid>(this->Consumer), item);
+ }
+ this->Consumer->OnEndList();
+ return this->GetUnwrappedParent();
+ }
+
+ /// Open a map; the returned map fragment receives a copy of this adapter's parent.
+ TMapType<TParent> BeginMap()
+ {
+ this->Consumer->OnBeginMap();
+ return TMapType<TParent>(this->Consumer, this->Parent);
+ }
+
+ /// Open a map, delegate invocation to `func`, then close the map.
+ template <class TFunc>
+ TUnwrappedParent DoMap(const TFunc& func)
+ {
+ this->Consumer->OnBeginMap();
+ func(TMapType<TFluentYsonVoid>(this->Consumer));
+ this->Consumer->OnEndMap();
+ return this->GetUnwrappedParent();
+ }
+
+ /// Open a map, call `func(mapFragment, it)` for each iterator `it` of range `[begin, end)` (the iterator itself is passed, not `*it`), then close the map.
+ template <class TFunc, class TIterator>
+ TUnwrappedParent DoMapFor(const TIterator& begin, const TIterator& end, const TFunc& func)
+ {
+ this->Consumer->OnBeginMap();
+ for (auto current = begin; current != end; ++current) {
+ func(TMapType<TFluentYsonVoid>(this->Consumer), current);
+ }
+ this->Consumer->OnEndMap();
+ return this->GetUnwrappedParent();
+ }
+
+ /// Open a map, call `func(mapFragment, element)` for each `element` of `collection`, then close the map.
+ template <class TFunc, class TCollection>
+ TUnwrappedParent DoMapFor(const TCollection& collection, const TFunc& func)
+ {
+ this->Consumer->OnBeginMap();
+ for (const auto& item : collection) {
+ func(TMapType<TFluentYsonVoid>(this->Consumer), item);
+ }
+ this->Consumer->OnEndMap();
+ return this->GetUnwrappedParent();
+ }
+ };
+
+ /// Fluent adapter of any value: everything TAnyWithoutAttributes offers plus BeginAttributes().
+ template <class TParent>
+ class TAny
+ : public TAnyWithoutAttributes<TParent>
+ {
+ public:
+ using TBase = TAnyWithoutAttributes<TParent>;
+
+ explicit TAny(NYT::NYson::IYsonConsumer* consumer, TParent parent)
+ : TBase(consumer, std::move(parent))
+ { }
+
+ /// Open attributes; the returned fragment's parent is a TAnyWithoutAttributes, so attributes cannot be opened twice for one value.
+ TAttributes<TBase> BeginAttributes()
+ {
+ this->Consumer->OnBeginAttributes();
+ return TAttributes<TBase>(
+ this->Consumer,
+ TBase(this->Consumer, this->Parent));
+ }
+ };
+
+ /// Fluent adapter of attributes fragment (the inside part of attributes).
+ template <class TParent = TFluentYsonVoid>
+ class TAttributes
+ : public TFluentFragmentBase<TAttributes, TParent>
+ {
+ public:
+ using TThis = TAttributes<TParent>;
+ using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped;
+
+ explicit TAttributes(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent())
+ : TFluentFragmentBase<TFluentYsonBuilder::TAttributes, TParent>(consumer, std::move(parent))
+ { }
+
+ /// Pass attribute key to underlying consumer; the returned value adapter gets a copy of this fragment as its parent.
+ TAny<TThis> Item(const TStringBuf& key)
+ {
+ this->Consumer->OnKeyedItem(key);
+ return TAny<TThis>(this->Consumer, *this);
+ }
+
+ /// Pass attribute key given as a char array (e.g. a string literal) to underlying consumer; the last array element (the terminator) is not part of the key.
+ template <size_t Size>
+ TAny<TThis> Item(const char (&key)[Size])
+ {
+ return Item(TStringBuf(key, Size - 1));
+ }
+
+ //TODO: from TNode
+
+ /// Close the attributes and return the unwrapped parent.
+ TUnwrappedParent EndAttributes()
+ {
+ this->Consumer->OnEndAttributes();
+ return this->GetUnwrappedParent();
+ }
+ };
+
+ /// Fluent adapter of list fragment (the inside part of a list).
+ template <class TParent = TFluentYsonVoid>
+ class TListType
+ : public TFluentFragmentBase<TListType, TParent>
+ {
+ public:
+ using TThis = TListType<TParent>;
+ using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped;
+
+ explicit TListType(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent())
+ : TFluentFragmentBase<TFluentYsonBuilder::TListType, TParent>(consumer, std::move(parent))
+ { }
+
+ /// Call `OnListItem()` of underlying consumer; the returned value adapter gets a copy of this fragment as its parent.
+ TAny<TThis> Item()
+ {
+ this->Consumer->OnListItem();
+ return TAny<TThis>(this->Consumer, *this);
+ }
+
+ // TODO: from TNode
+
+ /// Close the list and return the unwrapped parent.
+ TUnwrappedParent EndList()
+ {
+ this->Consumer->OnEndList();
+ return this->GetUnwrappedParent();
+ }
+ };
+
+ /// Fluent adapter of map fragment (the inside part of a map).
+ template <class TParent = TFluentYsonVoid>
+ class TMapType
+ : public TFluentFragmentBase<TMapType, TParent>
+ {
+ public:
+ using TThis = TMapType<TParent>;
+ using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped;
+
+ explicit TMapType(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent())
+ : TFluentFragmentBase<TFluentYsonBuilder::TMapType, TParent>(consumer, std::move(parent))
+ { }
+
+ /// Pass map key given as a char array (e.g. a string literal) to underlying consumer; the last array element (the terminator) is not part of the key.
+ template <size_t Size>
+ TAny<TThis> Item(const char (&key)[Size])
+ {
+ return Item(TStringBuf(key, Size - 1));
+ }
+
+ /// Pass map key to underlying consumer; the returned value adapter gets a copy of this fragment as its parent.
+ TAny<TThis> Item(const TStringBuf& key)
+ {
+ this->Consumer->OnKeyedItem(key);
+ return TAny<TThis>(this->Consumer, *this);
+ }
+
+ // TODO: from TNode
+
+ /// Close the map and return the unwrapped parent.
+ TUnwrappedParent EndMap()
+ {
+ this->Consumer->OnEndMap();
+ return this->GetUnwrappedParent();
+ }
+ };
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Builder representing any value.
+using TFluentAny = TFluentYsonBuilder::TAny<TFluentYsonVoid>;
+
+/// Builder representing the inside of a list (list fragment).
+using TFluentList = TFluentYsonBuilder::TListType<TFluentYsonVoid>;
+
+/// Builder representing the inside of a map (map fragment).
+using TFluentMap = TFluentYsonBuilder::TMapType<TFluentYsonVoid>;
+
+/// Builder representing the inside of attributes.
+using TFluentAttributes = TFluentYsonBuilder::TAttributes<TFluentYsonVoid>;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Create a parentless fluent adapter that drives `consumer` to describe a single value.
+static inline TFluentAny BuildYsonFluently(NYT::NYson::IYsonConsumer* consumer)
+{
+    TFluentYsonVoid noParent;
+    return TFluentAny(consumer, noParent);
+}
+
+/// Create a parentless fluent adapter that drives `consumer` to describe the items
+/// of a list (the list itself is not opened or closed here).
+static inline TFluentList BuildYsonListFluently(NYT::NYson::IYsonConsumer* consumer)
+{
+    return TFluentList(consumer);
+}
+
+/// Create a parentless fluent adapter that drives `consumer` to describe the items
+/// of a map (the map itself is not opened or closed here).
+static inline TFluentMap BuildYsonMapFluently(NYT::NYson::IYsonConsumer* consumer)
+{
+    return TFluentMap(consumer);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// State for a fluent chain that produces a YSON string: a yson writer that
+/// prints into an in-memory stream owned by the state.
+class TFluentYsonWriterState
+    : public TThrRefBase
+{
+public:
+    using TValue = TString;
+
+    explicit TFluentYsonWriterState(::NYson::EYsonFormat format)
+        : Writer(&Output, format)
+    { }
+
+    /// Consumer that the fluent adapters should drive.
+    NYT::NYson::IYsonConsumer* GetConsumer()
+    {
+        return &Writer;
+    }
+
+    /// Copy of everything written to the stream so far.
+    TString GetValue()
+    {
+        return Output.Str();
+    }
+
+private:
+    // Output is declared before Writer because Writer is constructed with a pointer to it.
+    TStringStream Output;
+    ::NYson::TYsonWriter Writer;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// State for a fluent chain that produces a TNode: a node builder that fills a
+/// node owned by the state.
+class TFluentYsonBuilderState
+    : public TThrRefBase
+{
+public:
+    using TValue = TNode;
+
+    explicit TFluentYsonBuilderState()
+        : Builder(&Node)
+    { }
+
+    /// Consumer that the fluent adapters should drive.
+    NYT::NYson::IYsonConsumer* GetConsumer()
+    {
+        return &Builder;
+    }
+
+    /// Hands the built node over to the caller; the stored node is moved-from afterwards.
+    TNode GetValue()
+    {
+        return std::move(Node);
+    }
+
+private:
+    // Node is declared before Builder because Builder is constructed with a pointer to it.
+    TNode Node;
+    TNodeBuilder Builder;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Parent type for fluent chains that own their output state: keeps the state
+/// alive (through an intrusive pointer) for as long as the chain exists.
+template <class TState>
+class TFluentYsonHolder
+{
+public:
+    /// Takes shared ownership of `state`. The by-value argument is moved into the
+    /// member, which avoids a redundant reference-count increment/decrement.
+    explicit TFluentYsonHolder(::TIntrusivePtr<TState> state)
+        : State(std::move(state))
+    { }
+
+    /// Returns another owning pointer to the held state.
+    ::TIntrusivePtr<TState> GetState() const
+    {
+        return State;
+    }
+
+private:
+    ::TIntrusivePtr<TState> State;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Unwrapper for state-owning chains: finishing the chain yields the value
+/// accumulated in the state (`TState::TValue`).
+template <class TState>
+struct TFluentYsonUnwrapper< TFluentYsonHolder<TState> >
+{
+    using TUnwrapped = typename TState::TValue;
+
+    static TUnwrapped Unwrap(const TFluentYsonHolder<TState>& holder)
+    {
+        // The states declared in this header return GetValue() by value, so the result
+        // is returned directly; wrapping it in std::move would only inhibit copy elision.
+        return holder.GetState()->GetValue();
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Create a fluent adapter that drives `state->GetConsumer()` and keeps `state`
+/// alive; finishing the chain returns `state->GetValue()`.
+template <class TState>
+TFluentYsonBuilder::TAny<TFluentYsonHolder<TState>>
+BuildYsonFluentlyWithState(::TIntrusivePtr<TState> state)
+{
+    using THolder = TFluentYsonHolder<TState>;
+    return TFluentYsonBuilder::TAny<THolder>(
+        state->GetConsumer(),
+        THolder(state));
+}
+
+/// Create a fluent adapter that returns a `TString` with the corresponding YSON
+/// (in `format`, text by default) once construction is finished.
+inline TFluentYsonBuilder::TAny<TFluentYsonHolder<TFluentYsonWriterState>>
+BuildYsonStringFluently(::NYson::EYsonFormat format = ::NYson::EYsonFormat::Text)
+{
+    ::TIntrusivePtr<TFluentYsonWriterState> writerState(new TFluentYsonWriterState(format));
+    return BuildYsonFluentlyWithState(writerState);
+}
+
+/// Create a fluent adapter that returns a @ref NYT::TNode once construction is finished.
+inline TFluentYsonBuilder::TAny<TFluentYsonHolder<TFluentYsonBuilderState>>
+BuildYsonNodeFluently()
+{
+    ::TIntrusivePtr<TFluentYsonBuilderState> builderState(new TFluentYsonBuilderState);
+    return BuildYsonFluentlyWithState(builderState);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/format.cpp b/yt/cpp/mapreduce/interface/format.cpp
new file mode 100644
index 0000000000..f8318310a4
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/format.cpp
@@ -0,0 +1,135 @@
+#include "format.h"
+#include "protobuf_format.h"
+
+#include "errors.h"
+
+#include <google/protobuf/descriptor.h>
+#include <google/protobuf/messagext.h>
+
+namespace NYT {
+
+// Public entry point declared in format.h; simply forwards both arguments
+// to NDetail::CreateTableSchemaImpl and returns its result.
+TTableSchema CreateTableSchema(
+ const ::google::protobuf::Descriptor& messageDescriptor,
+ bool keepFieldsWithoutExtension)
+{
+ return NDetail::CreateTableSchemaImpl(messageDescriptor, keepFieldsWithoutExtension);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Stores a copy of the given node as the format config; no validation is performed here.
+TFormat::TFormat(const TNode& config)
+ : Config(config)
+{ }
+
+
+// Builds a protobuf format config for the given message descriptors.
+// `withDescriptors` selects which NDetail config builder produces the config node.
+TFormat TFormat::Protobuf(
+    const TVector<const ::google::protobuf::Descriptor*>& descriptors,
+    bool withDescriptors)
+{
+    if (!withDescriptors) {
+        return TFormat(NDetail::MakeProtoFormatConfigWithTables(descriptors));
+    }
+    return TFormat(NDetail::MakeProtoFormatConfigWithDescriptors(descriptors));
+}
+
+// "yson" format with attribute format="text".
+TFormat TFormat::YsonText()
+{
+    TFormat result(TNode("yson"));
+    result.Config.Attributes()("format", "text");
+    return result;
+}
+
+// "yson" format with attribute format="binary".
+TFormat TFormat::YsonBinary()
+{
+    TFormat result(TNode("yson"));
+    result.Config.Attributes()("format", "binary");
+    return result;
+}
+
+// "yamr" format with attributes lenval=true and has_subkey=true.
+TFormat TFormat::YaMRLenval()
+{
+    TFormat result(TNode("yamr"));
+    auto& attributes = result.Config.Attributes();
+    attributes("lenval", true)("has_subkey", true);
+    return result;
+}
+
+// Plain "json" format; no attributes are set.
+TFormat TFormat::Json()
+{
+    const TNode config("json");
+    return TFormat(config);
+}
+
+// True only when the config is the string "yson" AND it carries an explicit
+// `format` attribute equal to "text"; a "yson" config without that attribute
+// is reported as not text.
+bool TFormat::IsTextYson() const
+{
+    const bool isYson = Config.IsString() && Config.AsString() == "yson";
+    if (!isYson || !Config.HasAttributes()) {
+        return false;
+    }
+    const auto& attrs = Config.GetAttributes();
+    return attrs.HasKey("format") && attrs["format"] == TNode("text");
+}
+
+// True when the config node is the string "protobuf"; attributes are not inspected.
+bool TFormat::IsProtobuf() const
+{
+    if (!Config.IsString()) {
+        return false;
+    }
+    return Config.AsString() == "protobuf";
+}
+
+// True when the config node is the string "yamred_dsv"; attributes are not inspected.
+bool TFormat::IsYamredDsv() const
+{
+    if (!Config.IsString()) {
+        return false;
+    }
+    return Config.AsString() == "yamred_dsv";
+}
+
+static TString FormatName(const TFormat& format)
+{
+ if (!format.Config.IsString()) {
+ Y_VERIFY(format.Config.IsUndefined());
+ return "<undefined>";
+ }
+ return format.Config.AsString();
+}
+
+// Extracts the key / subkey column lists of a yamred_dsv format.
+//
+// Reads the `key_column_names` attribute (must be a list of strings) and,
+// when the key is present, the `subkey_column_names` attribute (same shape).
+//
+// Throws TApiUsageError if this format is not yamred_dsv and yexception if
+// either attribute is not a list of strings.
+TYamredDsvAttributes TFormat::GetYamredDsvAttributes() const
+{
+    if (!IsYamredDsv()) {
+        ythrow TApiUsageError() << "Cannot get yamred_dsv attributes for " << FormatName(*this) << " format";
+    }
+    TYamredDsvAttributes attributes;
+
+    const auto& nodeAttributes = Config.GetAttributes();
+    {
+        // Unlike the subkey list below, this attribute is looked up without a HasKey() guard.
+        const auto& keyColumns = nodeAttributes["key_column_names"];
+        if (!keyColumns.IsList()) {
+            ythrow yexception() << "Ill-formed format: key_column_names is of non-list type: " << keyColumns.GetType();
+        }
+        for (const auto& column : keyColumns.AsList()) {
+            if (!column.IsString()) {
+                // Message now says what is wrong, mirroring the subkey branch.
+                ythrow yexception() << "Ill-formed format: non-string inside key_column_names: " << column.GetType();
+            }
+            attributes.KeyColumnNames.push_back(column.AsString());
+        }
+    }
+
+    if (nodeAttributes.HasKey("subkey_column_names")) {
+        const auto& subkeyColumns = nodeAttributes["subkey_column_names"];
+        if (!subkeyColumns.IsList()) {
+            ythrow yexception() << "Ill-formed format: subkey_column_names is not a list: " << subkeyColumns.GetType();
+        }
+        for (const auto& column : subkeyColumns.AsList()) {
+            if (!column.IsString()) {
+                // Attribute is called subkey_column_names (the old message said "subkey_key_column_names").
+                ythrow yexception() << "Ill-formed format: non-string inside subkey_column_names: " << column.GetType();
+            }
+            attributes.SubkeyColumnNames.push_back(column.AsString());
+        }
+    }
+
+    return attributes;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/format.h b/yt/cpp/mapreduce/interface/format.h
new file mode 100644
index 0000000000..e297576464
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/format.h
@@ -0,0 +1,122 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/format.h
+///
+/// Header containing class to work with raw [YT formats](https://yt.yandex-team.ru/docs/description/storage/formats.html).
+
+#include "node.h"
+
+#include <google/protobuf/descriptor.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Structured view of the column-name attributes of a yamred_dsv format.
+///
+/// Returned by @ref NYT::TFormat::GetYamredDsvAttributes.
+///
+/// @deprecated
+struct TYamredDsvAttributes
+{
+ /// Names of key columns.
+ TVector<TString> KeyColumnNames;
+
+ /// Names of subkey columns.
+ TVector<TString> SubkeyColumnNames;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Class representing YT data format.
+///
+/// Normally the user does not need to use it.
+/// However, the class is handy for "raw" operations and table reading and writing,
+/// e.g. @ref NYT::IOperationClient::RawMap and other raw operations,
+/// @ref NYT::IIOClient::CreateRawReader and @ref NYT::IIOClient::CreateRawWriter.
+/// Anyway, the static factory methods should be preferred to the constructor.
+///
+/// @see [YT doc](https://yt.yandex-team.ru/docs/description/storage/formats.html).
+struct TFormat
+{
+public:
+ /// Format representation understandable by YT.
+ TNode Config;
+
+public:
+ /// @brief Construct format from given YT format representation.
+ ///
+ /// @note Prefer using static factory methods (e.g. @ref NYT::TFormat::YsonBinary, @ref NYT::TFormat::YsonText, @ref NYT::TFormat::Protobuf).
+ explicit TFormat(const TNode& config = TNode());
+
+ /// @brief Create text YSON format.
+ ///
+ /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#YSON)
+ static TFormat YsonText();
+
+ /// @brief Create binary YSON format.
+ ///
+ /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#YSON)
+ static TFormat YsonBinary();
+
+ /// @brief Create YaMR format (`yamr` with the `lenval` and `has_subkey` attributes set to true).
+ ///
+ /// @deprecated
+ static TFormat YaMRLenval();
+
+ /// @brief Create protobuf format from protobuf message descriptors.
+ ///
+ /// @see [the doc](https://yt.yandex-team.ru/docs/api/c++/protobuf.html).
+ static TFormat Protobuf(
+ const TVector<const ::google::protobuf::Descriptor*>& descriptors,
+ bool withDescriptors = false);
+
+ /// @brief Create JSON format.
+ ///
+ /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#JSON)
+ static TFormat Json();
+
+ /// @brief Create protobuf format for the message specified in template parameter.
+ ///
+ /// `T` must be inherited from `Message`.
+ ///
+ /// @see [the doc](https://yt.yandex-team.ru/docs/api/c++/protobuf.html).
+ template<typename T>
+ static inline TFormat Protobuf(bool withDescriptors = false);
+
+ /// @brief Is the format text YSON?
+ ///
+ /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#YSON)
+ bool IsTextYson() const;
+
+ /// @brief Is the format protobuf?
+ ///
+ /// @see [the doc](https://yt.yandex-team.ru/docs/api/c++/protobuf.html)
+ bool IsProtobuf() const;
+
+ /// @brief Is the format `yamred_dsv`?
+ ///
+ /// @deprecated
+ bool IsYamredDsv() const;
+
+ /// @brief For `yamred_dsv` format returns its key / subkey column names in a structured way.
+ ///
+ /// Throws @ref NYT::TApiUsageError when called on any other format.
+ ///
+ /// @deprecated
+ TYamredDsvAttributes GetYamredDsvAttributes() const;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Single-message convenience overload: wraps `T::descriptor()` into a one-element
+/// descriptor list and delegates to the descriptor-list overload.
+template<typename T>
+TFormat TFormat::Protobuf(bool withDescriptors) {
+    const TVector<const ::google::protobuf::Descriptor*> descriptors{T::descriptor()};
+    return TFormat::Protobuf(descriptors, withDescriptors);
+}
+
+/// @brief Create table schema from protobuf message descriptor.
+///
+/// @param messageDescriptor Message descriptor
+/// @param keepFieldsWithoutExtension Add to schema fields without "column_name" or "key_column_name" extensions.
+///
+/// @return Table schema built for the given message descriptor.
+TTableSchema CreateTableSchema(
+ const ::google::protobuf::Descriptor& messageDescriptor,
+ bool keepFieldsWithoutExtension);
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/format_ut.cpp b/yt/cpp/mapreduce/interface/format_ut.cpp
new file mode 100644
index 0000000000..069c29087d
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/format_ut.cpp
@@ -0,0 +1,235 @@
+#include "common.h"
+#include "errors.h"
+#include "format.h"
+#include "common_ut.h"
+
+#include <yt/cpp/mapreduce/interface/proto3_ut.pb.h>
+#include <yt/cpp/mapreduce/interface/protobuf_table_schema_ut.pb.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NYT;
+
+// Returns a copy of the `columns` node of table `tableIndex` taken from the
+// `tables` attribute of the format config.
+static TNode GetColumns(const TFormat& format, int tableIndex = 0)
+{
+    const auto& tables = format.Config.GetAttributes()["tables"];
+    const auto& table = tables[tableIndex];
+    return table["columns"];
+}
+
+Y_UNIT_TEST_SUITE(ProtobufFormat)
+{
+    // Each expected (name, proto_type, field_number) triple must match the
+    // column at the same position in the generated format.
+    Y_UNIT_TEST(TIntegral)
+    {
+        struct TColumn
+        {
+            TString Name;
+            TString ProtoType;
+            int FieldNumber;
+        };
+
+        const TVector<TColumn> expected{
+            {"DoubleField", "double", 1},
+            {"FloatField", "float", 2},
+            {"Int32Field", "int32", 3},
+            {"Int64Field", "int64", 4},
+            {"Uint32Field", "uint32", 5},
+            {"Uint64Field", "uint64", 6},
+            {"Sint32Field", "sint32", 7},
+            {"Sint64Field", "sint64", 8},
+            {"Fixed32Field", "fixed32", 9},
+            {"Fixed64Field", "fixed64", 10},
+            {"Sfixed32Field", "sfixed32", 11},
+            {"Sfixed64Field", "sfixed64", 12},
+            {"BoolField", "bool", 13},
+            {"EnumField", "enum_string", 14},
+        };
+
+        const auto format = TFormat::Protobuf<NUnitTesting::TIntegral>();
+        auto columns = GetColumns(format);
+
+        // Sizes are asserted equal first, so iterating over `expected` covers every column.
+        UNIT_ASSERT_VALUES_EQUAL(columns.Size(), expected.size());
+        for (size_t index = 0; index != expected.size(); ++index) {
+            const auto& want = expected[index];
+            UNIT_ASSERT_VALUES_EQUAL(columns[index]["name"], want.Name);
+            UNIT_ASSERT_VALUES_EQUAL(columns[index]["proto_type"], want.ProtoType);
+            UNIT_ASSERT_VALUES_EQUAL(columns[index]["field_number"], want.FieldNumber);
+        }
+    }
+
+    // First column is expected as a structured message with three described
+    // fields; second column is expected as an opaque "message".
+    Y_UNIT_TEST(TRowFieldSerializationOption)
+    {
+        const auto format = TFormat::Protobuf<NUnitTesting::TRowFieldSerializationOption>();
+        auto columns = GetColumns(format);
+
+        auto& structured = columns[0];
+        UNIT_ASSERT_VALUES_EQUAL(structured["name"], "UrlRow_1");
+        UNIT_ASSERT_VALUES_EQUAL(structured["proto_type"], "structured_message");
+        UNIT_ASSERT_VALUES_EQUAL(structured["field_number"], 1);
+
+        const auto& fields = structured["fields"];
+        const auto& host = fields[0];
+        UNIT_ASSERT_VALUES_EQUAL(host["name"], "Host");
+        UNIT_ASSERT_VALUES_EQUAL(host["proto_type"], "string");
+        UNIT_ASSERT_VALUES_EQUAL(host["field_number"], 1);
+
+        const auto& path = fields[1];
+        UNIT_ASSERT_VALUES_EQUAL(path["name"], "Path");
+        UNIT_ASSERT_VALUES_EQUAL(path["proto_type"], "string");
+        UNIT_ASSERT_VALUES_EQUAL(path["field_number"], 2);
+
+        const auto& httpCode = fields[2];
+        UNIT_ASSERT_VALUES_EQUAL(httpCode["name"], "HttpCode");
+        UNIT_ASSERT_VALUES_EQUAL(httpCode["proto_type"], "sint32");
+        UNIT_ASSERT_VALUES_EQUAL(httpCode["field_number"], 3);
+
+        auto& opaque = columns[1];
+        UNIT_ASSERT_VALUES_EQUAL(opaque["name"], "UrlRow_2");
+        UNIT_ASSERT_VALUES_EQUAL(opaque["proto_type"], "message");
+        UNIT_ASSERT_VALUES_EQUAL(opaque["field_number"], 2);
+    }
+
+    // The first column of TPacked must be reported as a packed, repeated int64.
+    Y_UNIT_TEST(Packed)
+    {
+        const auto format = TFormat::Protobuf<NUnitTesting::TPacked>();
+        auto columns = GetColumns(format);
+        auto column = columns[0];
+
+        UNIT_ASSERT_VALUES_EQUAL(column["name"], "PackedListInt64");
+        UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "int64");
+        UNIT_ASSERT_VALUES_EQUAL(column["field_number"], 1);
+        UNIT_ASSERT_VALUES_EQUAL(column["packed"], true);
+        UNIT_ASSERT_VALUES_EQUAL(column["repeated"], true);
+    }
+
+ // Building a format for TCyclic and its nested TA..TD messages must be rejected
+ // with TApiUsageError, while TE is accepted and exposes its first column "d"
+ // as an opaque "message".
+ Y_UNIT_TEST(Cyclic)
+ {
+ UNIT_ASSERT_EXCEPTION(TFormat::Protobuf<NUnitTesting::TCyclic>(), TApiUsageError);
+ UNIT_ASSERT_EXCEPTION(TFormat::Protobuf<NUnitTesting::TCyclic::TA>(), TApiUsageError);
+ UNIT_ASSERT_EXCEPTION(TFormat::Protobuf<NUnitTesting::TCyclic::TB>(), TApiUsageError);
+ UNIT_ASSERT_EXCEPTION(TFormat::Protobuf<NUnitTesting::TCyclic::TC>(), TApiUsageError);
+ UNIT_ASSERT_EXCEPTION(TFormat::Protobuf<NUnitTesting::TCyclic::TD>(), TApiUsageError);
+
+ // TE is the only message of the group for which format creation succeeds.
+ const auto format = TFormat::Protobuf<NUnitTesting::TCyclic::TE>();
+ auto column = GetColumns(format)[0];
+ UNIT_ASSERT_VALUES_EQUAL(column["name"], "d");
+ UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "message");
+ UNIT_ASSERT_VALUES_EQUAL(column["field_number"], 1);
+ }
+
+    // Every map column is expected as a structured message with two fields:
+    // an int64 first field and a second field whose proto type depends on the column.
+    Y_UNIT_TEST(Map)
+    {
+        const auto format = TFormat::Protobuf<NUnitTesting::TWithMap>();
+        auto columns = GetColumns(format);
+
+        UNIT_ASSERT_VALUES_EQUAL(columns.Size(), 5);
+
+        auto checkMapColumn = [] (const TNode& column, TStringBuf name, TStringBuf secondFieldProtoType) {
+            UNIT_ASSERT_VALUES_EQUAL(column["name"], name);
+            UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message");
+            UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 2);
+            UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["proto_type"], "int64");
+            UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["proto_type"], secondFieldProtoType);
+        };
+
+        checkMapColumn(columns[0], "MapDefault", "message");
+        checkMapColumn(columns[1], "MapListOfStructsLegacy", "message");
+        checkMapColumn(columns[2], "MapListOfStructs", "structured_message");
+        checkMapColumn(columns[3], "MapOptionalDict", "structured_message");
+        checkMapColumn(columns[4], "MapDict", "structured_message");
+    }
+
+ // Checks how oneof groups of TWithOneof are described in the format:
+ // columns 0 and 1 share one layout (verified by `check`), column 2 lists
+ // only three plain fields, and column 3 is itself of proto type "oneof".
+ Y_UNIT_TEST(Oneof)
+ {
+ const auto format = TFormat::Protobuf<NUnitTesting::TWithOneof>();
+ auto columns = GetColumns(format);
+
+ UNIT_ASSERT_VALUES_EQUAL(columns.Size(), 4);
+ // Shared expectations for a structured column with five fields; only the
+ // column name and the name of the oneof at index 1 differ between callers.
+ auto check = [] (const TNode& column, TStringBuf name, TStringBuf oneof2Name) {
+ UNIT_ASSERT_VALUES_EQUAL(column["name"], name);
+ UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 5);
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "field");
+
+ const auto& oneof2 = column["fields"][1];
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["name"], oneof2Name);
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["proto_type"], "oneof");
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][0]["name"], "y2");
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][1]["name"], "z2");
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][1]["proto_type"], "structured_message");
+ // The structured member z2 contains a oneof of its own.
+ const auto& embeddedOneof = oneof2["fields"][1]["fields"][0];
+ UNIT_ASSERT_VALUES_EQUAL(embeddedOneof["name"], "Oneof");
+ UNIT_ASSERT_VALUES_EQUAL(embeddedOneof["fields"][0]["name"], "x");
+ UNIT_ASSERT_VALUES_EQUAL(embeddedOneof["fields"][1]["name"], "y");
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][2]["name"], "x2");
+
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][2]["name"], "x1");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][3]["name"], "y1");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][4]["name"], "z1");
+ };
+
+ check(columns[0], "DefaultSeparateFields", "variant_field_name");
+ check(columns[1], "NoDefault", "Oneof2");
+
+ {
+ // Here only x1, y1 and z1 are expected as fields.
+ const auto& column = columns[2];
+ UNIT_ASSERT_VALUES_EQUAL(column["name"], "SerializationProtobuf");
+ UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 3);
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "x1");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["name"], "y1");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][2]["name"], "z1");
+ }
+ {
+ // A oneof can also appear as a column by itself.
+ const auto& column = columns[3];
+ UNIT_ASSERT_VALUES_EQUAL(column["name"], "TopLevelOneof");
+ UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "oneof");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 1);
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "MemberOfTopLevelOneof");
+ }
+ }
+}
+
+Y_UNIT_TEST_SUITE(Proto3)
+{
+    // The first column of the proto3 message must be described as int64 field "x" with number 1.
+    Y_UNIT_TEST(TWithOptional)
+    {
+        const auto format = TFormat::Protobuf<NTestingProto3::TWithOptional>();
+        auto columns = GetColumns(format);
+        auto& column = columns[0];
+
+        UNIT_ASSERT_VALUES_EQUAL(column["name"], "x");
+        UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "int64");
+        UNIT_ASSERT_VALUES_EQUAL(column["field_number"], 1);
+    }
+
+    // The first column must be a structured message "x" wrapping a single int64 field "x".
+    Y_UNIT_TEST(TWithOptionalMessage)
+    {
+        const auto format = TFormat::Protobuf<NTestingProto3::TWithOptionalMessage>();
+        auto columns = GetColumns(format);
+        auto& column = columns[0];
+
+        UNIT_ASSERT_VALUES_EQUAL(column["name"], "x");
+        UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message");
+        UNIT_ASSERT_VALUES_EQUAL(column["field_number"], 1);
+
+        auto& fields = column["fields"];
+        UNIT_ASSERT_VALUES_EQUAL(fields.Size(), 1);
+        UNIT_ASSERT_VALUES_EQUAL(fields[0]["name"], "x");
+        UNIT_ASSERT_VALUES_EQUAL(fields[0]["proto_type"], "int64");
+        UNIT_ASSERT_VALUES_EQUAL(fields[0]["field_number"], 1);
+    }
+}
diff --git a/yt/cpp/mapreduce/interface/fwd.h b/yt/cpp/mapreduce/interface/fwd.h
new file mode 100644
index 0000000000..0434c03d8b
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/fwd.h
@@ -0,0 +1,397 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/fwd.h
+///
+/// Header containing mostly forward declarations of types.
+
+
+#include <util/generic/fwd.h>
+#include <util/system/types.h>
+
+#include <variant>
+
+/// @cond Doxygen_Suppress
+// Forward declaration of the protobuf message class, so this header does not
+// have to include protobuf headers itself.
+namespace google::protobuf {
+ class Message;
+}
+
+namespace NYT {
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // batch_request.h
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // Batch request interface and its intrusive-pointer alias.
+ class IBatchRequest;
+ using TBatchRequestPtr = ::TIntrusivePtr<IBatchRequest>;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // client.h
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // Forward declarations for client.h: an opaque enum declaration, option /
+ // result structs, and interfaces together with their ::TIntrusivePtr aliases.
+ enum ELockMode : int;
+
+ struct TStartTransactionOptions;
+
+ struct TLockOptions;
+
+ template <class TDerived>
+ struct TTabletOptions;
+
+ struct TMountTableOptions;
+
+ struct TUnmountTableOptions;
+
+ struct TRemountTableOptions;
+
+ struct TReshardTableOptions;
+
+ struct TAlterTableOptions;
+
+ struct TLookupRowsOptions;
+
+ struct TSelectRowsOptions;
+
+ struct TCreateClientOptions;
+
+ struct TAlterTableReplicaOptions;
+
+ struct TGetFileFromCacheOptions;
+
+ struct TPutFileToCacheOptions;
+
+ struct TCheckPermissionResult;
+ struct TCheckPermissionResponse;
+ struct TCheckPermissionOptions;
+
+ struct TTabletInfo;
+
+ class ILock;
+ using ILockPtr = ::TIntrusivePtr<ILock>;
+
+ class ITransaction;
+ using ITransactionPtr = ::TIntrusivePtr<ITransaction>;
+
+ class ITransactionPinger;
+ using ITransactionPingerPtr = ::TIntrusivePtr<ITransactionPinger>;
+
+ struct IOperation;
+ using IOperationPtr = ::TIntrusivePtr<IOperation>;
+
+ class IClientBase;
+
+ class IClient;
+
+ using IClientPtr = ::TIntrusivePtr<IClient>;
+ using IClientBasePtr = ::TIntrusivePtr<IClientBase>;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // config.h
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // Configuration struct and its intrusive-pointer alias.
+ struct TConfig;
+ using TConfigPtr = ::TIntrusivePtr<TConfig>;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // cypress.h
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // Forward declarations for cypress.h: an opaque enum declaration, per-call
+ // option structs and the Cypress client interface.
+ enum ENodeType : int;
+
+ struct TCreateOptions;
+
+ struct TRemoveOptions;
+
+ struct TGetOptions;
+
+ struct TSetOptions;
+
+ struct TMultisetAttributesOptions;
+
+ struct TListOptions;
+
+ struct TCopyOptions;
+
+ struct TMoveOptions;
+
+ struct TLinkOptions;
+
+ struct TConcatenateOptions;
+
+ struct TInsertRowsOptions;
+
+ struct TDeleteRowsOptions;
+
+ struct TTrimRowsOptions;
+
+ class ICypressClient;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // errors.h
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // Forward declarations of the error-related types from errors.h.
+ class TApiUsageError;
+
+ class TYtError;
+
+ class TErrorResponse;
+
+ struct TFailedJobInfo;
+
+ class TOperationFailedError;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // node.h
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // Forward declaration only; needed by the TKey alias further down in this header.
+ class TNode;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // common.h
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // Identifier aliases: every id kind below is an alias of TGUID, so the
+ // distinct names document intent but are not distinct types.
+ using TTransactionId = TGUID;
+ using TNodeId = TGUID;
+ using TLockId = TGUID;
+ using TOperationId = TGUID;
+ using TTabletCellId = TGUID;
+ using TReplicaId = TGUID;
+ using TJobId = TGUID;
+
+ // Both path kinds are plain TString aliases.
+ using TYPath = TString;
+ using TLocalFilePath = TString;
+
+ // Declared here with its default template argument (TDerived = void).
+ template <class T, class TDerived = void>
+ struct TOneOrMany;
+
+ // key column values
+ using TKey = TOneOrMany<TNode>;
+
+ class TSortColumn;
+
+ // column names
+ using TColumnNames = TOneOrMany<TString>;
+
+ // key column descriptors.
+ class TSortColumns;
+
+ // Opaque enum declarations: the underlying type is fixed here so the enums
+ // can be used before their enumerators are visible.
+ enum EValueType : int;
+
+ enum ESortOrder : int;
+
+ enum EOptimizeForAttr : i8;
+
+ enum EErasureCodecAttr : i8;
+
+ enum ESchemaModificationAttr : i8;
+
+ enum class EMasterReadKind : int;
+
+ class TColumnSchema;
+
+ class TTableSchema;
+
+ enum class ERelation;
+
+ struct TKeyBound;
+
+ struct TReadLimit;
+
+ struct TReadRange;
+
+ struct TRichYPath;
+
+ struct TAttributeFilter;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // io.h
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // Forward declarations for the I/O layer: format types, file / blob / raw
+ // table readers and writers, typed table reader / writer templates, row
+ // types and I/O option structs.
+ enum class EFormatType : int;
+
+ struct TFormat;
+
+ class IFileReader;
+
+ using IFileReaderPtr = ::TIntrusivePtr<IFileReader>;
+
+ class IFileWriter;
+
+ using IFileWriterPtr = ::TIntrusivePtr<IFileWriter>;
+
+ class IBlobTableReader;
+ using IBlobTableReaderPtr = ::TIntrusivePtr<IBlobTableReader>;
+
+ class TRawTableReader;
+
+ using TRawTableReaderPtr = ::TIntrusivePtr<TRawTableReader>;
+
+ class TRawTableWriter;
+
+ using TRawTableWriterPtr = ::TIntrusivePtr<TRawTableWriter>;
+
+ // The typed reader / writer templates take the row type plus a second,
+ // defaulted (void) template parameter.
+ template <class T, class = void>
+ class TTableReader;
+
+ template <class T, class = void>
+ class TTableRangesReader;
+
+ template <typename T>
+ using TTableRangesReaderPtr = ::TIntrusivePtr<TTableRangesReader<T>>;
+
+ template <class T>
+ using TTableReaderPtr = ::TIntrusivePtr<TTableReader<T>>;
+
+ template <class T, class = void>
+ class TTableWriter;
+
+ template <class T>
+ using TTableWriterPtr = ::TIntrusivePtr<TTableWriter<T>>;
+
+ struct TYaMRRow;
+
+ // Makes the protobuf Message type available as NYT::Message.
+ using ::google::protobuf::Message;
+
+ class ISkiffRowParser;
+
+ using ISkiffRowParserPtr = ::TIntrusivePtr<ISkiffRowParser>;
+
+ class ISkiffRowSkipper;
+
+ using ISkiffRowSkipperPtr = ::TIntrusivePtr<ISkiffRowSkipper>;
+
+ namespace NDetail {
+
+ class TYdlGenericRowType;
+
+ } // namespace NDetail
+
+ template<class... TYdlRowTypes>
+ class TYdlOneOf;
+
+ template<class... TProtoRowTypes>
+ class TProtoOneOf;
+
+ template<class... TSkiffRowTypes>
+ class TSkiffRowOneOf;
+
+ // Convenience aliases for the reader / writer templates instantiated with
+ // the row types declared above.
+ using TYaMRReader = TTableReader<TYaMRRow>;
+ using TYaMRWriter = TTableWriter<TYaMRRow>;
+ using TNodeReader = TTableReader<TNode>;
+ using TNodeWriter = TTableWriter<TNode>;
+ using TMessageReader = TTableReader<Message>;
+ using TMessageWriter = TTableWriter<Message>;
+ using TYdlTableWriter = TTableWriter<NDetail::TYdlGenericRowType>;
+
+ template <class TDerived>
+ struct TIOOptions;
+
+ struct TFileReaderOptions;
+
+ struct TFileWriterOptions;
+
+ struct TTableReaderOptions;
+
+ class TSkiffRowHints;
+
+ struct TTableWriterOptions;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // job_statistics.h
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // Forward declarations of the job statistics container and its entry template.
+ class TJobStatistics;
+
+ template <typename T>
+ class TJobStatisticsEntry;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // operation.h
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // Forward declarations for operation.h: operation specs, job interfaces,
+ // mapper / reducer templates, operation and job option structs, state enums
+ // and the job binary / retry configuration types.
+ class TFormatHints;
+
+ struct TUserJobSpec;
+
+ struct TMapOperationSpec;
+
+ struct TRawMapOperationSpec;
+
+ struct TReduceOperationSpec;
+
+ struct TMapReduceOperationSpec;
+
+ struct TJoinReduceOperationSpec;
+
+ struct TSortOperationSpec;
+
+ class IIOperationPreparationContext;
+
+ class IJob;
+ using IJobPtr = ::TIntrusivePtr<IJob>;
+
+ class IRawJob;
+ using IRawJobPtr = ::TIntrusivePtr<IRawJob>;
+
+ enum EMergeMode : int;
+
+ struct TMergeOperationSpec;
+
+ struct TEraseOperationSpec;
+
+ // Job templates parameterized by two types (TR and TW).
+ template <class TR, class TW>
+ class IMapper;
+
+ template <class TR, class TW>
+ class IReducer;
+
+ template <class TR, class TW>
+ class IAggregatorReducer;
+
+ struct TSuspendOperationOptions;
+
+ struct TResumeOperationOptions;
+
+ enum class EOperationBriefState : int;
+
+ struct TOperationAttributes;
+
+ struct TOperationOptions;
+
+ enum class EOperationAttribute : int;
+
+ struct TOperationAttributeFilter;
+
+ struct TGetOperationOptions;
+
+ struct TListOperationsOptions;
+
+ struct TGetJobOptions;
+
+ struct TListJobsOptions;
+
+ struct IOperationClient;
+
+ enum class EFinishedJobState : int;
+
+ enum class EJobType : int;
+ enum class EJobState : int;
+ enum class ETaskName : int;
+ class TTaskName;
+
+ struct TJobBinaryDefault;
+
+ struct TJobBinaryLocalPath;
+
+ struct TJobBinaryCypressPath;
+
+ // Job binary configuration is exactly one of the three alternatives above.
+ using TJobBinaryConfig = std::variant<
+ TJobBinaryDefault,
+ TJobBinaryLocalPath,
+ TJobBinaryCypressPath>;
+
+ struct TRetryConfig;
+ class IRetryConfigProvider;
+ using IRetryConfigProviderPtr = ::TIntrusivePtr<IRetryConfigProvider>;
+}
+/// @endcond
diff --git a/yt/cpp/mapreduce/interface/init.h b/yt/cpp/mapreduce/interface/init.h
new file mode 100644
index 0000000000..302be268fc
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/init.h
@@ -0,0 +1,71 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/init.h
+///
+/// Initialization functions of YT Wrapper.
+
+#include <yt/cpp/mapreduce/interface/wait_proxy.h>
+
+#include <util/generic/fwd.h>
+
+#include <functional>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Options for @ref NYT::Initialize() and @ref NYT::JoblessInitialize() functions
+struct TInitializeOptions
+{
+ using TSelf = TInitializeOptions;
+
+ ///
+ /// @brief Override waiting functions for YT Wrapper.
+ ///
+ /// This option allows overriding the functions this library uses to wait for something.
+ /// Defaults to nullptr.
+ FLUENT_FIELD_DEFAULT(::TIntrusivePtr<IWaitProxy>, WaitProxy, nullptr);
+
+ ///
+ /// @brief Enable/disable cleanup when program execution terminates abnormally.
+ ///
+ /// When set to true, the library will abort all active transactions and running operations when the program
+ /// terminates on error or signal. Defaults to false.
+ FLUENT_FIELD_DEFAULT(bool, CleanupOnTermination, false);
+
+ ///
+ /// @brief Set callback to be called before exit() in job mode.
+ ///
+ /// The provided function will be called just before exit() when the program is started in job mode.
+ /// This might be useful for shutting down libraries that are used inside operations.
+ ///
+ /// NOTE: Keep in mind that the job execution environment differs from the client execution environment,
+ /// so JobOnExitFunction should not depend on argc/argv, environment variables, etc.
+ FLUENT_FIELD_OPTION(std::function<void()>, JobOnExitFunction);
+};
+
+///
+/// @brief Performs basic initialization (logging, termination handlers, etc).
+///
+/// This function never switches to job mode.
+///
+/// @param options Initialization options; default-constructed when omitted.
+void JoblessInitialize(const TInitializeOptions& options = TInitializeOptions());
+
+///
+/// @brief Performs basic initialization and switches to a job mode if required.
+///
+/// This function performs basic initialization (it sets up logging, reads the config, etc) and checks whether the binary
+/// is launched on a YT machine inside a job. If the latter is true, this function launches the proper job and calls exit()
+/// after the job is done.
+///
+/// This function must be called if the application starts any operation.
+/// This function must be called immediately after entering main() function before any argument parsing is done.
+void Initialize(int argc, const char **argv, const TInitializeOptions &options = TInitializeOptions());
+
+/// Similar to @ref NYT::Initialize(int, const char**, const TInitializeOptions&), but accepts a non-const `argv`.
+void Initialize(int argc, char **argv, const TInitializeOptions &options = TInitializeOptions());
+
+/// Similar to @ref NYT::Initialize(int, const char**, const TInitializeOptions&), but takes no `argc` / `argv`.
+void Initialize(const TInitializeOptions &options = TInitializeOptions());
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/io-inl.h b/yt/cpp/mapreduce/interface/io-inl.h
new file mode 100644
index 0000000000..c35ebb7481
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/io-inl.h
@@ -0,0 +1,1015 @@
+#pragma once
+
+#ifndef IO_INL_H_
+#error "Direct inclusion of this file is not allowed, use io.h"
+#endif
+#undef IO_INL_H_
+
+#include "finish_or_die.h"
+
+#include <util/generic/typetraits.h>
+#include <util/generic/yexception.h>
+#include <util/stream/length.h>
+
+#include <util/system/mutex.h>
+#include <util/system/spinlock.h>
+
+#include <library/cpp/yson/node/node_builder.h>
+
+#include <yt/cpp/mapreduce/interface/serialize.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace NDetail {
+
+/// Type trait: std::false_type for every T except instantiations of TProtoOneOf.
+template<class T>
+struct TIsProtoOneOf
+ : std::false_type
+{ };
+
+/// Specialization: std::true_type for TProtoOneOf with any list of row types.
+template <class ...TProtoRowTypes>
+struct TIsProtoOneOf<TProtoOneOf<TProtoRowTypes...>>
+ : std::true_type
+{ };
+
+/// Type trait: std::false_type for every T except instantiations of TSkiffRowOneOf.
+template <class T>
+struct TIsSkiffRowOneOf
+ : std::false_type
+{ };
+
+/// Specialization: std::true_type for TSkiffRowOneOf with any list of row types.
+template <class ...TSkiffRowTypes>
+struct TIsSkiffRowOneOf<TSkiffRowOneOf<TSkiffRowTypes...>>
+ : std::true_type
+{ };
+
+} // namespace NDetail
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Maps a row type to the reader/writer implementation interfaces used for it.
+///
+/// The second template parameter exists only so that specializations can be
+/// selected with std::enable_if_t. The primary template is left undefined.
+template <class T, class = void>
+struct TRowTraits;
+
+/// Traits for TNode rows.
+template <>
+struct TRowTraits<TNode>
+{
+ using TRowType = TNode;
+ using IReaderImpl = INodeReaderImpl;
+ using IWriterImpl = INodeWriterImpl;
+};
+
+/// Traits for TYaMRRow rows.
+template <>
+struct TRowTraits<TYaMRRow>
+{
+ using TRowType = TYaMRRow;
+ using IReaderImpl = IYaMRReaderImpl;
+ using IWriterImpl = IYaMRWriterImpl;
+};
+
+/// Traits for the generic protobuf Message type.
+template <>
+struct TRowTraits<Message>
+{
+ using TRowType = Message;
+ using IReaderImpl = IProtoReaderImpl;
+ using IWriterImpl = IProtoWriterImpl;
+};
+
+/// Traits for any type T for which TIsBaseOf<Message, T>::Value holds; uses the protobuf implementations.
+template <class T>
+struct TRowTraits<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>>
+{
+ using TRowType = T;
+ using IReaderImpl = IProtoReaderImpl;
+ using IWriterImpl = IProtoWriterImpl;
+};
+
+/// Traits for types satisfying TIsSkiffRow. Only a reader implementation is declared here.
+template <class T>
+struct TRowTraits<T, std::enable_if_t<TIsSkiffRow<T>::value>>
+{
+ using TRowType = T;
+ using IReaderImpl = ISkiffRowReaderImpl;
+};
+
+/// Traits for TSkiffRowOneOf. Only a reader implementation is declared here.
+template <class... TSkiffRowTypes>
+struct TRowTraits<TSkiffRowOneOf<TSkiffRowTypes...>>
+{
+ using TRowType = TSkiffRowOneOf<TSkiffRowTypes...>;
+ using IReaderImpl = ISkiffRowReaderImpl;
+};
+
+/// Traits for TProtoOneOf; uses the protobuf reader and writer implementations.
+template <class... TProtoRowTypes>
+struct TRowTraits<TProtoOneOf<TProtoRowTypes...>>
+{
+ using TRowType = TProtoOneOf<TProtoRowTypes...>;
+ using IReaderImpl = IProtoReaderImpl;
+ using IWriterImpl = IProtoWriterImpl;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Row-type-independent part of every reader implementation interface.
+///
+/// The first group of methods is pure virtual. The second group has default
+/// implementations in io.cpp: GetReadByteCount() returns Nothing(), the other
+/// three call Y_FAIL("Unimplemented").
+struct IReaderImplBase
+ : public TThrRefBase
+{
+ virtual bool IsValid() const = 0;
+ virtual void Next() = 0;
+ virtual ui32 GetTableIndex() const = 0;
+ virtual ui32 GetRangeIndex() const = 0;
+ virtual ui64 GetRowIndex() const = 0;
+ virtual void NextKey() = 0;
+
+ // Not pure virtual because of clients that have already implemented this interface.
+ virtual TMaybe<size_t> GetReadByteCount() const;
+ virtual i64 GetTabletIndex() const;
+ virtual bool IsEndOfStream() const;
+ virtual bool IsRawReaderExhausted() const;
+};
+
+/// Reader implementation interface for TNode rows.
+struct INodeReaderImpl
+ : public IReaderImplBase
+{
+ virtual const TNode& GetRow() const = 0;
+ virtual void MoveRow(TNode* row) = 0;
+};
+
+/// Reader implementation interface for TYaMRRow rows.
+struct IYaMRReaderImpl
+ : public IReaderImplBase
+{
+ virtual const TYaMRRow& GetRow() const = 0;
+ // Default implementation copy-assigns the current row into *row.
+ virtual void MoveRow(TYaMRRow* row)
+ {
+ *row = GetRow();
+ }
+};
+
+/// Reader implementation interface for protobuf rows; the row is read into a caller-provided message.
+struct IProtoReaderImpl
+ : public IReaderImplBase
+{
+ virtual void ReadRow(Message* row) = 0;
+};
+
+/// Reader implementation interface for skiff rows; the row is read through the supplied parser.
+struct ISkiffRowReaderImpl
+ : public IReaderImplBase
+{
+ virtual void ReadRow(const ISkiffRowParserPtr& parser) = 0;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace NDetail {
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Logs how many rows and bytes a table reader has read (defined in io.cpp).
+// We don't include <yt/cpp/mapreduce/interface/logging/yt_log.h> in this file
+// to avoid macro name clashes (specifically YT_LOG_DEBUG)
+void LogTableReaderStatistics(ui64 rowCount, TMaybe<size_t> byteCount);
+
+/// @brief Common base of the typed table readers.
+///
+/// Forwards positional queries to the underlying reader implementation and tracks,
+/// for the current row, whether it is untouched, cached by GetRow or already moved out.
+///
+/// @tparam T Row type; the row and implementation types are taken from TRowTraits<T>.
+template <class T>
+class TTableReaderBase
+    : public TThrRefBase
+{
+public:
+    using TRowType = typename TRowTraits<T>::TRowType;
+    using IReaderImpl = typename TRowTraits<T>::IReaderImpl;
+
+    /// Takes shared ownership of the reader implementation.
+    explicit TTableReaderBase(::TIntrusivePtr<IReaderImpl> reader)
+        : Reader_(std::move(reader))
+    { }
+
+    /// Logs the number of Next() calls and the byte count reported by the implementation.
+    ~TTableReaderBase() override
+    {
+        NDetail::LogTableReaderStatistics(ReadRowCount_, Reader_->GetReadByteCount());
+    }
+
+    bool IsValid() const
+    {
+        return Reader_->IsValid();
+    }
+
+    /// Advances the implementation and resets the per-row state.
+    void Next()
+    {
+        Reader_->Next();
+        ++ReadRowCount_;
+        RowState_ = ERowState::None;
+    }
+
+    // Const-qualified: this only forwards to a const method of the implementation.
+    bool IsEndOfStream() const
+    {
+        return Reader_->IsEndOfStream();
+    }
+
+    // Const-qualified: this only forwards to a const method of the implementation.
+    bool IsRawReaderExhausted() const
+    {
+        return Reader_->IsRawReaderExhausted();
+    }
+
+    ui32 GetTableIndex() const
+    {
+        return Reader_->GetTableIndex();
+    }
+
+    ui32 GetRangeIndex() const
+    {
+        return Reader_->GetRangeIndex();
+    }
+
+    ui64 GetRowIndex() const
+    {
+        return Reader_->GetRowIndex();
+    }
+
+    i64 GetTabletIndex() const
+    {
+        return Reader_->GetTabletIndex();
+    }
+
+protected:
+    /// Returns a reference to the current row, filling the cache on first access.
+    ///
+    /// @param cacher Called once per row (state None) to populate the cache.
+    /// @param cacheGetter Returns a pointer to the cached row.
+    /// Throws yexception if the row has already been moved out.
+    template <typename TCacher, typename TCacheGetter>
+    const auto& DoGetRowCached(TCacher cacher, TCacheGetter cacheGetter) const
+    {
+        switch (RowState_) {
+            case ERowState::None:
+                cacher();
+                RowState_ = ERowState::Cached;
+                break;
+            case ERowState::Cached:
+                break;
+            case ERowState::MovedOut:
+                ythrow yexception() << "Row is already moved";
+        }
+        return *cacheGetter();
+    }
+
+    /// Moves the current row into *result and marks it as moved out.
+    ///
+    /// @param mover Used when the row has not been cached yet.
+    /// @param cacheMover Used when the row has been cached by a previous GetRow call.
+    /// Throws yexception if the row has already been moved out.
+    template <typename U, typename TMover, typename TCacheMover>
+    void DoMoveRowCached(U* result, TMover mover, TCacheMover cacheMover)
+    {
+        Y_VERIFY(result);
+        switch (RowState_) {
+            case ERowState::None:
+                mover(result);
+                break;
+            case ERowState::Cached:
+                cacheMover(result);
+                break;
+            case ERowState::MovedOut:
+                ythrow yexception() << "Row is already moved";
+        }
+        RowState_ = ERowState::MovedOut;
+    }
+
+private:
+    enum class ERowState
+    {
+        None,
+        Cached,
+        MovedOut,
+    };
+
+protected:
+    ::TIntrusivePtr<IReaderImpl> Reader_;
+
+private:
+    // Incremented on every Next() call.
+    ui64 ReadRowCount_ = 0;
+    // Mutable because const GetRow accessors update it when they fill the cache.
+    mutable ERowState RowState_ = ERowState::None;
+};
+
+/// Reader for row types whose implementation interface exposes GetRow/MoveRow itself.
+/// The implementation owns the row cache; this class only layers the row-state checks on top.
+template <class T>
+class TSimpleTableReader
+    : public TTableReaderBase<T>
+{
+public:
+    using TBase = TTableReaderBase<T>;
+    using typename TBase::TRowType;
+
+    using TBase::TBase;
+
+    /// Returns the current row; throws if it has already been moved out.
+    const TRowType& GetRow() const
+    {
+        // Nothing to cache here: the underlying reader keeps the row.
+        const auto noopCacher = [] {};
+        const auto rowAddress = [this] {
+            return &Reader_->GetRow();
+        };
+        return TBase::DoGetRowCached(noopCacher, rowAddress);
+    }
+
+    /// Moves the current row into *result; throws if it has already been moved out.
+    void MoveRow(TRowType* result)
+    {
+        // The underlying reader keeps the row, so one action serves both the
+        // "not cached" and the "cached" state.
+        const auto moveFromImpl = [this] (TRowType* target) {
+            Reader_->MoveRow(target);
+        };
+        TBase::DoMoveRowCached(result, moveFromImpl, moveFromImpl);
+    }
+
+    /// Convenience overload returning the moved row by value.
+    TRowType MoveRow()
+    {
+        TRowType row;
+        MoveRow(&row);
+        return row;
+    }
+
+private:
+    using TBase::Reader_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NDetail
+
+/// Table reader for TNode rows; all functionality comes from TSimpleTableReader.
+template <>
+class TTableReader<TNode>
+ : public NDetail::TSimpleTableReader<TNode>
+{
+ using TSimpleTableReader<TNode>::TSimpleTableReader;
+};
+
+/// Table reader for TYaMRRow rows; all functionality comes from TSimpleTableReader.
+template <>
+class TTableReader<TYaMRRow>
+ : public NDetail::TSimpleTableReader<TYaMRRow>
+{
+ using TSimpleTableReader<TYaMRRow>::TSimpleTableReader;
+};
+
+/// Table reader for generic protobuf messages: the concrete message type is chosen per call.
+template <>
+class TTableReader<Message>
+    : public NDetail::TTableReaderBase<Message>
+{
+public:
+    using TBase = NDetail::TTableReaderBase<Message>;
+
+    using TBase::TBase;
+
+    /// Returns the current row as U, reading it into a freshly allocated U on first access.
+    /// Aborts via Y_VERIFY if the row cached for the current position is not a U.
+    template <class U>
+    const U& GetRow() const
+    {
+        static_assert(TIsBaseOf<Message, U>::Value);
+
+        const auto readIntoCache = [this] {
+            CachedRow_.Reset(new U);
+            Reader_->ReadRow(CachedRow_.Get());
+        };
+        const auto viewCacheAsU = [this] {
+            auto typedRow = dynamic_cast<const U*>(CachedRow_.Get());
+            Y_VERIFY(typedRow);
+            return typedRow;
+        };
+        return TBase::DoGetRowCached(readIntoCache, viewCacheAsU);
+    }
+
+    /// Moves the current row into *result: reads directly when nothing is cached,
+    /// otherwise swaps with the cached message (which must be a U).
+    template <class U>
+    void MoveRow(U* result)
+    {
+        static_assert(TIsBaseOf<Message, U>::Value);
+
+        const auto readDirectly = [this] (U* target) {
+            Reader_->ReadRow(target);
+        };
+        const auto swapWithCache = [this] (U* target) {
+            auto cachedTyped = dynamic_cast<U*>(CachedRow_.Get());
+            Y_VERIFY(cachedTyped);
+            target->Swap(cachedTyped);
+        };
+        TBase::DoMoveRowCached(result, readDirectly, swapWithCache);
+    }
+
+    /// Convenience overload returning the moved row by value.
+    template <class U>
+    U MoveRow()
+    {
+        static_assert(TIsBaseOf<Message, U>::Value);
+
+        U row;
+        MoveRow(&row);
+        return row;
+    }
+
+    /// Gives access to the underlying reader implementation.
+    ::TIntrusivePtr<IProtoReaderImpl> GetReaderImpl() const
+    {
+        return Reader_;
+    }
+
+private:
+    using TBase::Reader_;
+    // Mutable because the const GetRow fills it lazily.
+    mutable THolder<Message> CachedRow_;
+};
+
+/// Table reader for rows that may be any of several protobuf types.
+///
+/// One message of every possible type is kept in a tuple; CachedIndex_ remembers
+/// which tuple slot holds the row cached for the current position.
+template<class... TProtoRowTypes>
+class TTableReader<TProtoOneOf<TProtoRowTypes...>>
+    : public NDetail::TTableReaderBase<TProtoOneOf<TProtoRowTypes...>>
+{
+public:
+    using TBase = NDetail::TTableReaderBase<TProtoOneOf<TProtoRowTypes...>>;
+
+    using TBase::TBase;
+
+    /// Returns the current row as U, reading it into the U slot on first access.
+    /// Aborts via Y_VERIFY if the row was already cached as a different type.
+    template <class U>
+    const U& GetRow() const
+    {
+        AssertIsOneOf<U>();
+        return TBase::DoGetRowCached(
+            /* cacher */ [&] {
+                Reader_->ReadRow(&std::get<U>(CachedRows_));
+                CachedIndex_ = NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value;
+            },
+            /* cacheGetter */ [&] {
+                // Same guard as in MoveRow: without it a request for another type
+                // would silently return whatever the other tuple slot holds.
+                Y_VERIFY((NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value) == CachedIndex_);
+                return &std::get<U>(CachedRows_);
+            });
+    }
+
+    /// Moves the current row into *result: reads directly when nothing is cached,
+    /// otherwise moves out of the cached slot (which must be the U slot).
+    template <class U>
+    void MoveRow(U* result)
+    {
+        AssertIsOneOf<U>();
+        return TBase::DoMoveRowCached(
+            result,
+            /* mover */ [&] (U* result) {
+                Reader_->ReadRow(result);
+            },
+            /* cacheMover */ [&] (U* result) {
+                Y_VERIFY((NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value) == CachedIndex_);
+                *result = std::move(std::get<U>(CachedRows_));
+            });
+    }
+
+    /// Convenience overload returning the moved row by value.
+    template <class U>
+    U MoveRow()
+    {
+        U result;
+        MoveRow(&result);
+        return result;
+    }
+
+    /// Gives access to the underlying reader implementation.
+    ::TIntrusivePtr<IProtoReaderImpl> GetReaderImpl() const
+    {
+        return Reader_;
+    }
+
+private:
+    using TBase::Reader_;
+    // std::variant could also be used here, but std::tuple leads to better performance
+    // because of deallocations that std::variant has to do
+    mutable std::tuple<TProtoRowTypes...> CachedRows_;
+    // Index of the tuple slot holding the cached row; -1 until a row is cached.
+    mutable int CachedIndex_ = -1;
+
+    template <class U>
+    static constexpr void AssertIsOneOf()
+    {
+        static_assert(
+            (std::is_same<U, TProtoRowTypes>::value || ...),
+            "Template parameter must be one of TProtoOneOf template parameter");
+    }
+};
+
+/// Table reader for a single concrete protobuf type T.
+/// Implemented as the one-type case of the TProtoOneOf reader, with non-template accessors.
+template <class T>
+class TTableReader<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>>
+ : public TTableReader<TProtoOneOf<T>>
+{
+public:
+ using TRowType = T;
+ using TBase = TTableReader<TProtoOneOf<T>>;
+
+ using TBase::TBase;
+
+ // Forwards to TBase::GetRow<T>().
+ const T& GetRow() const
+ {
+ return TBase::template GetRow<T>();
+ }
+
+ // Forwards to TBase::MoveRow<T>(T*).
+ void MoveRow(T* result)
+ {
+ TBase::template MoveRow<T>(result);
+ }
+
+ // Forwards to TBase::MoveRow<T>().
+ T MoveRow()
+ {
+ return TBase::template MoveRow<T>();
+ }
+};
+
+/// Table reader for rows that may be any of several skiff row types.
+///
+/// One row object of every possible type is kept in a tuple, and one parser per type
+/// writes into the corresponding tuple slot.
+template<class... TSkiffRowTypes>
+class TTableReader<TSkiffRowOneOf<TSkiffRowTypes...>>
+    : public NDetail::TTableReaderBase<TSkiffRowOneOf<TSkiffRowTypes...>>
+{
+public:
+    using TBase = NDetail::TTableReaderBase<TSkiffRowOneOf<TSkiffRowTypes...>>;
+
+    // NOTE: a reader built through this inherited single-argument constructor has no
+    // parsers; reading a row from it is rejected by GetParser below.
+    using TBase::TBase;
+
+    /// Creates one parser per row type, each bound to its slot in CachedRows_.
+    explicit TTableReader(::TIntrusivePtr<typename TBase::IReaderImpl> reader, const TMaybe<TSkiffRowHints>& hints)
+        : TBase(reader)
+        , Parsers_({(CreateSkiffParser<TSkiffRowTypes>(&std::get<TSkiffRowTypes>(CachedRows_), hints))...})
+    { }
+
+    /// Returns the current row as U, parsing it into the U slot on first access.
+    /// Aborts via Y_VERIFY if the row was already cached as a different type.
+    template <class U>
+    const U& GetRow() const
+    {
+        AssertIsOneOf<U>();
+        return TBase::DoGetRowCached(
+            /* cacher */ [&] {
+                auto index = NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value;
+                Reader_->ReadRow(GetParser(index));
+                CachedIndex_ = index;
+            },
+            /* cacheGetter */ [&] {
+                // Same guard as in MoveRow: without it a request for another type
+                // would silently return whatever the other tuple slot holds.
+                Y_VERIFY((NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value) == CachedIndex_);
+                return &std::get<U>(CachedRows_);
+            });
+    }
+
+    /// Moves the current row into *result, parsing it first if it has not been cached.
+    template <class U>
+    void MoveRow(U* result)
+    {
+        AssertIsOneOf<U>();
+        return TBase::DoMoveRowCached(
+            result,
+            /* mover */ [&] (U* result) {
+                auto index = NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value;
+                Reader_->ReadRow(GetParser(index));
+                *result = std::move(std::get<U>(CachedRows_));
+            },
+            /* cacheMover */ [&] (U* result) {
+                Y_VERIFY((NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value) == CachedIndex_);
+                *result = std::move(std::get<U>(CachedRows_));
+            });
+    }
+
+    /// Convenience overload returning the moved row by value.
+    template <class U>
+    U MoveRow()
+    {
+        U result;
+        MoveRow(&result);
+        return result;
+    }
+
+    /// Gives access to the underlying reader implementation.
+    ::TIntrusivePtr<ISkiffRowReaderImpl> GetReaderImpl() const
+    {
+        return Reader_;
+    }
+
+private:
+    using TBase::Reader_;
+    // std::variant could also be used here, but std::tuple leads to better performance
+    // because of deallocations that std::variant has to do
+    mutable std::tuple<TSkiffRowTypes...> CachedRows_;
+    mutable std::vector<ISkiffRowParserPtr> Parsers_;
+    // Index of the tuple slot holding the cached row; -1 until a row is cached.
+    mutable int CachedIndex_ = -1;
+
+    /// Returns the parser for the given tuple slot.
+    /// Aborts instead of indexing out of bounds when the parsers were never created.
+    const ISkiffRowParserPtr& GetParser(size_t index) const
+    {
+        Y_VERIFY(index < Parsers_.size());
+        return Parsers_[index];
+    }
+
+    template <class U>
+    static constexpr void AssertIsOneOf()
+    {
+        static_assert(
+            (std::is_same<U, TSkiffRowTypes>::value || ...),
+            "Template parameter must be one of TSkiffRowOneOf template parameter");
+    }
+};
+
+/// Table reader for a single concrete skiff row type T.
+/// Implemented as the one-type case of the TSkiffRowOneOf reader, with non-template accessors.
+template <class T>
+class TTableReader<T, std::enable_if_t<TIsSkiffRow<T>::value>>
+    : public TTableReader<TSkiffRowOneOf<T>>
+{
+public:
+    using TRowType = T;
+    using TBase = TTableReader<TSkiffRowOneOf<T>>;
+
+    using TBase::TBase;
+
+    /// Forwards to TBase::GetRow<T>(). Const-qualified like the base method it
+    /// forwards to and like the protobuf counterpart of this class.
+    const T& GetRow() const
+    {
+        return TBase::template GetRow<T>();
+    }
+
+    /// Forwards to TBase::MoveRow<T>(T*).
+    void MoveRow(T* result)
+    {
+        TBase::template MoveRow<T>(result);
+    }
+
+    /// Forwards to TBase::MoveRow<T>().
+    T MoveRow()
+    {
+        return TBase::template MoveRow<T>();
+    }
+};
+
+/// Creates a TNode table reader on top of the reader returned by CreateNodeReader.
+template <>
+inline TTableReaderPtr<TNode> IIOClient::CreateTableReader<TNode>(
+ const TRichYPath& path, const TTableReaderOptions& options)
+{
+ return new TTableReader<TNode>(CreateNodeReader(path, options));
+}
+
+/// Creates a TYaMRRow table reader on top of the reader returned by CreateYaMRReader.
+template <>
+inline TTableReaderPtr<TYaMRRow> IIOClient::CreateTableReader<TYaMRRow>(
+ const TRichYPath& path, const TTableReaderOptions& options)
+{
+ return new TTableReader<TYaMRRow>(CreateYaMRReader(path, options));
+}
+
+/// Helper that wraps a protobuf reader implementation into TTableReader<T>.
+/// The defaulted second template argument is only well-formed when TIsBaseOf<Message, T>::Value holds.
+template <class T, class = std::enable_if_t<TIsBaseOf<Message, T>::Value>>
+struct TReaderCreator
+{
+ static TTableReaderPtr<T> Create(::TIntrusivePtr<IProtoReaderImpl> reader)
+ {
+ return new TTableReader<T>(reader);
+ }
+};
+
+/// Generic factory for protobuf and skiff row readers.
+/// Any other row type is rejected at compile time.
+template <class T>
+inline TTableReaderPtr<T> IIOClient::CreateTableReader(
+    const TRichYPath& path, const TTableReaderOptions& options)
+{
+    if constexpr (TIsBaseOf<Message, T>::Value) {
+        // A default-constructed message is handed to CreateProtoReader as the prototype.
+        TAutoPtr<T> sample(new T);
+        auto protoImpl = CreateProtoReader(path, options, sample.Get());
+        return new TTableReader<T>(protoImpl);
+    } else if constexpr (TIsSkiffRow<T>::value) {
+        // Skiff hints are optional: fall back to an empty value when no format hints are set.
+        const auto& skiffHints = options.FormatHints_ ? options.FormatHints_->SkiffRowHints_ : Nothing();
+        auto rowSchema = GetSkiffSchema<T>(skiffHints);
+        auto rowSkipper = CreateSkiffSkipper<T>(skiffHints);
+        auto skiffImpl = CreateSkiffRowReader(path, options, rowSkipper, rowSchema);
+        return new TTableReader<T>(skiffImpl, skiffHints);
+    } else {
+        static_assert(TDependentFalse<T>, "Unsupported type for table reader");
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Creates a protobuf table reader of type T over an input stream,
+/// passing T::descriptor() to NDetail::CreateProtoReader.
+template <typename T>
+TTableReaderPtr<T> CreateTableReader(
+ IInputStream* stream,
+ const TTableReaderOptions& options)
+{
+ return TReaderCreator<T>::Create(NDetail::CreateProtoReader(stream, options, T::descriptor()));
+}
+
+/// Creates a reader over an input stream with one descriptor per type in Ts.
+/// The reader's row type is the TProtoOneOf produced by NDetail::TProtoOneOfUnique<Ts...>.
+template <class... Ts>
+TTableReaderPtr<typename NDetail::TProtoOneOfUnique<Ts...>::TType> CreateProtoMultiTableReader(
+ IInputStream* stream,
+ const TTableReaderOptions& options)
+{
+ return new TTableReader<typename NDetail::TProtoOneOfUnique<Ts...>::TType>(
+ NDetail::CreateProtoReader(stream, options, {Ts::descriptor()...}));
+}
+
+/// Creates a reader over an input stream in which all tableCount tables share the row type T.
+template <class T>
+TTableReaderPtr<T> CreateProtoMultiTableReader(
+    IInputStream* stream,
+    int tableCount,
+    const TTableReaderOptions& options)
+{
+    static_assert(TIsBaseOf<::google::protobuf::Message, T>::Value);
+    // The same descriptor is repeated once per table.
+    TVector<const ::google::protobuf::Descriptor*> perTableDescriptors(tableCount, T::descriptor());
+    auto readerImpl = NDetail::CreateProtoReader(stream, options, std::move(perTableDescriptors));
+    return new TTableReader<T>(std::move(readerImpl));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Iterates over ranges of rows: GetRange() exposes a row reader for the current range
+/// and Next() switches the underlying implementation to the next key.
+template <class T>
+class TTableRangesReader<T>
+    : public TThrRefBase
+{
+public:
+    using TRowType = T;
+
+private:
+    using TReaderImpl = typename TRowTraits<TRowType>::IReaderImpl;
+
+public:
+    /// Wraps the implementation in a row reader and records whether it is initially valid.
+    TTableRangesReader(::TIntrusivePtr<TReaderImpl> readerImpl)
+        : ReaderImpl_(readerImpl)
+        , Reader_(MakeIntrusive<TTableReader<TRowType>>(readerImpl))
+        , IsValid_(Reader_->IsValid())
+    { }
+
+    /// Row reader for the current range.
+    TTableReader<T>& GetRange()
+    {
+        return *Reader_;
+    }
+
+    bool IsValid() const
+    {
+        return IsValid_;
+    }
+
+    /// Switches to the next key; if the row reader is still valid, advances it by one row.
+    void Next()
+    {
+        ReaderImpl_->NextKey();
+        IsValid_ = Reader_->IsValid();
+        if (!IsValid_) {
+            return;
+        }
+        Reader_->Next();
+    }
+
+private:
+    ::TIntrusivePtr<TReaderImpl> ReaderImpl_;
+    ::TIntrusivePtr<TTableReader<TRowType>> Reader_;
+    bool IsValid_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Writer implementation interface for rows of type T.
+///
+/// Only the two-argument AddRow overloads, GetTableCount and FinishTable are pure virtual;
+/// the weighted and batch overloads fall back to them by default.
+template <typename T>
+struct IWriterImplBase
+    : public TThrRefBase
+{
+    virtual void AddRow(const T& row, size_t tableIndex) = 0;
+
+    // Default: the row weight is ignored.
+    virtual void AddRow(const T& row, size_t tableIndex, size_t /*rowWeight*/)
+    {
+        AddRow(row, tableIndex);
+    }
+
+    virtual void AddRow(T&& row, size_t tableIndex) = 0;
+
+    // Default: the row weight is ignored.
+    virtual void AddRow(T&& row, size_t tableIndex, size_t /*rowWeight*/)
+    {
+        AddRow(std::move(row), tableIndex);
+    }
+
+    // Default: rows are added one by one, each with an equal (integer) share of the batch weight.
+    virtual void AddRowBatch(const TVector<T>& rowBatch, size_t tableIndex, size_t rowBatchWeight = 0)
+    {
+        if (rowBatch.empty()) {
+            return;
+        }
+        const size_t perRowWeight = rowBatchWeight / rowBatch.size();
+        for (const auto& row : rowBatch) {
+            AddRow(row, tableIndex, perRowWeight);
+        }
+    }
+
+    // Default: same as above, but every row is moved into AddRow.
+    virtual void AddRowBatch(TVector<T>&& rowBatch, size_t tableIndex, size_t rowBatchWeight = 0)
+    {
+        const auto rowCount = rowBatch.size();
+        if (rowCount == 0) {
+            return;
+        }
+        const size_t perRowWeight = rowBatchWeight / rowCount;
+        for (auto&& row : std::move(rowBatch)) {
+            AddRow(std::move(row), tableIndex, perRowWeight);
+        }
+    }
+
+    virtual size_t GetTableCount() const = 0;
+    virtual void FinishTable(size_t tableIndex) = 0;
+
+    // Default: nothing to abort.
+    virtual void Abort()
+    { }
+};
+
+/// Writer implementation interface for TNode rows.
+struct INodeWriterImpl
+ : public IWriterImplBase<TNode>
+{
+};
+
+/// Writer implementation interface for TYaMRRow rows.
+struct IYaMRWriterImpl
+ : public IWriterImplBase<TYaMRRow>
+{
+};
+
+/// Writer implementation interface for protobuf rows.
+struct IProtoWriterImpl
+ : public IWriterImplBase<Message>
+{
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Common base of the typed table writers.
+///
+/// Holds the writer implementation and one lock per output table; every write to a
+/// table is performed under that table's lock.
+template <class T>
+class TTableWriterBase
+    : public TThrRefBase
+{
+public:
+    using TRowType = T;
+    using IWriterImpl = typename TRowTraits<T>::IWriterImpl;
+
+    /// Creates one lock per table reported by the implementation.
+    explicit TTableWriterBase(::TIntrusivePtr<IWriterImpl> writer)
+        : Writer_(writer)
+        , Locks_(MakeAtomicShared<TVector<TAdaptiveLock>>(writer->GetTableCount()))
+    { }
+
+    /// Finishes the writer through FinishOrDie when this object holds the only reference to the locks.
+    ~TTableWriterBase() override
+    {
+        if (Locks_.RefCount() == 1) {
+            NDetail::FinishOrDie(this, "TTableWriterBase");
+        }
+    }
+
+    void Abort()
+    {
+        Writer_->Abort();
+    }
+
+    void AddRow(const T& row, size_t tableIndex = 0, size_t rowWeight = 0)
+    {
+        DoAddRow<T>(row, tableIndex, rowWeight);
+    }
+
+    void AddRow(T&& row, size_t tableIndex = 0, size_t rowWeight = 0)
+    {
+        DoAddRow<T>(std::move(row), tableIndex, rowWeight);
+    }
+
+    void AddRowBatch(const TVector<T>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0)
+    {
+        DoAddRowBatch<T>(rowBatch, tableIndex, rowBatchWeight);
+    }
+
+    void AddRowBatch(TVector<T>&& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0)
+    {
+        DoAddRowBatch<T>(std::move(rowBatch), tableIndex, rowBatchWeight);
+    }
+
+    /// Finishes every table in index order, each under its own lock.
+    void Finish()
+    {
+        const size_t tableCount = Locks_->size();
+        for (size_t tableIndex = 0; tableIndex < tableCount; ++tableIndex) {
+            auto guard = Guard((*Locks_)[tableIndex]);
+            Writer_->FinishTable(tableIndex);
+        }
+    }
+
+protected:
+    template <class U>
+    void DoAddRow(const U& row, size_t tableIndex = 0, size_t rowWeight = 0)
+    {
+        EnsureTableIndexInRange(tableIndex);
+        auto guard = Guard((*Locks_)[tableIndex]);
+        Writer_->AddRow(row, tableIndex, rowWeight);
+    }
+
+    template <class U>
+    void DoAddRow(U&& row, size_t tableIndex = 0, size_t rowWeight = 0)
+    {
+        EnsureTableIndexInRange(tableIndex);
+        auto guard = Guard((*Locks_)[tableIndex]);
+        Writer_->AddRow(std::move(row), tableIndex, rowWeight);
+    }
+
+    template <class U>
+    void DoAddRowBatch(const TVector<U>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0)
+    {
+        EnsureTableIndexInRange(tableIndex);
+        auto guard = Guard((*Locks_)[tableIndex]);
+        Writer_->AddRowBatch(rowBatch, tableIndex, rowBatchWeight);
+    }
+
+    template <class U>
+    void DoAddRowBatch(TVector<U>&& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0)
+    {
+        EnsureTableIndexInRange(tableIndex);
+        auto guard = Guard((*Locks_)[tableIndex]);
+        Writer_->AddRowBatch(std::move(rowBatch), tableIndex, rowBatchWeight);
+    }
+
+    ::TIntrusivePtr<IWriterImpl> GetWriterImpl()
+    {
+        return Writer_;
+    }
+
+private:
+    /// Throws TIOException unless tableIndex addresses one of the output tables.
+    void EnsureTableIndexInRange(size_t tableIndex) const
+    {
+        if (tableIndex < Locks_->size()) {
+            return;
+        }
+        ythrow TIOException() <<
+            "Table index " << tableIndex <<
+            " is out of range [0, " << Locks_->size() << ")";
+    }
+
+private:
+    ::TIntrusivePtr<IWriterImpl> Writer_;
+    TAtomicSharedPtr<TVector<TAdaptiveLock>> Locks_;
+};
+
+/// Table writer for TNode rows; all functionality comes from TTableWriterBase.
+template <>
+class TTableWriter<TNode>
+ : public TTableWriterBase<TNode>
+{
+public:
+ using TBase = TTableWriterBase<TNode>;
+
+ explicit TTableWriter(::TIntrusivePtr<IWriterImpl> writer)
+ : TBase(writer)
+ { }
+};
+
+/// Table writer for TYaMRRow rows; all functionality comes from TTableWriterBase.
+template <>
+class TTableWriter<TYaMRRow>
+ : public TTableWriterBase<TYaMRRow>
+{
+public:
+ using TBase = TTableWriterBase<TYaMRRow>;
+
+ explicit TTableWriter(::TIntrusivePtr<IWriterImpl> writer)
+ : TBase(writer)
+ { }
+};
+
+/// Table writer for generic protobuf messages: accepts any type derived from Message.
+template <>
+class TTableWriter<Message>
+    : public TTableWriterBase<Message>
+{
+public:
+    using TBase = TTableWriterBase<Message>;
+
+    explicit TTableWriter(::TIntrusivePtr<IWriterImpl> writer)
+        : TBase(writer)
+    { }
+
+    /// Writes one row through the base-class AddRow.
+    template <class U, std::enable_if_t<std::is_base_of<Message, U>::value>* = nullptr>
+    void AddRow(const U& row, size_t tableIndex = 0, size_t rowWeight = 0)
+    {
+        TBase::AddRow(row, tableIndex, rowWeight);
+    }
+
+    /// Writes the rows one by one, each with an equal (integer) share of the batch weight.
+    template <class U, std::enable_if_t<std::is_base_of<Message, U>::value>* = nullptr>
+    void AddRowBatch(const TVector<U>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0)
+    {
+        if (rowBatch.empty()) {
+            return;
+        }
+        const size_t perRowWeight = rowBatchWeight / rowBatch.size();
+        for (const auto& row : rowBatch) {
+            AddRow(row, tableIndex, perRowWeight);
+        }
+    }
+};
+
+/// Table writer for a single concrete protobuf type T.
+/// Forwards to the generic TTableWriter<Message> with U fixed to T.
+template <class T>
+class TTableWriter<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>>
+ : public TTableWriter<Message>
+{
+public:
+ using TRowType = T;
+ using TBase = TTableWriter<Message>;
+
+ explicit TTableWriter(::TIntrusivePtr<IWriterImpl> writer)
+ : TBase(writer)
+ { }
+
+ // Forwards to TBase::AddRow<T>.
+ void AddRow(const T& row, size_t tableIndex = 0, size_t rowWeight = 0)
+ {
+ TBase::AddRow<T>(row, tableIndex, rowWeight);
+ }
+
+ // Forwards to TBase::AddRowBatch<T>.
+ void AddRowBatch(const TVector<T>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0)
+ {
+ TBase::AddRowBatch<T>(rowBatch, tableIndex, rowBatchWeight);
+ }
+};
+
+/// Creates a TNode table writer on top of the writer returned by CreateNodeWriter.
+template <>
+inline TTableWriterPtr<TNode> IIOClient::CreateTableWriter<TNode>(
+ const TRichYPath& path, const TTableWriterOptions& options)
+{
+ return new TTableWriter<TNode>(CreateNodeWriter(path, options));
+}
+
+/// Creates a TYaMRRow table writer on top of the writer returned by CreateYaMRWriter.
+template <>
+inline TTableWriterPtr<TYaMRRow> IIOClient::CreateTableWriter<TYaMRRow>(
+ const TRichYPath& path, const TTableWriterOptions& options)
+{
+ return new TTableWriter<TYaMRRow>(CreateYaMRWriter(path, options));
+}
+
+/// Generic writer factory: supports protobuf row types only.
+/// Any other row type is rejected at compile time.
+template <class T>
+inline TTableWriterPtr<T> IIOClient::CreateTableWriter(
+ const TRichYPath& path, const TTableWriterOptions& options)
+{
+ if constexpr (TIsBaseOf<Message, T>::Value) {
+ // A default-constructed message is handed to CreateProtoWriter as the prototype.
+ TAutoPtr<T> prototype(new T);
+ return new TTableWriter<T>(CreateProtoWriter(path, options, prototype.Get()));
+ } else {
+ static_assert(TDependentFalse<T>, "Unsupported type for table writer");
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Creates a TTableReader<T> that shares the reader implementation of a generic Message reader.
+/// Fails via Y_ENSURE when reader is null.
+template <typename T>
+TTableReaderPtr<T> CreateConcreteProtobufReader(TTableReader<Message>* reader)
+{
+ static_assert(std::is_base_of_v<Message, T>, "T must be a protobuf type (either Message or its descendant)");
+ Y_ENSURE(reader, "reader must be non-null");
+ return ::MakeIntrusive<TTableReader<T>>(reader->GetReaderImpl());
+}
+
+/// Smart-pointer overload: checks the pointer and forwards to the raw-pointer overload.
+template <typename T>
+TTableReaderPtr<T> CreateConcreteProtobufReader(const TTableReaderPtr<Message>& reader)
+{
+ Y_ENSURE(reader, "reader must be non-null");
+ return CreateConcreteProtobufReader<T>(reader.Get());
+}
+
+/// Creates a generic TTableReader<Message> that shares the reader implementation of a typed reader.
+/// Fails via Y_ENSURE when reader is null.
+template <typename T>
+TTableReaderPtr<Message> CreateGenericProtobufReader(TTableReader<T>* reader)
+{
+ static_assert(std::is_base_of_v<Message, T>, "T must be a protobuf type (either Message or its descendant)");
+ Y_ENSURE(reader, "reader must be non-null");
+ return ::MakeIntrusive<TTableReader<Message>>(reader->GetReaderImpl());
+}
+
+/// Smart-pointer overload: checks the pointer and forwards to the raw-pointer overload.
+template <typename T>
+TTableReaderPtr<Message> CreateGenericProtobufReader(const TTableReaderPtr<T>& reader)
+{
+ Y_ENSURE(reader, "reader must be non-null");
+ return CreateGenericProtobufReader(reader.Get());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/io.cpp b/yt/cpp/mapreduce/interface/io.cpp
new file mode 100644
index 0000000000..f97629721a
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/io.cpp
@@ -0,0 +1,47 @@
+#include "io.h"
+
+#include <yt/cpp/mapreduce/interface/logging/yt_log.h>
+
+#include <util/string/cast.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Default: the byte count is unknown.
+TMaybe<size_t> IReaderImplBase::GetReadByteCount() const
+{
+ return Nothing();
+}
+
+// Default: not supported; fails via Y_FAIL unless overridden.
+i64 IReaderImplBase::GetTabletIndex() const
+{
+ Y_FAIL("Unimplemented");
+}
+
+// Default: not supported; fails via Y_FAIL unless overridden.
+bool IReaderImplBase::IsEndOfStream() const
+{
+ Y_FAIL("Unimplemented");
+}
+
+// Default: not supported; fails via Y_FAIL unless overridden.
+bool IReaderImplBase::IsRawReaderExhausted() const
+{
+ Y_FAIL("Unimplemented");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace NDetail {
+
+// Emits a debug log line with the row count and, when known, the byte count.
+void LogTableReaderStatistics(ui64 rowCount, TMaybe<size_t> byteCount)
+{
+    TString byteCountStr;
+    if (byteCount) {
+        byteCountStr = ::ToString(*byteCount);
+    } else {
+        byteCountStr = "<unknown>";
+    }
+    YT_LOG_DEBUG("Table reader has read %v rows, %v bytes", rowCount, byteCountStr);
+}
+
+} // namespace NDetail
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/io.h b/yt/cpp/mapreduce/interface/io.h
new file mode 100644
index 0000000000..e2b20a1802
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/io.h
@@ -0,0 +1,586 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/io.h
+///
+/// Header containing client interface for reading and writing tables and files.
+
+
+#include "fwd.h"
+
+#include "client_method_options.h"
+#include "common.h"
+#include "format.h"
+#include "node.h"
+#include "mpl.h"
+#include "skiff_row.h"
+
+#include <google/protobuf/message.h>
+
+#include <util/stream/input.h>
+#include <util/stream/output.h>
+#include <util/generic/yexception.h>
+#include <util/generic/maybe.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief "Marker" type to use for several protobuf types in @ref NYT::TTableReader.
+///
+/// @tparam TProtoRowTypes Possible types of rows to be read; each one must satisfy
+/// `TIsBaseOf<::google::protobuf::Message, T>` (checked by the `static_assert` below).
+///
+/// The type is never instantiated as an object: its default constructor is deleted.
+template<class... TProtoRowTypes>
+class TProtoOneOf
+{
+public:
+ static_assert(
+ (TIsBaseOf<::google::protobuf::Message, TProtoRowTypes>::Value && ...),
+ "Template parameters can only be protobuf types");
+
+ TProtoOneOf() = delete;
+};
+
+///
+/// @brief "Marker" type to use for several skiff row types in @ref NYT::TTableReader.
+///
+/// @tparam TSkiffRowTypes Possible types of rows to be read; each one must satisfy
+/// `TIsSkiffRow<T>` (checked by the `static_assert` below).
+///
+/// The type is never instantiated as an object: its default constructor is deleted.
+template<class... TSkiffRowTypes>
+class TSkiffRowOneOf
+{
+public:
+ static_assert(
+ (TIsSkiffRow<TSkiffRowTypes>::value && ...),
+ "Template parameters can only be SkiffRow types");
+
+ TSkiffRowOneOf() = delete;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @cond Doxygen_Suppress
+namespace NDetail {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Maps `std::tuple<Ts...>` to `TProtoOneOf<Ts...>`.
+///
+/// Only the `std::tuple` specialization is defined; the primary template is left incomplete.
+template <class TTuple>
+struct TProtoOneOfFromTuple;
+
+template <class... Ts>
+struct TProtoOneOfFromTuple<std::tuple<Ts...>>
+{
+ /// The resulting `TProtoOneOf` instantiation.
+ using TType = TProtoOneOf<Ts...>;
+};
+
+/// Builds a `TProtoOneOf` from the type list `Ts...` after passing it through `TUniqueTypes`.
+///
+/// NOTE(review): `TUniqueTypes` is defined outside this file; presumably it removes
+/// duplicate types from the list — confirm against its definition.
+template <class... Ts>
+struct TProtoOneOfUnique
+{
+ /// Type list produced by `TUniqueTypes`, as a `std::tuple`.
+ using TTuple = typename TUniqueTypes<std::tuple<>, std::tuple<Ts...>>::TType;
+ /// `TProtoOneOf` instantiated with the types of `TTuple`.
+ using TType = typename TProtoOneOfFromTuple<TTuple>::TType;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NDetail
+/// @endcond
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Forward declarations of the reader / writer implementation interfaces.
+// This header only needs them as pointee types (e.g. in the private factory
+// methods of `IIOClient`).
+struct INodeReaderImpl;
+struct IYaMRReaderImpl;
+struct IProtoReaderImpl;
+struct ISkiffRowReaderImpl;
+struct INodeWriterImpl;
+struct IYaMRWriterImpl;
+struct IProtoWriterImpl;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Class of exceptions connected to reading or writing tables or files.
+///
+/// Derives from `yexception` and adds no members of its own.
+class TIOException
+ : public yexception
+{ };
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// Interface representing YT file reader.
+///
+/// A ref-counted (`TThrRefBase`) input stream (`IInputStream`); adds no methods of its own.
+class IFileReader
+ : public TThrRefBase
+ , public IInputStream
+{ };
+
+/// Interface representing YT file writer.
+///
+/// A ref-counted (`TThrRefBase`) output stream (`IOutputStream`); adds no methods of its own.
+class IFileWriter
+ : public TThrRefBase
+ , public IOutputStream
+{ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Low-level interface to read YT table with retries.
+///
+/// A ref-counted input stream extended with retry control.
+class TRawTableReader
+ : public TThrRefBase
+ , public IInputStream
+{
+public:
+ /// @brief Retry table read starting from the specified `rangeIndex` and `rowIndex`.
+ ///
+ /// @param rangeIndex Index of first range to read
+ /// @param rowIndex Index of first row to read; if `rowIndex == Nothing` entire request will be retried.
+ ///
+ /// @return `true` on successful request retry, `false` if no retry attempts are left (then `Retry()` shouldn't be called any more).
+ ///
+ /// `rowIndex` must be inside the range with index `rangeIndex` if the latter is specified.
+ ///
+ /// After successful retry the user should reset `rangeIndex` / `rowIndex` values and read new ones
+ /// from the stream.
+ virtual bool Retry(
+ const TMaybe<ui32>& rangeIndex,
+ const TMaybe<ui64>& rowIndex) = 0;
+
+ /// Resets retry attempt count to the initial value (then `Retry()` can be called again).
+ virtual void ResetRetries() = 0;
+
+ /// @brief May the input stream contain table ranges?
+ ///
+ /// When it returns `true` the `TRawTableReader` user is responsible
+ /// for tracking the active range index in order to pass it to `Retry()`.
+ virtual bool HasRangeIndices() const = 0;
+};
+
+/// @brief Low-level interface to write YT table.
+///
+/// Retries must be handled by implementation.
+class TRawTableWriter
+ : public TThrRefBase
+ , public IOutputStream
+{
+public:
+ /// @brief Call this method after complete row representation is written to the stream.
+ ///
+ /// When this method is called `TRawTableWriter` can check its buffer
+ /// and if it is full send data to YT.
+ /// @note `TRawTableWriter` never sends partial records to YT (due to retries).
+ virtual void NotifyRowEnd() = 0;
+
+ /// @brief Try to abort writing process as soon as possible (makes sense for multi-threaded writers).
+ ///
+ /// By default it does nothing, but implementations are welcome to override this method.
+ virtual void Abort()
+ { }
+};
+
+/// @brief Interface to deal with multiple raw output streams.
+///
+/// Streams are addressed by table index.
+class IProxyOutput
+{
+public:
+ virtual ~IProxyOutput()
+ { }
+
+ /// Get the number of managed streams.
+ virtual size_t GetStreamCount() const = 0;
+
+ /// Get stream corresponding to the specified table index.
+ virtual IOutputStream* GetStream(size_t tableIndex) const = 0;
+
+ /// This handler must be called right after the next row has been written
+ /// to the stream with index `tableIndex`.
+ virtual void OnRowFinished(size_t tableIndex) = 0;
+
+ /// @brief Try to abort writing process as soon as possible (makes sense for multi-threaded writers).
+ ///
+ /// By default it does nothing, but implementations are welcome to override this method.
+ virtual void Abort()
+ { }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Class template to read typed rows from YT tables.
+///
+/// @tparam T Row type.
+///
+/// Correct usage of this class usually looks like
+/// ```
+/// for (const auto& cursor : *reader) {
+/// const auto& row = cursor.GetRow();
+/// ...
+/// }
+/// ```
+/// or, more verbosely,
+/// ```
+/// for (; reader->IsValid(); reader->Next()) {
+/// const auto& row = reader->GetRow();
+/// ...
+/// }
+/// ```
+///
+/// @note Actual (partial) specializations of this template may look a bit different,
+/// e.g. @ref NYT::TTableReader::GetRow, @ref NYT::TTableReader::MoveRow may be method templates.
+template <class T, class>
+class TTableReader
+ : public TThrRefBase
+{
+public:
+ /// Get current row.
+ const T& GetRow() const;
+
+ /// Extract current row; further calls to `GetRow` and `MoveRow` will fail.
+ T MoveRow();
+
+ /// Extract current row to `result`; further calls to `GetRow` and `MoveRow` will fail.
+ void MoveRow(T* result);
+
+ /// Check whether the reader currently points to a row
+ /// (per the usage loop above, it stops being valid once all the rows were read).
+ bool IsValid() const;
+
+ /// Move the cursor to the next row.
+ void Next();
+
+ /// Get table index of the current row.
+ ui32 GetTableIndex() const;
+
+ /// Get range index of the current row (zero if it is unknown or read request contains no ranges)
+ ui32 GetRangeIndex() const;
+
+ /// Get current row index (zero if it is unknown).
+ ui64 GetRowIndex() const;
+
+ /// Get current tablet index (for ordered dynamic tables).
+ i64 GetTabletIndex() const;
+
+ /// Returns `true` if job consumed all the input and `false` otherwise.
+ bool IsEndOfStream() const;
+
+ /// Returns `true` if job raw input stream was closed and `false` otherwise.
+ bool IsRawReaderExhausted() const;
+};
+
+/// @brief Iterator for use in range-based-for.
+///
+/// The iterator is a thin wrapper around a non-owning reader pointer:
+/// dereferencing yields the reader itself (acting as a cursor), incrementing
+/// advances the reader. The end iterator holds `nullptr`.
+///
+/// @note Idiomatic usage:
+/// ```
+/// for (const auto& cursor : *reader) {
+///     const auto& row = cursor.GetRow();
+///     ...
+/// }
+/// ```
+template <class T>
+class TTableReaderIterator
+{
+public:
+    /// Construct iterator from table reader (can be `nullptr`).
+    ///
+    /// A null reader or a reader that is not valid produces the end iterator.
+    // NB: the constructor is declared with the injected class name; the
+    // `TTableReaderIterator<T>(...)` spelling is ill-formed since C++20.
+    explicit TTableReaderIterator(TTableReader<T>* reader)
+        : Reader_(reader && reader->IsValid() ? reader : nullptr)
+    { }
+
+    /// Equality operator (compares underlying reader pointers only).
+    bool operator==(const TTableReaderIterator& it) const
+    {
+        return Reader_ == it.Reader_;
+    }
+
+    /// Inequality operator (compares underlying reader pointers only).
+    bool operator!=(const TTableReaderIterator& it) const
+    {
+        return Reader_ != it.Reader_;
+    }
+
+    /// Dereference operator; must not be applied to the end iterator.
+    TTableReader<T>& operator*()
+    {
+        return *Reader_;
+    }
+
+    /// Const dereference operator; must not be applied to the end iterator.
+    const TTableReader<T>& operator*() const
+    {
+        return *Reader_;
+    }
+
+    /// Preincrement operator: advances the reader and turns into the end
+    /// iterator once the reader is no longer valid.
+    TTableReaderIterator& operator++()
+    {
+        Reader_->Next();
+        if (!Reader_->IsValid()) {
+            Reader_ = nullptr;
+        }
+        return *this;
+    }
+
+private:
+    // Non-owning; `nullptr` denotes the end iterator.
+    TTableReader<T>* Reader_;
+};
+
+/// @brief Range-based-for support for @ref NYT::TTableReader: iterator at the reader's current position.
+///
+/// @see @ref NYT::TTableReaderIterator
+template <class T>
+TTableReaderIterator<T> begin(TTableReader<T>& reader)
+{
+    TTableReaderIterator<T> first(&reader);
+    return first;
+}
+
+/// @brief Range-based-for support for @ref NYT::TTableReader: the end iterator (wraps `nullptr`).
+///
+/// @see @ref NYT::TTableReaderIterator
+template <class T>
+TTableReaderIterator<T> end(TTableReader<T>&)
+{
+    TTableReaderIterator<T> sentinel(nullptr);
+    return sentinel;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Class to facilitate reading table rows sorted by key.
+///
+/// Each reader returned from @ref NYT::TTableRangesReader::GetRange represents
+/// a range of rows with the same key.
+///
+/// @note Idiomatic usage:
+/// ```
+/// for (; reader->IsValid(); reader->Next()) {
+/// auto& rangeReader = reader->GetRange();
+/// ...
+/// }
+/// ```
+template <class T, class>
+class TTableRangesReader
+ : public TThrRefBase
+{
+public:
+ /// Get reader for rows with the same key.
+ TTableReader<T>& GetRange();
+
+ /// Check whether the reader currently points to a range
+ /// (per the usage loop above, it stops being valid once all rows are read).
+ bool IsValid() const;
+
+ /// Move cursor to the next range.
+ void Next();
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Class template to write typed rows to YT tables.
+///
+/// @tparam T Row type.
+template <class T, class>
+class TTableWriter
+ : public TThrRefBase
+{
+public:
+ /// @brief Submit a row for writing.
+ ///
+ /// The row may (and very probably will) *not* be written immediately.
+ void AddRow(const T& row);
+
+ /// Stop writing data as soon as possible (without flushing data, e.g. before aborting parent transaction).
+ ///
+ /// NOTE(review): this wording reads like a description of an abort operation;
+ /// confirm whether `Finish()` is really expected to skip flushing.
+ void Finish();
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Type representing YaMR table row.
+///
+/// All three columns are stored as `TStringBuf`.
+/// NOTE(review): `TStringBuf` is presumably a non-owning view, so the referenced
+/// data would have to outlive the row — confirm.
+///
+/// @deprecated
+struct TYaMRRow
+{
+ /// Key column.
+ TStringBuf Key;
+
+ /// Subkey column.
+ TStringBuf SubKey;
+
+ /// Value column.
+ TStringBuf Value;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Interface for creating table and file readers and writers.
+class IIOClient
+{
+public:
+ virtual ~IIOClient() = default;
+
+ /// Create a reader for file at `path`.
+ virtual IFileReaderPtr CreateFileReader(
+ const TRichYPath& path,
+ const TFileReaderOptions& options = TFileReaderOptions()) = 0;
+
+ /// Create a writer for file at `path`.
+ virtual IFileWriterPtr CreateFileWriter(
+ const TRichYPath& path,
+ const TFileWriterOptions& options = TFileWriterOptions()) = 0;
+
+ /// Create a typed reader for table at `path`.
+ ///
+ /// @tparam T Row type.
+ template <class T>
+ TTableReaderPtr<T> CreateTableReader(
+ const TRichYPath& path,
+ const TTableReaderOptions& options = TTableReaderOptions());
+
+ /// Create a typed writer for table at `path`.
+ ///
+ /// @tparam T Row type.
+ template <class T>
+ TTableWriterPtr<T> CreateTableWriter(
+ const TRichYPath& path,
+ const TTableWriterOptions& options = TTableWriterOptions());
+
+ /// Create a writer to write protobuf messages with specified descriptor.
+ virtual TTableWriterPtr<::google::protobuf::Message> CreateTableWriter(
+ const TRichYPath& path,
+ const ::google::protobuf::Descriptor& descriptor,
+ const TTableWriterOptions& options = TTableWriterOptions()) = 0;
+
+ /// Create a reader to read a table using specified format.
+ virtual TRawTableReaderPtr CreateRawReader(
+ const TRichYPath& path,
+ const TFormat& format,
+ const TTableReaderOptions& options = TTableReaderOptions()) = 0;
+
+ /// Create a writer to write a table using specified format.
+ virtual TRawTableWriterPtr CreateRawWriter(
+ const TRichYPath& path,
+ const TFormat& format,
+ const TTableWriterOptions& options = TTableWriterOptions()) = 0;
+
+ ///
+ /// @brief Create a reader for [blob table](https://docs.yandex-team.ru/docs/yt/description/storage/blobtables) at `path`.
+ ///
+ /// @param path Blob table path.
+ /// @param blobId Key identifying the blob.
+ /// @param options Optional parameters
+ ///
+ /// Blob table is a table that stores a number of blobs.
+ /// Blobs are sliced into parts of the same size (except possibly the last part).
+ /// Those parts are stored in separate rows.
+ ///
+ /// A blob table has constraints on its schema.
+ /// - There must be columns that identify the blob (blob id columns). Those columns might be of any type.
+ /// - There must be a column of `int64` type that identifies the part inside the blob (this column is called `part index`).
+ /// - There must be a column of `string` type that stores actual data (this column is called `data column`).
+ virtual IFileReaderPtr CreateBlobTableReader(
+ const TYPath& path,
+ const TKey& blobId,
+ const TBlobTableReaderOptions& options = TBlobTableReaderOptions()) = 0;
+
+private:
+ // Private factory hooks, one per row representation (node, YaMR, protobuf, skiff).
+ // NOTE(review): presumably these back the typed `CreateTableReader` /
+ // `CreateTableWriter` templates above — confirm in the inline implementation.
+ virtual ::TIntrusivePtr<INodeReaderImpl> CreateNodeReader(
+ const TRichYPath& path, const TTableReaderOptions& options) = 0;
+
+ virtual ::TIntrusivePtr<IYaMRReaderImpl> CreateYaMRReader(
+ const TRichYPath& path, const TTableReaderOptions& options) = 0;
+
+ virtual ::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader(
+ const TRichYPath& path,
+ const TTableReaderOptions& options,
+ const ::google::protobuf::Message* prototype) = 0;
+
+ virtual ::TIntrusivePtr<ISkiffRowReaderImpl> CreateSkiffRowReader(
+ const TRichYPath& path,
+ const TTableReaderOptions& options,
+ const ISkiffRowSkipperPtr& skipper,
+ const NSkiff::TSkiffSchemaPtr& schema) = 0;
+
+ virtual ::TIntrusivePtr<INodeWriterImpl> CreateNodeWriter(
+ const TRichYPath& path, const TTableWriterOptions& options) = 0;
+
+ virtual ::TIntrusivePtr<IYaMRWriterImpl> CreateYaMRWriter(
+ const TRichYPath& path, const TTableWriterOptions& options) = 0;
+
+ virtual ::TIntrusivePtr<IProtoWriterImpl> CreateProtoWriter(
+ const TRichYPath& path,
+ const TTableWriterOptions& options,
+ const ::google::protobuf::Message* prototype) = 0;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Create a protobuf table reader from a stream.
+///
+/// @tparam T Protobuf message type to read (must be inherited from `Message`).
+///
+/// @param stream Input stream in YT protobuf format.
+/// @param options Reader options (default-constructed when omitted).
+template <typename T>
+TTableReaderPtr<T> CreateTableReader(
+ IInputStream* stream,
+ const TTableReaderOptions& options = {});
+
+///
+/// @brief Create a protobuf multi table reader from a stream.
+///
+/// @tparam Ts Protobuf message types to read (must be inherited from `Message`).
+///
+/// @param stream Input stream in YT protobuf format.
+/// @param options Reader options (default-constructed when omitted).
+///
+/// @return Reader whose row type is the `TProtoOneOf` built by `NDetail::TProtoOneOfUnique<Ts...>`.
+template <class... Ts>
+TTableReaderPtr<typename NDetail::TProtoOneOfUnique<Ts...>::TType> CreateProtoMultiTableReader(
+ IInputStream* stream,
+ const TTableReaderOptions& options = {});
+
+///
+/// @brief Create a homogeneous protobuf multi table reader from a stream.
+///
+/// @tparam T Protobuf message type to read (must be inherited from `Message`).
+///
+/// @param stream Input stream in YT protobuf format.
+/// @param tableCount Number of tables in input stream.
+/// @param options Reader options (default-constructed when omitted).
+template <class T>
+TTableReaderPtr<T> CreateProtoMultiTableReader(
+ IInputStream* stream,
+ int tableCount,
+ const TTableReaderOptions& options = {});
+
+/// Create a @ref NYT::TNode table reader from a stream.
+///
+/// Explicit specialization of the stream-based `CreateTableReader`; only declared in this header.
+template <>
+TTableReaderPtr<TNode> CreateTableReader<TNode>(
+ IInputStream* stream, const TTableReaderOptions& options);
+
+/// Create a @ref NYT::TYaMRRow table reader from a stream.
+///
+/// Explicit specialization of the stream-based `CreateTableReader`; only declared in this header.
+template <>
+TTableReaderPtr<TYaMRRow> CreateTableReader<TYaMRRow>(
+ IInputStream* stream, const TTableReaderOptions& options);
+
+namespace NDetail {
+
+/// Create a protobuf table reader from a stream.
+///
+/// @param stream Input stream to read from.
+/// @param options Reader options.
+/// @param descriptor Descriptor of the protobuf message type of the rows.
+::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader(
+ IInputStream* stream,
+ const TTableReaderOptions& options,
+ const ::google::protobuf::Descriptor* descriptor);
+
+
+/// Create a protobuf table reader from a stream that can contain table switches.
+///
+/// @param stream Input stream to read from.
+/// @param options Reader options.
+/// @param descriptors Descriptors of the protobuf message types
+/// (NOTE(review): presumably one per input table, in table-index order — confirm).
+::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader(
+ IInputStream* stream,
+ const TTableReaderOptions& options,
+ TVector<const ::google::protobuf::Descriptor*> descriptors);
+
+} // namespace NDetail
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Convert generic protobuf table reader to a concrete one (for certain type `T`).
+///
+/// @tparam T Target row type.
+/// @param reader Generic (`Message`-typed) reader, passed as a raw pointer.
+template <typename T>
+TTableReaderPtr<T> CreateConcreteProtobufReader(TTableReader<Message>* reader);
+
+/// Convert generic protobuf table reader to a concrete one (for certain type `T`).
+///
+/// @tparam T Target row type.
+/// @param reader Generic (`Message`-typed) reader, passed as a smart pointer.
+template <typename T>
+TTableReaderPtr<T> CreateConcreteProtobufReader(const TTableReaderPtr<Message>& reader);
+
+/// Convert a concrete (for certain type `T`) protobuf table reader to a generic one.
+///
+/// @tparam T Row type of the source reader; must be `Message` or its descendant.
+/// @param reader Source reader; must be non-null.
+template <typename T>
+TTableReaderPtr<Message> CreateGenericProtobufReader(TTableReader<T>* reader);
+
+/// Convert a concrete (for certain type `T`) protobuf table reader to a generic one.
+///
+/// @tparam T Row type of the source reader.
+/// @param reader Source reader; must be non-null.
+template <typename T>
+TTableReaderPtr<Message> CreateGenericProtobufReader(const TTableReaderPtr<T>& reader);
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
+
+#define IO_INL_H_
+#include "io-inl.h"
+#undef IO_INL_H_
diff --git a/yt/cpp/mapreduce/interface/job_counters.cpp b/yt/cpp/mapreduce/interface/job_counters.cpp
new file mode 100644
index 0000000000..6d4a2a6fcb
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/job_counters.cpp
@@ -0,0 +1,164 @@
+#include "job_counters.h"
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////
+
+namespace {
+    // Compute the total of a counter node:
+    //  - a non-map node is the value itself;
+    //  - a map with a "total" key yields that entry;
+    //  - any other map yields the recursive sum over all of its children.
+    ui64 CountTotal(const TNode& data)
+    {
+        if (!data.IsMap()) {
+            return data.IntCast<ui64>();
+        }
+
+        const auto& children = data.AsMap();
+        if (const auto* explicitTotal = children.FindPtr("total")) {
+            return explicitTotal->IntCast<ui64>();
+        }
+
+        ui64 sum = 0;
+        for (const auto& child : children) {
+            sum += CountTotal(child.second);
+        }
+        return sum;
+    }
+
+    // Return a copy of the child stored under `key` in the map node `data`,
+    // or a default-constructed node when there is no such key.
+    TNode GetNode(const TNode& data, const TStringBuf& key)
+    {
+        const auto* found = data.AsMap().FindPtr(key);
+        return found ? *found : TNode();
+    }
+} // namespace
+
+////////////////////////////////////////////////////////////////////
+
+// Take ownership of the counter node and precompute its total.
+TJobCounter::TJobCounter(TNode data)
+    : Data_(std::move(data))
+{
+    // A node without a value carries no counters: leave `Total_` as is.
+    if (!Data_.HasValue()) {
+        return;
+    }
+    Total_ = CountTotal(Data_);
+}
+
+// Construct a counter from a plain total; `Data_` is left default-constructed.
+TJobCounter::TJobCounter(ui64 total)
+ : Total_(total)
+{ }
+
+// Return the total stored at construction time.
+ui64 TJobCounter::GetTotal() const
+{
+ return Total_;
+}
+
+// Return the total of the sub-counter stored under `key`;
+// a counter whose node has no value always reports zero.
+ui64 TJobCounter::GetValue(const TStringBuf key) const
+{
+    return Data_.HasValue() ? CountTotal(Data_[key]) : 0;
+}
+
+////////////////////////////////////////////////////////////////////
+
+// Parse the counters map node into individual per-state counters.
+// Throws `yexception` when `counters` is not a map.
+TJobCounters::TJobCounters(const NYT::TNode& counters)
+    : Total_(0)
+{
+    if (!counters.IsMap()) {
+        ythrow yexception() << "TJobCounters must be initialized with Map type TNode";
+    }
+
+    // Build a counter from the (possibly missing) child `key` of `node`.
+    const auto counterOf = [] (const TNode& node, const TStringBuf key) {
+        return TJobCounter(GetNode(node, key));
+    };
+
+    // "aborted" and "completed" are nested: they carry their own breakdown.
+    if (const auto aborted = GetNode(counters, "aborted"); aborted.HasValue()) {
+        Aborted_ = counterOf(aborted, "total");
+        AbortedScheduled_ = counterOf(aborted, "scheduled");
+        AbortedNonScheduled_ = counterOf(aborted, "non_scheduled");
+    }
+    if (const auto completed = GetNode(counters, "completed"); completed.HasValue()) {
+        Completed_ = counterOf(completed, "total");
+        CompletedNonInterrupted_ = counterOf(completed, "non-interrupted");
+        CompletedInterrupted_ = counterOf(completed, "interrupted");
+    }
+
+    // The remaining states are read directly from the top-level map.
+    Lost_ = counterOf(counters, "lost");
+    Invalidated_ = counterOf(counters, "invalidated");
+    Failed_ = counterOf(counters, "failed");
+    Running_ = counterOf(counters, "running");
+    Suspended_ = counterOf(counters, "suspended");
+    Pending_ = counterOf(counters, "pending");
+    Blocked_ = counterOf(counters, "blocked");
+
+    Total_ = CountTotal(counters);
+}
+
+
+// Trivial accessors: each one returns a reference to the corresponding
+// counter member filled in by the constructor.
+const TJobCounter& TJobCounters::GetAborted() const
+{
+ return Aborted_;
+}
+
+const TJobCounter& TJobCounters::GetAbortedScheduled() const
+{
+ return AbortedScheduled_;
+}
+
+const TJobCounter& TJobCounters::GetAbortedNonScheduled() const
+{
+ return AbortedNonScheduled_;
+}
+
+const TJobCounter& TJobCounters::GetCompleted() const
+{
+ return Completed_;
+}
+
+const TJobCounter& TJobCounters::GetCompletedNonInterrupted() const
+{
+ return CompletedNonInterrupted_;
+}
+
+const TJobCounter& TJobCounters::GetCompletedInterrupted() const
+{
+ return CompletedInterrupted_;
+}
+
+const TJobCounter& TJobCounters::GetLost() const
+{
+ return Lost_;
+}
+
+const TJobCounter& TJobCounters::GetInvalidated() const
+{
+ return Invalidated_;
+}
+
+const TJobCounter& TJobCounters::GetFailed() const
+{
+ return Failed_;
+}
+
+const TJobCounter& TJobCounters::GetRunning() const
+{
+ return Running_;
+}
+
+const TJobCounter& TJobCounters::GetSuspended() const
+{
+ return Suspended_;
+}
+
+const TJobCounter& TJobCounters::GetPending() const
+{
+ return Pending_;
+}
+
+const TJobCounter& TJobCounters::GetBlocked() const
+{
+ return Blocked_;
+}
+
+// Overall total across the whole counters node.
+ui64 TJobCounters::GetTotal() const
+{
+ return Total_;
+}
+
+////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/job_counters.h b/yt/cpp/mapreduce/interface/job_counters.h
new file mode 100644
index 0000000000..9257cc1ec1
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/job_counters.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include "fwd.h"
+
+#include <yt/cpp/mapreduce/interface/node.h>
+
+namespace NYT {
+
+/// A single job counter: a total value plus an optional node with its breakdown.
+class TJobCounter
+{
+private:
+ // Counter node this object was built from (default-constructed if there was none).
+ TNode Data_;
+ // Total value; zero unless set by one of the constructors.
+ ui64 Total_ = 0;
+
+public:
+ /// Construct an empty counter with zero total.
+ TJobCounter() = default;
+
+ /// Construct counter from a counter node.
+ TJobCounter(TNode data);
+ /// Construct counter from a plain total value.
+ TJobCounter(ui64 total);
+
+ /// Get the total value of the counter.
+ ui64 GetTotal() const;
+
+ /// Get the value of the sub-counter with the given `key`.
+ ui64 GetValue(const TStringBuf key) const;
+};
+
+/// Class representing a collection of job counters.
+class TJobCounters
+{
+public:
+ ///
+ /// Construct empty counters (all totals are zero).
+ TJobCounters() = default;
+
+ ///
+ /// Construct counters from a counters node.
+ TJobCounters(const NYT::TNode& counters);
+
+ // Accessors for individual counters, one per job state / sub-state.
+ const TJobCounter& GetAborted() const;
+ const TJobCounter& GetAbortedScheduled() const;
+ const TJobCounter& GetAbortedNonScheduled() const;
+ const TJobCounter& GetCompleted() const;
+ const TJobCounter& GetCompletedNonInterrupted() const;
+ const TJobCounter& GetCompletedInterrupted() const;
+ const TJobCounter& GetLost() const;
+ const TJobCounter& GetInvalidated() const;
+ const TJobCounter& GetFailed() const;
+ const TJobCounter& GetRunning() const;
+ const TJobCounter& GetSuspended() const;
+ const TJobCounter& GetPending() const;
+ const TJobCounter& GetBlocked() const;
+
+ /// Get the overall total.
+ ui64 GetTotal() const;
+
+private:
+ ui64 Total_ = 0;
+
+ TJobCounter Aborted_;
+ TJobCounter AbortedScheduled_;
+ TJobCounter AbortedNonScheduled_;
+ TJobCounter Completed_;
+ TJobCounter CompletedNonInterrupted_;
+ TJobCounter CompletedInterrupted_;
+ TJobCounter Lost_;
+ TJobCounter Invalidated_;
+ TJobCounter Failed_;
+ TJobCounter Running_;
+ TJobCounter Suspended_;
+ TJobCounter Pending_;
+ TJobCounter Blocked_;
+};
+
+////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/job_counters_ut.cpp b/yt/cpp/mapreduce/interface/job_counters_ut.cpp
new file mode 100644
index 0000000000..56d3932b8f
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/job_counters_ut.cpp
@@ -0,0 +1,103 @@
+#include <yt/cpp/mapreduce/interface/job_counters.h>
+#include <yt/cpp/mapreduce/interface/operation.h>
+
+#include <library/cpp/yson/node/node_io.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NYT;
+
+// Unit tests for TJobCounters / TJobCounter parsing.
+Y_UNIT_TEST_SUITE(JobCounters)
+{
+ // Fully populated counters node: explicit "total" entries are taken as is,
+ // maps without "total" are summed over their children.
+ Y_UNIT_TEST(Full)
+ {
+ const TString input = R"""(
+ {
+ "completed" = {
+ "total" = 6;
+ "non-interrupted" = 1;
+ "interrupted" = {
+ "whatever_interrupted" = 2;
+ "whatever_else_interrupted" = 3;
+ };
+ };
+ "aborted" = {
+ "non_scheduled" = {
+ "whatever_non_scheduled" = 4;
+ "whatever_else_non_scheduled" = 5;
+ };
+ "scheduled" = {
+ "whatever_scheduled" = 6;
+ "whatever_else_scheduled" = 7;
+ };
+ "total" = 22;
+ };
+ "lost" = 8;
+ "invalidated" = 9;
+ "failed" = 10;
+ "running" = 11;
+ "suspended" = 12;
+ "pending" = 13;
+ "blocked" = 14;
+ "total" = 105;
+ })""";
+
+ TJobCounters counters(NodeFromYsonString(input));
+
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetTotal(), 105);
+
+ // Per-state totals ("interrupted" = 2 + 3, "non_scheduled" = 4 + 5, "scheduled" = 6 + 7).
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetCompleted().GetTotal(), 6);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedNonInterrupted().GetTotal(), 1);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetTotal(), 5);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetAborted().GetTotal(), 22);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetTotal(), 9);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetTotal(), 13);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetLost().GetTotal(), 8);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetInvalidated().GetTotal(), 9);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetFailed().GetTotal(), 10);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetRunning().GetTotal(), 11);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetSuspended().GetTotal(), 12);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetPending().GetTotal(), 13);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetBlocked().GetTotal(), 14);
+
+ // Lookup of individual sub-counters by key.
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetValue("whatever_interrupted"), 2);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetValue("whatever_else_interrupted"), 3);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetValue("whatever_non_scheduled"), 4);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetValue("whatever_else_non_scheduled"), 5);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetValue("whatever_scheduled"), 6);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetValue("whatever_else_scheduled"), 7);
+
+ // A missing key is reported as an exception.
+ UNIT_ASSERT_EXCEPTION(counters.GetCompletedInterrupted().GetValue("Nothingness"), yexception);
+ }
+
+ // An empty map is valid input: every counter reports zero.
+ Y_UNIT_TEST(Empty)
+ {
+ const TString input = "{}";
+
+ TJobCounters counters(NodeFromYsonString(input));
+
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetTotal(), 0);
+
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetCompleted().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedNonInterrupted().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetAborted().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetLost().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetInvalidated().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetFailed().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetRunning().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetSuspended().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetPending().GetTotal(), 0);
+ UNIT_ASSERT_VALUES_EQUAL(counters.GetBlocked().GetTotal(), 0);
+ }
+
+ // Non-map nodes must be rejected with an exception mentioning "TJobCounters".
+ Y_UNIT_TEST(Broken)
+ {
+ UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode()), yexception, "TJobCounters");
+ UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode(1)), yexception, "TJobCounters");
+ UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode(1.0)), yexception, "TJobCounters");
+ UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode("Whatever")), yexception, "TJobCounters");
+ }
+}
diff --git a/yt/cpp/mapreduce/interface/job_statistics.cpp b/yt/cpp/mapreduce/interface/job_statistics.cpp
new file mode 100644
index 0000000000..bd9791672d
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/job_statistics.cpp
@@ -0,0 +1,361 @@
+#include "job_statistics.h"
+
+#include "operation.h"
+
+#include <library/cpp/yson/node/node.h>
+#include <library/cpp/yson/node/serialize.h>
+
+#include <library/cpp/yson/writer.h>
+
+#include <util/datetime/base.h>
+#include <util/generic/hash_set.h>
+#include <util/generic/ptr.h>
+#include <util/stream/file.h>
+#include <util/string/cast.h>
+#include <util/string/subst.h>
+#include <util/system/file.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////
+
+ template <>
+ i64 ConvertJobStatisticsEntry(i64 value) // Identity conversion: the raw i64 value is returned unchanged.
+ {
+ return value;
+ }
+
+ template <>
+ TDuration ConvertJobStatisticsEntry(i64 value) // Interprets the raw value as a number of milliseconds.
+ {
+ return TDuration::MilliSeconds(value);
+ }
+
+////////////////////////////////////////////////////////////////////
+
+ // Maps a job type onto the task name used to key per-task statistics.
+ //
+ // `PartitionMap` and `Partition` are mapped onto the dedicated `ETaskName` values;
+ // every other job type falls back to its string representation.
+ static TTaskName JobTypeToTaskName(EJobType jobType)
+ {
+     if (jobType == EJobType::PartitionMap) {
+         return ETaskName::PartitionMap0;
+     }
+     if (jobType == EJobType::Partition) {
+         return ETaskName::Partition0;
+     }
+     return ToString(jobType);
+ }
+
+ // Normalizes a task name read from a statistics node.
+ //
+ // The bare names "partition" and "partition_map" are replaced by
+ // `ETaskName::Partition0` and `ETaskName::PartitionMap0`; any other name is kept as is.
+ static TTaskName FixTaskName(TString taskName)
+ {
+     if (taskName == "partition_map") {
+         return ETaskName::PartitionMap0;
+     }
+     if (taskName == "partition") {
+         return ETaskName::Partition0;
+     }
+     return taskName;
+ }
+
+////////////////////////////////////////////////////////////////////
+
+ // Parsed statistics tree: statistic name -> job state -> task name -> aggregated entry.
+ class TJobStatistics::TData
+     : public TThrRefBase
+ {
+ public:
+     using TTaskName2Data = THashMap<TString, TJobStatistics::TDataEntry>;
+     using TState2TaskName2Data = THashMap<EJobState, TTaskName2Data>;
+     using TName2State2TaskName2Data = THashMap<TString, TState2TaskName2Data>;
+
+ public:
+     TName2State2TaskName2Data Name2State2TaskName2Data;
+
+ public:
+     TData() = default;
+
+     // Parses the whole statistics tree rooted at `statisticsNode`.
+     TData(const TNode& statisticsNode)
+     {
+         ParseNode(statisticsNode, TString(), &Name2State2TaskName2Data);
+     }
+
+     // Folds `other` into `*result`: max of maxima, min of minima, sums and counts are added.
+     static void Aggregate(TJobStatistics::TDataEntry* result, const TJobStatistics::TDataEntry& other)
+     {
+         result->Sum += other.Sum;
+         result->Count += other.Count;
+         result->Min = Min(result->Min, other.Min);
+         result->Max = Max(result->Max, other.Max);
+     }
+
+     // Parses the content of a "$" node: job state -> task name -> {max, min, sum, count}.
+     // Job states that cannot be parsed into `EJobState` are skipped.
+     static void ParseNode(const TNode& node, TState2TaskName2Data* output)
+     {
+         // Reads a mandatory int64 field of a map node; throws if it is absent or has another type.
+         const auto readInt64 = [] (const TNode& mapNode, TStringBuf key) {
+             const auto& fields = mapNode.AsMap();
+             const auto fieldIt = fields.find(key);
+             if (fieldIt == fields.end()) {
+                 ythrow yexception() << "Key '" << key << "' is not found";
+             }
+             const auto& field = fieldIt->second;
+             if (!field.IsInt64()) {
+                 ythrow yexception() << "Key '" << key << "' is not of int64 type";
+             }
+             return field.AsInt64();
+         };
+
+         for (const auto& [stateName, tasksNode] : node.AsMap()) {
+             EJobState state;
+             if (!TryFromString(stateName, state)) {
+                 continue;
+             }
+             for (const auto& [rawTaskName, entryNode] : tasksNode.AsMap()) {
+                 auto taskName = FixTaskName(rawTaskName);
+                 auto& entry = (*output)[state][taskName.Get()];
+                 entry.Max = readInt64(entryNode, "max");
+                 entry.Min = readInt64(entryNode, "min");
+                 entry.Sum = readInt64(entryNode, "sum");
+                 entry.Count = readInt64(entryNode, "count");
+             }
+         }
+     }
+
+     // Recursively walks the statistics tree, building slash separated names.
+     // A '/' inside a key is escaped as "\/"; a "$" key marks the leaf holding the actual data.
+     static void ParseNode(const TNode& node, const TString& curPath, TName2State2TaskName2Data* output)
+     {
+         Y_VERIFY(node.IsMap());
+
+         for (const auto& [key, value] : node.AsMap()) {
+             if (key == "$"sv) {
+                 ParseNode(value, &(*output)[curPath]);
+                 continue;
+             }
+
+             TString childPath = curPath;
+             if (!childPath.empty()) {
+                 childPath.push_back('/');
+             }
+             if (key.find_first_of('/') == key.npos) {
+                 childPath += key;
+             } else {
+                 TString escapedKey(key);
+                 SubstGlobal(escapedKey, "/", "\\/");
+                 childPath += escapedKey;
+             }
+             ParseNode(value, childPath, output);
+         }
+     }
+ };
+
+////////////////////////////////////////////////////////////////////
+
+ struct TJobStatistics::TFilter // Selection applied by GetStatisticsImpl when aggregating entries.
+ : public TThrRefBase
+ {
+ TVector<TTaskName> TaskNameFilter; // Task names to include; empty means all task names.
+ TVector<EJobState> JobStateFilter = {EJobState::Completed}; // Job states to include; empty means all states.
+ };
+
+////////////////////////////////////////////////////////////////////
+
+ const TString TJobStatistics::CustomStatisticsNamePrefix_ = "custom/"; // Prepended to names passed to the *CustomStatistics* methods.
+
+ TJobStatistics::TJobStatistics() // Empty statistics with the default filter.
+ : Data_(::MakeIntrusive<TData>())
+ , Filter_(::MakeIntrusive<TFilter>())
+ { }
+
+
+ TJobStatistics::TJobStatistics(const NYT::TNode& statisticsNode) // Parses `statisticsNode` eagerly; the default filter is used.
+ : Data_(::MakeIntrusive<TData>(statisticsNode))
+ , Filter_(::MakeIntrusive<TFilter>())
+ { }
+
+ // Builds a view over already parsed `data` restricted by `filter`.
+ //
+ // Both pointers are taken by value and moved into place. The callers in this file
+ // (`TaskName` and `JobState`) pass a freshly created filter, so copying the filter
+ // once more here would only cost an extra allocation.
+ TJobStatistics::TJobStatistics(::TIntrusivePtr<TData> data, ::TIntrusivePtr<TFilter> filter)
+     : Data_(std::move(data))
+     , Filter_(std::move(filter))
+ { }
+
+ TJobStatistics::TJobStatistics(const TJobStatistics& jobStatistics) = default; // Copies the intrusive pointers, so copies share TData and TFilter.
+ TJobStatistics::TJobStatistics(TJobStatistics&&) = default;
+
+ TJobStatistics& TJobStatistics::operator=(const TJobStatistics& jobStatistics) = default;
+ TJobStatistics& TJobStatistics::operator=(TJobStatistics&& jobStatistics) = default;
+ // Defined in this file, where TData and TFilter are complete types.
+ TJobStatistics::~TJobStatistics() = default;
+
+ // Returns a view over the same data whose task name filter is replaced by `taskNames`.
+ // The job state filter of the current object is carried over.
+ TJobStatistics TJobStatistics::TaskName(TVector<TTaskName> taskNames) const
+ {
+     auto updatedFilter = ::MakeIntrusive<TFilter>(*Filter_);
+     updatedFilter->TaskNameFilter = std::move(taskNames);
+
+     return TJobStatistics(Data_, std::move(updatedFilter));
+ }
+
+ // Returns a view over the same data whose job state filter is replaced by `jobStates`.
+ // The task name filter of the current object is carried over.
+ TJobStatistics TJobStatistics::JobState(TVector<EJobState> jobStates) const
+ {
+     auto updatedFilter = ::MakeIntrusive<TFilter>(*Filter_);
+     updatedFilter->JobStateFilter = std::move(jobStates);
+
+     return TJobStatistics(Data_, std::move(updatedFilter));
+ }
+
+ // Filters by job type: every job type is translated into a task name
+ // (see JobTypeToTaskName) and the result is delegated to TaskName().
+ TJobStatistics TJobStatistics::JobType(TVector<EJobType> jobTypes) const
+ {
+     TVector<TTaskName> taskNames;
+     // Exactly one task name is produced per job type, so allocate once up front.
+     taskNames.reserve(jobTypes.size());
+     for (auto jobType : jobTypes) {
+         taskNames.push_back(JobTypeToTaskName(jobType));
+     }
+     return TaskName(std::move(taskNames));
+ }
+
+ // Tells whether a statistic with the given slash separated name was parsed.
+ // Filters are not taken into account.
+ bool TJobStatistics::HasStatistics(TStringBuf name) const
+ {
+     const auto& knownStatistics = Data_->Name2State2TaskName2Data;
+     return knownStatistics.find(name) != knownStatistics.end();
+ }
+
+ TJobStatisticsEntry<i64> TJobStatistics::GetStatistics(TStringBuf name) const // Shorthand for GetStatisticsAs<i64>(name).
+ {
+ return GetStatisticsAs<i64>(name);
+ }
+
+ // Lists the names of all parsed statistics, in the iteration order of the underlying hash map.
+ TVector<TString> TJobStatistics::GetStatisticsNames() const
+ {
+     const auto& knownStatistics = Data_->Name2State2TaskName2Data;
+
+     TVector<TString> names;
+     names.reserve(knownStatistics.size());
+     for (const auto& nameAndData : knownStatistics) {
+         names.push_back(nameAndData.first);
+     }
+     return names;
+ }
+
+ bool TJobStatistics::HasCustomStatistics(TStringBuf name) const // Same as HasStatistics() for "custom/" + name.
+ {
+ return HasStatistics(CustomStatisticsNamePrefix_ + name);
+ }
+
+ TJobStatisticsEntry<i64> TJobStatistics::GetCustomStatistics(TStringBuf name) const // Shorthand for GetCustomStatisticsAs<i64>(name).
+ {
+ return GetCustomStatisticsAs<i64>(name);
+ }
+
+ // Lists custom statistics: names starting with the custom prefix, returned with that prefix removed.
+ TVector<TString> TJobStatistics::GetCustomStatisticsNames() const
+ {
+     TVector<TString> customNames;
+     for (const auto& nameAndData : Data_->Name2State2TaskName2Data) {
+         const auto& fullName = nameAndData.first;
+         if (!fullName.StartsWith(CustomStatisticsNamePrefix_)) {
+             continue;
+         }
+         customNames.push_back(fullName.substr(CustomStatisticsNamePrefix_.size()));
+     }
+     return customNames;
+ }
+
+ // Looks up the statistic `name` and folds together the entries selected by the current filter.
+ //
+ // Throws (via Y_ENSURE) when no statistic with this name exists. Returns an empty TMaybe
+ // when the statistic exists but no (job state, task name) entry passes the filter.
+ // An empty job state filter or task name filter means "take everything" on that level.
+ TMaybe<TJobStatistics::TDataEntry> TJobStatistics::GetStatisticsImpl(TStringBuf name) const
+ {
+     const auto& knownStatistics = Data_->Name2State2TaskName2Data;
+     const auto statisticsIt = knownStatistics.find(name);
+     Y_ENSURE(
+         statisticsIt != knownStatistics.end(),
+         "Statistics '" << name << "' are missing");
+     const auto& byState = statisticsIt->second;
+
+     TMaybe<TDataEntry> accumulated;
+     // The first selected entry is copied; every following one is aggregated into it.
+     const auto addEntry = [&accumulated] (const TDataEntry& entry) {
+         if (!accumulated) {
+             accumulated = entry;
+             return;
+         }
+         TData::Aggregate(&accumulated.GetRef(), entry);
+     };
+
+     const auto& taskNameFilter = Filter_->TaskNameFilter;
+     const auto addTasks = [&] (const TData::TTaskName2Data& byTaskName) {
+         if (taskNameFilter.empty()) {
+             for (const auto& taskAndEntry : byTaskName) {
+                 addEntry(taskAndEntry.second);
+             }
+             return;
+         }
+         for (const auto& wantedTask : taskNameFilter) {
+             const auto taskIt = byTaskName.find(wantedTask.Get());
+             if (taskIt != byTaskName.end()) {
+                 addEntry(taskIt->second);
+             }
+         }
+     };
+
+     const auto& jobStateFilter = Filter_->JobStateFilter;
+     if (jobStateFilter.empty()) {
+         for (const auto& stateAndTasks : byState) {
+             addTasks(stateAndTasks.second);
+         }
+         return accumulated;
+     }
+
+     for (auto wantedState : jobStateFilter) {
+         const auto stateIt = byState.find(wantedState);
+         if (stateIt != byState.end()) {
+             addTasks(stateIt->second);
+         }
+     }
+     return accumulated;
+ }
+
+////////////////////////////////////////////////////////////////////
+
+namespace {
+
+ constexpr int USER_STATISTICS_FILE_DESCRIPTOR = 5; // Descriptor that GetStatisticsStream() duplicates and writes to.
+ constexpr char PATH_DELIMITER = '/'; // Separates the components of a statistics path.
+ constexpr char ESCAPE = '\\'; // ESCAPE directly followed by PATH_DELIMITER stands for a literal '/'.
+
+ IOutputStream* GetStatisticsStream() // Returns the stream shared by all custom statistics writers in this file.
+ {
+ static TFile file = Duplicate(USER_STATISTICS_FILE_DESCRIPTOR); // Function-local statics: initialized on the first call only.
+ static TFileOutput stream(file);
+ return &stream;
+ }
+
+ // Serializes `value` wrapped into nested maps, one level per component of `path`.
+ //
+ // `path` is split on PATH_DELIMITER; the sequence ESCAPE + PATH_DELIMITER ("\/") is not
+ // a split point and is turned back into a plain '/' inside the component.
+ template <typename T>
+ void WriteCustomStatisticsAny(TStringBuf path, const T& value)
+ {
+     ::NYson::TYsonWriter writer(GetStatisticsStream(), NYson::EYsonFormat::Binary, ::NYson::EYsonType::ListFragment);
+     TVector<TString> components;
+     int openMaps = 0;
+     size_t componentBegin = 0;
+     // `pos == path.size()` is visited on purpose: it terminates the last component.
+     for (size_t pos = 0; pos <= path.size(); ++pos) {
+         const bool escapedDelimiter =
+             pos + 1 < path.size() && path[pos] == ESCAPE && path[pos + 1] == PATH_DELIMITER;
+         if (escapedDelimiter) {
+             // Step over the escape here; the loop increment steps over the delimiter.
+             ++pos;
+             continue;
+         }
+         if (pos != path.size() && path[pos] != PATH_DELIMITER) {
+             continue;
+         }
+         writer.OnBeginMap();
+         components.emplace_back(path.data() + componentBegin, pos - componentBegin);
+         SubstGlobal(components.back(), "\\/", "/");
+         writer.OnKeyedItem(TStringBuf(components.back()));
+         ++openMaps;
+         componentBegin = pos + 1;
+     }
+     Serialize(value, &writer);
+     for (; openMaps > 0; --openMaps) {
+         writer.OnEndMap();
+     }
+ }
+
+}
+
+////////////////////////////////////////////////////////////////////
+
+ void WriteCustomStatistics(const TNode& statistics) // Serializes the whole node to the statistics stream as binary YSON.
+ {
+ ::NYson::TYsonWriter writer(GetStatisticsStream(), NYson::EYsonFormat::Binary, ::NYson::EYsonType::ListFragment);
+ Serialize(statistics, &writer);
+ }
+
+ void WriteCustomStatistics(TStringBuf path, i64 value) // Thin wrapper: path splitting and escaping are handled by WriteCustomStatisticsAny.
+ {
+ WriteCustomStatisticsAny(path, value);
+ }
+
+ void FlushCustomStatisticsStream() { // Flushes the shared stream returned by GetStatisticsStream().
+ GetStatisticsStream()->Flush();
+ }
+////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/job_statistics.h b/yt/cpp/mapreduce/interface/job_statistics.h
new file mode 100644
index 0000000000..8af751604f
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/job_statistics.h
@@ -0,0 +1,268 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/job_statistics.h
+///
+/// Header containing classes and utility functions to work with
+/// [job statistics](https://docs.yandex-team.ru/yt/problems/jobstatistics).
+
+#include "fwd.h"
+
+#include <library/cpp/yson/node/node.h>
+
+#include <util/system/defaults.h>
+#include <util/generic/maybe.h>
+#include <util/generic/ptr.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Convert i64 representation of statistics to other type.
+///
+/// Library defines this template for types TDuration and i64.
+/// Users may define it for their types.
+///
+/// @see @ref NYT::TJobStatistics::GetStatisticsAs method.
+template <typename T>
+T ConvertJobStatisticsEntry(i64 value);
+
+////////////////////////////////////////////////////////////////////
+
+ /// Class representing a collection of job statistics.
+ class TJobStatistics
+ {
+ public:
+ ///
+ /// Construct empty statistics.
+ TJobStatistics();
+
+ ///
+ /// Construct statistics from statistics node.
+ TJobStatistics(const NYT::TNode& statistics);
+
+ TJobStatistics(const TJobStatistics& jobStatistics); // Copies share the parsed data and the filter.
+ TJobStatistics(TJobStatistics&& jobStatistics);
+
+ TJobStatistics& operator=(const TJobStatistics& jobStatistics);
+ TJobStatistics& operator=(TJobStatistics&& jobStatistics);
+
+ ~TJobStatistics();
+
+ ///
+ /// @brief Filter statistics by task name.
+ ///
+ /// @param taskNames What task names to include (empty means all).
+ TJobStatistics TaskName(TVector<TTaskName> taskNames) const;
+
+ ///
+ /// @brief Filter statistics by job state.
+ ///
+ /// @param filter What job states to include (empty means all).
+ ///
+ /// @note Default statistics include only (successfully) completed jobs.
+ TJobStatistics JobState(TVector<EJobState> filter) const;
+
+ ///
+ /// @brief Filter statistics by job type.
+ ///
+ /// @param filter What job types to include (empty means all).
+ ///
+ /// @deprecated Use @ref TJobStatistics::TaskName instead.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/jobs#obshaya-shema
+ TJobStatistics JobType(TVector<EJobType> filter) const;
+
+ ///
+ /// @brief Check that given statistics exist.
+ ///
+ /// @param name Slash separated statistics name, e.g. "time/total" (like it appears in web interface).
+ bool HasStatistics(TStringBuf name) const;
+
+ ///
+ /// @brief Get statistics by name.
+ ///
+ /// @param name Slash separated statistics name, e.g. "time/total" (like it appears in web interface).
+ ///
+ /// @note If the statistics are missing, an exception is thrown. If no entries remain
+ /// after the filters are applied, the returned value is empty (all fields are `Nothing`).
+ ///
+ /// @note We don't use `TMaybe<TJobStatisticsEntry>` here;
+ /// instead, @ref NYT::TJobStatisticsEntry methods return `TMaybe<i64>`,
+ /// so that users can easily apply `.GetOrElse`:
+ /// ```
+ /// jobStatistics.GetStatistics("some/statistics/name").Max().GetOrElse(0);
+ /// ```
+ TJobStatisticsEntry<i64> GetStatistics(TStringBuf name) const;
+
+ ///
+ /// @brief Get statistics by name.
+ ///
+ /// @param name Slash separated statistics name, e.g. "time/total" (like it appears in web interface).
+ ///
+ /// @note In order to use `GetStatisticsAs` method, @ref NYT::ConvertJobStatisticsEntry function must be defined
+ /// (the library defines it for `i64` and `TDuration`, user may define it for other types).
+ template <typename T>
+ TJobStatisticsEntry<T> GetStatisticsAs(TStringBuf name) const;
+
+ ///
+ /// Get (slash separated) names of statistics.
+ TVector<TString> GetStatisticsNames() const;
+
+ ///
+ /// @brief Check if given custom statistics exists.
+ ///
+ /// @param name Slash separated custom statistics name.
+ bool HasCustomStatistics(TStringBuf name) const;
+
+ ///
+ /// @brief Get custom statistics (those the user can write in job with @ref NYT::WriteCustomStatistics).
+ ///
+ /// @param name Slash separated custom statistics name.
+ TJobStatisticsEntry<i64> GetCustomStatistics(TStringBuf name) const;
+
+ ///
+ /// @brief Get custom statistics (those the user can write in job with @ref NYT::WriteCustomStatistics).
+ ///
+ /// @param name Slash separated custom statistics name.
+ template <typename T>
+ TJobStatisticsEntry<T> GetCustomStatisticsAs(TStringBuf name) const;
+
+ ///
+ /// Get names of all custom statistics.
+ TVector<TString> GetCustomStatisticsNames() const;
+
+ private:
+ class TData;
+ struct TFilter;
+
+ struct TDataEntry { // Raw aggregated values of a single statistic.
+ i64 Max;
+ i64 Min;
+ i64 Sum;
+ i64 Count;
+ };
+
+ static const TString CustomStatisticsNamePrefix_; // Prefix added to names by the *CustomStatistics* methods.
+
+ private:
+ TJobStatistics(::TIntrusivePtr<TData> data, ::TIntrusivePtr<TFilter> filter);
+
+ TMaybe<TDataEntry> GetStatisticsImpl(TStringBuf name) const; // Shared lookup behind GetStatisticsAs / GetCustomStatisticsAs.
+
+ private:
+ ::TIntrusivePtr<TData> Data_;
+ ::TIntrusivePtr<TFilter> Filter_;
+
+ private:
+ template<typename T>
+ friend class TJobStatisticsEntry; // Needs access to the private TDataEntry.
+ };
+
+////////////////////////////////////////////////////////////////////
+
+ /// Class representing a single statistic.
+ ///
+ /// Every accessor converts the stored i64 value with @ref NYT::ConvertJobStatisticsEntry
+ /// and returns `Nothing()` when the entry holds no data.
+ template <typename T>
+ class TJobStatisticsEntry
+ {
+ public:
+     TJobStatisticsEntry(TMaybe<TJobStatistics::TDataEntry> data)
+         : Data_(std::move(data))
+     { }
+
+     /// Sum of the statistic over all jobs.
+     TMaybe<T> Sum() const
+     {
+         if (!Data_) {
+             return Nothing();
+         }
+         return ConvertJobStatisticsEntry<T>(Data_->Sum);
+     }
+
+     /// @brief Average of the statistic over all jobs (integer division of sum by count).
+     ///
+     /// @note Only jobs that emitted statistics are taken into account.
+     /// `Nothing()` is returned as well when the count is zero.
+     TMaybe<T> Avg() const
+     {
+         if (!Data_ || Data_->Count == 0) {
+             return Nothing();
+         }
+         return ConvertJobStatisticsEntry<T>(Data_->Sum / Data_->Count);
+     }
+
+     /// @brief Number of jobs that emitted this statistic.
+     ///
+     /// @note The count goes through the same conversion to `T` as the other values.
+     TMaybe<T> Count() const
+     {
+         if (!Data_) {
+             return Nothing();
+         }
+         return ConvertJobStatisticsEntry<T>(Data_->Count);
+     }
+
+     /// @brief Maximum value of the statistic over all jobs.
+     TMaybe<T> Max() const
+     {
+         if (!Data_) {
+             return Nothing();
+         }
+         return ConvertJobStatisticsEntry<T>(Data_->Max);
+     }
+
+     /// @brief Minimum value of the statistic over all jobs.
+     TMaybe<T> Min() const
+     {
+         if (!Data_) {
+             return Nothing();
+         }
+         return ConvertJobStatisticsEntry<T>(Data_->Min);
+     }
+
+ private:
+     TMaybe<TJobStatistics::TDataEntry> Data_;
+
+ private:
+     friend class TJobStatistics;
+ };
+
+////////////////////////////////////////////////////////////////////
+
+ template <typename T>
+ TJobStatisticsEntry<T> TJobStatistics::GetStatisticsAs(TStringBuf name) const // Wraps the filtered lookup result into a typed entry.
+ {
+ return TJobStatisticsEntry<T>(GetStatisticsImpl(name));
+ }
+
+ template <typename T>
+ TJobStatisticsEntry<T> TJobStatistics::GetCustomStatisticsAs(TStringBuf name) const // Same as GetStatisticsAs, with the custom prefix prepended to `name`.
+ {
+ return TJobStatisticsEntry<T>(GetStatisticsImpl(CustomStatisticsNamePrefix_ + name));
+ }
+
+////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Write [custom statistics](https://yt.yandex-team.ru/docs/description/mr/jobs#user_stats).
+///
+/// @param path Slash-separated path (length must not exceed 512 bytes).
+/// @param value Value of the statistic.
+///
+/// @note The function must be called in job.
+/// Total number of statistics (with different paths) must not exceed 128.
+void WriteCustomStatistics(TStringBuf path, i64 value);
+
+///
+/// @brief Write several [custom statistics](https://yt.yandex-team.ru/docs/description/mr/jobs#user_stats) at once.
+///
+/// @param statistics A tree of map nodes with leaves of type `i64`.
+///
+/// @note The call is equivalent to calling @ref NYT::WriteCustomStatistics(TStringBuf, i64) for every path in the given map.
+void WriteCustomStatistics(const TNode& statistics);
+
+///
+/// @brief Flush [custom statistics stream](https://yt.yandex-team.ru/docs/description/mr/jobs#user_stats)
+///
+void FlushCustomStatisticsStream();
+////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/job_statistics_ut.cpp b/yt/cpp/mapreduce/interface/job_statistics_ut.cpp
new file mode 100644
index 0000000000..0cf53d771a
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/job_statistics_ut.cpp
@@ -0,0 +1,257 @@
+#include <yt/cpp/mapreduce/interface/job_statistics.h>
+#include <yt/cpp/mapreduce/interface/operation.h>
+
+#include <library/cpp/yson/node/node_io.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NYT;
+
+Y_UNIT_TEST_SUITE(JobStatistics)
+{
+ Y_UNIT_TEST(Simple) // Parsing of one statistic, default aggregation, and job state / job type filters.
+ {
+ const TString input = R"""(
+ {
+ "data" = {
+ "output" = {
+ "0" = {
+ "uncompressed_data_size" = {
+ "$" = {
+ "completed" = {
+ "simple_sort" = {
+ "max" = 130;
+ "count" = 1;
+ "min" = 130;
+ "sum" = 130;
+ };
+ "map" = {
+ "max" = 42;
+ "count" = 1;
+ "min" = 42;
+ "sum" = 42;
+ };
+ };
+ "aborted" = {
+ "simple_sort" = {
+ "max" = 24;
+ "count" = 1;
+ "min" = 24;
+ "sum" = 24;
+ };
+ };
+ };
+ };
+ };
+ };
+ };
+ })""";
+
+ TJobStatistics stat(NodeFromYsonString(input));
+
+ UNIT_ASSERT(stat.HasStatistics("data/output/0/uncompressed_data_size"));
+ UNIT_ASSERT(!stat.HasStatistics("nonexistent-statistics"));
+ UNIT_ASSERT_EXCEPTION_CONTAINS(stat.GetStatistics("BLAH-BLAH"), yexception, "Statistics");
+
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatisticsNames(), TVector<TString>{"data/output/0/uncompressed_data_size"});
+ // Without explicit filters only the completed jobs are aggregated: simple_sort (130) and map (42).
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Max(), 130);
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Count(), 2);
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Min(), 42);
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Sum(), 172);
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Avg(), 172 / 2);
+ // Explicit filters: aborted jobs only, then aborted "map" jobs (none exist, so the result is empty).
+ UNIT_ASSERT_VALUES_EQUAL(stat.JobState({EJobState::Aborted}).GetStatistics("data/output/0/uncompressed_data_size").Sum(), 24);
+ UNIT_ASSERT_VALUES_EQUAL(stat.JobType({EJobType::Map}).JobState({EJobState::Aborted}).GetStatistics("data/output/0/uncompressed_data_size").Sum(), TMaybe<i64>());
+ }
+
+ Y_UNIT_TEST(TestOtherTypes) // GetStatisticsAs<TDuration> must interpret the raw value as milliseconds.
+ {
+ const TString input = R"""(
+ {
+ "time" = {
+ "exec" = {
+ "$" = {
+ "completed" = {
+ "map" = {
+ "max" = 2482468;
+ "count" = 38;
+ "min" = 578976;
+ "sum" = 47987270;
+ };
+ };
+ };
+ };
+ };
+ })""";
+
+ TJobStatistics stat(NodeFromYsonString(input));
+
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatisticsAs<TDuration>("time/exec").Max(), TDuration::MilliSeconds(2482468));
+ }
+
+ Y_UNIT_TEST(Custom) // Custom statistics live under "custom/" and are addressed without that prefix.
+ {
+ const TString input = R"""(
+ {
+ "custom" = {
+ "some" = {
+ "path" = {
+ "$" = {
+ "completed" = {
+ "map" = {
+ "max" = -1;
+ "count" = 1;
+ "min" = -1;
+ "sum" = -1;
+ };
+ };
+ };
+ };
+ };
+ "another" = {
+ "path" = {
+ "$" = {
+ "completed" = {
+ "map" = {
+ "max" = 1001;
+ "count" = 2;
+ "min" = 1001;
+ "sum" = 2002;
+ };
+ };
+ };
+ };
+ };
+ };
+ })""";
+
+ TJobStatistics stat(NodeFromYsonString(input));
+
+ UNIT_ASSERT(stat.HasCustomStatistics("some/path"));
+ UNIT_ASSERT(!stat.HasCustomStatistics("nonexistent-statistics"));
+ UNIT_ASSERT_EXCEPTION_CONTAINS(stat.GetCustomStatistics("BLAH-BLAH"), yexception, "Statistics");
+ // Names are compared as a set: the order returned by GetCustomStatisticsNames is not asserted.
+ const auto names = stat.GetCustomStatisticsNames();
+ const THashSet<TString> expected = {"some/path", "another/path"};
+ UNIT_ASSERT_VALUES_EQUAL(THashSet<TString>(names.begin(), names.end()), expected);
+
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetCustomStatistics("some/path").Max(), -1);
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetCustomStatistics("another/path").Avg(), 1001);
+ }
+
+ Y_UNIT_TEST(TaskNames) // Filtering by task name and by job type, including "partition" / "partition_map" names.
+ {
+ const TString input = R"""(
+ {
+ "data" = {
+ "output" = {
+ "0" = {
+ "uncompressed_data_size" = {
+ "$" = {
+ "completed" = {
+ "partition_map" = {
+ "max" = 130;
+ "count" = 1;
+ "min" = 130;
+ "sum" = 130;
+ };
+ "partition(0)" = {
+ "max" = 42;
+ "count" = 1;
+ "min" = 42;
+ "sum" = 42;
+ };
+ };
+ "aborted" = {
+ "simple_sort" = {
+ "max" = 24;
+ "count" = 1;
+ "min" = 24;
+ "sum" = 24;
+ };
+ };
+ };
+ };
+ };
+ };
+ };
+ })""";
+
+ TJobStatistics stat(NodeFromYsonString(input));
+
+ UNIT_ASSERT(stat.HasStatistics("data/output/0/uncompressed_data_size"));
+ UNIT_ASSERT(!stat.HasStatistics("nonexistent-statistics"));
+ UNIT_ASSERT_EXCEPTION_CONTAINS(stat.GetStatistics("BLAH-BLAH"), yexception, "Statistics");
+
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatisticsNames(), TVector<TString>{"data/output/0/uncompressed_data_size"});
+
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Max(), 130);
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Count(), 2);
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Min(), 42);
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Sum(), 172);
+ UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Avg(), 172 / 2);
+ // The input's bare "partition_map" entry is expected under "partition_map(0)"; a "partition" filter matches nothing.
+ UNIT_ASSERT_VALUES_EQUAL(
+ stat
+ .JobState({EJobState::Aborted})
+ .GetStatistics("data/output/0/uncompressed_data_size")
+ .Sum(),
+ 24);
+ UNIT_ASSERT_VALUES_EQUAL(
+ stat
+ .JobType({EJobType::Partition})
+ .JobState({EJobState::Aborted})
+ .GetStatistics("data/output/0/uncompressed_data_size")
+ .Sum(),
+ TMaybe<i64>());
+ UNIT_ASSERT_VALUES_EQUAL(
+ stat
+ .TaskName({"partition(0)"})
+ .GetStatistics("data/output/0/uncompressed_data_size")
+ .Sum(),
+ 42);
+ UNIT_ASSERT_VALUES_EQUAL(
+ stat
+ .TaskName({"partition"})
+ .GetStatistics("data/output/0/uncompressed_data_size")
+ .Sum(),
+ TMaybe<i64>());
+ UNIT_ASSERT_VALUES_EQUAL(
+ stat
+ .TaskName({"partition_map(0)"})
+ .GetStatistics("data/output/0/uncompressed_data_size")
+ .Sum(),
+ 130);
+ UNIT_ASSERT_VALUES_EQUAL(
+ stat
+ .JobType({EJobType::Partition})
+ .GetStatistics("data/output/0/uncompressed_data_size")
+ .Sum(),
+ 42);
+ UNIT_ASSERT_VALUES_EQUAL(
+ stat
+ .JobType({EJobType::PartitionMap})
+ .GetStatistics("data/output/0/uncompressed_data_size")
+ .Sum(),
+ 130);
+ UNIT_ASSERT_VALUES_EQUAL(
+ stat
+ .TaskName({ETaskName::Partition0})
+ .GetStatistics("data/output/0/uncompressed_data_size")
+ .Sum(),
+ 42);
+ UNIT_ASSERT_VALUES_EQUAL(
+ stat
+ .TaskName({ETaskName::Partition1})
+ .GetStatistics("data/output/0/uncompressed_data_size")
+ .Sum(),
+ TMaybe<i64>());
+ UNIT_ASSERT_VALUES_EQUAL(
+ stat
+ .TaskName({ETaskName::PartitionMap0})
+ .GetStatistics("data/output/0/uncompressed_data_size")
+ .Sum(),
+ 130);
+ }
+}
diff --git a/yt/cpp/mapreduce/interface/logging/logger.cpp b/yt/cpp/mapreduce/interface/logging/logger.cpp
new file mode 100644
index 0000000000..bfa56b94f6
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/logging/logger.cpp
@@ -0,0 +1,188 @@
+#include "logger.h"
+
+#include <util/datetime/base.h>
+
+#include <util/stream/file.h>
+#include <util/stream/format.h>
+#include <util/stream/printf.h>
+#include <util/stream/str.h>
+
+#include <util/system/mutex.h>
+#include <util/system/rwlock.h>
+#include <util/system/thread.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Returns the part of `path` after the last '/'. If there is no '/', the part after
+ // the last '\\' is returned; if there is neither, `path` is returned unchanged.
+ static TStringBuf StripFileName(TStringBuf path) {
+     TStringBuf directory;
+     TStringBuf fileName;
+     if (path.TryRSplit('/', directory, fileName)) {
+         return fileName;
+     }
+     if (path.TryRSplit('\\', directory, fileName)) {
+         return fileName;
+     }
+     return path;
+ }
+
+ // Maps a log level onto the one-letter code printed in every log line.
+ // The switch lists every enumerator of ILogger::ELevel and deliberately has no `default`.
+ static char GetLogLevelCode(ILogger::ELevel level) {
+     switch (level) {
+         case ILogger::FATAL:
+             return 'F';
+         case ILogger::ERROR:
+             return 'E';
+         case ILogger::INFO:
+             return 'I';
+         case ILogger::DEBUG:
+             return 'D';
+     }
+     Y_UNREACHABLE();
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Logger that silently discards every message.
+ class TNullLogger
+     : public ILogger
+ {
+ public:
+     void Log(
+         ELevel /*level*/,
+         const TSourceLocation& /*sourceLocation*/,
+         const char* /*format*/,
+         va_list /*args*/) override
+     { }
+ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Common part of the concrete loggers: filters by level, formats the line,
+ // and hands it to OutputLine() under a mutex.
+ class TLoggerBase
+     : public ILogger
+ {
+ public:
+     TLoggerBase(ELevel cutLevel)
+         : CutLevel_(cutLevel)
+     { }
+
+     // Writes one fully formatted line (already terminated with a line break).
+     // Called with the internal mutex held.
+     virtual void OutputLine(const TString& line) = 0;
+
+     void Log(ELevel level, const TSourceLocation& sourceLocation, const char* format, va_list args) override
+     {
+         // Messages whose level is greater than the cut level are dropped.
+         const bool suppressed = level > CutLevel_;
+         if (suppressed) {
+             return;
+         }
+
+         // Line layout: "<local time> <level code> [<thread id>] <message> - <file>:<line>".
+         // The line is formatted before the mutex is taken.
+         TStringStream line;
+         line << TInstant::Now().ToStringLocal();
+         line << " " << GetLogLevelCode(level);
+         line << " [" << Hex(TThread::CurrentThreadId(), HF_FULL) << "] ";
+         Printf(line, format, args);
+         line << " - " << StripFileName(sourceLocation.File) << ':' << sourceLocation.Line << Endl;
+
+         auto guard = Guard(Mutex_);
+         OutputLine(line.Str());
+     }
+
+ private:
+     ELevel CutLevel_;
+     TMutex Mutex_;
+ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+ class TStdErrLogger // Logger that writes every formatted line to Cerr.
+ : public TLoggerBase
+ {
+ public:
+ TStdErrLogger(ELevel cutLevel)
+ : TLoggerBase(cutLevel)
+ { }
+
+ void OutputLine(const TString& line) override
+ {
+ Cerr << line;
+ }
+ };
+
+ ILoggerPtr CreateStdErrLogger(ILogger::ELevel cutLevel) // Factory for TStdErrLogger; the returned pointer owns the logger.
+ {
+ return new TStdErrLogger(cutLevel);
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+ class TFileLogger // Logger that writes every line to a file through TUnbufferedFileOutput.
+ : public TLoggerBase
+ {
+ public:
+ TFileLogger(ELevel cutLevel, const TString& path, bool append)
+ : TLoggerBase(cutLevel)
+ , Stream_(TFile(path, OpenAlways | WrOnly | Seq | (append ? ForAppend : EOpenMode()))) // ForAppend is added only when `append` is set.
+ { }
+
+ void OutputLine(const TString& line) override
+ {
+ Stream_ << line;
+ }
+
+ private:
+ TUnbufferedFileOutput Stream_;
+ };
+
+ ILoggerPtr CreateFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append) // Factory for TFileLogger.
+ {
+ return new TFileLogger(cutLevel, path, append);
+ }
+////////////////////////////////////////////////////////////////////////////////
+
+ class TBufferedFileLogger // Same as TFileLogger, except that lines go through TFileOutput instead of TUnbufferedFileOutput.
+ : public TLoggerBase
+ {
+ public:
+ TBufferedFileLogger(ELevel cutLevel, const TString& path, bool append)
+ : TLoggerBase(cutLevel)
+ , Stream_(TFile(path, OpenAlways | WrOnly | Seq | (append ? ForAppend : EOpenMode()))) // ForAppend is added only when `append` is set.
+ { }
+
+ void OutputLine(const TString& line) override
+ {
+ Stream_ << line;
+ }
+
+ private:
+ TFileOutput Stream_;
+ };
+
+ ILoggerPtr CreateBufferedFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append) // Factory for TBufferedFileLogger.
+ {
+ return new TBufferedFileLogger(cutLevel, path, append);
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+ static TRWMutex LoggerMutex; // Guards `Logger` in SetLogger() / GetLogger().
+ static ILoggerPtr Logger; // Currently installed logger.
+
+ struct TLoggerInitializer // Its constructor installs a TNullLogger as the initial logger.
+ {
+ TLoggerInitializer()
+ {
+ Logger = new TNullLogger;
+ }
+ } LoggerInitializer;
+
+ // Installs `logger` as the current logger.
+ // A null pointer installs a fresh TNullLogger instead, so the stored logger is never null.
+ void SetLogger(ILoggerPtr logger)
+ {
+     auto guard = TWriteGuard(LoggerMutex);
+     if (!logger) {
+         Logger = new TNullLogger;
+         return;
+     }
+     Logger = logger;
+ }
+
+ ILoggerPtr GetLogger() // Returns a copy of the current logger pointer, taken under the read lock.
+ {
+ auto guard = TReadGuard(LoggerMutex);
+ return Logger;
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+}
+
diff --git a/yt/cpp/mapreduce/interface/logging/logger.h b/yt/cpp/mapreduce/interface/logging/logger.h
new file mode 100644
index 0000000000..2b5aae87d1
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/logging/logger.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <util/generic/ptr.h>
+#include <util/generic/string.h>
+#include <util/system/compat.h>
+#include <util/system/src_location.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+ class ILogger // Interface of a sink for printf-style log messages.
+ : public TThrRefBase
+ {
+ public:
+ enum ELevel
+ {
+ FATAL /* "fatal", "FATAL" */,
+ // We don't have such level as `warning', but we support it for compatibility with other APIs.
+ ERROR /* "error", "warning", "ERROR", "WARNING" */,
+ INFO /* "info", "INFO" */,
+ DEBUG /* "debug", "DEBUG" */
+ };
+ // `format` and `args` form a printf-style message; `sourceLocation` identifies the call site.
+ virtual void Log(ELevel level, const ::TSourceLocation& sourceLocation, const char* format, va_list args) = 0;
+ };
+
+ using ILoggerPtr = ::TIntrusivePtr<ILogger>;
+
+ void SetLogger(ILoggerPtr logger); // Installs the logger returned by GetLogger().
+ ILoggerPtr GetLogger(); // Returns the currently installed logger.
+ // Factories of the loggers implemented by the library; `cutLevel` is the level threshold of the created logger.
+ ILoggerPtr CreateStdErrLogger(ILogger::ELevel cutLevel);
+ ILoggerPtr CreateFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append = false);
+
+ /**
+ * Create logger that writes to a file in a buffered manner.
+ * It should result in fewer system calls (useful if you expect a lot of log messages),
+ * but in case of a crash, you would lose some log messages that haven't been flushed yet.
+ */
+ ILoggerPtr CreateBufferedFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append = false);
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/logging/ya.make b/yt/cpp/mapreduce/interface/logging/ya.make
new file mode 100644
index 0000000000..8095bfe4ba
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/logging/ya.make
@@ -0,0 +1,16 @@
+LIBRARY()
+
+INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc)
+
+SRCS(
+ logger.cpp
+ yt_log.cpp
+)
+
+PEERDIR(
+ library/cpp/yt/logging
+)
+
+GENERATE_ENUM_SERIALIZATION(logger.h)
+
+END()
diff --git a/yt/cpp/mapreduce/interface/logging/yt_log.cpp b/yt/cpp/mapreduce/interface/logging/yt_log.cpp
new file mode 100644
index 0000000000..9fa7b91580
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/logging/yt_log.cpp
@@ -0,0 +1,126 @@
+#include "yt_log.h"
+
+#include "logger.h"
+
+#include <util/generic/guid.h>
+
+#include <util/system/mutex.h>
+
+namespace NYT {
+
+using namespace NLogging;
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+
+/// ILogManager implementation with a single category ("Wrapper") that forwards
+/// every enqueued event to the logger returned by GetLogger().
+class TLogManager
+    : public ILogManager
+{
+public:
+    static constexpr TStringBuf CategoryName = "Wrapper";
+
+public:
+    /// Enables `anchor` and fills in its location/message on the first call;
+    /// later calls for the same anchor return immediately.
+    void RegisterStaticAnchor(
+        TLoggingAnchor* anchor,
+        ::TSourceLocation sourceLocation,
+        TStringBuf anchorMessage) override
+    {
+        // exchange() yields the previous flag value, so only the first caller proceeds.
+        if (anchor->Registered.exchange(true)) {
+            return;
+        }
+
+        anchor->Enabled.store(true);
+
+        auto guard = Guard(Mutex_);
+        anchor->SourceLocation = sourceLocation;
+        anchor->AnchorMessage = anchorMessage;
+    }
+
+    /// Intentionally does nothing.
+    void UpdateAnchor(TLoggingAnchor* /*position*/) override
+    { }
+
+    /// Passes the event text, level and source location to the current ILogger.
+    void Enqueue(TLogEvent&& event) override
+    {
+        LogMessage(
+            ToImplLevel(event.Level),
+            ::TSourceLocation(event.SourceFile, event.SourceLine),
+            "%.*s",
+            // The `*` precision of "%.*s" is read from the varargs as an `int`;
+            // passing size_t there is undefined behavior, so convert explicitly.
+            static_cast<int>(event.MessageRef.size()),
+            event.MessageRef.begin());
+    }
+
+    /// Returns the only supported category; any other name aborts via Y_VERIFY.
+    const TLoggingCategory* GetCategory(TStringBuf categoryName) override
+    {
+        Y_VERIFY(categoryName == CategoryName);
+        return &Category_;
+    }
+
+    /// Not supported: aborts if called.
+    void UpdateCategory(TLoggingCategory* /*category*/) override
+    {
+        Y_FAIL();
+    }
+
+    bool GetAbortOnAlert() const override
+    {
+        return false;
+    }
+
+private:
+    /// Collapses the ELogLevel scale onto the four ILogger::ELevel values.
+    static ILogger::ELevel ToImplLevel(ELogLevel level)
+    {
+        switch (level) {
+            case ELogLevel::Minimum:
+            case ELogLevel::Trace:
+            case ELogLevel::Debug:
+                return ILogger::ELevel::DEBUG;
+            case ELogLevel::Info:
+                return ILogger::ELevel::INFO;
+            case ELogLevel::Warning:
+            case ELogLevel::Error:
+                return ILogger::ELevel::ERROR;
+            case ELogLevel::Alert:
+            case ELogLevel::Fatal:
+            case ELogLevel::Maximum:
+                return ILogger::ELevel::FATAL;
+        }
+        // A value outside the enumerators would otherwise flow off the end of a
+        // non-void function (undefined behavior); fail loudly instead.
+        Y_FAIL("Unexpected log level");
+    }
+
+    /// printf-style adapter: packs the variadic arguments into a va_list for ILogger::Log.
+    static void LogMessage(ILogger::ELevel level, const ::TSourceLocation& sourceLocation, const char* format, ...)
+    {
+        va_list args;
+        va_start(args, format);
+        GetLogger()->Log(level, sourceLocation, format, args);
+        va_end(args);
+    }
+
+private:
+    ::TMutex Mutex_;
+    std::atomic<int> ActualVersion_{1};
+    const TLoggingCategory Category_{
+        .Name{CategoryName},
+        .MinPlainTextLevel{ELogLevel::Minimum},
+        .CurrentVersion{1},
+        .ActualVersion = &ActualVersion_,
+    };
+};
+
+TLogManager LogManager;
+
+} // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+
+TLogger Logger(&LogManager, TLogManager::CategoryName);
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Appends the textual form of `value` (as produced by GetGuidAsString) to `builder`.
+/// The format specification argument is ignored.
+void FormatValue(TStringBuilderBase* builder, const TGUID& value, TStringBuf /*format*/)
+{
+    const auto guidText = GetGuidAsString(value);
+    builder->AppendString(guidText);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/logging/yt_log.h b/yt/cpp/mapreduce/interface/logging/yt_log.h
new file mode 100644
index 0000000000..4cf93a6ba1
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/logging/yt_log.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <library/cpp/yt/logging/logger.h>
+
+struct TGUID;
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+extern NLogging::TLogger Logger;
+
+void FormatValue(TStringBuilderBase* builder, const TGUID& value, TStringBuf format);
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/mpl.h b/yt/cpp/mapreduce/interface/mpl.h
new file mode 100644
index 0000000000..9865e28b6c
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/mpl.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include "fwd.h"
+
+#include <tuple>
+#include <type_traits>
+
+namespace NYT {
+
+/// @cond Doxygen_Suppress
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// `Value` is true when TBase is a base class of TDerived and the two types differ.
+template <class TBase, class TDerived>
+struct TIsBaseOf
+{
+    static constexpr bool Value = std::conjunction_v<
+        std::is_base_of<TBase, TDerived>,
+        std::negation<std::is_same<TBase, TDerived>>>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace NDetail {
+
+/// `Value` is the zero-based position of the first occurrence of T in the tuple's
+/// type list, or the tuple size when T does not occur in it.
+template <class T, class TTuple>
+struct TIndexInTuple;
+
+// End of the list reached: contributes nothing to the index.
+template <class T>
+struct TIndexInTuple<T, std::tuple<>>
+{
+    static constexpr int Value = 0;
+};
+
+// Head matches T: the search stops here.
+template <class T, class... TTail>
+struct TIndexInTuple<T, std::tuple<T, TTail...>>
+{
+    static constexpr int Value = 0;
+};
+
+// Head differs from T: skip it and continue with the tail.
+template <class T, class THead, class... TTail>
+struct TIndexInTuple<T, std::tuple<THead, TTail...>>
+{
+    static constexpr int Value = TIndexInTuple<T, std::tuple<TTail...>>::Value + 1;
+};
+
+/// True when T occurs in TTuple, i.e. when its index is smaller than the tuple size.
+template <class T, class TTuple>
+constexpr bool DoesTupleContainType =
+    static_cast<std::size_t>(TIndexInTuple<T, TTuple>::Value) < std::tuple_size_v<TTuple>;
+
+/// Moves the types of TIn into TOut one by one, skipping those TOut already contains;
+/// `TType` is the resulting tuple.
+template <class TOut, class TIn = std::tuple<>>
+struct TUniqueTypes;
+
+// Nothing left to consume: the accumulated tuple is the answer.
+template <class TOut>
+struct TUniqueTypes<TOut, std::tuple<>>
+{
+    using TType = TOut;
+};
+
+template <class... TOut, class TInCar, class... TInCdr>
+struct TUniqueTypes<std::tuple<TOut...>, std::tuple<TInCar, TInCdr...>>
+{
+private:
+    // Accumulator for the next step: unchanged if the head type is already present.
+    using TNextOut = std::conditional_t<
+        DoesTupleContainType<TInCar, std::tuple<TOut...>>,
+        std::tuple<TOut...>,
+        std::tuple<TOut..., TInCar>
+    >;
+
+public:
+    using TType = typename TUniqueTypes<TNextOut, std::tuple<TInCdr...>>::TType;
+};
+
+} // namespace NDetail
+
+/// @endcond Doxygen_Suppress
+
+////////////////////////////////////////////////////////////////////////////////
+
+}
diff --git a/yt/cpp/mapreduce/interface/node.h b/yt/cpp/mapreduce/interface/node.h
new file mode 100644
index 0000000000..fece1b36de
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/node.h
@@ -0,0 +1,7 @@
+#pragma once
+
+// Backward compatibility
+#include "fwd.h"
+#include <library/cpp/yson/node/node.h>
+
+
diff --git a/yt/cpp/mapreduce/interface/operation-inl.h b/yt/cpp/mapreduce/interface/operation-inl.h
new file mode 100644
index 0000000000..8d53cd446f
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/operation-inl.h
@@ -0,0 +1,928 @@
+#pragma once
+
+#ifndef OPERATION_INL_H_
+#error "Direct inclusion of this file is not allowed, use operation.h"
+#include "operation.h"
+#endif
+#undef OPERATION_INL_H_
+
+#include "errors.h"
+
+#include <util/generic/bt_exception.h>
+#include <util/generic/singleton.h>
+#include <util/system/type_name.h>
+
+#include <util/stream/file.h>
+#include <util/stream/buffer.h>
+#include <util/string/subst.h>
+
+#include <typeindex>
+
+namespace NYT {
+
+namespace NDetail {
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Stores `value` at position `idx`, growing `array` first when `idx` is past its end.
+template<class T>
+void Assign(TVector<T>& array, size_t idx, const T& value) {
+    if (array.size() <= idx) {
+        array.resize(idx + 1);
+    }
+    array[idx] = value;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Maps the row type TRow to the structured row stream description used for it.
+// A TRow not covered by any branch below is rejected at compile time.
+template <typename TRow>
+TStructuredRowStreamDescription GetStructuredRowStreamDescription()
+{
+ if constexpr (std::is_same_v<TRow, NYT::TNode>) {
+ return TTNodeStructuredRowStream{};
+ } else if constexpr (std::is_same_v<TRow, NYT::TYaMRRow>) {
+ return TTYaMRRowStructuredRowStream{};
+ } else if constexpr (std::is_same_v<::google::protobuf::Message, TRow>) {
+ // The base Message type itself: described with a null descriptor.
+ return TProtobufStructuredRowStream{nullptr};
+ } else if constexpr (TIsBaseOf<::google::protobuf::Message, TRow>::Value) {
+ // A strict subtype of Message: use the descriptor of that concrete type.
+ return TProtobufStructuredRowStream{TRow::descriptor()};
+ } else if constexpr (TIsProtoOneOf<TRow>::value) {
+ // Types accepted by TIsProtoOneOf are also described with a null descriptor.
+ return TProtobufStructuredRowStream{nullptr};
+ } else {
+ static_assert(TDependentFalse<TRow>, "Unknown row type");
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NDetail
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Pairs `richYPath` with the table structure derived from the row type TRow.
+template <typename TRow>
+TStructuredTablePath Structured(TRichYPath richYPath)
+{
+    auto description = StructuredTableDescription<TRow>();
+    return TStructuredTablePath(std::move(richYPath), std::move(description));
+}
+
+// Returns the table structure for the row type TRow: unspecified for TNode and TYaMRRow,
+// descriptor-based for strict subtypes of ::google::protobuf::Message.
+// The base Message type and any other type are rejected at compile time.
+template <typename TRow>
+TTableStructure StructuredTableDescription()
+{
+    constexpr bool isSchemalessRow =
+        std::is_same_v<TRow, NYT::TNode> || std::is_same_v<TRow, NYT::TYaMRRow>;
+
+    if constexpr (isSchemalessRow) {
+        return TUnspecifiedTableStructure{};
+    } else if constexpr (std::is_same_v<::google::protobuf::Message, TRow>) {
+        static_assert(TDependentFalse<TRow>, "Cannot use ::google::protobuf::Message as table descriptor");
+    } else if constexpr (std::is_base_of_v<::google::protobuf::Message, TRow>) {
+        return TProtobufTableStructure{TRow::descriptor()};
+    } else {
+        static_assert(TDependentFalse<TRow>, "Unknown row type");
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Appends `path` to the list of input tables and returns the derived spec for chaining.
+template <typename TDerived>
+TDerived& TRawOperationIoTableSpec<TDerived>::AddInput(const TRichYPath& path)
+{
+    Inputs_.emplace_back(path);
+    return *static_cast<TDerived*>(this);
+}
+
+// Stores `path` as the input table number `tableIndex`, growing the list if needed,
+// and returns the derived spec for chaining.
+template <typename TDerived>
+TDerived& TRawOperationIoTableSpec<TDerived>::SetInput(size_t tableIndex, const TRichYPath& path)
+{
+    NDetail::Assign(Inputs_, tableIndex, path);
+    // The function is declared to return TDerived&; flowing off the end without
+    // a return statement is undefined behavior.
+    return static_cast<TDerived&>(*this);
+}
+
+// Appends `path` to the list of output tables and returns the derived spec for chaining.
+template <typename TDerived>
+TDerived& TRawOperationIoTableSpec<TDerived>::AddOutput(const TRichYPath& path)
+{
+    Outputs_.emplace_back(path);
+    return *static_cast<TDerived*>(this);
+}
+
+// Stores `path` as the output table number `tableIndex`, growing the list if needed,
+// and returns the derived spec for chaining.
+template <typename TDerived>
+TDerived& TRawOperationIoTableSpec<TDerived>::SetOutput(size_t tableIndex, const TRichYPath& path)
+{
+    NDetail::Assign(Outputs_, tableIndex, path);
+    // The function is declared to return TDerived&; flowing off the end without
+    // a return statement is undefined behavior.
+    return static_cast<TDerived&>(*this);
+}
+
+// Read-only access to the input table paths collected so far.
+template <typename TDerived>
+const TVector<TRichYPath>& TRawOperationIoTableSpec<TDerived>::GetInputs() const
+{
+ return Inputs_;
+}
+
+// Read-only access to the output table paths collected so far.
+template <typename TDerived>
+const TVector<TRichYPath>& TRawOperationIoTableSpec<TDerived>::GetOutputs() const
+{
+ return Outputs_;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Appends `path` to the list of map output tables and returns the derived spec for chaining.
+template <typename TDerived>
+TDerived& TRawMapReduceOperationIoSpec<TDerived>::AddMapOutput(const TRichYPath& path)
+{
+    MapOutputs_.emplace_back(path);
+    return *static_cast<TDerived*>(this);
+}
+
+// Stores `path` as the map output table number `tableIndex`, growing the list if needed,
+// and returns the derived spec for chaining.
+template <typename TDerived>
+TDerived& TRawMapReduceOperationIoSpec<TDerived>::SetMapOutput(size_t tableIndex, const TRichYPath& path)
+{
+    NDetail::Assign(MapOutputs_, tableIndex, path);
+    // The function is declared to return TDerived&; flowing off the end without
+    // a return statement is undefined behavior.
+    return static_cast<TDerived&>(*this);
+}
+
+// Read-only access to the map output table paths collected so far.
+template <typename TDerived>
+const TVector<TRichYPath>& TRawMapReduceOperationIoSpec<TDerived>::GetMapOutputs() const
+{
+ return MapOutputs_;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+::TIntrusivePtr<INodeReaderImpl> CreateJobNodeReader(TRawTableReaderPtr rawTableReader);
+::TIntrusivePtr<IYaMRReaderImpl> CreateJobYaMRReader(TRawTableReaderPtr rawTableReader);
+::TIntrusivePtr<IProtoReaderImpl> CreateJobProtoReader(TRawTableReaderPtr rawTableReader);
+
+::TIntrusivePtr<INodeWriterImpl> CreateJobNodeWriter(THolder<IProxyOutput> rawTableWriter);
+::TIntrusivePtr<IYaMRWriterImpl> CreateJobYaMRWriter(THolder<IProxyOutput> rawTableWriter);
+::TIntrusivePtr<IProtoWriterImpl> CreateJobProtoWriter(THolder<IProxyOutput> rawTableWriter);
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Creates the reader implementation matching the row type T on top of `rawTableReader`.
+// TNode, TYaMRRow and Message are handled by the explicit specializations below;
+// the generic definition covers the remaining protobuf-related row types.
+template <class T>
+inline ::TIntrusivePtr<typename TRowTraits<T>::IReaderImpl> CreateJobReaderImpl(TRawTableReaderPtr rawTableReader);
+
+template <>
+inline ::TIntrusivePtr<INodeReaderImpl> CreateJobReaderImpl<TNode>(TRawTableReaderPtr rawTableReader)
+{
+ return CreateJobNodeReader(rawTableReader);
+}
+
+template <>
+inline ::TIntrusivePtr<IYaMRReaderImpl> CreateJobReaderImpl<TYaMRRow>(TRawTableReaderPtr rawTableReader)
+{
+ return CreateJobYaMRReader(rawTableReader);
+}
+
+template <>
+inline ::TIntrusivePtr<IProtoReaderImpl> CreateJobReaderImpl<Message>(TRawTableReaderPtr rawTableReader)
+{
+ return CreateJobProtoReader(rawTableReader);
+}
+
+// Generic case: strict subtypes of Message and types accepted by NDetail::TIsProtoOneOf
+// use the protobuf reader; any other T is a compile-time error.
+template <class T>
+inline ::TIntrusivePtr<typename TRowTraits<T>::IReaderImpl> CreateJobReaderImpl(TRawTableReaderPtr rawTableReader)
+{
+ if constexpr (TIsBaseOf<Message, T>::Value || NDetail::TIsProtoOneOf<T>::value) {
+ return CreateJobProtoReader(rawTableReader);
+ } else {
+ static_assert(TDependentFalse<T>, "Unknown row type");
+ }
+}
+
+// Builds a typed table reader for rows of type T on top of `rawTableReader`.
+template <class T>
+inline TTableReaderPtr<T> CreateJobReader(TRawTableReaderPtr rawTableReader)
+{
+    auto readerImpl = CreateJobReaderImpl<T>(rawTableReader);
+    return new TTableReader<T>(std::move(readerImpl));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Creates a typed table writer for rows of type T that takes ownership of `rawJobWriter`.
+// TNode, TYaMRRow and Message are handled by the explicit specializations below;
+// the generic definition covers strict subtypes of Message.
+template <class T>
+TTableWriterPtr<T> CreateJobWriter(THolder<IProxyOutput> rawJobWriter);
+
+template <>
+inline TTableWriterPtr<TNode> CreateJobWriter<TNode>(THolder<IProxyOutput> rawJobWriter)
+{
+ return new TTableWriter<TNode>(CreateJobNodeWriter(std::move(rawJobWriter)));
+}
+
+template <>
+inline TTableWriterPtr<TYaMRRow> CreateJobWriter<TYaMRRow>(THolder<IProxyOutput> rawJobWriter)
+{
+ return new TTableWriter<TYaMRRow>(CreateJobYaMRWriter(std::move(rawJobWriter)));
+}
+
+template <>
+inline TTableWriterPtr<Message> CreateJobWriter<Message>(THolder<IProxyOutput> rawJobWriter)
+{
+ return new TTableWriter<Message>(CreateJobProtoWriter(std::move(rawJobWriter)));
+}
+
+// Helper that wraps a protobuf writer implementation into TTableWriter<T>;
+// only defined for strict subtypes of Message.
+template <class T, class = void>
+struct TProtoWriterCreator;
+
+template <class T>
+struct TProtoWriterCreator<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>>
+{
+ static TTableWriterPtr<T> Create(::TIntrusivePtr<IProtoWriterImpl> writer)
+ {
+ return new TTableWriter<T>(writer);
+ }
+};
+
+// Generic case: strict subtypes of Message use the protobuf writer;
+// any other T is a compile-time error.
+template <class T>
+inline TTableWriterPtr<T> CreateJobWriter(THolder<IProxyOutput> rawJobWriter)
+{
+ if constexpr (TIsBaseOf<Message, T>::Value) {
+ return TProtoWriterCreator<T>::Create(CreateJobProtoWriter(std::move(rawJobWriter)));
+ } else {
+ static_assert(TDependentFalse<T>, "Unknown row type");
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Appends `path` both to the plain input list and, paired with the structure
+// derived from row type T, to the structured input list.
+template <class T>
+void TOperationInputSpecBase::AddInput(const TRichYPath& path)
+{
+ Inputs_.push_back(path);
+ StructuredInputs_.emplace_back(Structured<T>(path));
+}
+
+// Same as AddInput, but stores the entry at position `tableIndex` in both lists,
+// growing them if needed.
+template <class T>
+void TOperationInputSpecBase::SetInput(size_t tableIndex, const TRichYPath& path)
+{
+ NDetail::Assign(Inputs_, tableIndex, path);
+ NDetail::Assign(StructuredInputs_, tableIndex, Structured<T>(path));
+}
+
+
+// Appends `path` both to the plain output list and, paired with the structure
+// derived from row type T, to the structured output list.
+template <class T>
+void TOperationOutputSpecBase::AddOutput(const TRichYPath& path)
+{
+ Outputs_.push_back(path);
+ StructuredOutputs_.emplace_back(Structured<T>(path));
+}
+
+// Same as AddOutput, but stores the entry at position `tableIndex` in both lists,
+// growing them if needed.
+template <class T>
+void TOperationOutputSpecBase::SetOutput(size_t tableIndex, const TRichYPath& path)
+{
+ NDetail::Assign(Outputs_, tableIndex, path);
+ NDetail::Assign(StructuredOutputs_, tableIndex, Structured<T>(path));
+}
+
+// Adds an input table with row type T and returns the derived spec for chaining.
+// T must not be the base Message type itself.
+template <class TDerived>
+template <class T>
+TDerived& TOperationIOSpec<TDerived>::AddInput(const TRichYPath& path)
+{
+    static_assert(
+        !std::is_same_v<T, Message>,
+        "input type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)");
+    TOperationInputSpecBase::AddInput<T>(path);
+    return static_cast<TDerived&>(*this);
+}
+
+// Sets the input table number `tableIndex` with row type T and returns the derived spec for chaining.
+// T must not be the base Message type itself.
+template <class TDerived>
+template <class T>
+TDerived& TOperationIOSpec<TDerived>::SetInput(size_t tableIndex, const TRichYPath& path)
+{
+    static_assert(
+        !std::is_same_v<T, Message>,
+        "input type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)");
+    TOperationInputSpecBase::SetInput<T>(tableIndex, path);
+    return static_cast<TDerived&>(*this);
+}
+
+
+// Adds an output table with row type T and returns the derived spec for chaining.
+// T must not be the base Message type itself.
+template <class TDerived>
+template <class T>
+TDerived& TOperationIOSpec<TDerived>::AddOutput(const TRichYPath& path)
+{
+    static_assert(
+        !std::is_same_v<T, Message>,
+        "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)");
+    TOperationOutputSpecBase::AddOutput<T>(path);
+    return static_cast<TDerived&>(*this);
+}
+
+// Sets the output table number `tableIndex` with row type T and returns the derived spec for chaining.
+// T must not be the base Message type itself.
+template <class TDerived>
+template <class T>
+TDerived& TOperationIOSpec<TDerived>::SetOutput(size_t tableIndex, const TRichYPath& path)
+{
+    static_assert(
+        !std::is_same_v<T, Message>,
+        "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)");
+    TOperationOutputSpecBase::SetOutput<T>(tableIndex, path);
+    return static_cast<TDerived&>(*this);
+}
+
+// Forwards an already structured input path to the base spec and returns the derived spec for chaining.
+template <class TDerived>
+TDerived& TOperationIOSpec<TDerived>::AddStructuredInput(TStructuredTablePath path)
+{
+ TOperationInputSpecBase::AddStructuredInput(std::move(path));
+ return *static_cast<TDerived*>(this);
+}
+
+// Forwards an already structured output path to the base spec and returns the derived spec for chaining.
+template <class TDerived>
+TDerived& TOperationIOSpec<TDerived>::AddStructuredOutput(TStructuredTablePath path)
+{
+ TOperationOutputSpecBase::AddStructuredOutput(std::move(path));
+ return *static_cast<TDerived*>(this);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Adds an output table with row type T to the task and returns the task for chaining.
+// T must not be the base Message type itself.
+template <class T>
+TVanillaTask& TVanillaTask::AddOutput(const TRichYPath& path)
+{
+    static_assert(
+        !std::is_same_v<T, Message>,
+        "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)");
+    TOperationOutputSpecBase::AddOutput<T>(path);
+    return *this;
+}
+
+// Sets the output table number `tableIndex` with row type T and returns the task for chaining.
+// T must not be the base Message type itself.
+template <class T>
+TVanillaTask& TVanillaTask::SetOutput(size_t tableIndex, const TRichYPath& path)
+{
+    static_assert(
+        !std::is_same_v<T, Message>,
+        "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)");
+    TOperationOutputSpecBase::SetOutput<T>(tableIndex, path);
+    return *this;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace NDetail {
+
+void ResetUseClientProtobuf(const char* methodName);
+
+} // namespace NDetail
+
+// Adds a protobuf input table without a concrete message type: the structured entry
+// carries a null descriptor. Calls NDetail::ResetUseClientProtobuf with this method's name first.
+template <class TDerived>
+TDerived& TOperationIOSpec<TDerived>::AddProtobufInput_VerySlow_Deprecated(const TRichYPath& path)
+{
+ NDetail::ResetUseClientProtobuf("AddProtobufInput_VerySlow_Deprecated");
+ Inputs_.push_back(path);
+ StructuredInputs_.emplace_back(TStructuredTablePath(path, TProtobufTableStructure{nullptr}));
+ return *static_cast<TDerived*>(this);
+}
+
+// Adds a protobuf output table without a concrete message type: the structured entry
+// carries a null descriptor. Calls NDetail::ResetUseClientProtobuf with this method's name first.
+template <class TDerived>
+TDerived& TOperationIOSpec<TDerived>::AddProtobufOutput_VerySlow_Deprecated(const TRichYPath& path)
+{
+ NDetail::ResetUseClientProtobuf("AddProtobufOutput_VerySlow_Deprecated");
+ Outputs_.push_back(path);
+ StructuredOutputs_.emplace_back(TStructuredTablePath(path, TProtobufTableStructure{nullptr}));
+ return *static_cast<TDerived*>(this);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Applies InputDescription<TRow> to every table index of the group; returns the group for chaining.
+template <typename TRow>
+TJobOperationPreparer::TInputGroup& TJobOperationPreparer::TInputGroup::Description()
+{
+    for (const auto tableIndex : Indices_) {
+        Preparer_.InputDescription<TRow>(tableIndex);
+    }
+    return *this;
+}
+
+// Applies OutputDescription<TRow> (forwarding `inferSchema`) to every table index of the group;
+// returns the group for chaining.
+template <typename TRow>
+TJobOperationPreparer::TOutputGroup& TJobOperationPreparer::TOutputGroup::Description(bool inferSchema)
+{
+    for (const auto tableIndex : Indices_) {
+        Preparer_.OutputDescription<TRow>(tableIndex, inferSchema);
+    }
+    return *this;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Validates every index in `indices` as an input table index and returns a group over them.
+// `indices` may be any container usable with range-for and std::begin/std::end;
+// its elements are copied into a TVector<int>.
+template <typename TCont>
+TJobOperationPreparer::TInputGroup TJobOperationPreparer::BeginInputGroup(const TCont& indices)
+{
+ for (auto i : indices) {
+ ValidateInputTableIndex(i, TStringBuf("BeginInputGroup()"));
+ }
+ return TInputGroup(*this, TVector<int>(std::begin(indices), std::end(indices)));
+}
+
+// Validates every index in `indices` as an output table index and returns a group over them.
+// `indices` may be any container usable with range-for and std::begin/std::end.
+template <typename TCont>
+TJobOperationPreparer::TOutputGroup TJobOperationPreparer::BeginOutputGroup(const TCont& indices)
+{
+    for (auto i : indices) {
+        ValidateOutputTableIndex(i, TStringBuf("BeginOutputGroup()"));
+    }
+    // Copy the indices into a TVector<int> exactly like BeginInputGroup() does, so that
+    // containers other than TVector<int> are accepted here as well.
+    return TOutputGroup(*this, TVector<int>(std::begin(indices), std::end(indices)));
+}
+
+
+// Records the table structure derived from TRow for input table `tableIndex`.
+// ValidateMissingInputDescription(tableIndex) is called first
+// (NOTE(review): presumably rejects a second description for the same table — confirm).
+template <typename TRow>
+TJobOperationPreparer& TJobOperationPreparer::InputDescription(int tableIndex)
+{
+ ValidateMissingInputDescription(tableIndex);
+ InputTableDescriptions_[tableIndex] = StructuredTableDescription<TRow>();
+ return *this;
+}
+
+// Records the table structure derived from TRow for output table `tableIndex`.
+// When `inferSchema` is set and no schema is stored for that table yet,
+// the schema is additionally created from TRow.
+template <typename TRow>
+TJobOperationPreparer& TJobOperationPreparer::OutputDescription(int tableIndex, bool inferSchema)
+{
+ ValidateMissingOutputDescription(tableIndex);
+ OutputTableDescriptions_[tableIndex] = StructuredTableDescription<TRow>();
+ // An already present schema is never overwritten.
+ if (inferSchema && !OutputSchemas_[tableIndex]) {
+ OutputSchemas_[tableIndex] = CreateTableSchema<TRow>();
+ }
+ return *this;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Stores the table structure derived from TRow as the intermediate map output description.
+template <class TDerived>
+template <class TRow>
+TDerived& TIntermediateTablesHintSpec<TDerived>::HintMapOutput()
+{
+ IntermediateMapOutputDescription_ = StructuredTableDescription<TRow>();
+ return *static_cast<TDerived*>(this);
+}
+
+// Appends `path` to the map outputs, both as a plain path and as a structured path
+// with the structure derived from TRow.
+template <class TDerived>
+template <class TRow>
+TDerived& TIntermediateTablesHintSpec<TDerived>::AddMapOutput(const TRichYPath& path)
+{
+ MapOutputs_.push_back(path);
+ StructuredMapOutputs_.emplace_back(Structured<TRow>(path));
+ return *static_cast<TDerived*>(this);
+}
+
+// Stores the table structure derived from TRow as the reduce combiner input description.
+template <class TDerived>
+template <class TRow>
+TDerived& TIntermediateTablesHintSpec<TDerived>::HintReduceCombinerInput()
+{
+ IntermediateReduceCombinerInputDescription_ = StructuredTableDescription<TRow>();
+ return *static_cast<TDerived*>(this);
+}
+
+// Stores the table structure derived from TRow as the reduce combiner output description.
+template <class TDerived>
+template <class TRow>
+TDerived& TIntermediateTablesHintSpec<TDerived>::HintReduceCombinerOutput()
+{
+ IntermediateReduceCombinerOutputDescription_ = StructuredTableDescription<TRow>();
+ return *static_cast<TDerived*>(this);
+}
+
+// Stores the table structure derived from TRow as the reducer input description.
+template <class TDerived>
+template <class TRow>
+TDerived& TIntermediateTablesHintSpec<TDerived>::HintReduceInput()
+{
+ IntermediateReducerInputDescription_ = StructuredTableDescription<TRow>();
+ return *static_cast<TDerived*>(this);
+}
+
+// Read-only access to the structured map output paths.
+template <class TDerived>
+const TVector<TStructuredTablePath>& TIntermediateTablesHintSpec<TDerived>::GetStructuredMapOutputs() const
+{
+ return StructuredMapOutputs_;
+}
+
+// The getters below return the stored hints; each is an empty TMaybe until the
+// corresponding Hint* method assigns it.
+template <class TDerived>
+const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateMapOutputDescription() const
+{
+ return IntermediateMapOutputDescription_;
+}
+
+template <class TDerived>
+const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateReduceCombinerInputDescription() const
+{
+ return IntermediateReduceCombinerInputDescription_;
+}
+
+template <class TDerived>
+const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateReduceCombinerOutputDescription() const
+{
+ return IntermediateReduceCombinerOutputDescription_;
+}
+
+template <class TDerived>
+const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateReducerInputDescription() const
+{
+ return IntermediateReducerInputDescription_;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Singleton state shared by reducers: holds the flag that asks the reduce loop to stop.
+struct TReducerContext
+{
+ // Set by IReducer::Break(); checked after each reducer->Do() call in FeedJobInput.
+ bool Break = false;
+ static TReducerContext* Get() { return Singleton<TReducerContext>(); }
+};
+
+// Raises the Break flag in the shared TReducerContext singleton.
+template <class TR, class TW>
+inline void IReducer<TR, TW>::Break()
+{
+ TReducerContext::Get()->Break = true;
+}
+
+// Mapper overload: wraps `readerImpl` into a typed table reader and passes
+// the whole input to a single mapper->Do() call.
+template <typename TReader, typename TWriter>
+void FeedJobInput(
+ IMapper<TReader, TWriter>* mapper,
+ typename TRowTraits<typename TReader::TRowType>::IReaderImpl* readerImpl,
+ TWriter* writer)
+{
+ using TInputRow = typename TReader::TRowType;
+
+ auto reader = MakeIntrusive<TTableReader<TInputRow>>(readerImpl);
+ mapper->Do(reader.Get(), writer);
+}
+
+// Reducer overload: calls reducer->Do() once per range of the input and stops early
+// as soon as the Break flag of TReducerContext is raised.
+template <typename TReader, typename TWriter>
+void FeedJobInput(
+    IReducer<TReader, TWriter>* reducer,
+    typename TRowTraits<typename TReader::TRowType>::IReaderImpl* readerImpl,
+    TWriter* writer)
+{
+    using TInputRow = typename TReader::TRowType;
+
+    auto rangesReader = MakeIntrusive<TTableRangesReader<TInputRow>>(readerImpl);
+    while (rangesReader->IsValid()) {
+        reducer->Do(&rangesReader->GetRange(), writer);
+        if (TReducerContext::Get()->Break) {
+            return;
+        }
+        rangesReader->Next();
+    }
+}
+
+// Aggregator reducer overload: hands the ranges reader itself to a single reducer->Do() call,
+// so iteration over ranges is left to the reducer.
+template <typename TReader, typename TWriter>
+void FeedJobInput(
+ IAggregatorReducer<TReader, TWriter>* reducer,
+ typename TRowTraits<typename TReader::TRowType>::IReaderImpl* readerImpl,
+ TWriter* writer)
+{
+ using TInputRow = typename TReader::TRowType;
+
+ auto rangesReader = MakeIntrusive<TTableRangesReader<TInputRow>>(readerImpl);
+ reducer->Do(rangesReader.Get(), writer);
+}
+
+// Entry point of a raw job: default-constructs the job, restores its state from
+// `jobStateStream` and runs it with a context built for `outputTableCount` tables.
+// Always returns 0.
+template <class TRawJob>
+int RunRawJob(size_t outputTableCount, IInputStream& jobStateStream)
+{
+ TRawJobContext context(outputTableCount);
+
+ TRawJob job;
+ job.Load(jobStateStream);
+ job.Do(context);
+ return 0;
+}
+
+// TCommandRawJob is never run through this entry point: calling it aborts.
+template <>
+inline int RunRawJob<TCommandRawJob>(size_t /* outputTableCount */, IInputStream& /* jobStateStream */)
+{
+ Y_FAIL();
+}
+
+// Entry point of a vanilla job: default-constructs the job and restores its state
+// from `jobStateStream`. Jobs derived from IVanillaJob<> must have no output tables
+// and only get Do(); other jobs get a typed writer and the Start/Do/Finish sequence.
+// Always returns 0.
+template <class TVanillaJob>
+int RunVanillaJob(size_t outputTableCount, IInputStream& jobStateStream)
+{
+ TVanillaJob job;
+ job.Load(jobStateStream);
+
+ if constexpr (std::is_base_of<IVanillaJob<>, TVanillaJob>::value) {
+ Y_VERIFY(outputTableCount == 0, "Void vanilla job expects zero 'outputTableCount'");
+ job.Do();
+ } else {
+ Y_VERIFY(outputTableCount, "Vanilla job with table writer expects nonzero 'outputTableCount'");
+ using TOutputRow = typename TVanillaJob::TWriter::TRowType;
+
+ // A writer supplied by the job takes precedence over the default raw writer.
+ THolder<IProxyOutput> rawJobWriter;
+ if (auto customWriter = job.CreateCustomRawJobWriter(outputTableCount)) {
+ rawJobWriter = std::move(customWriter);
+ } else {
+ rawJobWriter = CreateRawJobWriter(outputTableCount);
+ }
+ auto writer = CreateJobWriter<TOutputRow>(std::move(rawJobWriter));
+
+ job.Start(writer.Get());
+ job.Do(writer.Get());
+ job.Finish(writer.Get());
+
+ writer->Finish();
+ }
+ return 0;
+}
+
+// TCommandVanillaJob is never run through this entry point: calling it aborts.
+template <>
+inline int RunVanillaJob<TCommandVanillaJob>(size_t /* outputTableCount */, IInputStream& /* jobStateStream */)
+{
+ Y_FAIL();
+}
+
+// Entry point of a structured job (TJob must be a strict subtype of IStructuredJob):
+// restores the job state from `jobStateStream`, builds the typed reader and writer,
+// then runs Start, FeedJobInput and Finish. Always returns 0.
+template <class TJob>
+ requires TIsBaseOf<IStructuredJob, TJob>::Value
+int RunJob(size_t outputTableCount, IInputStream& jobStateStream)
+{
+ using TInputRow = typename TJob::TReader::TRowType;
+ using TOutputRow = typename TJob::TWriter::TRowType;
+
+ auto job = MakeIntrusive<TJob>();
+ job->Load(jobStateStream);
+
+ // A reader supplied by the job takes precedence over the default raw reader on fd 0.
+ TRawTableReaderPtr rawJobReader;
+ if (auto customReader = job->CreateCustomRawJobReader(/*fd*/ 0)) {
+ rawJobReader = customReader;
+ } else {
+ rawJobReader = CreateRawJobReader(/*fd*/ 0);
+ }
+ auto readerImpl = CreateJobReaderImpl<TInputRow>(rawJobReader);
+
+ // Many users don't expect to have jobs with empty input so we skip such jobs.
+ // Note that the writer is not created and Start()/Finish() are not called in that case.
+ if (!readerImpl->IsValid()) {
+ return 0;
+ }
+
+ // A writer supplied by the job takes precedence over the default raw writer.
+ THolder<IProxyOutput> rawJobWriter;
+ if (auto customWriter = job->CreateCustomRawJobWriter(outputTableCount)) {
+ rawJobWriter = std::move(customWriter);
+ } else {
+ rawJobWriter = CreateRawJobWriter(outputTableCount);
+ }
+ auto writer = CreateJobWriter<TOutputRow>(std::move(rawJobWriter));
+
+ job->Start(writer.Get());
+ FeedJobInput(job.Get(), readerImpl.Get(), writer.Get());
+ job->Finish(writer.Get());
+
+ writer->Finish();
+
+ return 0;
+}
+
+//
+// We keep RunMapJob/RunReduceJob/RunAggregatorReducer for backward compatibility:
+// some users already use them. :(
+
+// Forwards to RunJob<TMapper>.
+template <class TMapper>
+int RunMapJob(size_t outputTableCount, IInputStream& jobStateStream)
+{
+ return RunJob<TMapper>(outputTableCount, jobStateStream);
+}
+
+// Forwards to RunJob<TReducer>.
+template <class TReducer>
+int RunReduceJob(size_t outputTableCount, IInputStream& jobStateStream)
+{
+ return RunJob<TReducer>(outputTableCount, jobStateStream);
+}
+
+// Forwards to RunJob<TReducer>.
+template <class TReducer>
+int RunAggregatorReducer(size_t outputTableCount, IInputStream& jobStateStream)
+{
+ return RunJob<TReducer>(outputTableCount, jobStateStream);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Detection trait: true when the expression T::FromNode(TNode&) is well-formed.
+// NOTE(review): the probe uses a non-const TNode&, while ConstructJobFromNode calls
+// FromNode with a const TNode& — confirm that a FromNode(TNode&) overload is not expected.
+template <typename T, typename = void>
+struct TIsConstructibleFromNode
+ : std::false_type
+{ };
+
+template <typename T>
+struct TIsConstructibleFromNode<T, std::void_t<decltype(T::FromNode(std::declval<TNode&>()))>>
+ : std::true_type
+{ };
+
+// Creates a job instance of type TJob from `node`.
+// If TJob has a FromNode method, `node` must be defined and is passed to it;
+// otherwise `node` must be Undefined and the job is default-constructed.
+// A mismatch in either direction is reported through Y_ENSURE.
+template <class TJob>
+::TIntrusivePtr<NYT::IStructuredJob> ConstructJobFromNode(const TNode& node)
+{
+ if constexpr (TIsConstructibleFromNode<TJob>::value) {
+ Y_ENSURE(node.GetType() != TNode::Undefined,
+ "job has FromNode method but constructor arguments were not provided");
+ return TJob::FromNode(node);
+ } else {
+ Y_ENSURE(node.GetType() == TNode::Undefined,
+ "constructor arguments provided but job does not contain FromNode method");
+ return MakeIntrusive<TJob>();
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+using TJobFunction = int (*)(size_t, IInputStream&);
+using TConstructJobFunction = ::TIntrusivePtr<NYT::IStructuredJob> (*)(const TNode&);
+
+// Process-wide registry of job classes. Maps a job type to its registered name and
+// a registered name to the function that runs the job (and, for structured jobs,
+// to the function that constructs it from a TNode).
+class TJobFactory
+{
+public:
+    static TJobFactory* Get()
+    {
+        return Singleton<TJobFactory>();
+    }
+
+    // Registers a structured job: run function plus constructing function.
+    template <class TJob>
+    void RegisterJob(const char* name)
+    {
+        RegisterJobImpl<TJob>(name, RunJob<TJob>);
+        JobConstructors[name] = ConstructJobFromNode<TJob>;
+    }
+
+    // Registers a raw job; no constructing function is stored for it.
+    template <class TRawJob>
+    void RegisterRawJob(const char* name)
+    {
+        RegisterJobImpl<TRawJob>(name, RunRawJob<TRawJob>);
+    }
+
+    // Registers a vanilla job; no constructing function is stored for it.
+    template <class TVanillaJob>
+    void RegisterVanillaJob(const char* name)
+    {
+        RegisterJobImpl<TVanillaJob>(name, RunVanillaJob<TVanillaJob>);
+    }
+
+    // Returns the name under which the dynamic type of `*job` was registered.
+    TString GetJobName(const IJob* job)
+    {
+        const auto typeIndex = std::type_index(typeid(*job));
+        CheckJobRegistered(typeIndex);
+        return JobNames[typeIndex];
+    }
+
+    // Returns the run function registered under `name`.
+    TJobFunction GetJobFunction(const char* name)
+    {
+        CheckNameRegistered(name);
+        return JobFunctions[name];
+    }
+
+    // Returns the constructing function registered under `name`.
+    // Only RegisterJob() stores one, so for names registered as raw or vanilla jobs
+    // the lookup yields a value-initialized (null) function pointer.
+    TConstructJobFunction GetConstructingFunction(const char* name)
+    {
+        CheckNameRegistered(name);
+        return JobConstructors[name];
+    }
+
+private:
+    TMap<std::type_index, TString> JobNames;
+    THashMap<TString, TJobFunction> JobFunctions;
+    THashMap<TString, TConstructJobFunction> JobConstructors;
+
+    // Common part of all Register* methods: records the type -> name and name -> runner links.
+    template <typename TJob, typename TRunner>
+    void RegisterJobImpl(const char* name, TRunner runner) {
+        const auto typeIndex = std::type_index(typeid(TJob));
+        CheckNotRegistered(typeIndex, name);
+        JobNames[typeIndex] = name;
+        JobFunctions[name] = runner;
+    }
+
+    // Fails if either the type or the name has been registered before.
+    void CheckNotRegistered(const std::type_index& typeIndex, const char* name)
+    {
+        // The message previously lacked a space between the quoted type name and "is".
+        Y_ENSURE(!JobNames.contains(typeIndex),
+            "type_info '" << typeIndex.name() << "' "
+            "is already registered under name '" << JobNames[typeIndex] << "'");
+        Y_ENSURE(!JobFunctions.contains(name),
+            "job with name '" << name << "' is already registered");
+    }
+
+    void CheckJobRegistered(const std::type_index& typeIndex)
+    {
+        Y_ENSURE(JobNames.contains(typeIndex),
+            "type_info '" << typeIndex.name() << "' is not registered, use REGISTER_* macros");
+    }
+
+    void CheckNameRegistered(const char* name)
+    {
+        Y_ENSURE(JobFunctions.contains(name),
+            "job with name '" << name << "' is not registered, use REGISTER_* macros");
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Helper whose constructor registers TMapper in TJobFactory under `name`.
+// Only job classes with JobType == IJob::EType::Mapper are accepted.
+template <class TMapper>
+struct TMapperRegistrator
+{
+ TMapperRegistrator(const char* name)
+ {
+ static_assert(TMapper::JobType == IJob::EType::Mapper,
+ "REGISTER_MAPPER is not compatible with this job class");
+
+ NYT::TJobFactory::Get()->RegisterJob<TMapper>(name);
+ }
+};
+
+// Helper whose constructor registers TReducer in TJobFactory under `name`.
+// Accepts job classes whose JobType is either Reducer or ReducerAggregator.
+template <class TReducer>
+struct TReducerRegistrator
+{
+ TReducerRegistrator(const char* name)
+ {
+ static_assert(TReducer::JobType == IJob::EType::Reducer ||
+ TReducer::JobType == IJob::EType::ReducerAggregator,
+ "REGISTER_REDUCER is not compatible with this job class");
+
+ NYT::TJobFactory::Get()->RegisterJob<TReducer>(name);
+ }
+};
+
+// Helper whose constructor registers TRawJob in TJobFactory under `name`.
+// Only job classes with JobType == IJob::EType::RawJob are accepted.
+template <class TRawJob>
+struct TRawJobRegistrator
+{
+ TRawJobRegistrator(const char* name)
+ {
+ static_assert(TRawJob::JobType == IJob::EType::RawJob,
+ "REGISTER_RAW_JOB is not compatible with this job class");
+ NYT::TJobFactory::Get()->RegisterRawJob<TRawJob>(name);
+ }
+};
+
+// Helper whose constructor registers TVanillaJob in TJobFactory under `name`.
+// Only job classes with JobType == IJob::EType::VanillaJob are accepted.
+template <class TVanillaJob>
+struct TVanillaJobRegistrator
+{
+ TVanillaJobRegistrator(const char* name)
+ {
+ static_assert(TVanillaJob::JobType == IJob::EType::VanillaJob,
+ "REGISTER_VANILLA_JOB is not compatible with this job class");
+ NYT::TJobFactory::Get()->RegisterVanillaJob<TVanillaJob>(name);
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Normalizes a type name for use as a registry key: when `_win_` is defined every
+// "class " substring is removed, otherwise the name is returned unchanged.
+inline TString YtRegistryTypeName(const TString& name) {
+#ifdef _win_
+    TString normalized = name;
+    SubstGlobal(normalized, "class ", "");
+    return normalized;
+#else
+    return name;
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Job registration macros. Each one defines a file-static registrator object whose
+// constructor registers the job class in TJobFactory. The REGISTER_NAMED_* forms take
+// the registry name explicitly; the other forms derive it from TypeName<...>() passed
+// through YtRegistryTypeName.
+#define REGISTER_MAPPER(...) \
+static const NYT::TMapperRegistrator<__VA_ARGS__> \
+Y_GENERATE_UNIQUE_ID(TJobRegistrator)(NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data());
+
+#define REGISTER_NAMED_MAPPER(name, ...) \
+static const NYT::TMapperRegistrator<__VA_ARGS__> \
+Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name);
+
+#define REGISTER_REDUCER(...) \
+static const NYT::TReducerRegistrator<__VA_ARGS__> \
+Y_GENERATE_UNIQUE_ID(TJobRegistrator)(NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data());
+
+#define REGISTER_NAMED_REDUCER(name, ...) \
+static const NYT::TReducerRegistrator<__VA_ARGS__> \
+Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name);
+
+#define REGISTER_NAMED_RAW_JOB(name, ...) \
+static const NYT::TRawJobRegistrator<__VA_ARGS__> \
+Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name);
+
+#define REGISTER_RAW_JOB(...) \
+REGISTER_NAMED_RAW_JOB((NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data()), __VA_ARGS__)
+
+// NOTE(review): unlike the macros above, the registrator object here is not declared
+// const — confirm whether the difference is intentional.
+#define REGISTER_NAMED_VANILLA_JOB(name, ...) \
+static NYT::TVanillaJobRegistrator<__VA_ARGS__> \
+Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name);
+
+#define REGISTER_VANILLA_JOB(...) \
+REGISTER_NAMED_VANILLA_JOB((NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data()), __VA_ARGS__)
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Row stream description of the mapper input, derived from the reader's row type.
+template <typename TReader, typename TWriter>
+TStructuredRowStreamDescription IMapper<TReader, TWriter>::GetInputRowStreamDescription() const
+{
+ return NYT::NDetail::GetStructuredRowStreamDescription<typename TReader::TRowType>();
+}
+
+// Row stream description of the mapper output, derived from the writer's row type.
+template <typename TReader, typename TWriter>
+TStructuredRowStreamDescription IMapper<TReader, TWriter>::GetOutputRowStreamDescription() const
+{
+ return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Row stream description of the reducer input, derived from the reader's row type.
+template <typename TReader, typename TWriter>
+TStructuredRowStreamDescription IReducer<TReader, TWriter>::GetInputRowStreamDescription() const
+{
+ return NYT::NDetail::GetStructuredRowStreamDescription<typename TReader::TRowType>();
+}
+
+// Row stream description of the reducer output, derived from the writer's row type.
+template <typename TReader, typename TWriter>
+TStructuredRowStreamDescription IReducer<TReader, TWriter>::GetOutputRowStreamDescription() const
+{
+ return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Row stream description of the aggregator reducer input, derived from the reader's row type.
+template <typename TReader, typename TWriter>
+TStructuredRowStreamDescription IAggregatorReducer<TReader, TWriter>::GetInputRowStreamDescription() const
+{
+ return NYT::NDetail::GetStructuredRowStreamDescription<typename TReader::TRowType>();
+}
+
+// Row stream description of the aggregator reducer output, derived from the writer's row type.
+template <typename TReader, typename TWriter>
+TStructuredRowStreamDescription IAggregatorReducer<TReader, TWriter>::GetOutputRowStreamDescription() const
+{
+ return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// The input of a vanilla job is always described as a void row stream.
+template <typename TWriter>
+TStructuredRowStreamDescription IVanillaJob<TWriter>::GetInputRowStreamDescription() const
+{
+ return TVoidStructuredRowStream();
+}
+
+// Row stream description of the vanilla job output, derived from the writer's row type.
+template <typename TWriter>
+TStructuredRowStreamDescription IVanillaJob<TWriter>::GetOutputRowStreamDescription() const
+{
+ return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/operation.cpp b/yt/cpp/mapreduce/interface/operation.cpp
new file mode 100644
index 0000000000..706fc4caa4
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/operation.cpp
@@ -0,0 +1,663 @@
+#include "operation.h"
+
+#include <util/generic/iterator_range.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace NDetail {
+ // Definition of the variable declared `extern` in operation.h; its initial value is -1.
+ // NOTE(review): presumably -1 means "not set yet" — confirm against the code that assigns it.
+ i64 OutputTableCount = -1;
+} // namespace NDetail
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Stores an arbitrary task name; the argument is moved into the member.
+TTaskName::TTaskName(TString taskName)
+    : TaskName_{std::move(taskName)}
+{ }
+
+// Stores a task name given as a C string.
+TTaskName::TTaskName(const char* taskName)
+    : TaskName_{taskName}
+{ }
+
+// Stores the textual form (`ToString`) of an `ETaskName` value.
+TTaskName::TTaskName(ETaskName taskName)
+    : TaskName_{ToString(taskName)}
+{ }
+
+// Returns the stored task name.
+const TString& TTaskName::Get() const
+{
+    return TaskName_;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Remembers the command line of the job as an owned string.
+TCommandRawJob::TCommandRawJob(TStringBuf command)
+    : Command_{command}
+{ }
+
+// Returns the command passed to the constructor.
+const TString& TCommandRawJob::GetCommand() const
+{
+    return Command_;
+}
+
+// This job only carries a command; calling `Do` is a programming error and aborts via Y_FAIL.
+void TCommandRawJob::Do(const TRawJobContext& /* jobContext */)
+{
+    Y_FAIL("TCommandRawJob::Do must not be called");
+}
+
+REGISTER_NAMED_RAW_JOB("NYT::TCommandRawJob", TCommandRawJob)
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Remembers the command line of the job as an owned string.
+TCommandVanillaJob::TCommandVanillaJob(TStringBuf command)
+    : Command_{command}
+{ }
+
+// Returns the command passed to the constructor.
+const TString& TCommandVanillaJob::GetCommand() const
+{
+    return Command_;
+}
+
+// This job only carries a command; calling `Do` is a programming error and aborts via Y_FAIL.
+void TCommandVanillaJob::Do()
+{
+    Y_FAIL("TCommandVanillaJob::Do must not be called");
+}
+
+REGISTER_NAMED_VANILLA_JOB("NYT::TCommandVanillaJob", TCommandVanillaJob);
+
+////////////////////////////////////////////////////////////////////////////////
+
+// The "unspecified" tag has no state, so any two instances compare equal.
+bool operator==(const TUnspecifiedTableStructure& /* lhs */, const TUnspecifiedTableStructure& /* rhs */)
+{
+    return true;
+}
+
+// Protobuf tags are equal when they hold the same descriptor pointer.
+bool operator==(const TProtobufTableStructure& lhs, const TProtobufTableStructure& rhs)
+{
+    const auto* const leftDescriptor = lhs.Descriptor;
+    const auto* const rightDescriptor = rhs.Descriptor;
+    return leftDescriptor == rightDescriptor;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Read-only access to the structured input paths added so far.
+const TVector<TStructuredTablePath>& TOperationInputSpecBase::GetStructuredInputs() const
+{
+    return StructuredInputs_;
+}
+
+// Read-only access to the structured output paths added so far.
+const TVector<TStructuredTablePath>& TOperationOutputSpecBase::GetStructuredOutputs() const
+{
+    return StructuredOutputs_;
+}
+
+// Appends the rich path to `Inputs_` (copied, because `path` is still needed)
+// and then moves `path` itself into `StructuredInputs_`.
+void TOperationInputSpecBase::AddStructuredInput(TStructuredTablePath path)
+{
+    const auto& richPath = path.RichYPath;
+    Inputs_.push_back(richPath);
+    StructuredInputs_.push_back(std::move(path));
+}
+
+// Appends the rich path to `Outputs_` (copied, because `path` is still needed)
+// and then moves `path` itself into `StructuredOutputs_`.
+void TOperationOutputSpecBase::AddStructuredOutput(TStructuredTablePath path)
+{
+    const auto& richPath = path.RichYPath;
+    Outputs_.push_back(richPath);
+    StructuredOutputs_.push_back(std::move(path));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Fluent wrapper: delegates to the base-class implementation and returns this task for chaining.
+TVanillaTask& TVanillaTask::AddStructuredOutput(TStructuredTablePath path)
+{
+    this->TOperationOutputSpecBase::AddStructuredOutput(std::move(path));
+    return *this;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// The `void` specialization reports an empty (void) input row stream.
+TStructuredRowStreamDescription IVanillaJob<void>::GetInputRowStreamDescription() const
+{
+    return TVoidStructuredRowStream{};
+}
+
+// The `void` specialization reports an empty (void) output row stream as well.
+TStructuredRowStreamDescription IVanillaJob<void>::GetOutputRowStreamDescription() const
+{
+    return TVoidStructuredRowStream{};
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Duplicates descriptor 0 for the input file and, for every output table `i`,
+// descriptor `3 * i + 1` (that is 1, 4, 7, ...).
+TRawJobContext::TRawJobContext(size_t outputTableCount)
+    : InputFile_(Duplicate(0))
+{
+    for (size_t tableIndex = 0; tableIndex < outputTableCount; ++tableIndex) {
+        const auto descriptor = 3 * tableIndex + 1;
+        OutputFileList_.emplace_back(Duplicate(descriptor));
+    }
+}
+
+// File object wrapping the duplicated input descriptor.
+const TFile& TRawJobContext::GetInputFile() const
+{
+    return InputFile_;
+}
+
+// File objects wrapping the duplicated output descriptors, in table order.
+const TVector<TFile>& TRawJobContext::GetOutputFileList() const
+{
+    return OutputFileList_;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Records a (path, options) pair in the list of local files; returns the spec for chaining.
+TUserJobSpec& TUserJobSpec::AddLocalFile(const TLocalFilePath& path, const TAddLocalFileOptions& options)
+{
+    LocalFiles_.emplace_back(path, options);
+    return *this;
+}
+
+// Selects a job binary located at a local path, with an optional MD5 value stored alongside it.
+// Both arguments are taken by value, so they are moved (not copied) into the stored config.
+// Returns the spec for chaining.
+TUserJobSpec& TUserJobSpec::JobBinaryLocalPath(TString path, TMaybe<TString> md5)
+{
+    JobBinary_ = TJobBinaryLocalPath{std::move(path), std::move(md5)};
+    return *this;
+}
+
+// Selects a job binary located at a Cypress path, with an optional transaction id stored alongside it.
+// Both arguments are taken by value, so they are moved (not copied) into the stored config.
+// Returns the spec for chaining.
+TUserJobSpec& TUserJobSpec::JobBinaryCypressPath(TString path, TMaybe<TTransactionId> transactionId)
+{
+    JobBinary_ = TJobBinaryCypressPath{std::move(path), std::move(transactionId)};
+    return *this;
+}
+
+// Returns the currently configured job binary source.
+const TJobBinaryConfig& TUserJobSpec::GetJobBinary() const
+{
+    return JobBinary_;
+}
+
+// Returns a copy of the (path, options) pairs registered through `AddLocalFile`.
+TVector<std::tuple<TLocalFilePath, TAddLocalFileOptions>> TUserJobSpec::GetLocalFiles() const
+{
+    return LocalFiles_;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Binds the group to its parent preparer and takes ownership of the table indices it covers.
+TJobOperationPreparer::TInputGroup::TInputGroup(TJobOperationPreparer& preparer, TVector<int> indices)
+    : Preparer_(preparer)
+    , Indices_(std::move(indices))
+{ }
+
+// Applies the same column renaming to every input table of the group.
+TJobOperationPreparer::TInputGroup& TJobOperationPreparer::TInputGroup::ColumnRenaming(
+    const THashMap<TString, TString>& renaming)
+{
+    for (const int tableIndex : Indices_) {
+        Preparer_.InputColumnRenaming(tableIndex, renaming);
+    }
+    return *this;
+}
+
+// Applies the same column filter to every input table of the group.
+TJobOperationPreparer::TInputGroup& TJobOperationPreparer::TInputGroup::ColumnFilter(
+    const TVector<TString>& columns)
+{
+    for (const int tableIndex : Indices_) {
+        Preparer_.InputColumnFilter(tableIndex, columns);
+    }
+    return *this;
+}
+
+// Ends the group and hands control back to the parent preparer.
+TJobOperationPreparer& TJobOperationPreparer::TInputGroup::EndInputGroup()
+{
+    return Preparer_;
+}
+
+// Binds the group to its parent preparer and takes ownership of the table indices it covers.
+TJobOperationPreparer::TOutputGroup::TOutputGroup(TJobOperationPreparer& preparer, TVector<int> indices)
+    : Preparer_(preparer)
+    , Indices_(std::move(indices))
+{ }
+
+// Sets the same schema for every output table of the group.
+TJobOperationPreparer::TOutputGroup& TJobOperationPreparer::TOutputGroup::Schema(const TTableSchema& schema)
+{
+    for (const int tableIndex : Indices_) {
+        Preparer_.OutputSchema(tableIndex, schema);
+    }
+    return *this;
+}
+
+// Marks every output table of the group as having no schema (see `NoOutputSchema`).
+TJobOperationPreparer::TOutputGroup& TJobOperationPreparer::TOutputGroup::NoSchema()
+{
+    for (const int tableIndex : Indices_) {
+        Preparer_.NoOutputSchema(tableIndex);
+    }
+    return *this;
+}
+
+// Ends the group and hands control back to the parent preparer.
+TJobOperationPreparer& TJobOperationPreparer::TOutputGroup::EndOutputGroup()
+{
+    return Preparer_;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Stores the preparation context and sizes every per-table container up front:
+// the output-related ones get `GetOutputCount()` elements, the input-related ones `GetInputCount()`.
+// The parentheses are deliberate: these are "construct N elements" initializers, not element lists.
+TJobOperationPreparer::TJobOperationPreparer(const IOperationPreparationContext& context)
+ : Context_(context)
+ , OutputSchemas_(context.GetOutputCount())
+ , InputColumnRenamings_(context.GetInputCount())
+ , InputColumnFilters_(context.GetInputCount())
+ , InputTableDescriptions_(context.GetInputCount())
+ , OutputTableDescriptions_(context.GetOutputCount())
+{ }
+
+// Builds a group over the half-open range of input table indices [begin, end).
+// Throws TApiUsageError when begin > end or when any index in the range is not a valid input index.
+TJobOperationPreparer::TInputGroup TJobOperationPreparer::BeginInputGroup(int begin, int end)
+{
+    Y_ENSURE_EX(begin <= end, TApiUsageError()
+        << "BeginInputGroup(): begin must not exceed end, got " << begin << ", " << end);
+
+    TVector<int> groupIndices;
+    // begin <= end is guaranteed above, so walking up to `end` terminates.
+    for (int tableIndex = begin; tableIndex != end; ++tableIndex) {
+        ValidateInputTableIndex(tableIndex, TStringBuf("BeginInputGroup()"));
+        groupIndices.push_back(tableIndex);
+    }
+    return TInputGroup(*this, std::move(groupIndices));
+}
+
+
+// Builds a group over the half-open range of output table indices [begin, end).
+// Throws TApiUsageError when begin > end or when any index in the range is not a valid output index.
+TJobOperationPreparer::TOutputGroup TJobOperationPreparer::BeginOutputGroup(int begin, int end)
+{
+    Y_ENSURE_EX(begin <= end, TApiUsageError()
+        << "BeginOutputGroup(): begin must not exceed end, got " << begin << ", " << end);
+
+    TVector<int> groupIndices;
+    // begin <= end is guaranteed above, so walking up to `end` terminates.
+    for (int tableIndex = begin; tableIndex != end; ++tableIndex) {
+        ValidateOutputTableIndex(tableIndex, TStringBuf("BeginOutputGroup()"));
+        groupIndices.push_back(tableIndex);
+    }
+    return TOutputGroup(*this, std::move(groupIndices));
+}
+
+// Declares output table `tableIndex` as a stream of TNode rows.
+// Throws TApiUsageError if the index is invalid or a description is already set for it.
+TJobOperationPreparer& TJobOperationPreparer::NodeOutput(int tableIndex)
+{
+    ValidateMissingOutputDescription(tableIndex);
+    auto& description = OutputTableDescriptions_[tableIndex];
+    description = StructuredTableDescription<TNode>();
+    return *this;
+}
+
+// Sets the schema of output table `tableIndex`.
+// Throws TApiUsageError if the index is invalid or a schema is already set for it.
+TJobOperationPreparer& TJobOperationPreparer::OutputSchema(int tableIndex, TTableSchema schema)
+{
+    ValidateMissingOutputSchema(tableIndex);
+    auto& slot = OutputSchemas_[tableIndex];
+    slot = std::move(schema);
+    return *this;
+}
+
+// Marks output table `tableIndex` as schemaless by storing `EmptyNonstrictSchema()` for it.
+// Throws TApiUsageError if the index is invalid or a schema is already set for it.
+TJobOperationPreparer& TJobOperationPreparer::NoOutputSchema(int tableIndex)
+{
+    ValidateMissingOutputSchema(tableIndex);
+    auto& slot = OutputSchemas_[tableIndex];
+    slot = EmptyNonstrictSchema();
+    return *this;
+}
+
+// Stores the column renaming for input table `tableIndex`, replacing any previous one.
+// Throws TApiUsageError if the index is not a valid input index.
+TJobOperationPreparer& TJobOperationPreparer::InputColumnRenaming(
+    int tableIndex,
+    const THashMap<TString, TString>& renaming)
+{
+    ValidateInputTableIndex(tableIndex, TStringBuf("InputColumnRenaming()"));
+    auto& slot = InputColumnRenamings_[tableIndex];
+    slot = renaming;
+    return *this;
+}
+
+// Stores the column filter for input table `tableIndex`, replacing any previous one.
+// Throws TApiUsageError if the index is not a valid input index.
+TJobOperationPreparer& TJobOperationPreparer::InputColumnFilter(
+    int tableIndex,
+    const TVector<TString>& columns)
+{
+    ValidateInputTableIndex(tableIndex, TStringBuf("InputColumnFilter()"));
+    auto& slot = InputColumnFilters_[tableIndex];
+    slot = columns;
+    return *this;
+}
+
+// Replaces the whole set of user job format hints; returns the preparer for chaining.
+// The argument is taken by value, so it is moved (not copied) into the member.
+TJobOperationPreparer& TJobOperationPreparer::FormatHints(TUserJobFormatHints newFormatHints)
+{
+    FormatHints_ = std::move(newFormatHints);
+    return *this;
+}
+
+// Runs the final consistency check; throws TApiUsageError if some output schema is still missing.
+void TJobOperationPreparer::Finish()
+{
+    this->FinallyValidate();
+}
+
+// Moves all output schemas out of the preparer and returns them in table order.
+// Each stored schema is cleared after being moved, so a second call would trip the Y_VERIFY.
+TVector<TTableSchema> TJobOperationPreparer::GetOutputSchemas()
+{
+    TVector<TTableSchema> extracted;
+    extracted.reserve(OutputSchemas_.size());
+    for (auto& schema : OutputSchemas_) {
+        Y_VERIFY(schema.Defined());
+        extracted.push_back(std::move(*schema));
+        schema.Clear();
+    }
+    return extracted;
+}
+
+// Verifies that every output table has a schema set.
+// Throws TApiUsageError listing the index (and path, when known) of each output table without one.
+void TJobOperationPreparer::FinallyValidate() const
+{
+    TVector<int> illegallyMissingSchemaIndices;
+    for (int i = 0; i < static_cast<int>(OutputSchemas_.size()); ++i) {
+        if (!OutputSchemas_[i]) {
+            illegallyMissingSchemaIndices.push_back(i);
+        }
+    }
+    if (illegallyMissingSchemaIndices.empty()) {
+        return;
+    }
+    TApiUsageError error;
+    error << "Output table schemas are missing: ";
+    for (auto i : illegallyMissingSchemaIndices) {
+        error << "no. " << i;
+        // `i` indexes OUTPUT tables, so the path has to be looked up among the outputs.
+        if (auto path = Context_.GetOutputPath(i)) {
+            error << "(" << *path << ")";
+        }
+        error << "; ";
+    }
+    ythrow std::move(error);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Throws TApiUsageError unless `tableIndex` lies in [0, input table count).
+// `message` is a prefix naming the calling API method.
+void TJobOperationPreparer::ValidateInputTableIndex(int tableIndex, TStringBuf message) const
+{
+    Y_ENSURE_EX(
+        0 <= tableIndex && tableIndex < static_cast<int>(Context_.GetInputCount()),
+        TApiUsageError() <<
+            message << ": input table index " << tableIndex << " is out of range [0;" <<
+            // Report the bound that was actually checked: the number of inputs, not of outputs.
+            Context_.GetInputCount() << ")");
+}
+
+// Throws TApiUsageError unless `tableIndex` lies in [0, output table count).
+// `message` is a prefix naming the calling API method.
+void TJobOperationPreparer::ValidateOutputTableIndex(int tableIndex, TStringBuf message) const
+{
+    Y_ENSURE_EX(
+        0 <= tableIndex && tableIndex < static_cast<int>(Context_.GetOutputCount()),
+        TApiUsageError() <<
+            message << ": output table index " << tableIndex << " is out of range [0;" <<
+            // Report the same bound the condition above checks.
+            Context_.GetOutputCount() << ")");
+}
+
+// Checks that `tableIndex` is a valid output index and that no schema has been set for it yet.
+// Throws TApiUsageError otherwise.
+void TJobOperationPreparer::ValidateMissingOutputSchema(int tableIndex) const
+{
+    ValidateOutputTableIndex(tableIndex, "ValidateMissingOutputSchema()");
+
+    Y_ENSURE_EX(
+        !OutputSchemas_[tableIndex],
+        TApiUsageError()
+            << "Output table schema no. " << tableIndex << " "
+            << "(" << Context_.GetOutputPath(tableIndex).GetOrElse("<unknown path>") << ") "
+            << "is already set");
+}
+
+// Checks that `tableIndex` is a valid input index and that no description has been set for it yet.
+// Throws TApiUsageError otherwise.
+void TJobOperationPreparer::ValidateMissingInputDescription(int tableIndex) const
+{
+    ValidateInputTableIndex(tableIndex, "ValidateMissingInputDescription()");
+    Y_ENSURE_EX(!InputTableDescriptions_[tableIndex],
+        TApiUsageError() <<
+            "Description for input no. " << tableIndex << " " <<
+            // `tableIndex` is an INPUT index, so the path is looked up among the inputs.
+            "(" << Context_.GetInputPath(tableIndex).GetOrElse("<unknown path>") << ") " <<
+            "is already set");
+}
+
+// Checks that `tableIndex` is a valid output index and that no description has been set for it yet.
+// Throws TApiUsageError otherwise.
+void TJobOperationPreparer::ValidateMissingOutputDescription(int tableIndex) const
+{
+    ValidateOutputTableIndex(tableIndex, "ValidateMissingOutputDescription()");
+
+    Y_ENSURE_EX(
+        !OutputTableDescriptions_[tableIndex],
+        TApiUsageError()
+            << "Description for output no. " << tableIndex << " "
+            << "(" << Context_.GetOutputPath(tableIndex).GetOrElse("<unknown path>") << ") "
+            << "is already set");
+}
+
+// Returns a default-constructed schema with `Strict(false)` applied; used to mean "no schema".
+TTableSchema TJobOperationPreparer::EmptyNonstrictSchema()
+{
+    return TTableSchema().Strict(false);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Per-input column renamings, indexed by input table index.
+const TVector<THashMap<TString, TString>>& TJobOperationPreparer::GetInputColumnRenamings() const
+{
+    return InputColumnRenamings_;
+}
+
+// Per-input column filters, indexed by input table index; an empty TMaybe means none was set.
+const TVector<TMaybe<TVector<TString>>>& TJobOperationPreparer::GetInputColumnFilters() const
+{
+    return InputColumnFilters_;
+}
+
+// Per-input table descriptions, indexed by input table index; an empty TMaybe means none was set.
+const TVector<TMaybe<TTableStructure>>& TJobOperationPreparer::GetInputDescriptions() const
+{
+    return InputTableDescriptions_;
+}
+
+// Per-output table descriptions, indexed by output table index; an empty TMaybe means none was set.
+const TVector<TMaybe<TTableStructure>>& TJobOperationPreparer::GetOutputDescriptions() const
+{
+    return OutputTableDescriptions_;
+}
+
+// Format hints collected so far.
+const TUserJobFormatHints& TJobOperationPreparer::GetFormatHints() const
+{
+    return FormatHints_;
+}
+
+// Sets only the input part of the stored format hints; returns the preparer for chaining.
+TJobOperationPreparer& TJobOperationPreparer::InputFormatHints(TFormatHints hints)
+{
+    auto& storedHints = FormatHints_;
+    storedHints.InputFormatHints(hints);
+    return *this;
+}
+
+// Sets only the output part of the stored format hints; returns the preparer for chaining.
+TJobOperationPreparer& TJobOperationPreparer::OutputFormatHints(TFormatHints hints)
+{
+    auto& storedHints = FormatHints_;
+    storedHints.OutputFormatHints(hints);
+    return *this;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Default preparation: marks every output table as schemaless via `NoOutputSchema`.
+void IJob::PrepareOperation(const IOperationPreparationContext& context, TJobOperationPreparer& resultBuilder) const
+{
+    int tableIndex = 0;
+    while (tableIndex < context.GetOutputCount()) {
+        resultBuilder.NoOutputSchema(tableIndex);
+        ++tableIndex;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Requires a non-null mapper (Y_VERIFY) and forwards everything to `DoMap`.
+IOperationPtr IOperationClient::Map(
+    const TMapOperationSpec& spec,
+    ::TIntrusivePtr<IMapperBase> mapper,
+    const TOperationOptions& options)
+{
+    Y_VERIFY(mapper.Get());
+    return DoMap(spec, std::move(mapper), options);
+}
+
+// Convenience overload: inputs and outputs are passed separately from the spec.
+// The spec itself must not contain inputs or outputs (TApiUsageError otherwise);
+// a copy of it is filled with the given paths and passed to the spec-based overload.
+IOperationPtr IOperationClient::Map(
+    ::TIntrusivePtr<IMapperBase> mapper,
+    const TOneOrMany<TStructuredTablePath>& input,
+    const TOneOrMany<TStructuredTablePath>& output,
+    const TMapOperationSpec& spec,
+    const TOperationOptions& options)
+{
+    Y_ENSURE_EX(spec.Inputs_.empty(),
+        TApiUsageError() << "TMapOperationSpec::Inputs MUST be empty");
+    Y_ENSURE_EX(spec.Outputs_.empty(),
+        TApiUsageError() << "TMapOperationSpec::Outputs MUST be empty");
+
+    TMapOperationSpec fullSpec = spec;
+    for (const auto& path : input.Parts_) {
+        fullSpec.AddStructuredInput(path);
+    }
+    for (const auto& path : output.Parts_) {
+        fullSpec.AddStructuredOutput(path);
+    }
+
+    return Map(fullSpec, std::move(mapper), options);
+}
+
+// Requires a non-null reducer (Y_VERIFY) and forwards everything to `DoReduce`.
+IOperationPtr IOperationClient::Reduce(
+    const TReduceOperationSpec& spec,
+    ::TIntrusivePtr<IReducerBase> reducer,
+    const TOperationOptions& options)
+{
+    Y_VERIFY(reducer.Get());
+    return DoReduce(spec, std::move(reducer), options);
+}
+
+// Convenience overload: inputs, outputs and reduce-by columns are passed separately from the spec.
+// The spec itself must not contain any of them (TApiUsageError otherwise);
+// a copy of it is filled in and passed to the spec-based overload.
+IOperationPtr IOperationClient::Reduce(
+    ::TIntrusivePtr<IReducerBase> reducer,
+    const TOneOrMany<TStructuredTablePath>& input,
+    const TOneOrMany<TStructuredTablePath>& output,
+    const TSortColumns& reduceBy,
+    const TReduceOperationSpec& spec,
+    const TOperationOptions& options)
+{
+    Y_ENSURE_EX(spec.Inputs_.empty(),
+        TApiUsageError() << "TReduceOperationSpec::Inputs MUST be empty");
+    Y_ENSURE_EX(spec.Outputs_.empty(),
+        TApiUsageError() << "TReduceOperationSpec::Outputs MUST be empty");
+    Y_ENSURE_EX(spec.ReduceBy_.Parts_.empty(),
+        TApiUsageError() << "TReduceOperationSpec::ReduceBy MUST be empty");
+
+    TReduceOperationSpec fullSpec = spec;
+    for (const auto& path : input.Parts_) {
+        fullSpec.AddStructuredInput(path);
+    }
+    for (const auto& path : output.Parts_) {
+        fullSpec.AddStructuredOutput(path);
+    }
+    fullSpec.ReduceBy(reduceBy);
+
+    return Reduce(fullSpec, std::move(reducer), options);
+}
+
+// Requires a non-null reducer (Y_VERIFY) and forwards everything to `DoJoinReduce`.
+IOperationPtr IOperationClient::JoinReduce(
+    const TJoinReduceOperationSpec& spec,
+    ::TIntrusivePtr<IReducerBase> reducer,
+    const TOperationOptions& options)
+{
+    Y_VERIFY(reducer.Get());
+    return DoJoinReduce(spec, std::move(reducer), options);
+}
+
+// MapReduce without a reduce combiner: `nullptr` is passed in the combiner slot of `DoMapReduce`.
+// Only the reducer is checked for null (Y_VERIFY); the mapper is forwarded as is.
+IOperationPtr IOperationClient::MapReduce(
+    const TMapReduceOperationSpec& spec,
+    ::TIntrusivePtr<IMapperBase> mapper,
+    ::TIntrusivePtr<IReducerBase> reducer,
+    const TOperationOptions& options)
+{
+    Y_VERIFY(reducer.Get());
+    return DoMapReduce(spec, std::move(mapper), nullptr, std::move(reducer), options);
+}
+
+// MapReduce with an explicit reduce combiner; forwards everything to `DoMapReduce`.
+// Only the reducer is checked for null (Y_VERIFY); mapper and combiner are forwarded as is.
+IOperationPtr IOperationClient::MapReduce(
+    const TMapReduceOperationSpec& spec,
+    ::TIntrusivePtr<IMapperBase> mapper,
+    ::TIntrusivePtr<IReducerBase> reduceCombiner,
+    ::TIntrusivePtr<IReducerBase> reducer,
+    const TOperationOptions& options)
+{
+    Y_VERIFY(reducer.Get());
+    return DoMapReduce(spec, std::move(mapper), std::move(reduceCombiner), std::move(reducer), options);
+}
+
+// Convenience overload without a reduce combiner: inputs, outputs and reduce-by columns are
+// passed separately from the spec. The spec must not contain any of them (TApiUsageError
+// otherwise); since `spec` is taken by value, it is filled in directly and passed on.
+IOperationPtr IOperationClient::MapReduce(
+    ::TIntrusivePtr<IMapperBase> mapper,
+    ::TIntrusivePtr<IReducerBase> reducer,
+    const TOneOrMany<TStructuredTablePath>& input,
+    const TOneOrMany<TStructuredTablePath>& output,
+    const TSortColumns& reduceBy,
+    TMapReduceOperationSpec spec,
+    const TOperationOptions& options)
+{
+    Y_ENSURE_EX(spec.Inputs_.empty(),
+        TApiUsageError() << "TMapReduceOperationSpec::Inputs MUST be empty");
+    Y_ENSURE_EX(spec.Outputs_.empty(),
+        TApiUsageError() << "TMapReduceOperationSpec::Outputs MUST be empty");
+    Y_ENSURE_EX(spec.ReduceBy_.Parts_.empty(),
+        TApiUsageError() << "TMapReduceOperationSpec::ReduceBy MUST be empty");
+
+    for (const auto& path : input.Parts_) {
+        spec.AddStructuredInput(path);
+    }
+    for (const auto& path : output.Parts_) {
+        spec.AddStructuredOutput(path);
+    }
+    spec.ReduceBy(reduceBy);
+
+    return MapReduce(spec, std::move(mapper), std::move(reducer), options);
+}
+
+// Convenience overload with a reduce combiner: inputs, outputs and reduce-by columns are
+// passed separately from the spec. The spec must not contain any of them (TApiUsageError
+// otherwise); since `spec` is taken by value, it is filled in directly and passed on.
+IOperationPtr IOperationClient::MapReduce(
+    ::TIntrusivePtr<IMapperBase> mapper,
+    ::TIntrusivePtr<IReducerBase> reduceCombiner,
+    ::TIntrusivePtr<IReducerBase> reducer,
+    const TOneOrMany<TStructuredTablePath>& input,
+    const TOneOrMany<TStructuredTablePath>& output,
+    const TSortColumns& reduceBy,
+    TMapReduceOperationSpec spec,
+    const TOperationOptions& options)
+{
+    Y_ENSURE_EX(spec.Inputs_.empty(),
+        TApiUsageError() << "TMapReduceOperationSpec::Inputs MUST be empty");
+    Y_ENSURE_EX(spec.Outputs_.empty(),
+        TApiUsageError() << "TMapReduceOperationSpec::Outputs MUST be empty");
+    Y_ENSURE_EX(spec.ReduceBy_.Parts_.empty(),
+        TApiUsageError() << "TMapReduceOperationSpec::ReduceBy MUST be empty");
+
+    for (const auto& path : input.Parts_) {
+        spec.AddStructuredInput(path);
+    }
+    for (const auto& path : output.Parts_) {
+        spec.AddStructuredOutput(path);
+    }
+    spec.ReduceBy(reduceBy);
+
+    return MapReduce(spec, std::move(mapper), std::move(reduceCombiner), std::move(reducer), options);
+}
+
+// Convenience overload: inputs, output and sort-by columns are passed separately from the spec.
+// The spec itself must not contain any of them (TApiUsageError otherwise);
+// a copy of it is filled in and passed to the spec-based `Sort` overload.
+IOperationPtr IOperationClient::Sort(
+    const TOneOrMany<TRichYPath>& input,
+    const TRichYPath& output,
+    const TSortColumns& sortBy,
+    const TSortOperationSpec& spec,
+    const TOperationOptions& options)
+{
+    Y_ENSURE_EX(spec.Inputs_.empty(),
+        TApiUsageError() << "TSortOperationSpec::Inputs MUST be empty");
+    Y_ENSURE_EX(spec.Output_.Path_.empty(),
+        TApiUsageError() << "TSortOperationSpec::Output MUST be empty");
+    Y_ENSURE_EX(spec.SortBy_.Parts_.empty(),
+        TApiUsageError() << "TSortOperationSpec::SortBy MUST be empty");
+
+    TSortOperationSpec fullSpec = spec;
+    for (const auto& path : input.Parts_) {
+        fullSpec.AddInput(path);
+    }
+    fullSpec.Output(output);
+    fullSpec.SortBy(sortBy);
+
+    return Sort(fullSpec, options);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Default implementation: no custom raw reader is provided (returns null).
+TRawTableReaderPtr IStructuredJob::CreateCustomRawJobReader(int /* unused */) const
+{
+    return nullptr;
+}
+
+// Default implementation: no custom raw writer is provided (returns null).
+THolder<IProxyOutput> IStructuredJob::CreateCustomRawJobWriter(size_t /* unused */) const
+{
+    return nullptr;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/operation.h b/yt/cpp/mapreduce/interface/operation.h
new file mode 100644
index 0000000000..171a7e4af7
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/operation.h
@@ -0,0 +1,3494 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/operation.h
+///
+/// Header containing interface to run operations in YT
+/// and retrieve information about them.
+/// @see [the doc](https://yt.yandex-team.ru/docs/description/mr/map_reduce_overview.html).
+
+#include "client_method_options.h"
+#include "errors.h"
+#include "io.h"
+#include "job_statistics.h"
+#include "job_counters.h"
+
+#include <library/cpp/threading/future/future.h>
+#include <library/cpp/type_info/type_info.h>
+
+#include <util/datetime/base.h>
+#include <util/generic/variant.h>
+#include <util/generic/vector.h>
+#include <util/generic/maybe.h>
+#include <util/system/file.h>
+#include <util/system/types.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tag class marking that the row type for a table is not specified.
+struct TUnspecifiedTableStructure
+{ };
+
+/// Tag class marking that table rows have protobuf type.
+struct TProtobufTableStructure
+{
+ /// @brief Descriptor of the protobuf type of table rows.
+ ///
+ /// @note If table is tagged with @ref ::google::protobuf::Message instead of real proto class
+ /// this descriptor might be null.
+ const ::google::protobuf::Descriptor* Descriptor = nullptr;
+};
+
+
+/// Tag class to specify table row type.
+///
+/// @note The first alternative is @ref NYT::TUnspecifiedTableStructure, so a default-constructed
+/// value means "row type is not specified".
+using TTableStructure = std::variant<
+ TUnspecifiedTableStructure,
+ TProtobufTableStructure
+>;
+
+/// Any two "unspecified" tags compare equal.
+bool operator==(const TUnspecifiedTableStructure&, const TUnspecifiedTableStructure&);
+/// Protobuf tags compare equal when their `Descriptor` pointers are equal.
+bool operator==(const TProtobufTableStructure& lhs, const TProtobufTableStructure& rhs);
+
+/// Table path marked with @ref NYT::TTableStructure tag.
+///
+/// All constructors are intentionally non-explicit, so rich paths, plain paths and
+/// string literals convert implicitly to a structured path.
+struct TStructuredTablePath
+{
+ /// Construct from a rich path and a tag; by default both the path and the tag are empty/unspecified.
+ TStructuredTablePath(TRichYPath richYPath = TRichYPath(), TTableStructure description = TUnspecifiedTableStructure())
+ : RichYPath(std::move(richYPath))
+ , Description(std::move(description))
+ { }
+
+ /// Construct from a rich path and a protobuf descriptor; the tag becomes @ref NYT::TProtobufTableStructure.
+ TStructuredTablePath(TRichYPath richYPath, const ::google::protobuf::Descriptor* descriptor)
+ : RichYPath(std::move(richYPath))
+ , Description(TProtobufTableStructure({descriptor}))
+ { }
+
+ /// Construct from a plain path; the row type is left unspecified.
+ TStructuredTablePath(TYPath path)
+ : RichYPath(std::move(path))
+ , Description(TUnspecifiedTableStructure())
+ { }
+
+ /// Construct from a C string path; the row type is left unspecified.
+ TStructuredTablePath(const char* path)
+ : RichYPath(path)
+ , Description(TUnspecifiedTableStructure())
+ { }
+
+ /// Path to the table.
+ TRichYPath RichYPath;
+ /// Tag describing the row type of the table.
+ TTableStructure Description;
+};
+
+/// Create marked table path from row type.
+template <typename TRow>
+TStructuredTablePath Structured(TRichYPath richYPath);
+
+/// Create tag class from row type.
+template <typename TRow>
+TTableStructure StructuredTableDescription();
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// Tag class marking that row stream is empty.
+struct TVoidStructuredRowStream
+{ };
+
+/// Tag class marking that row stream consists of `NYT::TNode`.
+struct TTNodeStructuredRowStream
+{ };
+
+/// Tag class marking that row stream consists of @ref NYT::TYaMRRow.
+struct TTYaMRRowStructuredRowStream
+{ };
+
+/// Tag class marking that row stream consists of protobuf rows of given type.
+struct TProtobufStructuredRowStream
+{
+ /// @brief Descriptor of the protobuf type of table rows.
+ ///
+ /// @note If `Descriptor` is nullptr, then row stream consists of multiple message types.
+ const ::google::protobuf::Descriptor* Descriptor = nullptr;
+};
+
+/// Tag class to specify type of rows in an operation row stream.
+///
+/// @note The first alternative is @ref NYT::TVoidStructuredRowStream, so a default-constructed
+/// value describes an empty row stream.
+using TStructuredRowStreamDescription = std::variant<
+ TVoidStructuredRowStream,
+ TTNodeStructuredRowStream,
+ TTYaMRRowStructuredRowStream,
+ TProtobufStructuredRowStream
+>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// Tag class marking that current binary should be used in operation.
+struct TJobBinaryDefault
+{ };
+
+/// Tag class marking that binary from specified local path should be used in operation.
+struct TJobBinaryLocalPath
+{
+ /// Local path of the binary.
+ TString Path;
+ /// Optional MD5 value supplied together with the path.
+ /// NOTE(review): presumably the MD5 of the binary's contents, letting the library skip computing it — confirm.
+ TMaybe<TString> MD5CheckSum;
+};
+
+/// Tag class marking that binary from specified Cypress path should be used in operation.
+struct TJobBinaryCypressPath
+{
+ /// Cypress path of the binary.
+ TYPath Path;
+ /// Optional transaction id supplied together with the path.
+ /// NOTE(review): presumably the transaction under which `Path` is read — confirm.
+ TMaybe<TTransactionId> TransactionId;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+/// @cond Doxygen_Suppress
+namespace NDetail {
+ // Declaration only; the variable is defined in operation.cpp with initial value -1.
+ extern i64 OutputTableCount;
+} // namespace NDetail
+/// @endcond
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Auto merge mode.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/automerge
+//
+// NOTE(review): the trailing /* "..." */ comments after each enumerator look like serialized
+// names consumed by an enum code generator — confirm before editing or removing them.
+enum class EAutoMergeMode
+{
+ /// Auto merge is disabled.
+ Disabled /* "disabled" */,
+
+ /// Mode that tries to achieve good chunk sizes and doesn't limit usage of chunk quota for intermediate chunks.
+ Relaxed /* "relaxed" */,
+
+ /// Mode that tries to optimize usage of chunk quota for intermediate chunks, operation might run slower.
+ Economy /* "economy" */,
+
+ ///
+ /// @brief Manual configuration of automerge parameters.
+ ///
+ /// @ref TAutoMergeSpec
+ Manual /* "manual" */,
+};
+
+///
+/// @brief Options for auto merge operation stage.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/automerge
+class TAutoMergeSpec
+{
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TAutoMergeSpec;
+ /// @endcond
+
+ /// Mode of the auto merge.
+ FLUENT_FIELD_OPTION(EAutoMergeMode, Mode);
+
+ /// @brief Upper limit for number of intermediate chunks.
+ ///
+ /// Works only for Manual mode.
+ FLUENT_FIELD_OPTION(i64, MaxIntermediateChunkCount);
+
+ /// @brief Limit on the number of chunks merged in one job.
+ ///
+ /// Works only for Manual mode.
+ FLUENT_FIELD_OPTION(i64, ChunkCountPerMergeJob);
+
+ /// @brief Automerge will not merge chunks that are larger than `DesiredChunkSize * (ChunkSizeThreshold / 100.)`
+ ///
+ /// In other words, the threshold is expressed in percent of the desired chunk size.
+ /// Works only for Manual mode.
+ FLUENT_FIELD_OPTION(i64, ChunkSizeThreshold);
+};
+
+/// Base for operations with auto merge options.
+///
+/// @tparam TDerived Type used as `TSelf` by the fluent field macro below.
+/// NOTE(review): presumably the concrete spec class deriving from this base (CRTP-like usage) — confirm.
+template <class TDerived>
+class TWithAutoMergeSpec
+{
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ /// @brief Options for auto merge operation stage.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/automerge
+ FLUENT_FIELD_OPTION(TAutoMergeSpec, AutoMerge);
+};
+
+///
+/// @brief Resources controlled by scheduler and used by running operations.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/scheduler/scheduler_and_pools#resursy
+//
+// NOTE(review): FLUENT_FIELD_OPTION_ENCAPSULATED is defined outside this file; judging by its name
+// it declares an optional field hidden behind accessor methods — confirm against the macro definition.
+class TSchedulerResources
+{
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TSchedulerResources;
+ /// @endcond
+
+ /// Number of user slots. Each job consumes exactly one user slot.
+ FLUENT_FIELD_OPTION_ENCAPSULATED(i64, UserSlots);
+
+ /// Number of (virtual) cpu cores consumed by all jobs.
+ FLUENT_FIELD_OPTION_ENCAPSULATED(i64, Cpu);
+
+ /// Amount of memory in bytes.
+ FLUENT_FIELD_OPTION_ENCAPSULATED(i64, Memory);
+};
+
+/// Base for input format hints of a user job.
+///
+/// @tparam TDerived Type used as `TSelf` by the fluent field macro below.
+template <class TDerived>
+class TUserJobInputFormatHintsBase
+{
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ /// @brief Fine tune input format of the job.
+ FLUENT_FIELD_OPTION(TFormatHints, InputFormatHints);
+};
+
+/// Base for output format hints of a user job.
+///
+/// @tparam TDerived Type used as `TSelf` by the fluent field macro below.
+template <class TDerived>
+class TUserJobOutputFormatHintsBase
+{
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ /// @brief Fine tune output format of the job.
+ FLUENT_FIELD_OPTION(TFormatHints, OutputFormatHints);
+};
+
+/// Base for format hints of a user job.
+///
+/// Combines the input and the output hints bases for the same `TDerived`.
+template <class TDerived>
+class TUserJobFormatHintsBase
+ : public TUserJobInputFormatHintsBase<TDerived>
+ , public TUserJobOutputFormatHintsBase<TDerived>
+{
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+};
+
+/// User job format hints.
+///
+/// Concrete class that carries both input and output format hints.
+class TUserJobFormatHints
+ : public TUserJobFormatHintsBase<TUserJobFormatHints>
+{ };
+
+/// Spec of input and output tables of a raw operation.
+///
+/// @tparam TDerived Type returned (by reference) from the fluent `Add*`/`Set*` methods.
+template <class TDerived>
+class TRawOperationIoTableSpec
+{
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ /// Add input table path to input path list.
+ TDerived& AddInput(const TRichYPath& path);
+
+ /// Set input table path no. `tableIndex`.
+ TDerived& SetInput(size_t tableIndex, const TRichYPath& path);
+
+ /// Add output table path to output path list.
+ TDerived& AddOutput(const TRichYPath& path);
+
+ /// Set output table path no. `tableIndex`.
+ TDerived& SetOutput(size_t tableIndex, const TRichYPath& path);
+
+ /// Get all input table paths.
+ const TVector<TRichYPath>& GetInputs() const;
+
+ /// Get all output table paths.
+ const TVector<TRichYPath>& GetOutputs() const;
+
+private:
+ // Input table paths.
+ TVector<TRichYPath> Inputs_;
+ // Output table paths.
+ TVector<TRichYPath> Outputs_;
+};
+
+/// Base spec for IO in "simple" raw operations (Map, Reduce etc.).
+template <class TDerived>
+struct TSimpleRawOperationIoSpec
+ : public TRawOperationIoTableSpec<TDerived>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ /// @brief Describes format for both input and output.
+ ///
+ /// @note `Format` is overridden by `InputFormat` and `OutputFormat`.
+ FLUENT_FIELD_OPTION(TFormat, Format);
+
+ /// Describes input format.
+ FLUENT_FIELD_OPTION(TFormat, InputFormat);
+
+ /// Describes output format.
+ FLUENT_FIELD_OPTION(TFormat, OutputFormat);
+};
+
+/// Spec for IO in MapReduce operation.
+template <class TDerived>
+class TRawMapReduceOperationIoSpec
+ : public TRawOperationIoTableSpec<TDerived>
+{
+public:
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ /// @brief Describes format for both input and output of mapper.
+ ///
+ /// @note `MapperFormat` is overridden by `MapperInputFormat` and `MapperOutputFormat`.
+ FLUENT_FIELD_OPTION(TFormat, MapperFormat);
+
+ /// Describes mapper input format.
+ FLUENT_FIELD_OPTION(TFormat, MapperInputFormat);
+
+ /// Describes mapper output format.
+ FLUENT_FIELD_OPTION(TFormat, MapperOutputFormat);
+
+ /// @brief Describes format for both input and output of reduce combiner.
+ ///
+ /// @note `ReduceCombinerFormat` is overridden by `ReduceCombinerInputFormat` and `ReduceCombinerOutputFormat`.
+ FLUENT_FIELD_OPTION(TFormat, ReduceCombinerFormat);
+
+ /// Describes reduce combiner input format.
+ FLUENT_FIELD_OPTION(TFormat, ReduceCombinerInputFormat);
+
+ /// Describes reduce combiner output format.
+ FLUENT_FIELD_OPTION(TFormat, ReduceCombinerOutputFormat);
+
+ /// @brief Describes format for both input and output of reducer.
+ ///
+ /// @note `ReducerFormat` is overridden by `ReducerInputFormat` and `ReducerOutputFormat`.
+ FLUENT_FIELD_OPTION(TFormat, ReducerFormat);
+
+ /// Describes reducer input format.
+ FLUENT_FIELD_OPTION(TFormat, ReducerInputFormat);
+
+ /// Describes reducer output format.
+ FLUENT_FIELD_OPTION(TFormat, ReducerOutputFormat);
+
+ /// Add direct map output table path.
+ TDerived& AddMapOutput(const TRichYPath& path);
+
+ /// Set direct map output table path no. `tableIndex`.
+ TDerived& SetMapOutput(size_t tableIndex, const TRichYPath& path);
+
+ /// Get all direct map output table paths.
+ const TVector<TRichYPath>& GetMapOutputs() const;
+
+private:
+ // Direct map output table paths.
+ TVector<TRichYPath> MapOutputs_;
+};
+
+///
+/// @brief Base spec of operations with input tables.
+class TOperationInputSpecBase
+{
+public:
+ // Helper template, declared here and defined elsewhere.
+ template <class T, class = void>
+ struct TFormatAdder;
+
+ ///
+ /// @brief Add input table path to input path list and specify type of rows.
+ template <class T>
+ void AddInput(const TRichYPath& path);
+
+ ///
+ /// @brief Add input table path as structured path.
+ ///
+ /// The rich path is appended to @ref Inputs_ and the structured path itself
+ /// to the list returned by @ref GetStructuredInputs.
+ void AddStructuredInput(TStructuredTablePath path);
+
+ ///
+ /// @brief Set input table path and type.
+ template <class T>
+ void SetInput(size_t tableIndex, const TRichYPath& path);
+
+ ///
+ /// @brief All input paths.
+ TVector<TRichYPath> Inputs_;
+
+ ///
+ /// @brief Get all input structured paths.
+ const TVector<TStructuredTablePath>& GetStructuredInputs() const;
+
+private:
+ // Structured counterparts of the paths stored in `Inputs_`.
+ TVector<TStructuredTablePath> StructuredInputs_;
+ friend struct TOperationIOSpecBase;
+ template <class T>
+ friend struct TOperationIOSpec;
+};
+
+///
+/// @brief Base spec of operations with output tables.
+class TOperationOutputSpecBase
+{
+public:
+ // Helper template, declared here and defined elsewhere.
+ template <class T, class = void>
+ struct TFormatAdder;
+
+ ///
+ /// @brief Add output table path to output path list and specify type of rows.
+ template <class T>
+ void AddOutput(const TRichYPath& path);
+
+ ///
+ /// @brief Add output table path as structured path.
+ ///
+ /// The rich path is appended to @ref Outputs_ and the structured path itself
+ /// to the list returned by @ref GetStructuredOutputs.
+ void AddStructuredOutput(TStructuredTablePath path);
+
+ ///
+ /// @brief Set output table path and type.
+ template <class T>
+ void SetOutput(size_t tableIndex, const TRichYPath& path);
+
+ ///
+ /// @brief All output paths.
+ TVector<TRichYPath> Outputs_;
+
+ ///
+ /// @brief Get all output structured paths.
+ const TVector<TStructuredTablePath>& GetStructuredOutputs() const;
+
+private:
+ // Structured counterparts of the paths stored in `Outputs_`.
+ TVector<TStructuredTablePath> StructuredOutputs_;
+ friend struct TOperationIOSpecBase;
+ template <class T>
+ friend struct TOperationIOSpec;
+};
+
+///
+/// @brief Base spec for operations with inputs and outputs (combines TOperationInputSpecBase and TOperationOutputSpecBase).
+struct TOperationIOSpecBase
+ : public TOperationInputSpecBase
+ , public TOperationOutputSpecBase
+{ };
+
+///
+/// @brief Fluent spec for operations with inputs and outputs; its methods return `TDerived&`.
+template <class TDerived>
+struct TOperationIOSpec
+ : public TOperationIOSpecBase
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ template <class T>
+ TDerived& AddInput(const TRichYPath& path);
+
+ TDerived& AddStructuredInput(TStructuredTablePath path);
+
+ template <class T>
+ TDerived& SetInput(size_t tableIndex, const TRichYPath& path);
+
+ template <class T>
+ TDerived& AddOutput(const TRichYPath& path);
+
+ TDerived& AddStructuredOutput(TStructuredTablePath path);
+
+ template <class T>
+ TDerived& SetOutput(size_t tableIndex, const TRichYPath& path);
+
+
+ // DON'T USE THESE METHODS! They are left solely for backward compatibility.
+ // These methods are the only way to do the equivalent of (Add/Set)(Input/Output)<Message>,
+ // but please consider using (Add/Set)(Input/Output)<TConcreteMessage>
+ // (where TConcreteMessage is some descendant of Message)
+ // because they are faster and better (see https://st.yandex-team.ru/YT-6967).
+ TDerived& AddProtobufInput_VerySlow_Deprecated(const TRichYPath& path);
+ TDerived& AddProtobufOutput_VerySlow_Deprecated(const TRichYPath& path);
+};
+
+///
+/// @brief Base spec for all operations.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/operations_options
+template <class TDerived>
+struct TOperationSpecBase
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ ///
+ /// @brief Limit on operation execution time.
+ ///
+ /// If operation doesn't finish in time it will be aborted.
+ FLUENT_FIELD_OPTION(TDuration, TimeLimit);
+
+ /// @brief Title to be shown in web interface.
+ FLUENT_FIELD_OPTION(TString, Title);
+
+ /// @brief Pool to be used for this operation.
+ FLUENT_FIELD_OPTION(TString, Pool);
+
+ /// @brief Weight of operation.
+ ///
+ /// Coefficient defining how many resources the operation gets relative to its siblings in the same pool.
+ FLUENT_FIELD_OPTION(double, Weight);
+
+ /// @brief Pool tree list that operation will use.
+ FLUENT_OPTIONAL_VECTOR_FIELD_ENCAPSULATED(TString, PoolTree);
+
+ /// @brief How many resources can be consumed by operation.
+ FLUENT_FIELD_OPTION_ENCAPSULATED(TSchedulerResources, ResourceLimits);
+};
+
+///
+/// @brief Base spec for all operations with user jobs.
+template <class TDerived>
+struct TUserOperationSpecBase
+ : TOperationSpecBase<TDerived>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ /// @brief How many jobs can fail before operation is failed.
+ FLUENT_FIELD_OPTION(ui64, MaxFailedJobCount);
+
+ /// @brief On any unsuccessful job completion (i.e. abortion or failure) force the whole operation to fail.
+ FLUENT_FIELD_OPTION(bool, FailOnJobRestart);
+
+ ///
+ /// @brief Table to save whole stderr of operation.
+ ///
+ /// @see https://clubs.at.yandex-team.ru/yt/1045
+ FLUENT_FIELD_OPTION(TYPath, StderrTablePath);
+
+ ///
+ /// @brief Table to save coredumps of operation.
+ ///
+ /// @see https://clubs.at.yandex-team.ru/yt/1045
+ FLUENT_FIELD_OPTION(TYPath, CoreTablePath);
+
+ ///
+ /// @brief How long should the scheduler wait for the job to be started on a node.
+ ///
+ /// When you run huge jobs that require preemption of all the other jobs on
+ /// a node, the default timeout might be insufficient and your job may be
+ /// aborted with 'waiting_timeout' reason. This is especially problematic
+ /// when you are setting the 'FailOnJobRestart' option.
+ ///
+ /// @note The value must be between 10 seconds and 10 minutes.
+ FLUENT_FIELD_OPTION(TDuration, WaitingJobTimeout);
+};
+
+///
+/// @brief Class to provide information on intermediate mapreduce stream protobuf types.
+///
+/// When using protobuf format it is important to know exact types of proto messages
+/// that are used in input/output.
+///
+/// Sometimes such messages cannot be derived from job class
+/// e.g. when job class uses `NYT::TTableReader<::google::protobuf::Message>`
+/// or `NYT::TTableWriter<::google::protobuf::Message>`.
+///
+/// When using such jobs user can provide exact message type using this class.
+///
+/// @note Only input/output that relate to intermediate tables can be hinted.
+/// Input to map and output of reduce are derived from `AddInput`/`AddOutput`.
+template <class TDerived>
+struct TIntermediateTablesHintSpec
+{
+ /// Specify intermediate map output type.
+ template <class T>
+ TDerived& HintMapOutput();
+
+ /// Specify reduce combiner input.
+ template <class T>
+ TDerived& HintReduceCombinerInput();
+
+ /// Specify reduce combiner output.
+ template <class T>
+ TDerived& HintReduceCombinerOutput();
+
+ /// Specify reducer input.
+ template <class T>
+ TDerived& HintReduceInput();
+
+ ///
+ /// @brief Add output of map stage.
+ ///
+ /// Mapper output table #0 is always the intermediate table that is going to be reduced later.
+ /// Rows that mapper writes to tables #1, #2, ... are saved in MapOutput tables.
+ template <class T>
+ TDerived& AddMapOutput(const TRichYPath& path);
+
+ TVector<TRichYPath> MapOutputs_;
+
+ const TVector<TStructuredTablePath>& GetStructuredMapOutputs() const;
+ const TMaybe<TTableStructure>& GetIntermediateMapOutputDescription() const;
+ const TMaybe<TTableStructure>& GetIntermediateReduceCombinerInputDescription() const;
+ const TMaybe<TTableStructure>& GetIntermediateReduceCombinerOutputDescription() const;
+ const TMaybe<TTableStructure>& GetIntermediateReducerInputDescription() const;
+
+private:
+ TVector<TStructuredTablePath> StructuredMapOutputs_;
+ TMaybe<TTableStructure> IntermediateMapOutputDescription_;
+ TMaybe<TTableStructure> IntermediateReduceCombinerInputDescription_;
+ TMaybe<TTableStructure> IntermediateReduceCombinerOutputDescription_;
+ TMaybe<TTableStructure> IntermediateReducerInputDescription_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct TAddLocalFileOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TAddLocalFileOptions;
+ /// @endcond
+
+ ///
+ /// @brief Path by which job will see the uploaded file.
+ ///
+ /// Defaults to basename of the local path.
+ FLUENT_FIELD_OPTION(TString, PathInJob);
+
+ ///
+ /// @brief MD5 checksum of uploaded file.
+ ///
+ /// If not specified it is computed by this library.
+ /// If this argument is provided, the user can save some cpu and disk IO.
+ FLUENT_FIELD_OPTION(TString, MD5CheckSum);
+
+ ///
+ /// @brief Do not put file into node cache.
+ ///
+ /// @see NYT::TRichYPath::BypassArtifactCache
+ FLUENT_FIELD_OPTION(bool, BypassArtifactCache);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Binary to run job profiler on.
+enum class EProfilingBinary
+{
+ /// Profile the job proxy binary.
+ JobProxy /* "job_proxy" */,
+
+ /// Profile the user job binary.
+ UserJob /* "user_job" */,
+};
+
+/// @brief Type of job profiler.
+enum class EProfilerType
+{
+ /// Profile CPU usage.
+ Cpu /* "cpu" */,
+
+ /// Profile memory usage.
+ Memory /* "memory" */,
+
+ /// Profile peak memory usage.
+ PeakMemory /* "peak_memory" */,
+};
+
+/// @brief Specifies a job profiler.
+struct TJobProfilerSpec
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TJobProfilerSpec;
+ /// @endcond
+
+ /// @brief Binary to profile.
+ FLUENT_FIELD_OPTION(EProfilingBinary, ProfilingBinary);
+
+ /// @brief Type of the profiler.
+ FLUENT_FIELD_OPTION(EProfilerType, ProfilerType);
+
+ /// @brief Probability of the job being selected for profiling.
+ FLUENT_FIELD_OPTION(double, ProfilingProbability);
+
+ /// @brief For sampling profilers, sets the number of samples per second.
+ FLUENT_FIELD_OPTION(int, SamplingFrequency);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Spec of user job.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/operations_options#user_script_options
+struct TUserJobSpec
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TUserJobSpec;
+ /// @endcond
+
+ ///
+ /// @brief Specify a local file to upload to Cypress and prepare for use in job.
+ TSelf& AddLocalFile(const TLocalFilePath& path, const TAddLocalFileOptions& options = TAddLocalFileOptions());
+
+ ///
+ /// @brief Get the list of all added local files.
+ TVector<std::tuple<TLocalFilePath, TAddLocalFileOptions>> GetLocalFiles() const;
+
+ /// @brief Paths to files in Cypress to use in job.
+ FLUENT_VECTOR_FIELD(TRichYPath, File);
+
+ ///
+ /// @brief MemoryLimit specifies how much memory job process can use.
+ ///
+ /// @note
+ /// If job uses tmpfs (check @ref NYT::TOperationOptions::MountSandboxInTmpfs)
+ /// YT computes its memory usage as total of:
+ /// - memory usage of job process itself (including mapped files);
+ /// - total size of tmpfs used by this job.
+ ///
+ /// @note
+ /// When @ref NYT::TOperationOptions::MountSandboxInTmpfs is enabled library will compute
+ /// total size of all files used by this job and add this total size to MemoryLimit.
+ /// Thus you shouldn't include size of your files (e.g. binary file) into MemoryLimit.
+ ///
+ /// @note
+ /// Final memory_limit passed to YT is calculated as follows:
+ ///
+ /// @note
+ /// ```
+ /// memory_limit = MemoryLimit + <total-size-of-used-files> + ExtraTmpfsSize
+ /// ```
+ ///
+ /// @see NYT::TUserJobSpec::ExtraTmpfsSize
+ FLUENT_FIELD_OPTION(i64, MemoryLimit);
+
+ ///
+ /// @brief Size of data that is going to be written to tmpfs.
+ ///
+ /// This option should be used if job writes data to tmpfs.
+ ///
+ /// ExtraTmpfsSize should not include size of files specified with
+ /// @ref NYT::TUserJobSpec::AddLocalFile or @ref NYT::TUserJobSpec::AddFile.
+ /// These files are copied to tmpfs automatically and their total size
+ /// is computed automatically.
+ ///
+ /// @see NYT::TOperationOptions::MountSandboxInTmpfs
+ /// @see NYT::TUserJobSpec::MemoryLimit
+ FLUENT_FIELD_OPTION(i64, ExtraTmpfsSize);
+
+ ///
+ /// @brief Maximum number of CPU cores for a single job to use.
+ FLUENT_FIELD_OPTION(double, CpuLimit);
+
+ ///
+ /// @brief Fraction of @ref NYT::TUserJobSpec::MemoryLimit that job gets at start.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/operations_options#memory_reserve_factor
+ FLUENT_FIELD_OPTION(double, MemoryReserveFactor);
+
+ ///
+ /// @brief Local path to executable to be used inside jobs.
+ ///
+ /// Provided executable must use C++ YT API library (this library)
+ /// and implement job class that is going to be used.
+ ///
+ /// This option might be useful if we want to start operation from non-Linux machines
+ /// (in that case we use `JobBinary` to provide path to the same program compiled for Linux).
+ /// Another example of using this option is uploading executable to Cypress in advance
+ /// to save the time required to upload current executable to cache.
+ /// `md5` argument can be used to save cpu time and disk IO when binary MD5 checksum is known.
+ /// When argument is not provided library will compute it itself.
+ TUserJobSpec& JobBinaryLocalPath(TString path, TMaybe<TString> md5 = Nothing());
+
+ ///
+ /// @brief Cypress path to executable to be used inside jobs.
+ TUserJobSpec& JobBinaryCypressPath(TString path, TMaybe<TTransactionId> transactionId = Nothing());
+
+ ///
+ /// @brief String that will be prepended to the command.
+ ///
+ /// This option overrides @ref NYT::TOperationOptions::JobCommandPrefix.
+ FLUENT_FIELD(TString, JobCommandPrefix);
+
+ ///
+ /// @brief String that will be appended to the command.
+ ///
+ /// This option overrides @ref NYT::TOperationOptions::JobCommandSuffix.
+ FLUENT_FIELD(TString, JobCommandSuffix);
+
+ ///
+ /// @brief Map of environment variables that will be set for jobs.
+ FLUENT_MAP_FIELD(TString, TString, Environment);
+
+ ///
+ /// @brief Limit for all files inside job sandbox (in bytes).
+ FLUENT_FIELD_OPTION(ui64, DiskSpaceLimit);
+
+ ///
+ /// @brief Number of ports reserved for the job (passed through environment in YT_PORT_0, YT_PORT_1, ...).
+ FLUENT_FIELD_OPTION(ui16, PortCount);
+
+ ///
+ /// @brief Network project used to isolate job network.
+ FLUENT_FIELD_OPTION(TString, NetworkProject);
+
+ ///
+ /// @brief Limit on job execution time.
+ ///
+ /// Jobs that exceed this limit will be considered failed.
+ FLUENT_FIELD_OPTION(TDuration, JobTimeLimit);
+
+ ///
+ /// @brief Get job binary config.
+ const TJobBinaryConfig& GetJobBinary() const;
+
+ ///
+ /// @brief List of profilers to run.
+ FLUENT_VECTOR_FIELD(TJobProfilerSpec, JobProfiler);
+
+private:
+ TVector<std::tuple<TLocalFilePath, TAddLocalFileOptions>> LocalFiles_;
+ TJobBinaryConfig JobBinary_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Base spec of Map operation (shared by TMapOperationSpec and TRawMapOperationSpec).
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/map
+template <typename TDerived>
+struct TMapOperationSpecBase
+ : public TUserOperationSpecBase<TDerived>
+ , public TWithAutoMergeSpec<TDerived>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ ///
+ /// @brief Spec of mapper job.
+ FLUENT_FIELD(TUserJobSpec, MapperSpec);
+
+ ///
+ /// @brief Whether to guarantee the order of rows passed to mapper matches the order in the table.
+ ///
+ /// When `Ordered` is false (by default), there are no guarantees about the order of reading rows.
+ /// In this case mapper might work slightly faster because rows delivered from a fast node can be processed while YT waits
+ /// for the response from slow nodes.
+ /// When `Ordered` is true, rows will come in the order in which they are stored in input tables.
+ FLUENT_FIELD_OPTION(bool, Ordered);
+
+ ///
+ /// @brief Recommended number of jobs to run.
+ ///
+ /// `JobCount` has higher priority than @ref NYT::TMapOperationSpecBase::DataSizePerJob.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui32, JobCount);
+
+ ///
+ /// @brief Recommended data size for each job.
+ ///
+ /// `DataSizePerJob` has lower priority than @ref NYT::TMapOperationSpecBase::JobCount.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui64, DataSizePerJob);
+};
+
+///
+/// @brief Spec of Map operation (combines TMapOperationSpecBase, TOperationIOSpec and TUserJobFormatHintsBase).
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/map
+struct TMapOperationSpec
+ : public TMapOperationSpecBase<TMapOperationSpec>
+ , public TOperationIOSpec<TMapOperationSpec>
+ , public TUserJobFormatHintsBase<TMapOperationSpec>
+{ };
+
+///
+/// @brief Spec of raw Map operation (combines TMapOperationSpecBase and TSimpleRawOperationIoSpec).
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/map
+struct TRawMapOperationSpec
+ : public TMapOperationSpecBase<TRawMapOperationSpec>
+ , public TSimpleRawOperationIoSpec<TRawMapOperationSpec>
+{ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Base spec of Reduce operation (shared by TReduceOperationSpec and TRawReduceOperationSpec).
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/reduce
+template <typename TDerived>
+struct TReduceOperationSpecBase
+ : public TUserOperationSpecBase<TDerived>
+ , public TWithAutoMergeSpec<TDerived>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ ///
+ /// @brief Spec of reduce job.
+ FLUENT_FIELD(TUserJobSpec, ReducerSpec);
+
+ ///
+ /// @brief Columns to sort rows by (must include `ReduceBy` as prefix).
+ FLUENT_FIELD(TSortColumns, SortBy);
+
+ ///
+ /// @brief Columns to group rows by.
+ FLUENT_FIELD(TSortColumns, ReduceBy);
+
+ ///
+ /// @brief Columns to join foreign tables by (must be prefix of `ReduceBy`).
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables
+ FLUENT_FIELD_OPTION(TSortColumns, JoinBy);
+
+ ///
+ /// @brief Guarantee to feed all rows with same `ReduceBy` columns to a single job (`true` by default).
+ FLUENT_FIELD_OPTION(bool, EnableKeyGuarantee);
+
+ ///
+ /// @brief Recommended number of jobs to run.
+ ///
+ /// `JobCount` has higher priority than @ref NYT::TReduceOperationSpecBase::DataSizePerJob.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui32, JobCount);
+
+ ///
+ /// @brief Recommended data size for each job.
+ ///
+ /// `DataSizePerJob` has lower priority than @ref NYT::TReduceOperationSpecBase::JobCount.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui64, DataSizePerJob);
+};
+
+///
+/// @brief Spec of Reduce operation (combines TReduceOperationSpecBase, TOperationIOSpec and TUserJobFormatHintsBase).
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/reduce
+struct TReduceOperationSpec
+ : public TReduceOperationSpecBase<TReduceOperationSpec>
+ , public TOperationIOSpec<TReduceOperationSpec>
+ , public TUserJobFormatHintsBase<TReduceOperationSpec>
+{ };
+
+///
+/// @brief Spec of raw Reduce operation (combines TReduceOperationSpecBase and TSimpleRawOperationIoSpec).
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/reduce
+struct TRawReduceOperationSpec
+ : public TReduceOperationSpecBase<TRawReduceOperationSpec>
+ , public TSimpleRawOperationIoSpec<TRawReduceOperationSpec>
+{ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Base spec of JoinReduce operation (shared by TJoinReduceOperationSpec and TRawJoinReduceOperationSpec).
+///
+/// @deprecated Instead the user should run a reduce operation
+/// with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables
+template <typename TDerived>
+struct TJoinReduceOperationSpecBase
+ : public TUserOperationSpecBase<TDerived>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ ///
+ /// @brief Spec of reduce job.
+ FLUENT_FIELD(TUserJobSpec, ReducerSpec);
+
+ ///
+ /// @brief Columns to join foreign tables by (must be prefix of `ReduceBy`).
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables
+ FLUENT_FIELD(TSortColumns, JoinBy);
+
+ ///
+ /// @brief Recommended number of jobs to run.
+ ///
+ /// `JobCount` has higher priority than @ref NYT::TJoinReduceOperationSpecBase::DataSizePerJob.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui32, JobCount);
+
+ ///
+ /// @brief Recommended data size for each job.
+ ///
+ /// `DataSizePerJob` has lower priority than @ref NYT::TJoinReduceOperationSpecBase::JobCount.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui64, DataSizePerJob);
+};
+
+///
+/// @brief Spec of JoinReduce operation (combines TJoinReduceOperationSpecBase, TOperationIOSpec and TUserJobFormatHintsBase).
+///
+/// @deprecated Instead the user should run a reduce operation
+/// with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables
+struct TJoinReduceOperationSpec
+ : public TJoinReduceOperationSpecBase<TJoinReduceOperationSpec>
+ , public TOperationIOSpec<TJoinReduceOperationSpec>
+ , public TUserJobFormatHintsBase<TJoinReduceOperationSpec>
+{ };
+
+///
+/// @brief Spec of raw JoinReduce operation (combines TJoinReduceOperationSpecBase and TSimpleRawOperationIoSpec).
+///
+/// @deprecated Instead the user should run a reduce operation
+/// with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables
+struct TRawJoinReduceOperationSpec
+ : public TJoinReduceOperationSpecBase<TRawJoinReduceOperationSpec>
+ , public TSimpleRawOperationIoSpec<TRawJoinReduceOperationSpec>
+{ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Base spec of MapReduce operation (shared by TMapReduceOperationSpec and TRawMapReduceOperationSpec).
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce
+template <typename TDerived>
+struct TMapReduceOperationSpecBase
+ : public TUserOperationSpecBase<TDerived>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TDerived;
+ /// @endcond
+
+ ///
+ /// @brief Spec of map job.
+ FLUENT_FIELD(TUserJobSpec, MapperSpec);
+
+ ///
+ /// @brief Spec of reduce job.
+ FLUENT_FIELD(TUserJobSpec, ReducerSpec);
+
+ ///
+ /// @brief Spec of reduce combiner.
+ FLUENT_FIELD(TUserJobSpec, ReduceCombinerSpec);
+
+ ///
+ /// @brief Columns to sort rows by (must include `ReduceBy` as prefix).
+ FLUENT_FIELD(TSortColumns, SortBy);
+
+ ///
+ /// @brief Columns to group rows by.
+ FLUENT_FIELD(TSortColumns, ReduceBy);
+
+ ///
+ /// @brief Recommended number of map jobs to run.
+ ///
+ /// `MapJobCount` has higher priority than @ref NYT::TMapReduceOperationSpecBase::DataSizePerMapJob.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui32, MapJobCount);
+
+ ///
+ /// @brief Recommended data size for each map job.
+ ///
+ /// `DataSizePerMapJob` has lower priority than @ref NYT::TMapReduceOperationSpecBase::MapJobCount.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui64, DataSizePerMapJob);
+
+ ///
+ /// @brief Recommended number of intermediate data partitions.
+ FLUENT_FIELD_OPTION(ui64, PartitionCount);
+
+ ///
+ /// @brief Recommended size of intermediate data partitions.
+ FLUENT_FIELD_OPTION(ui64, PartitionDataSize);
+
+ ///
+ /// @brief Account to use for intermediate data.
+ FLUENT_FIELD_OPTION(TString, IntermediateDataAccount);
+
+ ///
+ /// @brief Replication factor for intermediate data (1 by default).
+ FLUENT_FIELD_OPTION(ui64, IntermediateDataReplicationFactor);
+
+ ///
+ /// @brief Recommended size of data to be passed to a single reduce combiner.
+ FLUENT_FIELD_OPTION(ui64, DataSizePerSortJob);
+
+ ///
+ /// @brief Whether to guarantee the order of rows passed to mapper matches the order in the table.
+ ///
+ /// @see @ref NYT::TMapOperationSpecBase::Ordered for more info.
+ FLUENT_FIELD_OPTION(bool, Ordered);
+
+ ///
+ /// @brief Guarantee to run reduce combiner before reducer.
+ FLUENT_FIELD_OPTION(bool, ForceReduceCombiners);
+};
+
+///
+/// @brief Spec of MapReduce operation (combines TMapReduceOperationSpecBase, TOperationIOSpec and TIntermediateTablesHintSpec).
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce
+struct TMapReduceOperationSpec
+ : public TMapReduceOperationSpecBase<TMapReduceOperationSpec>
+ , public TOperationIOSpec<TMapReduceOperationSpec>
+ , public TIntermediateTablesHintSpec<TMapReduceOperationSpec>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TMapReduceOperationSpec;
+ /// @endcond
+
+ ///
+ /// @brief Format hints for mapper (default-constructed `TUserJobFormatHints` by default).
+ FLUENT_FIELD_DEFAULT(TUserJobFormatHints, MapperFormatHints, TUserJobFormatHints());
+
+ ///
+ /// @brief Format hints for reducer (default-constructed `TUserJobFormatHints` by default).
+ FLUENT_FIELD_DEFAULT(TUserJobFormatHints, ReducerFormatHints, TUserJobFormatHints());
+
+ ///
+ /// @brief Format hints for reduce combiner (default-constructed `TUserJobFormatHints` by default).
+ FLUENT_FIELD_DEFAULT(TUserJobFormatHints, ReduceCombinerFormatHints, TUserJobFormatHints());
+};
+
+///
+/// @brief Spec of raw MapReduce operation (combines TMapReduceOperationSpecBase and TRawMapReduceOperationIoSpec).
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce
+struct TRawMapReduceOperationSpec
+ : public TMapReduceOperationSpecBase<TRawMapReduceOperationSpec>
+ , public TRawMapReduceOperationIoSpec<TRawMapReduceOperationSpec>
+{ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Schema inference mode for the output table schema of an operation.
+///
+/// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference
+enum class ESchemaInferenceMode : int
+{
+ FromInput /* "from_input" */,
+ FromOutput /* "from_output" */,
+ Auto /* "auto" */,
+};
+
+///
+/// @brief Spec of Sort operation.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/sort
+struct TSortOperationSpec
+ : TOperationSpecBase<TSortOperationSpec>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TSortOperationSpec;
+ /// @endcond
+
+ ///
+ /// @brief Paths to input tables.
+ FLUENT_VECTOR_FIELD(TRichYPath, Input);
+
+ ///
+ /// @brief Path to output table.
+ FLUENT_FIELD(TRichYPath, Output);
+
+ ///
+ /// @brief Columns to sort table by.
+ FLUENT_FIELD(TSortColumns, SortBy);
+
+ ///
+ /// @brief Recommended number of intermediate data partitions.
+ FLUENT_FIELD_OPTION(ui64, PartitionCount);
+
+ ///
+ /// @brief Recommended size of intermediate data partitions.
+ FLUENT_FIELD_OPTION(ui64, PartitionDataSize);
+
+ ///
+ /// @brief Recommended number of partition jobs to run.
+ ///
+ /// `PartitionJobCount` has higher priority than @ref NYT::TSortOperationSpec::DataSizePerPartitionJob.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui64, PartitionJobCount);
+
+ ///
+ /// @brief Recommended data size for each partition job.
+ ///
+ /// `DataSizePerPartitionJob` has lower priority than @ref NYT::TSortOperationSpec::PartitionJobCount.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui64, DataSizePerPartitionJob);
+
+ ///
+ /// @brief Inference mode for output table schema.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference
+ FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode);
+
+ ///
+ /// @brief Account to use for intermediate data.
+ FLUENT_FIELD_OPTION(TString, IntermediateDataAccount);
+
+ ///
+ /// @brief Replication factor for intermediate data (1 by default).
+ FLUENT_FIELD_OPTION(ui64, IntermediateDataReplicationFactor);
+};
+
+
+///
+/// @brief Merge mode (used by @ref NYT::TMergeOperationSpec::Mode).
+enum EMergeMode : int
+{
+ MM_UNORDERED /* "unordered" */,
+ MM_ORDERED /* "ordered" */,
+ MM_SORTED /* "sorted" */,
+};
+
+///
+/// @brief Spec of Merge operation.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/merge
+struct TMergeOperationSpec
+ : TOperationSpecBase<TMergeOperationSpec>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TMergeOperationSpec;
+ /// @endcond
+
+ ///
+ /// @brief Paths to input tables.
+ FLUENT_VECTOR_FIELD(TRichYPath, Input);
+
+ ///
+ /// @brief Path to output table.
+ FLUENT_FIELD(TRichYPath, Output);
+
+ ///
+ /// @brief Columns by which to merge (for @ref NYT::EMergeMode::MM_SORTED).
+ FLUENT_FIELD(TSortColumns, MergeBy);
+
+ ///
+ /// @brief Merge mode (`MM_UNORDERED` by default).
+ FLUENT_FIELD_DEFAULT(EMergeMode, Mode, MM_UNORDERED);
+
+ ///
+ /// @brief Combine output chunks into larger ones (`false` by default).
+ FLUENT_FIELD_DEFAULT(bool, CombineChunks, false);
+
+ ///
+ /// @brief Guarantee that all input chunks will be read (`false` by default).
+ FLUENT_FIELD_DEFAULT(bool, ForceTransform, false);
+
+ ///
+ /// @brief Recommended number of jobs to run.
+ ///
+ /// `JobCount` has higher priority than @ref NYT::TMergeOperationSpec::DataSizePerJob.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui32, JobCount);
+
+ ///
+ /// @brief Recommended data size for each job.
+ ///
+ /// `DataSizePerJob` has lower priority than @ref NYT::TMergeOperationSpec::JobCount.
+ /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits.
+ FLUENT_FIELD_OPTION(ui64, DataSizePerJob);
+
+ ///
+ /// @brief Inference mode for output table schema.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference
+ FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode);
+};
+
+///
+/// @brief Spec of Erase operation.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/erase
+struct TEraseOperationSpec
+ : TOperationSpecBase<TEraseOperationSpec>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TEraseOperationSpec;
+ /// @endcond
+
+ ///
+ /// @brief Which table (or row range) to erase.
+ FLUENT_FIELD(TRichYPath, TablePath);
+
+ ///
+ /// @brief Combine output chunks into larger ones (`false` by default).
+ FLUENT_FIELD_DEFAULT(bool, CombineChunks, false);
+
+ ///
+ /// @brief Inference mode for output table schema.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference
+ FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode);
+};
+
+///
+/// @brief Spec of RemoteCopy operation.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/remote_copy
+struct TRemoteCopyOperationSpec
+ : TOperationSpecBase<TRemoteCopyOperationSpec>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TRemoteCopyOperationSpec;
+ /// @endcond
+
+ ///
+ /// @brief Source cluster name.
+ FLUENT_FIELD(TString, ClusterName);
+
+ ///
+ /// @brief Network to use for copy (all remote cluster nodes must have it configured).
+ FLUENT_FIELD_OPTION(TString, NetworkName);
+
+ ///
+ /// @brief Paths to input tables.
+ FLUENT_VECTOR_FIELD(TRichYPath, Input);
+
+ ///
+ /// @brief Path to output table.
+ FLUENT_FIELD(TRichYPath, Output);
+
+ ///
+ /// @brief Inference mode for output table schema.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference
+ FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode);
+
+ ///
+ /// @brief Copy user attributes from input to output table (allowed only for single input table; `false` by default).
+ FLUENT_FIELD_DEFAULT(bool, CopyAttributes, false);
+
+ ///
+ /// @brief Names of user attributes to copy from input to output table.
+ ///
+ /// @note This option makes sense only when @ref NYT::TRemoteCopyOperationSpec::CopyAttributes is set to `true`.
+ FLUENT_VECTOR_FIELD(TString, AttributeKey);
+
+private:
+
+ ///
+ /// @brief Config for remote cluster connection (declared in the private section).
+ FLUENT_FIELD_OPTION(TNode, ClusterConnection);
+};
+
+class IVanillaJobBase;
+
+///
+/// @brief Task of Vanilla operation.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/vanilla
+struct TVanillaTask
+ : public TOperationOutputSpecBase
+ , public TUserJobOutputFormatHintsBase<TVanillaTask>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TVanillaTask;
+ /// @endcond
+
+ ///
+ /// @brief Add output table path and specify the task output type (e.g. TMyProtoMessage).
+ template <class T>
+ TSelf& AddOutput(const TRichYPath& path);
+
+ ///
+ /// @brief Add output table path as structured path.
+ TSelf& AddStructuredOutput(TStructuredTablePath path);
+
+ ///
+ /// @brief Set output table path no. `tableIndex` and specify the task output type (e.g. TMyProtoMessage).
+ template <class T>
+ TSelf& SetOutput(size_t tableIndex, const TRichYPath& path);
+
+ ///
+ /// @brief Task name.
+ FLUENT_FIELD(TString, Name);
+
+ ///
+ /// @brief Job to be executed in this task.
+ FLUENT_FIELD(::TIntrusivePtr<IVanillaJobBase>, Job);
+
+ ///
+ /// @brief User job spec.
+ FLUENT_FIELD(TUserJobSpec, Spec);
+
+ ///
+ /// @brief Number of jobs to run and wait for successful completion.
+ ///
+ /// @note If @ref NYT::TUserOperationSpecBase::FailOnJobRestart is `false`, a failed job will be restarted
+ /// and will not count towards this amount.
+ FLUENT_FIELD(ui64, JobCount);
+
+ ///
+ /// @brief Network project name.
+ FLUENT_FIELD(TMaybe<TString>, NetworkProject);
+
+};
+
+///
+/// @brief Spec of Vanilla operation.
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/vanilla
+struct TVanillaOperationSpec
+ : TUserOperationSpecBase<TVanillaOperationSpec>
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TVanillaOperationSpec;
+ /// @endcond
+
+ ///
+ /// @brief Descriptions of tasks (@ref NYT::TVanillaTask) to run in this operation.
+ FLUENT_VECTOR_FIELD(TVanillaTask, Task);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Options for @ref NYT::IOperationClient::Map and other operation start commands.
+struct TOperationOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TOperationOptions;
+ /// @endcond
+
+ ///
+ /// @brief Additional fields to put into the operation spec.
+ FLUENT_FIELD_OPTION(TNode, Spec);
+
+ ///
+ /// @brief Start operation mode.
+ enum class EStartOperationMode : int
+ {
+ ///
+ /// @brief Prepare operation asynchronously. Call `IOperation::Start()` to start the operation.
+ AsyncPrepare,
+
+ ///
+ /// @brief Prepare and start operation asynchronously. Don't wait for operation completion.
+ AsyncStart,
+
+ ///
+ /// @brief Prepare and start operation synchronously. Don't wait for operation completion.
+ SyncStart,
+
+ ///
+ /// @brief Prepare, start and wait for operation completion synchronously.
+ SyncWait,
+ };
+
+ ///
+ /// @brief Start operation mode (`SyncWait` by default).
+ FLUENT_FIELD_DEFAULT(EStartOperationMode, StartOperationMode, EStartOperationMode::SyncWait);
+
+ ///
+ /// @brief Set start operation mode to `SyncWait` if `value` is `true` and to `SyncStart` otherwise.
+ ///
+ /// @deprecated Use StartOperationMode() instead.
+ TSelf& Wait(bool value) {
+ StartOperationMode_ = value ? EStartOperationMode::SyncWait : EStartOperationMode::SyncStart;
+ return static_cast<TSelf&>(*this);
+ }
+
+ ///
+ ///
+ /// @brief Use format from table attribute (for YAMR-like format).
+ ///
+ /// @deprecated
+ FLUENT_FIELD_DEFAULT(bool, UseTableFormats, false);
+
+ ///
+ /// @brief Prefix for bash command running the jobs.
+ ///
+ /// Can be overridden for the specific job type in the @ref NYT::TUserJobSpec.
+ FLUENT_FIELD(TString, JobCommandPrefix);
+
+ ///
+ /// @brief Suffix for bash command running the jobs.
+ ///
+ /// Can be overridden for the specific job type in the @ref NYT::TUserJobSpec.
+ FLUENT_FIELD(TString, JobCommandSuffix);
+
+ ///
+ /// @brief Put all files required by the job into tmpfs (`false` by default).
+ ///
+ /// This option can be set globally using @ref NYT::TConfig::MountSandboxInTmpfs.
+ /// @see https://yt.yandex-team.ru/docs/problems/woodpeckers
+ FLUENT_FIELD_DEFAULT(bool, MountSandboxInTmpfs, false);
+
+ ///
+ /// @brief Path to directory to store temporary files.
+ FLUENT_FIELD_OPTION(TString, FileStorage);
+
+ ///
+ /// @brief Expiration timeout for uploaded files.
+ FLUENT_FIELD_OPTION(TDuration, FileExpirationTimeout);
+
+ ///
+ /// @brief Info to be passed securely to the job.
+ FLUENT_FIELD_OPTION(TNode, SecureVault);
+
+ ///
+ /// @brief File cache mode.
+ enum class EFileCacheMode : int
+ {
+ ///
+ /// @brief Use YT API commands "get_file_from_cache" and "put_file_to_cache".
+ ApiCommandBased,
+
+ ///
+ /// @brief Upload files to random paths inside @ref NYT::TOperationOptions::FileStorage without caching.
+ CachelessRandomPathUpload,
+ };
+
+ ///
+ /// @brief File cache mode (`ApiCommandBased` by default).
+ FLUENT_FIELD_DEFAULT(EFileCacheMode, FileCacheMode, EFileCacheMode::ApiCommandBased);
+
+ ///
+ /// @brief Id of transaction within which all Cypress file storage entries will be checked/created.
+ ///
+ /// By default, the root transaction is used.
+ ///
+ /// @note Set a specific transaction only if you:
+ /// 1. specify non-default file storage path in @ref NYT::TOperationOptions::FileStorage or in @ref NYT::TConfig::RemoteTempFilesDirectory;
+ /// 2. use `CachelessRandomPathUpload` caching mode (@ref NYT::TOperationOptions::FileCacheMode).
+ FLUENT_FIELD(TTransactionId, FileStorageTransactionId);
+
+ ///
+ /// @brief Ensure stderr and core tables exist before starting operation (`true` by default).
+ ///
+ /// If set to `false`, it is the user's responsibility to ensure these tables exist.
+ FLUENT_FIELD_DEFAULT(bool, CreateDebugOutputTables, true);
+
+ ///
+ /// @brief Ensure output tables exist before starting operation (`true` by default).
+ ///
+ /// If set to `false`, it is the user's responsibility to ensure output tables exist.
+ FLUENT_FIELD_DEFAULT(bool, CreateOutputTables, true);
+
+ ///
+ /// @brief Try to infer schema of a nonexistent table from the type of written rows.
+ ///
+ /// @note Default values for this option may differ depending on the row type.
+ /// For protobuf it's currently `false` by default.
+ FLUENT_FIELD_OPTION(bool, InferOutputSchema);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Get the operation's secure vault (specified in @ref NYT::TOperationOptions::SecureVault) from inside a job.
+const TNode& GetJobSecureVault();
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Context passed to @ref NYT::IRawJob::Do.
+class TRawJobContext
+{
+public:
+ explicit TRawJobContext(size_t outputTableCount);
+
+ ///
+ /// @brief Get the file corresponding to the input stream.
+ const TFile& GetInputFile() const;
+
+ ///
+ /// @brief Get the list of files corresponding to the output streams.
+ const TVector<TFile>& GetOutputFileList() const;
+
+private:
+ TFile InputFile_;
+ TVector<TFile> OutputFileList_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Interface for classes that can be Saved/Loaded (to be used with @ref Y_SAVELOAD_JOB).
+class ISerializableForJob
+{
+public:
+ virtual ~ISerializableForJob() = default;
+
+ ///
+ /// @brief Dump state to an output stream so that it can be restored in the job.
+ virtual void Save(IOutputStream& stream) const = 0;
+
+ ///
+ /// @brief Load state from a stream (counterpart of `Save`).
+ virtual void Load(IInputStream& stream) = 0;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Provider of information about operation inputs/outputs during @ref NYT::IJob::PrepareOperation.
+class IOperationPreparationContext
+{
+public:
+ virtual ~IOperationPreparationContext() = default;
+
+ /// @brief Get the number of input tables.
+ virtual int GetInputCount() const = 0;
+
+ /// @brief Get the number of output tables.
+ virtual int GetOutputCount() const = 0;
+
+ /// @brief Get the schema of input table no. `index`.
+ virtual const TTableSchema& GetInputSchema(int index) const = 0;
+
+ /// @brief Get the schemas of all the input tables.
+ virtual const TVector<TTableSchema>& GetInputSchemas() const = 0;
+
+ /// @brief Get the path to input table no. `index` if available (`Nothing()` for intermediate tables).
+ virtual TMaybe<TYPath> GetInputPath(int index) const = 0;
+
+ /// @brief Get the path to output table no. `index` if available (`Nothing()` for intermediate tables).
+ virtual TMaybe<TYPath> GetOutputPath(int index) const = 0;
+};
+
+///
+/// @brief Fluent builder class for @ref NYT::IJob::PrepareOperation.
+///
+/// @note Method calls are supposed to be chained.
+class TJobOperationPreparer
+{
+public:
+
+ ///
+ /// @brief Group of input tables that allows specifying properties on all of them at once.
+ ///
+ /// The instances are created with @ref NYT::TJobOperationPreparer::BeginInputGroup, not directly.
+ class TInputGroup
+ {
+ public:
+ TInputGroup(TJobOperationPreparer& preparer, TVector<int> indices);
+
+ /// @brief Specify the type of input rows.
+ template <typename TRow>
+ TInputGroup& Description();
+
+ /// @brief Specify renaming of input columns.
+ TInputGroup& ColumnRenaming(const THashMap<TString, TString>& renaming);
+
+ /// @brief Specify what input columns to send to job.
+ ///
+ /// @note Filter is applied before renaming, so it must specify original column names.
+ TInputGroup& ColumnFilter(const TVector<TString>& columns);
+
+ /// @brief Finish describing the input group.
+ TJobOperationPreparer& EndInputGroup();
+
+ private:
+ TJobOperationPreparer& Preparer_;
+ TVector<int> Indices_;
+ };
+
+ ///
+ /// @brief Group of output tables that allows specifying properties on all of them at once.
+ ///
+ /// The instances are created with @ref NYT::TJobOperationPreparer::BeginOutputGroup, not directly.
+ class TOutputGroup
+ {
+ public:
+ TOutputGroup(TJobOperationPreparer& preparer, TVector<int> indices);
+
+ /// @brief Specify the type of output rows.
+ ///
+ /// @tparam TRow type of output rows from tables of this group.
+ /// @param inferSchema Infer schema from `TRow` and specify it for these output tables.
+ template <typename TRow>
+ TOutputGroup& Description(bool inferSchema = true);
+
+ /// @brief Specify schema for these tables.
+ TOutputGroup& Schema(const TTableSchema& schema);
+
+ /// @brief Specify that all the tables in this group are unschematized.
+ ///
+ /// It is equivalent to `.Schema(TTableSchema().Strict(false))`.
+ TOutputGroup& NoSchema();
+
+ /// @brief Finish describing the output group.
+ TJobOperationPreparer& EndOutputGroup();
+
+ private:
+ TJobOperationPreparer& Preparer_;
+ TVector<int> Indices_;
+ };
+
+public:
+ explicit TJobOperationPreparer(const IOperationPreparationContext& context);
+
+ /// @brief Begin input group consisting of tables with indices `[begin, end)`.
+ ///
+ /// @param begin First index.
+ /// @param end Index after the last one.
+ TInputGroup BeginInputGroup(int begin, int end);
+
+ /// @brief Begin input group consisting of tables with indices from `indices`.
+ ///
+ /// @tparam TCont Container with integers. Must support `std::begin` and `std::end` functions.
+ /// @param indices Indices of tables to include in the group.
+ template <typename TCont>
+ TInputGroup BeginInputGroup(const TCont& indices);
+
+ /// @brief Begin output group consisting of tables with indices `[begin, end)`.
+ ///
+ /// @param begin First index.
+ /// @param end Index after the last one.
+ TOutputGroup BeginOutputGroup(int begin, int end);
+
+ /// @brief Begin output group consisting of tables with indices from `indices`.
+ ///
+ /// @tparam TCont Container with integers. Must support `std::begin` and `std::end` functions.
+ /// @param indices Indices of tables to include in the group.
+ template <typename TCont>
+ TOutputGroup BeginOutputGroup(const TCont& indices);
+
+ /// @brief Specify the schema for output table no. `tableIndex`.
+ ///
+ /// @note All the output schemas must be specified either with this method, `NoOutputSchema` or `OutputDescription` with `inferSchema == true`.
+ TJobOperationPreparer& OutputSchema(int tableIndex, TTableSchema schema);
+
+ /// @brief Mark the output table no. `tableIndex` as unschematized.
+ TJobOperationPreparer& NoOutputSchema(int tableIndex);
+
+ /// @brief Specify renaming of input columns for table no. `tableIndex`.
+ TJobOperationPreparer& InputColumnRenaming(int tableIndex, const THashMap<TString, TString>& renaming);
+
+ /// @brief Specify what input columns of table no. `tableIndex` to send to job.
+ ///
+ /// @note Filter is applied before renaming, so it must specify original column names.
+ TJobOperationPreparer& InputColumnFilter(int tableIndex, const TVector<TString>& columns);
+
+ /// @brief Specify the type of input rows for table no. `tableIndex`.
+ ///
+ /// @tparam TRow type of input rows.
+ template <typename TRow>
+ TJobOperationPreparer& InputDescription(int tableIndex);
+
+ /// @brief Specify the type of output rows for table no. `tableIndex`.
+ ///
+ /// @tparam TRow type of output rows.
+ /// @param inferSchema Infer schema from `TRow` and specify it for the output table.
+ template <typename TRow>
+ TJobOperationPreparer& OutputDescription(int tableIndex, bool inferSchema = true);
+
+ /// @brief Set type of output rows for table no. `tableIndex` to TNode.
+ ///
+ /// @note Set schema via `OutputSchema` if needed.
+ TJobOperationPreparer& NodeOutput(int tableIndex);
+
+ /// @brief Specify input format hints.
+ ///
+ /// These hints have lower priority than ones specified in spec.
+ TJobOperationPreparer& InputFormatHints(TFormatHints hints);
+
+ /// @brief Specify output format hints.
+ ///
+ /// These hints have lower priority than ones specified in spec.
+ TJobOperationPreparer& OutputFormatHints(TFormatHints hints);
+
+ /// @brief Specify format hints.
+ ///
+ /// These hints have lower priority than ones specified in spec.
+ TJobOperationPreparer& FormatHints(TUserJobFormatHints newFormatHints);
+
+ /// @name "Private" members
+ /// The following methods should not be used by clients in @ref NYT::IJob::PrepareOperation
+ ///@{
+
+ /// @brief Finish the building process.
+ void Finish();
+
+ /// @brief Get output table schemas as specified by the user.
+ TVector<TTableSchema> GetOutputSchemas();
+
+ /// @brief Get input column renamings as specified by the user.
+ const TVector<THashMap<TString, TString>>& GetInputColumnRenamings() const;
+
+ /// @brief Get input column filters as specified by the user.
+ const TVector<TMaybe<TVector<TString>>>& GetInputColumnFilters() const;
+
+ /// @brief Get input table descriptions as specified by the user.
+ const TVector<TMaybe<TTableStructure>>& GetInputDescriptions() const;
+
+ /// @brief Get output table descriptions as specified by the user.
+ const TVector<TMaybe<TTableStructure>>& GetOutputDescriptions() const;
+
+ /// @brief Get format hints as specified by the user.
+ const TUserJobFormatHints& GetFormatHints() const;
+
+ ///@}
+private:
+
+ /// @brief Validate that schema for output table no. `tableIndex` has not been set yet.
+ void ValidateMissingOutputSchema(int tableIndex) const;
+
+ /// @brief Validate that description for input table no. `tableIndex` has not been set yet.
+ void ValidateMissingInputDescription(int tableIndex) const;
+
+ /// @brief Validate that description for output table no. `tableIndex` has not been set yet.
+ void ValidateMissingOutputDescription(int tableIndex) const;
+
+ /// @brief Validate that `tableIndex` is in correct range for input table indices.
+ ///
+ /// @param message Message to add to the exception in case of violation.
+ void ValidateInputTableIndex(int tableIndex, TStringBuf message) const;
+
+ /// @brief Validate that `tableIndex` is in correct range for output table indices.
+ ///
+ /// @param message Message to add to the exception in case of violation.
+ void ValidateOutputTableIndex(int tableIndex, TStringBuf message) const;
+
+ /// @brief Validate that all the output schemas have been set.
+ void FinallyValidate() const;
+
+ static TTableSchema EmptyNonstrictSchema();
+
+private:
+ const IOperationPreparationContext& Context_;
+
+ TVector<TMaybe<TTableSchema>> OutputSchemas_;
+ TVector<THashMap<TString, TString>> InputColumnRenamings_;
+ TVector<TMaybe<TVector<TString>>> InputColumnFilters_;
+ TVector<TMaybe<TTableStructure>> InputTableDescriptions_;
+ TVector<TMaybe<TTableStructure>> OutputTableDescriptions_;
+ TUserJobFormatHints FormatHints_ = {};
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Interface for all user jobs.
+class IJob
+ : public TThrRefBase
+{
+public:
+
+ ///
+ /// @brief Type of job.
+ enum EType
+ {
+ Mapper,
+ Reducer,
+ ReducerAggregator,
+ RawJob,
+ VanillaJob,
+ };
+
+ ///
+ /// @brief Save job state to stream to be restored on cluster nodes (the default implementation ignores `stream`).
+ virtual void Save(IOutputStream& stream) const
+ {
+ Y_UNUSED(stream);
+ }
+
+ ///
+ /// @brief Restore job state from a stream (the default implementation ignores `stream`).
+ virtual void Load(IInputStream& stream)
+ {
+ Y_UNUSED(stream);
+ }
+
+ ///
+ /// @brief Get operation secure vault (specified in @ref NYT::TOperationOptions::SecureVault) inside a job; forwards to `GetJobSecureVault()`.
+ const TNode& SecureVault() const
+ {
+ return GetJobSecureVault();
+ }
+
+ ///
+ /// @brief Get number of output tables (checks via `Y_VERIFY` that `NDetail::OutputTableCount` is positive).
+ i64 GetOutputTableCount() const
+ {
+ Y_VERIFY(NDetail::OutputTableCount > 0);
+
+ return NDetail::OutputTableCount;
+ }
+
+ ///
+ /// @brief Method allowing user to control some properties of input and output tables and formats.
+ ///
+ /// User can override this method in their job class to:
+ /// - specify output table schemas.
+ /// The most natural way is usually through @ref NYT::TJobOperationPreparer::OutputDescription (especially for protobuf),
+ /// but you can use @ref NYT::TJobOperationPreparer::OutputSchema directly
+ /// - specify output row type (@ref NYT::TJobOperationPreparer::OutputDescription)
+ /// - specify input row type (@ref NYT::TJobOperationPreparer::InputDescription)
+ /// - specify input column filter and renaming (@ref NYT::TJobOperationPreparer::InputColumnFilter and @ref NYT::TJobOperationPreparer::InputColumnRenaming)
+ /// - specify format hints (@ref NYT::TJobOperationPreparer::InputFormatHints,
+ /// @ref NYT::TJobOperationPreparer::OutputFormatHints and @ref NYT::TJobOperationPreparer::FormatHints)
+ /// - maybe something more, cf. the methods of @ref NYT::TJobOperationPreparer.
+ ///
+ /// If one has several similar tables, groups can be used.
+ /// Groups are delimited by @ref NYT::TJobOperationPreparer::BeginInputGroup /
+ /// @ref NYT::TJobOperationPreparer::TInputGroup::EndInputGroup and
+ /// @ref NYT::TJobOperationPreparer::BeginOutputGroup /
+ /// @ref NYT::TJobOperationPreparer::TOutputGroup::EndOutputGroup.
+ /// Example:
+ /// @code{.cpp}
+ /// preparer
+ /// .BeginInputGroup({1,2,4,8})
+ /// .ColumnRenaming({{"a", "b"}, {"c", "d"}})
+ /// .ColumnFilter({"a", "c"})
+ /// .EndInputGroup();
+ /// @endcode
+ ///
+ /// @note All the output table schemas must be set
+ /// (possibly as empty nonstrict using @ref NYT::TJobOperationPreparer::NoOutputSchema or
+ /// @ref NYT::TJobOperationPreparer::TOutputGroup::NoSchema).
+ /// By default all the output table schemas are marked as empty nonstrict.
+ virtual void PrepareOperation(const IOperationPreparationContext& context, TJobOperationPreparer& preparer) const;
+};
+
+///
+/// @brief Declare which fields of the job class being defined are saved and restored on the cluster node.
+#define Y_SAVELOAD_JOB(...) \
+ virtual void Save(IOutputStream& stream) const override { Save(&stream); } \
+ virtual void Load(IInputStream& stream) override { Load(&stream); } \
+ Y_PASS_VA_ARGS(Y_SAVELOAD_DEFINE(__VA_ARGS__))
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Interface for jobs with typed inputs and outputs.
+class IStructuredJob
+ : public IJob
+{
+public:
+ ///
+ /// @brief These methods are called when creating table reader and writer for the job.
+ ///
+ /// Override them if you want to implement custom input/output logic (e.g. additional buffering).
+ virtual TRawTableReaderPtr CreateCustomRawJobReader(int fd) const;
+ virtual THolder<IProxyOutput> CreateCustomRawJobWriter(size_t outputTableCount) const;
+
+ virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const = 0;
+ virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const = 0;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Create default raw job reader (`fd` defaults to 0).
+TRawTableReaderPtr CreateRawJobReader(int fd = 0);
+
+///
+/// @brief Create default raw job writer for `outputTableCount` output tables.
+THolder<IProxyOutput> CreateRawJobWriter(size_t outputTableCount);
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Base interface for structured (typed) map jobs; non-template base of @ref NYT::IMapper.
+class IMapperBase
+ : public IStructuredJob
+{ };
+
+///
+/// @brief Base interface for structured (typed) map jobs with given reader and writer.
+template <class TR, class TW>
+class IMapper
+ : public IMapperBase
+{
+public:
+ using TReader = TR;
+ using TWriter = TW;
+
+public:
+ /// Type of job implemented by this class.
+ static constexpr EType JobType = EType::Mapper;
+
+ ///
+ /// @brief This method is called before feeding input rows to mapper (before `Do` method); does nothing by default.
+ virtual void Start(TWriter* writer)
+ {
+ Y_UNUSED(writer);
+ }
+
+ ///
+ /// @brief This method is called exactly once for the whole job input.
+ ///
+ /// Read input rows from `reader` and write output ones to `writer`.
+ virtual void Do(TReader* reader, TWriter* writer) = 0;
+
+ ///
+ /// @brief This method is called after feeding input rows to mapper (after `Do` method); does nothing by default.
+ virtual void Finish(TWriter* writer)
+ {
+ Y_UNUSED(writer);
+ }
+
+ virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override;
+ virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Base interface for structured (typed) reduce jobs.
+///
+/// It is the common base for @ref NYT::IReducer and @ref NYT::IAggregatorReducer.
+class IReducerBase
+ : public IStructuredJob
+{ };
+
+///
+/// @brief Base interface for structured (typed) reduce jobs with given reader and writer.
+template <class TR, class TW>
+class IReducer
+ : public IReducerBase
+{
+public:
+ using TReader = TR;
+ using TWriter = TW;
+
+public:
+ /// Type of job implemented by this class.
+ static constexpr EType JobType = EType::Reducer;
+
+public:
+
+ ///
+ /// @brief This method is called before feeding input rows to reducer (before `Do` method); does nothing by default.
+ virtual void Start(TWriter* writer)
+ {
+ Y_UNUSED(writer);
+ }
+
+ ///
+ /// @brief This method is called exactly once for each range with the same value of `ReduceBy` (or `JoinBy`) keys.
+ virtual void Do(TReader* reader, TWriter* writer) = 0;
+
+ ///
+ /// @brief This method is called after feeding input rows to reducer (after `Do` method); does nothing by default.
+ virtual void Finish(TWriter* writer)
+ {
+ Y_UNUSED(writer);
+ }
+
+ ///
+ /// @brief Refuse to process the remaining row ranges and finish the job (successfully).
+ void Break();
+
+ virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override;
+ virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Base interface of jobs used inside reduce operations.
+///
+/// Unlike @ref NYT::IReducer jobs, their `Do` method is called only once
+/// and takes the whole range of records split by key boundaries.
+///
+/// Template argument `TR` must be @ref NYT::TTableRangesReader.
+template <class TR, class TW>
+class IAggregatorReducer
+ : public IReducerBase
+{
+public:
+ using TReader = TR;
+ using TWriter = TW;
+
+public:
+ /// Type of job implemented by this class.
+ static constexpr EType JobType = EType::ReducerAggregator;
+
+public:
+ ///
+ /// @brief This method is called before feeding input rows to reducer (before `Do` method); does nothing by default.
+ virtual void Start(TWriter* writer)
+ {
+ Y_UNUSED(writer);
+ }
+
+ ///
+ /// @brief This method is called exactly once for the whole job input.
+ virtual void Do(TReader* reader, TWriter* writer) = 0;
+
+ ///
+ /// @brief This method is called after feeding input rows to reducer (after `Do` method); does nothing by default.
+ virtual void Finish(TWriter* writer)
+ {
+ Y_UNUSED(writer);
+ }
+
+ virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override;
+ virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Interface for raw jobs (i.e. reading and writing byte streams).
+class IRawJob
+ : public IJob
+{
+public:
+ /// Type of job implemented by this class.
+ static constexpr EType JobType = EType::RawJob;
+
+ ///
+ /// @brief This method is called exactly once for the whole job input; see @ref NYT::TRawJobContext for what `jobContext` provides.
+ virtual void Do(const TRawJobContext& jobContext) = 0;
+};
+
+///
+/// @brief Interface for jobs that run a given bash command.
+class ICommandJob
+ : public IJob
+{
+public:
+ ///
+ /// @brief Get the bash command to run.
+ ///
+ /// @note This method is called on the client side.
+ virtual const TString& GetCommand() const = 0;
+};
+
+///
+/// @brief Raw job executing a given bash command.
+///
+/// @note The binary will not be uploaded.
+class TCommandRawJob
+ : public IRawJob
+ , public ICommandJob
+{
+public:
+ ///
+ /// @brief Create job with specified command.
+ ///
+ /// @param command Bash command to run (empty by default).
+ explicit TCommandRawJob(TStringBuf command = {});
+
+ const TString& GetCommand() const override;
+ void Do(const TRawJobContext& jobContext) override;
+
+private:
+ TString Command_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Base interface for vanilla jobs (common base of all @ref NYT::IVanillaJob instantiations).
+///
+/// @see https://yt.yandex-team.ru/docs/description/mr/vanilla
+class IVanillaJobBase
+ : public virtual IStructuredJob
+{
+public:
+ /// Type of job implemented by this class.
+ static constexpr EType JobType = EType::VanillaJob;
+};
+
+template <class TW = void>
+class IVanillaJob;
+
+///
+/// @brief Interface of vanilla job without outputs (the `TW = void` specialization, which is also the default).
+template <>
+class IVanillaJob<void>
+ : public IVanillaJobBase
+{
+public:
+ ///
+ /// @brief This method is called exactly once for each vanilla job.
+ virtual void Do() = 0;
+
+ virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override;
+ virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override;
+};
+
+///
+/// @brief Vanilla job executing a given bash command.
+///
+/// @note The binary will not be uploaded.
+class TCommandVanillaJob
+ : public IVanillaJob<>
+ , public ICommandJob
+{
+public:
+ ///
+ /// @brief Create job with specified command.
+ ///
+ /// @param command Bash command to run (empty by default).
+ explicit TCommandVanillaJob(TStringBuf command = {});
+
+ const TString& GetCommand() const override;
+ void Do() override;
+
+private:
+ TString Command_;
+};
+
+///
+/// @brief Interface for vanilla jobs with output tables written through a writer of type `TW`.
+template <class TW>
+class IVanillaJob
+ : public IVanillaJobBase
+{
+public:
+ using TWriter = TW;
+
+public:
+ ///
+ /// @brief This method is called before `Do` method; does nothing by default.
+ virtual void Start(TWriter* /* writer */)
+ { }
+
+ ///
+ /// @brief This method is called exactly once for each vanilla job.
+ ///
+ /// Write output rows to `writer`.
+ virtual void Do(TWriter* writer) = 0;
+
+ ///
+ /// @brief This method is called after `Do` method; does nothing by default.
+ virtual void Finish(TWriter* /* writer */)
+ { }
+
+ virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override;
+ virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Attributes that can be requested for an operation (see @ref NYT::TOperationAttributeFilter).
+enum class EOperationAttribute : int
+{
+ Id /* "id" */,
+ Type /* "type" */,
+ State /* "state" */,
+ AuthenticatedUser /* "authenticated_user" */,
+ StartTime /* "start_time" */,
+ FinishTime /* "finish_time" */,
+ BriefProgress /* "brief_progress" */,
+ BriefSpec /* "brief_spec" */,
+ Suspended /* "suspended" */,
+ Result /* "result" */,
+ Progress /* "progress" */,
+ Events /* "events" */,
+ Spec /* "spec" */,
+ FullSpec /* "full_spec" */,
+ UnrecognizedSpec /* "unrecognized_spec" */,
+};
+
+///
+/// @brief Class describing which attributes to request in @ref NYT::IClient::GetOperation or @ref NYT::IClient::ListOperations.
+struct TOperationAttributeFilter
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TOperationAttributeFilter;
+ /// @endcond
+
+ TVector<EOperationAttribute> Attributes_;
+
+ ///
+ /// @brief Append `attribute` to the filter and return `*this`, so that calls can be chained.
+ TSelf& Add(EOperationAttribute attribute)
+ {
+ Attributes_.push_back(attribute);
+ return *this;
+ }
+};
+
+///
+/// @brief Options for @ref NYT::IClient::GetOperation call.
+struct TGetOperationOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TGetOperationOptions;
+ /// @endcond
+
+ ///
+ /// @brief Which attributes to request (if omitted, the default set of attributes will be requested).
+ FLUENT_FIELD_OPTION(TOperationAttributeFilter, AttributeFilter);
+};
+
+///
+/// @brief "Coarse-grained" state of an operation (see @ref NYT::TOperationAttributes::BriefState).
+enum class EOperationBriefState : int
+{
+ InProgress /* "in_progress" */,
+ Completed /* "completed" */,
+ Aborted /* "aborted" */,
+
+ /// Failed
+ Failed /* "failed" */,
+};
+
+///
+/// @brief Operation type (see @ref NYT::TOperationAttributes::Type).
+enum class EOperationType : int
+{
+ Map /* "map" */,
+ Merge /* "merge" */,
+ Erase /* "erase" */,
+ Sort /* "sort" */,
+ Reduce /* "reduce" */,
+ MapReduce /* "map_reduce" */,
+ RemoteCopy /* "remote_copy" */,
+ JoinReduce /* "join_reduce" */,
+ Vanilla /* "vanilla" */,
+};
+
+///
+/// @brief Operation progress (see @ref NYT::TOperationAttributes::Progress).
+struct TOperationProgress
+{
+ ///
+ /// @brief Total job statistics.
+ TJobStatistics JobStatistics;
+
+ ///
+ /// @brief Job counters for various job states with hierarchy.
+ TJobCounters JobCounters;
+
+ ///
+ /// @brief Time when this progress was built on scheduler or CA.
+ TMaybe<TInstant> BuildTime;
+};
+
+///
+/// @brief Brief operation progress (numbers of jobs in these states); every counter is initialized to 0.
+struct TOperationBriefProgress
+{
+ ui64 Aborted = 0;
+ ui64 Completed = 0;
+ ui64 Failed = 0;
+ ui64 Lost = 0;
+ ui64 Pending = 0;
+ ui64 Running = 0;
+ ui64 Total = 0;
+};
+
+///
+/// @brief Operation result.
+struct TOperationResult
+{
+ ///
+ /// @brief For an unsuccessfully finished operation: description of the error.
+ TMaybe<TYtError> Error;
+};
+
+///
+/// @brief Operation event (change of state).
+struct TOperationEvent
+{
+ ///
+ /// @brief New state of the operation.
+ TString State;
+
+ ///
+ /// @brief Time of the state change.
+ TInstant Time;
+};
+
+///
+/// @brief Operation info.
+///
+/// A field may be `Nothing()` either if it was not requested (see @ref NYT::TGetOperationOptions::AttributeFilter)
+/// or if it is not available (e.g. `FinishTime` for a running operation).
+/// @see https://yt.yandex-team.ru/docs/api/commands#get_operation
+struct TOperationAttributes
+{
+ ///
+ /// @brief Operation id.
+ TMaybe<TOperationId> Id;
+
+ ///
+ /// @brief Operation type.
+ TMaybe<EOperationType> Type;
+
+ ///
+ /// @brief Operation state.
+ TMaybe<TString> State;
+
+ ///
+ /// @brief "Coarse-grained" operation state.
+ TMaybe<EOperationBriefState> BriefState;
+
+ ///
+ /// @brief Name of the user that started the operation.
+ TMaybe<TString> AuthenticatedUser;
+
+ ///
+ /// @brief Operation start time.
+ TMaybe<TInstant> StartTime;
+
+ ///
+ /// @brief Operation finish time (if the operation has finished).
+ TMaybe<TInstant> FinishTime;
+
+ ///
+ /// @brief Brief progress of the operation.
+ TMaybe<TOperationBriefProgress> BriefProgress;
+
+ ///
+ /// @brief Brief spec of the operation (light-weight fields only).
+ TMaybe<TNode> BriefSpec;
+
+ ///
+ /// @brief Spec of the operation as provided by the user.
+ TMaybe<TNode> Spec;
+
+ ///
+ /// @brief Full spec of the operation (all fields not specified by the user are filled with default values).
+ TMaybe<TNode> FullSpec;
+
+ ///
+ /// @brief Fields not recognized by the scheduler.
+ TMaybe<TNode> UnrecognizedSpec;
+
+ ///
+ /// @brief Whether the operation is suspended.
+ TMaybe<bool> Suspended;
+
+ ///
+ /// @brief Operation result.
+ TMaybe<TOperationResult> Result;
+
+ ///
+ /// @brief Operation progress.
+ TMaybe<TOperationProgress> Progress;
+
+ ///
+ /// @brief List of operation events (changes of state).
+ TMaybe<TVector<TOperationEvent>> Events;
+
+ ///
+ /// @brief Map from alert name to its description.
+ TMaybe<THashMap<TString, TYtError>> Alerts;
+};
+
+///
+/// @brief Direction of the cursor for pagination, see @ref NYT::TListOperationsOptions::CursorDirection.
+enum class ECursorDirection
+{
+ Past /* "past" */,
+ Future /* "future" */,
+};
+
+///
+/// @brief Options of @ref NYT::IClient::ListOperations command.
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#list_operations
+struct TListOperationsOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TListOperationsOptions;
+ /// @endcond
+
+ ///
+ /// @name Time range specification
+ ///
+ /// List operations with start time in half-closed interval
+ /// `[CursorTime, ToTime)` if `CursorDirection == Future` or
+ /// `[FromTime, CursorTime)` if `CursorDirection == Past`.
+ ///@{
+
+ ///
+ /// @brief Search for operations with start time >= `FromTime`.
+ FLUENT_FIELD_OPTION(TInstant, FromTime);
+
+ ///
+ /// @brief Search for operations with start time < `ToTime`.
+ FLUENT_FIELD_OPTION(TInstant, ToTime);
+
+ ///
+ /// @brief Additional restriction on operation start time (useful for pagination).
+ ///
+ /// Search for operations with start time >= `CursorTime` if `CursorDirection == Future`
+ /// and with start time < `CursorTime` if `CursorDirection == Past`.
+ FLUENT_FIELD_OPTION(TInstant, CursorTime);
+
+ ///
+ /// @brief Direction of pagination (see @ref NYT::TListOperationsOptions::CursorTime).
+ FLUENT_FIELD_OPTION(ECursorDirection, CursorDirection);
+
+ ///@}
+
+ ///
+ /// @name Filters
+ /// Choose operations satisfying given filters.
+ ///@{
+
+ ///
+ /// @brief Search for `Filter` as a substring in operation text factors
+ /// (e.g. title or input/output table paths).
+ FLUENT_FIELD_OPTION(TString, Filter);
+
+ ///
+ /// @brief Choose operations whose pools include `Pool`.
+ FLUENT_FIELD_OPTION(TString, Pool);
+
+ ///
+ /// @brief Choose operations with given @ref NYT::TOperationAttributes::AuthenticatedUser.
+ FLUENT_FIELD_OPTION(TString, User);
+
+ ///
+ /// @brief Choose operations with given @ref NYT::TOperationAttributes::State.
+ FLUENT_FIELD_OPTION(TString, State);
+
+ ///
+ /// @brief Choose operations with given @ref NYT::TOperationAttributes::Type.
+ FLUENT_FIELD_OPTION(EOperationType, Type);
+
+ ///
+ /// @brief Choose operations having (or not having) any failed jobs.
+ FLUENT_FIELD_OPTION(bool, WithFailedJobs);
+
+ ///@}
+
+ ///
+ /// @brief Search for operations in the archive in addition to Cypress.
+ FLUENT_FIELD_OPTION(bool, IncludeArchive);
+
+ ///
+ /// @brief Include the counters for different filter parameters in the response.
+ ///
+ /// Include the number of operations for each pool, user, state, type
+ /// and the number of operations having failed jobs.
+ FLUENT_FIELD_OPTION(bool, IncludeCounters);
+
+ ///
+ /// @brief Return no more than `Limit` operations (current default and maximum value is 1000).
+ FLUENT_FIELD_OPTION(i64, Limit);
+};
+
+///
+/// @brief Response for @ref NYT::IClient::ListOperations command.
+struct TListOperationsResult
+{
+ ///
+ /// @brief Found operations' attributes.
+ TVector<TOperationAttributes> Operations;
+
+ ///
+ /// @name Counters for different filters.
+ ///
+ /// If counters were requested (@ref NYT::TListOperationsOptions::IncludeCounters is `true`)
+ /// the maps contain the number of operations found for each pool, user, state and type.
+ /// NOTE:
+ /// 1) Counters ignore CursorTime and CursorDirection,
+ /// they always are collected in the whole [FromTime, ToTime) interval.
+ /// 2) Each next counter in the sequence [pool, user, state, type, with_failed_jobs]
+ /// takes into account all the previous filters (i.e. if you set User filter to "some-user"
+ /// type counts describe only operations with user "some-user").
+ /// @{
+
+ ///
+ /// @brief Number of operations for each pool.
+ TMaybe<THashMap<TString, i64>> PoolCounts;
+
+ ///
+ /// @brief Number of operations for each user (subject to previous filters).
+ TMaybe<THashMap<TString, i64>> UserCounts;
+
+ ///
+ /// @brief Number of operations for each state (subject to previous filters).
+ TMaybe<THashMap<TString, i64>> StateCounts;
+
+ ///
+ /// @brief Number of operations for each type (subject to previous filters).
+ TMaybe<THashMap<EOperationType, i64>> TypeCounts;
+
+ ///
+ /// @brief Number of operations having failed jobs (subject to all previous filters).
+ TMaybe<i64> WithFailedJobsCount;
+
+ /// @}
+
+ ///
+ /// @brief Whether some operations were not returned due to @ref NYT::TListOperationsOptions::Limit.
+ ///
+ /// `Incomplete == true` means that not all operations satisfying filters
+ /// were returned (limit exceeded) and you need to repeat the request with new @ref NYT::TListOperationsOptions::CursorTime
+ /// (e.g. `CursorTime == *Operations.back().StartTime`, but don't forget to
+ /// remove the duplicates).
+ bool Incomplete = false; // Explicit default: a default-constructed result must not carry an indeterminate flag.
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Data source for @ref NYT::IClient::ListJobs command (see @ref NYT::TListJobsOptions::DataSource for the meaning of each value).
+enum class EListJobsDataSource : int
+{
+ Runtime /* "runtime" */,
+ Archive /* "archive" */,
+ Auto /* "auto" */,
+ Manual /* "manual" */,
+};
+
+///
+/// @brief Job type (used by @ref NYT::TListJobsOptions::Type and @ref NYT::TJobAttributes::Type).
+enum class EJobType : int
+{
+ SchedulerFirst /* "scheduler_first" */,
+ Map /* "map" */,
+ PartitionMap /* "partition_map" */,
+ SortedMerge /* "sorted_merge" */,
+ OrderedMerge /* "ordered_merge" */,
+ UnorderedMerge /* "unordered_merge" */,
+ Partition /* "partition" */,
+ SimpleSort /* "simple_sort" */,
+ FinalSort /* "final_sort" */,
+ SortedReduce /* "sorted_reduce" */,
+ PartitionReduce /* "partition_reduce" */,
+ ReduceCombiner /* "reduce_combiner" */,
+ RemoteCopy /* "remote_copy" */,
+ IntermediateSort /* "intermediate_sort" */,
+ OrderedMap /* "ordered_map" */,
+ JoinReduce /* "join_reduce" */,
+ Vanilla /* "vanilla" */,
+ SchedulerUnknown /* "scheduler_unknown" */,
+ SchedulerLast /* "scheduler_last" */,
+ ReplicatorFirst /* "replicator_first" */,
+ ReplicateChunk /* "replicate_chunk" */,
+ RemoveChunk /* "remove_chunk" */,
+ RepairChunk /* "repair_chunk" */,
+ SealChunk /* "seal_chunk" */,
+ ReplicatorLast /* "replicator_last" */,
+};
+
+///
+/// @brief Well-known task names (implicitly convertible to @ref NYT::TTaskName).
+enum class ETaskName : int
+{
+ Map /* "map" */,
+ PartitionMap0 /* "partition_map(0)" */,
+ SortedMerge /* "sorted_merge" */,
+ OrderedMerge /* "ordered_merge" */,
+ UnorderedMerge /* "unordered_merge" */,
+ Partition0 /* "partition(0)" */,
+ Partition1 /* "partition(1)" */,
+ Partition2 /* "partition(2)" */,
+ SimpleSort /* "simple_sort" */,
+ FinalSort /* "final_sort" */,
+ SortedReduce /* "sorted_reduce" */,
+ PartitionReduce /* "partition_reduce" */,
+ ReduceCombiner /* "reduce_combiner" */,
+ RemoteCopy /* "remote_copy" */,
+ IntermediateSort /* "intermediate_sort" */,
+ OrderedMap /* "ordered_map" */,
+ JoinReduce /* "join_reduce" */,
+};
+
+///
+/// @brief Task name (can be either a well-known name or just a custom string).
+class TTaskName
+{
+public:
+
+ // Constructors are implicit by design.
+
+ ///
+ /// @brief Construct a custom task name.
+ TTaskName(TString taskName);
+
+ ///
+ /// @brief Construct a custom task name.
+ TTaskName(const char* taskName);
+
+ ///
+ /// @brief Construct a well-known task name.
+ TTaskName(ETaskName taskName);
+
+ const TString& Get() const; ///< Task name as a string.
+
+private:
+ TString TaskName_;
+};
+
+///
+/// @brief Job state (used by @ref NYT::TListJobsOptions::State and @ref NYT::TJobAttributes::State).
+enum class EJobState : int
+{
+ None /* "none" */,
+ Waiting /* "waiting" */,
+ Running /* "running" */,
+ Aborting /* "aborting" */,
+ Completed /* "completed" */,
+ Failed /* "failed" */,
+ Aborted /* "aborted" */,
+ Lost /* "lost" */,
+};
+
+///
+/// @brief Job sort field.
+///
+/// @see @ref NYT::TListJobsOptions::SortField.
+enum class EJobSortField : int
+{
+ Type /* "type" */,
+ State /* "state" */,
+ StartTime /* "start_time" */,
+ FinishTime /* "finish_time" */,
+ Address /* "address" */,
+ Duration /* "duration" */,
+ Progress /* "progress" */,
+ Id /* "id" */,
+};
+
+///
+/// @brief Job sort direction.
+///
+/// @see @ref NYT::TListJobsOptions (NOTE(review): its `SortOrder` field is declared as `ESortOrder`, not this enum; confirm where this enum is used).
+enum class EJobSortDirection : int
+{
+ Ascending /* "ascending" */,
+ Descending /* "descending" */,
+};
+
+///
+/// @brief Options for @ref NYT::IClient::ListJobs.
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands.html#list_jobs
+struct TListJobsOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TListJobsOptions;
+ /// @endcond
+
+ ///
+ /// @name Filters
+ /// Return only jobs with given value of parameter (type, state, address and existence of stderr, spec or fail context).
+ /// If a field is `Nothing()`, return jobs with all possible values of the corresponding parameter.
+ /// @{
+
+ ///
+ /// @brief Job type.
+ FLUENT_FIELD_OPTION(EJobType, Type);
+
+ ///
+ /// @brief Job state.
+ FLUENT_FIELD_OPTION(EJobState, State);
+
+ ///
+ /// @brief Address of the cluster node where job was running.
+ FLUENT_FIELD_OPTION(TString, Address);
+
+ ///
+ /// @brief Return only jobs whose stderr has been saved.
+ FLUENT_FIELD_OPTION(bool, WithStderr);
+
+ ///
+ /// @brief Return only jobs whose spec has been saved.
+ FLUENT_FIELD_OPTION(bool, WithSpec);
+
+ ///
+ /// @brief Return only jobs whose fail context has been saved.
+ FLUENT_FIELD_OPTION(bool, WithFailContext);
+
+ /// @}
+
+ ///
+ /// @name Sort options
+ /// @{
+
+ ///
+ /// @brief Sort by this field.
+ FLUENT_FIELD_OPTION(EJobSortField, SortField);
+
+ ///
+ /// @brief Sort order.
+ FLUENT_FIELD_OPTION(ESortOrder, SortOrder);
+
+ /// @}
+
+ ///
+ /// @brief Data source.
+ ///
+ /// Where to search for jobs: in scheduler and Cypress ('Runtime'), in archive ('Archive'),
+ /// automatically based on operation presence in Cypress ('Auto') or choose manually ('Manual').
+ FLUENT_FIELD_OPTION(EListJobsDataSource, DataSource);
+
+ /// @deprecated
+ FLUENT_FIELD_OPTION(bool, IncludeCypress);
+
+ /// @deprecated
+ FLUENT_FIELD_OPTION(bool, IncludeControllerAgent);
+
+ /// @deprecated
+ FLUENT_FIELD_OPTION(bool, IncludeArchive);
+
+ ///
+ /// @brief Maximum number of jobs to return.
+ FLUENT_FIELD_OPTION(i64, Limit);
+
+ ///
+ /// @brief Number of jobs (in specified sort order) to skip.
+ ///
+ /// Together with @ref NYT::TListJobsOptions::Limit may be used for pagination.
+ FLUENT_FIELD_OPTION(i64, Offset);
+};
+
+///
+/// @brief Description of a core dump that happened in the job.
+struct TCoreInfo
+{
+ i64 ProcessId = 0; ///< Process id (presumably of the process that dumped core); defaults to 0 so the field is never left uninitialized.
+ TString ExecutableName; ///< Executable name (presumably of that process).
+ TMaybe<ui64> Size; ///< Size, if known (presumably of the core dump; units are not stated here).
+ TMaybe<TYtError> Error; ///< Error, if any (presumably one associated with the core dump).
+};
+
+///
+/// @brief Job attributes.
+///
+/// A field may be `Nothing()` if it is not available (e.g. `FinishTime` for a running job).
+///
+/// @see https://yt.yandex-team.ru/docs/api/commands#get_job
+struct TJobAttributes
+{
+ ///
+ /// @brief Job id.
+ TMaybe<TJobId> Id;
+
+ ///
+ /// @brief Job type.
+ TMaybe<EJobType> Type;
+
+ ///
+ /// @brief Job state.
+ TMaybe<EJobState> State;
+
+ ///
+ /// @brief Address of a cluster node where job was running.
+ TMaybe<TString> Address;
+
+ ///
+ /// @brief The name of the task that job corresponds to.
+ TMaybe<TString> TaskName;
+
+ ///
+ /// @brief Job start time.
+ TMaybe<TInstant> StartTime;
+
+ ///
+ /// @brief Job finish time (for a finished job).
+ TMaybe<TInstant> FinishTime;
+
+ ///
+ /// @brief Estimated ratio of job's completed work.
+ TMaybe<double> Progress;
+
+ ///
+ /// @brief Size of saved job stderr.
+ TMaybe<i64> StderrSize;
+
+ ///
+ /// @brief Error for an unsuccessfully finished job.
+ TMaybe<TYtError> Error;
+
+ ///
+ /// @brief Job brief statistics.
+ TMaybe<TNode> BriefStatistics;
+
+ ///
+ /// @brief Job input paths (with ranges).
+ TMaybe<TVector<TRichYPath>> InputPaths;
+
+ ///
+ /// @brief Infos for core dumps produced by job.
+ TMaybe<TVector<TCoreInfo>> CoreInfos;
+};
+
+///
+/// @brief Response for @ref NYT::IOperation::ListJobs.
+struct TListJobsResult
+{
+ ///
+ /// @brief Jobs.
+ TVector<TJobAttributes> Jobs;
+
+ ///
+ /// @deprecated (Presumably the number of jobs retrieved from Cypress, by analogy with the counters below.)
+ TMaybe<i64> CypressJobCount;
+
+ ///
+ /// @brief Number of jobs retrieved from controller agent.
+ TMaybe<i64> ControllerAgentJobCount;
+
+ ///
+ /// @brief Number of jobs retrieved from archive.
+ TMaybe<i64> ArchiveJobCount;
+};
+
+////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Options for @ref NYT::IClient::GetJob and @ref NYT::IOperation::GetJob (no options are defined at the moment).
+struct TGetJobOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TGetJobOptions;
+ /// @endcond
+};
+
+///
+/// @brief Options for @ref NYT::IClient::GetJobInput (no options are defined at the moment).
+struct TGetJobInputOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TGetJobInputOptions;
+ /// @endcond
+};
+
+///
+/// @brief Options for @ref NYT::IClient::GetJobFailContext (no options are defined at the moment).
+struct TGetJobFailContextOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TGetJobFailContextOptions;
+ /// @endcond
+};
+
+///
+/// @brief Options for @ref NYT::IClient::GetJobStderr (no options are defined at the moment).
+struct TGetJobStderrOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TGetJobStderrOptions;
+ /// @endcond
+};
+
+////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Options for @ref NYT::IOperation::GetFailedJobInfo.
+struct TGetFailedJobInfoOptions
+{
+ /// @cond Doxygen_Suppress
+ using TSelf = TGetFailedJobInfoOptions;
+ /// @endcond
+
+ ///
+ /// @brief How many jobs to download (default: 10). Which jobs will be chosen is undefined.
+ FLUENT_FIELD_DEFAULT(ui64, MaxJobCount, 10);
+
+ ///
+ /// @brief How much of stderr tail should be downloaded (default: 64 * 1024, presumably bytes).
+ FLUENT_FIELD_DEFAULT(ui64, StderrTailSize, 64 * 1024);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Interface representing an operation.
+struct IOperation
+ : public TThrRefBase
+{
+ virtual ~IOperation() = default;
+
+ ///
+ /// @brief Get operation id.
+ virtual const TOperationId& GetId() const = 0;
+
+ ///
+ /// @brief Get URL of the operation in YT Web UI.
+ virtual TString GetWebInterfaceUrl() const = 0;
+
+ ///
+ /// @brief Get last error for not started operations. Get state on YT cluster for started operations.
+ ///
+ /// For not started operations last error is an error that's being retried during operation
+ /// preparation/start (e.g. lock files, start operation request).
+ virtual TString GetStatus() const = 0;
+
+ ///
+ /// @brief Get preparation future.
+ ///
+ /// @return future that is set when operation is prepared.
+ virtual ::NThreading::TFuture<void> GetPreparedFuture() = 0;
+
+ ///
+ /// @brief Start operation synchronously.
+ ///
+ /// @note: Do NOT call this method twice.
+ ///
+ /// If operation is not prepared yet, Start() will block waiting for preparation finish.
+ /// Be ready to catch exception if operation preparation or start failed.
+ virtual void Start() = 0;
+
+ ///
+ /// @brief Is the operation started.
+ ///
+ /// Returns true if the operation is started on the cluster.
+ virtual bool IsStarted() const = 0;
+
+ ///
+ /// @brief Get start future.
+ ///
+ /// @return future that is set when operation is started.
+ virtual ::NThreading::TFuture<void> GetStartedFuture() = 0;
+
+ ///
+ /// @brief Start watching operation.
+ ///
+ /// @return future that is set when operation is complete.
+ ///
+ /// @note: the user should check value of returned future to ensure that operation completed successfully e.g.
+ /// @code{.cpp}
+ /// auto operationComplete = operation->Watch();
+ /// operationComplete.Wait();
+ /// operationComplete.GetValue(); /// will throw if operation completed with errors
+ /// @endcode
+ ///
+ /// If operation is completed successfully the returned future contains void value.
+ /// If operation is completed with error future contains @ref NYT::TOperationFailedError.
+ /// In rare cases when error occurred while waiting (e.g. YT becomes unavailable) future might contain other exception.
+ virtual ::NThreading::TFuture<void> Watch() = 0;
+
+ ///
+ /// @brief Get information about failed jobs.
+ ///
+ /// Can be called for operation in any stage.
+ /// Though user should keep in mind that this method always fetches info from Cypress
+ /// and doesn't work when operation is archived. Successfully completed operations can be archived
+ /// quite quickly (in about 30 seconds).
+ virtual TVector<TFailedJobInfo> GetFailedJobInfo(const TGetFailedJobInfoOptions& options = TGetFailedJobInfoOptions()) = 0;
+
+ ///
+ /// @brief Get operation brief state.
+ virtual EOperationBriefState GetBriefState() = 0;
+
+ ///
+ /// @brief Get error (if operation has failed).
+ ///
+ /// @return `Nothing()` if operation is in 'Completed' or 'InProgress' state (or reason for failed / aborted operation).
+ virtual TMaybe<TYtError> GetError() = 0;
+
+ ///
+ /// @brief Get job statistics.
+ virtual TJobStatistics GetJobStatistics() = 0;
+
+ ///
+ /// @brief Get operation brief progress.
+ ///
+ /// @return `Nothing()` if operation has no running jobs yet, e.g. when it is in "materializing" or "pending" state.
+ virtual TMaybe<TOperationBriefProgress> GetBriefProgress() = 0;
+
+ ///
+ /// @brief Abort operation.
+ ///
+ /// Operation will be finished immediately.
+ /// All results of completed/running jobs will be lost.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/api/commands#abort_op
+ virtual void AbortOperation() = 0;
+
+ ///
+ /// @brief Complete operation.
+ ///
+ /// Operation will be finished immediately.
+ /// All results of completed jobs will appear in output tables.
+ /// All results of running (not completed) jobs will be lost.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/api/commands#complete_op
+ virtual void CompleteOperation() = 0;
+
+ ///
+ /// @brief Suspend operation.
+ ///
+ /// Jobs will not be aborted by default, cf. @ref NYT::TSuspendOperationOptions.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/api/commands#suspend_op
+ virtual void SuspendOperation(
+ const TSuspendOperationOptions& options = TSuspendOperationOptions()) = 0;
+
+ ///
+ /// @brief Resume previously suspended operation.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/api/commands#resume_op
+ virtual void ResumeOperation(
+ const TResumeOperationOptions& options = TResumeOperationOptions()) = 0;
+
+ ///
+ /// @brief Get operation attributes.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/api/commands#get_operation
+ virtual TOperationAttributes GetAttributes(
+ const TGetOperationOptions& options = TGetOperationOptions()) = 0;
+
+ ///
+ /// @brief Update operation runtime parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/api/commands#update_op_parameters
+ virtual void UpdateParameters(
+ const TUpdateOperationParametersOptions& options = TUpdateOperationParametersOptions()) = 0;
+
+ ///
+ /// @brief Get job attributes.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/api/commands#get_job
+ virtual TJobAttributes GetJob(
+ const TJobId& jobId,
+ const TGetJobOptions& options = TGetJobOptions()) = 0;
+
+ ///
+ /// @brief List jobs satisfying given filters (see @ref NYT::TListJobsOptions).
+ ///
+ /// @see https://yt.yandex-team.ru/docs/api/commands#list_jobs
+ virtual TListJobsResult ListJobs(
+ const TListJobsOptions& options = TListJobsOptions()) = 0;
+};
+
+///
+/// @brief Interface of client capable of managing operations.
+struct IOperationClient
+{ // NOTE(review): no virtual destructor is declared here; confirm instances are never destroyed through an IOperationClient pointer.
+ ///
+ /// @brief Run Map operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param mapper Instance of a job to run.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/map
+ IOperationPtr Map(
+ const TMapOperationSpec& spec,
+ ::TIntrusivePtr<IMapperBase> mapper,
+ const TOperationOptions& options = TOperationOptions());
+
+ ///
+ /// @brief Run Map operation.
+ ///
+ /// @param mapper Instance of a job to run.
+ /// @param input Input table(s).
+ /// @param output Output table(s).
+ /// @param spec Operation spec.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/map
+ IOperationPtr Map(
+ ::TIntrusivePtr<IMapperBase> mapper,
+ const TOneOrMany<TStructuredTablePath>& input,
+ const TOneOrMany<TStructuredTablePath>& output,
+ const TMapOperationSpec& spec = TMapOperationSpec(),
+ const TOperationOptions& options = TOperationOptions());
+
+ ///
+ /// @brief Run raw Map operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param rawJob Instance of a raw mapper to run.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/map
+ virtual IOperationPtr RawMap(
+ const TRawMapOperationSpec& spec,
+ ::TIntrusivePtr<IRawJob> rawJob,
+ const TOperationOptions& options = TOperationOptions()) = 0;
+
+ ///
+ /// @brief Run Reduce operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param reducer Instance of a job to run.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/reduce
+ IOperationPtr Reduce(
+ const TReduceOperationSpec& spec,
+ ::TIntrusivePtr<IReducerBase> reducer,
+ const TOperationOptions& options = TOperationOptions());
+
+ ///
+ /// @brief Run Reduce operation.
+ ///
+ /// @param reducer Instance of a job to run.
+ /// @param input Input table(s).
+ /// @param output Output table(s).
+ /// @param reduceBy Columns to group rows by.
+ /// @param spec Operation spec.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/reduce
+ IOperationPtr Reduce(
+ ::TIntrusivePtr<IReducerBase> reducer,
+ const TOneOrMany<TStructuredTablePath>& input,
+ const TOneOrMany<TStructuredTablePath>& output,
+ const TSortColumns& reduceBy,
+ const TReduceOperationSpec& spec = TReduceOperationSpec(),
+ const TOperationOptions& options = TOperationOptions());
+
+ ///
+ /// @brief Run raw Reduce operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param rawJob Instance of a raw reducer to run.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/reduce
+ virtual IOperationPtr RawReduce(
+ const TRawReduceOperationSpec& spec,
+ ::TIntrusivePtr<IRawJob> rawJob,
+ const TOperationOptions& options = TOperationOptions()) = 0;
+
+ ///
+ /// @brief Run JoinReduce operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param reducer Instance of a job to run.
+ /// @param options Optional parameters.
+ ///
+ /// @deprecated Use @ref NYT::IOperationClient::Reduce with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`.
+ IOperationPtr JoinReduce(
+ const TJoinReduceOperationSpec& spec,
+ ::TIntrusivePtr<IReducerBase> reducer,
+ const TOperationOptions& options = TOperationOptions());
+
+ ///
+ /// @brief Run raw JoinReduce operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param rawJob Instance of a raw reducer to run.
+ /// @param options Optional parameters.
+ ///
+ /// @deprecated Use @ref NYT::IOperationClient::RawReduce with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`.
+ virtual IOperationPtr RawJoinReduce(
+ const TRawJoinReduceOperationSpec& spec,
+ ::TIntrusivePtr<IRawJob> rawJob,
+ const TOperationOptions& options = TOperationOptions()) = 0;
+
+ ///
+ /// @brief Run MapReduce operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param mapper Instance of a map job to run (identity mapper if `nullptr`).
+ /// @param reducer Instance of a reduce job to run.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce
+ IOperationPtr MapReduce(
+ const TMapReduceOperationSpec& spec,
+ ::TIntrusivePtr<IMapperBase> mapper,
+ ::TIntrusivePtr<IReducerBase> reducer,
+ const TOperationOptions& options = TOperationOptions());
+
+ ///
+ /// @brief Run MapReduce operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param mapper Instance of a map job to run (identity mapper if `nullptr`).
+ /// @param reduceCombiner Instance of a reduce combiner to run (identity reduce combiner if `nullptr`).
+ /// @param reducer Instance of a reduce job to run.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce
+ IOperationPtr MapReduce(
+ const TMapReduceOperationSpec& spec,
+ ::TIntrusivePtr<IMapperBase> mapper,
+ ::TIntrusivePtr<IReducerBase> reduceCombiner,
+ ::TIntrusivePtr<IReducerBase> reducer,
+ const TOperationOptions& options = TOperationOptions());
+
+ ///
+ /// @brief Run MapReduce operation.
+ ///
+ /// @param mapper Instance of mapper to run (identity mapper if `nullptr`).
+ /// @param reducer Instance of reducer to run.
+ /// @param input Input table(s).
+ /// @param output Output table(s).
+ /// @param reduceBy Columns to group rows by.
+ /// @param spec Operation spec.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce
+ IOperationPtr MapReduce(
+ ::TIntrusivePtr<IMapperBase> mapper,
+ ::TIntrusivePtr<IReducerBase> reducer,
+ const TOneOrMany<TStructuredTablePath>& input,
+ const TOneOrMany<TStructuredTablePath>& output,
+ const TSortColumns& reduceBy,
+ TMapReduceOperationSpec spec = TMapReduceOperationSpec(),
+ const TOperationOptions& options = TOperationOptions());
+
+ ///
+ /// @brief Run MapReduce operation.
+ ///
+ /// @param mapper Instance of mapper to run (identity mapper if `nullptr`).
+ /// @param reduceCombiner Instance of reduceCombiner to run (identity reduce combiner if `nullptr`).
+ /// @param reducer Instance of reducer to run.
+ /// @param input Input table(s).
+ /// @param output Output table(s).
+ /// @param reduceBy Columns to group rows by.
+ /// @param spec Operation spec.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce
+ IOperationPtr MapReduce(
+ ::TIntrusivePtr<IMapperBase> mapper,
+ ::TIntrusivePtr<IReducerBase> reduceCombiner,
+ ::TIntrusivePtr<IReducerBase> reducer,
+ const TOneOrMany<TStructuredTablePath>& input,
+ const TOneOrMany<TStructuredTablePath>& output,
+ const TSortColumns& reduceBy,
+ TMapReduceOperationSpec spec = TMapReduceOperationSpec(),
+ const TOperationOptions& options = TOperationOptions());
+
+ ///
+ /// @brief Run raw MapReduce operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param mapper Instance of a raw mapper to run (identity mapper if `nullptr`).
+ /// @param reduceCombiner Instance of a raw reduce combiner to run (identity reduce combiner if `nullptr`).
+ /// @param reducer Instance of a raw reducer to run.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce
+ virtual IOperationPtr RawMapReduce(
+ const TRawMapReduceOperationSpec& spec,
+ ::TIntrusivePtr<IRawJob> mapper,
+ ::TIntrusivePtr<IRawJob> reduceCombiner,
+ ::TIntrusivePtr<IRawJob> reducer,
+ const TOperationOptions& options = TOperationOptions()) = 0;
+
+ ///
+ /// @brief Run Sort operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/sort
+ virtual IOperationPtr Sort(
+ const TSortOperationSpec& spec,
+ const TOperationOptions& options = TOperationOptions()) = 0;
+
+ ///
+ /// @brief Run Sort operation.
+ ///
+ /// @param input Input table(s).
+ /// @param output Output table.
+ /// @param sortBy Columns to sort input rows by.
+ /// @param spec Operation spec.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/sort
+ IOperationPtr Sort(
+ const TOneOrMany<TRichYPath>& input,
+ const TRichYPath& output,
+ const TSortColumns& sortBy,
+ const TSortOperationSpec& spec = TSortOperationSpec(),
+ const TOperationOptions& options = TOperationOptions());
+
+ ///
+ /// @brief Run Merge operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/merge
+ virtual IOperationPtr Merge(
+ const TMergeOperationSpec& spec,
+ const TOperationOptions& options = TOperationOptions()) = 0;
+
+ ///
+ /// @brief Run Erase operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/erase
+ virtual IOperationPtr Erase(
+ const TEraseOperationSpec& spec,
+ const TOperationOptions& options = TOperationOptions()) = 0;
+
+ ///
+ /// @brief Run RemoteCopy operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/remote_copy
+ virtual IOperationPtr RemoteCopy(
+ const TRemoteCopyOperationSpec& spec,
+ const TOperationOptions& options = TOperationOptions()) = 0;
+
+ ///
+ /// @brief Run Vanilla operation.
+ ///
+ /// @param spec Operation spec.
+ /// @param options Optional parameters.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/description/mr/vanilla
+ virtual IOperationPtr RunVanilla(
+ const TVanillaOperationSpec& spec,
+ const TOperationOptions& options = TOperationOptions()) = 0;
+
+ ///
+ /// @brief Abort operation.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/api/commands#abort_op
+ virtual void AbortOperation(
+ const TOperationId& operationId) = 0;
+
+ ///
+ /// @brief Complete operation.
+ ///
+ /// @see https://yt.yandex-team.ru/docs/api/commands#complete_op
+ virtual void CompleteOperation(
+ const TOperationId& operationId) = 0;
+
+ ///
+ /// @brief Wait for operation to finish.
+ virtual void WaitForOperation(
+ const TOperationId& operationId) = 0;
+
+ ///
+ /// @brief Check and return operation status.
+ ///
+ /// @note this function will never return @ref NYT::EOperationBriefState::Failed or @ref NYT::EOperationBriefState::Aborted status,
+ /// it will throw @ref NYT::TOperationFailedError instead.
+ virtual EOperationBriefState CheckOperation(
+ const TOperationId& operationId) = 0;
+
+ ///
+ /// @brief Create an operation object given operation id.
+ ///
+ /// @throw @ref NYT::TErrorResponse if the operation doesn't exist.
+ virtual IOperationPtr AttachOperation(const TOperationId& operationId) = 0;
+
+private:
+ virtual IOperationPtr DoMap(
+ const TMapOperationSpec& spec,
+ ::TIntrusivePtr<IStructuredJob> mapper,
+ const TOperationOptions& options) = 0;
+
+ virtual IOperationPtr DoReduce(
+ const TReduceOperationSpec& spec,
+ ::TIntrusivePtr<IStructuredJob> reducer,
+ const TOperationOptions& options) = 0;
+
+ virtual IOperationPtr DoJoinReduce(
+ const TJoinReduceOperationSpec& spec,
+ ::TIntrusivePtr<IStructuredJob> reducer,
+ const TOperationOptions& options) = 0;
+
+ virtual IOperationPtr DoMapReduce(
+ const TMapReduceOperationSpec& spec,
+ ::TIntrusivePtr<IStructuredJob> mapper,
+ ::TIntrusivePtr<IStructuredJob> reduceCombiner,
+ ::TIntrusivePtr<IStructuredJob> reducer,
+ const TOperationOptions& options) = 0;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
+
+#define OPERATION_INL_H_
+#include "operation-inl.h"
+#undef OPERATION_INL_H_
diff --git a/yt/cpp/mapreduce/interface/operation_ut.cpp b/yt/cpp/mapreduce/interface/operation_ut.cpp
new file mode 100644
index 0000000000..0fa62e1568
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/operation_ut.cpp
@@ -0,0 +1,269 @@
+#include <yt/cpp/mapreduce/interface/common_ut.h>
+#include <yt/cpp/mapreduce/interface/job_statistics.h>
+#include <yt/cpp/mapreduce/interface/operation.h>
+#include <yt/cpp/mapreduce/interface/protobuf_table_schema_ut.pb.h>
+
+#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h>
+
+#include <library/cpp/yson/node/node_io.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NYT;
+using namespace NYT::NUnitTesting;
+
+class TDummyInferenceContext // Test stub of IOperationPreparationContext: fixed table counts, default-constructed input schemas, no paths.
+ : public IOperationPreparationContext
+{
+public:
+ TDummyInferenceContext(int inputCount, int outputCount) // Stores both counts and creates one schema slot per input table.
+ : InputCount_(inputCount)
+ , OutputCount_(outputCount)
+ , InputSchemas_(inputCount)
+ { }
+
+ int GetInputCount() const override // Returns the input count given at construction.
+ {
+ return InputCount_;
+ }
+
+ int GetOutputCount() const override // Returns the output count given at construction.
+ {
+ return OutputCount_;
+ }
+
+ const TVector<TTableSchema>& GetInputSchemas() const override // Returns all input schemas; nothing in this class ever modifies them.
+ {
+ return InputSchemas_;
+ }
+
+ const TTableSchema& GetInputSchema(int index) const override // Returns the schema of one input.
+ {
+ return InputSchemas_[index]; // index is used as-is; no range check is done here
+ }
+
+ TMaybe<TYPath> GetInputPath(int) const override // The stub has no table paths: always Nothing().
+ {
+ return Nothing();
+ }
+
+ TMaybe<TYPath> GetOutputPath(int) const override // The stub has no table paths: always Nothing().
+ {
+ return Nothing();
+ }
+
+private:
+ int InputCount_;
+ int OutputCount_;
+ TVector<TTableSchema> InputSchemas_;
+};
+
+Y_UNIT_TEST_SUITE(PrepareOperation)
+{
+
+ Y_UNIT_TEST(BasicSchemas) // Assigns schemas to 7 outputs one by one and by group, checks that re-assignment is rejected, then verifies the final mapping.
+ {
+ auto firstSchema = TTableSchema()
+ .AddColumn(TColumnSchema().Name("some_column").Type(EValueType::VT_UINT64));
+ auto otherSchema = TTableSchema()
+ .AddColumn(TColumnSchema().Name("other_column").Type(EValueType::VT_BOOLEAN));
+ auto thirdSchema = TTableSchema()
+ .AddColumn(TColumnSchema().Name("third_column").Type(EValueType::VT_STRING));
+
+ TDummyInferenceContext context(3,7); // 3 inputs, 7 outputs (indices 0..6)
+ TJobOperationPreparer builder(context);
+
+ builder
+ .OutputSchema(1, firstSchema)
+ .BeginOutputGroup(TVector<int>{2, 5}) // explicit index list: outputs 2 and 5
+ .Schema(otherSchema)
+ .EndOutputGroup()
+ .BeginOutputGroup(3, 5) // two-int form: the assertions below show it covers outputs 3 and 4 only
+ .Schema(thirdSchema)
+ .EndOutputGroup()
+ .BeginOutputGroup(TVector<int>{0, 6})
+ .Schema(thirdSchema)
+ .EndOutputGroup();
+
+ UNIT_ASSERT_EXCEPTION(builder.OutputSchema(1, otherSchema), TApiUsageError); // output 1 already has a schema
+ UNIT_ASSERT_EXCEPTION(builder.BeginOutputGroup(3, 5).Schema(otherSchema), TApiUsageError); // outputs 3 and 4 already have a schema
+ UNIT_ASSERT_EXCEPTION(builder.BeginOutputGroup(TVector<int>{3,6,7}).Schema(otherSchema), TApiUsageError); // 3 and 6 are taken; 7 is not a valid index for 7 outputs
+
+ builder.Finish();
+ auto result = builder.GetOutputSchemas();
+
+ ASSERT_SERIALIZABLES_EQUAL(result[0], thirdSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[1], firstSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[2], otherSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[3], thirdSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[4], thirdSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[5], otherSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[6], thirdSchema);
+ }
+
+ Y_UNIT_TEST(NoSchema) // Checks that NoOutputSchema() leaves an empty schema and blocks a later OutputSchema() for the same output.
+ {
+ auto schema = TTableSchema()
+ .AddColumn(TColumnSchema().Name("some_column").Type(EValueType::VT_UINT64));
+
+ TDummyInferenceContext context(3,4); // 3 inputs, 4 outputs
+ TJobOperationPreparer builder(context);
+
+ builder
+ .OutputSchema(1, schema)
+ .NoOutputSchema(0) // output 0 is explicitly marked as having no schema
+ .BeginOutputGroup(2, 4) // covers outputs 2 and 3 (see assertions below)
+ .Schema(schema)
+ .EndOutputGroup();
+
+ UNIT_ASSERT_EXCEPTION(builder.OutputSchema(0, schema), TApiUsageError); // output 0 was already settled by NoOutputSchema
+
+ builder.Finish();
+ auto result = builder.GetOutputSchemas();
+
+ UNIT_ASSERT(result[0].Empty());
+
+ ASSERT_SERIALIZABLES_EQUAL(result[1], schema);
+ ASSERT_SERIALIZABLES_EQUAL(result[2], schema);
+ ASSERT_SERIALIZABLES_EQUAL(result[3], schema);
+ }
+
+ Y_UNIT_TEST(Descriptions) // Sets protobuf descriptions on inputs/outputs and checks the resulting output schemas and stored descriptions.
+ {
+ auto urlRowSchema = TTableSchema() // schema expected for outputs described by TUrlRow
+ .AddColumn(TColumnSchema().Name("Host").Type(NTi::Optional(NTi::String())))
+ .AddColumn(TColumnSchema().Name("Path").Type(NTi::Optional(NTi::String())))
+ .AddColumn(TColumnSchema().Name("HttpCode").Type(NTi::Optional(NTi::Int32())));
+
+ auto urlRowStruct = NTi::Struct({ // same three fields as urlRowSchema, as a struct type for nested use
+ {"Host", NTi::Optional(NTi::String())},
+ {"Path", NTi::Optional(NTi::String())},
+ {"HttpCode", NTi::Optional(NTi::Int32())},
+ });
+
+ auto rowFieldSerializationOptionSchema = TTableSchema() // schema expected for TRowFieldSerializationOption (output 1)
+ .AddColumn(TColumnSchema().Name("UrlRow_1").Type(NTi::Optional(urlRowStruct)))
+ .AddColumn(TColumnSchema().Name("UrlRow_2").Type(NTi::Optional(NTi::String())));
+
+ auto rowSerializedRepeatedFieldsSchema = TTableSchema() // schema expected for TRowSerializedRepeatedFields (outputs 4 and 6)
+ .AddColumn(TColumnSchema().Name("Ints").Type(NTi::List(NTi::Int64())))
+ .AddColumn(TColumnSchema().Name("UrlRows").Type(NTi::List(urlRowStruct)));
+
+ TDummyInferenceContext context(5,7); // 5 inputs, 7 outputs
+ TJobOperationPreparer builder(context);
+
+ builder
+ .InputDescription<TUrlRow>(0)
+ .BeginInputGroup(2, 3) // covers input 2 only: input 3 is described separately below without an error
+ .Description<TUrlRow>()
+ .EndInputGroup()
+ .BeginInputGroup(TVector<int>{1, 4})
+ .Description<TRowSerializedRepeatedFields>()
+ .EndInputGroup()
+ .InputDescription<TUrlRow>(3);
+
+ UNIT_ASSERT_EXCEPTION(builder.InputDescription<TUrlRow>(0), TApiUsageError); // input 0 is already described
+
+ builder
+ .OutputDescription<TUrlRow>(0, false) // with `false`, a later OutputSchema(0, ...) is accepted (asserted below); presumably disables schema inference -- confirm
+ .OutputDescription<TRowFieldSerializationOption>(1)
+ .BeginOutputGroup(2, 4) // covers outputs 2 and 3
+ .Description<TUrlRow>()
+ .EndOutputGroup()
+ .BeginOutputGroup(TVector<int>{4,6})
+ .Description<TRowSerializedRepeatedFields>()
+ .EndOutputGroup()
+ .OutputDescription<TUrlRow>(5, false);
+
+ UNIT_ASSERT_EXCEPTION(builder.OutputDescription<TUrlRow>(0), TApiUsageError); // output 0 is already described
+ UNIT_ASSERT_NO_EXCEPTION(builder.OutputSchema(0, urlRowSchema)); // allowed: output 0 was described with `false`
+ UNIT_ASSERT_NO_EXCEPTION(builder.OutputSchema(5, urlRowSchema)); // allowed: output 5 was described with `false`
+ UNIT_ASSERT_EXCEPTION(builder.OutputSchema(1, urlRowSchema), TApiUsageError); // rejected: output 1 was described without `false`
+
+ builder.Finish();
+ auto result = builder.GetOutputSchemas();
+
+ ASSERT_SERIALIZABLES_EQUAL(result[0], urlRowSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[1], rowFieldSerializationOptionSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[2], urlRowSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[3], urlRowSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[4], rowSerializedRepeatedFieldsSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[5], urlRowSchema);
+ ASSERT_SERIALIZABLES_EQUAL(result[6], rowSerializedRepeatedFieldsSchema);
+
+ auto expectedInputDescriptions = TVector<TMaybe<TTableStructure>>{ // one entry per input, in index order 0..4
+ {TProtobufTableStructure{TUrlRow::descriptor()}},
+ {TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}},
+ {TProtobufTableStructure{TUrlRow::descriptor()}},
+ {TProtobufTableStructure{TUrlRow::descriptor()}},
+ {TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}},
+ };
+ UNIT_ASSERT_EQUAL(expectedInputDescriptions, builder.GetInputDescriptions());
+
+ auto expectedOutputDescriptions = TVector<TMaybe<TTableStructure>>{ // one entry per output, in index order 0..6
+ {TProtobufTableStructure{TUrlRow::descriptor()}},
+ {TProtobufTableStructure{TRowFieldSerializationOption::descriptor()}},
+ {TProtobufTableStructure{TUrlRow::descriptor()}},
+ {TProtobufTableStructure{TUrlRow::descriptor()}},
+ {TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}},
+ {TProtobufTableStructure{TUrlRow::descriptor()}},
+ {TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}},
+ };
+ UNIT_ASSERT_EQUAL(expectedOutputDescriptions, builder.GetOutputDescriptions());
+ }
+
+ Y_UNIT_TEST(InputColumns) // Checks per-input column filters and renamings set individually and through an input group.
+ {
+ TDummyInferenceContext context(5, 1); // 5 inputs, 1 output
+ TJobOperationPreparer builder(context);
+ builder
+ .InputColumnFilter(2, {"a", "b"})
+ .BeginInputGroup(0, 2) // covers inputs 0 and 1 (see expected values below)
+ .ColumnFilter({"b", "c"})
+ .ColumnRenaming({{"b", "B"}, {"c", "C"}})
+ .EndInputGroup()
+ .InputColumnRenaming(3, {{"a", "AAA"}})
+ .NoOutputSchema(0); // the single output is settled before Finish()
+ builder.Finish();
+
+ auto expectedRenamings = TVector<THashMap<TString, TString>>{ // inputs without a renaming yield an empty map
+ {{"b", "B"}, {"c", "C"}},
+ {{"b", "B"}, {"c", "C"}},
+ {},
+ {{"a", "AAA"}},
+ {},
+ };
+ UNIT_ASSERT_EQUAL(builder.GetInputColumnRenamings(), expectedRenamings);
+
+ auto expectedFilters = TVector<TMaybe<TVector<TString>>>{ // inputs without a filter yield an empty TMaybe
+ {{"b", "c"}},
+ {{"b", "c"}},
+ {{"a", "b"}},
+ {},
+ {},
+ };
+ UNIT_ASSERT_EQUAL(builder.GetInputColumnFilters(), expectedFilters);
+ }
+
+ Y_UNIT_TEST(Bug_r7349102) // Regression test: describing every input and the single output and then calling Finish() must complete; nothing else is asserted.
+ {
+ auto firstSchema = TTableSchema() // NOTE(review): firstSchema, otherSchema and thirdSchema are never used in this test -- candidates for removal
+ .AddColumn(TColumnSchema().Name("some_column").Type(EValueType::VT_UINT64));
+ auto otherSchema = TTableSchema()
+ .AddColumn(TColumnSchema().Name("other_column").Type(EValueType::VT_BOOLEAN));
+ auto thirdSchema = TTableSchema()
+ .AddColumn(TColumnSchema().Name("third_column").Type(EValueType::VT_STRING));
+
+ TDummyInferenceContext context(3,1); // 3 inputs, 1 output
+ TJobOperationPreparer builder(context);
+
+ builder
+ .InputDescription<TUrlRow>(0)
+ .InputDescription<TUrlRow>(1)
+ .InputDescription<TUrlRow>(2)
+ .OutputDescription<TUrlRow>(0);
+
+ builder.Finish(); // the test passes if this call completes
+ }
+
+} // Y_UNIT_TEST_SUITE(PrepareOperation)
diff --git a/yt/cpp/mapreduce/interface/proto3_ut.proto b/yt/cpp/mapreduce/interface/proto3_ut.proto
new file mode 100644
index 0000000000..b24c13085b
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/proto3_ut.proto
@@ -0,0 +1,17 @@
+syntax = "proto3";
+
+import "yt/yt_proto/yt/formats/extension.proto";
+
+package NYT.NTestingProto3;
+
+option (NYT.file_default_field_flags) = SERIALIZATION_YT;
+
+message TWithOptional // proto3 test message with a single explicitly `optional` scalar field.
+{
+ optional int64 x = 1;
+}
+
+message TWithOptionalMessage // proto3 test message with a single explicitly `optional` message-typed field.
+{
+ optional TWithOptional x = 1;
+}
diff --git a/yt/cpp/mapreduce/interface/protobuf_file_options_ut.cpp b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.cpp
new file mode 100644
index 0000000000..5ffa9564d7
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.cpp
@@ -0,0 +1,271 @@
+#include "errors.h"
+#include "format.h"
+#include "common_ut.h"
+
+#include <yt/cpp/mapreduce/interface/protobuf_file_options_ut.pb.h>
+
+#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NYT;
+
+Y_UNIT_TEST_SUITE(ProtobufFileOptions)
+{
+ NTi::TTypePtr GetUrlRowType(bool required) // Returns the Host/Path/HttpCode struct type, wrapped in Optional unless `required` is true.
+ {
+ static const NTi::TTypePtr structType = NTi::Struct({ // built once and shared by all calls
+ {"Host", ToTypeV3(EValueType::VT_STRING, false)}, // second argument `false`: presumably "not required", i.e. optional -- confirm
+ {"Path", ToTypeV3(EValueType::VT_STRING, false)},
+ {"HttpCode", ToTypeV3(EValueType::VT_INT32, false)}});
+ return required ? structType : NTi::TTypePtr(NTi::Optional(structType));
+ }
+
+ Y_UNIT_TEST(TRowFieldSerializationOption) // A field flagged SERIALIZATION_PROTOBUF becomes a string column; the other field takes the file-level SERIALIZATION_YT default and becomes a struct.
+ {
+ const auto schema = CreateTableSchema<NTestingFileOptions::TRowFieldSerializationOption>();
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("UrlRow_1").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema().Name("UrlRow_2").Type(GetUrlRowType(false))));
+ }
+
+ Y_UNIT_TEST(TRowMixedSerializationOptions) // The message-level default SERIALIZATION_PROTOBUF applies to UrlRow_1; the field-level SERIALIZATION_YT flag wins for UrlRow_2.
+ {
+ const auto schema = CreateTableSchema<NTestingFileOptions::TRowMixedSerializationOptions>();
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("UrlRow_1").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema().Name("UrlRow_2").Type(GetUrlRowType(false))));
+ }
+
+ Y_UNIT_TEST(FieldSortOrder) // Checks struct member order under the file-default, explicit as-in-proto-file and by-field-number message flags.
+ {
+ const auto schema = CreateTableSchema<NTestingFileOptions::TFieldSortOrder>();
+
+ auto asInProtoFile = NTi::Optional(NTi::Struct({ // declaration order in the .proto: x, y, z
+ {"x", NTi::Optional(NTi::Int64())},
+ {"y", NTi::Optional(NTi::String())},
+ {"z", NTi::Optional(NTi::Bool())},
+ }));
+ auto byFieldNumber = NTi::Optional(NTi::Struct({ // field-number order: z = 1, x = 2, y = 12
+ {"z", NTi::Optional(NTi::Bool())},
+ {"x", NTi::Optional(NTi::Int64())},
+ {"y", NTi::Optional(NTi::String())},
+ }));
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("EmbeddedDefault").Type(asInProtoFile)) // file default is DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE
+ .AddColumn(TColumnSchema().Name("EmbeddedAsInProtoFile").Type(asInProtoFile))
+ .AddColumn(TColumnSchema().Name("EmbeddedByFieldNumber").Type(byFieldNumber)));
+ }
+
+ Y_UNIT_TEST(Map) // Checks schema types for a map field under the file default (list of key/value structs) and under MAP_AS_DICT.
+ {
+ const auto schema = CreateTableSchema<NTestingFileOptions::TWithMap>();
+
+ auto createKeyValueStruct = [] (NTi::TTypePtr key, NTi::TTypePtr value) { // builds List<Struct<key: Optional<K>, value: Optional<V>>>
+ return NTi::List(NTi::Struct({
+ {"key", NTi::Optional(key)},
+ {"value", NTi::Optional(value)},
+ }));
+ };
+
+ auto embedded = NTi::Struct({ // type expected for TWithMap.TEmbedded
+ {"x", NTi::Optional(NTi::Int64())},
+ {"y", NTi::Optional(NTi::String())},
+ });
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema()
+ .Name("MapDefault") // no field flag: file default MAP_AS_LIST_OF_STRUCTS applies
+ .Type(createKeyValueStruct(NTi::Int64(), embedded)))
+ .AddColumn(TColumnSchema()
+ .Name("MapDict") // field flag MAP_AS_DICT
+ .Type(NTi::Dict(NTi::Int64(), embedded))));
+ }
+
+ Y_UNIT_TEST(Oneof) // Checks schema types for oneof groups in VARIANT and SEPARATE_FIELDS modes, under protobuf serialization, and at top level.
+ {
+ const auto schema = CreateTableSchema<NTestingFileOptions::TWithOneof>();
+
+ auto embedded = NTi::Struct({ // TWithOneof.TEmbedded: its oneof members appear as separate optional fields
+ {"x", NTi::Optional(NTi::Int64())},
+ {"y", NTi::Optional(NTi::String())},
+ });
+
+ auto defaultVariantType = NTi::Optional(NTi::Struct({ // TDefaultVariant: Oneof2 becomes one variant member, Oneof1 (SEPARATE_FIELDS) stays flat
+ {"field", NTi::Optional(NTi::String())},
+ {"Oneof2", NTi::Optional(NTi::Variant(NTi::Struct({
+ {"y2", NTi::String()},
+ {"z2", embedded},
+ {"x2", NTi::Int64()},
+ })))},
+ {"x1", NTi::Optional(NTi::Int64())},
+ {"y1", NTi::Optional(NTi::String())},
+ {"z1", NTi::Optional(embedded)},
+ }));
+
+ auto noDefaultType = NTi::Optional(NTi::Struct({ // TNoDefault: the file default SEPARATE_FIELDS flattens both oneof groups
+ {"field", NTi::Optional(NTi::String())},
+ {"y2", NTi::Optional(NTi::String())},
+ {"z2", NTi::Optional(embedded)},
+ {"x2", NTi::Optional(NTi::Int64())},
+ {"x1", NTi::Optional(NTi::Int64())},
+ {"y1", NTi::Optional(NTi::String())},
+ {"z1", NTi::Optional(embedded)},
+ }));
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema()
+ .Name("DefaultVariant")
+ .Type(defaultVariantType)
+ )
+ .AddColumn(TColumnSchema()
+ .Name("NoDefault")
+ .Type(noDefaultType)
+ )
+ .AddColumn(TColumnSchema()
+ .Name("SerializationProtobuf")
+ .Type(NTi::Optional(NTi::Struct({ // default_field_flags = SERIALIZATION_PROTOBUF: the embedded message z1 is a string here
+ {"x1", NTi::Optional(NTi::Int64())},
+ {"y1", NTi::Optional(NTi::String())},
+ {"z1", NTi::Optional(NTi::String())},
+ })))
+ )
+ .AddColumn(TColumnSchema()
+ .Name("MemberOfTopLevelOneof") // the top-level oneof member is a plain optional column
+ .Type(NTi::Optional(NTi::Int64()))
+ )
+ );
+ }
+}
+
+static TNode GetColumns(const TFormat& format, int tableIndex = 0) // Returns a copy of the "columns" node of table `tableIndex` from the format config attributes.
+{
+ return format.Config.GetAttributes()["tables"][tableIndex]["columns"];
+}
+
+Y_UNIT_TEST_SUITE(ProtobufFormatFileOptions)
+{
+ Y_UNIT_TEST(TRowFieldSerializationOption) // Format config: the protobuf-serialized field is "message", the YT-serialized one is "structured_message" with nested fields.
+ {
+ const auto format = TFormat::Protobuf<NTestingFileOptions::TRowFieldSerializationOption>();
+ auto columns = GetColumns(format);
+
+ UNIT_ASSERT_VALUES_EQUAL(columns[0]["name"], "UrlRow_1");
+ UNIT_ASSERT_VALUES_EQUAL(columns[0]["proto_type"], "message");
+ UNIT_ASSERT_VALUES_EQUAL(columns[0]["field_number"], 1);
+
+ UNIT_ASSERT_VALUES_EQUAL(columns[1]["name"], "UrlRow_2");
+ UNIT_ASSERT_VALUES_EQUAL(columns[1]["proto_type"], "structured_message");
+ UNIT_ASSERT_VALUES_EQUAL(columns[1]["field_number"], 2);
+ const auto& fields = columns[1]["fields"]; // nested TUrlRow fields of UrlRow_2
+ UNIT_ASSERT_VALUES_EQUAL(fields[0]["name"], "Host");
+ UNIT_ASSERT_VALUES_EQUAL(fields[0]["proto_type"], "string");
+ UNIT_ASSERT_VALUES_EQUAL(fields[0]["field_number"], 1);
+
+ UNIT_ASSERT_VALUES_EQUAL(fields[1]["name"], "Path");
+ UNIT_ASSERT_VALUES_EQUAL(fields[1]["proto_type"], "string");
+ UNIT_ASSERT_VALUES_EQUAL(fields[1]["field_number"], 2);
+
+ UNIT_ASSERT_VALUES_EQUAL(fields[2]["name"], "HttpCode");
+ UNIT_ASSERT_VALUES_EQUAL(fields[2]["proto_type"], "sint32");
+ UNIT_ASSERT_VALUES_EQUAL(fields[2]["field_number"], 3);
+ }
+
+ Y_UNIT_TEST(Map) // Format config: both map columns are "structured_message" with an int64 key field and a structured value field.
+ {
+ const auto format = TFormat::Protobuf<NTestingFileOptions::TWithMap>();
+ auto columns = GetColumns(format);
+
+ UNIT_ASSERT_VALUES_EQUAL(columns.Size(), 2);
+ { // column 0: MapDefault
+ const auto& column = columns[0];
+ UNIT_ASSERT_VALUES_EQUAL(column["name"], "MapDefault");
+ UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 2);
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["proto_type"], "int64");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["proto_type"], "structured_message");
+ }
+ { // column 1: MapDict -- the same expectations as for MapDefault are asserted
+ const auto& column = columns[1];
+ UNIT_ASSERT_VALUES_EQUAL(column["name"], "MapDict");
+ UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 2);
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["proto_type"], "int64");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["proto_type"], "structured_message");
+ }
+ }
+
+ Y_UNIT_TEST(Oneof) // Format config for TWithOneof: a variant oneof appears as a nested "oneof" entry, separate-fields oneofs are flattened.
+ {
+ const auto format = TFormat::Protobuf<NTestingFileOptions::TWithOneof>();
+ auto columns = GetColumns(format);
+
+ UNIT_ASSERT_VALUES_EQUAL(columns.Size(), 4);
+
+ { // column 0: DefaultVariant -- Oneof2 is a nested "oneof" entry, Oneof1 members are flat
+ const auto& column = columns[0];
+ UNIT_ASSERT_VALUES_EQUAL(column["name"], "DefaultVariant");
+ UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 5);
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "field");
+
+ const auto& oneof2 = column["fields"][1];
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["name"], "Oneof2");
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["proto_type"], "oneof");
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][0]["name"], "y2");
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][1]["name"], "z2");
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][1]["proto_type"], "structured_message");
+ const auto& embeddedFields = oneof2["fields"][1]["fields"]; // fields of the embedded message z2
+ UNIT_ASSERT_VALUES_EQUAL(embeddedFields[0]["name"], "x");
+ UNIT_ASSERT_VALUES_EQUAL(embeddedFields[1]["name"], "y");
+
+ UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][2]["name"], "x2");
+
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][2]["name"], "x1");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][3]["name"], "y1");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][4]["name"], "z1");
+ }; // NOTE(review): the `;` after this block is an empty statement and could be dropped
+
+ { // column 1: NoDefault -- both oneof groups are flattened into 7 fields
+ const auto& column = columns[1];
+ UNIT_ASSERT_VALUES_EQUAL(column["name"], "NoDefault");
+ UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message");
+ const auto& fields = column["fields"];
+ UNIT_ASSERT_VALUES_EQUAL(fields.Size(), 7);
+
+ UNIT_ASSERT_VALUES_EQUAL(fields[0]["name"], "field");
+
+ UNIT_ASSERT_VALUES_EQUAL(fields[1]["name"], "y2");
+
+ UNIT_ASSERT_VALUES_EQUAL(fields[2]["name"], "z2");
+ UNIT_ASSERT_VALUES_EQUAL(fields[2]["proto_type"], "structured_message");
+ const auto& embeddedFields = fields[2]["fields"]; // fields of the embedded message z2
+ UNIT_ASSERT_VALUES_EQUAL(embeddedFields[0]["name"], "x");
+ UNIT_ASSERT_VALUES_EQUAL(embeddedFields[1]["name"], "y");
+
+ UNIT_ASSERT_VALUES_EQUAL(fields[3]["name"], "x2");
+
+ UNIT_ASSERT_VALUES_EQUAL(fields[4]["name"], "x1");
+ UNIT_ASSERT_VALUES_EQUAL(fields[5]["name"], "y1");
+ UNIT_ASSERT_VALUES_EQUAL(fields[6]["name"], "z1");
+ }; // NOTE(review): the `;` after this block is an empty statement and could be dropped
+
+ { // column 2: SerializationProtobuf -- oneof members listed as three plain fields
+ const auto& column = columns[2];
+ UNIT_ASSERT_VALUES_EQUAL(column["name"], "SerializationProtobuf");
+ UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 3);
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "x1");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["name"], "y1");
+ UNIT_ASSERT_VALUES_EQUAL(column["fields"][2]["name"], "z1");
+ }
+ { // column 3: member of the top-level oneof is a plain int64 column
+ const auto& column = columns[3];
+ UNIT_ASSERT_VALUES_EQUAL(column["name"], "MemberOfTopLevelOneof");
+ UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "int64");
+ }
+ }
+}
diff --git a/yt/cpp/mapreduce/interface/protobuf_file_options_ut.proto b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.proto
new file mode 100644
index 0000000000..4804b2f60c
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.proto
@@ -0,0 +1,142 @@
+import "yt/yt_proto/yt/formats/extension.proto";
+
+package NYT.NTestingFileOptions;
+
+option (NYT.file_default_field_flags) = SERIALIZATION_YT;
+option (NYT.file_default_field_flags) = MAP_AS_LIST_OF_STRUCTS;
+option (NYT.file_default_message_flags) = DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE;
+option (NYT.file_default_oneof_flags) = SEPARATE_FIELDS;
+
+message TUrlRow // Three-column row used as the embedded message in the tests; column names are set explicitly.
+{
+ optional string Host = 1 [(NYT.column_name) = "Host"];
+ optional string Path = 2 [(NYT.column_name) = "Path"];
+ optional sint32 HttpCode = 3 [(NYT.column_name) = "HttpCode"];
+}
+
+message TRowFieldSerializationOption // One field overrides the file default with SERIALIZATION_PROTOBUF, the other keeps the file default.
+{
+ optional TUrlRow UrlRow_1 = 1 [(NYT.flags) = SERIALIZATION_PROTOBUF]; // field-level override
+ optional TUrlRow UrlRow_2 = 2; // file default (SERIALIZATION_YT) applies
+}
+
+message TRowMixedSerializationOptions // Message-level default SERIALIZATION_PROTOBUF, with one field switching back to SERIALIZATION_YT.
+{
+ option (NYT.default_field_flags) = SERIALIZATION_PROTOBUF;
+ optional TUrlRow UrlRow_1 = 1; // message-level default applies
+ optional TUrlRow UrlRow_2 = 2 [(NYT.flags) = SERIALIZATION_YT]; // field-level override
+}
+
+message TRowSerializedRepeatedFields // Row with a repeated scalar field and a repeated message field.
+{
+ repeated int64 Ints = 1;
+ repeated TUrlRow UrlRows = 2;
+}
+
+message TFieldSortOrder // Three embedded messages with identical fields, differing only in their field-sort-order flag.
+{
+ message TEmbeddedDefault { // no message flag: the file-level default_message_flags applies
+ optional int64 x = 2;
+ optional string y = 12;
+ optional bool z = 1;
+ }
+ message TEmbeddedAsInProtoFile { // explicit declaration-order flag
+ option (NYT.message_flags) = DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE;
+ optional int64 x = 2;
+ optional string y = 12;
+ optional bool z = 1;
+ }
+ message TEmbeddedByFieldNumber { // explicit field-number-order flag: z(1), x(2), y(12)
+ option (NYT.message_flags) = SORT_FIELDS_BY_FIELD_NUMBER;
+ optional int64 x = 2;
+ optional string y = 12;
+ optional bool z = 1;
+ }
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+
+ optional TEmbeddedDefault EmbeddedDefault = 1;
+ optional TEmbeddedAsInProtoFile EmbeddedAsInProtoFile = 2;
+ optional TEmbeddedByFieldNumber EmbeddedByFieldNumber = 3;
+}
+
+message TWithMap // Two map fields with the same key/value types: one with the file default map mode, one with MAP_AS_DICT.
+{
+ message TEmbedded { // value type of both maps
+ optional int64 x = 1;
+ optional string y = 2;
+ }
+
+ map<int64, TEmbedded> MapDefault = 1; // file default (MAP_AS_LIST_OF_STRUCTS) applies
+ map<int64, TEmbedded> MapDict = 5 [(NYT.flags) = MAP_AS_DICT];
+}
+
+message TWithOneof // Covers oneof handling: variant default, separate-fields default, protobuf serialization, and a top-level oneof.
+{
+ message TEmbedded // message type used for the z1/z2 oneof members below
+ {
+ oneof Oneof {
+ int64 x = 1;
+ string y = 2;
+ }
+ }
+
+ message TDefaultVariant // message-level oneof default is VARIANT; Oneof1 overrides it with SEPARATE_FIELDS
+ {
+ option (NYT.default_oneof_flags) = VARIANT;
+ optional string field = 1;
+
+ oneof Oneof2 // takes the message default (VARIANT)
+ {
+ string y2 = 4;
+ TEmbedded z2 = 6;
+ int64 x2 = 2;
+ }
+
+ oneof Oneof1
+ {
+ option (NYT.oneof_flags) = SEPARATE_FIELDS;
+ int64 x1 = 10;
+ string y1 = 3;
+ TEmbedded z1 = 5;
+ }
+ }
+
+ message TNoDefault // no message-level oneof flag: the file default (SEPARATE_FIELDS) applies to both groups
+ {
+ optional string field = 1;
+
+ oneof Oneof2
+ {
+ string y2 = 4;
+ TEmbedded z2 = 6;
+ int64 x2 = 2;
+ }
+
+ oneof Oneof1
+ {
+ int64 x1 = 10;
+ string y1 = 3;
+ TEmbedded z1 = 5;
+ }
+ }
+
+ message TSerializationProtobuf // oneof inside a message whose fields default to SERIALIZATION_PROTOBUF
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_PROTOBUF;
+ oneof Oneof
+ {
+ int64 x1 = 2;
+ string y1 = 1;
+ TEmbedded z1 = 3;
+ }
+ }
+
+ optional TDefaultVariant DefaultVariant = 1;
+ optional TNoDefault NoDefault = 2;
+ optional TSerializationProtobuf SerializationProtobuf = 3;
+
+ oneof TopLevelOneof // oneof directly in the row message
+ {
+ int64 MemberOfTopLevelOneof = 4;
+ }
+}
diff --git a/yt/cpp/mapreduce/interface/protobuf_format.cpp b/yt/cpp/mapreduce/interface/protobuf_format.cpp
new file mode 100644
index 0000000000..3d57ed2797
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/protobuf_format.cpp
@@ -0,0 +1,1498 @@
+#include "protobuf_format.h"
+
+#include "errors.h"
+
+#include <yt/yt_proto/yt/formats/extension.pb.h>
+
+#include <google/protobuf/text_format.h>
+
+#include <library/cpp/yson/node/node_io.h>
+
+#include <util/generic/hash_set.h>
+#include <util/generic/stack.h>
+#include <util/generic/overloaded.h>
+
+#include <util/stream/output.h>
+#include <util/stream/file.h>
+
+namespace NYT::NDetail {
+
+using ::google::protobuf::Descriptor;
+using ::google::protobuf::DescriptorProto;
+using ::google::protobuf::EnumDescriptor;
+using ::google::protobuf::EnumDescriptorProto;
+using ::google::protobuf::FieldDescriptor;
+using ::google::protobuf::FieldDescriptorProto;
+using ::google::protobuf::OneofDescriptor;
+using ::google::protobuf::Message;
+using ::google::protobuf::FileDescriptor;
+using ::google::protobuf::FileDescriptorProto;
+using ::google::protobuf::FileDescriptorSet;
+using ::google::protobuf::FieldOptions;
+using ::google::protobuf::FileOptions;
+using ::google::protobuf::OneofOptions;
+using ::google::protobuf::MessageOptions;
+
+using ::ToString;
+
+namespace {
+
+////////////////////////////////////////////////////////////////////////////////
+
+using TOneofOption = std::variant< // typed value of one oneof-level flag
+ EProtobufOneofMode>;
+
+using TFieldOption = std::variant< // typed value of one field-level flag; each alternative is a separate option category
+ EProtobufType,
+ EProtobufSerializationMode,
+ EProtobufListMode,
+ EProtobufMapMode,
+ EProtobufEnumWritingMode>;
+
+using TMessageOption = std::variant< // typed value of one message-level flag
+ EProtobufFieldSortOrder>;
+
+struct TOtherColumns // empty tag type; used as an alternative of TValueTypeOrOtherColumns below
+{ };
+
+using TValueTypeOrOtherColumns = std::variant<EValueType, TOtherColumns>; // either a concrete value type or the TOtherColumns tag
+
+////////////////////////////////////////////////////////////////////////////////
+
+TFieldOption FieldFlagToOption(EWrapperFieldFlag::Enum flag) // Maps a field flag from the .proto extension to the typed option it denotes.
+{
+ using EFlag = EWrapperFieldFlag;
+ switch (flag) {
+ case EFlag::SERIALIZATION_PROTOBUF: // serialization-mode flags (EMBEDDED below belongs to the same category)
+ return EProtobufSerializationMode::Protobuf;
+ case EFlag::SERIALIZATION_YT:
+ return EProtobufSerializationMode::Yt;
+
+ case EFlag::ANY: // type flags
+ return EProtobufType::Any;
+ case EFlag::OTHER_COLUMNS:
+ return EProtobufType::OtherColumns;
+ case EFlag::ENUM_INT:
+ return EProtobufType::EnumInt;
+ case EFlag::ENUM_STRING:
+ return EProtobufType::EnumString;
+
+ case EFlag::OPTIONAL_LIST: // list-mode flags
+ return EProtobufListMode::Optional;
+ case EFlag::REQUIRED_LIST:
+ return EProtobufListMode::Required;
+
+ case EFlag::MAP_AS_LIST_OF_STRUCTS_LEGACY: // map-mode flags
+ return EProtobufMapMode::ListOfStructsLegacy;
+ case EFlag::MAP_AS_LIST_OF_STRUCTS:
+ return EProtobufMapMode::ListOfStructs;
+ case EFlag::MAP_AS_DICT:
+ return EProtobufMapMode::Dict;
+ case EFlag::MAP_AS_OPTIONAL_DICT:
+ return EProtobufMapMode::OptionalDict;
+ case EFlag::EMBEDDED: // a serialization mode, listed apart from the other two
+ return EProtobufSerializationMode::Embedded;
+
+ case EFlag::ENUM_SKIP_UNKNOWN_VALUES: // enum-writing-mode flags
+ return EProtobufEnumWritingMode::SkipUnknownValues;
+ case EFlag::ENUM_CHECK_VALUES:
+ return EProtobufEnumWritingMode::CheckValues;
+ }
+ Y_FAIL(); // reached only when `flag` matches none of the cases above
+}
+
+TMessageOption MessageFlagToOption(EWrapperMessageFlag::Enum flag) // Maps a message flag to the typed field-sort-order option.
+{
+ using EFlag = EWrapperMessageFlag;
+ switch (flag) {
+ case EFlag::DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE:
+ return EProtobufFieldSortOrder::AsInProtoFile;
+ case EFlag::SORT_FIELDS_BY_FIELD_NUMBER:
+ return EProtobufFieldSortOrder::ByFieldNumber;
+ }
+ Y_FAIL(); // reached only when `flag` matches none of the cases above
+}
+
+TOneofOption OneofFlagToOption(EWrapperOneofFlag::Enum flag) // Maps a oneof flag to the typed oneof-mode option.
+{
+ using EFlag = EWrapperOneofFlag;
+ switch (flag) {
+ case EFlag::SEPARATE_FIELDS:
+ return EProtobufOneofMode::SeparateFields;
+ case EFlag::VARIANT:
+ return EProtobufOneofMode::Variant;
+ }
+ Y_FAIL(); // reached only when `flag` matches none of the cases above
+}
+
+EWrapperFieldFlag::Enum OptionToFieldFlag(TFieldOption option) // Inverse of FieldFlagToOption: returns the .proto flag for a typed field option.
+{
+ using EFlag = EWrapperFieldFlag;
+ struct TVisitor // one overload per alternative of TFieldOption
+ {
+ EFlag::Enum operator() (EProtobufType type)
+ {
+ switch (type) {
+ case EProtobufType::Any:
+ return EFlag::ANY;
+ case EProtobufType::OtherColumns:
+ return EFlag::OTHER_COLUMNS;
+ case EProtobufType::EnumInt:
+ return EFlag::ENUM_INT;
+ case EProtobufType::EnumString:
+ return EFlag::ENUM_STRING;
+ }
+ Y_FAIL(); // reached for any EProtobufType value not listed above
+ }
+ EFlag::Enum operator() (EProtobufSerializationMode serializationMode)
+ {
+ switch (serializationMode) {
+ case EProtobufSerializationMode::Yt:
+ return EFlag::SERIALIZATION_YT;
+ case EProtobufSerializationMode::Protobuf:
+ return EFlag::SERIALIZATION_PROTOBUF;
+ case EProtobufSerializationMode::Embedded:
+ return EFlag::EMBEDDED;
+ }
+ Y_FAIL();
+ }
+ EFlag::Enum operator() (EProtobufListMode listMode)
+ {
+ switch (listMode) {
+ case EProtobufListMode::Optional:
+ return EFlag::OPTIONAL_LIST;
+ case EProtobufListMode::Required:
+ return EFlag::REQUIRED_LIST;
+ }
+ Y_FAIL();
+ }
+ EFlag::Enum operator() (EProtobufMapMode mapMode)
+ {
+ switch (mapMode) {
+ case EProtobufMapMode::ListOfStructsLegacy:
+ return EFlag::MAP_AS_LIST_OF_STRUCTS_LEGACY;
+ case EProtobufMapMode::ListOfStructs:
+ return EFlag::MAP_AS_LIST_OF_STRUCTS;
+ case EProtobufMapMode::Dict:
+ return EFlag::MAP_AS_DICT;
+ case EProtobufMapMode::OptionalDict:
+ return EFlag::MAP_AS_OPTIONAL_DICT;
+ }
+ Y_FAIL();
+ }
+ EFlag::Enum operator() (EProtobufEnumWritingMode enumWritingMode)
+ {
+ switch (enumWritingMode) {
+ case EProtobufEnumWritingMode::SkipUnknownValues:
+ return EFlag::ENUM_SKIP_UNKNOWN_VALUES;
+ case EProtobufEnumWritingMode::CheckValues:
+ return EFlag::ENUM_CHECK_VALUES;
+ }
+ Y_FAIL();
+ }
+ };
+
+ return std::visit(TVisitor(), option); // dispatch on the alternative currently held by `option`
+}
+
+EWrapperMessageFlag::Enum OptionToMessageFlag(TMessageOption option) // Inverse of MessageFlagToOption: returns the .proto flag for a typed message option.
+{
+ using EFlag = EWrapperMessageFlag;
+ struct TVisitor // single overload: TMessageOption has one alternative
+ {
+ EFlag::Enum operator() (EProtobufFieldSortOrder sortOrder)
+ {
+ switch (sortOrder) {
+ case EProtobufFieldSortOrder::AsInProtoFile:
+ return EFlag::DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE;
+ case EProtobufFieldSortOrder::ByFieldNumber:
+ return EFlag::SORT_FIELDS_BY_FIELD_NUMBER;
+ }
+ Y_FAIL(); // reached for any sort order not listed above
+ }
+ };
+
+ return std::visit(TVisitor(), option);
+}
+
+EWrapperOneofFlag::Enum OptionToOneofFlag(TOneofOption option) // Inverse of OneofFlagToOption: returns the .proto flag for a typed oneof option.
+{
+ using EFlag = EWrapperOneofFlag;
+ struct TVisitor // single overload: TOneofOption has one alternative
+ {
+ EFlag::Enum operator() (EProtobufOneofMode mode)
+ {
+ switch (mode) {
+ case EProtobufOneofMode::SeparateFields:
+ return EFlag::SEPARATE_FIELDS;
+ case EProtobufOneofMode::Variant:
+ return EFlag::VARIANT;
+ }
+ Y_FAIL(); // reached for any mode not listed above
+ }
+ };
+
+ return std::visit(TVisitor(), option);
+}
+
+
+template <typename T, typename TOptionToFlag> // T: option enum type; TOptionToFlag: callable turning an option into its flag for error messages
+void SetOption(TMaybe<T>& option, T newOption, TOptionToFlag optionToFlag) // Stores newOption into an unset slot; throws yexception if the slot is already set.
+{
+ if (option) { // a flag of this category was already seen
+ if (*option == newOption) {
+ ythrow yexception() << "Duplicate protobuf flag " << optionToFlag(newOption); // same flag given twice
+ } else {
+ ythrow yexception() << "Incompatible protobuf flags " << // two different flags of one category
+ optionToFlag(*option) << " and " << optionToFlag(newOption);
+ }
+ }
+ option = newOption;
+}
+
+class TParseProtobufFieldOptionsVisitor // std::visit target that records at most one field option per category; a repeated category throws via SetOption.
+{
+public:
+ void operator() (EProtobufType type)
+ {
+ SetOption(Type, type);
+ }
+
+ void operator() (EProtobufSerializationMode serializationMode)
+ {
+ SetOption(SerializationMode, serializationMode);
+ }
+
+ void operator() (EProtobufListMode listMode)
+ {
+ SetOption(ListMode, listMode);
+ }
+
+ void operator() (EProtobufMapMode mapMode)
+ {
+ SetOption(MapMode, mapMode);
+ }
+
+ void operator() (EProtobufEnumWritingMode enumWritingMode)
+ {
+ SetOption(EnumWritingMode, enumWritingMode);
+ }
+
+ template <typename T>
+ void SetOption(TMaybe<T>& option, T newOption) // forwards to the free SetOption, using OptionToFieldFlag to name flags in error messages
+ {
+ NYT::NDetail::SetOption(option, newOption, OptionToFieldFlag);
+ }
+
+public: // collected results; each stays empty when no flag of that category was visited
+ TMaybe<EProtobufType> Type;
+ TMaybe<EProtobufSerializationMode> SerializationMode;
+ TMaybe<EProtobufListMode> ListMode;
+ TMaybe<EProtobufMapMode> MapMode;
+ TMaybe<EProtobufEnumWritingMode> EnumWritingMode;
+};
+
+class TParseProtobufMessageOptionsVisitor // std::visit target that records at most one message option; a repeated one throws via SetOption.
+{
+public:
+ void operator() (EProtobufFieldSortOrder fieldSortOrder)
+ {
+ SetOption(FieldSortOrder, fieldSortOrder);
+ }
+
+ template <typename T>
+ void SetOption(TMaybe<T>& option, T newOption) // forwards to the free SetOption, using OptionToMessageFlag to name flags in error messages
+ {
+ NYT::NDetail::SetOption(option, newOption, OptionToMessageFlag);
+ }
+
+public: // collected result; empty when no sort-order flag was visited
+ TMaybe<EProtobufFieldSortOrder> FieldSortOrder;
+};
+
+class TParseProtobufOneofOptionsVisitor // std::visit target that records at most one oneof mode; a repeated one throws via SetOption.
+{
+public:
+ void operator() (EProtobufOneofMode mode)
+ {
+ SetOption(Mode, mode);
+ }
+
+ template <typename T>
+ void SetOption(TMaybe<T>& option, T newOption) // forwards to the free SetOption, using OptionToOneofFlag to name flags in error messages
+ {
+ NYT::NDetail::SetOption(option, newOption, OptionToOneofFlag);
+ }
+
+public: // collected result; empty when no oneof flag was visited
+ TMaybe<EProtobufOneofMode> Mode;
+};
+
+void ParseProtobufFieldOptions( // Applies a list of field flags on top of *fieldOptions; categories absent from `flags` are left unchanged.
+ const ::google::protobuf::RepeatedField<EWrapperFieldFlag::Enum>& flags, // flags from one level (file, message or field)
+ TProtobufFieldOptions* fieldOptions) // in/out: updated in place
+{
+ TParseProtobufFieldOptionsVisitor visitor; // a single visitor accumulates over all flags, so repeats within `flags` throw
+ for (auto flag : flags) {
+ std::visit(visitor, FieldFlagToOption(flag));
+ }
+ if (visitor.Type) {
+ fieldOptions->Type = *visitor.Type;
+ }
+ if (visitor.SerializationMode) {
+ fieldOptions->SerializationMode = *visitor.SerializationMode;
+ }
+ if (visitor.ListMode) {
+ fieldOptions->ListMode = *visitor.ListMode;
+ }
+ if (visitor.MapMode) {
+ fieldOptions->MapMode = *visitor.MapMode;
+ } // NOTE(review): visitor.EnumWritingMode is collected but never copied into *fieldOptions here -- confirm whether that is intentional
+}
+
+void ParseProtobufMessageOptions( // Applies a list of message flags on top of *messageOptions; untouched when no sort-order flag is present.
+ const ::google::protobuf::RepeatedField<EWrapperMessageFlag::Enum>& flags,
+ TProtobufMessageOptions* messageOptions) // in/out: updated in place
+{
+ TParseProtobufMessageOptionsVisitor visitor; // accumulates over all flags, so repeats within `flags` throw
+ for (auto flag : flags) {
+ std::visit(visitor, MessageFlagToOption(flag));
+ }
+ if (visitor.FieldSortOrder) {
+ messageOptions->FieldSortOrder = *visitor.FieldSortOrder;
+ }
+}
+
+void ParseProtobufOneofOptions( // Applies a list of oneof flags on top of the given oneof options; untouched when no mode flag is present.
+ const ::google::protobuf::RepeatedField<EWrapperOneofFlag::Enum>& flags,
+ TProtobufOneofOptions* messageOptions) // in/out; despite the parameter name, these are oneof options
+{
+ TParseProtobufOneofOptionsVisitor visitor; // accumulates over all flags, so repeats within `flags` throw
+ for (auto flag : flags) {
+ std::visit(visitor, OneofFlagToOption(flag));
+ }
+ if (visitor.Mode) {
+ messageOptions->Mode = *visitor.Mode;
+ }
+}
+
+TProtobufFieldOptions GetDefaultFieldOptions( // Computes the default field options for fields of a message: file-level flags first, then message-level flags.
+ const Descriptor* descriptor, // message whose defaults are computed
+ TProtobufFieldOptions defaultFieldOptions = {}) // starting values, taken by value and returned after both levels are applied
+{
+ ParseProtobufFieldOptions(
+ descriptor->file()->options().GetRepeatedExtension(file_default_field_flags),
+ &defaultFieldOptions);
+ ParseProtobufFieldOptions( // applied second, so message-level flags override file-level ones of the same category
+ descriptor->options().GetRepeatedExtension(default_field_flags),
+ &defaultFieldOptions);
+ return defaultFieldOptions;
+}
+
+ // Computes the default oneof options for oneofs of |descriptor| from the
+ // file-level and then the message-level default oneof flags.
+ // When the resulting mode is Variant but the default field serialization
+ // mode of the message is Protobuf, the mode is downgraded to SeparateFields.
+ TProtobufOneofOptions GetDefaultOneofOptions(const Descriptor* descriptor)
+ {
+     TProtobufOneofOptions result;
+     ParseProtobufOneofOptions(
+         descriptor->file()->options().GetRepeatedExtension(file_default_oneof_flags),
+         &result);
+     ParseProtobufOneofOptions(
+         descriptor->options().GetRepeatedExtension(default_oneof_flags),
+         &result);
+
+     if (result.Mode == EProtobufOneofMode::SeparateFields) {
+         return result;
+     }
+     if (result.Mode != EProtobufOneofMode::Variant) {
+         Y_FAIL();
+     }
+
+     // Variant mode: the outcome depends on the default serialization mode.
+     switch (GetDefaultFieldOptions(descriptor).SerializationMode) {
+         case EProtobufSerializationMode::Protobuf:
+             // For Protobuf serialization mode default is SeparateFields.
+             result.Mode = EProtobufOneofMode::SeparateFields;
+             return result;
+         case EProtobufSerializationMode::Yt:
+         case EProtobufSerializationMode::Embedded:
+             return result;
+     }
+     Y_FAIL();
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Checks that the explicit type option |protobufType| is applicable to the
+ // protobuf type of |fieldDescriptor|: Any and OtherColumns require a bytes
+ // field, EnumInt and EnumString require an enum field.
+ // Throws (through Y_ENSURE) when the field type does not match.
+ void ValidateProtobufType(const FieldDescriptor& fieldDescriptor, EProtobufType protobufType)
+ {
+     const auto fieldType = fieldDescriptor.type();
+     auto ensureType = [&] (FieldDescriptor::Type expectedType) {
+         // Note the leading space in " does not match": without it the field
+         // name and the rest of the message are glued together.
+         Y_ENSURE(fieldType == expectedType,
+             "Type of field " << fieldDescriptor.name() << " does not match specified field flag " <<
+             OptionToFieldFlag(protobufType) << ": "
+             "expected " << FieldDescriptor::TypeName(expectedType) << ", " <<
+             "got " << FieldDescriptor::TypeName(fieldType));
+     };
+     switch (protobufType) {
+         case EProtobufType::Any:
+         case EProtobufType::OtherColumns:
+             ensureType(FieldDescriptor::TYPE_BYTES);
+             return;
+         case EProtobufType::EnumInt:
+         case EProtobufType::EnumString:
+             ensureType(FieldDescriptor::TYPE_ENUM);
+             return;
+     }
+     Y_FAIL();
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Detects cycles while message descriptors are traversed recursively.
+ // Enter() registers a descriptor as being on the current traversal path and
+ // returns a guard that unregisters it on destruction. Entering a descriptor
+ // that is already on the path throws TApiUsageError.
+ class TCycleChecker
+ {
+ private:
+     // Scope guard: registers |descriptor| in the owning checker on
+     // construction and removes it again on destruction.
+     class TGuard
+     {
+     public:
+         TGuard(TCycleChecker* checker, const Descriptor* descriptor)
+             : Checker_(checker)
+             , Descriptor_(descriptor)
+         {
+             Checker_->ActiveVertices_.insert(Descriptor_);
+             Checker_->Stack_.push(Descriptor_);
+         }
+
+         // The destructor undoes exactly one registration, so a copy of the
+         // guard would unregister the descriptor twice. Enter() returns a
+         // prvalue, which needs no copy or move constructor in C++17.
+         TGuard(const TGuard&) = delete;
+         TGuard& operator=(const TGuard&) = delete;
+
+         ~TGuard()
+         {
+             Checker_->ActiveVertices_.erase(Descriptor_);
+             Checker_->Stack_.pop();
+         }
+
+     private:
+         TCycleChecker* Checker_;
+         const Descriptor* Descriptor_;
+     };
+
+ public:
+     // Marks |descriptor| as active for the lifetime of the returned guard.
+     // Throws TApiUsageError naming the most recently entered descriptor and
+     // |descriptor| when |descriptor| is already active.
+     [[nodiscard]] TGuard Enter(const Descriptor* descriptor)
+     {
+         if (ActiveVertices_.contains(descriptor)) {
+             Y_VERIFY(!Stack_.empty());
+             ythrow TApiUsageError() << "Cyclic reference found for protobuf messages. " <<
+                 "Consider removing " << EWrapperFieldFlag::SERIALIZATION_YT << " flag " <<
+                 "somewhere on the cycle containing " <<
+                 Stack_.top()->full_name() << " and " << descriptor->full_name();
+         }
+         return TGuard(this, descriptor);
+     }
+
+ private:
+     // Descriptors currently on the traversal path (for fast lookup).
+     THashSet<const Descriptor*> ActiveVertices_;
+     // The same descriptors in entering order; the top is the latest one.
+     TStack<const Descriptor*> Stack_;
+ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Returns the effective options of |fieldDescriptor|.
+ // The starting point is |defaultFieldOptions| when provided, otherwise the
+ // defaults of the containing message; the field's own flags are applied on
+ // top of that.
+ TProtobufFieldOptions GetFieldOptions(
+     const FieldDescriptor* fieldDescriptor,
+     const TMaybe<TProtobufFieldOptions>& defaultFieldOptions)
+ {
+     auto effectiveOptions = defaultFieldOptions
+         ? *defaultFieldOptions
+         : GetDefaultFieldOptions(fieldDescriptor->containing_type());
+
+     const auto& ownFlags = fieldDescriptor->options().GetRepeatedExtension(flags);
+     ParseProtobufFieldOptions(ownFlags, &effectiveOptions);
+     return effectiveOptions;
+ }
+
+ // Returns the effective options of |oneofDescriptor|.
+ // The starting point is |defaultOneofOptions| when provided, otherwise the
+ // defaults of the containing message; the oneof's own flags are applied on
+ // top. In Variant mode VariantFieldName is filled from the
+ // "variant_field_name" extension or, when that is empty, from the oneof name.
+ // Throws TApiUsageError when "variant_field_name" is set in SeparateFields mode.
+ TProtobufOneofOptions GetOneofOptions(
+     const OneofDescriptor* oneofDescriptor,
+     const TMaybe<TProtobufOneofOptions>& defaultOneofOptions)
+ {
+     auto options = defaultOneofOptions
+         ? *defaultOneofOptions
+         : GetDefaultOneofOptions(oneofDescriptor->containing_type());
+
+     const auto& protoOptions = oneofDescriptor->options();
+     ParseProtobufOneofOptions(protoOptions.GetRepeatedExtension(oneof_flags), &options);
+
+     // Oneofs reported as synthetic by the descriptor are always emitted as
+     // separate fields, regardless of the flags parsed above.
+     if (oneofDescriptor->is_synthetic()) {
+         options.Mode = EProtobufOneofMode::SeparateFields;
+     }
+
+     auto explicitVariantName = protoOptions.GetExtension(variant_field_name);
+     if (options.Mode == EProtobufOneofMode::SeparateFields) {
+         if (explicitVariantName) {
+             ythrow TApiUsageError() << "\"variant_field_name\" requires (NYT.oneof_flags) = VARIANT";
+         }
+     } else if (options.Mode == EProtobufOneofMode::Variant) {
+         if (!explicitVariantName) {
+             options.VariantFieldName = oneofDescriptor->name();
+         } else {
+             options.VariantFieldName = explicitVariantName;
+         }
+     }
+     return options;
+ }
+
+
+ // Returns the message options of |descriptor|: default-constructed options
+ // with the file-level default message flags applied first and the message's
+ // own flags applied second.
+ TProtobufMessageOptions GetMessageOptions(const Descriptor* descriptor)
+ {
+     TProtobufMessageOptions result;
+
+     const auto& fileLevelFlags =
+         descriptor->file()->options().GetRepeatedExtension(file_default_message_flags);
+     ParseProtobufMessageOptions(fileLevelFlags, &result);
+
+     const auto& messageLevelFlags =
+         descriptor->options().GetRepeatedExtension(message_flags);
+     ParseProtobufMessageOptions(messageLevelFlags, &result);
+
+     return result;
+ }
+
+ // Builds a map node that maps the name of every value of |enumDescriptor|
+ // to its number.
+ TNode MakeEnumerationConfig(const ::google::protobuf::EnumDescriptor* enumDescriptor)
+ {
+     auto nameToNumber = TNode::CreateMap();
+     const int valueCount = enumDescriptor->value_count();
+     for (int index = 0; index < valueCount; ++index) {
+         const auto* value = enumDescriptor->value(index);
+         nameToNumber[value->name()] = value->number();
+     }
+     return nameToNumber;
+ }
+
+ // Returns the "proto_type" string for |fieldDescriptor|.
+ // An explicit type option wins (after validation). Otherwise enums map to
+ // the EnumString name, messages map to "message" / "structured_message" /
+ // "embedded_message" depending on the serialization mode, and every other
+ // field uses the protobuf type name reported by the descriptor.
+ TString DeduceProtobufType(
+     const FieldDescriptor* fieldDescriptor,
+     const TProtobufFieldOptions& options)
+ {
+     if (options.Type) {
+         ValidateProtobufType(*fieldDescriptor, *options.Type);
+         return ToString(*options.Type);
+     }
+
+     const auto protoType = fieldDescriptor->type();
+     if (protoType == FieldDescriptor::TYPE_ENUM) {
+         return ToString(EProtobufType::EnumString);
+     }
+     if (protoType != FieldDescriptor::TYPE_MESSAGE) {
+         return fieldDescriptor->type_name();
+     }
+
+     // Message field: the name depends on how the message is serialized.
+     switch (options.SerializationMode) {
+         case EProtobufSerializationMode::Protobuf:
+             return "message";
+         case EProtobufSerializationMode::Yt:
+             return "structured_message";
+         case EProtobufSerializationMode::Embedded:
+             return "embedded_message";
+     }
+     Y_FAIL();
+ }
+
+ // Returns the column name of |field|: the "column_name" extension when it is
+ // non-empty, otherwise the "key_column_name" extension when it is non-empty,
+ // otherwise the protobuf field name.
+ TString GetColumnName(const ::google::protobuf::FieldDescriptor& field)
+ {
+     const auto& fieldOptions = field.options();
+     if (const auto explicitName = fieldOptions.GetExtension(column_name); !explicitName.empty()) {
+         return explicitName;
+     }
+     if (const auto explicitKeyName = fieldOptions.GetExtension(key_column_name); !explicitKeyName.empty()) {
+         return explicitKeyName;
+     }
+     return field.name();
+ }
+
+ // Forward declaration. Builds the list of field configs for |descriptor|
+ // using default field and oneof options derived from the descriptor itself.
+ // Needed because the field-level builders below call it recursively.
+ TNode MakeProtoFormatMessageFieldsConfig(
+ const Descriptor* descriptor,
+ TNode* enumerations,
+ TCycleChecker& cycleChecker);
+
+ // Forward declaration. Same as above, but with explicitly supplied default
+ // field and oneof options.
+ TNode MakeProtoFormatMessageFieldsConfig(
+ const Descriptor* descriptor,
+ TNode* enumerations,
+ const TProtobufFieldOptions& defaultFieldOptions,
+ const TProtobufOneofOptions& defaultOneofOptions,
+ TCycleChecker& cycleChecker);
+
+ // Builds the "fields" config for a protobuf map field from the message type
+ // of the field. In ListOfStructsLegacy mode that message is processed with
+ // its own defaults; in all other map modes its fields default to Yt
+ // serialization and default-constructed oneof options.
+ TNode MakeMapFieldsConfig(
+     const FieldDescriptor* fieldDescriptor,
+     TNode* enumerations,
+     const TProtobufFieldOptions& fieldOptions,
+     TCycleChecker& cycleChecker)
+ {
+     Y_VERIFY(fieldDescriptor->is_map());
+     const auto* entryMessage = fieldDescriptor->message_type();
+     switch (fieldOptions.MapMode) {
+         case EProtobufMapMode::ListOfStructsLegacy:
+             return MakeProtoFormatMessageFieldsConfig(entryMessage, enumerations, cycleChecker);
+         case EProtobufMapMode::ListOfStructs:
+         case EProtobufMapMode::Dict:
+         case EProtobufMapMode::OptionalDict: {
+             TProtobufFieldOptions entryFieldDefaults;
+             entryFieldDefaults.SerializationMode = EProtobufSerializationMode::Yt;
+             const TProtobufOneofOptions entryOneofDefaults{};
+             return MakeProtoFormatMessageFieldsConfig(
+                 entryMessage,
+                 enumerations,
+                 entryFieldDefaults,
+                 entryOneofDefaults,
+                 cycleChecker);
+         }
+     }
+     Y_FAIL();
+ }
+
+ // Builds the config node of a single field: number, column name, repeated /
+ // packed markers and the deduced proto type. Enum fields additionally
+ // register their enumeration in |enumerations|. For Yt-serialized map and
+ // message fields a nested "fields" list is attached.
+ // Throws when the field is EMBEDDED or when a repeated field is not
+ // Yt-serialized.
+ TNode MakeProtoFormatFieldConfig(
+     const FieldDescriptor* fieldDescriptor,
+     TNode* enumerations,
+     const TProtobufFieldOptions& defaultOptions,
+     TCycleChecker& cycleChecker)
+ {
+     auto config = TNode::CreateMap();
+     config["field_number"] = fieldDescriptor->number();
+     config["name"] = GetColumnName(*fieldDescriptor);
+
+     const auto fieldOptions = GetFieldOptions(fieldDescriptor, defaultOptions);
+
+     Y_ENSURE(fieldOptions.SerializationMode != EProtobufSerializationMode::Embedded,
+         "EMBEDDED flag is currently supported only with "
+         "ProtobufFormatWithDescriptors config option set to true");
+
+     const bool isRepeated = fieldDescriptor->is_repeated();
+     if (isRepeated) {
+         Y_ENSURE_EX(fieldOptions.SerializationMode == EProtobufSerializationMode::Yt,
+             TApiUsageError() << "Repeated field \"" << fieldDescriptor->full_name() << "\" " <<
+             "must have flag \"" << EWrapperFieldFlag::SERIALIZATION_YT << "\"");
+     }
+     config["repeated"] = isRepeated;
+     config["packed"] = fieldDescriptor->is_packed();
+     config["proto_type"] = DeduceProtobufType(fieldDescriptor, fieldOptions);
+
+     const bool isMessage = fieldDescriptor->type() == FieldDescriptor::TYPE_MESSAGE;
+     if (fieldDescriptor->type() == FieldDescriptor::TYPE_ENUM) {
+         const auto* enumType = fieldDescriptor->enum_type();
+         const auto& enumName = enumType->full_name();
+         (*enumerations)[enumName] = MakeEnumerationConfig(enumType);
+         config["enumeration_name"] = enumName;
+     }
+
+     // Nested field lists are produced only for Yt-serialized fields.
+     if (fieldOptions.SerializationMode == EProtobufSerializationMode::Yt) {
+         if (fieldDescriptor->is_map()) {
+             config["fields"] = MakeMapFieldsConfig(fieldDescriptor, enumerations, fieldOptions, cycleChecker);
+         } else if (isMessage) {
+             config["fields"] = MakeProtoFormatMessageFieldsConfig(
+                 fieldDescriptor->message_type(),
+                 enumerations,
+                 cycleChecker);
+         }
+     }
+
+     return config;
+ }
+
+ // Appends the config of |oneofDescriptor| to |fields|.
+ // In SeparateFields mode every member of the oneof is appended directly.
+ // In Variant mode a single node with proto_type "oneof" is appended; it
+ // carries the variant field name and the member configs as nested "fields".
+ void MakeProtoFormatOneofConfig(
+     const OneofDescriptor* oneofDescriptor,
+     TNode* enumerations,
+     const TProtobufFieldOptions& defaultFieldOptions,
+     const TProtobufOneofOptions& defaultOneofOptions,
+     TCycleChecker& cycleChecker,
+     TNode* fields)
+ {
+     const auto oneofOptions = GetOneofOptions(oneofDescriptor, defaultOneofOptions);
+
+     // Appends the configs of all oneof members to |target|.
+     auto appendMemberConfigs = [&] (TNode* target) {
+         const int memberCount = oneofDescriptor->field_count();
+         for (int index = 0; index < memberCount; ++index) {
+             target->Add(MakeProtoFormatFieldConfig(
+                 oneofDescriptor->field(index),
+                 enumerations,
+                 defaultFieldOptions,
+                 cycleChecker));
+         }
+     };
+
+     switch (oneofOptions.Mode) {
+         case EProtobufOneofMode::SeparateFields:
+             appendMemberConfigs(fields);
+             return;
+         case EProtobufOneofMode::Variant: {
+             auto memberConfigs = TNode::CreateList();
+             appendMemberConfigs(&memberConfigs);
+             auto variantNode = TNode()
+                 ("proto_type", "oneof")
+                 ("name", oneofOptions.VariantFieldName)
+                 ("fields", std::move(memberConfigs));
+             fields->Add(std::move(variantNode));
+             return;
+         }
+     }
+     Y_FAIL();
+ }
+
+ // Builds the list of field configs for |descriptor| with the supplied
+ // default options. Fields are visited in descriptor order; a oneof is
+ // emitted once, at the position of its first member. The descriptor is
+ // registered in |cycleChecker| for the duration of the call.
+ TNode MakeProtoFormatMessageFieldsConfig(
+     const Descriptor* descriptor,
+     TNode* enumerations,
+     const TProtobufFieldOptions& defaultFieldOptions,
+     const TProtobufOneofOptions& defaultOneofOptions,
+     TCycleChecker& cycleChecker)
+ {
+     auto fieldConfigs = TNode::CreateList();
+     THashSet<const OneofDescriptor*> emittedOneofs;
+     auto cycleGuard = cycleChecker.Enter(descriptor);
+
+     const int fieldCount = descriptor->field_count();
+     for (int index = 0; index < fieldCount; ++index) {
+         const auto* field = descriptor->field(index);
+         const auto* oneof = field->containing_oneof();
+         if (oneof == nullptr) {
+             fieldConfigs.Add(MakeProtoFormatFieldConfig(
+                 field,
+                 enumerations,
+                 defaultFieldOptions,
+                 cycleChecker));
+             continue;
+         }
+         // insert() reports whether this oneof is seen for the first time.
+         if (!emittedOneofs.insert(oneof).second) {
+             continue;
+         }
+         MakeProtoFormatOneofConfig(
+             oneof,
+             enumerations,
+             defaultFieldOptions,
+             defaultOneofOptions,
+             cycleChecker,
+             &fieldConfigs);
+     }
+     return fieldConfigs;
+ }
+
+ // Convenience overload: derives the default field and oneof options from
+ // |descriptor| itself and delegates to the five-argument overload.
+ TNode MakeProtoFormatMessageFieldsConfig(
+     const Descriptor* descriptor,
+     TNode* enumerations,
+     TCycleChecker& cycleChecker)
+ {
+     const auto fieldDefaults = GetDefaultFieldOptions(descriptor);
+     const auto oneofDefaults = GetDefaultOneofOptions(descriptor);
+     return MakeProtoFormatMessageFieldsConfig(
+         descriptor,
+         enumerations,
+         fieldDefaults,
+         oneofDefaults,
+         cycleChecker);
+ }
+
+ // Builds a "protobuf" format node whose attributes hold one "tables" entry
+ // (with its "columns") per descriptor and a shared "enumerations" map.
+ // Every descriptor is processed with its own cycle checker.
+ TNode MakeProtoFormatConfigWithTables(const TVector<const Descriptor*>& descriptors)
+ {
+     TNode config("protobuf");
+     config.Attributes()
+         ("enumerations", TNode::CreateMap())
+         ("tables", TNode::CreateList());
+
+     auto& enumerations = config.Attributes()["enumerations"];
+
+     for (const auto* tableDescriptor : descriptors) {
+         TCycleChecker cycleChecker;
+         auto tableNode = TNode()(
+             "columns",
+             MakeProtoFormatMessageFieldsConfig(tableDescriptor, &enumerations, cycleChecker));
+         config.Attributes()["tables"].Add(std::move(tableNode));
+     }
+
+     return config;
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Collects the message and enum descriptors reachable from the registered
+ // messages and builds a FileDescriptorSet restricted to them: unrelated
+ // types, services, extensions and options that are extensions declared
+ // outside the file of EWrapperFieldFlag are removed from the copied files.
+ class TFileDescriptorSetBuilder
+ {
+ public:
+     TFileDescriptorSetBuilder()
+         : ExtensionFile_(EWrapperFieldFlag::descriptor()->file())
+     { }
+
+     // Registers |descriptor| together with every message it is nested in and
+     // everything its fields refer to. Already registered descriptors are
+     // skipped.
+     void AddDescriptor(const Descriptor* descriptor)
+     {
+         if (!AllDescriptors_.insert(descriptor).second) {
+             return;
+         }
+
+         for (const auto* outer = descriptor->containing_type(); outer; outer = outer->containing_type()) {
+             AddDescriptor(outer);
+         }
+         const int fieldCount = descriptor->field_count();
+         for (int index = 0; index < fieldCount; ++index) {
+             AddField(descriptor->field(index));
+         }
+     }
+
+     // Produces the stripped FileDescriptorSet. Files are emitted so that
+     // every file comes after the files it depends on.
+     FileDescriptorSet Build()
+     {
+         THashSet<const FileDescriptor*> seenFiles;
+         TVector<const FileDescriptor*> orderedFiles;
+         for (const auto* descriptor : AllDescriptors_) {
+             TraverseDependencies(descriptor->file(), seenFiles, orderedFiles);
+         }
+
+         THashSet<TString> keptMessageNames;
+         for (const auto* descriptor : AllDescriptors_) {
+             keptMessageNames.insert(descriptor->full_name());
+         }
+         THashSet<TString> keptEnumNames;
+         for (const auto* enumDescriptor : EnumDescriptors_) {
+             keptEnumNames.insert(enumDescriptor->full_name());
+         }
+
+         FileDescriptorSet result;
+         for (const auto* file : orderedFiles) {
+             auto* fileProto = result.add_file();
+             file->CopyTo(fileProto);
+             Strip(fileProto, keptMessageNames, keptEnumNames);
+         }
+         return result;
+     }
+
+ private:
+     // Registers the message and/or enum type referenced by a field.
+     void AddField(const FieldDescriptor* fieldDescriptor)
+     {
+         if (const auto* messageType = fieldDescriptor->message_type()) {
+             AddDescriptor(messageType);
+         }
+         if (const auto* enumType = fieldDescriptor->enum_type()) {
+             AddEnumDescriptor(enumType);
+         }
+     }
+
+     // Registers an enum and every message it is nested in.
+     void AddEnumDescriptor(const EnumDescriptor* enumDescriptor)
+     {
+         if (!EnumDescriptors_.insert(enumDescriptor).second) {
+             return;
+         }
+         for (const auto* outer = enumDescriptor->containing_type(); outer; outer = outer->containing_type()) {
+             AddDescriptor(outer);
+         }
+     }
+
+     // Depth-first traversal that appends a file after all its dependencies.
+     void TraverseDependencies(
+         const FileDescriptor* current,
+         THashSet<const FileDescriptor*>& visited,
+         TVector<const FileDescriptor*>& topoOrder)
+     {
+         if (!visited.insert(current).second) {
+             return;
+         }
+         const int dependencyCount = current->dependency_count();
+         for (int index = 0; index < dependencyCount; ++index) {
+             TraverseDependencies(current->dependency(index), visited, topoOrder);
+         }
+         topoOrder.push_back(current);
+     }
+
+     // Clears every set option that is an extension declared outside
+     // ExtensionFile_.
+     template <typename TOptions>
+     void StripUnknownOptions(TOptions* options)
+     {
+         auto reflection = options->GetReflection();
+         std::vector<const FieldDescriptor*> presentFields;
+         reflection->ListFields(*options, &presentFields);
+         for (auto field : presentFields) {
+             const bool isForeignExtension = field->is_extension() && field->file() != ExtensionFile_;
+             if (isForeignExtension) {
+                 reflection->ClearField(options, field);
+             }
+         }
+     }
+
+     // Strips the options of an enum and of each of its values.
+     void StripEnumOptions(EnumDescriptorProto* enumProto)
+     {
+         StripUnknownOptions(enumProto->mutable_options());
+         for (auto& enumValue : *enumProto->mutable_value()) {
+             StripUnknownOptions(enumValue.mutable_options());
+         }
+     }
+
+     // Erases all elements of |repeatedField| satisfying |predicate|.
+     template <typename TRepeatedField, typename TPredicate>
+     void RemoveIf(TRepeatedField* repeatedField, TPredicate predicate)
+     {
+         auto firstRemoved = std::remove_if(repeatedField->begin(), repeatedField->end(), predicate);
+         repeatedField->erase(firstRemoved, repeatedField->end());
+     }
+
+     // Strips a message proto in place: drops nested messages and enums whose
+     // full names are not in the given sets, clears extensions, strips
+     // options and recurses into the remaining nested messages.
+     void Strip(
+         const TString& containingTypePrefix,
+         DescriptorProto* messageProto,
+         const THashSet<TString>& messageTypeNames,
+         const THashSet<TString>& enumTypeNames)
+     {
+         const auto prefix = containingTypePrefix + messageProto->name() + '.';
+
+         RemoveIf(messageProto->mutable_nested_type(), [&] (const DescriptorProto& nested) {
+             return !messageTypeNames.contains(prefix + nested.name());
+         });
+         RemoveIf(messageProto->mutable_enum_type(), [&] (const EnumDescriptorProto& nestedEnum) {
+             return !enumTypeNames.contains(prefix + nestedEnum.name());
+         });
+
+         messageProto->clear_extension();
+         StripUnknownOptions(messageProto->mutable_options());
+         for (auto& fieldProto : *messageProto->mutable_field()) {
+             StripUnknownOptions(fieldProto.mutable_options());
+         }
+         for (auto& oneofProto : *messageProto->mutable_oneof_decl()) {
+             StripUnknownOptions(oneofProto.mutable_options());
+         }
+         for (auto& nestedTypeProto : *messageProto->mutable_nested_type()) {
+             Strip(prefix, &nestedTypeProto, messageTypeNames, enumTypeNames);
+         }
+         for (auto& enumProto : *messageProto->mutable_enum_type()) {
+             StripEnumOptions(&enumProto);
+         }
+     }
+
+     // Strips a file proto in place: drops top-level messages and enums whose
+     // full names are not in the given sets, clears services and extensions,
+     // strips options and processes the remaining messages and enums.
+     void Strip(
+         FileDescriptorProto* fileProto,
+         const THashSet<TString>& messageTypeNames,
+         const THashSet<TString>& enumTypeNames)
+     {
+         // Full names are "<package>.<name>", or just "<name>" without a package.
+         const auto prefix = fileProto->package().Empty()
+             ? ""
+             : fileProto->package() + '.';
+
+         RemoveIf(fileProto->mutable_message_type(), [&] (const DescriptorProto& topLevelMessage) {
+             return !messageTypeNames.contains(prefix + topLevelMessage.name());
+         });
+         RemoveIf(fileProto->mutable_enum_type(), [&] (const EnumDescriptorProto& topLevelEnum) {
+             return !enumTypeNames.contains(prefix + topLevelEnum.name());
+         });
+
+         fileProto->clear_service();
+         fileProto->clear_extension();
+
+         StripUnknownOptions(fileProto->mutable_options());
+         for (auto& messageProto : *fileProto->mutable_message_type()) {
+             Strip(prefix, &messageProto, messageTypeNames, enumTypeNames);
+         }
+         for (auto& enumProto : *fileProto->mutable_enum_type()) {
+             StripEnumOptions(&enumProto);
+         }
+     }
+
+ private:
+     // File declaring EWrapperFieldFlag; extensions from it survive stripping.
+     const FileDescriptor* const ExtensionFile_;
+     // All registered message descriptors.
+     THashSet<const Descriptor*> AllDescriptors_;
+     // All registered enum descriptors.
+     THashSet<const EnumDescriptor*> EnumDescriptors_;
+ };
+
+ // Builds a "protobuf" format node whose attributes carry the text form
+ // (ShortDebugString) of a stripped FileDescriptorSet covering |descriptors|
+ // and the list of their full type names, in input order.
+ TNode MakeProtoFormatConfigWithDescriptors(const TVector<const Descriptor*>& descriptors)
+ {
+     TFileDescriptorSetBuilder setBuilder;
+     auto fullTypeNames = TNode::CreateList();
+     for (const auto* messageDescriptor : descriptors) {
+         setBuilder.AddDescriptor(messageDescriptor);
+         fullTypeNames.Add(messageDescriptor->full_name());
+     }
+
+     const auto descriptorSet = setBuilder.Build();
+     auto descriptorSetText = descriptorSet.ShortDebugString();
+
+     TNode config("protobuf");
+     config.Attributes()
+         ("file_descriptor_set_text", std::move(descriptorSetText))
+         ("type_names", std::move(fullTypeNames));
+     return config;
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Result of type inference for one field: either a concrete type or the
+ // TOtherColumns marker produced for fields with the OtherColumns type option.
+ using TTypePtrOrOtherColumns = std::variant<NTi::TTypePtr, TOtherColumns>;
+
+ // A named member collected while walking a message: the column/member name
+ // and its inferred type (or the TOtherColumns marker).
+ struct TMember {
+ TString Name;
+ TTypePtrOrOtherColumns TypeOrOtherColumns;
+ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Maps a field to its scalar value type.
+ // An explicit type option decides the result (EnumInt -> VT_INT64,
+ // EnumString -> VT_STRING, Any -> VT_ANY, OtherColumns -> TOtherColumns).
+ // Without it the C++ type of the field is used; float and double both map to
+ // VT_DOUBLE, strings, messages and enums map to VT_STRING.
+ // Throws yexception for an unexpected C++ type.
+ TValueTypeOrOtherColumns GetScalarFieldType(
+     const FieldDescriptor& fieldDescriptor,
+     const TProtobufFieldOptions& options)
+ {
+     if (!options.Type) {
+         // No explicit override: derive the type from the C++ field type.
+         switch (fieldDescriptor.cpp_type()) {
+             case FieldDescriptor::CPPTYPE_INT32:
+                 return EValueType::VT_INT32;
+             case FieldDescriptor::CPPTYPE_INT64:
+                 return EValueType::VT_INT64;
+             case FieldDescriptor::CPPTYPE_UINT32:
+                 return EValueType::VT_UINT32;
+             case FieldDescriptor::CPPTYPE_UINT64:
+                 return EValueType::VT_UINT64;
+             case FieldDescriptor::CPPTYPE_FLOAT:
+             case FieldDescriptor::CPPTYPE_DOUBLE:
+                 return EValueType::VT_DOUBLE;
+             case FieldDescriptor::CPPTYPE_BOOL:
+                 return EValueType::VT_BOOLEAN;
+             case FieldDescriptor::CPPTYPE_STRING:
+             case FieldDescriptor::CPPTYPE_MESSAGE:
+             case FieldDescriptor::CPPTYPE_ENUM:
+                 return EValueType::VT_STRING;
+             default:
+                 ythrow yexception() <<
+                     "Unexpected field type '" << fieldDescriptor.cpp_type_name() << "' " <<
+                     "for field " << fieldDescriptor.name();
+         }
+     }
+
+     switch (*options.Type) {
+         case EProtobufType::EnumInt:
+             return EValueType::VT_INT64;
+         case EProtobufType::EnumString:
+             return EValueType::VT_STRING;
+         case EProtobufType::Any:
+             return EValueType::VT_ANY;
+         case EProtobufType::OtherColumns:
+             return TOtherColumns{};
+     }
+     Y_FAIL();
+ }
+
+ // Tells whether the field carries a "column_name" or a "key_column_name"
+ // extension.
+ bool HasNameExtension(const FieldDescriptor& fieldDescriptor)
+ {
+     const auto& fieldOptions = fieldDescriptor.options();
+     if (fieldOptions.HasExtension(column_name)) {
+         return true;
+     }
+     return fieldOptions.HasExtension(key_column_name);
+ }
+
+ // Reorders |fieldDescriptors| according to |fieldSortOrder|: AsInProtoFile
+ // leaves the given order untouched, ByFieldNumber sorts by field number.
+ void SortFields(TVector<const FieldDescriptor*>& fieldDescriptors, EProtobufFieldSortOrder fieldSortOrder)
+ {
+     if (fieldSortOrder == EProtobufFieldSortOrder::AsInProtoFile) {
+         return;
+     }
+     if (fieldSortOrder == EProtobufFieldSortOrder::ByFieldNumber) {
+         auto byNumber = [] (const FieldDescriptor* field) {
+             return field->number();
+         };
+         SortBy(fieldDescriptors, byNumber);
+         return;
+     }
+     Y_FAIL();
+ }
+
+ // Builds a struct type from |members|, preserving their order.
+ // |fieldName| is used only in the error message. Throws TApiUsageError when
+ // a member holds TOtherColumns instead of a concrete type.
+ NTi::TTypePtr CreateStruct(TStringBuf fieldName, TVector<TMember> members)
+ {
+     TVector<NTi::TStructType::TOwnedMember> structMembers;
+     structMembers.reserve(members.size());
+     for (auto& member : members) {
+         auto* memberType = std::get_if<NTi::TTypePtr>(&member.TypeOrOtherColumns);
+         if (!memberType) {
+             ythrow TApiUsageError() <<
+                 "Could not deduce YT type for field " << member.Name << " of " <<
+                 "embedded message field " << fieldName << " " <<
+                 "(note that " << EWrapperFieldFlag::OTHER_COLUMNS << " fields " <<
+                 "are not allowed inside embedded messages)";
+         }
+         structMembers.emplace_back(std::move(member.Name), std::move(*memberType));
+     }
+     return NTi::Struct(std::move(structMembers));
+ }
+
+ // Returns the column names of all fields of |descriptor| in descriptor
+ // order, or an empty TMaybe when any field has the OtherColumns type option.
+ TMaybe<TVector<TString>> InferColumnFilter(const ::google::protobuf::Descriptor& descriptor)
+ {
+     const int fieldCount = descriptor.field_count();
+
+     TVector<TString> columnNames;
+     columnNames.reserve(fieldCount);
+     for (int index = 0; index < fieldCount; ++index) {
+         const auto& field = *descriptor.field(index);
+         const bool isOtherColumns = GetFieldOptions(&field).Type == EProtobufType::OtherColumns;
+         if (isOtherColumns) {
+             return {};
+         }
+         columnNames.push_back(GetColumnName(field));
+     }
+     return columnNames;
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Infers a TTableSchema from a protobuf message descriptor.
+ // The member functions are mutually recursive; cycles between messages are
+ // detected with CycleChecker_.
+ class TTableSchemaInferrer
+ {
+ public:
+     // |keepFieldsWithoutExtension|: when false, fields carrying neither a
+     // "column_name" nor a "key_column_name" extension are skipped.
+     // Explicit so that a bool never converts to an inferrer implicitly.
+     explicit TTableSchemaInferrer(bool keepFieldsWithoutExtension)
+         : KeepFieldsWithoutExtension_(keepFieldsWithoutExtension)
+     { }
+
+     // Builds the schema of the top-level message |messageDescriptor|.
+     TTableSchema InferSchema(const Descriptor& messageDescriptor);
+
+ private:
+     // Infers the type of a single field (or TOtherColumns) using
+     // |defaultOptions| as the base for the field's own options.
+     TTypePtrOrOtherColumns GetFieldType(
+         const FieldDescriptor& fieldDescriptor,
+         const TProtobufFieldOptions& defaultOptions);
+
+     // Appends the member(s) produced by a oneof to |members|: its fields
+     // individually, or one optional variant member, depending on the mode.
+     void ProcessOneofField(
+         TStringBuf containingFieldName,
+         const OneofDescriptor& oneofDescriptor,
+         const TProtobufFieldOptions& defaultFieldOptions,
+         const TProtobufOneofOptions& defaultOneofOptions,
+         EProtobufFieldSortOrder fieldSortOrder,
+         TVector<TMember>* members);
+
+     // Returns the members of a message. Note: the second parameter is the
+     // message descriptor despite being named |fieldDescriptor| here.
+     TVector<TMember> GetMessageMembers(
+         TStringBuf containingFieldName,
+         const Descriptor& fieldDescriptor,
+         TProtobufFieldOptions defaultFieldOptions,
+         std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder = std::nullopt);
+
+     // Returns the struct type of the message referenced by a message field.
+     NTi::TTypePtr GetMessageType(
+         const FieldDescriptor& fieldDescriptor,
+         TProtobufFieldOptions defaultFieldOptions);
+
+     // Returns the type of a protobuf map field according to its map mode.
+     NTi::TTypePtr GetMapType(
+         const FieldDescriptor& fieldDescriptor,
+         const TProtobufFieldOptions& fieldOptions);
+
+ private:
+     // Appends the members of a message to |members|; messages of EMBEDDED
+     // fields are flattened into the same list. Note: the second parameter
+     // is the message descriptor despite being named |fieldDescriptor| here.
+     void GetMessageMembersImpl(
+         TStringBuf containingFieldName,
+         const Descriptor& fieldDescriptor,
+         TProtobufFieldOptions defaultFieldOptions,
+         std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder,
+         TVector<TMember>* members);
+
+ private:
+     const bool KeepFieldsWithoutExtension_;
+     TCycleChecker CycleChecker_;
+ };
+
+ // Appends the member(s) produced by |oneofDescriptor| to |members|.
+ // SeparateFields mode: every oneof field becomes its own member.
+ // Variant mode: one member named after the variant field is added; its type
+ // is Optional(Variant(Struct)) over the oneof fields, each with one level of
+ // optionality removed.
+ void TTableSchemaInferrer::ProcessOneofField(
+     TStringBuf containingFieldName,
+     const OneofDescriptor& oneofDescriptor,
+     const TProtobufFieldOptions& defaultFieldOptions,
+     const TProtobufOneofOptions& defaultOneofOptions,
+     EProtobufFieldSortOrder fieldSortOrder,
+     TVector<TMember>* members)
+ {
+     const auto oneofOptions = GetOneofOptions(&oneofDescriptor, defaultOneofOptions);
+
+     // Appends one member per oneof field (sorted by |fieldSortOrder|) to
+     // |target|, optionally unwrapping a top-level Optional.
+     auto appendOneofMembers = [&] (TVector<TMember>* target, bool removeOptionality) {
+         TVector<const FieldDescriptor*> oneofFields;
+         const int fieldCount = oneofDescriptor.field_count();
+         for (int index = 0; index < fieldCount; ++index) {
+             oneofFields.push_back(oneofDescriptor.field(index));
+         }
+         SortFields(oneofFields, fieldSortOrder);
+
+         for (const auto* oneofField : oneofFields) {
+             auto memberType = GetFieldType(*oneofField, defaultFieldOptions);
+             if (removeOptionality) {
+                 auto* concreteType = std::get_if<NTi::TTypePtr>(&memberType);
+                 if (concreteType && (*concreteType)->IsOptional()) {
+                     memberType = (*concreteType)->AsOptional()->GetItemType();
+                 }
+             }
+             target->push_back(TMember{
+                 GetColumnName(*oneofField),
+                 std::move(memberType),
+             });
+         }
+     };
+
+     switch (oneofOptions.Mode) {
+         case EProtobufOneofMode::SeparateFields:
+             appendOneofMembers(members, /* removeOptionality */ false);
+             return;
+         case EProtobufOneofMode::Variant: {
+             TVector<TMember> alternatives;
+             appendOneofMembers(&alternatives, /* removeOptionality */ true);
+             members->push_back(TMember{
+                 oneofOptions.VariantFieldName,
+                 NTi::Optional(
+                     NTi::Variant(
+                         CreateStruct(containingFieldName, std::move(alternatives))
+                     )
+                 )
+             });
+             return;
+         }
+     }
+     Y_FAIL();
+ }
+
+ // Returns the members of |messageDescriptor| as a fresh vector; a thin
+ // wrapper around GetMessageMembersImpl.
+ TVector<TMember> TTableSchemaInferrer::GetMessageMembers(
+     TStringBuf containingFieldName,
+     const Descriptor& messageDescriptor,
+     TProtobufFieldOptions defaultFieldOptions,
+     std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder)
+ {
+     TVector<TMember> collected;
+     GetMessageMembersImpl(
+         containingFieldName,
+         messageDescriptor,
+         defaultFieldOptions,
+         overrideFieldSortOrder,
+         &collected);
+     return collected;
+ }
+
+ // Appends the members of |messageDescriptor| to |members|.
+ // Fields are filtered by KeepFieldsWithoutExtension_, ordered by
+ // |overrideFieldSortOrder| (or the message's own sort order when absent) and
+ // then processed one by one: a oneof is handled once at its first field,
+ // EMBEDDED message fields are flattened recursively into |members|, all other
+ // fields contribute a single member.
+ // Throws when an EMBEDDED field is not a required message field.
+ void TTableSchemaInferrer::GetMessageMembersImpl(
+     TStringBuf containingFieldName,
+     const Descriptor& messageDescriptor,
+     TProtobufFieldOptions defaultFieldOptions,
+     std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder,
+     TVector<TMember>* members)
+ {
+     auto cycleGuard = CycleChecker_.Enter(&messageDescriptor);
+     defaultFieldOptions = GetDefaultFieldOptions(&messageDescriptor, defaultFieldOptions);
+     const auto messageOptions = GetMessageOptions(&messageDescriptor);
+     const auto defaultOneofOptions = GetDefaultOneofOptions(&messageDescriptor);
+
+     const int fieldCount = messageDescriptor.field_count();
+     TVector<const FieldDescriptor*> selectedFields;
+     selectedFields.reserve(fieldCount);
+     for (int index = 0; index < fieldCount; ++index) {
+         const auto* candidate = messageDescriptor.field(index);
+         if (KeepFieldsWithoutExtension_ || HasNameExtension(*candidate)) {
+             selectedFields.push_back(candidate);
+         }
+     }
+
+     SortFields(selectedFields, overrideFieldSortOrder.value_or(messageOptions.FieldSortOrder));
+
+     THashSet<const OneofDescriptor*> processedOneofs;
+     for (const auto* field : selectedFields) {
+         if (const auto* oneof = field->containing_oneof()) {
+             // insert() reports whether this oneof is seen for the first time.
+             if (processedOneofs.insert(oneof).second) {
+                 // Fields inside the oneof use the message's own sort order,
+                 // not |overrideFieldSortOrder|.
+                 ProcessOneofField(
+                     containingFieldName,
+                     *oneof,
+                     defaultFieldOptions,
+                     defaultOneofOptions,
+                     messageOptions.FieldSortOrder,
+                     members);
+             }
+             continue;
+         }
+
+         const auto fieldOptions = GetFieldOptions(field, defaultFieldOptions);
+         if (fieldOptions.SerializationMode != EProtobufSerializationMode::Embedded) {
+             auto fieldType = GetFieldType(*field, defaultFieldOptions);
+             members->push_back(TMember{
+                 GetColumnName(*field),
+                 std::move(fieldType),
+             });
+             continue;
+         }
+
+         // EMBEDDED: the nested message's members are added to the same list.
+         Y_ENSURE(field->type() == FieldDescriptor::TYPE_MESSAGE,
+             "EMBEDDED column must have message type");
+         Y_ENSURE(field->label() == FieldDescriptor::LABEL_REQUIRED,
+             "EMBEDDED column must be marked required");
+         GetMessageMembersImpl(
+             field->full_name(),
+             *field->message_type(),
+             defaultFieldOptions,
+             /*overrideFieldSortOrder*/ std::nullopt,
+             members);
+     }
+ }
+
+ // Returns the struct type describing the message referenced by
+ // |fieldDescriptor|; the field's full name is used as the containing field
+ // name for member collection and error reporting.
+ NTi::TTypePtr TTableSchemaInferrer::GetMessageType(
+     const FieldDescriptor& fieldDescriptor,
+     TProtobufFieldOptions defaultFieldOptions)
+ {
+     Y_VERIFY(fieldDescriptor.message_type());
+     const auto& fieldFullName = fieldDescriptor.full_name();
+     auto structMembers = GetMessageMembers(
+         fieldFullName,
+         *fieldDescriptor.message_type(),
+         defaultFieldOptions);
+     return CreateStruct(fieldFullName, std::move(structMembers));
+ }
+
+ // Returns the type of a protobuf map field according to its map mode.
+ // ListOfStructsLegacy / ListOfStructs: a list of structs built from the map
+ // entry message (entry fields default to Yt serialization only in
+ // ListOfStructs mode), wrapped into Optional when the list mode is Optional.
+ // Dict / OptionalDict: a dict whose key type comes from the first entry
+ // field and whose value type comes from the second entry field with its
+ // optionality removed; OptionalDict additionally wraps the dict in Optional.
+ NTi::TTypePtr TTableSchemaInferrer::GetMapType(
+     const FieldDescriptor& fieldDescriptor,
+     const TProtobufFieldOptions& fieldOptions)
+ {
+     Y_VERIFY(fieldDescriptor.is_map());
+     switch (fieldOptions.MapMode) {
+         case EProtobufMapMode::ListOfStructsLegacy:
+         case EProtobufMapMode::ListOfStructs: {
+             TProtobufFieldOptions embeddedOptions;
+             if (fieldOptions.MapMode == EProtobufMapMode::ListOfStructs) {
+                 embeddedOptions.SerializationMode = EProtobufSerializationMode::Yt;
+             }
+             auto list = NTi::List(GetMessageType(fieldDescriptor, embeddedOptions));
+             switch (fieldOptions.ListMode) {
+                 case EProtobufListMode::Required:
+                     return list;
+                 case EProtobufListMode::Optional:
+                     return NTi::Optional(std::move(list));
+             }
+             Y_FAIL();
+         }
+         case EProtobufMapMode::Dict:
+         case EProtobufMapMode::OptionalDict: {
+             auto message = fieldDescriptor.message_type();
+             Y_VERIFY(message->field_count() == 2);
+             auto keyVariant = GetScalarFieldType(*message->field(0), TProtobufFieldOptions{});
+             Y_VERIFY(std::holds_alternative<EValueType>(keyVariant));
+             auto key = std::get<EValueType>(keyVariant);
+             TProtobufFieldOptions embeddedOptions;
+             embeddedOptions.SerializationMode = EProtobufSerializationMode::Yt;
+             auto valueVariant = GetFieldType(*message->field(1), embeddedOptions);
+             Y_VERIFY(std::holds_alternative<NTi::TTypePtr>(valueVariant));
+             auto value = std::get<NTi::TTypePtr>(valueVariant);
+             // The value type is expected to be optional; the dict stores the
+             // underlying item type.
+             Y_VERIFY(value->IsOptional());
+             value = value->AsOptional()->GetItemType();
+             auto dict = NTi::Dict(ToTypeV3(key, true), value);
+             if (fieldOptions.MapMode == EProtobufMapMode::OptionalDict) {
+                 return NTi::Optional(dict);
+             } else {
+                 return dict;
+             }
+         }
+     }
+     // An out-of-range MapMode must not fall off the end of a non-void
+     // function; fail loudly like the other mode switches in this file.
+     Y_FAIL();
+ }
+
+ // Infers the type of |fieldDescriptor| (or TOtherColumns).
+ // The field's options are resolved on top of |defaultOptions| and an
+ // explicit type option is validated first. Protobuf-serialized fields and
+ // Yt-serialized non-message fields get a scalar type adjusted by the field
+ // label; Yt-serialized maps use GetMapType and other Yt-serialized messages
+ // get a struct type adjusted by the label. EMBEDDED fields are rejected.
+ TTypePtrOrOtherColumns TTableSchemaInferrer::GetFieldType(
+     const FieldDescriptor& fieldDescriptor,
+     const TProtobufFieldOptions& defaultOptions)
+ {
+     const auto fieldOptions = GetFieldOptions(&fieldDescriptor, defaultOptions);
+     if (fieldOptions.Type) {
+         ValidateProtobufType(fieldDescriptor, *fieldOptions.Type);
+     }
+
+     // Scalar type of the field converted to a type pointer, or TOtherColumns.
+     auto scalarType = [&] () -> TTypePtrOrOtherColumns {
+         auto scalar = GetScalarFieldType(fieldDescriptor, fieldOptions);
+         if (const auto* valueType = std::get_if<EValueType>(&scalar)) {
+             return ToTypeV3(*valueType, true);
+         }
+         return TOtherColumns{};
+     };
+
+     // Wraps the given type according to the field label: repeated -> List
+     // (optionally Optional), optional -> Optional, required -> unchanged.
+     auto withFieldLabel = [&] (const TTypePtrOrOtherColumns& typeOrOtherColumns) -> TTypePtrOrOtherColumns {
+         const auto* type = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns);
+         switch (fieldDescriptor.label()) {
+             case FieldDescriptor::LABEL_REPEATED: {
+                 Y_ENSURE(fieldOptions.SerializationMode == EProtobufSerializationMode::Yt,
+                     "Repeated fields are supported only for YT serialization mode, field \"" + fieldDescriptor.full_name() +
+                     "\" has incorrect serialization mode");
+                 Y_ENSURE(type, "OTHER_COLUMNS field can not be repeated");
+                 switch (fieldOptions.ListMode) {
+                     case EProtobufListMode::Required:
+                         return NTi::TTypePtr(NTi::List(*type));
+                     case EProtobufListMode::Optional:
+                         return NTi::TTypePtr(NTi::Optional(NTi::List(*type)));
+                 }
+                 Y_FAIL();
+             }
+             case FieldDescriptor::LABEL_OPTIONAL:
+                 if (type) {
+                     return NTi::TTypePtr(NTi::Optional(*type));
+                 }
+                 return TOtherColumns{};
+             case FieldDescriptor::LABEL_REQUIRED:
+                 Y_ENSURE(type, "OTHER_COLUMNS field can not be required");
+                 return *type;
+         }
+         Y_FAIL();
+     };
+
+     switch (fieldOptions.SerializationMode) {
+         case EProtobufSerializationMode::Protobuf:
+             return withFieldLabel(scalarType());
+         case EProtobufSerializationMode::Yt: {
+             if (fieldDescriptor.type() != FieldDescriptor::TYPE_MESSAGE) {
+                 return withFieldLabel(scalarType());
+             }
+             if (fieldDescriptor.is_map()) {
+                 return GetMapType(fieldDescriptor, fieldOptions);
+             }
+             return withFieldLabel(GetMessageType(fieldDescriptor, TProtobufFieldOptions{}));
+         }
+         case EProtobufSerializationMode::Embedded:
+             ythrow yexception() << "EMBEDDED field is not allowed for field "
+                 << fieldDescriptor.full_name();
+     }
+     Y_FAIL();
+ }
+
+ // Builds the table schema of |messageDescriptor|: every member with a
+ // concrete type becomes a column, while a TOtherColumns member makes the
+ // schema non-strict instead of adding a column.
+ TTableSchema TTableSchemaInferrer::InferSchema(const Descriptor& messageDescriptor)
+ {
+     TTableSchema schema;
+
+     auto topLevelMembers = GetMessageMembers(
+         messageDescriptor.full_name(),
+         messageDescriptor,
+         GetDefaultFieldOptions(&messageDescriptor),
+         // Use special sort order for top level messages.
+         /*overrideFieldSortOrder*/ EProtobufFieldSortOrder::AsInProtoFile);
+
+     for (auto& member : topLevelMembers) {
+         if (auto* columnType = std::get_if<NTi::TTypePtr>(&member.TypeOrOtherColumns)) {
+             schema.AddColumn(TColumnSchema()
+                 .Name(std::move(member.Name))
+                 .Type(std::move(*columnType))
+             );
+         } else {
+             schema.Strict(false);
+         }
+     }
+
+     return schema;
+ }
+
+ // Infers the table schema of |messageDescriptor| with a fresh
+ // TTableSchemaInferrer configured by |keepFieldsWithoutExtension|.
+ TTableSchema CreateTableSchemaImpl(
+     const Descriptor& messageDescriptor,
+     bool keepFieldsWithoutExtension)
+ {
+     return TTableSchemaInferrer(keepFieldsWithoutExtension).InferSchema(messageDescriptor);
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT::NDetail
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Streams the textual name returned by EWrapperFieldFlag_Enum_Name for |value|.
+ template <>
+ void Out<NYT::EWrapperFieldFlag::Enum>(IOutputStream& stream, NYT::EWrapperFieldFlag::Enum value)
+ {
+     const auto& flagName = NYT::EWrapperFieldFlag_Enum_Name(value);
+     stream << flagName;
+ }
+
+ // Streams the textual name returned by EWrapperMessageFlag_Enum_Name for |value|.
+ template <>
+ void Out<NYT::EWrapperMessageFlag::Enum>(IOutputStream& stream, NYT::EWrapperMessageFlag::Enum value)
+ {
+     const auto& flagName = NYT::EWrapperMessageFlag_Enum_Name(value);
+     stream << flagName;
+ }
+
+ // Streams the textual name returned by EWrapperOneofFlag_Enum_Name for |value|.
+ template <>
+ void Out<NYT::EWrapperOneofFlag::Enum>(IOutputStream& stream, NYT::EWrapperOneofFlag::Enum value)
+ {
+     const auto& flagName = NYT::EWrapperOneofFlag_Enum_Name(value);
+     stream << flagName;
+ }
diff --git a/yt/cpp/mapreduce/interface/protobuf_format.h b/yt/cpp/mapreduce/interface/protobuf_format.h
new file mode 100644
index 0000000000..aafbced386
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/protobuf_format.h
@@ -0,0 +1,106 @@
+#pragma once
+
+#include "common.h"
+
+#include <yt/yt_proto/yt/formats/extension.pb.h>
+
+#include <util/generic/maybe.h>
+
+#include <google/protobuf/message.h>
+
+/// @cond Doxygen_Suppress
+ namespace NYT::NDetail {
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // Explicit type override for a field; stored as TProtobufFieldOptions::Type (unset when there is no override).
+ // NOTE(review): presumably selected by the ENUM_INT / ENUM_STRING / ANY / OTHER_COLUMNS field flags -- confirm.
+ enum class EProtobufType
+ {
+ EnumInt /* "enum_int" */,
+ EnumString /* "enum_string" */,
+ Any /* "any" */,
+ OtherColumns /* "other_columns" */,
+ };
+
+ // Serialization mode of a field; stored as TProtobufFieldOptions::SerializationMode.
+ // NOTE(review): presumably selected by the SERIALIZATION_PROTOBUF / SERIALIZATION_YT / EMBEDDED field flags -- confirm.
+ enum class EProtobufSerializationMode
+ {
+ Protobuf,
+ Yt,
+ Embedded,
+ };
+
+ // List mode of a field; stored as TProtobufFieldOptions::ListMode.
+ enum class EProtobufListMode
+ {
+ Optional,
+ Required,
+ };
+
+ // Map mode of a field; stored as TProtobufFieldOptions::MapMode.
+ enum class EProtobufMapMode
+ {
+ ListOfStructsLegacy,
+ ListOfStructs,
+ Dict,
+ OptionalDict,
+ };
+
+ // Order of the fields of a message; stored as TProtobufMessageOptions::FieldSortOrder.
+ enum class EProtobufFieldSortOrder
+ {
+ AsInProtoFile,
+ ByFieldNumber,
+ };
+
+ // Representation of a oneof group; stored as TProtobufOneofOptions::Mode.
+ enum class EProtobufOneofMode
+ {
+ SeparateFields,
+ Variant,
+ };
+
+ // NOTE(review): not referenced anywhere else in this header; usage must be checked at the call sites.
+ enum class EProtobufEnumWritingMode
+ {
+ SkipUnknownValues,
+ CheckValues,
+ };
+
+ // Options of a oneof group. Default-constructed options use the Variant mode and an empty variant field name.
+ struct TProtobufOneofOptions
+ {
+ EProtobufOneofMode Mode = EProtobufOneofMode::Variant;
+ TString VariantFieldName;
+ };
+
+ // Options of a single field. Default-constructed options have no type override, protobuf serialization,
+ // required list mode and the legacy list-of-structs map mode.
+ struct TProtobufFieldOptions
+ {
+ TMaybe<EProtobufType> Type;
+ EProtobufSerializationMode SerializationMode = EProtobufSerializationMode::Protobuf;
+ EProtobufListMode ListMode = EProtobufListMode::Required;
+ EProtobufMapMode MapMode = EProtobufMapMode::ListOfStructsLegacy;
+ };
+
+ // Options of a message. Default-constructed options sort fields by field number.
+ struct TProtobufMessageOptions
+ {
+ EProtobufFieldSortOrder FieldSortOrder = EProtobufFieldSortOrder::ByFieldNumber;
+ };
+
+ TString GetColumnName(const ::google::protobuf::FieldDescriptor& field);
+
+ // `defaultFieldOptions` is an empty TMaybe unless the caller supplies one.
+ TProtobufFieldOptions GetFieldOptions(
+ const ::google::protobuf::FieldDescriptor* fieldDescriptor,
+ const TMaybe<TProtobufFieldOptions>& defaultFieldOptions = {});
+
+ // `defaultOneofOptions` is an empty TMaybe unless the caller supplies one.
+ TProtobufOneofOptions GetOneofOptions(
+ const ::google::protobuf::OneofDescriptor* oneofDescriptor,
+ const TMaybe<TProtobufOneofOptions>& defaultOneofOptions = {});
+
+ TProtobufMessageOptions GetMessageOptions(const ::google::protobuf::Descriptor* descriptor);
+
+ TMaybe<TVector<TString>> InferColumnFilter(const ::google::protobuf::Descriptor& descriptor);
+
+ TNode MakeProtoFormatConfigWithTables(const TVector<const ::google::protobuf::Descriptor*>& descriptors);
+ TNode MakeProtoFormatConfigWithDescriptors(const TVector<const ::google::protobuf::Descriptor*>& descriptors);
+
+ // Infers a table schema from a protobuf message descriptor.
+ TTableSchema CreateTableSchemaImpl(
+ const ::google::protobuf::Descriptor& messageDescriptor,
+ bool keepFieldsWithoutExtension);
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ } // namespace NYT::NDetail
+/// @endcond
diff --git a/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.cpp b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.cpp
new file mode 100644
index 0000000000..19a3d5163f
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.cpp
@@ -0,0 +1,451 @@
+#include "common.h"
+#include "errors.h"
+#include "common_ut.h"
+#include "util/generic/fwd.h"
+
+#include <yt/cpp/mapreduce/interface/protobuf_table_schema_ut.pb.h>
+#include <yt/cpp/mapreduce/interface/proto3_ut.pb.h>
+
+#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <algorithm>
+
+using namespace NYT;
+
+ // Returns true if `schema` has a column whose name equals `name` (linear scan over the columns).
+ bool IsFieldPresent(const TTableSchema& schema, TStringBuf name)
+ {
+ for (const auto& field : schema.Columns()) {
+ if (field.Name() == name) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+Y_UNIT_TEST_SUITE(ProtoSchemaTest_Simple)
+{
+ // Every scalar field of TIntegral is expected to become one column, in the order the fields are declared;
+ // the enum field is expected as a string column.
+ Y_UNIT_TEST(TIntegral)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TIntegral>();
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("DoubleField").Type(ToTypeV3(EValueType::VT_DOUBLE, false)))
+ .AddColumn(TColumnSchema().Name("FloatField").Type(ToTypeV3(EValueType::VT_DOUBLE, false)))
+ .AddColumn(TColumnSchema().Name("Int32Field").Type(ToTypeV3(EValueType::VT_INT32, false)))
+ .AddColumn(TColumnSchema().Name("Int64Field").Type(ToTypeV3(EValueType::VT_INT64, false)))
+ .AddColumn(TColumnSchema().Name("Uint32Field").Type(ToTypeV3(EValueType::VT_UINT32, false)))
+ .AddColumn(TColumnSchema().Name("Uint64Field").Type(ToTypeV3(EValueType::VT_UINT64, false)))
+ .AddColumn(TColumnSchema().Name("Sint32Field").Type(ToTypeV3(EValueType::VT_INT32, false)))
+ .AddColumn(TColumnSchema().Name("Sint64Field").Type(ToTypeV3(EValueType::VT_INT64, false)))
+ .AddColumn(TColumnSchema().Name("Fixed32Field").Type(ToTypeV3(EValueType::VT_UINT32, false)))
+ .AddColumn(TColumnSchema().Name("Fixed64Field").Type(ToTypeV3(EValueType::VT_UINT64, false)))
+ .AddColumn(TColumnSchema().Name("Sfixed32Field").Type(ToTypeV3(EValueType::VT_INT32, false)))
+ .AddColumn(TColumnSchema().Name("Sfixed64Field").Type(ToTypeV3(EValueType::VT_INT64, false)))
+ .AddColumn(TColumnSchema().Name("BoolField").Type(ToTypeV3(EValueType::VT_BOOLEAN, false)))
+ .AddColumn(TColumnSchema().Name("EnumField").Type(ToTypeV3(EValueType::VT_STRING, false))));
+ }
+
+ // The members of the top-level oneof of TOneOf are expected as ordinary separate columns.
+ Y_UNIT_TEST(TOneOf)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TOneOf>();
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("DoubleField").Type(ToTypeV3(EValueType::VT_DOUBLE, false)))
+ .AddColumn(TColumnSchema().Name("Int32Field").Type(ToTypeV3(EValueType::VT_INT32, false)))
+ .AddColumn(TColumnSchema().Name("BoolField").Type(ToTypeV3(EValueType::VT_BOOLEAN, false))));
+ }
+
+ // A `required` proto field is expected as ToTypeV3(..., true), an `optional` one as ToTypeV3(..., false).
+ Y_UNIT_TEST(TWithRequired)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TWithRequired>();
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("RequiredField").Type(ToTypeV3(EValueType::VT_STRING, true)))
+ .AddColumn(TColumnSchema().Name("NotRequiredField").Type(ToTypeV3(EValueType::VT_STRING, false))));
+ }
+
+ // TAggregated declares no serialization flags: its message-typed fields (including the
+ // self-referencing one) are expected as plain string columns.
+ Y_UNIT_TEST(TAggregated)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TAggregated>();
+
+ UNIT_ASSERT_VALUES_EQUAL(6, schema.Columns().size());
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("StringField").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema().Name("BytesField").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema().Name("NestedField").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema().Name("NestedRepeatedField").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema().Name("NestedOneOfField").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema().Name("NestedRecursiveField").Type(ToTypeV3(EValueType::VT_STRING, false))));
+ }
+
+ // Columns are expected to be named by the (NYT.key_column_name) option when it is present
+ // ("key", "subkey") and by the proto field name otherwise ("Data").
+ Y_UNIT_TEST(TAliased)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TAliased>();
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("key").Type(ToTypeV3(EValueType::VT_INT32, false)))
+ .AddColumn(TColumnSchema().Name("subkey").Type(ToTypeV3(EValueType::VT_DOUBLE, false)))
+ .AddColumn(TColumnSchema().Name("Data").Type(ToTypeV3(EValueType::VT_STRING, false))));
+ }
+
+ // Columns passed as sort columns are expected to get SO_ASCENDING sort order; the remaining column has none.
+ Y_UNIT_TEST(SortColumns)
+ {
+ const TSortColumns keys = {"key", "subkey"};
+
+ const auto schema = CreateTableSchema<NUnitTesting::TAliased>(keys);
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema()
+ .Name("key")
+ .Type(ToTypeV3(EValueType::VT_INT32, false))
+ .SortOrder(ESortOrder::SO_ASCENDING))
+ .AddColumn(TColumnSchema()
+ .Name("subkey")
+ .Type(ToTypeV3(EValueType::VT_DOUBLE, false))
+ .SortOrder(ESortOrder::SO_ASCENDING))
+ .AddColumn(TColumnSchema().Name("Data").Type(ToTypeV3(EValueType::VT_STRING, false))));
+ }
+
+ // A sort column that is not declared first ("subkey") is expected to be moved ahead of
+ // the non-sorted columns in the resulting schema.
+ Y_UNIT_TEST(SortColumnsReordered)
+ {
+ const TSortColumns keys = {"subkey"};
+
+ const auto schema = CreateTableSchema<NUnitTesting::TAliased>(keys);
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema()
+ .Name("subkey")
+ .Type(ToTypeV3(EValueType::VT_DOUBLE, false))
+ .SortOrder(ESortOrder::SO_ASCENDING))
+ .AddColumn(TColumnSchema().Name("key").Type(ToTypeV3(EValueType::VT_INT32, false)))
+ .AddColumn(TColumnSchema().Name("Data").Type(ToTypeV3(EValueType::VT_STRING, false))));
+ }
+
+ // A duplicated sort column and a sort column that names no column of the message are both expected to throw.
+ Y_UNIT_TEST(SortColumnsInvalid)
+ {
+ UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TAliased>({"subkey", "subkey"}), yexception);
+ UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TAliased>({"key", "junk"}), yexception);
+ }
+
+ // With the second argument set to true all three columns are expected, including "Data",
+ // which has no column name option in the proto; the schema is expected to stay strict.
+ Y_UNIT_TEST(KeepFieldsWithoutExtensionTrue)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TAliased>({}, true);
+ UNIT_ASSERT(IsFieldPresent(schema, "key"));
+ UNIT_ASSERT(IsFieldPresent(schema, "subkey"));
+ UNIT_ASSERT(IsFieldPresent(schema, "Data"));
+ UNIT_ASSERT(schema.Strict());
+ }
+
+ // With the second argument set to false the "Data" column (no column name option in the proto)
+ // is expected to be dropped; the schema is expected to stay strict.
+ Y_UNIT_TEST(KeepFieldsWithoutExtensionFalse)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TAliased>({}, false);
+ UNIT_ASSERT(IsFieldPresent(schema, "key"));
+ UNIT_ASSERT(IsFieldPresent(schema, "subkey"));
+ UNIT_ASSERT(!IsFieldPresent(schema, "Data"));
+ UNIT_ASSERT(schema.Strict());
+ }
+
+ // Checks the type flags of TWithTypeOptions: ENUM_INT -> int64, ENUM_STRING -> string, ANY -> any.
+ // The OTHER_COLUMNS field is expected to produce no column and to make the schema non-strict.
+ Y_UNIT_TEST(ProtobufTypeOption)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TWithTypeOptions>({});
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .Strict(false)
+ .AddColumn(TColumnSchema().Name("ColorIntField").Type(ToTypeV3(EValueType::VT_INT64, false)))
+ .AddColumn(TColumnSchema().Name("ColorStringField").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema().Name("AnyField").Type(ToTypeV3(EValueType::VT_ANY, false)))
+ .AddColumn(TColumnSchema().Name("EmbeddedField").Type(
+ NTi::Optional(NTi::Struct({
+ {"ColorIntField", ToTypeV3(EValueType::VT_INT64, false)},
+ {"ColorStringField", ToTypeV3(EValueType::VT_STRING, false)},
+ {"AnyField", ToTypeV3(EValueType::VT_ANY, false)}}))))
+ .AddColumn(TColumnSchema().Name("RepeatedEnumIntField").Type(NTi::List(NTi::Int64()))));
+ }
+
+ // Each TWithTypeOptions_TypeMismatch_* message puts a type flag on a field of an unsuitable proto type;
+ // schema creation is expected to throw for every one of them.
+ Y_UNIT_TEST(ProtobufTypeOption_TypeMismatch)
+ {
+ UNIT_ASSERT_EXCEPTION(
+ CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_EnumInt>({}),
+ yexception);
+ UNIT_ASSERT_EXCEPTION(
+ CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_EnumString>({}),
+ yexception);
+ UNIT_ASSERT_EXCEPTION(
+ CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_Any>({}),
+ yexception);
+ UNIT_ASSERT_EXCEPTION(
+ CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_OtherColumns>({}),
+ yexception);
+ }
+}
+
+Y_UNIT_TEST_SUITE(ProtoSchemaTest_Complex)
+{
+ // A repeated field without SERIALIZATION_YT (TRepeated) is expected to throw; with the message-level
+ // SERIALIZATION_YT default (TRepeatedYtMode) it is expected as a list column.
+ Y_UNIT_TEST(TRepeated)
+ {
+ UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TRepeated>(), yexception);
+
+ const auto schema = CreateTableSchema<NUnitTesting::TRepeatedYtMode>();
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("Int32Field").Type(NTi::List(ToTypeV3(EValueType::VT_INT32, true)))));
+ }
+
+ // A repeated field flagged OPTIONAL_LIST is expected as an optional list column.
+ Y_UNIT_TEST(TRepeatedOptionalList)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TOptionalList>();
+ auto type = NTi::Optional(NTi::List(NTi::Int64()));
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("OptionalListInt64").TypeV3(type)));
+ }
+
+ // Test helper: the struct type expected for a TUrlRow field (Host, Path, HttpCode).
+ // Returns the bare struct when `required` is true and the struct wrapped in Optional otherwise.
+ NTi::TTypePtr GetUrlRowType(bool required)
+ {
+ static const NTi::TTypePtr structType = NTi::Struct({
+ {"Host", ToTypeV3(EValueType::VT_STRING, false)},
+ {"Path", ToTypeV3(EValueType::VT_STRING, false)},
+ {"HttpCode", ToTypeV3(EValueType::VT_INT32, false)}});
+ return required ? structType : NTi::TTypePtr(NTi::Optional(structType));
+ }
+
+ // Field-level flag: the field flagged SERIALIZATION_YT is expected as an optional struct,
+ // the unflagged field of the same message type as a string column.
+ Y_UNIT_TEST(TRowFieldSerializationOption)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TRowFieldSerializationOption>();
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType(false)))
+ .AddColumn(TColumnSchema().Name("UrlRow_2").Type(ToTypeV3(EValueType::VT_STRING, false))));
+ }
+
+ // Message-level default: with (NYT.default_field_flags) = SERIALIZATION_YT both message-typed
+ // fields are expected as optional structs.
+ Y_UNIT_TEST(TRowMessageSerializationOption)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TRowMessageSerializationOption>();
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType(false)))
+ .AddColumn(TColumnSchema().Name("UrlRow_2").Type(GetUrlRowType(false))));
+ }
+
+ // A field-level SERIALIZATION_PROTOBUF flag is expected to override the message-level
+ // SERIALIZATION_YT default: UrlRow_2 is expected as a string column.
+ Y_UNIT_TEST(TRowMixedSerializationOptions)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TRowMixedSerializationOptions>();
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType(false)))
+ .AddColumn(TColumnSchema().Name("UrlRow_2").Type(ToTypeV3(EValueType::VT_STRING, false))));
+ }
+
+ // Test helper: the struct type expected for a TUrlRowWithColumnNames field. The member names follow the
+ // column name options of that message ("Host_ColumnName" is expected even though Host also has a key column name).
+ // Returns the bare struct when `required` is true and the struct wrapped in Optional otherwise.
+ NTi::TTypePtr GetUrlRowType_ColumnNames(bool required)
+ {
+ static const NTi::TTypePtr type = NTi::Struct({
+ {"Host_ColumnName", ToTypeV3(EValueType::VT_STRING, false)},
+ {"Path_KeyColumnName", ToTypeV3(EValueType::VT_STRING, false)},
+ {"HttpCode", ToTypeV3(EValueType::VT_INT32, false)},
+ });
+ return required ? type : NTi::TTypePtr(NTi::Optional(type));
+ }
+
+ // Same as TRowMixedSerializationOptions, but the embedded message renames its fields through
+ // column name options; the renamed members are expected inside the struct type.
+ Y_UNIT_TEST(TRowMixedSerializationOptions_ColumnNames)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TRowMixedSerializationOptions_ColumnNames>();
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType_ColumnNames(false)))
+ .AddColumn(TColumnSchema().Name("UrlRow_2").Type(ToTypeV3(EValueType::VT_STRING, false))));
+ }
+
+ // Serialization options are expected not to be inherited in either direction:
+ // - a top-level field becomes a struct only when the field itself is flagged SERIALIZATION_YT,
+ // regardless of the default flags declared by the field's message type;
+ // - inside such a struct, the nested `embedded` field becomes a struct only when the message that
+ // declares it has the SERIALIZATION_YT default (TEmbeddedYt); otherwise it is expected as a string.
+ Y_UNIT_TEST(NoOptionInheritance)
+ {
+ auto deepestEmbedded = NTi::Optional(NTi::Struct({{"x", ToTypeV3(EValueType::VT_INT64, false)}}));
+
+ const auto schema = CreateTableSchema<NUnitTesting::TNoOptionInheritance>();
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema()
+ .Name("EmbeddedYt_YtOption")
+ .Type(NTi::Optional(NTi::Struct({{"embedded", deepestEmbedded}}))))
+ .AddColumn(TColumnSchema().Name("EmbeddedYt_ProtobufOption").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema().Name("EmbeddedYt_NoOption").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema()
+ .Name("EmbeddedProtobuf_YtOption")
+ .Type(NTi::Optional(NTi::Struct({{"embedded", ToTypeV3(EValueType::VT_STRING, false)}}))))
+ .AddColumn(TColumnSchema().Name("EmbeddedProtobuf_ProtobufOption").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema().Name("EmbeddedProtobuf_NoOption").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema()
+ .Name("Embedded_YtOption")
+ .Type(NTi::Optional(NTi::Struct({{"embedded", ToTypeV3(EValueType::VT_STRING, false)}}))))
+ .AddColumn(TColumnSchema().Name("Embedded_ProtobufOption").Type(ToTypeV3(EValueType::VT_STRING, false)))
+ .AddColumn(TColumnSchema().Name("Embedded_NoOption").Type(ToTypeV3(EValueType::VT_STRING, false))));
+ }
+
+ // Messages that reference each other cyclically through SERIALIZATION_YT fields are expected to be
+ // rejected with TApiUsageError. TE refers to the cycle only through a SERIALIZATION_PROTOBUF field,
+ // so a single optional string column is expected for it.
+ Y_UNIT_TEST(Cyclic)
+ {
+ UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic>(), TApiUsageError);
+ UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TA>(), TApiUsageError);
+ UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TB>(), TApiUsageError);
+ UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TC>(), TApiUsageError);
+ UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TD>(), TApiUsageError);
+
+ ASSERT_SERIALIZABLES_EQUAL(
+ TTableSchema().AddColumn(
+ TColumnSchema().Name("d").TypeV3(NTi::Optional(NTi::String()))),
+ CreateTableSchema<NUnitTesting::TCyclic::TE>());
+ }
+
+ // Order of struct members of embedded messages: with no message flag and with SORT_FIELDS_BY_FIELD_NUMBER
+ // the members are expected by ascending field number (z = 1, x = 2, y = 12); with
+ // DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE they are expected in declaration order (x, y, z).
+ Y_UNIT_TEST(FieldSortOrder)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TFieldSortOrder>();
+
+ auto byFieldNumber = NTi::Optional(NTi::Struct({
+ {"z", NTi::Optional(NTi::Bool())},
+ {"x", NTi::Optional(NTi::Int64())},
+ {"y", NTi::Optional(NTi::String())},
+ }));
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema().Name("EmbeddedDefault").Type(byFieldNumber))
+ .AddColumn(TColumnSchema()
+ .Name("EmbeddedAsInProtoFile")
+ .Type(NTi::Optional(NTi::Struct({
+ {"x", NTi::Optional(NTi::Int64())},
+ {"y", NTi::Optional(NTi::String())},
+ {"z", NTi::Optional(NTi::Bool())},
+ }))))
+ .AddColumn(TColumnSchema().Name("EmbeddedByFieldNumber").Type(byFieldNumber)));
+ }
+
+ // Expected column types for proto map fields of TWithMap, one per MAP_AS_* flag:
+ // - no flag and MAP_AS_LIST_OF_STRUCTS_LEGACY: list of {key, value} structs with the message value as a string;
+ // - MAP_AS_LIST_OF_STRUCTS: list of {key, value} structs with the message value as a struct;
+ // - MAP_AS_OPTIONAL_DICT: optional dict; MAP_AS_DICT: dict.
+ Y_UNIT_TEST(Map)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TWithMap>();
+
+ auto createKeyValueStruct = [] (NTi::TTypePtr key, NTi::TTypePtr value) {
+ return NTi::List(NTi::Struct({
+ {"key", NTi::Optional(key)},
+ {"value", NTi::Optional(value)},
+ }));
+ };
+
+ auto embedded = NTi::Struct({
+ {"x", NTi::Optional(NTi::Int64())},
+ {"y", NTi::Optional(NTi::String())},
+ });
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema()
+ .Name("MapDefault")
+ .Type(createKeyValueStruct(NTi::Int64(), NTi::String())))
+ .AddColumn(TColumnSchema()
+ .Name("MapListOfStructsLegacy")
+ .Type(createKeyValueStruct(NTi::Int64(), NTi::String())))
+ .AddColumn(TColumnSchema()
+ .Name("MapListOfStructs")
+ .Type(createKeyValueStruct(NTi::Int64(), embedded)))
+ .AddColumn(TColumnSchema()
+ .Name("MapOptionalDict")
+ .Type(NTi::Optional(NTi::Dict(NTi::Int64(), embedded))))
+ .AddColumn(TColumnSchema()
+ .Name("MapDict")
+ .Type(NTi::Dict(NTi::Int64(), embedded))));
+ }
+
+ // Expected representation of oneof groups of TWithOneof:
+ // - a oneof in VARIANT mode is expected as one optional variant member, named by
+ // (NYT.variant_field_name) when that option is set and by the oneof name otherwise;
+ // - a oneof in SEPARATE_FIELDS mode is expected as separate optional members;
+ // - TSerializationProtobuf declares no flags: its oneof members are expected as separate optional
+ // members with the message-typed one as a string;
+ // - the top-level oneof is expected as an optional variant column.
+ Y_UNIT_TEST(Oneof)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TWithOneof>();
+
+ auto embedded = NTi::Struct({
+ {"Oneof", NTi::Optional(NTi::Variant(NTi::Struct({
+ {"x", NTi::Int64()},
+ {"y", NTi::String()},
+ })))},
+ });
+
+ // TDefaultSeparateFields and TNoDefault are expected to differ only in the name of the variant member.
+ auto createType = [&] (TString oneof2Name) {
+ return NTi::Optional(NTi::Struct({
+ {"field", NTi::Optional(NTi::String())},
+ {oneof2Name, NTi::Optional(NTi::Variant(NTi::Struct({
+ {"x2", NTi::Int64()},
+ {"y2", NTi::String()},
+ {"z2", embedded},
+ })))},
+ {"y1", NTi::Optional(NTi::String())},
+ {"z1", NTi::Optional(embedded)},
+ {"x1", NTi::Optional(NTi::Int64())},
+ }));
+ };
+
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema()
+ .Name("DefaultSeparateFields")
+ .Type(createType("variant_field_name")))
+ .AddColumn(TColumnSchema()
+ .Name("NoDefault")
+ .Type(createType("Oneof2")))
+ .AddColumn(TColumnSchema()
+ .Name("SerializationProtobuf")
+ .Type(NTi::Optional(NTi::Struct({
+ {"y1", NTi::Optional(NTi::String())},
+ {"x1", NTi::Optional(NTi::Int64())},
+ {"z1", NTi::Optional(NTi::String())},
+ }))))
+ .AddColumn(TColumnSchema()
+ .Name("TopLevelOneof")
+ .Type(
+ NTi::Optional(
+ NTi::Variant(NTi::Struct({
+ {"MemberOfTopLevelOneof", NTi::Int64()}
+ }))
+ )
+ ))
+ );
+ }
+
+ // Fields flagged EMBEDDED are expected to be flattened: the columns of TEmbedded2Message and
+ // TEmbedded1Message appear directly in the schema of TEmbeddingMessage, ahead of its own columns.
+ // The OTHER_COLUMNS field is expected to produce no column and to make the schema non-strict.
+ Y_UNIT_TEST(Embedded)
+ {
+ const auto schema = CreateTableSchema<NUnitTesting::TEmbeddingMessage>();
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .Strict(false)
+ .AddColumn(TColumnSchema().Name("embedded2_num").Type(NTi::Optional(NTi::Uint64())))
+ .AddColumn(TColumnSchema().Name("embedded2_struct").Type(NTi::Optional(NTi::Struct({
+ {"float1", NTi::Optional(NTi::Double())},
+ {"string1", NTi::Optional(NTi::String())},
+ }))))
+ .AddColumn(TColumnSchema().Name("embedded2_repeated").Type(NTi::List(NTi::String())))
+ .AddColumn(TColumnSchema().Name("embedded_num").Type(NTi::Optional(NTi::Uint64())))
+ .AddColumn(TColumnSchema().Name("embedded_extra_field").Type(NTi::Optional(NTi::String())))
+ .AddColumn(TColumnSchema().Name("variant").Type(NTi::Optional(NTi::Variant(NTi::Struct({
+ {"str_variant", NTi::String()},
+ {"uint_variant", NTi::Uint64()},
+ })))))
+ .AddColumn(TColumnSchema().Name("num").Type(NTi::Optional(NTi::Uint64())))
+ .AddColumn(TColumnSchema().Name("extra_field").Type(NTi::Optional(NTi::String())))
+ );
+ }
+}
+
+Y_UNIT_TEST_SUITE(ProtoSchemaTest_Proto3)
+{
+ // The schema of the proto3 message TWithOptional is expected to be a single optional int64 column "x".
+ Y_UNIT_TEST(TWithOptional)
+ {
+ const auto schema = CreateTableSchema<NTestingProto3::TWithOptional>();
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema()
+ .Name("x").Type(NTi::Optional(NTi::Int64()))
+ )
+ );
+ }
+
+ // The schema of the proto3 message TWithOptionalMessage is expected to be a single column "x"
+ // holding an optional struct with one optional int64 member "x".
+ Y_UNIT_TEST(TWithOptionalMessage)
+ {
+ const auto schema = CreateTableSchema<NTestingProto3::TWithOptionalMessage>();
+ ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema()
+ .AddColumn(TColumnSchema()
+ .Name("x").Type(
+ NTi::Optional(
+ NTi::Struct({{"x", NTi::Optional(NTi::Int64())}})
+ )
+ )
+ )
+ );
+ }
+}
diff --git a/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.proto b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.proto
new file mode 100644
index 0000000000..60bad6e650
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.proto
@@ -0,0 +1,402 @@
+import "yt/yt_proto/yt/formats/extension.proto";
+
+package NYT.NUnitTesting;
+
+ // One optional field of every scalar proto type plus an enum field; no NYT options are set.
+ message TIntegral
+ {
+ optional double DoubleField = 1;
+ optional float FloatField = 2;
+ optional int32 Int32Field = 3;
+ optional int64 Int64Field = 4;
+ optional uint32 Uint32Field = 5;
+ optional uint64 Uint64Field = 6;
+ optional sint32 Sint32Field = 7;
+ optional sint64 Sint64Field = 8;
+ optional fixed32 Fixed32Field = 9;
+ optional fixed64 Fixed64Field = 10;
+ optional sfixed32 Sfixed32Field = 11;
+ optional sfixed64 Sfixed64Field = 12;
+ optional bool BoolField = 13;
+ enum TriBool
+ {
+ TRI_FALSE = 0;
+ TRI_TRUE = 1;
+ TRI_UNDEF = -1;
+ }
+ optional TriBool EnumField = 14;
+ }
+
+ // A repeated field with no serialization flags; the TRepeated test expects schema creation to throw for it.
+ message TRepeated
+ {
+ repeated int32 Int32Field = 1;
+ }
+
+ // Same field as TRepeated, but with SERIALIZATION_YT as the message-level default field flag.
+ message TRepeatedYtMode
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ repeated int32 Int32Field = 1;
+ }
+
+ // Exercises the type flags (ENUM_INT, ENUM_STRING, ANY, OTHER_COLUMNS) both at the top level
+ // and inside an embedded message serialized in YT mode.
+ message TWithTypeOptions
+ {
+ enum Color
+ {
+ WHITE = 0;
+ BLUE = 1;
+ RED = -1;
+ }
+
+ message TEmbedded
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+
+ optional Color ColorIntField = 1 [(NYT.flags) = ENUM_INT];
+ optional Color ColorStringField = 2 [(NYT.flags) = ENUM_STRING];
+ optional bytes AnyField = 3 [(NYT.flags) = ANY];
+ }
+
+ optional Color ColorIntField = 1 [(NYT.flags) = ENUM_INT];
+ optional Color ColorStringField = 2 [(NYT.flags) = ENUM_STRING];
+ optional bytes AnyField = 3 [(NYT.flags) = ANY];
+ optional bytes OtherColumnsField = 4 [(NYT.flags) = OTHER_COLUMNS];
+ optional TEmbedded EmbeddedField = 5 [(NYT.flags) = SERIALIZATION_YT];
+ repeated Color RepeatedEnumIntField = 6 [(NYT.flags) = SERIALIZATION_YT, (NYT.flags) = ENUM_INT];
+ }
+
+ // Negative case: ENUM_INT is applied to an int64 field rather than to an enum field.
+ message TWithTypeOptions_TypeMismatch_EnumInt
+ {
+ optional int64 EnumField = 1 [(NYT.flags) = ENUM_INT];
+ }
+
+ // Negative case: ENUM_STRING is applied to a string field rather than to an enum field.
+ message TWithTypeOptions_TypeMismatch_EnumString
+ {
+ optional string EnumField = 1 [(NYT.flags) = ENUM_STRING];
+ }
+
+ // Negative case: ANY is applied to a string field (the positive cases in TWithTypeOptions use bytes).
+ message TWithTypeOptions_TypeMismatch_Any
+ {
+ optional string AnyField = 1 [(NYT.flags) = ANY];
+ }
+
+ // Negative case: OTHER_COLUMNS is applied to a string field (the positive case in TWithTypeOptions uses bytes).
+ message TWithTypeOptions_TypeMismatch_OtherColumns
+ {
+ optional string OtherColumnsField = 1 [(NYT.flags) = OTHER_COLUMNS];
+ }
+
+ // A top-level oneof with two scalar members followed by a regular optional field; no NYT options are set.
+ message TOneOf
+ {
+ oneof Chooser
+ {
+ double DoubleField = 1;
+ int32 Int32Field = 2;
+ }
+ optional bool BoolField = 3;
+ }
+
+ // One required and one optional field, used to check how requiredness is reflected in the column type.
+ message TWithRequired
+ {
+ required string RequiredField = 1;
+ optional string NotRequiredField = 2;
+ };
+
+ // String and bytes fields plus message-typed fields (one of them self-referencing); no NYT options are set.
+ message TAggregated
+ {
+ optional string StringField = 1;
+ optional bytes BytesField = 2;
+ optional TIntegral NestedField = 3;
+ optional TRepeated NestedRepeatedField = 4;
+ optional TOneOf NestedOneOfField = 5;
+ optional TAggregated NestedRecursiveField = 6;
+ }
+
+ // Two fields renamed through (NYT.key_column_name) and one field without any column name option.
+ message TAliased
+ {
+ optional int32 Key = 1 [(NYT.key_column_name) = "key"];
+ optional double Subkey = 2 [(NYT.key_column_name) = "subkey"];
+ optional TAggregated Data = 3;
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+ // Row message embedded by the TRow* messages below; every field sets (NYT.column_name) equal to its own name.
+ message TUrlRow
+ {
+ optional string Host = 1 [(NYT.column_name) = "Host"];
+ optional string Path = 2 [(NYT.column_name) = "Path"];
+ optional sint32 HttpCode = 3 [(NYT.column_name) = "HttpCode"];
+ }
+
+ // SERIALIZATION_YT is set on one field only; the other field carries no flag.
+ message TRowFieldSerializationOption
+ {
+ optional TUrlRow UrlRow_1 = 1 [(NYT.flags) = SERIALIZATION_YT];
+ optional TUrlRow UrlRow_2 = 2;
+ }
+
+ // SERIALIZATION_YT is set as the message-level default; neither field carries its own flag.
+ message TRowMessageSerializationOption
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ optional TUrlRow UrlRow_1 = 1;
+ optional TUrlRow UrlRow_2 = 2;
+ }
+
+ // Message-level SERIALIZATION_YT default combined with a field-level SERIALIZATION_PROTOBUF flag on UrlRow_2.
+ message TRowMixedSerializationOptions
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ optional TUrlRow UrlRow_1 = 1;
+ optional TUrlRow UrlRow_2 = 2 [(NYT.flags) = SERIALIZATION_PROTOBUF];
+ }
+
+ // Repeated scalar and repeated message fields under the message-level SERIALIZATION_YT default.
+ message TRowSerializedRepeatedFields
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ repeated int64 Ints = 1;
+ repeated TUrlRow UrlRows = 2;
+ }
+
+ // Variant of TUrlRow mixing the naming options: Host sets both column_name and key_column_name,
+ // Path sets only key_column_name, HttpCode sets neither.
+ message TUrlRowWithColumnNames
+ {
+ optional string Host = 1 [(NYT.column_name) = "Host_ColumnName", (NYT.key_column_name) = "Host_KeyColumnName"];
+ optional string Path = 2 [(NYT.key_column_name) = "Path_KeyColumnName"];
+ optional sint32 HttpCode = 3;
+ }
+
+ // Same layout as TRowMixedSerializationOptions, but embedding TUrlRowWithColumnNames.
+ message TRowMixedSerializationOptions_ColumnNames
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ optional TUrlRowWithColumnNames UrlRow_1 = 1;
+ optional TUrlRowWithColumnNames UrlRow_2 = 2 [(NYT.flags) = SERIALIZATION_PROTOBUF];
+ }
+
+ // Cross product of three embedded message types (YT default, protobuf default, no default)
+ // with three field-level settings (SERIALIZATION_YT, SERIALIZATION_PROTOBUF, no flag).
+ message TNoOptionInheritance
+ {
+ message TDeepestEmbedded
+ {
+ optional int64 x = 1;
+ }
+
+ message TEmbedded
+ {
+ optional TDeepestEmbedded embedded = 1;
+ }
+
+ message TEmbeddedYt
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+
+ optional TDeepestEmbedded embedded = 1;
+ }
+
+ message TEmbeddedProtobuf
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_PROTOBUF;
+
+ optional TDeepestEmbedded embedded = 1;
+ }
+
+ optional TEmbeddedYt EmbeddedYt_YtOption = 1 [(NYT.flags) = SERIALIZATION_YT];
+ optional TEmbeddedYt EmbeddedYt_ProtobufOption = 2 [(NYT.flags) = SERIALIZATION_PROTOBUF];
+ optional TEmbeddedYt EmbeddedYt_NoOption = 3;
+ optional TEmbeddedProtobuf EmbeddedProtobuf_YtOption = 4 [(NYT.flags) = SERIALIZATION_YT];
+ optional TEmbeddedProtobuf EmbeddedProtobuf_ProtobufOption = 5 [(NYT.flags) = SERIALIZATION_PROTOBUF];
+ optional TEmbeddedProtobuf EmbeddedProtobuf_NoOption = 6;
+ optional TEmbedded Embedded_YtOption = 7 [(NYT.flags) = SERIALIZATION_YT];
+ optional TEmbedded Embedded_ProtobufOption = 8 [(NYT.flags) = SERIALIZATION_PROTOBUF];
+ optional TEmbedded Embedded_NoOption = 9;
+ }
+
+ // A repeated field flagged both OPTIONAL_LIST and SERIALIZATION_YT.
+ message TOptionalList
+ {
+ repeated int64 OptionalListInt64 = 1 [(NYT.flags) = OPTIONAL_LIST, (NYT.flags) = SERIALIZATION_YT];
+ }
+
+ // A packed repeated field flagged SERIALIZATION_YT.
+ message TPacked
+ {
+ repeated int64 PackedListInt64 = 1 [(NYT.flags) = SERIALIZATION_YT, packed=true];
+ }
+
+ // Messages forming reference cycles under SERIALIZATION_YT: TA -> TB -> TD -> TA and TA -> TC -> TD -> TA.
+ // TE refers to TD through a SERIALIZATION_PROTOBUF field only.
+ message TCyclic
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+
+ message TA
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ repeated TB b = 1;
+ optional TC c = 2;
+ }
+
+ message TB
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ optional TD d = 1;
+ }
+
+ message TC
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ optional TD d = 1;
+ }
+
+ message TD
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ optional TA a = 1;
+ }
+
+ message TE
+ {
+ optional TD d = 1 [(NYT.flags) = SERIALIZATION_PROTOBUF];
+ }
+
+ optional TA a = 1;
+ }
+
+ // Three embedded messages with identical fields whose declaration order (x, y, z) differs from their
+ // field-number order (z = 1, x = 2, y = 12); they differ only in the (NYT.message_flags) sort option.
+ message TFieldSortOrder
+ {
+ message TEmbeddedDefault {
+ optional int64 x = 2;
+ optional string y = 12;
+ optional bool z = 1;
+ }
+ message TEmbeddedAsInProtoFile {
+ option (NYT.message_flags) = DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE;
+ optional int64 x = 2;
+ optional string y = 12;
+ optional bool z = 1;
+ }
+ message TEmbeddedByFieldNumber {
+ option (NYT.message_flags) = SORT_FIELDS_BY_FIELD_NUMBER;
+ optional int64 x = 2;
+ optional string y = 12;
+ optional bool z = 1;
+ }
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+
+ optional TEmbeddedDefault EmbeddedDefault = 1;
+ optional TEmbeddedAsInProtoFile EmbeddedAsInProtoFile = 2;
+ optional TEmbeddedByFieldNumber EmbeddedByFieldNumber = 3;
+ }
+
+ // The same map<int64, TEmbedded> declared once without a map flag and once per MAP_AS_* flag.
+ message TWithMap
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+
+ message TEmbedded {
+ optional int64 x = 1;
+ optional string y = 2;
+ }
+
+ map<int64, TEmbedded> MapDefault = 1;
+ map<int64, TEmbedded> MapListOfStructsLegacy = 2 [(NYT.flags) = MAP_AS_LIST_OF_STRUCTS_LEGACY];
+ map<int64, TEmbedded> MapListOfStructs = 3 [(NYT.flags) = MAP_AS_LIST_OF_STRUCTS];
+ map<int64, TEmbedded> MapOptionalDict = 4 [(NYT.flags) = MAP_AS_OPTIONAL_DICT];
+ map<int64, TEmbedded> MapDict = 5 [(NYT.flags) = MAP_AS_DICT];
+ }
+
+ // Oneof groups under different combinations of message-level (NYT.default_oneof_flags) and
+ // oneof-level (NYT.oneof_flags, NYT.variant_field_name) options, plus a top-level oneof.
+ message TWithOneof
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+
+ message TEmbedded
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ oneof Oneof {
+ int64 x = 1;
+ string y = 2;
+ }
+ }
+
+ // Message default is SEPARATE_FIELDS; Oneof2 overrides it with VARIANT and a custom variant field name.
+ message TDefaultSeparateFields
+ {
+ option (NYT.default_oneof_flags) = SEPARATE_FIELDS;
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+
+ optional string field = 1;
+
+ oneof Oneof2
+ {
+ option (NYT.variant_field_name) = "variant_field_name";
+ option (NYT.oneof_flags) = VARIANT;
+ string y2 = 4;
+ TEmbedded z2 = 6;
+ int64 x2 = 2;
+ }
+
+ oneof Oneof1
+ {
+ int64 x1 = 10;
+ string y1 = 3;
+ TEmbedded z1 = 5;
+ }
+ }
+
+ // No message-level oneof default; only Oneof1 sets a flag (SEPARATE_FIELDS).
+ message TNoDefault
+ {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+
+ optional string field = 1;
+
+ oneof Oneof2
+ {
+ string y2 = 4;
+ TEmbedded z2 = 6;
+ int64 x2 = 2;
+ }
+
+ oneof Oneof1
+ {
+ option (NYT.oneof_flags) = SEPARATE_FIELDS;
+ int64 x1 = 10;
+ string y1 = 3;
+ TEmbedded z1 = 5;
+ }
+ }
+
+ // Declares no NYT options at all.
+ message TSerializationProtobuf
+ {
+ oneof Oneof
+ {
+ int64 x1 = 2;
+ string y1 = 1;
+ TEmbedded z1 = 3;
+ }
+ }
+
+ optional TDefaultSeparateFields DefaultSeparateFields = 1;
+ optional TNoDefault NoDefault = 2;
+ optional TSerializationProtobuf SerializationProtobuf = 3;
+
+ oneof TopLevelOneof
+ {
+ int64 MemberOfTopLevelOneof = 4;
+ }
+ }
+
+ // Plain two-field message used as the type of TEmbedded2Message.embedded2_struct.
+ message TEmbeddedStruct {
+ optional float float1 = 1;
+ optional string string1 = 2;
+ }
+
+ // Innermost message of the EMBEDDED chain: it is embedded into TEmbedded1Message.
+ message TEmbedded2Message {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ optional uint64 embedded2_num = 10;
+ optional TEmbeddedStruct embedded2_struct = 17;
+ repeated string embedded2_repeated = 42;
+ }
+
+ // Middle message of the EMBEDDED chain: it embeds TEmbedded2Message and is itself embedded into TEmbeddingMessage.
+ message TEmbedded1Message {
+ option (NYT.default_field_flags) = SERIALIZATION_YT;
+ required TEmbedded2Message t2 = 1 [(NYT.flags) = EMBEDDED];
+ oneof variant {
+ string str_variant = 101;
+ uint64 uint_variant = 102;
+ }
+ optional uint64 embedded_num = 10; // Intentional field number collision with TEmbedded2Message.embedded2_num.
+ optional string embedded_extra_field = 11;
+ }
+
+ // Outermost message of the EMBEDDED chain; it also carries an OTHER_COLUMNS field.
+ message TEmbeddingMessage {
+ optional bytes other_columns_field = 15 [(NYT.flags) = OTHER_COLUMNS];
+ required TEmbedded1Message t1 = 2 [(NYT.flags) = EMBEDDED];
+ optional uint64 num = 12;
+ optional string extra_field = 13;
+ }
diff --git a/yt/cpp/mapreduce/interface/public.h b/yt/cpp/mapreduce/interface/public.h
new file mode 100644
index 0000000000..bdeda78795
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/public.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <memory>
+
+ namespace NYT::NAuth {
+
+ // The struct is only forward-declared in this header; the alias is a shared pointer to it.
+ struct IServiceTicketAuthPtrWrapper;
+ using IServiceTicketAuthPtrWrapperPtr = std::shared_ptr<IServiceTicketAuthPtrWrapper>;
+
+ } // namespace NYT::NAuth
diff --git a/yt/cpp/mapreduce/interface/retry_policy.h b/yt/cpp/mapreduce/interface/retry_policy.h
new file mode 100644
index 0000000000..c198839079
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/retry_policy.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <util/datetime/base.h>
+#include <util/generic/ptr.h>
+
+ namespace NYT {
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ /// A configuration that controls retries of a single request.
+ struct TRetryConfig
+ {
+ ///
+ /// @brief How long retries of a single YT request can go on.
+ ///
+ /// If this limit is reached while the retry count is not yet exceeded,
+ /// a @ref TRequestRetriesTimeout exception is thrown.
+ ///
+ /// Defaults to TDuration::Max().
+ TDuration RetriesTimeLimit = TDuration::Max();
+ };
+
+ /// The library uses this class to understand how to retry individual requests.
+ class IRetryConfigProvider
+ : public virtual TThrRefBase
+ {
+ public:
+ ///
+ /// @brief Gets the retry config for a single request.
+ ///
+ /// CreateRetryConfig is called before ANY request.
+ /// The returned config controls retries of this request.
+ ///
+ /// Must be thread safe since it can be used from different threads
+ /// to perform internal library requests (e.g. pings).
+ ///
+ /// Some methods (e.g. IClient::Map) involve multiple requests to YT and therefore
+ /// this method will be called several times during the execution of a single method.
+ ///
+ /// If the user needs to limit overall retries inside a long operation, they might create
+ /// a provider that knows about the overall deadline and sets
+ /// @ref NYT::TRetryConfig::RetriesTimeLimit taking that overall deadline into account
+ /// (e.g. when the deadline is reached it returns a zero limit for retries).
+ virtual TRetryConfig CreateRetryConfig() = 0;
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////
+
+ } // namespace NYT
+
diff --git a/yt/cpp/mapreduce/interface/serialize.cpp b/yt/cpp/mapreduce/interface/serialize.cpp
new file mode 100644
index 0000000000..ae05d9f50d
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/serialize.cpp
@@ -0,0 +1,553 @@
+#include "serialize.h"
+
+#include "common.h"
+#include "fluent.h"
+
+#include <library/cpp/yson/parser.h>
+#include <library/cpp/yson/node/node_io.h>
+#include <library/cpp/yson/node/serialize.h>
+
+#include <library/cpp/type_info/type_io.h>
+
+#include <util/generic/string.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Expects `nodeMap` in scope (const auto& nodeMap = node.AsMap();); MEMBER is left untouched when key NAME is absent.
+#define DESERIALIZE_ITEM(NAME, MEMBER) \
+ if (const auto* item = nodeMap.FindPtr(NAME)) { \
+ Deserialize(MEMBER, *item); \
+ }
+
+// Expects `attributesMap` in scope (const auto& attributesMap = node.GetAttributes().AsMap();); MEMBER is left untouched when attribute NAME is absent.
+#define DESERIALIZE_ATTR(NAME, MEMBER) \
+ if (const auto* attr = attributesMap.FindPtr(NAME)) { \
+ Deserialize(MEMBER, *attr); \
+ }
+
+////////////////////////////////////////////////////////////////////////////////
+
+void Serialize(const TSortColumn& sortColumn, NYson::IYsonConsumer* consumer)
+{
+    if (sortColumn.SortOrder() != ESortOrder::SO_ASCENDING) { // verbose form: {name; sort_order}
+        BuildYsonFluently(consumer).BeginMap()
+            .Item("name").Value(sortColumn.Name())
+            .Item("sort_order").Value(ToString(sortColumn.SortOrder()))
+        .EndMap();
+    } else { // ascending columns are written as a bare name
+        Serialize(sortColumn.Name(), consumer);
+    }
+}
+
+void Deserialize(TSortColumn& sortColumn, const TNode& node)
+{
+    if (node.IsString()) { // short form: a bare column name
+        sortColumn = TSortColumn(node.AsString());
+        return;
+    }
+    if (!node.IsMap()) {
+        ythrow yexception() << "Expected sort column to be string or map, got " << node.GetType();
+    }
+    const auto& columnName = node["name"].AsString();
+    sortColumn = TSortColumn(columnName, ::FromString<ESortOrder>(node["sort_order"].AsString()));
+}
+
+template <class T, class TDerived>
+void SerializeOneOrMany(const TOneOrMany<T, TDerived>& oneOrMany, NYson::IYsonConsumer* consumer)
+{
+ BuildYsonFluently(consumer).List(oneOrMany.Parts_); // always emitted as a list built from Parts_
+}
+
+template <class T, class TDerived>
+void DeserializeOneOrMany(TOneOrMany<T, TDerived>& oneOrMany, const TNode& node)
+{
+ Deserialize(oneOrMany.Parts_, node); // delegates to the Deserialize overload for the type of Parts_
+}
+
+void Serialize(const TKey& key, NYson::IYsonConsumer* consumer) // thin wrapper over SerializeOneOrMany
+{
+ SerializeOneOrMany(key, consumer);
+}
+
+void Deserialize(TKey& key, const TNode& node) // thin wrapper over DeserializeOneOrMany
+{
+ DeserializeOneOrMany(key, node);
+}
+
+void Serialize(const TSortColumns& sortColumns, NYson::IYsonConsumer* consumer) // thin wrapper over SerializeOneOrMany
+{
+ SerializeOneOrMany(sortColumns, consumer);
+}
+
+void Deserialize(TSortColumns& sortColumns, const TNode& node) // thin wrapper over DeserializeOneOrMany
+{
+ DeserializeOneOrMany(sortColumns, node);
+}
+
+void Serialize(const TColumnNames& columnNames, NYson::IYsonConsumer* consumer) // thin wrapper over SerializeOneOrMany
+{
+ SerializeOneOrMany(columnNames, consumer);
+}
+
+void Deserialize(TColumnNames& columnNames, const TNode& node) // thin wrapper over DeserializeOneOrMany
+{
+ DeserializeOneOrMany(columnNames, node);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void Deserialize(EValueType& valueType, const TNode& node)
+{
+    const auto& typeName = node.AsString();
+    static const THashMap<TString, EValueType> nameToValueType = {
+        {"int8", VT_INT8},
+        {"int16", VT_INT16},
+        {"int32", VT_INT32},
+        {"int64", VT_INT64},
+
+        {"uint8", VT_UINT8},
+        {"uint16", VT_UINT16},
+        {"uint32", VT_UINT32},
+        {"uint64", VT_UINT64},
+
+        {"boolean", VT_BOOLEAN},
+        {"double", VT_DOUBLE},
+
+        {"string", VT_STRING},
+        {"utf8", VT_UTF8},
+
+        {"any", VT_ANY},
+
+        {"null", VT_NULL},
+        {"void", VT_VOID},
+
+        {"date", VT_DATE},
+        {"datetime", VT_DATETIME},
+        {"timestamp", VT_TIMESTAMP},
+        {"interval", VT_INTERVAL},
+        {"float", VT_FLOAT},
+        {"json", VT_JSON},
+    };
+
+    // Unknown names are rejected; `valueType` is not modified in that case.
+    const auto* found = nameToValueType.FindPtr(typeName);
+    if (!found) {
+        ythrow yexception() << "Invalid value type '" << typeName << "'";
+    }
+    valueType = *found;
+}
+
+void Deserialize(ESortOrder& sortOrder, const TNode& node) // parses the enum from the node's string value
+{
+ sortOrder = FromString<ESortOrder>(node.AsString());
+}
+
+void Deserialize(EOptimizeForAttr& optimizeFor, const TNode& node) // parses the enum from the node's string value
+{
+ optimizeFor = FromString<EOptimizeForAttr>(node.AsString());
+}
+
+void Deserialize(EErasureCodecAttr& erasureCodec, const TNode& node) // parses the enum from the node's string value
+{
+ erasureCodec = FromString<EErasureCodecAttr>(node.AsString());
+}
+
+void Deserialize(ESchemaModificationAttr& schemaModification, const TNode& node) // parses the enum from the node's string value
+{
+ schemaModification = FromString<ESchemaModificationAttr>(node.AsString());
+}
+
+void Serialize(const TColumnSchema& columnSchema, NYson::IYsonConsumer* consumer)
+{
+ BuildYsonFluently(consumer).BeginMap()
+ .Item("name").Value(columnSchema.Name())
+ .DoIf(!columnSchema.RawTypeV3().Defined(),
+ [&] (TFluentMap fluent) {
+ fluent.Item("type").Value(NDetail::ToString(columnSchema.Type()));
+ fluent.Item("required").Value(columnSchema.Required());
+ if (columnSchema.Type() == VT_ANY
+ && *columnSchema.TypeV3() != *NTi::Optional(NTi::Yson()))
+ {
+ // A lot of users canonize the serialized schema.
+ // To be backward compatible we only set type_v3 for new types.
+ fluent.Item("type_v3").Value(columnSchema.TypeV3());
+ }
+ }
+ )
+ .DoIf(columnSchema.RawTypeV3().Defined(), [&] (TFluentMap fluent) {
+ const auto& rawTypeV3 = *columnSchema.RawTypeV3();
+ fluent.Item("type_v3").Value(rawTypeV3);
+
+ // We also set the old fields `type` and `required` to stay compatible
+ // with old clusters that don't support type_v3 yet.
+
+ // `null` and `void` are the only simple types reported as not required.
+ auto isRequired = [](TStringBuf simpleType) {
+ return simpleType != "null" && simpleType != "void";
+ };
+ auto getSimple = [] (const TNode& typeV3) -> TMaybe<TString> { // old-style name of a simple type, or empty optional
+ static const THashMap<TString,TString> typeV3ToOld = {
+ {"bool", "boolean"},
+ {"yson", "any"},
+ };
+ TMaybe<TString> result;
+ if (typeV3.IsString()) {
+ result = typeV3.AsString();
+ } else if (typeV3.IsMap() && typeV3.Size() == 1) {
+ Y_VERIFY(typeV3["type_name"].IsString(), "invalid type is passed");
+ result = typeV3["type_name"].AsString();
+ }
+ if (result) {
+ auto it = typeV3ToOld.find(*result);
+ if (it != typeV3ToOld.end()) {
+ result = it->second;
+ }
+ }
+ return result;
+ };
+ auto simplify = [&](const TNode& typeV3) -> TMaybe<std::pair<TString, bool>> { // (old type name, required) or empty
+ auto simple = getSimple(typeV3);
+ if (simple) {
+ return std::make_pair(*simple, isRequired(*simple));
+ }
+ if (typeV3.IsMap() && typeV3["type_name"] == "optional") {
+ auto simpleItem = getSimple(typeV3["item"]);
+ if (simpleItem && isRequired(*simpleItem)) {
+ return std::make_pair(*simpleItem, false);
+ }
+ }
+ return {};
+ };
+
+ auto simplified = simplify(rawTypeV3);
+
+ if (simplified) { // otherwise the old fields are omitted
+ const auto& [simpleType, required] = *simplified;
+ fluent
+ .Item("type").Value(simpleType)
+ .Item("required").Value(required);
+ return;
+ }
+ })
+ .DoIf(columnSchema.SortOrder().Defined(), [&] (TFluentMap fluent) {
+ fluent.Item("sort_order").Value(ToString(*columnSchema.SortOrder()));
+ })
+ .DoIf(columnSchema.Lock().Defined(), [&] (TFluentMap fluent) {
+ fluent.Item("lock").Value(*columnSchema.Lock());
+ })
+ .DoIf(columnSchema.Expression().Defined(), [&] (TFluentMap fluent) {
+ fluent.Item("expression").Value(*columnSchema.Expression());
+ })
+ .DoIf(columnSchema.Aggregate().Defined(), [&] (TFluentMap fluent) {
+ fluent.Item("aggregate").Value(*columnSchema.Aggregate());
+ })
+ .DoIf(columnSchema.Group().Defined(), [&] (TFluentMap fluent) {
+ fluent.Item("group").Value(*columnSchema.Group());
+ })
+ .EndMap();
+}
+
+void Deserialize(TColumnSchema& columnSchema, const TNode& node)
+{
+ const auto& nodeMap = node.AsMap();
+ DESERIALIZE_ITEM("name", columnSchema.Name_);
+ DESERIALIZE_ITEM("type_v3", columnSchema.RawTypeV3_);
+ DESERIALIZE_ITEM("sort_order", columnSchema.SortOrder_);
+ DESERIALIZE_ITEM("lock", columnSchema.Lock_);
+ DESERIALIZE_ITEM("expression", columnSchema.Expression_);
+ DESERIALIZE_ITEM("aggregate", columnSchema.Aggregate_);
+ DESERIALIZE_ITEM("group", columnSchema.Group_);
+
+ if (nodeMap.contains("type_v3")) { // type_v3 takes precedence over the old `type`/`required` pair
+ NTi::TTypePtr type;
+ DESERIALIZE_ITEM("type_v3", type);
+ columnSchema.Type(type);
+ } else {
+ EValueType oldType = VT_INT64; // used when "type" is absent
+ bool required = false; // used when "required" is absent
+ DESERIALIZE_ITEM("type", oldType);
+ DESERIALIZE_ITEM("required", required);
+ columnSchema.Type(ToTypeV3(oldType, required));
+ }
+}
+
+void Serialize(const TTableSchema& tableSchema, NYson::IYsonConsumer* consumer)
+{
+ BuildYsonFluently(consumer).BeginAttributes() // `strict` and `unique_keys` are attributes of the column list
+ .Item("strict").Value(tableSchema.Strict())
+ .Item("unique_keys").Value(tableSchema.UniqueKeys())
+ .EndAttributes()
+ .List(tableSchema.Columns());
+}
+
+void Deserialize(TTableSchema& tableSchema, const TNode& node)
+{
+ const auto& attributesMap = node.GetAttributes().AsMap();
+ DESERIALIZE_ATTR("strict", tableSchema.Strict_); // left unchanged when the attribute is absent
+ DESERIALIZE_ATTR("unique_keys", tableSchema.UniqueKeys_); // left unchanged when the attribute is absent
+ Deserialize(tableSchema.Columns_, node); // the node itself is the list of columns
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void Serialize(const TKeyBound& keyBound, NYson::IYsonConsumer* consumer)
+{
+ BuildYsonFluently(consumer).BeginList() // two-element list: [relation; key]
+ .Item().Value(ToString(keyBound.Relation()))
+ .Item().Value(keyBound.Key())
+ .EndList();
+}
+
+void Deserialize(TKeyBound& keyBound, const TNode& node)
+{
+    // A key bound is encoded as a two-element list: [relation; key].
+    const auto& nodeList = node.AsList();
+    Y_ENSURE(nodeList.size() == 2);
+
+    const auto relation = ::FromString<ERelation>(nodeList[0].AsString());
+    keyBound.Relation(relation);
+
+    TKey parsedKey;
+    Deserialize(parsedKey, nodeList[1]);
+    keyBound.Key(std::move(parsedKey));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void Serialize(const TReadLimit& readLimit, NYson::IYsonConsumer* consumer)
+{
+ BuildYsonFluently(consumer).BeginMap() // only the fields that are defined are written
+ .DoIf(readLimit.KeyBound_.Defined(), [&] (TFluentMap fluent) {
+ fluent.Item("key_bound").Value(*readLimit.KeyBound_);
+ })
+ .DoIf(readLimit.Key_.Defined(), [&] (TFluentMap fluent) {
+ fluent.Item("key").Value(*readLimit.Key_);
+ })
+ .DoIf(readLimit.RowIndex_.Defined(), [&] (TFluentMap fluent) {
+ fluent.Item("row_index").Value(*readLimit.RowIndex_);
+ })
+ .DoIf(readLimit.Offset_.Defined(), [&] (TFluentMap fluent) {
+ fluent.Item("offset").Value(*readLimit.Offset_);
+ })
+ .DoIf(readLimit.TabletIndex_.Defined(), [&] (TFluentMap fluent) {
+ fluent.Item("tablet_index").Value(*readLimit.TabletIndex_);
+ })
+ .EndMap();
+}
+
+void Deserialize(TReadLimit& readLimit, const TNode& node)
+{
+ const auto& nodeMap = node.AsMap(); // fields whose keys are absent are left as they were in `readLimit`
+ DESERIALIZE_ITEM("key_bound", readLimit.KeyBound_);
+ DESERIALIZE_ITEM("key", readLimit.Key_);
+ DESERIALIZE_ITEM("row_index", readLimit.RowIndex_);
+ DESERIALIZE_ITEM("offset", readLimit.Offset_);
+ DESERIALIZE_ITEM("tablet_index", readLimit.TabletIndex_);
+}
+
+void Serialize(const TReadRange& readRange, NYson::IYsonConsumer* consumer)
+{
+ BuildYsonFluently(consumer).BeginMap() // limits for which IsTrivial() is true are omitted
+ .DoIf(!IsTrivial(readRange.LowerLimit_), [&] (TFluentMap fluent) {
+ fluent.Item("lower_limit").Value(readRange.LowerLimit_);
+ })
+ .DoIf(!IsTrivial(readRange.UpperLimit_), [&] (TFluentMap fluent) {
+ fluent.Item("upper_limit").Value(readRange.UpperLimit_);
+ })
+ .DoIf(!IsTrivial(readRange.Exact_), [&] (TFluentMap fluent) {
+ fluent.Item("exact").Value(readRange.Exact_);
+ })
+ .EndMap();
+}
+
+void Deserialize(TReadRange& readRange, const TNode& node)
+{
+ const auto& nodeMap = node.AsMap(); // limits whose keys are absent are left as they were in `readRange`
+ DESERIALIZE_ITEM("lower_limit", readRange.LowerLimit_);
+ DESERIALIZE_ITEM("upper_limit", readRange.UpperLimit_);
+ DESERIALIZE_ITEM("exact", readRange.Exact_);
+}
+
+void Serialize(const THashMap<TString, TString>& renameColumns, NYson::IYsonConsumer* consumer)
+{
+ BuildYsonFluently(consumer) // one map item per entry: item.first is the key, item.second the value
+ .DoMapFor(renameColumns, [] (TFluentMap fluent, const auto& item) {
+ fluent.Item(item.first).Value(item.second);
+ });
+}
+
+void Serialize(const TRichYPath& path, NYson::IYsonConsumer* consumer)
+{
+ BuildYsonFluently(consumer).BeginAttributes() // each optional setting becomes an attribute only when it is set
+ .DoIf(path.GetRanges().Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("ranges").List(*path.GetRanges());
+ })
+ .DoIf(path.Columns_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("columns").Value(*path.Columns_);
+ })
+ .DoIf(path.Append_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("append").Value(*path.Append_);
+ })
+ .DoIf(path.PartiallySorted_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("partially_sorted").Value(*path.PartiallySorted_);
+ })
+ .DoIf(!path.SortedBy_.Parts_.empty(), [&] (TFluentAttributes fluent) { // SortedBy_ is not optional: an empty list means "not set"
+ fluent.Item("sorted_by").Value(path.SortedBy_);
+ })
+ .DoIf(path.Teleport_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("teleport").Value(*path.Teleport_);
+ })
+ .DoIf(path.Primary_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("primary").Value(*path.Primary_);
+ })
+ .DoIf(path.Foreign_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("foreign").Value(*path.Foreign_);
+ })
+ .DoIf(path.RowCountLimit_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("row_count_limit").Value(*path.RowCountLimit_);
+ })
+ .DoIf(path.FileName_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("file_name").Value(*path.FileName_);
+ })
+ .DoIf(path.OriginalPath_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("original_path").Value(*path.OriginalPath_);
+ })
+ .DoIf(path.Executable_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("executable").Value(*path.Executable_);
+ })
+ .DoIf(path.Format_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("format").Value(*path.Format_);
+ })
+ .DoIf(path.Schema_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("schema").Value(*path.Schema_);
+ })
+ .DoIf(path.Timestamp_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("timestamp").Value(*path.Timestamp_);
+ })
+ .DoIf(path.CompressionCodec_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("compression_codec").Value(*path.CompressionCodec_);
+ })
+ .DoIf(path.ErasureCodec_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("erasure_codec").Value(ToString(*path.ErasureCodec_));
+ })
+ .DoIf(path.SchemaModification_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("schema_modification").Value(ToString(*path.SchemaModification_));
+ })
+ .DoIf(path.OptimizeFor_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("optimize_for").Value(ToString(*path.OptimizeFor_));
+ })
+ .DoIf(path.TransactionId_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("transaction_id").Value(GetGuidAsString(*path.TransactionId_));
+ })
+ .DoIf(path.RenameColumns_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("rename_columns").Value(*path.RenameColumns_);
+ })
+ .DoIf(path.BypassArtifactCache_.Defined(), [&] (TFluentAttributes fluent) {
+ fluent.Item("bypass_artifact_cache").Value(*path.BypassArtifactCache_);
+ })
+ .EndAttributes()
+ .Value(path.Path_); // the attributed node's value is Path_
+}
+
+void Deserialize(TRichYPath& path, const TNode& node)
+{
+ path = {}; // reset first, so attributes absent from `node` end up default-initialized
+
+ const auto& attributesMap = node.GetAttributes().AsMap();
+ DESERIALIZE_ATTR("ranges", path.MutableRanges());
+ DESERIALIZE_ATTR("columns", path.Columns_);
+ DESERIALIZE_ATTR("append", path.Append_);
+ DESERIALIZE_ATTR("partially_sorted", path.PartiallySorted_);
+ DESERIALIZE_ATTR("sorted_by", path.SortedBy_);
+ DESERIALIZE_ATTR("teleport", path.Teleport_);
+ DESERIALIZE_ATTR("primary", path.Primary_);
+ DESERIALIZE_ATTR("foreign", path.Foreign_);
+ DESERIALIZE_ATTR("row_count_limit", path.RowCountLimit_);
+ DESERIALIZE_ATTR("file_name", path.FileName_);
+ DESERIALIZE_ATTR("original_path", path.OriginalPath_);
+ DESERIALIZE_ATTR("executable", path.Executable_);
+ DESERIALIZE_ATTR("format", path.Format_);
+ DESERIALIZE_ATTR("schema", path.Schema_);
+ DESERIALIZE_ATTR("timestamp", path.Timestamp_);
+ DESERIALIZE_ATTR("compression_codec", path.CompressionCodec_);
+ DESERIALIZE_ATTR("erasure_codec", path.ErasureCodec_);
+ DESERIALIZE_ATTR("schema_modification", path.SchemaModification_);
+ DESERIALIZE_ATTR("optimize_for", path.OptimizeFor_);
+ DESERIALIZE_ATTR("transaction_id", path.TransactionId_);
+ DESERIALIZE_ATTR("rename_columns", path.RenameColumns_);
+ DESERIALIZE_ATTR("bypass_artifact_cache", path.BypassArtifactCache_);
+ Deserialize(path.Path_, node); // the node's own value is read into Path_
+}
+
+void Serialize(const TAttributeFilter& filter, NYson::IYsonConsumer* consumer)
+{
+ BuildYsonFluently(consumer).List(filter.Attributes_); // the filter is written as a list built from Attributes_
+}
+
+void Deserialize(TTableColumnarStatistics& statistics, const TNode& node) // absent keys leave the corresponding field unchanged
+{
+ const auto& nodeMap = node.AsMap();
+ DESERIALIZE_ITEM("column_data_weights", statistics.ColumnDataWeight);
+ DESERIALIZE_ITEM("legacy_chunks_data_weight", statistics.LegacyChunksDataWeight);
+ DESERIALIZE_ITEM("timestamp_total_weight", statistics.TimestampTotalWeight);
+}
+
+void Deserialize(TMultiTablePartition::TStatistics& statistics, const TNode& node) // absent keys leave the corresponding field unchanged
+{
+ const auto& nodeMap = node.AsMap();
+ DESERIALIZE_ITEM("chunk_count", statistics.ChunkCount);
+ DESERIALIZE_ITEM("data_weight", statistics.DataWeight);
+ DESERIALIZE_ITEM("row_count", statistics.RowCount);
+}
+
+void Deserialize(TMultiTablePartition& partition, const TNode& node) // absent keys leave the corresponding field unchanged
+{
+ const auto& nodeMap = node.AsMap();
+ DESERIALIZE_ITEM("table_ranges", partition.TableRanges);
+ DESERIALIZE_ITEM("aggregate_statistics", partition.AggregateStatistics);
+}
+
+void Deserialize(TMultiTablePartitions& partitions, const TNode& node) // absent key leaves Partitions unchanged
+{
+ const auto& nodeMap = node.AsMap();
+ DESERIALIZE_ITEM("partitions", partitions.Partitions);
+}
+
+void Serialize(const TGUID& value, NYson::IYsonConsumer* consumer)
+{
+ BuildYsonFluently(consumer).Value(GetGuidAsString(value)); // GUIDs travel as their string form
+}
+
+void Deserialize(TGUID& value, const TNode& node)
+{
+ value = GetGuid(node.AsString()); // inverse of the string form written by Serialize
+}
+
+void Deserialize(TTabletInfo& value, const TNode& node)
+{
+    const auto& nodeMap = node.AsMap(); // bound by reference so the map is not copied
+    DESERIALIZE_ITEM("total_row_count", value.TotalRowCount);
+    DESERIALIZE_ITEM("trimmed_row_count", value.TrimmedRowCount);
+    DESERIALIZE_ITEM("barrier_timestamp", value.BarrierTimestamp);
+}
+
+void Serialize(const NTi::TTypePtr& type, NYson::IYsonConsumer* consumer)
+{
+ auto yson = NTi::NIo::SerializeYson(type.Get()); // type -> YSON string
+ ::NYson::ParseYsonStringBuffer(yson, consumer); // replay that string into the consumer
+}
+
+void Deserialize(NTi::TTypePtr& type, const TNode& node)
+{
+ auto yson = NodeToYsonString(node, NYson::EYsonFormat::Binary); // node -> binary YSON string
+ type = NTi::NIo::DeserializeYson(*NTi::HeapFactory(), yson); // type is created through the heap factory
+}
+
+#undef DESERIALIZE_ITEM
+#undef DESERIALIZE_ATTR
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/serialize.h b/yt/cpp/mapreduce/interface/serialize.h
new file mode 100644
index 0000000000..223dd446ba
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/serialize.h
@@ -0,0 +1,90 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/serialize.h
+///
+/// Header containing declaration of functions for serializing to/from YSON.
+
+#include "common.h"
+
+#include <library/cpp/type_info/fwd.h>
+
+namespace NYT::NYson {
+struct IYsonConsumer;
+} // namespace NYT::NYson
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <class T>
+void Deserialize(TMaybe<T>& value, const TNode& node)
+{
+ value.ConstructInPlace(); // the optional always ends up holding a value
+ Deserialize(value.GetRef(), node);
+}
+
+template <class T>
+void Deserialize(TVector<T>& value, const TNode& node)
+{
+ for (const auto& element : node.AsList()) { // appends: `value` is not cleared first
+ value.emplace_back();
+ Deserialize(value.back(), element);
+ }
+}
+
+template <class T>
+void Deserialize(THashMap<TString, T>& value, const TNode& node)
+{
+ for (const auto& item : node.AsMap()) { // merges into `value`: the map is not cleared first
+ Deserialize(value[item.first], item.second);
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void Serialize(const TKey& key, NYT::NYson::IYsonConsumer* consumer);
+void Deserialize(TKey& key, const TNode& node);
+
+void Serialize(const TSortColumns& sortColumns, NYT::NYson::IYsonConsumer* consumer);
+void Deserialize(TSortColumns& sortColumns, const TNode& node);
+
+void Serialize(const TColumnNames& columnNames, NYT::NYson::IYsonConsumer* consumer);
+void Deserialize(TColumnNames& columnNames, const TNode& node);
+
+void Serialize(const TSortColumn& sortColumn, NYT::NYson::IYsonConsumer* consumer);
+void Deserialize(TSortColumn& sortColumn, const TNode& node);
+
+void Serialize(const TKeyBound& keyBound, NYT::NYson::IYsonConsumer* consumer);
+void Deserialize(TKeyBound& keyBound, const TNode& node);
+
+void Serialize(const TReadLimit& readLimit, NYT::NYson::IYsonConsumer* consumer);
+void Deserialize(TReadLimit& readLimit, const TNode& node);
+
+void Serialize(const TReadRange& readRange, NYT::NYson::IYsonConsumer* consumer);
+
+void Serialize(const TRichYPath& path, NYT::NYson::IYsonConsumer* consumer);
+void Deserialize(TRichYPath& path, const TNode& node);
+
+void Serialize(const TAttributeFilter& filter, NYT::NYson::IYsonConsumer* consumer);
+
+void Serialize(const TColumnSchema& columnSchema, NYT::NYson::IYsonConsumer* consumer);
+void Serialize(const TTableSchema& tableSchema, NYT::NYson::IYsonConsumer* consumer);
+
+void Deserialize(EValueType& valueType, const TNode& node);
+void Deserialize(TTableSchema& tableSchema, const TNode& node);
+void Deserialize(TColumnSchema& columnSchema, const TNode& node);
+void Deserialize(TTableColumnarStatistics& statistics, const TNode& node);
+void Deserialize(TMultiTablePartition& partition, const TNode& node);
+void Deserialize(TMultiTablePartitions& partitions, const TNode& node);
+void Deserialize(TTabletInfo& value, const TNode& node);
+
+void Serialize(const TGUID& value, NYT::NYson::IYsonConsumer* consumer);
+void Deserialize(TGUID& value, const TNode& node);
+
+void Serialize(const NTi::TTypePtr& type, NYT::NYson::IYsonConsumer* consumer);
+void Deserialize(NTi::TTypePtr& type, const TNode& node);
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/serialize_ut.cpp b/yt/cpp/mapreduce/interface/serialize_ut.cpp
new file mode 100644
index 0000000000..59d4501ee8
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/serialize_ut.cpp
@@ -0,0 +1,49 @@
+#include <yt/cpp/mapreduce/interface/serialize.h>
+#include <yt/cpp/mapreduce/interface/common.h>
+
+#include <library/cpp/yson/node/node_builder.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/serialized_enum.h>
+
+using namespace NYT;
+
+Y_UNIT_TEST_SUITE(Serialization)
+{
+ Y_UNIT_TEST(TableSchema)
+ {
+ auto schema = TTableSchema()
+ .AddColumn(TColumnSchema().Name("a").Type(EValueType::VT_STRING).SortOrder(SO_ASCENDING))
+ .AddColumn(TColumnSchema().Name("b").Type(EValueType::VT_UINT64))
+ .AddColumn(TColumnSchema().Name("c").Type(EValueType::VT_INT64, true));
+
+ auto schemaNode = schema.ToNode();
+ UNIT_ASSERT(schemaNode.IsList());
+ UNIT_ASSERT_VALUES_EQUAL(schemaNode.Size(), 3);
+
+ // Columns keep their insertion order; `required` is false unless explicitly requested.
+ UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["name"], "a");
+ UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["type"], "string");
+ UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["required"], false);
+ UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["sort_order"], "ascending");
+
+ UNIT_ASSERT_VALUES_EQUAL(schemaNode[1]["name"], "b");
+ UNIT_ASSERT_VALUES_EQUAL(schemaNode[1]["type"], "uint64");
+ UNIT_ASSERT_VALUES_EQUAL(schemaNode[1]["required"], false);
+
+ UNIT_ASSERT_VALUES_EQUAL(schemaNode[2]["name"], "c");
+ UNIT_ASSERT_VALUES_EQUAL(schemaNode[2]["type"], "int64");
+ UNIT_ASSERT_VALUES_EQUAL(schemaNode[2]["required"], true);
+ }
+
+ Y_UNIT_TEST(ValueTypeSerialization)
+ {
+ for (const auto value : GetEnumAllValues<EValueType>()) { // round-trip every enum value through its string name
+ TNode serialized = NYT::NDetail::ToString(value);
+ EValueType deserialized;
+ Deserialize(deserialized, serialized);
+ UNIT_ASSERT_VALUES_EQUAL(value, deserialized);
+ }
+ }
+}
diff --git a/yt/cpp/mapreduce/interface/skiff_row.cpp b/yt/cpp/mapreduce/interface/skiff_row.cpp
new file mode 100644
index 0000000000..7838bdaee9
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/skiff_row.cpp
@@ -0,0 +1 @@
+#include "skiff_row.h"
diff --git a/yt/cpp/mapreduce/interface/skiff_row.h b/yt/cpp/mapreduce/interface/skiff_row.h
new file mode 100644
index 0000000000..5dd335cb65
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/skiff_row.h
@@ -0,0 +1,127 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/skiff_row.h
+/// Header containing interfaces that you need to define for using TSkiffRowTableReader
+/// What you need to do for your struct type TMyType:
+/// 1. Write `true` specialization TIsSkiffRow<TMyType>;
+/// 2. Write specialization GetSkiffSchema<TMyType>();
+/// 3. Write your own parser derived from ISkiffRowParser and write specialization CreateSkiffParser<TMyType>() which returns this parser.
+
+#include "fwd.h"
+
+#include <yt/cpp/mapreduce/skiff/skiff_schema.h>
+
+#include <yt/cpp/mapreduce/interface/format.h>
+
+#include <library/cpp/skiff/skiff.h>
+
+#include <util/generic/maybe.h>
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+//! Trait telling whether `T` is a skiff row type; specialize it as `std::true_type` for your row type `T`.
+/// You also need to implement two functions: `GetSkiffSchema` and `CreateSkiffParser`.
+///
+/// Example:
+///
+/// template <>
+/// struct TIsSkiffRow<T>
+///     : std::true_type
+/// { };
+///
+template<class T>
+struct TIsSkiffRow
+ : std::false_type
+{ };
+
+////////////////////////////////////////////////////////////////////////////////
+
+//! Returns the skiff schema for row type `T`.
+/// The primary template only triggers a static_assert; you need to write a specialization for your type.
+template <typename T>
+NSkiff::TSkiffSchemaPtr GetSkiffSchema(const TMaybe<TSkiffRowHints>& /*hints*/)
+{
+ static_assert(TDependentFalse<T>, "Unimplemented `GetSkiffSchema` method");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+//! Allows parsing rows as user's structs from a stream (TCheckedInDebugSkiffParser).
+/// You need to write a derived class for your own row type.
+///
+/// Example:
+///
+/// class TMySkiffRowParser : public ISkiffRowParser
+/// {
+/// public:
+///     TMySkiffRowParser(TMySkiffRow* row)
+///         : Row_(row)
+///     {}
+///
+///     void Parse(NSkiff::TCheckedInDebugSkiffParser* parser) override
+///     {
+///         Row_->SomeInt64Field = parser->ParseInt64();
+///     }
+///
+/// private:
+///     TMySkiffRow* Row_;
+/// };
+///
+class ISkiffRowParser
+ : public TThrRefBase
+{
+public:
+ //! Read one row from parser
+ virtual void Parse(NSkiff::TCheckedInDebugSkiffParser* /*parser*/) = 0;
+};
+
+//! Creates a parser for row type `T`; the primary template only triggers a static_assert, so a specialization is required.
+template <typename T>
+ISkiffRowParserPtr CreateSkiffParser(T* /*row*/, const TMaybe<TSkiffRowHints>& /*hints*/)
+{
+ static_assert(TDependentFalse<T>, "Unimplemented `CreateSkiffParser` function");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+//! Allows skipping a row's content without returning the row.
+/// By default the row will be parsed using your parser derived from ISkiffRowParser.
+/// If you want, you can write a more optimal skipper, but it isn't required.
+class ISkiffRowSkipper
+ : public TThrRefBase
+{
+public:
+ virtual void SkipRow(NSkiff::TCheckedInDebugSkiffParser* /*parser*/) = 0;
+};
+
+//! Default ISkiffRowSkipper implementation: skips a row by parsing it into an internal scratch row of type `T`.
+template <typename T>
+class TSkiffRowSkipper : public ISkiffRowSkipper {
+public:
+    explicit TSkiffRowSkipper(const TMaybe<TSkiffRowHints>& hints)
+        : Parser_(CreateSkiffParser<T>(&Row_, hints))
+    { }
+
+    void SkipRow(NSkiff::TCheckedInDebugSkiffParser* parser) override {
+        Parser_->Parse(parser);
+    }
+
+private:
+    T Row_;
+    ISkiffRowParserPtr Parser_;
+};
+
+//! Creates a skipper for row type `T` (a TSkiffRowSkipper<T>).
+/// You don't need to write its specialization.
+template <typename T>
+ISkiffRowSkipperPtr CreateSkiffSkipper(const TMaybe<TSkiffRowHints>& hints)
+{
+ return ::MakeIntrusive<TSkiffRowSkipper<T>>(hints);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/tvm.cpp b/yt/cpp/mapreduce/interface/tvm.cpp
new file mode 100644
index 0000000000..bfa3f0304e
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/tvm.cpp
@@ -0,0 +1 @@
+#include "tvm.h"
diff --git a/yt/cpp/mapreduce/interface/tvm.h b/yt/cpp/mapreduce/interface/tvm.h
new file mode 100644
index 0000000000..d8d16d841b
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/tvm.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <yt/yt/library/tvm/tvm_base.h>
+
+#include <library/cpp/yt/memory/intrusive_ptr.h>
+
+namespace NYT::NAuth {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This wrapper is required because NYT::NAuth::IServiceTicketAuthPtr is NYT::TIntrusivePtr,
+/// and, if we used this pointer in interfaces of `mapreduce/yt` client, a lot of users of this library
+/// could get unexpected build errors that `TIntrusivePtr` is ambiguous
+/// (from `::` namespace and from `::NYT::` namespace).
+/// So we use this wrapper in our interfaces to avoid such problems for users.
+struct IServiceTicketAuthPtrWrapper
+{
+    ///
+    /// Construct wrapper from NYT::TIntrusivePtr
+    ///
+    /// This constructor is implicit so users can transparently pass NYT::TIntrusivePtr to the functions of
+    /// mapreduce/yt client. The pointer is taken by const reference and copied once into `Ptr`.
+    template <class T, class = typename std::enable_if_t<std::is_convertible_v<T*, IServiceTicketAuth*>>>
+    IServiceTicketAuthPtrWrapper(const TIntrusivePtr<T>& ptr)
+        : Ptr(ptr)
+    {
+    }
+
+    /// Wrapped pointer
+    NYT::TIntrusivePtr<IServiceTicketAuth> Ptr;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT::NAuth
diff --git a/yt/cpp/mapreduce/interface/ut/ya.make b/yt/cpp/mapreduce/interface/ut/ya.make
new file mode 100644
index 0000000000..0219e6430c
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/ut/ya.make
@@ -0,0 +1,25 @@
+UNITTEST_FOR(yt/cpp/mapreduce/interface)
+
+SRCS(
+ common_ut.cpp
+ config_ut.cpp
+ error_ut.cpp
+ format_ut.cpp
+ job_counters_ut.cpp
+ job_statistics_ut.cpp
+ operation_ut.cpp
+ proto3_ut.proto
+ protobuf_table_schema_ut.cpp
+ protobuf_file_options_ut.cpp
+ protobuf_table_schema_ut.proto
+ protobuf_file_options_ut.proto
+ serialize_ut.cpp
+)
+
+PEERDIR(
+ contrib/libs/protobuf
+ library/cpp/testing/unittest
+ yt/yt_proto/yt/formats
+)
+
+END()
diff --git a/yt/cpp/mapreduce/interface/wait_proxy.h b/yt/cpp/mapreduce/interface/wait_proxy.h
new file mode 100644
index 0000000000..f7d8e0638e
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/wait_proxy.h
@@ -0,0 +1,54 @@
+#pragma once
+
+///
+/// @file yt/cpp/mapreduce/interface/wait_proxy.h
+///
+/// Header containing interface to enable customizable waiting.
+
+#include <yt/cpp/mapreduce/interface/common.h>
+
+#include <util/datetime/base.h>
+
+namespace NThreading {
+template <typename T>
+class TFuture;
+}
+
+class TSystemEvent;
+class TCondVar;
+class TMutex;
+
+namespace NYT {
+
+////////////////////////////////////////////////////////////////////////////////
+
+///
+/// @brief Interface to facilitate customizable waiting.
+///
+/// All the waiting functions in the library are obliged to use the methods of a wait proxy instead of direct function calls.
+class IWaitProxy
+ : public TThrRefBase
+{
+public:
+ virtual ~IWaitProxy() = default;
+
+ ///
+ /// @brief Wait, with a timeout, for the future to be set.
+ virtual bool WaitFuture(const ::NThreading::TFuture<void>& future, TDuration timeout) = 0;
+
+ ///
+ /// @brief Wait for a system event with timeout.
+ virtual bool WaitEvent(TSystemEvent& event, TDuration timeout) = 0;
+
+ ///
+ /// @brief Wait for the notification on the condition variable with timeout.
+ virtual bool WaitCondVar(TCondVar& condVar, TMutex& mutex, TDuration timeout) = 0;
+
+ ///
+ /// @brief Sleep in the current thread for (approximately) the specified amount of time.
+ virtual void Sleep(TDuration timeout) = 0;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace NYT
diff --git a/yt/cpp/mapreduce/interface/ya.make b/yt/cpp/mapreduce/interface/ya.make
new file mode 100644
index 0000000000..0e94f14633
--- /dev/null
+++ b/yt/cpp/mapreduce/interface/ya.make
@@ -0,0 +1,46 @@
+LIBRARY()
+
+INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc)
+
+SRCS(
+ batch_request.cpp
+ client.cpp
+ client_method_options.cpp
+ common.cpp
+ config.cpp
+ cypress.cpp
+ errors.cpp
+ format.cpp
+ job_counters.cpp
+ job_statistics.cpp
+ io.cpp
+ operation.cpp
+ protobuf_format.cpp
+ serialize.cpp
+ skiff_row.cpp
+ tvm.cpp
+)
+
+PEERDIR(
+ contrib/libs/protobuf
+ library/cpp/type_info
+ library/cpp/threading/future
+ library/cpp/yson/node
+ yt/cpp/mapreduce/interface/logging
+ yt/yt_proto/yt/formats
+ yt/yt/library/tvm
+)
+
+GENERATE_ENUM_SERIALIZATION(client_method_options.h)
+GENERATE_ENUM_SERIALIZATION(client.h)
+GENERATE_ENUM_SERIALIZATION(common.h)
+GENERATE_ENUM_SERIALIZATION(config.h)
+GENERATE_ENUM_SERIALIZATION(cypress.h)
+GENERATE_ENUM_SERIALIZATION(job_counters.h)
+GENERATE_ENUM_SERIALIZATION(job_statistics.h)
+GENERATE_ENUM_SERIALIZATION(operation.h)
+GENERATE_ENUM_SERIALIZATION(protobuf_format.h)
+
+END()
+
+RECURSE_FOR_TESTS(ut)